
[mm-unstable,v1,1/5] mm/kvm: add mmu_notifier_test_clear_young()

Message ID 20230217041230.2417228-2-yuzhao@google.com
State Handled Elsewhere, archived
Series mm/kvm: lockless accessed bit harvest

Commit Message

Yu Zhao Feb. 17, 2023, 4:12 a.m. UTC
mmu_notifier_test_clear_young() allows the caller to safely test and
clear the accessed bit in KVM PTEs without taking the MMU lock.

This patch adds the generic infrastructure that invokes the
arch-specific implementations added by subsequent patches. These generally
rely on two techniques: RCU and cmpxchg. The former protects KVM page
tables from being freed while the latter clears the accessed bit
atomically against both the hardware and other software page table
walkers.

mmu_notifier_test_clear_young() follows two design patterns: fallback
and batching. For any unsupported cases, it can optionally fall back
to mmu_notifier_ops->clear_young(). For a range of KVM PTEs, it can
test or test and clear their accessed bits according to a bitmap
provided by the caller.

mmu_notifier_test_clear_young() always returns 0 if fallback is not
allowed. If fallback happens, its return value is similar to that of
mmu_notifier_clear_young().

The bitmap parameter has the following specifications:
1. The number of bits should be at least (end-start)/PAGE_SIZE.
2. The offset of each bit is relative to the end. E.g., the offset
   corresponding to addr is (end-addr)/PAGE_SIZE-1. This is to better
   suit batching while forward looping.
3. For each KVM PTE with the accessed bit set (young), arch-specific
   implementations flip the corresponding bit in the bitmap. It only
   clears the accessed bit if the old value is 1. A caller can test or
   test and clear the accessed bit by setting the corresponding bit in
   the bitmap to 0 or 1, and the new value will be 1 or 0 for a young
   KVM PTE.
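
A minimal caller-side sketch of this convention (illustrative only; the
function, loop, and mark_page_accessed_somehow() below are assumptions, not
code from this series):

  #include <linux/bitmap.h>
  #include <linux/mmu_notifier.h>

  /*
   * Test and clear the accessed bit for every page in [start, end), then
   * walk the results forward. Setting a bit to 1 requests "test and clear";
   * leaving it 0 requests "test only". A flipped bit means the KVM PTE was
   * young.
   */
  static void scan_range_young(struct mm_struct *mm, unsigned long start,
                               unsigned long end, unsigned long *bitmap)
  {
          unsigned long addr;

          bitmap_fill(bitmap, (end - start) / PAGE_SIZE);

          /* fallback == false: rely solely on the lockless path. */
          mmu_notifier_test_clear_young(mm, start, end, false, bitmap);

          for (addr = start; addr < end; addr += PAGE_SIZE) {
                  unsigned long bit = (end - addr) / PAGE_SIZE - 1;

                  if (!test_bit(bit, bitmap))
                          mark_page_accessed_somehow(addr);  /* hypothetical */
          }
  }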

Signed-off-by: Yu Zhao <yuzhao@google.com>
---
 include/linux/kvm_host.h     | 29 ++++++++++++++++++
 include/linux/mmu_notifier.h | 40 +++++++++++++++++++++++++
 mm/mmu_notifier.c            | 26 ++++++++++++++++
 virt/kvm/kvm_main.c          | 58 ++++++++++++++++++++++++++++++++++++
 4 files changed, 153 insertions(+)
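
For orientation, the arch-side half of the contract can be sketched as
follows: kvm_arch_test_clear_young(), declared by this patch, flips the
bitmap bit for each young KVM PTE, with bits indexed relative to its lsb_gfn
parameter, and clears the accessed bit only when the old bitmap bit was 1.
Everything below except the bitops helper is an illustrative assumption:

  #include <linux/bitops.h>
  #include <linux/kvm_host.h>

  /*
   * Illustrative only -- not from this series. Report a young PTE at @gfn:
   * __test_and_change_bit() flips bit (lsb_gfn - gfn) and returns its old
   * value, which tells the walker whether the caller also asked for the
   * accessed bit to be cleared (e.g., via cmpxchg on the SPTE).
   */
  static bool report_young_gfn(gfn_t gfn, gfn_t lsb_gfn, unsigned long *bitmap)
  {
          return __test_and_change_bit(lsb_gfn - gfn, bitmap);
  }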

Comments

Sean Christopherson Feb. 23, 2023, 5:13 p.m. UTC | #1
On Thu, Feb 16, 2023, Yu Zhao wrote:
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 9c60384b5ae0..1b465df4a93d 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -875,6 +875,63 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
>  	return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
>  }
>  
> +static bool kvm_test_clear_young(struct kvm *kvm, unsigned long start,
> +				 unsigned long end, unsigned long *bitmap)
> +{
> +	int i;
> +	int key;
> +	bool success = true;
> +
> +	trace_kvm_age_hva(start, end);
> +
> +	key = srcu_read_lock(&kvm->srcu);
> +
> +	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
> +		struct interval_tree_node *node;
> +		struct kvm_memslots *slots = __kvm_memslots(kvm, i);
> +
> +		kvm_for_each_memslot_in_hva_range(node, slots, start, end - 1) {
> +			gfn_t lsb_gfn;
> +			unsigned long hva_start, hva_end;
> +			struct kvm_gfn_range range = {
> +				.slot = container_of(node, struct kvm_memory_slot,
> +						     hva_node[slots->node_idx]),
> +			};
> +
> +			hva_start = max(start, range.slot->userspace_addr);
> +			hva_end = min(end - 1, range.slot->userspace_addr +
> +					       range.slot->npages * PAGE_SIZE - 1);
> +
> +			range.start = hva_to_gfn_memslot(hva_start, range.slot);
> +			range.end = hva_to_gfn_memslot(hva_end, range.slot) + 1;
> +
> +			if (WARN_ON_ONCE(range.end <= range.start))
> +				continue;

Extend __kvm_handle_hva_range() instead of copy-pasting.  At a very quick glance,
I believe all that is needed is (minus sanity checks):

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d255964ec331..3296ae2cf6fa 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -544,6 +544,7 @@ struct kvm_hva_range {
        hva_handler_t handler;
        on_lock_fn_t on_lock;
        on_unlock_fn_t on_unlock;
+       bool lockless;
        bool flush_on_ret;
        bool may_block;
 };
@@ -616,7 +617,7 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
                        gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
                        gfn_range.slot = slot;
 
-                       if (!locked) {
+                       if (!range->lockless && !locked) {
                                locked = true;
                                KVM_MMU_LOCK(kvm);
                                if (!IS_KVM_NULL_FN(range->on_lock))

> +
> +			/* see the comments on the generic kvm_arch_has_test_clear_young() */
> +			lsb_gfn = hva_to_gfn_memslot(end - 1, range.slot);
> +
> +			success = kvm_arch_test_clear_young(kvm, &range, lsb_gfn, bitmap);
> +			if (!success)
> +				break;
> +		}
> +	}
> +
> +	srcu_read_unlock(&kvm->srcu, key);
> +
> +	return success;
> +}
Sean Christopherson Feb. 23, 2023, 5:34 p.m. UTC | #2
On Thu, Feb 16, 2023, Yu Zhao wrote:
> +static bool kvm_mmu_notifier_test_clear_young(struct mmu_notifier *mn, struct mm_struct *mm,
> +					      unsigned long start, unsigned long end,
> +					      unsigned long *bitmap)
> +{
> +	if (kvm_arch_has_test_clear_young())
> +		return kvm_test_clear_young(mmu_notifier_to_kvm(mn), start, end, bitmap);
> +
> +	return false;
> +}
> +
>  static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
>  				       struct mm_struct *mm,
>  				       unsigned long address)
> @@ -903,6 +960,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
>  	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
>  	.clear_young		= kvm_mmu_notifier_clear_young,
>  	.test_young		= kvm_mmu_notifier_test_young,
> +	.test_clear_young	= kvm_mmu_notifier_test_clear_young,

I am strongly opposed to adding yet another "young" mmu_notifier hook for this.
I would much rather we extend (and rename) mmu_notifier_clear_young() to take the
bitmap as an optional parameter, and then have KVM hide the details of whether or
not it supports lockless processing of the range/bitmap.

I also think for KVM x86 in particular, this series should first convert to a
lockless walk for aging ranges, and then add the batching.  It might also make
sense to land x86 first and then follow-up with ARM and PPC.  I haven't looked at
the ARM or PPC patches in too much depth, but on the x86 side of things KVM already
has the pieces in place to support a fully lockless walk, i.e. the x86 specific
changes aren't all that contentious, the only thing we need to figure out is what
to do about nested VMs.
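
For concreteness, the extension suggested above might look roughly like the
following (a hypothetical signature, not an actual API in this series or in
mainline):

  /* A NULL bitmap would keep today's behaviour; a non-NULL bitmap would ask
   * the secondary MMU to batch the results and, where possible, go lockless. */
  int (*clear_young)(struct mmu_notifier *mn, struct mm_struct *mm,
                     unsigned long start, unsigned long end,
                     unsigned long *bitmap);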
Yu Zhao Feb. 23, 2023, 5:40 p.m. UTC | #3
On Thu, Feb 23, 2023 at 10:14 AM Sean Christopherson <seanjc@google.com> wrote:
>
> On Thu, Feb 16, 2023, Yu Zhao wrote:
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index 9c60384b5ae0..1b465df4a93d 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -875,6 +875,63 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
> >       return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
> >  }
> >
> > +static bool kvm_test_clear_young(struct kvm *kvm, unsigned long start,
> > +                              unsigned long end, unsigned long *bitmap)
> > +{
> > +     int i;
> > +     int key;
> > +     bool success = true;
> > +
> > +     trace_kvm_age_hva(start, end);
> > +
> > +     key = srcu_read_lock(&kvm->srcu);
> > +
> > +     for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
> > +             struct interval_tree_node *node;
> > +             struct kvm_memslots *slots = __kvm_memslots(kvm, i);
> > +
> > +             kvm_for_each_memslot_in_hva_range(node, slots, start, end - 1) {
> > +                     gfn_t lsb_gfn;
> > +                     unsigned long hva_start, hva_end;
> > +                     struct kvm_gfn_range range = {
> > +                             .slot = container_of(node, struct kvm_memory_slot,
> > +                                                  hva_node[slots->node_idx]),
> > +                     };
> > +
> > +                     hva_start = max(start, range.slot->userspace_addr);
> > +                     hva_end = min(end - 1, range.slot->userspace_addr +
> > +                                            range.slot->npages * PAGE_SIZE - 1);
> > +
> > +                     range.start = hva_to_gfn_memslot(hva_start, range.slot);
> > +                     range.end = hva_to_gfn_memslot(hva_end, range.slot) + 1;
> > +
> > +                     if (WARN_ON_ONCE(range.end <= range.start))
> > +                             continue;
>
> Extend __kvm_handle_hva_range() instead of copy-pasting.  At a very quick glance,
> I believe all that is needed is (minus sanity checks):

Yes, will do.

I do need to add one more parameter to kvm_gfn_range, because that's
what the current kvm_arch_test_clear_young() needs, assuming that
function is acceptable.

Also, just a side note, from MM's POV, the following in
__kvm_handle_hva_range() seems to forget to handle end == 0, if that's
possible?

  hva_end = min(range->end, slot->userspace_addr + (slot->npages << PAGE_SHIFT));

> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index d255964ec331..3296ae2cf6fa 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -544,6 +544,7 @@ struct kvm_hva_range {
>         hva_handler_t handler;
>         on_lock_fn_t on_lock;
>         on_unlock_fn_t on_unlock;
> +       bool lockless;
>         bool flush_on_ret;
>         bool may_block;
>  };
> @@ -616,7 +617,7 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
>                         gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
>                         gfn_range.slot = slot;
>
> -                       if (!locked) {
> +                       if (!range->lockless && !locked) {
>                                 locked = true;
>                                 KVM_MMU_LOCK(kvm);
>                                 if (!IS_KVM_NULL_FN(range->on_lock))
>
> > +
> > +                     /* see the comments on the generic kvm_arch_has_test_clear_young() */
> > +                     lsb_gfn = hva_to_gfn_memslot(end - 1, range.slot);
> > +
> > +                     success = kvm_arch_test_clear_young(kvm, &range, lsb_gfn, bitmap);
> > +                     if (!success)
> > +                             break;
> > +             }
> > +     }
> > +
> > +     srcu_read_unlock(&kvm->srcu, key);
> > +
> > +     return success;
> > +}
Sean Christopherson Feb. 23, 2023, 9:12 p.m. UTC | #4
On Thu, Feb 23, 2023, Yu Zhao wrote:
> On Thu, Feb 23, 2023 at 10:14 AM Sean Christopherson <seanjc@google.com> wrote:
> >
> > On Thu, Feb 16, 2023, Yu Zhao wrote:
> > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > > index 9c60384b5ae0..1b465df4a93d 100644
> > > --- a/virt/kvm/kvm_main.c
> > > +++ b/virt/kvm/kvm_main.c
> > > @@ -875,6 +875,63 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
> > >       return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
> > >  }
> > >
> > > +static bool kvm_test_clear_young(struct kvm *kvm, unsigned long start,
> > > +                              unsigned long end, unsigned long *bitmap)
> > > +{
> > > +     int i;
> > > +     int key;
> > > +     bool success = true;
> > > +
> > > +     trace_kvm_age_hva(start, end);
> > > +
> > > +     key = srcu_read_lock(&kvm->srcu);
> > > +
> > > +     for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
> > > +             struct interval_tree_node *node;
> > > +             struct kvm_memslots *slots = __kvm_memslots(kvm, i);
> > > +
> > > +             kvm_for_each_memslot_in_hva_range(node, slots, start, end - 1) {
> > > +                     gfn_t lsb_gfn;
> > > +                     unsigned long hva_start, hva_end;
> > > +                     struct kvm_gfn_range range = {
> > > +                             .slot = container_of(node, struct kvm_memory_slot,
> > > +                                                  hva_node[slots->node_idx]),
> > > +                     };
> > > +
> > > +                     hva_start = max(start, range.slot->userspace_addr);
> > > +                     hva_end = min(end - 1, range.slot->userspace_addr +
> > > +                                            range.slot->npages * PAGE_SIZE - 1);
> > > +
> > > +                     range.start = hva_to_gfn_memslot(hva_start, range.slot);
> > > +                     range.end = hva_to_gfn_memslot(hva_end, range.slot) + 1;
> > > +
> > > +                     if (WARN_ON_ONCE(range.end <= range.start))
> > > +                             continue;
> >
> > Extend __kvm_handle_hva_range() instead of copy-pasting.  At a very quick glance,
> > I believe all that is needed is (minus sanity checks):
> 
> Yes, will do.
> 
> I do need to add one more parameter to kvm_gfn_range, because that's
> what the current kvm_arch_test_clear_young() needs, assuming that
> function is acceptable.
> 
> Also, just a side note, from MM's POV, the following in
> __kvm_handle_hva_range() seems to forget to handle end == 0, if that's
> possible?

It's handled by the WARN_ON_ONCE() at the very top:

static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
						  const struct kvm_hva_range *range)
{
	if (WARN_ON_ONCE(range->end <= range->start))
		return 0;


> 
>   hva_end = min(range->end, slot->userspace_addr + (slot->npages << PAGE_SHIFT));

Patch

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 4f26b244f6d0..df46fc815c8b 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2281,4 +2281,33 @@  static inline void kvm_account_pgtable_pages(void *virt, int nr)
 /* Max number of entries allowed for each kvm dirty ring */
 #define  KVM_DIRTY_RING_MAX_ENTRIES  65536
 
+/*
+ * Architectures that implement kvm_arch_test_clear_young() should override
+ * kvm_arch_has_test_clear_young().
+ *
+ * kvm_arch_has_test_clear_young() is allowed to return false positive. It can
+ * return true if kvm_arch_test_clear_young() is supported but disabled due to
+ * some runtime constraint. In this case, kvm_arch_test_clear_young() should
+ * return false.
+ *
+ * The last parameter to kvm_arch_test_clear_young() is a bitmap with the
+ * following specifications:
+ * 1. The offset of each bit is relative to the second to the last parameter
+ *    lsb_gfn. E.g., the offset corresponding to gfn is lsb_gfn-gfn. This is to
+ *    better suit batching while forward looping.
+ * 2. For each KVM PTE with the accessed bit set, the implementation should flip
+ *    the corresponding bit in the bitmap. It should only clear the accessed bit
+ *    if the old value is 1. This allows the caller to test or test and clear
+ *    the accessed bit.
+ */
+#ifndef kvm_arch_has_test_clear_young
+static inline bool kvm_arch_has_test_clear_young(void)
+{
+	return false;
+}
+#endif
+
+bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range,
+			       gfn_t lsb_gfn, unsigned long *bitmap);
+
 #endif
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 64a3e051c3c4..432b51cd6843 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -122,6 +122,11 @@  struct mmu_notifier_ops {
 			  struct mm_struct *mm,
 			  unsigned long address);
 
+	/* see the comments on mmu_notifier_test_clear_young() */
+	bool (*test_clear_young)(struct mmu_notifier *mn, struct mm_struct *mm,
+				 unsigned long start, unsigned long end,
+				 unsigned long *bitmap);
+
 	/*
 	 * change_pte is called in cases that pte mapping to page is changed:
 	 * for example, when ksm remaps pte to point to a new shared page.
@@ -390,6 +395,9 @@  extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
 extern int __mmu_notifier_clear_young(struct mm_struct *mm,
 				      unsigned long start,
 				      unsigned long end);
+extern int __mmu_notifier_test_clear_young(struct mm_struct *mm,
+					   unsigned long start, unsigned long end,
+					   bool fallback, unsigned long *bitmap);
 extern int __mmu_notifier_test_young(struct mm_struct *mm,
 				     unsigned long address);
 extern void __mmu_notifier_change_pte(struct mm_struct *mm,
@@ -432,6 +440,31 @@  static inline int mmu_notifier_clear_young(struct mm_struct *mm,
 	return 0;
 }
 
+/*
+ * This function always returns 0 if fallback is not allowed. If fallback
+ * happens, its return value is similar to that of mmu_notifier_clear_young().
+ *
+ * The bitmap has the following specifications:
+ * 1. The number of bits should be at least (end-start)/PAGE_SIZE.
+ * 2. The offset of each bit is relative to the end. E.g., the offset
+ *    corresponding to addr is (end-addr)/PAGE_SIZE-1. This is to better suit
+ *    batching while forward looping.
+ * 3. For each KVM PTE with the accessed bit set (young), this function flips
+ *    the corresponding bit in the bitmap. It only clears the accessed bit if
+ *    the old value is 1. A caller can test or test and clear the accessed bit
+ *    by setting the corresponding bit in the bitmap to 0 or 1, and the new
+ *    value will be 1 or 0 for a young KVM PTE.
+ */
+static inline int mmu_notifier_test_clear_young(struct mm_struct *mm,
+						unsigned long start, unsigned long end,
+						bool fallback, unsigned long *bitmap)
+{
+	if (mm_has_notifiers(mm))
+		return __mmu_notifier_test_clear_young(mm, start, end, fallback, bitmap);
+
+	return 0;
+}
+
 static inline int mmu_notifier_test_young(struct mm_struct *mm,
 					  unsigned long address)
 {
@@ -684,6 +717,13 @@  static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
 	return 0;
 }
 
+static inline int mmu_notifier_test_clear_young(struct mm_struct *mm,
+						unsigned long start, unsigned long end,
+						bool fallback, unsigned long *bitmap)
+{
+	return 0;
+}
+
 static inline int mmu_notifier_test_young(struct mm_struct *mm,
 					  unsigned long address)
 {
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 50c0dde1354f..dd39b9b4d6d3 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -402,6 +402,32 @@  int __mmu_notifier_clear_young(struct mm_struct *mm,
 	return young;
 }
 
+/* see the comments on mmu_notifier_test_clear_young() */
+int __mmu_notifier_test_clear_young(struct mm_struct *mm,
+				    unsigned long start, unsigned long end,
+				    bool fallback, unsigned long *bitmap)
+{
+	int key;
+	struct mmu_notifier *mn;
+	int young = 0;
+
+	key = srcu_read_lock(&srcu);
+
+	hlist_for_each_entry_srcu(mn, &mm->notifier_subscriptions->list,
+				  hlist, srcu_read_lock_held(&srcu)) {
+		if (mn->ops->test_clear_young &&
+		    mn->ops->test_clear_young(mn, mm, start, end, bitmap))
+			continue;
+
+		if (fallback && mn->ops->clear_young)
+			young |= mn->ops->clear_young(mn, mm, start, end);
+	}
+
+	srcu_read_unlock(&srcu, key);
+
+	return young;
+}
+
 int __mmu_notifier_test_young(struct mm_struct *mm,
 			      unsigned long address)
 {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 9c60384b5ae0..1b465df4a93d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -875,6 +875,63 @@  static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
 	return kvm_handle_hva_range_no_flush(mn, start, end, kvm_age_gfn);
 }
 
+static bool kvm_test_clear_young(struct kvm *kvm, unsigned long start,
+				 unsigned long end, unsigned long *bitmap)
+{
+	int i;
+	int key;
+	bool success = true;
+
+	trace_kvm_age_hva(start, end);
+
+	key = srcu_read_lock(&kvm->srcu);
+
+	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+		struct interval_tree_node *node;
+		struct kvm_memslots *slots = __kvm_memslots(kvm, i);
+
+		kvm_for_each_memslot_in_hva_range(node, slots, start, end - 1) {
+			gfn_t lsb_gfn;
+			unsigned long hva_start, hva_end;
+			struct kvm_gfn_range range = {
+				.slot = container_of(node, struct kvm_memory_slot,
+						     hva_node[slots->node_idx]),
+			};
+
+			hva_start = max(start, range.slot->userspace_addr);
+			hva_end = min(end - 1, range.slot->userspace_addr +
+					       range.slot->npages * PAGE_SIZE - 1);
+
+			range.start = hva_to_gfn_memslot(hva_start, range.slot);
+			range.end = hva_to_gfn_memslot(hva_end, range.slot) + 1;
+
+			if (WARN_ON_ONCE(range.end <= range.start))
+				continue;
+
+			/* see the comments on the generic kvm_arch_has_test_clear_young() */
+			lsb_gfn = hva_to_gfn_memslot(end - 1, range.slot);
+
+			success = kvm_arch_test_clear_young(kvm, &range, lsb_gfn, bitmap);
+			if (!success)
+				break;
+		}
+	}
+
+	srcu_read_unlock(&kvm->srcu, key);
+
+	return success;
+}
+
+static bool kvm_mmu_notifier_test_clear_young(struct mmu_notifier *mn, struct mm_struct *mm,
+					      unsigned long start, unsigned long end,
+					      unsigned long *bitmap)
+{
+	if (kvm_arch_has_test_clear_young())
+		return kvm_test_clear_young(mmu_notifier_to_kvm(mn), start, end, bitmap);
+
+	return false;
+}
+
 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
 				       struct mm_struct *mm,
 				       unsigned long address)
@@ -903,6 +960,7 @@  static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
 	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
 	.clear_young		= kvm_mmu_notifier_clear_young,
 	.test_young		= kvm_mmu_notifier_test_young,
+	.test_clear_young	= kvm_mmu_notifier_test_clear_young,
 	.change_pte		= kvm_mmu_notifier_change_pte,
 	.release		= kvm_mmu_notifier_release,
 };