[v2,20/26] KVM: x86/mmu: Extend Eager Page Splitting to the shadow MMU

Message ID 20220311002528.2230172-21-dmatlack@google.com
State: Changes Requested
Series: Extend Eager Page Splitting to the shadow MMU

Commit Message

David Matlack March 11, 2022, 12:25 a.m. UTC
Extend KVM's eager page splitting to also split huge pages that are
mapped by the shadow MMU. Specifically, walk through the rmap splitting
all 1GiB pages to 2MiB pages, and splitting all 2MiB pages to 4KiB
pages.

Splitting huge pages mapped by the shadow MMU requires dealing with some
extra complexity beyond that of the TDP MMU:

(1) The shadow MMU has a limit on the number of shadow pages that are
    allowed to be allocated. So, as a policy, Eager Page Splitting
    refuses to split if there are KVM_MIN_FREE_MMU_PAGES or fewer
    pages available.

(2) Huge pages may be mapped by indirect shadow pages, which can be
    unsync. As a policy, we opt not to split such pages, as their
    translation may no longer be valid.

(3) Splitting a huge page may end up re-using an existing lower-level
    shadow page table. This is unlike the TDP MMU, which always allocates
    new shadow page tables when splitting. This commit does *not*
    handle such aliasing and opts not to split such huge pages.

(4) When installing the lower-level SPTEs, they must be added to the
    rmap, which may require allocating additional pte_list_desc structs.
    This commit does *not* handle such cases and instead opts to leave
    such lower-level SPTEs non-present. In this situation TLBs must be
    flushed before dropping the MMU lock as a portion of the huge page
    region is being unmapped.

Suggested-by: Peter Feiner <pfeiner@google.com>
[ This commit is based off of the original implementation of Eager Page
  Splitting from Peter in Google's kernel from 2016. ]
Signed-off-by: David Matlack <dmatlack@google.com>
---
 .../admin-guide/kernel-parameters.txt         |   3 -
 arch/x86/kvm/mmu/mmu.c                        | 307 ++++++++++++++++++
 2 files changed, 307 insertions(+), 3 deletions(-)

Comments

Peter Xu March 16, 2022, 10:26 a.m. UTC | #1
On Fri, Mar 11, 2022 at 12:25:22AM +0000, David Matlack wrote:
> Extend KVM's eager page splitting to also split huge pages that are
> mapped by the shadow MMU. Specifically, walk through the rmap splitting
> all 1GiB pages to 2MiB pages, and splitting all 2MiB pages to 4KiB
> pages.
> 
> Splitting huge pages mapped by the shadow MMU requries dealing with some
> extra complexity beyond that of the TDP MMU:
> 
> (1) The shadow MMU has a limit on the number of shadow pages that are
>     allowed to be allocated. So, as a policy, Eager Page Splitting
>     refuses to split if there are KVM_MIN_FREE_MMU_PAGES or fewer
>     pages available.
> 
> (2) Huge pages may be mapped by indirect shadow pages which have the
>     possibility of being unsync. As a policy we opt not to split such
>     pages as their translation may no longer be valid.
> 
> (3) Splitting a huge page may end up re-using an existing lower level
>     shadow page tables. This is unlike the TDP MMU which always allocates
>     new shadow page tables when splitting.  This commit does *not*
>     handle such aliasing and opts not to split such huge pages.
> 
> (4) When installing the lower level SPTEs, they must be added to the
>     rmap which may require allocating additional pte_list_desc structs.
>     This commit does *not* handle such cases and instead opts to leave
>     such lower-level SPTEs non-present. In this situation TLBs must be
>     flushed before dropping the MMU lock as a portion of the huge page
>     region is being unmapped.
> 
> Suggested-by: Peter Feiner <pfeiner@google.com>
> [ This commit is based off of the original implementation of Eager Page
>   Splitting from Peter in Google's kernel from 2016. ]
> Signed-off-by: David Matlack <dmatlack@google.com>
> ---
>  .../admin-guide/kernel-parameters.txt         |   3 -
>  arch/x86/kvm/mmu/mmu.c                        | 307 ++++++++++++++++++
>  2 files changed, 307 insertions(+), 3 deletions(-)
> 
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index 05161afd7642..495f6ac53801 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -2360,9 +2360,6 @@
>  			the KVM_CLEAR_DIRTY ioctl, and only for the pages being
>  			cleared.
>  
> -			Eager page splitting currently only supports splitting
> -			huge pages mapped by the TDP MMU.
> -
>  			Default is Y (on).
>  
>  	kvm.enable_vmware_backdoor=[KVM] Support VMware backdoor PV interface.
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 926ddfaa9e1a..dd56b5b9624f 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -727,6 +727,11 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
>  
>  static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_mmu_memory_cache *cache)
>  {
> +	static const gfp_t gfp_nocache = GFP_ATOMIC | __GFP_ACCOUNT | __GFP_ZERO;
> +
> +	if (WARN_ON_ONCE(!cache))
> +		return kmem_cache_alloc(pte_list_desc_cache, gfp_nocache);
> +

I also think this doesn't really belong in this patch.  Maybe it'd be more
suitable for the earlier rmap_add() rework patch, or maybe it can be dropped
entirely if it should never trigger at all; then we'd just die hard below
when dereferencing the NULL cache.

>  	return kvm_mmu_memory_cache_alloc(cache);
>  }
>  
> @@ -743,6 +748,28 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
>  	return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
>  }
>  
> +static gfn_t sptep_to_gfn(u64 *sptep)
> +{
> +	struct kvm_mmu_page *sp = sptep_to_sp(sptep);
> +
> +	return kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
> +}
> +
> +static unsigned int kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index)
> +{
> +	if (!sp->role.direct)
> +		return sp->shadowed_translation[index].access;
> +
> +	return sp->role.access;
> +}
> +
> +static unsigned int sptep_to_access(u64 *sptep)
> +{
> +	struct kvm_mmu_page *sp = sptep_to_sp(sptep);
> +
> +	return kvm_mmu_page_get_access(sp, sptep - sp->spt);
> +}
> +
>  static void kvm_mmu_page_set_gfn_access(struct kvm_mmu_page *sp, int index,
>  					gfn_t gfn, u32 access)
>  {
> @@ -912,6 +939,9 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
>  	return count;
>  }
>  
> +static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level,
> +					 const struct kvm_memory_slot *slot);
> +
>  static void
>  pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
>  			   struct pte_list_desc *desc, int i,
> @@ -2125,6 +2155,23 @@ static struct kvm_mmu_page *__kvm_mmu_find_shadow_page(struct kvm *kvm,
>  	return sp;
>  }
>  
> +static struct kvm_mmu_page *kvm_mmu_find_direct_sp(struct kvm *kvm, gfn_t gfn,
> +						   union kvm_mmu_page_role role)
> +{
> +	struct kvm_mmu_page *sp;
> +	LIST_HEAD(invalid_list);
> +
> +	BUG_ON(!role.direct);
> +
> +	sp = __kvm_mmu_find_shadow_page(kvm, gfn, role, &invalid_list);
> +
> +	/* Direct SPs are never unsync. */
> +	WARN_ON_ONCE(sp && sp->unsync);
> +
> +	kvm_mmu_commit_zap_page(kvm, &invalid_list);
> +	return sp;
> +}
> +
>  /*
>   * Looks up an existing SP for the given gfn and role if one exists. The
>   * return SP is guaranteed to be synced.
> @@ -6063,12 +6110,266 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
>  		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
>  }
>  
> +static int prepare_to_split_huge_page(struct kvm *kvm,
> +				      const struct kvm_memory_slot *slot,
> +				      u64 *huge_sptep,
> +				      struct kvm_mmu_page **spp,
> +				      bool *flush,
> +				      bool *dropped_lock)
> +{
> +	int r = 0;
> +
> +	*dropped_lock = false;
> +
> +	if (kvm_mmu_available_pages(kvm) <= KVM_MIN_FREE_MMU_PAGES)
> +		return -ENOSPC;
> +
> +	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock))
> +		goto drop_lock;
> +

It's not immediately clear whether there's any case in which *spp is already
set when entering this function.  Some sanity check might be nice?

> +	*spp = kvm_mmu_alloc_direct_sp_for_split(true);
> +	if (r)
> +		goto drop_lock;
> +
> +	return 0;
> +
> +drop_lock:
> +	if (*flush)
> +		kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
> +
> +	*flush = false;
> +	*dropped_lock = true;
> +
> +	write_unlock(&kvm->mmu_lock);
> +	cond_resched();
> +	*spp = kvm_mmu_alloc_direct_sp_for_split(false);
> +	if (!*spp)
> +		r = -ENOMEM;
> +	write_lock(&kvm->mmu_lock);
> +
> +	return r;
> +}
> +
> +static struct kvm_mmu_page *kvm_mmu_get_sp_for_split(struct kvm *kvm,
> +						     const struct kvm_memory_slot *slot,
> +						     u64 *huge_sptep,
> +						     struct kvm_mmu_page **spp)
> +{
> +	struct kvm_mmu_page *split_sp;
> +	union kvm_mmu_page_role role;
> +	unsigned int access;
> +	gfn_t gfn;
> +
> +	gfn = sptep_to_gfn(huge_sptep);
> +	access = sptep_to_access(huge_sptep);
> +
> +	/*
> +	 * Huge page splitting always uses direct shadow pages since we are
> +	 * directly mapping the huge page GFN region with smaller pages.
> +	 */
> +	role = kvm_mmu_child_role(huge_sptep, true, access);
> +	split_sp = kvm_mmu_find_direct_sp(kvm, gfn, role);
> +
> +	/*
> +	 * Opt not to split if the lower-level SP already exists. This requires
> +	 * more complex handling as the SP may be already partially filled in
> +	 * and may need extra pte_list_desc structs to update parent_ptes.
> +	 */
> +	if (split_sp)
> +		return NULL;

This smells tricky..

Firstly, we're trying to look up an existing SP that already shadows this huge
page at the split level, with the access bits fetched from the shadowed
translation cache (so without the huge-page NX effect).  However, could those
pages be mapped with different permissions than the currently huge-mapped page?

IIUC all of this stems from the fact that we can't allocate pte_list_desc
structs here and want to make sure we never grow a pte list beyond one entry.

But I also see that the pte_list check below...

> +
> +	swap(split_sp, *spp);
> +	init_shadow_page(kvm, split_sp, slot, gfn, role);
> +	trace_kvm_mmu_get_page(split_sp, true);
> +
> +	return split_sp;
> +}
> +
> +static int kvm_mmu_split_huge_page(struct kvm *kvm,
> +				   const struct kvm_memory_slot *slot,
> +				   u64 *huge_sptep, struct kvm_mmu_page **spp,
> +				   bool *flush)
> +
> +{
> +	struct kvm_mmu_page *split_sp;
> +	u64 huge_spte, split_spte;
> +	int split_level, index;
> +	unsigned int access;
> +	u64 *split_sptep;
> +	gfn_t split_gfn;
> +
> +	split_sp = kvm_mmu_get_sp_for_split(kvm, slot, huge_sptep, spp);
> +	if (!split_sp)
> +		return -EOPNOTSUPP;
> +
> +	/*
> +	 * Since we did not allocate pte_list_desc_structs for the split, we
> +	 * cannot add a new parent SPTE to parent_ptes. This should never happen
> +	 * in practice though since this is a fresh SP.
> +	 *
> +	 * Note, this makes it safe to pass NULL to __link_shadow_page() below.
> +	 */
> +	if (WARN_ON_ONCE(split_sp->parent_ptes.val))
> +		return -EINVAL;
> +
> +	huge_spte = READ_ONCE(*huge_sptep);
> +
> +	split_level = split_sp->role.level;
> +	access = split_sp->role.access;
> +
> +	for (index = 0; index < PT64_ENT_PER_PAGE; index++) {
> +		split_sptep = &split_sp->spt[index];
> +		split_gfn = kvm_mmu_page_get_gfn(split_sp, index);
> +
> +		BUG_ON(is_shadow_present_pte(*split_sptep));
> +
> +		/*
> +		 * Since we did not allocate pte_list_desc structs for the
> +		 * split, we can't add a new SPTE that maps this GFN.
> +		 * Skipping this SPTE means we're only partially mapping the
> +		 * huge page, which means we'll need to flush TLBs before
> +		 * dropping the MMU lock.
> +		 *
> +		 * Note, this make it safe to pass NULL to __rmap_add() below.
> +		 */
> +		if (gfn_to_rmap(split_gfn, split_level, slot)->val) {
> +			*flush = true;
> +			continue;
> +		}

... here.

IIUC this check should already cover all the cases, and it captures precisely
the fact that we don't want to grow any rmap to a length greater than 1.
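
For reference, the zero check works because of how KVM encodes rmap_head->val
(a sketch reconstructed from the existing pte_list_add() code, not something
added by this patch):

	/*
	 * Existing rmap_head->val encoding (see pte_list_add()):
	 *
	 *   val == 0         rmap is empty; one SPTE can be recorded without
	 *                    allocating a pte_list_desc.
	 *   (val & 1) == 0   val is a pointer to the single existing SPTE.
	 *   (val & 1) == 1   val points to a pte_list_desc holding multiple SPTEs.
	 *
	 * So a non-zero val means adding the split SPTE would grow the pte list
	 * beyond one entry, which would require a pte_list_desc allocation that
	 * this patch deliberately avoids.
	 */
	if (gfn_to_rmap(split_gfn, split_level, slot)->val) {
		*flush = true;
		continue;
	}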

> +
> +		split_spte = make_huge_page_split_spte(
> +				huge_spte, split_level + 1, index, access);
> +
> +		mmu_spte_set(split_sptep, split_spte);
> +		__rmap_add(kvm, NULL, slot, split_sptep, split_gfn, access);

__rmap_add() with a NULL cache pointer is weird.. same as
__link_shadow_page() below.

I'll stop here for now, I guess..  Have you considered having the rmap
allocations ready from the start, rather than making this an intermediate step
and only adding that support later?  All of this looks hackish to me..  It's
also possible that I missed something important; if so, please shoot.

Thanks,

> +	}
> +
> +	/*
> +	 * Replace the huge spte with a pointer to the populated lower level
> +	 * page table. Since we are making this change without a TLB flush vCPUs
> +	 * will see a mix of the split mappings and the original huge mapping,
> +	 * depending on what's currently in their TLB. This is fine from a
> +	 * correctness standpoint since the translation will either be identical
> +	 * or non-present. To account for non-present mappings, the TLB will be
> +	 * flushed prior to dropping the MMU lock.
> +	 */
> +	__drop_large_spte(kvm, huge_sptep, false);
> +	__link_shadow_page(NULL, huge_sptep, split_sp);
> +
> +	return 0;
> +}
David Matlack March 22, 2022, 12:07 a.m. UTC | #2
On Wed, Mar 16, 2022 at 3:27 AM Peter Xu <peterx@redhat.com> wrote:
>
> On Fri, Mar 11, 2022 at 12:25:22AM +0000, David Matlack wrote:
> > Extend KVM's eager page splitting to also split huge pages that are
> > mapped by the shadow MMU. Specifically, walk through the rmap splitting
> > all 1GiB pages to 2MiB pages, and splitting all 2MiB pages to 4KiB
> > pages.
> >
> > Splitting huge pages mapped by the shadow MMU requries dealing with some
> > extra complexity beyond that of the TDP MMU:
> >
> > (1) The shadow MMU has a limit on the number of shadow pages that are
> >     allowed to be allocated. So, as a policy, Eager Page Splitting
> >     refuses to split if there are KVM_MIN_FREE_MMU_PAGES or fewer
> >     pages available.
> >
> > (2) Huge pages may be mapped by indirect shadow pages which have the
> >     possibility of being unsync. As a policy we opt not to split such
> >     pages as their translation may no longer be valid.
> >
> > (3) Splitting a huge page may end up re-using an existing lower level
> >     shadow page tables. This is unlike the TDP MMU which always allocates
> >     new shadow page tables when splitting.  This commit does *not*
> >     handle such aliasing and opts not to split such huge pages.
> >
> > (4) When installing the lower level SPTEs, they must be added to the
> >     rmap which may require allocating additional pte_list_desc structs.
> >     This commit does *not* handle such cases and instead opts to leave
> >     such lower-level SPTEs non-present. In this situation TLBs must be
> >     flushed before dropping the MMU lock as a portion of the huge page
> >     region is being unmapped.
> >
> > Suggested-by: Peter Feiner <pfeiner@google.com>
> > [ This commit is based off of the original implementation of Eager Page
> >   Splitting from Peter in Google's kernel from 2016. ]
> > Signed-off-by: David Matlack <dmatlack@google.com>
> > ---
> >  .../admin-guide/kernel-parameters.txt         |   3 -
> >  arch/x86/kvm/mmu/mmu.c                        | 307 ++++++++++++++++++
> >  2 files changed, 307 insertions(+), 3 deletions(-)
> >
> > diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> > index 05161afd7642..495f6ac53801 100644
> > --- a/Documentation/admin-guide/kernel-parameters.txt
> > +++ b/Documentation/admin-guide/kernel-parameters.txt
> > @@ -2360,9 +2360,6 @@
> >                       the KVM_CLEAR_DIRTY ioctl, and only for the pages being
> >                       cleared.
> >
> > -                     Eager page splitting currently only supports splitting
> > -                     huge pages mapped by the TDP MMU.
> > -
> >                       Default is Y (on).
> >
> >       kvm.enable_vmware_backdoor=[KVM] Support VMware backdoor PV interface.
> > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > index 926ddfaa9e1a..dd56b5b9624f 100644
> > --- a/arch/x86/kvm/mmu/mmu.c
> > +++ b/arch/x86/kvm/mmu/mmu.c
> > @@ -727,6 +727,11 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
> >
> >  static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_mmu_memory_cache *cache)
> >  {
> > +     static const gfp_t gfp_nocache = GFP_ATOMIC | __GFP_ACCOUNT | __GFP_ZERO;
> > +
> > +     if (WARN_ON_ONCE(!cache))
> > +             return kmem_cache_alloc(pte_list_desc_cache, gfp_nocache);
> > +
>
> I also think this is not proper to be added into this patch.  Maybe it'll
> be more suitable for the rmap_add() rework patch previously, or maybe it
> can be dropped directly if it should never trigger at all. Then we die hard
> at below when referencing it.
>
> >       return kvm_mmu_memory_cache_alloc(cache);
> >  }
> >
> > @@ -743,6 +748,28 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
> >       return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
> >  }
> >
> > +static gfn_t sptep_to_gfn(u64 *sptep)
> > +{
> > +     struct kvm_mmu_page *sp = sptep_to_sp(sptep);
> > +
> > +     return kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
> > +}
> > +
> > +static unsigned int kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index)
> > +{
> > +     if (!sp->role.direct)
> > +             return sp->shadowed_translation[index].access;
> > +
> > +     return sp->role.access;
> > +}
> > +
> > +static unsigned int sptep_to_access(u64 *sptep)
> > +{
> > +     struct kvm_mmu_page *sp = sptep_to_sp(sptep);
> > +
> > +     return kvm_mmu_page_get_access(sp, sptep - sp->spt);
> > +}
> > +
> >  static void kvm_mmu_page_set_gfn_access(struct kvm_mmu_page *sp, int index,
> >                                       gfn_t gfn, u32 access)
> >  {
> > @@ -912,6 +939,9 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
> >       return count;
> >  }
> >
> > +static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level,
> > +                                      const struct kvm_memory_slot *slot);
> > +
> >  static void
> >  pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
> >                          struct pte_list_desc *desc, int i,
> > @@ -2125,6 +2155,23 @@ static struct kvm_mmu_page *__kvm_mmu_find_shadow_page(struct kvm *kvm,
> >       return sp;
> >  }
> >
> > +static struct kvm_mmu_page *kvm_mmu_find_direct_sp(struct kvm *kvm, gfn_t gfn,
> > +                                                union kvm_mmu_page_role role)
> > +{
> > +     struct kvm_mmu_page *sp;
> > +     LIST_HEAD(invalid_list);
> > +
> > +     BUG_ON(!role.direct);
> > +
> > +     sp = __kvm_mmu_find_shadow_page(kvm, gfn, role, &invalid_list);
> > +
> > +     /* Direct SPs are never unsync. */
> > +     WARN_ON_ONCE(sp && sp->unsync);
> > +
> > +     kvm_mmu_commit_zap_page(kvm, &invalid_list);
> > +     return sp;
> > +}
> > +
> >  /*
> >   * Looks up an existing SP for the given gfn and role if one exists. The
> >   * return SP is guaranteed to be synced.
> > @@ -6063,12 +6110,266 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
> >               kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
> >  }
> >
> > +static int prepare_to_split_huge_page(struct kvm *kvm,
> > +                                   const struct kvm_memory_slot *slot,
> > +                                   u64 *huge_sptep,
> > +                                   struct kvm_mmu_page **spp,
> > +                                   bool *flush,
> > +                                   bool *dropped_lock)
> > +{
> > +     int r = 0;
> > +
> > +     *dropped_lock = false;
> > +
> > +     if (kvm_mmu_available_pages(kvm) <= KVM_MIN_FREE_MMU_PAGES)
> > +             return -ENOSPC;
> > +
> > +     if (need_resched() || rwlock_needbreak(&kvm->mmu_lock))
> > +             goto drop_lock;
> > +
>
> Not immediately clear on whether there'll be case that *spp is set within
> the current function.  Some sanity check might be nice?
>
> > +     *spp = kvm_mmu_alloc_direct_sp_for_split(true);
> > +     if (r)
> > +             goto drop_lock;
> > +
> > +     return 0;
> > +
> > +drop_lock:
> > +     if (*flush)
> > +             kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
> > +
> > +     *flush = false;
> > +     *dropped_lock = true;
> > +
> > +     write_unlock(&kvm->mmu_lock);
> > +     cond_resched();
> > +     *spp = kvm_mmu_alloc_direct_sp_for_split(false);
> > +     if (!*spp)
> > +             r = -ENOMEM;
> > +     write_lock(&kvm->mmu_lock);
> > +
> > +     return r;
> > +}
> > +
> > +static struct kvm_mmu_page *kvm_mmu_get_sp_for_split(struct kvm *kvm,
> > +                                                  const struct kvm_memory_slot *slot,
> > +                                                  u64 *huge_sptep,
> > +                                                  struct kvm_mmu_page **spp)
> > +{
> > +     struct kvm_mmu_page *split_sp;
> > +     union kvm_mmu_page_role role;
> > +     unsigned int access;
> > +     gfn_t gfn;
> > +
> > +     gfn = sptep_to_gfn(huge_sptep);
> > +     access = sptep_to_access(huge_sptep);
> > +
> > +     /*
> > +      * Huge page splitting always uses direct shadow pages since we are
> > +      * directly mapping the huge page GFN region with smaller pages.
> > +      */
> > +     role = kvm_mmu_child_role(huge_sptep, true, access);
> > +     split_sp = kvm_mmu_find_direct_sp(kvm, gfn, role);
> > +
> > +     /*
> > +      * Opt not to split if the lower-level SP already exists. This requires
> > +      * more complex handling as the SP may be already partially filled in
> > +      * and may need extra pte_list_desc structs to update parent_ptes.
> > +      */
> > +     if (split_sp)
> > +             return NULL;
>
> This smells tricky..
>
> Firstly we're trying to lookup the existing SPs that has shadowed this huge
> page in split way, with the access bits fetched from the shadow cache (so
> without hugepage nx effect).  However could the pages be mapped with
> different permissions from the currently hugely mapped page?
>
> IIUC all these is for the fact that we can't allocate pte_list_desc and we
> want to make sure we won't make the pte list to be >1.
>
> But I also see that the pte_list check below...
>
> > +
> > +     swap(split_sp, *spp);
> > +     init_shadow_page(kvm, split_sp, slot, gfn, role);
> > +     trace_kvm_mmu_get_page(split_sp, true);
> > +
> > +     return split_sp;
> > +}
> > +
> > +static int kvm_mmu_split_huge_page(struct kvm *kvm,
> > +                                const struct kvm_memory_slot *slot,
> > +                                u64 *huge_sptep, struct kvm_mmu_page **spp,
> > +                                bool *flush)
> > +
> > +{
> > +     struct kvm_mmu_page *split_sp;
> > +     u64 huge_spte, split_spte;
> > +     int split_level, index;
> > +     unsigned int access;
> > +     u64 *split_sptep;
> > +     gfn_t split_gfn;
> > +
> > +     split_sp = kvm_mmu_get_sp_for_split(kvm, slot, huge_sptep, spp);
> > +     if (!split_sp)
> > +             return -EOPNOTSUPP;
> > +
> > +     /*
> > +      * Since we did not allocate pte_list_desc_structs for the split, we
> > +      * cannot add a new parent SPTE to parent_ptes. This should never happen
> > +      * in practice though since this is a fresh SP.
> > +      *
> > +      * Note, this makes it safe to pass NULL to __link_shadow_page() below.
> > +      */
> > +     if (WARN_ON_ONCE(split_sp->parent_ptes.val))
> > +             return -EINVAL;
> > +
> > +     huge_spte = READ_ONCE(*huge_sptep);
> > +
> > +     split_level = split_sp->role.level;
> > +     access = split_sp->role.access;
> > +
> > +     for (index = 0; index < PT64_ENT_PER_PAGE; index++) {
> > +             split_sptep = &split_sp->spt[index];
> > +             split_gfn = kvm_mmu_page_get_gfn(split_sp, index);
> > +
> > +             BUG_ON(is_shadow_present_pte(*split_sptep));
> > +
> > +             /*
> > +              * Since we did not allocate pte_list_desc structs for the
> > +              * split, we can't add a new SPTE that maps this GFN.
> > +              * Skipping this SPTE means we're only partially mapping the
> > +              * huge page, which means we'll need to flush TLBs before
> > +              * dropping the MMU lock.
> > +              *
> > +              * Note, this make it safe to pass NULL to __rmap_add() below.
> > +              */
> > +             if (gfn_to_rmap(split_gfn, split_level, slot)->val) {
> > +                     *flush = true;
> > +                     continue;
> > +             }
>
> ... here.
>
> IIUC this check should already be able to cover all the cases and it's
> accurate on the fact that we don't want to grow any rmap to >1 len.
>
> > +
> > +             split_spte = make_huge_page_split_spte(
> > +                             huge_spte, split_level + 1, index, access);
> > +
> > +             mmu_spte_set(split_sptep, split_spte);
> > +             __rmap_add(kvm, NULL, slot, split_sptep, split_gfn, access);
>
> __rmap_add() with a NULL cache pointer is weird.. same as
> __link_shadow_page() below.
>
> I'll stop here for now I guess.. Have you considered having rmap allocation
> ready altogether, rather than making this intermediate step but only add
> that later?  Because all these look hackish to me..  It's also possible
> that I missed something important, if so please shoot.

I'd be happy to do it that way. The reasons I broke it up into the
intermediate steps are:
 - At Google we only support up to and including this patch. We don't
handle the cases where the rmap or parent_ptes list needs to grow.
 - It seemed like a good way to break up the support into smaller
patches. But I think this backfired since the intermediate steps
introduce their own complexity, such as passing NULL to
__rmap_add().

>
> Thanks,
>
> > +     }
> > +
> > +     /*
> > +      * Replace the huge spte with a pointer to the populated lower level
> > +      * page table. Since we are making this change without a TLB flush vCPUs
> > +      * will see a mix of the split mappings and the original huge mapping,
> > +      * depending on what's currently in their TLB. This is fine from a
> > +      * correctness standpoint since the translation will either be identical
> > +      * or non-present. To account for non-present mappings, the TLB will be
> > +      * flushed prior to dropping the MMU lock.
> > +      */
> > +     __drop_large_spte(kvm, huge_sptep, false);
> > +     __link_shadow_page(NULL, huge_sptep, split_sp);
> > +
> > +     return 0;
> > +}
>
> --
> Peter Xu
>
David Matlack March 22, 2022, 11:58 p.m. UTC | #3
On Wed, Mar 16, 2022 at 3:27 AM Peter Xu <peterx@redhat.com> wrote:
>
> On Fri, Mar 11, 2022 at 12:25:22AM +0000, David Matlack wrote:
> > Extend KVM's eager page splitting to also split huge pages that are
> > mapped by the shadow MMU. Specifically, walk through the rmap splitting
> > all 1GiB pages to 2MiB pages, and splitting all 2MiB pages to 4KiB
> > pages.
> >
> > Splitting huge pages mapped by the shadow MMU requries dealing with some
> > extra complexity beyond that of the TDP MMU:
> >
> > (1) The shadow MMU has a limit on the number of shadow pages that are
> >     allowed to be allocated. So, as a policy, Eager Page Splitting
> >     refuses to split if there are KVM_MIN_FREE_MMU_PAGES or fewer
> >     pages available.
> >
> > (2) Huge pages may be mapped by indirect shadow pages which have the
> >     possibility of being unsync. As a policy we opt not to split such
> >     pages as their translation may no longer be valid.
> >
> > (3) Splitting a huge page may end up re-using an existing lower level
> >     shadow page tables. This is unlike the TDP MMU which always allocates
> >     new shadow page tables when splitting.  This commit does *not*
> >     handle such aliasing and opts not to split such huge pages.
> >
> > (4) When installing the lower level SPTEs, they must be added to the
> >     rmap which may require allocating additional pte_list_desc structs.
> >     This commit does *not* handle such cases and instead opts to leave
> >     such lower-level SPTEs non-present. In this situation TLBs must be
> >     flushed before dropping the MMU lock as a portion of the huge page
> >     region is being unmapped.
> >
> > Suggested-by: Peter Feiner <pfeiner@google.com>
> > [ This commit is based off of the original implementation of Eager Page
> >   Splitting from Peter in Google's kernel from 2016. ]
> > Signed-off-by: David Matlack <dmatlack@google.com>
> > ---
> >  .../admin-guide/kernel-parameters.txt         |   3 -
> >  arch/x86/kvm/mmu/mmu.c                        | 307 ++++++++++++++++++
> >  2 files changed, 307 insertions(+), 3 deletions(-)
> >
> > diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> > index 05161afd7642..495f6ac53801 100644
> > --- a/Documentation/admin-guide/kernel-parameters.txt
> > +++ b/Documentation/admin-guide/kernel-parameters.txt
> > @@ -2360,9 +2360,6 @@
> >                       the KVM_CLEAR_DIRTY ioctl, and only for the pages being
> >                       cleared.
> >
> > -                     Eager page splitting currently only supports splitting
> > -                     huge pages mapped by the TDP MMU.
> > -
> >                       Default is Y (on).
> >
> >       kvm.enable_vmware_backdoor=[KVM] Support VMware backdoor PV interface.
> > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > index 926ddfaa9e1a..dd56b5b9624f 100644
> > --- a/arch/x86/kvm/mmu/mmu.c
> > +++ b/arch/x86/kvm/mmu/mmu.c
> > @@ -727,6 +727,11 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
> >
> >  static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_mmu_memory_cache *cache)
> >  {
> > +     static const gfp_t gfp_nocache = GFP_ATOMIC | __GFP_ACCOUNT | __GFP_ZERO;
> > +
> > +     if (WARN_ON_ONCE(!cache))
> > +             return kmem_cache_alloc(pte_list_desc_cache, gfp_nocache);
> > +
>
> I also think this is not proper to be added into this patch.  Maybe it'll
> be more suitable for the rmap_add() rework patch previously, or maybe it
> can be dropped directly if it should never trigger at all. Then we die hard
> at below when referencing it.

I can drop this; Ben suggested the same. The cache should really never be
NULL, so there's no need for this backup code.
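
Presumably the helper would then collapse back to something like this (a
sketch of the simplification being discussed here, not code from this series):

	static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_mmu_memory_cache *cache)
	{
		/* Callers are expected to always pass a pre-filled cache. */
		return kvm_mmu_memory_cache_alloc(cache);
	}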

>
> >       return kvm_mmu_memory_cache_alloc(cache);
> >  }
> >
> > @@ -743,6 +748,28 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
> >       return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
> >  }
> >
> > +static gfn_t sptep_to_gfn(u64 *sptep)
> > +{
> > +     struct kvm_mmu_page *sp = sptep_to_sp(sptep);
> > +
> > +     return kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
> > +}
> > +
> > +static unsigned int kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index)
> > +{
> > +     if (!sp->role.direct)
> > +             return sp->shadowed_translation[index].access;
> > +
> > +     return sp->role.access;
> > +}
> > +
> > +static unsigned int sptep_to_access(u64 *sptep)
> > +{
> > +     struct kvm_mmu_page *sp = sptep_to_sp(sptep);
> > +
> > +     return kvm_mmu_page_get_access(sp, sptep - sp->spt);
> > +}
> > +
> >  static void kvm_mmu_page_set_gfn_access(struct kvm_mmu_page *sp, int index,
> >                                       gfn_t gfn, u32 access)
> >  {
> > @@ -912,6 +939,9 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
> >       return count;
> >  }
> >
> > +static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level,
> > +                                      const struct kvm_memory_slot *slot);
> > +
> >  static void
> >  pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
> >                          struct pte_list_desc *desc, int i,
> > @@ -2125,6 +2155,23 @@ static struct kvm_mmu_page *__kvm_mmu_find_shadow_page(struct kvm *kvm,
> >       return sp;
> >  }
> >
> > +static struct kvm_mmu_page *kvm_mmu_find_direct_sp(struct kvm *kvm, gfn_t gfn,
> > +                                                union kvm_mmu_page_role role)
> > +{
> > +     struct kvm_mmu_page *sp;
> > +     LIST_HEAD(invalid_list);
> > +
> > +     BUG_ON(!role.direct);
> > +
> > +     sp = __kvm_mmu_find_shadow_page(kvm, gfn, role, &invalid_list);
> > +
> > +     /* Direct SPs are never unsync. */
> > +     WARN_ON_ONCE(sp && sp->unsync);
> > +
> > +     kvm_mmu_commit_zap_page(kvm, &invalid_list);
> > +     return sp;
> > +}
> > +
> >  /*
> >   * Looks up an existing SP for the given gfn and role if one exists. The
> >   * return SP is guaranteed to be synced.
> > @@ -6063,12 +6110,266 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
> >               kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
> >  }
> >
> > +static int prepare_to_split_huge_page(struct kvm *kvm,
> > +                                   const struct kvm_memory_slot *slot,
> > +                                   u64 *huge_sptep,
> > +                                   struct kvm_mmu_page **spp,
> > +                                   bool *flush,
> > +                                   bool *dropped_lock)
> > +{
> > +     int r = 0;
> > +
> > +     *dropped_lock = false;
> > +
> > +     if (kvm_mmu_available_pages(kvm) <= KVM_MIN_FREE_MMU_PAGES)
> > +             return -ENOSPC;
> > +
> > +     if (need_resched() || rwlock_needbreak(&kvm->mmu_lock))
> > +             goto drop_lock;
> > +
>
> Not immediately clear on whether there'll be case that *spp is set within
> the current function.  Some sanity check might be nice?

Sorry I'm not sure what you mean here. What kind of sanity check did
you have in mind?

>
> > +     *spp = kvm_mmu_alloc_direct_sp_for_split(true);
> > +     if (r)
> > +             goto drop_lock;
> > +
> > +     return 0;
> > +
> > +drop_lock:
> > +     if (*flush)
> > +             kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
> > +
> > +     *flush = false;
> > +     *dropped_lock = true;
> > +
> > +     write_unlock(&kvm->mmu_lock);
> > +     cond_resched();
> > +     *spp = kvm_mmu_alloc_direct_sp_for_split(false);
> > +     if (!*spp)
> > +             r = -ENOMEM;
> > +     write_lock(&kvm->mmu_lock);
> > +
> > +     return r;
> > +}
> > +
> > +static struct kvm_mmu_page *kvm_mmu_get_sp_for_split(struct kvm *kvm,
> > +                                                  const struct kvm_memory_slot *slot,
> > +                                                  u64 *huge_sptep,
> > +                                                  struct kvm_mmu_page **spp)
> > +{
> > +     struct kvm_mmu_page *split_sp;
> > +     union kvm_mmu_page_role role;
> > +     unsigned int access;
> > +     gfn_t gfn;
> > +
> > +     gfn = sptep_to_gfn(huge_sptep);
> > +     access = sptep_to_access(huge_sptep);
> > +
> > +     /*
> > +      * Huge page splitting always uses direct shadow pages since we are
> > +      * directly mapping the huge page GFN region with smaller pages.
> > +      */
> > +     role = kvm_mmu_child_role(huge_sptep, true, access);
> > +     split_sp = kvm_mmu_find_direct_sp(kvm, gfn, role);
> > +
> > +     /*
> > +      * Opt not to split if the lower-level SP already exists. This requires
> > +      * more complex handling as the SP may be already partially filled in
> > +      * and may need extra pte_list_desc structs to update parent_ptes.
> > +      */
> > +     if (split_sp)
> > +             return NULL;
>
> This smells tricky..
>
> Firstly we're trying to lookup the existing SPs that has shadowed this huge
> page in split way, with the access bits fetched from the shadow cache (so
> without hugepage nx effect).

Yeah this is tricky for sure.

For direct shadow pages, sp->role.access is always the guest access
permissions being shadowed (or ACC_ALL for situations where there is
no shadowing, e.g. __direct_map() and the TDP MMU). That's why we use
the shadowed translation cache to look up an existing SP or create a
new SP, rather than taking the access permissions from the huge SPTE
itself (which may have KVM-specific policies applied, such as HugePage
NX and access tracking). In other words, we want to look up existing
SPs in exactly the same way that the fault handler looks them up.

> However could the pages be mapped with
> different permissions from the currently hugely mapped page?

Yes, I think there can be some differences, such as:

 - The child SPTEs may have execute permission granted due to HugePage
NX while the huge page does not.
 - The child SPTEs may be in a different access tracking state than
the huge page.

There may be others. But no matter what, the same differences are
possible when we split a huge page during a fault, which leads me to
conclude it is safe.
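
To make that concrete, here's a minimal sketch using the helpers added earlier
in the series (the NX-huge-page case is just one example of such a difference):

	/*
	 * The child SP's role is derived from the guest access bits cached in
	 * sp->shadowed_translation, not from the huge SPTE's hardware
	 * permissions.  E.g. with NX huge pages the huge SPTE may be
	 * non-executable even though the guest PTE allows execution.
	 */
	access = sptep_to_access(huge_sptep);
	role = kvm_mmu_child_role(huge_sptep, true, access);
	split_sp = kvm_mmu_find_direct_sp(kvm, sptep_to_gfn(huge_sptep), role);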

>
> IIUC all these is for the fact that we can't allocate pte_list_desc and we
> want to make sure we won't make the pte list to be >1.
>
> But I also see that the pte_list check below...
>
> > +
> > +     swap(split_sp, *spp);
> > +     init_shadow_page(kvm, split_sp, slot, gfn, role);
> > +     trace_kvm_mmu_get_page(split_sp, true);
> > +
> > +     return split_sp;
> > +}
> > +
> > +static int kvm_mmu_split_huge_page(struct kvm *kvm,
> > +                                const struct kvm_memory_slot *slot,
> > +                                u64 *huge_sptep, struct kvm_mmu_page **spp,
> > +                                bool *flush)
> > +
> > +{
> > +     struct kvm_mmu_page *split_sp;
> > +     u64 huge_spte, split_spte;
> > +     int split_level, index;
> > +     unsigned int access;
> > +     u64 *split_sptep;
> > +     gfn_t split_gfn;
> > +
> > +     split_sp = kvm_mmu_get_sp_for_split(kvm, slot, huge_sptep, spp);
> > +     if (!split_sp)
> > +             return -EOPNOTSUPP;
> > +
> > +     /*
> > +      * Since we did not allocate pte_list_desc_structs for the split, we
> > +      * cannot add a new parent SPTE to parent_ptes. This should never happen
> > +      * in practice though since this is a fresh SP.
> > +      *
> > +      * Note, this makes it safe to pass NULL to __link_shadow_page() below.
> > +      */
> > +     if (WARN_ON_ONCE(split_sp->parent_ptes.val))
> > +             return -EINVAL;
> > +
> > +     huge_spte = READ_ONCE(*huge_sptep);
> > +
> > +     split_level = split_sp->role.level;
> > +     access = split_sp->role.access;
> > +
> > +     for (index = 0; index < PT64_ENT_PER_PAGE; index++) {
> > +             split_sptep = &split_sp->spt[index];
> > +             split_gfn = kvm_mmu_page_get_gfn(split_sp, index);
> > +
> > +             BUG_ON(is_shadow_present_pte(*split_sptep));
> > +
> > +             /*
> > +              * Since we did not allocate pte_list_desc structs for the
> > +              * split, we can't add a new SPTE that maps this GFN.
> > +              * Skipping this SPTE means we're only partially mapping the
> > +              * huge page, which means we'll need to flush TLBs before
> > +              * dropping the MMU lock.
> > +              *
> > +              * Note, this make it safe to pass NULL to __rmap_add() below.
> > +              */
> > +             if (gfn_to_rmap(split_gfn, split_level, slot)->val) {
> > +                     *flush = true;
> > +                     continue;
> > +             }
>
> ... here.
>
> IIUC this check should already be able to cover all the cases and it's
> accurate on the fact that we don't want to grow any rmap to >1 len.
>
> > +
> > +             split_spte = make_huge_page_split_spte(
> > +                             huge_spte, split_level + 1, index, access);
> > +
> > +             mmu_spte_set(split_sptep, split_spte);
> > +             __rmap_add(kvm, NULL, slot, split_sptep, split_gfn, access);
>
> __rmap_add() with a NULL cache pointer is weird.. same as
> __link_shadow_page() below.
>
> I'll stop here for now I guess.. Have you considered having rmap allocation
> ready altogether, rather than making this intermediate step but only add
> that later?  Because all these look hackish to me..  It's also possible
> that I missed something important, if so please shoot.
>
> Thanks,
>
> > +     }
> > +
> > +     /*
> > +      * Replace the huge spte with a pointer to the populated lower level
> > +      * page table. Since we are making this change without a TLB flush vCPUs
> > +      * will see a mix of the split mappings and the original huge mapping,
> > +      * depending on what's currently in their TLB. This is fine from a
> > +      * correctness standpoint since the translation will either be identical
> > +      * or non-present. To account for non-present mappings, the TLB will be
> > +      * flushed prior to dropping the MMU lock.
> > +      */
> > +     __drop_large_spte(kvm, huge_sptep, false);
> > +     __link_shadow_page(NULL, huge_sptep, split_sp);
> > +
> > +     return 0;
> > +}
>
> --
> Peter Xu
>
Peter Xu March 30, 2022, 6:34 p.m. UTC | #4
On Tue, Mar 22, 2022 at 04:58:08PM -0700, David Matlack wrote:
> > > +static int prepare_to_split_huge_page(struct kvm *kvm,
> > > +                                   const struct kvm_memory_slot *slot,
> > > +                                   u64 *huge_sptep,
> > > +                                   struct kvm_mmu_page **spp,
> > > +                                   bool *flush,
> > > +                                   bool *dropped_lock)
> > > +{
> > > +     int r = 0;
> > > +
> > > +     *dropped_lock = false;
> > > +
> > > +     if (kvm_mmu_available_pages(kvm) <= KVM_MIN_FREE_MMU_PAGES)
> > > +             return -ENOSPC;
> > > +
> > > +     if (need_resched() || rwlock_needbreak(&kvm->mmu_lock))
> > > +             goto drop_lock;
> > > +
> >
> > Not immediately clear on whether there'll be case that *spp is set within
> > the current function.  Some sanity check might be nice?
> 
> Sorry I'm not sure what you mean here. What kind of sanity check did
> you have in mind?

Something like "WARN_ON_ONCE(*spp);"?

> 
> >
> > > +     *spp = kvm_mmu_alloc_direct_sp_for_split(true);
> > > +     if (r)
> > > +             goto drop_lock;
> > > +
> > > +     return 0;

Thanks,
David Matlack March 31, 2022, 7:57 p.m. UTC | #5
On Wed, Mar 30, 2022 at 11:34 AM Peter Xu <peterx@redhat.com> wrote:
>
> On Tue, Mar 22, 2022 at 04:58:08PM -0700, David Matlack wrote:
> > > > +static int prepare_to_split_huge_page(struct kvm *kvm,
> > > > +                                   const struct kvm_memory_slot *slot,
> > > > +                                   u64 *huge_sptep,
> > > > +                                   struct kvm_mmu_page **spp,
> > > > +                                   bool *flush,
> > > > +                                   bool *dropped_lock)
> > > > +{
> > > > +     int r = 0;
> > > > +
> > > > +     *dropped_lock = false;
> > > > +
> > > > +     if (kvm_mmu_available_pages(kvm) <= KVM_MIN_FREE_MMU_PAGES)
> > > > +             return -ENOSPC;
> > > > +
> > > > +     if (need_resched() || rwlock_needbreak(&kvm->mmu_lock))
> > > > +             goto drop_lock;
> > > > +
> > >
> > > Not immediately clear on whether there'll be case that *spp is set within
> > > the current function.  Some sanity check might be nice?
> >
> > Sorry I'm not sure what you mean here. What kind of sanity check did
> > you have in mind?
>
> Something like "WARN_ON_ONCE(*spp);"?

Ah I see. I was confused because the previous version of this code
checked whether *spp was already set and, if so, skipped the allocation. But
I accidentally introduced a memory leak here when I implemented Ben's
suggestion to defer alloc_memory_for_split() to a subsequent commit.
I'll fix this in v3.
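
A rough sketch of what that fix might look like, reusing an SP left over from
a previous iteration instead of leaking it (hypothetical; the actual v3 change
may differ):

	static int prepare_to_split_huge_page(struct kvm *kvm,
					      const struct kvm_memory_slot *slot,
					      u64 *huge_sptep,
					      struct kvm_mmu_page **spp,
					      bool *flush,
					      bool *dropped_lock)
	{
		*dropped_lock = false;

		if (kvm_mmu_available_pages(kvm) <= KVM_MIN_FREE_MMU_PAGES)
			return -ENOSPC;

		/*
		 * An SP may be left over from a previous iteration if the last
		 * split attempt bailed before consuming it (e.g. -EOPNOTSUPP).
		 * Reuse it rather than leaking it by allocating again.
		 */
		if (*spp)
			return 0;

		if (need_resched() || rwlock_needbreak(&kvm->mmu_lock))
			goto drop_lock;

		*spp = kvm_mmu_alloc_direct_sp_for_split(true);
		if (!*spp)
			goto drop_lock;

		return 0;

	drop_lock:
		/* Flush if needed, drop mmu_lock, and retry the allocation outside the lock. */
		if (*flush)
			kvm_arch_flush_remote_tlbs_memslot(kvm, slot);

		*flush = false;
		*dropped_lock = true;

		write_unlock(&kvm->mmu_lock);
		cond_resched();
		*spp = kvm_mmu_alloc_direct_sp_for_split(false);
		write_lock(&kvm->mmu_lock);

		return *spp ? 0 : -ENOMEM;
	}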

>
> >
> > >
> > > > +     *spp = kvm_mmu_alloc_direct_sp_for_split(true);
> > > > +     if (r)
> > > > +             goto drop_lock;
> > > > +
> > > > +     return 0;
>
> Thanks,
>
> --
> Peter Xu
>

Patch

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 05161afd7642..495f6ac53801 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2360,9 +2360,6 @@ 
 			the KVM_CLEAR_DIRTY ioctl, and only for the pages being
 			cleared.
 
-			Eager page splitting currently only supports splitting
-			huge pages mapped by the TDP MMU.
-
 			Default is Y (on).
 
 	kvm.enable_vmware_backdoor=[KVM] Support VMware backdoor PV interface.
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 926ddfaa9e1a..dd56b5b9624f 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -727,6 +727,11 @@  static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
 
 static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_mmu_memory_cache *cache)
 {
+	static const gfp_t gfp_nocache = GFP_ATOMIC | __GFP_ACCOUNT | __GFP_ZERO;
+
+	if (WARN_ON_ONCE(!cache))
+		return kmem_cache_alloc(pte_list_desc_cache, gfp_nocache);
+
 	return kvm_mmu_memory_cache_alloc(cache);
 }
 
@@ -743,6 +748,28 @@  static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
 	return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
 }
 
+static gfn_t sptep_to_gfn(u64 *sptep)
+{
+	struct kvm_mmu_page *sp = sptep_to_sp(sptep);
+
+	return kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+}
+
+static unsigned int kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index)
+{
+	if (!sp->role.direct)
+		return sp->shadowed_translation[index].access;
+
+	return sp->role.access;
+}
+
+static unsigned int sptep_to_access(u64 *sptep)
+{
+	struct kvm_mmu_page *sp = sptep_to_sp(sptep);
+
+	return kvm_mmu_page_get_access(sp, sptep - sp->spt);
+}
+
 static void kvm_mmu_page_set_gfn_access(struct kvm_mmu_page *sp, int index,
 					gfn_t gfn, u32 access)
 {
@@ -912,6 +939,9 @@  static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
 	return count;
 }
 
+static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level,
+					 const struct kvm_memory_slot *slot);
+
 static void
 pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
 			   struct pte_list_desc *desc, int i,
@@ -2125,6 +2155,23 @@  static struct kvm_mmu_page *__kvm_mmu_find_shadow_page(struct kvm *kvm,
 	return sp;
 }
 
+static struct kvm_mmu_page *kvm_mmu_find_direct_sp(struct kvm *kvm, gfn_t gfn,
+						   union kvm_mmu_page_role role)
+{
+	struct kvm_mmu_page *sp;
+	LIST_HEAD(invalid_list);
+
+	BUG_ON(!role.direct);
+
+	sp = __kvm_mmu_find_shadow_page(kvm, gfn, role, &invalid_list);
+
+	/* Direct SPs are never unsync. */
+	WARN_ON_ONCE(sp && sp->unsync);
+
+	kvm_mmu_commit_zap_page(kvm, &invalid_list);
+	return sp;
+}
+
 /*
  * Looks up an existing SP for the given gfn and role if one exists. The
  * return SP is guaranteed to be synced.
@@ -6063,12 +6110,266 @@  void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
 		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
 }
 
+static int prepare_to_split_huge_page(struct kvm *kvm,
+				      const struct kvm_memory_slot *slot,
+				      u64 *huge_sptep,
+				      struct kvm_mmu_page **spp,
+				      bool *flush,
+				      bool *dropped_lock)
+{
+	int r = 0;
+
+	*dropped_lock = false;
+
+	if (kvm_mmu_available_pages(kvm) <= KVM_MIN_FREE_MMU_PAGES)
+		return -ENOSPC;
+
+	if (need_resched() || rwlock_needbreak(&kvm->mmu_lock))
+		goto drop_lock;
+
+	*spp = kvm_mmu_alloc_direct_sp_for_split(true);
+	if (r)
+		goto drop_lock;
+
+	return 0;
+
+drop_lock:
+	if (*flush)
+		kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
+
+	*flush = false;
+	*dropped_lock = true;
+
+	write_unlock(&kvm->mmu_lock);
+	cond_resched();
+	*spp = kvm_mmu_alloc_direct_sp_for_split(false);
+	if (!*spp)
+		r = -ENOMEM;
+	write_lock(&kvm->mmu_lock);
+
+	return r;
+}
+
+static struct kvm_mmu_page *kvm_mmu_get_sp_for_split(struct kvm *kvm,
+						     const struct kvm_memory_slot *slot,
+						     u64 *huge_sptep,
+						     struct kvm_mmu_page **spp)
+{
+	struct kvm_mmu_page *split_sp;
+	union kvm_mmu_page_role role;
+	unsigned int access;
+	gfn_t gfn;
+
+	gfn = sptep_to_gfn(huge_sptep);
+	access = sptep_to_access(huge_sptep);
+
+	/*
+	 * Huge page splitting always uses direct shadow pages since we are
+	 * directly mapping the huge page GFN region with smaller pages.
+	 */
+	role = kvm_mmu_child_role(huge_sptep, true, access);
+	split_sp = kvm_mmu_find_direct_sp(kvm, gfn, role);
+
+	/*
+	 * Opt not to split if the lower-level SP already exists. This requires
+	 * more complex handling as the SP may be already partially filled in
+	 * and may need extra pte_list_desc structs to update parent_ptes.
+	 */
+	if (split_sp)
+		return NULL;
+
+	swap(split_sp, *spp);
+	init_shadow_page(kvm, split_sp, slot, gfn, role);
+	trace_kvm_mmu_get_page(split_sp, true);
+
+	return split_sp;
+}
+
+static int kvm_mmu_split_huge_page(struct kvm *kvm,
+				   const struct kvm_memory_slot *slot,
+				   u64 *huge_sptep, struct kvm_mmu_page **spp,
+				   bool *flush)
+
+{
+	struct kvm_mmu_page *split_sp;
+	u64 huge_spte, split_spte;
+	int split_level, index;
+	unsigned int access;
+	u64 *split_sptep;
+	gfn_t split_gfn;
+
+	split_sp = kvm_mmu_get_sp_for_split(kvm, slot, huge_sptep, spp);
+	if (!split_sp)
+		return -EOPNOTSUPP;
+
+	/*
+	 * Since we did not allocate pte_list_desc structs for the split, we
+	 * cannot add a new parent SPTE to parent_ptes. This should never happen
+	 * in practice though since this is a fresh SP.
+	 *
+	 * Note, this makes it safe to pass NULL to __link_shadow_page() below.
+	 */
+	if (WARN_ON_ONCE(split_sp->parent_ptes.val))
+		return -EINVAL;
+
+	huge_spte = READ_ONCE(*huge_sptep);
+
+	split_level = split_sp->role.level;
+	access = split_sp->role.access;
+
+	for (index = 0; index < PT64_ENT_PER_PAGE; index++) {
+		split_sptep = &split_sp->spt[index];
+		split_gfn = kvm_mmu_page_get_gfn(split_sp, index);
+
+		BUG_ON(is_shadow_present_pte(*split_sptep));
+
+		/*
+		 * Since we did not allocate pte_list_desc structs for the
+		 * split, we can't add a new SPTE that maps this GFN.
+		 * Skipping this SPTE means we're only partially mapping the
+		 * huge page, which means we'll need to flush TLBs before
+		 * dropping the MMU lock.
+		 *
+		 * Note, this makes it safe to pass NULL to __rmap_add() below.
+		 */
+		if (gfn_to_rmap(split_gfn, split_level, slot)->val) {
+			*flush = true;
+			continue;
+		}
+
+		split_spte = make_huge_page_split_spte(
+				huge_spte, split_level + 1, index, access);
+
+		mmu_spte_set(split_sptep, split_spte);
+		__rmap_add(kvm, NULL, slot, split_sptep, split_gfn, access);
+	}
+
+	/*
+	 * Replace the huge spte with a pointer to the populated lower level
+	 * page table. Since we are making this change without a TLB flush vCPUs
+	 * will see a mix of the split mappings and the original huge mapping,
+	 * depending on what's currently in their TLB. This is fine from a
+	 * correctness standpoint since the translation will either be identical
+	 * or non-present. To account for non-present mappings, the TLB will be
+	 * flushed prior to dropping the MMU lock.
+	 */
+	__drop_large_spte(kvm, huge_sptep, false);
+	__link_shadow_page(NULL, huge_sptep, split_sp);
+
+	return 0;
+}
+
+static bool should_split_huge_page(u64 *huge_sptep)
+{
+	struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep);
+
+	if (WARN_ON_ONCE(!is_large_pte(*huge_sptep)))
+		return false;
+
+	if (huge_sp->role.invalid)
+		return false;
+
+	/*
+	 * As a policy, do not split huge pages if the SP on which they reside
+	 * is unsync. Unsync means the guest is modifying the page table being
+	 * shadowed by huge_sp, so splitting may be a waste of cycles and
+	 * memory.
+	 */
+	if (huge_sp->unsync)
+		return false;
+
+	return true;
+}
+
+static bool rmap_try_split_huge_pages(struct kvm *kvm,
+				      struct kvm_rmap_head *rmap_head,
+				      const struct kvm_memory_slot *slot)
+{
+	struct kvm_mmu_page *sp = NULL;
+	struct rmap_iterator iter;
+	u64 *huge_sptep, spte;
+	bool flush = false;
+	bool dropped_lock;
+	int level;
+	gfn_t gfn;
+	int r;
+
+restart:
+	for_each_rmap_spte(rmap_head, &iter, huge_sptep) {
+		if (!should_split_huge_page(huge_sptep))
+			continue;
+
+		spte = *huge_sptep;
+		level = sptep_to_sp(huge_sptep)->role.level;
+		gfn = sptep_to_gfn(huge_sptep);
+
+		r = prepare_to_split_huge_page(kvm, slot, huge_sptep, &sp, &flush, &dropped_lock);
+		if (r) {
+			trace_kvm_mmu_split_huge_page(gfn, spte, level, r);
+			break;
+		}
+
+		if (dropped_lock)
+			goto restart;
+
+		r = kvm_mmu_split_huge_page(kvm, slot, huge_sptep, &sp, &flush);
+
+		trace_kvm_mmu_split_huge_page(gfn, spte, level, r);
+
+		/*
+		 * If splitting is successful we must restart the iterator
+		 * because huge_sptep has just been removed from it.
+		 */
+		if (!r)
+			goto restart;
+	}
+
+	if (sp)
+		kvm_mmu_free_shadow_page(sp);
+
+	return flush;
+}
+
+static void kvm_rmap_try_split_huge_pages(struct kvm *kvm,
+					  const struct kvm_memory_slot *slot,
+					  gfn_t start, gfn_t end,
+					  int target_level)
+{
+	bool flush;
+	int level;
+
+	/*
+	 * Split huge pages starting with KVM_MAX_HUGEPAGE_LEVEL and working
+	 * down to the target level. This ensures pages are recursively split
+	 * all the way to the target level. There's no need to split pages
+	 * already at the target level.
+	 *
+	 * Note that TLB flushes must be done before dropping the MMU lock since
+	 * rmap_try_split_huge_pages() may partially split any given huge page,
+	 * i.e. it may effectively unmap (make non-present) a portion of the
+	 * huge page.
+	 */
+	for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--) {
+		flush = slot_handle_level_range(kvm, slot,
+						rmap_try_split_huge_pages,
+						level, level, start, end - 1,
+						true, flush);
+	}
+
+	if (flush)
+		kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
+}
+
 /* Must be called with the mmu_lock held in write-mode. */
 void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
 				   const struct kvm_memory_slot *memslot,
 				   u64 start, u64 end,
 				   int target_level)
 {
+	if (kvm_memslots_have_rmaps(kvm))
+		kvm_rmap_try_split_huge_pages(kvm, memslot, start, end,
+					      target_level);
+
 	if (is_tdp_mmu_enabled(kvm))
 		kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end,
 						 target_level, false);
@@ -6086,6 +6387,12 @@  void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
 	u64 start = memslot->base_gfn;
 	u64 end = start + memslot->npages;
 
+	if (kvm_memslots_have_rmaps(kvm)) {
+		write_lock(&kvm->mmu_lock);
+		kvm_rmap_try_split_huge_pages(kvm, memslot, start, end, target_level);
+		write_unlock(&kvm->mmu_lock);
+	}
+
 	if (is_tdp_mmu_enabled(kvm)) {
 		read_lock(&kvm->mmu_lock);
 		kvm_tdp_mmu_try_split_huge_pages(kvm, memslot, start, end, target_level, true);