
[v3,16/23] KVM: x86/mmu: Cache the access bits of shadowed translations

Message ID 20220401175554.1931568-17-dmatlack@google.com
State Accepted
Series [v3,01/23] KVM: x86/mmu: Optimize MMU page cache lookup for all direct SPs

Commit Message

David Matlack April 1, 2022, 5:55 p.m. UTC
In order to split a huge page we need to know what access bits to assign
to the role of the new child page table. This can't be easily derived
from the huge page SPTE itself since KVM applies its own access policies
on top, such as the NX huge pages mitigation.

We could walk the guest page tables to determine the correct access
bits, but that is difficult to plumb outside of a vCPU fault context.
Instead, we can store the original access bits for each leaf SPTE
alongside the GFN in the gfns array. The access bits only take up 3
bits, leaving 61 bits for the GFN, which is more than enough. So this
change does not require any additional memory.
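
For reference, the access bits being cached are KVM's ACC_* permission
flags. A sketch of the existing definitions (from arch/x86/kvm/mmu.h,
assumed unchanged by this series) shows why 3 bits are enough:

  #define ACC_EXEC_MASK    1
  #define ACC_WRITE_MASK   PT_WRITABLE_MASK   /* bit 1 */
  #define ACC_USER_MASK    PT_USER_MASK       /* bit 2 */
  #define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)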

In order to keep the access bit cache in sync with the guest, we have to
extend FNAME(sync_page) to also update the access bits.

Now that the gfns array caches more information than just GFNs, rename
it to shadowed_translation.

Signed-off-by: David Matlack <dmatlack@google.com>
---
 arch/x86/include/asm/kvm_host.h |  2 +-
 arch/x86/kvm/mmu/mmu.c          | 71 ++++++++++++++++++++++++++++-----
 arch/x86/kvm/mmu/mmu_internal.h | 20 +++++++++-
 arch/x86/kvm/mmu/paging_tmpl.h  |  8 +++-
 4 files changed, 85 insertions(+), 16 deletions(-)

Comments

kernel test robot April 2, 2022, 6:19 a.m. UTC | #1
Hi David,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on 19164ad08bf668bca4f4bfbaacaa0a47c1b737a6]

url:    https://github.com/intel-lab-lkp/linux/commits/David-Matlack/KVM-Extend-Eager-Page-Splitting-to-the-shadow-MMU/20220402-015911
base:   19164ad08bf668bca4f4bfbaacaa0a47c1b737a6
config: x86_64-randconfig-a002 (https://download.01.org/0day-ci/archive/20220402/202204021419.Rh6Pgcl9-lkp@intel.com/config)
compiler: gcc-11 (Debian 11.2.0-19) 11.2.0
reproduce (this is a W=1 build):
        # https://github.com/intel-lab-lkp/linux/commit/5f7a06676291033d880081035c2efae13702a0c4
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review David-Matlack/KVM-Extend-Eager-Page-Splitting-to-the-shadow-MMU/20220402-015911
        git checkout 5f7a06676291033d880081035c2efae13702a0c4
        # save the config file to linux build tree
        mkdir build_dir
        make W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash arch/x86/kvm/

If you fix the issue, kindly add the following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   In file included from include/linux/kernel.h:29,
                    from include/linux/cpumask.h:10,
                    from include/linux/mm_types_task.h:14,
                    from include/linux/mm_types.h:5,
                    from arch/x86/kvm/irq.h:13,
                    from arch/x86/kvm/mmu/mmu.c:18:
   arch/x86/kvm/mmu/mmu.c: In function 'kvm_mmu_page_set_access':
>> include/linux/kern_levels.h:5:25: warning: format '%llx' expects argument of type 'long long unsigned int', but argument 3 has type 'int' [-Wformat=]
       5 | #define KERN_SOH        "\001"          /* ASCII Start Of Header */
         |                         ^~~~~~
   include/linux/printk.h:418:25: note: in definition of macro 'printk_index_wrap'
     418 |                 _p_func(_fmt, ##__VA_ARGS__);                           \
         |                         ^~~~
   include/linux/printk.h:640:17: note: in expansion of macro 'printk'
     640 |                 printk(fmt, ##__VA_ARGS__);                             \
         |                 ^~~~~~
   include/linux/printk.h:654:9: note: in expansion of macro 'printk_ratelimited'
     654 |         printk_ratelimited(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
         |         ^~~~~~~~~~~~~~~~~~
   include/linux/kern_levels.h:11:25: note: in expansion of macro 'KERN_SOH'
      11 | #define KERN_ERR        KERN_SOH "3"    /* error conditions */
         |                         ^~~~~~~~
   include/linux/printk.h:654:28: note: in expansion of macro 'KERN_ERR'
     654 |         printk_ratelimited(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
         |                            ^~~~~~~~
   arch/x86/kvm/mmu/mmu.c:763:17: note: in expansion of macro 'pr_err_ratelimited'
     763 |                 pr_err_ratelimited("access mismatch under direct page %llx "
         |                 ^~~~~~~~~~~~~~~~~~
>> include/linux/kern_levels.h:5:25: warning: format '%llx' expects argument of type 'long long unsigned int', but argument 4 has type 'u32' {aka 'unsigned int'} [-Wformat=]
       5 | #define KERN_SOH        "\001"          /* ASCII Start Of Header */
         |                         ^~~~~~
   include/linux/printk.h:418:25: note: in definition of macro 'printk_index_wrap'
     418 |                 _p_func(_fmt, ##__VA_ARGS__);                           \
         |                         ^~~~
   include/linux/printk.h:640:17: note: in expansion of macro 'printk'
     640 |                 printk(fmt, ##__VA_ARGS__);                             \
         |                 ^~~~~~
   include/linux/printk.h:654:9: note: in expansion of macro 'printk_ratelimited'
     654 |         printk_ratelimited(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
         |         ^~~~~~~~~~~~~~~~~~
   include/linux/kern_levels.h:11:25: note: in expansion of macro 'KERN_SOH'
      11 | #define KERN_ERR        KERN_SOH "3"    /* error conditions */
         |                         ^~~~~~~~
   include/linux/printk.h:654:28: note: in expansion of macro 'KERN_ERR'
     654 |         printk_ratelimited(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
         |                            ^~~~~~~~
   arch/x86/kvm/mmu/mmu.c:763:17: note: in expansion of macro 'pr_err_ratelimited'
     763 |                 pr_err_ratelimited("access mismatch under direct page %llx "
         |                 ^~~~~~~~~~~~~~~~~~


vim +5 include/linux/kern_levels.h

314ba3520e513a7 Joe Perches 2012-07-30  4  
04d2c8c83d0e3ac Joe Perches 2012-07-30 @5  #define KERN_SOH	"\001"		/* ASCII Start Of Header */
04d2c8c83d0e3ac Joe Perches 2012-07-30  6  #define KERN_SOH_ASCII	'\001'
04d2c8c83d0e3ac Joe Perches 2012-07-30  7
kernel test robot April 2, 2022, 7:01 a.m. UTC | #2
Hi David,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on 19164ad08bf668bca4f4bfbaacaa0a47c1b737a6]

url:    https://github.com/intel-lab-lkp/linux/commits/David-Matlack/KVM-Extend-Eager-Page-Splitting-to-the-shadow-MMU/20220402-015911
base:   19164ad08bf668bca4f4bfbaacaa0a47c1b737a6
config: x86_64-randconfig-a014 (https://download.01.org/0day-ci/archive/20220402/202204021411.oOhP2vFP-lkp@intel.com/config)
compiler: clang version 15.0.0 (https://github.com/llvm/llvm-project c4a1b07d0979e7ff20d7d541af666d822d66b566)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/5f7a06676291033d880081035c2efae13702a0c4
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review David-Matlack/KVM-Extend-Eager-Page-Splitting-to-the-shadow-MMU/20220402-015911
        git checkout 5f7a06676291033d880081035c2efae13702a0c4
        # save the config file to linux build tree
        mkdir build_dir
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash arch/x86/kvm/

If you fix the issue, kindly add the following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> arch/x86/kvm/mmu/mmu.c:766:8: warning: format specifies type 'unsigned long long' but the argument has type 'int' [-Wformat]
                                      sp->role.access, access);
                                      ^~~~~~~~~~~~~~~
   include/linux/printk.h:654:45: note: expanded from macro 'pr_err_ratelimited'
           printk_ratelimited(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
                                              ~~~     ^~~~~~~~~~~
   include/linux/printk.h:640:17: note: expanded from macro 'printk_ratelimited'
                   printk(fmt, ##__VA_ARGS__);                             \
                          ~~~    ^~~~~~~~~~~
   include/linux/printk.h:446:60: note: expanded from macro 'printk'
   #define printk(fmt, ...) printk_index_wrap(_printk, fmt, ##__VA_ARGS__)
                                                       ~~~    ^~~~~~~~~~~
   include/linux/printk.h:418:19: note: expanded from macro 'printk_index_wrap'
                   _p_func(_fmt, ##__VA_ARGS__);                           \
                           ~~~~    ^~~~~~~~~~~
>> arch/x86/kvm/mmu/mmu.c:766:25: warning: format specifies type 'unsigned long long' but the argument has type 'u32' (aka 'unsigned int') [-Wformat]
                                      sp->role.access, access);
                                                       ^~~~~~
   include/linux/printk.h:654:45: note: expanded from macro 'pr_err_ratelimited'
           printk_ratelimited(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
                                              ~~~     ^~~~~~~~~~~
   include/linux/printk.h:640:17: note: expanded from macro 'printk_ratelimited'
                   printk(fmt, ##__VA_ARGS__);                             \
                          ~~~    ^~~~~~~~~~~
   include/linux/printk.h:446:60: note: expanded from macro 'printk'
   #define printk(fmt, ...) printk_index_wrap(_printk, fmt, ##__VA_ARGS__)
                                                       ~~~    ^~~~~~~~~~~
   include/linux/printk.h:418:19: note: expanded from macro 'printk_index_wrap'
                   _p_func(_fmt, ##__VA_ARGS__);                           \
                           ~~~~    ^~~~~~~~~~~
   2 warnings generated.


vim +766 arch/x86/kvm/mmu/mmu.c

   754	
   755	static void kvm_mmu_page_set_access(struct kvm_mmu_page *sp, int index, u32 access)
   756	{
   757		if (!sp->role.direct) {
   758			sp->shadowed_translation[index].access = access;
   759			return;
   760		}
   761	
   762		if (WARN_ON(access != sp->role.access))
   763			pr_err_ratelimited("access mismatch under direct page %llx "
   764					   "(expected %llx, got %llx)\n",
   765					   kvm_mmu_page_get_gfn(sp, index),
 > 766					   sp->role.access, access);
   767	}
   768
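
A minimal way to silence these -Wformat warnings, assuming the ratelimited
print is kept at all (the review below reworks these helpers anyway), is to
make the format specifiers match the argument types, e.g.:

	if (WARN_ON(access != sp->role.access))
		pr_err_ratelimited("access mismatch under direct page %llx (expected %u, got %u)\n",
				   kvm_mmu_page_get_gfn(sp, index),
				   (unsigned int)sp->role.access, access);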
Sean Christopherson April 9, 2022, 12:02 a.m. UTC | #3
On Fri, Apr 01, 2022, David Matlack wrote:
> @@ -733,7 +733,7 @@ static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
>  static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
>  {
>  	if (!sp->role.direct)
> -		return sp->gfns[index];
> +		return sp->shadowed_translation[index].gfn;
>  
>  	return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
>  }
> @@ -741,7 +741,7 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
>  static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)

This should be replaced with a single helper to set the gfn+access.  Under no
circumstance should _just_ the gfn change, and that will allow us to optimize
writing the entry.  More below.

>  {
>  	if (!sp->role.direct) {
> -		sp->gfns[index] = gfn;
> +		sp->shadowed_translation[index].gfn = gfn;
>  		return;
>  	}
>  
> @@ -752,6 +752,47 @@ static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
>  				   kvm_mmu_page_get_gfn(sp, index), gfn);
>  }
>  
> +static void kvm_mmu_page_set_access(struct kvm_mmu_page *sp, int index, u32 access)
> +{
> +	if (!sp->role.direct) {
> +		sp->shadowed_translation[index].access = access;
> +		return;
> +	}
> +
> +	if (WARN_ON(access != sp->role.access))
> +		pr_err_ratelimited("access mismatch under direct page %llx "

LOL, I realize this is not your code, but ratelimiting under a WARN ain't gonna
help much :-)

This also generates a warning and fails to compile with KVM_WERROR=y, though I
believe the test bots already reported that.


arch/x86/kvm/mmu/mmu.c: In function ‘kvm_mmu_page_set_access’:
include/linux/kern_levels.h:5:25: error: format ‘%llx’ expects argument of type ‘long long unsigned int’, but argument 3 has type ‘int’ [-Werror=format=]
    5 | #define KERN_SOH        "\001"          /* ASCII Start Of Header */
      |                         ^~~~~~
include/linux/printk.h:418:25: note: in definition of macro ‘printk_index_wrap’
  418 |                 _p_func(_fmt, ##__VA_ARGS__);                           \
      |                         ^~~~
include/linux/printk.h:640:17: note: in expansion of macro ‘printk’
  640 |                 printk(fmt, ##__VA_ARGS__);                             \
      |                 ^~~~~~
include/linux/printk.h:654:9: note: in expansion of macro ‘printk_ratelimited’
  654 |         printk_ratelimited(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
      |         ^~~~~~~~~~~~~~~~~~
include/linux/kern_levels.h:11:25: note: in expansion of macro ‘KERN_SOH’
   11 | #define KERN_ERR        KERN_SOH "3"    /* error conditions */
      |                         ^~~~~~~~
include/linux/printk.h:654:28: note: in expansion of macro ‘KERN_ERR’
  654 |         printk_ratelimited(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
      |                            ^~~~~~~~
arch/x86/kvm/mmu/mmu.c:763:17: note: in expansion of macro ‘pr_err_ratelimited’
  763 |                 pr_err_ratelimited("access mismatch under direct page %llx "
      |                 ^~~~~~~~~~~~~~~~~~


> +				   "(expected %llx, got %llx)\n",
> +				   kvm_mmu_page_get_gfn(sp, index),
> +				   sp->role.access, access);
> +}
> +
> +/*
> + * For leaf SPTEs, fetch the *guest* access permissions being shadowed. Note
> + * that the SPTE itself may have more constrained access permissions than
> + * what the guest enforces. For example, a guest may create an executable
> + * huge PTE but KVM may disallow execution to mitigate iTLB multihit.
> + */
> +static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index)
> +{
> +	if (!sp->role.direct)
> +		return sp->shadowed_translation[index].access;
> +
> +	/*
> +	 * For direct MMUs (e.g. TDP or non-paging guests) there are no *guest*
> +	 * access permissions being shadowed. So we can just return ACC_ALL
> +	 * here.
> +	 *
> +	 * For indirect MMUs (shadow paging), direct shadow pages exist when KVM
> +	 * is shadowing a guest huge page with smaller pages, since the guest
> +	 * huge page is being directly mapped. In this case the guest access
> +	 * permissions being shadowed are the access permissions of the huge
> +	 * page.
> +	 *
> +	 * In both cases, sp->role.access contains exactly what we want.
> +	 */
> +	return sp->role.access;
> +}

...

> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index b6e22ba9c654..3f76f4c1ae59 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -32,6 +32,18 @@ extern bool dbg;
>  
>  typedef u64 __rcu *tdp_ptep_t;
>  
> +/*
> + * Stores the result of the guest translation being shadowed by an SPTE. KVM
> + * shadows two types of guest translations: nGPA -> GPA (shadow EPT/NPT) and
> + * GVA -> GPA (traditional shadow paging). In both cases the result of the
> + * translation is a GPA and a set of access constraints.
> + */
> +struct shadowed_translation_entry {
> +	/* Note, GFNs can have at most 64 - PAGE_SHIFT = 52 bits. */
> +	u64 gfn:52;
> +	u64 access:3;

A bitfield is completely unnecessary and generates bad code.  As is, it generates
_really_ bad code because extracting and setting requires non-standard 64-bit value
masks, multiple operations, and accesses to unaligned data.  The generated code can
be made slightly less awful by using a full byte for access and 64 bits for GFN,
but it still sucks compared to what we can hand generate.

The other aspect of this is that retrieving the GFN is a frequent operation,
whereas the access is almost never read.  I.e. we should bias for reading the GFN
above all else.

The simple and obvious thing is to not reinvent the wheel.  GFN = (GPA >> PAGE_SHIFT),
and ignoring NX, access lives in the lower 12 bits of a PTE.  Then reading the GFN is
a simple SHR, and reading access info is a simple AND.

We might also be able to optimize FNAME(sync_page), but I don't care much about
that, it's rarely used for nested TDP.

So, keep translation_entry a gfn_t *, then do:

static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
{
	if (!sp->role.direct)
		return sp->shadowed_translation[index] >> PAGE_SHIFT;

	return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
}

static void kvm_mmu_page_set_translation(struct kvm_mmu_page *sp, int index,
					 gfn_t gfn, unsigned int access)
{
	if (!sp->role.direct) {
		sp->shadowed_translation[index] = (gfn << PAGE_SHIFT) | access;
		return;
	}

	if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index)))
		pr_err_ratelimited("gfn mismatch under direct page %llx "
				   "(expected %llx, got %llx)\n",
				   sp->gfn,
				   kvm_mmu_page_get_gfn(sp, index), gfn);
}

static void kvm_mmu_page_set_access(struct kvm_mmu_page *sp, int index,
				    unsigned int access)
{
	if (sp->role.direct)
		return;

	sp->shadowed_translation[index] &= PAGE_MASK;
	sp->shadowed_translation[index] |= access;
}
David Matlack April 14, 2022, 4:47 p.m. UTC | #4
On Fri, Apr 8, 2022 at 5:02 PM Sean Christopherson <seanjc@google.com> wrote:
>
> On Fri, Apr 01, 2022, David Matlack wrote:
> > @@ -733,7 +733,7 @@ static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
> >  static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
> >  {
> >       if (!sp->role.direct)
> > -             return sp->gfns[index];
> > +             return sp->shadowed_translation[index].gfn;
> >
> >       return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
> >  }
> > @@ -741,7 +741,7 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
> >  static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
>
> This should be replaced with a single helper to set the gfn+access.  Under no
> circumstance should _just_ the gfn change, and that will allow us to optimize
> writing the entry.  More below.
>
> >  {
> >       if (!sp->role.direct) {
> > -             sp->gfns[index] = gfn;
> > +             sp->shadowed_translation[index].gfn = gfn;
> >               return;
> >       }
> >
> > @@ -752,6 +752,47 @@ static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
> >                                  kvm_mmu_page_get_gfn(sp, index), gfn);
> >  }
> >
> > +static void kvm_mmu_page_set_access(struct kvm_mmu_page *sp, int index, u32 access)
> > +{
> > +     if (!sp->role.direct) {
> > +             sp->shadowed_translation[index].access = access;
> > +             return;
> > +     }
> > +
> > +     if (WARN_ON(access != sp->role.access))
> > +             pr_err_ratelimited("access mismatch under direct page %llx "
>
> LOL, I realize this is not your code, but ratelimiting under a WARN ain't gonna
> help much :-)

Ha! Yeah, this is silly. I'll see about adding a precursor patch to make
it less terrible.
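
One illustrative shape for such a precursor (hypothetical, the actual patch
may well differ): fold the diagnostic into the WARN itself and drop the
ratelimiting, e.g.

	/* Hypothetical cleanup, not part of this series. */
	WARN_ONCE(access != sp->role.access,
		  "access mismatch under direct page %llx (expected %u, got %u)\n",
		  kvm_mmu_page_get_gfn(sp, index),
		  (unsigned int)sp->role.access, access);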

>
> This also generates a warning and fails to compile with KVM_WERROR=y, though I
> believe the test bots already reported that.
>
>
> arch/x86/kvm/mmu/mmu.c: In function ‘kvm_mmu_page_set_access’:
> include/linux/kern_levels.h:5:25: error: format ‘%llx’ expects argument of type ‘long long unsigned int’, but argument 3 has type ‘int’ [-Werror=format=]
>     5 | #define KERN_SOH        "\001"          /* ASCII Start Of Header */
>       |                         ^~~~~~
> include/linux/printk.h:418:25: note: in definition of macro ‘printk_index_wrap’
>   418 |                 _p_func(_fmt, ##__VA_ARGS__);                           \
>       |                         ^~~~
> include/linux/printk.h:640:17: note: in expansion of macro ‘printk’
>   640 |                 printk(fmt, ##__VA_ARGS__);                             \
>       |                 ^~~~~~
> include/linux/printk.h:654:9: note: in expansion of macro ‘printk_ratelimited’
>   654 |         printk_ratelimited(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
>       |         ^~~~~~~~~~~~~~~~~~
> include/linux/kern_levels.h:11:25: note: in expansion of macro ‘KERN_SOH’
>    11 | #define KERN_ERR        KERN_SOH "3"    /* error conditions */
>       |                         ^~~~~~~~
> include/linux/printk.h:654:28: note: in expansion of macro ‘KERN_ERR’
>   654 |         printk_ratelimited(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
>       |                            ^~~~~~~~
> arch/x86/kvm/mmu/mmu.c:763:17: note: in expansion of macro ‘pr_err_ratelimited’
>   763 |                 pr_err_ratelimited("access mismatch under direct page %llx "
>       |                 ^~~~~~~~~~~~~~~~~~
>
>
> > +                                "(expected %llx, got %llx)\n",
> > +                                kvm_mmu_page_get_gfn(sp, index),
> > +                                sp->role.access, access);
> > +}
> > +
> > +/*
> > + * For leaf SPTEs, fetch the *guest* access permissions being shadowed. Note
> > + * that the SPTE itself may have more constrained access permissions than
> > + * what the guest enforces. For example, a guest may create an executable
> > + * huge PTE but KVM may disallow execution to mitigate iTLB multihit.
> > + */
> > +static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index)
> > +{
> > +     if (!sp->role.direct)
> > +             return sp->shadowed_translation[index].access;
> > +
> > +     /*
> > +      * For direct MMUs (e.g. TDP or non-paging guests) there are no *guest*
> > +      * access permissions being shadowed. So we can just return ACC_ALL
> > +      * here.
> > +      *
> > +      * For indirect MMUs (shadow paging), direct shadow pages exist when KVM
> > +      * is shadowing a guest huge page with smaller pages, since the guest
> > +      * huge page is being directly mapped. In this case the guest access
> > +      * permissions being shadowed are the access permissions of the huge
> > +      * page.
> > +      *
> > +      * In both cases, sp->role.access contains exactly what we want.
> > +      */
> > +     return sp->role.access;
> > +}
>
> ...
>
> > diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> > index b6e22ba9c654..3f76f4c1ae59 100644
> > --- a/arch/x86/kvm/mmu/mmu_internal.h
> > +++ b/arch/x86/kvm/mmu/mmu_internal.h
> > @@ -32,6 +32,18 @@ extern bool dbg;
> >
> >  typedef u64 __rcu *tdp_ptep_t;
> >
> > +/*
> > + * Stores the result of the guest translation being shadowed by an SPTE. KVM
> > + * shadows two types of guest translations: nGPA -> GPA (shadow EPT/NPT) and
> > + * GVA -> GPA (traditional shadow paging). In both cases the result of the
> > + * translation is a GPA and a set of access constraints.
> > + */
> > +struct shadowed_translation_entry {
> > +     /* Note, GFNs can have at most 64 - PAGE_SHIFT = 52 bits. */
> > +     u64 gfn:52;
> > +     u64 access:3;
>
> A bitfield is completely unnecessary and generates bad code.  As is, it generates
> _really_ bad code because extracting and setting requires non-standard 64-bit value
> masks, multiple operations, and accesses to unaligned data.  The generated code can
> be made slightly less awful by using a full byte for access and 64 bits for GFN,
> but it still sucks compared to what we can hand generate.
>
> The other aspect of this is that retrieving the GFN is a frequent operation,
> whereas the access is almost never read.  I.e. we should bias for reading the GFN
> above all else.
>
> The simple and obvious thing is to not reinvent the wheel.  GFN = (GPA >> PAGE_SHIFT),
> and ignoring NX, access lives in the lower 12 bits of a PTE.  Then reading the GFN is
> a simple SHR, and reading access info is a simple AND.
>
> We might also be able to optimize FNAME(sync_page), but I don't care much about
> that, it's rarely used for nested TDP.
>
> So, keep translation_entry a gfn_t *, then do:

Looks good, will do in v4.

>
> static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
> {
>         if (!sp->role.direct)
>                 return sp->shadowed_translation[index] >> PAGE_SHIFT;
>
>         return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
> }
>
> static void kvm_mmu_page_set_translation(struct kvm_mmu_page *sp, int index,
>                                          gfn_t gfn, unsigned int access)
> {
>         if (!sp->role.direct) {
>                 sp->shadowed_translation[index] = (gfn << PAGE_SHIFT) | access;
>                 return;
>         }
>
>         if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index)))
>                 pr_err_ratelimited("gfn mismatch under direct page %llx "
>                                    "(expected %llx, got %llx)\n",
>                                    sp->gfn,
>                                    kvm_mmu_page_get_gfn(sp, index), gfn);
> }
>
> static void kvm_mmu_page_set_access(struct kvm_mmu_page *sp, int index,
>                                     unsigned int access)
> {
>         if (sp->role.direct)
>                 return;
>
>         sp->shadowed_translation[index] &= PAGE_MASK;
>         sp->shadowed_translation[index] |= access;
> }
>

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9694dd5e6ccc..be4349c9ffea 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -696,7 +696,7 @@  struct kvm_vcpu_arch {
 
 	struct kvm_mmu_memory_cache mmu_pte_list_desc_cache;
 	struct kvm_mmu_memory_cache mmu_shadow_page_cache;
-	struct kvm_mmu_memory_cache mmu_gfn_array_cache;
+	struct kvm_mmu_memory_cache mmu_shadowed_info_cache;
 	struct kvm_mmu_memory_cache mmu_page_header_cache;
 
 	/*
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 5e1002d57689..3a425ed80e23 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -708,7 +708,7 @@  static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
 	if (r)
 		return r;
 	if (maybe_indirect) {
-		r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_gfn_array_cache,
+		r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadowed_info_cache,
 					       PT64_ROOT_MAX_LEVEL);
 		if (r)
 			return r;
@@ -721,7 +721,7 @@  static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
 {
 	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
 	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
-	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_gfn_array_cache);
+	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache);
 	kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
 }
 
@@ -733,7 +733,7 @@  static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
 static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
 {
 	if (!sp->role.direct)
-		return sp->gfns[index];
+		return sp->shadowed_translation[index].gfn;
 
 	return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
 }
@@ -741,7 +741,7 @@  static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
 static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
 {
 	if (!sp->role.direct) {
-		sp->gfns[index] = gfn;
+		sp->shadowed_translation[index].gfn = gfn;
 		return;
 	}
 
@@ -752,6 +752,47 @@  static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
 				   kvm_mmu_page_get_gfn(sp, index), gfn);
 }
 
+static void kvm_mmu_page_set_access(struct kvm_mmu_page *sp, int index, u32 access)
+{
+	if (!sp->role.direct) {
+		sp->shadowed_translation[index].access = access;
+		return;
+	}
+
+	if (WARN_ON(access != sp->role.access))
+		pr_err_ratelimited("access mismatch under direct page %llx "
+				   "(expected %llx, got %llx)\n",
+				   kvm_mmu_page_get_gfn(sp, index),
+				   sp->role.access, access);
+}
+
+/*
+ * For leaf SPTEs, fetch the *guest* access permissions being shadowed. Note
+ * that the SPTE itself may have more constrained access permissions than
+ * what the guest enforces. For example, a guest may create an executable
+ * huge PTE but KVM may disallow execution to mitigate iTLB multihit.
+ */
+static u32 kvm_mmu_page_get_access(struct kvm_mmu_page *sp, int index)
+{
+	if (!sp->role.direct)
+		return sp->shadowed_translation[index].access;
+
+	/*
+	 * For direct MMUs (e.g. TDP or non-paging guests) there are no *guest*
+	 * access permissions being shadowed. So we can just return ACC_ALL
+	 * here.
+	 *
+	 * For indirect MMUs (shadow paging), direct shadow pages exist when KVM
+	 * is shadowing a guest huge page with smaller pages, since the guest
+	 * huge page is being directly mapped. In this case the guest access
+	 * permissions being shadowed are the access permissions of the huge
+	 * page.
+	 *
+	 * In both cases, sp->role.access contains exactly what we want.
+	 */
+	return sp->role.access;
+}
+
 /*
  * Return the pointer to the large page information for a given gfn,
  * handling slots that are not large page aligned.
@@ -1594,7 +1635,7 @@  static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
 static void __rmap_add(struct kvm *kvm,
 		       struct kvm_mmu_memory_cache *cache,
 		       const struct kvm_memory_slot *slot,
-		       u64 *spte, gfn_t gfn)
+		       u64 *spte, gfn_t gfn, u32 access)
 {
 	struct kvm_mmu_page *sp;
 	struct kvm_rmap_head *rmap_head;
@@ -1602,6 +1643,7 @@  static void __rmap_add(struct kvm *kvm,
 
 	sp = sptep_to_sp(spte);
 	kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
+	kvm_mmu_page_set_access(sp, spte - sp->spt, access);
 	kvm_update_page_stats(kvm, sp->role.level, 1);
 
 	rmap_head = gfn_to_rmap(gfn, sp->role.level, slot);
@@ -1615,9 +1657,9 @@  static void __rmap_add(struct kvm *kvm,
 }
 
 static void rmap_add(struct kvm_vcpu *vcpu, const struct kvm_memory_slot *slot,
-		     u64 *spte, gfn_t gfn)
+		     u64 *spte, gfn_t gfn, u32 access)
 {
-	__rmap_add(vcpu->kvm, &vcpu->arch.mmu_pte_list_desc_cache, slot, spte, gfn);
+	__rmap_add(vcpu->kvm, &vcpu->arch.mmu_pte_list_desc_cache, slot, spte, gfn, access);
 }
 
 bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
@@ -1678,7 +1720,7 @@  void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp)
 {
 	free_page((unsigned long)sp->spt);
 	if (!sp->role.direct)
-		free_page((unsigned long)sp->gfns);
+		free_page((unsigned long)sp->shadowed_translation);
 	kmem_cache_free(mmu_page_header_cache, sp);
 }
 
@@ -1715,8 +1757,12 @@  struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm_vcpu *vcpu, bool direc
 
 	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
 	sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
+
+	BUILD_BUG_ON(sizeof(sp->shadowed_translation[0]) != sizeof(u64));
+
 	if (!direct)
-		sp->gfns = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache);
+		sp->shadowed_translation =
+			kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadowed_info_cache);
 
 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
 
@@ -1738,7 +1784,7 @@  static inline gfp_t gfp_flags_for_split(bool locked)
  *
  * Huge page splitting always uses direct shadow pages since the huge page is
  * being mapped directly with a lower level page table. Thus there's no need to
- * allocate the gfns array.
+ * allocate the shadowed_translation array.
  */
 struct kvm_mmu_page *kvm_mmu_alloc_direct_sp_for_split(bool locked)
 {
@@ -2841,7 +2887,10 @@  static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
 
 	if (!was_rmapped) {
 		WARN_ON_ONCE(ret == RET_PF_SPURIOUS);
-		rmap_add(vcpu, slot, sptep, gfn);
+		rmap_add(vcpu, slot, sptep, gfn, pte_access);
+	} else {
+		/* Already rmapped but the pte_access bits may have changed. */
+		kvm_mmu_page_set_access(sp, sptep - sp->spt, pte_access);
 	}
 
 	return ret;
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index b6e22ba9c654..3f76f4c1ae59 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -32,6 +32,18 @@  extern bool dbg;
 
 typedef u64 __rcu *tdp_ptep_t;
 
+/*
+ * Stores the result of the guest translation being shadowed by an SPTE. KVM
+ * shadows two types of guest translations: nGPA -> GPA (shadow EPT/NPT) and
+ * GVA -> GPA (traditional shadow paging). In both cases the result of the
+ * translation is a GPA and a set of access constraints.
+ */
+struct shadowed_translation_entry {
+	/* Note, GFNs can have at most 64 - PAGE_SHIFT = 52 bits. */
+	u64 gfn:52;
+	u64 access:3;
+};
+
 struct kvm_mmu_page {
 	/*
 	 * Note, "link" through "spt" fit in a single 64 byte cache line on
@@ -53,8 +65,12 @@  struct kvm_mmu_page {
 	gfn_t gfn;
 
 	u64 *spt;
-	/* hold the gfn of each spte inside spt */
-	gfn_t *gfns;
+	/*
+	 * Caches the result of the intermediate guest translation being
+	 * shadowed by each SPTE. NULL for direct shadow pages.
+	 */
+	struct shadowed_translation_entry *shadowed_translation;
+
 	/* Currently serving as active root */
 	union {
 		int root_count;
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index db63b5377465..91c2088464ce 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -1014,7 +1014,8 @@  static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 }
 
 /*
- * Using the cached information from sp->gfns is safe because:
+ * Using the information in sp->shadowed_translation (kvm_mmu_page_get_gfn()
+ * and kvm_mmu_page_get_access()) is safe because:
  * - The spte has a reference to the struct page, so the pfn for a given gfn
  *   can't change unless all sptes pointing to it are nuked first.
  *
@@ -1088,12 +1089,15 @@  static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access))
 			continue;
 
-		if (gfn != sp->gfns[i]) {
+		if (gfn != kvm_mmu_page_get_gfn(sp, i)) {
 			drop_spte(vcpu->kvm, &sp->spt[i]);
 			flush = true;
 			continue;
 		}
 
+		if (pte_access != kvm_mmu_page_get_access(sp, i))
+			kvm_mmu_page_set_access(sp, i, pte_access);
+
 		sptep = &sp->spt[i];
 		spte = *sptep;
 		host_writable = spte & shadow_host_writable_mask;