
[v6,21/22] KVM: Allow for different capacities in kvm_mmu_memory_cache structs

Message ID 20220516232138.1783324-22-dmatlack@google.com
State Accepted
Series KVM: Extend Eager Page Splitting to the shadow MMU

Commit Message

David Matlack May 16, 2022, 11:21 p.m. UTC
Allow the capacity of the kvm_mmu_memory_cache struct to be chosen at
declaration time rather than being fixed for all declarations. This will
be used in a follow-up commit to declare a cache in x86 with a capacity
of 512+ objects without having to increase the capacity of all caches in
KVM.

This change requires each cache now specify its capacity at runtime,
since the cache struct itself no longer has a fixed capacity known at
compile time. To protect against someone accidentally defining a
kvm_mmu_memory_cache struct directly (without the extra storage), this
commit includes a WARN_ON() in kvm_mmu_topup_memory_cache().

In order to support different capacities, this commit changes the
objects pointer array to be dynamically allocated the first time the
cache is topped up.

While here, opportunistically clean up the stack-allocated
kvm_mmu_memory_cache structs in riscv and arm64 to use designated
initializers.

No functional change intended.

Reviewed-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: David Matlack <dmatlack@google.com>
---
 arch/arm64/kvm/mmu.c      |  2 +-
 arch/riscv/kvm/mmu.c      |  5 +----
 include/linux/kvm_types.h |  6 +++++-
 virt/kvm/kvm_main.c       | 33 ++++++++++++++++++++++++++++++---
 4 files changed, 37 insertions(+), 9 deletions(-)
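
For a concrete picture of the follow-up usage the commit message
describes, a minimal sketch is below. The helper is static in this
patch, so the follow-up would also need to expose it; the cache name,
wrapper, and the 513 capacity here are illustrative assumptions, not
code from this series:

/*
 * Hypothetical sketch only: top up an x86 cache with a capacity of
 * 512+ objects via the new capacity-taking helper.
 */
static int topup_oversized_cache(struct kvm_mmu_memory_cache *cache, int min)
{
	return __kvm_mmu_topup_memory_cache(cache, 513, min);
}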

Comments

Anup Patel May 19, 2022, 3:33 p.m. UTC | #1
On Tue, May 17, 2022 at 4:52 AM David Matlack <dmatlack@google.com> wrote:
>
> Allow the capacity of the kvm_mmu_memory_cache struct to be chosen at
> declaration time rather than being fixed for all declarations. This will
> be used in a follow-up commit to declare a cache in x86 with a capacity
> of 512+ objects without having to increase the capacity of all caches in
> KVM.
>
> This change requires each cache now specify its capacity at runtime,
> since the cache struct itself no longer has a fixed capacity known at
> compile time. To protect against someone accidentally defining a
> kvm_mmu_memory_cache struct directly (without the extra storage), this
> commit includes a WARN_ON() in kvm_mmu_topup_memory_cache().
>
> In order to support different capacities, this commit changes the
> objects pointer array to be dynamically allocated the first time the
> cache is topped up.
>
> While here, opportunistically clean up the stack-allocated
> kvm_mmu_memory_cache structs in riscv and arm64 to use designated
> initializers.
>
> No functional change intended.
>
> Reviewed-by: Marc Zyngier <maz@kernel.org>
> Signed-off-by: David Matlack <dmatlack@google.com>

Looks good to me for KVM RISC-V.

Reviewed-by: Anup Patel <anup@brainfault.org>

A small heads-up that the function stage2_ioremap() is going to be
renamed for Linux-5.19, so you might have to rebase one more time.

Thanks,
Anup

> ---
>  arch/arm64/kvm/mmu.c      |  2 +-
>  arch/riscv/kvm/mmu.c      |  5 +----
>  include/linux/kvm_types.h |  6 +++++-
>  virt/kvm/kvm_main.c       | 33 ++++++++++++++++++++++++++++++---
>  4 files changed, 37 insertions(+), 9 deletions(-)
>
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index 53ae2c0640bc..f443ed845f85 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -764,7 +764,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
>  {
>         phys_addr_t addr;
>         int ret = 0;
> -       struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, };
> +       struct kvm_mmu_memory_cache cache = { .gfp_zero = __GFP_ZERO };
>         struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
>         enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
>                                      KVM_PGTABLE_PROT_R |
> diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
> index f80a34fbf102..4d95ebe4114f 100644
> --- a/arch/riscv/kvm/mmu.c
> +++ b/arch/riscv/kvm/mmu.c
> @@ -347,10 +347,7 @@ static int stage2_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa,
>         int ret = 0;
>         unsigned long pfn;
>         phys_addr_t addr, end;
> -       struct kvm_mmu_memory_cache pcache;
> -
> -       memset(&pcache, 0, sizeof(pcache));
> -       pcache.gfp_zero = __GFP_ZERO;
> +       struct kvm_mmu_memory_cache pcache = { .gfp_zero = __GFP_ZERO };
>
>         end = (gpa + size + PAGE_SIZE - 1) & PAGE_MASK;
>         pfn = __phys_to_pfn(hpa);
> diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
> index ac1ebb37a0ff..68529884eaf8 100644
> --- a/include/linux/kvm_types.h
> +++ b/include/linux/kvm_types.h
> @@ -83,12 +83,16 @@ struct gfn_to_pfn_cache {
>   * MMU flows is problematic, as is triggering reclaim, I/O, etc... while
>   * holding MMU locks.  Note, these caches act more like prefetch buffers than
>   * classical caches, i.e. objects are not returned to the cache on being freed.
> + *
> + * The @capacity field and @objects array are lazily initialized when the cache
> + * is topped up (__kvm_mmu_topup_memory_cache()).
>   */
>  struct kvm_mmu_memory_cache {
>         int nobjs;
>         gfp_t gfp_zero;
>         struct kmem_cache *kmem_cache;
> -       void *objects[KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE];
> +       int capacity;
> +       void **objects;
>  };
>  #endif
>
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index e089db822c12..5e2e75014256 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -369,14 +369,31 @@ static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
>                 return (void *)__get_free_page(gfp_flags);
>  }
>
> -int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
> +static int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)
>  {
> +       gfp_t gfp = GFP_KERNEL_ACCOUNT;
>         void *obj;
>
>         if (mc->nobjs >= min)
>                 return 0;
> -       while (mc->nobjs < ARRAY_SIZE(mc->objects)) {
> -               obj = mmu_memory_cache_alloc_obj(mc, GFP_KERNEL_ACCOUNT);
> +
> +       if (unlikely(!mc->objects)) {
> +               if (WARN_ON_ONCE(!capacity))
> +                       return -EIO;
> +
> +               mc->objects = kvmalloc_array(sizeof(void *), capacity, gfp);
> +               if (!mc->objects)
> +                       return -ENOMEM;
> +
> +               mc->capacity = capacity;
> +       }
> +
> +       /* It is illegal to request a different capacity across topups. */
> +       if (WARN_ON_ONCE(mc->capacity != capacity))
> +               return -EIO;
> +
> +       while (mc->nobjs < mc->capacity) {
> +               obj = mmu_memory_cache_alloc_obj(mc, gfp);
>                 if (!obj)
>                         return mc->nobjs >= min ? 0 : -ENOMEM;
>                 mc->objects[mc->nobjs++] = obj;
> @@ -384,6 +401,11 @@ int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
>         return 0;
>  }
>
> +int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
> +{
> +       return __kvm_mmu_topup_memory_cache(mc, KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE, min);
> +}
> +
>  int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
>  {
>         return mc->nobjs;
> @@ -397,6 +419,11 @@ void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
>                 else
>                         free_page((unsigned long)mc->objects[--mc->nobjs]);
>         }
> +
> +       kvfree(mc->objects);
> +
> +       mc->objects = NULL;
> +       mc->capacity = 0;
>  }
>
>  void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
> --
> 2.36.0.550.gb090851708-goog
>
Mingwei Zhang May 20, 2022, 11:21 p.m. UTC | #2
On Mon, May 16, 2022 at 4:24 PM David Matlack <dmatlack@google.com> wrote:
>
> Allow the capacity of the kvm_mmu_memory_cache struct to be chosen at
> declaration time rather than being fixed for all declarations. This will
> be used in a follow-up commit to declare a cache in x86 with a capacity
> of 512+ objects without having to increase the capacity of all caches in
> KVM.
>
> This change requires each cache now specify its capacity at runtime,
> since the cache struct itself no longer has a fixed capacity known at
> compile time. To protect against someone accidentally defining a
> kvm_mmu_memory_cache struct directly (without the extra storage), this
> commit includes a WARN_ON() in kvm_mmu_topup_memory_cache().
>
> In order to support different capacities, this commit changes the
> objects pointer array to be dynamically allocated the first time the
> cache is topped up.
>
> While here, opportunistically clean up the stack-allocated
> kvm_mmu_memory_cache structs in riscv and arm64 to use designated
> initializers.
>
> No functional change intended.
>
> Reviewed-by: Marc Zyngier <maz@kernel.org>
> Signed-off-by: David Matlack <dmatlack@google.com>
> ---
>  arch/arm64/kvm/mmu.c      |  2 +-
>  arch/riscv/kvm/mmu.c      |  5 +----
>  include/linux/kvm_types.h |  6 +++++-
>  virt/kvm/kvm_main.c       | 33 ++++++++++++++++++++++++++++++---
>  4 files changed, 37 insertions(+), 9 deletions(-)
>
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index 53ae2c0640bc..f443ed845f85 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -764,7 +764,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
>  {
>         phys_addr_t addr;
>         int ret = 0;
> -       struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, };
> +       struct kvm_mmu_memory_cache cache = { .gfp_zero = __GFP_ZERO };
>         struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
>         enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
>                                      KVM_PGTABLE_PROT_R |
> diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
> index f80a34fbf102..4d95ebe4114f 100644
> --- a/arch/riscv/kvm/mmu.c
> +++ b/arch/riscv/kvm/mmu.c
> @@ -347,10 +347,7 @@ static int stage2_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa,
>         int ret = 0;
>         unsigned long pfn;
>         phys_addr_t addr, end;
> -       struct kvm_mmu_memory_cache pcache;
> -
> -       memset(&pcache, 0, sizeof(pcache));
> -       pcache.gfp_zero = __GFP_ZERO;
> +       struct kvm_mmu_memory_cache pcache = { .gfp_zero = __GFP_ZERO };
>
>         end = (gpa + size + PAGE_SIZE - 1) & PAGE_MASK;
>         pfn = __phys_to_pfn(hpa);
> diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
> index ac1ebb37a0ff..68529884eaf8 100644
> --- a/include/linux/kvm_types.h
> +++ b/include/linux/kvm_types.h
> @@ -83,12 +83,16 @@ struct gfn_to_pfn_cache {
>   * MMU flows is problematic, as is triggering reclaim, I/O, etc... while
>   * holding MMU locks.  Note, these caches act more like prefetch buffers than
>   * classical caches, i.e. objects are not returned to the cache on being freed.
> + *
> + * The @capacity field and @objects array are lazily initialized when the cache
> + * is topped up (__kvm_mmu_topup_memory_cache()).
>   */
>  struct kvm_mmu_memory_cache {
>         int nobjs;
>         gfp_t gfp_zero;
>         struct kmem_cache *kmem_cache;
> -       void *objects[KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE];
> +       int capacity;
> +       void **objects;
>  };
>  #endif
>
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index e089db822c12..5e2e75014256 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -369,14 +369,31 @@ static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
>                 return (void *)__get_free_page(gfp_flags);
>  }
>
> -int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
> +static int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)
>  {
> +       gfp_t gfp = GFP_KERNEL_ACCOUNT;
>         void *obj;
>
>         if (mc->nobjs >= min)
>                 return 0;
> -       while (mc->nobjs < ARRAY_SIZE(mc->objects)) {
> -               obj = mmu_memory_cache_alloc_obj(mc, GFP_KERNEL_ACCOUNT);
> +
> +       if (unlikely(!mc->objects)) {
> +               if (WARN_ON_ONCE(!capacity))
> +                       return -EIO;
> +
> +               mc->objects = kvmalloc_array(sizeof(void *), capacity, gfp);
> +               if (!mc->objects)
> +                       return -ENOMEM;
> +
> +               mc->capacity = capacity;

Do we want to enforce a minimum value for the capacity? Otherwise, I
think we are more likely to start using memory from GFP_ATOMIC if the
capacity is less than, say, 5. But the minimum value seems related to
each cache type.

> +       }
> +
> +       /* It is illegal to request a different capacity across topups. */
> +       if (WARN_ON_ONCE(mc->capacity != capacity))
> +               return -EIO;
> +
> +       while (mc->nobjs < mc->capacity) {
> +               obj = mmu_memory_cache_alloc_obj(mc, gfp);
>                 if (!obj)
>                         return mc->nobjs >= min ? 0 : -ENOMEM;
>                 mc->objects[mc->nobjs++] = obj;
> @@ -384,6 +401,11 @@ int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
>         return 0;
>  }
>
> +int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
> +{
> +       return __kvm_mmu_topup_memory_cache(mc, KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE, min);
> +}
> +
>  int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
>  {
>         return mc->nobjs;
> @@ -397,6 +419,11 @@ void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
>                 else
>                         free_page((unsigned long)mc->objects[--mc->nobjs]);
>         }
> +
> +       kvfree(mc->objects);
> +
> +       mc->objects = NULL;
> +       mc->capacity = 0;
>  }
>
>  void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
> --
> 2.36.0.550.gb090851708-goog
>
Sean Christopherson May 23, 2022, 5:37 p.m. UTC | #3
On Fri, May 20, 2022, Mingwei Zhang wrote:
> On Mon, May 16, 2022 at 4:24 PM David Matlack <dmatlack@google.com> wrote:
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index e089db822c12..5e2e75014256 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -369,14 +369,31 @@ static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
> >                 return (void *)__get_free_page(gfp_flags);
> >  }
> >
> > -int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
> > +static int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)
> >  {
> > +       gfp_t gfp = GFP_KERNEL_ACCOUNT;
> >         void *obj;
> >
> >         if (mc->nobjs >= min)
> >                 return 0;
> > -       while (mc->nobjs < ARRAY_SIZE(mc->objects)) {
> > -               obj = mmu_memory_cache_alloc_obj(mc, GFP_KERNEL_ACCOUNT);
> > +
> > +       if (unlikely(!mc->objects)) {
> > +               if (WARN_ON_ONCE(!capacity))
> > +                       return -EIO;
> > +
> > +               mc->objects = kvmalloc_array(sizeof(void *), capacity, gfp);
> > +               if (!mc->objects)
> > +                       return -ENOMEM;
> > +
> > +               mc->capacity = capacity;
> 
> Do we want to enforce a minimum value for the capacity? Otherwise, I
> think we are more likely to start using memory from GFP_ATOMIC if the
> capacity is less than, say, 5. But the minimum value seems related to
> each cache type.

Eh, if we specify a minimum, just make the arch default the minimum.  That way we
avoid adding even more magic/arbitrary numbers.  E.g. for whatever reason, MIPS's
default is '4'.
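
For context, the defaults being discussed are compile-time, per-arch
constants; roughly, from the arch headers of that era (quoted here as
a reference, with the values taken from this thread):

/* arch/x86/include/asm/kvm_host.h */
#define KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE 40

/* arch/mips/include/asm/kvm_host.h */
#define KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE 4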
David Matlack May 23, 2022, 5:44 p.m. UTC | #4
On Mon, May 23, 2022 at 10:37 AM Sean Christopherson <seanjc@google.com> wrote:
>
> On Fri, May 20, 2022, Mingwei Zhang wrote:
> > On Mon, May 16, 2022 at 4:24 PM David Matlack <dmatlack@google.com> wrote:
> > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > > index e089db822c12..5e2e75014256 100644
> > > --- a/virt/kvm/kvm_main.c
> > > +++ b/virt/kvm/kvm_main.c
> > > @@ -369,14 +369,31 @@ static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
> > >                 return (void *)__get_free_page(gfp_flags);
> > >  }
> > >
> > > -int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
> > > +static int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)
> > >  {
> > > +       gfp_t gfp = GFP_KERNEL_ACCOUNT;
> > >         void *obj;
> > >
> > >         if (mc->nobjs >= min)
> > >                 return 0;
> > > -       while (mc->nobjs < ARRAY_SIZE(mc->objects)) {
> > > -               obj = mmu_memory_cache_alloc_obj(mc, GFP_KERNEL_ACCOUNT);
> > > +
> > > +       if (unlikely(!mc->objects)) {
> > > +               if (WARN_ON_ONCE(!capacity))
> > > +                       return -EIO;
> > > +
> > > +               mc->objects = kvmalloc_array(sizeof(void *), capacity, gfp);
> > > +               if (!mc->objects)
> > > +                       return -ENOMEM;
> > > +
> > > +               mc->capacity = capacity;
> >
> > Do we want to enforce a minimum value for the capacity? Otherwise, I
> > think we are more likely to start using memory from GFP_ATOMIC if the
> > capacity is less than, say, 5. But the minimum value seems related to
> > each cache type.
>
> Eh, if we specify a minimum, just make the arch default the minimum.  That way we
> avoid adding even more magic/arbitrary numbers.  E.g. for whatever reason, MIPS's
> default is '4'.

I'm not exactly sure what you had in mind, Mingwei. But there is a bug
in this code if min > capacity. This function will happily return 0
after filling up the cache, even though it did not allocate min
objects. The same bug existed before this patch if min >
ARRAY_SIZE(mc->objects). I can include a separate patch to fix this
bug (e.g. WARN and return -ENOMEM if min > capacity).
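
A minimal sketch of that fix, assuming it sits at the top of
__kvm_mmu_topup_memory_cache() (hypothetical; not a patch posted in
this thread):

	/* Sketch: reject impossible requests up front. */
	if (WARN_ON_ONCE(min > capacity))
		return -ENOMEM;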
Mingwei Zhang May 23, 2022, 6:13 p.m. UTC | #5
On Mon, May 23, 2022 at 10:44 AM David Matlack <dmatlack@google.com> wrote:
>
> On Mon, May 23, 2022 at 10:37 AM Sean Christopherson <seanjc@google.com> wrote:
> >
> > On Fri, May 20, 2022, Mingwei Zhang wrote:
> > > On Mon, May 16, 2022 at 4:24 PM David Matlack <dmatlack@google.com> wrote:
> > > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > > > index e089db822c12..5e2e75014256 100644
> > > > --- a/virt/kvm/kvm_main.c
> > > > +++ b/virt/kvm/kvm_main.c
> > > > @@ -369,14 +369,31 @@ static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
> > > >                 return (void *)__get_free_page(gfp_flags);
> > > >  }
> > > >
> > > > -int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
> > > > +static int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)
> > > >  {
> > > > +       gfp_t gfp = GFP_KERNEL_ACCOUNT;
> > > >         void *obj;
> > > >
> > > >         if (mc->nobjs >= min)
> > > >                 return 0;
> > > > -       while (mc->nobjs < ARRAY_SIZE(mc->objects)) {
> > > > -               obj = mmu_memory_cache_alloc_obj(mc, GFP_KERNEL_ACCOUNT);
> > > > +
> > > > +       if (unlikely(!mc->objects)) {
> > > > +               if (WARN_ON_ONCE(!capacity))
> > > > +                       return -EIO;
> > > > +
> > > > +               mc->objects = kvmalloc_array(sizeof(void *), capacity, gfp);
> > > > +               if (!mc->objects)
> > > > +                       return -ENOMEM;
> > > > +
> > > > +               mc->capacity = capacity;
> > >
> > > Do we want to enforce a minimum value for the capacity? Otherwise, I
> > > think we are more likely to start using memory from GFP_ATOMIC if the
> > > capacity is less than, say, 5. But the minimum value seems related to
> > > each cache type.
> >
> > Eh, if we specify a minimum, just make the arch default the minimum.  That way we
> > avoid adding even more magic/arbitrary numbers.  E.g. for whatever reason, MIPS's
> > default is '4'.
>
> I'm not exactly sure what you had in mind, Mingwei. But there is a bug
> in this code if min > capacity. This function will happily return 0
> after filling up the cache, even though it did not allocate min
> objects. The same bug existed before this patch if min >
> ARRAY_SIZE(mc->objects). I can include a separate patch to fix this
> bug (e.g. WARN and return -ENOMEM if min > capacity).

Oh, what I am saying is this one:
https://elixir.bootlin.com/linux/latest/source/virt/kvm/kvm_main.c#L417
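
For reference, the allocation path behind that link looks roughly
like this (a sketch of the contemporaneous code; the atomic fallback
fires only when the cache is empty):

void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
{
	void *p;

	/* Empty cache: fall back to an atomic allocation. */
	if (WARN_ON(!mc->nobjs))
		p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
	else
		p = mc->objects[--mc->nobjs];
	BUG_ON(!p);
	return p;
}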

If we run out of objects in the cache, then we start to use
__GFP_ATOMIC, which should be avoided as much as we can. Since this
patch parameterizes the 'capacity', to guard against a future caller
providing too small a value, maybe we could add a warning if the
'capacity' is too small, say, smaller than 40 (the default value)?

The case of 'capacity' < min would be a more serious issue; that
situation probably should never be allowed.
David Matlack May 23, 2022, 6:22 p.m. UTC | #6
On Mon, May 23, 2022 at 11:13 AM Mingwei Zhang <mizhang@google.com> wrote:
>
> On Mon, May 23, 2022 at 10:44 AM David Matlack <dmatlack@google.com> wrote:
> >
> > On Mon, May 23, 2022 at 10:37 AM Sean Christopherson <seanjc@google.com> wrote:
> > >
> > > On Fri, May 20, 2022, Mingwei Zhang wrote:
> > > > On Mon, May 16, 2022 at 4:24 PM David Matlack <dmatlack@google.com> wrote:
> > > > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > > > > index e089db822c12..5e2e75014256 100644
> > > > > --- a/virt/kvm/kvm_main.c
> > > > > +++ b/virt/kvm/kvm_main.c
> > > > > @@ -369,14 +369,31 @@ static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
> > > > >                 return (void *)__get_free_page(gfp_flags);
> > > > >  }
> > > > >
> > > > > -int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
> > > > > +static int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)
> > > > >  {
> > > > > +       gfp_t gfp = GFP_KERNEL_ACCOUNT;
> > > > >         void *obj;
> > > > >
> > > > >         if (mc->nobjs >= min)
> > > > >                 return 0;
> > > > > -       while (mc->nobjs < ARRAY_SIZE(mc->objects)) {
> > > > > -               obj = mmu_memory_cache_alloc_obj(mc, GFP_KERNEL_ACCOUNT);
> > > > > +
> > > > > +       if (unlikely(!mc->objects)) {
> > > > > +               if (WARN_ON_ONCE(!capacity))
> > > > > +                       return -EIO;
> > > > > +
> > > > > +               mc->objects = kvmalloc_array(sizeof(void *), capacity, gfp);
> > > > > +               if (!mc->objects)
> > > > > +                       return -ENOMEM;
> > > > > +
> > > > > +               mc->capacity = capacity;
> > > >
> > > > Do we want to enforce a minimum value for the capacity? Otherwise, I
> > > > think we are more likely to start using memory from GFP_ATOMIC if the
> > > > capacity is less than, say, 5. But the minimum value seems related to
> > > > each cache type.
> > >
> > > Eh, if we specify a minimum, just make the arch default the minimum.  That way we
> > > avoid adding even more magic/arbitrary numbers.  E.g. for whatever reason, MIPS's
> > > default is '4'.
> >
> > I'm not exactly sure what you had in mind, Mingwei. But there is a bug
> > in this code if min > capacity. This function will happily return 0
> > after filling up the cache, even though it did not allocate min
> > objects. The same bug existed before this patch if min >
> > ARRAY_SIZE(mc->objects). I can include a separate patch to fix this
> > bug (e.g. WARN and return -ENOMEM if min > capacity).
>
> Oh, what I am saying is this one:
> https://elixir.bootlin.com/linux/latest/source/virt/kvm/kvm_main.c#L417
>
> If we run out of objects in the cache, then we start to use
> __GFP_ATOMIC, which should be avoided as much as we can. Since this
> patch parameterizes the 'capacity', to guard against a future caller
> providing too small a value, maybe we could add a warning if the
> 'capacity' is too small, say, smaller than 40 (the default value)?

I'm not too worried about that. Callers of
kvm_mmu_topup_memory_cache() are responsible for passing in a min
value. It doesn't matter if capacity is a number lower than 40, as
long as kvm_mmu_topup_memory_cache() is able to allocate min objects,
the call is a success (and the GFP_ATOMIC fallback should never
trigger, and if it does, we'll get a WARN splat).

The only actual loophole I can spot is if capacity is less than min.
In that case topup will return 0 despite allocating fewer than min
objects. Again we'll still hit the GFP_ATOMIC fallback and get a WARN
splat, but we can detect the problem in kvm_mmu_topup_memory_cache(),
which will include the buggy callsite in the backtrace.

>
> The case of 'capacity' < min would be a more serious issue; that
> situation probably should never be allowed.
David Matlack May 23, 2022, 11:53 p.m. UTC | #7
On Mon, May 23, 2022 at 11:22 AM David Matlack <dmatlack@google.com> wrote:
>
> On Mon, May 23, 2022 at 11:13 AM Mingwei Zhang <mizhang@google.com> wrote:
> >
> > On Mon, May 23, 2022 at 10:44 AM David Matlack <dmatlack@google.com> wrote:
> > >
> > > On Mon, May 23, 2022 at 10:37 AM Sean Christopherson <seanjc@google.com> wrote:
> > > >
> > > > On Fri, May 20, 2022, Mingwei Zhang wrote:
> > > > > On Mon, May 16, 2022 at 4:24 PM David Matlack <dmatlack@google.com> wrote:
> > > > > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > > > > > index e089db822c12..5e2e75014256 100644
> > > > > > --- a/virt/kvm/kvm_main.c
> > > > > > +++ b/virt/kvm/kvm_main.c
> > > > > > @@ -369,14 +369,31 @@ static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
> > > > > >                 return (void *)__get_free_page(gfp_flags);
> > > > > >  }
> > > > > >
> > > > > > -int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
> > > > > > +static int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)
> > > > > >  {
> > > > > > +       gfp_t gfp = GFP_KERNEL_ACCOUNT;
> > > > > >         void *obj;
> > > > > >
> > > > > >         if (mc->nobjs >= min)
> > > > > >                 return 0;
> > > > > > -       while (mc->nobjs < ARRAY_SIZE(mc->objects)) {
> > > > > > -               obj = mmu_memory_cache_alloc_obj(mc, GFP_KERNEL_ACCOUNT);
> > > > > > +
> > > > > > +       if (unlikely(!mc->objects)) {
> > > > > > +               if (WARN_ON_ONCE(!capacity))
> > > > > > +                       return -EIO;
> > > > > > +
> > > > > > +               mc->objects = kvmalloc_array(sizeof(void *), capacity, gfp);
> > > > > > +               if (!mc->objects)
> > > > > > +                       return -ENOMEM;
> > > > > > +
> > > > > > +               mc->capacity = capacity;
> > > > >
> > > > > Do we want to enforce a minimum value for the capacity? Otherwise, I
> > > > > think we are more likely to start using memory from GFP_ATOMIC if the
> > > > > capacity is less than, say, 5. But the minimum value seems related to
> > > > > each cache type.
> > > >
> > > > Eh, if we specify a minimum, just make the arch default the minimum.  That way we
> > > > avoid adding even more magic/arbitrary numbers.  E.g. for whatever reason, MIPS's
> > > > default is '4'.
> > >
> > > I'm not exactly sure what you had in mind, Mingwei. But there is a bug
> > > in this code if min > capacity. This function will happily return 0
> > > after filling up the cache, even though it did not allocate min
> > > objects. The same bug existed before this patch if min >
> > > ARRAY_SIZE(mc->objects). I can include a separate patch to fix this
> > > bug (e.g. WARN and return -ENOMEM if min > capacity).
> >
> > Oh, what I am saying is this one:
> > https://elixir.bootlin.com/linux/latest/source/virt/kvm/kvm_main.c#L417
> >
> > If we run out of objects in the cache, then we start to use
> > __GFP_ATOMIC, which should be avoided as much as we can. Since this
> > patch parameterizes the 'capacity', to guard against a future caller
> > providing too small a value, maybe we could add a warning if the
> > 'capacity' is too small, say, smaller than 40 (the default value)?
>
> I'm not too worried about that. Callers of
> kvm_mmu_topup_memory_cache() are responsible for passing in a min
> value. It doesn't matter if capacity is a number lower than 40, as
> long as kvm_mmu_topup_memory_cache() is able to allocate min objects,
> the call is a success (and the GFP_ATOMIC fallback should never
> trigger, and if it does, we'll get a WARN splat).

Ah and I forgot to add: In this situation, the bug is that *min* is
too small, not capacity. So adding a restriction on capacity would not
help.

>
> The only actual loophole I can spot is if capacity is less than min.
> In that case topup will return 0 despite allocating fewer than min
> objects. Again we'll still hit the GFP_ATOMIC fallback and get a WARN
> splat, but we can detect the problem in kvm_mmu_topup_memory_cache(),
> which will include the buggy callsite in the backtrace.
>
> >
> > The case of 'capacity' < min would be a more serious issue; that
> > situation probably should never be allowed.
Sean Christopherson June 17, 2022, 5:41 p.m. UTC | #8
On Mon, May 16, 2022, David Matlack wrote:
> -int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
> +static int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)

I still find it somewhat kludgy to have callers provide a capacity.  It's not
terrible while there's only a single call site, but if a use case comes along
for using an oversized cache with multiple call sites, it'll be gross.

Tweaking my idea of a "custom" wrapper, what about adding an "oversized" wrapper?
That yields clear, firm rules on when to use each helper, guards against calling
the "standard" flavor with an impossible @min, and addresses Mingwei's concern
that a misguided user could specify a nonsensically small capacity.

The only quirk is that kvm_mmu_topup_memory_cache_oversized() has a fixed min,
but IMO that's an acceptable tradeoff, and it's a non-issue until another user
pops up.

static int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc,
					int capacity, int min)
{
	gfp_t gfp = GFP_KERNEL_ACCOUNT;
	void *obj;

	if (mc->nobjs >= min)
		return 0;

	if (unlikely(!mc->objects)) {
		capacity = max(min, KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE);

		mc->objects = kvmalloc_array(sizeof(void *), capacity, gfp);
		if (!mc->objects)
			return -ENOMEM;

		mc->capacity = capacity;
	}

	/* It is illegal to request a different capacity across topups. */
	if (WARN_ON_ONCE(mc->capacity != capacity))
		return -EIO;

	while (mc->nobjs < mc->capacity) {
		obj = mmu_memory_cache_alloc_obj(mc, gfp);
		if (!obj)
			return mc->nobjs >= min ? 0 : -ENOMEM;
		mc->objects[mc->nobjs++] = obj;
	}
	return 0;
}

int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
{
	const int capacity = KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE;

	if (WARN_ON_ONCE(min > capacity))
		min = capacity;

	return __kvm_mmu_topup_memory_cache(mc, capacity, min);
}

/* Oversized caches have a fixed size, i.e. min == capacity == size. */
int kvm_mmu_topup_memory_cache_oversized(struct kvm_mmu_memory_cache *mc, int size)
{
	if (WARN_ON_ONCE(size < KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE))
		size = KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE;

	return __kvm_mmu_topup_memory_cache(mc, size, size);
}
Sean Christopherson June 17, 2022, 6:34 p.m. UTC | #9
On Fri, Jun 17, 2022, Sean Christopherson wrote:
> On Mon, May 16, 2022, David Matlack wrote:
> > -int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
> > +static int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)
> 
> I still find it somewhat kludgy to have callers provide a capacity.  It's not
> terrible while there's only a single call site, but if a use case comes along
> for using an oversized cache with multiple call sites, it'll be gross.
> 
> Tweaking my idea of a "custom" wrapper, what about adding an "oversized" wrapper?
> That yields clear, firm rules on when to use each helper, guards against calling
> the "standard" flavor with an impossible @min, and addresses Mingwei's concern
> that a misguided user could specify a nonsensically small capacity.

Drat, arguing against my own idea.

The future usage in nested_mmu_try_split_huge_page() is going to be inefficient.
With capacity==min, consuming just one entry (which is guaranteed to happen when
a huge page split is successful) means the cache needs to be topped up.  In other
words, I'm pretty sure need_topup_split_caches_or_resched() will always return
true between different huge pages, and so KVM will drop mmu_lock and reschedule
after every huge page.  Maybe that's not a big deal, but at the very least it's
odd, and it's a nasty gotcha with forcing capacity==min.

So I'm ok with this patch, though it should yell if min > capacity.

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 932abb4fb67e..14e807501229 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -388,7 +388,7 @@ int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity,
                return 0;

        if (unlikely(!mc->objects)) {
-               if (WARN_ON_ONCE(!capacity))
+               if (WARN_ON_ONCE(!capacity || min > capacity))
                        return -EIO;

                mc->objects = kvmalloc_array(sizeof(void *), capacity, gfp);

Patch

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 53ae2c0640bc..f443ed845f85 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -764,7 +764,7 @@  int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
 {
 	phys_addr_t addr;
 	int ret = 0;
-	struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, };
+	struct kvm_mmu_memory_cache cache = { .gfp_zero = __GFP_ZERO };
 	struct kvm_pgtable *pgt = kvm->arch.mmu.pgt;
 	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
 				     KVM_PGTABLE_PROT_R |
diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
index f80a34fbf102..4d95ebe4114f 100644
--- a/arch/riscv/kvm/mmu.c
+++ b/arch/riscv/kvm/mmu.c
@@ -347,10 +347,7 @@  static int stage2_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa,
 	int ret = 0;
 	unsigned long pfn;
 	phys_addr_t addr, end;
-	struct kvm_mmu_memory_cache pcache;
-
-	memset(&pcache, 0, sizeof(pcache));
-	pcache.gfp_zero = __GFP_ZERO;
+	struct kvm_mmu_memory_cache pcache = { .gfp_zero = __GFP_ZERO };
 
 	end = (gpa + size + PAGE_SIZE - 1) & PAGE_MASK;
 	pfn = __phys_to_pfn(hpa);
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index ac1ebb37a0ff..68529884eaf8 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -83,12 +83,16 @@  struct gfn_to_pfn_cache {
  * MMU flows is problematic, as is triggering reclaim, I/O, etc... while
  * holding MMU locks.  Note, these caches act more like prefetch buffers than
  * classical caches, i.e. objects are not returned to the cache on being freed.
+ *
+ * The @capacity field and @objects array are lazily initialized when the cache
+ * is topped up (__kvm_mmu_topup_memory_cache()).
  */
 struct kvm_mmu_memory_cache {
 	int nobjs;
 	gfp_t gfp_zero;
 	struct kmem_cache *kmem_cache;
-	void *objects[KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE];
+	int capacity;
+	void **objects;
 };
 #endif
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index e089db822c12..5e2e75014256 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -369,14 +369,31 @@  static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
 		return (void *)__get_free_page(gfp_flags);
 }
 
-int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
+static int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)
 {
+	gfp_t gfp = GFP_KERNEL_ACCOUNT;
 	void *obj;
 
 	if (mc->nobjs >= min)
 		return 0;
-	while (mc->nobjs < ARRAY_SIZE(mc->objects)) {
-		obj = mmu_memory_cache_alloc_obj(mc, GFP_KERNEL_ACCOUNT);
+
+	if (unlikely(!mc->objects)) {
+		if (WARN_ON_ONCE(!capacity))
+			return -EIO;
+
+		mc->objects = kvmalloc_array(sizeof(void *), capacity, gfp);
+		if (!mc->objects)
+			return -ENOMEM;
+
+		mc->capacity = capacity;
+	}
+
+	/* It is illegal to request a different capacity across topups. */
+	if (WARN_ON_ONCE(mc->capacity != capacity))
+		return -EIO;
+
+	while (mc->nobjs < mc->capacity) {
+		obj = mmu_memory_cache_alloc_obj(mc, gfp);
 		if (!obj)
 			return mc->nobjs >= min ? 0 : -ENOMEM;
 		mc->objects[mc->nobjs++] = obj;
@@ -384,6 +401,11 @@  int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
 	return 0;
 }
 
+int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
+{
+	return __kvm_mmu_topup_memory_cache(mc, KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE, min);
+}
+
 int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
 {
 	return mc->nobjs;
@@ -397,6 +419,11 @@  void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
 		else
 			free_page((unsigned long)mc->objects[--mc->nobjs]);
 	}
+
+	kvfree(mc->objects);
+
+	mc->objects = NULL;
+	mc->capacity = 0;
 }
 
 void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)