| Message ID | 1469192248-25141-1-git-send-email-npiggin@gmail.com |
|---|---|
| State | Changes Requested |
On Fri, Jul 22, 2016 at 10:57:28PM +1000, Nicholas Piggin wrote:
> Calculating the slice mask can become a significant overhead for
> get_unmapped_area. The mask is relatively small and does not change
> frequently, so we can cache it in the mm context.
>
> This saves about 30% kernel time on a 4K user address allocation
> in a microbenchmark.
>
> Comments on the approach taken? I think there is the option for fixed
> allocations to avoid some of the slice calculation entirely, but first
> I think it will be good to have a general speedup that covers all
> mmaps.
>
> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> Cc: Anton Blanchard <anton@samba.org>
> ---
>  arch/powerpc/include/asm/book3s/64/mmu.h |  8 +++++++
>  arch/powerpc/mm/slice.c                  | 39 ++++++++++++++++++++++++++++++--
>  2 files changed, 45 insertions(+), 2 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
> index 5854263..0d15af4 100644
> --- a/arch/powerpc/include/asm/book3s/64/mmu.h
> +++ b/arch/powerpc/include/asm/book3s/64/mmu.h
> @@ -71,6 +71,14 @@ typedef struct {
>  #ifdef CONFIG_PPC_MM_SLICES
>  	u64 low_slices_psize;	/* SLB page size encodings */
>  	unsigned char high_slices_psize[SLICE_ARRAY_SIZE];
> +	struct slice_mask mask_4k;
> +# ifdef CONFIG_PPC_64K_PAGES
> +	struct slice_mask mask_64k;
> +# endif
> +# ifdef CONFIG_HUGETLB_PAGE
> +	struct slice_mask mask_16m;
> +	struct slice_mask mask_16g;
> +# endif

Should we cache these in mmu_psize_defs? I am not 100% sure
if we want to overload that structure, but it provides a convenient
way of saying mmu_psize_defs[psize].mask instead of all
the if checks.

>  #else
>  	u16 sllp;		/* SLB page size encoding */
>  #endif
> diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
> index 2b27458..559ea5f 100644
> --- a/arch/powerpc/mm/slice.c
> +++ b/arch/powerpc/mm/slice.c
> @@ -147,7 +147,7 @@ static struct slice_mask slice_mask_for_free(struct mm_struct *mm)
>  	return ret;
>  }
>
> -static struct slice_mask slice_mask_for_size(struct mm_struct *mm, int psize)
> +static struct slice_mask calc_slice_mask_for_size(struct mm_struct *mm, int psize)
>  {
>  	unsigned char *hpsizes;
>  	int index, mask_index;
> @@ -171,6 +171,36 @@ static struct slice_mask slice_mask_for_size(struct mm_struct *mm, int psize)
>  	return ret;
>  }
>
> +static void recalc_slice_mask_cache(struct mm_struct *mm)
> +{
> +	mm->context.mask_4k = calc_slice_mask_for_size(mm, MMU_PAGE_4K);
> +#ifdef CONFIG_PPC_64K_PAGES
> +	mm->context.mask_64k = calc_slice_mask_for_size(mm, MMU_PAGE_64K);
> +#endif
> +# ifdef CONFIG_HUGETLB_PAGE
> +	/* Radix does not come here */
> +	mm->context.mask_16m = calc_slice_mask_for_size(mm, MMU_PAGE_16M);
> +	mm->context.mask_16g = calc_slice_mask_for_size(mm, MMU_PAGE_16G);
> +# endif
> +}

Should the function above be called under slice_convert_lock?

> +
> +static struct slice_mask slice_mask_for_size(struct mm_struct *mm, int psize)
> +{
> +	if (psize == MMU_PAGE_4K)
> +		return mm->context.mask_4k;
> +#ifdef CONFIG_PPC_64K_PAGES
> +	if (psize == MMU_PAGE_64K)
> +		return mm->context.mask_64k;
> +#endif
> +# ifdef CONFIG_HUGETLB_PAGE
> +	if (psize == MMU_PAGE_16M)
> +		return mm->context.mask_16m;
> +	if (psize == MMU_PAGE_16G)
> +		return mm->context.mask_16g;
> +# endif
> +	BUG();
> +}
> +
>  static int slice_check_fit(struct slice_mask mask, struct slice_mask available)
>  {
>  	return (mask.low_slices & available.low_slices) == mask.low_slices &&
> @@ -233,6 +263,8 @@ static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psiz
>
>  	spin_unlock_irqrestore(&slice_convert_lock, flags);
>
> +	recalc_slice_mask_cache(mm);
> +
>  	copro_flush_all_slbs(mm);
>  }
>
> @@ -625,7 +657,7 @@ void slice_set_user_psize(struct mm_struct *mm, unsigned int psize)
>  		goto bail;
>
>  	mm->context.user_psize = psize;
> -	wmb();
> +	wmb(); /* Why? */
>
>  	lpsizes = mm->context.low_slices_psize;
>  	for (i = 0; i < SLICE_NUM_LOW; i++)
> @@ -652,6 +684,9 @@ void slice_set_user_psize(struct mm_struct *mm, unsigned int psize)
>  		 mm->context.low_slices_psize,
>  		 mm->context.high_slices_psize);
>
> +	spin_unlock_irqrestore(&slice_convert_lock, flags);
> +	recalc_slice_mask_cache(mm);
> +
>  	return;
>  bail:
>  	spin_unlock_irqrestore(&slice_convert_lock, flags);
>  }
> --
> 2.8.1
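[Editor's note: the 30% figure in the commit message comes from a microbenchmark of 4K user address allocation. The program below is only a guess at the shape of such a test, not the author's actual benchmark; each iteration forces a fresh get_unmapped_area() search, which is the path the cached masks speed up.]

/* Hypothetical 4K-allocation microbenchmark; not the posted one. */
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	for (int i = 0; i < 1000000; i++) {
		void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (p == MAP_FAILED) {
			perror("mmap");
			return 1;
		}
		/* Unmap immediately so the search cost dominates. */
		munmap(p, 4096);
	}
	return 0;
}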
On Sat, 23 Jul 2016 12:19:37 +1000
Balbir Singh <bsingharora@gmail.com> wrote:

> On Fri, Jul 22, 2016 at 10:57:28PM +1000, Nicholas Piggin wrote:
> > Calculating the slice mask can become a significant overhead for
> > get_unmapped_area. The mask is relatively small and does not change
> > frequently, so we can cache it in the mm context.
> >
> > [...]
> >
> > +	struct slice_mask mask_4k;
> > +# ifdef CONFIG_PPC_64K_PAGES
> > +	struct slice_mask mask_64k;
> > +# endif
> > +# ifdef CONFIG_HUGETLB_PAGE
> > +	struct slice_mask mask_16m;
> > +	struct slice_mask mask_16g;
> > +# endif
>
> Should we cache these in mmu_psize_defs? I am not 100% sure
> if we want to overload that structure, but it provides a convenient
> way of saying mmu_psize_defs[psize].mask instead of all
> the if checks.

I'm not sure if we can, can we? mmu_psize_defs is global,
whereas we need a per-process structure.

The branches are a bit annoying, but we can't directly use an array
because it's too big. But see the comment at the MMU_PAGE_* defines.
Perhaps we could change this structure to be sized at compile time to
only include possible page sizes, which would enable building a
structure like the above with simply

	struct type blah[MMU_POSSIBLE_PAGE_COUNT];

Perhaps we can consider that as a follow-on patch? It's probably a bit
more work to implement.

> > +static void recalc_slice_mask_cache(struct mm_struct *mm)
> > +{
> > +	mm->context.mask_4k = calc_slice_mask_for_size(mm, MMU_PAGE_4K);
> > [...]
> > +}
>
> Should the function above be called under slice_convert_lock?

Good question. The slice_convert_lock is... interesting. It only
protects the update side of the slice page size arrays. I thought
this was okay last time I looked, but now you make me think again
maybe it is not. I need to check again what's providing exclusion
on the read side too.

I wanted to avoid doing more work under slice_convert_lock, but
we should just make that a per-mm lock anyway, shouldn't we?

Thanks,
Nick
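[Editor's note: Nick's "struct type blah[MMU_POSSIBLE_PAGE_COUNT]" idea, sketched slightly more concretely. Everything here is hypothetical: MMU_POSSIBLE_PAGE_COUNT does not exist in the tree, and the slice_mask layout is assumed from the size discussion later in this thread.]

#include <linux/types.h>

/* Hypothetical compile-time count of page sizes this build can use. */
#define MMU_POSSIBLE_PAGE_COUNT	4

/* Assumed layout; see the 16-bytes/10-used exchange below. */
struct slice_mask {
	u16 low_slices;
	u64 high_slices;
};

/* The per-mm cache becomes a flat array... */
struct slice_mask_cache {
	struct slice_mask mask[MMU_POSSIBLE_PAGE_COUNT];
};

/* ...so the lookup is a direct index instead of the if-chain in the
 * posted slice_mask_for_size(). */
static inline struct slice_mask
cached_slice_mask(const struct slice_mask_cache *c, int possible_psize)
{
	return c->mask[possible_psize];
}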
On Sat, 2016-07-23 at 17:10 +1000, Nicholas Piggin wrote:
> I wanted to avoid doing more work under slice_convert_lock, but
> we should just make that a per-mm lock anyway, shouldn't we?

Aren't the readers under the mm sem taken for writing, or has this
changed?

Cheers,
Ben.
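[Editor's note: a simplified sketch of the path Ben is referring to, not verbatim kernel code of the era. The point is that the slice "readers" run from get_unmapped_area(), which the mmap path enters with the mm semaphore already held for writing.]

#include <linux/mm.h>

/* Rough shape of the mmap path, assuming mmap_sem is taken for write
 * before the architecture's get_unmapped_area search runs. */
static unsigned long mmap_path_sketch(struct mm_struct *mm,
				      unsigned long len)
{
	unsigned long addr;

	down_write(&mm->mmap_sem);
	/* slice_get_unmapped_area() and the mask readers execute
	 * under this write lock, serialized against each other. */
	addr = get_unmapped_area(NULL, 0, len, 0, 0);
	up_write(&mm->mmap_sem);
	return addr;
}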
On Sat, Jul 23, 2016 at 05:10:36PM +1000, Nicholas Piggin wrote:
> On Sat, 23 Jul 2016 12:19:37 +1000
> Balbir Singh <bsingharora@gmail.com> wrote:
>
> > On Fri, Jul 22, 2016 at 10:57:28PM +1000, Nicholas Piggin wrote:
> > > Calculating the slice mask can become a significant overhead for
> > > get_unmapped_area. The mask is relatively small and does not
> > > change frequently, so we can cache it in the mm context.
> > >
> > > [...]
> > >
> > > +	struct slice_mask mask_4k;
> > > +# ifdef CONFIG_PPC_64K_PAGES
> > > +	struct slice_mask mask_64k;
> > > +# endif
> > > +# ifdef CONFIG_HUGETLB_PAGE
> > > +	struct slice_mask mask_16m;
> > > +	struct slice_mask mask_16g;
> > > +# endif
> >
> > Should we cache these in mmu_psize_defs? I am not 100% sure
> > if we want to overload that structure, but it provides a convenient
> > way of saying mmu_psize_defs[psize].mask instead of all
> > the if checks.
>
> I'm not sure if we can, can we? mmu_psize_defs is global,
> whereas we need a per-process structure.

Oh! Sorry, I meant a structure like mmu_psize_defs.

> The branches are a bit annoying, but we can't directly use an array
> because it's too big. But see the comment at the MMU_PAGE_* defines.
> Perhaps we could change this structure to be sized at compile time to
> only include possible page sizes, which would enable building a
> structure like the above with simply
>
> 	struct type blah[MMU_POSSIBLE_PAGE_COUNT];
>
> Perhaps we can consider that as a follow-on patch? It's probably a
> bit more work to implement.

Yeah, good idea. MMU_PAGE_COUNT is 15; the size is going to be
15*8 bytes?

> > > +static void recalc_slice_mask_cache(struct mm_struct *mm)
> > > +{
> > > [...]
> > > +}
> >
> > Should the function above be called under slice_convert_lock?
>
> Good question. The slice_convert_lock is... interesting. It only
> protects the update side of the slice page size arrays. I thought
> this was okay last time I looked, but now you make me think again
> maybe it is not. I need to check again what's providing exclusion
> on the read side too.
>
> I wanted to avoid doing more work under slice_convert_lock, but
> we should just make that a per-mm lock anyway, shouldn't we?

Yeah, and Ben's comment in the reply suggests we already hold a
per-mm lock on the read side.

Balbir Singh
On Sat, 23 Jul 2016 18:49:06 +1000
Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote:

> On Sat, 2016-07-23 at 17:10 +1000, Nicholas Piggin wrote:
> > I wanted to avoid doing more work under slice_convert_lock, but
> > we should just make that a per-mm lock anyway, shouldn't we?
>
> Aren't the readers under the mm sem taken for writing, or has this
> changed?

I don't think this has changed, but I'm looking at the writers now,
which aren't synchronized by the mm sem. But neither are readers under
the slice_convert_lock, so I'm looking at what the locking actually is.
Is it just relying on atomicity of dword stores vs loads?

Thanks,
Nick
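[Editor's note: a sketch to make the dword-atomicity question concrete; the slice_mask layout is assumed, and strictly speaking even the u64 case would want WRITE_ONCE()/READ_ONCE() to rule out compiler tearing.]

#include <linux/types.h>

struct slice_mask {
	u16 low_slices;
	u64 high_slices;
};

/* A naturally aligned u64 assignment is a single 8-byte store on
 * ppc64, so a lockless reader sees either the old or the new value;
 * presumably this is what low_slices_psize updates rely on. */
static void update_psizes(u64 *low_slices_psize, u64 val)
{
	*low_slices_psize = val;	/* one store */
}

/* A slice_mask is 16 bytes, so a struct assignment compiles to at
 * least two stores. A lockless reader can observe low_slices from the
 * new mask paired with high_slices from the old one; this tearing
 * hazard is what the locking question is about. */
static void update_mask(struct slice_mask *cached, struct slice_mask val)
{
	*cached = val;			/* two or more stores; can tear */
}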
On Sat, 23 Jul 2016 20:36:42 +1000
Balbir Singh <bsingharora@gmail.com> wrote:

> On Sat, Jul 23, 2016 at 05:10:36PM +1000, Nicholas Piggin wrote:
> > On Sat, 23 Jul 2016 12:19:37 +1000
> > Balbir Singh <bsingharora@gmail.com> wrote:
> >
> > > On Fri, Jul 22, 2016 at 10:57:28PM +1000, Nicholas Piggin wrote:
> > > > Calculating the slice mask can become a significant overhead for
> > > > get_unmapped_area. The mask is relatively small and does not
> > > > change frequently, so we can cache it in the mm context.
> > > >
> > > > [...]
> > >
> > > Should we cache these in mmu_psize_defs? I am not 100% sure
> > > if we want to overload that structure, but it provides a convenient
> > > way of saying mmu_psize_defs[psize].mask instead of all
> > > the if checks.
> >
> > I'm not sure if we can, can we? mmu_psize_defs is global,
> > whereas we need a per-process structure.
>
> Oh! Sorry, I meant a structure like mmu_psize_defs.

In that case, sure. Avoiding the branches might be worthwhile.

> > The branches are a bit annoying, but we can't directly use an array
> > because it's too big. But see the comment at the MMU_PAGE_* defines.
> > Perhaps we could change this structure to be sized at compile time
> > to only include possible page sizes, which would enable building a
> > structure like the above with simply
> >
> > 	struct type blah[MMU_POSSIBLE_PAGE_COUNT];
> >
> > Perhaps we can consider that as a follow-on patch? It's probably a
> > bit more work to implement.
>
> Yeah, good idea. MMU_PAGE_COUNT is 15; the size is going to be
> 15*8 bytes?

Unfortunately, slice_mask is 16 bytes. Only 10 are used, but it seemed
too ugly to try squashing things together.

> > Good question. The slice_convert_lock is... interesting. It only
> > protects the update side of the slice page size arrays.
> >
> > [...]
> >
> > I wanted to avoid doing more work under slice_convert_lock, but
> > we should just make that a per-mm lock anyway, shouldn't we?
>
> Yeah, and Ben's comment in the reply suggests we already hold a
> per-mm lock on the read side.

Let's discuss this further in my reply to Ben.

Thanks,
Nick
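[Editor's note: the numbers in this exchange work out as follows, assuming the slice_mask definition of the time.]

#include <linux/types.h>

/* Assumed layout of struct slice_mask when this thread was written: */
struct slice_mask {
	u16 low_slices;		/* 1 bit per 256MB slice below 4GB */
	u64 high_slices;	/* 1 bit per 1TB slice above 4GB */
};

/*
 * Payload: 2 + 8 = 10 bytes, but sizeof(struct slice_mask) == 16,
 * because the u64 member forces 8-byte alignment and padding after
 * low_slices. A full MMU_PAGE_COUNT array would therefore cost
 * 15 * 16 = 240 bytes per mm rather than the 15 * 8 = 120 suggested
 * above, unless the two fields were squashed into one u64, which
 * Nick calls too ugly.
 */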
diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
index 5854263..0d15af4 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -71,6 +71,14 @@ typedef struct {
 #ifdef CONFIG_PPC_MM_SLICES
 	u64 low_slices_psize;	/* SLB page size encodings */
 	unsigned char high_slices_psize[SLICE_ARRAY_SIZE];
+	struct slice_mask mask_4k;
+# ifdef CONFIG_PPC_64K_PAGES
+	struct slice_mask mask_64k;
+# endif
+# ifdef CONFIG_HUGETLB_PAGE
+	struct slice_mask mask_16m;
+	struct slice_mask mask_16g;
+# endif
 #else
 	u16 sllp;		/* SLB page size encoding */
 #endif
diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index 2b27458..559ea5f 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -147,7 +147,7 @@ static struct slice_mask slice_mask_for_free(struct mm_struct *mm)
 	return ret;
 }
 
-static struct slice_mask slice_mask_for_size(struct mm_struct *mm, int psize)
+static struct slice_mask calc_slice_mask_for_size(struct mm_struct *mm, int psize)
 {
 	unsigned char *hpsizes;
 	int index, mask_index;
@@ -171,6 +171,36 @@ static struct slice_mask slice_mask_for_size(struct mm_struct *mm, int psize)
 	return ret;
 }
 
+static void recalc_slice_mask_cache(struct mm_struct *mm)
+{
+	mm->context.mask_4k = calc_slice_mask_for_size(mm, MMU_PAGE_4K);
+#ifdef CONFIG_PPC_64K_PAGES
+	mm->context.mask_64k = calc_slice_mask_for_size(mm, MMU_PAGE_64K);
+#endif
+# ifdef CONFIG_HUGETLB_PAGE
+	/* Radix does not come here */
+	mm->context.mask_16m = calc_slice_mask_for_size(mm, MMU_PAGE_16M);
+	mm->context.mask_16g = calc_slice_mask_for_size(mm, MMU_PAGE_16G);
+# endif
+}
+
+static struct slice_mask slice_mask_for_size(struct mm_struct *mm, int psize)
+{
+	if (psize == MMU_PAGE_4K)
+		return mm->context.mask_4k;
+#ifdef CONFIG_PPC_64K_PAGES
+	if (psize == MMU_PAGE_64K)
+		return mm->context.mask_64k;
+#endif
+# ifdef CONFIG_HUGETLB_PAGE
+	if (psize == MMU_PAGE_16M)
+		return mm->context.mask_16m;
+	if (psize == MMU_PAGE_16G)
+		return mm->context.mask_16g;
+# endif
+	BUG();
+}
+
 static int slice_check_fit(struct slice_mask mask, struct slice_mask available)
 {
 	return (mask.low_slices & available.low_slices) == mask.low_slices &&
@@ -233,6 +263,8 @@ static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psiz
 
 	spin_unlock_irqrestore(&slice_convert_lock, flags);
 
+	recalc_slice_mask_cache(mm);
+
 	copro_flush_all_slbs(mm);
 }
 
@@ -625,7 +657,7 @@ void slice_set_user_psize(struct mm_struct *mm, unsigned int psize)
 		goto bail;
 
 	mm->context.user_psize = psize;
-	wmb();
+	wmb(); /* Why? */
 
 	lpsizes = mm->context.low_slices_psize;
 	for (i = 0; i < SLICE_NUM_LOW; i++)
@@ -652,6 +684,9 @@ void slice_set_user_psize(struct mm_struct *mm, unsigned int psize)
 		 mm->context.low_slices_psize,
 		 mm->context.high_slices_psize);
 
+	spin_unlock_irqrestore(&slice_convert_lock, flags);
+	recalc_slice_mask_cache(mm);
+
 	return;
 bail:
 	spin_unlock_irqrestore(&slice_convert_lock, flags);
 }