Message ID | VE1PR08MB5599A80A0480330F1CB5CC0083E49@VE1PR08MB5599.eurprd08.prod.outlook.com |
---|---|
State | New |
Headers | show |
Series | [v3,1/5] AArch64: Improve A64FX memset | expand |
Hi Wilco, Thanks for the patch. I confirmed that the performance is improved compared to the master, as shown in the graphs [1]. There are two comments, please find them. Reviewed-by: Naohiro Tamura <naohirot@fujitsu.com> Tested-by: Naohiro Tamura <naohirot@fujitsu.com> [1] https://drive.google.com/file/d/1DfYPMd6RRS0Z_2y3VH3Q4b-r8N6TyW1c/view?usp=sharing > [PATCH v3 1/5] AArch64: Improve A64FX memset > Would you update the commit title so as not to be the same among 5 patches? Because we need to ask distro to backport these patches. If all commit titles are the same, it will increase the risk of confusion and mistakes. How about "AArch64: Improve A64FX memset for less than 512B" ? > Improve performance of small copies by reducing instruction counts and improving > alignment. Bench-memset shows 35-45% performance gain for small sizes. > > --- > > diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S > index ce54e5418b08c8bc0ecc7affff68a59272ba6397..f7fcc7b323e1553f50a2e005b8ccef344a08127d 100644 > --- a/sysdeps/aarch64/multiarch/memset_a64fx.S > +++ b/sysdeps/aarch64/multiarch/memset_a64fx.S > @@ -30,7 +30,6 @@ > #define L2_SIZE (8*1024*1024) // L2 8MB - 1MB > #define CACHE_LINE_SIZE 256 > #define PF_DIST_L1 (CACHE_LINE_SIZE * 16) // Prefetch distance L1 > -#define ZF_DIST (CACHE_LINE_SIZE * 21) // Zerofill distance This caused compile error. 
> #define rest x8 > #define vector_length x9 > #define vl_remainder x10 // vector_length remainder > @@ -51,78 +50,54 @@ > .endm > > .macro st1b_unroll first=0, last=7 > - st1b z0.b, p0, [dst, #\first, mul vl] > + st1b z0.b, p0, [dst, \first, mul vl] > .if \last-\first > st1b_unroll "(\first+1)", \last > .endif > .endm > > - .macro shortcut_for_small_size exit > - // if rest <= vector_length * 2 > - whilelo p0.b, xzr, count > - whilelo p1.b, vector_length, count > - b.last 1f > - st1b z0.b, p0, [dstin, #0, mul vl] > - st1b z0.b, p1, [dstin, #1, mul vl] > - ret > -1: // if rest > vector_length * 8 > - cmp count, vector_length, lsl 3 // vector_length * 8 > - b.hi \exit > - // if rest <= vector_length * 4 > - lsl tmp1, vector_length, 1 // vector_length * 2 > - whilelo p2.b, tmp1, count > - incb tmp1 > - whilelo p3.b, tmp1, count > - b.last 1f > - st1b z0.b, p0, [dstin, #0, mul vl] > - st1b z0.b, p1, [dstin, #1, mul vl] > - st1b z0.b, p2, [dstin, #2, mul vl] > - st1b z0.b, p3, [dstin, #3, mul vl] > - ret > -1: // if rest <= vector_length * 8 > - lsl tmp1, vector_length, 2 // vector_length * 4 > - whilelo p4.b, tmp1, count > - incb tmp1 > - whilelo p5.b, tmp1, count > - b.last 1f > - st1b z0.b, p0, [dstin, #0, mul vl] > - st1b z0.b, p1, [dstin, #1, mul vl] > - st1b z0.b, p2, [dstin, #2, mul vl] > - st1b z0.b, p3, [dstin, #3, mul vl] > - st1b z0.b, p4, [dstin, #4, mul vl] > - st1b z0.b, p5, [dstin, #5, mul vl] > - ret > -1: lsl tmp1, vector_length, 2 // vector_length * 4 > - incb tmp1 // vector_length * 5 > - incb tmp1 // vector_length * 6 > - whilelo p6.b, tmp1, count > - incb tmp1 > - whilelo p7.b, tmp1, count > - st1b z0.b, p0, [dstin, #0, mul vl] > - st1b z0.b, p1, [dstin, #1, mul vl] > - st1b z0.b, p2, [dstin, #2, mul vl] > - st1b z0.b, p3, [dstin, #3, mul vl] > - st1b z0.b, p4, [dstin, #4, mul vl] > - st1b z0.b, p5, [dstin, #5, mul vl] > - st1b z0.b, p6, [dstin, #6, mul vl] > - st1b z0.b, p7, [dstin, #7, mul vl] > - ret > - .endm > > -ENTRY (MEMSET) > +#undef BTI_C 
> +#define BTI_C > > +ENTRY (MEMSET) > PTR_ARG (0) > SIZE_ARG (2) > > - cbnz count, 1f > - ret > -1: dup z0.b, valw > cntb vector_length > - // shortcut for less than vector_length * 8 > - // gives a free ptrue to p0.b for n >= vector_length > - shortcut_for_small_size L(vl_agnostic) > - // end of shortcut > + dup z0.b, valw > + whilelo p0.b, vector_length, count > + b.last 1f > + whilelo p1.b, xzr, count > + st1b z0.b, p1, [dstin, 0, mul vl] > + st1b z0.b, p0, [dstin, 1, mul vl] > + ret > + > + // count >= vector_length * 2 > +1: cmp count, vector_length, lsl 2 > + add dstend, dstin, count > + b.hi 1f > + st1b z0.b, p0, [dstin, 0, mul vl] > + st1b z0.b, p0, [dstin, 1, mul vl] > + st1b z0.b, p0, [dstend, -2, mul vl] > + st1b z0.b, p0, [dstend, -1, mul vl] > + ret > + > + // count > vector_length * 4 > +1: lsl tmp1, vector_length, 3 > + cmp count, tmp1 > + b.hi L(vl_agnostic) > + st1b z0.b, p0, [dstin, 0, mul vl] > + st1b z0.b, p0, [dstin, 1, mul vl] > + st1b z0.b, p0, [dstin, 2, mul vl] > + st1b z0.b, p0, [dstin, 3, mul vl] > + st1b z0.b, p0, [dstend, -4, mul vl] > + st1b z0.b, p0, [dstend, -3, mul vl] > + st1b z0.b, p0, [dstend, -2, mul vl] > + st1b z0.b, p0, [dstend, -1, mul vl] > + ret > > + .p2align 4 > L(vl_agnostic): // VL Agnostic > mov rest, count > mov dst, dstin >
Hi Wilco, I have one question below. > -----Original Message----- > From: Tamura, Naohiro/田村 直広 <naohirot@fujitsu.com> > Sent: Wednesday, July 28, 2021 5:11 PM > To: Wilco Dijkstra <Wilco.Dijkstra@arm.com> > Cc: 'GNU C Library' <libc-alpha@sourceware.org> > Subject: Re: [PATCH v3 1/5] AArch64: Improve A64FX memset > > Hi Wilco, > > Thanks for the patch. > > I confirmed that the performance is improved than the master as show > in the graphs [1]. > There are two comments, please find them. > > Reviewed-by: Naohiro Tamura <naohirot@fujitsu.com> > Tested-by: Naohiro Tamura <naohirot@fujitsu.com> > > [1] https://drive.google.com/file/d/1DfYPMd6RRS0Z_2y3VH3Q4b-r8N6TyW1c/view?usp=sharing > > > [PATCH v3 1/5] AArch64: Improve A64FX memset > > > > Would you update the commit title so as not to be the same among 5 > patches? > Because we need to ask distro to backport these patches. > If all commit titles are the same, it will increase the room to happen > confusion and mistake. > > How about "AArch64: Improve A64FX memset for less than 512B" ? > > > Improve performance of small copies by reducing instruction counts and improving > > alignment. Bench-memset shows 35-45% performance gain for small sizes. > > > > --- > > > > diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S > > index ce54e5418b08c8bc0ecc7affff68a59272ba6397..f7fcc7b323e1553f50a2e005b8ccef344a08127d 100644 > > --- a/sysdeps/aarch64/multiarch/memset_a64fx.S > > +++ b/sysdeps/aarch64/multiarch/memset_a64fx.S > > @@ -30,7 +30,6 @@ > > #define L2_SIZE (8*1024*1024) // L2 8MB - 1MB > > #define CACHE_LINE_SIZE 256 > > #define PF_DIST_L1 (CACHE_LINE_SIZE * 16) // Prefetch distance L1 > > -#define ZF_DIST (CACHE_LINE_SIZE * 21) // Zerofill distance > > This caused compile error. 
> > > #define rest x8 > > #define vector_length x9 > > #define vl_remainder x10 // vector_length remainder > > @@ -51,78 +50,54 @@ > > .endm > > > > .macro st1b_unroll first=0, last=7 > > - st1b z0.b, p0, [dst, #\first, mul vl] > > + st1b z0.b, p0, [dst, \first, mul vl] > > .if \last-\first > > st1b_unroll "(\first+1)", \last > > .endif > > .endm > > > > - .macro shortcut_for_small_size exit > > - // if rest <= vector_length * 2 > > - whilelo p0.b, xzr, count > > - whilelo p1.b, vector_length, count > > - b.last 1f > > - st1b z0.b, p0, [dstin, #0, mul vl] > > - st1b z0.b, p1, [dstin, #1, mul vl] > > - ret > > -1: // if rest > vector_length * 8 > > - cmp count, vector_length, lsl 3 // vector_length * 8 > > - b.hi \exit > > - // if rest <= vector_length * 4 > > - lsl tmp1, vector_length, 1 // vector_length * 2 > > - whilelo p2.b, tmp1, count > > - incb tmp1 > > - whilelo p3.b, tmp1, count > > - b.last 1f > > - st1b z0.b, p0, [dstin, #0, mul vl] > > - st1b z0.b, p1, [dstin, #1, mul vl] > > - st1b z0.b, p2, [dstin, #2, mul vl] > > - st1b z0.b, p3, [dstin, #3, mul vl] > > - ret > > -1: // if rest <= vector_length * 8 > > - lsl tmp1, vector_length, 2 // vector_length * 4 > > - whilelo p4.b, tmp1, count > > - incb tmp1 > > - whilelo p5.b, tmp1, count > > - b.last 1f > > - st1b z0.b, p0, [dstin, #0, mul vl] > > - st1b z0.b, p1, [dstin, #1, mul vl] > > - st1b z0.b, p2, [dstin, #2, mul vl] > > - st1b z0.b, p3, [dstin, #3, mul vl] > > - st1b z0.b, p4, [dstin, #4, mul vl] > > - st1b z0.b, p5, [dstin, #5, mul vl] > > - ret > > -1: lsl tmp1, vector_length, 2 // vector_length * 4 > > - incb tmp1 // vector_length * 5 > > - incb tmp1 // vector_length * 6 > > - whilelo p6.b, tmp1, count > > - incb tmp1 > > - whilelo p7.b, tmp1, count > > - st1b z0.b, p0, [dstin, #0, mul vl] > > - st1b z0.b, p1, [dstin, #1, mul vl] > > - st1b z0.b, p2, [dstin, #2, mul vl] > > - st1b z0.b, p3, [dstin, #3, mul vl] > > - st1b z0.b, p4, [dstin, #4, mul vl] > > - st1b z0.b, p5, [dstin, #5, mul vl] > > - 
st1b z0.b, p6, [dstin, #6, mul vl] > > - st1b z0.b, p7, [dstin, #7, mul vl] > > - ret > > - .endm > > > > -ENTRY (MEMSET) > > +#undef BTI_C > > +#define BTI_C We discussed how should be defined BTI_C macro before, at that time conclusion was "NOP" rather than empty unless HAVE_AARCH64_BTI. Now the above code defines BTI_C as empty unconditionally. A64FX doesn't support BTI, so this code is OK. But I'm just interested in the reason why it is changed. Thanks. Naohiro > > > > +ENTRY (MEMSET) > > PTR_ARG (0) > > SIZE_ARG (2) > > > > - cbnz count, 1f > > - ret > > -1: dup z0.b, valw > > cntb vector_length > > - // shortcut for less than vector_length * 8 > > - // gives a free ptrue to p0.b for n >= vector_length > > - shortcut_for_small_size L(vl_agnostic) > > - // end of shortcut > > + dup z0.b, valw > > + whilelo p0.b, vector_length, count > > + b.last 1f > > + whilelo p1.b, xzr, count > > + st1b z0.b, p1, [dstin, 0, mul vl] > > + st1b z0.b, p0, [dstin, 1, mul vl] > > + ret > > + > > + // count >= vector_length * 2 > > +1: cmp count, vector_length, lsl 2 > > + add dstend, dstin, count > > + b.hi 1f > > + st1b z0.b, p0, [dstin, 0, mul vl] > > + st1b z0.b, p0, [dstin, 1, mul vl] > > + st1b z0.b, p0, [dstend, -2, mul vl] > > + st1b z0.b, p0, [dstend, -1, mul vl] > > + ret > > + > > + // count > vector_length * 4 > > +1: lsl tmp1, vector_length, 3 > > + cmp count, tmp1 > > + b.hi L(vl_agnostic) > > + st1b z0.b, p0, [dstin, 0, mul vl] > > + st1b z0.b, p0, [dstin, 1, mul vl] > > + st1b z0.b, p0, [dstin, 2, mul vl] > > + st1b z0.b, p0, [dstin, 3, mul vl] > > + st1b z0.b, p0, [dstend, -4, mul vl] > > + st1b z0.b, p0, [dstend, -3, mul vl] > > + st1b z0.b, p0, [dstend, -2, mul vl] > > + st1b z0.b, p0, [dstend, -1, mul vl] > > + ret > > > > + .p2align 4 > > L(vl_agnostic): // VL Agnostic > > mov rest, count > > mov dst, dstin > >
Hi Naohiro, > Would you update the commit title so as not to be the same among 5 > patches? > Because we need to ask distro to backport these patches. > If all commit titles are the same, it will increase the room to happen > confusion and mistake. > > How about "AArch64: Improve A64FX memset for less than 512B" ? Generally the commit title in a patch series would include the series number, however it's also easy to add something to the title as suggested. As for backporting, one uses the hash of the patch in the cherry-pick rather than the title, so once you have the right hashes, there should be no possibility of confusion. > > -#define ZF_DIST (CACHE_LINE_SIZE * 21) // Zerofill distance > > This caused compile error. Sorry, that should be part of the 2nd patch. > > -ENTRY (MEMSET) > > +#undef BTI_C > > +#define BTI_C > > We discussed how should be defined BTI_C macro before, at that time conclusion > was "NOP" rather than empty unless HAVE_AARCH64_BTI. > Now the above code defines BTI_C as empty unconditionally. > A64FX doesn't support BTI, so this code is OK. > But I'm just interested in the reason why it is changed. We changed to NOP in the generic code, so that works for all string functions. In this specific case removing the initial NOP as well allows all performance critical code for <= 512 bytes to be perfectly aligned to 16-byte fetch blocks. Cheers, Wilco
The 08/02/2021 14:38, Wilco Dijkstra via Libc-alpha wrote: > > We discussed how should be defined BTI_C macro before, at that time conclusion > > was "NOP" rather than empty unless HAVE_AARCH64_BTI. > > Now the above code defines BTI_C as empty unconditionally. > > A64FX doesn't support BTI, so this code is OK. > > But I'm just interested in the reason why it is changed. > > We changed to NOP in the generic code, so that works for all string functions. > In this specific case removing the initial NOP as well allows all performance critical > code for <= 512 bytes to be perfectly aligned to 16-byte fetch blocks. yes, this makes sense: originally BTI_C was always hint 34, but since that can be slow it was changed for !HAVE_AARCH64_BTI. We don't want the layout of asm code to change based on toolchain configuration so BTI_C is defined as a place holder nop then. but in a64fx specific code bti is never needed so we also don't need the place holder nop, BTI_C can be unconditionally empty.
Hi Wilco, > > Would you update the commit title so as not to be the same among 5 > > patches? > > Because we need to ask distro to backport these patches. > > If all commit titles are the same, it will increase the room to happen > > confusion and mistake. > > > > How about "AArch64: Improve A64FX memset for less than 512B" ? > > Generally the commit title in a patch series would include the series number, > however it's also easy to add something to the title as suggested. As for > backporting, one uses the hash of the patch in the cherry-pick rather than > the title, so once you have the right hashes, there should be no possibility > of confusion. Thank you for considering it. If the communication is made among engineers, I think the series number is enough. But we don't know if people in the middle of engineer-to-engineer communication know git :-) Thanks. Naohiro
Hi Szabolcs, Wilco, > From: Szabolcs Nagy <szabolcs.nagy@arm.com> > Sent: Monday, August 2, 2021 11:50 PM > The 08/02/2021 14:38, Wilco Dijkstra via Libc-alpha wrote: > > > We discussed how should be defined BTI_C macro before, at that time conclusion > > > was "NOP" rather than empty unless HAVE_AARCH64_BTI. > > > Now the above code defines BTI_C as empty unconditionally. > > > A64FX doesn't support BTI, so this code is OK. > > > But I'm just interested in the reason why it is changed. > > > > We changed to NOP in the generic code, so that works for all string functions. > > In this specific case removing the initial NOP as well allows all performance critical > > code for <= 512 bytes to be perfectly aligned to 16-byte fetch blocks. > > yes, this makes sense: > > originally BTI_C was always hint 34, but since that can be > slow it was changed for !HAVE_AARCH64_BTI. We don't want the > layout of asm code to change based on toolchain configuration > so BTI_C is defined as a place holder nop then. Now I understand that the difference between nop and empty is the layout. When we discussed BTI_C before, I didn't ask about the difference so as not to prolong the discussion, because there is no performance difference. > but in a64fx specific code bti is never needed so we also > don't need the place holder nop, BTI_C can be unconditionally > empty. Yes, I'd like to change __memcpy_a64fx and __memmove_a64fx in the same way too. Thanks. Naohiro
The 08/03/2021 02:57, naohirot@fujitsu.com wrote: > > but in a64fx specific code bti is never needed so we also > > don't need the place holder nop, BTI_C can be unconditionally > > empty. > > Yes, I'd like to change __memcpy_a64fx and __memmove_a64fx to the same > way too. sounds good to me.
Hi Szabolcs, Wilco, > From: Szabolcs Nagy <szabolcs.nagy@arm.com> > Sent: Tuesday, August 3, 2021 5:02 PM > > The 08/03/2021 02:57, naohirot@fujitsu.com wrote: > > > but in a64fx specific code bti is never needed so we also > > > don't need the place holder nop, BTI_C can be unconditionally > > > empty. > > > > Yes, I'd like to change __memcpy_a64fx and __memmove_a64fx to the same > > way too. > > sounds good to me. I submitted a patch [1], please review it if it's OK or not. [1] https://sourceware.org/pipermail/libc-alpha/2021-September/131356.html Thanks. Naohiro
diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S index ce54e5418b08c8bc0ecc7affff68a59272ba6397..f7fcc7b323e1553f50a2e005b8ccef344a08127d 100644 --- a/sysdeps/aarch64/multiarch/memset_a64fx.S +++ b/sysdeps/aarch64/multiarch/memset_a64fx.S @@ -30,7 +30,6 @@ #define L2_SIZE (8*1024*1024) // L2 8MB - 1MB #define CACHE_LINE_SIZE 256 #define PF_DIST_L1 (CACHE_LINE_SIZE * 16) // Prefetch distance L1 -#define ZF_DIST (CACHE_LINE_SIZE * 21) // Zerofill distance #define rest x8 #define vector_length x9 #define vl_remainder x10 // vector_length remainder @@ -51,78 +50,54 @@ .endm .macro st1b_unroll first=0, last=7 - st1b z0.b, p0, [dst, #\first, mul vl] + st1b z0.b, p0, [dst, \first, mul vl] .if \last-\first st1b_unroll "(\first+1)", \last .endif .endm - .macro shortcut_for_small_size exit - // if rest <= vector_length * 2 - whilelo p0.b, xzr, count - whilelo p1.b, vector_length, count - b.last 1f - st1b z0.b, p0, [dstin, #0, mul vl] - st1b z0.b, p1, [dstin, #1, mul vl] - ret -1: // if rest > vector_length * 8 - cmp count, vector_length, lsl 3 // vector_length * 8 - b.hi \exit - // if rest <= vector_length * 4 - lsl tmp1, vector_length, 1 // vector_length * 2 - whilelo p2.b, tmp1, count - incb tmp1 - whilelo p3.b, tmp1, count - b.last 1f - st1b z0.b, p0, [dstin, #0, mul vl] - st1b z0.b, p1, [dstin, #1, mul vl] - st1b z0.b, p2, [dstin, #2, mul vl] - st1b z0.b, p3, [dstin, #3, mul vl] - ret -1: // if rest <= vector_length * 8 - lsl tmp1, vector_length, 2 // vector_length * 4 - whilelo p4.b, tmp1, count - incb tmp1 - whilelo p5.b, tmp1, count - b.last 1f - st1b z0.b, p0, [dstin, #0, mul vl] - st1b z0.b, p1, [dstin, #1, mul vl] - st1b z0.b, p2, [dstin, #2, mul vl] - st1b z0.b, p3, [dstin, #3, mul vl] - st1b z0.b, p4, [dstin, #4, mul vl] - st1b z0.b, p5, [dstin, #5, mul vl] - ret -1: lsl tmp1, vector_length, 2 // vector_length * 4 - incb tmp1 // vector_length * 5 - incb tmp1 // vector_length * 6 - whilelo p6.b, tmp1, count - incb 
tmp1 - whilelo p7.b, tmp1, count - st1b z0.b, p0, [dstin, #0, mul vl] - st1b z0.b, p1, [dstin, #1, mul vl] - st1b z0.b, p2, [dstin, #2, mul vl] - st1b z0.b, p3, [dstin, #3, mul vl] - st1b z0.b, p4, [dstin, #4, mul vl] - st1b z0.b, p5, [dstin, #5, mul vl] - st1b z0.b, p6, [dstin, #6, mul vl] - st1b z0.b, p7, [dstin, #7, mul vl] - ret - .endm -ENTRY (MEMSET) +#undef BTI_C +#define BTI_C +ENTRY (MEMSET) PTR_ARG (0) SIZE_ARG (2) - cbnz count, 1f - ret -1: dup z0.b, valw cntb vector_length - // shortcut for less than vector_length * 8 - // gives a free ptrue to p0.b for n >= vector_length - shortcut_for_small_size L(vl_agnostic) - // end of shortcut + dup z0.b, valw + whilelo p0.b, vector_length, count + b.last 1f + whilelo p1.b, xzr, count + st1b z0.b, p1, [dstin, 0, mul vl] + st1b z0.b, p0, [dstin, 1, mul vl] + ret + + // count >= vector_length * 2 +1: cmp count, vector_length, lsl 2 + add dstend, dstin, count + b.hi 1f + st1b z0.b, p0, [dstin, 0, mul vl] + st1b z0.b, p0, [dstin, 1, mul vl] + st1b z0.b, p0, [dstend, -2, mul vl] + st1b z0.b, p0, [dstend, -1, mul vl] + ret + + // count > vector_length * 4 +1: lsl tmp1, vector_length, 3 + cmp count, tmp1 + b.hi L(vl_agnostic) + st1b z0.b, p0, [dstin, 0, mul vl] + st1b z0.b, p0, [dstin, 1, mul vl] + st1b z0.b, p0, [dstin, 2, mul vl] + st1b z0.b, p0, [dstin, 3, mul vl] + st1b z0.b, p0, [dstend, -4, mul vl] + st1b z0.b, p0, [dstend, -3, mul vl] + st1b z0.b, p0, [dstend, -2, mul vl] + st1b z0.b, p0, [dstend, -1, mul vl] + ret + .p2align 4 L(vl_agnostic): // VL Agnostic mov rest, count mov dst, dstin