Message ID | VE1PR08MB559938483A0630B16E8C64B383E49@VE1PR08MB5599.eurprd08.prod.outlook.com
State      | New
Series     | [v3,1/5] AArch64: Improve A64FX memset
Hi Wilco,

Thank you for the patch. I confirmed that performance is improved over
master, as shown in the graphs [1][2].

Reviewed-by: Naohiro Tamura <naohirot@fujitsu.com>
Tested-by: Naohiro Tamura <naohirot@fujitsu.com>

[1] https://drive.google.com/file/d/1RxdIlJa2Wvl8eT5_TRVkvbkyS6bfZObx/view?usp=sharing
[2] https://drive.google.com/file/d/1xCLsa7qweovdQpWtfnNZZwcEi7W7z3Ok/view?usp=sharing

I have one comment and one question below.

> Subject: [PATCH v3 2/5] AArch64: Improve A64FX memset

How about "AArch64: Improve A64FX memset for more than 8MB"?

>
> Improve performance of large memsets. Simplify alignment code. For zero
> memset use DC ZVA, which almost doubles performance. For non-zero memsets
> use the unroll8 loop which is about 10% faster.
>
> ---
>
> diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
> index f7fcc7b323e1553f50a2e005b8ccef344a08127d..608e0e2e2ff5259178e2fdadf1eea8816194d879 100644
> --- a/sysdeps/aarch64/multiarch/memset_a64fx.S
> +++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
> @@ -30,10 +30,8 @@
>  #define L2_SIZE (8*1024*1024) // L2 8MB - 1MB
>  #define CACHE_LINE_SIZE 256
>  #define PF_DIST_L1 (CACHE_LINE_SIZE * 16) // Prefetch distance L1
> -#define rest x8
> +#define rest x2
>  #define vector_length x9
> -#define vl_remainder x10 // vector_length remainder
> -#define cl_remainder x11 // CACHE_LINE_SIZE remainder
>
>  #if HAVE_AARCH64_SVE_ASM
>  # if IS_IN (libc)
> @@ -41,14 +39,6 @@
>
>  .arch armv8.2-a+sve
>
> - .macro dc_zva times
> - dc zva, tmp1
> - add tmp1, tmp1, CACHE_LINE_SIZE
> - .if \times-1
> - dc_zva "(\times-1)"
> - .endif
> - .endm
> -
>  .macro st1b_unroll first=0, last=7
>  st1b z0.b, p0, [dst, \first, mul vl]
>  .if \last-\first
> @@ -187,54 +177,29 @@ L(L1_prefetch): // if rest >= L1_SIZE
>  cbnz rest, L(unroll32)
>  ret
>
> + // count >= L2_SIZE
>  L(L2):
> - // align dst address at vector_length byte boundary
> - sub tmp1, vector_length, 1
> - ands tmp2, dst, tmp1
> - // if vl_remainder == 0
> - b.eq 1f
> - sub vl_remainder, vector_length, tmp2
> - // process remainder until the first vector_length boundary
> - whilelt p2.b, xzr, vl_remainder
> - st1b z0.b, p2, [dst]
> - add dst, dst, vl_remainder
> - sub rest, rest, vl_remainder
> - // align dstin address at CACHE_LINE_SIZE byte boundary
> -1: mov tmp1, CACHE_LINE_SIZE
> - ands tmp2, dst, CACHE_LINE_SIZE - 1
> - // if cl_remainder == 0
> - b.eq L(L2_dc_zva)
> - sub cl_remainder, tmp1, tmp2
> - // process remainder until the first CACHE_LINE_SIZE boundary
> - mov tmp1, xzr // index
> -2: whilelt p2.b, tmp1, cl_remainder
> - st1b z0.b, p2, [dst, tmp1]
> - incb tmp1
> - cmp tmp1, cl_remainder
> - b.lo 2b
> - add dst, dst, cl_remainder
> - sub rest, rest, cl_remainder
> -
> -L(L2_dc_zva):
> - // zero fill
> - mov tmp1, dst
> - dc_zva (ZF_DIST / CACHE_LINE_SIZE) - 1
> - mov zva_len, ZF_DIST
> - add tmp1, zva_len, CACHE_LINE_SIZE * 2
> - // unroll
> - .p2align 3
> -1: st1b_unroll 0, 3
> - add tmp2, dst, zva_len
> - dc zva, tmp2
> - st1b_unroll 4, 7
> - add tmp2, tmp2, CACHE_LINE_SIZE
> - dc zva, tmp2
> - add dst, dst, CACHE_LINE_SIZE * 2
> - sub rest, rest, CACHE_LINE_SIZE * 2
> - cmp rest, tmp1 // ZF_DIST + CACHE_LINE_SIZE * 2
> - b.ge 1b
> - cbnz rest, L(unroll8)
> - ret
> + tst valw, 255
> + b.ne L(unroll8)
> + // align dst to CACHE_LINE_SIZE byte boundary
> + and tmp2, dst, CACHE_LINE_SIZE - 1
> + sub tmp2, tmp2, CACHE_LINE_SIZE
> + st1b z0.b, p0, [dst, 0, mul vl]
> + st1b z0.b, p0, [dst, 1, mul vl]
> + st1b z0.b, p0, [dst, 2, mul vl]
> + st1b z0.b, p0, [dst, 3, mul vl]
> + sub dst, dst, tmp2
> + add count, count, tmp2
> +
> + // clear cachelines using DC ZVA
> + sub count, count, CACHE_LINE_SIZE
> + .p2align 4
> +1: dc zva, dst

DC ZVA is called if the buffer size is more than 8MB and the fill data is
zero. In __memset_generic, DC ZVA is called if the buffer size is more than
256B and the fill data is zero; this was implemented by you [3].

V3 patch 02 __memset_a64fx (green line) recorded performance very close to
__memset_generic (red line) for zerofill [4]. V3 patch 05 __memset_a64fx
(green line) recorded almost the same performance as __memset_generic (red
line) for zerofill [5].

The X axis of graphs [4][5] runs from 256B to 64MB, and the graphs were
created with the following command:

$ cat bench-memset-zerofill.out | \
> jq -r 'del(.functions.memset.results[] | select(.char2 != 0))' | \
> plot_strings.py -l -p thru -v -

So the DC ZVA implementations for __memset_generic and __memset_a64fx each
seem appropriate. But comparing the nonzero fill graph [6] with the zero
fill graph [4], why is DC ZVA only effective above 8MB for __memset_a64fx,
even though it is effective from smaller sizes for __memset_generic? I
still don't fully understand the DC ZVA behavior.

[3] https://sourceware.org/git/?p=glibc.git;a=commit;f=sysdeps/aarch64/memset.S;h=a8c5a2a9521e105da6e96eaf4029b8e4d595e4f5
[4] https://drive.google.com/file/d/1f0_sTiujCcEZTfxbQ1UZdVwAvbMbP2ii/view?usp=sharing
[5] https://drive.google.com/file/d/1Wyp3GO-9ipcphwqOQOQ9a97EwFz90SPc/view?usp=sharing
[6] https://drive.google.com/file/d/1nZ_lfj6Kz5vFCR35O0q929SceUP-wbih/view?usp=sharing

Thanks.
Naohiro

> + add dst, dst, CACHE_LINE_SIZE
> + subs count, count, CACHE_LINE_SIZE
> + b.hi 1b
> + add count, count, CACHE_LINE_SIZE
> + b L(last)
>
> END (MEMSET)
> libc_hidden_builtin_def (MEMSET)
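[Editorial note: for readers following the DC ZVA discussion, here is a
minimal C sketch of what the patch's zero-fill path does conceptually:
read the ZVA block size from DCZID_EL0, then clear one block per DC ZVA
until less than a block remains. The helper names (zva_block_size,
zva_zerofill) are hypothetical and GCC-style inline assembly is assumed;
the real code is SVE assembly and hardcodes the 256-byte A64FX cache line.]

#include <stdint.h>
#include <stddef.h>

/* DCZID_EL0 bits [3:0] give log2 of the DC ZVA block size in 4-byte
   words; bit 4 (DZP) set means DC ZVA is prohibited.  Returns 0 if
   DC ZVA must not be used.  */
static size_t
zva_block_size (void)
{
  uint64_t dczid;
  asm ("mrs %0, dczid_el0" : "=r" (dczid));
  return (dczid & 16) ? 0 : (size_t) 4 << (dczid & 15);
}

/* Clear whole blocks with DC ZVA.  Assumes dst is already aligned to
   the block size (256 bytes on A64FX); the remaining tail is left to
   ordinary stores, as in the patch.  */
static void
zva_zerofill (char *dst, size_t count, size_t block)
{
  while (count >= block)
    {
      asm volatile ("dc zva, %0" : : "r" (dst) : "memory");
      dst += block;
      count -= block;
    }
}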
Hi Wilco,

I found my typo in the original code comment. Would you fix it with the
following?

> > -#define ZF_DIST (CACHE_LINE_SIZE * 21) // Zerofill distance

> From: Tamura, Naohiro/田村 直広 <naohirot@fujitsu.com>
> Sent: Monday, August 2, 2021 10:29 PM
> > diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
> > index f7fcc7b323e1553f50a2e005b8ccef344a08127d..608e0e2e2ff5259178e2fdadf1eea8816194d879 100644
> > --- a/sysdeps/aarch64/multiarch/memset_a64fx.S
> > +++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
> > @@ -30,10 +30,8 @@
> >  #define L2_SIZE (8*1024*1024) // L2 8MB - 1MB

Wrong: // L2 8MB - 1MB
Right: // L2 8MB

> >  #define CACHE_LINE_SIZE 256
> >  #define PF_DIST_L1 (CACHE_LINE_SIZE * 16) // Prefetch distance L1
> > -#define rest x8
> > +#define rest x2
> >  #define vector_length x9
> > -#define vl_remainder x10 // vector_length remainder
> > -#define cl_remainder x11 // CACHE_LINE_SIZE remainder

Thanks.
Naohiro
Hi Wilco,

Sorry, I forgot to mention one thing, a readability matter.

> From: Tamura, Naohiro/田村 直広 <naohirot@fujitsu.com>
> Sent: Monday, August 2, 2021 10:29 PM
> > + // count >= L2_SIZE
> >  L(L2):
> > - // align dst address at vector_length byte boundary
> > - sub tmp1, vector_length, 1
> > - ands tmp2, dst, tmp1
> > - // if vl_remainder == 0
> > - b.eq 1f
> > - sub vl_remainder, vector_length, tmp2
> > - // process remainder until the first vector_length boundary
> > - whilelt p2.b, xzr, vl_remainder
> > - st1b z0.b, p2, [dst]
> > - add dst, dst, vl_remainder
> > - sub rest, rest, vl_remainder
> > - // align dstin address at CACHE_LINE_SIZE byte boundary
> > -1: mov tmp1, CACHE_LINE_SIZE
> > - ands tmp2, dst, CACHE_LINE_SIZE - 1
> > - // if cl_remainder == 0
> > - b.eq L(L2_dc_zva)
> > - sub cl_remainder, tmp1, tmp2
> > - // process remainder until the first CACHE_LINE_SIZE boundary
> > - mov tmp1, xzr // index
> > -2: whilelt p2.b, tmp1, cl_remainder
> > - st1b z0.b, p2, [dst, tmp1]
> > - incb tmp1
> > - cmp tmp1, cl_remainder
> > - b.lo 2b
> > - add dst, dst, cl_remainder
> > - sub rest, rest, cl_remainder
> > -
> > -L(L2_dc_zva):
> > - // zero fill
> > - mov tmp1, dst
> > - dc_zva (ZF_DIST / CACHE_LINE_SIZE) - 1
> > - mov zva_len, ZF_DIST
> > - add tmp1, zva_len, CACHE_LINE_SIZE * 2
> > - // unroll
> > - .p2align 3
> > -1: st1b_unroll 0, 3
> > - add tmp2, dst, zva_len
> > - dc zva, tmp2
> > - st1b_unroll 4, 7
> > - add tmp2, tmp2, CACHE_LINE_SIZE
> > - dc zva, tmp2
> > - add dst, dst, CACHE_LINE_SIZE * 2
> > - sub rest, rest, CACHE_LINE_SIZE * 2
> > - cmp rest, tmp1 // ZF_DIST + CACHE_LINE_SIZE * 2
> > - b.ge 1b
> > - cbnz rest, L(unroll8)
> > - ret
> > + tst valw, 255
> > + b.ne L(unroll8)
> > + // align dst to CACHE_LINE_SIZE byte boundary
> > + and tmp2, dst, CACHE_LINE_SIZE - 1
> > + sub tmp2, tmp2, CACHE_LINE_SIZE

"tmp2" always becomes a negative value. I felt it would be easier to
understand and more natural if it were reversed, like this:

    sub tmp2, CACHE_LINE_SIZE, tmp2

> > + st1b z0.b, p0, [dst, 0, mul vl]
> > + st1b z0.b, p0, [dst, 1, mul vl]
> > + st1b z0.b, p0, [dst, 2, mul vl]
> > + st1b z0.b, p0, [dst, 3, mul vl]
> > + sub dst, dst, tmp2

"dst" needs to be incremented. Here "dst" is actually incremented by "sub"
because "tmp2" is negative. It would read more naturally if "tmp2" were
positive, like this:

    add dst, dst, tmp2

> > + add count, count, tmp2

"count" needs to be decremented. Here "count" is actually decremented by
"add" because "tmp2" is negative. It would read more naturally if "tmp2"
were positive, like this:

    sub count, count, tmp2

Thanks.
Naohiro
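[Editorial note: to make the arithmetic under discussion concrete, here is
a hedged C rendering of those three instructions; align_dst and the
variable names are illustrative, not part of the patch.]

#include <stdint.h>
#include <stddef.h>

#define CACHE_LINE_SIZE 256

/* C rendering of the sequence above: tmp2 ends up in [-256, -1], so
   "sub dst, dst, tmp2" rounds dst up to the next cache-line boundary
   and "add count, count, tmp2" drops the bytes that the four preceding
   SVE stores (4 x 64-byte vectors on A64FX, i.e. one full 256-byte
   line) have already written.  */
static void
align_dst (char **dst, size_t *count)
{
  intptr_t tmp2 = (intptr_t) ((uintptr_t) *dst & (CACHE_LINE_SIZE - 1))
                  - CACHE_LINE_SIZE;   /* and + sub: always negative */
  *dst -= tmp2;     /* sub dst, dst, tmp2: dst advances 1..256 bytes */
  *count += tmp2;   /* add count, count, tmp2: count shrinks */
}

[Note that when dst is already aligned, tmp2 is -256 and dst still
advances a full line; this is safe because the four stores covered
exactly 256 bytes from the old dst.]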
Hi Naohiro,

> > + // align dst to CACHE_LINE_SIZE byte boundary
> > + and tmp2, dst, CACHE_LINE_SIZE - 1
> > + sub tmp2, tmp2, CACHE_LINE_SIZE

> "tmp2" always becomes a negative value. I felt it would be easier to
> understand and more natural if it were reversed, like this:
>
>     sub tmp2, CACHE_LINE_SIZE, tmp2

That's not a valid instruction though: the first source operand of SUB
must be a register, not an immediate. I've just removed it in v4 since we
can delay the cacheline adjustment of dst and count to later instructions.

> But comparing the nonzero fill graph [6] with the zero fill graph [4],
> why is DC ZVA only effective above 8MB for __memset_a64fx, even though
> it is effective from smaller sizes for __memset_generic?

Well, it seems that on A64FX DC ZVA is faster only when the data is not
in L1. So it may be feasible to use DC ZVA for smaller sizes.

Cheers,
Wilco
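[Editorial note: a sketch of how one could probe this observation is to
time a DC ZVA loop against plain stores on warm (cached) and cold buffers.
This is only an illustration under stated assumptions: a 256-byte ZVA
block as on A64FX, a 256-byte-aligned buffer, and hypothetical function
names; a plain store loop is used instead of memset so glibc's own DC ZVA
path does not skew the comparison.]

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <time.h>

/* Time one zero-fill of buf, either with a DC ZVA loop or with plain
   64-bit stores.  If warm, touch the buffer first so it is in cache.  */
static double
time_zerofill (char *buf, size_t size, int use_zva, int warm)
{
  if (warm)
    memset (buf, 1, size);   /* nonzero fill, so no DC ZVA inside */
  struct timespec t0, t1;
  clock_gettime (CLOCK_MONOTONIC, &t0);
  if (use_zva)
    for (size_t i = 0; i < size; i += 256)   /* assumes 256B ZVA block */
      asm volatile ("dc zva, %0" : : "r" (buf + i) : "memory");
  else
    for (size_t i = 0; i < size; i += 8)
      *(uint64_t *) (buf + i) = 0;
  clock_gettime (CLOCK_MONOTONIC, &t1);
  return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
}

int
main (void)
{
  size_t size = 1024 * 1024;               /* 1MB: within A64FX L2, above L1 */
  char *buf = aligned_alloc (256, size);   /* DC ZVA needs aligned lines */
  printf ("zva cold: %f s\n", time_zerofill (buf, size, 1, 0));
  printf ("zva warm: %f s\n", time_zerofill (buf, size, 1, 1));
  printf ("stores:   %f s\n", time_zerofill (buf, size, 0, 1));
  free (buf);
  return 0;
}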
diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
index f7fcc7b323e1553f50a2e005b8ccef344a08127d..608e0e2e2ff5259178e2fdadf1eea8816194d879 100644
--- a/sysdeps/aarch64/multiarch/memset_a64fx.S
+++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
@@ -30,10 +30,8 @@
 #define L2_SIZE         (8*1024*1024)   // L2 8MB - 1MB
 #define CACHE_LINE_SIZE 256
 #define PF_DIST_L1      (CACHE_LINE_SIZE * 16) // Prefetch distance L1
-#define rest            x8
+#define rest            x2
 #define vector_length   x9
-#define vl_remainder    x10     // vector_length remainder
-#define cl_remainder    x11     // CACHE_LINE_SIZE remainder
 
 #if HAVE_AARCH64_SVE_ASM
 # if IS_IN (libc)
@@ -41,14 +39,6 @@
 
 	.arch armv8.2-a+sve
 
-	.macro dc_zva times
-	dc	zva, tmp1
-	add	tmp1, tmp1, CACHE_LINE_SIZE
-	.if \times-1
-	dc_zva "(\times-1)"
-	.endif
-	.endm
-
 	.macro st1b_unroll first=0, last=7
 	st1b	z0.b, p0, [dst, \first, mul vl]
 	.if \last-\first
@@ -187,54 +177,29 @@ L(L1_prefetch): // if rest >= L1_SIZE
 	cbnz	rest, L(unroll32)
 	ret
 
+	// count >= L2_SIZE
 L(L2):
-	// align dst address at vector_length byte boundary
-	sub	tmp1, vector_length, 1
-	ands	tmp2, dst, tmp1
-	// if vl_remainder == 0
-	b.eq	1f
-	sub	vl_remainder, vector_length, tmp2
-	// process remainder until the first vector_length boundary
-	whilelt	p2.b, xzr, vl_remainder
-	st1b	z0.b, p2, [dst]
-	add	dst, dst, vl_remainder
-	sub	rest, rest, vl_remainder
-	// align dstin address at CACHE_LINE_SIZE byte boundary
-1:	mov	tmp1, CACHE_LINE_SIZE
-	ands	tmp2, dst, CACHE_LINE_SIZE - 1
-	// if cl_remainder == 0
-	b.eq	L(L2_dc_zva)
-	sub	cl_remainder, tmp1, tmp2
-	// process remainder until the first CACHE_LINE_SIZE boundary
-	mov	tmp1, xzr	// index
-2:	whilelt	p2.b, tmp1, cl_remainder
-	st1b	z0.b, p2, [dst, tmp1]
-	incb	tmp1
-	cmp	tmp1, cl_remainder
-	b.lo	2b
-	add	dst, dst, cl_remainder
-	sub	rest, rest, cl_remainder
-
-L(L2_dc_zva):
-	// zero fill
-	mov	tmp1, dst
-	dc_zva	(ZF_DIST / CACHE_LINE_SIZE) - 1
-	mov	zva_len, ZF_DIST
-	add	tmp1, zva_len, CACHE_LINE_SIZE * 2
-	// unroll
-	.p2align 3
-1:	st1b_unroll 0, 3
-	add	tmp2, dst, zva_len
-	dc	zva, tmp2
-	st1b_unroll 4, 7
-	add	tmp2, tmp2, CACHE_LINE_SIZE
-	dc	zva, tmp2
-	add	dst, dst, CACHE_LINE_SIZE * 2
-	sub	rest, rest, CACHE_LINE_SIZE * 2
-	cmp	rest, tmp1	// ZF_DIST + CACHE_LINE_SIZE * 2
-	b.ge	1b
-	cbnz	rest, L(unroll8)
-	ret
+	tst	valw, 255
+	b.ne	L(unroll8)
+	// align dst to CACHE_LINE_SIZE byte boundary
+	and	tmp2, dst, CACHE_LINE_SIZE - 1
+	sub	tmp2, tmp2, CACHE_LINE_SIZE
+	st1b	z0.b, p0, [dst, 0, mul vl]
+	st1b	z0.b, p0, [dst, 1, mul vl]
+	st1b	z0.b, p0, [dst, 2, mul vl]
+	st1b	z0.b, p0, [dst, 3, mul vl]
+	sub	dst, dst, tmp2
+	add	count, count, tmp2
+
+	// clear cachelines using DC ZVA
+	sub	count, count, CACHE_LINE_SIZE
+	.p2align 4
+1:	dc	zva, dst
+	add	dst, dst, CACHE_LINE_SIZE
+	subs	count, count, CACHE_LINE_SIZE
+	b.hi	1b
+	add	count, count, CACHE_LINE_SIZE
+	b	L(last)
 
 END (MEMSET)
 libc_hidden_builtin_def (MEMSET)