Message ID | 20180202045056.3121-1-siddhesh@sourceware.org |
---|---|
State | New |
Series | aarch64: Optimized memcmp for medium to large sizes |
Ping!

On Friday 02 February 2018 10:20 AM, Siddhesh Poyarekar wrote:
> This improved memcmp provides a fast path for compares up to 16 bytes
> and then compares 16 bytes at a time, thus optimizing loads from both
> sources. The glibc memcmp microbenchmark retains performance (with an
> error of ~1ns) for smaller compare sizes and reduces up to 31% of
> execution time for compares up to 4K on the APM Mustang. On Qualcomm
> Falkor this improves to almost 48%, i.e. it is almost 2x improvement
> for sizes of 2K and above.
>
> 	* sysdeps/aarch64/memcmp.S: Widen comparison to 16 bytes at a
> 	time.
> ---
>  sysdeps/aarch64/memcmp.S | 66 ++++++++++++++++++++++++++++++++++++------------
>  1 file changed, 50 insertions(+), 16 deletions(-)
>
> [...]
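To make the strategy in the commit message concrete, here is a rough C model of the head fast path it describes: the first 8 bytes are compared as one word, and for total sizes of 16 bytes or less the rest is covered by a single, possibly overlapping, 8-byte load. This is an illustrative sketch assuming a little-endian target; load64 and word_result are made-up helpers for the sketch, not glibc code.

```c
#include <stdint.h>
#include <string.h>

/* Hypothetical helper: unaligned 8-byte load (ldr in the assembly).  */
static uint64_t load64 (const unsigned char *p)
{
  uint64_t v;
  memcpy (&v, p, sizeof v);
  return v;
}

/* Turn two unequal 8-byte words into a memcmp-style result.  On a
   little-endian machine the first differing byte is the least significant
   differing byte, so byte-swap before the unsigned comparison.  */
static int word_result (uint64_t a, uint64_t b)
{
  a = __builtin_bswap64 (a);
  b = __builtin_bswap64 (b);
  return a < b ? -1 : 1;
}

/* Sketch of the fast path for 8 <= n <= 16.  Sizes below 8 take the
   separate L(less8) path; larger sizes fall through to the wide loop.  */
static int head_compare (const unsigned char *s1, const unsigned char *s2,
                         size_t n)
{
  uint64_t d1 = load64 (s1), d2 = load64 (s2);
  if (d1 != d2)
    return word_result (d1, d2);

  /* Overlapping load of the final 8 bytes; for n == 8 this simply
     re-reads the same bytes, which is still correct.  */
  d1 = load64 (s1 + n - 8);
  d2 = load64 (s2 + n - 8);
  return d1 == d2 ? 0 : word_result (d1, d2);
}
```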
Ping!

On Tuesday 06 February 2018 11:07 AM, Siddhesh Poyarekar wrote:
> Ping!
>
> On Friday 02 February 2018 10:20 AM, Siddhesh Poyarekar wrote:
>> This improved memcmp provides a fast path for compares up to 16 bytes
>> and then compares 16 bytes at a time, thus optimizing loads from both
>> sources.
>> [...]
On 2 February 2018 at 04:50, Siddhesh Poyarekar <siddhesh@sourceware.org> wrote:
> This improved memcmp provides a fast path for compares up to 16 bytes
> and then compares 16 bytes at a time, thus optimizing loads from both
> sources. The glibc memcmp microbenchmark retains performance (with an
> error of ~1ns) for smaller compare sizes and reduces up to 31% of
> execution time for compares up to 4K on the APM Mustang. On Qualcomm
> Falkor this improves to almost 48%, i.e. it is almost 2x improvement
> for sizes of 2K and above.

Hi Siddhesh,

Thanks for sharing the performance numbers on these two
u-architectures. Have you looked at the impact of this patch on
performance of the various other aarch64 u-architectures? If so
please share your findings.

Cheers
/Marcus
On Monday 12 February 2018 05:07 PM, Marcus Shawcroft wrote:
> Thanks for sharing the performance numbers on these two
> u-architectures. Have you looked at the impact of this patch on
> performance of the various other aarch64 u-architectures? If so
> please share your findings.

I don't have ready access to any other u-arch (unless you want me to
share more performance numbers on other Qualcomm cores, but I suspect
you don't ;)), but I'll ask around and let you know.

Thanks,
Siddhesh
On Monday 12 February 2018 07:41 PM, Siddhesh Poyarekar wrote:
> On Monday 12 February 2018 05:07 PM, Marcus Shawcroft wrote:
>> Thanks for sharing the performance numbers on these two
>> u-architectures. Have you looked at the impact of this patch on
>> performance of the various other aarch64 u-architectures? If so
>> please share your findings.
>
> I don't have ready access to any other u-arch (unless you want me to
> share more performance numbers on other Qualcomm cores, but I suspect
> you don't ;)), but I'll ask around and let you know.

Sorry it took me a while, but I was finally able to get my hands on a
HiKey960 (which has A73 and A53 cores) and set it up.  I isolated
bench-memcmp on each of those core types one at a time and both do
fairly well with the new memcmp.  On the A73 cores, the 128-byte to 4K
compares take 11%-33% less time.  On the A53 cores, the same range
takes 8%-30% less time.

Numbers for smaller sizes are unstable, fluctuating between -30% and
+30% for the same inputs; I have not found a way to stabilize them in
the benchmark.

So we now have numbers for four microarchitectures, all showing
positive gains for medium sizes and no significant change for the
smaller sizes.

Siddhesh
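Isolating the benchmark on one core type, as described above, can be done by pinning the process to a chosen CPU before it runs. Here is a minimal, assumption-laden sketch using the Linux sched_setaffinity(2) call; the CPU number and the bench-memcmp path are placeholders, not values from this thread.

```c
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main (int argc, char **argv)
{
  /* Placeholder: pass the CPU number of the cluster being measured
     (e.g. an A53 or A73 core on a HiKey960) as the first argument.  */
  int cpu = argc > 1 ? atoi (argv[1]) : 0;

  cpu_set_t set;
  CPU_ZERO (&set);
  CPU_SET (cpu, &set);
  if (sched_setaffinity (0, sizeof set, &set) != 0)
    {
      perror ("sched_setaffinity");
      return 1;
    }

  /* Hypothetical path: run the benchmark binary pinned to that core.  */
  execl ("./bench-memcmp", "bench-memcmp", (char *) NULL);
  perror ("execl");
  return 1;
}
```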
Ping: to summarize, these are the peak performance results for 2K+
comparisons in the memcmp microbenchmark:

falkor:     almost 2x (up to 48% time reduction)
xgene:      over 1.5x (up to 31% time reduction)
cortex-a73: over 1.5x (up to 33% time reduction)
cortex-a53: over 1.5x (up to 30% time reduction)

Siddhesh

On Friday 02 February 2018 10:20 AM, Siddhesh Poyarekar wrote:
> This improved memcmp provides a fast path for compares up to 16 bytes
> and then compares 16 bytes at a time, thus optimizing loads from both
> sources.
> [...]
Ping!

On Thursday 01 March 2018 04:48 PM, Siddhesh Poyarekar wrote:
> Ping: to summarize, these are the peak performance results for 2K+
> comparisons in the memcmp microbenchmark:
>
> falkor:     almost 2x (up to 48% time reduction)
> xgene:      over 1.5x (up to 31% time reduction)
> cortex-a73: over 1.5x (up to 33% time reduction)
> cortex-a53: over 1.5x (up to 30% time reduction)
>
> Siddhesh
>
> [...]
On 02/02/2018 02:50, Siddhesh Poyarekar wrote:
> This improved memcmp provides a fast path for compares up to 16 bytes
> and then compares 16 bytes at a time, thus optimizing loads from both
> sources. The glibc memcmp microbenchmark retains performance (with an
> error of ~1ns) for smaller compare sizes and reduces up to 31% of
> execution time for compares up to 4K on the APM Mustang. On Qualcomm
> Falkor this improves to almost 48%, i.e. it is almost 2x improvement
> for sizes of 2K and above.
>
> 	* sysdeps/aarch64/memcmp.S: Widen comparison to 16 bytes at a
> 	time.

LGTM with some comment clarifications below.

> [...]
> +	/* Jump directly to copying the last 16 bytes for 32 byte (or less)
> +	   strings.  */
> +	subs	limit, limit, 16
> +	b.ls	L(last_bytes)

Shouldn't this be 'Jump directly to comparing [...]'?

> [...]
> 	/* Loop performing 8 bytes per iteration using aligned src1.
> 	   Limit is pre-decremented by 8 and must be larger than zero.
> 	   Exit if <= 8 bytes left to do or if the data is not equal.  */

I think you need to update the comment here to reflect the use of ldp
instead of ldr.

> 	.p2align 4
> L(loop8):

Also I think we should rename it to loop16 or similar.

> -	ldr	data1, [src1], 8
> -	ldr	data2, [src2], 8
> -	subs	limit, limit, 8
> -	ccmp	data1, data2, 0, hi	/* NZCV = 0b0000.  */
> +	ldp	data1, data1h, [src1], 16
> +	ldp	data2, data2h, [src2], 16
> +	subs	limit, limit, 16
> +	ccmp	data1, data2, 0, hi
> +	ccmp	data1h, data2h, 0, eq
> 	b.eq	L(loop8)
> [...]
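For readers less used to conditional compares, the loop quoted in the review amounts to the following C control flow. This is an illustrative model of the flag logic only, not glibc source; memcpy stands in for the paired unaligned loads, and the function is assumed to be entered only on the paths that reach the loop (more than 16 bytes remaining).

```c
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Model of the 16-bytes-per-iteration loop: "subs" tests how many bytes
   remain, the first ccmp compares data1/data2 only if more than 16 bytes
   remained, and the second ccmp compares the high halves only if the low
   halves matched.  The single b.eq therefore loops on the conjunction of
   all three conditions, with no extra branches inside the loop.
   Returns 1 if the loop stopped because a 16-byte block differed, 0 if it
   stopped because at most 16 bytes are left (the tail compare then takes
   over, exactly as the assembly falls through to L(last_bytes)).  */
static int loop16_hits_difference (const unsigned char *s1,
                                   const unsigned char *s2,
                                   ptrdiff_t remaining)
{
  uint64_t d1, d1h, d2, d2h;

  do
    {
      memcpy (&d1,  s1,     8);  memcpy (&d1h, s1 + 8, 8);
      memcpy (&d2,  s2,     8);  memcpy (&d2h, s2 + 8, 8);
      s1 += 16;  s2 += 16;
      remaining -= 16;
    }
  while (remaining > 0 && d1 == d2 && d1h == d2h);

  /* Like the assembly, re-check the last block loaded, since the loop may
     have exited on the count rather than on a data mismatch.  */
  return d1 != d2 || d1h != d2h;
}
```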
On Tuesday 06 March 2018 06:45 PM, Adhemerval Zanella wrote:
> On 02/02/2018 02:50, Siddhesh Poyarekar wrote:
>> This improved memcmp provides a fast path for compares up to 16 bytes
>> and then compares 16 bytes at a time, thus optimizing loads from both
>> sources.
>> [...]
>
> LGTM with some comment clarifications below.

Thanks, fixed up and pushed.

Siddhesh
On 06/03/18 13:54, Siddhesh Poyarekar wrote:
> On Tuesday 06 March 2018 06:45 PM, Adhemerval Zanella wrote:
>> LGTM with some comment clarifications below.
>
> Thanks, fixed up and pushed.

this broke the build for me:

/B/elf/librtld.os: In function `memcmp':
/S/string/../sysdeps/aarch64/memcmp.S:78: undefined reference to `.Lloop8'
collect2: error: ld returned 1 exit status
make[2]: *** [/B/elf/ld.so] Error 1
make[2]: Leaving directory `/S/elf'
On Tuesday 06 March 2018 10:47 PM, Szabolcs Nagy wrote:
> this broke the build for me:
>
> /B/elf/librtld.os: In function `memcmp':
> /S/string/../sysdeps/aarch64/memcmp.S:78: undefined reference to `.Lloop8'
> collect2: error: ld returned 1 exit status
> make[2]: *** [/B/elf/ld.so] Error 1
> make[2]: Leaving directory `/S/elf'

Sorry, I took the lazy way out and failed to smoke test the loop8 name
fixup, so I missed one instance.  I've pushed this obvious fix after
actually building it this time.

Siddhesh

From 4e54d918630ea53e29dd70d3bdffcb00d29ed3d4 Mon Sep 17 00:00:00 2001
From: Siddhesh Poyarekar <siddhesh@sourceware.org>
Date: Tue, 6 Mar 2018 22:56:35 +0530
Subject: [PATCH] aarch64: Fix branch target to loop16

I goofed up when changing the loop8 name to loop16 and missed out on
the branch instance.  Fixed and actually build tested this time.

	* sysdeps/aarch64/memcmp.S (more16): Fix branch target loop16.
---
 ChangeLog                | 2 ++
 sysdeps/aarch64/memcmp.S | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/ChangeLog b/ChangeLog
index 23609b80d7..a24ed86474 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,7 @@
 2018-03-06  Siddhesh Poyarekar  <siddhesh@sourceware.org>
 
+	* sysdeps/aarch64/memcmp.S (more16): Fix loop16 branch target.
+
 	* sysdeps/aarch64/memcmp.S: Widen comparison to 16 bytes at a
 	time.
 
diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S
index 8325d047e7..743bc078bb 100644
--- a/sysdeps/aarch64/memcmp.S
+++ b/sysdeps/aarch64/memcmp.S
@@ -75,7 +75,7 @@ L(more16):
 	/* We overlap loads between 0-32 bytes at either side of SRC1 when we
 	   try to align, so limit it only to strings larger than 128 bytes.  */
 	cmp	limit, 96
-	b.ls	L(loop8)
+	b.ls	L(loop16)
 
 	/* Align src1 and adjust src2 with bytes not yet done.  */
 	and	tmp1, src1, 15
diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S
index ecd12061b2..0804e75d99 100644
--- a/sysdeps/aarch64/memcmp.S
+++ b/sysdeps/aarch64/memcmp.S
@@ -34,9 +34,12 @@
 /* Internal variables.  */
 #define data1		x3
 #define data1w		w3
-#define data2		x4
-#define data2w		w4
-#define tmp1		x5
+#define data1h		x4
+#define data2		x5
+#define data2w		w5
+#define data2h		x6
+#define tmp1		x7
+#define tmp2		x8
 
 ENTRY_ALIGN (memcmp, 6)
 	DELOUSE (0)
@@ -46,39 +49,70 @@ ENTRY_ALIGN (memcmp, 6)
 	subs	limit, limit, 8
 	b.lo	L(less8)
 
-	/* Limit >= 8, so check first 8 bytes using unaligned loads.  */
 	ldr	data1, [src1], 8
 	ldr	data2, [src2], 8
-	and	tmp1, src1, 7
-	add	limit, limit, tmp1
+	cmp	data1, data2
+	b.ne	L(return)
+
+	subs	limit, limit, 8
+	b.gt	L(more16)
+
+	ldr	data1, [src1, limit]
+	ldr	data2, [src2, limit]
+	b	L(return)
+
+L(more16):
+	ldr	data1, [src1], 8
+	ldr	data2, [src2], 8
 	cmp	data1, data2
 	bne	L(return)
 
+	/* Jump directly to copying the last 16 bytes for 32 byte (or less)
+	   strings.  */
+	subs	limit, limit, 16
+	b.ls	L(last_bytes)
+
+	/* We overlap loads between 0-32 bytes at either side of SRC1 when we
+	   try to align, so limit it only to strings larger than 128 bytes.  */
+	cmp	limit, 96
+	b.ls	L(loop8)
+
 	/* Align src1 and adjust src2 with bytes not yet done.  */
+	and	tmp1, src1, 15
+	add	limit, limit, tmp1
 	sub	src1, src1, tmp1
 	sub	src2, src2, tmp1
 
-	subs	limit, limit, 8
-	b.ls	L(last_bytes)
-
 	/* Loop performing 8 bytes per iteration using aligned src1.
 	   Limit is pre-decremented by 8 and must be larger than zero.
 	   Exit if <= 8 bytes left to do or if the data is not equal.  */
 	.p2align 4
 L(loop8):
-	ldr	data1, [src1], 8
-	ldr	data2, [src2], 8
-	subs	limit, limit, 8
-	ccmp	data1, data2, 0, hi	/* NZCV = 0b0000.  */
+	ldp	data1, data1h, [src1], 16
+	ldp	data2, data2h, [src2], 16
+	subs	limit, limit, 16
+	ccmp	data1, data2, 0, hi
+	ccmp	data1h, data2h, 0, eq
 	b.eq	L(loop8)
 
+	cmp	data1, data2
+	bne	L(return)
+	mov	data1, data1h
+	mov	data2, data2h
 	cmp	data1, data2
 	bne	L(return)
 
-	/* Compare last 1-8 bytes using unaligned access.  */
+	/* Compare last 1-16 bytes using unaligned access.  */
 L(last_bytes):
-	ldr	data1, [src1, limit]
-	ldr	data2, [src2, limit]
+	add	src1, src1, limit
+	add	src2, src2, limit
+	ldp	data1, data1h, [src1]
+	ldp	data2, data2h, [src2]
+	cmp	data1, data2
+	bne	L(return)
+	mov	data1, data1h
+	mov	data2, data2h
+	cmp	data1, data2
 
 	/* Compare data bytes and set return value to 0, -1 or 1.  */
 L(return):
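As a companion to the diff, here is a rough C illustration of the L(last_bytes) trick: once the wide loop has left at most 16 bytes to check, the final 16 bytes of both buffers are reloaded, deliberately overlapping bytes that were already compared equal, so no byte-at-a-time tail loop is needed. The load64 helper and the byte-swap assumption (little-endian target) are part of the sketch, not glibc code, and the result formation only roughly mirrors the L(return) code, which is not shown in the diff.

```c
#include <stdint.h>
#include <string.h>

/* Hypothetical helper for the sketch: unaligned 8-byte load, as ldr/ldp do.  */
static uint64_t load64 (const unsigned char *p)
{
  uint64_t v;
  memcpy (&v, p, sizeof v);
  return v;
}

/* C illustration of L(last_bytes): requires n >= 16, with every byte before
   the final 16 already known to be equal.  The loads may overlap bytes that
   were already compared, which is harmless because equal bytes cannot
   produce a difference.  */
static int last_bytes_model (const unsigned char *s1, const unsigned char *s2,
                             size_t n)
{
  uint64_t d1  = load64 (s1 + n - 16);
  uint64_t d2  = load64 (s2 + n - 16);
  uint64_t d1h = load64 (s1 + n - 8);
  uint64_t d2h = load64 (s2 + n - 8);

  /* Pick the first pair of words that differs, if any.  */
  uint64_t a = d1 != d2 ? d1 : d1h;
  uint64_t b = d1 != d2 ? d2 : d2h;
  if (a == b)
    return 0;

  /* The first differing byte decides the sign; on a little-endian machine
     that is the lowest differing byte, so byte-swap before the unsigned
     comparison.  */
  a = __builtin_bswap64 (a);
  b = __builtin_bswap64 (b);
  return a < b ? -1 : 1;
}
```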