Message ID | 20180306134724.28216-1-siddhesh@sourceware.org |
---|---|
State | New |
Series | aarch64: Improve strncmp for mutually misaligned inputs |
Ping!

On Tuesday 06 March 2018 07:17 PM, Siddhesh Poyarekar wrote:
> The mutually misaligned inputs on aarch64 are compared with a simple
> byte copy, which is not very efficient. Enhance the comparison
> similar to strcmp by loading a double-word at a time. The peak
> performance improvement (i.e. 4k maxlen comparisons) due to this on
> the strncmp microbenchmark is as follows:
>
> falkor: 3.5x (up to 72% time reduction)
> cortex-a73: 3.5x (up to 71% time reduction)
> cortex-a53: 3.5x (up to 71% time reduction)
>
> All mutually misaligned inputs from 16 bytes maxlen onwards show
> upwards of 15% improvement and there is no measurable effect on the
> performance of aligned/mutually aligned inputs.
>
>         * sysdeps/aarch64/strncmp.S (count): New macro.
>         (strncmp): Store misaligned length in SRC1 in COUNT.
>         (mutual_align): Adjust.
>         (misaligned8): Load dword at a time when it is safe.
> ---
>  sysdeps/aarch64/strncmp.S | 95 +++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 80 insertions(+), 15 deletions(-)
>
> diff --git a/sysdeps/aarch64/strncmp.S b/sysdeps/aarch64/strncmp.S
> index a08d2c0aa5..20c7ec8dad 100644
> --- a/sysdeps/aarch64/strncmp.S
> +++ b/sysdeps/aarch64/strncmp.S
> @@ -49,6 +49,7 @@
>  #define limit_wd       x13
>  #define mask           x14
>  #define endloop        x15
> +#define count          mask
>
>  ENTRY_ALIGN_AND_PAD (strncmp, 6, 7)
>         DELOUSE (0)
> @@ -58,9 +59,9 @@ ENTRY_ALIGN_AND_PAD (strncmp, 6, 7)
>         eor     tmp1, src1, src2
>         mov     zeroones, #REP8_01
>         tst     tmp1, #7
> +       and     count, src1, #7
>         b.ne    L(misaligned8)
> -       ands    tmp1, src1, #7
> -       b.ne    L(mutual_align)
> +       cbnz    count, L(mutual_align)
>         /* Calculate the number of full and partial words -1. */
>         sub     limit_wd, limit, #1     /* limit != 0, so no underflow. */
>         lsr     limit_wd, limit_wd, #3  /* Convert to Dwords. */
> @@ -165,43 +166,107 @@ L(mutual_align):
>         bic     src1, src1, #7
>         bic     src2, src2, #7
>         ldr     data1, [src1], #8
> -       neg     tmp3, tmp1, lsl #3      /* 64 - bits(bytes beyond align). */
> +       neg     tmp3, count, lsl #3     /* 64 - bits(bytes beyond align). */
>         ldr     data2, [src2], #8
>         mov     tmp2, #~0
>         sub     limit_wd, limit, #1     /* limit != 0, so no underflow. */
>  #ifdef __AARCH64EB__
>         /* Big-endian. Early bytes are at MSB. */
> -       lsl     tmp2, tmp2, tmp3        /* Shift (tmp1 & 63). */
> +       lsl     tmp2, tmp2, tmp3        /* Shift (count & 63). */
>  #else
>         /* Little-endian. Early bytes are at LSB. */
> -       lsr     tmp2, tmp2, tmp3        /* Shift (tmp1 & 63). */
> +       lsr     tmp2, tmp2, tmp3        /* Shift (count & 63). */
>  #endif
>         and     tmp3, limit_wd, #7
>         lsr     limit_wd, limit_wd, #3
>         /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */
> -       add     limit, limit, tmp1
> -       add     tmp3, tmp3, tmp1
> +       add     limit, limit, count
> +       add     tmp3, tmp3, count
>         orr     data1, data1, tmp2
>         orr     data2, data2, tmp2
>         add     limit_wd, limit_wd, tmp3, lsr #3
>         b       L(start_realigned)
>
> -L(ret0):
> -       mov     result, #0
> -       RET
> -
>         .p2align 6
> +       /* Don't bother with dwords for up to 16 bytes. */
>  L(misaligned8):
> -       sub     limit, limit, #1
> -1:
> +       cmp     limit, #16
> +       b.hs    L(try_misaligned_words)
> +
> +L(byte_loop):
>         /* Perhaps we can do better than this. */
>         ldrb    data1w, [src1], #1
>         ldrb    data2w, [src2], #1
>         subs    limit, limit, #1
> -       ccmp    data1w, #1, #0, cs      /* NZCV = 0b0000. */
> +       ccmp    data1w, #1, #0, hi      /* NZCV = 0b0000. */
>         ccmp    data1w, data2w, #0, cs  /* NZCV = 0b0000. */
> -       b.eq    1b
> +       b.eq    L(byte_loop)
> +L(done):
>         sub     result, data1, data2
>         RET
> +
> +       /* Align the SRC1 to a dword by doing a bytewise compare and then do
> +          the dword loop. */
> +L(try_misaligned_words):
> +       mov     limit_wd, limit, lsr #3
> +       cbz     count, L(do_misaligned)
> +
> +       neg     count, count
> +       and     count, count, #7
> +       sub     limit, limit, count
> +       mov     limit_wd, limit, lsr #3
> +
> +L(page_end_loop):
> +       ldrb    data1w, [src1], #1
> +       ldrb    data2w, [src2], #1
> +       cmp     data1w, #1
> +       ccmp    data1w, data2w, #0, cs  /* NZCV = 0b0000. */
> +       b.ne    L(done)
> +       subs    count, count, #1
> +       b.hi    L(page_end_loop)
> +
> +L(do_misaligned):
> +       /* Prepare ourselves for the next page crossing. Unlike the aligned
> +          loop, we fetch 1 less dword because we risk crossing bounds on
> +          SRC2. */
> +       mov     count, #8
> +       subs    limit_wd, limit_wd, #1
> +       b.lo    L(done_loop)
> +L(loop_misaligned):
> +       and     tmp2, src2, #0xff8
> +       eor     tmp2, tmp2, #0xff8
> +       cbz     tmp2, L(page_end_loop)
> +
> +       ldr     data1, [src1], #8
> +       ldr     data2, [src2], #8
> +       sub     tmp1, data1, zeroones
> +       orr     tmp2, data1, #REP8_7f
> +       eor     diff, data1, data2      /* Non-zero if differences found. */
> +       bics    has_nul, tmp1, tmp2     /* Non-zero if NUL terminator. */
> +       ccmp    diff, #0, #0, eq
> +       b.ne    L(not_limit)
> +       subs    limit_wd, limit_wd, #1
> +       b.pl    L(loop_misaligned)
> +
> +L(done_loop):
> +       /* We found a difference or a NULL before the limit was reached. */
> +       and     limit, limit, #7
> +       cbz     limit, L(not_limit)
> +       /* Read the last word. */
> +       sub     src1, src1, 8
> +       sub     src2, src2, 8
> +       ldr     data1, [src1, limit]
> +       ldr     data2, [src2, limit]
> +       sub     tmp1, data1, zeroones
> +       orr     tmp2, data1, #REP8_7f
> +       eor     diff, data1, data2      /* Non-zero if differences found. */
> +       bics    has_nul, tmp1, tmp2     /* Non-zero if NUL terminator. */
> +       ccmp    diff, #0, #0, eq
> +       b.ne    L(not_limit)
> +
> +L(ret0):
> +       mov     result, #0
> +       RET
> +
>  END (strncmp)
>  libc_hidden_builtin_def (strncmp)
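The core of the new L(loop_misaligned) path is the classic word-at-a-time NUL test built from zeroones (REP8_01) and REP8_7f: a byte of a word X is zero exactly when (X - 0x0101..01) & ~X & 0x8080..80 is non-zero in that byte. As a minimal standalone sketch (not part of the patch; the function name and C-callable framing are only for illustration), the same test can be written as:

        /* Sketch only: returns 1 iff the 64-bit value in x0 contains a zero
           byte -- the same sub/orr/bics test the patch applies to data1.
           Callable from C as: uint64_t has_zero_byte (uint64_t x);  */
                .text
                .global has_zero_byte
                .type   has_zero_byte, %function
        has_zero_byte:
                mov     x1, #0x0101010101010101      /* REP8_01, aka zeroones */
                sub     x2, x0, x1                   /* borrows out of any zero byte */
                orr     x3, x0, #0x7f7f7f7f7f7f7f7f  /* x | REP8_7f */
                bics    x0, x2, x3                   /* (x - 01..01) & ~(x | 7f..7f) */
                cset    x0, ne                       /* 1 if some byte of x was zero */
                ret
                .size   has_zero_byte, . - has_zero_byte

In the patch itself the ccmp that follows folds this NUL check together with the eor-based difference test, so a single b.ne exits the dword loop on either a NUL byte in data1 or a mismatch between data1 and data2.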
On 13/03/18 09:03, Siddhesh Poyarekar wrote:
> Ping!
>
> On Tuesday 06 March 2018 07:17 PM, Siddhesh Poyarekar wrote:
>> The mutually misaligned inputs on aarch64 are compared with a simple
>> byte copy, which is not very efficient. Enhance the comparison
>> similar to strcmp by loading a double-word at a time. The peak
>> performance improvement (i.e. 4k maxlen comparisons) due to this on
>> the strncmp microbenchmark is as follows:
>>
>> falkor: 3.5x (up to 72% time reduction)
>> cortex-a73: 3.5x (up to 71% time reduction)
>> cortex-a53: 3.5x (up to 71% time reduction)
>>
>> All mutually misaligned inputs from 16 bytes maxlen onwards show
>> upwards of 15% improvement and there is no measurable effect on the
>> performance of aligned/mutually aligned inputs.
>>
>>         * sysdeps/aarch64/strncmp.S (count): New macro.
>>         (strncmp): Store misaligned length in SRC1 in COUNT.
>>         (mutual_align): Adjust.
>>         (misaligned8): Load dword at a time when it is safe.

OK to commit.

(it would be nice to have the equivalent change in newlib too..)
On Tuesday 13 March 2018 06:42 PM, Szabolcs Nagy wrote:
> OK to commit.
>
> (it would be nice to have the equivalent change in newlib too..)

Yes, it's in my TODO list for later this week.

Siddhesh
On 13/03/18 13:12, Szabolcs Nagy wrote:
> On 13/03/18 09:03, Siddhesh Poyarekar wrote:
>> Ping!
>>
>> On Tuesday 06 March 2018 07:17 PM, Siddhesh Poyarekar wrote:
>>> The mutually misaligned inputs on aarch64 are compared with a simple
>>> byte copy, which is not very efficient. Enhance the comparison
>>> similar to strcmp by loading a double-word at a time. The peak
>>> performance improvement (i.e. 4k maxlen comparisons) due to this on
>>> the strncmp microbenchmark is as follows:
>>>
>>> falkor: 3.5x (up to 72% time reduction)
>>> cortex-a73: 3.5x (up to 71% time reduction)
>>> cortex-a53: 3.5x (up to 71% time reduction)
>>>
>>> All mutually misaligned inputs from 16 bytes maxlen onwards show
>>> upwards of 15% improvement and there is no measurable effect on the
>>> performance of aligned/mutually aligned inputs.
>>>
>>>         * sysdeps/aarch64/strncmp.S (count): New macro.
>>>         (strncmp): Store misaligned length in SRC1 in COUNT.
>>>         (mutual_align): Adjust.
>>>         (misaligned8): Load dword at a time when it is safe.
>
> OK to commit.
>
> (it would be nice to have the equivalent change in newlib too..)

this broke the build for me

../sysdeps/aarch64/strncmp.S: Assembler messages:
../sysdeps/aarch64/strncmp.S:211: Error: unexpected characters following instruction at operand 2 -- `mov x13,x2,lsr#3'
../sysdeps/aarch64/strncmp.S:217: Error: unexpected characters following instruction at operand 2 -- `mov x13,x2,lsr#3'

old binutils 2.26 and before did not support mov with shifted
register (only orr reg,xzr,reg,shift).

but i think a shift instruction (lsr) should be better anyway
(on most implementations).

can you please fix this?
On Wednesday 14 March 2018 06:05 PM, Szabolcs Nagy wrote:
> this broke the build for me
>
> ../sysdeps/aarch64/strncmp.S: Assembler messages:
> ../sysdeps/aarch64/strncmp.S:211: Error: unexpected characters following
> instruction at operand 2 -- `mov x13,x2,lsr#3'
> ../sysdeps/aarch64/strncmp.S:217: Error: unexpected characters following
> instruction at operand 2 -- `mov x13,x2,lsr#3'
>
> old binutils 2.26 and before did not support mov with shifted
> register (only orr reg,xzr,reg,shift).
>
> but i think a shift instruction (lsr) should be better anyway
> (on most implementations).
>
> can you please fix this?

Thanks for pointing that out; I've pushed this patch[1] after testing it
on a xenial box, which showed this error. That was my old builder; I
recently moved to a new one based on bionic, which is why I didn't see
this problem earlier.

Siddhesh

[1] https://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=d46f84de745db8f3f06a37048261f4e5ceacf0a3
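For reference, the form the old assembler rejects is the mov-with-shifted-register alias; the same computation (limit_wd = limit >> 3, i.e. a byte count converted to a dword count) can be spelled portably with a plain shift, or with the orr form Szabolcs mentions. A tiny assemblable sketch (the function name is only for illustration; see the linked commit for the actual fix):

        /* Sketch only: portable spellings of what the rejected
           "mov x13, x2, lsr #3" computes, namely limit_wd = limit >> 3.
           Callable from C as: uint64_t bytes_to_dwords (uint64_t limit);  */
                .text
                .global bytes_to_dwords
                .type   bytes_to_dwords, %function
        bytes_to_dwords:
                lsr     x0, x0, #3               /* plain shift: accepted by old and new gas */
                /* equivalently: orr x0, xzr, x0, lsr #3 */
                ret
                .size   bytes_to_dwords, . - bytes_to_dwords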
diff --git a/sysdeps/aarch64/strncmp.S b/sysdeps/aarch64/strncmp.S
index a08d2c0aa5..20c7ec8dad 100644
--- a/sysdeps/aarch64/strncmp.S
+++ b/sysdeps/aarch64/strncmp.S
@@ -49,6 +49,7 @@
 #define limit_wd       x13
 #define mask           x14
 #define endloop        x15
+#define count          mask

 ENTRY_ALIGN_AND_PAD (strncmp, 6, 7)
        DELOUSE (0)
@@ -58,9 +59,9 @@ ENTRY_ALIGN_AND_PAD (strncmp, 6, 7)
        eor     tmp1, src1, src2
        mov     zeroones, #REP8_01
        tst     tmp1, #7
+       and     count, src1, #7
        b.ne    L(misaligned8)
-       ands    tmp1, src1, #7
-       b.ne    L(mutual_align)
+       cbnz    count, L(mutual_align)
        /* Calculate the number of full and partial words -1. */
        sub     limit_wd, limit, #1     /* limit != 0, so no underflow. */
        lsr     limit_wd, limit_wd, #3  /* Convert to Dwords. */
@@ -165,43 +166,107 @@ L(mutual_align):
        bic     src1, src1, #7
        bic     src2, src2, #7
        ldr     data1, [src1], #8
-       neg     tmp3, tmp1, lsl #3      /* 64 - bits(bytes beyond align). */
+       neg     tmp3, count, lsl #3     /* 64 - bits(bytes beyond align). */
        ldr     data2, [src2], #8
        mov     tmp2, #~0
        sub     limit_wd, limit, #1     /* limit != 0, so no underflow. */
 #ifdef __AARCH64EB__
        /* Big-endian. Early bytes are at MSB. */
-       lsl     tmp2, tmp2, tmp3        /* Shift (tmp1 & 63). */
+       lsl     tmp2, tmp2, tmp3        /* Shift (count & 63). */
 #else
        /* Little-endian. Early bytes are at LSB. */
-       lsr     tmp2, tmp2, tmp3        /* Shift (tmp1 & 63). */
+       lsr     tmp2, tmp2, tmp3        /* Shift (count & 63). */
 #endif
        and     tmp3, limit_wd, #7
        lsr     limit_wd, limit_wd, #3
        /* Adjust the limit. Only low 3 bits used, so overflow irrelevant. */
-       add     limit, limit, tmp1
-       add     tmp3, tmp3, tmp1
+       add     limit, limit, count
+       add     tmp3, tmp3, count
        orr     data1, data1, tmp2
        orr     data2, data2, tmp2
        add     limit_wd, limit_wd, tmp3, lsr #3
        b       L(start_realigned)

-L(ret0):
-       mov     result, #0
-       RET
-
        .p2align 6
+       /* Don't bother with dwords for up to 16 bytes. */
 L(misaligned8):
-       sub     limit, limit, #1
-1:
+       cmp     limit, #16
+       b.hs    L(try_misaligned_words)
+
+L(byte_loop):
        /* Perhaps we can do better than this. */
        ldrb    data1w, [src1], #1
        ldrb    data2w, [src2], #1
        subs    limit, limit, #1
-       ccmp    data1w, #1, #0, cs      /* NZCV = 0b0000. */
+       ccmp    data1w, #1, #0, hi      /* NZCV = 0b0000. */
        ccmp    data1w, data2w, #0, cs  /* NZCV = 0b0000. */
-       b.eq    1b
+       b.eq    L(byte_loop)
+L(done):
        sub     result, data1, data2
        RET
+
+       /* Align the SRC1 to a dword by doing a bytewise compare and then do
+          the dword loop. */
+L(try_misaligned_words):
+       mov     limit_wd, limit, lsr #3
+       cbz     count, L(do_misaligned)
+
+       neg     count, count
+       and     count, count, #7
+       sub     limit, limit, count
+       mov     limit_wd, limit, lsr #3
+
+L(page_end_loop):
+       ldrb    data1w, [src1], #1
+       ldrb    data2w, [src2], #1
+       cmp     data1w, #1
+       ccmp    data1w, data2w, #0, cs  /* NZCV = 0b0000. */
+       b.ne    L(done)
+       subs    count, count, #1
+       b.hi    L(page_end_loop)
+
+L(do_misaligned):
+       /* Prepare ourselves for the next page crossing. Unlike the aligned
+          loop, we fetch 1 less dword because we risk crossing bounds on
+          SRC2. */
+       mov     count, #8
+       subs    limit_wd, limit_wd, #1
+       b.lo    L(done_loop)
+L(loop_misaligned):
+       and     tmp2, src2, #0xff8
+       eor     tmp2, tmp2, #0xff8
+       cbz     tmp2, L(page_end_loop)
+
+       ldr     data1, [src1], #8
+       ldr     data2, [src2], #8
+       sub     tmp1, data1, zeroones
+       orr     tmp2, data1, #REP8_7f
+       eor     diff, data1, data2      /* Non-zero if differences found. */
+       bics    has_nul, tmp1, tmp2     /* Non-zero if NUL terminator. */
+       ccmp    diff, #0, #0, eq
+       b.ne    L(not_limit)
+       subs    limit_wd, limit_wd, #1
+       b.pl    L(loop_misaligned)
+
+L(done_loop):
+       /* We found a difference or a NULL before the limit was reached. */
+       and     limit, limit, #7
+       cbz     limit, L(not_limit)
+       /* Read the last word. */
+       sub     src1, src1, 8
+       sub     src2, src2, 8
+       ldr     data1, [src1, limit]
+       ldr     data2, [src2, limit]
+       sub     tmp1, data1, zeroones
+       orr     tmp2, data1, #REP8_7f
+       eor     diff, data1, data2      /* Non-zero if differences found. */
+       bics    has_nul, tmp1, tmp2     /* Non-zero if NUL terminator. */
+       ccmp    diff, #0, #0, eq
+       b.ne    L(not_limit)
+
+L(ret0):
+       mov     result, #0
+       RET
+
 END (strncmp)
 libc_hidden_builtin_def (strncmp)
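A note on the page-crossing guard in L(loop_misaligned) above: SRC1 has been aligned to a dword by the byte loop, but SRC2 has not, so an 8-byte load from SRC2 could run onto an unmapped page. The and/eor against #0xff8 fires when SRC2 sits in the last 8 bytes of a 4 KiB page, in which case the code falls back to L(page_end_loop) for the next 8 bytes. A standalone sketch of just that test (function name and C-callable framing are only for illustration):

        /* Sketch only: returns 1 when the pointer in x0 lies in the last
           8 bytes of a 4 KiB page, i.e. when the next dword load from it
           could cross into the following page -- the same and/eor #0xff8
           test used at the top of L(loop_misaligned).
           Callable from C as: int near_page_end (const void *p);  */
                .text
                .global near_page_end
                .type   near_page_end, %function
        near_page_end:
                and     x1, x0, #0xff8  /* bits [11:3]: dword slot within the page */
                eor     x1, x1, #0xff8  /* zero iff it is the page's last slot */
                cmp     x1, #0
                cset    w0, eq
                ret
                .size   near_page_end, . - near_page_end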