Message ID | PAWPR08MB898215EB1E88D436D9EF27D283FD9@PAWPR08MB8982.eurprd08.prod.outlook.com
---|---
State | New |
Series | AArch64: Improve strrchr
The 01/12/2023 16:05, Wilco Dijkstra wrote:
> Use shrn for narrowing the mask which simplifies code and speeds up small
> strings. Unroll the first search loop to improve performance on large strings.
> Passes regress.

please commit it, thanks.

Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
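The change the commit message describes replaces the addp-based mask reduction with a single repeated 0x33 mask plus a shrn #4, which compresses the 128-bit comparison result into a 64-bit syndrome carrying four bits per byte. The following C sketch models that step with Advanced SIMD intrinsics; it is an illustration only, not code from the patch: the helper name make_syndrome is invented here, and little-endian byte order is assumed (the patch handles big-endian separately with bif/rbit).

#include <arm_neon.h>
#include <stdint.h>

/* Illustrative model of the patch's syndrome construction (little-endian).
   The helper name is invented for this sketch.  */
static inline uint64_t
make_syndrome (uint8x16_t data, uint8x16_t repchr)
{
  uint8x16_t repmask = vdupq_n_u8 (0x33);              /* movi vrepmask.16b, 0x33 */
  uint8x16_t has_nul = vceqq_u8 (data, vdupq_n_u8 (0)); /* cmeq vhas_nul, vdata, 0 */
  uint8x16_t has_chr = vceqq_u8 (data, repchr);         /* cmeq vhas_chr, vdata, vrepchr */
  /* bit: where the 0x33 mask is set take the char-match bits, elsewhere keep
     the NUL-match bits, so every byte carries both results.  */
  uint8x16_t merged = vbslq_u8 (repmask, has_chr, has_nul);
  /* shrn #4: shift each 16-bit lane right by four and narrow to a byte,
     squeezing the 128-bit mask into a 64-bit syndrome with four bits per
     byte: bits 0-1 flag a character match, bits 2-3 flag the NUL.  */
  uint8x8_t synd = vshrn_n_u16 (vreinterpretq_u16_u8 (merged), 4);
  return vget_lane_u64 (vreinterpret_u64_u8 (synd), 0);
}

With this encoding, synd & 0xcccccccccccccccc isolates the NUL flags and synd & 0x3333333333333333 the character flags, which is exactly what the ands instructions in the patch test.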
diff --git a/sysdeps/aarch64/strrchr.S b/sysdeps/aarch64/strrchr.S
index b4b8b44aadf75d900589e25ebbc49640ef43f0bf..528034a5183b4479634dfb346426102d6aa4b64a 100644
--- a/sysdeps/aarch64/strrchr.S
+++ b/sysdeps/aarch64/strrchr.S
@@ -22,19 +22,16 @@
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64
- * Neon Available.
+ * ARMv8-a, AArch64, Advanced SIMD.
  * MTE compatible.
  */
 
-/* Arguments and results.  */
 #define srcin		x0
 #define chrin		w1
 #define result		x0
 
 #define src		x2
 #define tmp		x3
-#define wtmp		w3
 #define synd		x3
 #define shift		x4
 #define src_match	x4
@@ -46,7 +43,6 @@
 #define vhas_nul	v2
 #define vhas_chr	v3
 #define vrepmask	v4
-#define vrepmask2	v5
 #define vend		v5
 #define dend		d5
 
@@ -58,59 +54,71 @@
    the relevant byte matched the requested character; bits 2-3 are set
    if the relevant byte matched the NUL end of string.  */
 
-ENTRY(strrchr)
+ENTRY (strrchr)
 	PTR_ARG (0)
 	bic	src, srcin, 15
 	dup	vrepchr.16b, chrin
-	mov	wtmp, 0x3003
-	dup	vrepmask.8h, wtmp
-	tst	srcin, 15
-	beq	L(loop1)
-
-	ld1	{vdata.16b}, [src], 16
+	movi	vrepmask.16b, 0x33
+	ld1	{vdata.16b}, [src]
 	cmeq	vhas_nul.16b, vdata.16b, 0
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
-	mov	wtmp, 0xf00f
-	dup	vrepmask2.8h, wtmp
 	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	shrn	vend.8b, vhas_nul.8h, 4
 	lsl	shift, srcin, 2
 	fmov	synd, dend
 	lsr	synd, synd, shift
 	lsl	synd, synd, shift
 	ands	nul_match, synd, 0xcccccccccccccccc
 	bne	L(tail)
-	cbnz	synd, L(loop2)
+	cbnz	synd, L(loop2_start)
 
-	.p2align 5
+	.p2align 4
 L(loop1):
-	ld1	{vdata.16b}, [src], 16
+	ldr	q1, [src, 16]
+	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
+	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
+	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	fmov	synd, dend
+	cbnz	synd, L(loop1_end)
+	ldr	q1, [src, 32]!
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
 	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
 	fmov	synd, dend
 	cbz	synd, L(loop1)
-
+	sub	src, src, 16
+L(loop1_end):
+	add	src, src, 16
 	cmeq	vhas_nul.16b, vdata.16b, 0
+#ifdef __AARCH64EB__
+	bif	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
+	shrn	vend.8b, vhas_nul.8h, 4
+	fmov	synd, dend
+	rbit	synd, synd
+#else
 	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
-	bic	vhas_nul.8h, 0x0f, lsl 8
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	shrn	vend.8b, vhas_nul.8h, 4
 	fmov	synd, dend
+#endif
 	ands	nul_match, synd, 0xcccccccccccccccc
-	beq	L(loop2)
-
+	beq	L(loop2_start)
 L(tail):
 	sub	nul_match, nul_match, 1
 	and	chr_match, synd, 0x3333333333333333
 	ands	chr_match, chr_match, nul_match
-	sub	result, src, 1
+	add	result, src, 15
 	clz	tmp, chr_match
 	sub	result, result, tmp, lsr 2
 	csel	result, result, xzr, ne
 	ret
 
 	.p2align 4
+	nop
+	nop
+L(loop2_start):
+	add	src, src, 16
+	bic	vrepmask.8h, 0xf0
+
 L(loop2):
 	cmp	synd, 0
 	csel	src_match, src, src_match, ne
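For the L(tail) sequence in the patch, a hedged scalar model of how the final pointer is derived from the syndrome may help. The function name strrchr_tail and its signature are invented for illustration, and it assumes the 16-byte chunk at src contains the terminating NUL, as it does on the L(tail) path.

#include <stddef.h>
#include <stdint.h>

/* Illustrative scalar model of L(tail): given the 4-bits-per-byte syndrome
   for the 16 bytes at src, return the last match no later than the NUL,
   or NULL.  Assumes the chunk contains the terminator (name invented).  */
static const char *
strrchr_tail (const char *src, uint64_t synd)
{
  uint64_t nul_match = synd & 0xccccccccccccccccULL;  /* NUL flags, bits 2-3  */
  uint64_t chr_match = synd & 0x3333333333333333ULL;  /* char flags, bits 0-1 */
  /* nul_match - 1 sets every bit below the first NUL flag, so this keeps
     only character matches that occur no later than the terminator.  */
  chr_match &= nul_match - 1;
  if (chr_match == 0)
    return NULL;
  /* Each byte owns four syndrome bits; the highest surviving bit marks the
     last match, hence clz / 4, counted back from the end of the chunk.  */
  return src + 15 - (__builtin_clzll (chr_match) >> 2);
}

Because the character flags sit in bits 0-1 and the NUL flags in bits 2-3 of each 4-bit group, nul_match - 1 still keeps a match in the NUL byte itself, so searching for '\0' yields a pointer to the terminator, as strrchr requires.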