Message ID | PAWPR08MB89822DA55C9346018CE24E2083FD9@PAWPR08MB8982.eurprd08.prod.outlook.com |
---|---|
State | New |
Series | AArch64: Optimize strchr |
The 01/12/2023 15:58, Wilco Dijkstra wrote:
> Simplify calculation of the mask using shrn. Unroll the main loop.
> Small strings are 20% faster on modern CPUs. Passes regress.

please commit it, thanks.

Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
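For readers unfamiliar with the trick the commit message refers to, here is a minimal C sketch of the syndrome computation using NEON intrinsics. It mirrors the little-endian (`#else`) path of the patch; the helper name `syndrome` is hypothetical and not part of the patch.

```c
#include <arm_neon.h>
#include <stdint.h>

/* Hypothetical helper mirroring the patched first-chunk code: builds a
   64-bit syndrome with one nibble per input byte, 0x3 for a character
   match and 0xC for a NUL.  */
static uint64_t
syndrome (uint8x16_t data, uint8x16_t repchr)
{
  uint8x16_t has_chr = vceqq_u8 (data, repchr);          /* cmeq */
  uint8x16_t has_nul = vceqq_u8 (data, vdupq_n_u8 (0));  /* cmeq */
  /* bit vhas_nul, vhas_chr, vrepmask: take bits 0-1 and 4-5 of each
     byte from the character match, bits 2-3 and 6-7 from the NUL
     match (mask 0x33 selects the has_chr bits).  */
  uint8x16_t merged = vbslq_u8 (vdupq_n_u8 (0x33), has_chr, has_nul);
  /* shrn: shift each 16-bit lane right by 4 and narrow to 8 bits,
     keeping one nibble per original byte (128 -> 64 bits).  */
  uint8x8_t nibbles = vshrn_n_u16 (vreinterpretq_u16_u8 (merged), 4);
  return vget_lane_u64 (vreinterpret_u64_u8 (nibbles), 0);
}
```

The single shrn replaces the previous addp plus and sequence, which is also why the patch can drop the second mask register (vrepmask2) and its wtmp2 setup.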
```diff
diff --git a/sysdeps/aarch64/strchr.S b/sysdeps/aarch64/strchr.S
index 900ef15944c2b8a82943cc0fbdaf0b40907c40e1..14ae1513a7330a62cf5985d06e1fb6a8bab78d63 100644
--- a/sysdeps/aarch64/strchr.S
+++ b/sysdeps/aarch64/strchr.S
@@ -32,8 +32,7 @@
 
 #define src		x2
 #define tmp1		x1
-#define wtmp2		w3
-#define tmp3		x3
+#define tmp2		x3
 
 #define vrepchr		v0
 #define vdata		v1
@@ -41,39 +40,30 @@
 #define vhas_nul	v2
 #define vhas_chr	v3
 #define vrepmask	v4
-#define vrepmask2	v5
-#define vend		v6
-#define dend		d6
+#define vend		v5
+#define dend		d5
 
 /* Core algorithm.
 
    For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-1 are set if the relevant byte matched the
-   requested character, bits 2-3 are set if the byte is NUL (or matched), and
-   bits 4-7 are not used and must be zero if none of bits 0-3 are set). Odd
-   bytes set bits 4-7 so that adjacent bytes can be merged. Since the bits
-   in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   per byte. Bits 0-1 are set if the relevant byte matched the requested
+   character, bits 2-3 are set if the byte is NUL or matched. Count trailing
+   zeroes gives the position of the matching byte if it is a multiple of 4.
+   If it is not a multiple of 4, there was no match.  */
 
 ENTRY (strchr)
 	PTR_ARG (0)
 	bic	src, srcin, 15
 	dup	vrepchr.16b, chrin
 	ld1	{vdata.16b}, [src]
-	mov	wtmp2, 0x3003
-	dup	vrepmask.8h, wtmp2
+	movi	vrepmask.16b, 0x33
 	cmeq	vhas_nul.16b, vdata.16b, 0
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
-	mov	wtmp2, 0xf00f
-	dup	vrepmask2.8h, wtmp2
-
 	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
-	lsl	tmp3, srcin, 2
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
-
+	lsl	tmp2, srcin, 2
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	tmp1, dend
-	lsr	tmp1, tmp1, tmp3
+	lsr	tmp1, tmp1, tmp2
 	cbz	tmp1, L(loop)
 
 	rbit	tmp1, tmp1
@@ -87,28 +77,34 @@ ENTRY (strchr)
 
 	.p2align 4
 L(loop):
-	ldr	qdata, [src, 16]!
+	ldr	qdata, [src, 16]
+	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
+	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
+	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
+	fmov	tmp1, dend
+	cbnz	tmp1, L(end)
+	ldr	qdata, [src, 32]!
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	cmhs	vhas_nul.16b, vhas_chr.16b, vdata.16b
 	umaxp	vend.16b, vhas_nul.16b, vhas_nul.16b
 	fmov	tmp1, dend
 	cbz	tmp1, L(loop)
+	sub	src, src, 16
+L(end):
 
 #ifdef __AARCH64EB__
 	bif	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	tmp1, dend
 #else
 	bit	vhas_nul.16b, vhas_chr.16b, vrepmask.16b
-	and	vhas_nul.16b, vhas_nul.16b, vrepmask2.16b
-	addp	vend.16b, vhas_nul.16b, vhas_nul.16b		/* 128->64 */
+	shrn	vend.8b, vhas_nul.8h, 4		/* 128->64 */
 	fmov	tmp1, dend
 	rbit	tmp1, tmp1
 #endif
+	add	src, src, 16
 	clz	tmp1, tmp1
-	/* Tmp1 is an even multiple of 2 if the target character was
-	   found first.  Otherwise we've found the end of string.  */
+	/* Tmp1 is a multiple of 4 if the target character was found.  */
 	tst	tmp1, 2
 	add	result, src, tmp1, lsr 2
 	csel	result, result, xzr, eq
```
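To see how the syndrome is consumed, here is a hedged sketch of the surrounding loop, reusing the `syndrome` helper above. It is illustrative only: `strchr_sketch` is a hypothetical name, it models a little-endian target, it omits the patch's two-chunk unrolling and the cheaper in-loop cmhs/umaxp presence test, and it assumes 16-byte-aligned loads so reads cannot cross into an unmapped page (the real code guarantees this with `bic src, srcin, 15` plus the lsl/lsr fixup for the first chunk).

```c
#include <arm_neon.h>
#include <stdint.h>
#include <stddef.h>

/* Uses the syndrome() helper from the previous sketch.  */
char *
strchr_sketch (const char *s, int c)
{
  uint8x16_t repchr = vdupq_n_u8 ((uint8_t) c);
  for (const char *p = s; ; p += 16)
    {
      uint64_t syn = syndrome (vld1q_u8 ((const uint8_t *) p), repchr);
      if (syn != 0)
	{
	  /* __builtin_ctzll plays the role of the rbit + clz pair.  */
	  unsigned int pos = __builtin_ctzll (syn);
	  /* A match nibble is 0x3, so pos is a multiple of 4; a NUL
	     nibble is 0xC, so pos has bit 1 set.  This is the
	     tst tmp1, 2 / csel pair in the assembly.  */
	  return (pos & 2) ? NULL : (char *) p + (pos >> 2);
	}
    }
}
```

Note that when c is 0 both comparisons hit the terminator, the nibble becomes 0xF, and the sketch correctly returns a pointer to the NUL, matching strchr semantics.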