Message ID | PAWPR08MB8982A380E327146CB6070B9783FD9@PAWPR08MB8982.eurprd08.prod.outlook.com |
---|---|
State | New |
Headers | show |
Series | AArch64: Improve strchrnul | expand |
The 01/12/2023 16:00, Wilco Dijkstra wrote:
> Unroll the main loop, which improves performance slightly.
> Passes regress.

please commit it, thanks.

Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>

>
> ---
>
> diff --git a/sysdeps/aarch64/strchrnul.S b/sysdeps/aarch64/strchrnul.S
> index 4ca1e58c36ac903fad83c4470ed7f4abd6c74e27..aa8c9a4363051b4098c2f60ee830bab7326a54a7 100644
> --- a/sysdeps/aarch64/strchrnul.S
> +++ b/sysdeps/aarch64/strchrnul.S
> @@ -70,14 +70,22 @@ ENTRY (__strchrnul)
>
>  	.p2align 4
>  L(loop):
> -	ldr	qdata, [src, 16]!
> +	ldr	qdata, [src, 16]
> +	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
> +	cmhs	vhas_chr.16b, vhas_chr.16b, vdata.16b
> +	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b
> +	fmov	tmp1, dend
> +	cbnz	tmp1, L(end)
> +	ldr	qdata, [src, 32]!
>  	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
>  	cmhs	vhas_chr.16b, vhas_chr.16b, vdata.16b
>  	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b
>  	fmov	tmp1, dend
>  	cbz	tmp1, L(loop)
> -
> +	sub	src, src, 16
> +L(end):
>  	shrn	vend.8b, vhas_chr.8h, 4	/* 128->64 */
> +	add	src, src, 16
>  	fmov	tmp1, dend
>  #ifndef __AARCH64EB__
>  	rbit	tmp1, tmp1
diff --git a/sysdeps/aarch64/strchrnul.S b/sysdeps/aarch64/strchrnul.S
index 4ca1e58c36ac903fad83c4470ed7f4abd6c74e27..aa8c9a4363051b4098c2f60ee830bab7326a54a7 100644
--- a/sysdeps/aarch64/strchrnul.S
+++ b/sysdeps/aarch64/strchrnul.S
@@ -70,14 +70,22 @@ ENTRY (__strchrnul)
 
 	.p2align 4
 L(loop):
-	ldr	qdata, [src, 16]!
+	ldr	qdata, [src, 16]
+	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
+	cmhs	vhas_chr.16b, vhas_chr.16b, vdata.16b
+	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b
+	fmov	tmp1, dend
+	cbnz	tmp1, L(end)
+	ldr	qdata, [src, 32]!
 	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
 	cmhs	vhas_chr.16b, vhas_chr.16b, vdata.16b
 	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b
 	fmov	tmp1, dend
 	cbz	tmp1, L(loop)
-
+	sub	src, src, 16
+L(end):
 	shrn	vend.8b, vhas_chr.8h, 4	/* 128->64 */
+	add	src, src, 16
 	fmov	tmp1, dend
 #ifndef __AARCH64EB__
 	rbit	tmp1, tmp1