| Message ID | 1527221256-17029-2-git-send-email-wei.guo.simon@gmail.com (mailing list archive) |
|---|---|
| State | Superseded, archived |
| Series | powerpc/64: memcmp() optimization |
On Fri, May 25, 2018 at 12:07:33PM +0800, wei.guo.simon@gmail.com wrote:
> _GLOBAL(memcmp)
>  	cmpdi	cr1,r5,0
> 
> -	/* Use the short loop if both strings are not 8B aligned */
> -	or	r6,r3,r4
> +	/* Use the short loop if the src/dst addresses are not
> +	 * with the same offset of 8 bytes align boundary.
> +	 */
> +	xor	r6,r3,r4
>  	andi.	r6,r6,7
> 
> -	/* Use the short loop if length is less than 32B */
> -	cmpdi	cr6,r5,31
> +	/* Fall back to short loop if compare at aligned addrs
> +	 * with less than 8 bytes.
> +	 */
> +	cmpdi	cr6,r5,7
> 
>  	beq	cr1,.Lzero
> -	bne	.Lshort
> -	bgt	cr6,.Llong
> +	bgt	cr6,.Lno_short

If this doesn't use cr0 anymore, you can do  rlwinm r6,r6,0,7  instead of
andi r6,r6,7 .

> +.Lsameoffset_8bytes_make_align_start:
> +	/* attempt to compare bytes not aligned with 8 bytes so that
> +	 * rest comparison can run based on 8 bytes alignment.
> +	 */
> +	andi.	r6,r3,7
> +
> +	/* Try to compare the first double word which is not 8 bytes aligned:
> +	 * load the first double word at (src & ~7UL) and shift left appropriate
> +	 * bits before comparision.
> +	 */
> +	clrlwi	r6,r3,29
> +	rlwinm	r6,r6,3,0,28

Those last two lines are together just
  rlwinm r6,r3,3,0x1c

> +	subfc.	r5,r6,r5

Why subfc?  You don't use the carry.

> 
> +	rlwinm	r6,r6,3,0,28

That's
  slwi r6,r6,3

> +	bgt	cr0,8f
> +	li	r3,-1
> +8:
> +	blr

	blelr
	li r3,-1
	blr

(and more of the same things elsewhere).


Segher
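A minimal C model of the `rlwinm` rotate-and-mask semantics discussed above makes the suggested combinations easy to check by hand. This is not code from the thread; the helper names are invented, and the single-instruction mask for the combined form works out to 0x38 in this model's bit numbering:

```c
#include <stdint.h>
#include <stdio.h>

/* Rotate a 32-bit value left by sh bits (sh in 0..31). */
static uint32_t rotl32(uint32_t x, unsigned int sh)
{
	return sh ? (x << sh) | (x >> (32 - sh)) : x;
}

/* Mask selecting IBM-numbered bits mb..me (bit 0 = MSB); assumes mb <= me. */
static uint32_t be_mask(unsigned int mb, unsigned int me)
{
	return (0xffffffffu >> mb) & (0xffffffffu << (31 - me));
}

/* Model of "rlwinm rA,rS,SH,MB,ME": rotate left, then AND with the mask. */
static uint32_t rlwinm(uint32_t rs, unsigned int sh,
		       unsigned int mb, unsigned int me)
{
	return rotl32(rs, sh) & be_mask(mb, me);
}

int main(void)
{
	uint32_t r3 = 0x12345677;	/* arbitrary test address */

	/* clrlwi r6,r3,29 is rlwinm r6,r3,0,29,31, i.e. r3 & 7 */
	uint32_t r6 = rlwinm(r3, 0, 29, 31);

	/* rlwinm r6,r6,3,0,28: since r6 <= 7, this is just r6 << 3 (slwi) */
	r6 = rlwinm(r6, 3, 0, 28);

	/* One rlwinm with shift 3 and a 0x38 mask computes the same thing. */
	printf("%u == %u\n", r6, rotl32(r3, 3) & 0x38);
	return 0;
}
```

The combined value is the byte offset within the doubleword converted to a bit shift, which is what the `sld`/`srd` instructions later consume.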
Hi Segher,

On Mon, May 28, 2018 at 05:35:12AM -0500, Segher Boessenkool wrote:
> On Fri, May 25, 2018 at 12:07:33PM +0800, wei.guo.simon@gmail.com wrote:
> > _GLOBAL(memcmp)
> >  	cmpdi	cr1,r5,0
> > 
> > -	/* Use the short loop if both strings are not 8B aligned */
> > -	or	r6,r3,r4
> > +	/* Use the short loop if the src/dst addresses are not
> > +	 * with the same offset of 8 bytes align boundary.
> > +	 */
> > +	xor	r6,r3,r4
> >  	andi.	r6,r6,7
> > 
> > -	/* Use the short loop if length is less than 32B */
> > -	cmpdi	cr6,r5,31
> > +	/* Fall back to short loop if compare at aligned addrs
> > +	 * with less than 8 bytes.
> > +	 */
> > +	cmpdi	cr6,r5,7
> > 
> >  	beq	cr1,.Lzero
> > -	bne	.Lshort
> > -	bgt	cr6,.Llong
> > +	bgt	cr6,.Lno_short
> 
> If this doesn't use cr0 anymore, you can do rlwinm r6,r6,0,7 instead of
> andi r6,r6,7 .
> 
CR0 is used in the .Lno_short handling.

> > +.Lsameoffset_8bytes_make_align_start:
> > +	/* attempt to compare bytes not aligned with 8 bytes so that
> > +	 * rest comparison can run based on 8 bytes alignment.
> > +	 */
> > +	andi.	r6,r3,7
> > +
> > +	/* Try to compare the first double word which is not 8 bytes aligned:
> > +	 * load the first double word at (src & ~7UL) and shift left appropriate
> > +	 * bits before comparision.
> > +	 */
> > +	clrlwi	r6,r3,29
> > +	rlwinm	r6,r6,3,0,28
> 
> Those last two lines are together just
>   rlwinm r6,r3,3,0x1c
> 
Yes. I will combine them.

> > +	subfc.	r5,r6,r5
> 
> Why subfc?  You don't use the carry.
OK. I will use subfc instead.

> 
> > +	rlwinm	r6,r6,3,0,28
> 
> That's
>   slwi r6,r6,3
Yes.

> 
> > +	bgt	cr0,8f
> > +	li	r3,-1
> > +8:
> > +	blr
> 
> 	blelr
> 	li r3,-1
> 	blr
Sure. That looks more compact.

> 
> (and more of the same things elsewhere).
> 
> 
> Segher
Thanks for your helpful comments.

BR,
- Simon
Hi!

On Wed, May 30, 2018 at 04:11:50PM +0800, Simon Guo wrote:
> On Mon, May 28, 2018 at 05:35:12AM -0500, Segher Boessenkool wrote:
> > On Fri, May 25, 2018 at 12:07:33PM +0800, wei.guo.simon@gmail.com wrote:
> > If this doesn't use cr0 anymore, you can do rlwinm r6,r6,0,7 instead of
> > andi r6,r6,7 .
> > 
> CR0 is used in the .Lno_short handling.

Tricky.

> > > +	subfc.	r5,r6,r5
> > 
> > Why subfc?  You don't use the carry.
> OK. I will use subfc instead.

I meant subf -- no carry.  If you want CR0 set, there is subf. just fine.

> > > +	bgt	cr0,8f
> > > +	li	r3,-1
> > > +8:
> > > +	blr
> > 
> > 	blelr
> > 	li r3,-1
> > 	blr
> Sure. That looks more compact.

Should have been bgtlr of course -- well, check please :-)


Segher
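In C terms, the result selection the thread converges on for the early-exit path (li r3,1; cmpld; bgtlr; li r3,-1; blr) is the usual two-way pick. A sketch, not from the thread:

```c
#include <stdint.h>

/* Model of the corrected .LcmpAB_lightweight tail: r3 is preloaded
 * with 1 and returned directly by bgtlr when rA > rB (unsigned);
 * otherwise it is overwritten with -1.  Assumes rA != rB. */
static int cmpAB_result(uint64_t rA, uint64_t rB)
{
	int r3 = 1;		/* li r3,1 */
	if (rA > rB)		/* cmpld cr0,rA,rB; bgtlr */
		return r3;
	return -1;		/* li r3,-1; blr */
}
```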
On Wed, May 30, 2018 at 03:27:39AM -0500, Segher Boessenkool wrote:
> Hi!
> 
> On Wed, May 30, 2018 at 04:11:50PM +0800, Simon Guo wrote:
> > On Mon, May 28, 2018 at 05:35:12AM -0500, Segher Boessenkool wrote:
> > > On Fri, May 25, 2018 at 12:07:33PM +0800, wei.guo.simon@gmail.com wrote:
> > > If this doesn't use cr0 anymore, you can do rlwinm r6,r6,0,7 instead of
> > > andi r6,r6,7 .
> > > 
> > CR0 is used in the .Lno_short handling.
> 
> Tricky.
> 
> > > > +	subfc.	r5,r6,r5
> > > 
> > > Why subfc?  You don't use the carry.
> > OK. I will use subfc instead.
> 
> I meant subf -- no carry.  If you want CR0 set, there is subf. just fine.
> 
> > > > +	bgt	cr0,8f
> > > > +	li	r3,-1
> > > > +8:
> > > > +	blr
> > > 
> > > 	blelr
> > > 	li r3,-1
> > > 	blr
> > Sure. That looks more compact.
> 
> Should have been bgtlr of course -- well, check please :-)

Yes :)

Thanks,
- Simon
```diff
diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index d75d18b..f20e883 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -24,28 +24,41 @@
 #define rH	r31
 
 #ifdef __LITTLE_ENDIAN__
+#define LH	lhbrx
+#define LW	lwbrx
 #define LD	ldbrx
 #else
+#define LH	lhzx
+#define LW	lwzx
 #define LD	ldx
 #endif
 
+/*
+ * There are 2 categories for memcmp:
+ * 1) src/dst has the same offset to the 8 bytes boundary. The handlers
+ *    are named like .Lsameoffset_xxxx
+ * 2) src/dst has different offset to the 8 bytes boundary. The handlers
+ *    are named like .Ldiffoffset_xxxx
+ */
 _GLOBAL(memcmp)
 	cmpdi	cr1,r5,0
 
-	/* Use the short loop if both strings are not 8B aligned */
-	or	r6,r3,r4
+	/* Use the short loop if the src/dst addresses are not
+	 * with the same offset of 8 bytes align boundary.
+	 */
+	xor	r6,r3,r4
 	andi.	r6,r6,7
 
-	/* Use the short loop if length is less than 32B */
-	cmpdi	cr6,r5,31
+	/* Fall back to short loop if compare at aligned addrs
+	 * with less than 8 bytes.
+	 */
+	cmpdi	cr6,r5,7
 
 	beq	cr1,.Lzero
-	bne	.Lshort
-	bgt	cr6,.Llong
+	bgt	cr6,.Lno_short
 
 .Lshort:
 	mtctr	r5
-
 1:	lbz	rA,0(r3)
 	lbz	rB,0(r4)
 	subf.	rC,rB,rA
@@ -78,11 +91,90 @@ _GLOBAL(memcmp)
 	li	r3,0
 	blr
 
+.Lno_short:
+	dcbt	0,r3
+	dcbt	0,r4
+	bne	.Ldiffoffset_8bytes_make_align_start
+
+
+.Lsameoffset_8bytes_make_align_start:
+	/* attempt to compare bytes not aligned with 8 bytes so that
+	 * rest comparison can run based on 8 bytes alignment.
+	 */
+	andi.	r6,r3,7
+
+	/* Try to compare the first double word which is not 8 bytes aligned:
+	 * load the first double word at (src & ~7UL) and shift left appropriate
+	 * bits before comparision.
+	 */
+	clrlwi	r6,r3,29
+	rlwinm	r6,r6,3,0,28
+	beq	.Lsameoffset_8bytes_aligned
+	clrrdi	r3,r3,3
+	clrrdi	r4,r4,3
+	LD	rA,0,r3
+	LD	rB,0,r4
+	sld	rA,rA,r6
+	sld	rB,rB,r6
+	cmpld	cr0,rA,rB
+	srwi	r6,r6,3
+	bne	cr0,.LcmpAB_lightweight
+	subfic	r6,r6,8
+	subfc.	r5,r6,r5
+	addi	r3,r3,8
+	addi	r4,r4,8
+	beq	.Lzero
+
+.Lsameoffset_8bytes_aligned:
+	/* now we are aligned with 8 bytes.
+	 * Use .Llong loop if left cmp bytes are equal or greater than 32B.
+	 */
+	cmpdi	cr6,r5,31
+	bgt	cr6,.Llong
+
+.Lcmp_lt32bytes:
+	/* compare 1 ~ 32 bytes, at least r3 addr is 8 bytes aligned now */
+	cmpdi	cr5,r5,7
+	srdi	r0,r5,3
+	ble	cr5,.Lcmp_rest_lt8bytes
+
+	/* handle 8 ~ 31 bytes */
+	clrldi	r5,r5,61
+	mtctr	r0
+2:
+	LD	rA,0,r3
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	addi	r3,r3,8
+	addi	r4,r4,8
+	bne	cr0,.LcmpAB_lightweight
+	bdnz	2b
+
+	cmpwi	r5,0
+	beq	.Lzero
+
+.Lcmp_rest_lt8bytes:
+	/* Here we have only less than 8 bytes to compare with. at least s1
+	 * Address is aligned with 8 bytes.
+	 * The next double words are load and shift right with appropriate
+	 * bits.
+	 */
+	subfic	r6,r5,8
+	rlwinm	r6,r6,3,0,28
+	LD	rA,0,r3
+	LD	rB,0,r4
+	srd	rA,rA,r6
+	srd	rB,rB,r6
+	cmpld	cr0,rA,rB
+	bne	cr0,.LcmpAB_lightweight
+	b	.Lzero
+
 .Lnon_zero:
 	mr	r3,rC
 	blr
 
 .Llong:
+	/* At least s1 addr is aligned with 8 bytes */
 	li	off8,8
 	li	off16,16
 	li	off24,24
@@ -232,4 +324,41 @@ _GLOBAL(memcmp)
 	ld	r28,-32(r1)
 	ld	r27,-40(r1)
 	blr
+
+.LcmpAB_lightweight:	/* skip NV GPRS restore */
+	li	r3,1
+	bgt	cr0,8f
+	li	r3,-1
+8:
+	blr
+
+.Ldiffoffset_8bytes_make_align_start:
+	/* now try to align s1 with 8 bytes */
+	andi.	r6,r3,0x7
+	rlwinm	r6,r6,3,0,28
+	beq	.Ldiffoffset_align_s1_8bytes
+
+	clrrdi	r3,r3,3
+	LD	rA,0,r3
+	LD	rB,0,r4	/* unaligned load */
+	sld	rA,rA,r6
+	srd	rA,rA,r6
+	srd	rB,rB,r6
+	cmpld	cr0,rA,rB
+	srwi	r6,r6,3
+	bne	cr0,.LcmpAB_lightweight
+
+	subfic	r6,r6,8
+	subfc.	r5,r6,r5
+	addi	r3,r3,8
+	add	r4,r4,r6
+
+	beq	.Lzero
+
+.Ldiffoffset_align_s1_8bytes:
+	/* now s1 is aligned with 8 bytes. */
+	cmpdi	cr5,r5,31
+	ble	cr5,.Lcmp_lt32bytes
+	b	.Llong
+
 EXPORT_SYMBOL(memcmp)
```
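For readers who do not think in PowerPC assembly, here is a rough C sketch of the same-offset head handling the patch adds. It is an editor's illustration under the stated preconditions, not the kernel implementation; the function and helper names are invented:

```c
#include <stddef.h>
#include <stdint.h>

/* Load 8 bytes at p in memory (big-endian) order, matching what LD
 * (ldx on BE, ldbrx on LE) leaves in the register for cmpld. */
static uint64_t load_be64(const unsigned char *p)
{
	uint64_t v = 0;
	for (int i = 0; i < 8; i++)
		v = (v << 8) | p[i];
	return v;
}

/*
 * Compare the unaligned head when s1 and s2 share the same offset from
 * an 8-byte boundary.  Preconditions: off != 0 and *n >= 8 - off.
 * Returns <0/0/>0 like memcmp; on 0, *n is reduced by the bytes
 * consumed and the caller continues with fully aligned 8-byte compares.
 *
 * NOTE: the aligned loads read up to 7 bytes before s1/s2.  That is
 * safe in the assembly (same aligned doubleword, hence same page) but
 * undefined behaviour in portable C -- this is for understanding only.
 */
static int head_compare(const unsigned char *s1, const unsigned char *s2,
			size_t *n)
{
	unsigned int off = (uintptr_t)s1 & 7;	/* andi. r6,r3,7 */
	unsigned int sh = off * 8;		/* byte offset -> bit shift */

	/* clrrdi + LD + sld: load the enclosing aligned doubleword and
	 * shift out the bytes that precede the buffer. */
	uint64_t rA = load_be64(s1 - off) << sh;
	uint64_t rB = load_be64(s2 - off) << sh;

	if (rA != rB)			/* cmpld + .LcmpAB_lightweight */
		return rA < rB ? -1 : 1;

	*n -= 8 - off;			/* subf. r5,r6,r5 (per the review) */
	return 0;
}
```

With the head out of the way, both pointers advance past the boundary, which is what the `addi r3,r3,8` / `addi r4,r4,8` pair does in the assembly before falling into the aligned doubleword loop.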