Message ID | 1527058083-6998-3-git-send-email-wei.guo.simon@gmail.com |
---|---|
State | Superseded, archived |
Series | powerpc/64: memcmp() optimization |
Hi Michael,

On Thu, May 24, 2018 at 05:44:33PM +1000, Michael Ellerman wrote:
> Hi Simon,
>
> wei.guo.simon@gmail.com writes:
> > From: Simon Guo <wei.guo.simon@gmail.com>
> >
> > This patch add VMX primitives to do memcmp() in case the compare size
> > exceeds 4K bytes. KSM feature can benefit from this.
>
> You say "exceeds 4K" here.
>
It should be >= 4K. I will correct the message.

> > diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
> > index f20e883..6303bbf 100644
> > --- a/arch/powerpc/lib/memcmp_64.S
> > +++ b/arch/powerpc/lib/memcmp_64.S
> > @@ -27,12 +27,73 @@
> >  #define LH	lhbrx
> >  #define LW	lwbrx
> >  #define LD	ldbrx
> > +#define LVS	lvsr
> > +#define VPERM(_VRT,_VRA,_VRB,_VRC) \
> > +	vperm _VRT,_VRB,_VRA,_VRC
> >  #else
> >  #define LH	lhzx
> >  #define LW	lwzx
> >  #define LD	ldx
> > +#define LVS	lvsl
> > +#define VPERM(_VRT,_VRA,_VRB,_VRC) \
> > +	vperm _VRT,_VRA,_VRB,_VRC
> >  #endif
> >
> > +#define VMX_OPS_THRES	4096
>
> THRES == 4096
>
> BTW, can we call it VMX_THRESH ?
>
Sure. I will update it.

> > +#define ENTER_VMX_OPS	\
> > +	mflr	r0; \
> > +	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
> > +	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
> > +	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
> > +	std	r0,16(r1); \
> > +	stdu	r1,-STACKFRAMESIZE(r1); \
> > +	bl	enter_vmx_ops; \
> > +	cmpwi	cr1,r3,0; \
> > +	ld	r0,STACKFRAMESIZE+16(r1); \
> > +	ld	r3,STK_REG(R31)(r1); \
> > +	ld	r4,STK_REG(R30)(r1); \
> > +	ld	r5,STK_REG(R29)(r1); \
> > +	addi	r1,r1,STACKFRAMESIZE; \
> > +	mtlr	r0
> > +
> > +#define EXIT_VMX_OPS \
> > +	mflr	r0; \
> > +	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
> > +	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
> > +	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
> > +	std	r0,16(r1); \
> > +	stdu	r1,-STACKFRAMESIZE(r1); \
> > +	bl	exit_vmx_ops; \
> > +	ld	r0,STACKFRAMESIZE+16(r1); \
> > +	ld	r3,STK_REG(R31)(r1); \
> > +	ld	r4,STK_REG(R30)(r1); \
> > +	ld	r5,STK_REG(R29)(r1); \
> > +	addi	r1,r1,STACKFRAMESIZE; \
> > +	mtlr	r0
> > +
> > +/*
> > + * LD_VSR_CROSS16B load the 2nd 16 bytes for _vaddr which is unaligned with
> > + * 16 bytes boundary and permute the result with the 1st 16 bytes.
> > +
> > + * |  y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
> > + *    ^                                  ^                                 ^
> > + * 0xbbbb10                          0xbbbb20                          0xbbb30
> > + *                                    ^
> > + *                                    _vaddr
> > + *
> > + *
> > + * _vmask is the mask generated by LVS
> > + * _v1st_qw is the 1st aligned QW of current addr which is already loaded.
> > + *   for example: 0xyyyyyyyyyyyyy012 for big endian
> > + * _v2nd_qw is the 2nd aligned QW of cur _vaddr to be loaded.
> > + *   for example: 0x3456789abcdefzzz for big endian
> > + * The permute result is saved in _v_res.
> > + *   for example: 0x0123456789abcdef for big endian.
> > + */
> > +#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
> > +	lvx	_v2nd_qw,_vaddr,off16; \
> > +	VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)
> > +
> >  /*
> >   * There are 2 categories for memcmp:
> >   * 1) src/dst has the same offset to the 8 bytes boundary. The handlers
> > @@ -174,6 +235,13 @@ _GLOBAL(memcmp)
> >  	blr
> >
> >  .Llong:
> > +#ifdef CONFIG_ALTIVEC
> > +	/* Try to use vmx loop if length is larger than 4K */
> > +	cmpldi	cr6,r5,VMX_OPS_THRES
> > +	bge	cr6,.Lsameoffset_vmx_cmp
>
> Here we compare the length to 4K and if it's greater *or equal* then we
> go to the VMX case. Or am I reading it backward?
>
> So we should say "if the size is 4K or more we do VMX" shouldn't we?

Yes. Again, I need to reword the comment to "equal to or greater than 4K" here.

Thanks,
- Simon
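For reference, the semantics agreed above (take the VMX path when the compare length is equal to or greater than 4K, which is what the bge does) correspond roughly to the following C sketch. It is illustrative only; VMX_THRESH, memcmp_vmx_loop() and memcmp_scalar() are placeholder names, not symbols from the patch.

/* Illustrative sketch only: the ">= 4K" dispatch being discussed above. */
#include <stddef.h>

#define VMX_THRESH 4096	/* proposed rename of VMX_OPS_THRES */

/* stand-in for the existing scalar/GPR compare loop */
static int memcmp_scalar(const unsigned char *a, const unsigned char *b, size_t n)
{
	size_t i;

	for (i = 0; i < n; i++) {
		if (a[i] != b[i])
			return a[i] < b[i] ? -1 : 1;
	}
	return 0;
}

/* stand-in for the vectorised loop added by the patch */
static int memcmp_vmx_loop(const unsigned char *a, const unsigned char *b, size_t n)
{
	return memcmp_scalar(a, b, n);	/* real code would use VMX here */
}

int memcmp_dispatch(const void *s1, const void *s2, size_t n)
{
	const unsigned char *a = s1, *b = s2;

	if (n >= VMX_THRESH)	/* "equal to or greater than 4K", not "> 4K" */
		return memcmp_vmx_loop(a, b, n);
	return memcmp_scalar(a, b, n);
}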
Hi Simon,

wei.guo.simon@gmail.com writes:
> From: Simon Guo <wei.guo.simon@gmail.com>
>
> This patch add VMX primitives to do memcmp() in case the compare size
> exceeds 4K bytes. KSM feature can benefit from this.

You say "exceeds 4K" here.

> diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
> index f20e883..6303bbf 100644
> --- a/arch/powerpc/lib/memcmp_64.S
> +++ b/arch/powerpc/lib/memcmp_64.S
> @@ -27,12 +27,73 @@
>  #define LH	lhbrx
>  #define LW	lwbrx
>  #define LD	ldbrx
> +#define LVS	lvsr
> +#define VPERM(_VRT,_VRA,_VRB,_VRC) \
> +	vperm _VRT,_VRB,_VRA,_VRC
>  #else
>  #define LH	lhzx
>  #define LW	lwzx
>  #define LD	ldx
> +#define LVS	lvsl
> +#define VPERM(_VRT,_VRA,_VRB,_VRC) \
> +	vperm _VRT,_VRA,_VRB,_VRC
>  #endif
>
> +#define VMX_OPS_THRES	4096

THRES == 4096

BTW, can we call it VMX_THRESH ?

> +#define ENTER_VMX_OPS	\
> +	mflr	r0; \
> +	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
> +	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
> +	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
> +	std	r0,16(r1); \
> +	stdu	r1,-STACKFRAMESIZE(r1); \
> +	bl	enter_vmx_ops; \
> +	cmpwi	cr1,r3,0; \
> +	ld	r0,STACKFRAMESIZE+16(r1); \
> +	ld	r3,STK_REG(R31)(r1); \
> +	ld	r4,STK_REG(R30)(r1); \
> +	ld	r5,STK_REG(R29)(r1); \
> +	addi	r1,r1,STACKFRAMESIZE; \
> +	mtlr	r0
> +
> +#define EXIT_VMX_OPS \
> +	mflr	r0; \
> +	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
> +	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
> +	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
> +	std	r0,16(r1); \
> +	stdu	r1,-STACKFRAMESIZE(r1); \
> +	bl	exit_vmx_ops; \
> +	ld	r0,STACKFRAMESIZE+16(r1); \
> +	ld	r3,STK_REG(R31)(r1); \
> +	ld	r4,STK_REG(R30)(r1); \
> +	ld	r5,STK_REG(R29)(r1); \
> +	addi	r1,r1,STACKFRAMESIZE; \
> +	mtlr	r0
> +
> +/*
> + * LD_VSR_CROSS16B load the 2nd 16 bytes for _vaddr which is unaligned with
> + * 16 bytes boundary and permute the result with the 1st 16 bytes.
> +
> + * |  y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
> + *    ^                                  ^                                 ^
> + * 0xbbbb10                          0xbbbb20                          0xbbb30
> + *                                    ^
> + *                                    _vaddr
> + *
> + *
> + * _vmask is the mask generated by LVS
> + * _v1st_qw is the 1st aligned QW of current addr which is already loaded.
> + *   for example: 0xyyyyyyyyyyyyy012 for big endian
> + * _v2nd_qw is the 2nd aligned QW of cur _vaddr to be loaded.
> + *   for example: 0x3456789abcdefzzz for big endian
> + * The permute result is saved in _v_res.
> + *   for example: 0x0123456789abcdef for big endian.
> + */
> +#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
> +	lvx	_v2nd_qw,_vaddr,off16; \
> +	VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)
> +
>  /*
>   * There are 2 categories for memcmp:
>   * 1) src/dst has the same offset to the 8 bytes boundary. The handlers
> @@ -174,6 +235,13 @@ _GLOBAL(memcmp)
>  	blr
>
>  .Llong:
> +#ifdef CONFIG_ALTIVEC
> +	/* Try to use vmx loop if length is larger than 4K */
> +	cmpldi	cr6,r5,VMX_OPS_THRES
> +	bge	cr6,.Lsameoffset_vmx_cmp

Here we compare the length to 4K and if it's greater *or equal* then we
go to the VMX case. Or am I reading it backward?

So we should say "if the size is 4K or more we do VMX" shouldn't we?

cheers
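The LD_VSR_CROSS16B macro quoted above is the classic lvsl/lvx/vperm idiom for loading 16 unaligned bytes out of two aligned quadwords. A rough big-endian sketch using GCC AltiVec intrinsics, for illustration only (the patch itself uses lvsr and swapped vperm operands for the little-endian build):

/* Sketch of the lvsl/lvx/vperm idiom behind LD_VSR_CROSS16B (big endian). */
#include <altivec.h>

vector unsigned char load_unaligned_16b(const unsigned char *p)
{
	/* permute mask derived from the low address bits (lvsl) */
	vector unsigned char mask = vec_lvsl(0, p);
	/* the two aligned quadwords straddling p; lvx ignores the low 4 bits */
	vector unsigned char qw1 = vec_ld(0, p);
	vector unsigned char qw2 = vec_ld(16, p);
	/* splice them into the 16 bytes starting at p */
	return vec_perm(qw1, qw2, mask);
}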
diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h
index d9713ad..31fdcee 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -49,8 +49,8 @@ void __trace_hcall_exit(long opcode, unsigned long retval,
 /* VMX copying */
 int enter_vmx_usercopy(void);
 int exit_vmx_usercopy(void);
-int enter_vmx_copy(void);
-void * exit_vmx_copy(void *dest);
+int enter_vmx_ops(void);
+void *exit_vmx_ops(void *dest);
 
 /* Traps */
 long machine_check_early(struct pt_regs *regs);
diff --git a/arch/powerpc/lib/copypage_power7.S b/arch/powerpc/lib/copypage_power7.S
index 8fa73b7..e38f956 100644
--- a/arch/powerpc/lib/copypage_power7.S
+++ b/arch/powerpc/lib/copypage_power7.S
@@ -57,7 +57,7 @@ _GLOBAL(copypage_power7)
 	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
 	std	r0,16(r1)
 	stdu	r1,-STACKFRAMESIZE(r1)
-	bl	enter_vmx_copy
+	bl	enter_vmx_ops
 	cmpwi	r3,0
 	ld	r0,STACKFRAMESIZE+16(r1)
 	ld	r3,STK_REG(R31)(r1)
@@ -100,7 +100,7 @@ _GLOBAL(copypage_power7)
 	addi	r3,r3,128
 	bdnz	1b
 
-	b	exit_vmx_copy		/* tail call optimise */
+	b	exit_vmx_ops		/* tail call optimise */
 
 #else
 	li	r0,(PAGE_SIZE/128)
diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index f20e883..6303bbf 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -27,12 +27,73 @@
 #define LH	lhbrx
 #define LW	lwbrx
 #define LD	ldbrx
+#define LVS	lvsr
+#define VPERM(_VRT,_VRA,_VRB,_VRC) \
+	vperm _VRT,_VRB,_VRA,_VRC
 #else
 #define LH	lhzx
 #define LW	lwzx
 #define LD	ldx
+#define LVS	lvsl
+#define VPERM(_VRT,_VRA,_VRB,_VRC) \
+	vperm _VRT,_VRA,_VRB,_VRC
 #endif
 
+#define VMX_OPS_THRES	4096
+#define ENTER_VMX_OPS	\
+	mflr	r0; \
+	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
+	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
+	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
+	std	r0,16(r1); \
+	stdu	r1,-STACKFRAMESIZE(r1); \
+	bl	enter_vmx_ops; \
+	cmpwi	cr1,r3,0; \
+	ld	r0,STACKFRAMESIZE+16(r1); \
+	ld	r3,STK_REG(R31)(r1); \
+	ld	r4,STK_REG(R30)(r1); \
+	ld	r5,STK_REG(R29)(r1); \
+	addi	r1,r1,STACKFRAMESIZE; \
+	mtlr	r0
+
+#define EXIT_VMX_OPS \
+	mflr	r0; \
+	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
+	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
+	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
+	std	r0,16(r1); \
+	stdu	r1,-STACKFRAMESIZE(r1); \
+	bl	exit_vmx_ops; \
+	ld	r0,STACKFRAMESIZE+16(r1); \
+	ld	r3,STK_REG(R31)(r1); \
+	ld	r4,STK_REG(R30)(r1); \
+	ld	r5,STK_REG(R29)(r1); \
+	addi	r1,r1,STACKFRAMESIZE; \
+	mtlr	r0
+
+/*
+ * LD_VSR_CROSS16B load the 2nd 16 bytes for _vaddr which is unaligned with
+ * 16 bytes boundary and permute the result with the 1st 16 bytes.
+
+ * |  y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
+ *    ^                                  ^                                 ^
+ * 0xbbbb10                          0xbbbb20                          0xbbb30
+ *                                    ^
+ *                                    _vaddr
+ *
+ *
+ * _vmask is the mask generated by LVS
+ * _v1st_qw is the 1st aligned QW of current addr which is already loaded.
+ *   for example: 0xyyyyyyyyyyyyy012 for big endian
+ * _v2nd_qw is the 2nd aligned QW of cur _vaddr to be loaded.
+ *   for example: 0x3456789abcdefzzz for big endian
+ * The permute result is saved in _v_res.
+ *   for example: 0x0123456789abcdef for big endian.
+ */
+#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
+	lvx	_v2nd_qw,_vaddr,off16; \
+	VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)
+
 /*
  * There are 2 categories for memcmp:
  * 1) src/dst has the same offset to the 8 bytes boundary. The handlers
@@ -174,6 +235,13 @@ _GLOBAL(memcmp)
 	blr
 
 .Llong:
+#ifdef CONFIG_ALTIVEC
+	/* Try to use vmx loop if length is larger than 4K */
+	cmpldi	cr6,r5,VMX_OPS_THRES
+	bge	cr6,.Lsameoffset_vmx_cmp
+
+.Llong_novmx_cmp:
+#endif
 	/* At least s1 addr is aligned with 8 bytes */
 	li	off8,8
 	li	off16,16
@@ -332,7 +400,94 @@ _GLOBAL(memcmp)
 8:
 	blr
 
+#ifdef CONFIG_ALTIVEC
+.Lsameoffset_vmx_cmp:
+	/* Enter with src/dst addrs has the same offset with 8 bytes
+	 * align boundary
+	 */
+	ENTER_VMX_OPS
+	beq	cr1,.Llong_novmx_cmp
+
+3:
+	/* need to check whether r4 has the same offset with r3
+	 * for 16 bytes boundary.
+	 */
+	xor	r0,r3,r4
+	andi.	r0,r0,0xf
+	bne	.Ldiffoffset_vmx_cmp_start
+
+	/* len is no less than 4KB. Need to align with 16 bytes further.
+	 */
+	andi.	rA,r3,8
+	LD	rA,0,r3
+	beq	4f
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	addi	r3,r3,8
+	addi	r4,r4,8
+	addi	r5,r5,-8
+
+	beq	cr0,4f
+	/* save and restore cr0 */
+	mfocrf	r5,64
+	EXIT_VMX_OPS
+	mtocrf	64,r5
+	b	.LcmpAB_lightweight
+
+4:
+	/* compare 32 bytes for each loop */
+	srdi	r0,r5,5
+	mtctr	r0
+	clrldi	r5,r5,59
+	li	off16,16
+
+.balign	16
+5:
+	lvx	v0,0,r3
+	lvx	v1,0,r4
+	vcmpequd. v0,v0,v1
+	bf	24,7f
+	lvx	v0,off16,r3
+	lvx	v1,off16,r4
+	vcmpequd. v0,v0,v1
+	bf	24,6f
+	addi	r3,r3,32
+	addi	r4,r4,32
+	bdnz	5b
+
+	EXIT_VMX_OPS
+	cmpdi	r5,0
+	beq	.Lzero
+	b	.Lcmp_lt32bytes
+
+6:
+	addi	r3,r3,16
+	addi	r4,r4,16
+
+7:
+	/* diff the last 16 bytes */
+	EXIT_VMX_OPS
+	LD	rA,0,r3
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	li	off8,8
+	bne	cr0,.LcmpAB_lightweight
+
+	LD	rA,off8,r3
+	LD	rB,off8,r4
+	cmpld	cr0,rA,rB
+	bne	cr0,.LcmpAB_lightweight
+	b	.Lzero
+#endif
+
 .Ldiffoffset_8bytes_make_align_start:
+#ifdef CONFIG_ALTIVEC
+	/* only do vmx ops when the size exceeds 4K bytes */
+	cmpdi	cr5,r5,VMX_OPS_THRES
+	bge	cr5,.Ldiffoffset_vmx_cmp
+.Ldiffoffset_novmx_cmp:
+#endif
+
 	/* now try to align s1 with 8 bytes */
 	andi.	r6,r3,0x7
 	rlwinm	r6,r6,3,0,28
@@ -359,6 +514,82 @@ _GLOBAL(memcmp)
 	/* now s1 is aligned with 8 bytes. */
 	cmpdi	cr5,r5,31
 	ble	cr5,.Lcmp_lt32bytes
+
+#ifdef CONFIG_ALTIVEC
+	b	.Llong_novmx_cmp
+#else
 	b	.Llong
+#endif
+
+#ifdef CONFIG_ALTIVEC
+.Ldiffoffset_vmx_cmp:
+	ENTER_VMX_OPS
+	beq	cr1,.Ldiffoffset_novmx_cmp
+
+.Ldiffoffset_vmx_cmp_start:
+	/* Firstly try to align r3 with 16 bytes */
+	andi.	r6,r3,0xf
+	li	off16,16
+	beq	.Ldiffoffset_vmx_s1_16bytes_align
+	LVS	v3,0,r3
+	LVS	v4,0,r4
+
+	lvx	v5,0,r3
+	lvx	v6,0,r4
+	LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
+	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
+
+	vcmpequb. v7,v9,v10
+	bnl	cr6,.Ldiffoffset_vmx_diff_found
+
+	subfic	r6,r6,16
+	subf	r5,r6,r5
+	add	r3,r3,r6
+	add	r4,r4,r6
+
+.Ldiffoffset_vmx_s1_16bytes_align:
+	/* now s1 is aligned with 16 bytes */
+	lvx	v6,0,r4
+	LVS	v4,0,r4
+	srdi	r6,r5,5  /* loop for 32 bytes each */
+	clrldi	r5,r5,59
+	mtctr	r6
+
+.balign	16
+.Ldiffoffset_vmx_32bytesloop:
+	/* the first qw of r4 was saved in v6 */
+	lvx	v9,0,r3
+	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
+	vcmpequb. v7,v9,v10
+	vor	v6,v8,v8
+	bnl	cr6,.Ldiffoffset_vmx_diff_found
+
+	addi	r3,r3,16
+	addi	r4,r4,16
+
+	lvx	v9,0,r3
+	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
+	vcmpequb. v7,v9,v10
+	vor	v6,v8,v8
+	bnl	cr6,.Ldiffoffset_vmx_diff_found
+
+	addi	r3,r3,16
+	addi	r4,r4,16
+
+	bdnz	.Ldiffoffset_vmx_32bytesloop
+
+	EXIT_VMX_OPS
+
+	cmpdi	r5,0
+	beq	.Lzero
+	b	.Lcmp_lt32bytes
+
+.Ldiffoffset_vmx_diff_found:
+	EXIT_VMX_OPS
+	/* anyway, the diff will appear in next 16 bytes */
+	li	r5,16
+	b	.Lcmp_lt32bytes
+
+#endif
 
 EXPORT_SYMBOL(memcmp)
diff --git a/arch/powerpc/lib/memcpy_power7.S b/arch/powerpc/lib/memcpy_power7.S
index df7de9d..070cdf6 100644
--- a/arch/powerpc/lib/memcpy_power7.S
+++ b/arch/powerpc/lib/memcpy_power7.S
@@ -230,7 +230,7 @@ _GLOBAL(memcpy_power7)
 	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
 	std	r0,16(r1)
 	stdu	r1,-STACKFRAMESIZE(r1)
-	bl	enter_vmx_copy
+	bl	enter_vmx_ops
 	cmpwi	cr1,r3,0
 	ld	r0,STACKFRAMESIZE+16(r1)
 	ld	r3,STK_REG(R31)(r1)
@@ -445,7 +445,7 @@ _GLOBAL(memcpy_power7)
 
 15:	addi	r1,r1,STACKFRAMESIZE
 	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
-	b	exit_vmx_copy		/* tail call optimise */
+	b	exit_vmx_ops		/* tail call optimise */
 
 .Lvmx_unaligned_copy:
 	/* Get the destination 16B aligned */
@@ -649,5 +649,5 @@ _GLOBAL(memcpy_power7)
 
 15:	addi	r1,r1,STACKFRAMESIZE
 	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
-	b	exit_vmx_copy		/* tail call optimise */
+	b	exit_vmx_ops		/* tail call optimise */
 #endif /* CONFIG_ALTIVEC */
diff --git a/arch/powerpc/lib/vmx-helper.c b/arch/powerpc/lib/vmx-helper.c
index bf925cd..9f34049 100644
--- a/arch/powerpc/lib/vmx-helper.c
+++ b/arch/powerpc/lib/vmx-helper.c
@@ -53,7 +53,7 @@ int exit_vmx_usercopy(void)
 	return 0;
 }
 
-int enter_vmx_copy(void)
+int enter_vmx_ops(void)
 {
 	if (in_interrupt())
 		return 0;
@@ -70,7 +70,7 @@ int enter_vmx_copy(void)
  * passed a pointer to the destination which we return as required by a
  * memcpy implementation.
  */
-void *exit_vmx_copy(void *dest)
+void *exit_vmx_ops(void *dest)
 {
 	disable_kernel_altivec();
 	preempt_enable();
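To summarise the vmx-helper.c change at the end of the patch: enter_vmx_ops() returns 0 in interrupt context, otherwise it disables preemption and enables kernel AltiVec, and exit_vmx_ops() reverses that. A hypothetical C-level caller would bracket its vector section as below; memcmp_vmx_loop() and memcmp_scalar() are placeholders, not code from the patch.

/* Hypothetical kernel-context sketch, not part of the patch. */
static int memcmp_maybe_vmx(const void *s1, const void *s2, size_t n)
{
	int ret;

	if (!enter_vmx_ops())			/* 0: in interrupt, VMX unusable */
		return memcmp_scalar(s1, s2, n);	/* placeholder fallback */

	ret = memcmp_vmx_loop(s1, s2, n);	/* placeholder AltiVec loop */

	exit_vmx_ops(NULL);	/* disable_kernel_altivec() + preempt_enable() */
	return ret;
}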