| Message ID | 1527221256-17029-3-git-send-email-wei.guo.simon@gmail.com |
|---|---|
| State | Superseded, archived |
| Series | powerpc/64: memcmp() optimization |
On Fri, May 25, 2018 at 12:07:34PM +0800, wei.guo.simon@gmail.com wrote:
> +	/* save and restore cr0 */
> +	mfocrf	r5,64
> +	EXIT_VMX_OPS
> +	mtocrf	64,r5
> +	b	.LcmpAB_lightweight

That's cr1, not cr0.  You can use mcrf instead, it is cheaper (esp. if
you have it in a non-volatile CR field before, so you need only one, if any).

> +	vcmpequb. v7,v9,v10
> +	bnl	cr6,.Ldiffoffset_vmx_diff_found

In other places you say "bf 24,...".  Dunno which is more readable, but
please pick one?


Segher
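For reference, the two spellings Segher compares are encodings of the same branch: a vcmpequb. (Rc=1) sets the LT bit of cr6 when all byte elements are equal, and cr6's LT bit is CR bit 4*6+0 = 24. A minimal sketch, with an illustrative label name:

	vcmpequb. v7,v9,v10		/* Rc=1: summary result lands in cr6 */
	bnl	cr6,.Ldiff_found	/* taken when not all bytes were equal */

	/* assembles to exactly the same instruction as */

	bf	24,.Ldiff_found		/* bit 24 == 4*cr6+lt, the "all equal" bit */

So the choice between the two really is purely one of readability, as Segher says.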
Hi Simon,

wei.guo.simon@gmail.com writes:
> diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
> index f20e883..4ba7bb6 100644
> --- a/arch/powerpc/lib/memcmp_64.S
> +++ b/arch/powerpc/lib/memcmp_64.S
> @@ -174,6 +235,13 @@ _GLOBAL(memcmp)
>  	blr
>  
>  .Llong:
> +#ifdef CONFIG_ALTIVEC
> +	/* Try to use vmx loop if length is equal or greater than 4K */
> +	cmpldi	cr6,r5,VMX_THRESH
> +	bge	cr6,.Lsameoffset_vmx_cmp
> +

Here we decide to use vmx, but we don't do any CPU feature checks.

> @@ -332,7 +400,94 @@ _GLOBAL(memcmp)
>  8:
>  	blr
>  
> +#ifdef CONFIG_ALTIVEC
> +.Lsameoffset_vmx_cmp:
> +	/* Enter with src/dst addrs has the same offset with 8 bytes
> +	 * align boundary
> +	 */
> +	ENTER_VMX_OPS
> +	beq	cr1,.Llong_novmx_cmp
> +
> +3:
> +	/* need to check whether r4 has the same offset with r3
> +	 * for 16 bytes boundary.
> +	 */
> +	xor	r0,r3,r4
> +	andi.	r0,r0,0xf
> +	bne	.Ldiffoffset_vmx_cmp_start
> +
> +	/* len is no less than 4KB. Need to align with 16 bytes further.
> +	 */
> +	andi.	rA,r3,8
> +	LD	rA,0,r3
> +	beq	4f
> +	LD	rB,0,r4
> +	cmpld	cr0,rA,rB
> +	addi	r3,r3,8
> +	addi	r4,r4,8
> +	addi	r5,r5,-8
> +
> +	beq	cr0,4f
> +	/* save and restore cr0 */
> +	mfocrf	r5,64
> +	EXIT_VMX_OPS
> +	mtocrf	64,r5
> +	b	.LcmpAB_lightweight
> +
> +4:
> +	/* compare 32 bytes for each loop */
> +	srdi	r0,r5,5
> +	mtctr	r0
> +	clrldi	r5,r5,59
> +	li	off16,16
> +
> +.balign 16
> +5:
> +	lvx	v0,0,r3
> +	lvx	v1,0,r4
> +	vcmpequd. v0,v0,v1

vcmpequd is only available on Power8 and later CPUs.

Which means this will crash on Power7 or earlier.

Something like this should fix it I think.

diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index 96eb08b2be2e..0a11ff14dcd9 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -236,9 +236,11 @@ _GLOBAL(memcmp)
 
 .Llong:
 #ifdef CONFIG_ALTIVEC
+BEGIN_FTR_SECTION
 	/* Try to use vmx loop if length is equal or greater than 4K */
 	cmpldi	cr6,r5,VMX_THRESH
 	bge	cr6,.Lsameoffset_vmx_cmp
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 
 .Llong_novmx_cmp:
 #endif


There's another problem which is that old toolchains don't know about
vcmpequd. To fix that we'll need to add a macro that uses .long to
construct the instruction.

cheers
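For completeness, the .long workaround Michael refers to encodes the instruction word by hand so that old assemblers never see the mnemonic. A minimal sketch, assuming the ISA VC-form layout of vcmpequd. (primary opcode 4, extended opcode 199, Rc=1, giving base word 0x100004c7); the helper names are illustrative, not the kernel's actual ppc-opcode.h definitions:

	/* illustrative field helpers (hypothetical names) */
	#define ILLUS_VRT(t)	((t) << 21)	/* VRT occupies bits 6-10  */
	#define ILLUS_VRA(a)	((a) << 16)	/* VRA occupies bits 11-15 */
	#define ILLUS_VRB(b)	((b) << 11)	/* VRB occupies bits 16-20 */

	/* vcmpequd. vrt,vra,vrb as a raw instruction word */
	#define VCMPEQUD_RC(t, a, b) \
		.long (0x100004c7 | ILLUS_VRT(t) | ILLUS_VRA(a) | ILLUS_VRB(b))

	/* usage with numeric register operands: */
	VCMPEQUD_RC(0, 0, 1)		/* same encoding as: vcmpequd. v0,v0,v1 */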
Hi Segher,

On Mon, May 28, 2018 at 06:05:59AM -0500, Segher Boessenkool wrote:
> On Fri, May 25, 2018 at 12:07:34PM +0800, wei.guo.simon@gmail.com wrote:
> > +	/* save and restore cr0 */
> > +	mfocrf	r5,64
> > +	EXIT_VMX_OPS
> > +	mtocrf	64,r5
> > +	b	.LcmpAB_lightweight
>
> That's cr1, not cr0.  You can use mcrf instead, it is cheaper (esp. if
> you have it in a non-volatile CR field before, so you need only one, if any).

You are right :) How about using mtcr/mfcr instead? I think they are
fast as well and more readable.

> > +	vcmpequb. v7,v9,v10
> > +	bnl	cr6,.Ldiffoffset_vmx_diff_found
>
> In other places you say "bf 24,...".  Dunno which is more readable, but
> please pick one?

I will update to bnl for the other cases.

> Segher

Thanks for your review.

BR,
- Simon
Hi Michael,

On Mon, May 28, 2018 at 09:59:29PM +1000, Michael Ellerman wrote:
> Hi Simon,
>
> wei.guo.simon@gmail.com writes:
> > diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
> > index f20e883..4ba7bb6 100644
> > --- a/arch/powerpc/lib/memcmp_64.S
> > +++ b/arch/powerpc/lib/memcmp_64.S
> > @@ -174,6 +235,13 @@ _GLOBAL(memcmp)
> >  	blr
> >  
> >  .Llong:
> > +#ifdef CONFIG_ALTIVEC
> > +	/* Try to use vmx loop if length is equal or greater than 4K */
> > +	cmpldi	cr6,r5,VMX_THRESH
> > +	bge	cr6,.Lsameoffset_vmx_cmp
> > +
>
> Here we decide to use vmx, but we don't do any CPU feature checks.
>
> > @@ -332,7 +400,94 @@ _GLOBAL(memcmp)
> >  8:
> >  	blr
> >  
> > +#ifdef CONFIG_ALTIVEC
> > +.Lsameoffset_vmx_cmp:
> > +	/* Enter with src/dst addrs has the same offset with 8 bytes
> > +	 * align boundary
> > +	 */
> > +	ENTER_VMX_OPS
> > +	beq	cr1,.Llong_novmx_cmp
> > +
> > +3:
> > +	/* need to check whether r4 has the same offset with r3
> > +	 * for 16 bytes boundary.
> > +	 */
> > +	xor	r0,r3,r4
> > +	andi.	r0,r0,0xf
> > +	bne	.Ldiffoffset_vmx_cmp_start
> > +
> > +	/* len is no less than 4KB. Need to align with 16 bytes further.
> > +	 */
> > +	andi.	rA,r3,8
> > +	LD	rA,0,r3
> > +	beq	4f
> > +	LD	rB,0,r4
> > +	cmpld	cr0,rA,rB
> > +	addi	r3,r3,8
> > +	addi	r4,r4,8
> > +	addi	r5,r5,-8
> > +
> > +	beq	cr0,4f
> > +	/* save and restore cr0 */
> > +	mfocrf	r5,64
> > +	EXIT_VMX_OPS
> > +	mtocrf	64,r5
> > +	b	.LcmpAB_lightweight
> > +
> > +4:
> > +	/* compare 32 bytes for each loop */
> > +	srdi	r0,r5,5
> > +	mtctr	r0
> > +	clrldi	r5,r5,59
> > +	li	off16,16
> > +
> > +.balign 16
> > +5:
> > +	lvx	v0,0,r3
> > +	lvx	v1,0,r4
> > +	vcmpequd. v0,v0,v1
>
> vcmpequd is only available on Power8 and later CPUs.
>
> Which means this will crash on Power7 or earlier.
>
> Something like this should fix it I think.
>
> diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
> index 96eb08b2be2e..0a11ff14dcd9 100644
> --- a/arch/powerpc/lib/memcmp_64.S
> +++ b/arch/powerpc/lib/memcmp_64.S
> @@ -236,9 +236,11 @@ _GLOBAL(memcmp)
>  
>  .Llong:
>  #ifdef CONFIG_ALTIVEC
> +BEGIN_FTR_SECTION
>  	/* Try to use vmx loop if length is equal or greater than 4K */
>  	cmpldi	cr6,r5,VMX_THRESH
>  	bge	cr6,.Lsameoffset_vmx_cmp
> +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
>  
>  .Llong_novmx_cmp:
>  #endif

Thanks for the good catch! I will update that.

> There's another problem which is that old toolchains don't know about
> vcmpequd. To fix that we'll need to add a macro that uses .long to
> construct the instruction.

Right. I will add the corresponding macros.

Thanks for your review.

BR,
- Simon
On Wed, May 30, 2018 at 04:14:02PM +0800, Simon Guo wrote:
> Hi Segher,
> On Mon, May 28, 2018 at 06:05:59AM -0500, Segher Boessenkool wrote:
> > On Fri, May 25, 2018 at 12:07:34PM +0800, wei.guo.simon@gmail.com wrote:
> > > +	/* save and restore cr0 */
> > > +	mfocrf	r5,64
> > > +	EXIT_VMX_OPS
> > > +	mtocrf	64,r5
> > > +	b	.LcmpAB_lightweight
> >
> > That's cr1, not cr0.  You can use mcrf instead, it is cheaper (esp. if
> > you have it in a non-volatile CR field before, so you need only one, if any).
> >
> You are right :) How about using mtcr/mfcr instead? I think they are
> fast as well and more readable.

Those are much worse than m[ft]ocrf.

You probably should just shuffle things around so that EXIT_VMX_OPS
does not clobber the CR field you need to keep.


Segher
On Wed, May 30, 2018 at 03:35:40AM -0500, Segher Boessenkool wrote:
> On Wed, May 30, 2018 at 04:14:02PM +0800, Simon Guo wrote:
> > Hi Segher,
> > On Mon, May 28, 2018 at 06:05:59AM -0500, Segher Boessenkool wrote:
> > > On Fri, May 25, 2018 at 12:07:34PM +0800, wei.guo.simon@gmail.com wrote:
> > > > +	/* save and restore cr0 */
> > > > +	mfocrf	r5,64
> > > > +	EXIT_VMX_OPS
> > > > +	mtocrf	64,r5
> > > > +	b	.LcmpAB_lightweight
> > >
> > > That's cr1, not cr0.  You can use mcrf instead, it is cheaper (esp. if
> > > you have it in a non-volatile CR field before, so you need only one, if any).
> > >
> > You are right :) How about using mtcr/mfcr instead? I think they are
> > fast as well and more readable.
>
> Those are much worse than m[ft]ocrf.
>
> You probably should just shuffle things around so that EXIT_VMX_OPS
> does not clobber the CR field you need to keep.

Let me use mcrf then :)

Thanks,
- Simon
Hi Segher,

On Wed, May 30, 2018 at 05:03:21PM +0800, Simon Guo wrote:
> On Wed, May 30, 2018 at 03:35:40AM -0500, Segher Boessenkool wrote:
> > On Wed, May 30, 2018 at 04:14:02PM +0800, Simon Guo wrote:
> > > Hi Segher,
> > > On Mon, May 28, 2018 at 06:05:59AM -0500, Segher Boessenkool wrote:
> > > > On Fri, May 25, 2018 at 12:07:34PM +0800, wei.guo.simon@gmail.com wrote:
> > > > > +	/* save and restore cr0 */
> > > > > +	mfocrf	r5,64
> > > > > +	EXIT_VMX_OPS
> > > > > +	mtocrf	64,r5
> > > > > +	b	.LcmpAB_lightweight
> > > >
> > > > That's cr1, not cr0.  You can use mcrf instead, it is cheaper (esp. if
> > > > you have it in a non-volatile CR field before, so you need only one, if any).
> > > >
> > > You are right :) How about using mtcr/mfcr instead? I think they are
> > > fast as well and more readable.
> >
> > Those are much worse than m[ft]ocrf.
> >
> > You probably should just shuffle things around so that EXIT_VMX_OPS
> > does not clobber the CR field you need to keep.
> Let me use mcrf then :)

I now feel uncomfortable using mcrf like:
	mcrf	7,0

since I cannot be 100% confident that the compiler will not use CR7 or
another CR field in exit_vmx_ops(). Can we switch back to mfocrf/mtocrf
with the correct mask for CR0?
	mfocrf	r5,128
	...
	mtocrf	128,r5

Thanks,
- Simon
On Wed, Jun 06, 2018 at 02:42:27PM +0800, Simon Guo wrote:
> I now feel uncomfortable using mcrf like:
> 	mcrf	7,0
>
> since I cannot be 100% confident that the compiler will not use CR7 or
> another CR field in exit_vmx_ops().

It wasn't clear to me that this macro boils down to a function call.

You can use CR2, CR3, or CR4, but then you'll need to save and restore
those at the start and end of the function, which is just as nasty.
Better is to restructure some code so you don't need that CR field
there anymore.

> Can we switch back to mfocrf/mtocrf with the correct mask for CR0?
> 	mfocrf	r5,128
> 	...
> 	mtocrf	128,r5

Sure, I'm not your boss ;-)

It seems a shame to have this 12-or-whatever-cycle delay here, since the
whole point of the patch is to make things faster, that's all (but it is
still faster, right? You tested it).


Segher
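Putting the thread's conclusion together: FXM mask 128 selects cr0 (the mask 64 in the posted patch selects cr1, which was the original bug), so the sequence Simon proposes would presumably appear in the next revision as:

	beq	cr0,4f
	/* save and restore cr0 across the call (FXM mask 128 = cr0) */
	mfocrf	r5,128
	EXIT_VMX_OPS
	mtocrf	128,r5
	b	.LcmpAB_lightweight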
diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h
index d9713ad..31fdcee 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -49,8 +49,8 @@ void __trace_hcall_exit(long opcode, unsigned long retval,
 /* VMX copying */
 int enter_vmx_usercopy(void);
 int exit_vmx_usercopy(void);
-int enter_vmx_copy(void);
-void * exit_vmx_copy(void *dest);
+int enter_vmx_ops(void);
+void *exit_vmx_ops(void *dest);
 
 /* Traps */
 long machine_check_early(struct pt_regs *regs);
diff --git a/arch/powerpc/lib/copypage_power7.S b/arch/powerpc/lib/copypage_power7.S
index 8fa73b7..e38f956 100644
--- a/arch/powerpc/lib/copypage_power7.S
+++ b/arch/powerpc/lib/copypage_power7.S
@@ -57,7 +57,7 @@ _GLOBAL(copypage_power7)
 	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
 	std	r0,16(r1)
 	stdu	r1,-STACKFRAMESIZE(r1)
-	bl	enter_vmx_copy
+	bl	enter_vmx_ops
 	cmpwi	r3,0
 	ld	r0,STACKFRAMESIZE+16(r1)
 	ld	r3,STK_REG(R31)(r1)
@@ -100,7 +100,7 @@ _GLOBAL(copypage_power7)
 	addi	r3,r3,128
 	bdnz	1b
 
-	b	exit_vmx_copy		/* tail call optimise */
+	b	exit_vmx_ops		/* tail call optimise */
 
 #else
 	li	r0,(PAGE_SIZE/128)
diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index f20e883..4ba7bb6 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -27,12 +27,73 @@
 #define LH	lhbrx
 #define LW	lwbrx
 #define LD	ldbrx
+#define LVS	lvsr
+#define VPERM(_VRT,_VRA,_VRB,_VRC) \
+	vperm _VRT,_VRB,_VRA,_VRC
 #else
 #define LH	lhzx
 #define LW	lwzx
 #define LD	ldx
+#define LVS	lvsl
+#define VPERM(_VRT,_VRA,_VRB,_VRC) \
+	vperm _VRT,_VRA,_VRB,_VRC
 #endif
 
+#define VMX_THRESH 4096
+#define ENTER_VMX_OPS	\
+	mflr	r0;	\
+	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
+	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
+	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
+	std	r0,16(r1); \
+	stdu	r1,-STACKFRAMESIZE(r1); \
+	bl	enter_vmx_ops; \
+	cmpwi	cr1,r3,0; \
+	ld	r0,STACKFRAMESIZE+16(r1); \
+	ld	r3,STK_REG(R31)(r1); \
+	ld	r4,STK_REG(R30)(r1); \
+	ld	r5,STK_REG(R29)(r1); \
+	addi	r1,r1,STACKFRAMESIZE; \
+	mtlr	r0
+
+#define EXIT_VMX_OPS \
+	mflr	r0; \
+	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
+	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
+	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
+	std	r0,16(r1); \
+	stdu	r1,-STACKFRAMESIZE(r1); \
+	bl	exit_vmx_ops; \
+	ld	r0,STACKFRAMESIZE+16(r1); \
+	ld	r3,STK_REG(R31)(r1); \
+	ld	r4,STK_REG(R30)(r1); \
+	ld	r5,STK_REG(R29)(r1); \
+	addi	r1,r1,STACKFRAMESIZE; \
+	mtlr	r0
+
+/*
+ * LD_VSR_CROSS16B load the 2nd 16 bytes for _vaddr which is unaligned with
+ * 16 bytes boundary and permute the result with the 1st 16 bytes.
+
+ * |  y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
+ * ^                                  ^                                 ^
+ * 0xbbbb10                           0xbbbb20                          0xbbb30
+ *                                ^
+ *                                _vaddr
+ *
+ *
+ * _vmask is the mask generated by LVS
+ * _v1st_qw is the 1st aligned QW of current addr which is already loaded.
+ *   for example: 0xyyyyyyyyyyyyy012 for big endian
+ * _v2nd_qw is the 2nd aligned QW of cur _vaddr to be loaded.
+ *   for example: 0x3456789abcdefzzz for big endian
+ * The permute result is saved in _v_res.
+ *   for example: 0x0123456789abcdef for big endian.
+ */
+#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
+	lvx	_v2nd_qw,_vaddr,off16; \
+	VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)
+
 /*
  * There are 2 categories for memcmp:
  * 1) src/dst has the same offset to the 8 bytes boundary. The handlers
@@ -133,7 +194,7 @@ _GLOBAL(memcmp)
 	bgt	cr6,.Llong
 
 .Lcmp_lt32bytes:
-	/* compare 1 ~ 32 bytes, at least r3 addr is 8 bytes aligned now */
+	/* compare 1 ~ 31 bytes, at least r3 addr is 8 bytes aligned now */
 	cmpdi	cr5,r5,7
 	srdi	r0,r5,3
 	ble	cr5,.Lcmp_rest_lt8bytes
@@ -174,6 +235,13 @@ _GLOBAL(memcmp)
 	blr
 
 .Llong:
+#ifdef CONFIG_ALTIVEC
+	/* Try to use vmx loop if length is equal or greater than 4K */
+	cmpldi	cr6,r5,VMX_THRESH
+	bge	cr6,.Lsameoffset_vmx_cmp
+
+.Llong_novmx_cmp:
+#endif
 	/* At least s1 addr is aligned with 8 bytes */
 	li	off8,8
 	li	off16,16
@@ -332,7 +400,94 @@ _GLOBAL(memcmp)
 8:
 	blr
 
+#ifdef CONFIG_ALTIVEC
+.Lsameoffset_vmx_cmp:
+	/* Enter with src/dst addrs has the same offset with 8 bytes
+	 * align boundary
+	 */
+	ENTER_VMX_OPS
+	beq	cr1,.Llong_novmx_cmp
+
+3:
+	/* need to check whether r4 has the same offset with r3
+	 * for 16 bytes boundary.
+	 */
+	xor	r0,r3,r4
+	andi.	r0,r0,0xf
+	bne	.Ldiffoffset_vmx_cmp_start
+
+	/* len is no less than 4KB. Need to align with 16 bytes further.
+	 */
+	andi.	rA,r3,8
+	LD	rA,0,r3
+	beq	4f
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	addi	r3,r3,8
+	addi	r4,r4,8
+	addi	r5,r5,-8
+
+	beq	cr0,4f
+	/* save and restore cr0 */
+	mfocrf	r5,64
+	EXIT_VMX_OPS
+	mtocrf	64,r5
+	b	.LcmpAB_lightweight
+
+4:
+	/* compare 32 bytes for each loop */
+	srdi	r0,r5,5
+	mtctr	r0
+	clrldi	r5,r5,59
+	li	off16,16
+
+.balign 16
+5:
+	lvx	v0,0,r3
+	lvx	v1,0,r4
+	vcmpequd. v0,v0,v1
+	bf	24,7f
+	lvx	v0,off16,r3
+	lvx	v1,off16,r4
+	vcmpequd. v0,v0,v1
+	bf	24,6f
+	addi	r3,r3,32
+	addi	r4,r4,32
+	bdnz	5b
+
+	EXIT_VMX_OPS
+	cmpdi	r5,0
+	beq	.Lzero
+	b	.Lcmp_lt32bytes
+
+6:
+	addi	r3,r3,16
+	addi	r4,r4,16
+
+7:
+	/* diff the last 16 bytes */
+	EXIT_VMX_OPS
+	LD	rA,0,r3
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	li	off8,8
+	bne	cr0,.LcmpAB_lightweight
+
+	LD	rA,off8,r3
+	LD	rB,off8,r4
+	cmpld	cr0,rA,rB
+	bne	cr0,.LcmpAB_lightweight
+	b	.Lzero
+#endif
+
 .Ldiffoffset_8bytes_make_align_start:
+#ifdef CONFIG_ALTIVEC
+	/* only do vmx ops when the size equal or greater than 4K bytes */
+	cmpdi	cr5,r5,VMX_THRESH
+	bge	cr5,.Ldiffoffset_vmx_cmp
+.Ldiffoffset_novmx_cmp:
+#endif
+
 	/* now try to align s1 with 8 bytes */
 	andi.	r6,r3,0x7
 	rlwinm	r6,r6,3,0,28
@@ -359,6 +514,82 @@ _GLOBAL(memcmp)
 	/* now s1 is aligned with 8 bytes. */
 	cmpdi	cr5,r5,31
 	ble	cr5,.Lcmp_lt32bytes
+
+#ifdef CONFIG_ALTIVEC
+	b	.Llong_novmx_cmp
+#else
 	b	.Llong
+#endif
+
+#ifdef CONFIG_ALTIVEC
+.Ldiffoffset_vmx_cmp:
+	ENTER_VMX_OPS
+	beq	cr1,.Ldiffoffset_novmx_cmp
+
+.Ldiffoffset_vmx_cmp_start:
+	/* Firstly try to align r3 with 16 bytes */
+	andi.	r6,r3,0xf
+	li	off16,16
+	beq	.Ldiffoffset_vmx_s1_16bytes_align
+	LVS	v3,0,r3
+	LVS	v4,0,r4
+
+	lvx	v5,0,r3
+	lvx	v6,0,r4
+	LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
+	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
+
+	vcmpequb. v7,v9,v10
+	bnl	cr6,.Ldiffoffset_vmx_diff_found
+
+	subfic	r6,r6,16
+	subf	r5,r6,r5
+	add	r3,r3,r6
+	add	r4,r4,r6
+
+.Ldiffoffset_vmx_s1_16bytes_align:
+	/* now s1 is aligned with 16 bytes */
+	lvx	v6,0,r4
+	LVS	v4,0,r4
+	srdi	r6,r5,5  /* loop for 32 bytes each */
+	clrldi	r5,r5,59
+	mtctr	r6
+
+.balign 16
+.Ldiffoffset_vmx_32bytesloop:
+	/* the first qw of r4 was saved in v6 */
+	lvx	v9,0,r3
+	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
+	vcmpequb. v7,v9,v10
+	vor	v6,v8,v8
+	bnl	cr6,.Ldiffoffset_vmx_diff_found
+
+	addi	r3,r3,16
+	addi	r4,r4,16
+
+	lvx	v9,0,r3
+	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
+	vcmpequb. v7,v9,v10
+	vor	v6,v8,v8
+	bnl	cr6,.Ldiffoffset_vmx_diff_found
+
+	addi	r3,r3,16
+	addi	r4,r4,16
+
+	bdnz	.Ldiffoffset_vmx_32bytesloop
+
+	EXIT_VMX_OPS
+
+	cmpdi	r5,0
+	beq	.Lzero
+	b	.Lcmp_lt32bytes
+
+.Ldiffoffset_vmx_diff_found:
+	EXIT_VMX_OPS
+	/* anyway, the diff will appear in next 16 bytes */
+	li	r5,16
+	b	.Lcmp_lt32bytes
+
+#endif
 
 EXPORT_SYMBOL(memcmp)
diff --git a/arch/powerpc/lib/memcpy_power7.S b/arch/powerpc/lib/memcpy_power7.S
index df7de9d..070cdf6 100644
--- a/arch/powerpc/lib/memcpy_power7.S
+++ b/arch/powerpc/lib/memcpy_power7.S
@@ -230,7 +230,7 @@ _GLOBAL(memcpy_power7)
 	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
 	std	r0,16(r1)
 	stdu	r1,-STACKFRAMESIZE(r1)
-	bl	enter_vmx_copy
+	bl	enter_vmx_ops
 	cmpwi	cr1,r3,0
 	ld	r0,STACKFRAMESIZE+16(r1)
 	ld	r3,STK_REG(R31)(r1)
@@ -445,7 +445,7 @@ _GLOBAL(memcpy_power7)
 
 15:	addi	r1,r1,STACKFRAMESIZE
 	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
-	b	exit_vmx_copy		/* tail call optimise */
+	b	exit_vmx_ops		/* tail call optimise */
 
 .Lvmx_unaligned_copy:
 	/* Get the destination 16B aligned */
@@ -649,5 +649,5 @@ _GLOBAL(memcpy_power7)
 
 15:	addi	r1,r1,STACKFRAMESIZE
 	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
-	b	exit_vmx_copy		/* tail call optimise */
+	b	exit_vmx_ops		/* tail call optimise */
 #endif /* CONFIG_ALTIVEC */
diff --git a/arch/powerpc/lib/vmx-helper.c b/arch/powerpc/lib/vmx-helper.c
index bf925cd..9f34049 100644
--- a/arch/powerpc/lib/vmx-helper.c
+++ b/arch/powerpc/lib/vmx-helper.c
@@ -53,7 +53,7 @@ int exit_vmx_usercopy(void)
 	return 0;
 }
 
-int enter_vmx_copy(void)
+int enter_vmx_ops(void)
 {
 	if (in_interrupt())
 		return 0;
@@ -70,7 +70,7 @@ int enter_vmx_copy(void)
  * passed a pointer to the destination which we return as required by a
  * memcpy implementation.
  */
-void *exit_vmx_copy(void *dest)
+void *exit_vmx_ops(void *dest)
 {
 	disable_kernel_altivec();
 	preempt_enable();
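As a reading aid, the ENTER_VMX_OPS / enter_vmx_ops handshake above amounts to the following C-level control flow. This is a sketch with hypothetical helper names (vmx_loop, scalar_loop), not code from the patch:

#include <stddef.h>

/* prototypes from the patch (vmx-helper.c) */
int enter_vmx_ops(void);
void *exit_vmx_ops(void *dest);

/* hypothetical stand-ins for the asm compare loops */
int vmx_loop(const void *s1, const void *s2, size_t n);
int scalar_loop(const void *s1, const void *s2, size_t n);

int memcmp_long_sketch(const void *s1, const void *s2, size_t n)
{
	/* enter_vmx_ops() returns 0 when VMX may not be used (e.g. from
	 * in_interrupt()); ENTER_VMX_OPS latches that result in cr1 and
	 * the asm falls back to the scalar .Llong_novmx_cmp path. */
	if (n >= 4096 && enter_vmx_ops()) {	/* 4096 == VMX_THRESH */
		int rc = vmx_loop(s1, s2, n);
		exit_vmx_ops(NULL);	/* disable_kernel_altivec(); preempt_enable(); */
		return rc;
	}
	return scalar_loop(s1, s2, n);
}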