Message ID | 20170320234046.32718-1-anton@ozlabs.org (mailing list archive) |
---|---|
State | Changes Requested |
Headers | show |
On Tue, 21 Mar 2017 10:40:46 +1100 Anton Blanchard <anton@ozlabs.org> wrote: > From: Anton Blanchard <anton@samba.org> > > Add a POWER9 optimised copy_page() loop. This loop uses the new D form > vector loads and stores, and uses dcbz to pre zero the destination. > > A few questions: > > - I'm using a nested feature section, but that is going to get unwieldy > at some stage. It would be nice to update the call site for copy_page > directly. I've got a patch that makes alternate feature patching a bit more flexible and not hit relocation limits when using big "else" parts. I was thinking of doing something like _GLOBAL_TOC(copy_page) BEGIN_FTR_SECTION_NESTED(50) #include "copypage_power9.S" FTR_SECTION_ELSE_NESTED(50) #include "copypage_power7.S" ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_300, 50) Patching callers directly is another option though. I'll bug mpe about it again when he's least expecting it. > - I'm using CPU_FTR_ARCH_300, but as our functions grow perhaps we want > the cputable entry to contain a pointer to optimised functions. We might be able to do some nested alternatives macros to hide the details and allow an IFSET / ELSEIFSET / etc / ELSE. 
> > Signed-off-by: Anton Blanchard <anton@samba.org> > --- > arch/powerpc/lib/Makefile | 2 +- > arch/powerpc/lib/copypage_64.S | 4 + > arch/powerpc/lib/copypage_power9.S | 224 +++++++++++++++++++++++++++++++++++++ > 3 files changed, 229 insertions(+), 1 deletion(-) > create mode 100644 arch/powerpc/lib/copypage_power9.S > > diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile > index 2b5e090..d3667b5 100644 > --- a/arch/powerpc/lib/Makefile > +++ b/arch/powerpc/lib/Makefile > @@ -16,7 +16,7 @@ obj-$(CONFIG_PPC32) += div64.o copy_32.o > > obj64-y += copypage_64.o copyuser_64.o usercopy_64.o mem_64.o hweight_64.o \ > copyuser_power7.o string_64.o copypage_power7.o memcpy_power7.o \ > - memcpy_64.o memcmp_64.o > + memcpy_64.o memcmp_64.o copypage_power9.o > > obj64-$(CONFIG_SMP) += locks.o > obj64-$(CONFIG_ALTIVEC) += vmx-helper.o > diff --git a/arch/powerpc/lib/copypage_64.S b/arch/powerpc/lib/copypage_64.S > index 4bcc9e7..051423e 100644 > --- a/arch/powerpc/lib/copypage_64.S > +++ b/arch/powerpc/lib/copypage_64.S > @@ -21,7 +21,11 @@ _GLOBAL_TOC(copy_page) > BEGIN_FTR_SECTION > lis r5,PAGE_SIZE@h > FTR_SECTION_ELSE > + BEGIN_FTR_SECTION_NESTED(50) > + b copypage_power9 > + FTR_SECTION_ELSE_NESTED(50) > b copypage_power7 > + ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_300, 50) > ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY) > ori r5,r5,PAGE_SIZE@l > BEGIN_FTR_SECTION > diff --git a/arch/powerpc/lib/copypage_power9.S b/arch/powerpc/lib/copypage_power9.S > new file mode 100644 > index 0000000..2493f94 > --- /dev/null > +++ b/arch/powerpc/lib/copypage_power9.S > @@ -0,0 +1,224 @@ > +/* > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License as published by > + * the Free Software Foundation; either version 2 of the License, or > + * (at your option) any later version. 
> + * > + * This program is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + * GNU General Public License for more details. > + * > + * You should have received a copy of the GNU General Public License > + * along with this program; if not, write to the Free Software > + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. > + * > + * Copyright (C) IBM Corporation, 2017 > + * > + * Author: Anton Blanchard <anton@au.ibm.com> > + */ > +#include <asm/page.h> > +#include <asm/ppc_asm.h> > + > +_GLOBAL(copypage_power9) > + /* > + * We prefetch the source using enhanced touch instructions. We use > + * a stream ID of 0 for this. Since the source is page aligned we > + * don't need to clear the bottom 7 bits of the address. > + */ > +#ifdef CONFIG_PPC_64K_PAGES > + lis r7,0x0E01 /* depth=7 > + * units/cachelines=512 */ > +#else > + lis r7,0x0E00 /* depth=7 */ > + ori r7,r7,0x1000 /* units/cachelines=32 */ > +#endif > + > + lis r8,0x8000 /* GO=1 */ > + clrldi r8,r8,32 > + > +.machine push > +.machine "power4" > + /* setup read stream 0 */ > + dcbt r0,r4,0b01000 /* addr from */ > + dcbt r0,r7,0b01010 /* length and depth from */ > + eieio > + dcbt r0,r8,0b01010 /* all streams GO */ > + eieio > +.machine pop I guess POWER asm doesn't need this but it's good practice to prevent copy paste errors? It would be nice to have some macros to hide all these constants, but that's for another patch. The commenting is good. I don't suppose the stream setup is costly enough to consider touching a cacheline or two ahead before starting it? > + > + /* > + * To reduce memory bandwidth on the store side we send dcbzs ahead. > + * Experimental testing shows 2 cachelines as good enough. 
> + */ > + li r6,128 > + dcbz 0,r3 > + dcbz r6,r3 > + > +#ifdef CONFIG_ALTIVEC > + mflr r0 > + std r3,-STACKFRAMESIZE+STK_REG(R31)(r1) > + std r4,-STACKFRAMESIZE+STK_REG(R30)(r1) > + std r0,16(r1) > + stdu r1,-STACKFRAMESIZE(r1) > + bl enter_vmx_copy > + cmpwi r3,0 > + ld r0,STACKFRAMESIZE+16(r1) > + ld r3,STK_REG(R31)(r1) > + ld r4,STK_REG(R30)(r1) > + addi r1,r1,STACKFRAMESIZE > + mtlr r0 (Also for another day) We might be able to avoid the stack and call for some common cases. Pretty small overcall cost I guess, but it could be beneficial for memcpy if not copy_page. Thanks, Nick
Hi Nick, > I've got a patch that makes alternate feature patching a bit > more flexible and not hit relocation limits when using big "else" > parts. I was thinking of doing something like > > _GLOBAL_TOC(copy_page) > BEGIN_FTR_SECTION_NESTED(50) > #include "copypage_power9.S" > FTR_SECTION_ELSE_NESTED(50) > #include "copypage_power7.S" > ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_300, 50) Good idea, I hadn't thought of embedding it all in a feature section. > I guess POWER asm doesn't need this but it's good practice to prevent > copy paste errors? It would be nice to have some macros to hide all > these constants, but that's for another patch. The commenting is good. The .machine X macros? Unfortunately the format of dcbt is different for recent server chips. This wasn't a great idea in retrospect because if you do get the instruction layout wrong, you wont get a fault to warn you. > I don't suppose the stream setup is costly enough to consider > touching a cacheline or two ahead before starting it? Starting up software streams is a bit of an art - if the demand loads get ahead then a hardware stream gets started before the software one. Note all the eieios to try and avoid this happening. I've struggled with software prefetch on previous chips and sometimes I wonder if it is worth the pain. > (Also for another day) We might be able to avoid the stack and call > for some common cases. Pretty small overcall cost I guess, but it > could be beneficial for memcpy if not copy_page. Definitely. Also the breakpoint for using vector should be much lower if we have already saved the user state in a previous call. Anton
On Tue, 21 Mar 2017 15:01:03 +1100 Anton Blanchard <anton@samba.org> wrote: > Hi Nick, > > > I've got a patch that makes alternate feature patching a bit > > more flexible and not hit relocation limits when using big "else" > > parts. I was thinking of doing something like > > > > _GLOBAL_TOC(copy_page) > > BEGIN_FTR_SECTION_NESTED(50) > > #include "copypage_power9.S" > > FTR_SECTION_ELSE_NESTED(50) > > #include "copypage_power7.S" > > ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_300, 50) > > Good idea, I hadn't thought of embedding it all in a feature section. It may not work currently because you get those ftr_alt_97 relocation errors with the "else" parts because relative branches to other code need to be direct and I think reachable from both places. > > I guess POWER asm doesn't need this but it's good practice to prevent > > copy paste errors? It would be nice to have some macros to hide all > > these constants, but that's for another patch. The commenting is good. > > The .machine X macros? Unfortunately the format of dcbt is different > for recent server chips. This wasn't a great idea in retrospect because > if you do get the instruction layout wrong, you wont get a fault to warn > you. Is that embedded vs server, or pre-POWER4 vs POWER4 and up? Anyway no big deal. > > I don't suppose the stream setup is costly enough to consider > > touching a cacheline or two ahead before starting it? > > Starting up software streams is a bit of an art - if the demand loads > get ahead then a hardware stream gets started before the software one. > Note all the eieios to try and avoid this happening. > > I've struggled with software prefetch on previous chips and sometimes I > wonder if it is worth the pain. Oh I see. Makes sense. > > (Also for another day) We might be able to avoid the stack and call > > for some common cases. Pretty small overcall cost I guess, but it > > could be beneficial for memcpy if not copy_page. > > Definitely. 
Also the breakpoint for using vector should be much > lower if we have already saved the user state in a previous call. Yes agreed. Another problem is that multiple small mem/string/crypto operations may never trip the limit even if it would make sense. Difficult to improve that (kernel could provide a hint to the arch maybe).
Hi Nick, > > Good idea, I hadn't thought of embedding it all in a feature > > section. > > It may not work currently because you get those ftr_alt_97 relocation > errors with the "else" parts because relative branches to other code > need to be direct and I think reachable from both places. I thought about this a bit more. One potential issue will be profiling - perf annotate will match the samples against the unpatched code which could be very confusing. Anton
On Mon, 2017-04-03 at 10:54 +1000, Anton Blanchard wrote: > > > Good idea, I hadn't thought of embedding it all in a feature > > > section. > > > > It may not work currently because you get those ftr_alt_97 relocation > > errors with the "else" parts because relative branches to other code > > need to be direct and I think reachable from both places. > > I thought about this a bit more. One potential issue will be > profiling - perf annotate will match the samples against the unpatched > code which could be very confusing. Could we make all those functions a dynamic-linker style stub ? IE, they "find" the right target function and call a helper to patch the calling site to call directly into the right one on the first call. Cheers, Ben.
Anton Blanchard <anton@ozlabs.org> writes: > From: Anton Blanchard <anton@samba.org> > > Add a POWER9 optimised copy_page() loop. This loop uses the new D form > vector loads and stores, and uses dcbz to pre zero the destination. > ... > + > +#ifdef CONFIG_ALTIVEC > + mflr r0 > + std r3,-STACKFRAMESIZE+STK_REG(R31)(r1) > + std r4,-STACKFRAMESIZE+STK_REG(R30)(r1) > + std r0,16(r1) > + stdu r1,-STACKFRAMESIZE(r1) > + bl enter_vmx_copy > + cmpwi r3,0 > + ld r0,STACKFRAMESIZE+16(r1) > + ld r3,STK_REG(R31)(r1) > + ld r4,STK_REG(R30)(r1) > + addi r1,r1,STACKFRAMESIZE > + mtlr r0 > + > + li r0,((PAGE_SIZE/128)-2) > + mtctr r0 > + > + li r8,256 > + > + beq .Lnonvmx_copy > + > + .balign 16 > +1: dcbz r8,r3 > + lxv vs32,0(r4) > + lxv vs33,16(r4) Unfortunately this doesn't build: arch/powerpc/lib/copypage_power9.S: Assembler messages: arch/powerpc/lib/copypage_power9.S:66: Error: unrecognized opcode: `lxv' arch/powerpc/lib/copypage_power9.S:67: Error: unrecognized opcode: `lxv' arch/powerpc/lib/copypage_power9.S:68: Error: unrecognized opcode: `stxv' arch/powerpc/lib/copypage_power9.S:69: Error: unrecognized opcode: `stxv' Presumably we need to add a .long macro version for older toolchains? cheers
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index 2b5e090..d3667b5 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -16,7 +16,7 @@ obj-$(CONFIG_PPC32) += div64.o copy_32.o obj64-y += copypage_64.o copyuser_64.o usercopy_64.o mem_64.o hweight_64.o \ copyuser_power7.o string_64.o copypage_power7.o memcpy_power7.o \ - memcpy_64.o memcmp_64.o + memcpy_64.o memcmp_64.o copypage_power9.o obj64-$(CONFIG_SMP) += locks.o obj64-$(CONFIG_ALTIVEC) += vmx-helper.o diff --git a/arch/powerpc/lib/copypage_64.S b/arch/powerpc/lib/copypage_64.S index 4bcc9e7..051423e 100644 --- a/arch/powerpc/lib/copypage_64.S +++ b/arch/powerpc/lib/copypage_64.S @@ -21,7 +21,11 @@ _GLOBAL_TOC(copy_page) BEGIN_FTR_SECTION lis r5,PAGE_SIZE@h FTR_SECTION_ELSE + BEGIN_FTR_SECTION_NESTED(50) + b copypage_power9 + FTR_SECTION_ELSE_NESTED(50) b copypage_power7 + ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_300, 50) ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY) ori r5,r5,PAGE_SIZE@l BEGIN_FTR_SECTION diff --git a/arch/powerpc/lib/copypage_power9.S b/arch/powerpc/lib/copypage_power9.S new file mode 100644 index 0000000..2493f94 --- /dev/null +++ b/arch/powerpc/lib/copypage_power9.S @@ -0,0 +1,224 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright (C) IBM Corporation, 2017 + * + * Author: Anton Blanchard <anton@au.ibm.com> + */ +#include <asm/page.h> +#include <asm/ppc_asm.h> + +_GLOBAL(copypage_power9) + /* + * We prefetch the source using enhanced touch instructions. We use + * a stream ID of 0 for this. Since the source is page aligned we + * don't need to clear the bottom 7 bits of the address. + */ +#ifdef CONFIG_PPC_64K_PAGES + lis r7,0x0E01 /* depth=7 + * units/cachelines=512 */ +#else + lis r7,0x0E00 /* depth=7 */ + ori r7,r7,0x1000 /* units/cachelines=32 */ +#endif + + lis r8,0x8000 /* GO=1 */ + clrldi r8,r8,32 + +.machine push +.machine "power4" + /* setup read stream 0 */ + dcbt r0,r4,0b01000 /* addr from */ + dcbt r0,r7,0b01010 /* length and depth from */ + eieio + dcbt r0,r8,0b01010 /* all streams GO */ + eieio +.machine pop + + /* + * To reduce memory bandwidth on the store side we send dcbzs ahead. + * Experimental testing shows 2 cachelines as good enough. + */ + li r6,128 + dcbz 0,r3 + dcbz r6,r3 + +#ifdef CONFIG_ALTIVEC + mflr r0 + std r3,-STACKFRAMESIZE+STK_REG(R31)(r1) + std r4,-STACKFRAMESIZE+STK_REG(R30)(r1) + std r0,16(r1) + stdu r1,-STACKFRAMESIZE(r1) + bl enter_vmx_copy + cmpwi r3,0 + ld r0,STACKFRAMESIZE+16(r1) + ld r3,STK_REG(R31)(r1) + ld r4,STK_REG(R30)(r1) + addi r1,r1,STACKFRAMESIZE + mtlr r0 + + li r0,((PAGE_SIZE/128)-2) + mtctr r0 + + li r8,256 + + beq .Lnonvmx_copy + + .balign 16 +1: dcbz r8,r3 + lxv vs32,0(r4) + lxv vs33,16(r4) + stxv vs32,0(r3) + stxv vs33,16(r3) + + lxv vs34,32(r4) + lxv vs35,48(r4) + stxv vs34,32(r3) + stxv vs35,48(r3) + + lxv vs36,64(r4) + lxv vs37,80(r4) + stxv vs36,64(r3) + stxv vs37,80(r3) + + lxv vs38,96(r4) + lxv vs39,112(r4) + stxv vs38,96(r3) + stxv vs39,112(r3) + + addi r4,r4,128 + addi r3,r3,128 + bdnz 1b + + li r0,2 + mtctr r0 + +1: lxv vs32,0(r4) + lxv vs33,16(r4) + stxv vs32,0(r3) + stxv vs33,16(r3) + + lxv vs34,32(r4) + lxv vs35,48(r4) + stxv vs34,32(r3) + stxv vs35,48(r3) + + lxv vs36,64(r4) + lxv vs37,80(r4) + stxv 
vs36,64(r3) + stxv vs37,80(r3) + + lxv vs38,96(r4) + lxv vs39,112(r4) + stxv vs38,96(r3) + stxv vs39,112(r3) + + addi r4,r4,128 + addi r3,r3,128 + bdnz 1b + + b exit_vmx_copy /* tail call optimise */ +#else + li r0,((PAGE_SIZE/128)-2) + mtctr r0 + + li r8,256 +#endif + + .balign 16 +.Lnonvmx_copy: +1: dcbz r8,r3 + ld r0,0(r4) + ld r5,8(r4) + ld r6,16(r4) + ld r7,24(r4) + std r0,0(r3) + std r5,8(r3) + std r6,16(r3) + std r7,24(r3) + + ld r0,32(r4) + ld r5,40(r4) + ld r6,48(r4) + ld r7,56(r4) + std r0,32(r3) + std r5,40(r3) + std r6,48(r3) + std r7,56(r3) + + ld r0,64(r4) + ld r5,72(r4) + ld r6,80(r4) + ld r7,88(r4) + std r0,64(r3) + std r5,72(r3) + std r6,80(r3) + std r7,88(r3) + + ld r0,96(r4) + ld r5,104(r4) + ld r6,112(r4) + ld r7,120(r4) + addi r4,r4,128 + std r0,96(r3) + std r5,104(r3) + std r6,112(r3) + std r7,120(r3) + addi r3,r3,128 + bdnz 1b + + li r0,2 + mtctr r0 + +1: ld r0,0(r4) + ld r5,8(r4) + ld r6,16(r4) + ld r7,24(r4) + std r0,0(r3) + std r5,8(r3) + std r6,16(r3) + std r7,24(r3) + + ld r0,32(r4) + ld r5,40(r4) + ld r6,48(r4) + ld r7,56(r4) + std r0,32(r3) + std r5,40(r3) + std r6,48(r3) + std r7,56(r3) + + ld r0,64(r4) + ld r5,72(r4) + ld r6,80(r4) + ld r7,88(r4) + std r0,64(r3) + std r5,72(r3) + std r6,80(r3) + std r7,88(r3) + + ld r0,96(r4) + ld r5,104(r4) + ld r6,112(r4) + ld r7,120(r4) + addi r4,r4,128 + std r0,96(r3) + std r5,104(r3) + std r6,112(r3) + std r7,120(r3) + addi r3,r3,128 + bdnz 1b + + blr