Message ID | 1383640732-21449-1-git-send-email-felix@linux.vnet.ibm.com (mailing list archive) |
---|---|
State | Superseded, archived |
Headers | show |
Philippe Bergheaud <felix@linux.vnet.ibm.com> wrote: > Unaligned stores take alignment exceptions on POWER7 running in little-endian. > This is a dumb little-endian base memcpy that prevents unaligned stores. > It is replaced by the VMX memcpy at boot. Is this any faster than the generic version? Mikey > > Signed-off-by: Philippe Bergheaud <felix@linux.vnet.ibm.com> > --- > arch/powerpc/include/asm/string.h | 4 ---- > arch/powerpc/kernel/ppc_ksyms.c | 2 -- > arch/powerpc/lib/Makefile | 2 -- > arch/powerpc/lib/memcpy_64.S | 19 +++++++++++++++++++ > 4 files changed, 19 insertions(+), 8 deletions(-) > > diff --git a/arch/powerpc/include/asm/string.h b/arch/powerpc/include/asm/string.h > index 0dffad6..e40010a 100644 > --- a/arch/powerpc/include/asm/string.h > +++ b/arch/powerpc/include/asm/string.h > @@ -10,9 +10,7 @@ > #define __HAVE_ARCH_STRNCMP > #define __HAVE_ARCH_STRCAT > #define __HAVE_ARCH_MEMSET > -#ifdef __BIG_ENDIAN__ > #define __HAVE_ARCH_MEMCPY > -#endif > #define __HAVE_ARCH_MEMMOVE > #define __HAVE_ARCH_MEMCMP > #define __HAVE_ARCH_MEMCHR > @@ -24,9 +22,7 @@ extern int strcmp(const char *,const char *); > extern int strncmp(const char *, const char *, __kernel_size_t); > extern char * strcat(char *, const char *); > extern void * memset(void *,int,__kernel_size_t); > -#ifdef __BIG_ENDIAN__ > extern void * memcpy(void *,const void *,__kernel_size_t); > -#endif > extern void * memmove(void *,const void *,__kernel_size_t); > extern int memcmp(const void *,const void *,__kernel_size_t); > extern void * memchr(const void *,int,__kernel_size_t); > diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c > index 526ad5c..0c2dd60 100644 > --- a/arch/powerpc/kernel/ppc_ksyms.c > +++ b/arch/powerpc/kernel/ppc_ksyms.c > @@ -147,9 +147,7 @@ EXPORT_SYMBOL(__ucmpdi2); > #endif > long long __bswapdi2(long long); > EXPORT_SYMBOL(__bswapdi2); > -#ifdef __BIG_ENDIAN__ > EXPORT_SYMBOL(memcpy); > -#endif > EXPORT_SYMBOL(memset); > 
EXPORT_SYMBOL(memmove); > EXPORT_SYMBOL(memcmp); > diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile > index 5310132..6670361 100644 > --- a/arch/powerpc/lib/Makefile > +++ b/arch/powerpc/lib/Makefile > @@ -23,9 +23,7 @@ obj-y += checksum_$(CONFIG_WORD_SIZE).o > obj-$(CONFIG_PPC64) += checksum_wrappers_64.o > endif > > -ifeq ($(CONFIG_CPU_LITTLE_ENDIAN),) > obj-$(CONFIG_PPC64) += memcpy_power7.o memcpy_64.o > -endif > > obj-$(CONFIG_PPC_EMULATE_SSTEP) += sstep.o ldstfp.o > > diff --git a/arch/powerpc/lib/memcpy_64.S b/arch/powerpc/lib/memcpy_64.S > index d2bbbc8..358cf74 100644 > --- a/arch/powerpc/lib/memcpy_64.S > +++ b/arch/powerpc/lib/memcpy_64.S > @@ -12,10 +12,28 @@ > .align 7 > _GLOBAL(memcpy) > BEGIN_FTR_SECTION > +#ifdef __LITTLE_ENDIAN__ > + cmpdi cr7,r5,0 /* dumb little-endian memcpy */ > +#else > std r3,48(r1) /* save destination pointer for return value */ > +#endif > FTR_SECTION_ELSE > b memcpy_power7 > ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY) > +#ifdef __LITTLE_ENDIAN__ > + addi r5,r5,-1 > + addi r9,r3,-1 > + add r5,r3,r5 > + subf r5,r9,r5 > + addi r4,r4,-1 > + mtctr r5 > + beqlr cr7 > +1: > + lbzu r10,1(r4) > + stbu r10,1(r9) > + bdnz 1b > + blr > +#else > PPC_MTOCRF(0x01,r5) > cmpldi cr1,r5,16 > neg r6,r3 # LS 3 bits = # bytes to 8-byte dest bdry > @@ -201,3 +219,4 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD) > stb r0,0(r3) > 4: ld r3,48(r1) /* return dest pointer */ > blr > +#endif > -- > 1.7.10.4 > > _______________________________________________ > Linuxppc-dev mailing list > Linuxppc-dev@lists.ozlabs.org > https://lists.ozlabs.org/listinfo/linuxppc-dev >
Michael Neuling wrote: > Philippe Bergheaud <felix@linux.vnet.ibm.com> wrote: > > >>Unaligned stores take alignment exceptions on POWER7 running in little-endian. >>This is a dumb little-endian base memcpy that prevents unaligned stores. >>It is replaced by the VMX memcpy at boot. > > > Is this any faster than the generic version? The little-endian assembly code of the base memcpy is similar to the code emitted by gcc when compiling the generic memcpy in lib/string.c, and runs at the same speed. However, a little-endian assembly version of the base memcpy is required (as opposed to a C version), in order to use the self-modifying code instrumentation system. After the cpu feature CPU_FTR_ALTIVEC is detected at boot, the slow base memcpy is nop'ed out, and the fast memcpy_power7 is used instead. Philippe
Hi, > > Unaligned stores take alignment exceptions on POWER7 running in > > little-endian. This is a dumb little-endian base memcpy that > > prevents unaligned stores. It is replaced by the VMX memcpy at boot. > > Is this any faster than the generic version? Once booted, the feature fixup code switches us over to the VMX copy loops (which are already endian-safe). The question is what we do before that switchover. The base 64-bit memcpy takes alignment exceptions on POWER7 in little-endian mode, so we can't use it as is. Fixing the causes of the alignment exceptions would slow it down, because we'd need to ensure all loads and stores are aligned, either through rotate tricks or bytewise loads and stores. Either would be bad for all other 64-bit platforms. Anton
OK, can you add that and/or maybe Anton's description to the patch changelog? Mikey On Wed, Nov 6, 2013 at 9:21 PM, Philippe Bergheaud <felix@linux.vnet.ibm.com> wrote: > Michael Neuling wrote: >> >> Philippe Bergheaud <felix@linux.vnet.ibm.com> wrote: >> >> >>> Unaligned stores take alignment exceptions on POWER7 running in >>> little-endian. >>> This is a dumb little-endian base memcpy that prevents unaligned stores. >>> It is replaced by the VMX memcpy at boot. >> >> >> >> Is this any faster than the generic version? > > > The little-endian assembly code of the base memcpy is similar to the code > emitted by gcc when compiling the generic memcpy in lib/string.c, and runs > at the same speed. > However, a little-endian assembly version of the base memcpy is required (as > opposed to a C version), in order to use the self-modifying code > instrumentation system. > After the cpu feature CPU_FTR_ALTIVEC is detected at boot, the slow base > memcpy is nop'ed out, and the fast memcpy_power7 is used instead. > > Philippe >
diff --git a/arch/powerpc/include/asm/string.h b/arch/powerpc/include/asm/string.h index 0dffad6..e40010a 100644 --- a/arch/powerpc/include/asm/string.h +++ b/arch/powerpc/include/asm/string.h @@ -10,9 +10,7 @@ #define __HAVE_ARCH_STRNCMP #define __HAVE_ARCH_STRCAT #define __HAVE_ARCH_MEMSET -#ifdef __BIG_ENDIAN__ #define __HAVE_ARCH_MEMCPY -#endif #define __HAVE_ARCH_MEMMOVE #define __HAVE_ARCH_MEMCMP #define __HAVE_ARCH_MEMCHR @@ -24,9 +22,7 @@ extern int strcmp(const char *,const char *); extern int strncmp(const char *, const char *, __kernel_size_t); extern char * strcat(char *, const char *); extern void * memset(void *,int,__kernel_size_t); -#ifdef __BIG_ENDIAN__ extern void * memcpy(void *,const void *,__kernel_size_t); -#endif extern void * memmove(void *,const void *,__kernel_size_t); extern int memcmp(const void *,const void *,__kernel_size_t); extern void * memchr(const void *,int,__kernel_size_t); diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c index 526ad5c..0c2dd60 100644 --- a/arch/powerpc/kernel/ppc_ksyms.c +++ b/arch/powerpc/kernel/ppc_ksyms.c @@ -147,9 +147,7 @@ EXPORT_SYMBOL(__ucmpdi2); #endif long long __bswapdi2(long long); EXPORT_SYMBOL(__bswapdi2); -#ifdef __BIG_ENDIAN__ EXPORT_SYMBOL(memcpy); -#endif EXPORT_SYMBOL(memset); EXPORT_SYMBOL(memmove); EXPORT_SYMBOL(memcmp); diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index 5310132..6670361 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -23,9 +23,7 @@ obj-y += checksum_$(CONFIG_WORD_SIZE).o obj-$(CONFIG_PPC64) += checksum_wrappers_64.o endif -ifeq ($(CONFIG_CPU_LITTLE_ENDIAN),) obj-$(CONFIG_PPC64) += memcpy_power7.o memcpy_64.o -endif obj-$(CONFIG_PPC_EMULATE_SSTEP) += sstep.o ldstfp.o diff --git a/arch/powerpc/lib/memcpy_64.S b/arch/powerpc/lib/memcpy_64.S index d2bbbc8..358cf74 100644 --- a/arch/powerpc/lib/memcpy_64.S +++ b/arch/powerpc/lib/memcpy_64.S @@ -12,10 +12,28 @@ .align 7 _GLOBAL(memcpy) BEGIN_FTR_SECTION 
+#ifdef __LITTLE_ENDIAN__ + cmpdi cr7,r5,0 /* dumb little-endian memcpy */ +#else std r3,48(r1) /* save destination pointer for return value */ +#endif FTR_SECTION_ELSE b memcpy_power7 ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY) +#ifdef __LITTLE_ENDIAN__ + addi r5,r5,-1 + addi r9,r3,-1 + add r5,r3,r5 + subf r5,r9,r5 + addi r4,r4,-1 + mtctr r5 + beqlr cr7 +1: + lbzu r10,1(r4) + stbu r10,1(r9) + bdnz 1b + blr +#else PPC_MTOCRF(0x01,r5) cmpldi cr1,r5,16 neg r6,r3 # LS 3 bits = # bytes to 8-byte dest bdry @@ -201,3 +219,4 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD) stb r0,0(r3) 4: ld r3,48(r1) /* return dest pointer */ blr +#endif
Unaligned stores take alignment exceptions on POWER7 running in little-endian mode. This is a dumb little-endian base memcpy that prevents unaligned stores. It is replaced by the VMX memcpy at boot. Signed-off-by: Philippe Bergheaud <felix@linux.vnet.ibm.com> --- arch/powerpc/include/asm/string.h | 4 ---- arch/powerpc/kernel/ppc_ksyms.c | 2 -- arch/powerpc/lib/Makefile | 2 -- arch/powerpc/lib/memcpy_64.S | 19 +++++++++++++++++++ 4 files changed, 19 insertions(+), 8 deletions(-)