| Message ID | 539B1868.3000307@linux.vnet.ibm.com |
| --- | --- |
| State | New |
Ping.

On 13-06-2014 12:27, Adhemerval Zanella wrote:

> This patch changes power7 memcpy to use VSX instructions only when
> memory is aligned to quadword (16B).  This avoids unaligned kernel
> traps on non-cacheable memory (for instance, memory-mapped I/O).
>
> Checked on ppc64be and ppc32be.
>
> --
>
> 2014-06-13  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
>
> 	* sysdeps/powerpc/powerpc64/power7/memcpy.S: Align VSX copies to 16B
> 	to avoid alignment traps in non-cacheable memory.
> 	* sysdeps/powerpc/powerpc32/power7/memcpy.S: Likewise.
>
> ---
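For orientation before the diff below, the prologue dispatch the patch changes can be modeled in C. This is only an illustrative sketch; the enum and function names are invented here and are not glibc internals:

```c
#include <stdint.h>

/* Illustrative model of the patched memcpy prologue dispatch; the
   names are made up for this sketch, not glibc internals.  */
enum copy_path { UNALIGNED_PATH, ALIGN_THEN_VSX, VSX_LOOP };

static enum copy_path
choose_path (const void *dst, const void *src)
{
  uintptr_t d = (uintptr_t) dst & 15;  /* andi.  11,3,15 (was & 7).  */
  uintptr_t s = (uintptr_t) src & 15;  /* clrlwi 10,4,28 keeps the low
					  4 bits (was the low 3).  */
  if (d != s)
    return UNALIGNED_PATH;  /* cmplw cr6,10,11: residues differ.  */
  if (d == 0)
    return VSX_LOOP;        /* Quadword aligned: lxvd2x/stxvd2x loop.  */
  return ALIGN_THEN_VSX;    /* Copy up to 15 leading bytes (was up to 7)
			       with scalar loads/stores, then enter the
			       VSX loop.  */
}
```

The substantive change is the mask: both the 32- and 64-bit implementations now test the low four address bits (residue modulo 16) rather than the low three, so the VSX `L(aligned_copy)` loop only ever sees quadword-aligned pointers.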
```diff
diff --git a/sysdeps/powerpc/powerpc32/power7/memcpy.S b/sysdeps/powerpc/powerpc32/power7/memcpy.S
index 52c2a6b..e540fea 100644
--- a/sysdeps/powerpc/powerpc32/power7/memcpy.S
+++ b/sysdeps/powerpc/powerpc32/power7/memcpy.S
@@ -38,8 +38,8 @@ EALIGN (memcpy, 5, 0)
 	ble	cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
 				       code.  */
 
-	andi.	11,3,7		/* Check alignment of DST.  */
-	clrlwi	10,4,29		/* Check alignment of SRC.  */
+	andi.	11,3,15		/* Check alignment of DST.  */
+	clrlwi	10,4,28		/* Check alignment of SRC.  */
 	cmplw	cr6,10,11	/* SRC and DST alignments match?  */
 	mr	12,4
 	mr	31,5
diff --git a/sysdeps/powerpc/powerpc64/power7/memcpy.S b/sysdeps/powerpc/powerpc64/power7/memcpy.S
index bbfd381..18467f6 100644
--- a/sysdeps/powerpc/powerpc64/power7/memcpy.S
+++ b/sysdeps/powerpc/powerpc64/power7/memcpy.S
@@ -36,16 +36,11 @@ EALIGN (memcpy, 5, 0)
 	ble	cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
 				       code.  */
 
-#ifdef __LITTLE_ENDIAN__
-/* In little-endian mode, power7 takes an alignment trap on any lxvd2x
-   or stxvd2x crossing a 32-byte boundary, so ensure the aligned_copy
-   loop is only used for quadword aligned copies.  */
+/* Align copies using VSX instructions to quadword.  It is to avoid alignment
+   traps when memcpy is used on non-cacheable memory (for instance, memory
+   mapped I/O).  */
 	andi.	10,3,15
 	clrldi	11,4,60
-#else
-	andi.	10,3,7		/* Check alignment of DST.  */
-	clrldi	11,4,61		/* Check alignment of SRC.  */
-#endif
 	cmpld	cr6,10,11	/* SRC and DST alignments match?  */
 
 	mr	dst,3
@@ -53,11 +48,7 @@ EALIGN (memcpy, 5, 0)
 	beq	L(aligned_copy)
 
 	mtocrf	0x01,0
-#ifdef __LITTLE_ENDIAN__
 	clrldi	0,0,60
-#else
-	clrldi	0,0,61
-#endif
 
 /* Get the DST and SRC aligned to 8 bytes (16 for little-endian).  */
 1:
@@ -79,14 +70,12 @@ EALIGN (memcpy, 5, 0)
 	stw	6,0(dst)
 	addi	dst,dst,4
 8:
-#ifdef __LITTLE_ENDIAN__
 	bf	28,16f
 	ld	6,0(src)
 	addi	src,src,8
 	std	6,0(dst)
 	addi	dst,dst,8
 16:
-#endif
 	subf	cnt,0,cnt
 
 	/* Main aligned copy loop. Copies 128 bytes at a time.  */
@@ -298,9 +287,7 @@ L(copy_LE_8):
 	.align	4
 L(copy_GE_32_unaligned):
 	clrldi	0,0,60	/* Number of bytes until the 1st dst quadword.  */
-#ifndef __LITTLE_ENDIAN__
 	andi.	10,3,15	/* Check alignment of DST (against quadwords).  */
-#endif
 	srdi	9,cnt,4	/* Number of full quadwords remaining.  */
 
 	beq	L(copy_GE_32_unaligned_cont)
```
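To make the failure mode concrete, the following is a hypothetical illustration of the scenario the commit message describes; the /dev/mem offset is invented, and it assumes a machine where such a mapping really is cache-inhibited. The copy starts at an address that is doubleword- (8B) but not quadword- (16B) aligned: before the patch, the 64-bit big-endian memcpy accepted that alignment for its VSX loop, and an lxvd2x/stxvd2x on the non-cacheable mapping takes an alignment trap into the kernel.

```c
#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int
main (void)
{
  /* Map a (made-up) device MMIO window through /dev/mem; such pages
     are typically mapped cache-inhibited, so unaligned VSX accesses
     to them trap.  */
  int fd = open ("/dev/mem", O_RDWR | O_SYNC);
  if (fd < 0)
    return 1;
  void *mmio = mmap (NULL, 4096, PROT_READ | PROT_WRITE,
		     MAP_SHARED, fd, 0x40000000);
  close (fd);
  if (mmio == MAP_FAILED)
    return 1;

  static _Alignas (16) char buf[256];
  /* DST and SRC share an 8-byte residue: doubleword- but not
     quadword-aligned.  Pre-patch, the 64-bit BE memcpy could run
     lxvd2x/stxvd2x here and trap on the cache-inhibited mapping;
     post-patch, scalar ld/std handle the misaligned head and VSX
     only touches 16B-aligned addresses.  */
  memcpy ((char *) mmio + 8, buf + 8, 128);

  munmap (mmio, 4096);
  return 0;
}
```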