@@ -14,6 +14,7 @@
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* Copyright (C) IBM Corporation, 2011
+ * Copyright Freescale Semiconductor, 2015
*
* Author: Anton Blanchard <anton@au.ibm.com>
*/
@@ -63,9 +64,11 @@
.Ldo_err4:
+#if L1_CACHE_BYTES >= 128
ld r16,STK_REG(R16)(r1)
ld r15,STK_REG(R15)(r1)
ld r14,STK_REG(R14)(r1)
+#endif
.Ldo_err3:
bl exit_vmx_usercopy
ld r0,STACKFRAMESIZE+16(r1)
@@ -74,6 +77,7 @@
#endif /* CONFIG_ALTIVEC */
.Ldo_err2:
+#if L1_CACHE_BYTES >= 128
ld r22,STK_REG(R22)(r1)
ld r21,STK_REG(R21)(r1)
ld r20,STK_REG(R20)(r1)
@@ -81,6 +85,7 @@
ld r18,STK_REG(R18)(r1)
ld r17,STK_REG(R17)(r1)
ld r16,STK_REG(R16)(r1)
+#endif
ld r15,STK_REG(R15)(r1)
ld r14,STK_REG(R14)(r1)
.Lexit:
@@ -93,6 +98,10 @@
_GLOBAL(__copy_tofrom_user_power7)
+#ifdef CONFIG_PPC_BOOK3E_64
+ dcbt 0,r4
+ dcbt 0,r3
+#endif
#ifdef CONFIG_ALTIVEC
cmpldi r5,16
cmpldi cr1,r5,4096
@@ -139,12 +148,13 @@ err1; stw r0,0(r3)
3: sub r5,r5,r6
cmpldi r5,L1_CACHE_BYTES
- blt 5f
+ blt 9f
mflr r0
stdu r1,-STACKFRAMESIZE(r1)
std r14,STK_REG(R14)(r1)
std r15,STK_REG(R15)(r1)
+#if L1_CACHE_BYTES >= 128
std r16,STK_REG(R16)(r1)
std r17,STK_REG(R17)(r1)
std r18,STK_REG(R18)(r1)
@@ -152,14 +162,43 @@ err1; stw r0,0(r3)
std r20,STK_REG(R20)(r1)
std r21,STK_REG(R21)(r1)
std r22,STK_REG(R22)(r1)
+#endif
std r0,STACKFRAMESIZE+16(r1)
- srdi r6,r5,L1_CACHE_SHIFT
- mtctr r6
+#ifdef CONFIG_PPC_BOOK3E_64
+#define LINES_AHEAD 10
+ clrrdi r6,r4,L1_CACHE_SHIFT
+ clrrdi r9,r3,L1_CACHE_SHIFT
+ srdi r7,r5,L1_CACHE_SHIFT /* length in cachelines,
+ * capped at LINES_AHEAD
+ */
+ cmpldi r7,LINES_AHEAD
+ ble 4f
+ li r7,LINES_AHEAD
+4: mtctr r7
+
+5: addi r6,r6,L1_CACHE_BYTES
+ dcbt 0,r6
+ addi r9,r9,L1_CACHE_BYTES
+ dcbtst 0,r9
+
+ bdnz 5b
+
+ li r14,LINES_AHEAD*L1_CACHE_BYTES
+#endif
+
+ srdi r15,r5,L1_CACHE_SHIFT
+#ifdef CONFIG_PPC_BOOK3E_64
+ cmpldi r15,LINES_AHEAD
+ ble 7f /* don't prefetch if cachelines <= LINES_AHEAD */
+ subi r15,r15,LINES_AHEAD /* otherwise, r15 <- r15 - LINES_AHEAD */
+#endif
+
+ mtctr r15
/* Now do cacheline sized loads and stores. */
.align 5
-4:
+6:
err2; ld r0,0(r4)
err2; ld r6,8(r4)
err2; ld r7,16(r4)
@@ -179,6 +218,9 @@ err2; ld r20,112(r4)
err2; ld r21,120(r4)
#endif
addi r4,r4,L1_CACHE_BYTES
+#ifdef CONFIG_PPC_BOOK3E_64
+ dcbt r14,r4
+#endif
err2; std r0,0(r3)
err2; std r6,8(r3)
err2; std r7,16(r3)
@@ -198,12 +240,47 @@ err2; std r20,112(r3)
err2; std r21,120(r3)
#endif
addi r3,r3,L1_CACHE_BYTES
- bdnz 4b
+#ifdef CONFIG_PPC_BOOK3E_64
+ dcbtst r14,r3
+#endif
+ bdnz 6b
+
+#ifdef CONFIG_PPC_BOOK3E_64
+ srdi r7,r5,L1_CACHE_SHIFT /* length in cachelines */
+ subf r15,r15,r7 /* r15 = r7 - r15 */
+
+7:
+ mtctr r15
+
+ /* remaining cacheline sized loads and stores, without prefetches. */
+ .align 5
+8:
+err2; ld r0,0(r4)
+err2; ld r6,8(r4)
+err2; ld r7,16(r4)
+err2; ld r8,24(r4)
+err2; ld r9,32(r4)
+err2; ld r10,40(r4)
+err2; ld r11,48(r4)
+err2; ld r12,56(r4)
+ addi r4,r4,L1_CACHE_BYTES
+err2; std r0,0(r3)
+err2; std r6,8(r3)
+err2; std r7,16(r3)
+err2; std r8,24(r3)
+err2; std r9,32(r3)
+err2; std r10,40(r3)
+err2; std r11,48(r3)
+err2; std r12,56(r3)
+ addi r3,r3,L1_CACHE_BYTES
+ bdnz 8b
+#endif
clrldi r5,r5,(64-L1_CACHE_SHIFT)
ld r14,STK_REG(R14)(r1)
ld r15,STK_REG(R15)(r1)
+#if L1_CACHE_BYTES >= 128
ld r16,STK_REG(R16)(r1)
ld r17,STK_REG(R17)(r1)
ld r18,STK_REG(R18)(r1)
@@ -211,14 +288,15 @@ err2; std r21,120(r3)
ld r20,STK_REG(R20)(r1)
ld r21,STK_REG(R21)(r1)
ld r22,STK_REG(R22)(r1)
+#endif
addi r1,r1,STACKFRAMESIZE
/* Up to L1_CACHE_BYTES - 1 to go */
-5: srdi r6,r5,4
+9: srdi r6,r5,4
mtocrf 0x01,r6
#if L1_CACHE_BYTES >= 128
-6: bf cr7*4+1,7f
+10: bf cr7*4+1,11f
err1; ld r0,0(r4)
err1; ld r6,8(r4)
err1; ld r7,16(r4)
@@ -240,7 +318,7 @@ err1; std r12,56(r3)
#endif
/* Up to 63B to go */
-7: bf cr7*4+2,8f
+11: bf cr7*4+2,12f
err1; ld r0,0(r4)
err1; ld r6,8(r4)
err1; ld r7,16(r4)
@@ -253,7 +331,7 @@ err1; std r8,24(r3)
addi r3,r3,32
/* Up to 31B to go */
-8: bf cr7*4+3,9f
+12: bf cr7*4+3,13f
err1; ld r0,0(r4)
err1; ld r6,8(r4)
addi r4,r4,16
@@ -261,12 +339,12 @@ err1; std r0,0(r3)
err1; std r6,8(r3)
addi r3,r3,16
-9: clrldi r5,r5,(64-4)
+13: clrldi r5,r5,(64-4)
/* Up to 15B to go */
.Lshort_copy:
mtocrf 0x01,r5
- bf cr7*4+0,12f
+ bf cr7*4+0,14f
err1; lwz r0,0(r4) /* Less chance of a reject with word ops */
err1; lwz r6,4(r4)
addi r4,r4,8
@@ -274,23 +352,23 @@ err1; stw r0,0(r3)
err1; stw r6,4(r3)
addi r3,r3,8
-12: bf cr7*4+1,13f
+14: bf cr7*4+1,15f
err1; lwz r0,0(r4)
addi r4,r4,4
err1; stw r0,0(r3)
addi r3,r3,4
-13: bf cr7*4+2,14f
+15: bf cr7*4+2,16f
err1; lhz r0,0(r4)
addi r4,r4,2
err1; sth r0,0(r3)
addi r3,r3,2
-14: bf cr7*4+3,15f
+16: bf cr7*4+3,17f
err1; lbz r0,0(r4)
err1; stb r0,0(r3)
-15: li r3,0
+17: li r3,0
blr
.Lunwind_stack_nonvmx_copy:
@@ -310,6 +388,7 @@ err1; stb r0,0(r3)
ld r5,STK_REG(R29)(r1)
mtlr r0
+#ifdef CONFIG_PPC_BOOK3S_64
/*
* We prefetch both the source and destination using enhanced touch
* instructions. We use a stream ID of 0 for the load side and
@@ -342,6 +421,30 @@ err1; stb r0,0(r3)
eieio
dcbt r0,r8,0b01010 /* all streams GO */
.machine pop
+#else
+ /*
+ * We prefetch both the source and destination using regular touch
+ * instructions.
+ */
+ clrrdi r6,r4,L1_CACHE_SHIFT
+ clrrdi r9,r3,L1_CACHE_SHIFT
+ srdi r7,r5,L1_CACHE_SHIFT /* length in cachelines,
+ * capped at LINES_AHEAD
+ */
+ cmpldi r7,LINES_AHEAD
+ ble 2f
+ li r7,LINES_AHEAD
+2: mtctr r7
+
+3: addi r6,r6,L1_CACHE_BYTES
+ dcbt 0,r6
+ addi r9,r9,L1_CACHE_BYTES
+ dcbtst 0,r9
+
+ bdnz 3b
+
+ li r8,LINES_AHEAD*L1_CACHE_BYTES
+#endif
beq cr1,.Lunwind_stack_nonvmx_copy
@@ -426,6 +529,14 @@ err3; stvx vr0,r3,r11
7: sub r5,r5,r6
srdi r6,r5,L1_CACHE_SHIFT
+#ifdef CONFIG_PPC_BOOK3E_64
+ cmpldi r6,LINES_AHEAD
+ ble 12f /* don't prefetch if cachelines <= LINES_AHEAD */
+ subi r6,r6,LINES_AHEAD /* otherwise, r6 <- r6 - LINES_AHEAD */
+ li r8,LINES_AHEAD*L1_CACHE_BYTES
+#endif
+
+#if L1_CACHE_BYTES >= 128
std r14,STK_REG(R14)(r1)
std r15,STK_REG(R15)(r1)
std r16,STK_REG(R16)(r1)
@@ -434,6 +545,7 @@ err3; stvx vr0,r3,r11
li r14,80
li r15,96
li r16,112
+#endif
mtctr r6
@@ -454,6 +566,9 @@ err4; lvx vr1,r4,r15
err4; lvx vr0,r4,r16
#endif
addi r4,r4,L1_CACHE_BYTES
+#ifdef CONFIG_PPC_BOOK3E_64
+ dcbt r8,r4
+#endif
err4; stvx vr7,r0,r3
err4; stvx vr6,r3,r9
err4; stvx vr5,r3,r10
@@ -465,11 +580,39 @@ err4; stvx vr1,r3,r15
err4; stvx vr0,r3,r16
#endif
addi r3,r3,L1_CACHE_BYTES
+#ifdef CONFIG_PPC_BOOK3E_64
+ dcbtst r8,r3
+#endif
bdnz 8b
+#ifdef CONFIG_PPC_BOOK3E_64
+ srdi r7,r5,L1_CACHE_SHIFT /* length in cachelines */
+ subf r6,r6,r7 /* r6 = r7 - r6 */
+
+12:
+ mtctr r6
+
+ /* remaining cacheline sized loads and stores, without prefetches. */
+ .align 5
+13:
+err4; lvx vr7,r0,r4
+err4; lvx vr6,r4,r9
+err4; lvx vr5,r4,r10
+err4; lvx vr4,r4,r11
+ addi r4,r4,L1_CACHE_BYTES
+err4; stvx vr7,r0,r3
+err4; stvx vr6,r3,r9
+err4; stvx vr5,r3,r10
+err4; stvx vr4,r3,r11
+ addi r3,r3,L1_CACHE_BYTES
+ bdnz 13b
+#endif
+
+#if L1_CACHE_BYTES >= 128
ld r14,STK_REG(R14)(r1)
ld r15,STK_REG(R15)(r1)
ld r16,STK_REG(R16)(r1)
+#endif
/* Up to L1_CACHE_BYTES - 1 to go */
clrldi r5,r5,(64-L1_CACHE_SHIFT)
@@ -477,7 +620,7 @@ err4; stvx vr0,r3,r16
mtocrf 0x01,r6
#if L1_CACHE_BYTES >= 128
- bf cr7*4+1,9f
+ bf cr7*4+1,14f
err3; lvx vr3,r0,r4
err3; lvx vr2,r4,r9
err3; lvx vr1,r4,r10
@@ -490,7 +633,7 @@ err3; stvx vr0,r3,r11
addi r3,r3,64
#endif
-9: bf cr7*4+2,10f
+14: bf cr7*4+2,15f
err3; lvx vr1,r0,r4
err3; lvx vr0,r4,r9
addi r4,r4,32
@@ -498,38 +641,38 @@ err3; stvx vr1,r0,r3
err3; stvx vr0,r3,r9
addi r3,r3,32
-10: bf cr7*4+3,11f
+15: bf cr7*4+3,16f
err3; lvx vr1,r0,r4
addi r4,r4,16
err3; stvx vr1,r0,r3
addi r3,r3,16
/* Up to 15B to go */
-11: clrldi r5,r5,(64-4)
+16: clrldi r5,r5,(64-4)
mtocrf 0x01,r5
- bf cr7*4+0,12f
+ bf cr7*4+0,17f
err3; ld r0,0(r4)
addi r4,r4,8
err3; std r0,0(r3)
addi r3,r3,8
-12: bf cr7*4+1,13f
+17: bf cr7*4+1,18f
err3; lwz r0,0(r4)
addi r4,r4,4
err3; stw r0,0(r3)
addi r3,r3,4
-13: bf cr7*4+2,14f
+18: bf cr7*4+2,19f
err3; lhz r0,0(r4)
addi r4,r4,2
err3; sth r0,0(r3)
addi r3,r3,2
-14: bf cr7*4+3,15f
+19: bf cr7*4+3,20f
err3; lbz r0,0(r4)
err3; stb r0,0(r3)
-15: addi r1,r1,STACKFRAMESIZE
+20: addi r1,r1,STACKFRAMESIZE
b exit_vmx_usercopy /* tail call optimise */
.Lvmx_unaligned_copy:
@@ -620,6 +763,14 @@ err3; stvx vr11,r3,r11
7: sub r5,r5,r6
srdi r6,r5,L1_CACHE_SHIFT
+#ifdef CONFIG_PPC_BOOK3E_64
+ cmpldi r6,LINES_AHEAD
+ ble 9f /* don't prefetch if cachelines <= LINES_AHEAD*/
+ subi r6,r6,LINES_AHEAD /* otherwise, r6 <- r6 - LINES_AHEAD*/
+ li r8,LINES_AHEAD*L1_CACHE_BYTES
+#endif
+
+#if L1_CACHE_BYTES >= 128
std r14,STK_REG(R14)(r1)
std r15,STK_REG(R15)(r1)
std r16,STK_REG(R16)(r1)
@@ -628,6 +779,7 @@ err3; stvx vr11,r3,r11
li r14,80
li r15,96
li r16,112
+#endif
mtctr r6
@@ -659,6 +811,9 @@ err4; lvx vr0,r4,r16
VPERM(vr15,vr1,vr0,vr16)
#endif
addi r4,r4,L1_CACHE_BYTES
+#ifdef CONFIG_PPC_BOOK3E_64
+ dcbt r8,r4
+#endif
err4; stvx vr8,r0,r3
err4; stvx vr9,r3,r9
err4; stvx vr10,r3,r10
@@ -670,11 +825,44 @@ err4; stvx vr14,r3,r15
err4; stvx vr15,r3,r16
#endif
addi r3,r3,L1_CACHE_BYTES
+#ifdef CONFIG_PPC_BOOK3E_64
+ dcbtst r8,r3
+#endif
bdnz 8b
+#ifdef CONFIG_PPC_BOOK3E_64
+ srdi r7,r5,L1_CACHE_SHIFT /* length in cachelines */
+ subf r6,r6,r7 /* r6 = r7 - r6 */
+
+9:
+ mtctr r6
+
+ /* remaining cacheline sized loads and stores, without prefetches. */
+ .align 5
+10:
+err4; lvx vr7,r0,r4
+ VPERM(vr8,vr0,vr7,vr16)
+err4; lvx vr6,r4,r9
+ VPERM(vr9,vr7,vr6,vr16)
+err4; lvx vr5,r4,r10
+ VPERM(vr10,vr6,vr5,vr16)
+err4; lvx vr0,r4,r11
+ VPERM(vr11,vr5,vr0,vr16)
+ addi r4,r4,L1_CACHE_BYTES
+err4; stvx vr8,r0,r3
+err4; stvx vr9,r3,r9
+err4; stvx vr10,r3,r10
+err4; stvx vr11,r3,r11
+ addi r3,r3,L1_CACHE_BYTES
+
+ bdnz 10b
+#endif
+
+#if L1_CACHE_BYTES >= 128
ld r14,STK_REG(R14)(r1)
ld r15,STK_REG(R15)(r1)
ld r16,STK_REG(R16)(r1)
+#endif
/* Up to L1_CACHE_BYTES - 1 to go */
clrldi r5,r5,(64-L1_CACHE_SHIFT)
@@ -682,7 +870,7 @@ err4; stvx vr15,r3,r16
mtocrf 0x01,r6
#if L1_CACHE_BYTES >= 128
- bf cr7*4+1,9f
+ bf cr7*4+1,11f
err3; lvx vr3,r0,r4
VPERM(vr8,vr0,vr3,vr16)
err3; lvx vr2,r4,r9
@@ -699,7 +887,7 @@ err3; stvx vr11,r3,r11
addi r3,r3,64
#endif
-9: bf cr7*4+2,10f
+11: bf cr7*4+2,12f
err3; lvx vr1,r0,r4
VPERM(vr8,vr0,vr1,vr16)
err3; lvx vr0,r4,r9
@@ -709,7 +897,7 @@ err3; stvx vr8,r0,r3
err3; stvx vr9,r3,r9
addi r3,r3,32
-10: bf cr7*4+3,11f
+12: bf cr7*4+3,13f
err3; lvx vr1,r0,r4
VPERM(vr8,vr0,vr1,vr16)
addi r4,r4,16
@@ -717,10 +905,10 @@ err3; stvx vr8,r0,r3
addi r3,r3,16
/* Up to 15B to go */
-11: clrldi r5,r5,(64-4)
+13: clrldi r5,r5,(64-4)
addi r4,r4,-16 /* Unwind the +16 load offset */
mtocrf 0x01,r5
- bf cr7*4+0,12f
+ bf cr7*4+0,14f
err3; lwz r0,0(r4) /* Less chance of a reject with word ops */
err3; lwz r6,4(r4)
addi r4,r4,8
@@ -728,22 +916,22 @@ err3; stw r0,0(r3)
err3; stw r6,4(r3)
addi r3,r3,8
-12: bf cr7*4+1,13f
+14: bf cr7*4+1,15f
err3; lwz r0,0(r4)
addi r4,r4,4
err3; stw r0,0(r3)
addi r3,r3,4
-13: bf cr7*4+2,14f
+15: bf cr7*4+2,16f
err3; lhz r0,0(r4)
addi r4,r4,2
err3; sth r0,0(r3)
addi r3,r3,2
-14: bf cr7*4+3,15f
+16: bf cr7*4+3,17f
err3; lbz r0,0(r4)
err3; stb r0,0(r3)
-15: addi r1,r1,STACKFRAMESIZE
+17: addi r1,r1,STACKFRAMESIZE
b exit_vmx_usercopy /* tail call optimise */
#endif /* CONFIG_ALTIVEC */

POWER7 has a dedicated stream prefetcher that is pre-programmed via
dcbt rX,rY,0b010?0 instructions at the beginning of vmx_copy.  e6500
has no such prefetcher, so we revert to using regular dcbt
instructions in-loop:

1. At __copy_tofrom_user_power7 entry, we prefetch the first src and
   dest lines with dcbt and dcbtst, respectively.

2. For a short copy (16 bytes or less), we don't prefetch any further.

3. Otherwise (nonvmx_copy, vmx_copy, unaligned_vmx_copy), we prefetch
   LINES_AHEAD lines, then, in the inner cacheline-wide loops, prefetch
   the line LINES_AHEAD ahead of the current address being copied, and
   finally drop into a tail-end cacheline-wide loop that doesn't
   prefetch for the last LINES_AHEAD iterations.

LINES_AHEAD has been chosen empirically to be 10, based on generally
best results on an important target benchmark: 1MB netperf TCP_CRR
runs*, relative to a stock 3.19 kernel with FTR_VMX_COPY turned off
for e6500 (i.e., without this patch series):

                 throughput    mean latency
  single-thread     27%           21.2%
  8-thread          6.08%          5.8%

POWER7/BOOK3S_64 code is left completely untouched by these e6500
changes: we don't have a POWER7 to benchmark with.

Lastly, this patch includes some enhancements following the earlier
"make copyuser_power7.S 64-byte cacheline friendly" change: the
register stack saves are removed for the non-128-byte-cacheline case,
and the branch labels are renumbered.

* specifically:
  netperf -v 0 -B "-b 8 -D" -H $dest_ip -l 20 -t TCP_CRR -P 0 -- -b 8 -D -r 64,1000000

Signed-off-by: Kim Phillips <kim.phillips@freescale.com>
---
 arch/powerpc/lib/copyuser_power7.S | 254 ++++++++++++++++++++++++++++++++-----
 1 file changed, 221 insertions(+), 33 deletions(-)
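
For reviewers, a minimal C sketch of the LINES_AHEAD prefetch scheme
described above (assumption: the function name, the memcpy body and the
hard-coded constants are illustrative stand-ins only; the actual
implementation is the assembly in this patch):

#include <stddef.h>
#include <string.h>

#define L1_CACHE_BYTES	64	/* e6500 */
#define LINES_AHEAD	10	/* chosen empirically, see above */

/* Illustrative only: copy 'lines' full cachelines from src to dst. */
void copy_lines_prefetched(char *dst, const char *src, size_t lines)
{
	size_t warm = lines < LINES_AHEAD ? lines : LINES_AHEAD;
	size_t i;

	/* 1. warm up: touch the first LINES_AHEAD src/dst lines */
	for (i = 0; i < warm; i++) {
		__builtin_prefetch(src + i * L1_CACHE_BYTES, 0);	/* dcbt   */
		__builtin_prefetch(dst + i * L1_CACHE_BYTES, 1);	/* dcbtst */
	}

	/* 2. main loop: copy a line, prefetch LINES_AHEAD lines ahead */
	for (i = 0; i + LINES_AHEAD < lines; i++) {
		memcpy(dst + i * L1_CACHE_BYTES,
		       src + i * L1_CACHE_BYTES, L1_CACHE_BYTES);
		__builtin_prefetch(src + (i + LINES_AHEAD) * L1_CACHE_BYTES, 0);
		__builtin_prefetch(dst + (i + LINES_AHEAD) * L1_CACHE_BYTES, 1);
	}

	/* 3. tail: last (up to) LINES_AHEAD lines, no further prefetch */
	for (; i < lines; i++)
		memcpy(dst + i * L1_CACHE_BYTES,
		       src + i * L1_CACHE_BYTES, L1_CACHE_BYTES);
}

The assembly differs in detail (it folds the first-line dcbt/dcbtst into
function entry and keeps the prefetch offset in a register), but the
three-phase structure is the same.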