Message ID | 20240626143405.1514956-1-bmahi496@linux.ibm.com |
---|---|
State | New |
Headers | show |
Series | [v2] powerpc: optimize strcpy and stpcpy for POWER9/10 | expand |
Please go through the attached benchmark file. On 26/06/24 8:04 pm, MAHESH BODAPATI wrote: > This patch modifies the current POWER9 implementation of strcpy and > stpcpy to optimize it for POWER9/10. > > Since no new POWER10 instructions are used, the original POWER9 > strcpy is modified instead of creating a new implementation for POWER10. > > The changes also affect stpcpy, which uses the same implementation > with some additional code before returning. > > Improvements compared to POWER9 version: > > Use simple comparisons for the first ~512 bytes > The main loop is good for long strings, but comparing 16B each time is > better for shorter strings. After aligning the address to 16 bytes, we > unroll the loop four times, checking 128 bytes each time. There may be > some overlap with the main loop for unaligned strings, but it is better > for shorter strings. > > Loop with 64 bytes for longer bytes > using 4 consecutive lxv/stxv instructions. > > Showed an average improvement of 13% and the degradations for smaller > strings are not consistent. > --- > sysdeps/powerpc/powerpc64/le/power9/strcpy.S | 306 +++++++++++++++---- > 1 file changed, 242 insertions(+), 64 deletions(-) > > diff --git a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S > index 603bde1e39..8918dcf90a 100644 > --- a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S > +++ b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S > @@ -42,22 +42,48 @@ > > if USE_AS_STPCPY is defined. > > - The implementation can load bytes past a null terminator, but only > - up to the next 16B boundary, so it never crosses a page. */ > + This implementation never reads across a page boundary, but may > + read beyond the NUL terminator. */ > > -/* Load quadword at addr+offset to vreg, check for null bytes, > +/* Load 4 quadwords, merge into one VR for speed and check for NULLs > + and branch to label if NULL is found. 
*/ > +#define CHECK_64B(offset,addr,label) \ > + lxv 32+v4,(offset+0)(addr); \ > + lxv 32+v5,(offset+16)(addr); \ > + lxv 32+v6,(offset+32)(addr); \ > + lxv 32+v7,(offset+48)(addr); \ > + vminub v14,v4,v5; \ > + vminub v15,v6,v7; \ > + vminub v16,v14,v15; \ > + vcmpequb. v0,v16,v18; \ > + beq cr6,$+12; \ > + li r7,offset; \ > + b L(label); \ > + stxv 32+v4,(offset+0)(r11); \ > + stxv 32+v5,(offset+16)(r11); \ > + stxv 32+v6,(offset+32)(r11); \ > + stxv 32+v7,(offset+48)(r11) > + > +/* Load quadword at addr+offset to vreg, check for NULL bytes, > and branch to label if any are found. */ > -#define CHECK16(vreg,offset,addr,label) \ > +#define CHECK_16B(vreg,offset,addr,label) \ > lxv vreg+32,offset(addr); \ > - vcmpequb. v6,vreg,v18; \ > + vcmpequb. v15,vreg,v18; \ > bne cr6,L(label); > > +/* Store vreg2 with length if NULL is found. */ > +#define STORE_WITH_LEN(vreg1,vreg2,reg) \ > + vctzlsbb r8,vreg1; \ > + addi r9,r8,1; \ > + sldi r9,r9,56; \ > + stxvl 32+vreg2,reg,r9; > + > .machine power9 > ENTRY_TOCLESS (FUNC_NAME, 4) > CALL_MCOUNT 2 > > vspltisb v18,0 /* Zeroes in v18 */ > - vspltisb v19,-1 /* 0xFF bytes in v19 */ > + vspltisb v19,-1 /* 0xFF bytes in v19 */ > > /* Next 16B-aligned address. Prepare address for L(loop). */ > addi r5,r4,16 > @@ -70,14 +96,11 @@ ENTRY_TOCLESS (FUNC_NAME, 4) > lvsr v1,0,r4 > vperm v0,v19,v0,v1 > > - vcmpequb. v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */ > + vcmpequb. v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise. */ > beq cr6,L(no_null) > > - /* There's a null byte. */ > - vctzlsbb r8,v6 /* Number of trailing zeroes */ > - addi r9,r8,1 /* Add null byte. */ > - sldi r10,r9,56 /* stxvl wants size in top 8 bits. */ > - stxvl 32+v0,r3,r10 /* Partial store */ > + /* There's a NULL byte. 
*/ > + STORE_WITH_LEN(v6,v0,r3) > > #ifdef USE_AS_STPCPY > /* stpcpy returns the dest address plus the size not counting the > @@ -87,17 +110,22 @@ ENTRY_TOCLESS (FUNC_NAME, 4) > blr > > L(no_null): > - sldi r10,r8,56 /* stxvl wants size in top 8 bits */ > - stxvl 32+v0,r3,r10 /* Partial store */ > + sldi r10,r8,56 /* stxvl wants size in top 8 bits. */ > + stxvl 32+v0,r3,r10 /* Partial store. */ > > +/* The main loop is optimized for longer strings(> 512 bytes), > + so checking the first bytes in 16B chunks benefits shorter > + strings a lot. */ > .p2align 4 > -L(loop): > - CHECK16(v0,0,r5,tail1) > - CHECK16(v1,16,r5,tail2) > - CHECK16(v2,32,r5,tail3) > - CHECK16(v3,48,r5,tail4) > - CHECK16(v4,64,r5,tail5) > - CHECK16(v5,80,r5,tail6) > +L(aligned): > + CHECK_16B(v0,0,r5,tail1) > + CHECK_16B(v1,16,r5,tail2) > + CHECK_16B(v2,32,r5,tail3) > + CHECK_16B(v3,48,r5,tail4) > + CHECK_16B(v4,64,r5,tail5) > + CHECK_16B(v5,80,r5,tail6) > + CHECK_16B(v6,96,r5,tail7) > + CHECK_16B(v7,112,r5,tail8) > > stxv 32+v0,0(r11) > stxv 32+v1,16(r11) > @@ -105,18 +133,145 @@ L(loop): > stxv 32+v3,48(r11) > stxv 32+v4,64(r11) > stxv 32+v5,80(r11) > + stxv 32+v6,96(r11) > + stxv 32+v7,112(r11) > + > + addi r11,r11,128 > + > + CHECK_16B(v0,128,r5,tail1) > + CHECK_16B(v1,128+16,r5,tail2) > + CHECK_16B(v2,128+32,r5,tail3) > + CHECK_16B(v3,128+48,r5,tail4) > + CHECK_16B(v4,128+64,r5,tail5) > + CHECK_16B(v5,128+80,r5,tail6) > + CHECK_16B(v6,128+96,r5,tail7) > + CHECK_16B(v7,128+112,r5,tail8) > + > + stxv 32+v0,0(r11) > + stxv 32+v1,16(r11) > + stxv 32+v2,32(r11) > + stxv 32+v3,48(r11) > + stxv 32+v4,64(r11) > + stxv 32+v5,80(r11) > + stxv 32+v6,96(r11) > + stxv 32+v7,112(r11) > + > + addi r11,r11,128 > + > + CHECK_16B(v0,256,r5,tail1) > + CHECK_16B(v1,256+16,r5,tail2) > + CHECK_16B(v2,256+32,r5,tail3) > + CHECK_16B(v3,256+48,r5,tail4) > + CHECK_16B(v4,256+64,r5,tail5) > + CHECK_16B(v5,256+80,r5,tail6) > + CHECK_16B(v6,256+96,r5,tail7) > + CHECK_16B(v7,256+112,r5,tail8) > + > + stxv 32+v0,0(r11) > 
+ stxv 32+v1,16(r11) > + stxv 32+v2,32(r11) > + stxv 32+v3,48(r11) > + stxv 32+v4,64(r11) > + stxv 32+v5,80(r11) > + stxv 32+v6,96(r11) > + stxv 32+v7,112(r11) > + > + addi r11,r11,128 > + > + CHECK_16B(v0,384,r5,tail1) > + CHECK_16B(v1,384+16,r5,tail2) > + CHECK_16B(v2,384+32,r5,tail3) > + CHECK_16B(v3,384+48,r5,tail4) > + CHECK_16B(v4,384+64,r5,tail5) > + CHECK_16B(v5,384+80,r5,tail6) > + CHECK_16B(v6,384+96,r5,tail7) > + CHECK_16B(v7,384+112,r5,tail8) > + > + stxv 32+v0,0(r11) > + stxv 32+v1,16(r11) > + stxv 32+v2,32(r11) > + stxv 32+v3,48(r11) > + stxv 32+v4,64(r11) > + stxv 32+v5,80(r11) > + stxv 32+v6,96(r11) > + stxv 32+v7,112(r11) > + > + /* Prepare address for the loop. */ > + addi r5,r4,512 > + clrrdi r5,r5,6 > + subf r7,r4,r5 > + add r11,r3,r7 > + > +/* Switch to a more aggressive approach checking 64B each time. */ > + .p2align 5 > +L(strcpy_loop): > + CHECK_64B(0,r5,tail_64b) > + CHECK_64B(64,r5,tail_64b) > + CHECK_64B(128,r5,tail_64b) > + CHECK_64B(192,r5,tail_64b) > > - addi r5,r5,96 > - addi r11,r11,96 > + CHECK_64B(256,r5,tail_64b) > + CHECK_64B(256+64,r5,tail_64b) > + CHECK_64B(256+128,r5,tail_64b) > + CHECK_64B(256+192,r5,tail_64b) > + addi r5,r5,512 > + addi r11,r11,512 > + > + b L(strcpy_loop) > + > + .p2align 5 > +L(tail_64b): > + /* OK, we found a NULL byte. Let's look for it in the current 64-byte > + block and mark it in its corresponding VR. */ > + add r11,r11,r7 > + vcmpequb. v8,v4,v18 > + beq cr6,L(no_null_16B) > + /* There's a NULL byte. */ > + STORE_WITH_LEN(v8,v4,r11) > +#ifdef USE_AS_STPCPY > + add r3,r11,r8 > +#endif > + blr > + > +L(no_null_16B): > + stxv 32+v4,0(r11) > + vcmpequb. v8,v5,v18 > + beq cr6,L(no_null_32B) > + /* There's a NULL byte. */ > + addi r11,r11,16 > + STORE_WITH_LEN(v8,v5,r11) > +#ifdef USE_AS_STPCPY > + add r3,r11,r8 > +#endif > + blr > + > +L(no_null_32B): > + stxv 32+v5,16(r11) > + vcmpequb. v8,v6,v18 > + beq cr6,L(no_null_48B) > + /* There's a NULL byte. 
*/ > + addi r11,r11,32 > + STORE_WITH_LEN(v8,v6,r11) > +#ifdef USE_AS_STPCPY > + add r3,r11,r8 > +#endif > + blr > > - b L(loop) > +L(no_null_48B): > + stxv 32+v6,32(r11) > + vcmpequb. v8,v7,v18; > + /* There's a NULL byte. */ > + addi r11,r11,48 > + STORE_WITH_LEN(v8,v7,r11) > +#ifdef USE_AS_STPCPY > + add r3,r11,r8 > +#endif > + blr > > .p2align 4 > L(tail1): > - vctzlsbb r8,v6 /* Number of trailing zeroes */ > - addi r9,r8,1 /* Add null terminator */ > - sldi r9,r9,56 /* stxvl wants size in top 8 bits */ > - stxvl 32+v0,r11,r9 /* Partial store */ > + /* There's a NULL byte. */ > + STORE_WITH_LEN(v15,v0,r11) > #ifdef USE_AS_STPCPY > /* stpcpy returns the dest address plus the size not counting the > final '\0'. */ > @@ -127,11 +282,9 @@ L(tail1): > .p2align 4 > L(tail2): > stxv 32+v0,0(r11) > - vctzlsbb r8,v6 > - addi r9,r8,1 > - sldi r9,r9,56 > - addi r11,r11,16 > - stxvl 32+v1,r11,r9 > + /* There's a NULL byte. */ > + addi r11,r11,16 > + STORE_WITH_LEN(v15,v1,r11) > #ifdef USE_AS_STPCPY > add r3,r11,r8 > #endif > @@ -141,11 +294,9 @@ L(tail2): > L(tail3): > stxv 32+v0,0(r11) > stxv 32+v1,16(r11) > - vctzlsbb r8,v6 > - addi r9,r8,1 > - sldi r9,r9,56 > - addi r11,r11,32 > - stxvl 32+v2,r11,r9 > + /* There's a NULL byte. */ > + addi r11,r11,32 > + STORE_WITH_LEN(v15,v2,r11) > #ifdef USE_AS_STPCPY > add r3,r11,r8 > #endif > @@ -156,11 +307,9 @@ L(tail4): > stxv 32+v0,0(r11) > stxv 32+v1,16(r11) > stxv 32+v2,32(r11) > - vctzlsbb r8,v6 > - addi r9,r8,1 > - sldi r9,r9,56 > - addi r11,r11,48 > - stxvl 32+v3,r11,r9 > + /* There's a NULL byte. 
*/ > + addi r11,r11,48 > + STORE_WITH_LEN(v15,v3,r11) > #ifdef USE_AS_STPCPY > add r3,r11,r8 > #endif > @@ -168,34 +317,63 @@ L(tail4): > > .p2align 4 > L(tail5): > - stxv 32+v0,0(r11) > - stxv 32+v1,16(r11) > - stxv 32+v2,32(r11) > - stxv 32+v3,48(r11) > - vctzlsbb r8,v6 > - addi r9,r8,1 > - sldi r9,r9,56 > - addi r11,r11,64 > - stxvl 32+v4,r11,r9 > + stxv 32+v0,0(r11) > + stxv 32+v1,16(r11) > + stxv 32+v2,32(r11) > + stxv 32+v3,48(r11) > + /* There's a NULL byte. */ > + addi r11,r11,64 > + STORE_WITH_LEN(v15,v4,r11) > #ifdef USE_AS_STPCPY > - add r3,r11,r8 > + add r3,r11,r8 > #endif > blr > > .p2align 4 > L(tail6): > - stxv 32+v0,0(r11) > - stxv 32+v1,16(r11) > - stxv 32+v2,32(r11) > - stxv 32+v3,48(r11) > - stxv 32+v4,64(r11) > - vctzlsbb r8,v6 > - addi r9,r8,1 > - sldi r9,r9,56 > - addi r11,r11,80 > - stxvl 32+v5,r11,r9 > + stxv 32+v0,0(r11) > + stxv 32+v1,16(r11) > + stxv 32+v2,32(r11) > + stxv 32+v3,48(r11) > + stxv 32+v4,64(r11) > + /* There's a NULL byte. */ > + addi r11,r11,80 > + STORE_WITH_LEN(v15,v5,r11) > #ifdef USE_AS_STPCPY > - add r3,r11,r8 > + add r3,r11,r8 > +#endif > + blr > + > + .p2align 4 > +L(tail7): > + stxv 32+v0,0(r11) > + stxv 32+v1,16(r11) > + stxv 32+v2,32(r11) > + stxv 32+v3,48(r11) > + stxv 32+v4,64(r11) > + stxv 32+v5,80(r11) > + /* There's a NULL byte. */ > + addi r11,r11,96 > + STORE_WITH_LEN(v15,v6,r11) > +#ifdef USE_AS_STPCPY > + add r3,r11,r8 > +#endif > + blr > + > + .p2align 4 > +L(tail8): > + stxv 32+v0,0(r11) > + stxv 32+v1,16(r11) > + stxv 32+v2,32(r11) > + stxv 32+v3,48(r11) > + stxv 32+v4,64(r11) > + stxv 32+v5,80(r11) > + stxv 32+v6,96(r11) > + /* There's a NULL byte. 
*/ > + addi r11,r11,112 > + STORE_WITH_LEN(v15,v7,r11) > +#ifdef USE_AS_STPCPY > + add r3,r11,r8 > #endif > blr > strcpy_power9/10 strcpy_power9_old_version align1=1, align2=0, len=1: 1.21 ( 4.13%) 1.26 align1=0, align2=1, len=1: 1.21 ( 4.24%) 1.26 align1=0, align2=0, len=1: 1.21 ( 4.37%) 1.26 align1=1, align2=1, len=1: 1.21 ( 9.38%) 1.33 align1=1, align2=0, len=2: 1.21 ( 9.73%) 1.34 align1=0, align2=1, len=2: 1.21 ( 9.67%) 1.33 align1=0, align2=0, len=2: 1.21 ( 9.58%) 1.33 align1=1, align2=1, len=2: 1.21 ( 9.53%) 1.33 align1=1, align2=0, len=4: 1.21 ( 9.74%) 1.34 align1=0, align2=1, len=4: 1.21 ( 9.53%) 1.33 align1=0, align2=0, len=4: 1.21 ( 9.56%) 1.33 align1=1, align2=1, len=4: 1.20 ( 9.54%) 1.33 align1=1, align2=0, len=8: 1.20 ( 9.76%) 1.33 align1=0, align2=1, len=8: 1.20 ( 9.69%) 1.33 align1=0, align2=0, len=8: 1.20 ( 9.57%) 1.33 align1=1, align2=1, len=8: 1.21 ( 9.42%) 1.33 align1=1, align2=0, len=16: 1.39 ( -0.03%) 1.39 align1=0, align2=1, len=16: 1.39 ( -0.08%) 1.39 align1=0, align2=0, len=16: 1.40 ( -0.64%) 1.39 align1=1, align2=1, len=16: 1.39 ( -0.29%) 1.39 align1=1, align2=0, len=32: 1.67 ( -7.17%) 1.56 align1=0, align2=1, len=32: 1.68 ( -7.90%) 1.55 align1=0, align2=0, len=32: 1.67 ( -7.81%) 1.55 align1=1, align2=1, len=32: 1.67 ( -7.39%) 1.56 align1=1, align2=0, len=64: 1.67 ( 7.71%) 1.81 align1=0, align2=1, len=64: 1.67 ( 7.32%) 1.80 align1=0, align2=0, len=64: 1.68 ( 7.05%) 1.80 align1=1, align2=1, len=64: 1.67 ( 7.41%) 1.80 align1=1, align2=0, len=128: 2.06 ( 11.49%) 2.33 align1=0, align2=1, len=128: 2.05 ( 11.38%) 2.32 align1=0, align2=0, len=128: 2.06 ( 16.13%) 2.45 align1=1, align2=1, len=128: 2.06 ( 15.96%) 2.45 align1=1, align2=0, len=256: 3.09 ( 5.35%) 3.27 align1=0, align2=1, len=256: 3.10 ( 4.07%) 3.23 align1=0, align2=0, len=256: 3.09 ( 4.30%) 3.23 align1=1, align2=1, len=256: 3.10 ( 4.04%) 3.23 align1=1, align2=0, len=512: 5.33 ( 10.32%) 5.94 align1=0, align2=1, len=512: 5.32 ( 10.44%) 5.94 align1=0, align2=0, len=512: 5.26 ( 9.35%) 5.80 
align1=1, align2=1, len=512: 5.26 ( 9.27%) 5.80 align1=1, align2=0, len=1024: 9.87 ( 11.70%) 11.18 align1=0, align2=1, len=1024: 9.77 ( 13.27%) 11.26 align1=0, align2=0, len=1024: 9.64 ( 13.21%) 11.10 align1=1, align2=1, len=1024: 9.64 ( 13.05%) 11.09 align1=1, align2=0, len=2048: 17.96 ( 17.14%) 21.68 align1=0, align2=1, len=2048: 17.71 ( 18.21%) 21.65 align1=0, align2=0, len=2048: 17.37 ( 19.37%) 21.54 align1=1, align2=1, len=2048: 17.37 ( 19.41%) 21.55 align1=1, align2=0, len=4096: 42.29 ( 10.33%) 47.16 align1=0, align2=1, len=4096: 42.22 ( 9.95%) 46.88 align1=0, align2=0, len=4096: 41.59 ( 11.36%) 46.92 align1=1, align2=1, len=4096: 41.57 ( 11.89%) 47.18 align1=1, align2=0, len=8192: 73.72 ( 17.77%) 89.66 align1=0, align2=1, len=8192: 74.86 ( 16.43%) 89.58 align1=0, align2=0, len=8192: 73.91 ( 17.16%) 89.22 align1=1, align2=1, len=8192: 73.85 ( 17.26%) 89.26 align1=1, align2=0, len=16384: 138.67 ( 19.87%) 173.06 align1=0, align2=1, len=16384: 134.35 ( 22.39%) 173.12 align1=0, align2=0, len=16384: 135.85 ( 21.30%) 172.62 align1=1, align2=1, len=16384: 135.39 ( 21.62%) 172.74 align1=1, align2=0, len=32768: 262.83 ( 23.07%) 341.63 align1=0, align2=1, len=32768: 266.12 ( 22.17%) 341.91 align1=0, align2=0, len=32768: 256.77 ( 24.89%) 341.84 align1=1, align2=1, len=32768: 258.00 ( 24.42%) 341.35
On 6/26/24 9:34 AM, MAHESH BODAPATI wrote: > This patch modifies the current POWER9 implementation of strcpy and > stpcpy to optimize it for POWER9/10. > > Since no new POWER10 instructions are used, the original POWER9 > strcpy is modified instead of creating a new implementation for POWER10. > > The changes also affect stpcpy, which uses the same implementation > with some additional code before returning. > > Improvements compared to POWER9 version: > > Use simple comparisons for the first ~512 bytes > The main loop is good for long strings, but comparing 16B each time is > better for shorter strings. After aligning the address to 16 bytes, we > unroll the loop four times, checking 128 bytes each time. There may be > some overlap with the main loop for unaligned strings, but it is better > for shorter strings. > > Loop with 64 bytes for longer bytes > using 4 consecutive lxv/stxv instructions. > > Showed an average improvement of 13% and the degradations for smaller > strings are not consistent. > --- > sysdeps/powerpc/powerpc64/le/power9/strcpy.S | 306 +++++++++++++++---- > 1 file changed, 242 insertions(+), 64 deletions(-) > > diff --git a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S > index 603bde1e39..8918dcf90a 100644 > --- a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S > +++ b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S > @@ -42,22 +42,48 @@ > > if USE_AS_STPCPY is defined. > > - The implementation can load bytes past a null terminator, but only > - up to the next 16B boundary, so it never crosses a page. */ > + This implementation never reads across a page boundary, but may > + read beyond the NUL terminator. */ > > -/* Load quadword at addr+offset to vreg, check for null bytes, > +/* Load 4 quadwords, merge into one VR for speed and check for NULLs Minor nit, NULL should be NUL in most cases here, as it refers to the specific value of a byte. > + and branch to label if NULL is found. 
*/ > +#define CHECK_64B(offset,addr,label) \ > + lxv 32+v4,(offset+0)(addr); \ > + lxv 32+v5,(offset+16)(addr); \ > + lxv 32+v6,(offset+32)(addr); \ > + lxv 32+v7,(offset+48)(addr); \ > + vminub v14,v4,v5; \ > + vminub v15,v6,v7; \ > + vminub v16,v14,v15; \ > + vcmpequb. v0,v16,v18; \ > + beq cr6,$+12; \ Minor cleanup request: please check the usage of tabs and spaces throughout this file. The usage is inconsistent. > + li r7,offset; \ > + b L(label); \ > + stxv 32+v4,(offset+0)(r11); \ > + stxv 32+v5,(offset+16)(r11); \ > + stxv 32+v6,(offset+32)(r11); \ > + stxv 32+v7,(offset+48)(r11) > + > +/* Load quadword at addr+offset to vreg, check for NULL bytes, > and branch to label if any are found. */ > -#define CHECK16(vreg,offset,addr,label) \ > +#define CHECK_16B(vreg,offset,addr,label) \ > lxv vreg+32,offset(addr); \ > - vcmpequb. v6,vreg,v18; \ > + vcmpequb. v15,vreg,v18; \ > bne cr6,L(label); > > +/* Store vreg2 with length if NULL is found. */ > +#define STORE_WITH_LEN(vreg1,vreg2,reg) \ > + vctzlsbb r8,vreg1; \ > + addi r9,r8,1; \ > + sldi r9,r9,56; \ > + stxvl 32+vreg2,reg,r9; > + > .machine power9 > ENTRY_TOCLESS (FUNC_NAME, 4) > CALL_MCOUNT 2 > > vspltisb v18,0 /* Zeroes in v18 */ > - vspltisb v19,-1 /* 0xFF bytes in v19 */ > + vspltisb v19,-1 /* 0xFF bytes in v19 */ > > /* Next 16B-aligned address. Prepare address for L(loop). */ > addi r5,r4,16 > @@ -70,14 +96,11 @@ ENTRY_TOCLESS (FUNC_NAME, 4) > lvsr v1,0,r4 > vperm v0,v19,v0,v1 > > - vcmpequb. v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */ > + vcmpequb. v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise. */ > beq cr6,L(no_null) > > - /* There's a null byte. */ > - vctzlsbb r8,v6 /* Number of trailing zeroes */ > - addi r9,r8,1 /* Add null byte. */ > - sldi r10,r9,56 /* stxvl wants size in top 8 bits. */ > - stxvl 32+v0,r3,r10 /* Partial store */ > + /* There's a NULL byte. 
*/ > + STORE_WITH_LEN(v6,v0,r3) > > #ifdef USE_AS_STPCPY > /* stpcpy returns the dest address plus the size not counting the > @@ -87,17 +110,22 @@ ENTRY_TOCLESS (FUNC_NAME, 4) > blr > > L(no_null): > - sldi r10,r8,56 /* stxvl wants size in top 8 bits */ > - stxvl 32+v0,r3,r10 /* Partial store */ > + sldi r10,r8,56 /* stxvl wants size in top 8 bits. */ > + stxvl 32+v0,r3,r10 /* Partial store. */ > > +/* The main loop is optimized for longer strings(> 512 bytes), > + so checking the first bytes in 16B chunks benefits shorter > + strings a lot. */ > .p2align 4 > -L(loop): > - CHECK16(v0,0,r5,tail1) > - CHECK16(v1,16,r5,tail2) > - CHECK16(v2,32,r5,tail3) > - CHECK16(v3,48,r5,tail4) > - CHECK16(v4,64,r5,tail5) > - CHECK16(v5,80,r5,tail6) > +L(aligned): > + CHECK_16B(v0,0,r5,tail1) > + CHECK_16B(v1,16,r5,tail2) > + CHECK_16B(v2,32,r5,tail3) > + CHECK_16B(v3,48,r5,tail4) > + CHECK_16B(v4,64,r5,tail5) > + CHECK_16B(v5,80,r5,tail6) > + CHECK_16B(v6,96,r5,tail7) > + CHECK_16B(v7,112,r5,tail8) > > stxv 32+v0,0(r11) > stxv 32+v1,16(r11) > @@ -105,18 +133,145 @@ L(loop): > stxv 32+v3,48(r11) > stxv 32+v4,64(r11) > stxv 32+v5,80(r11) > + stxv 32+v6,96(r11) > + stxv 32+v7,112(r11) > + > + addi r11,r11,128 > + > + CHECK_16B(v0,128,r5,tail1) > + CHECK_16B(v1,128+16,r5,tail2) > + CHECK_16B(v2,128+32,r5,tail3) > + CHECK_16B(v3,128+48,r5,tail4) > + CHECK_16B(v4,128+64,r5,tail5) > + CHECK_16B(v5,128+80,r5,tail6) > + CHECK_16B(v6,128+96,r5,tail7) > + CHECK_16B(v7,128+112,r5,tail8) > + > + stxv 32+v0,0(r11) > + stxv 32+v1,16(r11) > + stxv 32+v2,32(r11) > + stxv 32+v3,48(r11) > + stxv 32+v4,64(r11) > + stxv 32+v5,80(r11) > + stxv 32+v6,96(r11) > + stxv 32+v7,112(r11) > + > + addi r11,r11,128 > + > + CHECK_16B(v0,256,r5,tail1) > + CHECK_16B(v1,256+16,r5,tail2) > + CHECK_16B(v2,256+32,r5,tail3) > + CHECK_16B(v3,256+48,r5,tail4) > + CHECK_16B(v4,256+64,r5,tail5) > + CHECK_16B(v5,256+80,r5,tail6) > + CHECK_16B(v6,256+96,r5,tail7) > + CHECK_16B(v7,256+112,r5,tail8) > + > + stxv 32+v0,0(r11) > 
+ stxv 32+v1,16(r11) > + stxv 32+v2,32(r11) > + stxv 32+v3,48(r11) > + stxv 32+v4,64(r11) > + stxv 32+v5,80(r11) > + stxv 32+v6,96(r11) > + stxv 32+v7,112(r11) > + > + addi r11,r11,128 > + > + CHECK_16B(v0,384,r5,tail1) > + CHECK_16B(v1,384+16,r5,tail2) > + CHECK_16B(v2,384+32,r5,tail3) > + CHECK_16B(v3,384+48,r5,tail4) > + CHECK_16B(v4,384+64,r5,tail5) > + CHECK_16B(v5,384+80,r5,tail6) > + CHECK_16B(v6,384+96,r5,tail7) > + CHECK_16B(v7,384+112,r5,tail8) > + > + stxv 32+v0,0(r11) > + stxv 32+v1,16(r11) > + stxv 32+v2,32(r11) > + stxv 32+v3,48(r11) > + stxv 32+v4,64(r11) > + stxv 32+v5,80(r11) > + stxv 32+v6,96(r11) > + stxv 32+v7,112(r11) > + > + /* Prepare address for the loop. */ I think this comment can be improved. Maybe, "Align src pointer down to a 64B boundary."? > + addi r5,r4,512 > + clrrdi r5,r5,6 > + subf r7,r4,r5 > + add r11,r3,r7 > + > +/* Switch to a more aggressive approach checking 64B each time. */ > + .p2align 5 > +L(strcpy_loop): > + CHECK_64B(0,r5,tail_64b) > + CHECK_64B(64,r5,tail_64b) > + CHECK_64B(128,r5,tail_64b) > + CHECK_64B(192,r5,tail_64b) > > - addi r5,r5,96 > - addi r11,r11,96 > + CHECK_64B(256,r5,tail_64b) > + CHECK_64B(256+64,r5,tail_64b) > + CHECK_64B(256+128,r5,tail_64b) > + CHECK_64B(256+192,r5,tail_64b) > + addi r5,r5,512 > + addi r11,r11,512 > + > + b L(strcpy_loop) > + > + .p2align 5 > +L(tail_64b): > + /* OK, we found a NULL byte. Let's look for it in the current 64-byte > + block and mark it in its corresponding VR. */ > + add r11,r11,r7 > + vcmpequb. v8,v4,v18 > + beq cr6,L(no_null_16B) > + /* There's a NULL byte. */ > + STORE_WITH_LEN(v8,v4,r11) > +#ifdef USE_AS_STPCPY > + add r3,r11,r8 > +#endif > + blr > + > +L(no_null_16B): > + stxv 32+v4,0(r11) > + vcmpequb. v8,v5,v18 > + beq cr6,L(no_null_32B) > + /* There's a NULL byte. */ > + addi r11,r11,16 > + STORE_WITH_LEN(v8,v5,r11) > +#ifdef USE_AS_STPCPY > + add r3,r11,r8 > +#endif > + blr > + > +L(no_null_32B): > + stxv 32+v5,16(r11) > + vcmpequb. 
v8,v6,v18 > + beq cr6,L(no_null_48B) > + /* There's a NULL byte. */ > + addi r11,r11,32 > + STORE_WITH_LEN(v8,v6,r11) > +#ifdef USE_AS_STPCPY > + add r3,r11,r8 > +#endif > + blr > > - b L(loop) > +L(no_null_48B): > + stxv 32+v6,32(r11) > + vcmpequb. v8,v7,v18; > + /* There's a NULL byte. */ > + addi r11,r11,48 > + STORE_WITH_LEN(v8,v7,r11) > +#ifdef USE_AS_STPCPY > + add r3,r11,r8 > +#endif > + blr > > .p2align 4 > L(tail1): > - vctzlsbb r8,v6 /* Number of trailing zeroes */ > - addi r9,r8,1 /* Add null terminator */ > - sldi r9,r9,56 /* stxvl wants size in top 8 bits */ > - stxvl 32+v0,r11,r9 /* Partial store */ > + /* There's a NULL byte. */ > + STORE_WITH_LEN(v15,v0,r11) > #ifdef USE_AS_STPCPY > /* stpcpy returns the dest address plus the size not counting the > final '\0'. */ This comment is used in only two places where the stpcpy return value is computed. I think this one can be removed, the first one is sufficient. > @@ -127,11 +282,9 @@ L(tail1): > .p2align 4 > L(tail2): > stxv 32+v0,0(r11) > - vctzlsbb r8,v6 > - addi r9,r8,1 > - sldi r9,r9,56 > - addi r11,r11,16 > - stxvl 32+v1,r11,r9 > + /* There's a NULL byte. */ > + addi r11,r11,16 > + STORE_WITH_LEN(v15,v1,r11) > #ifdef USE_AS_STPCPY > add r3,r11,r8 > #endif > @@ -141,11 +294,9 @@ L(tail2): > L(tail3): > stxv 32+v0,0(r11) > stxv 32+v1,16(r11) > - vctzlsbb r8,v6 > - addi r9,r8,1 > - sldi r9,r9,56 > - addi r11,r11,32 > - stxvl 32+v2,r11,r9 > + /* There's a NULL byte. */ > + addi r11,r11,32 > + STORE_WITH_LEN(v15,v2,r11) > #ifdef USE_AS_STPCPY > add r3,r11,r8 > #endif > @@ -156,11 +307,9 @@ L(tail4): > stxv 32+v0,0(r11) > stxv 32+v1,16(r11) > stxv 32+v2,32(r11) > - vctzlsbb r8,v6 > - addi r9,r8,1 > - sldi r9,r9,56 > - addi r11,r11,48 > - stxvl 32+v3,r11,r9 > + /* There's a NULL byte. 
*/ > + addi r11,r11,48 > + STORE_WITH_LEN(v15,v3,r11) > #ifdef USE_AS_STPCPY > add r3,r11,r8 > #endif > @@ -168,34 +317,63 @@ L(tail4): > > .p2align 4 > L(tail5): > - stxv 32+v0,0(r11) > - stxv 32+v1,16(r11) > - stxv 32+v2,32(r11) > - stxv 32+v3,48(r11) > - vctzlsbb r8,v6 > - addi r9,r8,1 > - sldi r9,r9,56 > - addi r11,r11,64 > - stxvl 32+v4,r11,r9 > + stxv 32+v0,0(r11) > + stxv 32+v1,16(r11) > + stxv 32+v2,32(r11) > + stxv 32+v3,48(r11) > + /* There's a NULL byte. */ > + addi r11,r11,64 > + STORE_WITH_LEN(v15,v4,r11) > #ifdef USE_AS_STPCPY > - add r3,r11,r8 > + add r3,r11,r8 > #endif > blr > > .p2align 4 > L(tail6): > - stxv 32+v0,0(r11) > - stxv 32+v1,16(r11) > - stxv 32+v2,32(r11) > - stxv 32+v3,48(r11) > - stxv 32+v4,64(r11) > - vctzlsbb r8,v6 > - addi r9,r8,1 > - sldi r9,r9,56 > - addi r11,r11,80 > - stxvl 32+v5,r11,r9 > + stxv 32+v0,0(r11) > + stxv 32+v1,16(r11) > + stxv 32+v2,32(r11) > + stxv 32+v3,48(r11) > + stxv 32+v4,64(r11) > + /* There's a NULL byte. */ > + addi r11,r11,80 > + STORE_WITH_LEN(v15,v5,r11) > #ifdef USE_AS_STPCPY > - add r3,r11,r8 > + add r3,r11,r8 > +#endif > + blr > + > + .p2align 4 > +L(tail7): > + stxv 32+v0,0(r11) > + stxv 32+v1,16(r11) > + stxv 32+v2,32(r11) > + stxv 32+v3,48(r11) > + stxv 32+v4,64(r11) > + stxv 32+v5,80(r11) > + /* There's a NULL byte. */ > + addi r11,r11,96 > + STORE_WITH_LEN(v15,v6,r11) > +#ifdef USE_AS_STPCPY > + add r3,r11,r8 > +#endif > + blr > + > + .p2align 4 > +L(tail8): > + stxv 32+v0,0(r11) > + stxv 32+v1,16(r11) > + stxv 32+v2,32(r11) > + stxv 32+v3,48(r11) > + stxv 32+v4,64(r11) > + stxv 32+v5,80(r11) > + stxv 32+v6,96(r11) > + /* There's a NULL byte. */ > + addi r11,r11,112 > + STORE_WITH_LEN(v15,v7,r11) > +#ifdef USE_AS_STPCPY > + add r3,r11,r8 > #endif > blr > One final request for V3, the prefix of the title should be updated to "powerpc64le: ...". With the minor changes requested, this LGTM.
On 26/06/24 11:34, MAHESH BODAPATI wrote: > This patch modifies the current POWER9 implementation of strcpy and > stpcpy to optimize it for POWER9/10. > > Since no new POWER10 instructions are used, the original POWER9 > strcpy is modified instead of creating a new implementation for POWER10. > > The changes also affect stpcpy, which uses the same implementation > with some additional code before returning. > > Improvements compared to POWER9 version: > > Use simple comparisons for the first ~512 bytes > The main loop is good for long strings, but comparing 16B each time is > better for shorter strings. After aligning the address to 16 bytes, we > unroll the loop four times, checking 128 bytes each time. There may be > some overlap with the main loop for unaligned strings, but it is better > for shorter strings. > > Loop with 64 bytes for longer bytes > using 4 consecutive lxv/stxv instructions. > > Showed an average improvement of 13% and the degradations for smaller > strings are not consistent. Other implementations either focus on small sizes or add fast-paths code assuming small sizes are more common than larger one. You might reevaluate if this implementation if really worth if you are seeing smaller size regressions.
On Jun 26 2024, Paul E Murphy wrote: > Minor nit, NULL should be NUL in most cases here, as it refers to the > specific value of a byte. NUL is the abbreviation for the ASCII Null character. In ordinary text it should be referred to as the null character or null byte.
On 27/06/24 4:03 am, Adhemerval Zanella Netto wrote: > > On 26/06/24 11:34, MAHESH BODAPATI wrote: >> This patch modifies the current POWER9 implementation of strcpy and >> stpcpy to optimize it for POWER9/10. >> >> Since no new POWER10 instructions are used, the original POWER9 >> strcpy is modified instead of creating a new implementation for POWER10. >> >> The changes also affect stpcpy, which uses the same implementation >> with some additional code before returning. >> >> Improvements compared to POWER9 version: >> >> Use simple comparisons for the first ~512 bytes >> The main loop is good for long strings, but comparing 16B each time is >> better for shorter strings. After aligning the address to 16 bytes, we >> unroll the loop four times, checking 128 bytes each time. There may be >> some overlap with the main loop for unaligned strings, but it is better >> for shorter strings. >> >> Loop with 64 bytes for longer bytes >> using 4 consecutive lxv/stxv instructions. >> >> Showed an average improvement of 13% and the degradations for smaller >> strings are not consistent. > Other implementations either focus on small sizes or add fast-paths code > assuming small sizes are more common than larger one. You might reevaluate > if this implementation if really worth if you are seeing smaller size > regressions. The implementation is exactly similar to the POWER9 on the first 96 bytes but benchmark results are showing inconsistent improvements/regressions. I will submit a v3 patch, run the benchmarks on a different machine and share the results.
diff --git a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S index 603bde1e39..8918dcf90a 100644 --- a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S +++ b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S @@ -42,22 +42,48 @@ if USE_AS_STPCPY is defined. - The implementation can load bytes past a null terminator, but only - up to the next 16B boundary, so it never crosses a page. */ + This implementation never reads across a page boundary, but may + read beyond the NUL terminator. */ -/* Load quadword at addr+offset to vreg, check for null bytes, +/* Load 4 quadwords, merge into one VR for speed and check for NULLs + and branch to label if NULL is found. */ +#define CHECK_64B(offset,addr,label) \ + lxv 32+v4,(offset+0)(addr); \ + lxv 32+v5,(offset+16)(addr); \ + lxv 32+v6,(offset+32)(addr); \ + lxv 32+v7,(offset+48)(addr); \ + vminub v14,v4,v5; \ + vminub v15,v6,v7; \ + vminub v16,v14,v15; \ + vcmpequb. v0,v16,v18; \ + beq cr6,$+12; \ + li r7,offset; \ + b L(label); \ + stxv 32+v4,(offset+0)(r11); \ + stxv 32+v5,(offset+16)(r11); \ + stxv 32+v6,(offset+32)(r11); \ + stxv 32+v7,(offset+48)(r11) + +/* Load quadword at addr+offset to vreg, check for NULL bytes, and branch to label if any are found. */ -#define CHECK16(vreg,offset,addr,label) \ +#define CHECK_16B(vreg,offset,addr,label) \ lxv vreg+32,offset(addr); \ - vcmpequb. v6,vreg,v18; \ + vcmpequb. v15,vreg,v18; \ bne cr6,L(label); +/* Store vreg2 with length if NULL is found. */ +#define STORE_WITH_LEN(vreg1,vreg2,reg) \ + vctzlsbb r8,vreg1; \ + addi r9,r8,1; \ + sldi r9,r9,56; \ + stxvl 32+vreg2,reg,r9; + .machine power9 ENTRY_TOCLESS (FUNC_NAME, 4) CALL_MCOUNT 2 vspltisb v18,0 /* Zeroes in v18 */ - vspltisb v19,-1 /* 0xFF bytes in v19 */ + vspltisb v19,-1 /* 0xFF bytes in v19 */ /* Next 16B-aligned address. Prepare address for L(loop). */ addi r5,r4,16 @@ -70,14 +96,11 @@ ENTRY_TOCLESS (FUNC_NAME, 4) lvsr v1,0,r4 vperm v0,v19,v0,v1 - vcmpequb. 
v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */ + vcmpequb. v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise. */ beq cr6,L(no_null) - /* There's a null byte. */ - vctzlsbb r8,v6 /* Number of trailing zeroes */ - addi r9,r8,1 /* Add null byte. */ - sldi r10,r9,56 /* stxvl wants size in top 8 bits. */ - stxvl 32+v0,r3,r10 /* Partial store */ + /* There's a NULL byte. */ + STORE_WITH_LEN(v6,v0,r3) #ifdef USE_AS_STPCPY /* stpcpy returns the dest address plus the size not counting the @@ -87,17 +110,22 @@ ENTRY_TOCLESS (FUNC_NAME, 4) blr L(no_null): - sldi r10,r8,56 /* stxvl wants size in top 8 bits */ - stxvl 32+v0,r3,r10 /* Partial store */ + sldi r10,r8,56 /* stxvl wants size in top 8 bits. */ + stxvl 32+v0,r3,r10 /* Partial store. */ +/* The main loop is optimized for longer strings(> 512 bytes), + so checking the first bytes in 16B chunks benefits shorter + strings a lot. */ .p2align 4 -L(loop): - CHECK16(v0,0,r5,tail1) - CHECK16(v1,16,r5,tail2) - CHECK16(v2,32,r5,tail3) - CHECK16(v3,48,r5,tail4) - CHECK16(v4,64,r5,tail5) - CHECK16(v5,80,r5,tail6) +L(aligned): + CHECK_16B(v0,0,r5,tail1) + CHECK_16B(v1,16,r5,tail2) + CHECK_16B(v2,32,r5,tail3) + CHECK_16B(v3,48,r5,tail4) + CHECK_16B(v4,64,r5,tail5) + CHECK_16B(v5,80,r5,tail6) + CHECK_16B(v6,96,r5,tail7) + CHECK_16B(v7,112,r5,tail8) stxv 32+v0,0(r11) stxv 32+v1,16(r11) @@ -105,18 +133,145 @@ L(loop): stxv 32+v3,48(r11) stxv 32+v4,64(r11) stxv 32+v5,80(r11) + stxv 32+v6,96(r11) + stxv 32+v7,112(r11) + + addi r11,r11,128 + + CHECK_16B(v0,128,r5,tail1) + CHECK_16B(v1,128+16,r5,tail2) + CHECK_16B(v2,128+32,r5,tail3) + CHECK_16B(v3,128+48,r5,tail4) + CHECK_16B(v4,128+64,r5,tail5) + CHECK_16B(v5,128+80,r5,tail6) + CHECK_16B(v6,128+96,r5,tail7) + CHECK_16B(v7,128+112,r5,tail8) + + stxv 32+v0,0(r11) + stxv 32+v1,16(r11) + stxv 32+v2,32(r11) + stxv 32+v3,48(r11) + stxv 32+v4,64(r11) + stxv 32+v5,80(r11) + stxv 32+v6,96(r11) + stxv 32+v7,112(r11) + + addi r11,r11,128 + + CHECK_16B(v0,256,r5,tail1) + 
CHECK_16B(v1,256+16,r5,tail2) + CHECK_16B(v2,256+32,r5,tail3) + CHECK_16B(v3,256+48,r5,tail4) + CHECK_16B(v4,256+64,r5,tail5) + CHECK_16B(v5,256+80,r5,tail6) + CHECK_16B(v6,256+96,r5,tail7) + CHECK_16B(v7,256+112,r5,tail8) + + stxv 32+v0,0(r11) + stxv 32+v1,16(r11) + stxv 32+v2,32(r11) + stxv 32+v3,48(r11) + stxv 32+v4,64(r11) + stxv 32+v5,80(r11) + stxv 32+v6,96(r11) + stxv 32+v7,112(r11) + + addi r11,r11,128 + + CHECK_16B(v0,384,r5,tail1) + CHECK_16B(v1,384+16,r5,tail2) + CHECK_16B(v2,384+32,r5,tail3) + CHECK_16B(v3,384+48,r5,tail4) + CHECK_16B(v4,384+64,r5,tail5) + CHECK_16B(v5,384+80,r5,tail6) + CHECK_16B(v6,384+96,r5,tail7) + CHECK_16B(v7,384+112,r5,tail8) + + stxv 32+v0,0(r11) + stxv 32+v1,16(r11) + stxv 32+v2,32(r11) + stxv 32+v3,48(r11) + stxv 32+v4,64(r11) + stxv 32+v5,80(r11) + stxv 32+v6,96(r11) + stxv 32+v7,112(r11) + + /* Prepare address for the loop. */ + addi r5,r4,512 + clrrdi r5,r5,6 + subf r7,r4,r5 + add r11,r3,r7 + +/* Switch to a more aggressive approach checking 64B each time. */ + .p2align 5 +L(strcpy_loop): + CHECK_64B(0,r5,tail_64b) + CHECK_64B(64,r5,tail_64b) + CHECK_64B(128,r5,tail_64b) + CHECK_64B(192,r5,tail_64b) - addi r5,r5,96 - addi r11,r11,96 + CHECK_64B(256,r5,tail_64b) + CHECK_64B(256+64,r5,tail_64b) + CHECK_64B(256+128,r5,tail_64b) + CHECK_64B(256+192,r5,tail_64b) + addi r5,r5,512 + addi r11,r11,512 + + b L(strcpy_loop) + + .p2align 5 +L(tail_64b): + /* OK, we found a NULL byte. Let's look for it in the current 64-byte + block and mark it in its corresponding VR. */ + add r11,r11,r7 + vcmpequb. v8,v4,v18 + beq cr6,L(no_null_16B) + /* There's a NULL byte. */ + STORE_WITH_LEN(v8,v4,r11) +#ifdef USE_AS_STPCPY + add r3,r11,r8 +#endif + blr + +L(no_null_16B): + stxv 32+v4,0(r11) + vcmpequb. v8,v5,v18 + beq cr6,L(no_null_32B) + /* There's a NULL byte. */ + addi r11,r11,16 + STORE_WITH_LEN(v8,v5,r11) +#ifdef USE_AS_STPCPY + add r3,r11,r8 +#endif + blr + +L(no_null_32B): + stxv 32+v5,16(r11) + vcmpequb. 
v8,v6,v18 + beq cr6,L(no_null_48B) + /* There's a NULL byte. */ + addi r11,r11,32 + STORE_WITH_LEN(v8,v6,r11) +#ifdef USE_AS_STPCPY + add r3,r11,r8 +#endif + blr - b L(loop) +L(no_null_48B): + stxv 32+v6,32(r11) + vcmpequb. v8,v7,v18 + /* No check needed: CHECK_64B found a NULL in this 64B block and + v4-v6 were already cleared, so the NULL byte must be in v7. */ + addi r11,r11,48 + STORE_WITH_LEN(v8,v7,r11) +#ifdef USE_AS_STPCPY + add r3,r11,r8 +#endif + blr .p2align 4 L(tail1): - vctzlsbb r8,v6 /* Number of trailing zeroes */ - addi r9,r8,1 /* Add null terminator */ - sldi r9,r9,56 /* stxvl wants size in top 8 bits */ - stxvl 32+v0,r11,r9 /* Partial store */ + /* There's a NULL byte. */ + STORE_WITH_LEN(v15,v0,r11) #ifdef USE_AS_STPCPY /* stpcpy returns the dest address plus the size not counting the final '\0'. */ @@ -127,11 +282,9 @@ L(tail1): .p2align 4 L(tail2): stxv 32+v0,0(r11) - vctzlsbb r8,v6 - addi r9,r8,1 - sldi r9,r9,56 - addi r11,r11,16 - stxvl 32+v1,r11,r9 + /* There's a NULL byte. */ + addi r11,r11,16 + STORE_WITH_LEN(v15,v1,r11) #ifdef USE_AS_STPCPY add r3,r11,r8 #endif @@ -141,11 +294,9 @@ L(tail2): L(tail3): stxv 32+v0,0(r11) stxv 32+v1,16(r11) - vctzlsbb r8,v6 - addi r9,r8,1 - sldi r9,r9,56 - addi r11,r11,32 - stxvl 32+v2,r11,r9 + /* There's a NULL byte. */ + addi r11,r11,32 + STORE_WITH_LEN(v15,v2,r11) #ifdef USE_AS_STPCPY add r3,r11,r8 #endif @@ -156,11 +307,9 @@ L(tail3): L(tail4): stxv 32+v0,0(r11) stxv 32+v1,16(r11) stxv 32+v2,32(r11) - vctzlsbb r8,v6 - addi r9,r8,1 - sldi r9,r9,56 - addi r11,r11,48 - stxvl 32+v3,r11,r9 + /* There's a NULL byte. */ + addi r11,r11,48 + STORE_WITH_LEN(v15,v3,r11) #ifdef USE_AS_STPCPY add r3,r11,r8 #endif @@ -168,34 +317,63 @@ L(tail4): .p2align 4 L(tail5): - stxv 32+v0,0(r11) - stxv 32+v1,16(r11) - stxv 32+v2,32(r11) - stxv 32+v3,48(r11) - vctzlsbb r8,v6 - addi r9,r8,1 - sldi r9,r9,56 - addi r11,r11,64 - stxvl 32+v4,r11,r9 + stxv 32+v0,0(r11) + stxv 32+v1,16(r11) + stxv 32+v2,32(r11) + stxv 32+v3,48(r11) + /* There's a NULL byte. 
*/ + addi r11,r11,64 + STORE_WITH_LEN(v15,v4,r11) #ifdef USE_AS_STPCPY - add r3,r11,r8 + add r3,r11,r8 #endif blr .p2align 4 L(tail6): - stxv 32+v0,0(r11) - stxv 32+v1,16(r11) - stxv 32+v2,32(r11) - stxv 32+v3,48(r11) - stxv 32+v4,64(r11) - vctzlsbb r8,v6 - addi r9,r8,1 - sldi r9,r9,56 - addi r11,r11,80 - stxvl 32+v5,r11,r9 + stxv 32+v0,0(r11) + stxv 32+v1,16(r11) + stxv 32+v2,32(r11) + stxv 32+v3,48(r11) + stxv 32+v4,64(r11) + /* There's a NULL byte. */ + addi r11,r11,80 + STORE_WITH_LEN(v15,v5,r11) #ifdef USE_AS_STPCPY - add r3,r11,r8 + add r3,r11,r8 +#endif + blr + + .p2align 4 +L(tail7): + stxv 32+v0,0(r11) + stxv 32+v1,16(r11) + stxv 32+v2,32(r11) + stxv 32+v3,48(r11) + stxv 32+v4,64(r11) + stxv 32+v5,80(r11) + /* There's a NULL byte. */ + addi r11,r11,96 + STORE_WITH_LEN(v15,v6,r11) +#ifdef USE_AS_STPCPY + add r3,r11,r8 +#endif + blr + + .p2align 4 +L(tail8): + stxv 32+v0,0(r11) + stxv 32+v1,16(r11) + stxv 32+v2,32(r11) + stxv 32+v3,48(r11) + stxv 32+v4,64(r11) + stxv 32+v5,80(r11) + stxv 32+v6,96(r11) + /* There's a NULL byte. */ + addi r11,r11,112 + STORE_WITH_LEN(v15,v7,r11) +#ifdef USE_AS_STPCPY + add r3,r11,r8 #endif blr