diff mbox series

[v2] powerpc: optimize strcpy and stpcpy for POWER9/10

Message ID 20240626143405.1514956-1-bmahi496@linux.ibm.com
State New
Headers show
Series [v2] powerpc: optimize strcpy and stpcpy for POWER9/10 | expand

Commit Message

MAHESH BODAPATI June 26, 2024, 2:34 p.m. UTC
This patch modifies the current POWER9 implementation of strcpy and
stpcpy to optimize it for POWER9/10.

Since no new POWER10 instructions are used, the original POWER9
strcpy is modified instead of creating a new implementation for POWER10.

The changes also affect stpcpy, which uses the same implementation
with some additional code before returning.

Improvements compared to POWER9 version:

Use simple comparisons for the first ~512 bytes
  The main loop is good for long strings, but comparing 16B each time is
  better for shorter strings. After aligning the address to 16 bytes, we
  unroll the loop four times, checking 128 bytes each time. There may be
  some overlap with the main loop for unaligned strings, but it is better
  for shorter strings.

Loop with 64 bytes for longer strings
  using 4 consecutive lxv/stxv instructions.

Showed an average improvement of 13%, and the degradations for smaller
strings are not consistent.
---
 sysdeps/powerpc/powerpc64/le/power9/strcpy.S | 306 +++++++++++++++----
 1 file changed, 242 insertions(+), 64 deletions(-)

Comments

MAHESH BODAPATI June 26, 2024, 2:46 p.m. UTC | #1
Please go through the attached benchmark file.


On 26/06/24 8:04 pm, MAHESH BODAPATI wrote:
> This patch modifies the current POWER9 implementation of strcpy and
> stpcpy to optimize it for POWER9/10.
>
> Since no new POWER10 instructions are used, the original POWER9
> strcpy is modified instead of creating a new implementation for POWER10.
>
> The changes also affect stpcpy, which uses the same implementation
> with some additional code before returning.
>
> Improvements compared to POWER9 version:
>
> Use simple comparisons for the first ~512 bytes
>    The main loop is good for long strings, but comparing 16B each time is
>    better for shorter strings. After aligning the address to 16 bytes, we
>    unroll the loop four times, checking 128 bytes each time. There may be
>    some overlap with the main loop for unaligned strings, but it is better
>    for shorter strings.
>
> Loop with 64 bytes for longer bytes
>    using 4 consecutive lxv/stxv instructions.
>
> Showed an average improvement of 13% and the degradations for smaller
> strings are not consistent.
> ---
>   sysdeps/powerpc/powerpc64/le/power9/strcpy.S | 306 +++++++++++++++----
>   1 file changed, 242 insertions(+), 64 deletions(-)
>
> diff --git a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
> index 603bde1e39..8918dcf90a 100644
> --- a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
> +++ b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
> @@ -42,22 +42,48 @@
>   
>      if USE_AS_STPCPY is defined.
>   
> -   The implementation can load bytes past a null terminator, but only
> -   up to the next 16B boundary, so it never crosses a page.  */
> +   This implementation never reads across a page boundary, but may
> +   read beyond the NUL terminator.  */
>   
> -/* Load quadword at addr+offset to vreg, check for null bytes,
> +/* Load 4 quadwords, merge into one VR for speed and check for NULLs
> +   and branch to label if NULL is found.  */
> +#define CHECK_64B(offset,addr,label)	  \
> +	lxv     32+v4,(offset+0)(addr);	\
> +	lxv     32+v5,(offset+16)(addr);   \
> +	lxv     32+v6,(offset+32)(addr);   \
> +	lxv     32+v7,(offset+48)(addr);   \
> +	vminub    v14,v4,v5;		\
> +	vminub    v15,v6,v7;		\
> +	vminub    v16,v14,v15;	      \
> +	vcmpequb. v0,v16,v18;	       \
> +	beq     cr6,$+12;		   \
> +	li      r7,offset;		  \
> +	b       L(label);		   \
> +	stxv    32+v4,(offset+0)(r11);      \
> +	stxv    32+v5,(offset+16)(r11);     \
> +	stxv    32+v6,(offset+32)(r11);     \
> +	stxv    32+v7,(offset+48)(r11)
> +
> +/* Load quadword at addr+offset to vreg, check for NULL bytes,
>      and branch to label if any are found.  */
> -#define CHECK16(vreg,offset,addr,label) \
> +#define CHECK_16B(vreg,offset,addr,label) \
>   	lxv	vreg+32,offset(addr);	\
> -	vcmpequb. v6,vreg,v18;	\
> +	vcmpequb. v15,vreg,v18;	\
>   	bne	cr6,L(label);
>   
> +/* Store vreg2 with length if NULL is found.  */
> +#define STORE_WITH_LEN(vreg1,vreg2,reg)    \
> +	vctzlsbb r8,vreg1;		 \
> +	addi    r9,r8,1;		   \
> +	sldi    r9,r9,56;		  \
> +	stxvl   32+vreg2,reg,r9;
> +
>   .machine power9
>   ENTRY_TOCLESS (FUNC_NAME, 4)
>   	CALL_MCOUNT 2
>   
>   	vspltisb v18,0		/* Zeroes in v18  */
> -	vspltisb v19,-1 	/* 0xFF bytes in v19  */
> +	vspltisb v19,-1	 /* 0xFF bytes in v19  */
>   
>   	/* Next 16B-aligned address. Prepare address for L(loop).  */
>   	addi	r5,r4,16
> @@ -70,14 +96,11 @@ ENTRY_TOCLESS (FUNC_NAME, 4)
>   	lvsr	v1,0,r4
>   	vperm	v0,v19,v0,v1
>   
> -	vcmpequb. v6,v0,v18	/* 0xff if byte is NULL, 0x00 otherwise  */
> +	vcmpequb. v6,v0,v18	/* 0xff if byte is NULL, 0x00 otherwise.  */
>   	beq	cr6,L(no_null)
>   
> -	/* There's a null byte.  */
> -	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> -	addi	r9,r8,1 	/* Add null byte.  */
> -	sldi	r10,r9,56	/* stxvl wants size in top 8 bits.  */
> -	stxvl	32+v0,r3,r10	/* Partial store  */
> +	/* There's a NULL byte.  */
> +	STORE_WITH_LEN(v6,v0,r3)
>   
>   #ifdef USE_AS_STPCPY
>   	/* stpcpy returns the dest address plus the size not counting the
> @@ -87,17 +110,22 @@ ENTRY_TOCLESS (FUNC_NAME, 4)
>   	blr
>   
>   L(no_null):
> -	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
> -	stxvl	32+v0,r3,r10	/* Partial store  */
> +	sldi	r10,r8,56	/* stxvl wants size in top 8 bits.  */
> +	stxvl	32+v0,r3,r10	/* Partial store.  */
>   
> +/* The main loop is optimized for longer strings(> 512 bytes),
> +   so checking the first bytes in 16B chunks benefits shorter
> +   strings a lot.  */
>   	.p2align 4
> -L(loop):
> -	CHECK16(v0,0,r5,tail1)
> -	CHECK16(v1,16,r5,tail2)
> -	CHECK16(v2,32,r5,tail3)
> -	CHECK16(v3,48,r5,tail4)
> -	CHECK16(v4,64,r5,tail5)
> -	CHECK16(v5,80,r5,tail6)
> +L(aligned):
> +	CHECK_16B(v0,0,r5,tail1)
> +	CHECK_16B(v1,16,r5,tail2)
> +	CHECK_16B(v2,32,r5,tail3)
> +	CHECK_16B(v3,48,r5,tail4)
> +	CHECK_16B(v4,64,r5,tail5)
> +	CHECK_16B(v5,80,r5,tail6)
> +	CHECK_16B(v6,96,r5,tail7)
> +	CHECK_16B(v7,112,r5,tail8)
>   
>   	stxv	32+v0,0(r11)
>   	stxv	32+v1,16(r11)
> @@ -105,18 +133,145 @@ L(loop):
>   	stxv	32+v3,48(r11)
>   	stxv	32+v4,64(r11)
>   	stxv	32+v5,80(r11)
> +	stxv	32+v6,96(r11)
> +	stxv	32+v7,112(r11)
> +
> +	addi	r11,r11,128
> +
> +	CHECK_16B(v0,128,r5,tail1)
> +	CHECK_16B(v1,128+16,r5,tail2)
> +	CHECK_16B(v2,128+32,r5,tail3)
> +	CHECK_16B(v3,128+48,r5,tail4)
> +	CHECK_16B(v4,128+64,r5,tail5)
> +	CHECK_16B(v5,128+80,r5,tail6)
> +	CHECK_16B(v6,128+96,r5,tail7)
> +	CHECK_16B(v7,128+112,r5,tail8)
> +
> +	stxv    32+v0,0(r11)
> +	stxv    32+v1,16(r11)
> +	stxv    32+v2,32(r11)
> +	stxv    32+v3,48(r11)
> +	stxv    32+v4,64(r11)
> +	stxv    32+v5,80(r11)
> +	stxv    32+v6,96(r11)
> +	stxv    32+v7,112(r11)
> +
> +	addi    r11,r11,128
> +
> +	CHECK_16B(v0,256,r5,tail1)
> +	CHECK_16B(v1,256+16,r5,tail2)
> +	CHECK_16B(v2,256+32,r5,tail3)
> +	CHECK_16B(v3,256+48,r5,tail4)
> +	CHECK_16B(v4,256+64,r5,tail5)
> +	CHECK_16B(v5,256+80,r5,tail6)
> +	CHECK_16B(v6,256+96,r5,tail7)
> +	CHECK_16B(v7,256+112,r5,tail8)
> +
> +	stxv    32+v0,0(r11)
> +	stxv    32+v1,16(r11)
> +	stxv    32+v2,32(r11)
> +	stxv    32+v3,48(r11)
> +	stxv    32+v4,64(r11)
> +	stxv    32+v5,80(r11)
> +	stxv    32+v6,96(r11)
> +	stxv    32+v7,112(r11)
> +
> +	addi    r11,r11,128
> +
> +	CHECK_16B(v0,384,r5,tail1)
> +	CHECK_16B(v1,384+16,r5,tail2)
> +	CHECK_16B(v2,384+32,r5,tail3)
> +	CHECK_16B(v3,384+48,r5,tail4)
> +	CHECK_16B(v4,384+64,r5,tail5)
> +	CHECK_16B(v5,384+80,r5,tail6)
> +	CHECK_16B(v6,384+96,r5,tail7)
> +	CHECK_16B(v7,384+112,r5,tail8)
> +
> +	stxv    32+v0,0(r11)
> +	stxv    32+v1,16(r11)
> +	stxv    32+v2,32(r11)
> +	stxv    32+v3,48(r11)
> +	stxv    32+v4,64(r11)
> +	stxv    32+v5,80(r11)
> +	stxv    32+v6,96(r11)
> +	stxv    32+v7,112(r11)
> +
> +	/* Prepare address for the loop.  */
> +	addi    r5,r4,512
> +	clrrdi  r5,r5,6
> +	subf    r7,r4,r5
> +	add     r11,r3,r7
> +
> +/* Switch to a more aggressive approach checking 64B each time.  */
> +	.p2align 5
> +L(strcpy_loop):
> +	CHECK_64B(0,r5,tail_64b)
> +	CHECK_64B(64,r5,tail_64b)
> +	CHECK_64B(128,r5,tail_64b)
> +	CHECK_64B(192,r5,tail_64b)
>   
> -	addi	r5,r5,96
> -	addi	r11,r11,96
> +	CHECK_64B(256,r5,tail_64b)
> +	CHECK_64B(256+64,r5,tail_64b)
> +	CHECK_64B(256+128,r5,tail_64b)
> +	CHECK_64B(256+192,r5,tail_64b)
> +	addi    r5,r5,512
> +	addi    r11,r11,512
> +
> +	b	L(strcpy_loop)
> +
> +	.p2align 5
> +L(tail_64b):
> +	/* OK, we found a NULL byte.  Let's look for it in the current 64-byte
> +	   block and mark it in its corresponding VR.  */
> +	add	r11,r11,r7
> +	vcmpequb. v8,v4,v18
> +	beq	cr6,L(no_null_16B)
> +	/* There's a NULL byte.  */
> +	STORE_WITH_LEN(v8,v4,r11)
> +#ifdef USE_AS_STPCPY
> +	add     r3,r11,r8
> +#endif
> +	blr
> +
> +L(no_null_16B):
> +	stxv    32+v4,0(r11)
> +	vcmpequb. v8,v5,v18
> +	beq     cr6,L(no_null_32B)
> +	/* There's a NULL byte.  */
> +	addi    r11,r11,16
> +	STORE_WITH_LEN(v8,v5,r11)
> +#ifdef USE_AS_STPCPY
> +	add     r3,r11,r8
> +#endif
> +	blr
> +
> +L(no_null_32B):
> +	stxv    32+v5,16(r11)
> +	vcmpequb. v8,v6,v18
> +	beq     cr6,L(no_null_48B)
> +	/* There's a NULL byte.  */
> +	addi    r11,r11,32
> +	STORE_WITH_LEN(v8,v6,r11)
> +#ifdef USE_AS_STPCPY
> +	add     r3,r11,r8
> +#endif
> +	blr
>   
> -	b	L(loop)
> +L(no_null_48B):
> +	stxv    32+v6,32(r11)
> +	vcmpequb. v8,v7,v18;
> +	/* There's a NULL byte.  */
> +	addi    r11,r11,48
> +	STORE_WITH_LEN(v8,v7,r11)
> +#ifdef USE_AS_STPCPY
> +	add     r3,r11,r8
> +#endif
> +	blr
>   
>   	.p2align 4
>   L(tail1):
> -	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> -	addi	r9,r8,1		/* Add null terminator  */
> -	sldi	r9,r9,56	/* stxvl wants size in top 8 bits  */
> -	stxvl	32+v0,r11,r9	/* Partial store  */
> +	/* There's a NULL byte.  */
> +	STORE_WITH_LEN(v15,v0,r11)
>   #ifdef USE_AS_STPCPY
>   	/* stpcpy returns the dest address plus the size not counting the
>   	   final '\0'.  */
> @@ -127,11 +282,9 @@ L(tail1):
>   	.p2align 4
>   L(tail2):
>   	stxv	32+v0,0(r11)
> -	vctzlsbb r8,v6
> -	addi	r9,r8,1
> -	sldi	r9,r9,56
> -	addi	r11,r11,16
> -	stxvl	32+v1,r11,r9
> +	/* There's a NULL byte.  */
> +	addi    r11,r11,16
> +	STORE_WITH_LEN(v15,v1,r11)
>   #ifdef USE_AS_STPCPY
>   	add	r3,r11,r8
>   #endif
> @@ -141,11 +294,9 @@ L(tail2):
>   L(tail3):
>   	stxv	32+v0,0(r11)
>   	stxv	32+v1,16(r11)
> -	vctzlsbb r8,v6
> -	addi	r9,r8,1
> -	sldi	r9,r9,56
> -	addi	r11,r11,32
> -	stxvl	32+v2,r11,r9
> +	/* There's a NULL byte.  */
> +	addi    r11,r11,32
> +	STORE_WITH_LEN(v15,v2,r11)
>   #ifdef USE_AS_STPCPY
>   	add	r3,r11,r8
>   #endif
> @@ -156,11 +307,9 @@ L(tail4):
>   	stxv	32+v0,0(r11)
>   	stxv	32+v1,16(r11)
>   	stxv	32+v2,32(r11)
> -	vctzlsbb r8,v6
> -	addi	r9,r8,1
> -	sldi	r9,r9,56
> -	addi	r11,r11,48
> -	stxvl	32+v3,r11,r9
> +	/* There's a NULL byte.  */
> +	addi    r11,r11,48
> +	STORE_WITH_LEN(v15,v3,r11)
>   #ifdef USE_AS_STPCPY
>   	add	r3,r11,r8
>   #endif
> @@ -168,34 +317,63 @@ L(tail4):
>   
>   	.p2align 4
>   L(tail5):
> -	stxv	32+v0,0(r11)
> -	stxv	32+v1,16(r11)
> -	stxv	32+v2,32(r11)
> -	stxv	32+v3,48(r11)
> -	vctzlsbb r8,v6
> -	addi	r9,r8,1
> -	sldi	r9,r9,56
> -	addi	r11,r11,64
> -	stxvl	32+v4,r11,r9
> +	stxv    32+v0,0(r11)
> +	stxv    32+v1,16(r11)
> +	stxv    32+v2,32(r11)
> +	stxv    32+v3,48(r11)
> +	/* There's a NULL byte.  */
> +	addi    r11,r11,64
> +	STORE_WITH_LEN(v15,v4,r11)
>   #ifdef USE_AS_STPCPY
> -	add	r3,r11,r8
> +	add     r3,r11,r8
>   #endif
>   	blr
>   
>   	.p2align 4
>   L(tail6):
> -	stxv	32+v0,0(r11)
> -	stxv	32+v1,16(r11)
> -	stxv	32+v2,32(r11)
> -	stxv	32+v3,48(r11)
> -	stxv	32+v4,64(r11)
> -	vctzlsbb r8,v6
> -	addi	r9,r8,1
> -	sldi	r9,r9,56
> -	addi	r11,r11,80
> -	stxvl	32+v5,r11,r9
> +	stxv    32+v0,0(r11)
> +	stxv    32+v1,16(r11)
> +	stxv    32+v2,32(r11)
> +	stxv    32+v3,48(r11)
> +	stxv    32+v4,64(r11)
> +	/* There's a NULL byte.  */
> +	addi    r11,r11,80
> +	STORE_WITH_LEN(v15,v5,r11)
>   #ifdef USE_AS_STPCPY
> -	add	r3,r11,r8
> +	add     r3,r11,r8
> +#endif
> +	blr
> +
> +	.p2align 4
> +L(tail7):
> +	stxv    32+v0,0(r11)
> +	stxv    32+v1,16(r11)
> +	stxv    32+v2,32(r11)
> +	stxv    32+v3,48(r11)
> +	stxv    32+v4,64(r11)
> +	stxv    32+v5,80(r11)
> +	/* There's a NULL byte.  */
> +	addi    r11,r11,96
> +	STORE_WITH_LEN(v15,v6,r11)
> +#ifdef USE_AS_STPCPY
> +	add     r3,r11,r8
> +#endif
> +	blr
> +
> +	.p2align 4
> +L(tail8):
> +	stxv    32+v0,0(r11)
> +	stxv    32+v1,16(r11)
> +	stxv    32+v2,32(r11)
> +	stxv    32+v3,48(r11)
> +	stxv    32+v4,64(r11)
> +	stxv    32+v5,80(r11)
> +	stxv    32+v6,96(r11)
> +	/* There's a NULL byte.  */
> +	addi    r11,r11,112
> +	STORE_WITH_LEN(v15,v7,r11)
> +#ifdef USE_AS_STPCPY
> +	add     r3,r11,r8
>   #endif
>   	blr
>
strcpy_power9/10	strcpy_power9_old_version

           align1=1, align2=0, len=1:         1.21 (  4.13%)            1.26    
           align1=0, align2=1, len=1:         1.21 (  4.24%)            1.26    
           align1=0, align2=0, len=1:         1.21 (  4.37%)            1.26    
           align1=1, align2=1, len=1:         1.21 (  9.38%)            1.33    
           align1=1, align2=0, len=2:         1.21 (  9.73%)            1.34    
           align1=0, align2=1, len=2:         1.21 (  9.67%)            1.33    
           align1=0, align2=0, len=2:         1.21 (  9.58%)            1.33    
           align1=1, align2=1, len=2:         1.21 (  9.53%)            1.33    
           align1=1, align2=0, len=4:         1.21 (  9.74%)            1.34    
           align1=0, align2=1, len=4:         1.21 (  9.53%)            1.33    
           align1=0, align2=0, len=4:         1.21 (  9.56%)            1.33    
           align1=1, align2=1, len=4:         1.20 (  9.54%)            1.33    
           align1=1, align2=0, len=8:         1.20 (  9.76%)            1.33    
           align1=0, align2=1, len=8:         1.20 (  9.69%)            1.33    
           align1=0, align2=0, len=8:         1.20 (  9.57%)            1.33    
           align1=1, align2=1, len=8:         1.21 (  9.42%)            1.33    
          align1=1, align2=0, len=16:         1.39 ( -0.03%)            1.39    
          align1=0, align2=1, len=16:         1.39 ( -0.08%)            1.39    
          align1=0, align2=0, len=16:         1.40 ( -0.64%)            1.39    
          align1=1, align2=1, len=16:         1.39 ( -0.29%)            1.39    
          align1=1, align2=0, len=32:         1.67 ( -7.17%)            1.56    
          align1=0, align2=1, len=32:         1.68 ( -7.90%)            1.55    
          align1=0, align2=0, len=32:         1.67 ( -7.81%)            1.55    
          align1=1, align2=1, len=32:         1.67 ( -7.39%)            1.56    
          align1=1, align2=0, len=64:         1.67 (  7.71%)            1.81    
          align1=0, align2=1, len=64:         1.67 (  7.32%)            1.80    
          align1=0, align2=0, len=64:         1.68 (  7.05%)            1.80    
          align1=1, align2=1, len=64:         1.67 (  7.41%)            1.80    
         align1=1, align2=0, len=128:         2.06 ( 11.49%)            2.33    
         align1=0, align2=1, len=128:         2.05 ( 11.38%)            2.32    
         align1=0, align2=0, len=128:         2.06 ( 16.13%)            2.45    
         align1=1, align2=1, len=128:         2.06 ( 15.96%)            2.45    
         align1=1, align2=0, len=256:         3.09 (  5.35%)            3.27    
         align1=0, align2=1, len=256:         3.10 (  4.07%)            3.23    
         align1=0, align2=0, len=256:         3.09 (  4.30%)            3.23    
         align1=1, align2=1, len=256:         3.10 (  4.04%)            3.23    
         align1=1, align2=0, len=512:         5.33 ( 10.32%)            5.94    
         align1=0, align2=1, len=512:         5.32 ( 10.44%)            5.94    
         align1=0, align2=0, len=512:         5.26 (  9.35%)            5.80    
         align1=1, align2=1, len=512:         5.26 (  9.27%)            5.80    
        align1=1, align2=0, len=1024:         9.87 ( 11.70%)           11.18    
        align1=0, align2=1, len=1024:         9.77 ( 13.27%)           11.26    
        align1=0, align2=0, len=1024:         9.64 ( 13.21%)           11.10    
        align1=1, align2=1, len=1024:         9.64 ( 13.05%)           11.09    
        align1=1, align2=0, len=2048:        17.96 ( 17.14%)           21.68    
        align1=0, align2=1, len=2048:        17.71 ( 18.21%)           21.65    
        align1=0, align2=0, len=2048:        17.37 ( 19.37%)           21.54    
        align1=1, align2=1, len=2048:        17.37 ( 19.41%)           21.55    
        align1=1, align2=0, len=4096:        42.29 ( 10.33%)           47.16    
        align1=0, align2=1, len=4096:        42.22 (  9.95%)           46.88    
        align1=0, align2=0, len=4096:        41.59 ( 11.36%)           46.92    
        align1=1, align2=1, len=4096:        41.57 ( 11.89%)           47.18    
        align1=1, align2=0, len=8192:        73.72 ( 17.77%)           89.66    
        align1=0, align2=1, len=8192:        74.86 ( 16.43%)           89.58    
        align1=0, align2=0, len=8192:        73.91 ( 17.16%)           89.22    
        align1=1, align2=1, len=8192:        73.85 ( 17.26%)           89.26    
       align1=1, align2=0, len=16384:       138.67 ( 19.87%)          173.06
       align1=0, align2=1, len=16384:       134.35 ( 22.39%)          173.12
       align1=0, align2=0, len=16384:       135.85 ( 21.30%)          172.62
       align1=1, align2=1, len=16384:       135.39 ( 21.62%)          172.74
       align1=1, align2=0, len=32768:       262.83 ( 23.07%)          341.63
       align1=0, align2=1, len=32768:       266.12 ( 22.17%)          341.91
       align1=0, align2=0, len=32768:       256.77 ( 24.89%)          341.84
       align1=1, align2=1, len=32768:       258.00 ( 24.42%)          341.35
Paul E. Murphy June 26, 2024, 10:15 p.m. UTC | #2
On 6/26/24 9:34 AM, MAHESH BODAPATI wrote:
> This patch modifies the current POWER9 implementation of strcpy and
> stpcpy to optimize it for POWER9/10.
> 
> Since no new POWER10 instructions are used, the original POWER9
> strcpy is modified instead of creating a new implementation for POWER10.
> 
> The changes also affect stpcpy, which uses the same implementation
> with some additional code before returning.
> 
> Improvements compared to POWER9 version:
> 
> Use simple comparisons for the first ~512 bytes
>    The main loop is good for long strings, but comparing 16B each time is
>    better for shorter strings. After aligning the address to 16 bytes, we
>    unroll the loop four times, checking 128 bytes each time. There may be
>    some overlap with the main loop for unaligned strings, but it is better
>    for shorter strings.
> 
> Loop with 64 bytes for longer bytes
>    using 4 consecutive lxv/stxv instructions.
> 
> Showed an average improvement of 13% and the degradations for smaller
> strings are not consistent.
> ---
>   sysdeps/powerpc/powerpc64/le/power9/strcpy.S | 306 +++++++++++++++----
>   1 file changed, 242 insertions(+), 64 deletions(-)
> 
> diff --git a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
> index 603bde1e39..8918dcf90a 100644
> --- a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
> +++ b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
> @@ -42,22 +42,48 @@
>   
>      if USE_AS_STPCPY is defined.
>   
> -   The implementation can load bytes past a null terminator, but only
> -   up to the next 16B boundary, so it never crosses a page.  */
> +   This implementation never reads across a page boundary, but may
> +   read beyond the NUL terminator.  */
>   
> -/* Load quadword at addr+offset to vreg, check for null bytes,
> +/* Load 4 quadwords, merge into one VR for speed and check for NULLs
Minor nit, NULL should be NUL in most cases here, as it refers to the 
specific value of a byte.


> +   and branch to label if NULL is found.  */
> +#define CHECK_64B(offset,addr,label)	  \
> +	lxv     32+v4,(offset+0)(addr);	\
> +	lxv     32+v5,(offset+16)(addr);   \
> +	lxv     32+v6,(offset+32)(addr);   \
> +	lxv     32+v7,(offset+48)(addr);   \
> +	vminub    v14,v4,v5;		\
> +	vminub    v15,v6,v7;		\
> +	vminub    v16,v14,v15;	      \
> +	vcmpequb. v0,v16,v18;	       \
> +	beq     cr6,$+12;		   \
Minor cleanup request: please check the usage of tabs and spaces 
throughout this file. The usage is inconsistent.


> +	li      r7,offset;		  \
> +	b       L(label);		   \
> +	stxv    32+v4,(offset+0)(r11);      \
> +	stxv    32+v5,(offset+16)(r11);     \
> +	stxv    32+v6,(offset+32)(r11);     \
> +	stxv    32+v7,(offset+48)(r11)
> +
> +/* Load quadword at addr+offset to vreg, check for NULL bytes,
>      and branch to label if any are found.  */
> -#define CHECK16(vreg,offset,addr,label) \
> +#define CHECK_16B(vreg,offset,addr,label) \
>   	lxv	vreg+32,offset(addr);	\
> -	vcmpequb. v6,vreg,v18;	\
> +	vcmpequb. v15,vreg,v18;	\
>   	bne	cr6,L(label);
>   
> +/* Store vreg2 with length if NULL is found.  */
> +#define STORE_WITH_LEN(vreg1,vreg2,reg)    \
> +	vctzlsbb r8,vreg1;		 \
> +	addi    r9,r8,1;		   \
> +	sldi    r9,r9,56;		  \
> +	stxvl   32+vreg2,reg,r9;
> +
>   .machine power9
>   ENTRY_TOCLESS (FUNC_NAME, 4)
>   	CALL_MCOUNT 2
>   
>   	vspltisb v18,0		/* Zeroes in v18  */
> -	vspltisb v19,-1 	/* 0xFF bytes in v19  */
> +	vspltisb v19,-1	 /* 0xFF bytes in v19  */
>   
>   	/* Next 16B-aligned address. Prepare address for L(loop).  */
>   	addi	r5,r4,16
> @@ -70,14 +96,11 @@ ENTRY_TOCLESS (FUNC_NAME, 4)
>   	lvsr	v1,0,r4
>   	vperm	v0,v19,v0,v1
>   
> -	vcmpequb. v6,v0,v18	/* 0xff if byte is NULL, 0x00 otherwise  */
> +	vcmpequb. v6,v0,v18	/* 0xff if byte is NULL, 0x00 otherwise.  */
>   	beq	cr6,L(no_null)
>   
> -	/* There's a null byte.  */
> -	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> -	addi	r9,r8,1 	/* Add null byte.  */
> -	sldi	r10,r9,56	/* stxvl wants size in top 8 bits.  */
> -	stxvl	32+v0,r3,r10	/* Partial store  */
> +	/* There's a NULL byte.  */
> +	STORE_WITH_LEN(v6,v0,r3)
>   
>   #ifdef USE_AS_STPCPY
>   	/* stpcpy returns the dest address plus the size not counting the
> @@ -87,17 +110,22 @@ ENTRY_TOCLESS (FUNC_NAME, 4)
>   	blr
>   
>   L(no_null):
> -	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
> -	stxvl	32+v0,r3,r10	/* Partial store  */
> +	sldi	r10,r8,56	/* stxvl wants size in top 8 bits.  */
> +	stxvl	32+v0,r3,r10	/* Partial store.  */
>   
> +/* The main loop is optimized for longer strings(> 512 bytes),
> +   so checking the first bytes in 16B chunks benefits shorter
> +   strings a lot.  */
>   	.p2align 4
> -L(loop):
> -	CHECK16(v0,0,r5,tail1)
> -	CHECK16(v1,16,r5,tail2)
> -	CHECK16(v2,32,r5,tail3)
> -	CHECK16(v3,48,r5,tail4)
> -	CHECK16(v4,64,r5,tail5)
> -	CHECK16(v5,80,r5,tail6)
> +L(aligned):
> +	CHECK_16B(v0,0,r5,tail1)
> +	CHECK_16B(v1,16,r5,tail2)
> +	CHECK_16B(v2,32,r5,tail3)
> +	CHECK_16B(v3,48,r5,tail4)
> +	CHECK_16B(v4,64,r5,tail5)
> +	CHECK_16B(v5,80,r5,tail6)
> +	CHECK_16B(v6,96,r5,tail7)
> +	CHECK_16B(v7,112,r5,tail8)
>   
>   	stxv	32+v0,0(r11)
>   	stxv	32+v1,16(r11)
> @@ -105,18 +133,145 @@ L(loop):
>   	stxv	32+v3,48(r11)
>   	stxv	32+v4,64(r11)
>   	stxv	32+v5,80(r11)
> +	stxv	32+v6,96(r11)
> +	stxv	32+v7,112(r11)
> +
> +	addi	r11,r11,128
> +
> +	CHECK_16B(v0,128,r5,tail1)
> +	CHECK_16B(v1,128+16,r5,tail2)
> +	CHECK_16B(v2,128+32,r5,tail3)
> +	CHECK_16B(v3,128+48,r5,tail4)
> +	CHECK_16B(v4,128+64,r5,tail5)
> +	CHECK_16B(v5,128+80,r5,tail6)
> +	CHECK_16B(v6,128+96,r5,tail7)
> +	CHECK_16B(v7,128+112,r5,tail8)
> +
> +	stxv    32+v0,0(r11)
> +	stxv    32+v1,16(r11)
> +	stxv    32+v2,32(r11)
> +	stxv    32+v3,48(r11)
> +	stxv    32+v4,64(r11)
> +	stxv    32+v5,80(r11)
> +	stxv    32+v6,96(r11)
> +	stxv    32+v7,112(r11)
> +
> +	addi    r11,r11,128
> +
> +	CHECK_16B(v0,256,r5,tail1)
> +	CHECK_16B(v1,256+16,r5,tail2)
> +	CHECK_16B(v2,256+32,r5,tail3)
> +	CHECK_16B(v3,256+48,r5,tail4)
> +	CHECK_16B(v4,256+64,r5,tail5)
> +	CHECK_16B(v5,256+80,r5,tail6)
> +	CHECK_16B(v6,256+96,r5,tail7)
> +	CHECK_16B(v7,256+112,r5,tail8)
> +
> +	stxv    32+v0,0(r11)
> +	stxv    32+v1,16(r11)
> +	stxv    32+v2,32(r11)
> +	stxv    32+v3,48(r11)
> +	stxv    32+v4,64(r11)
> +	stxv    32+v5,80(r11)
> +	stxv    32+v6,96(r11)
> +	stxv    32+v7,112(r11)
> +
> +	addi    r11,r11,128
> +
> +	CHECK_16B(v0,384,r5,tail1)
> +	CHECK_16B(v1,384+16,r5,tail2)
> +	CHECK_16B(v2,384+32,r5,tail3)
> +	CHECK_16B(v3,384+48,r5,tail4)
> +	CHECK_16B(v4,384+64,r5,tail5)
> +	CHECK_16B(v5,384+80,r5,tail6)
> +	CHECK_16B(v6,384+96,r5,tail7)
> +	CHECK_16B(v7,384+112,r5,tail8)
> +
> +	stxv    32+v0,0(r11)
> +	stxv    32+v1,16(r11)
> +	stxv    32+v2,32(r11)
> +	stxv    32+v3,48(r11)
> +	stxv    32+v4,64(r11)
> +	stxv    32+v5,80(r11)
> +	stxv    32+v6,96(r11)
> +	stxv    32+v7,112(r11)
> +
> +	/* Prepare address for the loop.  */
I think this comment can be improved. Maybe, "Align src pointer down to 
a 64B boundary."?


> +	addi    r5,r4,512
> +	clrrdi  r5,r5,6
> +	subf    r7,r4,r5
> +	add     r11,r3,r7
> +
> +/* Switch to a more aggressive approach checking 64B each time.  */
> +	.p2align 5
> +L(strcpy_loop):
> +	CHECK_64B(0,r5,tail_64b)
> +	CHECK_64B(64,r5,tail_64b)
> +	CHECK_64B(128,r5,tail_64b)
> +	CHECK_64B(192,r5,tail_64b)
>   
> -	addi	r5,r5,96
> -	addi	r11,r11,96
> +	CHECK_64B(256,r5,tail_64b)
> +	CHECK_64B(256+64,r5,tail_64b)
> +	CHECK_64B(256+128,r5,tail_64b)
> +	CHECK_64B(256+192,r5,tail_64b)
> +	addi    r5,r5,512
> +	addi    r11,r11,512
> +
> +	b	L(strcpy_loop)
> +
> +	.p2align 5
> +L(tail_64b):
> +	/* OK, we found a NULL byte.  Let's look for it in the current 64-byte
> +	   block and mark it in its corresponding VR.  */
> +	add	r11,r11,r7
> +	vcmpequb. v8,v4,v18
> +	beq	cr6,L(no_null_16B)
> +	/* There's a NULL byte.  */
> +	STORE_WITH_LEN(v8,v4,r11)
> +#ifdef USE_AS_STPCPY
> +	add     r3,r11,r8
> +#endif
> +	blr
> +
> +L(no_null_16B):
> +	stxv    32+v4,0(r11)
> +	vcmpequb. v8,v5,v18
> +	beq     cr6,L(no_null_32B)
> +	/* There's a NULL byte.  */
> +	addi    r11,r11,16
> +	STORE_WITH_LEN(v8,v5,r11)
> +#ifdef USE_AS_STPCPY
> +	add     r3,r11,r8
> +#endif
> +	blr
> +
> +L(no_null_32B):
> +	stxv    32+v5,16(r11)
> +	vcmpequb. v8,v6,v18
> +	beq     cr6,L(no_null_48B)
> +	/* There's a NULL byte.  */
> +	addi    r11,r11,32
> +	STORE_WITH_LEN(v8,v6,r11)
> +#ifdef USE_AS_STPCPY
> +	add     r3,r11,r8
> +#endif
> +	blr
>   
> -	b	L(loop)
> +L(no_null_48B):
> +	stxv    32+v6,32(r11)
> +	vcmpequb. v8,v7,v18;
> +	/* There's a NULL byte.  */
> +	addi    r11,r11,48
> +	STORE_WITH_LEN(v8,v7,r11)
> +#ifdef USE_AS_STPCPY
> +	add     r3,r11,r8
> +#endif
> +	blr
>   
>   	.p2align 4
>   L(tail1):
> -	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> -	addi	r9,r8,1		/* Add null terminator  */
> -	sldi	r9,r9,56	/* stxvl wants size in top 8 bits  */
> -	stxvl	32+v0,r11,r9	/* Partial store  */
> +	/* There's a NULL byte.  */
> +	STORE_WITH_LEN(v15,v0,r11)
>   #ifdef USE_AS_STPCPY
>   	/* stpcpy returns the dest address plus the size not counting the
>   	   final '\0'.  */
This comment is used in only two places where the stpcpy return value is 
computed.  I think this one can be removed, the first one is sufficient.


> @@ -127,11 +282,9 @@ L(tail1):
>   	.p2align 4
>   L(tail2):
>   	stxv	32+v0,0(r11)
> -	vctzlsbb r8,v6
> -	addi	r9,r8,1
> -	sldi	r9,r9,56
> -	addi	r11,r11,16
> -	stxvl	32+v1,r11,r9
> +	/* There's a NULL byte.  */
> +	addi    r11,r11,16
> +	STORE_WITH_LEN(v15,v1,r11)
>   #ifdef USE_AS_STPCPY
>   	add	r3,r11,r8
>   #endif
> @@ -141,11 +294,9 @@ L(tail2):
>   L(tail3):
>   	stxv	32+v0,0(r11)
>   	stxv	32+v1,16(r11)
> -	vctzlsbb r8,v6
> -	addi	r9,r8,1
> -	sldi	r9,r9,56
> -	addi	r11,r11,32
> -	stxvl	32+v2,r11,r9
> +	/* There's a NULL byte.  */
> +	addi    r11,r11,32
> +	STORE_WITH_LEN(v15,v2,r11)
>   #ifdef USE_AS_STPCPY
>   	add	r3,r11,r8
>   #endif
> @@ -156,11 +307,9 @@ L(tail4):
>   	stxv	32+v0,0(r11)
>   	stxv	32+v1,16(r11)
>   	stxv	32+v2,32(r11)
> -	vctzlsbb r8,v6
> -	addi	r9,r8,1
> -	sldi	r9,r9,56
> -	addi	r11,r11,48
> -	stxvl	32+v3,r11,r9
> +	/* There's a NULL byte.  */
> +	addi    r11,r11,48
> +	STORE_WITH_LEN(v15,v3,r11)
>   #ifdef USE_AS_STPCPY
>   	add	r3,r11,r8
>   #endif
> @@ -168,34 +317,63 @@ L(tail4):
>   
>   	.p2align 4
>   L(tail5):
> -	stxv	32+v0,0(r11)
> -	stxv	32+v1,16(r11)
> -	stxv	32+v2,32(r11)
> -	stxv	32+v3,48(r11)
> -	vctzlsbb r8,v6
> -	addi	r9,r8,1
> -	sldi	r9,r9,56
> -	addi	r11,r11,64
> -	stxvl	32+v4,r11,r9
> +	stxv    32+v0,0(r11)
> +	stxv    32+v1,16(r11)
> +	stxv    32+v2,32(r11)
> +	stxv    32+v3,48(r11)
> +	/* There's a NULL byte.  */
> +	addi    r11,r11,64
> +	STORE_WITH_LEN(v15,v4,r11)
>   #ifdef USE_AS_STPCPY
> -	add	r3,r11,r8
> +	add     r3,r11,r8
>   #endif
>   	blr
>   
>   	.p2align 4
>   L(tail6):
> -	stxv	32+v0,0(r11)
> -	stxv	32+v1,16(r11)
> -	stxv	32+v2,32(r11)
> -	stxv	32+v3,48(r11)
> -	stxv	32+v4,64(r11)
> -	vctzlsbb r8,v6
> -	addi	r9,r8,1
> -	sldi	r9,r9,56
> -	addi	r11,r11,80
> -	stxvl	32+v5,r11,r9
> +	stxv    32+v0,0(r11)
> +	stxv    32+v1,16(r11)
> +	stxv    32+v2,32(r11)
> +	stxv    32+v3,48(r11)
> +	stxv    32+v4,64(r11)
> +	/* There's a NULL byte.  */
> +	addi    r11,r11,80
> +	STORE_WITH_LEN(v15,v5,r11)
>   #ifdef USE_AS_STPCPY
> -	add	r3,r11,r8
> +	add     r3,r11,r8
> +#endif
> +	blr
> +
> +	.p2align 4
> +L(tail7):
> +	stxv    32+v0,0(r11)
> +	stxv    32+v1,16(r11)
> +	stxv    32+v2,32(r11)
> +	stxv    32+v3,48(r11)
> +	stxv    32+v4,64(r11)
> +	stxv    32+v5,80(r11)
> +	/* There's a NULL byte.  */
> +	addi    r11,r11,96
> +	STORE_WITH_LEN(v15,v6,r11)
> +#ifdef USE_AS_STPCPY
> +	add     r3,r11,r8
> +#endif
> +	blr
> +
> +	.p2align 4
> +L(tail8):
> +	stxv    32+v0,0(r11)
> +	stxv    32+v1,16(r11)
> +	stxv    32+v2,32(r11)
> +	stxv    32+v3,48(r11)
> +	stxv    32+v4,64(r11)
> +	stxv    32+v5,80(r11)
> +	stxv    32+v6,96(r11)
> +	/* There's a NULL byte.  */
> +	addi    r11,r11,112
> +	STORE_WITH_LEN(v15,v7,r11)
> +#ifdef USE_AS_STPCPY
> +	add     r3,r11,r8
>   #endif
>   	blr
>

One final request for V3, the prefix of the title should be updated to 
"powerpc64le: ...".

With the minor changes requested, this LGTM.
Adhemerval Zanella Netto June 26, 2024, 10:33 p.m. UTC | #3
On 26/06/24 11:34, MAHESH BODAPATI wrote:
> This patch modifies the current POWER9 implementation of strcpy and
> stpcpy to optimize it for POWER9/10.
> 
> Since no new POWER10 instructions are used, the original POWER9
> strcpy is modified instead of creating a new implementation for POWER10.
> 
> The changes also affect stpcpy, which uses the same implementation
> with some additional code before returning.
> 
> Improvements compared to POWER9 version:
> 
> Use simple comparisons for the first ~512 bytes
>   The main loop is good for long strings, but comparing 16B each time is
>   better for shorter strings. After aligning the address to 16 bytes, we
>   unroll the loop four times, checking 128 bytes each time. There may be
>   some overlap with the main loop for unaligned strings, but it is better
>   for shorter strings.
> 
> Loop with 64 bytes for longer bytes
>   using 4 consecutive lxv/stxv instructions.
> 
> Showed an average improvement of 13% and the degradations for smaller
> strings are not consistent.

Other implementations either focus on small sizes or add fast-path code
assuming small sizes are more common than larger ones.  You might reevaluate
whether this implementation is really worthwhile if you are seeing
regressions for smaller sizes.
Andreas Schwab June 27, 2024, 7:50 a.m. UTC | #4
On Jun 26 2024, Paul E Murphy wrote:

> Minor nit, NULL should be NUL in most cases here, as it refers to the
> specific value of a byte.

NUL is the abbreviation for the ASCII Null character.  In ordinary text
it should be referred to as the null character or null byte.
MAHESH BODAPATI June 27, 2024, 9:22 a.m. UTC | #5
On 27/06/24 4:03 am, Adhemerval Zanella Netto wrote:
>
> On 26/06/24 11:34, MAHESH BODAPATI wrote:
>> This patch modifies the current POWER9 implementation of strcpy and
>> stpcpy to optimize it for POWER9/10.
>>
>> Since no new POWER10 instructions are used, the original POWER9
>> strcpy is modified instead of creating a new implementation for POWER10.
>>
>> The changes also affect stpcpy, which uses the same implementation
>> with some additional code before returning.
>>
>> Improvements compared to POWER9 version:
>>
>> Use simple comparisons for the first ~512 bytes
>>    The main loop is good for long strings, but comparing 16B each time is
>>    better for shorter strings. After aligning the address to 16 bytes, we
>>    unroll the loop four times, checking 128 bytes each time. There may be
>>    some overlap with the main loop for unaligned strings, but it is better
>>    for shorter strings.
>>
>> Loop with 64 bytes for longer bytes
>>    using 4 consecutive lxv/stxv instructions.
>>
>> Showed an average improvement of 13% and the degradations for smaller
>> strings are not consistent.
> Other implementations either focus on small sizes or add fast-paths code
> assuming small sizes are more common than larger one.  You might reevaluate
> if this implementation if really worth if you are seeing smaller size
> regressions.

The implementation is exactly the same as the POWER9 version for the first 96
bytes, but the benchmark
results are showing inconsistent improvements/regressions. I will submit
a v3 patch, run
the benchmarks on a different machine, and share the results.
diff mbox series

Patch

diff --git a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
index 603bde1e39..8918dcf90a 100644
--- a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
+++ b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
@@ -42,22 +42,48 @@ 
 
    if USE_AS_STPCPY is defined.
 
-   The implementation can load bytes past a null terminator, but only
-   up to the next 16B boundary, so it never crosses a page.  */
+   This implementation never reads across a page boundary, but may
+   read beyond the NUL terminator.  */
 
-/* Load quadword at addr+offset to vreg, check for null bytes,
+/* Load 4 quadwords, merge into one VR for speed and check for NULLs
+   and branch to label if NULL is found.  */
+#define CHECK_64B(offset,addr,label)	  \
+	lxv     32+v4,(offset+0)(addr);	\
+	lxv     32+v5,(offset+16)(addr);   \
+	lxv     32+v6,(offset+32)(addr);   \
+	lxv     32+v7,(offset+48)(addr);   \
+	vminub    v14,v4,v5;		\
+	vminub    v15,v6,v7;		\
+	vminub    v16,v14,v15;	      \
+	vcmpequb. v0,v16,v18;	       \
+	beq     cr6,$+12;		   \
+	li      r7,offset;		  \
+	b       L(label);		   \
+	stxv    32+v4,(offset+0)(r11);      \
+	stxv    32+v5,(offset+16)(r11);     \
+	stxv    32+v6,(offset+32)(r11);     \
+	stxv    32+v7,(offset+48)(r11)
+
+/* Load quadword at addr+offset to vreg, check for NULL bytes,
    and branch to label if any are found.  */
-#define CHECK16(vreg,offset,addr,label) \
+#define CHECK_16B(vreg,offset,addr,label) \
 	lxv	vreg+32,offset(addr);	\
-	vcmpequb. v6,vreg,v18;	\
+	vcmpequb. v15,vreg,v18;	\
 	bne	cr6,L(label);
 
+/* Store vreg2 with length if NULL is found.  */
+#define STORE_WITH_LEN(vreg1,vreg2,reg)    \
+	vctzlsbb r8,vreg1;		 \
+	addi    r9,r8,1;		   \
+	sldi    r9,r9,56;		  \
+	stxvl   32+vreg2,reg,r9;
+
 .machine power9
 ENTRY_TOCLESS (FUNC_NAME, 4)
 	CALL_MCOUNT 2
 
 	vspltisb v18,0		/* Zeroes in v18  */
-	vspltisb v19,-1 	/* 0xFF bytes in v19  */
+	vspltisb v19,-1	 /* 0xFF bytes in v19  */
 
 	/* Next 16B-aligned address. Prepare address for L(loop).  */
 	addi	r5,r4,16
@@ -70,14 +96,11 @@  ENTRY_TOCLESS (FUNC_NAME, 4)
 	lvsr	v1,0,r4
 	vperm	v0,v19,v0,v1
 
-	vcmpequb. v6,v0,v18	/* 0xff if byte is NULL, 0x00 otherwise  */
+	vcmpequb. v6,v0,v18	/* 0xff if byte is NULL, 0x00 otherwise.  */
 	beq	cr6,L(no_null)
 
-	/* There's a null byte.  */
-	vctzlsbb r8,v6		/* Number of trailing zeroes  */
-	addi	r9,r8,1 	/* Add null byte.  */
-	sldi	r10,r9,56	/* stxvl wants size in top 8 bits.  */
-	stxvl	32+v0,r3,r10	/* Partial store  */
+	/* There's a NULL byte.  */
+	STORE_WITH_LEN(v6,v0,r3)
 
 #ifdef USE_AS_STPCPY
 	/* stpcpy returns the dest address plus the size not counting the
@@ -87,17 +110,22 @@  ENTRY_TOCLESS (FUNC_NAME, 4)
 	blr
 
 L(no_null):
-	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
-	stxvl	32+v0,r3,r10	/* Partial store  */
+	sldi	r10,r8,56	/* stxvl wants size in top 8 bits.  */
+	stxvl	32+v0,r3,r10	/* Partial store.  */
 
+/* The main loop is optimized for longer strings(> 512 bytes),
+   so checking the first bytes in 16B chunks benefits shorter
+   strings a lot.  */
 	.p2align 4
-L(loop):
-	CHECK16(v0,0,r5,tail1)
-	CHECK16(v1,16,r5,tail2)
-	CHECK16(v2,32,r5,tail3)
-	CHECK16(v3,48,r5,tail4)
-	CHECK16(v4,64,r5,tail5)
-	CHECK16(v5,80,r5,tail6)
+L(aligned):
+	CHECK_16B(v0,0,r5,tail1)
+	CHECK_16B(v1,16,r5,tail2)
+	CHECK_16B(v2,32,r5,tail3)
+	CHECK_16B(v3,48,r5,tail4)
+	CHECK_16B(v4,64,r5,tail5)
+	CHECK_16B(v5,80,r5,tail6)
+	CHECK_16B(v6,96,r5,tail7)
+	CHECK_16B(v7,112,r5,tail8)
 
 	stxv	32+v0,0(r11)
 	stxv	32+v1,16(r11)
@@ -105,18 +133,145 @@  L(loop):
 	stxv	32+v3,48(r11)
 	stxv	32+v4,64(r11)
 	stxv	32+v5,80(r11)
+	stxv	32+v6,96(r11)
+	stxv	32+v7,112(r11)
+
+	addi	r11,r11,128
+
+	CHECK_16B(v0,128,r5,tail1)
+	CHECK_16B(v1,128+16,r5,tail2)
+	CHECK_16B(v2,128+32,r5,tail3)
+	CHECK_16B(v3,128+48,r5,tail4)
+	CHECK_16B(v4,128+64,r5,tail5)
+	CHECK_16B(v5,128+80,r5,tail6)
+	CHECK_16B(v6,128+96,r5,tail7)
+	CHECK_16B(v7,128+112,r5,tail8)
+
+	stxv    32+v0,0(r11)
+	stxv    32+v1,16(r11)
+	stxv    32+v2,32(r11)
+	stxv    32+v3,48(r11)
+	stxv    32+v4,64(r11)
+	stxv    32+v5,80(r11)
+	stxv    32+v6,96(r11)
+	stxv    32+v7,112(r11)
+
+	addi    r11,r11,128
+
+	CHECK_16B(v0,256,r5,tail1)
+	CHECK_16B(v1,256+16,r5,tail2)
+	CHECK_16B(v2,256+32,r5,tail3)
+	CHECK_16B(v3,256+48,r5,tail4)
+	CHECK_16B(v4,256+64,r5,tail5)
+	CHECK_16B(v5,256+80,r5,tail6)
+	CHECK_16B(v6,256+96,r5,tail7)
+	CHECK_16B(v7,256+112,r5,tail8)
+
+	stxv    32+v0,0(r11)
+	stxv    32+v1,16(r11)
+	stxv    32+v2,32(r11)
+	stxv    32+v3,48(r11)
+	stxv    32+v4,64(r11)
+	stxv    32+v5,80(r11)
+	stxv    32+v6,96(r11)
+	stxv    32+v7,112(r11)
+
+	addi    r11,r11,128
+
+	CHECK_16B(v0,384,r5,tail1)
+	CHECK_16B(v1,384+16,r5,tail2)
+	CHECK_16B(v2,384+32,r5,tail3)
+	CHECK_16B(v3,384+48,r5,tail4)
+	CHECK_16B(v4,384+64,r5,tail5)
+	CHECK_16B(v5,384+80,r5,tail6)
+	CHECK_16B(v6,384+96,r5,tail7)
+	CHECK_16B(v7,384+112,r5,tail8)
+
+	stxv    32+v0,0(r11)
+	stxv    32+v1,16(r11)
+	stxv    32+v2,32(r11)
+	stxv    32+v3,48(r11)
+	stxv    32+v4,64(r11)
+	stxv    32+v5,80(r11)
+	stxv    32+v6,96(r11)
+	stxv    32+v7,112(r11)
+
+	/* Prepare address for the loop.  */
+	addi    r5,r4,512
+	clrrdi  r5,r5,6
+	subf    r7,r4,r5
+	add     r11,r3,r7
+
+/* Switch to a more aggressive approach checking 64B each time.  */
+	.p2align 5
+L(strcpy_loop):
+	CHECK_64B(0,r5,tail_64b)
+	CHECK_64B(64,r5,tail_64b)
+	CHECK_64B(128,r5,tail_64b)
+	CHECK_64B(192,r5,tail_64b)
 
-	addi	r5,r5,96
-	addi	r11,r11,96
+	CHECK_64B(256,r5,tail_64b)
+	CHECK_64B(256+64,r5,tail_64b)
+	CHECK_64B(256+128,r5,tail_64b)
+	CHECK_64B(256+192,r5,tail_64b)
+	addi    r5,r5,512
+	addi    r11,r11,512
+
+	b	L(strcpy_loop)
+
+	.p2align 5
+L(tail_64b):
+	/* OK, we found a NULL byte.  Let's look for it in the current 64-byte
+	   block and mark it in its corresponding VR.  */
+	add	r11,r11,r7
+	vcmpequb. v8,v4,v18
+	beq	cr6,L(no_null_16B)
+	/* There's a NULL byte.  */
+	STORE_WITH_LEN(v8,v4,r11)
+#ifdef USE_AS_STPCPY
+	add     r3,r11,r8
+#endif
+	blr
+
+L(no_null_16B):
+	stxv    32+v4,0(r11)
+	vcmpequb. v8,v5,v18
+	beq     cr6,L(no_null_32B)
+	/* There's a NULL byte.  */
+	addi    r11,r11,16
+	STORE_WITH_LEN(v8,v5,r11)
+#ifdef USE_AS_STPCPY
+	add     r3,r11,r8
+#endif
+	blr
+
+L(no_null_32B):
+	stxv    32+v5,16(r11)
+	vcmpequb. v8,v6,v18
+	beq     cr6,L(no_null_48B)
+	/* There's a NULL byte.  */
+	addi    r11,r11,32
+	STORE_WITH_LEN(v8,v6,r11)
+#ifdef USE_AS_STPCPY
+	add     r3,r11,r8
+#endif
+	blr
 
-	b	L(loop)
+L(no_null_48B):
+	stxv    32+v6,32(r11)
+	vcmpequb. v8,v7,v18;
+	/* There's a NULL byte.  */
+	addi    r11,r11,48
+	STORE_WITH_LEN(v8,v7,r11)
+#ifdef USE_AS_STPCPY
+	add     r3,r11,r8
+#endif
+	blr
 
 	.p2align 4
 L(tail1):
-	vctzlsbb r8,v6		/* Number of trailing zeroes  */
-	addi	r9,r8,1		/* Add null terminator  */
-	sldi	r9,r9,56	/* stxvl wants size in top 8 bits  */
-	stxvl	32+v0,r11,r9	/* Partial store  */
+	/* There's a NULL byte.  */
+	STORE_WITH_LEN(v15,v0,r11)
 #ifdef USE_AS_STPCPY
 	/* stpcpy returns the dest address plus the size not counting the
 	   final '\0'.  */
@@ -127,11 +282,9 @@  L(tail1):
 	.p2align 4
 L(tail2):
 	stxv	32+v0,0(r11)
-	vctzlsbb r8,v6
-	addi	r9,r8,1
-	sldi	r9,r9,56
-	addi	r11,r11,16
-	stxvl	32+v1,r11,r9
+	/* There's a NULL byte.  */
+	addi    r11,r11,16
+	STORE_WITH_LEN(v15,v1,r11)
 #ifdef USE_AS_STPCPY
 	add	r3,r11,r8
 #endif
@@ -141,11 +294,9 @@  L(tail2):
 L(tail3):
 	stxv	32+v0,0(r11)
 	stxv	32+v1,16(r11)
-	vctzlsbb r8,v6
-	addi	r9,r8,1
-	sldi	r9,r9,56
-	addi	r11,r11,32
-	stxvl	32+v2,r11,r9
+	/* There's a NULL byte.  */
+	addi    r11,r11,32
+	STORE_WITH_LEN(v15,v2,r11)
 #ifdef USE_AS_STPCPY
 	add	r3,r11,r8
 #endif
@@ -156,11 +307,9 @@  L(tail4):
 	stxv	32+v0,0(r11)
 	stxv	32+v1,16(r11)
 	stxv	32+v2,32(r11)
-	vctzlsbb r8,v6
-	addi	r9,r8,1
-	sldi	r9,r9,56
-	addi	r11,r11,48
-	stxvl	32+v3,r11,r9
+	/* There's a NULL byte.  */
+	addi    r11,r11,48
+	STORE_WITH_LEN(v15,v3,r11)
 #ifdef USE_AS_STPCPY
 	add	r3,r11,r8
 #endif
@@ -168,34 +317,63 @@  L(tail4):
 
 	.p2align 4
 L(tail5):
-	stxv	32+v0,0(r11)
-	stxv	32+v1,16(r11)
-	stxv	32+v2,32(r11)
-	stxv	32+v3,48(r11)
-	vctzlsbb r8,v6
-	addi	r9,r8,1
-	sldi	r9,r9,56
-	addi	r11,r11,64
-	stxvl	32+v4,r11,r9
+	stxv    32+v0,0(r11)
+	stxv    32+v1,16(r11)
+	stxv    32+v2,32(r11)
+	stxv    32+v3,48(r11)
+	/* There's a NULL byte.  */
+	addi    r11,r11,64
+	STORE_WITH_LEN(v15,v4,r11)
 #ifdef USE_AS_STPCPY
-	add	r3,r11,r8
+	add     r3,r11,r8
 #endif
 	blr
 
 	.p2align 4
 L(tail6):
-	stxv	32+v0,0(r11)
-	stxv	32+v1,16(r11)
-	stxv	32+v2,32(r11)
-	stxv	32+v3,48(r11)
-	stxv	32+v4,64(r11)
-	vctzlsbb r8,v6
-	addi	r9,r8,1
-	sldi	r9,r9,56
-	addi	r11,r11,80
-	stxvl	32+v5,r11,r9
+	stxv    32+v0,0(r11)
+	stxv    32+v1,16(r11)
+	stxv    32+v2,32(r11)
+	stxv    32+v3,48(r11)
+	stxv    32+v4,64(r11)
+	/* There's a NULL byte.  */
+	addi    r11,r11,80
+	STORE_WITH_LEN(v15,v5,r11)
 #ifdef USE_AS_STPCPY
-	add	r3,r11,r8
+	add     r3,r11,r8
+#endif
+	blr
+
+	.p2align 4
+L(tail7):
+	stxv    32+v0,0(r11)
+	stxv    32+v1,16(r11)
+	stxv    32+v2,32(r11)
+	stxv    32+v3,48(r11)
+	stxv    32+v4,64(r11)
+	stxv    32+v5,80(r11)
+	/* There's a NULL byte.  */
+	addi    r11,r11,96
+	STORE_WITH_LEN(v15,v6,r11)
+#ifdef USE_AS_STPCPY
+	add     r3,r11,r8
+#endif
+	blr
+
+	.p2align 4
+L(tail8):
+	stxv    32+v0,0(r11)
+	stxv    32+v1,16(r11)
+	stxv    32+v2,32(r11)
+	stxv    32+v3,48(r11)
+	stxv    32+v4,64(r11)
+	stxv    32+v5,80(r11)
+	stxv    32+v6,96(r11)
+	/* There's a NULL byte.  */
+	addi    r11,r11,112
+	STORE_WITH_LEN(v15,v7,r11)
+#ifdef USE_AS_STPCPY
+	add     r3,r11,r8
 #endif
 	blr