[v4] powerpc64: optimize strcpy and stpcpy for POWER9/10

Message ID 20240708094906.1120118-2-bmahi496@linux.ibm.com
State New
Series [v4] powerpc64: optimize strcpy and stpcpy for POWER9/10

Commit Message

MAHESH BODAPATI July 8, 2024, 9:49 a.m. UTC
From: Mahesh Bodapati <bmahi496@linux.ibm.com>

This patch modifies the current POWER9 implementation of strcpy and
stpcpy to optimize them for POWER9/10.

Since no new POWER10 instructions are used, the original POWER9
strcpy is modified instead of creating a new implementation for POWER10.

The changes also affect stpcpy, which uses the same implementation
with some additional code before returning.

Improvements compared to POWER9 version:

Use simple comparisons for the first ~512 bytes
  The main loop is best for long strings, but checking 16B at a time is
  faster for shorter strings. After aligning the address to 16 bytes,
  the 16B checks are grouped into four unrolled blocks of 128 bytes each,
  covering 512 bytes before the main loop is entered. There may be some
  overlap with the main loop for unaligned strings, but it is still a
  win for shorter strings.

Loop over 64 bytes at a time for longer strings,
  using 4 consecutive lxv/stxv instructions.

Benchmarks showed an average improvement of 13%.  A rough C sketch of
this two-phase structure is shown below.
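
As an illustration only, here is a rough C sketch of the two-phase
control flow (toy_strcpy and nul_in_16b are invented names; the scalar
probe stands in for the lxv/vcmpequb./vminub sequences, and the sketch
assumes a 16B-aligned source, skipping the unaligned-head handling of
the real code):

#include <stddef.h>
#include <string.h>

/* Scalar stand-in for one 16B vector probe (lxv + vcmpequb.).  Unlike
   the real vector load, it stops at the first NUL instead of reading
   the whole 16B block.  Returns the NUL's index in the block, or -1.  */
static int
nul_in_16b (const unsigned char *s)
{
  for (int i = 0; i < 16; i++)
    if (s[i] == '\0')
      return i;
  return -1;
}

char *
toy_strcpy (char *dst, const char *src)
{
  const unsigned char *s = (const unsigned char *) src;
  unsigned char *d = (unsigned char *) dst;
  size_t off = 0;

  /* Phase 1: simple 16B checks for the first 512 bytes.  */
  for (; off < 512; off += 16)
    {
      int n = nul_in_16b (s + off);
      if (n >= 0)
        {
          memcpy (d + off, s + off, (size_t) n + 1); /* stxvl-style partial store.  */
          return dst;
        }
      memcpy (d + off, s + off, 16);                 /* Full 16B store (stxv).  */
    }

  /* Phase 2: 64B per iteration.  The asm merges four 16B vectors with
     vminub so a single vcmpequb. tests all 64 bytes; only on a hit does
     it re-check each 16B chunk to locate the NUL.  */
  for (;; off += 64)
    {
      int blk, n = -1;
      for (blk = 0; blk < 4 && n < 0; blk++)
        n = nul_in_16b (s + off + 16 * blk);
      if (n >= 0)
        {
          memcpy (d + off, s + off, 16 * (size_t) (blk - 1) + n + 1);
          return dst;
        }
      memcpy (d + off, s + off, 64);                 /* Four stxv stores.  */
    }
}

The real code additionally guarantees that no load crosses a page
boundary, which matters because its vector loads may read past the
terminator.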
---
 Changes v3 -> v4:
 - Used a tab after the opcode.

 sysdeps/powerpc/powerpc64/le/power9/strcpy.S | 276 +++++++++++++++----
 1 file changed, 223 insertions(+), 53 deletions(-)

Comments

Andreas K. Huettel July 9, 2024, 10:22 a.m. UTC | #1
Do you still want this in 2.40?

(If not urgent, I'd prefer to have it go into master after the release and 
backport it somewhat later.)


MAHESH BODAPATI July 9, 2024, 4:16 p.m. UTC | #2
On 09/07/24 3:52 pm, Andreas K. Huettel wrote:
> Do you still want this in 2.40?
>
> (If not urgent, I'd prefer to have it go into master after the release and
> backport it somewhat later.)


Sure, we can merge this patch after the 2.40 release.


Paul E Murphy July 18, 2024, 8:03 p.m. UTC | #3
Thank you for fixing up the whitespace. This version LGTM.

Reviewed-by: Paul E. Murphy <murphyp@linux.ibm.com>

Patch

diff --git a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
index 603bde1e39..2f50625a19 100644
--- a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
+++ b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
@@ -42,22 +42,48 @@
 
    if USE_AS_STPCPY is defined.
 
-   The implementation can load bytes past a null terminator, but only
-   up to the next 16B boundary, so it never crosses a page.  */
+   This implementation never reads across a page boundary, but may
+   read beyond the NUL terminator.  */
 
-/* Load quadword at addr+offset to vreg, check for null bytes,
+/* Load 4 quadwords, merge into one VR for speed and check for NUL
+   and branch to label if NUL is found.  */
+#define CHECK_64B(offset,addr,label)		\
+	lxv	32+v4,(offset+0)(addr);		\
+	lxv	32+v5,(offset+16)(addr);	\
+	lxv	32+v6,(offset+32)(addr);	\
+	lxv	32+v7,(offset+48)(addr);	\
+	vminub	v14,v4,v5;			\
+	vminub	v15,v6,v7;			\
+	vminub	v16,v14,v15;			\
+	vcmpequb.	v0,v16,v18;		\
+	beq	cr6,$+12;			\
+	li	r7,offset;			\
+	b	L(label);			\
+	stxv	32+v4,(offset+0)(r11);		\
+	stxv	32+v5,(offset+16)(r11);		\
+	stxv	32+v6,(offset+32)(r11);		\
+	stxv	32+v7,(offset+48)(r11)
+
+/* Load quadword at addr+offset to vreg, check for NUL bytes,
    and branch to label if any are found.  */
-#define CHECK16(vreg,offset,addr,label) \
-	lxv	vreg+32,offset(addr);	\
-	vcmpequb. v6,vreg,v18;	\
+#define CHECK_16B(vreg,offset,addr,label)	\
+	lxv	vreg+32,offset(addr);		\
+	vcmpequb.	v15,vreg,v18;		\
 	bne	cr6,L(label);
 
-.machine power9
+/* Store vreg2 with length if NUL is found.  */
+#define STORE_WITH_LEN(vreg1,vreg2,reg)	\
+	vctzlsbb	r8,vreg1;		\
+	addi	r9,r8,1;			\
+	sldi	r9,r9,56;			\
+	stxvl	32+vreg2,reg,r9;
+
+.machine	power9
 ENTRY_TOCLESS (FUNC_NAME, 4)
 	CALL_MCOUNT 2
 
-	vspltisb v18,0		/* Zeroes in v18  */
-	vspltisb v19,-1 	/* 0xFF bytes in v19  */
+	vspltisb	v18,0		/* Zeroes in v18.  */
+	vspltisb	v19,-1		/* 0xFF bytes in v19.  */
 
 	/* Next 16B-aligned address. Prepare address for L(loop).  */
 	addi	r5,r4,16
@@ -70,14 +96,11 @@ ENTRY_TOCLESS (FUNC_NAME, 4)
 	lvsr	v1,0,r4
 	vperm	v0,v19,v0,v1
 
-	vcmpequb. v6,v0,v18	/* 0xff if byte is NULL, 0x00 otherwise  */
+	vcmpequb.	v6,v0,v18	/* 0xff if byte is NUL, 0x00 otherwise.  */
 	beq	cr6,L(no_null)
 
-	/* There's a null byte.  */
-	vctzlsbb r8,v6		/* Number of trailing zeroes  */
-	addi	r9,r8,1 	/* Add null byte.  */
-	sldi	r10,r9,56	/* stxvl wants size in top 8 bits.  */
-	stxvl	32+v0,r3,r10	/* Partial store  */
+	/* There's a NUL byte.  */
+	STORE_WITH_LEN(v6,v0,r3)
 
 #ifdef USE_AS_STPCPY
 	/* stpcpy returns the dest address plus the size not counting the
@@ -87,17 +110,22 @@ ENTRY_TOCLESS (FUNC_NAME, 4)
 	blr
 
 L(no_null):
-	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
-	stxvl	32+v0,r3,r10	/* Partial store  */
+	sldi	r10,r8,56	/* stxvl wants size in top 8 bits.  */
+	stxvl	32+v0,r3,r10	/* Partial store.  */
 
+/* The main loop is optimized for longer strings(> 512 bytes),
+   so checking the first bytes in 16B chunks benefits shorter
+   strings a lot.  */
 	.p2align 4
-L(loop):
-	CHECK16(v0,0,r5,tail1)
-	CHECK16(v1,16,r5,tail2)
-	CHECK16(v2,32,r5,tail3)
-	CHECK16(v3,48,r5,tail4)
-	CHECK16(v4,64,r5,tail5)
-	CHECK16(v5,80,r5,tail6)
+L(aligned):
+	CHECK_16B(v0,0,r5,tail1)
+	CHECK_16B(v1,16,r5,tail2)
+	CHECK_16B(v2,32,r5,tail3)
+	CHECK_16B(v3,48,r5,tail4)
+	CHECK_16B(v4,64,r5,tail5)
+	CHECK_16B(v5,80,r5,tail6)
+	CHECK_16B(v6,96,r5,tail7)
+	CHECK_16B(v7,112,r5,tail8)
 
 	stxv	32+v0,0(r11)
 	stxv	32+v1,16(r11)
@@ -105,21 +133,146 @@ L(loop):
 	stxv	32+v3,48(r11)
 	stxv	32+v4,64(r11)
 	stxv	32+v5,80(r11)
+	stxv	32+v6,96(r11)
+	stxv	32+v7,112(r11)
 
-	addi	r5,r5,96
-	addi	r11,r11,96
+	addi	r11,r11,128
+
+	CHECK_16B(v0,128,r5,tail1)
+	CHECK_16B(v1,128+16,r5,tail2)
+	CHECK_16B(v2,128+32,r5,tail3)
+	CHECK_16B(v3,128+48,r5,tail4)
+	CHECK_16B(v4,128+64,r5,tail5)
+	CHECK_16B(v5,128+80,r5,tail6)
+	CHECK_16B(v6,128+96,r5,tail7)
+	CHECK_16B(v7,128+112,r5,tail8)
+
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	stxv	32+v2,32(r11)
+	stxv	32+v3,48(r11)
+	stxv	32+v4,64(r11)
+	stxv	32+v5,80(r11)
+	stxv	32+v6,96(r11)
+	stxv	32+v7,112(r11)
+
+	addi	r11,r11,128
+
+	CHECK_16B(v0,256,r5,tail1)
+	CHECK_16B(v1,256+16,r5,tail2)
+	CHECK_16B(v2,256+32,r5,tail3)
+	CHECK_16B(v3,256+48,r5,tail4)
+	CHECK_16B(v4,256+64,r5,tail5)
+	CHECK_16B(v5,256+80,r5,tail6)
+	CHECK_16B(v6,256+96,r5,tail7)
+	CHECK_16B(v7,256+112,r5,tail8)
+
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	stxv	32+v2,32(r11)
+	stxv	32+v3,48(r11)
+	stxv	32+v4,64(r11)
+	stxv	32+v5,80(r11)
+	stxv	32+v6,96(r11)
+	stxv	32+v7,112(r11)
+
+	addi	r11,r11,128
+
+	CHECK_16B(v0,384,r5,tail1)
+	CHECK_16B(v1,384+16,r5,tail2)
+	CHECK_16B(v2,384+32,r5,tail3)
+	CHECK_16B(v3,384+48,r5,tail4)
+	CHECK_16B(v4,384+64,r5,tail5)
+	CHECK_16B(v5,384+80,r5,tail6)
+	CHECK_16B(v6,384+96,r5,tail7)
+	CHECK_16B(v7,384+112,r5,tail8)
+
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	stxv	32+v2,32(r11)
+	stxv	32+v3,48(r11)
+	stxv	32+v4,64(r11)
+	stxv	32+v5,80(r11)
+	stxv	32+v6,96(r11)
+	stxv	32+v7,112(r11)
+
+	/* Align src pointer down to a 64B boundary.  */
+	addi	r5,r4,512
+	clrrdi	r5,r5,6
+	subf	r7,r4,r5
+	add	r11,r3,r7
+
+/* Switch to a more aggressive approach checking 64B each time.  */
+	.p2align 5
+L(strcpy_loop):
+	CHECK_64B(0,r5,tail_64b)
+	CHECK_64B(64,r5,tail_64b)
+	CHECK_64B(128,r5,tail_64b)
+	CHECK_64B(192,r5,tail_64b)
+
+	CHECK_64B(256,r5,tail_64b)
+	CHECK_64B(256+64,r5,tail_64b)
+	CHECK_64B(256+128,r5,tail_64b)
+	CHECK_64B(256+192,r5,tail_64b)
+	addi	r5,r5,512
+	addi	r11,r11,512
+
+	b	L(strcpy_loop)
+
+	.p2align 5
+L(tail_64b):
+	/* OK, we found a NUL byte.  Let's look for it in the current 64-byte
+	   block and mark it in its corresponding VR.  */
+	add	r11,r11,r7
+	vcmpequb.	v8,v4,v18
+	beq	cr6,L(no_null_16B)
+	/* There's a NUL byte.  */
+	STORE_WITH_LEN(v8,v4,r11)
+#ifdef USE_AS_STPCPY
+	add	r3,r11,r8
+#endif
+	blr
+
+L(no_null_16B):
+	stxv	32+v4,0(r11)
+	vcmpequb.	v8,v5,v18
+	beq	cr6,L(no_null_32B)
+	/* There's a NUL byte.  */
+	addi	r11,r11,16
+	STORE_WITH_LEN(v8,v5,r11)
+#ifdef USE_AS_STPCPY
+	add	r3,r11,r8
+#endif
+	blr
 
-	b	L(loop)
+L(no_null_32B):
+	stxv	32+v5,16(r11)
+	vcmpequb.	v8,v6,v18
+	beq	cr6,L(no_null_48B)
+	/* There's a NUL byte.  */
+	addi	r11,r11,32
+	STORE_WITH_LEN(v8,v6,r11)
+#ifdef USE_AS_STPCPY
+	add	r3,r11,r8
+#endif
+	blr
+
+L(no_null_48B):
+	stxv	32+v6,32(r11)
+	vcmpequb.	v8,v7,v18;
+	/* There's a NUL byte.  */
+	addi	r11,r11,48
+	STORE_WITH_LEN(v8,v7,r11)
+#ifdef USE_AS_STPCPY
+	add	r3,r11,r8
+#endif
+	blr
 
 	.p2align 4
 L(tail1):
-	vctzlsbb r8,v6		/* Number of trailing zeroes  */
-	addi	r9,r8,1		/* Add null terminator  */
-	sldi	r9,r9,56	/* stxvl wants size in top 8 bits  */
-	stxvl	32+v0,r11,r9	/* Partial store  */
+	/* There's a NUL byte.  */
+	STORE_WITH_LEN(v15,v0,r11)
 #ifdef USE_AS_STPCPY
-	/* stpcpy returns the dest address plus the size not counting the
-	   final '\0'.  */
 	add	r3,r11,r8
 #endif
 	blr
@@ -127,11 +280,9 @@ L(tail1):
 	.p2align 4
 L(tail2):
 	stxv	32+v0,0(r11)
-	vctzlsbb r8,v6
-	addi	r9,r8,1
-	sldi	r9,r9,56
+	/* There's a NUL byte.  */
 	addi	r11,r11,16
-	stxvl	32+v1,r11,r9
+	STORE_WITH_LEN(v15,v1,r11)
 #ifdef USE_AS_STPCPY
 	add	r3,r11,r8
 #endif
@@ -141,11 +292,8 @@ L(tail2):
 L(tail3):
 	stxv	32+v0,0(r11)
 	stxv	32+v1,16(r11)
-	vctzlsbb r8,v6
-	addi	r9,r8,1
-	sldi	r9,r9,56
 	addi	r11,r11,32
-	stxvl	32+v2,r11,r9
+	STORE_WITH_LEN(v15,v2,r11)
 #ifdef USE_AS_STPCPY
 	add	r3,r11,r8
 #endif
@@ -156,11 +304,8 @@ L(tail4):
 	stxv	32+v0,0(r11)
 	stxv	32+v1,16(r11)
 	stxv	32+v2,32(r11)
-	vctzlsbb r8,v6
-	addi	r9,r8,1
-	sldi	r9,r9,56
 	addi	r11,r11,48
-	stxvl	32+v3,r11,r9
+	STORE_WITH_LEN(v15,v3,r11)
 #ifdef USE_AS_STPCPY
 	add	r3,r11,r8
 #endif
@@ -172,11 +317,8 @@ L(tail5):
 	stxv	32+v1,16(r11)
 	stxv	32+v2,32(r11)
 	stxv	32+v3,48(r11)
-	vctzlsbb r8,v6
-	addi	r9,r8,1
-	sldi	r9,r9,56
 	addi	r11,r11,64
-	stxvl	32+v4,r11,r9
+	STORE_WITH_LEN(v15,v4,r11)
 #ifdef USE_AS_STPCPY
 	add	r3,r11,r8
 #endif
@@ -189,11 +331,39 @@ L(tail6):
 	stxv	32+v2,32(r11)
 	stxv	32+v3,48(r11)
 	stxv	32+v4,64(r11)
-	vctzlsbb r8,v6
-	addi	r9,r8,1
-	sldi	r9,r9,56
 	addi	r11,r11,80
-	stxvl	32+v5,r11,r9
+	STORE_WITH_LEN(v15,v5,r11)
+#ifdef USE_AS_STPCPY
+	add	r3,r11,r8
+#endif
+	blr
+
+	.p2align 4
+L(tail7):
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	stxv	32+v2,32(r11)
+	stxv	32+v3,48(r11)
+	stxv	32+v4,64(r11)
+	stxv	32+v5,80(r11)
+	addi	r11,r11,96
+	STORE_WITH_LEN(v15,v6,r11)
+#ifdef USE_AS_STPCPY
+	add	r3,r11,r8
+#endif
+	blr
+
+	.p2align 4
+L(tail8):
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	stxv	32+v2,32(r11)
+	stxv	32+v3,48(r11)
+	stxv	32+v4,64(r11)
+	stxv	32+v5,80(r11)
+	stxv	32+v6,96(r11)
+	addi	r11,r11,112
+	STORE_WITH_LEN(v15,v7,r11)
 #ifdef USE_AS_STPCPY
 	add	r3,r11,r8
 #endif
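
For reference, this is the stpcpy return-value convention that the
"add r3,r11,r8" in the USE_AS_STPCPY tails computes (an illustrative
test, not part of the patch):

#include <assert.h>
#include <string.h>

int
main (void)
{
  char buf[32];

  /* stpcpy returns a pointer to the copied NUL terminator, i.e.
     dest + strlen (src), while strcpy returns dest itself.  In the
     tails above, r11 points at the current chunk in dest and r8 holds
     the NUL's offset within it (from vctzlsbb), so r11 + r8 is exactly
     this address.  */
  char *end = stpcpy (buf, "power");
  assert (end == buf + 5 && *end == '\0');
  assert (strcpy (buf, "power") == buf);
  return 0;
}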