
[v1] x86/string: Fixup alignment of main loop in str{n}cmp-evex [BZ #32212]

Message ID 20240927225010.2846563-1-goldstein.w.n@gmail.com
State New
Series [v1] x86/string: Fixup alignment of main loop in str{n}cmp-evex [BZ #32212]

Commit Message

Noah Goldstein Sept. 27, 2024, 10:50 p.m. UTC
The loop should be aligned to 32 bytes so that it can ideally run out
of the DSB. This is particularly important on Skylake-Server, where
deficiencies in its DSB implementation make it prone to not being
able to run loops out of the DSB.

For example, running strcmp-evex on a 200MB string:

32-byte aligned loop:
    - 43,399,578,766      idq.dsb_uops
not 32-byte aligned loop:
    - 6,060,139,704       idq.dsb_uops

This results in a 25% performance degradation for the non-aligned
version.

The fix is simply to ensure that the code layout keeps the loop
aligned (which was previously the case but was accidentally dropped
in 84e7c46df).
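
As an illustration only (a hypothetical sum_bytes routine, not the glibc
code itself), the idiom looks roughly like this: over-align the function
entry so that little padding is needed in front of the hot loop, then
32-byte align the loop itself:

	.text
	.globl	sum_bytes
	.type	sum_bytes, @function
	/* Over-align the entry so the padding needed before the hot
	   loop stays small.  */
	.align	64
	/* Hypothetical: unsigned int sum_bytes (const char *buf, size_t len);
	   buf in %rdi, len in %rsi.  */
sum_bytes:
	xorl	%eax, %eax
	xorl	%ecx, %ecx
	testq	%rsi, %rsi
	jz	.Ldone
	/* 32-byte align the hot loop so it can be delivered from the
	   DSB (uop cache).  */
	.p2align 5
.Lloop:
	movzbl	(%rdi, %rcx), %edx
	addl	%edx, %eax
	incq	%rcx
	cmpq	%rsi, %rcx
	jb	.Lloop
.Ldone:
	ret
	.size	sum_bytes, .-sum_bytes

Whether the loop actually lands on a 32-byte boundary can be checked
with objdump -d on the resulting object.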

Times are reported as the ratio Time_With_Patch /
Time_Without_Patch. Lower is better.

The value reported is the geometric mean of this ratio across
all tests in bench-strcmp and bench-strncmp.
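Concretely, for N per-test ratios r_1, ..., r_N the reported value is
(r_1 * r_2 * ... * r_N)^(1/N).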

Note this patch is only attempting to improve the Skylake-Server
strcmp for long strings. The rest of the numbers are only to test for
regressions.

Tigerlake Results Strings <= 512:
    strcmp : 1.026
    strncmp: 0.949

Tigerlake Results Strings > 512:
    strcmp : 0.994
    strncmp: 0.998

Skylake-Server Results Strings <= 512:
    strcmp : 0.945
    strncmp: 0.943

Skylake-Server Results Strings > 512:
    strcmp : 0.778
    strncmp: 1.000

The 2.6% regression on TGL-strcmp is due to slowdowns caused by
changes in alignment of the code handling small sizes (mostly in the
page-cross logic). These should be safe to ignore because 1) we
previously only 16-byte aligned the function, so this behavior is not
new and was essentially up to chance before this patch, and 2) this
type of alignment-related regression on small sizes really only comes
up in tight micro-benchmark loops and is unlikely to have any effect
on real-world performance.
---
 sysdeps/x86_64/multiarch/strcmp-evex.S | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

Comments

H.J. Lu Sept. 27, 2024, 11:10 p.m. UTC | #1
On Sat, Sep 28, 2024 at 6:50 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> The loop should be aligned to 32 bytes so that it can ideally run out
> of the DSB. This is particularly important on Skylake-Server, where
> deficiencies in its DSB implementation make it prone to not being
> able to run loops out of the DSB.
>
> For example running strcmp-evex on 200Mb string:
>
> 32-byte aligned loop:
>     - 43,399,578,766      idq.dsb_uops
> not 32-byte aligned loop:
>     - 6,060,139,704       idq.dsb_uops
>
> This results in a 25% performance degradation for the non-aligned
> version.
>
> The fix is to just ensure the code layout is such that the loop is
> aligned. (Which was previously the case but was accidentally dropped
> in 84e7c46df).
>
> Times are reported as the ratio Time_With_Patch /
> Time_Without_Patch. Lower is better.
>
> The value reported is the geometric mean of this ratio across
> all tests in bench-strcmp and bench-strncmp.
>
> Note this patch is only attempting to improve the Skylake-Server
> strcmp for long strings. The rest of the numbers are only to test for
> regressions.
>
> Tigerlake Results Strings <= 512:
>     strcmp : 1.026
>     strncmp: 0.949
>
> Tigerlake Results Strings > 512:
>     strcmp : 0.994
>     strncmp: 0.998
>
> Skylake-Server Results Strings <= 512:
>     strcmp : 0.945
>     strncmp: 0.943
>
> Skylake-Server Results Strings > 512:
>     strcmp : 0.778
>     strncmp: 1.000
>
> The 2.6% regression on TGL-strcmp is due to slowdowns caused by
> changes in alignment of the code handling small sizes (mostly in the
> page-cross logic). These should be safe to ignore because 1) we
> previously only 16-byte aligned the function, so this behavior is not
> new and was essentially up to chance before this patch, and 2) this
> type of alignment-related regression on small sizes really only comes
> up in tight micro-benchmark loops and is unlikely to have any effect
> on real-world performance.
> ---
>  sysdeps/x86_64/multiarch/strcmp-evex.S | 26 +++++++++++++-------------
>  1 file changed, 13 insertions(+), 13 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
> index 06730ab2a1..cea034f394 100644
> --- a/sysdeps/x86_64/multiarch/strcmp-evex.S
> +++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
> @@ -209,7 +209,9 @@
>     returned.  */
>
>         .section SECTION(.text), "ax", @progbits
> -       .align  16
> +       /* Align 64 bytes here. This is to get the L(loop) block ideally
> +          aligned for the DSB.  */
> +       .align  64
>         .type   STRCMP, @function
>         .globl  STRCMP
>  # ifdef USE_AS_STRCASECMP_L
> @@ -509,9 +511,7 @@ L(ret4):
>         ret
>  # endif
>
> -       /* 32 byte align here ensures the main loop is ideally aligned
> -          for DSB.  */
> -       .p2align 5
> +       .p2align 4,, 4
>  L(more_3x_vec):
>         /* Safe to compare 4x vectors.  */
>         VMOVU   (VEC_SIZE)(%rdi), %VMM(0)
> @@ -1426,10 +1426,9 @@ L(less_32_till_page):
>  L(ret_zero_page_cross_slow_case0):
>         xorl    %eax, %eax
>         ret
> -# endif
> -
> -
> +# else
>         .p2align 4,, 10
> +# endif
>  L(less_16_till_page):
>         cmpl    $((VEC_SIZE - 8) / SIZE_OF_CHAR), %eax
>         ja      L(less_8_till_page)
> @@ -1482,8 +1481,12 @@ L(less_16_till_page):
>  # endif
>         jmp     L(prepare_loop_aligned)
>
> -
> -
> +# ifndef USE_AS_STRNCMP
> +       /* Fits in aligning bytes.  */
> +L(ret_zero_4_loop):
> +       xorl    %eax, %eax
> +       ret
> +# endif
>
>         .p2align 4,, 10
>  L(less_8_till_page):
> @@ -1554,6 +1557,7 @@ L(ret_less_8_wcs):
>
>  #  ifdef USE_AS_STRNCMP
>         .p2align 4,, 2
> +L(ret_zero_4_loop):
>  L(ret_zero_page_cross_slow_case1):
>         xorl    %eax, %eax
>         ret
> @@ -1586,10 +1590,6 @@ L(less_4_loop):
>         subq    $-(CHAR_PER_VEC * 4), %rdx
>  #  endif
>         jmp     L(prepare_loop_aligned)
> -
> -L(ret_zero_4_loop):
> -       xorl    %eax, %eax
> -       ret
>  L(ret_less_4_loop):
>         xorl    %r8d, %eax
>         subl    %r8d, %eax
> --
> 2.34.1
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.
Alexander Monakov Sept. 28, 2024, 6:06 a.m. UTC | #2
On Fri, 27 Sep 2024, Noah Goldstein wrote:

> The loop should be aligned to 32 bytes so that it can ideally run out
> of the DSB. This is particularly important on Skylake-Server, where
> deficiencies in its DSB implementation make it prone to not being
> able to run loops out of the DSB.

The snippet in comment #13 of the bug suggests it's the well-known
Skylake JCC erratum, although without certainty because the function
is 16-byte aligned before your patch (we only see a branch crossing
a 16-byte boundary).

The disconnect between your paragraph above and the patch where you
align the function to 64 bytes, not 32, is a bit confusing though.
If you're over-aligning the function to reduce alignment padding
in front of loops, can you mention that?

Alexander
Noah Goldstein Sept. 28, 2024, 7:54 p.m. UTC | #3
On Sat, Sep 28, 2024 at 1:06 AM Alexander Monakov <amonakov@ispras.ru> wrote:
>
>
> On Fri, 27 Sep 2024, Noah Goldstein wrote:
>
> > The loop should be aligned to 32 bytes so that it can ideally run out
> > of the DSB. This is particularly important on Skylake-Server, where
> > deficiencies in its DSB implementation make it prone to not being
> > able to run loops out of the DSB.
>
> The snippet in comment #13 of the bug suggests it's the well-known
> Skylake JCC erratum, although without certainty because the function
> is 16-byte aligned before your patch (we only see a branch crossing
> a 16-byte boundary).

I did not map out the DSB lines, but it could also just be capacity.

>
> The disconnect between your paragraph above and the patch where you
> align the function to 64 bytes, not 32, is a bit confusing though.
> If you're over-aligning the function to reduce alignment padding
> in front of loops, can you mention that?

Sure, I will update the commit message. The reason for 64 bytes is
mostly because "that's what we do in many of the other functions".
Not especially principled.

>
> Alexander

Patch

diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
index 06730ab2a1..cea034f394 100644
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
@@ -209,7 +209,9 @@ 
    returned.  */
 
 	.section SECTION(.text), "ax", @progbits
-	.align	16
+	/* Align 64 bytes here. This is to get the L(loop) block ideally
+	   aligned for the DSB.  */
+	.align	64
 	.type	STRCMP, @function
 	.globl	STRCMP
 # ifdef USE_AS_STRCASECMP_L
@@ -509,9 +511,7 @@  L(ret4):
 	ret
 # endif
 
-	/* 32 byte align here ensures the main loop is ideally aligned
-	   for DSB.  */
-	.p2align 5
+	.p2align 4,, 4
 L(more_3x_vec):
 	/* Safe to compare 4x vectors.  */
 	VMOVU	(VEC_SIZE)(%rdi), %VMM(0)
@@ -1426,10 +1426,9 @@  L(less_32_till_page):
 L(ret_zero_page_cross_slow_case0):
 	xorl	%eax, %eax
 	ret
-# endif
-
-
+# else
 	.p2align 4,, 10
+# endif
 L(less_16_till_page):
 	cmpl	$((VEC_SIZE - 8) / SIZE_OF_CHAR), %eax
 	ja	L(less_8_till_page)
@@ -1482,8 +1481,12 @@  L(less_16_till_page):
 # endif
 	jmp	L(prepare_loop_aligned)
 
-
-
+# ifndef USE_AS_STRNCMP
+	/* Fits in aligning bytes.  */
+L(ret_zero_4_loop):
+	xorl	%eax, %eax
+	ret
+# endif
 
 	.p2align 4,, 10
 L(less_8_till_page):
@@ -1554,6 +1557,7 @@  L(ret_less_8_wcs):
 
 #  ifdef USE_AS_STRNCMP
 	.p2align 4,, 2
+L(ret_zero_4_loop):
 L(ret_zero_page_cross_slow_case1):
 	xorl	%eax, %eax
 	ret
@@ -1586,10 +1590,6 @@  L(less_4_loop):
 	subq	$-(CHAR_PER_VEC * 4), %rdx
 #  endif
 	jmp	L(prepare_loop_aligned)
-
-L(ret_zero_4_loop):
-	xorl	%eax, %eax
-	ret
 L(ret_less_4_loop):
 	xorl	%r8d, %eax
 	subl	%r8d, %eax