diff mbox series

[x86] More use of m{32,64}bcst addressing modes with ternlog.

Message ID 002201dabd05$efb5dbd0$cf219370$@nextmovesoftware.com
State New
Headers show
Series [x86] More use of m{32,64}bcst addressing modes with ternlog. | expand

Commit Message

Roger Sayle June 12, 2024, 8:20 p.m. UTC
This patch makes more use of m32bcst and m64bcst addressing modes in
ix86_expand_ternlog.  Previously, the i386 backend would only consider
using a m32bcst if the inner mode of the vector was 32-bits, or using
m64bcst if the inner mode was 64-bits.  For ternlog (and other logic
operations) this is a strange restriction, as how the same constant
is materialized is dependent upon the mode it is used/operated on.
Hence, the V16QI constant {2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2} wouldn't
use m??bcst, but (V4SI){0x02020202,0x02020202,0x02020202,0x02020202}
which has the same bit pattern would.  This can be optimized by (re)checking
whether a CONST_VECTOR can be broadcast from memory after casting it
to VxSI (or for m64bcst to VxDI) where x has the appropriate vector size.


Taking the test case from pr115407:

__attribute__((__vector_size__(64))) char v;
void foo() {
  v = v | v << 7;
}

Compiled with -O2 -mcmodel=large -mavx512bw
GCC 14 generates a 64-byte (512-bit) load from the constant pool:

foo:    movabsq $v, %rax                                // 10
        movabsq $.LC0, %rdx                             // 10
        vpsllw  $7, (%rax), %zmm1                       // 7
        vmovdqa64       (%rax), %zmm0                   // 6
        vpternlogd      $248, (%rdx), %zmm1, %zmm0      // 7
        vmovdqa64       %zmm0, (%rax)                   // 6
        vzeroupper                                      // 3
        ret                                             // 1
.LC0:   .byte   -128                                    // 64 = 114 bytes
        .byte   -128
        ;; repeated another 62 times

mainline currently generates two instructions, using interunit broadcast:

foo:    movabsq $v, %rdx                                // 10
        movl    $-2139062144, %eax                      // 5
        vmovdqa64       (%rdx), %zmm2                   // 6
        vpbroadcastd    %eax, %zmm0                     // 6
        vpsllw  $7, %zmm2, %zmm1                        // 7
        vpternlogd      $236, %zmm0, %zmm2, %zmm1       // 7
        vmovdqa64       %zmm1, (%rdx)                   // 6
        vzeroupper                                      // 3
        ret                                             // 1 = 51 bytes

With this patch, we now generate a broadcast addressing mode:

foo:    movabsq $v, %rax                                   // 10
        movabsq $.LC1, %rdx                                // 10
        vmovdqa64       (%rax), %zmm1                      // 6
        vpsllw  $7, %zmm1, %zmm0                           // 7
        vpternlogd      $236, (%rdx){1to16}, %zmm1, %zmm0  // 7
        vmovdqa64       %zmm0, (%rax)                      // 6
        vzeroupper                                         // 3
        ret                                                // 1 = 50 total

Without -mcmodel=large, the benefit is two instructions:

foo:    vmovdqa64       v(%rip), %zmm1                         // 10
        vpsllw  $7, %zmm1, %zmm0                               // 7
        vpternlogd      $236, .LC2(%rip){1to16}, %zmm1, %zmm0  // 11
        vmovdqa64       %zmm0, v(%rip)                         // 10
        vzeroupper                                             // 3
        ret                                                    // 1 = 42 total


This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check, both with and without --target_board=unix{-m32}
with no new failures.  Ok for mainline?


2024-06-12  Roger Sayle  <roger@nextmovesoftware.com>

gcc/ChangeLog
        * config/i386/i386-expand.cc (ix86_expand_ternlog): Try performing
        logic operation in a different vector mode if that enables use of
        a 32-bit or 64-bit broadcast addressing mode.

gcc/testsuite/ChangeLog
        * gcc.target/i386/pr115407.c: New test case.


Thanks in advance,
Roger
--

Comments

Hongtao Liu June 13, 2024, 12:35 a.m. UTC | #1
On Thu, Jun 13, 2024 at 4:20 AM Roger Sayle <roger@nextmovesoftware.com> wrote:
>
>
> This patch makes more use of m32bcst and m64bcst addressing modes in
> ix86_expand_ternlog.  Previously, the i386 backend would only consider
> using a m32bcst if the inner mode of the vector was 32-bits, or using
> m64bcst if the inner mode was 64-bits.  For ternlog (and other logic
> operations) this is a strange restriction, as how the same constant
> is materialized is dependent upon the mode it is used/operated on.
> Hence, the V16QI constant {2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2} wouldn't
> use m??bcst, but (V4SI){0x02020202,0x02020202,0x02020202,0x02020202}
> which has the same bit pattern would.  This can optimized by (re)checking
> whether a CONST_VECTOR can be broadcast from memory after casting it
> to VxSI (or for m64bst to VxDI) where x has the appropriate vector size.
>
>
> Taking the test case from pr115407:
>
> __attribute__((__vector_size__(64))) char v;
> void foo() {
>   v = v | v << 7;
> }
>
> Compiled with -O2 -mcmodel=large -mavx512bw
> GCC 14 generates a 64-byte (512-bit) load from the constant pool:
>
> foo:    movabsq $v, %rax                                // 10
>         movabsq $.LC0, %rdx                             // 10
>         vpsllw  $7, (%rax), %zmm1                       // 7
>         vmovdqa64       (%rax), %zmm0                   // 6
>         vpternlogd      $248, (%rdx), %zmm1, %zmm0      // 7
>         vmovdqa64       %zmm0, (%rax)                   // 6
>         vzeroupper                                      // 3
>         ret                                             // 1
> .LC0:   .byte   -12                                     // 64 = 114 bytes
>         .byte   -128
>         ;; repeated another 62 times
>
> mainline currently generates two instructions, using interunit broadcast:
>
> foo:    movabsq $v, %rdx                                // 10
>         movl    $-2139062144, %eax                      // 5
>         vmovdqa64       (%rdx), %zmm2                   // 6
>         vpbroadcastd    %eax, %zmm0                     // 6
>         vpsllw  $7, %zmm2, %zmm1                        // 7
>         vpternlogd      $236, %zmm0, %zmm2, %zmm1       // 7
>         vmovdqa64       %zmm1, (%rdx)                   // 6
>         vzeroupper                                      // 3
>         ret                                             // 1 = 51 bytes
>
> With this patch, we now generate a broadcast addressing mode:
>
> foo:    movabsq $v, %rax                                   // 10
>         movabsq $.LC1, %rdx                                // 10
>         vmovdqa64       (%rax), %zmm1                      // 6
>         vpsllw  $7, %zmm1, %zmm0                           // 7
>         vpternlogd      $236, (%rdx){1to16}, %zmm1, %zmm0  // 7
>         vmovdqa64       %zmm0, (%rax)                      // 6
>         vzeroupper                                         // 3
>         ret                                                // 1 = 50 total
>
> Without -mcmodel=large, the benefit is two instructions:
>
> foo:    vmovdqa64       v(%rip), %zmm1                         // 10
>         vpsllw  $7, %zmm1, %zmm0                               // 7
>         vpternlogd      $236, .LC2(%rip){1to16}, %zmm1, %zmm0  // 11
>         vmovdqa64       %zmm0, v(%rip)                         // 10
>         vzeroupper                                             // 3
>         ret                                                    // 1 = 42
> total
>
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> and make -k check, both with and without --target_board=unix{-m32}
> with no new failures.  Ok for mainline?
Ok.
>
>
> 2024-06-12  Roger Sayle  <roger@nextmovesoftware.com>
>
> gcc/ChangeLog
>         * config/i386/i386-expand.cc (ix86_expand_ternlog): Try performing
>         logic operation in a different vector mode if that enables use of
>         a 32-bit or 64-bit broadcast addressing mode.
>
> gcc/testsuite/ChangeLog
>         * gcc.target/i386/pr115407.c: New test case.
>
>
> Thanks in advance,
> Roger
> --
>
diff mbox series

Patch

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 312329e..a4379b8 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -26041,6 +26041,69 @@  ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2, int idx,
       tmp2 = ix86_gen_bcst_mem (mode, op2);
       if (!tmp2)
 	{
+	  machine_mode bcst32_mode = mode;
+	  machine_mode bcst64_mode = mode;
+	  switch (mode)
+	    {
+	    case V1TImode:
+	    case V4SImode:
+	    case V4SFmode:
+	    case V8HImode:
+	    case V16QImode:
+	      bcst32_mode = V4SImode;
+	      bcst64_mode = V2DImode;
+	      break;
+
+	    case V2TImode:
+	    case V8SImode:
+	    case V8SFmode:
+	    case V16HImode:
+	    case V32QImode:
+	      bcst32_mode = V8SImode;
+	      bcst64_mode = V4DImode;
+	      break;
+
+	    case V4TImode:
+	    case V16SImode:
+	    case V16SFmode:
+	    case V32HImode:
+	    case V64QImode:
+	      bcst32_mode = V16SImode;
+	      bcst64_mode = V8DImode;
+	      break;
+
+	    default:
+	      break;
+	    }
+
+	  if (bcst32_mode != mode)
+	    {
+	      tmp2 = gen_lowpart (bcst32_mode, op2);
+	      if (ix86_gen_bcst_mem (bcst32_mode, tmp2))
+		{
+		  tmp2 = ix86_expand_ternlog (bcst32_mode,
+					      gen_lowpart (bcst32_mode, tmp0),
+					      gen_lowpart (bcst32_mode, tmp1),
+					      tmp2, idx, NULL_RTX);
+		  emit_move_insn (target, gen_lowpart (mode, tmp2));
+		  return target;
+		}
+	    }
+
+	  if (bcst64_mode != mode)
+	    {
+	      tmp2 = gen_lowpart (bcst64_mode, op2);
+	      if (ix86_gen_bcst_mem (bcst64_mode, tmp2))
+		{
+		  tmp2 = ix86_expand_ternlog (bcst64_mode,
+					      gen_lowpart (bcst64_mode, tmp0),
+					      gen_lowpart (bcst64_mode, tmp1),
+					      tmp2, idx, NULL_RTX);
+		  emit_move_insn (target, gen_lowpart (mode, tmp2));
+		  return target;
+		}
+	    }
+
 	  tmp2 = force_const_mem (mode, op2);
 	  rtx bcast = ix86_broadcast_from_constant (mode, tmp2);
 	  tmp2 = validize_mem (tmp2);
diff --git a/gcc/testsuite/gcc.target/i386/pr115407.c b/gcc/testsuite/gcc.target/i386/pr115407.c
new file mode 100644
index 0000000..b6cb7a6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr115407.c
@@ -0,0 +1,9 @@ 
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mcmodel=large -mavx512bw" } */
+__attribute__((__vector_size__(64))) char v;
+
+void foo() {
+  v = v | v << 7;
+}
+
+/* { dg-final { scan-assembler "vpternlog.*1to16" } } */