From patchwork Thu Jun 23 12:00:47 2022
X-Patchwork-Submitter: Richard Sandiford
X-Patchwork-Id: 1647060
From: Richard Sandiford
To: gcc-patches@gcc.gnu.org
Cc: rguenther@suse.de
Subject: [PATCH] RFC: Optimise SLP permutes of non-consecutive loads
Date: Thu, 23 Jun 2022 13:00:47 +0100

In a reduction pair like:

typedef float T;

void
f1 (T *x)
{
  T res1 = 0;
  T res2 = 0;
  for (int i = 0; i < 100; ++i)
    {
      res1 += x[i * 2];
      res2 += x[i * 2 + 1];
    }
  x[0] = res1;
  x[1] = res2;
}

it isn't easy to predict whether the initial reduction group will be
{ res1, res2 } or { res2, res1 }.  If the initial group ends up being
{ res2, res1 }, vect_optimize_slp will try to swap it around in order
to remove an unnecessary permute on the load.  This gives:

f1:
.LFB0:
        .cfi_startproc
        movi    v0.4s, 0
        mov     x1, x0
        add     x2, x0, 800
        .p2align 3,,7
.L2:
        ldr     q1, [x1], 16
        fadd    v0.4s, v0.4s, v1.4s
        cmp     x2, x1
        bne     .L2
        dup     d1, v0.d[1]
        fadd    v0.2s, v0.2s, v1.2s
        str     d0, [x0]
        ret

But there are no specific ordering constraints for non-consecutive
loads.  For example, in:

void
f2 (T *x)
{
  T res1 = 0;
  T res2 = 0;
  for (int i = 0; i < 100; ++i)
    {
      res1 += x[i * 4];
      res2 += x[i * 4 + 2];
    }
  x[0] = res1;
  x[1] = res2;
}

the reduction group happens to be { res2, res1 }, and so we get a load
permutation of { 2, 0, 6, 4 }.  On aarch64 this gives:

f2:
.LFB1:
        .cfi_startproc
        adrp    x2, .LC0
        mov     x1, x0
        movi    v2.4s, 0
        ldr     q3, [x2, #:lo12:.LC0]
        add     x2, x0, 1568
        .p2align 3,,7
.L6:
        ldp     q0, q1, [x1]
        add     x1, x1, 32
        tbl     v0.16b, {v0.16b - v1.16b}, v3.16b
        fadd    v2.4s, v2.4s, v0.4s
        cmp     x1, x2
        bne     .L6
        ...untidy code, 18 insns...
        ret

where we have to load the permute selector from memory and use the
general TBL instruction.  If the order happened to be { res1, res2 }
instead we would generate the much tidier:

f2:
.LFB1:
        .cfi_startproc
        movi    v1.4s, 0
        mov     x1, x0
        add     x2, x0, 1568
        .p2align 3,,7
.L6:
        ldp     q0, q2, [x1]
        add     x1, x1, 32
        uzp1    v0.4s, v0.4s, v2.4s
        fadd    v1.4s, v1.4s, v0.4s
        cmp     x1, x2
        bne     .L6
        ldr     d0, [x0, 1568]
        ldr     d5, [x0, 1576]
        ldr     s2, [x0, 1584]
        dup     d3, v1.d[1]
        ldr     s4, [x0, 1592]
        zip1    v0.2s, v0.2s, v5.2s
        ins     v2.s[1], v4.s[0]
        fadd    v0.2s, v0.2s, v2.2s
        fadd    v0.2s, v0.2s, v1.2s
        fadd    v0.2s, v0.2s, v3.2s
        str     d0, [x0]
        ret

This WIP patch makes vect_optimize_slp try to put load permutations
into index order.  However, this new transform might be a loss if it
forces permutations elsewhere.  For example, given:

void
f3 (T *restrict x, T *restrict y, T *restrict z)
{
  for (int i = 0; i < 100; ++i)
    {
      x[i * 2] = y[i * 4 + 2] + z[i * 4 + 2];
      x[i * 2 + 1] = y[i * 4] + z[i * 4];
    }
}

we would generate:

.L9:
        ldr     q0, [x1, x3]
        ldr     q3, [x6, x3]
        ldr     q1, [x2, x3]
        ldr     q2, [x5, x3]
        add     x3, x3, 32
        uzp1    v0.4s, v0.4s, v3.4s
        uzp1    v1.4s, v1.4s, v2.4s
        fadd    v0.4s, v0.4s, v1.4s
        rev64   v0.4s, v0.4s
        str     q0, [x4], 16
        cmp     x3, 1568
        bne     .L9

(3 permutes) rather than:

.L9:
        ldr     q0, [x1, x3]
        ldr     q1, [x6, x3]
        ldr     q2, [x2, x3]
        ldr     q3, [x5, x3]
        tbl     v0.16b, {v0.16b - v1.16b}, v4.16b
        add     x3, x3, 32
        tbl     v2.16b, {v2.16b - v3.16b}, v4.16b
        fadd    v0.4s, v0.4s, v2.4s
        str     q0, [x4], 16
        cmp     x3, 1568
        bne     .L9

(2 permutes).

One simple(ish) fix would be to conditionalise the new case on whether
all "roots" of the load are reduction groups.  Alternatively, we could
treat the new case as a soft preference and back out if it would
require any new VEC_PERM_EXPR nodes to be created.  This would require
a propagation back from uses to definitions.

WDYT?  Does this look like a feasible direction?  If so, any thoughts
on when the new case should be enabled?

The patch keeps the bijection requirement, since relaxing that seems
like separate work.

Tested on aarch64-linux-gnu, no regressions.
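For clarity, here is a standalone sketch of the re-indexing step the
patch performs with std::sort and std::lower_bound.  It is an
illustration only, not GCC code: it uses a plain std::vector rather
than the SLP data structures, and the helper name is made up.  For
f2's load permutation { 2, 0, 6, 4 }, mapping each index to its rank
in the sorted order gives the separated permutation { 1, 0, 3, 2 }:

#include <algorithm>
#include <vector>

/* Illustration only (not GCC code): compute the "rank" permutation
   that the patch derives from a load permutation.  */
static std::vector<unsigned>
rank_permutation (const std::vector<unsigned> &load_perm)
{
  std::vector<unsigned> sorted (load_perm);
  std::sort (sorted.begin (), sorted.end ());

  std::vector<unsigned> perm (load_perm.size ());
  for (unsigned j = 0; j < load_perm.size (); ++j)
    /* Position of load_perm[j] within the sorted sequence.  */
    perm[j] = std::lower_bound (sorted.begin (), sorted.end (),
                                load_perm[j]) - sorted.begin ();
  return perm;
}

/* rank_permutation ({ 2, 0, 6, 4 }) == { 1, 0, 3, 2 }.  */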
Thanks,
Richard

---
 gcc/tree-vect-slp.cc | 41 ++++++++++++++---------------------------
 1 file changed, 14 insertions(+), 27 deletions(-)

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index dab5daddcc5..fde35d8c954 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -20,6 +20,7 @@ along with GCC; see the file COPYING3.  If not see
    <http://www.gnu.org/licenses/>.  */

 #include "config.h"
+#define INCLUDE_ALGORITHM
 #include "system.h"
 #include "coretypes.h"
 #include "backend.h"
@@ -3698,43 +3699,29 @@ vect_optimize_slp (vec_info *vinfo)
           if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
             continue;
           dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
-          unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0;
-          bool any_permute = false;
-          for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
-            {
-              unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j];
-              imin = MIN (imin, idx);
-              imax = MAX (imax, idx);
-              if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j)
-                any_permute = true;
-            }
-          /* If there's no permute no need to split one out.  */
-          if (!any_permute)
-            continue;
-          /* If the span doesn't match we'd disrupt VF computation, avoid
-             that for now.  */
-          if (imax - imin + 1 != SLP_TREE_LANES (node))
+
+          auto_vec<unsigned> sorted;
+          sorted.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
+          std::sort (sorted.begin (), sorted.end ());
+          if (std::equal (sorted.begin (), sorted.end (),
+                          SLP_TREE_LOAD_PERMUTATION (node).begin ()))
             continue;
           /* For now only handle true permutes, like
              vect_attempt_slp_rearrange_stmts did.  This allows us to be lazy
              when permuting constants and invariants keeping the permute
              bijective.  */
-          auto_sbitmap load_index (SLP_TREE_LANES (node));
-          bitmap_clear (load_index);
-          for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
-            bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin);
-          unsigned j;
-          for (j = 0; j < SLP_TREE_LANES (node); ++j)
-            if (!bitmap_bit_p (load_index, j))
-              break;
-          if (j != SLP_TREE_LANES (node))
+          if (std::adjacent_find (sorted.begin (), sorted.end ()) != sorted.end ())
             continue;
           vec<unsigned> perm = vNULL;
           perm.safe_grow (SLP_TREE_LANES (node), true);
           for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
-            perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin;
+            {
+              auto it = std::lower_bound (sorted.begin (), sorted.end (),
+                                          SLP_TREE_LOAD_PERMUTATION (node)[j]);
+              perm[j] = it - sorted.begin ();
+            }
           perms.safe_push (perm);
           vertices[idx].perm_in = perms.length () - 1;
           vertices[idx].perm_out = perms.length () - 1;
@@ -6647,7 +6634,7 @@ vect_transform_slp_perm_load (vec_info *vinfo,
 {
   stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
   int vec_index = 0;
-  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+  tree vectype = SLP_TREE_VECTYPE (node);
   unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
   unsigned int mask_element;
   machine_mode mode;
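As a standalone illustration of the two early-outs in the hunk above
(again a plain sketch on std::vector rather than the SLP data
structures, with a made-up helper name): a load permutation that is
already in index order needs no separate permute split out, and a
repeated index means the permutation is not bijective, so such nodes
are left alone for now.

#include <algorithm>
#include <vector>

/* Illustration only (not GCC code): the checks the patch applies to a
   sorted copy of the load permutation.  */
static bool
should_split_out_permute (const std::vector<unsigned> &load_perm)
{
  std::vector<unsigned> sorted (load_perm);
  std::sort (sorted.begin (), sorted.end ());

  /* Already in index order: nothing to split out.  */
  if (std::equal (sorted.begin (), sorted.end (), load_perm.begin ()))
    return false;

  /* A repeated index means the permutation is not bijective;
     skip such nodes for now.  */
  if (std::adjacent_find (sorted.begin (), sorted.end ()) != sorted.end ())
    return false;

  return true;
}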