From patchwork Wed Jul 10 12:36:08 2024
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Richard Biener <rguenther@suse.de>
X-Patchwork-Id: 1958823
Return-Path: <gcc-patches-bounces~incoming=patchwork.ozlabs.org@gcc.gnu.org>
X-Original-To: incoming@patchwork.ozlabs.org
Delivered-To: patchwork-incoming@legolas.ozlabs.org
Authentication-Results: legolas.ozlabs.org;
	dkim=pass (1024-bit key;
 unprotected) header.d=suse.de header.i=@suse.de header.a=rsa-sha256
 header.s=susede2_rsa header.b=0CrkH1Uy;
	dkim=pass header.d=suse.de header.i=@suse.de header.a=ed25519-sha256
 header.s=susede2_ed25519 header.b=uq3hagmZ;
	dkim=pass (1024-bit key) header.d=suse.de header.i=@suse.de
 header.a=rsa-sha256 header.s=susede2_rsa header.b=0CrkH1Uy;
	dkim=neutral header.d=suse.de header.i=@suse.de header.a=ed25519-sha256
 header.s=susede2_ed25519 header.b=uq3hagmZ;
	dkim-atps=neutral
Authentication-Results: legolas.ozlabs.org;
 spf=pass (sender SPF authorized) smtp.mailfrom=gcc.gnu.org
 (client-ip=2620:52:3:1:0:246e:9693:128c; helo=server2.sourceware.org;
 envelope-from=gcc-patches-bounces~incoming=patchwork.ozlabs.org@gcc.gnu.org;
 receiver=patchwork.ozlabs.org)
Received: from server2.sourceware.org (server2.sourceware.org
 [IPv6:2620:52:3:1:0:246e:9693:128c])
	(using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits)
	 key-exchange X25519 server-signature ECDSA (secp384r1) server-digest SHA384)
	(No client certificate requested)
	by legolas.ozlabs.org (Postfix) with ESMTPS id 4WJy5y05Thz1xqj
	for <incoming@patchwork.ozlabs.org>; Wed, 10 Jul 2024 22:36:34 +1000 (AEST)
Received: from server2.sourceware.org (localhost [IPv6:::1])
	by sourceware.org (Postfix) with ESMTP id DDFAD3865C17
	for <incoming@patchwork.ozlabs.org>; Wed, 10 Jul 2024 12:36:32 +0000 (GMT)
X-Original-To: gcc-patches@gcc.gnu.org
Delivered-To: gcc-patches@gcc.gnu.org
Received: from smtp-out1.suse.de (smtp-out1.suse.de [195.135.223.130])
 by sourceware.org (Postfix) with ESMTPS id 941B53858D20
 for <gcc-patches@gcc.gnu.org>; Wed, 10 Jul 2024 12:36:09 +0000 (GMT)
DMARC-Filter: OpenDMARC Filter v1.4.2 sourceware.org 941B53858D20
Authentication-Results: sourceware.org;
 dmarc=pass (p=none dis=none) header.from=suse.de
Authentication-Results: sourceware.org; spf=pass smtp.mailfrom=suse.de
ARC-Filter: OpenARC Filter v1.0.0 sourceware.org 941B53858D20
Authentication-Results: server2.sourceware.org;
 arc=none smtp.remote-ip=195.135.223.130
ARC-Seal: i=1; a=rsa-sha256; d=sourceware.org; s=key; t=1720614971; cv=none;
 b=SUm38dgFHy5qrxVsVs49VbTYvtnZbwS4R+fww4FTaQM941GvFftiVMQeaMMty2nj1cYi/DnQvEaM2kio81tNrwTYYuQUg3h8zp5qS9vN8ZovODkC6AJBM3DtnEdZw8ZwmiAO25rWujZgTTdIqJB0C1u3OAhEh7nxmYSMByuI7WM=
ARC-Message-Signature: i=1; a=rsa-sha256; d=sourceware.org; s=key;
 t=1720614971; c=relaxed/simple;
 bh=XAJFq0rzUM0/E8GbYHp2tzPc6WOuUzZ8jAvdmXq0cUg=;
 h=DKIM-Signature:DKIM-Signature:DKIM-Signature:DKIM-Signature:Date:
 From:To:Subject:MIME-Version;
 b=wWkPXXvj67aPI5l+zYGQnamz2IfEAdtQEUuHwAzo9FwDUO72J4CSXwg1umxAIUqzdVfQlZDOzxiERzV7Opy9J0IyyznVYzjyTsRjyg82vNk5rKrPJYCm6aVgghclQAtqoZrwo0xwQQlH6yDZVOGg3OaOAIkKZiLzsDTmMTuYUYo=
ARC-Authentication-Results: i=1; server2.sourceware.org
Received: from murzim.nue2.suse.org (unknown [10.168.4.243])
 (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits)
 key-exchange X25519 server-signature RSA-PSS (4096 bits) server-digest
 SHA256)
 (No client certificate requested)
 by smtp-out1.suse.de (Postfix) with ESMTPS id 8ADE621A13;
 Wed, 10 Jul 2024 12:36:08 +0000 (UTC)
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=suse.de;
 s=susede2_rsa;
 t=1720614968;
 h=from:from:reply-to:date:date:to:to:cc:cc:mime-version:mime-version:
 content-type:content-type; bh=WIRI9BmkVcX75rHWTEEipAspa7w6Rwn5o7yD4KLm3FU=;
 b=0CrkH1Uy18lOiRkyqiKhbI8CYDv9TI1OE4xErVta2KZSbLnJ90abR70yCz1dPEeEVio6Ru
 GEB5R5OtbUg7TR/mGnoKvaFzG4XVV/DSs9TNAfiLO9gj4dPevQR7pJOB2k1ObLxS0ANCxj
 LO1JubGo9QLxvSJO3UGnOLD0HhuReyI=
DKIM-Signature: v=1; a=ed25519-sha256; c=relaxed/relaxed; d=suse.de;
 s=susede2_ed25519; t=1720614968;
 h=from:from:reply-to:date:date:to:to:cc:cc:mime-version:mime-version:
 content-type:content-type; bh=WIRI9BmkVcX75rHWTEEipAspa7w6Rwn5o7yD4KLm3FU=;
 b=uq3hagmZoTGQCLcQ/CZ3yG4z5bnVvmEVeOHBjdgOZm0F9xab7kbJsR+NwllbMo5CWoqxsb
 GyVhVi6EnLpQxmBQ==
Authentication-Results: smtp-out1.suse.de;
	none
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=suse.de;
 s=susede2_rsa;
 t=1720614968;
 h=from:from:reply-to:date:date:to:to:cc:cc:mime-version:mime-version:
 content-type:content-type; bh=WIRI9BmkVcX75rHWTEEipAspa7w6Rwn5o7yD4KLm3FU=;
 b=0CrkH1Uy18lOiRkyqiKhbI8CYDv9TI1OE4xErVta2KZSbLnJ90abR70yCz1dPEeEVio6Ru
 GEB5R5OtbUg7TR/mGnoKvaFzG4XVV/DSs9TNAfiLO9gj4dPevQR7pJOB2k1ObLxS0ANCxj
 LO1JubGo9QLxvSJO3UGnOLD0HhuReyI=
DKIM-Signature: v=1; a=ed25519-sha256; c=relaxed/relaxed; d=suse.de;
 s=susede2_ed25519; t=1720614968;
 h=from:from:reply-to:date:date:to:to:cc:cc:mime-version:mime-version:
 content-type:content-type; bh=WIRI9BmkVcX75rHWTEEipAspa7w6Rwn5o7yD4KLm3FU=;
 b=uq3hagmZoTGQCLcQ/CZ3yG4z5bnVvmEVeOHBjdgOZm0F9xab7kbJsR+NwllbMo5CWoqxsb
 GyVhVi6EnLpQxmBQ==
Date: Wed, 10 Jul 2024 14:36:08 +0200 (CEST)
From: Richard Biener <rguenther@suse.de>
To: gcc-patches@gcc.gnu.org
cc: Jan Hubicka <hubicka@ucw.cz>
Subject: [PATCH] tree-optimization/115825 - improve unroll estimates for
 volatile accesses
MIME-Version: 1.0
X-Spam-Score: -1.74
X-Spam-Level: 
X-Spamd-Result: default: False [-1.74 / 50.00]; BAYES_HAM(-3.00)[100.00%];
 MISSING_MID(2.50)[]; NEURAL_HAM_LONG(-1.00)[-1.000];
 NEURAL_HAM_SHORT(-0.14)[-0.695]; MIME_GOOD(-0.10)[text/plain];
 MISSING_XM_UA(0.00)[]; RCVD_COUNT_ZERO(0.00)[0];
 ARC_NA(0.00)[]; RCPT_COUNT_TWO(0.00)[2]; FROM_HAS_DN(0.00)[];
 DKIM_SIGNED(0.00)[suse.de:s=susede2_rsa,suse.de:s=susede2_ed25519];
 FROM_EQ_ENVFROM(0.00)[]; MIME_TRACE(0.00)[0:+];
 TO_MATCH_ENVRCPT_ALL(0.00)[]; FUZZY_BLOCKED(0.00)[rspamd.com];
 TO_DN_SOME(0.00)[]
X-Spam-Status: No, score=-10.6 required=5.0 tests=BAYES_00, DKIM_SIGNED,
 DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, GIT_PATCH_0, MISSING_MID,
 SPF_HELO_NONE, SPF_PASS, TXREP autolearn=ham autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on
 server2.sourceware.org
X-BeenThere: gcc-patches@gcc.gnu.org
X-Mailman-Version: 2.1.30
Precedence: list
List-Id: Gcc-patches mailing list <gcc-patches.gcc.gnu.org>
List-Unsubscribe: <https://gcc.gnu.org/mailman/options/gcc-patches>,
 <mailto:gcc-patches-request@gcc.gnu.org?subject=unsubscribe>
List-Archive: <https://gcc.gnu.org/pipermail/gcc-patches/>
List-Post: <mailto:gcc-patches@gcc.gnu.org>
List-Help: <mailto:gcc-patches-request@gcc.gnu.org?subject=help>
List-Subscribe: <https://gcc.gnu.org/mailman/listinfo/gcc-patches>,
 <mailto:gcc-patches-request@gcc.gnu.org?subject=subscribe>
Errors-To: gcc-patches-bounces~incoming=patchwork.ozlabs.org@gcc.gnu.org
Message-Id: <20240710123632.DDFAD3865C17@sourceware.org>

The loop unrolling code assumes that one third of all volatile accesses
can possibly be optimized away, which is of course not true.  This leads
to excessive unrolling in some cases.  The following tracks the number
of stmts with side-effects, as those cannot be eliminated later, and
only assumes that one third of the other stmts can be further optimized.

Bootstrapped and tested on x86_64-unknown-linux-gnu.

There's quite some testsuite fallout, mostly because of different rounding:
a size of 8 is now no longer optimistically reduced to 5 but only to 6.
I can fix that by writing

  *est_eliminated = (unr_insns - not_elim) / 3;

as

  *est_eliminated = unr_insns - not_elim - (unr_insns - not_elim) * 2 / 3;

to preserve the old rounding behavior.  But for example

FAIL: g++.dg/warn/Warray-bounds-20.C  -std=gnu++14 LP64 note (test for warnings, line 56)

shows

  size:   3 C::C (_25, &MEM <const void *[8]> [(void *)&_ZTT2D1 + 48B]);

which we now consider not optimizable (correctly, I think), and thus
the optimistic size reduction isn't enough to get the loop unrolled.
Previously the computed size of 20 was reduced to 13, exactly the size
of the non-unrolled body.

So the remaining fallout will be

+FAIL: g++.dg/warn/Warray-bounds-20.C  -std=gnu++14 LP64 note (test for warnings, line 56)
+FAIL: g++.dg/warn/Warray-bounds-20.C  -std=gnu++14 note (test for warnings, line 66)
...
+FAIL: c-c++-common/ubsan/unreachable-3.c  -std=gnu++14  scan-tree-dump optimized "__builtin___ubsan_handle_builtin_unreachable"
...
+FAIL: c-c++-common/ubsan/unreachable-3.c   -O0   scan-tree-dump optimized "__builtin___ubsan_handle_builtin_unreachable"

For the latter, the issue is __builtin___sanitizer_cov_trace_pc ().

Does this seem feasible overall?  I can fix up the testcases above
with #pragma unroll ...

Thanks,
Richard.

	PR tree-optimization/115825
	* tree-ssa-loop-ivcanon.cc (loop_size::not_eliminatable_after_peeling):
	New.
	(loop_size::last_iteration_not_eliminatable_after_peeling): Likewise.
	(tree_estimate_loop_size): Count stmts with side-effects as
	not optimistically eliminatable.
	(estimated_unrolled_size): Compute the number of stmts that can
	be optimistically eliminated by followup transforms.
	(try_unroll_loop_completely): Adjust.

	* gcc.dg/tree-ssa/cunroll-17.c: New testcase.
---
 gcc/testsuite/gcc.dg/tree-ssa/cunroll-17.c | 11 +++++++
 gcc/tree-ssa-loop-ivcanon.cc               | 35 +++++++++++++++++-----
 2 files changed, 38 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/cunroll-17.c

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/cunroll-17.c b/gcc/testsuite/gcc.dg/tree-ssa/cunroll-17.c
new file mode 100644
index 00000000000..282db99c883
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/cunroll-17.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-Os -fdump-tree-optimized" } */
+
+char volatile v;
+void for16 (void)
+{
+  for (char i = 16; i > 0; i -= 2)
+    v = i;
+}
+
+/* { dg-final { scan-tree-dump-times " ={v} " 1 "optimized" } } */
diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
index 5ef24a91917..dd941c31648 100644
--- a/gcc/tree-ssa-loop-ivcanon.cc
+++ b/gcc/tree-ssa-loop-ivcanon.cc
@@ -139,11 +139,16 @@ struct loop_size
      variable where induction variable starts at known constant.)  */
   int eliminated_by_peeling;
 
+  /* Number of instructions that cannot be further optimized in the
+     peeled loop, for example volatile accesses.  */
+  int not_eliminatable_after_peeling;
+
   /* Same statistics for last iteration of loop: it is smaller because
      instructions after exit are not executed.  */
   int last_iteration;
   int last_iteration_eliminated_by_peeling;
-  
+  int last_iteration_not_eliminatable_after_peeling;
+
   /* If some IV computation will become constant.  */
   bool constant_iv;
 
@@ -267,8 +272,10 @@ tree_estimate_loop_size (class loop *loop, edge exit, edge edge_to_cancel,
 
   size->overall = 0;
   size->eliminated_by_peeling = 0;
+  size->not_eliminatable_after_peeling = 0;
   size->last_iteration = 0;
   size->last_iteration_eliminated_by_peeling = 0;
+  size->last_iteration_not_eliminatable_after_peeling = 0;
   size->num_pure_calls_on_hot_path = 0;
   size->num_non_pure_calls_on_hot_path = 0;
   size->non_call_stmts_on_hot_path = 0;
@@ -292,6 +299,7 @@ tree_estimate_loop_size (class loop *loop, edge exit, edge edge_to_cancel,
 	{
 	  gimple *stmt = gsi_stmt (gsi);
 	  int num = estimate_num_insns (stmt, &eni_size_weights);
+	  bool not_eliminatable_after_peeling = false;
 	  bool likely_eliminated = false;
 	  bool likely_eliminated_last = false;
 	  bool likely_eliminated_peeled = false;
@@ -304,7 +312,9 @@ tree_estimate_loop_size (class loop *loop, edge exit, edge edge_to_cancel,
 
 	  /* Look for reasons why we might optimize this stmt away. */
 
-	  if (!gimple_has_side_effects (stmt))
+	  if (gimple_has_side_effects (stmt))
+	    not_eliminatable_after_peeling = true;
+	  else
 	    {
 	      /* Exit conditional.  */
 	      if (exit && body[i] == exit->src
@@ -377,11 +387,15 @@ tree_estimate_loop_size (class loop *loop, edge exit, edge edge_to_cancel,
 	  size->overall += num;
 	  if (likely_eliminated || likely_eliminated_peeled)
 	    size->eliminated_by_peeling += num;
+	  if (not_eliminatable_after_peeling)
+	    size->not_eliminatable_after_peeling += num;
 	  if (!after_exit)
 	    {
 	      size->last_iteration += num;
 	      if (likely_eliminated || likely_eliminated_last)
 		size->last_iteration_eliminated_by_peeling += num;
+	      if (not_eliminatable_after_peeling)
+		size->last_iteration_not_eliminatable_after_peeling += num;
 	    }
 	  if ((size->overall * 3 / 2 - size->eliminated_by_peeling
 	      - size->last_iteration_eliminated_by_peeling) > upper_bound)
@@ -437,18 +451,22 @@ tree_estimate_loop_size (class loop *loop, edge exit, edge edge_to_cancel,
    It is (NUNROLL + 1) * size of loop body with taking into account
    the fact that in last copy everything after exit conditional
    is dead and that some instructions will be eliminated after
-   peeling.  */
+   peeling.  Set *EST_ELIMINATED to the number of stmts that could be
+   optimistically eliminated by followup transforms.  */
 static unsigned HOST_WIDE_INT
 estimated_unrolled_size (struct loop_size *size,
+			 unsigned HOST_WIDE_INT *est_eliminated,
 			 unsigned HOST_WIDE_INT nunroll)
 {
   HOST_WIDE_INT unr_insns = ((nunroll)
   			     * (HOST_WIDE_INT) (size->overall
 			     			- size->eliminated_by_peeling));
-  if (!nunroll)
-    unr_insns = 0;
+  HOST_WIDE_INT not_elim
+    = ((nunroll) * (HOST_WIDE_INT) size->not_eliminatable_after_peeling);
   unr_insns += size->last_iteration - size->last_iteration_eliminated_by_peeling;
+  not_elim += size->last_iteration_not_eliminatable_after_peeling;
 
+  *est_eliminated = (unr_insns - not_elim) / 3;
   return unr_insns;
 }
 
@@ -829,8 +847,9 @@ try_unroll_loop_completely (class loop *loop,
 	    }
 
 	  unsigned HOST_WIDE_INT ninsns = size.overall;
+	  unsigned HOST_WIDE_INT est_eliminated;
 	  unsigned HOST_WIDE_INT unr_insns
-	    = estimated_unrolled_size (&size, n_unroll);
+	    = estimated_unrolled_size (&size, &est_eliminated, n_unroll);
 	  if (dump_file && (dump_flags & TDF_DETAILS))
 	    {
 	      fprintf (dump_file, "  Loop size: %d\n", (int) ninsns);
@@ -842,7 +861,7 @@ try_unroll_loop_completely (class loop *loop,
 	     cautious on guessing if the unrolling is going to be
 	     profitable.
 	     Move from estimated_unrolled_size to unroll small loops.  */
-	  if (unr_insns * 2 / 3
+	  if (unr_insns - est_eliminated
 	      /* If there is IV variable that will become constant, we
 		 save one instruction in the loop prologue we do not
 		 account otherwise.  */
@@ -919,7 +938,7 @@ try_unroll_loop_completely (class loop *loop,
 	     2) Big loop after completely unroll may not be vectorized
 	     by BB vectorizer.  */
 	  else if ((cunrolli && !loop->inner
-		    ? unr_insns : unr_insns * 2 / 3)
+		    ? unr_insns : unr_insns - est_eliminated)
 		   > (unsigned) param_max_completely_peeled_insns)
 	    {
 	      if (dump_file && (dump_flags & TDF_DETAILS))