From patchwork Tue Oct 20 17:15:15 2015 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Nathan Sidwell X-Patchwork-Id: 533084 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Received: from sourceware.org (server1.sourceware.org [209.132.180.131]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by ozlabs.org (Postfix) with ESMTPS id 47320140281 for ; Wed, 21 Oct 2015 04:15:48 +1100 (AEDT) Authentication-Results: ozlabs.org; dkim=pass (1024-bit key; unprotected) header.d=gcc.gnu.org header.i=@gcc.gnu.org header.b=dnRwgTVu; dkim-atps=neutral DomainKey-Signature: a=rsa-sha1; c=nofws; d=gcc.gnu.org; h=list-id :list-unsubscribe:list-archive:list-post:list-help:sender:to :from:subject:message-id:date:mime-version:content-type; q=dns; s=default; b=BYChCLJBc+4h7rS6zUP9WY9tMpF5ECGHS74SW/QOcSN/3kV7Qu kByti18T8SPGvii6txoVQBXxnhJDarZCm96i43oMiEsUcfey9K6xQNXy4d1M64ar EIGIn7mPApoLKFWQVU5io5P8r32ke0LotsQPFB54qxX5yhe6to150bHfk= DKIM-Signature: v=1; a=rsa-sha1; c=relaxed; d=gcc.gnu.org; h=list-id :list-unsubscribe:list-archive:list-post:list-help:sender:to :from:subject:message-id:date:mime-version:content-type; s= default; bh=T4CnI8kAr4qEAgldMZmu2QaMgrk=; b=dnRwgTVuQy8ej7B/yUrI iwnzNE0UYO54lqW9vSXwLYYfVXXH5Hr/Fbj3UENrzvw9wV42VqrBmTXhIaKPwUJh 0jQfknzZLPdpik9P/XyVe3ZdT5bqnmuSJ+1zw0mLLNoHJ4Zx8Su3BRtXJVb82zZS mWxWW+LIdWxtzzBPfc99o2s= Received: (qmail 65402 invoked by alias); 20 Oct 2015 17:15:28 -0000 Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Archive: List-Post: List-Help: Sender: gcc-patches-owner@gcc.gnu.org Delivered-To: mailing list gcc-patches@gcc.gnu.org Received: (qmail 65181 invoked by uid 89); 20 Oct 2015 17:15:24 -0000 Authentication-Results: sourceware.org; auth=none X-Virus-Found: No X-Spam-SWARE-Status: No, score=-1.8 required=5.0 tests=BAYES_00, FREEMAIL_FROM, KAM_ASCII_DIVIDERS, RCVD_IN_DNSWL_LOW, SPF_PASS autolearn=no version=3.3.2 X-HELO: mail-qg0-f46.google.com Received: from mail-qg0-f46.google.com (HELO mail-qg0-f46.google.com) (209.85.192.46) by sourceware.org (qpsmtpd/0.93/v0.84-503-g423c35a) with (AES128-GCM-SHA256 encrypted) ESMTPS; Tue, 20 Oct 2015 17:15:19 +0000 Received: by qgem9 with SMTP id m9so21186496qge.1 for ; Tue, 20 Oct 2015 10:15:17 -0700 (PDT) X-Received: by 10.140.151.140 with SMTP id 134mr3640036qhx.25.1445361316974; Tue, 20 Oct 2015 10:15:16 -0700 (PDT) Received: from ?IPv6:2601:181:c000:c497:a2a8:cdff:fe3e:b48? ([2601:181:c000:c497:a2a8:cdff:fe3e:b48]) by smtp.googlemail.com with ESMTPSA id i34sm1622477qgd.8.2015.10.20.10.15.16 (version=TLSv1.2 cipher=ECDHE-RSA-AES128-GCM-SHA256 bits=128/128); Tue, 20 Oct 2015 10:15:16 -0700 (PDT) To: GCC Patches From: Nathan Sidwell Subject: [gomp4] Update openacc loop iteration partitioning Message-ID: <562676A3.5010804@acm.org> Date: Tue, 20 Oct 2015 13:15:15 -0400 User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Thunderbird/38.3.0 MIME-Version: 1.0 In preparing this patch set for trunk, I discovered I'd flubbed the calculations for default contiguous looping. This fixes the calculation in the target-side loop transformation code. I also realized that the calculation appropriate for an accelerator is not the best for the host. For the latter we want this to expand to a regular loop iterator. Applied to gomp4 branch. nathan 2015-10-20 Nathan Sidwell gcc/ * omp-low.c (expand_oacc_for): Use -1 for unspecified static chunking. Remove unnecessary gimple forcing. (oacc_xform_loop): Adjust chunk size calculation. Don't chunk on host. libgomp/ * testsuite/libgomp.oacc-c-c++-common/loop-g-1.c: New. * testsuite/libgomp.oacc-c-c++-common/loop-g-2.c: New. * testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c: New. * testsuite/libgomp.oacc-c-c++-common/loop-v-1.c: New. * testsuite/libgomp.oacc-c-c++-common/loop-w-1.c: New. * testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c: New. Index: gcc/omp-low.c =================================================================== --- gcc/omp-low.c (revision 229092) +++ gcc/omp-low.c (working copy) @@ -9962,7 +9962,7 @@ expand_oacc_for (struct omp_region *regi enum tree_code cond_code = fd->loop.cond_code; enum tree_code plus_code = PLUS_EXPR; - tree chunk_size = integer_one_node; + tree chunk_size = integer_minus_one_node; tree gwv = integer_zero_node; tree iter_type = TREE_TYPE (v); tree diff_type = iter_type; @@ -10110,10 +10110,6 @@ expand_oacc_for (struct omp_region *regi ass = gimple_build_assign (chunk_no, expr); gsi_insert_before (&gsi, ass, GSI_SAME_STMT); - expr = fold_convert (diff_type, chunk_size); - chunk_size = force_gimple_operand_gsi (&gsi, expr, true, - NULL_TREE, true, GSI_SAME_STMT); - call = gimple_build_call_internal (IFN_GOACC_LOOP, 6, build_int_cst (integer_type_node, IFN_GOACC_LOOP_CHUNKS), @@ -16892,25 +16888,26 @@ oacc_xform_loop (gcall *call) tree dir = gimple_call_arg (call, 1); tree range = gimple_call_arg (call, 2); tree step = gimple_call_arg (call, 3); - tree chunk_size = gimple_call_arg (call, 4); + tree chunk_size = NULL_TREE; unsigned mask = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 5)); tree lhs = gimple_call_lhs (call); tree type = TREE_TYPE (lhs); tree diff_type = TREE_TYPE (range); tree r = NULL_TREE; gimple_seq seq = NULL; - bool chunking, striding; + bool chunking = false, striding = true; unsigned outer_mask = mask & (~mask + 1); // Outermost partitioning unsigned inner_mask = mask & ~outer_mask; // Inner partitioning (if any) - if (integer_zerop (chunk_size)) - { - /* If we're at the gang or (worker with vector), we want each to - execute a contiguous run of iterations. Otherwise we want - each element to stride. */ - striding = !((outer_mask & GOMP_DIM_MASK (GOMP_DIM_GANG)) - || ((outer_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) - && (outer_mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))); +#ifdef ACCEL_COMPILER + chunk_size = gimple_call_arg (call, 4); + if (integer_minus_onep (chunk_size) /* Force static allocation. */ + || integer_zerop (chunk_size)) /* Default (also static). */ + { + /* If we're at the gang level, we want each to execute a + contiguous run of iterations. Otherwise we want each element + to stride. */ + striding = !(outer_mask & GOMP_DIM_MASK (GOMP_DIM_GANG)); chunking = false; } else @@ -16919,7 +16916,16 @@ oacc_xform_loop (gcall *call) striding = integer_onep (chunk_size); chunking = !striding; } +#endif + /* striding=true, chunking=true + -> invalid. + striding=true, chunking=false + -> chunks=1 + striding=false,chunking=true + -> chunks=ceil (range/(chunksize*threads*step)) + striding=false,chunking=false + -> chunk_size=ceil(range/(threads*step)),chunks=1 */ push_gimplify_context (true); switch (code) @@ -16963,31 +16969,25 @@ oacc_xform_loop (gcall *call) } else { - tree span; tree inner_size = oacc_thread_numbers (false, inner_mask, &seq); tree outer_size = oacc_thread_numbers (false, outer_mask, &seq); tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size), inner_size, outer_size); + volume = fold_convert (diff_type, volume); if (chunking) - { - chunk_size = fold_convert (diff_type, chunk_size); - - span = inner_size; - span = fold_convert (diff_type, span); - span = fold_build2 (MULT_EXPR, diff_type, span, chunk_size); - } + chunk_size = fold_convert (diff_type, chunk_size); else { - tree per = fold_convert (diff_type, volume); - per = fold_build2 (MULT_EXPR, diff_type, per, step); + tree per = fold_build2 (MULT_EXPR, diff_type, volume, step); - span = build2 (MINUS_EXPR, diff_type, range, dir); - span = build2 (PLUS_EXPR, diff_type, span, per); - span = build2 (TRUNC_DIV_EXPR, diff_type, span, per); - span = build2 (MULT_EXPR, diff_type, span, inner_size); + chunk_size = build2 (MINUS_EXPR, diff_type, range, dir); + chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per); + chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per); } + tree span = build2 (MULT_EXPR, diff_type, chunk_size, + fold_convert (diff_type, inner_size)); r = oacc_thread_numbers (true, outer_mask, &seq); r = fold_convert (diff_type, r); r = build2 (MULT_EXPR, diff_type, r, span); @@ -16998,9 +16998,9 @@ oacc_xform_loop (gcall *call) if (chunking) { - tree chunk = gimple_call_arg (call, 6); - tree per = fold_convert (diff_type, volume); - per = fold_build2 (MULT_EXPR, diff_type, per, chunk_size); + tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6)); + tree per + = fold_build2 (MULT_EXPR, diff_type, volume, chunk_size); per = build2 (MULT_EXPR, diff_type, per, chunk); r = build2 (PLUS_EXPR, diff_type, r, per); @@ -17016,29 +17016,29 @@ oacc_xform_loop (gcall *call) r = range; else { - tree offset = gimple_call_arg (call, 6); - tree span; - - if (chunking) - { - chunk_size = fold_convert (diff_type, chunk_size); + tree inner_size = oacc_thread_numbers (false, inner_mask, &seq); + tree outer_size = oacc_thread_numbers (false, outer_mask, &seq); + tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size), + inner_size, outer_size); - span = oacc_thread_numbers (false, inner_mask, &seq); - span = fold_convert (diff_type, span); - span = fold_build2 (MULT_EXPR, diff_type, span, chunk_size); - } + volume = fold_convert (diff_type, volume); + if (chunking) + chunk_size = fold_convert (diff_type, chunk_size); else { - tree per = oacc_thread_numbers (false, mask, &seq); - per = fold_convert (diff_type, per); - per = build2 (MULT_EXPR, diff_type, per, step); - span = build2 (MINUS_EXPR, diff_type, range, dir); - span = build2 (PLUS_EXPR, diff_type, span, per); - span = build2 (TRUNC_DIV_EXPR, diff_type, span, per); + tree per = fold_build2 (MULT_EXPR, diff_type, volume, step); + + chunk_size = build2 (MINUS_EXPR, diff_type, range, dir); + chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per); + chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per); } + tree span = build2 (MULT_EXPR, diff_type, chunk_size, + fold_convert (diff_type, inner_size)); + r = fold_build2 (MULT_EXPR, diff_type, span, step); + tree offset = gimple_call_arg (call, 6); r = build2 (PLUS_EXPR, diff_type, r, fold_convert (diff_type, offset)); r = build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR, Index: libgomp/testsuite/libgomp.oacc-c-c++-common/loop-g-1.c =================================================================== --- libgomp/testsuite/libgomp.oacc-c-c++-common/loop-g-1.c (revision 0) +++ libgomp/testsuite/libgomp.oacc-c-c++-common/loop-g-1.c (working copy) @@ -0,0 +1,57 @@ +/* { dg-do run } */ +/* { dg-additional-options "-O2" */ + +#include + +#define N (32*32*32+17) +int main () +{ + int ary[N]; + int ix; + int exit = 0; + int ondev = 0; + + for (ix = 0; ix < N;ix++) + ary[ix] = -1; + +#pragma acc parallel num_gangs(32) vector_length(32) copy(ary) copy(ondev) + { +#pragma acc loop gang + for (unsigned ix = 0; ix < N; ix++) + { + if (__builtin_acc_on_device (5)) + { + int g = 0, w = 0, v = 0; + + __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g)); + __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w)); + __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v)); + ary[ix] = (g << 16) | (w << 8) | v; + ondev = 1; + } + else + ary[ix] = ix; + } + } + + for (ix = 0; ix < N; ix++) + { + int expected = ix; + if(ondev) + { + int g = ix / ((N + 31) / 32); + int w = 0; + int v = 0; + + expected = (g << 16) | (w << 8) | v; + } + + if (ary[ix] != expected) + { + exit = 1; + printf ("ary[%d]=%x expected %x\n", ix, ary[ix], expected); + } + } + + return exit; +} Index: libgomp/testsuite/libgomp.oacc-c-c++-common/loop-g-2.c =================================================================== --- libgomp/testsuite/libgomp.oacc-c-c++-common/loop-g-2.c (revision 0) +++ libgomp/testsuite/libgomp.oacc-c-c++-common/loop-g-2.c (working copy) @@ -0,0 +1,57 @@ +/* { dg-do run } */ +/* { dg-additional-options "-O2" */ + +#include + +#define N (32*32*32+17) +int main () +{ + int ary[N]; + int ix; + int exit = 0; + int ondev = 0; + + for (ix = 0; ix < N;ix++) + ary[ix] = -1; + +#pragma acc parallel num_gangs(32) vector_length(32) copy(ary) copy(ondev) + { +#pragma acc loop gang (static:1) + for (unsigned ix = 0; ix < N; ix++) + { + if (__builtin_acc_on_device (5)) + { + int g = 0, w = 0, v = 0; + + __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g)); + __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w)); + __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v)); + ary[ix] = (g << 16) | (w << 8) | v; + ondev = 1; + } + else + ary[ix] = ix; + } + } + + for (ix = 0; ix < N; ix++) + { + int expected = ix; + if(ondev) + { + int g = ix % 32; + int w = 0; + int v = 0; + + expected = (g << 16) | (w << 8) | v; + } + + if (ary[ix] != expected) + { + exit = 1; + printf ("ary[%d]=%x expected %x\n", ix, ary[ix], expected); + } + } + + return exit; +} Index: libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c =================================================================== --- libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c (revision 0) +++ libgomp/testsuite/libgomp.oacc-c-c++-common/loop-gwv-1.c (working copy) @@ -0,0 +1,59 @@ +/* { dg-do run } */ +/* { dg-additional-options "-O2" */ + +#include + +#define N (32*32*32+17) +int main () +{ + int ary[N]; + int ix; + int exit = 0; + int ondev = 0; + + for (ix = 0; ix < N;ix++) + ary[ix] = -1; + +#pragma acc parallel num_gangs(32) num_workers(32) vector_length(32) copy(ary) copy(ondev) + { +#pragma acc loop gang worker vector + for (unsigned ix = 0; ix < N; ix++) + { + if (__builtin_acc_on_device (5)) + { + int g = 0, w = 0, v = 0; + + __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g)); + __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w)); + __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v)); + ary[ix] = (g << 16) | (w << 8) | v; + ondev = 1; + } + else + ary[ix] = ix; + } + } + + for (ix = 0; ix < N; ix++) + { + int expected = ix; + if(ondev) + { + int chunk_size = (N + 32*32*32 - 1) / (32*32*32); + + int g = ix / (chunk_size * 32 * 32); + int w = ix / 32 % 32; + int v = ix % 32; + + expected = (g << 16) | (w << 8) | v; + } + + if (ary[ix] != expected) + { + exit = 1; + printf ("ary[%d]=%x expected %x\n", ix, ary[ix], expected); + } + } + + return exit; +} Index: libgomp/testsuite/libgomp.oacc-c-c++-common/loop-v-1.c =================================================================== --- libgomp/testsuite/libgomp.oacc-c-c++-common/loop-v-1.c (revision 0) +++ libgomp/testsuite/libgomp.oacc-c-c++-common/loop-v-1.c (working copy) @@ -0,0 +1,57 @@ +/* { dg-do run } */ +/* { dg-additional-options "-O2" */ + +#include + +#define N (32*32*32+17) +int main () +{ + int ary[N]; + int ix; + int exit = 0; + int ondev = 0; + + for (ix = 0; ix < N;ix++) + ary[ix] = -1; + +#pragma acc parallel vector_length(32) copy(ary) copy(ondev) + { +#pragma acc loop vector + for (unsigned ix = 0; ix < N; ix++) + { + if (__builtin_acc_on_device (5)) + { + int g = 0, w = 0, v = 0; + + __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g)); + __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w)); + __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v)); + ary[ix] = (g << 16) | (w << 8) | v; + ondev = 1; + } + else + ary[ix] = ix; + } + } + + for (ix = 0; ix < N; ix++) + { + int expected = ix; + if(ondev) + { + int g = 0; + int w = 0; + int v = ix % 32; + + expected = (g << 16) | (w << 8) | v; + } + + if (ary[ix] != expected) + { + exit = 1; + printf ("ary[%d]=%x expected %x\n", ix, ary[ix], expected); + } + } + + return exit; +} Index: libgomp/testsuite/libgomp.oacc-c-c++-common/loop-w-1.c =================================================================== --- libgomp/testsuite/libgomp.oacc-c-c++-common/loop-w-1.c (revision 0) +++ libgomp/testsuite/libgomp.oacc-c-c++-common/loop-w-1.c (working copy) @@ -0,0 +1,57 @@ +/* { dg-do run } */ +/* { dg-additional-options "-O2" */ + +#include + +#define N (32*32*32+17) +int main () +{ + int ary[N]; + int ix; + int exit = 0; + int ondev = 0; + + for (ix = 0; ix < N;ix++) + ary[ix] = -1; + +#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev) + { +#pragma acc loop worker + for (unsigned ix = 0; ix < N; ix++) + { + if (__builtin_acc_on_device (5)) + { + int g = 0, w = 0, v = 0; + + __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g)); + __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w)); + __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v)); + ary[ix] = (g << 16) | (w << 8) | v; + ondev = 1; + } + else + ary[ix] = ix; + } + } + + for (ix = 0; ix < N; ix++) + { + int expected = ix; + if(ondev) + { + int g = 0; + int w = ix % 32; + int v = 0; + + expected = (g << 16) | (w << 8) | v; + } + + if (ary[ix] != expected) + { + exit = 1; + printf ("ary[%d]=%x expected %x\n", ix, ary[ix], expected); + } + } + + return exit; +} Index: libgomp/testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c =================================================================== --- libgomp/testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c (revision 0) +++ libgomp/testsuite/libgomp.oacc-c-c++-common/loop-wv-1.c (working copy) @@ -0,0 +1,57 @@ +/* { dg-do run } */ +/* { dg-additional-options "-O2" */ + +#include + +#define N (32*32*32+17) +int main () +{ + int ary[N]; + int ix; + int exit = 0; + int ondev = 0; + + for (ix = 0; ix < N;ix++) + ary[ix] = -1; + +#pragma acc parallel num_workers(32) vector_length(32) copy(ary) copy(ondev) + { +#pragma acc loop worker vector + for (unsigned ix = 0; ix < N; ix++) + { + if (__builtin_acc_on_device (5)) + { + int g = 0, w = 0, v = 0; + + __asm__ volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (g)); + __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (w)); + __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (v)); + ary[ix] = (g << 16) | (w << 8) | v; + ondev = 1; + } + else + ary[ix] = ix; + } + } + + for (ix = 0; ix < N; ix++) + { + int expected = ix; + if(ondev) + { + int g = 0; + int w = (ix / 32) % 32; + int v = ix % 32; + + expected = (g << 16) | (w << 8) | v; + } + + if (ary[ix] != expected) + { + exit = 1; + printf ("ary[%d]=%x expected %x\n", ix, ary[ix], expected); + } + } + + return exit; +}