From patchwork Sun Mar 27 18:57:38 2011
X-Patchwork-Submitter: "H.J. Lu"
X-Patchwork-Id: 88516
Subject: Re: PATCH: Split AVX 32-byte unaligned load/store
Date: Sun, 27 Mar 2011 11:57:38 -0700
From: "H.J. Lu"
To: Uros Bizjak
Cc: GCC Patches

On Sun, Mar 27, 2011 at 10:53 AM, Uros Bizjak wrote:
> On Sun, Mar 27, 2011 at 3:44 PM, H.J. Lu wrote:
>
>> Here is a patch to split AVX 32-byte unaligned load/store:
>>
>> http://gcc.gnu.org/ml/gcc-patches/2011-02/msg00743.html
>>
>> It speeds up some SPEC CPU 2006 benchmarks by up to 6%.
>> OK for trunk?
>
>> 2011-02-11  H.J. Lu
>>
>>       * config/i386/i386.c (flag_opts): Add -mavx256-split-unaligned-load
>>       and -mavx256-split-unaligned-store.
>>       (ix86_option_override_internal): Split 32-byte AVX unaligned
>>       load/store by default.
>>       (ix86_avx256_split_vector_move_misalign): New.
>>       (ix86_expand_vector_move_misalign): Use it.
>>
>>       * config/i386/i386.opt: Add -mavx256-split-unaligned-load and
>>       -mavx256-split-unaligned-store.
>>
>>       * config/i386/sse.md (*avx_mov<mode>_internal): Verify unaligned
>>       256-bit load/store.  Generate unaligned store on misaligned memory
>>       operand.
>>       (*avx_movu<ssemodesuffix><avxmodesuffix>): Verify unaligned
>>       256-bit load/store.
>>       (*avx_movdqu<avxmodesuffix>): Likewise.
>>
>>       * doc/invoke.texi: Document -mavx256-split-unaligned-load and
>>       -mavx256-split-unaligned-store.
>>
>> gcc/testsuite/
>>
>> 2011-02-11  H.J. Lu
>>
>>       * gcc.target/i386/avx256-unaligned-load-1.c: New.
>>       * gcc.target/i386/avx256-unaligned-load-2.c: Likewise.
>>       * gcc.target/i386/avx256-unaligned-load-3.c: Likewise.
>>       * gcc.target/i386/avx256-unaligned-load-4.c: Likewise.
>>       * gcc.target/i386/avx256-unaligned-load-5.c: Likewise.
>>       * gcc.target/i386/avx256-unaligned-load-6.c: Likewise.
>>       * gcc.target/i386/avx256-unaligned-load-7.c: Likewise.
>>       * gcc.target/i386/avx256-unaligned-store-1.c: Likewise.
>>       * gcc.target/i386/avx256-unaligned-store-2.c: Likewise.
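In intrinsics form, the transformation the patch performs is roughly
the following sketch (illustration only, not part of the patch; the
helper names are invented here, and an AVX-capable compiler is
assumed).  A split 32-byte unaligned load becomes two 16-byte loads
plus a vinsertf128, and a split store becomes two vextractf128 stores;
the usual explanation for the SPEC gains is that Sandy Bridge class
hardware handles two unaligned 16-byte halves faster than one
unaligned 32-byte access.

    #include <immintrin.h>

    /* Split 32-byte unaligned load: vmovups xmm + vinsertf128
       instead of a single 32-byte vmovups ymm.  */
    static __m256
    load32_split (const float *p)
    {
      __m128 lo = _mm_loadu_ps (p);      /* low 16 bytes */
      __m128 hi = _mm_loadu_ps (p + 4);  /* high 16 bytes */
      return _mm256_insertf128_ps (_mm256_castps128_ps256 (lo), hi, 1);
    }

    /* Split 32-byte unaligned store: two vextractf128 stores
       instead of a single 32-byte vmovups ymm.  */
    static void
    store32_split (float *p, __m256 v)
    {
      _mm_storeu_ps (p, _mm256_castps256_ps128 (v));        /* low half */
      _mm_storeu_ps (p + 4, _mm256_extractf128_ps (v, 1));  /* high half */
    }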
>>       * gcc.target/i386/avx256-unaligned-store-3.c: Likewise.
>>       * gcc.target/i386/avx256-unaligned-store-4.c: Likewise.
>>       * gcc.target/i386/avx256-unaligned-store-5.c: Likewise.
>>       * gcc.target/i386/avx256-unaligned-store-6.c: Likewise.
>>       * gcc.target/i386/avx256-unaligned-store-7.c: Likewise.
>>
>
>> @@ -203,19 +203,37 @@
>>        return standard_sse_constant_opcode (insn, operands[1]);
>>      case 1:
>>      case 2:
>> +      if (GET_MODE_ALIGNMENT (<MODE>mode) == 256
>> +       && ((TARGET_AVX256_SPLIT_UNALIGNED_STORE
>> +            && MEM_P (operands[0])
>> +            && MEM_ALIGN (operands[0]) < 256)
>> +           || (TARGET_AVX256_SPLIT_UNALIGNED_LOAD
>> +               && MEM_P (operands[1])
>> +               && MEM_ALIGN (operands[1]) < 256)))
>> +     gcc_unreachable ();
>
> Please use "misaligned_operand (operands[...], mode)" instead of
> MEM_P && MEM_ALIGN combo in a couple of places.
>
> OK with that change.
>

This is the patch I checked in.  Thanks.

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 4e8ca69..a4ca762 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -3130,6 +3130,8 @@ ix86_target_string (int isa, int flags, const char *arch, const char *tune,
     { "-mvect8-ret-in-mem",	MASK_VECT8_RETURNS },
     { "-m8bit-idiv",	MASK_USE_8BIT_IDIV },
     { "-mvzeroupper",	MASK_VZEROUPPER },
+    { "-mavx256-split-unaligned-load",	MASK_AVX256_SPLIT_UNALIGNED_LOAD },
+    { "-mavx256-split-unaligned-store",	MASK_AVX256_SPLIT_UNALIGNED_STORE },
   };

   const char *opts[ARRAY_SIZE (isa_opts) + ARRAY_SIZE (flag_opts) + 6][2];
@@ -4274,11 +4276,18 @@ ix86_option_override_internal (bool main_args_p)
   if (TARGET_AVX)
     {
       /* When not optimize for size, enable vzeroupper optimization for
-	 TARGET_AVX with -fexpensive-optimizations.  */
-      if (!optimize_size
-	  && flag_expensive_optimizations
-	  && !(target_flags_explicit & MASK_VZEROUPPER))
-	target_flags |= MASK_VZEROUPPER;
+	 TARGET_AVX with -fexpensive-optimizations and split 32-byte
+	 AVX unaligned load/store.  */
+      if (!optimize_size)
+	{
+	  if (flag_expensive_optimizations
+	      && !(target_flags_explicit & MASK_VZEROUPPER))
+	    target_flags |= MASK_VZEROUPPER;
+	  if (!(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
+	    target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
+	  if (!(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
+	    target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
+	}
     }
   else
     {
@@ -15588,6 +15597,57 @@ ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
       emit_insn (gen_rtx_SET (VOIDmode, op0, op1));
 }

+/* Split 32-byte AVX unaligned load and store if needed.  */
+
+static void
+ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
+{
+  rtx m;
+  rtx (*extract) (rtx, rtx, rtx);
+  rtx (*move_unaligned) (rtx, rtx);
+  enum machine_mode mode;
+
+  switch (GET_MODE (op0))
+    {
+    default:
+      gcc_unreachable ();
+    case V32QImode:
+      extract = gen_avx_vextractf128v32qi;
+      move_unaligned = gen_avx_movdqu256;
+      mode = V16QImode;
+      break;
+    case V8SFmode:
+      extract = gen_avx_vextractf128v8sf;
+      move_unaligned = gen_avx_movups256;
+      mode = V4SFmode;
+      break;
+    case V4DFmode:
+      extract = gen_avx_vextractf128v4df;
+      move_unaligned = gen_avx_movupd256;
+      mode = V2DFmode;
+      break;
+    }
+
+  if (MEM_P (op1) && TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
+    {
+      rtx r = gen_reg_rtx (mode);
+      m = adjust_address (op1, mode, 0);
+      emit_move_insn (r, m);
+      m = adjust_address (op1, mode, 16);
+      r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
+      emit_move_insn (op0, r);
+    }
+  else if (MEM_P (op0) && TARGET_AVX256_SPLIT_UNALIGNED_STORE)
+    {
+      m = adjust_address (op0, mode, 0);
+      emit_insn (extract (m, op1, const0_rtx));
+      m = adjust_address (op0, mode, 16);
+      emit_insn (extract (m, op1, const1_rtx));
+    }
+  else
+    emit_insn (move_unaligned (op0, op1));
+}
+
 /* Implement the movmisalign patterns for SSE.  Non-SSE modes go
    straight to ix86_expand_vector_move.  */
 /* Code generation for scalar reg-reg moves of single and double precision data:
@@ -15672,7 +15732,7 @@ ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
	case 32:
	  op0 = gen_lowpart (V32QImode, op0);
	  op1 = gen_lowpart (V32QImode, op1);
-	  emit_insn (gen_avx_movdqu256 (op0, op1));
+	  ix86_avx256_split_vector_move_misalign (op0, op1);
	  break;
	default:
	  gcc_unreachable ();
@@ -15688,7 +15748,7 @@ ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
	  emit_insn (gen_avx_movups (op0, op1));
	  break;
	case V8SFmode:
-	  emit_insn (gen_avx_movups256 (op0, op1));
+	  ix86_avx256_split_vector_move_misalign (op0, op1);
	  break;
	case V2DFmode:
	  if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
@@ -15701,7 +15761,7 @@ ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
	  emit_insn (gen_avx_movupd (op0, op1));
	  break;
	case V4DFmode:
-	  emit_insn (gen_avx_movupd256 (op0, op1));
+	  ix86_avx256_split_vector_move_misalign (op0, op1);
	  break;
	default:
	  gcc_unreachable ();
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index e02d098..f63a406 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -420,3 +420,11 @@ Emit profiling counter call at function entry before prologue.
 m8bit-idiv
 Target Report Mask(USE_8BIT_IDIV) Save
 Expand 32bit/64bit integer divide into 8bit unsigned integer divide with run-time check
+
+mavx256-split-unaligned-load
+Target Report Mask(AVX256_SPLIT_UNALIGNED_LOAD) Save
+Split 32-byte AVX unaligned load
+
+mavx256-split-unaligned-store
+Target Report Mask(AVX256_SPLIT_UNALIGNED_STORE) Save
+Split 32-byte AVX unaligned store
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 70a0b34..de11f73 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -203,19 +203,35 @@
	return standard_sse_constant_opcode (insn, operands[1]);
     case 1:
     case 2:
+      if (GET_MODE_ALIGNMENT (<MODE>mode) == 256
+	  && ((TARGET_AVX256_SPLIT_UNALIGNED_STORE
+	       && misaligned_operand (operands[0], <MODE>mode))
+	      || (TARGET_AVX256_SPLIT_UNALIGNED_LOAD
+		  && misaligned_operand (operands[1], <MODE>mode))))
+	gcc_unreachable ();
       switch (get_attr_mode (insn))
	{
	case MODE_V8SF:
	case MODE_V4SF:
-	  return "vmovaps\t{%1, %0|%0, %1}";
+	  if (misaligned_operand (operands[0], <MODE>mode)
+	      || misaligned_operand (operands[1], <MODE>mode))
+	    return "vmovups\t{%1, %0|%0, %1}";
+	  else
+	    return "vmovaps\t{%1, %0|%0, %1}";
	case MODE_V4DF:
	case MODE_V2DF:
-	  if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
+	  if (misaligned_operand (operands[0], <MODE>mode)
+	      || misaligned_operand (operands[1], <MODE>mode))
+	    return "vmovupd\t{%1, %0|%0, %1}";
+	  else if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
	    return "vmovaps\t{%1, %0|%0, %1}";
	  else
	    return "vmovapd\t{%1, %0|%0, %1}";
	default:
-	  if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
+	  if (misaligned_operand (operands[0], <MODE>mode)
+	      || misaligned_operand (operands[1], <MODE>mode))
+	    return "vmovdqu\t{%1, %0|%0, %1}";
+	  else if (TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
	    return "vmovaps\t{%1, %0|%0, %1}";
	  else
	    return "vmovdqa\t{%1, %0|%0, %1}";
@@ -400,7 +416,15 @@
	  UNSPEC_MOVU))]
   "AVX_VEC_FLOAT_MODE_P (<MODE>mode)
    && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
-  "vmovu<ssemodesuffix>\t{%1, %0|%0, %1}"
+{
+  if (GET_MODE_ALIGNMENT (<MODE>mode) == 256
+      && ((TARGET_AVX256_SPLIT_UNALIGNED_STORE
+	   && misaligned_operand (operands[0], <MODE>mode))
+	  || (TARGET_AVX256_SPLIT_UNALIGNED_LOAD
+	      && misaligned_operand (operands[1], <MODE>mode))))
+    gcc_unreachable ();
+  return "vmovu<ssemodesuffix>\t{%1, %0|%0, %1}";
+}
   [(set_attr "type" "ssemov")
    (set_attr "movu" "1")
    (set_attr "prefix" "vex")
@@ -459,7 +483,15 @@
     [(match_operand:AVXMODEQI 1 "nonimmediate_operand" "xm,x")]
     UNSPEC_MOVU))]
   "TARGET_AVX && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
-  "vmovdqu\t{%1, %0|%0, %1}"
+{
+  if (GET_MODE_ALIGNMENT (<MODE>mode) == 256
+      && ((TARGET_AVX256_SPLIT_UNALIGNED_STORE
+	   && misaligned_operand (operands[0], <MODE>mode))
+	  || (TARGET_AVX256_SPLIT_UNALIGNED_LOAD
+	      && misaligned_operand (operands[1], <MODE>mode))))
+    gcc_unreachable ();
+  return "vmovdqu\t{%1, %0|%0, %1}";
+}
   [(set_attr "type" "ssemov")
    (set_attr "movu" "1")
    (set_attr "prefix" "vex")
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 925455d..85bf2b4 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -602,7 +602,8 @@ Objective-C and Objective-C++ Dialects}.
 -momit-leaf-frame-pointer -mno-red-zone -mno-tls-direct-seg-refs @gol
 -mcmodel=@var{code-model} -mabi=@var{name} @gol
 -m32 -m64 -mlarge-data-threshold=@var{num} @gol
--msse2avx -mfentry -m8bit-idiv}
+-msse2avx -mfentry -m8bit-idiv @gol
+-mavx256-split-unaligned-load -mavx256-split-unaligned-store}

 @emph{i386 and x86-64 Windows Options}
 @gccoptlist{-mconsole -mcygwin -mno-cygwin -mdll @gol
@@ -12669,6 +12670,12 @@ runt-time check.
 If both dividend and divisor are within range of 0 to 255, 8bit
 unsigned integer divide will be used instead of 32bit/64bit integer
 divide.

+@item -mavx256-split-unaligned-load
+@itemx -mavx256-split-unaligned-store
+@opindex avx256-split-unaligned-load
+@opindex avx256-split-unaligned-store
+Split 32-byte AVX unaligned load and store.
+
 @end table

 These @samp{-m} switches are supported in addition to the above
diff --git a/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-1.c b/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-1.c
new file mode 100644
index 0000000..023e859
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-1.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -dp -mavx -mavx256-split-unaligned-load" } */
+
+#define N 1024
+
+float a[N], b[N+3], c[N];
+
+void
+avx_test (void)
+{
+  int i;
+
+  for (i = 0; i < N; i++)
+    c[i] = a[i] * b[i+3];
+}
+
+/* { dg-final { scan-assembler-not "\\*avx_movups256/1" } } */
+/* { dg-final { scan-assembler "\\*avx_movups/1" } } */
+/* { dg-final { scan-assembler "vinsertf128" } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-2.c b/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-2.c
new file mode 100644
index 0000000..8394e27
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-2.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O3 -dp -mavx -mavx256-split-unaligned-load" } */
+
+#define N 1024
+
+char **ep;
+char **fp;
+
+void
+avx_test (void)
+{
+  int i;
+  char **ap;
+  char **bp;
+  char **cp;
+
+  ap = ep;
+  bp = fp;
+  for (i = 128; i >= 0; i--)
+    {
+      *ap++ = *cp++;
+      *bp++ = 0;
+    }
+}
+
+/* { dg-final { scan-assembler-not "\\*avx_movdqu256/1" } } */
+/* { dg-final { scan-assembler "\\*avx_movdqu/1" } } */
+/* { dg-final { scan-assembler "vinsertf128" } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-3.c b/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-3.c
new file mode 100644
index 0000000..ec7d59d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-3.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -dp -mavx -mavx256-split-unaligned-load" } */
+
+#define N 1024
+
+double a[N], b[N+3], c[N];
+
+void
+avx_test (void)
+{
+  int i;
+
+  for (i = 0; i < N; i++)
+    c[i] = a[i] * b[i+3];
+}
+
+/* { dg-final { scan-assembler-not "\\*avx_movupd256/1" } } */
+/* { dg-final { scan-assembler "\\*avx_movupd/1" } } */
+/* { dg-final { scan-assembler "vinsertf128" } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-4.c b/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-4.c
new file mode 100644
index 0000000..0d3ef33
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-4.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -dp -mavx -mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store" } */
+
+#define N 1024
+
+float a[N], b[N+3];
+
+void
+avx_test (void)
+{
+  int i;
+
+  for (i = 0; i < N; i++)
+    b[i] = a[i+3] * 2;
+}
+
+/* { dg-final { scan-assembler "\\*avx_movups256/1" } } */
+/* { dg-final { scan-assembler-not "\\*avx_movups/1" } } */
+/* { dg-final { scan-assembler-not "vinsertf128" } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-5.c b/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-5.c
new file mode 100644
index 0000000..153b66f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-5.c
@@ -0,0 +1,43 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx } */
+/* { dg-options "-O3 -dp -mavx -mavx256-split-unaligned-load" } */
+
+#include "avx-check.h"
+
+#define N 8
+
+float a[N+3] = { -1, -1, -1, 24.43, 68.346, 43.35,
+		 546.46, 46.79, 82.78, 82.7, 9.4 };
+float b[N];
+float c[N];
+
+void
+foo (void)
+{
+  int i;
+
+  for (i = 0; i < N; i++)
+    b[i] = a[i+3] * 2;
+}
+
+__attribute__ ((noinline))
+float
+bar (float x)
+{
+  return x * 2;
+}
+
+void
+avx_test (void)
+{
+  int i;
+
+  foo ();
+
+  for (i = 0; i < N; i++)
+    c[i] = bar (a[i+3]);
+
+  for (i = 0; i < N; i++)
+    if (b[i] != c[i])
+      abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-6.c b/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-6.c
new file mode 100644
index 0000000..2fa984c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-6.c
@@ -0,0 +1,42 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx } */
+/* { dg-options "-O3 -dp -mavx -mavx256-split-unaligned-load" } */
+
+#include "avx-check.h"
+
+#define N 4
+
+double a[N+3] = { -1, -1, -1, 24.43, 68.346, 43.35, 546.46 };
+double b[N];
+double c[N];
+
+void
+foo (void)
+{
+  int i;
+
+  for (i = 0; i < N; i++)
+    b[i] = a[i+3] * 2;
+}
+
+__attribute__ ((noinline))
+double
+bar (double x)
+{
+  return x * 2;
+}
+
+void
+avx_test (void)
+{
+  int i;
+
+  foo ();
+
+  for (i = 0; i < N; i++)
+    c[i] = bar (a[i+3]);
+
+  for (i = 0; i < N; i++)
+    if (b[i] != c[i])
+      abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-7.c b/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-7.c
new file mode 100644
index 0000000..ad16a53
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-load-7.c
@@ -0,0 +1,60 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx } */
+/* { dg-options "-O3 -dp -mavx -mavx256-split-unaligned-load" } */
+
+#include "avx-check.h"
+
+#define N 128
+
+char **ep;
+char **fp;
+char **mp;
+char **lp;
+
+__attribute__ ((noinline))
+void
+foo (void)
+{
+  mp = (char **) malloc ((N + 1) * sizeof (char *));
+  lp = (char **) malloc ((N + 1) * sizeof (char *));
+  ep = (char **) malloc ((N + 1) * sizeof (char *));
+  fp = (char **) malloc ((N + 1) * sizeof (char *));
+}
+
+void
+avx_test (void)
+{
+  int i;
+  char **ap, **bp, **cp, **dp;
+  char *str = "STR";
+
+  foo ();
+
+  cp = mp;
+  dp = lp;
+
+  for (i = N; i >= 0; i--)
+    {
+      *cp++ = str;
+      *dp++ = str;
+    }
+
+  ap = ep;
+  bp = fp;
+  cp = mp;
+  dp = lp;
+
+  for (i = N; i >= 0; i--)
+    {
+      *ap++ = *cp++;
+      *bp++ = *dp++;
+    }
+
+  for (i = N; i >= 0; i--)
+    {
+      if (strcmp (*--ap, "STR") != 0)
+	abort ();
+      if (strcmp (*--bp, "STR") != 0)
+	abort ();
+    }
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-1.c b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-1.c
new file mode 100644
index 0000000..99db55c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-1.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -dp -mavx -mavx256-split-unaligned-store" } */
+
+#define N 1024
+
+float a[N], b[N+3], c[N], d[N];
+
+void
+avx_test (void)
+{
+  int i;
+
+  for (i = 0; i < N; i++)
+    b[i+3] = a[i] * 10.0;
+
+  for (i = 0; i < N; i++)
+    d[i] = c[i] * 20.0;
+}
+
+/* { dg-final { scan-assembler-not "\\*avx_movups256/2" } } */
+/* { dg-final { scan-assembler "movups.*\\*avx_movv4sf_internal/3" } } */
+/* { dg-final { scan-assembler "vextractf128" } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-2.c b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-2.c
new file mode 100644
index 0000000..38ee9e2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-2.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O3 -dp -mavx -mavx256-split-unaligned-store" } */
+
+#define N 1024
+
+char **ep;
+char **fp;
+
+void
+avx_test (void)
+{
+  int i;
+  char **ap;
+  char **bp;
+  char **cp;
+
+  ap = ep;
+  bp = fp;
+  for (i = 128; i >= 0; i--)
+    {
+      *ap++ = *cp++;
+      *bp++ = 0;
+    }
+}
+
+/* { dg-final { scan-assembler-not "\\*avx_movdqu256/2" } } */
+/* { dg-final { scan-assembler "movdqu.*\\*avx_movv16qi_internal/3" } } */
+/* { dg-final { scan-assembler "vextractf128" } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c
new file mode 100644
index 0000000..eaab6fd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-3.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -dp -mavx -mavx256-split-unaligned-store" } */
+
+#define N 1024
+
+double a[N], b[N+3], c[N], d[N];
+
+void
+avx_test (void)
+{
+  int i;
+
+  for (i = 0; i < N; i++)
+    b[i+3] = a[i] * 10.0;
+
+  for (i = 0; i < N; i++)
+    d[i] = c[i] * 20.0;
+}
+
+/* { dg-final { scan-assembler-not "\\*avx_movupd256/2" } } */
+/* { dg-final { scan-assembler "movupd.*\\*avx_movv2df_internal/3" } } */
+/* { dg-final { scan-assembler "vextractf128" } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-4.c b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-4.c
new file mode 100644
index 0000000..96cca66
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-4.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -dp -mavx -mno-avx256-split-unaligned-load -mno-avx256-split-unaligned-store" } */
+
+#define N 1024
+
+float a[N], b[N+3], c[N];
+
+void
+avx_test (void)
+{
+  int i;
+
+  for (i = 0; i < N; i++)
+    b[i+3] = a[i] * c[i];
+}
+
+/* { dg-final { scan-assembler "\\*avx_movups256/2" } } */
+/* { dg-final { scan-assembler-not "\\*avx_movups/2" } } */
+/* { dg-final { scan-assembler-not "\\*avx_movv4sf_internal/3" } } */
+/* { dg-final { scan-assembler-not "vextractf128" } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-5.c b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-5.c
new file mode 100644
index 0000000..642da3c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-5.c
@@ -0,0 +1,42 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx } */
+/* { dg-options "-O3 -dp -mavx -mavx256-split-unaligned-store" } */
+
+#include "avx-check.h"
+
+#define N 8
+
+float a[N] = { 24.43, 68.346, 43.35, 546.46, 46.79, 82.78, 82.7, 9.4 };
+float b[N+3];
+float c[N+3];
+
+void
+foo (void)
+{
+  int i;
+
+  for (i = 0; i < N; i++)
+    b[i+3] = a[i] * 2;
+}
+
+__attribute__ ((noinline))
+float
+bar (float x)
+{
+  return x * 2;
+}
+
+void
+avx_test (void)
+{
+  int i;
+
+  foo ();
+
+  for (i = 0; i < N; i++)
+    c[i+3] = bar (a[i]);
+
+  for (i = 0; i < N; i++)
+    if (b[i+3] != c[i+3])
+      abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-6.c b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-6.c
new file mode 100644
index 0000000..a0de7a5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-6.c
@@ -0,0 +1,42 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx } */
+/* { dg-options "-O3 -dp -mavx -mavx256-split-unaligned-store" } */
+
+#include "avx-check.h"
+
+#define N 4
+
+double a[N] = { 24.43, 68.346, 43.35, 546.46 };
+double b[N+3];
+double c[N+3];
+
+void
+foo (void)
+{
+  int i;
+
+  for (i = 0; i < N; i++)
+    b[i+3] = a[i] * 2;
+}
+
+__attribute__ ((noinline))
+double
+bar (double x)
+{
+  return x * 2;
+}
+
+void
+avx_test (void)
+{
+  int i;
+
+  foo ();
+
+  for (i = 0; i < N; i++)
+    c[i+3] = bar (a[i]);
+
+  for (i = 0; i < N; i++)
+    if (b[i+3] != c[i+3])
+      abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-7.c b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-7.c
new file mode 100644
index 0000000..4272dc3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx256-unaligned-store-7.c
@@ -0,0 +1,45 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx } */
+/* { dg-options "-O3 -dp -mavx -mavx256-split-unaligned-store" } */
+
+#include "avx-check.h"
+
+#define N 128
+
+char **ep;
+char **fp;
+
+__attribute__ ((noinline))
+void
+foo (void)
+{
+  ep = (char **) malloc ((N + 1) * sizeof (char *));
+  fp = (char **) malloc ((N + 1) * sizeof (char *));
+}
+
+void
+avx_test (void)
+{
+  int i;
+  char **ap, **bp;
+  char *str = "STR";
+
+  foo ();
+
+  ap = ep;
+  bp = fp;
+
+  for (i = N; i >= 0; i--)
+    {
+      *ap++ = str;
+      *bp++ = str;
+    }
+
+  for (i = N; i >= 0; i--)
+    {
+      if (strcmp (*--ap, "STR") != 0)
+	abort ();
+      if (strcmp (*--bp, "STR") != 0)
+	abort ();
+    }
+}
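The run tests above exercise the split through the vectorizer; the
split sequence itself can also be sanity-checked in isolation.  Below
is a minimal standalone check (an illustration only, not part of the
patch; compile with -mavx and run on AVX hardware) that compares the
split load bit-for-bit against a single 32-byte unaligned load:

    #include <immintrin.h>
    #include <stdlib.h>
    #include <string.h>

    int
    main (void)
    {
      float buf[12], out_whole[8], out_split[8];
      const float *p;
      __m256 whole, split;
      __m128 lo, hi;
      int i;

      for (i = 0; i < 12; i++)
	buf[i] = i + 0.5f;

      /* Source one float past the array start, so it is not 32-byte
	 aligned in general.  */
      p = buf + 1;

      whole = _mm256_loadu_ps (p);   /* single 32-byte unaligned load */
      lo = _mm_loadu_ps (p);         /* split sequence: low half */
      hi = _mm_loadu_ps (p + 4);     /* split sequence: high half */
      split = _mm256_insertf128_ps (_mm256_castps128_ps256 (lo), hi, 1);

      _mm256_storeu_ps (out_whole, whole);
      _mm256_storeu_ps (out_split, split);

      if (memcmp (out_whole, out_split, sizeof out_whole) != 0)
	abort ();
      return 0;
    }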