Message ID: 20210603065408.47912-2-hongtao.liu@intel.com
State: New
Series: [1/2] CALL_INSN may not be a real function call.
Ping.  This is a split backend patch, a follow-up to
https://gcc.gnu.org/pipermail/gcc-patches/2021-June/571545.html

On Thu, Jun 3, 2021 at 2:55 PM liuhongt via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> When __builtin_ia32_vzeroupper is called explicitly, the corresponding
> vzeroupper pattern does not carry any CLOBBERs or SETs before LRA,
> which leads to incorrect optimization in pass_reload.  To solve this
> problem, this patch remodels the instruction as a call_insn whose call
> has a special vzeroupper ABI.
>
> gcc/ChangeLog:
>
>         PR target/82735
>         * config/i386/i386-expand.c (ix86_expand_builtin): Remove
>         assignment of cfun->machine->has_explicit_vzeroupper.
>         * config/i386/i386-features.c
>         (ix86_add_reg_usage_to_vzerouppers): Delete.
>         (ix86_add_reg_usage_to_vzeroupper): Ditto.
>         (rest_of_handle_insert_vzeroupper): Remove
>         ix86_add_reg_usage_to_vzerouppers, add df_analyze at the end
>         of the function.
>         (gate): Remove cfun->machine->has_explicit_vzeroupper.
>         * config/i386/i386-protos.h (ix86_expand_avx_vzeroupper):
>         Declare.
>         * config/i386/i386.c (ix86_insn_callee_abi): New function.
>         (ix86_initialize_callee_abi): Ditto.
>         (ix86_expand_avx_vzeroupper): Ditto.
>         (ix86_hard_regno_call_part_clobbered): Adjust for vzeroupper
>         ABI.
>         (TARGET_INSN_CALLEE_ABI): Define as ix86_insn_callee_abi.
>         (ix86_emit_mode_set): Call ix86_expand_avx_vzeroupper
>         directly.
>         * config/i386/i386.h (struct GTY(()) machine_function): Delete
>         has_explicit_vzeroupper.
>         * config/i386/i386.md (enum unspec): New member
>         UNSPEC_CALLEE_ABI.
>         (I386_DEFAULT, I386_VZEROUPPER, I386_UNKNOWN): New
>         define_constants for insn callee abi index.
>         * config/i386/predicates.md (vzeroupper_pattern): Adjust.
>         * config/i386/sse.md (UNSPECV_VZEROUPPER): Delete.
>         (avx_vzeroupper): Call ix86_expand_avx_vzeroupper.
>         (*avx_vzeroupper): Rename to ..
>         (avx_vzeroupper_callee_abi): .. this, and adjust the pattern
>         to a call_insn with a special vzeroupper ABI.
>         (*avx_vzeroupper_1): Delete.
>
> gcc/testsuite/ChangeLog:
>
>         PR target/82735
>         * gcc.target/i386/pr82735-1.c: New test.
>         * gcc.target/i386/pr82735-2.c: New test.
>         * gcc.target/i386/pr82735-3.c: New test.
>         * gcc.target/i386/pr82735-4.c: New test.
>         * gcc.target/i386/pr82735-5.c: New test.
> ---
>  gcc/config/i386/i386-expand.c             |  4 -
>  gcc/config/i386/i386-features.c           | 99 +++--------------------
>  gcc/config/i386/i386-protos.h             |  1 +
>  gcc/config/i386/i386.c                    | 55 ++++++++++++-
>  gcc/config/i386/i386.h                    |  4 -
>  gcc/config/i386/i386.md                   | 10 +++
>  gcc/config/i386/predicates.md             |  5 +-
>  gcc/config/i386/sse.md                    | 59 ++++----------
>  gcc/testsuite/gcc.target/i386/pr82735-1.c | 29 +++++++
>  gcc/testsuite/gcc.target/i386/pr82735-2.c | 22 +++++
>  gcc/testsuite/gcc.target/i386/pr82735-3.c |  5 ++
>  gcc/testsuite/gcc.target/i386/pr82735-4.c | 48 +++++++++++
>  gcc/testsuite/gcc.target/i386/pr82735-5.c | 54 +++++++++++++
>  13 files changed, 252 insertions(+), 143 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr82735-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr82735-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr82735-3.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr82735-4.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr82735-5.c
>
> diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
> index 9f3d41955a2..d25d59aa4e7 100644
> --- a/gcc/config/i386/i386-expand.c
> +++ b/gcc/config/i386/i386-expand.c
> @@ -13282,10 +13282,6 @@ rdseed_step:
>
>        return 0;
>
> -    case IX86_BUILTIN_VZEROUPPER:
> -      cfun->machine->has_explicit_vzeroupper = true;
> -      break;
> -
>      default:
>        break;
>      }
> diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c
> index 77783a154b6..a25769ae478 100644
> --- a/gcc/config/i386/i386-features.c
> +++ b/gcc/config/i386/i386-features.c
> @@ -1768,92 +1768,22 @@ convert_scalars_to_vector (bool timode_p)
>    return 0;
>  }
>
> -/* Modify the vzeroupper pattern in INSN so that it describes the effect
> -   that the instruction has on the SSE registers.  LIVE_REGS are the set
> -   of registers that are live across the instruction.
> -
> -   For a live register R we use:
> -
> -     (set (reg:V2DF R) (reg:V2DF R))
> -
> -   which preserves the low 128 bits but clobbers the upper bits.  */
> -
> -static void
> -ix86_add_reg_usage_to_vzeroupper (rtx_insn *insn, bitmap live_regs)
> -{
> -  rtx pattern = PATTERN (insn);
> -  unsigned int nregs = TARGET_64BIT ? 16 : 8;
> -  unsigned int npats = nregs;
> -  for (unsigned int i = 0; i < nregs; ++i)
> -    {
> -      unsigned int regno = GET_SSE_REGNO (i);
> -      if (!bitmap_bit_p (live_regs, regno))
> -        npats--;
> -    }
> -  if (npats == 0)
> -    return;
> -  rtvec vec = rtvec_alloc (npats + 1);
> -  RTVEC_ELT (vec, 0) = XVECEXP (pattern, 0, 0);
> -  for (unsigned int i = 0, j = 0; i < nregs; ++i)
> -    {
> -      unsigned int regno = GET_SSE_REGNO (i);
> -      if (!bitmap_bit_p (live_regs, regno))
> -        continue;
> -      rtx reg = gen_rtx_REG (V2DImode, regno);
> -      ++j;
> -      RTVEC_ELT (vec, j) = gen_rtx_SET (reg, reg);
> -    }
> -  XVEC (pattern, 0) = vec;
> -  INSN_CODE (insn) = -1;
> -  df_insn_rescan (insn);
> -}
> -
> -/* Walk the vzeroupper instructions in the function and annotate them
> -   with the effect that they have on the SSE registers.  */
> -
> -static void
> -ix86_add_reg_usage_to_vzerouppers (void)
> -{
> -  basic_block bb;
> -  rtx_insn *insn;
> -  auto_bitmap live_regs;
> -
> -  df_analyze ();
> -  FOR_EACH_BB_FN (bb, cfun)
> -    {
> -      bitmap_copy (live_regs, df_get_live_out (bb));
> -      df_simulate_initialize_backwards (bb, live_regs);
> -      FOR_BB_INSNS_REVERSE (bb, insn)
> -        {
> -          if (!NONDEBUG_INSN_P (insn))
> -            continue;
> -          if (vzeroupper_pattern (PATTERN (insn), VOIDmode))
> -            ix86_add_reg_usage_to_vzeroupper (insn, live_regs);
> -          df_simulate_one_insn_backwards (bb, insn, live_regs);
> -        }
> -    }
> -}
> -
>  static unsigned int
>  rest_of_handle_insert_vzeroupper (void)
>  {
> -  if (TARGET_VZEROUPPER
> -      && flag_expensive_optimizations
> -      && !optimize_size)
> -    {
> -      /* vzeroupper instructions are inserted immediately after reload to
> -         account for possible spills from 256bit or 512bit registers.  The pass
> -         reuses mode switching infrastructure by re-running mode insertion
> -         pass, so disable entities that have already been processed.  */
> -      for (int i = 0; i < MAX_386_ENTITIES; i++)
> -        ix86_optimize_mode_switching[i] = 0;
> +  /* vzeroupper instructions are inserted immediately after reload to
> +     account for possible spills from 256bit or 512bit registers.  The pass
> +     reuses mode switching infrastructure by re-running mode insertion
> +     pass, so disable entities that have already been processed.  */
> +  for (int i = 0; i < MAX_386_ENTITIES; i++)
> +    ix86_optimize_mode_switching[i] = 0;
>
> -      ix86_optimize_mode_switching[AVX_U128] = 1;
> +  ix86_optimize_mode_switching[AVX_U128] = 1;
>
> -      /* Call optimize_mode_switching.  */
> -      g->get_passes ()->execute_pass_mode_switching ();
> -    }
> -  ix86_add_reg_usage_to_vzerouppers ();
> +  /* Call optimize_mode_switching.  */
> +  g->get_passes ()->execute_pass_mode_switching ();
> +
> +  df_analyze ();
>    return 0;
>  }
>
> @@ -1882,11 +1812,8 @@ public:
>    /* opt_pass methods: */
>    virtual bool gate (function *)
>      {
> -      return TARGET_AVX
> -             && ((TARGET_VZEROUPPER
> -                  && flag_expensive_optimizations
> -                  && !optimize_size)
> -                 || cfun->machine->has_explicit_vzeroupper);
> +      return TARGET_AVX && TARGET_VZEROUPPER
> +             && flag_expensive_optimizations && !optimize_size;
>      }
>
>    virtual unsigned int execute (function *)
> diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
> index 7782cf1163f..e6ac9390777 100644
> --- a/gcc/config/i386/i386-protos.h
> +++ b/gcc/config/i386/i386-protos.h
> @@ -216,6 +216,7 @@ extern rtx ix86_split_stack_guard (void);
>  extern void ix86_move_vector_high_sse_to_mmx (rtx);
>  extern void ix86_split_mmx_pack (rtx[], enum rtx_code);
>  extern void ix86_split_mmx_punpck (rtx[], bool);
> +extern void ix86_expand_avx_vzeroupper (void);
>
>  #ifdef TREE_CODE
>  extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int);
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index 743d8a25fe3..f0b66dd0d56 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -14426,7 +14426,7 @@ ix86_emit_mode_set (int entity, int mode, int prev_mode ATTRIBUTE_UNUSED,
>        break;
>      case AVX_U128:
>        if (mode == AVX_U128_CLEAN)
> -        emit_insn (gen_avx_vzeroupper ());
> +        ix86_expand_avx_vzeroupper ();
>        break;
>      case I387_ROUNDEVEN:
>      case I387_TRUNC:
> @@ -19494,15 +19494,63 @@ ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
>    return false;
>  }
>
> +/* Implement TARGET_INSN_CALLEE_ABI.  */
> +
> +const predefined_function_abi &
> +ix86_insn_callee_abi (const rtx_insn *insn)
> +{
> +  unsigned int abi_id = 0;
> +  rtx pat = PATTERN (insn);
> +  if (vzeroupper_pattern (pat, VOIDmode))
> +    abi_id = I386_VZEROUPPER;
> +
> +  return function_abis[abi_id];
> +}
> +
> +/* Initialize function_abis with corresponding abi_id,
> +   currently only handle vzeroupper.  */
> +void
> +ix86_initialize_callee_abi (unsigned int abi_id)
> +{
> +  gcc_assert (abi_id == I386_VZEROUPPER);
> +  predefined_function_abi &vzeroupper_abi = function_abis[abi_id];
> +  if (!vzeroupper_abi.initialized_p ())
> +    {
> +      HARD_REG_SET full_reg_clobbers;
> +      CLEAR_HARD_REG_SET (full_reg_clobbers);
> +      vzeroupper_abi.initialize (I386_VZEROUPPER, full_reg_clobbers);
> +    }
> +}
> +
> +void
> +ix86_expand_avx_vzeroupper (void)
> +{
> +  /* Initialize vzeroupper_abi here.  */
> +  ix86_initialize_callee_abi (I386_VZEROUPPER);
> +  rtx_insn *insn = emit_call_insn (gen_avx_vzeroupper_callee_abi ());
> +  /* Return false for non-local goto in can_nonlocal_goto.  */
> +  make_reg_eh_region_note (insn, 0, INT_MIN);
> +  /* Flag used for call_insn indicates it's a fake call.  */
> +  RTX_FLAG (insn, used) = 1;
> +}
> +
> +
>  /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  The only ABI that
>     saves SSE registers across calls is Win64 (thus no need to check the
>     current ABI here), and with AVX enabled Win64 only guarantees that
>     the low 16 bytes are saved.  */
>
>  static bool
> -ix86_hard_regno_call_part_clobbered (unsigned int, unsigned int regno,
> +ix86_hard_regno_call_part_clobbered (unsigned int abi_id, unsigned int regno,
>                                       machine_mode mode)
>  {
> +  /* Special ABI for vzeroupper which only clobber higher part of sse regs.  */
> +  if (abi_id == I386_VZEROUPPER)
> +    return (GET_MODE_SIZE (mode) > 16
> +            && ((TARGET_64BIT
> +                 && (IN_RANGE (regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG)))
> +                || (IN_RANGE (regno, FIRST_SSE_REG, LAST_SSE_REG))));
> +
>    return SSE_REGNO_P (regno) && GET_MODE_SIZE (mode) > 16;
>  }
>
> @@ -23916,6 +23964,9 @@ ix86_run_selftests (void)
>  #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
>    ix86_hard_regno_call_part_clobbered
>
> +#undef TARGET_INSN_CALLEE_ABI
> +#define TARGET_INSN_CALLEE_ABI ix86_insn_callee_abi
> +
>  #undef TARGET_CAN_CHANGE_MODE_CLASS
>  #define TARGET_CAN_CHANGE_MODE_CLASS ix86_can_change_mode_class
>
> diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> index 53d503fc6e0..919d0b2418a 100644
> --- a/gcc/config/i386/i386.h
> +++ b/gcc/config/i386/i386.h
> @@ -2659,10 +2659,6 @@ struct GTY(()) machine_function {
>    /* True if the function needs a stack frame.  */
>    BOOL_BITFIELD stack_frame_required : 1;
>
> -  /* True if __builtin_ia32_vzeroupper () has been expanded in current
> -     function.  */
> -  BOOL_BITFIELD has_explicit_vzeroupper : 1;
> -
>    /* True if we should act silently, rather than raise an error for
>       invalid calls.  */
>    BOOL_BITFIELD silent_p : 1;
> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> index 2fc8fae30f3..5d9f5aa39ac 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -191,6 +191,10 @@ (define_c_enum "unspec" [
>    ;; For MOVDIRI and MOVDIR64B support
>    UNSPEC_MOVDIRI
>    UNSPEC_MOVDIR64B
> +
> +  ;; For insn_callee_abi:
> +  UNSPEC_CALLEE_ABI
> +
>  ])
>
>  (define_c_enum "unspecv" [
> @@ -447,6 +451,12 @@ (define_constants
>    (FIRST_PSEUDO_REG            76)
>   ])
>
> +;; Insn callee abi index.
> +(define_constants
> +  [(I386_DEFAULT 0)
> +   (I386_VZEROUPPER 1)
> +   (I386_UNKNOWN 2)])
> +
>  ;; Insns whose names begin with "x86_" are emitted by gen_FOO calls
>  ;; from i386.c.
>
> diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
> index abd307ebdb8..8b787553f32 100644
> --- a/gcc/config/i386/predicates.md
> +++ b/gcc/config/i386/predicates.md
> @@ -1599,8 +1599,9 @@ (define_predicate "vzeroall_pattern"
>  ;; return true if OP is a vzeroupper pattern.
>  (define_predicate "vzeroupper_pattern"
>    (and (match_code "parallel")
> -       (match_code "unspec_volatile" "a")
> -       (match_test "XINT (XVECEXP (op, 0, 0), 1) == UNSPECV_VZEROUPPER")))
> +       (match_code "unspec" "b")
> +       (match_test "XINT (XVECEXP (op, 0, 1), 1) == UNSPEC_CALLEE_ABI")
> +       (match_test "INTVAL (XVECEXP (XVECEXP (op, 0, 1), 0, 0)) == I386_VZEROUPPER")))
>
>  ;; Return true if OP is an addsub vec_merge operation
>  (define_predicate "addsub_vm_operator"
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index a4503ddcb73..949347a3247 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -205,7 +205,6 @@ (define_c_enum "unspecv" [
>    UNSPECV_MONITOR
>    UNSPECV_MWAIT
>    UNSPECV_VZEROALL
> -  UNSPECV_VZEROUPPER
>
>    ;; For KEYLOCKER
>    UNSPECV_LOADIWKEY
> @@ -20857,14 +20856,22 @@ (define_insn "*avx_vzeroall"
>  ;; if the upper 128bits are unused.  Initially we expand the instructions
>  ;; as though they had no effect on the SSE registers, but later add SETs and
>  ;; CLOBBERs to the PARALLEL to model the real effect.
> +
>  (define_expand "avx_vzeroupper"
> -  [(parallel [(unspec_volatile [(const_int 0)] UNSPECV_VZEROUPPER)])]
> -  "TARGET_AVX")
> +  [(parallel [(call (mem:QI (const_int 0))
> +                    (const_int 0))
> +              (unspec [(const_int I386_VZEROUPPER)] UNSPEC_CALLEE_ABI)])]
> +  "TARGET_AVX"
> +{
> +  ix86_expand_avx_vzeroupper ();
> +  DONE;
> +})
>
> -(define_insn "*avx_vzeroupper"
> -  [(match_parallel 0 "vzeroupper_pattern"
> -     [(unspec_volatile [(const_int 0)] UNSPECV_VZEROUPPER)])]
> -  "TARGET_AVX && XVECLEN (operands[0], 0) == (TARGET_64BIT ? 16 : 8) + 1"
> +(define_insn "avx_vzeroupper_callee_abi"
> +  [(call (mem:QI (const_int 0))
> +         (const_int 0))
> +   (unspec [(const_int I386_VZEROUPPER)] UNSPEC_CALLEE_ABI)]
> +  "TARGET_AVX"
>    "vzeroupper"
>    [(set_attr "type" "sse")
>     (set_attr "modrm" "0")
> @@ -20873,44 +20880,6 @@ (define_insn "*avx_vzeroupper"
>     (set_attr "btver2_decode" "vector")
>     (set_attr "mode" "OI")])
>
> -(define_insn_and_split "*avx_vzeroupper_1"
> -  [(match_parallel 0 "vzeroupper_pattern"
> -     [(unspec_volatile [(const_int 0)] UNSPECV_VZEROUPPER)])]
> -  "TARGET_AVX && XVECLEN (operands[0], 0) != (TARGET_64BIT ? 16 : 8) + 1"
> -  "#"
> -  "&& epilogue_completed"
> -  [(match_dup 0)]
> -{
> -  /* For IPA-RA purposes, make it clear the instruction clobbers
> -     even XMM registers not mentioned explicitly in the pattern.  */
> -  unsigned int nregs = TARGET_64BIT ? 16 : 8;
> -  unsigned int npats = XVECLEN (operands[0], 0);
> -  rtvec vec = rtvec_alloc (nregs + 1);
> -  RTVEC_ELT (vec, 0) = XVECEXP (operands[0], 0, 0);
> -  for (unsigned int i = 0, j = 1; i < nregs; ++i)
> -    {
> -      unsigned int regno = GET_SSE_REGNO (i);
> -      if (j < npats
> -          && REGNO (SET_DEST (XVECEXP (operands[0], 0, j))) == regno)
> -        {
> -          RTVEC_ELT (vec, i + 1) = XVECEXP (operands[0], 0, j);
> -          j++;
> -        }
> -      else
> -        {
> -          rtx reg = gen_rtx_REG (V2DImode, regno);
> -          RTVEC_ELT (vec, i + 1) = gen_rtx_CLOBBER (VOIDmode, reg);
> -        }
> -    }
> -  operands[0] = gen_rtx_PARALLEL (VOIDmode, vec);
> -}
> -  [(set_attr "type" "sse")
> -   (set_attr "modrm" "0")
> -   (set_attr "memory" "none")
> -   (set_attr "prefix" "vex")
> -   (set_attr "btver2_decode" "vector")
> -   (set_attr "mode" "OI")])
> -
>  (define_mode_attr pbroadcast_evex_isa
>    [(V64QI "avx512bw") (V32QI "avx512bw") (V16QI "avx512bw")
>     (V32HI "avx512bw") (V16HI "avx512bw") (V8HI "avx512bw")
> diff --git a/gcc/testsuite/gcc.target/i386/pr82735-1.c b/gcc/testsuite/gcc.target/i386/pr82735-1.c
> new file mode 100644
> index 00000000000..1a63b9ae9c9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr82735-1.c
> @@ -0,0 +1,29 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-require-effective-target avx } */
> +
> +#include "avx-check.h"
> +
> +void
> +__attribute__ ((noipa))
> +mtest(char *dest)
> +{
> +  __m256i ymm1 = _mm256_set1_epi8((char)0x1);
> +  _mm256_storeu_si256((__m256i *)(dest + 32), ymm1);
> +  _mm256_zeroupper();
> +  __m256i ymm2 = _mm256_set1_epi8((char)0x1);
> +  _mm256_storeu_si256((__m256i *)dest, ymm2);
> +}
> +
> +void
> +avx_test ()
> +{
> +  char buf[64];
> +  for (int i = 0; i != 64; i++)
> +    buf[i] = 2;
> +  mtest (buf);
> +
> +  for (int i = 0; i < 32; ++i)
> +    if (buf[i] != 1)
> +      __builtin_abort ();
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr82735-2.c b/gcc/testsuite/gcc.target/i386/pr82735-2.c
> new file mode 100644
> index 00000000000..ac9d006f794
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr82735-2.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx -O2" } */
> +
> +#include <immintrin.h>
> +
> +void test(char *dest)
> +{
> +  /* xmm1 can be propagated to xmm2 by CSE.  */
> +  __m128i xmm1 = _mm_set_epi8(0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
> +                              0x9, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16);
> +  _mm_storeu_si128((__m128i *)(dest + 32), xmm1);
> +  _mm256_zeroupper();
> +  _mm256_zeroupper();
> +  _mm256_zeroupper();
> +  _mm256_zeroupper();
> +  _mm256_zeroupper();
> +  __m128i xmm2 = xmm1;
> +  _mm_storeu_si128((__m128i *)dest, xmm2);
> +}
> +
> +/* Darwin local constant symbol is "lC0", ELF targets ".LC0" */
> +/* { dg-final { scan-assembler-times {(?n)vmovdqa\t\.?[Ll]C0[^,]*, %xmm[0-9]} 1 } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr82735-3.c b/gcc/testsuite/gcc.target/i386/pr82735-3.c
> new file mode 100644
> index 00000000000..e3f801e6924
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr82735-3.c
> @@ -0,0 +1,5 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx -O2 -mabi=ms" } */
> +/* { dg-final { scan-assembler-not {(?n)xmm([6-9]|1[0-5])} } } */
> +
> +#include "pr82735-2.c"
> diff --git a/gcc/testsuite/gcc.target/i386/pr82735-4.c b/gcc/testsuite/gcc.target/i386/pr82735-4.c
> new file mode 100644
> index 00000000000..78c0a6cb2c8
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr82735-4.c
> @@ -0,0 +1,48 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-mavx -O2 -mabi=ms -mno-avx512f -masm=att" } */
> +/* { dg-final { scan-assembler-times {(?n)(?:vmovdqa[1-9]*|vmovap[sd])[\t ]*%xmm[0-9]+, [0-9]*\(%rsp\)} 10 } } */
> +/* { dg-final { scan-assembler-times {(?n)(?:vmovdqa[1-9]*|vmovap[sd])[\t ]*[0-9]*\(%rsp\), %xmm[0-9]+} 10 } } */
> +
> +#include <immintrin.h>
> +
> +void test(char *dest)
> +{
> +  __m256i ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15;
> +  asm volatile ("vmovdqa\t%%ymm0, %0\n\t"
> +                "vmovdqa\t%%ymm0, %1\n\t"
> +                "vmovdqa\t%%ymm0, %2\n\t"
> +                "vmovdqa\t%%ymm0, %3\n\t"
> +                "vmovdqa\t%%ymm0, %4\n\t"
> +                "vmovdqa\t%%ymm0, %5\n\t"
> +                "vmovdqa\t%%ymm0, %6\n\t"
> +                "vmovdqa\t%%ymm0, %7\n\t"
> +                "vmovdqa\t%%ymm0, %8\n\t"
> +                "vmovdqa\t%%ymm0, %9\n\t"
> +                "vmovdqa\t%%ymm0, %10\n\t"
> +                "vmovdqa\t%%ymm0, %11\n\t"
> +                "vmovdqa\t%%ymm0, %12\n\t"
> +                "vmovdqa\t%%ymm0, %13\n\t"
> +                "vmovdqa\t%%ymm0, %14\n\t"
> +                "vmovdqa\t%%ymm0, %15\n\t"
> +                : "=v" (ymm1), "=v" (ymm2), "=v"(ymm3), "=v" (ymm4), "=v" (ymm5),
> +                  "=v" (ymm6), "=v" (ymm7), "=v"(ymm8), "=v" (ymm9), "=v" (ymm10),
> +                  "=v" (ymm11), "=v" (ymm12), "=v"(ymm13), "=v" (ymm14), "=v" (ymm15),
> +                  "=v"(ymm0)
> +                ::);
> +  _mm256_zeroupper();
> +  _mm256_storeu_si256((__m256i *)dest, ymm1);
> +  _mm256_storeu_si256((__m256i *)(dest + 32), ymm2);
> +  _mm256_storeu_si256((__m256i *)(dest + 32 * 2), ymm3);
> +  _mm256_storeu_si256((__m256i *)(dest + 32 * 3), ymm4);
> +  _mm256_storeu_si256((__m256i *)(dest + 32 * 4), ymm5);
> +  _mm256_storeu_si256((__m256i *)(dest + 32 * 5), ymm6);
> +  _mm256_storeu_si256((__m256i *)(dest + 32 * 6), ymm7);
> +  _mm256_storeu_si256((__m256i *)(dest + 32 * 7), ymm8);
> +  _mm256_storeu_si256((__m256i *)(dest + 32 * 8), ymm9);
> +  _mm256_storeu_si256((__m256i *)(dest + 32 * 9), ymm10);
> +  _mm256_storeu_si256((__m256i *)(dest + 32 * 10), ymm11);
> +  _mm256_storeu_si256((__m256i *)(dest + 32 * 11), ymm12);
> +  _mm256_storeu_si256((__m256i *)(dest + 32 * 12), ymm13);
> +  _mm256_storeu_si256((__m256i *)(dest + 32 * 13), ymm14);
> +  _mm256_storeu_si256((__m256i *)(dest + 32 * 14), ymm15);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr82735-5.c b/gcc/testsuite/gcc.target/i386/pr82735-5.c
> new file mode 100644
> index 00000000000..2a58cbe52d0
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr82735-5.c
> @@ -0,0 +1,54 @@
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-mavx -O2 -mabi=ms -mno-avx512f -masm=att" } */
> +/* { dg-final { scan-assembler-times {(?n)(?:vmovdqa[1-9]*|vmovap[sd])[\t ]*%xmm[0-9]+, [0-9]*\(%rsp\)} 10 } } */
> +/* { dg-final { scan-assembler-times {(?n)(?:vmovdqa[1-9]*|vmovap[sd])[\t ]*[0-9]*\(%rsp\), %xmm[0-9]+} 10 } } */
> +
> +#include <immintrin.h>
> +
> +void test(char *dest)
> +{
> +  __m256i ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15;
> +  asm volatile ("vmovdqa\t%%ymm0, %0\n\t"
> +                "vmovdqa\t%%ymm0, %1\n\t"
> +                "vmovdqa\t%%ymm0, %2\n\t"
> +                "vmovdqa\t%%ymm0, %3\n\t"
> +                "vmovdqa\t%%ymm0, %4\n\t"
> +                "vmovdqa\t%%ymm0, %5\n\t"
> +                "vmovdqa\t%%ymm0, %6\n\t"
> +                "vmovdqa\t%%ymm0, %7\n\t"
> +                "vmovdqa\t%%ymm0, %8\n\t"
> +                "vmovdqa\t%%ymm0, %9\n\t"
> +                "vmovdqa\t%%ymm0, %10\n\t"
> +                "vmovdqa\t%%ymm0, %11\n\t"
> +                "vmovdqa\t%%ymm0, %12\n\t"
> +                "vmovdqa\t%%ymm0, %13\n\t"
> +                "vmovdqa\t%%ymm0, %14\n\t"
> +                "vmovdqa\t%%ymm0, %15\n\t"
> +                : "=v" (ymm1), "=v" (ymm2), "=v"(ymm3), "=v" (ymm4), "=v" (ymm5),
> +                  "=v" (ymm6), "=v" (ymm7), "=v"(ymm8), "=v" (ymm9), "=v" (ymm10),
> +                  "=v" (ymm11), "=v" (ymm12), "=v"(ymm13), "=v" (ymm14), "=v" (ymm15),
> +                  "=v"(ymm0)
> +                ::);
> +  _mm256_zeroupper();
> +  _mm256_zeroupper();
> +  _mm256_zeroupper();
> +  _mm256_zeroupper();
> +  _mm256_zeroupper();
> +  _mm256_zeroupper();
> +  _mm256_zeroupper();
> +  _mm256_storeu_si256((__m256i *)dest, ymm1);
> +  _mm256_storeu_si256((__m256i *)(dest + 32), ymm2);
> +  _mm256_storeu_si256((__m256i *)(dest + 32 * 2), ymm3);
> +  _mm256_storeu_si256((__m256i *)(dest + 32 * 3), ymm4);
> +  _mm256_storeu_si256((__m256i *)(dest + 32 * 4), ymm5);
> +  _mm256_storeu_si256((__m256i *)(dest + 32 * 5), ymm6);
> +  _mm256_storeu_si256((__m256i *)(dest + 32 * 6), ymm7);
> +  _mm256_storeu_si256((__m256i *)(dest + 32 * 7), ymm8);
> +  _mm256_storeu_si256((__m256i *)(dest + 32 * 8), ymm9);
> +  _mm256_storeu_si256((__m256i *)(dest + 32 * 9), ymm10);
> +  _mm256_storeu_si256((__m256i *)(dest + 32 * 10), ymm11);
> +  _mm256_storeu_si256((__m256i *)(dest + 32 * 11), ymm12);
> +  _mm256_storeu_si256((__m256i *)(dest + 32 * 12), ymm13);
> +  _mm256_storeu_si256((__m256i *)(dest + 32 * 13), ymm14);
> +  _mm256_storeu_si256((__m256i *)(dest + 32 * 14), ymm15);
> +}
> --
> 2.18.1
>
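For readers skimming the thread, the wrong code fixed here is easiest to
see in a small example.  The following is a minimal sketch in the spirit
of the pr82735-1.c test above, assuming an AVX target and -O2; it is an
illustration, not part of the patch:

    #include <immintrin.h>

    void __attribute__ ((noipa))
    mtest (char *dest)
    {
      __m256i ymm1 = _mm256_set1_epi8 (1);
      _mm256_storeu_si256 ((__m256i *) (dest + 32), ymm1);
      /* vzeroupper zeroes bits 128 and up of every ymm register.  When
         the pattern carried no SETs or CLOBBERs before LRA, CSE could
         wrongly reuse ymm1 for the second store, writing zeros to
         dest[16..31] instead of all-ones.  */
      _mm256_zeroupper ();
      __m256i ymm2 = _mm256_set1_epi8 (1);
      _mm256_storeu_si256 ((__m256i *) dest, ymm2);
    }

With the patch, the vzeroupper is emitted as a call_insn carrying the
special ABI, so the upper-half clobber is visible to the RTL passes from
expansion onwards and the stale value can no longer be reused.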
On Thu, Jun 3, 2021 at 8:54 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> When __builtin_ia32_vzeroupper is called explicitly, the corresponding
> vzeroupper pattern does not carry any CLOBBERs or SETs before LRA,
> which leads to incorrect optimization in pass_reload.  To solve this
> problem, this patch remodels the instruction as a call_insn whose call
> has a special vzeroupper ABI.
>
> [snip]

LGTM, with a small nit below.

Thanks,
Uros.
> [snip]
>
> +;; Insn callee abi index.
> +(define_constants
> +  [(I386_DEFAULT 0)
> +   (I386_VZEROUPPER 1)
> +   (I386_UNKNOWN 2)])

Please name these ABI_DEFAULT, ABI_VZEROUPPER and ABI_UNKNOWN.

> [snip]
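A note on the semantics the special ABI encodes: vzeroupper preserves
the low 128 bits of every vector register and zeroes the rest, which is
why ix86_hard_regno_call_part_clobbered reports a clobber for SSE
registers only when GET_MODE_SIZE (mode) > 16.  A small illustration,
again assuming an AVX target and not part of the patch:

    #include <immintrin.h>

    __m128i
    low_bits_survive (__m128i x)
    {
      /* Only bits 128 and up are zeroed, so a 16-byte value needs no
         save/restore around the fake call.  */
      _mm256_zeroupper ();
      return x;
    }

    __m256i
    upper_bits_lost (__m256i y)
    {
      /* A 256-bit value is partially clobbered; the compiler must now
         assume its upper half does not survive.  */
      _mm256_zeroupper ();
      return y;
    }

Modeling this as a predefined_function_abi lets the generic IPA-RA and
register-allocation machinery reason about the partial clobber, instead
of patching SETs and CLOBBERs onto the pattern after reload as the
deleted i386-features.c code did.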
On Fri, Jun 4, 2021 at 2:27 PM Uros Bizjak via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> On Thu, Jun 3, 2021 at 8:54 AM liuhongt <hongtao.liu@intel.com> wrote:
> >
> > When __builtin_ia32_vzeroupper is called explicitly, the corresponding
> > vzeroupper pattern does not carry any CLOBBERs or SETs before LRA,
> > which leads to incorrect optimization in pass_reload.  To solve this
> > problem, this patch remodels the instruction as a call_insn whose
> > call has a special vzeroupper ABI.
> >
> > [snip]
>
> LGTM, with a small nit below.
>
> Thanks,
> Uros.
> > > --- > > gcc/config/i386/i386-expand.c | 4 - > > gcc/config/i386/i386-features.c | 99 +++-------------------- > > gcc/config/i386/i386-protos.h | 1 + > > gcc/config/i386/i386.c | 55 ++++++++++++- > > gcc/config/i386/i386.h | 4 - > > gcc/config/i386/i386.md | 10 +++ > > gcc/config/i386/predicates.md | 5 +- > > gcc/config/i386/sse.md | 59 ++++---------- > > gcc/testsuite/gcc.target/i386/pr82735-1.c | 29 +++++++ > > gcc/testsuite/gcc.target/i386/pr82735-2.c | 22 +++++ > > gcc/testsuite/gcc.target/i386/pr82735-3.c | 5 ++ > > gcc/testsuite/gcc.target/i386/pr82735-4.c | 48 +++++++++++ > > gcc/testsuite/gcc.target/i386/pr82735-5.c | 54 +++++++++++++ > > 13 files changed, 252 insertions(+), 143 deletions(-) > > create mode 100644 gcc/testsuite/gcc.target/i386/pr82735-1.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr82735-2.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr82735-3.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr82735-4.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr82735-5.c > > > > diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c > > index 9f3d41955a2..d25d59aa4e7 100644 > > --- a/gcc/config/i386/i386-expand.c > > +++ b/gcc/config/i386/i386-expand.c > > @@ -13282,10 +13282,6 @@ rdseed_step: > > > > return 0; > > > > - case IX86_BUILTIN_VZEROUPPER: > > - cfun->machine->has_explicit_vzeroupper = true; > > - break; > > - > > default: > > break; > > } > > diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c > > index 77783a154b6..a25769ae478 100644 > > --- a/gcc/config/i386/i386-features.c > > +++ b/gcc/config/i386/i386-features.c > > @@ -1768,92 +1768,22 @@ convert_scalars_to_vector (bool timode_p) > > return 0; > > } > > > > -/* Modify the vzeroupper pattern in INSN so that it describes the effect > > - that the instruction has on the SSE registers. LIVE_REGS are the set > > - of registers that are live across the instruction. > > - > > - For a live register R we use: > > - > > - (set (reg:V2DF R) (reg:V2DF R)) > > - > > - which preserves the low 128 bits but clobbers the upper bits. */ > > - > > -static void > > -ix86_add_reg_usage_to_vzeroupper (rtx_insn *insn, bitmap live_regs) > > -{ > > - rtx pattern = PATTERN (insn); > > - unsigned int nregs = TARGET_64BIT ? 16 : 8; > > - unsigned int npats = nregs; > > - for (unsigned int i = 0; i < nregs; ++i) > > - { > > - unsigned int regno = GET_SSE_REGNO (i); > > - if (!bitmap_bit_p (live_regs, regno)) > > - npats--; > > - } > > - if (npats == 0) > > - return; > > - rtvec vec = rtvec_alloc (npats + 1); > > - RTVEC_ELT (vec, 0) = XVECEXP (pattern, 0, 0); > > - for (unsigned int i = 0, j = 0; i < nregs; ++i) > > - { > > - unsigned int regno = GET_SSE_REGNO (i); > > - if (!bitmap_bit_p (live_regs, regno)) > > - continue; > > - rtx reg = gen_rtx_REG (V2DImode, regno); > > - ++j; > > - RTVEC_ELT (vec, j) = gen_rtx_SET (reg, reg); > > - } > > - XVEC (pattern, 0) = vec; > > - INSN_CODE (insn) = -1; > > - df_insn_rescan (insn); > > -} > > - > > -/* Walk the vzeroupper instructions in the function and annotate them > > - with the effect that they have on the SSE registers. 
*/
> > -
> > -static void
> > -ix86_add_reg_usage_to_vzerouppers (void)
> > -{
> > -  basic_block bb;
> > -  rtx_insn *insn;
> > -  auto_bitmap live_regs;
> > -
> > -  df_analyze ();
> > -  FOR_EACH_BB_FN (bb, cfun)
> > -    {
> > -      bitmap_copy (live_regs, df_get_live_out (bb));
> > -      df_simulate_initialize_backwards (bb, live_regs);
> > -      FOR_BB_INSNS_REVERSE (bb, insn)
> > -        {
> > -          if (!NONDEBUG_INSN_P (insn))
> > -            continue;
> > -          if (vzeroupper_pattern (PATTERN (insn), VOIDmode))
> > -            ix86_add_reg_usage_to_vzeroupper (insn, live_regs);
> > -          df_simulate_one_insn_backwards (bb, insn, live_regs);
> > -        }
> > -    }
> > -}
> > -
> >  static unsigned int
> >  rest_of_handle_insert_vzeroupper (void)
> >  {
> > -  if (TARGET_VZEROUPPER
> > -      && flag_expensive_optimizations
> > -      && !optimize_size)
> > -    {
> > -      /* vzeroupper instructions are inserted immediately after reload to
> > -         account for possible spills from 256bit or 512bit registers.  The pass
> > -         reuses mode switching infrastructure by re-running mode insertion
> > -         pass, so disable entities that have already been processed.  */
> > -      for (int i = 0; i < MAX_386_ENTITIES; i++)
> > -        ix86_optimize_mode_switching[i] = 0;
> > +  /* vzeroupper instructions are inserted immediately after reload to
> > +     account for possible spills from 256bit or 512bit registers.  The pass
> > +     reuses mode switching infrastructure by re-running mode insertion
> > +     pass, so disable entities that have already been processed.  */
> > +  for (int i = 0; i < MAX_386_ENTITIES; i++)
> > +    ix86_optimize_mode_switching[i] = 0;
> >
> > -      ix86_optimize_mode_switching[AVX_U128] = 1;
> > +  ix86_optimize_mode_switching[AVX_U128] = 1;
> >
> > -      /* Call optimize_mode_switching.  */
> > -      g->get_passes ()->execute_pass_mode_switching ();
> > -    }
> > -  ix86_add_reg_usage_to_vzerouppers ();
> > +  /* Call optimize_mode_switching.  */
> > +  g->get_passes ()->execute_pass_mode_switching ();
> > +
> > +  df_analyze ();
> >    return 0;
> >  }
> >
> > @@ -1882,11 +1812,8 @@ public:
> >    /* opt_pass methods: */
> >    virtual bool gate (function *)
> >      {
> > -      return TARGET_AVX
> > -             && ((TARGET_VZEROUPPER
> > -                  && flag_expensive_optimizations
> > -                  && !optimize_size)
> > -                 || cfun->machine->has_explicit_vzeroupper);
> > +      return TARGET_AVX && TARGET_VZEROUPPER
> > +             && flag_expensive_optimizations && !optimize_size;
> >      }
> >
> >    virtual unsigned int execute (function *)
> > diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
> > index 7782cf1163f..e6ac9390777 100644
> > --- a/gcc/config/i386/i386-protos.h
> > +++ b/gcc/config/i386/i386-protos.h
> > @@ -216,6 +216,7 @@ extern rtx ix86_split_stack_guard (void);
> >  extern void ix86_move_vector_high_sse_to_mmx (rtx);
> >  extern void ix86_split_mmx_pack (rtx[], enum rtx_code);
> >  extern void ix86_split_mmx_punpck (rtx[], bool);
> > +extern void ix86_expand_avx_vzeroupper (void);
> >
> >  #ifdef TREE_CODE
> >  extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int);
> > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> > index 743d8a25fe3..f0b66dd0d56 100644
> > --- a/gcc/config/i386/i386.c
> > +++ b/gcc/config/i386/i386.c
> > @@ -14426,7 +14426,7 @@ ix86_emit_mode_set (int entity, int mode, int prev_mode ATTRIBUTE_UNUSED,
> >        break;
> >      case AVX_U128:
> >        if (mode == AVX_U128_CLEAN)
> > -        emit_insn (gen_avx_vzeroupper ());
> > +        ix86_expand_avx_vzeroupper ();
> >        break;
> >      case I387_ROUNDEVEN:
> >      case I387_TRUNC:
> > @@ -19494,15 +19494,63 @@ ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
> >    return false;
> >  }
> >
> > +/* Implement TARGET_INSN_CALLEE_ABI.  */
> > +
> > +const predefined_function_abi &
> > +ix86_insn_callee_abi (const rtx_insn *insn)
> > +{
> > +  unsigned int abi_id = 0;
> > +  rtx pat = PATTERN (insn);
> > +  if (vzeroupper_pattern (pat, VOIDmode))
> > +    abi_id = I386_VZEROUPPER;
> > +
> > +  return function_abis[abi_id];
> > +}
> > +
> > +/* Initialize function_abis with corresponding abi_id,
> > +   currently only handles vzeroupper.  */
> > +void
> > +ix86_initialize_callee_abi (unsigned int abi_id)
> > +{
> > +  gcc_assert (abi_id == I386_VZEROUPPER);
> > +  predefined_function_abi &vzeroupper_abi = function_abis[abi_id];
> > +  if (!vzeroupper_abi.initialized_p ())
> > +    {
> > +      HARD_REG_SET full_reg_clobbers;
> > +      CLEAR_HARD_REG_SET (full_reg_clobbers);
> > +      vzeroupper_abi.initialize (I386_VZEROUPPER, full_reg_clobbers);
> > +    }
> > +}
> > +
> > +void
> > +ix86_expand_avx_vzeroupper (void)
> > +{
> > +  /* Initialize vzeroupper_abi here.  */
> > +  ix86_initialize_callee_abi (I386_VZEROUPPER);
> > +  rtx_insn *insn = emit_call_insn (gen_avx_vzeroupper_callee_abi ());
> > +  /* Return false for non-local goto in can_nonlocal_goto.  */
> > +  make_reg_eh_region_note (insn, 0, INT_MIN);
> > +  /* Flag used for call_insn indicates it's a fake call.  */
> > +  RTX_FLAG (insn, used) = 1;
> > +}
> > +
> > +
> >  /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  The only ABI that
> >     saves SSE registers across calls is Win64 (thus no need to check the
> >     current ABI here), and with AVX enabled Win64 only guarantees that
> >     the low 16 bytes are saved.  */
> >
> >  static bool
> > -ix86_hard_regno_call_part_clobbered (unsigned int, unsigned int regno,
> > +ix86_hard_regno_call_part_clobbered (unsigned int abi_id, unsigned int regno,
> >                                       machine_mode mode)
> >  {
> > +  /* Special ABI for vzeroupper which only clobbers the upper part of
> > +     SSE regs.  */
> > +  if (abi_id == I386_VZEROUPPER)
> > +    return (GET_MODE_SIZE (mode) > 16
> > +            && ((TARGET_64BIT
> > +                 && (IN_RANGE (regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG)))
> > +                || (IN_RANGE (regno, FIRST_SSE_REG, LAST_SSE_REG))));
> > +
> >    return SSE_REGNO_P (regno) && GET_MODE_SIZE (mode) > 16;
> >  }
> >
> > @@ -23916,6 +23964,9 @@ ix86_run_selftests (void)
> >  #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
> >    ix86_hard_regno_call_part_clobbered
> >
> > +#undef TARGET_INSN_CALLEE_ABI
> > +#define TARGET_INSN_CALLEE_ABI ix86_insn_callee_abi
> > +
> >  #undef TARGET_CAN_CHANGE_MODE_CLASS
> >  #define TARGET_CAN_CHANGE_MODE_CLASS ix86_can_change_mode_class
> >
> > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> > index 53d503fc6e0..919d0b2418a 100644
> > --- a/gcc/config/i386/i386.h
> > +++ b/gcc/config/i386/i386.h
> > @@ -2659,10 +2659,6 @@ struct GTY(()) machine_function {
> >    /* True if the function needs a stack frame.  */
> >    BOOL_BITFIELD stack_frame_required : 1;
> >
> > -  /* True if __builtin_ia32_vzeroupper () has been expanded in current
> > -     function.  */
> > -  BOOL_BITFIELD has_explicit_vzeroupper : 1;
> > -
> >    /* True if we should act silently, rather than raise an error for
> >       invalid calls.  */
> >    BOOL_BITFIELD silent_p : 1;
> > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> > index 2fc8fae30f3..5d9f5aa39ac 100644
> > --- a/gcc/config/i386/i386.md
> > +++ b/gcc/config/i386/i386.md
> > @@ -191,6 +191,10 @@ (define_c_enum "unspec" [
> >    ;; For MOVDIRI and MOVDIR64B support
> >    UNSPEC_MOVDIRI
> >    UNSPEC_MOVDIR64B
> > +
> > +  ;; For insn_callee_abi:
> > +  UNSPEC_CALLEE_ABI
> > +
> >  ])
> >
> >  (define_c_enum "unspecv" [
> > @@ -447,6 +451,12 @@ (define_constants
> >    (FIRST_PSEUDO_REG 76)
> >  ])
> >
> > +;; Insn callee abi index.
> > +(define_constants
> > +  [(I386_DEFAULT 0)
> > +   (I386_VZEROUPPER 1)
> > +   (I386_UNKNOWN 2)])
>
> Please name these ABI_DEFAULT, ABI_VZEROUPPER and ABI_UNKNOWN.

Yes, thanks for the review.  I'll hold this patch until the middle-end part is approved.

>
> > +
> >  ;; Insns whose names begin with "x86_" are emitted by gen_FOO calls
> >  ;; from i386.c.
> >
> > diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
> > index abd307ebdb8..8b787553f32 100644
> > --- a/gcc/config/i386/predicates.md
> > +++ b/gcc/config/i386/predicates.md
> > @@ -1599,8 +1599,9 @@ (define_predicate "vzeroall_pattern"
> >  ;; return true if OP is a vzeroupper pattern.
> >  (define_predicate "vzeroupper_pattern"
> >    (and (match_code "parallel")
> > -       (match_code "unspec_volatile" "a")
> > -       (match_test "XINT (XVECEXP (op, 0, 0), 1) == UNSPECV_VZEROUPPER")))
> > +       (match_code "unspec" "b")
> > +       (match_test "XINT (XVECEXP (op, 0, 1), 1) == UNSPEC_CALLEE_ABI")
> > +       (match_test "INTVAL (XVECEXP (XVECEXP (op, 0, 1), 0, 0)) == I386_VZEROUPPER")))
> >
> >  ;; Return true if OP is an addsub vec_merge operation
> >  (define_predicate "addsub_vm_operator"
> > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > index a4503ddcb73..949347a3247 100644
> > --- a/gcc/config/i386/sse.md
> > +++ b/gcc/config/i386/sse.md
> > @@ -205,7 +205,6 @@ (define_c_enum "unspecv" [
> >    UNSPECV_MONITOR
> >    UNSPECV_MWAIT
> >    UNSPECV_VZEROALL
> > -  UNSPECV_VZEROUPPER
> >
> >    ;; For KEYLOCKER
> >    UNSPECV_LOADIWKEY
> > @@ -20857,14 +20856,22 @@ (define_insn "*avx_vzeroall"
> >  ;; if the upper 128bits are unused.  Initially we expand the instructions
> >  ;; as though they had no effect on the SSE registers, but later add SETs and
> >  ;; CLOBBERs to the PARALLEL to model the real effect.
> > +
> >  (define_expand "avx_vzeroupper"
> > -  [(parallel [(unspec_volatile [(const_int 0)] UNSPECV_VZEROUPPER)])]
> > -  "TARGET_AVX")
> > +  [(parallel [(call (mem:QI (const_int 0))
> > +                    (const_int 0))
> > +              (unspec [(const_int I386_VZEROUPPER)] UNSPEC_CALLEE_ABI)])]
> > +  "TARGET_AVX"
> > +{
> > +  ix86_expand_avx_vzeroupper ();
> > +  DONE;
> > +})
> >
> > -(define_insn "*avx_vzeroupper"
> > -  [(match_parallel 0 "vzeroupper_pattern"
> > -     [(unspec_volatile [(const_int 0)] UNSPECV_VZEROUPPER)])]
> > -  "TARGET_AVX && XVECLEN (operands[0], 0) == (TARGET_64BIT ? 16 : 8) + 1"
> > +(define_insn "avx_vzeroupper_callee_abi"
> > +  [(call (mem:QI (const_int 0))
> > +         (const_int 0))
> > +   (unspec [(const_int I386_VZEROUPPER)] UNSPEC_CALLEE_ABI)]
> > +  "TARGET_AVX"
> >    "vzeroupper"
> >    [(set_attr "type" "sse")
> >     (set_attr "modrm" "0")
> > @@ -20873,44 +20880,6 @@ (define_insn "*avx_vzeroupper"
> >     (set_attr "btver2_decode" "vector")
> >     (set_attr "mode" "OI")])
> >
> > -(define_insn_and_split "*avx_vzeroupper_1"
> > -  [(match_parallel 0 "vzeroupper_pattern"
> > -     [(unspec_volatile [(const_int 0)] UNSPECV_VZEROUPPER)])]
> > -  "TARGET_AVX && XVECLEN (operands[0], 0) != (TARGET_64BIT ? 16 : 8) + 1"
> > -  "#"
> > -  "&& epilogue_completed"
> > -  [(match_dup 0)]
> > -{
> > -  /* For IPA-RA purposes, make it clear the instruction clobbers
> > -     even XMM registers not mentioned explicitly in the pattern.  */
> > -  unsigned int nregs = TARGET_64BIT ? 16 : 8;
> > -  unsigned int npats = XVECLEN (operands[0], 0);
> > -  rtvec vec = rtvec_alloc (nregs + 1);
> > -  RTVEC_ELT (vec, 0) = XVECEXP (operands[0], 0, 0);
> > -  for (unsigned int i = 0, j = 1; i < nregs; ++i)
> > -    {
> > -      unsigned int regno = GET_SSE_REGNO (i);
> > -      if (j < npats
> > -          && REGNO (SET_DEST (XVECEXP (operands[0], 0, j))) == regno)
> > -        {
> > -          RTVEC_ELT (vec, i + 1) = XVECEXP (operands[0], 0, j);
> > -          j++;
> > -        }
> > -      else
> > -        {
> > -          rtx reg = gen_rtx_REG (V2DImode, regno);
> > -          RTVEC_ELT (vec, i + 1) = gen_rtx_CLOBBER (VOIDmode, reg);
> > -        }
> > -    }
> > -  operands[0] = gen_rtx_PARALLEL (VOIDmode, vec);
> > -}
> > -  [(set_attr "type" "sse")
> > -   (set_attr "modrm" "0")
> > -   (set_attr "memory" "none")
> > -   (set_attr "prefix" "vex")
> > -   (set_attr "btver2_decode" "vector")
> > -   (set_attr "mode" "OI")])
> > -
> >  (define_mode_attr pbroadcast_evex_isa
> >    [(V64QI "avx512bw") (V32QI "avx512bw") (V16QI "avx512bw")
> >     (V32HI "avx512bw") (V16HI "avx512bw") (V8HI "avx512bw")
> > diff --git a/gcc/testsuite/gcc.target/i386/pr82735-1.c b/gcc/testsuite/gcc.target/i386/pr82735-1.c
> > new file mode 100644
> > index 00000000000..1a63b9ae9c9
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr82735-1.c
> > @@ -0,0 +1,29 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -mavx" } */
> > +/* { dg-require-effective-target avx } */
> > +
> > +#include "avx-check.h"
> > +
> > +void
> > +__attribute__ ((noipa))
> > +mtest(char *dest)
> > +{
> > +  __m256i ymm1 = _mm256_set1_epi8((char)0x1);
> > +  _mm256_storeu_si256((__m256i *)(dest + 32), ymm1);
> > +  _mm256_zeroupper();
> > +  __m256i ymm2 = _mm256_set1_epi8((char)0x1);
> > +  _mm256_storeu_si256((__m256i *)dest, ymm2);
> > +}
> > +
> > +void
> > +avx_test ()
> > +{
> > +  char buf[64];
> > +  for (int i = 0; i != 64; i++)
> > +    buf[i] = 2;
> > +  mtest (buf);
> > +
> > +  for (int i = 0; i < 32; ++i)
> > +    if (buf[i] != 1)
> > +      __builtin_abort ();
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr82735-2.c b/gcc/testsuite/gcc.target/i386/pr82735-2.c
> > new file mode 100644
> > index 00000000000..ac9d006f794
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr82735-2.c
> > @@ -0,0 +1,22 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-mavx -O2" } */
> > +
> > +#include <immintrin.h>
> > +
> > +void test(char *dest)
> > +{
> > +  /* xmm1 can be propagated to xmm2 by CSE.  */
> > +  __m128i xmm1 = _mm_set_epi8(0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
> > +                              0x9, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16);
> > +  _mm_storeu_si128((__m128i *)(dest + 32), xmm1);
> > +  _mm256_zeroupper();
> > +  _mm256_zeroupper();
> > +  _mm256_zeroupper();
> > +  _mm256_zeroupper();
> > +  _mm256_zeroupper();
> > +  __m128i xmm2 = xmm1;
> > +  _mm_storeu_si128((__m128i *)dest, xmm2);
> > +}
> > +
> > +/* Darwin local constant symbol is "lC0", ELF targets ".LC0" */
> > +/* { dg-final { scan-assembler-times {(?n)vmovdqa\t\.?[Ll]C0[^,]*, %xmm[0-9]} 1 } } */
> > diff --git a/gcc/testsuite/gcc.target/i386/pr82735-3.c b/gcc/testsuite/gcc.target/i386/pr82735-3.c
> > new file mode 100644
> > index 00000000000..e3f801e6924
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr82735-3.c
> > @@ -0,0 +1,5 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-mavx -O2 -mabi=ms" } */
> > +/* { dg-final { scan-assembler-not {(?n)xmm([6-9]|1[0-5])} } } */
> > +
> > +#include "pr82735-2.c"
> > diff --git a/gcc/testsuite/gcc.target/i386/pr82735-4.c b/gcc/testsuite/gcc.target/i386/pr82735-4.c
> > new file mode 100644
> > index 00000000000..78c0a6cb2c8
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr82735-4.c
> > @@ -0,0 +1,48 @@
> > +/* { dg-do compile { target { ! ia32 } } } */
> > +/* { dg-options "-mavx -O2 -mabi=ms -mno-avx512f -masm=att" } */
> > +/* { dg-final { scan-assembler-times {(?n)(?:vmovdqa[1-9]*|vmovap[sd])[\t ]*%xmm[0-9]+, [0-9]*\(%rsp\)} 10 } } */
> > +/* { dg-final { scan-assembler-times {(?n)(?:vmovdqa[1-9]*|vmovap[sd])[\t ]*[0-9]*\(%rsp\), %xmm[0-9]+} 10 } } */
> > +
> > +#include <immintrin.h>
> > +
> > +void test(char *dest)
> > +{
> > +  __m256i ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15;
> > +  asm volatile ("vmovdqa\t%%ymm0, %0\n\t"
> > +                "vmovdqa\t%%ymm0, %1\n\t"
> > +                "vmovdqa\t%%ymm0, %2\n\t"
> > +                "vmovdqa\t%%ymm0, %3\n\t"
> > +                "vmovdqa\t%%ymm0, %4\n\t"
> > +                "vmovdqa\t%%ymm0, %5\n\t"
> > +                "vmovdqa\t%%ymm0, %6\n\t"
> > +                "vmovdqa\t%%ymm0, %7\n\t"
> > +                "vmovdqa\t%%ymm0, %8\n\t"
> > +                "vmovdqa\t%%ymm0, %9\n\t"
> > +                "vmovdqa\t%%ymm0, %10\n\t"
> > +                "vmovdqa\t%%ymm0, %11\n\t"
> > +                "vmovdqa\t%%ymm0, %12\n\t"
> > +                "vmovdqa\t%%ymm0, %13\n\t"
> > +                "vmovdqa\t%%ymm0, %14\n\t"
> > +                "vmovdqa\t%%ymm0, %15\n\t"
> > +                : "=v" (ymm1), "=v" (ymm2), "=v"(ymm3), "=v" (ymm4), "=v" (ymm5),
> > +                  "=v" (ymm6), "=v" (ymm7), "=v"(ymm8), "=v" (ymm9), "=v" (ymm10),
> > +                  "=v" (ymm11), "=v" (ymm12), "=v"(ymm13), "=v" (ymm14), "=v" (ymm15),
> > +                  "=v"(ymm0)
> > +                ::);
> > +  _mm256_zeroupper();
> > +  _mm256_storeu_si256((__m256i *)dest, ymm1);
> > +  _mm256_storeu_si256((__m256i *)(dest + 32), ymm2);
> > +  _mm256_storeu_si256((__m256i *)(dest + 32 * 2), ymm3);
> > +  _mm256_storeu_si256((__m256i *)(dest + 32 * 3), ymm4);
> > +  _mm256_storeu_si256((__m256i *)(dest + 32 * 4), ymm5);
> > +  _mm256_storeu_si256((__m256i *)(dest + 32 * 5), ymm6);
> > +  _mm256_storeu_si256((__m256i *)(dest + 32 * 6), ymm7);
> > +  _mm256_storeu_si256((__m256i *)(dest + 32 * 7), ymm8);
> > +  _mm256_storeu_si256((__m256i *)(dest + 32 * 8), ymm9);
> > +  _mm256_storeu_si256((__m256i *)(dest + 32 * 9), ymm10);
> > +  _mm256_storeu_si256((__m256i *)(dest + 32 * 10), ymm11);
> > +  _mm256_storeu_si256((__m256i *)(dest + 32 * 11), ymm12);
> > +  _mm256_storeu_si256((__m256i *)(dest + 32 * 12), ymm13);
> > +  _mm256_storeu_si256((__m256i *)(dest + 32 * 13), ymm14);
> > +  _mm256_storeu_si256((__m256i *)(dest + 32 * 14), ymm15);
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr82735-5.c b/gcc/testsuite/gcc.target/i386/pr82735-5.c
> > new file mode 100644
> > index 00000000000..2a58cbe52d0
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr82735-5.c
> > @@ -0,0 +1,54 @@
> > +/* { dg-do compile { target { ! ia32 } } } */
> > +/* { dg-options "-mavx -O2 -mabi=ms -mno-avx512f -masm=att" } */
> > +/* { dg-final { scan-assembler-times {(?n)(?:vmovdqa[1-9]*|vmovap[sd])[\t ]*%xmm[0-9]+, [0-9]*\(%rsp\)} 10 } } */
> > +/* { dg-final { scan-assembler-times {(?n)(?:vmovdqa[1-9]*|vmovap[sd])[\t ]*[0-9]*\(%rsp\), %xmm[0-9]+} 10 } } */
> > +
> > +#include <immintrin.h>
> > +
> > +void test(char *dest)
> > +{
> > +  __m256i ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, ymm11, ymm12, ymm13, ymm14, ymm15;
> > +  asm volatile ("vmovdqa\t%%ymm0, %0\n\t"
> > +                "vmovdqa\t%%ymm0, %1\n\t"
> > +                "vmovdqa\t%%ymm0, %2\n\t"
> > +                "vmovdqa\t%%ymm0, %3\n\t"
> > +                "vmovdqa\t%%ymm0, %4\n\t"
> > +                "vmovdqa\t%%ymm0, %5\n\t"
> > +                "vmovdqa\t%%ymm0, %6\n\t"
> > +                "vmovdqa\t%%ymm0, %7\n\t"
> > +                "vmovdqa\t%%ymm0, %8\n\t"
> > +                "vmovdqa\t%%ymm0, %9\n\t"
> > +                "vmovdqa\t%%ymm0, %10\n\t"
> > +                "vmovdqa\t%%ymm0, %11\n\t"
> > +                "vmovdqa\t%%ymm0, %12\n\t"
> > +                "vmovdqa\t%%ymm0, %13\n\t"
> > +                "vmovdqa\t%%ymm0, %14\n\t"
> > +                "vmovdqa\t%%ymm0, %15\n\t"
> > +                : "=v" (ymm1), "=v" (ymm2), "=v"(ymm3), "=v" (ymm4), "=v" (ymm5),
> > +                  "=v" (ymm6), "=v" (ymm7), "=v"(ymm8), "=v" (ymm9), "=v" (ymm10),
> > +                  "=v" (ymm11), "=v" (ymm12), "=v"(ymm13), "=v" (ymm14), "=v" (ymm15),
> > +                  "=v"(ymm0)
> > +                ::);
> > +  _mm256_zeroupper();
> > +  _mm256_zeroupper();
> > +  _mm256_zeroupper();
> > +  _mm256_zeroupper();
> > +  _mm256_zeroupper();
> > +  _mm256_zeroupper();
> > +  _mm256_zeroupper();
> > +  _mm256_storeu_si256((__m256i *)dest, ymm1);
> > +  _mm256_storeu_si256((__m256i *)(dest + 32), ymm2);
> > +  _mm256_storeu_si256((__m256i *)(dest + 32 * 2), ymm3);
> > +  _mm256_storeu_si256((__m256i *)(dest + 32 * 3), ymm4);
> > +  _mm256_storeu_si256((__m256i *)(dest + 32 * 4), ymm5);
> > +  _mm256_storeu_si256((__m256i *)(dest + 32 * 5), ymm6);
> > +  _mm256_storeu_si256((__m256i *)(dest + 32 * 6), ymm7);
> > +  _mm256_storeu_si256((__m256i *)(dest + 32 * 7), ymm8);
> > +  _mm256_storeu_si256((__m256i *)(dest + 32 * 8), ymm9);
> > +  _mm256_storeu_si256((__m256i *)(dest + 32 * 9), ymm10);
> > +  _mm256_storeu_si256((__m256i *)(dest + 32 * 10), ymm11);
> > +  _mm256_storeu_si256((__m256i *)(dest + 32 * 11), ymm12);
> > +  _mm256_storeu_si256((__m256i *)(dest + 32 * 12), ymm13);
> > +  _mm256_storeu_si256((__m256i *)(dest + 32 * 13), ymm14);
> > +  _mm256_storeu_si256((__m256i *)(dest + 32 * 14), ymm15);
> > +}
> > --
> > 2.18.1
> >
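To see what the I386_VZEROUPPER ABI in ix86_hard_regno_call_part_clobbered expresses, here is a small sketch (not one of the patch's testcases; the function names are made up).  Compiled with -O2 -mavx, the 16-byte value can stay in its register across the fake call, while the 32-byte value only keeps its low 128 bits and so must be saved around it:

#include <immintrin.h>

/* GET_MODE_SIZE (V2DImode) == 16, so the vzeroupper ABI preserves A's
   register and no stack traffic is needed.  */
__m128i keep_xmm (__m128i a)
{
  _mm256_zeroupper ();
  return a;
}

/* GET_MODE_SIZE (V4DImode) == 32 > 16, so A is part-clobbered and has to
   be spilled and reloaded around the vzeroupper call_insn.  */
__m256i keep_ymm (__m256i a)
{
  _mm256_zeroupper ();
  return a;
}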
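The RTX_FLAG (insn, used) marking in ix86_expand_avx_vzeroupper is what keeps the call_insn "fake": it must not behave like a real call for the register allocator.  Another hedged sketch (again not from the patch, the function name is illustrative) of what that buys: argument registers, which any real call would clobber, stay live across the intrinsic, so no save/restore code is needed for X or N:

#include <immintrin.h>

double no_real_call_here (double x, long n)
{
  /* Emitted as a call_insn carrying the I386_VZEROUPPER callee ABI, but
     x (a 16-byte mode in %xmm0) and n (in %rdi) survive it untouched.  */
  _mm256_zeroupper ();
  return x + (double) n;
}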
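For reference, this is roughly how the rest of the compiler consumes the new hook -- a sketch against the function-abi.h interface, not code from this patch and not a standalone program (it assumes a GCC build tree, and the helper name is hypothetical):

/* Usual GCC internal includes (config.h, system.h, coretypes.h, rtl.h)
   elided for brevity.  */
#include "function-abi.h"

static bool
upper_half_dies_p (const rtx_insn *insn, unsigned int regno)
{
  /* insn_callee_abi dispatches through TARGET_INSN_CALLEE_ABI to
     ix86_insn_callee_abi, so for the vzeroupper call_insn this yields
     the I386_VZEROUPPER ABI.  */
  function_abi abi = insn_callee_abi (insn);
  /* True for a 32-byte mode in an SSE register, false for 16-byte
     modes, matching ix86_hard_regno_call_part_clobbered above.  */
  return abi.clobbers_reg_p (V4DImode, regno);
}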