| Message ID | 20160419144843.GA7801@intel.com |
|---|---|
| State | New |
On Tue, Apr 19, 2016 at 4:48 PM, H.J. Lu <hongjiu.lu@intel.com> wrote:
> Since mov<mode>_internal patterns handle both aligned/unaligned load
> and store, we can simplify ix86_avx256_split_vector_move_misalign and
> ix86_expand_vector_move_misalign.
>
> Tested on x86-64. OK for trunk?
>
> H.J.
> ---
>         * config/i386/i386.c (ix86_avx256_split_vector_move_misalign):
>         Short-cut unaligned load and store cases.  Handle all integer
>         vector modes.
>         (ix86_expand_vector_move_misalign): Short-cut unaligned load
>         and store cases.  Call ix86_avx256_split_vector_move_misalign
>         directly without checking mode class.

LGTM, but it is hard to review interwoven code movements and deletions...

Hopefully OK.

Thanks,
Uros.

> ---
>  gcc/config/i386/i386.c | 252 ++++++++++++++++---------------------------------
>  1 file changed, 81 insertions(+), 171 deletions(-)
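As a side note for readers following the thread, here is a minimal standalone example (written for this summary, not taken from the patch or the GCC testsuite) of the kind of source that reaches these expanders: a 256-bit vector access through an under-aligned vector type goes through the movmisalign<mode> optab and therefore through ix86_expand_vector_move_misalign.

```c
/* Illustration only -- not code from the patch or the testsuite.
   Compile with:  gcc -O2 -mavx -S misalign.c
   and again with -mavx256-split-unaligned-load -mavx256-split-unaligned-store
   to compare a single unaligned vmovups (from mov<mode>_internal) with the
   split 128-bit sequence emitted by ix86_avx256_split_vector_move_misalign.  */

/* A 256-bit vector type that is only guaranteed to be 4-byte aligned.  */
typedef float v8sf_u __attribute__ ((vector_size (32), aligned (4)));

void
copy_misaligned (v8sf_u *dst, const v8sf_u *src)
{
  /* Misaligned V8SF load followed by a misaligned V8SF store.  */
  *dst = *src;
}
```

With the split options the load typically becomes a vmovups of the low 16 bytes plus a vinsertf128 for the high half, and the store becomes a vmovups plus a vextractf128; the plain unaligned form is the case the first hunk of the patch short-cuts when splitting is disabled.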
On Wed, Apr 20, 2016 at 1:09 PM, Uros Bizjak <ubizjak@gmail.com> wrote:
> On Tue, Apr 19, 2016 at 4:48 PM, H.J. Lu <hongjiu.lu@intel.com> wrote:
>> Since mov<mode>_internal patterns handle both aligned/unaligned load
>> and store, we can simplify ix86_avx256_split_vector_move_misalign and
>> ix86_expand_vector_move_misalign.
>>
>> Tested on x86-64. OK for trunk?
>>
>> H.J.
>> ---
>>         * config/i386/i386.c (ix86_avx256_split_vector_move_misalign):
>>         Short-cut unaligned load and store cases.  Handle all integer
>>         vector modes.
>>         (ix86_expand_vector_move_misalign): Short-cut unaligned load
>>         and store cases.  Call ix86_avx256_split_vector_move_misalign
>>         directly without checking mode class.
>
> LGTM, but it is hard to review interwoven code movements and deletions...
>
> Hopefully OK.

BTW: There are a couple of regressions in the testsuite [1] when
configured --with-arch=corei7. Can you please look at the testcases,
if scan patterns need to be adjusted?

FAIL: gcc.target/i386/avx256-unaligned-load-1.c scan-assembler-not
(avx_loadups256|vmovups[^\\n\\r]*movv8sf_internal)
FAIL: gcc.target/i386/avx256-unaligned-store-2.c scan-assembler
vmovups.*movv16qi_internal/3

[1] https://gcc.gnu.org/ml/gcc-testresults/2016-04/msg01932.html

Uros.
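For context on what needs adjusting: the failing checks are dg-final assembler scans in those two tests (the patterns match insn pattern names, which appear in the assembly when the tests pass -dp). Reconstructed from the FAIL lines above — the actual test files may spell them slightly differently — they amount to roughly:

```c
/* avx256-unaligned-load-1.c (built with -mavx256-split-unaligned-load)
   expects the 256-bit unaligned load to be split, so no full-width
   unaligned 256-bit load insn should appear: */
/* { dg-final { scan-assembler-not "(avx_loadups256|vmovups\[^\n\r\]*movv8sf_internal)" } } */

/* avx256-unaligned-store-2.c expects an unaligned 256-bit store coming
   from the V16QI move pattern: */
/* { dg-final { scan-assembler "vmovups.*movv16qi_internal/3" } } */
```

The question raised here is whether those insn names still match what the simplified expanders emit when GCC is configured --with-arch=corei7.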
On Wed, Apr 20, 2016 at 4:19 AM, Uros Bizjak <ubizjak@gmail.com> wrote:
> On Wed, Apr 20, 2016 at 1:09 PM, Uros Bizjak <ubizjak@gmail.com> wrote:
>> On Tue, Apr 19, 2016 at 4:48 PM, H.J. Lu <hongjiu.lu@intel.com> wrote:
>>> Since mov<mode>_internal patterns handle both aligned/unaligned load
>>> and store, we can simplify ix86_avx256_split_vector_move_misalign and
>>> ix86_expand_vector_move_misalign.
>>>
>>> Tested on x86-64. OK for trunk?
>>>
>>> H.J.
>>> ---
>>>         * config/i386/i386.c (ix86_avx256_split_vector_move_misalign):
>>>         Short-cut unaligned load and store cases.  Handle all integer
>>>         vector modes.
>>>         (ix86_expand_vector_move_misalign): Short-cut unaligned load
>>>         and store cases.  Call ix86_avx256_split_vector_move_misalign
>>>         directly without checking mode class.
>>
>> LGTM, but it is hard to review interwoven code movements and deletions...
>>
>> Hopefully OK.
>
> BTW: There are a couple of regressions in the testsuite [1] when
> configured --with-arch=corei7. Can you please look at the testcases,
> if scan patterns need to be adjusted?
>
> FAIL: gcc.target/i386/avx256-unaligned-load-1.c scan-assembler-not
> (avx_loadups256|vmovups[^\\n\\r]*movv8sf_internal)
> FAIL: gcc.target/i386/avx256-unaligned-store-2.c scan-assembler
> vmovups.*movv16qi_internal/3
>
> [1] https://gcc.gnu.org/ml/gcc-testresults/2016-04/msg01932.html

I will submit a patch.
```diff
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 4e48572..e056f68 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -18820,7 +18820,39 @@ ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
   rtx (*extract) (rtx, rtx, rtx);
   machine_mode mode;

-  switch (GET_MODE (op0))
+  if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
+      || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
+    {
+      emit_insn (gen_rtx_SET (op0, op1));
+      return;
+    }
+
+  rtx orig_op0 = NULL_RTX;
+  mode = GET_MODE (op0);
+  switch (GET_MODE_CLASS (mode))
+    {
+    case MODE_VECTOR_INT:
+    case MODE_INT:
+      if (mode != V32QImode)
+        {
+          if (!MEM_P (op0))
+            {
+              orig_op0 = op0;
+              op0 = gen_reg_rtx (V32QImode);
+            }
+          else
+            op0 = gen_lowpart (V32QImode, op0);
+          op1 = gen_lowpart (V32QImode, op1);
+          mode = V32QImode;
+        }
+      break;
+    case MODE_VECTOR_FLOAT:
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  switch (mode)
     {
     default:
       gcc_unreachable ();
@@ -18840,34 +18872,25 @@ ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)

   if (MEM_P (op1))
     {
-      if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD
-          && optimize_insn_for_speed_p ())
-        {
-          rtx r = gen_reg_rtx (mode);
-          m = adjust_address (op1, mode, 0);
-          emit_move_insn (r, m);
-          m = adjust_address (op1, mode, 16);
-          r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
-          emit_move_insn (op0, r);
-        }
-      else
-        emit_insn (gen_rtx_SET (op0, op1));
+      rtx r = gen_reg_rtx (mode);
+      m = adjust_address (op1, mode, 0);
+      emit_move_insn (r, m);
+      m = adjust_address (op1, mode, 16);
+      r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
+      emit_move_insn (op0, r);
     }
   else if (MEM_P (op0))
     {
-      if (TARGET_AVX256_SPLIT_UNALIGNED_STORE
-          && optimize_insn_for_speed_p ())
-        {
-          m = adjust_address (op0, mode, 0);
-          emit_insn (extract (m, op1, const0_rtx));
-          m = adjust_address (op0, mode, 16);
-          emit_insn (extract (m, op1, const1_rtx));
-        }
-      else
-        emit_insn (gen_rtx_SET (op0, op1));
+      m = adjust_address (op0, mode, 0);
+      emit_insn (extract (m, op1, const0_rtx));
+      m = adjust_address (op0, mode, 16);
+      emit_insn (extract (m, op1, const1_rtx));
     }
   else
     gcc_unreachable ();
+
+  if (orig_op0)
+    emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
 }

 /* Implement the movmisalign patterns for SSE.  Non-SSE modes go
@@ -18925,118 +18948,50 @@ ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
 void
 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
 {
-  rtx op0, op1, orig_op0 = NULL_RTX, m;
+  rtx op0, op1, m;

   op0 = operands[0];
   op1 = operands[1];

-  if (GET_MODE_SIZE (mode) == 64)
+  /* Use unaligned load/store for AVX512 or when optimizing for size.  */
+  if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
     {
-      switch (GET_MODE_CLASS (mode))
-        {
-        case MODE_VECTOR_INT:
-        case MODE_INT:
-          if (GET_MODE (op0) != V16SImode)
-            {
-              if (!MEM_P (op0))
-                {
-                  orig_op0 = op0;
-                  op0 = gen_reg_rtx (V16SImode);
-                }
-              else
-                op0 = gen_lowpart (V16SImode, op0);
-            }
-          op1 = gen_lowpart (V16SImode, op1);
-          /* FALLTHRU */
-
-        case MODE_VECTOR_FLOAT:
-
-          emit_insn (gen_rtx_SET (op0, op1));
-          if (orig_op0)
-            emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
-          break;
-
-        default:
-          gcc_unreachable ();
-        }
-
+      emit_insn (gen_rtx_SET (op0, op1));
       return;
     }

-  if (TARGET_AVX
-      && GET_MODE_SIZE (mode) == 32)
+  if (TARGET_AVX)
     {
-      switch (GET_MODE_CLASS (mode))
-        {
-        case MODE_VECTOR_INT:
-        case MODE_INT:
-          if (GET_MODE (op0) != V32QImode)
-            {
-              if (!MEM_P (op0))
-                {
-                  orig_op0 = op0;
-                  op0 = gen_reg_rtx (V32QImode);
-                }
-              else
-                op0 = gen_lowpart (V32QImode, op0);
-            }
-          op1 = gen_lowpart (V32QImode, op1);
-          /* FALLTHRU */
-
-        case MODE_VECTOR_FLOAT:
-          ix86_avx256_split_vector_move_misalign (op0, op1);
-          if (orig_op0)
-            emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
-          break;
+      if (GET_MODE_SIZE (mode) == 32)
+        ix86_avx256_split_vector_move_misalign (op0, op1);
+      else
+        /* Always use 128-bit mov<mode>_internal pattern for AVX.  */
+        emit_insn (gen_rtx_SET (op0, op1));
+      return;
+    }

-        default:
-          gcc_unreachable ();
-        }
+  if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
+      || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
+    {
+      emit_insn (gen_rtx_SET (op0, op1));
+      return;
+    }

+  /* ??? If we have typed data, then it would appear that using
+     movdqu is the only way to get unaligned data loaded with
+     integer type.  */
+  if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+    {
+      emit_insn (gen_rtx_SET (op0, op1));
       return;
     }

   if (MEM_P (op1))
     {
-      /* Normal *mov<mode>_internal pattern will handle
-         unaligned loads just fine if misaligned_operand
-         is true, and without the UNSPEC it can be combined
-         with arithmetic instructions.  */
-      if (TARGET_AVX
-          && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
-              || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
-          && misaligned_operand (op1, GET_MODE (op1)))
-        emit_insn (gen_rtx_SET (op0, op1));
-      /* ??? If we have typed data, then it would appear that using
-         movdqu is the only way to get unaligned data loaded with
-         integer type.  */
-      else if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
-        {
-          if (GET_MODE (op0) != V16QImode)
-            {
-              orig_op0 = op0;
-              op0 = gen_reg_rtx (V16QImode);
-            }
-          op1 = gen_lowpart (V16QImode, op1);
-          /* We will eventually emit movups based on insn attributes.  */
-          emit_insn (gen_rtx_SET (op0, op1));
-          if (orig_op0)
-            emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
-        }
-      else if (TARGET_SSE2 && mode == V2DFmode)
+      if (TARGET_SSE2 && mode == V2DFmode)
        {
          rtx zero;

-          if (TARGET_AVX
-              || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
-              || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
-              || optimize_insn_for_size_p ())
-            {
-              /* We will eventually emit movups based on insn attributes.  */
-              emit_insn (gen_rtx_SET (op0, op1));
-              return;
-            }
-
          /* When SSE registers are split into halves, we can avoid
             writing to the top half twice.  */
          if (TARGET_SSE_SPLIT_REGS)
@@ -19066,24 +19021,6 @@ ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
     {
       rtx t;

-      if (TARGET_AVX
-          || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
-          || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
-          || optimize_insn_for_size_p ())
-        {
-          if (GET_MODE (op0) != V4SFmode)
-            {
-              orig_op0 = op0;
-              op0 = gen_reg_rtx (V4SFmode);
-            }
-          op1 = gen_lowpart (V4SFmode, op1);
-          emit_insn (gen_rtx_SET (op0, op1));
-          if (orig_op0)
-            emit_move_insn (orig_op0,
-                            gen_lowpart (GET_MODE (orig_op0), op0));
-          return;
-        }
-
       if (mode != V4SFmode)
         t = gen_reg_rtx (V4SFmode);
       else
@@ -19104,49 +19041,22 @@ ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
     }
   else if (MEM_P (op0))
     {
-      if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
-        {
-          op0 = gen_lowpart (V16QImode, op0);
-          op1 = gen_lowpart (V16QImode, op1);
-          /* We will eventually emit movups based on insn attributes.  */
-          emit_insn (gen_rtx_SET (op0, op1));
-        }
-      else if (TARGET_SSE2 && mode == V2DFmode)
-        {
-          if (TARGET_AVX
-              || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
-              || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
-              || optimize_insn_for_size_p ())
-            /* We will eventually emit movups based on insn attributes.  */
-            emit_insn (gen_rtx_SET (op0, op1));
-          else
-            {
-              m = adjust_address (op0, DFmode, 0);
-              emit_insn (gen_sse2_storelpd (m, op1));
-              m = adjust_address (op0, DFmode, 8);
-              emit_insn (gen_sse2_storehpd (m, op1));
-            }
+      if (TARGET_SSE2 && mode == V2DFmode)
+        {
+          m = adjust_address (op0, DFmode, 0);
+          emit_insn (gen_sse2_storelpd (m, op1));
+          m = adjust_address (op0, DFmode, 8);
+          emit_insn (gen_sse2_storehpd (m, op1));
        }
      else
        {
          if (mode != V4SFmode)
            op1 = gen_lowpart (V4SFmode, op1);

-          if (TARGET_AVX
-              || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
-              || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
-              || optimize_insn_for_size_p ())
-            {
-              op0 = gen_lowpart (V4SFmode, op0);
-              emit_insn (gen_rtx_SET (op0, op1));
-            }
-          else
-            {
-              m = adjust_address (op0, V2SFmode, 0);
-              emit_insn (gen_sse_storelps (m, op1));
-              m = adjust_address (op0, V2SFmode, 8);
-              emit_insn (gen_sse_storehps (m, op1));
-            }
+          m = adjust_address (op0, V2SFmode, 0);
+          emit_insn (gen_sse_storelps (m, op1));
+          m = adjust_address (op0, V2SFmode, 8);
+          emit_insn (gen_sse_storehps (m, op1));
        }
     }
   else
```
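To make the split paths above easier to picture, here is a rough intrinsics-level equivalent of what ix86_avx256_split_vector_move_misalign generates for a misaligned V8SF move (illustration written for this summary, not code from GCC): two unaligned 128-bit halves glued together with vinsertf128 on the load side and taken apart with vextractf128 on the store side, instead of one 256-bit vmovups.

```c
/* Illustration only: roughly what the split load/store paths expand to.
   Build with -mavx.  */
#include <immintrin.h>

__m256
split_load (const float *p)           /* p may be unaligned */
{
  /* Load the low 16 bytes, then insert the high 16 bytes.  */
  __m256 r = _mm256_castps128_ps256 (_mm_loadu_ps (p));
  return _mm256_insertf128_ps (r, _mm_loadu_ps (p + 4), 1);
}

void
split_store (float *p, __m256 v)      /* p may be unaligned */
{
  /* Store the low 16 bytes, then extract and store the high 16 bytes.  */
  _mm_storeu_ps (p, _mm256_castps256_ps128 (v));
  _mm_storeu_ps (p + 4, _mm256_extractf128_ps (v, 1));
}
```

Whether the split or the plain unaligned form is used is controlled by the -mavx256-split-unaligned-load/-store tuning flags, which the new short-cut at the top of the function checks.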