Simplify ix86_expand_vector_move_misalign

Message ID 20160419144843.GA7801@intel.com
State New

Commit Message

H.J. Lu April 19, 2016, 2:48 p.m. UTC
Since mov<mode>_internal patterns handle both aligned/unaligned load
and store, we can simplify ix86_avx256_split_vector_move_misalign and
ix86_expand_vector_move_misalign.

Tested on x86-64.  OK for trunk?

H.J.
---
	* config/i386/i386.c (ix86_avx256_split_vector_move_misalign):
	Short-cut unaligned load and store cases.  Handle all integer
	vector modes.
	(ix86_expand_vector_move_misalign): Short-cut unaligned load
	and store cases.  Call ix86_avx256_split_vector_move_misalign
	directly without checking mode class.
---
 gcc/config/i386/i386.c | 252 ++++++++++++++++---------------------------------
 1 file changed, 81 insertions(+), 171 deletions(-)
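
For context, a minimal, hypothetical example (not taken from the patch or the
testsuite) of code that reaches these expanders: a load or store of a vector
type whose alignment has been reduced below its natural alignment is expanded
through the movmisalign<mode> pattern, and hence through
ix86_expand_vector_move_misalign, when compiled with e.g. -O2 -mavx.

typedef float v8sf __attribute__ ((vector_size (32)));
/* A typedef may reduce alignment, so accesses through this type are
   expanded as misaligned 256-bit vector moves.  */
typedef v8sf v8sf_u __attribute__ ((aligned (4)));

void
copy256 (v8sf_u *dst, const v8sf_u *src)
{
  /* With -mavx this is an unaligned 256-bit load and store; with the
     -mavx256-split-unaligned-load/-store tunings it is split into
     128-bit halves by ix86_avx256_split_vector_move_misalign.  */
  *dst = *src;
}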

Comments

Uros Bizjak April 20, 2016, 11:09 a.m. UTC | #1
On Tue, Apr 19, 2016 at 4:48 PM, H.J. Lu <hongjiu.lu@intel.com> wrote:
> Since mov<mode>_internal patterns handle both aligned/unaligned load
> and store, we can simplify ix86_avx256_split_vector_move_misalign and
> ix86_expand_vector_move_misalign.
>
> Tested on x86-64.  OK for trunk?
>
> H.J.
> ---
>         * config/i386/i386.c (ix86_avx256_split_vector_move_misalign):
>         Short-cut unaligned load and store cases.  Handle all integer
>         vector modes.
>         (ix86_expand_vector_move_misalign): Short-cut unaligned load
>         and store cases.  Call ix86_avx256_split_vector_move_misalign
>         directly without checking mode class.

LGTM, but it is hard to review interwoven code movements and deletions...

Hopefully OK.

Thanks,
Uros.

Uros Bizjak April 20, 2016, 11:19 a.m. UTC | #2
On Wed, Apr 20, 2016 at 1:09 PM, Uros Bizjak <ubizjak@gmail.com> wrote:
> On Tue, Apr 19, 2016 at 4:48 PM, H.J. Lu <hongjiu.lu@intel.com> wrote:
>> Since mov<mode>_internal patterns handle both aligned/unaligned load
>> and store, we can simplify ix86_avx256_split_vector_move_misalign and
>> ix86_expand_vector_move_misalign.
>>
>> Tested on x86-64.  OK for trunk?
>>
>> H.J.
>> ---
>>         * config/i386/i386.c (ix86_avx256_split_vector_move_misalign):
>>         Short-cut unaligned load and store cases.  Handle all integer
>>         vector modes.
>>         (ix86_expand_vector_move_misalign): Short-cut unaligned load
>>         and store cases.  Call ix86_avx256_split_vector_move_misalign
>>         directly without checking mode class.
>
> LGTM, but it is hard to review interwoven code movements and deletions...
>
> Hopefully OK.

BTW: There are a couple of regressions in the testsuite [1] when GCC is
configured with --with-arch=corei7.  Can you please look at the testcases
and check whether the scan patterns need to be adjusted?

FAIL: gcc.target/i386/avx256-unaligned-load-1.c scan-assembler-not (avx_loadups256|vmovups[^\\n\\r]*movv8sf_internal)
FAIL: gcc.target/i386/avx256-unaligned-store-2.c scan-assembler vmovups.*movv16qi_internal/3

[1] https://gcc.gnu.org/ml/gcc-testresults/2016-04/msg01932.html

Uros.
H.J. Lu April 20, 2016, 1:36 p.m. UTC | #3
On Wed, Apr 20, 2016 at 4:19 AM, Uros Bizjak <ubizjak@gmail.com> wrote:
> On Wed, Apr 20, 2016 at 1:09 PM, Uros Bizjak <ubizjak@gmail.com> wrote:
>> On Tue, Apr 19, 2016 at 4:48 PM, H.J. Lu <hongjiu.lu@intel.com> wrote:
>>> Since mov<mode>_internal patterns handle both aligned/unaligned load
>>> and store, we can simplify ix86_avx256_split_vector_move_misalign and
>>> ix86_expand_vector_move_misalign.
>>>
>>> Tested on x86-64.  OK for trunk?
>>>
>>> H.J.
>>> ---
>>>         * config/i386/i386.c (ix86_avx256_split_vector_move_misalign):
>>>         Short-cut unaligned load and store cases.  Handle all integer
>>>         vector modes.
>>>         (ix86_expand_vector_move_misalign): Short-cut unaligned load
>>>         and store cases.  Call ix86_avx256_split_vector_move_misalign
>>>         directly without checking mode class.
>>
>> LGTM, but it is hard to review interwoven code movements and deletions...
>>
>> Hopefully OK.
>
> BTW: There are a couple of regressions in the testsuite [1] when GCC is
> configured with --with-arch=corei7.  Can you please look at the testcases
> and check whether the scan patterns need to be adjusted?
>
> FAIL: gcc.target/i386/avx256-unaligned-load-1.c scan-assembler-not (avx_loadups256|vmovups[^\\n\\r]*movv8sf_internal)
> FAIL: gcc.target/i386/avx256-unaligned-store-2.c scan-assembler vmovups.*movv16qi_internal/3
>
> [1] https://gcc.gnu.org/ml/gcc-testresults/2016-04/msg01932.html

I will submit a patch.
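
For illustration, the adjustment is to the DejaGnu scan-assembler directives
in those tests: pattern names such as movv8sf_internal appear in the assembler
output because the tests compile with -dp, and the directives now need to
track the insn names emitted by the simplified expanders.  A purely
hypothetical sketch (the actual follow-up patch may well use different
regexps):

/* Sketch only: dg-final directives of this shape appear at the end of
   gcc.target/i386/avx256-unaligned-load-1.c; the regexps here are
   illustrative, not the ones from the real follow-up patch.  */
/* { dg-final { scan-assembler-not "avx_loadups256" } } */
/* { dg-final { scan-assembler "vmovups\[^\n\r\]*%xmm\[0-9\]" } } */
/* { dg-final { scan-assembler "vinsertf128" } } */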

Patch

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 4e48572..e056f68 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -18820,7 +18820,39 @@  ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
   rtx (*extract) (rtx, rtx, rtx);
   machine_mode mode;
 
-  switch (GET_MODE (op0))
+  if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
+      || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
+    {
+      emit_insn (gen_rtx_SET (op0, op1));
+      return;
+    }
+
+  rtx orig_op0 = NULL_RTX;
+  mode = GET_MODE (op0);
+  switch (GET_MODE_CLASS (mode))
+    {
+    case MODE_VECTOR_INT:
+    case MODE_INT:
+      if (mode != V32QImode)
+	{
+	  if (!MEM_P (op0))
+	    {
+	      orig_op0 = op0;
+	      op0 = gen_reg_rtx (V32QImode);
+	    }
+	  else
+	    op0 = gen_lowpart (V32QImode, op0);
+	  op1 = gen_lowpart (V32QImode, op1);
+	  mode = V32QImode;
+	}
+      break;
+    case MODE_VECTOR_FLOAT:
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  switch (mode)
     {
     default:
       gcc_unreachable ();
@@ -18840,34 +18872,25 @@  ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
 
   if (MEM_P (op1))
     {
-      if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD
-	  && optimize_insn_for_speed_p ())
-	{
-	  rtx r = gen_reg_rtx (mode);
-	  m = adjust_address (op1, mode, 0);
-	  emit_move_insn (r, m);
-	  m = adjust_address (op1, mode, 16);
-	  r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
-	  emit_move_insn (op0, r);
-	}
-      else
-	emit_insn (gen_rtx_SET (op0, op1));
+      rtx r = gen_reg_rtx (mode);
+      m = adjust_address (op1, mode, 0);
+      emit_move_insn (r, m);
+      m = adjust_address (op1, mode, 16);
+      r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
+      emit_move_insn (op0, r);
     }
   else if (MEM_P (op0))
     {
-      if (TARGET_AVX256_SPLIT_UNALIGNED_STORE
-	  && optimize_insn_for_speed_p ())
-	{
-	  m = adjust_address (op0, mode, 0);
-	  emit_insn (extract (m, op1, const0_rtx));
-	  m = adjust_address (op0, mode, 16);
-	  emit_insn (extract (m, op1, const1_rtx));
-	}
-      else
-	emit_insn (gen_rtx_SET (op0, op1));
+      m = adjust_address (op0, mode, 0);
+      emit_insn (extract (m, op1, const0_rtx));
+      m = adjust_address (op0, mode, 16);
+      emit_insn (extract (m, op1, const1_rtx));
     }
   else
     gcc_unreachable ();
+
+  if (orig_op0)
+    emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
 }
 
 /* Implement the movmisalign patterns for SSE.  Non-SSE modes go
@@ -18925,118 +18948,50 @@  ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
 void
 ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
 {
-  rtx op0, op1, orig_op0 = NULL_RTX, m;
+  rtx op0, op1, m;
 
   op0 = operands[0];
   op1 = operands[1];
 
-  if (GET_MODE_SIZE (mode) == 64)
+  /* Use unaligned load/store for AVX512 or when optimizing for size.  */
+  if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
     {
-      switch (GET_MODE_CLASS (mode))
-	{
-	case MODE_VECTOR_INT:
-	case MODE_INT:
-	  if (GET_MODE (op0) != V16SImode)
-	    {
-	      if (!MEM_P (op0))
-		{
-		  orig_op0 = op0;
-		  op0 = gen_reg_rtx (V16SImode);
-		}
-	      else
-		op0 = gen_lowpart (V16SImode, op0);
-	    }
-	  op1 = gen_lowpart (V16SImode, op1);
-	  /* FALLTHRU */
-
-	case MODE_VECTOR_FLOAT:
-
-	  emit_insn (gen_rtx_SET (op0, op1));
-	  if (orig_op0)
-	    emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
-	  break;
-
-	default:
-	  gcc_unreachable ();
-	}
-
+      emit_insn (gen_rtx_SET (op0, op1));
       return;
     }
 
-  if (TARGET_AVX
-      && GET_MODE_SIZE (mode) == 32)
+  if (TARGET_AVX)
     {
-      switch (GET_MODE_CLASS (mode))
-	{
-	case MODE_VECTOR_INT:
-	case MODE_INT:
-	  if (GET_MODE (op0) != V32QImode)
-	    {
-	      if (!MEM_P (op0))
-		{
-		  orig_op0 = op0;
-		  op0 = gen_reg_rtx (V32QImode);
-		}
-	      else
-		op0 = gen_lowpart (V32QImode, op0);
-	    }
-	  op1 = gen_lowpart (V32QImode, op1);
-	  /* FALLTHRU */
-
-	case MODE_VECTOR_FLOAT:
-	  ix86_avx256_split_vector_move_misalign (op0, op1);
-	  if (orig_op0)
-	    emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
-	  break;
+      if (GET_MODE_SIZE (mode) == 32)
+	ix86_avx256_split_vector_move_misalign (op0, op1);
+      else
+	/* Always use 128-bit mov<mode>_internal pattern for AVX.  */
+	emit_insn (gen_rtx_SET (op0, op1));
+      return;
+    }
 
-	default:
-	  gcc_unreachable ();
-	}
+  if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
+      || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
+    {
+      emit_insn (gen_rtx_SET (op0, op1));
+      return;
+    }
 
+  /* ??? If we have typed data, then it would appear that using
+     movdqu is the only way to get unaligned data loaded with
+     integer type.  */
+  if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+    {
+      emit_insn (gen_rtx_SET (op0, op1));
       return;
     }
 
   if (MEM_P (op1))
     {
-      /* Normal *mov<mode>_internal pattern will handle
-	 unaligned loads just fine if misaligned_operand
-	 is true, and without the UNSPEC it can be combined
-	 with arithmetic instructions.  */
-      if (TARGET_AVX
-	  && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
-	      || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
-	  && misaligned_operand (op1, GET_MODE (op1)))
-	emit_insn (gen_rtx_SET (op0, op1));
-      /* ??? If we have typed data, then it would appear that using
-	 movdqu is the only way to get unaligned data loaded with
-	 integer type.  */
-      else if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
-	{
-	  if (GET_MODE (op0) != V16QImode)
-	    {
-	      orig_op0 = op0;
-	      op0 = gen_reg_rtx (V16QImode);
-	    }
-	  op1 = gen_lowpart (V16QImode, op1);
-	  /* We will eventually emit movups based on insn attributes.  */
-	  emit_insn (gen_rtx_SET (op0, op1));
-	  if (orig_op0)
-	    emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
-	}
-      else if (TARGET_SSE2 && mode == V2DFmode)
+      if (TARGET_SSE2 && mode == V2DFmode)
         {
           rtx zero;
 
-	  if (TARGET_AVX
-	      || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
-	      || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
-	      || optimize_insn_for_size_p ())
-	    {
-	      /* We will eventually emit movups based on insn attributes.  */
-	      emit_insn (gen_rtx_SET (op0, op1));
-	      return;
-	    }
-
 	  /* When SSE registers are split into halves, we can avoid
 	     writing to the top half twice.  */
 	  if (TARGET_SSE_SPLIT_REGS)
@@ -19066,24 +19021,6 @@  ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
         {
 	  rtx t;
 
-	  if (TARGET_AVX
-	      || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
-	      || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
-	      || optimize_insn_for_size_p ())
-	    {
-	      if (GET_MODE (op0) != V4SFmode)
-		{
-		  orig_op0 = op0;
-		  op0 = gen_reg_rtx (V4SFmode);
-		}
-	      op1 = gen_lowpart (V4SFmode, op1);
-	      emit_insn (gen_rtx_SET (op0, op1));
-	      if (orig_op0)
-		emit_move_insn (orig_op0,
-				gen_lowpart (GET_MODE (orig_op0), op0));
-	      return;
-            }
-
 	  if (mode != V4SFmode)
 	    t = gen_reg_rtx (V4SFmode);
 	  else
@@ -19104,49 +19041,22 @@  ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
     }
   else if (MEM_P (op0))
     {
-      if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
-        {
-	  op0 = gen_lowpart (V16QImode, op0);
-	  op1 = gen_lowpart (V16QImode, op1);
-	  /* We will eventually emit movups based on insn attributes.  */
-	  emit_insn (gen_rtx_SET (op0, op1));
-	}
-      else if (TARGET_SSE2 && mode == V2DFmode)
-	{
-	  if (TARGET_AVX
-	      || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
-	      || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
-	      || optimize_insn_for_size_p ())
-	    /* We will eventually emit movups based on insn attributes.  */
-	    emit_insn (gen_rtx_SET (op0, op1));
-	  else
-	    {
-	      m = adjust_address (op0, DFmode, 0);
-	      emit_insn (gen_sse2_storelpd (m, op1));
-	      m = adjust_address (op0, DFmode, 8);
-	      emit_insn (gen_sse2_storehpd (m, op1));
-	    }
+      if (TARGET_SSE2 && mode == V2DFmode)
+	{
+	  m = adjust_address (op0, DFmode, 0);
+	  emit_insn (gen_sse2_storelpd (m, op1));
+	  m = adjust_address (op0, DFmode, 8);
+	  emit_insn (gen_sse2_storehpd (m, op1));
 	}
       else
 	{
 	  if (mode != V4SFmode)
 	    op1 = gen_lowpart (V4SFmode, op1);
 
-	  if (TARGET_AVX
-	      || TARGET_SSE_UNALIGNED_STORE_OPTIMAL
-	      || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
-	      || optimize_insn_for_size_p ())
-	    {
-	      op0 = gen_lowpart (V4SFmode, op0);
-	      emit_insn (gen_rtx_SET (op0, op1));
-	    }
-	  else
-	    {
-	      m = adjust_address (op0, V2SFmode, 0);
-	      emit_insn (gen_sse_storelps (m, op1));
-	      m = adjust_address (op0, V2SFmode, 8);
-	      emit_insn (gen_sse_storehps (m, op1));
-	    }
+	  m = adjust_address (op0, V2SFmode, 0);
+	  emit_insn (gen_sse_storelps (m, op1));
+	  m = adjust_address (op0, V2SFmode, 8);
+	  emit_insn (gen_sse_storehps (m, op1));
 	}
     }
   else