diff mbox series

Allocate general register(memory/immediate) for 16/32/64-bit vector bit_op patterns.

Message ID 20220711011506.103835-1-hongtao.liu@intel.com
State New
Headers show
Series Allocate general register(memory/immediate) for 16/32/64-bit vector bit_op patterns. | expand

Commit Message

liuhongt July 11, 2022, 1:15 a.m. UTC
And split it to GPR-version instruction after reload.

This will enable the optimization below for 16/32/64-bit vector bit_op patterns:

-       movd    (%rdi), %xmm0
-       movd    (%rsi), %xmm1
-       pand    %xmm1, %xmm0
-       movd    %xmm0, (%rdi)
+       movl    (%rsi), %eax
+       andl    %eax, (%rdi)

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

	PR target/106038
	* config/i386/mmx.md (<code><mode>3): Expand
	with (clobber (reg:CC FLAGS_REG)) under TARGET_64BIT.
	(mmx_<code><mode>3): Ditto.
	(*mmx_<code><mode>3_1): New define_insn, add post_reload
	splitter after it.
	(*<code><mode>3): New define_insn, also add post_reload
	splitter after it.
	(mmxinsnmode): New mode attribute.
	(VI_16_32_64): New mode iterator.
	(*mov<mode>_imm): Refactor with mmxinsnmode.
	* config/i386/predicates.md
	(nonimmediate_or_x86_64_vector_cst): New predicate.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr106038-1.c: New test.
	* gcc.target/i386/pr106038-2.c: New test.
	* gcc.target/i386/pr106038-3.c: New test.
---
 gcc/config/i386/mmx.md                     | 131 +++++++++++++++------
 gcc/config/i386/predicates.md              |   4 +
 gcc/testsuite/gcc.target/i386/pr106038-1.c |  61 ++++++++++
 gcc/testsuite/gcc.target/i386/pr106038-2.c |  35 ++++++
 gcc/testsuite/gcc.target/i386/pr106038-3.c |  17 +++
 5 files changed, 213 insertions(+), 35 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-3.c

Comments

Uros Bizjak July 11, 2022, 8:02 a.m. UTC | #1
On Mon, Jul 11, 2022 at 3:15 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> And split it to GPR-version instruction after reload.
>
> This will enable below optimization for 16/32/64-bit vector bit_op
>
> -       movd    (%rdi), %xmm0
> -       movd    (%rsi), %xmm1
> -       pand    %xmm1, %xmm0
> -       movd    %xmm0, (%rdi)
> +       movl    (%rsi), %eax
> +       andl    %eax, (%rdi)
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?

The patch will create many interunit moves (xmm <-> gpr) for anything
but the most simple logic sequences, because operations with
memory/immediate will be forced into GPR registers, while reg/reg
operations will remain in XMM registers.

I tried to introduce GPR registers to MMX logic insns in the past and
observed the above behavior, but perhaps the RA has evolved in the
meantime to handle different register sets better (especially under
register pressure). However, I would advise being careful with this
functionality.

Perhaps this problem should be attacked in stages. First, please
introduce GPR registers to MMX logic instructions (similar to how
VI_16_32 mode instructions are handled). Only after the RA effects have
been analysed should memory/immediate handling be added. Also,
please don't forget to handle the ANDNOT insn - TARGET_BMI slightly
complicates this part, but this is also solved with VI_16_32 mode
instructions.

Uros.

>
> gcc/ChangeLog:
>
>         PR target/106038
>         * config/i386/mmx.md (<code><mode>3): Expand
>         with (clobber (reg:CC flags_reg)) under TARGET_64BIT
>         (mmx_code><mode>3): Ditto.
>         (*mmx_<code><mode>3_1): New define_insn, add post_reload
>         splitter after it.
>         (*<code><mode>3): New define_insn, also add post_reload
>         splitter after it.
>         (mmxinsnmode): New mode attribute.
>         (VI_16_32_64): New mode iterator.
>         (*mov<mode>_imm): Refactor with mmxinsnmode.
>         * config/i386/predicates.md
>         (nonimmediate_or_x86_64_vector_cst): New predicate.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/pr106038-1.c: New test.
>         * gcc.target/i386/pr106038-2.c: New test.
>         * gcc.target/i386/pr106038-3.c: New test.
> ---
>  gcc/config/i386/mmx.md                     | 131 +++++++++++++++------
>  gcc/config/i386/predicates.md              |   4 +
>  gcc/testsuite/gcc.target/i386/pr106038-1.c |  61 ++++++++++
>  gcc/testsuite/gcc.target/i386/pr106038-2.c |  35 ++++++
>  gcc/testsuite/gcc.target/i386/pr106038-3.c |  17 +++
>  5 files changed, 213 insertions(+), 35 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-3.c
>
> diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
> index 3294c1e6274..85b06abea27 100644
> --- a/gcc/config/i386/mmx.md
> +++ b/gcc/config/i386/mmx.md
> @@ -75,6 +75,11 @@ (define_mode_iterator V_16_32_64
>      (V8QI "TARGET_64BIT") (V4HI "TARGET_64BIT") (V4HF "TARGET_64BIT")
>      (V2SI "TARGET_64BIT") (V2SF "TARGET_64BIT")])
>
> +(define_mode_iterator VI_16_32_64
> +   [V2QI V4QI V2HI
> +    (V8QI "TARGET_64BIT") (V4HI "TARGET_64BIT")
> +    (V2SI "TARGET_64BIT")])
> +
>  ;; V2S* modes
>  (define_mode_iterator V2FI [V2SF V2SI])
>
> @@ -86,6 +91,14 @@ (define_mode_attr mmxvecsize
>    [(V8QI "b") (V4QI "b") (V2QI "b")
>     (V4HI "w") (V2HI "w") (V2SI "d") (V1DI "q")])
>
> +;; Mapping to same size integral mode.
> +(define_mode_attr mmxinsnmode
> +  [(V8QI "DI") (V4QI "SI") (V2QI "HI")
> +   (V4HI "DI") (V2HI "SI")
> +   (V2SI "DI")
> +   (V4HF "DI") (V2HF "SI")
> +   (V2SF "DI")])
> +
>  (define_mode_attr mmxdoublemode
>    [(V8QI "V8HI") (V4HI "V4SI")])
>
> @@ -350,22 +363,7 @@ (define_insn_and_split "*mov<mode>_imm"
>    HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[1],
>                                                             <MODE>mode);
>    operands[1] = GEN_INT (val);
> -  machine_mode mode;
> -  switch (GET_MODE_SIZE (<MODE>mode))
> -    {
> -    case 2:
> -      mode = HImode;
> -      break;
> -    case 4:
> -      mode = SImode;
> -      break;
> -    case 8:
> -      mode = DImode;
> -      break;
> -    default:
> -      gcc_unreachable ();
> -    }
> -  operands[0] = lowpart_subreg (mode, operands[0], <MODE>mode);
> +  operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode);
>  })
>
>  ;; For TARGET_64BIT we always round up to 8 bytes.
> @@ -2948,14 +2946,28 @@ (define_expand "mmx_<code><mode>3"
>           (match_operand:MMXMODEI 1 "register_mmxmem_operand")
>           (match_operand:MMXMODEI 2 "register_mmxmem_operand")))]
>    "TARGET_MMX || TARGET_MMX_WITH_SSE"
> -  "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")
> +{
> +  ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);
> +  if (TARGET_64BIT)
> +  {
> +    ix86_expand_binary_operator (<CODE>, <MODE>mode, operands);
> +    DONE;
> +  }
> +})
>
>  (define_expand "<code><mode>3"
>    [(set (match_operand:MMXMODEI 0 "register_operand")
>         (any_logic:MMXMODEI
>           (match_operand:MMXMODEI 1 "register_operand")
>           (match_operand:MMXMODEI 2 "register_operand")))]
> -  "TARGET_MMX_WITH_SSE")
> +  "TARGET_MMX_WITH_SSE"
> +{
> +  if (TARGET_64BIT)
> +    {
> +      ix86_expand_binary_operator (<CODE>, <MODE>mode, operands);
> +      DONE;
> +    }
> +})
>
>  (define_insn "*mmx_<code><mode>3"
>    [(set (match_operand:MMXMODEI 0 "register_operand" "=y,x,x,v")
> @@ -2974,33 +2986,82 @@ (define_insn "*mmx_<code><mode>3"
>     (set_attr "type" "mmxadd,sselog,sselog,sselog")
>     (set_attr "mode" "DI,TI,TI,TI")])
>
> -(define_insn "<code><mode>3"
> -  [(set (match_operand:VI_16_32 0 "register_operand" "=?r,x,x,v")
> -        (any_logic:VI_16_32
> -         (match_operand:VI_16_32 1 "register_operand" "%0,0,x,v")
> -         (match_operand:VI_16_32 2 "register_operand" "r,x,x,v")))
> +(define_insn "*mmx_<code><mode>3_1"
> +  [(set (match_operand:MMXMODEI 0 "nonimmediate_operand" "=y,x,x,v,rm,r")
> +        (any_logic:MMXMODEI
> +         (match_operand:MMXMODEI 1 "nonimmediate_operand" "%0,0,x,v,0,0")
> +         (match_operand:MMXMODEI 2 "nonimmediate_or_x86_64_vector_cst" "ym,x,x,v,ri,m")))
> +    (clobber (reg:CC FLAGS_REG))]
> +  "TARGET_64BIT
> +   && (TARGET_MMX || TARGET_SSE2)
> +   && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
> +  "#"
> +  [(set_attr "isa" "*,sse2_noavx,avx,avx512vl,x64,x64")
> +   (set_attr "mmx_isa" "native,*,*,*,*,*")
> +   (set_attr "type" "mmxadd,sselog,sselog,sselog,alu,alu")
> +   (set_attr "mode" "DI,TI,TI,TI,DI,DI")])
> +
> +(define_split
> +  [(set (match_operand:MMXMODEI 0 "register_operand")
> +        (any_logic:MMXMODEI
> +         (match_operand:MMXMODEI 1 "register_mmxmem_operand")
> +         (match_operand:MMXMODEI 2 "register_mmxmem_operand")))
>     (clobber (reg:CC FLAGS_REG))]
> +  "TARGET_64BIT
> +   && (TARGET_MMX || TARGET_SSE2)
> +   && reload_completed
> +   && !general_reg_operand (operands[0], <MODE>mode)"
> +  [(set (match_dup 0)
> +       (any_logic:<MODE> (match_dup 1) (match_dup 2)))])
> +
> +(define_expand "<code><mode>3"
> +  [(set (match_operand:VI_16_32 0 "register_operand")
> +        (any_logic:VI_16_32
> +         (match_operand:VI_16_32 1 "register_operand")
> +         (match_operand:VI_16_32 2 "register_operand")))]
>    ""
> +{
> +  ix86_expand_binary_operator (<CODE>, <MODE>mode, operands);
> +  DONE;
> +})
> +
> +(define_insn "*<code><mode>3"
> +  [(set (match_operand:VI_16_32 0 "nonimmediate_operand" "=rm,r,x,x,v")
> +        (any_logic:VI_16_32
> +         (match_operand:VI_16_32 1 "nonimmediate_operand" "%0,0,0,x,v")
> +         (match_operand:VI_16_32 2 "nonimmediate_or_x86_64_vector_cst" "ri,m,x,x,v")))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
>    "#"
> -  [(set_attr "isa" "*,sse2_noavx,avx,avx512vl")
> -   (set_attr "type" "alu,sselog,sselog,sselog")
> -   (set_attr "mode" "SI,TI,TI,TI")])
> +  [(set_attr "isa" "*,*,sse2_noavx,avx,avx512vl")
> +   (set_attr "type" "alu,alu,sselog,sselog,sselog")
> +   (set_attr "mode" "SI,SI,TI,TI,TI")])
>
>  (define_split
> -  [(set (match_operand:VI_16_32 0 "general_reg_operand")
> -        (any_logic:VI_16_32
> -         (match_operand:VI_16_32 1 "general_reg_operand")
> -         (match_operand:VI_16_32 2 "general_reg_operand")))
> +  [(set (match_operand:VI_16_32_64 0 "")
> +        (any_logic:VI_16_32_64
> +         (match_operand:VI_16_32_64 1 "")
> +         (match_operand:VI_16_32_64 2 "")))
>     (clobber (reg:CC FLAGS_REG))]
> -  "reload_completed"
> +  "reload_completed
> +  && !sse_reg_operand (operands[1], <MODE>mode)
> +  && !sse_reg_operand (operands[2], <MODE>mode)
> +  && !sse_reg_operand (operands[0], <MODE>mode)"
>    [(parallel
>       [(set (match_dup 0)
> -          (any_logic:SI (match_dup 1) (match_dup 2)))
> +          (any_logic:<mmxinsnmode> (match_dup 1) (match_dup 2)))
>        (clobber (reg:CC FLAGS_REG))])]
>  {
> -  operands[2] = lowpart_subreg (SImode, operands[2], <MODE>mode);
> -  operands[1] = lowpart_subreg (SImode, operands[1], <MODE>mode);
> -  operands[0] = lowpart_subreg (SImode, operands[0], <MODE>mode);
> +  if (CONSTANT_P (operands[2]))
> +    {
> +      HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[2],
> +                                                               <MODE>mode);
> +      operands[2] = GEN_INT (val);
> +     }
> +   else
> +    operands[2] = lowpart_subreg (<mmxinsnmode>mode, operands[2], <MODE>mode);
> +  operands[1] = lowpart_subreg (<mmxinsnmode>mode, operands[1], <MODE>mode);
> +  operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode);
>  })
>
>  (define_split
> diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
> index c71c453cceb..62280f58478 100644
> --- a/gcc/config/i386/predicates.md
> +++ b/gcc/config/i386/predicates.md
> @@ -1205,6 +1205,10 @@ (define_predicate "x86_64_const_vector_operand"
>    return trunc_int_for_mode (val, SImode) == val;
>  })
>
> +(define_predicate "nonimmediate_or_x86_64_vector_cst"
> +  (ior (match_operand 0 "nonimmediate_operand")
> +       (match_operand 0 "x86_64_const_vector_operand")))
> +
>  ;; Return true when OP is nonimmediate or standard SSE constant.
>  (define_predicate "nonimmediate_or_sse_const_operand"
>    (ior (match_operand 0 "nonimmediate_operand")
> diff --git a/gcc/testsuite/gcc.target/i386/pr106038-1.c b/gcc/testsuite/gcc.target/i386/pr106038-1.c
> new file mode 100644
> index 00000000000..ac5d1990682
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106038-1.c
> @@ -0,0 +1,61 @@
> +/* { dg-do compile } */
> +/* { dg-options "-msse2 -O2" } */
> +/* { dg-final { scan-assembler-not "xmm" } } */
> +
> +void
> +foo (char* a, char* __restrict b)
> +{
> +  a[0] &= b[0];
> +  a[1] &= b[1];
> +  a[2] &= b[2];
> +  a[3] &= b[3];
> +}
> +
> +void
> +foo1 (char* a, char* __restrict b)
> +{
> +  a[0] &= b[0];
> +  a[1] &= b[1];
> +}
> +
> +void
> +foo2 (char* a, char* __restrict b)
> +{
> +  a[0] &= b[0];
> +  a[1] &= b[1];
> +  a[2] &= b[2];
> +  a[3] &= b[3];
> +  a[4] &= b[4];
> +  a[5] &= b[5];
> +  a[6] &= b[6];
> +  a[7] &= b[7];
> +}
> +
> +void
> +foo3 (char* a, char* __restrict b)
> +{
> +  a[0] &= 1;
> +  a[1] &= 2;
> +  a[2] &= 3;
> +  a[3] &= 3;
> +}
> +
> +void
> +foo4 (char* a, char* __restrict b)
> +{
> +  a[0] &= 1;
> +  a[1] &= 2;
> +}
> +
> +void
> +foo5 (char* a, char* __restrict b)
> +{
> +  a[0] &= 1;
> +  a[1] &= 2;
> +  a[2] &= 2;
> +  a[3] &= 3;
> +  a[4] &= 4;
> +  a[5] &= 5;
> +  a[6] &= 6;
> +  a[7] &= 7;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106038-2.c b/gcc/testsuite/gcc.target/i386/pr106038-2.c
> new file mode 100644
> index 00000000000..dce8a536a95
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106038-2.c
> @@ -0,0 +1,35 @@
> +/* { dg-do compile } */
> +/* { dg-options "-msse2 -O2" } */
> +/* { dg-final { scan-assembler-not "xmm" } } */
> +
> +void
> +foo (short* a, short* __restrict b)
> +{
> +  a[0] &= b[0];
> +  a[1] &= b[1];
> +  a[2] &= b[2];
> +  a[3] &= b[3];
> +}
> +
> +void
> +foo1 (short* a, short* __restrict b)
> +{
> +  a[0] &= b[0];
> +  a[1] &= b[1];
> +}
> +
> +void
> +foo3 (short* a, short* __restrict b)
> +{
> +  a[0] &= 1;
> +  a[1] &= 2;
> +  a[2] &= 3;
> +  a[3] &= 3;
> +}
> +
> +void
> +foo4 (short* a, short* __restrict b)
> +{
> +  a[0] &= 1;
> +  a[1] &= 2;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr106038-3.c b/gcc/testsuite/gcc.target/i386/pr106038-3.c
> new file mode 100644
> index 00000000000..3c7bd978f36
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr106038-3.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-msse2 -O2" } */
> +/* { dg-final { scan-assembler-not "xmm" } } */
> +
> +void
> +foo1 (int* a, int* __restrict b)
> +{
> +  a[0] &= b[0];
> +  a[1] &= b[1];
> +}
> +
> +void
> +foo4 (int* a, int* __restrict b)
> +{
> +  a[0] &= 1;
> +  a[1] &= 2;
> +}
> --
> 2.18.1
>
Hongtao Liu July 12, 2022, 6:37 a.m. UTC | #2
On Mon, Jul 11, 2022 at 4:03 PM Uros Bizjak via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> On Mon, Jul 11, 2022 at 3:15 AM liuhongt <hongtao.liu@intel.com> wrote:
> >
> > And split it to GPR-version instruction after reload.
> >
> > This will enable below optimization for 16/32/64-bit vector bit_op
> >
> > -       movd    (%rdi), %xmm0
> > -       movd    (%rsi), %xmm1
> > -       pand    %xmm1, %xmm0
> > -       movd    %xmm0, (%rdi)
> > +       movl    (%rsi), %eax
> > +       andl    %eax, (%rdi)
> >
> > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > Ok for trunk?
>
> The patch will create many interunit moves (xmm <-> gpr) for anything
> but the most simple logic sequences, because operations with
> memory/immediate will be forced into GPR registers, while reg/reg
> operations will remain in XMM registers.
Agree not to deal with mem/immediate at first.
>
> I tried to introduce GPR registers to MMX logic insns in the past and
> observed the above behavior, but perhaps RA evolved in the mean time
> to handle different register sets better (especially under register
> pressure). However, I would advise to be careful with this
> functionality.
>
> Perhaps this problem should be attacked in stages. First, please
> introduce GPR registers to MMX logic instructions (similar to how
> VI_16_32 mode instructions are handled). After RA effects will be
There's "?r" in the VI_16_32 logic instructions, which prevents the RA from
allocating GPRs for the testcases in the patch.
Is it OK to remove the "?" for them (and also add an "r" alternative instead
of "?r" in the MMX logic insns)?
If there are other instructions that prefer "v" to "r", then the RA will
allocate "v", but for logic instructions, "r" and "v" should be
treated equally, just as in the 16/32/64-bit vector
mov<mode>_internal.
> analysed, only then memory/immediate handling should be added. Also,
> please don't forget to handle ANDNOT insn - TARGET_BMI slightly
> complicates this part, but this is also solved with VI_16_32 mode
> instructions.
>
> Uros.
>
> >
> > gcc/ChangeLog:
> >
> >         PR target/106038
> >         * config/i386/mmx.md (<code><mode>3): Expand
> >         with (clobber (reg:CC flags_reg)) under TARGET_64BIT
> >         (mmx_code><mode>3): Ditto.
> >         (*mmx_<code><mode>3_1): New define_insn, add post_reload
> >         splitter after it.
> >         (*<code><mode>3): New define_insn, also add post_reload
> >         splitter after it.
> >         (mmxinsnmode): New mode attribute.
> >         (VI_16_32_64): New mode iterator.
> >         (*mov<mode>_imm): Refactor with mmxinsnmode.
> >         * config/i386/predicates.md
> >         (nonimmediate_or_x86_64_vector_cst): New predicate.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         * gcc.target/i386/pr106038-1.c: New test.
> >         * gcc.target/i386/pr106038-2.c: New test.
> >         * gcc.target/i386/pr106038-3.c: New test.
> > ---
> >  gcc/config/i386/mmx.md                     | 131 +++++++++++++++------
> >  gcc/config/i386/predicates.md              |   4 +
> >  gcc/testsuite/gcc.target/i386/pr106038-1.c |  61 ++++++++++
> >  gcc/testsuite/gcc.target/i386/pr106038-2.c |  35 ++++++
> >  gcc/testsuite/gcc.target/i386/pr106038-3.c |  17 +++
> >  5 files changed, 213 insertions(+), 35 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-1.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-2.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr106038-3.c
> >
> > diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
> > index 3294c1e6274..85b06abea27 100644
> > --- a/gcc/config/i386/mmx.md
> > +++ b/gcc/config/i386/mmx.md
> > @@ -75,6 +75,11 @@ (define_mode_iterator V_16_32_64
> >      (V8QI "TARGET_64BIT") (V4HI "TARGET_64BIT") (V4HF "TARGET_64BIT")
> >      (V2SI "TARGET_64BIT") (V2SF "TARGET_64BIT")])
> >
> > +(define_mode_iterator VI_16_32_64
> > +   [V2QI V4QI V2HI
> > +    (V8QI "TARGET_64BIT") (V4HI "TARGET_64BIT")
> > +    (V2SI "TARGET_64BIT")])
> > +
> >  ;; V2S* modes
> >  (define_mode_iterator V2FI [V2SF V2SI])
> >
> > @@ -86,6 +91,14 @@ (define_mode_attr mmxvecsize
> >    [(V8QI "b") (V4QI "b") (V2QI "b")
> >     (V4HI "w") (V2HI "w") (V2SI "d") (V1DI "q")])
> >
> > +;; Mapping to same size integral mode.
> > +(define_mode_attr mmxinsnmode
> > +  [(V8QI "DI") (V4QI "SI") (V2QI "HI")
> > +   (V4HI "DI") (V2HI "SI")
> > +   (V2SI "DI")
> > +   (V4HF "DI") (V2HF "SI")
> > +   (V2SF "DI")])
> > +
> >  (define_mode_attr mmxdoublemode
> >    [(V8QI "V8HI") (V4HI "V4SI")])
> >
> > @@ -350,22 +363,7 @@ (define_insn_and_split "*mov<mode>_imm"
> >    HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[1],
> >                                                             <MODE>mode);
> >    operands[1] = GEN_INT (val);
> > -  machine_mode mode;
> > -  switch (GET_MODE_SIZE (<MODE>mode))
> > -    {
> > -    case 2:
> > -      mode = HImode;
> > -      break;
> > -    case 4:
> > -      mode = SImode;
> > -      break;
> > -    case 8:
> > -      mode = DImode;
> > -      break;
> > -    default:
> > -      gcc_unreachable ();
> > -    }
> > -  operands[0] = lowpart_subreg (mode, operands[0], <MODE>mode);
> > +  operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode);
> >  })
> >
> >  ;; For TARGET_64BIT we always round up to 8 bytes.
> > @@ -2948,14 +2946,28 @@ (define_expand "mmx_<code><mode>3"
> >           (match_operand:MMXMODEI 1 "register_mmxmem_operand")
> >           (match_operand:MMXMODEI 2 "register_mmxmem_operand")))]
> >    "TARGET_MMX || TARGET_MMX_WITH_SSE"
> > -  "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")
> > +{
> > +  ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);
> > +  if (TARGET_64BIT)
> > +  {
> > +    ix86_expand_binary_operator (<CODE>, <MODE>mode, operands);
> > +    DONE;
> > +  }
> > +})
> >
> >  (define_expand "<code><mode>3"
> >    [(set (match_operand:MMXMODEI 0 "register_operand")
> >         (any_logic:MMXMODEI
> >           (match_operand:MMXMODEI 1 "register_operand")
> >           (match_operand:MMXMODEI 2 "register_operand")))]
> > -  "TARGET_MMX_WITH_SSE")
> > +  "TARGET_MMX_WITH_SSE"
> > +{
> > +  if (TARGET_64BIT)
> > +    {
> > +      ix86_expand_binary_operator (<CODE>, <MODE>mode, operands);
> > +      DONE;
> > +    }
> > +})
> >
> >  (define_insn "*mmx_<code><mode>3"
> >    [(set (match_operand:MMXMODEI 0 "register_operand" "=y,x,x,v")
> > @@ -2974,33 +2986,82 @@ (define_insn "*mmx_<code><mode>3"
> >     (set_attr "type" "mmxadd,sselog,sselog,sselog")
> >     (set_attr "mode" "DI,TI,TI,TI")])
> >
> > -(define_insn "<code><mode>3"
> > -  [(set (match_operand:VI_16_32 0 "register_operand" "=?r,x,x,v")
> > -        (any_logic:VI_16_32
> > -         (match_operand:VI_16_32 1 "register_operand" "%0,0,x,v")
> > -         (match_operand:VI_16_32 2 "register_operand" "r,x,x,v")))
> > +(define_insn "*mmx_<code><mode>3_1"
> > +  [(set (match_operand:MMXMODEI 0 "nonimmediate_operand" "=y,x,x,v,rm,r")
> > +        (any_logic:MMXMODEI
> > +         (match_operand:MMXMODEI 1 "nonimmediate_operand" "%0,0,x,v,0,0")
> > +         (match_operand:MMXMODEI 2 "nonimmediate_or_x86_64_vector_cst" "ym,x,x,v,ri,m")))
> > +    (clobber (reg:CC FLAGS_REG))]
> > +  "TARGET_64BIT
> > +   && (TARGET_MMX || TARGET_SSE2)
> > +   && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
> > +  "#"
> > +  [(set_attr "isa" "*,sse2_noavx,avx,avx512vl,x64,x64")
> > +   (set_attr "mmx_isa" "native,*,*,*,*,*")
> > +   (set_attr "type" "mmxadd,sselog,sselog,sselog,alu,alu")
> > +   (set_attr "mode" "DI,TI,TI,TI,DI,DI")])
> > +
> > +(define_split
> > +  [(set (match_operand:MMXMODEI 0 "register_operand")
> > +        (any_logic:MMXMODEI
> > +         (match_operand:MMXMODEI 1 "register_mmxmem_operand")
> > +         (match_operand:MMXMODEI 2 "register_mmxmem_operand")))
> >     (clobber (reg:CC FLAGS_REG))]
> > +  "TARGET_64BIT
> > +   && (TARGET_MMX || TARGET_SSE2)
> > +   && reload_completed
> > +   && !general_reg_operand (operands[0], <MODE>mode)"
> > +  [(set (match_dup 0)
> > +       (any_logic:<MODE> (match_dup 1) (match_dup 2)))])
> > +
> > +(define_expand "<code><mode>3"
> > +  [(set (match_operand:VI_16_32 0 "register_operand")
> > +        (any_logic:VI_16_32
> > +         (match_operand:VI_16_32 1 "register_operand")
> > +         (match_operand:VI_16_32 2 "register_operand")))]
> >    ""
> > +{
> > +  ix86_expand_binary_operator (<CODE>, <MODE>mode, operands);
> > +  DONE;
> > +})
> > +
> > +(define_insn "*<code><mode>3"
> > +  [(set (match_operand:VI_16_32 0 "nonimmediate_operand" "=rm,r,x,x,v")
> > +        (any_logic:VI_16_32
> > +         (match_operand:VI_16_32 1 "nonimmediate_operand" "%0,0,0,x,v")
> > +         (match_operand:VI_16_32 2 "nonimmediate_or_x86_64_vector_cst" "ri,m,x,x,v")))
> > +   (clobber (reg:CC FLAGS_REG))]
> > +  "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
> >    "#"
> > -  [(set_attr "isa" "*,sse2_noavx,avx,avx512vl")
> > -   (set_attr "type" "alu,sselog,sselog,sselog")
> > -   (set_attr "mode" "SI,TI,TI,TI")])
> > +  [(set_attr "isa" "*,*,sse2_noavx,avx,avx512vl")
> > +   (set_attr "type" "alu,alu,sselog,sselog,sselog")
> > +   (set_attr "mode" "SI,SI,TI,TI,TI")])
> >
> >  (define_split
> > -  [(set (match_operand:VI_16_32 0 "general_reg_operand")
> > -        (any_logic:VI_16_32
> > -         (match_operand:VI_16_32 1 "general_reg_operand")
> > -         (match_operand:VI_16_32 2 "general_reg_operand")))
> > +  [(set (match_operand:VI_16_32_64 0 "")
> > +        (any_logic:VI_16_32_64
> > +         (match_operand:VI_16_32_64 1 "")
> > +         (match_operand:VI_16_32_64 2 "")))
> >     (clobber (reg:CC FLAGS_REG))]
> > -  "reload_completed"
> > +  "reload_completed
> > +  && !sse_reg_operand (operands[1], <MODE>mode)
> > +  && !sse_reg_operand (operands[2], <MODE>mode)
> > +  && !sse_reg_operand (operands[0], <MODE>mode)"
> >    [(parallel
> >       [(set (match_dup 0)
> > -          (any_logic:SI (match_dup 1) (match_dup 2)))
> > +          (any_logic:<mmxinsnmode> (match_dup 1) (match_dup 2)))
> >        (clobber (reg:CC FLAGS_REG))])]
> >  {
> > -  operands[2] = lowpart_subreg (SImode, operands[2], <MODE>mode);
> > -  operands[1] = lowpart_subreg (SImode, operands[1], <MODE>mode);
> > -  operands[0] = lowpart_subreg (SImode, operands[0], <MODE>mode);
> > +  if (CONSTANT_P (operands[2]))
> > +    {
> > +      HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[2],
> > +                                                               <MODE>mode);
> > +      operands[2] = GEN_INT (val);
> > +     }
> > +   else
> > +    operands[2] = lowpart_subreg (<mmxinsnmode>mode, operands[2], <MODE>mode);
> > +  operands[1] = lowpart_subreg (<mmxinsnmode>mode, operands[1], <MODE>mode);
> > +  operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode);
> >  })
> >
> >  (define_split
> > diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
> > index c71c453cceb..62280f58478 100644
> > --- a/gcc/config/i386/predicates.md
> > +++ b/gcc/config/i386/predicates.md
> > @@ -1205,6 +1205,10 @@ (define_predicate "x86_64_const_vector_operand"
> >    return trunc_int_for_mode (val, SImode) == val;
> >  })
> >
> > +(define_predicate "nonimmediate_or_x86_64_vector_cst"
> > +  (ior (match_operand 0 "nonimmediate_operand")
> > +       (match_operand 0 "x86_64_const_vector_operand")))
> > +
> >  ;; Return true when OP is nonimmediate or standard SSE constant.
> >  (define_predicate "nonimmediate_or_sse_const_operand"
> >    (ior (match_operand 0 "nonimmediate_operand")
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106038-1.c b/gcc/testsuite/gcc.target/i386/pr106038-1.c
> > new file mode 100644
> > index 00000000000..ac5d1990682
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106038-1.c
> > @@ -0,0 +1,61 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-msse2 -O2" } */
> > +/* { dg-final { scan-assembler-not "xmm" } } */
> > +
> > +void
> > +foo (char* a, char* __restrict b)
> > +{
> > +  a[0] &= b[0];
> > +  a[1] &= b[1];
> > +  a[2] &= b[2];
> > +  a[3] &= b[3];
> > +}
> > +
> > +void
> > +foo1 (char* a, char* __restrict b)
> > +{
> > +  a[0] &= b[0];
> > +  a[1] &= b[1];
> > +}
> > +
> > +void
> > +foo2 (char* a, char* __restrict b)
> > +{
> > +  a[0] &= b[0];
> > +  a[1] &= b[1];
> > +  a[2] &= b[2];
> > +  a[3] &= b[3];
> > +  a[4] &= b[4];
> > +  a[5] &= b[5];
> > +  a[6] &= b[6];
> > +  a[7] &= b[7];
> > +}
> > +
> > +void
> > +foo3 (char* a, char* __restrict b)
> > +{
> > +  a[0] &= 1;
> > +  a[1] &= 2;
> > +  a[2] &= 3;
> > +  a[3] &= 3;
> > +}
> > +
> > +void
> > +foo4 (char* a, char* __restrict b)
> > +{
> > +  a[0] &= 1;
> > +  a[1] &= 2;
> > +}
> > +
> > +void
> > +foo5 (char* a, char* __restrict b)
> > +{
> > +  a[0] &= 1;
> > +  a[1] &= 2;
> > +  a[2] &= 2;
> > +  a[3] &= 3;
> > +  a[4] &= 4;
> > +  a[5] &= 5;
> > +  a[6] &= 6;
> > +  a[7] &= 7;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106038-2.c b/gcc/testsuite/gcc.target/i386/pr106038-2.c
> > new file mode 100644
> > index 00000000000..dce8a536a95
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106038-2.c
> > @@ -0,0 +1,35 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-msse2 -O2" } */
> > +/* { dg-final { scan-assembler-not "xmm" } } */
> > +
> > +void
> > +foo (short* a, short* __restrict b)
> > +{
> > +  a[0] &= b[0];
> > +  a[1] &= b[1];
> > +  a[2] &= b[2];
> > +  a[3] &= b[3];
> > +}
> > +
> > +void
> > +foo1 (short* a, short* __restrict b)
> > +{
> > +  a[0] &= b[0];
> > +  a[1] &= b[1];
> > +}
> > +
> > +void
> > +foo3 (short* a, short* __restrict b)
> > +{
> > +  a[0] &= 1;
> > +  a[1] &= 2;
> > +  a[2] &= 3;
> > +  a[3] &= 3;
> > +}
> > +
> > +void
> > +foo4 (short* a, short* __restrict b)
> > +{
> > +  a[0] &= 1;
> > +  a[1] &= 2;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/i386/pr106038-3.c b/gcc/testsuite/gcc.target/i386/pr106038-3.c
> > new file mode 100644
> > index 00000000000..3c7bd978f36
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/pr106038-3.c
> > @@ -0,0 +1,17 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-msse2 -O2" } */
> > +/* { dg-final { scan-assembler-not "xmm" } } */
> > +
> > +void
> > +foo1 (int* a, int* __restrict b)
> > +{
> > +  a[0] &= b[0];
> > +  a[1] &= b[1];
> > +}
> > +
> > +void
> > +foo4 (int* a, int* __restrict b)
> > +{
> > +  a[0] &= 1;
> > +  a[1] &= 2;
> > +}
> > --
> > 2.18.1
> >
Uros Bizjak July 12, 2022, 7:15 a.m. UTC | #3
On Tue, Jul 12, 2022 at 8:37 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Mon, Jul 11, 2022 at 4:03 PM Uros Bizjak via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> >
> > On Mon, Jul 11, 2022 at 3:15 AM liuhongt <hongtao.liu@intel.com> wrote:
> > >
> > > And split it to GPR-version instruction after reload.
> > >
> > > This will enable below optimization for 16/32/64-bit vector bit_op
> > >
> > > -       movd    (%rdi), %xmm0
> > > -       movd    (%rsi), %xmm1
> > > -       pand    %xmm1, %xmm0
> > > -       movd    %xmm0, (%rdi)
> > > +       movl    (%rsi), %eax
> > > +       andl    %eax, (%rdi)
> > >
> > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > > Ok for trunk?
> >
> > The patch will create many interunit moves (xmm <-> gpr) for anything
> > but the most simple logic sequences, because operations with
> > memory/immediate will be forced into GPR registers, while reg/reg
> > operations will remain in XMM registers.
> Agree not to deal with mem/immediate at first.
> >
> > I tried to introduce GPR registers to MMX logic insns in the past and
> > observed the above behavior, but perhaps RA evolved in the mean time
> > to handle different register sets better (especially under register
> > pressure). However, I would advise to be careful with this
> > functionality.
> >
> > Perhaps this problem should be attacked in stages. First, please
> > introduce GPR registers to MMX logic instructions (similar to how
> > VI_16_32 mode instructions are handled). After RA effects will be
> There's "?r" in VI_16_32 logic instructions which prevent RA allocate
> gpr for testcase in the patch.
> Is it ok to remove "?" for them(Also add alternative "r" instead of
> "?r" in mmx logic insns)?
> If there's other instructions that prefer "v to "r", then RA will
> allocate "v", but for logic instructions, "r" and “v" should be
> treated equally, just as in the 16/32/64-bit vector
> mov<mode>_internal.

?r was introduced under the assumption that we want vector values
mostly in vector registers. Currently there are no instructions with
memory or immediate operands, so that made sense at the time. Let's
keep ?r until logic instructions with mem/imm operands are introduced.
So, for the patch that adds 64-bit vector logic in GPR, I would advise
first introducing only register operands. mem/imm operands should be
added in a follow-up patch, in which the "?r" constraint will also be
relaxed.

Uros.
diff mbox series

Patch

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 3294c1e6274..85b06abea27 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -75,6 +75,11 @@  (define_mode_iterator V_16_32_64
     (V8QI "TARGET_64BIT") (V4HI "TARGET_64BIT") (V4HF "TARGET_64BIT")
     (V2SI "TARGET_64BIT") (V2SF "TARGET_64BIT")])
 
+(define_mode_iterator VI_16_32_64
+   [V2QI V4QI V2HI
+    (V8QI "TARGET_64BIT") (V4HI "TARGET_64BIT")
+    (V2SI "TARGET_64BIT")])
+
 ;; V2S* modes
 (define_mode_iterator V2FI [V2SF V2SI])
 
@@ -86,6 +91,14 @@  (define_mode_attr mmxvecsize
   [(V8QI "b") (V4QI "b") (V2QI "b")
    (V4HI "w") (V2HI "w") (V2SI "d") (V1DI "q")])
 
+;; Mapping to same size integral mode.
+(define_mode_attr mmxinsnmode
+  [(V8QI "DI") (V4QI "SI") (V2QI "HI")
+   (V4HI "DI") (V2HI "SI")
+   (V2SI "DI")
+   (V4HF "DI") (V2HF "SI")
+   (V2SF "DI")])
+
 (define_mode_attr mmxdoublemode
   [(V8QI "V8HI") (V4HI "V4SI")])
 
@@ -350,22 +363,7 @@  (define_insn_and_split "*mov<mode>_imm"
   HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[1],
 							    <MODE>mode);
   operands[1] = GEN_INT (val);
-  machine_mode mode;
-  switch (GET_MODE_SIZE (<MODE>mode))
-    {
-    case 2:
-      mode = HImode;
-      break;
-    case 4:
-      mode = SImode;
-      break;
-    case 8:
-      mode = DImode;
-      break;
-    default:
-      gcc_unreachable ();
-    }
-  operands[0] = lowpart_subreg (mode, operands[0], <MODE>mode);
+  operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode);
 })
 
 ;; For TARGET_64BIT we always round up to 8 bytes.
@@ -2948,14 +2946,28 @@  (define_expand "mmx_<code><mode>3"
 	  (match_operand:MMXMODEI 1 "register_mmxmem_operand")
 	  (match_operand:MMXMODEI 2 "register_mmxmem_operand")))]
   "TARGET_MMX || TARGET_MMX_WITH_SSE"
-  "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")
+{
+  ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);
+  if (TARGET_64BIT)
+  {
+    ix86_expand_binary_operator (<CODE>, <MODE>mode, operands);
+    DONE;
+  }
+})
 
 (define_expand "<code><mode>3"
   [(set (match_operand:MMXMODEI 0 "register_operand")
 	(any_logic:MMXMODEI
 	  (match_operand:MMXMODEI 1 "register_operand")
 	  (match_operand:MMXMODEI 2 "register_operand")))]
-  "TARGET_MMX_WITH_SSE")
+  "TARGET_MMX_WITH_SSE"
+{
+  if (TARGET_64BIT)
+    {
+      ix86_expand_binary_operator (<CODE>, <MODE>mode, operands);
+      DONE;
+    }
+})
 
 (define_insn "*mmx_<code><mode>3"
   [(set (match_operand:MMXMODEI 0 "register_operand" "=y,x,x,v")
@@ -2974,33 +2986,82 @@  (define_insn "*mmx_<code><mode>3"
    (set_attr "type" "mmxadd,sselog,sselog,sselog")
    (set_attr "mode" "DI,TI,TI,TI")])
 
-(define_insn "<code><mode>3"
-  [(set (match_operand:VI_16_32 0 "register_operand" "=?r,x,x,v")
-        (any_logic:VI_16_32
-	  (match_operand:VI_16_32 1 "register_operand" "%0,0,x,v")
-	  (match_operand:VI_16_32 2 "register_operand" "r,x,x,v")))
+(define_insn "*mmx_<code><mode>3_1"
+  [(set (match_operand:MMXMODEI 0 "nonimmediate_operand" "=y,x,x,v,rm,r")
+        (any_logic:MMXMODEI
+	  (match_operand:MMXMODEI 1 "nonimmediate_operand" "%0,0,x,v,0,0")
+	  (match_operand:MMXMODEI 2 "nonimmediate_or_x86_64_vector_cst" "ym,x,x,v,ri,m")))
+    (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT
+   && (TARGET_MMX || TARGET_SSE2)
+   && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
+  "#"
+  [(set_attr "isa" "*,sse2_noavx,avx,avx512vl,x64,x64")
+   (set_attr "mmx_isa" "native,*,*,*,*,*")
+   (set_attr "type" "mmxadd,sselog,sselog,sselog,alu,alu")
+   (set_attr "mode" "DI,TI,TI,TI,DI,DI")])
+
+(define_split
+  [(set (match_operand:MMXMODEI 0 "register_operand")
+        (any_logic:MMXMODEI
+	  (match_operand:MMXMODEI 1 "register_mmxmem_operand")
+	  (match_operand:MMXMODEI 2 "register_mmxmem_operand")))
    (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT
+   && (TARGET_MMX || TARGET_SSE2)
+   && reload_completed
+   && !general_reg_operand (operands[0], <MODE>mode)"
+  [(set (match_dup 0)
+	(any_logic:<MODE> (match_dup 1) (match_dup 2)))])
+
+(define_expand "<code><mode>3"
+  [(set (match_operand:VI_16_32 0 "register_operand")
+        (any_logic:VI_16_32
+	  (match_operand:VI_16_32 1 "register_operand")
+	  (match_operand:VI_16_32 2 "register_operand")))]
   ""
+{
+  ix86_expand_binary_operator (<CODE>, <MODE>mode, operands);
+  DONE;
+})
+
+(define_insn "*<code><mode>3"
+  [(set (match_operand:VI_16_32 0 "nonimmediate_operand" "=rm,r,x,x,v")
+        (any_logic:VI_16_32
+	  (match_operand:VI_16_32 1 "nonimmediate_operand" "%0,0,0,x,v")
+	  (match_operand:VI_16_32 2 "nonimmediate_or_x86_64_vector_cst" "ri,m,x,x,v")))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)"
   "#"
-  [(set_attr "isa" "*,sse2_noavx,avx,avx512vl")
-   (set_attr "type" "alu,sselog,sselog,sselog")
-   (set_attr "mode" "SI,TI,TI,TI")])
+  [(set_attr "isa" "*,*,sse2_noavx,avx,avx512vl")
+   (set_attr "type" "alu,alu,sselog,sselog,sselog")
+   (set_attr "mode" "SI,SI,TI,TI,TI")])
 
 (define_split
-  [(set (match_operand:VI_16_32 0 "general_reg_operand")
-        (any_logic:VI_16_32
-	  (match_operand:VI_16_32 1 "general_reg_operand")
-	  (match_operand:VI_16_32 2 "general_reg_operand")))
+  [(set (match_operand:VI_16_32_64 0 "")
+        (any_logic:VI_16_32_64
+	  (match_operand:VI_16_32_64 1 "")
+	  (match_operand:VI_16_32_64 2 "")))
    (clobber (reg:CC FLAGS_REG))]
-  "reload_completed"
+  "reload_completed
+  && !sse_reg_operand (operands[1], <MODE>mode)
+  && !sse_reg_operand (operands[2], <MODE>mode)
+  && !sse_reg_operand (operands[0], <MODE>mode)"
   [(parallel
      [(set (match_dup 0)
-	   (any_logic:SI (match_dup 1) (match_dup 2)))
+	   (any_logic:<mmxinsnmode> (match_dup 1) (match_dup 2)))
       (clobber (reg:CC FLAGS_REG))])]
 {
-  operands[2] = lowpart_subreg (SImode, operands[2], <MODE>mode);
-  operands[1] = lowpart_subreg (SImode, operands[1], <MODE>mode);
-  operands[0] = lowpart_subreg (SImode, operands[0], <MODE>mode);
+  if (CONSTANT_P (operands[2]))
+    {
+      HOST_WIDE_INT val = ix86_convert_const_vector_to_integer (operands[2],
+								<MODE>mode);
+      operands[2] = GEN_INT (val);
+     }
+   else
+    operands[2] = lowpart_subreg (<mmxinsnmode>mode, operands[2], <MODE>mode);
+  operands[1] = lowpart_subreg (<mmxinsnmode>mode, operands[1], <MODE>mode);
+  operands[0] = lowpart_subreg (<mmxinsnmode>mode, operands[0], <MODE>mode);
 })
 
 (define_split
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index c71c453cceb..62280f58478 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1205,6 +1205,10 @@  (define_predicate "x86_64_const_vector_operand"
   return trunc_int_for_mode (val, SImode) == val;
 })
 
+(define_predicate "nonimmediate_or_x86_64_vector_cst"
+  (ior (match_operand 0 "nonimmediate_operand")
+       (match_operand 0 "x86_64_const_vector_operand")))
+
 ;; Return true when OP is nonimmediate or standard SSE constant.
 (define_predicate "nonimmediate_or_sse_const_operand"
   (ior (match_operand 0 "nonimmediate_operand")
diff --git a/gcc/testsuite/gcc.target/i386/pr106038-1.c b/gcc/testsuite/gcc.target/i386/pr106038-1.c
new file mode 100644
index 00000000000..ac5d1990682
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106038-1.c
@@ -0,0 +1,61 @@ 
+/* { dg-do compile } */
+/* { dg-options "-msse2 -O2" } */
+/* { dg-final { scan-assembler-not "xmm" } } */
+
+void
+foo (char* a, char* __restrict b)
+{
+  a[0] &= b[0];
+  a[1] &= b[1];
+  a[2] &= b[2];
+  a[3] &= b[3];
+}
+
+void
+foo1 (char* a, char* __restrict b)
+{
+  a[0] &= b[0];
+  a[1] &= b[1];
+}
+
+void
+foo2 (char* a, char* __restrict b)
+{
+  a[0] &= b[0];
+  a[1] &= b[1];
+  a[2] &= b[2];
+  a[3] &= b[3];
+  a[4] &= b[4];
+  a[5] &= b[5];
+  a[6] &= b[6];
+  a[7] &= b[7];
+}
+
+void
+foo3 (char* a, char* __restrict b)
+{
+  a[0] &= 1;
+  a[1] &= 2;
+  a[2] &= 3;
+  a[3] &= 3;
+}
+
+void
+foo4 (char* a, char* __restrict b)
+{
+  a[0] &= 1;
+  a[1] &= 2;
+}
+
+void
+foo5 (char* a, char* __restrict b)
+{
+  a[0] &= 1;
+  a[1] &= 2;
+  a[2] &= 2;
+  a[3] &= 3;
+  a[4] &= 4;
+  a[5] &= 5;
+  a[6] &= 6;
+  a[7] &= 7;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106038-2.c b/gcc/testsuite/gcc.target/i386/pr106038-2.c
new file mode 100644
index 00000000000..dce8a536a95
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106038-2.c
@@ -0,0 +1,35 @@ 
+/* { dg-do compile } */
+/* { dg-options "-msse2 -O2" } */
+/* { dg-final { scan-assembler-not "xmm" } } */
+
+void
+foo (short* a, short* __restrict b)
+{
+  a[0] &= b[0];
+  a[1] &= b[1];
+  a[2] &= b[2];
+  a[3] &= b[3];
+}
+
+void
+foo1 (short* a, short* __restrict b)
+{
+  a[0] &= b[0];
+  a[1] &= b[1];
+}
+
+void
+foo3 (short* a, short* __restrict b)
+{
+  a[0] &= 1;
+  a[1] &= 2;
+  a[2] &= 3;
+  a[3] &= 3;
+}
+
+void
+foo4 (short* a, short* __restrict b)
+{
+  a[0] &= 1;
+  a[1] &= 2;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr106038-3.c b/gcc/testsuite/gcc.target/i386/pr106038-3.c
new file mode 100644
index 00000000000..3c7bd978f36
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr106038-3.c
@@ -0,0 +1,17 @@ 
+/* { dg-do compile } */
+/* { dg-options "-msse2 -O2" } */
+/* { dg-final { scan-assembler-not "xmm" } } */
+
+void
+foo1 (int* a, int* __restrict b)
+{
+  a[0] &= b[0];
+  a[1] &= b[1];
+}
+
+void
+foo4 (int* a, int* __restrict b)
+{
+  a[0] &= 1;
+  a[1] &= 2;
+}