diff mbox series

Disparage slightly the mask register alternative for bitwise operations. [PR target/101142]

Message ID 20210621045611.46842-1-hongtao.liu@intel.com
State New
Headers show
Series Disparage slightly the mask register alternative for bitwise operations. [PR target/101142] | expand

Commit Message

liuhongt June 21, 2021, 4:56 a.m. UTC
AVX512 supports bitwise operations with mask registers, but the
throughput of those instructions is much lower than that of the
corresponding GPR version, so we additionally disparage
slightly the mask register alternative for bitwise operations in
the LRA.

Also, when the allocno cost of GENERAL_REGS is the same as that of
MASK_REGS, allocate MASK_REGS first since it has already been disparaged.

gcc/ChangeLog:

	PR target/101142
	* config/i386/i386.md (*anddi_1): Disparage slightly the mask
	register alternative.
	(*and<mode>_1): Ditto.
	(*andqi_1): Ditto.
	(*andn<mode>_1): Ditto.
	(*<code><mode>_1): Ditto.
	(*<code>qi_1): Ditto.
	(*one_cmpl<mode>2_1): Ditto.
	(*one_cmplsi2_1_zext): Ditto.
	(*one_cmplqi2_1): Ditto.
	* config/i386/i386.c (x86_order_regs_for_local_alloc): Change
	the order of mask registers to be before general registers.

gcc/testsuite/ChangeLog:

	PR target/101142
	* gcc.target/i386/spill_to_mask-1.c: Adjust testcase.
	* gcc.target/i386/spill_to_mask-2.c: Adjust testcase.
	* gcc.target/i386/spill_to_mask-3.c: Adjust testcase.
	* gcc.target/i386/spill_to_mask-4.c: Adjust testcase.
---
 gcc/config/i386/i386.c                        |  8 +-
 gcc/config/i386/i386.md                       | 20 ++---
 .../gcc.target/i386/spill_to_mask-1.c         | 89 +++++++++++++------
 .../gcc.target/i386/spill_to_mask-2.c         | 11 ++-
 .../gcc.target/i386/spill_to_mask-3.c         | 11 ++-
 .../gcc.target/i386/spill_to_mask-4.c         | 11 ++-
 6 files changed, 91 insertions(+), 59 deletions(-)

Comments

Uros Bizjak June 21, 2021, 7:27 a.m. UTC | #1
On Mon, Jun 21, 2021 at 6:56 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> The avx512 supports bitwise operations with mask registers, but the
> throughput of those instructions is much lower than that of the
> corresponding gpr version, so we would additionally disparages
> slightly the mask register alternative for bitwise operations in the
> LRA.

This was the reason for UNSPEC tagged instructions with mask
registers, used mainly for builtins.

Also, care should be taken if we want mask registers to be used under
GPR pressure, or it is better to spill GPR registers. In the past,
DImode values caused a lot of problems with MMX registers on x86-64,
but we were able to hand-tune the allocator in the way you propose.

Let's try the proposed approach to see what happens.

> Also when allocano cost of GENERAL_REGS is same as MASK_REGS, allocate
> MASK_REGS first since it has already been disparaged.
>
> gcc/ChangeLog:
>
>         PR target/101142
>         * config/i386/i386.md: (*anddi_1): Disparage slightly the mask
>         register alternative.
>         (*and<mode>_1): Ditto.
>         (*andqi_1): Ditto.
>         (*andn<mode>_1): Ditto.
>         (*<code><mode>_1): Ditto.
>         (*<code>qi_1): Ditto.
>         (*one_cmpl<mode>2_1): Ditto.
>         (*one_cmplsi2_1_zext): Ditto.
>         (*one_cmplqi2_1): Ditto.
>         * config/i386/i386.c (x86_order_regs_for_local_alloc): Change
>         the order of mask registers to be before general registers.
>
> gcc/testsuite/ChangeLog:
>
>         PR target/101142
>         * gcc.target/i386/spill_to_mask-1.c: Adjust testcase.
>         * gcc.target/i386/spill_to_mask-2.c: Adjust testcase.
>         * gcc.target/i386/spill_to_mask-3.c: Adjust testcase.
>         * gcc.target/i386/spill_to_mask-4.c: Adjust testcase.

OK with a comment addition, see inline.

Thanks,
Uros.

> ---
>  gcc/config/i386/i386.c                        |  8 +-
>  gcc/config/i386/i386.md                       | 20 ++---
>  .../gcc.target/i386/spill_to_mask-1.c         | 89 +++++++++++++------
>  .../gcc.target/i386/spill_to_mask-2.c         | 11 ++-
>  .../gcc.target/i386/spill_to_mask-3.c         | 11 ++-
>  .../gcc.target/i386/spill_to_mask-4.c         | 11 ++-
>  6 files changed, 91 insertions(+), 59 deletions(-)
>
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index a61255857ff..a651853ca3b 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -20463,6 +20463,10 @@ x86_order_regs_for_local_alloc (void)
>     int pos = 0;
>     int i;
>
> +   /* Mask register.  */
> +   for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
> +     reg_alloc_order [pos++] = i;

Please add a comment why mask registers should come first.

>     /* First allocate the local general purpose registers.  */
>     for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
>       if (GENERAL_REGNO_P (i) && call_used_or_fixed_reg_p (i))
> @@ -20489,10 +20493,6 @@ x86_order_regs_for_local_alloc (void)
>     for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
>       reg_alloc_order [pos++] = i;
>
> -   /* Mask register.  */
> -   for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
> -     reg_alloc_order [pos++] = i;
> -
>     /* x87 registers.  */
>     if (TARGET_SSE_MATH)
>       for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> index 6e4abf32e7c..3eef56b27d7 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -9138,7 +9138,7 @@ (define_insn_and_split "*anddi3_doubleword"
>  })
>
>  (define_insn "*anddi_1"
> -  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r,r,k")
> +  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r,r,?k")
>         (and:DI
>          (match_operand:DI 1 "nonimmediate_operand" "%0,0,0,qm,k")
>          (match_operand:DI 2 "x86_64_szext_general_operand" "Z,re,m,L,k")))
> @@ -9226,7 +9226,7 @@ (define_insn "*andsi_1_zext"
>     (set_attr "mode" "SI")])
>
>  (define_insn "*and<mode>_1"
> -  [(set (match_operand:SWI24 0 "nonimmediate_operand" "=rm,r,Ya,k")
> +  [(set (match_operand:SWI24 0 "nonimmediate_operand" "=rm,r,Ya,?k")
>         (and:SWI24 (match_operand:SWI24 1 "nonimmediate_operand" "%0,0,qm,k")
>                    (match_operand:SWI24 2 "<general_operand>" "r<i>,m,L,k")))
>     (clobber (reg:CC FLAGS_REG))]
> @@ -9255,7 +9255,7 @@ (define_insn "*and<mode>_1"
>     (set_attr "mode" "<MODE>,<MODE>,SI,<MODE>")])
>
>  (define_insn "*andqi_1"
> -  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,k")
> +  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,?k")
>         (and:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0,k")
>                 (match_operand:QI 2 "general_operand" "qn,m,rn,k")))
>     (clobber (reg:CC FLAGS_REG))]
> @@ -9651,7 +9651,7 @@ (define_split
>  })
>
>  (define_insn "*andn<mode>_1"
> -  [(set (match_operand:SWI48 0 "register_operand" "=r,r,k")
> +  [(set (match_operand:SWI48 0 "register_operand" "=r,r,?k")
>         (and:SWI48
>           (not:SWI48 (match_operand:SWI48 1 "register_operand" "r,r,k"))
>           (match_operand:SWI48 2 "nonimmediate_operand" "r,m,k")))
> @@ -9667,7 +9667,7 @@ (define_insn "*andn<mode>_1"
>     (set_attr "mode" "<MODE>")])
>
>  (define_insn "*andn<mode>_1"
> -  [(set (match_operand:SWI12 0 "register_operand" "=r,k")
> +  [(set (match_operand:SWI12 0 "register_operand" "=r,?k")
>         (and:SWI12
>           (not:SWI12 (match_operand:SWI12 1 "register_operand" "r,k"))
>           (match_operand:SWI12 2 "register_operand" "r,k")))
> @@ -9757,7 +9757,7 @@ (define_insn_and_split "*<code>di3_doubleword"
>  })
>
>  (define_insn "*<code><mode>_1"
> -  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,r,k")
> +  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,r,?k")
>         (any_or:SWI248
>          (match_operand:SWI248 1 "nonimmediate_operand" "%0,0,k")
>          (match_operand:SWI248 2 "<general_operand>" "r<i>,m,k")))
> @@ -9847,7 +9847,7 @@ (define_insn "*<code>si_1_zext_imm"
>     (set_attr "mode" "SI")])
>
>  (define_insn "*<code>qi_1"
> -  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,k")
> +  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,?k")
>         (any_or:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0,k")
>                    (match_operand:QI 2 "general_operand" "qn,m,rn,k")))
>     (clobber (reg:CC FLAGS_REG))]
> @@ -10603,7 +10603,7 @@ (define_insn_and_split "*one_cmpldi2_doubleword"
>    "split_double_mode (DImode, &operands[0], 2, &operands[0], &operands[2]);")
>
>  (define_insn "*one_cmpl<mode>2_1"
> -  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,k")
> +  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,?k")
>         (not:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "0,k")))]
>    "ix86_unary_operator_ok (NOT, <MODE>mode, operands)"
>    "@
> @@ -10620,7 +10620,7 @@ (define_insn "*one_cmpl<mode>2_1"
>     (set_attr "mode" "<MODE>")])
>
>  (define_insn "*one_cmplsi2_1_zext"
> -  [(set (match_operand:DI 0 "register_operand" "=r,k")
> +  [(set (match_operand:DI 0 "register_operand" "=r,?k")
>         (zero_extend:DI
>           (not:SI (match_operand:SI 1 "register_operand" "0,k"))))]
>    "TARGET_64BIT && ix86_unary_operator_ok (NOT, SImode, operands)"
> @@ -10632,7 +10632,7 @@ (define_insn "*one_cmplsi2_1_zext"
>     (set_attr "mode" "SI,SI")])
>
>  (define_insn "*one_cmplqi2_1"
> -  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r,k")
> +  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r,?k")
>         (not:QI (match_operand:QI 1 "nonimmediate_operand" "0,0,k")))]
>    "ix86_unary_operator_ok (NOT, QImode, operands)"
>    "@
> diff --git a/gcc/testsuite/gcc.target/i386/spill_to_mask-1.c b/gcc/testsuite/gcc.target/i386/spill_to_mask-1.c
> index c5043e224ea..94d6764fc56 100644
> --- a/gcc/testsuite/gcc.target/i386/spill_to_mask-1.c
> +++ b/gcc/testsuite/gcc.target/i386/spill_to_mask-1.c
> @@ -1,15 +1,31 @@
>  /* { dg-do compile } */
> -/* { dg-options "-O2 -march=skylake-avx512" } */
> -
> -#ifndef DTYPE
> -#define DTYPE u32
> -#endif
> +/* { dg-options "-O2 -march=skylake-avx512 -DDTYPE32" } */
>
>  typedef unsigned long long u64;
>  typedef unsigned int u32;
>  typedef unsigned short u16;
>  typedef unsigned char u8;
>
> +#ifdef DTYPE32
> +typedef u32 DTYPE;
> +#define byteswap byteswapu32
> +#endif
> +
> +#ifdef DTYPE16
> +typedef u16 DTYPE;
> +#define byteswap byteswapu16
> +#endif
> +
> +#ifdef DTYPE8
> +typedef u16 DTYPE;
> +#define byteswap byteswapu8
> +#endif
> +
> +#ifdef DTYPE64
> +typedef u16 DTYPE;
> +#define byteswap byteswapu64
> +#endif
> +
>  #define R(x,n) ( (x >> n) | (x << (32 - n)))
>
>  #define S0(x) (R(x, 2) ^ R(x,13) ^ R(x,22))
> @@ -23,36 +39,51 @@ typedef unsigned char u8;
>      d += tmp1;                                           \
>  }
>
> -static inline DTYPE byteswap(DTYPE x)
> +static inline u32 byteswapu32(u32 x)
>  {
> -       x = (x & 0x0000FFFF) << 16 | (x & 0xFFFF0000) >> 16;
> -       x = (x & 0x00FF00FF) << 8 | (x & 0xFF00FF00) >> 8;
> -       return x;
> +  x = (x & 0x0000FFFF) << 16 | (x & 0xFFFF0000) >> 16;
> +  x = (x & 0x00FF00FF) << 8 | (x & 0xFF00FF00) >> 8;
> +  return x;
>  }
>
> -#define BE_LOAD32(n,b,i) (n) = byteswap(*(DTYPE *)(b + i))
> +static inline u16 byteswapu16(u16 x)
> +{
> +  x = (x & 0x00FF) << 8 | (x & 0xFF00) >> 8;
> +  return x;
> +}
> +
> +static inline u8 byteswapu8(u8 x)
> +{
> +  return x;
> +}
> +
> +static inline u64 byteswapu64(u64 x)
> +{
> +  x = ((u64)(byteswapu32 (x & 0x00000000FFFFFFFF))) << 32 | byteswapu32((x & 0xFFFFFFFF00000000) >> 32);
> +  return x;
> +}
>
> -void foo (u8 *in, DTYPE out[8], const DTYPE C[16])
> +void foo (DTYPE in[16], DTYPE out[8], const DTYPE C[16])
>  {
>      DTYPE tmp1 = 0, tmp2 = 0, a, b, c, d, e, f, g, h;
>      DTYPE w0, w1, w2, w3, w4, w5, w6, w7,
>         w8, w9, w10, w11, w12, w13, w14, w15;
> -    w0  = byteswap(*(DTYPE *)(in + 0));
> -    w1  = byteswap(*(DTYPE *)(in + 4));
> -    w2  = byteswap(*(DTYPE *)(in + 8));
> -    w3  = byteswap(*(DTYPE *)(in + 12));
> -    w4  = byteswap(*(DTYPE *)(in + 16));
> -    w5  = byteswap(*(DTYPE *)(in + 20));
> -    w6  = byteswap(*(DTYPE *)(in + 24));
> -    w7  = byteswap(*(DTYPE *)(in + 28));
> -    w8  = byteswap(*(DTYPE *)(in + 32));
> -    w9  = byteswap(*(DTYPE *)(in + 36));
> -    w10 = byteswap(*(DTYPE *)(in + 40));
> -    w11 = byteswap(*(DTYPE *)(in + 44));
> -    w12 = byteswap(*(DTYPE *)(in + 48));
> -    w13 = byteswap(*(DTYPE *)(in + 52));
> -    w14 = byteswap(*(DTYPE *)(in + 56));
> -    w15 = byteswap(*(DTYPE *)(in + 60));
> +    w0  = byteswap(in[0]);
> +    w1  = byteswap(in[1]);
> +    w2  = byteswap(in[2]);
> +    w3  = byteswap(in[3]);
> +    w4  = byteswap(in[4]);
> +    w5  = byteswap(in[5]);
> +    w6  = byteswap(in[6]);
> +    w7  = byteswap(in[7]);
> +    w8  = byteswap(in[8]);
> +    w9  = byteswap(in[9]);
> +    w10 = byteswap(in[10]);
> +    w11 = byteswap(in[11]);
> +    w12 = byteswap(in[12]);
> +    w13 = byteswap(in[13]);
> +    w14 = byteswap(in[14]);
> +    w15 = byteswap(in[15]);
>      a = out[0];
>      b = out[1];
>      c = out[2];
> @@ -90,3 +121,7 @@ void foo (u8 *in, DTYPE out[8], const DTYPE C[16])
>  }
>
>  /* { dg-final { scan-assembler "kmovd" } } */
> +/* { dg-final { scan-assembler-not "knot" } } */
> +/* { dg-final { scan-assembler-not "kxor" } } */
> +/* { dg-final { scan-assembler-not "kor" } } */
> +/* { dg-final { scan-assembler-not "kandn" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/spill_to_mask-2.c b/gcc/testsuite/gcc.target/i386/spill_to_mask-2.c
> index 1f0c6b474d6..c7d09769ed5 100644
> --- a/gcc/testsuite/gcc.target/i386/spill_to_mask-2.c
> +++ b/gcc/testsuite/gcc.target/i386/spill_to_mask-2.c
> @@ -1,10 +1,9 @@
>  /* { dg-do compile } */
> -/* { dg-options "-O2 -march=skylake-avx512" } */
> -
> -#ifndef DTYPE
> -#define DTYPE u16
> -#endif
> +/* { dg-options "-O2 -march=skylake-avx512 -DDTYPE16" } */
>
>  #include "spill_to_mask-1.c"
>
> -/* { dg-final { scan-assembler "kmovw" } } */
> +/* { dg-final { scan-assembler-not "knot" } } */
> +/* { dg-final { scan-assembler-not "kxor" } } */
> +/* { dg-final { scan-assembler-not "kor" } } */
> +/* { dg-final { scan-assembler-not "kandn" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/spill_to_mask-3.c b/gcc/testsuite/gcc.target/i386/spill_to_mask-3.c
> index 5b59090c296..b7a383fe12f 100644
> --- a/gcc/testsuite/gcc.target/i386/spill_to_mask-3.c
> +++ b/gcc/testsuite/gcc.target/i386/spill_to_mask-3.c
> @@ -1,10 +1,9 @@
>  /* { dg-do compile } */
> -/* { dg-options "-O2 -march=skylake-avx512" } */
> -
> -#ifndef DTYPE
> -#define DTYPE u8
> -#endif
> +/* { dg-options "-O2 -march=skylake-avx512 -DDTYPE8" } */
>
>  #include "spill_to_mask-1.c"
>
> -/* { dg-final { scan-assembler "kmovb" } } */
> +/* { dg-final { scan-assembler-not "knot" } } */
> +/* { dg-final { scan-assembler-not "kxor" } } */
> +/* { dg-final { scan-assembler-not "kor" } } */
> +/* { dg-final { scan-assembler-not "kandn" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/spill_to_mask-4.c b/gcc/testsuite/gcc.target/i386/spill_to_mask-4.c
> index f111cf42b36..d67038d5b88 100644
> --- a/gcc/testsuite/gcc.target/i386/spill_to_mask-4.c
> +++ b/gcc/testsuite/gcc.target/i386/spill_to_mask-4.c
> @@ -1,10 +1,9 @@
>  /* { dg-do compile  { target { ! ia32 } } } */
> -/* { dg-options "-O2 -march=skylake-avx512" } */
> -
> -#ifndef DTYPE
> -#define DTYPE u64
> -#endif
> +/* { dg-options "-O2 -march=skylake-avx512 -DDTYPE64" } */
>
>  #include "spill_to_mask-1.c"
>
> -/* { dg-final { scan-assembler "kmovq" } } */
> +/* { dg-final { scan-assembler-not "knot" } } */
> +/* { dg-final { scan-assembler-not "kxor" } } */
> +/* { dg-final { scan-assembler-not "kor" } } */
> +/* { dg-final { scan-assembler-not "kandn" } } */
> --
> 2.18.1
>
Hongtao Liu June 21, 2021, 8:12 a.m. UTC | #2
On Mon, Jun 21, 2021 at 3:28 PM Uros Bizjak via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> On Mon, Jun 21, 2021 at 6:56 AM liuhongt <hongtao.liu@intel.com> wrote:
> >
> > The avx512 supports bitwise operations with mask registers, but the
> > throughput of those instructions is much lower than that of the
> > corresponding gpr version, so we would additionally disparages
> > slightly the mask register alternative for bitwise operations in the
> > LRA.
>
> This was the reason for UNSPEC tagged instructions with mask
> registers, used mainly for builtins.
>
> Also, care should be taken if we want mask registers to be used under
> GPR pressure, or it is better to spill GPR registers. In the past,
> DImode values caused a lot of problems with MMX registers on x86-64,
> but we were able to hand-tune the allocator in the way you propose.
>
> Let's try the proposed approach to see what happens.
>
> > Also when allocano cost of GENERAL_REGS is same as MASK_REGS, allocate
> > MASK_REGS first since it has already been disparaged.
> >
> > gcc/ChangeLog:
> >
> >         PR target/101142
> >         * config/i386/i386.md: (*anddi_1): Disparage slightly the mask
> >         register alternative.
> >         (*and<mode>_1): Ditto.
> >         (*andqi_1): Ditto.
> >         (*andn<mode>_1): Ditto.
> >         (*<code><mode>_1): Ditto.
> >         (*<code>qi_1): Ditto.
> >         (*one_cmpl<mode>2_1): Ditto.
> >         (*one_cmplsi2_1_zext): Ditto.
> >         (*one_cmplqi2_1): Ditto.
> >         * config/i386/i386.c (x86_order_regs_for_local_alloc): Change
> >         the order of mask registers to be before general registers.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         PR target/101142
> >         * gcc.target/i386/spill_to_mask-1.c: Adjust testcase.
> >         * gcc.target/i386/spill_to_mask-2.c: Adjust testcase.
> >         * gcc.target/i386/spill_to_mask-3.c: Adjust testcase.
> >         * gcc.target/i386/spill_to_mask-4.c: Adjust testcase.
>
> OK with a comment addition, see inline.
>
> Thanks,
> Uros.
>
> > ---
> >  gcc/config/i386/i386.c                        |  8 +-
> >  gcc/config/i386/i386.md                       | 20 ++---
> >  .../gcc.target/i386/spill_to_mask-1.c         | 89 +++++++++++++------
> >  .../gcc.target/i386/spill_to_mask-2.c         | 11 ++-
> >  .../gcc.target/i386/spill_to_mask-3.c         | 11 ++-
> >  .../gcc.target/i386/spill_to_mask-4.c         | 11 ++-
> >  6 files changed, 91 insertions(+), 59 deletions(-)
> >
> > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> > index a61255857ff..a651853ca3b 100644
> > --- a/gcc/config/i386/i386.c
> > +++ b/gcc/config/i386/i386.c
> > @@ -20463,6 +20463,10 @@ x86_order_regs_for_local_alloc (void)
> >     int pos = 0;
> >     int i;
> >
> > +   /* Mask register.  */
> > +   for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
> > +     reg_alloc_order [pos++] = i;
>
> Please add a comment why mask registers should come first.
Thanks for the review, this is the patch i'm check in.
>
> >     /* First allocate the local general purpose registers.  */
> >     for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
> >       if (GENERAL_REGNO_P (i) && call_used_or_fixed_reg_p (i))
> > @@ -20489,10 +20493,6 @@ x86_order_regs_for_local_alloc (void)
> >     for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
> >       reg_alloc_order [pos++] = i;
> >
> > -   /* Mask register.  */
> > -   for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
> > -     reg_alloc_order [pos++] = i;
> > -
> >     /* x87 registers.  */
> >     if (TARGET_SSE_MATH)
> >       for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
> > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> > index 6e4abf32e7c..3eef56b27d7 100644
> > --- a/gcc/config/i386/i386.md
> > +++ b/gcc/config/i386/i386.md
> > @@ -9138,7 +9138,7 @@ (define_insn_and_split "*anddi3_doubleword"
> >  })
> >
> >  (define_insn "*anddi_1"
> > -  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r,r,k")
> > +  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r,r,?k")
> >         (and:DI
> >          (match_operand:DI 1 "nonimmediate_operand" "%0,0,0,qm,k")
> >          (match_operand:DI 2 "x86_64_szext_general_operand" "Z,re,m,L,k")))
> > @@ -9226,7 +9226,7 @@ (define_insn "*andsi_1_zext"
> >     (set_attr "mode" "SI")])
> >
> >  (define_insn "*and<mode>_1"
> > -  [(set (match_operand:SWI24 0 "nonimmediate_operand" "=rm,r,Ya,k")
> > +  [(set (match_operand:SWI24 0 "nonimmediate_operand" "=rm,r,Ya,?k")
> >         (and:SWI24 (match_operand:SWI24 1 "nonimmediate_operand" "%0,0,qm,k")
> >                    (match_operand:SWI24 2 "<general_operand>" "r<i>,m,L,k")))
> >     (clobber (reg:CC FLAGS_REG))]
> > @@ -9255,7 +9255,7 @@ (define_insn "*and<mode>_1"
> >     (set_attr "mode" "<MODE>,<MODE>,SI,<MODE>")])
> >
> >  (define_insn "*andqi_1"
> > -  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,k")
> > +  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,?k")
> >         (and:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0,k")
> >                 (match_operand:QI 2 "general_operand" "qn,m,rn,k")))
> >     (clobber (reg:CC FLAGS_REG))]
> > @@ -9651,7 +9651,7 @@ (define_split
> >  })
> >
> >  (define_insn "*andn<mode>_1"
> > -  [(set (match_operand:SWI48 0 "register_operand" "=r,r,k")
> > +  [(set (match_operand:SWI48 0 "register_operand" "=r,r,?k")
> >         (and:SWI48
> >           (not:SWI48 (match_operand:SWI48 1 "register_operand" "r,r,k"))
> >           (match_operand:SWI48 2 "nonimmediate_operand" "r,m,k")))
> > @@ -9667,7 +9667,7 @@ (define_insn "*andn<mode>_1"
> >     (set_attr "mode" "<MODE>")])
> >
> >  (define_insn "*andn<mode>_1"
> > -  [(set (match_operand:SWI12 0 "register_operand" "=r,k")
> > +  [(set (match_operand:SWI12 0 "register_operand" "=r,?k")
> >         (and:SWI12
> >           (not:SWI12 (match_operand:SWI12 1 "register_operand" "r,k"))
> >           (match_operand:SWI12 2 "register_operand" "r,k")))
> > @@ -9757,7 +9757,7 @@ (define_insn_and_split "*<code>di3_doubleword"
> >  })
> >
> >  (define_insn "*<code><mode>_1"
> > -  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,r,k")
> > +  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,r,?k")
> >         (any_or:SWI248
> >          (match_operand:SWI248 1 "nonimmediate_operand" "%0,0,k")
> >          (match_operand:SWI248 2 "<general_operand>" "r<i>,m,k")))
> > @@ -9847,7 +9847,7 @@ (define_insn "*<code>si_1_zext_imm"
> >     (set_attr "mode" "SI")])
> >
> >  (define_insn "*<code>qi_1"
> > -  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,k")
> > +  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,?k")
> >         (any_or:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0,k")
> >                    (match_operand:QI 2 "general_operand" "qn,m,rn,k")))
> >     (clobber (reg:CC FLAGS_REG))]
> > @@ -10603,7 +10603,7 @@ (define_insn_and_split "*one_cmpldi2_doubleword"
> >    "split_double_mode (DImode, &operands[0], 2, &operands[0], &operands[2]);")
> >
> >  (define_insn "*one_cmpl<mode>2_1"
> > -  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,k")
> > +  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,?k")
> >         (not:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "0,k")))]
> >    "ix86_unary_operator_ok (NOT, <MODE>mode, operands)"
> >    "@
> > @@ -10620,7 +10620,7 @@ (define_insn "*one_cmpl<mode>2_1"
> >     (set_attr "mode" "<MODE>")])
> >
> >  (define_insn "*one_cmplsi2_1_zext"
> > -  [(set (match_operand:DI 0 "register_operand" "=r,k")
> > +  [(set (match_operand:DI 0 "register_operand" "=r,?k")
> >         (zero_extend:DI
> >           (not:SI (match_operand:SI 1 "register_operand" "0,k"))))]
> >    "TARGET_64BIT && ix86_unary_operator_ok (NOT, SImode, operands)"
> > @@ -10632,7 +10632,7 @@ (define_insn "*one_cmplsi2_1_zext"
> >     (set_attr "mode" "SI,SI")])
> >
> >  (define_insn "*one_cmplqi2_1"
> > -  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r,k")
> > +  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r,?k")
> >         (not:QI (match_operand:QI 1 "nonimmediate_operand" "0,0,k")))]
> >    "ix86_unary_operator_ok (NOT, QImode, operands)"
> >    "@
> > diff --git a/gcc/testsuite/gcc.target/i386/spill_to_mask-1.c b/gcc/testsuite/gcc.target/i386/spill_to_mask-1.c
> > index c5043e224ea..94d6764fc56 100644
> > --- a/gcc/testsuite/gcc.target/i386/spill_to_mask-1.c
> > +++ b/gcc/testsuite/gcc.target/i386/spill_to_mask-1.c
> > @@ -1,15 +1,31 @@
> >  /* { dg-do compile } */
> > -/* { dg-options "-O2 -march=skylake-avx512" } */
> > -
> > -#ifndef DTYPE
> > -#define DTYPE u32
> > -#endif
> > +/* { dg-options "-O2 -march=skylake-avx512 -DDTYPE32" } */
> >
> >  typedef unsigned long long u64;
> >  typedef unsigned int u32;
> >  typedef unsigned short u16;
> >  typedef unsigned char u8;
> >
> > +#ifdef DTYPE32
> > +typedef u32 DTYPE;
> > +#define byteswap byteswapu32
> > +#endif
> > +
> > +#ifdef DTYPE16
> > +typedef u16 DTYPE;
> > +#define byteswap byteswapu16
> > +#endif
> > +
> > +#ifdef DTYPE8
> > +typedef u16 DTYPE;
> > +#define byteswap byteswapu8
> > +#endif
> > +
> > +#ifdef DTYPE64
> > +typedef u16 DTYPE;
> > +#define byteswap byteswapu64
> > +#endif
> > +
> >  #define R(x,n) ( (x >> n) | (x << (32 - n)))
> >
> >  #define S0(x) (R(x, 2) ^ R(x,13) ^ R(x,22))
> > @@ -23,36 +39,51 @@ typedef unsigned char u8;
> >      d += tmp1;                                           \
> >  }
> >
> > -static inline DTYPE byteswap(DTYPE x)
> > +static inline u32 byteswapu32(u32 x)
> >  {
> > -       x = (x & 0x0000FFFF) << 16 | (x & 0xFFFF0000) >> 16;
> > -       x = (x & 0x00FF00FF) << 8 | (x & 0xFF00FF00) >> 8;
> > -       return x;
> > +  x = (x & 0x0000FFFF) << 16 | (x & 0xFFFF0000) >> 16;
> > +  x = (x & 0x00FF00FF) << 8 | (x & 0xFF00FF00) >> 8;
> > +  return x;
> >  }
> >
> > -#define BE_LOAD32(n,b,i) (n) = byteswap(*(DTYPE *)(b + i))
> > +static inline u16 byteswapu16(u16 x)
> > +{
> > +  x = (x & 0x00FF) << 8 | (x & 0xFF00) >> 8;
> > +  return x;
> > +}
> > +
> > +static inline u8 byteswapu8(u8 x)
> > +{
> > +  return x;
> > +}
> > +
> > +static inline u64 byteswapu64(u64 x)
> > +{
> > +  x = ((u64)(byteswapu32 (x & 0x00000000FFFFFFFF))) << 32 | byteswapu32((x & 0xFFFFFFFF00000000) >> 32);
> > +  return x;
> > +}
> >
> > -void foo (u8 *in, DTYPE out[8], const DTYPE C[16])
> > +void foo (DTYPE in[16], DTYPE out[8], const DTYPE C[16])
> >  {
> >      DTYPE tmp1 = 0, tmp2 = 0, a, b, c, d, e, f, g, h;
> >      DTYPE w0, w1, w2, w3, w4, w5, w6, w7,
> >         w8, w9, w10, w11, w12, w13, w14, w15;
> > -    w0  = byteswap(*(DTYPE *)(in + 0));
> > -    w1  = byteswap(*(DTYPE *)(in + 4));
> > -    w2  = byteswap(*(DTYPE *)(in + 8));
> > -    w3  = byteswap(*(DTYPE *)(in + 12));
> > -    w4  = byteswap(*(DTYPE *)(in + 16));
> > -    w5  = byteswap(*(DTYPE *)(in + 20));
> > -    w6  = byteswap(*(DTYPE *)(in + 24));
> > -    w7  = byteswap(*(DTYPE *)(in + 28));
> > -    w8  = byteswap(*(DTYPE *)(in + 32));
> > -    w9  = byteswap(*(DTYPE *)(in + 36));
> > -    w10 = byteswap(*(DTYPE *)(in + 40));
> > -    w11 = byteswap(*(DTYPE *)(in + 44));
> > -    w12 = byteswap(*(DTYPE *)(in + 48));
> > -    w13 = byteswap(*(DTYPE *)(in + 52));
> > -    w14 = byteswap(*(DTYPE *)(in + 56));
> > -    w15 = byteswap(*(DTYPE *)(in + 60));
> > +    w0  = byteswap(in[0]);
> > +    w1  = byteswap(in[1]);
> > +    w2  = byteswap(in[2]);
> > +    w3  = byteswap(in[3]);
> > +    w4  = byteswap(in[4]);
> > +    w5  = byteswap(in[5]);
> > +    w6  = byteswap(in[6]);
> > +    w7  = byteswap(in[7]);
> > +    w8  = byteswap(in[8]);
> > +    w9  = byteswap(in[9]);
> > +    w10 = byteswap(in[10]);
> > +    w11 = byteswap(in[11]);
> > +    w12 = byteswap(in[12]);
> > +    w13 = byteswap(in[13]);
> > +    w14 = byteswap(in[14]);
> > +    w15 = byteswap(in[15]);
> >      a = out[0];
> >      b = out[1];
> >      c = out[2];
> > @@ -90,3 +121,7 @@ void foo (u8 *in, DTYPE out[8], const DTYPE C[16])
> >  }
> >
> >  /* { dg-final { scan-assembler "kmovd" } } */
> > +/* { dg-final { scan-assembler-not "knot" } } */
> > +/* { dg-final { scan-assembler-not "kxor" } } */
> > +/* { dg-final { scan-assembler-not "kor" } } */
> > +/* { dg-final { scan-assembler-not "kandn" } } */
> > diff --git a/gcc/testsuite/gcc.target/i386/spill_to_mask-2.c b/gcc/testsuite/gcc.target/i386/spill_to_mask-2.c
> > index 1f0c6b474d6..c7d09769ed5 100644
> > --- a/gcc/testsuite/gcc.target/i386/spill_to_mask-2.c
> > +++ b/gcc/testsuite/gcc.target/i386/spill_to_mask-2.c
> > @@ -1,10 +1,9 @@
> >  /* { dg-do compile } */
> > -/* { dg-options "-O2 -march=skylake-avx512" } */
> > -
> > -#ifndef DTYPE
> > -#define DTYPE u16
> > -#endif
> > +/* { dg-options "-O2 -march=skylake-avx512 -DDTYPE16" } */
> >
> >  #include "spill_to_mask-1.c"
> >
> > -/* { dg-final { scan-assembler "kmovw" } } */
> > +/* { dg-final { scan-assembler-not "knot" } } */
> > +/* { dg-final { scan-assembler-not "kxor" } } */
> > +/* { dg-final { scan-assembler-not "kor" } } */
> > +/* { dg-final { scan-assembler-not "kandn" } } */
> > diff --git a/gcc/testsuite/gcc.target/i386/spill_to_mask-3.c b/gcc/testsuite/gcc.target/i386/spill_to_mask-3.c
> > index 5b59090c296..b7a383fe12f 100644
> > --- a/gcc/testsuite/gcc.target/i386/spill_to_mask-3.c
> > +++ b/gcc/testsuite/gcc.target/i386/spill_to_mask-3.c
> > @@ -1,10 +1,9 @@
> >  /* { dg-do compile } */
> > -/* { dg-options "-O2 -march=skylake-avx512" } */
> > -
> > -#ifndef DTYPE
> > -#define DTYPE u8
> > -#endif
> > +/* { dg-options "-O2 -march=skylake-avx512 -DDTYPE8" } */
> >
> >  #include "spill_to_mask-1.c"
> >
> > -/* { dg-final { scan-assembler "kmovb" } } */
> > +/* { dg-final { scan-assembler-not "knot" } } */
> > +/* { dg-final { scan-assembler-not "kxor" } } */
> > +/* { dg-final { scan-assembler-not "kor" } } */
> > +/* { dg-final { scan-assembler-not "kandn" } } */
> > diff --git a/gcc/testsuite/gcc.target/i386/spill_to_mask-4.c b/gcc/testsuite/gcc.target/i386/spill_to_mask-4.c
> > index f111cf42b36..d67038d5b88 100644
> > --- a/gcc/testsuite/gcc.target/i386/spill_to_mask-4.c
> > +++ b/gcc/testsuite/gcc.target/i386/spill_to_mask-4.c
> > @@ -1,10 +1,9 @@
> >  /* { dg-do compile  { target { ! ia32 } } } */
> > -/* { dg-options "-O2 -march=skylake-avx512" } */
> > -
> > -#ifndef DTYPE
> > -#define DTYPE u64
> > -#endif
> > +/* { dg-options "-O2 -march=skylake-avx512 -DDTYPE64" } */
> >
> >  #include "spill_to_mask-1.c"
> >
> > -/* { dg-final { scan-assembler "kmovq" } } */
> > +/* { dg-final { scan-assembler-not "knot" } } */
> > +/* { dg-final { scan-assembler-not "kxor" } } */
> > +/* { dg-final { scan-assembler-not "kor" } } */
> > +/* { dg-final { scan-assembler-not "kandn" } } */
> > --
> > 2.18.1
> >
Uros Bizjak June 23, 2021, 7:58 a.m. UTC | #3
On Mon, Jun 21, 2021 at 10:08 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Mon, Jun 21, 2021 at 3:28 PM Uros Bizjak via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> >
> > On Mon, Jun 21, 2021 at 6:56 AM liuhongt <hongtao.liu@intel.com> wrote:
> > >
> > > The avx512 supports bitwise operations with mask registers, but the
> > > throughput of those instructions is much lower than that of the
> > > corresponding gpr version, so we would additionally disparages
> > > slightly the mask register alternative for bitwise operations in the
> > > LRA.
> >
> > This was the reason for UNSPEC tagged instructions with mask
> > registers, used mainly for builtins.
> >
> > Also, care should be taken if we want mask registers to be used under
> > GPR pressure, or it is better to spill GPR registers. In the past,
> > DImode values caused a lot of problems with MMX registers on x86-64,
> > but we were able to hand-tune the allocator in the way you propose.
> >
> > Let's try the proposed approach to see what happens.
> >
> > > Also when allocano cost of GENERAL_REGS is same as MASK_REGS, allocate
> > > MASK_REGS first since it has already been disparaged.
> > >
> > > gcc/ChangeLog:
> > >
> > >         PR target/101142
> > >         * config/i386/i386.md: (*anddi_1): Disparage slightly the mask
> > >         register alternative.
> > >         (*and<mode>_1): Ditto.
> > >         (*andqi_1): Ditto.
> > >         (*andn<mode>_1): Ditto.
> > >         (*<code><mode>_1): Ditto.
> > >         (*<code>qi_1): Ditto.
> > >         (*one_cmpl<mode>2_1): Ditto.
> > >         (*one_cmplsi2_1_zext): Ditto.
> > >         (*one_cmplqi2_1): Ditto.
> > >         * config/i386/i386.c (x86_order_regs_for_local_alloc): Change
> > >         the order of mask registers to be before general registers.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > >         PR target/101142
> > >         * gcc.target/i386/spill_to_mask-1.c: Adjust testcase.
> > >         * gcc.target/i386/spill_to_mask-2.c: Adjust testcase.
> > >         * gcc.target/i386/spill_to_mask-3.c: Adjust testcase.
> > >         * gcc.target/i386/spill_to_mask-4.c: Adjust testcase.
> >
> > OK with a comment addition, see inline.
> >
> > Thanks,
> > Uros.
> >
> > > ---
> > >  gcc/config/i386/i386.c                        |  8 +-
> > >  gcc/config/i386/i386.md                       | 20 ++---
> > >  .../gcc.target/i386/spill_to_mask-1.c         | 89 +++++++++++++------
> > >  .../gcc.target/i386/spill_to_mask-2.c         | 11 ++-
> > >  .../gcc.target/i386/spill_to_mask-3.c         | 11 ++-
> > >  .../gcc.target/i386/spill_to_mask-4.c         | 11 ++-
> > >  6 files changed, 91 insertions(+), 59 deletions(-)
> > >
> > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> > > index a61255857ff..a651853ca3b 100644
> > > --- a/gcc/config/i386/i386.c
> > > +++ b/gcc/config/i386/i386.c
> > > @@ -20463,6 +20463,10 @@ x86_order_regs_for_local_alloc (void)
> > >     int pos = 0;
> > >     int i;
> > >
> > > +   /* Mask register.  */
> > > +   for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
> > > +     reg_alloc_order [pos++] = i;
> >
> > Please add a comment why mask registers should come first.
> Thanks for the review, this is the patch i'm check in.

This patch again caused unwanted mask instructions with -m32 in cpuid
check code, e.g.:

Running target unix/-m32
FAIL: gcc.target/i386/avx512bw-pr70329-1.c execution test
FAIL: gcc.target/i386/pr96814.c execution test

Debugging pr96814 failure:

  0x0804921d <+66>:    mov    %edx,%ecx
  0x0804921f <+68>:    cpuid
=> 0x08049221 <+70>:    kmovd  %edx,%k0
  0x08049225 <+74>:    mov    %eax,-0x8(%ebp)
  0x08049228 <+77>:    mov    %ebx,-0xc(%ebp)

It looks to me that putting mask registers in front of GPR is looking
for trouble, since apparently disparaging mask registers does not
offset register preference of the new order.

Uros.
Hongtao Liu June 23, 2021, 8:50 a.m. UTC | #4
On Wed, Jun 23, 2021 at 3:59 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Mon, Jun 21, 2021 at 10:08 AM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > On Mon, Jun 21, 2021 at 3:28 PM Uros Bizjak via Gcc-patches
> > <gcc-patches@gcc.gnu.org> wrote:
> > >
> > > On Mon, Jun 21, 2021 at 6:56 AM liuhongt <hongtao.liu@intel.com> wrote:
> > > >
> > > > The avx512 supports bitwise operations with mask registers, but the
> > > > throughput of those instructions is much lower than that of the
> > > > corresponding gpr version, so we would additionally disparages
> > > > slightly the mask register alternative for bitwise operations in the
> > > > LRA.
> > >
> > > This was the reason for UNSPEC tagged instructions with mask
> > > registers, used mainly for builtins.
> > >
> > > Also, care should be taken if we want mask registers to be used under
> > > GPR pressure, or it is better to spill GPR registers. In the past,
> > > DImode values caused a lot of problems with MMX registers on x86-64,
> > > but we were able to hand-tune the allocator in the way you propose.
> > >
> > > Let's try the proposed approach to see what happens.
> > >
> > > > Also when allocano cost of GENERAL_REGS is same as MASK_REGS, allocate
> > > > MASK_REGS first since it has already been disparaged.
> > > >
> > > > gcc/ChangeLog:
> > > >
> > > >         PR target/101142
> > > >         * config/i386/i386.md: (*anddi_1): Disparage slightly the mask
> > > >         register alternative.
> > > >         (*and<mode>_1): Ditto.
> > > >         (*andqi_1): Ditto.
> > > >         (*andn<mode>_1): Ditto.
> > > >         (*<code><mode>_1): Ditto.
> > > >         (*<code>qi_1): Ditto.
> > > >         (*one_cmpl<mode>2_1): Ditto.
> > > >         (*one_cmplsi2_1_zext): Ditto.
> > > >         (*one_cmplqi2_1): Ditto.
> > > >         * config/i386/i386.c (x86_order_regs_for_local_alloc): Change
> > > >         the order of mask registers to be before general registers.
> > > >
> > > > gcc/testsuite/ChangeLog:
> > > >
> > > >         PR target/101142
> > > >         * gcc.target/i386/spill_to_mask-1.c: Adjust testcase.
> > > >         * gcc.target/i386/spill_to_mask-2.c: Adjust testcase.
> > > >         * gcc.target/i386/spill_to_mask-3.c: Adjust testcase.
> > > >         * gcc.target/i386/spill_to_mask-4.c: Adjust testcase.
> > >
> > > OK with a comment addition, see inline.
> > >
> > > Thanks,
> > > Uros.
> > >
> > > > ---
> > > >  gcc/config/i386/i386.c                        |  8 +-
> > > >  gcc/config/i386/i386.md                       | 20 ++---
> > > >  .../gcc.target/i386/spill_to_mask-1.c         | 89 +++++++++++++------
> > > >  .../gcc.target/i386/spill_to_mask-2.c         | 11 ++-
> > > >  .../gcc.target/i386/spill_to_mask-3.c         | 11 ++-
> > > >  .../gcc.target/i386/spill_to_mask-4.c         | 11 ++-
> > > >  6 files changed, 91 insertions(+), 59 deletions(-)
> > > >
> > > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> > > > index a61255857ff..a651853ca3b 100644
> > > > --- a/gcc/config/i386/i386.c
> > > > +++ b/gcc/config/i386/i386.c
> > > > @@ -20463,6 +20463,10 @@ x86_order_regs_for_local_alloc (void)
> > > >     int pos = 0;
> > > >     int i;
> > > >
> > > > +   /* Mask register.  */
> > > > +   for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
> > > > +     reg_alloc_order [pos++] = i;
> > >
> > > Please add a comment why mask registers should come first.
> > Thanks for the review, this is the patch i'm check in.
>
> This patch again caused unwanted mask instructions with -m32 in cpuid
> check code, e.g.:
>
> Running target unix/-m32
> FAIL: gcc.target/i386/avx512bw-pr70329-1.c execution test
> FAIL: gcc.target/i386/pr96814.c execution test
>
> Debugging pr96814 failure:
>
>   0x0804921d <+66>:    mov    %edx,%ecx
>   0x0804921f <+68>:    cpuid
> => 0x08049221 <+70>:    kmovd  %edx,%k0
>   0x08049225 <+74>:    mov    %eax,-0x8(%ebp)
>   0x08049228 <+77>:    mov    %ebx,-0xc(%ebp)
>
> It looks to me that putting mask registers in front of GPR is looking
So it's not a functionality but a performance issue here; under 32-bit
mode there are only 8 gprs, which results in higher register pressure,
and for this we do have mask->integer and integer->mask cost, with
-mtune=bdver1 where cost of kmovd is quite high(16, 20 /*
mask->integer and integer->mask moves */), there's no mask
instructions in cpuid.
I guess we can adjust mask->integer and integer->mask for 32-bit mode
to avoid such a situation?
> for trouble, since apparently disparaging mask registers does not
> offset register preference of the new order.
Hmm, w/o order preference, it will fail bitwise_mask_op-3.c (costs of
gpr and mask register are equal here), and according to gcc documents,
"?k" is a minimal increase in cost of the mask register (one unit more
costly).
The cost model is always difficult in balancing these, as you said
fine tune cost model is always looking for trouble.
>
> Uros.
Hongtao Liu June 23, 2021, 9:37 a.m. UTC | #5
On Wed, Jun 23, 2021 at 4:50 PM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Wed, Jun 23, 2021 at 3:59 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > On Mon, Jun 21, 2021 at 10:08 AM Hongtao Liu <crazylht@gmail.com> wrote:
> > >
> > > On Mon, Jun 21, 2021 at 3:28 PM Uros Bizjak via Gcc-patches
> > > <gcc-patches@gcc.gnu.org> wrote:
> > > >
> > > > On Mon, Jun 21, 2021 at 6:56 AM liuhongt <hongtao.liu@intel.com> wrote:
> > > > >
> > > > > The avx512 supports bitwise operations with mask registers, but the
> > > > > throughput of those instructions is much lower than that of the
> > > > > corresponding gpr version, so we would additionally disparages
> > > > > slightly the mask register alternative for bitwise operations in the
> > > > > LRA.
> > > >
> > > > This was the reason for UNSPEC tagged instructions with mask
> > > > registers, used mainly for builtins.
> > > >
> > > > Also, care should be taken if we want mask registers to be used under
> > > > GPR pressure, or it is better to spill GPR registers. In the past,
> > > > DImode values caused a lot of problems with MMX registers on x86-64,
> > > > but we were able to hand-tune the allocator in the way you propose.
> > > >
> > > > Let's try the proposed approach to see what happens.
> > > >
> > > > > Also when allocano cost of GENERAL_REGS is same as MASK_REGS, allocate
> > > > > MASK_REGS first since it has already been disparaged.
> > > > >
> > > > > gcc/ChangeLog:
> > > > >
> > > > >         PR target/101142
> > > > >         * config/i386/i386.md: (*anddi_1): Disparage slightly the mask
> > > > >         register alternative.
> > > > >         (*and<mode>_1): Ditto.
> > > > >         (*andqi_1): Ditto.
> > > > >         (*andn<mode>_1): Ditto.
> > > > >         (*<code><mode>_1): Ditto.
> > > > >         (*<code>qi_1): Ditto.
> > > > >         (*one_cmpl<mode>2_1): Ditto.
> > > > >         (*one_cmplsi2_1_zext): Ditto.
> > > > >         (*one_cmplqi2_1): Ditto.
> > > > >         * config/i386/i386.c (x86_order_regs_for_local_alloc): Change
> > > > >         the order of mask registers to be before general registers.
> > > > >
> > > > > gcc/testsuite/ChangeLog:
> > > > >
> > > > >         PR target/101142
> > > > >         * gcc.target/i386/spill_to_mask-1.c: Adjust testcase.
> > > > >         * gcc.target/i386/spill_to_mask-2.c: Adjust testcase.
> > > > >         * gcc.target/i386/spill_to_mask-3.c: Adjust testcase.
> > > > >         * gcc.target/i386/spill_to_mask-4.c: Adjust testcase.
> > > >
> > > > OK with a comment addition, see inline.
> > > >
> > > > Thanks,
> > > > Uros.
> > > >
> > > > > ---
> > > > >  gcc/config/i386/i386.c                        |  8 +-
> > > > >  gcc/config/i386/i386.md                       | 20 ++---
> > > > >  .../gcc.target/i386/spill_to_mask-1.c         | 89 +++++++++++++------
> > > > >  .../gcc.target/i386/spill_to_mask-2.c         | 11 ++-
> > > > >  .../gcc.target/i386/spill_to_mask-3.c         | 11 ++-
> > > > >  .../gcc.target/i386/spill_to_mask-4.c         | 11 ++-
> > > > >  6 files changed, 91 insertions(+), 59 deletions(-)
> > > > >
> > > > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> > > > > index a61255857ff..a651853ca3b 100644
> > > > > --- a/gcc/config/i386/i386.c
> > > > > +++ b/gcc/config/i386/i386.c
> > > > > @@ -20463,6 +20463,10 @@ x86_order_regs_for_local_alloc (void)
> > > > >     int pos = 0;
> > > > >     int i;
> > > > >
> > > > > +   /* Mask register.  */
> > > > > +   for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
> > > > > +     reg_alloc_order [pos++] = i;
> > > >
> > > > Please add a comment why mask registers should come first.
> > > Thanks for the review, this is the patch i'm check in.
> >
> > This patch again caused unwanted mask instructions with -m32 in cpuid
> > check code, e.g.:
> >
> > Running target unix/-m32
> > FAIL: gcc.target/i386/avx512bw-pr70329-1.c execution test
> > FAIL: gcc.target/i386/pr96814.c execution test
> >
> > Debugging pr96814 failure:
> >
> >   0x0804921d <+66>:    mov    %edx,%ecx
> >   0x0804921f <+68>:    cpuid
> > => 0x08049221 <+70>:    kmovd  %edx,%k0
> >   0x08049225 <+74>:    mov    %eax,-0x8(%ebp)
> >   0x08049228 <+77>:    mov    %ebx,-0xc(%ebp)
> >
> > It looks to me that putting mask registers in front of GPR is looking
> So it's not functionality but performance issue here, under 32-bit
> mode there are only 8 gprs which result in higher register pressure,
> and for this we do have mask->integer and integer->mask cost, with
> -mtune=bdver1 where cost of kmovd is quite high(16, 20 /*
> mask->integer and integer->mask moves */), there's no mask
> instructions in cpuid.
> I guess we can adjust mask->integer and integer->mask for 32-bit mode
> to avoid such a situation?
I notice the default option is O0; with -O there are no mask instructions.
IMHO, we don't need to change the cost unless there are -O2 cases where mask
instructions regress performance here.
> > for trouble, since apparently disparaging mask registers does not
> > offset register preference of the new order.
> Hmm, w/o order preference,it will fail bitwise_mask_op-3.c.(costs of
> gpr and mask register are equal here) and according to gcc documents,
> “?k" is minimal increase in cost of the mask register(one unit more
> costly).
> The cost model is always difficult in balancing these, as you said
> fine tune cost model is always looking for trouble.
> >
> > Uros.
>
>
>
> --
> BR,
> Hongtao
Uros Bizjak June 23, 2021, 9:41 a.m. UTC | #6
On Wed, Jun 23, 2021 at 11:32 AM Hongtao Liu <crazylht@gmail.com> wrote:

> > > > > > Also when allocano cost of GENERAL_REGS is same as MASK_REGS, allocate
> > > > > > MASK_REGS first since it has already been disparaged.
> > > > > >
> > > > > > gcc/ChangeLog:
> > > > > >
> > > > > >         PR target/101142
> > > > > >         * config/i386/i386.md: (*anddi_1): Disparage slightly the mask
> > > > > >         register alternative.
> > > > > >         (*and<mode>_1): Ditto.
> > > > > >         (*andqi_1): Ditto.
> > > > > >         (*andn<mode>_1): Ditto.
> > > > > >         (*<code><mode>_1): Ditto.
> > > > > >         (*<code>qi_1): Ditto.
> > > > > >         (*one_cmpl<mode>2_1): Ditto.
> > > > > >         (*one_cmplsi2_1_zext): Ditto.
> > > > > >         (*one_cmplqi2_1): Ditto.
> > > > > >         * config/i386/i386.c (x86_order_regs_for_local_alloc): Change
> > > > > >         the order of mask registers to be before general registers.
> > > > > >
> > > > > > gcc/testsuite/ChangeLog:
> > > > > >
> > > > > >         PR target/101142
> > > > > >         * gcc.target/i386/spill_to_mask-1.c: Adjust testcase.
> > > > > >         * gcc.target/i386/spill_to_mask-2.c: Adjust testcase.
> > > > > >         * gcc.target/i386/spill_to_mask-3.c: Adjust testcase.
> > > > > >         * gcc.target/i386/spill_to_mask-4.c: Adjust testcase.
> > > > >
> > > > > OK with a comment addition, see inline.
> > > > >
> > > > > Thanks,
> > > > > Uros.
> > > > >
> > > > > > ---
> > > > > >  gcc/config/i386/i386.c                        |  8 +-
> > > > > >  gcc/config/i386/i386.md                       | 20 ++---
> > > > > >  .../gcc.target/i386/spill_to_mask-1.c         | 89 +++++++++++++------
> > > > > >  .../gcc.target/i386/spill_to_mask-2.c         | 11 ++-
> > > > > >  .../gcc.target/i386/spill_to_mask-3.c         | 11 ++-
> > > > > >  .../gcc.target/i386/spill_to_mask-4.c         | 11 ++-
> > > > > >  6 files changed, 91 insertions(+), 59 deletions(-)
> > > > > >
> > > > > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> > > > > > index a61255857ff..a651853ca3b 100644
> > > > > > --- a/gcc/config/i386/i386.c
> > > > > > +++ b/gcc/config/i386/i386.c
> > > > > > @@ -20463,6 +20463,10 @@ x86_order_regs_for_local_alloc (void)
> > > > > >     int pos = 0;
> > > > > >     int i;
> > > > > >
> > > > > > +   /* Mask register.  */
> > > > > > +   for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
> > > > > > +     reg_alloc_order [pos++] = i;
> > > > >
> > > > > Please add a comment why mask registers should come first.
> > > > Thanks for the review, this is the patch i'm check in.
> > >
> > > This patch again caused unwanted mask instructions with -m32 in cpuid
> > > check code, e.g.:
> > >
> > > Running target unix/-m32
> > > FAIL: gcc.target/i386/avx512bw-pr70329-1.c execution test
> > > FAIL: gcc.target/i386/pr96814.c execution test
> > >
> > > Debugging pr96814 failure:
> > >
> > >   0x0804921d <+66>:    mov    %edx,%ecx
> > >   0x0804921f <+68>:    cpuid
> > > => 0x08049221 <+70>:    kmovd  %edx,%k0
> > >   0x08049225 <+74>:    mov    %eax,-0x8(%ebp)
> > >   0x08049228 <+77>:    mov    %ebx,-0xc(%ebp)
> > >
> > > It looks to me that putting mask registers in front of GPR is looking
> > So it's not functionality but performance issue here, under 32-bit
> > mode there are only 8 gprs which result in higher register pressure,
> > and for this we do have mask->integer and integer->mask cost, with
> > -mtune=bdver1 where cost of kmovd is quite high(16, 20 /*
> > mask->integer and integer->mask moves */), there's no mask
> > instructions in cpuid.
> > I guess we can adjust mask->integer and integer->mask for 32-bit mode
> > to avoid such a situation?
> I notice the default option is O0, with -O there's no mask instructions.
> IMHO, We don't need to change cost unless there's -O2 cases where mask
> instructions regress performance here.

No, this reasoning is not acceptable. The compiled code will SIGILL on
targets where unsupported mask registers are involved, so GPR should
always have priority compared to mask registers. Based on these
findings, x86_order_regs_for_local_alloc change should be reverted,
and register move costs compensated elsewhere.

Uros.
Uros Bizjak June 23, 2021, 9:54 a.m. UTC | #7
On Wed, Jun 23, 2021 at 11:41 AM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Wed, Jun 23, 2021 at 11:32 AM Hongtao Liu <crazylht@gmail.com> wrote:
>
> > > > > > > Also when allocano cost of GENERAL_REGS is same as MASK_REGS, allocate
> > > > > > > MASK_REGS first since it has already been disparaged.
> > > > > > >
> > > > > > > gcc/ChangeLog:
> > > > > > >
> > > > > > >         PR target/101142
> > > > > > >         * config/i386/i386.md: (*anddi_1): Disparage slightly the mask
> > > > > > >         register alternative.
> > > > > > >         (*and<mode>_1): Ditto.
> > > > > > >         (*andqi_1): Ditto.
> > > > > > >         (*andn<mode>_1): Ditto.
> > > > > > >         (*<code><mode>_1): Ditto.
> > > > > > >         (*<code>qi_1): Ditto.
> > > > > > >         (*one_cmpl<mode>2_1): Ditto.
> > > > > > >         (*one_cmplsi2_1_zext): Ditto.
> > > > > > >         (*one_cmplqi2_1): Ditto.
> > > > > > >         * config/i386/i386.c (x86_order_regs_for_local_alloc): Change
> > > > > > >         the order of mask registers to be before general registers.
> > > > > > >
> > > > > > > gcc/testsuite/ChangeLog:
> > > > > > >
> > > > > > >         PR target/101142
> > > > > > >         * gcc.target/i386/spill_to_mask-1.c: Adjust testcase.
> > > > > > >         * gcc.target/i386/spill_to_mask-2.c: Adjust testcase.
> > > > > > >         * gcc.target/i386/spill_to_mask-3.c: Adjust testcase.
> > > > > > >         * gcc.target/i386/spill_to_mask-4.c: Adjust testcase.
> > > > > >
> > > > > > OK with a comment addition, see inline.
> > > > > >
> > > > > > Thanks,
> > > > > > Uros.
> > > > > >
> > > > > > > ---
> > > > > > >  gcc/config/i386/i386.c                        |  8 +-
> > > > > > >  gcc/config/i386/i386.md                       | 20 ++---
> > > > > > >  .../gcc.target/i386/spill_to_mask-1.c         | 89 +++++++++++++------
> > > > > > >  .../gcc.target/i386/spill_to_mask-2.c         | 11 ++-
> > > > > > >  .../gcc.target/i386/spill_to_mask-3.c         | 11 ++-
> > > > > > >  .../gcc.target/i386/spill_to_mask-4.c         | 11 ++-
> > > > > > >  6 files changed, 91 insertions(+), 59 deletions(-)
> > > > > > >
> > > > > > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> > > > > > > index a61255857ff..a651853ca3b 100644
> > > > > > > --- a/gcc/config/i386/i386.c
> > > > > > > +++ b/gcc/config/i386/i386.c
> > > > > > > @@ -20463,6 +20463,10 @@ x86_order_regs_for_local_alloc (void)
> > > > > > >     int pos = 0;
> > > > > > >     int i;
> > > > > > >
> > > > > > > +   /* Mask register.  */
> > > > > > > +   for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
> > > > > > > +     reg_alloc_order [pos++] = i;
> > > > > >
> > > > > > Please add a comment why mask registers should come first.
> > > > > Thanks for the review, this is the patch i'm check in.
> > > >
> > > > This patch again caused unwanted mask instructions with -m32 in cpuid
> > > > check code, e.g.:
> > > >
> > > > Running target unix/-m32
> > > > FAIL: gcc.target/i386/avx512bw-pr70329-1.c execution test
> > > > FAIL: gcc.target/i386/pr96814.c execution test
> > > >
> > > > Debugging pr96814 failure:
> > > >
> > > >   0x0804921d <+66>:    mov    %edx,%ecx
> > > >   0x0804921f <+68>:    cpuid
> > > > => 0x08049221 <+70>:    kmovd  %edx,%k0
> > > >   0x08049225 <+74>:    mov    %eax,-0x8(%ebp)
> > > >   0x08049228 <+77>:    mov    %ebx,-0xc(%ebp)
> > > >
> > > > It looks to me that putting mask registers in front of GPR is looking
> > > So it's not functionality but performance issue here, under 32-bit
> > > mode there are only 8 gprs which result in higher register pressure,
> > > and for this we do have mask->integer and integer->mask cost, with
> > > -mtune=bdver1 where cost of kmovd is quite high(16, 20 /*
> > > mask->integer and integer->mask moves */), there's no mask
> > > instructions in cpuid.
> > > I guess we can adjust mask->integer and integer->mask for 32-bit mode
> > > to avoid such a situation?
> > I notice the default option is O0, with -O there's no mask instructions.
> > IMHO, We don't need to change cost unless there's -O2 cases where mask
> > instructions regress performance here.
>
> No, this reasoning is not acceptable. The compiled code will SIGILL on
> targets where unsupported mask registers are involved, so GPR should
> always have priority compared to mask registers. Based on these
> findings, x86_order_regs_for_local_alloc change should be reverted,
> and register move costs compensated elsewhere.

Longterm, I think that introducing vector BImode and VxBI vectors
should solve this issue. This would make a clear cut between mask and
GPR operations. I think that generic vector infrastructure supports
BImode vectors, so using e.g. "a & b" instead of builtins should work
well, too.

This approach would also avoid costly moves between register sets.

Uros.
Hongtao Liu June 24, 2021, 1:50 a.m. UTC | #8
On Wed, Jun 23, 2021 at 5:55 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Wed, Jun 23, 2021 at 11:41 AM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > On Wed, Jun 23, 2021 at 11:32 AM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > > > > > > > Also when allocano cost of GENERAL_REGS is same as MASK_REGS, allocate
> > > > > > > > MASK_REGS first since it has already been disparaged.
> > > > > > > >
> > > > > > > > gcc/ChangeLog:
> > > > > > > >
> > > > > > > >         PR target/101142
> > > > > > > >         * config/i386/i386.md: (*anddi_1): Disparage slightly the mask
> > > > > > > >         register alternative.
> > > > > > > >         (*and<mode>_1): Ditto.
> > > > > > > >         (*andqi_1): Ditto.
> > > > > > > >         (*andn<mode>_1): Ditto.
> > > > > > > >         (*<code><mode>_1): Ditto.
> > > > > > > >         (*<code>qi_1): Ditto.
> > > > > > > >         (*one_cmpl<mode>2_1): Ditto.
> > > > > > > >         (*one_cmplsi2_1_zext): Ditto.
> > > > > > > >         (*one_cmplqi2_1): Ditto.
> > > > > > > >         * config/i386/i386.c (x86_order_regs_for_local_alloc): Change
> > > > > > > >         the order of mask registers to be before general registers.
> > > > > > > >
> > > > > > > > gcc/testsuite/ChangeLog:
> > > > > > > >
> > > > > > > >         PR target/101142
> > > > > > > >         * gcc.target/i386/spill_to_mask-1.c: Adjust testcase.
> > > > > > > >         * gcc.target/i386/spill_to_mask-2.c: Adjust testcase.
> > > > > > > >         * gcc.target/i386/spill_to_mask-3.c: Adjust testcase.
> > > > > > > >         * gcc.target/i386/spill_to_mask-4.c: Adjust testcase.
> > > > > > >
> > > > > > > OK with a comment addition, see inline.
> > > > > > >
> > > > > > > Thanks,
> > > > > > > Uros.
> > > > > > >
> > > > > > > > ---
> > > > > > > >  gcc/config/i386/i386.c                        |  8 +-
> > > > > > > >  gcc/config/i386/i386.md                       | 20 ++---
> > > > > > > >  .../gcc.target/i386/spill_to_mask-1.c         | 89 +++++++++++++------
> > > > > > > >  .../gcc.target/i386/spill_to_mask-2.c         | 11 ++-
> > > > > > > >  .../gcc.target/i386/spill_to_mask-3.c         | 11 ++-
> > > > > > > >  .../gcc.target/i386/spill_to_mask-4.c         | 11 ++-
> > > > > > > >  6 files changed, 91 insertions(+), 59 deletions(-)
> > > > > > > >
> > > > > > > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> > > > > > > > index a61255857ff..a651853ca3b 100644
> > > > > > > > --- a/gcc/config/i386/i386.c
> > > > > > > > +++ b/gcc/config/i386/i386.c
> > > > > > > > @@ -20463,6 +20463,10 @@ x86_order_regs_for_local_alloc (void)
> > > > > > > >     int pos = 0;
> > > > > > > >     int i;
> > > > > > > >
> > > > > > > > +   /* Mask register.  */
> > > > > > > > +   for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
> > > > > > > > +     reg_alloc_order [pos++] = i;
> > > > > > >
> > > > > > > Please add a comment why mask registers should come first.
> > > > > > Thanks for the review, this is the patch i'm check in.
> > > > >
> > > > > This patch again caused unwanted mask instructions with -m32 in cpuid
> > > > > check code, e.g.:
> > > > >
> > > > > Running target unix/-m32
> > > > > FAIL: gcc.target/i386/avx512bw-pr70329-1.c execution test
> > > > > FAIL: gcc.target/i386/pr96814.c execution test
> > > > >
> > > > > Debugging pr96814 failure:
> > > > >
> > > > >   0x0804921d <+66>:    mov    %edx,%ecx
> > > > >   0x0804921f <+68>:    cpuid
> > > > > => 0x08049221 <+70>:    kmovd  %edx,%k0
> > > > >   0x08049225 <+74>:    mov    %eax,-0x8(%ebp)
> > > > >   0x08049228 <+77>:    mov    %ebx,-0xc(%ebp)
> > > > >
> > > > > It looks to me that putting mask registers in front of GPR is looking
> > > > So it's not functionality but performance issue here, under 32-bit
> > > > mode there are only 8 gprs which result in higher register pressure,
> > > > and for this we do have mask->integer and integer->mask cost, with
> > > > -mtune=bdver1 where cost of kmovd is quite high(16, 20 /*
> > > > mask->integer and integer->mask moves */), there's no mask
> > > > instructions in cpuid.
> > > > I guess we can adjust mask->integer and integer->mask for 32-bit mode
> > > > to avoid such a situation?
> > > I notice the default option is O0, with -O there's no mask instructions.
> > > IMHO, We don't need to change cost unless there's -O2 cases where mask
> > > instructions regress performance here.
> >
> > No, this reasoning is not acceptable. The compiled code will SIGILL on
> > targets where unsupported mask registers are involved, so GPR should
> > always have priority compared to mask registers. Based on these
> > findings, x86_order_regs_for_local_alloc change should be reverted,
> > and register move costs compensated elsewhere.
>
> Longterm, I think that introducing vector BImode and VxBI vectors
> should solve this issue. This would make a clear cut between mask and
> GPR operations. I think that generic vector infrastructure supports
> BImode vectors, so using e.g. "a & b" instead of builtins should work
> well, too.
>
> This approach would also avoid costly moves between register sets.
>
Agreeing with the long term part, I've opened
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101185 to record this
regression.

> Uros.
diff mbox series

Patch

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index a61255857ff..a651853ca3b 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -20463,6 +20463,10 @@  x86_order_regs_for_local_alloc (void)
    int pos = 0;
    int i;
 
+   /* Mask register.  */
+   for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
+     reg_alloc_order [pos++] = i;
+
    /* First allocate the local general purpose registers.  */
    for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
      if (GENERAL_REGNO_P (i) && call_used_or_fixed_reg_p (i))
@@ -20489,10 +20493,6 @@  x86_order_regs_for_local_alloc (void)
    for (i = FIRST_EXT_REX_SSE_REG; i <= LAST_EXT_REX_SSE_REG; i++)
      reg_alloc_order [pos++] = i;
 
-   /* Mask register.  */
-   for (i = FIRST_MASK_REG; i <= LAST_MASK_REG; i++)
-     reg_alloc_order [pos++] = i;
-
    /* x87 registers.  */
    if (TARGET_SSE_MATH)
      for (i = FIRST_STACK_REG; i <= LAST_STACK_REG; i++)
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 6e4abf32e7c..3eef56b27d7 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -9138,7 +9138,7 @@  (define_insn_and_split "*anddi3_doubleword"
 })
 
 (define_insn "*anddi_1"
-  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r,r,k")
+  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,rm,r,r,?k")
 	(and:DI
 	 (match_operand:DI 1 "nonimmediate_operand" "%0,0,0,qm,k")
 	 (match_operand:DI 2 "x86_64_szext_general_operand" "Z,re,m,L,k")))
@@ -9226,7 +9226,7 @@  (define_insn "*andsi_1_zext"
    (set_attr "mode" "SI")])
 
 (define_insn "*and<mode>_1"
-  [(set (match_operand:SWI24 0 "nonimmediate_operand" "=rm,r,Ya,k")
+  [(set (match_operand:SWI24 0 "nonimmediate_operand" "=rm,r,Ya,?k")
 	(and:SWI24 (match_operand:SWI24 1 "nonimmediate_operand" "%0,0,qm,k")
 		   (match_operand:SWI24 2 "<general_operand>" "r<i>,m,L,k")))
    (clobber (reg:CC FLAGS_REG))]
@@ -9255,7 +9255,7 @@  (define_insn "*and<mode>_1"
    (set_attr "mode" "<MODE>,<MODE>,SI,<MODE>")])
 
 (define_insn "*andqi_1"
-  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,k")
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,?k")
 	(and:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0,k")
 		(match_operand:QI 2 "general_operand" "qn,m,rn,k")))
    (clobber (reg:CC FLAGS_REG))]
@@ -9651,7 +9651,7 @@  (define_split
 })
 
 (define_insn "*andn<mode>_1"
-  [(set (match_operand:SWI48 0 "register_operand" "=r,r,k")
+  [(set (match_operand:SWI48 0 "register_operand" "=r,r,?k")
 	(and:SWI48
 	  (not:SWI48 (match_operand:SWI48 1 "register_operand" "r,r,k"))
 	  (match_operand:SWI48 2 "nonimmediate_operand" "r,m,k")))
@@ -9667,7 +9667,7 @@  (define_insn "*andn<mode>_1"
    (set_attr "mode" "<MODE>")])
 
 (define_insn "*andn<mode>_1"
-  [(set (match_operand:SWI12 0 "register_operand" "=r,k")
+  [(set (match_operand:SWI12 0 "register_operand" "=r,?k")
 	(and:SWI12
 	  (not:SWI12 (match_operand:SWI12 1 "register_operand" "r,k"))
 	  (match_operand:SWI12 2 "register_operand" "r,k")))
@@ -9757,7 +9757,7 @@  (define_insn_and_split "*<code>di3_doubleword"
 })
 
 (define_insn "*<code><mode>_1"
-  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,r,k")
+  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,r,?k")
 	(any_or:SWI248
 	 (match_operand:SWI248 1 "nonimmediate_operand" "%0,0,k")
 	 (match_operand:SWI248 2 "<general_operand>" "r<i>,m,k")))
@@ -9847,7 +9847,7 @@  (define_insn "*<code>si_1_zext_imm"
    (set_attr "mode" "SI")])
 
 (define_insn "*<code>qi_1"
-  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,k")
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,q,r,?k")
 	(any_or:QI (match_operand:QI 1 "nonimmediate_operand" "%0,0,0,k")
 		   (match_operand:QI 2 "general_operand" "qn,m,rn,k")))
    (clobber (reg:CC FLAGS_REG))]
@@ -10603,7 +10603,7 @@  (define_insn_and_split "*one_cmpldi2_doubleword"
   "split_double_mode (DImode, &operands[0], 2, &operands[0], &operands[2]);")
 
 (define_insn "*one_cmpl<mode>2_1"
-  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,k")
+  [(set (match_operand:SWI248 0 "nonimmediate_operand" "=rm,?k")
 	(not:SWI248 (match_operand:SWI248 1 "nonimmediate_operand" "0,k")))]
   "ix86_unary_operator_ok (NOT, <MODE>mode, operands)"
   "@
@@ -10620,7 +10620,7 @@  (define_insn "*one_cmpl<mode>2_1"
    (set_attr "mode" "<MODE>")])
 
 (define_insn "*one_cmplsi2_1_zext"
-  [(set (match_operand:DI 0 "register_operand" "=r,k")
+  [(set (match_operand:DI 0 "register_operand" "=r,?k")
 	(zero_extend:DI
 	  (not:SI (match_operand:SI 1 "register_operand" "0,k"))))]
   "TARGET_64BIT && ix86_unary_operator_ok (NOT, SImode, operands)"
@@ -10632,7 +10632,7 @@  (define_insn "*one_cmplsi2_1_zext"
    (set_attr "mode" "SI,SI")])
 
 (define_insn "*one_cmplqi2_1"
-  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r,k")
+  [(set (match_operand:QI 0 "nonimmediate_operand" "=qm,r,?k")
 	(not:QI (match_operand:QI 1 "nonimmediate_operand" "0,0,k")))]
   "ix86_unary_operator_ok (NOT, QImode, operands)"
   "@
diff --git a/gcc/testsuite/gcc.target/i386/spill_to_mask-1.c b/gcc/testsuite/gcc.target/i386/spill_to_mask-1.c
index c5043e224ea..94d6764fc56 100644
--- a/gcc/testsuite/gcc.target/i386/spill_to_mask-1.c
+++ b/gcc/testsuite/gcc.target/i386/spill_to_mask-1.c
@@ -1,15 +1,31 @@ 
 /* { dg-do compile } */
-/* { dg-options "-O2 -march=skylake-avx512" } */
-
-#ifndef DTYPE
-#define DTYPE u32
-#endif
+/* { dg-options "-O2 -march=skylake-avx512 -DDTYPE32" } */
 
 typedef unsigned long long u64;
 typedef unsigned int u32;
 typedef unsigned short u16;
 typedef unsigned char u8;
 
+#ifdef DTYPE32
+typedef u32 DTYPE;
+#define byteswap byteswapu32
+#endif
+
+#ifdef DTYPE16
+typedef u16 DTYPE;
+#define byteswap byteswapu16
+#endif
+
+#ifdef DTYPE8
+typedef u8 DTYPE;
+#define byteswap byteswapu8
+#endif
+
+#ifdef DTYPE64
+typedef u64 DTYPE;
+#define byteswap byteswapu64
+#endif
+
 #define R(x,n) ( (x >> n) | (x << (32 - n)))
 
 #define S0(x) (R(x, 2) ^ R(x,13) ^ R(x,22))
@@ -23,36 +39,51 @@  typedef unsigned char u8;
     d += tmp1;                                           \
 }
 
-static inline DTYPE byteswap(DTYPE x)
+static inline u32 byteswapu32(u32 x)
 {
-	x = (x & 0x0000FFFF) << 16 | (x & 0xFFFF0000) >> 16;
-	x = (x & 0x00FF00FF) << 8 | (x & 0xFF00FF00) >> 8;  
-	return x;
+  x = (x & 0x0000FFFF) << 16 | (x & 0xFFFF0000) >> 16;
+  x = (x & 0x00FF00FF) << 8 | (x & 0xFF00FF00) >> 8;  
+  return x;
 }
 
-#define BE_LOAD32(n,b,i) (n) = byteswap(*(DTYPE *)(b + i))
+static inline u16 byteswapu16(u16 x)
+{
+  x = (x & 0x00FF) << 8 | (x & 0xFF00) >> 8;  
+  return x;
+}
+
+static inline u8 byteswapu8(u8 x)
+{
+  return x;
+}
+
+static inline u64 byteswapu64(u64 x)
+{
+  x = ((u64)(byteswapu32 (x & 0x00000000FFFFFFFF))) << 32 | byteswapu32((x & 0xFFFFFFFF00000000) >> 32);
+  return x;
+}
 
-void foo (u8 *in, DTYPE out[8], const DTYPE C[16])
+void foo (DTYPE in[16], DTYPE out[8], const DTYPE C[16])
 {
     DTYPE tmp1 = 0, tmp2 = 0, a, b, c, d, e, f, g, h;
     DTYPE w0, w1, w2, w3, w4, w5, w6, w7,
 	w8, w9, w10, w11, w12, w13, w14, w15;
-    w0  = byteswap(*(DTYPE *)(in + 0));
-    w1  = byteswap(*(DTYPE *)(in + 4));
-    w2  = byteswap(*(DTYPE *)(in + 8));
-    w3  = byteswap(*(DTYPE *)(in + 12));
-    w4  = byteswap(*(DTYPE *)(in + 16));
-    w5  = byteswap(*(DTYPE *)(in + 20));
-    w6  = byteswap(*(DTYPE *)(in + 24));
-    w7  = byteswap(*(DTYPE *)(in + 28));
-    w8  = byteswap(*(DTYPE *)(in + 32));
-    w9  = byteswap(*(DTYPE *)(in + 36));
-    w10 = byteswap(*(DTYPE *)(in + 40));
-    w11 = byteswap(*(DTYPE *)(in + 44));
-    w12 = byteswap(*(DTYPE *)(in + 48));
-    w13 = byteswap(*(DTYPE *)(in + 52));
-    w14 = byteswap(*(DTYPE *)(in + 56));
-    w15 = byteswap(*(DTYPE *)(in + 60));
+    w0  = byteswap(in[0]);
+    w1  = byteswap(in[1]);
+    w2  = byteswap(in[2]);
+    w3  = byteswap(in[3]);
+    w4  = byteswap(in[4]);
+    w5  = byteswap(in[5]);
+    w6  = byteswap(in[6]);
+    w7  = byteswap(in[7]);
+    w8  = byteswap(in[8]);
+    w9  = byteswap(in[9]);
+    w10 = byteswap(in[10]);
+    w11 = byteswap(in[11]);
+    w12 = byteswap(in[12]);
+    w13 = byteswap(in[13]);
+    w14 = byteswap(in[14]);
+    w15 = byteswap(in[15]);
     a = out[0];
     b = out[1];
     c = out[2];
@@ -90,3 +121,7 @@  void foo (u8 *in, DTYPE out[8], const DTYPE C[16])
 }
 
 /* { dg-final { scan-assembler "kmovd" } } */
+/* { dg-final { scan-assembler-not "knot" } } */
+/* { dg-final { scan-assembler-not "kxor" } } */
+/* { dg-final { scan-assembler-not "kor" } } */
+/* { dg-final { scan-assembler-not "kandn" } } */
diff --git a/gcc/testsuite/gcc.target/i386/spill_to_mask-2.c b/gcc/testsuite/gcc.target/i386/spill_to_mask-2.c
index 1f0c6b474d6..c7d09769ed5 100644
--- a/gcc/testsuite/gcc.target/i386/spill_to_mask-2.c
+++ b/gcc/testsuite/gcc.target/i386/spill_to_mask-2.c
@@ -1,10 +1,9 @@ 
 /* { dg-do compile } */
-/* { dg-options "-O2 -march=skylake-avx512" } */
-
-#ifndef DTYPE
-#define DTYPE u16
-#endif
+/* { dg-options "-O2 -march=skylake-avx512 -DDTYPE16" } */
 
 #include "spill_to_mask-1.c"
 
-/* { dg-final { scan-assembler "kmovw" } } */
+/* { dg-final { scan-assembler-not "knot" } } */
+/* { dg-final { scan-assembler-not "kxor" } } */
+/* { dg-final { scan-assembler-not "kor" } } */
+/* { dg-final { scan-assembler-not "kandn" } } */
diff --git a/gcc/testsuite/gcc.target/i386/spill_to_mask-3.c b/gcc/testsuite/gcc.target/i386/spill_to_mask-3.c
index 5b59090c296..b7a383fe12f 100644
--- a/gcc/testsuite/gcc.target/i386/spill_to_mask-3.c
+++ b/gcc/testsuite/gcc.target/i386/spill_to_mask-3.c
@@ -1,10 +1,9 @@ 
 /* { dg-do compile } */
-/* { dg-options "-O2 -march=skylake-avx512" } */
-
-#ifndef DTYPE
-#define DTYPE u8
-#endif
+/* { dg-options "-O2 -march=skylake-avx512 -DDTYPE8" } */
 
 #include "spill_to_mask-1.c"
 
-/* { dg-final { scan-assembler "kmovb" } } */
+/* { dg-final { scan-assembler-not "knot" } } */
+/* { dg-final { scan-assembler-not "kxor" } } */
+/* { dg-final { scan-assembler-not "kor" } } */
+/* { dg-final { scan-assembler-not "kandn" } } */
diff --git a/gcc/testsuite/gcc.target/i386/spill_to_mask-4.c b/gcc/testsuite/gcc.target/i386/spill_to_mask-4.c
index f111cf42b36..d67038d5b88 100644
--- a/gcc/testsuite/gcc.target/i386/spill_to_mask-4.c
+++ b/gcc/testsuite/gcc.target/i386/spill_to_mask-4.c
@@ -1,10 +1,9 @@ 
 /* { dg-do compile  { target { ! ia32 } } } */
-/* { dg-options "-O2 -march=skylake-avx512" } */
-
-#ifndef DTYPE
-#define DTYPE u64
-#endif
+/* { dg-options "-O2 -march=skylake-avx512 -DDTYPE64" } */
 
 #include "spill_to_mask-1.c"
 
-/* { dg-final { scan-assembler "kmovq" } } */
+/* { dg-final { scan-assembler-not "knot" } } */
+/* { dg-final { scan-assembler-not "kxor" } } */
+/* { dg-final { scan-assembler-not "kor" } } */
+/* { dg-final { scan-assembler-not "kandn" } } */