diff mbox series

AVX512BF16: Do not allow permutation with vcvtne2ps2bf16 [PR115889]

Message ID 20240713074215.2151225-1-hongyu.wang@intel.com
State New
Headers show
Series AVX512BF16: Do not allow permutation with vcvtne2ps2bf16 [PR115889] | expand

Commit Message

Hongyu Wang July 13, 2024, 7:42 a.m. UTC
Hi,

According to the instruction spec of AVX512BF16, the conversion from float
to BF16 is not a simple truncation. It has special handling for
denormals/NaNs, and even for normal floats it adds an extra bias based on
the least significant bit of the bf16 number. This means we cannot use
vcvtne2ps2bf16 for any bf16 vector shuffle.
The optimization introduced in r15-1368 added a specific split that converts
an HImode permutation into this instruction, so remove it and treat
BFmode permutations the same as HFmode.

Bootstrapped & regtested on x86_64-pc-linux-gnu. OK for trunk?

gcc/ChangeLog:

	PR target/115889
	* config/i386/predicates.md (vcvtne2ps2bf_parallel): Remove.
	* config/i386/sse.md (hi_cvt_bf): Remove.
	(HI_CVT_BF): Likewise.
	(vpermt2_sepcial_bf16_shuffle_<mode>): Likewise.

gcc/testsuite/ChangeLog:

	PR target/115889
	* gcc.target/i386/vpermt2-special-bf16-shufflue.c: Adjust option
	and output scan.
---
 gcc/config/i386/predicates.md                 | 11 ------
 gcc/config/i386/sse.md                        | 35 -------------------
 .../i386/vpermt2-special-bf16-shufflue.c      |  5 ++-
 3 files changed, 2 insertions(+), 49 deletions(-)

Comments

Hongtao Liu July 15, 2024, 1:40 a.m. UTC | #1
On Sat, Jul 13, 2024 at 3:44 PM Hongyu Wang <hongyu.wang@intel.com> wrote:
>
> Hi,
>
> According to the instruction spec of AVX512BF16, the convert from float
> to BF16 is not a simple truncation. It has special handling for
> denormal/nan, even for normal float it will add an extra bias according
> to the least significant bit for bf number. This means we cannot use the
> vcvtne2ps2bf16 for any bf16 vector shuffle.
> The optimization introduced in r15-1368 adds a specific split to convert
> HImode permutation with this instruction, so remove it and treat the
> BFmode permutation same as HFmode.
>
> Bootstrapped & regtested on x86_64-pc-linux-gnu. OK for trunk?
Could you just git revert 6d0b7b69d143025f271d0041cfa29cf26e6c343b?
>
> gcc/ChangeLog:
>
>         PR target/115889
>         * config/i386/predicates.md (vcvtne2ps2bf_parallel): Remove.
>         * config/i386/sse.md (hi_cvt_bf): Remove.
>         (HI_CVT_BF): Likewise.
>         (vpermt2_sepcial_bf16_shuffle_<mode>):Likewise.
>
> gcc/testsuite/ChangeLog:
>
>         PR target/115889
>         * gcc.target/i386/vpermt2-special-bf16-shufflue.c: Adjust option
>         and output scan.
> ---
>  gcc/config/i386/predicates.md                 | 11 ------
>  gcc/config/i386/sse.md                        | 35 -------------------
>  .../i386/vpermt2-special-bf16-shufflue.c      |  5 ++-
>  3 files changed, 2 insertions(+), 49 deletions(-)
>
> diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
> index a894847adaf..5d0bb1e0f54 100644
> --- a/gcc/config/i386/predicates.md
> +++ b/gcc/config/i386/predicates.md
> @@ -2327,14 +2327,3 @@ (define_predicate "apx_ndd_add_memory_operand"
>
>    return true;
>  })
> -
> -;; Check that each element is odd and incrementally increasing from 1
> -(define_predicate "vcvtne2ps2bf_parallel"
> -  (and (match_code "const_vector")
> -       (match_code "const_int" "a"))
> -{
> -  for (int i = 0; i < XVECLEN (op, 0); ++i)
> -    if (INTVAL (XVECEXP (op, 0, i)) != (2 * i + 1))
> -      return false;
> -  return true;
> -})
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index b3b4697924b..c134494cd20 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -31460,38 +31460,3 @@ (define_insn "vpdp<vpdpwprodtype>_<mode>"
>    "TARGET_AVXVNNIINT16"
>    "vpdp<vpdpwprodtype>\t{%3, %2, %0|%0, %2, %3}"
>     [(set_attr "prefix" "vex")])
> -
> -(define_mode_attr hi_cvt_bf
> -  [(V8HI "v8bf") (V16HI "v16bf") (V32HI "v32bf")])
> -
> -(define_mode_attr HI_CVT_BF
> -  [(V8HI "V8BF") (V16HI "V16BF") (V32HI "V32BF")])
> -
> -(define_insn_and_split "vpermt2_sepcial_bf16_shuffle_<mode>"
> -  [(set (match_operand:VI2_AVX512F 0 "register_operand")
> -       (unspec:VI2_AVX512F
> -         [(match_operand:VI2_AVX512F 1 "vcvtne2ps2bf_parallel")
> -          (match_operand:VI2_AVX512F 2 "register_operand")
> -          (match_operand:VI2_AVX512F 3 "nonimmediate_operand")]
> -          UNSPEC_VPERMT2))]
> -  "TARGET_AVX512VL && TARGET_AVX512BF16 && ix86_pre_reload_split ()"
> -  "#"
> -  "&& 1"
> -  [(const_int 0)]
> -{
> -  rtx op0 = gen_reg_rtx (<HI_CVT_BF>mode);
> -  operands[2] = lowpart_subreg (<ssePSmode>mode,
> -                               force_reg (<MODE>mode, operands[2]),
> -                               <MODE>mode);
> -  operands[3] = lowpart_subreg (<ssePSmode>mode,
> -                               force_reg (<MODE>mode, operands[3]),
> -                               <MODE>mode);
> -
> -  emit_insn (gen_avx512f_cvtne2ps2bf16_<hi_cvt_bf>(op0,
> -                                                  operands[3],
> -                                                  operands[2]));
> -  emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0,
> -                                              <HI_CVT_BF>mode));
> -  DONE;
> -}
> -[(set_attr "mode" "<sseinsnmode>")])
> diff --git a/gcc/testsuite/gcc.target/i386/vpermt2-special-bf16-shufflue.c b/gcc/testsuite/gcc.target/i386/vpermt2-special-bf16-shufflue.c
> index 5c65f2a9884..4cbc85735de 100755
> --- a/gcc/testsuite/gcc.target/i386/vpermt2-special-bf16-shufflue.c
> +++ b/gcc/testsuite/gcc.target/i386/vpermt2-special-bf16-shufflue.c
> @@ -1,7 +1,6 @@
>  /* { dg-do compile } */
> -/* { dg-options "-O2 -mavx512bf16 -mavx512vl" } */
> -/* { dg-final { scan-assembler-not "vpermi2b" } } */
> -/* { dg-final { scan-assembler-times "vcvtne2ps2bf16" 3 } } */
> +/* { dg-options "-O2 -mavx512vbmi -mavx512vl" } */
> +/* { dg-final { scan-assembler-times "vpermi2w" 3 } } */
>
>  typedef __bf16 v8bf __attribute__((vector_size(16)));
>  typedef __bf16 v16bf __attribute__((vector_size(32)));
> --
> 2.34.1
>
Hongyu Wang July 15, 2024, 2:21 a.m. UTC | #2
> Could you just git revert 6d0b7b69d143025f271d0041cfa29cf26e6c343b?

We can still deal with BFmode permutation the same way as HFmode, so
the change in ix86_vectorize_vec_perm_const can be preserved.

Hongtao Liu <crazylht@gmail.com> 于2024年7月15日周一 09:40写道:
>
> On Sat, Jul 13, 2024 at 3:44 PM Hongyu Wang <hongyu.wang@intel.com> wrote:
> >
> > Hi,
> >
> > According to the instruction spec of AVX512BF16, the convert from float
> > to BF16 is not a simple truncation. It has special handling for
> > denormal/nan, even for normal float it will add an extra bias according
> > to the least significant bit for bf number. This means we cannot use the
> > vcvtne2ps2bf16 for any bf16 vector shuffle.
> > The optimization introduced in r15-1368 adds a specific split to convert
> > HImode permutation with this instruction, so remove it and treat the
> > BFmode permutation same as HFmode.
> >
> > Bootstrapped & regtested on x86_64-pc-linux-gnu. OK for trunk?
> Could you just git revert 6d0b7b69d143025f271d0041cfa29cf26e6c343b?
> >
> > gcc/ChangeLog:
> >
> >         PR target/115889
> >         * config/i386/predicates.md (vcvtne2ps2bf_parallel): Remove.
> >         * config/i386/sse.md (hi_cvt_bf): Remove.
> >         (HI_CVT_BF): Likewise.
> >         (vpermt2_sepcial_bf16_shuffle_<mode>):Likewise.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         PR target/115889
> >         * gcc.target/i386/vpermt2-special-bf16-shufflue.c: Adjust option
> >         and output scan.
> > ---
> >  gcc/config/i386/predicates.md                 | 11 ------
> >  gcc/config/i386/sse.md                        | 35 -------------------
> >  .../i386/vpermt2-special-bf16-shufflue.c      |  5 ++-
> >  3 files changed, 2 insertions(+), 49 deletions(-)
> >
> > diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
> > index a894847adaf..5d0bb1e0f54 100644
> > --- a/gcc/config/i386/predicates.md
> > +++ b/gcc/config/i386/predicates.md
> > @@ -2327,14 +2327,3 @@ (define_predicate "apx_ndd_add_memory_operand"
> >
> >    return true;
> >  })
> > -
> > -;; Check that each element is odd and incrementally increasing from 1
> > -(define_predicate "vcvtne2ps2bf_parallel"
> > -  (and (match_code "const_vector")
> > -       (match_code "const_int" "a"))
> > -{
> > -  for (int i = 0; i < XVECLEN (op, 0); ++i)
> > -    if (INTVAL (XVECEXP (op, 0, i)) != (2 * i + 1))
> > -      return false;
> > -  return true;
> > -})
> > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > index b3b4697924b..c134494cd20 100644
> > --- a/gcc/config/i386/sse.md
> > +++ b/gcc/config/i386/sse.md
> > @@ -31460,38 +31460,3 @@ (define_insn "vpdp<vpdpwprodtype>_<mode>"
> >    "TARGET_AVXVNNIINT16"
> >    "vpdp<vpdpwprodtype>\t{%3, %2, %0|%0, %2, %3}"
> >     [(set_attr "prefix" "vex")])
> > -
> > -(define_mode_attr hi_cvt_bf
> > -  [(V8HI "v8bf") (V16HI "v16bf") (V32HI "v32bf")])
> > -
> > -(define_mode_attr HI_CVT_BF
> > -  [(V8HI "V8BF") (V16HI "V16BF") (V32HI "V32BF")])
> > -
> > -(define_insn_and_split "vpermt2_sepcial_bf16_shuffle_<mode>"
> > -  [(set (match_operand:VI2_AVX512F 0 "register_operand")
> > -       (unspec:VI2_AVX512F
> > -         [(match_operand:VI2_AVX512F 1 "vcvtne2ps2bf_parallel")
> > -          (match_operand:VI2_AVX512F 2 "register_operand")
> > -          (match_operand:VI2_AVX512F 3 "nonimmediate_operand")]
> > -          UNSPEC_VPERMT2))]
> > -  "TARGET_AVX512VL && TARGET_AVX512BF16 && ix86_pre_reload_split ()"
> > -  "#"
> > -  "&& 1"
> > -  [(const_int 0)]
> > -{
> > -  rtx op0 = gen_reg_rtx (<HI_CVT_BF>mode);
> > -  operands[2] = lowpart_subreg (<ssePSmode>mode,
> > -                               force_reg (<MODE>mode, operands[2]),
> > -                               <MODE>mode);
> > -  operands[3] = lowpart_subreg (<ssePSmode>mode,
> > -                               force_reg (<MODE>mode, operands[3]),
> > -                               <MODE>mode);
> > -
> > -  emit_insn (gen_avx512f_cvtne2ps2bf16_<hi_cvt_bf>(op0,
> > -                                                  operands[3],
> > -                                                  operands[2]));
> > -  emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0,
> > -                                              <HI_CVT_BF>mode));
> > -  DONE;
> > -}
> > -[(set_attr "mode" "<sseinsnmode>")])
> > diff --git a/gcc/testsuite/gcc.target/i386/vpermt2-special-bf16-shufflue.c b/gcc/testsuite/gcc.target/i386/vpermt2-special-bf16-shufflue.c
> > index 5c65f2a9884..4cbc85735de 100755
> > --- a/gcc/testsuite/gcc.target/i386/vpermt2-special-bf16-shufflue.c
> > +++ b/gcc/testsuite/gcc.target/i386/vpermt2-special-bf16-shufflue.c
> > @@ -1,7 +1,6 @@
> >  /* { dg-do compile } */
> > -/* { dg-options "-O2 -mavx512bf16 -mavx512vl" } */
> > -/* { dg-final { scan-assembler-not "vpermi2b" } } */
> > -/* { dg-final { scan-assembler-times "vcvtne2ps2bf16" 3 } } */
> > +/* { dg-options "-O2 -mavx512vbmi -mavx512vl" } */
> > +/* { dg-final { scan-assembler-times "vpermi2w" 3 } } */
> >
> >  typedef __bf16 v8bf __attribute__((vector_size(16)));
> >  typedef __bf16 v16bf __attribute__((vector_size(32)));
> > --
> > 2.34.1
> >
>
>
> --
> BR,
> Hongtao
Hongtao Liu July 15, 2024, 2:22 a.m. UTC | #3
On Mon, Jul 15, 2024 at 10:21 AM Hongyu Wang <wwwhhhyyy333@gmail.com> wrote:
>
> > Could you just git revert 6d0b7b69d143025f271d0041cfa29cf26e6c343b?
>
> We can still deal with BFmode permutation the same way as HFmode, so
> the change in ix86_vectorize_vec_perm_const can be preserved.
>
> Hongtao Liu <crazylht@gmail.com> 于2024年7月15日周一 09:40写道:
> >
> > On Sat, Jul 13, 2024 at 3:44 PM Hongyu Wang <hongyu.wang@intel.com> wrote:
> > >
> > > Hi,
> > >
> > > According to the instruction spec of AVX512BF16, the convert from float
> > > to BF16 is not a simple truncation. It has special handling for
> > > denormal/nan, even for normal float it will add an extra bias according
> > > to the least significant bit for bf number. This means we cannot use the
> > > vcvtne2ps2bf16 for any bf16 vector shuffle.
> > > The optimization introduced in r15-1368 adds a specific split to convert
> > > HImode permutation with this instruction, so remove it and treat the
> > > BFmode permutation same as HFmode.
I see, patch LGTM.
> > >
> > > Bootstrapped & regtested on x86_64-pc-linux-gnu. OK for trunk?
> > Could you just git revert 6d0b7b69d143025f271d0041cfa29cf26e6c343b?
> > >
> > > gcc/ChangeLog:
> > >
> > >         PR target/115889
> > >         * config/i386/predicates.md (vcvtne2ps2bf_parallel): Remove.
> > >         * config/i386/sse.md (hi_cvt_bf): Remove.
> > >         (HI_CVT_BF): Likewise.
> > >         (vpermt2_sepcial_bf16_shuffle_<mode>):Likewise.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > >         PR target/115889
> > >         * gcc.target/i386/vpermt2-special-bf16-shufflue.c: Adjust option
> > >         and output scan.
> > > ---
> > >  gcc/config/i386/predicates.md                 | 11 ------
> > >  gcc/config/i386/sse.md                        | 35 -------------------
> > >  .../i386/vpermt2-special-bf16-shufflue.c      |  5 ++-
> > >  3 files changed, 2 insertions(+), 49 deletions(-)
> > >
> > > diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
> > > index a894847adaf..5d0bb1e0f54 100644
> > > --- a/gcc/config/i386/predicates.md
> > > +++ b/gcc/config/i386/predicates.md
> > > @@ -2327,14 +2327,3 @@ (define_predicate "apx_ndd_add_memory_operand"
> > >
> > >    return true;
> > >  })
> > > -
> > > -;; Check that each element is odd and incrementally increasing from 1
> > > -(define_predicate "vcvtne2ps2bf_parallel"
> > > -  (and (match_code "const_vector")
> > > -       (match_code "const_int" "a"))
> > > -{
> > > -  for (int i = 0; i < XVECLEN (op, 0); ++i)
> > > -    if (INTVAL (XVECEXP (op, 0, i)) != (2 * i + 1))
> > > -      return false;
> > > -  return true;
> > > -})
> > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > > index b3b4697924b..c134494cd20 100644
> > > --- a/gcc/config/i386/sse.md
> > > +++ b/gcc/config/i386/sse.md
> > > @@ -31460,38 +31460,3 @@ (define_insn "vpdp<vpdpwprodtype>_<mode>"
> > >    "TARGET_AVXVNNIINT16"
> > >    "vpdp<vpdpwprodtype>\t{%3, %2, %0|%0, %2, %3}"
> > >     [(set_attr "prefix" "vex")])
> > > -
> > > -(define_mode_attr hi_cvt_bf
> > > -  [(V8HI "v8bf") (V16HI "v16bf") (V32HI "v32bf")])
> > > -
> > > -(define_mode_attr HI_CVT_BF
> > > -  [(V8HI "V8BF") (V16HI "V16BF") (V32HI "V32BF")])
> > > -
> > > -(define_insn_and_split "vpermt2_sepcial_bf16_shuffle_<mode>"
> > > -  [(set (match_operand:VI2_AVX512F 0 "register_operand")
> > > -       (unspec:VI2_AVX512F
> > > -         [(match_operand:VI2_AVX512F 1 "vcvtne2ps2bf_parallel")
> > > -          (match_operand:VI2_AVX512F 2 "register_operand")
> > > -          (match_operand:VI2_AVX512F 3 "nonimmediate_operand")]
> > > -          UNSPEC_VPERMT2))]
> > > -  "TARGET_AVX512VL && TARGET_AVX512BF16 && ix86_pre_reload_split ()"
> > > -  "#"
> > > -  "&& 1"
> > > -  [(const_int 0)]
> > > -{
> > > -  rtx op0 = gen_reg_rtx (<HI_CVT_BF>mode);
> > > -  operands[2] = lowpart_subreg (<ssePSmode>mode,
> > > -                               force_reg (<MODE>mode, operands[2]),
> > > -                               <MODE>mode);
> > > -  operands[3] = lowpart_subreg (<ssePSmode>mode,
> > > -                               force_reg (<MODE>mode, operands[3]),
> > > -                               <MODE>mode);
> > > -
> > > -  emit_insn (gen_avx512f_cvtne2ps2bf16_<hi_cvt_bf>(op0,
> > > -                                                  operands[3],
> > > -                                                  operands[2]));
> > > -  emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0,
> > > -                                              <HI_CVT_BF>mode));
> > > -  DONE;
> > > -}
> > > -[(set_attr "mode" "<sseinsnmode>")])
> > > diff --git a/gcc/testsuite/gcc.target/i386/vpermt2-special-bf16-shufflue.c b/gcc/testsuite/gcc.target/i386/vpermt2-special-bf16-shufflue.c
> > > index 5c65f2a9884..4cbc85735de 100755
> > > --- a/gcc/testsuite/gcc.target/i386/vpermt2-special-bf16-shufflue.c
> > > +++ b/gcc/testsuite/gcc.target/i386/vpermt2-special-bf16-shufflue.c
> > > @@ -1,7 +1,6 @@
> > >  /* { dg-do compile } */
> > > -/* { dg-options "-O2 -mavx512bf16 -mavx512vl" } */
> > > -/* { dg-final { scan-assembler-not "vpermi2b" } } */
> > > -/* { dg-final { scan-assembler-times "vcvtne2ps2bf16" 3 } } */
> > > +/* { dg-options "-O2 -mavx512vbmi -mavx512vl" } */
> > > +/* { dg-final { scan-assembler-times "vpermi2w" 3 } } */
> > >
> > >  typedef __bf16 v8bf __attribute__((vector_size(16)));
> > >  typedef __bf16 v16bf __attribute__((vector_size(32)));
> > > --
> > > 2.34.1
> > >
> >
> >
> > --
> > BR,
> > Hongtao
diff mbox series

Patch

diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index a894847adaf..5d0bb1e0f54 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -2327,14 +2327,3 @@  (define_predicate "apx_ndd_add_memory_operand"
 
   return true;
 })
-
-;; Check that each element is odd and incrementally increasing from 1
-(define_predicate "vcvtne2ps2bf_parallel"
-  (and (match_code "const_vector")
-       (match_code "const_int" "a"))
-{
-  for (int i = 0; i < XVECLEN (op, 0); ++i)
-    if (INTVAL (XVECEXP (op, 0, i)) != (2 * i + 1))
-      return false;
-  return true;
-})
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index b3b4697924b..c134494cd20 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -31460,38 +31460,3 @@  (define_insn "vpdp<vpdpwprodtype>_<mode>"
   "TARGET_AVXVNNIINT16"
   "vpdp<vpdpwprodtype>\t{%3, %2, %0|%0, %2, %3}"
    [(set_attr "prefix" "vex")])
-
-(define_mode_attr hi_cvt_bf
-  [(V8HI "v8bf") (V16HI "v16bf") (V32HI "v32bf")])
-
-(define_mode_attr HI_CVT_BF
-  [(V8HI "V8BF") (V16HI "V16BF") (V32HI "V32BF")])
-
-(define_insn_and_split "vpermt2_sepcial_bf16_shuffle_<mode>"
-  [(set (match_operand:VI2_AVX512F 0 "register_operand")
-	(unspec:VI2_AVX512F
-	  [(match_operand:VI2_AVX512F 1 "vcvtne2ps2bf_parallel")
-	   (match_operand:VI2_AVX512F 2 "register_operand")
-	   (match_operand:VI2_AVX512F 3 "nonimmediate_operand")]
-	   UNSPEC_VPERMT2))]
-  "TARGET_AVX512VL && TARGET_AVX512BF16 && ix86_pre_reload_split ()"
-  "#"
-  "&& 1"
-  [(const_int 0)]
-{
-  rtx op0 = gen_reg_rtx (<HI_CVT_BF>mode);
-  operands[2] = lowpart_subreg (<ssePSmode>mode,
-				force_reg (<MODE>mode, operands[2]),
-				<MODE>mode);
-  operands[3] = lowpart_subreg (<ssePSmode>mode,
-				force_reg (<MODE>mode, operands[3]),
-				<MODE>mode);
-
-  emit_insn (gen_avx512f_cvtne2ps2bf16_<hi_cvt_bf>(op0,
-						   operands[3],
-						   operands[2]));
-  emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0,
-					       <HI_CVT_BF>mode));
-  DONE;
-}
-[(set_attr "mode" "<sseinsnmode>")])
diff --git a/gcc/testsuite/gcc.target/i386/vpermt2-special-bf16-shufflue.c b/gcc/testsuite/gcc.target/i386/vpermt2-special-bf16-shufflue.c
index 5c65f2a9884..4cbc85735de 100755
--- a/gcc/testsuite/gcc.target/i386/vpermt2-special-bf16-shufflue.c
+++ b/gcc/testsuite/gcc.target/i386/vpermt2-special-bf16-shufflue.c
@@ -1,7 +1,6 @@ 
 /* { dg-do compile } */
-/* { dg-options "-O2 -mavx512bf16 -mavx512vl" } */
-/* { dg-final { scan-assembler-not "vpermi2b" } } */
-/* { dg-final { scan-assembler-times "vcvtne2ps2bf16" 3 } } */
+/* { dg-options "-O2 -mavx512vbmi -mavx512vl" } */
+/* { dg-final { scan-assembler-times "vpermi2w" 3 } } */
 
 typedef __bf16 v8bf __attribute__((vector_size(16)));
 typedef __bf16 v16bf __attribute__((vector_size(32)));