diff mbox series

Guard truncate from vector float to vector __bf16 with !flag_rounding_math && !HONOR_NANS (BFmode).

Message ID 20241108023259.528963-1-hongtao.liu@intel.com
State New
Headers show
Series Guard truncate from vector float to vector __bf16 with !flag_rounding_math && !HONOR_NANS (BFmode). | expand

Commit Message

liuhongt Nov. 8, 2024, 2:32 a.m. UTC
The hw instruction (vcvtneps2bf16) doesn't raise exceptions, quietly
turns sNaN into qNaN, and always rounds to nearest (even). Output
denormals are always flushed to zero and input denormals are always
treated as zero. MXCSR is neither consulted nor updated.
Without native instructions, flag_unsafe_math_optimizations is needed
for the permutation instructions.
Similarly, guard extend from vector __bf16 to vector float with
!HONOR_NANS (BFmode).

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Any comments?

gcc/ChangeLog:

	* config/i386/i386.md (truncsfbf2): Add !flag_rounding_math
	to the condition, require flag_unsafe_math_optimizations when
	native instruction is not available.
	* config/i386/mmx.md (truncv2sfv2bf2): Ditto.
	(extendv2bfv2sf2): Add !HONOR_NANS (BFmode) to the condition.
	* config/i386/sse.md (truncv4sfv4bf2): Add
	!flag_rounding_math to the condition, require
	flag_unsafe_math_optimizations when native instruction is not
	available.
	(truncv8sfv8bf2): Ditto.
	(truncv16sfv16bf2): Ditto.
	(extendv4bfv4sf2): Add !HONOR_NANS (BFmode) to the condition.
	(extendv8bfv8sf2): Ditto.
	(extendv16bfv16sf2): Ditto.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/avx512bf16-truncsfbf.c: Add -ffast-math.
	* gcc.target/i386/avx512bw-extendbf2sf.c: Ditto.
	* gcc.target/i386/avx512bw-truncsfbf.c: Ditto.
	* gcc.target/i386/sse2-extendbf2sf.c: Ditto.
	* gcc.target/i386/ssse3-truncsfbf.c: Ditto.
---
 gcc/config/i386/i386.md                          | 11 ++++++++++-
 gcc/config/i386/mmx.md                           |  8 ++++++--
 gcc/config/i386/sse.md                           | 16 ++++++++++++----
 .../gcc.target/i386/avx512bf16-truncsfbf.c       |  2 +-
 .../gcc.target/i386/avx512bw-extendbf2sf.c       |  2 +-
 .../gcc.target/i386/avx512bw-truncsfbf.c         |  2 +-
 gcc/testsuite/gcc.target/i386/sse2-extendbf2sf.c |  2 +-
 gcc/testsuite/gcc.target/i386/ssse3-truncsfbf.c  |  2 +-
 8 files changed, 33 insertions(+), 12 deletions(-)
diff mbox series

Patch

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index c492fe55881..96d5420d9de 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -5694,11 +5694,20 @@  (define_insn "*trunc<mode>hf2"
    (set_attr "prefix" "evex")
    (set_attr "mode" "HF")])
 
+/* vcvtneps2bf16 doesn't honor sNaN: it quietly turns sNaN into qNaN,
+   and it always rounds to nearest even.
+   flag_unsafe_math_optimizations is needed for psrld.
+   If we don't expect qNaNs nor sNaNs and can assume rounding
+   to nearest, we can expand the conversion inline as
+   (fromi + 0x7fff + ((fromi >> 16) & 1)) >> 16.  */
 (define_insn "truncsfbf2"
   [(set (match_operand:BF 0 "register_operand" "=x,x,v,Yv")
 	(float_truncate:BF
 	  (match_operand:SF 1 "register_operand" "0,x,v,Yv")))]
-  "TARGET_SSE2 && flag_unsafe_math_optimizations && !HONOR_NANS (BFmode)"
+  "TARGET_SSE2 && !HONOR_NANS (BFmode) && !flag_rounding_math
+   && (flag_unsafe_math_optimizations
+       || TARGET_AVXNECONVERT
+       || (TARGET_AVX512BF16 && TARGET_AVX512VL))"
   "@
   psrld\t{$16, %0|%0, 16}
   %{vex%} vcvtneps2bf16\t{%1, %0|%0, %1}
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 021ac90ae2a..61a4f4d21ea 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -2998,7 +2998,11 @@  (define_expand "truncv2sfv2bf2"
   [(set (match_operand:V2BF 0 "register_operand")
 	(float_truncate:V2BF
 	  (match_operand:V2SF 1 "nonimmediate_operand")))]
-  "TARGET_SSSE3 && TARGET_MMX_WITH_SSE"
+  "TARGET_SSSE3 && TARGET_MMX_WITH_SSE
+  && !HONOR_NANS (BFmode) && !flag_rounding_math
+  && (flag_unsafe_math_optimizations
+      || TARGET_AVXNECONVERT
+      || (TARGET_AVX512BF16 && TARGET_AVX512VL))"
 {
   rtx op1 = gen_reg_rtx (V4SFmode);
   rtx op0 = gen_reg_rtx (V4BFmode);
@@ -3016,7 +3020,7 @@  (define_expand "extendv2bfv2sf2"
   [(set (match_operand:V2SF 0 "register_operand")
 	(float_extend:V2SF
 	  (match_operand:V2BF 1 "nonimmediate_operand")))]
-  "TARGET_SSE2 && TARGET_MMX_WITH_SSE"
+  "TARGET_SSE2 && TARGET_MMX_WITH_SSE && !HONOR_NANS (BFmode)"
 {
   rtx op0 = gen_reg_rtx (V4SFmode);
   rtx op1 = gen_reg_rtx (V4BFmode);
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 5eeb3ab221a..efe32e5149f 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -30995,7 +30995,10 @@  (define_expand "truncv4sfv4bf2"
   [(set (match_operand:V4BF 0 "register_operand")
 	  (float_truncate:V4BF
 	    (match_operand:V4SF 1 "nonimmediate_operand")))]
-  "TARGET_SSSE3"
+  "TARGET_SSSE3 && !HONOR_NANS (BFmode) && !flag_rounding_math
+   && (flag_unsafe_math_optimizations
+       || TARGET_AVXNECONVERT
+       || (TARGET_AVX512BF16 && TARGET_AVX512VL))"
 {
   if (!TARGET_AVXNECONVERT
       && !(TARGET_AVX512BF16 && TARGET_AVX512VL))
@@ -31088,7 +31091,10 @@  (define_expand "truncv8sfv8bf2"
   [(set (match_operand:V8BF 0 "register_operand")
 	(float_truncate:V8BF
 	  (match_operand:V8SF 1 "nonimmediate_operand")))]
-  "TARGET_AVX2"
+  "TARGET_AVX2 && !HONOR_NANS (BFmode) && !flag_rounding_math
+   && (flag_unsafe_math_optimizations
+       || TARGET_AVXNECONVERT
+       || (TARGET_AVX512BF16 && TARGET_AVX512VL))"
 {
   if (!TARGET_AVXNECONVERT
       && !(TARGET_AVX512BF16 && TARGET_AVX512VL))
@@ -31114,7 +31120,9 @@  (define_expand "truncv16sfv16bf2"
   [(set (match_operand:V16BF 0 "register_operand")
 	(float_truncate:V16BF
 	  (match_operand:V16SF 1 "nonimmediate_operand")))]
-  "TARGET_AVX512BW && TARGET_EVEX512"
+  "TARGET_AVX512BW && TARGET_EVEX512
+   && !HONOR_NANS (BFmode) && !flag_rounding_math
+   && (flag_unsafe_math_optimizations || TARGET_AVX512BF16)"
 {
   if (!TARGET_AVX512BF16)
     {
@@ -31127,7 +31135,7 @@  (define_expand "extend<sf_cvt_bf16_lower><mode>2"
   [(set (match_operand:VF1_AVX512BW 0 "register_operand")
 	(float_extend:VF1_AVX512BW
 	  (match_operand:<sf_cvt_bf16> 1 "nonimmediate_operand")))]
-  "TARGET_SSE2"
+  "TARGET_SSE2 && !HONOR_NANS (BFmode)"
 {
   ix86_expand_vector_bf2sf_with_vec_perm (operands[0], operands[1]);
   DONE;
diff --git a/gcc/testsuite/gcc.target/i386/avx512bf16-truncsfbf.c b/gcc/testsuite/gcc.target/i386/avx512bf16-truncsfbf.c
index da31bdba21b..1b4b62f1060 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bf16-truncsfbf.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bf16-truncsfbf.c
@@ -1,5 +1,5 @@ 
 /* { dg-do compile } */
-/* { dg-options "-mavx512vl -mavx512bf16 -O2" } */
+/* { dg-options "-mavx512vl -mavx512bf16 -O2 -ffast-math" } */
 /* { dg-final { scan-assembler-times {(?n)vcvtneps2bf16} 6 } } */
 
 #include "avx512bw-truncsfbf.c"
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-extendbf2sf.c b/gcc/testsuite/gcc.target/i386/avx512bw-extendbf2sf.c
index 5b59958151f..e7c65b7ee01 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bw-extendbf2sf.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-extendbf2sf.c
@@ -1,5 +1,5 @@ 
 /* { dg-do compile } */
-/* { dg-options "-mavx512bw -mavx512vl -O2" } */
+/* { dg-options "-mavx512bw -mavx512vl -O2 -ffast-math" } */
 /* { dg-final { scan-assembler-times {(?n)(?:vpermi2w|vpunpcklwd)} 6 } } */
 
 typedef float v4sf __attribute__((vector_size(16)));
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-truncsfbf.c b/gcc/testsuite/gcc.target/i386/avx512bw-truncsfbf.c
index 071db21cfb3..40802d865df 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bw-truncsfbf.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-truncsfbf.c
@@ -1,5 +1,5 @@ 
 /* { dg-do compile } */
-/* { dg-options "-mavx512bw -mavx512vl -mno-avx512bf16 -mno-avxneconvert -O2" } */
+/* { dg-options "-mavx512bw -mavx512vl -mno-avx512bf16 -mno-avxneconvert -O2 -ffast-math" } */
 /* { dg-final { scan-assembler-times {(?n)(?:vpermw|vpshufb)} 6 } } */
 
 typedef float v4sf __attribute__((vector_size(16)));
diff --git a/gcc/testsuite/gcc.target/i386/sse2-extendbf2sf.c b/gcc/testsuite/gcc.target/i386/sse2-extendbf2sf.c
index 0f007df68f6..d7f77acd603 100644
--- a/gcc/testsuite/gcc.target/i386/sse2-extendbf2sf.c
+++ b/gcc/testsuite/gcc.target/i386/sse2-extendbf2sf.c
@@ -1,5 +1,5 @@ 
 /* { dg-do compile } */
-/* { dg-options "-msse2 -O2" } */
+/* { dg-options "-msse2 -O2 -ffast-math" } */
 /* { dg-final { scan-assembler-times {(?n)(?:vpermi2w|punpcklwd)} 2 { target { ! ia32 } } } } */
 
 typedef float v2sf __attribute__((vector_size(8)));
diff --git a/gcc/testsuite/gcc.target/i386/ssse3-truncsfbf.c b/gcc/testsuite/gcc.target/i386/ssse3-truncsfbf.c
index 70840c537f1..af92f4d0bef 100644
--- a/gcc/testsuite/gcc.target/i386/ssse3-truncsfbf.c
+++ b/gcc/testsuite/gcc.target/i386/ssse3-truncsfbf.c
@@ -1,5 +1,5 @@ 
 /* { dg-do compile } */
-/* { dg-options "-mssse3 -mno-avx512bf16 -mno-avxneconvert -O2" } */
+/* { dg-options "-mssse3 -mno-avx512bf16 -mno-avxneconvert -O2 -ffast-math" } */
 /* { dg-final { scan-assembler-times {(?n)pshufb} 2 { target { ! ia32 } } } } */
 
 typedef float v2sf __attribute__((vector_size(8)));