From f78d9f2595c315b6343adc4c3b79b6596c45c65b Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Fri, 21 May 2021 09:48:18 +0800
Subject: [PATCH 1/2] [i386] Fold blendv builtins into gimple.

Fold __builtin_ia32_pblendvb128 (a, b, c) into VEC_COND_EXPR (c < 0, b, a);
the float variants are folded the same way, except that the mask operand is
first VIEW_CONVERT_EXPRed to a same-sized integer vector type.
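
For illustration (not part of the patch; SSA names and dump syntax are
approximate), the folded GIMPLE looks like:

  x = __builtin_ia32_pblendvb128 (a, b, c);
    becomes
  mask_1 = c < { 0, ..., 0 };
  x = VEC_COND_EXPR <mask_1, b, a>;

  y = __builtin_ia32_blendvps (a, b, c);
    becomes
  c.0_2 = VIEW_CONVERT_EXPR<vector(4) int>(c);
  mask_3 = c.0_2 < { 0, 0, 0, 0 };
  y = VEC_COND_EXPR <mask_3, b, a>;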

After the folding, the blendv-related patterns can be redefined as vec_merge,
since all elements of the mask operand are now either const0_rtx or
constm1_rtx.  This potentially enables more RTL optimizations.
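
Concretely (a sketch of the RTL shape, matching the new patterns below),
where the old define_insns matched

  (unspec:M [op1 op2 op3] UNSPEC_BLENDV)

the redefined ones match

  (vec_merge:M op2 op1 op3)

i.e. op1 and op2 swap places: a result element is taken from op2 where the
corresponding mask element is all-ones and from op1 where it is zero.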

Besides, although there are no pblendv{d,q} instructions, the backend can
still define patterns for them and emit blendv{ps,pd} instead.
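
For example (a sketch, not part of the patch; the actual code generation
also depends on cost tuning), with -O3 -msse4.1 the vectorized select below
can now be expanded through the new sse4_1_blendvd expander and emit
blendvps, even though no pblendvd instruction exists:

  void
  sel (int *__restrict r, const int *a, const int *b, const int *m)
  {
    for (int i = 0; i < 1024; i++)
      r[i] = m[i] < 0 ? b[i] : a[i];
  }
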
gcc/ChangeLog:
* config/i386/i386-builtin.def (IX86_BUILTIN_BLENDVPD256,
IX86_BUILTIN_BLENDVPS256, IX86_BUILTIN_PBLENDVB256,
IX86_BUILTIN_BLENDVPD, IX86_BUILTIN_BLENDVPS,
IX86_BUILTIN_PBLENDVB128): Replace icode with
CODE_FOR_nothing.
* config/i386/i386-expand.c (ix86_expand_sse_movcc): Use
gen_avx_blendvd256/gen_avx_blendvq256/gen_sse4_1_blendvd/gen_sse4_1_blendvq
for V8SI/V4DI/V4SI/V2DImode.
* config/i386/i386.c (ix86_gimple_fold_builtin): Fold blendv
builtins.
* config/i386/mmx.md (mmx_blendvps): Change to define_expand.
(*mmx_blendvps): New pattern implemented as vec_merge.
(mmx_pblendvb): Reimplement as vec_merge instead of UNSPEC_BLENDV.
* config/i386/sse.md
(<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>): Change to
define_expand.
(<sse4_1_avx2>_pblendvb): Ditto.
(*<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>): New pattern
implemented as vec_merge.
(*<sse4_1_avx2>_pblendvb): Ditto.
(*<sse4_1_avx2>_pblendvb_lt): Redefine as define_insn with the
pattern implemented as vec_merge instead of UNSPEC_BLENDV.
(*<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>_lt): Ditto,
and extend the mode to V48_AVX.
(*<sse4_1_avx2>_pblendvb_not_lt): New.
(*<sse4_1>_blendv<ssefltmodesuffix><avxsizesuffix>_ltint): Delete.
(ssefltmodesuffix, ssefltvecmode): Delete.
(V48_AVX): New mode iterator.
(ssefloatvecmode, fblendvsuffix): New mode attributes.

gcc/testsuite/ChangeLog:
* gcc.target/i386/funcspec-8.c: Replace
__builtin_ia32_blendvpd with __builtin_ia32_roundps_az.
* gcc.target/i386/blendv-1.c: New test.
* gcc.target/i386/blendv-2.c: New test.
---
gcc/config/i386/i386-builtin.def | 12 +-
gcc/config/i386/i386-expand.c | 22 +-
gcc/config/i386/i386.c | 37 ++++
gcc/config/i386/mmx.md | 38 +++-
gcc/config/i386/sse.md | 227 +++++++++++----------
gcc/testsuite/gcc.target/i386/blendv-1.c | 51 +++++
gcc/testsuite/gcc.target/i386/blendv-2.c | 41 ++++
gcc/testsuite/gcc.target/i386/funcspec-8.c | 16 +-
8 files changed, 303 insertions(+), 141 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/blendv-1.c
create mode 100644 gcc/testsuite/gcc.target/i386/blendv-2.c
@@ -902,13 +902,13 @@ BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, CODE_FOR_ssse3_palignrdi,
/* SSE4.1 */
BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_blendpd, "__builtin_ia32_blendpd", IX86_BUILTIN_BLENDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT)
BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_blendps, "__builtin_ia32_blendps", IX86_BUILTIN_BLENDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT)
-BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_blendvpd, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF)
-BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_blendvps, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF)
+BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_nothing, "__builtin_ia32_blendvpd", IX86_BUILTIN_BLENDVPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DF)
+BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_nothing, "__builtin_ia32_blendvps", IX86_BUILTIN_BLENDVPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SF)
BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_dppd, "__builtin_ia32_dppd", IX86_BUILTIN_DPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT)
BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT)
BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_insertps, "__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT)
BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_mpsadbw, "__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_INT)
-BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_pblendvb, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI)
+BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_nothing, "__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI_V16QI)
BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_pblendw, "__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI_INT)
BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, "__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) V8HI_FTYPE_V16QI)
@@ -1028,8 +1028,8 @@ BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpe
BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT)
BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT)
-BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF)
-BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF)
+BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF)
+BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_nothing, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF)
BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT)
BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT)
BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT)
@@ -1154,7 +1154,7 @@ BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_andv4di3, "__builtin_ia32_andsi256", IX
BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_andnotv4di3, "__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI)
BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_uavgv32qi3, "__builtin_ia32_pavgb256", IX86_BUILTIN_PAVGB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI)
BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_uavgv16hi3, "__builtin_ia32_pavgw256", IX86_BUILTIN_PAVGW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI)
-BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_pblendvb, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI)
+BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_nothing, "__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI_V32QI)
BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_pblendw, "__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI_INT)
BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_nothing, "__builtin_ia32_pcmpeqb256", IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI)
BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_nothing, "__builtin_ia32_pcmpeqw256", IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI)
@@ -3700,6 +3700,16 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
if (TARGET_SSE4_1)
gen = gen_sse4_1_blendvpd;
break;
+ /* Although x86 does not have pblendv{d,q} instructions, the backend
+    can still define their patterns and generate blendv{ps,pd} instead.  */
+ case E_V4SImode:
+ if (TARGET_SSE4_1)
+ gen = gen_sse4_1_blendvd;
+ break;
+ case E_V2DImode:
+ if (TARGET_SSE4_1)
+ gen = gen_sse4_1_blendvq;
+ break;
case E_SFmode:
if (TARGET_SSE4_1)
{
@@ -3731,8 +3741,6 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
break;
case E_V16QImode:
case E_V8HImode:
- case E_V4SImode:
- case E_V2DImode:
if (TARGET_SSE4_1)
{
gen = gen_sse4_1_pblendvb;
@@ -3743,6 +3751,14 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
cmp = gen_lowpart (V16QImode, cmp);
}
break;
+ case E_V8SImode:
+ if (TARGET_AVX)
+ gen = gen_avx_blendvd256;
+ break;
+ case E_V4DImode:
+ if (TARGET_AVX)
+ gen = gen_avx_blendvq256;
+ break;
case E_V8SFmode:
if (TARGET_AVX)
gen = gen_avx_blendvps256;
@@ -3753,8 +3769,6 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
break;
case E_V32QImode:
case E_V16HImode:
- case E_V8SImode:
- case E_V4DImode:
if (TARGET_AVX2)
{
gen = gen_avx2_pblendvb;
@@ -17966,6 +17966,43 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
}
break;
+ case IX86_BUILTIN_PBLENDVB128:
+ case IX86_BUILTIN_PBLENDVB256:
+ case IX86_BUILTIN_BLENDVPS:
+ case IX86_BUILTIN_BLENDVPD:
+ case IX86_BUILTIN_BLENDVPS256:
+ case IX86_BUILTIN_BLENDVPD256:
+ gcc_assert (n_args == 3);
+ arg0 = gimple_call_arg (stmt, 0);
+ arg1 = gimple_call_arg (stmt, 1);
+ arg2 = gimple_call_arg (stmt, 2);
+ if (gimple_call_lhs (stmt))
+ {
+ location_t loc = gimple_location (stmt);
+ tree type = TREE_TYPE (arg2);
+ gimple_seq stmts = NULL;
+ if (VECTOR_FLOAT_TYPE_P (type))
+ {
+ tree itype = GET_MODE_INNER (TYPE_MODE (type)) == E_SFmode
+ ? intSI_type_node : intDI_type_node;
+ type = get_same_sized_vectype (itype, type);
+ arg2 = gimple_build (&stmts, VIEW_CONVERT_EXPR, type, arg2);
+ }
+ tree zero_vec = build_zero_cst (type);
+ tree cmp_type = truth_type_for (type);
+ tree cmp = gimple_build (&stmts, LT_EXPR, cmp_type, arg2, zero_vec);
+ gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
+ gimple *g = gimple_build_assign (gimple_call_lhs (stmt),
+ VEC_COND_EXPR, cmp,
+ arg1, arg0);
+ gimple_set_location (g, loc);
+ gsi_replace (gsi, g, false);
+ }
+ else
+ gsi_replace (gsi, gimple_build_nop (), false);
+ return true;
+
+
case IX86_BUILTIN_PCMPEQB128:
case IX86_BUILTIN_PCMPEQW128:
case IX86_BUILTIN_PCMPEQD128:
@@ -862,13 +862,30 @@ (define_expand "vcond<mode>v2sf"
DONE;
})
-(define_insn "mmx_blendvps"
- [(set (match_operand:V2SF 0 "register_operand" "=Yr,*x,x")
+;; NB: This expander should only be used when all elements
+;; of operands[3] are either const0_rtx or constm1_rtx.
+(define_expand "mmx_blendvps"
+ [(set (match_operand:V2SF 0 "register_operand")
(unspec:V2SF
- [(match_operand:V2SF 1 "register_operand" "0,0,x")
- (match_operand:V2SF 2 "register_operand" "Yr,*x,x")
- (match_operand:V2SF 3 "register_operand" "Yz,Yz,x")]
- UNSPEC_BLENDV))]
+ [(match_operand:V2SF 1 "register_operand")
+ (match_operand:V2SF 2 "register_operand")
+ (match_operand:V2SF 3 "register_operand")]
+ UNSPEC_BLENDV))]
+ "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
+{
+ operands[3] = gen_lowpart (V2SImode, operands[3]);
+ rtx tmp = gen_rtx_VEC_MERGE (V2SFmode, operands[2],
+ operands[1], operands[3]);
+ emit_move_insn (operands[0], tmp);
+ DONE;
+})
+
+(define_insn "*mmx_blendvps"
+ [(set (match_operand:V2SF 0 "register_operand" "=Yr,*x,x")
+ (vec_merge:V2SF
+ (match_operand:V2SF 2 "register_operand" "Yr,*x,x")
+ (match_operand:V2SF 1 "register_operand" "0,0,x")
+ (match_operand:V2SI 3 "register_operand" "Yz,Yz,x")))]
"TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
"@
blendvps\t{%3, %2, %0|%0, %2, %3}
@@ -1935,11 +1952,10 @@ (define_expand "vcond_mask_<mode><mmxintvecmodelower>"
(define_insn "mmx_pblendvb"
[(set (match_operand:V8QI 0 "register_operand" "=Yr,*x,x")
- (unspec:V8QI
- [(match_operand:V8QI 1 "register_operand" "0,0,x")
- (match_operand:V8QI 2 "register_operand" "Yr,*x,x")
- (match_operand:V8QI 3 "register_operand" "Yz,Yz,x")]
- UNSPEC_BLENDV))]
+ (vec_merge:V8QI
+ (match_operand:V8QI 2 "register_operand" "Yr,*x,x")
+ (match_operand:V8QI 1 "register_operand" "0,0,x")
+ (match_operand:V8QI 3 "register_operand" "Yz,Yz,x")))]
"TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
"@
pblendvb\t{%3, %2, %0|%0, %2, %3}
@@ -547,6 +547,11 @@ (define_mode_iterator V48_AVX2
(V4SI "TARGET_AVX2") (V2DI "TARGET_AVX2")
(V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2")])
+(define_mode_iterator V48_AVX
+ [V4SF V2DF V4SI V2DI
+ (V8SF "TARGET_AVX") (V4DF "TARGET_AVX")
+ (V8SI "TARGET_AVX") (V4DI "TARGET_AVX")])
+
(define_mode_iterator VI1_AVX512VLBW
[(V64QI "TARGET_AVX512BW") (V32QI "TARGET_AVX512VL")
(V16QI "TARGET_AVX512VL")])
@@ -796,6 +801,14 @@ (define_mode_attr sseintvecmode
(V32HI "V32HI") (V64QI "V64QI")
(V32QI "V32QI") (V16QI "V16QI")])
+(define_mode_attr ssefloatvecmode
+ [(V16SF "V16SF") (V8DF "V8DF")
+ (V8SF "V8SF") (V4DF "V4DF")
+ (V4SF "V4SF") (V2DF "V2DF")
+ (V16SI "V16SF") (V8DI "V8DF")
+ (V8SI "V8SF") (V4DI "V4DF")
+ (V4SI "V4SF") (V2DI "V2DF")])
+
(define_mode_attr sseintvecmode2
[(V8DF "XI") (V4DF "OI") (V2DF "TI")
(V8SF "OI") (V4SF "TI")])
@@ -17637,26 +17650,50 @@ (define_insn "<sse4_1>_blend<ssemodesuffix><avxsizesuffix>"
(set_attr "prefix" "orig,orig,vex")
(set_attr "mode" "<MODE>")])
-(define_insn "<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>"
- [(set (match_operand:VF_128_256 0 "register_operand" "=Yr,*x,x")
- (unspec:VF_128_256
- [(match_operand:VF_128_256 1 "register_operand" "0,0,x")
- (match_operand:VF_128_256 2 "vector_operand" "YrBm,*xBm,xm")
- (match_operand:VF_128_256 3 "register_operand" "Yz,Yz,x")]
+;; NB: This expander should only be used when all elements
+;; of operands[3] are either const0_rtx or constm1_rtx.
+(define_expand "<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>"
+ [(set (match_operand:V48_AVX 0 "register_operand")
+ (unspec:V48_AVX
+ [(match_operand:V48_AVX 1 "register_operand")
+ (match_operand:V48_AVX 2 "vector_operand")
+ (match_operand:V48_AVX 3 "register_operand")]
UNSPEC_BLENDV))]
"TARGET_SSE4_1"
+{
+ if (FLOAT_MODE_P (<MODE>mode))
+ operands[3] = gen_lowpart (<sseintvecmode>mode, operands[3]);
+ rtx tmp = gen_rtx_VEC_MERGE (<MODE>mode, operands[2],
+ operands[1], operands[3]);
+ emit_move_insn (operands[0], tmp);
+ DONE;
+})
+
+(define_mode_attr fblendvsuffix
+ [(V4SF "ps") (V2DF "pd")
+ (V8SF "ps") (V4DF "pd")
+ (V4SI "ps") (V2DI "pd")
+ (V8SI "ps") (V4DI "pd")])
+
+(define_insn "*<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>"
+ [(set (match_operand:V48_AVX 0 "register_operand" "=Yr,*x,x")
+ (vec_merge:V48_AVX
+ (match_operand:V48_AVX 2 "vector_operand" "YrBm,*xBm,xm")
+ (match_operand:V48_AVX 1 "register_operand" "0,0,x")
+ (match_operand:<sseintvecmode> 3 "register_operand" "Yz,Yz,x")))]
+ "TARGET_SSE4_1"
"@
- blendv<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3}
- blendv<ssemodesuffix>\t{%3, %2, %0|%0, %2, %3}
- vblendv<ssemodesuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+ blendv<fblendvsuffix>\t{%3, %2, %0|%0, %2, %3}
+ blendv<fblendvsuffix>\t{%3, %2, %0|%0, %2, %3}
+ vblendv<fblendvsuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
[(set_attr "isa" "noavx,noavx,avx")
(set_attr "type" "ssemov")
(set_attr "length_immediate" "1")
(set_attr "prefix_data16" "1,1,*")
(set_attr "prefix_extra" "1")
(set_attr "prefix" "orig,orig,vex")
- (set_attr "btver2_decode" "vector,vector,vector")
- (set_attr "mode" "<MODE>")])
+ (set_attr "btver2_decode" "vector,vector,vector")
+ (set_attr "mode" "<ssefloatvecmode>")])
;; Also define scalar versions. These are used for conditional move.
;; Using subregs into vector modes causes register allocation lossage.
@@ -17698,67 +17735,27 @@ (define_insn "sse4_1_blendv<ssemodesuffix>"
]
(const_string "<ssevecmode>")))])
-(define_insn_and_split "*<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>_lt"
- [(set (match_operand:VF_128_256 0 "register_operand" "=Yr,*x,x")
- (unspec:VF_128_256
- [(match_operand:VF_128_256 1 "register_operand" "0,0,x")
- (match_operand:VF_128_256 2 "vector_operand" "YrBm,*xBm,xm")
- (lt:VF_128_256
- (match_operand:<sseintvecmode> 3 "register_operand" "Yz,Yz,x")
- (match_operand:<sseintvecmode> 4 "const0_operand" "C,C,C"))]
- UNSPEC_BLENDV))]
+(define_insn "*<sse4_1>_blendv<ssemodesuffix><avxsizesuffix>_lt"
+ [(set (match_operand:V48_AVX 0 "register_operand" "=Yr,*x,x")
+ (vec_merge:V48_AVX
+ (match_operand:V48_AVX 2 "vector_operand" "YrBm,*xBm,xm")
+ (match_operand:V48_AVX 1 "register_operand" "0,0,x")
+ (lt:<sseintvecmode>
+ (match_operand:<sseintvecmode> 3 "register_operand" "Yz,Yz,x")
+ (match_operand:<sseintvecmode> 4 "const0_operand" "C,C,C"))))]
"TARGET_SSE4_1"
- "#"
- "&& reload_completed"
- [(set (match_dup 0)
- (unspec:VF_128_256
- [(match_dup 1) (match_dup 2) (match_dup 3)] UNSPEC_BLENDV))]
- "operands[3] = gen_lowpart (<MODE>mode, operands[3]);"
- [(set_attr "isa" "noavx,noavx,avx")
- (set_attr "type" "ssemov")
- (set_attr "length_immediate" "1")
- (set_attr "prefix_data16" "1,1,*")
- (set_attr "prefix_extra" "1")
- (set_attr "prefix" "orig,orig,vex")
- (set_attr "btver2_decode" "vector,vector,vector")
- (set_attr "mode" "<MODE>")])
-
-(define_mode_attr ssefltmodesuffix
- [(V2DI "pd") (V4DI "pd") (V4SI "ps") (V8SI "ps")])
-
-(define_mode_attr ssefltvecmode
- [(V2DI "V2DF") (V4DI "V4DF") (V4SI "V4SF") (V8SI "V8SF")])
-
-(define_insn_and_split "*<sse4_1>_blendv<ssefltmodesuffix><avxsizesuffix>_ltint"
- [(set (match_operand:<ssebytemode> 0 "register_operand" "=Yr,*x,x")
- (unspec:<ssebytemode>
- [(match_operand:<ssebytemode> 1 "register_operand" "0,0,x")
- (match_operand:<ssebytemode> 2 "vector_operand" "YrBm,*xBm,xm")
- (subreg:<ssebytemode>
- (lt:VI48_AVX
- (match_operand:VI48_AVX 3 "register_operand" "Yz,Yz,x")
- (match_operand:VI48_AVX 4 "const0_operand" "C,C,C")) 0)]
- UNSPEC_BLENDV))]
- "TARGET_SSE4_1"
- "#"
- "&& reload_completed"
- [(set (match_dup 0)
- (unspec:<ssefltvecmode>
- [(match_dup 1) (match_dup 2) (match_dup 3)] UNSPEC_BLENDV))]
-{
- operands[0] = gen_lowpart (<ssefltvecmode>mode, operands[0]);
- operands[1] = gen_lowpart (<ssefltvecmode>mode, operands[1]);
- operands[2] = gen_lowpart (<ssefltvecmode>mode, operands[2]);
- operands[3] = gen_lowpart (<ssefltvecmode>mode, operands[3]);
-}
+ "@
+ blendv<fblendvsuffix>\t{%3, %2, %0|%0, %2, %3}
+ blendv<fblendvsuffix>\t{%3, %2, %0|%0, %2, %3}
+ vblendv<fblendvsuffix>\t{%3, %2, %1, %0|%0, %1, %2, %3}"
[(set_attr "isa" "noavx,noavx,avx")
(set_attr "type" "ssemov")
(set_attr "length_immediate" "1")
(set_attr "prefix_data16" "1,1,*")
(set_attr "prefix_extra" "1")
(set_attr "prefix" "orig,orig,vex")
- (set_attr "btver2_decode" "vector,vector,vector")
- (set_attr "mode" "<ssefltvecmode>")])
+ (set_attr "btver2_decode" "vector,vector,vector")
+ (set_attr "mode" "<ssefloatvecmode>")])
(define_insn "<sse4_1>_dp<ssemodesuffix><avxsizesuffix>"
[(set (match_operand:VF_128_256 0 "register_operand" "=Yr,*x,x")
@@ -17837,14 +17834,30 @@ (define_insn "<sse4_1_avx2>_packusdw<mask_name>"
(set_attr "prefix" "orig,orig,<mask_prefix>")
(set_attr "mode" "<sseinsnmode>")])
-(define_insn "<sse4_1_avx2>_pblendvb"
- [(set (match_operand:VI1_AVX2 0 "register_operand" "=Yr,*x,x")
+;; NB: This expander should only be used when all elements
+;; of operands[3] are either const0_rtx or constm1_rtx.
+(define_expand "<sse4_1_avx2>_pblendvb"
+ [(set (match_operand:VI1_AVX2 0 "register_operand")
(unspec:VI1_AVX2
- [(match_operand:VI1_AVX2 1 "register_operand" "0,0,x")
- (match_operand:VI1_AVX2 2 "vector_operand" "YrBm,*xBm,xm")
- (match_operand:VI1_AVX2 3 "register_operand" "Yz,Yz,x")]
+ [(match_operand:VI1_AVX2 1 "register_operand")
+ (match_operand:VI1_AVX2 2 "vector_operand")
+ (match_operand:VI1_AVX2 3 "register_operand")]
UNSPEC_BLENDV))]
"TARGET_SSE4_1"
+{
+ rtx tmp = gen_rtx_VEC_MERGE (<MODE>mode, operands[2],
+ operands[1], operands[3]);
+ emit_move_insn (operands[0], tmp);
+ DONE;
+})
+
+(define_insn "*<sse4_1_avx2>_pblendvb"
+ [(set (match_operand:VI1_AVX2 0 "register_operand" "=Yr,*x,x")
+ (vec_merge:VI1_AVX2
+ (match_operand:VI1_AVX2 2 "vector_operand" "YrBm,*xBm,xm")
+ (match_operand:VI1_AVX2 1 "register_operand" "0,0,x")
+ (match_operand:VI1_AVX2 3 "register_operand" "Yz,Yz,x")))]
+ "TARGET_SSE4_1"
"@
pblendvb\t{%3, %2, %0|%0, %2, %3}
pblendvb\t{%3, %2, %0|%0, %2, %3}
@@ -17857,50 +17870,19 @@ (define_insn "<sse4_1_avx2>_pblendvb"
(set_attr "btver2_decode" "vector,vector,vector")
(set_attr "mode" "<sseinsnmode>")])
-(define_split
- [(set (match_operand:VI1_AVX2 0 "register_operand")
- (unspec:VI1_AVX2
- [(match_operand:VI1_AVX2 1 "vector_operand")
- (match_operand:VI1_AVX2 2 "register_operand")
- (not:VI1_AVX2 (match_operand:VI1_AVX2 3 "register_operand"))]
- UNSPEC_BLENDV))]
- "TARGET_SSE4_1"
- [(set (match_dup 0)
- (unspec:VI1_AVX2
- [(match_dup 2) (match_dup 1) (match_dup 3)]
- UNSPEC_BLENDV))])
-
-(define_split
- [(set (match_operand:VI1_AVX2 0 "register_operand")
- (unspec:VI1_AVX2
- [(match_operand:VI1_AVX2 1 "vector_operand")
- (match_operand:VI1_AVX2 2 "register_operand")
- (subreg:VI1_AVX2 (not (match_operand 3 "register_operand")) 0)]
- UNSPEC_BLENDV))]
- "TARGET_SSE4_1
- && GET_MODE_CLASS (GET_MODE (operands[3])) == MODE_VECTOR_INT
- && GET_MODE_SIZE (GET_MODE (operands[3])) == <MODE_SIZE>"
- [(set (match_dup 0)
- (unspec:VI1_AVX2
- [(match_dup 2) (match_dup 1) (match_dup 4)]
- UNSPEC_BLENDV))]
- "operands[4] = gen_lowpart (<MODE>mode, operands[3]);")
-
-(define_insn_and_split "*<sse4_1_avx2>_pblendvb_lt"
+(define_insn "*<sse4_1_avx2>_pblendvb_lt"
[(set (match_operand:VI1_AVX2 0 "register_operand" "=Yr,*x,x")
- (unspec:VI1_AVX2
- [(match_operand:VI1_AVX2 1 "register_operand" "0,0,x")
- (match_operand:VI1_AVX2 2 "vector_operand" "YrBm,*xBm,xm")
- (lt:VI1_AVX2 (match_operand:VI1_AVX2 3 "register_operand" "Yz,Yz,x")
- (match_operand:VI1_AVX2 4 "const0_operand" "C,C,C"))]
- UNSPEC_BLENDV))]
+ (vec_merge:VI1_AVX2
+ (match_operand:VI1_AVX2 2 "vector_operand" "YrBm,*xBm,xm")
+ (match_operand:VI1_AVX2 1 "register_operand" "0,0,x")
+ (lt:VI1_AVX2
+ (match_operand:VI1_AVX2 3 "register_operand" "Yz,Yz,x")
+ (match_operand:VI1_AVX2 4 "const0_operand" "C,C,C"))))]
"TARGET_SSE4_1"
- "#"
- ""
- [(set (match_dup 0)
- (unspec:VI1_AVX2
- [(match_dup 1) (match_dup 2) (match_dup 3)] UNSPEC_BLENDV))]
- ""
+ "@
+ pblendvb\t{%3, %2, %0|%0, %2, %3}
+ pblendvb\t{%3, %2, %0|%0, %2, %3}
+ vpblendvb\t{%3, %2, %1, %0|%0, %1, %2, %3}"
[(set_attr "isa" "noavx,noavx,avx")
(set_attr "type" "ssemov")
(set_attr "prefix_extra" "1")
@@ -17909,6 +17891,27 @@ (define_insn_and_split "*<sse4_1_avx2>_pblendvb_lt"
(set_attr "btver2_decode" "vector,vector,vector")
(set_attr "mode" "<sseinsnmode>")])
+(define_insn_and_split "*<sse4_1_avx2>_pblendvb_not_lt"
+ [(set (match_operand:VI1_AVX2 0 "register_operand")
+ (vec_merge:VI1_AVX2
+ (match_operand:VI1_AVX2 2 "register_operand")
+ (match_operand:VI1_AVX2 1 "vector_operand")
+ (lt:VI1_AVX2
+ (subreg:VI1_AVX2 (not (match_operand 3 "register_operand")) 0)
+ (match_operand:VI1_AVX2 4 "const0_operand"))))]
+ "TARGET_SSE4_1 && ix86_pre_reload_split ()
+ && GET_MODE_CLASS (GET_MODE (operands[3])) == MODE_VECTOR_INT
+ && GET_MODE_SIZE (GET_MODE (operands[3])) == GET_MODE_SIZE (<MODE>mode)"
+ "#"
+ "&& 1"
+ [(set (match_dup 0)
+ (vec_merge:VI1_AVX2
+ (match_dup 1)
+ (match_dup 2)
+ (lt:VI1_AVX2
+ (subreg:VI1_AVX2 (match_dup 3) 0)
+ (match_dup 4))))])
+
(define_insn "sse4_1_pblendw"
[(set (match_operand:V8HI 0 "register_operand" "=Yr,*x,x")
(vec_merge:V8HI
new file mode 100644
@@ -0,0 +1,51 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx2 -O2 -mno-avx512f" } */
+/* { dg-final { scan-assembler-times {pblendvb[\t ]*%xmm} 1 } } */
+/* { dg-final { scan-assembler-times {pblendvb[\t ]*%ymm} 1 } } */
+/* { dg-final { scan-assembler-times {blendvps[\t ]*%xmm} 1 } } */
+/* { dg-final { scan-assembler-times {blendvps[\t ]*%ymm} 1 } } */
+/* { dg-final { scan-assembler-times {blendvpd[\t ]*%xmm} 1 } } */
+/* { dg-final { scan-assembler-times {blendvpd[\t ]*%ymm} 1 } } */
+
+typedef float v4sf __attribute__ ((vector_size (16)));
+typedef float v8sf __attribute__ ((vector_size (32)));
+typedef double v2df __attribute__ ((vector_size (16)));
+typedef double v4df __attribute__ ((vector_size (32)));
+typedef char v16qi __attribute__ ((vector_size (16)));
+typedef char v32qi __attribute__ ((vector_size (32)));
+
+v4sf
+foo (v4sf a, v4sf b, v4sf c)
+{
+ return __builtin_ia32_blendvps (a, b, c);
+}
+
+v8sf
+foo2 (v8sf a, v8sf b, v8sf c)
+{
+ return __builtin_ia32_blendvps256 (a, b, c);
+}
+
+v2df
+foo3 (v2df a, v2df b, v2df c)
+{
+ return __builtin_ia32_blendvpd (a, b, c);
+}
+
+v4df
+foo4 (v4df a, v4df b, v4df c)
+{
+ return __builtin_ia32_blendvpd256 (a, b, c);
+}
+
+v16qi
+foo5 (v16qi a, v16qi b, v16qi c)
+{
+ return __builtin_ia32_pblendvb128 (a, b, c);
+}
+
+v32qi
+foo6 (v32qi a, v32qi b, v32qi c)
+{
+ return __builtin_ia32_pblendvb256 (a, b, c);
+}
new file mode 100644
@@ -0,0 +1,41 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx2 -O2 -mno-avx512f" } */
+/* { dg-final { scan-assembler-not {pblendv} } } */
+/* { dg-final { scan-assembler-not {blendvp} } } */
+
+#include <x86intrin.h>
+__m128
+foo (__m128 a, __m128 b)
+{
+ return _mm_blendv_ps (a, b, _mm_setzero_ps ());
+}
+
+__m256
+foo2 (__m256 a, __m256 b)
+{
+ return _mm256_blendv_ps (a, b, _mm256_set1_ps (-1.0));
+}
+
+__m128d
+foo3 (__m128d a, __m128d b, __m128d c)
+{
+ return _mm_blendv_pd (a, b, _mm_set1_pd (1.0));
+}
+
+__m256d
+foo4 (__m256d a, __m256d b, __m256d c)
+{
+ return _mm256_blendv_pd (a, b, _mm256_set1_pd (-134.3));
+}
+
+__m128i
+foo5 (__m128i a, __m128i b, __m128i c)
+{
+ return _mm_blendv_epi8 (a, b, _mm_set1_epi8 (3));
+}
+
+__m256i
+foo6 (__m256i a, __m256i b, __m256i c)
+{
+ return _mm256_blendv_epi8 (a, b, _mm256_set1_epi8 (-22));
+}
@@ -52,19 +52,19 @@ generic_psignd128 (__m128w a, __m128w b)
#error "-msse4.1 should not be set for this test"
#endif
-__m128d sse4_1_blendvpd (__m128d a, __m128d b, __m128d c) __attribute__((__target__("sse4.1")));
-__m128d generic_blendvpd (__m128d a, __m128d b, __m128d c);
+__m128 sse4_1_roundv4sf2 (__m128 a) __attribute__((__target__("sse4.1")));
+__m128 generic_roundv4sf2 (__m128 a);
-__m128d
-sse4_1_blendvpd (__m128d a, __m128d b, __m128d c)
+__m128
+sse4_1_roundv4sf2 (__m128 a)
{
- return __builtin_ia32_blendvpd (a, b, c);
+ return __builtin_ia32_roundps_az (a);
}
-__m128d
-generic_blendvpd (__m128d a, __m128d b, __m128d c)
+__m128
+generic_roundv4sf2 (__m128 a)
{
- return __builtin_ia32_blendvpd (a, b, c); /* { dg-error "needs isa option" } */
+ return __builtin_ia32_roundps_az (a); /* { dg-error "needs isa option" } */
}
#ifdef __SSE4_2__
--
2.18.1