@@ -1,5 +1,24 @@
2012-06-25 Richard Henderson <rth@redhat.com>
+ * config/i386/i386-builtin-types.def (V4UDI, V8USI): New.
+ (V2UDI_FTYPE_V4USI_V4USI): New.
+ (V4UDI_FTYPE_V8USI_V8USI): New.
+ * config/i386/i386.c (ix86_expand_args_builtin): Handle them.
+ (IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V4SI): New.
+ (IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V8SI): New.
+ (IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V4SI): New.
+ (IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V8SI): New.
+ (IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V4SI): New.
+ (IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V8SI): New.
+ (bdesc_args): Add them.
+ (ix86_builtin_mul_widen_even, ix86_builtin_mul_widen_odd): New.
+ (TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_EVEN): New.
+ (TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_ODD): New.
+ (ix86_expand_mul_widen_evenodd): Use xop_pmacsdqh for the signed odd case.
+ * config/i386/sse.md (vec_widen_<s>mult_odd_<mode>): New.
+
+2012-06-25 Richard Henderson <rth@redhat.com>
+
* config/i386/sse.md (mul<VI4_AVX2:mode>3): Use xop_pmacsdd.
2012-06-25 Richard Henderson <rth@redhat.com>
@@ -97,7 +97,8 @@ DEF_VECTOR_TYPE (V4DI, DI)
DEF_VECTOR_TYPE (V8SI, SI)
DEF_VECTOR_TYPE (V16HI, HI)
DEF_VECTOR_TYPE (V32QI, QI)
-
+DEF_VECTOR_TYPE (V4UDI, UDI, V4DI)
+DEF_VECTOR_TYPE (V8USI, USI, V8SI)
DEF_POINTER_TYPE (PCCHAR, CHAR, CONST)
DEF_POINTER_TYPE (PCDOUBLE, DOUBLE, CONST)
@@ -283,6 +284,7 @@ DEF_FUNCTION_TYPE (V2DI, V2DI, SI)
DEF_FUNCTION_TYPE (V2DI, V2DI, V16QI)
DEF_FUNCTION_TYPE (V2DI, V2DI, V2DI)
DEF_FUNCTION_TYPE (V2DI, V4SI, V4SI)
+DEF_FUNCTION_TYPE (V2UDI, V4USI, V4USI)
DEF_FUNCTION_TYPE (V2DI, PCV2DI, V2DI)
DEF_FUNCTION_TYPE (V2SF, V2SF, V2SF)
DEF_FUNCTION_TYPE (V2SI, INT, INT)
@@ -349,6 +351,7 @@ DEF_FUNCTION_TYPE (V8SI, V8SI, SI)
DEF_FUNCTION_TYPE (V8SI, PCV8SI, V8SI)
DEF_FUNCTION_TYPE (V4DI, V4DI, V4DI)
DEF_FUNCTION_TYPE (V4DI, V8SI, V8SI)
+DEF_FUNCTION_TYPE (V4UDI, V8USI, V8USI)
DEF_FUNCTION_TYPE (V4DI, V4DI, V2DI)
DEF_FUNCTION_TYPE (V4DI, PCV4DI, V4DI)
DEF_FUNCTION_TYPE (V4DI, V4DI, INT)
@@ -25754,6 +25754,13 @@ enum ix86_builtins
IX86_BUILTIN_CPYSGNPS256,
IX86_BUILTIN_CPYSGNPD256,
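+ /* Even/odd widening multiplies exposed via the vectorizer hooks.  */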
+ IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V4SI,
+ IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V8SI,
+ IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V4SI,
+ IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V8SI,
+ IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V4SI,
+ IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V8SI,
+
/* FMA4 instructions. */
IX86_BUILTIN_VFMADDSS,
IX86_BUILTIN_VFMADDSD,
@@ -26612,6 +26619,8 @@ static const struct builtin_description bdesc_args[] =
{ OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv1siv1di3, "__builtin_ia32_pmuludq", IX86_BUILTIN_PMULUDQ, UNKNOWN, (int) V1DI_FTYPE_V2SI_V2SI },
{ OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv2siv2di3, "__builtin_ia32_pmuludq128", IX86_BUILTIN_PMULUDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
+ { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv2siv2di3, "__builtin_ia32_vw_umul_even_v4si", IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V4SI, UNKNOWN, (int) V2UDI_FTYPE_V4USI_V4USI },
+ { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_widen_umult_odd_v4si, "__builtin_ia32_vw_umul_odd_v4si", IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V4SI, UNKNOWN, (int) V2UDI_FTYPE_V4USI_V4USI },
{ OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, "__builtin_ia32_pmaddwd128", IX86_BUILTIN_PMADDWD128, UNKNOWN, (int) V4SI_FTYPE_V8HI_V8HI },
@@ -26738,6 +26747,7 @@ static const struct builtin_description bdesc_args[] =
{ OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv4si3, "__builtin_ia32_pminud128", IX86_BUILTIN_PMINUD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
{ OPTION_MASK_ISA_SSE4_1, CODE_FOR_uminv8hi3, "__builtin_ia32_pminuw128", IX86_BUILTIN_PMINUW128, UNKNOWN, (int) V8HI_FTYPE_V8HI_V8HI },
{ OPTION_MASK_ISA_SSE4_1, CODE_FOR_sse4_1_mulv2siv2di3, "__builtin_ia32_pmuldq128", IX86_BUILTIN_PMULDQ128, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
+ { OPTION_MASK_ISA_SSE4_1, CODE_FOR_vec_widen_smult_odd_v4si, "__builtin_ia32_vw_smul_odd_v4si", IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V4SI, UNKNOWN, (int) V2DI_FTYPE_V4SI_V4SI },
{ OPTION_MASK_ISA_SSE4_1, CODE_FOR_mulv4si3, "__builtin_ia32_pmulld128", IX86_BUILTIN_PMULLD128, UNKNOWN, (int) V4SI_FTYPE_V4SI_V4SI },
/* SSE4.1 */
@@ -27004,12 +27014,15 @@ static const struct builtin_description bdesc_args[] =
{ OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4hiv4di2 , "__builtin_ia32_pmovzxwq256", IX86_BUILTIN_PMOVZXWQ256, UNKNOWN, (int) V4DI_FTYPE_V8HI },
{ OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_zero_extendv4siv4di2 , "__builtin_ia32_pmovzxdq256", IX86_BUILTIN_PMOVZXDQ256, UNKNOWN, (int) V4DI_FTYPE_V4SI },
{ OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_mulv4siv4di3 , "__builtin_ia32_pmuldq256" , IX86_BUILTIN_PMULDQ256 , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
+ { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_smult_odd_v8si, "__builtin_ia32_vw_smul_odd_v8si", IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V8SI, UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
{ OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulhrswv16hi3 , "__builtin_ia32_pmulhrsw256", IX86_BUILTIN_PMULHRSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
{ OPTION_MASK_ISA_AVX2, CODE_FOR_umulv16hi3_highpart, "__builtin_ia32_pmulhuw256" , IX86_BUILTIN_PMULHUW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
{ OPTION_MASK_ISA_AVX2, CODE_FOR_smulv16hi3_highpart, "__builtin_ia32_pmulhw256" , IX86_BUILTIN_PMULHW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
{ OPTION_MASK_ISA_AVX2, CODE_FOR_mulv16hi3, "__builtin_ia32_pmullw256" , IX86_BUILTIN_PMULLW256 , UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI },
{ OPTION_MASK_ISA_AVX2, CODE_FOR_mulv8si3, "__builtin_ia32_pmulld256" , IX86_BUILTIN_PMULLD256 , UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
{ OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulv4siv4di3 , "__builtin_ia32_pmuludq256" , IX86_BUILTIN_PMULUDQ256 , UNKNOWN, (int) V4DI_FTYPE_V8SI_V8SI },
+ { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_umulv4siv4di3 , "__builtin_ia32_vw_umul_even_v8si" , IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V8SI, UNKNOWN, (int) V4UDI_FTYPE_V8USI_V8USI },
+ { OPTION_MASK_ISA_AVX2, CODE_FOR_vec_widen_umult_odd_v8si, "__builtin_ia32_vw_umul_odd_v8si", IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V8SI, UNKNOWN, (int) V4UDI_FTYPE_V8USI_V8USI },
{ OPTION_MASK_ISA_AVX2, CODE_FOR_iorv4di3, "__builtin_ia32_por256", IX86_BUILTIN_POR256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI },
{ OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_psadbw, "__builtin_ia32_psadbw256", IX86_BUILTIN_PSADBW256, UNKNOWN, (int) V16HI_FTYPE_V32QI_V32QI },
{ OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pshufbv32qi3, "__builtin_ia32_pshufb256", IX86_BUILTIN_PSHUFB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI },
@@ -29142,6 +29155,7 @@ ix86_expand_args_builtin (const struct builtin_description *d,
case V2DI_FTYPE_V2DI_V2DI:
case V2DI_FTYPE_V16QI_V16QI:
case V2DI_FTYPE_V4SI_V4SI:
+ case V2UDI_FTYPE_V4USI_V4USI:
case V2DI_FTYPE_V2DI_V16QI:
case V2DI_FTYPE_V2DF_V2DF:
case V2SI_FTYPE_V2SI_V2SI:
@@ -29166,6 +29180,7 @@ ix86_expand_args_builtin (const struct builtin_description *d,
case V8SI_FTYPE_V16HI_V16HI:
case V4DI_FTYPE_V4DI_V4DI:
case V4DI_FTYPE_V8SI_V8SI:
+ case V4UDI_FTYPE_V8USI_V8USI:
if (comparison == UNKNOWN)
return ix86_expand_binop_builtin (icode, exp, target);
nargs = 2;
@@ -31042,6 +31057,78 @@ ix86_builtin_reciprocal (unsigned int fn, bool md_fn,
return NULL_TREE;
}
}
+
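+/* Implement TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_EVEN.  Return the decl of
+ the builtin that multiplies the even SImode elements of two vectors of
+ TYPE, widening each product to DImode, or NULL if the required ISA is
+ not enabled.  */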
+static tree
+ix86_builtin_mul_widen_even (tree type)
+{
+ bool uns_p = TYPE_UNSIGNED (type);
+ enum ix86_builtins code;
+
+ switch (TYPE_MODE (type))
+ {
+ case V4SImode:
+ if (uns_p)
+ {
+ if (!TARGET_SSE2)
+ return NULL;
+ code = IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V4SI;
+ }
+ else
+ {
+ if (!TARGET_SSE4_1)
+ return NULL;
+ code = IX86_BUILTIN_PMULDQ128;
+ }
+ break;
+
+ case V8SImode:
+ if (!TARGET_AVX2)
+ return NULL;
+ code = (uns_p ? IX86_BUILTIN_VEC_WIDEN_UMUL_EVEN_V8SI
+ : IX86_BUILTIN_PMULDQ256);
+ break;
+
+ default:
+ return NULL;
+ }
+ return ix86_builtins[code];
+}
+
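+/* Implement TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_ODD.  Return the decl of
+ the builtin that multiplies the odd SImode elements of two vectors of
+ TYPE, widening each product to DImode, or NULL if the required ISA is
+ not enabled.  */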
+static tree
+ix86_builtin_mul_widen_odd (tree type)
+{
+ bool uns_p = TYPE_UNSIGNED (type);
+ enum ix86_builtins code;
+
+ switch (TYPE_MODE (type))
+ {
+ case V4SImode:
+ if (uns_p)
+ {
+ if (!TARGET_SSE2)
+ return NULL;
+ code = IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V4SI;
+ }
+ else
+ {
+ if (!TARGET_SSE4_1)
+ return NULL;
+ code = IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V4SI;
+ }
+ break;
+
+ case V8SImode:
+ if (!TARGET_AVX2)
+ return NULL;
+ code = (uns_p ? IX86_BUILTIN_VEC_WIDEN_UMUL_ODD_V8SI
+ : IX86_BUILTIN_VEC_WIDEN_SMUL_ODD_V8SI);
+ break;
+
+ default:
+ return NULL;
+ }
+ return ix86_builtins[code];
+}
/* Helper for avx_vpermilps256_operand et al. This is also used by
the expansion functions to turn the parallel back into a mask.
@@ -38663,6 +38750,7 @@ ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
bool uns_p, bool odd_p)
{
enum machine_mode mode = GET_MODE (op1);
+ enum machine_mode wmode = GET_MODE (dest);
rtx x;
/* We only play even/odd games with vectors of SImode. */
@@ -38672,8 +38760,12 @@ ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
the even slots. For some cpus this is faster than a PSHUFD. */
if (odd_p)
{
- enum machine_mode wmode = GET_MODE (dest);
-
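+ /* For XOP use vpmacsdqh, but only for the signed multiply, since
+ the instruction sign-extends its doubleword operands.  */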
+ if (TARGET_XOP && mode == V4SImode && !uns_p)
+ {
+ x = force_reg (wmode, CONST0_RTX (wmode));
+ emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
+ return;
+ }
op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
GEN_INT (GET_MODE_UNIT_BITSIZE (mode)), NULL,
1, OPTAB_DIRECT);
@@ -38697,7 +38789,7 @@ ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
else if (TARGET_XOP)
{
- x = force_reg (V2DImode, CONST0_RTX (V2DImode));
+ x = force_reg (wmode, CONST0_RTX (wmode));
x = gen_xop_pmacsdql (dest, op1, op2, x);
}
else
@@ -39980,6 +40072,11 @@ ix86_memmodel_check (unsigned HOST_WIDE_INT val)
#undef TARGET_VECTORIZE_BUILTIN_GATHER
#define TARGET_VECTORIZE_BUILTIN_GATHER ix86_vectorize_builtin_gather
+#undef TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_EVEN
+#define TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_EVEN ix86_builtin_mul_widen_even
+#undef TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_ODD
+#define TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_ODD ix86_builtin_mul_widen_odd
+
#undef TARGET_BUILTIN_RECIPROCAL
#define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
@@ -5708,6 +5708,20 @@
DONE;
})
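+;; Multiply the odd-numbered elements of two vectors, widening each
+;; product to twice the element width: dest[i] = (wide) op1[2*i + 1]
+;; * op2[2*i + 1].  The expansion is done by ix86_expand_mul_widen_evenodd.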
+(define_expand "vec_widen_<s>mult_odd_<mode>"
+ [(match_operand:<sseunpackmode> 0 "register_operand")
+ (any_extend:<sseunpackmode>
+ (match_operand:VI124_AVX2 1 "register_operand"))
+ (match_operand:VI124_AVX2 2 "register_operand")]
+ ;; Note that SSE2 has no signed widening SImode multiply; PMULDQ requires SSE4.1.
+ "TARGET_AVX || TARGET_XOP || TARGET_SSE4_1
+ || (TARGET_SSE2 && (<u_bool> || <MODE>mode != V4SImode))"
+{
+ ix86_expand_mul_widen_evenodd (operands[0], operands[1], operands[2],
+ <u_bool>, true);
+ DONE;
+})
+
(define_expand "sdot_prod<mode>"
[(match_operand:<sseunpackmode> 0 "register_operand")
(match_operand:VI2_AVX2 1 "register_operand")