===================================================================
@@ -65,6 +65,7 @@ extern void rs6000_expand_vector_set (rt
extern void rs6000_expand_vector_extract (rtx, rtx, rtx);
extern void rs6000_split_vec_extract_var (rtx, rtx, rtx, rtx, rtx);
extern rtx rs6000_adjust_vec_address (rtx, rtx, rtx, rtx, machine_mode);
+extern void rs6000_split_v4si_init (rtx []);
extern bool altivec_expand_vec_perm_const (rtx op[4]);
extern void altivec_expand_vec_perm_le (rtx op[4]);
extern bool rs6000_expand_vec_perm_const (rtx op[4]);
===================================================================
@@ -6692,7 +6692,7 @@ rs6000_expand_vector_init (rtx target, r
if ((int_vector_p || TARGET_VSX) && all_const_zero)
{
/* Zero register. */
- emit_insn (gen_rtx_SET (target, gen_rtx_XOR (mode, target, target)));
+ emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
return;
}
else if (int_vector_p && easy_vector_constant (const_vec, mode))
@@ -6735,32 +6735,69 @@ rs6000_expand_vector_init (rtx target, r
return;
}
- /* Word values on ISA 3.0 can use mtvsrws, lxvwsx, or vspltisw. V4SF is
- complicated since scalars are stored as doubles in the registers. */
- if (TARGET_P9_VECTOR && mode == V4SImode && all_same
- && VECTOR_MEM_VSX_P (mode))
+ /* Special case initializing vector int if we are on a 64-bit system with
+ direct move, or we have the ISA 3.0 instructions. */
+ if (mode == V4SImode && VECTOR_MEM_VSX_P (V4SImode)
+ && TARGET_DIRECT_MOVE_64BIT)
{
- emit_insn (gen_vsx_splat_v4si (target, XVECEXP (vals, 0, 0)));
- return;
+ if (all_same)
+ {
+ rtx element0 = XVECEXP (vals, 0, 0);
+ if (MEM_P (element0))
+ element0 = rs6000_address_for_fpconvert (element0);
+ else if (!REG_P (element0))
+ element0 = force_reg (SImode, element0);
+
+ if (TARGET_P9_VECTOR)
+ emit_insn (gen_vsx_splat_v4si (target, element0));
+ else
+ {
+ rtx tmp = gen_reg_rtx (DImode);
+ emit_insn (gen_zero_extendsidi2 (tmp, element0));
+ emit_insn (gen_vsx_splat_v4si_di (target, tmp));
+ }
+ return;
+ }
+ else
+ {
+ rtx elements[4];
+ size_t i;
+
+ for (i = 0; i < 4; i++)
+ {
+ elements[i] = XVECEXP (vals, 0, i);
+ if (!CONST_INT_P (elements[i]) && !REG_P (elements[i]))
+ elements[i] = copy_to_mode_reg (SImode, elements[i]);
+ }
+
+ emit_insn (gen_vsx_init_v4si (target, elements[0], elements[1],
+ elements[2], elements[3]));
+ return;
+ }
}
/* With single precision floating point on VSX, know that internally single
precision is actually represented as a double, and either make 2 V2DF
vectors, and convert these vectors to single precision, or do one
conversion, and splat the result to the other elements. */
- if (mode == V4SFmode && VECTOR_MEM_VSX_P (mode))
+ if (mode == V4SFmode && VECTOR_MEM_VSX_P (V4SFmode))
{
if (all_same)
{
- rtx op0 = XVECEXP (vals, 0, 0);
+ rtx element0 = XVECEXP (vals, 0, 0);
if (TARGET_P9_VECTOR)
- emit_insn (gen_vsx_splat_v4sf (target, op0));
+ {
+ if (MEM_P (element0))
+ element0 = rs6000_address_for_fpconvert (element0);
+
+ emit_insn (gen_vsx_splat_v4sf (target, element0));
+ }
else
{
rtx freg = gen_reg_rtx (V4SFmode);
- rtx sreg = force_reg (SFmode, op0);
+ rtx sreg = force_reg (SFmode, element0);
rtx cvt = (TARGET_XSCVDPSPN
? gen_vsx_xscvdpspn_scalar (freg, sreg)
: gen_vsx_xscvdpsp_scalar (freg, sreg));
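For illustration, here is a minimal sketch (not part of the patch; the function names are hypothetical) of the V4SImode initializers the new code expands without going through memory. The instruction expectations assume VSX with 64-bit direct move, and ISA 3.0 for the splat-from-GPR case:

vector int
splat_gpr (int a)
{
  /* all_same path: a single mtvsrws on power9; on power8 the value is
     zero-extended to DImode and the new vsx_splat_v4si_di pattern is
     used instead.  */
  return (vector int) { a, a, a, a };
}

vector int
init_gprs (int a, int b, int c, int d)
{
  /* General path: the new vsx_init_v4si pattern packs the four SImode
     values into a 64-bit GPR pair, from which the vector value is moved
     into a VSX register, avoiding a store-hit-load on the stack.  */
  return (vector int) { a, b, c, d };
}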
@@ -7029,6 +7066,18 @@ rs6000_expand_vector_extract (rtx target
emit_move_insn (target, adjust_address_nv (mem, inner_mode, 0));
}
+/* Helper function to return the register number of an RTX. */
+static inline int
+regno_or_subregno (rtx op)
+{
+ if (REG_P (op))
+ return REGNO (op);
+ else if (SUBREG_P (op))
+ return subreg_regno (op);
+ else
+ gcc_unreachable ();
+}
+
/* Adjust a memory address (MEM) of a vector type to point to a scalar field
within the vector (ELEMENT) with a mode (SCALAR_MODE). Use a base register
temporary (BASE_TMP) to fixup the address. Return the new memory address
@@ -7108,14 +7157,22 @@ rs6000_adjust_vec_address (rtx scalar_re
}
else
{
- if (REG_P (op1) || SUBREG_P (op1))
+ bool op1_reg_p = (REG_P (op1) || SUBREG_P (op1));
+ bool ele_reg_p = (REG_P (element_offset) || SUBREG_P (element_offset));
+
+ /* Note, ADDI requires the register being added to be a base register
+ (RA=0 in the ADDI encoding means the constant 0, not register r0).
+ If the register is r0, load it into the temporary first and then do
+ the add. */
+ if (op1_reg_p
+ && (ele_reg_p || reg_or_subregno (op1) != FIRST_GPR_REGNO))
{
insn = gen_add3_insn (base_tmp, op1, element_offset);
gcc_assert (insn != NULL_RTX);
emit_insn (insn);
}
- else if (REG_P (element_offset) || SUBREG_P (element_offset))
+ else if (ele_reg_p
+ && reg_or_subregno (element_offset) != FIRST_GPR_REGNO)
{
insn = gen_add3_insn (base_tmp, element_offset, op1);
gcc_assert (insn != NULL_RTX);
@@ -7144,14 +7201,7 @@ rs6000_adjust_vec_address (rtx scalar_re
{
rtx op1 = XEXP (new_addr, 1);
addr_mask_type addr_mask;
- int scalar_regno;
-
- if (REG_P (scalar_reg))
- scalar_regno = REGNO (scalar_reg);
- else if (SUBREG_P (scalar_reg))
- scalar_regno = subreg_regno (scalar_reg);
- else
- gcc_unreachable ();
+ int scalar_regno = regno_or_subregno (scalar_reg);
gcc_assert (scalar_regno < FIRST_PSEUDO_REGISTER);
if (INT_REGNO_P (scalar_regno))
@@ -7318,6 +7368,93 @@ rs6000_split_vec_extract_var (rtx dest,
gcc_unreachable ();
}
+/* Helper function for rs6000_split_v4si_init to build up a DImode value from
+ two SImode values. */
+
+static void
+rs6000_split_v4si_init_di_reg (rtx dest, rtx si1, rtx si2, rtx tmp)
+{
+ const unsigned HOST_WIDE_INT mask_32bit = HOST_WIDE_INT_C (0xffffffff);
+
+ if (CONST_INT_P (si1) && CONST_INT_P (si2))
+ {
+ unsigned HOST_WIDE_INT const1 = (UINTVAL (si1) & mask_32bit) << 32;
+ unsigned HOST_WIDE_INT const2 = UINTVAL (si2) & mask_32bit;
+
+ emit_move_insn (dest, GEN_INT (const1 | const2));
+ return;
+ }
+
+ /* Put si1 into the upper 32 bits of dest. */
+ if (CONST_INT_P (si1))
+ emit_move_insn (dest, GEN_INT ((UINTVAL (si1) & mask_32bit) << 32));
+ else
+ {
+ /* Generate RLDIC. */
+ rtx si1_di = gen_rtx_REG (DImode, regno_or_subregno (si1));
+ rtx shift_rtx = gen_rtx_ASHIFT (DImode, si1_di, GEN_INT (32));
+ rtx mask_rtx = GEN_INT (mask_32bit << 32);
+ rtx and_rtx = gen_rtx_AND (DImode, shift_rtx, mask_rtx);
+ gcc_assert (!reg_overlap_mentioned_p (dest, si1));
+ emit_insn (gen_rtx_SET (dest, and_rtx));
+ }
+
+ /* Put si2 into the temporary. */
+ gcc_assert (!reg_overlap_mentioned_p (dest, tmp));
+ if (CONST_INT_P (si2))
+ emit_move_insn (tmp, GEN_INT (UINTVAL (si2) & mask_32bit));
+ else
+ emit_insn (gen_zero_extendsidi2 (tmp, si2));
+
+ /* Combine the two parts. */
+ emit_insn (gen_iordi3 (dest, dest, tmp));
+ return;
+}
+
+/* Split a V4SI initialization. */
+
+void
+rs6000_split_v4si_init (rtx operands[])
+{
+ rtx dest = operands[0];
+
+ /* Destination is a GPR; build up the two DImode parts in place. */
+ if (REG_P (dest) || SUBREG_P (dest))
+ {
+ int d_regno = regno_or_subregno (dest);
+ rtx scalar1 = operands[1];
+ rtx scalar2 = operands[2];
+ rtx scalar3 = operands[3];
+ rtx scalar4 = operands[4];
+ rtx tmp1 = operands[5];
+ rtx tmp2 = operands[6];
+
+ /* Even though we only need one temporary (plus the destination, which
+ has an early-clobber constraint), use two temporaries, one for each
+ double word created. That way the second insn scheduling pass can
+ rearrange things so the two parts are done in parallel. */
+ if (BYTES_BIG_ENDIAN)
+ {
+ rtx di_lo = gen_rtx_REG (DImode, d_regno);
+ rtx di_hi = gen_rtx_REG (DImode, d_regno + 1);
+ rs6000_split_v4si_init_di_reg (di_lo, scalar1, scalar2, tmp1);
+ rs6000_split_v4si_init_di_reg (di_hi, scalar3, scalar4, tmp2);
+ }
+ else
+ {
+ rtx di_lo = gen_rtx_REG (DImode, d_regno + 1);
+ rtx di_hi = gen_rtx_REG (DImode, d_regno);
+ gcc_assert (!VECTOR_ELT_ORDER_BIG);
+ rs6000_split_v4si_init_di_reg (di_lo, scalar4, scalar3, tmp1);
+ rs6000_split_v4si_init_di_reg (di_hi, scalar2, scalar1, tmp2);
+ }
+ return;
+ }
+
+ else
+ gcc_unreachable ();
+}
+
/* Return TRUE if OP is an invalid SUBREG operation on the e500. */
bool
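As a scalar model of the RTL emitted by rs6000_split_v4si_init_di_reg above (a sketch for exposition only, not code from the patch), each double word is built like this:

/* si1 goes into the upper 32 bits via a shift-and-mask (the rldic
   mentioned in the comment above), si2 is zero-extended into the
   temporary, and the two halves are OR'ed together.  */
unsigned long long
build_di (unsigned int si1, unsigned int si2)
{
  unsigned long long upper = ((unsigned long long) si1 << 32)
			     & 0xffffffff00000000ULL;
  unsigned long long lower = si2;	/* zero extend */
  return upper | lower;
}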
@@ -39006,6 +39143,7 @@ rtx_is_swappable_p (rtx op, unsigned int
case UNSPEC_VSX_CVSPDPN:
case UNSPEC_VSX_EXTRACT:
case UNSPEC_VSX_VSLO:
+ case UNSPEC_VSX_VEC_INIT:
return 0;
case UNSPEC_VSPLT_DIRECT:
*special = SH_SPLAT;
===================================================================
@@ -323,6 +323,7 @@ (define_c_enum "unspec"
UNSPEC_VSX_VXSIG
UNSPEC_VSX_VIEXP
UNSPEC_VSX_VTSTDC
+ UNSPEC_VSX_VEC_INIT
])
;; VSX moves
@@ -1950,10 +1951,10 @@ (define_insn "vsx_concat_<mode>"
;; together, relying on the fact that internally scalar floats are represented
;; as doubles. This is used to initialize a V4SF vector with 4 floats
(define_insn "vsx_concat_v2sf"
- [(set (match_operand:V2DF 0 "vsx_register_operand" "=wd,?wa")
+ [(set (match_operand:V2DF 0 "vsx_register_operand" "=wa")
(unspec:V2DF
- [(match_operand:SF 1 "vsx_register_operand" "f,f")
- (match_operand:SF 2 "vsx_register_operand" "f,f")]
+ [(match_operand:SF 1 "vsx_register_operand" "ww")
+ (match_operand:SF 2 "vsx_register_operand" "ww")]
UNSPEC_VSX_CONCAT))]
"VECTOR_MEM_VSX_P (V2DFmode)"
{
@@ -1964,6 +1965,26 @@ (define_insn "vsx_concat_v2sf"
}
[(set_attr "type" "vecperm")])
+;; V4SImode initialization splitter
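+;; The insn is emitted as "#" (no assembly) and kept whole until after
+;; reload; the splitter then calls rs6000_split_v4si_init to build the two
+;; double words directly in the GPR destination, using the two DImode
+;; scratch registers as temporaries.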
+(define_insn_and_split "vsx_init_v4si"
+ [(set (match_operand:V4SI 0 "gpc_reg_operand" "=&r")
+ (unspec:V4SI
+ [(match_operand:SI 1 "reg_or_cint_operand" "rn")
+ (match_operand:SI 2 "reg_or_cint_operand" "rn")
+ (match_operand:SI 3 "reg_or_cint_operand" "rn")
+ (match_operand:SI 4 "reg_or_cint_operand" "rn")]
+ UNSPEC_VSX_VEC_INIT))
+ (clobber (match_scratch:DI 5 "=&r"))
+ (clobber (match_scratch:DI 6 "=&r"))]
+ "VECTOR_MEM_VSX_P (V4SImode) && TARGET_DIRECT_MOVE_64BIT"
+ "#"
+ "&& reload_completed"
+ [(const_int 0)]
+{
+ rs6000_split_v4si_init (operands);
+ DONE;
+})
+
;; xxpermdi for little endian loads and stores. We need several of
;; these since the form of the PARALLEL differs by mode.
(define_insn "*vsx_xxpermdi2_le_<mode>"
@@ -2674,32 +2695,33 @@ (define_insn "vsx_splat_<mode>"
mtvsrdd %x0,%1,%1"
[(set_attr "type" "vecperm,vecload,vecperm")])
-;; V4SI splat (ISA 3.0)
-;; When SI's are allowed in VSX registers, add XXSPLTW support
-(define_expand "vsx_splat_<mode>"
- [(set (match_operand:VSX_W 0 "vsx_register_operand" "")
- (vec_duplicate:VSX_W
- (match_operand:<VS_scalar> 1 "splat_input_operand" "")))]
- "TARGET_P9_VECTOR"
-{
- if (MEM_P (operands[1]))
- operands[1] = rs6000_address_for_fpconvert (operands[1]);
- else if (!REG_P (operands[1]))
- operands[1] = force_reg (<VS_scalar>mode, operands[1]);
-})
-
-(define_insn "*vsx_splat_v4si_internal"
- [(set (match_operand:V4SI 0 "vsx_register_operand" "=wa,wa")
+;; V4SI splat support
+(define_insn "vsx_splat_v4si"
+ [(set (match_operand:V4SI 0 "vsx_register_operand" "=we,we")
(vec_duplicate:V4SI
(match_operand:SI 1 "splat_input_operand" "r,Z")))]
"TARGET_P9_VECTOR"
"@
mtvsrws %x0,%1
lxvwsx %x0,%y1"
- [(set_attr "type" "mftgpr,vecload")])
+ [(set_attr "type" "vecperm,vecload")])
+
+;; SImode is not currently allowed in vector registers. This pattern
+;; allows us to use direct move to get the value into a vector register
+;; so that we can use XXSPLTW.
+(define_insn "vsx_splat_v4si_di"
+ [(set (match_operand:V4SI 0 "vsx_register_operand" "=wa,we")
+ (vec_duplicate:V4SI
+ (truncate:SI
+ (match_operand:DI 1 "gpc_reg_operand" "wj,r"))))]
+ "VECTOR_MEM_VSX_P (V4SImode) && TARGET_DIRECT_MOVE_64BIT"
+ "@
+ xxspltw %x0,%x1,1
+ mtvsrws %x0,%1"
+ [(set_attr "type" "vecperm")])
;; V4SF splat (ISA 3.0)
-(define_insn_and_split "*vsx_splat_v4sf_internal"
+(define_insn_and_split "vsx_splat_v4sf"
[(set (match_operand:V4SF 0 "vsx_register_operand" "=wa,wa,wa")
(vec_duplicate:V4SF
(match_operand:SF 1 "splat_input_operand" "Z,wy,r")))]
@@ -2720,12 +2742,12 @@ (define_insn_and_split "*vsx_splat_v4sf_
;; V4SF/V4SI splat from a vector element
(define_insn "vsx_xxspltw_<mode>"
- [(set (match_operand:VSX_W 0 "vsx_register_operand" "=wf,?<VSa>")
+ [(set (match_operand:VSX_W 0 "vsx_register_operand" "=<VSa>")
(vec_duplicate:VSX_W
(vec_select:<VS_scalar>
- (match_operand:VSX_W 1 "vsx_register_operand" "wf,<VSa>")
+ (match_operand:VSX_W 1 "vsx_register_operand" "<VSa>")
(parallel
- [(match_operand:QI 2 "u5bit_cint_operand" "i,i")]))))]
+ [(match_operand:QI 2 "u5bit_cint_operand" "n")]))))]
"VECTOR_MEM_VSX_P (<MODE>mode)"
{
if (!BYTES_BIG_ENDIAN)
@@ -2736,9 +2758,9 @@ (define_insn "vsx_xxspltw_<mode>"
[(set_attr "type" "vecperm")])
(define_insn "vsx_xxspltw_<mode>_direct"
- [(set (match_operand:VSX_W 0 "vsx_register_operand" "=wf,?<VSa>")
- (unspec:VSX_W [(match_operand:VSX_W 1 "vsx_register_operand" "wf,<VSa>")
- (match_operand:QI 2 "u5bit_cint_operand" "i,i")]
+ [(set (match_operand:VSX_W 0 "vsx_register_operand" "=<VSa>")
+ (unspec:VSX_W [(match_operand:VSX_W 1 "vsx_register_operand" "<VSa>")
+ (match_operand:QI 2 "u5bit_cint_operand" "i")]
UNSPEC_VSX_XXSPLTW))]
"VECTOR_MEM_VSX_P (<MODE>mode)"
"xxspltw %x0,%x1,%2"
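For reference, a hedged example of source code that should exercise the element-splat pattern above (vec_splat is the standard AltiVec intrinsic; the single-instruction expectation assumes VSX):

#include <altivec.h>

/* Splat element 1 of a V4SI vector; with VSX this should expand through
   vsx_xxspltw_v4si to a single xxspltw.  */
vector int
splat_elt1 (vector int v)
{
  return vec_splat (v, 1);
}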
===================================================================
@@ -760,13 +760,15 @@ extern int rs6000_vector_align[];
&& TARGET_SINGLE_FLOAT \
&& TARGET_DOUBLE_FLOAT)
-/* Macro to say whether we can do optimization where we need to do parts of the
- calculation in 64-bit GPRs and then is transfered to the vector
- registers. */
+/* Macro to say whether we can do optimizations where we need to do parts of
+ the calculation in 64-bit GPRs, with the result then transferred to the
+ vector registers. Do not allow -maltivec=be for these optimizations,
+ because it adds to the complexity of the code. */
#define TARGET_DIRECT_MOVE_64BIT (TARGET_DIRECT_MOVE \
&& TARGET_P8_VECTOR \
&& TARGET_POWERPC64 \
- && TARGET_UPPER_REGS_DI)
+ && TARGET_UPPER_REGS_DI \
+ && (rs6000_altivec_element_order != 2))
/* Whether the various reciprocal divide/square root estimate instructions
exist, and whether we should automatically generate code for the instruction
===================================================================
@@ -24,6 +24,9 @@ extern void check_splat (vector int a)
extern vector int pack_reg (int a, int b, int c, int d)
__attribute__((__noinline__));
+extern vector int pack_from_ptr (int *p_a, int *p_b, int *p_c, int *p_d)
+ __attribute__((__noinline__));
+
extern vector int pack_const (void)
__attribute__((__noinline__));
@@ -39,6 +42,9 @@ extern void pack_global (int a, int b, i
extern vector int splat_reg (int a)
__attribute__((__noinline__));
+extern vector int splat_from_ptr (int *p)
+ __attribute__((__noinline__));
+
extern vector int splat_const (void)
__attribute__((__noinline__));
@@ -78,6 +84,12 @@ pack_reg (int a, int b, int c, int d)
}
vector int
+pack_from_ptr (int *p_a, int *p_b, int *p_c, int *p_d)
+{
+ return (vector int) { *p_a, *p_b, *p_c, *p_d };
+}
+
+vector int
pack_const (void)
{
return (vector int) { ELEMENTS };
@@ -108,6 +120,12 @@ splat_reg (int a)
}
vector int
+splat_from_ptr (int *p)
+{
+ return (vector int) { *p, *p, *p, *p };
+}
+
+vector int
splat_const (void)
{
return (vector int) { SPLAT, SPLAT, SPLAT, SPLAT };
@@ -134,11 +152,15 @@ splat_global (int a)
int main (void)
{
vector int sv2, sv3;
+ int mem = SPLAT;
+ int mem2[4] = { ELEMENTS };
check (sv);
check (pack_reg (ELEMENTS));
+ check (pack_from_ptr (&mem2[0], &mem2[1], &mem2[2], &mem2[3]));
+
check (pack_const ());
pack_ptr (&sv2, ELEMENTS);
@@ -154,6 +176,8 @@ int main (void)
check_splat (splat_reg (SPLAT));
+ check_splat (splat_from_ptr (&mem));
+
check_splat (splat_const ());
splat_ptr (&sv2, SPLAT);
===================================================================
@@ -24,6 +24,9 @@ extern void check_splat (vector long a)
extern vector long pack_reg (long a, long b)
__attribute__((__noinline__));
+extern vector long pack_from_ptr (long *p_a, long *p_b)
+ __attribute__((__noinline__));
+
extern vector long pack_const (void)
__attribute__((__noinline__));
@@ -39,6 +42,9 @@ extern void pack_global (long a, long b)
extern vector long splat_reg (long a)
__attribute__((__noinline__));
+extern vector long splat_from_ptr (long *p)
+ __attribute__((__noinline__));
+
extern vector long splat_const (void)
__attribute__((__noinline__));
@@ -78,6 +84,12 @@ pack_reg (long a, long b)
}
vector long
+pack_from_ptr (long *p_a, long *p_b)
+{
+ return (vector long) { *p_a, *p_b };
+}
+
+vector long
pack_const (void)
{
return (vector long) { ELEMENTS };
@@ -108,6 +120,12 @@ splat_reg (long a)
}
vector long
+splat_from_ptr (long *p)
+{
+ return (vector long) { *p, *p };
+}
+
+vector long
splat_const (void)
{
return (vector long) { SPLAT, SPLAT };
@@ -134,11 +152,15 @@ splat_global (long a)
int main (void)
{
vector long sv2, sv3;
+ long mem = SPLAT;
+ long mem2[2] = { ELEMENTS };
check (sv);
check (pack_reg (ELEMENTS));
+ check (pack_from_ptr (&mem2[0], &mem2[1]));
+
check (pack_const ());
pack_ptr (&sv2, ELEMENTS);
@@ -154,6 +176,8 @@ int main (void)
check_splat (splat_reg (SPLAT));
+ check_splat (splat_from_ptr (&mem));
+
check_splat (splat_const ());
splat_ptr (&sv2, SPLAT);