===================================================================
@@ -106,6 +106,11 @@ (define_register_constraint "wy" "rs6000
(define_register_constraint "wz" "rs6000_constraints[RS6000_CONSTRAINT_wz]"
"Floating point register if the LFIWZX instruction is enabled or NO_REGS.")
+(define_constraint "wD"
+ "Int constant that is the element number of the 64-bit scalar in a vector."
+ (and (match_code "const_int")
+ (match_test "TARGET_VSX && (ival == VECTOR_ELEMENT_SCALAR_64BIT)")))
+
;; Lq/stq validates the address for load/store quad
(define_memory_constraint "wQ"
"Memory operand suitable for the load/store quad instructions"
===================================================================
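For illustration, a sketch that is not part of the diff: the element number the
new "wD" constraint (and the vsx_scalar_64bit predicate below) accepts is
whatever VECTOR_ELEMENT_SCALAR_64BIT expands to, element 0 on big-endian and
element 1 on little-endian, per the rs6000.h hunk later in this patch. A
minimal C view of that convention, assuming only the standard <altivec.h>
vec_extract interface:

#include <altivec.h>

/* The 64-bit element that the scalar floating point and direct-move
   instructions can reach without a permute: 0 on BE, 1 on LE.  */
#if __LITTLE_ENDIAN__
#define SCALAR_ELEMENT 1
#else
#define SCALAR_ELEMENT 0
#endif

double
extract_scalar (vector double v)
{
  /* With this patch, extracting this element should need no xxpermdi.  */
  return vec_extract (v, SCALAR_ELEMENT);
}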
@@ -981,6 +981,14 @@ (define_predicate "zero_reg_mem_operand"
(ior (match_operand 0 "zero_fp_constant")
(match_operand 0 "reg_or_mem_operand")))
+;; Return 1 if the operand is a CONST_INT that is the element number of the
+;; 64-bit scalar in a vector that the scalar instructions operate on.
+(define_predicate "vsx_scalar_64bit"
+ (match_code "const_int")
+{
+ return (INTVAL (op) == VECTOR_ELEMENT_SCALAR_64BIT);
+})
+
;; Return 1 if the operand is a general register or memory operand without
;; pre_inc or pre_dec or pre_modify, which produces invalid form of PowerPC
;; lwa instruction.
===================================================================
@@ -1374,6 +1374,7 @@ BU_P8V_AV_2 (VMINUD, "vminud", CONST, u
BU_P8V_AV_2 (VMAXUD, "vmaxud", CONST, umaxv2di3)
BU_P8V_AV_2 (VMRGEW, "vmrgew", CONST, p8_vmrgew)
BU_P8V_AV_2 (VMRGOW, "vmrgow", CONST, p8_vmrgow)
+BU_P8V_AV_2 (VBPERMQ, "vbpermq", CONST, altivec_vbpermq)
BU_P8V_AV_2 (VPKUDUM, "vpkudum", CONST, altivec_vpkudum)
BU_P8V_AV_2 (VPKSDSS, "vpksdss", CONST, altivec_vpksdss)
BU_P8V_AV_2 (VPKUDUS, "vpkudus", CONST, altivec_vpkudus)
@@ -1448,6 +1449,7 @@ BU_P8V_OVERLOAD_2 (ORC, "orc")
BU_P8V_OVERLOAD_2 (VADDCUQ, "vaddcuq")
BU_P8V_OVERLOAD_2 (VADDUDM, "vaddudm")
BU_P8V_OVERLOAD_2 (VADDUQM, "vadduqm")
+BU_P8V_OVERLOAD_2 (VBPERMQ, "vbpermq")
BU_P8V_OVERLOAD_2 (VMAXSD, "vmaxsd")
BU_P8V_OVERLOAD_2 (VMAXUD, "vmaxud")
BU_P8V_OVERLOAD_2 (VMINSD, "vminsd")
===================================================================
@@ -3778,6 +3778,12 @@ const struct altivec_builtin_types altiv
RS6000_BTI_unsigned_V1TI, RS6000_BTI_unsigned_V1TI,
RS6000_BTI_unsigned_V1TI, 0 },
+ { P8V_BUILTIN_VEC_VBPERMQ, P8V_BUILTIN_VBPERMQ,
+ RS6000_BTI_V2DI, RS6000_BTI_V16QI, RS6000_BTI_V16QI, 0 },
+ { P8V_BUILTIN_VEC_VBPERMQ, P8V_BUILTIN_VBPERMQ,
+ RS6000_BTI_unsigned_V2DI, RS6000_BTI_unsigned_V16QI,
+ RS6000_BTI_unsigned_V16QI, 0 },
+
{ P8V_BUILTIN_VEC_VCLZ, P8V_BUILTIN_VCLZB,
RS6000_BTI_V16QI, RS6000_BTI_V16QI, 0, 0 },
{ P8V_BUILTIN_VEC_VCLZ, P8V_BUILTIN_VCLZB,
===================================================================
@@ -2310,6 +2310,10 @@ rs6000_debug_reg_global (void)
(int)END_BUILTINS);
fprintf (stderr, DEBUG_FMT_D, "Number of rs6000 builtins",
(int)RS6000_BUILTIN_COUNT);
+
+ if (TARGET_VSX)
+ fprintf (stderr, DEBUG_FMT_D, "VSX easy 64-bit scalar element",
+ (int)VECTOR_ELEMENT_SCALAR_64BIT);
}
===================================================================
@@ -1531,52 +1531,129 @@ (define_insn "vsx_set_<mode>"
[(set_attr "type" "vecperm")])
;; Extract a DF/DI element from V2DF/V2DI
-(define_insn "vsx_extract_<mode>"
- [(set (match_operand:<VS_scalar> 0 "vsx_register_operand" "=ws,d,?wa")
- (vec_select:<VS_scalar> (match_operand:VSX_D 1 "vsx_register_operand" "wd,wd,wa")
+(define_expand "vsx_extract_<mode>"
+ [(set (match_operand:<VS_scalar> 0 "register_operand" "")
+ (vec_select:<VS_scalar> (match_operand:VSX_D 1 "register_operand" "")
(parallel
- [(match_operand:QI 2 "u5bit_cint_operand" "i,i,i")])))]
+ [(match_operand:QI 2 "u5bit_cint_operand" "")])))]
"VECTOR_MEM_VSX_P (<MODE>mode)"
+ "")
+
+;; Optimize cases where we can do a simple or direct move, or where we can
+;; avoid doing the move at all.
+(define_insn "*vsx_extract_<mode>_internal1"
+ [(set (match_operand:<VS_scalar> 0 "register_operand" "=d,ws,?wa,r")
+ (vec_select:<VS_scalar>
+ (match_operand:VSX_D 1 "register_operand" "d,wd,wa,wm")
+ (parallel
+ [(match_operand:QI 2 "vsx_scalar_64bit" "wD,wD,wD,wD")])))]
+ "VECTOR_MEM_VSX_P (<MODE>mode) && TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
+{
+ int op0_regno = REGNO (operands[0]);
+ int op1_regno = REGNO (operands[1]);
+
+ if (op0_regno == op1_regno)
+ return "nop\t\t# vec_extract %x0,%x1,%2";
+
+ if (INT_REGNO_P (op0_regno))
+ return "mfvsrd %0,%x1";
+
+ if (FP_REGNO_P (op0_regno) && FP_REGNO_P (op1_regno))
+ return "fmr %0,%1";
+
+ return "xxlor %x0,%x1,%x1";
+}
+ [(set_attr "type" "fp,vecsimple,vecsimple,mftgpr")
+ (set_attr "length" "4")])
+
+(define_insn "*vsx_extract_<mode>_internal2"
+ [(set (match_operand:<VS_scalar> 0 "vsx_register_operand" "=d,ws,ws,?wa")
+ (vec_select:<VS_scalar>
+ (match_operand:VSX_D 1 "vsx_register_operand" "d,wd,wd,wa")
+ (parallel [(match_operand:QI 2 "u5bit_cint_operand" "wD,wD,i,i")])))]
+ "VECTOR_MEM_VSX_P (<MODE>mode)
+ && (!TARGET_POWERPC64 || !TARGET_DIRECT_MOVE
+ || INTVAL (operands[2]) != VECTOR_ELEMENT_SCALAR_64BIT)"
{
int fldDM;
gcc_assert (UINTVAL (operands[2]) <= 1);
+
+ if (INTVAL (operands[2]) == VECTOR_ELEMENT_SCALAR_64BIT)
+ {
+ int op0_regno = REGNO (operands[0]);
+ int op1_regno = REGNO (operands[1]);
+
+ if (op0_regno == op1_regno)
+ return "nop\t\t# vec_extract %x0,%x1,%2";
+
+ if (FP_REGNO_P (op0_regno) && FP_REGNO_P (op1_regno))
+ return "fmr %0,%1";
+
+ return "xxlor %x0,%x1,%x1";
+ }
+
fldDM = INTVAL (operands[2]) << 1;
if (!BYTES_BIG_ENDIAN)
fldDM = 3 - fldDM;
operands[3] = GEN_INT (fldDM);
- return \"xxpermdi %x0,%x1,%x1,%3\";
+ return "xxpermdi %x0,%x1,%x1,%3";
}
- [(set_attr "type" "vecperm")])
+ [(set_attr "type" "fp,vecsimple,vecperm,vecperm")
+ (set_attr "length" "4")])
-;; Optimize extracting element 0 from memory
-(define_insn "*vsx_extract_<mode>_zero"
- [(set (match_operand:<VS_scalar> 0 "vsx_register_operand" "=ws,d,?wa")
+;; Optimize extracting a single scalar element from memory when the scalar is
+;; in the correct location within the vector to use a single load.
+(define_insn "*vsx_extract_<mode>_load"
+ [(set (match_operand:<VS_scalar> 0 "register_operand" "=d,wv,wr")
(vec_select:<VS_scalar>
- (match_operand:VSX_D 1 "indexed_or_indirect_operand" "Z,Z,Z")
- (parallel [(const_int 0)])))]
- "VECTOR_MEM_VSX_P (<MODE>mode) && WORDS_BIG_ENDIAN"
- "lxsd%U1x %x0,%y1"
- [(set (attr "type")
- (if_then_else
- (match_test "update_indexed_address_mem (operands[1], VOIDmode)")
- (const_string "fpload_ux")
- (const_string "fpload")))
- (set_attr "length" "4")])
-
-;; Optimize extracting element 1 from memory for little endian
-(define_insn "*vsx_extract_<mode>_one_le"
- [(set (match_operand:<VS_scalar> 0 "vsx_register_operand" "=ws,d,?wa")
+ (match_operand:VSX_D 1 "memory_operand" "m,Z,m")
+ (parallel [(match_operand:QI 2 "vsx_scalar_64bit" "wD,wD,wD")])))]
+ "VECTOR_MEM_VSX_P (<MODE>mode)"
+ "@
+ lfd%U1%X1 %0,%1
+ lxsd%U1x %x0,%y1
+ ld%U1%X1 %0,%1"
+ [(set_attr_alternative "type"
+ [(if_then_else
+ (match_test "update_indexed_address_mem (operands[1], VOIDmode)")
+ (const_string "fpload_ux")
+ (if_then_else
+ (match_test "update_address_mem (operands[1], VOIDmode)")
+ (const_string "fpload_u")
+ (const_string "fpload")))
+ (const_string "fpload")
+ (if_then_else
+ (match_test "update_indexed_address_mem (operands[1], VOIDmode)")
+ (const_string "load_ux")
+ (if_then_else
+ (match_test "update_address_mem (operands[1], VOIDmode)")
+ (const_string "load_u")
+ (const_string "load")))])
+ (set_attr "length" "4")])
+
+;; Optimize storing a single scalar element to memory when the scalar is in
+;; the right location within the vector to use a single store.
+(define_insn "*vsx_extract_<mode>_store"
+ [(set (match_operand:<VS_scalar> 0 "memory_operand" "=m,Z,?Z")
(vec_select:<VS_scalar>
- (match_operand:VSX_D 1 "indexed_or_indirect_operand" "Z,Z,Z")
- (parallel [(const_int 1)])))]
- "VECTOR_MEM_VSX_P (<MODE>mode) && !WORDS_BIG_ENDIAN"
- "lxsd%U1x %x0,%y1"
- [(set (attr "type")
- (if_then_else
- (match_test "update_indexed_address_mem (operands[1], VOIDmode)")
- (const_string "fpload_ux")
- (const_string "fpload")))
- (set_attr "length" "4")])
+ (match_operand:VSX_D 1 "register_operand" "d,wd,wa")
+ (parallel [(match_operand:QI 2 "vsx_scalar_64bit" "wD,wD,wD")])))]
+ "VECTOR_MEM_VSX_P (<MODE>mode)"
+ "@
+ stfd%U0%X0 %1,%0
+ stxsd%U0x %x1,%y0
+ stxsd%U0x %x1,%y0"
+ [(set_attr_alternative "type"
+ [(if_then_else
+ (match_test "update_indexed_address_mem (operands[0], VOIDmode)")
+ (const_string "fpstore_ux")
+ (if_then_else
+ (match_test "update_address_mem (operands[0], VOIDmode)")
+ (const_string "fpstore_u")
+ (const_string "fpstore")))
+ (const_string "fpstore")
+ (const_string "fpstore")])
+ (set_attr "length" "4")])
;; Extract a SF element from V4SF
(define_insn_and_split "vsx_extract_v4sf"
===================================================================
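Again for illustration, a sketch that is not part of the diff: the three cases
the new extract patterns target, assuming -mcpu=power8 and the OFFSET
convention used by the testcases at the end of this patch. The expected
instructions mirror the scan-assembler lines in those testcases:

#include <altivec.h>

#if __LITTLE_ENDIAN__
#define OFFSET 1
#else
#define OFFSET 0
#endif

/* Register case (*vsx_extract_<mode>_internal1): a direct move such as
   mfvsrd when the result wants a GPR, else fmr/xxlor, or a nop when
   source and destination are the same register.  */
long to_gpr (vector long v) { return vec_extract (v, OFFSET); }

/* Load case (*vsx_extract_<mode>_load): a single lfd/lxsdx/ld instead
   of loading the whole vector with lxvd2x and permuting.  */
double from_mem (vector double *p) { return vec_extract (*p, OFFSET); }

/* Store case (*vsx_extract_<mode>_store): a single stfd/stxsdx instead
   of spilling the whole vector.  */
void to_mem (vector double v, double *p) { *p = vec_extract (v, OFFSET); }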
@@ -477,6 +477,10 @@ extern int rs6000_vector_align[];
#define VECTOR_ELT_ORDER_BIG \
(BYTES_BIG_ENDIAN || (rs6000_altivec_element_order == 2))
+/* Element number of the 64-bit value in a 128-bit vector that can be accessed
+ with scalar instructions. */
+#define VECTOR_ELEMENT_SCALAR_64BIT ((BYTES_BIG_ENDIAN) ? 0 : 1)
+
/* Alignment options for fields in structures for sub-targets following
AIX-like ABI.
ALIGN_POWER word-aligns FP doubles (default AIX ABI).
===================================================================
@@ -142,6 +142,7 @@ (define_c_enum "unspec"
UNSPEC_VSUBCUQ
UNSPEC_VSUBEUQM
UNSPEC_VSUBECUQ
+ UNSPEC_VBPERMQ
])
(define_c_enum "unspecv"
@@ -3322,3 +3323,14 @@ (define_insn "altivec_vsubecuq"
[(set_attr "length" "4")
(set_attr "type" "vecsimple")])
+;; We use V2DI as the output type to simplify converting the permute
+;; bits into an integer.
+(define_insn "altivec_vbpermq"
+ [(set (match_operand:V2DI 0 "register_operand" "=v")
+ (unspec:V2DI [(match_operand:V16QI 1 "register_operand" "v")
+ (match_operand:V16QI 2 "register_operand" "v")]
+ UNSPEC_VBPERMQ))]
+ "TARGET_P8_VECTOR"
+ "vbpermq %0,%1,%2"
+ [(set_attr "length" "4")
+ (set_attr "type" "vecsimple")])
===================================================================
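One more illustrative sketch, not part of the diff: because the insn above
produces a V2DI, the 16 gathered bits land in a 64-bit element that, combined
with the extract patterns earlier in this patch, can be moved straight to a
GPR. This assumes the vec_vbpermq spelling added to altivec.h below, and
mirrors the p8vector-vbpermq testcase:

#include <altivec.h>

#if __LITTLE_ENDIAN__
#define OFFSET 1
#else
#define OFFSET 0
#endif

/* vbpermq gathers one bit of A per selector byte in B into a 64-bit
   element; extracting that element should compile to vbpermq plus
   mfvsrd, with no store/reload.  */
long
gather_bits (vector unsigned char a, vector unsigned char b)
{
  return vec_extract (vec_vbpermq (a, b), OFFSET);
}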
@@ -329,6 +329,7 @@
#define vec_vaddcuq __builtin_vec_vaddcuq
#define vec_vaddudm __builtin_vec_vaddudm
#define vec_vadduqm __builtin_vec_vadduqm
+#define vec_vbpermq __builtin_vec_vbpermq
#define vec_vclz __builtin_vec_vclz
#define vec_vclzb __builtin_vec_vclzb
#define vec_vclzd __builtin_vec_vclzd
===================================================================
@@ -2162,6 +2162,9 @@ VSX vector register to hold scalar float
@item wz
Floating point register if the LFIWZX instruction is enabled or NO_REGS.
+@item wD
+Int constant that is the element number of the 64-bit scalar in a vector.
+
@item wQ
A memory address that will work with the @code{lq} and @code{stq}
instructions.
===================================================================
@@ -0,0 +1,27 @@
+/* { dg-do compile { target { powerpc*-*-* && lp64 } } } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
+/* { dg-require-effective-target powerpc_p8vector_ok } */
+/* { dg-options "-O3 -mcpu=power8" } */
+/* { dg-final { scan-assembler "vbpermq" } } */
+/* { dg-final { scan-assembler "mfvsrd" } } */
+/* { dg-final { scan-assembler-not "stfd" } } */
+/* { dg-final { scan-assembler-not "stxvd2x" } } */
+
+#include <altivec.h>
+
+#if __LITTLE_ENDIAN__
+#define OFFSET 1
+#else
+#define OFFSET 0
+#endif
+
+long foos (vector signed char a, vector signed char b)
+{
+ return vec_extract (vec_vbpermq (a, b), OFFSET);
+}
+
+long foou (vector unsigned char a, vector unsigned char b)
+{
+ return vec_extract (vec_vbpermq (a, b), OFFSET);
+}
+
===================================================================
@@ -0,0 +1,16 @@
+/* { dg-do compile { target { powerpc*-*-* && lp64 } } } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+/* { dg-options "-O3 -mcpu=power7" } */
+/* { dg-final { scan-assembler "lfd" } } */
+/* { dg-final { scan-assembler-not "lxvd2x" } } */
+
+#include <altivec.h>
+
+#if __LITTLE_ENDIAN__
+#define OFFSET 1
+#else
+#define OFFSET 0
+#endif
+
+double get_value (vector double *p) { return vec_extract (*p, OFFSET); }
===================================================================
@@ -0,0 +1,17 @@
+/* { dg-do compile { target { powerpc*-*-* && lp64 } } } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+/* { dg-options "-O3 -mcpu=power7" } */
+/* { dg-final { scan-assembler "xxlor" } } */
+/* { dg-final { scan-assembler-not "lfd" } } */
+/* { dg-final { scan-assembler-not "lxvd2x" } } */
+
+#include <altivec.h>
+
+#if __LITTLE_ENDIAN__
+#define OFFSET 1
+#else
+#define OFFSET 0
+#endif
+
+double get_value (vector double v) { return vec_extract (v, OFFSET); }
===================================================================
@@ -0,0 +1,17 @@
+/* { dg-do compile { target { powerpc*-*-* && lp64 } } } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
+/* { dg-require-effective-target powerpc_p8vector_ok } */
+/* { dg-options "-O3 -mcpu=power8" } */
+/* { dg-final { scan-assembler "mfvsrd" } } */
+/* { dg-final { scan-assembler-not "stfd" } } */
+/* { dg-final { scan-assembler-not "stxvd2x" } } */
+
+#include <altivec.h>
+
+#if __LITTLE_ENDIAN__
+#define OFFSET 1
+#else
+#define OFFSET 0
+#endif
+
+long get_value (vector long v) { return vec_extract (v, OFFSET); }