@@ -537,7 +537,7 @@ powerpc*-*-*)
extra_headers="${extra_headers} pmmintrin.h tmmintrin.h smmintrin.h"
extra_headers="${extra_headers} nmmintrin.h immintrin.h x86gprintrin.h"
extra_headers="${extra_headers} ppu_intrinsics.h spu2vmx.h vec_types.h si2vmx.h"
- extra_headers="${extra_headers} amo.h"
+ extra_headers="${extra_headers} amo.h vector-pair.h"
case x$with_cpu in
xpowerpc64|xdefault64|x6[23]0|x970|xG5|xpower[3456789]|xpower1[01]|xpower6x|xrs64a|xcell|xa2|xe500mc64|xe5500|xe6500|xfuture)
cpu_is_64bit=yes
@@ -590,9 +590,13 @@ rs6000_target_modify_macros (bool define_p, HOST_WIDE_INT flags,
if (rs6000_cpu == PROCESSOR_CELL)
rs6000_define_or_undefine_macro (define_p, "__PPU__");
- /* Tell the user if we support the MMA instructions. */
+ /* Tell the user if we support the MMA instructions. Also tell vector-pair.h
+ that we have the vector pair built-in function support. */
if ((flags & OPTION_MASK_MMA) != 0)
- rs6000_define_or_undefine_macro (define_p, "__MMA__");
+ {
+ rs6000_define_or_undefine_macro (define_p, "__MMA__");
+ rs6000_define_or_undefine_macro (define_p, "__VPAIR__");
+ }
/* Whether pc-relative code is being generated. */
if ((flags & OPTION_MASK_PCREL) != 0)
rs6000_define_or_undefine_macro (define_p, "__PCREL__");
new file mode 100644
@@ -0,0 +1,573 @@
+/* PowerPC vector pair include file.
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ Contributed by Aldy Hernandez (aldyh@redhat.com).
+ Rewritten by Paolo Bonzini (bonzini@gnu.org).
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3, or (at your
+ option) any later version.
+
+ GCC is distributed in the hope that it will be useful, but WITHOUT
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
+ License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Provide support for vector pairs, even on systems that do not have native
+ support for loading and storing pairs of vectors. */
+
+#ifndef _VECTOR_PAIR_H
+#define _VECTOR_PAIR_H 1
+
+/* Union of the various vector pair types. */
+union __vpair_union {
+
+#ifdef __MMA__
+ __vector_pair __vpair;
+#endif
+
+ vector double __vp_f64[2];
+ vector float __vp_f32[2];
+ vector unsigned char __vp_uc[2];
+};
+
+typedef union __vpair_union vector_pair_f64_t;
+typedef union __vpair_union vector_pair_f32_t;
+
+#if !__VPAIR_BUILTIN__ && !__VPAIR_ASM__ && !__VPAIR_NOP10__
+#if __MMA__
+#define __VPAIR_ASM__ 1
+
+#else
+#define __VPAIR_NOP10__ 1
+#endif
+#endif
+
+/* Macros to simplify creation of the various operations.
+ *
+ * The __VPAIR_FP_{UNARY,BINARY,FMA} macros are the base macros, and take:
+ * R: The argument for the output vector pair
+ * A, B, C: 1-3 arguments for the inputs
+ * OPCODE: The assembler opcode for __asm__ on power10
+ * VEC: Either __vp_f64 or __vp_f32 for the union field
+ * VEC_FUNC: 128-bit vector function for use on power8/power9
+ *
+ * The __VPAIR_FP_SPLAT macro takes:
+ * R: The argument for the output vector pair
+ * X: The scalar that is to be splat-ed to the vector pair
+ * VEC: Either __vp_f64 or __vp_f32 for the union field
+ *
+ * The __VPAIR_F32_<...> and __VPAIR_F64_<...> macros call the above macros
+ * with the appropriate structure field to use.
+ */
+
+#undef __VPAIR_FP_SPLAT
+#undef __VPAIR_FP_UNARY
+#undef __VPAIR_FP_BINARY
+#undef __VPAIR_FP_FMA
+
+#undef __VPAIR_F64_UNARY
+#undef __VPAIR_F64_BINARY
+#undef __VPAIR_F64_FMA
+
+#undef __VPAIR_F32_UNARY
+#undef __VPAIR_F32_BINARY
+#undef __VPAIR_F32_FMA
+
+/* Operations using a vector pair and __asm__ statements. */
+#if __MMA__ && !__VPAIR_NOP10__
+
+/* When using __asm__, we need to access the second register. Due to the way
+ VSX registers were formed by combining the traditional floating point
+ registers and Altivec registers, we can't use the output modifier %L<n> to
+ refer to the second register if the VSX register was a traditional Altivec
+ register. If the value is in VSX registers 34 & 35, %x0 would give 34, but
+ %L0 would give 1, since 'Altivec' registers start at 0.
+
+ If we are using GAS under Linux, we can use %x0+1 to access the second
+ register and use the full VSX register set.
+
+ If this include file is used on non-Linux systems, or with a non-GCC
+ compiler, limit the registers used to the traditional FPR registers so that
+ we can use %L0. */
+
+#if __VPAIR__USE_FPR__ || !__GNUC__ || (!__linux__ && !__ELF__)
+
+/* Use %0 and %L0 on traditional FPR registers. */
+#define __VPAIR_FP_SPLAT(R, X, VEC) \
+ __asm__ ("xxlor %L0,%0,%0" \
+ : "=d" ((R)->__vpair) \
+ : "0" (__builtin_vec_splats ((X))))
+
+#define __VPAIR_FP_UNARY(R, A, OPCODE, VEC, VEC_FUNC) \
+ __asm__ (OPCODE " %0,%1\n\t" OPCODE " %L0,%L1" \
+ : "=d" ((R)->__vpair) \
+ : "d" ((A)->__vpair))
+
+#define __VPAIR_FP_BINARY(R, A, B, OPCODE, VEC, VEC_FUNC) \
+ __asm__ (OPCODE " %0,%1,%2\n\t" OPCODE " %L0,%L1,%L2" \
+ : "=d" ((R)->__vpair) \
+ : "d" ((A)->__vpair), "d" ((B)->__vpair))
+
+/* Note the 'a' form of the fma instructions must be used. */
+#define __VPAIR_FP_FMA(R, A, B, C, OPCODE, VEC, VEC_FUNC) \
+ __asm__ (OPCODE " %0,%1,%2\n\t" OPCODE " %L0,%L1,%L2" \
+ : "=d" ((R)->__vpair) \
+ : "d" ((A)->__vpair), "d" ((B)->__vpair), "0" ((C)->__vpair))
+
+#else
+
+/* Use %x0 and %x0+1 on VSX registers. */
+#define __VPAIR_FP_SPLAT(R, X, VEC) \
+ __asm__ ("xxlor %x0+1,%x0,%x0" \
+ : "=wa" ((R)->__vpair) \
+ : "0" (__builtin_vec_splats ((X))))
+
+#define __VPAIR_FP_UNARY(R, A, OPCODE, VEC, VEC_FUNC) \
+ __asm__ (OPCODE " %x0,%x1\n\t" OPCODE " %x0+1,%x1+1" \
+ : "=wa" ((R)->__vpair) \
+ : "wa" ((A)->__vpair))
+
+#define __VPAIR_FP_BINARY(R, A, B, OPCODE, VEC, VEC_FUNC) \
+ __asm__ (OPCODE " %x0,%x1,%x2\n\t" OPCODE " %x0+1,%x1+1,%x2+1" \
+ : "=wa" ((R)->__vpair) \
+ : "wa" ((A)->__vpair), "wa" ((B)->__vpair))
+
+/* Note the 'a' form of the fma instructions must be used. */
+#define __VPAIR_FP_FMA(R, A, B, C, OPCODE, VEC, VEC_FUNC) \
+ __asm__ (OPCODE " %x0,%x1,%x2\n\t" OPCODE " %x0+1,%x1+1,%x2+1" \
+ : "=wa" ((R)->__vpair) \
+ : "wa" ((A)->__vpair), "wa" ((B)->__vpair), "0" ((C)->__vpair))
+#endif /* Select whether to use %0/%L0 or %x0/%x0+1. */
+
+#else /* vpair support on power8/power9. */
+
+/* Pair of vector operations using a built-in function. */
+
+#define __VPAIR_FP_SPLAT(R, X, VEC) \
+ (R)->VEC[0] = (R)->VEC[1] = __builtin_vec_splats ((X))
+
+#define __VPAIR_FP_UNARY(R, A, OPCODE, VEC, VEC_FUNC) \
+ do \
+ { \
+ (R)->VEC[0] = VEC_FUNC ((A)->VEC[0]); \
+ (R)->VEC[1] = VEC_FUNC ((A)->VEC[1]); \
+ } \
+ while (0)
+
+#define __VPAIR_FP_BINARY(R, A, B, OPCODE, VEC, VEC_FUNC) \
+ do \
+ { \
+ (R)->VEC[0] = VEC_FUNC ((A)->VEC[0], (B)->VEC[0]); \
+ (R)->VEC[1] = VEC_FUNC ((A)->VEC[1], (B)->VEC[1]); \
+ } \
+ while (0)
+
+#define __VPAIR_FP_FMA(R, A, B, C, OPCODE, VEC, VEC_FUNC) \
+ do \
+ { \
+ (R)->VEC[0] = VEC_FUNC ((A)->VEC[0], (B)->VEC[0], (C)->VEC[0]); \
+ (R)->VEC[1] = VEC_FUNC ((A)->VEC[1], (B)->VEC[1], (C)->VEC[1]); \
+ } \
+ while (0)
+
+#endif
+
+/* 64-bit version of the macros. */
+#define __VPAIR_F64_UNARY(R, A, OPCODE, VEC_FUNC) \
+ __VPAIR_FP_UNARY(R, A, OPCODE, __vp_f64, VEC_FUNC)
+
+#define __VPAIR_F64_BINARY(R, A, B, OPCODE, VEC_FUNC) \
+ __VPAIR_FP_BINARY(R, A, B, OPCODE, __vp_f64, VEC_FUNC)
+
+#define __VPAIR_F64_FMA(R, A, B, C, OPCODE, VEC_FUNC) \
+ __VPAIR_FP_FMA(R, A, B, C, OPCODE, __vp_f64, VEC_FUNC)
+
+
+/* 32-bit version of the macros. */
+#define __VPAIR_F32_UNARY(R, A, OPCODE, VEC_FUNC) \
+ __VPAIR_FP_UNARY(R, A, OPCODE, __vp_f32, VEC_FUNC)
+
+#define __VPAIR_F32_BINARY(R, A, B, OPCODE, VEC_FUNC) \
+ __VPAIR_FP_BINARY(R, A, B, OPCODE, __vp_f32, VEC_FUNC)
+
+#define __VPAIR_F32_FMA(R, A, B, C, OPCODE, VEC_FUNC) \
+ __VPAIR_FP_FMA(R, A, B, C, OPCODE, __vp_f32, VEC_FUNC)
+
+
+/* Splat functions. */
+
+/* 64-bit splat to vector pair. */
+
+static inline void
+vpair_f64_splat (vector_pair_f64_t *__r, double __x)
+{
+ __VPAIR_FP_SPLAT (__r, __x, __vp_f64);
+}
+
+/* 32-bit splat to vector pair. */
+
+static inline void
+vpair_f32_splat (vector_pair_f32_t *__r, float __x)
+{
+ __VPAIR_FP_SPLAT (__r, __x, __vp_f32);
+}
+
+
+/* 64-bit unary functions. */
+
+static inline void
+vpair_f64_abs (vector_pair_f64_t *__r,
+ const vector_pair_f64_t *__a)
+{
+ __VPAIR_F64_UNARY (__r, __a,
+ "xvabsdp",
+ __builtin_vec_abs);
+}
+
+static inline void
+vpair_f64_nabs (vector_pair_f64_t *__r,
+ const vector_pair_f64_t *__a)
+{
+ __VPAIR_F64_UNARY (__r, __a,
+ "xvnabsdp",
+ __builtin_vec_nabs);
+}
+
+static inline void
+vpair_f64_neg (vector_pair_f64_t *__r,
+ const vector_pair_f64_t *__a)
+{
+ __VPAIR_F64_UNARY (__r, __a,
+ "xvnegdp",
+ __builtin_vec_neg);
+}
+
+static inline void
+vpair_f64_sqrt (vector_pair_f64_t *__r,
+ const vector_pair_f64_t *__a)
+{
+ __VPAIR_F64_UNARY (__r, __a,
+ "xvsqrtdp",
+ __builtin_vec_sqrt);
+}
+
+/* 32-bit unary functions. */
+
+static inline void
+vpair_f32_abs (vector_pair_f32_t *__r,
+ const vector_pair_f32_t *__a)
+{
+ __VPAIR_F32_UNARY (__r, __a,
+ "xvabssp",
+ __builtin_vec_abs);
+}
+
+static inline void
+vpair_f32_nabs (vector_pair_f32_t *__r,
+ const vector_pair_f32_t *__a)
+{
+ __VPAIR_F32_UNARY (__r, __a,
+ "xvnabssp",
+ __builtin_vec_nabs);
+}
+
+static inline void
+vpair_f32_neg (vector_pair_f32_t *__r,
+ const vector_pair_f32_t *__a)
+{
+ __VPAIR_F32_UNARY (__r, __a,
+ "xvnegsp",
+ __builtin_vec_neg);
+}
+
+static inline void
+vpair_f32_sqrt (vector_pair_f32_t *__r,
+ const vector_pair_f32_t *__a)
+{
+ __VPAIR_F32_UNARY (__r, __a,
+ "xvsqrtsp",
+ __builtin_vec_sqrt);
+}
+
+
+/* 64-bit binary functions. */
+
+static inline void
+vpair_f64_add (vector_pair_f64_t *__r,
+ const vector_pair_f64_t *__a,
+ const vector_pair_f64_t *__b)
+{
+ __VPAIR_F64_BINARY (__r, __a, __b,
+ "xvadddp",
+ __builtin_vec_add);
+}
+
+static inline void
+vpair_f64_div (vector_pair_f64_t *__r,
+ const vector_pair_f64_t *__a,
+ const vector_pair_f64_t *__b)
+{
+ __VPAIR_F64_BINARY (__r, __a, __b,
+ "xvdivdp",
+ __builtin_vec_div);
+}
+
+static inline void
+vpair_f64_max (vector_pair_f64_t *__r,
+ const vector_pair_f64_t *__a,
+ const vector_pair_f64_t *__b)
+{
+ __VPAIR_F64_BINARY (__r, __a, __b,
+ "xvmaxdp",
+ __builtin_vec_max);
+}
+
+static inline void
+vpair_f64_min (vector_pair_f64_t *__r,
+ const vector_pair_f64_t *__a,
+ const vector_pair_f64_t *__b)
+{
+ __VPAIR_F64_BINARY (__r, __a, __b,
+ "xvmindp",
+ __builtin_vec_min);
+}
+
+static inline void
+vpair_f64_mul (vector_pair_f64_t *__r,
+ const vector_pair_f64_t *__a,
+ const vector_pair_f64_t *__b)
+{
+ __VPAIR_F64_BINARY (__r, __a, __b,
+ "xvmuldp",
+ __builtin_vec_mul);
+}
+
+static inline void
+vpair_f64_sub (vector_pair_f64_t *__r,
+ const vector_pair_f64_t *__a,
+ const vector_pair_f64_t *__b)
+{
+ __VPAIR_F64_BINARY (__r, __a, __b,
+ "xvsubdp",
+ __builtin_vec_sub);
+}
+
+/* 32-bit binary functions. */
+
+static inline void
+vpair_f32_add (vector_pair_f32_t *__r,
+ const vector_pair_f32_t *__a,
+ const vector_pair_f32_t *__b)
+{
+ __VPAIR_F32_BINARY (__r, __a, __b,
+ "xvaddsp",
+ __builtin_vec_add);
+}
+
+static inline void
+vpair_f32_div (vector_pair_f32_t *__r,
+ const vector_pair_f32_t *__a,
+ const vector_pair_f32_t *__b)
+{
+ __VPAIR_F32_BINARY (__r, __a, __b,
+ "xvdivsp",
+ __builtin_vec_div);
+}
+
+static inline void
+vpair_f32_max (vector_pair_f32_t *__r,
+ const vector_pair_f32_t *__a,
+ const vector_pair_f32_t *__b)
+{
+ __VPAIR_F32_BINARY (__r, __a, __b,
+ "xvmaxsp",
+ __builtin_vec_max);
+}
+
+static inline void
+vpair_f32_min (vector_pair_f32_t *__r,
+ const vector_pair_f32_t *__a,
+ const vector_pair_f32_t *__b)
+{
+ __VPAIR_F32_BINARY (__r, __a, __b,
+ "xvminsp",
+ __builtin_vec_min);
+}
+
+static inline void
+vpair_f32_mul (vector_pair_f32_t *__r,
+ const vector_pair_f32_t *__a,
+ const vector_pair_f32_t *__b)
+{
+ __VPAIR_F32_BINARY (__r, __a, __b,
+ "xvmulsp",
+ __builtin_vec_mul);
+}
+
+static inline void
+vpair_f32_sub (vector_pair_f32_t *__r,
+ const vector_pair_f32_t *__a,
+ const vector_pair_f32_t *__b)
+{
+ __VPAIR_F32_BINARY (__r, __a, __b,
+ "xvsubsp",
+ __builtin_vec_sub);
+}
+
+/* 64-bit fma operations. */
+
+static inline void
+vpair_f64_fma (vector_pair_f64_t *__r,
+ const vector_pair_f64_t *__a,
+ const vector_pair_f64_t *__b,
+ const vector_pair_f64_t *__c)
+{
+ __VPAIR_F64_FMA (__r, __a, __b, __c,
+ "xvmaddadp",
+ __builtin_vsx_xvmadddp);
+}
+
+static inline void
+vpair_f64_fms (vector_pair_f64_t *__r,
+ const vector_pair_f64_t *__a,
+ const vector_pair_f64_t *__b,
+ const vector_pair_f64_t *__c)
+{
+ __VPAIR_F64_FMA (__r, __a, __b, __c,
+ "xvmsubadp",
+ __builtin_vsx_xvmsubdp);
+}
+
+static inline void
+vpair_f64_nfma (vector_pair_f64_t *__r,
+ const vector_pair_f64_t *__a,
+ const vector_pair_f64_t *__b,
+ const vector_pair_f64_t *__c)
+{
+ __VPAIR_F64_FMA (__r, __a, __b, __c,
+ "xvnmaddadp",
+ __builtin_vsx_xvnmadddp);
+}
+
+static inline void
+vpair_f64_nfms (vector_pair_f64_t *__r,
+ const vector_pair_f64_t *__a,
+ const vector_pair_f64_t *__b,
+ const vector_pair_f64_t *__c)
+{
+ __VPAIR_F64_FMA (__r, __a, __b, __c,
+ "xvnmsubadp",
+ __builtin_vsx_xvnmsubdp);
+}
+/* 32-bit fma operations. */
+
+static inline void
+vpair_f32_fma (vector_pair_f32_t *__r,
+ const vector_pair_f32_t *__a,
+ const vector_pair_f32_t *__b,
+ const vector_pair_f32_t *__c)
+{
+ __VPAIR_F32_FMA (__r, __a, __b, __c,
+ "xvmaddasp",
+ __builtin_vsx_xvmaddsp);
+}
+
+static inline void
+vpair_f32_fms (vector_pair_f32_t *__r,
+ const vector_pair_f32_t *__a,
+ const vector_pair_f32_t *__b,
+ const vector_pair_f32_t *__c)
+{
+ __VPAIR_F32_FMA (__r, __a, __b, __c,
+ "xvmsubasp",
+ __builtin_vsx_xvmsubsp);
+}
+
+static inline void
+vpair_f32_nfma (vector_pair_f32_t *__r,
+ const vector_pair_f32_t *__a,
+ const vector_pair_f32_t *__b,
+ const vector_pair_f32_t *__c)
+{
+ __VPAIR_F32_FMA (__r, __a, __b, __c,
+ "xvnmaddasp",
+ __builtin_vsx_xvnmaddsp);
+}
+
+static inline void
+vpair_f32_nfms (vector_pair_f32_t *__r,
+ const vector_pair_f32_t *__a,
+ const vector_pair_f32_t *__b,
+ const vector_pair_f32_t *__c)
+{
+ __VPAIR_F32_FMA (__r, __a, __b, __c,
+ "xvnmsubasp",
+ __builtin_vsx_xvnmsubsp);
+}
+
+
+/* Swap even/odd operations. */
+
+static inline void
+vpair_f32_swap_odd_even (vector_pair_f32_t *__r,
+ const vector_pair_f32_t *__a)
+{
+ vector unsigned long long __rotate = { 32, 32 };
+
+#if __MMA__ && !__VPAIR_NOP10__
+ /* Power10 vector pair support. */
+ __asm__ ("vrld %0,%1,%2\n\tvrld %L0,%L1,%2"
+ : "=v" (__r->__vpair)
+ : "v" (__a->__vpair), "v" (__rotate));
+
+#else
+ /* vector pair not available. */
+ vector unsigned long long *__r_ll = (vector unsigned long long *)__r;
+ vector unsigned long long *__a_ll = (vector unsigned long long *)__a;
+ __r_ll[0] = __builtin_vec_vrld (__a_ll[0], __rotate);
+ __r_ll[1] = __builtin_vec_vrld (__a_ll[1], __rotate);
+#endif /* power10/not power10. */
+}
+
+
+static inline void
+vpair_f64_swap_odd_even (vector_pair_f64_t *__r,
+ const vector_pair_f64_t *__a)
+{
+#if __MMA__ && !__VPAIR_NOP10__
+#if __VPAIR__USE_FPR__ || !__GNUC__ || (!__linux__ && !__ELF__)
+
+ /* Use vector pair and use %0 and %L0 on traditional FPR registers. */
+ __asm__ ("xxpermdi %0,%1,%1,2\n\txxpermdi %L0,%L1,%L1,2"
+ : "=d" (__r->__vpair)
+ : "d" (__a->__vpair));
+
+#else
+ /* Use vector pair and use %x0 and %x0+1 on all VSX registers. */
+ __asm__ ("xxpermdi %x0,%x1,%x1,2\n\txxpermdi %x0+1,%x1+1,%x1+1,2"
+ : "=wa" (__r->__vpair)
+ : "wa" (__a->__vpair));
+#endif
+
+#else
+ /* vector pair not available. */
+ __r->__vp_f64[0]
+ = __builtin_vsx_xxpermdi_2df (__a->__vp_f64[0], __a->__vp_f64[0], 2);
+ __r->__vp_f64[1]
+ = __builtin_vsx_xxpermdi_2df (__a->__vp_f64[1], __a->__vp_f64[1], 2);
+#endif
+}
+
+#endif /* _VECTOR_PAIR_H. */
@@ -16213,6 +16213,7 @@ instructions, but allow the compiler to schedule those calls.
* PowerPC Hardware Transactional Memory Built-in Functions::
* PowerPC Atomic Memory Operation Functions::
* PowerPC Matrix-Multiply Assist Built-in Functions::
+* PowerPC Vector Pair Support::
* PRU Built-in Functions::
* RISC-V Built-in Functions::
* RISC-V Vector Intrinsics::
@@ -24729,6 +24730,105 @@ __vector_pair __builtin_vsx_lxvp (size_t, __vector_pair *);
void __builtin_vsx_stxvp (__vector_pair, size_t, __vector_pair *);
@end smallexample
+@node PowerPC Vector Pair Support
+@subsection PowerPC Vector Pair Support
+ISA 3.1 (power10) added instructions to load and store pairs of
+vectors with a single instruction.
+
+GCC provides an include file (@file{vector-pair.h}) on PowerPC
+systems that allows users to write 32-bit and 64-bit floating
+point code that processes data in 256-bit chunks rather than
+128-bit chunks.
+
+If the code is compiled on an ISA 3.1 system with MMA enabled, the
+vector pair functions will use the @code{__vector_pair} type to have
+values in adjacent vectors and do the operation as a pair of
+operations.
+
+If the code is compiled on a VSX system, but not one with MMA enabled, the vector
+pair functions will use 2 separate vectors to do the operation.
+
+Two types are provided: @code{vector_pair_f64_t} is for vector pairs
+that will operate on units of 4 64-bit floating point values, and
+@code{vector_pair_f32_t} for operating on units of 8 32-bit floating
+point values.
+
+The following functions are provided for operating on vector pairs
+that consist of 4 64-bit floating point values:
+
+@smallexample
+void vpair_f64_splat (vector_pair_f64_t *, double);
+
+void vpair_f64_abs (vector_pair_f64_t *, vector_pair_f64_t *);
+void vpair_f64_nabs (vector_pair_f64_t *, vector_pair_f64_t *);
+void vpair_f64_neg (vector_pair_f64_t *, vector_pair_f64_t *);
+void vpair_f64_sqrt (vector_pair_f64_t *, vector_pair_f64_t *);
+
+void vpair_f64_add (vector_pair_f64_t *, vector_pair_f64_t *,
+ vector_pair_f64_t *);
+void vpair_f64_div (vector_pair_f64_t *, vector_pair_f64_t *,
+ vector_pair_f64_t *);
+void vpair_f64_max (vector_pair_f64_t *, vector_pair_f64_t *,
+ vector_pair_f64_t *);
+void vpair_f64_min (vector_pair_f64_t *, vector_pair_f64_t *,
+ vector_pair_f64_t *);
+void vpair_f64_mul (vector_pair_f64_t *, vector_pair_f64_t *,
+ vector_pair_f64_t *);
+void vpair_f64_sub (vector_pair_f64_t *, vector_pair_f64_t *,
+ vector_pair_f64_t *);
+
+void vpair_f64_fma (vector_pair_f64_t *, vector_pair_f64_t *,
+ vector_pair_f64_t *, vector_pair_f64_t *);
+void vpair_f64_fms (vector_pair_f64_t *, vector_pair_f64_t *,
+ vector_pair_f64_t *, vector_pair_f64_t *);
+void vpair_f64_nfma (vector_pair_f64_t *, vector_pair_f64_t *,
+ vector_pair_f64_t *, vector_pair_f64_t *);
+void vpair_f64_nfms (vector_pair_f64_t *, vector_pair_f64_t *,
+ vector_pair_f64_t *, vector_pair_f64_t *);
+@end smallexample
+
+The following functions are provided for operating on vector pairs
+that consist of 8 32-bit floating point values:
+
+@smallexample
+void vpair_f32_splat (vector_pair_f32_t *, float);
+
+void vpair_f32_abs (vector_pair_f32_t *, vector_pair_f32_t *);
+void vpair_f32_nabs (vector_pair_f32_t *, vector_pair_f32_t *);
+void vpair_f32_neg (vector_pair_f32_t *, vector_pair_f32_t *);
+void vpair_f32_sqrt (vector_pair_f32_t *, vector_pair_f32_t *);
+
+void vpair_f32_add (vector_pair_f32_t *, vector_pair_f32_t *,
+ vector_pair_f32_t *);
+void vpair_f32_div (vector_pair_f32_t *, vector_pair_f32_t *,
+ vector_pair_f32_t *);
+void vpair_f32_max (vector_pair_f32_t *, vector_pair_f32_t *,
+ vector_pair_f32_t *);
+void vpair_f32_min (vector_pair_f32_t *, vector_pair_f32_t *,
+ vector_pair_f32_t *);
+void vpair_f32_mul (vector_pair_f32_t *, vector_pair_f32_t *,
+ vector_pair_f32_t *);
+void vpair_f32_sub (vector_pair_f32_t *, vector_pair_f32_t *,
+ vector_pair_f32_t *);
+
+void vpair_f32_fma (vector_pair_f32_t *, vector_pair_f32_t *,
+ vector_pair_f32_t *, vector_pair_f32_t *);
+void vpair_f32_fms (vector_pair_f32_t *, vector_pair_f32_t *,
+ vector_pair_f32_t *, vector_pair_f32_t *);
+void vpair_f32_nfma (vector_pair_f32_t *, vector_pair_f32_t *,
+ vector_pair_f32_t *, vector_pair_f32_t *);
+void vpair_f32_nfms (vector_pair_f32_t *, vector_pair_f32_t *,
+ vector_pair_f32_t *, vector_pair_f32_t *);
+@end smallexample
+
+The following functions are provided to swap the even and odd
+elements of a vector pair:
+
+@smallexample
+void vpair_f32_swap_odd_even (vector_pair_f32_t *, vector_pair_f32_t *);
+void vpair_f64_swap_odd_even (vector_pair_f64_t *, vector_pair_f64_t *);
+@end smallexample
+
@node PRU Built-in Functions
@subsection PRU Built-in Functions
new file mode 100644
@@ -0,0 +1,150 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+/* Test whether the vector builtin code generates the expected instructions for
+ vector pairs with 4 double elements. */
+
+#include <vector-pair.h>
+
+void
+test_add (vector_pair_f64_t *dest,
+ vector_pair_f64_t *x,
+ vector_pair_f64_t *y)
+{
+ /* 2 lxvp, 2 xvadddp, 1 stxvp. */
+ vpair_f64_add (dest, x, y);
+}
+
+void
+test_sub (vector_pair_f64_t *dest,
+ vector_pair_f64_t *x,
+ vector_pair_f64_t *y)
+{
+ /* 2 lxvp, 2 xvsubdp, 1 stxvp. */
+ vpair_f64_sub (dest, x, y);
+}
+
+void
+test_multiply (vector_pair_f64_t *dest,
+ vector_pair_f64_t *x,
+ vector_pair_f64_t *y)
+{
+ /* 2 lxvp, 2 xvmuldp, 1 stxvp. */
+ vpair_f64_mul (dest, x, y);
+}
+
+void
+test_min (vector_pair_f64_t *dest,
+ vector_pair_f64_t *x,
+ vector_pair_f64_t *y)
+{
+ /* 2 lxvp, 2 xvmindp, 1 stxvp. */
+ vpair_f64_min (dest, x, y);
+}
+
+void
+test_max (vector_pair_f64_t *dest,
+ vector_pair_f64_t *x,
+ vector_pair_f64_t *y)
+{
+ /* 2 lxvp, 2 xvmaxdp, 1 stxvp. */
+ vpair_f64_max (dest, x, y);
+}
+
+void
+test_negate (vector_pair_f64_t *dest,
+ vector_pair_f64_t *x)
+{
+ /* 1 lxvp, 2 xvnegdp, 1 stxvp. */
+ vpair_f64_neg (dest, x);
+}
+
+void
+test_abs (vector_pair_f64_t *dest,
+ vector_pair_f64_t *x)
+{
+ /* 1 lxvp, 2 xvabsdp, 1 stxvp. */
+ vpair_f64_abs (dest, x);
+}
+
+void
+test_negative_abs (vector_pair_f64_t *dest,
+ vector_pair_f64_t *x)
+{
+ /* 1 lxvp, 2 xvnabsdp, 1 stxvp. */
+ vpair_f64_nabs (dest, x);
+}
+
+void
+test_sqrt (vector_pair_f64_t *dest,
+ vector_pair_f64_t *x)
+{
+ /* 1 lxvp, 2 xvsqrtdp, 1 stxvp. */
+ vpair_f64_sqrt (dest, x);
+}
+
+void
+test_fma (vector_pair_f64_t *dest,
+ vector_pair_f64_t *x,
+ vector_pair_f64_t *y,
+ vector_pair_f64_t *z)
+{
+ /* 3 lxvp, 2 xvmadd{a,m}dp, 1 stxvp. */
+ vpair_f64_fma (dest, x, y, z);
+}
+
+void
+test_fms (vector_pair_f64_t *dest,
+ vector_pair_f64_t *x,
+ vector_pair_f64_t *y,
+ vector_pair_f64_t *z)
+{
+ /* 3 lxvp, 2 xvmsub{a,m}dp, 1 stxvp. */
+ vpair_f64_fms (dest, x, y, z);
+}
+
+void
+test_nfma (vector_pair_f64_t *dest,
+ vector_pair_f64_t *x,
+ vector_pair_f64_t *y,
+ vector_pair_f64_t *z)
+{
+ /* 3 lxvp, 2 xvnmadd{a,m}dp, 1 stxvp. */
+ vpair_f64_nfma (dest, x, y, z);
+}
+
+void
+test_nfms (vector_pair_f64_t *dest,
+ vector_pair_f64_t *x,
+ vector_pair_f64_t *y,
+ vector_pair_f64_t *z)
+{
+ /* 3 lxvp, 2 xvnmsub{a,m}dp, 1 stxvp. */
+ vpair_f64_nfms (dest, x, y, z);
+}
+
+void
+test_swap (vector_pair_f64_t *dest,
+ vector_pair_f64_t *x)
+{
+ /* 1 lxvp, 2 xxpermdi, 1 stxvp. */
+ vpair_f64_swap_odd_even (dest, x);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M} 27 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M} 14 } } */
+/* { dg-final { scan-assembler-times {\mxvabsdp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvadddp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvmadd.dp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvmaxdp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvmindp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvmsub.dp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvmuldp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvnabsdp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvnegdp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmadd.dp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmsub.dp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvsqrtdp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvsubdp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxxpermdi\M} 2 } } */
new file mode 100644
@@ -0,0 +1,150 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+/* Test whether the vector builtin code generates the expected instructions for
+ vector pairs with 8 float elements. */
+
+#include <vector-pair.h>
+
+void
+test_add (vector_pair_f32_t *dest,
+ vector_pair_f32_t *x,
+ vector_pair_f32_t *y)
+{
+ /* 2 lxvp, 2 xvaddsp, 1 stxvp. */
+ vpair_f32_add (dest, x, y);
+}
+
+void
+test_sub (vector_pair_f32_t *dest,
+ vector_pair_f32_t *x,
+ vector_pair_f32_t *y)
+{
+ /* 2 lxvp, 2 xvsubsp, 1 stxvp. */
+ vpair_f32_sub (dest, x, y);
+}
+
+void
+test_multiply (vector_pair_f32_t *dest,
+ vector_pair_f32_t *x,
+ vector_pair_f32_t *y)
+{
+ /* 2 lxvp, 2 xvmulsp, 1 stxvp. */
+ vpair_f32_mul (dest, x, y);
+}
+
+void
+test_min (vector_pair_f32_t *dest,
+ vector_pair_f32_t *x,
+ vector_pair_f32_t *y)
+{
+ /* 2 lxvp, 2 xvminsp, 1 stxvp. */
+ vpair_f32_min (dest, x, y);
+}
+
+void
+test_max (vector_pair_f32_t *dest,
+ vector_pair_f32_t *x,
+ vector_pair_f32_t *y)
+{
+ /* 2 lxvp, 2 xvmaxsp, 1 stxvp. */
+ vpair_f32_max (dest, x, y);
+}
+
+void
+test_negate (vector_pair_f32_t *dest,
+ vector_pair_f32_t *x)
+{
+ /* 1 lxvp, 2 xvnegsp, 1 stxvp. */
+ vpair_f32_neg (dest, x);
+}
+
+void
+test_abs (vector_pair_f32_t *dest,
+ vector_pair_f32_t *x)
+{
+ /* 1 lxvp, 2 xvabssp, 1 stxvp. */
+ vpair_f32_abs (dest, x);
+}
+
+void
+test_negative_abs (vector_pair_f32_t *dest,
+ vector_pair_f32_t *x)
+{
+ /* 1 lxvp, 2 xvnabssp, 1 stxvp. */
+ vpair_f32_nabs (dest, x);
+}
+
+void
+test_sqrt (vector_pair_f32_t *dest,
+ vector_pair_f32_t *x)
+{
+ /* 1 lxvp, 2 xvsqrtsp, 1 stxvp. */
+ vpair_f32_sqrt (dest, x);
+}
+
+void
+test_fma (vector_pair_f32_t *dest,
+ vector_pair_f32_t *x,
+ vector_pair_f32_t *y,
+ vector_pair_f32_t *z)
+{
+ /* 3 lxvp, 2 xvmadd{a,m}sp, 1 stxvp. */
+ vpair_f32_fma (dest, x, y, z);
+}
+
+void
+test_fms (vector_pair_f32_t *dest,
+ vector_pair_f32_t *x,
+ vector_pair_f32_t *y,
+ vector_pair_f32_t *z)
+{
+ /* 3 lxvp, 2 xvmsub{a,m}sp, 1 stxvp. */
+ vpair_f32_fms (dest, x, y, z);
+}
+
+void
+test_nfma (vector_pair_f32_t *dest,
+ vector_pair_f32_t *x,
+ vector_pair_f32_t *y,
+ vector_pair_f32_t *z)
+{
+ /* 3 lxvp, 2 xvnmadd{a,m}sp, 1 stxvp. */
+ vpair_f32_nfma (dest, x, y, z);
+}
+
+void
+test_nfms (vector_pair_f32_t *dest,
+ vector_pair_f32_t *x,
+ vector_pair_f32_t *y,
+ vector_pair_f32_t *z)
+{
+ /* 3 lxvp, 2 xvnmsub{a,m}sp, 1 stxvp. */
+ vpair_f32_nfms (dest, x, y, z);
+}
+
+void
+test_swap (vector_pair_f32_t *dest,
+ vector_pair_f32_t *x)
+{
+ /* 1 lxvp, 2 xxpermdi, 1 stxvp. */
+ vpair_f32_swap_odd_even (dest, x);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M} 27 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M} 14 } } */
+/* { dg-final { scan-assembler-times {\mvrld\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvabssp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvaddsp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvmadd.sp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvmaxsp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvminsp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvmsub.sp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvmulsp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvnabssp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvnegsp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmadd.sp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmsub.sp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvsqrtsp\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxvsubsp\M} 2 } } */
new file mode 100644
@@ -0,0 +1,15 @@
+/* { dg-do run { target { vsx_hw } } } */
+/* { dg-options "-mvsx -O2 -ffast-math -mno-mma" } */
+
+/*
+ * This test of the double (f64) vector pair functions in vector-pair.h is run
+ * on VSX systems when the load/store vector pair instructions are not
+ * available.
+ *
+ * The -ffast-math option is used to just use the hardware sqrt, min, and max
+ * instructions without calling into the library.
+ *
+ * The -mno-mma option disables GCC from enabling the __vector_pair type.
+ */
+
+#include "vpair-3.h"
new file mode 100644
@@ -0,0 +1,14 @@
+/* { dg-do run { target { power10_hw } } } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -ffast-math -mmma" } */
+
+/*
+ * This test of the double (f64) vector pair functions in vector-pair.h is run
+ * on VSX systems when the load/store vector pair instructions are available.
+ *
+ * The -ffast-math option is used to just use the hardware sqrt, min, and max
+ * instructions without calling into the library.
+ *
+ * The -mmma option makes sure GCC enables the __vector_pair type.
+ */
+
+#include "vpair-3.h"
new file mode 100644
@@ -0,0 +1,461 @@
+/* Common include file to test the vector pair double functions. This is run
+ two times, once compiled for a non-power10 system that does not have the
+ vector pair load and store instructions, and once with power10 defaults that
+ has load/store vector pair. */
+
+#include <stddef.h>
+#include <stdlib.h>
+#include <vector-pair.h>
+
+#ifdef DEBUG
+#include <stdio.h>
+#endif
+
+#ifndef NUM
+#define NUM 16
+#endif
+
+static double result1[NUM];
+static double result2[NUM];
+static double in_a[NUM];
+static double in_b[NUM];
+static double in_c[NUM];
+
+/* vector pair tests. */
+
+/* Vector-pair absolute value: r = |a|.  The unused B and C arguments keep
+   the common func_t signature shared by every entry in the test table.  */
+void
+vpair_abs (double *r, double *a, double *b, double *c, size_t num)
+{
+  vector_pair_f64_t *vr = (vector_pair_f64_t *)r;
+  vector_pair_f64_t *va = (vector_pair_f64_t *)a;
+
+  size_t i;
+  /* Number of whole vector pairs; any remainder (num not a multiple of the
+     doubles per pair) would be silently ignored — NUM is chosen so there is
+     none.  */
+  size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double));
+
+  for (i = 0; i < num2; i++)
+    vpair_f64_abs (vr + i, va + i);
+}
+
+/* Vector-pair negated absolute value: r = -|a|.  */
+void
+vpair_nabs (double *r, double *a, double *b, double *c, size_t num)
+{
+  vector_pair_f64_t *vr = (vector_pair_f64_t *)r;
+  vector_pair_f64_t *va = (vector_pair_f64_t *)a;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double));
+
+  for (i = 0; i < num2; i++)
+    vpair_f64_nabs (vr + i, va + i);
+}
+
+/* Vector-pair negate: r = -a.  */
+void
+vpair_neg (double *r, double *a, double *b, double *c, size_t num)
+{
+  vector_pair_f64_t *vr = (vector_pair_f64_t *)r;
+  vector_pair_f64_t *va = (vector_pair_f64_t *)a;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double));
+
+  for (i = 0; i < num2; i++)
+    vpair_f64_neg (vr + i, va + i);
+}
+
+/* Vector-pair square root: r = sqrt (a).  */
+void
+vpair_sqrt (double *r, double *a, double *b, double *c, size_t num)
+{
+  vector_pair_f64_t *vr = (vector_pair_f64_t *)r;
+  vector_pair_f64_t *va = (vector_pair_f64_t *)a;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double));
+
+  for (i = 0; i < num2; i++)
+    vpair_f64_sqrt (vr + i, va + i);
+}
+
+/* Vector-pair add: r = a + b.  */
+void
+vpair_add (double *r, double *a, double *b, double *c, size_t num)
+{
+  vector_pair_f64_t *vr = (vector_pair_f64_t *)r;
+  vector_pair_f64_t *va = (vector_pair_f64_t *)a;
+  vector_pair_f64_t *vb = (vector_pair_f64_t *)b;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double));
+
+  for (i = 0; i < num2; i++)
+    vpair_f64_add (vr + i, va + i, vb + i);
+}
+
+/* Vector-pair subtract: r = a - b.  */
+void
+vpair_sub (double *r, double *a, double *b, double *c, size_t num)
+{
+  vector_pair_f64_t *vr = (vector_pair_f64_t *)r;
+  vector_pair_f64_t *va = (vector_pair_f64_t *)a;
+  vector_pair_f64_t *vb = (vector_pair_f64_t *)b;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double));
+
+  for (i = 0; i < num2; i++)
+    vpair_f64_sub (vr + i, va + i, vb + i);
+}
+
+/* Vector-pair multiply: r = a * b.  */
+void
+vpair_mul (double *r, double *a, double *b, double *c, size_t num)
+{
+  vector_pair_f64_t *vr = (vector_pair_f64_t *)r;
+  vector_pair_f64_t *va = (vector_pair_f64_t *)a;
+  vector_pair_f64_t *vb = (vector_pair_f64_t *)b;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double));
+
+  for (i = 0; i < num2; i++)
+    vpair_f64_mul (vr + i, va + i, vb + i);
+}
+
+/* Vector-pair divide: r = a / b.  */
+void
+vpair_div (double *r, double *a, double *b, double *c, size_t num)
+{
+  vector_pair_f64_t *vr = (vector_pair_f64_t *)r;
+  vector_pair_f64_t *va = (vector_pair_f64_t *)a;
+  vector_pair_f64_t *vb = (vector_pair_f64_t *)b;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double));
+
+  for (i = 0; i < num2; i++)
+    vpair_f64_div (vr + i, va + i, vb + i);
+}
+
+/* Vector-pair minimum: r = min (a, b).  */
+void
+vpair_min (double *r, double *a, double *b, double *c, size_t num)
+{
+  vector_pair_f64_t *vr = (vector_pair_f64_t *)r;
+  vector_pair_f64_t *va = (vector_pair_f64_t *)a;
+  vector_pair_f64_t *vb = (vector_pair_f64_t *)b;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double));
+
+  for (i = 0; i < num2; i++)
+    vpair_f64_min (vr + i, va + i, vb + i);
+}
+
+/* Vector-pair maximum: r = max (a, b).  */
+void
+vpair_max (double *r, double *a, double *b, double *c, size_t num)
+{
+  vector_pair_f64_t *vr = (vector_pair_f64_t *)r;
+  vector_pair_f64_t *va = (vector_pair_f64_t *)a;
+  vector_pair_f64_t *vb = (vector_pair_f64_t *)b;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double));
+
+  for (i = 0; i < num2; i++)
+    vpair_f64_max (vr + i, va + i, vb + i);
+}
+
+/* Vector-pair fused multiply-add: r = a * b + c.  */
+void
+vpair_fma (double *r, double *a, double *b, double *c, size_t num)
+{
+  vector_pair_f64_t *vr = (vector_pair_f64_t *)r;
+  vector_pair_f64_t *va = (vector_pair_f64_t *)a;
+  vector_pair_f64_t *vb = (vector_pair_f64_t *)b;
+  vector_pair_f64_t *vc = (vector_pair_f64_t *)c;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double));
+
+  for (i = 0; i < num2; i++)
+    vpair_f64_fma (vr + i, va + i, vb + i, vc + i);
+}
+
+/* Vector-pair fused multiply-subtract: r = a * b - c.  */
+void
+vpair_fms (double *r, double *a, double *b, double *c, size_t num)
+{
+  vector_pair_f64_t *vr = (vector_pair_f64_t *)r;
+  vector_pair_f64_t *va = (vector_pair_f64_t *)a;
+  vector_pair_f64_t *vb = (vector_pair_f64_t *)b;
+  vector_pair_f64_t *vc = (vector_pair_f64_t *)c;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double));
+
+  for (i = 0; i < num2; i++)
+    vpair_f64_fms (vr + i, va + i, vb + i, vc + i);
+}
+
+/* Vector-pair negated fused multiply-add: r = -(a * b + c).  */
+void
+vpair_nfma (double *r, double *a, double *b, double *c, size_t num)
+{
+  vector_pair_f64_t *vr = (vector_pair_f64_t *)r;
+  vector_pair_f64_t *va = (vector_pair_f64_t *)a;
+  vector_pair_f64_t *vb = (vector_pair_f64_t *)b;
+  vector_pair_f64_t *vc = (vector_pair_f64_t *)c;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double));
+
+  for (i = 0; i < num2; i++)
+    vpair_f64_nfma (vr + i, va + i, vb + i, vc + i);
+}
+
+/* Vector-pair negated fused multiply-subtract: r = -(a * b - c).  */
+void
+vpair_nfms (double *r, double *a, double *b, double *c, size_t num)
+{
+  vector_pair_f64_t *vr = (vector_pair_f64_t *)r;
+  vector_pair_f64_t *va = (vector_pair_f64_t *)a;
+  vector_pair_f64_t *vb = (vector_pair_f64_t *)b;
+  vector_pair_f64_t *vc = (vector_pair_f64_t *)c;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double));
+
+  for (i = 0; i < num2; i++)
+    vpair_f64_nfms (vr + i, va + i, vb + i, vc + i);
+}
+
+/* Vector-pair swap of the odd and even elements within each pair;
+   checked against scalar_swap below.  */
+void
+vpair_swap (double *r, double *a, double *b, double *c, size_t num)
+{
+  vector_pair_f64_t *vr = (vector_pair_f64_t *)r;
+  vector_pair_f64_t *va = (vector_pair_f64_t *)a;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double));
+
+  for (i = 0; i < num2; i++)
+    vpair_f64_swap_odd_even (vr + i, va + i);
+}
+
+
+/* scalar tests. */
+
+/* Scalar reference: r[i] = |a[i]|.  The unused B/C arguments keep the
+   common func_t signature.  Note: unlike fabs, this maps -0.0 to -0.0;
+   harmless here because the check uses ==, and -0.0 == 0.0.  */
+void
+scalar_abs (double *r, double *a, double *b, double *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = (a[i] < 0.0) ? -a[i] : a[i];
+}
+
+/* Scalar reference: r[i] = -|a[i]|.  */
+void
+scalar_nabs (double *r, double *a, double *b, double *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = (a[i] < 0.0) ? a[i] : -a[i];
+}
+
+/* Scalar reference: r[i] = -a[i].  */
+void
+scalar_neg (double *r, double *a, double *b, double *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = -a[i];
+}
+
+/* Scalar reference: r[i] = sqrt (a[i]).  The inputs are perfect squares,
+   so the result is exact.  */
+void
+scalar_sqrt (double *r, double *a, double *b, double *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = __builtin_sqrt (a[i]);
+}
+
+/* Scalar reference: r[i] = a[i] + b[i].  */
+void
+scalar_add (double *r, double *a, double *b, double *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = a[i] + b[i];
+}
+
+/* Scalar reference: r[i] = a[i] - b[i].  */
+void
+scalar_sub (double *r, double *a, double *b, double *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = a[i] - b[i];
+}
+
+/* Scalar reference: r[i] = a[i] * b[i].  */
+void
+scalar_mul (double *r, double *a, double *b, double *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = a[i] * b[i];
+}
+
+/* Scalar reference: r[i] = a[i] / b[i].  */
+void
+scalar_div (double *r, double *a, double *b, double *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = a[i] / b[i];
+}
+
+/* Scalar reference: r[i] = min (a[i], b[i]).  */
+void
+scalar_min (double *r, double *a, double *b, double *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = (a[i] < b[i]) ? a[i] : b[i];
+}
+
+/* Scalar reference: r[i] = max (a[i], b[i]).  */
+void
+scalar_max (double *r, double *a, double *b, double *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = (a[i] > b[i]) ? a[i] : b[i];
+}
+
+/* Scalar reference: r[i] = fma (a[i], b[i], c[i]) = a*b + c fused.  */
+void
+scalar_fma (double *r, double *a, double *b, double *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = __builtin_fma (a[i], b[i], c[i]);
+}
+
+/* Scalar reference: r[i] = a[i]*b[i] - c[i], fused.  */
+void
+scalar_fms (double *r, double *a, double *b, double *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = __builtin_fma (a[i], b[i], -c[i]);
+}
+
+/* Scalar reference: r[i] = -(a[i]*b[i] + c[i]), fused.  */
+void
+scalar_nfma (double *r, double *a, double *b, double *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = - __builtin_fma (a[i], b[i], c[i]);
+}
+
+/* Scalar reference: r[i] = -(a[i]*b[i] - c[i]), fused.  */
+void
+scalar_nfms (double *r, double *a, double *b, double *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = - __builtin_fma (a[i], b[i], -c[i]);
+}
+
+/* Scalar reference for the swap test: exchange each even/odd element
+   pair.  Assumes num is even — TODO confirm; NUM defaults to 16.  */
+void
+scalar_swap (double *r, double *a, double *b, double *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i += 2)
+    {
+      r[i] = a[i+1];
+      r[i+1] = a[i];
+    }
+}
+
+
+/* Check results. */
+/* Compare the vector-pair results (result1) against the scalar reference
+   results (result2) and abort on the first mismatch.  Exact == comparison
+   is intended: the inputs are chosen so every operation is exact, so both
+   sides should produce bit-identical values.  NAME identifies the failing
+   test in the DEBUG output.  */
+void
+check (const char *name)
+{
+  size_t i;
+
+  for (i = 0; i < NUM; i++)
+    if (result1[i] != result2[i])
+      {
+#ifdef DEBUG
+	printf ("test #%ld failed, %g != %g, %s (%g, %g, %g).\n",
+		(long)i,
+		result1[i],
+		result2[i],
+		name,
+		in_a[i],
+		in_b[i],
+		in_c[i]);
+#endif
+	abort ();
+      }
+
+  return;
+}
+
+/* Common signature shared by every vector-pair routine and its scalar
+   reference, so both can be driven from one table.  */
+typedef void func_t (double *, double *, double *, double *, size_t);
+
+/* tests to run: each entry pairs a vector-pair implementation with its
+   scalar reference and a name for diagnostics.  */
+struct
+{
+  func_t *vpair_test;
+  func_t *scalar_test;
+  const char *name;
+} tests[] = {
+  { vpair_abs,	scalar_abs,	"abs"  },
+  { vpair_nabs,	scalar_nabs,	"nabs" },
+  { vpair_neg,	scalar_neg,	"neg"  },
+  { vpair_sqrt,	scalar_sqrt,	"sqrt" },
+  { vpair_add,	scalar_add,	"add"  },
+  { vpair_sub,	scalar_sub,	"sub"  },
+  { vpair_mul,	scalar_mul,	"mul"  },
+  { vpair_div,	scalar_div,	"div"  },
+  { vpair_min,	scalar_min,	"min"  },
+  { vpair_max,	scalar_max,	"max"  },
+  { vpair_fma,	scalar_fma,	"fma"  },
+  { vpair_fms,	scalar_fms,	"fms"  },
+  { vpair_nfma,	scalar_nfma,	"nfma" },
+  { vpair_nfms,	scalar_nfms,	"nfms" },
+  { vpair_swap,	scalar_swap,	"swap" },
+};
+
+/* Run tests. */
+
+/* Run every table entry over identical inputs, comparing the vector-pair
+   result against the scalar reference after each test.  */
+int
+main (void)
+{
+  size_t i;
+
+  /* Initialize the inputs.  in_a[i] is a perfect square so sqrt is exact,
+     and all values are small integers so every operation (including the
+     fused multiply-adds) is exact and bit-comparable.  */
+  for (i = 0; i < NUM; i++)
+    {
+      double d = (double)(i + 1);
+      in_a[i] = d * d;
+      in_b[i] = d;
+      in_c[i] = d + 2.0;
+    }
+
+#ifdef DEBUG
+  printf ("Start tests\n");
+#endif
+
+  /* Run the tests.  */
+  for (i = 0; i < sizeof (tests) / sizeof (tests[0]); i++)
+    {
+      tests[i].vpair_test (result1, in_a, in_b, in_c, NUM);
+      tests[i].scalar_test (result2, in_a, in_b, in_c, NUM);
+      check (tests[i].name);
+    }
+
+#ifdef DEBUG
+  printf ("End tests\n");
+#endif
+
+  return 0;
+}
new file mode 100644
@@ -0,0 +1,15 @@
+/* { dg-do run { target { vsx_hw } } } */
+/* { dg-options "-mvsx -O2 -ffast-math -mno-mma" } */
+
+/*
+ * This test of the float (f32) vector pair functions in vector-pair.h is run
+ * on VSX systems when the load/store vector pair instructions are not
+ * available.
+ *
+ * The -ffast-math option is used to just use the hardware sqrt, min, and max
+ * instructions without calling into the library.
+ *
+ * The -mno-mma option prevents GCC from enabling the __vector_pair type.
+ */
+
+#include "vpair-4.h"
new file mode 100644
@@ -0,0 +1,14 @@
+/* { dg-do run { target { power10_hw } } } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -ffast-math -mmma" } */
+
+/*
+ * This test of the float (f32) vector pair functions in vector-pair.h is run
+ * on VSX systems when the load/store vector pair instructions are available.
+ *
+ * The -ffast-math option is used to just use the hardware sqrt, min, and max
+ * instructions without calling into the library.
+ *
+ * The -mmma option makes sure GCC enables the __vector_pair type.
+ */
+
+#include "vpair-4.h"
new file mode 100644
@@ -0,0 +1,461 @@
+/* Common include file to test the vector pair float functions. This is run
+ two times, once compiled for a non-power10 system that does not have the
+ vector pair load and store instructions, and once with power10 defaults that
+ has load/store vector pair. */
+
+#include <stddef.h>
+#include <stdlib.h>
+#include <vector-pair.h>
+
+#ifdef DEBUG
+#include <stdio.h>
+#endif
+
+#ifndef NUM
+#define NUM 16
+#endif
+
+static float result1[NUM];
+static float result2[NUM];
+static float in_a[NUM];
+static float in_b[NUM];
+static float in_c[NUM];
+
+/* vector pair tests. */
+
+/* Vector-pair absolute value: r = |a|.  The unused B and C arguments keep
+   the common func_t signature shared by every entry in the test table.  */
+void
+vpair_abs (float *r, float *a, float *b, float *c, size_t num)
+{
+  vector_pair_f32_t *vr = (vector_pair_f32_t *)r;
+  vector_pair_f32_t *va = (vector_pair_f32_t *)a;
+
+  size_t i;
+  /* Number of whole vector pairs; any remainder (num not a multiple of the
+     floats per pair) would be silently ignored — NUM is chosen so there is
+     none.  */
+  size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float));
+
+  for (i = 0; i < num2; i++)
+    vpair_f32_abs (vr + i, va + i);
+}
+
+/* Vector-pair negated absolute value: r = -|a|.  */
+void
+vpair_nabs (float *r, float *a, float *b, float *c, size_t num)
+{
+  vector_pair_f32_t *vr = (vector_pair_f32_t *)r;
+  vector_pair_f32_t *va = (vector_pair_f32_t *)a;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float));
+
+  for (i = 0; i < num2; i++)
+    vpair_f32_nabs (vr + i, va + i);
+}
+
+/* Vector-pair negate: r = -a.  */
+void
+vpair_neg (float *r, float *a, float *b, float *c, size_t num)
+{
+  vector_pair_f32_t *vr = (vector_pair_f32_t *)r;
+  vector_pair_f32_t *va = (vector_pair_f32_t *)a;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float));
+
+  for (i = 0; i < num2; i++)
+    vpair_f32_neg (vr + i, va + i);
+}
+
+/* Vector-pair square root: r = sqrt (a).  */
+void
+vpair_sqrt (float *r, float *a, float *b, float *c, size_t num)
+{
+  vector_pair_f32_t *vr = (vector_pair_f32_t *)r;
+  vector_pair_f32_t *va = (vector_pair_f32_t *)a;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float));
+
+  for (i = 0; i < num2; i++)
+    vpair_f32_sqrt (vr + i, va + i);
+}
+
+/* Vector-pair add: r = a + b.  */
+void
+vpair_add (float *r, float *a, float *b, float *c, size_t num)
+{
+  vector_pair_f32_t *vr = (vector_pair_f32_t *)r;
+  vector_pair_f32_t *va = (vector_pair_f32_t *)a;
+  vector_pair_f32_t *vb = (vector_pair_f32_t *)b;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float));
+
+  for (i = 0; i < num2; i++)
+    vpair_f32_add (vr + i, va + i, vb + i);
+}
+
+/* Vector-pair subtract: r = a - b.  */
+void
+vpair_sub (float *r, float *a, float *b, float *c, size_t num)
+{
+  vector_pair_f32_t *vr = (vector_pair_f32_t *)r;
+  vector_pair_f32_t *va = (vector_pair_f32_t *)a;
+  vector_pair_f32_t *vb = (vector_pair_f32_t *)b;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float));
+
+  for (i = 0; i < num2; i++)
+    vpair_f32_sub (vr + i, va + i, vb + i);
+}
+
+/* Vector-pair multiply: r = a * b.  */
+void
+vpair_mul (float *r, float *a, float *b, float *c, size_t num)
+{
+  vector_pair_f32_t *vr = (vector_pair_f32_t *)r;
+  vector_pair_f32_t *va = (vector_pair_f32_t *)a;
+  vector_pair_f32_t *vb = (vector_pair_f32_t *)b;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float));
+
+  for (i = 0; i < num2; i++)
+    vpair_f32_mul (vr + i, va + i, vb + i);
+}
+
+/* Vector-pair divide: r = a / b.  */
+void
+vpair_div (float *r, float *a, float *b, float *c, size_t num)
+{
+  vector_pair_f32_t *vr = (vector_pair_f32_t *)r;
+  vector_pair_f32_t *va = (vector_pair_f32_t *)a;
+  vector_pair_f32_t *vb = (vector_pair_f32_t *)b;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float));
+
+  for (i = 0; i < num2; i++)
+    vpair_f32_div (vr + i, va + i, vb + i);
+}
+
+/* Vector-pair minimum: r = min (a, b).  */
+void
+vpair_min (float *r, float *a, float *b, float *c, size_t num)
+{
+  vector_pair_f32_t *vr = (vector_pair_f32_t *)r;
+  vector_pair_f32_t *va = (vector_pair_f32_t *)a;
+  vector_pair_f32_t *vb = (vector_pair_f32_t *)b;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float));
+
+  for (i = 0; i < num2; i++)
+    vpair_f32_min (vr + i, va + i, vb + i);
+}
+
+/* Vector-pair maximum: r = max (a, b).  */
+void
+vpair_max (float *r, float *a, float *b, float *c, size_t num)
+{
+  vector_pair_f32_t *vr = (vector_pair_f32_t *)r;
+  vector_pair_f32_t *va = (vector_pair_f32_t *)a;
+  vector_pair_f32_t *vb = (vector_pair_f32_t *)b;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float));
+
+  for (i = 0; i < num2; i++)
+    vpair_f32_max (vr + i, va + i, vb + i);
+}
+
+/* Vector-pair fused multiply-add: r = a * b + c.  */
+void
+vpair_fma (float *r, float *a, float *b, float *c, size_t num)
+{
+  vector_pair_f32_t *vr = (vector_pair_f32_t *)r;
+  vector_pair_f32_t *va = (vector_pair_f32_t *)a;
+  vector_pair_f32_t *vb = (vector_pair_f32_t *)b;
+  vector_pair_f32_t *vc = (vector_pair_f32_t *)c;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float));
+
+  for (i = 0; i < num2; i++)
+    vpair_f32_fma (vr + i, va + i, vb + i, vc + i);
+}
+
+/* Vector-pair fused multiply-subtract: r = a * b - c.  */
+void
+vpair_fms (float *r, float *a, float *b, float *c, size_t num)
+{
+  vector_pair_f32_t *vr = (vector_pair_f32_t *)r;
+  vector_pair_f32_t *va = (vector_pair_f32_t *)a;
+  vector_pair_f32_t *vb = (vector_pair_f32_t *)b;
+  vector_pair_f32_t *vc = (vector_pair_f32_t *)c;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float));
+
+  for (i = 0; i < num2; i++)
+    vpair_f32_fms (vr + i, va + i, vb + i, vc + i);
+}
+
+/* Vector-pair negated fused multiply-add: r = -(a * b + c).  */
+void
+vpair_nfma (float *r, float *a, float *b, float *c, size_t num)
+{
+  vector_pair_f32_t *vr = (vector_pair_f32_t *)r;
+  vector_pair_f32_t *va = (vector_pair_f32_t *)a;
+  vector_pair_f32_t *vb = (vector_pair_f32_t *)b;
+  vector_pair_f32_t *vc = (vector_pair_f32_t *)c;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float));
+
+  for (i = 0; i < num2; i++)
+    vpair_f32_nfma (vr + i, va + i, vb + i, vc + i);
+}
+
+/* Vector-pair negated fused multiply-subtract: r = -(a * b - c).  */
+void
+vpair_nfms (float *r, float *a, float *b, float *c, size_t num)
+{
+  vector_pair_f32_t *vr = (vector_pair_f32_t *)r;
+  vector_pair_f32_t *va = (vector_pair_f32_t *)a;
+  vector_pair_f32_t *vb = (vector_pair_f32_t *)b;
+  vector_pair_f32_t *vc = (vector_pair_f32_t *)c;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float));
+
+  for (i = 0; i < num2; i++)
+    vpair_f32_nfms (vr + i, va + i, vb + i, vc + i);
+}
+
+/* Vector-pair swap of the odd and even elements within each pair;
+   checked against scalar_swap below.  */
+void
+vpair_swap (float *r, float *a, float *b, float *c, size_t num)
+{
+  vector_pair_f32_t *vr = (vector_pair_f32_t *)r;
+  vector_pair_f32_t *va = (vector_pair_f32_t *)a;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float));
+
+  for (i = 0; i < num2; i++)
+    vpair_f32_swap_odd_even (vr + i, va + i);
+}
+
+
+/* scalar tests. */
+
+/* Scalar reference: r[i] = |a[i]|.  The unused B/C arguments keep the
+   common func_t signature.  Note: unlike fabsf, this maps -0.0f to -0.0f;
+   harmless here because the check uses ==, and -0.0 == 0.0.  */
+void
+scalar_abs (float *r, float *a, float *b, float *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = (a[i] < 0.0) ? -a[i] : a[i];
+}
+
+/* Scalar reference: r[i] = -|a[i]|.  */
+void
+scalar_nabs (float *r, float *a, float *b, float *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = (a[i] < 0.0) ? a[i] : -a[i];
+}
+
+/* Scalar reference: r[i] = -a[i].  */
+void
+scalar_neg (float *r, float *a, float *b, float *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = -a[i];
+}
+
+/* Scalar reference: r[i] = sqrt (a[i]).  NOTE(review): this uses the
+   double-precision __builtin_sqrt on float data; exact for the
+   perfect-square inputs used here, but could double-round and differ from
+   the single-precision vector result for other inputs — confirm intent.  */
+void
+scalar_sqrt (float *r, float *a, float *b, float *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = __builtin_sqrt (a[i]);
+}
+
+/* Scalar reference: r[i] = a[i] + b[i].  */
+void
+scalar_add (float *r, float *a, float *b, float *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = a[i] + b[i];
+}
+
+/* Scalar reference: r[i] = a[i] - b[i].  */
+void
+scalar_sub (float *r, float *a, float *b, float *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = a[i] - b[i];
+}
+
+/* Scalar reference: r[i] = a[i] * b[i].  */
+void
+scalar_mul (float *r, float *a, float *b, float *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = a[i] * b[i];
+}
+
+/* Scalar reference: r[i] = a[i] / b[i].  */
+void
+scalar_div (float *r, float *a, float *b, float *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = a[i] / b[i];
+}
+
+/* Scalar reference: r[i] = min (a[i], b[i]).  */
+void
+scalar_min (float *r, float *a, float *b, float *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = (a[i] < b[i]) ? a[i] : b[i];
+}
+
+/* Scalar reference: r[i] = max (a[i], b[i]).  */
+void
+scalar_max (float *r, float *a, float *b, float *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = (a[i] > b[i]) ? a[i] : b[i];
+}
+
+/* Scalar reference: r[i] = a[i]*b[i] + c[i], fused.  NOTE(review): the
+   double-precision __builtin_fma is used on float data; exact for the
+   small-integer inputs used here, but could double-round for other
+   inputs — confirm intent (vs. __builtin_fmaf).  */
+void
+scalar_fma (float *r, float *a, float *b, float *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = __builtin_fma (a[i], b[i], c[i]);
+}
+
+/* Scalar reference: r[i] = a[i]*b[i] - c[i], fused (see note above
+   scalar_fma in this same file about double-precision __builtin_fma).  */
+void
+scalar_fms (float *r, float *a, float *b, float *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = __builtin_fma (a[i], b[i], -c[i]);
+}
+
+/* Scalar reference: r[i] = -(a[i]*b[i] + c[i]), fused.  */
+void
+scalar_nfma (float *r, float *a, float *b, float *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = - __builtin_fma (a[i], b[i], c[i]);
+}
+
+/* Scalar reference: r[i] = -(a[i]*b[i] - c[i]), fused.  */
+void
+scalar_nfms (float *r, float *a, float *b, float *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = - __builtin_fma (a[i], b[i], -c[i]);
+}
+
+/* Scalar reference for the swap test: exchange each even/odd element
+   pair.  Assumes num is even — TODO confirm; NUM defaults to 16.  */
+void
+scalar_swap (float *r, float *a, float *b, float *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i += 2)
+    {
+      r[i] = a[i+1];
+      r[i+1] = a[i];
+    }
+}
+
+
+/* Check results. */
+/* Compare the vector-pair results (result1) against the scalar reference
+   results (result2) and abort on the first mismatch.  Exact == comparison
+   is intended: the inputs are chosen so every operation is exact, so both
+   sides should produce bit-identical values.  NAME identifies the failing
+   test in the DEBUG output.  Floats promote to double for the %g formats.  */
+void
+check (const char *name)
+{
+  size_t i;
+
+  for (i = 0; i < NUM; i++)
+    if (result1[i] != result2[i])
+      {
+#ifdef DEBUG
+	printf ("test #%ld failed, %g != %g, %s (%g, %g, %g).\n",
+		(long)i,
+		result1[i],
+		result2[i],
+		name,
+		in_a[i],
+		in_b[i],
+		in_c[i]);
+#endif
+	abort ();
+      }
+
+  return;
+}
+
+/* Common signature shared by every vector-pair routine and its scalar
+   reference, so both can be driven from one table.  */
+typedef void func_t (float *, float *, float *, float *, size_t);
+
+/* tests to run: each entry pairs a vector-pair implementation with its
+   scalar reference and a name for diagnostics.  */
+struct
+{
+  func_t *vpair_test;
+  func_t *scalar_test;
+  const char *name;
+} tests[] = {
+  { vpair_abs,	scalar_abs,	"abs"  },
+  { vpair_nabs,	scalar_nabs,	"nabs" },
+  { vpair_neg,	scalar_neg,	"neg"  },
+  { vpair_sqrt,	scalar_sqrt,	"sqrt" },
+  { vpair_add,	scalar_add,	"add"  },
+  { vpair_sub,	scalar_sub,	"sub"  },
+  { vpair_mul,	scalar_mul,	"mul"  },
+  { vpair_div,	scalar_div,	"div"  },
+  { vpair_min,	scalar_min,	"min"  },
+  { vpair_max,	scalar_max,	"max"  },
+  { vpair_fma,	scalar_fma,	"fma"  },
+  { vpair_fms,	scalar_fms,	"fms"  },
+  { vpair_nfma,	scalar_nfma,	"nfma" },
+  { vpair_nfms,	scalar_nfms,	"nfms" },
+  { vpair_swap,	scalar_swap,	"swap" },
+};
+
+/* Run tests. */
+
+/* Run every table entry over identical inputs, comparing the vector-pair
+   result against the scalar reference after each test.  */
+int
+main (void)
+{
+  size_t i;
+
+  /* Initialize the inputs.  in_a[i] is a perfect square so sqrt is exact,
+     and all values are small integers so every operation (including the
+     fused multiply-adds) is exact and bit-comparable.  */
+  for (i = 0; i < NUM; i++)
+    {
+      float f = (float)(i + 1);
+      in_a[i] = f * f;
+      in_b[i] = f;
+      in_c[i] = f + 2.0f;
+    }
+
+#ifdef DEBUG
+  printf ("Start tests\n");
+#endif
+
+  /* Run the tests.  */
+  for (i = 0; i < sizeof (tests) / sizeof (tests[0]); i++)
+    {
+      tests[i].vpair_test (result1, in_a, in_b, in_c, NUM);
+      tests[i].scalar_test (result2, in_a, in_b, in_c, NUM);
+      check (tests[i].name);
+    }
+
+#ifdef DEBUG
+  printf ("End tests\n");
+#endif
+
+  return 0;
+}