diff mbox series

[1/1] PowerPC vector pair support

Message ID ZwRv9uMQlWfgAv_G@cowardly-lion.the-meissners.org
State New
Headers show
Series PowerPC vector pair support | expand

Commit Message

Michael Meissner Oct. 7, 2024, 11:34 p.m. UTC
See the previous post for a longer explanation of the motavations for this
patch:

    https://gcc.gnu.org/pipermail/gcc-patches/2024-October/664694.html

This patch adds a new include file (vector-pair.h) that implements a series of
functions that allows people implementing high performance libraries to
optimize their code to use the vector pair load/store instructions on power10
computers to enhance the memory bandwidth.

I have tested this on both big endian and little endian servers.  Can I check
this into the GCC trunk?

2024-10-07  Michael Meissner  <meissner@linux.ibm.com>

gcc/

	* config.gcc (powerpc*-*-*): Add vector-pair.h to extra headers.
	* config/rs6000/vector-pair.h: New file.
	* doc/extend.texi (PowerPC Vector Pair Support): Document the vector
	pair support functions.

gcc/testsuite/

	* gcc.target/powerpc/vpair-1.c: New test or include file.
	* gcc.target/powerpc/vpair-2.c: Likewise.
	* gcc.target/powerpc/vpair-3-not-p10.c: Likewise.
	* gcc.target/powerpc/vpair-3-p10.c: Likewise.
	* gcc.target/powerpc/vpair-3.h: Likewise.
	* gcc.target/powerpc/vpair-4-not-p10.c: Likewise.
	* gcc.target/powerpc/vpair-4-p10.c: Likewise.
	* gcc.target/powerpc/vpair-4.h: Likewise.
---
 gcc/config.gcc                                |   2 +-
 gcc/config/rs6000/rs6000-c.cc                 |   8 +-
 gcc/config/rs6000/vector-pair.h               | 519 ++++++++++++++++++
 gcc/doc/extend.texi                           |  98 ++++
 gcc/testsuite/gcc.target/powerpc/vpair-1.c    | 141 +++++
 gcc/testsuite/gcc.target/powerpc/vpair-2.c    | 141 +++++
 .../gcc.target/powerpc/vpair-3-not-p10.c      |  15 +
 .../gcc.target/powerpc/vpair-3-p10.c          |  14 +
 gcc/testsuite/gcc.target/powerpc/vpair-3.h    | 435 +++++++++++++++
 .../gcc.target/powerpc/vpair-4-not-p10.c      |  15 +
 .../gcc.target/powerpc/vpair-4-p10.c          |  14 +
 gcc/testsuite/gcc.target/powerpc/vpair-4.h    | 435 +++++++++++++++
 12 files changed, 1834 insertions(+), 3 deletions(-)
 create mode 100644 gcc/config/rs6000/vector-pair.h
 create mode 100644 gcc/testsuite/gcc.target/powerpc/vpair-1.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/vpair-2.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/vpair-3-not-p10.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/vpair-3-p10.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/vpair-3.h
 create mode 100644 gcc/testsuite/gcc.target/powerpc/vpair-4-not-p10.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/vpair-4-p10.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/vpair-4.h
diff mbox series

Patch

diff --git a/gcc/config.gcc b/gcc/config.gcc
index 0b794e977f6..3627bed8b86 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -537,7 +537,7 @@  powerpc*-*-*)
 	extra_headers="${extra_headers} pmmintrin.h tmmintrin.h smmintrin.h"
 	extra_headers="${extra_headers} nmmintrin.h immintrin.h x86gprintrin.h"
 	extra_headers="${extra_headers} ppu_intrinsics.h spu2vmx.h vec_types.h si2vmx.h"
-	extra_headers="${extra_headers} amo.h"
+	extra_headers="${extra_headers} amo.h vector-pair.h"
 	case x$with_cpu in
 	    xpowerpc64|xdefault64|x6[23]0|x970|xG5|xpower[3456789]|xpower1[01]|xpower6x|xrs64a|xcell|xa2|xe500mc64|xe5500|xe6500|xfuture)
 		cpu_is_64bit=yes
diff --git a/gcc/config/rs6000/rs6000-c.cc b/gcc/config/rs6000/rs6000-c.cc
index 82826f96a8e..77bee8fc878 100644
--- a/gcc/config/rs6000/rs6000-c.cc
+++ b/gcc/config/rs6000/rs6000-c.cc
@@ -590,9 +590,13 @@  rs6000_target_modify_macros (bool define_p, HOST_WIDE_INT flags,
   if (rs6000_cpu == PROCESSOR_CELL)
     rs6000_define_or_undefine_macro (define_p, "__PPU__");
 
-  /* Tell the user if we support the MMA instructions.  */
+  /* Tell the user if we support the MMA instructions.  Also tell vector-pair.h
+     that we have the vector pair built-in function support.  */
   if ((flags & OPTION_MASK_MMA) != 0)
-    rs6000_define_or_undefine_macro (define_p, "__MMA__");
+    {
+      rs6000_define_or_undefine_macro (define_p, "__MMA__");
+      rs6000_define_or_undefine_macro (define_p, "__VPAIR__");
+    }
   /* Whether pc-relative code is being generated.  */
   if ((flags & OPTION_MASK_PCREL) != 0)
     rs6000_define_or_undefine_macro (define_p, "__PCREL__");
diff --git a/gcc/config/rs6000/vector-pair.h b/gcc/config/rs6000/vector-pair.h
new file mode 100644
index 00000000000..ceb28c4e974
--- /dev/null
+++ b/gcc/config/rs6000/vector-pair.h
@@ -0,0 +1,519 @@ 
+/* PowerPC vector pair include file.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   Contributed by Aldy Hernandez (aldyh@redhat.com).
+   Rewritten by Paolo Bonzini (bonzini@gnu.org).
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published
+   by the Free Software Foundation; either version 3, or (at your
+   option) any later version.
+
+   GCC is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+   License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Provide support for vector pairs, even on systems that do not have native
+   support for loading and storing pairs of vectors.  */
+
+#ifndef _VECTOR_PAIR_H
+#define _VECTOR_PAIR_H	1
+
+/* Union of the various vector pair types.  */
+union __vpair_union {
+
+#ifdef __MMA__
+  __vector_pair		__vpair;
+#endif
+
+  vector double		__vp_f64[2];
+  vector float		__vp_f32[2];
+  vector unsigned char	__vp_uc[2];
+};
+
+typedef union __vpair_union	vector_pair_f64_t;
+typedef union __vpair_union	vector_pair_f32_t;
+
+#if !__VPAIR_BUILTIN__ && !__VPAIR_ASM__ && !__VPAIR_NOP10__
+#if __MMA__
+#define __VPAIR_ASM__		1
+
+#else
+#define __VPAIR_NOP10__		1
+#endif
+#endif
+
+/* Macros to simplify creation of the various operations.
+ *
+ * The __VPAIR_FP_{UNARY,BINARY,FMA} macros are the base macros, and takes:
+ *	R:         The  argument for the output vector pair
+ *	A, B, C:   1-3 arguments for the inputs
+ *	OPCODE:    The assembler opcode for __asm__ on power10
+ *	VEC:       Either __vp_f64 or __vp_f32 for the union field
+ *	VEC_FUNC:  128-bit vector function for use on power8/power9
+ *
+ * The __VPAIR_FP_splat macro takes:
+ *	R:         The  argument for the output vector pair
+ *	X:         The scalar that is to be splat-ed to the vector pair
+ *	VEC:       Either __vp_f64 or __vp_f32 for the union field
+ *
+ * The __VPAIR_F32_<...> and __VPAIR_F64_<...> macros call the above macros
+ * with the appropriate structure field to use.
+ */
+
+#undef  __VPAIR_FP_SPLAT
+#undef  __VPAIR_FP_UNARY
+#undef  __VPAIR_FP_BINARY
+#undef  __VPAIR_FP_FMA
+
+#undef  __VPAIR_F64_UNARY
+#undef  __VPAIR_F64_BINARY
+#undef  __VPAIR_F64_FMA
+
+#undef  __VPAIR_F32_UNARY
+#undef  __VPAIR_F32_BINARY
+#undef  __VPAIR_F32_FMA
+
+/* Operations using a vector pair and __asm__operations.  */
+#if __MMA__ && !__VPAIR_NOP10__
+
+/* When using __asm__, we need to access the second register.  Due to the way
+   VSX registers were formed by combining the traditional floating point
+   registers and Altivec registers, we can't use the output modifier %L<n> to
+   refer to the second register if the VSX register was a traditional Altivec
+   register.  If the value is in VSX registers 34 & 35, %x0 would give 34, but
+   %L0 would give 1, since 'Altivec' registers start at 0.
+
+   If we are using GAS under Linux, we can use %x0+1 to access the second
+   register and use the full VSX register set.
+
+   If this include file is used on non-Linux systems, or with a non-GCC
+   compiler, limit the registers used to the traditional FPR registers so that
+   we can use %L0.  */
+
+#if __VPAIR__USE_FPR__ || !__GNUC__ || (!__linux__ && !__ELF__)
+
+/* Use %0 and %L0 on traditional FPR registers.  */
+#define __VPAIR_FP_SPLAT(R, X, VEC)					\
+  __asm__ ("xxlor %L0,%0,%0"						\
+           : "=d" ((R)->__vpair)					\
+           : "0" (__builtin_vec_splats ((X))))
+
+#define __VPAIR_FP_UNARY(R, A, OPCODE, VEC, VEC_FUNC)			\
+  __asm__ (OPCODE " %0,%1\n\t" OPCODE " %L0,%L1"			\
+           : "=d" ((R)->__vpair)					\
+           : "d" ((A)->__vpair))
+
+#define __VPAIR_FP_BINARY(R, A, B, OPCODE, VEC, VEC_FUNC)		\
+  __asm__ (OPCODE " %0,%1,$1\n\t" OPCODE " %L0,%L1,%L2"			\
+           : "=d" ((R)->__vpair)					\
+           : "d" ((A)->__vpair), "d" ((B)->__vpair))
+
+/* Note the 'a' form of the fma instructions must be used.  */
+#define __VPAIR_FP_FMA(R, A, B, C, OPCODE, VEC, VEC_FUNC)		\
+  __asm__ (OPCODE " %0,%1,%2\n\t" OPCODE " %L0,%L1,%L2"			\
+           : "=d" ((R)->__vpair)					\
+           : "d" ((A)->__vpair), "d" ((B)->__vpair), "0" ((C)->__vpair))
+
+#else
+
+/* Use %x0 and %x0+1 on VSX reigsters.  */
+#define __VPAIR_FP_SPLAT(R, X, VEC)					\
+  __asm__ ("xxlor %x0+1,%x0,%x0"					\
+           : "=wa" ((R)->__vpair)					\
+           : "0" (__builtin_vec_splats ((X))))
+
+#define __VPAIR_FP_UNARY(R, A, OPCODE, VEC, VEC_FUNC)			\
+  __asm__ (OPCODE " %x0,%x1\n\t" OPCODE " %x0+1,%x1+1"			\
+           : "=wa" ((R)->__vpair)					\
+           : "wa" ((A)->__vpair))
+
+#define __VPAIR_FP_BINARY(R, A, B, OPCODE, VEC, VEC_FUNC)		\
+  __asm__ (OPCODE " %x0,%x1,%x2\n\t" OPCODE " %x0+1,%x1+1,%x2+1"	\
+           : "=wa" ((R)->__vpair)					\
+           : "wa" ((A)->__vpair), "wa" ((B)->__vpair))
+
+/* Note the 'a' form of the fma instructions must be used.  */
+#define __VPAIR_FP_FMA(R, A, B, C, OPCODE, VEC, VEC_FUNC)		\
+  __asm__ (OPCODE " %x0,%x1,%x2\n\t" OPCODE " %x0+1,%x1+1,%x2+1"	\
+           : "=wa" ((R)->__vpair)					\
+           : "wa" ((A)->__vpair), "wa" ((B)->__vpair), "0" ((C)->__vpair))
+#endif	/* Select whether to use %0/%L0 or %x0/%x0+1.  */
+
+#else	/* vpair support on power8/power9.  */
+
+/* Pair of vector operations using a built-in function.  */
+
+#define __VPAIR_FP_SPLAT(R, X, VEC)					\
+  (R)->VEC[0] = (R)->VEC[1] = __builtin_vec_splats ((X))
+
+#define __VPAIR_FP_UNARY(R, A, OPCODE, VEC, VEC_FUNC)			\
+  do									\
+    {									\
+      (R)->VEC[0] = VEC_FUNC ((A)->VEC[0]);				\
+      (R)->VEC[1] = VEC_FUNC ((A)->VEC[1]);				\
+    }									\
+  while (0)
+
+#define __VPAIR_FP_BINARY(R, A, B, OPCODE, VEC, VEC_FUNC)		\
+  do									\
+    {									\
+      (R)->VEC[0] = VEC_FUNC ((A)->VEC[0], (B)->VEC[0]);		\
+      (R)->VEC[1] = VEC_FUNC ((A)->VEC[1], (B)->VEC[1]);		\
+    }									\
+  while (0)
+
+#define __VPAIR_FP_FMA(R, A, B, C, OPCODE, VEC, VEC_FUNC)		\
+  do									\
+    {									\
+      (R)->VEC[0] = VEC_FUNC ((A)->VEC[0], (B)->VEC[0], (C)->VEC[0]);	\
+      (R)->VEC[1] = VEC_FUNC ((A)->VEC[1], (B)->VEC[1], (C)->VEC[1]);	\
+    }									\
+  while (0)
+
+#endif
+
+/* 64-bit version of the macros.  */
+#define __VPAIR_F64_UNARY(R, A, OPCODE, VEC_FUNC)			\
+  __VPAIR_FP_UNARY(R, A, OPCODE, __vp_f64, VEC_FUNC)
+
+#define __VPAIR_F64_BINARY(R, A, B, OPCODE, VEC_FUNC)			\
+  __VPAIR_FP_BINARY(R, A, B, OPCODE, __vp_f64, VEC_FUNC)
+
+#define __VPAIR_F64_FMA(R, A, B, C, OPCODE, VEC_FUNC)			\
+  __VPAIR_FP_FMA(R, A, B, C, OPCODE, __vp_f64, VEC_FUNC)
+
+
+/* 32-bit version of the macros.  */
+#define __VPAIR_F32_UNARY(R, A, OPCODE, VEC_FUNC)			\
+  __VPAIR_FP_UNARY(R, A, OPCODE, __vp_f32, VEC_FUNC)
+
+#define __VPAIR_F32_BINARY(R, A, B, OPCODE, VEC_FUNC)			\
+  __VPAIR_FP_BINARY(R, A, B, OPCODE, __vp_f32, VEC_FUNC)
+
+#define __VPAIR_F32_FMA(R, A, B, C, OPCODE, VEC_FUNC)			\
+  __VPAIR_FP_FMA(R, A, B, C, OPCODE, __vp_f32, VEC_FUNC)
+
+
+/* Splat functions.  */
+
+/* 64-bit splat to vector pair.  */
+
+static inline void
+vpair_f64_splat (vector_pair_f64_t *__r, double __x)
+{
+  __VPAIR_FP_SPLAT (__r, __x, __vp_f64);
+}
+
+/* 32-bit splat to vector pair.  */
+
+static inline void
+vpair_f32_splat (vector_pair_f32_t *__r, float __x)
+{
+  __VPAIR_FP_SPLAT (__r, __x, __vp_f32);
+}
+
+
+/* 64-bit unary functions.  */
+
+static inline void
+vpair_f64_abs (vector_pair_f64_t       *__r,
+	       const vector_pair_f64_t *__a)
+{
+  __VPAIR_F64_UNARY (__r, __a,
+		     "xvabsdp",
+		     __builtin_vec_abs);
+}
+
+static inline void
+vpair_f64_nabs (vector_pair_f64_t       *__r,
+		const vector_pair_f64_t *__a)
+{
+  __VPAIR_F64_UNARY (__r, __a,
+		     "xvnabsdp",
+		     __builtin_vec_nabs);
+}
+
+static inline void
+vpair_f64_neg (vector_pair_f64_t       *__r,
+	       const vector_pair_f64_t *__a)
+{
+  __VPAIR_F64_UNARY (__r, __a,
+		     "xvnegdp",
+		     __builtin_vec_neg);
+}
+
+static inline void
+vpair_f64_sqrt (vector_pair_f64_t       *__r,
+		const vector_pair_f64_t *__a)
+{
+  __VPAIR_F64_UNARY (__r, __a,
+		     "xvsqrtdp",
+		     __builtin_vec_sqrt);
+}
+
+/* 32-bit unary functions.  */
+
+static inline void
+vpair_f32_abs (vector_pair_f32_t       *__r,
+	       const vector_pair_f32_t *__a)
+{
+  __VPAIR_F32_UNARY (__r, __a,
+		     "xvabssp",
+		     __builtin_vec_abs);
+}
+
+static inline void
+vpair_f32_nabs (vector_pair_f32_t       *__r,
+		const vector_pair_f32_t *__a)
+{
+  __VPAIR_F32_UNARY (__r, __a,
+		     "xvnabssp",
+		     __builtin_vec_nabs);
+}
+
+static inline void
+vpair_f32_neg (vector_pair_f32_t       *__r,
+	       const vector_pair_f32_t *__a)
+{
+  __VPAIR_F32_UNARY (__r, __a,
+		     "xvnegsp",
+		     __builtin_vec_neg);
+}
+
+static inline void
+vpair_f32_sqrt (vector_pair_f32_t       *__r,
+		const vector_pair_f32_t *__a)
+{
+  __VPAIR_F32_UNARY (__r, __a,
+		     "xvsqrtsp",
+		     __builtin_vec_sqrt);
+}
+
+
+/* 64-bit binary functions.  */
+
+static inline void
+vpair_f64_add (vector_pair_f64_t       *__r,
+	       const vector_pair_f64_t *__a,
+	       const vector_pair_f64_t *__b)
+{
+  __VPAIR_F64_BINARY (__r, __a, __b,
+		      "xvadddp",
+		      __builtin_vec_add);
+}
+
+static inline void
+vpair_f64_div (vector_pair_f64_t       *__r,
+	       const vector_pair_f64_t *__a,
+	       const vector_pair_f64_t *__b)
+{
+  __VPAIR_F64_BINARY (__r, __a, __b,
+		      "xvdivdp",
+		      __builtin_vec_div);
+}
+
+static inline void
+vpair_f64_max (vector_pair_f64_t       *__r,
+	       const vector_pair_f64_t *__a,
+	       const vector_pair_f64_t *__b)
+{
+  __VPAIR_F64_BINARY (__r, __a, __b,
+		      "xvmaxdp",
+		      __builtin_vec_max);
+}
+
+static inline void
+vpair_f64_min (vector_pair_f64_t       *__r,
+	       const vector_pair_f64_t *__a,
+	       const vector_pair_f64_t *__b)
+{
+  __VPAIR_F64_BINARY (__r, __a, __b,
+		      "xvmindp",
+		      __builtin_vec_min);
+}
+
+static inline void
+vpair_f64_mul (vector_pair_f64_t       *__r,
+	       const vector_pair_f64_t *__a,
+	       const vector_pair_f64_t *__b)
+{
+  __VPAIR_F64_BINARY (__r, __a, __b,
+		      "xvmuldp",
+		      __builtin_vec_mul);
+}
+
+static inline void
+vpair_f64_sub (vector_pair_f64_t       *__r,
+	       const vector_pair_f64_t *__a,
+	       const vector_pair_f64_t *__b)
+{
+  __VPAIR_F64_BINARY (__r, __a, __b,
+		      "xvsubdp",
+		      __builtin_vec_sub);
+}
+
+/* 32-bit binary functions.  */
+
+static inline void
+vpair_f32_add (vector_pair_f32_t       *__r,
+	       const vector_pair_f32_t *__a,
+	       const vector_pair_f32_t *__b)
+{
+  __VPAIR_F32_BINARY (__r, __a, __b,
+		      "xvaddsp",
+		      __builtin_vec_add);
+}
+
+static inline void
+vpair_f32_div (vector_pair_f32_t       *__r,
+	       const vector_pair_f32_t *__a,
+	       const vector_pair_f32_t *__b)
+{
+  __VPAIR_F32_BINARY (__r, __a, __b,
+		      "xvdivsp",
+		      __builtin_vec_div);
+}
+
+static inline void
+vpair_f32_max (vector_pair_f32_t       *__r,
+	       const vector_pair_f32_t *__a,
+	       const vector_pair_f32_t *__b)
+{
+  __VPAIR_F32_BINARY (__r, __a, __b,
+		      "xvmaxsp",
+		      __builtin_vec_max);
+}
+
+static inline void
+vpair_f32_min (vector_pair_f32_t       *__r,
+	       const vector_pair_f32_t *__a,
+	       const vector_pair_f32_t *__b)
+{
+  __VPAIR_F32_BINARY (__r, __a, __b,
+		      "xvminsp",
+		      __builtin_vec_min);
+}
+
+static inline void
+vpair_f32_mul (vector_pair_f32_t       *__r,
+	       const vector_pair_f32_t *__a,
+	       const vector_pair_f32_t *__b)
+{
+  __VPAIR_F32_BINARY (__r, __a, __b,
+		      "xvmulsp",
+		      __builtin_vec_mul);
+}
+
+static inline void
+vpair_f32_sub (vector_pair_f32_t       *__r,
+	       const vector_pair_f32_t *__a,
+	       const vector_pair_f32_t *__b)
+{
+  __VPAIR_F32_BINARY (__r, __a, __b,
+		      "xvsubsp",
+		      __builtin_vec_sub);
+}
+
+/* 64-bit fma operations.  */
+
+static inline void
+vpair_f64_fma (vector_pair_f64_t       *__r,
+	       const vector_pair_f64_t *__a,
+	       const vector_pair_f64_t *__b,
+	       const vector_pair_f64_t *__c)
+{
+  __VPAIR_F64_FMA (__r, __a, __b, __c,
+		   "xvmaddadp",
+		   __builtin_vsx_xvmadddp);
+}
+
+static inline void
+vpair_f64_fms (vector_pair_f64_t       *__r,
+	       const vector_pair_f64_t *__a,
+	       const vector_pair_f64_t *__b,
+	       const vector_pair_f64_t *__c)
+{
+  __VPAIR_F64_FMA (__r, __a, __b, __c,
+		   "xvmsubadp",
+		   __builtin_vsx_xvmsubdp);
+}
+
+static inline void
+vpair_f64_nfma (vector_pair_f64_t       *__r,
+		const vector_pair_f64_t *__a,
+		const vector_pair_f64_t *__b,
+		const vector_pair_f64_t *__c)
+{
+  __VPAIR_F64_FMA (__r, __a, __b, __c,
+		   "xvnmaddadp",
+		   __builtin_vsx_xvnmadddp);
+}
+
+static inline void
+vpair_f64_nfms (vector_pair_f64_t       *__r,
+		const vector_pair_f64_t *__a,
+		const vector_pair_f64_t *__b,
+		const vector_pair_f64_t *__c)
+{
+  __VPAIR_F64_FMA (__r, __a, __b, __c,
+		   "xvnmsubadp",
+		   __builtin_vsx_xvnmsubdp);
+}
+/* 32-bit fma operations.  */
+
+static inline void
+vpair_f32_fma (vector_pair_f32_t       *__r,
+	       const vector_pair_f32_t *__a,
+	       const vector_pair_f32_t *__b,
+	       const vector_pair_f32_t *__c)
+{
+  __VPAIR_F32_FMA (__r, __a, __b, __c,
+		   "xvmaddasp",
+		   __builtin_vsx_xvmaddsp);
+}
+
+static inline void
+vpair_f32_fms (vector_pair_f32_t       *__r,
+	       const vector_pair_f32_t *__a,
+	       const vector_pair_f32_t *__b,
+	       const vector_pair_f32_t *__c)
+{
+  __VPAIR_F32_FMA (__r, __a, __b, __c,
+		   "xvmsubasp",
+		   __builtin_vsx_xvmsubsp);
+}
+
+static inline void
+vpair_f32_nfma (vector_pair_f32_t       *__r,
+		const vector_pair_f32_t *__a,
+		const vector_pair_f32_t *__b,
+		const vector_pair_f32_t *__c)
+{
+  __VPAIR_F32_FMA (__r, __a, __b, __c,
+		   "xvnmaddasp",
+		   __builtin_vsx_xvnmaddsp);
+}
+
+static inline void
+vpair_f32_nfms (vector_pair_f32_t       *__r,
+		const vector_pair_f32_t *__a,
+		const vector_pair_f32_t *__b,
+		const vector_pair_f32_t *__c)
+{
+  __VPAIR_F32_FMA (__r, __a, __b, __c,
+		   "xvnmsubasp",
+		   __builtin_vsx_xvnmsubsp);
+}
+#endif	/* _VECTOR_PAIR_H.  */
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index f46c3df3303..4c9e8c2e313 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -16157,6 +16157,7 @@  instructions, but allow the compiler to schedule those calls.
 * PowerPC Hardware Transactional Memory Built-in Functions::
 * PowerPC Atomic Memory Operation Functions::
 * PowerPC Matrix-Multiply Assist Built-in Functions::
+* PowerPC Vector Pair Support::
 * PRU Built-in Functions::
 * RISC-V Built-in Functions::
 * RISC-V Vector Intrinsics::
@@ -24673,6 +24674,103 @@  __vector_pair __builtin_vsx_lxvp (size_t, __vector_pair *);
 void __builtin_vsx_stxvp (__vector_pair, size_t, __vector_pair *);
 @end smallexample
 
+@node PowerPC Vector Pair Support
+@subsection PowerPC Vector Pair Support
+ISA 3.1 (power10) added instructions to load and store pairs of
+vectors with a single instruction.
+
+GCC now provides an include file (@file{vector-pair.h}) on PowerPC
+systems that allows users to write code that can write 32-bit and
+64-bit floating point code that processes data in 256-bit chunks
+rather than 128-bit chunks.
+
+If the code is compiled on an ISA 3.1 system with MMA enabled, the
+vector pair functions will use the @code{__vector_pair} type to have
+values in adjacent vectors and do the operation as a pair of
+operations.
+
+If the code is compiled on a VSX system, but not one with MMA enabled, the vector
+pair functions will use 2 separate vectors to do the operation.
+
+Two types are provided: @code{vector_pair_f64_t} is for vector pairs
+that will operate on units of 4 64-bit floating point values, and
+@code{vector_pair_f32_t} for operating on units of 8 32-bit floating
+point values.
+
+@node PowerPC Vector Pair Support for 64-bit floating point
+@subsection PowerPC Vector Pair Support for 64-bit floating point.
+
+The following functions are provided for operating on vector pairs
+that consist of 4 64-bit floating point values:
+
+@smallexample
+void vpair_f64_splat (vector_pair_f64_t *, double);
+
+void vpair_f64_abs (vector_pair_f64_t *, vector_pair_f64_t *);
+void vpair_f64_nabs (vector_pair_f64_t *, vector_pair_f64_t *);
+void vpair_f64_neg (vector_pair_f64_t *, vector_pair_f64_t *);
+void vpair_f64_sqrt (vector_pair_f64_t *, vector_pair_f64_t *);
+
+void vpair_f64_add (vector_pair_f64_t *, vector_pair_f64_t *,
+                    vector_pair_f64_t *);
+void vpair_f64_div (vector_pair_f64_t *, vector_pair_f64_t *,
+                    vector_pair_f64_t *);
+void vpair_f64_max (vector_pair_f64_t *, vector_pair_f64_t *,
+                    vector_pair_f64_t *);
+void vpair_f64_min (vector_pair_f64_t *, vector_pair_f64_t *,
+                    vector_pair_f64_t *);
+void vpair_f64_mul (vector_pair_f64_t *, vector_pair_f64_t *,
+                    vector_pair_f64_t *);
+void vpair_f64_sub (vector_pair_f64_t *, vector_pair_f64_t *,
+                    vector_pair_f64_t *);
+
+void vpair_f64_fma (vector_pair_f64_t *, vector_pair_f64_t *,
+                    vector_pair_f64_t *, vector_pair_f64_t *);
+void vpair_f64_fms (vector_pair_f64_t *, vector_pair_f64_t *,
+                    vector_pair_f64_t *, vector_pair_f64_t *);
+void vpair_f64_nfma (vector_pair_f64_t *, vector_pair_f64_t *,
+                     vector_pair_f64_t *, vector_pair_f64_t *);
+void vpair_f64_nfms (vector_pair_f64_t *, vector_pair_f64_t *,
+                     vector_pair_f64_t *, vector_pair_f64_t *);
+@end smallexample
+
+@node PowerPC Vector Pair Support for 32-bit floating point
+@subsection PowerPC Vector Pair Support for 32-bit floating point.
+
+The following functions are provided for operating on vector pairs
+that consist of 8 32-bit floating point values:
+
+@smallexample
+void vpair_f32_splat (vector_pair_f32_t *, float);
+
+void vpair_f32_abs (vector_pair_f32_t *, vector_pair_f32_t *);
+void vpair_f32_nabs (vector_pair_f32_t *, vector_pair_f32_t *);
+void vpair_f32_neg (vector_pair_f32_t *, vector_pair_f32_t *);
+void vpair_f32_sqrt (vector_pair_f32_t *, vector_pair_f32_t *);
+
+void vpair_f32_add (vector_pair_f32_t *, vector_pair_f32_t *,
+                    vector_pair_f32_t *);
+void vpair_f32_div (vector_pair_f32_t *, vector_pair_f32_t *,
+                    vector_pair_f32_t *);
+void vpair_f32_max (vector_pair_f32_t *, vector_pair_f32_t *,
+                    vector_pair_f32_t *);
+void vpair_f32_min (vector_pair_f32_t *, vector_pair_f32_t *,
+                    vector_pair_f32_t *);
+void vpair_f32_mul (vector_pair_f32_t *, vector_pair_f32_t *,
+                    vector_pair_f32_t *);
+void vpair_f32_sub (vector_pair_f32_t *, vector_pair_f32_t *,
+                    vector_pair_f32_t *);
+
+void vpair_f32_fma (vector_pair_f32_t *, vector_pair_f32_t *,
+                    vector_pair_f32_t *, vector_pair_f32_t *);
+void vpair_f32_fms (vector_pair_f32_t *, vector_pair_f32_t *,
+                    vector_pair_f32_t *, vector_pair_f32_t *);
+void vpair_f32_nfma (vector_pair_f32_t *, vector_pair_f32_t *,
+                     vector_pair_f32_t *, vector_pair_f32_t *);
+void vpair_f32_nfms (vector_pair_f32_t *, vector_pair_f32_t *,
+                     vector_pair_f32_t *, vector_pair_f32_t *);
+@end smallexample
+
 @node PRU Built-in Functions
 @subsection PRU Built-in Functions
 
diff --git a/gcc/testsuite/gcc.target/powerpc/vpair-1.c b/gcc/testsuite/gcc.target/powerpc/vpair-1.c
new file mode 100644
index 00000000000..55772cc44e3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vpair-1.c
@@ -0,0 +1,141 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+/* Test whether the vector builtin code generates the expected instructions for
+   vector pairs with 4 double elements.  */
+
+#include <vector-pair.h>
+
+void
+test_add (vector_pair_f64_t *dest,
+	  vector_pair_f64_t *x,
+	  vector_pair_f64_t *y)
+{
+  /* 2 lxvp, 2 xvadddp, 1 stxvp.  */
+  vpair_f64_add (dest, x, y);
+}
+
+void
+test_sub (vector_pair_f64_t *dest,
+	  vector_pair_f64_t *x,
+	  vector_pair_f64_t *y)
+{
+  /* 2 lxvp, 2 xvsubdp, 1 stxvp.  */
+  vpair_f64_sub (dest, x, y);
+}
+
+void
+test_multiply (vector_pair_f64_t *dest,
+	       vector_pair_f64_t *x,
+	       vector_pair_f64_t *y)
+{
+  /* 2 lxvp, 2 xvmuldp, 1 stxvp.  */
+  vpair_f64_mul (dest, x, y);
+}
+
+void
+test_min (vector_pair_f64_t *dest,
+	  vector_pair_f64_t *x,
+	  vector_pair_f64_t *y)
+{
+  /* 2 lxvp, 2 xvmindp, 1 stxvp.  */
+  vpair_f64_min (dest, x, y);
+}
+
+void
+test_max (vector_pair_f64_t *dest,
+	  vector_pair_f64_t *x,
+	  vector_pair_f64_t *y)
+{
+  /* 2 lxvp, 2 xvmaxdp, 1 stxvp.  */
+  vpair_f64_max (dest, x, y);
+}
+
+void
+test_negate (vector_pair_f64_t *dest,
+	     vector_pair_f64_t *x)
+{
+  /* 1 lxvp, 2 xvnegdp, 1 stxvp.  */
+  vpair_f64_neg (dest, x);
+}
+
+void
+test_abs (vector_pair_f64_t *dest,
+	  vector_pair_f64_t *x)
+{
+  /* 1 lxvp, 2 xvabsdp, 1 stxvp.  */
+  vpair_f64_abs (dest, x);
+}
+
+void
+test_negative_abs (vector_pair_f64_t *dest,
+		   vector_pair_f64_t *x)
+{
+  /* 2 lxvp, 2 xvnabsdp, 1 stxvp.  */
+  vpair_f64_nabs (dest, x);
+}
+
+void
+test_sqrt (vector_pair_f64_t *dest,
+	   vector_pair_f64_t *x)
+{
+  /* 1 lxvp, 2 xvabsdp, 1 stxvp.  */
+  vpair_f64_sqrt (dest, x);
+}
+
+void
+test_fma (vector_pair_f64_t *dest,
+	  vector_pair_f64_t *x,
+	  vector_pair_f64_t *y,
+	  vector_pair_f64_t *z)
+{
+  /* 2 lxvp, 2 xvmadd{a,m}dp, 1 stxvp.  */
+  vpair_f64_fma (dest, x, y, z);
+}
+
+void
+test_fms (vector_pair_f64_t *dest,
+	  vector_pair_f64_t *x,
+	  vector_pair_f64_t *y,
+	  vector_pair_f64_t *z)
+{
+  /* 2 lxvp, 2 xvmsub{a,m}dp, 1 stxvp.  */
+  vpair_f64_fms (dest, x, y, z);
+}
+
+void
+test_nfma (vector_pair_f64_t *dest,
+	   vector_pair_f64_t *x,
+	   vector_pair_f64_t *y,
+	   vector_pair_f64_t *z)
+{
+  /* 2 lxvp, 2 xvnmadd{a,m}dp, 1 stxvp.  */
+  vpair_f64_nfma (dest, x, y, z);
+}
+
+void
+test_nfms (vector_pair_f64_t *dest,
+	   vector_pair_f64_t *x,
+	   vector_pair_f64_t *y,
+	   vector_pair_f64_t *z)
+{
+  /* 2 lxvp, 2 xvnmsub{a,m}dp, 1 stxvp.  */
+  vpair_f64_nfms (dest, x, y, z);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M}       26 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M}      13 } } */
+/* { dg-final { scan-assembler-times {\mxvabsdp\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxvadddp\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxvmadd.dp\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mxvmaxdp\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxvmindp\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxvmsub.dp\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mxvmuldp\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxvnabsdp\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxvnegdp\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmadd.dp\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmsub.dp\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mxvsqrtdp\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxvsubdp\M}     2 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vpair-2.c b/gcc/testsuite/gcc.target/powerpc/vpair-2.c
new file mode 100644
index 00000000000..3030b0b3338
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vpair-2.c
@@ -0,0 +1,141 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target power10_ok } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2" } */
+
+/* Test whether the vector builtin code generates the expected instructions for
+   vector pairs with 4 double elements.  */
+
+#include <vector-pair.h>
+
+void
+test_add (vector_pair_f32_t *dest,
+	  vector_pair_f32_t *x,
+	  vector_pair_f32_t *y)
+{
+  /* 2 lxvp, 2 xvaddsp, 1 stxvp.  */
+  vpair_f32_add (dest, x, y);
+}
+
+void
+test_sub (vector_pair_f32_t *dest,
+	  vector_pair_f32_t *x,
+	  vector_pair_f32_t *y)
+{
+  /* 2 lxvp, 2 xvsubsp, 1 stxvp.  */
+  vpair_f32_sub (dest, x, y);
+}
+
+void
+test_multiply (vector_pair_f32_t *dest,
+	       vector_pair_f32_t *x,
+	       vector_pair_f32_t *y)
+{
+  /* 2 lxvp, 2 xvmulsp, 1 stxvp.  */
+  vpair_f32_mul (dest, x, y);
+}
+
+void
+test_min (vector_pair_f32_t *dest,
+	  vector_pair_f32_t *x,
+	  vector_pair_f32_t *y)
+{
+  /* 2 lxvp, 2 xvminsp, 1 stxvp.  */
+  vpair_f32_min (dest, x, y);
+}
+
+void
+test_max (vector_pair_f32_t *dest,
+	  vector_pair_f32_t *x,
+	  vector_pair_f32_t *y)
+{
+  /* 2 lxvp, 2 xvmaxsp, 1 stxvp.  */
+  vpair_f32_max (dest, x, y);
+}
+
+void
+test_negate (vector_pair_f32_t *dest,
+	     vector_pair_f32_t *x)
+{
+  /* 1 lxvp, 2 xvnegsp, 1 stxvp.  */
+  vpair_f32_neg (dest, x);
+}
+
+void
+test_abs (vector_pair_f32_t *dest,
+	  vector_pair_f32_t *x)
+{
+  /* 1 lxvp, 2 xvabssp, 1 stxvp.  */
+  vpair_f32_abs (dest, x);
+}
+
+void
+test_negative_abs (vector_pair_f32_t *dest,
+		   vector_pair_f32_t *x)
+{
+  /* 2 lxvp, 2 xvnabssp, 1 stxvp.  */
+  vpair_f32_nabs (dest, x);
+}
+
+void
+test_sqrt (vector_pair_f32_t *dest,
+	   vector_pair_f32_t *x)
+{
+  /* 1 lxvp, 2 xvabssp, 1 stxvp.  */
+  vpair_f32_sqrt (dest, x);
+}
+
+void
+test_fma (vector_pair_f32_t *dest,
+	  vector_pair_f32_t *x,
+	  vector_pair_f32_t *y,
+	  vector_pair_f32_t *z)
+{
+  /* 2 lxvp, 2 xvmadd{a,m}sp, 1 stxvp.  */
+  vpair_f32_fma (dest, x, y, z);
+}
+
+void
+test_fms (vector_pair_f32_t *dest,
+	  vector_pair_f32_t *x,
+	  vector_pair_f32_t *y,
+	  vector_pair_f32_t *z)
+{
+  /* 2 lxvp, 2 xvmsub{a,m}sp, 1 stxvp.  */
+  vpair_f32_fms (dest, x, y, z);
+}
+
+void
+test_nfma (vector_pair_f32_t *dest,
+	   vector_pair_f32_t *x,
+	   vector_pair_f32_t *y,
+	   vector_pair_f32_t *z)
+{
+  /* 2 lxvp, 2 xvnmadd{a,m}sp, 1 stxvp.  */
+  vpair_f32_nfma (dest, x, y, z);
+}
+
+void
+test_nfms (vector_pair_f32_t *dest,
+	   vector_pair_f32_t *x,
+	   vector_pair_f32_t *y,
+	   vector_pair_f32_t *z)
+{
+  /* 2 lxvp, 2 xvnmsub{a,m}sp, 1 stxvp.  */
+  vpair_f32_nfms (dest, x, y, z);
+}
+
+/* { dg-final { scan-assembler-times {\mlxvp\M}       26 } } */
+/* { dg-final { scan-assembler-times {\mstxvp\M}      13 } } */
+/* { dg-final { scan-assembler-times {\mxvabssp\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxvaddsp\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxvmadd.sp\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mxvmaxsp\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxvminsp\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxvmsub.sp\M}   2 } } */
+/* { dg-final { scan-assembler-times {\mxvmulsp\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxvnabssp\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxvnegsp\M}     2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmadd.sp\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mxvnmsub.sp\M}  2 } } */
+/* { dg-final { scan-assembler-times {\mxvsqrtsp\M}    2 } } */
+/* { dg-final { scan-assembler-times {\mxvsubsp\M}     2 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vpair-3-not-p10.c b/gcc/testsuite/gcc.target/powerpc/vpair-3-not-p10.c
new file mode 100644
index 00000000000..d1a1029417f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vpair-3-not-p10.c
@@ -0,0 +1,15 @@ 
+/* { dg-do run { target { vsx_hw } } } */
+/* { dg-options "-mvsx -O2 -ffast-math -mno-mma" } */
+
+/*
+ * This test of the double (f64) vector pair functions in vector-pair.h is run
+ * on VSX systems when the load/store vector pair instructions are not
+ * available.
+ *
+ * The -ffast-math option is used to just use the hardware sqrt, min, and max
+ * instructions without calling into the library.
+ *
+ * The -mno-mma option disables GCC from enabling the __vector_pair type.
+ */
+
+#include "vpair-3.h"
diff --git a/gcc/testsuite/gcc.target/powerpc/vpair-3-p10.c b/gcc/testsuite/gcc.target/powerpc/vpair-3-p10.c
new file mode 100644
index 00000000000..d78faf3fed4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vpair-3-p10.c
@@ -0,0 +1,14 @@ 
+/* { dg-do run { target { power10_hw } } } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -ffast-math -mmma" } */
+
+/*
+ * This test of the double (f64) vector pair functions in vector-pair.h is run
+ * on VSX systems when the load/store vector pair instructions are available.
+ *
+ * The -ffast-math option is used to just use the hardware sqrt, min, and max
+ * instructions without calling into the library.
+ *
+ * The -mmma option makes sure GC enables the __vector_pair type.
+ */
+
+#include "vpair-3.h"
diff --git a/gcc/testsuite/gcc.target/powerpc/vpair-3.h b/gcc/testsuite/gcc.target/powerpc/vpair-3.h
new file mode 100644
index 00000000000..e61ad23dd57
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vpair-3.h
@@ -0,0 +1,435 @@ 
+/* Common include file to test the vector pair double functions.  This is run
+   two times, once compiled for a non-power10 system that does not have the
+   vector pair load and store instructions, and once with power10 defaults that
+   has load/store vector pair.  */
+
+#include <stddef.h>
+#include <stdlib.h>
+#include <vector-pair.h>
+
+#ifdef DEBUG
+#include <stdio.h>
+#endif
+
+#ifndef NUM
+#define NUM	16
+#endif
+
+static double	result1[NUM];
+static double	result2[NUM];
+static double	in_a[NUM];
+static double	in_b[NUM];
+static double	in_c[NUM];
+
+/* vector pair tests.  */
+
+void
+vpair_abs (double *r, double *a, double *b, double *c, size_t num)
+{
+  vector_pair_f64_t *vr = (vector_pair_f64_t *)r;
+  vector_pair_f64_t *va = (vector_pair_f64_t *)a;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double));
+
+  for (i = 0; i < num2; i++)
+    vpair_f64_abs (vr + i, va + i);
+}
+
+void
+vpair_nabs (double *r, double *a, double *b, double *c, size_t num)
+{
+  vector_pair_f64_t *vr = (vector_pair_f64_t *)r;
+  vector_pair_f64_t *va = (vector_pair_f64_t *)a;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double));
+
+  for (i = 0; i < num2; i++)
+    vpair_f64_nabs (vr + i, va + i);
+}
+
+void
+vpair_neg (double *r, double *a, double *b, double *c, size_t num)
+{
+  vector_pair_f64_t *vr = (vector_pair_f64_t *)r;
+  vector_pair_f64_t *va = (vector_pair_f64_t *)a;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double));
+
+  for (i = 0; i < num2; i++)
+    vpair_f64_neg (vr + i, va + i);
+}
+
+void
+vpair_sqrt (double *r, double *a, double *b, double *c, size_t num)
+{
+  vector_pair_f64_t *vr = (vector_pair_f64_t *)r;
+  vector_pair_f64_t *va = (vector_pair_f64_t *)a;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double));
+
+  for (i = 0; i < num2; i++)
+    vpair_f64_sqrt (vr + i, va + i);
+}
+
+void
+vpair_add (double *r, double *a, double *b, double *c, size_t num)
+{
+  vector_pair_f64_t *vr = (vector_pair_f64_t *)r;
+  vector_pair_f64_t *va = (vector_pair_f64_t *)a;
+  vector_pair_f64_t *vb = (vector_pair_f64_t *)b;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double));
+
+  for (i = 0; i < num2; i++)
+    vpair_f64_add (vr + i, va + i, vb + i);
+}
+
+void
+vpair_sub (double *r, double *a, double *b, double *c, size_t num)
+{
+  vector_pair_f64_t *vr = (vector_pair_f64_t *)r;
+  vector_pair_f64_t *va = (vector_pair_f64_t *)a;
+  vector_pair_f64_t *vb = (vector_pair_f64_t *)b;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double));
+
+  for (i = 0; i < num2; i++)
+    vpair_f64_sub (vr + i, va + i, vb + i);
+}
+
+void
+vpair_mul (double *r, double *a, double *b, double *c, size_t num)
+{
+  vector_pair_f64_t *vr = (vector_pair_f64_t *)r;
+  vector_pair_f64_t *va = (vector_pair_f64_t *)a;
+  vector_pair_f64_t *vb = (vector_pair_f64_t *)b;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double));
+
+  for (i = 0; i < num2; i++)
+    vpair_f64_mul (vr + i, va + i, vb + i);
+}
+
+void
+vpair_div (double *r, double *a, double *b, double *c, size_t num)
+{
+  vector_pair_f64_t *vr = (vector_pair_f64_t *)r;
+  vector_pair_f64_t *va = (vector_pair_f64_t *)a;
+  vector_pair_f64_t *vb = (vector_pair_f64_t *)b;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double));
+
+  for (i = 0; i < num2; i++)
+    vpair_f64_div (vr + i, va + i, vb + i);
+}
+
+void
+vpair_min (double *r, double *a, double *b, double *c, size_t num)
+{
+  vector_pair_f64_t *vr = (vector_pair_f64_t *)r;
+  vector_pair_f64_t *va = (vector_pair_f64_t *)a;
+  vector_pair_f64_t *vb = (vector_pair_f64_t *)b;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double));
+
+  for (i = 0; i < num2; i++)
+    vpair_f64_min (vr + i, va + i, vb + i);
+}
+
+void
+vpair_max (double *r, double *a, double *b, double *c, size_t num)
+{
+  vector_pair_f64_t *vr = (vector_pair_f64_t *)r;
+  vector_pair_f64_t *va = (vector_pair_f64_t *)a;
+  vector_pair_f64_t *vb = (vector_pair_f64_t *)b;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double));
+
+  for (i = 0; i < num2; i++)
+    vpair_f64_max (vr + i, va + i, vb + i);
+}
+
+void
+vpair_fma (double *r, double *a, double *b, double *c, size_t num)
+{
+  vector_pair_f64_t *vr = (vector_pair_f64_t *)r;
+  vector_pair_f64_t *va = (vector_pair_f64_t *)a;
+  vector_pair_f64_t *vb = (vector_pair_f64_t *)b;
+  vector_pair_f64_t *vc = (vector_pair_f64_t *)c;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double));
+
+  for (i = 0; i < num2; i++)
+    vpair_f64_fma (vr + i, va + i, vb + i, vc + i);
+}
+
+void
+vpair_fms (double *r, double *a, double *b, double *c, size_t num)
+{
+  vector_pair_f64_t *vr = (vector_pair_f64_t *)r;
+  vector_pair_f64_t *va = (vector_pair_f64_t *)a;
+  vector_pair_f64_t *vb = (vector_pair_f64_t *)b;
+  vector_pair_f64_t *vc = (vector_pair_f64_t *)c;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double));
+
+  for (i = 0; i < num2; i++)
+    vpair_f64_fms (vr + i, va + i, vb + i, vc + i);
+}
+
+void
+vpair_nfma (double *r, double *a, double *b, double *c, size_t num)
+{
+  vector_pair_f64_t *vr = (vector_pair_f64_t *)r;
+  vector_pair_f64_t *va = (vector_pair_f64_t *)a;
+  vector_pair_f64_t *vb = (vector_pair_f64_t *)b;
+  vector_pair_f64_t *vc = (vector_pair_f64_t *)c;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double));
+
+  for (i = 0; i < num2; i++)
+    vpair_f64_nfma (vr + i, va + i, vb + i, vc + i);
+}
+
+void
+vpair_nfms (double *r, double *a, double *b, double *c, size_t num)
+{
+  vector_pair_f64_t *vr = (vector_pair_f64_t *)r;
+  vector_pair_f64_t *va = (vector_pair_f64_t *)a;
+  vector_pair_f64_t *vb = (vector_pair_f64_t *)b;
+  vector_pair_f64_t *vc = (vector_pair_f64_t *)c;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f64_t) / sizeof (double));
+
+  for (i = 0; i < num2; i++)
+    vpair_f64_nfms (vr + i, va + i, vb + i, vc + i);
+}
+
+
+/* scalar tests.  */
+
+void
+scalar_abs (double *r, double *a, double *b, double *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = (a[i] < 0.0) ? -a[i] : a[i];
+}
+
+void
+scalar_nabs (double *r, double *a, double *b, double *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = (a[i] < 0.0) ? a[i] : -a[i];
+}
+
+void
+scalar_neg (double *r, double *a, double *b, double *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = -a[i];
+}
+
+void
+scalar_sqrt (double *r, double *a, double *b, double *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = __builtin_sqrt (a[i]);
+}
+
+void
+scalar_add (double *r, double *a, double *b, double *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = a[i] + b[i];
+}
+
+void
+scalar_sub (double *r, double *a, double *b, double *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = a[i] - b[i];
+}
+
+void
+scalar_mul (double *r, double *a, double *b, double *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = a[i] * b[i];
+}
+
+void
+scalar_div (double *r, double *a, double *b, double *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = a[i] / b[i];
+}
+
+void
+scalar_min (double *r, double *a, double *b, double *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = (a[i] < b[i]) ? a[i] : b[i];
+}
+
+void
+scalar_max (double *r, double *a, double *b, double *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = (a[i] > b[i]) ? a[i] : b[i];
+}
+
+void
+scalar_fma (double *r, double *a, double *b, double *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = __builtin_fma (a[i], b[i], c[i]);
+}
+
+void
+scalar_fms (double *r, double *a, double *b, double *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = __builtin_fma (a[i], b[i], -c[i]);
+}
+
+void
+scalar_nfma (double *r, double *a, double *b, double *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = - __builtin_fma (a[i], b[i], c[i]);
+}
+
+void
+scalar_nfms (double *r, double *a, double *b, double *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = - __builtin_fma (a[i], b[i], -c[i]);
+}
+
+
+/* Check results.  */
+void
+check (const char *name)
+{
+  size_t i;
+
+  for (i = 0; i < NUM; i++)
+    if (result1[i] != result2[i])
+      {
+#ifdef DEBUG
+	printf ("test #%ld failed, %g != %g, %s (%g, %g, %g).\n",
+		(long)i,
+		result1[i],
+		result2[i],
+		name,
+		in_a[i],
+		in_b[i],
+		in_c[i]);
+#endif
+	abort ();
+      }
+
+  return;
+}
+
+typedef void func_t (double *, double *, double *, double *, size_t);
+
+/* tests to run.  */
+struct
+{
+  func_t *vpair_test;
+  func_t *scalar_test;
+  const char *name;
+} tests[] = {
+  { vpair_abs,  scalar_abs,     "abs"  }, 
+  { vpair_nabs, scalar_nabs,    "nabs" }, 
+  { vpair_neg,  scalar_neg,     "neg"  }, 
+  { vpair_sqrt, scalar_sqrt,    "sqrt" }, 
+  { vpair_add,  scalar_add,     "add"  }, 
+  { vpair_sub,  scalar_sub,     "sub"  }, 
+  { vpair_mul,  scalar_mul,     "mul"  }, 
+  { vpair_div,  scalar_div,     "div"  }, 
+  { vpair_min,  scalar_min,     "min"  }, 
+  { vpair_max,  scalar_max,     "max"  }, 
+  { vpair_fma,  scalar_fma,     "fma"  }, 
+  { vpair_fms,  scalar_fms,     "fms"  }, 
+  { vpair_nfma, scalar_nfma,    "nfma" }, 
+  { vpair_nfms, scalar_nfms,    "nfms" }, 
+};
+
+/* Run tests.  */
+
+int
+main (void)
+{
+  size_t i;
+
+  /* Initialize the inputs.  */
+  for (i = 0; i < NUM; i++)
+    {
+      double d = (double)(i + 1);
+      in_a[i] = d * d;
+      in_b[i] = d;
+      in_c[i] = d + 2.0;
+    }
+
+#ifdef DEBUG
+  printf ("Start tests\n");
+#endif
+
+  /* Run the tests.  */
+  for (i = 0; i < sizeof (tests) / sizeof (tests[0]); i++)
+    {
+      tests[i].vpair_test  (result1, in_a, in_b, in_c, NUM);
+      tests[i].scalar_test (result2, in_a, in_b, in_c, NUM);
+      check (tests[i].name);
+    }
+
+#ifdef DEBUG
+  printf ("End tests\n");
+#endif
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/vpair-4-not-p10.c b/gcc/testsuite/gcc.target/powerpc/vpair-4-not-p10.c
new file mode 100644
index 00000000000..f57fbbf8b05
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vpair-4-not-p10.c
@@ -0,0 +1,15 @@ 
+/* { dg-do run { target { vsx_hw } } } */
+/* { dg-options "-mvsx -O2 -ffast-math -mno-mma" } */
+
+/*
+ * This test of the float (f32) vector pair functions in vector-pair.h is run
+ * on VSX systems when the load/store vector pair instructions are not
+ * available.
+ *
+ * The -ffast-math option is used to just use the hardware sqrt, min, and max
+ * instructions without calling into the library.
+ *
+ * The -mno-mma option disables GCC from enabling the __vector_pair type.
+ */
+
+#include "vpair-4.h"
diff --git a/gcc/testsuite/gcc.target/powerpc/vpair-4-p10.c b/gcc/testsuite/gcc.target/powerpc/vpair-4-p10.c
new file mode 100644
index 00000000000..12291202c16
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vpair-4-p10.c
@@ -0,0 +1,14 @@ 
+/* { dg-do run { target { power10_hw } } } */
+/* { dg-options "-mdejagnu-cpu=power10 -O2 -ffast-math -mmma" } */
+
+/*
+ * This test of the float (f32) vector pair functions in vector-pair.h is run
+ * on VSX systems when the load/store vector pair instructions are available.
+ *
+ * The -ffast-math option is used to just use the hardware sqrt, min, and max
+ * instructions without calling into the library.
+ *
+ * The -mmma option makes sure GC enables the __vector_pair type.
+ */
+
+#include "vpair-4.h"
diff --git a/gcc/testsuite/gcc.target/powerpc/vpair-4.h b/gcc/testsuite/gcc.target/powerpc/vpair-4.h
new file mode 100644
index 00000000000..1a80cf5e639
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vpair-4.h
@@ -0,0 +1,435 @@ 
+/* Common include file to test the vector pair float functions.  This is run
+   two times, once compiled for a non-power10 system that does not have the
+   vector pair load and store instructions, and once with power10 defaults that
+   has load/store vector pair.  */
+
+#include <stddef.h>
+#include <stdlib.h>
+#include <vector-pair.h>
+
+#ifdef DEBUG
+#include <stdio.h>
+#endif
+
+#ifndef NUM
+#define NUM	16
+#endif
+
+static float	result1[NUM];
+static float	result2[NUM];
+static float	in_a[NUM];
+static float	in_b[NUM];
+static float	in_c[NUM];
+
+/* vector pair tests.  */
+
+void
+vpair_abs (float *r, float *a, float *b, float *c, size_t num)
+{
+  vector_pair_f32_t *vr = (vector_pair_f32_t *)r;
+  vector_pair_f32_t *va = (vector_pair_f32_t *)a;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float));
+
+  for (i = 0; i < num2; i++)
+    vpair_f32_abs (vr + i, va + i);
+}
+
+void
+vpair_nabs (float *r, float *a, float *b, float *c, size_t num)
+{
+  vector_pair_f32_t *vr = (vector_pair_f32_t *)r;
+  vector_pair_f32_t *va = (vector_pair_f32_t *)a;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float));
+
+  for (i = 0; i < num2; i++)
+    vpair_f32_nabs (vr + i, va + i);
+}
+
+void
+vpair_neg (float *r, float *a, float *b, float *c, size_t num)
+{
+  vector_pair_f32_t *vr = (vector_pair_f32_t *)r;
+  vector_pair_f32_t *va = (vector_pair_f32_t *)a;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float));
+
+  for (i = 0; i < num2; i++)
+    vpair_f32_neg (vr + i, va + i);
+}
+
+void
+vpair_sqrt (float *r, float *a, float *b, float *c, size_t num)
+{
+  vector_pair_f32_t *vr = (vector_pair_f32_t *)r;
+  vector_pair_f32_t *va = (vector_pair_f32_t *)a;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float));
+
+  for (i = 0; i < num2; i++)
+    vpair_f32_sqrt (vr + i, va + i);
+}
+
+void
+vpair_add (float *r, float *a, float *b, float *c, size_t num)
+{
+  vector_pair_f32_t *vr = (vector_pair_f32_t *)r;
+  vector_pair_f32_t *va = (vector_pair_f32_t *)a;
+  vector_pair_f32_t *vb = (vector_pair_f32_t *)b;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float));
+
+  for (i = 0; i < num2; i++)
+    vpair_f32_add (vr + i, va + i, vb + i);
+}
+
+void
+vpair_sub (float *r, float *a, float *b, float *c, size_t num)
+{
+  vector_pair_f32_t *vr = (vector_pair_f32_t *)r;
+  vector_pair_f32_t *va = (vector_pair_f32_t *)a;
+  vector_pair_f32_t *vb = (vector_pair_f32_t *)b;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float));
+
+  for (i = 0; i < num2; i++)
+    vpair_f32_sub (vr + i, va + i, vb + i);
+}
+
+void
+vpair_mul (float *r, float *a, float *b, float *c, size_t num)
+{
+  vector_pair_f32_t *vr = (vector_pair_f32_t *)r;
+  vector_pair_f32_t *va = (vector_pair_f32_t *)a;
+  vector_pair_f32_t *vb = (vector_pair_f32_t *)b;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float));
+
+  for (i = 0; i < num2; i++)
+    vpair_f32_mul (vr + i, va + i, vb + i);
+}
+
+void
+vpair_div (float *r, float *a, float *b, float *c, size_t num)
+{
+  vector_pair_f32_t *vr = (vector_pair_f32_t *)r;
+  vector_pair_f32_t *va = (vector_pair_f32_t *)a;
+  vector_pair_f32_t *vb = (vector_pair_f32_t *)b;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float));
+
+  for (i = 0; i < num2; i++)
+    vpair_f32_div (vr + i, va + i, vb + i);
+}
+
+void
+vpair_min (float *r, float *a, float *b, float *c, size_t num)
+{
+  vector_pair_f32_t *vr = (vector_pair_f32_t *)r;
+  vector_pair_f32_t *va = (vector_pair_f32_t *)a;
+  vector_pair_f32_t *vb = (vector_pair_f32_t *)b;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float));
+
+  for (i = 0; i < num2; i++)
+    vpair_f32_min (vr + i, va + i, vb + i);
+}
+
+void
+vpair_max (float *r, float *a, float *b, float *c, size_t num)
+{
+  vector_pair_f32_t *vr = (vector_pair_f32_t *)r;
+  vector_pair_f32_t *va = (vector_pair_f32_t *)a;
+  vector_pair_f32_t *vb = (vector_pair_f32_t *)b;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float));
+
+  for (i = 0; i < num2; i++)
+    vpair_f32_max (vr + i, va + i, vb + i);
+}
+
+void
+vpair_fma (float *r, float *a, float *b, float *c, size_t num)
+{
+  vector_pair_f32_t *vr = (vector_pair_f32_t *)r;
+  vector_pair_f32_t *va = (vector_pair_f32_t *)a;
+  vector_pair_f32_t *vb = (vector_pair_f32_t *)b;
+  vector_pair_f32_t *vc = (vector_pair_f32_t *)c;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float));
+
+  for (i = 0; i < num2; i++)
+    vpair_f32_fma (vr + i, va + i, vb + i, vc + i);
+}
+
+void
+vpair_fms (float *r, float *a, float *b, float *c, size_t num)
+{
+  vector_pair_f32_t *vr = (vector_pair_f32_t *)r;
+  vector_pair_f32_t *va = (vector_pair_f32_t *)a;
+  vector_pair_f32_t *vb = (vector_pair_f32_t *)b;
+  vector_pair_f32_t *vc = (vector_pair_f32_t *)c;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float));
+
+  for (i = 0; i < num2; i++)
+    vpair_f32_fms (vr + i, va + i, vb + i, vc + i);
+}
+
+void
+vpair_nfma (float *r, float *a, float *b, float *c, size_t num)
+{
+  vector_pair_f32_t *vr = (vector_pair_f32_t *)r;
+  vector_pair_f32_t *va = (vector_pair_f32_t *)a;
+  vector_pair_f32_t *vb = (vector_pair_f32_t *)b;
+  vector_pair_f32_t *vc = (vector_pair_f32_t *)c;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float));
+
+  for (i = 0; i < num2; i++)
+    vpair_f32_nfma (vr + i, va + i, vb + i, vc + i);
+}
+
+void
+vpair_nfms (float *r, float *a, float *b, float *c, size_t num)
+{
+  vector_pair_f32_t *vr = (vector_pair_f32_t *)r;
+  vector_pair_f32_t *va = (vector_pair_f32_t *)a;
+  vector_pair_f32_t *vb = (vector_pair_f32_t *)b;
+  vector_pair_f32_t *vc = (vector_pair_f32_t *)c;
+
+  size_t i;
+  size_t num2 = num / (sizeof (vector_pair_f32_t) / sizeof (float));
+
+  for (i = 0; i < num2; i++)
+    vpair_f32_nfms (vr + i, va + i, vb + i, vc + i);
+}
+
+
+/* scalar tests.  */
+
+void
+scalar_abs (float *r, float *a, float *b, float *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = (a[i] < 0.0) ? -a[i] : a[i];
+}
+
+void
+scalar_nabs (float *r, float *a, float *b, float *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = (a[i] < 0.0) ? a[i] : -a[i];
+}
+
+void
+scalar_neg (float *r, float *a, float *b, float *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = -a[i];
+}
+
+void
+scalar_sqrt (float *r, float *a, float *b, float *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = __builtin_sqrt (a[i]);
+}
+
+void
+scalar_add (float *r, float *a, float *b, float *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = a[i] + b[i];
+}
+
+void
+scalar_sub (float *r, float *a, float *b, float *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = a[i] - b[i];
+}
+
+void
+scalar_mul (float *r, float *a, float *b, float *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = a[i] * b[i];
+}
+
+void
+scalar_div (float *r, float *a, float *b, float *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = a[i] / b[i];
+}
+
+void
+scalar_min (float *r, float *a, float *b, float *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = (a[i] < b[i]) ? a[i] : b[i];
+}
+
+void
+scalar_max (float *r, float *a, float *b, float *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = (a[i] > b[i]) ? a[i] : b[i];
+}
+
+void
+scalar_fma (float *r, float *a, float *b, float *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = __builtin_fma (a[i], b[i], c[i]);
+}
+
+void
+scalar_fms (float *r, float *a, float *b, float *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = __builtin_fma (a[i], b[i], -c[i]);
+}
+
+void
+scalar_nfma (float *r, float *a, float *b, float *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = - __builtin_fma (a[i], b[i], c[i]);
+}
+
+void
+scalar_nfms (float *r, float *a, float *b, float *c, size_t num)
+{
+  size_t i;
+
+  for (i = 0; i < num; i++)
+    r[i] = - __builtin_fma (a[i], b[i], -c[i]);
+}
+
+
+/* Check results.  */
+void
+check (const char *name)
+{
+  size_t i;
+
+  for (i = 0; i < NUM; i++)
+    if (result1[i] != result2[i])
+      {
+#ifdef DEBUG
+	printf ("test #%ld failed, %g != %g, %s (%g, %g, %g).\n",
+		(long)i,
+		result1[i],
+		result2[i],
+		name,
+		in_a[i],
+		in_b[i],
+		in_c[i]);
+#endif
+	abort ();
+      }
+
+  return;
+}
+
+typedef void func_t (float *, float *, float *, float *, size_t);
+
+/* tests to run.  */
+struct
+{
+  func_t *vpair_test;
+  func_t *scalar_test;
+  const char *name;
+} tests[] = {
+  { vpair_abs,  scalar_abs,     "abs"  }, 
+  { vpair_nabs, scalar_nabs,    "nabs" }, 
+  { vpair_neg,  scalar_neg,     "neg"  }, 
+  { vpair_sqrt, scalar_sqrt,    "sqrt" }, 
+  { vpair_add,  scalar_add,     "add"  }, 
+  { vpair_sub,  scalar_sub,     "sub"  }, 
+  { vpair_mul,  scalar_mul,     "mul"  }, 
+  { vpair_div,  scalar_div,     "div"  }, 
+  { vpair_min,  scalar_min,     "min"  }, 
+  { vpair_max,  scalar_max,     "max"  }, 
+  { vpair_fma,  scalar_fma,     "fma"  }, 
+  { vpair_fms,  scalar_fms,     "fms"  }, 
+  { vpair_nfma, scalar_nfma,    "nfma" }, 
+  { vpair_nfms, scalar_nfms,    "nfms" }, 
+};
+
+/* Run tests.  */
+
+int
+main (void)
+{
+  size_t i;
+
+  /* Initialize the inputs.  */
+  for (i = 0; i < NUM; i++)
+    {
+      float f = (float)(i + 1);
+      in_a[i] = f * f;
+      in_b[i] = f;
+      in_c[i] = f + 2.0f;
+    }
+
+#ifdef DEBUG
+  printf ("Start tests\n");
+#endif
+
+  /* Run the tests.  */
+  for (i = 0; i < sizeof (tests) / sizeof (tests[0]); i++)
+    {
+      tests[i].vpair_test  (result1, in_a, in_b, in_c, NUM);
+      tests[i].scalar_test (result2, in_a, in_b, in_c, NUM);
+      check (tests[i].name);
+    }
+
+#ifdef DEBUG
+  printf ("End tests\n");
+#endif
+
+  return 0;
+}