[v3,6/6] rs6000: Guard some x86 intrinsics implementations

Message ID 20210823190310.1679905-7-pc@us.ibm.com
State New
Series rs6000: Support more SSE4 intrinsics

Commit Message

Paul A. Clarke Aug. 23, 2021, 7:03 p.m. UTC
Some compatibility implementations of x86 intrinsics include
Power intrinsics which require POWER8.  Guard them.

emmintrin.h:
- _mm_cmpord_pd: Remove code which was ostensibly for pre-POWER8,
  but which indeed depended on POWER8 (vec_cmpgt(v2du)/vcmpgtud).
  The "POWER8" version works fine on pre-POWER8.
- _mm_mul_epu32: vec_mule(v4su) uses vmuleuw.
pmmintrin.h:
- _mm_movehdup_ps: vec_mergeo(v4su) uses vmrgow.
- _mm_moveldup_ps: vec_mergee(v4su) uses vmrgew.
smmintrin.h:
- _mm_cmpeq_epi64: vec_cmpeq(v2di) uses vcmpequd.
- _mm_mul_epi32: vec_mule(v4si) uses vmulesw.
- _mm_cmpgt_epi64: vec_cmpgt(v2di) uses vcmpgtsd.
tmmintrin.h:
- _mm_sign_epi8: vec_neg(v16qi) uses vsububm.
- _mm_sign_epi16: vec_neg(v8hi) uses vsubuhm.
- _mm_sign_epi32: vec_neg(v4si) uses vsubuwm.
  Note that the above three could actually be supported pre-POWER8,
  since the underlying subtract instructions are original AltiVec,
  but current GCC does not provide vec_neg before POWER8.
- _mm_sign_pi8: depends on _mm_sign_epi8.
- _mm_sign_pi16: depends on _mm_sign_epi16.
- _mm_sign_pi32: depends on _mm_sign_epi32.

2021-08-20  Paul A. Clarke  <pc@us.ibm.com>

gcc
	PR target/101893
	* config/rs6000/emmintrin.h: Guard POWER8 intrinsics.
	* config/rs6000/pmmintrin.h: Same.
	* config/rs6000/smmintrin.h: Same.
	* config/rs6000/tmmintrin.h: Same.
---
v3: No change.
v2:
- Ensured that new "#ifdef _ARCH_PWR8" guards bracket each impacted
  function individually, rather than groups of functions, per v1 review.
- Noted testing in patch series cover letter.
- Added PR number to commit message.

 gcc/config/rs6000/emmintrin.h | 12 ++----------
 gcc/config/rs6000/pmmintrin.h |  4 ++++
 gcc/config/rs6000/smmintrin.h |  4 ++++
 gcc/config/rs6000/tmmintrin.h | 12 ++++++++++++
 4 files changed, 22 insertions(+), 10 deletions(-)

Comments

Li, Pan2 via Gcc-patches Aug. 27, 2021, 3:25 p.m. UTC | #1
Hi Paul,

Thanks for the changes!  This looks fine to me, recommend approval.

Thanks,
Bill

Segher Boessenkool Oct. 12, 2021, 12:11 a.m. UTC | #2
Hi!

On Mon, Aug 23, 2021 at 02:03:10PM -0500, Paul A. Clarke wrote:
> Some compatibility implementations of x86 intrinsics include
> Power intrinsics which require POWER8.  Guard them.

> emmintrin.h:
> - _mm_cmpord_pd: Remove code which was ostensibly for pre-POWER8,
>   but which indeed depended on POWER8 (vec_cmpgt(v2du)/vcmpgtud).
>   The "POWER8" version works fine on pre-POWER8.

Huh.  It just generates xvcmpeqdp I suppose?

> - _mm_mul_epu32: vec_mule(v4su) uses vmuleuw.

Did this fail on p7?  If not, add a test that *does*?

> pmmintrin.h:
> - _mm_movehdup_ps: vec_mergeo(v4su) uses vmrgow.
> - _mm_moveldup_ps: vec_mergee(v4su) uses vmrgew.

Similar.

> smmintrin.h:
> - _mm_cmpeq_epi64: vec_cmpeq(v2di) uses vcmpequd.
> - _mm_mul_epi32: vec_mule(v4si) uses vmulesw.
> - _mm_cmpgt_epi64: vec_cmpgt(v2di) uses vcmpgtsd.
> tmmintrin.h:
> - _mm_sign_epi8: vec_neg(v16qi) uses vsububm.
> - _mm_sign_epi16: vec_neg(v8hi) uses vsubuhm.
> - _mm_sign_epi32: vec_neg(v4si) uses vsubuwm.
>   Note that the above three could actually be supported pre-POWER8,
>   since the underlying subtract instructions are original AltiVec,
>   but current GCC does not provide vec_neg before POWER8.
> - _mm_sign_pi8: depends on _mm_sign_epi8.
> - _mm_sign_pi16: depends on _mm_sign_epi16.
> - _mm_sign_pi32: depends on _mm_sign_epi32.

And more.

> gcc
> 	PR target/101893

This is a different bug (the vgbbd one)?

All looks good, but we need such failing tests :-)


Segher
Paul A. Clarke Oct. 13, 2021, 5:04 p.m. UTC | #3
On Mon, Oct 11, 2021 at 07:11:13PM -0500, Segher Boessenkool wrote:
> On Mon, Aug 23, 2021 at 02:03:10PM -0500, Paul A. Clarke wrote:
> > Some compatibility implementations of x86 intrinsics include
> > Power intrinsics which require POWER8.  Guard them.
> 
> > emmintrin.h:
> > - _mm_cmpord_pd: Remove code which was ostensibly for pre-POWER8,
> >   but which indeed depended on POWER8 (vec_cmpgt(v2du)/vcmpgtud).
> >   The "POWER8" version works fine on pre-POWER8.
> 
> Huh.  It just generates xvcmpeqdp I suppose?

Yes.
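
For reference, a minimal sketch of the retained logic (assuming a
VSX-enabled powerpc target; vec_cmpeq on vector double maps to
xvcmpeqdp, an ISA 2.06/POWER7 instruction, so no POWER8 guard is
needed):

  #include <altivec.h>

  typedef __vector double v2df;
  typedef __vector unsigned long long v2du;

  /* All-ones in each lane where neither input is NaN.  */
  v2du
  ordered_mask (v2df a, v2df b)
  {
    /* Comparing a value with itself yields false (0's) in NaN lanes.  */
    v2du c = (v2du) vec_cmpeq (a, a);
    v2du d = (v2du) vec_cmpeq (b, b);
    return vec_and (c, d);
  }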

> > - _mm_mul_epu32: vec_mule(v4su) uses vmuleuw.
> 
> Did this fail on p7?  If not, add a test that *does*?

Do you mean fail if not for "dg-require-effective-target p8vector_hw"?
We have that, in gcc/testsuite/gcc.target/powerpc/sse2-pmuludq-1.c.
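
The gating in that test looks like this (a sketch using standard GCC
testsuite directives; the exact options in the real file may differ):

  /* { dg-do run } */
  /* { dg-require-effective-target p8vector_hw } */
  /* { dg-options "-O2 -mpower8-vector" } */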

> > pmmintrin.h:
> > - _mm_movehdup_ps: vec_mergeo(v4su) uses vmrgow.
> > - _mm_moveldup_ps: vec_mergee(v4su) uses vmrgew.
> 
> Similar.

gcc/testsuite/gcc.target/powerpc/sse3-movshdup.c
gcc/testsuite/gcc.target/powerpc/sse3-movsldup.c

> > smmintrin.h:
> > - _mm_cmpeq_epi64: vec_cmpeq(v2di) uses vcmpequd.
> > - _mm_mul_epi32: vec_mule(v4si) uses vmulesw.
> > - _mm_cmpgt_epi64: vec_cmpgt(v2di) uses vcmpgtsd.
> > tmmintrin.h:
> > - _mm_sign_epi8: vec_neg(v16qi) uses vsububm.
> > - _mm_sign_epi16: vec_neg(v8hi) uses vsubuhm.
> > - _mm_sign_epi32: vec_neg(v4si) uses vsubuwm.
> >   Note that the above three could actually be supported pre-POWER8,
> >   since the underlying subtract instructions are original AltiVec,
> >   but current GCC does not provide vec_neg before POWER8.
> > - _mm_sign_pi8: depends on _mm_sign_epi8.
> > - _mm_sign_pi16: depends on _mm_sign_epi16.
> > - _mm_sign_pi32: depends on _mm_sign_epi32.
> 
> And more.

gcc/testsuite/gcc.target/powerpc/sse4_1-pcmpeqq.c
gcc/testsuite/gcc.target/powerpc/sse4_1-pmuldq.c
gcc/testsuite/gcc.target/powerpc/sse4_2-pcmpgtq.c
- although this one will _actually_ fail on P7, as it only requires
"vsx_hw". I'll fix this.
gcc/testsuite/gcc.target/powerpc/ssse3-psignb.c
gcc/testsuite/gcc.target/powerpc/ssse3-psignw.c
gcc/testsuite/gcc.target/powerpc/ssse3-psignd.c
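
A sketch of that fix for sse4_2-pcmpgtq.c (hypothetical hunk; the
surrounding lines in the real file may differ):

  -/* { dg-require-effective-target vsx_hw } */
  +/* { dg-require-effective-target p8vector_hw } */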

> > gcc
> > 	PR target/101893
> 
> This is a different bug (the vgbbd one)?

PR 101893 is the same issue: things not being properly masked by
#ifdefs.

> All looks good, but we need such failing tests :-)

Thanks for the review! Let me know what you mean by "failing tests".
("Would fail if not for ..."?)

PC
Segher Boessenkool Oct. 13, 2021, 11:47 p.m. UTC | #4
On Wed, Oct 13, 2021 at 12:04:39PM -0500, Paul A. Clarke wrote:
> On Mon, Oct 11, 2021 at 07:11:13PM -0500, Segher Boessenkool wrote:
> > > - _mm_mul_epu32: vec_mule(v4su) uses vmuleuw.
> > 
> > Did this fail on p7?  If not, add a test that *does*?
> 
> Do you mean fail if not for "dg-require-effective-target p8vector_hw"?
> We have that, in gcc/testsuite/gcc.target/powerpc/sse2-pmuludq-1.c.

"Some compatibility implementations of x86 intrinsics include
Power intrinsics which require POWER8."

Plus, everything this patch does.  None of that would be needed if it
worked on p7!

So things in this patch are either not needed (so add noise only, and
reduce functionality on older systems for no reason), or they do fix a
bug.  It would be nice if we could have detected such bugs earlier.

> > > gcc
> > > 	PR target/101893
> > 
> > This is a different bug (the vgbbd one)?
> 
> PR 101893 is the same issue: things not being properly masked by
> #ifdefs.

But PR101893 does not mention anything you touch here, and this patch
does not fix PR101893.  The main purpose of bug tracking systems is the
tracking part!


Segher
Paul A. Clarke Oct. 19, 2021, 12:26 a.m. UTC | #5
On Wed, Oct 13, 2021 at 06:47:21PM -0500, Segher Boessenkool wrote:
> On Wed, Oct 13, 2021 at 12:04:39PM -0500, Paul A. Clarke wrote:
> > On Mon, Oct 11, 2021 at 07:11:13PM -0500, Segher Boessenkool wrote:
> > > > - _mm_mul_epu32: vec_mule(v4su) uses vmuleuw.
> > > 
> > > Did this fail on p7?  If not, add a test that *does*?
> > 
> > Do you mean fail if not for "dg-require-effective-target p8vector_hw"?
> > We have that, in gcc/testsuite/gcc.target/powerpc/sse2-pmuludq-1.c.
> 
> "Some compatibility implementations of x86 intrinsics include
> Power intrinsics which require POWER8."
> 
> Plus, everything this patch does.  None of that would be needed if it
> worked on p7!

The tests that are permitted to compile/link on P7, gated by dg directives,
work on P7.

> So things in this patch are either not needed (so add noise only, and
> reduce functionality on older systems for no reason), or they do fix a
> bug.  It would be nice if we could have detected such bugs earlier.

Most, if not all, of the intrinsics tests were originally limited to
P8 and up, 64-bit, and little-endian. At your request, I have lowered
many of those restrictions in areas that are capable of support.
Such is the case here, to enable compiling and running as much as
possible on P7.

If you want a different approach, do let me know.

> > > > gcc
> > > > 	PR target/101893
> > > 
> > > This is a different bug (the vgbbd one)?
> > 
> > PR 101893 is the same issue: things not being properly masked by
> > #ifdefs.
> 
> But PR101893 does not mention anything you touch here, and this patch
> does not fix PR101893.  The main purpose of bug tracking systems is the
> tracking part!

The error message in PR101893 is in smmintrin.h:
| gcc/include/smmintrin.h:103:3: error: AltiVec argument passed to unprototyped function
| 
| That line is
| 
|   __charmask = vec_gb (__charmask);

smmintrin.h is changed by this patch, including `#ifdef _ARCH_PWR8` around
the code which has vec_gb.
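
As a sketch of the pattern (hypothetical helper; vec_gb maps to vgbbd,
a POWER8/ISA 2.07 instruction, so any code using it must be bracketed):

  #include <altivec.h>

  #ifdef _ARCH_PWR8
  static inline __vector unsigned char
  gather_bits (__vector unsigned char charmask)
  {
    /* vec_gb emits vgbbd: gather bits by bytes by doubleword.  */
    return vec_gb (charmask);
  }
  #endif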

PC
Patch

diff --git a/gcc/config/rs6000/emmintrin.h b/gcc/config/rs6000/emmintrin.h
index ce1287edf782..32ad72b4cc35 100644
--- a/gcc/config/rs6000/emmintrin.h
+++ b/gcc/config/rs6000/emmintrin.h
@@ -430,20 +430,10 @@  _mm_cmpnge_pd (__m128d __A, __m128d __B)
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmpord_pd (__m128d __A, __m128d __B)
 {
-#if _ARCH_PWR8
   __v2du c, d;
   /* Compare against self will return false (0's) if NAN.  */
   c = (__v2du)vec_cmpeq (__A, __A);
   d = (__v2du)vec_cmpeq (__B, __B);
-#else
-  __v2du a, b;
-  __v2du c, d;
-  const __v2du double_exp_mask  = {0x7ff0000000000000, 0x7ff0000000000000};
-  a = (__v2du)vec_abs ((__v2df)__A);
-  b = (__v2du)vec_abs ((__v2df)__B);
-  c = (__v2du)vec_cmpgt (double_exp_mask, a);
-  d = (__v2du)vec_cmpgt (double_exp_mask, b);
-#endif
   /* A != NAN and B != NAN.  */
   return ((__m128d)vec_and(c, d));
 }
@@ -1472,6 +1462,7 @@  _mm_mul_su32 (__m64 __A, __m64 __B)
   return ((__m64)a * (__m64)b);
 }
 
+#ifdef _ARCH_PWR8
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_mul_epu32 (__m128i __A, __m128i __B)
 {
@@ -1498,6 +1489,7 @@  _mm_mul_epu32 (__m128i __A, __m128i __B)
   return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B);
 #endif
 }
+#endif
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_slli_epi16 (__m128i __A, int __B)
diff --git a/gcc/config/rs6000/pmmintrin.h b/gcc/config/rs6000/pmmintrin.h
index eab712fdfa66..83dff1d85666 100644
--- a/gcc/config/rs6000/pmmintrin.h
+++ b/gcc/config/rs6000/pmmintrin.h
@@ -123,17 +123,21 @@  _mm_hsub_pd (__m128d __X, __m128d __Y)
 			    vec_mergel ((__v2df) __X, (__v2df)__Y));
 }
 
+#ifdef _ARCH_PWR8
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_movehdup_ps (__m128 __X)
 {
   return (__m128)vec_mergeo ((__v4su)__X, (__v4su)__X);
 }
+#endif
 
+#ifdef _ARCH_PWR8
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_moveldup_ps (__m128 __X)
 {
   return (__m128)vec_mergee ((__v4su)__X, (__v4su)__X);
 }
+#endif
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_loaddup_pd (double const *__P)
diff --git a/gcc/config/rs6000/smmintrin.h b/gcc/config/rs6000/smmintrin.h
index c04d2bb5b6d3..29719367e205 100644
--- a/gcc/config/rs6000/smmintrin.h
+++ b/gcc/config/rs6000/smmintrin.h
@@ -272,6 +272,7 @@  _mm_extract_ps (__m128 __X, const int __N)
   return ((__v4si)__X)[__N & 3];
 }
 
+#ifdef _ARCH_PWR8
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_blend_epi16 (__m128i __A, __m128i __B, const int __imm8)
 {
@@ -283,6 +284,7 @@  _mm_blend_epi16 (__m128i __A, __m128i __B, const int __imm8)
   #endif
   return (__m128i) vec_sel ((__v8hu) __A, (__v8hu) __B, __shortmask);
 }
+#endif
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_blendv_epi8 (__m128i __A, __m128i __B, __m128i __mask)
@@ -343,6 +345,7 @@  _mm_blend_pd (__m128d __A, __m128d __B, const int __imm8)
   return (__m128d) __r;
 }
 
+#ifdef _ARCH_PWR8
 __inline __m128d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm_blendv_pd (__m128d __A, __m128d __B, __m128d __mask)
@@ -351,6 +354,7 @@  _mm_blendv_pd (__m128d __A, __m128d __B, __m128d __mask)
   const __vector __bool long long __boolmask = vec_cmplt ((__v2di) __mask, __zero);
   return (__m128d) vec_sel ((__v2du) __A, (__v2du) __B, (__v2du) __boolmask);
 }
+#endif
 
 __inline int
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
diff --git a/gcc/config/rs6000/tmmintrin.h b/gcc/config/rs6000/tmmintrin.h
index 971511260b78..a67d88c8079a 100644
--- a/gcc/config/rs6000/tmmintrin.h
+++ b/gcc/config/rs6000/tmmintrin.h
@@ -350,6 +350,7 @@  _mm_shuffle_pi8 (__m64 __A, __m64 __B)
   return (__m64) ((__v2du) (__C))[0];
 }
 
+#ifdef _ARCH_PWR8
 extern __inline __m128i
 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sign_epi8 (__m128i __A, __m128i __B)
@@ -361,7 +362,9 @@  _mm_sign_epi8 (__m128i __A, __m128i __B)
   __v16qi __conv = vec_add (__selectneg, __selectpos);
   return (__m128i) vec_mul ((__v16qi) __A, (__v16qi) __conv);
 }
+#endif
 
+#ifdef _ARCH_PWR8
 extern __inline __m128i
 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sign_epi16 (__m128i __A, __m128i __B)
@@ -373,7 +376,9 @@  _mm_sign_epi16 (__m128i __A, __m128i __B)
   __v8hi __conv = vec_add (__selectneg, __selectpos);
   return (__m128i) vec_mul ((__v8hi) __A, (__v8hi) __conv);
 }
+#endif
 
+#ifdef _ARCH_PWR8
 extern __inline __m128i
 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sign_epi32 (__m128i __A, __m128i __B)
@@ -385,7 +390,9 @@  _mm_sign_epi32 (__m128i __A, __m128i __B)
   __v4si __conv = vec_add (__selectneg, __selectpos);
   return (__m128i) vec_mul ((__v4si) __A, (__v4si) __conv);
 }
+#endif
 
+#ifdef _ARCH_PWR8
 extern __inline __m64
 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sign_pi8 (__m64 __A, __m64 __B)
@@ -396,7 +403,9 @@  _mm_sign_pi8 (__m64 __A, __m64 __B)
   __C = (__v16qi) _mm_sign_epi8 ((__m128i) __C, (__m128i) __D);
   return (__m64) ((__v2du) (__C))[0];
 }
+#endif
 
+#ifdef _ARCH_PWR8
 extern __inline __m64
 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sign_pi16 (__m64 __A, __m64 __B)
@@ -407,7 +416,9 @@  _mm_sign_pi16 (__m64 __A, __m64 __B)
   __C = (__v8hi) _mm_sign_epi16 ((__m128i) __C, (__m128i) __D);
   return (__m64) ((__v2du) (__C))[0];
 }
+#endif
 
+#ifdef _ARCH_PWR8
 extern __inline __m64
 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sign_pi32 (__m64 __A, __m64 __B)
@@ -418,6 +429,7 @@  _mm_sign_pi32 (__m64 __A, __m64 __B)
   __C = (__v4si) _mm_sign_epi32 ((__m128i) __C, (__m128i) __D);
   return (__m64) ((__v2du) (__C))[0];
 }
+#endif
 
 extern __inline __m128i
 __attribute__((__gnu_inline__, __always_inline__, __artificial__))