From patchwork Fri Apr 11 20:09:44 2014 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Marc Glisse X-Patchwork-Id: 338633 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Received: from sourceware.org (server1.sourceware.org [209.132.180.131]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by ozlabs.org (Postfix) with ESMTPS id 143C41400D9 for ; Sat, 12 Apr 2014 06:10:12 +1000 (EST) DomainKey-Signature: a=rsa-sha1; c=nofws; d=gcc.gnu.org; h=list-id :list-unsubscribe:list-archive:list-post:list-help:sender:date :from:to:subject:message-id:mime-version:content-type; q=dns; s= default; b=gBmsV+3zNVNpuqJI53JwU6KtXiuWactXLiQ5XeEz7ganLzDYUMyyf n8Acg+D8If7TjwuYA5kIaEhUmXwNIj3LhRMQW4cvBe71CAgnM1qOvHVaPPlZQW26 GDMYkl7l+r975emYCVztAWbv61ROg7+7d04OvnFuPmyGMvEs96qakc= DKIM-Signature: v=1; a=rsa-sha1; c=relaxed; d=gcc.gnu.org; h=list-id :list-unsubscribe:list-archive:list-post:list-help:sender:date :from:to:subject:message-id:mime-version:content-type; s= default; bh=JMMC1nph0PNxYzWWg2h/3d8c3Oo=; b=dh0eW51cwO+Nusd5BT/E mdlWbuXk/8glShFx3swJAeN4zQZfLZYHwWcbW5lsmrwS2X3m7YXVLh8QtT9aDqrq UKi+ZE3f6kGO60SmX37wwuB9hPK1+hnPTYaToUNIzFXjDwpeW2ouMUrXma7aBkkN n6sBTkBNkkzOicmUlmf62SE= Received: (qmail 25820 invoked by alias); 11 Apr 2014 20:10:04 -0000 Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Archive: List-Post: List-Help: Sender: gcc-patches-owner@gcc.gnu.org Delivered-To: mailing list gcc-patches@gcc.gnu.org Received: (qmail 25806 invoked by uid 89); 11 Apr 2014 20:10:03 -0000 Authentication-Results: sourceware.org; auth=none X-Virus-Found: No X-Spam-SWARE-Status: No, score=-3.5 required=5.0 tests=AWL, BAYES_00, RP_MATCHES_RCVD autolearn=ham version=3.3.2 X-HELO: mail2-relais-roc.national.inria.fr Received: from mail2-relais-roc.national.inria.fr (HELO mail2-relais-roc.national.inria.fr) (192.134.164.83) by sourceware.org (qpsmtpd/0.93/v0.84-503-g423c35a) with (CAMELLIA256-SHA encrypted) ESMTPS; Fri, 11 Apr 2014 20:10:01 +0000 Received: from stedding.saclay.inria.fr ([193.55.250.194]) by mail2-relais-roc.national.inria.fr with ESMTP/TLS/DHE-RSA-AES128-SHA; 11 Apr 2014 22:09:44 +0200 Received: from glisse (helo=localhost) by stedding.saclay.inria.fr with local-esmtp (Exim 4.82) (envelope-from ) id 1WYhlg-0005A4-Cu for gcc-patches@gcc.gnu.org; Fri, 11 Apr 2014 22:09:44 +0200 Date: Fri, 11 Apr 2014 22:09:44 +0200 (CEST) From: Marc Glisse To: gcc-patches@gcc.gnu.org Subject: [i386] Replace builtins with vector extensions Message-ID: User-Agent: Alpine 2.02 (DEB 1266 2009-07-14) MIME-Version: 1.0 Hello, the previous discussion on the topic was before we added all those #pragma target in *mmintrin.h: http://gcc.gnu.org/ml/gcc-patches/2013-04/msg00374.html I believe that removes a large part of the arguments against it. Note that I only did a few of the more obvious intrinsics, I am waiting to see if this patch is accepted before doing more. Bootstrap+testsuite on x86_64-linux-gnu. 2014-04-11 Marc Glisse * config/i386/xmmintrin.h (_mm_add_ps, _mm_sub_ps, _mm_mul_ps, _mm_div_ps, _mm_store_ss, _mm_cvtss_f32): Use vector extensions instead of builtins. * config/i386/emmintrin.h (_mm_store_sd, _mm_cvtsd_f64, _mm_storeh_pd, _mm_cvtsi128_si64, _mm_cvtsi128_si64x, _mm_add_pd, _mm_sub_pd, _mm_mul_pd, _mm_div_pd, _mm_storel_epi64, _mm_movepi64_pi64, _mm_loadh_pd, _mm_loadl_pd): Likewise. (_mm_sqrt_sd): Fix comment. Index: gcc/config/i386/emmintrin.h =================================================================== --- gcc/config/i386/emmintrin.h (revision 209323) +++ gcc/config/i386/emmintrin.h (working copy) @@ -161,40 +161,40 @@ _mm_store_pd (double *__P, __m128d __A) extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_storeu_pd (double *__P, __m128d __A) { __builtin_ia32_storeupd (__P, __A); } /* Stores the lower DPFP value. */ extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_store_sd (double *__P, __m128d __A) { - *__P = __builtin_ia32_vec_ext_v2df (__A, 0); + *__P = __A[0]; } extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtsd_f64 (__m128d __A) { - return __builtin_ia32_vec_ext_v2df (__A, 0); + return __A[0]; } extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_storel_pd (double *__P, __m128d __A) { _mm_store_sd (__P, __A); } /* Stores the upper DPFP value. */ extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_storeh_pd (double *__P, __m128d __A) { - *__P = __builtin_ia32_vec_ext_v2df (__A, 1); + *__P = __A[1]; } /* Store the lower DPFP value across two words. The address must be 16-byte aligned. */ extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_store1_pd (double *__P, __m128d __A) { _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,0))); } @@ -215,86 +215,86 @@ extern __inline int __attribute__((__gnu _mm_cvtsi128_si32 (__m128i __A) { return __builtin_ia32_vec_ext_v4si ((__v4si)__A, 0); } #ifdef __x86_64__ /* Intel intrinsic. */ extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtsi128_si64 (__m128i __A) { - return __builtin_ia32_vec_ext_v2di ((__v2di)__A, 0); + return __A[0]; } /* Microsoft intrinsic. */ extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtsi128_si64x (__m128i __A) { - return __builtin_ia32_vec_ext_v2di ((__v2di)__A, 0); + return __A[0]; } #endif extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_add_pd (__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_addpd ((__v2df)__A, (__v2df)__B); + return __A + __B; } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_add_sd (__m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_addsd ((__v2df)__A, (__v2df)__B); } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sub_pd (__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_subpd ((__v2df)__A, (__v2df)__B); + return __A - __B; } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sub_sd (__m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_subsd ((__v2df)__A, (__v2df)__B); } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mul_pd (__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_mulpd ((__v2df)__A, (__v2df)__B); + return __A * __B; } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mul_sd (__m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_mulsd ((__v2df)__A, (__v2df)__B); } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_div_pd (__m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_divpd ((__v2df)__A, (__v2df)__B); + return __A / __B; } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_div_sd (__m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_divsd ((__v2df)__A, (__v2df)__B); } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sqrt_pd (__m128d __A) { return (__m128d)__builtin_ia32_sqrtpd ((__v2df)__A); } -/* Return pair {sqrt (A[0), B[1]}. */ +/* Return pair {sqrt (B[0]), A[1]}. */ extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sqrt_sd (__m128d __A, __m128d __B) { __v2df __tmp = __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B); return (__m128d)__builtin_ia32_sqrtsd ((__v2df)__tmp); } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_min_pd (__m128d __A, __m128d __B) { @@ -708,27 +708,27 @@ _mm_store_si128 (__m128i *__P, __m128i _ extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_storeu_si128 (__m128i *__P, __m128i __B) { __builtin_ia32_storedqu ((char *)__P, (__v16qi)__B); } extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_storel_epi64 (__m128i *__P, __m128i __B) { - *(long long *)__P = __builtin_ia32_vec_ext_v2di ((__v2di)__B, 0); + *(long long *)__P = __B[0]; } extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_movepi64_pi64 (__m128i __B) { - return (__m64) __builtin_ia32_vec_ext_v2di ((__v2di)__B, 0); + return (__m64) __B[0]; } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_movpi64_epi64 (__m64 __A) { return _mm_set_epi64 ((__m64)0LL, __A); } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_move_epi64 (__m128i __A) @@ -915,27 +915,27 @@ _mm_unpackhi_pd (__m128d __A, __m128d __ extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_unpacklo_pd (__m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_unpcklpd ((__v2df)__A, (__v2df)__B); } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_loadh_pd (__m128d __A, double const *__B) { - return (__m128d)__builtin_ia32_loadhpd ((__v2df)__A, __B); + return __extension__ (__m128d){ __A[0], __B[0] }; } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_loadl_pd (__m128d __A, double const *__B) { - return (__m128d)__builtin_ia32_loadlpd ((__v2df)__A, __B); + return __extension__ (__m128d){ __B[0], __A[1] }; } extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_movemask_pd (__m128d __A) { return __builtin_ia32_movmskpd ((__v2df)__A); } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_packs_epi16 (__m128i __A, __m128i __B) Index: gcc/config/i386/xmmintrin.h =================================================================== --- gcc/config/i386/xmmintrin.h (revision 209323) +++ gcc/config/i386/xmmintrin.h (working copy) @@ -173,39 +173,39 @@ extern __inline __m128 __attribute__((__ _mm_max_ss (__m128 __A, __m128 __B) { return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B); } /* Perform the respective operation on the four SPFP values in A and B. */ extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_add_ps (__m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B); + return __A + __B; } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sub_ps (__m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B); + return __A - __B; } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_mul_ps (__m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B); + return __A * __B; } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_div_ps (__m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_divps ((__v4sf)__A, (__v4sf)__B); + return __A / __B; } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_sqrt_ps (__m128 __A) { return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A); } extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_rcp_ps (__m128 __A) @@ -950,27 +950,27 @@ _mm_set_ps (const float __Z, const float extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_setr_ps (float __Z, float __Y, float __X, float __W) { return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W }; } /* Stores the lower SPFP value. */ extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_store_ss (float *__P, __m128 __A) { - *__P = __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0); + *__P = __A[0]; } extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_cvtss_f32 (__m128 __A) { - return __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0); + return __A[0]; } /* Store four SPFP values. The address must be 16-byte aligned. */ extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_store_ps (float *__P, __m128 __A) { *(__v4sf *)__P = (__v4sf)__A; } /* Store four SPFP values. The address need not be 16-byte aligned. */