From patchwork Tue Jul 28 11:23:41 2015 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Alan Lawrence X-Patchwork-Id: 501164 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Received: from sourceware.org (server1.sourceware.org [209.132.180.131]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by ozlabs.org (Postfix) with ESMTPS id 643D7140285 for ; Tue, 28 Jul 2015 21:23:59 +1000 (AEST) Authentication-Results: ozlabs.org; dkim=pass (1024-bit key; unprotected) header.d=gcc.gnu.org header.i=@gcc.gnu.org header.b=oTVnPOZ1; dkim-atps=neutral DomainKey-Signature: a=rsa-sha1; c=nofws; d=gcc.gnu.org; h=list-id :list-unsubscribe:list-archive:list-post:list-help:sender :message-id:date:from:mime-version:to:subject:in-reply-to :content-type; q=dns; s=default; b=uw1Nz3LXc1WyKfoUfR9aODhgzT+fm sxvJoZkIrPrAj5LYRKOUBcRCu4gVTJZuzXYOck044t1KjCvryBxcT4O9yc6tdaJi WDoUognnHgvzpCSxq5uhueH/Aaf9WqVW7oG909am0nHAOVQDdLhkmOMwUyxbov2O 1L1FdH43g8HkzM= DKIM-Signature: v=1; a=rsa-sha1; c=relaxed; d=gcc.gnu.org; h=list-id :list-unsubscribe:list-archive:list-post:list-help:sender :message-id:date:from:mime-version:to:subject:in-reply-to :content-type; s=default; bh=uUT/SDgE4Neg0M1itxoDenfF7JU=; b=oTV nPOZ13pZbFcylRdNJObtfskBHAk2hIKacCsQ6H8Ytlm6E6Ux7B6Jaw3Tgv0RoCmD ZJX49lObczDU0K28pd1x/7sMF/iB7feO25kf1EczxceA+FibeAv+V5nIIBaUaYqZ 3VQVVb2+nCCr2u4yQxrx8ujdgsxGE1mnsFQAa+KQ= Received: (qmail 87833 invoked by alias); 28 Jul 2015 11:23:50 -0000 Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Archive: List-Post: List-Help: Sender: gcc-patches-owner@gcc.gnu.org Delivered-To: mailing list gcc-patches@gcc.gnu.org Received: (qmail 87784 invoked by uid 89); 28 Jul 2015 11:23:49 -0000 Authentication-Results: sourceware.org; auth=none X-Virus-Found: No 
X-Spam-SWARE-Status: No, score=-0.4 required=5.0 tests=AWL, BAYES_50, SPF_PASS autolearn=ham version=3.3.2 X-HELO: eu-smtp-delivery-143.mimecast.com Received: from eu-smtp-delivery-143.mimecast.com (HELO eu-smtp-delivery-143.mimecast.com) (207.82.80.143) by sourceware.org (qpsmtpd/0.93/v0.84-503-g423c35a) with ESMTP; Tue, 28 Jul 2015 11:23:47 +0000 Received: from cam-owa1.Emea.Arm.com (fw-tnat.cambridge.arm.com [217.140.96.140]) by eu-smtp-1.mimecast.com with ESMTP id uk-mta-12-Vs-BCccvSe-v9ow5epDqUw-1; Tue, 28 Jul 2015 12:23:42 +0100 Received: from [10.2.207.65] ([10.1.2.79]) by cam-owa1.Emea.Arm.com with Microsoft SMTPSVC(6.0.3790.3959); Tue, 28 Jul 2015 12:23:42 +0100 Message-ID: <55B7663D.2030700@arm.com> Date: Tue, 28 Jul 2015 12:23:41 +0100 From: Alan Lawrence User-Agent: Thunderbird 2.0.0.24 (X11/20101213) MIME-Version: 1.0 To: "gcc-patches@gcc.gnu.org" Subject: [PATCH 2/15][ARM] float16x4_t intrinsics in arm_neon.h In-Reply-To: <55B765DF.4040706@arm.com> X-MC-Unique: Vs-BCccvSe-v9ow5epDqUw-1 X-IsSubscribed: yes This is a respin of https://gcc.gnu.org/ml/gcc-patches/2015-07/msg00476.html. The change is to provide all the new float16 intrinsics only if we actually have a scalar __fp16 type. (This covers the intrinsics whose implementation is entirely within arm_neon.h; those requiring .md changes follow in patch 7). 
gcc/ChangeLog (unchanged): * config/arm/arm_neon.h (float16_t, vget_lane_f16, vset_lane_f16, vcreate_f16, vld1_lane_f16, vld1_dup_f16, vreinterpret_p8_f16, vreinterpret_p16_f16, vreinterpret_f16_p8, vreinterpret_f16_p16, vreinterpret_f16_f32, vreinterpret_f16_p64, vreinterpret_f16_s64, vreinterpret_f16_u64, vreinterpret_f16_s8, vreinterpret_f16_s16, vreinterpret_f16_s32, vreinterpret_f16_u8, vreinterpret_f16_u16, vreinterpret_f16_u32, vreinterpret_f32_f16, vreinterpret_p64_f16, vreinterpret_s64_f16, vreinterpret_u64_f16, vreinterpret_s8_f16, vreinterpret_s16_f16, vreinterpret_s32_f16, vreinterpret_u8_f16, vreinterpret_u16_f16, vreinterpret_u32_f16): New. diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index 2b30be61a46a0c906478c599a005c27cd467dfa6..3c40f9f94fae30cab5e8833d72d0ac9ff3ac7b0f 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -42,6 +42,7 @@ typedef __simd64_int16_t int16x4_t; typedef __simd64_int32_t int32x2_t; typedef __builtin_neon_di int64x1_t; #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +typedef __fp16 float16_t; typedef __simd64_float16_t float16x4_t; #endif typedef __simd64_float32_t float32x2_t; @@ -5203,6 +5204,21 @@ vget_lane_s32 (int32x2_t __a, const int __b) return (int32_t)__builtin_neon_vget_lanev2si (__a, __b); } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +/* Functions cannot accept or return __fp16 types. Even if the function + were marked always-inline so there were no call sites, the declaration + would nonetheless raise an error. Hence, we must use a macro instead. 
*/ + +#define vget_lane_f16(__v, __idx) \ + __extension__ \ + ({ \ + float16x4_t __vec = (__v); \ + __builtin_arm_lane_check (4, __idx); \ + float16_t __res = __vec[__idx]; \ + __res; \ + }) +#endif + __extension__ static __inline float32_t __attribute__ ((__always_inline__)) vget_lane_f32 (float32x2_t __a, const int __b) { @@ -5335,6 +5351,18 @@ vset_lane_s32 (int32_t __a, int32x2_t __b, const int __c) return (int32x2_t)__builtin_neon_vset_lanev2si ((__builtin_neon_si) __a, __b, __c); } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +#define vset_lane_f16(__e, __v, __idx) \ + __extension__ \ + ({ \ + float16_t __elem = (__e); \ + float16x4_t __vec = (__v); \ + __builtin_arm_lane_check (4, __idx); \ + __vec[__idx] = __elem; \ + __vec; \ + }) +#endif + __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) vset_lane_f32 (float32_t __a, float32x2_t __b, const int __c) { @@ -5481,6 +5509,14 @@ vcreate_s64 (uint64_t __a) return (int64x1_t)__builtin_neon_vcreatedi ((__builtin_neon_di) __a); } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +vcreate_f16 (uint64_t __a) +{ + return (float16x4_t) __a; +} +#endif + __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) vcreate_f32 (uint64_t __a) { @@ -8802,6 +8838,14 @@ vld1_lane_s32 (const int32_t * __a, int32x2_t __b, const int __c) return (int32x2_t)__builtin_neon_vld1_lanev2si ((const __builtin_neon_si *) __a, __b, __c); } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +vld1_lane_f16 (const float16_t * __a, float16x4_t __b, const int __c) +{ + return vset_lane_f16 (*__a, __b, __c); +} +#endif + __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) vld1_lane_f32 (const float32_t * __a, float32x2_t 
__b, const int __c) { @@ -8950,6 +8994,15 @@ vld1_dup_s32 (const int32_t * __a) return (int32x2_t)__builtin_neon_vld1_dupv2si ((const __builtin_neon_si *) __a); } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +vld1_dup_f16 (const float16_t * __a) +{ + float16_t __f = *__a; + return (float16x4_t) { __f, __f, __f, __f }; +} +#endif + __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) vld1_dup_f32 (const float32_t * __a) { @@ -11833,6 +11886,14 @@ vreinterpret_p8_p16 (poly16x4_t __a) return (poly8x8_t)__builtin_neon_vreinterpretv8qiv4hi ((int16x4_t) __a); } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vreinterpret_p8_f16 (float16x4_t __a) +{ + return (poly8x8_t) __a; +} +#endif + __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) vreinterpret_p8_f32 (float32x2_t __a) { @@ -11901,6 +11962,14 @@ vreinterpret_p16_p8 (poly8x8_t __a) return (poly16x4_t)__builtin_neon_vreinterpretv4hiv8qi ((int8x8_t) __a); } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +vreinterpret_p16_f16 (float16x4_t __a) +{ + return (poly16x4_t) __a; +} +#endif + __extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) vreinterpret_p16_f32 (float32x2_t __a) { @@ -11963,6 +12032,104 @@ vreinterpret_p16_u32 (uint32x2_t __a) return (poly16x4_t)__builtin_neon_vreinterpretv4hiv2si ((int32x2_t) __a); } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +vreinterpret_f16_p8 (poly8x8_t __a) +{ + return (float16x4_t) __a; +} +#endif + +#if defined (__ARM_FP16_FORMAT_IEEE) || defined 
(__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +vreinterpret_f16_p16 (poly16x4_t __a) +{ + return (float16x4_t) __a; +} +#endif + +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +vreinterpret_f16_f32 (float32x2_t __a) +{ + return (float16x4_t) __a; +} +#endif + +#ifdef __ARM_FEATURE_CRYPTO +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +vreinterpret_f16_p64 (poly64x1_t __a) +{ + return (float16x4_t) __a; +} +#endif +#endif + +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +vreinterpret_f16_s64 (int64x1_t __a) +{ + return (float16x4_t) __a; +} +#endif + +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +vreinterpret_f16_u64 (uint64x1_t __a) +{ + return (float16x4_t) __a; +} +#endif + +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +vreinterpret_f16_s8 (int8x8_t __a) +{ + return (float16x4_t) __a; +} +#endif + +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +vreinterpret_f16_s16 (int16x4_t __a) +{ + return (float16x4_t) __a; +} +#endif + +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +vreinterpret_f16_s32 (int32x2_t __a) +{ + return (float16x4_t) __a; +} +#endif + +#if defined (__ARM_FP16_FORMAT_IEEE) || defined 
(__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +vreinterpret_f16_u8 (uint8x8_t __a) +{ + return (float16x4_t) __a; +} +#endif + +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +vreinterpret_f16_u16 (uint16x4_t __a) +{ + return (float16x4_t) __a; +} +#endif + +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +vreinterpret_f16_u32 (uint32x2_t __a) +{ + return (float16x4_t) __a; +} +#endif + __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) vreinterpret_f32_p8 (poly8x8_t __a) { @@ -11975,6 +12142,14 @@ vreinterpret_f32_p16 (poly16x4_t __a) return (float32x2_t)__builtin_neon_vreinterpretv2sfv4hi ((int16x4_t) __a); } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vreinterpret_f32_f16 (float16x4_t __a) +{ + return (float32x2_t) __a; +} +#endif + #ifdef __ARM_FEATURE_CRYPTO __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) vreinterpret_f32_p64 (poly64x1_t __a) @@ -12047,6 +12222,17 @@ vreinterpret_p64_p16 (poly16x4_t __a) } #endif + +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +#ifdef __ARM_FEATURE_CRYPTO +__extension__ static __inline poly64x1_t __attribute__ ((__always_inline__)) +vreinterpret_p64_f16 (float16x4_t __a) +{ + return (poly64x1_t) __a; +} +#endif +#endif + #ifdef __ARM_FEATURE_CRYPTO __extension__ static __inline poly64x1_t __attribute__ ((__always_inline__)) vreinterpret_p64_f32 (float32x2_t __a) @@ -12131,6 +12317,14 @@ vreinterpret_s64_p16 (poly16x4_t __a) return (int64x1_t)__builtin_neon_vreinterpretdiv4hi ((int16x4_t) __a); } +#if defined (__ARM_FP16_FORMAT_IEEE) || 
defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vreinterpret_s64_f16 (float16x4_t __a) +{ + return (int64x1_t) __a; +} +#endif + __extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) vreinterpret_s64_f32 (float32x2_t __a) { @@ -12199,6 +12393,14 @@ vreinterpret_u64_p16 (poly16x4_t __a) return (uint64x1_t)__builtin_neon_vreinterpretdiv4hi ((int16x4_t) __a); } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vreinterpret_u64_f16 (float16x4_t __a) +{ + return (uint64x1_t) __a; +} +#endif + __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) vreinterpret_u64_f32 (float32x2_t __a) { @@ -12267,6 +12469,14 @@ vreinterpret_s8_p16 (poly16x4_t __a) return (int8x8_t)__builtin_neon_vreinterpretv8qiv4hi ((int16x4_t) __a); } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vreinterpret_s8_f16 (float16x4_t __a) +{ + return (int8x8_t) __a; +} +#endif + __extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) vreinterpret_s8_f32 (float32x2_t __a) { @@ -12335,6 +12545,14 @@ vreinterpret_s16_p16 (poly16x4_t __a) return (int16x4_t)__builtin_neon_vreinterpretv4hiv4hi ((int16x4_t) __a); } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vreinterpret_s16_f16 (float16x4_t __a) +{ + return (int16x4_t) __a; +} +#endif + __extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) vreinterpret_s16_f32 (float32x2_t __a) { @@ -12403,6 +12621,14 @@ vreinterpret_s32_p16 (poly16x4_t __a) return (int32x2_t)__builtin_neon_vreinterpretv2siv4hi ((int16x4_t) __a); } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined 
(__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vreinterpret_s32_f16 (float16x4_t __a) +{ + return (int32x2_t) __a; +} +#endif + __extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) vreinterpret_s32_f32 (float32x2_t __a) { @@ -12471,6 +12697,14 @@ vreinterpret_u8_p16 (poly16x4_t __a) return (uint8x8_t)__builtin_neon_vreinterpretv8qiv4hi ((int16x4_t) __a); } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vreinterpret_u8_f16 (float16x4_t __a) +{ + return (uint8x8_t) __a; +} +#endif + __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) vreinterpret_u8_f32 (float32x2_t __a) { @@ -12539,6 +12773,14 @@ vreinterpret_u16_p16 (poly16x4_t __a) return (uint16x4_t)__builtin_neon_vreinterpretv4hiv4hi ((int16x4_t) __a); } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vreinterpret_u16_f16 (float16x4_t __a) +{ + return (uint16x4_t) __a; +} +#endif + __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) vreinterpret_u16_f32 (float32x2_t __a) { @@ -12607,6 +12849,14 @@ vreinterpret_u32_p16 (poly16x4_t __a) return (uint32x2_t)__builtin_neon_vreinterpretv2siv4hi ((int16x4_t) __a); } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vreinterpret_u32_f16 (float16x4_t __a) +{ + return (uint32x2_t) __a; +} +#endif + __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) vreinterpret_u32_f32 (float32x2_t __a) {