From patchwork Fri Jan 16 17:33:50 2015 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Alan Lawrence X-Patchwork-Id: 429950 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Received: from sourceware.org (server1.sourceware.org [209.132.180.131]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by ozlabs.org (Postfix) with ESMTPS id 92951140161 for ; Sat, 17 Jan 2015 04:34:03 +1100 (AEDT) DomainKey-Signature: a=rsa-sha1; c=nofws; d=gcc.gnu.org; h=list-id :list-unsubscribe:list-archive:list-post:list-help:sender :message-id:date:from:mime-version:to:subject:references :in-reply-to:content-type; q=dns; s=default; b=iqfkgd5QMe9wEciRM TTw1P8XREq5dz0WmlezqdnLiGFEDIpH5V265B1gxw9Mo88h8UaxFfHGLBaXi2fAf r5zw68vIj1LA3yKQ0W6kJluqus9FxuvObiUFngvYwhjN0Y9g+jMbpei/Ffffqt2w xupnL3389p8uNaBqbJdFsSwUE0= DKIM-Signature: v=1; a=rsa-sha1; c=relaxed; d=gcc.gnu.org; h=list-id :list-unsubscribe:list-archive:list-post:list-help:sender :message-id:date:from:mime-version:to:subject:references :in-reply-to:content-type; s=default; bh=e+J/8WCixp8Rb+nRiyPbVNE XnIA=; b=xVynIB21upvAeSRucpwbgQ+LACwF4fUCENcaiqV72liLgDaxY6q3orZ H/kRawfnJhQ85/0I724c99CUkz3yU93P2jcin0dt9BOo/+tiW9XaPrbhWETY8f+M XTEbkI+QEA4eC1YeZIlSMiWpu/fD1vGDO8ZojxPiWr8LHKZ3cfLE= Received: (qmail 11349 invoked by alias); 16 Jan 2015 17:33:56 -0000 Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Archive: List-Post: List-Help: Sender: gcc-patches-owner@gcc.gnu.org Delivered-To: mailing list gcc-patches@gcc.gnu.org Received: (qmail 11339 invoked by uid 89); 16 Jan 2015 17:33:55 -0000 Authentication-Results: sourceware.org; auth=none X-Virus-Found: No X-Spam-SWARE-Status: No, score=-1.9 required=5.0 tests=AWL, BAYES_00, SPF_PASS autolearn=ham version=3.3.2 X-HELO: service87.mimecast.com Received: from service87.mimecast.com (HELO service87.mimecast.com) (91.220.42.44) by sourceware.org (qpsmtpd/0.93/v0.84-503-g423c35a) with ESMTP; Fri, 16 Jan 2015 17:33:53 +0000 Received: from cam-owa1.Emea.Arm.com (fw-tnat.cambridge.arm.com [217.140.96.140]) by service87.mimecast.com; Fri, 16 Jan 2015 17:33:51 +0000 Received: from [10.1.209.51] ([10.1.255.212]) by cam-owa1.Emea.Arm.com with Microsoft SMTPSVC(6.0.3790.3959); Fri, 16 Jan 2015 17:33:50 +0000 Message-ID: <54B94B7E.8080806@arm.com> Date: Fri, 16 Jan 2015 17:33:50 +0000 From: Alan Lawrence User-Agent: Thunderbird 2.0.0.24 (X11/20101213) MIME-Version: 1.0 To: "gcc-patches@gcc.gnu.org" Subject: [PATCH 3/4][ARM Intrinsics]float16x8_t intrinsics: vgetq_lane, vsetq_lane, vdupq_n, vdupq_lane, vld1q_lane, vld1q_dup, vreinterpretq References: <54B948CB.3010906@arm.com> In-Reply-To: <54B948CB.3010906@arm.com> X-MC-Unique: 115011617335102901 X-IsSubscribed: yes Much like the first patch, this adds the equivalent ...q... intrinsics for float16x8_t, using GCC vector extensions. gcc/ChangeLog: * config/arm/arm_neon.h (vdupq_lane_f16, vld1q_lane_f16, vld1q_dup_f16, vreinterpretq_p8_f16, vreinterpretq_f16_p8, vreinterpretq_f16_p16, vreinterpretq_f16_f32, vreinterpretq_f16_p64, vreinterpretq_f16_p128, vreinterpretq_f16_s64, vreinterpretq_f16_u64, vreinterpretq_f16_s8, vreinterpretq_f16_s16, vreinterpretq_f16_s32, vreinterpretq_f16_u8, vreinterpretq_f16_u16, vreinterpretq_f16_u32, vreinterpretq_f32_f16, vreinterpretq_p64_f16, vreinterpretq_p128_f16, vreinterpretq_s64_f16, vreinterpretq_u64_f16, vreinterpretq_s8_f16, vreinterpretq_s16_f16, vreinterpretq_s32_f16, vreinterpretq_u8_f16, vreinterpretq_u16_f16, vreinterpretq_u32_f16): New. diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index 7259852a6a450c5f693b03cf6342f33190f266c6..d214fd673565c1cd020203c40c514762dfead520 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -5264,6 +5264,16 @@ vgetq_lane_s32 (int32x4_t __a, const int __b) return (int32_t)__builtin_neon_vget_lanev4si (__a, __b); } +#define vgetq_lane_f16(__v, __i) \ + __extension__ \ + ({ \ + float16x8_t __vec = (__v); \ + int __idx = (__i); \ + __builtin_arm_lane_check (8, __idx); \ + float16_t __res = __vec[__idx]; \ + __res; \ + }) + __extension__ static __inline float32_t __attribute__ ((__always_inline__)) vgetq_lane_f32 (float32x4_t __a, const int __b) { @@ -5407,6 +5417,17 @@ vsetq_lane_s32 (int32_t __a, int32x4_t __b, const int __c) return (int32x4_t)__builtin_neon_vset_lanev4si ((__builtin_neon_si) __a, __b, __c); } +#define vsetq_lane_f16(__e, __v, __i) \ + __extension__ \ + ({ \ + float16_t __elem = (__e); \ + float16x8_t __vec = (__v); \ + int __idx = (__i); \ + __builtin_arm_lane_check (4, __idx); \ + __vec[__idx] = __elem; \ + __vec; \ + }) + __extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) vsetq_lane_f32 (float32_t __a, float32x4_t __b, const int __c) { @@ -5642,6 +5663,13 @@ vdupq_n_s32 (int32_t __a) return (int32x4_t)__builtin_neon_vdup_nv4si ((__builtin_neon_si) __a); } +#define vdupq_n_f16(__e1) \ + __extension__ \ + ({ \ + float16_t __e = (__e1); \ + (float16x8_t) {__e, __e, __e, __e, __e, __e, __e, __e}; \ + }) + __extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) vdupq_n_f32 (float32_t __a) { @@ -5920,6 +5948,12 @@ vdupq_lane_s32 (int32x2_t __a, const int __b) return (int32x4_t)__builtin_neon_vdup_lanev4si (__a, __b); } +__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +vdupq_lane_f16 (float16x8_t __a, const int __b) +{ + return vdupq_n_f16 (vgetq_lane_f16 (__a, __b)); +} + __extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) vdupq_lane_f32 (float32x2_t __a, const int __b) { @@ -8903,6 +8937,12 @@ vld1q_lane_s32 (const int32_t * __a, int32x4_t __b, const int __c) return (int32x4_t)__builtin_neon_vld1_lanev4si ((const __builtin_neon_si *) __a, __b, __c); } +__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +vld1q_lane_f16 (const float16_t * __a, float16x8_t __b, const int __c) +{ + return vsetq_lane_f16 (*__a, __b, __c); +} + __extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) vld1q_lane_f32 (const float32_t * __a, float32x4_t __b, const int __c) { @@ -9057,6 +9097,12 @@ vld1q_dup_s32 (const int32_t * __a) return (int32x4_t)__builtin_neon_vld1_dupv4si ((const __builtin_neon_si *) __a); } +__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +vld1q_dup_f16 (const float16_t * __a) +{ + return vdupq_n_f16 (*__a); +} + __extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) vld1q_dup_f32 (const float32_t * __a) { @@ -12851,6 +12897,12 @@ vreinterpretq_p8_p16 (poly16x8_t __a) } __extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vreinterpretq_p8_f16 (float16x8_t __a) +{ + return (poly8x16_t) __a; +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) vreinterpretq_p8_f32 (float32x4_t __a) { return (poly8x16_t)__builtin_neon_vreinterpretv16qiv4sf (__a); @@ -12996,6 +13048,88 @@ vreinterpretq_p16_u32 (uint32x4_t __a) return (poly16x8_t)__builtin_neon_vreinterpretv8hiv4si ((int32x4_t) __a); } +__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_f16_p8 (poly8x16_t __a) +{ + return (float16x8_t) __a; +} + +__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_f16_p16 (poly16x8_t __a) +{ + return (float16x8_t) __a; +} + +__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_f16_f32 (float32x4_t __a) +{ + return (float16x8_t) __a; +} + +#ifdef __ARM_FEATURE_CRYPTO +__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_f16_p64 (poly64x2_t __a) +{ + return (float16x8_t) __a; +} + +#endif +#ifdef __ARM_FEATURE_CRYPTO +__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_f16_p128 (poly128_t __a) +{ + return (float16x8_t) __a; +} + +#endif +__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_f16_s64 (int64x2_t __a) +{ + return (float16x8_t) __a; +} + +__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_f16_u64 (uint64x2_t __a) +{ + return (float16x8_t) __a; +} + +__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_f16_s8 (int8x16_t __a) +{ + return (float16x8_t) __a; +} + +__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_f16_s16 (int16x8_t __a) +{ + return (float16x8_t) __a; +} + +__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_f16_s32 (int32x4_t __a) +{ + return (float16x8_t) __a; +} + +__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_f16_u8 (uint8x16_t __a) +{ + return (float16x8_t) __a; +} + +__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_f16_u16 (uint16x8_t __a) +{ + return (float16x8_t) __a; +} + +__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_f16_u32 (uint32x4_t __a) +{ + return (float16x8_t) __a; +} + __extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) vreinterpretq_f32_p8 (poly8x16_t __a) { @@ -13008,6 +13142,12 @@ vreinterpretq_f32_p16 (poly16x8_t __a) return (float32x4_t)__builtin_neon_vreinterpretv4sfv8hi ((int16x8_t) __a); } +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vreinterpretq_f32_f16 (float16x8_t __a) +{ + return (float32x4_t) __a; +} + #ifdef __ARM_FEATURE_CRYPTO __extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) vreinterpretq_f32_p64 (poly64x2_t __a) @@ -13090,6 +13230,14 @@ vreinterpretq_p64_p16 (poly16x8_t __a) #endif #ifdef __ARM_FEATURE_CRYPTO __extension__ static __inline poly64x2_t __attribute__ ((__always_inline__)) +vreinterpretq_p64_f16 (float16x8_t __a) +{ + return (poly64x2_t) __a; +} + +#endif +#ifdef __ARM_FEATURE_CRYPTO +__extension__ static __inline poly64x2_t __attribute__ ((__always_inline__)) vreinterpretq_p64_f32 (float32x4_t __a) { return (poly64x2_t)__builtin_neon_vreinterpretv2div4sf (__a); @@ -13186,6 +13334,14 @@ vreinterpretq_p128_p16 (poly16x8_t __a) #endif #ifdef __ARM_FEATURE_CRYPTO __extension__ static __inline poly128_t __attribute__ ((__always_inline__)) +vreinterpretq_p128_f16 (float16x8_t __a) +{ + return (poly128_t) __a; +} + +#endif +#ifdef __ARM_FEATURE_CRYPTO +__extension__ static __inline poly128_t __attribute__ ((__always_inline__)) vreinterpretq_p128_f32 (float32x4_t __a) { return (poly128_t)__builtin_neon_vreinterprettiv4sf (__a); @@ -13277,6 +13433,12 @@ vreinterpretq_s64_p16 (poly16x8_t __a) } __extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vreinterpretq_s64_f16 (float16x8_t __a) +{ + return (int64x2_t) __a; +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) vreinterpretq_s64_f32 (float32x4_t __a) { return (int64x2_t)__builtin_neon_vreinterpretv2div4sf (__a); @@ -13353,6 +13515,12 @@ vreinterpretq_u64_p16 (poly16x8_t __a) } __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vreinterpretq_u64_f16 (float16x8_t __a) +{ + return (uint64x2_t) __a; +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) vreinterpretq_u64_f32 (float32x4_t __a) { return (uint64x2_t)__builtin_neon_vreinterpretv2div4sf (__a); @@ -13429,6 +13597,12 @@ vreinterpretq_s8_p16 (poly16x8_t __a) } __extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vreinterpretq_s8_f16 (float16x8_t __a) +{ + return (int8x16_t) __a; +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) vreinterpretq_s8_f32 (float32x4_t __a) { return (int8x16_t)__builtin_neon_vreinterpretv16qiv4sf (__a); @@ -13505,6 +13679,12 @@ vreinterpretq_s16_p16 (poly16x8_t __a) } __extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_s16_f16 (float16x8_t __a) +{ + return (int16x8_t) __a; +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) vreinterpretq_s16_f32 (float32x4_t __a) { return (int16x8_t)__builtin_neon_vreinterpretv8hiv4sf (__a); @@ -13581,6 +13761,12 @@ vreinterpretq_s32_p16 (poly16x8_t __a) } __extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vreinterpretq_s32_f16 (float16x8_t __a) +{ + return (int32x4_t)__builtin_neon_vreinterpretv4siv8hi ((int16x8_t) __a); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) vreinterpretq_s32_f32 (float32x4_t __a) { return (int32x4_t)__builtin_neon_vreinterpretv4siv4sf (__a); @@ -13657,6 +13843,12 @@ vreinterpretq_u8_p16 (poly16x8_t __a) } __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vreinterpretq_u8_f16 (float16x8_t __a) +{ + return (uint8x16_t) __a; +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) vreinterpretq_u8_f32 (float32x4_t __a) { return (uint8x16_t)__builtin_neon_vreinterpretv16qiv4sf (__a); @@ -13733,6 +13925,12 @@ vreinterpretq_u16_p16 (poly16x8_t __a) } __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vreinterpretq_u16_f16 (float16x8_t __a) +{ + return (uint16x8_t) __a; +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) vreinterpretq_u16_f32 (float32x4_t __a) { return (uint16x8_t)__builtin_neon_vreinterpretv8hiv4sf (__a); @@ -13809,6 +14007,12 @@ vreinterpretq_u32_p16 (poly16x8_t __a) } __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vreinterpretq_u32_f16 (float16x8_t __a) +{ + return (uint32x4_t) __a; +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) vreinterpretq_u32_f32 (float32x4_t __a) { return (uint32x4_t)__builtin_neon_vreinterpretv4siv4sf (__a);