From patchwork Fri Nov 14 10:46:33 2014 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Alan Lawrence X-Patchwork-Id: 410764 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Received: from sourceware.org (server1.sourceware.org [209.132.180.131]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by ozlabs.org (Postfix) with ESMTPS id 8DD641400D2 for ; Fri, 14 Nov 2014 21:47:10 +1100 (AEDT) DomainKey-Signature: a=rsa-sha1; c=nofws; d=gcc.gnu.org; h=list-id :list-unsubscribe:list-archive:list-post:list-help:sender :message-id:date:from:mime-version:to:subject:in-reply-to :content-type; q=dns; s=default; b=PRQ5ggo9keoZ9w+lBwHlufikR0KLL BiEJHmEtj5AqVzv+29WvNxmsOUarkUefAfHs+YB7UrZHrAs/WH7ZwXnV1mz4dYy0 DaniB8OWNxUJhdZxd8Z6N04gXl1joyXh/ZPcpIbQUYMynVskoZVNONiMkRf9VFIw 9cas9TtqzRFRuo= DKIM-Signature: v=1; a=rsa-sha1; c=relaxed; d=gcc.gnu.org; h=list-id :list-unsubscribe:list-archive:list-post:list-help:sender :message-id:date:from:mime-version:to:subject:in-reply-to :content-type; s=default; bh=NLGZFjdIdYO0NWkErUm5DM9mAu0=; b=sq3 zoSEgXtoDNhXREdZyoePZ+y2nucSRoSgQDmJnG7NPWOwh8B30tEbeyuHyBhEQ35P ifrOPuIBYbQqSwY/rf74z8QlWhas4BVV65Yvq7WOmIUBAD/3hAO0ZdqtTZKRrcK3 Gmogpq6laTOwS6BjVOfXjBPimpLoCiMepEAnExnA= Received: (qmail 17659 invoked by alias); 14 Nov 2014 10:46:39 -0000 Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Archive: List-Post: List-Help: Sender: gcc-patches-owner@gcc.gnu.org Delivered-To: mailing list gcc-patches@gcc.gnu.org Received: (qmail 17599 invoked by uid 89); 14 Nov 2014 10:46:38 -0000 Authentication-Results: sourceware.org; auth=none X-Virus-Found: No X-Spam-SWARE-Status: No, score=-1.9 required=5.0 tests=BAYES_00, SPF_PASS autolearn=ham version=3.3.2 X-HELO: service87.mimecast.com Received: from service87.mimecast.com (HELO service87.mimecast.com) (91.220.42.44) by sourceware.org (qpsmtpd/0.93/v0.84-503-g423c35a) with ESMTP; Fri, 14 Nov 2014 10:46:36 +0000 Received: from cam-owa1.Emea.Arm.com (fw-tnat.cambridge.arm.com [217.140.96.21]) by service87.mimecast.com; Fri, 14 Nov 2014 10:46:34 +0000 Received: from [10.1.209.51] ([10.1.255.212]) by cam-owa1.Emea.Arm.com with Microsoft SMTPSVC(6.0.3790.3959); Fri, 14 Nov 2014 10:46:34 +0000 Message-ID: <5465DD89.5080603@arm.com> Date: Fri, 14 Nov 2014 10:46:33 +0000 From: Alan Lawrence User-Agent: Thunderbird 2.0.0.24 (X11/20101213) MIME-Version: 1.0 To: "gcc-patches@gcc.gnu.org" Subject: [PATCH 3/3][AArch64]Replace temporary assembler for vld1_dup In-Reply-To: <5465DD0E.1090207@arm.com> X-MC-Unique: 114111410463405901 X-IsSubscribed: yes This patch replaces the inline asm for vld1_dup intrinsics with a vdup_n_ and a load from the pointer. The existing *aarch64_simd_ld1r insn, combiner, etc., are quite capable of generating the expected single ld1r instruction from this. (I've verified by inspecting assembler output.) gcc/ChangeLog: * config/aarch64/arm_neon.h (vld1_dup_f32, vld1_dup_f64, vld1_dup_p8, vld1_dup_p16, vld1_dup_s8, vld1_dup_s16, vld1_dup_s32, vld1_dup_s64, vld1_dup_u8, vld1_dup_u16, vld1_dup_u32, vld1_dup_u64, vld1q_dup_f32, vld1q_dup_f64, vld1q_dup_p8, vld1q_dup_p16, vld1q_dup_s8, vld1q_dup_s16, vld1q_dup_s32, vld1q_dup_s64, vld1q_dup_u8, vld1q_dup_u16, vld1q_dup_u32, vld1q_dup_u64): Replace inline asm with vdup_n_ and pointer dereference. diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index f036f7c0ba2733a822661027b815e7c3654db1bc..61a3bd3ab59c427522087f10ddd5679d6d46019d 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -6144,270 +6144,6 @@ vhsubq_u32 (uint32x4_t a, uint32x4_t b) } __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vld1_dup_f32 (const float32_t * a) -{ - float32x2_t result; - __asm__ ("ld1r {%0.2s}, %1" - : "=w"(result) - : "Utv"(*a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) -vld1_dup_f64 (const float64_t * a) -{ - float64x1_t result; - __asm__ ("ld1r {%0.1d}, %1" - : "=w"(result) - : "Utv"(*a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) -vld1_dup_p8 (const poly8_t * a) -{ - poly8x8_t result; - __asm__ ("ld1r {%0.8b}, %1" - : "=w"(result) - : "Utv"(*a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) -vld1_dup_p16 (const poly16_t * a) -{ - poly16x4_t result; - __asm__ ("ld1r {%0.4h}, %1" - : "=w"(result) - : "Utv"(*a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vld1_dup_s8 (const int8_t * a) -{ - int8x8_t result; - __asm__ ("ld1r {%0.8b}, %1" - : "=w"(result) - : "Utv"(*a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vld1_dup_s16 (const int16_t * a) -{ - int16x4_t result; - __asm__ ("ld1r {%0.4h}, %1" - : "=w"(result) - : "Utv"(*a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vld1_dup_s32 (const int32_t * a) -{ - int32x2_t result; - __asm__ ("ld1r {%0.2s}, %1" - : "=w"(result) - : "Utv"(*a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) -vld1_dup_s64 (const int64_t * a) -{ - int64x1_t result; - __asm__ ("ld1r {%0.1d}, %1" - : "=w"(result) - : "Utv"(*a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vld1_dup_u8 (const uint8_t * a) -{ - uint8x8_t result; - __asm__ ("ld1r {%0.8b}, %1" - : "=w"(result) - : "Utv"(*a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vld1_dup_u16 (const uint16_t * a) -{ - uint16x4_t result; - __asm__ ("ld1r {%0.4h}, %1" - : "=w"(result) - : "Utv"(*a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vld1_dup_u32 (const uint32_t * a) -{ - uint32x2_t result; - __asm__ ("ld1r {%0.2s}, %1" - : "=w"(result) - : "Utv"(*a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) -vld1_dup_u64 (const uint64_t * a) -{ - uint64x1_t result; - __asm__ ("ld1r {%0.1d}, %1" - : "=w"(result) - : "Utv"(*a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vld1q_dup_f32 (const float32_t * a) -{ - float32x4_t result; - __asm__ ("ld1r {%0.4s}, %1" - : "=w"(result) - : "Utv"(*a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vld1q_dup_f64 (const float64_t * a) -{ - float64x2_t result; - __asm__ ("ld1r {%0.2d}, %1" - : "=w"(result) - : "Utv"(*a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) -vld1q_dup_p8 (const poly8_t * a) -{ - poly8x16_t result; - __asm__ ("ld1r {%0.16b}, %1" - : "=w"(result) - : "Utv"(*a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) -vld1q_dup_p16 (const poly16_t * a) -{ - poly16x8_t result; - __asm__ ("ld1r {%0.8h}, %1" - : "=w"(result) - : "Utv"(*a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vld1q_dup_s8 (const int8_t * a) -{ - int8x16_t result; - __asm__ ("ld1r {%0.16b}, %1" - : "=w"(result) - : "Utv"(*a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vld1q_dup_s16 (const int16_t * a) -{ - int16x8_t result; - __asm__ ("ld1r {%0.8h}, %1" - : "=w"(result) - : "Utv"(*a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vld1q_dup_s32 (const int32_t * a) -{ - int32x4_t result; - __asm__ ("ld1r {%0.4s}, %1" - : "=w"(result) - : "Utv"(*a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) -vld1q_dup_s64 (const int64_t * a) -{ - int64x2_t result; - __asm__ ("ld1r {%0.2d}, %1" - : "=w"(result) - : "Utv"(*a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vld1q_dup_u8 (const uint8_t * a) -{ - uint8x16_t result; - __asm__ ("ld1r {%0.16b}, %1" - : "=w"(result) - : "Utv"(*a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vld1q_dup_u16 (const uint16_t * a) -{ - uint16x8_t result; - __asm__ ("ld1r {%0.8h}, %1" - : "=w"(result) - : "Utv"(*a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vld1q_dup_u32 (const uint32_t * a) -{ - uint32x4_t result; - __asm__ ("ld1r {%0.4s}, %1" - : "=w"(result) - : "Utv"(*a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) -vld1q_dup_u64 (const uint64_t * a) -{ - uint64x2_t result; - __asm__ ("ld1r {%0.2d}, %1" - : "=w"(result) - : "Utv"(*a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) vmla_n_f32 (float32x2_t a, float32x2_t b, float32_t c) { float32x2_t result; @@ -16142,6 +15878,154 @@ vld1q_u64 (const uint64_t *a) __builtin_aarch64_ld1v2di ((const __builtin_aarch64_simd_di *) a); } +/* vld1_dup */ + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vld1_dup_f32 (const float32_t* __a) +{ + return vdup_n_f32 (*__a); +} + +__extension__ static __inline float64x1_t __attribute__ ((__always_inline__)) +vld1_dup_f64 (const float64_t* __a) +{ + return vdup_n_f64 (*__a); +} + +__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__)) +vld1_dup_p8 (const poly8_t* __a) +{ + return vdup_n_p8 (*__a); +} + +__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__)) +vld1_dup_p16 (const poly16_t* __a) +{ + return vdup_n_p16 (*__a); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vld1_dup_s8 (const int8_t* __a) +{ + return vdup_n_s8 (*__a); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vld1_dup_s16 (const int16_t* __a) +{ + return vdup_n_s16 (*__a); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vld1_dup_s32 (const int32_t* __a) +{ + return vdup_n_s32 (*__a); +} + +__extension__ static __inline int64x1_t __attribute__ ((__always_inline__)) +vld1_dup_s64 (const int64_t* __a) +{ + return vdup_n_s64 (*__a); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vld1_dup_u8 (const uint8_t* __a) +{ + return vdup_n_u8 (*__a); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vld1_dup_u16 (const uint16_t* __a) +{ + return vdup_n_u16 (*__a); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vld1_dup_u32 (const uint32_t* __a) +{ + return vdup_n_u32 (*__a); +} + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vld1_dup_u64 (const uint64_t* __a) +{ + return vdup_n_u64 (*__a); +} + +/* vld1q_dup */ + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vld1q_dup_f32 (const float32_t* __a) +{ + return vdupq_n_f32 (*__a); +} + +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vld1q_dup_f64 (const float64_t* __a) +{ + return vdupq_n_f64 (*__a); +} + +__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__)) +vld1q_dup_p8 (const poly8_t* __a) +{ + return vdupq_n_p8 (*__a); +} + +__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__)) +vld1q_dup_p16 (const poly16_t* __a) +{ + return vdupq_n_p16 (*__a); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vld1q_dup_s8 (const int8_t* __a) +{ + return vdupq_n_s8 (*__a); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vld1q_dup_s16 (const int16_t* __a) +{ + return vdupq_n_s16 (*__a); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vld1q_dup_s32 (const int32_t* __a) +{ + return vdupq_n_s32 (*__a); +} + +__extension__ static __inline int64x2_t __attribute__ ((__always_inline__)) +vld1q_dup_s64 (const int64_t* __a) +{ + return vdupq_n_s64 (*__a); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vld1q_dup_u8 (const uint8_t* __a) +{ + return vdupq_n_u8 (*__a); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vld1q_dup_u16 (const uint16_t* __a) +{ + return vdupq_n_u16 (*__a); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vld1q_dup_u32 (const uint32_t* __a) +{ + return vdupq_n_u32 (*__a); +} + +__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__)) +vld1q_dup_u64 (const uint64_t* __a) +{ + return vdupq_n_u64 (*__a); +} + /* vld1_lane */ __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))