diff mbox series

[v2] AArch64: Fix copysign patterns

Message ID PAWPR08MB8982A492455DBF862BFE973A83622@PAWPR08MB8982.eurprd08.prod.outlook.com
State New
Headers show
Series [v2] AArch64: Fix copysign patterns | expand

Commit Message

Wilco Dijkstra Sept. 18, 2024, 7:17 p.m. UTC
v2: Add more testcase fixes.

The current copysign pattern has a mismatch in the predicates and constraints -
operand[2] is a register_operand but also has an alternative X which allows any
operand.  Since it is a floating point operation, having an integer alternative
makes no sense.  Change the expander to always use the vector variant of copysign
which results in better code.  Add an SVE bitmask move immediate alternative to
the aarch64_simd_mov patterns so we emit a single move when SVE is available.

Passes bootstrap and regress, OK for commit?

gcc:
        * config/aarch64/aarch64.md (copysign<GPF:mode>3): Defer to AdvSIMD copysign.
        (copysign<GPF:mode>3_insn): Remove pattern.
        * config/aarch64/aarch64-simd.md (aarch64_simd_mov<VDMOV:mode>): Add SVE movimm
        alternative.
        (aarch64_simd_mov<VQMOV:mode>): Likewise.  Remove redundant V2DI check.
        (copysign<mode>3): Make global.
        (ior<mode>3<vczle><vczbe>): Move Neon immediate alternative before the SVE one.	

testsuite:
        * gcc.target/aarch64/copysign_3.c: New test.
        * gcc.target/aarch64/copysign_4.c: New test.
        * gcc.target/aarch64/fneg-abs_2.c: Allow .2s and .4s.
        * gcc.target/aarch64/sve/fneg-abs_1.c: Fixup test.
        * gcc.target/aarch64/sve/fneg-abs_2.c: Likewise.

---

Comments

Saurabh Jha Sept. 18, 2024, 7:53 p.m. UTC | #1
Hi Wilco,

Thanks for the patch. This mostly looks good. Just added a couple of 
clarifications.

On 9/18/2024 8:17 PM, Wilco Dijkstra wrote:
> v2: Add more testcase fixes.
> 
> The current copysign pattern has a mismatch in the predicates and constraints -
> operand[2] is a register_operand but also has an alternative X which allows any
> operand.  Since it is a floating point operation, having an integer alternative
> makes no sense.  Change the expander to always use the vector variant of copysign
> which results in better code.  Add a SVE bitmask move immediate alternative to
> the aarch64_simd_mov patterns so we emit a single move when SVE is available.
> 
> Passes bootstrap and regress, OK for commit?
> 
> gcc:
>          * config/aarch64/aarch64.md (copysign<GPF:mode>3): Defer to AdvSIMD copysign.

Should the things after "(copysign...)" be on a newline? I have mostly 
seen gcc ChangeLogs have the file name and individual elements separated 
by newlines.

>          (copysign<GPF:mode>3_insn): Remove pattern.
>          * config/aarch64/aarch64-simd.md (aarch64_simd_mov<VDMOV:mode>): Add SVE movimm
>          alternative.

Similar comment about file name and the instruction pattern being on 
separate lines.

>          (aarch64_simd_mov<VQMOV:mode>): Likewise.  Remove redundant V2DI check.
>          (copysign<mode>3): Make global.
>          (ior<mode>3<vczle><vczbe>): Move Neon immediate alternative before the SVE one.	
> 
> testsuite:
>          * gcc.target/aarch64/copysign_3.c: New test.
>          * gcc.target/aarch64/copysign_4.c: New test.
>          * gcc.target/aarch64/fneg-abs_2.c: Allow .2s and .4s.
>          * gcc.target/aarch64/sve/fneg-abs_1.c: Fixup test.
>          * gcc.target/aarch64/sve/fneg-abs_2.c: Likewise.
> 
> ---
> 
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index e70d59380ed295577721f15277c28829d42a0189..3077e920ce623c92d21193124747ff7ad010d006 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -161,6 +161,7 @@ (define_insn_and_split "*aarch64_simd_mov<VDMOV:mode>"
>        [?w, r ; f_mcr              , *        , *] fmov\t%d0, %1
>        [?r, r ; mov_reg            , *        , *] mov\t%0, %1
>        [w , Dn; neon_move<q>       , simd     , *] << aarch64_output_simd_mov_immediate (operands[1], 64);
> +     [w , vsl; *                 , sve      , *] mov\t%Z0.<Vetype>, %1
>        [w , Dz; f_mcr              , *        , *] fmov\t%d0, xzr
>        [w , Dx; neon_move          , simd     , 8] #
>     }
> @@ -190,6 +191,7 @@ (define_insn_and_split "*aarch64_simd_mov<VQMOV:mode>"
>        [?w , r ; multiple           , *   , 8] #
>        [?r , r ; multiple           , *   , 8] #
>        [w  , Dn; neon_move<q>       , simd, 4] << aarch64_output_simd_mov_immediate (operands[1], 128);
> +     [w  , vsl; *                 , sve,  4] mov\t%Z0.<Vetype>, %1
>        [w  , Dz; fmov               , *   , 4] fmov\t%d0, xzr
>        [w  , Dx; neon_move          , simd, 8] #
>     }
> @@ -208,7 +210,6 @@ (define_insn_and_split "*aarch64_simd_mov<VQMOV:mode>"
>       else
>         {
>   	if (FP_REGNUM_P (REGNO (operands[0]))
> -	    && <MODE>mode == V2DImode
>   	    && aarch64_maybe_generate_simd_constant (operands[0], operands[1],
>   						     <MODE>mode))
>   	  ;
> @@ -648,7 +649,7 @@ (define_insn "aarch64_<DOTPROD_I8MM:sur>dot_lane<VB:isquadop><VS:vsi2qi><vczle><
>     [(set_attr "type" "neon_dot<VS:q>")]
>   )
>   
> -(define_expand "copysign<mode>3"
> +(define_expand "@copysign<mode>3"
>     [(match_operand:VHSDF 0 "register_operand")
>      (match_operand:VHSDF 1 "register_operand")
>      (match_operand:VHSDF 2 "nonmemory_operand")]
> @@ -1138,10 +1139,8 @@ (define_insn "ior<mode>3<vczle><vczbe>"
>     "TARGET_SIMD"
>     {@ [ cons: =0 , 1 , 2; attrs: arch ]
>        [ w        , w , w  ; simd      ] orr\t%0.<Vbtype>, %1.<Vbtype>, %2.<Vbtype>
> -     [ w        , 0 , vsl; sve       ] orr\t%Z0.<Vetype>, %Z0.<Vetype>, #%2
> -     [ w        , 0 , Do ; simd      ] \
> -       << aarch64_output_simd_mov_immediate (operands[2], <bitsize>, \
> -					     AARCH64_CHECK_ORR);
> +     [ w        , 0 , Do ; simd      ] << aarch64_output_simd_mov_immediate (operands[2], <bitsize>, AARCH64_CHECK_ORR);
> +     [ w        , 0 , vsl; sve       ] orr\t%Z0.<Vetype>, %Z0.<Vetype>, %2
>     }
>     [(set_attr "type" "neon_logic<q>")]
>   )
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index c54b29cd64b9e0dc6c6d12735049386ccedc5408..e9b148e59abf81cee53cb0dd846af9a62bbad294 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -7218,20 +7218,11 @@ (define_expand "lrint<GPF:mode><GPI:mode>2"
>   }
>   )
>   
> -;; For copysign (x, y), we want to generate:
> +;; For copysignf (x, y), we want to generate:
>   ;;
> -;;   LDR d2, #(1 << 63)
> -;;   BSL v2.8b, [y], [x]
> +;;	movi    v31.4s, 0x80, lsl 24
> +;;	bit     v0.16b, v1.16b, v31.16b
>   ;;
> -;; or another, equivalent, sequence using one of BSL/BIT/BIF.  Because
> -;; we expect these operations to nearly always operate on
> -;; floating-point values, we do not want the operation to be
> -;; simplified into a bit-field insert operation that operates on the
> -;; integer side, since typically that would involve three inter-bank
> -;; register copies.  As we do not expect copysign to be followed by
> -;; other logical operations on the result, it seems preferable to keep
> -;; this as an unspec operation, rather than exposing the underlying
> -;; logic to the compiler.
>   
>   (define_expand "copysign<GPF:mode>3"
>     [(match_operand:GPF 0 "register_operand")
> @@ -7239,57 +7230,22 @@ (define_expand "copysign<GPF:mode>3"
>      (match_operand:GPF 2 "nonmemory_operand")]
>     "TARGET_SIMD"
>   {
> -  rtx signbit_const = GEN_INT (HOST_WIDE_INT_M1U
> -			       << (GET_MODE_BITSIZE (<MODE>mode) - 1));
> -  /* copysign (x, -1) should instead be expanded as orr with the sign
> -     bit.  */
> -  rtx op2_elt = unwrap_const_vec_duplicate (operands[2]);
> -  if (GET_CODE (op2_elt) == CONST_DOUBLE
> -      && real_isneg (CONST_DOUBLE_REAL_VALUE (op2_elt)))
> -    {
> -      rtx v_bitmask
> -	= force_reg (V2<V_INT_EQUIV>mode,
> -		     gen_const_vec_duplicate (V2<V_INT_EQUIV>mode,
> -					      signbit_const));
> -
> -      emit_insn (gen_iorv2<v_int_equiv>3 (
> -	lowpart_subreg (V2<V_INT_EQUIV>mode, operands[0], <MODE>mode),
> -	lowpart_subreg (V2<V_INT_EQUIV>mode, operands[1], <MODE>mode),
> -	v_bitmask));
> -      DONE;
> -    }
> -
> -  machine_mode int_mode = <V_INT_EQUIV>mode;
> -  rtx bitmask = gen_reg_rtx (int_mode);
> -  emit_move_insn (bitmask, signbit_const);
> -  operands[2] = force_reg (<MODE>mode, operands[2]);
> -  emit_insn (gen_copysign<mode>3_insn (operands[0], operands[1], operands[2],
> -				       bitmask));
> +  rtx tmp = gen_reg_rtx (<VCONQ>mode);
> +  rtx op1 = lowpart_subreg (<VCONQ>mode, operands[1], <MODE>mode);
> +  rtx op2 = REG_P (operands[2])
> +	      ? lowpart_subreg (<VCONQ>mode, operands[2], <MODE>mode)
> +	      : gen_const_vec_duplicate (<VCONQ>mode, operands[2]);
> +  emit_insn (gen_copysign3 (<VCONQ>mode, tmp, op1, op2));
> +  emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, tmp, <VCONQ>mode));
>     DONE;
>   }
>   )
>   
> -(define_insn "copysign<GPF:mode>3_insn"
> -  [(set (match_operand:GPF 0 "register_operand")
> -	(unspec:GPF [(match_operand:GPF 1 "register_operand")
> -		     (match_operand:GPF 2 "register_operand")
> -		     (match_operand:<V_INT_EQUIV> 3 "register_operand")]
> -	 UNSPEC_COPYSIGN))]
> -  "TARGET_SIMD"
> -  {@ [ cons: =0 , 1 , 2 , 3 ; attrs: type  ]
> -     [ w        , w , w , 0 ; neon_bsl<q>  ] bsl\t%0.<Vbtype>, %2.<Vbtype>, %1.<Vbtype>
> -     [ w        , 0 , w , w ; neon_bsl<q>  ] bit\t%0.<Vbtype>, %2.<Vbtype>, %3.<Vbtype>
> -     [ w        , w , 0 , w ; neon_bsl<q>  ] bif\t%0.<Vbtype>, %1.<Vbtype>, %3.<Vbtype>
> -     [ r        , r , 0 , X ; bfm          ] bfxil\t%<w1>0, %<w1>1, #0, <sizem1>
> -  }
> -)
> -
> -
> -;; For xorsign (x, y), we want to generate:
> +;; For xorsignf (x, y), we want to generate:
>   ;;
> -;; LDR   d2, #1<<63
> -;; AND   v3.8B, v1.8B, v2.8B
> -;; EOR   v0.8B, v0.8B, v3.8B
> +;;	movi    v31.4s, 0x80, lsl 24
> +;;	and     v31.16b, v31.16b, v1.16b
> +;;	eor     v0.16b, v31.16b, v0.16b
>   ;;
>   
>   (define_expand "@xorsign<mode>3"
> diff --git a/gcc/testsuite/gcc.target/aarch64/copysign_3.c b/gcc/testsuite/gcc.target/aarch64/copysign_3.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..be48682420f1ff84e80af9efd9d11f64bd6e8052
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/copysign_3.c
> @@ -0,0 +1,16 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +float f1 (float x, float y)
> +{
> +  return __builtin_copysignf (1.0, x) * __builtin_copysignf (1.0, y);
> +}
> +
> +double f2 (double x, double y)
> +{
> +  return __builtin_copysign (1.0, x) * __builtin_copysign (1.0, y);
> +}
> +
> +/* { dg-final { scan-assembler-times "movi\t" 2 } } */
> +/* { dg-final { scan-assembler-not "copysign\tw" } } */
> +/* { dg-final { scan-assembler-not "dup\tw" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/copysign_4.c b/gcc/testsuite/gcc.target/aarch64/copysign_4.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..f3cec2fc9c21a4eaa3b6556479aeb15c04358a1c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/copysign_4.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -march=armv8-a+sve" } */
> +
> +float f1 (float x, float y)
> +{
> +  return __builtin_copysignf (1.0, x) * __builtin_copysignf (1.0, y);
> +}
> +
> +double f2 (double x, double y)
> +{
> +  return __builtin_copysign (1.0, x) * __builtin_copysign (1.0, y);
> +}
> +
> +/* { dg-final { scan-assembler-times "movi\t" 1 } } */
> +/* { dg-final { scan-assembler-times "mov\tz" 1 } } */
> +/* { dg-final { scan-assembler-not "copysign\tw" } } */
> +/* { dg-final { scan-assembler-not "dup\tw" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c b/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
> index 18d10ee834d5d9b4361d890447060e78f09d3a73..1544bc5f1a736e95dd8bd2c608405aebb54ded1f 100644
> --- a/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
> +++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
> @@ -9,7 +9,7 @@
>   
>   /*
>   ** f1:
> -**	orr	v[0-9]+.2s, #?128, lsl #?24
> +**	orr	v[0-9]+.[24]s, #?128, lsl #?24
>   **	ret
>   */
>   float32_t f1 (float32_t a)
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c
> index a8b27199ff83d0eebadfc7dcf03f94e1229d76b8..1ebdc6aaeb102da25ad561b24641e72a652175fa 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c
> @@ -6,7 +6,7 @@
>   
>   /*
>   ** t1:
> -**	orr	z[0-9]+.s, z[0-9]+.s, #-2147483648
> +**	orr	v0.2s, #?128, lsl #?24
>   **	ret
>   */
>   float32x2_t t1 (float32x2_t a)
> @@ -16,7 +16,7 @@ float32x2_t t1 (float32x2_t a)
>   
>   /*
>   ** t2:
> -**	orr	z[0-9]+.s, z[0-9]+.s, #-2147483648
> +**	orr	v0.4s, #?128, lsl #?24
>   **	ret
>   */
>   float32x4_t t2 (float32x4_t a)
> @@ -26,7 +26,7 @@ float32x4_t t2 (float32x4_t a)
>   
>   /*
>   ** t3:
> -**	orr	z[0-9]+.d, z[0-9]+.d, #-9223372036854775808
> +**	orr	z[0-9]+.d, z[0-9]+.d, -9223372036854775808
>   **	ret
>   */
>   float64x2_t t3 (float64x2_t a)
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
> index 19a7695e605bc8aced486a9c450d1cdc6be4691a..122152c0ebe4ea6840e418e75a2cadbfc9b0aba4 100644
> --- a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
> @@ -7,7 +7,7 @@
>   
>   /*
>   ** f1:
> -**	orr	z0.s, z0.s, #-2147483648
> +**	orr	v0.4s, #?128, lsl #?24
>   **	ret
>   */
>   float32_t f1 (float32_t a)
> @@ -17,7 +17,7 @@ float32_t f1 (float32_t a)
>   
>   /*
>   ** f2:
> -**	orr	z0.d, z0.d, #-9223372036854775808
> +**	orr	z0.d, z0.d, -9223372036854775808
>   **	ret
>   */
>   float64_t f2 (float64_t a)
>
diff mbox series

Patch

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index e70d59380ed295577721f15277c28829d42a0189..3077e920ce623c92d21193124747ff7ad010d006 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -161,6 +161,7 @@  (define_insn_and_split "*aarch64_simd_mov<VDMOV:mode>"
      [?w, r ; f_mcr              , *        , *] fmov\t%d0, %1
      [?r, r ; mov_reg            , *        , *] mov\t%0, %1
      [w , Dn; neon_move<q>       , simd     , *] << aarch64_output_simd_mov_immediate (operands[1], 64);
+     [w , vsl; *                 , sve      , *] mov\t%Z0.<Vetype>, %1
      [w , Dz; f_mcr              , *        , *] fmov\t%d0, xzr
      [w , Dx; neon_move          , simd     , 8] #
   }
@@ -190,6 +191,7 @@  (define_insn_and_split "*aarch64_simd_mov<VQMOV:mode>"
      [?w , r ; multiple           , *   , 8] #
      [?r , r ; multiple           , *   , 8] #
      [w  , Dn; neon_move<q>       , simd, 4] << aarch64_output_simd_mov_immediate (operands[1], 128);
+     [w  , vsl; *                 , sve,  4] mov\t%Z0.<Vetype>, %1
      [w  , Dz; fmov               , *   , 4] fmov\t%d0, xzr
      [w  , Dx; neon_move          , simd, 8] #
   }
@@ -208,7 +210,6 @@  (define_insn_and_split "*aarch64_simd_mov<VQMOV:mode>"
     else
       {
 	if (FP_REGNUM_P (REGNO (operands[0]))
-	    && <MODE>mode == V2DImode
 	    && aarch64_maybe_generate_simd_constant (operands[0], operands[1],
 						     <MODE>mode))
 	  ;
@@ -648,7 +649,7 @@  (define_insn "aarch64_<DOTPROD_I8MM:sur>dot_lane<VB:isquadop><VS:vsi2qi><vczle><
   [(set_attr "type" "neon_dot<VS:q>")]
 )
 
-(define_expand "copysign<mode>3"
+(define_expand "@copysign<mode>3"
   [(match_operand:VHSDF 0 "register_operand")
    (match_operand:VHSDF 1 "register_operand")
    (match_operand:VHSDF 2 "nonmemory_operand")]
@@ -1138,10 +1139,8 @@  (define_insn "ior<mode>3<vczle><vczbe>"
   "TARGET_SIMD"
   {@ [ cons: =0 , 1 , 2; attrs: arch ]
      [ w        , w , w  ; simd      ] orr\t%0.<Vbtype>, %1.<Vbtype>, %2.<Vbtype>
-     [ w        , 0 , vsl; sve       ] orr\t%Z0.<Vetype>, %Z0.<Vetype>, #%2
-     [ w        , 0 , Do ; simd      ] \
-       << aarch64_output_simd_mov_immediate (operands[2], <bitsize>, \
-					     AARCH64_CHECK_ORR);
+     [ w        , 0 , Do ; simd      ] << aarch64_output_simd_mov_immediate (operands[2], <bitsize>, AARCH64_CHECK_ORR);
+     [ w        , 0 , vsl; sve       ] orr\t%Z0.<Vetype>, %Z0.<Vetype>, %2
   }
   [(set_attr "type" "neon_logic<q>")]
 )
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index c54b29cd64b9e0dc6c6d12735049386ccedc5408..e9b148e59abf81cee53cb0dd846af9a62bbad294 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -7218,20 +7218,11 @@  (define_expand "lrint<GPF:mode><GPI:mode>2"
 }
 )
 
-;; For copysign (x, y), we want to generate:
+;; For copysignf (x, y), we want to generate:
 ;;
-;;   LDR d2, #(1 << 63)
-;;   BSL v2.8b, [y], [x]
+;;	movi    v31.4s, 0x80, lsl 24
+;;	bit     v0.16b, v1.16b, v31.16b
 ;;
-;; or another, equivalent, sequence using one of BSL/BIT/BIF.  Because
-;; we expect these operations to nearly always operate on
-;; floating-point values, we do not want the operation to be
-;; simplified into a bit-field insert operation that operates on the
-;; integer side, since typically that would involve three inter-bank
-;; register copies.  As we do not expect copysign to be followed by
-;; other logical operations on the result, it seems preferable to keep
-;; this as an unspec operation, rather than exposing the underlying
-;; logic to the compiler.
 
 (define_expand "copysign<GPF:mode>3"
   [(match_operand:GPF 0 "register_operand")
@@ -7239,57 +7230,22 @@  (define_expand "copysign<GPF:mode>3"
    (match_operand:GPF 2 "nonmemory_operand")]
   "TARGET_SIMD"
 {
-  rtx signbit_const = GEN_INT (HOST_WIDE_INT_M1U
-			       << (GET_MODE_BITSIZE (<MODE>mode) - 1));
-  /* copysign (x, -1) should instead be expanded as orr with the sign
-     bit.  */
-  rtx op2_elt = unwrap_const_vec_duplicate (operands[2]);
-  if (GET_CODE (op2_elt) == CONST_DOUBLE
-      && real_isneg (CONST_DOUBLE_REAL_VALUE (op2_elt)))
-    {
-      rtx v_bitmask
-	= force_reg (V2<V_INT_EQUIV>mode,
-		     gen_const_vec_duplicate (V2<V_INT_EQUIV>mode,
-					      signbit_const));
-
-      emit_insn (gen_iorv2<v_int_equiv>3 (
-	lowpart_subreg (V2<V_INT_EQUIV>mode, operands[0], <MODE>mode),
-	lowpart_subreg (V2<V_INT_EQUIV>mode, operands[1], <MODE>mode),
-	v_bitmask));
-      DONE;
-    }
-
-  machine_mode int_mode = <V_INT_EQUIV>mode;
-  rtx bitmask = gen_reg_rtx (int_mode);
-  emit_move_insn (bitmask, signbit_const);
-  operands[2] = force_reg (<MODE>mode, operands[2]);
-  emit_insn (gen_copysign<mode>3_insn (operands[0], operands[1], operands[2],
-				       bitmask));
+  rtx tmp = gen_reg_rtx (<VCONQ>mode);
+  rtx op1 = lowpart_subreg (<VCONQ>mode, operands[1], <MODE>mode);
+  rtx op2 = REG_P (operands[2])
+	      ? lowpart_subreg (<VCONQ>mode, operands[2], <MODE>mode)
+	      : gen_const_vec_duplicate (<VCONQ>mode, operands[2]);
+  emit_insn (gen_copysign3 (<VCONQ>mode, tmp, op1, op2));
+  emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, tmp, <VCONQ>mode));
   DONE;
 }
 )
 
-(define_insn "copysign<GPF:mode>3_insn"
-  [(set (match_operand:GPF 0 "register_operand")
-	(unspec:GPF [(match_operand:GPF 1 "register_operand")
-		     (match_operand:GPF 2 "register_operand")
-		     (match_operand:<V_INT_EQUIV> 3 "register_operand")]
-	 UNSPEC_COPYSIGN))]
-  "TARGET_SIMD"
-  {@ [ cons: =0 , 1 , 2 , 3 ; attrs: type  ]
-     [ w        , w , w , 0 ; neon_bsl<q>  ] bsl\t%0.<Vbtype>, %2.<Vbtype>, %1.<Vbtype>
-     [ w        , 0 , w , w ; neon_bsl<q>  ] bit\t%0.<Vbtype>, %2.<Vbtype>, %3.<Vbtype>
-     [ w        , w , 0 , w ; neon_bsl<q>  ] bif\t%0.<Vbtype>, %1.<Vbtype>, %3.<Vbtype>
-     [ r        , r , 0 , X ; bfm          ] bfxil\t%<w1>0, %<w1>1, #0, <sizem1>
-  }
-)
-
-
-;; For xorsign (x, y), we want to generate:
+;; For xorsignf (x, y), we want to generate:
 ;;
-;; LDR   d2, #1<<63
-;; AND   v3.8B, v1.8B, v2.8B
-;; EOR   v0.8B, v0.8B, v3.8B
+;;	movi    v31.4s, 0x80, lsl 24
+;;	and     v31.16b, v31.16b, v1.16b
+;;	eor     v0.16b, v31.16b, v0.16b
 ;;
 
 (define_expand "@xorsign<mode>3"
diff --git a/gcc/testsuite/gcc.target/aarch64/copysign_3.c b/gcc/testsuite/gcc.target/aarch64/copysign_3.c
new file mode 100644
index 0000000000000000000000000000000000000000..be48682420f1ff84e80af9efd9d11f64bd6e8052
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/copysign_3.c
@@ -0,0 +1,16 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+float f1 (float x, float y)
+{
+  return __builtin_copysignf (1.0, x) * __builtin_copysignf (1.0, y);
+}
+
+double f2 (double x, double y)
+{
+  return __builtin_copysign (1.0, x) * __builtin_copysign (1.0, y);
+}
+
+/* { dg-final { scan-assembler-times "movi\t" 2 } } */
+/* { dg-final { scan-assembler-not "copysign\tw" } } */
+/* { dg-final { scan-assembler-not "dup\tw" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/copysign_4.c b/gcc/testsuite/gcc.target/aarch64/copysign_4.c
new file mode 100644
index 0000000000000000000000000000000000000000..f3cec2fc9c21a4eaa3b6556479aeb15c04358a1c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/copysign_4.c
@@ -0,0 +1,17 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=armv8-a+sve" } */
+
+float f1 (float x, float y)
+{
+  return __builtin_copysignf (1.0, x) * __builtin_copysignf (1.0, y);
+}
+
+double f2 (double x, double y)
+{
+  return __builtin_copysign (1.0, x) * __builtin_copysign (1.0, y);
+}
+
+/* { dg-final { scan-assembler-times "movi\t" 1 } } */
+/* { dg-final { scan-assembler-times "mov\tz" 1 } } */
+/* { dg-final { scan-assembler-not "copysign\tw" } } */
+/* { dg-final { scan-assembler-not "dup\tw" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c b/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
index 18d10ee834d5d9b4361d890447060e78f09d3a73..1544bc5f1a736e95dd8bd2c608405aebb54ded1f 100644
--- a/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/fneg-abs_2.c
@@ -9,7 +9,7 @@ 
 
 /*
 ** f1:
-**	orr	v[0-9]+.2s, #?128, lsl #?24
+**	orr	v[0-9]+.[24]s, #?128, lsl #?24
 **	ret
 */
 float32_t f1 (float32_t a)
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c
index a8b27199ff83d0eebadfc7dcf03f94e1229d76b8..1ebdc6aaeb102da25ad561b24641e72a652175fa 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_1.c
@@ -6,7 +6,7 @@ 
 
 /*
 ** t1:
-**	orr	z[0-9]+.s, z[0-9]+.s, #-2147483648
+**	orr	v0.2s, #?128, lsl #?24
 **	ret
 */
 float32x2_t t1 (float32x2_t a)
@@ -16,7 +16,7 @@  float32x2_t t1 (float32x2_t a)
 
 /*
 ** t2:
-**	orr	z[0-9]+.s, z[0-9]+.s, #-2147483648
+**	orr	v0.4s, #?128, lsl #?24
 **	ret
 */
 float32x4_t t2 (float32x4_t a)
@@ -26,7 +26,7 @@  float32x4_t t2 (float32x4_t a)
 
 /*
 ** t3:
-**	orr	z[0-9]+.d, z[0-9]+.d, #-9223372036854775808
+**	orr	z[0-9]+.d, z[0-9]+.d, -9223372036854775808
 **	ret
 */
 float64x2_t t3 (float64x2_t a)
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
index 19a7695e605bc8aced486a9c450d1cdc6be4691a..122152c0ebe4ea6840e418e75a2cadbfc9b0aba4 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fneg-abs_2.c
@@ -7,7 +7,7 @@ 
 
 /*
 ** f1:
-**	orr	z0.s, z0.s, #-2147483648
+**	orr	v0.4s, #?128, lsl #?24
 **	ret
 */
 float32_t f1 (float32_t a)
@@ -17,7 +17,7 @@  float32_t f1 (float32_t a)
 
 /*
 ** f2:
-**	orr	z0.d, z0.d, #-9223372036854775808
+**	orr	z0.d, z0.d, -9223372036854775808
 **	ret
 */
 float64_t f2 (float64_t a)