diff mbox

[AArch64] Use MOVN to generate 64-bit negative immediates where sensible

Message ID 53E363C2.4000405@arm.com
State New
Headers show

Commit Message

Kyrylo Tkachov Aug. 7, 2014, 11:32 a.m. UTC
On 16/05/14 13:35, Richard Earnshaw wrote:
> On 08/05/14 18:36, Ian Bolton wrote:
>> Hi,
>>
>> It currently takes 4 instructions to generate certain immediates on
>> AArch64 (unless we put them in the constant pool).
>>
>> For example ...
>>
>>    long long
>>    ffffbeefcafebabe ()
>>    {
>>      return 0xFFFFBEEFCAFEBABEll;
>>    }
>>
>> leads to ...
>>
>>    mov x0, 0x47806
>>    mov x0, 0xcafe, lsl 16
>>    mov x0, 0xbeef, lsl 32
>>    orr x0, x0, -281474976710656
>>
>> The above case is tackled in this patch by employing MOVN
>> to generate the top 32 bits in a single instruction ...
>>
>>    mov x0, -71536975282177
>>    movk x0, 0xcafe, lsl 16
>>    movk x0, 0xbabe, lsl 0
>>
>> Note that where at least two half-words are 0xffff, existing
>> code that does the immediate in two instructions is still used.
>>
>> Tested on standard gcc regressions and the attached test case.
>>
>> OK for commit?
> What about:
>
> long long a()
> {
>    return 0x1234ffff56789abcll;
> }
>
> long long b()
> {
>    return 0x12345678ffff9abcll;
> }
>
> long long c()
> {
>    return 0x123456789abcffffll;
> }
>
> ?
>
> Surely these can also benefit from this sort of optimization, but it
> looks as though you only handle the top 16 bits being set.

Hi Richard,

How about this rework of the patch?

For code:

long long foo ()
{
   return 0xFFFFBEEFCAFEBABEll;
}

long long a()
{
   return 0x1234ffff56789abcll;
}

long long b()
{
   return 0x12345678ffff9abcll;
}

long long c()
{
   return 0x123456789abcffffll;
}

we now generate:
foo:
         mov     x0, -17730
         movk    x0, 0xcafe, lsl 16
         movk    x0, 0xbeef, lsl 32
         ret
         .size   foo, .-foo
         .align  2
         .global a
         .type   a, %function
a:
         mov     x0, -25924
         movk    x0, 0x5678, lsl 16
         movk    x0, 0x1234, lsl 48
         ret
         .size   a, .-a
         .align  2
         .global b
         .type   b, %function
b:
         mov     x0, -25924
         movk    x0, 0x5678, lsl 32
         movk    x0, 0x1234, lsl 48
         ret
         .size   b, .-b
         .align  2
         .global c
         .type   c, %function
c:
         mov     x0, -1698889729
         movk    x0, 0x5678, lsl 32
         movk    x0, 0x1234, lsl 48
         ret


3 instructions are used in each case.

Thanks,
Kyrill

2014-08-07  Ian Bolton  <ian.bolton@arm.com>
                     Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

         * config/aarch64/aarch64.c (aarch64_expand_mov_immediate):
         Use MOVN when one of the half-words is 0xffff.

Comments

Richard Earnshaw Aug. 7, 2014, 12:46 p.m. UTC | #1
On 07/08/14 12:32, Kyrill Tkachov wrote:
> 
> On 16/05/14 13:35, Richard Earnshaw wrote:
>> On 08/05/14 18:36, Ian Bolton wrote:
>>> Hi,
>>>
>>> It currently takes 4 instructions to generate certain immediates on
>>> AArch64 (unless we put them in the constant pool).
>>>
>>> For example ...
>>>
>>>    long long
>>>    ffffbeefcafebabe ()
>>>    {
>>>      return 0xFFFFBEEFCAFEBABEll;
>>>    }
>>>
>>> leads to ...
>>>
>>>    mov x0, 0x47806
>>>    mov x0, 0xcafe, lsl 16
>>>    mov x0, 0xbeef, lsl 32
>>>    orr x0, x0, -281474976710656
>>>
>>> The above case is tackled in this patch by employing MOVN
>>> to generate the top 32 bits in a single instruction ...
>>>
>>>    mov x0, -71536975282177
>>>    movk x0, 0xcafe, lsl 16
>>>    movk x0, 0xbabe, lsl 0
>>>
>>> Note that where at least two half-words are 0xffff, existing
>>> code that does the immediate in two instructions is still used.
>>>
>>> Tested on standard gcc regressions and the attached test case.
>>>
>>> OK for commit?
>> What about:
>>
>> long long a()
>> {
>>    return 0x1234ffff56789abcll;
>> }
>>
>> long long b()
>> {
>>    return 0x12345678ffff9abcll;
>> }
>>
>> long long c()
>> {
>>    return 0x123456789abcffffll;
>> }
>>
>> ?
>>
>> Surely these can also benefit from this sort of optimization, but it
>> looks as though you only handle the top 16 bits being set.
> 
> Hi Richard,
> 
> How about this rework of the patch?
> 
> For code:
> 
> long long foo ()
> {
>    return 0xFFFFBEEFCAFEBABEll;
> }
> 
> long long a()
> {
>    return 0x1234ffff56789abcll;
> }
> 
> long long b()
> {
>    return 0x12345678ffff9abcll;
> }
> 
> long long c()
> {
>    return 0x123456789abcffffll;
> }
> 
> we now generate:
> foo:
>          mov     x0, -17730
>          movk    x0, 0xcafe, lsl 16
>          movk    x0, 0xbeef, lsl 32
>          ret
>          .size   foo, .-foo
>          .align  2
>          .global a
>          .type   a, %function
> a:
>          mov     x0, -25924
>          movk    x0, 0x5678, lsl 16
>          movk    x0, 0x1234, lsl 48
>          ret
>          .size   a, .-a
>          .align  2
>          .global b
>          .type   b, %function
> b:
>          mov     x0, -25924
>          movk    x0, 0x5678, lsl 32
>          movk    x0, 0x1234, lsl 48
>          ret
>          .size   b, .-b
>          .align  2
>          .global c
>          .type   c, %function
> c:
>          mov     x0, -1698889729
>          movk    x0, 0x5678, lsl 32
>          movk    x0, 0x1234, lsl 48
>          ret
> 
> 
> 3 instructions are used in each case.
> 
> Thanks,
> Kyrill
> 
> 2014-08-07  Ian Bolton  <ian.bolton@arm.com>
>                      Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
> 
>          * config/aarch64/aarch64.c (aarch64_expand_mov_immediate):
>          Use MOVN when one of the half-words is 0xffff.
> 
> 
> aarch64-movn-pattern-patch-v3.patch
> 
> 
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 0a7f441..2db91c7 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -1005,7 +1005,7 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
>    unsigned HOST_WIDE_INT val;
>    bool subtargets;
>    rtx subtarget;
> -  int one_match, zero_match;
> +  int one_match, zero_match, first_not_ffff_match;
>  
>    gcc_assert (mode == SImode || mode == DImode);
>  
> @@ -1106,29 +1106,48 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
>    one_match = 0;
>    zero_match = 0;
>    mask = 0xffff;
> +  first_not_ffff_match = -1;
>  
>    for (i = 0; i < 64; i += 16, mask <<= 16)
>      {
> -      if ((val & mask) == 0)
> -	zero_match++;
> -      else if ((val & mask) == mask)
> +      if ((val & mask) == mask)
>  	one_match++;
> +      else
> +	{
> +	  if (first_not_ffff_match < 0)
> +	    first_not_ffff_match = i;
> +	  if ((val & mask) == 0)
> +	    zero_match++;
> +	}
>      }
>  
>    if (one_match == 2)
>      {
> -      mask = 0xffff;
> -      for (i = 0; i < 64; i += 16, mask <<= 16)
> +      /* Set one of the quarters and then insert back into result.  */
> +      mask = 0xffffll << first_not_ffff_match;
> +      emit_insn (gen_rtx_SET (VOIDmode, dest, GEN_INT (val | mask)));
> +      emit_insn (gen_insv_immdi (dest, GEN_INT (first_not_ffff_match),
> +				 GEN_INT ((val >> first_not_ffff_match)
> +					  & 0xffff)));
> +      return;
> +    }
> +
> +  if (one_match == 1)

I think this should be (one_match > zero_match).

Otherwise constants such as


  0x00001234ffff0000ll

might end up taking three rather than two insns.

R.

> +    {
> +      /* Set either first three quarters or all but the third.	 */
> +      mask = 0xffffll << (16 - first_not_ffff_match);
> +      emit_insn (gen_rtx_SET (VOIDmode, dest,
> +			      GEN_INT (val | mask | 0xffffffff00000000ull)));
> +
> +      /* Now insert other two quarters.	 */
> +      for (i = first_not_ffff_match + 16, mask <<= (first_not_ffff_match << 1);
> +	   i < 64; i += 16, mask <<= 16)
>  	{
>  	  if ((val & mask) != mask)
> -	    {
> -	      emit_insn (gen_rtx_SET (VOIDmode, dest, GEN_INT (val | mask)));
> -	      emit_insn (gen_insv_immdi (dest, GEN_INT (i),
> -					 GEN_INT ((val >> i) & 0xffff)));
> -	      return;
> -	    }
> +	    emit_insn (gen_insv_immdi (dest, GEN_INT (i),
> +				       GEN_INT ((val >> i) & 0xffff)));
>  	}
> -      gcc_unreachable ();
> +      return;
>      }
>  
>    if (zero_match == 2)
>
diff mbox

Patch

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 0a7f441..2db91c7 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1005,7 +1005,7 @@  aarch64_expand_mov_immediate (rtx dest, rtx imm)
   unsigned HOST_WIDE_INT val;
   bool subtargets;
   rtx subtarget;
-  int one_match, zero_match;
+  int one_match, zero_match, first_not_ffff_match;
 
   gcc_assert (mode == SImode || mode == DImode);
 
@@ -1106,29 +1106,48 @@  aarch64_expand_mov_immediate (rtx dest, rtx imm)
   one_match = 0;
   zero_match = 0;
   mask = 0xffff;
+  first_not_ffff_match = -1;
 
   for (i = 0; i < 64; i += 16, mask <<= 16)
     {
-      if ((val & mask) == 0)
-	zero_match++;
-      else if ((val & mask) == mask)
+      if ((val & mask) == mask)
 	one_match++;
+      else
+	{
+	  if (first_not_ffff_match < 0)
+	    first_not_ffff_match = i;
+	  if ((val & mask) == 0)
+	    zero_match++;
+	}
     }
 
   if (one_match == 2)
     {
-      mask = 0xffff;
-      for (i = 0; i < 64; i += 16, mask <<= 16)
+      /* Set one of the quarters and then insert back into result.  */
+      mask = 0xffffll << first_not_ffff_match;
+      emit_insn (gen_rtx_SET (VOIDmode, dest, GEN_INT (val | mask)));
+      emit_insn (gen_insv_immdi (dest, GEN_INT (first_not_ffff_match),
+				 GEN_INT ((val >> first_not_ffff_match)
+					  & 0xffff)));
+      return;
+    }
+
+  if (one_match == 1)
+    {
+      /* Set either first three quarters or all but the third.	 */
+      mask = 0xffffll << (16 - first_not_ffff_match);
+      emit_insn (gen_rtx_SET (VOIDmode, dest,
+			      GEN_INT (val | mask | 0xffffffff00000000ull)));
+
+      /* Now insert other two quarters.	 */
+      for (i = first_not_ffff_match + 16, mask <<= (first_not_ffff_match << 1);
+	   i < 64; i += 16, mask <<= 16)
 	{
 	  if ((val & mask) != mask)
-	    {
-	      emit_insn (gen_rtx_SET (VOIDmode, dest, GEN_INT (val | mask)));
-	      emit_insn (gen_insv_immdi (dest, GEN_INT (i),
-					 GEN_INT ((val >> i) & 0xffff)));
-	      return;
-	    }
+	    emit_insn (gen_insv_immdi (dest, GEN_INT (i),
+				       GEN_INT ((val >> i) & 0xffff)));
 	}
-      gcc_unreachable ();
+      return;
     }
 
   if (zero_match == 2)