diff mbox series

[v1,6/9] aarch64: Use symbols without offset to prevent relocation issues

Message ID DBBPR83MB0613D599B208585785781C22F8922@DBBPR83MB0613.EURPRD83.prod.outlook.com
State New
Headers show
Series SMALL code model fixes, optimization fixes, LTO and minimal C++ enablement | expand

Commit Message

Evgeny Karpov Sept. 2, 2024, 1:09 p.m. UTC
aarch64.cc has been updated to prevent emitting "symbol + offset"
for SYMBOL_SMALL_ABSOLUTE for the PECOFF target. "symbol + offset"
cannot be used in relocations for aarch64-w64-mingw32 due to
relocation requirements.
Instead, it will adjust the address by an offset with the
"add" instruction.

gcc/ChangeLog:

	* config/aarch64/aarch64.cc (aarch64_load_symref_and_add_offset):
	New.
	(aarch64_expand_mov_immediate): Use
	aarch64_load_symref_and_add_offset.
---
 gcc/config/aarch64/aarch64.cc | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

Comments

Richard Sandiford Sept. 2, 2024, 3:32 p.m. UTC | #1
Evgeny Karpov <Evgeny.Karpov@microsoft.com> writes:
> aarch64.cc has been updated to prevent emitting "symbol + offset"
> for SYMBOL_SMALL_ABSOLUTE for the PECOFF target. "symbol + offset"
> cannot be used in relocations for aarch64-w64-mingw32 due to
> relocation requirements.
> Instead, it will adjust the address by an offset with the
> "add" instruction.
>
> gcc/ChangeLog:
>
> 	* config/aarch64/aarch64.cc (aarch64_load_symref_and_add_offset):
> 	New.
> 	(aarch64_expand_mov_immediate): Use
> 	aarch64_load_symref_and_add_offset.
> ---
>  gcc/config/aarch64/aarch64.cc | 24 ++++++++++++++++++++----
>  1 file changed, 20 insertions(+), 4 deletions(-)
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 7865484860b..1d88814f28d 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -4887,6 +4887,17 @@ aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
>  		      temp1, temp2, 0, false);
>  }
>  
> +static void
> +aarch64_load_symref_and_add_offset (scalar_int_mode mode, rtx dest, rtx src,
> +		    poly_int64 offset)

Nit: there should be a function comment, and the second line should be
indented under "scalar_int_mode".  Maybe:

/* Emit:

      TMP = SRC
      DEST = TMP + OFFSET

   given that SRC is a symbolic constant and DEST is a register.  Both SRC
   and DEST have mode MODE.  */

static void
aarch64_load_symref_and_add_offset (scalar_int_mode mode, rtx dest, rtx src,
				    poly_int64 offset)

OK with that change, thanks.

Richard

> +{
> +  gcc_assert (can_create_pseudo_p ());
> +  src = aarch64_force_temporary (mode, dest, src);
> +  aarch64_add_offset (mode, dest, src, offset,
> +		      NULL_RTX, NULL_RTX, 0, false);
> +}
> +
> +
>  /* Add DELTA to the stack pointer, marking the instructions frame-related.
>     TEMP1 is available as a temporary if nonnull.  FORCE_ISA_MODE is as
>     for aarch64_add_offset.  EMIT_MOVE_IMM is false if TEMP1 already
> @@ -6054,10 +6065,8 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
>  	case SYMBOL_TINY_TLSIE:
>  	  if (const_offset != 0)
>  	    {
> -	      gcc_assert(can_create_pseudo_p ());
> -	      base = aarch64_force_temporary (int_mode, dest, base);
> -	      aarch64_add_offset (int_mode, dest, base, const_offset,
> -				  NULL_RTX, NULL_RTX, 0, false);
> +	      aarch64_load_symref_and_add_offset (int_mode, dest, base,
> +						  const_offset);
>  	      return;
>  	    }
>  	  /* FALLTHRU */
> @@ -6068,6 +6077,13 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
>  	case SYMBOL_TLSLE24:
>  	case SYMBOL_TLSLE32:
>  	case SYMBOL_TLSLE48:
> +	  if (TARGET_PECOFF && const_offset != 0)
> +	    {
> +	      aarch64_load_symref_and_add_offset (int_mode, dest, base,
> +						  const_offset);
> +	      return;
> +	    }
> +
>  	  aarch64_load_symref_appropriately (dest, imm, sty);
>  	  return;
Martin Storsjö Sept. 2, 2024, 7:54 p.m. UTC | #2
On Mon, 2 Sep 2024, Evgeny Karpov wrote:

> aarch64.cc has been updated to prevent emitting "symbol + offset"
> for SYMBOL_SMALL_ABSOLUTE for the PECOFF target. "symbol + offset"
> cannot be used in relocations for aarch64-w64-mingw32 due to
> relocation requirements.

What relocation requirements are these? COFF for aarch64 does permit 
symbol+offset references in most relocations.

The IMAGE_REL_ARM64_PAGEBASE_REL21, IMAGE_REL_ARM64_REL21, 
IMAGE_REL_ARM64_PAGEOFFSET_12A and IMAGE_REL_ARM64_PAGEOFFSET_12L 
relocations all do support storing a symbol offset in the immediate of the 
instruction. LLD and MS link.exe support this just fine.

The only case where symbol offsets aren't supported, is the 
IMAGE_REL_ARM64_BRANCH* relocations. MS link.exe ignores the instruction 
immediate in those, so within LLVM, we've followed suit, and avoid 
emitting those as symbol+offset. It would be easy to implement support for 
it in LLD, but then we'd end up with object files incompatible with MS 
link.exe.

The only non-obvious thing, is that for IMAGE_REL_ARM64_PAGEBASE_REL21, 
i.e. "adrp" instructions, the immediate that gets stored in the 
instruction, is the byte offset to the symbol.

After linking, when the instruction is interpreted at execution time, the 
immediate in an adrp instruction denotes the offset in units of 2^12 bytes 
- but in relocatable object files, the unit of the immediate is in single 
bytes.

If GNU ld doesn't support offsets in these relocations yet, I'd recommend 
implementing support for it there (cross check with LLD or MS link.exe how 
it behaves) rather than avoiding it in the compiler.

// Martin
Evgeny Karpov Sept. 4, 2024, 12:12 p.m. UTC | #3
Monday, September 2, 2024
Martin Storsjö <martin@martin.st> wrote:

> The only non-obvious thing, is that for IMAGE_REL_ARM64_PAGEBASE_REL21,
> i.e. "adrp" instructions, the immediate that gets stored in the
> instruction, is the byte offset to the symbol.
>
> After linking, when the instruction is interpreted at execution time, the
> immediate in an adrp instruction denotes the offset in units of 2^12 bytes
> - but in relocatable object files, the unit of the immediate is in single
> bytes.

This is exactly the reason why the fix was introduced, and it resolves
the issues detected during testing.
Here is a more detailed explanation.

1. the code without the fix
adrp        x0, symbol + 256
add         x0, x0, symbol + 256

2. the code with the fix
adrp        x0, symbol
add         x0, x0, symbol
add         x0, x0, 256


Let's consider the following example, when symbol is located at 3072.

1. Example without the fix
compilation time
adrp        x0, (3072 + 256) & ~0xFFF // x0 = 0
add         x0, x0, (3072 + 256) & 0xFFF // x0 = 3328

linking time when symbol is relocated with offset 896
adrp        x0, (0 + 896) & ~0xFFF // x0 = 0
add         x0, x0, (3328 + 896) & 0xFFF; // x0 = 128
which is wrong. It should be x0 = 3072 + 896 + 256 = 4224.

2. Example with the fix
compilation time
adrp        x0, 3072 & ~0xFFF // x0 = 0
add         x0, x0, 3072 & 0xFFF // x0 = 3072
add         x0, x0, 256 // x0 = 3328

linking time when symbol is relocated with offset 896
adrp        x0, (0 + 896) & ~0xFFF // x0 = 0
add         x0, x0, (3072 + 896) & 0xFFF // x0 = 3968
add         x0, x0, 256 // x0 = 4224
x0 contains expected result.

Theoretically, the issue can be solved by changing the alignment of segments
to 4096. It might require further investigation and a follow-up patch if it works.
Even if it works, it has a downside as it increases the segment sizes.

Regards,
Evgeny
Martin Storsjö Sept. 4, 2024, 1:29 p.m. UTC | #4
On Wed, 4 Sep 2024, Evgeny Karpov wrote:

> Monday, September 2, 2024
> Martin Storsjö <martin@martin.st> wrote:
>
>> The only non-obvious thing, is that for IMAGE_REL_ARM64_PAGEBASE_REL21,
>> i.e. "adrp" instructions, the immediate that gets stored in the
>> instruction, is the byte offset to the symbol.
>>
>> After linking, when the instruction is interpreted at execution time, the
>> immediate in an adrp instruction denotes the offset in units of 2^12 bytes
>> - but in relocatable object files, the unit of the immediate is in single
>> bytes.
>
> This is exactly the reason why the fix was introduced, and it resolves
> the issues detected during testing.
> Here is a more detailed explanation.
>
> 1. the code without the fix
> adrp        x0, symbol + 256
> add         x0, x0, symbol + 256

I can attest that there are _no_ problems with using such representation 
in COFF-ARM64. Whatever issues you are having must be caused by bugs in 
your assembler, or linker, or both.

> Let's consider the following example, when symbol is located at 3072.
>
> 1. Example without the fix
> compilation time
> adrp        x0, (3072 + 256) & ~0xFFF // x0 = 0
> add         x0, x0, (3072 + 256) & 0xFFF // x0 = 3328
>
> linking time when symbol is relocated with offset 896
> adrp        x0, (0 + 896) & ~0xFFF // x0 = 0

Why did the 3072 suddenly become 0 here?

> add         x0, x0, (3328 + 896) & 0xFFF; // x0 = 128

Where did 3328 come from in your example? Wasn't "symbol" supposed to be 
at address 3072, and we're adding an offset of 896 to it?


In any case - you are misrepresenting how the relocations and immediates 
work.

If you have this on the assembly level:

     adrp x0, symbol + 896

then the assembler should output an adrp instruction, with the instruction 
immediate encoding the offset 896, with an IMAGE_REL_ARM64_PAGEBASE_REL21 
pointing at "symbol".

When the linker links this object file, it will resolve the virtual 
address of "symbol", add the offset 896, and use this as the destination 
address to calculate the final page offset, for the instruction.

I can produce a set of test data to showcase the various corner cases 
that can be relevant in handling of these relocations, which work with 
both LLD and MS link.exe, to help you pinpoint your potential bug in your 
assembler and linker, and potential misunderstandings about how these 
concepts work. I can hopefully have such a set of examples ready for you 
tonight.

// Martin
Evgeny Karpov Sept. 4, 2024, 2:10 p.m. UTC | #5
Wednesday, September 4, 2024
Martin Storsjö <martin@martin.st> wrote:

>> Let's consider the following example, when symbol is located at 3072.
>>
>> 1. Example without the fix
>> compilation time
>> adrp        x0, (3072 + 256) & ~0xFFF // x0 = 0
>> add         x0, x0, (3072 + 256) & 0xFFF // x0 = 3328
>>
>> linking time when symbol is relocated with offset 896
>> adrp        x0, (0 + 896) & ~0xFFF // x0 = 0
>
> Why did the 3072 suddenly become 0 here?

The test case which will be compiled.

adrp x0, symbol + 256
add  x0, x0, symbol + 256

The numbers which are presented in the example help to clarify relocation steps.
symbol is located at 3072.

compilation time
adrp x0, symbol + 256
90000000 adrp x0, 0
add  x0, x0, symbol + 256
91340000 add x0, x0, 3328

linking time when symbol is relocated with offset 896
compiled  90000000 adrp x0, 0
relocated 90000000 adrp x0, 0 // without change
((0 << 12) + 896) >> 12 = 0 // relocation calculation

>> add         x0, x0, (3328 + 896) & 0xFFF; // x0 = 128
>
> Where did 3328 come from in your example? Wasn't "symbol" supposed to be
> at address 3072, and we're adding an offset of 896 to it?

compiled  91340000 add x0, x0, 3328
relocated 91020000 add x0, x0, 128
(3328 + 896) & 0xFFF = 128 // relocation calculation

Regards,
Evgeny
Martin Storsjö Sept. 4, 2024, 3:06 p.m. UTC | #6
On Wed, 4 Sep 2024, Evgeny Karpov wrote:

> Monday, September 4, 2024
> Martin Storsjö <martin@martin.st> wrote:
>
>>> Let's consider the following example, when symbol is located at 3072.
>>>
>>> 1. Example without the fix
>>> compilation time
>>> adrp        x0, (3072 + 256) & ~0xFFF // x0 = 0
>>> add         x0, x0, (3072 + 256) & 0xFFF // x0 = 3328
>>>
>>> linking time when symbol is relocated with offset 896
>>> adrp        x0, (0 + 896) & ~0xFFF // x0 = 0
>>
>> Why did the 3072 suddenly become 0 here?
>
> The test case which will be compiled.
>
> adrp x0, symbol + 256
> add  x0, x0, symbol + 256
>
> The numbers which are presented in the example help to clarify relocation steps.
> symbol is located at 3072.
>
> compilation time
> adrp x0, symbol + 256
> 90000000 adrp x0, 0

This is your first error.

As the symbol offset is 256, you will need to encode the offset "256" in 
the instruction immediate field. Not "256 >> 12". This is the somewhat 
non-obvious part here, but this is the only way symbol offsets can work. 
This is how MS tools handle immediates in IMAGE_REL_ARM64_PAGEBASE_REL21, 
and LLVM has replicated this bit.

See 
https://github.com/llvm/llvm-project/commit/0b7bf7a2e3cb34086d6a05419319fd35ae8dd9a8#diff-502793e1256bca6339a09f5756111a947a2aeb5c600cdd22b2e1679db5ec48b0R162 
for the case where I implemented this bit in LLVM.

> add  x0, x0, symbol + 256
> 91340000 add x0, x0, 3328
>
> linking time when symbol is relocated with offset 896
> compiled  90000000 adrp x0, 0
> relocated 90000000 adrp x0, 0 // without change
> ((0 << 12) + 896) >> 12 = 0 // relocation calculation

This is the wrong calculation for how to apply a 
IMAGE_REL_ARM64_PAGEBASE_REL21 relocation.

If the instruction in the object file has the immediate obj_imm, and the 
instruction is at address instr_addr, the linker should update the 
instruction, setting the immediate to ((symbol_addr + obj_imm) >> 12) - 
(instr_addr >> 12).

See 
https://github.com/llvm/llvm-project/commit/38608c0975772513007ec08116a1a3fb6160722b 
how this was implemented in LLD.

// Martin
Martin Storsjö Sept. 4, 2024, 3:24 p.m. UTC | #7
On Wed, 4 Sep 2024, Martin Storsjö wrote:

> On Wed, 4 Sep 2024, Evgeny Karpov wrote:
>
>> Monday, September 4, 2024
>> Martin Storsjö <martin@martin.st> wrote:
>> 
>>>> Let's consider the following example, when symbol is located at 3072.
>>>> 
>>>> 1. Example without the fix
>>>> compilation time
>>>> adrp        x0, (3072 + 256) & ~0xFFF // x0 = 0
>>>> add         x0, x0, (3072 + 256) & 0xFFF // x0 = 3328
>>>> 
>>>> linking time when symbol is relocated with offset 896
>>>> adrp        x0, (0 + 896) & ~0xFFF // x0 = 0
>>> 
>>> Why did the 3072 suddenly become 0 here?
>> 
>> The test case which will be compiled.
>> 
>> adrp x0, symbol + 256
>> add  x0, x0, symbol + 256
>> 
>> The numbers which are presented in the example help to clarify 
>> relocation steps.
>> symbol is located at 3072.
>> 
>> compilation time
>> adrp x0, symbol + 256
>> 90000000 adrp x0, 0
>
> This is your first error.
>
> As the symbol offset is 256, you will need to encode the offset "256" in 
> the instruction immediate field. Not "256 >> 12". This is the somewhat 
> non-obvious part here, but this is the only way symbol offsets can work. 
> This is how MS tools handle immediates in IMAGE_REL_ARM64_PAGEBASE_REL21, 
> and LLVM has replicated this bit.
>
> See 
> https://github.com/llvm/llvm-project/commit/0b7bf7a2e3cb34086d6a05419319fd35ae8dd9a8#diff-502793e1256bca6339a09f5756111a947a2aeb5c600cdd22b2e1679db5ec48b0R162 
> for the case where I implemented this bit in LLVM.

To show this in action:

$ cat adrp.s
adrp x0, symbol  + 256
add x0, x0, :lo12:symbol  + 256
$ clang -target aarch64-windows -c adrp.s
$ llvm-objdump -d -r adrp.o

adrp.o: file format coff-arm64

Disassembly of section .text:

0000000000000000 <.text>:
        0: 90000800      adrp    x0, 0x100000 <.text+0x100000>
                 0000000000000000:  IMAGE_REL_ARM64_PAGEBASE_REL21       symbol
        4: 91040000      add     x0, x0, #0x100
                 0000000000000004:  IMAGE_REL_ARM64_PAGEOFFSET_12A       symbol

The disassembly tool doesn't interpret the immediate correctly (it's not 
0x100000, it's 0x100), but the opcode and relocation info is correct.

// Martin
Evgeny Karpov Sept. 4, 2024, 3:53 p.m. UTC | #8
Wednesday, September 4, 2024
Martin Storsjö <martin@martin.st> wrote:

>> compilation time
>> adrp x0, symbol + 256
>> 90000000 adrp x0, 0
>
> As the symbol offset is 256, you will need to encode the offset "256" in
> the instruction immediate field. Not "256 >> 12". This is the somewhat
> non-obvious part here, but this is the only way symbol offsets can work.
> This is how MS tools handle immediates in IMAGE_REL_ARM64_PAGEBASE_REL21,
> and LLVM has replicated this bit.

This approach does not allow addressing 4GB; instead, it can address only 1MB.
This issue has been fixed in this patch series.

https://sourceware.org/pipermail/binutils/2024-August/136481.html

armasm produces the same opcode
for: adrp x0, symbol + 256
it will be: 90000000 adrp x0, 0

>  0000000000000000 <.text>:
>         0: 90000800      adrp    x0, 0x100000 <.text+0x100000>
>                  0000000000000000:  IMAGE_REL_ARM64_PAGEBASE_REL21       symbol
>         4: 91040000      add     x0, x0, #0x100
>                  0000000000000004:  IMAGE_REL_ARM64_PAGEOFFSET_12A       symbol
> 
> The disassembly tool doesn't interpret the immediate correctly (it's not
> 0x100000, it's 0x100), but the opcode and relocation info is correct.

Disassembly shows the correct result, 0x100 scaled by 2^12.

Regards,
Evgeny
Martin Storsjö Sept. 4, 2024, 4:29 p.m. UTC | #9
On Wed, 4 Sep 2024, Evgeny Karpov wrote:

> Monday, September 4, 2024
> Martin Storsjö <martin@martin.st> wrote:
>
>>> compilation time
>>> adrp x0, symbol + 256
>>> 90000000 adrp x0, 0
>>
>> As the symbol offset is 256, you will need to encode the offset "256" in
>> the instruction immediate field. Not "256 >> 12". This is the somewhat
>> non-obvious part here, but this is the only way symbol offsets can work.
>> This is how MS tools handle immediates in IMAGE_REL_ARM64_PAGEBASE_REL21,
>> and LLVM has replicated this bit.
>
> This approach does not allow to address 4GB, instead it can address only 1MB.
> This issue has been fixed in this patch series.
> 
> https://sourceware.org/pipermail/binutils/2024-August/136481.html

But this is not something you can redefine yourself! The relocation formats 
and their behaviour are defined by Microsoft (your employer?), you can't 
just change them within the scope of GNU tools because you disagree with them!


Yes, the immediate offset can be maximum 1 MB. But this doesn't mean that 
you can't address anywhere in the 4 GB address space of a PE image. It 
just means that an IMAGE_REL_ARM64_PAGEBASE_REL21 can point anywhere up to 
1 MB before/after a symbol. You can't have one single symbol at the start 
of an image and try to add a fixed 4 GB offset on top of that - that's not 
what any regular object file would do anyway.


Yes, it is possible to hit cases where you want an offset slightly larger 
than 1 MB - if you happen to have a very large object file. It's very rare 
though, but it can happen. In LLVM we fixed this by injecting extra label 
symbols with 1 MB intervals if it turns out that an individual section 
ends up larger than this, like this: 
https://github.com/llvm/llvm-project/commit/06d0d449d8555ae5f1ac33e8d4bb4ae40eb080d3

> armasm produces the same opcode
> for: adrp x0, symbol + 256
> it will be: 90000000 adrp x0, 0

It seems like armasm64 doesn't handle this case correctly, and/or is 
inconsistent.

But let's see what MSVC cl.exe does, if you don't trust my other 
references.

$ cat adrp.c
extern char array[];
char *getPtr(void) {
     return &array[256];
}
$ cl -c -O2 adrp.c -Fa
Microsoft (R) C/C++ Optimizing Compiler Version 19.41.34120 for ARM64
Copyright (C) Microsoft Corporation.  All rights reserved.

adrp.c
$ dumpbin -nologo -disasm adrp.obj

Dump of file adrp.obj

File Type: COFF OBJECT

getPtr:
   0000000000000000: 90000808  adrp        x8,array+#0x100
   0000000000000004: 91040100  add         x0,x8,array+#0x100
   0000000000000008: D65F03C0  ret
$ cat adrp.asm
[...]
 	AREA	|.text$mn|, CODE, ARM64
|getPtr| PROC
 	adrp        x8,array+#0x100
 	add         x0,x8,array+#0x100
 	ret
 	ENDP  ; |getPtr|
 	END

Unfortunately, it seems like armasm64 doesn't actually manage to assemble 
the output of MSVC in this case. If the # chars are removed, it can 
assemble it, but the offsets simply aren't encoded at all - neither for 
the adrp nor for the add. So it simply seems that armasm64 doesn't support 
immediates for symbol offsets at all.

Nevertheless, the object file format supports it just fine, MSVC cl.exe 
uses it, and link.exe handles it exactly like I've described.

// Martin
diff mbox series

Patch

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 7865484860b..1d88814f28d 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -4887,6 +4887,17 @@  aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
 		      temp1, temp2, 0, false);
 }
 
+static void
+aarch64_load_symref_and_add_offset (scalar_int_mode mode, rtx dest, rtx src,
+		    poly_int64 offset)
+{
+  gcc_assert (can_create_pseudo_p ());
+  src = aarch64_force_temporary (mode, dest, src);
+  aarch64_add_offset (mode, dest, src, offset,
+		      NULL_RTX, NULL_RTX, 0, false);
+}
+
+
 /* Add DELTA to the stack pointer, marking the instructions frame-related.
    TEMP1 is available as a temporary if nonnull.  FORCE_ISA_MODE is as
    for aarch64_add_offset.  EMIT_MOVE_IMM is false if TEMP1 already
@@ -6054,10 +6065,8 @@  aarch64_expand_mov_immediate (rtx dest, rtx imm)
 	case SYMBOL_TINY_TLSIE:
 	  if (const_offset != 0)
 	    {
-	      gcc_assert(can_create_pseudo_p ());
-	      base = aarch64_force_temporary (int_mode, dest, base);
-	      aarch64_add_offset (int_mode, dest, base, const_offset,
-				  NULL_RTX, NULL_RTX, 0, false);
+	      aarch64_load_symref_and_add_offset (int_mode, dest, base,
+						  const_offset);
 	      return;
 	    }
 	  /* FALLTHRU */
@@ -6068,6 +6077,13 @@  aarch64_expand_mov_immediate (rtx dest, rtx imm)
 	case SYMBOL_TLSLE24:
 	case SYMBOL_TLSLE32:
 	case SYMBOL_TLSLE48:
+	  if (TARGET_PECOFF && const_offset != 0)
+	    {
+	      aarch64_load_symref_and_add_offset (int_mode, dest, base,
+						  const_offset);
+	      return;
+	    }
+
 	  aarch64_load_symref_appropriately (dest, imm, sty);
 	  return;