
[07/13] libffi: Support go closures on x86_64

Message ID 1412973773-3942-8-git-send-email-rth@redhat.com
State New

Commit Message

Richard Henderson Oct. 10, 2014, 8:42 p.m. UTC
Still missing changes for Darwin, win64, and all 32-bit ABIs.
Dumps all of the hand-coded unwind info in favor of gas-generated
.cfi directives, as I can't be bothered to do the updates by hand again.
---
 libffi/src/x86/ffi64.c     | 103 ++++++++++-----
 libffi/src/x86/ffitarget.h |   2 +
 libffi/src/x86/unix64.S    | 319 ++++++++++++++++++++++-----------------------
 3 files changed, 230 insertions(+), 194 deletions(-)
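
For reference, here is a minimal sketch (not part of the patch) of how the
two entry points this series wires up on x86_64, ffi_prep_go_closure and
ffi_call_go, fit together once the change is in.  The callback name,
argument types, and values below are illustrative only.

#include <ffi.h>
#include <stdio.h>

/* Closure body; user_data receives the ffi_go_closure itself, delivered
   through the static chain register (%r10) rather than via a writable
   trampoline.  */
static void
add_fun (ffi_cif *cif, void *rvalue, void **avalue, void *user_data)
{
  *(int *) rvalue = *(int *) avalue[0] + *(int *) avalue[1];
}

int
main (void)
{
  ffi_cif cif;
  ffi_go_closure gc;
  ffi_type *argt[2] = { &ffi_type_sint, &ffi_type_sint };
  int a = 3, b = 4;
  ffi_arg r = 0;
  void *avalue[2] = { &a, &b };

  if (ffi_prep_cif (&cif, FFI_DEFAULT_ABI, 2, &ffi_type_sint, argt) != FFI_OK
      || ffi_prep_go_closure (&gc, &cif, add_fun) != FFI_OK)
    return 1;

  /* ffi_call_go stores &gc in the static chain slot of the register-save
     area (reg_args->r10 in the new ffi_call_int), so the go closure
     trampoline can find its cif and fun without any generated code.  */
  ffi_call_go (&cif, (void (*)(void)) gc.tramp, &r, avalue, &gc);

  printf ("%d\n", (int) r);	/* prints 7 */
  return 0;
}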

Patch

diff --git a/libffi/src/x86/ffi64.c b/libffi/src/x86/ffi64.c
index 1daa1c0..428168c 100644
--- a/libffi/src/x86/ffi64.c
+++ b/libffi/src/x86/ffi64.c
@@ -31,6 +31,7 @@ 
 
 #include <stdlib.h>
 #include <stdarg.h>
+#include <stdint.h>
 
 #ifdef __x86_64__
 
@@ -48,10 +49,12 @@  struct register_args
   /* Registers for argument passing.  */
   UINT64 gpr[MAX_GPR_REGS];
   UINT128 sse[MAX_SSE_REGS];
+  UINT64 rax;	/* ssecount */
+  UINT64 r10;	/* static chain */
 };
 
 extern void ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
-			     void *raddr, void (*fnaddr)(void), unsigned ssecount);
+			     void *raddr, void (*fnaddr)(void)) FFI_HIDDEN;
 
 /* All reference to register classes here is identical to the code in
    gcc/config/i386/i386.c. Do *not* change one without the other.  */
@@ -341,6 +344,9 @@  ffi_prep_cif_machdep (ffi_cif *cif)
   enum x86_64_reg_class classes[MAX_CLASSES];
   size_t bytes;
 
+  if (cif->abi != FFI_UNIX64)
+    return FFI_BAD_ABI;
+
   gprcount = ssecount = 0;
 
   flags = cif->rtype->type;
@@ -402,8 +408,9 @@  ffi_prep_cif_machdep (ffi_cif *cif)
   return FFI_OK;
 }
 
-void
-ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
+static void
+ffi_call_int (ffi_cif *cif, void (*fn)(void), void *rvalue,
+	      void **avalue, void *closure)
 {
   enum x86_64_reg_class classes[MAX_CLASSES];
   char *stack, *argp;
@@ -428,6 +435,8 @@  ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
   reg_args = (struct register_args *) stack;
   argp = stack + sizeof (struct register_args);
 
+  reg_args->r10 = (unsigned long) closure;
+
   gprcount = ssecount = 0;
 
   /* If the return value is passed in memory, add the pointer as the
@@ -488,13 +497,27 @@  ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
 	    }
 	}
     }
+  reg_args->rax = ssecount;
 
   ffi_call_unix64 (stack, cif->bytes + sizeof (struct register_args),
-		   cif->flags, rvalue, fn, ssecount);
+		   cif->flags, rvalue, fn);
 }
 
+void
+ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
+{
+  ffi_call_int (cif, fn, rvalue, avalue, NULL);
+}
+
+void
+ffi_call_go (ffi_cif *cif, void (*fn)(void), void *rvalue,
+	     void **avalue, void *closure)
+{
+  ffi_call_int (cif, fn, rvalue, avalue, closure);
+}
 
-extern void ffi_closure_unix64(void);
+extern void ffi_closure_unix64(void) FFI_HIDDEN;
+extern void ffi_closure_unix64_sse(void) FFI_HIDDEN;
 
 ffi_status
 ffi_prep_closure_loc (ffi_closure* closure,
@@ -503,29 +526,26 @@  ffi_prep_closure_loc (ffi_closure* closure,
 		      void *user_data,
 		      void *codeloc)
 {
-  volatile unsigned short *tramp;
-
-  /* Sanity check on the cif ABI.  */
-  {
-    int abi = cif->abi;
-    if (UNLIKELY (! (abi > FFI_FIRST_ABI && abi < FFI_LAST_ABI)))
-      return FFI_BAD_ABI;
-  }
-
-  tramp = (volatile unsigned short *) &closure->tramp[0];
+  static const unsigned char trampoline[16] = {
+    /* leaq  -0x7(%rip),%r10   # 0x0  */
+    0x4c, 0x8d, 0x15, 0xf9, 0xff, 0xff, 0xff,
+    /* jmpq  *0x3(%rip)        # 0x10 */
+    0xff, 0x25, 0x03, 0x00, 0x00, 0x00,
+    /* nopl  (%rax) */
+    0x0f, 0x1f, 0x00
+  };
+  void (*dest)(void);
 
-  tramp[0] = 0xbb49;		/* mov <code>, %r11	*/
-  *((unsigned long long * volatile) &tramp[1])
-    = (unsigned long) ffi_closure_unix64;
-  tramp[5] = 0xba49;		/* mov <data>, %r10	*/
-  *((unsigned long long * volatile) &tramp[6])
-    = (unsigned long) codeloc;
+  if (cif->abi != FFI_UNIX64)
+    return FFI_BAD_ABI;
 
-  /* Set the carry bit iff the function uses any sse registers.
-     This is clc or stc, together with the first byte of the jmp.  */
-  tramp[10] = cif->flags & (1 << 11) ? 0x49f9 : 0x49f8;
+  if (cif->flags & (1 << 11))
+    dest = ffi_closure_unix64_sse;
+  else
+    dest = ffi_closure_unix64;
 
-  tramp[11] = 0xe3ff;			/* jmp *%r11    */
+  memcpy (closure->tramp, trampoline, sizeof(trampoline));
+  *(UINT64 *)(closure->tramp + 16) = (uintptr_t)dest;
 
   closure->cif = cif;
   closure->fun = fun;
@@ -534,18 +554,20 @@  ffi_prep_closure_loc (ffi_closure* closure,
   return FFI_OK;
 }
 
-int
-ffi_closure_unix64_inner(ffi_closure *closure, void *rvalue,
-			 struct register_args *reg_args, char *argp)
+int FFI_HIDDEN
+ffi_closure_unix64_inner(ffi_cif *cif,
+			 void (*fun)(ffi_cif*, void*, void**, void*),
+			 void *user_data,
+			 void *rvalue,
+			 struct register_args *reg_args,
+			 char *argp)
 {
-  ffi_cif *cif;
   void **avalue;
   ffi_type **arg_types;
   long i, avn;
   int gprcount, ssecount, ngpr, nsse;
   int ret;
 
-  cif = closure->cif;
   avalue = alloca(cif->nargs * sizeof(void *));
   gprcount = ssecount = 0;
 
@@ -634,10 +656,29 @@  ffi_closure_unix64_inner(ffi_closure *closure, void *rvalue,
     }
 
   /* Invoke the closure.  */
-  closure->fun (cif, rvalue, avalue, closure->user_data);
+  fun (cif, rvalue, avalue, user_data);
 
   /* Tell assembly how to perform return type promotions.  */
   return ret;
 }
 
+extern void ffi_go_closure_unix64(void) FFI_HIDDEN;
+extern void ffi_go_closure_unix64_sse(void) FFI_HIDDEN;
+
+ffi_status
+ffi_prep_go_closure (ffi_go_closure* closure, ffi_cif* cif,
+		     void (*fun)(ffi_cif*, void*, void**, void*))
+{
+  if (cif->abi != FFI_UNIX64)
+    return FFI_BAD_ABI;
+
+  closure->tramp = (cif->flags & (1 << 11)
+		    ? ffi_go_closure_unix64_sse
+		    : ffi_go_closure_unix64);
+  closure->cif = cif;
+  closure->fun = fun;
+
+  return FFI_OK;
+}
+
 #endif /* __x86_64__ */
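
The fixed trampoline installed above deserves a word on its rip-relative
displacements: the code bytes are now identical for every closure, with only
the 8-byte destination slot at offset 16 varying, and the old clc/stc carry
trick is replaced by selecting ffi_closure_unix64 vs ffi_closure_unix64_sse.
A small sketch (not part of the patch; the 0x1000 base address is made up)
restating the arithmetic the hex bytes encode:

#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  uintptr_t tramp = 0x1000;	/* stand-in for codeloc */

  /* leaq -0x7(%rip),%r10 occupies bytes 0..6, so %rip after it is
     tramp+7 and %r10 = tramp+7-7 = tramp: the closure finds its own
     address.  */
  uintptr_t r10 = (tramp + 7) - 0x7;

  /* jmpq *0x3(%rip) occupies bytes 7..12, so %rip after it is tramp+13
     and the jump target is fetched from tramp+13+3 = tramp+16, the
     8-byte slot where ffi_prep_closure_loc stores `dest`.  */
  uintptr_t slot = (tramp + 7 + 6) + 0x3;

  printf ("r10 = tramp+%d, jump slot = tramp+%d\n",
	  (int) (r10 - tramp), (int) (slot - tramp));
  return 0;
}
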
diff --git a/libffi/src/x86/ffitarget.h b/libffi/src/x86/ffitarget.h
index 46f294c..592d6f8 100644
--- a/libffi/src/x86/ffitarget.h
+++ b/libffi/src/x86/ffitarget.h
@@ -111,6 +111,8 @@  typedef enum ffi_abi {
 /* ---- Definitions for closures ----------------------------------------- */
 
 #define FFI_CLOSURES 1
+#define FFI_GO_CLOSURES 1
+
 #define FFI_TYPE_SMALL_STRUCT_1B (FFI_TYPE_LAST + 1)
 #define FFI_TYPE_SMALL_STRUCT_2B (FFI_TYPE_LAST + 2)
 #define FFI_TYPE_SMALL_STRUCT_4B (FFI_TYPE_LAST + 3)
diff --git a/libffi/src/x86/unix64.S b/libffi/src/x86/unix64.S
index 7a6619a..3881f51 100644
--- a/libffi/src/x86/unix64.S
+++ b/libffi/src/x86/unix64.S
@@ -41,10 +41,11 @@ 
 
 	.align	2
 	.globl	ffi_call_unix64
+	.hidden	ffi_call_unix64
 	.type	ffi_call_unix64,@function
 
 ffi_call_unix64:
-.LUW0:
+	.cfi_startproc
 	movq	(%rsp), %r10		/* Load return address.  */
 	leaq	(%rdi, %rsi), %rax	/* Find local stack base.  */
 	movq	%rdx, (%rax)		/* Save flags.  */
@@ -52,24 +53,36 @@  ffi_call_unix64:
 	movq	%rbp, 16(%rax)		/* Save old frame pointer.  */
 	movq	%r10, 24(%rax)		/* Relocate return address.  */
 	movq	%rax, %rbp		/* Finalize local stack frame.  */
-.LUW1:
+
+	/* New stack frame based off rbp.  This is an itty bit of unwind
+	   trickery in that the CFA *has* changed.  There is no easy way
+	   to describe it correctly on entry to the function.  Fortunately,
+	   it doesn't matter too much since at all points we can correctly
+	   unwind back to ffi_call.  Note that the location to which we
+	   moved the return address is (the new) CFA-8, so from the
+	   perspective of the unwind info, it hasn't moved.  */
+	.cfi_def_cfa %rbp, 32
+	.cfi_rel_offset %rbp, 16
+
 	movq	%rdi, %r10		/* Save a copy of the register area. */
 	movq	%r8, %r11		/* Save a copy of the target fn.  */
 	movl	%r9d, %eax		/* Set number of SSE registers.  */
 
 	/* Load up all argument registers.  */
 	movq	(%r10), %rdi
-	movq	8(%r10), %rsi
-	movq	16(%r10), %rdx
-	movq	24(%r10), %rcx
-	movq	32(%r10), %r8
-	movq	40(%r10), %r9
+	movq	0x08(%r10), %rsi
+	movq	0x10(%r10), %rdx
+	movq	0x18(%r10), %rcx
+	movq	0x20(%r10), %r8
+	movq	0x28(%r10), %r9
+	movl	0xb0(%r10), %eax
 	testl	%eax, %eax
 	jnz	.Lload_sse
 .Lret_from_load_sse:
 
-	/* Deallocate the reg arg area.  */
-	leaq	176(%r10), %rsp
+	/* Deallocate the reg arg area, except for r10, then load via pop.  */
+	leaq	0xb8(%r10), %rsp
+	popq	%r10
 
 	/* Call the user function.  */
 	call	*%r11
@@ -80,7 +93,9 @@  ffi_call_unix64:
 	movq	0(%rbp), %rcx		/* Reload flags.  */
 	movq	8(%rbp), %rdi		/* Reload raddr.  */
 	movq	16(%rbp), %rbp		/* Reload old frame pointer.  */
-.LUW2:
+	.cfi_remember_state
+	.cfi_def_cfa %rsp, 8
+	.cfi_restore %rbp
 
 	/* The first byte of the flags contains the FFI_TYPE.  */
 	movzbl	%cl, %r10d
@@ -89,6 +104,8 @@  ffi_call_unix64:
 	addq	%r11, %r10
 	jmp	*%r10
 
+	.section .rodata
+	.align	2
 .Lstore_table:
 	.long	.Lst_void-.Lstore_table		/* FFI_TYPE_VOID */
 	.long	.Lst_sint32-.Lstore_table	/* FFI_TYPE_INT */
@@ -105,6 +122,7 @@  ffi_call_unix64:
 	.long	.Lst_int64-.Lstore_table	/* FFI_TYPE_SINT64 */
 	.long	.Lst_struct-.Lstore_table	/* FFI_TYPE_STRUCT */
 	.long	.Lst_int64-.Lstore_table	/* FFI_TYPE_POINTER */
+	.previous
 
 	.align 2
 .Lst_void:
@@ -187,49 +205,83 @@  ffi_call_unix64:
 	   It's not worth an indirect jump to load the exact set of
 	   SSE registers needed; zero or all is a good compromise.  */
 	.align 2
-.LUW3:
+	.cfi_restore_state
 .Lload_sse:
-	movdqa	48(%r10), %xmm0
-	movdqa	64(%r10), %xmm1
-	movdqa	80(%r10), %xmm2
-	movdqa	96(%r10), %xmm3
-	movdqa	112(%r10), %xmm4
-	movdqa	128(%r10), %xmm5
-	movdqa	144(%r10), %xmm6
-	movdqa	160(%r10), %xmm7
+	movdqa	0x30(%r10), %xmm0
+	movdqa	0x40(%r10), %xmm1
+	movdqa	0x50(%r10), %xmm2
+	movdqa	0x60(%r10), %xmm3
+	movdqa	0x70(%r10), %xmm4
+	movdqa	0x80(%r10), %xmm5
+	movdqa	0x90(%r10), %xmm6
+	movdqa	0xa0(%r10), %xmm7
 	jmp	.Lret_from_load_sse
 
-.LUW4:
+	.cfi_endproc
 	.size    ffi_call_unix64,.-ffi_call_unix64
 
+/* 6 general registers, 8 vector registers,
+   16 bytes of rvalue, 8 bytes of alignment.  */
+#define ffi_closure_OFS_G	0
+#define ffi_closure_OFS_V	(6*8)
+#define ffi_closure_OFS_RVALUE	(ffi_closure_OFS_V + 8*16)
+#define ffi_closure_FS		(ffi_closure_OFS_RVALUE + 16 + 8)
+
+/* The location of rvalue within the red zone after deallocating the frame.  */
+#define ffi_closure_RED_RVALUE	(ffi_closure_OFS_RVALUE - ffi_closure_FS)
+
+	.align	2
+	.globl	ffi_closure_unix64_sse
+	.hidden	ffi_closure_unix64_sse
+	.type	ffi_closure_unix64_sse,@function
+
+ffi_closure_unix64_sse:
+	.cfi_startproc
+	subq	$ffi_closure_FS, %rsp
+	.cfi_adjust_cfa_offset ffi_closure_FS
+
+	movdqa	%xmm0, ffi_closure_OFS_V+0x00(%rsp)
+	movdqa	%xmm1, ffi_closure_OFS_V+0x10(%rsp)
+	movdqa	%xmm2, ffi_closure_OFS_V+0x20(%rsp)
+	movdqa	%xmm3, ffi_closure_OFS_V+0x30(%rsp)
+	movdqa	%xmm4, ffi_closure_OFS_V+0x40(%rsp)
+	movdqa	%xmm5, ffi_closure_OFS_V+0x50(%rsp)
+	movdqa	%xmm6, ffi_closure_OFS_V+0x60(%rsp)
+	movdqa	%xmm7, ffi_closure_OFS_V+0x70(%rsp)
+	jmp	0f
+
+	.cfi_endproc
+	.size	ffi_closure_unix64_sse,.-ffi_closure_unix64_sse
+
 	.align	2
-	.globl ffi_closure_unix64
+	.globl	ffi_closure_unix64
+	.hidden	ffi_closure_unix64
 	.type	ffi_closure_unix64,@function
 
 ffi_closure_unix64:
-.LUW5:
-	/* The carry flag is set by the trampoline iff SSE registers
-	   are used.  Don't clobber it before the branch instruction.  */
-	leaq    -200(%rsp), %rsp
-.LUW6:
-	movq	%rdi, (%rsp)
-	movq    %rsi, 8(%rsp)
-	movq    %rdx, 16(%rsp)
-	movq    %rcx, 24(%rsp)
-	movq    %r8, 32(%rsp)
-	movq    %r9, 40(%rsp)
-	jc      .Lsave_sse
-.Lret_from_save_sse:
-
-	movq	%r10, %rdi
-	leaq	176(%rsp), %rsi
-	movq	%rsp, %rdx
-	leaq	208(%rsp), %rcx
-	call	ffi_closure_unix64_inner@PLT
+	.cfi_startproc
+	subq	$ffi_closure_FS, %rsp
+	.cfi_adjust_cfa_offset ffi_closure_FS
+0:
+	movq	%rdi, ffi_closure_OFS_G+0x00(%rsp)
+	movq    %rsi, ffi_closure_OFS_G+0x08(%rsp)
+	movq    %rdx, ffi_closure_OFS_G+0x10(%rsp)
+	movq    %rcx, ffi_closure_OFS_G+0x18(%rsp)
+	movq    %r8,  ffi_closure_OFS_G+0x20(%rsp)
+	movq    %r9,  ffi_closure_OFS_G+0x28(%rsp)
+
+	movq	24(%r10), %rdi				/* Load cif */
+	movq	32(%r10), %rsi				/* Load fun */
+	movq	40(%r10), %rdx				/* Load user_data */
+.Ldo_closure:
+	leaq	ffi_closure_OFS_RVALUE(%rsp), %rcx	/* Load rvalue */
+	movq	%rsp, %r8				/* Load reg_args */
+	leaq	ffi_closure_FS+8(%rsp), %r9		/* Load argp */
+	call	ffi_closure_unix64_inner
 
 	/* Deallocate stack frame early; return value is now in redzone.  */
-	addq	$200, %rsp
-.LUW7:
+	addq	$ffi_closure_FS, %rsp
+	.cfi_adjust_cfa_offset -ffi_closure_FS
 
 	/* The first byte of the return value contains the FFI_TYPE.  */
 	movzbl	%al, %r10d
@@ -238,6 +290,8 @@  ffi_closure_unix64:
 	addq	%r11, %r10
 	jmp	*%r10
 
+	.section .rodata
+	.align	2
 .Lload_table:
 	.long	.Lld_void-.Lload_table		/* FFI_TYPE_VOID */
 	.long	.Lld_int32-.Lload_table		/* FFI_TYPE_INT */
@@ -254,6 +308,7 @@  ffi_closure_unix64:
 	.long	.Lld_int64-.Lload_table		/* FFI_TYPE_SINT64 */
 	.long	.Lld_struct-.Lload_table	/* FFI_TYPE_STRUCT */
 	.long	.Lld_int64-.Lload_table		/* FFI_TYPE_POINTER */
+	.previous
 
 	.align 2
 .Lld_void:
@@ -261,32 +316,32 @@  ffi_closure_unix64:
 
 	.align 2
 .Lld_int8:
-	movzbl	-24(%rsp), %eax
+	movzbl	ffi_closure_RED_RVALUE(%rsp), %eax
 	ret
 	.align 2
 .Lld_int16:
-	movzwl	-24(%rsp), %eax
+	movzwl	ffi_closure_RED_RVALUE(%rsp), %eax
 	ret
 	.align 2
 .Lld_int32:
-	movl	-24(%rsp), %eax
+	movl	ffi_closure_RED_RVALUE(%rsp), %eax
 	ret
 	.align 2
 .Lld_int64:
-	movq	-24(%rsp), %rax
+	movq	ffi_closure_RED_RVALUE(%rsp), %rax
 	ret
 
 	.align 2
 .Lld_float:
-	movss	-24(%rsp), %xmm0
+	movss	ffi_closure_RED_RVALUE(%rsp), %xmm0
 	ret
 	.align 2
 .Lld_double:
-	movsd	-24(%rsp), %xmm0
+	movsd	ffi_closure_RED_RVALUE(%rsp), %xmm0
 	ret
 	.align 2
 .Lld_ldouble:
-	fldt	-24(%rsp)
+	fldt	ffi_closure_RED_RVALUE(%rsp)
 	ret
 
 	.align 2
@@ -296,131 +351,69 @@  ffi_closure_unix64:
 	   both rdx and xmm1 with the second word.  For the remaining,
 	   bit 8 set means xmm0 gets the second word, and bit 9 means
 	   that rax gets the second word.  */
-	movq	-24(%rsp), %rcx
-	movq	-16(%rsp), %rdx
-	movq	-16(%rsp), %xmm1
+	movq	ffi_closure_RED_RVALUE(%rsp), %rcx
+	movq	ffi_closure_RED_RVALUE+8(%rsp), %rdx
+	movq	ffi_closure_RED_RVALUE+8(%rsp), %xmm1
 	testl	$0x100, %eax
 	cmovnz	%rdx, %rcx
 	movd	%rcx, %xmm0
 	testl	$0x200, %eax
-	movq	-24(%rsp), %rax
+	movq	ffi_closure_RED_RVALUE(%rsp), %rax
 	cmovnz	%rdx, %rax
 	ret
 
-	/* See the comment above .Lload_sse; the same logic applies here.  */
-	.align 2
-.LUW8:
-.Lsave_sse:
-	movdqa	%xmm0, 48(%rsp)
-	movdqa	%xmm1, 64(%rsp)
-	movdqa	%xmm2, 80(%rsp)
-	movdqa	%xmm3, 96(%rsp)
-	movdqa	%xmm4, 112(%rsp)
-	movdqa	%xmm5, 128(%rsp)
-	movdqa	%xmm6, 144(%rsp)
-	movdqa	%xmm7, 160(%rsp)
-	jmp	.Lret_from_save_sse
-
-.LUW9:
+	.cfi_endproc
 	.size	ffi_closure_unix64,.-ffi_closure_unix64
 
-#ifdef HAVE_AS_X86_64_UNWIND_SECTION_TYPE
-	.section	.eh_frame,"a",@unwind
-#else
-	.section	.eh_frame,"a",@progbits
-#endif
-.Lframe1:
-	.long	.LECIE1-.LSCIE1		/* CIE Length */
-.LSCIE1:
-	.long	0			/* CIE Identifier Tag */
-	.byte	1			/* CIE Version */
-	.ascii "zR\0"			/* CIE Augmentation */
-	.uleb128 1			/* CIE Code Alignment Factor */
-	.sleb128 -8			/* CIE Data Alignment Factor */
-	.byte	0x10			/* CIE RA Column */
-	.uleb128 1			/* Augmentation size */
-	.byte	0x1b			/* FDE Encoding (pcrel sdata4) */
-	.byte	0xc			/* DW_CFA_def_cfa, %rsp offset 8 */
-	.uleb128 7
-	.uleb128 8
-	.byte	0x80+16			/* DW_CFA_offset, %rip offset 1*-8 */
-	.uleb128 1
-	.align 8
-.LECIE1:
-.LSFDE1:
-	.long	.LEFDE1-.LASFDE1	/* FDE Length */
-.LASFDE1:
-	.long	.LASFDE1-.Lframe1	/* FDE CIE offset */
-#if HAVE_AS_X86_PCREL
-	.long	.LUW0-.			/* FDE initial location */
-#else
-	.long	.LUW0@rel
-#endif
-	.long	.LUW4-.LUW0		/* FDE address range */
-	.uleb128 0x0			/* Augmentation size */
-
-	.byte	0x4			/* DW_CFA_advance_loc4 */
-	.long	.LUW1-.LUW0
-
-	/* New stack frame based off rbp.  This is a itty bit of unwind
-	   trickery in that the CFA *has* changed.  There is no easy way
-	   to describe it correctly on entry to the function.  Fortunately,
-	   it doesn't matter too much since at all points we can correctly
-	   unwind back to ffi_call.  Note that the location to which we
-	   moved the return address is (the new) CFA-8, so from the
-	   perspective of the unwind info, it hasn't moved.  */
-	.byte	0xc			/* DW_CFA_def_cfa, %rbp offset 32 */
-	.uleb128 6
-	.uleb128 32
-	.byte	0x80+6			/* DW_CFA_offset, %rbp offset 2*-8 */
-	.uleb128 2
-	.byte	0xa			/* DW_CFA_remember_state */
-
-	.byte	0x4			/* DW_CFA_advance_loc4 */
-	.long	.LUW2-.LUW1
-	.byte	0xc			/* DW_CFA_def_cfa, %rsp offset 8 */
-	.uleb128 7
-	.uleb128 8
-	.byte	0xc0+6			/* DW_CFA_restore, %rbp */
-
-	.byte	0x4			/* DW_CFA_advance_loc4 */
-	.long	.LUW3-.LUW2
-	.byte	0xb			/* DW_CFA_restore_state */
-
-	.align 8
-.LEFDE1:
-.LSFDE3:
-	.long	.LEFDE3-.LASFDE3	/* FDE Length */
-.LASFDE3:
-	.long	.LASFDE3-.Lframe1	/* FDE CIE offset */
-#if HAVE_AS_X86_PCREL
-	.long	.LUW5-.			/* FDE initial location */
-#else
-	.long	.LUW5@rel
-#endif
-	.long	.LUW9-.LUW5		/* FDE address range */
-	.uleb128 0x0			/* Augmentation size */
-
-	.byte	0x4			/* DW_CFA_advance_loc4 */
-	.long	.LUW6-.LUW5
-	.byte	0xe			/* DW_CFA_def_cfa_offset */
-	.uleb128 208
-	.byte	0xa			/* DW_CFA_remember_state */
-
-	.byte	0x4			/* DW_CFA_advance_loc4 */
-	.long	.LUW7-.LUW6
-	.byte	0xe			/* DW_CFA_def_cfa_offset */
-	.uleb128 8
-
-	.byte	0x4			/* DW_CFA_advance_loc4 */
-	.long	.LUW8-.LUW7
-	.byte	0xb			/* DW_CFA_restore_state */
-
-	.align 8
-.LEFDE3:
+	.align	2
+	.globl	ffi_go_closure_unix64_sse
+	.hidden	ffi_go_closure_unix64_sse
+	.type	ffi_go_closure_unix64_sse,@function
+
+ffi_go_closure_unix64_sse:
+	.cfi_startproc
+	subq	$ffi_closure_FS, %rsp
+	.cfi_adjust_cfa_offset ffi_closure_FS
+
+	movdqa	%xmm0, ffi_closure_OFS_V+0x00(%rsp)
+	movdqa	%xmm1, ffi_closure_OFS_V+0x10(%rsp)
+	movdqa	%xmm2, ffi_closure_OFS_V+0x20(%rsp)
+	movdqa	%xmm3, ffi_closure_OFS_V+0x30(%rsp)
+	movdqa	%xmm4, ffi_closure_OFS_V+0x40(%rsp)
+	movdqa	%xmm5, ffi_closure_OFS_V+0x50(%rsp)
+	movdqa	%xmm6, ffi_closure_OFS_V+0x60(%rsp)
+	movdqa	%xmm7, ffi_closure_OFS_V+0x70(%rsp)
+	jmp	0f
+
+	.cfi_endproc
+	.size	ffi_go_closure_unix64_sse,.-ffi_go_closure_unix64_sse
 
-#endif /* __x86_64__ */
+	.align	2
+	.globl	ffi_go_closure_unix64
+	.hidden	ffi_go_closure_unix64
+	.type	ffi_go_closure_unix64,@function
+
+ffi_go_closure_unix64:
+	.cfi_startproc
+	subq	$ffi_closure_FS, %rsp
+	.cfi_adjust_cfa_offset ffi_closure_FS
+0:
+	movq	%rdi, ffi_closure_OFS_G+0x00(%rsp)
+	movq    %rsi, ffi_closure_OFS_G+0x08(%rsp)
+	movq    %rdx, ffi_closure_OFS_G+0x10(%rsp)
+	movq    %rcx, ffi_closure_OFS_G+0x18(%rsp)
+	movq    %r8,  ffi_closure_OFS_G+0x20(%rsp)
+	movq    %r9,  ffi_closure_OFS_G+0x28(%rsp)
+
+	movq	8(%r10), %rdi		/* Load cif */
+	movq	16(%r10), %rsi		/* Load fun */
+	movq	%r10, %rdx		/* Load closure (user_data) */
+	jmp	.Ldo_closure
+
+	.cfi_endproc
+	.size	ffi_go_closure_unix64,.-ffi_go_closure_unix64
 
 #if defined __ELF__ && defined __linux__
 	.section	.note.GNU-stack,"",@progbits
 #endif
+#endif /* x86_64 */
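
One more sketch (assumed layouts, not part of the patch) of why
ffi_closure_unix64 reads cif/fun/user_data at 24/32/40(%r10) while
ffi_go_closure_unix64 reads them at 8/16(%r10) and passes %r10 itself as
user_data: the trampoline leaves the closure's own address in %r10, and the
two closure structs simply lay their fields out differently.  The stand-in
types below mirror what ffi.h declares on x86_64 (an FFI_TRAMPOLINE_SIZE of
24 is assumed); main () restates the frame-size arithmetic behind the
ffi_closure_FS macro in unix64.S.

#include <stddef.h>

/* Stand-in for ffi_closure: 24 bytes of trampoline code+data, then the
   fields the assembly reads relative to %r10.  */
typedef struct {
  char tramp[24];
  void *cif;		/* 24(%r10) */
  void *fun;		/* 32(%r10) */
  void *user_data;	/* 40(%r10) */
} closure_layout;

/* Stand-in for ffi_go_closure: no code at all, just a pointer to
   ffi_go_closure_unix64[_sse] plus cif and fun; the closure address in
   %r10 doubles as user_data.  */
typedef struct {
  void *tramp;		/*  0(%r10) */
  void *cif;		/*  8(%r10) */
  void *fun;		/* 16(%r10) */
} go_closure_layout;

_Static_assert (offsetof (closure_layout, cif) == 24, "cif at 24(%r10)");
_Static_assert (offsetof (closure_layout, user_data) == 40, "user_data at 40(%r10)");
_Static_assert (offsetof (go_closure_layout, fun) == 16, "fun at 16(%r10)");

int
main (void)
{
  /* Frame used by the closure entry points: 6 GPRs + 8 XMM registers +
     a 16-byte rvalue + 8 bytes of padding = 200 bytes, and 200 % 16 == 8
     re-aligns %rsp to 16 after the call-pushed return address, as the
     movdqa saves at ffi_closure_OFS_V require.  */
  int fs = 6*8 + 8*16 + 16 + 8;
  return !(fs == 200 && fs % 16 == 8);
}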