diff mbox

[12/13] libffi: Rewrite i386 sysv

Message ID 1412973773-3942-13-git-send-email-rth@redhat.com
State New
Headers show

Commit Message

Richard Henderson Oct. 10, 2014, 8:42 p.m. UTC
(1) Invent a new "internal.h" rather than polluting the public ffitarget.h
    with stuff that ought not be exposed.

(2) Reduce the ifdefs to a minimum.  Support the windows and sysv abis at
    the same time.  After all, it's possible to write functions for any of
    these abis with gcc at any time with attributes.

(3) The Win64 port should be (but hasn't been) moved to ffi64.c so that we
    can call between the abis on unix too.  Again, one can always use
    attributes.

(4) Don't use the out-dated prep_args callback form for ffi_call.

(5) Assume gas .cfi directives.  The pic/non-pic paths are IMO impossible
    to maintain with hand-written unwind info.

---
 libffi/src/x86/ffi.c       | 1097 ++++++++++++++++----------------------------
 libffi/src/x86/ffitarget.h |  112 ++---
 libffi/src/x86/internal.h  |   48 ++
 libffi/src/x86/sysv.S      |  932 +++++++++++++++++++++----------------
 4 files changed, 1049 insertions(+), 1140 deletions(-)
 create mode 100644 libffi/src/x86/internal.h
diff mbox

Patch

diff --git a/libffi/src/x86/ffi.c b/libffi/src/x86/ffi.c
index 6338de2..e3f82ef 100644
--- a/libffi/src/x86/ffi.c
+++ b/libffi/src/x86/ffi.c
@@ -28,620 +28,261 @@ 
    DEALINGS IN THE SOFTWARE.
    ----------------------------------------------------------------------- */
 
-#if !defined(__x86_64__) || defined(_WIN64) || defined(__CYGWIN__)
-
-#ifdef _WIN64
-#include <windows.h>
-#endif
+#ifndef __x86_64__
 
+#include <stdlib.h>
 #include <ffi.h>
 #include <ffi_common.h>
+#include "internal.h"
 
-#include <stdlib.h>
 
-/* ffi_prep_args is called by the assembly routine once stack space
-   has been allocated for the function's arguments */
-
-void ffi_prep_args(char *stack, extended_cif *ecif)
+/* Machine dependent cif processing: check the ABI, set cif->flags and cif->bytes.  */
+ffi_status FFI_HIDDEN
+ffi_prep_cif_machdep(ffi_cif *cif)
 {
-  register unsigned int i;
-  register void **p_argv;
-  register char *argp;
-  register ffi_type **p_arg;
-#ifdef X86_WIN32
-  size_t p_stack_args[2];
-  void *p_stack_data[2];
-  char *argp2 = stack;
-  int stack_args_count = 0;
-  int cabi = ecif->cif->abi;
-#endif
-
-  argp = stack;
-
-  if ((ecif->cif->flags == FFI_TYPE_STRUCT
-       || ecif->cif->flags == FFI_TYPE_MS_STRUCT)
-#ifdef X86_WIN64
-      && (ecif->cif->rtype->size != 1 && ecif->cif->rtype->size != 2
-          && ecif->cif->rtype->size != 4 && ecif->cif->rtype->size != 8)
-#endif
-      )
-    {
-      *(void **) argp = ecif->rvalue;
-#ifdef X86_WIN32
-      /* For fastcall/thiscall this is first register-passed
-         argument.  */
-      if (cabi == FFI_THISCALL || cabi == FFI_FASTCALL)
-	{
-	  p_stack_args[stack_args_count] = sizeof (void*);
-	  p_stack_data[stack_args_count] = argp;
-	  ++stack_args_count;
-	}
-#endif
-      argp += sizeof(void*);
-    }
+  size_t bytes = 0;
+  int i, n, flags, cabi = cif->abi;
 
-  p_argv = ecif->avalue;
-
-  for (i = ecif->cif->nargs, p_arg = ecif->cif->arg_types;
-       i != 0;
-       i--, p_arg++)
+  switch (cabi)
     {
-      size_t z;
-
-      /* Align if necessary */
-      if ((sizeof(void*) - 1) & (size_t) argp)
-        argp = (char *) ALIGN(argp, sizeof(void*));
-
-      z = (*p_arg)->size;
-#ifdef X86_WIN64
-      if (z > sizeof(ffi_arg)
-          || ((*p_arg)->type == FFI_TYPE_STRUCT
-              && (z != 1 && z != 2 && z != 4 && z != 8))
-#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
-          || ((*p_arg)->type == FFI_TYPE_LONGDOUBLE)
-#endif
-          )
-        {
-          z = sizeof(ffi_arg);
-          *(void **)argp = *p_argv;
-        }
-      else if ((*p_arg)->type == FFI_TYPE_FLOAT)
-        {
-          memcpy(argp, *p_argv, z);
-        }
-      else
-#endif
-      if (z < sizeof(ffi_arg))
-        {
-          z = sizeof(ffi_arg);
-          switch ((*p_arg)->type)
-            {
-            case FFI_TYPE_SINT8:
-              *(ffi_sarg *) argp = (ffi_sarg)*(SINT8 *)(* p_argv);
-              break;
-
-            case FFI_TYPE_UINT8:
-              *(ffi_arg *) argp = (ffi_arg)*(UINT8 *)(* p_argv);
-              break;
-
-            case FFI_TYPE_SINT16:
-              *(ffi_sarg *) argp = (ffi_sarg)*(SINT16 *)(* p_argv);
-              break;
-
-            case FFI_TYPE_UINT16:
-              *(ffi_arg *) argp = (ffi_arg)*(UINT16 *)(* p_argv);
-              break;
-
-            case FFI_TYPE_SINT32:
-              *(ffi_sarg *) argp = (ffi_sarg)*(SINT32 *)(* p_argv);
-              break;
-
-            case FFI_TYPE_UINT32:
-              *(ffi_arg *) argp = (ffi_arg)*(UINT32 *)(* p_argv);
-              break;
-
-            case FFI_TYPE_STRUCT:
-              *(ffi_arg *) argp = *(ffi_arg *)(* p_argv);
-              break;
-
-            default:
-              FFI_ASSERT(0);
-            }
-        }
-      else
-        {
-          memcpy(argp, *p_argv, z);
-        }
-
-#ifdef X86_WIN32
-    /* For thiscall/fastcall convention register-passed arguments
-       are the first two none-floating-point arguments with a size
-       smaller or equal to sizeof (void*).  */
-    if ((cabi == FFI_THISCALL && stack_args_count < 1)
-        || (cabi == FFI_FASTCALL && stack_args_count < 2))
-      {
-	if (z <= 4
-	    && ((*p_arg)->type != FFI_TYPE_FLOAT
-	        && (*p_arg)->type != FFI_TYPE_STRUCT))
-	  {
-	    p_stack_args[stack_args_count] = z;
-	    p_stack_data[stack_args_count] = argp;
-	    ++stack_args_count;
-	  }
-      }
-#endif
-      p_argv++;
-#ifdef X86_WIN64
-      argp += (z + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
-#else
-      argp += z;
-#endif
-    }
-
-#ifdef X86_WIN32
-  /* We need to move the register-passed arguments for thiscall/fastcall
-     on top of stack, so that those can be moved to registers ecx/edx by
-     call-handler.  */
-  if (stack_args_count > 0)
-    {
-      size_t zz = (p_stack_args[0] + 3) & ~3;
-      char *h;
-
-      /* Move first argument to top-stack position.  */
-      if (p_stack_data[0] != argp2)
-	{
-	  h = alloca (zz + 1);
-	  memcpy (h, p_stack_data[0], zz);
-	  memmove (argp2 + zz, argp2,
-	           (size_t) ((char *) p_stack_data[0] - (char*)argp2));
-	  memcpy (argp2, h, zz);
-	}
-
-      argp2 += zz;
-      --stack_args_count;
-      if (zz > 4)
-	stack_args_count = 0;
-
-      /* If we have a second argument, then move it on top
-         after the first one.  */
-      if (stack_args_count > 0 && p_stack_data[1] != argp2)
-	{
-	  zz = p_stack_args[1];
-	  zz = (zz + 3) & ~3;
-	  h = alloca (zz + 1);
-	  h = alloca (zz + 1);
-	  memcpy (h, p_stack_data[1], zz);
-	  memmove (argp2 + zz, argp2, (size_t) ((char*) p_stack_data[1] - (char*)argp2));
-	  memcpy (argp2, h, zz);
-	}
+    case FFI_SYSV:
+    case FFI_STDCALL:
+    case FFI_THISCALL:
+    case FFI_FASTCALL:
+    case FFI_MS_CDECL:
+      break;
+    default:
+      return FFI_BAD_ABI;
     }
-#endif
-  return;
-}
-
-/* Perform machine dependent cif processing */
-ffi_status ffi_prep_cif_machdep(ffi_cif *cif)
-{
-  unsigned int i;
-  ffi_type **ptr;
 
-  /* Set the return type flag */
   switch (cif->rtype->type)
     {
     case FFI_TYPE_VOID:
-    case FFI_TYPE_UINT8:
-    case FFI_TYPE_UINT16:
-    case FFI_TYPE_SINT8:
-    case FFI_TYPE_SINT16:
-#ifdef X86_WIN64
-    case FFI_TYPE_UINT32:
+      flags = X86_RET_VOID;
+      break;
+    case FFI_TYPE_INT:
     case FFI_TYPE_SINT32:
-#endif
-    case FFI_TYPE_SINT64:
+    case FFI_TYPE_UINT32:
+    case FFI_TYPE_POINTER:
+      flags = X86_RET_INT32;
+      break;
     case FFI_TYPE_FLOAT:
+      flags = X86_RET_FLOAT;
+      break;
     case FFI_TYPE_DOUBLE:
-#ifndef X86_WIN64
-#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
+      flags = X86_RET_DOUBLE;
+      break;
     case FFI_TYPE_LONGDOUBLE:
-#endif
-#endif
-      cif->flags = (unsigned) cif->rtype->type;
+      flags = X86_RET_LDOUBLE;
+      break;
+    case FFI_TYPE_UINT8:
+      flags = X86_RET_UINT8;
+      break;
+    case FFI_TYPE_SINT8:
+      flags = X86_RET_SINT8;
+      break;
+    case FFI_TYPE_UINT16:
+      flags = X86_RET_UINT16;
+      break;
+    case FFI_TYPE_SINT16:
+      flags = X86_RET_SINT16;
       break;
-
     case FFI_TYPE_UINT64:
-#ifdef X86_WIN64
-    case FFI_TYPE_POINTER:
-#endif
-      cif->flags = FFI_TYPE_SINT64;
+    case FFI_TYPE_SINT64:
+      flags = X86_RET_INT64;
       break;
-
     case FFI_TYPE_STRUCT:
-#ifndef X86
-      if (cif->rtype->size == 1)
-        {
-          cif->flags = FFI_TYPE_SMALL_STRUCT_1B; /* same as char size */
-        }
-      else if (cif->rtype->size == 2)
-        {
-          cif->flags = FFI_TYPE_SMALL_STRUCT_2B; /* same as short size */
-        }
-      else if (cif->rtype->size == 4)
-        {
-#ifdef X86_WIN64
-          cif->flags = FFI_TYPE_SMALL_STRUCT_4B;
-#else
-          cif->flags = FFI_TYPE_INT; /* same as int type */
-#endif
-        }
-      else if (cif->rtype->size == 8)
-        {
-          cif->flags = FFI_TYPE_SINT64; /* same as int64 type */
-        }
-      else
-#endif
-        {
-#ifdef X86_WIN32
-          if (cif->abi == FFI_MS_CDECL)
-            cif->flags = FFI_TYPE_MS_STRUCT;
-          else
-#endif
-            cif->flags = FFI_TYPE_STRUCT;
-          /* allocate space for return value pointer */
-          cif->bytes += ALIGN(sizeof(void*), FFI_SIZEOF_ARG);
-        }
+      switch (cabi)
+	{
+	case FFI_THISCALL:
+	case FFI_FASTCALL:
+	case FFI_MS_CDECL:
+	  flags = X86_RET_STRUCTECX;
+	  break;
+	default:
+	  flags = X86_RET_STRUCTPOP;
+	  break;
+	}
+      /* Allocate space for return value pointer.  */
+      bytes += ALIGN(sizeof(void*), FFI_SIZEOF_ARG);
       break;
-
     default:
-#ifdef X86_WIN64
-      cif->flags = FFI_TYPE_SINT64;
-      break;
-    case FFI_TYPE_INT:
-      cif->flags = FFI_TYPE_SINT32;
-#else
-      cif->flags = FFI_TYPE_INT;
-#endif
-      break;
+      abort();
     }
+  cif->flags = flags;
 
-  for (ptr = cif->arg_types, i = cif->nargs; i > 0; i--, ptr++)
+  for (i = 0, n = cif->nargs; i < n; ++i)
     {
-      if (((*ptr)->alignment - 1) & cif->bytes)
-        cif->bytes = ALIGN(cif->bytes, (*ptr)->alignment);
-      cif->bytes += ALIGN((*ptr)->size, FFI_SIZEOF_ARG);
-    }
-
-#ifdef X86_WIN64
-  /* ensure space for storing four registers */
-  cif->bytes += 4 * sizeof(ffi_arg);
-#endif
+      ffi_type *t = cif->arg_types[i];
 
-#ifdef X86_DARWIN
-  cif->bytes = (cif->bytes + 15) & ~0xF;
-#endif
+      bytes = ALIGN (bytes, t->alignment) + ALIGN (t->size, FFI_SIZEOF_ARG);
+    }
+  cif->bytes = bytes;	/* stack size read back by ffi_call and stdcall closures */
 
   return FFI_OK;
 }
 
-#ifdef X86_WIN64
-extern int
-ffi_call_win64(void (*)(char *, extended_cif *), extended_cif *,
-               unsigned, unsigned, unsigned *, void (*fn)(void));
-#elif defined(X86_WIN32)
-extern void
-ffi_call_win32(void (*)(char *, extended_cif *), extended_cif *,
-               unsigned, unsigned, unsigned, unsigned *, void (*fn)(void));
-#else
-extern void ffi_call_SYSV(void (*)(char *, extended_cif *), extended_cif *,
-                          unsigned, unsigned, unsigned *, void (*fn)(void));
-#endif
-
-void ffi_call(ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
+static ffi_arg
+extend_basic_type(void *arg, int type)	/* load *ARG as ffi type code TYPE, widened to ffi_arg */
 {
-  extended_cif ecif;
-
-  ecif.cif = cif;
-  ecif.avalue = avalue;
-  
-  /* If the return value is a struct and we don't have a return */
-  /* value address then we need to make one                     */
-
-#ifdef X86_WIN64
-  if (rvalue == NULL
-      && cif->flags == FFI_TYPE_STRUCT
-      && cif->rtype->size != 1 && cif->rtype->size != 2
-      && cif->rtype->size != 4 && cif->rtype->size != 8)
+  switch (type)
     {
-      ecif.rvalue = alloca((cif->rtype->size + 0xF) & ~0xF);
-    }
-#else
-  if (rvalue == NULL
-      && (cif->flags == FFI_TYPE_STRUCT
-          || cif->flags == FFI_TYPE_MS_STRUCT))
-    {
-      ecif.rvalue = alloca(cif->rtype->size);
-    }
-#endif
-  else
-    ecif.rvalue = rvalue;
-    
-  
-  switch (cif->abi) 
-    {
-#ifdef X86_WIN64
-    case FFI_WIN64:
-      ffi_call_win64(ffi_prep_args, &ecif, cif->bytes,
-                     cif->flags, ecif.rvalue, fn);
-      break;
-#elif defined(X86_WIN32)
-    case FFI_SYSV:
-    case FFI_STDCALL:
-    case FFI_MS_CDECL:
-      ffi_call_win32(ffi_prep_args, &ecif, cif->abi, cif->bytes, cif->flags,
-		     ecif.rvalue, fn);
-      break;
-    case FFI_THISCALL:
-    case FFI_FASTCALL:
-      {
-	unsigned int abi = cif->abi;
-	unsigned int i, passed_regs = 0;
-
-	if (cif->flags == FFI_TYPE_STRUCT)
-	  ++passed_regs;
+    case FFI_TYPE_SINT8:
+      return *(SINT8 *)arg;	/* the narrow cases rely on C conversion to ffi_arg */
+    case FFI_TYPE_UINT8:
+      return *(UINT8 *)arg;
+    case FFI_TYPE_SINT16:
+      return *(SINT16 *)arg;
+    case FFI_TYPE_UINT16:
+      return *(UINT16 *)arg;
 
-	for (i=0; i < cif->nargs && passed_regs < 2;i++)
-	  {
-	    size_t sz;
+    case FFI_TYPE_SINT32:
+    case FFI_TYPE_UINT32:
+    case FFI_TYPE_POINTER:
+    case FFI_TYPE_FLOAT:
+      return *(UINT32 *)arg;	/* raw 32-bit image; a float is copied bit-for-bit, not converted */
 
-	    if (cif->arg_types[i]->type == FFI_TYPE_FLOAT
-	        || cif->arg_types[i]->type == FFI_TYPE_STRUCT)
-	      continue;
-	    sz = (cif->arg_types[i]->size + 3) & ~3;
-	    if (sz == 0 || sz > 4)
-	      continue;
-	    ++passed_regs;
-	  }
-	if (passed_regs < 2 && abi == FFI_FASTCALL)
-	  abi = FFI_THISCALL;
-	if (passed_regs < 1 && abi == FFI_THISCALL)
-	  abi = FFI_STDCALL;
-        ffi_call_win32(ffi_prep_args, &ecif, abi, cif->bytes, cif->flags,
-                       ecif.rvalue, fn);
-      }
-      break;
-#else
-    case FFI_SYSV:
-      ffi_call_SYSV(ffi_prep_args, &ecif, cif->bytes, cif->flags, ecif.rvalue,
-                    fn);
-      break;
-#endif
     default:
-      FFI_ASSERT(0);
-      break;
+      abort();	/* no other type code is handled here (ffi_call filters out structs and wide types) */
     }
 }
 
-
-/** private members **/
-
-/* The following __attribute__((regparm(1))) decorations will have no effect
-   on MSVC - standard cdecl convention applies. */
-static void ffi_prep_incoming_args_SYSV (char *stack, void **ret,
-                                         void** args, ffi_cif* cif);
-void FFI_HIDDEN ffi_closure_SYSV (ffi_closure *)
-     __attribute__ ((regparm(1)));
-unsigned int FFI_HIDDEN ffi_closure_SYSV_inner (ffi_closure *, void **, void *)
-     __attribute__ ((regparm(1)));
-void FFI_HIDDEN ffi_closure_raw_SYSV (ffi_raw_closure *)
-     __attribute__ ((regparm(1)));
-#ifdef X86_WIN32
-void FFI_HIDDEN ffi_closure_raw_THISCALL (ffi_raw_closure *)
-     __attribute__ ((regparm(1)));
-void FFI_HIDDEN ffi_closure_STDCALL (ffi_closure *)
-     __attribute__ ((regparm(1)));
-void FFI_HIDDEN ffi_closure_THISCALL (ffi_closure *)
-     __attribute__ ((regparm(1)));
-#endif
-#ifdef X86_WIN64
-void FFI_HIDDEN ffi_closure_win64 (ffi_closure *);
-#endif
-
-/* This function is jumped to by the trampoline */
-
-#ifdef X86_WIN64
-void * FFI_HIDDEN
-ffi_closure_win64_inner (ffi_closure *closure, void *args) {
-  ffi_cif       *cif;
-  void         **arg_area;
-  void          *result;
-  void          *resp = &result;
-
-  cif         = closure->cif;
-  arg_area    = (void**) alloca (cif->nargs * sizeof (void*));  
-
-  /* this call will initialize ARG_AREA, such that each
-   * element in that array points to the corresponding 
-   * value on the stack; and if the function returns
-   * a structure, it will change RESP to point to the
-   * structure return address.  */
-
-  ffi_prep_incoming_args_SYSV(args, &resp, arg_area, cif);
-  
-  (closure->fun) (cif, resp, arg_area, closure->user_data);
-
-  /* The result is returned in rax.  This does the right thing for
-     result types except for floats; we have to 'mov xmm0, rax' in the
-     caller to correct this.
-     TODO: structure sizes of 3 5 6 7 are returned by reference, too!!!
-  */
-  return cif->rtype->size > sizeof(void *) ? resp : *(void **)resp;
-}
-
-#else
-unsigned int FFI_HIDDEN __attribute__ ((regparm(1)))
-ffi_closure_SYSV_inner (ffi_closure *closure, void **respp, void *args)
+struct ffi_call_frame
 {
-  /* our various things...  */
-  ffi_cif       *cif;
-  void         **arg_area;
+  void *ebp;		/* 0: not written by ffi_call; presumably saved by ffi_call_i386 */
+  void *retaddr;	/* 4: not written by ffi_call; presumably saved by ffi_call_i386 */
+  void (*fn)(void);	/* 8: target function, stored by ffi_call */
+  int flags;		/* 12: X86_RET_* return code, stored by ffi_call */
+  void *rvalue;		/* 16: result buffer, stored by ffi_call */
+  unsigned eax;		/* 20: not written by ffi_call */
+  unsigned edx;		/* 24: second register argument (fastcall) */
+  unsigned ecx;		/* 28: first register argument, or struct-return pointer */
+};
+
+extern void ffi_call_i386(struct ffi_call_frame *, char *)
+	FFI_HIDDEN __attribute__((fastcall));
 
-  cif         = closure->cif;
-  arg_area    = (void**) alloca (cif->nargs * sizeof (void*));  
-
-  /* this call will initialize ARG_AREA, such that each
-   * element in that array points to the corresponding 
-   * value on the stack; and if the function returns
-   * a structure, it will change RESP to point to the
-   * structure return address.  */
-
-  ffi_prep_incoming_args_SYSV(args, respp, arg_area, cif);
+void
+ffi_call(ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
+{ /* Lay AVALUE out as a stack image plus ecx/edx, then hand off to ffi_call_i386.  */
+  size_t rsize;
+  struct ffi_call_frame *frame;
+  char *stack, *argp;
+  ffi_type **arg_types;
+  int flags, cabi, i, n, narg_reg;
+  size_t bytes;
+
+  flags = cif->flags;
+  cabi = cif->abi;
+
+  rsize = 0;	/* bytes of scratch result storage to add to the alloca below */
+  if (rvalue == NULL)
+    {
+      switch (flags)
+	{
+	case X86_RET_FLOAT:
+	case X86_RET_DOUBLE:
+	case X86_RET_LDOUBLE:
+	case X86_RET_STRUCTPOP:
+	case X86_RET_STRUCTECX:
+	  /* The float cases need to pop the 387 stack.
+	     The struct cases need to pass a valid pointer to the callee.  */
+	  rsize = cif->rtype->size;
+	  break;
+	default:
+	  /* We can just pretend the callee returns nothing.  */
+	  flags = X86_RET_VOID;
+	  break;
+	}
+    }
 
-  (closure->fun) (cif, *respp, arg_area, closure->user_data);
+  bytes = ALIGN (cif->bytes, 16);	/* argument area, rounded up to 16 bytes */
+  argp = stack = alloca(bytes + sizeof(*frame) + rsize);
+  frame = (struct ffi_call_frame *)(stack + bytes);	/* frame follows the argument area */
+  if (rsize)
+    rvalue = frame + 1;	/* scratch result slot follows the frame */
 
-  return cif->flags;
-}
-#endif /* !X86_WIN64 */
+  frame->fn = fn;
+  frame->flags = flags;
+  frame->rvalue = rvalue;
 
-static void
-ffi_prep_incoming_args_SYSV(char *stack, void **rvalue, void **avalue,
-                            ffi_cif *cif)
-{
-  register unsigned int i;
-  register void **p_argv;
-  register char *argp;
-  register ffi_type **p_arg;
-
-  argp = stack;
-
-#ifdef X86_WIN64
-  if (cif->rtype->size > sizeof(ffi_arg)
-      || (cif->flags == FFI_TYPE_STRUCT
-          && (cif->rtype->size != 1 && cif->rtype->size != 2
-              && cif->rtype->size != 4 && cif->rtype->size != 8))) {
-    *rvalue = *(void **) argp;
-    argp += sizeof(void *);
-  }
-#else
-  if ( cif->flags == FFI_TYPE_STRUCT
-       || cif->flags == FFI_TYPE_MS_STRUCT ) {
-    *rvalue = *(void **) argp;
-    argp += sizeof(void *);
-  }
-#endif
-
-  p_argv = avalue;
+  narg_reg = 0;	/* number of argument registers (ecx, then edx) used so far */
+  switch (flags)
+    {
+    case X86_RET_STRUCTECX:
+      if (cabi == FFI_THISCALL || cabi == FFI_FASTCALL)
+	{
+	  /* For fastcall/thiscall this is first register-passed argument.  */
+	  frame->ecx = (unsigned)rvalue;
+	  narg_reg = 1;
+	  break;
+	}
+      /* fallthru */
+    case X86_RET_STRUCTPOP:
+      *(void **)argp = rvalue;
+      argp += sizeof(void*);
+      break;
+    }
 
-  for (i = cif->nargs, p_arg = cif->arg_types; (i != 0); i--, p_arg++)
+  arg_types = cif->arg_types;
+  for (i = 0, n = cif->nargs; i < n; ++i)
     {
-      size_t z;
+      size_t z = arg_types[i]->size;
+      int t = arg_types[i]->type;
+      void *valp = avalue[i];
 
-      /* Align if necessary */
-      if ((sizeof(void*) - 1) & (size_t) argp) {
-        argp = (char *) ALIGN(argp, sizeof(void*));
-      }
+      if (z <= sizeof(ffi_arg) && t != FFI_TYPE_STRUCT)
+	{
+	  ffi_arg val = extend_basic_type (valp, t);
+
+	  /* For thiscall/fastcall convention register-passed arguments
+	     are the first two non-floating-point, non-aggregate arguments
+	     with a size smaller or equal to sizeof(ffi_arg).  */
+	  if (t != FFI_TYPE_FLOAT
+	      && ((cabi == FFI_THISCALL && narg_reg < 1)
+		  || (cabi == FFI_FASTCALL && narg_reg < 2)))
+	    {
+	      if (narg_reg == 0)
+		frame->ecx = val;
+	      else
+		frame->edx = val;
+	      narg_reg++;
+	      continue;	/* passed in a register: takes no stack slot */
+	    }
 
-#ifdef X86_WIN64
-      if ((*p_arg)->size > sizeof(ffi_arg)
-          || ((*p_arg)->type == FFI_TYPE_STRUCT
-              && ((*p_arg)->size != 1 && (*p_arg)->size != 2
-                  && (*p_arg)->size != 4 && (*p_arg)->size != 8)))
-        {
-          z = sizeof(void *);
-          *p_argv = *(void **)argp;
+	  *(ffi_arg *)argp = val;
+	  z = sizeof(ffi_arg);
         }
       else
-#endif
         {
-          z = (*p_arg)->size;
-          
-          /* because we're little endian, this is what it turns into.   */
-          
-          *p_argv = (void*) argp;
+          memcpy(argp, valp, z);
+	  z = ALIGN(z, sizeof(ffi_arg));	/* advance by whole ffi_arg-sized slots */
         }
-          
-      p_argv++;
-#ifdef X86_WIN64
-      argp += (z + sizeof(void*) - 1) & ~(sizeof(void*) - 1);
-#else
+
       argp += z;
-#endif
     }
-  
-  return;
+
+  ffi_call_i386(frame, stack);
 }
 
-#define FFI_INIT_TRAMPOLINE_WIN64(TRAMP,FUN,CTX,MASK) \
-{ unsigned char *__tramp = (unsigned char*)(TRAMP); \
-   void*  __fun = (void*)(FUN); \
-   void*  __ctx = (void*)(CTX); \
-   *(unsigned char*) &__tramp[0] = 0x41; \
-   *(unsigned char*) &__tramp[1] = 0xbb; \
-   *(unsigned int*) &__tramp[2] = MASK; /* mov $mask, %r11 */ \
-   *(unsigned char*) &__tramp[6] = 0x48; \
-   *(unsigned char*) &__tramp[7] = 0xb8; \
-   *(void**) &__tramp[8] = __ctx; /* mov __ctx, %rax */ \
-   *(unsigned char *)  &__tramp[16] = 0x49; \
-   *(unsigned char *)  &__tramp[17] = 0xba; \
-   *(void**) &__tramp[18] = __fun; /* mov __fun, %r10 */ \
-   *(unsigned char *)  &__tramp[26] = 0x41; \
-   *(unsigned char *)  &__tramp[27] = 0xff; \
-   *(unsigned char *)  &__tramp[28] = 0xe2; /* jmp %r10 */ \
- }
+/* ------- Closure API support ----------------------------------- */
 
 /* How to make a trampoline.  Derived from gcc/config/i386/i386.c. */
 
-#define FFI_INIT_TRAMPOLINE(TRAMP,FUN,CTX) \
-{ unsigned char *__tramp = (unsigned char*)(TRAMP); \
-   unsigned int  __fun = (unsigned int)(FUN); \
-   unsigned int  __ctx = (unsigned int)(CTX); \
-   unsigned int  __dis = __fun - (__ctx + 10);  \
-   *(unsigned char*) &__tramp[0] = 0xb8; \
-   *(unsigned int*)  &__tramp[1] = __ctx; /* movl __ctx, %eax */ \
-   *(unsigned char *)  &__tramp[5] = 0xe9; \
-   *(unsigned int*)  &__tramp[6] = __dis; /* jmp __fun  */ \
- }
-
-#define FFI_INIT_TRAMPOLINE_THISCALL(TRAMP,FUN,CTX,SIZE) \
-{ unsigned char *__tramp = (unsigned char*)(TRAMP); \
-   unsigned int  __fun = (unsigned int)(FUN); \
-   unsigned int  __ctx = (unsigned int)(CTX); \
-   unsigned int  __dis = __fun - (__ctx + 49);  \
-   unsigned short __size = (unsigned short)(SIZE); \
-   *(unsigned int *) &__tramp[0] = 0x8324048b;	/* mov (%esp), %eax */ \
-   *(unsigned int *) &__tramp[4] = 0x4c890cec;	/* sub $12, %esp */ \
-   *(unsigned int *) &__tramp[8] = 0x04890424;	/* mov %ecx, 4(%esp) */ \
-   *(unsigned char*) &__tramp[12] = 0x24;	/* mov %eax, (%esp) */ \
-   *(unsigned char*) &__tramp[13] = 0xb8; \
-   *(unsigned int *) &__tramp[14] = __size;	/* mov __size, %eax */ \
-   *(unsigned int *) &__tramp[18] = 0x08244c8d;	/* lea 8(%esp), %ecx */ \
-   *(unsigned int *) &__tramp[22] = 0x4802e8c1; /* shr $2, %eax ; dec %eax */ \
-   *(unsigned short*) &__tramp[26] = 0x0b74;	/* jz 1f */ \
-   *(unsigned int *) &__tramp[28] = 0x8908518b;	/* 2b: mov 8(%ecx), %edx */ \
-   *(unsigned int *) &__tramp[32] = 0x04c18311; /* mov %edx, (%ecx) ; add $4, %ecx */ \
-   *(unsigned char*) &__tramp[36] = 0x48;	/* dec %eax */ \
-   *(unsigned short*) &__tramp[37] = 0xf575;	/* jnz 2b ; 1f: */ \
-   *(unsigned char*) &__tramp[39] = 0xb8; \
-   *(unsigned int*)  &__tramp[40] = __ctx; /* movl __ctx, %eax */ \
-   *(unsigned char *)  &__tramp[44] = 0xe8; \
-   *(unsigned int*)  &__tramp[45] = __dis; /* call __fun  */ \
-   *(unsigned char*)  &__tramp[49] = 0xc2; /* ret  */ \
-   *(unsigned short*)  &__tramp[50] = (__size + 8); /* ret (__size + 8)  */ \
- }
-
-#define FFI_INIT_TRAMPOLINE_STDCALL(TRAMP,FUN,CTX,SIZE)  \
-{ unsigned char *__tramp = (unsigned char*)(TRAMP); \
-   unsigned int  __fun = (unsigned int)(FUN); \
-   unsigned int  __ctx = (unsigned int)(CTX); \
-   unsigned int  __dis = __fun - (__ctx + 10); \
-   unsigned short __size = (unsigned short)(SIZE); \
-   *(unsigned char*) &__tramp[0] = 0xb8; \
-   *(unsigned int*)  &__tramp[1] = __ctx; /* movl __ctx, %eax */ \
-   *(unsigned char *)  &__tramp[5] = 0xe8; \
-   *(unsigned int*)  &__tramp[6] = __dis; /* call __fun  */ \
-   *(unsigned char *)  &__tramp[10] = 0xc2; \
-   *(unsigned short*)  &__tramp[11] = __size; /* ret __size  */ \
- }
-
-/* the cif must already be prep'ed */
+extern void ffi_closure_i386(void) FFI_HIDDEN;
+extern void ffi_closure_i386_stdcall(void) FFI_HIDDEN;
+
+/* Write the closure trampoline "movl $codeloc, %eax; jmp dest" into TRAMP.
+   CODELOC is the address the trampoline will be executed from.  */
+static void
+ffi_init_trampoline(char *tramp, void (*dest)(void), void *codeloc)
+{
+  tramp[0] = 0xb8;			/* movl codeloc, %eax */
+  *(void **)(tramp + 1) = codeloc;
+  tramp[5] = 0xe9;			/* jmp  dest */
+  /* rel32 counts from the end of the jmp where it runs: CODELOC, not TRAMP.  */
+  *(unsigned *)(tramp + 6) = (unsigned)dest - ((unsigned)codeloc + 10);
+}
 
 ffi_status
 ffi_prep_closure_loc (ffi_closure* closure,
@@ -650,61 +291,116 @@  ffi_prep_closure_loc (ffi_closure* closure,
                       void *user_data,
                       void *codeloc)
 {
-#ifdef X86_WIN64
-#define ISFLOAT(IDX) (cif->arg_types[IDX]->type == FFI_TYPE_FLOAT || cif->arg_types[IDX]->type == FFI_TYPE_DOUBLE)
-#define FLAG(IDX) (cif->nargs>(IDX)&&ISFLOAT(IDX)?(1<<(IDX)):0)
-  if (cif->abi == FFI_WIN64) 
-    {
-      int mask = FLAG(0)|FLAG(1)|FLAG(2)|FLAG(3);
-      FFI_INIT_TRAMPOLINE_WIN64 (&closure->tramp[0],
-                                 &ffi_closure_win64,
-                                 codeloc, mask);
-      /* make sure we can execute here */
-    }
-#else
-  if (cif->abi == FFI_SYSV)
-    {
-      FFI_INIT_TRAMPOLINE (&closure->tramp[0],
-                           &ffi_closure_SYSV,
-                           (void*)codeloc);
-    }
-#ifdef X86_WIN32
-  else if (cif->abi == FFI_THISCALL)
-    {
-      FFI_INIT_TRAMPOLINE_THISCALL (&closure->tramp[0],
-				    &ffi_closure_THISCALL,
-				    (void*)codeloc,
-				    cif->bytes);
-    }
-  else if (cif->abi == FFI_STDCALL)
+  void (*fn)(void);
+
+  switch (cif->abi)
     {
-      FFI_INIT_TRAMPOLINE_STDCALL (&closure->tramp[0],
-                                   &ffi_closure_STDCALL,
-                                   (void*)codeloc, cif->bytes);
+    case FFI_SYSV:
+    case FFI_THISCALL:
+    case FFI_FASTCALL:
+    case FFI_MS_CDECL:
+      fn = ffi_closure_i386;
+      break;
+
+    case FFI_STDCALL:
+      fn = ffi_closure_i386_stdcall;
+      break;
+
+    default:
+      return FFI_BAD_ABI;
     }
-  else if (cif->abi == FFI_MS_CDECL)
+
+  ffi_init_trampoline (closure->tramp, fn, codeloc);
+
+  closure->cif = cif;
+  closure->fun = fun;
+  closure->user_data = user_data;
+
+  return FFI_OK;
+}
+
+struct ffi_closure_frame
+{
+  unsigned rettemp[4];	/* 0: default result buffer handed to closure->fun */
+  unsigned eax;		/* 16: not read by ffi_closure_inner */
+  unsigned edx;		/* 20: second register argument (fastcall) */
+  unsigned ecx;		/* 24: first register argument, or struct-return pointer */
+  ffi_closure *closure;	/* 28: the closure being invoked */
+};
+
+/* C half of closure entry.  FRAME supplies the closure and the ecx/edx
+   register arguments, ARGP the stack arguments; the caller that builds
+   FRAME is not shown here.  Fills avalue[], calls closure->fun, and returns
+   the X86_RET_* code, plus the byte count to pop for stdcall.  */
+unsigned int FFI_HIDDEN __attribute__ ((fastcall))
+ffi_closure_inner (struct ffi_closure_frame *frame, char *argp)
+{
+  ffi_closure *closure = frame->closure;
+  ffi_cif *cif = closure->cif;
+  int cabi = cif->abi, flags = cif->flags, narg_reg = 0, i, n;
+  ffi_type **arg_types;
+  void *rvalue = frame->rettemp;
+  void **avalue;
+
+  /* A struct return replaces the default rettemp result buffer.  */
+  switch (flags)
     {
-      FFI_INIT_TRAMPOLINE (&closure->tramp[0],
-                           &ffi_closure_SYSV,
-                           (void*)codeloc);
+    case X86_RET_STRUCTECX:
+      if (cabi == FFI_THISCALL || cabi == FFI_FASTCALL)
+	{
+	  rvalue = (void *)frame->ecx;
+	  narg_reg = 1;
+	  break;
+	}
+      /* fallthru */
+    case X86_RET_STRUCTPOP:
+      rvalue = *(void **)argp;
+      argp += sizeof(void *);
+      break;
     }
-#endif /* X86_WIN32 */
-#endif /* !X86_WIN64 */
-  else
+
+  n = cif->nargs;
+  avalue = alloca(sizeof(void *) * n);
+
+  arg_types = cif->arg_types;
+  for (i = 0; i < n; ++i)
     {
-      return FFI_BAD_ABI;
+      size_t z = arg_types[i]->size;
+      int t = arg_types[i]->type;
+      void *valp;
+
+      if (z <= sizeof(ffi_arg)
+	  && t != FFI_TYPE_STRUCT && t != FFI_TYPE_FLOAT
+	  && ((cabi == FFI_THISCALL && narg_reg < 1)
+	      || (cabi == FFI_FASTCALL && narg_reg < 2)))
+	{
+	  /* Count the register just consumed (as ffi_call does), so the
+	     next such argument moves on to edx or to the stack.  */
+	  valp = narg_reg == 0 ? &frame->ecx : &frame->edx;
+	  narg_reg++;
+	}
+      else
+	{
+	  valp = argp;
+	  z = ALIGN (z, 4);
+	  argp += z;
+	}
+
+      avalue[i] = valp;
     }
-    
-  closure->cif  = cif;
-  closure->user_data = user_data;
-  closure->fun  = fun;
 
-  return FFI_OK;
+  closure->fun (cif, rvalue, avalue, closure->user_data);
+
+  if (cabi == FFI_STDCALL)
+    return flags + (cif->bytes << X86_RET_POP_SHIFT);
+  else
+    return flags;
 }
 
 /* ------- Native raw API support -------------------------------- */
 
-#if !FFI_NO_RAW_API
+extern void ffi_closure_raw_SYSV(void) FFI_HIDDEN;
+extern void ffi_closure_raw_THISCALL(void) FFI_HIDDEN;
 
 ffi_status
 ffi_prep_raw_closure_loc (ffi_raw_closure* closure,
@@ -713,131 +409,154 @@  ffi_prep_raw_closure_loc (ffi_raw_closure* closure,
                           void *user_data,
                           void *codeloc)
 {
+  void (*fn)(void);
   int i;
 
-  if (cif->abi != FFI_SYSV) {
-#ifdef X86_WIN32
-    if (cif->abi != FFI_THISCALL)
-#endif
-    return FFI_BAD_ABI;
-  }
-
-  /* we currently don't support certain kinds of arguments for raw
-     closures.  This should be implemented by a separate assembly
-     language routine, since it would require argument processing,
-     something we don't do now for performance.  */
-
+  /* We currently don't support certain kinds of arguments for raw closures.
+     This should be implemented by a separate assembly language routine,
+     since it would require argument processing, something we don't do now
+     for performance.  */
   for (i = cif->nargs-1; i >= 0; i--)
     {
       FFI_ASSERT (cif->arg_types[i]->type != FFI_TYPE_STRUCT);
       FFI_ASSERT (cif->arg_types[i]->type != FFI_TYPE_LONGDOUBLE);
     }
   
-#ifdef X86_WIN32
-  if (cif->abi == FFI_SYSV)
-    {
-#endif
-  FFI_INIT_TRAMPOLINE (&closure->tramp[0], &ffi_closure_raw_SYSV,
-                       codeloc);
-#ifdef X86_WIN32
-    }
-  else if (cif->abi == FFI_THISCALL)
+  switch (cif->abi)
     {
-      FFI_INIT_TRAMPOLINE_THISCALL (&closure->tramp[0], &ffi_closure_raw_THISCALL,
-				    codeloc, cif->bytes);
-    }
-#endif
-  closure->cif  = cif;
+    case FFI_SYSV:
+      fn = ffi_closure_raw_SYSV;
+      break;
+    case FFI_THISCALL:
+      fn = ffi_closure_raw_THISCALL;
+      break;
+    default:
+      return FFI_BAD_ABI;
+  }
+
+  ffi_init_trampoline (closure->tramp, fn, codeloc);
+
+  closure->cif = cif;
+  closure->fun = fun;
   closure->user_data = user_data;
-  closure->fun  = fun;
 
   return FFI_OK;
 }
 
-static void 
-ffi_prep_args_raw(char *stack, extended_cif *ecif)
-{
-  memcpy (stack, ecif->avalue, ecif->cif->bytes);
-}
-
-/* we borrow this routine from libffi (it must be changed, though, to
- * actually call the function passed in the first argument.  as of
- * libffi-1.20, this is not the case.)
- */
-
 void
-ffi_raw_call(ffi_cif *cif, void (*fn)(void), void *rvalue, ffi_raw *fake_avalue)
+ffi_raw_call(ffi_cif *cif, void (*fn)(void), void *rvalue, ffi_raw *avalue)
 {
-  extended_cif ecif;
-  void **avalue = (void **)fake_avalue;
-
-  ecif.cif = cif;
-  ecif.avalue = avalue;
+  size_t rsize;
+  struct ffi_call_frame *frame;
+  char *stack, *argp;
+  int flags, cabi, narg_reg;
+  size_t bytes;
   
-  /* If the return value is a struct and we don't have a return */
-  /* value address then we need to make one                     */
+  flags = cif->flags;
+  cabi = cif->abi;
+  
+  rsize = 0;
+  if (rvalue == NULL)
+    {
+      switch (flags)
+	{
+	case X86_RET_FLOAT:
+	case X86_RET_DOUBLE:
+	case X86_RET_LDOUBLE:
+	case X86_RET_STRUCTPOP:
+	case X86_RET_STRUCTECX:
+	  /* The float cases need to pop the 387 stack.
+	     The struct cases need to pass a valid pointer to the callee.  */
+	  rsize = cif->rtype->size;
+	  break;
+	default:
+	  /* We can just pretend the callee returns nothing.  */
+	  flags = X86_RET_VOID;
+	  break;
+	}
+    }
+
+  bytes = cif->bytes;
+  argp = stack = alloca(bytes + sizeof(*frame) + rsize);
+  frame = (struct ffi_call_frame *)(stack + bytes);
+  if (rsize)
+    rvalue = frame + 1;
 
-  if (rvalue == NULL
-      && (cif->flags == FFI_TYPE_STRUCT
-          || cif->flags == FFI_TYPE_MS_STRUCT))
+  frame->fn = fn;
+  frame->flags = flags;
+  frame->rvalue = rvalue;
+
+  narg_reg = 0;
+  switch (flags)
     {
-      ecif.rvalue = alloca(cif->rtype->size);
+    case X86_RET_STRUCTECX:
+      if (cabi == FFI_THISCALL || cabi == FFI_FASTCALL)
+	{
+	  /* For fastcall/thiscall this is the first register-passed argument.  */
+	  frame->ecx = (unsigned)rvalue;	/* hidden struct pointer goes in %ecx */
+	  narg_reg = 1;				/* one argument register is now used */
+	  break;
+	}
+      /* fallthru: the remaining ABIs take the pointer on the stack */
+    case X86_RET_STRUCTPOP:
+      *(void **)argp = rvalue;		/* pointer occupies the first stack slot */
+      argp += sizeof(void *);
+      bytes -= sizeof(void *);		/* that slot is not part of the raw args */
+      break;
     }
-  else
-    ecif.rvalue = rvalue;
-    
-  
+
   switch (cif->abi) 
     {
-#ifdef X86_WIN32
     case FFI_SYSV:
     case FFI_STDCALL:
     case FFI_MS_CDECL:
-      ffi_call_win32(ffi_prep_args_raw, &ecif, cif->abi, cif->bytes, cif->flags,
-		     ecif.rvalue, fn);
+      memcpy (argp, avalue, bytes);
       break;
+
     case FFI_THISCALL:
     case FFI_FASTCALL:
       {
-	unsigned int abi = cif->abi;
-	unsigned int i, passed_regs = 0;
-
-	if (cif->flags == FFI_TYPE_STRUCT)
-	  ++passed_regs;
-
-	for (i=0; i < cif->nargs && passed_regs < 2;i++)
+	int narg_max = (cif->abi == FFI_FASTCALL ? 2 : 1);
+	ffi_type **arg_types = cif->arg_types;
+	int i, n = cif->nargs;
+	
+	for (i = 0; i < n && narg_reg < narg_max; i++)
 	  {
-	    size_t sz;
-
-	    if (cif->arg_types[i]->type == FFI_TYPE_FLOAT
-	        || cif->arg_types[i]->type == FFI_TYPE_STRUCT)
-	      continue;
-	    sz = (cif->arg_types[i]->size + 3) & ~3;
-	    if (sz == 0 || sz > 4)
-	      continue;
-	    ++passed_regs;
+	    size_t z = arg_types[i]->size;	/* size of this argument */
+	    int t = arg_types[i]->type;
+
+	    if (z <= sizeof(ffi_arg)		/* fits in one register ... */
+		&& t != FFI_TYPE_STRUCT		/* ... and is neither a struct */
+		&& t != FFI_TYPE_FLOAT)		/* ... nor a float */
+	      {
+		ffi_arg val = extend_basic_type (avalue, t);
+		if (narg_reg == 0)		/* first register argument -> %ecx */
+		  frame->ecx = val;
+		else				/* second register argument -> %edx */
+		  frame->edx = val;
+		narg_reg++;
+		z = sizeof(ffi_arg);		/* consumed one raw slot */
+	      }
+	    else
+	      {
+		memcpy (argp, avalue, z);	/* stack argument: copy as-is */
+		z = ALIGN (z, sizeof(ffi_arg));	/* round up to whole slots */
+		argp += z;
+	      }
+	    avalue = (ffi_raw *) ((char *) avalue + z);	/* z is in bytes, not ffi_raw units */
+	    bytes -= z;				/* raw bytes still to be copied */
 	  }
-	if (passed_regs < 2 && abi == FFI_FASTCALL)
-	  cif->abi = abi = FFI_THISCALL;
-	if (passed_regs < 1 && abi == FFI_THISCALL)
-	  cif->abi = abi = FFI_STDCALL;
-        ffi_call_win32(ffi_prep_args_raw, &ecif, abi, cif->bytes, cif->flags,
-                       ecif.rvalue, fn);
+	if (i < n)
+	  memcpy (argp, avalue, bytes);
       }
       break;
-#else
-    case FFI_SYSV:
-      ffi_call_SYSV(ffi_prep_args_raw, &ecif, cif->bytes, cif->flags,
-                    ecif.rvalue, fn);
-      break;
-#endif
+
     default:
       FFI_ASSERT(0);
-      break;
+      return;
     }
-}
 
-#endif
-
-#endif /* !__x86_64__  || X86_WIN64 */
+  ffi_call_i386(frame, stack);
+}
 
+#endif /* !__x86_64__ */
diff --git a/libffi/src/x86/ffitarget.h b/libffi/src/x86/ffitarget.h
index 592d6f8..bf8d8c6 100644
--- a/libffi/src/x86/ffitarget.h
+++ b/libffi/src/x86/ffitarget.h
@@ -36,107 +36,85 @@ 
 
 /* ---- System specific configurations ----------------------------------- */
 
-/* For code common to all platforms on x86 and x86_64. */
-#define X86_ANY
-
 #if defined (X86_64) && defined (__i386__)
-#undef X86_64
-#define X86
+# undef X86_64
+# define X86
+#elif defined (X86_WIN64)
+# define X86_64
+#elif defined (X86_DARWIN) && defined (__x86_64__)
+# define X86_64
 #endif
 
 #ifdef X86_WIN64
-#define FFI_SIZEOF_ARG 8
 #define USE_BUILTIN_FFS 0 /* not yet implemented in mingw-64 */
 #endif
 
 /* ---- Generic type definitions ----------------------------------------- */
 
 #ifndef LIBFFI_ASM
-#ifdef X86_WIN64
-#ifdef _MSC_VER
-typedef unsigned __int64       ffi_arg;
-typedef __int64                ffi_sarg;
-#else
-typedef unsigned long long     ffi_arg;
-typedef long long              ffi_sarg;
-#endif
-#else
-#if defined __x86_64__ && defined __ILP32__
-#define FFI_SIZEOF_ARG 8
-#define FFI_SIZEOF_JAVA_RAW  4
-typedef unsigned long long     ffi_arg;
-typedef long long              ffi_sarg;
+
+#ifdef X86_64
+# ifdef _MSC_VER
+typedef unsigned __int64    ffi_arg;
+typedef __int64             ffi_sarg;
+# else
+typedef unsigned long long  ffi_arg;
+typedef long long           ffi_sarg;
+# endif
+# define FFI_SIZEOF_ARG 8
+# ifdef __ILP32__
+#  define FFI_SIZEOF_JAVA_RAW  4
+# endif
 #else
-typedef unsigned long          ffi_arg;
-typedef signed long            ffi_sarg;
-#endif
-#endif
+typedef unsigned long       ffi_arg;
+typedef signed long         ffi_sarg;
+# define FFI_SIZEOF_ARG 4
+#endif /* X86_64 */
 
 typedef enum ffi_abi {
   FFI_FIRST_ABI = 0,
 
-  /* ---- Intel x86 Win32 ---------- */
-#ifdef X86_WIN32
+#ifdef X86_64
+  FFI_WIN64,
+  FFI_UNIX64,
+  FFI_LAST_ABI,
+# ifdef X86_WIN64
+  FFI_DEFAULT_ABI = FFI_WIN64,
+# else
+  FFI_DEFAULT_ABI = FFI_UNIX64,
+# endif
+#else
   FFI_SYSV,
   FFI_STDCALL,
   FFI_THISCALL,
   FFI_FASTCALL,
   FFI_MS_CDECL,
   FFI_LAST_ABI,
-#ifdef _MSC_VER
+# ifdef _MSC_VER
   FFI_DEFAULT_ABI = FFI_MS_CDECL
-#else
-  FFI_DEFAULT_ABI = FFI_SYSV
-#endif
-
-#elif defined(X86_WIN64)
-  FFI_WIN64,
-  FFI_LAST_ABI,
-  FFI_DEFAULT_ABI = FFI_WIN64
-
-#else
-  /* ---- Intel x86 and AMD x86-64 - */
-  FFI_SYSV,
-  FFI_UNIX64,   /* Unix variants all use the same ABI for x86-64  */
-  FFI_LAST_ABI,
-#if defined(__i386__) || defined(__i386)
+# else
   FFI_DEFAULT_ABI = FFI_SYSV
-#else
-  FFI_DEFAULT_ABI = FFI_UNIX64
-#endif
-#endif
+# endif
+#endif /* X86_64 */
 } ffi_abi;
-#endif
+
+#endif /* !LIBFFI_ASM */
 
 /* ---- Definitions for closures ----------------------------------------- */
 
 #define FFI_CLOSURES 1
 #define FFI_GO_CLOSURES 1
 
-#define FFI_TYPE_SMALL_STRUCT_1B (FFI_TYPE_LAST + 1)
-#define FFI_TYPE_SMALL_STRUCT_2B (FFI_TYPE_LAST + 2)
-#define FFI_TYPE_SMALL_STRUCT_4B (FFI_TYPE_LAST + 3)
-#define FFI_TYPE_MS_STRUCT       (FFI_TYPE_LAST + 4)
-
-#if defined (X86_64) || (defined (__x86_64__) && defined (X86_DARWIN))
+#ifdef X86_64
 #define FFI_TRAMPOLINE_SIZE 24
-#define FFI_NATIVE_RAW_API 0
-#else
-#ifdef X86_WIN32
-#define FFI_TRAMPOLINE_SIZE 52
-#else
-#ifdef X86_WIN64
-#define FFI_TRAMPOLINE_SIZE 29
-#define FFI_NATIVE_RAW_API 0
-#define FFI_NO_RAW_API 1
 #else
 #define FFI_TRAMPOLINE_SIZE 10
 #endif
-#endif
-#ifndef X86_WIN64
-#define FFI_NATIVE_RAW_API 1	/* x86 has native raw api support */
-#endif
-#endif
 
+#ifdef X86_64
+# define FFI_NATIVE_RAW_API 0
+#else
+# define FFI_NATIVE_RAW_API 1
 #endif
 
+#endif /* LIBFFI_TARGET_H */
diff --git a/libffi/src/x86/internal.h b/libffi/src/x86/internal.h
new file mode 100644
index 0000000..e1df862
--- /dev/null
+++ b/libffi/src/x86/internal.h
@@ -0,0 +1,48 @@ 
+/* -----------------------------------------------------------------*-C-*-
+   internal.h - Copyright (c) 2012  Anthony Green
+                Copyright (c) 1996-2003, 2010  Red Hat, Inc.
+                Copyright (C) 2008  Free Software Foundation, Inc.
+
+   Internal configuration macros for x86 and x86-64.
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   ``Software''), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be included
+   in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+   WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+   DEALINGS IN THE SOFTWARE.
+
+   ----------------------------------------------------------------------- */
+
+#define X86_RET_FLOAT		0
+#define X86_RET_DOUBLE		1
+#define X86_RET_LDOUBLE		2
+#define X86_RET_SINT8		3
+#define X86_RET_SINT16		4
+#define X86_RET_UINT8		5
+#define X86_RET_UINT16		6
+#define X86_RET_INT64		7
+#define X86_RET_INT32		8
+#define X86_RET_VOID		9
+#define X86_RET_STRUCTPOP	10
+#define X86_RET_STRUCTECX	11
+#define X86_RET_UNUSED12	12
+#define X86_RET_UNUSED13	13
+#define X86_RET_UNUSED14	14
+#define X86_RET_UNUSED15	15
+
+#define X86_RET_TYPE_MASK	15
+#define X86_RET_POP_SHIFT	4
diff --git a/libffi/src/x86/sysv.S b/libffi/src/x86/sysv.S
index f108dd8..d8256d0 100644
--- a/libffi/src/x86/sysv.S
+++ b/libffi/src/x86/sysv.S
@@ -24,226 +24,363 @@ 
    DEALINGS IN THE SOFTWARE.
    ----------------------------------------------------------------------- */
 
-#ifndef __x86_64__
+#if !(defined(X86_WIN64) || defined(__x86_64__))
 
 #define LIBFFI_ASM	
 #include <fficonfig.h>
 #include <ffi.h>
+#include "internal.h"
+
+	.text
+
+	.align	16
+	.globl	ffi_call_i386
+        .type	ffi_call_i386,@function
+	FFI_HIDDEN (ffi_call_i386)
+
+/* This macro allows the safe creation of jump tables without an
+   actual table.  The entry points into the table are all 8 bytes.
+   The use of ORG asserts that we're at the correct location.  */
+.macro	E	which
+	.align	8
+	.org	0b + \which * 8
+.endm
+
+/* This is declared as
+
+   void ffi_call_i386(struct ffi_call_frame *frame, char *argp)
+	__attribute__((fastcall));
+
+   Thus the arguments are present in
+
+	ecx: frame
+	edx: argp
+*/
+
+ffi_call_i386:
+	.cfi_startproc
+	movl	(%esp), %eax		/* move the return address */
+	movl	%ebp, (%ecx)		/* store %ebp into local frame */
+	movl	%eax, 4(%ecx)		/* store retaddr into local frame */
+
+	/* New stack frame based off ebp.  This is an itty bit of unwind
+	   trickery in that the CFA *has* changed.  There is no easy way
+	   to describe it correctly on entry to the function.  Fortunately,
+	   it doesn't matter too much since at all points we can correctly
+	   unwind back to ffi_call.  Note that the location to which we
+	   moved the return address is (the new) CFA-4, so from the
+	   perspective of the unwind info, it hasn't moved.  */
+	movl	%ecx, %ebp
+	.cfi_def_cfa %ebp, 8
+	.cfi_rel_offset %ebp, 0
+
+	movl	%edx, %esp		/* set outgoing argument stack */
+	movl	20(%ebp), %eax		/* set register arguments */
+	movl	24(%ebp), %edx
+	movl	28(%ebp), %ecx
+
+	call	*8(%ebp)
+
+	movl	12(%ebp), %ecx		/* load return type code */
+	movl	%ebx, 8(%ebp)		/* preserve %ebx */
+	.cfi_rel_offset %ebx, 8
+
+	andl	$X86_RET_TYPE_MASK, %ecx
+#ifdef __PIC__
+	call	__x86.get_pc_thunk.bx
+1:	leal	0f-1b(%ebx, %ecx, 8), %ebx
+#else
+	leal	0f(,%ecx, 8), %ebx
+#endif
+	movl	16(%ebp), %ecx		/* load result address */
+	jmp	*%ebx
 
-.text
-
-.globl ffi_prep_args
+	.align	8
+0:
+E X86_RET_FLOAT
+	fstps	(%ecx)
+	jmp	9f
+
+E X86_RET_DOUBLE
+	fstpl	(%ecx)
+	jmp	9f
+
+E X86_RET_LDOUBLE
+	fstpt	(%ecx)
+	jmp	9f
+
+E X86_RET_SINT8
+	movsbl	%al, %eax
+	mov	%eax, (%ecx)
+	jmp	9f
+
+E X86_RET_SINT16
+	movswl	%ax, %eax
+	mov	%eax, (%ecx)
+	jmp	9f
+
+E X86_RET_UINT8
+	movzbl	%al, %eax		/* zero-extend the 8-bit result */
+	movl	%eax, (%ecx)		/* %ecx = result address; %ebx is the jump target */
+	jmp	9f
+
+E X86_RET_UINT16
+	movzwl	%ax, %eax		/* zero-extend the 16-bit result */
+	movl	%eax, (%ecx)		/* %ecx = result address; %ebx is the jump target */
+	jmp	9f
+
+E X86_RET_INT64
+	movl	%edx, 4(%ecx)
+	/* fallthru */
+E X86_RET_INT32
+	movl	%eax, (%ecx)
+	/* fallthru */
+E X86_RET_VOID
+9:	movl	8(%ebp), %ebx
+	movl	%ebp, %esp
+	popl	%ebp
+	.cfi_remember_state
+	.cfi_def_cfa %esp, 4
+	.cfi_restore %ebx
+	.cfi_restore %ebp
+	ret
+	.cfi_restore_state
 
-	.align 4
-.globl ffi_call_SYSV
-        .type    ffi_call_SYSV,@function
+	/* No struct return path need do anything special.  */
+E X86_RET_STRUCTPOP
+	jmp	9b
+E X86_RET_STRUCTECX
+	jmp	9b
 
-ffi_call_SYSV:
-.LFB1:
-        pushl %ebp
-.LCFI0:
-        movl  %esp,%ebp
-.LCFI1:
-	/* Make room for all of the new args.  */
-	movl  16(%ebp),%ecx
-	subl  %ecx,%esp
+	/* Fill out the table so that bad values are predictable.  */
+E X86_RET_UNUSED12
+	ud2
+E X86_RET_UNUSED13
+	ud2
+E X86_RET_UNUSED14
+	ud2
+E X86_RET_UNUSED15
+	ud2
 
-        /* Align the stack pointer to 16-bytes */
-        andl  $0xfffffff0, %esp
+	.cfi_endproc
+	.size	ffi_call_i386, . - ffi_call_i386
 
-	movl  %esp,%eax
+/* The closure entry points are reached from the ffi_closure trampoline.
+   On entry, %eax contains the address of the ffi_closure.  */
 
-	/* Place all of the ffi_prep_args in position  */
-	pushl 12(%ebp)
-	pushl %eax
-	call  *8(%ebp)
+#define	ffi_closure_FS	(12 + 4*4 + 16)
 
-	/* Return stack to previous state and call the function  */
-	addl  $8,%esp	
+.macro FFI_CLOSURE_FIRST
+	subl	$ffi_closure_FS, %esp
+	.cfi_adjust_cfa_offset ffi_closure_FS
 
-	call  *28(%ebp)
+	movl	%edx, 20(%esp)		/* save incoming register args */
+	movl	%ecx, 24(%esp)
+	movl	%eax, 28(%esp)		/* trampoline loaded closure */
 
-	/* Load %ecx with the return type code  */
-	movl  20(%ebp),%ecx	
+	movl	%esp, %ecx		/* pass save area to C */
+	leal	ffi_closure_FS+4(%esp), %edx
 
-	/* Protect %esi.  We're going to pop it in the epilogue.  */
-	pushl %esi
+#ifdef __PIC__
+	movl	%ebx, 32(%esp)		/* save ebx */
+	.cfi_rel_offset %ebx, 32	/* the register saved above is %ebx */
+	call	__x86.get_pc_thunk.bx
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebx	/* %ebx is the base for the @GOTOFF lookup in FFI_CLOSURE_SECOND */
+#endif
+#if defined HAVE_HIDDEN_VISIBILITY_ATTRIBUTE || !defined __PIC__
+	call	ffi_closure_inner
+#else
+	call	ffi_closure_inner@PLT
+#endif
+.endm
 
-	/* If the return value pointer is NULL, assume no return value.  */
-	cmpl  $0,24(%ebp)
-	jne  0f
+.macro FFI_CLOSURE_SECOND
+	andl	$X86_RET_TYPE_MASK, %eax
+#ifdef __PIC__
+	leal	0f@GOTOFF(%ebx, %eax, 8), %eax
+	movl	32(%esp), %ebx		/* restore ebx */
+	.cfi_restore %ebx
+#else
+	leal	0f(, %eax, 8), %eax
+#endif
+	jmp	*%eax
+.endm
 
-	/* Even if there is no space for the return value, we are 
-	   obliged to handle floating-point values.  */
-	cmpl  $FFI_TYPE_FLOAT,%ecx
-	jne   noretval
-	fstp  %st(0)
+	.align	16
+	.globl	ffi_closure_i386
+	.type	ffi_closure_i386, @function
+	FFI_HIDDEN (ffi_closure_i386)
 
-        jmp   epilogue
+ffi_closure_i386:
+	.cfi_startproc
+	FFI_CLOSURE_FIRST
+	FFI_CLOSURE_SECOND
 
+	.align	8
 0:
-	call  1f
-
-.Lstore_table:
-	.long	noretval-.Lstore_table	/* FFI_TYPE_VOID */
-	.long	retint-.Lstore_table	/* FFI_TYPE_INT */
-	.long	retfloat-.Lstore_table	/* FFI_TYPE_FLOAT */
-	.long	retdouble-.Lstore_table	/* FFI_TYPE_DOUBLE */
-	.long	retlongdouble-.Lstore_table	/* FFI_TYPE_LONGDOUBLE */
-	.long	retuint8-.Lstore_table	/* FFI_TYPE_UINT8 */
-	.long	retsint8-.Lstore_table	/* FFI_TYPE_SINT8 */
-	.long	retuint16-.Lstore_table	/* FFI_TYPE_UINT16 */
-	.long	retsint16-.Lstore_table	/* FFI_TYPE_SINT16 */
-	.long	retint-.Lstore_table	/* FFI_TYPE_UINT32 */
-	.long	retint-.Lstore_table	/* FFI_TYPE_SINT32 */
-	.long	retint64-.Lstore_table	/* FFI_TYPE_UINT64 */
-	.long	retint64-.Lstore_table	/* FFI_TYPE_SINT64 */
-	.long	retstruct-.Lstore_table	/* FFI_TYPE_STRUCT */
-	.long	retint-.Lstore_table	/* FFI_TYPE_POINTER */
-
-1:
-	pop  %esi
-	add  (%esi, %ecx, 4), %esi
-	jmp  *%esi
-
-	/* Sign/zero extend as appropriate.  */
-retsint8:
-	movsbl  %al, %eax
-	jmp  retint
-
-retsint16:
-	movswl  %ax, %eax
-	jmp  retint
-
-retuint8:
-	movzbl  %al, %eax
-	jmp  retint
-
-retuint16:
-	movzwl  %ax, %eax
-	jmp  retint
-
-retfloat:
-	/* Load %ecx with the pointer to storage for the return value  */
-	movl  24(%ebp),%ecx	
-	fstps (%ecx)
-	jmp   epilogue
-
-retdouble:
-	/* Load %ecx with the pointer to storage for the return value  */
-	movl  24(%ebp),%ecx	
-	fstpl (%ecx)
-	jmp   epilogue
-
-retlongdouble:
-	/* Load %ecx with the pointer to storage for the return value  */
-	movl  24(%ebp),%ecx	
-	fstpt (%ecx)
-	jmp   epilogue
-	
-retint64:	
-	/* Load %ecx with the pointer to storage for the return value  */
-	movl  24(%ebp),%ecx	
-	movl  %eax,0(%ecx)
-	movl  %edx,4(%ecx)
-	jmp   epilogue
-	
-retint:
-	/* Load %ecx with the pointer to storage for the return value  */
-	movl  24(%ebp),%ecx	
-	movl  %eax,0(%ecx)
-
-retstruct:
-	/* Nothing to do!  */
-
-noretval:
-epilogue:
-        popl %esi
-        movl %ebp,%esp
-        popl %ebp
-        ret
-.LFE1:
-.ffi_call_SYSV_end:
-        .size    ffi_call_SYSV,.ffi_call_SYSV_end-ffi_call_SYSV
-
-	.align	4
-FFI_HIDDEN (ffi_closure_SYSV)
-.globl ffi_closure_SYSV
-	.type	ffi_closure_SYSV, @function
-
-ffi_closure_SYSV:
-.LFB2:
-	pushl	%ebp
-.LCFI2:
-	movl	%esp, %ebp
-.LCFI3:
-	subl	$40, %esp
-	leal	-24(%ebp), %edx
-	movl	%edx, -12(%ebp)	/* resp */
-	leal	8(%ebp), %edx
-	movl	%edx, 4(%esp)	/* args = __builtin_dwarf_cfa () */
-	leal	-12(%ebp), %edx
-	movl	%edx, (%esp)	/* &resp */
-#if defined HAVE_HIDDEN_VISIBILITY_ATTRIBUTE || !defined __PIC__
-	call	ffi_closure_SYSV_inner
-#else
-	movl	%ebx, 8(%esp)
-.LCFI7:
-	call	1f
-1:	popl	%ebx
-	addl	$_GLOBAL_OFFSET_TABLE_+[.-1b], %ebx
-	call	ffi_closure_SYSV_inner@PLT
-	movl	8(%esp), %ebx
-#endif
-	movl	-12(%ebp), %ecx
-	cmpl	$FFI_TYPE_INT, %eax
-	je	.Lcls_retint
-
-	/* Handle FFI_TYPE_UINT8, FFI_TYPE_SINT8, FFI_TYPE_UINT16,
-	   FFI_TYPE_SINT16, FFI_TYPE_UINT32, FFI_TYPE_SINT32.  */
-	cmpl	$FFI_TYPE_UINT64, %eax
-	jge	0f
-	cmpl	$FFI_TYPE_UINT8, %eax
-	jge	.Lcls_retint
-	
-0:	cmpl	$FFI_TYPE_FLOAT, %eax
-	je	.Lcls_retfloat
-	cmpl	$FFI_TYPE_DOUBLE, %eax
-	je	.Lcls_retdouble
-	cmpl	$FFI_TYPE_LONGDOUBLE, %eax
-	je	.Lcls_retldouble
-	cmpl	$FFI_TYPE_SINT64, %eax
-	je	.Lcls_retllong
-	cmpl	$FFI_TYPE_STRUCT, %eax
-	je	.Lcls_retstruct
-.Lcls_epilogue:
-	movl	%ebp, %esp
-	popl	%ebp
+E X86_RET_FLOAT
+	flds	(%esp)
+	jmp	9f
+
+E X86_RET_DOUBLE
+	fldl	(%esp)
+	jmp	9f
+
+E X86_RET_LDOUBLE
+	fldt	(%esp)
+	jmp	9f
+
+E X86_RET_SINT8
+	movsbl	(%esp), %eax
+	jmp	9f
+
+E X86_RET_SINT16
+	movswl	(%esp), %eax
+	jmp	9f
+
+E X86_RET_UINT8
+	movzbl	(%esp), %eax
+	jmp	9f
+
+E X86_RET_UINT16
+	movzwl	(%esp), %eax
+	jmp	9f
+
+E X86_RET_INT64
+	movl	4(%esp), %edx
+	/* fallthru */
+E X86_RET_INT32
+	movl	(%esp), %eax
+	/* fallthru */
+E X86_RET_VOID
+9:	addl	$ffi_closure_FS, %esp
+	.cfi_adjust_cfa_offset -ffi_closure_FS
 	ret
-.Lcls_retint:
-	movl	(%ecx), %eax
-	jmp	.Lcls_epilogue
-.Lcls_retfloat:
-	flds	(%ecx)
-	jmp	.Lcls_epilogue
-.Lcls_retdouble:
-	fldl	(%ecx)
-	jmp	.Lcls_epilogue
-.Lcls_retldouble:
-	fldt	(%ecx)
-	jmp	.Lcls_epilogue
-.Lcls_retllong:
-	movl	(%ecx), %eax
-	movl	4(%ecx), %edx
-	jmp	.Lcls_epilogue
-.Lcls_retstruct:
-	movl	%ebp, %esp
-	popl	%ebp
+	.cfi_adjust_cfa_offset ffi_closure_FS
+
+E X86_RET_STRUCTPOP
+	addl	$ffi_closure_FS, %esp
+	.cfi_adjust_cfa_offset -ffi_closure_FS
 	ret	$4
-.LFE2:
-	.size	ffi_closure_SYSV, .-ffi_closure_SYSV
+	.cfi_adjust_cfa_offset ffi_closure_FS
+
+E X86_RET_STRUCTECX
+	movl	24(%esp), %ecx
+	addl	$ffi_closure_FS, %esp
+	.cfi_adjust_cfa_offset -ffi_closure_FS
+	ret
+	.cfi_adjust_cfa_offset ffi_closure_FS
+
+	/* Fill out the table so that bad values are predictable.  */
+E X86_RET_UNUSED12
+	ud2
+E X86_RET_UNUSED13
+	ud2
+E X86_RET_UNUSED14
+	ud2
+E X86_RET_UNUSED15
+	ud2
+
+	.cfi_endproc
+	.size	ffi_closure_i386, . - ffi_closure_i386
+
+	.align	16
+	.globl	ffi_closure_i386_stdcall
+	.type	ffi_closure_i386_stdcall, @function
+	FFI_HIDDEN(ffi_closure_i386_stdcall)
+
+ffi_closure_i386_stdcall:
+	.cfi_startproc
+	FFI_CLOSURE_FIRST
+
+	movl	%eax, %ecx			    /* %eax = ffi_closure_inner result */
+	shrl	$X86_RET_POP_SHIFT, %ecx	    /* isolate pop count */
+	leal	ffi_closure_FS(%esp, %ecx), %ecx    /* compute popped esp */
+	movl	ffi_closure_FS(%esp), %edx	    /* move return address */
+	movl	%edx, (%ecx)
+	.cfi_def_cfa %ecx, 4
+
+	FFI_CLOSURE_SECOND
+
+	.align	8
+0:
+E X86_RET_FLOAT
+	flds	(%esp)
+	movl	%ecx, %esp
+	ret
+
+E X86_RET_DOUBLE
+	fldl	(%esp)
+	movl	%ecx, %esp
+	ret
 
-#if !FFI_NO_RAW_API
+E X86_RET_LDOUBLE
+	fldt	(%esp)
+	movl	%ecx, %esp
+	ret
+
+E X86_RET_SINT8
+	movsbl	(%esp), %eax
+	movl	%ecx, %esp
+	ret
+
+E X86_RET_SINT16
+	movswl	(%esp), %eax
+	movl	%ecx, %esp
+	ret
+
+E X86_RET_UINT8
+	movzbl	(%esp), %eax
+	movl	%ecx, %esp
+	ret
+
+E X86_RET_UINT16
+	movzwl	(%esp), %eax
+	movl	%ecx, %esp
+	ret
+
+E X86_RET_INT64
+	popl	%eax
+	popl	%edx
+	movl	%ecx, %esp
+	ret
+
+E X86_RET_INT32
+	movl	(%esp), %eax
+	movl	%ecx, %esp
+	ret
+
+E X86_RET_VOID
+	movl	%ecx, %esp
+	ret
+
+E X86_RET_STRUCTPOP
+	movl	%ecx, %esp
+	ret
+
+E X86_RET_STRUCTECX
+	/* This entry is one byte too big for the 8 byte slot.  */
+	jmp	9f
+
+	/* Fill out the table so that bad values are predictable.  */
+E X86_RET_UNUSED12
+	ud2
+E X86_RET_UNUSED13
+	ud2
+E X86_RET_UNUSED14
+	ud2
+E X86_RET_UNUSED15
+	ud2
+
+	.align	8
+9:	movl	24(%esp), %eax
+	movl	%ecx, %esp
+	.cfi_def_cfa_register %esp
+	movl	%eax, %ecx
+	ret
+
+	.cfi_endproc
+	.size	ffi_closure_i386_stdcall, . - ffi_closure_i386_stdcall
 
 /* Precalculate for e.g. the Solaris 10/x86 assembler.  */
 #if FFI_TRAMPOLINE_SIZE == 10
@@ -261,208 +398,235 @@  ffi_closure_SYSV:
 #endif
 #define CIF_FLAGS_OFFSET 20
 
-	.align	4
-FFI_HIDDEN (ffi_closure_raw_SYSV)
-.globl ffi_closure_raw_SYSV
+	.align	16
+	.globl	ffi_closure_raw_SYSV
 	.type	ffi_closure_raw_SYSV, @function
+	FFI_HIDDEN (ffi_closure_raw_SYSV)
+
+#define ffi_closure_raw_SYSV_FS  (12 + 16 + 4*4)
 
 ffi_closure_raw_SYSV:
-.LFB3:
-	pushl	%ebp
-.LCFI4:
-	movl	%esp, %ebp
-.LCFI5:
-	pushl	%esi
-.LCFI6:
-	subl	$36, %esp
-	movl	RAW_CLOSURE_CIF_OFFSET(%eax), %esi	 /* closure->cif */
-	movl	RAW_CLOSURE_USER_DATA_OFFSET(%eax), %edx /* closure->user_data */
-	movl	%edx, 12(%esp)	/* user_data */
-	leal	8(%ebp), %edx	/* __builtin_dwarf_cfa () */
-	movl	%edx, 8(%esp)	/* raw_args */
-	leal	-24(%ebp), %edx
-	movl	%edx, 4(%esp)	/* &res */
-	movl	%esi, (%esp)	/* cif */
-	call	*RAW_CLOSURE_FUN_OFFSET(%eax)		 /* closure->fun */
-	movl	CIF_FLAGS_OFFSET(%esi), %eax		 /* rtype */
-	cmpl	$FFI_TYPE_INT, %eax
-	je	.Lrcls_retint
-
-	/* Handle FFI_TYPE_UINT8, FFI_TYPE_SINT8, FFI_TYPE_UINT16,
-	   FFI_TYPE_SINT16, FFI_TYPE_UINT32, FFI_TYPE_SINT32.  */
-	cmpl	$FFI_TYPE_UINT64, %eax
-	jge	0f
-	cmpl	$FFI_TYPE_UINT8, %eax
-	jge	.Lrcls_retint
-0:
-	cmpl	$FFI_TYPE_FLOAT, %eax
-	je	.Lrcls_retfloat
-	cmpl	$FFI_TYPE_DOUBLE, %eax
-	je	.Lrcls_retdouble
-	cmpl	$FFI_TYPE_LONGDOUBLE, %eax
-	je	.Lrcls_retldouble
-	cmpl	$FFI_TYPE_SINT64, %eax
-	je	.Lrcls_retllong
-.Lrcls_epilogue:
-	addl	$36, %esp
-	popl	%esi
-	popl	%ebp
-	ret
-.Lrcls_retint:
-	movl	-24(%ebp), %eax
-	jmp	.Lrcls_epilogue
-.Lrcls_retfloat:
-	flds	-24(%ebp)
-	jmp	.Lrcls_epilogue
-.Lrcls_retdouble:
-	fldl	-24(%ebp)
-	jmp	.Lrcls_epilogue
-.Lrcls_retldouble:
-	fldt	-24(%ebp)
-	jmp	.Lrcls_epilogue
-.Lrcls_retllong:
-	movl	-24(%ebp), %eax
-	movl	-20(%ebp), %edx
-	jmp	.Lrcls_epilogue
-.LFE3:
-	.size	ffi_closure_raw_SYSV, .-ffi_closure_raw_SYSV
-#endif
+	.cfi_startproc
+	subl	$ffi_closure_raw_SYSV_FS, %esp
+	.cfi_adjust_cfa_offset ffi_closure_raw_SYSV_FS
+	movl	%ebx, 32(%esp)
+	.cfi_rel_offset %ebx, 32
 
-#if defined __PIC__
-# if defined __sun__ && defined __svr4__
-/* 32-bit Solaris 2/x86 uses datarel encoding for PIC.  GNU ld before 2.22
-   doesn't correctly sort .eh_frame_hdr with mixed encodings, so match this.  */
-#  define FDE_ENCODING		0x30	/* datarel */
-#  define FDE_ENCODE(X)		X@GOTOFF
-# else
-#  define FDE_ENCODING		0x1b	/* pcrel sdata4 */
-#  if defined HAVE_AS_X86_PCREL
-#   define FDE_ENCODE(X)	X-.
-#  else
-#   define FDE_ENCODE(X)	X@rel
-#  endif
-# endif
-#else
-# define FDE_ENCODING		0	/* absolute */
-# define FDE_ENCODE(X)		X
-#endif
+	/* Install each of the arguments to the closure in turn.  */
+	movl	RAW_CLOSURE_USER_DATA_OFFSET(%eax), %edx /* user_data */
+	movl	%edx, 12(%esp)
+
+	leal	ffi_closure_raw_SYSV_FS+4(%esp), %edx	/* raw_args */
+	movl	%edx, 8(%esp)
+
+	leal	16(%esp), %edx				/* &res */
+	movl	%edx, 4(%esp)
+
+	movl	RAW_CLOSURE_CIF_OFFSET(%eax), %ebx	/* cif */
+	movl	%ebx, (%esp)
+
+	call	*RAW_CLOSURE_FUN_OFFSET(%eax)
+
+	movl	CIF_FLAGS_OFFSET(%ebx), %eax		/* load rtype */
+	andl	$X86_RET_TYPE_MASK, %eax
 
-	.section	.eh_frame,EH_FRAME_FLAGS,@progbits
-.Lframe1:
-	.long	.LECIE1-.LSCIE1	/* Length of Common Information Entry */
-.LSCIE1:
-	.long	0x0	/* CIE Identifier Tag */
-	.byte	0x1	/* CIE Version */
-#ifdef HAVE_AS_ASCII_PSEUDO_OP
-#ifdef __PIC__
-	.ascii "zR\0"	/* CIE Augmentation */
-#else
-	.ascii "\0"	/* CIE Augmentation */
-#endif
-#elif defined HAVE_AS_STRING_PSEUDO_OP
 #ifdef __PIC__
-	.string "zR"	/* CIE Augmentation */
-#else
-	.string ""	/* CIE Augmentation */
-#endif
+	call	__x86.get_pc_thunk.bx
+1:	leal	0f-1b(%ebx, %eax, 8), %eax
 #else
-#error missing .ascii/.string
-#endif
-	.byte	0x1	/* .uleb128 0x1; CIE Code Alignment Factor */
-	.byte	0x7c	/* .sleb128 -4; CIE Data Alignment Factor */
-	.byte	0x8	/* CIE RA Column */
-#ifdef __PIC__
-	.byte	0x1	/* .uleb128 0x1; Augmentation size */
-	.byte	FDE_ENCODING
-#endif
-	.byte	0xc	/* DW_CFA_def_cfa */
-	.byte	0x4	/* .uleb128 0x4 */
-	.byte	0x4	/* .uleb128 0x4 */
-	.byte	0x88	/* DW_CFA_offset, column 0x8 */
-	.byte	0x1	/* .uleb128 0x1 */
-	.align 4
-.LECIE1:
-.LSFDE1:
-	.long	.LEFDE1-.LASFDE1	/* FDE Length */
-.LASFDE1:
-	.long	.LASFDE1-.Lframe1	/* FDE CIE offset */
-	.long	FDE_ENCODE(.LFB1)	/* FDE initial location */
-	.long	.LFE1-.LFB1		/* FDE address range */
-#ifdef __PIC__
-	.byte	0x0	/* .uleb128 0x0; Augmentation size */
-#endif
-	.byte	0x4	/* DW_CFA_advance_loc4 */
-	.long	.LCFI0-.LFB1
-	.byte	0xe	/* DW_CFA_def_cfa_offset */
-	.byte	0x8	/* .uleb128 0x8 */
-	.byte	0x85	/* DW_CFA_offset, column 0x5 */
-	.byte	0x2	/* .uleb128 0x2 */
-	.byte	0x4	/* DW_CFA_advance_loc4 */
-	.long	.LCFI1-.LCFI0
-	.byte	0xd	/* DW_CFA_def_cfa_register */
-	.byte	0x5	/* .uleb128 0x5 */
-	.align 4
-.LEFDE1:
-.LSFDE2:
-	.long	.LEFDE2-.LASFDE2	/* FDE Length */
-.LASFDE2:
-	.long	.LASFDE2-.Lframe1	/* FDE CIE offset */
-	.long	FDE_ENCODE(.LFB2)	/* FDE initial location */
-	.long	.LFE2-.LFB2		/* FDE address range */
-#ifdef __PIC__
-	.byte	0x0	/* .uleb128 0x0; Augmentation size */
-#endif
-	.byte	0x4	/* DW_CFA_advance_loc4 */
-	.long	.LCFI2-.LFB2
-	.byte	0xe	/* DW_CFA_def_cfa_offset */
-	.byte	0x8	/* .uleb128 0x8 */
-	.byte	0x85	/* DW_CFA_offset, column 0x5 */
-	.byte	0x2	/* .uleb128 0x2 */
-	.byte	0x4	/* DW_CFA_advance_loc4 */
-	.long	.LCFI3-.LCFI2
-	.byte	0xd	/* DW_CFA_def_cfa_register */
-	.byte	0x5	/* .uleb128 0x5 */
-#if !defined HAVE_HIDDEN_VISIBILITY_ATTRIBUTE && defined __PIC__
-	.byte	0x4	/* DW_CFA_advance_loc4 */
-	.long	.LCFI7-.LCFI3
-	.byte	0x83	/* DW_CFA_offset, column 0x3 */
-	.byte	0xa	/* .uleb128 0xa */
+	leal	0f(,%eax, 8), %eax
 #endif
-	.align 4
-.LEFDE2:
 
-#if !FFI_NO_RAW_API
+	movl	32(%esp), %ebx				/* restore ebx early */
+	.cfi_restore %ebx
+	jmp	*%eax
+
+	.align	8
+0:	/* Load table: the result buffer (&res) is at 16(%esp).  */
+E X86_RET_FLOAT
+	flds	16(%esp)
+	jmp	9f
+
+E X86_RET_DOUBLE
+	fldl	16(%esp)
+	jmp	9f
+
+E X86_RET_LDOUBLE
+	fldt	16(%esp)
+	jmp	9f
+
+E X86_RET_SINT8
+	movsbl	16(%esp), %eax
+	jmp	9f
+
+E X86_RET_SINT16
+	movswl	16(%esp), %eax
+	jmp	9f
+
+E X86_RET_UINT8
+	movzbl	16(%esp), %eax
+	jmp	9f
+
+E X86_RET_UINT16
+	movzwl	16(%esp), %eax
+	jmp	9f
+
+E X86_RET_INT64
+	movl	20(%esp), %edx		/* high word of the result */
+	/* fallthru */
+E X86_RET_INT32
+	movl	16(%esp), %eax		/* low word of the result */
+	/* fallthru */
+E X86_RET_VOID
+9:	addl	$ffi_closure_raw_SYSV_FS, %esp
+	.cfi_adjust_cfa_offset -ffi_closure_raw_SYSV_FS
+	ret
+	.cfi_adjust_cfa_offset ffi_closure_raw_SYSV_FS
+
+	/* We should never get here.  */
+E X86_RET_STRUCTPOP
+	ud2
+E X86_RET_STRUCTECX
+	ud2
+E X86_RET_UNUSED12
+	ud2
+E X86_RET_UNUSED13
+	ud2
+E X86_RET_UNUSED14
+	ud2
+E X86_RET_UNUSED15
+	ud2
+
+	.cfi_endproc
+	.size	ffi_closure_raw_SYSV, .-ffi_closure_raw_SYSV
+
+	.align	16
+	.globl	ffi_closure_raw_THISCALL
+	.type	ffi_closure_raw_THISCALL, @function
+	FFI_HIDDEN (ffi_closure_raw_THISCALL)
+/* Frame: 4*4 bytes of outgoing arguments at 0(%esp), the result buffer at 16(%esp), the %ebx save slot at 32(%esp).  */
+#define ffi_closure_raw_TC_FS  (8 + 16 + 4*4)
+/* Raw closure entry, thiscall flavour: %eax is used as the closure base, %ecx carries the register argument.  */
+ffi_closure_raw_THISCALL:
+	.cfi_startproc
+	/* Rearrange the stack such that %ecx is the first argument.
+	   This means moving the return address.  */
+	popl	%edx
+	.cfi_adjust_cfa_offset -4
+	.cfi_register %eip, %edx
+	pushl	%ecx
+	.cfi_adjust_cfa_offset 4
+	pushl	%edx
+	.cfi_adjust_cfa_offset 4
+	.cfi_rel_offset %eip, 0
+	subl	$ffi_closure_raw_TC_FS, %esp
+	.cfi_adjust_cfa_offset ffi_closure_raw_TC_FS
+	movl	%ebx, 32(%esp)				/* %ebx keeps cif live across the call */
+	.cfi_rel_offset %ebx, 32
+
+	/* Install each of the arguments to the closure in turn.  */
+	movl	RAW_CLOSURE_USER_DATA_OFFSET(%eax), %edx /* user_data */
+	movl	%edx, 12(%esp)
+
+	leal	ffi_closure_raw_TC_FS+4(%esp), %edx	/* raw_args */
+	movl	%edx, 8(%esp)
+
+	leal	16(%esp), %edx				/* &res */
+	movl	%edx, 4(%esp)
+
+	movl	RAW_CLOSURE_CIF_OFFSET(%eax), %ebx	/* cif */
+	movl	%ebx, (%esp)
+
+	call	*RAW_CLOSURE_FUN_OFFSET(%eax)		/* fun (cif, &res, raw_args, user_data) */
+
+	movl	CIF_FLAGS_OFFSET(%ebx), %eax		/* load rtype */
+	andl	$X86_RET_TYPE_MASK, %eax
 
-.LSFDE3:
-	.long	.LEFDE3-.LASFDE3	/* FDE Length */
-.LASFDE3:
-	.long	.LASFDE3-.Lframe1	/* FDE CIE offset */
-	.long	FDE_ENCODE(.LFB3)	/* FDE initial location */
-	.long	.LFE3-.LFB3		/* FDE address range */
 #ifdef __PIC__
-	.byte	0x0	/* .uleb128 0x0; Augmentation size */
+	call	__x86.get_pc_thunk.bx			/* %ebx = address of label 1 */
+1:	leal	0f-1b(%ebx, %eax, 8), %eax		/* %eax = table base + rtype * 8 */
+#else
+	leal	0f(,%eax, 8), %eax			/* %eax = table base + rtype * 8 */
 #endif
-	.byte	0x4	/* DW_CFA_advance_loc4 */
-	.long	.LCFI4-.LFB3
-	.byte	0xe	/* DW_CFA_def_cfa_offset */
-	.byte	0x8	/* .uleb128 0x8 */
-	.byte	0x85	/* DW_CFA_offset, column 0x5 */
-	.byte	0x2	/* .uleb128 0x2 */
-	.byte	0x4	/* DW_CFA_advance_loc4 */
-	.long	.LCFI5-.LCFI4
-	.byte	0xd	/* DW_CFA_def_cfa_register */
-	.byte	0x5	/* .uleb128 0x5 */
-	.byte	0x4	/* DW_CFA_advance_loc4 */
-	.long	.LCFI6-.LCFI5
-	.byte	0x86	/* DW_CFA_offset, column 0x6 */
-	.byte	0x3	/* .uleb128 0x3 */
-	.align 4
-.LEFDE3:
 
-#endif
+	movl	32(%esp), %ebx				/* restore ebx early */
+	.cfi_restore %ebx
+	jmp	*%eax					/* dispatch on the return type */
 
+	.align	8					/* table slots are 8 bytes apart */
+0:							/* the result was written to &res = 16(%esp) */
+E X86_RET_FLOAT
+	flds	16(%esp)
+	jmp	9f
+
+E X86_RET_DOUBLE
+	fldl	16(%esp)
+	jmp	9f
+
+E X86_RET_LDOUBLE
+	fldt	16(%esp)
+	jmp	9f
+
+E X86_RET_SINT8
+	movsbl	16(%esp), %eax
+	jmp	9f
+
+E X86_RET_SINT16
+	movswl	16(%esp), %eax
+	jmp	9f
+
+E X86_RET_UINT8
+	movzbl	16(%esp), %eax
+	jmp	9f
+
+E X86_RET_UINT16
+	movzwl	16(%esp), %eax
+	jmp	9f
+
+E X86_RET_INT64
+	movl	20(%esp), %edx				/* high word of the result */
+	/* fallthru */
+E X86_RET_INT32
+	movl	16(%esp), %eax				/* low word of the result */
+	/* fallthru */
+E X86_RET_VOID
+9:	addl	$ffi_closure_raw_TC_FS, %esp
+	.cfi_adjust_cfa_offset -ffi_closure_raw_TC_FS
+	/* Remove the extra %ecx argument we pushed.  */
+	ret	$4
+	.cfi_adjust_cfa_offset ffi_closure_raw_TC_FS
+
+	/* We should never get here.  */
+E X86_RET_STRUCTPOP
+	ud2
+E X86_RET_STRUCTECX
+	ud2
+E X86_RET_UNUSED12
+	ud2
+E X86_RET_UNUSED13
+	ud2
+E X86_RET_UNUSED14
+	ud2
+E X86_RET_UNUSED15
+	ud2
+
+	.cfi_endproc
+	.size	ffi_closure_raw_THISCALL, .-ffi_closure_raw_THISCALL
 #endif /* ifndef __x86_64__ */
 
 #if defined __ELF__ && defined __linux__
 	.section	.note.GNU-stack,"",@progbits
 #endif
+
+#if defined __PIC__ && !defined __x86_64__	/* the caller visible in this file sits inside #ifndef __x86_64__ */
+	.section .text.__x86.get_pc_thunk.bx,"axG",@progbits,__x86.get_pc_thunk.bx,comdat
+	.globl	__x86.get_pc_thunk.bx
+	.hidden	__x86.get_pc_thunk.bx
+	.type	__x86.get_pc_thunk.bx, @function
+__x86.get_pc_thunk.bx:				/* PIC helper: hands the caller its own pc in %ebx */
+	.cfi_startproc
+	movl	(%esp), %ebx			/* %ebx = return address = address just after the call */
+	ret
+	.cfi_endproc
+	.size	__x86.get_pc_thunk.bx, . - __x86.get_pc_thunk.bx
+#endif