From 1a0f488c328df63663eed29d18af44733ece3abc Mon Sep 17 00:00:00 2001
From: Richard Henderson <rth@redhat.com>
Date: Sat, 25 Dec 2004 01:54:40 -0800
Subject: [PATCH] ffi64.c (struct register_args): Rename from stackLayout.

        * src/x86/ffi64.c (struct register_args): Rename from stackLayout.
        (enum x86_64_reg_class): Add X86_64_COMPLEX_X87_CLASS.
        (merge_classes): Check for it.
        (SSE_CLASS_P): New.
        (classify_argument): Pass byte_offset by value; perform all updates
        inside struct case.
        (examine_argument): Add classes argument; handle
        X86_64_COMPLEX_X87_CLASS.
        (ffi_prep_args): Merge into ...
        (ffi_call): ... here.  Share stack frame with ffi_call_unix64.
        (ffi_prep_cif_machdep): Set up cif->flags for proper structure return.
        (ffi_fill_return_value): Remove.
        (ffi_prep_closure): Remove dead assert.
        (ffi_closure_unix64_inner): Rename from ffi_closure_UNIX64_inner.
        Rewrite to use struct register_args instead of va_list.  Create
        flags for handling structure returns.
        * src/x86/unix64.S: Remove dead strings.
        (ffi_call_unix64): Rename from ffi_call_UNIX64.  Rewrite to share
        stack frame with ffi_call.  Handle structure returns properly.
        (float2sse, floatfloat2sse, double2sse): Remove.
        (sse2float, sse2double, sse2floatfloat): Remove.
        (ffi_closure_unix64): Rename from ffi_closure_UNIX64.  Rewrite
        to handle structure returns properly.

From-SVN: r92602
---
 libffi/ChangeLog        |  26 ++
 libffi/src/x86/ffi64.c  | 662 ++++++++++++++++------------------------
 libffi/src/x86/unix64.S | 544 +++++++++++++++++++--------------
 3 files changed, 592 insertions(+), 640 deletions(-)

diff --git a/libffi/ChangeLog b/libffi/ChangeLog
index a8b1b8a04df..e26f22d9d21 100644
--- a/libffi/ChangeLog
+++ b/libffi/ChangeLog
@@ -1,3 +1,29 @@
+2004-12-25  Richard Henderson  <rth@redhat.com>
+
+	* src/x86/ffi64.c (struct register_args): Rename from stackLayout.
+	(enum x86_64_reg_class): Add X86_64_COMPLEX_X87_CLASS.
+	(merge_classes): Check for it.
+	(SSE_CLASS_P): New.
+	(classify_argument): Pass byte_offset by value; perform all updates
+	inside struct case.
+	(examine_argument): Add classes argument; handle
+	X86_64_COMPLEX_X87_CLASS.
+	(ffi_prep_args): Merge into ...
+	(ffi_call): ... here.  Share stack frame with ffi_call_unix64.
+	(ffi_prep_cif_machdep): Set up cif->flags for proper structure return.
+	(ffi_fill_return_value): Remove.
+	(ffi_prep_closure): Remove dead assert.
+	(ffi_closure_unix64_inner): Rename from ffi_closure_UNIX64_inner.
+	Rewrite to use struct register_args instead of va_list.  Create
+	flags for handling structure returns.
+	* src/x86/unix64.S: Remove dead strings.
+	(ffi_call_unix64): Rename from ffi_call_UNIX64.  Rewrite to share
+	stack frame with ffi_call.  Handle structure returns properly.
+	(float2sse, floatfloat2sse, double2sse): Remove.
+	(sse2float, sse2double, sse2floatfloat): Remove.
+	(ffi_closure_unix64): Rename from ffi_closure_UNIX64.  Rewrite
+	to handle structure returns properly.
+
 2004-12-08  David Edelsohn  <edelsohn@gnu.org>
 
 	* Makefile.am (AM_MAKEFLAGS): Remove duplicate LIBCFLAGS and
diff --git a/libffi/src/x86/ffi64.c b/libffi/src/x86/ffi64.c
index 653d45c243a..754975ec060 100644
--- a/libffi/src/x86/ffi64.c
+++ b/libffi/src/x86/ffi64.c
@@ -29,22 +29,20 @@
 #include <stdlib.h>
 #include <stdarg.h>
 
-/* ffi_prep_args is called by the assembly routine once stack space
-   has been allocated for the function's arguments */
-
 #ifdef __x86_64__
 
 #define MAX_GPR_REGS 6
 #define MAX_SSE_REGS 8
-typedef struct
+
+struct register_args
 {
   /* Registers for argument passing.  */
-  long gpr[MAX_GPR_REGS];
+  UINT64 gpr[MAX_GPR_REGS];
   __int128_t sse[MAX_SSE_REGS];
+};
 
-  /* Stack space for arguments.  */
-  char argspace[0];
-} stackLayout;
+extern void ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
+			     void *raddr, void (*fnaddr)());
 
 /* All reference to register classes here is identical to the code in
    gcc/config/i386/i386.c. Do *not* change one without the other.  */
@@ -55,8 +53,7 @@ typedef struct
    use SF or DFmode move instead of DImode to avoid reformating penalties.
 
    Similary we play games with INTEGERSI_CLASS to use cheaper SImode moves
-   whenever possible (upper half does contain padding).
- */
+   whenever possible (upper half does contain padding).  */
 enum x86_64_reg_class
   {
     X86_64_NO_CLASS,
@@ -68,11 +65,14 @@ enum x86_64_reg_class
     X86_64_SSEUP_CLASS,
     X86_64_X87_CLASS,
     X86_64_X87UP_CLASS,
+    X86_64_COMPLEX_X87_CLASS,
     X86_64_MEMORY_CLASS
   };
 
 #define MAX_CLASSES 4
 
+#define SSE_CLASS_P(X)	((X) >= X86_64_SSE_CLASS && (X) <= X86_64_SSEUP_CLASS)
+
 /* x86-64 register passing implementation.  See x86-64 ABI for details.  Goal
    of this code is to classify each 8bytes of incoming argument by the register
    class and assign registers accordingly.  */
@@ -106,9 +106,14 @@ merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
       || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
     return X86_64_INTEGER_CLASS;
 
-  /* Rule #5: If one of the classes is X87 or X87UP class, MEMORY is used.  */
-  if (class1 == X86_64_X87_CLASS || class1 == X86_64_X87UP_CLASS
-      || class2 == X86_64_X87_CLASS || class2 == X86_64_X87UP_CLASS)
+  /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
+     MEMORY is used.  */
+  if (class1 == X86_64_X87_CLASS
+      || class1 == X86_64_X87UP_CLASS
+      || class1 == X86_64_COMPLEX_X87_CLASS
+      || class2 == X86_64_X87_CLASS
+      || class2 == X86_64_X87UP_CLASS
+      || class2 == X86_64_COMPLEX_X87_CLASS)
     return X86_64_MEMORY_CLASS;
 
   /* Rule #6: Otherwise class SSE is used.  */
@@ -125,11 +130,8 @@ merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
 */
 static int
 classify_argument (ffi_type *type, enum x86_64_reg_class classes[],
-		   int *byte_offset)
+		   size_t byte_offset)
 {
-  /* First, align to the right place.  */
-  *byte_offset = ALIGN(*byte_offset, type->alignment);
-
   switch (type->type)
     {
     case FFI_TYPE_UINT8:
@@ -141,13 +143,13 @@ classify_argument (ffi_type *type, enum x86_64_reg_class classes[],
     case FFI_TYPE_UINT64:
     case FFI_TYPE_SINT64:
     case FFI_TYPE_POINTER:
-      if (((*byte_offset) % 8 + type->size) <= 4)
+      if (byte_offset + type->size <= 4)
 	classes[0] = X86_64_INTEGERSI_CLASS;
       else
 	classes[0] = X86_64_INTEGER_CLASS;
       return 1;
     case FFI_TYPE_FLOAT:
-      if (((*byte_offset) % 8) == 0)
+      if (byte_offset == 0)
 	classes[0] = X86_64_SSESF_CLASS;
       else
 	classes[0] = X86_64_SSE_CLASS;
@@ -175,22 +177,23 @@ classify_argument (ffi_type *type, enum x86_64_reg_class classes[],
 	  classes[i] = X86_64_NO_CLASS;
 
 	/* Merge the fields of structure.  */
-	for (ptr=type->elements; (*ptr)!=NULL; ptr++)
+	for (ptr = type->elements; *ptr != NULL; ptr++)
 	  {
 	    int num;
 
-	    num = classify_argument (*ptr, subclasses, byte_offset);
+	    byte_offset = ALIGN (byte_offset, (*ptr)->alignment);
+
+	    num = classify_argument (*ptr, subclasses, byte_offset % 8);
 	    if (num == 0)
 	      return 0;
 	    for (i = 0; i < num; i++)
 	      {
-		int pos = *byte_offset / 8;
+		int pos = byte_offset / 8;
 		classes[i + pos] =
 		  merge_classes (subclasses[i], classes[i + pos]);
 	      }
 
-	    if ((*ptr)->type != FFI_TYPE_STRUCT)
-	      *byte_offset += (*ptr)->size;
+	    byte_offset += (*ptr)->size;
 	  }
 
 	/* Final merger cleanup.  */
@@ -222,155 +225,196 @@ classify_argument (ffi_type *type, enum x86_64_reg_class classes[],
 }
 
 /* Examine the argument and return set number of register required in each
-   class.  Return 0 iff parameter should be passed in memory.  */
+   class.  Return zero iff parameter should be passed in memory, otherwise
+   the number of registers.  */
+
 static int
-examine_argument (ffi_type *type, int in_return, int *int_nregs,int *sse_nregs)
+examine_argument (ffi_type *type, enum x86_64_reg_class classes[MAX_CLASSES],
+		  _Bool in_return, int *pngpr, int *pnsse)
 {
-  enum x86_64_reg_class class[MAX_CLASSES];
-  int offset = 0;
-  int n;
-
-  n = classify_argument (type, class, &offset);
+  int i, n, ngpr, nsse;
 
+  n = classify_argument (type, classes, 0);
   if (n == 0)
     return 0;
 
-  *int_nregs = 0;
-  *sse_nregs = 0;
-  for (n--; n>=0; n--)
-    switch (class[n])
+  ngpr = nsse = 0;
+  for (i = 0; i < n; ++i)
+    switch (classes[i])
       {
       case X86_64_INTEGER_CLASS:
       case X86_64_INTEGERSI_CLASS:
-	(*int_nregs)++;
+	ngpr++;
 	break;
       case X86_64_SSE_CLASS:
       case X86_64_SSESF_CLASS:
       case X86_64_SSEDF_CLASS:
-	(*sse_nregs)++;
+	nsse++;
 	break;
       case X86_64_NO_CLASS:
       case X86_64_SSEUP_CLASS:
 	break;
       case X86_64_X87_CLASS:
       case X86_64_X87UP_CLASS:
-	if (!in_return)
-	  return 0;
-	break;
+      case X86_64_COMPLEX_X87_CLASS:
+	return in_return != 0;
       default:
 	abort ();
       }
-  return 1;
+
+  *pngpr = ngpr;
+  *pnsse = nsse;
+
+  return n;
 }
 
-/* Functions to load floats and double to an SSE register placeholder.  */
-extern void float2sse (float, __int128_t *);
-extern void double2sse (double, __int128_t *);
-extern void floatfloat2sse (void *, __int128_t *);
+/* Perform machine dependent cif processing.  */
 
-/* Functions to put the floats and doubles back.  */
-extern float sse2float (__int128_t *);
-extern double sse2double (__int128_t *);
-extern void sse2floatfloat(__int128_t *, void *);
-
-/*@-exportheader@*/
-void
-ffi_prep_args (stackLayout *stack, extended_cif *ecif)
-/*@=exportheader@*/
+ffi_status
+ffi_prep_cif_machdep (ffi_cif *cif)
 {
-  int gprcount, ssecount, i, g, s;
-  void **p_argv;
-  void *argp = &stack->argspace;
-  ffi_type **p_arg;
+  int gprcount, ssecount, i, avn, n, ngpr, nsse, flags;
+  enum x86_64_reg_class classes[MAX_CLASSES];
+  size_t bytes;
 
-  /* First check if the return value should be passed in memory. If so,
-     pass the pointer as the first argument.  */
   gprcount = ssecount = 0;
-  if (ecif->cif->rtype->type != FFI_TYPE_VOID 
-      && examine_argument (ecif->cif->rtype, 1, &g, &s) == 0)
-    stack->gpr[gprcount++] = (long) ecif->rvalue;
 
-  for (i=ecif->cif->nargs, p_arg=ecif->cif->arg_types, p_argv = ecif->avalue;
-       i!=0; i--, p_arg++, p_argv++)
+  flags = cif->rtype->type;
+  if (flags != FFI_TYPE_VOID)
     {
-      int in_register = 0;
-
-      switch ((*p_arg)->type)
+      n = examine_argument (cif->rtype, classes, 1, &ngpr, &nsse);
+      if (n == 0)
 	{
-	case FFI_TYPE_SINT8:
-	case FFI_TYPE_SINT16:
-	case FFI_TYPE_SINT32:
-	case FFI_TYPE_SINT64:
-	case FFI_TYPE_UINT8:
-	case FFI_TYPE_UINT16:
-	case FFI_TYPE_UINT32:
-	case FFI_TYPE_UINT64:
-	case FFI_TYPE_POINTER:
-	  if (gprcount < MAX_GPR_REGS)
-	    {
-	      stack->gpr[gprcount] = 0;
-	      stack->gpr[gprcount++] = *(long long *)(*p_argv);
-	      in_register = 1;
-	    }
-	  break;
-
-	case FFI_TYPE_FLOAT:
-	  if (ssecount < MAX_SSE_REGS)
-	    {
-	      float2sse (*(float *)(*p_argv), &stack->sse[ssecount++]);
-	      in_register = 1;
-	    }
-	  break;
-
-	case FFI_TYPE_DOUBLE:
-	  if (ssecount < MAX_SSE_REGS)
-	    {
-	      double2sse (*(double *)(*p_argv), &stack->sse[ssecount++]);
-	      in_register = 1;
-	    }
-	  break;
+	  /* The return value is passed in memory.  A pointer to that
+	     memory is the first argument.  Allocate a register for it.  */
+	  gprcount++;
+	  /* We don't have to do anything in asm for the return.  */
+	  flags = FFI_TYPE_VOID;
 	}
-
-      if (in_register)
-	continue;
-
-      /* Either all places in registers where filled, or this is a
-	 type that potentially goes into a memory slot.  */
-      if (examine_argument (*p_arg, 0, &g, &s) == 0
-	  || gprcount + g > MAX_GPR_REGS || ssecount + s > MAX_SSE_REGS)
+      else if (flags == FFI_TYPE_STRUCT)
 	{
-	  /* Pass this argument in memory.  */
-	  argp = (void *)ALIGN(argp, (*p_arg)->alignment);
-	  /* Stack arguments are *always* at least 8 byte aligned.  */
-	  argp = (void *)ALIGN(argp, 8);
-	  memcpy (argp, *p_argv, (*p_arg)->size);
-	  argp += (*p_arg)->size;
+	  /* Mark which registers the result appears in.  */
+	  _Bool sse0 = SSE_CLASS_P (classes[0]);
+	  _Bool sse1 = n == 2 && SSE_CLASS_P (classes[1]);
+	  if (sse0 && !sse1)
+	    flags |= 1 << 8;
+	  else if (!sse0 && sse1)
+	    flags |= 1 << 9;
+	  else if (sse0 && sse1)
+	    flags |= 1 << 10;
+	  /* Mark the true size of the structure.  */
+	  flags |= cif->rtype->size << 11;
+	}
+    }
+  cif->flags = flags;
+
+  /* Go over all arguments and determine the way they should be passed.
+     If it's in a register and there is space for it, let that be so. If
+     not, add its size to the stack byte count.  */
+  for (bytes = 0, i = 0, avn = cif->nargs; i < avn; i++)
+    {
+      if (examine_argument (cif->arg_types[i], classes, 0, &ngpr, &nsse) == 0
+	  || gprcount + ngpr > MAX_GPR_REGS
+	  || ssecount + nsse > MAX_SSE_REGS)
+	{
+	  long align = cif->arg_types[i]->alignment;
+
+	  if (align < 8)
+	    align = 8;
+
+	  bytes = ALIGN(bytes, align);
+	  bytes += cif->arg_types[i]->size;
 	}
       else
 	{
-	  /* All easy cases are eliminated. Now fire the big guns.  */
+	  gprcount += ngpr;
+	  ssecount += nsse;
+	}
+    }
+  cif->bytes = bytes;
 
-	  enum x86_64_reg_class classes[MAX_CLASSES];
-	  int offset = 0, j, num;
-	  void *a;
+  return FFI_OK;
+}
 
-	  num = classify_argument (*p_arg, classes, &offset);
-	  for (j=0, a=*p_argv; j<num; j++, a+=8)
+void
+ffi_call (ffi_cif *cif, void (*fn)(), void *rvalue, void **avalue)
+{
+  enum x86_64_reg_class classes[MAX_CLASSES];
+  char *stack, *argp;
+  ffi_type **arg_types;
+  int gprcount, ssecount, ngpr, nsse, i, avn;
+  _Bool ret_in_memory;
+  struct register_args *reg_args;
+
+  /* Can't call 32-bit mode from 64-bit mode.  */
+  FFI_ASSERT (cif->abi == FFI_UNIX64);
+
+  /* If the return value is a struct and we don't have a return value
+     address then we need to make one.  Note the setting of flags to
+     VOID above in ffi_prep_cif_machdep.  */
+  ret_in_memory = (cif->rtype->type == FFI_TYPE_STRUCT
+		   && cif->flags == FFI_TYPE_VOID);
+  if (rvalue == NULL && ret_in_memory)
+    rvalue = alloca (cif->rtype->size);
+
+  /* Allocate the space for the arguments, plus 4 words of temp space.  */
+  stack = alloca (sizeof (struct register_args) + cif->bytes + 4*8);
+  reg_args = (struct register_args *) stack;
+  argp = stack + sizeof (struct register_args);
+
+  gprcount = ssecount = 0;
+
+  /* If the return value is passed in memory, add the pointer as the
+     first integer argument.  */
+  if (ret_in_memory)
+    reg_args->gpr[gprcount++] = (long) rvalue;
+
+  avn = cif->nargs;
+  arg_types = cif->arg_types;
+
+  for (i = 0; i < avn; ++i)
+    {
+      size_t size = arg_types[i]->size;
+      int n;
+
+      n = examine_argument (arg_types[i], classes, 0, &ngpr, &nsse);
+      if (n == 0
+	  || gprcount + ngpr > MAX_GPR_REGS
+	  || ssecount + nsse > MAX_SSE_REGS)
+	{
+	  long align = arg_types[i]->alignment;
+
+	  /* Stack arguments are *always* at least 8 byte aligned.  */
+	  if (align < 8)
+	    align = 8;
+
+	  /* Pass this argument in memory.  */
+	  argp = (void *) ALIGN (argp, align);
+	  memcpy (argp, avalue[i], size);
+	  argp += size;
+	}
+      else
+	{
+	  /* The argument is passed entirely in registers.  */
+	  char *a = (char *) avalue[i];
+	  int j;
+
+	  for (j = 0; j < n; j++, a += 8, size -= 8)
 	    {
 	      switch (classes[j])
 		{
 		case X86_64_INTEGER_CLASS:
 		case X86_64_INTEGERSI_CLASS:
-		  stack->gpr[gprcount++] = *(long long *)a;
+		  reg_args->gpr[gprcount] = 0;
+		  memcpy (&reg_args->gpr[gprcount], a, size < 8 ? size : 8);
+		  gprcount++;
 		  break;
 		case X86_64_SSE_CLASS:
-		  floatfloat2sse (a, &stack->sse[ssecount++]);
+		case X86_64_SSEDF_CLASS:
+		  reg_args->sse[ssecount++] = *(UINT64 *) a;
 		  break;
 		case X86_64_SSESF_CLASS:
-		  float2sse (*(float *)a, &stack->sse[ssecount++]);
-		  break;
-		case X86_64_SSEDF_CLASS:
-		  double2sse (*(double *)a, &stack->sse[ssecount++]);
+		  reg_args->sse[ssecount++] = *(UINT32 *) a;
 		  break;
 		default:
 		  abort();
@@ -378,203 +422,13 @@ ffi_prep_args (stackLayout *stack, extended_cif *ecif)
 	    }
 	}
     }
+
+  ffi_call_unix64 (stack, cif->bytes + sizeof (struct register_args),
+		   cif->flags, rvalue, fn);
 }
 
-/* Perform machine dependent cif processing.  */
-ffi_status
-ffi_prep_cif_machdep (ffi_cif *cif)
-{
-  int gprcount, ssecount, i, g, s;
 
-  gprcount = ssecount = 0;
-
-  /* Reset the byte count. We handle this size estimation here.  */
-  cif->bytes = 0;
-
-  /* If the return value should be passed in memory, pass the pointer
-     as the first argument. The actual memory isn't allocated here.  */
-  if (cif->rtype->type != FFI_TYPE_VOID 
-      && examine_argument (cif->rtype, 1, &g, &s) == 0)
-    gprcount = 1;
-
-  /* Go over all arguments and determine the way they should be passed.
-     If it's in a register and there is space for it, let that be so. If
-     not, add it's size to the stack byte count.  */
-  for (i=0; i<cif->nargs; i++)
-    {
-      if (examine_argument (cif->arg_types[i], 0, &g, &s) == 0
-	  || gprcount + g > MAX_GPR_REGS || ssecount + s > MAX_SSE_REGS)
-	{
-	  /* This is passed in memory. First align to the basic type.  */
-	  cif->bytes = ALIGN(cif->bytes, cif->arg_types[i]->alignment);
-
-	  /* Stack arguments are *always* at least 8 byte aligned.  */
-	  cif->bytes = ALIGN(cif->bytes, 8);
-
-	  /* Now add the size of this argument.  */
-	  cif->bytes += cif->arg_types[i]->size;
-	}
-      else
-	{
-	  gprcount += g;
-	  ssecount += s;
-	}
-    }
-
-  /* Set the flag for the closures return.  */
-    switch (cif->rtype->type)
-    {
-    case FFI_TYPE_VOID:
-    case FFI_TYPE_STRUCT:
-    case FFI_TYPE_SINT64:
-    case FFI_TYPE_FLOAT:
-    case FFI_TYPE_DOUBLE:
-    case FFI_TYPE_LONGDOUBLE:
-      cif->flags = (unsigned) cif->rtype->type;
-      break;
-
-    case FFI_TYPE_UINT64:
-      cif->flags = FFI_TYPE_SINT64;
-      break;
-
-    default:
-      cif->flags = FFI_TYPE_INT;
-      break;
-    }
-
-  return FFI_OK;
-}
-
-typedef struct
-{
-  long gpr[2];
-  __int128_t sse[2];
-  long double st0;
-} return_value;
-
-void
-ffi_fill_return_value (return_value *rv, extended_cif *ecif)
-{
-  enum x86_64_reg_class classes[MAX_CLASSES];
-  int i = 0, num;
-  long *gpr = rv->gpr;
-  __int128_t *sse = rv->sse;
-  signed char sc;
-  signed short ss;
-
-  /* This is needed because of the way x86-64 handles signed short
-     integers.  */
-  switch (ecif->cif->rtype->type)
-    {
-    case FFI_TYPE_SINT8:
-      sc = *(signed char *)gpr;
-      *(long long *)ecif->rvalue = (long long)sc;
-      return;
-    case FFI_TYPE_SINT16:
-      ss = *(signed short *)gpr;
-      *(long long *)ecif->rvalue = (long long)ss;
-      return;
-    default:
-      /* Just continue.  */
-      ;
-    }
-
-  num = classify_argument (ecif->cif->rtype, classes, &i);
-
-  if (num == 0)
-    /* Return in memory.  */
-    ecif->rvalue = (void *) rv->gpr[0];
-  else if (num == 2 && classes[0] == X86_64_X87_CLASS &&
-	classes[1] == X86_64_X87UP_CLASS)
-    /* This is a long double (this is easiest to handle this way instead
-       of an eightbyte at a time as in the loop below.  */
-    *((long double *)ecif->rvalue) = rv->st0;
-  else
-    {
-      void *a;
-
-      for (i=0, a=ecif->rvalue; i<num; i++, a+=8)
-	{
-	  switch (classes[i])
-	    {
-	    case X86_64_INTEGER_CLASS:
-	    case X86_64_INTEGERSI_CLASS:
-	      *(long long *)a = *gpr;
-	      gpr++;
-	      break;
-	    case X86_64_SSE_CLASS:
-	      sse2floatfloat (sse++, a);
-	      break;
-	    case X86_64_SSESF_CLASS:
-	      *(float *)a = sse2float (sse++);
-	      break;
-	    case X86_64_SSEDF_CLASS:
-	      *(double *)a = sse2double (sse++);
-	      break;
-	    default:
-	      abort();
-	    }
-	}
-    }
-}
-
-/*@-declundef@*/
-/*@-exportheader@*/
-extern void ffi_call_UNIX64(void (*)(stackLayout *, extended_cif *),
-			    void (*) (return_value *, extended_cif *),
-			    /*@out@*/ extended_cif *, 
-			    unsigned, /*@out@*/ unsigned *, void (*fn)());
-/*@=declundef@*/
-/*@=exportheader@*/
-
-void ffi_call(/*@dependent@*/ ffi_cif *cif, 
-	      void (*fn)(), 
-	      /*@out@*/ void *rvalue, 
-	      /*@dependent@*/ void **avalue)
-{
-  extended_cif ecif;
-  int dummy;
-
-  ecif.cif = cif;
-  ecif.avalue = avalue;
-  
-  /* If the return value is a struct and we don't have a return	*/
-  /* value address then we need to make one		        */
-
-  if ((rvalue == NULL) && 
-      (examine_argument (cif->rtype, 1, &dummy, &dummy) == 0))
-    {
-      /*@-sysunrecog@*/
-      ecif.rvalue = alloca(cif->rtype->size);
-      /*@=sysunrecog@*/
-    }
-  else
-    ecif.rvalue = rvalue;
-    
-  /* Stack must always be 16byte aligned. Make it so.  */
-  cif->bytes = ALIGN(cif->bytes, 16);
-  
-  switch (cif->abi) 
-    {
-    case FFI_SYSV:
-      /* Calling 32bit code from 64bit is not possible  */
-      FFI_ASSERT(0);
-      break;
-
-    case FFI_UNIX64:
-      /*@-usedef@*/
-      ffi_call_UNIX64 (ffi_prep_args, ffi_fill_return_value, &ecif,
-		       cif->bytes, ecif.rvalue, fn);
-      /*@=usedef@*/
-      break;
-
-    default:
-      FFI_ASSERT(0);
-      break;
-    }
-}
-
-extern void ffi_closure_UNIX64(void);
+extern void ffi_closure_unix64(void);
 
 ffi_status
 ffi_prep_closure (ffi_closure* closure,
@@ -584,14 +438,12 @@ ffi_prep_closure (ffi_closure* closure,
 {
   volatile unsigned short *tramp;
 
-  /* FFI_ASSERT (cif->abi == FFI_OSF);  */
-
   tramp = (volatile unsigned short *) &closure->tramp[0];
   tramp[0] = 0xbb49;		/* mov <code>, %r11	*/
   tramp[5] = 0xba49;		/* mov <data>, %r10	*/
   tramp[10] = 0xff49;		/* jmp *%r11	*/
   tramp[11] = 0x00e3;
-  *(void * volatile *) &tramp[1] = ffi_closure_UNIX64;
+  *(void * volatile *) &tramp[1] = ffi_closure_unix64;
   *(void * volatile *) &tramp[6] = closure;
 
   closure->cif = cif;
@@ -602,107 +454,109 @@ ffi_prep_closure (ffi_closure* closure,
 }
 
 int
-ffi_closure_UNIX64_inner(ffi_closure *closure, va_list l, void *rp)
+ffi_closure_unix64_inner(ffi_closure *closure, void *rvalue,
+			 struct register_args *reg_args, char *argp)
 {
   ffi_cif *cif;
   void **avalue;
   ffi_type **arg_types;
-  long i, avn, argn;
+  long i, avn;
+  int gprcount, ssecount, ngpr, nsse;
+  int ret;
 
   cif = closure->cif;
   avalue = alloca(cif->nargs * sizeof(void *));
+  gprcount = ssecount = 0;
 
-  argn = 0;
+  ret = cif->rtype->type;
+  if (ret != FFI_TYPE_VOID)
+    {
+      enum x86_64_reg_class classes[MAX_CLASSES];
+      int n = examine_argument (cif->rtype, classes, 1, &ngpr, &nsse);
+      if (n == 0)
+	{
+	  /* The return value goes in memory.  Arrange for the closure
+	     return value to go directly back to the original caller.  */
+	  rvalue = (void *) reg_args->gpr[gprcount++];
+	  /* We don't have to do anything in asm for the return.  */
+	  ret = FFI_TYPE_VOID;
+	}
+      else if (ret == FFI_TYPE_STRUCT && n == 2)
+	{
+	  /* Mark which register the second word of the structure goes in.  */
+	  _Bool sse0 = SSE_CLASS_P (classes[0]);
+	  _Bool sse1 = SSE_CLASS_P (classes[1]);
+	  if (!sse0 && sse1)
+	    ret |= 1 << 8;
+	  else if (sse0 && !sse1)
+	    ret |= 1 << 9;
+	}
+    }
 
-  i = 0;
   avn = cif->nargs;
   arg_types = cif->arg_types;
   
-  /* Grab the addresses of the arguments from the stack frame.  */
-  while (i < avn)
+  for (i = 0; i < avn; ++i)
     {
-      switch (arg_types[i]->type)
+      enum x86_64_reg_class classes[MAX_CLASSES];
+      int n;
+
+      n = examine_argument (arg_types[i], classes, 0, &ngpr, &nsse);
+      if (n == 0
+	  || gprcount + ngpr > MAX_GPR_REGS
+	  || ssecount + nsse > MAX_SSE_REGS)
 	{
-	case FFI_TYPE_SINT8:
-	case FFI_TYPE_UINT8:
-	case FFI_TYPE_SINT16:
-	case FFI_TYPE_UINT16:
-	case FFI_TYPE_SINT32:
-	case FFI_TYPE_UINT32:
-	case FFI_TYPE_SINT64:
-	case FFI_TYPE_UINT64:
-	case FFI_TYPE_POINTER:
-	  {
-	    if (l->gp_offset > 48-8)
-	      {
-		avalue[i] = l->overflow_arg_area;
-		l->overflow_arg_area = (char *)l->overflow_arg_area + 8;
-	      }
-	    else
-	      {
-		avalue[i] = (char *)l->reg_save_area + l->gp_offset;
-		l->gp_offset += 8;
-	      }
-	  }
-	  break;
+	  long align = arg_types[i]->alignment;
 
-	case FFI_TYPE_STRUCT:
-	  /* FIXME  */
-	  FFI_ASSERT(0);
-	  break;
+	  /* Stack arguments are *always* at least 8 byte aligned.  */
+	  if (align < 8)
+	    align = 8;
 
-	case FFI_TYPE_DOUBLE:
-	  {
-	    if (l->fp_offset > 176-16)
-	      {
-		avalue[i] = l->overflow_arg_area;
-		l->overflow_arg_area = (char *)l->overflow_arg_area + 8;
-	      }
-	    else
-	      {
-		avalue[i] = (char *)l->reg_save_area + l->fp_offset;
-		l->fp_offset += 16;
-	      }
-	  }
-#if DEBUG_FFI
-	  fprintf (stderr, "double arg %d = %g\n", i, *(double *)avalue[i]);
-#endif
-	  break;
-	  
-	case FFI_TYPE_FLOAT:
-	  {
-	    if (l->fp_offset > 176-16)
-	      {
-		avalue[i] = l->overflow_arg_area;
-		l->overflow_arg_area = (char *)l->overflow_arg_area + 8;
-	      }
-	    else
-	      {
-		avalue[i] = (char *)l->reg_save_area + l->fp_offset;
-		l->fp_offset += 16;
-	      }
-	  }
-#if DEBUG_FFI
-	  fprintf (stderr, "float arg %d = %g\n", i, *(float *)avalue[i]);
-#endif
-	  break;
-	  
-	default:
-	  FFI_ASSERT(0);
+	  /* Pass this argument in memory.  */
+	  argp = (void *) ALIGN (argp, align);
+	  avalue[i] = argp;
+	  argp += arg_types[i]->size;
 	}
+      /* If the argument is in a single register, or two consecutive
+	 integer registers, then we can use that address directly.  */
+      else if (n == 1
+	       || (n == 2
+		   && !(SSE_CLASS_P (classes[0]) || SSE_CLASS_P (classes[1]))))
+	{
+	  /* Each sse[] slot is 16 bytes, so SSE pairs are never adjacent.  */
+	  if (SSE_CLASS_P (classes[0]))
+	    {
+	      avalue[i] = &reg_args->sse[ssecount];
+	      ssecount += n;
+	    }
+	  else
+	    {
+	      avalue[i] = &reg_args->gpr[gprcount];
+	      gprcount += n;
+	    }
+	}
+      /* Otherwise, allocate space to make them consecutive.  */
+      else
+	{
+	  char *a = alloca (16);
+	  int j;
 
-      argn += ALIGN(arg_types[i]->size, FFI_SIZEOF_ARG) / FFI_SIZEOF_ARG;
-      i++;
+	  avalue[i] = a;
+	  for (j = 0; j < n; j++, a += 8)
+	    {
+	      if (SSE_CLASS_P (classes[j]))
+		memcpy (a, &reg_args->sse[ssecount++], 8);
+	      else
+		memcpy (a, &reg_args->gpr[gprcount++], 8);
+	    }
+	}
     }
 
   /* Invoke the closure.  */
-  (closure->fun) (cif, rp, avalue, closure->user_data);
+  closure->fun (cif, rvalue, avalue, closure->user_data);
 
-  /* FIXME: Structs not supported.  */
-  FFI_ASSERT(cif->rtype->type != FFI_TYPE_STRUCT);
-
-  /* Tell ffi_closure_UNIX64 how to perform return type promotions.  */
-
-  return cif->rtype->type;
+  /* Tell assembly how to perform return type promotions.  */
+  return ret;
 }
-#endif /* ifndef __x86_64__ */
+
+#endif /* __x86_64__ */
diff --git a/libffi/src/x86/unix64.S b/libffi/src/x86/unix64.S
index 310fed71843..5e1c6c5e516 100644
--- a/libffi/src/x86/unix64.S
+++ b/libffi/src/x86/unix64.S
@@ -28,276 +28,348 @@
 #include <fficonfig.h>
 #include <ffi.h>
 
-	.section	.rodata
-.LC0:
-	.string	"asm in progress %lld\n"
-.LC1:
-	.string	"asm in progress\n"
 .text
+
+/* ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
+                    void *raddr, void (*fnaddr)());
+
+   A bit of trickiness here -- ARGS+BYTES is the base of the stack frame
+   for this function.  This has been allocated by ffi_call.  We also
+   deallocate some of the stack that has been alloca'd.  */
+
 	.align	2
-.globl ffi_call_UNIX64
-        .type	ffi_call_UNIX64,@function
+	.globl	ffi_call_unix64
+        .type	ffi_call_unix64,@function
 
-ffi_call_UNIX64:
-.LFB1:
-        pushq	%rbp
-.LCFI0:
-        movq	%rsp, %rbp
-.LCFI1:
-	/* Save all arguments */
-	subq	$48, %rsp
-.LCFI2:
-	movq	%rdi, -8(%rbp)		/* ffi_prep_args	 */
-	movq	%rsi, -16(%rbp)		/* ffi_fill_return_value */
-	movq	%rdx, -24(%rbp)		/* ecif			 */
-	movq	%rcx, -32(%rbp)		/* cif->bytes		 */
-	movq	%r8, -40(%rbp)		/* ecif.rvalue		 */
-	movq	%r9, -48(%rbp)		/* fn			 */
+ffi_call_unix64:
+.LUW0:
+	movq	(%rsp), %r10		/* Load return address.  */
+	leaq	(%rdi, %rsi), %rax	/* Find local stack base.  */
+	movq	%rdx, (%rax)		/* Save flags.  */
+	movq	%rcx, 8(%rax)		/* Save raddr.  */
+	movq	%rbp, 16(%rax)		/* Save old frame pointer.  */
+	movq	%r10, 24(%rax)		/* Relocate return address.  */
+	movq	%rax, %rbp		/* Finalize local stack frame.  */
+.LUW1:
+	movq	%rdi, %r10		/* Save a copy of the register area. */
+	movq	%r8, %r11		/* Save a copy of the target fn.  */
 
-	/* Make room for all of the new args and the register args */
-	addl	$176, %ecx
-.LCFI3:
-	subq	%rcx, %rsp
-.LCFI4:
-	/* Setup the call to ffi_prep_args.  */
-	movq	%rdi, %rax		/* &ffi_prep_args	*/
-	movq	%rsp, %rdi		/* stackLayout		*/
-	movq	%rdx, %rsi		/* ecif			*/
-	call	*%rax			/* ffi_prep_args(stackLayout, ecif);*/ 
+	/* Load up all argument registers.  */
+	movq	(%r10), %rdi
+	movq	8(%r10), %rsi
+	movq	16(%r10), %rdx
+	movq	24(%r10), %rcx
+	movq	32(%r10), %r8
+	movq	40(%r10), %r9
+	movdqa	48(%r10), %xmm0
+	movdqa	64(%r10), %xmm1
+	movdqa	80(%r10), %xmm2
+	movdqa	96(%r10), %xmm3
+	movdqa	112(%r10), %xmm4
+	movdqa	128(%r10), %xmm5
+	movdqa	144(%r10), %xmm6
+	movdqa	160(%r10), %xmm7
 
-	/* ffi_prep_args have put all the register contents into the  */
-	/* stackLayout struct. Now put the register values in place.  */
-	movq	(%rsp), %rdi
-	movq	8(%rsp), %rsi
-	movq	16(%rsp), %rdx
-	movq	24(%rsp), %rcx
-	movq	32(%rsp), %r8
-	movq	40(%rsp), %r9
-	movaps	48(%rsp), %xmm0
-	movaps	64(%rsp), %xmm1
-	movaps	80(%rsp), %xmm2
-	movaps	96(%rsp), %xmm3
-	movaps	112(%rsp), %xmm4
-	movaps	128(%rsp), %xmm5
-	movaps	144(%rsp), %xmm6
-	movaps	160(%rsp), %xmm7
+	/* Deallocate the reg arg area.  */
+	leaq	176(%r10), %rsp
 
-	/* Remove space for stackLayout so stack arguments are placed
-	   correctly for the call.  */
-.LCFI5:
-	addq	$176, %rsp
-.LCFI6:
 	/* Call the user function.  */
-	call	*-48(%rbp)
+	call	*%r11
 
-	/* Make stack space for the return_value struct.  */
-	subq	$64, %rsp
+	/* Deallocate stack arg area; local stack frame in redzone.  */
+	leaq	24(%rbp), %rsp
 
-	/* Fill in all potential return values to this struct.  */
-	movq	%rax, (%rsp)
-	movq	%rdx, 8(%rsp)
-	movaps	%xmm0, 16(%rsp)
-	movaps	%xmm1, 32(%rsp)
-	fstpt	48(%rsp)
+	movq	0(%rbp), %rcx		/* Reload flags.  */
+	movq	8(%rbp), %rdi		/* Reload raddr.  */
+	movq	16(%rbp), %rbp		/* Reload old frame pointer.  */
+.LUW2:
 
-	/* Now call ffi_fill_return_value.  */
-	movq	%rsp, %rdi		/* struct return_value	  */
-	movq	-24(%rbp), %rsi		/* ecif			  */
-	movq	-16(%rbp), %rax		/* &ffi_fill_return_value */
-	call	*%rax			/* call it		  */
+	/* The first byte of the flags contains the FFI_TYPE.  */
+	movzbl	%cl, %r10d
+	leaq	.Lstore_table(%rip), %r11
+	movslq	(%r11, %r10, 4), %r10
+	addq	%r11, %r10
+	jmp	*%r10
 
-	/* And the work is done.  */
-        leave
-        ret
-.LFE1:
-.ffi_call_UNIX64_end:
-        .size    ffi_call_UNIX64,.ffi_call_UNIX64_end-ffi_call_UNIX64
+	.section .rodata
+.Lstore_table:
+	.long	.Lst_void-.Lstore_table		/* FFI_TYPE_VOID */
+	.long	.Lst_sint32-.Lstore_table	/* FFI_TYPE_INT */
+	.long	.Lst_float-.Lstore_table	/* FFI_TYPE_FLOAT */
+	.long	.Lst_double-.Lstore_table	/* FFI_TYPE_DOUBLE */
+	.long	.Lst_ldouble-.Lstore_table	/* FFI_TYPE_LONGDOUBLE */
+	.long	.Lst_uint8-.Lstore_table	/* FFI_TYPE_UINT8 */
+	.long	.Lst_sint8-.Lstore_table	/* FFI_TYPE_SINT8 */
+	.long	.Lst_uint16-.Lstore_table	/* FFI_TYPE_UINT16 */
+	.long	.Lst_sint16-.Lstore_table	/* FFI_TYPE_SINT16 */
+	.long	.Lst_uint32-.Lstore_table	/* FFI_TYPE_UINT32 */
+	.long	.Lst_sint32-.Lstore_table	/* FFI_TYPE_SINT32 */
+	.long	.Lst_int64-.Lstore_table	/* FFI_TYPE_UINT64 */
+	.long	.Lst_int64-.Lstore_table	/* FFI_TYPE_SINT64 */
+	.long	.Lst_struct-.Lstore_table	/* FFI_TYPE_STRUCT */
+	.long	.Lst_int64-.Lstore_table	/* FFI_TYPE_POINTER */
 
-.text
-	.align	2
-.globl float2sse
-        .type	float2sse,@function
-float2sse:
-	/* Save the contents of this sse-float in a pointer.  */
-	movaps	%xmm0, (%rdi)
+	.text
+	.align 2
+.Lst_void:
+	ret
+	.align 2
+
+.Lst_uint8:
+	movzbq	%al, %rax
+	movq	%rax, (%rdi)
+	ret
+	.align 2
+.Lst_sint8:
+	movsbq	%al, %rax
+	movq	%rax, (%rdi)
+	ret
+	.align 2
+.Lst_uint16:
+	movzwq	%ax, %rax		/* Zero-extend the 16-bit result.  */
+	movq	%rax, (%rdi)		/* Store it through raddr.  */
+	ret				/* Do not fall into .Lst_sint16.  */
+.Lst_sint16:
+	movswq	%ax, %rax
+	movq	%rax, (%rdi)
+	ret
+	.align 2
+.Lst_uint32:
+	movl	%eax, %eax		/* Zero-extend the 32-bit result.  */
+	movq	%rax, (%rdi)		/* Store it through raddr.  */
+	ret				/* Do not fall into .Lst_sint32.  */
+.Lst_sint32:
+	cltq
+	movq	%rax, (%rdi)
+	ret
+	.align 2
+.Lst_int64:
+	movq	%rax, (%rdi)
 	ret
 
-	.align	2
-.globl floatfloat2sse
-        .type	floatfloat2sse,@function
-floatfloat2sse:
-	/* Save the contents of these two sse-floats in a pointer.  */
-	movq	(%rdi), %xmm0
-	movaps	%xmm0, (%rsi)
+	.align 2
+.Lst_float:
+	movss	%xmm0, (%rdi)
+	ret
+	.align 2
+.Lst_double:
+	movsd	%xmm0, (%rdi)
+	ret
+.Lst_ldouble:
+	fstpt	(%rdi)
 	ret
 
-	.align	2
-.globl double2sse
-        .type	double2sse,@function
-double2sse:
-	/* Save the contents of this sse-double in a pointer.  */
-	movaps	%xmm0, (%rdi)
+	.align 2
+.Lst_struct:
+	leaq	-20(%rsp), %rsi		/* Scratch area in redzone.  */
+
+	/* We have to locate the values now, and since we don't want to
+	   write too much data into the user's return value, we spill the
+	   value to a 16 byte scratch area first.  Bits 8, 9, and 10
+	   control where the values are located.  Only one of the three
+	   bits will be set; see ffi_prep_cif_machdep for the pattern.  */
+	movd	%xmm0, %r10		/* Low 64 bits of %xmm0.  */
+	movd	%xmm1, %r11		/* Low 64 bits of %xmm1.  */
+	testl	$0x100, %ecx		/* Bit 8: words are %xmm0, %rax.  */
+	cmovnz	%rax, %rdx
+	cmovnz	%r10, %rax
+	testl	$0x200, %ecx		/* Bit 9: words are %rax, %xmm0.  */
+	cmovnz	%r10, %rdx
+	testl	$0x400, %ecx		/* Bit 10: words are %xmm0, %xmm1.  */
+	cmovnz	%r10, %rax
+	cmovnz	%r11, %rdx
+	movq	%rax, (%rsi)		/* First word of the value.  */
+	movq	%rdx, 8(%rsi)		/* Second word of the value.  */
+
+	/* Bits 11-31 contain the true size of the structure.  Copy from
+	   the scratch area to the true destination.  */
+	shrl	$11, %ecx		/* %ecx = byte count for the copy.  */
+	rep movsb			/* Copy %ecx bytes, (%rsi) to (%rdi).  */
 	ret
+.LUW3:
+	.size    ffi_call_unix64,.-ffi_call_unix64
 
 	.align	2
-.globl sse2float
-        .type	sse2float,@function
-sse2float:
-	/* Save the contents of this sse-float in a pointer.  */
-	movaps	(%rdi), %xmm0
-	ret
+	.globl ffi_closure_unix64
+        .type	ffi_closure_unix64,@function
 
-	.align	2
-.globl sse2double
-        .type	sse2double,@function
-sse2double:
-	/* Save the contents of this pointer in a sse-double.  */
-	movaps	(%rdi), %xmm0
-	ret
+ffi_closure_unix64:
+.LUW4:
+	subq	$200, %rsp
+.LUW5:
 
-	.align	2
-.globl sse2floatfloat
-        .type	sse2floatfloat,@function
-sse2floatfloat:
-	/* Save the contents of this pointer in two sse-floats.  */
-	movaps	(%rdi), %xmm0
-	movq	%xmm0, (%rsi)
-	ret
+	movq	%rdi, (%rsp)
+        movq    %rsi, 8(%rsp)
+        movq    %rdx, 16(%rsp)
+        movq    %rcx, 24(%rsp)
+        movq    %r8, 32(%rsp)
+        movq    %r9, 40(%rsp)
+	movdqa	%xmm0, 48(%rsp)
+	movdqa	%xmm1, 64(%rsp)
+	movdqa	%xmm2, 80(%rsp)
+	movdqa	%xmm3, 96(%rsp)
+	movdqa	%xmm4, 112(%rsp)
+	movdqa	%xmm5, 128(%rsp)
+	movdqa	%xmm6, 144(%rsp)
+	movdqa	%xmm7, 160(%rsp)
 
-	.align	2
-.globl ffi_closure_UNIX64
-        .type	ffi_closure_UNIX64,@function
-
-ffi_closure_UNIX64:
-.LFB2:
-        pushq   %rbp
-.LCFI10:
-        movq    %rsp, %rbp
-.LCFI11:
-        subq    $240, %rsp
-.LCFI12:
-	movq	%rdi, -176(%rbp)
-        movq    %rsi, -168(%rbp)
-        movq    %rdx, -160(%rbp)
-        movq    %rcx, -152(%rbp)
-        movq    %r8, -144(%rbp)
-        movq    %r9, -136(%rbp)
-        /* FIXME: We can avoid all this stashing of XMM registers by
-	   (in ffi_prep_closure) computing the number of
-	   floating-point args and moving it into %rax before calling
-	   this function.  Once this is done, uncomment the next few
-	   lines and only the essential XMM registers will be written
-	   to memory.  This is a significant saving.  */
-/*         movzbl  %al, %eax  */
-/*         movq    %rax, %rdx */
-/*         leaq    0(,%rdx,4), %rax */
-/*         leaq    2f(%rip), %rdx */
-/*         subq    %rax, %rdx */
-        leaq    -1(%rbp), %rax
-/*         jmp     *%rdx */
-        movaps  %xmm7, -15(%rax)
-        movaps  %xmm6, -31(%rax)
-        movaps  %xmm5, -47(%rax)
-        movaps  %xmm4, -63(%rax)
-        movaps  %xmm3, -79(%rax)
-        movaps  %xmm2, -95(%rax)
-        movaps  %xmm1, -111(%rax)
-        movaps  %xmm0, -127(%rax)
-2:
-        movl    %edi, -180(%rbp)
-        movl    $0, -224(%rbp)
-        movl    $48, -220(%rbp)
-        leaq    16(%rbp), %rax
-        movq    %rax, -216(%rbp)
-        leaq    -176(%rbp), %rdx
-        movq    %rdx, -208(%rbp)
-        leaq    -224(%rbp), %rsi
 	movq	%r10, %rdi
+	leaq	176(%rsp), %rsi
 	movq	%rsp, %rdx
-        call    ffi_closure_UNIX64_inner@PLT
+	leaq	208(%rsp), %rcx
+	call	ffi_closure_unix64_inner@PLT
 
-	cmpl	$FFI_TYPE_FLOAT, %eax
-	je	1f
-	cmpl	$FFI_TYPE_DOUBLE, %eax
-	je	2f
-	cmpl	$FFI_TYPE_LONGDOUBLE, %eax
-	je	3f
-	cmpl	$FFI_TYPE_STRUCT, %eax
-	je	4f
-	popq	%rax
-        leave
-        ret
-1:
-2:
-3:	
-	movaps	-240(%rbp), %xmm0
-        leave
-        ret
-4:
-	leave
+	/* Deallocate stack frame early; return value is now in redzone.  */
+	addq	$200, %rsp
+.LUW6:
+
+	/* The first byte of the return value contains the FFI_TYPE.  */
+	movzbl	%al, %r10d
+	leaq	.Lload_table(%rip), %r11
+	movslq	(%r11, %r10, 4), %r10
+	addq	%r11, %r10
+	jmp	*%r10
+
+	.section .rodata
+.Lload_table:
+	.long	.Lld_void-.Lload_table		/* FFI_TYPE_VOID */
+	.long	.Lld_int32-.Lload_table		/* FFI_TYPE_INT */
+	.long	.Lld_float-.Lload_table		/* FFI_TYPE_FLOAT */
+	.long	.Lld_double-.Lload_table	/* FFI_TYPE_DOUBLE */
+	.long	.Lld_ldouble-.Lload_table	/* FFI_TYPE_LONGDOUBLE */
+	.long	.Lld_int8-.Lload_table		/* FFI_TYPE_UINT8 */
+	.long	.Lld_int8-.Lload_table		/* FFI_TYPE_SINT8 */
+	.long	.Lld_int16-.Lload_table		/* FFI_TYPE_UINT16 */
+	.long	.Lld_int16-.Lload_table		/* FFI_TYPE_SINT16 */
+	.long	.Lld_int32-.Lload_table		/* FFI_TYPE_UINT32 */
+	.long	.Lld_int32-.Lload_table		/* FFI_TYPE_SINT32 */
+	.long	.Lld_int64-.Lload_table		/* FFI_TYPE_UINT64 */
+	.long	.Lld_int64-.Lload_table		/* FFI_TYPE_SINT64 */
+	.long	.Lld_struct-.Lload_table	/* FFI_TYPE_STRUCT */
+	.long	.Lld_int64-.Lload_table		/* FFI_TYPE_POINTER */
+
+	.text
+	.align 2
+.Lld_void:
 	ret
-.LFE2:	
-		
-        .section        .eh_frame,EH_FRAME_FLAGS,@progbits
-.Lframe0:
-        .long   .LECIE1-.LSCIE1
+
+	.align 2
+.Lld_int8:
+	movzbl	-24(%rsp), %eax
+	ret
+	.align 2
+.Lld_int16:
+	movzwl	-24(%rsp), %eax
+	ret
+	.align 2
+.Lld_int32:
+	movl	-24(%rsp), %eax
+	ret
+	.align 2
+.Lld_int64:
+	movq	-24(%rsp), %rax
+	ret
+
+	.align 2
+.Lld_float:
+	movss	-24(%rsp), %xmm0
+	ret
+	.align 2
+.Lld_double:
+	movsd	-24(%rsp), %xmm0
+	ret
+	.align 2
+.Lld_ldouble:
+	fldt	-24(%rsp)
+	ret
+
+	.align 2
+.Lld_struct:
+	/* There are four possibilities here, %rax/%rdx, %xmm0/%rax,
+	   %rax/%xmm0, %xmm0/%xmm1.  We collapse two by always loading
+	   both rdx and xmm1 with the second word.  For the remaining,
+	   bit 8 set means xmm0 gets the second word, and bit 9 means
+	   that rax gets the second word.  */
+	movq	-24(%rsp), %rcx		/* First word of the return value.  */
+	movq	-16(%rsp), %rdx		/* Second word, for %rax/%rdx.  */
+	movq	-16(%rsp), %xmm1	/* Second word, for %xmm0/%xmm1.  */
+	testl	$0x100, %eax
+	cmovnz	%rdx, %rcx		/* Bit 8: %xmm0 takes the second word.  */
+	movd	%rcx, %xmm0
+	testl	$0x200, %eax
+	movq	-24(%rsp), %rax		/* mov keeps the flags set by testl.  */
+	cmovnz	%rdx, %rax		/* Bit 9: %rax takes the second word.  */
+	ret
+.LUW7:
+	.size	ffi_closure_unix64,.-ffi_closure_unix64
+
+	.section	.eh_frame,"a",@progbits
+.Lframe1:
+	.long	.LECIE1-.LSCIE1		/* CIE Length */
 .LSCIE1:
-        .long   0x0
-        .byte   0x1
-        .string "zR"
-        .uleb128 0x1
-        .sleb128 -8
-        .byte   0x10
-        .uleb128 0x1
-        .byte   0x1b
-        .byte   0xc
-        .uleb128 0x7
-        .uleb128 0x8
-        .byte   0x90
-        .uleb128 0x1
-        .align 8
+	.long	0			/* CIE Identifier Tag */
+	.byte	1			/* CIE Version */
+	.ascii "zR\0"			/* CIE Augmentation */
+	.uleb128 1			/* CIE Code Alignment Factor */
+	.sleb128 -8			/* CIE Data Alignment Factor */
+	.byte	0x10			/* CIE RA Column */
+	.uleb128 1			/* Augmentation size */
+	.byte	0x1b			/* FDE Encoding (pcrel sdata4) */
+	.byte	0xc			/* DW_CFA_def_cfa, %rsp offset 8 */
+	.uleb128 7
+	.uleb128 8
+	.byte	0x80+16			/* DW_CFA_offset, %rip offset 1*-8 */
+	.uleb128 1
+	.align 8
 .LECIE1:
 .LSFDE1:
-	.long	.LEFDE1-.LASFDE1
+	.long	.LEFDE1-.LASFDE1	/* FDE Length */
 .LASFDE1:
-        .long   .LASFDE1-.Lframe0
+	.long	.LASFDE1-.Lframe1	/* FDE CIE offset */
+	.long	.LUW0-.			/* FDE initial location */
+	.long	.LUW3-.LUW0		/* FDE address range */
+	.uleb128 0x0			/* Augmentation size */
 
-        .long   .LFB1-.
-        .long   .LFE1-.LFB1
-        .uleb128 0x0
-        .byte   0x4		# DW_CFA_advance_loc4
-        .long   .LCFI0-.LFB1
-        .byte   0xe		# DW_CFA_def_cfa_offset
-        .uleb128 0x10
-        .byte   0x86		# DW_CFA_offset: r6 at cfa-16
-        .uleb128 0x2
-        .byte   0x4		# DW_CFA_advance_loc4
-        .long   .LCFI1-.LCFI0
-        .byte   0x86		# DW_CFA_offset: r6 at cfa-16
-        .uleb128 0x2
-        .byte   0xd		# DW_CFA_def_cfa_reg: r6
-        .uleb128 0x6
+	.byte	0x4			/* DW_CFA_advance_loc4 */
+	.long	.LUW1-.LUW0
+
+        /* New stack frame based off rbp.  This is an itty bit of unwind
+           trickery in that the CFA *has* changed.  There is no easy way
+           to describe it correctly on entry to the function.  Fortunately,
+           it doesn't matter too much since at all points we can correctly
+           unwind back to ffi_call.  Note that the location to which we
+           moved the return address is (the new) CFA-8, so from the
+           perspective of the unwind info, it hasn't moved.  */
+	.byte	0xc			/* DW_CFA_def_cfa, %rbp offset 32 */
+	.uleb128 6
+	.uleb128 32
+	.byte	0x80+6			/* DW_CFA_offset, %rbp offset 2*-8 */
+	.uleb128 2
+
+	.byte	0x4			/* DW_CFA_advance_loc4 */
+	.long	.LUW2-.LUW1		/* Delta from the previous row.  */
+	.byte	0xc			/* DW_CFA_def_cfa, %rsp offset 8 */
+	.uleb128 7
+	.uleb128 8
+	.byte	0xc0+6			/* DW_CFA_restore, %rbp */
 	.align 8
 .LEFDE1:
 .LSFDE3:
-        .long   .LEFDE3-.LASFDE3        # FDE Length
+	.long	.LEFDE3-.LASFDE3	/* FDE Length */
 .LASFDE3:
-        .long   .LASFDE3-.Lframe0       # FDE CIE offset
-
-        .long   .LFB2-. # FDE initial location
-        .long   .LFE2-.LFB2     # FDE address range
-        .uleb128 0x0    # Augmentation size
-        .byte   0x4     # DW_CFA_advance_loc4
-        .long   .LCFI10-.LFB2
-        .byte   0xe     # DW_CFA_def_cfa_offset
-        .uleb128 0x10
-        .byte   0x86    # DW_CFA_offset, column 0x6
-        .uleb128 0x2
-        .byte   0x4     # DW_CFA_advance_loc4
-        .long   .LCFI11-.LCFI10
-        .byte   0xd     # DW_CFA_def_cfa_register
-        .uleb128 0x6
-        .align 8
+	.long	.LASFDE3-.Lframe1	/* FDE CIE offset */
+	.long	.LUW4-.			/* FDE initial location */
+	.long	.LUW7-.LUW4		/* FDE address range */
+	.uleb128 0x0			/* Augmentation size */
+	.byte	0x4			/* DW_CFA_advance_loc4 */
+	.long	.LUW5-.LUW4
+	.byte	0xe			/* DW_CFA_def_cfa_offset */
+	.uleb128 208
+	.byte	0x4			/* DW_CFA_advance_loc4 */
+	.long	.LUW6-.LUW5
+	.byte	0xe			/* DW_CFA_def_cfa_offset */
+	.uleb128 8
+	.align 8
 .LEFDE3:
 
-#endif /* __x86_64__  */
+#endif /* __x86_64__ */