ffi64.c (struct register_args): Rename from stackLayout.

* src/x86/ffi64.c (struct register_args): Rename from stackLayout.
        (enum x86_64_reg_class): Add X86_64_COMPLEX_X87_CLASS.
        (merge_classes): Check for it.
        (SSE_CLASS_P): New.
        (classify_argument): Pass byte_offset by value; perform all updates
        inside struct case.
        (examine_argument): Add classes argument; handle
        X86_64_COMPLEX_X87_CLASS.
        (ffi_prep_args): Merge into ...
        (ffi_call): ... here.  Share stack frame with ffi_call_unix64.
        (ffi_prep_cif_machdep): Setup cif->flags for proper structure return.
        (ffi_fill_return_value): Remove.
        (ffi_prep_closure): Remove dead assert.
        (ffi_closure_unix64_inner): Rename from ffi_closure_UNIX64_inner.
        Rewrite to use struct register_args instead of va_list.  Create
        flags for handling structure returns.
        * src/x86/unix64.S: Remove dead strings.
        (ffi_call_unix64): Rename from ffi_call_UNIX64.  Rewrite to share
        stack frame with ffi_call.  Handle structure returns properly.
        (float2sse, floatfloat2sse, double2sse): Remove.
        (sse2float, sse2double, sse2floatfloat): Remove.
        (ffi_closure_unix64): Rename from ffi_closure_UNIX64.  Rewrite
        to handle structure returns properly.

From-SVN: r92602
Author: Richard Henderson <rth@redhat.com>
Date:   2004-12-25 01:54:40 -08:00
commit 1a0f488c32 (parent fa54a7a743)
3 changed files with 592 additions and 640 deletions
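
For orientation, here is a minimal sketch (not part of the commit; the helper
name is hypothetical) of how the rewritten ffi_prep_cif_machdep below encodes
structure returns in cif->flags: bits 0-7 hold the FFI_TYPE_* code, exactly
one of bits 8-10 marks where the two eightbytes of the result land, and bits
11 and up record the true size.

#include <ffi.h>	/* for FFI_TYPE_STRUCT */

static unsigned
encode_struct_return_flags (_Bool sse0, _Bool sse1, unsigned size)
{
unsigned flags = FFI_TYPE_STRUCT;
if (sse0 && !sse1)
flags |= 1 << 8;	/* first word in SSE reg, second in integer reg */
else if (!sse0 && sse1)
flags |= 1 << 9;	/* first word in integer reg, second in SSE reg */
else if (sse0 && sse1)
flags |= 1 << 10;	/* both words in SSE registers */
return flags | size << 11;	/* bits 11+: true structure size */
}

E.g. struct { double a, b; } has two SSE-class eightbytes, so this yields
FFI_TYPE_STRUCT | 1 << 10 | 16 << 11.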

ChangeLog

@@ -1,3 +1,29 @@
2004-12-25 Richard Henderson <rth@redhat.com>
* src/x86/ffi64.c (struct register_args): Rename from stackLayout.
(enum x86_64_reg_class): Add X86_64_COMPLEX_X87_CLASS.
(merge_classes): Check for it.
(SSE_CLASS_P): New.
(classify_argument): Pass byte_offset by value; perform all updates
inside struct case.
(examine_argument): Add classes argument; handle
X86_64_COMPLEX_X87_CLASS.
(ffi_prep_args): Merge into ...
(ffi_call): ... here. Share stack frame with ffi_call_unix64.
(ffi_prep_cif_machdep): Setup cif->flags for proper structure return.
(ffi_fill_return_value): Remove.
(ffi_prep_closure): Remove dead assert.
(ffi_closure_unix64_inner): Rename from ffi_closure_UNIX64_inner.
Rewrite to use struct register_args instead of va_list. Create
flags for handling structure returns.
* src/x86/unix64.S: Remove dead strings.
(ffi_call_unix64): Rename from ffi_call_UNIX64. Rewrite to share
stack frame with ffi_call. Handle structure returns properly.
(float2sse, floatfloat2sse, double2sse): Remove.
(sse2float, sse2double, sse2floatfloat): Remove.
(ffi_closure_unix64): Rename from ffi_closure_UNIX64. Rewrite
to handle structure returns properly.
2004-12-08 David Edelsohn <edelsohn@gnu.org>
* Makefile.am (AM_MAKEFLAGS): Remove duplicate LIBCFLAGS and

src/x86/ffi64.c

@@ -29,22 +29,20 @@
#include <stdlib.h>
#include <stdarg.h>
/* ffi_prep_args is called by the assembly routine once stack space
has been allocated for the function's arguments */
#ifdef __x86_64__
#define MAX_GPR_REGS 6
#define MAX_SSE_REGS 8
typedef struct
struct register_args
{
/* Registers for argument passing. */
long gpr[MAX_GPR_REGS];
UINT64 gpr[MAX_GPR_REGS];
__int128_t sse[MAX_SSE_REGS];
};
/* Stack space for arguments. */
char argspace[0];
} stackLayout;
extern void ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
void *raddr, void (*fnaddr)());
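/* (Sketch, not part of the commit.) At the ffi_call_unix64 entry in the
assembly below, the standard calling convention places args in %rdi,
bytes in %rsi, flags in %rdx, raddr in %rcx, and fnaddr in %r8. */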
/* All reference to register classes here is identical to the code in
gcc/config/i386/i386.c. Do *not* change one without the other. */
@@ -55,8 +53,7 @@ typedef struct
use SF or DFmode move instead of DImode to avoid reformatting penalties.
Similarly we play games with INTEGERSI_CLASS to use cheaper SImode moves
whenever possible (upper half does contain padding).
*/
whenever possible (upper half does contain padding). */
enum x86_64_reg_class
{
X86_64_NO_CLASS,
@@ -68,11 +65,14 @@ enum x86_64_reg_class
X86_64_SSEUP_CLASS,
X86_64_X87_CLASS,
X86_64_X87UP_CLASS,
X86_64_COMPLEX_X87_CLASS,
X86_64_MEMORY_CLASS
};
#define MAX_CLASSES 4
#define SSE_CLASS_P(X) ((X) >= X86_64_SSE_CLASS && (X) <= X86_64_SSEUP_CLASS)
/* x86-64 register passing implementation. See x86-64 ABI for details. Goal
of this code is to classify each 8bytes of incoming argument by the register
class and assign registers accordingly. */
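/* Example (hypothetical type): struct { float x; float y; double z; }
spans two eightbytes; the two floats merge into one X86_64_SSE_CLASS
eightbyte and the double classifies as X86_64_SSEDF_CLASS, so the whole
struct travels in two SSE registers. */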
@@ -106,9 +106,14 @@ merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
|| class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS)
return X86_64_INTEGER_CLASS;
/* Rule #5: If one of the classes is X87 or X87UP class, MEMORY is used. */
if (class1 == X86_64_X87_CLASS || class1 == X86_64_X87UP_CLASS
|| class2 == X86_64_X87_CLASS || class2 == X86_64_X87UP_CLASS)
/* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class,
MEMORY is used. */
if (class1 == X86_64_X87_CLASS
|| class1 == X86_64_X87UP_CLASS
|| class1 == X86_64_COMPLEX_X87_CLASS
|| class2 == X86_64_X87_CLASS
|| class2 == X86_64_X87UP_CLASS
|| class2 == X86_64_COMPLEX_X87_CLASS)
return X86_64_MEMORY_CLASS;
/* Rule #6: Otherwise class SSE is used. */
@@ -125,11 +130,8 @@ merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2)
*/
static int
classify_argument (ffi_type *type, enum x86_64_reg_class classes[],
int *byte_offset)
size_t byte_offset)
{
/* First, align to the right place. */
*byte_offset = ALIGN(*byte_offset, type->alignment);
switch (type->type)
{
case FFI_TYPE_UINT8:
@@ -141,13 +143,13 @@ classify_argument (ffi_type *type, enum x86_64_reg_class classes[],
case FFI_TYPE_UINT64:
case FFI_TYPE_SINT64:
case FFI_TYPE_POINTER:
if (((*byte_offset) % 8 + type->size) <= 4)
if (byte_offset + type->size <= 4)
classes[0] = X86_64_INTEGERSI_CLASS;
else
classes[0] = X86_64_INTEGER_CLASS;
return 1;
case FFI_TYPE_FLOAT:
if (((*byte_offset) % 8) == 0)
if (byte_offset == 0)
classes[0] = X86_64_SSESF_CLASS;
else
classes[0] = X86_64_SSE_CLASS;
@@ -175,22 +177,23 @@ classify_argument (ffi_type *type, enum x86_64_reg_class classes[],
classes[i] = X86_64_NO_CLASS;
/* Merge the fields of structure. */
for (ptr=type->elements; (*ptr)!=NULL; ptr++)
for (ptr = type->elements; *ptr != NULL; ptr++)
{
int num;
num = classify_argument (*ptr, subclasses, byte_offset);
byte_offset = ALIGN (byte_offset, (*ptr)->alignment);
num = classify_argument (*ptr, subclasses, byte_offset % 8);
if (num == 0)
return 0;
for (i = 0; i < num; i++)
{
int pos = *byte_offset / 8;
int pos = byte_offset / 8;
classes[i + pos] =
merge_classes (subclasses[i], classes[i + pos]);
}
if ((*ptr)->type != FFI_TYPE_STRUCT)
*byte_offset += (*ptr)->size;
byte_offset += (*ptr)->size;
}
/* Final merger cleanup. */
@@ -222,155 +225,196 @@ classify_argument (ffi_type *type, enum x86_64_reg_class classes[],
}
/* Examine the argument and return set number of register required in each
class. Return 0 iff parameter should be passed in memory. */
class. Return zero iff parameter should be passed in memory, otherwise
the number of registers. */
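/* Example (hypothetical type): struct { double a, b; } yields n = 2 with
ngpr = 0 and nsse = 2, while a struct containing a long double member
merges to X86_64_MEMORY_CLASS and yields 0. */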
static int
examine_argument (ffi_type *type, int in_return, int *int_nregs,int *sse_nregs)
examine_argument (ffi_type *type, enum x86_64_reg_class classes[MAX_CLASSES],
_Bool in_return, int *pngpr, int *pnsse)
{
enum x86_64_reg_class class[MAX_CLASSES];
int offset = 0;
int n;
n = classify_argument (type, class, &offset);
int i, n, ngpr, nsse;
n = classify_argument (type, classes, 0);
if (n == 0)
return 0;
*int_nregs = 0;
*sse_nregs = 0;
for (n--; n>=0; n--)
switch (class[n])
ngpr = nsse = 0;
for (i = 0; i < n; ++i)
switch (classes[i])
{
case X86_64_INTEGER_CLASS:
case X86_64_INTEGERSI_CLASS:
(*int_nregs)++;
ngpr++;
break;
case X86_64_SSE_CLASS:
case X86_64_SSESF_CLASS:
case X86_64_SSEDF_CLASS:
(*sse_nregs)++;
nsse++;
break;
case X86_64_NO_CLASS:
case X86_64_SSEUP_CLASS:
break;
case X86_64_X87_CLASS:
case X86_64_X87UP_CLASS:
if (!in_return)
return 0;
break;
case X86_64_COMPLEX_X87_CLASS:
return in_return != 0;
default:
abort ();
}
return 1;
*pngpr = ngpr;
*pnsse = nsse;
return n;
}
/* Functions to load floats and double to an SSE register placeholder. */
extern void float2sse (float, __int128_t *);
extern void double2sse (double, __int128_t *);
extern void floatfloat2sse (void *, __int128_t *);
/* Perform machine dependent cif processing. */
/* Functions to put the floats and doubles back. */
extern float sse2float (__int128_t *);
extern double sse2double (__int128_t *);
extern void sse2floatfloat(__int128_t *, void *);
/*@-exportheader@*/
void
ffi_prep_args (stackLayout *stack, extended_cif *ecif)
/*@=exportheader@*/
ffi_status
ffi_prep_cif_machdep (ffi_cif *cif)
{
int gprcount, ssecount, i, g, s;
void **p_argv;
void *argp = &stack->argspace;
ffi_type **p_arg;
int gprcount, ssecount, i, avn, n, ngpr, nsse, flags;
enum x86_64_reg_class classes[MAX_CLASSES];
size_t bytes;
/* First check if the return value should be passed in memory. If so,
pass the pointer as the first argument. */
gprcount = ssecount = 0;
if (ecif->cif->rtype->type != FFI_TYPE_VOID
&& examine_argument (ecif->cif->rtype, 1, &g, &s) == 0)
stack->gpr[gprcount++] = (long) ecif->rvalue;
for (i=ecif->cif->nargs, p_arg=ecif->cif->arg_types, p_argv = ecif->avalue;
i!=0; i--, p_arg++, p_argv++)
flags = cif->rtype->type;
if (flags != FFI_TYPE_VOID)
{
int in_register = 0;
switch ((*p_arg)->type)
n = examine_argument (cif->rtype, classes, 1, &ngpr, &nsse);
if (n == 0)
{
case FFI_TYPE_SINT8:
case FFI_TYPE_SINT16:
case FFI_TYPE_SINT32:
case FFI_TYPE_SINT64:
case FFI_TYPE_UINT8:
case FFI_TYPE_UINT16:
case FFI_TYPE_UINT32:
case FFI_TYPE_UINT64:
case FFI_TYPE_POINTER:
if (gprcount < MAX_GPR_REGS)
{
stack->gpr[gprcount] = 0;
stack->gpr[gprcount++] = *(long long *)(*p_argv);
in_register = 1;
}
break;
case FFI_TYPE_FLOAT:
if (ssecount < MAX_SSE_REGS)
{
float2sse (*(float *)(*p_argv), &stack->sse[ssecount++]);
in_register = 1;
}
break;
case FFI_TYPE_DOUBLE:
if (ssecount < MAX_SSE_REGS)
{
double2sse (*(double *)(*p_argv), &stack->sse[ssecount++]);
in_register = 1;
}
break;
/* The return value is passed in memory. A pointer to that
memory is the first argument. Allocate a register for it. */
gprcount++;
/* We don't have to do anything in asm for the return. */
flags = FFI_TYPE_VOID;
}
if (in_register)
continue;
/* Either all places in registers were filled, or this is a
type that potentially goes into a memory slot. */
if (examine_argument (*p_arg, 0, &g, &s) == 0
|| gprcount + g > MAX_GPR_REGS || ssecount + s > MAX_SSE_REGS)
else if (flags == FFI_TYPE_STRUCT)
{
/* Pass this argument in memory. */
argp = (void *)ALIGN(argp, (*p_arg)->alignment);
/* Stack arguments are *always* at least 8 byte aligned. */
argp = (void *)ALIGN(argp, 8);
memcpy (argp, *p_argv, (*p_arg)->size);
argp += (*p_arg)->size;
/* Mark which registers the result appears in. */
_Bool sse0 = SSE_CLASS_P (classes[0]);
_Bool sse1 = n == 2 && SSE_CLASS_P (classes[1]);
if (sse0 && !sse1)
flags |= 1 << 8;
else if (!sse0 && sse1)
flags |= 1 << 9;
else if (sse0 && sse1)
flags |= 1 << 10;
/* Mark the true size of the structure. */
flags |= cif->rtype->size << 11;
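/* E.g. (hypothetical type) struct { double d; long i; } classifies as
{ SSEDF, INTEGER }, so sse0 && !sse1 sets bit 8 and the size is
recorded as 16 << 11. */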
}
}
cif->flags = flags;
/* Go over all arguments and determine the way they should be passed.
If it's in a register and there is space for it, let that be so. If
not, add its size to the stack byte count. */
for (bytes = 0, i = 0, avn = cif->nargs; i < avn; i++)
{
if (examine_argument (cif->arg_types[i], classes, 0, &ngpr, &nsse) == 0
|| gprcount + ngpr > MAX_GPR_REGS
|| ssecount + nsse > MAX_SSE_REGS)
{
long align = cif->arg_types[i]->alignment;
if (align < 8)
align = 8;
bytes = ALIGN(bytes, align);
bytes += cif->arg_types[i]->size;
}
else
{
/* All easy cases are eliminated. Now fire the big guns. */
gprcount += ngpr;
ssecount += nsse;
}
}
cif->bytes = bytes;
enum x86_64_reg_class classes[MAX_CLASSES];
int offset = 0, j, num;
void *a;
return FFI_OK;
}
num = classify_argument (*p_arg, classes, &offset);
for (j=0, a=*p_argv; j<num; j++, a+=8)
void
ffi_call (ffi_cif *cif, void (*fn)(), void *rvalue, void **avalue)
{
enum x86_64_reg_class classes[MAX_CLASSES];
char *stack, *argp;
ffi_type **arg_types;
int gprcount, ssecount, ngpr, nsse, i, avn;
_Bool ret_in_memory;
struct register_args *reg_args;
/* Can't call 32-bit mode from 64-bit mode. */
FFI_ASSERT (cif->abi == FFI_UNIX64);
/* If the return value is a struct and we don't have a return value
address then we need to make one. Note the setting of flags to
VOID above in ffi_prep_cif_machdep. */
ret_in_memory = (cif->rtype->type == FFI_TYPE_STRUCT
&& cif->flags == FFI_TYPE_VOID);
if (rvalue == NULL && ret_in_memory)
rvalue = alloca (cif->rtype->size);
/* Allocate the space for the arguments, plus 4 words of temp space. */
stack = alloca (sizeof (struct register_args) + cif->bytes + 4*8);
reg_args = (struct register_args *) stack;
argp = stack + sizeof (struct register_args);
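/* Resulting layout (sketch): stack[0..175] is the struct register_args
(6 GPRs plus 8 SSE registers), the next cif->bytes bytes hold the
in-memory arguments, and the final 4 words are scratch used by
ffi_call_unix64 for flags, raddr, the saved frame pointer, and the
relocated return address. */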
gprcount = ssecount = 0;
/* If the return value is passed in memory, add the pointer as the
first integer argument. */
if (ret_in_memory)
reg_args->gpr[gprcount++] = (long) rvalue;
avn = cif->nargs;
arg_types = cif->arg_types;
for (i = 0; i < avn; ++i)
{
size_t size = arg_types[i]->size;
int n;
n = examine_argument (arg_types[i], classes, 0, &ngpr, &nsse);
if (n == 0
|| gprcount + ngpr > MAX_GPR_REGS
|| ssecount + nsse > MAX_SSE_REGS)
{
long align = arg_types[i]->alignment;
/* Stack arguments are *always* at least 8 byte aligned. */
if (align < 8)
align = 8;
/* Pass this argument in memory. */
argp = (void *) ALIGN (argp, align);
memcpy (argp, avalue[i], size);
argp += size;
}
else
{
/* The argument is passed entirely in registers. */
char *a = (char *) avalue[i];
int j;
for (j = 0; j < n; j++, a += 8, size -= 8)
{
switch (classes[j])
{
case X86_64_INTEGER_CLASS:
case X86_64_INTEGERSI_CLASS:
stack->gpr[gprcount++] = *(long long *)a;
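/* Zero the slot first: the final eightbyte of a small structure may
be shorter than 8 bytes, so only the remaining SIZE bytes are copied. */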
reg_args->gpr[gprcount] = 0;
memcpy (&reg_args->gpr[gprcount], a, size < 8 ? size : 8);
gprcount++;
break;
case X86_64_SSE_CLASS:
floatfloat2sse (a, &stack->sse[ssecount++]);
case X86_64_SSEDF_CLASS:
reg_args->sse[ssecount++] = *(UINT64 *) a;
break;
case X86_64_SSESF_CLASS:
float2sse (*(float *)a, &stack->sse[ssecount++]);
break;
case X86_64_SSEDF_CLASS:
double2sse (*(double *)a, &stack->sse[ssecount++]);
reg_args->sse[ssecount++] = *(UINT32 *) a;
break;
default:
abort();
@@ -378,203 +422,13 @@ ffi_prep_args (stackLayout *stack, extended_cif *ecif)
}
}
}
ffi_call_unix64 (stack, cif->bytes + sizeof (struct register_args),
cif->flags, rvalue, fn);
}
/* Perform machine dependent cif processing. */
ffi_status
ffi_prep_cif_machdep (ffi_cif *cif)
{
int gprcount, ssecount, i, g, s;
gprcount = ssecount = 0;
/* Reset the byte count. We handle this size estimation here. */
cif->bytes = 0;
/* If the return value should be passed in memory, pass the pointer
as the first argument. The actual memory isn't allocated here. */
if (cif->rtype->type != FFI_TYPE_VOID
&& examine_argument (cif->rtype, 1, &g, &s) == 0)
gprcount = 1;
/* Go over all arguments and determine the way they should be passed.
If it's in a register and there is space for it, let that be so. If
not, add its size to the stack byte count. */
for (i=0; i<cif->nargs; i++)
{
if (examine_argument (cif->arg_types[i], 0, &g, &s) == 0
|| gprcount + g > MAX_GPR_REGS || ssecount + s > MAX_SSE_REGS)
{
/* This is passed in memory. First align to the basic type. */
cif->bytes = ALIGN(cif->bytes, cif->arg_types[i]->alignment);
/* Stack arguments are *always* at least 8 byte aligned. */
cif->bytes = ALIGN(cif->bytes, 8);
/* Now add the size of this argument. */
cif->bytes += cif->arg_types[i]->size;
}
else
{
gprcount += g;
ssecount += s;
}
}
/* Set the flag for the closures return. */
switch (cif->rtype->type)
{
case FFI_TYPE_VOID:
case FFI_TYPE_STRUCT:
case FFI_TYPE_SINT64:
case FFI_TYPE_FLOAT:
case FFI_TYPE_DOUBLE:
case FFI_TYPE_LONGDOUBLE:
cif->flags = (unsigned) cif->rtype->type;
break;
case FFI_TYPE_UINT64:
cif->flags = FFI_TYPE_SINT64;
break;
default:
cif->flags = FFI_TYPE_INT;
break;
}
return FFI_OK;
}
typedef struct
{
long gpr[2];
__int128_t sse[2];
long double st0;
} return_value;
void
ffi_fill_return_value (return_value *rv, extended_cif *ecif)
{
enum x86_64_reg_class classes[MAX_CLASSES];
int i = 0, num;
long *gpr = rv->gpr;
__int128_t *sse = rv->sse;
signed char sc;
signed short ss;
/* This is needed because of the way x86-64 handles signed short
integers. */
switch (ecif->cif->rtype->type)
{
case FFI_TYPE_SINT8:
sc = *(signed char *)gpr;
*(long long *)ecif->rvalue = (long long)sc;
return;
case FFI_TYPE_SINT16:
ss = *(signed short *)gpr;
*(long long *)ecif->rvalue = (long long)ss;
return;
default:
/* Just continue. */
;
}
num = classify_argument (ecif->cif->rtype, classes, &i);
if (num == 0)
/* Return in memory. */
ecif->rvalue = (void *) rv->gpr[0];
else if (num == 2 && classes[0] == X86_64_X87_CLASS &&
classes[1] == X86_64_X87UP_CLASS)
/* This is a long double (this is easiest to handle this way instead
of an eightbyte at a time as in the loop below). */
*((long double *)ecif->rvalue) = rv->st0;
else
{
void *a;
for (i=0, a=ecif->rvalue; i<num; i++, a+=8)
{
switch (classes[i])
{
case X86_64_INTEGER_CLASS:
case X86_64_INTEGERSI_CLASS:
*(long long *)a = *gpr;
gpr++;
break;
case X86_64_SSE_CLASS:
sse2floatfloat (sse++, a);
break;
case X86_64_SSESF_CLASS:
*(float *)a = sse2float (sse++);
break;
case X86_64_SSEDF_CLASS:
*(double *)a = sse2double (sse++);
break;
default:
abort();
}
}
}
}
/*@-declundef@*/
/*@-exportheader@*/
extern void ffi_call_UNIX64(void (*)(stackLayout *, extended_cif *),
void (*) (return_value *, extended_cif *),
/*@out@*/ extended_cif *,
unsigned, /*@out@*/ unsigned *, void (*fn)());
/*@=declundef@*/
/*@=exportheader@*/
void ffi_call(/*@dependent@*/ ffi_cif *cif,
void (*fn)(),
/*@out@*/ void *rvalue,
/*@dependent@*/ void **avalue)
{
extended_cif ecif;
int dummy;
ecif.cif = cif;
ecif.avalue = avalue;
/* If the return value is a struct and we don't have a return */
/* value address then we need to make one */
if ((rvalue == NULL) &&
(examine_argument (cif->rtype, 1, &dummy, &dummy) == 0))
{
/*@-sysunrecog@*/
ecif.rvalue = alloca(cif->rtype->size);
/*@=sysunrecog@*/
}
else
ecif.rvalue = rvalue;
/* Stack must always be 16byte aligned. Make it so. */
cif->bytes = ALIGN(cif->bytes, 16);
switch (cif->abi)
{
case FFI_SYSV:
/* Calling 32bit code from 64bit is not possible */
FFI_ASSERT(0);
break;
case FFI_UNIX64:
/*@-usedef@*/
ffi_call_UNIX64 (ffi_prep_args, ffi_fill_return_value, &ecif,
cif->bytes, ecif.rvalue, fn);
/*@=usedef@*/
break;
default:
FFI_ASSERT(0);
break;
}
}
extern void ffi_closure_UNIX64(void);
extern void ffi_closure_unix64(void);
ffi_status
ffi_prep_closure (ffi_closure* closure,
@@ -584,14 +438,12 @@ ffi_prep_closure (ffi_closure* closure,
{
volatile unsigned short *tramp;
/* FFI_ASSERT (cif->abi == FFI_OSF); */
tramp = (volatile unsigned short *) &closure->tramp[0];
tramp[0] = 0xbb49; /* mov <code>, %r11 */
tramp[5] = 0xba49; /* mov <data>, %r10 */
tramp[10] = 0xff49; /* jmp *%r11 */
tramp[11] = 0x00e3;
*(void * volatile *) &tramp[1] = ffi_closure_UNIX64;
*(void * volatile *) &tramp[1] = ffi_closure_unix64;
*(void * volatile *) &tramp[6] = closure;
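/* Resulting trampoline layout (sketch): bytes 0-9 are movabs
$ffi_closure_unix64, %r11; bytes 10-19 are movabs $closure, %r10;
bytes 20-22 are jmp *%r11, with one byte of padding. */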
closure->cif = cif;
@@ -602,107 +454,109 @@ ffi_prep_closure (ffi_closure* closure,
}
int
ffi_closure_UNIX64_inner(ffi_closure *closure, va_list l, void *rp)
ffi_closure_unix64_inner(ffi_closure *closure, void *rvalue,
struct register_args *reg_args, char *argp)
{
ffi_cif *cif;
void **avalue;
ffi_type **arg_types;
long i, avn, argn;
long i, avn;
int gprcount, ssecount, ngpr, nsse;
int ret;
cif = closure->cif;
avalue = alloca(cif->nargs * sizeof(void *));
gprcount = ssecount = 0;
argn = 0;
ret = cif->rtype->type;
if (ret != FFI_TYPE_VOID)
{
enum x86_64_reg_class classes[MAX_CLASSES];
int n = examine_argument (cif->rtype, classes, 1, &ngpr, &nsse);
if (n == 0)
{
/* The return value goes in memory. Arrange for the closure
return value to go directly back to the original caller. */
rvalue = (void *) reg_args->gpr[gprcount++];
/* We don't have to do anything in asm for the return. */
ret = FFI_TYPE_VOID;
}
else if (ret == FFI_TYPE_STRUCT && n == 2)
{
/* Mark which register the second word of the structure goes in. */
_Bool sse0 = SSE_CLASS_P (classes[0]);
_Bool sse1 = SSE_CLASS_P (classes[1]);
if (!sse0 && sse1)
ret |= 1 << 8;
else if (sse0 && !sse1)
ret |= 1 << 9;
}
}
i = 0;
avn = cif->nargs;
arg_types = cif->arg_types;
/* Grab the addresses of the arguments from the stack frame. */
while (i < avn)
for (i = 0; i < avn; ++i)
{
switch (arg_types[i]->type)
enum x86_64_reg_class classes[MAX_CLASSES];
int n;
n = examine_argument (arg_types[i], classes, 0, &ngpr, &nsse);
if (n == 0
|| gprcount + ngpr > MAX_GPR_REGS
|| ssecount + nsse > MAX_SSE_REGS)
{
case FFI_TYPE_SINT8:
case FFI_TYPE_UINT8:
case FFI_TYPE_SINT16:
case FFI_TYPE_UINT16:
case FFI_TYPE_SINT32:
case FFI_TYPE_UINT32:
case FFI_TYPE_SINT64:
case FFI_TYPE_UINT64:
case FFI_TYPE_POINTER:
{
if (l->gp_offset > 48-8)
{
avalue[i] = l->overflow_arg_area;
l->overflow_arg_area = (char *)l->overflow_arg_area + 8;
}
else
{
avalue[i] = (char *)l->reg_save_area + l->gp_offset;
l->gp_offset += 8;
}
}
break;
long align = arg_types[i]->alignment;
case FFI_TYPE_STRUCT:
/* FIXME */
FFI_ASSERT(0);
break;
/* Stack arguments are *always* at least 8 byte aligned. */
if (align < 8)
align = 8;
case FFI_TYPE_DOUBLE:
{
if (l->fp_offset > 176-16)
{
avalue[i] = l->overflow_arg_area;
l->overflow_arg_area = (char *)l->overflow_arg_area + 8;
}
else
{
avalue[i] = (char *)l->reg_save_area + l->fp_offset;
l->fp_offset += 16;
}
}
#if DEBUG_FFI
fprintf (stderr, "double arg %d = %g\n", i, *(double *)avalue[i]);
#endif
break;
case FFI_TYPE_FLOAT:
{
if (l->fp_offset > 176-16)
{
avalue[i] = l->overflow_arg_area;
l->overflow_arg_area = (char *)l->overflow_arg_area + 8;
}
else
{
avalue[i] = (char *)l->reg_save_area + l->fp_offset;
l->fp_offset += 16;
}
}
#if DEBUG_FFI
fprintf (stderr, "float arg %d = %g\n", i, *(float *)avalue[i]);
#endif
break;
default:
FFI_ASSERT(0);
/* Pass this argument in memory. */
argp = (void *) ALIGN (argp, align);
avalue[i] = argp;
argp += arg_types[i]->size;
}
/* If the argument is in a single register, or two consecutive
registers, then we can use that address directly. */
else if (n == 1
|| (n == 2
&& SSE_CLASS_P (classes[0]) == SSE_CLASS_P (classes[1])))
{
/* The argument is in a single register. */
if (SSE_CLASS_P (classes[0]))
{
avalue[i] = &reg_args->sse[ssecount];
ssecount += n;
}
else
{
avalue[i] = &reg_args->gpr[gprcount];
gprcount += n;
}
}
/* Otherwise, allocate space to make them consecutive. */
else
{
char *a = alloca (16);
int j;
argn += ALIGN(arg_types[i]->size, FFI_SIZEOF_ARG) / FFI_SIZEOF_ARG;
i++;
avalue[i] = a;
for (j = 0; j < n; j++, a += 8)
{
if (SSE_CLASS_P (classes[j]))
memcpy (a, &reg_args->sse[ssecount++], 8);
else
memcpy (a, &reg_args->gpr[gprcount++], 8);
}
}
}
/* Invoke the closure. */
(closure->fun) (cif, rp, avalue, closure->user_data);
closure->fun (cif, rvalue, avalue, closure->user_data);
/* FIXME: Structs not supported. */
FFI_ASSERT(cif->rtype->type != FFI_TYPE_STRUCT);
/* Tell ffi_closure_UNIX64 how to perform return type promotions. */
return cif->rtype->type;
/* Tell assembly how to perform return type promotions. */
return ret;
}
#endif /* ifndef __x86_64__ */
#endif /* __x86_64__ */
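
As a usage illustration (not part of the commit; the struct type and values
are assumptions), here is a minimal caller that exercises the new
structure-return path end to end:

#include <ffi.h>

typedef struct { double d; int i; } pair;

static pair make_pair (int x) { pair p = { x * 1.5, x }; return p; }

int main (void)
{
ffi_cif cif;
ffi_type *elements[] = { &ffi_type_double, &ffi_type_sint, NULL };
ffi_type pair_type = { 0, 0, FFI_TYPE_STRUCT, elements };
ffi_type *arg_types[] = { &ffi_type_sint };
int x = 4;
void *values[] = { &x };
pair ret = { 0, 0 };

/* pair classifies as { SSEDF, INTEGERSI }, so it returns in %xmm0 and
%rax rather than in memory. */
if (ffi_prep_cif (&cif, FFI_DEFAULT_ABI, 1, &pair_type, arg_types) == FFI_OK)
ffi_call (&cif, (void (*)()) make_pair, &ret, values);
return ret.i == 4 ? 0 : 1;
}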

src/x86/unix64.S

@@ -28,276 +28,348 @@
#include <fficonfig.h>
#include <ffi.h>
.section .rodata
.LC0:
.string "asm in progress %lld\n"
.LC1:
.string "asm in progress\n"
.text
/* ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
void *raddr, void (*fnaddr)());
Bit o' trickiness here -- ARGS+BYTES is the base of the stack frame
for this function. This has been allocated by ffi_call. We also
deallocate some of the stack that has been alloca'd. */
.align 2
.globl ffi_call_UNIX64
.type ffi_call_UNIX64,@function
.globl ffi_call_unix64
.type ffi_call_unix64,@function
ffi_call_UNIX64:
.LFB1:
pushq %rbp
.LCFI0:
movq %rsp, %rbp
.LCFI1:
/* Save all arguments */
subq $48, %rsp
.LCFI2:
movq %rdi, -8(%rbp) /* ffi_prep_args */
movq %rsi, -16(%rbp) /* ffi_fill_return_value */
movq %rdx, -24(%rbp) /* ecif */
movq %rcx, -32(%rbp) /* cif->bytes */
movq %r8, -40(%rbp) /* ecif.rvalue */
movq %r9, -48(%rbp) /* fn */
ffi_call_unix64:
.LUW0:
movq (%rsp), %r10 /* Load return address. */
leaq (%rdi, %rsi), %rax /* Find local stack base. */
movq %rdx, (%rax) /* Save flags. */
movq %rcx, 8(%rax) /* Save raddr. */
movq %rbp, 16(%rax) /* Save old frame pointer. */
movq %r10, 24(%rax) /* Relocate return address. */
movq %rax, %rbp /* Finalize local stack frame. */
.LUW1:
movq %rdi, %r10 /* Save a copy of the register area. */
movq %r8, %r11 /* Save a copy of the target fn. */
/* Make room for all of the new args and the register args */
addl $176, %ecx
.LCFI3:
subq %rcx, %rsp
.LCFI4:
/* Setup the call to ffi_prep_args. */
movq %rdi, %rax /* &ffi_prep_args */
movq %rsp, %rdi /* stackLayout */
movq %rdx, %rsi /* ecif */
call *%rax /* ffi_prep_args(stackLayout, ecif);*/
/* Load up all argument registers. */
movq (%r10), %rdi
movq 8(%r10), %rsi
movq 16(%r10), %rdx
movq 24(%r10), %rcx
movq 32(%r10), %r8
movq 40(%r10), %r9
movdqa 48(%r10), %xmm0
movdqa 64(%r10), %xmm1
movdqa 80(%r10), %xmm2
movdqa 96(%r10), %xmm3
movdqa 112(%r10), %xmm4
movdqa 128(%r10), %xmm5
movdqa 144(%r10), %xmm6
movdqa 160(%r10), %xmm7
/* ffi_prep_args have put all the register contents into the */
/* stackLayout struct. Now put the register values in place. */
movq (%rsp), %rdi
movq 8(%rsp), %rsi
movq 16(%rsp), %rdx
movq 24(%rsp), %rcx
movq 32(%rsp), %r8
movq 40(%rsp), %r9
movaps 48(%rsp), %xmm0
movaps 64(%rsp), %xmm1
movaps 80(%rsp), %xmm2
movaps 96(%rsp), %xmm3
movaps 112(%rsp), %xmm4
movaps 128(%rsp), %xmm5
movaps 144(%rsp), %xmm6
movaps 160(%rsp), %xmm7
/* Deallocate the reg arg area. */
leaq 176(%r10), %rsp
/* Remove space for stackLayout so stack arguments are placed
correctly for the call. */
.LCFI5:
addq $176, %rsp
.LCFI6:
/* Call the user function. */
call *-48(%rbp)
call *%r11
/* Make stack space for the return_value struct. */
subq $64, %rsp
/* Deallocate stack arg area; local stack frame in redzone. */
leaq 24(%rbp), %rsp
/* Fill in all potential return values to this struct. */
movq %rax, (%rsp)
movq %rdx, 8(%rsp)
movaps %xmm0, 16(%rsp)
movaps %xmm1, 32(%rsp)
fstpt 48(%rsp)
movq 0(%rbp), %rcx /* Reload flags. */
movq 8(%rbp), %rdi /* Reload raddr. */
movq 16(%rbp), %rbp /* Reload old frame pointer. */
.LUW2:
/* Now call ffi_fill_return_value. */
movq %rsp, %rdi /* struct return_value */
movq -24(%rbp), %rsi /* ecif */
movq -16(%rbp), %rax /* &ffi_fill_return_value */
call *%rax /* call it */
/* The first byte of the flags contains the FFI_TYPE. */
movzbl %cl, %r10d
leaq .Lstore_table(%rip), %r11
movslq (%r11, %r10, 4), %r10
addq %r11, %r10
jmp *%r10
/* And the work is done. */
leave
ret
.LFE1:
.ffi_call_UNIX64_end:
.size ffi_call_UNIX64,.ffi_call_UNIX64_end-ffi_call_UNIX64
.section .rodata
.Lstore_table:
.long .Lst_void-.Lstore_table /* FFI_TYPE_VOID */
.long .Lst_sint32-.Lstore_table /* FFI_TYPE_INT */
.long .Lst_float-.Lstore_table /* FFI_TYPE_FLOAT */
.long .Lst_double-.Lstore_table /* FFI_TYPE_DOUBLE */
.long .Lst_ldouble-.Lstore_table /* FFI_TYPE_LONGDOUBLE */
.long .Lst_uint8-.Lstore_table /* FFI_TYPE_UINT8 */
.long .Lst_sint8-.Lstore_table /* FFI_TYPE_SINT8 */
.long .Lst_uint16-.Lstore_table /* FFI_TYPE_UINT16 */
.long .Lst_sint16-.Lstore_table /* FFI_TYPE_SINT16 */
.long .Lst_uint32-.Lstore_table /* FFI_TYPE_UINT32 */
.long .Lst_sint32-.Lstore_table /* FFI_TYPE_SINT32 */
.long .Lst_int64-.Lstore_table /* FFI_TYPE_UINT64 */
.long .Lst_int64-.Lstore_table /* FFI_TYPE_SINT64 */
.long .Lst_struct-.Lstore_table /* FFI_TYPE_STRUCT */
.long .Lst_int64-.Lstore_table /* FFI_TYPE_POINTER */
.text
.align 2
.globl float2sse
.type float2sse,@function
float2sse:
/* Save the contents of this sse-float in a pointer. */
movaps %xmm0, (%rdi)
.text
.align 2
.Lst_void:
ret
.align 2
.Lst_uint8:
movzbq %al, %rax
movq %rax, (%rdi)
ret
.align 2
.Lst_sint8:
movsbq %al, %rax
movq %rax, (%rdi)
ret
.align 2
.Lst_uint16:
movzwq %ax, %rax
movq %rax, (%rdi)
ret
.align 2
.Lst_sint16:
movswq %ax, %rax
movq %rax, (%rdi)
ret
.align 2
.Lst_uint32:
movl %eax, %eax
movq %rax, (%rdi)
ret
.align 2
.Lst_sint32:
cltq
movq %rax, (%rdi)
ret
.align 2
.Lst_int64:
movq %rax, (%rdi)
ret
.align 2
.globl floatfloat2sse
.type floatfloat2sse,@function
floatfloat2sse:
/* Save the contents of these two sse-floats in a pointer. */
movq (%rdi), %xmm0
movaps %xmm0, (%rsi)
.align 2
.Lst_float:
movss %xmm0, (%rdi)
ret
.align 2
.Lst_double:
movsd %xmm0, (%rdi)
ret
.Lst_ldouble:
fstpt (%rdi)
ret
.align 2
.globl double2sse
.type double2sse,@function
double2sse:
/* Save the contents of this sse-double in a pointer. */
movaps %xmm0, (%rdi)
.align 2
.Lst_struct:
leaq -20(%rsp), %rsi /* Scratch area in redzone. */
/* We have to locate the values now, and since we don't want to
write too much data into the user's return value, we spill the
value to a 16 byte scratch area first. Bits 8, 9, and 10
control where the values are located. Only one of the three
bits will be set; see ffi_prep_cif_machdep for the pattern. */
movd %xmm0, %r10
movd %xmm1, %r11
testl $0x100, %ecx
cmovnz %rax, %rdx
cmovnz %r10, %rax
testl $0x200, %ecx
cmovnz %r10, %rdx
testl $0x400, %ecx
cmovnz %r10, %rax
cmovnz %r11, %rdx
movq %rax, (%rsi)
movq %rdx, 8(%rsi)
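/* E.g. with bit 8 set (SSE word first, integer word second), the cmovs
leave %xmm0's word in %rax and the old %rax in %rdx, so the two stores
above need no branching. */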
/* Bits 11-31 contain the true size of the structure. Copy from
the scratch area to the true destination. */
shrl $11, %ecx
rep movsb
ret
.LUW3:
.size ffi_call_unix64,.-ffi_call_unix64
.align 2
.globl sse2float
.type sse2float,@function
sse2float:
/* Save the contents of this sse-float in a pointer. */
movaps (%rdi), %xmm0
ret
.globl ffi_closure_unix64
.type ffi_closure_unix64,@function
.align 2
.globl sse2double
.type sse2double,@function
sse2double:
/* Save the contents of this pointer in a sse-double. */
movaps (%rdi), %xmm0
ret
ffi_closure_unix64:
.LUW4:
subq $200, %rsp
.LUW5:
.align 2
.globl sse2floatfloat
.type sse2floatfloat,@function
sse2floatfloat:
/* Save the contents of this pointer in two sse-floats. */
movaps (%rdi), %xmm0
movq %xmm0, (%rsi)
ret
movq %rdi, (%rsp)
movq %rsi, 8(%rsp)
movq %rdx, 16(%rsp)
movq %rcx, 24(%rsp)
movq %r8, 32(%rsp)
movq %r9, 40(%rsp)
movdqa %xmm0, 48(%rsp)
movdqa %xmm1, 64(%rsp)
movdqa %xmm2, 80(%rsp)
movdqa %xmm3, 96(%rsp)
movdqa %xmm4, 112(%rsp)
movdqa %xmm5, 128(%rsp)
movdqa %xmm6, 144(%rsp)
movdqa %xmm7, 160(%rsp)
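/* Frame sketch: bytes 0..175 form a struct register_args (6 GPRs, 8 SSE
registers); bytes 176..199 are return-value scratch filled by the inner
function; the return address sits at 200 and stack arguments begin at
208, which is what the %rsi and %rcx arguments below encode. */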
.align 2
.globl ffi_closure_UNIX64
.type ffi_closure_UNIX64,@function
ffi_closure_UNIX64:
.LFB2:
pushq %rbp
.LCFI10:
movq %rsp, %rbp
.LCFI11:
subq $240, %rsp
.LCFI12:
movq %rdi, -176(%rbp)
movq %rsi, -168(%rbp)
movq %rdx, -160(%rbp)
movq %rcx, -152(%rbp)
movq %r8, -144(%rbp)
movq %r9, -136(%rbp)
/* FIXME: We can avoid all this stashing of XMM registers by
(in ffi_prep_closure) computing the number of
floating-point args and moving it into %rax before calling
this function. Once this is done, uncomment the next few
lines and only the essential XMM registers will be written
to memory. This is a significant saving. */
/* movzbl %al, %eax */
/* movq %rax, %rdx */
/* leaq 0(,%rdx,4), %rax */
/* leaq 2f(%rip), %rdx */
/* subq %rax, %rdx */
leaq -1(%rbp), %rax
/* jmp *%rdx */
movaps %xmm7, -15(%rax)
movaps %xmm6, -31(%rax)
movaps %xmm5, -47(%rax)
movaps %xmm4, -63(%rax)
movaps %xmm3, -79(%rax)
movaps %xmm2, -95(%rax)
movaps %xmm1, -111(%rax)
movaps %xmm0, -127(%rax)
2:
movl %edi, -180(%rbp)
movl $0, -224(%rbp)
movl $48, -220(%rbp)
leaq 16(%rbp), %rax
movq %rax, -216(%rbp)
leaq -176(%rbp), %rdx
movq %rdx, -208(%rbp)
leaq -224(%rbp), %rsi
movq %r10, %rdi
leaq 176(%rsp), %rsi
movq %rsp, %rdx
call ffi_closure_UNIX64_inner@PLT
leaq 208(%rsp), %rcx
call ffi_closure_unix64_inner@PLT
cmpl $FFI_TYPE_FLOAT, %eax
je 1f
cmpl $FFI_TYPE_DOUBLE, %eax
je 2f
cmpl $FFI_TYPE_LONGDOUBLE, %eax
je 3f
cmpl $FFI_TYPE_STRUCT, %eax
je 4f
popq %rax
leave
ret
1:
2:
3:
movaps -240(%rbp), %xmm0
leave
ret
4:
leave
/* Deallocate stack frame early; return value is now in redzone. */
addq $200, %rsp
.LUW6:
/* The first byte of the return value contains the FFI_TYPE. */
movzbl %al, %r10d
leaq .Lload_table(%rip), %r11
movslq (%r11, %r10, 4), %r10
addq %r11, %r10
jmp *%r10
.section .rodata
.Lload_table:
.long .Lld_void-.Lload_table /* FFI_TYPE_VOID */
.long .Lld_int32-.Lload_table /* FFI_TYPE_INT */
.long .Lld_float-.Lload_table /* FFI_TYPE_FLOAT */
.long .Lld_double-.Lload_table /* FFI_TYPE_DOUBLE */
.long .Lld_ldouble-.Lload_table /* FFI_TYPE_LONGDOUBLE */
.long .Lld_int8-.Lload_table /* FFI_TYPE_UINT8 */
.long .Lld_int8-.Lload_table /* FFI_TYPE_SINT8 */
.long .Lld_int16-.Lload_table /* FFI_TYPE_UINT16 */
.long .Lld_int16-.Lload_table /* FFI_TYPE_SINT16 */
.long .Lld_int32-.Lload_table /* FFI_TYPE_UINT32 */
.long .Lld_int32-.Lload_table /* FFI_TYPE_SINT32 */
.long .Lld_int64-.Lload_table /* FFI_TYPE_UINT64 */
.long .Lld_int64-.Lload_table /* FFI_TYPE_SINT64 */
.long .Lld_struct-.Lload_table /* FFI_TYPE_STRUCT */
.long .Lld_int64-.Lload_table /* FFI_TYPE_POINTER */
.text
.align 2
.Lld_void:
ret
.LFE2:
.section .eh_frame,EH_FRAME_FLAGS,@progbits
.Lframe0:
.long .LECIE1-.LSCIE1
.align 2
.Lld_int8:
movzbl -24(%rsp), %eax
ret
.align 2
.Lld_int16:
movzwl -24(%rsp), %eax
ret
.align 2
.Lld_int32:
movl -24(%rsp), %eax
ret
.align 2
.Lld_int64:
movq -24(%rsp), %rax
ret
.align 2
.Lld_float:
movss -24(%rsp), %xmm0
ret
.align 2
.Lld_double:
movsd -24(%rsp), %xmm0
ret
.align 2
.Lld_ldouble:
fldt -24(%rsp)
ret
.align 2
.Lld_struct:
/* There are four possibilities here, %rax/%rdx, %xmm0/%rax,
%rax/%xmm0, %xmm0/%xmm1. We collapse two by always loading
both rdx and xmm1 with the second word. For the remaining,
bit 8 set means xmm0 gets the second word, and bit 9 means
that rax gets the second word. */
movq -24(%rsp), %rcx
movq -16(%rsp), %rdx
movq -16(%rsp), %xmm1
testl $0x100, %eax
cmovnz %rdx, %rcx
movd %rcx, %xmm0
testl $0x200, %eax
movq -24(%rsp), %rax
cmovnz %rdx, %rax
ret
.LUW7:
.size ffi_closure_unix64,.-ffi_closure_unix64
.section .eh_frame,"a",@progbits
.Lframe1:
.long .LECIE1-.LSCIE1 /* CIE Length */
.LSCIE1:
.long 0x0
.byte 0x1
.string "zR"
.uleb128 0x1
.sleb128 -8
.byte 0x10
.uleb128 0x1
.byte 0x1b
.byte 0xc
.uleb128 0x7
.uleb128 0x8
.byte 0x90
.uleb128 0x1
.align 8
.long 0 /* CIE Identifier Tag */
.byte 1 /* CIE Version */
.ascii "zR\0" /* CIE Augmentation */
.uleb128 1 /* CIE Code Alignment Factor */
.sleb128 -8 /* CIE Data Alignment Factor */
.byte 0x10 /* CIE RA Column */
.uleb128 1 /* Augmentation size */
.byte 0x1b /* FDE Encoding (pcrel sdata4) */
.byte 0xc /* DW_CFA_def_cfa, %rsp offset 8 */
.uleb128 7
.uleb128 8
.byte 0x80+16 /* DW_CFA_offset, %rip offset 1*-8 */
.uleb128 1
.align 8
.LECIE1:
.LSFDE1:
.long .LEFDE1-.LASFDE1
.long .LEFDE1-.LASFDE1 /* FDE Length */
.LASFDE1:
.long .LASFDE1-.Lframe0
.long .LASFDE1-.Lframe1 /* FDE CIE offset */
.long .LUW0-. /* FDE initial location */
.long .LUW3-.LUW0 /* FDE address range */
.uleb128 0x0 /* Augmentation size */
.long .LFB1-.
.long .LFE1-.LFB1
.uleb128 0x0
.byte 0x4 # DW_CFA_advance_loc4
.long .LCFI0-.LFB1
.byte 0xe # DW_CFA_def_cfa_offset
.uleb128 0x10
.byte 0x86 # DW_CFA_offset: r6 at cfa-16
.uleb128 0x2
.byte 0x4 # DW_CFA_advance_loc4
.long .LCFI1-.LCFI0
.byte 0x86 # DW_CFA_offset: r6 at cfa-16
.uleb128 0x2
.byte 0xd # DW_CFA_def_cfa_reg: r6
.uleb128 0x6
.byte 0x4 /* DW_CFA_advance_loc4 */
.long .LUW1-.LUW0
/* New stack frame based off rbp. This is an itty bit of unwind
trickery in that the CFA *has* changed. There is no easy way
to describe it correctly on entry to the function. Fortunately,
it doesn't matter too much since at all points we can correctly
unwind back to ffi_call. Note that the location to which we
moved the return address is (the new) CFA-8, so from the
perspective of the unwind info, it hasn't moved. */
.byte 0xc /* DW_CFA_def_cfa, %rbp offset 32 */
.uleb128 6
.uleb128 32
.byte 0x80+6 /* DW_CFA_offset, %rbp offset 2*-8 */
.uleb128 2
.byte 0x4 /* DW_CFA_advance_loc4 */
.long .LUW2-.LUW1
.byte 0xc /* DW_CFA_def_cfa, %rsp offset 8 */
.uleb128 7
.uleb128 8
.byte 0xc0+6 /* DW_CFA_restore, %rbp */
.align 8
.LEFDE1:
.LSFDE3:
.long .LEFDE3-.LASFDE3 # FDE Length
.long .LEFDE3-.LASFDE3 /* FDE Length */
.LASFDE3:
.long .LASFDE3-.Lframe0 # FDE CIE offset
.long .LFB2-. # FDE initial location
.long .LFE2-.LFB2 # FDE address range
.uleb128 0x0 # Augmentation size
.byte 0x4 # DW_CFA_advance_loc4
.long .LCFI10-.LFB2
.byte 0xe # DW_CFA_def_cfa_offset
.uleb128 0x10
.byte 0x86 # DW_CFA_offset, column 0x6
.uleb128 0x2
.byte 0x4 # DW_CFA_advance_loc4
.long .LCFI11-.LCFI10
.byte 0xd # DW_CFA_def_cfa_register
.uleb128 0x6
.align 8
.long .LASFDE3-.Lframe1 /* FDE CIE offset */
.long .LUW4-. /* FDE initial location */
.long .LUW7-.LUW4 /* FDE address range */
.uleb128 0x0 /* Augmentation size */
.byte 0x4 /* DW_CFA_advance_loc4 */
.long .LUW5-.LUW4
.byte 0xe /* DW_CFA_def_cfa_offset */
.uleb128 208
.byte 0x4 /* DW_CFA_advance_loc4 */
.long .LUW6-.LUW5
.byte 0xe /* DW_CFA_def_cfa_offset */
.uleb128 8
.align 8
.LEFDE3:
#endif /* __x86_64__ */
#endif /* __x86_64__ */