From 1a0f488c328df63663eed29d18af44733ece3abc Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Sat, 25 Dec 2004 01:54:40 -0800 Subject: [PATCH] ffi64.c (struct register_args): Rename from stackLayout. * src/x86/ffi64.c (struct register_args): Rename from stackLayout. (enum x86_64_reg_class): Add X86_64_COMPLEX_X87_CLASS. (merge_classes): Check for it. (SSE_CLASS_P): New. (classify_argument): Pass byte_offset by value; perform all updates inside struct case. (examine_argument): Add classes argument; handle X86_64_COMPLEX_X87_CLASS. (ffi_prep_args): Merge into ... (ffi_call): ... here. Share stack frame with ffi_call_unix64. (ffi_prep_cif_machdep): Setup cif->flags for proper structure return. (ffi_fill_return_value): Remove. (ffi_prep_closure): Remove dead assert. (ffi_closure_unix64_inner): Rename from ffi_closure_UNIX64_inner. Rewrite to use struct register_args instead of va_list. Create flags for handling structure returns. * src/x86/unix64.S: Remove dead strings. (ffi_call_unix64): Rename from ffi_call_UNIX64. Rewrite to share stack frame with ffi_call. Handle structure returns properly. (float2sse, floatfloat2sse, double2sse): Remove. (sse2float, sse2double, sse2floatfloat): Remove. (ffi_closure_unix64): Rename from ffi_closure_UNIX64. Rewrite to handle structure returns properly. From-SVN: r92602 --- libffi/ChangeLog | 26 ++ libffi/src/x86/ffi64.c | 662 ++++++++++++++++------------------------ libffi/src/x86/unix64.S | 544 +++++++++++++++++++-------------- 3 files changed, 592 insertions(+), 640 deletions(-) diff --git a/libffi/ChangeLog b/libffi/ChangeLog index a8b1b8a04df..e26f22d9d21 100644 --- a/libffi/ChangeLog +++ b/libffi/ChangeLog @@ -1,3 +1,29 @@ +2004-12-25 Richard Henderson + + * src/x86/ffi64.c (struct register_args): Rename from stackLayout. + (enum x86_64_reg_class): Add X86_64_COMPLEX_X87_CLASS. + (merge_classes): Check for it. + (SSE_CLASS_P): New. 
+ (classify_argument): Pass byte_offset by value; perform all updates + inside struct case. + (examine_argument): Add classes argument; handle + X86_64_COMPLEX_X87_CLASS. + (ffi_prep_args): Merge into ... + (ffi_call): ... here. Share stack frame with ffi_call_unix64. + (ffi_prep_cif_machdep): Setup cif->flags for proper structure return. + (ffi_fill_return_value): Remove. + (ffi_prep_closure): Remove dead assert. + (ffi_closure_unix64_inner): Rename from ffi_closure_UNIX64_inner. + Rewrite to use struct register_args instead of va_list. Create + flags for handling structure returns. + * src/x86/unix64.S: Remove dead strings. + (ffi_call_unix64): Rename from ffi_call_UNIX64. Rewrite to share + stack frame with ffi_call. Handle structure returns properly. + (float2sse, floatfloat2sse, double2sse): Remove. + (sse2float, sse2double, sse2floatfloat): Remove. + (ffi_closure_unix64): Rename from ffi_closure_UNIX64. Rewrite + to handle structure returns properly. + 2004-12-08 David Edelsohn * Makefile.am (AM_MAKEFLAGS): Remove duplicate LIBCFLAGS and diff --git a/libffi/src/x86/ffi64.c b/libffi/src/x86/ffi64.c index 653d45c243a..754975ec060 100644 --- a/libffi/src/x86/ffi64.c +++ b/libffi/src/x86/ffi64.c @@ -29,22 +29,20 @@ #include #include -/* ffi_prep_args is called by the assembly routine once stack space - has been allocated for the function's arguments */ - #ifdef __x86_64__ #define MAX_GPR_REGS 6 #define MAX_SSE_REGS 8 -typedef struct + +struct register_args { /* Registers for argument passing. */ - long gpr[MAX_GPR_REGS]; + UINT64 gpr[MAX_GPR_REGS]; __int128_t sse[MAX_SSE_REGS]; +}; - /* Stack space for arguments. */ - char argspace[0]; -} stackLayout; +extern void ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags, + void *raddr, void (*fnaddr)()); /* All reference to register classes here is identical to the code in gcc/config/i386/i386.c. Do *not* change one without the other. 
*/ @@ -55,8 +53,7 @@ typedef struct use SF or DFmode move instead of DImode to avoid reformating penalties. Similary we play games with INTEGERSI_CLASS to use cheaper SImode moves - whenever possible (upper half does contain padding). - */ + whenever possible (upper half does contain padding). */ enum x86_64_reg_class { X86_64_NO_CLASS, @@ -68,11 +65,14 @@ enum x86_64_reg_class X86_64_SSEUP_CLASS, X86_64_X87_CLASS, X86_64_X87UP_CLASS, + X86_64_COMPLEX_X87_CLASS, X86_64_MEMORY_CLASS }; #define MAX_CLASSES 4 +#define SSE_CLASS_P(X) ((X) >= X86_64_SSE_CLASS && X <= X86_64_SSEUP_CLASS) + /* x86-64 register passing implementation. See x86-64 ABI for details. Goal of this code is to classify each 8bytes of incoming argument by the register class and assign registers accordingly. */ @@ -106,9 +106,14 @@ merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2) || class2 == X86_64_INTEGER_CLASS || class2 == X86_64_INTEGERSI_CLASS) return X86_64_INTEGER_CLASS; - /* Rule #5: If one of the classes is X87 or X87UP class, MEMORY is used. */ - if (class1 == X86_64_X87_CLASS || class1 == X86_64_X87UP_CLASS - || class2 == X86_64_X87_CLASS || class2 == X86_64_X87UP_CLASS) + /* Rule #5: If one of the classes is X87, X87UP, or COMPLEX_X87 class, + MEMORY is used. */ + if (class1 == X86_64_X87_CLASS + || class1 == X86_64_X87UP_CLASS + || class1 == X86_64_COMPLEX_X87_CLASS + || class2 == X86_64_X87_CLASS + || class2 == X86_64_X87UP_CLASS + || class2 == X86_64_COMPLEX_X87_CLASS) return X86_64_MEMORY_CLASS; /* Rule #6: Otherwise class SSE is used. */ @@ -125,11 +130,8 @@ merge_classes (enum x86_64_reg_class class1, enum x86_64_reg_class class2) */ static int classify_argument (ffi_type *type, enum x86_64_reg_class classes[], - int *byte_offset) + size_t byte_offset) { - /* First, align to the right place. 
*/ - *byte_offset = ALIGN(*byte_offset, type->alignment); - switch (type->type) { case FFI_TYPE_UINT8: @@ -141,13 +143,13 @@ classify_argument (ffi_type *type, enum x86_64_reg_class classes[], case FFI_TYPE_UINT64: case FFI_TYPE_SINT64: case FFI_TYPE_POINTER: - if (((*byte_offset) % 8 + type->size) <= 4) + if (byte_offset + type->size <= 4) classes[0] = X86_64_INTEGERSI_CLASS; else classes[0] = X86_64_INTEGER_CLASS; return 1; case FFI_TYPE_FLOAT: - if (((*byte_offset) % 8) == 0) + if (byte_offset == 0) classes[0] = X86_64_SSESF_CLASS; else classes[0] = X86_64_SSE_CLASS; @@ -175,22 +177,23 @@ classify_argument (ffi_type *type, enum x86_64_reg_class classes[], classes[i] = X86_64_NO_CLASS; /* Merge the fields of structure. */ - for (ptr=type->elements; (*ptr)!=NULL; ptr++) + for (ptr = type->elements; *ptr != NULL; ptr++) { int num; - num = classify_argument (*ptr, subclasses, byte_offset); + byte_offset = ALIGN (byte_offset, (*ptr)->alignment); + + num = classify_argument (*ptr, subclasses, byte_offset % 8); if (num == 0) return 0; for (i = 0; i < num; i++) { - int pos = *byte_offset / 8; + int pos = byte_offset / 8; classes[i + pos] = merge_classes (subclasses[i], classes[i + pos]); } - if ((*ptr)->type != FFI_TYPE_STRUCT) - *byte_offset += (*ptr)->size; + byte_offset += (*ptr)->size; } /* Final merger cleanup. */ @@ -222,155 +225,196 @@ classify_argument (ffi_type *type, enum x86_64_reg_class classes[], } /* Examine the argument and return set number of register required in each - class. Return 0 iff parameter should be passed in memory. */ + class. Return zero iff parameter should be passed in memory, otherwise + the number of registers. 
*/ + static int -examine_argument (ffi_type *type, int in_return, int *int_nregs,int *sse_nregs) +examine_argument (ffi_type *type, enum x86_64_reg_class classes[MAX_CLASSES], + _Bool in_return, int *pngpr, int *pnsse) { - enum x86_64_reg_class class[MAX_CLASSES]; - int offset = 0; - int n; - - n = classify_argument (type, class, &offset); + int i, n, ngpr, nsse; + n = classify_argument (type, classes, 0); if (n == 0) return 0; - *int_nregs = 0; - *sse_nregs = 0; - for (n--; n>=0; n--) - switch (class[n]) + ngpr = nsse = 0; + for (i = 0; i < n; ++i) + switch (classes[i]) { case X86_64_INTEGER_CLASS: case X86_64_INTEGERSI_CLASS: - (*int_nregs)++; + ngpr++; break; case X86_64_SSE_CLASS: case X86_64_SSESF_CLASS: case X86_64_SSEDF_CLASS: - (*sse_nregs)++; + nsse++; break; case X86_64_NO_CLASS: case X86_64_SSEUP_CLASS: break; case X86_64_X87_CLASS: case X86_64_X87UP_CLASS: - if (!in_return) - return 0; - break; + case X86_64_COMPLEX_X87_CLASS: + return in_return != 0; default: abort (); } - return 1; + + *pngpr = ngpr; + *pnsse = nsse; + + return n; } -/* Functions to load floats and double to an SSE register placeholder. */ -extern void float2sse (float, __int128_t *); -extern void double2sse (double, __int128_t *); -extern void floatfloat2sse (void *, __int128_t *); +/* Perform machine dependent cif processing. */ -/* Functions to put the floats and doubles back. */ -extern float sse2float (__int128_t *); -extern double sse2double (__int128_t *); -extern void sse2floatfloat(__int128_t *, void *); - -/*@-exportheader@*/ -void -ffi_prep_args (stackLayout *stack, extended_cif *ecif) -/*@=exportheader@*/ +ffi_status +ffi_prep_cif_machdep (ffi_cif *cif) { - int gprcount, ssecount, i, g, s; - void **p_argv; - void *argp = &stack->argspace; - ffi_type **p_arg; + int gprcount, ssecount, i, avn, n, ngpr, nsse, flags; + enum x86_64_reg_class classes[MAX_CLASSES]; + size_t bytes; - /* First check if the return value should be passed in memory. 
If so, - pass the pointer as the first argument. */ gprcount = ssecount = 0; - if (ecif->cif->rtype->type != FFI_TYPE_VOID - && examine_argument (ecif->cif->rtype, 1, &g, &s) == 0) - stack->gpr[gprcount++] = (long) ecif->rvalue; - for (i=ecif->cif->nargs, p_arg=ecif->cif->arg_types, p_argv = ecif->avalue; - i!=0; i--, p_arg++, p_argv++) + flags = cif->rtype->type; + if (flags != FFI_TYPE_VOID) { - int in_register = 0; - - switch ((*p_arg)->type) + n = examine_argument (cif->rtype, classes, 1, &ngpr, &nsse); + if (n == 0) { - case FFI_TYPE_SINT8: - case FFI_TYPE_SINT16: - case FFI_TYPE_SINT32: - case FFI_TYPE_SINT64: - case FFI_TYPE_UINT8: - case FFI_TYPE_UINT16: - case FFI_TYPE_UINT32: - case FFI_TYPE_UINT64: - case FFI_TYPE_POINTER: - if (gprcount < MAX_GPR_REGS) - { - stack->gpr[gprcount] = 0; - stack->gpr[gprcount++] = *(long long *)(*p_argv); - in_register = 1; - } - break; - - case FFI_TYPE_FLOAT: - if (ssecount < MAX_SSE_REGS) - { - float2sse (*(float *)(*p_argv), &stack->sse[ssecount++]); - in_register = 1; - } - break; - - case FFI_TYPE_DOUBLE: - if (ssecount < MAX_SSE_REGS) - { - double2sse (*(double *)(*p_argv), &stack->sse[ssecount++]); - in_register = 1; - } - break; + /* The return value is passed in memory. A pointer to that + memory is the first argument. Allocate a register for it. */ + gprcount++; + /* We don't have to do anything in asm for the return. */ + flags = FFI_TYPE_VOID; } - - if (in_register) - continue; - - /* Either all places in registers where filled, or this is a - type that potentially goes into a memory slot. */ - if (examine_argument (*p_arg, 0, &g, &s) == 0 - || gprcount + g > MAX_GPR_REGS || ssecount + s > MAX_SSE_REGS) + else if (flags == FFI_TYPE_STRUCT) { - /* Pass this argument in memory. */ - argp = (void *)ALIGN(argp, (*p_arg)->alignment); - /* Stack arguments are *always* at least 8 byte aligned. 
*/ - argp = (void *)ALIGN(argp, 8); - memcpy (argp, *p_argv, (*p_arg)->size); - argp += (*p_arg)->size; + /* Mark which registers the result appears in. */ + _Bool sse0 = SSE_CLASS_P (classes[0]); + _Bool sse1 = n == 2 && SSE_CLASS_P (classes[1]); + if (sse0 && !sse1) + flags |= 1 << 8; + else if (!sse0 && sse1) + flags |= 1 << 9; + else if (sse0 && sse1) + flags |= 1 << 10; + /* Mark the true size of the structure. */ + flags |= cif->rtype->size << 11; + } + } + cif->flags = flags; + + /* Go over all arguments and determine the way they should be passed. + If it's in a register and there is space for it, let that be so. If + not, add it's size to the stack byte count. */ + for (bytes = 0, i = 0, avn = cif->nargs; i < avn; i++) + { + if (examine_argument (cif->arg_types[i], classes, 0, &ngpr, &nsse) == 0 + || gprcount + ngpr > MAX_GPR_REGS + || ssecount + nsse > MAX_SSE_REGS) + { + long align = cif->arg_types[i]->alignment; + + if (align < 8) + align = 8; + + bytes = ALIGN(bytes, align); + bytes += cif->arg_types[i]->size; } else { - /* All easy cases are eliminated. Now fire the big guns. */ + gprcount += ngpr; + ssecount += nsse; + } + } + cif->bytes = bytes; - enum x86_64_reg_class classes[MAX_CLASSES]; - int offset = 0, j, num; - void *a; + return FFI_OK; +} - num = classify_argument (*p_arg, classes, &offset); - for (j=0, a=*p_argv; jabi == FFI_UNIX64); + + /* If the return value is a struct and we don't have a return value + address then we need to make one. Note the setting of flags to + VOID above in ffi_prep_cif_machdep. */ + ret_in_memory = (cif->rtype->type == FFI_TYPE_STRUCT + && cif->flags == FFI_TYPE_VOID); + if (rvalue == NULL && ret_in_memory) + rvalue = alloca (cif->rtype->size); + + /* Allocate the space for the arguments, plus 4 words of temp space. 
*/ + stack = alloca (sizeof (struct register_args) + cif->bytes + 4*8); + reg_args = (struct register_args *) stack; + argp = stack + sizeof (struct register_args); + + gprcount = ssecount = 0; + + /* If the return value is passed in memory, add the pointer as the + first integer argument. */ + if (ret_in_memory) + reg_args->gpr[gprcount++] = (long) rvalue; + + avn = cif->nargs; + arg_types = cif->arg_types; + + for (i = 0; i < avn; ++i) + { + size_t size = arg_types[i]->size; + int n; + + n = examine_argument (arg_types[i], classes, 0, &ngpr, &nsse); + if (n == 0 + || gprcount + ngpr > MAX_GPR_REGS + || ssecount + nsse > MAX_SSE_REGS) + { + long align = arg_types[i]->alignment; + + /* Stack arguments are *always* at least 8 byte aligned. */ + if (align < 8) + align = 8; + + /* Pass this argument in memory. */ + argp = (void *) ALIGN (argp, align); + memcpy (argp, avalue[i], size); + argp += size; + } + else + { + /* The argument is passed entirely in registers. */ + char *a = (char *) avalue[i]; + int j; + + for (j = 0; j < n; j++, a += 8, size -= 8) { switch (classes[j]) { case X86_64_INTEGER_CLASS: case X86_64_INTEGERSI_CLASS: - stack->gpr[gprcount++] = *(long long *)a; + reg_args->gpr[gprcount] = 0; + memcpy (®_args->gpr[gprcount], a, size < 8 ? size : 8); + gprcount++; break; case X86_64_SSE_CLASS: - floatfloat2sse (a, &stack->sse[ssecount++]); + case X86_64_SSEDF_CLASS: + reg_args->sse[ssecount++] = *(UINT64 *) a; break; case X86_64_SSESF_CLASS: - float2sse (*(float *)a, &stack->sse[ssecount++]); - break; - case X86_64_SSEDF_CLASS: - double2sse (*(double *)a, &stack->sse[ssecount++]); + reg_args->sse[ssecount++] = *(UINT32 *) a; break; default: abort(); @@ -378,203 +422,13 @@ ffi_prep_args (stackLayout *stack, extended_cif *ecif) } } } + + ffi_call_unix64 (stack, cif->bytes + sizeof (struct register_args), + cif->flags, rvalue, fn); } -/* Perform machine dependent cif processing. 
*/ -ffi_status -ffi_prep_cif_machdep (ffi_cif *cif) -{ - int gprcount, ssecount, i, g, s; - gprcount = ssecount = 0; - - /* Reset the byte count. We handle this size estimation here. */ - cif->bytes = 0; - - /* If the return value should be passed in memory, pass the pointer - as the first argument. The actual memory isn't allocated here. */ - if (cif->rtype->type != FFI_TYPE_VOID - && examine_argument (cif->rtype, 1, &g, &s) == 0) - gprcount = 1; - - /* Go over all arguments and determine the way they should be passed. - If it's in a register and there is space for it, let that be so. If - not, add it's size to the stack byte count. */ - for (i=0; inargs; i++) - { - if (examine_argument (cif->arg_types[i], 0, &g, &s) == 0 - || gprcount + g > MAX_GPR_REGS || ssecount + s > MAX_SSE_REGS) - { - /* This is passed in memory. First align to the basic type. */ - cif->bytes = ALIGN(cif->bytes, cif->arg_types[i]->alignment); - - /* Stack arguments are *always* at least 8 byte aligned. */ - cif->bytes = ALIGN(cif->bytes, 8); - - /* Now add the size of this argument. */ - cif->bytes += cif->arg_types[i]->size; - } - else - { - gprcount += g; - ssecount += s; - } - } - - /* Set the flag for the closures return. */ - switch (cif->rtype->type) - { - case FFI_TYPE_VOID: - case FFI_TYPE_STRUCT: - case FFI_TYPE_SINT64: - case FFI_TYPE_FLOAT: - case FFI_TYPE_DOUBLE: - case FFI_TYPE_LONGDOUBLE: - cif->flags = (unsigned) cif->rtype->type; - break; - - case FFI_TYPE_UINT64: - cif->flags = FFI_TYPE_SINT64; - break; - - default: - cif->flags = FFI_TYPE_INT; - break; - } - - return FFI_OK; -} - -typedef struct -{ - long gpr[2]; - __int128_t sse[2]; - long double st0; -} return_value; - -void -ffi_fill_return_value (return_value *rv, extended_cif *ecif) -{ - enum x86_64_reg_class classes[MAX_CLASSES]; - int i = 0, num; - long *gpr = rv->gpr; - __int128_t *sse = rv->sse; - signed char sc; - signed short ss; - - /* This is needed because of the way x86-64 handles signed short - integers. 
*/ - switch (ecif->cif->rtype->type) - { - case FFI_TYPE_SINT8: - sc = *(signed char *)gpr; - *(long long *)ecif->rvalue = (long long)sc; - return; - case FFI_TYPE_SINT16: - ss = *(signed short *)gpr; - *(long long *)ecif->rvalue = (long long)ss; - return; - default: - /* Just continue. */ - ; - } - - num = classify_argument (ecif->cif->rtype, classes, &i); - - if (num == 0) - /* Return in memory. */ - ecif->rvalue = (void *) rv->gpr[0]; - else if (num == 2 && classes[0] == X86_64_X87_CLASS && - classes[1] == X86_64_X87UP_CLASS) - /* This is a long double (this is easiest to handle this way instead - of an eightbyte at a time as in the loop below. */ - *((long double *)ecif->rvalue) = rv->st0; - else - { - void *a; - - for (i=0, a=ecif->rvalue; irtype, 1, &dummy, &dummy) == 0)) - { - /*@-sysunrecog@*/ - ecif.rvalue = alloca(cif->rtype->size); - /*@=sysunrecog@*/ - } - else - ecif.rvalue = rvalue; - - /* Stack must always be 16byte aligned. Make it so. */ - cif->bytes = ALIGN(cif->bytes, 16); - - switch (cif->abi) - { - case FFI_SYSV: - /* Calling 32bit code from 64bit is not possible */ - FFI_ASSERT(0); - break; - - case FFI_UNIX64: - /*@-usedef@*/ - ffi_call_UNIX64 (ffi_prep_args, ffi_fill_return_value, &ecif, - cif->bytes, ecif.rvalue, fn); - /*@=usedef@*/ - break; - - default: - FFI_ASSERT(0); - break; - } -} - -extern void ffi_closure_UNIX64(void); +extern void ffi_closure_unix64(void); ffi_status ffi_prep_closure (ffi_closure* closure, @@ -584,14 +438,12 @@ ffi_prep_closure (ffi_closure* closure, { volatile unsigned short *tramp; - /* FFI_ASSERT (cif->abi == FFI_OSF); */ - tramp = (volatile unsigned short *) &closure->tramp[0]; tramp[0] = 0xbb49; /* mov , %r11 */ tramp[5] = 0xba49; /* mov , %r10 */ tramp[10] = 0xff49; /* jmp *%r11 */ tramp[11] = 0x00e3; - *(void * volatile *) &tramp[1] = ffi_closure_UNIX64; + *(void * volatile *) &tramp[1] = ffi_closure_unix64; *(void * volatile *) &tramp[6] = closure; closure->cif = cif; @@ -602,107 +454,109 @@ 
ffi_prep_closure (ffi_closure* closure, } int -ffi_closure_UNIX64_inner(ffi_closure *closure, va_list l, void *rp) +ffi_closure_unix64_inner(ffi_closure *closure, void *rvalue, + struct register_args *reg_args, char *argp) { ffi_cif *cif; void **avalue; ffi_type **arg_types; - long i, avn, argn; + long i, avn; + int gprcount, ssecount, ngpr, nsse; + int ret; cif = closure->cif; avalue = alloca(cif->nargs * sizeof(void *)); + gprcount = ssecount = 0; - argn = 0; + ret = cif->rtype->type; + if (ret != FFI_TYPE_VOID) + { + enum x86_64_reg_class classes[MAX_CLASSES]; + int n = examine_argument (cif->rtype, classes, 1, &ngpr, &nsse); + if (n == 0) + { + /* The return value goes in memory. Arrange for the closure + return value to go directly back to the original caller. */ + rvalue = (void *) reg_args->gpr[gprcount++]; + /* We don't have to do anything in asm for the return. */ + ret = FFI_TYPE_VOID; + } + else if (ret == FFI_TYPE_STRUCT && n == 2) + { + /* Mark which register the second word of the structure goes in. */ + _Bool sse0 = SSE_CLASS_P (classes[0]); + _Bool sse1 = SSE_CLASS_P (classes[1]); + if (!sse0 && sse1) + ret |= 1 << 8; + else if (sse0 && !sse1) + ret |= 1 << 9; + } + } - i = 0; avn = cif->nargs; arg_types = cif->arg_types; - /* Grab the addresses of the arguments from the stack frame. 
*/ - while (i < avn) + for (i = 0; i < avn; ++i) { - switch (arg_types[i]->type) + enum x86_64_reg_class classes[MAX_CLASSES]; + int n; + + n = examine_argument (arg_types[i], classes, 0, &ngpr, &nsse); + if (n == 0 + || gprcount + ngpr > MAX_GPR_REGS + || ssecount + nsse > MAX_SSE_REGS) { - case FFI_TYPE_SINT8: - case FFI_TYPE_UINT8: - case FFI_TYPE_SINT16: - case FFI_TYPE_UINT16: - case FFI_TYPE_SINT32: - case FFI_TYPE_UINT32: - case FFI_TYPE_SINT64: - case FFI_TYPE_UINT64: - case FFI_TYPE_POINTER: - { - if (l->gp_offset > 48-8) - { - avalue[i] = l->overflow_arg_area; - l->overflow_arg_area = (char *)l->overflow_arg_area + 8; - } - else - { - avalue[i] = (char *)l->reg_save_area + l->gp_offset; - l->gp_offset += 8; - } - } - break; + long align = arg_types[i]->alignment; - case FFI_TYPE_STRUCT: - /* FIXME */ - FFI_ASSERT(0); - break; + /* Stack arguments are *always* at least 8 byte aligned. */ + if (align < 8) + align = 8; - case FFI_TYPE_DOUBLE: - { - if (l->fp_offset > 176-16) - { - avalue[i] = l->overflow_arg_area; - l->overflow_arg_area = (char *)l->overflow_arg_area + 8; - } - else - { - avalue[i] = (char *)l->reg_save_area + l->fp_offset; - l->fp_offset += 16; - } - } -#if DEBUG_FFI - fprintf (stderr, "double arg %d = %g\n", i, *(double *)avalue[i]); -#endif - break; - - case FFI_TYPE_FLOAT: - { - if (l->fp_offset > 176-16) - { - avalue[i] = l->overflow_arg_area; - l->overflow_arg_area = (char *)l->overflow_arg_area + 8; - } - else - { - avalue[i] = (char *)l->reg_save_area + l->fp_offset; - l->fp_offset += 16; - } - } -#if DEBUG_FFI - fprintf (stderr, "float arg %d = %g\n", i, *(float *)avalue[i]); -#endif - break; - - default: - FFI_ASSERT(0); + /* Pass this argument in memory. */ + argp = (void *) ALIGN (argp, align); + avalue[i] = argp; + argp += arg_types[i]->size; } + /* If the argument is in a single register, or two consecutive + registers, then we can use that address directly. 
*/ + else if (n == 1 + || (n == 2 + && SSE_CLASS_P (classes[0]) == SSE_CLASS_P (classes[1]))) + { + /* The argument is in a single register. */ + if (SSE_CLASS_P (classes[0])) + { + avalue[i] = ®_args->sse[ssecount]; + ssecount += n; + } + else + { + avalue[i] = ®_args->gpr[gprcount]; + gprcount += n; + } + } + /* Otherwise, allocate space to make them consecutive. */ + else + { + char *a = alloca (16); + int j; - argn += ALIGN(arg_types[i]->size, FFI_SIZEOF_ARG) / FFI_SIZEOF_ARG; - i++; + avalue[i] = a; + for (j = 0; j < n; j++, a += 8) + { + if (SSE_CLASS_P (classes[j])) + memcpy (a, ®_args->sse[ssecount++], 8); + else + memcpy (a, ®_args->gpr[gprcount++], 8); + } + } } /* Invoke the closure. */ - (closure->fun) (cif, rp, avalue, closure->user_data); + closure->fun (cif, rvalue, avalue, closure->user_data); - /* FIXME: Structs not supported. */ - FFI_ASSERT(cif->rtype->type != FFI_TYPE_STRUCT); - - /* Tell ffi_closure_UNIX64 how to perform return type promotions. */ - - return cif->rtype->type; + /* Tell assembly how to perform return type promotions. */ + return ret; } -#endif /* ifndef __x86_64__ */ + +#endif /* __x86_64__ */ diff --git a/libffi/src/x86/unix64.S b/libffi/src/x86/unix64.S index 310fed71843..5e1c6c5e516 100644 --- a/libffi/src/x86/unix64.S +++ b/libffi/src/x86/unix64.S @@ -28,276 +28,348 @@ #include #include - .section .rodata -.LC0: - .string "asm in progress %lld\n" -.LC1: - .string "asm in progress\n" .text + +/* ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags, + void *raddr, void (*fnaddr)()); + + Bit o trickiness here -- ARGS+BYTES is the base of the stack frame + for this function. This has been allocated by ffi_call. We also + deallocate some of the stack that has been alloca'd. 
*/ + .align 2 -.globl ffi_call_UNIX64 - .type ffi_call_UNIX64,@function + .globl ffi_call_unix64 + .type ffi_call_unix64,@function -ffi_call_UNIX64: -.LFB1: - pushq %rbp -.LCFI0: - movq %rsp, %rbp -.LCFI1: - /* Save all arguments */ - subq $48, %rsp -.LCFI2: - movq %rdi, -8(%rbp) /* ffi_prep_args */ - movq %rsi, -16(%rbp) /* ffi_fill_return_value */ - movq %rdx, -24(%rbp) /* ecif */ - movq %rcx, -32(%rbp) /* cif->bytes */ - movq %r8, -40(%rbp) /* ecif.rvalue */ - movq %r9, -48(%rbp) /* fn */ +ffi_call_unix64: +.LUW0: + movq (%rsp), %r10 /* Load return address. */ + leaq (%rdi, %rsi), %rax /* Find local stack base. */ + movq %rdx, (%rax) /* Save flags. */ + movq %rcx, 8(%rax) /* Save raddr. */ + movq %rbp, 16(%rax) /* Save old frame pointer. */ + movq %r10, 24(%rax) /* Relocate return address. */ + movq %rax, %rbp /* Finalize local stack frame. */ +.LUW1: + movq %rdi, %r10 /* Save a copy of the register area. */ + movq %r8, %r11 /* Save a copy of the target fn. */ - /* Make room for all of the new args and the register args */ - addl $176, %ecx -.LCFI3: - subq %rcx, %rsp -.LCFI4: - /* Setup the call to ffi_prep_args. */ - movq %rdi, %rax /* &ffi_prep_args */ - movq %rsp, %rdi /* stackLayout */ - movq %rdx, %rsi /* ecif */ - call *%rax /* ffi_prep_args(stackLayout, ecif);*/ + /* Load up all argument registers. */ + movq (%r10), %rdi + movq 8(%r10), %rsi + movq 16(%r10), %rdx + movq 24(%r10), %rcx + movq 32(%r10), %r8 + movq 40(%r10), %r9 + movdqa 48(%r10), %xmm0 + movdqa 64(%r10), %xmm1 + movdqa 80(%r10), %xmm2 + movdqa 96(%r10), %xmm3 + movdqa 112(%r10), %xmm4 + movdqa 128(%r10), %xmm5 + movdqa 144(%r10), %xmm6 + movdqa 160(%r10), %xmm7 - /* ffi_prep_args have put all the register contents into the */ - /* stackLayout struct. Now put the register values in place. 
*/ - movq (%rsp), %rdi - movq 8(%rsp), %rsi - movq 16(%rsp), %rdx - movq 24(%rsp), %rcx - movq 32(%rsp), %r8 - movq 40(%rsp), %r9 - movaps 48(%rsp), %xmm0 - movaps 64(%rsp), %xmm1 - movaps 80(%rsp), %xmm2 - movaps 96(%rsp), %xmm3 - movaps 112(%rsp), %xmm4 - movaps 128(%rsp), %xmm5 - movaps 144(%rsp), %xmm6 - movaps 160(%rsp), %xmm7 + /* Deallocate the reg arg area. */ + leaq 176(%r10), %rsp - /* Remove space for stackLayout so stack arguments are placed - correctly for the call. */ -.LCFI5: - addq $176, %rsp -.LCFI6: /* Call the user function. */ - call *-48(%rbp) + call *%r11 - /* Make stack space for the return_value struct. */ - subq $64, %rsp + /* Deallocate stack arg area; local stack frame in redzone. */ + leaq 24(%rbp), %rsp - /* Fill in all potential return values to this struct. */ - movq %rax, (%rsp) - movq %rdx, 8(%rsp) - movaps %xmm0, 16(%rsp) - movaps %xmm1, 32(%rsp) - fstpt 48(%rsp) + movq 0(%rbp), %rcx /* Reload flags. */ + movq 8(%rbp), %rdi /* Reload raddr. */ + movq 16(%rbp), %rbp /* Reload old frame pointer. */ +.LUW2: - /* Now call ffi_fill_return_value. */ - movq %rsp, %rdi /* struct return_value */ - movq -24(%rbp), %rsi /* ecif */ - movq -16(%rbp), %rax /* &ffi_fill_return_value */ - call *%rax /* call it */ + /* The first byte of the flags contains the FFI_TYPE. */ + movzbl %cl, %r10d + leaq .Lstore_table(%rip), %r11 + movslq (%r11, %r10, 4), %r10 + addq %r11, %r10 + jmp *%r10 - /* And the work is done. 
*/ - leave - ret -.LFE1: -.ffi_call_UNIX64_end: - .size ffi_call_UNIX64,.ffi_call_UNIX64_end-ffi_call_UNIX64 + .section .rodata +.Lstore_table: + .long .Lst_void-.Lstore_table /* FFI_TYPE_VOID */ + .long .Lst_sint32-.Lstore_table /* FFI_TYPE_INT */ + .long .Lst_float-.Lstore_table /* FFI_TYPE_FLOAT */ + .long .Lst_double-.Lstore_table /* FFI_TYPE_DOUBLE */ + .long .Lst_ldouble-.Lstore_table /* FFI_TYPE_LONGDOUBLE */ + .long .Lst_uint8-.Lstore_table /* FFI_TYPE_UINT8 */ + .long .Lst_sint8-.Lstore_table /* FFI_TYPE_SINT8 */ + .long .Lst_uint16-.Lstore_table /* FFI_TYPE_UINT16 */ + .long .Lst_sint16-.Lstore_table /* FFI_TYPE_SINT16 */ + .long .Lst_uint32-.Lstore_table /* FFI_TYPE_UINT32 */ + .long .Lst_sint32-.Lstore_table /* FFI_TYPE_SINT32 */ + .long .Lst_int64-.Lstore_table /* FFI_TYPE_UINT64 */ + .long .Lst_int64-.Lstore_table /* FFI_TYPE_SINT64 */ + .long .Lst_struct-.Lstore_table /* FFI_TYPE_STRUCT */ + .long .Lst_int64-.Lstore_table /* FFI_TYPE_POINTER */ -.text - .align 2 -.globl float2sse - .type float2sse,@function -float2sse: - /* Save the contents of this sse-float in a pointer. */ - movaps %xmm0, (%rdi) + .text + .align 2 +.Lst_void: + ret + .align 2 + +.Lst_uint8: + movzbq %al, %rax + movq %rax, (%rdi) + ret + .align 2 +.Lst_sint8: + movsbq %al, %rax + movq %rax, (%rdi) + ret + .align 2 +.Lst_uint16: + movzwq %ax, %rax + movq %rax, (%rdi) + .align 2 +.Lst_sint16: + movswq %ax, %rax + movq %rax, (%rdi) + ret + .align 2 +.Lst_uint32: + movl %eax, %eax + movq %rax, (%rdi) + .align 2 +.Lst_sint32: + cltq + movq %rax, (%rdi) + ret + .align 2 +.Lst_int64: + movq %rax, (%rdi) ret - .align 2 -.globl floatfloat2sse - .type floatfloat2sse,@function -floatfloat2sse: - /* Save the contents of these two sse-floats in a pointer. 
*/ - movq (%rdi), %xmm0 - movaps %xmm0, (%rsi) + .align 2 +.Lst_float: + movss %xmm0, (%rdi) + ret + .align 2 +.Lst_double: + movsd %xmm0, (%rdi) + ret +.Lst_ldouble: + fstpt (%rdi) ret - .align 2 -.globl double2sse - .type double2sse,@function -double2sse: - /* Save the contents of this sse-double in a pointer. */ - movaps %xmm0, (%rdi) + .align 2 +.Lst_struct: + leaq -20(%rsp), %rsi /* Scratch area in redzone. */ + + /* We have to locate the values now, and since we don't want to + write too much data into the user's return value, we spill the + value to a 16 byte scratch area first. Bits 8, 9, and 10 + control where the values are located. Only one of the three + bits will be set; see ffi_prep_cif_machdep for the pattern. */ + movd %xmm0, %r10 + movd %xmm1, %r11 + testl $0x100, %ecx + cmovnz %rax, %rdx + cmovnz %r10, %rax + testl $0x200, %ecx + cmovnz %r10, %rdx + testl $0x400, %ecx + cmovnz %r10, %rax + cmovnz %r11, %rdx + movq %rax, (%rsi) + movq %rdx, 8(%rsi) + + /* Bits 11-31 contain the true size of the structure. Copy from + the scratch area to the true destination. */ + shrl $11, %ecx + rep movsb ret +.LUW3: + .size ffi_call_unix64,.-ffi_call_unix64 .align 2 -.globl sse2float - .type sse2float,@function -sse2float: - /* Save the contents of this sse-float in a pointer. */ - movaps (%rdi), %xmm0 - ret + .globl ffi_closure_unix64 + .type ffi_closure_unix64,@function - .align 2 -.globl sse2double - .type sse2double,@function -sse2double: - /* Save the contents of this pointer in a sse-double. */ - movaps (%rdi), %xmm0 - ret +ffi_closure_unix64: +.LUW4: + subq $200, %rsp +.LUW5: - .align 2 -.globl sse2floatfloat - .type sse2floatfloat,@function -sse2floatfloat: - /* Save the contents of this pointer in two sse-floats. 
*/ - movaps (%rdi), %xmm0 - movq %xmm0, (%rsi) - ret + movq %rdi, (%rsp) + movq %rsi, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rcx, 24(%rsp) + movq %r8, 32(%rsp) + movq %r9, 40(%rsp) + movdqa %xmm0, 48(%rsp) + movdqa %xmm1, 64(%rsp) + movdqa %xmm2, 80(%rsp) + movdqa %xmm3, 96(%rsp) + movdqa %xmm4, 112(%rsp) + movdqa %xmm5, 128(%rsp) + movdqa %xmm6, 144(%rsp) + movdqa %xmm7, 160(%rsp) - .align 2 -.globl ffi_closure_UNIX64 - .type ffi_closure_UNIX64,@function - -ffi_closure_UNIX64: -.LFB2: - pushq %rbp -.LCFI10: - movq %rsp, %rbp -.LCFI11: - subq $240, %rsp -.LCFI12: - movq %rdi, -176(%rbp) - movq %rsi, -168(%rbp) - movq %rdx, -160(%rbp) - movq %rcx, -152(%rbp) - movq %r8, -144(%rbp) - movq %r9, -136(%rbp) - /* FIXME: We can avoid all this stashing of XMM registers by - (in ffi_prep_closure) computing the number of - floating-point args and moving it into %rax before calling - this function. Once this is done, uncomment the next few - lines and only the essential XMM registers will be written - to memory. This is a significant saving. 
*/ -/* movzbl %al, %eax */ -/* movq %rax, %rdx */ -/* leaq 0(,%rdx,4), %rax */ -/* leaq 2f(%rip), %rdx */ -/* subq %rax, %rdx */ - leaq -1(%rbp), %rax -/* jmp *%rdx */ - movaps %xmm7, -15(%rax) - movaps %xmm6, -31(%rax) - movaps %xmm5, -47(%rax) - movaps %xmm4, -63(%rax) - movaps %xmm3, -79(%rax) - movaps %xmm2, -95(%rax) - movaps %xmm1, -111(%rax) - movaps %xmm0, -127(%rax) -2: - movl %edi, -180(%rbp) - movl $0, -224(%rbp) - movl $48, -220(%rbp) - leaq 16(%rbp), %rax - movq %rax, -216(%rbp) - leaq -176(%rbp), %rdx - movq %rdx, -208(%rbp) - leaq -224(%rbp), %rsi movq %r10, %rdi + leaq 176(%rsp), %rsi movq %rsp, %rdx - call ffi_closure_UNIX64_inner@PLT + leaq 208(%rsp), %rcx + call ffi_closure_unix64_inner@PLT - cmpl $FFI_TYPE_FLOAT, %eax - je 1f - cmpl $FFI_TYPE_DOUBLE, %eax - je 2f - cmpl $FFI_TYPE_LONGDOUBLE, %eax - je 3f - cmpl $FFI_TYPE_STRUCT, %eax - je 4f - popq %rax - leave - ret -1: -2: -3: - movaps -240(%rbp), %xmm0 - leave - ret -4: - leave + /* Deallocate stack frame early; return value is now in redzone. */ + addq $200, %rsp +.LUW6: + + /* The first byte of the return value contains the FFI_TYPE. 
*/ + movzbl %al, %r10d + leaq .Lload_table(%rip), %r11 + movslq (%r11, %r10, 4), %r10 + addq %r11, %r10 + jmp *%r10 + + .section .rodata +.Lload_table: + .long .Lld_void-.Lload_table /* FFI_TYPE_VOID */ + .long .Lld_int32-.Lload_table /* FFI_TYPE_INT */ + .long .Lld_float-.Lload_table /* FFI_TYPE_FLOAT */ + .long .Lld_double-.Lload_table /* FFI_TYPE_DOUBLE */ + .long .Lld_ldouble-.Lload_table /* FFI_TYPE_LONGDOUBLE */ + .long .Lld_int8-.Lload_table /* FFI_TYPE_UINT8 */ + .long .Lld_int8-.Lload_table /* FFI_TYPE_SINT8 */ + .long .Lld_int16-.Lload_table /* FFI_TYPE_UINT16 */ + .long .Lld_int16-.Lload_table /* FFI_TYPE_SINT16 */ + .long .Lld_int32-.Lload_table /* FFI_TYPE_UINT32 */ + .long .Lld_int32-.Lload_table /* FFI_TYPE_SINT32 */ + .long .Lld_int64-.Lload_table /* FFI_TYPE_UINT64 */ + .long .Lld_int64-.Lload_table /* FFI_TYPE_SINT64 */ + .long .Lld_struct-.Lload_table /* FFI_TYPE_STRUCT */ + .long .Lld_int64-.Lload_table /* FFI_TYPE_POINTER */ + + .text + .align 2 +.Lld_void: ret -.LFE2: - - .section .eh_frame,EH_FRAME_FLAGS,@progbits -.Lframe0: - .long .LECIE1-.LSCIE1 + + .align 2 +.Lld_int8: + movzbl -24(%rsp), %eax + ret + .align 2 +.Lld_int16: + movzwl -24(%rsp), %eax + ret + .align 2 +.Lld_int32: + movl -24(%rsp), %eax + ret + .align 2 +.Lld_int64: + movq -24(%rsp), %rax + ret + + .align 2 +.Lld_float: + movss -24(%rsp), %xmm0 + ret + .align 2 +.Lld_double: + movsd -24(%rsp), %xmm0 + ret + .align 2 +.Lld_ldouble: + fldt -24(%rsp) + ret + + .align 2 +.Lld_struct: + /* There are four possibilities here, %rax/%rdx, %xmm0/%rax, + %rax/%xmm0, %xmm0/%xmm1. We collapse two by always loading + both rdx and xmm1 with the second word. For the remaining, + bit 8 set means xmm0 gets the second word, and bit 9 means + that rax gets the second word. 
*/ + movq -24(%rsp), %rcx + movq -16(%rsp), %rdx + movq -16(%rsp), %xmm1 + testl $0x100, %eax + cmovnz %rdx, %rcx + movd %rcx, %xmm0 + testl $0x200, %eax + movq -24(%rsp), %rax + cmovnz %rdx, %rax + ret +.LUW7: + .size ffi_closure_unix64,.-ffi_closure_unix64 + + .section .eh_frame,"a",@progbits +.Lframe1: + .long .LECIE1-.LSCIE1 /* CIE Length */ .LSCIE1: - .long 0x0 - .byte 0x1 - .string "zR" - .uleb128 0x1 - .sleb128 -8 - .byte 0x10 - .uleb128 0x1 - .byte 0x1b - .byte 0xc - .uleb128 0x7 - .uleb128 0x8 - .byte 0x90 - .uleb128 0x1 - .align 8 + .long 0 /* CIE Identifier Tag */ + .byte 1 /* CIE Version */ + .ascii "zR\0" /* CIE Augmentation */ + .uleb128 1 /* CIE Code Alignment Factor */ + .sleb128 -8 /* CIE Data Alignment Factor */ + .byte 0x10 /* CIE RA Column */ + .uleb128 1 /* Augmentation size */ + .byte 0x1b /* FDE Encoding (pcrel sdata4) */ + .byte 0xc /* DW_CFA_def_cfa, %rsp offset 8 */ + .uleb128 7 + .uleb128 8 + .byte 0x80+16 /* DW_CFA_offset, %rip offset 1*-8 */ + .uleb128 1 + .align 8 .LECIE1: .LSFDE1: - .long .LEFDE1-.LASFDE1 + .long .LEFDE1-.LASFDE1 /* FDE Length */ .LASFDE1: - .long .LASFDE1-.Lframe0 + .long .LASFDE1-.Lframe1 /* FDE CIE offset */ + .long .LUW0-. /* FDE initial location */ + .long .LUW3-.LUW0 /* FDE address range */ + .uleb128 0x0 /* Augmentation size */ - .long .LFB1-. - .long .LFE1-.LFB1 - .uleb128 0x0 - .byte 0x4 # DW_CFA_advance_loc4 - .long .LCFI0-.LFB1 - .byte 0xe # DW_CFA_def_cfa_offset - .uleb128 0x10 - .byte 0x86 # DW_CFA_offset: r6 at cfa-16 - .uleb128 0x2 - .byte 0x4 # DW_CFA_advance_loc4 - .long .LCFI1-.LCFI0 - .byte 0x86 # DW_CFA_offset: r6 at cfa-16 - .uleb128 0x2 - .byte 0xd # DW_CFA_def_cfa_reg: r6 - .uleb128 0x6 + .byte 0x4 /* DW_CFA_advance_loc4 */ + .long .LUW1-.LUW0 + + /* New stack frame based off rbp. This is an itty bit of unwind + trickery in that the CFA *has* changed. There is no easy way + to describe it correctly on entry to the function.
Fortunately, + it doesn't matter too much since at all points we can correctly + unwind back to ffi_call. Note that the location to which we + moved the return address is (the new) CFA-8, so from the + perspective of the unwind info, it hasn't moved. */ + .byte 0xc /* DW_CFA_def_cfa, %rbp offset 32 */ + .uleb128 6 + .uleb128 32 + .byte 0x80+6 /* DW_CFA_offset, %rbp offset 2*-8 */ + .uleb128 2 + + .byte 0x4 /* DW_CFA_advance_loc4 */ + .long .LUW2-.LUW1 /* delta from .LUW1, the location set by the previous advance */ + .byte 0xc /* DW_CFA_def_cfa, %rsp offset 8 */ + .uleb128 7 + .uleb128 8 + .byte 0xc0+6 /* DW_CFA_restore, %rbp */ .align 8 .LEFDE1: .LSFDE3: - .long .LEFDE3-.LASFDE3 # FDE Length + .long .LEFDE3-.LASFDE3 /* FDE Length */ .LASFDE3: - .long .LASFDE3-.Lframe0 # FDE CIE offset - - .long .LFB2-. # FDE initial location - .long .LFE2-.LFB2 # FDE address range - .uleb128 0x0 # Augmentation size - .byte 0x4 # DW_CFA_advance_loc4 - .long .LCFI10-.LFB2 - .byte 0xe # DW_CFA_def_cfa_offset - .uleb128 0x10 - .byte 0x86 # DW_CFA_offset, column 0x6 - .uleb128 0x2 - .byte 0x4 # DW_CFA_advance_loc4 - .long .LCFI11-.LCFI10 - .byte 0xd # DW_CFA_def_cfa_register - .uleb128 0x6 - .align 8 + .long .LASFDE3-.Lframe1 /* FDE CIE offset */ + .long .LUW4-. /* FDE initial location */ + .long .LUW7-.LUW4 /* FDE address range */ + .uleb128 0x0 /* Augmentation size */ + .byte 0x4 /* DW_CFA_advance_loc4 */ + .long .LUW5-.LUW4 + .byte 0xe /* DW_CFA_def_cfa_offset */ + .uleb128 208 + .byte 0x4 /* DW_CFA_advance_loc4 */ + .long .LUW6-.LUW5 + .byte 0xe /* DW_CFA_def_cfa_offset */ + .uleb128 8 + .align 8 .LEFDE3: -#endif /* __x86_64__ */ +#endif /* __x86_64__ */