tcg/tci: Use ffi for calls

This requires adjusting where arguments are stored.
Place them on the stack at left-aligned positions.
Adjust the stack frame to be at entirely positive offsets.

Tested-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Reviewed-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
This commit is contained in:
Richard Henderson 2021-01-30 14:24:25 -08:00
parent bcb81061dc
commit 7b7d8b2d9a
5 changed files with 153 additions and 106 deletions

View File

@ -52,6 +52,7 @@
#define MAX_OPC_PARAM (4 + (MAX_OPC_PARAM_PER_ARG * MAX_OPC_PARAM_ARGS))
#define CPU_TEMP_BUF_NLONGS 128
#define TCG_STATIC_FRAME_SIZE (CPU_TEMP_BUF_NLONGS * sizeof(long))
/* Default target word size to pointer size. */
#ifndef TCG_TARGET_REG_BITS

View File

@ -147,7 +147,12 @@ static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg, TCGReg arg1,
intptr_t arg2);
static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
TCGReg base, intptr_t ofs);
#ifdef CONFIG_TCG_INTERPRETER
static void tcg_out_call(TCGContext *s, const tcg_insn_unit *target,
ffi_cif *cif);
#else
static void tcg_out_call(TCGContext *s, const tcg_insn_unit *target);
#endif
static bool tcg_target_const_match(int64_t val, TCGType type, int ct);
#ifdef TCG_TARGET_NEED_LDST_LABELS
static int tcg_out_ldst_finalize(TCGContext *s);
@ -1554,25 +1559,37 @@ void tcg_gen_callN(void *func, TCGTemp *ret, int nargs, TCGTemp **args)
for (i = 0; i < nargs; i++) {
int argtype = extract32(typemask, (i + 1) * 3, 3);
bool is_64bit = (argtype & ~1) == dh_typecode_i64;
bool want_align = false;
#if defined(CONFIG_TCG_INTERPRETER)
/*
* Align all arguments, so that they land in predictable places
* for passing off to ffi_call.
*/
want_align = true;
#elif defined(TCG_TARGET_CALL_ALIGN_ARGS)
/* Some targets want aligned 64 bit args */
want_align = is_64bit;
#endif
if (TCG_TARGET_REG_BITS < 64 && want_align && (real_args & 1)) {
op->args[pi++] = TCG_CALL_DUMMY_ARG;
real_args++;
}
if (TCG_TARGET_REG_BITS < 64 && is_64bit) {
#ifdef TCG_TARGET_CALL_ALIGN_ARGS
/* some targets want aligned 64 bit args */
if (real_args & 1) {
op->args[pi++] = TCG_CALL_DUMMY_ARG;
real_args++;
}
#endif
/* If stack grows up, then we will be placing successive
arguments at lower addresses, which means we need to
reverse the order compared to how we would normally
treat either big or little-endian. For those arguments
that will wind up in registers, this still works for
HPPA (the only current STACK_GROWSUP target) since the
argument registers are *also* allocated in decreasing
order. If another such target is added, this logic may
have to get more complicated to differentiate between
stack arguments and register arguments. */
/*
* If stack grows up, then we will be placing successive
* arguments at lower addresses, which means we need to
* reverse the order compared to how we would normally
* treat either big or little-endian. For those arguments
* that will wind up in registers, this still works for
* HPPA (the only current STACK_GROWSUP target) since the
* argument registers are *also* allocated in decreasing
* order. If another such target is added, this logic may
* have to get more complicated to differentiate between
* stack arguments and register arguments.
*/
#if defined(HOST_WORDS_BIGENDIAN) != defined(TCG_TARGET_STACK_GROWSUP)
op->args[pi++] = temp_arg(args[i] + 1);
op->args[pi++] = temp_arg(args[i]);
@ -3836,6 +3853,7 @@ static void tcg_reg_alloc_call(TCGContext *s, TCGOp *op)
const int nb_oargs = TCGOP_CALLO(op);
const int nb_iargs = TCGOP_CALLI(op);
const TCGLifeData arg_life = op->life;
const TCGHelperInfo *info;
int flags, nb_regs, i;
TCGReg reg;
TCGArg arg;
@ -3847,7 +3865,8 @@ static void tcg_reg_alloc_call(TCGContext *s, TCGOp *op)
TCGRegSet allocated_regs;
func_addr = tcg_call_func(op);
flags = tcg_call_flags(op);
info = tcg_call_info(op);
flags = info->flags;
nb_regs = ARRAY_SIZE(tcg_target_call_iarg_regs);
if (nb_regs > nb_iargs) {
@ -3939,7 +3958,16 @@ static void tcg_reg_alloc_call(TCGContext *s, TCGOp *op)
save_globals(s, allocated_regs);
}
#ifdef CONFIG_TCG_INTERPRETER
{
gpointer hash = (gpointer)(uintptr_t)info->typemask;
ffi_cif *cif = g_hash_table_lookup(ffi_table, hash);
assert(cif != NULL);
tcg_out_call(s, func_addr, cif);
}
#else
tcg_out_call(s, func_addr);
#endif
/* assign output registers and emit moves if needed */
for(i = 0; i < nb_oargs; i++) {

142
tcg/tci.c
View File

@ -18,45 +18,26 @@
*/
#include "qemu/osdep.h"
#include "qemu-common.h"
#include "tcg/tcg.h" /* MAX_OPC_PARAM_IARGS */
#include "exec/cpu_ldst.h"
#include "tcg/tcg-op.h"
#include "qemu/compiler.h"
#include <ffi.h>
/* Enable TCI assertions only when debugging TCG (and without NDEBUG defined).
* Without assertions, the interpreter runs much faster. */
/*
* Enable TCI assertions only when debugging TCG (and without NDEBUG defined).
* Without assertions, the interpreter runs much faster.
*/
#if defined(CONFIG_DEBUG_TCG)
# define tci_assert(cond) assert(cond)
#else
# define tci_assert(cond) ((void)(cond))
#endif
#include "qemu-common.h"
#include "tcg/tcg.h" /* MAX_OPC_PARAM_IARGS */
#include "exec/cpu_ldst.h"
#include "tcg/tcg-op.h"
#include "qemu/compiler.h"
#if MAX_OPC_PARAM_IARGS != 6
# error Fix needed, number of supported input arguments changed!
#endif
#if TCG_TARGET_REG_BITS == 32
typedef uint64_t (*helper_function)(tcg_target_ulong, tcg_target_ulong,
tcg_target_ulong, tcg_target_ulong,
tcg_target_ulong, tcg_target_ulong,
tcg_target_ulong, tcg_target_ulong,
tcg_target_ulong, tcg_target_ulong,
tcg_target_ulong, tcg_target_ulong);
#else
typedef uint64_t (*helper_function)(tcg_target_ulong, tcg_target_ulong,
tcg_target_ulong, tcg_target_ulong,
tcg_target_ulong, tcg_target_ulong);
#endif
__thread uintptr_t tci_tb_ptr;
static tcg_target_ulong tci_read_reg(const tcg_target_ulong *regs, TCGReg index)
{
tci_assert(index < TCG_TARGET_NB_REGS);
return regs[index];
}
static void
tci_write_reg(tcg_target_ulong *regs, TCGReg index, tcg_target_ulong value)
{
@ -133,6 +114,7 @@ static tcg_target_ulong tci_read_label(const uint8_t **tb_ptr)
* I = immediate (tcg_target_ulong)
* l = label or pointer
* m = immediate (TCGMemOpIdx)
* n = immediate (call return length)
* r = register
* s = signed ldst offset
*/
@ -153,6 +135,18 @@ static void tci_args_l(const uint8_t **tb_ptr, void **l0)
check_size(start, tb_ptr);
}
static void tci_args_nll(const uint8_t **tb_ptr, uint8_t *n0,
void **l1, void **l2)
{
const uint8_t *start = *tb_ptr;
*n0 = tci_read_b(tb_ptr);
*l1 = (void *)tci_read_label(tb_ptr);
*l2 = (void *)tci_read_label(tb_ptr);
check_size(start, tb_ptr);
}
static void tci_args_rr(const uint8_t **tb_ptr,
TCGReg *r0, TCGReg *r1)
{
@ -487,11 +481,14 @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
{
const uint8_t *tb_ptr = v_tb_ptr;
tcg_target_ulong regs[TCG_TARGET_NB_REGS];
long tcg_temps[CPU_TEMP_BUF_NLONGS];
uintptr_t sp_value = (uintptr_t)(tcg_temps + CPU_TEMP_BUF_NLONGS);
uint64_t stack[(TCG_STATIC_CALL_ARGS_SIZE + TCG_STATIC_FRAME_SIZE)
/ sizeof(uint64_t)];
void *call_slots[TCG_STATIC_CALL_ARGS_SIZE / sizeof(uint64_t)];
regs[TCG_AREG0] = (tcg_target_ulong)env;
regs[TCG_REG_CALL_STACK] = sp_value;
regs[TCG_REG_CALL_STACK] = (uintptr_t)stack;
/* Other call_slots entries initialized at first use (see below). */
call_slots[0] = NULL;
tci_assert(tb_ptr);
for (;;) {
@ -509,40 +506,58 @@ uintptr_t QEMU_DISABLE_CFI tcg_qemu_tb_exec(CPUArchState *env,
#endif
TCGMemOpIdx oi;
int32_t ofs;
void *ptr;
void *ptr, *cif;
/* Skip opcode and size entry. */
tb_ptr += 2;
switch (opc) {
case INDEX_op_call:
tci_args_l(&tb_ptr, &ptr);
/*
* Set up the ffi_avalue array once, delayed until now
* because many TB's do not make any calls. In tcg_gen_callN,
* we arranged for every real argument to be "left-aligned"
* in each 64-bit slot.
*/
if (unlikely(call_slots[0] == NULL)) {
for (int i = 0; i < ARRAY_SIZE(call_slots); ++i) {
call_slots[i] = &stack[i];
}
}
tci_args_nll(&tb_ptr, &len, &ptr, &cif);
/* Helper functions may need to access the "return address" */
tci_tb_ptr = (uintptr_t)tb_ptr;
#if TCG_TARGET_REG_BITS == 32
tmp64 = ((helper_function)ptr)(tci_read_reg(regs, TCG_REG_R0),
tci_read_reg(regs, TCG_REG_R1),
tci_read_reg(regs, TCG_REG_R2),
tci_read_reg(regs, TCG_REG_R3),
tci_read_reg(regs, TCG_REG_R4),
tci_read_reg(regs, TCG_REG_R5),
tci_read_reg(regs, TCG_REG_R6),
tci_read_reg(regs, TCG_REG_R7),
tci_read_reg(regs, TCG_REG_R8),
tci_read_reg(regs, TCG_REG_R9),
tci_read_reg(regs, TCG_REG_R10),
tci_read_reg(regs, TCG_REG_R11));
tci_write_reg(regs, TCG_REG_R0, tmp64);
tci_write_reg(regs, TCG_REG_R1, tmp64 >> 32);
#else
tmp64 = ((helper_function)ptr)(tci_read_reg(regs, TCG_REG_R0),
tci_read_reg(regs, TCG_REG_R1),
tci_read_reg(regs, TCG_REG_R2),
tci_read_reg(regs, TCG_REG_R3),
tci_read_reg(regs, TCG_REG_R4),
tci_read_reg(regs, TCG_REG_R5));
tci_write_reg(regs, TCG_REG_R0, tmp64);
#endif
ffi_call(cif, ptr, stack, call_slots);
/* Any result winds up "left-aligned" in the stack[0] slot. */
switch (len) {
case 0: /* void */
break;
case 1: /* uint32_t */
/*
* Note that libffi has an odd special case in that it will
* always widen an integral result to ffi_arg.
*/
if (sizeof(ffi_arg) == 4) {
regs[TCG_REG_R0] = *(uint32_t *)stack;
break;
}
/* fall through */
case 2: /* uint64_t */
if (TCG_TARGET_REG_BITS == 32) {
tci_write_reg64(regs, TCG_REG_R1, TCG_REG_R0, stack[0]);
} else {
regs[TCG_REG_R0] = stack[0];
}
break;
default:
g_assert_not_reached();
}
break;
case INDEX_op_br:
tci_args_l(&tb_ptr, &ptr);
tb_ptr = ptr;
@ -1119,7 +1134,7 @@ int print_insn_tci(bfd_vma addr, disassemble_info *info)
TCGCond c;
TCGMemOpIdx oi;
uint8_t pos, len;
void *ptr;
void *ptr, *cif;
const uint8_t *tb_ptr;
status = info->read_memory_func(addr, buf, 2, info);
@ -1147,13 +1162,18 @@ int print_insn_tci(bfd_vma addr, disassemble_info *info)
switch (op) {
case INDEX_op_br:
case INDEX_op_call:
case INDEX_op_exit_tb:
case INDEX_op_goto_tb:
tci_args_l(&tb_ptr, &ptr);
info->fprintf_func(info->stream, "%-12s %p", op_name, ptr);
break;
case INDEX_op_call:
tci_args_nll(&tb_ptr, &len, &ptr, &cif);
info->fprintf_func(info->stream, "%-12s %d, %p, %p",
op_name, len, ptr, cif);
break;
case INDEX_op_brcond_i32:
case INDEX_op_brcond_i64:
tci_args_rrcl(&tb_ptr, &r0, &r1, &c, &ptr);

View File

@ -192,23 +192,8 @@ static const int tcg_target_reg_alloc_order[] = {
# error Fix needed, number of supported input arguments changed!
#endif
static const int tcg_target_call_iarg_regs[] = {
TCG_REG_R0,
TCG_REG_R1,
TCG_REG_R2,
TCG_REG_R3,
TCG_REG_R4,
TCG_REG_R5,
#if TCG_TARGET_REG_BITS == 32
/* 32 bit hosts need 2 * MAX_OPC_PARAM_IARGS registers. */
TCG_REG_R6,
TCG_REG_R7,
TCG_REG_R8,
TCG_REG_R9,
TCG_REG_R10,
TCG_REG_R11,
#endif
};
/* No call arguments via registers. All will be stored on the "stack". */
static const int tcg_target_call_iarg_regs[] = { };
static const int tcg_target_call_oarg_regs[] = {
TCG_REG_R0,
@ -292,8 +277,9 @@ static void tci_out_label(TCGContext *s, TCGLabel *label)
static void stack_bounds_check(TCGReg base, target_long offset)
{
if (base == TCG_REG_CALL_STACK) {
tcg_debug_assert(offset < 0);
tcg_debug_assert(offset >= -(CPU_TEMP_BUF_NLONGS * sizeof(long)));
tcg_debug_assert(offset >= 0);
tcg_debug_assert(offset < (TCG_STATIC_CALL_ARGS_SIZE +
TCG_STATIC_FRAME_SIZE));
}
}
@ -593,11 +579,25 @@ static void tcg_out_movi(TCGContext *s, TCGType type,
}
}
static inline void tcg_out_call(TCGContext *s, const tcg_insn_unit *arg)
static void tcg_out_call(TCGContext *s, const tcg_insn_unit *func,
ffi_cif *cif)
{
uint8_t *old_code_ptr = s->code_ptr;
uint8_t which;
if (cif->rtype == &ffi_type_void) {
which = 0;
} else if (cif->rtype->size == 4) {
which = 1;
} else {
tcg_debug_assert(cif->rtype->size == 8);
which = 2;
}
tcg_out_op_t(s, INDEX_op_call);
tcg_out_i(s, (uintptr_t)arg);
tcg_out8(s, which);
tcg_out_i(s, (uintptr_t)func);
tcg_out_i(s, (uintptr_t)cif);
old_code_ptr[1] = s->code_ptr - old_code_ptr;
}
@ -822,11 +822,9 @@ static void tcg_target_init(TCGContext *s)
s->reserved_regs = 0;
tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
/* We use negative offsets from "sp" so that we can distinguish
stores that might pretend to be call arguments. */
tcg_set_frame(s, TCG_REG_CALL_STACK,
-CPU_TEMP_BUF_NLONGS * sizeof(long),
CPU_TEMP_BUF_NLONGS * sizeof(long));
/* The call arguments come first, followed by the temp storage. */
tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
TCG_STATIC_FRAME_SIZE);
}
/* Generate global QEMU prologue and epilogue code. */

View File

@ -162,7 +162,7 @@ typedef enum {
/* Used for function call generation. */
#define TCG_TARGET_CALL_STACK_OFFSET 0
#define TCG_TARGET_STACK_ALIGN 16
#define TCG_TARGET_STACK_ALIGN 8
#define HAVE_TCG_QEMU_TB_EXEC