indirect register lowering

-----BEGIN PGP SIGNATURE-----
 Version: GnuPG v1
 
 iQEcBAABAgAGBQJXpMAtAAoJEK0ScMxN0Cebh60H/17kh95KYPERqTVI6eu9sMep
 Nn0tKXl1j4jGr5/w+1UIoY3zVCEI+17GeXQY6+XMtAvHrbFsvlbM7QgUST82l3ww
 dm9cMRMYgqAdsuUZHNAHkTxXtwgKgQkw06nJuYLDCpY1Skjw/vNt3pKqy4GDD7OJ
 FTHhq360hvE/mf7aFQV4477Cg8QdzvNTqoJgCC1waDN1N5BBNraq+wIjtyJZ299R
 6jAxjPBeGEIyv4/g4CdxrNPDdsBahnewO4wynQTbH52Whui1sRic2eSNzdKDK0hy
 aDVN2TDG1YnfhKCKAF73Gvpyb2eHcXDSdYQgFaVjaZtJpBXH845CRKHpo2kFrVw=
 =+piJ
 -----END PGP SIGNATURE-----

Merge remote-tracking branch 'remotes/rth/tags/pull-tcg-20160805' into staging

indirect register lowering

# gpg: Signature made Fri 05 Aug 2016 17:34:53 BST
# gpg:                using RSA key 0xAD1270CC4DD0279B
# gpg: Good signature from "Richard Henderson <rth7680@gmail.com>"
# gpg:                 aka "Richard Henderson <rth@redhat.com>"
# gpg:                 aka "Richard Henderson <rth@twiddle.net>"
# Primary key fingerprint: 9CB1 8DDA F8E8 49AD 2AFC  16A4 AD12 70CC 4DD0 279B

* remotes/rth/tags/pull-tcg-20160805:
  tcg: Lower indirect registers in a separate pass
  tcg: Require liveness analysis
  tcg: Include liveness info in the dumps
  tcg: Compress dead_temps and mem_temps into a single array
  tcg: Fold life data into TCGOp
  tcg: Reorg TCGOp chaining
  tcg: Compress liveness data to 16 bits

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
This commit is contained in:
Peter Maydell 2016-08-08 10:39:18 +01:00
commit cf5198d580
7 changed files with 442 additions and 268 deletions

View File

@ -59,7 +59,7 @@ static void gen_tb_end(TranslationBlock *tb, int num_insns)
}
/* Terminate the linked list. */
tcg_ctx.gen_op_buf[tcg_ctx.gen_last_op_idx].next = -1;
tcg_ctx.gen_op_buf[tcg_ctx.gen_op_buf[0].prev].next = 0;
}
static inline void gen_io_start(void)

View File

@ -42,6 +42,7 @@ static inline bool qemu_log_separate(void)
#define CPU_LOG_TB_NOCHAIN (1 << 13)
#define CPU_LOG_PAGE (1 << 14)
#define LOG_TRACE (1 << 15)
#define CPU_LOG_TB_OP_IND (1 << 16)
/* Returns true if a bit is set in the current loglevel mask
*/
@ -54,7 +55,7 @@ static inline bool qemu_loglevel_mask(int mask)
/* main logging function
*/
void GCC_FMT_ATTR(1, 2) qemu_log(const char *fmt, ...);
int GCC_FMT_ATTR(1, 2) qemu_log(const char *fmt, ...);
/* vfprintf-like logging function
*/

View File

@ -82,37 +82,6 @@ static void init_temp_info(TCGArg temp)
}
}
/*
 * Claim the next free op slot, initialise it as an op of kind @opc with
 * @nargs parameter slots reserved, and link it into the op list directly
 * ahead of @old_op.  When @old_op was the first op of the list, the new op
 * becomes the list head.  Returns the new op; the caller fills in its
 * parameters.
 */
static TCGOp *insert_op_before(TCGContext *s, TCGOp *old_op,
                               TCGOpcode opc, int nargs)
{
    const int slot = s->gen_next_op_idx;
    const int parm_base = s->gen_next_parm_idx;
    const int succ = old_op - s->gen_op_buf;
    const int pred = old_op->prev;
    TCGOp *fresh;

    tcg_debug_assert(slot < OPC_BUF_SIZE);
    tcg_debug_assert(parm_base + nargs <= OPPARAM_BUF_SIZE);

    /* Every field that is not named here starts out as zero. */
    TCGOp init = {
        .opc = opc,
        .args = parm_base,
        .prev = pred,
        .next = succ
    };

    fresh = &s->gen_op_buf[slot];
    *fresh = init;

    /* Hook the new op in between the predecessor (if any) and old_op. */
    if (pred < 0) {
        s->gen_first_op_idx = slot;
    } else {
        s->gen_op_buf[pred].next = slot;
    }
    old_op->prev = slot;

    /* Consume the op slot and the parameter slots. */
    s->gen_next_parm_idx = parm_base + nargs;
    s->gen_next_op_idx = slot + 1;

    return fresh;
}
static int op_bits(TCGOpcode op)
{
const TCGOpDef *def = &tcg_op_defs[op];
@ -583,7 +552,7 @@ void tcg_optimize(TCGContext *s)
nb_globals = s->nb_globals;
reset_all_temps(nb_temps);
for (oi = s->gen_first_op_idx; oi >= 0; oi = oi_next) {
for (oi = s->gen_op_buf[0].next; oi != 0; oi = oi_next) {
tcg_target_ulong mask, partmask, affected;
int nb_oargs, nb_iargs, i;
TCGArg tmp;
@ -1120,7 +1089,7 @@ void tcg_optimize(TCGContext *s)
uint64_t a = ((uint64_t)ah << 32) | al;
uint64_t b = ((uint64_t)bh << 32) | bl;
TCGArg rl, rh;
TCGOp *op2 = insert_op_before(s, op, INDEX_op_movi_i32, 2);
TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_movi_i32, 2);
TCGArg *args2 = &s->gen_opparam_buf[op2->args];
if (opc == INDEX_op_add2_i32) {
@ -1146,7 +1115,7 @@ void tcg_optimize(TCGContext *s)
uint32_t b = temps[args[3]].val;
uint64_t r = (uint64_t)a * b;
TCGArg rl, rh;
TCGOp *op2 = insert_op_before(s, op, INDEX_op_movi_i32, 2);
TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_movi_i32, 2);
TCGArg *args2 = &s->gen_opparam_buf[op2->args];
rl = args[0];

View File

@ -52,7 +52,7 @@ static void tcg_emit_op(TCGContext *ctx, TCGOpcode opc, int args)
int pi = oi - 1;
tcg_debug_assert(oi < OPC_BUF_SIZE);
ctx->gen_last_op_idx = oi;
ctx->gen_op_buf[0].prev = oi;
ctx->gen_next_op_idx = ni;
ctx->gen_op_buf[oi] = (TCGOp){

590
tcg/tcg.c
View File

@ -23,7 +23,6 @@
*/
/* define it to use liveness analysis (better code) */
#define USE_LIVENESS_ANALYSIS
#define USE_TCG_OPTIMIZATIONS
#include "qemu/osdep.h"
@ -333,7 +332,7 @@ void tcg_context_init(TCGContext *s)
memset(s, 0, sizeof(*s));
s->nb_globals = 0;
/* Count total number of arguments and allocate the corresponding
space */
total_args = 0;
@ -438,9 +437,9 @@ void tcg_func_start(TCGContext *s)
s->goto_tb_issue_mask = 0;
#endif
s->gen_first_op_idx = 0;
s->gen_last_op_idx = -1;
s->gen_next_op_idx = 0;
s->gen_op_buf[0].next = 1;
s->gen_op_buf[0].prev = 0;
s->gen_next_op_idx = 1;
s->gen_next_parm_idx = 0;
s->be = tcg_malloc(sizeof(TCGBackendData));
@ -532,8 +531,12 @@ int tcg_global_mem_new_internal(TCGType type, TCGv_ptr base,
#endif
if (!base_ts->fixed_reg) {
indirect_reg = 1;
/* We do not support double-indirect registers. */
tcg_debug_assert(!base_ts->indirect_reg);
base_ts->indirect_base = 1;
s->nb_indirects += (TCG_TARGET_REG_BITS == 32 && type == TCG_TYPE_I64
? 2 : 1);
indirect_reg = 1;
}
if (TCG_TARGET_REG_BITS == 32 && type == TCG_TYPE_I64) {
@ -825,16 +828,16 @@ void tcg_gen_callN(TCGContext *s, void *func, TCGArg ret,
real_args++;
}
#endif
/* If stack grows up, then we will be placing successive
arguments at lower addresses, which means we need to
reverse the order compared to how we would normally
treat either big or little-endian. For those arguments
that will wind up in registers, this still works for
HPPA (the only current STACK_GROWSUP target) since the
argument registers are *also* allocated in decreasing
order. If another such target is added, this logic may
have to get more complicated to differentiate between
stack arguments and register arguments. */
/* If stack grows up, then we will be placing successive
arguments at lower addresses, which means we need to
reverse the order compared to how we would normally
treat either big or little-endian. For those arguments
that will wind up in registers, this still works for
HPPA (the only current STACK_GROWSUP target) since the
argument registers are *also* allocated in decreasing
order. If another such target is added, this logic may
have to get more complicated to differentiate between
stack arguments and register arguments. */
#if defined(HOST_WORDS_BIGENDIAN) != defined(TCG_TARGET_STACK_GROWSUP)
s->gen_opparam_buf[pi++] = args[i] + 1;
s->gen_opparam_buf[pi++] = args[i];
@ -869,7 +872,7 @@ void tcg_gen_callN(TCGContext *s, void *func, TCGArg ret,
/* Make sure the calli field didn't overflow. */
tcg_debug_assert(s->gen_op_buf[i].calli == real_args);
s->gen_last_op_idx = i;
s->gen_op_buf[0].prev = i;
s->gen_next_op_idx = i + 1;
s->gen_next_parm_idx = pi;
@ -1021,11 +1024,12 @@ void tcg_dump_ops(TCGContext *s)
TCGOp *op;
int oi;
for (oi = s->gen_first_op_idx; oi >= 0; oi = op->next) {
for (oi = s->gen_op_buf[0].next; oi != 0; oi = op->next) {
int i, k, nb_oargs, nb_iargs, nb_cargs;
const TCGOpDef *def;
const TCGArg *args;
TCGOpcode c;
int col = 0;
op = &s->gen_op_buf[oi];
c = op->opc;
@ -1033,7 +1037,7 @@ void tcg_dump_ops(TCGContext *s)
args = &s->gen_opparam_buf[op->args];
if (c == INDEX_op_insn_start) {
qemu_log("%s ----", oi != s->gen_first_op_idx ? "\n" : "");
col += qemu_log("%s ----", oi != s->gen_op_buf[0].next ? "\n" : "");
for (i = 0; i < TARGET_INSN_START_WORDS; ++i) {
target_ulong a;
@ -1042,7 +1046,7 @@ void tcg_dump_ops(TCGContext *s)
#else
a = args[i];
#endif
qemu_log(" " TARGET_FMT_lx, a);
col += qemu_log(" " TARGET_FMT_lx, a);
}
} else if (c == INDEX_op_call) {
/* variable number of arguments */
@ -1051,12 +1055,12 @@ void tcg_dump_ops(TCGContext *s)
nb_cargs = def->nb_cargs;
/* function name, flags, out args */
qemu_log(" %s %s,$0x%" TCG_PRIlx ",$%d", def->name,
tcg_find_helper(s, args[nb_oargs + nb_iargs]),
args[nb_oargs + nb_iargs + 1], nb_oargs);
col += qemu_log(" %s %s,$0x%" TCG_PRIlx ",$%d", def->name,
tcg_find_helper(s, args[nb_oargs + nb_iargs]),
args[nb_oargs + nb_iargs + 1], nb_oargs);
for (i = 0; i < nb_oargs; i++) {
qemu_log(",%s", tcg_get_arg_str_idx(s, buf, sizeof(buf),
args[i]));
col += qemu_log(",%s", tcg_get_arg_str_idx(s, buf, sizeof(buf),
args[i]));
}
for (i = 0; i < nb_iargs; i++) {
TCGArg arg = args[nb_oargs + i];
@ -1064,10 +1068,10 @@ void tcg_dump_ops(TCGContext *s)
if (arg != TCG_CALL_DUMMY_ARG) {
t = tcg_get_arg_str_idx(s, buf, sizeof(buf), arg);
}
qemu_log(",%s", t);
col += qemu_log(",%s", t);
}
} else {
qemu_log(" %s ", def->name);
col += qemu_log(" %s ", def->name);
nb_oargs = def->nb_oargs;
nb_iargs = def->nb_iargs;
@ -1076,17 +1080,17 @@ void tcg_dump_ops(TCGContext *s)
k = 0;
for (i = 0; i < nb_oargs; i++) {
if (k != 0) {
qemu_log(",");
col += qemu_log(",");
}
qemu_log("%s", tcg_get_arg_str_idx(s, buf, sizeof(buf),
args[k++]));
col += qemu_log("%s", tcg_get_arg_str_idx(s, buf, sizeof(buf),
args[k++]));
}
for (i = 0; i < nb_iargs; i++) {
if (k != 0) {
qemu_log(",");
col += qemu_log(",");
}
qemu_log("%s", tcg_get_arg_str_idx(s, buf, sizeof(buf),
args[k++]));
col += qemu_log("%s", tcg_get_arg_str_idx(s, buf, sizeof(buf),
args[k++]));
}
switch (c) {
case INDEX_op_brcond_i32:
@ -1098,9 +1102,9 @@ void tcg_dump_ops(TCGContext *s)
case INDEX_op_setcond_i64:
case INDEX_op_movcond_i64:
if (args[k] < ARRAY_SIZE(cond_name) && cond_name[args[k]]) {
qemu_log(",%s", cond_name[args[k++]]);
col += qemu_log(",%s", cond_name[args[k++]]);
} else {
qemu_log(",$0x%" TCG_PRIlx, args[k++]);
col += qemu_log(",$0x%" TCG_PRIlx, args[k++]);
}
i = 1;
break;
@ -1114,12 +1118,12 @@ void tcg_dump_ops(TCGContext *s)
unsigned ix = get_mmuidx(oi);
if (op & ~(MO_AMASK | MO_BSWAP | MO_SSIZE)) {
qemu_log(",$0x%x,%u", op, ix);
col += qemu_log(",$0x%x,%u", op, ix);
} else {
const char *s_al, *s_op;
s_al = alignment_name[(op & MO_AMASK) >> MO_ASHIFT];
s_op = ldst_name[op & (MO_BSWAP | MO_SSIZE)];
qemu_log(",%s%s,%u", s_al, s_op, ix);
col += qemu_log(",%s%s,%u", s_al, s_op, ix);
}
i = 1;
}
@ -1134,14 +1138,39 @@ void tcg_dump_ops(TCGContext *s)
case INDEX_op_brcond_i32:
case INDEX_op_brcond_i64:
case INDEX_op_brcond2_i32:
qemu_log("%s$L%d", k ? "," : "", arg_label(args[k])->id);
col += qemu_log("%s$L%d", k ? "," : "", arg_label(args[k])->id);
i++, k++;
break;
default:
break;
}
for (; i < nb_cargs; i++, k++) {
qemu_log("%s$0x%" TCG_PRIlx, k ? "," : "", args[k]);
col += qemu_log("%s$0x%" TCG_PRIlx, k ? "," : "", args[k]);
}
}
if (op->life) {
unsigned life = op->life;
for (; col < 48; ++col) {
putc(' ', qemu_logfile);
}
if (life & (SYNC_ARG * 3)) {
qemu_log(" sync:");
for (i = 0; i < 2; ++i) {
if (life & (SYNC_ARG << i)) {
qemu_log(" %d", i);
}
}
}
life /= DEAD_ARG;
if (life) {
qemu_log(" dead:");
for (i = 0; life; ++i, life >>= 1) {
if (life & 1) {
qemu_log(" %d", i);
}
}
}
}
qemu_log("\n");
@ -1298,71 +1327,116 @@ void tcg_op_remove(TCGContext *s, TCGOp *op)
int next = op->next;
int prev = op->prev;
if (next >= 0) {
s->gen_op_buf[next].prev = prev;
} else {
s->gen_last_op_idx = prev;
}
if (prev >= 0) {
s->gen_op_buf[prev].next = next;
} else {
s->gen_first_op_idx = next;
}
/* We should never attempt to remove the list terminator. */
tcg_debug_assert(op != &s->gen_op_buf[0]);
memset(op, -1, sizeof(*op));
s->gen_op_buf[next].prev = prev;
s->gen_op_buf[prev].next = next;
memset(op, 0, sizeof(*op));
#ifdef CONFIG_PROFILER
s->del_op_count++;
#endif
}
#ifdef USE_LIVENESS_ANALYSIS
/*
 * Claim the next free op slot, initialise it as an op of kind @opc with
 * @nargs parameter slots reserved, and link it into the op list directly
 * ahead of @old_op.  Returns the new op; the caller fills in its
 * parameters.
 */
TCGOp *tcg_op_insert_before(TCGContext *s, TCGOp *old_op,
                            TCGOpcode opc, int nargs)
{
    const int slot = s->gen_next_op_idx;
    const int parm_base = s->gen_next_parm_idx;
    const int succ = old_op - s->gen_op_buf;
    const int pred = old_op->prev;
    TCGOp *fresh;

    tcg_debug_assert(slot < OPC_BUF_SIZE);
    tcg_debug_assert(parm_base + nargs <= OPPARAM_BUF_SIZE);

    /* Every field that is not named here starts out as zero. */
    TCGOp init = {
        .opc = opc,
        .args = parm_base,
        .prev = pred,
        .next = succ
    };

    fresh = &s->gen_op_buf[slot];
    *fresh = init;

    /* Hook the new op in between the predecessor and old_op. */
    s->gen_op_buf[pred].next = slot;
    old_op->prev = slot;

    /* Consume the op slot and the parameter slots. */
    s->gen_next_parm_idx = parm_base + nargs;
    s->gen_next_op_idx = slot + 1;

    return fresh;
}
/*
 * Claim the next free op slot, initialise it as an op of kind @opc with
 * @nargs parameter slots reserved, and link it into the op list directly
 * behind @old_op.  Returns the new op; the caller fills in its parameters.
 */
TCGOp *tcg_op_insert_after(TCGContext *s, TCGOp *old_op,
                           TCGOpcode opc, int nargs)
{
    const int slot = s->gen_next_op_idx;
    const int parm_base = s->gen_next_parm_idx;
    const int pred = old_op - s->gen_op_buf;
    const int succ = old_op->next;
    TCGOp *fresh;

    tcg_debug_assert(slot < OPC_BUF_SIZE);
    tcg_debug_assert(parm_base + nargs <= OPPARAM_BUF_SIZE);

    /* Every field that is not named here starts out as zero. */
    TCGOp init = {
        .opc = opc,
        .args = parm_base,
        .prev = pred,
        .next = succ
    };

    fresh = &s->gen_op_buf[slot];
    *fresh = init;

    /* Hook the new op in between old_op and its former successor. */
    s->gen_op_buf[succ].prev = slot;
    old_op->next = slot;

    /* Consume the op slot and the parameter slots. */
    s->gen_next_parm_idx = parm_base + nargs;
    s->gen_next_op_idx = slot + 1;

    return fresh;
}
#define TS_DEAD 1
#define TS_MEM 2
#define IS_DEAD_ARG(n) (arg_life & (DEAD_ARG << (n)))
#define NEED_SYNC_ARG(n) (arg_life & (SYNC_ARG << (n)))
/* liveness analysis: end of function: all temps are dead, and globals
should be in memory. */
static inline void tcg_la_func_end(TCGContext *s, uint8_t *dead_temps,
uint8_t *mem_temps)
static inline void tcg_la_func_end(TCGContext *s, uint8_t *temp_state)
{
memset(dead_temps, 1, s->nb_temps);
memset(mem_temps, 1, s->nb_globals);
memset(mem_temps + s->nb_globals, 0, s->nb_temps - s->nb_globals);
memset(temp_state, TS_DEAD | TS_MEM, s->nb_globals);
memset(temp_state + s->nb_globals, TS_DEAD, s->nb_temps - s->nb_globals);
}
/* liveness analysis: end of basic block: all temps are dead, globals
and local temps should be in memory. */
static inline void tcg_la_bb_end(TCGContext *s, uint8_t *dead_temps,
uint8_t *mem_temps)
static inline void tcg_la_bb_end(TCGContext *s, uint8_t *temp_state)
{
int i;
int i, n;
memset(dead_temps, 1, s->nb_temps);
memset(mem_temps, 1, s->nb_globals);
for(i = s->nb_globals; i < s->nb_temps; i++) {
mem_temps[i] = s->temps[i].temp_local;
tcg_la_func_end(s, temp_state);
for (i = s->nb_globals, n = s->nb_temps; i < n; i++) {
if (s->temps[i].temp_local) {
temp_state[i] |= TS_MEM;
}
}
}
/* Liveness analysis : update the opc_dead_args array to tell if a
/* Liveness analysis : update the opc_arg_life array to tell if a
given input arguments is dead. Instructions updating dead
temporaries are removed. */
static void tcg_liveness_analysis(TCGContext *s)
static void liveness_pass_1(TCGContext *s, uint8_t *temp_state)
{
uint8_t *dead_temps, *mem_temps;
int oi, oi_prev, nb_ops;
int nb_globals = s->nb_globals;
int oi, oi_prev;
nb_ops = s->gen_next_op_idx;
s->op_dead_args = tcg_malloc(nb_ops * sizeof(uint16_t));
s->op_sync_args = tcg_malloc(nb_ops * sizeof(uint8_t));
dead_temps = tcg_malloc(s->nb_temps);
mem_temps = tcg_malloc(s->nb_temps);
tcg_la_func_end(s, dead_temps, mem_temps);
tcg_la_func_end(s, temp_state);
for (oi = s->gen_last_op_idx; oi >= 0; oi = oi_prev) {
for (oi = s->gen_op_buf[0].prev; oi != 0; oi = oi_prev) {
int i, nb_iargs, nb_oargs;
TCGOpcode opc_new, opc_new2;
bool have_opc_new2;
uint16_t dead_args;
uint8_t sync_args;
TCGLifeData arg_life = 0;
TCGArg arg;
TCGOp * const op = &s->gen_op_buf[oi];
@ -1385,7 +1459,7 @@ static void tcg_liveness_analysis(TCGContext *s)
if (call_flags & TCG_CALL_NO_SIDE_EFFECTS) {
for (i = 0; i < nb_oargs; i++) {
arg = args[i];
if (!dead_temps[arg] || mem_temps[arg]) {
if (temp_state[arg] != TS_DEAD) {
goto do_not_remove_call;
}
}
@ -1394,46 +1468,44 @@ static void tcg_liveness_analysis(TCGContext *s)
do_not_remove_call:
/* output args are dead */
dead_args = 0;
sync_args = 0;
for (i = 0; i < nb_oargs; i++) {
arg = args[i];
if (dead_temps[arg]) {
dead_args |= (1 << i);
if (temp_state[arg] & TS_DEAD) {
arg_life |= DEAD_ARG << i;
}
if (mem_temps[arg]) {
sync_args |= (1 << i);
if (temp_state[arg] & TS_MEM) {
arg_life |= SYNC_ARG << i;
}
dead_temps[arg] = 1;
mem_temps[arg] = 0;
temp_state[arg] = TS_DEAD;
}
if (!(call_flags & TCG_CALL_NO_READ_GLOBALS)) {
/* globals should be synced to memory */
memset(mem_temps, 1, s->nb_globals);
}
if (!(call_flags & (TCG_CALL_NO_WRITE_GLOBALS |
TCG_CALL_NO_READ_GLOBALS))) {
/* globals should go back to memory */
memset(dead_temps, 1, s->nb_globals);
memset(temp_state, TS_DEAD | TS_MEM, nb_globals);
} else if (!(call_flags & TCG_CALL_NO_READ_GLOBALS)) {
/* globals should be synced to memory */
for (i = 0; i < nb_globals; i++) {
temp_state[i] |= TS_MEM;
}
}
/* record arguments that die in this helper */
for (i = nb_oargs; i < nb_iargs + nb_oargs; i++) {
arg = args[i];
if (arg != TCG_CALL_DUMMY_ARG) {
if (dead_temps[arg]) {
dead_args |= (1 << i);
if (temp_state[arg] & TS_DEAD) {
arg_life |= DEAD_ARG << i;
}
}
}
/* input arguments are live for preceding opcodes */
for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
for (i = nb_oargs; i < nb_iargs + nb_oargs; i++) {
arg = args[i];
dead_temps[arg] = 0;
if (arg != TCG_CALL_DUMMY_ARG) {
temp_state[arg] &= ~TS_DEAD;
}
}
s->op_dead_args[oi] = dead_args;
s->op_sync_args[oi] = sync_args;
}
}
break;
@ -1441,8 +1513,7 @@ static void tcg_liveness_analysis(TCGContext *s)
break;
case INDEX_op_discard:
/* mark the temporary as dead */
dead_temps[args[0]] = 1;
mem_temps[args[0]] = 0;
temp_state[args[0]] = TS_DEAD;
break;
case INDEX_op_add2_i32:
@ -1463,8 +1534,8 @@ static void tcg_liveness_analysis(TCGContext *s)
the low part. The result can be optimized to a simple
add or sub. This happens often for x86_64 guest when the
cpu mode is set to 32 bit. */
if (dead_temps[args[1]] && !mem_temps[args[1]]) {
if (dead_temps[args[0]] && !mem_temps[args[0]]) {
if (temp_state[args[1]] == TS_DEAD) {
if (temp_state[args[0]] == TS_DEAD) {
goto do_remove;
}
/* Replace the opcode and adjust the args in place,
@ -1501,8 +1572,8 @@ static void tcg_liveness_analysis(TCGContext *s)
do_mul2:
nb_iargs = 2;
nb_oargs = 2;
if (dead_temps[args[1]] && !mem_temps[args[1]]) {
if (dead_temps[args[0]] && !mem_temps[args[0]]) {
if (temp_state[args[1]] == TS_DEAD) {
if (temp_state[args[0]] == TS_DEAD) {
/* Both parts of the operation are dead. */
goto do_remove;
}
@ -1510,8 +1581,7 @@ static void tcg_liveness_analysis(TCGContext *s)
op->opc = opc = opc_new;
args[1] = args[2];
args[2] = args[3];
} else if (have_opc_new2 && dead_temps[args[0]]
&& !mem_temps[args[0]]) {
} else if (temp_state[args[0]] == TS_DEAD && have_opc_new2) {
/* The low part of the operation is dead; generate the high. */
op->opc = opc = opc_new2;
args[0] = args[1];
@ -1534,8 +1604,7 @@ static void tcg_liveness_analysis(TCGContext *s)
implies side effects */
if (!(def->flags & TCG_OPF_SIDE_EFFECTS) && nb_oargs != 0) {
for (i = 0; i < nb_oargs; i++) {
arg = args[i];
if (!dead_temps[arg] || mem_temps[arg]) {
if (temp_state[args[i]] != TS_DEAD) {
goto do_not_remove;
}
}
@ -1544,59 +1613,203 @@ static void tcg_liveness_analysis(TCGContext *s)
} else {
do_not_remove:
/* output args are dead */
dead_args = 0;
sync_args = 0;
for (i = 0; i < nb_oargs; i++) {
arg = args[i];
if (dead_temps[arg]) {
dead_args |= (1 << i);
if (temp_state[arg] & TS_DEAD) {
arg_life |= DEAD_ARG << i;
}
if (mem_temps[arg]) {
sync_args |= (1 << i);
if (temp_state[arg] & TS_MEM) {
arg_life |= SYNC_ARG << i;
}
dead_temps[arg] = 1;
mem_temps[arg] = 0;
temp_state[arg] = TS_DEAD;
}
/* if end of basic block, update */
if (def->flags & TCG_OPF_BB_END) {
tcg_la_bb_end(s, dead_temps, mem_temps);
tcg_la_bb_end(s, temp_state);
} else if (def->flags & TCG_OPF_SIDE_EFFECTS) {
/* globals should be synced to memory */
memset(mem_temps, 1, s->nb_globals);
for (i = 0; i < nb_globals; i++) {
temp_state[i] |= TS_MEM;
}
}
/* record arguments that die in this opcode */
for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
arg = args[i];
if (dead_temps[arg]) {
dead_args |= (1 << i);
if (temp_state[arg] & TS_DEAD) {
arg_life |= DEAD_ARG << i;
}
}
/* input arguments are live for preceding opcodes */
for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
arg = args[i];
dead_temps[arg] = 0;
temp_state[args[i]] &= ~TS_DEAD;
}
s->op_dead_args[oi] = dead_args;
s->op_sync_args[oi] = sync_args;
}
break;
}
op->life = arg_life;
}
}
#else
/* dummy liveness analysis */
static void tcg_liveness_analysis(TCGContext *s)
{
int nb_ops = s->gen_next_op_idx;
s->op_dead_args = tcg_malloc(nb_ops * sizeof(uint16_t));
memset(s->op_dead_args, 0, nb_ops * sizeof(uint16_t));
s->op_sync_args = tcg_malloc(nb_ops * sizeof(uint8_t));
memset(s->op_sync_args, 0, nb_ops * sizeof(uint8_t));
/* Liveness analysis: Convert indirect regs to direct temporaries.

   Each global flagged indirect_reg gets a freshly allocated temporary of
   the same type.  The op list is then walked front to back: references
   to such a global are rewritten to its temporary, a load from the
   global's memory slot is inserted before an op that reads it while the
   value is not available, and a store is inserted after an op whose
   life data requests a sync of that output.

   temp_state[] is indexed by global and tracks the direct temporary:
   TS_DEAD = not loaded, TS_MEM = loaded and matching memory,
   0 = loaded and modified.  The life data consulted here is whatever is
   already stored in op->life.

   Returns true if any op argument was rewritten.  */
static bool liveness_pass_2(TCGContext *s, uint8_t *temp_state)
{
int nb_globals = s->nb_globals;
int16_t *dir_temps;
int i, oi, oi_next;
bool changes = false;
/* dir_temps[i] is the index of the temporary standing in for global i.
   The array starts zeroed and 0 is used as the "no direct temp" marker. */
dir_temps = tcg_malloc(nb_globals * sizeof(int16_t));
memset(dir_temps, 0, nb_globals * sizeof(int16_t));
/* Create a temporary for each indirect global. */
for (i = 0; i < nb_globals; ++i) {
TCGTemp *its = &s->temps[i];
if (its->indirect_reg) {
TCGTemp *dts = tcg_temp_alloc(s);
dts->type = its->type;
dts->base_type = its->base_type;
dir_temps[i] = temp_idx(s, dts);
}
}
/* At the start nothing has been loaded into any direct temporary. */
memset(temp_state, TS_DEAD, nb_globals);
/* Walk the op list forward; index 0 is the list terminator. */
for (oi = s->gen_op_buf[0].next; oi != 0; oi = oi_next) {
TCGOp *op = &s->gen_op_buf[oi];
TCGArg *args = &s->gen_opparam_buf[op->args];
TCGOpcode opc = op->opc;
const TCGOpDef *def = &tcg_op_defs[opc];
/* IS_DEAD_ARG() and NEED_SYNC_ARG() below test bits of arg_life. */
TCGLifeData arg_life = op->life;
int nb_iargs, nb_oargs, call_flags;
TCGArg arg, dir;
/* Fetch the successor before any insertion around this op, so that a
   store inserted after this op is not visited by this loop. */
oi_next = op->next;
if (opc == INDEX_op_call) {
nb_oargs = op->callo;
nb_iargs = op->calli;
call_flags = args[nb_oargs + nb_iargs + 1];
} else {
nb_iargs = def->nb_iargs;
nb_oargs = def->nb_oargs;
/* Set flags similar to how calls require. */
if (def->flags & TCG_OPF_BB_END) {
/* Like writing globals: save_globals */
call_flags = 0;
} else if (def->flags & TCG_OPF_SIDE_EFFECTS) {
/* Like reading globals: sync_globals */
call_flags = TCG_CALL_NO_WRITE_GLOBALS;
} else {
/* No effect on globals. */
call_flags = (TCG_CALL_NO_READ_GLOBALS |
TCG_CALL_NO_WRITE_GLOBALS);
}
}
/* Make sure that input arguments are available. */
for (i = nb_oargs; i < nb_iargs + nb_oargs; i++) {
arg = args[i];
/* Note this unsigned test catches TCG_CALL_DUMMY_ARG too. */
if (arg < nb_globals) {
dir = dir_temps[arg];
if (dir != 0 && temp_state[arg] == TS_DEAD) {
/* Emit "dir = load(mem_base + mem_offset)" ahead of this op,
   using the 32- or 64-bit load that matches the global's type. */
TCGTemp *its = &s->temps[arg];
TCGOpcode lopc = (its->type == TCG_TYPE_I32
? INDEX_op_ld_i32
: INDEX_op_ld_i64);
TCGOp *lop = tcg_op_insert_before(s, op, lopc, 3);
TCGArg *largs = &s->gen_opparam_buf[lop->args];
largs[0] = dir;
largs[1] = temp_idx(s, its->mem_base);
largs[2] = its->mem_offset;
/* Loaded, but synced with memory. */
temp_state[arg] = TS_MEM;
}
}
}
/* Perform input replacement, and mark inputs that became dead.
No action is required except keeping temp_state up to date
so that we reload when needed. */
for (i = nb_oargs; i < nb_iargs + nb_oargs; i++) {
arg = args[i];
if (arg < nb_globals) {
dir = dir_temps[arg];
if (dir != 0) {
args[i] = dir;
changes = true;
if (IS_DEAD_ARG(i)) {
temp_state[arg] = TS_DEAD;
}
}
}
}
/* Liveness analysis should ensure that the following are
all correct, for call sites and basic block end points. */
if (call_flags & TCG_CALL_NO_READ_GLOBALS) {
/* Nothing to do */
} else if (call_flags & TCG_CALL_NO_WRITE_GLOBALS) {
for (i = 0; i < nb_globals; ++i) {
/* Liveness should see that globals are synced back,
that is, either TS_DEAD or TS_MEM. */
tcg_debug_assert(dir_temps[i] == 0
|| temp_state[i] != 0);
}
} else {
for (i = 0; i < nb_globals; ++i) {
/* Liveness should see that globals are saved back,
that is, TS_DEAD, waiting to be reloaded. */
tcg_debug_assert(dir_temps[i] == 0
|| temp_state[i] == TS_DEAD);
}
}
/* Outputs become available. */
for (i = 0; i < nb_oargs; i++) {
arg = args[i];
/* Only globals can have a direct temporary. */
if (arg >= nb_globals) {
continue;
}
dir = dir_temps[arg];
if (dir == 0) {
continue;
}
args[i] = dir;
changes = true;
/* The output is now live and modified. */
temp_state[arg] = 0;
/* Sync outputs upon their last write. */
if (NEED_SYNC_ARG(i)) {
/* Emit "store(dir, mem_base + mem_offset)" right after this op,
   using the 32- or 64-bit store that matches the global's type. */
TCGTemp *its = &s->temps[arg];
TCGOpcode sopc = (its->type == TCG_TYPE_I32
? INDEX_op_st_i32
: INDEX_op_st_i64);
TCGOp *sop = tcg_op_insert_after(s, op, sopc, 3);
TCGArg *sargs = &s->gen_opparam_buf[sop->args];
sargs[0] = dir;
sargs[1] = temp_idx(s, its->mem_base);
sargs[2] = its->mem_offset;
temp_state[arg] = TS_MEM;
}
/* Drop outputs that are dead. */
if (IS_DEAD_ARG(i)) {
temp_state[arg] = TS_DEAD;
}
}
}
return changes;
}
#endif
#ifdef CONFIG_DEBUG_TCG
static void dump_regs(TCGContext *s)
@ -1728,14 +1941,6 @@ static void temp_sync(TCGContext *s, TCGTemp *ts,
if (!ts->mem_allocated) {
temp_allocate_frame(s, temp_idx(s, ts));
}
if (ts->indirect_reg) {
if (ts->val_type == TEMP_VAL_REG) {
tcg_regset_set_reg(allocated_regs, ts->reg);
}
temp_load(s, ts->mem_base,
tcg_target_available_regs[TCG_TYPE_PTR],
allocated_regs);
}
switch (ts->val_type) {
case TEMP_VAL_CONST:
/* If we're going to free the temp immediately, then we won't
@ -1826,12 +2031,6 @@ static void temp_load(TCGContext *s, TCGTemp *ts, TCGRegSet desired_regs,
break;
case TEMP_VAL_MEM:
reg = tcg_reg_alloc(s, desired_regs, allocated_regs, ts->indirect_base);
if (ts->indirect_reg) {
tcg_regset_set_reg(allocated_regs, reg);
temp_load(s, ts->mem_base,
tcg_target_available_regs[TCG_TYPE_PTR],
allocated_regs);
}
tcg_out_ld(s, ts->type, reg, ts->mem_base->reg, ts->mem_offset);
ts->mem_coherent = 1;
break;
@ -1848,16 +2047,9 @@ static void temp_load(TCGContext *s, TCGTemp *ts, TCGRegSet desired_regs,
temporary registers needs to be allocated to store a constant. */
static void temp_save(TCGContext *s, TCGTemp *ts, TCGRegSet allocated_regs)
{
#ifdef USE_LIVENESS_ANALYSIS
/* ??? Liveness does not yet incorporate indirect bases. */
if (!ts->indirect_base) {
/* The liveness analysis already ensures that globals are back
in memory. Keep an tcg_debug_assert for safety. */
tcg_debug_assert(ts->val_type == TEMP_VAL_MEM || ts->fixed_reg);
return;
}
#endif
temp_sync(s, ts, allocated_regs, 1);
/* The liveness analysis already ensures that globals are back
in memory. Keep an tcg_debug_assert for safety. */
tcg_debug_assert(ts->val_type == TEMP_VAL_MEM || ts->fixed_reg);
}
/* save globals to their canonical location and assume they can be
@ -1881,16 +2073,9 @@ static void sync_globals(TCGContext *s, TCGRegSet allocated_regs)
for (i = 0; i < s->nb_globals; i++) {
TCGTemp *ts = &s->temps[i];
#ifdef USE_LIVENESS_ANALYSIS
/* ??? Liveness does not yet incorporate indirect bases. */
if (!ts->indirect_base) {
tcg_debug_assert(ts->val_type != TEMP_VAL_REG
|| ts->fixed_reg
|| ts->mem_coherent);
continue;
}
#endif
temp_sync(s, ts, allocated_regs, 0);
tcg_debug_assert(ts->val_type != TEMP_VAL_REG
|| ts->fixed_reg
|| ts->mem_coherent);
}
}
@ -1905,27 +2090,17 @@ static void tcg_reg_alloc_bb_end(TCGContext *s, TCGRegSet allocated_regs)
if (ts->temp_local) {
temp_save(s, ts, allocated_regs);
} else {
#ifdef USE_LIVENESS_ANALYSIS
/* ??? Liveness does not yet incorporate indirect bases. */
if (!ts->indirect_base) {
/* The liveness analysis already ensures that temps are dead.
Keep an tcg_debug_assert for safety. */
tcg_debug_assert(ts->val_type == TEMP_VAL_DEAD);
continue;
}
#endif
temp_dead(s, ts);
/* The liveness analysis already ensures that temps are dead.
Keep an tcg_debug_assert for safety. */
tcg_debug_assert(ts->val_type == TEMP_VAL_DEAD);
}
}
save_globals(s, allocated_regs);
}
#define IS_DEAD_ARG(n) ((dead_args >> (n)) & 1)
#define NEED_SYNC_ARG(n) ((sync_args >> (n)) & 1)
static void tcg_reg_alloc_movi(TCGContext *s, const TCGArg *args,
uint16_t dead_args, uint8_t sync_args)
TCGLifeData arg_life)
{
TCGTemp *ots;
tcg_target_ulong val;
@ -1954,8 +2129,7 @@ static void tcg_reg_alloc_movi(TCGContext *s, const TCGArg *args,
}
static void tcg_reg_alloc_mov(TCGContext *s, const TCGOpDef *def,
const TCGArg *args, uint16_t dead_args,
uint8_t sync_args)
const TCGArg *args, TCGLifeData arg_life)
{
TCGRegSet allocated_regs;
TCGTemp *ts, *ots;
@ -1987,12 +2161,6 @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOpDef *def,
if (!ots->mem_allocated) {
temp_allocate_frame(s, args[0]);
}
if (ots->indirect_reg) {
tcg_regset_set_reg(allocated_regs, ts->reg);
temp_load(s, ots->mem_base,
tcg_target_available_regs[TCG_TYPE_PTR],
allocated_regs);
}
tcg_out_st(s, otype, ts->reg, ots->mem_base->reg, ots->mem_offset);
if (IS_DEAD_ARG(1)) {
temp_dead(s, ts);
@ -2040,8 +2208,7 @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOpDef *def,
static void tcg_reg_alloc_op(TCGContext *s,
const TCGOpDef *def, TCGOpcode opc,
const TCGArg *args, uint16_t dead_args,
uint8_t sync_args)
const TCGArg *args, TCGLifeData arg_life)
{
TCGRegSet allocated_regs;
int i, k, nb_iargs, nb_oargs;
@ -2206,8 +2373,7 @@ static void tcg_reg_alloc_op(TCGContext *s,
#endif
static void tcg_reg_alloc_call(TCGContext *s, int nb_oargs, int nb_iargs,
const TCGArg * const args, uint16_t dead_args,
uint8_t sync_args)
const TCGArg * const args, TCGLifeData arg_life)
{
int flags, nb_regs, i;
TCGReg reg;
@ -2363,7 +2529,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
{
int n;
n = s->gen_last_op_idx + 1;
n = s->gen_op_buf[0].prev + 1;
s->op_count += n;
if (n > s->op_count_max) {
s->op_count_max = n;
@ -2399,7 +2565,27 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
s->la_time -= profile_getclock();
#endif
tcg_liveness_analysis(s);
{
uint8_t *temp_state = tcg_malloc(s->nb_temps + s->nb_indirects);
liveness_pass_1(s, temp_state);
if (s->nb_indirects > 0) {
#ifdef DEBUG_DISAS
if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP_IND)
&& qemu_log_in_addr_range(tb->pc))) {
qemu_log("OP before indirect lowering:\n");
tcg_dump_ops(s);
qemu_log("\n");
}
#endif
/* Replace indirect temps with direct temps. */
if (liveness_pass_2(s, temp_state)) {
/* If changes were made, re-run liveness. */
liveness_pass_1(s, temp_state);
}
}
}
#ifdef CONFIG_PROFILER
s->la_time += profile_getclock();
@ -2422,13 +2608,12 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
tcg_out_tb_init(s);
num_insns = -1;
for (oi = s->gen_first_op_idx; oi >= 0; oi = oi_next) {
for (oi = s->gen_op_buf[0].next; oi != 0; oi = oi_next) {
TCGOp * const op = &s->gen_op_buf[oi];
TCGArg * const args = &s->gen_opparam_buf[op->args];
TCGOpcode opc = op->opc;
const TCGOpDef *def = &tcg_op_defs[opc];
uint16_t dead_args = s->op_dead_args[oi];
uint8_t sync_args = s->op_sync_args[oi];
TCGLifeData arg_life = op->life;
oi_next = op->next;
#ifdef CONFIG_PROFILER
@ -2438,11 +2623,11 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
switch (opc) {
case INDEX_op_mov_i32:
case INDEX_op_mov_i64:
tcg_reg_alloc_mov(s, def, args, dead_args, sync_args);
tcg_reg_alloc_mov(s, def, args, arg_life);
break;
case INDEX_op_movi_i32:
case INDEX_op_movi_i64:
tcg_reg_alloc_movi(s, args, dead_args, sync_args);
tcg_reg_alloc_movi(s, args, arg_life);
break;
case INDEX_op_insn_start:
if (num_insns >= 0) {
@ -2467,8 +2652,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
tcg_out_label(s, arg_label(args[0]), s->code_ptr);
break;
case INDEX_op_call:
tcg_reg_alloc_call(s, op->callo, op->calli, args,
dead_args, sync_args);
tcg_reg_alloc_call(s, op->callo, op->calli, args, arg_life);
break;
default:
/* Sanity check that we've not introduced any unhandled opcodes. */
@ -2478,7 +2662,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
/* Note: in order to speed up the code, it would be much
faster to have specialized register allocator functions for
some common argument patterns */
tcg_reg_alloc_op(s, def, opc, args, dead_args, sync_args);
tcg_reg_alloc_op(s, def, opc, args, arg_life);
break;
}
#ifdef CONFIG_DEBUG_TCG

View File

@ -575,24 +575,41 @@ typedef struct TCGTempSet {
unsigned long l[BITS_TO_LONGS(TCG_MAX_TEMPS)];
} TCGTempSet;
/* While we limit helpers to 6 arguments, for 32-bit hosts, with padding,
this implies a max of 6*2 (64-bit in) + 2 (64-bit out) = 14 operands.
There are never more than 2 outputs, which means that we can store all
dead + sync data within 16 bits. */
#define DEAD_ARG 4
#define SYNC_ARG 1
typedef uint16_t TCGLifeData;
/* The layout here is designed to avoid crossing of a 32-bit boundary.
If we do so, gcc adds padding, expanding the size to 12. */
typedef struct TCGOp {
TCGOpcode opc : 8;
TCGOpcode opc : 8; /* 8 */
/* Index of the prev/next op, or 0 for the end of the list. */
unsigned prev : 10; /* 18 */
unsigned next : 10; /* 28 */
/* The number of out and in parameter for a call. */
unsigned callo : 2;
unsigned calli : 6;
unsigned calli : 4; /* 32 */
unsigned callo : 2; /* 34 */
/* Index of the arguments for this op, or -1 for zero-operand ops. */
signed args : 16;
/* Index of the arguments for this op, or 0 for zero-operand ops. */
unsigned args : 14; /* 48 */
/* Index of the prex/next op, or -1 for the end of the list. */
signed prev : 16;
signed next : 16;
/* Lifetime data of the operands. */
unsigned life : 16; /* 64 */
} TCGOp;
QEMU_BUILD_BUG_ON(NB_OPS > 0xff);
QEMU_BUILD_BUG_ON(OPC_BUF_SIZE >= 0x7fff);
QEMU_BUILD_BUG_ON(OPPARAM_BUF_SIZE >= 0x7fff);
/* Make sure operands fit in the bitfields above. */
QEMU_BUILD_BUG_ON(NB_OPS > (1 << 8));
QEMU_BUILD_BUG_ON(OPC_BUF_SIZE > (1 << 10));
QEMU_BUILD_BUG_ON(OPPARAM_BUF_SIZE > (1 << 14));
/* Make sure that we don't overflow 64 bits without noticing. */
QEMU_BUILD_BUG_ON(sizeof(TCGOp) > 8);
struct TCGContext {
uint8_t *pool_cur, *pool_end;
@ -600,6 +617,7 @@ struct TCGContext {
int nb_labels;
int nb_globals;
int nb_temps;
int nb_indirects;
/* goto_tb support */
tcg_insn_unit *code_buf;
@ -607,13 +625,6 @@ struct TCGContext {
uint16_t *tb_jmp_insn_offset; /* tb->jmp_insn_offset if USE_DIRECT_JUMP */
uintptr_t *tb_jmp_target_addr; /* tb->jmp_target_addr if !USE_DIRECT_JUMP */
/* liveness analysis */
uint16_t *op_dead_args; /* for each operation, each bit tells if the
corresponding argument is dead */
uint8_t *op_sync_args; /* for each operation, each bit tells if the
corresponding output argument needs to be
sync to memory. */
TCGRegSet reserved_regs;
intptr_t current_frame_offset;
intptr_t frame_start;
@ -649,8 +660,6 @@ struct TCGContext {
int goto_tb_issue_mask;
#endif
int gen_first_op_idx;
int gen_last_op_idx;
int gen_next_op_idx;
int gen_next_parm_idx;
@ -890,6 +899,9 @@ void tcg_gen_callN(TCGContext *s, void *func,
TCGArg ret, int nargs, TCGArg *args);
void tcg_op_remove(TCGContext *s, TCGOp *op);
TCGOp *tcg_op_insert_before(TCGContext *s, TCGOp *op, TCGOpcode opc, int narg);
TCGOp *tcg_op_insert_after(TCGContext *s, TCGOp *op, TCGOpcode opc, int narg);
void tcg_optimize(TCGContext *s);
/* only used for debugging purposes */

View File

@ -32,15 +32,22 @@ int qemu_loglevel;
static int log_append = 0;
static GArray *debug_regions;
void qemu_log(const char *fmt, ...)
/* Return the number of characters emitted. */
int qemu_log(const char *fmt, ...)
{
va_list ap;
va_start(ap, fmt);
int ret = 0;
if (qemu_logfile) {
vfprintf(qemu_logfile, fmt, ap);
va_list ap;
va_start(ap, fmt);
ret = vfprintf(qemu_logfile, fmt, ap);
va_end(ap);
/* Don't pass back error results. */
if (ret < 0) {
ret = 0;
}
}
va_end(ap);
return ret;
}
static bool log_uses_own_buffers;
@ -240,8 +247,9 @@ const QEMULogItem qemu_log_items[] = {
{ CPU_LOG_TB_OP, "op",
"show micro ops for each compiled TB" },
{ CPU_LOG_TB_OP_OPT, "op_opt",
"show micro ops (x86 only: before eflags optimization) and\n"
"after liveness analysis" },
"show micro ops after optimization" },
{ CPU_LOG_TB_OP_IND, "op_ind",
"show micro ops before indirect lowering" },
{ CPU_LOG_INT, "int",
"show interrupts/exceptions in short format" },
{ CPU_LOG_EXEC, "exec",