From b4cb76e6208cf6b5bb39404c6d44a6514eb6842a Mon Sep 17 00:00:00 2001
From: Richard Henderson
Date: Thu, 8 Oct 2020 15:21:43 -0500
Subject: [PATCH 1/3] tcg: Do not kill globals at conditional branches
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We can easily register allocate the entire extended basic block
(in this case, the set of blocks connected by fallthru), simply
by not discarding the register state at the branch.

This does not help blocks starting with a label, as they are
reached via a taken branch, and that would require saving the
complete register state at the branch.

Reviewed-by: Alex Bennée
Signed-off-by: Richard Henderson
---
 include/tcg/tcg-opc.h |  7 +++---
 include/tcg/tcg.h     |  4 +++-
 tcg/tcg.c             | 55 +++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 60 insertions(+), 6 deletions(-)

diff --git a/include/tcg/tcg-opc.h b/include/tcg/tcg-opc.h
index e3929b80d2..67092e82c6 100644
--- a/include/tcg/tcg-opc.h
+++ b/include/tcg/tcg-opc.h
@@ -81,7 +81,7 @@ DEF(extract_i32, 1, 1, 2, IMPL(TCG_TARGET_HAS_extract_i32))
 DEF(sextract_i32, 1, 1, 2, IMPL(TCG_TARGET_HAS_sextract_i32))
 DEF(extract2_i32, 1, 2, 1, IMPL(TCG_TARGET_HAS_extract2_i32))
 
-DEF(brcond_i32, 0, 2, 2, TCG_OPF_BB_END)
+DEF(brcond_i32, 0, 2, 2, TCG_OPF_BB_END | TCG_OPF_COND_BRANCH)
 
 DEF(add2_i32, 2, 4, 0, IMPL(TCG_TARGET_HAS_add2_i32))
 DEF(sub2_i32, 2, 4, 0, IMPL(TCG_TARGET_HAS_sub2_i32))
@@ -89,7 +89,8 @@ DEF(mulu2_i32, 2, 2, 0, IMPL(TCG_TARGET_HAS_mulu2_i32))
 DEF(muls2_i32, 2, 2, 0, IMPL(TCG_TARGET_HAS_muls2_i32))
 DEF(muluh_i32, 1, 2, 0, IMPL(TCG_TARGET_HAS_muluh_i32))
 DEF(mulsh_i32, 1, 2, 0, IMPL(TCG_TARGET_HAS_mulsh_i32))
-DEF(brcond2_i32, 0, 4, 2, TCG_OPF_BB_END | IMPL(TCG_TARGET_REG_BITS == 32))
+DEF(brcond2_i32, 0, 4, 2,
+    TCG_OPF_BB_END | TCG_OPF_COND_BRANCH | IMPL(TCG_TARGET_REG_BITS == 32))
 DEF(setcond2_i32, 1, 4, 1, IMPL(TCG_TARGET_REG_BITS == 32))
 
 DEF(ext8s_i32, 1, 1, 0, IMPL(TCG_TARGET_HAS_ext8s_i32))
@@ -159,7 +160,7 @@ DEF(extrh_i64_i32, 1, 1, 0,
     IMPL(TCG_TARGET_HAS_extrh_i64_i32)
     | (TCG_TARGET_REG_BITS == 32 ? TCG_OPF_NOT_PRESENT : 0))
 
-DEF(brcond_i64, 0, 2, 2, TCG_OPF_BB_END | IMPL64)
+DEF(brcond_i64, 0, 2, 2, TCG_OPF_BB_END | TCG_OPF_COND_BRANCH | IMPL64)
 DEF(ext8s_i64, 1, 1, 0, IMPL64 | IMPL(TCG_TARGET_HAS_ext8s_i64))
 DEF(ext16s_i64, 1, 1, 0, IMPL64 | IMPL(TCG_TARGET_HAS_ext16s_i64))
 DEF(ext32s_i64, 1, 1, 0, IMPL64 | IMPL(TCG_TARGET_HAS_ext32s_i64))

diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
index 8804a8c4a2..8ff9dad4ef 100644
--- a/include/tcg/tcg.h
+++ b/include/tcg/tcg.h
@@ -990,7 +990,7 @@ typedef struct TCGArgConstraint {
 
 #define TCG_MAX_OP_ARGS 16
 
-/* Bits for TCGOpDef->flags, 8 bits available. */
+/* Bits for TCGOpDef->flags, 8 bits available, all used. */
 enum {
     /* Instruction exits the translation block. */
     TCG_OPF_BB_EXIT      = 0x01,
@@ -1008,6 +1008,8 @@ enum {
     TCG_OPF_NOT_PRESENT  = 0x20,
     /* Instruction operands are vectors. */
     TCG_OPF_VECTOR       = 0x40,
+    /* Instruction is a conditional branch. */
+    TCG_OPF_COND_BRANCH  = 0x80
 };
 
 typedef struct TCGOpDef {

diff --git a/tcg/tcg.c b/tcg/tcg.c
index a8c28440e2..f49f1a7f35 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -2519,6 +2519,28 @@ static void la_global_sync(TCGContext *s, int ng)
     }
 }
 
+/*
+ * liveness analysis: conditional branch: all temps are dead,
+ * globals and local temps should be synced.
+ */
+static void la_bb_sync(TCGContext *s, int ng, int nt)
+{
+    la_global_sync(s, ng);
+
+    for (int i = ng; i < nt; ++i) {
+        if (s->temps[i].temp_local) {
+            int state = s->temps[i].state;
+            s->temps[i].state = state | TS_MEM;
+            if (state != TS_DEAD) {
+                continue;
+            }
+        } else {
+            s->temps[i].state = TS_DEAD;
+        }
+        la_reset_pref(&s->temps[i]);
+    }
+}
+
 /* liveness analysis: sync globals back to memory and kill. */
 static void la_global_kill(TCGContext *s, int ng)
 {
@@ -2795,6 +2817,8 @@ static void liveness_pass_1(TCGContext *s)
             /* If end of basic block, update. */
             if (def->flags & TCG_OPF_BB_EXIT) {
                 la_func_end(s, nb_globals, nb_temps);
+            } else if (def->flags & TCG_OPF_COND_BRANCH) {
+                la_bb_sync(s, nb_globals, nb_temps);
             } else if (def->flags & TCG_OPF_BB_END) {
                 la_bb_end(s, nb_globals, nb_temps);
             } else if (def->flags & TCG_OPF_SIDE_EFFECTS) {
@@ -2907,7 +2931,10 @@ static bool liveness_pass_2(TCGContext *s)
             nb_oargs = def->nb_oargs;
 
             /* Set flags similar to how calls require. */
-            if (def->flags & TCG_OPF_BB_END) {
+            if (def->flags & TCG_OPF_COND_BRANCH) {
+                /* Like reading globals: sync_globals */
+                call_flags = TCG_CALL_NO_WRITE_GLOBALS;
+            } else if (def->flags & TCG_OPF_BB_END) {
                 /* Like writing globals: save_globals */
                 call_flags = 0;
             } else if (def->flags & TCG_OPF_SIDE_EFFECTS) {
@@ -3379,6 +3406,28 @@ static void tcg_reg_alloc_bb_end(TCGContext *s, TCGRegSet allocated_regs)
     save_globals(s, allocated_regs);
 }
 
+/*
+ * At a conditional branch, we assume all temporaries are dead and
+ * all globals and local temps are synced to their location.
+ */
+static void tcg_reg_alloc_cbranch(TCGContext *s, TCGRegSet allocated_regs)
+{
+    sync_globals(s, allocated_regs);
+
+    for (int i = s->nb_globals; i < s->nb_temps; i++) {
+        TCGTemp *ts = &s->temps[i];
+        /*
+         * The liveness analysis already ensures that temps are dead.
+         * Keep tcg_debug_asserts for safety.
+         */
+        if (ts->temp_local) {
+            tcg_debug_assert(ts->val_type != TEMP_VAL_REG || ts->mem_coherent);
+        } else {
+            tcg_debug_assert(ts->val_type == TEMP_VAL_DEAD);
+        }
+    }
+}
+
 /*
  * Specialized code generation for INDEX_op_movi_*.
  */
@@ -3730,7 +3779,9 @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
         }
     }
 
-    if (def->flags & TCG_OPF_BB_END) {
+    if (def->flags & TCG_OPF_COND_BRANCH) {
+        tcg_reg_alloc_cbranch(s, i_allocated_regs);
+    } else if (def->flags & TCG_OPF_BB_END) {
        tcg_reg_alloc_bb_end(s, i_allocated_regs);
     } else {
         if (def->flags & TCG_OPF_CALL_CLOBBER) {
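
To make the effect concrete, here is a schematic TCG opcode stream; the
opcode names and flag semantics are real, but the guest global "eax", the
constants and the annotations are invented for illustration:

    add_i32 eax,eax,$0x1        # global "eax" is live in a host register
    brcond_i32 eax,$0x0,eq,$L1  # BB_END|COND_BRANCH: "eax" is synced to
                                # memory but keeps its register
    add_i32 eax,eax,$0x2        # fallthru path: the same host register is
                                # reused, no reload of "eax" from env
    set_label $L1               # reached via a taken branch, so register
                                # state is still discarded here

Previously brcond_* carried only TCG_OPF_BB_END, so the allocator went
through tcg_reg_alloc_bb_end() and forgot every assignment even on the
fallthru path; TCG_OPF_COND_BRANCH instead routes it to the new
tcg_reg_alloc_cbranch(), which merely syncs globals and local temps.
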
From cd0372c515c4732d8bd3777cdd995c139c7ed7ea Mon Sep 17 00:00:00 2001
From: Richard Henderson
Date: Fri, 9 Oct 2020 08:23:52 -0500
Subject: [PATCH 2/3] tcg/optimize: Flush data at labels not TCG_OPF_BB_END
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We can easily propagate temp values through the entire extended basic
block (in this case, the set of blocks connected by fallthru), simply
by not discarding the known-value state at the branch. That state is
now flushed only at labels, where a new extended basic block begins.

Reviewed-by: Alex Bennée
Signed-off-by: Richard Henderson
---
 tcg/optimize.c | 35 ++++++++++++++++++-----------------
 1 file changed, 18 insertions(+), 17 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index 220f4601d5..9952c28bdc 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -1484,29 +1484,30 @@ void tcg_optimize(TCGContext *s)
                     }
                 }
             }
-            goto do_reset_output;
+            /* fall through */
 
         default:
         do_default:
-            /* Default case: we know nothing about operation (or were unable
-               to compute the operation result) so no propagation is done.
-               We trash everything if the operation is the end of a basic
-               block, otherwise we only trash the output args. "mask" is
-               the non-zero bits mask for the first output arg. */
-            if (def->flags & TCG_OPF_BB_END) {
-                bitmap_zero(temps_used.l, nb_temps);
-            } else {
-        do_reset_output:
-                for (i = 0; i < nb_oargs; i++) {
-                    reset_temp(op->args[i]);
-                    /* Save the corresponding known-zero bits mask for the
-                       first output argument (only one supported so far). */
-                    if (i == 0) {
-                        arg_info(op->args[i])->mask = mask;
-                    }
+            /*
+             * Default case: we know nothing about operation (or were unable
+             * to compute the operation result) so no propagation is done.
+             */
+            for (i = 0; i < nb_oargs; i++) {
+                reset_temp(op->args[i]);
+                /*
+                 * Save the corresponding known-zero bits mask for the
+                 * first output argument (only one supported so far).
+                 */
+                if (i == 0) {
+                    arg_info(op->args[i])->mask = mask;
                 }
             }
             break;
+
+        case INDEX_op_set_label:
+            /* Trash everything at the start of a new extended bb. */
+            bitmap_zero(temps_used.l, nb_temps);
+            break;
         }
 
         /* Eliminate duplicate and redundant fence instructions. */

From 1d705e8a5bbfe36294081baa45ab68a9ad987f33 Mon Sep 17 00:00:00 2001
From: Peter Maydell
Date: Tue, 13 Oct 2020 13:26:58 +0100
Subject: [PATCH 3/3] accel/tcg: Add CPU_LOG_EXEC tracing for cpu_io_recompile()

When using -icount, it's useful for the CPU_LOG_EXEC logging to
include information about when cpu_io_recompile() was called, because
it alerts the reader of the log that the trace of a previous TB
execution may not correspond to an actually executed instruction.
For instance, if you're using -icount together with -singlestep, a
guest instruction that makes an IO access appears in two "Trace"
lines: first in a TB that triggers cpu_io_recompile(), and then again
in the TB that actually executes.

(This is similar to the reason why the "Stopped execution of TB
chain before..." logging in cpu_tb_exec() is helpful when trying to
track execution flow in the logs.)

Signed-off-by: Peter Maydell
Message-Id: <20201013122658.4620-1-peter.maydell@linaro.org>
Signed-off-by: Richard Henderson
---
 accel/tcg/translate-all.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index d76097296d..4572b4901f 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -2267,6 +2267,10 @@ void cpu_io_recompile(CPUState *cpu, uintptr_t retaddr)
         tb_destroy(tb);
     }
 
+    qemu_log_mask_and_addr(CPU_LOG_EXEC, tb->pc,
+                           "cpu_io_recompile: rewound execution of TB to "
+                           TARGET_FMT_lx "\n", tb->pc);
+
     /* TODO: If env->pc != tb->pc (i.e. the faulting instruction was not
      * the first in the TB) then we end up generating a whole new TB and
      * repeating the fault, which is horribly inefficient.
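
As a rough illustration (the exact "Trace" line layout varies by target
and QEMU version, and these addresses are invented), a log captured with
"-d exec" under -icount would now read something like:

    Trace 0: 0x7f2a64000100 [0000000000000000/0000000000400100/0x0]
    cpu_io_recompile: rewound execution of TB to 0000000000400100
    Trace 0: 0x7f2a64000200 [0000000000000000/0000000000400100/0x0]

The new message sits between the two "Trace" lines for the same guest
PC, telling the reader that the first execution was abandoned and
rewound rather than completed.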