From a1b3c48d2b23d6eaeb4529d3e1183d2648731bf8 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Wed, 22 Jun 2016 15:46:09 -0700 Subject: [PATCH 1/7] tcg: Compress liveness data to 16 bits This reduces both memory usage and per-insn cacheline usage during code generation. Reviewed-by: Aurelien Jarno Signed-off-by: Richard Henderson --- tcg/tcg.c | 58 +++++++++++++++++++++---------------------------------- tcg/tcg.h | 16 +++++++++------ 2 files changed, 32 insertions(+), 42 deletions(-) diff --git a/tcg/tcg.c b/tcg/tcg.c index 0c46c43cfa..4aa1933a3e 100644 --- a/tcg/tcg.c +++ b/tcg/tcg.c @@ -1341,7 +1341,7 @@ static inline void tcg_la_bb_end(TCGContext *s, uint8_t *dead_temps, } } -/* Liveness analysis : update the opc_dead_args array to tell if a +/* Liveness analysis : update the opc_arg_life array to tell if a given input arguments is dead. Instructions updating dead temporaries are removed. */ static void tcg_liveness_analysis(TCGContext *s) @@ -1350,9 +1350,8 @@ static void tcg_liveness_analysis(TCGContext *s) int oi, oi_prev, nb_ops; nb_ops = s->gen_next_op_idx; - s->op_dead_args = tcg_malloc(nb_ops * sizeof(uint16_t)); - s->op_sync_args = tcg_malloc(nb_ops * sizeof(uint8_t)); - + s->op_arg_life = tcg_malloc(nb_ops * sizeof(TCGLifeData)); + dead_temps = tcg_malloc(s->nb_temps); mem_temps = tcg_malloc(s->nb_temps); tcg_la_func_end(s, dead_temps, mem_temps); @@ -1361,8 +1360,7 @@ static void tcg_liveness_analysis(TCGContext *s) int i, nb_iargs, nb_oargs; TCGOpcode opc_new, opc_new2; bool have_opc_new2; - uint16_t dead_args; - uint8_t sync_args; + TCGLifeData arg_life = 0; TCGArg arg; TCGOp * const op = &s->gen_op_buf[oi]; @@ -1394,15 +1392,13 @@ static void tcg_liveness_analysis(TCGContext *s) do_not_remove_call: /* output args are dead */ - dead_args = 0; - sync_args = 0; for (i = 0; i < nb_oargs; i++) { arg = args[i]; if (dead_temps[arg]) { - dead_args |= (1 << i); + arg_life |= DEAD_ARG << i; } if (mem_temps[arg]) { - sync_args |= (1 << i); + arg_life |= 
SYNC_ARG << i; } dead_temps[arg] = 1; mem_temps[arg] = 0; @@ -1423,7 +1419,7 @@ static void tcg_liveness_analysis(TCGContext *s) arg = args[i]; if (arg != TCG_CALL_DUMMY_ARG) { if (dead_temps[arg]) { - dead_args |= (1 << i); + arg_life |= DEAD_ARG << i; } } } @@ -1432,8 +1428,6 @@ static void tcg_liveness_analysis(TCGContext *s) arg = args[i]; dead_temps[arg] = 0; } - s->op_dead_args[oi] = dead_args; - s->op_sync_args[oi] = sync_args; } } break; @@ -1544,15 +1538,13 @@ static void tcg_liveness_analysis(TCGContext *s) } else { do_not_remove: /* output args are dead */ - dead_args = 0; - sync_args = 0; for (i = 0; i < nb_oargs; i++) { arg = args[i]; if (dead_temps[arg]) { - dead_args |= (1 << i); + arg_life |= DEAD_ARG << i; } if (mem_temps[arg]) { - sync_args |= (1 << i); + arg_life |= SYNC_ARG << i; } dead_temps[arg] = 1; mem_temps[arg] = 0; @@ -1570,7 +1562,7 @@ static void tcg_liveness_analysis(TCGContext *s) for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) { arg = args[i]; if (dead_temps[arg]) { - dead_args |= (1 << i); + arg_life |= DEAD_ARG << i; } } /* input arguments are live for preceding opcodes */ @@ -1578,11 +1570,10 @@ static void tcg_liveness_analysis(TCGContext *s) arg = args[i]; dead_temps[arg] = 0; } - s->op_dead_args[oi] = dead_args; - s->op_sync_args[oi] = sync_args; } break; } + s->op_arg_life[oi] = arg_life; } } #else @@ -1921,11 +1912,11 @@ static void tcg_reg_alloc_bb_end(TCGContext *s, TCGRegSet allocated_regs) save_globals(s, allocated_regs); } -#define IS_DEAD_ARG(n) ((dead_args >> (n)) & 1) -#define NEED_SYNC_ARG(n) ((sync_args >> (n)) & 1) +#define IS_DEAD_ARG(n) (arg_life & (DEAD_ARG << (n))) +#define NEED_SYNC_ARG(n) (arg_life & (SYNC_ARG << (n))) static void tcg_reg_alloc_movi(TCGContext *s, const TCGArg *args, - uint16_t dead_args, uint8_t sync_args) + TCGLifeData arg_life) { TCGTemp *ots; tcg_target_ulong val; @@ -1954,8 +1945,7 @@ static void tcg_reg_alloc_movi(TCGContext *s, const TCGArg *args, } static void 
tcg_reg_alloc_mov(TCGContext *s, const TCGOpDef *def, - const TCGArg *args, uint16_t dead_args, - uint8_t sync_args) + const TCGArg *args, TCGLifeData arg_life) { TCGRegSet allocated_regs; TCGTemp *ts, *ots; @@ -2040,8 +2030,7 @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOpDef *def, static void tcg_reg_alloc_op(TCGContext *s, const TCGOpDef *def, TCGOpcode opc, - const TCGArg *args, uint16_t dead_args, - uint8_t sync_args) + const TCGArg *args, TCGLifeData arg_life) { TCGRegSet allocated_regs; int i, k, nb_iargs, nb_oargs; @@ -2206,8 +2195,7 @@ static void tcg_reg_alloc_op(TCGContext *s, #endif static void tcg_reg_alloc_call(TCGContext *s, int nb_oargs, int nb_iargs, - const TCGArg * const args, uint16_t dead_args, - uint8_t sync_args) + const TCGArg * const args, TCGLifeData arg_life) { int flags, nb_regs, i; TCGReg reg; @@ -2427,8 +2415,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb) TCGArg * const args = &s->gen_opparam_buf[op->args]; TCGOpcode opc = op->opc; const TCGOpDef *def = &tcg_op_defs[opc]; - uint16_t dead_args = s->op_dead_args[oi]; - uint8_t sync_args = s->op_sync_args[oi]; + TCGLifeData arg_life = s->op_arg_life[oi]; oi_next = op->next; #ifdef CONFIG_PROFILER @@ -2438,11 +2425,11 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb) switch (opc) { case INDEX_op_mov_i32: case INDEX_op_mov_i64: - tcg_reg_alloc_mov(s, def, args, dead_args, sync_args); + tcg_reg_alloc_mov(s, def, args, arg_life); break; case INDEX_op_movi_i32: case INDEX_op_movi_i64: - tcg_reg_alloc_movi(s, args, dead_args, sync_args); + tcg_reg_alloc_movi(s, args, arg_life); break; case INDEX_op_insn_start: if (num_insns >= 0) { @@ -2467,8 +2454,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb) tcg_out_label(s, arg_label(args[0]), s->code_ptr); break; case INDEX_op_call: - tcg_reg_alloc_call(s, op->callo, op->calli, args, - dead_args, sync_args); + tcg_reg_alloc_call(s, op->callo, op->calli, args, arg_life); break; default: /* Sanity check that we've 
not introduced any unhandled opcodes. */ @@ -2478,7 +2464,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb) /* Note: in order to speed up the code, it would be much faster to have specialized register allocator functions for some common argument patterns */ - tcg_reg_alloc_op(s, def, opc, args, dead_args, sync_args); + tcg_reg_alloc_op(s, def, opc, args, arg_life); break; } #ifdef CONFIG_DEBUG_TCG diff --git a/tcg/tcg.h b/tcg/tcg.h index 6046dcdc89..7c0a138152 100644 --- a/tcg/tcg.h +++ b/tcg/tcg.h @@ -575,6 +575,14 @@ typedef struct TCGTempSet { unsigned long l[BITS_TO_LONGS(TCG_MAX_TEMPS)]; } TCGTempSet; +/* While we limit helpers to 6 arguments, for 32-bit hosts, with padding, + this implies a max of 6*2 (64-bit in) + 2 (64-bit out) = 14 operands. + There are never more than 2 outputs, which means that we can store all + dead + sync data within 16 bits. */ +#define DEAD_ARG 4 +#define SYNC_ARG 1 +typedef uint16_t TCGLifeData; + typedef struct TCGOp { TCGOpcode opc : 8; @@ -608,12 +616,8 @@ struct TCGContext { uintptr_t *tb_jmp_target_addr; /* tb->jmp_target_addr if !USE_DIRECT_JUMP */ /* liveness analysis */ - uint16_t *op_dead_args; /* for each operation, each bit tells if the - corresponding argument is dead */ - uint8_t *op_sync_args; /* for each operation, each bit tells if the - corresponding output argument needs to be - sync to memory. */ - + TCGLifeData *op_arg_life; + TCGRegSet reserved_regs; intptr_t current_frame_offset; intptr_t frame_start; From dcb8e75870e2de199db853697f8839cb603beefe Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Wed, 22 Jun 2016 19:42:31 -0700 Subject: [PATCH 2/7] tcg: Reorg TCGOp chaining Instead of using -1 as end of chain, use 0, and link through the 0 entry as a fully circular double-linked list. 
Reviewed-by: Aurelien Jarno Signed-off-by: Richard Henderson --- include/exec/gen-icount.h | 2 +- tcg/optimize.c | 8 ++------ tcg/tcg-op.c | 2 +- tcg/tcg.c | 35 +++++++++++++++-------------------- tcg/tcg.h | 22 ++++++++++++---------- 5 files changed, 31 insertions(+), 38 deletions(-) diff --git a/include/exec/gen-icount.h b/include/exec/gen-icount.h index 1af03d8f23..050de59b38 100644 --- a/include/exec/gen-icount.h +++ b/include/exec/gen-icount.h @@ -59,7 +59,7 @@ static void gen_tb_end(TranslationBlock *tb, int num_insns) } /* Terminate the linked list. */ - tcg_ctx.gen_op_buf[tcg_ctx.gen_last_op_idx].next = -1; + tcg_ctx.gen_op_buf[tcg_ctx.gen_op_buf[0].prev].next = 0; } static inline void gen_io_start(void) diff --git a/tcg/optimize.c b/tcg/optimize.c index c0d975b3d9..8df7fc7f69 100644 --- a/tcg/optimize.c +++ b/tcg/optimize.c @@ -103,11 +103,7 @@ static TCGOp *insert_op_before(TCGContext *s, TCGOp *old_op, .prev = prev, .next = next }; - if (prev >= 0) { - s->gen_op_buf[prev].next = oi; - } else { - s->gen_first_op_idx = oi; - } + s->gen_op_buf[prev].next = oi; old_op->prev = oi; return new_op; @@ -583,7 +579,7 @@ void tcg_optimize(TCGContext *s) nb_globals = s->nb_globals; reset_all_temps(nb_temps); - for (oi = s->gen_first_op_idx; oi >= 0; oi = oi_next) { + for (oi = s->gen_op_buf[0].next; oi != 0; oi = oi_next) { tcg_target_ulong mask, partmask, affected; int nb_oargs, nb_iargs, i; TCGArg tmp; diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c index 293b854370..0243c99094 100644 --- a/tcg/tcg-op.c +++ b/tcg/tcg-op.c @@ -52,7 +52,7 @@ static void tcg_emit_op(TCGContext *ctx, TCGOpcode opc, int args) int pi = oi - 1; tcg_debug_assert(oi < OPC_BUF_SIZE); - ctx->gen_last_op_idx = oi; + ctx->gen_op_buf[0].prev = oi; ctx->gen_next_op_idx = ni; ctx->gen_op_buf[oi] = (TCGOp){ diff --git a/tcg/tcg.c b/tcg/tcg.c index 4aa1933a3e..cd76e4287c 100644 --- a/tcg/tcg.c +++ b/tcg/tcg.c @@ -438,9 +438,9 @@ void tcg_func_start(TCGContext *s) s->goto_tb_issue_mask = 0; #endif - 
s->gen_first_op_idx = 0; - s->gen_last_op_idx = -1; - s->gen_next_op_idx = 0; + s->gen_op_buf[0].next = 1; + s->gen_op_buf[0].prev = 0; + s->gen_next_op_idx = 1; s->gen_next_parm_idx = 0; s->be = tcg_malloc(sizeof(TCGBackendData)); @@ -869,7 +869,7 @@ void tcg_gen_callN(TCGContext *s, void *func, TCGArg ret, /* Make sure the calli field didn't overflow. */ tcg_debug_assert(s->gen_op_buf[i].calli == real_args); - s->gen_last_op_idx = i; + s->gen_op_buf[0].prev = i; s->gen_next_op_idx = i + 1; s->gen_next_parm_idx = pi; @@ -1021,7 +1021,7 @@ void tcg_dump_ops(TCGContext *s) TCGOp *op; int oi; - for (oi = s->gen_first_op_idx; oi >= 0; oi = op->next) { + for (oi = s->gen_op_buf[0].next; oi != 0; oi = op->next) { int i, k, nb_oargs, nb_iargs, nb_cargs; const TCGOpDef *def; const TCGArg *args; @@ -1033,7 +1033,7 @@ void tcg_dump_ops(TCGContext *s) args = &s->gen_opparam_buf[op->args]; if (c == INDEX_op_insn_start) { - qemu_log("%s ----", oi != s->gen_first_op_idx ? "\n" : ""); + qemu_log("%s ----", oi != s->gen_op_buf[0].next ? "\n" : ""); for (i = 0; i < TARGET_INSN_START_WORDS; ++i) { target_ulong a; @@ -1298,18 +1298,13 @@ void tcg_op_remove(TCGContext *s, TCGOp *op) int next = op->next; int prev = op->prev; - if (next >= 0) { - s->gen_op_buf[next].prev = prev; - } else { - s->gen_last_op_idx = prev; - } - if (prev >= 0) { - s->gen_op_buf[prev].next = next; - } else { - s->gen_first_op_idx = next; - } + /* We should never attempt to remove the list terminator. 
*/ + tcg_debug_assert(op != &s->gen_op_buf[0]); - memset(op, -1, sizeof(*op)); + s->gen_op_buf[next].prev = prev; + s->gen_op_buf[prev].next = next; + + memset(op, 0, sizeof(*op)); #ifdef CONFIG_PROFILER s->del_op_count++; @@ -1356,7 +1351,7 @@ static void tcg_liveness_analysis(TCGContext *s) mem_temps = tcg_malloc(s->nb_temps); tcg_la_func_end(s, dead_temps, mem_temps); - for (oi = s->gen_last_op_idx; oi >= 0; oi = oi_prev) { + for (oi = s->gen_op_buf[0].prev; oi != 0; oi = oi_prev) { int i, nb_iargs, nb_oargs; TCGOpcode opc_new, opc_new2; bool have_opc_new2; @@ -2351,7 +2346,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb) { int n; - n = s->gen_last_op_idx + 1; + n = s->gen_op_buf[0].prev + 1; s->op_count += n; if (n > s->op_count_max) { s->op_count_max = n; @@ -2410,7 +2405,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb) tcg_out_tb_init(s); num_insns = -1; - for (oi = s->gen_first_op_idx; oi >= 0; oi = oi_next) { + for (oi = s->gen_op_buf[0].next; oi != 0; oi = oi_next) { TCGOp * const op = &s->gen_op_buf[oi]; TCGArg * const args = &s->gen_opparam_buf[op->args]; TCGOpcode opc = op->opc; diff --git a/tcg/tcg.h b/tcg/tcg.h index 7c0a138152..007d7bcb5c 100644 --- a/tcg/tcg.h +++ b/tcg/tcg.h @@ -590,17 +590,21 @@ typedef struct TCGOp { unsigned callo : 2; unsigned calli : 6; - /* Index of the arguments for this op, or -1 for zero-operand ops. */ - signed args : 16; + /* Index of the arguments for this op, or 0 for zero-operand ops. */ + unsigned args : 16; - /* Index of the prex/next op, or -1 for the end of the list. */ - signed prev : 16; - signed next : 16; + /* Index of the prev/next op, or 0 for the end of the list. */ + unsigned prev : 16; + unsigned next : 16; } TCGOp; -QEMU_BUILD_BUG_ON(NB_OPS > 0xff); -QEMU_BUILD_BUG_ON(OPC_BUF_SIZE >= 0x7fff); -QEMU_BUILD_BUG_ON(OPPARAM_BUF_SIZE >= 0x7fff); +/* Make sure operands fit in the bitfields above. 
*/ +QEMU_BUILD_BUG_ON(NB_OPS > (1 << 8)); +QEMU_BUILD_BUG_ON(OPC_BUF_SIZE > (1 << 16)); +QEMU_BUILD_BUG_ON(OPPARAM_BUF_SIZE > (1 << 16)); + +/* Make sure that we don't overflow 64 bits without noticing. */ +QEMU_BUILD_BUG_ON(sizeof(TCGOp) > 8); struct TCGContext { uint8_t *pool_cur, *pool_end; @@ -653,8 +657,6 @@ struct TCGContext { int goto_tb_issue_mask; #endif - int gen_first_op_idx; - int gen_last_op_idx; int gen_next_op_idx; int gen_next_parm_idx; From bee158cb4dde35c41632a3a129c869f14a32f8f0 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Wed, 22 Jun 2016 20:43:29 -0700 Subject: [PATCH 3/7] tcg: Fold life data into TCGOp Reduce the size of other bitfields to make room. This reduces the cache footprint of compilation. Reviewed-by: Aurelien Jarno Signed-off-by: Richard Henderson --- tcg/tcg.c | 9 +++------ tcg/tcg.h | 32 +++++++++++++++++--------------- 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/tcg/tcg.c b/tcg/tcg.c index cd76e4287c..6bcf6e5f66 100644 --- a/tcg/tcg.c +++ b/tcg/tcg.c @@ -1342,10 +1342,7 @@ static inline void tcg_la_bb_end(TCGContext *s, uint8_t *dead_temps, static void tcg_liveness_analysis(TCGContext *s) { uint8_t *dead_temps, *mem_temps; - int oi, oi_prev, nb_ops; - - nb_ops = s->gen_next_op_idx; - s->op_arg_life = tcg_malloc(nb_ops * sizeof(TCGLifeData)); + int oi, oi_prev; dead_temps = tcg_malloc(s->nb_temps); mem_temps = tcg_malloc(s->nb_temps); @@ -1568,7 +1565,7 @@ static void tcg_liveness_analysis(TCGContext *s) } break; } - s->op_arg_life[oi] = arg_life; + op->life = arg_life; } } #else @@ -2410,7 +2407,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb) TCGArg * const args = &s->gen_opparam_buf[op->args]; TCGOpcode opc = op->opc; const TCGOpDef *def = &tcg_op_defs[opc]; - TCGLifeData arg_life = s->op_arg_life[oi]; + TCGLifeData arg_life = op->life; oi_next = op->next; #ifdef CONFIG_PROFILER diff --git a/tcg/tcg.h b/tcg/tcg.h index 007d7bcb5c..ebf68670f6 100644 --- a/tcg/tcg.h +++ b/tcg/tcg.h @@ 
-583,25 +583,30 @@ typedef struct TCGTempSet { #define SYNC_ARG 1 typedef uint16_t TCGLifeData; +/* The layout here is designed to avoid crossing of a 32-bit boundary. + If we do so, gcc adds padding, expanding the size to 12. */ typedef struct TCGOp { - TCGOpcode opc : 8; - - /* The number of out and in parameter for a call. */ - unsigned callo : 2; - unsigned calli : 6; - - /* Index of the arguments for this op, or 0 for zero-operand ops. */ - unsigned args : 16; + TCGOpcode opc : 8; /* 8 */ /* Index of the prev/next op, or 0 for the end of the list. */ - unsigned prev : 16; - unsigned next : 16; + unsigned prev : 10; /* 18 */ + unsigned next : 10; /* 28 */ + + /* The number of out and in parameter for a call. */ + unsigned calli : 4; /* 32 */ + unsigned callo : 2; /* 34 */ + + /* Index of the arguments for this op, or 0 for zero-operand ops. */ + unsigned args : 14; /* 48 */ + + /* Lifetime data of the operands. */ + unsigned life : 16; /* 64 */ } TCGOp; /* Make sure operands fit in the bitfields above. */ QEMU_BUILD_BUG_ON(NB_OPS > (1 << 8)); -QEMU_BUILD_BUG_ON(OPC_BUF_SIZE > (1 << 16)); -QEMU_BUILD_BUG_ON(OPPARAM_BUF_SIZE > (1 << 16)); +QEMU_BUILD_BUG_ON(OPC_BUF_SIZE > (1 << 10)); +QEMU_BUILD_BUG_ON(OPPARAM_BUF_SIZE > (1 << 14)); /* Make sure that we don't overflow 64 bits without noticing. */ QEMU_BUILD_BUG_ON(sizeof(TCGOp) > 8); @@ -619,9 +624,6 @@ struct TCGContext { uint16_t *tb_jmp_insn_offset; /* tb->jmp_insn_offset if USE_DIRECT_JUMP */ uintptr_t *tb_jmp_target_addr; /* tb->jmp_target_addr if !USE_DIRECT_JUMP */ - /* liveness analysis */ - TCGLifeData *op_arg_life; - TCGRegSet reserved_regs; intptr_t current_frame_offset; intptr_t frame_start; From c70fbf0a9938baf3b4f843355a77c17a7e945b98 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Thu, 23 Jun 2016 20:34:22 -0700 Subject: [PATCH 4/7] tcg: Compress dead_temps and mem_temps into a single array We only need two bits per temporary. 
Fold the two bytes into one, and reduce the memory and cachelines required during compilation. Reviewed-by: Aurelien Jarno Signed-off-by: Richard Henderson --- tcg/tcg.c | 119 +++++++++++++++++++++++++++--------------------------- 1 file changed, 60 insertions(+), 59 deletions(-) diff --git a/tcg/tcg.c b/tcg/tcg.c index 6bcf6e5f66..27bbb4dda2 100644 --- a/tcg/tcg.c +++ b/tcg/tcg.c @@ -333,7 +333,7 @@ void tcg_context_init(TCGContext *s) memset(s, 0, sizeof(*s)); s->nb_globals = 0; - + /* Count total number of arguments and allocate the corresponding space */ total_args = 0; @@ -825,16 +825,16 @@ void tcg_gen_callN(TCGContext *s, void *func, TCGArg ret, real_args++; } #endif - /* If stack grows up, then we will be placing successive - arguments at lower addresses, which means we need to - reverse the order compared to how we would normally - treat either big or little-endian. For those arguments - that will wind up in registers, this still works for - HPPA (the only current STACK_GROWSUP target) since the - argument registers are *also* allocated in decreasing - order. If another such target is added, this logic may - have to get more complicated to differentiate between - stack arguments and register arguments. */ + /* If stack grows up, then we will be placing successive + arguments at lower addresses, which means we need to + reverse the order compared to how we would normally + treat either big or little-endian. For those arguments + that will wind up in registers, this still works for + HPPA (the only current STACK_GROWSUP target) since the + argument registers are *also* allocated in decreasing + order. If another such target is added, this logic may + have to get more complicated to differentiate between + stack arguments and register arguments. 
*/ #if defined(HOST_WORDS_BIGENDIAN) != defined(TCG_TARGET_STACK_GROWSUP) s->gen_opparam_buf[pi++] = args[i] + 1; s->gen_opparam_buf[pi++] = args[i]; @@ -1312,27 +1312,29 @@ void tcg_op_remove(TCGContext *s, TCGOp *op) } #ifdef USE_LIVENESS_ANALYSIS + +#define TS_DEAD 1 +#define TS_MEM 2 + /* liveness analysis: end of function: all temps are dead, and globals should be in memory. */ -static inline void tcg_la_func_end(TCGContext *s, uint8_t *dead_temps, - uint8_t *mem_temps) +static inline void tcg_la_func_end(TCGContext *s, uint8_t *temp_state) { - memset(dead_temps, 1, s->nb_temps); - memset(mem_temps, 1, s->nb_globals); - memset(mem_temps + s->nb_globals, 0, s->nb_temps - s->nb_globals); + memset(temp_state, TS_DEAD | TS_MEM, s->nb_globals); + memset(temp_state + s->nb_globals, TS_DEAD, s->nb_temps - s->nb_globals); } /* liveness analysis: end of basic block: all temps are dead, globals and local temps should be in memory. */ -static inline void tcg_la_bb_end(TCGContext *s, uint8_t *dead_temps, - uint8_t *mem_temps) +static inline void tcg_la_bb_end(TCGContext *s, uint8_t *temp_state) { - int i; + int i, n; - memset(dead_temps, 1, s->nb_temps); - memset(mem_temps, 1, s->nb_globals); - for(i = s->nb_globals; i < s->nb_temps; i++) { - mem_temps[i] = s->temps[i].temp_local; + tcg_la_func_end(s, temp_state); + for (i = s->nb_globals, n = s->nb_temps; i < n; i++) { + if (s->temps[i].temp_local) { + temp_state[i] |= TS_MEM; + } } } @@ -1341,12 +1343,12 @@ static inline void tcg_la_bb_end(TCGContext *s, uint8_t *dead_temps, temporaries are removed. 
*/ static void tcg_liveness_analysis(TCGContext *s) { - uint8_t *dead_temps, *mem_temps; + uint8_t *temp_state; int oi, oi_prev; + int nb_globals = s->nb_globals; - dead_temps = tcg_malloc(s->nb_temps); - mem_temps = tcg_malloc(s->nb_temps); - tcg_la_func_end(s, dead_temps, mem_temps); + temp_state = tcg_malloc(s->nb_temps); + tcg_la_func_end(s, temp_state); for (oi = s->gen_op_buf[0].prev; oi != 0; oi = oi_prev) { int i, nb_iargs, nb_oargs; @@ -1375,7 +1377,7 @@ static void tcg_liveness_analysis(TCGContext *s) if (call_flags & TCG_CALL_NO_SIDE_EFFECTS) { for (i = 0; i < nb_oargs; i++) { arg = args[i]; - if (!dead_temps[arg] || mem_temps[arg]) { + if (temp_state[arg] != TS_DEAD) { goto do_not_remove_call; } } @@ -1386,39 +1388,41 @@ static void tcg_liveness_analysis(TCGContext *s) /* output args are dead */ for (i = 0; i < nb_oargs; i++) { arg = args[i]; - if (dead_temps[arg]) { + if (temp_state[arg] & TS_DEAD) { arg_life |= DEAD_ARG << i; } - if (mem_temps[arg]) { + if (temp_state[arg] & TS_MEM) { arg_life |= SYNC_ARG << i; } - dead_temps[arg] = 1; - mem_temps[arg] = 0; + temp_state[arg] = TS_DEAD; } - if (!(call_flags & TCG_CALL_NO_READ_GLOBALS)) { - /* globals should be synced to memory */ - memset(mem_temps, 1, s->nb_globals); - } if (!(call_flags & (TCG_CALL_NO_WRITE_GLOBALS | TCG_CALL_NO_READ_GLOBALS))) { /* globals should go back to memory */ - memset(dead_temps, 1, s->nb_globals); + memset(temp_state, TS_DEAD | TS_MEM, nb_globals); + } else if (!(call_flags & TCG_CALL_NO_READ_GLOBALS)) { + /* globals should be synced to memory */ + for (i = 0; i < nb_globals; i++) { + temp_state[i] |= TS_MEM; + } } /* record arguments that die in this helper */ for (i = nb_oargs; i < nb_iargs + nb_oargs; i++) { arg = args[i]; if (arg != TCG_CALL_DUMMY_ARG) { - if (dead_temps[arg]) { + if (temp_state[arg] & TS_DEAD) { arg_life |= DEAD_ARG << i; } } } /* input arguments are live for preceding opcodes */ - for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) { + for (i = nb_oargs; 
i < nb_iargs + nb_oargs; i++) { arg = args[i]; - dead_temps[arg] = 0; + if (arg != TCG_CALL_DUMMY_ARG) { + temp_state[arg] &= ~TS_DEAD; + } } } } @@ -1427,8 +1431,7 @@ static void tcg_liveness_analysis(TCGContext *s) break; case INDEX_op_discard: /* mark the temporary as dead */ - dead_temps[args[0]] = 1; - mem_temps[args[0]] = 0; + temp_state[args[0]] = TS_DEAD; break; case INDEX_op_add2_i32: @@ -1449,8 +1452,8 @@ static void tcg_liveness_analysis(TCGContext *s) the low part. The result can be optimized to a simple add or sub. This happens often for x86_64 guest when the cpu mode is set to 32 bit. */ - if (dead_temps[args[1]] && !mem_temps[args[1]]) { - if (dead_temps[args[0]] && !mem_temps[args[0]]) { + if (temp_state[args[1]] == TS_DEAD) { + if (temp_state[args[0]] == TS_DEAD) { goto do_remove; } /* Replace the opcode and adjust the args in place, @@ -1487,8 +1490,8 @@ static void tcg_liveness_analysis(TCGContext *s) do_mul2: nb_iargs = 2; nb_oargs = 2; - if (dead_temps[args[1]] && !mem_temps[args[1]]) { - if (dead_temps[args[0]] && !mem_temps[args[0]]) { + if (temp_state[args[1]] == TS_DEAD) { + if (temp_state[args[0]] == TS_DEAD) { /* Both parts of the operation are dead. */ goto do_remove; } @@ -1496,8 +1499,7 @@ static void tcg_liveness_analysis(TCGContext *s) op->opc = opc = opc_new; args[1] = args[2]; args[2] = args[3]; - } else if (have_opc_new2 && dead_temps[args[0]] - && !mem_temps[args[0]]) { + } else if (temp_state[args[0]] == TS_DEAD && have_opc_new2) { /* The low part of the operation is dead; generate the high. 
*/ op->opc = opc = opc_new2; args[0] = args[1]; @@ -1520,8 +1522,7 @@ static void tcg_liveness_analysis(TCGContext *s) implies side effects */ if (!(def->flags & TCG_OPF_SIDE_EFFECTS) && nb_oargs != 0) { for (i = 0; i < nb_oargs; i++) { - arg = args[i]; - if (!dead_temps[arg] || mem_temps[arg]) { + if (temp_state[args[i]] != TS_DEAD) { goto do_not_remove; } } @@ -1532,35 +1533,35 @@ static void tcg_liveness_analysis(TCGContext *s) /* output args are dead */ for (i = 0; i < nb_oargs; i++) { arg = args[i]; - if (dead_temps[arg]) { + if (temp_state[arg] & TS_DEAD) { arg_life |= DEAD_ARG << i; } - if (mem_temps[arg]) { + if (temp_state[arg] & TS_MEM) { arg_life |= SYNC_ARG << i; } - dead_temps[arg] = 1; - mem_temps[arg] = 0; + temp_state[arg] = TS_DEAD; } /* if end of basic block, update */ if (def->flags & TCG_OPF_BB_END) { - tcg_la_bb_end(s, dead_temps, mem_temps); + tcg_la_bb_end(s, temp_state); } else if (def->flags & TCG_OPF_SIDE_EFFECTS) { /* globals should be synced to memory */ - memset(mem_temps, 1, s->nb_globals); + for (i = 0; i < nb_globals; i++) { + temp_state[i] |= TS_MEM; + } } /* record arguments that die in this opcode */ for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) { arg = args[i]; - if (dead_temps[arg]) { + if (temp_state[arg] & TS_DEAD) { arg_life |= DEAD_ARG << i; } } /* input arguments are live for preceding opcodes */ for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) { - arg = args[i]; - dead_temps[arg] = 0; + temp_state[args[i]] &= ~TS_DEAD; } } break; From bdfb460ef77500f7b186759b585f06ff2120929d Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Thu, 23 Jun 2016 19:15:55 -0700 Subject: [PATCH 5/7] tcg: Include liveness info in the dumps Reviewed-by: Aurelien Jarno Signed-off-by: Richard Henderson --- include/qemu/log.h | 2 +- tcg/tcg.c | 68 ++++++++++++++++++++++++++++++++-------------- util/log.c | 19 +++++++++---- 3 files changed, 61 insertions(+), 28 deletions(-) diff --git a/include/qemu/log.h b/include/qemu/log.h index 
8bec6b4039..9ab8f51188 100644 --- a/include/qemu/log.h +++ b/include/qemu/log.h @@ -54,7 +54,7 @@ static inline bool qemu_loglevel_mask(int mask) /* main logging function */ -void GCC_FMT_ATTR(1, 2) qemu_log(const char *fmt, ...); +int GCC_FMT_ATTR(1, 2) qemu_log(const char *fmt, ...); /* vfprintf-like logging function */ diff --git a/tcg/tcg.c b/tcg/tcg.c index 27bbb4dda2..b0a88bae3a 100644 --- a/tcg/tcg.c +++ b/tcg/tcg.c @@ -1026,6 +1026,7 @@ void tcg_dump_ops(TCGContext *s) const TCGOpDef *def; const TCGArg *args; TCGOpcode c; + int col = 0; op = &s->gen_op_buf[oi]; c = op->opc; @@ -1033,7 +1034,7 @@ void tcg_dump_ops(TCGContext *s) args = &s->gen_opparam_buf[op->args]; if (c == INDEX_op_insn_start) { - qemu_log("%s ----", oi != s->gen_op_buf[0].next ? "\n" : ""); + col += qemu_log("%s ----", oi != s->gen_op_buf[0].next ? "\n" : ""); for (i = 0; i < TARGET_INSN_START_WORDS; ++i) { target_ulong a; @@ -1042,7 +1043,7 @@ void tcg_dump_ops(TCGContext *s) #else a = args[i]; #endif - qemu_log(" " TARGET_FMT_lx, a); + col += qemu_log(" " TARGET_FMT_lx, a); } } else if (c == INDEX_op_call) { /* variable number of arguments */ @@ -1051,12 +1052,12 @@ void tcg_dump_ops(TCGContext *s) nb_cargs = def->nb_cargs; /* function name, flags, out args */ - qemu_log(" %s %s,$0x%" TCG_PRIlx ",$%d", def->name, - tcg_find_helper(s, args[nb_oargs + nb_iargs]), - args[nb_oargs + nb_iargs + 1], nb_oargs); + col += qemu_log(" %s %s,$0x%" TCG_PRIlx ",$%d", def->name, + tcg_find_helper(s, args[nb_oargs + nb_iargs]), + args[nb_oargs + nb_iargs + 1], nb_oargs); for (i = 0; i < nb_oargs; i++) { - qemu_log(",%s", tcg_get_arg_str_idx(s, buf, sizeof(buf), - args[i])); + col += qemu_log(",%s", tcg_get_arg_str_idx(s, buf, sizeof(buf), + args[i])); } for (i = 0; i < nb_iargs; i++) { TCGArg arg = args[nb_oargs + i]; @@ -1064,10 +1065,10 @@ void tcg_dump_ops(TCGContext *s) if (arg != TCG_CALL_DUMMY_ARG) { t = tcg_get_arg_str_idx(s, buf, sizeof(buf), arg); } - qemu_log(",%s", t); + col += 
qemu_log(",%s", t); } } else { - qemu_log(" %s ", def->name); + col += qemu_log(" %s ", def->name); nb_oargs = def->nb_oargs; nb_iargs = def->nb_iargs; @@ -1076,17 +1077,17 @@ void tcg_dump_ops(TCGContext *s) k = 0; for (i = 0; i < nb_oargs; i++) { if (k != 0) { - qemu_log(","); + col += qemu_log(","); } - qemu_log("%s", tcg_get_arg_str_idx(s, buf, sizeof(buf), - args[k++])); + col += qemu_log("%s", tcg_get_arg_str_idx(s, buf, sizeof(buf), + args[k++])); } for (i = 0; i < nb_iargs; i++) { if (k != 0) { - qemu_log(","); + col += qemu_log(","); } - qemu_log("%s", tcg_get_arg_str_idx(s, buf, sizeof(buf), - args[k++])); + col += qemu_log("%s", tcg_get_arg_str_idx(s, buf, sizeof(buf), + args[k++])); } switch (c) { case INDEX_op_brcond_i32: @@ -1098,9 +1099,9 @@ void tcg_dump_ops(TCGContext *s) case INDEX_op_setcond_i64: case INDEX_op_movcond_i64: if (args[k] < ARRAY_SIZE(cond_name) && cond_name[args[k]]) { - qemu_log(",%s", cond_name[args[k++]]); + col += qemu_log(",%s", cond_name[args[k++]]); } else { - qemu_log(",$0x%" TCG_PRIlx, args[k++]); + col += qemu_log(",$0x%" TCG_PRIlx, args[k++]); } i = 1; break; @@ -1114,12 +1115,12 @@ void tcg_dump_ops(TCGContext *s) unsigned ix = get_mmuidx(oi); if (op & ~(MO_AMASK | MO_BSWAP | MO_SSIZE)) { - qemu_log(",$0x%x,%u", op, ix); + col += qemu_log(",$0x%x,%u", op, ix); } else { const char *s_al, *s_op; s_al = alignment_name[(op & MO_AMASK) >> MO_ASHIFT]; s_op = ldst_name[op & (MO_BSWAP | MO_SSIZE)]; - qemu_log(",%s%s,%u", s_al, s_op, ix); + col += qemu_log(",%s%s,%u", s_al, s_op, ix); } i = 1; } @@ -1134,14 +1135,39 @@ void tcg_dump_ops(TCGContext *s) case INDEX_op_brcond_i32: case INDEX_op_brcond_i64: case INDEX_op_brcond2_i32: - qemu_log("%s$L%d", k ? "," : "", arg_label(args[k])->id); + col += qemu_log("%s$L%d", k ? "," : "", arg_label(args[k])->id); i++, k++; break; default: break; } for (; i < nb_cargs; i++, k++) { - qemu_log("%s$0x%" TCG_PRIlx, k ? "," : "", args[k]); + col += qemu_log("%s$0x%" TCG_PRIlx, k ? 
"," : "", args[k]); + } + } + if (op->life) { + unsigned life = op->life; + + for (; col < 48; ++col) { + putc(' ', qemu_logfile); + } + + if (life & (SYNC_ARG * 3)) { + qemu_log(" sync:"); + for (i = 0; i < 2; ++i) { + if (life & (SYNC_ARG << i)) { + qemu_log(" %d", i); + } + } + } + life /= DEAD_ARG; + if (life) { + qemu_log(" dead:"); + for (i = 0; life; ++i, life >>= 1) { + if (life & 1) { + qemu_log(" %d", i); + } + } } } qemu_log("\n"); diff --git a/util/log.c b/util/log.c index b6c75b1102..9f0844481c 100644 --- a/util/log.c +++ b/util/log.c @@ -32,15 +32,22 @@ int qemu_loglevel; static int log_append = 0; static GArray *debug_regions; -void qemu_log(const char *fmt, ...) +/* Return the number of characters emitted. */ +int qemu_log(const char *fmt, ...) { - va_list ap; - - va_start(ap, fmt); + int ret = 0; if (qemu_logfile) { - vfprintf(qemu_logfile, fmt, ap); + va_list ap; + va_start(ap, fmt); + ret = vfprintf(qemu_logfile, fmt, ap); + va_end(ap); + + /* Don't pass back error results. 
*/ + if (ret < 0) { + ret = 0; + } } - va_end(ap); + return ret; } static bool log_uses_own_buffers; From c0ef05b5e62ab0c291a94022f14104e61e306f03 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Wed, 22 Jun 2016 14:24:32 -0700 Subject: [PATCH 6/7] tcg: Require liveness analysis Reviewed-by: Aurelien Jarno Signed-off-by: Richard Henderson --- tcg/tcg.c | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/tcg/tcg.c b/tcg/tcg.c index b0a88bae3a..3c1f5262a8 100644 --- a/tcg/tcg.c +++ b/tcg/tcg.c @@ -23,7 +23,6 @@ */ /* define it to use liveness analysis (better code) */ -#define USE_LIVENESS_ANALYSIS #define USE_TCG_OPTIMIZATIONS #include "qemu/osdep.h" @@ -1337,8 +1336,6 @@ void tcg_op_remove(TCGContext *s, TCGOp *op) #endif } -#ifdef USE_LIVENESS_ANALYSIS - #define TS_DEAD 1 #define TS_MEM 2 @@ -1595,18 +1592,6 @@ static void tcg_liveness_analysis(TCGContext *s) op->life = arg_life; } } -#else -/* dummy liveness analysis */ -static void tcg_liveness_analysis(TCGContext *s) -{ - int nb_ops = s->gen_next_op_idx; - - s->op_dead_args = tcg_malloc(nb_ops * sizeof(uint16_t)); - memset(s->op_dead_args, 0, nb_ops * sizeof(uint16_t)); - s->op_sync_args = tcg_malloc(nb_ops * sizeof(uint8_t)); - memset(s->op_sync_args, 0, nb_ops * sizeof(uint8_t)); -} -#endif #ifdef CONFIG_DEBUG_TCG static void dump_regs(TCGContext *s) @@ -1858,7 +1843,6 @@ static void temp_load(TCGContext *s, TCGTemp *ts, TCGRegSet desired_regs, temporary registers needs to be allocated to store a constant. */ static void temp_save(TCGContext *s, TCGTemp *ts, TCGRegSet allocated_regs) { -#ifdef USE_LIVENESS_ANALYSIS /* ??? Liveness does not yet incorporate indirect bases. 
*/ if (!ts->indirect_base) { /* The liveness analysis already ensures that globals are back @@ -1866,7 +1850,6 @@ static void temp_save(TCGContext *s, TCGTemp *ts, TCGRegSet allocated_regs) tcg_debug_assert(ts->val_type == TEMP_VAL_MEM || ts->fixed_reg); return; } -#endif temp_sync(s, ts, allocated_regs, 1); } @@ -1891,7 +1874,6 @@ static void sync_globals(TCGContext *s, TCGRegSet allocated_regs) for (i = 0; i < s->nb_globals; i++) { TCGTemp *ts = &s->temps[i]; -#ifdef USE_LIVENESS_ANALYSIS /* ??? Liveness does not yet incorporate indirect bases. */ if (!ts->indirect_base) { tcg_debug_assert(ts->val_type != TEMP_VAL_REG @@ -1899,7 +1881,6 @@ static void sync_globals(TCGContext *s, TCGRegSet allocated_regs) || ts->mem_coherent); continue; } -#endif temp_sync(s, ts, allocated_regs, 0); } } @@ -1915,7 +1896,6 @@ static void tcg_reg_alloc_bb_end(TCGContext *s, TCGRegSet allocated_regs) if (ts->temp_local) { temp_save(s, ts, allocated_regs); } else { -#ifdef USE_LIVENESS_ANALYSIS /* ??? Liveness does not yet incorporate indirect bases. */ if (!ts->indirect_base) { /* The liveness analysis already ensures that temps are dead. @@ -1923,7 +1903,6 @@ static void tcg_reg_alloc_bb_end(TCGContext *s, TCGRegSet allocated_regs) tcg_debug_assert(ts->val_type == TEMP_VAL_DEAD); continue; } -#endif temp_dead(s, ts); } } From 5a18407f55ade924aa6397c9a043a9ffd59645fe Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Thu, 23 Jun 2016 20:34:33 -0700 Subject: [PATCH 7/7] tcg: Lower indirect registers in a separate pass Rather than rely on recursion during the middle of register allocation, lower indirect registers to loads and stores off the indirect base into plain temps. For an x86_64 host, with sufficient registers, this results in identical code, modulo the actual register assignments. For an i686 host, with insufficient registers, this means that temps can be (temporarily) spilled to the stack in order to satisfy an allocation. 
This as opposed to the possibility of not being able to spill, to allocate a register for the indirect base, in order to perform a spill. Reviewed-by: Aurelien Jarno Signed-off-by: Richard Henderson --- include/qemu/log.h | 1 + tcg/optimize.c | 31 +---- tcg/tcg.c | 306 +++++++++++++++++++++++++++++++++++++-------- tcg/tcg.h | 4 + util/log.c | 5 +- 5 files changed, 263 insertions(+), 84 deletions(-) diff --git a/include/qemu/log.h b/include/qemu/log.h index 9ab8f51188..00bf37fc0f 100644 --- a/include/qemu/log.h +++ b/include/qemu/log.h @@ -42,6 +42,7 @@ static inline bool qemu_log_separate(void) #define CPU_LOG_TB_NOCHAIN (1 << 13) #define CPU_LOG_PAGE (1 << 14) #define LOG_TRACE (1 << 15) +#define CPU_LOG_TB_OP_IND (1 << 16) /* Returns true if a bit is set in the current loglevel mask */ diff --git a/tcg/optimize.c b/tcg/optimize.c index 8df7fc7f69..cffe89b525 100644 --- a/tcg/optimize.c +++ b/tcg/optimize.c @@ -82,33 +82,6 @@ static void init_temp_info(TCGArg temp) } } -static TCGOp *insert_op_before(TCGContext *s, TCGOp *old_op, - TCGOpcode opc, int nargs) -{ - int oi = s->gen_next_op_idx; - int pi = s->gen_next_parm_idx; - int prev = old_op->prev; - int next = old_op - s->gen_op_buf; - TCGOp *new_op; - - tcg_debug_assert(oi < OPC_BUF_SIZE); - tcg_debug_assert(pi + nargs <= OPPARAM_BUF_SIZE); - s->gen_next_op_idx = oi + 1; - s->gen_next_parm_idx = pi + nargs; - - new_op = &s->gen_op_buf[oi]; - *new_op = (TCGOp){ - .opc = opc, - .args = pi, - .prev = prev, - .next = next - }; - s->gen_op_buf[prev].next = oi; - old_op->prev = oi; - - return new_op; -} - static int op_bits(TCGOpcode op) { const TCGOpDef *def = &tcg_op_defs[op]; @@ -1116,7 +1089,7 @@ void tcg_optimize(TCGContext *s) uint64_t a = ((uint64_t)ah << 32) | al; uint64_t b = ((uint64_t)bh << 32) | bl; TCGArg rl, rh; - TCGOp *op2 = insert_op_before(s, op, INDEX_op_movi_i32, 2); + TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_movi_i32, 2); TCGArg *args2 = &s->gen_opparam_buf[op2->args]; if (opc == 
INDEX_op_add2_i32) { @@ -1142,7 +1115,7 @@ void tcg_optimize(TCGContext *s) uint32_t b = temps[args[3]].val; uint64_t r = (uint64_t)a * b; TCGArg rl, rh; - TCGOp *op2 = insert_op_before(s, op, INDEX_op_movi_i32, 2); + TCGOp *op2 = tcg_op_insert_before(s, op, INDEX_op_movi_i32, 2); TCGArg *args2 = &s->gen_opparam_buf[op2->args]; rl = args[0]; diff --git a/tcg/tcg.c b/tcg/tcg.c index 3c1f5262a8..42417bdc92 100644 --- a/tcg/tcg.c +++ b/tcg/tcg.c @@ -531,8 +531,12 @@ int tcg_global_mem_new_internal(TCGType type, TCGv_ptr base, #endif if (!base_ts->fixed_reg) { - indirect_reg = 1; + /* We do not support double-indirect registers. */ + tcg_debug_assert(!base_ts->indirect_reg); base_ts->indirect_base = 1; + s->nb_indirects += (TCG_TARGET_REG_BITS == 32 && type == TCG_TYPE_I64 + ? 2 : 1); + indirect_reg = 1; } if (TCG_TARGET_REG_BITS == 32 && type == TCG_TYPE_I64) { @@ -1336,9 +1340,66 @@ void tcg_op_remove(TCGContext *s, TCGOp *op) #endif } +TCGOp *tcg_op_insert_before(TCGContext *s, TCGOp *old_op, + TCGOpcode opc, int nargs) +{ + int oi = s->gen_next_op_idx; + int pi = s->gen_next_parm_idx; + int prev = old_op->prev; + int next = old_op - s->gen_op_buf; + TCGOp *new_op; + + tcg_debug_assert(oi < OPC_BUF_SIZE); + tcg_debug_assert(pi + nargs <= OPPARAM_BUF_SIZE); + s->gen_next_op_idx = oi + 1; + s->gen_next_parm_idx = pi + nargs; + + new_op = &s->gen_op_buf[oi]; + *new_op = (TCGOp){ + .opc = opc, + .args = pi, + .prev = prev, + .next = next + }; + s->gen_op_buf[prev].next = oi; + old_op->prev = oi; + + return new_op; +} + +TCGOp *tcg_op_insert_after(TCGContext *s, TCGOp *old_op, + TCGOpcode opc, int nargs) +{ + int oi = s->gen_next_op_idx; + int pi = s->gen_next_parm_idx; + int prev = old_op - s->gen_op_buf; + int next = old_op->next; + TCGOp *new_op; + + tcg_debug_assert(oi < OPC_BUF_SIZE); + tcg_debug_assert(pi + nargs <= OPPARAM_BUF_SIZE); + s->gen_next_op_idx = oi + 1; + s->gen_next_parm_idx = pi + nargs; + + new_op = &s->gen_op_buf[oi]; + *new_op = (TCGOp){ + .opc = 
opc, + .args = pi, + .prev = prev, + .next = next + }; + s->gen_op_buf[next].prev = oi; + old_op->next = oi; + + return new_op; +} + #define TS_DEAD 1 #define TS_MEM 2 +#define IS_DEAD_ARG(n) (arg_life & (DEAD_ARG << (n))) +#define NEED_SYNC_ARG(n) (arg_life & (SYNC_ARG << (n))) + /* liveness analysis: end of function: all temps are dead, and globals should be in memory. */ static inline void tcg_la_func_end(TCGContext *s, uint8_t *temp_state) @@ -1364,13 +1425,11 @@ static inline void tcg_la_bb_end(TCGContext *s, uint8_t *temp_state) /* Liveness analysis : update the opc_arg_life array to tell if a given input arguments is dead. Instructions updating dead temporaries are removed. */ -static void tcg_liveness_analysis(TCGContext *s) +static void liveness_pass_1(TCGContext *s, uint8_t *temp_state) { - uint8_t *temp_state; - int oi, oi_prev; int nb_globals = s->nb_globals; + int oi, oi_prev; - temp_state = tcg_malloc(s->nb_temps); tcg_la_func_end(s, temp_state); for (oi = s->gen_op_buf[0].prev; oi != 0; oi = oi_prev) { @@ -1593,6 +1652,165 @@ static void tcg_liveness_analysis(TCGContext *s) } } +/* Liveness analysis: Convert indirect regs to direct temporaries. */ +static bool liveness_pass_2(TCGContext *s, uint8_t *temp_state) +{ + int nb_globals = s->nb_globals; + int16_t *dir_temps; + int i, oi, oi_next; + bool changes = false; + + dir_temps = tcg_malloc(nb_globals * sizeof(int16_t)); + memset(dir_temps, 0, nb_globals * sizeof(int16_t)); + + /* Create a temporary for each indirect global. 
*/ + for (i = 0; i < nb_globals; ++i) { + TCGTemp *its = &s->temps[i]; + if (its->indirect_reg) { + TCGTemp *dts = tcg_temp_alloc(s); + dts->type = its->type; + dts->base_type = its->base_type; + dir_temps[i] = temp_idx(s, dts); + } + } + + memset(temp_state, TS_DEAD, nb_globals); + + for (oi = s->gen_op_buf[0].next; oi != 0; oi = oi_next) { + TCGOp *op = &s->gen_op_buf[oi]; + TCGArg *args = &s->gen_opparam_buf[op->args]; + TCGOpcode opc = op->opc; + const TCGOpDef *def = &tcg_op_defs[opc]; + TCGLifeData arg_life = op->life; + int nb_iargs, nb_oargs, call_flags; + TCGArg arg, dir; + + oi_next = op->next; + + if (opc == INDEX_op_call) { + nb_oargs = op->callo; + nb_iargs = op->calli; + call_flags = args[nb_oargs + nb_iargs + 1]; + } else { + nb_iargs = def->nb_iargs; + nb_oargs = def->nb_oargs; + + /* Set flags similar to how calls require. */ + if (def->flags & TCG_OPF_BB_END) { + /* Like writing globals: save_globals */ + call_flags = 0; + } else if (def->flags & TCG_OPF_SIDE_EFFECTS) { + /* Like reading globals: sync_globals */ + call_flags = TCG_CALL_NO_WRITE_GLOBALS; + } else { + /* No effect on globals. */ + call_flags = (TCG_CALL_NO_READ_GLOBALS | + TCG_CALL_NO_WRITE_GLOBALS); + } + } + + /* Make sure that input arguments are available. */ + for (i = nb_oargs; i < nb_iargs + nb_oargs; i++) { + arg = args[i]; + /* Note this unsigned test catches TCG_CALL_ARG_DUMMY too. */ + if (arg < nb_globals) { + dir = dir_temps[arg]; + if (dir != 0 && temp_state[arg] == TS_DEAD) { + TCGTemp *its = &s->temps[arg]; + TCGOpcode lopc = (its->type == TCG_TYPE_I32 + ? INDEX_op_ld_i32 + : INDEX_op_ld_i64); + TCGOp *lop = tcg_op_insert_before(s, op, lopc, 3); + TCGArg *largs = &s->gen_opparam_buf[lop->args]; + + largs[0] = dir; + largs[1] = temp_idx(s, its->mem_base); + largs[2] = its->mem_offset; + + /* Loaded, but synced with memory. */ + temp_state[arg] = TS_MEM; + } + } + } + + /* Perform input replacement, and mark inputs that became dead. 
+ No action is required except keeping temp_state up to date + so that we reload when needed. */ + for (i = nb_oargs; i < nb_iargs + nb_oargs; i++) { + arg = args[i]; + if (arg < nb_globals) { + dir = dir_temps[arg]; + if (dir != 0) { + args[i] = dir; + changes = true; + if (IS_DEAD_ARG(i)) { + temp_state[arg] = TS_DEAD; + } + } + } + } + + /* Liveness analysis should ensure that the following are + all correct, for call sites and basic block end points. */ + if (call_flags & TCG_CALL_NO_READ_GLOBALS) { + /* Nothing to do */ + } else if (call_flags & TCG_CALL_NO_WRITE_GLOBALS) { + for (i = 0; i < nb_globals; ++i) { + /* Liveness should see that globals are synced back, + that is, either TS_DEAD or TS_MEM. */ + tcg_debug_assert(dir_temps[i] == 0 + || temp_state[i] != 0); + } + } else { + for (i = 0; i < nb_globals; ++i) { + /* Liveness should see that globals are saved back, + that is, TS_DEAD, waiting to be reloaded. */ + tcg_debug_assert(dir_temps[i] == 0 + || temp_state[i] == TS_DEAD); + } + } + + /* Outputs become available. */ + for (i = 0; i < nb_oargs; i++) { + arg = args[i]; + if (arg >= nb_globals) { + continue; + } + dir = dir_temps[arg]; + if (dir == 0) { + continue; + } + args[i] = dir; + changes = true; + + /* The output is now live and modified. */ + temp_state[arg] = 0; + + /* Sync outputs upon their last write. */ + if (NEED_SYNC_ARG(i)) { + TCGTemp *its = &s->temps[arg]; + TCGOpcode sopc = (its->type == TCG_TYPE_I32 + ? INDEX_op_st_i32 + : INDEX_op_st_i64); + TCGOp *sop = tcg_op_insert_after(s, op, sopc, 3); + TCGArg *sargs = &s->gen_opparam_buf[sop->args]; + + sargs[0] = dir; + sargs[1] = temp_idx(s, its->mem_base); + sargs[2] = its->mem_offset; + + temp_state[arg] = TS_MEM; + } + /* Drop outputs that are dead. 
*/ + if (IS_DEAD_ARG(i)) { + temp_state[arg] = TS_DEAD; + } + } + } + + return changes; +} + #ifdef CONFIG_DEBUG_TCG static void dump_regs(TCGContext *s) { @@ -1723,14 +1941,6 @@ static void temp_sync(TCGContext *s, TCGTemp *ts, if (!ts->mem_allocated) { temp_allocate_frame(s, temp_idx(s, ts)); } - if (ts->indirect_reg) { - if (ts->val_type == TEMP_VAL_REG) { - tcg_regset_set_reg(allocated_regs, ts->reg); - } - temp_load(s, ts->mem_base, - tcg_target_available_regs[TCG_TYPE_PTR], - allocated_regs); - } switch (ts->val_type) { case TEMP_VAL_CONST: /* If we're going to free the temp immediately, then we won't @@ -1821,12 +2031,6 @@ static void temp_load(TCGContext *s, TCGTemp *ts, TCGRegSet desired_regs, break; case TEMP_VAL_MEM: reg = tcg_reg_alloc(s, desired_regs, allocated_regs, ts->indirect_base); - if (ts->indirect_reg) { - tcg_regset_set_reg(allocated_regs, reg); - temp_load(s, ts->mem_base, - tcg_target_available_regs[TCG_TYPE_PTR], - allocated_regs); - } tcg_out_ld(s, ts->type, reg, ts->mem_base->reg, ts->mem_offset); ts->mem_coherent = 1; break; @@ -1843,14 +2047,9 @@ static void temp_load(TCGContext *s, TCGTemp *ts, TCGRegSet desired_regs, temporary registers needs to be allocated to store a constant. */ static void temp_save(TCGContext *s, TCGTemp *ts, TCGRegSet allocated_regs) { - /* ??? Liveness does not yet incorporate indirect bases. */ - if (!ts->indirect_base) { - /* The liveness analysis already ensures that globals are back - in memory. Keep an tcg_debug_assert for safety. */ - tcg_debug_assert(ts->val_type == TEMP_VAL_MEM || ts->fixed_reg); - return; - } - temp_sync(s, ts, allocated_regs, 1); + /* The liveness analysis already ensures that globals are back + in memory. Keep an tcg_debug_assert for safety. 
*/ + tcg_debug_assert(ts->val_type == TEMP_VAL_MEM || ts->fixed_reg); } /* save globals to their canonical location and assume they can be @@ -1874,14 +2073,9 @@ static void sync_globals(TCGContext *s, TCGRegSet allocated_regs) for (i = 0; i < s->nb_globals; i++) { TCGTemp *ts = &s->temps[i]; - /* ??? Liveness does not yet incorporate indirect bases. */ - if (!ts->indirect_base) { - tcg_debug_assert(ts->val_type != TEMP_VAL_REG - || ts->fixed_reg - || ts->mem_coherent); - continue; - } - temp_sync(s, ts, allocated_regs, 0); + tcg_debug_assert(ts->val_type != TEMP_VAL_REG + || ts->fixed_reg + || ts->mem_coherent); } } @@ -1896,23 +2090,15 @@ static void tcg_reg_alloc_bb_end(TCGContext *s, TCGRegSet allocated_regs) if (ts->temp_local) { temp_save(s, ts, allocated_regs); } else { - /* ??? Liveness does not yet incorporate indirect bases. */ - if (!ts->indirect_base) { - /* The liveness analysis already ensures that temps are dead. - Keep an tcg_debug_assert for safety. */ - tcg_debug_assert(ts->val_type == TEMP_VAL_DEAD); - continue; - } - temp_dead(s, ts); + /* The liveness analysis already ensures that temps are dead. + Keep an tcg_debug_assert for safety. 
*/ + tcg_debug_assert(ts->val_type == TEMP_VAL_DEAD); } } save_globals(s, allocated_regs); } -#define IS_DEAD_ARG(n) (arg_life & (DEAD_ARG << (n))) -#define NEED_SYNC_ARG(n) (arg_life & (SYNC_ARG << (n))) - static void tcg_reg_alloc_movi(TCGContext *s, const TCGArg *args, TCGLifeData arg_life) { @@ -1975,12 +2161,6 @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOpDef *def, if (!ots->mem_allocated) { temp_allocate_frame(s, args[0]); } - if (ots->indirect_reg) { - tcg_regset_set_reg(allocated_regs, ts->reg); - temp_load(s, ots->mem_base, - tcg_target_available_regs[TCG_TYPE_PTR], - allocated_regs); - } tcg_out_st(s, otype, ts->reg, ots->mem_base->reg, ots->mem_offset); if (IS_DEAD_ARG(1)) { temp_dead(s, ts); @@ -2385,7 +2565,27 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb) s->la_time -= profile_getclock(); #endif - tcg_liveness_analysis(s); + { + uint8_t *temp_state = tcg_malloc(s->nb_temps + s->nb_indirects); + + liveness_pass_1(s, temp_state); + + if (s->nb_indirects > 0) { +#ifdef DEBUG_DISAS + if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP_IND) + && qemu_log_in_addr_range(tb->pc))) { + qemu_log("OP before indirect lowering:\n"); + tcg_dump_ops(s); + qemu_log("\n"); + } +#endif + /* Replace indirect temps with direct temps. */ + if (liveness_pass_2(s, temp_state)) { + /* If changes were made, re-run liveness. 
*/ + liveness_pass_1(s, temp_state); + } + } + } #ifdef CONFIG_PROFILER s->la_time += profile_getclock(); diff --git a/tcg/tcg.h b/tcg/tcg.h index ebf68670f6..1bcabcad9d 100644 --- a/tcg/tcg.h +++ b/tcg/tcg.h @@ -617,6 +617,7 @@ struct TCGContext { int nb_labels; int nb_globals; int nb_temps; + int nb_indirects; /* goto_tb support */ tcg_insn_unit *code_buf; @@ -898,6 +899,9 @@ void tcg_gen_callN(TCGContext *s, void *func, TCGArg ret, int nargs, TCGArg *args); void tcg_op_remove(TCGContext *s, TCGOp *op); +TCGOp *tcg_op_insert_before(TCGContext *s, TCGOp *op, TCGOpcode opc, int narg); +TCGOp *tcg_op_insert_after(TCGContext *s, TCGOp *op, TCGOpcode opc, int narg); + void tcg_optimize(TCGContext *s); /* only used for debugging purposes */ diff --git a/util/log.c b/util/log.c index 9f0844481c..54b54e868a 100644 --- a/util/log.c +++ b/util/log.c @@ -247,8 +247,9 @@ const QEMULogItem qemu_log_items[] = { { CPU_LOG_TB_OP, "op", "show micro ops for each compiled TB" }, { CPU_LOG_TB_OP_OPT, "op_opt", - "show micro ops (x86 only: before eflags optimization) and\n" - "after liveness analysis" }, + "show micro ops after optimization" }, + { CPU_LOG_TB_OP_IND, "op_ind", + "show micro ops before indirect lowering" }, { CPU_LOG_INT, "int", "show interrupts/exceptions in short format" }, { CPU_LOG_EXEC, "exec",