Queued TCG patches
-----BEGIN PGP SIGNATURE-----

iQEcBAABAgAGBQJZSBP2AAoJEK0ScMxN0CebnyMH/1ZiDhYiqCD7PYfk4/Y7Db+h
MNKNozrWKyChWQp1RzwWqcBaIzbuMZkDYn8dfS419PNtFRNoYtHjhYvjSTfcrxS0
U8dGOoqQUHCr/jlyIDUE4y5+aFA9R/1Ih5IQv+QCi5QNXcfeST8zcYF+ImuikP6C
7heIc7dE9kXdA8ycWJ39kYErHK9qEJbvDx6dxMPmb4cM36U239Zb9so985TXULlQ
LoHrDpOCBzCbsICBE8iP2RKDvcwENIx21Dwv+9gW/NqR+nRdKcxhTjKEodkS8gl/
UxMxM/TjIPQOLLUhdck5DFgIgBgQWHRqPMJKqt466I0JlXvSpifmWxckWzslXLc=
=R+em
-----END PGP SIGNATURE-----

Merge remote-tracking branch 'remotes/rth/tags/pull-tcg-20170619' into staging

Queued TCG patches

# gpg: Signature made Mon 19 Jun 2017 19:12:06 BST
# gpg: using RSA key 0xAD1270CC4DD0279B
# gpg: Good signature from "Richard Henderson <rth7680@gmail.com>"
# gpg: aka "Richard Henderson <rth@redhat.com>"
# gpg: aka "Richard Henderson <rth@twiddle.net>"
# Primary key fingerprint: 9CB1 8DDA F8E8 49AD 2AFC 16A4 AD12 70CC 4DD0 279B

* remotes/rth/tags/pull-tcg-20170619:
  target/arm: Exit after clearing aarch64 interrupt mask
  target/s390x: Exit after changing PSW mask
  target/alpha: Use tcg_gen_lookup_and_goto_ptr
  tcg: Increase hit rate of lookup_tb_ptr
  tcg/arm: Use ldr (literal) for goto_tb
  tcg/arm: Try pc-relative addresses for movi
  tcg/arm: Remove limit on code buffer size
  tcg/arm: Use indirect branch for goto_tb
  tcg/aarch64: Use ADR in tcg_out_movi
  translate-all: consolidate tb init in tb_gen_code
  tcg: allocate TB structs before the corresponding translated code
  util: add cacheinfo

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
commit db7a99cdc1
@@ -523,8 +523,6 @@ static inline PageDesc *page_find(tb_page_addr_t index)
# define MAX_CODE_GEN_BUFFER_SIZE (32u * 1024 * 1024)
#elif defined(__aarch64__)
# define MAX_CODE_GEN_BUFFER_SIZE (128ul * 1024 * 1024)
#elif defined(__arm__)
# define MAX_CODE_GEN_BUFFER_SIZE (16u * 1024 * 1024)
#elif defined(__s390x__)
  /* We have a +- 4GB range on the branches; leave some slop. */
# define MAX_CODE_GEN_BUFFER_SIZE (3ul * 1024 * 1024 * 1024)

@@ -781,12 +779,13 @@ static inline void code_gen_alloc(size_t tb_size)
        exit(1);
    }

    /* Estimate a good size for the number of TBs we can support. We
       still haven't deducted the prologue from the buffer size here,
       but that's minimal and won't affect the estimate much. */
    tcg_ctx.code_gen_max_blocks
        = tcg_ctx.code_gen_buffer_size / CODE_GEN_AVG_BLOCK_SIZE;
    tcg_ctx.tb_ctx.tbs = g_new(TranslationBlock, tcg_ctx.code_gen_max_blocks);
    /* size this conservatively -- realloc later if needed */
    tcg_ctx.tb_ctx.tbs_size =
        tcg_ctx.code_gen_buffer_size / CODE_GEN_AVG_BLOCK_SIZE / 8;
    if (unlikely(!tcg_ctx.tb_ctx.tbs_size)) {
        tcg_ctx.tb_ctx.tbs_size = 64 * 1024;
    }
    tcg_ctx.tb_ctx.tbs = g_new(TranslationBlock *, tcg_ctx.tb_ctx.tbs_size);

    qemu_mutex_init(&tcg_ctx.tb_ctx.tb_lock);
}

@@ -828,16 +827,20 @@ bool tcg_enabled(void)
static TranslationBlock *tb_alloc(target_ulong pc)
{
    TranslationBlock *tb;
    TBContext *ctx;

    assert_tb_locked();

    if (tcg_ctx.tb_ctx.nb_tbs >= tcg_ctx.code_gen_max_blocks) {
    tb = tcg_tb_alloc(&tcg_ctx);
    if (unlikely(tb == NULL)) {
        return NULL;
    }
    tb = &tcg_ctx.tb_ctx.tbs[tcg_ctx.tb_ctx.nb_tbs++];
    tb->pc = pc;
    tb->cflags = 0;
    tb->invalid = false;
    ctx = &tcg_ctx.tb_ctx;
    if (unlikely(ctx->nb_tbs == ctx->tbs_size)) {
        ctx->tbs_size *= 2;
        ctx->tbs = g_renew(TranslationBlock *, ctx->tbs, ctx->tbs_size);
    }
    ctx->tbs[ctx->nb_tbs++] = tb;
    return tb;
}

@@ -850,8 +853,10 @@ void tb_free(TranslationBlock *tb)
       Ignore the hard cases and just back up if this TB happens to
       be the last one generated. */
    if (tcg_ctx.tb_ctx.nb_tbs > 0 &&
        tb == &tcg_ctx.tb_ctx.tbs[tcg_ctx.tb_ctx.nb_tbs - 1]) {
        tcg_ctx.code_gen_ptr = tb->tc_ptr;
        tb == tcg_ctx.tb_ctx.tbs[tcg_ctx.tb_ctx.nb_tbs - 1]) {
        size_t struct_size = ROUND_UP(sizeof(*tb), qemu_icache_linesize);

        tcg_ctx.code_gen_ptr = tb->tc_ptr - struct_size;
        tcg_ctx.tb_ctx.nb_tbs--;
    }
}

@@ -1279,9 +1284,11 @@ TranslationBlock *tb_gen_code(CPUState *cpu,

    gen_code_buf = tcg_ctx.code_gen_ptr;
    tb->tc_ptr = gen_code_buf;
    tb->pc = pc;
    tb->cs_base = cs_base;
    tb->flags = flags;
    tb->cflags = cflags;
    tb->invalid = false;

#ifdef CONFIG_PROFILER
    tcg_ctx.tb_count1++; /* includes aborted translations because of

@@ -1666,7 +1673,7 @@ static TranslationBlock *tb_find_pc(uintptr_t tc_ptr)
    m_max = tcg_ctx.tb_ctx.nb_tbs - 1;
    while (m_min <= m_max) {
        m = (m_min + m_max) >> 1;
        tb = &tcg_ctx.tb_ctx.tbs[m];
        tb = tcg_ctx.tb_ctx.tbs[m];
        v = (uintptr_t)tb->tc_ptr;
        if (v == tc_ptr) {
            return tb;

@@ -1676,7 +1683,7 @@ static TranslationBlock *tb_find_pc(uintptr_t tc_ptr)
            m_min = m + 1;
        }
    }
    return &tcg_ctx.tb_ctx.tbs[m_max];
    return tcg_ctx.tb_ctx.tbs[m_max];
}

#if !defined(CONFIG_USER_ONLY)

@@ -1874,7 +1881,7 @@ void dump_exec_info(FILE *f, fprintf_function cpu_fprintf)
    direct_jmp_count = 0;
    direct_jmp2_count = 0;
    for (i = 0; i < tcg_ctx.tb_ctx.nb_tbs; i++) {
        tb = &tcg_ctx.tb_ctx.tbs[i];
        tb = tcg_ctx.tb_ctx.tbs[i];
        target_code_size += tb->size;
        if (tb->size > max_target_code_size) {
            max_target_code_size = tb->size;

@@ -1894,8 +1901,7 @@ void dump_exec_info(FILE *f, fprintf_function cpu_fprintf)
    cpu_fprintf(f, "gen code size %td/%zd\n",
                tcg_ctx.code_gen_ptr - tcg_ctx.code_gen_buffer,
                tcg_ctx.code_gen_highwater - tcg_ctx.code_gen_buffer);
    cpu_fprintf(f, "TB count %d/%d\n",
                tcg_ctx.tb_ctx.nb_tbs, tcg_ctx.code_gen_max_blocks);
    cpu_fprintf(f, "TB count %d\n", tcg_ctx.tb_ctx.nb_tbs);
    cpu_fprintf(f, "TB avg target size %d max=%d bytes\n",
                tcg_ctx.tb_ctx.nb_tbs ? target_code_size /
                tcg_ctx.tb_ctx.nb_tbs : 0,

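The hunks above replace the fixed, pre-sized array of TranslationBlock structs in TBContext with an array of pointers (tbs) that is sized conservatively and doubled with g_renew() when it fills up, while the structs themselves now come from tcg_tb_alloc(). A minimal standalone model of that pointer-array bookkeeping is sketched below; the names (tb_t, track_tb) and sizes are illustrative stand-ins, not QEMU code.

/* Sketch: grow-on-demand array of TB pointers, mirroring the g_renew()
 * path in tb_alloc().  Illustrative names only. */
#include <stdio.h>
#include <stdlib.h>

typedef struct { unsigned long pc; } tb_t;   /* stand-in for TranslationBlock */

static tb_t **tbs;
static size_t tbs_size = 4;                  /* deliberately small initial size */
static size_t nb_tbs;

static void track_tb(tb_t *tb)
{
    if (nb_tbs == tbs_size) {
        tbs_size *= 2;
        /* g_renew() aborts on OOM; a plain realloc() would need a check */
        tbs = realloc(tbs, tbs_size * sizeof(*tbs));
    }
    tbs[nb_tbs++] = tb;
}

int main(void)
{
    tbs = calloc(tbs_size, sizeof(*tbs));
    for (int i = 0; i < 10; i++) {
        tb_t *tb = calloc(1, sizeof(*tb));
        tb->pc = 0x1000 + i * 4ul;
        track_tb(tb);
    }
    printf("%zu TBs tracked, capacity %zu\n", nb_tbs, tbs_size);
    return 0;
}
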
@@ -301,7 +301,7 @@ static inline void tlb_flush_by_mmuidx_all_cpus_synced(CPUState *cpu,
#define CODE_GEN_AVG_BLOCK_SIZE 150
#endif

#if defined(__arm__) || defined(_ARCH_PPC) \
#if defined(_ARCH_PPC) \
    || defined(__x86_64__) || defined(__i386__) \
    || defined(__sparc__) || defined(__aarch64__) \
    || defined(__s390x__) || defined(__mips__) \

@@ -401,9 +401,6 @@ static inline void tb_set_jmp_target1(uintptr_t jmp_addr, uintptr_t addr)
#elif defined(__aarch64__)
void aarch64_tb_set_jmp_target(uintptr_t jmp_addr, uintptr_t addr);
#define tb_set_jmp_target1 aarch64_tb_set_jmp_target
#elif defined(__arm__)
void arm_tb_set_jmp_target(uintptr_t jmp_addr, uintptr_t addr);
#define tb_set_jmp_target1 arm_tb_set_jmp_target
#elif defined(__sparc__) || defined(__mips__)
void tb_set_jmp_target1(uintptr_t jmp_addr, uintptr_t addr);
#else

@@ -31,8 +31,9 @@ typedef struct TBContext TBContext;

struct TBContext {

    TranslationBlock *tbs;
    TranslationBlock **tbs;
    struct qht htable;
    size_t tbs_size;
    int nb_tbs;
    /* any access to the tbs or the page table must use this lock */
    QemuMutex tb_lock;

@@ -483,4 +483,7 @@ char *qemu_get_pid_name(pid_t pid);
 */
pid_t qemu_fork(Error **errp);

extern int qemu_icache_linesize;
extern int qemu_dcache_linesize;

#endif

@@ -84,6 +84,7 @@ typedef enum {
       the PC (for whatever reason), so there's no need to do it again on
       exiting the TB. */
    EXIT_PC_UPDATED,
    EXIT_PC_UPDATED_NOCHAIN,

    /* We are exiting the TB, but have neither emitted a goto_tb, nor
       updated the PC for the next instruction to be executed. */

@@ -458,11 +459,17 @@ static bool in_superpage(DisasContext *ctx, int64_t addr)
#endif
}

static bool use_exit_tb(DisasContext *ctx)
{
    return ((ctx->tb->cflags & CF_LAST_IO)
            || ctx->singlestep_enabled
            || singlestep);
}

static bool use_goto_tb(DisasContext *ctx, uint64_t dest)
{
    /* Suppress goto_tb in the case of single-steping and IO. */
    if ((ctx->tb->cflags & CF_LAST_IO)
        || ctx->singlestep_enabled || singlestep) {
    if (unlikely(use_exit_tb(ctx))) {
        return false;
    }
#ifndef CONFIG_USER_ONLY

@@ -1198,7 +1205,10 @@ static ExitStatus gen_call_pal(DisasContext *ctx, int palcode)
        tcg_gen_andi_i64(tmp, ctx->ir[IR_A0], PS_INT_MASK);
        tcg_gen_st8_i64(tmp, cpu_env, offsetof(CPUAlphaState, ps));
        tcg_temp_free(tmp);
        break;

        /* Allow interrupts to be recognized right away. */
        tcg_gen_movi_i64(cpu_pc, ctx->pc);
        return EXIT_PC_UPDATED_NOCHAIN;

    case 0x36:
        /* RDPS */

@@ -1266,7 +1276,7 @@ static ExitStatus gen_call_pal(DisasContext *ctx, int palcode)
           need the page permissions check. We'll see the existence of
           the page when we create the TB, and we'll flush all TBs if
           we change the PAL base register. */
        if (!ctx->singlestep_enabled && !(ctx->tb->cflags & CF_LAST_IO)) {
        if (!use_exit_tb(ctx)) {
            tcg_gen_goto_tb(0);
            tcg_gen_movi_i64(cpu_pc, entry);
            tcg_gen_exit_tb((uintptr_t)ctx->tb);

@@ -2686,7 +2696,8 @@ static ExitStatus translate_one(DisasContext *ctx, uint32_t insn)
        tcg_gen_andi_i64(tmp, vb, 1);
        tcg_gen_st8_i64(tmp, cpu_env, offsetof(CPUAlphaState, pal_mode));
        tcg_gen_andi_i64(cpu_pc, vb, ~3);
        ret = EXIT_PC_UPDATED;
        /* Allow interrupts to be recognized right away. */
        ret = EXIT_PC_UPDATED_NOCHAIN;
        break;
#else
        goto invalid_opc;

@@ -3010,6 +3021,12 @@ void gen_intermediate_code(CPUAlphaState *env, struct TranslationBlock *tb)
        tcg_gen_movi_i64(cpu_pc, ctx.pc);
        /* FALLTHRU */
    case EXIT_PC_UPDATED:
        if (!use_exit_tb(&ctx)) {
            tcg_gen_lookup_and_goto_ptr(cpu_pc);
            break;
        }
        /* FALLTHRU */
    case EXIT_PC_UPDATED_NOCHAIN:
        if (ctx.singlestep_enabled) {
            gen_excp_1(EXCP_DEBUG, 0);
        } else {

@@ -1422,7 +1422,9 @@ static void handle_msr_i(DisasContext *s, uint32_t insn,
        gen_helper_msr_i_pstate(cpu_env, tcg_op, tcg_imm);
        tcg_temp_free_i32(tcg_imm);
        tcg_temp_free_i32(tcg_op);
        s->is_jmp = DISAS_UPDATE;
        /* For DAIFClear, exit the cpu loop to re-evaluate pending IRQs. */
        gen_a64_set_pc_im(s->pc);
        s->is_jmp = (op == 0x1f ? DISAS_EXIT : DISAS_JUMP);
        break;
    }
    default:

@@ -11369,6 +11371,9 @@ void gen_intermediate_code_a64(ARMCPU *cpu, TranslationBlock *tb)
    case DISAS_JUMP:
        tcg_gen_lookup_and_goto_ptr(cpu_pc);
        break;
    case DISAS_EXIT:
        tcg_gen_exit_tb(0);
        break;
    case DISAS_TB_JUMP:
    case DISAS_EXC:
    case DISAS_SWI:

@@ -1173,6 +1173,8 @@ typedef enum {
    /* We are exiting the TB, but have neither emitted a goto_tb, nor
       updated the PC for the next instruction to be executed. */
    EXIT_PC_STALE,
    /* We are exiting the TB to the main loop. */
    EXIT_PC_STALE_NOCHAIN,
    /* We are ending the TB with a noreturn function call, e.g. longjmp.
       No following code will be executed. */
    EXIT_NORETURN,

@@ -3795,7 +3797,8 @@ static ExitStatus op_ssm(DisasContext *s, DisasOps *o)
{
    check_privileged(s);
    tcg_gen_deposit_i64(psw_mask, psw_mask, o->in2, 56, 8);
    return NO_EXIT;
    /* Exit to main loop to reevaluate s390_cpu_exec_interrupt. */
    return EXIT_PC_STALE_NOCHAIN;
}

static ExitStatus op_stap(DisasContext *s, DisasOps *o)

@@ -4038,7 +4041,9 @@ static ExitStatus op_stnosm(DisasContext *s, DisasOps *o)
    } else {
        tcg_gen_ori_i64(psw_mask, psw_mask, i2 << 56);
    }
    return NO_EXIT;

    /* Exit to main loop to reevaluate s390_cpu_exec_interrupt. */
    return EXIT_PC_STALE_NOCHAIN;
}

static ExitStatus op_stura(DisasContext *s, DisasOps *o)

@@ -5788,6 +5793,7 @@ void gen_intermediate_code(CPUS390XState *env, struct TranslationBlock *tb)
    case EXIT_NORETURN:
        break;
    case EXIT_PC_STALE:
    case EXIT_PC_STALE_NOCHAIN:
        update_psw_addr(&dc);
        /* FALLTHRU */
    case EXIT_PC_UPDATED:

@@ -5799,14 +5805,14 @@ void gen_intermediate_code(CPUS390XState *env, struct TranslationBlock *tb)
        /* Exit the TB, either by raising a debug exception or by return. */
        if (do_debug) {
            gen_exception(EXCP_DEBUG);
        } else if (use_exit_tb(&dc)) {
        } else if (use_exit_tb(&dc) || status == EXIT_PC_STALE_NOCHAIN) {
            tcg_gen_exit_tb(0);
        } else {
            tcg_gen_lookup_and_goto_ptr(psw_addr);
        }
        break;
    default:
        abort();
        g_assert_not_reached();
    }

    gen_tb_end(tb, num_insns);

@@ -616,7 +616,12 @@ static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd,
    /* Look for host pointer values within 4G of the PC. This happens
       often when loading pointers to QEMU's own data structures. */
    if (type == TCG_TYPE_I64) {
        tcg_target_long disp = (value >> 12) - ((intptr_t)s->code_ptr >> 12);
        tcg_target_long disp = value - (intptr_t)s->code_ptr;
        if (disp == sextract64(disp, 0, 21)) {
            tcg_out_insn(s, 3406, ADR, rd, disp);
            return;
        }
        disp = (value >> 12) - ((intptr_t)s->code_ptr >> 12);
        if (disp == sextract64(disp, 0, 21)) {
            tcg_out_insn(s, 3406, ADRP, rd, disp);
            if (value & 0xfff) {

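In the aarch64 hunk above, tcg_out_movi() now tries a plain ADR when the target is within the instruction's +/-1 MiB range (a 21-bit signed byte displacement) before falling back to ADRP on 4 KiB pages. The range test is `disp == sextract64(disp, 0, 21)`. The standalone sketch below restates that check; this sextract64 is a local reimplementation written for the demo, not the QEMU header function.

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

/* Local stand-in: extract a signed bitfield of <length> bits at <start>. */
static int64_t sextract64(uint64_t value, int start, int length)
{
    return ((int64_t)(value << (64 - length - start))) >> (64 - length);
}

static bool adr_reaches(int64_t disp)
{
    /* True iff disp fits in a 21-bit signed immediate (+/-1 MiB). */
    return disp == sextract64(disp, 0, 21);
}

int main(void)
{
    printf("%d %d %d\n",
           adr_reaches(0xfffff),     /* +1 MiB - 1: reachable */
           adr_reaches(-0x100000),   /* -1 MiB: reachable     */
           adr_reaches(0x100000));   /* +1 MiB: out of range  */
    return 0;
}
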
@@ -418,23 +418,37 @@ static inline void tcg_out_dat_imm(TCGContext *s,

static void tcg_out_movi32(TCGContext *s, int cond, int rd, uint32_t arg)
{
    int rot, opc, rn;
    int rot, opc, rn, diff;

    /* For armv7, make sure not to use movw+movt when mov/mvn would do.
       Speed things up by only checking when movt would be required.
       Prior to armv7, have one go at fully rotated immediates before
       doing the decomposition thing below. */
    if (!use_armv7_instructions || (arg & 0xffff0000)) {
        rot = encode_imm(arg);
    /* Check a single MOV/MVN before anything else. */
    rot = encode_imm(arg);
    if (rot >= 0) {
        tcg_out_dat_imm(s, cond, ARITH_MOV, rd, 0,
                        rotl(arg, rot) | (rot << 7));
        return;
    }
    rot = encode_imm(~arg);
    if (rot >= 0) {
        tcg_out_dat_imm(s, cond, ARITH_MVN, rd, 0,
                        rotl(~arg, rot) | (rot << 7));
        return;
    }

    /* Check for a pc-relative address. This will usually be the TB,
       or within the TB, which is immediately before the code block. */
    diff = arg - ((intptr_t)s->code_ptr + 8);
    if (diff >= 0) {
        rot = encode_imm(diff);
        if (rot >= 0) {
            tcg_out_dat_imm(s, cond, ARITH_MOV, rd, 0,
                            rotl(arg, rot) | (rot << 7));
            tcg_out_dat_imm(s, cond, ARITH_ADD, rd, TCG_REG_PC,
                            rotl(diff, rot) | (rot << 7));
            return;
        }
        rot = encode_imm(~arg);
    } else {
        rot = encode_imm(-diff);
        if (rot >= 0) {
            tcg_out_dat_imm(s, cond, ARITH_MVN, rd, 0,
                            rotl(~arg, rot) | (rot << 7));
            tcg_out_dat_imm(s, cond, ARITH_SUB, rd, TCG_REG_PC,
                            rotl(-diff, rot) | (rot << 7));
            return;
        }
    }

@@ -1026,16 +1040,6 @@ static void tcg_out_call(TCGContext *s, tcg_insn_unit *addr)
    }
}

void arm_tb_set_jmp_target(uintptr_t jmp_addr, uintptr_t addr)
{
    tcg_insn_unit *code_ptr = (tcg_insn_unit *)jmp_addr;
    tcg_insn_unit *target = (tcg_insn_unit *)addr;

    /* we could use a ldr pc, [pc, #-4] kind of branch and avoid the flush */
    reloc_pc24_atomic(code_ptr, target);
    flush_icache_range(jmp_addr, jmp_addr + 4);
}

static inline void tcg_out_goto_label(TCGContext *s, int cond, TCGLabel *l)
{
    if (l->has_value) {

@@ -1665,17 +1669,27 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
        }
        break;
    case INDEX_op_goto_tb:
        if (s->tb_jmp_insn_offset) {
            /* Direct jump method */
            s->tb_jmp_insn_offset[args[0]] = tcg_current_code_size(s);
            tcg_out_b_noaddr(s, COND_AL);
        } else {
        {
            /* Indirect jump method */
            intptr_t ptr = (intptr_t)(s->tb_jmp_target_addr + args[0]);
            tcg_out_movi32(s, COND_AL, TCG_REG_R0, ptr & ~0xfff);
            tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, TCG_REG_R0, ptr & 0xfff);
            intptr_t ptr, dif, dil;
            TCGReg base = TCG_REG_PC;

            tcg_debug_assert(s->tb_jmp_insn_offset == 0);
            ptr = (intptr_t)(s->tb_jmp_target_addr + args[0]);
            dif = ptr - ((intptr_t)s->code_ptr + 8);
            dil = sextract32(dif, 0, 12);
            if (dif != dil) {
                /* The TB is close, but outside the 12 bits addressable by
                   the load. We can extend this to 20 bits with a sub of a
                   shifted immediate from pc. In the vastly unlikely event
                   the code requires more than 1MB, we'll use 2 insns and
                   be no worse off. */
                base = TCG_REG_R0;
                tcg_out_movi32(s, COND_AL, base, ptr - dil);
            }
            tcg_out_ld32_12(s, COND_AL, TCG_REG_PC, base, dil);
            s->tb_jmp_reset_offset[args[0]] = tcg_current_code_size(s);
        }
        s->tb_jmp_reset_offset[args[0]] = tcg_current_code_size(s);
        break;
    case INDEX_op_goto_ptr:
        tcg_out_bx(s, COND_AL, args[0]);

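The new pc-relative movi path above only fires when encode_imm() can represent the displacement as an A32 data-processing immediate, i.e. an 8-bit value rotated right by an even amount, so a single ADD/SUB from PC materializes the address. The sketch below is a simplified standalone version of that encodability test, not QEMU's encode_imm(); names and output are illustrative.

#include <stdint.h>
#include <stdio.h>

/* Return the even left-rotation (0..30) that brings imm into 8 bits,
   or -1 if imm cannot be a single A32 data-processing immediate. */
static int encode_imm_sketch(uint32_t imm)
{
    for (int rot = 0; rot < 32; rot += 2) {
        uint32_t rotated = (imm << rot) | (imm >> ((32 - rot) & 31));
        if ((rotated & ~0xffu) == 0) {
            return rot;
        }
    }
    return -1;
}

int main(void)
{
    printf("0x000000ff -> %d\n", encode_imm_sketch(0xff));         /* 0  */
    printf("0xff000000 -> %d\n", encode_imm_sketch(0xff000000u));  /* 8  */
    printf("0x00000102 -> %d\n", encode_imm_sketch(0x102));        /* -1 */
    return 0;
}
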
@@ -2820,14 +2820,11 @@ void tcg_register_jit(void *buf, size_t buf_size)
}
#endif /* __ELF__ */

static size_t dcache_bsize = 16;
static size_t icache_bsize = 16;

void flush_icache_range(uintptr_t start, uintptr_t stop)
{
    uintptr_t p, start1, stop1;
    size_t dsize = dcache_bsize;
    size_t isize = icache_bsize;
    size_t dsize = qemu_dcache_linesize;
    size_t isize = qemu_icache_linesize;

    start1 = start & ~(dsize - 1);
    stop1 = (stop + dsize - 1) & ~(dsize - 1);

@@ -2844,67 +2841,3 @@ void flush_icache_range(uintptr_t start, uintptr_t stop)
    asm volatile ("sync" : : : "memory");
    asm volatile ("isync" : : : "memory");
}

#if defined _AIX
#include <sys/systemcfg.h>

static void __attribute__((constructor)) tcg_cache_init(void)
{
    icache_bsize = _system_configuration.icache_line;
    dcache_bsize = _system_configuration.dcache_line;
}

#elif defined __linux__
static void __attribute__((constructor)) tcg_cache_init(void)
{
    unsigned long dsize = qemu_getauxval(AT_DCACHEBSIZE);
    unsigned long isize = qemu_getauxval(AT_ICACHEBSIZE);

    if (dsize == 0 || isize == 0) {
        if (dsize == 0) {
            fprintf(stderr, "getauxval AT_DCACHEBSIZE failed\n");
        }
        if (isize == 0) {
            fprintf(stderr, "getauxval AT_ICACHEBSIZE failed\n");
        }
        exit(1);
    }
    dcache_bsize = dsize;
    icache_bsize = isize;
}

#elif defined __APPLE__
#include <sys/sysctl.h>

static void __attribute__((constructor)) tcg_cache_init(void)
{
    size_t len;
    unsigned cacheline;
    int name[2] = { CTL_HW, HW_CACHELINE };

    len = sizeof(cacheline);
    if (sysctl(name, 2, &cacheline, &len, NULL, 0)) {
        perror("sysctl CTL_HW HW_CACHELINE failed");
        exit(1);
    }
    dcache_bsize = cacheline;
    icache_bsize = cacheline;
}

#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
#include <sys/sysctl.h>

static void __attribute__((constructor)) tcg_cache_init(void)
{
    size_t len = 4;
    unsigned cacheline;

    if (sysctlbyname ("machdep.cacheline_size", &cacheline, &len, NULL, 0)) {
        fprintf(stderr, "sysctlbyname machdep.cacheline_size failed: %s\n",
                strerror(errno));
        exit(1);
    }
    dcache_bsize = cacheline;
    icache_bsize = cacheline;
}
#endif

@@ -149,23 +149,23 @@ void *HELPER(lookup_tb_ptr)(CPUArchState *env, target_ulong addr)
    CPUState *cpu = ENV_GET_CPU(env);
    TranslationBlock *tb;
    target_ulong cs_base, pc;
    uint32_t flags;
    uint32_t flags, addr_hash;

    tb = atomic_rcu_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(addr)]);
    if (likely(tb)) {
        cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
        if (likely(tb->pc == addr && tb->cs_base == cs_base &&
                   tb->flags == flags)) {
            goto found;
        }
    addr_hash = tb_jmp_cache_hash_func(addr);
    tb = atomic_rcu_read(&cpu->tb_jmp_cache[addr_hash]);
    cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);

    if (unlikely(!(tb
                   && tb->pc == addr
                   && tb->cs_base == cs_base
                   && tb->flags == flags))) {
        tb = tb_htable_lookup(cpu, addr, cs_base, flags);
        if (likely(tb)) {
            atomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(addr)], tb);
            goto found;
        if (!tb) {
            return tcg_ctx.code_gen_epilogue;
        }
        atomic_set(&cpu->tb_jmp_cache[addr_hash], tb);
    }
    return tcg_ctx.code_gen_epilogue;
found:

    qemu_log_mask_and_addr(CPU_LOG_EXEC, addr,
                           "Chain %p [%d: " TARGET_FMT_lx "] %s\n",
                           tb->tc_ptr, cpu->cpu_index, addr,

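The rewritten lookup_tb_ptr helper above computes the jmp-cache hash once and, on a miss or stale entry, falls back to the global hash table and refills the jmp cache before chaining; only a genuine miss returns to the epilogue. The toy program below models that two-level lookup in isolation; the names, table sizes and hash function are illustrative stand-ins, not QEMU's.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define JMP_CACHE_SIZE 16u

typedef struct { uint64_t pc; } tb_t;

static tb_t *jmp_cache[JMP_CACHE_SIZE];   /* first level, may be stale */
static tb_t *full_table[256];             /* stand-in for the QHT table */

static unsigned jc_hash(uint64_t pc) { return (pc >> 2) & (JMP_CACHE_SIZE - 1); }

static tb_t *lookup(uint64_t pc)
{
    unsigned h = jc_hash(pc);
    tb_t *tb = jmp_cache[h];

    if (!tb || tb->pc != pc) {            /* first-level miss or stale entry */
        tb = full_table[pc % 256];        /* slower, authoritative lookup */
        if (!tb || tb->pc != pc) {
            return NULL;                  /* caller would exit to the epilogue */
        }
        jmp_cache[h] = tb;                /* refill so the next lookup hits */
    }
    return tb;
}

int main(void)
{
    tb_t *tb = calloc(1, sizeof(*tb));
    tb->pc = 0x4000;
    full_table[tb->pc % 256] = tb;

    printf("cold lookup: %p\n", (void *)lookup(0x4000));  /* refills jmp cache */
    printf("warm lookup: %p\n", (void *)lookup(0x4000));  /* first-level hit */
    printf("unknown pc:  %p\n", (void *)lookup(0x5000));  /* NULL */
    return 0;
}
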
tcg/tcg.c
@@ -383,6 +383,26 @@ void tcg_context_init(TCGContext *s)
    }
}

/*
 * Allocate TBs right before their corresponding translated code, making
 * sure that TBs and code are on different cache lines.
 */
TranslationBlock *tcg_tb_alloc(TCGContext *s)
{
    uintptr_t align = qemu_icache_linesize;
    TranslationBlock *tb;
    void *next;

    tb = (void *)ROUND_UP((uintptr_t)s->code_gen_ptr, align);
    next = (void *)ROUND_UP((uintptr_t)(tb + 1), align);

    if (unlikely(next > s->code_gen_highwater)) {
        return NULL;
    }
    s->code_gen_ptr = next;
    return tb;
}

void tcg_prologue_init(TCGContext *s)
{
    size_t prologue_size, total_size;

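tcg_tb_alloc() above is the heart of the "allocate TB structs before the corresponding translated code" change: each TranslationBlock is bump-allocated out of the code buffer, rounded up to the host icache line size, and the translated code starts on the next line, so struct writes and code fetches never share a line. The following standalone sketch models only that layout, assuming a 64-byte line; the buffer, struct and function names are made up for the demo.

#include <stdint.h>
#include <stdio.h>

#define LINE 64u   /* stand-in for qemu_icache_linesize */
#define ROUND_UP(x, a) (((x) + (a) - 1) & ~((uintptr_t)(a) - 1))

struct tb_stub { uint64_t pc; uint32_t flags; };   /* tiny TranslationBlock stand-in */

static char buf[4096];        /* stand-in for the code_gen buffer */
static char *ptr = buf;       /* stand-in for code_gen_ptr */

static struct tb_stub *alloc_tb(void)
{
    struct tb_stub *tb = (void *)ROUND_UP((uintptr_t)ptr, LINE);
    char *code = (void *)ROUND_UP((uintptr_t)(tb + 1), LINE);

    if (code > buf + sizeof(buf)) {   /* highwater check */
        return NULL;
    }
    ptr = code;                       /* translated code would be emitted from here */
    return tb;
}

int main(void)
{
    struct tb_stub *tb = alloc_tb();
    printf("tb struct at %p, code starts at %p\n", (void *)tb, (void *)ptr);
    return 0;
}
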
@@ -697,7 +697,6 @@ struct TCGContext {
       here, because there's too much arithmetic throughout that relies
       on addition and subtraction working on bytes. Rely on the GCC
       extension that allows arithmetic on void*. */
    int code_gen_max_blocks;
    void *code_gen_prologue;
    void *code_gen_epilogue;
    void *code_gen_buffer;

@@ -756,6 +755,7 @@ static inline bool tcg_op_buf_full(void)
/* tb_lock must be held for tcg_malloc_internal. */
void *tcg_malloc_internal(TCGContext *s, int size);
void tcg_pool_reset(TCGContext *s);
TranslationBlock *tcg_tb_alloc(TCGContext *s);

void tb_lock(void);
void tb_unlock(void);

@@ -20,6 +20,7 @@ util-obj-y += host-utils.o
util-obj-y += bitmap.o bitops.o hbitmap.o
util-obj-y += fifo8.o
util-obj-y += acl.o
util-obj-y += cacheinfo.o
util-obj-y += error.o qemu-error.o
util-obj-y += id.o
util-obj-y += iov.o qemu-config.o qemu-sockets.o uri.o notify.o

util/cacheinfo.c (new file)
@@ -0,0 +1,185 @@
/*
 * cacheinfo.c - helpers to query the host about its caches
 *
 * Copyright (C) 2017, Emilio G. Cota <cota@braap.org>
 * License: GNU GPL, version 2 or later.
 *   See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"

int qemu_icache_linesize = 0;
int qemu_dcache_linesize = 0;

/*
 * Operating system specific detection mechanisms.
 */

#if defined(_AIX)
# include <sys/systemcfg.h>

static void sys_cache_info(int *isize, int *dsize)
{
    *isize = _system_configuration.icache_line;
    *dsize = _system_configuration.dcache_line;
}

#elif defined(_WIN32)

static void sys_cache_info(int *isize, int *dsize)
{
    SYSTEM_LOGICAL_PROCESSOR_INFORMATION *buf;
    DWORD size = 0;
    BOOL success;
    size_t i, n;

    /* Check for the required buffer size first. Note that if the zero
       size we use for the probe results in success, then there is no
       data available; fail in that case. */
    success = GetLogicalProcessorInformation(0, &size);
    if (success || GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
        return;
    }

    n = size / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
    size = n * sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
    buf = g_new0(SYSTEM_LOGICAL_PROCESSOR_INFORMATION, n);
    if (!GetLogicalProcessorInformation(buf, &size)) {
        goto fail;
    }

    for (i = 0; i < n; i++) {
        if (buf[i].Relationship == RelationCache
            && buf[i].Cache.Level == 1) {
            switch (buf[i].Cache.Type) {
            case CacheUnified:
                *isize = *dsize = buf[i].Cache.LineSize;
                break;
            case CacheInstruction:
                *isize = buf[i].Cache.LineSize;
                break;
            case CacheData:
                *dsize = buf[i].Cache.LineSize;
                break;
            default:
                break;
            }
        }
    }
fail:
    g_free(buf);
}

#elif defined(__APPLE__) \
      || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
# include <sys/sysctl.h>
# if defined(__APPLE__)
#  define SYSCTL_CACHELINE_NAME "hw.cachelinesize"
# else
#  define SYSCTL_CACHELINE_NAME "machdep.cacheline_size"
# endif

static void sys_cache_info(int *isize, int *dsize)
{
    /* There's only a single sysctl for both I/D cache line sizes. */
    long size;
    size_t len = sizeof(size);
    if (!sysctlbyname(SYSCTL_CACHELINE_NAME, &size, &len, NULL, 0)) {
        *isize = *dsize = size;
    }
}

#else
/* POSIX */

static void sys_cache_info(int *isize, int *dsize)
{
# ifdef _SC_LEVEL1_ICACHE_LINESIZE
    *isize = sysconf(_SC_LEVEL1_ICACHE_LINESIZE);
# endif
# ifdef _SC_LEVEL1_DCACHE_LINESIZE
    *dsize = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
# endif
}
#endif /* sys_cache_info */

/*
 * Architecture (+ OS) specific detection mechanisms.
 */

#if defined(__aarch64__)

static void arch_cache_info(int *isize, int *dsize)
{
    if (*isize == 0 || *dsize == 0) {
        unsigned ctr;

        /* The real cache geometry is in CCSIDR_EL1/CLIDR_EL1/CSSELR_EL1,
           but (at least under Linux) these are marked protected by the
           kernel. However, CTR_EL0 contains the minimum linesize in the
           entire hierarchy, and is used by userspace cache flushing. */
        asm volatile("mrs\t%0, ctr_el0" : "=r"(ctr));
        if (*isize == 0) {
            *isize = 4 << (ctr & 0xf);
        }
        if (*dsize == 0) {
            *dsize = 4 << ((ctr >> 16) & 0xf);
        }
    }
}

#elif defined(_ARCH_PPC) && defined(__linux__)

static void arch_cache_info(int *isize, int *dsize)
{
    if (*isize == 0) {
        *isize = qemu_getauxval(AT_ICACHEBSIZE);
    }
    if (*dsize == 0) {
        *dsize = qemu_getauxval(AT_DCACHEBSIZE);
    }
}

#else
static void arch_cache_info(int *isize, int *dsize) { }
#endif /* arch_cache_info */

/*
 * ... and if all else fails ...
 */

static void fallback_cache_info(int *isize, int *dsize)
{
    /* If we can only find one of the two, assume they're the same. */
    if (*isize) {
        if (*dsize) {
            /* Success! */
        } else {
            *dsize = *isize;
        }
    } else if (*dsize) {
        *isize = *dsize;
    } else {
#if defined(_ARCH_PPC)
        /* For PPC, we're going to use the icache size computed for
           flush_icache_range. Which means that we must use the
           architecture minimum. */
        *isize = *dsize = 16;
#else
        /* Otherwise, 64 bytes is not uncommon. */
        *isize = *dsize = 64;
#endif
    }
}

static void __attribute__((constructor)) init_cache_info(void)
{
    int isize = 0, dsize = 0;

    sys_cache_info(&isize, &dsize);
    arch_cache_info(&isize, &dsize);
    fallback_cache_info(&isize, &dsize);

    qemu_icache_linesize = isize;
    qemu_dcache_linesize = dsize;
}

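As a quick way to see what the new detection would report on a POSIX host, the short program below mirrors the sysconf() branch and the "copy the other size, else assume 64 bytes" fallback from cacheinfo.c; it is a demo written for this note, not part of the patch.

#include <stdio.h>
#include <unistd.h>

int main(void)
{
    long isize = 0, dsize = 0;
#ifdef _SC_LEVEL1_ICACHE_LINESIZE
    isize = sysconf(_SC_LEVEL1_ICACHE_LINESIZE);
#endif
#ifdef _SC_LEVEL1_DCACHE_LINESIZE
    dsize = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
#endif
    /* Mirror fallback_cache_info: copy the known size, else assume 64. */
    if (isize <= 0) {
        isize = dsize > 0 ? dsize : 64;
    }
    if (dsize <= 0) {
        dsize = isize;
    }
    printf("icache line %ld bytes, dcache line %ld bytes\n", isize, dsize);
    return 0;
}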