tcg/arm: Introduce prepare_host_addr

Merge tcg_out_tlb_load, add_qemu_ldst_label, and some code that lived
in both tcg_out_qemu_ld and tcg_out_qemu_st into one function that
returns HostAddress and TCGLabelQemuLdst structures.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
This commit is contained in:
Richard Henderson 2023-04-23 15:54:41 +01:00
parent 1e612dd66a
commit 7131d3cf72

View File

@ -1434,125 +1434,6 @@ static TCGReg tcg_out_arg_reg64(TCGContext *s, TCGReg argreg,
}
}
#define TLB_SHIFT (CPU_TLB_ENTRY_BITS + CPU_TLB_BITS)
/* We expect to use an 9-bit sign-magnitude negative offset from ENV. */
QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0);
QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) < -256);
/* These offsets are built into the LDRD below. */
QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, mask) != 0);
QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, table) != 4);
/* Load and compare a TLB entry, leaving the flags set. Returns the register
containing the addend of the tlb entry. Clobbers R0, R1, R2, TMP. */
static TCGReg tcg_out_tlb_read(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
MemOp opc, int mem_index, bool is_load)
{
int cmp_off = (is_load ? offsetof(CPUTLBEntry, addr_read)
: offsetof(CPUTLBEntry, addr_write));
int fast_off = TLB_MASK_TABLE_OFS(mem_index);
unsigned s_mask = (1 << (opc & MO_SIZE)) - 1;
unsigned a_mask = (1 << get_alignment_bits(opc)) - 1;
TCGReg t_addr;
/* Load env_tlb(env)->f[mmu_idx].{mask,table} into {r0,r1}. */
tcg_out_ldrd_8(s, COND_AL, TCG_REG_R0, TCG_AREG0, fast_off);
/* Extract the tlb index from the address into R0. */
tcg_out_dat_reg(s, COND_AL, ARITH_AND, TCG_REG_R0, TCG_REG_R0, addrlo,
SHIFT_IMM_LSR(TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS));
/*
* Add the tlb_table pointer, creating the CPUTLBEntry address in R1.
* Load the tlb comparator into R2/R3 and the fast path addend into R1.
*/
if (cmp_off == 0) {
if (TARGET_LONG_BITS == 64) {
tcg_out_ldrd_rwb(s, COND_AL, TCG_REG_R2, TCG_REG_R1, TCG_REG_R0);
} else {
tcg_out_ld32_rwb(s, COND_AL, TCG_REG_R2, TCG_REG_R1, TCG_REG_R0);
}
} else {
tcg_out_dat_reg(s, COND_AL, ARITH_ADD,
TCG_REG_R1, TCG_REG_R1, TCG_REG_R0, 0);
if (TARGET_LONG_BITS == 64) {
tcg_out_ldrd_8(s, COND_AL, TCG_REG_R2, TCG_REG_R1, cmp_off);
} else {
tcg_out_ld32_12(s, COND_AL, TCG_REG_R2, TCG_REG_R1, cmp_off);
}
}
/* Load the tlb addend. */
tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R1,
offsetof(CPUTLBEntry, addend));
/*
* Check alignment, check comparators.
* Do this in 2-4 insns. Use MOVW for v7, if possible,
* to reduce the number of sequential conditional instructions.
* Almost all guests have at least 4k pages, which means that we need
* to clear at least 9 bits even for an 8-byte memory, which means it
* isn't worth checking for an immediate operand for BIC.
*
* For unaligned accesses, test the page of the last unit of alignment.
* This leaves the least significant alignment bits unchanged, and of
* course must be zero.
*/
t_addr = addrlo;
if (a_mask < s_mask) {
t_addr = TCG_REG_R0;
tcg_out_dat_imm(s, COND_AL, ARITH_ADD, t_addr,
addrlo, s_mask - a_mask);
}
if (use_armv7_instructions && TARGET_PAGE_BITS <= 16) {
tcg_out_movi32(s, COND_AL, TCG_REG_TMP, ~(TARGET_PAGE_MASK | a_mask));
tcg_out_dat_reg(s, COND_AL, ARITH_BIC, TCG_REG_TMP,
t_addr, TCG_REG_TMP, 0);
tcg_out_dat_reg(s, COND_AL, ARITH_CMP, 0, TCG_REG_R2, TCG_REG_TMP, 0);
} else {
if (a_mask) {
tcg_debug_assert(a_mask <= 0xff);
tcg_out_dat_imm(s, COND_AL, ARITH_TST, 0, addrlo, a_mask);
}
tcg_out_dat_reg(s, COND_AL, ARITH_MOV, TCG_REG_TMP, 0, t_addr,
SHIFT_IMM_LSR(TARGET_PAGE_BITS));
tcg_out_dat_reg(s, (a_mask ? COND_EQ : COND_AL), ARITH_CMP,
0, TCG_REG_R2, TCG_REG_TMP,
SHIFT_IMM_LSL(TARGET_PAGE_BITS));
}
if (TARGET_LONG_BITS == 64) {
tcg_out_dat_reg(s, COND_EQ, ARITH_CMP, 0, TCG_REG_R3, addrhi, 0);
}
return TCG_REG_R1;
}
/* Record the context of a call to the out of line helper code for the slow
path for a load or store, so that we can later generate the correct
helper code. */
static void add_qemu_ldst_label(TCGContext *s, bool is_ld,
MemOpIdx oi, TCGType type,
TCGReg datalo, TCGReg datahi,
TCGReg addrlo, TCGReg addrhi,
tcg_insn_unit *raddr,
tcg_insn_unit *label_ptr)
{
TCGLabelQemuLdst *label = new_ldst_label(s);
label->is_ld = is_ld;
label->oi = oi;
label->type = type;
label->datalo_reg = datalo;
label->datahi_reg = datahi;
label->addrlo_reg = addrlo;
label->addrhi_reg = addrhi;
label->raddr = tcg_splitwx_to_rx(raddr);
label->label_ptr[0] = label_ptr;
}
static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
{
TCGReg argreg;
@ -1636,29 +1517,6 @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
return true;
}
#else
static void tcg_out_test_alignment(TCGContext *s, bool is_ld, TCGReg addrlo,
TCGReg addrhi, unsigned a_bits)
{
unsigned a_mask = (1 << a_bits) - 1;
TCGLabelQemuLdst *label = new_ldst_label(s);
label->is_ld = is_ld;
label->addrlo_reg = addrlo;
label->addrhi_reg = addrhi;
/* We are expecting a_bits to max out at 7, and can easily support 8. */
tcg_debug_assert(a_mask <= 0xff);
/* tst addr, #mask */
tcg_out_dat_imm(s, COND_AL, ARITH_TST, 0, addrlo, a_mask);
/* blne slow_path */
label->label_ptr[0] = s->code_ptr;
tcg_out_bl_imm(s, COND_NE, 0);
label->raddr = tcg_splitwx_to_rx(s->code_ptr);
}
static bool tcg_out_fail_alignment(TCGContext *s, TCGLabelQemuLdst *l)
{
if (!reloc_pc24(l->label_ptr[0], tcg_splitwx_to_rx(s->code_ptr))) {
@ -1703,6 +1561,134 @@ static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
}
#endif /* SOFTMMU */
static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
TCGReg addrlo, TCGReg addrhi,
MemOpIdx oi, bool is_ld)
{
TCGLabelQemuLdst *ldst = NULL;
MemOp opc = get_memop(oi);
MemOp a_bits = get_alignment_bits(opc);
unsigned a_mask = (1 << a_bits) - 1;
#ifdef CONFIG_SOFTMMU
int mem_index = get_mmuidx(oi);
int cmp_off = is_ld ? offsetof(CPUTLBEntry, addr_read)
: offsetof(CPUTLBEntry, addr_write);
int fast_off = TLB_MASK_TABLE_OFS(mem_index);
unsigned s_mask = (1 << (opc & MO_SIZE)) - 1;
TCGReg t_addr;
ldst = new_ldst_label(s);
ldst->is_ld = is_ld;
ldst->oi = oi;
ldst->addrlo_reg = addrlo;
ldst->addrhi_reg = addrhi;
/* Load env_tlb(env)->f[mmu_idx].{mask,table} into {r0,r1}. */
QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) > 0);
QEMU_BUILD_BUG_ON(TLB_MASK_TABLE_OFS(0) < -256);
QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, mask) != 0);
QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, table) != 4);
tcg_out_ldrd_8(s, COND_AL, TCG_REG_R0, TCG_AREG0, fast_off);
/* Extract the tlb index from the address into R0. */
tcg_out_dat_reg(s, COND_AL, ARITH_AND, TCG_REG_R0, TCG_REG_R0, addrlo,
SHIFT_IMM_LSR(TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS));
/*
* Add the tlb_table pointer, creating the CPUTLBEntry address in R1.
* Load the tlb comparator into R2/R3 and the fast path addend into R1.
*/
if (cmp_off == 0) {
if (TARGET_LONG_BITS == 64) {
tcg_out_ldrd_rwb(s, COND_AL, TCG_REG_R2, TCG_REG_R1, TCG_REG_R0);
} else {
tcg_out_ld32_rwb(s, COND_AL, TCG_REG_R2, TCG_REG_R1, TCG_REG_R0);
}
} else {
tcg_out_dat_reg(s, COND_AL, ARITH_ADD,
TCG_REG_R1, TCG_REG_R1, TCG_REG_R0, 0);
if (TARGET_LONG_BITS == 64) {
tcg_out_ldrd_8(s, COND_AL, TCG_REG_R2, TCG_REG_R1, cmp_off);
} else {
tcg_out_ld32_12(s, COND_AL, TCG_REG_R2, TCG_REG_R1, cmp_off);
}
}
/* Load the tlb addend. */
tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R1,
offsetof(CPUTLBEntry, addend));
/*
* Check alignment, check comparators.
* Do this in 2-4 insns. Use MOVW for v7, if possible,
* to reduce the number of sequential conditional instructions.
* Almost all guests have at least 4k pages, which means that we need
* to clear at least 9 bits even for an 8-byte memory, which means it
* isn't worth checking for an immediate operand for BIC.
*
* For unaligned accesses, test the page of the last unit of alignment.
* This leaves the least significant alignment bits unchanged, and of
* course must be zero.
*/
t_addr = addrlo;
if (a_mask < s_mask) {
t_addr = TCG_REG_R0;
tcg_out_dat_imm(s, COND_AL, ARITH_ADD, t_addr,
addrlo, s_mask - a_mask);
}
if (use_armv7_instructions && TARGET_PAGE_BITS <= 16) {
tcg_out_movi32(s, COND_AL, TCG_REG_TMP, ~(TARGET_PAGE_MASK | a_mask));
tcg_out_dat_reg(s, COND_AL, ARITH_BIC, TCG_REG_TMP,
t_addr, TCG_REG_TMP, 0);
tcg_out_dat_reg(s, COND_AL, ARITH_CMP, 0, TCG_REG_R2, TCG_REG_TMP, 0);
} else {
if (a_mask) {
tcg_debug_assert(a_mask <= 0xff);
tcg_out_dat_imm(s, COND_AL, ARITH_TST, 0, addrlo, a_mask);
}
tcg_out_dat_reg(s, COND_AL, ARITH_MOV, TCG_REG_TMP, 0, t_addr,
SHIFT_IMM_LSR(TARGET_PAGE_BITS));
tcg_out_dat_reg(s, (a_mask ? COND_EQ : COND_AL), ARITH_CMP,
0, TCG_REG_R2, TCG_REG_TMP,
SHIFT_IMM_LSL(TARGET_PAGE_BITS));
}
if (TARGET_LONG_BITS == 64) {
tcg_out_dat_reg(s, COND_EQ, ARITH_CMP, 0, TCG_REG_R3, addrhi, 0);
}
*h = (HostAddress){
.cond = COND_AL,
.base = addrlo,
.index = TCG_REG_R1,
.index_scratch = true,
};
#else
if (a_mask) {
ldst = new_ldst_label(s);
ldst->is_ld = is_ld;
ldst->oi = oi;
ldst->addrlo_reg = addrlo;
ldst->addrhi_reg = addrhi;
/* We are expecting a_bits to max out at 7 */
tcg_debug_assert(a_mask <= 0xff);
/* tst addr, #mask */
tcg_out_dat_imm(s, COND_AL, ARITH_TST, 0, addrlo, a_mask);
}
*h = (HostAddress){
.cond = COND_AL,
.base = addrlo,
.index = guest_base ? TCG_REG_GUEST_BASE : -1,
.index_scratch = false,
};
#endif
return ldst;
}
static void tcg_out_qemu_ld_direct(TCGContext *s, MemOp opc, TCGReg datalo,
TCGReg datahi, HostAddress h)
{
@ -1799,37 +1785,28 @@ static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi,
MemOpIdx oi, TCGType data_type)
{
MemOp opc = get_memop(oi);
TCGLabelQemuLdst *ldst;
HostAddress h;
#ifdef CONFIG_SOFTMMU
h.cond = COND_AL;
h.base = addrlo;
h.index_scratch = true;
h.index = tcg_out_tlb_read(s, addrlo, addrhi, opc, get_mmuidx(oi), 1);
ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, true);
if (ldst) {
ldst->type = data_type;
ldst->datalo_reg = datalo;
ldst->datahi_reg = datahi;
/*
* This a conditional BL only to load a pointer within this opcode into
* LR for the slow path. We will not be using the value for a tail call.
*/
tcg_insn_unit *label_ptr = s->code_ptr;
tcg_out_bl_imm(s, COND_NE, 0);
/*
* This a conditional BL only to load a pointer within this
* opcode into LR for the slow path. We will not be using
* the value for a tail call.
*/
ldst->label_ptr[0] = s->code_ptr;
tcg_out_bl_imm(s, COND_NE, 0);
tcg_out_qemu_ld_direct(s, opc, datalo, datahi, h);
add_qemu_ldst_label(s, true, oi, data_type, datalo, datahi,
addrlo, addrhi, s->code_ptr, label_ptr);
#else
unsigned a_bits = get_alignment_bits(opc);
if (a_bits) {
tcg_out_test_alignment(s, true, addrlo, addrhi, a_bits);
tcg_out_qemu_ld_direct(s, opc, datalo, datahi, h);
ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
} else {
tcg_out_qemu_ld_direct(s, opc, datalo, datahi, h);
}
h.cond = COND_AL;
h.base = addrlo;
h.index = guest_base ? TCG_REG_GUEST_BASE : -1;
h.index_scratch = false;
tcg_out_qemu_ld_direct(s, opc, datalo, datahi, h);
#endif
}
static void tcg_out_qemu_st_direct(TCGContext *s, MemOp opc, TCGReg datalo,
@ -1891,35 +1868,25 @@ static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi,
MemOpIdx oi, TCGType data_type)
{
MemOp opc = get_memop(oi);
TCGLabelQemuLdst *ldst;
HostAddress h;
#ifdef CONFIG_SOFTMMU
h.cond = COND_EQ;
h.base = addrlo;
h.index_scratch = true;
h.index = tcg_out_tlb_read(s, addrlo, addrhi, opc, get_mmuidx(oi), 0);
tcg_out_qemu_st_direct(s, opc, datalo, datahi, h);
ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, false);
if (ldst) {
ldst->type = data_type;
ldst->datalo_reg = datalo;
ldst->datahi_reg = datahi;
/* The conditional call must come last, as we're going to return here. */
tcg_insn_unit *label_ptr = s->code_ptr;
tcg_out_bl_imm(s, COND_NE, 0);
add_qemu_ldst_label(s, false, oi, data_type, datalo, datahi,
addrlo, addrhi, s->code_ptr, label_ptr);
#else
unsigned a_bits = get_alignment_bits(opc);
h.cond = COND_AL;
if (a_bits) {
tcg_out_test_alignment(s, false, addrlo, addrhi, a_bits);
h.cond = COND_EQ;
}
tcg_out_qemu_st_direct(s, opc, datalo, datahi, h);
h.base = addrlo;
h.index = guest_base ? TCG_REG_GUEST_BASE : -1;
h.index_scratch = false;
tcg_out_qemu_st_direct(s, opc, datalo, datahi, h);
#endif
/* The conditional call is last, as we're going to return here. */
ldst->label_ptr[0] = s->code_ptr;
tcg_out_bl_imm(s, COND_NE, 0);
ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
} else {
tcg_out_qemu_st_direct(s, opc, datalo, datahi, h);
}
}
static void tcg_out_epilogue(TCGContext *s);