target/sh4: Recognize common gUSA sequences

For many of the sequences produced by gcc or glibc,
we can translate these as host atomic operations.
Which saves the need to acquire the exclusive lock.

Signed-off-by: Richard Henderson <rth@twiddle.net>

Message-Id: <20170718200255.31647-8-rth@twiddle.net>
Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
This commit is contained in:
Richard Henderson 2017-07-18 10:02:32 -10:00 committed by Aurelien Jarno
parent 4bfa602bc2
commit d6a6cffdd3

View File

@ -1869,10 +1869,17 @@ static void decode_opc(DisasContext * ctx)
*/
static int decode_gusa(DisasContext *ctx, CPUSH4State *env, int *pmax_insns)
{
uint16_t insns[5];
int ld_adr, ld_dst, ld_mop;
int op_dst, op_src, op_opc;
int mv_src, mt_dst, st_src, st_mop;
TCGv op_arg;
uint32_t pc = ctx->pc;
uint32_t pc_end = ctx->tb->cs_base;
int backup = sextract32(ctx->tbflags, GUSA_SHIFT, 8);
int max_insns = (pc_end - pc) / 2;
int i;
if (pc != pc_end + backup || max_insns < 2) {
/* This is a malformed gUSA region. Don't do anything special,
@ -1889,6 +1896,320 @@ static int decode_gusa(DisasContext *ctx, CPUSH4State *env, int *pmax_insns)
return 0;
}
/* The state machine below will consume only a few insns.
If there are more than that in a region, fail now. */
if (max_insns > ARRAY_SIZE(insns)) {
goto fail;
}
/* Read all of the insns for the region. */
for (i = 0; i < max_insns; ++i) {
insns[i] = cpu_lduw_code(env, pc + i * 2);
}
ld_adr = ld_dst = ld_mop = -1;
mv_src = -1;
op_dst = op_src = op_opc = -1;
mt_dst = -1;
st_src = st_mop = -1;
TCGV_UNUSED(op_arg);
i = 0;
#define NEXT_INSN \
do { if (i >= max_insns) goto fail; ctx->opcode = insns[i++]; } while (0)
/*
* Expect a load to begin the region.
*/
NEXT_INSN;
switch (ctx->opcode & 0xf00f) {
case 0x6000: /* mov.b @Rm,Rn */
ld_mop = MO_SB;
break;
case 0x6001: /* mov.w @Rm,Rn */
ld_mop = MO_TESW;
break;
case 0x6002: /* mov.l @Rm,Rn */
ld_mop = MO_TESL;
break;
default:
goto fail;
}
ld_adr = B7_4;
ld_dst = B11_8;
if (ld_adr == ld_dst) {
goto fail;
}
/* Unless we see a mov, any two-operand operation must use ld_dst. */
op_dst = ld_dst;
/*
* Expect an optional register move.
*/
NEXT_INSN;
switch (ctx->opcode & 0xf00f) {
case 0x6003: /* mov Rm,Rn */
/* Here we want to recognize ld_dst being saved for later consumtion,
or for another input register being copied so that ld_dst need not
be clobbered during the operation. */
op_dst = B11_8;
mv_src = B7_4;
if (op_dst == ld_dst) {
/* Overwriting the load output. */
goto fail;
}
if (mv_src != ld_dst) {
/* Copying a new input; constrain op_src to match the load. */
op_src = ld_dst;
}
break;
default:
/* Put back and re-examine as operation. */
--i;
}
/*
* Expect the operation.
*/
NEXT_INSN;
switch (ctx->opcode & 0xf00f) {
case 0x300c: /* add Rm,Rn */
op_opc = INDEX_op_add_i32;
goto do_reg_op;
case 0x2009: /* and Rm,Rn */
op_opc = INDEX_op_and_i32;
goto do_reg_op;
case 0x200a: /* xor Rm,Rn */
op_opc = INDEX_op_xor_i32;
goto do_reg_op;
case 0x200b: /* or Rm,Rn */
op_opc = INDEX_op_or_i32;
do_reg_op:
/* The operation register should be as expected, and the
other input cannot depend on the load. */
if (op_dst != B11_8) {
goto fail;
}
if (op_src < 0) {
/* Unconstrainted input. */
op_src = B7_4;
} else if (op_src == B7_4) {
/* Constrained input matched load. All operations are
commutative; "swap" them by "moving" the load output
to the (implicit) first argument and the move source
to the (explicit) second argument. */
op_src = mv_src;
} else {
goto fail;
}
op_arg = REG(op_src);
break;
case 0x6007: /* not Rm,Rn */
if (ld_dst != B7_4 || mv_src >= 0) {
goto fail;
}
op_dst = B11_8;
op_opc = INDEX_op_xor_i32;
op_arg = tcg_const_i32(-1);
break;
case 0x7000 ... 0x700f: /* add #imm,Rn */
if (op_dst != B11_8 || mv_src >= 0) {
goto fail;
}
op_opc = INDEX_op_add_i32;
op_arg = tcg_const_i32(B7_0s);
break;
case 0x3000: /* cmp/eq Rm,Rn */
/* Looking for the middle of a compare-and-swap sequence,
beginning with the compare. Operands can be either order,
but with only one overlapping the load. */
if ((ld_dst == B11_8) + (ld_dst == B7_4) != 1 || mv_src >= 0) {
goto fail;
}
op_opc = INDEX_op_setcond_i32; /* placeholder */
op_src = (ld_dst == B11_8 ? B7_4 : B11_8);
op_arg = REG(op_src);
NEXT_INSN;
switch (ctx->opcode & 0xff00) {
case 0x8b00: /* bf label */
case 0x8f00: /* bf/s label */
if (pc + (i + 1 + B7_0s) * 2 != pc_end) {
goto fail;
}
if ((ctx->opcode & 0xff00) == 0x8b00) { /* bf label */
break;
}
/* We're looking to unconditionally modify Rn with the
result of the comparison, within the delay slot of
the branch. This is used by older gcc. */
NEXT_INSN;
if ((ctx->opcode & 0xf0ff) == 0x0029) { /* movt Rn */
mt_dst = B11_8;
} else {
goto fail;
}
break;
default:
goto fail;
}
break;
case 0x2008: /* tst Rm,Rn */
/* Looking for a compare-and-swap against zero. */
if (ld_dst != B11_8 || ld_dst != B7_4 || mv_src >= 0) {
goto fail;
}
op_opc = INDEX_op_setcond_i32;
op_arg = tcg_const_i32(0);
NEXT_INSN;
if ((ctx->opcode & 0xff00) != 0x8900 /* bt label */
|| pc + (i + 1 + B7_0s) * 2 != pc_end) {
goto fail;
}
break;
default:
/* Put back and re-examine as store. */
--i;
}
/*
* Expect the store.
*/
/* The store must be the last insn. */
if (i != max_insns - 1) {
goto fail;
}
NEXT_INSN;
switch (ctx->opcode & 0xf00f) {
case 0x2000: /* mov.b Rm,@Rn */
st_mop = MO_UB;
break;
case 0x2001: /* mov.w Rm,@Rn */
st_mop = MO_UW;
break;
case 0x2002: /* mov.l Rm,@Rn */
st_mop = MO_UL;
break;
default:
goto fail;
}
/* The store must match the load. */
if (ld_adr != B11_8 || st_mop != (ld_mop & MO_SIZE)) {
goto fail;
}
st_src = B7_4;
#undef NEXT_INSN
/*
* Emit the operation.
*/
tcg_gen_insn_start(pc, ctx->envflags);
switch (op_opc) {
case -1:
/* No operation found. Look for exchange pattern. */
if (st_src == ld_dst || mv_src >= 0) {
goto fail;
}
tcg_gen_atomic_xchg_i32(REG(ld_dst), REG(ld_adr), REG(st_src),
ctx->memidx, ld_mop);
break;
case INDEX_op_add_i32:
if (op_dst != st_src) {
goto fail;
}
if (op_dst == ld_dst && st_mop == MO_UL) {
tcg_gen_atomic_add_fetch_i32(REG(ld_dst), REG(ld_adr),
op_arg, ctx->memidx, ld_mop);
} else {
tcg_gen_atomic_fetch_add_i32(REG(ld_dst), REG(ld_adr),
op_arg, ctx->memidx, ld_mop);
if (op_dst != ld_dst) {
/* Note that mop sizes < 4 cannot use add_fetch
because it won't carry into the higher bits. */
tcg_gen_add_i32(REG(op_dst), REG(ld_dst), op_arg);
}
}
break;
case INDEX_op_and_i32:
if (op_dst != st_src) {
goto fail;
}
if (op_dst == ld_dst) {
tcg_gen_atomic_and_fetch_i32(REG(ld_dst), REG(ld_adr),
op_arg, ctx->memidx, ld_mop);
} else {
tcg_gen_atomic_fetch_and_i32(REG(ld_dst), REG(ld_adr),
op_arg, ctx->memidx, ld_mop);
tcg_gen_and_i32(REG(op_dst), REG(ld_dst), op_arg);
}
break;
case INDEX_op_or_i32:
if (op_dst != st_src) {
goto fail;
}
if (op_dst == ld_dst) {
tcg_gen_atomic_or_fetch_i32(REG(ld_dst), REG(ld_adr),
op_arg, ctx->memidx, ld_mop);
} else {
tcg_gen_atomic_fetch_or_i32(REG(ld_dst), REG(ld_adr),
op_arg, ctx->memidx, ld_mop);
tcg_gen_or_i32(REG(op_dst), REG(ld_dst), op_arg);
}
break;
case INDEX_op_xor_i32:
if (op_dst != st_src) {
goto fail;
}
if (op_dst == ld_dst) {
tcg_gen_atomic_xor_fetch_i32(REG(ld_dst), REG(ld_adr),
op_arg, ctx->memidx, ld_mop);
} else {
tcg_gen_atomic_fetch_xor_i32(REG(ld_dst), REG(ld_adr),
op_arg, ctx->memidx, ld_mop);
tcg_gen_xor_i32(REG(op_dst), REG(ld_dst), op_arg);
}
break;
case INDEX_op_setcond_i32:
if (st_src == ld_dst) {
goto fail;
}
tcg_gen_atomic_cmpxchg_i32(REG(ld_dst), REG(ld_adr), op_arg,
REG(st_src), ctx->memidx, ld_mop);
tcg_gen_setcond_i32(TCG_COND_EQ, cpu_sr_t, REG(ld_dst), op_arg);
if (mt_dst >= 0) {
tcg_gen_mov_i32(REG(mt_dst), cpu_sr_t);
}
break;
default:
g_assert_not_reached();
}
/* If op_src is not a valid register, then op_arg was a constant. */
if (op_src < 0) {
tcg_temp_free_i32(op_arg);
}
/* The entire region has been translated. */
ctx->envflags &= ~GUSA_MASK;
ctx->pc = pc_end;
return max_insns;
fail:
qemu_log_mask(LOG_UNIMP, "Unrecognized gUSA sequence %08x-%08x\n",
pc, pc_end);