From 405c7c07082b4f9d11500659c849657297f0d065 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 10 Oct 2023 10:31:39 +0200 Subject: [PATCH] target/i386: implement CMPccXADD The main difficulty here is that a page fault when writing to the destination must not overwrite the flags. Therefore, the flags computation must be inlined instead of using gen_jcc1*. For simplicity, I am using an unconditional cmpxchg operation, that becomes a NOP if the comparison fails. Reviewed-by: Richard Henderson Signed-off-by: Paolo Bonzini --- target/i386/cpu.c | 2 +- target/i386/tcg/decode-new.c.inc | 25 ++++++++ target/i386/tcg/decode-new.h | 1 + target/i386/tcg/emit.c.inc | 104 +++++++++++++++++++++++++++++++ target/i386/tcg/translate.c | 2 + 5 files changed, 133 insertions(+), 1 deletion(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 95d5f16cd5..fd47ee7def 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -738,7 +738,7 @@ void x86_cpu_vendor_words2str(char *dst, uint32_t vendor1, #define TCG_7_0_EDX_FEATURES (CPUID_7_0_EDX_FSRM | CPUID_7_0_EDX_KERNEL_FEATURES) #define TCG_7_1_EAX_FEATURES (CPUID_7_1_EAX_FZRM | CPUID_7_1_EAX_FSRS | \ - CPUID_7_1_EAX_FSRC) + CPUID_7_1_EAX_FSRC | CPUID_7_1_EAX_CMPCCXADD) #define TCG_7_1_EDX_FEATURES 0 #define TCG_7_2_EDX_FEATURES 0 #define TCG_APM_FEATURES 0 diff --git a/target/i386/tcg/decode-new.c.inc b/target/i386/tcg/decode-new.c.inc index 717d730772..426c459412 100644 --- a/target/i386/tcg/decode-new.c.inc +++ b/target/i386/tcg/decode-new.c.inc @@ -538,6 +538,28 @@ static const X86OpEntry opcodes_0F38_00toEF[240] = { [0xdd] = X86_OP_ENTRY3(VAESENCLAST, V,x, H,x, W,x, vex4 cpuid(AES) p_66), [0xde] = X86_OP_ENTRY3(VAESDEC, V,x, H,x, W,x, vex4 cpuid(AES) p_66), [0xdf] = X86_OP_ENTRY3(VAESDECLAST, V,x, H,x, W,x, vex4 cpuid(AES) p_66), + + /* + * REG selects srcdest2 operand, VEX.vvvv selects src3. VEX class not found + * in manual, assumed to be 13 from the VEX.L0 constraint. + */ + [0xe0] = X86_OP_ENTRY3(CMPccXADD, M,y, G,y, B,y, vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66), + [0xe1] = X86_OP_ENTRY3(CMPccXADD, M,y, G,y, B,y, vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66), + [0xe2] = X86_OP_ENTRY3(CMPccXADD, M,y, G,y, B,y, vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66), + [0xe3] = X86_OP_ENTRY3(CMPccXADD, M,y, G,y, B,y, vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66), + [0xe4] = X86_OP_ENTRY3(CMPccXADD, M,y, G,y, B,y, vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66), + [0xe5] = X86_OP_ENTRY3(CMPccXADD, M,y, G,y, B,y, vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66), + [0xe6] = X86_OP_ENTRY3(CMPccXADD, M,y, G,y, B,y, vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66), + [0xe7] = X86_OP_ENTRY3(CMPccXADD, M,y, G,y, B,y, vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66), + + [0xe8] = X86_OP_ENTRY3(CMPccXADD, M,y, G,y, B,y, vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66), + [0xe9] = X86_OP_ENTRY3(CMPccXADD, M,y, G,y, B,y, vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66), + [0xea] = X86_OP_ENTRY3(CMPccXADD, M,y, G,y, B,y, vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66), + [0xeb] = X86_OP_ENTRY3(CMPccXADD, M,y, G,y, B,y, vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66), + [0xec] = X86_OP_ENTRY3(CMPccXADD, M,y, G,y, B,y, vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66), + [0xed] = X86_OP_ENTRY3(CMPccXADD, M,y, G,y, B,y, vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66), + [0xee] = X86_OP_ENTRY3(CMPccXADD, M,y, G,y, B,y, vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66), + [0xef] = X86_OP_ENTRY3(CMPccXADD, M,y, G,y, B,y, vex13 xchg chk(o64) cpuid(CMPCCXADD) p_66), }; /* five rows for no prefix, 66, F3, F2, 66+F2 */ @@ -1503,6 +1525,9 @@ static bool has_cpuid_feature(DisasContext *s, X86CPUIDFeature cpuid) return (s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_AVX2); case X86_FEAT_SHA_NI: return (s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_SHA_NI); + + case X86_FEAT_CMPCCXADD: + return (s->cpuid_7_1_eax_features & CPUID_7_1_EAX_CMPCCXADD); } g_assert_not_reached(); } diff --git a/target/i386/tcg/decode-new.h b/target/i386/tcg/decode-new.h index 25220fc436..15e6bfef4b 100644 --- a/target/i386/tcg/decode-new.h +++ b/target/i386/tcg/decode-new.h @@ -104,6 +104,7 @@ typedef enum X86CPUIDFeature { X86_FEAT_AVX2, X86_FEAT_BMI1, X86_FEAT_BMI2, + X86_FEAT_CMPCCXADD, X86_FEAT_F16C, X86_FEAT_FMA, X86_FEAT_MOVBE, diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc index fd120e7b9b..6bcf88ecd7 100644 --- a/target/i386/tcg/emit.c.inc +++ b/target/i386/tcg/emit.c.inc @@ -1190,6 +1190,110 @@ static void gen_BZHI(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) prepare_update2_cc(decode, s, CC_OP_BMILGB + ot); } +static void gen_CMPccXADD(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + TCGLabel *label_top = gen_new_label(); + TCGLabel *label_bottom = gen_new_label(); + TCGv oldv = tcg_temp_new(); + TCGv newv = tcg_temp_new(); + TCGv cmpv = tcg_temp_new(); + TCGCond cond; + + TCGv cmp_lhs, cmp_rhs; + MemOp ot, ot_full; + + int jcc_op = (decode->b >> 1) & 7; + static const TCGCond cond_table[8] = { + [JCC_O] = TCG_COND_LT, /* test sign bit by comparing against 0 */ + [JCC_B] = TCG_COND_LTU, + [JCC_Z] = TCG_COND_EQ, + [JCC_BE] = TCG_COND_LEU, + [JCC_S] = TCG_COND_LT, /* test sign bit by comparing against 0 */ + [JCC_P] = TCG_COND_EQ, /* even parity - tests low bit of popcount */ + [JCC_L] = TCG_COND_LT, + [JCC_LE] = TCG_COND_LE, + }; + + cond = cond_table[jcc_op]; + if (decode->b & 1) { + cond = tcg_invert_cond(cond); + } + + ot = decode->op[0].ot; + ot_full = ot | MO_LE; + if (jcc_op >= JCC_S) { + /* + * Sign-extend values before subtracting for S, P (zero/sign extension + * does not matter there) L, LE and their inverses. + */ + ot_full |= MO_SIGN; + } + + /* + * cmpv will be moved to cc_src *after* cpu_regs[] is written back, so use + * tcg_gen_ext_tl instead of gen_ext_tl. + */ + tcg_gen_ext_tl(cmpv, cpu_regs[decode->op[1].n], ot_full); + + /* + * Cmpxchg loop starts here. + * - s->T1: addition operand (from decoder) + * - s->A0: dest address (from decoder) + * - s->cc_srcT: memory operand (lhs for comparison) + * - cmpv: rhs for comparison + */ + gen_set_label(label_top); + gen_op_ld_v(s, ot_full, s->cc_srcT, s->A0); + tcg_gen_sub_tl(s->T0, s->cc_srcT, cmpv); + + /* Compute the comparison result by hand, to avoid clobbering cc_*. */ + switch (jcc_op) { + case JCC_O: + /* (src1 ^ src2) & (src1 ^ dst). newv is only used here for a moment */ + tcg_gen_xor_tl(newv, s->cc_srcT, s->T0); + tcg_gen_xor_tl(s->tmp0, s->cc_srcT, cmpv); + tcg_gen_and_tl(s->tmp0, s->tmp0, newv); + tcg_gen_sextract_tl(s->tmp0, s->tmp0, 0, 8 << ot); + cmp_lhs = s->tmp0, cmp_rhs = tcg_constant_tl(0); + break; + + case JCC_P: + tcg_gen_ext8u_tl(s->tmp0, s->T0); + tcg_gen_ctpop_tl(s->tmp0, s->tmp0); + tcg_gen_andi_tl(s->tmp0, s->tmp0, 1); + cmp_lhs = s->tmp0, cmp_rhs = tcg_constant_tl(0); + break; + + case JCC_S: + tcg_gen_sextract_tl(s->tmp0, s->T0, 0, 8 << ot); + cmp_lhs = s->tmp0, cmp_rhs = tcg_constant_tl(0); + break; + + default: + cmp_lhs = s->cc_srcT, cmp_rhs = cmpv; + break; + } + + /* Compute new value: if condition does not hold, just store back s->cc_srcT */ + tcg_gen_add_tl(newv, s->cc_srcT, s->T1); + tcg_gen_movcond_tl(cond, newv, cmp_lhs, cmp_rhs, newv, s->cc_srcT); + tcg_gen_atomic_cmpxchg_tl(oldv, s->A0, s->cc_srcT, newv, s->mem_index, ot_full); + + /* Exit unconditionally if cmpxchg succeeded. */ + tcg_gen_brcond_tl(TCG_COND_EQ, oldv, s->cc_srcT, label_bottom); + + /* Try again if there was actually a store to make. */ + tcg_gen_brcond_tl(cond, cmp_lhs, cmp_rhs, label_top); + gen_set_label(label_bottom); + + /* Store old value to registers only after a successful store. */ + gen_writeback(s, decode, 1, s->cc_srcT); + + decode->cc_dst = s->T0; + decode->cc_src = cmpv; + decode->cc_op = CC_OP_SUBB + ot; +} + static void gen_CRC32(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) { MemOp ot = decode->op[2].ot; diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index fe82d42157..e1eb82a5c6 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -122,6 +122,7 @@ typedef struct DisasContext { int cpuid_ext3_features; int cpuid_7_0_ebx_features; int cpuid_7_0_ecx_features; + int cpuid_7_1_eax_features; int cpuid_xsave_features; /* TCG local temps */ @@ -6963,6 +6964,7 @@ static void i386_tr_init_disas_context(DisasContextBase *dcbase, CPUState *cpu) dc->cpuid_ext3_features = env->features[FEAT_8000_0001_ECX]; dc->cpuid_7_0_ebx_features = env->features[FEAT_7_0_EBX]; dc->cpuid_7_0_ecx_features = env->features[FEAT_7_0_ECX]; + dc->cpuid_7_1_eax_features = env->features[FEAT_7_1_EAX]; dc->cpuid_xsave_features = env->features[FEAT_XSAVE]; dc->jmp_opt = !((cflags & CF_NO_GOTO_TB) || (flags & (HF_TF_MASK | HF_INHIBIT_IRQ_MASK)));