From 92ec056a6b2fc5d5a5593121c5d9475d2a2461d6 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini
Date: Tue, 20 Sep 2022 05:42:45 -0400
Subject: [PATCH] target/i386: reimplement 0x0f 0x60-0x6f, add AVX

These are both MMX and SSE/AVX instructions, except for vmovdqu.  In
both cases the inputs and output are in s->ptr{0,1,2}, so the only
difference between MMX, SSE, and AVX is which helper to call.

Reviewed-by: Richard Henderson
Signed-off-by: Paolo Bonzini
---
 target/i386/tcg/decode-new.c.inc |  42 +++++++
 target/i386/tcg/emit.c.inc       | 202 +++++++++++++++++++++++++++++++
 target/i386/tcg/translate.c      |  19 ++-
 3 files changed, 262 insertions(+), 1 deletion(-)

diff --git a/target/i386/tcg/decode-new.c.inc b/target/i386/tcg/decode-new.c.inc
index dc04aa9cbb..8a749f33fd 100644
--- a/target/i386/tcg/decode-new.c.inc
+++ b/target/i386/tcg/decode-new.c.inc
@@ -135,6 +135,19 @@ static uint8_t get_modrm(DisasContext *s, CPUX86State *env)
     return s->modrm;
 }
 
+static inline const X86OpEntry *decode_by_prefix(DisasContext *s, const X86OpEntry entries[4])
+{
+    if (s->prefix & PREFIX_REPNZ) {
+        return &entries[3];
+    } else if (s->prefix & PREFIX_REPZ) {
+        return &entries[2];
+    } else if (s->prefix & PREFIX_DATA) {
+        return &entries[1];
+    } else {
+        return &entries[0];
+    }
+}
+
 static void decode_group17(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
 {
     static const X86GenFunc group17_gen[8] = {
@@ -144,6 +157,17 @@ static void decode_group17(DisasContext *s, CPUX86State *env, X86OpEntry *entry,
     entry->gen = group17_gen[op];
 }
 
+static void decode_0F6F(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    static const X86OpEntry opcodes_0F6F[4] = {
+        X86_OP_ENTRY3(MOVDQ,       P,q, None,None, Q,q, vex1 mmx),  /* movq */
+        X86_OP_ENTRY3(MOVDQ,       V,x, None,None, W,x, vex1),      /* movdqa */
+        X86_OP_ENTRY3(MOVDQ,       V,x, None,None, W,x, vex4_unal), /* movdqu */
+        {},
+    };
+    *entry = *decode_by_prefix(s, opcodes_0F6F);
+}
+
 static const X86OpEntry opcodes_0F38_00toEF[240] = {
 };
 
@@ -229,8 +253,26 @@ static void decode_0F3A(DisasContext *s, CPUX86State *env, X86OpEntry *entry, ui
 }
 
 static const X86OpEntry opcodes_0F[256] = {
+    [0x60] = X86_OP_ENTRY3(PUNPCKLBW,  V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+    [0x61] = X86_OP_ENTRY3(PUNPCKLWD,  V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+    [0x62] = X86_OP_ENTRY3(PUNPCKLDQ,  V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+    [0x63] = X86_OP_ENTRY3(PACKSSWB,   V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+    [0x64] = X86_OP_ENTRY3(PCMPGTB,    V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+    [0x65] = X86_OP_ENTRY3(PCMPGTW,    V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+    [0x66] = X86_OP_ENTRY3(PCMPGTD,    V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+    [0x67] = X86_OP_ENTRY3(PACKUSWB,   V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+
     [0x38] = X86_OP_GROUP0(0F38),
     [0x3a] = X86_OP_GROUP0(0F3A),
+
+    [0x68] = X86_OP_ENTRY3(PUNPCKHBW,  V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+    [0x69] = X86_OP_ENTRY3(PUNPCKHWD,  V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+    [0x6a] = X86_OP_ENTRY3(PUNPCKHDQ,  V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+    [0x6b] = X86_OP_ENTRY3(PACKSSDW,   V,x, H,x, W,x, vex4 mmx avx2_256 p_00_66),
+    [0x6c] = X86_OP_ENTRY3(PUNPCKLQDQ, V,x, H,x, W,x, vex4 p_66 avx2_256),
+    [0x6d] = X86_OP_ENTRY3(PUNPCKHQDQ, V,x, H,x, W,x, vex4 p_66 avx2_256),
+    [0x6e] = X86_OP_ENTRY3(MOVD_to,    V,x, None,None, E,y, vex5 mmx p_00_66), /* wrong dest Vy on SDM! */
+    [0x6f] = X86_OP_GROUP0(0F6F),
 };
 
 static void do_decode_0F(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc
index 947deacd37..8dbacc21ed 100644
--- a/target/i386/tcg/emit.c.inc
+++ b/target/i386/tcg/emit.c.inc
@@ -71,6 +71,56 @@ static inline int xmm_offset(MemOp ot)
     }
 }
 
+static int vector_reg_offset(X86DecodedOp *op)
+{
+    assert(op->unit == X86_OP_MMX || op->unit == X86_OP_SSE);
+
+    if (op->unit == X86_OP_MMX) {
+        return op->offset - mmx_offset(op->ot);
+    } else {
+        return op->offset - xmm_offset(op->ot);
+    }
+}
+
+static int vector_elem_offset(X86DecodedOp *op, MemOp ot, int n)
+{
+    int base_ofs = vector_reg_offset(op);
+    switch(ot) {
+    case MO_8:
+        if (op->unit == X86_OP_MMX) {
+            return base_ofs + offsetof(MMXReg, MMX_B(n));
+        } else {
+            return base_ofs + offsetof(ZMMReg, ZMM_B(n));
+        }
+    case MO_16:
+        if (op->unit == X86_OP_MMX) {
+            return base_ofs + offsetof(MMXReg, MMX_W(n));
+        } else {
+            return base_ofs + offsetof(ZMMReg, ZMM_W(n));
+        }
+    case MO_32:
+        if (op->unit == X86_OP_MMX) {
+            return base_ofs + offsetof(MMXReg, MMX_L(n));
+        } else {
+            return base_ofs + offsetof(ZMMReg, ZMM_L(n));
+        }
+    case MO_64:
+        if (op->unit == X86_OP_MMX) {
+            return base_ofs;
+        } else {
+            return base_ofs + offsetof(ZMMReg, ZMM_Q(n));
+        }
+    case MO_128:
+        assert(op->unit == X86_OP_SSE);
+        return base_ofs + offsetof(ZMMReg, ZMM_X(n));
+    case MO_256:
+        assert(op->unit == X86_OP_SSE);
+        return base_ofs + offsetof(ZMMReg, ZMM_Y(n));
+    default:
+        g_assert_not_reached();
+    }
+}
+
 static void compute_mmx_offset(X86DecodedOp *op)
 {
     if (!op->has_ea) {
@@ -183,6 +233,23 @@ static void gen_load(DisasContext *s, X86DecodedInsn *decode, int opn, TCGv v)
     }
 }
 
+static TCGv_ptr op_ptr(X86DecodedInsn *decode, int opn)
+{
+    X86DecodedOp *op = &decode->op[opn];
+    if (op->v_ptr) {
+        return op->v_ptr;
+    }
+    op->v_ptr = tcg_temp_new_ptr();
+
+    /* The temporary points to the MMXReg or ZMMReg. */
+    tcg_gen_addi_ptr(op->v_ptr, cpu_env, vector_reg_offset(op));
+    return op->v_ptr;
+}
+
+#define OP_PTR0 op_ptr(decode, 0)
+#define OP_PTR1 op_ptr(decode, 1)
+#define OP_PTR2 op_ptr(decode, 2)
+
 static void gen_writeback(DisasContext *s, X86DecodedInsn *decode, int opn, TCGv v)
 {
     X86DecodedOp *op = &decode->op[opn];
@@ -216,6 +283,114 @@ static void gen_writeback(DisasContext *s, X86DecodedInsn *decode, int opn, TCGv
     }
 }
 
+static inline int vector_len(DisasContext *s, X86DecodedInsn *decode)
+{
+    if (decode->e.special == X86_SPECIAL_MMX &&
+        !(s->prefix & (PREFIX_DATA | PREFIX_REPZ | PREFIX_REPNZ))) {
+        return 8;
+    }
+    return s->vex_l ? 32 : 16;
+}
+
+static void gen_store_sse(DisasContext *s, X86DecodedInsn *decode, int src_ofs)
+{
+    MemOp ot = decode->op[0].ot;
+    int vec_len = vector_len(s, decode);
+    bool aligned = sse_needs_alignment(s, decode, ot);
+
+    if (!decode->op[0].has_ea) {
+        tcg_gen_gvec_mov(MO_64, decode->op[0].offset, src_ofs, vec_len, vec_len);
+        return;
+    }
+
+    switch (ot) {
+    case MO_64:
+        gen_stq_env_A0(s, src_ofs);
+        break;
+    case MO_128:
+        gen_sto_env_A0(s, src_ofs, aligned);
+        break;
+    case MO_256:
+        gen_sty_env_A0(s, src_ofs, aligned);
+        break;
+    default:
+        g_assert_not_reached();
+    }
+}
+
+#define BINARY_INT_GVEC(uname, func, ...)                                          \
+static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+{                                                                                  \
+    int vec_len = vector_len(s, decode);                                           \
+                                                                                   \
+    func(__VA_ARGS__,                                                              \
+         decode->op[0].offset, decode->op[1].offset,                               \
+         decode->op[2].offset, vec_len, vec_len);                                  \
+}
+
+BINARY_INT_GVEC(PCMPGTB,   tcg_gen_gvec_cmp, TCG_COND_GT, MO_8)
+BINARY_INT_GVEC(PCMPGTW,   tcg_gen_gvec_cmp, TCG_COND_GT, MO_16)
+BINARY_INT_GVEC(PCMPGTD,   tcg_gen_gvec_cmp, TCG_COND_GT, MO_32)
+
+
+/*
+ * 00 = p*  Pq, Qq (if mmx not NULL; no VEX)
+ * 66 = vp* Vx, Hx, Wx
+ *
+ * These are really the same encoding, because 1) V is the same as P when VEX.V
+ * is not present 2) P and Q are the same as H and W apart from MM/XMM
+ */
+static inline void gen_binary_int_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode,
+                                      SSEFunc_0_eppp mmx, SSEFunc_0_eppp xmm, SSEFunc_0_eppp ymm)
+{
+    assert(!!mmx == !!(decode->e.special == X86_SPECIAL_MMX));
+
+    if (mmx && (s->prefix & PREFIX_VEX) && !(s->prefix & PREFIX_DATA)) {
+        /* VEX encoding is not applicable to MMX instructions. */
+        gen_illegal_opcode(s);
+        return;
+    }
+    if (!(s->prefix & PREFIX_DATA)) {
+        mmx(cpu_env, OP_PTR0, OP_PTR1, OP_PTR2);
+    } else if (!s->vex_l) {
+        xmm(cpu_env, OP_PTR0, OP_PTR1, OP_PTR2);
+    } else {
+        ymm(cpu_env, OP_PTR0, OP_PTR1, OP_PTR2);
+    }
+}
+
+
+#define BINARY_INT_MMX(uname, lname)                                               \
+static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+{                                                                                  \
+    gen_binary_int_sse(s, env, decode,                                             \
+                       gen_helper_##lname##_mmx,                                   \
+                       gen_helper_##lname##_xmm,                                   \
+                       gen_helper_##lname##_ymm);                                  \
+}
+BINARY_INT_MMX(PUNPCKLBW,  punpcklbw)
+BINARY_INT_MMX(PUNPCKLWD,  punpcklwd)
+BINARY_INT_MMX(PUNPCKLDQ,  punpckldq)
+BINARY_INT_MMX(PACKSSWB,   packsswb)
+BINARY_INT_MMX(PACKUSWB,   packuswb)
+BINARY_INT_MMX(PUNPCKHBW,  punpckhbw)
+BINARY_INT_MMX(PUNPCKHWD,  punpckhwd)
+BINARY_INT_MMX(PUNPCKHDQ,  punpckhdq)
+BINARY_INT_MMX(PACKSSDW,   packssdw)
+
+/* Instructions with no MMX equivalent. */
+#define BINARY_INT_SSE(uname, lname)                                               \
+static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \
+{                                                                                  \
+    gen_binary_int_sse(s, env, decode,                                             \
+                       NULL,                                                       \
+                       gen_helper_##lname##_xmm,                                   \
+                       gen_helper_##lname##_ymm);                                  \
+}
+
+BINARY_INT_SSE(PUNPCKLQDQ, punpcklqdq)
+BINARY_INT_SSE(PUNPCKHQDQ, punpckhqdq)
+
 static void gen_ADCOX(DisasContext *s, CPUX86State *env, MemOp ot, int cc_op)
 {
     TCGv carry_in = NULL;
@@ -383,6 +558,33 @@ static void gen_MOVBE(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
     }
 }
 
+static void gen_MOVD_to(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    MemOp ot = decode->op[2].ot;
+    int vec_len = vector_len(s, decode);
+    int lo_ofs = vector_elem_offset(&decode->op[0], ot, 0);
+
+    tcg_gen_gvec_dup_imm(MO_64, decode->op[0].offset, vec_len, vec_len, 0);
+
+    switch (ot) {
+    case MO_32:
+#ifdef TARGET_X86_64
+        tcg_gen_st32_tl(s->T1, cpu_env, lo_ofs);
+        break;
+    case MO_64:
+#endif
+        tcg_gen_st_tl(s->T1, cpu_env, lo_ofs);
+        break;
+    default:
+        g_assert_not_reached();
+    }
+}
+
+static void gen_MOVDQ(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
+{
+    gen_store_sse(s, decode, decode->op[2].offset);
+}
+
 static void gen_MULX(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode)
 {
     MemOp ot = decode->op[0].ot;
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index e2c01af02d..5133b8c23d 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -2935,6 +2935,23 @@ static void gen_ldy_env_A0(DisasContext *s, int offset, bool align)
     tcg_gen_st_i64(s->tmp1_i64, cpu_env, offset + offsetof(YMMReg, YMM_Q(3)));
 }
 
+static void gen_sty_env_A0(DisasContext *s, int offset, bool align)
+{
+    int mem_index = s->mem_index;
+    tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(YMMReg, YMM_Q(0)));
+    tcg_gen_qemu_st_i64(s->tmp1_i64, s->A0, mem_index,
+                        MO_LEUQ | (align ? MO_ALIGN_32 : 0));
+    tcg_gen_addi_tl(s->tmp0, s->A0, 8);
+    tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(YMMReg, YMM_Q(1)));
+    tcg_gen_qemu_st_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ);
+    tcg_gen_addi_tl(s->tmp0, s->A0, 16);
+    tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(YMMReg, YMM_Q(2)));
+    tcg_gen_qemu_st_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ);
+    tcg_gen_addi_tl(s->tmp0, s->A0, 24);
+    tcg_gen_ld_i64(s->tmp1_i64, cpu_env, offset + offsetof(YMMReg, YMM_Q(3)));
+    tcg_gen_qemu_st_i64(s->tmp1_i64, s->tmp0, mem_index, MO_LEUQ);
+}
+
 static inline void gen_op_movo(DisasContext *s, int d_offset, int s_offset)
 {
     tcg_gen_ld_i64(s->tmp1_i64, cpu_env, s_offset + offsetof(XMMReg, XMM_Q(0)));
@@ -4764,7 +4781,7 @@ static bool disas_insn(DisasContext *s, CPUState *cpu)
 #ifndef CONFIG_USER_ONLY
     use_new &= b <= limit;
 #endif
-    if (use_new && 0) {
+    if (use_new && (b >= 0x160 && b <= 0x16f)) {
         disas_insn_new(s, cpu, b + 0x100);
         return s->pc;
     }
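
Note (not part of the patch): the dispatch idea described in the commit message can be illustrated outside of QEMU's types. The sketch below is a minimal, standalone C program; the names pick_by_prefix, emit_binary_int, helper_mmx/xmm/ymm and the PFX_* constants are invented for this note and only mirror what decode_by_prefix() and gen_binary_int_sse() do in the patch above.

/* Standalone illustration: prefix-based table selection and
 * MMX/XMM/YMM helper dispatch.  All identifiers are invented for
 * this example and are not QEMU APIs. */
#include <stdio.h>

enum { PFX_NONE = 0, PFX_66 = 1, PFX_F3 = 2, PFX_F2 = 4 };

static void helper_mmx(void) { puts("64-bit MMX helper"); }
static void helper_xmm(void) { puts("128-bit XMM helper"); }
static void helper_ymm(void) { puts("256-bit YMM helper"); }

/* Like decode_by_prefix(): the 66/F3/F2 prefixes select one of four
 * table entries for the same opcode byte (e.g. 0x6f -> movq/movdqa/movdqu). */
static const char *pick_by_prefix(int prefix, const char *entries[4])
{
    if (prefix & PFX_F2) {
        return entries[3];
    } else if (prefix & PFX_F3) {
        return entries[2];
    } else if (prefix & PFX_66) {
        return entries[1];
    }
    return entries[0];
}

/* Like gen_binary_int_sse(): without the 66 prefix the MMX form is used;
 * otherwise VEX.L chooses between the 128-bit and 256-bit helpers. */
static void emit_binary_int(int prefix, int vex_l)
{
    if (!(prefix & PFX_66)) {
        helper_mmx();
    } else if (!vex_l) {
        helper_xmm();
    } else {
        helper_ymm();
    }
}

int main(void)
{
    static const char *mov_0f6f[4] = { "movq", "movdqa", "movdqu", NULL };

    printf("0f 6f with F3 prefix: %s\n", pick_by_prefix(PFX_F3, mov_0f6f));
    emit_binary_int(PFX_66, 1); /* e.g. a VEX.256 punpck* form */
    return 0;
}

Because the prefix and VEX.L pick the helper at emit time, the decode table needs only one entry per opcode for these instructions, which is what keeps the 0x60-0x6f rows in opcodes_0F so uniform.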