From f6aa2f7dee920f6f06fefe122cf2a58cabe3cac0 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Tue, 21 Jan 2014 08:36:38 -0800 Subject: [PATCH 01/15] TCG: Fix 32-bit host allocation typo The second half register of a 64-bit temp on a 32-bit host was allocated with the wrong base_type. The base_type of the second half register is never checked, but for consistency it should be the same as the first half. Signed-off-by: Richard Henderson --- tcg/tcg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tcg/tcg.c b/tcg/tcg.c index acd02b99b6..ffc851e0c6 100644 --- a/tcg/tcg.c +++ b/tcg/tcg.c @@ -526,7 +526,7 @@ static inline int tcg_temp_new_internal(TCGType type, int temp_local) ts->temp_local = temp_local; ts->name = NULL; ts++; - ts->base_type = TCG_TYPE_I32; + ts->base_type = type; ts->type = TCG_TYPE_I32; ts->temp_allocated = 1; ts->temp_local = temp_local; From 7a3a00979d9dfe2aaa66ce5fc68cd161b4f900ba Mon Sep 17 00:00:00 2001 From: Huw Davies Date: Thu, 13 Feb 2014 10:26:46 +0000 Subject: [PATCH 02/15] tcg-arm: The shift count of op_rotl_i32 is in args[2] not args[1]. It's this that should be subtracted from 0x20 when converting to a right rotate. Cc: qemu-stable@nongnu.org Signed-off-by: Huw Davies Signed-off-by: Richard Henderson --- tcg/arm/tcg-target.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c index 82658a170c..c8884b31f4 100644 --- a/tcg/arm/tcg-target.c +++ b/tcg/arm/tcg-target.c @@ -1866,7 +1866,7 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, SHIFT_IMM_ROR((0x20 - args[2]) & 0x1f) : SHIFT_IMM_LSL(0)); } else { - tcg_out_dat_imm(s, COND_AL, ARITH_RSB, TCG_REG_TMP, args[1], 0x20); + tcg_out_dat_imm(s, COND_AL, ARITH_RSB, TCG_REG_TMP, args[2], 0x20); tcg_out_dat_reg(s, COND_AL, ARITH_MOV, args[0], 0, args[1], SHIFT_REG_ROR(TCG_REG_TMP)); } From e46b225a3137e62c975c49aaae7bb5f9583cc428 Mon Sep 17 00:00:00 2001 From: Aurelien Jarno Date: Tue, 3 Sep 2013 08:27:38 +0200 Subject: [PATCH 03/15] tcg/optimize: fix known-zero bits for right shift ops 32-bit versions of sar and shr ops should not propagate known-zero bits from the unused 32 high bits. For sar it could even lead to wrong code being generated. 
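For illustration (an editor's sketch in plain C, not part of the change; helper names are hypothetical), the intended mask computation operates on the low 32 bits only, so stale high bits cannot leak through:

    #include <stdint.h>

    /* Propagate a may-be-nonzero bits mask through a 32-bit shift.
       Assumes count < 32 and an arithmetic right shift of signed
       values, as TCG requires of its hosts. */
    static uint64_t mask_sar_i32(uint64_t mask, unsigned count)
    {
        return (uint32_t)((int32_t)mask >> count);
    }

    static uint64_t mask_shr_i32(uint64_t mask, unsigned count)
    {
        return (uint32_t)mask >> count;
    }

E.g. with mask 0x80000000 and count 4, the 32-bit result must be 0xf8000000; the old 64-bit shift instead produced 0x08000000, wrongly claiming bits 28-31 known-zero, which later passes could use to delete live code.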
Cc: qemu-stable@nongnu.org
Reviewed-by: Paolo Bonzini
Signed-off-by: Aurelien Jarno
Signed-off-by: Richard Henderson
---
 tcg/optimize.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index 89e2d6a3b3..c5cdde2160 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -726,16 +726,25 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
         mask = temps[args[1]].mask & mask;
         break;

-    CASE_OP_32_64(sar):
+    case INDEX_op_sar_i32:
         if (temps[args[2]].state == TCG_TEMP_CONST) {
-            mask = ((tcg_target_long)temps[args[1]].mask
-                    >> temps[args[2]].val);
+            mask = (int32_t)temps[args[1]].mask >> temps[args[2]].val;
+        }
+        break;
+    case INDEX_op_sar_i64:
+        if (temps[args[2]].state == TCG_TEMP_CONST) {
+            mask = (int64_t)temps[args[1]].mask >> temps[args[2]].val;
         }
         break;

-    CASE_OP_32_64(shr):
+    case INDEX_op_shr_i32:
         if (temps[args[2]].state == TCG_TEMP_CONST) {
-            mask = temps[args[1]].mask >> temps[args[2]].val;
+            mask = (uint32_t)temps[args[1]].mask >> temps[args[2]].val;
+        }
+        break;
+    case INDEX_op_shr_i64:
+        if (temps[args[2]].state == TCG_TEMP_CONST) {
+            mask = (uint64_t)temps[args[1]].mask >> temps[args[2]].val;
         }
         break;

From 3031244b01492528fd7b5e46b23eeb2124dc780a Mon Sep 17 00:00:00 2001
From: Aurelien Jarno
Date: Tue, 3 Sep 2013 08:27:38 +0200
Subject: [PATCH 04/15] tcg/optimize: fix known-zero bits optimization

Known-zero bits optimization is a great idea that helps to generate more
optimized code. However the current implementation only works in very few
cases, as the computed mask is not saved. Fix this to make it really work.

Reviewed-by: Paolo Bonzini
Signed-off-by: Aurelien Jarno
Signed-off-by: Richard Henderson
---
 tcg/optimize.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index c5cdde2160..7838be2c50 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -691,7 +691,8 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
         break;
     }

-    /* Simplify using known-zero bits */
+    /* Simplify using known-zero bits. Currently only ops with a single
+       output argument are supported. */
     mask = -1;
     affected = -1;
     switch (op) {
@@ -1149,6 +1150,11 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
         } else {
             for (i = 0; i < def->nb_oargs; i++) {
                 reset_temp(args[i]);
+                /* Save the corresponding known-zero bits mask for the
+                   first output argument (only one supported so far). */
+                if (i == 0) {
+                    temps[args[i]].mask = mask;
+                }
             }
         }
         for (i = 0; i < def->nb_args; i++) {

From f096dc96188378bc2bcd80683490ca386b0c1683 Mon Sep 17 00:00:00 2001
From: Aurelien Jarno
Date: Tue, 3 Sep 2013 08:27:38 +0200
Subject: [PATCH 05/15] tcg/optimize: improve known-zero bits for 32-bit ops

The shl_i32 op might set some bits of the unused 32 high bits of the
mask. Fix that by clearing the unused 32 high bits for all 32-bit ops
except load/store, which operate on tl values.
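As a concrete example (values illustrative, not from the patch): a 32-bit temp with all bits unknown has mask 0xffffffff, and shl_i32 by 4, computed in 64 bits, spills into the high half. A minimal sketch of the added truncation:

    #include <stdint.h>

    /* Illustration: how shl_i32 can pollute the high half of the mask. */
    static uint64_t shl_i32_mask_example(void)
    {
        uint64_t mask = 0xffffffffull << 4; /* 0xffffffff0: bits 32-35 set */
        mask &= 0xffffffffu;                /* 0xfffffff0: truncation added */
        return mask;
    }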
Reviewed-by: Paolo Bonzini
Signed-off-by: Aurelien Jarno
Signed-off-by: Richard Henderson
---
 tcg/optimize.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index 7838be2c50..1cf017aab8 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -783,6 +783,12 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
         break;
     }

+    /* 32-bit ops (non 64-bit ops and non load/store ops) generate 32-bit
+       results */
+    if (!(tcg_op_defs[op].flags & (TCG_OPF_CALL_CLOBBER | TCG_OPF_64BIT))) {
+        mask &= 0xffffffffu;
+    }
+
     if (mask == 0) {
         assert(def->nb_oargs == 1);
         s->gen_opc_buf[op_index] = op_to_movi(op);

From c8d70272535b84ccd3cd1a3dcad65aed34be6bb4 Mon Sep 17 00:00:00 2001
From: Aurelien Jarno
Date: Tue, 3 Sep 2013 08:27:39 +0200
Subject: [PATCH 06/15] tcg/optimize: add known-zero bits computation for load ops

Reviewed-by: Paolo Bonzini
Signed-off-by: Aurelien Jarno
Signed-off-by: Richard Henderson
---
 tcg/optimize.c | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index 1cf017aab8..1b9fea5a3d 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -779,13 +779,37 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
         mask = temps[args[3]].mask | temps[args[4]].mask;
         break;

+    CASE_OP_32_64(ld8u):
+    case INDEX_op_qemu_ld8u:
+        mask = 0xff;
+        break;
+    CASE_OP_32_64(ld16u):
+    case INDEX_op_qemu_ld16u:
+        mask = 0xffff;
+        break;
+    case INDEX_op_ld32u_i64:
+#if TCG_TARGET_REG_BITS == 64
+    case INDEX_op_qemu_ld32u:
+#endif
+        mask = 0xffffffffu;
+        break;
+
+    CASE_OP_32_64(qemu_ld):
+        {
+            TCGMemOp mop = args[def->nb_oargs + def->nb_iargs];
+            if (!(mop & MO_SIGN)) {
+                mask = (2ULL << ((8 << (mop & MO_SIZE)) - 1)) - 1;
+            }
+        }
+        break;
+
     default:
         break;
     }

     /* 32-bit ops (non 64-bit ops and non load/store ops) generate 32-bit
        results */
-    if (!(tcg_op_defs[op].flags & (TCG_OPF_CALL_CLOBBER | TCG_OPF_64BIT))) {
+    if (!(def->flags & (TCG_OPF_CALL_CLOBBER | TCG_OPF_64BIT))) {
         mask &= 0xffffffffu;
     }

From 23ec69ed3759fe5d8374cb22795ade1305c331c4 Mon Sep 17 00:00:00 2001
From: Richard Henderson
Date: Tue, 28 Jan 2014 12:03:24 -0800
Subject: [PATCH 07/15] tcg/optimize: Handle known-zeros masks for ANDC

Reviewed-by: Paolo Bonzini
Reviewed-by: Aurelien Jarno
Signed-off-by: Richard Henderson
---
 tcg/optimize.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index 1b9fea5a3d..4bea8a56ef 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -727,6 +727,17 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
         mask = temps[args[1]].mask & mask;
         break;

+    CASE_OP_32_64(andc):
+        /* Known-zeros does not imply known-ones.  Therefore unless
+           args[2] is constant, we can't infer anything from it. */
+        if (temps[args[2]].state == TCG_TEMP_CONST) {
+            mask = ~temps[args[2]].mask;
+            goto and_const;
+        }
+        /* But we certainly know nothing outside args[1] may be set. */
+        mask = temps[args[1]].mask;
+        break;
+
     case INDEX_op_sar_i32:
         if (temps[args[2]].state == TCG_TEMP_CONST) {
             mask = (int32_t)temps[args[1]].mask >> temps[args[2]].val;

From e201b56418a5bb6afadc42df16f94880c091fad4 Mon Sep 17 00:00:00 2001
From: Richard Henderson
Date: Tue, 28 Jan 2014 13:15:38 -0800
Subject: [PATCH 08/15] tcg/optimize: Simplify some logical ops to NOT

Given, of course, an appropriate constant. These could be generated
from the "canonical" operation for inversion on the guest, or via
other optimizations.
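The identities being recognized can be checked in plain C (an illustrative sketch, separate from the patch; the TCG ops are named in the comments):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        int64_t a = 0x1234, b = -0x5678;  /* arbitrary example values */
        assert((a ^ -1) == ~a);   /* xor  r,a,-1 -> not r,a */
        assert(~(a & -1) == ~a);  /* nand r,a,-1 -> not r,a */
        assert(~(a | 0) == ~a);   /* nor  r,a,0  -> not r,a */
        assert((-1 & ~b) == ~b);  /* andc r,-1,b -> not r,b */
        assert((0 | ~b) == ~b);   /* orc  r,0,b  -> not r,b */
        assert(~(0 ^ b) == ~b);   /* eqv  r,0,b  -> not r,b */
        return 0;
    }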
Reviewed-by: Paolo Bonzini
Reviewed-by: Aurelien Jarno
Signed-off-by: Richard Henderson
---
 tcg/optimize.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index 4bea8a56ef..0b1dd13a63 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -655,6 +655,63 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
             }
         }
         break;
+    CASE_OP_32_64(xor):
+    CASE_OP_32_64(nand):
+        if (temps[args[1]].state != TCG_TEMP_CONST
+            && temps[args[2]].state == TCG_TEMP_CONST
+            && temps[args[2]].val == -1) {
+            i = 1;
+            goto try_not;
+        }
+        break;
+    CASE_OP_32_64(nor):
+        if (temps[args[1]].state != TCG_TEMP_CONST
+            && temps[args[2]].state == TCG_TEMP_CONST
+            && temps[args[2]].val == 0) {
+            i = 1;
+            goto try_not;
+        }
+        break;
+    CASE_OP_32_64(andc):
+        if (temps[args[2]].state != TCG_TEMP_CONST
+            && temps[args[1]].state == TCG_TEMP_CONST
+            && temps[args[1]].val == -1) {
+            i = 2;
+            goto try_not;
+        }
+        break;
+    CASE_OP_32_64(orc):
+    CASE_OP_32_64(eqv):
+        if (temps[args[2]].state != TCG_TEMP_CONST
+            && temps[args[1]].state == TCG_TEMP_CONST
+            && temps[args[1]].val == 0) {
+            i = 2;
+            goto try_not;
+        }
+        break;
+    try_not:
+        {
+            TCGOpcode not_op;
+            bool have_not;
+
+            if (def->flags & TCG_OPF_64BIT) {
+                not_op = INDEX_op_not_i64;
+                have_not = TCG_TARGET_HAS_not_i64;
+            } else {
+                not_op = INDEX_op_not_i32;
+                have_not = TCG_TARGET_HAS_not_i32;
+            }
+            if (!have_not) {
+                break;
+            }
+            s->gen_opc_buf[op_index] = not_op;
+            reset_temp(args[0]);
+            gen_args[0] = args[0];
+            gen_args[1] = args[i];
+            args += 3;
+            gen_args += 2;
+            continue;
+        }
     default:
         break;
     }

From e64e958e202c563730159c52f7c9116c80ceca52 Mon Sep 17 00:00:00 2001
From: Richard Henderson
Date: Tue, 28 Jan 2014 13:26:17 -0800
Subject: [PATCH 09/15] tcg/optimize: Optimize ANDC X,Y,Y to MOV X,0

Like we already do for SUB and XOR.

Reviewed-by: Paolo Bonzini
Reviewed-by: Aurelien Jarno
Signed-off-by: Richard Henderson
---
 tcg/optimize.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index 0b1dd13a63..c1b9284600 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -947,6 +947,7 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,

     /* Simplify expression for "op r, a, a => movi r, 0" cases */
     switch (op) {
+    CASE_OP_32_64(andc):
     CASE_OP_32_64(sub):
     CASE_OP_32_64(xor):
         if (temps_are_copies(args[1], args[2])) {

From 464a1441c138b4f29cff26d406298661e588235b Mon Sep 17 00:00:00 2001
From: Richard Henderson
Date: Fri, 31 Jan 2014 07:42:11 -0600
Subject: [PATCH 10/15] tcg/optimize: Add more identity simplifications

Recognize 0 operand to andc, and -1 operands to and, orc, eqv.

Reviewed-by: Paolo Bonzini
Reviewed-by: Aurelien Jarno
Signed-off-by: Richard Henderson
---
 tcg/optimize.c | 39 ++++++++++++++++++++++++---------------
 1 file changed, 24 insertions(+), 15 deletions(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index c1b9284600..7777743e88 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -716,7 +716,7 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
         break;
     }

-    /* Simplify expression for "op r, a, 0 => mov r, a" cases */
+    /* Simplify expression for "op r, a, const => mov r, a" cases */
     switch (op) {
     CASE_OP_32_64(add):
     CASE_OP_32_64(sub):
@@ -727,23 +727,32 @@ static TCGArg *tcg_constant_folding(TCGContext *s, uint16_t *tcg_opc_ptr,
     CASE_OP_32_64(rotr):
     CASE_OP_32_64(or):
     CASE_OP_32_64(xor):
-        if (temps[args[1]].state == TCG_TEMP_CONST) {
-            /* Proceed with possible constant folding. */
-            break;
-        }
-        if (temps[args[2]].state == TCG_TEMP_CONST
+    CASE_OP_32_64(andc):
+        if (temps[args[1]].state != TCG_TEMP_CONST
+            && temps[args[2]].state == TCG_TEMP_CONST
             && temps[args[2]].val == 0) {
-            if (temps_are_copies(args[0], args[1])) {
-                s->gen_opc_buf[op_index] = INDEX_op_nop;
-            } else {
-                s->gen_opc_buf[op_index] = op_to_mov(op);
-                tcg_opt_gen_mov(s, gen_args, args[0], args[1]);
-                gen_args += 2;
-            }
-            args += 3;
-            continue;
+            goto do_mov3;
         }
         break;
+    CASE_OP_32_64(and):
+    CASE_OP_32_64(orc):
+    CASE_OP_32_64(eqv):
+        if (temps[args[1]].state != TCG_TEMP_CONST
+            && temps[args[2]].state == TCG_TEMP_CONST
+            && temps[args[2]].val == -1) {
+            goto do_mov3;
+        }
+        break;
+    do_mov3:
+        if (temps_are_copies(args[0], args[1])) {
+            s->gen_opc_buf[op_index] = INDEX_op_nop;
+        } else {
+            s->gen_opc_buf[op_index] = op_to_mov(op);
+            tcg_opt_gen_mov(s, gen_args, args[0], args[1]);
+            gen_args += 2;
+        }
+        args += 3;
+        continue;
     default:
         break;
     }

From 189f792dc5dd744c5f5d2333a7c52784e108974e Mon Sep 17 00:00:00 2001
From: Richard Henderson
Date: Tue, 28 Jan 2014 16:39:36 -0800
Subject: [PATCH 11/15] disas/i386: Disassemble ANDN/SHLX/SHRX/SARX

Reviewed-by: Paolo Bonzini
Reviewed-by: Aurelien Jarno
Signed-off-by: Richard Henderson
---
 disas/i386.c | 146 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 132 insertions(+), 14 deletions(-)

diff --git a/disas/i386.c b/disas/i386.c
index 044e02c032..00ceca9c51 100644
--- a/disas/i386.c
+++ b/disas/i386.c
@@ -171,6 +171,7 @@ static void print_operand_value (char *buf, size_t bufsize, int hex, bfd_vma dis
 static void print_displacement (char *, bfd_vma);
 static void OP_E (int, int);
 static void OP_G (int, int);
+static void OP_vvvv (int, int);
 static bfd_vma get64 (void);
 static bfd_signed_vma get32 (void);
 static bfd_signed_vma get32s (void);
@@ -264,6 +265,9 @@ static int rex_used;
    current instruction.  */
 static int used_prefixes;

+/* The VEX.vvvv register, unencoded.  */
+static int vex_reg;
+
 /* Flags stored in PREFIXES.  */
 #define PREFIX_REPZ 1
 #define PREFIX_REPNZ 2
@@ -278,6 +282,10 @@ static int used_prefixes;
 #define PREFIX_ADDR 0x400
 #define PREFIX_FWAIT 0x800

+#define PREFIX_VEX_0F 0x1000
+#define PREFIX_VEX_0F38 0x2000
+#define PREFIX_VEX_0F3A 0x4000
+
 /* Make sure that bytes from INFO->PRIVATE_DATA->BUFFER (inclusive)
    to ADDR (exclusive) are valid.  Returns 1 for success, longjmps
    on error.
*/ @@ -323,6 +331,7 @@ fetch_data(struct disassemble_info *info, bfd_byte *addr) #define XX { NULL, 0 } +#define Bv { OP_vvvv, v_mode } #define Eb { OP_E, b_mode } #define Ev { OP_E, v_mode } #define Ed { OP_E, d_mode } @@ -671,7 +680,8 @@ fetch_data(struct disassemble_info *info, bfd_byte *addr) #define PREGRP102 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 102 } } #define PREGRP103 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 103 } } #define PREGRP104 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 104 } } - +#define PREGRP105 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 105 } } +#define PREGRP106 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 106 } } #define X86_64_0 NULL, { { NULL, X86_64_SPECIAL }, { NULL, 0 } } #define X86_64_1 NULL, { { NULL, X86_64_SPECIAL }, { NULL, 1 } } @@ -1449,7 +1459,7 @@ static const unsigned char threebyte_0x38_uses_DATA_prefix[256] = { /* c0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* cf */ /* d0 */ 0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1, /* df */ /* e0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ef */ - /* f0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ff */ + /* f0 */ 0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0, /* ff */ /* ------------------------------- */ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; @@ -1473,7 +1483,7 @@ static const unsigned char threebyte_0x38_uses_REPNZ_prefix[256] = { /* c0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* cf */ /* d0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* df */ /* e0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ef */ - /* f0 */ 1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ff */ + /* f0 */ 1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0, /* ff */ /* ------------------------------- */ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; @@ -1497,7 +1507,7 @@ static const unsigned char threebyte_0x38_uses_REPZ_prefix[256] = { /* c0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* cf */ /* d0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* df */ /* e0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ef */ - /* f0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ff */ + /* f0 */ 0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0, /* ff */ /* ------------------------------- */ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; @@ -2774,6 +2784,22 @@ static const struct dis386 prefix_user_table[][4] = { { "(bad)", { XX } }, }, + /* PREGRP105 */ + { + { "andnS", { Gv, Bv, Ev } }, + { "(bad)", { XX } }, + { "(bad)", { XX } }, + { "(bad)", { XX } }, + }, + + /* PREGRP106 */ + { + { "bextrS", { Gv, Ev, Bv } }, + { "sarxS", { Gv, Ev, Bv } }, + { "shlxS", { Gv, Ev, Bv } }, + { "shrxS", { Gv, Ev, Bv } }, + }, + }; static const struct dis386 x86_64_table[][2] = { @@ -3071,12 +3097,12 @@ static const struct dis386 three_byte_table[][256] = { /* f0 */ { PREGRP87 }, { PREGRP88 }, + { PREGRP105 }, { "(bad)", { XX } }, { "(bad)", { XX } }, { "(bad)", { XX } }, { "(bad)", { XX } }, - { "(bad)", { XX } }, - { "(bad)", { XX } }, + { PREGRP106 }, /* f8 */ { "(bad)", { XX } }, { "(bad)", { XX } }, @@ -3477,6 +3503,74 @@ ckprefix (void) } } +static void +ckvexprefix (void) +{ + int op, vex2, vex3, newrex = 0, newpfx = prefixes; + + if (address_mode == mode_16bit) { + return; + } + + fetch_data(the_info, codep + 1); + op = *codep; + + if (op != 0xc4 && op != 0xc5) { + return; + } + + fetch_data(the_info, codep + 2); + vex2 = codep[1]; + + if (address_mode == mode_32bit && (vex2 & 0xc0) != 0xc0) { + return; + } + + if (op == 0xc4) { + /* Three byte VEX prefix. */ + fetch_data(the_info, codep + 3); + vex3 = codep[2]; + + newrex |= (vex2 & 0x80 ? 0 : REX_R); + newrex |= (vex2 & 0x40 ? 0 : REX_X); + newrex |= (vex2 & 0x20 ? 0 : REX_B); + newrex |= (vex3 & 0x80 ? 
REX_W : 0); + switch (vex2 & 0x1f) { /* VEX.m-mmmm */ + case 1: + newpfx |= PREFIX_VEX_0F; + break; + case 2: + newpfx |= PREFIX_VEX_0F | PREFIX_VEX_0F38; + break; + case 3: + newpfx |= PREFIX_VEX_0F | PREFIX_VEX_0F3A; + break; + } + vex2 = vex3; + codep += 3; + } else { + /* Two byte VEX prefix. */ + newrex |= (vex2 & 0x80 ? 0 : REX_R); + codep += 2; + } + + vex_reg = (~vex2 >> 3) & 15; /* VEX.vvvv */ + switch (vex2 & 3) { /* VEX.pp */ + case 1: + newpfx |= PREFIX_DATA; /* 0x66 */ + break; + case 2: + newpfx |= PREFIX_REPZ; /* 0xf3 */ + break; + case 3: + newpfx |= PREFIX_REPNZ; /* 0xf2 */ + break; + } + + rex = newrex; + prefixes = newpfx; +} + /* Return the name of the prefix byte PREF, or NULL if PREF is not a prefix byte. */ @@ -3598,6 +3692,7 @@ print_insn (bfd_vma pc, disassemble_info *info) const char *p; struct dis_private priv; unsigned char op; + unsigned char threebyte; if (info->mach == bfd_mach_x86_64_intel_syntax || info->mach == bfd_mach_x86_64) @@ -3752,6 +3847,7 @@ print_insn (bfd_vma pc, disassemble_info *info) obufp = obuf; ckprefix (); + ckvexprefix (); insn_codep = codep; sizeflag = priv.orig_sizeflag; @@ -3775,18 +3871,29 @@ print_insn (bfd_vma pc, disassemble_info *info) } op = 0; + if (prefixes & PREFIX_VEX_0F) + { + used_prefixes |= PREFIX_VEX_0F | PREFIX_VEX_0F38 | PREFIX_VEX_0F3A; + if (prefixes & PREFIX_VEX_0F38) + threebyte = 0x38; + else if (prefixes & PREFIX_VEX_0F3A) + threebyte = 0x3a; + else + threebyte = *codep++; + goto vex_opcode; + } if (*codep == 0x0f) { - unsigned char threebyte; fetch_data(info, codep + 2); - threebyte = *++codep; + threebyte = codep[1]; + codep += 2; + vex_opcode: dp = &dis386_twobyte[threebyte]; - need_modrm = twobyte_has_modrm[*codep]; - uses_DATA_prefix = twobyte_uses_DATA_prefix[*codep]; - uses_REPNZ_prefix = twobyte_uses_REPNZ_prefix[*codep]; - uses_REPZ_prefix = twobyte_uses_REPZ_prefix[*codep]; - uses_LOCK_prefix = (*codep & ~0x02) == 0x20; - codep++; + need_modrm = twobyte_has_modrm[threebyte]; + uses_DATA_prefix = twobyte_uses_DATA_prefix[threebyte]; + uses_REPNZ_prefix = twobyte_uses_REPNZ_prefix[threebyte]; + uses_REPZ_prefix = twobyte_uses_REPZ_prefix[threebyte]; + uses_LOCK_prefix = (threebyte & ~0x02) == 0x20; if (dp->name == NULL && dp->op[0].bytemode == IS_3BYTE_OPCODE) { fetch_data(info, codep + 2); @@ -5291,6 +5398,17 @@ OP_G (int bytemode, int sizeflag) } } +static void +OP_vvvv (int bytemode, int sizeflags) +{ + USED_REX (REX_W); + if (rex & REX_W) { + oappend(names64[vex_reg]); + } else { + oappend(names32[vex_reg]); + } +} + static bfd_vma get64 (void) { From a1b29c9ae06abe7ded354eb70767e34dc035db72 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Mon, 27 Jan 2014 13:02:31 -0800 Subject: [PATCH 12/15] tcg/i386: Move TCG_CT_CONST_* to tcg-target.c These are not needed by users of tcg-target.h. No need to recompile when we adjust them. Reviewed-by: Paolo Bonzini Reviewed-by: Aurelien Jarno Signed-off-by: Richard Henderson --- tcg/i386/tcg-target.c | 4 ++++ tcg/i386/tcg-target.h | 3 --- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c index 5d4cf9386e..7008b0eb7f 100644 --- a/tcg/i386/tcg-target.c +++ b/tcg/i386/tcg-target.c @@ -88,6 +88,10 @@ static const int tcg_target_call_oarg_regs[] = { #endif }; +/* Constants we accept. */ +#define TCG_CT_CONST_S32 0x100 +#define TCG_CT_CONST_U32 0x200 + /* Registers used with L constraint, which are the first argument registers on x86_64, and two random call clobbered registers on i386. 
*/ diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h index 92c0fcd36d..747b797315 100644 --- a/tcg/i386/tcg-target.h +++ b/tcg/i386/tcg-target.h @@ -64,9 +64,6 @@ typedef enum { TCG_REG_RDI = TCG_REG_EDI, } TCGReg; -#define TCG_CT_CONST_S32 0x100 -#define TCG_CT_CONST_U32 0x200 - /* used for function call generation */ #define TCG_REG_CALL_STACK TCG_REG_ESP #define TCG_TARGET_STACK_ALIGN 16 From ecc7e84327c1c8e9b006edfaa5d0e3baf35a3f99 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Mon, 27 Jan 2014 21:19:40 -0800 Subject: [PATCH 13/15] tcg/i386: Add tcg_out_vex_modrm Prepare for emitting BMI insns which require VEX encoding. Reviewed-by: Paolo Bonzini Reviewed-by: Aurelien Jarno Signed-off-by: Richard Henderson --- tcg/i386/tcg-target.c | 41 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 38 insertions(+), 3 deletions(-) diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c index 7008b0eb7f..00dbc3b0c4 100644 --- a/tcg/i386/tcg-target.c +++ b/tcg/i386/tcg-target.c @@ -402,9 +402,9 @@ static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x) rex = 0; rex |= (opc & P_REXW) ? 0x8 : 0x0; /* REX.W */ - rex |= (r & 8) >> 1; /* REX.R */ - rex |= (x & 8) >> 2; /* REX.X */ - rex |= (rm & 8) >> 3; /* REX.B */ + rex |= (r & 8) >> 1; /* REX.R */ + rex |= (x & 8) >> 2; /* REX.X */ + rex |= (rm & 8) >> 3; /* REX.B */ /* P_REXB_{R,RM} indicates that the given register is the low byte. For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do, @@ -453,6 +453,41 @@ static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm) tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); } +static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm) +{ + int tmp; + + if ((opc & (P_REXW | P_EXT | P_EXT38)) || (rm & 8)) { + /* Three byte VEX prefix. */ + tcg_out8(s, 0xc4); + + /* VEX.m-mmmm */ + if (opc & P_EXT38) { + tmp = 2; + } else if (opc & P_EXT) { + tmp = 1; + } else { + tcg_abort(); + } + tmp |= 0x40; /* VEX.X */ + tmp |= (r & 8 ? 0 : 0x80); /* VEX.R */ + tmp |= (rm & 8 ? 0 : 0x20); /* VEX.B */ + tcg_out8(s, tmp); + + tmp = (opc & P_REXW ? 0x80 : 0); /* VEX.W */ + } else { + /* Two byte VEX prefix. */ + tcg_out8(s, 0xc5); + + tmp = (r & 8 ? 0 : 0x80); /* VEX.R */ + } + tmp |= (opc & P_DATA16 ? 1 : 0); /* VEX.pp */ + tmp |= (~v & 15) << 3; /* VEX.vvvv */ + tcg_out8(s, tmp); + tcg_out8(s, opc); + tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm)); +} + /* Output an opcode with a full "rm + (index< Date: Mon, 27 Jan 2014 21:49:17 -0800 Subject: [PATCH 14/15] tcg/i386: Use ANDN instruction Note that the optimizer cannot simplify ANDC X,Y,C to AND X,Y,~C so we must handle constants in the implementation of andc. Reviewed-by: Paolo Bonzini Signed-off-by: Richard Henderson --- tcg/i386/tcg-target.c | 54 +++++++++++++++++++++++++++++++++---------- tcg/i386/tcg-target.h | 6 +++-- 2 files changed, 46 insertions(+), 14 deletions(-) diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c index 00dbc3b0c4..4f6b9c123d 100644 --- a/tcg/i386/tcg-target.c +++ b/tcg/i386/tcg-target.c @@ -91,6 +91,7 @@ static const int tcg_target_call_oarg_regs[] = { /* Constants we accept. 
*/ #define TCG_CT_CONST_S32 0x100 #define TCG_CT_CONST_U32 0x200 +#define TCG_CT_CONST_I32 0x400 /* Registers used with L constraint, which are the first argument registers on x86_64, and two random call clobbered registers on @@ -128,6 +129,10 @@ static bool have_movbe; # define have_movbe 0 #endif +/* We need this symbol in tcg-target.h, and we can't properly conditionalize + it there. Therefore we always define the variable. */ +bool have_bmi1; + static uint8_t *tb_ret_addr; static void patch_reloc(uint8_t *code_ptr, int type, @@ -224,6 +229,9 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str) case 'Z': ct->ct |= TCG_CT_CONST_U32; break; + case 'I': + ct->ct |= TCG_CT_CONST_I32; + break; default: return -1; @@ -247,6 +255,9 @@ static inline int tcg_target_const_match(tcg_target_long val, if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) { return 1; } + if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) { + return 1; + } return 0; } @@ -276,6 +287,7 @@ static inline int tcg_target_const_match(tcg_target_long val, #define OPC_ARITH_EvIz (0x81) #define OPC_ARITH_EvIb (0x83) #define OPC_ARITH_GvEv (0x03) /* ... plus (ARITH_FOO << 3) */ +#define OPC_ANDN (0xf2 | P_EXT38) #define OPC_ADD_GvEv (OPC_ARITH_GvEv | (ARITH_ADD << 3)) #define OPC_BSWAP (0xc8 | P_EXT) #define OPC_CALL_Jz (0xe8) @@ -1813,6 +1825,16 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, } break; + OP_32_64(andc): + if (const_args[2]) { + tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, + args[0], args[1]); + tgen_arithi(s, ARITH_AND + rexw, args[0], ~args[2], 0); + } else { + tcg_out_vex_modrm(s, OPC_ANDN + rexw, args[0], args[2], args[1]); + } + break; + OP_32_64(mul): if (const_args[2]) { int32_t val; @@ -2041,6 +2063,7 @@ static const TCGTargetOpDef x86_op_defs[] = { { INDEX_op_and_i32, { "r", "0", "ri" } }, { INDEX_op_or_i32, { "r", "0", "ri" } }, { INDEX_op_xor_i32, { "r", "0", "ri" } }, + { INDEX_op_andc_i32, { "r", "r", "ri" } }, { INDEX_op_shl_i32, { "r", "0", "ci" } }, { INDEX_op_shr_i32, { "r", "0", "ci" } }, @@ -2098,6 +2121,7 @@ static const TCGTargetOpDef x86_op_defs[] = { { INDEX_op_and_i64, { "r", "0", "reZ" } }, { INDEX_op_or_i64, { "r", "0", "re" } }, { INDEX_op_xor_i64, { "r", "0", "re" } }, + { INDEX_op_andc_i64, { "r", "r", "rI" } }, { INDEX_op_shl_i64, { "r", "0", "ci" } }, { INDEX_op_shr_i64, { "r", "0", "ci" } }, @@ -2235,25 +2259,31 @@ static void tcg_target_qemu_prologue(TCGContext *s) static void tcg_target_init(TCGContext *s) { -#if !(defined(have_cmov) && defined(have_movbe)) - { - unsigned a, b, c, d; - int ret = __get_cpuid(1, &a, &b, &c, &d); + unsigned a, b, c, d; + int max = __get_cpuid_max(0, 0); -# ifndef have_cmov + if (max >= 1) { + __cpuid(1, a, b, c, d); +#ifndef have_cmov /* For 32-bit, 99% certainty that we're running on hardware that supports cmov, but we still need to check. In case cmov is not available, we'll use a small forward branch. */ - have_cmov = ret && (d & bit_CMOV); -# endif - -# ifndef have_movbe + have_cmov = (d & bit_CMOV) != 0; +#endif +#ifndef have_movbe /* MOVBE is only available on Intel Atom and Haswell CPUs, so we need to probe for it. */ - have_movbe = ret && (c & bit_MOVBE); -# endif - } + have_movbe = (c & bit_MOVBE) != 0; #endif + } + + if (max >= 7) { + /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs. 
*/ + __cpuid_count(7, 0, a, b, c, d); +#ifdef bit_BMI + have_bmi1 = (b & bit_BMI) != 0; +#endif + } if (TCG_TARGET_REG_BITS == 64) { tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xffff); diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h index 747b797315..bdf2222452 100644 --- a/tcg/i386/tcg-target.h +++ b/tcg/i386/tcg-target.h @@ -73,6 +73,8 @@ typedef enum { #define TCG_TARGET_CALL_STACK_OFFSET 0 #endif +extern bool have_bmi1; + /* optional instructions */ #define TCG_TARGET_HAS_div2_i32 1 #define TCG_TARGET_HAS_rot_i32 1 @@ -84,7 +86,7 @@ typedef enum { #define TCG_TARGET_HAS_bswap32_i32 1 #define TCG_TARGET_HAS_neg_i32 1 #define TCG_TARGET_HAS_not_i32 1 -#define TCG_TARGET_HAS_andc_i32 0 +#define TCG_TARGET_HAS_andc_i32 have_bmi1 #define TCG_TARGET_HAS_orc_i32 0 #define TCG_TARGET_HAS_eqv_i32 0 #define TCG_TARGET_HAS_nand_i32 0 @@ -112,7 +114,7 @@ typedef enum { #define TCG_TARGET_HAS_bswap64_i64 1 #define TCG_TARGET_HAS_neg_i64 1 #define TCG_TARGET_HAS_not_i64 1 -#define TCG_TARGET_HAS_andc_i64 0 +#define TCG_TARGET_HAS_andc_i64 have_bmi1 #define TCG_TARGET_HAS_orc_i64 0 #define TCG_TARGET_HAS_eqv_i64 0 #define TCG_TARGET_HAS_nand_i64 0 From 6399ab3325b7d4f77441c8a00fa9dae98bb0ac43 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Tue, 28 Jan 2014 11:39:49 -0800 Subject: [PATCH 15/15] tcg/i386: Use SHLX/SHRX/SARX instructions These three-operand shift instructions do not require the shift count to be placed into ECX. This reduces the number of mov insns required, with the mere addition of a new register constraint. Don't attempt to get rid of the matching constraint, as that's impossible to manipulate with just a new constraint. In addition, constant shifts still need the matching constraint. Reviewed-by: Paolo Bonzini Reviewed-by: Aurelien Jarno Signed-off-by: Richard Henderson --- tcg/i386/tcg-target.c | 61 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 50 insertions(+), 11 deletions(-) diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c index 4f6b9c123d..fef1717418 100644 --- a/tcg/i386/tcg-target.c +++ b/tcg/i386/tcg-target.c @@ -133,6 +133,12 @@ static bool have_movbe; it there. Therefore we always define the variable. */ bool have_bmi1; +#if defined(CONFIG_CPUID_H) && defined(bit_BMI2) +static bool have_bmi2; +#else +# define have_bmi2 0 +#endif + static uint8_t *tb_ret_addr; static void patch_reloc(uint8_t *code_ptr, int type, @@ -175,6 +181,7 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str) tcg_regset_set_reg(ct->u.regs, TCG_REG_EBX); break; case 'c': + case_c: ct->ct |= TCG_CT_REG; tcg_regset_set_reg(ct->u.regs, TCG_REG_ECX); break; @@ -203,6 +210,7 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str) tcg_regset_set32(ct->u.regs, 0, 0xf); break; case 'r': + case_r: ct->ct |= TCG_CT_REG; if (TCG_TARGET_REG_BITS == 64) { tcg_regset_set32(ct->u.regs, 0, 0xffff); @@ -210,6 +218,13 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str) tcg_regset_set32(ct->u.regs, 0, 0xff); } break; + case 'C': + /* With SHRX et al, we need not use ECX as shift count register. 
*/ + if (have_bmi2) { + goto case_r; + } else { + goto case_c; + } /* qemu_ld/st address constraint */ case 'L': @@ -283,6 +298,8 @@ static inline int tcg_target_const_match(tcg_target_long val, # define P_REXB_RM 0 # define P_GS 0 #endif +#define P_SIMDF3 0x10000 /* 0xf3 opcode prefix */ +#define P_SIMDF2 0x20000 /* 0xf2 opcode prefix */ #define OPC_ARITH_EvIz (0x81) #define OPC_ARITH_EvIb (0x83) @@ -325,6 +342,9 @@ static inline int tcg_target_const_match(tcg_target_long val, #define OPC_SHIFT_1 (0xd1) #define OPC_SHIFT_Ib (0xc1) #define OPC_SHIFT_cl (0xd3) +#define OPC_SARX (0xf7 | P_EXT38 | P_SIMDF3) +#define OPC_SHLX (0xf7 | P_EXT38 | P_DATA16) +#define OPC_SHRX (0xf7 | P_EXT38 | P_SIMDF2) #define OPC_TESTL (0x85) #define OPC_XCHG_ax_r32 (0x90) @@ -493,7 +513,14 @@ static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm) tmp = (r & 8 ? 0 : 0x80); /* VEX.R */ } - tmp |= (opc & P_DATA16 ? 1 : 0); /* VEX.pp */ + /* VEX.pp */ + if (opc & P_DATA16) { + tmp |= 1; /* 0x66 */ + } else if (opc & P_SIMDF3) { + tmp |= 2; /* 0xf3 */ + } else if (opc & P_SIMDF2) { + tmp |= 3; /* 0xf2 */ + } tmp |= (~v & 15) << 3; /* VEX.vvvv */ tcg_out8(s, tmp); tcg_out8(s, opc); @@ -1689,7 +1716,7 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64) static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, const TCGArg *args, const int *const_args) { - int c, rexw = 0; + int c, vexop, rexw = 0; #if TCG_TARGET_REG_BITS == 64 # define OP_32_64(x) \ @@ -1860,19 +1887,28 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, OP_32_64(shl): c = SHIFT_SHL; - goto gen_shift; + vexop = OPC_SHLX; + goto gen_shift_maybe_vex; OP_32_64(shr): c = SHIFT_SHR; - goto gen_shift; + vexop = OPC_SHRX; + goto gen_shift_maybe_vex; OP_32_64(sar): c = SHIFT_SAR; - goto gen_shift; + vexop = OPC_SARX; + goto gen_shift_maybe_vex; OP_32_64(rotl): c = SHIFT_ROL; goto gen_shift; OP_32_64(rotr): c = SHIFT_ROR; goto gen_shift; + gen_shift_maybe_vex: + if (have_bmi2 && !const_args[2]) { + tcg_out_vex_modrm(s, vexop + rexw, args[0], args[2], args[1]); + break; + } + /* FALLTHRU */ gen_shift: if (const_args[2]) { tcg_out_shifti(s, c + rexw, args[0], args[2]); @@ -2065,9 +2101,9 @@ static const TCGTargetOpDef x86_op_defs[] = { { INDEX_op_xor_i32, { "r", "0", "ri" } }, { INDEX_op_andc_i32, { "r", "r", "ri" } }, - { INDEX_op_shl_i32, { "r", "0", "ci" } }, - { INDEX_op_shr_i32, { "r", "0", "ci" } }, - { INDEX_op_sar_i32, { "r", "0", "ci" } }, + { INDEX_op_shl_i32, { "r", "0", "Ci" } }, + { INDEX_op_shr_i32, { "r", "0", "Ci" } }, + { INDEX_op_sar_i32, { "r", "0", "Ci" } }, { INDEX_op_rotl_i32, { "r", "0", "ci" } }, { INDEX_op_rotr_i32, { "r", "0", "ci" } }, @@ -2123,9 +2159,9 @@ static const TCGTargetOpDef x86_op_defs[] = { { INDEX_op_xor_i64, { "r", "0", "re" } }, { INDEX_op_andc_i64, { "r", "r", "rI" } }, - { INDEX_op_shl_i64, { "r", "0", "ci" } }, - { INDEX_op_shr_i64, { "r", "0", "ci" } }, - { INDEX_op_sar_i64, { "r", "0", "ci" } }, + { INDEX_op_shl_i64, { "r", "0", "Ci" } }, + { INDEX_op_shr_i64, { "r", "0", "Ci" } }, + { INDEX_op_sar_i64, { "r", "0", "Ci" } }, { INDEX_op_rotl_i64, { "r", "0", "ci" } }, { INDEX_op_rotr_i64, { "r", "0", "ci" } }, @@ -2282,6 +2318,9 @@ static void tcg_target_init(TCGContext *s) __cpuid_count(7, 0, a, b, c, d); #ifdef bit_BMI have_bmi1 = (b & bit_BMI) != 0; +#endif +#ifndef have_bmi2 + have_bmi2 = (b & bit_BMI2) != 0; #endif }
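A closing note on the instruction semantics relied upon here (illustrative, not part of the patch): SHLX/SHRX/SARX take the shift count from any general register, write a separate destination, and leave the flags untouched; the 32-bit forms mask the count to 5 bits. A plain-C model, with hypothetical function names:

    #include <stdint.h>

    /* C model of the BMI2 shift semantics. sarx32 assumes arithmetic
       right shift of signed values, as on x86. */
    static uint32_t shlx32(uint32_t src, uint32_t count)
    {
        return src << (count & 31);
    }

    static uint32_t shrx32(uint32_t src, uint32_t count)
    {
        return src >> (count & 31);
    }

    static int32_t sarx32(int32_t src, uint32_t count)
    {
        return src >> (count & 31);
    }

This is why the new 'C' constraint may admit any register once have_bmi2 is set: the shift count no longer has to be staged through ECX.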