From 9d2eec202fad72ce05ee8d54dc5a6fb6dcb87776 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Mon, 27 Jan 2014 21:49:17 -0800 Subject: [PATCH] tcg/i386: Use ANDN instruction Note that the optimizer cannot simplify ANDC X,Y,C to AND X,Y,~C so we must handle constants in the implementation of andc. Reviewed-by: Paolo Bonzini Signed-off-by: Richard Henderson --- tcg/i386/tcg-target.c | 54 +++++++++++++++++++++++++++++++++---------- tcg/i386/tcg-target.h | 6 +++-- 2 files changed, 46 insertions(+), 14 deletions(-) diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c index 00dbc3b0c4..4f6b9c123d 100644 --- a/tcg/i386/tcg-target.c +++ b/tcg/i386/tcg-target.c @@ -91,6 +91,7 @@ static const int tcg_target_call_oarg_regs[] = { /* Constants we accept. */ #define TCG_CT_CONST_S32 0x100 #define TCG_CT_CONST_U32 0x200 +#define TCG_CT_CONST_I32 0x400 /* Registers used with L constraint, which are the first argument registers on x86_64, and two random call clobbered registers on @@ -128,6 +129,10 @@ static bool have_movbe; # define have_movbe 0 #endif +/* We need this symbol in tcg-target.h, and we can't properly conditionalize + it there. Therefore we always define the variable. */ +bool have_bmi1; + static uint8_t *tb_ret_addr; static void patch_reloc(uint8_t *code_ptr, int type, @@ -224,6 +229,9 @@ static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str) case 'Z': ct->ct |= TCG_CT_CONST_U32; break; + case 'I': + ct->ct |= TCG_CT_CONST_I32; + break; default: return -1; @@ -247,6 +255,9 @@ static inline int tcg_target_const_match(tcg_target_long val, if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) { return 1; } + if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) { + return 1; + } return 0; } @@ -276,6 +287,7 @@ static inline int tcg_target_const_match(tcg_target_long val, #define OPC_ARITH_EvIz (0x81) #define OPC_ARITH_EvIb (0x83) #define OPC_ARITH_GvEv (0x03) /* ... plus (ARITH_FOO << 3) */ +#define OPC_ANDN (0xf2 | P_EXT38) #define OPC_ADD_GvEv (OPC_ARITH_GvEv | (ARITH_ADD << 3)) #define OPC_BSWAP (0xc8 | P_EXT) #define OPC_CALL_Jz (0xe8) @@ -1813,6 +1825,16 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc, } break; + OP_32_64(andc): + if (const_args[2]) { + tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, + args[0], args[1]); + tgen_arithi(s, ARITH_AND + rexw, args[0], ~args[2], 0); + } else { + tcg_out_vex_modrm(s, OPC_ANDN + rexw, args[0], args[2], args[1]); + } + break; + OP_32_64(mul): if (const_args[2]) { int32_t val; @@ -2041,6 +2063,7 @@ static const TCGTargetOpDef x86_op_defs[] = { { INDEX_op_and_i32, { "r", "0", "ri" } }, { INDEX_op_or_i32, { "r", "0", "ri" } }, { INDEX_op_xor_i32, { "r", "0", "ri" } }, + { INDEX_op_andc_i32, { "r", "r", "ri" } }, { INDEX_op_shl_i32, { "r", "0", "ci" } }, { INDEX_op_shr_i32, { "r", "0", "ci" } }, @@ -2098,6 +2121,7 @@ static const TCGTargetOpDef x86_op_defs[] = { { INDEX_op_and_i64, { "r", "0", "reZ" } }, { INDEX_op_or_i64, { "r", "0", "re" } }, { INDEX_op_xor_i64, { "r", "0", "re" } }, + { INDEX_op_andc_i64, { "r", "r", "rI" } }, { INDEX_op_shl_i64, { "r", "0", "ci" } }, { INDEX_op_shr_i64, { "r", "0", "ci" } }, @@ -2235,25 +2259,31 @@ static void tcg_target_qemu_prologue(TCGContext *s) static void tcg_target_init(TCGContext *s) { -#if !(defined(have_cmov) && defined(have_movbe)) - { - unsigned a, b, c, d; - int ret = __get_cpuid(1, &a, &b, &c, &d); + unsigned a, b, c, d; + int max = __get_cpuid_max(0, 0); -# ifndef have_cmov + if (max >= 1) { + __cpuid(1, a, b, c, d); +#ifndef have_cmov /* For 32-bit, 99% certainty that we're running on hardware that supports cmov, but we still need to check. In case cmov is not available, we'll use a small forward branch. */ - have_cmov = ret && (d & bit_CMOV); -# endif - -# ifndef have_movbe + have_cmov = (d & bit_CMOV) != 0; +#endif +#ifndef have_movbe /* MOVBE is only available on Intel Atom and Haswell CPUs, so we need to probe for it. */ - have_movbe = ret && (c & bit_MOVBE); -# endif - } + have_movbe = (c & bit_MOVBE) != 0; #endif + } + + if (max >= 7) { + /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs. */ + __cpuid_count(7, 0, a, b, c, d); +#ifdef bit_BMI + have_bmi1 = (b & bit_BMI) != 0; +#endif + } if (TCG_TARGET_REG_BITS == 64) { tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xffff); diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h index 747b797315..bdf2222452 100644 --- a/tcg/i386/tcg-target.h +++ b/tcg/i386/tcg-target.h @@ -73,6 +73,8 @@ typedef enum { #define TCG_TARGET_CALL_STACK_OFFSET 0 #endif +extern bool have_bmi1; + /* optional instructions */ #define TCG_TARGET_HAS_div2_i32 1 #define TCG_TARGET_HAS_rot_i32 1 @@ -84,7 +86,7 @@ typedef enum { #define TCG_TARGET_HAS_bswap32_i32 1 #define TCG_TARGET_HAS_neg_i32 1 #define TCG_TARGET_HAS_not_i32 1 -#define TCG_TARGET_HAS_andc_i32 0 +#define TCG_TARGET_HAS_andc_i32 have_bmi1 #define TCG_TARGET_HAS_orc_i32 0 #define TCG_TARGET_HAS_eqv_i32 0 #define TCG_TARGET_HAS_nand_i32 0 @@ -112,7 +114,7 @@ typedef enum { #define TCG_TARGET_HAS_bswap64_i64 1 #define TCG_TARGET_HAS_neg_i64 1 #define TCG_TARGET_HAS_not_i64 1 -#define TCG_TARGET_HAS_andc_i64 0 +#define TCG_TARGET_HAS_andc_i64 have_bmi1 #define TCG_TARGET_HAS_orc_i64 0 #define TCG_TARGET_HAS_eqv_i64 0 #define TCG_TARGET_HAS_nand_i64 0