From 70c482852aed861d728654c7bad9404eff76d9e3 Mon Sep 17 00:00:00 2001
From: Richard Henderson
Date: Wed, 12 May 2010 11:04:27 -0700
Subject: [PATCH] target-sparc: Inline some generation of carry for ADDX/SUBX.

Computing carry is trivial for some inputs.  By avoiding an external
function call, we generate near-optimal code for the common cases of
add+addx (double-word arithmetic) and cmp+addx (a setcc pattern).

Signed-off-by: Richard Henderson
Acked-by: Artyom Tarasenko
Signed-off-by: Blue Swirl
---
 target-sparc/helper.h    |    2 +-
 target-sparc/op_helper.c |    2 +-
 target-sparc/translate.c |  272 ++++++++++++++++++++++++++++-----------
 3 files changed, 200 insertions(+), 76 deletions(-)

diff --git a/target-sparc/helper.h b/target-sparc/helper.h
index 04c1306d69..6f103e7697 100644
--- a/target-sparc/helper.h
+++ b/target-sparc/helper.h
@@ -158,6 +158,6 @@ VIS_CMPHELPER(cmpne);
 #undef VIS_HELPER
 #undef VIS_CMPHELPER
 DEF_HELPER_0(compute_psr, void);
-DEF_HELPER_0(compute_C_icc, tl);
+DEF_HELPER_0(compute_C_icc, i32);
 
 #include "def-helper.h"
diff --git a/target-sparc/op_helper.c b/target-sparc/op_helper.c
index 6deebd35d2..d0bc27766e 100644
--- a/target-sparc/op_helper.c
+++ b/target-sparc/op_helper.c
@@ -1347,7 +1347,7 @@ void helper_compute_psr(void)
     CC_OP = CC_OP_FLAGS;
 }
 
-target_ulong helper_compute_C_icc(void)
+uint32_t helper_compute_C_icc(void)
 {
     uint32_t ret;
 
diff --git a/target-sparc/translate.c b/target-sparc/translate.c
index ea7c71b85a..8129b79d16 100644
--- a/target-sparc/translate.c
+++ b/target-sparc/translate.c
@@ -332,24 +332,132 @@ static inline void gen_op_add_cc(TCGv dst, TCGv src1, TCGv src2)
     tcg_gen_mov_tl(dst, cpu_cc_dst);
 }
 
-static inline void gen_op_addxi_cc(TCGv dst, TCGv src1, target_long src2)
+static TCGv_i32 gen_add32_carry32(void)
 {
-    gen_helper_compute_C_icc(cpu_tmp0);
-    tcg_gen_mov_tl(cpu_cc_src, src1);
-    tcg_gen_movi_tl(cpu_cc_src2, src2);
-    tcg_gen_add_tl(cpu_cc_dst, cpu_cc_src, cpu_tmp0);
-    tcg_gen_addi_tl(cpu_cc_dst, cpu_cc_dst, src2);
-    tcg_gen_mov_tl(dst, cpu_cc_dst);
+    TCGv_i32 carry_32, cc_src1_32, cc_src2_32;
+
+    /* Carry is computed from a previous add: (dst < src)  */
+#if TARGET_LONG_BITS == 64
+    cc_src1_32 = tcg_temp_new_i32();
+    cc_src2_32 = tcg_temp_new_i32();
+    tcg_gen_trunc_i64_i32(cc_src1_32, cpu_cc_dst);
+    tcg_gen_trunc_i64_i32(cc_src2_32, cpu_cc_src);
+#else
+    cc_src1_32 = cpu_cc_dst;
+    cc_src2_32 = cpu_cc_src;
+#endif
+
+    carry_32 = tcg_temp_new_i32();
+    tcg_gen_setcond_i32(TCG_COND_LTU, carry_32, cc_src1_32, cc_src2_32);
+
+#if TARGET_LONG_BITS == 64
+    tcg_temp_free_i32(cc_src1_32);
+    tcg_temp_free_i32(cc_src2_32);
+#endif
+
+    return carry_32;
 }
 
-static inline void gen_op_addx_cc(TCGv dst, TCGv src1, TCGv src2)
+static TCGv_i32 gen_sub32_carry32(void)
 {
-    gen_helper_compute_C_icc(cpu_tmp0);
-    tcg_gen_mov_tl(cpu_cc_src, src1);
-    tcg_gen_mov_tl(cpu_cc_src2, src2);
-    tcg_gen_add_tl(cpu_cc_dst, cpu_cc_src, cpu_tmp0);
-    tcg_gen_add_tl(cpu_cc_dst, cpu_cc_dst, cpu_cc_src2);
-    tcg_gen_mov_tl(dst, cpu_cc_dst);
+    TCGv_i32 carry_32, cc_src1_32, cc_src2_32;
+
+    /* Carry is computed from a previous borrow: (src1 < src2)  */
+#if TARGET_LONG_BITS == 64
+    cc_src1_32 = tcg_temp_new_i32();
+    cc_src2_32 = tcg_temp_new_i32();
+    tcg_gen_trunc_i64_i32(cc_src1_32, cpu_cc_src);
+    tcg_gen_trunc_i64_i32(cc_src2_32, cpu_cc_src2);
+#else
+    cc_src1_32 = cpu_cc_src;
+    cc_src2_32 = cpu_cc_src2;
+#endif
+
+    carry_32 = tcg_temp_new_i32();
+    tcg_gen_setcond_i32(TCG_COND_LTU, carry_32, cc_src1_32, cc_src2_32);
+
+#if TARGET_LONG_BITS == 64
+    tcg_temp_free_i32(cc_src1_32);
+    tcg_temp_free_i32(cc_src2_32);
+#endif
+
+    return carry_32;
+}
+
+static void gen_op_addx_int(DisasContext *dc, TCGv dst, TCGv src1,
+                            TCGv src2, int update_cc)
+{
+    TCGv_i32 carry_32;
+    TCGv carry;
+
+    switch (dc->cc_op) {
+    case CC_OP_DIV:
+    case CC_OP_LOGIC:
+        /* Carry is known to be zero.  Fall back to plain ADD.  */
+        if (update_cc) {
+            gen_op_add_cc(dst, src1, src2);
+        } else {
+            tcg_gen_add_tl(dst, src1, src2);
+        }
+        return;
+
+    case CC_OP_ADD:
+    case CC_OP_TADD:
+    case CC_OP_TADDTV:
+#if TCG_TARGET_REG_BITS == 32 && TARGET_LONG_BITS == 32
+        {
+            /* For 32-bit hosts, we can re-use the host's hardware carry
+               generation by using an ADD2 opcode.  We discard the low
+               part of the output.  Ideally we'd combine this operation
+               with the add that generated the carry in the first place.  */
+            TCGv dst_low = tcg_temp_new();
+            tcg_gen_op6_i32(INDEX_op_add2_i32, dst_low, dst,
+                            cpu_cc_src, src1, cpu_cc_src2, src2);
+            tcg_temp_free(dst_low);
+            goto add_done;
+        }
+#endif
+        carry_32 = gen_add32_carry32();
+        break;
+
+    case CC_OP_SUB:
+    case CC_OP_TSUB:
+    case CC_OP_TSUBTV:
+        carry_32 = gen_sub32_carry32();
+        break;
+
+    default:
+        /* We need external help to produce the carry.  */
+        carry_32 = tcg_temp_new_i32();
+        gen_helper_compute_C_icc(carry_32);
+        break;
+    }
+
+#if TARGET_LONG_BITS == 64
+    carry = tcg_temp_new();
+    tcg_gen_extu_i32_i64(carry, carry_32);
+#else
+    carry = carry_32;
+#endif
+
+    tcg_gen_add_tl(dst, src1, src2);
+    tcg_gen_add_tl(dst, dst, carry);
+
+    tcg_temp_free_i32(carry_32);
+#if TARGET_LONG_BITS == 64
+    tcg_temp_free(carry);
+#endif
+
+#if TCG_TARGET_REG_BITS == 32 && TARGET_LONG_BITS == 32
+ add_done:
+#endif
+    if (update_cc) {
+        tcg_gen_mov_tl(cpu_cc_src, src1);
+        tcg_gen_mov_tl(cpu_cc_src2, src2);
+        tcg_gen_mov_tl(cpu_cc_dst, dst);
+        tcg_gen_movi_i32(cpu_cc_op, CC_OP_ADDX);
+        dc->cc_op = CC_OP_ADDX;
+    }
 }
 
 static inline void gen_op_tadd_cc(TCGv dst, TCGv src1, TCGv src2)
@@ -415,24 +523,80 @@ static inline void gen_op_sub_cc(TCGv dst, TCGv src1, TCGv src2)
     tcg_gen_mov_tl(dst, cpu_cc_dst);
 }
 
-static inline void gen_op_subxi_cc(TCGv dst, TCGv src1, target_long src2)
+static void gen_op_subx_int(DisasContext *dc, TCGv dst, TCGv src1,
+                            TCGv src2, int update_cc)
 {
-    gen_helper_compute_C_icc(cpu_tmp0);
-    tcg_gen_mov_tl(cpu_cc_src, src1);
-    tcg_gen_movi_tl(cpu_cc_src2, src2);
-    tcg_gen_sub_tl(cpu_cc_dst, cpu_cc_src, cpu_tmp0);
-    tcg_gen_subi_tl(cpu_cc_dst, cpu_cc_dst, src2);
-    tcg_gen_mov_tl(dst, cpu_cc_dst);
-}
+    TCGv_i32 carry_32;
+    TCGv carry;
 
-static inline void gen_op_subx_cc(TCGv dst, TCGv src1, TCGv src2)
-{
-    gen_helper_compute_C_icc(cpu_tmp0);
-    tcg_gen_mov_tl(cpu_cc_src, src1);
-    tcg_gen_mov_tl(cpu_cc_src2, src2);
-    tcg_gen_sub_tl(cpu_cc_dst, cpu_cc_src, cpu_tmp0);
-    tcg_gen_sub_tl(cpu_cc_dst, cpu_cc_dst, cpu_cc_src2);
-    tcg_gen_mov_tl(dst, cpu_cc_dst);
+    switch (dc->cc_op) {
+    case CC_OP_DIV:
+    case CC_OP_LOGIC:
+        /* Carry is known to be zero.  Fall back to plain SUB.  */
+        if (update_cc) {
+            gen_op_sub_cc(dst, src1, src2);
+        } else {
+            tcg_gen_sub_tl(dst, src1, src2);
+        }
+        return;
+
+    case CC_OP_ADD:
+    case CC_OP_TADD:
+    case CC_OP_TADDTV:
+        carry_32 = gen_add32_carry32();
+        break;
+
+    case CC_OP_SUB:
+    case CC_OP_TSUB:
+    case CC_OP_TSUBTV:
+#if TCG_TARGET_REG_BITS == 32 && TARGET_LONG_BITS == 32
+        {
+            /* For 32-bit hosts, we can re-use the host's hardware carry
+               generation by using a SUB2 opcode.  We discard the low
+               part of the output.  Ideally we'd combine this operation
+               with the add that generated the carry in the first place.  */
+            TCGv dst_low = tcg_temp_new();
+            tcg_gen_op6_i32(INDEX_op_sub2_i32, dst_low, dst,
+                            cpu_cc_src, src1, cpu_cc_src2, src2);
+            tcg_temp_free(dst_low);
+            goto sub_done;
+        }
+#endif
+        carry_32 = gen_sub32_carry32();
+        break;
+
+    default:
+        /* We need external help to produce the carry.  */
+        carry_32 = tcg_temp_new_i32();
+        gen_helper_compute_C_icc(carry_32);
+        break;
+    }
+
+#if TARGET_LONG_BITS == 64
+    carry = tcg_temp_new();
+    tcg_gen_extu_i32_i64(carry, carry_32);
+#else
+    carry = carry_32;
+#endif
+
+    tcg_gen_sub_tl(dst, src1, src2);
+    tcg_gen_sub_tl(dst, dst, carry);
+
+    tcg_temp_free_i32(carry_32);
+#if TARGET_LONG_BITS == 64
+    tcg_temp_free(carry);
+#endif
+
+#if TCG_TARGET_REG_BITS == 32 && TARGET_LONG_BITS == 32
+ sub_done:
+#endif
+    if (update_cc) {
+        tcg_gen_mov_tl(cpu_cc_src, src1);
+        tcg_gen_mov_tl(cpu_cc_src2, src2);
+        tcg_gen_mov_tl(cpu_cc_dst, dst);
+        tcg_gen_movi_i32(cpu_cc_op, CC_OP_SUBX);
+        dc->cc_op = CC_OP_SUBX;
+    }
 }
 
 static inline void gen_op_tsub_cc(TCGv dst, TCGv src1, TCGv src2)
@@ -2950,28 +3114,8 @@ static void disas_sparc_insn(DisasContext * dc)
             }
             break;
         case 0x8: /* addx, V9 addc */
-            if (IS_IMM) {
-                simm = GET_FIELDs(insn, 19, 31);
-                if (xop & 0x10) {
-                    gen_op_addxi_cc(cpu_dst, cpu_src1, simm);
-                    tcg_gen_movi_i32(cpu_cc_op, CC_OP_ADDX);
-                    dc->cc_op = CC_OP_ADDX;
-                } else {
-                    gen_helper_compute_C_icc(cpu_tmp0);
-                    tcg_gen_addi_tl(cpu_tmp0, cpu_tmp0, simm);
-                    tcg_gen_add_tl(cpu_dst, cpu_src1, cpu_tmp0);
-                }
-            } else {
-                if (xop & 0x10) {
-                    gen_op_addx_cc(cpu_dst, cpu_src1, cpu_src2);
-                    tcg_gen_movi_i32(cpu_cc_op, CC_OP_ADDX);
-                    dc->cc_op = CC_OP_ADDX;
-                } else {
-                    gen_helper_compute_C_icc(cpu_tmp0);
-                    tcg_gen_add_tl(cpu_tmp0, cpu_src2, cpu_tmp0);
-                    tcg_gen_add_tl(cpu_dst, cpu_src1, cpu_tmp0);
-                }
-            }
+            gen_op_addx_int(dc, cpu_dst, cpu_src1, cpu_src2,
+                            (xop & 0x10));
             break;
 #ifdef TARGET_SPARC64
         case 0x9: /* V9 mulx */
@@ -3002,28 +3146,8 @@ static void disas_sparc_insn(DisasContext * dc)
             }
             break;
         case 0xc: /* subx, V9 subc */
-            if (IS_IMM) {
-                simm = GET_FIELDs(insn, 19, 31);
-                if (xop & 0x10) {
-                    gen_op_subxi_cc(cpu_dst, cpu_src1, simm);
-                    tcg_gen_movi_i32(cpu_cc_op, CC_OP_SUBX);
-                    dc->cc_op = CC_OP_SUBX;
-                } else {
-                    gen_helper_compute_C_icc(cpu_tmp0);
-                    tcg_gen_addi_tl(cpu_tmp0, cpu_tmp0, simm);
-                    tcg_gen_sub_tl(cpu_dst, cpu_src1, cpu_tmp0);
-                }
-            } else {
-                if (xop & 0x10) {
-                    gen_op_subx_cc(cpu_dst, cpu_src1, cpu_src2);
-                    tcg_gen_movi_i32(cpu_cc_op, CC_OP_SUBX);
-                    dc->cc_op = CC_OP_SUBX;
-                } else {
-                    gen_helper_compute_C_icc(cpu_tmp0);
-                    tcg_gen_add_tl(cpu_tmp0, cpu_src2, cpu_tmp0);
-                    tcg_gen_sub_tl(cpu_dst, cpu_src1, cpu_tmp0);
-                }
-            }
+            gen_op_subx_int(dc, cpu_dst, cpu_src1, cpu_src2,
+                            (xop & 0x10));
             break;
 #ifdef TARGET_SPARC64
         case 0xd: /* V9 udivx */
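
Editor's note (not part of the patch): the inlined carry recovery above rests on
two standard unsigned-overflow identities, the ones named in the comments of
gen_add32_carry32() and gen_sub32_carry32(): after an add, carry = (dst < src);
after a subtract, borrow = (src1 < src2).  Below is a minimal host-side C
sketch illustrating these identities and the add+addx double-word pattern the
commit message mentions.  The function names here are illustrative only and do
not appear in QEMU.

    #include <assert.h>
    #include <stdint.h>

    /* Carry out of an unsigned 32-bit add: the result wrapped iff
       dst < src.  This is the relation gen_add32_carry32() evaluates
       with TCG_COND_LTU on (cpu_cc_dst, cpu_cc_src).  */
    static uint32_t add_carry(uint32_t src1, uint32_t src2)
    {
        uint32_t dst = src1 + src2;
        return dst < src1;
    }

    /* Borrow out of an unsigned 32-bit subtract: src1 < src2.
       This is the relation gen_sub32_carry32() evaluates.  */
    static uint32_t sub_borrow(uint32_t src1, uint32_t src2)
    {
        return src1 < src2;
    }

    /* The add+addx double-word pattern from the commit message:
       add the low halves (producing carry), then add the high halves
       plus that carry, which is exactly what ADDX consumes.  */
    static uint64_t dword_add(uint64_t a, uint64_t b)
    {
        uint32_t lo = (uint32_t)a + (uint32_t)b;                  /* add  */
        uint32_t hi = (uint32_t)(a >> 32) + (uint32_t)(b >> 32)
                      + add_carry((uint32_t)a, (uint32_t)b);      /* addx */
        return ((uint64_t)hi << 32) | lo;
    }

    int main(void)
    {
        assert(add_carry(0xffffffffu, 1) == 1);   /* wraps to 0: carry set */
        assert(add_carry(1, 2) == 0);             /* no wrap: carry clear */
        assert(sub_borrow(0, 1) == 1);            /* 0 - 1 borrows */
        assert(sub_borrow(2, 1) == 0);
        assert(dword_add(0x1ffffffffull, 1) == 0x200000000ull);
        return 0;
    }

Because both identities are single unsigned comparisons, the translator can
emit one setcond instead of calling helper_compute_C_icc(), which is why the
common add+addx and cmp+addx sequences become near-optimal after this change.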