From 80fd744fdae18292b5cb67cebceea8b750656c40 Mon Sep 17 00:00:00 2001
From: Richard Henderson
Date: Tue, 6 Mar 2007 07:59:38 -0800
Subject: [PATCH] i386.c (x86_use_leave, [...]): Merge into ...

	* config/i386/i386.c (x86_use_leave, x86_push_memory,
	x86_zero_extend_with_and, x86_movx, x86_double_with_add,
	x86_use_bit_test, x86_unroll_strlen, x86_deep_branch,
	x86_branch_hints, x86_use_sahf, x86_partial_reg_stall,
	x86_partial_flag_reg_stall, x86_use_himode_fiop, x86_use_simode_fiop,
	x86_use_mov0, x86_use_cltd, x86_read_modify_write, x86_read_modify,
	x86_split_long_moves, x86_promote_QImode, x86_fast_prefix,
	x86_single_stringop, x86_qimode_math, x86_promote_qi_regs,
	x86_himode_math, x86_promote_hi_regs, x86_sub_esp_4, x86_sub_esp_8,
	x86_add_esp_4, x86_add_esp_8, x86_integer_DFmode_moves,
	x86_partial_reg_dependency, x86_memory_mismatch_stall,
	x86_prologue_using_move, x86_epilogue_using_move, x86_shift1,
	x86_sse_partial_reg_dependency, x86_sse_split_regs,
	x86_sse_unaligned_move_optimal, x86_sse_typeless_stores,
	x86_sse_load0_by_pxor, x86_use_ffreep, x86_use_incdec,
	x86_inter_unit_moves, x86_ext_80387_constants, x86_four_jump_limit,
	x86_schedule, x86_use_bt, x86_pad_returns): Merge into ...
	(ix86_tune_features): ... here.  New array.
	(x86_cmove, x86_use_xchgb, x86_cmpxchg, x86_cmpxchg8b,
	x86_xadd, x86_bswap): Merge into ...
	(ix86_arch_features): ... here.  New array.
	(x86_3dnow_a): Remove.
	(x86_accumulate_outgoing_args): Make static.
	(x86_arch_always_fancy_math_387): Make static.
	(ix86_tune_mask, ix86_arch_mask): Move ...
	(override_options): ... to local variables here.  Apply the
	appropriate mask to each element of ix86_arch_features and
	ix86_tune_features.  Adjust TARGET_CMOVE and TARGET_USE_SAHF
	as was done in the old macros.
	(standard_80387_constant_p): Use TARGET_EXT_80387_CONSTANTS.
	* config/i386/i386.h (x86_use_leave, x86_push_memory,
	x86_zero_extend_with_and, x86_use_bit_test, x86_cmove, x86_deep_branch,
	x86_branch_hints, x86_unroll_strlen, x86_double_with_add,
	x86_partial_reg_stall, x86_movx, x86_use_himode_fiop,
	x86_use_simode_fiop, x86_use_mov0, x86_use_cltd, x86_use_xchgb,
	x86_read_modify_write, x86_read_modify, x86_split_long_moves,
	x86_promote_QImode, x86_single_stringop, x86_fast_prefix,
	x86_himode_math, x86_qimode_math, x86_promote_qi_regs,
	x86_promote_hi_regs, x86_integer_DFmode_moves, x86_add_esp_4,
	x86_add_esp_8, x86_sub_esp_4, x86_sub_esp_8,
	x86_partial_reg_dependency, x86_memory_mismatch_stall,
	x86_accumulate_outgoing_args, x86_prologue_using_move,
	x86_epilogue_using_move, x86_decompose_lea,
	x86_arch_always_fancy_math_387, x86_shift1,
	x86_sse_partial_reg_dependency, x86_sse_split_regs,
	x86_sse_unaligned_move_optimal, x86_sse_typeless_stores,
	x86_sse_load0_by_pxor, x86_use_ffreep, x86_inter_unit_moves,
	x86_schedule, x86_use_bt, x86_cmpxchg, x86_cmpxchg8b, x86_xadd,
	x86_use_incdec, x86_pad_returns, x86_bswap,
	x86_partial_flag_reg_stall): Remove.
	(enum ix86_tune_indices): New.
	(ix86_tune_features): New.
	(TARGET_USE_LEAVE, TARGET_PUSH_MEMORY, TARGET_ZERO_EXTEND_WITH_AND,
	TARGET_USE_BIT_TEST, TARGET_UNROLL_STRLEN,
	TARGET_DEEP_BRANCH_PREDICTION, TARGET_BRANCH_PREDICTION_HINTS,
	TARGET_DOUBLE_WITH_ADD, TARGET_USE_SAHF, TARGET_MOVX,
	TARGET_PARTIAL_REG_STALL, TARGET_PARTIAL_FLAG_REG_STALL,
	TARGET_USE_HIMODE_FIOP, TARGET_USE_SIMODE_FIOP, TARGET_USE_MOV0,
	TARGET_USE_CLTD, TARGET_USE_XCHGB, TARGET_SPLIT_LONG_MOVES,
	TARGET_READ_MODIFY_WRITE, TARGET_READ_MODIFY, TARGET_PROMOTE_QImode,
	TARGET_FAST_PREFIX, TARGET_SINGLE_STRINGOP, TARGET_QIMODE_MATH,
	TARGET_HIMODE_MATH, TARGET_PROMOTE_QI_REGS, TARGET_PROMOTE_HI_REGS,
	TARGET_ADD_ESP_4, TARGET_ADD_ESP_8, TARGET_SUB_ESP_4,
	TARGET_SUB_ESP_8, TARGET_INTEGER_DFMODE_MOVES,
	TARGET_PARTIAL_REG_DEPENDENCY, TARGET_SSE_PARTIAL_REG_DEPENDENCY,
	TARGET_SSE_UNALIGNED_MOVE_OPTIMAL, TARGET_SSE_SPLIT_REGS,
	TARGET_SSE_TYPELESS_STORES, TARGET_SSE_LOAD0_BY_PXOR,
	TARGET_MEMORY_MISMATCH_STALL, TARGET_PROLOGUE_USING_MOVE,
	TARGET_EPILOGUE_USING_MOVE, TARGET_SHIFT1, TARGET_USE_FFREEP,
	TARGET_INTER_UNIT_MOVES, TARGET_FOUR_JUMP_LIMIT, TARGET_SCHEDULE,
	TARGET_USE_BT, TARGET_USE_INCDEC, TARGET_PAD_RETURNS,
	TARGET_EXT_80387_CONSTANTS): Use it.
	(enum ix86_arch_indices): New.
	(ix86_arch_features): New.
	(TARGET_CMOVE, TARGET_CMPXCHG, TARGET_CMPXCHG8B, TARGET_XADD,
	TARGET_BSWAP): Use it.
	(ix86_tune_mask, ix86_arch_mask): Remove.

From-SVN: r122621
---
 gcc/ChangeLog          |  80 ++++++++
 gcc/config/i386/i386.c | 457 +++++++++++++++++++++++++----------------
 gcc/config/i386/i386.h | 256 ++++++++++++++---------
 3 files changed, 510 insertions(+), 283 deletions(-)

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 4430d04c3c3..bc27cf5a55a 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,83 @@
+2007-03-06  Richard Henderson
+
+	* config/i386/i386.c (x86_use_leave, x86_push_memory,
+	x86_zero_extend_with_and, x86_movx, x86_double_with_add,
+	x86_use_bit_test, x86_unroll_strlen, x86_deep_branch,
+	x86_branch_hints, x86_use_sahf, x86_partial_reg_stall,
+	x86_partial_flag_reg_stall, x86_use_himode_fiop, x86_use_simode_fiop,
+	x86_use_mov0, x86_use_cltd, x86_read_modify_write, x86_read_modify,
+	x86_split_long_moves, x86_promote_QImode, x86_fast_prefix,
+	x86_single_stringop, x86_qimode_math, x86_promote_qi_regs,
+	x86_himode_math, x86_promote_hi_regs, x86_sub_esp_4, x86_sub_esp_8,
+	x86_add_esp_4, x86_add_esp_8, x86_integer_DFmode_moves,
+	x86_partial_reg_dependency, x86_memory_mismatch_stall,
+	x86_prologue_using_move, x86_epilogue_using_move, x86_shift1,
+	x86_sse_partial_reg_dependency, x86_sse_split_regs,
+	x86_sse_unaligned_move_optimal, x86_sse_typeless_stores,
+	x86_sse_load0_by_pxor, x86_use_ffreep, x86_use_incdec,
+	x86_inter_unit_moves, x86_ext_80387_constants, x86_four_jump_limit,
+	x86_schedule, x86_use_bt, x86_pad_returns): Merge into ...
+	(ix86_tune_features): ... here.  New array.
+	(x86_cmove, x86_use_xchgb, x86_cmpxchg, x86_cmpxchg8b,
+	x86_xadd, x86_bswap): Merge into ...
+	(ix86_arch_features): ... here.  New array.
+	(x86_3dnow_a): Remove.
+	(x86_accumulate_outgoing_args): Make static.
+	(x86_arch_always_fancy_math_387): Make static.
+	(ix86_tune_mask, ix86_arch_mask): Move ...
+	(override_options): ... to local variables here.  Apply the
+	appropriate mask to each element of ix86_arch_features and
+	ix86_tune_features.  Adjust TARGET_CMOVE and TARGET_USE_SAHF
+	as was done in the old macros.
+	(standard_80387_constant_p): Use TARGET_EXT_80387_CONSTANTS.
+	* config/i386/i386.h (x86_use_leave, x86_push_memory,
+	x86_zero_extend_with_and, x86_use_bit_test, x86_cmove, x86_deep_branch,
+	x86_branch_hints, x86_unroll_strlen, x86_double_with_add,
+	x86_partial_reg_stall, x86_movx, x86_use_himode_fiop,
+	x86_use_simode_fiop, x86_use_mov0, x86_use_cltd, x86_use_xchgb,
+	x86_read_modify_write, x86_read_modify, x86_split_long_moves,
+	x86_promote_QImode, x86_single_stringop, x86_fast_prefix,
+	x86_himode_math, x86_qimode_math, x86_promote_qi_regs,
+	x86_promote_hi_regs, x86_integer_DFmode_moves, x86_add_esp_4,
+	x86_add_esp_8, x86_sub_esp_4, x86_sub_esp_8,
+	x86_partial_reg_dependency, x86_memory_mismatch_stall,
+	x86_accumulate_outgoing_args, x86_prologue_using_move,
+	x86_epilogue_using_move, x86_decompose_lea,
+	x86_arch_always_fancy_math_387, x86_shift1,
+	x86_sse_partial_reg_dependency, x86_sse_split_regs,
+	x86_sse_unaligned_move_optimal, x86_sse_typeless_stores,
+	x86_sse_load0_by_pxor, x86_use_ffreep, x86_inter_unit_moves,
+	x86_schedule, x86_use_bt, x86_cmpxchg, x86_cmpxchg8b, x86_xadd,
+	x86_use_incdec, x86_pad_returns, x86_bswap,
+	x86_partial_flag_reg_stall): Remove.
+	(enum ix86_tune_indices): New.
+	(ix86_tune_features): New.
+	(TARGET_USE_LEAVE, TARGET_PUSH_MEMORY, TARGET_ZERO_EXTEND_WITH_AND,
+	TARGET_USE_BIT_TEST, TARGET_UNROLL_STRLEN,
+	TARGET_DEEP_BRANCH_PREDICTION, TARGET_BRANCH_PREDICTION_HINTS,
+	TARGET_DOUBLE_WITH_ADD, TARGET_USE_SAHF, TARGET_MOVX,
+	TARGET_PARTIAL_REG_STALL, TARGET_PARTIAL_FLAG_REG_STALL,
+	TARGET_USE_HIMODE_FIOP, TARGET_USE_SIMODE_FIOP, TARGET_USE_MOV0,
+	TARGET_USE_CLTD, TARGET_USE_XCHGB, TARGET_SPLIT_LONG_MOVES,
+	TARGET_READ_MODIFY_WRITE, TARGET_READ_MODIFY, TARGET_PROMOTE_QImode,
+	TARGET_FAST_PREFIX, TARGET_SINGLE_STRINGOP, TARGET_QIMODE_MATH,
+	TARGET_HIMODE_MATH, TARGET_PROMOTE_QI_REGS, TARGET_PROMOTE_HI_REGS,
+	TARGET_ADD_ESP_4, TARGET_ADD_ESP_8, TARGET_SUB_ESP_4,
+	TARGET_SUB_ESP_8, TARGET_INTEGER_DFMODE_MOVES,
+	TARGET_PARTIAL_REG_DEPENDENCY, TARGET_SSE_PARTIAL_REG_DEPENDENCY,
+	TARGET_SSE_UNALIGNED_MOVE_OPTIMAL, TARGET_SSE_SPLIT_REGS,
+	TARGET_SSE_TYPELESS_STORES, TARGET_SSE_LOAD0_BY_PXOR,
+	TARGET_MEMORY_MISMATCH_STALL, TARGET_PROLOGUE_USING_MOVE,
+	TARGET_EPILOGUE_USING_MOVE, TARGET_SHIFT1, TARGET_USE_FFREEP,
+	TARGET_INTER_UNIT_MOVES, TARGET_FOUR_JUMP_LIMIT, TARGET_SCHEDULE,
+	TARGET_USE_BT, TARGET_USE_INCDEC, TARGET_PAD_RETURNS,
+	TARGET_EXT_80387_CONSTANTS): Use it.
+	(enum ix86_arch_indices): New.
+	(ix86_arch_features): New.
+	(TARGET_CMOVE, TARGET_CMPXCHG, TARGET_CMPXCHG8B, TARGET_XADD,
+	TARGET_BSWAP): Use it.
+	(ix86_tune_mask, ix86_arch_mask): Remove.
+
 2007-03-06  Joseph Myers
 
 	PR bootstrap/31020
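The heart of the patch is a mechanical refactor of how per-CPU feature bits
are stored and tested.  Before, every feature was its own global bitmask and
every TARGET_* macro redid an `&' against ix86_tune_mask at each use site;
after, all tune features sit in one array indexed by an enum, and
override_options reduces each element to a boolean once.  The following
standalone sketch shows the new pattern with the CPU and feature sets cut
down to two each (all identifiers here are illustrative stand-ins, not the
real m_*/X86_TUNE_* set):

    #include <stdio.h>

    /* One bit per CPU, as with the m_* macros.  */
    enum cpu { CPU_A, CPU_B, CPU_LAST };
    #define m_A (1u << CPU_A)
    #define m_B (1u << CPU_B)

    /* One slot per feature, as with ix86_tune_features.  */
    enum tune { TUNE_X, TUNE_Y, TUNE_LAST };
    static unsigned int tune_features[TUNE_LAST] = {
      /* TUNE_X */ m_A | m_B,
      /* TUNE_Y */ m_B,
    };
    #define HAVE_X tune_features[TUNE_X]
    #define HAVE_Y tune_features[TUNE_Y]

    int
    main (void)
    {
      unsigned int tune_mask = 1u << CPU_A;  /* the -mtune selection */
      for (int i = 0; i < TUNE_LAST; ++i)
        tune_features[i] &= tune_mask;       /* each slot is now 0 or the CPU bit */
      printf ("X=%d Y=%d\n", HAVE_X != 0, HAVE_Y != 0);  /* prints X=1 Y=0 */
      return 0;
    }

After the masking loop each macro expands to a plain array load instead of an
`&' expression, and all the per-CPU tables live in one place in i386.c.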
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index ac36887e94a..cf3b3ff319b 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -1004,187 +1004,221 @@ const struct processor_costs *ix86_cost = &pentium_cost;
    (PPro/PENT4/NOCONA/CORE2/Athlon/K8).  */
 #define m_GENERIC (m_GENERIC32 | m_GENERIC64)
 
-/* Leave is not affecting Nocona SPEC2000 results negatively, so enabling for
-   Generic64 seems like good code size tradeoff.  We can't enable it for 32bit
-   generic because it is not working well with PPro base chips.  */
-const int x86_use_leave = m_386 | m_K6_GEODE | m_ATHLON_K8_AMDFAM10 | m_CORE2
-                          | m_GENERIC64;
-const int x86_push_memory = m_386 | m_K6_GEODE | m_ATHLON_K8_AMDFAM10 | m_PENT4
-                            | m_NOCONA | m_CORE2 | m_GENERIC;
-const int x86_zero_extend_with_and = m_486 | m_PENT;
-/* Enable to zero extend integer registers to avoid partial dependencies */
-const int x86_movx = m_ATHLON_K8_AMDFAM10 | m_PPRO | m_PENT4 | m_NOCONA
-                     | m_CORE2 | m_GENERIC | m_GEODE /* m_386 | m_K6 */;
-const int x86_double_with_add = ~m_386;
-const int x86_use_bit_test = m_386;
-const int x86_unroll_strlen = m_486 | m_PENT | m_PPRO | m_ATHLON_K8_AMDFAM10
-                              | m_K6 | m_CORE2 | m_GENERIC;
-const int x86_cmove = m_PPRO | m_GEODE | m_ATHLON_K8_AMDFAM10 | m_PENT4
-                      | m_NOCONA;
-const int x86_3dnow_a = m_ATHLON_K8_AMDFAM10;
-const int x86_deep_branch = m_PPRO | m_K6_GEODE | m_ATHLON_K8_AMDFAM10
-                            | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC;
-/* Branch hints were put in P4 based on simulation result. But
-   after P4 was made, no performance benefit was observed with
-   branch hints. It also increases the code size. As the result,
-   icc never generates branch hints. */
-const int x86_branch_hints = 0;
-const int x86_use_sahf = m_PPRO | m_K6_GEODE | m_PENT4 | m_NOCONA | m_GENERIC32;
- /*m_GENERIC | m_ATHLON_K8 ? */
-/* We probably ought to watch for partial register stalls on Generic32
-   compilation setting as well.  However in current implementation the
-   partial register stalls are not eliminated very well - they can
-   be introduced via subregs synthesized by combine and can happen
-   in caller/callee saving sequences.
-   Because this option pays back little on PPro based chips and is in conflict
-   with partial reg. dependencies used by Athlon/P4 based chips, it is better
-   to leave it off for generic32 for now. */
-const int x86_partial_reg_stall = m_PPRO;
-const int x86_partial_flag_reg_stall = m_CORE2 | m_GENERIC;
-const int x86_use_himode_fiop = m_386 | m_486 | m_K6_GEODE;
-const int x86_use_simode_fiop = ~(m_PPRO | m_ATHLON_K8_AMDFAM10 | m_PENT
-                                  | m_CORE2 | m_GENERIC);
-const int x86_use_mov0 = m_K6;
-const int x86_use_cltd = ~(m_PENT | m_K6 | m_CORE2 | m_GENERIC);
-/* Use xchgb %rh,%rl instead of rolw/rorw $8,rx.  */
-const int x86_use_xchgb = m_PENT4;
-const int x86_read_modify_write = ~m_PENT;
-const int x86_read_modify = ~(m_PENT | m_PPRO);
-const int x86_split_long_moves = m_PPRO;
-const int x86_promote_QImode = m_K6_GEODE | m_PENT | m_386 | m_486
-                               | m_ATHLON_K8_AMDFAM10 | m_CORE2 | m_GENERIC;
- /* m_PENT4 ? */
-const int x86_fast_prefix = ~(m_PENT | m_486 | m_386);
-const int x86_single_stringop = m_386 | m_PENT4 | m_NOCONA;
-const int x86_qimode_math = ~(0);
-const int x86_promote_qi_regs = 0;
-/* On PPro this flag is meant to avoid partial register stalls.  Just like
-   the x86_partial_reg_stall this option might be considered for Generic32
-   if our scheme for avoiding partial stalls was more effective.  */
-const int x86_himode_math = ~(m_PPRO);
-const int x86_promote_hi_regs = m_PPRO;
-/* Enable if add/sub rsp is preferred over 1 or 2 push/pop */
-const int x86_sub_esp_4 = m_ATHLON_K8_AMDFAM10 | m_PPRO | m_PENT4 | m_NOCONA
-                          | m_CORE2 | m_GENERIC;
-const int x86_sub_esp_8 = m_ATHLON_K8_AMDFAM10 | m_PPRO | m_386 | m_486
-                          | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC;
-const int x86_add_esp_4 = m_ATHLON_K8_AMDFAM10 | m_K6_GEODE | m_PENT4 | m_NOCONA
-                          | m_CORE2 | m_GENERIC;
-const int x86_add_esp_8 = m_ATHLON_K8_AMDFAM10 | m_PPRO | m_K6_GEODE | m_386
-                          | m_486 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC;
-/* Enable if integer moves are preferred for DFmode copies */
-const int x86_integer_DFmode_moves = ~(m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA
-                                       | m_PPRO | m_CORE2 | m_GENERIC | m_GEODE);
-const int x86_partial_reg_dependency = m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA
-                                       | m_CORE2 | m_GENERIC;
-const int x86_memory_mismatch_stall = m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA
-                                      | m_CORE2 | m_GENERIC;
-/* If ACCUMULATE_OUTGOING_ARGS is enabled, the maximum amount of space required
-   for outgoing arguments will be computed and placed into the variable
-   `current_function_outgoing_args_size'.  No space will be pushed onto the stack
-   for each call; instead, the function prologue should increase the stack frame
-   size by this amount.  Setting both PUSH_ARGS and ACCUMULATE_OUTGOING_ARGS is
-   not proper.  */
-const int x86_accumulate_outgoing_args = m_ATHLON_K8_AMDFAM10 | m_PENT4
-                                         | m_NOCONA | m_PPRO | m_CORE2
-                                         | m_GENERIC;
-const int x86_prologue_using_move = m_ATHLON_K8 | m_PPRO | m_CORE2 | m_GENERIC;
-const int x86_epilogue_using_move = m_ATHLON_K8 | m_PPRO | m_CORE2 | m_GENERIC;
-const int x86_shift1 = ~m_486;
-const int x86_arch_always_fancy_math_387 = m_PENT | m_PPRO
-                                           | m_ATHLON_K8_AMDFAM10 | m_PENT4
-                                           | m_NOCONA | m_CORE2 | m_GENERIC;
-/* In Generic model we have an conflict here in between PPro/Pentium4 based chips
-   that thread 128bit SSE registers as single units versus K8 based chips that
-   divide SSE registers to two 64bit halves.
-   x86_sse_partial_reg_dependency promote all store destinations to be 128bit
-   to allow register renaming on 128bit SSE units, but usually results in one
-   extra microop on 64bit SSE units. Experimental results shows that disabling
-   this option on P4 brings over 20% SPECfp regression, while enabling it on
-   K8 brings roughly 2.4% regression that can be partly masked by careful scheduling
-   of moves. */
-const int x86_sse_partial_reg_dependency = m_PENT4 | m_NOCONA | m_PPRO | m_CORE2
-                                           | m_GENERIC | m_AMDFAM10;
-/* Set for machines where the type and dependencies are resolved on SSE
-   register parts instead of whole registers, so we may maintain just
-   lower part of scalar values in proper format leaving the upper part
-   undefined.  */
-const int x86_sse_split_regs = m_ATHLON_K8;
-/* Code generation for scalar reg-reg moves of single and double precision data:
-     if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
-       movaps reg, reg
-     else
-       movss reg, reg
-     if (x86_sse_partial_reg_dependency == true)
-       movapd reg, reg
-     else
-       movsd reg, reg
+/* Feature tests against the various tunings.  */
+unsigned int ix86_tune_features[X86_TUNE_LAST] = {
+  /* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
+     negatively, so enabling it for Generic64 seems like a good code size
+     tradeoff.  We can't enable it for 32bit generic because it does not
+     work well with PPro base chips.  */
+  m_386 | m_K6_GEODE | m_ATHLON_K8_AMDFAM10 | m_CORE2 | m_GENERIC64,
 
-   Code generation for scalar loads of double precision data:
-     if (x86_sse_split_regs == true)
-       movlpd mem, reg (gas syntax)
-     else
-       movsd mem, reg
-
-   Code generation for unaligned packed loads of single precision data
-   (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
-     if (x86_sse_unaligned_move_optimal)
-       movups mem, reg
+  /* X86_TUNE_PUSH_MEMORY */
+  m_386 | m_K6_GEODE | m_ATHLON_K8_AMDFAM10 | m_PENT4
+  | m_NOCONA | m_CORE2 | m_GENERIC,
 
-     if (x86_sse_partial_reg_dependency == true)
-       {
-         xorps reg, reg
-         movlps mem, reg
-         movhps mem+8, reg
-       }
-     else
-       {
-         movlps mem, reg
-         movhps mem+8, reg
-       }
+  /* X86_TUNE_ZERO_EXTEND_WITH_AND */
+  m_486 | m_PENT,
 
-   Code generation for unaligned packed loads of double precision data
-   (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
-     if (x86_sse_unaligned_move_optimal)
-       movupd mem, reg
+  /* X86_TUNE_USE_BIT_TEST */
+  m_386,
 
-     if (x86_sse_split_regs == true)
-       {
-         movlpd mem, reg
-         movhpd mem+8, reg
-       }
-     else
-       {
-         movsd mem, reg
-         movhpd mem+8, reg
-       }
-   */
-const int x86_sse_unaligned_move_optimal = m_AMDFAM10;
-const int x86_sse_typeless_stores = m_ATHLON_K8_AMDFAM10;
-const int x86_sse_load0_by_pxor = m_PPRO | m_PENT4 | m_NOCONA;
-const int x86_use_ffreep = m_ATHLON_K8_AMDFAM10;
-const int x86_use_incdec = ~(m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC);
+  /* X86_TUNE_UNROLL_STRLEN */
+  m_486 | m_PENT | m_PPRO | m_ATHLON_K8_AMDFAM10 | m_K6 | m_CORE2 | m_GENERIC,
 
-const int x86_inter_unit_moves = ~(m_ATHLON_K8_AMDFAM10 | m_GENERIC);
+  /* X86_TUNE_DEEP_BRANCH_PREDICTION */
+  m_PPRO | m_K6_GEODE | m_ATHLON_K8_AMDFAM10 | m_PENT4
+  | m_NOCONA | m_CORE2 | m_GENERIC,
 
-const int x86_ext_80387_constants = m_K6_GEODE | m_ATHLON_K8 | m_PENT4
-                                    | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC;
-/* Some CPU cores are not able to predict more than 4 branch instructions in
-   the 16 byte window.  */
-const int x86_four_jump_limit = m_PPRO | m_ATHLON_K8_AMDFAM10 | m_PENT4
-                                | m_NOCONA | m_CORE2 | m_GENERIC;
-const int x86_schedule = m_PPRO | m_ATHLON_K8_AMDFAM10 | m_K6_GEODE | m_PENT
-                         | m_CORE2 | m_GENERIC;
-const int x86_use_bt = m_ATHLON_K8_AMDFAM10;
-/* Compare and exchange was added for 80486.  */
-const int x86_cmpxchg = ~m_386;
-/* Compare and exchange 8 bytes was added for pentium.  */
-const int x86_cmpxchg8b = ~(m_386 | m_486);
-/* Exchange and add was added for 80486.  */
-const int x86_xadd = ~m_386;
-/* Byteswap was added for 80486.  */
-const int x86_bswap = ~m_386;
-const int x86_pad_returns = m_ATHLON_K8_AMDFAM10 | m_CORE2 | m_GENERIC;
+  /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
+     on simulation results.  But after P4 was made, no performance benefit
+     was observed with branch hints.  It also increases the code size.
+     As a result, icc never generates branch hints.  */
+  0,
+
+  /* X86_TUNE_DOUBLE_WITH_ADD */
+  ~m_386,
+
+  /* X86_TUNE_USE_SAHF */
+  m_PPRO | m_K6_GEODE | m_PENT4 | m_NOCONA | m_GENERIC32,
+  /* | m_GENERIC | m_ATHLON_K8 ? */
+
+  /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
+     partial dependencies.  */
+  m_ATHLON_K8_AMDFAM10 | m_PPRO | m_PENT4 | m_NOCONA
+  | m_CORE2 | m_GENERIC | m_GEODE /* m_386 | m_K6 */,
+
+  /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
+     register stalls on the Generic32 compilation setting as well.  However,
+     in the current implementation the partial register stalls are not
+     eliminated very well - they can be introduced via subregs synthesized
+     by combine and can happen in caller/callee saving sequences.  Because
+     this option pays back little on PPro based chips and is in conflict with
+     partial reg dependencies used by Athlon/P4 based chips, it is better to
+     leave it off for generic32 for now.  */
+  m_PPRO,
+
+  /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
+  m_CORE2 | m_GENERIC,
+
+  /* X86_TUNE_USE_HIMODE_FIOP */
+  m_386 | m_486 | m_K6_GEODE,
+
+  /* X86_TUNE_USE_SIMODE_FIOP */
+  ~(m_PPRO | m_ATHLON_K8_AMDFAM10 | m_PENT | m_CORE2 | m_GENERIC),
+
+  /* X86_TUNE_USE_MOV0 */
+  m_K6,
+
+  /* X86_TUNE_USE_CLTD */
+  ~(m_PENT | m_K6 | m_CORE2 | m_GENERIC),
+
+  /* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx.  */
+  m_PENT4,
+
+  /* X86_TUNE_SPLIT_LONG_MOVES */
+  m_PPRO,
+
+  /* X86_TUNE_READ_MODIFY_WRITE */
+  ~m_PENT,
+
+  /* X86_TUNE_READ_MODIFY */
+  ~(m_PENT | m_PPRO),
+
+  /* X86_TUNE_PROMOTE_QIMODE */
+  m_K6_GEODE | m_PENT | m_386 | m_486 | m_ATHLON_K8_AMDFAM10 | m_CORE2
+  | m_GENERIC /* | m_PENT4 ? */,
+
+  /* X86_TUNE_FAST_PREFIX */
+  ~(m_PENT | m_486 | m_386),
+
+  /* X86_TUNE_SINGLE_STRINGOP */
+  m_386 | m_PENT4 | m_NOCONA,
+
+  /* X86_TUNE_QIMODE_MATH */
+  ~0,
+
+  /* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
+     register stalls.  Just like X86_TUNE_PARTIAL_REG_STALL this option
+     might be considered for Generic32 if our scheme for avoiding partial
+     stalls was more effective.  */
+  ~m_PPRO,
+
+  /* X86_TUNE_PROMOTE_QI_REGS */
+  0,
+
+  /* X86_TUNE_PROMOTE_HI_REGS */
+  m_PPRO,
+
+  /* X86_TUNE_ADD_ESP_4: Enable if add/sub is preferred over 1/2 push/pop.  */
+  m_ATHLON_K8_AMDFAM10 | m_K6_GEODE | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
+
+  /* X86_TUNE_ADD_ESP_8 */
+  m_ATHLON_K8_AMDFAM10 | m_PPRO | m_K6_GEODE | m_386
+  | m_486 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
+
+  /* X86_TUNE_SUB_ESP_4 */
+  m_ATHLON_K8_AMDFAM10 | m_PPRO | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
+
+  /* X86_TUNE_SUB_ESP_8 */
+  m_ATHLON_K8_AMDFAM10 | m_PPRO | m_386 | m_486
+  | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
+
+  /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
+     for DFmode copies.  */
+  ~(m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2
+    | m_GENERIC | m_GEODE),
+
+  /* X86_TUNE_PARTIAL_REG_DEPENDENCY */
+  m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
+
+  /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
+     conflict here between PPro/Pentium4 based chips that treat 128bit
+     SSE registers as single units versus K8 based chips that divide SSE
+     registers into two 64bit halves.  This knob promotes all store
+     destinations to be 128bit to allow register renaming on 128bit SSE
+     units, but usually results in one extra microop on 64bit SSE units.
+     Experimental results show that disabling this option on P4 brings over
+     20% SPECfp regression, while enabling it on K8 brings roughly 2.4%
+     regression that can be partly masked by careful scheduling of moves.  */
+  m_PENT4 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC | m_AMDFAM10,
+
+  /* X86_TUNE_SSE_UNALIGNED_MOVE_OPTIMAL */
+  m_AMDFAM10,
+
+  /* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
+     are resolved on SSE register parts instead of whole registers, so we may
+     maintain just the lower part of scalar values in the proper format,
+     leaving the upper part undefined.  */
+  m_ATHLON_K8,
+
+  /* X86_TUNE_SSE_TYPELESS_STORES */
+  m_ATHLON_K8_AMDFAM10,
+
+  /* X86_TUNE_SSE_LOAD0_BY_PXOR */
+  m_PPRO | m_PENT4 | m_NOCONA,
+
+  /* X86_TUNE_MEMORY_MISMATCH_STALL */
+  m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
+
+  /* X86_TUNE_PROLOGUE_USING_MOVE */
+  m_ATHLON_K8 | m_PPRO | m_CORE2 | m_GENERIC,
+
+  /* X86_TUNE_EPILOGUE_USING_MOVE */
+  m_ATHLON_K8 | m_PPRO | m_CORE2 | m_GENERIC,
+
+  /* X86_TUNE_SHIFT1 */
+  ~m_486,
+
+  /* X86_TUNE_USE_FFREEP */
+  m_ATHLON_K8_AMDFAM10,
+
+  /* X86_TUNE_INTER_UNIT_MOVES */
+  ~(m_ATHLON_K8_AMDFAM10 | m_GENERIC),
+
+  /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
+     than 4 branch instructions in the 16 byte window.  */
+  m_PPRO | m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
+
+  /* X86_TUNE_SCHEDULE */
+  m_PPRO | m_ATHLON_K8_AMDFAM10 | m_K6_GEODE | m_PENT | m_CORE2 | m_GENERIC,
+
+  /* X86_TUNE_USE_BT */
+  m_ATHLON_K8_AMDFAM10,
+
+  /* X86_TUNE_USE_INCDEC */
+  ~(m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC),
+
+  /* X86_TUNE_PAD_RETURNS */
+  m_ATHLON_K8_AMDFAM10 | m_CORE2 | m_GENERIC,
+
+  /* X86_TUNE_EXT_80387_CONSTANTS */
+  m_K6_GEODE | m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC
+};
+
+/* Feature tests against the various architecture variations.  */
+unsigned int ix86_arch_features[X86_ARCH_LAST] = {
+  /* X86_ARCH_CMOVE */
+  m_PPRO | m_GEODE | m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA,
+
+  /* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486.  */
+  ~m_386,
+
+  /* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium.  */
+  ~(m_386 | m_486),
+
+  /* X86_ARCH_XADD: Exchange and add was added for 80486.  */
+  ~m_386,
+
+  /* X86_ARCH_BSWAP: Byteswap was added for 80486.  */
+  ~m_386,
+};
+
+static const unsigned int x86_accumulate_outgoing_args
+  = m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC;
+
+static const unsigned int x86_arch_always_fancy_math_387
+  = m_PENT | m_PPRO | m_ATHLON_K8_AMDFAM10 | m_PENT4
+    | m_NOCONA | m_CORE2 | m_GENERIC;
 
 static enum stringop_alg stringop_alg = no_stringop;
@@ -1397,11 +1431,9 @@ enum fpmath_unit ix86_fpmath;
 
 /* Which cpu are we scheduling for.  */
 enum processor_type ix86_tune;
-int ix86_tune_mask;
 
 /* Which instruction set architecture to use.  */
 enum processor_type ix86_arch;
-int ix86_arch_mask;
 
 /* true if sse prefetch instruction is not NOOP.  */
 int x86_prefetch_sse;
@@ -1811,6 +1843,7 @@ override_options (void)
 {
   int i;
   int ix86_tune_defaulted = 0;
+  unsigned int ix86_arch_mask, ix86_tune_mask;
 
   /* Comes from final.c -- no real reason to change it.  */
 #define MAX_CODE_ALIGN 16
@@ -2124,6 +2157,10 @@ override_options (void)
   if (i == pta_size)
     error ("bad value (%s) for -march= switch", ix86_arch_string);
 
+  ix86_arch_mask = 1u << ix86_arch;
+  for (i = 0; i < X86_ARCH_LAST; ++i)
+    ix86_arch_features[i] &= ix86_arch_mask;
+
   for (i = 0; i < pta_size; i++)
     if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
       {
@@ -2155,8 +2192,9 @@ override_options (void)
   if (i == pta_size)
     error ("bad value (%s) for -mtune= switch", ix86_tune_string);
 
-  ix86_arch_mask = 1 << ix86_arch;
-  ix86_tune_mask = 1 << ix86_tune;
+  ix86_tune_mask = 1u << ix86_tune;
+  for (i = 0; i < X86_TUNE_LAST; ++i)
+    ix86_tune_features[i] &= ix86_tune_mask;
 
   if (optimize_size)
     ix86_cost = &size_cost;
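Worth noting before the remaining override_options hunks: because the
TARGET_* macros now expand to array elements, they are lvalues.  The special
cases the old macro bodies folded into every expansion -- TARGET_CMOVE's
"(x86_cmove & ix86_arch_mask) || TARGET_SSE" and TARGET_USE_SAHF's
"&& !TARGET_64BIT" -- can therefore be applied once, by plain assignment,
as a hunk further below does.  A toy illustration of the macro-as-lvalue
idiom (names hypothetical, not from i386.h):

    unsigned int arch_features[2];

    #define HAVE_CMOVE arch_features[0]   /* expands to an lvalue */

    void
    adjust_for_sse (int have_sse)
    {
      /* Assigning through the macro updates the array slot itself.  */
      if (have_sse)
        HAVE_CMOVE = 1;
    }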
@@ -2366,7 +2404,6 @@ override_options (void)
     error ("-msseregparm used without SSE enabled");
 
   ix86_fpmath = TARGET_FPMATH_DEFAULT;
-
   if (ix86_fpmath_string != 0)
     {
       if (! strcmp (ix86_fpmath_string, "387"))
@@ -2425,6 +2462,15 @@ override_options (void)
       target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
     }
 
+  /* For sane SSE instruction set generation we need the fcomi instruction.
+     It is safe to enable all CMOVE instructions.  */
+  if (TARGET_SSE)
+    TARGET_CMOVE = 1;
+
+  /* ??? Any idea why this is unconditionally disabled for 64-bit?  */
+  if (TARGET_64BIT)
+    TARGET_USE_SAHF = 0;
+
   /* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix.  */
   {
     char *p;
@@ -4999,7 +5045,7 @@ standard_80387_constant_p (rtx x)
   /* For XFmode constants, try to find a special 80387 instruction
      when optimizing for size or on those CPUs that benefit from them.  */
   if (GET_MODE (x) == XFmode
-      && (optimize_size || x86_ext_80387_constants & ix86_tune_mask))
+      && (optimize_size || TARGET_EXT_80387_CONSTANTS))
     {
       int i;
@@ -9499,6 +9545,55 @@ ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
 
 /* Implement the movmisalign patterns for SSE.  Non-SSE modes go
    straight to ix86_expand_vector_move.  */
+/* Code generation for scalar reg-reg moves of single and double precision data:
+     if (x86_sse_partial_reg_dependency == true || x86_sse_split_regs == true)
+       movaps reg, reg
+     else
+       movss reg, reg
+     if (x86_sse_partial_reg_dependency == true)
+       movapd reg, reg
+     else
+       movsd reg, reg
+
+   Code generation for scalar loads of double precision data:
+     if (x86_sse_split_regs == true)
+       movlpd mem, reg (gas syntax)
+     else
+       movsd mem, reg
+
+   Code generation for unaligned packed loads of single precision data
+   (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
+     if (x86_sse_unaligned_move_optimal)
+       movups mem, reg
+
+     if (x86_sse_partial_reg_dependency == true)
+       {
+         xorps reg, reg
+         movlps mem, reg
+         movhps mem+8, reg
+       }
+     else
+       {
+         movlps mem, reg
+         movhps mem+8, reg
+       }
+
+   Code generation for unaligned packed loads of double precision data
+   (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
+     if (x86_sse_unaligned_move_optimal)
+       movupd mem, reg
+
+     if (x86_sse_split_regs == true)
+       {
+         movlpd mem, reg
+         movhpd mem+8, reg
+       }
+     else
+       {
+         movsd mem, reg
+         movhpd mem+8, reg
+       }
+   */
 void
 ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])
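The block comment moved next to ix86_expand_vector_move_misalign above is
the specification the expander follows when emitting unaligned SSE loads.
For the single-precision case, the two main strategies can be rendered with
SSE intrinsics to make the tradeoff concrete (a hand-written sketch for
illustration only; the compiler emits these instruction patterns itself
rather than calling intrinsics):

    #include <xmmintrin.h>

    /* X86_TUNE_SSE_UNALIGNED_MOVE_OPTIMAL (AMDFAM10): a single movups,
       because full-width unaligned loads are cheap there.  */
    static __m128
    load4f_unaligned (const float *p)
    {
      return _mm_loadu_ps (p);
    }

    /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY (PPro/P4/...): clear the register
       first (xorps) so the two half-width loads do not carry a false
       dependency on its previous contents.  */
    static __m128
    load4f_split (const float *p)
    {
      __m128 x = _mm_setzero_ps ();                   /* xorps reg, reg    */
      x = _mm_loadl_pi (x, (const __m64 *) p);        /* movlps mem, reg   */
      x = _mm_loadh_pi (x, (const __m64 *) (p + 2));  /* movhps mem+8, reg */
      return x;
    }

The third variant in the comment is the same movlps/movhps pair without the
clearing xorps, for tunings where the false dependency does not matter.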
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index f77fc766462..8e3032cf515 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -179,110 +179,165 @@ extern const struct processor_costs *ix86_cost;
 #define TARGET_GENERIC (TARGET_GENERIC32 || TARGET_GENERIC64)
 #define TARGET_AMDFAM10 (ix86_tune == PROCESSOR_AMDFAM10)
 
-extern const int x86_use_leave, x86_push_memory, x86_zero_extend_with_and;
-extern const int x86_use_bit_test, x86_cmove, x86_deep_branch;
-extern const int x86_branch_hints, x86_unroll_strlen;
-extern const int x86_double_with_add, x86_partial_reg_stall, x86_movx;
-extern const int x86_use_himode_fiop, x86_use_simode_fiop;
-extern const int x86_use_mov0, x86_use_cltd, x86_use_xchgb;
-extern const int x86_read_modify_write, x86_read_modify, x86_split_long_moves;
-extern const int x86_promote_QImode, x86_single_stringop, x86_fast_prefix;
-extern const int x86_himode_math, x86_qimode_math, x86_promote_qi_regs;
-extern const int x86_promote_hi_regs, x86_integer_DFmode_moves;
-extern const int x86_add_esp_4, x86_add_esp_8, x86_sub_esp_4, x86_sub_esp_8;
-extern const int x86_partial_reg_dependency, x86_memory_mismatch_stall;
-extern const int x86_accumulate_outgoing_args, x86_prologue_using_move;
-extern const int x86_epilogue_using_move, x86_decompose_lea;
-extern const int x86_arch_always_fancy_math_387, x86_shift1;
-extern const int x86_sse_partial_reg_dependency, x86_sse_split_regs;
-extern const int x86_sse_unaligned_move_optimal;
-extern const int x86_sse_typeless_stores, x86_sse_load0_by_pxor;
-extern const int x86_use_ffreep;
-extern const int x86_inter_unit_moves, x86_schedule;
-extern const int x86_use_bt;
-extern const int x86_cmpxchg, x86_cmpxchg8b, x86_xadd;
-extern const int x86_use_incdec;
-extern const int x86_pad_returns;
-extern const int x86_bswap;
-extern const int x86_partial_flag_reg_stall;
-extern int x86_prefetch_sse, x86_cmpxchg16b;
+/* Feature tests against the various tunings.  */
+enum ix86_tune_indices {
+  X86_TUNE_USE_LEAVE,
+  X86_TUNE_PUSH_MEMORY,
+  X86_TUNE_ZERO_EXTEND_WITH_AND,
+  X86_TUNE_USE_BIT_TEST,
+  X86_TUNE_UNROLL_STRLEN,
+  X86_TUNE_DEEP_BRANCH_PREDICTION,
+  X86_TUNE_BRANCH_PREDICTION_HINTS,
+  X86_TUNE_DOUBLE_WITH_ADD,
+  X86_TUNE_USE_SAHF,		/* && !TARGET_64BIT */
+  X86_TUNE_MOVX,
+  X86_TUNE_PARTIAL_REG_STALL,
+  X86_TUNE_PARTIAL_FLAG_REG_STALL,
+  X86_TUNE_USE_HIMODE_FIOP,
+  X86_TUNE_USE_SIMODE_FIOP,
+  X86_TUNE_USE_MOV0,
+  X86_TUNE_USE_CLTD,
+  X86_TUNE_USE_XCHGB,
+  X86_TUNE_SPLIT_LONG_MOVES,
+  X86_TUNE_READ_MODIFY_WRITE,
+  X86_TUNE_READ_MODIFY,
+  X86_TUNE_PROMOTE_QIMODE,
+  X86_TUNE_FAST_PREFIX,
+  X86_TUNE_SINGLE_STRINGOP,
+  X86_TUNE_QIMODE_MATH,
+  X86_TUNE_HIMODE_MATH,
+  X86_TUNE_PROMOTE_QI_REGS,
+  X86_TUNE_PROMOTE_HI_REGS,
+  X86_TUNE_ADD_ESP_4,
+  X86_TUNE_ADD_ESP_8,
+  X86_TUNE_SUB_ESP_4,
+  X86_TUNE_SUB_ESP_8,
+  X86_TUNE_INTEGER_DFMODE_MOVES,
+  X86_TUNE_PARTIAL_REG_DEPENDENCY,
+  X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY,
+  X86_TUNE_SSE_UNALIGNED_MOVE_OPTIMAL,
+  X86_TUNE_SSE_SPLIT_REGS,
+  X86_TUNE_SSE_TYPELESS_STORES,
+  X86_TUNE_SSE_LOAD0_BY_PXOR,
+  X86_TUNE_MEMORY_MISMATCH_STALL,
+  X86_TUNE_PROLOGUE_USING_MOVE,
+  X86_TUNE_EPILOGUE_USING_MOVE,
+  X86_TUNE_SHIFT1,
+  X86_TUNE_USE_FFREEP,
+  X86_TUNE_INTER_UNIT_MOVES,
+  X86_TUNE_FOUR_JUMP_LIMIT,
+  X86_TUNE_SCHEDULE,
+  X86_TUNE_USE_BT,
+  X86_TUNE_USE_INCDEC,
+  X86_TUNE_PAD_RETURNS,
+  X86_TUNE_EXT_80387_CONSTANTS,
 
-#define TARGET_USE_LEAVE (x86_use_leave & ix86_tune_mask)
-#define TARGET_PUSH_MEMORY (x86_push_memory & ix86_tune_mask)
-#define TARGET_ZERO_EXTEND_WITH_AND (x86_zero_extend_with_and & ix86_tune_mask)
-#define TARGET_USE_BIT_TEST (x86_use_bit_test & ix86_tune_mask)
-#define TARGET_UNROLL_STRLEN (x86_unroll_strlen & ix86_tune_mask)
-/* For sane SSE instruction set generation we need fcomi instruction.  It is
-   safe to enable all CMOVE instructions.  */
-#define TARGET_CMOVE ((x86_cmove & ix86_arch_mask) || TARGET_SSE)
-#define TARGET_FISTTP (TARGET_SSE3 && TARGET_80387)
-#define TARGET_DEEP_BRANCH_PREDICTION (x86_deep_branch & ix86_tune_mask)
-#define TARGET_BRANCH_PREDICTION_HINTS (x86_branch_hints & ix86_tune_mask)
-#define TARGET_DOUBLE_WITH_ADD (x86_double_with_add & ix86_tune_mask)
-#define TARGET_USE_SAHF ((x86_use_sahf & ix86_tune_mask) && !TARGET_64BIT)
-#define TARGET_MOVX (x86_movx & ix86_tune_mask)
-#define TARGET_PARTIAL_REG_STALL (x86_partial_reg_stall & ix86_tune_mask)
-#define TARGET_PARTIAL_FLAG_REG_STALL \
-  (x86_partial_flag_reg_stall & ix86_tune_mask)
-#define TARGET_USE_HIMODE_FIOP (x86_use_himode_fiop & ix86_tune_mask)
-#define TARGET_USE_SIMODE_FIOP (x86_use_simode_fiop & ix86_tune_mask)
-#define TARGET_USE_MOV0 (x86_use_mov0 & ix86_tune_mask)
-#define TARGET_USE_CLTD (x86_use_cltd & ix86_tune_mask)
-#define TARGET_USE_XCHGB (x86_use_xchgb & ix86_tune_mask)
-#define TARGET_SPLIT_LONG_MOVES (x86_split_long_moves & ix86_tune_mask)
-#define TARGET_READ_MODIFY_WRITE (x86_read_modify_write & ix86_tune_mask)
-#define TARGET_READ_MODIFY (x86_read_modify & ix86_tune_mask)
-#define TARGET_PROMOTE_QImode (x86_promote_QImode & ix86_tune_mask)
-#define TARGET_FAST_PREFIX (x86_fast_prefix & ix86_tune_mask)
-#define TARGET_SINGLE_STRINGOP (x86_single_stringop & ix86_tune_mask)
-#define TARGET_QIMODE_MATH (x86_qimode_math & ix86_tune_mask)
-#define TARGET_HIMODE_MATH (x86_himode_math & ix86_tune_mask)
-#define TARGET_PROMOTE_QI_REGS (x86_promote_qi_regs & ix86_tune_mask)
-#define TARGET_PROMOTE_HI_REGS (x86_promote_hi_regs & ix86_tune_mask)
-#define TARGET_ADD_ESP_4 (x86_add_esp_4 & ix86_tune_mask)
-#define TARGET_ADD_ESP_8 (x86_add_esp_8 & ix86_tune_mask)
-#define TARGET_SUB_ESP_4 (x86_sub_esp_4 & ix86_tune_mask)
-#define TARGET_SUB_ESP_8 (x86_sub_esp_8 & ix86_tune_mask)
-#define TARGET_INTEGER_DFMODE_MOVES (x86_integer_DFmode_moves & ix86_tune_mask)
-#define TARGET_PARTIAL_REG_DEPENDENCY \
-  (x86_partial_reg_dependency & ix86_tune_mask)
-#define TARGET_SSE_PARTIAL_REG_DEPENDENCY \
-  (x86_sse_partial_reg_dependency & ix86_tune_mask)
-#define TARGET_SSE_UNALIGNED_MOVE_OPTIMAL \
-  (x86_sse_unaligned_move_optimal & ix86_tune_mask)
-#define TARGET_SSE_SPLIT_REGS (x86_sse_split_regs & ix86_tune_mask)
-#define TARGET_SSE_TYPELESS_STORES (x86_sse_typeless_stores & ix86_tune_mask)
-#define TARGET_SSE_LOAD0_BY_PXOR (x86_sse_load0_by_pxor & ix86_tune_mask)
-#define TARGET_MEMORY_MISMATCH_STALL \
-  (x86_memory_mismatch_stall & ix86_tune_mask)
-#define TARGET_PROLOGUE_USING_MOVE (x86_prologue_using_move & ix86_tune_mask)
-#define TARGET_EPILOGUE_USING_MOVE (x86_epilogue_using_move & ix86_tune_mask)
-#define TARGET_PREFETCH_SSE (x86_prefetch_sse)
-#define TARGET_SHIFT1 (x86_shift1 & ix86_tune_mask)
-#define TARGET_USE_FFREEP (x86_use_ffreep & ix86_tune_mask)
-#define TARGET_INTER_UNIT_MOVES (x86_inter_unit_moves & ix86_tune_mask)
-#define TARGET_FOUR_JUMP_LIMIT (x86_four_jump_limit & ix86_tune_mask)
-#define TARGET_SCHEDULE (x86_schedule & ix86_tune_mask)
-#define TARGET_USE_BT (x86_use_bt & ix86_tune_mask)
-#define TARGET_USE_INCDEC (x86_use_incdec & ix86_tune_mask)
-#define TARGET_PAD_RETURNS (x86_pad_returns & ix86_tune_mask)
+  X86_TUNE_LAST
+};
 
-#define ASSEMBLER_DIALECT (ix86_asm_dialect)
+extern unsigned int ix86_tune_features[X86_TUNE_LAST];
 
-#define TARGET_SSE_MATH ((ix86_fpmath & FPMATH_SSE) != 0)
-#define TARGET_MIX_SSE_I387 ((ix86_fpmath & FPMATH_SSE) \
-			     && (ix86_fpmath & FPMATH_387))
+#define TARGET_USE_LEAVE	ix86_tune_features[X86_TUNE_USE_LEAVE]
+#define TARGET_PUSH_MEMORY	ix86_tune_features[X86_TUNE_PUSH_MEMORY]
+#define TARGET_ZERO_EXTEND_WITH_AND \
+	ix86_tune_features[X86_TUNE_ZERO_EXTEND_WITH_AND]
+#define TARGET_USE_BIT_TEST	ix86_tune_features[X86_TUNE_USE_BIT_TEST]
+#define TARGET_UNROLL_STRLEN	ix86_tune_features[X86_TUNE_UNROLL_STRLEN]
+#define TARGET_DEEP_BRANCH_PREDICTION \
+	ix86_tune_features[X86_TUNE_DEEP_BRANCH_PREDICTION]
+#define TARGET_BRANCH_PREDICTION_HINTS \
+	ix86_tune_features[X86_TUNE_BRANCH_PREDICTION_HINTS]
+#define TARGET_DOUBLE_WITH_ADD	ix86_tune_features[X86_TUNE_DOUBLE_WITH_ADD]
+#define TARGET_USE_SAHF		ix86_tune_features[X86_TUNE_USE_SAHF]
+#define TARGET_MOVX		ix86_tune_features[X86_TUNE_MOVX]
+#define TARGET_PARTIAL_REG_STALL ix86_tune_features[X86_TUNE_PARTIAL_REG_STALL]
+#define TARGET_PARTIAL_FLAG_REG_STALL \
+	ix86_tune_features[X86_TUNE_PARTIAL_FLAG_REG_STALL]
+#define TARGET_USE_HIMODE_FIOP	ix86_tune_features[X86_TUNE_USE_HIMODE_FIOP]
+#define TARGET_USE_SIMODE_FIOP	ix86_tune_features[X86_TUNE_USE_SIMODE_FIOP]
+#define TARGET_USE_MOV0		ix86_tune_features[X86_TUNE_USE_MOV0]
+#define TARGET_USE_CLTD		ix86_tune_features[X86_TUNE_USE_CLTD]
+#define TARGET_USE_XCHGB	ix86_tune_features[X86_TUNE_USE_XCHGB]
+#define TARGET_SPLIT_LONG_MOVES	ix86_tune_features[X86_TUNE_SPLIT_LONG_MOVES]
+#define TARGET_READ_MODIFY_WRITE ix86_tune_features[X86_TUNE_READ_MODIFY_WRITE]
+#define TARGET_READ_MODIFY	ix86_tune_features[X86_TUNE_READ_MODIFY]
+#define TARGET_PROMOTE_QImode	ix86_tune_features[X86_TUNE_PROMOTE_QIMODE]
+#define TARGET_FAST_PREFIX	ix86_tune_features[X86_TUNE_FAST_PREFIX]
+#define TARGET_SINGLE_STRINGOP	ix86_tune_features[X86_TUNE_SINGLE_STRINGOP]
+#define TARGET_QIMODE_MATH	ix86_tune_features[X86_TUNE_QIMODE_MATH]
+#define TARGET_HIMODE_MATH	ix86_tune_features[X86_TUNE_HIMODE_MATH]
+#define TARGET_PROMOTE_QI_REGS	ix86_tune_features[X86_TUNE_PROMOTE_QI_REGS]
+#define TARGET_PROMOTE_HI_REGS	ix86_tune_features[X86_TUNE_PROMOTE_HI_REGS]
+#define TARGET_ADD_ESP_4	ix86_tune_features[X86_TUNE_ADD_ESP_4]
+#define TARGET_ADD_ESP_8	ix86_tune_features[X86_TUNE_ADD_ESP_8]
+#define TARGET_SUB_ESP_4	ix86_tune_features[X86_TUNE_SUB_ESP_4]
+#define TARGET_SUB_ESP_8	ix86_tune_features[X86_TUNE_SUB_ESP_8]
+#define TARGET_INTEGER_DFMODE_MOVES \
+	ix86_tune_features[X86_TUNE_INTEGER_DFMODE_MOVES]
+#define TARGET_PARTIAL_REG_DEPENDENCY \
+	ix86_tune_features[X86_TUNE_PARTIAL_REG_DEPENDENCY]
+#define TARGET_SSE_PARTIAL_REG_DEPENDENCY \
+	ix86_tune_features[X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY]
+#define TARGET_SSE_UNALIGNED_MOVE_OPTIMAL \
+	ix86_tune_features[X86_TUNE_SSE_UNALIGNED_MOVE_OPTIMAL]
+#define TARGET_SSE_SPLIT_REGS	ix86_tune_features[X86_TUNE_SSE_SPLIT_REGS]
+#define TARGET_SSE_TYPELESS_STORES \
+	ix86_tune_features[X86_TUNE_SSE_TYPELESS_STORES]
+#define TARGET_SSE_LOAD0_BY_PXOR ix86_tune_features[X86_TUNE_SSE_LOAD0_BY_PXOR]
+#define TARGET_MEMORY_MISMATCH_STALL \
+	ix86_tune_features[X86_TUNE_MEMORY_MISMATCH_STALL]
+#define TARGET_PROLOGUE_USING_MOVE \
+	ix86_tune_features[X86_TUNE_PROLOGUE_USING_MOVE]
+#define TARGET_EPILOGUE_USING_MOVE \
+	ix86_tune_features[X86_TUNE_EPILOGUE_USING_MOVE]
+#define TARGET_SHIFT1		ix86_tune_features[X86_TUNE_SHIFT1]
+#define TARGET_USE_FFREEP	ix86_tune_features[X86_TUNE_USE_FFREEP]
+#define TARGET_INTER_UNIT_MOVES	ix86_tune_features[X86_TUNE_INTER_UNIT_MOVES]
+#define TARGET_FOUR_JUMP_LIMIT	ix86_tune_features[X86_TUNE_FOUR_JUMP_LIMIT]
+#define TARGET_SCHEDULE		ix86_tune_features[X86_TUNE_SCHEDULE]
+#define TARGET_USE_BT		ix86_tune_features[X86_TUNE_USE_BT]
+#define TARGET_USE_INCDEC	ix86_tune_features[X86_TUNE_USE_INCDEC]
+#define TARGET_PAD_RETURNS	ix86_tune_features[X86_TUNE_PAD_RETURNS]
+#define TARGET_EXT_80387_CONSTANTS \
+	ix86_tune_features[X86_TUNE_EXT_80387_CONSTANTS]
 
-#define TARGET_GNU_TLS (ix86_tls_dialect == TLS_DIALECT_GNU)
-#define TARGET_GNU2_TLS (ix86_tls_dialect == TLS_DIALECT_GNU2)
-#define TARGET_ANY_GNU_TLS (TARGET_GNU_TLS || TARGET_GNU2_TLS)
-#define TARGET_SUN_TLS (ix86_tls_dialect == TLS_DIALECT_SUN)
+/* Feature tests against the various architecture variations.  */
+enum ix86_arch_indices {
+  X86_ARCH_CMOVE,		/* || TARGET_SSE */
+  X86_ARCH_CMPXCHG,
+  X86_ARCH_CMPXCHG8B,
+  X86_ARCH_XADD,
+  X86_ARCH_BSWAP,
 
-#define TARGET_CMPXCHG (x86_cmpxchg & ix86_arch_mask)
-#define TARGET_CMPXCHG8B (x86_cmpxchg8b & ix86_arch_mask)
-#define TARGET_CMPXCHG16B (x86_cmpxchg16b)
-#define TARGET_XADD (x86_xadd & ix86_arch_mask)
-#define TARGET_BSWAP (x86_bswap & ix86_arch_mask)
+  X86_ARCH_LAST
+};
+
+extern unsigned int ix86_arch_features[X86_ARCH_LAST];
+
+#define TARGET_CMOVE		ix86_arch_features[X86_ARCH_CMOVE]
+#define TARGET_CMPXCHG		ix86_arch_features[X86_ARCH_CMPXCHG]
+#define TARGET_CMPXCHG8B	ix86_arch_features[X86_ARCH_CMPXCHG8B]
+#define TARGET_XADD		ix86_arch_features[X86_ARCH_XADD]
+#define TARGET_BSWAP		ix86_arch_features[X86_ARCH_BSWAP]
+
+#define TARGET_FISTTP		(TARGET_SSE3 && TARGET_80387)
+
+extern int x86_prefetch_sse;
+#define TARGET_PREFETCH_SSE	x86_prefetch_sse
+
+extern int x86_cmpxchg16b;
+#define TARGET_CMPXCHG16B	x86_cmpxchg16b
+
+#define ASSEMBLER_DIALECT	(ix86_asm_dialect)
+
+#define TARGET_SSE_MATH		((ix86_fpmath & FPMATH_SSE) != 0)
+#define TARGET_MIX_SSE_I387 \
+	((ix86_fpmath & (FPMATH_SSE | FPMATH_387)) == (FPMATH_SSE | FPMATH_387))
+
+#define TARGET_GNU_TLS		(ix86_tls_dialect == TLS_DIALECT_GNU)
+#define TARGET_GNU2_TLS		(ix86_tls_dialect == TLS_DIALECT_GNU2)
+#define TARGET_ANY_GNU_TLS	(TARGET_GNU_TLS || TARGET_GNU2_TLS)
+#define TARGET_SUN_TLS		(ix86_tls_dialect == TLS_DIALECT_SUN)
 
 #ifndef TARGET_64BIT_DEFAULT
 #define TARGET_64BIT_DEFAULT 0
@@ -2132,10 +2187,7 @@ enum processor_type
 };
 
 extern enum processor_type ix86_tune;
-extern int ix86_tune_mask;
-
 extern enum processor_type ix86_arch;
-extern int ix86_arch_mask;
 
 enum fpmath_unit
 {
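One small semantic cleanup rides along with the macro reshuffle above:
TARGET_MIX_SSE_I387 changes from testing the FPMATH_SSE and FPMATH_387 bits
separately to comparing against the combined mask.  For two single-bit flags
the forms are equivalent, which a few lines of C can spot-check (the flag
values below are stand-ins assumed for illustration, not quoted from
i386.h):

    #include <assert.h>

    enum { FPMATH_387 = 1, FPMATH_SSE = 2 };  /* assumed single-bit flags */

    int
    main (void)
    {
      for (unsigned int fpmath = 0; fpmath < 4; ++fpmath)
        {
          int old_form = (fpmath & FPMATH_SSE) && (fpmath & FPMATH_387);
          int new_form = (fpmath & (FPMATH_SSE | FPMATH_387))
                         == (FPMATH_SSE | FPMATH_387);
          assert (old_form == new_form);  /* both require both bits set */
        }
      return 0;
    }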