i386.c (x86_use_leave, [...]): Merge into ...

* config/i386/i386.c (x86_use_leave, x86_push_memory,
	x86_zero_extend_with_and, x86_movx, x86_double_with_add,
	x86_use_bit_test, x86_unroll_strlen, x86_deep_branch,
	x86_branch_hints, x86_use_sahf, x86_partial_reg_stall,
	x86_partial_flag_reg_stall, x86_use_himode_fiop, x86_use_simode_fiop,
	x86_use_mov0, x86_use_cltd, x86_use_xchgb, x86_read_modify_write,
	x86_read_modify, x86_split_long_moves, x86_promote_QImode, x86_fast_prefix,
	x86_single_stringop, x86_qimode_math, x86_promote_qi_regs,
	x86_himode_math, x86_promote_hi_regs, x86_sub_esp_4, x86_sub_esp_8,
	x86_add_esp_4, x86_add_esp_8, x86_integer_DFmode_moves, 
	x86_partial_reg_dependency, x86_memory_mismatch_stall, 
	x86_prologue_using_move, x86_epilogue_using_move, x86_shift1,
	x86_sse_partial_reg_dependency, x86_sse_split_regs, 
	x86_sse_unaligned_move_optimal, x86_sse_typeless_stores,
	x86_sse_load0_by_pxor, x86_use_ffreep, x86_use_incdec,
	x86_inter_unit_moves, x86_ext_80387_constants, x86_four_jump_limit,
	x86_schedule, x86_use_bt, x86_pad_returns): Merge into ...
	(ix86_tune_features): ... here.  New array.
	(x86_cmove, x86_cmpxchg, x86_cmpxchg8b, x86_xadd, x86_bswap): Merge into ...
	(ix86_arch_features): ... here.  New array.
	(x86_3dnow_a): Remove.
	(x86_accumulate_outgoing_args): Make static.
	(x86_arch_always_fancy_math_387): Make static.
	(ix86_tune_mask, ix86_arch_mask): Move ...
	(override_options): ... to local variables here.  Apply the
	appropriate mask to each element of ix86_arch_features and
	ix86_tune_features.  Adjust TARGET_CMOVE and TARGET_USE_SAHF
	as was done in the old macros.
	(standard_80387_constant_p): Use TARGET_EXT_80387_CONSTANTS.
	* config/i386/i386.h (x86_use_leave, x86_push_memory,
	x86_zero_extend_with_and, x86_use_bit_test, x86_cmove, x86_deep_branch,
	x86_branch_hints, x86_unroll_strlen, x86_double_with_add,
	x86_partial_reg_stall, x86_movx, x86_use_himode_fiop,
	x86_use_simode_fiop, x86_use_mov0, x86_use_cltd, x86_use_xchgb,
	x86_read_modify_write, x86_read_modify, x86_split_long_moves,
	x86_promote_QImode, x86_single_stringop, x86_fast_prefix,
	x86_himode_math, x86_qimode_math, x86_promote_qi_regs,
	x86_promote_hi_regs, x86_integer_DFmode_moves, x86_add_esp_4,
	x86_add_esp_8, x86_sub_esp_4, x86_sub_esp_8,
	x86_partial_reg_dependency, x86_memory_mismatch_stall,
	x86_accumulate_outgoing_args, x86_prologue_using_move,
	x86_epilogue_using_move, x86_decompose_lea,
	x86_arch_always_fancy_math_387, x86_shift1,
	x86_sse_partial_reg_dependency, x86_sse_split_regs,
	x86_sse_unaligned_move_optimal, x86_sse_typeless_stores,	
	x86_sse_load0_by_pxor, x86_use_ffreep, x86_inter_unit_moves,
	x86_schedule, x86_use_bt, x86_cmpxchg, x86_cmpxchg8b, x86_xadd,
	x86_use_incdec, x86_pad_returns, x86_bswap,
	x86_partial_flag_reg_stall): Remove.
	(enum ix86_tune_indices): New.
	(ix86_tune_features): New.
	(TARGET_USE_LEAVE, TARGET_PUSH_MEMORY, TARGET_ZERO_EXTEND_WITH_AND,
	TARGET_USE_BIT_TEST, TARGET_UNROLL_STRLEN,
	TARGET_DEEP_BRANCH_PREDICTION, TARGET_BRANCH_PREDICTION_HINTS,
	TARGET_DOUBLE_WITH_ADD, TARGET_USE_SAHF, TARGET_MOVX,
	TARGET_PARTIAL_REG_STALL, TARGET_PARTIAL_FLAG_REG_STALL,
	TARGET_USE_HIMODE_FIOP, TARGET_USE_SIMODE_FIOP, TARGET_USE_MOV0,
	TARGET_USE_CLTD, TARGET_USE_XCHGB, TARGET_SPLIT_LONG_MOVES,
	TARGET_READ_MODIFY_WRITE, TARGET_READ_MODIFY, TARGET_PROMOTE_QImode,
	TARGET_FAST_PREFIX, TARGET_SINGLE_STRINGOP, TARGET_QIMODE_MATH,
	TARGET_HIMODE_MATH, TARGET_PROMOTE_QI_REGS, TARGET_PROMOTE_HI_REGS,
	TARGET_ADD_ESP_4, TARGET_ADD_ESP_8, TARGET_SUB_ESP_4,
	TARGET_SUB_ESP_8, TARGET_INTEGER_DFMODE_MOVES,
	TARGET_PARTIAL_REG_DEPENDENCY, TARGET_SSE_PARTIAL_REG_DEPENDENCY,
	TARGET_SSE_UNALIGNED_MOVE_OPTIMAL, TARGET_SSE_SPLIT_REGS,
	TARGET_SSE_TYPELESS_STORES, TARGET_SSE_LOAD0_BY_PXOR,
	TARGET_MEMORY_MISMATCH_STALL, TARGET_PROLOGUE_USING_MOVE,
	TARGET_EPILOGUE_USING_MOVE, TARGET_SHIFT1, TARGET_USE_FFREEP,
	TARGET_INTER_UNIT_MOVES, TARGET_FOUR_JUMP_LIMIT, TARGET_SCHEDULE,
	TARGET_USE_BT, TARGET_USE_INCDEC, TARGET_PAD_RETURNS,
	TARGET_EXT_80387_CONSTANTS): Use it.
	(enum ix86_arch_indices): New.
	(ix86_arch_features): New.
	(TARGET_CMOVE, TARGET_CMPXCHG, TARGET_CMPXCHG8B, TARGET_XADD,
	TARGET_BSWAP): Use it.
	(ix86_tune_mask, ix86_arch_mask): Remove.

From-SVN: r122621
Richard Henderson 2007-03-06 07:59:38 -08:00 committed by Richard Henderson
parent 14da607343
commit 80fd744fda
3 changed files with 510 additions and 283 deletions
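
In outline, the change replaces dozens of per-flag bitmask globals (tested as x86_use_leave & ix86_tune_mask at every use site) with two arrays, ix86_tune_features[] and ix86_arch_features[], that are masked once in override_options and then read directly by the TARGET_* macros. The following stand-alone sketch mirrors that scheme with a made-up two-entry feature list and a toy processor enum; it illustrates the pattern and is not the literal GCC code.

#include <stdio.h>

/* Toy processor list; the real PROCESSOR_* values live in i386.h.  */
enum processor_type { PROCESSOR_I386, PROCESSOR_I486, PROCESSOR_PENTIUM, PROCESSOR_max };

#define m_386  (1u << PROCESSOR_I386)
#define m_486  (1u << PROCESSOR_I486)
#define m_PENT (1u << PROCESSOR_PENTIUM)

/* Feature tests against the various tunings (two made-up entries).  */
enum ix86_tune_indices { X86_TUNE_USE_LEAVE, X86_TUNE_ZERO_EXTEND_WITH_AND, X86_TUNE_LAST };

unsigned int ix86_tune_features[X86_TUNE_LAST] = {
  /* X86_TUNE_USE_LEAVE */            m_386,
  /* X86_TUNE_ZERO_EXTEND_WITH_AND */ m_486 | m_PENT,
};

/* The TARGET_* macros become plain array reads.  */
#define TARGET_USE_LEAVE            ix86_tune_features[X86_TUNE_USE_LEAVE]
#define TARGET_ZERO_EXTEND_WITH_AND ix86_tune_features[X86_TUNE_ZERO_EXTEND_WITH_AND]

/* override_options applies the -mtune mask once, up front.  */
static void
apply_tune_mask (enum processor_type ix86_tune)
{
  unsigned int ix86_tune_mask = 1u << ix86_tune;
  int i;
  for (i = 0; i < X86_TUNE_LAST; ++i)
    ix86_tune_features[i] &= ix86_tune_mask;
}

int
main (void)
{
  apply_tune_mask (PROCESSOR_PENTIUM);
  /* Prints 0 for use-leave, nonzero for zero-extend-with-and.  */
  printf ("leave=%u zero_extend_with_and=%u\n",
          TARGET_USE_LEAVE, TARGET_ZERO_EXTEND_WITH_AND);
  return 0;
}

The ix86_arch_features side works the same way, keyed off -march via ix86_arch_mask instead of -mtune.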

gcc/ChangeLog (the new entry is quoted in full above)
gcc/config/i386/i386.c
@@ -1004,187 +1004,221 @@ const struct processor_costs *ix86_cost = &pentium_cost;
(PPro/PENT4/NOCONA/CORE2/Athlon/K8). */
#define m_GENERIC (m_GENERIC32 | m_GENERIC64)
/* Leave is not affecting Nocona SPEC2000 results negatively, so enabling for
Generic64 seems like good code size tradeoff. We can't enable it for 32bit
generic because it is not working well with PPro base chips. */
const int x86_use_leave = m_386 | m_K6_GEODE | m_ATHLON_K8_AMDFAM10 | m_CORE2
| m_GENERIC64;
const int x86_push_memory = m_386 | m_K6_GEODE | m_ATHLON_K8_AMDFAM10 | m_PENT4
| m_NOCONA | m_CORE2 | m_GENERIC;
const int x86_zero_extend_with_and = m_486 | m_PENT;
/* Enable to zero extend integer registers to avoid partial dependencies */
const int x86_movx = m_ATHLON_K8_AMDFAM10 | m_PPRO | m_PENT4 | m_NOCONA
| m_CORE2 | m_GENERIC | m_GEODE /* m_386 | m_K6 */;
const int x86_double_with_add = ~m_386;
const int x86_use_bit_test = m_386;
const int x86_unroll_strlen = m_486 | m_PENT | m_PPRO | m_ATHLON_K8_AMDFAM10
| m_K6 | m_CORE2 | m_GENERIC;
const int x86_cmove = m_PPRO | m_GEODE | m_ATHLON_K8_AMDFAM10 | m_PENT4
| m_NOCONA;
const int x86_3dnow_a = m_ATHLON_K8_AMDFAM10;
const int x86_deep_branch = m_PPRO | m_K6_GEODE | m_ATHLON_K8_AMDFAM10
| m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC;
/* Branch hints were put in P4 based on simulation result. But
after P4 was made, no performance benefit was observed with
branch hints. It also increases the code size. As the result,
icc never generates branch hints. */
const int x86_branch_hints = 0;
const int x86_use_sahf = m_PPRO | m_K6_GEODE | m_PENT4 | m_NOCONA | m_GENERIC32;
/*m_GENERIC | m_ATHLON_K8 ? */
/* We probably ought to watch for partial register stalls on Generic32
compilation setting as well. However in current implementation the
partial register stalls are not eliminated very well - they can
be introduced via subregs synthesized by combine and can happen
in caller/callee saving sequences.
Because this option pays back little on PPro based chips and is in conflict
with partial reg. dependencies used by Athlon/P4 based chips, it is better
to leave it off for generic32 for now. */
const int x86_partial_reg_stall = m_PPRO;
const int x86_partial_flag_reg_stall = m_CORE2 | m_GENERIC;
const int x86_use_himode_fiop = m_386 | m_486 | m_K6_GEODE;
const int x86_use_simode_fiop = ~(m_PPRO | m_ATHLON_K8_AMDFAM10 | m_PENT
| m_CORE2 | m_GENERIC);
const int x86_use_mov0 = m_K6;
const int x86_use_cltd = ~(m_PENT | m_K6 | m_CORE2 | m_GENERIC);
/* Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
const int x86_use_xchgb = m_PENT4;
const int x86_read_modify_write = ~m_PENT;
const int x86_read_modify = ~(m_PENT | m_PPRO);
const int x86_split_long_moves = m_PPRO;
const int x86_promote_QImode = m_K6_GEODE | m_PENT | m_386 | m_486
| m_ATHLON_K8_AMDFAM10 | m_CORE2 | m_GENERIC;
/* m_PENT4 ? */
const int x86_fast_prefix = ~(m_PENT | m_486 | m_386);
const int x86_single_stringop = m_386 | m_PENT4 | m_NOCONA;
const int x86_qimode_math = ~(0);
const int x86_promote_qi_regs = 0;
/* On PPro this flag is meant to avoid partial register stalls. Just like
the x86_partial_reg_stall this option might be considered for Generic32
if our scheme for avoiding partial stalls was more effective. */
const int x86_himode_math = ~(m_PPRO);
const int x86_promote_hi_regs = m_PPRO;
/* Enable if add/sub rsp is preferred over 1 or 2 push/pop */
const int x86_sub_esp_4 = m_ATHLON_K8_AMDFAM10 | m_PPRO | m_PENT4 | m_NOCONA
| m_CORE2 | m_GENERIC;
const int x86_sub_esp_8 = m_ATHLON_K8_AMDFAM10 | m_PPRO | m_386 | m_486
| m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC;
const int x86_add_esp_4 = m_ATHLON_K8_AMDFAM10 | m_K6_GEODE | m_PENT4 | m_NOCONA
| m_CORE2 | m_GENERIC;
const int x86_add_esp_8 = m_ATHLON_K8_AMDFAM10 | m_PPRO | m_K6_GEODE | m_386
| m_486 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC;
/* Enable if integer moves are preferred for DFmode copies */
const int x86_integer_DFmode_moves = ~(m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA
| m_PPRO | m_CORE2 | m_GENERIC | m_GEODE);
const int x86_partial_reg_dependency = m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA
| m_CORE2 | m_GENERIC;
const int x86_memory_mismatch_stall = m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA
| m_CORE2 | m_GENERIC;
/* If ACCUMULATE_OUTGOING_ARGS is enabled, the maximum amount of space required
for outgoing arguments will be computed and placed into the variable
`current_function_outgoing_args_size'. No space will be pushed onto the stack
for each call; instead, the function prologue should increase the stack frame
size by this amount. Setting both PUSH_ARGS and ACCUMULATE_OUTGOING_ARGS is
not proper. */
const int x86_accumulate_outgoing_args = m_ATHLON_K8_AMDFAM10 | m_PENT4
| m_NOCONA | m_PPRO | m_CORE2
| m_GENERIC;
const int x86_prologue_using_move = m_ATHLON_K8 | m_PPRO | m_CORE2 | m_GENERIC;
const int x86_epilogue_using_move = m_ATHLON_K8 | m_PPRO | m_CORE2 | m_GENERIC;
const int x86_shift1 = ~m_486;
const int x86_arch_always_fancy_math_387 = m_PENT | m_PPRO
| m_ATHLON_K8_AMDFAM10 | m_PENT4
| m_NOCONA | m_CORE2 | m_GENERIC;
/* In Generic model we have an conflict here in between PPro/Pentium4 based chips
that thread 128bit SSE registers as single units versus K8 based chips that
divide SSE registers to two 64bit halves.
x86_sse_partial_reg_dependency promote all store destinations to be 128bit
to allow register renaming on 128bit SSE units, but usually results in one
extra microop on 64bit SSE units. Experimental results shows that disabling
this option on P4 brings over 20% SPECfp regression, while enabling it on
K8 brings roughly 2.4% regression that can be partly masked by careful scheduling
of moves. */
const int x86_sse_partial_reg_dependency = m_PENT4 | m_NOCONA | m_PPRO | m_CORE2
| m_GENERIC | m_AMDFAM10;
/* Set for machines where the type and dependencies are resolved on SSE
register parts instead of whole registers, so we may maintain just
lower part of scalar values in proper format leaving the upper part
undefined. */
const int x86_sse_split_regs = m_ATHLON_K8;
/* Code generation for scalar reg-reg moves of single and double precision data:
if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
movaps reg, reg
else
movss reg, reg
if (x86_sse_partial_reg_dependency == true)
movapd reg, reg
else
movsd reg, reg
Code generation for scalar loads of double precision data:
if (x86_sse_split_regs == true)
movlpd mem, reg (gas syntax)
else
movsd mem, reg
Code generation for unaligned packed loads of single precision data
(x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
if (x86_sse_unaligned_move_optimal)
movups mem, reg
if (x86_sse_partial_reg_dependency == true)
{
xorps reg, reg
movlps mem, reg
movhps mem+8, reg
}
else
{
movlps mem, reg
movhps mem+8, reg
}
Code generation for unaligned packed loads of double precision data
(x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
if (x86_sse_unaligned_move_optimal)
movupd mem, reg
if (x86_sse_split_regs == true)
{
movlpd mem, reg
movhpd mem+8, reg
}
else
{
movsd mem, reg
movhpd mem+8, reg
}
*/
const int x86_sse_unaligned_move_optimal = m_AMDFAM10;
const int x86_sse_typeless_stores = m_ATHLON_K8_AMDFAM10;
const int x86_sse_load0_by_pxor = m_PPRO | m_PENT4 | m_NOCONA;
const int x86_use_ffreep = m_ATHLON_K8_AMDFAM10;
const int x86_use_incdec = ~(m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC);
const int x86_inter_unit_moves = ~(m_ATHLON_K8_AMDFAM10 | m_GENERIC);
const int x86_ext_80387_constants = m_K6_GEODE | m_ATHLON_K8 | m_PENT4
| m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC;
/* Some CPU cores are not able to predict more than 4 branch instructions in
the 16 byte window. */
const int x86_four_jump_limit = m_PPRO | m_ATHLON_K8_AMDFAM10 | m_PENT4
| m_NOCONA | m_CORE2 | m_GENERIC;
const int x86_schedule = m_PPRO | m_ATHLON_K8_AMDFAM10 | m_K6_GEODE | m_PENT
| m_CORE2 | m_GENERIC;
const int x86_use_bt = m_ATHLON_K8_AMDFAM10;
/* Compare and exchange was added for 80486. */
const int x86_cmpxchg = ~m_386;
/* Compare and exchange 8 bytes was added for pentium. */
const int x86_cmpxchg8b = ~(m_386 | m_486);
/* Exchange and add was added for 80486. */
const int x86_xadd = ~m_386;
/* Byteswap was added for 80486. */
const int x86_bswap = ~m_386;
const int x86_pad_returns = m_ATHLON_K8_AMDFAM10 | m_CORE2 | m_GENERIC;
/* Feature tests against the various tunings. */
unsigned int ix86_tune_features[X86_TUNE_LAST] = {
/* X86_TUNE_USE_LEAVE: Leave does not affect Nocona SPEC2000 results
negatively, so enabling for Generic64 seems like good code size
tradeoff. We can't enable it for 32bit generic because it does not
work well with PPro base chips. */
m_386 | m_K6_GEODE | m_ATHLON_K8_AMDFAM10 | m_CORE2 | m_GENERIC64,
/* X86_TUNE_PUSH_MEMORY */
m_386 | m_K6_GEODE | m_ATHLON_K8_AMDFAM10 | m_PENT4
| m_NOCONA | m_CORE2 | m_GENERIC,
/* X86_TUNE_ZERO_EXTEND_WITH_AND */
m_486 | m_PENT,
/* X86_TUNE_USE_BIT_TEST */
m_386,
/* X86_TUNE_UNROLL_STRLEN */
m_486 | m_PENT | m_PPRO | m_ATHLON_K8_AMDFAM10 | m_K6 | m_CORE2 | m_GENERIC,
/* X86_TUNE_DEEP_BRANCH_PREDICTION */
m_PPRO | m_K6_GEODE | m_ATHLON_K8_AMDFAM10 | m_PENT4
| m_NOCONA | m_CORE2 | m_GENERIC,
/* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based
on simulation result. But after P4 was made, no performance benefit
was observed with branch hints. It also increases the code size.
As a result, icc never generates branch hints. */
0,
/* X86_TUNE_DOUBLE_WITH_ADD */
~m_386,
/* X86_TUNE_USE_SAHF */
m_PPRO | m_K6_GEODE | m_PENT4 | m_NOCONA | m_GENERIC32,
/* | m_GENERIC | m_ATHLON_K8 ? */
/* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid
partial dependencies */
m_ATHLON_K8_AMDFAM10 | m_PPRO | m_PENT4 | m_NOCONA
| m_CORE2 | m_GENERIC | m_GEODE /* m_386 | m_K6 */,
/* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial
register stalls on Generic32 compilation setting as well. However
in current implementation the partial register stalls are not eliminated
very well - they can be introduced via subregs synthesized by combine
and can happen in caller/callee saving sequences. Because this option
pays back little on PPro based chips and is in conflict with partial reg
dependencies used by Athlon/P4 based chips, it is better to leave it off
for generic32 for now. */
m_PPRO,
/* X86_TUNE_PARTIAL_FLAG_REG_STALL */
m_CORE2 | m_GENERIC,
/* X86_TUNE_USE_HIMODE_FIOP */
m_386 | m_486 | m_K6_GEODE,
/* X86_TUNE_USE_SIMODE_FIOP */
~(m_PPRO | m_ATHLON_K8_AMDFAM10 | m_PENT | m_CORE2 | m_GENERIC),
/* X86_TUNE_USE_MOV0 */
m_K6,
/* X86_TUNE_USE_CLTD */
~(m_PENT | m_K6 | m_CORE2 | m_GENERIC),
/* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */
m_PENT4,
/* X86_TUNE_SPLIT_LONG_MOVES */
m_PPRO,
/* X86_TUNE_READ_MODIFY_WRITE */
~m_PENT,
/* X86_TUNE_READ_MODIFY */
~(m_PENT | m_PPRO),
/* X86_TUNE_PROMOTE_QIMODE */
m_K6_GEODE | m_PENT | m_386 | m_486 | m_ATHLON_K8_AMDFAM10 | m_CORE2
| m_GENERIC /* | m_PENT4 ? */,
/* X86_TUNE_FAST_PREFIX */
~(m_PENT | m_486 | m_386),
/* X86_TUNE_SINGLE_STRINGOP */
m_386 | m_PENT4 | m_NOCONA,
/* X86_TUNE_QIMODE_MATH */
~0,
/* X86_TUNE_HIMODE_MATH: On PPro this flag is meant to avoid partial
register stalls. Just like X86_TUNE_PARTIAL_REG_STALL this option
might be considered for Generic32 if our scheme for avoiding partial
stalls was more effective. */
~m_PPRO,
/* X86_TUNE_PROMOTE_QI_REGS */
0,
/* X86_TUNE_PROMOTE_HI_REGS */
m_PPRO,
/* X86_TUNE_ADD_ESP_4: Enable if add/sub is preferred over 1/2 push/pop. */
m_ATHLON_K8_AMDFAM10 | m_K6_GEODE | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
/* X86_TUNE_ADD_ESP_8 */
m_ATHLON_K8_AMDFAM10 | m_PPRO | m_K6_GEODE | m_386
| m_486 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
/* X86_TUNE_SUB_ESP_4 */
m_ATHLON_K8_AMDFAM10 | m_PPRO | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
/* X86_TUNE_SUB_ESP_8 */
m_ATHLON_K8_AMDFAM10 | m_PPRO | m_386 | m_486
| m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
/* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred
for DFmode copies */
~(m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2
| m_GENERIC | m_GEODE),
/* X86_TUNE_PARTIAL_REG_DEPENDENCY */
m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
/* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a
conflict here in between PPro/Pentium4 based chips that thread 128bit
SSE registers as single units versus K8 based chips that divide SSE
registers to two 64bit halves. This knob promotes all store destinations
to be 128bit to allow register renaming on 128bit SSE units, but usually
results in one extra microop on 64bit SSE units. Experimental results
shows that disabling this option on P4 brings over 20% SPECfp regression,
while enabling it on K8 brings roughly 2.4% regression that can be partly
masked by careful scheduling of moves. */
m_PENT4 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC | m_AMDFAM10,
/* X86_TUNE_SSE_UNALIGNED_MOVE_OPTIMAL */
m_AMDFAM10,
/* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies
are resolved on SSE register parts instead of whole registers, so we may
maintain just lower part of scalar values in proper format leaving the
upper part undefined. */
m_ATHLON_K8,
/* X86_TUNE_SSE_TYPELESS_STORES */
m_ATHLON_K8_AMDFAM10,
/* X86_TUNE_SSE_LOAD0_BY_PXOR */
m_PPRO | m_PENT4 | m_NOCONA,
/* X86_TUNE_MEMORY_MISMATCH_STALL */
m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
/* X86_TUNE_PROLOGUE_USING_MOVE */
m_ATHLON_K8 | m_PPRO | m_CORE2 | m_GENERIC,
/* X86_TUNE_EPILOGUE_USING_MOVE */
m_ATHLON_K8 | m_PPRO | m_CORE2 | m_GENERIC,
/* X86_TUNE_SHIFT1 */
~m_486,
/* X86_TUNE_USE_FFREEP */
m_ATHLON_K8_AMDFAM10,
/* X86_TUNE_INTER_UNIT_MOVES */
~(m_ATHLON_K8_AMDFAM10 | m_GENERIC),
/* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more
than 4 branch instructions in the 16 byte window. */
m_PPRO | m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC,
/* X86_TUNE_SCHEDULE */
m_PPRO | m_ATHLON_K8_AMDFAM10 | m_K6_GEODE | m_PENT | m_CORE2 | m_GENERIC,
/* X86_TUNE_USE_BT */
m_ATHLON_K8_AMDFAM10,
/* X86_TUNE_USE_INCDEC */
~(m_PENT4 | m_NOCONA | m_CORE2 | m_GENERIC),
/* X86_TUNE_PAD_RETURNS */
m_ATHLON_K8_AMDFAM10 | m_CORE2 | m_GENERIC,
/* X86_TUNE_EXT_80387_CONSTANTS */
m_K6_GEODE | m_ATHLON_K8 | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC
};
/* Feature tests against the various architecture variations. */
unsigned int ix86_arch_features[X86_ARCH_LAST] = {
/* X86_ARCH_CMOVE */
m_PPRO | m_GEODE | m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA,
/* X86_ARCH_CMPXCHG: Compare and exchange was added for 80486. */
~m_386,
/* X86_ARCH_CMPXCHG8B: Compare and exchange 8 bytes was added for pentium. */
~(m_386 | m_486),
/* X86_ARCH_XADD: Exchange and add was added for 80486. */
~m_386,
/* X86_ARCH_BSWAP: Byteswap was added for 80486. */
~m_386,
};
static const unsigned int x86_accumulate_outgoing_args
= m_ATHLON_K8_AMDFAM10 | m_PENT4 | m_NOCONA | m_PPRO | m_CORE2 | m_GENERIC;
static const unsigned int x86_arch_always_fancy_math_387
= m_PENT | m_PPRO | m_ATHLON_K8_AMDFAM10 | m_PENT4
| m_NOCONA | m_CORE2 | m_GENERIC;
static enum stringop_alg stringop_alg = no_stringop;
@@ -1397,11 +1431,9 @@ enum fpmath_unit ix86_fpmath;
/* Which cpu are we scheduling for. */
enum processor_type ix86_tune;
int ix86_tune_mask;
/* Which instruction set architecture to use. */
enum processor_type ix86_arch;
int ix86_arch_mask;
/* true if sse prefetch instruction is not NOOP. */
int x86_prefetch_sse;
@@ -1811,6 +1843,7 @@ override_options (void)
{
int i;
int ix86_tune_defaulted = 0;
unsigned int ix86_arch_mask, ix86_tune_mask;
/* Comes from final.c -- no real reason to change it. */
#define MAX_CODE_ALIGN 16
@@ -2124,6 +2157,10 @@ override_options (void)
if (i == pta_size)
error ("bad value (%s) for -march= switch", ix86_arch_string);
ix86_arch_mask = 1u << ix86_arch;
for (i = 0; i < X86_ARCH_LAST; ++i)
ix86_arch_features[i] &= ix86_arch_mask;
for (i = 0; i < pta_size; i++)
if (! strcmp (ix86_tune_string, processor_alias_table[i].name))
{
@@ -2155,8 +2192,9 @@ override_options (void)
if (i == pta_size)
error ("bad value (%s) for -mtune= switch", ix86_tune_string);
ix86_arch_mask = 1 << ix86_arch;
ix86_tune_mask = 1 << ix86_tune;
ix86_tune_mask = 1u << ix86_tune;
for (i = 0; i < X86_TUNE_LAST; ++i)
ix86_tune_features[i] &= ix86_tune_mask;
if (optimize_size)
ix86_cost = &size_cost;
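
The two masking loops above are where the old per-use masking moves: because every m_* constant is a single processor bit (the initializers simply OR them together, e.g. m_PPRO | m_PENT4), ANDing each element once with 1u << ix86_tune leaves it either zero or the selected CPU's bit, so the old test and the new pre-masked element agree. A minimal stand-alone check of that equivalence, using a hypothetical x86_foo flag:

#include <assert.h>

enum { PROCESSOR_PENTIUM4 = 3 };              /* placeholder value for illustration */
#define m_PENT4 (1u << PROCESSOR_PENTIUM4)

int
main (void)
{
  unsigned int x86_foo = m_PENT4 | 1u;        /* hypothetical old-style flag */
  unsigned int feature = m_PENT4 | 1u;        /* same initializer in the new array */
  unsigned int ix86_tune_mask = 1u << PROCESSOR_PENTIUM4;

  feature &= ix86_tune_mask;                  /* done once in override_options */

  /* Old per-use test vs. new pre-masked element: same truth value.  */
  assert (((x86_foo & ix86_tune_mask) != 0) == (feature != 0));
  return 0;
}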
@@ -2366,7 +2404,6 @@ override_options (void)
error ("-msseregparm used without SSE enabled");
ix86_fpmath = TARGET_FPMATH_DEFAULT;
if (ix86_fpmath_string != 0)
{
if (! strcmp (ix86_fpmath_string, "387"))
@@ -2425,6 +2462,15 @@ override_options (void)
target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS;
}
/* For sane SSE instruction set generation we need fcomi instruction.
It is safe to enable all CMOVE instructions. */
if (TARGET_SSE)
TARGET_CMOVE = 1;
/* ??? Any idea why this is unconditionally disabled for 64-bit? */
if (TARGET_64BIT)
TARGET_USE_SAHF = 0;
/* Figure out what ASM_GENERATE_INTERNAL_LABEL builds as a prefix. */
{
char *p;
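
The two assignments above replace conditions that used to live inside the removed macros: the old i386.h defined TARGET_CMOVE as ((x86_cmove & ix86_arch_mask) || TARGET_SSE) and TARGET_USE_SAHF as ((x86_use_sahf & ix86_tune_mask) && !TARGET_64BIT). An array element cannot carry those extra terms, but since the new macros expand to lvalues, override_options can fold them in once. A small stand-alone sketch of that idea (TARGET_SSE here is just a local stand-in for the real target flag):

#include <stdio.h>

enum ix86_arch_indices { X86_ARCH_CMOVE, X86_ARCH_LAST };
unsigned int ix86_arch_features[X86_ARCH_LAST];   /* already masked by -march */
#define TARGET_CMOVE ix86_arch_features[X86_ARCH_CMOVE]

int
main (void)
{
  int TARGET_SSE = 1;          /* stand-in: pretend -msse is enabled */

  /* The macro names an array element, i.e. an assignable lvalue, so the
     old "|| TARGET_SSE" term can be applied once instead of at every use.  */
  if (TARGET_SSE)
    TARGET_CMOVE = 1;

  printf ("cmove=%u\n", TARGET_CMOVE);
  return 0;
}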
@@ -4999,7 +5045,7 @@ standard_80387_constant_p (rtx x)
/* For XFmode constants, try to find a special 80387 instruction when
optimizing for size or on those CPUs that benefit from them. */
if (GET_MODE (x) == XFmode
&& (optimize_size || x86_ext_80387_constants & ix86_tune_mask))
&& (optimize_size || TARGET_EXT_80387_CONSTANTS))
{
int i;
@@ -9499,6 +9545,55 @@ ix86_expand_vector_move (enum machine_mode mode, rtx operands[])
/* Implement the movmisalign patterns for SSE. Non-SSE modes go
straight to ix86_expand_vector_move. */
/* Code generation for scalar reg-reg moves of single and double precision data:
if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
movaps reg, reg
else
movss reg, reg
if (x86_sse_partial_reg_dependency == true)
movapd reg, reg
else
movsd reg, reg
Code generation for scalar loads of double precision data:
if (x86_sse_split_regs == true)
movlpd mem, reg (gas syntax)
else
movsd mem, reg
Code generation for unaligned packed loads of single precision data
(x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
if (x86_sse_unaligned_move_optimal)
movups mem, reg
if (x86_sse_partial_reg_dependency == true)
{
xorps reg, reg
movlps mem, reg
movhps mem+8, reg
}
else
{
movlps mem, reg
movhps mem+8, reg
}
Code generation for unaligned packed loads of double precision data
(x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
if (x86_sse_unaligned_move_optimal)
movupd mem, reg
if (x86_sse_split_regs == true)
{
movlpd mem, reg
movhpd mem+8, reg
}
else
{
movsd mem, reg
movhpd mem+8, reg
}
*/
void
ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[])

gcc/config/i386/i386.h
@@ -179,110 +179,165 @@ extern const struct processor_costs *ix86_cost;
#define TARGET_GENERIC (TARGET_GENERIC32 || TARGET_GENERIC64)
#define TARGET_AMDFAM10 (ix86_tune == PROCESSOR_AMDFAM10)
extern const int x86_use_leave, x86_push_memory, x86_zero_extend_with_and;
extern const int x86_use_bit_test, x86_cmove, x86_deep_branch;
extern const int x86_branch_hints, x86_unroll_strlen;
extern const int x86_double_with_add, x86_partial_reg_stall, x86_movx;
extern const int x86_use_himode_fiop, x86_use_simode_fiop;
extern const int x86_use_mov0, x86_use_cltd, x86_use_xchgb;
extern const int x86_read_modify_write, x86_read_modify, x86_split_long_moves;
extern const int x86_promote_QImode, x86_single_stringop, x86_fast_prefix;
extern const int x86_himode_math, x86_qimode_math, x86_promote_qi_regs;
extern const int x86_promote_hi_regs, x86_integer_DFmode_moves;
extern const int x86_add_esp_4, x86_add_esp_8, x86_sub_esp_4, x86_sub_esp_8;
extern const int x86_partial_reg_dependency, x86_memory_mismatch_stall;
extern const int x86_accumulate_outgoing_args, x86_prologue_using_move;
extern const int x86_epilogue_using_move, x86_decompose_lea;
extern const int x86_arch_always_fancy_math_387, x86_shift1;
extern const int x86_sse_partial_reg_dependency, x86_sse_split_regs;
extern const int x86_sse_unaligned_move_optimal;
extern const int x86_sse_typeless_stores, x86_sse_load0_by_pxor;
extern const int x86_use_ffreep;
extern const int x86_inter_unit_moves, x86_schedule;
extern const int x86_use_bt;
extern const int x86_cmpxchg, x86_cmpxchg8b, x86_xadd;
extern const int x86_use_incdec;
extern const int x86_pad_returns;
extern const int x86_bswap;
extern const int x86_partial_flag_reg_stall;
extern int x86_prefetch_sse, x86_cmpxchg16b;
#define TARGET_USE_LEAVE (x86_use_leave & ix86_tune_mask)
#define TARGET_PUSH_MEMORY (x86_push_memory & ix86_tune_mask)
#define TARGET_ZERO_EXTEND_WITH_AND (x86_zero_extend_with_and & ix86_tune_mask)
#define TARGET_USE_BIT_TEST (x86_use_bit_test & ix86_tune_mask)
#define TARGET_UNROLL_STRLEN (x86_unroll_strlen & ix86_tune_mask)
/* For sane SSE instruction set generation we need fcomi instruction. It is
safe to enable all CMOVE instructions. */
#define TARGET_CMOVE ((x86_cmove & ix86_arch_mask) || TARGET_SSE)
#define TARGET_FISTTP (TARGET_SSE3 && TARGET_80387)
#define TARGET_DEEP_BRANCH_PREDICTION (x86_deep_branch & ix86_tune_mask)
#define TARGET_BRANCH_PREDICTION_HINTS (x86_branch_hints & ix86_tune_mask)
#define TARGET_DOUBLE_WITH_ADD (x86_double_with_add & ix86_tune_mask)
#define TARGET_USE_SAHF ((x86_use_sahf & ix86_tune_mask) && !TARGET_64BIT)
#define TARGET_MOVX (x86_movx & ix86_tune_mask)
#define TARGET_PARTIAL_REG_STALL (x86_partial_reg_stall & ix86_tune_mask)
#define TARGET_PARTIAL_FLAG_REG_STALL \
(x86_partial_flag_reg_stall & ix86_tune_mask)
#define TARGET_USE_HIMODE_FIOP (x86_use_himode_fiop & ix86_tune_mask)
#define TARGET_USE_SIMODE_FIOP (x86_use_simode_fiop & ix86_tune_mask)
#define TARGET_USE_MOV0 (x86_use_mov0 & ix86_tune_mask)
#define TARGET_USE_CLTD (x86_use_cltd & ix86_tune_mask)
#define TARGET_USE_XCHGB (x86_use_xchgb & ix86_tune_mask)
#define TARGET_SPLIT_LONG_MOVES (x86_split_long_moves & ix86_tune_mask)
#define TARGET_READ_MODIFY_WRITE (x86_read_modify_write & ix86_tune_mask)
#define TARGET_READ_MODIFY (x86_read_modify & ix86_tune_mask)
#define TARGET_PROMOTE_QImode (x86_promote_QImode & ix86_tune_mask)
#define TARGET_FAST_PREFIX (x86_fast_prefix & ix86_tune_mask)
#define TARGET_SINGLE_STRINGOP (x86_single_stringop & ix86_tune_mask)
#define TARGET_QIMODE_MATH (x86_qimode_math & ix86_tune_mask)
#define TARGET_HIMODE_MATH (x86_himode_math & ix86_tune_mask)
#define TARGET_PROMOTE_QI_REGS (x86_promote_qi_regs & ix86_tune_mask)
#define TARGET_PROMOTE_HI_REGS (x86_promote_hi_regs & ix86_tune_mask)
#define TARGET_ADD_ESP_4 (x86_add_esp_4 & ix86_tune_mask)
#define TARGET_ADD_ESP_8 (x86_add_esp_8 & ix86_tune_mask)
#define TARGET_SUB_ESP_4 (x86_sub_esp_4 & ix86_tune_mask)
#define TARGET_SUB_ESP_8 (x86_sub_esp_8 & ix86_tune_mask)
#define TARGET_INTEGER_DFMODE_MOVES (x86_integer_DFmode_moves & ix86_tune_mask)
#define TARGET_PARTIAL_REG_DEPENDENCY \
(x86_partial_reg_dependency & ix86_tune_mask)
#define TARGET_SSE_PARTIAL_REG_DEPENDENCY \
(x86_sse_partial_reg_dependency & ix86_tune_mask)
#define TARGET_SSE_UNALIGNED_MOVE_OPTIMAL \
(x86_sse_unaligned_move_optimal & ix86_tune_mask)
#define TARGET_SSE_SPLIT_REGS (x86_sse_split_regs & ix86_tune_mask)
#define TARGET_SSE_TYPELESS_STORES (x86_sse_typeless_stores & ix86_tune_mask)
#define TARGET_SSE_LOAD0_BY_PXOR (x86_sse_load0_by_pxor & ix86_tune_mask)
#define TARGET_MEMORY_MISMATCH_STALL \
(x86_memory_mismatch_stall & ix86_tune_mask)
#define TARGET_PROLOGUE_USING_MOVE (x86_prologue_using_move & ix86_tune_mask)
#define TARGET_EPILOGUE_USING_MOVE (x86_epilogue_using_move & ix86_tune_mask)
#define TARGET_PREFETCH_SSE (x86_prefetch_sse)
#define TARGET_SHIFT1 (x86_shift1 & ix86_tune_mask)
#define TARGET_USE_FFREEP (x86_use_ffreep & ix86_tune_mask)
#define TARGET_INTER_UNIT_MOVES (x86_inter_unit_moves & ix86_tune_mask)
#define TARGET_FOUR_JUMP_LIMIT (x86_four_jump_limit & ix86_tune_mask)
#define TARGET_SCHEDULE (x86_schedule & ix86_tune_mask)
#define TARGET_USE_BT (x86_use_bt & ix86_tune_mask)
#define TARGET_USE_INCDEC (x86_use_incdec & ix86_tune_mask)
#define TARGET_PAD_RETURNS (x86_pad_returns & ix86_tune_mask)
#define ASSEMBLER_DIALECT (ix86_asm_dialect)
#define TARGET_SSE_MATH ((ix86_fpmath & FPMATH_SSE) != 0)
#define TARGET_MIX_SSE_I387 ((ix86_fpmath & FPMATH_SSE) \
&& (ix86_fpmath & FPMATH_387))
#define TARGET_GNU_TLS (ix86_tls_dialect == TLS_DIALECT_GNU)
#define TARGET_GNU2_TLS (ix86_tls_dialect == TLS_DIALECT_GNU2)
#define TARGET_ANY_GNU_TLS (TARGET_GNU_TLS || TARGET_GNU2_TLS)
#define TARGET_SUN_TLS (ix86_tls_dialect == TLS_DIALECT_SUN)
#define TARGET_CMPXCHG (x86_cmpxchg & ix86_arch_mask)
#define TARGET_CMPXCHG8B (x86_cmpxchg8b & ix86_arch_mask)
#define TARGET_CMPXCHG16B (x86_cmpxchg16b)
#define TARGET_XADD (x86_xadd & ix86_arch_mask)
#define TARGET_BSWAP (x86_bswap & ix86_arch_mask)
/* Feature tests against the various tunings. */
enum ix86_tune_indices {
X86_TUNE_USE_LEAVE,
X86_TUNE_PUSH_MEMORY,
X86_TUNE_ZERO_EXTEND_WITH_AND,
X86_TUNE_USE_BIT_TEST,
X86_TUNE_UNROLL_STRLEN,
X86_TUNE_DEEP_BRANCH_PREDICTION,
X86_TUNE_BRANCH_PREDICTION_HINTS,
X86_TUNE_DOUBLE_WITH_ADD,
X86_TUNE_USE_SAHF, /* && !TARGET_64BIT */
X86_TUNE_MOVX,
X86_TUNE_PARTIAL_REG_STALL,
X86_TUNE_PARTIAL_FLAG_REG_STALL,
X86_TUNE_USE_HIMODE_FIOP,
X86_TUNE_USE_SIMODE_FIOP,
X86_TUNE_USE_MOV0,
X86_TUNE_USE_CLTD,
X86_TUNE_USE_XCHGB,
X86_TUNE_SPLIT_LONG_MOVES,
X86_TUNE_READ_MODIFY_WRITE,
X86_TUNE_READ_MODIFY,
X86_TUNE_PROMOTE_QIMODE,
X86_TUNE_FAST_PREFIX,
X86_TUNE_SINGLE_STRINGOP,
X86_TUNE_QIMODE_MATH,
X86_TUNE_HIMODE_MATH,
X86_TUNE_PROMOTE_QI_REGS,
X86_TUNE_PROMOTE_HI_REGS,
X86_TUNE_ADD_ESP_4,
X86_TUNE_ADD_ESP_8,
X86_TUNE_SUB_ESP_4,
X86_TUNE_SUB_ESP_8,
X86_TUNE_INTEGER_DFMODE_MOVES,
X86_TUNE_PARTIAL_REG_DEPENDENCY,
X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY,
X86_TUNE_SSE_UNALIGNED_MOVE_OPTIMAL,
X86_TUNE_SSE_SPLIT_REGS,
X86_TUNE_SSE_TYPELESS_STORES,
X86_TUNE_SSE_LOAD0_BY_PXOR,
X86_TUNE_MEMORY_MISMATCH_STALL,
X86_TUNE_PROLOGUE_USING_MOVE,
X86_TUNE_EPILOGUE_USING_MOVE,
X86_TUNE_SHIFT1,
X86_TUNE_USE_FFREEP,
X86_TUNE_INTER_UNIT_MOVES,
X86_TUNE_FOUR_JUMP_LIMIT,
X86_TUNE_SCHEDULE,
X86_TUNE_USE_BT,
X86_TUNE_USE_INCDEC,
X86_TUNE_PAD_RETURNS,
X86_TUNE_EXT_80387_CONSTANTS,
X86_TUNE_LAST
};
extern unsigned int ix86_tune_features[X86_TUNE_LAST];
#define TARGET_USE_LEAVE ix86_tune_features[X86_TUNE_USE_LEAVE]
#define TARGET_PUSH_MEMORY ix86_tune_features[X86_TUNE_PUSH_MEMORY]
#define TARGET_ZERO_EXTEND_WITH_AND \
ix86_tune_features[X86_TUNE_ZERO_EXTEND_WITH_AND]
#define TARGET_USE_BIT_TEST ix86_tune_features[X86_TUNE_USE_BIT_TEST]
#define TARGET_UNROLL_STRLEN ix86_tune_features[X86_TUNE_UNROLL_STRLEN]
#define TARGET_DEEP_BRANCH_PREDICTION \
ix86_tune_features[X86_TUNE_DEEP_BRANCH_PREDICTION]
#define TARGET_BRANCH_PREDICTION_HINTS \
ix86_tune_features[X86_TUNE_BRANCH_PREDICTION_HINTS]
#define TARGET_DOUBLE_WITH_ADD ix86_tune_features[X86_TUNE_DOUBLE_WITH_ADD]
#define TARGET_USE_SAHF ix86_tune_features[X86_TUNE_USE_SAHF]
#define TARGET_MOVX ix86_tune_features[X86_TUNE_MOVX]
#define TARGET_PARTIAL_REG_STALL ix86_tune_features[X86_TUNE_PARTIAL_REG_STALL]
#define TARGET_PARTIAL_FLAG_REG_STALL \
ix86_tune_features[X86_TUNE_PARTIAL_FLAG_REG_STALL]
#define TARGET_USE_HIMODE_FIOP ix86_tune_features[X86_TUNE_USE_HIMODE_FIOP]
#define TARGET_USE_SIMODE_FIOP ix86_tune_features[X86_TUNE_USE_SIMODE_FIOP]
#define TARGET_USE_MOV0 ix86_tune_features[X86_TUNE_USE_MOV0]
#define TARGET_USE_CLTD ix86_tune_features[X86_TUNE_USE_CLTD]
#define TARGET_USE_XCHGB ix86_tune_features[X86_TUNE_USE_XCHGB]
#define TARGET_SPLIT_LONG_MOVES ix86_tune_features[X86_TUNE_SPLIT_LONG_MOVES]
#define TARGET_READ_MODIFY_WRITE ix86_tune_features[X86_TUNE_READ_MODIFY_WRITE]
#define TARGET_READ_MODIFY ix86_tune_features[X86_TUNE_READ_MODIFY]
#define TARGET_PROMOTE_QImode ix86_tune_features[X86_TUNE_PROMOTE_QIMODE]
#define TARGET_FAST_PREFIX ix86_tune_features[X86_TUNE_FAST_PREFIX]
#define TARGET_SINGLE_STRINGOP ix86_tune_features[X86_TUNE_SINGLE_STRINGOP]
#define TARGET_QIMODE_MATH ix86_tune_features[X86_TUNE_QIMODE_MATH]
#define TARGET_HIMODE_MATH ix86_tune_features[X86_TUNE_HIMODE_MATH]
#define TARGET_PROMOTE_QI_REGS ix86_tune_features[X86_TUNE_PROMOTE_QI_REGS]
#define TARGET_PROMOTE_HI_REGS ix86_tune_features[X86_TUNE_PROMOTE_HI_REGS]
#define TARGET_ADD_ESP_4 ix86_tune_features[X86_TUNE_ADD_ESP_4]
#define TARGET_ADD_ESP_8 ix86_tune_features[X86_TUNE_ADD_ESP_8]
#define TARGET_SUB_ESP_4 ix86_tune_features[X86_TUNE_SUB_ESP_4]
#define TARGET_SUB_ESP_8 ix86_tune_features[X86_TUNE_SUB_ESP_8]
#define TARGET_INTEGER_DFMODE_MOVES \
ix86_tune_features[X86_TUNE_INTEGER_DFMODE_MOVES]
#define TARGET_PARTIAL_REG_DEPENDENCY \
ix86_tune_features[X86_TUNE_PARTIAL_REG_DEPENDENCY]
#define TARGET_SSE_PARTIAL_REG_DEPENDENCY \
ix86_tune_features[X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY]
#define TARGET_SSE_UNALIGNED_MOVE_OPTIMAL \
ix86_tune_features[X86_TUNE_SSE_UNALIGNED_MOVE_OPTIMAL]
#define TARGET_SSE_SPLIT_REGS ix86_tune_features[X86_TUNE_SSE_SPLIT_REGS]
#define TARGET_SSE_TYPELESS_STORES \
ix86_tune_features[X86_TUNE_SSE_TYPELESS_STORES]
#define TARGET_SSE_LOAD0_BY_PXOR ix86_tune_features[X86_TUNE_SSE_LOAD0_BY_PXOR]
#define TARGET_MEMORY_MISMATCH_STALL \
ix86_tune_features[X86_TUNE_MEMORY_MISMATCH_STALL]
#define TARGET_PROLOGUE_USING_MOVE \
ix86_tune_features[X86_TUNE_PROLOGUE_USING_MOVE]
#define TARGET_EPILOGUE_USING_MOVE \
ix86_tune_features[X86_TUNE_EPILOGUE_USING_MOVE]
#define TARGET_SHIFT1 ix86_tune_features[X86_TUNE_SHIFT1]
#define TARGET_USE_FFREEP ix86_tune_features[X86_TUNE_USE_FFREEP]
#define TARGET_INTER_UNIT_MOVES ix86_tune_features[X86_TUNE_INTER_UNIT_MOVES]
#define TARGET_FOUR_JUMP_LIMIT ix86_tune_features[X86_TUNE_FOUR_JUMP_LIMIT]
#define TARGET_SCHEDULE ix86_tune_features[X86_TUNE_SCHEDULE]
#define TARGET_USE_BT ix86_tune_features[X86_TUNE_USE_BT]
#define TARGET_USE_INCDEC ix86_tune_features[X86_TUNE_USE_INCDEC]
#define TARGET_PAD_RETURNS ix86_tune_features[X86_TUNE_PAD_RETURNS]
#define TARGET_EXT_80387_CONSTANTS \
ix86_tune_features[X86_TUNE_EXT_80387_CONSTANTS]
/* Feature tests against the various architecture variations. */
enum ix86_arch_indices {
X86_ARCH_CMOVE, /* || TARGET_SSE */
X86_ARCH_CMPXCHG,
X86_ARCH_CMPXCHG8B,
X86_ARCH_XADD,
X86_ARCH_BSWAP,
X86_ARCH_LAST
};
extern unsigned int ix86_arch_features[X86_ARCH_LAST];
#define TARGET_CMOVE ix86_arch_features[X86_ARCH_CMOVE]
#define TARGET_CMPXCHG ix86_arch_features[X86_ARCH_CMPXCHG]
#define TARGET_CMPXCHG8B ix86_arch_features[X86_ARCH_CMPXCHG8B]
#define TARGET_XADD ix86_arch_features[X86_ARCH_XADD]
#define TARGET_BSWAP ix86_arch_features[X86_ARCH_BSWAP]
#define TARGET_FISTTP (TARGET_SSE3 && TARGET_80387)
extern int x86_prefetch_sse;
#define TARGET_PREFETCH_SSE x86_prefetch_sse
extern int x86_cmpxchg16b;
#define TARGET_CMPXCHG16B x86_cmpxchg16b
#define ASSEMBLER_DIALECT (ix86_asm_dialect)
#define TARGET_SSE_MATH ((ix86_fpmath & FPMATH_SSE) != 0)
#define TARGET_MIX_SSE_I387 \
((ix86_fpmath & (FPMATH_SSE | FPMATH_387)) == (FPMATH_SSE | FPMATH_387))
#define TARGET_GNU_TLS (ix86_tls_dialect == TLS_DIALECT_GNU)
#define TARGET_GNU2_TLS (ix86_tls_dialect == TLS_DIALECT_GNU2)
#define TARGET_ANY_GNU_TLS (TARGET_GNU_TLS || TARGET_GNU2_TLS)
#define TARGET_SUN_TLS (ix86_tls_dialect == TLS_DIALECT_SUN)
#ifndef TARGET_64BIT_DEFAULT
#define TARGET_64BIT_DEFAULT 0
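
With the macros above in place, nothing changes syntactically for the rest of the backend: callers keep using the same TARGET_* names, which now expand to array elements fixed up once at option-processing time instead of flag & ix86_tune_mask expressions re-evaluated at every site (the standard_80387_constant_p hunk earlier is the one caller the patch touches). A stand-alone sketch of a hypothetical consumer; the feature value and the ffreep/fstp templates are illustrative only:

#include <stdio.h>

enum ix86_tune_indices { X86_TUNE_USE_FFREEP, X86_TUNE_LAST };
/* Pretend override_options already masked this for an Athlon/K8-class -mtune.  */
unsigned int ix86_tune_features[X86_TUNE_LAST] = { 1 };
#define TARGET_USE_FFREEP ix86_tune_features[X86_TUNE_USE_FFREEP]

/* Hypothetical consumer: the caller keeps writing TARGET_USE_FFREEP; only the
   macro's expansion changed from "x86_use_ffreep & ix86_tune_mask" to an
   array read.  */
static const char *
pop_insn_template (void)
{
  return TARGET_USE_FFREEP ? "ffreep\t%y0" : "fstp\t%y0";
}

int
main (void)
{
  puts (pop_insn_template ());
  return 0;
}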
@@ -2132,10 +2187,7 @@ enum processor_type
};
extern enum processor_type ix86_tune;
extern int ix86_tune_mask;
extern enum processor_type ix86_arch;
extern int ix86_arch_mask;
enum fpmath_unit
{