i386.c (athlon_cost): Fix lea, divide and XFmode move costs.

* i386.c (athlon_cost): Fix lea, divide and XFmode move costs.
	(x86_integer_DFmode_moves, x86_partial_reg_dependency,
	 x86_memory_mismatch_stall): New global variables.
	(ix86_adjust_cost): Handle MEMORY_BOTH in places where only
	MEMORY_STORE was allowed; fix load penalties for Athlon.
	* i386.h (x86_integer_DFmode_moves, x86_partial_reg_dependency,
	x86_memory_mismatch_stall): Declare.
	(TARGET_INTEGER_DFMODE_MOVES, TARGET_PARTIAL_REG_DEPENDENCY,
	 TARGET_MEMORY_MISMATCH_STALL): New.
	* i386.md (athlon scheduling parameters): Fix latencies according to
	Athlon Optimization Manual.
	(sahf, xchg, fldcw, leave instruction patterns): Set athlon_decode to
	vector.
	(fsqrt instruction patterns): Set athlon_decode to direct.
	(movhi_1): Promote for TARGET_PARTIAL_REG_DEPENDENCY and for
	PARTIAL_REGISTER_STALL with !TARGET_HIMODE_MATH machines.
	(movqi_1): Handle promoting correctly for TARGET_PARTIAL_REG_DEPENDENCY
	and TARGET_PARTIAL_REGISTER_STALL machines.
	(pushdf_nointeger): New pattern.
	(pushdf_integer): Rename from pushdf.
	(movdf_nointeger): Enable for !TARGET_INTEGER_DFMODE_MOVES machines.
	(movdf_integer): Disable for !TARGET_INTEGER_DFMODE_MOVES machines.

From-SVN: r33215
This commit is contained in:
Jan Hubicka 2000-04-17 21:39:30 +00:00 committed by Jan Hubicka
parent 10f1359422
commit 0b5107cf3a
4 changed files with 239 additions and 69 deletions

View File

@ -1,3 +1,28 @@
Mon Apr 17 23:35:29 MET DST 2000 Jan Hubicka <jh@suse.cz>
* i386.c (athlon_cost): Fix lea, divide and XFmode move costs.
(x86_integer_DFmode_moves, x86_partial_reg_dependency,
x86_memory_mismatch_stall): New global variables.
(ix86_adjust_cost): Handle MEMORY_BOTH in places where only
MEMORY_STORE was allowed; fix load penalties for Athlon.
* i386.h (x86_integer_DFmode_moves, x86_partial_reg_dependency,
x86_memory_mismatch_stall): Declare.
(TARGET_INTEGER_DFMODE_MOVES, TARGET_PARTIAL_REG_DEPENDENCY,
TARGET_MEMORY_MISMATCH_STALL): New.
* i386.md (athlon scheduling parameters): Fix latencies according to
Athlon Optimization Manual.
(sahf, xchg, fldcw, leave instruction patterns): Set athlon_decode to
vector.
(fsqrt instruction patterns): Set athlon_decode to direct.
(movhi_1): Promote for TARGET_PARTIAL_REG_DEPENDENCY and for
PARTIAL_REGISTER_STALL with !TARGET_HIMODE_MATH machines.
(movqi_1): Handle promoting correctly for TARGET_PARTIAL_REG_DEPENDENCY
and TARGET_PARTIAL_REGISTER_STALL machines.
(pushdf_nointeger): New pattern.
(pushdf_integer): Rename from pushdf.
(movdf_nointeger): Enable for !TARGET_INTEGER_DFMODE_MOVES machines.
(movdf_integer): Disable for !TARGET_INTEGER_DFMODE_MOVES machines.
2000-04-17 Richard Henderson <rth@cygnus.com>
* loop.c (canonicalize_condition): Add WANT_REG argument.

View File

@ -163,12 +163,12 @@ struct processor_costs k6_cost = {
struct processor_costs athlon_cost = {
1, /* cost of an add instruction */
1, /* cost of a lea instruction */
2, /* cost of a lea instruction */
1, /* variable shift costs */
1, /* constant shift costs */
5, /* cost of starting a multiply */
0, /* cost of multiply per each bit set */
19, /* cost of a divide/mod */
42, /* cost of a divide/mod */
8, /* "large" insn */
9, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
@ -177,9 +177,9 @@ struct processor_costs athlon_cost = {
Relative to reg-reg move (2). */
{2, 3, 2}, /* cost of storing integer registers */
4, /* cost of reg,reg fld/fst */
{6, 6, 6}, /* cost of loading fp registers
{6, 6, 20}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{4, 4, 4} /* cost of loading integer registers */
{4, 4, 16} /* cost of loading integer registers */
};
struct processor_costs *ix86_cost = &pentium_cost;
@ -222,6 +222,9 @@ const int x86_sub_esp_4 = m_ATHLON | m_PPRO;
const int x86_sub_esp_8 = m_ATHLON | m_PPRO | m_386 | m_486;
const int x86_add_esp_4 = m_ATHLON | m_K6;
const int x86_add_esp_8 = m_ATHLON | m_PPRO | m_K6 | m_386 | m_486;
const int x86_integer_DFmode_moves = ~m_ATHLON;
const int x86_partial_reg_dependency = m_ATHLON;
const int x86_memory_mismatch_stall = m_ATHLON;
#define AT_BP(mode) (gen_rtx_MEM ((mode), hard_frame_pointer_rtx))
@ -6287,6 +6290,7 @@ ix86_adjust_cost (insn, link, dep_insn, cost)
int cost;
{
enum attr_type insn_type, dep_insn_type;
enum attr_memory memory;
rtx set, set2;
int dep_insn_code_number;
@ -6334,7 +6338,8 @@ ix86_adjust_cost (insn, link, dep_insn, cost)
increase the cost here for non-imov insns. */
if (dep_insn_type != TYPE_IMOV
&& dep_insn_type != TYPE_FMOV
&& get_attr_memory (dep_insn) == MEMORY_LOAD)
&& ((memory = get_attr_memory (dep_insn) == MEMORY_LOAD)
|| memory == MEMORY_BOTH))
cost += 1;
/* INT->FP conversion is expensive. */
@ -6359,7 +6364,8 @@ ix86_adjust_cost (insn, link, dep_insn, cost)
/* Since we can't represent delayed latencies of load+operation,
increase the cost here for non-imov insns. */
if (get_attr_memory (dep_insn) == MEMORY_LOAD)
if ((memory = get_attr_memory (dep_insn) == MEMORY_LOAD)
|| memory == MEMORY_BOTH)
cost += (dep_insn_type != TYPE_IMOV) ? 2 : 1;
/* INT->FP conversion is expensive. */
@ -6368,19 +6374,15 @@ ix86_adjust_cost (insn, link, dep_insn, cost)
break;
case PROCESSOR_ATHLON:
/* Address Generation Interlock cause problems on the Athlon CPU because
the loads and stores are done in order so once one load or store has
to wait, others must too, so penalize the AGIs slightly by one cycle.
We might experiment with this value later. */
if (ix86_agi_dependant (insn, dep_insn, insn_type))
cost += 1;
if ((memory = get_attr_memory (dep_insn)) == MEMORY_LOAD
|| memory == MEMORY_BOTH)
{
if (dep_insn_type == TYPE_IMOV || dep_insn_type == TYPE_FMOV)
cost += 2;
else
cost += 3;
}
/* Since we can't represent delayed latencies of load+operation,
increase the cost here for non-imov insns. */
if (dep_insn_type != TYPE_IMOV
&& dep_insn_type != TYPE_FMOV
&& get_attr_memory (dep_insn) == MEMORY_LOAD)
cost += 2;
default:
break;
}

View File

@ -173,8 +173,9 @@ extern const int x86_use_cltd, x86_read_modify_write;
extern const int x86_read_modify, x86_split_long_moves;
extern const int x86_promote_QImode, x86_single_stringop;
extern const int x86_himode_math, x86_qimode_math, x86_promote_qi_regs;
extern const int x86_promote_hi_regs;
extern const int x86_promote_hi_regs, x86_integer_DFmode_moves;
extern const int x86_add_esp_4, x86_add_esp_8, x86_sub_esp_4, x86_sub_esp_8;
extern const int x86_partial_reg_dependency, x86_memory_mismatch_stall;
#define TARGET_USE_LEAVE (x86_use_leave & CPUMASK)
#define TARGET_PUSH_MEMORY (x86_push_memory & CPUMASK)
@ -206,6 +207,9 @@ extern const int x86_add_esp_4, x86_add_esp_8, x86_sub_esp_4, x86_sub_esp_8;
#define TARGET_ADD_ESP_8 (x86_add_esp_8 & CPUMASK)
#define TARGET_SUB_ESP_4 (x86_sub_esp_4 & CPUMASK)
#define TARGET_SUB_ESP_8 (x86_sub_esp_8 & CPUMASK)
#define TARGET_INTEGER_DFMODE_MOVES (x86_integer_DFmode_moves & CPUMASK)
#define TARGET_PARTIAL_REG_DEPENDENCY (x86_partial_reg_dependency & CPUMASK)
#define TARGET_MEMORY_MISMATCH_STALL (x86_memory_mismatch_stall & CPUMASK)
#define TARGET_STACK_PROBE (target_flags & MASK_STACK_PROBE)

View File

@ -738,7 +738,7 @@
;; communicates with all the execution units separately instead.
(define_attr "athlon_decode" "direct,vector"
(cond [(eq_attr "type" "call,imul,idiv,other,multi,fcmov,fpspc,str")
(cond [(eq_attr "type" "call,imul,idiv,other,multi,fcmov,fpspc,str,pop,cld,fcmov")
(const_string "vector")
(and (eq_attr "type" "push")
(match_operand 1 "memory_operand" ""))
@ -766,7 +766,7 @@
(define_function_unit "athlon_ieu" 3 0
(and (eq_attr "cpu" "athlon")
(eq_attr "type" "alu1,negnot,alu,icmp,imov,imovx,lea,incdec,ishift,imul,idiv,ibr,setcc,push,pop,call,callv,icmov,str,cld"))
(eq_attr "type" "alu1,negnot,alu,icmp,imov,imovx,lea,incdec,ishift,ibr,call,callv,icmov,cld,pop,setcc,push,pop"))
1 1)
(define_function_unit "athlon_ieu" 3 0
@ -777,12 +777,12 @@
(define_function_unit "athlon_ieu" 3 0
(and (eq_attr "cpu" "athlon")
(eq_attr "type" "imul"))
4 0)
5 0)
(define_function_unit "athlon_ieu" 3 0
(and (eq_attr "cpu" "athlon")
(eq_attr "type" "idiv"))
27 0)
42 0)
(define_function_unit "athlon_muldiv" 1 0
(and (eq_attr "cpu" "athlon")
@ -792,56 +792,118 @@
(define_function_unit "athlon_muldiv" 1 0
(and (eq_attr "cpu" "athlon")
(eq_attr "type" "idiv"))
27 27)
42 42)
(define_attr "athlon_fpunits" "none,store,mul,add,muladd,all"
(define_attr "athlon_fpunits" "none,store,mul,add,muladd,any"
(cond [(eq_attr "type" "fop,fop1,fcmp")
(const_string "add")
(eq_attr "type" "fmul,fdiv,fpspc,fsgn")
(eq_attr "type" "fmul,fdiv,fpspc,fsgn,fcmov")
(const_string "mul")
(and (eq_attr "type" "fmov") (eq_attr "memory" "!none"))
(and (eq_attr "type" "fmov") (eq_attr "memory" "store,both"))
(const_string "store")
(and (eq_attr "type" "fmov") (eq_attr "memory" "load"))
(const_string "any")
(and (eq_attr "type" "fmov")
(ior (match_operand:SI 1 "register_operand" "")
(match_operand 1 "immediate_operand" "")))
(const_string "store")
(eq_attr "type" "fmov")
(const_string "muladd")
(eq_attr "type" "fcmov")
(const_string "all")]
(const_string "muladd")]
(const_string "none")))
;; We use latencies 1 for definitions. This is OK to model collisions
;; in execution units. The real latencies are modeled in the "fp" pipeline.
;; fsin, fcos: 96-192
;; fsincos: 107-211
;; fsqrt: 19 for SFmode, 27 for DFmode, 35 for XFmode.
(define_function_unit "athlon_fp" 3 0
(and (eq_attr "cpu" "athlon")
(eq_attr "type" "fpspc"))
100 1)
;; 16 cycles for SFmode, 20 for DFmode and 24 for XFmode.
(define_function_unit "athlon_fp" 3 0
(and (eq_attr "cpu" "athlon")
(eq_attr "type" "fdiv"))
24 1)
(define_function_unit "athlon_fp" 3 0
(and (eq_attr "cpu" "athlon")
(eq_attr "type" "fop,fop1,fmul"))
4 1)
;; XFmode loads are slow.
;; XFmode store is slow too (8 cycles), but we don't need to model it, because
;; there are no dependent instructions.
(define_function_unit "athlon_fp" 3 0
(and (eq_attr "cpu" "athlon")
(and (eq_attr "type" "fmov")
(match_operand:XF 1 "memory_operand" "")))
10 1)
(define_function_unit "athlon_fp" 3 0
(and (eq_attr "cpu" "athlon")
(eq_attr "type" "fmov,fsgn"))
2 1)
;; fcmp and ftst instructions
(define_function_unit "athlon_fp" 3 0
(and (eq_attr "cpu" "athlon")
(and (eq_attr "type" "fcmp")
(eq_attr "athlon_decode" "direct")))
3 1)
;; fcmpi instructions.
(define_function_unit "athlon_fp" 3 0
(and (eq_attr "cpu" "athlon")
(and (eq_attr "type" "fcmp")
(eq_attr "athlon_decode" "vector")))
3 1)
(define_function_unit "athlon_fp" 3 0
(and (eq_attr "cpu" "athlon")
(eq_attr "type" "fcmov"))
7 1)
(define_function_unit "athlon_fp_mul" 1 0
(and (eq_attr "cpu" "athlon")
(eq_attr "athlon_fpunits" "mul,all"))
4 1)
(eq_attr "athlon_fpunits" "mul"))
1 1)
(define_function_unit "athlon_fp_add" 1 0
(and (eq_attr "cpu" "athlon")
(eq_attr "athlon_fpunits" "add,all"))
4 1)
(eq_attr "athlon_fpunits" "add"))
1 1)
(define_function_unit "athlon_fp_muladd" 2 0
(and (eq_attr "cpu" "athlon")
(and (eq_attr "type" "fmov")
(eq_attr "athlon_fpunits" "muladd,mul,add,all")))
2 1)
(define_function_unit "athlon_fp_muladd" 2 0
(and (eq_attr "cpu" "athlon")
(and (eq_attr "type" "!fmov")
(eq_attr "athlon_fpunits" "muladd,mul,add,all")))
4 1)
(eq_attr "athlon_fpunits" "muladd,mul,add"))
1 1)
(define_function_unit "athlon_fp_store" 1 0
(and (eq_attr "cpu" "athlon")
(eq_attr "athlon_fpunits" "store,all"))
(eq_attr "athlon_fpunits" "store"))
1 1)
(define_function_unit "athlon_agu" 3 0
;; We don't need to model the Address Generation Unit, since we don't model
;; the re-order buffer yet and thus we never schedule more than three operations
;; at time. Later we may want to experiment with MD_SCHED macros modeling the
;; decoders independently on the functional units.
;(define_function_unit "athlon_agu" 3 0
; (and (eq_attr "cpu" "athlon")
; (and (eq_attr "memory" "!none")
; (eq_attr "athlon_fpunits" "none")))
; 1 1)
;; Model load unit to avoid too long sequences of loads. We don't need to
;; model store queue, since it is hardly going to be bottleneck.
(define_function_unit "athlon_load" 2 0
(and (eq_attr "cpu" "athlon")
(and (eq_attr "memory" "!none")
(eq_attr "athlon_fpunits" "none")))
(eq_attr "memory" "load,both"))
1 1)
@ -1255,6 +1317,7 @@
""
"sahf"
[(set_attr "length" "1")
(set_attr "athlon_decode" "vector")
(set_attr "ppro_uops" "one")])
;; Pentium Pro can do steps 1 through 3 in one go.
@ -1390,6 +1453,7 @@
"xchg{l}\\t%1, %0"
[(set_attr "type" "imov")
(set_attr "pent_pair" "np")
(set_attr "athlon_decode" "vector")
(set_attr "ppro_uops" "few")])
(define_expand "movhi"
@ -1437,8 +1501,10 @@
}"
[(set (attr "type")
(cond [(and (eq_attr "alternative" "0")
(eq (symbol_ref "TARGET_PARTIAL_REG_STALL")
(const_int 0)))
(ior (eq (symbol_ref "TARGET_PARTIAL_REG_STALL")
(const_int 0))
(eq (symbol_ref "TARGET_HIMODE_MATH")
(const_int 0))))
(const_string "imov")
(and (eq_attr "alternative" "1,2")
(match_operand:HI 1 "aligned_operand" ""))
@ -1456,8 +1522,10 @@
(match_operand:HI 1 "aligned_operand" ""))
(const_string "0")
(and (eq_attr "alternative" "0")
(eq (symbol_ref "TARGET_PARTIAL_REG_STALL")
(const_int 0)))
(ior (eq (symbol_ref "TARGET_PARTIAL_REG_STALL")
(const_int 0))
(eq (symbol_ref "TARGET_HIMODE_MATH")
(const_int 0))))
(const_string "0")
]
(const_string "1")))
@ -1547,9 +1615,19 @@
[(set_attr "type" "pop")
(set_attr "length_prefix" "1")])
;; Situation is quite tricky about when to choose full sized (SImode) move
;; over QImode moves. For Q_REG -> Q_REG move we use full size only for
;; partial register dependency machines (such as AMD Athlon), where QImode
;; moves issue extra dependency and for partial register stalls machines
;; that don't use QImode patterns (and QImode move cause stall on the next
;; instruction).
;;
;; For loads of Q_REG to NONQ_REG we use full sized moves, except on partial
;; register stall machines, where we use QImode instructions, since a
;; partial register stall can be caused there. Then we use movzx.
(define_insn "*movqi_1"
[(set (match_operand:QI 0 "nonimmediate_operand" "=q,q,r,?r,m")
(match_operand:QI 1 "general_operand" "qn,qm,rn,qm,qn"))]
[(set (match_operand:QI 0 "nonimmediate_operand" "=q,q ,q ,r,r ,?r,m")
(match_operand:QI 1 "general_operand" " q,qn,qm,q,rn,qm,qn"))]
"GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM"
"*
{
@ -1560,26 +1638,50 @@
abort ();
return \"movz{bl|x}\\t{%1, %k0|%k0, %1}\";
default:
if (which_alternative == 2)
if (which_alternative == 4 || which_alternative == 3
|| (which_alternative == 1 && get_attr_length (insn) == 5)
|| (which_alternative == 0
&& ((TARGET_PARTIAL_REG_STALL && !TARGET_QIMODE_MATH)
|| TARGET_PARTIAL_REG_DEPENDENCY)))
return \"mov{l}\\t{%k1, %k0|%k0, %k1}\";
else
return \"mov{b}\\t{%1, %0|%0, %1}\";
}
}"
[(set (attr "type")
(cond [(eq_attr "alternative" "3")
(cond [(and (eq_attr "alternative" "3")
(ior (eq (symbol_ref "TARGET_PARTIAL_REG_STALL")
(const_int 0))
(eq (symbol_ref "TARGET_QIMODE_MATH")
(const_int 0))))
(const_string "imov")
(eq_attr "alternative" "3,5")
(const_string "imovx")
(and (ne (symbol_ref "TARGET_MOVX")
(const_int 0))
(eq_attr "alternative" "1"))
(eq_attr "alternative" "2"))
(const_string "imovx")
]
(const_string "imov")))
; There's no place to override just the immediate length
(set (attr "length")
(cond [(and (eq_attr "type" "imov")
(and (eq_attr "alternative" "2")
(match_operand:HI 1 "immediate_operand" "")))
(and (match_operand:HI 1 "immediate_operand" "")
(eq_attr "alternative" "4")))
(const_string "5")
;; Avoid extra dependency on partial register.
(and (eq_attr "type" "imov")
(and (eq_attr "alternative" "1")
(ne (symbol_ref "TARGET_PARTIAL_REG_DEPENDENCY")
(const_int 0))))
(const_string "5")
;; Avoid partial register stalls when not using QImode arithmetic
(and (eq_attr "type" "imov")
(and (eq_attr "alternative" "1")
(and (ne (symbol_ref "TARGET_PARTIAL_REG_STALL")
(const_int 0))
(eq (symbol_ref "TARGET_QIMODE_MATH")
(const_int 0)))))
(const_string "5")
]
(const_string "*")))])
@ -1904,10 +2006,38 @@
;; On the average, pushdf using integers can be still shorter. Allow this
;; pattern for optimize_size too.
(define_insn "*pushdf"
(define_insn "*pushdf_nointeger"
[(set (match_operand:DF 0 "push_operand" "=<,<,<")
(match_operand:DF 1 "general_no_elim_operand" "f,Fo#f,*r#f"))]
"!TARGET_INTEGER_DFMODE_MOVES"
"*
{
switch (which_alternative)
{
case 0:
/* %%% We lose REG_DEAD notes for controlling pops if we split late. */
operands[0] = gen_rtx_MEM (DFmode, stack_pointer_rtx);
operands[2] = stack_pointer_rtx;
operands[3] = GEN_INT (8);
if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
return \"sub{l}\\t{%3, %2|%2, %3}\;fstp%z0\\t%y0\";
else
return \"sub{l}\\t{%3, %2|%2, %3}\;fst%z0\\t%y0\";
case 1:
case 2:
return \"#\";
default:
abort ();
}
}"
[(set_attr "type" "multi")])
(define_insn "*pushdf_integer"
[(set (match_operand:DF 0 "push_operand" "=<,<")
(match_operand:DF 1 "general_no_elim_operand" "f#r,rFo#f"))]
""
"TARGET_INTEGER_DFMODE_MOVES"
"*
{
switch (which_alternative)
@ -1955,7 +2085,7 @@
[(set (match_operand:DF 0 "nonimmediate_operand" "=f,m,f,*r,o")
(match_operand:DF 1 "general_operand" "fm,f,G,*roF,F*r"))]
"(GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM)
&& optimize_size
&& (optimize_size || !TARGET_INTEGER_DFMODE_MOVES)
&& (reload_in_progress || reload_completed
|| GET_CODE (operands[1]) != CONST_DOUBLE
|| memory_operand (operands[0], DFmode))"
@ -2002,7 +2132,7 @@
[(set (match_operand:DF 0 "nonimmediate_operand" "=f#r,m,f#r,r#f,o")
(match_operand:DF 1 "general_operand" "fm#r,f#r,G,roF#f,Fr#f"))]
"(GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM)
&& !optimize_size
&& !optimize_size && TARGET_INTEGER_DFMODE_MOVES
&& (reload_in_progress || reload_completed
|| GET_CODE (operands[1]) != CONST_DOUBLE
|| memory_operand (operands[0], DFmode))"
@ -2304,7 +2434,8 @@
else
return \"fxch\\t%0\";
}"
[(set_attr "type" "fxch")])
[(set_attr "type" "fxch")
(set_attr "athlon_decode" "vector")])
;; Zero extension instructions
@ -3202,6 +3333,7 @@
"TARGET_80387"
"fldcw\\t%0"
[(set_attr "length_opcode" "2")
(set_attr "athlon_decode" "vector")
(set_attr "ppro_uops" "few")])
;; Conversion between fixed point and floating point.
@ -7691,6 +7823,7 @@
""
"leave"
[(set_attr "length" "1")
(set_attr "athlon_decode" "vector")
(set_attr "ppro_uops" "few")])
(define_expand "ffssi2"
@ -8123,7 +8256,8 @@
(sqrt:SF (match_operand:SF 1 "register_operand" "0")))]
"! TARGET_NO_FANCY_MATH_387 && TARGET_80387"
"fsqrt"
[(set_attr "type" "fpspc")])
[(set_attr "type" "fpspc")
(set_attr "athlon_decode" "direct")])
(define_insn "sqrtdf2"
[(set (match_operand:DF 0 "register_operand" "=f")
@ -8131,7 +8265,8 @@
"! TARGET_NO_FANCY_MATH_387 && TARGET_80387
&& (TARGET_IEEE_FP || flag_fast_math) "
"fsqrt"
[(set_attr "type" "fpspc")])
[(set_attr "type" "fpspc")
(set_attr "athlon_decode" "direct")])
(define_insn "*sqrtextendsfdf2"
[(set (match_operand:DF 0 "register_operand" "=f")
@ -8139,7 +8274,8 @@
(match_operand:SF 1 "register_operand" "0"))))]
"! TARGET_NO_FANCY_MATH_387 && TARGET_80387"
"fsqrt"
[(set_attr "type" "fpspc")])
[(set_attr "type" "fpspc")
(set_attr "athlon_decode" "direct")])
(define_insn "sqrtxf2"
[(set (match_operand:XF 0 "register_operand" "=f")
@ -8147,7 +8283,8 @@
"! TARGET_NO_FANCY_MATH_387 && TARGET_80387
&& (TARGET_IEEE_FP || flag_fast_math) "
"fsqrt"
[(set_attr "type" "fpspc")])
[(set_attr "type" "fpspc")
(set_attr "athlon_decode" "direct")])
(define_insn "*sqrtextenddfxf2"
[(set (match_operand:XF 0 "register_operand" "=f")
@ -8155,7 +8292,8 @@
(match_operand:DF 1 "register_operand" "0"))))]
"! TARGET_NO_FANCY_MATH_387 && TARGET_80387"
"fsqrt"
[(set_attr "type" "fpspc")])
[(set_attr "type" "fpspc")
(set_attr "athlon_decode" "direct")])
(define_insn "*sqrtextendsfxf2"
[(set (match_operand:XF 0 "register_operand" "=f")
@ -8163,7 +8301,8 @@
(match_operand:SF 1 "register_operand" "0"))))]
"! TARGET_NO_FANCY_MATH_387 && TARGET_80387"
"fsqrt"
[(set_attr "type" "fpspc")])
[(set_attr "type" "fpspc")
(set_attr "athlon_decode" "direct")])
(define_insn "sindf2"
[(set (match_operand:DF 0 "register_operand" "=f")