i386.c (athlon_cost): Fix lea, divide and XFmode move costs.

* i386.c (athlon_cost): Fix lea, divide and XFmode move costs.
	(x86_integer_DFmode_moves, x86_partial_reg_dependency,
	 x86_memory_mismatch_stall): New global variables.
	(ix86_adjust_cost): Handle MEMORY_BOTH in places where only
	MEMORY_STORE was allowed; fix load penalties for Athlon.
	* i386.h (x86_integer_DFmode_moves, x86_partial_reg_dependency,
	x86_memory_mismatch_stall): Declare.
	(TARGET_INTEGER_DFMODE_MOVES, TARGET_PARTIAL_REG_DEPENDENCY,
	 TARGET_MEMORY_MISMATCH_STALL): New.
	* i386.md (athlon scheduling parameters): Fix latencies according to
	Athlon Optimization Manual.
	(sahf, xchg, fldcw, leave instruction patterns): Set athlon_decode to
	vector.
	(fsqrt instruction patterns): Set athlon_decode to direct.
	(movhi_1): Promote for TARGET_PARTIAL_REG_DEPENDENCY and for
	PARTIAL_REGISTER_STALL with !TARGET_HIMODE_MATH machines.
	(movqi_1): Handle promoting correctly for TARGET_PARTIAL_REG_DEPENDENCY
	and TARGET_PARTIAL_REGISTER_STALL machines.
	(pushdf_nointeger): New pattern.
	(pushdf_integer): Rename from pushdf.
	(movdf_nointeger): Enable for !TARGET_INTEGER_DFMODE_MOVES machines.
	(movdf_integer): Disable for !TARGET_INTEGER_DFMODE_MOVES machines.

From-SVN: r33215
This commit is contained in:
Jan Hubicka 2000-04-17 21:39:30 +00:00 committed by Jan Hubicka
parent 10f1359422
commit 0b5107cf3a
4 changed files with 239 additions and 69 deletions

View File

@ -1,3 +1,28 @@
Mon Apr 17 23:35:29 MET DST 2000 Jan Hubicka <jh@suse.cz>
* i386.c (athlon_cost): Fix lea, divide and XFmode move costs.
(x86_integer_DFmode_moves, x86_partial_reg_dependency,
x86_memory_mismatch_stall): New global variables.
(ix86_adjust_cost): Handle MEMORY_BOTH in places where only
MEMORY_STORE was allowed; fix load penalties for Athlon.
* i386.h (x86_integer_DFmode_moves, x86_partial_reg_dependency,
x86_memory_mismatch_stall): Declare.
(TARGET_INTEGER_DFMODE_MOVES, TARGET_PARTIAL_REG_DEPENDENCY,
TARGET_MEMORY_MISMATCH_STALL): New.
* i386.md (athlon scheduling parameters): Fix latencies according to
Athlon Optimization Manual.
(sahf, xchg, fldcw, leave instruction patterns): Set athlon_decode to
vector.
(fsqrt instruction patterns): Set athlon_decode to direct.
(movhi_1): Promote for TARGET_PARTIAL_REG_DEPENDENCY and for
PARTIAL_REGISTER_STALL with !TARGET_HIMODE_MATH machines.
(movqi_1): Handle promoting correctly for TARGET_PARTIAL_REG_DEPENDENCY
and TARGET_PARTIAL_REGISTER_STALL machines.
(pushdf_nointeger): New pattern.
(pushdf_integer): Rename from pushdf.
(movdf_nointeger): Enable for !TARGET_INTEGER_DFMODE_MOVES machines.
(movdf_integer): Disable for !TARGET_INTEGER_DFMODE_MOVES machines.
2000-04-17 Richard Henderson <rth@cygnus.com>
* loop.c (canonicalize_condition): Add WANT_REG argument.

View File

@ -163,12 +163,12 @@ struct processor_costs k6_cost = {
struct processor_costs athlon_cost = {
1, /* cost of an add instruction */
1, /* cost of a lea instruction */
2, /* cost of a lea instruction */
1, /* variable shift costs */
1, /* constant shift costs */
5, /* cost of starting a multiply */
0, /* cost of multiply per each bit set */
19, /* cost of a divide/mod */
42, /* cost of a divide/mod */
8, /* "large" insn */
9, /* MOVE_RATIO */
4, /* cost for loading QImode using movzbl */
@ -177,9 +177,9 @@ struct processor_costs athlon_cost = {
Relative to reg-reg move (2). */
{2, 3, 2}, /* cost of storing integer registers */
4, /* cost of reg,reg fld/fst */
{6, 6, 6}, /* cost of loading fp registers
{6, 6, 20}, /* cost of loading fp registers
in SFmode, DFmode and XFmode */
{4, 4, 4} /* cost of loading integer registers */
{4, 4, 16} /* cost of loading integer registers */
};
struct processor_costs *ix86_cost = &pentium_cost;
@ -222,6 +222,9 @@ const int x86_sub_esp_4 = m_ATHLON | m_PPRO;
const int x86_sub_esp_8 = m_ATHLON | m_PPRO | m_386 | m_486;
const int x86_add_esp_4 = m_ATHLON | m_K6;
const int x86_add_esp_8 = m_ATHLON | m_PPRO | m_K6 | m_386 | m_486;
const int x86_integer_DFmode_moves = ~m_ATHLON;
const int x86_partial_reg_dependency = m_ATHLON;
const int x86_memory_mismatch_stall = m_ATHLON;
#define AT_BP(mode) (gen_rtx_MEM ((mode), hard_frame_pointer_rtx))
@ -6287,6 +6290,7 @@ ix86_adjust_cost (insn, link, dep_insn, cost)
int cost;
{
enum attr_type insn_type, dep_insn_type;
enum attr_memory memory;
rtx set, set2;
int dep_insn_code_number;
@ -6334,7 +6338,8 @@ ix86_adjust_cost (insn, link, dep_insn, cost)
increase the cost here for non-imov insns. */
if (dep_insn_type != TYPE_IMOV
&& dep_insn_type != TYPE_FMOV
&& get_attr_memory (dep_insn) == MEMORY_LOAD)
&& ((memory = get_attr_memory (dep_insn) == MEMORY_LOAD)
|| memory == MEMORY_BOTH))
cost += 1;
/* INT->FP conversion is expensive. */
@ -6359,7 +6364,8 @@ ix86_adjust_cost (insn, link, dep_insn, cost)
/* Since we can't represent delayed latencies of load+operation,
increase the cost here for non-imov insns. */
if (get_attr_memory (dep_insn) == MEMORY_LOAD)
if ((memory = get_attr_memory (dep_insn) == MEMORY_LOAD)
|| memory == MEMORY_BOTH)
cost += (dep_insn_type != TYPE_IMOV) ? 2 : 1;
/* INT->FP conversion is expensive. */
@ -6368,19 +6374,15 @@ ix86_adjust_cost (insn, link, dep_insn, cost)
break;
case PROCESSOR_ATHLON:
/* Address Generation Interlock cause problems on the Athlon CPU because
the loads and stores are done in order so once one load or store has
to wait, others must too, so penalize the AGIs slightly by one cycle.
We might experiment with this value later. */
if (ix86_agi_dependant (insn, dep_insn, insn_type))
cost += 1;
if ((memory = get_attr_memory (dep_insn)) == MEMORY_LOAD
|| memory == MEMORY_BOTH)
{
if (dep_insn_type == TYPE_IMOV || dep_insn_type == TYPE_FMOV)
cost += 2;
else
cost += 3;
}
/* Since we can't represent delayed latencies of load+operation,
increase the cost here for non-imov insns. */
if (dep_insn_type != TYPE_IMOV
&& dep_insn_type != TYPE_FMOV
&& get_attr_memory (dep_insn) == MEMORY_LOAD)
cost += 2;
default:
break;
}

View File

@ -173,8 +173,9 @@ extern const int x86_use_cltd, x86_read_modify_write;
extern const int x86_read_modify, x86_split_long_moves;
extern const int x86_promote_QImode, x86_single_stringop;
extern const int x86_himode_math, x86_qimode_math, x86_promote_qi_regs;
extern const int x86_promote_hi_regs;
extern const int x86_promote_hi_regs, x86_integer_DFmode_moves;
extern const int x86_add_esp_4, x86_add_esp_8, x86_sub_esp_4, x86_sub_esp_8;
extern const int x86_partial_reg_dependency, x86_memory_mismatch_stall;
#define TARGET_USE_LEAVE (x86_use_leave & CPUMASK)
#define TARGET_PUSH_MEMORY (x86_push_memory & CPUMASK)
@ -206,6 +207,9 @@ extern const int x86_add_esp_4, x86_add_esp_8, x86_sub_esp_4, x86_sub_esp_8;
#define TARGET_ADD_ESP_8 (x86_add_esp_8 & CPUMASK)
#define TARGET_SUB_ESP_4 (x86_sub_esp_4 & CPUMASK)
#define TARGET_SUB_ESP_8 (x86_sub_esp_8 & CPUMASK)
#define TARGET_INTEGER_DFMODE_MOVES (x86_integer_DFmode_moves & CPUMASK)
#define TARGET_PARTIAL_REG_DEPENDENCY (x86_partial_reg_dependency & CPUMASK)
#define TARGET_MEMORY_MISMATCH_STALL (x86_memory_mismatch_stall & CPUMASK)
#define TARGET_STACK_PROBE (target_flags & MASK_STACK_PROBE)

View File

@ -738,7 +738,7 @@
;; communicates with all the execution units separately instead.
(define_attr "athlon_decode" "direct,vector"
(cond [(eq_attr "type" "call,imul,idiv,other,multi,fcmov,fpspc,str")
(cond [(eq_attr "type" "call,imul,idiv,other,multi,fcmov,fpspc,str,pop,cld,fcmov")
(const_string "vector")
(and (eq_attr "type" "push")
(match_operand 1 "memory_operand" ""))
@ -766,7 +766,7 @@
(define_function_unit "athlon_ieu" 3 0
(and (eq_attr "cpu" "athlon")
(eq_attr "type" "alu1,negnot,alu,icmp,imov,imovx,lea,incdec,ishift,imul,idiv,ibr,setcc,push,pop,call,callv,icmov,str,cld"))
(eq_attr "type" "alu1,negnot,alu,icmp,imov,imovx,lea,incdec,ishift,ibr,call,callv,icmov,cld,pop,setcc,push,pop"))
1 1)
(define_function_unit "athlon_ieu" 3 0
@ -777,12 +777,12 @@
(define_function_unit "athlon_ieu" 3 0
(and (eq_attr "cpu" "athlon")
(eq_attr "type" "imul"))
4 0)
5 0)
(define_function_unit "athlon_ieu" 3 0
(and (eq_attr "cpu" "athlon")
(eq_attr "type" "idiv"))
27 0)
42 0)
(define_function_unit "athlon_muldiv" 1 0
(and (eq_attr "cpu" "athlon")
@ -792,56 +792,118 @@
(define_function_unit "athlon_muldiv" 1 0
(and (eq_attr "cpu" "athlon")
(eq_attr "type" "idiv"))
27 27)
42 42)
(define_attr "athlon_fpunits" "none,store,mul,add,muladd,all"
(define_attr "athlon_fpunits" "none,store,mul,add,muladd,any"
(cond [(eq_attr "type" "fop,fop1,fcmp")
(const_string "add")
(eq_attr "type" "fmul,fdiv,fpspc,fsgn")
(eq_attr "type" "fmul,fdiv,fpspc,fsgn,fcmov")
(const_string "mul")
(and (eq_attr "type" "fmov") (eq_attr "memory" "!none"))
(and (eq_attr "type" "fmov") (eq_attr "memory" "store,both"))
(const_string "store")
(and (eq_attr "type" "fmov") (eq_attr "memory" "load"))
(const_string "any")
(and (eq_attr "type" "fmov")
(ior (match_operand:SI 1 "register_operand" "")
(match_operand 1 "immediate_operand" "")))
(const_string "store")
(eq_attr "type" "fmov")
(const_string "muladd")
(eq_attr "type" "fcmov")
(const_string "all")]
(const_string "muladd")]
(const_string "none")))
;; We use latencies 1 for definitions. This is OK to model collisions
;; in execution units. The real latencies are modeled in the "fp" pipeline.
;; fsin, fcos: 96-192
;; fsincos: 107-211
;; fsqrt: 19 for SFmode, 27 for DFmode, 35 for XFmode.
(define_function_unit "athlon_fp" 3 0
(and (eq_attr "cpu" "athlon")
(eq_attr "type" "fpspc"))
100 1)
;; 16 cycles for SFmode, 20 for DFmode and 24 for XFmode.
(define_function_unit "athlon_fp" 3 0
(and (eq_attr "cpu" "athlon")
(eq_attr "type" "fdiv"))
24 1)
(define_function_unit "athlon_fp" 3 0
(and (eq_attr "cpu" "athlon")
(eq_attr "type" "fop,fop1,fmul"))
4 1)
;; XFmode loads are slow.
;; XFmode store is slow too (8 cycles), but we don't need to model it, because
;; there are no dependent instructions.
(define_function_unit "athlon_fp" 3 0
(and (eq_attr "cpu" "athlon")
(and (eq_attr "type" "fmov")
(match_operand:XF 1 "memory_operand" "")))
10 1)
(define_function_unit "athlon_fp" 3 0
(and (eq_attr "cpu" "athlon")
(eq_attr "type" "fmov,fsgn"))
2 1)
;; fcmp and ftst instructions
(define_function_unit "athlon_fp" 3 0
(and (eq_attr "cpu" "athlon")
(and (eq_attr "type" "fcmp")
(eq_attr "athlon_decode" "direct")))
3 1)
;; fcmpi instructions.
(define_function_unit "athlon_fp" 3 0
(and (eq_attr "cpu" "athlon")
(and (eq_attr "type" "fcmp")
(eq_attr "athlon_decode" "vector")))
3 1)
(define_function_unit "athlon_fp" 3 0
(and (eq_attr "cpu" "athlon")
(eq_attr "type" "fcmov"))
7 1)
(define_function_unit "athlon_fp_mul" 1 0
(and (eq_attr "cpu" "athlon")
(eq_attr "athlon_fpunits" "mul,all"))
4 1)
(eq_attr "athlon_fpunits" "mul"))
1 1)
(define_function_unit "athlon_fp_add" 1 0
(and (eq_attr "cpu" "athlon")
(eq_attr "athlon_fpunits" "add,all"))
4 1)
(eq_attr "athlon_fpunits" "add"))
1 1)
(define_function_unit "athlon_fp_muladd" 2 0
(and (eq_attr "cpu" "athlon")
(and (eq_attr "type" "fmov")
(eq_attr "athlon_fpunits" "muladd,mul,add,all")))
2 1)
(define_function_unit "athlon_fp_muladd" 2 0
(and (eq_attr "cpu" "athlon")
(and (eq_attr "type" "!fmov")
(eq_attr "athlon_fpunits" "muladd,mul,add,all")))
4 1)
(eq_attr "athlon_fpunits" "muladd,mul,add"))
1 1)
(define_function_unit "athlon_fp_store" 1 0
(and (eq_attr "cpu" "athlon")
(eq_attr "athlon_fpunits" "store,all"))
(eq_attr "athlon_fpunits" "store"))
1 1)
(define_function_unit "athlon_agu" 3 0
;; We don't need to model the Address Generation Unit, since we don't model
;; the re-order buffer yet and thus we never schedule more than three operations
;; at time. Later we may want to experiment with MD_SCHED macros modeling the
;; decoders independently on the functional units.
;(define_function_unit "athlon_agu" 3 0
; (and (eq_attr "cpu" "athlon")
; (and (eq_attr "memory" "!none")
; (eq_attr "athlon_fpunits" "none")))
; 1 1)
;; Model load unit to avoid too long sequences of loads. We don't need to
;; model store queue, since it is hardly going to be bottleneck.
(define_function_unit "athlon_load" 2 0
(and (eq_attr "cpu" "athlon")
(and (eq_attr "memory" "!none")
(eq_attr "athlon_fpunits" "none")))
(eq_attr "memory" "load,both"))
1 1)
@ -1255,6 +1317,7 @@
""
"sahf"
[(set_attr "length" "1")
(set_attr "athlon_decode" "vector")
(set_attr "ppro_uops" "one")])
;; Pentium Pro can do steps 1 through 3 in one go.
@ -1390,6 +1453,7 @@
"xchg{l}\\t%1, %0"
[(set_attr "type" "imov")
(set_attr "pent_pair" "np")
(set_attr "athlon_decode" "vector")
(set_attr "ppro_uops" "few")])
(define_expand "movhi"
@ -1437,8 +1501,10 @@
}"
[(set (attr "type")
(cond [(and (eq_attr "alternative" "0")
(eq (symbol_ref "TARGET_PARTIAL_REG_STALL")
(const_int 0)))
(ior (eq (symbol_ref "TARGET_PARTIAL_REG_STALL")
(const_int 0))
(eq (symbol_ref "TARGET_HIMODE_MATH")
(const_int 0))))
(const_string "imov")
(and (eq_attr "alternative" "1,2")
(match_operand:HI 1 "aligned_operand" ""))
@ -1456,8 +1522,10 @@
(match_operand:HI 1 "aligned_operand" ""))
(const_string "0")
(and (eq_attr "alternative" "0")
(eq (symbol_ref "TARGET_PARTIAL_REG_STALL")
(const_int 0)))
(ior (eq (symbol_ref "TARGET_PARTIAL_REG_STALL")
(const_int 0))
(eq (symbol_ref "TARGET_HIMODE_MATH")
(const_int 0))))
(const_string "0")
]
(const_string "1")))
@ -1547,9 +1615,19 @@
[(set_attr "type" "pop")
(set_attr "length_prefix" "1")])
;; Situation is quite tricky about when to choose full sized (SImode) move
;; over QImode moves. For Q_REG -> Q_REG move we use full size only for
;; partial register dependency machines (such as AMD Athlon), where QImode
;; moves issue extra dependency and for partial register stalls machines
;; that don't use QImode patterns (and QImode move cause stall on the next
;; instruction).
;;
;; For loads of Q_REG to NONQ_REG we use full sized moves, except on partial
;; register stall machines, where we use QImode instructions, since a
;; partial register stall can be caused there. Then we use movzx.
(define_insn "*movqi_1"
[(set (match_operand:QI 0 "nonimmediate_operand" "=q,q,r,?r,m")
(match_operand:QI 1 "general_operand" "qn,qm,rn,qm,qn"))]
[(set (match_operand:QI 0 "nonimmediate_operand" "=q,q ,q ,r,r ,?r,m")
(match_operand:QI 1 "general_operand" " q,qn,qm,q,rn,qm,qn"))]
"GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM"
"*
{
@ -1560,26 +1638,50 @@
abort ();
return \"movz{bl|x}\\t{%1, %k0|%k0, %1}\";
default:
if (which_alternative == 2)
if (which_alternative == 4 || which_alternative == 3
|| (which_alternative == 1 && get_attr_length (insn) == 5)
|| (which_alternative == 0
&& ((TARGET_PARTIAL_REG_STALL && !TARGET_QIMODE_MATH)
|| TARGET_PARTIAL_REG_DEPENDENCY)))
return \"mov{l}\\t{%k1, %k0|%k0, %k1}\";
else
return \"mov{b}\\t{%1, %0|%0, %1}\";
}
}"
[(set (attr "type")
(cond [(eq_attr "alternative" "3")
(cond [(and (eq_attr "alternative" "3")
(ior (eq (symbol_ref "TARGET_PARTIAL_REG_STALL")
(const_int 0))
(eq (symbol_ref "TARGET_QIMODE_MATH")
(const_int 0))))
(const_string "imov")
(eq_attr "alternative" "3,5")
(const_string "imovx")
(and (ne (symbol_ref "TARGET_MOVX")
(const_int 0))
(eq_attr "alternative" "1"))
(eq_attr "alternative" "2"))
(const_string "imovx")
]
(const_string "imov")))
; There's no place to override just the immediate length
(set (attr "length")
(cond [(and (eq_attr "type" "imov")
(and (eq_attr "alternative" "2")
(match_operand:HI 1 "immediate_operand" "")))
(and (match_operand:HI 1 "immediate_operand" "")
(eq_attr "alternative" "4")))
(const_string "5")
;; Avoid extra dependency on partial register.
(and (eq_attr "type" "imov")
(and (eq_attr "alternative" "1")
(ne (symbol_ref "TARGET_PARTIAL_REG_DEPENDENCY")
(const_int 0))))
(const_string "5")
;; Avoid partial register stalls when not using QImode arithmetic
(and (eq_attr "type" "imov")
(and (eq_attr "alternative" "1")
(and (ne (symbol_ref "TARGET_PARTIAL_REG_STALL")
(const_int 0))
(eq (symbol_ref "TARGET_QIMODE_MATH")
(const_int 0)))))
(const_string "5")
]
(const_string "*")))])
@ -1904,10 +2006,38 @@
;; On the average, pushdf using integers can be still shorter. Allow this
;; pattern for optimize_size too.
(define_insn "*pushdf"
(define_insn "*pushdf_nointeger"
[(set (match_operand:DF 0 "push_operand" "=<,<,<")
(match_operand:DF 1 "general_no_elim_operand" "f,Fo#f,*r#f"))]
"!TARGET_INTEGER_DFMODE_MOVES"
"*
{
switch (which_alternative)
{
case 0:
/* %%% We lose REG_DEAD notes for controlling pops if we split late. */
operands[0] = gen_rtx_MEM (DFmode, stack_pointer_rtx);
operands[2] = stack_pointer_rtx;
operands[3] = GEN_INT (8);
if (find_regno_note (insn, REG_DEAD, REGNO (operands[1])))
return \"sub{l}\\t{%3, %2|%2, %3}\;fstp%z0\\t%y0\";
else
return \"sub{l}\\t{%3, %2|%2, %3}\;fst%z0\\t%y0\";
case 1:
case 2:
return \"#\";
default:
abort ();
}
}"
[(set_attr "type" "multi")])
(define_insn "*pushdf_integer"
[(set (match_operand:DF 0 "push_operand" "=<,<")
(match_operand:DF 1 "general_no_elim_operand" "f#r,rFo#f"))]
""
"TARGET_INTEGER_DFMODE_MOVES"
"*
{
switch (which_alternative)
@ -1955,7 +2085,7 @@
[(set (match_operand:DF 0 "nonimmediate_operand" "=f,m,f,*r,o")
(match_operand:DF 1 "general_operand" "fm,f,G,*roF,F*r"))]
"(GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM)
&& optimize_size
&& (optimize_size || !TARGET_INTEGER_DFMODE_MOVES)
&& (reload_in_progress || reload_completed
|| GET_CODE (operands[1]) != CONST_DOUBLE
|| memory_operand (operands[0], DFmode))"
@ -2002,7 +2132,7 @@
[(set (match_operand:DF 0 "nonimmediate_operand" "=f#r,m,f#r,r#f,o")
(match_operand:DF 1 "general_operand" "fm#r,f#r,G,roF#f,Fr#f"))]
"(GET_CODE (operands[0]) != MEM || GET_CODE (operands[1]) != MEM)
&& !optimize_size
&& !optimize_size && TARGET_INTEGER_DFMODE_MOVES
&& (reload_in_progress || reload_completed
|| GET_CODE (operands[1]) != CONST_DOUBLE
|| memory_operand (operands[0], DFmode))"
@ -2304,7 +2434,8 @@
else
return \"fxch\\t%0\";
}"
[(set_attr "type" "fxch")])
[(set_attr "type" "fxch")
(set_attr "athlon_decode" "vector")])
;; Zero extension instructions
@ -3202,6 +3333,7 @@
"TARGET_80387"
"fldcw\\t%0"
[(set_attr "length_opcode" "2")
(set_attr "athlon_decode" "vector")
(set_attr "ppro_uops" "few")])
;; Conversion between fixed point and floating point.
@ -7691,6 +7823,7 @@
""
"leave"
[(set_attr "length" "1")
(set_attr "athlon_decode" "vector")
(set_attr "ppro_uops" "few")])
(define_expand "ffssi2"
@ -8123,7 +8256,8 @@
(sqrt:SF (match_operand:SF 1 "register_operand" "0")))]
"! TARGET_NO_FANCY_MATH_387 && TARGET_80387"
"fsqrt"
[(set_attr "type" "fpspc")])
[(set_attr "type" "fpspc")
(set_attr "athlon_decode" "direct")])
(define_insn "sqrtdf2"
[(set (match_operand:DF 0 "register_operand" "=f")
@ -8131,7 +8265,8 @@
"! TARGET_NO_FANCY_MATH_387 && TARGET_80387
&& (TARGET_IEEE_FP || flag_fast_math) "
"fsqrt"
[(set_attr "type" "fpspc")])
[(set_attr "type" "fpspc")
(set_attr "athlon_decode" "direct")])
(define_insn "*sqrtextendsfdf2"
[(set (match_operand:DF 0 "register_operand" "=f")
@ -8139,7 +8274,8 @@
(match_operand:SF 1 "register_operand" "0"))))]
"! TARGET_NO_FANCY_MATH_387 && TARGET_80387"
"fsqrt"
[(set_attr "type" "fpspc")])
[(set_attr "type" "fpspc")
(set_attr "athlon_decode" "direct")])
(define_insn "sqrtxf2"
[(set (match_operand:XF 0 "register_operand" "=f")
@ -8147,7 +8283,8 @@
"! TARGET_NO_FANCY_MATH_387 && TARGET_80387
&& (TARGET_IEEE_FP || flag_fast_math) "
"fsqrt"
[(set_attr "type" "fpspc")])
[(set_attr "type" "fpspc")
(set_attr "athlon_decode" "direct")])
(define_insn "*sqrtextenddfxf2"
[(set (match_operand:XF 0 "register_operand" "=f")
@ -8155,7 +8292,8 @@
(match_operand:DF 1 "register_operand" "0"))))]
"! TARGET_NO_FANCY_MATH_387 && TARGET_80387"
"fsqrt"
[(set_attr "type" "fpspc")])
[(set_attr "type" "fpspc")
(set_attr "athlon_decode" "direct")])
(define_insn "*sqrtextendsfxf2"
[(set (match_operand:XF 0 "register_operand" "=f")
@ -8163,7 +8301,8 @@
(match_operand:SF 1 "register_operand" "0"))))]
"! TARGET_NO_FANCY_MATH_387 && TARGET_80387"
"fsqrt"
[(set_attr "type" "fpspc")])
[(set_attr "type" "fpspc")
(set_attr "athlon_decode" "direct")])
(define_insn "sindf2"
[(set (match_operand:DF 0 "register_operand" "=f")