ia64.md (UNSPEC_SETF_EXP,UNSPEC_FR_SQRT_RECIP_APPROX): New constants.

* ia64.md (UNSPEC_SETF_EXP,UNSPEC_FR_SQRT_RECIP_APPROX): New constants.
	(*sqrt_approx): New instruction pattern for approximate square roots.
	(*setf_exp_xf): New instruction pattern for exponentiation.
	(*maddxf4_alts_truncsf): New instruction pattern for truncation.
	(sqrtsf2_internal_thr): New define_insn_and_split implementing
	throughput-optimized inline calculation of SFmode square root.
	(sqrtdf2_internal_thr): Likewise for DFmode.
	(sqrtxf2_internal_thr): Likewise for XFmode.
	(sqrtsf2, sqrtdf2, sqrtxf2): New expanders to choose between
	latency- and throughput-optimized square root algorithms.
	* ia64.h (MASK_INLINE_SQRT_LAT, MASK_INLINE_SQRT_THR,
	TARGET_INLINE_SQRT_LAT, TARGET_INLINE_SQRT_THR, TARGET_INLINE_SQRT):
	New macros.
	(TARGET_SWITCHES): Add -minline-sqrt-min-latency and
	-minline-sqrt-max-throughput.
	* ia64.c (ia64_override_options): If both -minline-sqrt-min-latency
	and -minline-sqrt-max-throughput are given, notify the user
	that both options cannot be used simultaneously.
	If -minline-sqrt-min-latency is given, notify the user that
	this mode is not yet implemented.
	(rtx_needs_barrier): Reformat initial comment to obey
	72-character width limit.  Support UNSPEC_SETF_EXP and
	UNSPEC_FR_SQRT_RECIP_APPROX.

From-SVN: r73027
Zack Weinberg 2003-10-29 00:55:43 +00:00
parent 1e8fee4a42
commit b38ba46301
4 changed files with 534 additions and 4 deletions
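Background on the math behind the new sqrt patterns (a summary for orientation, not text from the patch): each throughput-optimized splitter starts from the frsqrta approximation y0 of 1/sqrt(a) and refines the pair S ~ sqrt(a), H ~ 1/(2*sqrt(a)) using only fused multiply-adds. The underlying Newton-Raphson identity is

    y_{k+1} = y_k + y_k\bigl(\tfrac12 - (a\,y_k)\cdot\tfrac{y_k}{2}\bigr) = \tfrac{y_k}{2}\,(3 - a\,y_k^2),

whose relative error is roughly squared at every step, followed by residual corrections of the form S <- S + (a - S*S)*H that deliver the final rounding in the target mode.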

@@ -1,3 +1,29 @@
2003-10-28 Zack Weinberg <zack@codesourcery.com>
* ia64.md (UNSPEC_SETF_EXP,UNSPEC_FR_SQRT_RECIP_APPROX): New constants.
(*sqrt_approx): New instruction pattern for approximate square roots.
(*setf_exp_xf): New instruction pattern for exponentiation.
(*maddxf4_alts_truncsf): New instruction pattern for truncation.
(sqrtsf2_internal_thr): New define_insn_and_split implementing
throughput-optimized inline calculation of SFmode square root.
(sqrtdf2_internal_thr): Likewise for DFmode.
(sqrtxf2_internal_thr): Likewise for XFmode.
(sqrtsf2, sqrtdf2, sqrtxf2): New expanders to choose between
latency- and throughput-optimized square root algorithms.
* ia64.h (MASK_INLINE_SQRT_LAT, MASK_INLINE_SQRT_THR,
TARGET_INLINE_SQRT_LAT, TARGET_INLINE_SQRT_THR, TARGET_INLINE_SQRT):
New macros.
(TARGET_SWITCHES): Add -minline-sqrt-min-latency and
-minline-sqrt-max-throughput.
* ia64.c (ia64_override_options): If both -minline-sqrt-min-latency
and -minline-sqrt-max-throughput are given, notify the user
that both options cannot be used simultaneously.
If -minline-sqrt-min-latency is given, notify the user that
this mode is not yet implemented.
(rtx_needs_barrier): Reformat initial comment to obey
72-character width limit. Support UNSPEC_SETF_EXP and
UNSPEC_FR_SQRT_RECIP_APPROX.
2003-10-29 Alan Modra <amodra@bigpond.net.au>
* config/rs6000/rs6000.md (movdf_softfloat64): Allow dummy ctr,ctr

@@ -4487,6 +4487,18 @@ ia64_override_options (void)
target_flags &= ~MASK_INLINE_INT_DIV_THR;
}
if (TARGET_INLINE_SQRT_LAT && TARGET_INLINE_SQRT_THR)
{
warning ("cannot optimize square root for both latency and throughput");
target_flags &= ~MASK_INLINE_SQRT_THR;
}
if (TARGET_INLINE_SQRT_LAT)
{
warning ("not yet implemented: latency-optimized inline square root");
target_flags &= ~MASK_INLINE_SQRT_LAT;
}
if (ia64_fixed_range_string)
fix_range (ia64_fixed_range_string);
@@ -4896,9 +4908,9 @@ set_src_needs_barrier (rtx x, struct reg_flags flags, int pred, rtx cond)
return need_barrier;
}
/* Handle an access to rtx X of type FLAGS using predicate register PRED.
Return 1 is this access creates a dependency with an earlier instruction
in the same group. */
/* Handle an access to rtx X of type FLAGS using predicate register
PRED. Return 1 if this access creates a dependency with an earlier
instruction in the same group. */
static int
rtx_needs_barrier (rtx x, struct reg_flags flags, int pred)
@@ -5124,7 +5136,9 @@ rtx_needs_barrier (rtx x, struct reg_flags flags, int pred)
case UNSPEC_FR_SPILL:
case UNSPEC_FR_RESTORE:
case UNSPEC_GETF_EXP:
case UNSPEC_SETF_EXP:
case UNSPEC_ADDP4:
case UNSPEC_FR_SQRT_RECIP_APPROX:
need_barrier = rtx_needs_barrier (XVECEXP (x, 0, 0), flags, pred);
break;

@@ -87,6 +87,10 @@ extern int target_flags;
#define MASK_INLINE_INT_DIV_THR 0x00001000 /* inline div, max throughput. */
#define MASK_INLINE_SQRT_LAT 0x00002000 /* inline sqrt, min latency. */
#define MASK_INLINE_SQRT_THR 0x00004000 /* inline sqrt, max throughput. */
#define MASK_DWARF2_ASM 0x40000000 /* test dwarf2 line info via gas. */
#define MASK_EARLY_STOP_BITS 0x00002000 /* tune stop bits for the model. */
@@ -127,6 +131,13 @@ extern int target_flags;
#define TARGET_INLINE_INT_DIV \
(target_flags & (MASK_INLINE_INT_DIV_LAT | MASK_INLINE_INT_DIV_THR))
#define TARGET_INLINE_SQRT_LAT (target_flags & MASK_INLINE_SQRT_LAT)
#define TARGET_INLINE_SQRT_THR (target_flags & MASK_INLINE_SQRT_THR)
#define TARGET_INLINE_SQRT \
(target_flags & (MASK_INLINE_SQRT_LAT | MASK_INLINE_SQRT_THR))
#define TARGET_DWARF2_ASM (target_flags & MASK_DWARF2_ASM)
extern int ia64_tls_size;
@@ -186,6 +197,10 @@ extern int ia64_tls_size;
N_("Generate inline integer division, optimize for latency") }, \
{ "inline-int-divide-max-throughput", MASK_INLINE_INT_DIV_THR, \
N_("Generate inline integer division, optimize for throughput") },\
{ "inline-sqrt-min-latency", MASK_INLINE_SQRT_LAT, \
N_("Generate inline square root, optimize for latency") }, \
{ "inline-sqrt-max-throughput", MASK_INLINE_SQRT_THR, \
N_("Generate inline square root, optimize for throughput") }, \
{ "dwarf2-asm", MASK_DWARF2_ASM, \
N_("Enable Dwarf 2 line debug info via GNU as")}, \
{ "no-dwarf2-asm", -MASK_DWARF2_ASM, \

@@ -74,6 +74,8 @@
(UNSPEC_ADDP4 24)
(UNSPEC_PROLOGUE_USE 25)
(UNSPEC_RET_ADDR 26)
(UNSPEC_SETF_EXP 27)
(UNSPEC_FR_SQRT_RECIP_APPROX 28)
])
(define_constants
@@ -2757,6 +2759,155 @@
operands[10] = CONST1_RTX (XFmode);
}
[(set_attr "predicable" "no")])
;; Inline square root.
(define_insn "*sqrt_approx"
[(set (match_operand:XF 0 "fr_register_operand" "=f")
(div:XF (const_int 1)
(sqrt:XF (match_operand:XF 2 "fr_register_operand" "f"))))
(set (match_operand:BI 1 "register_operand" "=c")
(unspec:BI [(match_dup 2)] UNSPEC_FR_SQRT_RECIP_APPROX))
(use (match_operand:SI 3 "const_int_operand" "")) ]
""
"frsqrta.s%3 %0, %1 = %2"
[(set_attr "itanium_class" "fmisc")
(set_attr "predicable" "no")])
(define_insn "*setf_exp_xf"
[(set (match_operand:XF 0 "fr_register_operand" "=f")
(unspec:XF [(match_operand:DI 1 "register_operand" "r")]
UNSPEC_SETF_EXP))]
""
"setf.exp %0 = %1"
[(set_attr "itanium_class" "frfr")])
(define_expand "sqrtsf2"
[(set (match_operand:SF 0 "fr_register_operand" "=&f")
(sqrt:SF (match_operand:SF 1 "fr_register_operand" "f")))]
"TARGET_INLINE_SQRT"
{
rtx insn;
if (TARGET_INLINE_SQRT_LAT)
#if 0
insn = gen_sqrtsf2_internal_lat (operands[0], operands[1]);
#else
abort ();
#endif
else
insn = gen_sqrtsf2_internal_thr (operands[0], operands[1]);
emit_insn (insn);
DONE;
})
;; Latency-optimized square root.
;; FIXME: Implement.
;; Throughput-optimized square root.
(define_insn_and_split "sqrtsf2_internal_thr"
[(set (match_operand:SF 0 "fr_register_operand" "=&f")
(sqrt:SF (match_operand:SF 1 "fr_register_operand" "f")))
;; Register r2 in optimization guide.
(clobber (match_scratch:DI 2 "=r"))
;; Register f8 in optimization guide
(clobber (match_scratch:XF 3 "=&f"))
;; Register f9 in optimization guide
(clobber (match_scratch:XF 4 "=&f"))
;; Register f10 in optimization guide
(clobber (match_scratch:XF 5 "=&f"))
;; Register p6 in optimization guide.
(clobber (match_scratch:BI 6 "=c"))]
"TARGET_INLINE_SQRT_THR"
"#"
"&& reload_completed"
[ ;; exponent of +1/2 in r2
(set (match_dup 2) (const_int 65534))
;; +1/2 in f8
(set (match_dup 3)
(unspec:XF [(match_dup 2)] UNSPEC_SETF_EXP))
;; Step 1
;; y0 = 1/sqrt(a) in f7
(parallel [(set (match_dup 7)
(div:XF (const_int 1)
(sqrt:XF (match_dup 8))))
(set (match_dup 6)
(unspec:BI [(match_dup 8)]
UNSPEC_FR_SQRT_RECIP_APPROX))
(use (const_int 0))])
;; Step 2
;; H0 = 1/2 * y0 in f9
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 4)
(plus:XF (mult:XF (match_dup 3) (match_dup 7))
(match_dup 9)))
(use (const_int 1))]))
;; Step 3
;; S0 = a * y0 in f7
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 7)
(plus:XF (mult:XF (match_dup 8) (match_dup 7))
(match_dup 9)))
(use (const_int 1))]))
;; Step 4
;; d = 1/2 - S0 * H0 in f10
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 5)
(plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 4)))
(match_dup 3)))
(use (const_int 1))]))
;; Step 5
;; d' = d + 1/2 * d in f8
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 3)
(plus:XF (mult:XF (match_dup 3) (match_dup 5))
(match_dup 5)))
(use (const_int 1))]))
;; Step 6
;; e = d + d * d' in f8
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 3)
(plus:XF (mult:XF (match_dup 5) (match_dup 3))
(match_dup 5)))
(use (const_int 1))]))
;; Step 7
;; S1 = S0 + e * S0 in f7
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 0)
(float_truncate:SF
(plus:XF (mult:XF (match_dup 3) (match_dup 7))
(match_dup 7))))
(use (const_int 1))]))
;; Step 8
;; H1 = H0 + e * H0 in f8
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 3)
(plus:XF (mult:XF (match_dup 3) (match_dup 4))
(match_dup 4)))
(use (const_int 1))]))
;; Step 9
;; d1 = a - S1 * S1 in f9
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 4)
(plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 7)))
(match_dup 8)))
(use (const_int 1))]))
;; Step 10
;; S = S1 + d1 * H1 in f7
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 0)
(float_truncate:SF
(plus:XF (mult:XF (match_dup 4) (match_dup 3))
(match_dup 7))))
(use (const_int 0))]))]
{
/* Generate 82-bit versions of the input and output operands. */
operands[7] = gen_rtx_REG (XFmode, REGNO (operands[0]));
operands[8] = gen_rtx_REG (XFmode, REGNO (operands[1]));
/* Generate required floating-point constants. */
operands[9] = CONST0_RTX (XFmode);
}
[(set_attr "predicable" "no")])
;; ::::::::::::::::::::
;; ::
@@ -3102,6 +3253,155 @@
operands[10] = CONST1_RTX (XFmode);
}
[(set_attr "predicable" "no")])
;; Inline square root.
(define_expand "sqrtdf2"
[(set (match_operand:DF 0 "fr_register_operand" "=&f")
(sqrt:DF (match_operand:DF 1 "fr_register_operand" "f")))]
"TARGET_INLINE_SQRT"
{
rtx insn;
if (TARGET_INLINE_SQRT_LAT)
#if 0
insn = gen_sqrtdf2_internal_lat (operands[0], operands[1]);
#else
abort ();
#endif
else
insn = gen_sqrtdf2_internal_thr (operands[0], operands[1]);
emit_insn (insn);
DONE;
})
;; Latency-optimized square root.
;; FIXME: Implement.
;; Throughput-optimized square root.
(define_insn_and_split "sqrtdf2_internal_thr"
[(set (match_operand:DF 0 "fr_register_operand" "=&f")
(sqrt:DF (match_operand:DF 1 "fr_register_operand" "f")))
;; Register r2 in optimization guide.
(clobber (match_scratch:DI 2 "=r"))
;; Register f8 in optimization guide
(clobber (match_scratch:XF 3 "=&f"))
;; Register f9 in optimization guide
(clobber (match_scratch:XF 4 "=&f"))
;; Register f10 in optimization guide
(clobber (match_scratch:XF 5 "=&f"))
;; Register p6 in optimization guide.
(clobber (match_scratch:BI 6 "=c"))]
"TARGET_INLINE_SQRT_THR"
"#"
"&& reload_completed"
[ ;; exponent of +1/2 in r2
(set (match_dup 2) (const_int 65534))
;; +1/2 in f10
(set (match_dup 5)
(unspec:XF [(match_dup 2)] UNSPEC_SETF_EXP))
;; Step 1
;; y0 = 1/sqrt(a) in f7
(parallel [(set (match_dup 7)
(div:XF (const_int 1)
(sqrt:XF (match_dup 8))))
(set (match_dup 6)
(unspec:BI [(match_dup 8)]
UNSPEC_FR_SQRT_RECIP_APPROX))
(use (const_int 0))])
;; Step 2
;; H0 = 1/2 * y0 in f8
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 3)
(plus:XF (mult:XF (match_dup 5) (match_dup 7))
(match_dup 9)))
(use (const_int 1))]))
;; Step 3
;; G0 = a * y0 in f7
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 7)
(plus:XF (mult:XF (match_dup 8) (match_dup 7))
(match_dup 9)))
(use (const_int 1))]))
;; Step 4
;; r0 = 1/2 - G0 * H0 in f9
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 4)
(plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 3)))
(match_dup 5)))
(use (const_int 1))]))
;; Step 5
;; H1 = H0 + r0 * H0 in f8
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 3)
(plus:XF (mult:XF (match_dup 4) (match_dup 3))
(match_dup 3)))
(use (const_int 1))]))
;; Step 6
;; G1 = G0 + r0 * G0 in f7
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 7)
(plus:XF (mult:XF (match_dup 4) (match_dup 7))
(match_dup 7)))
(use (const_int 1))]))
;; Step 7
;; r1 = 1/2 - G1 * H1 in f9
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 4)
(plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 3)))
(match_dup 5)))
(use (const_int 1))]))
;; Step 8
;; H2 = H1 + r1 * H1 in f8
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 3)
(plus:XF (mult:XF (match_dup 4) (match_dup 3))
(match_dup 3)))
(use (const_int 1))]))
;; Step 9
;; G2 = G1 + r1 * G1 in f7
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 7)
(plus:XF (mult:XF (match_dup 4) (match_dup 7))
(match_dup 7)))
(use (const_int 1))]))
;; Step 10
;; d2 = a - G2 * G2 in f9
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 4)
(plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 7)))
(match_dup 8)))
(use (const_int 1))]))
;; Step 11
;; G3 = G2 + d2 * H2 in f7
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 7)
(plus:XF (mult:XF (match_dup 4) (match_dup 3))
(match_dup 7)))
(use (const_int 1))]))
;; Step 12
;; d3 = a - G3 * G3 in f9
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 4)
(plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 7)))
(match_dup 8)))
(use (const_int 1))]))
;; Step 13
;; S = G3 + d3 * H2 in f7
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 0)
(float_truncate:DF
(plus:XF (mult:XF (match_dup 4) (match_dup 3))
(match_dup 7))))
(use (const_int 0))]))]
{
/* Generate 82-bit versions of the input and output operands. */
operands[7] = gen_rtx_REG (XFmode, REGNO (operands[0]));
operands[8] = gen_rtx_REG (XFmode, REGNO (operands[1]));
/* Generate required floating-point constants. */
operands[9] = CONST0_RTX (XFmode);
}
[(set_attr "predicable" "no")])
;; ::::::::::::::::::::
;; ::
@@ -3292,6 +3592,17 @@
"fma.s%4 %0 = %F1, %F2, %F3"
[(set_attr "itanium_class" "fmac")])
(define_insn "*maddxf4_alts_truncsf"
[(set (match_operand:SF 0 "fr_register_operand" "=f")
(float_truncate:SF
(plus:XF (mult:XF (match_operand:XF 1 "xfreg_or_fp01_operand" "fG")
(match_operand:XF 2 "xfreg_or_fp01_operand" "fG"))
(match_operand:XF 3 "xfreg_or_fp01_operand" "fG"))))
(use (match_operand:SI 4 "const_int_operand" ""))]
""
"fma.s.s%4 %0 = %F1, %F2, %F3"
[(set_attr "itanium_class" "fmac")])
(define_insn "*maddxf4_alts_truncdf"
[(set (match_operand:DF 0 "fr_register_operand" "=f")
(float_truncate:DF
@@ -3591,6 +3902,170 @@
"operands[6] = CONST1_RTX (XFmode);"
[(set_attr "predicable" "no")])
;; Inline square root.
(define_expand "sqrtxf2"
[(set (match_operand:XF 0 "fr_register_operand" "=&f")
(sqrt:XF (match_operand:XF 1 "fr_register_operand" "f")))]
"TARGET_INLINE_SQRT"
{
rtx insn;
if (TARGET_INLINE_SQRT_LAT)
#if 0
insn = gen_sqrtxf2_internal_lat (operands[0], operands[1]);
#else
abort ();
#endif
else
insn = gen_sqrtxf2_internal_thr (operands[0], operands[1]);
emit_insn (insn);
DONE;
})
;; Latency-optimized square root.
;; FIXME: Implement.
;; Throughput-optimized square root.
(define_insn_and_split "sqrtxf2_internal_thr"
[(set (match_operand:XF 0 "fr_register_operand" "=&f")
(sqrt:XF (match_operand:XF 1 "fr_register_operand" "f")))
;; Register r2 in optimization guide.
(clobber (match_scratch:DI 2 "=r"))
;; Register f8 in optimization guide
(clobber (match_scratch:XF 3 "=&f"))
;; Register f9 in optimization guide
(clobber (match_scratch:XF 4 "=&f"))
;; Register f10 in optimization guide
(clobber (match_scratch:XF 5 "=&f"))
;; Register f11 in optimization guide
(clobber (match_scratch:XF 6 "=&f"))
;; Register p6 in optimization guide.
(clobber (match_scratch:BI 7 "=c"))]
"TARGET_INLINE_SQRT_THR"
"#"
"&& reload_completed"
[ ;; exponent of +1/2 in r2
(set (match_dup 2) (const_int 65534))
;; +1/2 in f8. The Intel manual mistakenly specifies f10.
(set (match_dup 3)
(unspec:XF [(match_dup 2)] UNSPEC_SETF_EXP))
;; Step 1
;; y0 = 1/sqrt(a) in f7
(parallel [(set (match_dup 8)
(div:XF (const_int 1)
(sqrt:XF (match_dup 9))))
(set (match_dup 7)
(unspec:BI [(match_dup 9)]
UNSPEC_FR_SQRT_RECIP_APPROX))
(use (const_int 0))])
;; Step 2
;; H0 = 1/2 * y0 in f9
(cond_exec (ne (match_dup 7) (const_int 0))
(parallel [(set (match_dup 4)
(plus:XF (mult:XF (match_dup 3) (match_dup 8))
(match_dup 10)))
(use (const_int 1))]))
;; Step 3
;; S0 = a * y0 in f7
(cond_exec (ne (match_dup 7) (const_int 0))
(parallel [(set (match_dup 8)
(plus:XF (mult:XF (match_dup 9) (match_dup 8))
(match_dup 10)))
(use (const_int 1))]))
;; Step 4
;; d0 = 1/2 - S0 * H0 in f10
(cond_exec (ne (match_dup 7) (const_int 0))
(parallel [(set (match_dup 5)
(plus:XF (neg:XF (mult:XF (match_dup 8) (match_dup 4)))
(match_dup 3)))
(use (const_int 1))]))
;; Step 5
;; H1 = H0 + d0 * H0 in f9
(cond_exec (ne (match_dup 7) (const_int 0))
(parallel [(set (match_dup 4)
(plus:XF (mult:XF (match_dup 5) (match_dup 4))
(match_dup 4)))
(use (const_int 1))]))
;; Step 6
;; S1 = S0 + d0 * S0 in f7
(cond_exec (ne (match_dup 7) (const_int 0))
(parallel [(set (match_dup 8)
(plus:XF (mult:XF (match_dup 5) (match_dup 8))
(match_dup 8)))
(use (const_int 1))]))
;; Step 7
;; d1 = 1/2 - S1 * H1 in f10
(cond_exec (ne (match_dup 7) (const_int 0))
(parallel [(set (match_dup 5)
(plus:XF (neg:XF (mult:XF (match_dup 8) (match_dup 4)))
(match_dup 3)))
(use (const_int 1))]))
;; Step 8
;; H2 = H1 + d1 * H1 in f9
(cond_exec (ne (match_dup 7) (const_int 0))
(parallel [(set (match_dup 4)
(plus:XF (mult:XF (match_dup 5) (match_dup 4))
(match_dup 4)))
(use (const_int 1))]))
;; Step 9
;; S2 = S1 + d1 * S1 in f7
(cond_exec (ne (match_dup 7) (const_int 0))
(parallel [(set (match_dup 8)
(plus:XF (mult:XF (match_dup 5) (match_dup 8))
(match_dup 8)))
(use (const_int 1))]))
;; Step 10
;; d2 = 1/2 - S2 * H2 in f10
(cond_exec (ne (match_dup 7) (const_int 0))
(parallel [(set (match_dup 5)
(plus:XF (neg:XF (mult:XF (match_dup 8) (match_dup 4)))
(match_dup 3)))
(use (const_int 1))]))
;; Step 11
;; e2 = a - S2 * S2 in f8
(cond_exec (ne (match_dup 7) (const_int 0))
(parallel [(set (match_dup 3)
(plus:XF (neg:XF (mult:XF (match_dup 8) (match_dup 8)))
(match_dup 9)))
(use (const_int 1))]))
;; Step 12
;; S3 = S2 + e2 * H2 in f7
(cond_exec (ne (match_dup 7) (const_int 0))
(parallel [(set (match_dup 8)
(plus:XF (mult:XF (match_dup 3) (match_dup 4))
(match_dup 8)))
(use (const_int 1))]))
;; Step 13
;; H3 = H2 + d2 * H2 in f9
(cond_exec (ne (match_dup 7) (const_int 0))
(parallel [(set (match_dup 4)
(plus:XF (mult:XF (match_dup 5) (match_dup 4))
(match_dup 4)))
(use (const_int 1))]))
;; Step 14
;; e3 = a - S3 * S3 in f8
(cond_exec (ne (match_dup 7) (const_int 0))
(parallel [(set (match_dup 3)
(plus:XF (neg:XF (mult:XF (match_dup 8) (match_dup 8)))
(match_dup 9)))
(use (const_int 1))]))
;; Step 15
;; S = S3 + e3 * H3 in f7
(cond_exec (ne (match_dup 7) (const_int 0))
(parallel [(set (match_dup 0)
(plus:XF (mult:XF (match_dup 3) (match_dup 4))
(match_dup 8)))
(use (const_int 0))]))]
{
/* Generate 82-bit versions of the input and output operands. */
operands[8] = gen_rtx_REG (XFmode, REGNO (operands[0]));
operands[9] = gen_rtx_REG (XFmode, REGNO (operands[1]));
/* Generate required floating-point constants. */
operands[10] = CONST0_RTX (XFmode);
}
[(set_attr "predicable" "no")])
;; ??? frcpa works like cmp.foo.unc.
(define_insn "*recip_approx"