ia64.md (UNSPEC_SETF_EXP,UNSPEC_FR_SQRT_RECIP_APPROX): New constants.

* ia64.md (UNSPEC_SETF_EXP,UNSPEC_FR_SQRT_RECIP_APPROX): New constants.
	(*sqrt_approx): New instruction pattern for approximate square roots.
	(*setf_exp_xf): New instruction pattern for exponentiation.
	(*maddxf4_alts_truncsf): New instruction pattern for truncation.
	(sqrtsf2_internal_thr): New define_and_split implementing
	throughput-optimized inline calculation of SFmode square root.
	(sqrtdf2_internal_thr): Likewise for DFmode.
	(sqrtxf2_internal_thr): Likewise for XFmode.
	(sqrtsf2, sqrtdf2, sqrtxf2): New expanders to choose between
	latency- and throughput-optimized square root algorithms.
	* ia64.h (MASK_INLINE_SQRT_LAT, MASK_INLINE_SQRT_THR,
	TARGET_INLINE_SQRT_LAT, TARGET_INLINE_SQRT_THR, TARGET_INLINE_SQRT):
	New macros.
	(TARGET_SWITCHES): Add -minline-sqrt-min-latency and
	-minline-sqrt-max-throughput.
	* ia64.c (ia64_override_options): If both -minline-sqrt-min-latency
	and -minline-sqrt-max-throughput are given, notify the user
	that both options cannot be used simultaneously.
	If -minline-sqrt-min-latency is given, notify the user that
	this mode is not yet implemented.
	(rtx_needs_barrier): Reformat initial comment to obey
	72-character width limit.  Support UNSPEC_SETF_EXP and
	UNSPEC_FR_SQRT_RECIP_APPROX.

From-SVN: r73027
This commit is contained in:
Zack Weinberg 2003-10-29 00:55:43 +00:00
parent 1e8fee4a42
commit b38ba46301
4 changed files with 534 additions and 4 deletions

View File

@@ -1,3 +1,29 @@
2003-10-28 Zack Weinberg <zack@codesourcery.com>
* ia64.md (UNSPEC_SETF_EXP,UNSPEC_FR_SQRT_RECIP_APPROX): New constants.
(*sqrt_approx): New instruction pattern for approximate square roots.
(*setf_exp_xf): New instruction pattern for exponentiation.
(*maddxf4_alts_truncsf): New instruction pattern for truncation.
(sqrtsf2_internal_thr): New define_and_split implementing
throughput-optimized inline calculation of SFmode square root.
(sqrtdf2_internal_thr): Likewise for DFmode.
(sqrtxf2_internal_thr): Likewise for XFmode.
(sqrtsf2, sqrtdf2, sqrtxf2): New expanders to choose between
latency- and throughput-optimized square root algorithms.
* ia64.h (MASK_INLINE_SQRT_LAT, MASK_INLINE_SQRT_THR,
TARGET_INLINE_SQRT_LAT, TARGET_INLINE_SQRT_THR, TARGET_INLINE_SQRT):
New macros.
(TARGET_SWITCHES): Add -minline-sqrt-min-latency and
-minline-sqrt-max-throughput.
* ia64.c (ia64_override_options): If both -minline-sqrt-min-latency
and -minline-sqrt-max-throughput are given, notify the user
that both options cannot be used simultaneously.
If -minline-sqrt-min-latency is given, notify the user that
this mode is not yet implemented.
(rtx_needs_barrier): Reformat initial comment to obey
72-character width limit. Support UNSPEC_SETF_EXP and
UNSPEC_FR_SQRT_RECIP_APPROX.
2003-10-29 Alan Modra <amodra@bigpond.net.au>
* config/rs6000/rs6000.md (movdf_softfloat64): Allow dummy ctr,ctr
@@ -12,7 +38,7 @@
2003-10-28 Richard Earnshaw <rearnsha@arm.com>
* arm.c (arm_output_epilogue): When using a frame pointer, don't emit
an extra stack adjustment insn if the stack pointer is already
pointing at the right place.
(use_return_insn): Allow a return insn to be used when we have a
frame pointer if the stack pointer is in the right place.

View File

@@ -4487,6 +4487,18 @@ ia64_override_options (void)
target_flags &= ~MASK_INLINE_INT_DIV_THR;
}
if (TARGET_INLINE_SQRT_LAT && TARGET_INLINE_SQRT_THR)
{
warning ("cannot optimize square root for both latency and throughput");
target_flags &= ~MASK_INLINE_SQRT_THR;
}
if (TARGET_INLINE_SQRT_LAT)
{
warning ("not yet implemented: latency-optimized inline square root");
target_flags &= ~MASK_INLINE_SQRT_LAT;
}
if (ia64_fixed_range_string)
fix_range (ia64_fixed_range_string);
@@ -4896,9 +4908,9 @@ set_src_needs_barrier (rtx x, struct reg_flags flags, int pred, rtx cond)
return need_barrier;
}
/* Handle an access to rtx X of type FLAGS using predicate register PRED.
Return 1 is this access creates a dependency with an earlier instruction
in the same group. */
/* Handle an access to rtx X of type FLAGS using predicate register
PRED. Return 1 if this access creates a dependency with an earlier
instruction in the same group. */
static int
rtx_needs_barrier (rtx x, struct reg_flags flags, int pred)
@@ -5124,7 +5136,9 @@ rtx_needs_barrier (rtx x, struct reg_flags flags, int pred)
case UNSPEC_FR_SPILL:
case UNSPEC_FR_RESTORE:
case UNSPEC_GETF_EXP:
case UNSPEC_SETF_EXP:
case UNSPEC_ADDP4:
case UNSPEC_FR_SQRT_RECIP_APPROX:
need_barrier = rtx_needs_barrier (XVECEXP (x, 0, 0), flags, pred);
break;

View File

@@ -87,6 +87,10 @@ extern int target_flags;
#define MASK_INLINE_INT_DIV_THR 0x00001000 /* inline div, max throughput. */
#define MASK_INLINE_SQRT_LAT 0x00002000 /* inline sqrt, min latency. */
#define MASK_INLINE_SQRT_THR 0x00004000 /* inline sqrt, max throughput. */
#define MASK_DWARF2_ASM 0x40000000 /* test dwarf2 line info via gas. */
#define MASK_EARLY_STOP_BITS 0x00002000 /* tune stop bits for the model. */
@@ -127,6 +131,13 @@ extern int target_flags;
#define TARGET_INLINE_INT_DIV \
(target_flags & (MASK_INLINE_INT_DIV_LAT | MASK_INLINE_INT_DIV_THR))
#define TARGET_INLINE_SQRT_LAT (target_flags & MASK_INLINE_SQRT_LAT)
#define TARGET_INLINE_SQRT_THR (target_flags & MASK_INLINE_SQRT_THR)
#define TARGET_INLINE_SQRT \
(target_flags & (MASK_INLINE_SQRT_LAT | MASK_INLINE_SQRT_THR))
#define TARGET_DWARF2_ASM (target_flags & MASK_DWARF2_ASM)
extern int ia64_tls_size;
@@ -186,6 +197,10 @@ extern int ia64_tls_size;
N_("Generate inline integer division, optimize for latency") }, \
{ "inline-int-divide-max-throughput", MASK_INLINE_INT_DIV_THR, \
N_("Generate inline integer division, optimize for throughput") },\
{ "inline-sqrt-min-latency", MASK_INLINE_SQRT_LAT, \
N_("Generate inline square root, optimize for latency") }, \
{ "inline-sqrt-max-throughput", MASK_INLINE_SQRT_THR, \
N_("Generate inline square root, optimize for throughput") }, \
{ "dwarf2-asm", MASK_DWARF2_ASM, \
N_("Enable Dwarf 2 line debug info via GNU as")}, \
{ "no-dwarf2-asm", -MASK_DWARF2_ASM, \

View File

@@ -74,6 +74,8 @@
(UNSPEC_ADDP4 24)
(UNSPEC_PROLOGUE_USE 25)
(UNSPEC_RET_ADDR 26)
(UNSPEC_SETF_EXP 27)
(UNSPEC_FR_SQRT_RECIP_APPROX 28)
])
(define_constants
@@ -2757,6 +2759,155 @@
operands[10] = CONST1_RTX (XFmode);
}
[(set_attr "predicable" "no")])
;; Inline square root.
(define_insn "*sqrt_approx"
[(set (match_operand:XF 0 "fr_register_operand" "=f")
(div:XF (const_int 1)
(sqrt:XF (match_operand:XF 2 "fr_register_operand" "f"))))
(set (match_operand:BI 1 "register_operand" "=c")
(unspec:BI [(match_dup 2)] UNSPEC_FR_SQRT_RECIP_APPROX))
(use (match_operand:SI 3 "const_int_operand" "")) ]
""
"frsqrta.s%3 %0, %1 = %2"
[(set_attr "itanium_class" "fmisc")
(set_attr "predicable" "no")])
(define_insn "*setf_exp_xf"
[(set (match_operand:XF 0 "fr_register_operand" "=f")
(unspec:XF [(match_operand:DI 1 "register_operand" "r")]
UNSPEC_SETF_EXP))]
""
"setf.exp %0 = %1"
[(set_attr "itanium_class" "frfr")])
(define_expand "sqrtsf2"
[(set (match_operand:SF 0 "fr_register_operand" "=&f")
(sqrt:SF (match_operand:SF 1 "fr_register_operand" "f")))]
"TARGET_INLINE_SQRT"
{
rtx insn;
if (TARGET_INLINE_SQRT_LAT)
#if 0
insn = gen_sqrtsf2_internal_lat (operands[0], operands[1]);
#else
abort ();
#endif
else
insn = gen_sqrtsf2_internal_thr (operands[0], operands[1]);
emit_insn (insn);
DONE;
})
;; Latency-optimized square root.
;; FIXME: Implement.
;; Throughput-optimized square root.
(define_insn_and_split "sqrtsf2_internal_thr"
[(set (match_operand:SF 0 "fr_register_operand" "=&f")
(sqrt:SF (match_operand:SF 1 "fr_register_operand" "f")))
;; Register r2 in optimization guide.
(clobber (match_scratch:DI 2 "=r"))
;; Register f8 in optimization guide
(clobber (match_scratch:XF 3 "=&f"))
;; Register f9 in optimization guide
(clobber (match_scratch:XF 4 "=&f"))
;; Register f10 in optimization guide
(clobber (match_scratch:XF 5 "=&f"))
;; Register p6 in optimization guide.
(clobber (match_scratch:BI 6 "=c"))]
"TARGET_INLINE_SQRT_THR"
"#"
"&& reload_completed"
[ ;; exponent of +1/2 in r2
(set (match_dup 2) (const_int 65534))
;; +1/2 in f8
(set (match_dup 3)
(unspec:XF [(match_dup 2)] UNSPEC_SETF_EXP))
;; Step 1
;; y0 = 1/sqrt(a) in f7
(parallel [(set (match_dup 7)
(div:XF (const_int 1)
(sqrt:XF (match_dup 8))))
(set (match_dup 6)
(unspec:BI [(match_dup 8)]
UNSPEC_FR_SQRT_RECIP_APPROX))
(use (const_int 0))])
;; Step 2
;; H0 = 1/2 * y0 in f9
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 4)
(plus:XF (mult:XF (match_dup 3) (match_dup 7))
(match_dup 9)))
(use (const_int 1))]))
;; Step 3
;; S0 = a * y0 in f7
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 7)
(plus:XF (mult:XF (match_dup 8) (match_dup 7))
(match_dup 9)))
(use (const_int 1))]))
;; Step 4
;; d = 1/2 - S0 * H0 in f10
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 5)
(plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 4)))
(match_dup 3)))
(use (const_int 1))]))
;; Step 5
;; d' = d + 1/2 * d in f8
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 3)
(plus:XF (mult:XF (match_dup 3) (match_dup 5))
(match_dup 5)))
(use (const_int 1))]))
;; Step 6
;; e = d + d * d' in f8
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 3)
(plus:XF (mult:XF (match_dup 5) (match_dup 3))
(match_dup 5)))
(use (const_int 1))]))
;; Step 7
;; S1 = S0 + e * S0 in f7
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 0)
(float_truncate:SF
(plus:XF (mult:XF (match_dup 3) (match_dup 7))
(match_dup 7))))
(use (const_int 1))]))
;; Step 8
;; H1 = H0 + e * H0 in f8
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 3)
(plus:XF (mult:XF (match_dup 3) (match_dup 4))
(match_dup 4)))
(use (const_int 1))]))
;; Step 9
;; d1 = a - S1 * S1 in f9
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 4)
(plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 7)))
(match_dup 8)))
(use (const_int 1))]))
;; Step 10
;; S = S1 + d1 * H1 in f7
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 0)
(float_truncate:SF
(plus:XF (mult:XF (match_dup 4) (match_dup 3))
(match_dup 7))))
(use (const_int 0))]))]
{
/* Generate 82-bit versions of the input and output operands. */
operands[7] = gen_rtx_REG (XFmode, REGNO (operands[0]));
operands[8] = gen_rtx_REG (XFmode, REGNO (operands[1]));
/* Generate required floating-point constants. */
operands[9] = CONST0_RTX (XFmode);
}
[(set_attr "predicable" "no")])
;; ::::::::::::::::::::
;; ::
@@ -3102,6 +3253,155 @@
operands[10] = CONST1_RTX (XFmode);
}
[(set_attr "predicable" "no")])
;; Inline square root.
(define_expand "sqrtdf2"
[(set (match_operand:DF 0 "fr_register_operand" "=&f")
(sqrt:DF (match_operand:DF 1 "fr_register_operand" "f")))]
"TARGET_INLINE_SQRT"
{
rtx insn;
if (TARGET_INLINE_SQRT_LAT)
#if 0
insn = gen_sqrtdf2_internal_lat (operands[0], operands[1]);
#else
abort ();
#endif
else
insn = gen_sqrtdf2_internal_thr (operands[0], operands[1]);
emit_insn (insn);
DONE;
})
;; Latency-optimized square root.
;; FIXME: Implement.
;; Throughput-optimized square root.
(define_insn_and_split "sqrtdf2_internal_thr"
[(set (match_operand:DF 0 "fr_register_operand" "=&f")
(sqrt:DF (match_operand:DF 1 "fr_register_operand" "f")))
;; Register r2 in optimization guide.
(clobber (match_scratch:DI 2 "=r"))
;; Register f8 in optimization guide
(clobber (match_scratch:XF 3 "=&f"))
;; Register f9 in optimization guide
(clobber (match_scratch:XF 4 "=&f"))
;; Register f10 in optimization guide
(clobber (match_scratch:XF 5 "=&f"))
;; Register p6 in optimization guide.
(clobber (match_scratch:BI 6 "=c"))]
"TARGET_INLINE_SQRT_THR"
"#"
"&& reload_completed"
[ ;; exponent of +1/2 in r2
(set (match_dup 2) (const_int 65534))
;; +1/2 in f10
(set (match_dup 5)
(unspec:XF [(match_dup 2)] UNSPEC_SETF_EXP))
;; Step 1
;; y0 = 1/sqrt(a) in f7
(parallel [(set (match_dup 7)
(div:XF (const_int 1)
(sqrt:XF (match_dup 8))))
(set (match_dup 6)
(unspec:BI [(match_dup 8)]
UNSPEC_FR_SQRT_RECIP_APPROX))
(use (const_int 0))])
;; Step 2
;; H0 = 1/2 * y0 in f8
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 3)
(plus:XF (mult:XF (match_dup 5) (match_dup 7))
(match_dup 9)))
(use (const_int 1))]))
;; Step 3
;; G0 = a * y0 in f7
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 7)
(plus:XF (mult:XF (match_dup 8) (match_dup 7))
(match_dup 9)))
(use (const_int 1))]))
;; Step 4
;; r0 = 1/2 - G0 * H0 in f9
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 4)
(plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 3)))
(match_dup 5)))
(use (const_int 1))]))
;; Step 5
;; H1 = H0 + r0 * H0 in f8
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 3)
(plus:XF (mult:XF (match_dup 4) (match_dup 3))
(match_dup 3)))
(use (const_int 1))]))
;; Step 6
;; G1 = G0 + r0 * G0 in f7
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 7)
(plus:XF (mult:XF (match_dup 4) (match_dup 7))
(match_dup 7)))
(use (const_int 1))]))
;; Step 7
;; r1 = 1/2 - G1 * H1 in f9
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 4)
(plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 3)))
(match_dup 5)))
(use (const_int 1))]))
;; Step 8
;; H2 = H1 + r1 * H1 in f8
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 3)
(plus:XF (mult:XF (match_dup 4) (match_dup 3))
(match_dup 3)))
(use (const_int 1))]))
;; Step 9
;; G2 = G1 + r1 * G1 in f7
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 7)
(plus:XF (mult:XF (match_dup 4) (match_dup 7))
(match_dup 7)))
(use (const_int 1))]))
;; Step 10
;; d2 = a - G2 * G2 in f9
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 4)
(plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 7)))
(match_dup 8)))
(use (const_int 1))]))
;; Step 11
;; G3 = G2 + d2 * H2 in f7
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 7)
(plus:XF (mult:XF (match_dup 4) (match_dup 3))
(match_dup 7)))
(use (const_int 1))]))
;; Step 12
;; d3 = a - G3 * G3 in f9
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 4)
(plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 7)))
(match_dup 8)))
(use (const_int 1))]))
;; Step 13
;; S = G3 + d3 * H2 in f7
(cond_exec (ne (match_dup 6) (const_int 0))
(parallel [(set (match_dup 0)
(float_truncate:DF
(plus:XF (mult:XF (match_dup 4) (match_dup 3))
(match_dup 7))))
(use (const_int 0))]))]
{
/* Generate 82-bit versions of the input and output operands. */
operands[7] = gen_rtx_REG (XFmode, REGNO (operands[0]));
operands[8] = gen_rtx_REG (XFmode, REGNO (operands[1]));
/* Generate required floating-point constants. */
operands[9] = CONST0_RTX (XFmode);
}
[(set_attr "predicable" "no")])
;; ::::::::::::::::::::
;; ::
@@ -3292,6 +3592,17 @@
"fma.s%4 %0 = %F1, %F2, %F3"
[(set_attr "itanium_class" "fmac")])
(define_insn "*maddxf4_alts_truncsf"
[(set (match_operand:SF 0 "fr_register_operand" "=f")
(float_truncate:SF
(plus:XF (mult:XF (match_operand:XF 1 "xfreg_or_fp01_operand" "fG")
(match_operand:XF 2 "xfreg_or_fp01_operand" "fG"))
(match_operand:XF 3 "xfreg_or_fp01_operand" "fG"))))
(use (match_operand:SI 4 "const_int_operand" ""))]
""
"fma.s.s%4 %0 = %F1, %F2, %F3"
[(set_attr "itanium_class" "fmac")])
(define_insn "*maddxf4_alts_truncdf"
[(set (match_operand:DF 0 "fr_register_operand" "=f")
(float_truncate:DF
@@ -3591,6 +3902,170 @@
"operands[6] = CONST1_RTX (XFmode);"
[(set_attr "predicable" "no")])
;; Inline square root.
(define_expand "sqrtxf2"
[(set (match_operand:XF 0 "fr_register_operand" "=&f")
(sqrt:XF (match_operand:XF 1 "fr_register_operand" "f")))]
"TARGET_INLINE_SQRT"
{
rtx insn;
if (TARGET_INLINE_SQRT_LAT)
#if 0
insn = gen_sqrtxf2_internal_lat (operands[0], operands[1]);
#else
abort ();
#endif
else
insn = gen_sqrtxf2_internal_thr (operands[0], operands[1]);
emit_insn (insn);
DONE;
})
;; Latency-optimized square root.
;; FIXME: Implement.
;; Throughput-optimized square root.
(define_insn_and_split "sqrtxf2_internal_thr"
[(set (match_operand:XF 0 "fr_register_operand" "=&f")
(sqrt:XF (match_operand:XF 1 "fr_register_operand" "f")))
;; Register r2 in optimization guide.
(clobber (match_scratch:DI 2 "=r"))
;; Register f8 in optimization guide
(clobber (match_scratch:XF 3 "=&f"))
;; Register f9 in optimization guide
(clobber (match_scratch:XF 4 "=&f"))
;; Register f10 in optimization guide
(clobber (match_scratch:XF 5 "=&f"))
;; Register f11 in optimization guide
(clobber (match_scratch:XF 6 "=&f"))
;; Register p6 in optimization guide.
(clobber (match_scratch:BI 7 "=c"))]
"TARGET_INLINE_SQRT_THR"
"#"
"&& reload_completed"
[ ;; exponent of +1/2 in r2
(set (match_dup 2) (const_int 65534))
;; +1/2 in f8. The Intel manual mistakenly specifies f10.
(set (match_dup 3)
(unspec:XF [(match_dup 2)] UNSPEC_SETF_EXP))
;; Step 1
;; y0 = 1/sqrt(a) in f7
(parallel [(set (match_dup 8)
(div:XF (const_int 1)
(sqrt:XF (match_dup 9))))
(set (match_dup 7)
(unspec:BI [(match_dup 9)]
UNSPEC_FR_SQRT_RECIP_APPROX))
(use (const_int 0))])
;; Step 2
;; H0 = 1/2 * y0 in f9
(cond_exec (ne (match_dup 7) (const_int 0))
(parallel [(set (match_dup 4)
(plus:XF (mult:XF (match_dup 3) (match_dup 8))
(match_dup 10)))
(use (const_int 1))]))
;; Step 3
;; S0 = a * y0 in f7
(cond_exec (ne (match_dup 7) (const_int 0))
(parallel [(set (match_dup 8)
(plus:XF (mult:XF (match_dup 9) (match_dup 8))
(match_dup 10)))
(use (const_int 1))]))
;; Step 4
;; d0 = 1/2 - S0 * H0 in f10
(cond_exec (ne (match_dup 7) (const_int 0))
(parallel [(set (match_dup 5)
(plus:XF (neg:XF (mult:XF (match_dup 8) (match_dup 4)))
(match_dup 3)))
(use (const_int 1))]))
;; Step 5
;; H1 = H0 + d0 * H0 in f9
(cond_exec (ne (match_dup 7) (const_int 0))
(parallel [(set (match_dup 4)
(plus:XF (mult:XF (match_dup 5) (match_dup 4))
(match_dup 4)))
(use (const_int 1))]))
;; Step 6
;; S1 = S0 + d0 * S0 in f7
(cond_exec (ne (match_dup 7) (const_int 0))
(parallel [(set (match_dup 8)
(plus:XF (mult:XF (match_dup 5) (match_dup 8))
(match_dup 8)))
(use (const_int 1))]))
;; Step 7
;; d1 = 1/2 - S1 * H1 in f10
(cond_exec (ne (match_dup 7) (const_int 0))
(parallel [(set (match_dup 5)
(plus:XF (neg:XF (mult:XF (match_dup 8) (match_dup 4)))
(match_dup 3)))
(use (const_int 1))]))
;; Step 8
;; H2 = H1 + d1 * H1 in f9
(cond_exec (ne (match_dup 7) (const_int 0))
(parallel [(set (match_dup 4)
(plus:XF (mult:XF (match_dup 5) (match_dup 4))
(match_dup 4)))
(use (const_int 1))]))
;; Step 9
;; S2 = S1 + d1 * S1 in f7
(cond_exec (ne (match_dup 7) (const_int 0))
(parallel [(set (match_dup 8)
(plus:XF (mult:XF (match_dup 5) (match_dup 8))
(match_dup 8)))
(use (const_int 1))]))
;; Step 10
;; d2 = 1/2 - S2 * H2 in f10
(cond_exec (ne (match_dup 7) (const_int 0))
(parallel [(set (match_dup 5)
(plus:XF (neg:XF (mult:XF (match_dup 8) (match_dup 4)))
(match_dup 3)))
(use (const_int 1))]))
;; Step 11
;; e2 = a - S2 * S2 in f8
(cond_exec (ne (match_dup 7) (const_int 0))
(parallel [(set (match_dup 3)
(plus:XF (neg:XF (mult:XF (match_dup 8) (match_dup 8)))
(match_dup 9)))
(use (const_int 1))]))
;; Step 12
;; S3 = S2 + e2 * H2 in f7
(cond_exec (ne (match_dup 7) (const_int 0))
(parallel [(set (match_dup 8)
(plus:XF (mult:XF (match_dup 3) (match_dup 4))
(match_dup 8)))
(use (const_int 1))]))
;; Step 13
;; H3 = H2 + d2 * H2 in f9
(cond_exec (ne (match_dup 7) (const_int 0))
(parallel [(set (match_dup 4)
(plus:XF (mult:XF (match_dup 5) (match_dup 4))
(match_dup 4)))
(use (const_int 1))]))
;; Step 14
;; e3 = a - S3 * S3 in f8
(cond_exec (ne (match_dup 7) (const_int 0))
(parallel [(set (match_dup 3)
(plus:XF (neg:XF (mult:XF (match_dup 8) (match_dup 8)))
(match_dup 9)))
(use (const_int 1))]))
;; Step 15
;; S = S3 + e3 * H3 in f7
(cond_exec (ne (match_dup 7) (const_int 0))
(parallel [(set (match_dup 0)
(plus:XF (mult:XF (match_dup 3) (match_dup 4))
(match_dup 8)))
(use (const_int 0))]))]
{
/* Generate 82-bit versions of the input and output operands. */
operands[8] = gen_rtx_REG (XFmode, REGNO (operands[0]));
operands[9] = gen_rtx_REG (XFmode, REGNO (operands[1]));
/* Generate required floating-point constants. */
operands[10] = CONST0_RTX (XFmode);
}
[(set_attr "predicable" "no")])
;; ??? frcpa works like cmp.foo.unc.
(define_insn "*recip_approx"