diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 7519bc34950..585e25b6aef 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,29 @@
+2003-10-29  Zack Weinberg
+
+	* ia64.md (UNSPEC_SETF_EXP, UNSPEC_FR_SQRT_RECIP_APPROX): New constants.
+	(*sqrt_approx): New instruction pattern for approximate square roots.
+	(*setf_exp_xf): New pattern to set the exponent field of an FR register.
+	(*maddxf4_alts_truncsf): New pattern for multiply-add with SF truncation.
+	(sqrtsf2_internal_thr): New define_insn_and_split implementing
+	throughput-optimized inline calculation of SFmode square root.
+	(sqrtdf2_internal_thr): Likewise for DFmode.
+	(sqrtxf2_internal_thr): Likewise for XFmode.
+	(sqrtsf2, sqrtdf2, sqrtxf2): New expanders to choose between
+	latency- and throughput-optimized square root algorithms.
+	* ia64.h (MASK_INLINE_SQRT_LAT, MASK_INLINE_SQRT_THR,
+	TARGET_INLINE_SQRT_LAT, TARGET_INLINE_SQRT_THR, TARGET_INLINE_SQRT):
+	New macros.
+	(TARGET_SWITCHES): Add -minline-sqrt-min-latency and
+	-minline-sqrt-max-throughput.
+	* ia64.c (ia64_override_options): If both -minline-sqrt-min-latency
+	and -minline-sqrt-max-throughput are given, warn that they cannot
+	be used together and disable -minline-sqrt-max-throughput.
+	If -minline-sqrt-min-latency is given, warn that it is not yet
+	implemented and disable it.
+	(rtx_needs_barrier): Reformat initial comment to obey
+	72-character width limit.  Support UNSPEC_SETF_EXP and
+	UNSPEC_FR_SQRT_RECIP_APPROX.
+
 2003-10-29  Alan Modra
 
 	* config/rs6000/rs6000.md (movdf_softfloat64): Allow dummy ctr,ctr
@@ -12,7 +38,7 @@
 2003-10-28  Richard Earnshaw
 
 	* arm.c (arm_output_epilogue): When using a frame pointer, don't emit
-	an extra stack adjustment insn if the stack pointer is already 
+	an extra stack adjustment insn if the stack pointer is already
 	pointing at the right place.
 	(use_return_insn): Allow a return insn to be used when we have a
 	frame pointer if the stack pointer is in the right place.
diff --git a/gcc/config/ia64/ia64.c b/gcc/config/ia64/ia64.c
index 7b0069d7fb0..a25c4c5a83d 100644
--- a/gcc/config/ia64/ia64.c
+++ b/gcc/config/ia64/ia64.c
@@ -4487,6 +4487,18 @@ ia64_override_options (void)
       target_flags &= ~MASK_INLINE_INT_DIV_THR;
     }
 
+  if (TARGET_INLINE_SQRT_LAT && TARGET_INLINE_SQRT_THR)
+    {
+      warning ("cannot optimize square root for both latency and throughput");
+      target_flags &= ~MASK_INLINE_SQRT_THR;
+    }
+
+  if (TARGET_INLINE_SQRT_LAT)
+    {
+      warning ("not yet implemented: latency-optimized inline square root");
+      target_flags &= ~MASK_INLINE_SQRT_LAT;
+    }
+
   if (ia64_fixed_range_string)
     fix_range (ia64_fixed_range_string);
 
@@ -4896,9 +4908,9 @@ set_src_needs_barrier (rtx x, struct reg_flags flags, int pred, rtx cond)
   return need_barrier;
 }
 
-/* Handle an access to rtx X of type FLAGS using predicate register PRED.
-   Return 1 is this access creates a dependency with an earlier instruction
-   in the same group.  */
+/* Handle an access to rtx X of type FLAGS using predicate register
+   PRED.  Return 1 if this access creates a dependency with an earlier
+   instruction in the same group.  */
 
 static int
 rtx_needs_barrier (rtx x, struct reg_flags flags, int pred)
@@ -5124,7 +5136,9 @@ rtx_needs_barrier (rtx x, struct reg_flags flags, int pred)
 	case UNSPEC_FR_SPILL:
 	case UNSPEC_FR_RESTORE:
 	case UNSPEC_GETF_EXP:
+	case UNSPEC_SETF_EXP:
 	case UNSPEC_ADDP4:
+	case UNSPEC_FR_SQRT_RECIP_APPROX:
 	  need_barrier = rtx_needs_barrier (XVECEXP (x, 0, 0), flags, pred);
 	  break;
 
diff --git a/gcc/config/ia64/ia64.h b/gcc/config/ia64/ia64.h
index 8ccda53bd9b..44ef6c6e1e9 100644
--- a/gcc/config/ia64/ia64.h
+++ b/gcc/config/ia64/ia64.h
@@ -87,6 +87,10 @@ extern int target_flags;
 
 #define MASK_INLINE_INT_DIV_THR 0x00001000 /* inline div, max throughput.  */
 
+#define MASK_INLINE_SQRT_LAT 0x00004000 /* inline sqrt, min latency.  */
+
+#define MASK_INLINE_SQRT_THR 0x00008000 /* inline sqrt, max throughput.  */
+
 #define MASK_DWARF2_ASM 0x40000000 /* test dwarf2 line info via gas.  */
 
 #define MASK_EARLY_STOP_BITS 0x00002000 /* tune stop bits for the model.  */
@@ -127,6 +131,13 @@ extern int target_flags;
 #define TARGET_INLINE_INT_DIV \
   (target_flags & (MASK_INLINE_INT_DIV_LAT | MASK_INLINE_INT_DIV_THR))
 
+#define TARGET_INLINE_SQRT_LAT (target_flags & MASK_INLINE_SQRT_LAT)
+
+#define TARGET_INLINE_SQRT_THR (target_flags & MASK_INLINE_SQRT_THR)
+
+#define TARGET_INLINE_SQRT \
+  (target_flags & (MASK_INLINE_SQRT_LAT | MASK_INLINE_SQRT_THR))
+
 #define TARGET_DWARF2_ASM (target_flags & MASK_DWARF2_ASM)
 
 extern int ia64_tls_size;
@@ -186,6 +197,10 @@ extern int ia64_tls_size;
      N_("Generate inline integer division, optimize for latency") },	\
  { "inline-int-divide-max-throughput", MASK_INLINE_INT_DIV_THR,	\
      N_("Generate inline integer division, optimize for throughput") },\
+ { "inline-sqrt-min-latency", MASK_INLINE_SQRT_LAT,			\
+     N_("Generate inline square root, optimize for latency") },	\
+ { "inline-sqrt-max-throughput", MASK_INLINE_SQRT_THR,			\
+     N_("Generate inline square root, optimize for throughput") },	\
  { "dwarf2-asm", MASK_DWARF2_ASM,					\
      N_("Enable Dwarf 2 line debug info via GNU as")},			\
  { "no-dwarf2-asm", -MASK_DWARF2_ASM,					\
diff --git a/gcc/config/ia64/ia64.md b/gcc/config/ia64/ia64.md
index 547b7979862..ec66fd5a77d 100644
--- a/gcc/config/ia64/ia64.md
+++ b/gcc/config/ia64/ia64.md
@@ -74,6 +74,8 @@
    (UNSPEC_ADDP4		24)
    (UNSPEC_PROLOGUE_USE		25)
    (UNSPEC_RET_ADDR		26)
+   (UNSPEC_SETF_EXP		27)
+   (UNSPEC_FR_SQRT_RECIP_APPROX	28)
   ])
 
 (define_constants
@@ -2757,6 +2759,155 @@
 	operands[10] = CONST1_RTX (XFmode);
 }
   [(set_attr "predicable" "no")])
+
+;; Inline square root.
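+;;
+;; These patterns use frsqrta to produce an initial approximation
+;; y0 = 1/sqrt(a) together with a predicate.  The predicate is clear
+;; when frsqrta already delivers the final result (special operands
+;; such as zeros, infinities and NaNs) and set when the approximation
+;; needs refinement; the split sequences below then refine y0 with
+;; predicated fused multiply-adds until the square root is correct
+;; for the target mode.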
+ +(define_insn "*sqrt_approx" + [(set (match_operand:XF 0 "fr_register_operand" "=f") + (div:XF (const_int 1) + (sqrt:XF (match_operand:XF 2 "fr_register_operand" "f")))) + (set (match_operand:BI 1 "register_operand" "=c") + (unspec:BI [(match_dup 2)] UNSPEC_FR_SQRT_RECIP_APPROX)) + (use (match_operand:SI 3 "const_int_operand" "")) ] + "" + "frsqrta.s%3 %0, %1 = %2" + [(set_attr "itanium_class" "fmisc") + (set_attr "predicable" "no")]) + +(define_insn "*setf_exp_xf" + [(set (match_operand:XF 0 "fr_register_operand" "=f") + (unspec:XF [(match_operand:DI 1 "register_operand" "r")] + UNSPEC_SETF_EXP))] + "" + "setf.exp %0 = %1" + [(set_attr "itanium_class" "frfr")]) + +(define_expand "sqrtsf2" + [(set (match_operand:SF 0 "fr_register_operand" "=&f") + (sqrt:SF (match_operand:SF 1 "fr_register_operand" "f")))] + "TARGET_INLINE_SQRT" +{ + rtx insn; + if (TARGET_INLINE_SQRT_LAT) +#if 0 + insn = gen_sqrtsf2_internal_lat (operands[0], operands[1]); +#else + abort (); +#endif + else + insn = gen_sqrtsf2_internal_thr (operands[0], operands[1]); + emit_insn (insn); + DONE; +}) + +;; Latency-optimized square root. +;; FIXME: Implement. + +;; Throughput-optimized square root. + +(define_insn_and_split "sqrtsf2_internal_thr" + [(set (match_operand:SF 0 "fr_register_operand" "=&f") + (sqrt:SF (match_operand:SF 1 "fr_register_operand" "f"))) + ;; Register r2 in optimization guide. + (clobber (match_scratch:DI 2 "=r")) + ;; Register f8 in optimization guide + (clobber (match_scratch:XF 3 "=&f")) + ;; Register f9 in optimization guide + (clobber (match_scratch:XF 4 "=&f")) + ;; Register f10 in optimization guide + (clobber (match_scratch:XF 5 "=&f")) + ;; Register p6 in optimization guide. + (clobber (match_scratch:BI 6 "=c"))] + "TARGET_INLINE_SQRT_THR" + "#" + "&& reload_completed" + [ ;; exponent of +1/2 in r2 + (set (match_dup 2) (const_int 65534)) + ;; +1/2 in f8 + (set (match_dup 3) + (unspec:XF [(match_dup 2)] UNSPEC_SETF_EXP)) + ;; Step 1 + ;; y0 = 1/sqrt(a) in f7 + (parallel [(set (match_dup 7) + (div:XF (const_int 1) + (sqrt:XF (match_dup 8)))) + (set (match_dup 6) + (unspec:BI [(match_dup 8)] + UNSPEC_FR_SQRT_RECIP_APPROX)) + (use (const_int 0))]) + ;; Step 2 + ;; H0 = 1/2 * y0 in f9 + (cond_exec (ne (match_dup 6) (const_int 0)) + (parallel [(set (match_dup 4) + (plus:XF (mult:XF (match_dup 3) (match_dup 7)) + (match_dup 9))) + (use (const_int 1))])) + ;; Step 3 + ;; S0 = a * y0 in f7 + (cond_exec (ne (match_dup 6) (const_int 0)) + (parallel [(set (match_dup 7) + (plus:XF (mult:XF (match_dup 8) (match_dup 7)) + (match_dup 9))) + (use (const_int 1))])) + ;; Step 4 + ;; d = 1/2 - S0 * H0 in f10 + (cond_exec (ne (match_dup 6) (const_int 0)) + (parallel [(set (match_dup 5) + (plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 4))) + (match_dup 3))) + (use (const_int 1))])) + ;; Step 5 + ;; d' = d + 1/2 * d in f8 + (cond_exec (ne (match_dup 6) (const_int 0)) + (parallel [(set (match_dup 3) + (plus:XF (mult:XF (match_dup 3) (match_dup 5)) + (match_dup 5))) + (use (const_int 1))])) + ;; Step 6 + ;; e = d + d * d' in f8 + (cond_exec (ne (match_dup 6) (const_int 0)) + (parallel [(set (match_dup 3) + (plus:XF (mult:XF (match_dup 5) (match_dup 3)) + (match_dup 5))) + (use (const_int 1))])) + ;; Step 7 + ;; S1 = S0 + e * S0 in f7 + (cond_exec (ne (match_dup 6) (const_int 0)) + (parallel [(set (match_dup 0) + (float_truncate:SF + (plus:XF (mult:XF (match_dup 3) (match_dup 7)) + (match_dup 7)))) + (use (const_int 1))])) + ;; Step 8 + ;; H1 = H0 + e * H0 in f8 + (cond_exec (ne (match_dup 6) (const_int 0)) + 
(parallel [(set (match_dup 3) + (plus:XF (mult:XF (match_dup 3) (match_dup 4)) + (match_dup 4))) + (use (const_int 1))])) + ;; Step 9 + ;; d1 = a - S1 * S1 in f9 + (cond_exec (ne (match_dup 6) (const_int 0)) + (parallel [(set (match_dup 4) + (plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 7))) + (match_dup 8))) + (use (const_int 1))])) + ;; Step 10 + ;; S = S1 + d1 * H1 in f7 + (cond_exec (ne (match_dup 6) (const_int 0)) + (parallel [(set (match_dup 0) + (float_truncate:SF + (plus:XF (mult:XF (match_dup 4) (match_dup 3)) + (match_dup 7)))) + (use (const_int 0))]))] +{ + /* Generate 82-bit versions of the input and output operands. */ + operands[7] = gen_rtx_REG (XFmode, REGNO (operands[0])); + operands[8] = gen_rtx_REG (XFmode, REGNO (operands[1])); + /* Generate required floating-point constants. */ + operands[9] = CONST0_RTX (XFmode); +} + [(set_attr "predicable" "no")]) ;; :::::::::::::::::::: ;; :: @@ -3102,6 +3253,155 @@ operands[10] = CONST1_RTX (XFmode); } [(set_attr "predicable" "no")]) + +;; Inline square root. + +(define_expand "sqrtdf2" + [(set (match_operand:DF 0 "fr_register_operand" "=&f") + (sqrt:DF (match_operand:DF 1 "fr_register_operand" "f")))] + "TARGET_INLINE_SQRT" +{ + rtx insn; + if (TARGET_INLINE_SQRT_LAT) +#if 0 + insn = gen_sqrtdf2_internal_lat (operands[0], operands[1]); +#else + abort (); +#endif + else + insn = gen_sqrtdf2_internal_thr (operands[0], operands[1]); + emit_insn (insn); + DONE; +}) + +;; Latency-optimized square root. +;; FIXME: Implement. + +;; Throughput-optimized square root. + +(define_insn_and_split "sqrtdf2_internal_thr" + [(set (match_operand:DF 0 "fr_register_operand" "=&f") + (sqrt:DF (match_operand:DF 1 "fr_register_operand" "f"))) + ;; Register r2 in optimization guide. + (clobber (match_scratch:DI 2 "=r")) + ;; Register f8 in optimization guide + (clobber (match_scratch:XF 3 "=&f")) + ;; Register f9 in optimization guide + (clobber (match_scratch:XF 4 "=&f")) + ;; Register f10 in optimization guide + (clobber (match_scratch:XF 5 "=&f")) + ;; Register p6 in optimization guide. 
+ (clobber (match_scratch:BI 6 "=c"))] + "TARGET_INLINE_SQRT_THR" + "#" + "&& reload_completed" + [ ;; exponent of +1/2 in r2 + (set (match_dup 2) (const_int 65534)) + ;; +1/2 in f10 + (set (match_dup 5) + (unspec:XF [(match_dup 2)] UNSPEC_SETF_EXP)) + ;; Step 1 + ;; y0 = 1/sqrt(a) in f7 + (parallel [(set (match_dup 7) + (div:XF (const_int 1) + (sqrt:XF (match_dup 8)))) + (set (match_dup 6) + (unspec:BI [(match_dup 8)] + UNSPEC_FR_SQRT_RECIP_APPROX)) + (use (const_int 0))]) + ;; Step 2 + ;; H0 = 1/2 * y0 in f8 + (cond_exec (ne (match_dup 6) (const_int 0)) + (parallel [(set (match_dup 3) + (plus:XF (mult:XF (match_dup 5) (match_dup 7)) + (match_dup 9))) + (use (const_int 1))])) + ;; Step 3 + ;; G0 = a * y0 in f7 + (cond_exec (ne (match_dup 6) (const_int 0)) + (parallel [(set (match_dup 7) + (plus:XF (mult:XF (match_dup 8) (match_dup 7)) + (match_dup 9))) + (use (const_int 1))])) + ;; Step 4 + ;; r0 = 1/2 - G0 * H0 in f9 + (cond_exec (ne (match_dup 6) (const_int 0)) + (parallel [(set (match_dup 4) + (plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 3))) + (match_dup 5))) + (use (const_int 1))])) + ;; Step 5 + ;; H1 = H0 + r0 * H0 in f8 + (cond_exec (ne (match_dup 6) (const_int 0)) + (parallel [(set (match_dup 3) + (plus:XF (mult:XF (match_dup 4) (match_dup 3)) + (match_dup 3))) + (use (const_int 1))])) + ;; Step 6 + ;; G1 = G0 + r0 * G0 in f7 + (cond_exec (ne (match_dup 6) (const_int 0)) + (parallel [(set (match_dup 7) + (plus:XF (mult:XF (match_dup 4) (match_dup 7)) + (match_dup 7))) + (use (const_int 1))])) + ;; Step 7 + ;; r1 = 1/2 - G1 * H1 in f9 + (cond_exec (ne (match_dup 6) (const_int 0)) + (parallel [(set (match_dup 4) + (plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 3))) + (match_dup 5))) + (use (const_int 1))])) + ;; Step 8 + ;; H2 = H1 + r1 * H1 in f8 + (cond_exec (ne (match_dup 6) (const_int 0)) + (parallel [(set (match_dup 3) + (plus:XF (mult:XF (match_dup 4) (match_dup 3)) + (match_dup 3))) + (use (const_int 1))])) + ;; Step 9 + ;; G2 = G1 + r1 * G1 in f7 + (cond_exec (ne (match_dup 6) (const_int 0)) + (parallel [(set (match_dup 7) + (plus:XF (mult:XF (match_dup 4) (match_dup 7)) + (match_dup 7))) + (use (const_int 1))])) + ;; Step 10 + ;; d2 = a - G2 * G2 in f9 + (cond_exec (ne (match_dup 6) (const_int 0)) + (parallel [(set (match_dup 4) + (plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 7))) + (match_dup 8))) + (use (const_int 1))])) + ;; Step 11 + ;; G3 = G2 + d2 * H2 in f7 + (cond_exec (ne (match_dup 6) (const_int 0)) + (parallel [(set (match_dup 7) + (plus:XF (mult:XF (match_dup 4) (match_dup 3)) + (match_dup 7))) + (use (const_int 1))])) + ;; Step 12 + ;; d3 = a - G3 * G3 in f9 + (cond_exec (ne (match_dup 6) (const_int 0)) + (parallel [(set (match_dup 4) + (plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 7))) + (match_dup 8))) + (use (const_int 1))])) + ;; Step 13 + ;; S = G3 + d3 * H2 in f7 + (cond_exec (ne (match_dup 6) (const_int 0)) + (parallel [(set (match_dup 0) + (float_truncate:DF + (plus:XF (mult:XF (match_dup 4) (match_dup 3)) + (match_dup 7)))) + (use (const_int 0))]))] +{ + /* Generate 82-bit versions of the input and output operands. */ + operands[7] = gen_rtx_REG (XFmode, REGNO (operands[0])); + operands[8] = gen_rtx_REG (XFmode, REGNO (operands[1])); + /* Generate required floating-point constants. 
*/ + operands[9] = CONST0_RTX (XFmode); +} + [(set_attr "predicable" "no")]) ;; :::::::::::::::::::: ;; :: @@ -3292,6 +3592,17 @@ "fma.s%4 %0 = %F1, %F2, %F3" [(set_attr "itanium_class" "fmac")]) +(define_insn "*maddxf4_alts_truncsf" + [(set (match_operand:SF 0 "fr_register_operand" "=f") + (float_truncate:SF + (plus:XF (mult:XF (match_operand:XF 1 "xfreg_or_fp01_operand" "fG") + (match_operand:XF 2 "xfreg_or_fp01_operand" "fG")) + (match_operand:XF 3 "xfreg_or_fp01_operand" "fG")))) + (use (match_operand:SI 4 "const_int_operand" ""))] + "" + "fma.s.s%4 %0 = %F1, %F2, %F3" + [(set_attr "itanium_class" "fmac")]) + (define_insn "*maddxf4_alts_truncdf" [(set (match_operand:DF 0 "fr_register_operand" "=f") (float_truncate:DF @@ -3591,6 +3902,170 @@ "operands[6] = CONST1_RTX (XFmode);" [(set_attr "predicable" "no")]) +;; Inline square root. + +(define_expand "sqrtxf2" + [(set (match_operand:XF 0 "fr_register_operand" "=&f") + (sqrt:XF (match_operand:XF 1 "fr_register_operand" "f")))] + "TARGET_INLINE_SQRT" +{ + rtx insn; + if (TARGET_INLINE_SQRT_LAT) +#if 0 + insn = gen_sqrtxf2_internal_lat (operands[0], operands[1]); +#else + abort (); +#endif + else + insn = gen_sqrtxf2_internal_thr (operands[0], operands[1]); + emit_insn (insn); + DONE; +}) + +;; Latency-optimized square root. +;; FIXME: Implement. + +;; Throughput-optimized square root. + +(define_insn_and_split "sqrtxf2_internal_thr" + [(set (match_operand:XF 0 "fr_register_operand" "=&f") + (sqrt:XF (match_operand:XF 1 "fr_register_operand" "f"))) + ;; Register r2 in optimization guide. + (clobber (match_scratch:DI 2 "=r")) + ;; Register f8 in optimization guide + (clobber (match_scratch:XF 3 "=&f")) + ;; Register f9 in optimization guide + (clobber (match_scratch:XF 4 "=&f")) + ;; Register f10 in optimization guide + (clobber (match_scratch:XF 5 "=&f")) + ;; Register f11 in optimization guide + (clobber (match_scratch:XF 6 "=&f")) + ;; Register p6 in optimization guide. + (clobber (match_scratch:BI 7 "=c"))] + "TARGET_INLINE_SQRT_THR" + "#" + "&& reload_completed" + [ ;; exponent of +1/2 in r2 + (set (match_dup 2) (const_int 65534)) + ;; +1/2 in f8. The Intel manual mistakenly specifies f10. 
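+   ;; (A biased exponent of 65534 denotes 1.0 * 2^(65534 - 65535),
+   ;; i.e. 1/2: setf.exp sets the significand to 1.0, and the register
+   ;; file format biases exponents by 65535.)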
+ (set (match_dup 3) + (unspec:XF [(match_dup 2)] UNSPEC_SETF_EXP)) + ;; Step 1 + ;; y0 = 1/sqrt(a) in f7 + (parallel [(set (match_dup 8) + (div:XF (const_int 1) + (sqrt:XF (match_dup 9)))) + (set (match_dup 7) + (unspec:BI [(match_dup 9)] + UNSPEC_FR_SQRT_RECIP_APPROX)) + (use (const_int 0))]) + ;; Step 2 + ;; H0 = 1/2 * y0 in f9 + (cond_exec (ne (match_dup 7) (const_int 0)) + (parallel [(set (match_dup 4) + (plus:XF (mult:XF (match_dup 3) (match_dup 8)) + (match_dup 10))) + (use (const_int 1))])) + ;; Step 3 + ;; S0 = a * y0 in f7 + (cond_exec (ne (match_dup 7) (const_int 0)) + (parallel [(set (match_dup 8) + (plus:XF (mult:XF (match_dup 9) (match_dup 8)) + (match_dup 10))) + (use (const_int 1))])) + ;; Step 4 + ;; d0 = 1/2 - S0 * H0 in f10 + (cond_exec (ne (match_dup 7) (const_int 0)) + (parallel [(set (match_dup 5) + (plus:XF (neg:XF (mult:XF (match_dup 8) (match_dup 4))) + (match_dup 3))) + (use (const_int 1))])) + ;; Step 5 + ;; H1 = H0 + d0 * H0 in f9 + (cond_exec (ne (match_dup 7) (const_int 0)) + (parallel [(set (match_dup 4) + (plus:XF (mult:XF (match_dup 5) (match_dup 4)) + (match_dup 4))) + (use (const_int 1))])) + ;; Step 6 + ;; S1 = S0 + d0 * S0 in f7 + (cond_exec (ne (match_dup 7) (const_int 0)) + (parallel [(set (match_dup 8) + (plus:XF (mult:XF (match_dup 5) (match_dup 8)) + (match_dup 8))) + (use (const_int 1))])) + ;; Step 7 + ;; d1 = 1/2 - S1 * H1 in f10 + (cond_exec (ne (match_dup 7) (const_int 0)) + (parallel [(set (match_dup 5) + (plus:XF (neg:XF (mult:XF (match_dup 8) (match_dup 4))) + (match_dup 3))) + (use (const_int 1))])) + ;; Step 8 + ;; H2 = H1 + d1 * H1 in f9 + (cond_exec (ne (match_dup 7) (const_int 0)) + (parallel [(set (match_dup 4) + (plus:XF (mult:XF (match_dup 5) (match_dup 4)) + (match_dup 4))) + (use (const_int 1))])) + ;; Step 9 + ;; S2 = S1 + d1 * S1 in f7 + (cond_exec (ne (match_dup 7) (const_int 0)) + (parallel [(set (match_dup 8) + (plus:XF (mult:XF (match_dup 5) (match_dup 8)) + (match_dup 8))) + (use (const_int 1))])) + ;; Step 10 + ;; d2 = 1/2 - S2 * H2 in f10 + (cond_exec (ne (match_dup 7) (const_int 0)) + (parallel [(set (match_dup 5) + (plus:XF (neg:XF (mult:XF (match_dup 8) (match_dup 4))) + (match_dup 3))) + (use (const_int 1))])) + ;; Step 11 + ;; e2 = a - S2 * S2 in f8 + (cond_exec (ne (match_dup 7) (const_int 0)) + (parallel [(set (match_dup 3) + (plus:XF (neg:XF (mult:XF (match_dup 8) (match_dup 8))) + (match_dup 9))) + (use (const_int 1))])) + ;; Step 12 + ;; S3 = S2 + e2 * H2 in f7 + (cond_exec (ne (match_dup 7) (const_int 0)) + (parallel [(set (match_dup 8) + (plus:XF (mult:XF (match_dup 3) (match_dup 4)) + (match_dup 8))) + (use (const_int 1))])) + ;; Step 13 + ;; H3 = H2 + d2 * H2 in f9 + (cond_exec (ne (match_dup 7) (const_int 0)) + (parallel [(set (match_dup 4) + (plus:XF (mult:XF (match_dup 5) (match_dup 4)) + (match_dup 4))) + (use (const_int 1))])) + ;; Step 14 + ;; e3 = a - S3 * S3 in f8 + (cond_exec (ne (match_dup 7) (const_int 0)) + (parallel [(set (match_dup 3) + (plus:XF (neg:XF (mult:XF (match_dup 8) (match_dup 8))) + (match_dup 9))) + (use (const_int 1))])) + ;; Step 15 + ;; S = S3 + e3 * H3 in f7 + (cond_exec (ne (match_dup 7) (const_int 0)) + (parallel [(set (match_dup 0) + (plus:XF (mult:XF (match_dup 3) (match_dup 4)) + (match_dup 8))) + (use (const_int 0))]))] +{ + /* Generate 82-bit versions of the input and output operands. */ + operands[8] = gen_rtx_REG (XFmode, REGNO (operands[0])); + operands[9] = gen_rtx_REG (XFmode, REGNO (operands[1])); + /* Generate required floating-point constants. 
*/ + operands[10] = CONST0_RTX (XFmode); +} + [(set_attr "predicable" "no")]) + ;; ??? frcpa works like cmp.foo.unc. (define_insn "*recip_approx"
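
For reference, the ten-step SFmode sequence emitted by
sqrtsf2_internal_thr can be transcribed into plain C.  This is an
illustrative sketch only, not part of the patch: long double stands in
for the 82-bit register file format, fmal for the fma steps, 1/sqrtl
for the frsqrta approximation, and the qualifying predicate that lets
frsqrta handle special operands (zeros, infinities, NaNs) by itself is
omitted.  The DFmode and XFmode sequences refine in the same way, with
more iterations.

    #include <math.h>

    /* Each fmal mirrors one fma step of the split sequence; the casts
       to float mirror the single-precision results of fma.s.s (step 7)
       and fma.s (step 10).  */
    float
    sqrtf_thr_sketch (float a_sf)
    {
      long double a = a_sf;      /* 82-bit view of the input.   */
      long double half = 0.5L;   /* setf.exp of exponent 65534. */
      long double y0, H0, S0, d, dp, e, H1, d1;
      float S1;

      y0 = 1.0L / sqrtl (a);            /* Step 1: frsqrta.           */
      H0 = fmal (half, y0, 0.0L);       /* Step 2: H0 = 1/2 * y0.     */
      S0 = fmal (a, y0, 0.0L);          /* Step 3: S0 = a * y0.       */
      d  = fmal (-S0, H0, half);        /* Step 4: d = 1/2 - S0 * H0. */
      dp = fmal (half, d, d);           /* Step 5: d' = d + 1/2 * d.  */
      e  = fmal (d, dp, d);             /* Step 6: e = d + d * d'.    */
      S1 = (float) fmal (e, S0, S0);    /* Step 7: S1 = S0 + e * S0.  */
      H1 = fmal (e, H0, H0);            /* Step 8: H1 = H0 + e * H0.  */
      d1 = fmal (- (long double) S1, S1, a);  /* Step 9: d1 = a - S1 * S1.  */
      return (float) fmal (d1, H1, S1);       /* Step 10: S = S1 + d1 * H1. */
    }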