ia64.md (UNSPEC_SETF_EXP,UNSPEC_FR_SQRT_RECIP_APPROX): New constants.

* ia64.md (UNSPEC_SETF_EXP,UNSPEC_FR_SQRT_RECIP_APPROX): New constants. (*sqrt_approx): New pattern for reciprocal square root approximation. (*setf_exp_xf): New instruction pattern for setf.exp (set exponent). (*maddxf4_alts_truncsf): New multiply-add pattern truncating to SFmode. (sqrtsf2_internal_thr): New define_insn_and_split implementing throughput-optimized inline calculation of SFmode square root. (sqrtdf2_internal_thr): Likewise for DFmode. (sqrtxf2_internal_thr): Likewise for XFmode. (sqrtsf2, sqrtdf2, sqrtxf2): New expanders to choose between latency- and throughput-optimized square root algorithms. * ia64.h (MASK_INLINE_SQRT_LAT, MASK_INLINE_SQRT_THR, TARGET_INLINE_SQRT_LAT, TARGET_INLINE_SQRT_THR, TARGET_INLINE_SQRT): New macros. (TARGET_SWITCHES): Add -minline-sqrt-min-latency and -minline-sqrt-max-throughput. * ia64.c (ia64_override_options): If both -minline-sqrt-min-latency and -minline-sqrt-max-throughput are given, notify the user that both options cannot be used simultaneously. If -minline-sqrt-min-latency is given, notify the user that this mode is not yet implemented. (rtx_needs_barrier): Reformat initial comment to obey 72-character width limit. Support UNSPEC_SETF_EXP and UNSPEC_FR_SQRT_RECIP_APPROX. From-SVN: r73027
2003-10-29 00:55:43 +00:00 · 2003-10-29 00:55:43 +00:00 · b38ba46301
parent 1e8fee4a42
commit b38ba46301
4 changed files with 534 additions and 4 deletions
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@ -1,3 +1,29 @@
+2003-10-28  Zack Weinberg  <zack@codesourcery.com>
+
+	* ia64.md (UNSPEC_SETF_EXP,UNSPEC_FR_SQRT_RECIP_APPROX): New constants.
+	(*sqrt_approx): New pattern for reciprocal square root approximation.
+	(*setf_exp_xf): New instruction pattern for setf.exp (set exponent).
+	(*maddxf4_alts_truncsf): New multiply-add pattern truncating to SFmode.
+	(sqrtsf2_internal_thr): New define_insn_and_split implementing
+	throughput-optimized inline calculation of SFmode square root.
+	(sqrtdf2_internal_thr): Likewise for DFmode.
+	(sqrtxf2_internal_thr): Likewise for XFmode.
+	(sqrtsf2, sqrtdf2, sqrtxf2): New expanders to choose between
+	latency- and throughput-optimized square root algorithms.
+	* ia64.h (MASK_INLINE_SQRT_LAT, MASK_INLINE_SQRT_THR,
+	TARGET_INLINE_SQRT_LAT, TARGET_INLINE_SQRT_THR, TARGET_INLINE_SQRT):
+	New macros.
+	(TARGET_SWITCHES): Add -minline-sqrt-min-latency and
+	-minline-sqrt-max-throughput.
+	* ia64.c (ia64_override_options): If both -minline-sqrt-min-latency
+	and -minline-sqrt-max-throughput are given, notify the user
+	that both options cannot be used simultaneously.
+	If -minline-sqrt-min-latency is given, notify the user that
+	this mode is not yet implemented.
+	(rtx_needs_barrier): Reformat initial comment to obey
+	72-character width limit.  Support UNSPEC_SETF_EXP and
+	UNSPEC_FR_SQRT_RECIP_APPROX.
+
 2003-10-29  Alan Modra  <amodra@bigpond.net.au>

 	* config/rs6000/rs6000.md (movdf_softfloat64): Allow dummy ctr,ctr
@ -12,7 +38,7 @@
 2003-10-28  Richard Earnshaw  <rearnsha@arm.com>

 	* arm.c (arm_output_epilogue): When using a frame pointer, don't emit
-	an extra stack adjustment insn if the stack pointer is already 
+	an extra stack adjustment insn if the stack pointer is already
 	pointing at the right place.
 	(use_return_insn): Allow a return insn to be used when we have a
 	frame pointer if the stack pointer is in the right place.
--- a/gcc/config/ia64/ia64.c
+++ b/gcc/config/ia64/ia64.c
@ -4487,6 +4487,18 @@ ia64_override_options (void)
      target_flags &= ~MASK_INLINE_INT_DIV_THR;
    }

+  if (TARGET_INLINE_SQRT_LAT && TARGET_INLINE_SQRT_THR)
+    {
+      warning ("cannot optimize square root for both latency and throughput");
+      target_flags &= ~MASK_INLINE_SQRT_THR;
+    }
+
+  if (TARGET_INLINE_SQRT_LAT)
+    {
+      warning ("not yet implemented: latency-optimized inline square root");
+      target_flags &= ~MASK_INLINE_SQRT_LAT;
+    }
+
  if (ia64_fixed_range_string)
    fix_range (ia64_fixed_range_string);

@ -4896,9 +4908,9 @@ set_src_needs_barrier (rtx x, struct reg_flags flags, int pred, rtx cond)
  return need_barrier;
 }

-/* Handle an access to rtx X of type FLAGS using predicate register PRED.
-   Return 1 is this access creates a dependency with an earlier instruction
-   in the same group.  */
+/* Handle an access to rtx X of type FLAGS using predicate register
+   PRED.  Return 1 if this access creates a dependency with an earlier
+   instruction in the same group.  */

 static int
 rtx_needs_barrier (rtx x, struct reg_flags flags, int pred)
@ -5124,7 +5136,9 @@ rtx_needs_barrier (rtx x, struct reg_flags flags, int pred)
 	case UNSPEC_FR_SPILL:
 	case UNSPEC_FR_RESTORE:
 	case UNSPEC_GETF_EXP:
+	case UNSPEC_SETF_EXP:
        case UNSPEC_ADDP4:
+	case UNSPEC_FR_SQRT_RECIP_APPROX:
 	  need_barrier = rtx_needs_barrier (XVECEXP (x, 0, 0), flags, pred);
 	  break;

--- a/gcc/config/ia64/ia64.h
+++ b/gcc/config/ia64/ia64.h
@ -87,6 +87,10 @@ extern int target_flags;

 #define MASK_INLINE_INT_DIV_THR   0x00001000 /* inline div, max throughput.  */

+#define MASK_INLINE_SQRT_LAT      0x00002000 /* inline sqrt, min latency.  */
+/* NOTE(review): MASK_INLINE_SQRT_LAT equals MASK_EARLY_STOP_BITS (0x2000).  */
+#define MASK_INLINE_SQRT_THR      0x00004000 /* inline sqrt, max throughput. */
+
 #define MASK_DWARF2_ASM 0x40000000	/* test dwarf2 line info via gas.  */

 #define MASK_EARLY_STOP_BITS 0x00002000 /* tune stop bits for the model.  */
@ -127,6 +131,13 @@ extern int target_flags;
 #define TARGET_INLINE_INT_DIV \
  (target_flags & (MASK_INLINE_INT_DIV_LAT | MASK_INLINE_INT_DIV_THR))

+#define TARGET_INLINE_SQRT_LAT (target_flags & MASK_INLINE_SQRT_LAT)
+
+#define TARGET_INLINE_SQRT_THR (target_flags & MASK_INLINE_SQRT_THR)
+
+#define TARGET_INLINE_SQRT \
+  (target_flags & (MASK_INLINE_SQRT_LAT | MASK_INLINE_SQRT_THR))
+
 #define TARGET_DWARF2_ASM	(target_flags & MASK_DWARF2_ASM)

 extern int ia64_tls_size;
@ -186,6 +197,10 @@ extern int ia64_tls_size;
      N_("Generate inline integer division, optimize for latency") },	\
  { "inline-int-divide-max-throughput", MASK_INLINE_INT_DIV_THR,	\
      N_("Generate inline integer division, optimize for throughput") },\
+  { "inline-sqrt-min-latency", MASK_INLINE_SQRT_LAT,			\
+      N_("Generate inline square root, optimize for latency") },	\
+  { "inline-sqrt-max-throughput", MASK_INLINE_SQRT_THR,			\
+      N_("Generate inline square root, optimize for throughput") },     \
  { "dwarf2-asm", 	MASK_DWARF2_ASM,				\
      N_("Enable Dwarf 2 line debug info via GNU as")},			\
  { "no-dwarf2-asm", 	-MASK_DWARF2_ASM,				\
--- a/gcc/config/ia64/ia64.md
+++ b/gcc/config/ia64/ia64.md
@ -74,6 +74,8 @@
   (UNSPEC_ADDP4		24)
   (UNSPEC_PROLOGUE_USE		25)
   (UNSPEC_RET_ADDR		26)
+   (UNSPEC_SETF_EXP             27)
+   (UNSPEC_FR_SQRT_RECIP_APPROX 28)
  ])

 (define_constants
@ -2757,6 +2759,155 @@
  operands[10] = CONST1_RTX (XFmode);
 }
  [(set_attr "predicable" "no")])
+
+;; Inline square root.
+
+(define_insn "*sqrt_approx"
+  [(set (match_operand:XF 0 "fr_register_operand" "=f")
+        (div:XF (const_int 1)
+                (sqrt:XF (match_operand:XF 2 "fr_register_operand" "f"))))
+   (set (match_operand:BI 1 "register_operand" "=c")
+        (unspec:BI [(match_dup 2)] UNSPEC_FR_SQRT_RECIP_APPROX))
+   (use (match_operand:SI 3 "const_int_operand" "")) ]
+  ""
+  "frsqrta.s%3 %0, %1 = %2"
+  [(set_attr "itanium_class" "fmisc")
+   (set_attr "predicable" "no")])
+
+(define_insn "*setf_exp_xf"
+  [(set (match_operand:XF 0 "fr_register_operand" "=f")
+        (unspec:XF [(match_operand:DI 1 "register_operand" "r")]
+                  UNSPEC_SETF_EXP))]
+  ""
+  "setf.exp %0 = %1"
+  [(set_attr "itanium_class" "frfr")])
+
+(define_expand "sqrtsf2"
+  [(set (match_operand:SF 0 "fr_register_operand" "=&f")
+	(sqrt:SF (match_operand:SF 1 "fr_register_operand" "f")))]
+  "TARGET_INLINE_SQRT"
+{
+  rtx insn;
+  if (TARGET_INLINE_SQRT_LAT)
+#if 0
+    insn = gen_sqrtsf2_internal_lat (operands[0], operands[1]);
+#else
+    abort ();
+#endif
+  else
+    insn = gen_sqrtsf2_internal_thr (operands[0], operands[1]);
+  emit_insn (insn);
+  DONE;
+})
+
+;; Latency-optimized square root.
+;; FIXME: Implement.
+
+;; Throughput-optimized square root.
+
+(define_insn_and_split "sqrtsf2_internal_thr"
+  [(set (match_operand:SF 0 "fr_register_operand" "=&f")
+	(sqrt:SF (match_operand:SF 1 "fr_register_operand" "f")))
+   ;; Register r2 in optimization guide.
+   (clobber (match_scratch:DI 2 "=r"))
+   ;; Register f8 in optimization guide
+   (clobber (match_scratch:XF 3 "=&f"))
+   ;; Register f9 in optimization guide
+   (clobber (match_scratch:XF 4 "=&f"))
+   ;; Register f10 in optimization guide
+   (clobber (match_scratch:XF 5 "=&f"))
+   ;; Register p6 in optimization guide.
+   (clobber (match_scratch:BI 6 "=c"))]
+  "TARGET_INLINE_SQRT_THR"
+  "#"
+  "&& reload_completed"
+  [ ;; exponent of +1/2 in r2
+    (set (match_dup 2) (const_int 65534))
+    ;; +1/2 in f8
+    (set (match_dup 3) 
+         (unspec:XF [(match_dup 2)] UNSPEC_SETF_EXP))
+    ;; Step 1
+    ;; y0 = 1/sqrt(a) in f7
+    (parallel [(set (match_dup 7)
+                    (div:XF (const_int 1)
+                            (sqrt:XF (match_dup 8))))
+               (set (match_dup 6)
+                    (unspec:BI [(match_dup 8)]
+                                 UNSPEC_FR_SQRT_RECIP_APPROX))
+               (use (const_int 0))])
+    ;; Step 2
+    ;; H0 = 1/2 * y0 in f9
+    (cond_exec (ne (match_dup 6) (const_int 0))
+      (parallel [(set (match_dup 4)
+                      (plus:XF (mult:XF (match_dup 3) (match_dup 7))
+                               (match_dup 9)))
+                 (use (const_int 1))]))
+    ;; Step 3
+    ;; S0 = a * y0 in f7
+    (cond_exec (ne (match_dup 6) (const_int 0))
+      (parallel [(set (match_dup 7)
+                      (plus:XF (mult:XF (match_dup 8) (match_dup 7))
+                               (match_dup 9)))
+                 (use (const_int 1))]))
+    ;; Step 4
+    ;; d = 1/2 - S0 * H0 in f10
+    (cond_exec (ne (match_dup 6) (const_int 0))
+      (parallel [(set (match_dup 5)
+                      (plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 4)))
+                               (match_dup 3)))
+                 (use (const_int 1))]))
+    ;; Step 5
+    ;; d' = d + 1/2 * d in f8
+    (cond_exec (ne (match_dup 6) (const_int 0))
+       (parallel [(set (match_dup 3)
+                       (plus:XF (mult:XF (match_dup 3) (match_dup 5))
+                                (match_dup 5)))
+                  (use (const_int 1))]))
+    ;; Step 6
+    ;; e = d + d * d' in f8
+    (cond_exec (ne (match_dup 6) (const_int 0))
+       (parallel [(set (match_dup 3)
+                       (plus:XF (mult:XF (match_dup 5) (match_dup 3))
+                                (match_dup 5)))
+                  (use (const_int 1))]))
+    ;; Step 7
+    ;; S1 = S0 + e * S0 in f7
+    (cond_exec (ne (match_dup 6) (const_int 0))
+      (parallel [(set (match_dup 0)
+		      (float_truncate:SF
+                        (plus:XF (mult:XF (match_dup 3) (match_dup 7))
+                                 (match_dup 7))))
+                 (use (const_int 1))]))
+    ;; Step 8
+    ;; H1 = H0 + e * H0 in f8
+    (cond_exec (ne (match_dup 6) (const_int 0))
+       (parallel [(set (match_dup 3)
+                       (plus:XF (mult:XF (match_dup 3) (match_dup 4))
+                                (match_dup 4)))
+                  (use (const_int 1))]))
+    ;; Step 9 
+    ;; d1 = a - S1 * S1 in f9
+    (cond_exec (ne (match_dup 6) (const_int 0))
+       (parallel [(set (match_dup 4)
+                       (plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 7)))
+                                (match_dup 8)))
+                  (use (const_int 1))]))
+    ;; Step 10
+    ;; S = S1 + d1 * H1 in f7
+    (cond_exec (ne (match_dup 6) (const_int 0))
+       (parallel [(set (match_dup 0)
+                       (float_truncate:SF
+                         (plus:XF (mult:XF (match_dup 4) (match_dup 3))
+                                  (match_dup 7))))
+                  (use (const_int 0))]))]
+{
+  /* Generate 82-bit versions of the input and output operands.  */
+  operands[7] = gen_rtx_REG (XFmode, REGNO (operands[0]));
+  operands[8] = gen_rtx_REG (XFmode, REGNO (operands[1]));
+  /* Generate required floating-point constants.  */
+  operands[9] = CONST0_RTX (XFmode);
+}
+  [(set_attr "predicable" "no")])

 ;; ::::::::::::::::::::
 ;; ::
@ -3102,6 +3253,155 @@
  operands[10] = CONST1_RTX (XFmode);
 }
  [(set_attr "predicable" "no")])
+
+;; Inline square root.
+
+(define_expand "sqrtdf2"
+  [(set (match_operand:DF 0 "fr_register_operand" "=&f")
+	(sqrt:DF (match_operand:DF 1 "fr_register_operand" "f")))]
+  "TARGET_INLINE_SQRT"
+{
+  rtx insn;
+  if (TARGET_INLINE_SQRT_LAT)
+#if 0
+    insn = gen_sqrtdf2_internal_lat (operands[0], operands[1]);
+#else
+    abort ();
+#endif
+  else
+    insn = gen_sqrtdf2_internal_thr (operands[0], operands[1]);
+  emit_insn (insn);
+  DONE;
+})
+
+;; Latency-optimized square root.
+;; FIXME: Implement.
+
+;; Throughput-optimized square root.
+
+(define_insn_and_split "sqrtdf2_internal_thr"
+  [(set (match_operand:DF 0 "fr_register_operand" "=&f")
+	(sqrt:DF (match_operand:DF 1 "fr_register_operand" "f")))
+   ;; Register r2 in optimization guide.
+   (clobber (match_scratch:DI 2 "=r"))
+   ;; Register f8 in optimization guide
+   (clobber (match_scratch:XF 3 "=&f"))
+   ;; Register f9 in optimization guide
+   (clobber (match_scratch:XF 4 "=&f"))
+   ;; Register f10 in optimization guide
+   (clobber (match_scratch:XF 5 "=&f"))
+   ;; Register p6 in optimization guide.
+   (clobber (match_scratch:BI 6 "=c"))]
+  "TARGET_INLINE_SQRT_THR"
+  "#"
+  "&& reload_completed"
+  [ ;; exponent of +1/2 in r2
+    (set (match_dup 2) (const_int 65534))
+    ;; +1/2 in f10
+    (set (match_dup 5) 
+         (unspec:XF [(match_dup 2)] UNSPEC_SETF_EXP))
+    ;; Step 1
+    ;; y0 = 1/sqrt(a) in f7
+    (parallel [(set (match_dup 7)
+                    (div:XF (const_int 1)
+                            (sqrt:XF (match_dup 8))))
+               (set (match_dup 6)
+                    (unspec:BI [(match_dup 8)]
+                                 UNSPEC_FR_SQRT_RECIP_APPROX))
+               (use (const_int 0))])
+    ;; Step 2
+    ;; H0 = 1/2 * y0 in f8
+    (cond_exec (ne (match_dup 6) (const_int 0))
+      (parallel [(set (match_dup 3)
+                      (plus:XF (mult:XF (match_dup 5) (match_dup 7))
+                               (match_dup 9)))
+                 (use (const_int 1))]))
+    ;; Step 3
+    ;; G0 = a * y0 in f7
+    (cond_exec (ne (match_dup 6) (const_int 0))
+      (parallel [(set (match_dup 7)
+                      (plus:XF (mult:XF (match_dup 8) (match_dup 7))
+                               (match_dup 9)))
+                 (use (const_int 1))]))
+    ;; Step 4
+    ;; r0 = 1/2 - G0 * H0 in f9
+    (cond_exec (ne (match_dup 6) (const_int 0))
+      (parallel [(set (match_dup 4)
+                      (plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 3)))
+                               (match_dup 5)))
+                 (use (const_int 1))]))
+    ;; Step 5
+    ;; H1 = H0 + r0 * H0 in f8
+    (cond_exec (ne (match_dup 6) (const_int 0))
+       (parallel [(set (match_dup 3)
+                       (plus:XF (mult:XF (match_dup 4) (match_dup 3))
+                                (match_dup 3)))
+                  (use (const_int 1))]))
+    ;; Step 6
+    ;; G1 = G0 + r0 * G0 in f7
+    (cond_exec (ne (match_dup 6) (const_int 0))
+       (parallel [(set (match_dup 7)
+                       (plus:XF (mult:XF (match_dup 4) (match_dup 7))
+                                (match_dup 7)))
+                  (use (const_int 1))]))
+    ;; Step 7
+    ;; r1 = 1/2 - G1 * H1 in f9
+    (cond_exec (ne (match_dup 6) (const_int 0))
+      (parallel [(set (match_dup 4)
+                      (plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 3)))
+                               (match_dup 5)))
+                 (use (const_int 1))]))
+    ;; Step 8
+    ;; H2 = H1 + r1 * H1 in f8
+    (cond_exec (ne (match_dup 6) (const_int 0))
+       (parallel [(set (match_dup 3)
+                       (plus:XF (mult:XF (match_dup 4) (match_dup 3))
+                                (match_dup 3)))
+                  (use (const_int 1))]))
+    ;; Step 9 
+    ;; G2 = G1 + r1 * G1 in f7
+    (cond_exec (ne (match_dup 6) (const_int 0))
+       (parallel [(set (match_dup 7)
+                       (plus:XF (mult:XF (match_dup 4) (match_dup 7))
+                                (match_dup 7)))
+                  (use (const_int 1))]))
+    ;; Step 10
+    ;; d2 = a - G2 * G2 in f9
+    (cond_exec (ne (match_dup 6) (const_int 0))
+       (parallel [(set (match_dup 4)
+                       (plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 7)))
+                                (match_dup 8)))
+                  (use (const_int 1))]))
+    ;; Step 11
+    ;; G3 = G2 + d2 * H2 in f7
+    (cond_exec (ne (match_dup 6) (const_int 0))
+       (parallel [(set (match_dup 7)
+                       (plus:XF (mult:XF (match_dup 4) (match_dup 3))
+                                (match_dup 7)))
+                  (use (const_int 1))]))
+    ;; Step 12
+    ;; d3 = a - G3 * G3 in f9
+    (cond_exec (ne (match_dup 6) (const_int 0))
+       (parallel [(set (match_dup 4)
+                       (plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 7)))
+                                (match_dup 8)))
+                  (use (const_int 1))]))
+    ;; Step 13
+    ;; S = G3 + d3 * H2 in f7
+    (cond_exec (ne (match_dup 6) (const_int 0))
+       (parallel [(set (match_dup 0)
+                       (float_truncate:DF
+                         (plus:XF (mult:XF (match_dup 4) (match_dup 3))
+                                  (match_dup 7))))
+                  (use (const_int 0))]))]
+{
+  /* Generate 82-bit versions of the input and output operands.  */
+  operands[7] = gen_rtx_REG (XFmode, REGNO (operands[0]));
+  operands[8] = gen_rtx_REG (XFmode, REGNO (operands[1]));
+  /* Generate required floating-point constants.  */
+  operands[9] = CONST0_RTX (XFmode);
+}
+  [(set_attr "predicable" "no")])

 ;; ::::::::::::::::::::
 ;; ::
@ -3292,6 +3592,17 @@
  "fma.s%4 %0 = %F1, %F2, %F3"
  [(set_attr "itanium_class" "fmac")])

+(define_insn "*maddxf4_alts_truncsf"
+  [(set (match_operand:SF 0 "fr_register_operand" "=f")
+	(float_truncate:SF
+	  (plus:XF (mult:XF (match_operand:XF 1 "xfreg_or_fp01_operand" "fG")
+			    (match_operand:XF 2 "xfreg_or_fp01_operand" "fG"))
+		   (match_operand:XF 3 "xfreg_or_fp01_operand" "fG"))))
+   (use (match_operand:SI 4 "const_int_operand" ""))]
+  ""
+  "fma.s.s%4 %0 = %F1, %F2, %F3"
+  [(set_attr "itanium_class" "fmac")])
+
 (define_insn "*maddxf4_alts_truncdf"
  [(set (match_operand:DF 0 "fr_register_operand" "=f")
 	(float_truncate:DF
@ -3591,6 +3902,170 @@
  "operands[6] = CONST1_RTX (XFmode);"
  [(set_attr "predicable" "no")])

+;; Inline square root.
+
+(define_expand "sqrtxf2"
+  [(set (match_operand:XF 0 "fr_register_operand" "=&f")
+	(sqrt:XF (match_operand:XF 1 "fr_register_operand" "f")))]
+  "TARGET_INLINE_SQRT"
+{
+  rtx insn;
+  if (TARGET_INLINE_SQRT_LAT)
+#if 0
+    insn = gen_sqrtxf2_internal_lat (operands[0], operands[1]);
+#else
+    abort ();
+#endif
+  else
+    insn = gen_sqrtxf2_internal_thr (operands[0], operands[1]);
+  emit_insn (insn);
+  DONE;
+})
+
+;; Latency-optimized square root.
+;; FIXME: Implement.
+
+;; Throughput-optimized square root.
+
+(define_insn_and_split "sqrtxf2_internal_thr"
+  [(set (match_operand:XF 0 "fr_register_operand" "=&f")
+	(sqrt:XF (match_operand:XF 1 "fr_register_operand" "f")))
+   ;; Register r2 in optimization guide.
+   (clobber (match_scratch:DI 2 "=r"))
+   ;; Register f8 in optimization guide
+   (clobber (match_scratch:XF 3 "=&f"))
+   ;; Register f9 in optimization guide
+   (clobber (match_scratch:XF 4 "=&f"))
+   ;; Register f10 in optimization guide
+   (clobber (match_scratch:XF 5 "=&f"))
+   ;; Register f11 in optimization guide; NOTE(review): unused by the split.
+   (clobber (match_scratch:XF 6 "=&f"))
+   ;; Register p6 in optimization guide.
+   (clobber (match_scratch:BI 7 "=c"))]
+  "TARGET_INLINE_SQRT_THR"
+  "#"
+  "&& reload_completed"
+  [ ;; exponent of +1/2 in r2
+    (set (match_dup 2) (const_int 65534))
+    ;; +1/2 in f8.  The Intel manual mistakenly specifies f10.
+    (set (match_dup 3) 
+         (unspec:XF [(match_dup 2)] UNSPEC_SETF_EXP))
+    ;; Step 1
+    ;; y0 = 1/sqrt(a) in f7
+    (parallel [(set (match_dup 8)
+                    (div:XF (const_int 1)
+                            (sqrt:XF (match_dup 9))))
+               (set (match_dup 7)
+                    (unspec:BI [(match_dup 9)]
+                                 UNSPEC_FR_SQRT_RECIP_APPROX))
+               (use (const_int 0))])
+    ;; Step 2
+    ;; H0 = 1/2 * y0 in f9
+    (cond_exec (ne (match_dup 7) (const_int 0))
+      (parallel [(set (match_dup 4)
+                      (plus:XF (mult:XF (match_dup 3) (match_dup 8))
+                               (match_dup 10)))
+                 (use (const_int 1))]))
+    ;; Step 3
+    ;; S0 = a * y0 in f7
+    (cond_exec (ne (match_dup 7) (const_int 0))
+      (parallel [(set (match_dup 8)
+                      (plus:XF (mult:XF (match_dup 9) (match_dup 8))
+                               (match_dup 10)))
+                 (use (const_int 1))]))
+    ;; Step 4
+    ;; d0 = 1/2 - S0 * H0 in f10
+    (cond_exec (ne (match_dup 7) (const_int 0))
+      (parallel [(set (match_dup 5)
+                      (plus:XF (neg:XF (mult:XF (match_dup 8) (match_dup 4)))
+                               (match_dup 3)))
+                 (use (const_int 1))]))
+    ;; Step 5
+    ;; H1 = H0 + d0 * H0 in f9
+    (cond_exec (ne (match_dup 7) (const_int 0))
+       (parallel [(set (match_dup 4)
+                       (plus:XF (mult:XF (match_dup 5) (match_dup 4))
+                                (match_dup 4)))
+                  (use (const_int 1))]))
+    ;; Step 6
+    ;; S1 = S0 + d0 * S0 in f7
+    (cond_exec (ne (match_dup 7) (const_int 0))
+       (parallel [(set (match_dup 8)
+                       (plus:XF (mult:XF (match_dup 5) (match_dup 8))
+                                (match_dup 8)))
+                  (use (const_int 1))]))
+    ;; Step 7
+    ;; d1 = 1/2 - S1 * H1 in f10
+    (cond_exec (ne (match_dup 7) (const_int 0))
+      (parallel [(set (match_dup 5)
+                      (plus:XF (neg:XF (mult:XF (match_dup 8) (match_dup 4)))
+                               (match_dup 3)))
+                 (use (const_int 1))]))
+    ;; Step 8
+    ;; H2 = H1 + d1 * H1 in f9
+    (cond_exec (ne (match_dup 7) (const_int 0))
+       (parallel [(set (match_dup 4)
+                       (plus:XF (mult:XF (match_dup 5) (match_dup 4))
+                                (match_dup 4)))
+                  (use (const_int 1))]))
+    ;; Step 9 
+    ;; S2 = S1 + d1 * S1 in f7
+    (cond_exec (ne (match_dup 7) (const_int 0))
+       (parallel [(set (match_dup 8)
+                       (plus:XF (mult:XF (match_dup 5) (match_dup 8))
+                                (match_dup 8)))
+                  (use (const_int 1))]))
+    ;; Step 10
+    ;; d2 = 1/2 - S2 * H2 in f10
+    (cond_exec (ne (match_dup 7) (const_int 0))
+       (parallel [(set (match_dup 5)
+                       (plus:XF (neg:XF (mult:XF (match_dup 8) (match_dup 4)))
+                                (match_dup 3)))
+                  (use (const_int 1))]))
+    ;; Step 11
+    ;; e2 = a - S2 * S2 in f8
+    (cond_exec (ne (match_dup 7) (const_int 0))
+       (parallel [(set (match_dup 3)
+                       (plus:XF (neg:XF (mult:XF (match_dup 8) (match_dup 8)))
+                                (match_dup 9)))
+                  (use (const_int 1))]))
+    ;; Step 12
+    ;; S3 = S2 + e2 * H2 in f7
+    (cond_exec (ne (match_dup 7) (const_int 0))
+       (parallel [(set (match_dup 8)
+                       (plus:XF (mult:XF (match_dup 3) (match_dup 4))
+                                (match_dup 8)))
+                  (use (const_int 1))]))
+    ;; Step 13
+    ;; H3 = H2 + d2 * H2 in f9
+    (cond_exec (ne (match_dup 7) (const_int 0))
+       (parallel [(set (match_dup 4)
+                       (plus:XF (mult:XF (match_dup 5) (match_dup 4))
+                                (match_dup 4)))
+                  (use (const_int 1))]))
+    ;; Step 14
+    ;; e3 = a - S3 * S3 in f8
+    (cond_exec (ne (match_dup 7) (const_int 0))
+       (parallel [(set (match_dup 3)
+                       (plus:XF (neg:XF (mult:XF (match_dup 8) (match_dup 8)))
+                                (match_dup 9)))
+                  (use (const_int 1))]))
+    ;; Step 15
+    ;; S = S3 + e3 * H3 in f7
+    (cond_exec (ne (match_dup 7) (const_int 0))
+       (parallel [(set (match_dup 0)
+                       (plus:XF (mult:XF (match_dup 3) (match_dup 4))
+                                (match_dup 8)))
+                  (use (const_int 0))]))]
+{
+  /* Generate 82-bit versions of the input and output operands.  */
+  operands[8] = gen_rtx_REG (XFmode, REGNO (operands[0]));
+  operands[9] = gen_rtx_REG (XFmode, REGNO (operands[1]));
+  /* Generate required floating-point constants.  */
+  operands[10] = CONST0_RTX (XFmode);
+}
+  [(set_attr "predicable" "no")])
+
 ;; ??? frcpa works like cmp.foo.unc.

 (define_insn "*recip_approx"