ia64.md (UNSPEC_SETF_EXP, UNSPEC_FR_SQRT_RECIP_APPROX): New constants.
* ia64.md (UNSPEC_SETF_EXP, UNSPEC_FR_SQRT_RECIP_APPROX): New constants. (*sqrt_approx): New instruction pattern for approximate reciprocal square roots. (*setf_exp_xf): New instruction pattern for setf.exp (set a floating-point register from an exponent value). (*maddxf4_alts_truncsf): New instruction pattern for multiply-add with truncation to SFmode. (sqrtsf2_internal_thr): New define_insn_and_split implementing throughput-optimized inline calculation of SFmode square root. (sqrtdf2_internal_thr): Likewise for DFmode. (sqrtxf2_internal_thr): Likewise for XFmode. (sqrtsf2, sqrtdf2, sqrtxf2): New expanders to choose between latency- and throughput-optimized square root algorithms. * ia64.h (MASK_INLINE_SQRT_LAT, MASK_INLINE_SQRT_THR, TARGET_INLINE_SQRT_LAT, TARGET_INLINE_SQRT_THR, TARGET_INLINE_SQRT): New macros. (TARGET_SWITCHES): Add -minline-sqrt-min-latency and -minline-sqrt-max-throughput. * ia64.c (ia64_override_options): If both -minline-sqrt-min-latency and -minline-sqrt-max-throughput are given, notify the user that both options cannot be used simultaneously. If -minline-sqrt-min-latency is given, notify the user that this mode is not yet implemented. (rtx_needs_barrier): Reformat initial comment to obey 72-character width limit. Support UNSPEC_SETF_EXP and UNSPEC_FR_SQRT_RECIP_APPROX. From-SVN: r73027
This commit is contained in:
parent
1e8fee4a42
commit
b38ba46301
|
@ -1,3 +1,29 @@
|
|||
2003-10-28 Zack Weinberg <zack@codesourcery.com>
|
||||
|
||||
* ia64.md (UNSPEC_SETF_EXP, UNSPEC_FR_SQRT_RECIP_APPROX): New constants.
|
||||
(*sqrt_approx): New instruction pattern for approximate reciprocal square roots.
|
||||
(*setf_exp_xf): New instruction pattern for setf.exp (set a floating-point register from an exponent value).
|
||||
(*maddxf4_alts_truncsf): New instruction pattern for multiply-add with truncation to SFmode.
|
||||
(sqrtsf2_internal_thr): New define_insn_and_split implementing
|
||||
throughput-optimized inline calculation of SFmode square root.
|
||||
(sqrtdf2_internal_thr): Likewise for DFmode.
|
||||
(sqrtxf2_internal_thr): Likewise for XFmode.
|
||||
(sqrtsf2, sqrtdf2, sqrtxf2): New expanders to choose between
|
||||
latency- and throughput-optimized square root algorithms.
|
||||
* ia64.h (MASK_INLINE_SQRT_LAT, MASK_INLINE_SQRT_THR,
|
||||
TARGET_INLINE_SQRT_LAT, TARGET_INLINE_SQRT_THR, TARGET_INLINE_SQRT):
|
||||
New macros.
|
||||
(TARGET_SWITCHES): Add -minline-sqrt-min-latency and
|
||||
-minline-sqrt-max-throughput.
|
||||
* ia64.c (ia64_override_options): If both -minline-sqrt-min-latency
|
||||
and -minline-sqrt-max-throughput are given, notify the user
|
||||
that both options cannot be used simultaneously.
|
||||
If -minline-sqrt-min-latency is given, notify the user that
|
||||
this mode is not yet implemented.
|
||||
(rtx_needs_barrier): Reformat initial comment to obey
|
||||
72-character width limit. Support UNSPEC_SETF_EXP and
|
||||
UNSPEC_FR_SQRT_RECIP_APPROX.
|
||||
|
||||
2003-10-29 Alan Modra <amodra@bigpond.net.au>
|
||||
|
||||
* config/rs6000/rs6000.md (movdf_softfloat64): Allow dummy ctr,ctr
|
||||
|
@ -12,7 +38,7 @@
|
|||
2003-10-28 Richard Earnshaw <rearnsha@arm.com>
|
||||
|
||||
* arm.c (arm_output_epilogue): When using a frame pointer, don't emit
|
||||
an extra stack adjustment insn if the stack pointer is already
|
||||
an extra stack adjustment insn if the stack pointer is already
|
||||
pointing at the right place.
|
||||
(use_return_insn): Allow a return insn to be used when we have a
|
||||
frame pointer if the stack pointer is in the right place.
|
||||
|
|
|
@ -4487,6 +4487,18 @@ ia64_override_options (void)
|
|||
target_flags &= ~MASK_INLINE_INT_DIV_THR;
|
||||
}
|
||||
|
||||
if (TARGET_INLINE_SQRT_LAT && TARGET_INLINE_SQRT_THR)
|
||||
{
|
||||
warning ("cannot optimize square root for both latency and throughput");
|
||||
target_flags &= ~MASK_INLINE_SQRT_THR;
|
||||
}
|
||||
|
||||
if (TARGET_INLINE_SQRT_LAT)
|
||||
{
|
||||
warning ("not yet implemented: latency-optimized inline square root");
|
||||
target_flags &= ~MASK_INLINE_SQRT_LAT;
|
||||
}
|
||||
|
||||
if (ia64_fixed_range_string)
|
||||
fix_range (ia64_fixed_range_string);
|
||||
|
||||
|
@ -4896,9 +4908,9 @@ set_src_needs_barrier (rtx x, struct reg_flags flags, int pred, rtx cond)
|
|||
return need_barrier;
|
||||
}
|
||||
|
||||
/* Handle an access to rtx X of type FLAGS using predicate register PRED.
|
||||
Return 1 is this access creates a dependency with an earlier instruction
|
||||
in the same group. */
|
||||
/* Handle an access to rtx X of type FLAGS using predicate register
|
||||
PRED. Return 1 if this access creates a dependency with an earlier
|
||||
instruction in the same group. */
|
||||
|
||||
static int
|
||||
rtx_needs_barrier (rtx x, struct reg_flags flags, int pred)
|
||||
|
@ -5124,7 +5136,9 @@ rtx_needs_barrier (rtx x, struct reg_flags flags, int pred)
|
|||
case UNSPEC_FR_SPILL:
|
||||
case UNSPEC_FR_RESTORE:
|
||||
case UNSPEC_GETF_EXP:
|
||||
case UNSPEC_SETF_EXP:
|
||||
case UNSPEC_ADDP4:
|
||||
case UNSPEC_FR_SQRT_RECIP_APPROX:
|
||||
need_barrier = rtx_needs_barrier (XVECEXP (x, 0, 0), flags, pred);
|
||||
break;
|
||||
|
||||
|
|
|
@ -87,6 +87,10 @@ extern int target_flags;
|
|||
|
||||
#define MASK_INLINE_INT_DIV_THR 0x00001000 /* inline div, max throughput. */
|
||||
|
||||
#define MASK_INLINE_SQRT_LAT 0x00002000 /* inline sqrt, min latency. */
|
||||
|
||||
#define MASK_INLINE_SQRT_THR 0x00004000 /* inline sqrt, max throughput. */
|
||||
|
||||
#define MASK_DWARF2_ASM 0x40000000 /* test dwarf2 line info via gas. */
|
||||
|
||||
/* NOTE(review): 0x00002000 is also the value of MASK_INLINE_SQRT_LAT
   above, so the two masks select the same bit -- confirm which one
   should move to a bit that no other MASK_* definition uses.  */
#define MASK_EARLY_STOP_BITS 0x00002000 /* tune stop bits for the model. */
|
||||
|
@ -127,6 +131,13 @@ extern int target_flags;
|
|||
#define TARGET_INLINE_INT_DIV \
|
||||
(target_flags & (MASK_INLINE_INT_DIV_LAT | MASK_INLINE_INT_DIV_THR))
|
||||
|
||||
#define TARGET_INLINE_SQRT_LAT (target_flags & MASK_INLINE_SQRT_LAT)
|
||||
|
||||
#define TARGET_INLINE_SQRT_THR (target_flags & MASK_INLINE_SQRT_THR)
|
||||
|
||||
#define TARGET_INLINE_SQRT \
|
||||
(target_flags & (MASK_INLINE_SQRT_LAT | MASK_INLINE_SQRT_THR))
|
||||
|
||||
#define TARGET_DWARF2_ASM (target_flags & MASK_DWARF2_ASM)
|
||||
|
||||
extern int ia64_tls_size;
|
||||
|
@ -186,6 +197,10 @@ extern int ia64_tls_size;
|
|||
N_("Generate inline integer division, optimize for latency") }, \
|
||||
{ "inline-int-divide-max-throughput", MASK_INLINE_INT_DIV_THR, \
|
||||
N_("Generate inline integer division, optimize for throughput") },\
|
||||
{ "inline-sqrt-min-latency", MASK_INLINE_SQRT_LAT, \
|
||||
N_("Generate inline square root, optimize for latency") }, \
|
||||
{ "inline-sqrt-max-throughput", MASK_INLINE_SQRT_THR, \
|
||||
N_("Generate inline square root, optimize for throughput") }, \
|
||||
{ "dwarf2-asm", MASK_DWARF2_ASM, \
|
||||
N_("Enable Dwarf 2 line debug info via GNU as")}, \
|
||||
{ "no-dwarf2-asm", -MASK_DWARF2_ASM, \
|
||||
|
|
|
@ -74,6 +74,8 @@
|
|||
(UNSPEC_ADDP4 24)
|
||||
(UNSPEC_PROLOGUE_USE 25)
|
||||
(UNSPEC_RET_ADDR 26)
|
||||
(UNSPEC_SETF_EXP 27)
|
||||
(UNSPEC_FR_SQRT_RECIP_APPROX 28)
|
||||
])
|
||||
|
||||
(define_constants
|
||||
|
@ -2757,6 +2759,155 @@
|
|||
operands[10] = CONST1_RTX (XFmode);
|
||||
}
|
||||
[(set_attr "predicable" "no")])
|
||||
|
||||
;; Inline square root.
|
||||
|
||||
;; Reciprocal square root approximation.  Operand 0 receives an
;; approximation of 1/sqrt (operand 2), and predicate operand 1 is set
;; by the same frsqrta instruction.  Operand 3 is a constant that
;; selects the ".sN" suffix of the emitted mnemonic.
(define_insn "*sqrt_approx"
|
||||
[(set (match_operand:XF 0 "fr_register_operand" "=f")
|
||||
(div:XF (const_int 1)
|
||||
(sqrt:XF (match_operand:XF 2 "fr_register_operand" "f"))))
|
||||
(set (match_operand:BI 1 "register_operand" "=c")
|
||||
(unspec:BI [(match_dup 2)] UNSPEC_FR_SQRT_RECIP_APPROX))
|
||||
(use (match_operand:SI 3 "const_int_operand" "")) ]
|
||||
""
|
||||
"frsqrta.s%3 %0, %1 = %2"
|
||||
[(set_attr "itanium_class" "fmisc")
|
||||
(set_attr "predicable" "no")])
|
||||
|
||||
;; Emit setf.exp: XFmode floating-point register operand 0 is set from
;; DImode general register operand 1.  The square-root split patterns
;; use this to build the constant +1/2 from its exponent value.
(define_insn "*setf_exp_xf"
|
||||
[(set (match_operand:XF 0 "fr_register_operand" "=f")
|
||||
(unspec:XF [(match_operand:DI 1 "register_operand" "r")]
|
||||
UNSPEC_SETF_EXP))]
|
||||
""
|
||||
"setf.exp %0 = %1"
|
||||
[(set_attr "itanium_class" "frfr")])
|
||||
|
||||
;; SFmode square root expander, enabled when TARGET_INLINE_SQRT is set.
;; It emits the throughput-optimized pattern.  The latency-optimized
;; generator call is compiled out (#if 0), so selecting that variant
;; reaches abort ().
(define_expand "sqrtsf2"
|
||||
[(set (match_operand:SF 0 "fr_register_operand" "=&f")
|
||||
(sqrt:SF (match_operand:SF 1 "fr_register_operand" "f")))]
|
||||
"TARGET_INLINE_SQRT"
|
||||
{
|
||||
rtx insn;
|
||||
if (TARGET_INLINE_SQRT_LAT)
|
||||
#if 0
|
||||
insn = gen_sqrtsf2_internal_lat (operands[0], operands[1]);
|
||||
#else
|
||||
abort ();
|
||||
#endif
|
||||
else
|
||||
insn = gen_sqrtsf2_internal_thr (operands[0], operands[1]);
|
||||
emit_insn (insn);
|
||||
DONE;
|
||||
})
|
||||
|
||||
;; Latency-optimized square root.
|
||||
;; FIXME: Implement.
|
||||
|
||||
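;; Throughput-optimized square root (SFmode).  The pattern below is
;; split after reload into an frsqrta seed followed by predicated
;; multiply-add refinement steps.  The step numbers and the r2/f7-f10/p6
;; names in its comments refer to the optimization guide's listing; the
;; actual registers are the pattern's operands and scratches.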
|
||||
|
||||
(define_insn_and_split "sqrtsf2_internal_thr"
|
||||
[(set (match_operand:SF 0 "fr_register_operand" "=&f")
|
||||
(sqrt:SF (match_operand:SF 1 "fr_register_operand" "f")))
|
||||
;; Register r2 in optimization guide.
|
||||
(clobber (match_scratch:DI 2 "=r"))
|
||||
;; Register f8 in optimization guide
|
||||
(clobber (match_scratch:XF 3 "=&f"))
|
||||
;; Register f9 in optimization guide
|
||||
(clobber (match_scratch:XF 4 "=&f"))
|
||||
;; Register f10 in optimization guide
|
||||
(clobber (match_scratch:XF 5 "=&f"))
|
||||
;; Register p6 in optimization guide.
|
||||
(clobber (match_scratch:BI 6 "=c"))]
|
||||
"TARGET_INLINE_SQRT_THR"
|
||||
"#"
|
||||
"&& reload_completed"
|
||||
[ ;; exponent of +1/2 in r2
|
||||
(set (match_dup 2) (const_int 65534))
|
||||
;; +1/2 in f8
|
||||
(set (match_dup 3)
|
||||
(unspec:XF [(match_dup 2)] UNSPEC_SETF_EXP))
|
||||
;; Step 1
|
||||
;; y0 = 1/sqrt(a) in f7
|
||||
(parallel [(set (match_dup 7)
|
||||
(div:XF (const_int 1)
|
||||
(sqrt:XF (match_dup 8))))
|
||||
(set (match_dup 6)
|
||||
(unspec:BI [(match_dup 8)]
|
||||
UNSPEC_FR_SQRT_RECIP_APPROX))
|
||||
(use (const_int 0))])
|
||||
;; Step 2
|
||||
;; H0 = 1/2 * y0 in f9
|
||||
(cond_exec (ne (match_dup 6) (const_int 0))
|
||||
(parallel [(set (match_dup 4)
|
||||
(plus:XF (mult:XF (match_dup 3) (match_dup 7))
|
||||
(match_dup 9)))
|
||||
(use (const_int 1))]))
|
||||
;; Step 3
|
||||
;; S0 = a * y0 in f7
|
||||
(cond_exec (ne (match_dup 6) (const_int 0))
|
||||
(parallel [(set (match_dup 7)
|
||||
(plus:XF (mult:XF (match_dup 8) (match_dup 7))
|
||||
(match_dup 9)))
|
||||
(use (const_int 1))]))
|
||||
;; Step 4
|
||||
;; d = 1/2 - S0 * H0 in f10
|
||||
(cond_exec (ne (match_dup 6) (const_int 0))
|
||||
(parallel [(set (match_dup 5)
|
||||
(plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 4)))
|
||||
(match_dup 3)))
|
||||
(use (const_int 1))]))
|
||||
;; Step 5
|
||||
;; d' = d + 1/2 * d in f8
|
||||
(cond_exec (ne (match_dup 6) (const_int 0))
|
||||
(parallel [(set (match_dup 3)
|
||||
(plus:XF (mult:XF (match_dup 3) (match_dup 5))
|
||||
(match_dup 5)))
|
||||
(use (const_int 1))]))
|
||||
;; Step 6
|
||||
;; e = d + d * d' in f8
|
||||
(cond_exec (ne (match_dup 6) (const_int 0))
|
||||
(parallel [(set (match_dup 3)
|
||||
(plus:XF (mult:XF (match_dup 5) (match_dup 3))
|
||||
(match_dup 5)))
|
||||
(use (const_int 1))]))
|
||||
;; Step 7
|
||||
;; S1 = S0 + e * S0 in f7
|
||||
(cond_exec (ne (match_dup 6) (const_int 0))
|
||||
(parallel [(set (match_dup 0)
|
||||
(float_truncate:SF
|
||||
(plus:XF (mult:XF (match_dup 3) (match_dup 7))
|
||||
(match_dup 7))))
|
||||
(use (const_int 1))]))
|
||||
;; Step 8
|
||||
;; H1 = H0 + e * H0 in f8
|
||||
(cond_exec (ne (match_dup 6) (const_int 0))
|
||||
(parallel [(set (match_dup 3)
|
||||
(plus:XF (mult:XF (match_dup 3) (match_dup 4))
|
||||
(match_dup 4)))
|
||||
(use (const_int 1))]))
|
||||
;; Step 9
|
||||
;; d1 = a - S1 * S1 in f9
|
||||
(cond_exec (ne (match_dup 6) (const_int 0))
|
||||
(parallel [(set (match_dup 4)
|
||||
(plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 7)))
|
||||
(match_dup 8)))
|
||||
(use (const_int 1))]))
|
||||
;; Step 10
|
||||
;; S = S1 + d1 * H1 in f7
|
||||
(cond_exec (ne (match_dup 6) (const_int 0))
|
||||
(parallel [(set (match_dup 0)
|
||||
(float_truncate:SF
|
||||
(plus:XF (mult:XF (match_dup 4) (match_dup 3))
|
||||
(match_dup 7))))
|
||||
(use (const_int 0))]))]
|
||||
{
|
||||
/* Generate 82-bit versions of the input and output operands. */
|
||||
operands[7] = gen_rtx_REG (XFmode, REGNO (operands[0]));
|
||||
operands[8] = gen_rtx_REG (XFmode, REGNO (operands[1]));
|
||||
/* Generate required floating-point constants. */
|
||||
operands[9] = CONST0_RTX (XFmode);
|
||||
}
|
||||
[(set_attr "predicable" "no")])
|
||||
|
||||
;; ::::::::::::::::::::
|
||||
;; ::
|
||||
|
@ -3102,6 +3253,155 @@
|
|||
operands[10] = CONST1_RTX (XFmode);
|
||||
}
|
||||
[(set_attr "predicable" "no")])
|
||||
|
||||
;; Inline square root.
|
||||
|
||||
;; DFmode square root expander, enabled when TARGET_INLINE_SQRT is set.
;; It emits the throughput-optimized pattern.  The latency-optimized
;; generator call is compiled out (#if 0), so selecting that variant
;; reaches abort ().
(define_expand "sqrtdf2"
|
||||
[(set (match_operand:DF 0 "fr_register_operand" "=&f")
|
||||
(sqrt:DF (match_operand:DF 1 "fr_register_operand" "f")))]
|
||||
"TARGET_INLINE_SQRT"
|
||||
{
|
||||
rtx insn;
|
||||
if (TARGET_INLINE_SQRT_LAT)
|
||||
#if 0
|
||||
insn = gen_sqrtdf2_internal_lat (operands[0], operands[1]);
|
||||
#else
|
||||
abort ();
|
||||
#endif
|
||||
else
|
||||
insn = gen_sqrtdf2_internal_thr (operands[0], operands[1]);
|
||||
emit_insn (insn);
|
||||
DONE;
|
||||
})
|
||||
|
||||
;; Latency-optimized square root.
|
||||
;; FIXME: Implement.
|
||||
|
||||
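;; Throughput-optimized square root (DFmode).  The pattern below is
;; split after reload into an frsqrta seed followed by predicated
;; multiply-add refinement steps.  The step numbers and the r2/f7-f10/p6
;; names in its comments refer to the optimization guide's listing; the
;; actual registers are the pattern's operands and scratches.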
|
||||
|
||||
(define_insn_and_split "sqrtdf2_internal_thr"
|
||||
[(set (match_operand:DF 0 "fr_register_operand" "=&f")
|
||||
(sqrt:DF (match_operand:DF 1 "fr_register_operand" "f")))
|
||||
;; Register r2 in optimization guide.
|
||||
(clobber (match_scratch:DI 2 "=r"))
|
||||
;; Register f8 in optimization guide
|
||||
(clobber (match_scratch:XF 3 "=&f"))
|
||||
;; Register f9 in optimization guide
|
||||
(clobber (match_scratch:XF 4 "=&f"))
|
||||
;; Register f10 in optimization guide
|
||||
(clobber (match_scratch:XF 5 "=&f"))
|
||||
;; Register p6 in optimization guide.
|
||||
(clobber (match_scratch:BI 6 "=c"))]
|
||||
"TARGET_INLINE_SQRT_THR"
|
||||
"#"
|
||||
"&& reload_completed"
|
||||
[ ;; exponent of +1/2 in r2
|
||||
(set (match_dup 2) (const_int 65534))
|
||||
;; +1/2 in f10
|
||||
(set (match_dup 5)
|
||||
(unspec:XF [(match_dup 2)] UNSPEC_SETF_EXP))
|
||||
;; Step 1
|
||||
;; y0 = 1/sqrt(a) in f7
|
||||
(parallel [(set (match_dup 7)
|
||||
(div:XF (const_int 1)
|
||||
(sqrt:XF (match_dup 8))))
|
||||
(set (match_dup 6)
|
||||
(unspec:BI [(match_dup 8)]
|
||||
UNSPEC_FR_SQRT_RECIP_APPROX))
|
||||
(use (const_int 0))])
|
||||
;; Step 2
|
||||
;; H0 = 1/2 * y0 in f8
|
||||
(cond_exec (ne (match_dup 6) (const_int 0))
|
||||
(parallel [(set (match_dup 3)
|
||||
(plus:XF (mult:XF (match_dup 5) (match_dup 7))
|
||||
(match_dup 9)))
|
||||
(use (const_int 1))]))
|
||||
;; Step 3
|
||||
;; G0 = a * y0 in f7
|
||||
(cond_exec (ne (match_dup 6) (const_int 0))
|
||||
(parallel [(set (match_dup 7)
|
||||
(plus:XF (mult:XF (match_dup 8) (match_dup 7))
|
||||
(match_dup 9)))
|
||||
(use (const_int 1))]))
|
||||
;; Step 4
|
||||
;; r0 = 1/2 - G0 * H0 in f9
|
||||
(cond_exec (ne (match_dup 6) (const_int 0))
|
||||
(parallel [(set (match_dup 4)
|
||||
(plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 3)))
|
||||
(match_dup 5)))
|
||||
(use (const_int 1))]))
|
||||
;; Step 5
|
||||
;; H1 = H0 + r0 * H0 in f8
|
||||
(cond_exec (ne (match_dup 6) (const_int 0))
|
||||
(parallel [(set (match_dup 3)
|
||||
(plus:XF (mult:XF (match_dup 4) (match_dup 3))
|
||||
(match_dup 3)))
|
||||
(use (const_int 1))]))
|
||||
;; Step 6
|
||||
;; G1 = G0 + r0 * G0 in f7
|
||||
(cond_exec (ne (match_dup 6) (const_int 0))
|
||||
(parallel [(set (match_dup 7)
|
||||
(plus:XF (mult:XF (match_dup 4) (match_dup 7))
|
||||
(match_dup 7)))
|
||||
(use (const_int 1))]))
|
||||
;; Step 7
|
||||
;; r1 = 1/2 - G1 * H1 in f9
|
||||
(cond_exec (ne (match_dup 6) (const_int 0))
|
||||
(parallel [(set (match_dup 4)
|
||||
(plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 3)))
|
||||
(match_dup 5)))
|
||||
(use (const_int 1))]))
|
||||
;; Step 8
|
||||
;; H2 = H1 + r1 * H1 in f8
|
||||
(cond_exec (ne (match_dup 6) (const_int 0))
|
||||
(parallel [(set (match_dup 3)
|
||||
(plus:XF (mult:XF (match_dup 4) (match_dup 3))
|
||||
(match_dup 3)))
|
||||
(use (const_int 1))]))
|
||||
;; Step 9
|
||||
;; G2 = G1 + r1 * G1 in f7
|
||||
(cond_exec (ne (match_dup 6) (const_int 0))
|
||||
(parallel [(set (match_dup 7)
|
||||
(plus:XF (mult:XF (match_dup 4) (match_dup 7))
|
||||
(match_dup 7)))
|
||||
(use (const_int 1))]))
|
||||
;; Step 10
|
||||
;; d2 = a - G2 * G2 in f9
|
||||
(cond_exec (ne (match_dup 6) (const_int 0))
|
||||
(parallel [(set (match_dup 4)
|
||||
(plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 7)))
|
||||
(match_dup 8)))
|
||||
(use (const_int 1))]))
|
||||
;; Step 11
|
||||
;; G3 = G2 + d2 * H2 in f7
|
||||
(cond_exec (ne (match_dup 6) (const_int 0))
|
||||
(parallel [(set (match_dup 7)
|
||||
(plus:XF (mult:XF (match_dup 4) (match_dup 3))
|
||||
(match_dup 7)))
|
||||
(use (const_int 1))]))
|
||||
;; Step 12
|
||||
;; d3 = a - G3 * G3 in f9
|
||||
(cond_exec (ne (match_dup 6) (const_int 0))
|
||||
(parallel [(set (match_dup 4)
|
||||
(plus:XF (neg:XF (mult:XF (match_dup 7) (match_dup 7)))
|
||||
(match_dup 8)))
|
||||
(use (const_int 1))]))
|
||||
;; Step 13
|
||||
;; S = G3 + d3 * H2 in f7
|
||||
(cond_exec (ne (match_dup 6) (const_int 0))
|
||||
(parallel [(set (match_dup 0)
|
||||
(float_truncate:DF
|
||||
(plus:XF (mult:XF (match_dup 4) (match_dup 3))
|
||||
(match_dup 7))))
|
||||
(use (const_int 0))]))]
|
||||
{
|
||||
/* Generate 82-bit versions of the input and output operands. */
|
||||
operands[7] = gen_rtx_REG (XFmode, REGNO (operands[0]));
|
||||
operands[8] = gen_rtx_REG (XFmode, REGNO (operands[1]));
|
||||
/* Generate required floating-point constants. */
|
||||
operands[9] = CONST0_RTX (XFmode);
|
||||
}
|
||||
[(set_attr "predicable" "no")])
|
||||
|
||||
;; ::::::::::::::::::::
|
||||
;; ::
|
||||
|
@ -3292,6 +3592,17 @@
|
|||
"fma.s%4 %0 = %F1, %F2, %F3"
|
||||
[(set_attr "itanium_class" "fmac")])
|
||||
|
||||
;; XFmode multiply-add (operand 1 * operand 2 + operand 3) whose result
;; is truncated to SFmode operand 0.  Emitted as fma.s, with constant
;; operand 4 selecting the ".sN" suffix of the mnemonic.
(define_insn "*maddxf4_alts_truncsf"
|
||||
[(set (match_operand:SF 0 "fr_register_operand" "=f")
|
||||
(float_truncate:SF
|
||||
(plus:XF (mult:XF (match_operand:XF 1 "xfreg_or_fp01_operand" "fG")
|
||||
(match_operand:XF 2 "xfreg_or_fp01_operand" "fG"))
|
||||
(match_operand:XF 3 "xfreg_or_fp01_operand" "fG"))))
|
||||
(use (match_operand:SI 4 "const_int_operand" ""))]
|
||||
""
|
||||
"fma.s.s%4 %0 = %F1, %F2, %F3"
|
||||
[(set_attr "itanium_class" "fmac")])
|
||||
|
||||
(define_insn "*maddxf4_alts_truncdf"
|
||||
[(set (match_operand:DF 0 "fr_register_operand" "=f")
|
||||
(float_truncate:DF
|
||||
|
@ -3591,6 +3902,170 @@
|
|||
"operands[6] = CONST1_RTX (XFmode);"
|
||||
[(set_attr "predicable" "no")])
|
||||
|
||||
;; Inline square root.
|
||||
|
||||
;; XFmode square root expander, enabled when TARGET_INLINE_SQRT is set.
;; It emits the throughput-optimized pattern.  The latency-optimized
;; generator call is compiled out (#if 0), so selecting that variant
;; reaches abort ().
(define_expand "sqrtxf2"
|
||||
[(set (match_operand:XF 0 "fr_register_operand" "=&f")
|
||||
(sqrt:XF (match_operand:XF 1 "fr_register_operand" "f")))]
|
||||
"TARGET_INLINE_SQRT"
|
||||
{
|
||||
rtx insn;
|
||||
if (TARGET_INLINE_SQRT_LAT)
|
||||
#if 0
|
||||
insn = gen_sqrtxf2_internal_lat (operands[0], operands[1]);
|
||||
#else
|
||||
abort ();
|
||||
#endif
|
||||
else
|
||||
insn = gen_sqrtxf2_internal_thr (operands[0], operands[1]);
|
||||
emit_insn (insn);
|
||||
DONE;
|
||||
})
|
||||
|
||||
;; Latency-optimized square root.
|
||||
;; FIXME: Implement.
|
||||
|
||||
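;; Throughput-optimized square root (XFmode).  The pattern below is
;; split after reload into an frsqrta seed followed by predicated
;; multiply-add refinement steps.  The step numbers and the r2/f7-f11/p6
;; names in its comments refer to the optimization guide's listing; the
;; actual registers are the pattern's operands and scratches.
;; NOTE(review): scratch operand 6 (f11 in the guide) is clobbered by
;; the pattern but never referenced by the split sequence -- confirm
;; whether the scratch can be dropped.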
|
||||
|
||||
(define_insn_and_split "sqrtxf2_internal_thr"
|
||||
[(set (match_operand:XF 0 "fr_register_operand" "=&f")
|
||||
(sqrt:XF (match_operand:XF 1 "fr_register_operand" "f")))
|
||||
;; Register r2 in optimization guide.
|
||||
(clobber (match_scratch:DI 2 "=r"))
|
||||
;; Register f8 in optimization guide
|
||||
(clobber (match_scratch:XF 3 "=&f"))
|
||||
;; Register f9 in optimization guide
|
||||
(clobber (match_scratch:XF 4 "=&f"))
|
||||
;; Register f10 in optimization guide
|
||||
(clobber (match_scratch:XF 5 "=&f"))
|
||||
;; Register f11 in optimization guide
|
||||
(clobber (match_scratch:XF 6 "=&f"))
|
||||
;; Register p6 in optimization guide.
|
||||
(clobber (match_scratch:BI 7 "=c"))]
|
||||
"TARGET_INLINE_SQRT_THR"
|
||||
"#"
|
||||
"&& reload_completed"
|
||||
[ ;; exponent of +1/2 in r2
|
||||
(set (match_dup 2) (const_int 65534))
|
||||
;; +1/2 in f8. The Intel manual mistakenly specifies f10.
|
||||
(set (match_dup 3)
|
||||
(unspec:XF [(match_dup 2)] UNSPEC_SETF_EXP))
|
||||
;; Step 1
|
||||
;; y0 = 1/sqrt(a) in f7
|
||||
(parallel [(set (match_dup 8)
|
||||
(div:XF (const_int 1)
|
||||
(sqrt:XF (match_dup 9))))
|
||||
(set (match_dup 7)
|
||||
(unspec:BI [(match_dup 9)]
|
||||
UNSPEC_FR_SQRT_RECIP_APPROX))
|
||||
(use (const_int 0))])
|
||||
;; Step 2
|
||||
;; H0 = 1/2 * y0 in f9
|
||||
(cond_exec (ne (match_dup 7) (const_int 0))
|
||||
(parallel [(set (match_dup 4)
|
||||
(plus:XF (mult:XF (match_dup 3) (match_dup 8))
|
||||
(match_dup 10)))
|
||||
(use (const_int 1))]))
|
||||
;; Step 3
|
||||
;; S0 = a * y0 in f7
|
||||
(cond_exec (ne (match_dup 7) (const_int 0))
|
||||
(parallel [(set (match_dup 8)
|
||||
(plus:XF (mult:XF (match_dup 9) (match_dup 8))
|
||||
(match_dup 10)))
|
||||
(use (const_int 1))]))
|
||||
;; Step 4
|
||||
;; d0 = 1/2 - S0 * H0 in f10
|
||||
(cond_exec (ne (match_dup 7) (const_int 0))
|
||||
(parallel [(set (match_dup 5)
|
||||
(plus:XF (neg:XF (mult:XF (match_dup 8) (match_dup 4)))
|
||||
(match_dup 3)))
|
||||
(use (const_int 1))]))
|
||||
;; Step 5
|
||||
;; H1 = H0 + d0 * H0 in f9
|
||||
(cond_exec (ne (match_dup 7) (const_int 0))
|
||||
(parallel [(set (match_dup 4)
|
||||
(plus:XF (mult:XF (match_dup 5) (match_dup 4))
|
||||
(match_dup 4)))
|
||||
(use (const_int 1))]))
|
||||
;; Step 6
|
||||
;; S1 = S0 + d0 * S0 in f7
|
||||
(cond_exec (ne (match_dup 7) (const_int 0))
|
||||
(parallel [(set (match_dup 8)
|
||||
(plus:XF (mult:XF (match_dup 5) (match_dup 8))
|
||||
(match_dup 8)))
|
||||
(use (const_int 1))]))
|
||||
;; Step 7
|
||||
;; d1 = 1/2 - S1 * H1 in f10
|
||||
(cond_exec (ne (match_dup 7) (const_int 0))
|
||||
(parallel [(set (match_dup 5)
|
||||
(plus:XF (neg:XF (mult:XF (match_dup 8) (match_dup 4)))
|
||||
(match_dup 3)))
|
||||
(use (const_int 1))]))
|
||||
;; Step 8
|
||||
;; H2 = H1 + d1 * H1 in f9
|
||||
(cond_exec (ne (match_dup 7) (const_int 0))
|
||||
(parallel [(set (match_dup 4)
|
||||
(plus:XF (mult:XF (match_dup 5) (match_dup 4))
|
||||
(match_dup 4)))
|
||||
(use (const_int 1))]))
|
||||
;; Step 9
|
||||
;; S2 = S1 + d1 * S1 in f7
|
||||
(cond_exec (ne (match_dup 7) (const_int 0))
|
||||
(parallel [(set (match_dup 8)
|
||||
(plus:XF (mult:XF (match_dup 5) (match_dup 8))
|
||||
(match_dup 8)))
|
||||
(use (const_int 1))]))
|
||||
;; Step 10
|
||||
;; d2 = 1/2 - S2 * H2 in f10
|
||||
(cond_exec (ne (match_dup 7) (const_int 0))
|
||||
(parallel [(set (match_dup 5)
|
||||
(plus:XF (neg:XF (mult:XF (match_dup 8) (match_dup 4)))
|
||||
(match_dup 3)))
|
||||
(use (const_int 1))]))
|
||||
;; Step 11
|
||||
;; e2 = a - S2 * S2 in f8
|
||||
(cond_exec (ne (match_dup 7) (const_int 0))
|
||||
(parallel [(set (match_dup 3)
|
||||
(plus:XF (neg:XF (mult:XF (match_dup 8) (match_dup 8)))
|
||||
(match_dup 9)))
|
||||
(use (const_int 1))]))
|
||||
;; Step 12
|
||||
;; S3 = S2 + e2 * H2 in f7
|
||||
(cond_exec (ne (match_dup 7) (const_int 0))
|
||||
(parallel [(set (match_dup 8)
|
||||
(plus:XF (mult:XF (match_dup 3) (match_dup 4))
|
||||
(match_dup 8)))
|
||||
(use (const_int 1))]))
|
||||
;; Step 13
|
||||
;; H3 = H2 + d2 * H2 in f9
|
||||
(cond_exec (ne (match_dup 7) (const_int 0))
|
||||
(parallel [(set (match_dup 4)
|
||||
(plus:XF (mult:XF (match_dup 5) (match_dup 4))
|
||||
(match_dup 4)))
|
||||
(use (const_int 1))]))
|
||||
;; Step 14
|
||||
;; e3 = a - S3 * S3 in f8
|
||||
(cond_exec (ne (match_dup 7) (const_int 0))
|
||||
(parallel [(set (match_dup 3)
|
||||
(plus:XF (neg:XF (mult:XF (match_dup 8) (match_dup 8)))
|
||||
(match_dup 9)))
|
||||
(use (const_int 1))]))
|
||||
;; Step 15
|
||||
;; S = S3 + e3 * H3 in f7
|
||||
(cond_exec (ne (match_dup 7) (const_int 0))
|
||||
(parallel [(set (match_dup 0)
|
||||
(plus:XF (mult:XF (match_dup 3) (match_dup 4))
|
||||
(match_dup 8)))
|
||||
(use (const_int 0))]))]
|
||||
{
|
||||
/* Generate 82-bit versions of the input and output operands. */
|
||||
operands[8] = gen_rtx_REG (XFmode, REGNO (operands[0]));
|
||||
operands[9] = gen_rtx_REG (XFmode, REGNO (operands[1]));
|
||||
/* Generate required floating-point constants. */
|
||||
operands[10] = CONST0_RTX (XFmode);
|
||||
}
|
||||
[(set_attr "predicable" "no")])
|
||||
|
||||
;; ??? frcpa works like cmp.foo.unc.
|
||||
|
||||
(define_insn "*recip_approx"
|
||||
|
|
Loading…
Reference in New Issue