re PR middle-end/31723 (Use reciprocal and reciprocal square root with -ffast-math)

PR middle-end/31723
    * hooks.c (hook_tree_tree_bool_null): New hook.
    * hooks.h (hook_tree_tree_bool_null): Add prototype.
    * tree-pass.h (pass_convert_to_rsqrt): Declare.
    * passes.c (init_optimization_passes): Add pass_convert_to_rsqrt.
    * tree-ssa-math-opts.c (execute_cse_reciprocals): Scan for a/func(b)
    and convert it to reciprocal a*rfunc(b).
    (execute_convert_to_rsqrt): New function.
    (gate_convert_to_rsqrt): New function.
    (pass_convert_to_rsqrt): New pass definition.
    * target.h (struct gcc_target): Add builtin_reciprocal.
    * target-def.h (TARGET_BUILTIN_RECIPROCAL): New define.
    (TARGET_INITIALIZER): Initialize builtin_reciprocal with
    TARGET_BUILTIN_RECIPROCAL.
    * doc/tm.texi (TARGET_BUILTIN_RECIPROCAL): Document.

    * config/i386/i386.h (TARGET_RECIP): New define.
    * config/i386/i386.md (divsf3): Expand by calling ix86_emit_swdivsf
    for TARGET_SSE_MATH and TARGET_RECIP when
    flag_unsafe_math_optimizations is set and not optimizing for size.
    (*rcpsf2_sse): New insn pattern.
    (*rsqrtsf2_sse): Ditto.
    (rsqrtsf2): New expander.  Expand by calling ix86_emit_swsqrtsf
    for TARGET_SSE_MATH and TARGET_RECIP when
    flag_unsafe_math_optimizations is set and not optimizing for size.
    (sqrt<mode>2): Expand SFmode operands by calling ix86_emit_swsqrtsf
    for TARGET_SSE_MATH and TARGET_RECIP when
    flag_unsafe_math_optimizations is set and not optimizing for size.
    * config/i386/sse.md (divv4sf): Expand by calling ix86_emit_swdivsf
    for TARGET_SSE_MATH and TARGET_RECIP when
    flag_unsafe_math_optimizations is set and not optimizing for size.
    (*sse_rsqrtv4sf2): Do not export.
    (sqrtv4sf2): Ditto.
    (sse_rsqrtv4sf2): New expander.  Expand by calling ix86_emit_swsqrtsf
    for TARGET_SSE_MATH and TARGET_RECIP when
    flag_unsafe_math_optimizations is set and not optimizing for size.
    (sqrtv4sf2): Ditto.
    * config/i386/i386.opt (mrecip): New option.
    * config/i386/i386-protos.h (ix86_emit_swdivsf): Declare.
    (ix86_emit_swsqrtsf): Ditto.
    * config/i386/i386.c (IX86_BUILTIN_RSQRTF): New constant.
    (ix86_init_mmx_sse_builtins): __builtin_ia32_rsqrtf: New
    builtin definition.
    (ix86_expand_builtin): Expand IX86_BUILTIN_RSQRTF using
    ix86_expand_unop1_builtin.
    (ix86_emit_swdivsf): New function.
    (ix86_emit_swsqrtsf): Ditto.
    (ix86_builtin_reciprocal): New function.
    (TARGET_BUILTIN_RECIPROCAL): Use it.
    (ix86_vectorize_builtin_conversion): Rename from
    ix86_builtin_conversion.
    (TARGET_VECTORIZE_BUILTIN_CONVERSION): Use renamed function.
    * doc/invoke.texi (Machine Dependent Options): Add -mrecip to
    "i386 and x86_64 Options" section.
    (Intel 386 and AMD x86_64 Options): Document -mrecip.

testsuite/ChangeLog:

    PR middle-end/31723
    * gcc.target/i386/recip-divf.c: New test.
    * gcc.target/i386/recip-sqrtf.c: Ditto.
    * gcc.target/i386/recip-vec-divf.c: Ditto.
    * gcc.target/i386/recip-vec-sqrtf.c: Ditto.
    * gcc.target/i386/sse-recip.c: Ditto.

From-SVN: r125756
This commit is contained in:
Uros Bizjak 2007-06-16 11:52:48 +02:00 committed by Uros Bizjak
parent 5be014d5b7
commit 6b889d891d
22 changed files with 604 additions and 11 deletions

View File

@ -1,3 +1,66 @@
2007-06-16 Uros Bizjak <ubizjak@gmail.com>
PR middle-end/31723
* hooks.c (hook_tree_tree_bool_null): New hook.
* hooks.h (hook_tree_tree_bool_null): Add prototype.
* tree-pass.h (pass_convert_to_rsqrt): Declare.
* passes.c (init_optimization_passes): Add pass_convert_to_rsqrt.
* tree-ssa-math-opts.c (execute_cse_reciprocals): Scan for a/func(b)
and convert it to reciprocal a*rfunc(b).
(execute_convert_to_rsqrt): New function.
(gate_convert_to_rsqrt): New function.
(pass_convert_to_rsqrt): New pass definition.
* target.h (struct gcc_target): Add builtin_reciprocal.
* target-def.h (TARGET_BUILTIN_RECIPROCAL): New define.
(TARGET_INITIALIZER): Initialize builtin_reciprocal with
TARGET_BUILTIN_RECIPROCAL.
* doc/tm.texi (TARGET_BUILTIN_RECIPROCAL): Document.
* config/i386/i386.h (TARGET_RECIP): New define.
* config/i386/i386.md (divsf3): Expand by calling ix86_emit_swdivsf
for TARGET_SSE_MATH and TARGET_RECIP when flag_finite_math_only and
flag_unsafe_math_optimizations are set, flag_trapping_math is unset
and not optimizing for size.
(*rcpsf2_sse): New insn pattern.
(*rsqrtsf2_sse): Ditto.
(rsqrtsf2): New expander. Expand by calling ix86_emit_swsqrtsf
for TARGET_SSE_MATH and TARGET_RECIP when flag_finite_math_only and
flag_unsafe_math_optimizations are set, flag_trapping_math is unset
and not optimizing for size.
(sqrt<mode>2): Expand SFmode operands by calling ix86_emit_swsqrtsf
for TARGET_SSE_MATH and TARGET_RECIP when flag_finite_math_only and
flag_unsafe_math_optimizations are set, flag_trapping_math is unset
and not optimizing for size.
* config/i386/sse.md (divv4sf): Expand by calling ix86_emit_swdivsf
for TARGET_SSE_MATH and TARGET_RECIP when flag_finite_math_only and
flag_unsafe_math_optimizations are set, flag_trapping_math is unset
and not optimizing for size.
(*sse_rsqrtv4sf2): Do not export.
(sqrtv4sf2): Ditto.
(sse_rsqrtv4sf2): New expander. Expand by calling ix86_emit_swsqrtsf
for TARGET_SSE_MATH and TARGET_RECIP when flag_finite_math_only and
flag_unsafe_math_optimizations are set, flag_trapping_math is unset
and not optimizing for size.
(sqrtv4sf2): Ditto.
* config/i386/i386.opt (mrecip): New option.
* config/i386/i386-protos.h (ix86_emit_swdivsf): Declare.
(ix86_emit_swsqrtsf): Ditto.
* config/i386/i386.c (IX86_BUILTIN_RSQRTF): New constant.
(ix86_init_mmx_sse_builtins): __builtin_ia32_rsqrtf: New
builtin definition.
(ix86_expand_builtin): Expand IX86_BUILTIN_RSQRTF using
ix86_expand_unop1_builtin.
(ix86_emit_swdivsf): New function.
(ix86_emit_swsqrtsf): Ditto.
(ix86_builtin_reciprocal): New function.
(TARGET_BUILTIN_RECIPROCAL): Use it.
(ix86_vectorize_builtin_conversion): Rename from
ix86_builtin_conversion.
(TARGET_VECTORIZE_BUILTIN_CONVERSION): Use renamed function.
* doc/invoke.texi (Machine Dependent Options): Add -mrecip to
"i386 and x86_64 Options" section.
(Intel 386 and AMD x86_64 Options): Document -mrecip.
2007-06-15 Andrew Pinski <andrew_pinski@playstation.sony.com>
Zdenek Dvorak <dvorakz@suse.cz>
Richard Guenther <rguenther@suse.de>

View File

@ -163,6 +163,8 @@ extern void x86_emit_floatuns (rtx [2]);
extern void ix86_emit_fp_unordered_jump (rtx);
extern void ix86_emit_i387_log1p (rtx, rtx);
extern void ix86_emit_swdivsf (rtx, rtx, rtx, enum machine_mode);
extern void ix86_emit_swsqrtsf (rtx, rtx, enum machine_mode, bool);
extern enum rtx_code ix86_reverse_condition (enum rtx_code, enum machine_mode);

View File

@ -16450,6 +16450,7 @@ enum ix86_builtins
IX86_BUILTIN_RCPSS,
IX86_BUILTIN_RSQRTPS,
IX86_BUILTIN_RSQRTSS,
IX86_BUILTIN_RSQRTF,
IX86_BUILTIN_SQRTPS,
IX86_BUILTIN_SQRTSS,
@ -18039,6 +18040,10 @@ ix86_init_mmx_sse_builtins (void)
def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_rcpss", v4sf_ftype_v4sf, IX86_BUILTIN_RCPSS);
def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_rsqrtps", v4sf_ftype_v4sf, IX86_BUILTIN_RSQRTPS);
def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_rsqrtss", v4sf_ftype_v4sf, IX86_BUILTIN_RSQRTSS);
ftype = build_function_type_list (float_type_node,
float_type_node,
NULL_TREE);
def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_rsqrtf", ftype, IX86_BUILTIN_RSQRTF);
def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_sqrtps", v4sf_ftype_v4sf, IX86_BUILTIN_SQRTPS);
def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_sqrtss", v4sf_ftype_v4sf, IX86_BUILTIN_SQRTSS);
@ -19133,6 +19138,9 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
emit_insn (pat);
return 0;
case IX86_BUILTIN_RSQRTF:
return ix86_expand_unop1_builtin (CODE_FOR_rsqrtsf2, exp, target);
case IX86_BUILTIN_SQRTSS:
return ix86_expand_unop1_builtin (CODE_FOR_sse_vmsqrtv4sf2, exp, target);
case IX86_BUILTIN_RSQRTSS:
@ -19869,7 +19877,7 @@ ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
input vector of type TYPE, or NULL_TREE if it is not available. */
static tree
ix86_builtin_conversion (unsigned int code, tree type)
ix86_vectorize_builtin_conversion (unsigned int code, tree type)
{
if (TREE_CODE (type) != VECTOR_TYPE)
return NULL_TREE;
@ -19899,6 +19907,32 @@ ix86_builtin_conversion (unsigned int code, tree type)
}
}
/* Implement TARGET_BUILTIN_RECIPROCAL.  Return the decl of the
   target-specific builtin that implements the reciprocal of the
   builtin identified by CODE, or NULL_TREE if none is available or
   the current options do not allow the approximation.

   NOTE(review): CODE is compared against both a generic BUILT_IN_*
   value and an IX86_BUILTIN_* value; confirm that the two
   enumerations cannot hand the same number to this hook.  */

static tree
ix86_builtin_reciprocal (unsigned int code, bool sqrt ATTRIBUTE_UNUSED)
{
  tree decl = NULL_TREE;

  /* Reciprocal builtins are offered only for SSE math with -mrecip,
     when not optimizing for size, and with finite-only, non-trapping,
     unsafe math optimizations in effect.  */
  if (!TARGET_SSE_MATH || !TARGET_RECIP || optimize_size
      || !flag_finite_math_only || flag_trapping_math
      || !flag_unsafe_math_optimizations)
    return decl;

  switch (code)
    {
    case BUILT_IN_SQRTF:
      /* Sqrt to rsqrt conversion.  */
      decl = ix86_builtins[IX86_BUILTIN_RSQRTF];
      break;

    case IX86_BUILTIN_SQRTPS:
      /* Vectorized version of sqrt to rsqrt conversion.  */
      decl = ix86_builtins[IX86_BUILTIN_RSQRTPS];
      break;

    default:
      break;
    }

  return decl;
}
/* Store OPERAND to the memory after reload is completed. This means
that we can't easily use assign_stack_local. */
rtx
@ -22501,6 +22535,100 @@ void ix86_emit_i387_log1p (rtx op0, rtx op1)
emit_label (label2);
}
/* Output code to perform a Newton-Raphson approximation of a single
   precision floating point divide RES = A / B in MODE
   [http://en.wikipedia.org/wiki/N-th_root_algorithm].  When MODE is a
   vector mode the constant 2.0 is built as a vector constant.  */

void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
{
  rtx rcp_est = gen_reg_rtx (mode);
  rtx prod = gen_reg_rtx (mode);
  rtx corr = gen_reg_rtx (mode);
  rtx rcp_ref = gen_reg_rtx (mode);
  rtx two = CONST_DOUBLE_FROM_REAL_VALUE (dconst2, SFmode);
  rtx src;

  if (VECTOR_MODE_P (mode))
    two = ix86_build_const_vector (SFmode, true, two);
  two = force_reg (mode, two);

  /* a / b = a * rcp(b) * (2.0 - b * rcp(b))  */

  /* rcp_est = 1./b estimate  */
  src = gen_rtx_UNSPEC (mode, gen_rtvec (1, b), UNSPEC_RCP);
  emit_insn (gen_rtx_SET (VOIDmode, rcp_est, src));

  /* prod = rcp_est * b  */
  src = gen_rtx_MULT (mode, rcp_est, b);
  emit_insn (gen_rtx_SET (VOIDmode, prod, src));

  /* corr = 2. - prod  */
  src = gen_rtx_MINUS (mode, two, prod);
  emit_insn (gen_rtx_SET (VOIDmode, corr, src));

  /* rcp_ref = rcp_est * corr  */
  src = gen_rtx_MULT (mode, rcp_est, corr);
  emit_insn (gen_rtx_SET (VOIDmode, rcp_ref, src));

  /* res = a * rcp_ref  */
  src = gen_rtx_MULT (mode, a, rcp_ref);
  emit_insn (gen_rtx_SET (VOIDmode, res, src));
}
/* Output code to perform a Newton-Raphson approximation of a single
   precision floating point square root of A, or of its reciprocal
   when RECIP is true, storing the result in RES.  When MODE is a
   vector mode the constants 3.0 and 0.5 are built as vector
   constants.  */

void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
			 bool recip)
{
  rtx est = gen_reg_rtx (mode);
  rtx est_a = gen_reg_rtx (mode);
  rtx est2_a = gen_reg_rtx (mode);
  rtx corr = gen_reg_rtx (mode);
  rtx scaled = gen_reg_rtx (mode);
  rtx three = CONST_DOUBLE_FROM_REAL_VALUE (dconst3, SFmode);
  rtx half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, SFmode);
  rtx src;

  if (VECTOR_MODE_P (mode))
    {
      three = ix86_build_const_vector (SFmode, true, three);
      half = ix86_build_const_vector (SFmode, true, half);
    }

  three = force_reg (mode, three);
  half = force_reg (mode, half);

  /* sqrt(a) = 0.5 * a * rsqrtss(a) * (3.0 - a * rsqrtss(a) * rsqrtss(a))
     1.0 / sqrt(a) = 0.5 * rsqrtss(a) * (3.0 - a * rsqrtss(a) * rsqrtss(a))  */

  /* est = 1./sqrt(a) estimate  */
  src = gen_rtx_UNSPEC (mode, gen_rtvec (1, a), UNSPEC_RSQRT);
  emit_insn (gen_rtx_SET (VOIDmode, est, src));

  /* est_a = est * a  */
  src = gen_rtx_MULT (mode, est, a);
  emit_insn (gen_rtx_SET (VOIDmode, est_a, src));

  /* est2_a = est_a * est  */
  src = gen_rtx_MULT (mode, est_a, est);
  emit_insn (gen_rtx_SET (VOIDmode, est2_a, src));

  /* corr = 3. - est2_a  */
  src = gen_rtx_MINUS (mode, three, est2_a);
  emit_insn (gen_rtx_SET (VOIDmode, corr, src));

  /* scaled = .5 * est for the reciprocal, .5 * est_a for the root.  */
  src = gen_rtx_MULT (mode, half, recip ? est : est_a);
  emit_insn (gen_rtx_SET (VOIDmode, scaled, src));

  /* res = corr * scaled  */
  src = gen_rtx_MULT (mode, corr, scaled);
  emit_insn (gen_rtx_SET (VOIDmode, res, src));
}
/* Solaris implementation of TARGET_ASM_NAMED_SECTION. */
static void ATTRIBUTE_UNUSED
@ -23205,9 +23333,14 @@ static const struct attribute_spec ix86_attribute_table[] =
#define TARGET_EXPAND_BUILTIN ix86_expand_builtin
#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION ix86_builtin_vectorized_function
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
ix86_builtin_vectorized_function
#undef TARGET_VECTORIZE_BUILTIN_CONVERSION
#define TARGET_VECTORIZE_BUILTIN_CONVERSION ix86_builtin_conversion
#define TARGET_VECTORIZE_BUILTIN_CONVERSION ix86_vectorize_builtin_conversion
#undef TARGET_BUILTIN_RECIPROCAL
#define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
#undef TARGET_ASM_FUNCTION_EPILOGUE
#define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue

View File

@ -365,6 +365,7 @@ extern int x86_prefetch_sse;
#define TARGET_POPCNT x86_popcnt
#define TARGET_PREFETCH_SSE x86_prefetch_sse
#define TARGET_SAHF x86_sahf
#define TARGET_RECIP x86_recip
#define ASSEMBLER_DIALECT (ix86_asm_dialect)

View File

@ -7470,7 +7470,16 @@
(div:SF (match_operand:SF 1 "register_operand" "")
(match_operand:SF 2 "nonimmediate_operand" "")))]
"TARGET_80387 || TARGET_SSE_MATH"
"")
{
if (TARGET_SSE_MATH && TARGET_RECIP && !optimize_size
&& flag_finite_math_only && !flag_trapping_math
&& flag_unsafe_math_optimizations)
{
ix86_emit_swdivsf (operands[0], operands[1],
operands[2], SFmode);
DONE;
}
})
;; Remainder instructions.
@ -15516,6 +15525,15 @@
(const_string "fop")))
(set_attr "mode" "SF")])
;; Scalar SFmode UNSPEC_RCP (the reciprocal estimate emitted by
;; ix86_emit_swdivsf), output as rcpss when SSE math is enabled.
(define_insn "*rcpsf2_sse"
[(set (match_operand:SF 0 "register_operand" "=x")
(unspec:SF [(match_operand:SF 1 "nonimmediate_operand" "xm")]
UNSPEC_RCP))]
"TARGET_SSE_MATH"
"rcpss\t{%1, %0|%0, %1}"
[(set_attr "type" "sse")
(set_attr "mode" "SF")])
(define_insn "*fop_sf_1_sse"
[(set (match_operand:SF 0 "register_operand" "=x")
(match_operator:SF 3 "binary_fp_operator"
@ -15980,6 +15998,27 @@
(set_attr "athlon_decode" "direct")
(set_attr "amdfam10_decode" "direct")])
;; Scalar SFmode UNSPEC_RSQRT (the reciprocal square root estimate
;; emitted by ix86_emit_swsqrtsf), output as rsqrtss when SSE math is
;; enabled.
(define_insn "*rsqrtsf2_sse"
[(set (match_operand:SF 0 "register_operand" "=x")
(unspec:SF [(match_operand:SF 1 "nonimmediate_operand" "xm")]
UNSPEC_RSQRT))]
"TARGET_SSE_MATH"
"rsqrtss\t{%1, %0|%0, %1}"
[(set_attr "type" "sse")
(set_attr "mode" "SF")])
;; Expander for the scalar SFmode reciprocal square root.  It is only
;; available under the -mrecip conditions below and always expands to
;; the sequence emitted by ix86_emit_swsqrtsf with its last argument
;; set to 1.  The operands carry predicates only, like the other
;; expanders added alongside it; register constraints belong on the
;; matching define_insn.
(define_expand "rsqrtsf2"
  [(set (match_operand:SF 0 "register_operand" "")
	(unspec:SF [(match_operand:SF 1 "nonimmediate_operand" "")]
		   UNSPEC_RSQRT))]
  "TARGET_SSE_MATH && TARGET_RECIP && !optimize_size
   && flag_finite_math_only && !flag_trapping_math
   && flag_unsafe_math_optimizations"
{
  ix86_emit_swsqrtsf (operands[0], operands[1], SFmode, 1);
  DONE;
})
(define_insn "*sqrt<mode>2_sse"
[(set (match_operand:SSEMODEF 0 "register_operand" "=x")
(sqrt:SSEMODEF
@ -15998,6 +16037,15 @@
"TARGET_USE_FANCY_MATH_387
|| (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)"
{
if (<MODE>mode == SFmode
&& TARGET_SSE_MATH && TARGET_RECIP && !optimize_size
&& flag_finite_math_only && !flag_trapping_math
&& flag_unsafe_math_optimizations)
{
ix86_emit_swsqrtsf (operands[0], operands[1], SFmode, 0);
DONE;
}
if (!(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH))
{
rtx op0 = gen_reg_rtx (XFmode);

View File

@ -258,3 +258,7 @@ Support code generation of popcnt instruction.
msahf
Target Report RejectNegative Var(x86_sahf)
Support code generation of sahf instruction in 64bit x86-64 code.
mrecip
Target Report RejectNegative Var(x86_recip)
Generate reciprocals instead of divss and sqrtss.

View File

@ -450,7 +450,18 @@
(div:V4SF (match_operand:V4SF 1 "register_operand" "")
(match_operand:V4SF 2 "nonimmediate_operand" "")))]
"TARGET_SSE"
"ix86_fixup_binary_operands_no_copy (DIV, V4SFmode, operands);")
{
ix86_fixup_binary_operands_no_copy (DIV, V4SFmode, operands);
if (TARGET_SSE_MATH && TARGET_RECIP && !optimize_size
&& flag_finite_math_only && !flag_trapping_math
&& flag_unsafe_math_optimizations)
{
ix86_emit_swdivsf (operands[0], operands[1],
operands[2], V4SFmode);
DONE;
}
})
(define_insn "*divv4sf3"
[(set (match_operand:V4SF 0 "register_operand" "=x")
@ -494,7 +505,7 @@
[(set_attr "type" "sse")
(set_attr "mode" "SF")])
(define_insn "sse_rsqrtv4sf2"
(define_insn "*sse_rsqrtv4sf2"
[(set (match_operand:V4SF 0 "register_operand" "=x")
(unspec:V4SF
[(match_operand:V4SF 1 "nonimmediate_operand" "xm")] UNSPEC_RSQRT))]
@ -503,6 +514,21 @@
[(set_attr "type" "sse")
(set_attr "mode" "V4SF")])
;; Expander for the V4SF reciprocal square root.  When the -mrecip
;; conditions in the body hold, the result is produced by
;; ix86_emit_swsqrtsf with its last argument set to 1; otherwise the
;; UNSPEC_RSQRT template itself is emitted.
(define_expand "sse_rsqrtv4sf2"
[(set (match_operand:V4SF 0 "register_operand" "")
(unspec:V4SF
[(match_operand:V4SF 1 "nonimmediate_operand" "")] UNSPEC_RSQRT))]
"TARGET_SSE"
{
if (TARGET_SSE_MATH && TARGET_RECIP && !optimize_size
&& flag_finite_math_only && !flag_trapping_math
&& flag_unsafe_math_optimizations)
{
ix86_emit_swsqrtsf (operands[0], operands[1], V4SFmode, 1);
DONE;
}
})
(define_insn "sse_vmrsqrtv4sf2"
[(set (match_operand:V4SF 0 "register_operand" "=x")
(vec_merge:V4SF
@ -515,7 +541,7 @@
[(set_attr "type" "sse")
(set_attr "mode" "SF")])
(define_insn "sqrtv4sf2"
(define_insn "*sqrtv4sf2"
[(set (match_operand:V4SF 0 "register_operand" "=x")
(sqrt:V4SF (match_operand:V4SF 1 "nonimmediate_operand" "xm")))]
"TARGET_SSE"
@ -523,6 +549,20 @@
[(set_attr "type" "sse")
(set_attr "mode" "V4SF")])
;; Expander for the V4SF square root.  When the -mrecip conditions in
;; the body hold, the result is produced by ix86_emit_swsqrtsf with
;; its last argument set to 0; otherwise the sqrt template itself is
;; emitted.  Operand 0 carries a predicate only: the previous lone "="
;; was not a valid constraint string.
(define_expand "sqrtv4sf2"
  [(set (match_operand:V4SF 0 "register_operand" "")
	(sqrt:V4SF (match_operand:V4SF 1 "nonimmediate_operand" "")))]
  "TARGET_SSE"
{
  if (TARGET_SSE_MATH && TARGET_RECIP && !optimize_size
      && flag_finite_math_only && !flag_trapping_math
      && flag_unsafe_math_optimizations)
    {
      ix86_emit_swsqrtsf (operands[0], operands[1], V4SFmode, 0);
      DONE;
    }
})
(define_insn "sse_vmsqrtv4sf2"
[(set (match_operand:V4SF 0 "register_operand" "=x")
(vec_merge:V4SF

View File

@ -548,7 +548,7 @@ Objective-C and Objective-C++ Dialects}.
-masm=@var{dialect} -mno-fancy-math-387 @gol
-mno-fp-ret-in-387 -msoft-float @gol
-mno-wide-multiply -mrtd -malign-double @gol
-mpreferred-stack-boundary=@var{num} -mcx16 -msahf @gol
-mpreferred-stack-boundary=@var{num} -mcx16 -msahf -mrecip @gol
-mmmx -msse -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -msse4 @gol
-msse4a -m3dnow -mpopcnt -mabm @gol
-mthreads -mno-align-stringops -minline-all-stringops @gol
@ -10346,6 +10346,13 @@ SAHF are load and store instructions, respectively, for certain status flags.
In 64-bit mode, SAHF instruction is used to optimize @code{fmod}, @code{drem}
or @code{remainder} built-in functions: see @ref{Other Builtins} for details.
@item -mrecip
@opindex mrecip
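This option will enable GCC to use RCPSS and RSQRTSS instructions (and their
vectorized variants RCPPS and RSQRTPS), refined by an additional
Newton-Raphson step, instead of DIVSS and SQRTSS (and their vectorized
variants).  These instructions will be generated only when
@option{-funsafe-math-optimizations} is enabled together with
@option{-ffinite-math-only} and @option{-fno-trapping-math}, SSE
floating-point math is in use, and the code is not being optimized for size.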
@item -mpush-args
@itemx -mno-push-args
@opindex mpush-args

View File

@ -5345,6 +5345,15 @@ of @var{x}.
The default version returns false for all constants.
@end deftypefn
@deftypefn {Target Hook} tree TARGET_BUILTIN_RECIPROCAL (unsigned int @var{code}, bool @var{sqrt})
This hook should return the DECL of a function that implements reciprocal of
the builtin function with builtin function code @var{code}, or
@code{NULL_TREE} if such a function is not available. When @var{sqrt} is
true, additional optimizations that apply only to the reciprocal of a square
root function are performed, and only reciprocals of @code{sqrt} function
are valid.
@end deftypefn
@deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_MASK_FOR_LOAD (void)
This hook should return the DECL of a function @var{f} that given an
address @var{addr} as an argument returns a mask @var{m} that can be

View File

@ -266,7 +266,15 @@ hook_constcharptr_tree_null (tree t ATTRIBUTE_UNUSED)
}
tree
hook_tree_tree_tree_bool_null (tree t0 ATTRIBUTE_UNUSED, tree t1 ATTRIBUTE_UNUSED,
hook_tree_tree_bool_null (tree t0 ATTRIBUTE_UNUSED,
bool ignore ATTRIBUTE_UNUSED)
{
return NULL;
}
tree
hook_tree_tree_tree_bool_null (tree t0 ATTRIBUTE_UNUSED,
tree t1 ATTRIBUTE_UNUSED,
bool ignore ATTRIBUTE_UNUSED)
{
return NULL;

View File

@ -58,6 +58,7 @@ extern int hook_int_void_no_regs (void);
extern tree hook_tree_tree_tree_null (tree, tree);
extern tree hook_tree_tree_tree_tree_3rd_identity (tree, tree, tree);
extern tree hook_tree_tree_bool_null (tree, bool);
extern tree hook_tree_tree_tree_bool_null (tree, tree, bool);
extern unsigned hook_uint_uint_constcharptrptr_0 (unsigned, const char **);

View File

@ -647,6 +647,7 @@ init_optimization_passes (void)
NEXT_PASS (pass_tree_loop_done);
}
NEXT_PASS (pass_cse_reciprocals);
NEXT_PASS (pass_convert_to_rsqrt);
NEXT_PASS (pass_reassoc);
NEXT_PASS (pass_vrp);
NEXT_PASS (pass_dominator);

View File

@ -350,8 +350,10 @@ Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
TARGET_SCHED_SET_SCHED_FLAGS}
#define TARGET_VECTORIZE_BUILTIN_MASK_FOR_LOAD 0
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION default_builtin_vectorized_function
#define TARGET_VECTORIZE_BUILTIN_CONVERSION default_builtin_vectorized_conversion
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
default_builtin_vectorized_function
#define TARGET_VECTORIZE_BUILTIN_CONVERSION \
default_builtin_vectorized_conversion
#define TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_EVEN 0
#define TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_ODD 0
@ -385,6 +387,9 @@ Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#define TARGET_RESOLVE_OVERLOADED_BUILTIN NULL
#define TARGET_FOLD_BUILTIN hook_tree_tree_tree_bool_null
/* In tree-ssa-math-opts.c */
#define TARGET_BUILTIN_RECIPROCAL hook_tree_tree_bool_null
/* In varasm.c. */
#ifndef TARGET_SECTION_TYPE_FLAGS
#define TARGET_SECTION_TYPE_FLAGS default_section_type_flags
@ -668,6 +673,7 @@ Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
TARGET_EXPAND_BUILTIN, \
TARGET_RESOLVE_OVERLOADED_BUILTIN, \
TARGET_FOLD_BUILTIN, \
TARGET_BUILTIN_RECIPROCAL, \
TARGET_MANGLE_FUNDAMENTAL_TYPE, \
TARGET_INIT_LIBFUNCS, \
TARGET_SECTION_TYPE_FLAGS, \

View File

@ -483,6 +483,10 @@ struct gcc_target
/* Fold a target-specific builtin. */
tree (* fold_builtin) (tree fndecl, tree arglist, bool ignore);
/* Returns the decl of a target-specific builtin that implements
reciprocal of the function, or NULL_TREE if not available. */
tree (* builtin_reciprocal) (unsigned, bool);
/* For a vendor-specific fundamental TYPE, return a pointer to
a statically-allocated string containing the C++ mangling for
TYPE. In all other cases, return NULL. */

View File

@ -1,3 +1,12 @@
2007-06-16 Uros Bizjak <ubizjak@gmail.com>
PR middle-end/31723
* gcc.target/i386/recip-divf.c: New test.
* gcc.target/i386/recip-sqrtf.c: Ditto.
* gcc.target/i386/recip-vec-divf.c: Ditto.
* gcc.target/i386/recip-vec-sqrtf.c: Ditto.
* gcc.target/i386/sse-recip.c: Ditto.
2007-06-15 Andrew Pinski <andrew_pinski@playstation.sony.com>
PR tree-opt/32225

View File

@ -0,0 +1,9 @@
/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
/* { dg-options "-O2 -ffast-math -msse2 -mfpmath=sse -mrecip" } */
/* Plain single precision division; the scan-assembler directive of
   this test expects it to be compiled to rcpss.  */
float t1(float a, float b)
{
return a / b;
}
/* { dg-final { scan-assembler "rcpss" } } */

View File

@ -0,0 +1,21 @@
/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
/* { dg-options "-O2 -ffast-math -msse2 -mfpmath=sse -mrecip" } */
extern float sqrtf (float);
/* Division by a square root: a / sqrtf (b).  One of the three
   functions counted by the rsqrtss scan of this test.  */
float t1(float a, float b)
{
return a/sqrtf(b);
}
/* Square root of a quotient: sqrtf (a / b).  The parameter x is not
   used in the body.  */
float t2(float x, float a, float b)
{
return sqrtf(a/b);
}
/* Plain square root: sqrtf (a).  */
float t3(float a)
{
return sqrtf(a);
}
/* { dg-final { scan-assembler-times "rsqrtss" 3 } } */

View File

@ -0,0 +1,16 @@
/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse2 -mfpmath=sse -mrecip" } */
float a[16];
float b[16];
float r[16];
/* Element-wise division of the global arrays a and b into r over 16
   elements; the scan-assembler directive of this test expects rcpps.  */
void t1(void)
{
int i;
for (i = 0; i < 16; i++)
r[i] = a[i] / b[i];
}
/* { dg-final { scan-assembler "rcpps" } } */

View File

@ -0,0 +1,34 @@
/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse2 -mfpmath=sse -mrecip" } */
float a[16];
float b[16];
float r[16];
extern float sqrtf (float);
/* Element-wise a[i] / sqrtf (b[i]) over the 16-element global arrays.
   One of the three loops counted by the rsqrtps scan of this test.  */
void t1(void)
{
int i;
for (i = 0; i < 16; i++)
r[i] = a[i] / sqrtf (b[i]);
}
/* Element-wise sqrtf (a[i] / b[i]) over the 16-element global arrays.  */
void t2(void)
{
int i;
for (i = 0; i < 16; i++)
r[i] = sqrtf (a[i] / b[i]);
}
/* Element-wise sqrtf (a[i]) over the 16-element global arrays.  */
void t3(void)
{
int i;
for (i = 0; i < 16; i++)
r[i] = sqrtf (a[i]);
}
/* { dg-final { scan-assembler-times "rsqrtps" 3 } } */

View File

@ -0,0 +1,51 @@
/* { dg-do run } */
/* { dg-options "-O2 -ffast-math -msse -mfpmath=sse -mrecip" } */
#include "../../gcc.dg/i386-cpuid.h"
extern float sqrtf (float);
extern void abort (void);
#define N 8
/* Compute r[i] = sqrtf (a[i] / b[i]) for the N fixed inputs and abort
   if any result is not equal to the expected value in rc[].  Returns 0
   when all results match.  */
int __attribute__((noinline))
main1 ()
{
float a[N] = { 0.f, 18.f, 108.f, 324.f, 720.f, 1944.f, 3087.f, 5832.f };
float b[N] = { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f };
float r[N];
/* Expected square roots of a[i] / b[i].  */
float rc[N] = { 0.f, 3.f, 6.f, 9.f, 12.f, 18.f, 21.f, 27.f };
int i;
for (i = 0; i < N; i++)
{
r[i] = sqrtf (a[i] / b[i]);
}
/* check results: */
for (i = 0; i < N; i++)
{
/* The comparison is exact; no tolerance is applied.  */
if (r[i] != rc[i])
abort();
}
return 0;
}
/* Run main1 only when i386_cpuid reports MMX, SSE and CMOV; otherwise
   return 0 without running the check.  */
int
main ()
{
unsigned long cpu_facilities;
cpu_facilities = i386_cpuid ();
if ((cpu_facilities & (bit_MMX | bit_SSE | bit_CMOV))
!= (bit_MMX | bit_SSE | bit_CMOV))
/* If host has no vector support, pass. */
return 0;
main1 ();
return 0;
}

View File

@ -293,6 +293,7 @@ extern struct tree_opt_pass pass_early_warn_uninitialized;
extern struct tree_opt_pass pass_late_warn_uninitialized;
extern struct tree_opt_pass pass_cse_reciprocals;
extern struct tree_opt_pass pass_cse_sincos;
extern struct tree_opt_pass pass_convert_to_rsqrt;
extern struct tree_opt_pass pass_warn_function_return;
extern struct tree_opt_pass pass_warn_function_noreturn;
extern struct tree_opt_pass pass_phiopt;

View File

@ -496,6 +496,46 @@ execute_cse_reciprocals (void)
&& TREE_CODE (def) == SSA_NAME)
execute_cse_reciprocals_1 (&bsi, def);
}
/* Scan for a/func(b) and convert it to reciprocal a*rfunc(b). */
for (bsi = bsi_after_labels (bb); !bsi_end_p (bsi); bsi_next (&bsi))
{
tree stmt = bsi_stmt (bsi);
tree fndecl;
if (TREE_CODE (stmt) == GIMPLE_MODIFY_STMT
&& TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1)) == RDIV_EXPR)
{
tree arg1 = TREE_OPERAND (GIMPLE_STMT_OPERAND (stmt, 1), 1);
tree stmt1 = SSA_NAME_DEF_STMT (arg1);
if (TREE_CODE (stmt1) == GIMPLE_MODIFY_STMT
&& TREE_CODE (GIMPLE_STMT_OPERAND (stmt1, 1)) == CALL_EXPR
&& (fndecl
= get_callee_fndecl (GIMPLE_STMT_OPERAND (stmt1, 1)))
&& (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
|| DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
{
enum built_in_function code;
tree arg10;
tree tmp;
code = DECL_FUNCTION_CODE (fndecl);
fndecl = targetm.builtin_reciprocal (code, false);
if (!fndecl)
continue;
arg10 = CALL_EXPR_ARG (GIMPLE_STMT_OPERAND (stmt1, 1), 0);
tmp = build_call_expr (fndecl, 1, arg10);
GIMPLE_STMT_OPERAND (stmt1, 1) = tmp;
update_stmt (stmt1);
TREE_SET_CODE (GIMPLE_STMT_OPERAND (stmt, 1), MULT_EXPR);
fold_stmt_inplace (stmt);
update_stmt (stmt);
}
}
}
}
free_dominance_info (CDI_DOMINATORS);
@ -726,3 +766,88 @@ struct tree_opt_pass pass_cse_sincos =
| TODO_verify_stmts, /* todo_flags_finish */
0 /* letter */
};
/* Find all expressions in the form of sqrt(a/b) and
   convert them to rsqrt(b/a).

   For every statement that assigns the result of a call to a normal
   or machine-dependent builtin, the target is asked (with its SQRT
   argument set to true) for the matching reciprocal builtin.  When
   one exists and the call argument is an SSA name defined by an
   RDIV_EXPR, the operands of that division are swapped in place and
   the call is replaced by a call to the reciprocal builtin.
   Always returns 0.  */

static unsigned int
execute_convert_to_rsqrt (void)
{
  basic_block bb;

  FOR_EACH_BB (bb)
    {
      block_stmt_iterator bsi;

      for (bsi = bsi_after_labels (bb); !bsi_end_p (bsi); bsi_next (&bsi))
	{
	  tree stmt = bsi_stmt (bsi);
	  tree fndecl;

	  if (TREE_CODE (stmt) == GIMPLE_MODIFY_STMT
	      && TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1)) == CALL_EXPR
	      && (fndecl = get_callee_fndecl (GIMPLE_STMT_OPERAND (stmt, 1)))
	      && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
		  || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
	    {
	      enum built_in_function code;
	      tree arg1;
	      tree stmt1;

	      code = DECL_FUNCTION_CODE (fndecl);

	      fndecl = targetm.builtin_reciprocal (code, true);
	      if (!fndecl)
		continue;

	      arg1 = CALL_EXPR_ARG (GIMPLE_STMT_OPERAND (stmt, 1), 0);

	      /* Only an SSA name has a defining statement to look at;
		 SSA_NAME_DEF_STMT must not be applied to anything else.  */
	      if (TREE_CODE (arg1) != SSA_NAME)
		continue;

	      stmt1 = SSA_NAME_DEF_STMT (arg1);

	      if (TREE_CODE (stmt1) == GIMPLE_MODIFY_STMT
		  && TREE_CODE (GIMPLE_STMT_OPERAND (stmt1, 1)) == RDIV_EXPR)
		{
		  tree arg10, arg11;
		  tree tmp;

		  arg10 = TREE_OPERAND (GIMPLE_STMT_OPERAND (stmt1, 1), 0);
		  arg11 = TREE_OPERAND (GIMPLE_STMT_OPERAND (stmt1, 1), 1);

		  /* Swap operands of RDIV_EXPR.
		     NOTE(review): this rewrites the definition of ARG1
		     in place, so any other use of ARG1 would see the
		     swapped quotient too -- confirm ARG1 has no other
		     uses when this is reached.  */
		  TREE_OPERAND (GIMPLE_STMT_OPERAND (stmt1, 1), 0) = arg11;
		  TREE_OPERAND (GIMPLE_STMT_OPERAND (stmt1, 1), 1) = arg10;
		  fold_stmt_inplace (stmt1);
		  update_stmt (stmt1);

		  /* Replace the original call by the reciprocal builtin
		     applied to the (now swapped) quotient.  */
		  tmp = build_call_expr (fndecl, 1, arg1);
		  GIMPLE_STMT_OPERAND (stmt, 1) = tmp;
		  update_stmt (stmt);
		}
	    }
	}
    }

  return 0;
}
static bool
gate_convert_to_rsqrt (void)
{
return flag_unsafe_math_optimizations && optimize;
}
/* Pass descriptor for the "rsqrt" pass: gated by gate_convert_to_rsqrt,
   executed by execute_convert_to_rsqrt, and requiring SSA form.  */
struct tree_opt_pass pass_convert_to_rsqrt =
{
"rsqrt", /* name */
gate_convert_to_rsqrt, /* gate */
execute_convert_to_rsqrt, /* execute */
NULL, /* sub */
NULL, /* next */
0, /* static_pass_number */
0, /* tv_id */
PROP_ssa, /* properties_required */
0, /* properties_provided */
0, /* properties_destroyed */
0, /* todo_flags_start */
TODO_dump_func | TODO_update_ssa | TODO_verify_ssa
| TODO_verify_stmts, /* todo_flags_finish */
0 /* letter */
};