re PR tree-optimization/23109 (compiler generates wrong code leading to spurious division by zero with -funsafe-math-optimizations (instead of -ftrapping-math))

gcc:
2006-01-11  Paolo Bonzini  <bonzini@gnu.org>

	PR tree-optimization/23109
	PR tree-optimization/23948
	PR tree-optimization/24123

	* Makefile.in (tree-ssa-math-opts.o): Adjust dependencies.
        * tree-cfg.c (single_noncomplex_succ): New.
        * tree-flow.h (single_noncomplex_succ): Declare it.
        * tree-ssa-math-opts.c (enum place_reciprocal): Remove.
        (struct occurrence, occ_head, occ_pool, occ_new, is_division_by,
	compute_merit, insert_bb, register_division_in, insert_reciprocals,
	replace_reciprocal, free_bb): New.
        (execute_cse_reciprocals_1): Rewritten.
        (execute_cse_reciprocals): Adjust calls to execute_cse_reciprocals_1.
        Do not commit any edge insertion.  Always compute dominators and
        create the allocation pool.
        * target-def.h (TARGET_MIN_DIVISIONS_FOR_RECIP_MUL): New.
	* target.h (struct gcc_target): Add min_divisions_for_recip_mul.
	* targhooks.c (default_min_divisions_for_recip_mul): New.
	* targhooks.h (default_min_divisions_for_recip_mul): New prototype.
        * passes.c (init_optimization_passes): Run recip after tree loop
        optimizations.
        * doc/tm.texi (Misc): Document TARGET_MIN_DIVISIONS_FOR_RECIP_MUL.

gcc/testsuite:
2006-01-11  Paolo Bonzini  <bonzini@gnu.org>
        
        PR tree-optimization/23109
        PR tree-optimization/23948
        PR tree-optimization/24123

        * gcc.dg/tree-ssa/recip-3.c, gcc.dg/tree-ssa/recip-4.c,
        gcc.dg/tree-ssa/recip-5.c, gcc.dg/tree-ssa/recip-6.c,
        gcc.dg/tree-ssa/recip-7.c, gcc.dg/tree-ssa/pr23109.c,
        g++.dg/tree-ssa/pr23948.C: New testcases.
        * gcc.dg/tree-ssa/recip-2.c, gcc.dg/tree-ssa/pr23234.c: Provide
	three divisions in order to do the optimization.

From-SVN: r109578
This commit is contained in:
Paolo Bonzini 2006-01-11 13:02:18 +00:00
parent 4d779342f0
commit bc23502b7f
21 changed files with 745 additions and 119 deletions

View File

@ -1,3 +1,29 @@
2006-01-11 Paolo Bonzini <bonzini@gnu.org>
PR tree-optimization/23109
PR tree-optimization/23948
PR tree-optimization/24123
* Makefile.in (tree-ssa-math-opts.o): Adjust dependencies.
* tree-cfg.c (single_noncomplex_succ): New.
* tree-flow.h (single_noncomplex_succ): Declare it.
* tree-ssa-math-opts.c (enum place_reciprocal): Remove.
(struct occurrence, occ_head, occ_pool, occ_new, is_division_by,
compute_merit, insert_bb, register_division_in, insert_reciprocals,
replace_reciprocal, free_bb): New.
(execute_cse_reciprocals_1): Rewritten.
(execute_cse_reciprocals): Adjust calls to execute_cse_reciprocals_1.
Do not commit any edge insertion. Always compute dominators and
create the allocation pool.
* target-def.h (TARGET_MIN_DIVISIONS_FOR_RECIP_MUL): New.
* target.h (struct gcc_target): Add min_divisions_for_recip_mul.
* targhooks.c (default_min_divisions_for_recip_mul): New.
* targhooks.h (default_min_divisions_for_recip_mul): New prototype.
* passes.c (init_optimization_passes): Run recip after tree loop
optimizations.
* doc/tm.texi (Misc): Document TARGET_MIN_DIVISIONS_FOR_RECIP_MUL.
2005-01-11 Danny Berlin <dberlin@dberlin.org>
Kenneth Zadeck <zadeck@naturalbridge.com>
@ -151,31 +177,31 @@
2006-01-10 John David Anglin <dave.anglin@nrc-cnrc.gc.ca>
PR target/20754
* pa.md: Create separate 32 and 64-bit move patterns for SI, DI, SF
and DF modes. Add alternatives to copy between general and floating
point registers to the 32-bit patterns.
* pa-64.h (SECONDARY_MEMORY_NEEDED_RTX): Delete undefine.
* pa.h (SECONDARY_MEMORY_NEEDED_RTX): Delete define.
* config/pa/pa.md: Create separate 32 and 64-bit move patterns
for SI, DI, SF and DF modes. Add alternatives to copy between
general and floating point registers to the 32-bit patterns.
* config/pa/pa-64.h (SECONDARY_MEMORY_NEEDED_RTX): Delete undefine.
* config/pa/pa.h (SECONDARY_MEMORY_NEEDED_RTX): Delete define.
(SECONDARY_MEMORY_NEEDED): Secondary memory is only needed when
generating 64-bit code.
* pa.c (output_move_double): Handle copies between general and
floating registers.
* config/pa/pa.c (output_move_double): Handle copies between general
and floating registers.
2006-01-10 Stuart Hastings <stuart@apple.com>
* gcc/config/i386/i386.md (set_got): Update.
* config/i386/i386.md (set_got): Update.
(set_got_labelled): New. (UNSPEC_LD_MPIC): New.
(builtin_setjmp_receiver): Mach-O support.
* gcc/config/i386/darwin.h (TARGET_ASM_FILE_END) Define.
* config/i386/darwin.h (TARGET_ASM_FILE_END) Define.
(GOT_SYMBOL_NAME): Define.
(FORCE_PREFERRED_STACK_BOUNDARY_IN_MAIN): New.
(TARGET_DEEP_BRANCH_PREDICTION): Remove.
* gcc/config/i386/i386.c (override_options): Revise for Darwin.
* config/i386/i386.c (override_options): Revise for Darwin.
(USE_HIDDEN_LINKONCE): Enable for Mach-O. (ix86_file_end): Mach-O
support. (darwin_x86_file_end): New. (output_set_got): Add label
parameter, revise for Mach-O. (x86_output_mi_thunk): Likewise.
* gcc/config/i386/i386-protos.h (output_set_got): Likewise.
* gcc/config/darwin.c (machopic_legitimize_pic_address): Update
* config/i386/i386-protos.h (output_set_got): Likewise.
* config/darwin.c (machopic_legitimize_pic_address): Update
regs_ever_live[].
2006-01-10 Kaz Kojima <kkojima@gcc.gnu.org>
@ -604,7 +630,7 @@
2006-01-03 Adrian Straetling <straetling@de.ibm.com>
* gcc/builtins.c (get_builtin_sync_mem): New function.
* builtins.c (get_builtin_sync_mem): New function.
(expand_builtin_sync_operation, expand_builtin_compare_and_swap,
expand_builtin_lock_test_and_set, expand_builtin_lock_release):
Call get_builtin_sync_mem to generate mem rtx.

View File

@ -1970,7 +1970,8 @@ tree-ssa-loop-im.o : tree-ssa-loop-im.c $(TREE_FLOW_H) $(CONFIG_H) \
$(TREE_DUMP_H) tree-pass.h $(FLAGS_H) real.h $(BASIC_BLOCK_H) \
hard-reg-set.h
tree-ssa-math-opts.o : tree-ssa-math-opts.c $(TREE_FLOW_H) $(CONFIG_H) \
$(SYSTEM_H) $(TREE_H) $(TIMEVAR_H) tree-pass.h $(TM_H) $(FLAGS_H)
$(SYSTEM_H) $(TREE_H) $(TIMEVAR_H) tree-pass.h $(TM_H) $(FLAGS_H) \
alloc-pool.h $(BASIC_BLOCK_H) $(TARGET_H)
tree-ssa-alias.o : tree-ssa-alias.c $(TREE_FLOW_H) $(CONFIG_H) $(SYSTEM_H) \
$(RTL_H) $(TREE_H) $(TM_P_H) $(EXPR_H) $(GGC_H) tree-inline.h $(FLAGS_H) \
function.h $(TIMEVAR_H) convert.h $(TM_H) coretypes.h langhooks.h \

View File

@ -8893,6 +8893,15 @@ point number to a signed fixed point number also convert validly to an
unsigned one.
@end defmac
@deftypefn {Target Hook} int TARGET_MIN_DIVISIONS_FOR_RECIP_MUL (enum machine_mode @var{mode})
When @option{-ffast-math} is in effect, GCC tries to optimize
divisions by the same divisor, by turning them into multiplications by
the reciprocal. This target hook specifies the minimum number of divisions
that should be there for GCC to perform the optimization for a variable
of mode @var{mode}. The default implementation returns 3 if the machine
has an instruction for the division, and 2 if it does not.
@end deftypefn
@defmac MOVE_MAX
The maximum number of bytes that a single instruction can move quickly
between memory and registers or between two memory locations.

View File

@ -551,12 +551,12 @@ init_optimization_passes (void)
we add may_alias right after fold builtins
which can create arbitrary GIMPLE. */
NEXT_PASS (pass_may_alias);
NEXT_PASS (pass_cse_reciprocals);
NEXT_PASS (pass_split_crit_edges);
NEXT_PASS (pass_pre);
NEXT_PASS (pass_may_alias);
NEXT_PASS (pass_sink_code);
NEXT_PASS (pass_tree_loop);
NEXT_PASS (pass_cse_reciprocals);
NEXT_PASS (pass_reassoc);
NEXT_PASS (pass_dominator);

View File

@ -336,6 +336,10 @@ Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#define TARGET_SHIFT_TRUNCATION_MASK default_shift_truncation_mask
#endif
#ifndef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
#define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL default_min_divisions_for_recip_mul
#endif
#ifndef TARGET_VALID_POINTER_MODE
#define TARGET_VALID_POINTER_MODE default_valid_pointer_mode
#endif
@ -588,6 +592,7 @@ Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
TARGET_ENCODE_SECTION_INFO, \
TARGET_STRIP_NAME_ENCODING, \
TARGET_SHIFT_TRUNCATION_MASK, \
TARGET_MIN_DIVISIONS_FOR_RECIP_MUL, \
TARGET_VALID_POINTER_MODE, \
TARGET_SCALAR_MODE_SUPPORTED_P, \
TARGET_VECTOR_MODE_SUPPORTED_P, \

View File

@ -440,6 +440,11 @@ struct gcc_target
return the mask that they apply. Return 0 otherwise. */
unsigned HOST_WIDE_INT (* shift_truncation_mask) (enum machine_mode mode);
/* Return the number of divisions in the given MODE that should be present,
so that it is profitable to turn the division into a multiplication by
the reciprocal. */
unsigned int (* min_divisions_for_recip_mul) (enum machine_mode mode);
/* True if MODE is valid for a pointer in __attribute__((mode("MODE"))). */
bool (* valid_pointer_mode) (enum machine_mode mode);

View File

@ -148,6 +148,14 @@ default_shift_truncation_mask (enum machine_mode mode)
return SHIFT_COUNT_TRUNCATED ? GET_MODE_BITSIZE (mode) - 1 : 0;
}
/* The default implementation of TARGET_MIN_DIVISIONS_FOR_RECIP_MUL. */
unsigned int
default_min_divisions_for_recip_mul (enum machine_mode mode ATTRIBUTE_UNUSED)
{
return have_insn_for (DIV, mode) ? 3 : 2;
}
/* Generic hook that takes a CUMULATIVE_ARGS pointer and returns true. */
bool

View File

@ -33,6 +33,7 @@ extern bool default_pretend_outgoing_varargs_named (CUMULATIVE_ARGS *);
extern enum machine_mode default_eh_return_filter_mode (void);
extern unsigned HOST_WIDE_INT default_shift_truncation_mask
(enum machine_mode);
extern unsigned int default_min_divisions_for_recip_mul (enum machine_mode);
extern tree default_stack_protect_guard (void);
extern tree default_external_stack_protect_fail (void);

View File

@ -1,3 +1,16 @@
2006-01-11 Paolo Bonzini <bonzini@gnu.org>
PR tree-optimization/23109
PR tree-optimization/23948
PR tree-optimization/24123
* gcc.dg/tree-ssa/recip-3.c, gcc.dg/tree-ssa/recip-4.c,
gcc.dg/tree-ssa/recip-5.c, gcc.dg/tree-ssa/recip-6.c,
gcc.dg/tree-ssa/recip-7.c, gcc.dg/tree-ssa/pr23109.c,
g++.dg/tree-ssa/pr23948.C: New testcases.
* gcc.dg/tree-ssa/recip-2.c, gcc.dg/tree-ssa/pr23234.c: Provide
three divisions in order to do the optimization.
2005-01-11 Zdenek Dvorak <dvorakz@suse.cz>
PR c++/25632

View File

@ -0,0 +1,19 @@
/* { dg-options "-O1 -ffast-math -fdump-tree-recip" } */
/* { dg-do compile } */
struct MIOFILE {
~MIOFILE();
};
double potentially_runnable_resource_share();
void f1(double);
int make_scheduler_request(double a, double b)
{
MIOFILE mf;
double prrs = potentially_runnable_resource_share();
f1(a/prrs);
f1(1/prrs);
f1(b/prrs);
}
/* { dg-final { scan-tree-dump-times " / " 1 "recip" } } */
/* { dg-final { cleanup-tree-dump "recip" } } */

View File

@ -0,0 +1,34 @@
/* { dg-do compile } */
/* { dg-options "-O2 -funsafe-math-optimizations -fdump-tree-recip -fdump-tree-lim" } */
double F[2] = { 0., 0. }, e = 0.;
int main()
{
int i;
double E, W, P, d;
/* make sure the program crashes on FP exception */
unsigned short int Mask;
W = 1.;
d = 2.*e;
E = 1. - d;
for( i=0; i < 2; i++ )
if( d > 0.01 )
{
P = ( W < E ) ? (W - E)/d : (E - W)/d;
F[i] += P;
}
return 0;
}
/* LIM only performs the transformation in the no-trapping-math case. In
the future we will do it for trapping-math as well in recip, check that
this is not wrongly optimized. */
/* { dg-final { scan-tree-dump-not "reciptmp" "lim" } } */
/* { dg-final { scan-tree-dump-not "reciptmp" "recip" } } */
/* { dg-final { cleanup-tree-dump "recip" } } */

View File

@ -9,6 +9,7 @@ double
f1 (double a, double b, double c)
{
double y0;
double y1;
if (a == 0.0)
{
@ -16,7 +17,8 @@ f1 (double a, double b, double c)
return y0;
}
y0 = c / b;
return y0;
y1 = a / b;
return y0 * y1;
}
/* Labels may end up in the middle of a block. Also bad. */
@ -24,6 +26,7 @@ double
f2 (double a, double b, double c)
{
double y0;
double y1;
a_label:
another_label:
@ -33,7 +36,8 @@ another_label:
return y0;
}
y0 = c / b;
return y0;
y1 = a / b;
return y0 * y1;
}
/* Uses must still be dominated by their defs. */
@ -41,6 +45,7 @@ double
f3 (double a, double b, double c)
{
double y0;
double y1;
y0 = -c / b;
if (a == 0.0)
@ -48,5 +53,6 @@ f3 (double a, double b, double c)
return y0;
}
y0 = c / b;
return y0;
y1 = a / b;
return y0 * y1;
}

View File

@ -10,14 +10,19 @@ float e(float a, float b, float c, float d, float e, float f)
}
/* The PHI nodes for these divisions should be combined. */
d = d / a;
e = e / a;
f = f / a;
a = a / c;
b = b / c;
return a + b + e + f;
/* This should not be left as a multiplication. */
c = 1 / c;
return a + b + c + d + e + f;
}
/* { dg-final { scan-tree-dump-times " / " 2 "recip" } } */
/* { dg-final { scan-tree-dump-times " \\* " 5 "recip" } } */
/* { dg-final { cleanup-tree-dump "recip" } } */

View File

@ -0,0 +1,27 @@
/* { dg-do compile } */
/* { dg-options "-O1 -fno-trapping-math -funsafe-math-optimizations -fdump-tree-recip" } */
double F[2] = { 0.0, 0.0 }, e;
/* In this case the optimization is interesting. */
float h ()
{
int i;
double E, W, P, d;
W = 1.;
d = 2.*e;
E = 1. - d;
for( i=0; i < 2; i++ )
if( d > 0.01 )
{
P = ( W < E ) ? (W - E)/d : (E - W)/d;
F[i] += P;
}
F[0] += E / d;
}
/* { dg-final { scan-tree-dump-times " / " 1 "recip" } } */
/* { dg-final { cleanup-tree-dump "recip" } } */

View File

@ -0,0 +1,45 @@
/* { dg-do compile } */
/* { dg-options "-O1 -fno-trapping-math -funsafe-math-optimizations -fdump-tree-recip" } */
/* based on the test case in pr23109 */
double F[2] = { 0., 0. }, e = 0.;
/* Nope, we cannot prove the optimization is worthwhile in this case. */
void f ()
{
int i;
double E, W, P, d;
W = 1.;
d = 2.*e;
E = 1. - d;
if( d > 0.01 )
{
P = ( W < E ) ? (W - E)/d : (E - W)/d;
F[i] += P;
}
}
/* We also cannot prove the optimization is worthwhile in this case. */
float g ()
{
int i;
double E, W, P, d;
W = 1.;
d = 2.*e;
E = 1. - d;
if( d > 0.01 )
{
P = ( W < E ) ? (W - E)/d : (E - W)/d;
F[i] += P;
}
return 1.0 / d;
}
/* { dg-final { scan-tree-dump-not "reciptmp" "recip" } } */
/* { dg-final { cleanup-tree-dump "recip" } } */

View File

@ -0,0 +1,31 @@
/* { dg-options "-O1 -funsafe-math-optimizations -ftrapping-math -fdump-tree-recip -fdump-tree-optimized" } */
/* { dg-do compile } */
/* Test the reciprocal optimizations together with trapping math. */
extern int f2();
double f1(double y, double z, double w, double j, double k)
{
double b, c, d, e, f, g;
if (f2 ())
/* inserts one division here */
b = 1 / y, c = z / y, d = j / y;
else
/* one division here */
b = 3 / y, c = w / y, d = k / y;
/* and one here, that should be removed afterwards but is not right now */
e = b / y;
f = c / y;
g = d / y;
return e + f + g;
}
/* { dg-final { scan-tree-dump-times " / " 3 "recip" } } */
/* { dg-final { scan-tree-dump-times " / " 2 "optimized" { xfail *-*-* } } } */
/* { dg-final { cleanup-tree-dump "recip" } } */
/* { dg-final { cleanup-tree-dump "optimized" } } */

View File

@ -0,0 +1,26 @@
/* { dg-options "-O1 -funsafe-math-optimizations -fno-trapping-math -fdump-tree-recip" } */
/* { dg-do compile } */
/* Test inserting in a block that does not contain a division. */
/* g is declared here but defined nowhere in this file; f1 branches on
   its result.  Declaring it avoids an implicit function declaration,
   which is an error from C99 on.  */
extern int g();

double f1(double y, double z, double w)
{
  double b, c, d, e, f;

  if (g ())
    b = 1 / y, c = z / y;
  else
    b = 3 / y, c = w / y;

  d = b / y;
  e = c / y;
  f = 1 / y;

  return d + e + f;
}
/* { dg-final { scan-tree-dump-times " / " 1 "recip" } } */
/* { dg-final { cleanup-tree-dump "recip" } } */

View File

@ -0,0 +1,27 @@
/* { dg-options "-O1 -funsafe-math-optimizations -fno-trapping-math -fdump-tree-recip" } */
/* { dg-do compile } */
/* Test inserting in a block that does not contain a division. */
extern double h();
double f(int x, double z, double w)
{
double b, c, d, e, f;
double y = h ();
if (x)
b = 1 / y, c = z / y;
else
b = 3 / y, c = w / y;
d = b / y;
e = c / y;
f = 1 / y;
return d + e + f;
}
/* { dg-final { scan-tree-dump-times " / " 1 "recip" } } */
/* { dg-final { cleanup-tree-dump "recip" } } */

View File

@ -1389,6 +1389,30 @@ tree_merge_blocks (basic_block a, basic_block b)
}
/* If BB has exactly two successors and one of them is reached through
   a complex edge, return the destination of the other edge.  In every
   other case (a different number of successors, or no complex edge)
   return BB itself.  We use this in optimizations that use
   post-dominators for their heuristics, to catch the cases in C++ where
   function calls are involved.  */

basic_block
single_noncomplex_succ (basic_block bb)
{
  edge first, second;

  if (EDGE_COUNT (bb->succs) == 2)
    {
      first = EDGE_SUCC (bb, 0);
      second = EDGE_SUCC (bb, 1);

      /* The first edge is tested first, so when both edges are complex
         the destination of the second edge is returned.  */
      if (first->flags & EDGE_COMPLEX)
        return second->dest;
      if (second->flags & EDGE_COMPLEX)
        return first->dest;
    }

  return bb;
}
/* Walk the function tree removing unnecessary statements.
* Empty statement nodes are removed

View File

@ -487,6 +487,7 @@ extern bool is_ctrl_stmt (tree);
extern bool is_ctrl_altering_stmt (tree);
extern bool computed_goto_p (tree);
extern bool simple_goto_p (tree);
extern basic_block single_noncomplex_succ (basic_block bb);
extern void tree_dump_bb (basic_block, FILE *, int);
extern void debug_tree_bb (basic_block);
extern basic_block debug_tree_bb_n (int);

View File

@ -35,7 +35,55 @@ Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
z = z * rmodulus;
We do this for loop invariant divisors, and with this pass whenever
we notice that a division has the same divisor multiple times. */
we notice that a division has the same divisor multiple times.
Of course, like in PRE, we don't insert a division if a dominator
already has one. However, this cannot be done as an extension of
PRE for several reasons.
First of all, with some experiments it was found out that the
transformation is not always useful if there are only two divisions
by the same divisor. This is probably because modern processors
can pipeline the divisions; on older, in-order processors it should
still be effective to optimize two divisions by the same number.
We make this threshold target-dependent (see
TARGET_MIN_DIVISIONS_FOR_RECIP_MUL), and it shall be called N in the
remainder of this comment.
Second, if trapping math is active, we have less freedom on where
to insert divisions: we can only do so in basic blocks that already
contain one. (If divisions don't trap, instead, we can insert
divisions elsewhere, which will be in blocks that are common dominators
of those that have the division).
We really don't want to compute the reciprocal unless a division will
be found. To do this, we won't insert the division in a basic block
that has less than N divisions *post-dominating* it.
The algorithm constructs a subset of the dominator tree, holding the
blocks containing the divisions and the common dominators to them,
and walks it twice. The first walk is in post-order, and it annotates
each block with the number of divisions that post-dominate it: this
gives information on where divisions can be inserted profitably.
The second walk is in pre-order, and it inserts divisions as explained
above, and replaces divisions by multiplications.
In the best case, the cost of the pass is O(n_statements). In the
worst-case, the cost is due to creating the dominator tree subset,
with a cost of O(n_basic_blocks ^ 2); however this can only happen
for n_statements / n_basic_blocks statements. So, the amortized cost
of creating the dominator tree subset is O(n_basic_blocks) and the
worst-case cost of the pass is O(n_statements * n_basic_blocks).
More practically, the cost will be small because there are few
divisions, and they tend to be in the same basic block, so insert_bb
is called very few times.
If we did this using domwalk.c, an efficient implementation would have
to work on all the variables in a single pass, because we could not
work on just a subset of the dominator tree, as we do now, and the
cost would also be something like O(n_statements * n_basic_blocks).
The data structures would be more complex in order to work on all the
variables in a single pass. */
#include "config.h"
#include "system.h"
@ -47,6 +95,348 @@ Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
#include "real.h"
#include "timevar.h"
#include "tree-pass.h"
#include "alloc-pool.h"
#include "basic-block.h"
#include "target.h"
/* This structure represents one basic block that either computes a
   division, or is a common dominator for basic blocks that compute a
   division.  Instances are linked through CHILDREN and NEXT into a
   subset of the dominator tree.  */
struct occurrence {
/* The basic block represented by this structure. */
basic_block bb;
/* If non-NULL, the SSA_NAME holding the definition for a reciprocal
   that can be used in BB (either inserted in BB itself or inherited
   from a dominator). */
tree recip_def;
/* If non-NULL, the MODIFY_EXPR for a reciprocal computation that
was inserted in BB. */
tree recip_def_stmt;
/* Pointer to a list of "struct occurrence"s for blocks dominated
by BB. */
struct occurrence *children;
/* Pointer to the next "struct occurrence"s in the list of blocks
sharing a common dominator. */
struct occurrence *next;
/* The number of divisions that are in BB before compute_merit. The
number of divisions that are in BB or post-dominate it after
compute_merit. */
int num_divisions;
/* True if the basic block has a division, false if it is a common
dominator for basic blocks that do. If it is false and trapping
math is active, BB is not a candidate for inserting a reciprocal. */
bool bb_has_division;
};
/* The instance of "struct occurrence" representing the highest
interesting block in the dominator tree. */
static struct occurrence *occ_head;
/* Allocation pool for getting instances of "struct occurrence". */
static alloc_pool occ_pool;
/* Take a "struct occurrence" from OCC_POOL, clear it, and set it up to
   describe basic block BB with CHILDREN as the head of its children
   list.  The new structure is also recorded in BB->aux.  Return it.  */

static struct occurrence *
occ_new (basic_block bb, struct occurrence *children)
{
  struct occurrence *fresh = pool_alloc (occ_pool);

  memset (fresh, 0, sizeof (*fresh));
  fresh->children = children;
  fresh->bb = bb;

  /* Let the block find its occurrence again later on.  */
  bb->aux = fresh;
  return fresh;
}
/* Insert NEW_OCC into our subset of the dominator tree. P_HEAD points to a
   list of "struct occurrence"s, one per basic block, having IDOM as
   their common dominator.
   We try to insert NEW_OCC as deep as possible in the tree, and we also
   insert any other block that is a common dominator for NEW_OCC's block
   and one block already in the tree. */
static void
insert_bb (struct occurrence *new_occ, basic_block idom,
struct occurrence **p_head)
{
struct occurrence *occ, **p_occ;
/* P_OCC is only advanced explicitly in the last branch below.  The
   branches that unlink OCC store OCC->next into *P_OCC, so the next
   iteration already looks at the following element. */
for (p_occ = p_head; (occ = *p_occ) != NULL; )
{
basic_block bb = new_occ->bb, occ_bb = occ->bb;
basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
if (dom == bb)
{
/* BB dominates OCC_BB. OCC becomes NEW_OCC's child: remove OCC
from its list. */
*p_occ = occ->next;
occ->next = new_occ->children;
new_occ->children = occ;
/* Try the next block (it may as well be dominated by BB). */
}
else if (dom == occ_bb)
{
/* OCC_BB dominates BB. Tail recurse to look deeper. */
insert_bb (new_occ, dom, &occ->children);
return;
}
else if (dom != idom)
{
/* DOM must not have a "struct occurrence" yet, since one is
   created for it below. */
gcc_assert (!dom->aux);
/* There is a dominator between IDOM and BB, add it and make
two children out of NEW_OCC and OCC. First, remove OCC from
its list. */
*p_occ = occ->next;
new_occ->next = occ;
occ->next = NULL;
/* None of the previous blocks has DOM as a dominator: if we tail
recursed, we would reexamine them uselessly. Just switch BB with
DOM, and go on looking for blocks dominated by DOM. */
new_occ = occ_new (dom, new_occ);
}
else
{
/* Nothing special, go on with the next element. */
p_occ = &occ->next;
}
}
/* No deeper place was found.  Link NEW_OCC (which by now may be a newly
   created common dominator) at the head of the list P_HEAD points to,
   as a sibling of the elements left in that list. */
new_occ->next = *p_head;
*p_head = new_occ;
}
/* Record one more division in basic block BB.  If BB has no
   "struct occurrence" yet (BB->aux is NULL), create one and insert it
   into the tree headed by OCC_HEAD first.  */

static inline void
register_division_in (basic_block bb)
{
  struct occurrence *entry = (struct occurrence *) bb->aux;

  if (entry == NULL)
    {
      entry = occ_new (bb, NULL);
      insert_bb (entry, ENTRY_BLOCK_PTR, &occ_head);
    }

  entry->num_divisions++;
  entry->bb_has_division = true;
}
/* Compute the number of divisions that postdominate each block in OCC and
its children. */
static void
compute_merit (struct occurrence *occ)
{
struct occurrence *occ_child;
basic_block dom = occ->bb;
for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
{
basic_block bb;
if (occ_child->children)
compute_merit (occ_child);
if (flag_exceptions)
bb = single_noncomplex_succ (dom);
else
bb = dom;
if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
occ->num_divisions += occ_child->num_divisions;
}
}
/* Return true if USE_STMT is a MODIFY_EXPR whose right-hand side is an
   RDIV_EXPR with DEF as its divisor (second operand).  */

static inline bool
is_division_by (tree use_stmt, tree def)
{
  tree rhs;

  if (TREE_CODE (use_stmt) != MODIFY_EXPR)
    return false;

  rhs = TREE_OPERAND (use_stmt, 1);
  if (TREE_CODE (rhs) != RDIV_EXPR)
    return false;

  return TREE_OPERAND (rhs, 1) == def;
}
/* Walk the subset of the dominator tree rooted at OCC, setting the
   RECIP_DEF field to a definition of 1.0 / DEF that can be used in
   the given basic block. The field may be left NULL, of course,
   if it is not possible or profitable to do the optimization.
   DEF_BSI is an iterator pointing at the statement defining DEF; it
   may be NULL, in which case the reciprocal is never placed right
   after the definition.
   If RECIP_DEF is set, a dominator already has a computation that can
   be used.
   THRESHOLD is the minimum value of OCC->num_divisions for which a
   reciprocal is inserted in OCC's block. */
static void
insert_reciprocals (block_stmt_iterator *def_bsi, struct occurrence *occ,
tree def, tree recip_def, int threshold)
{
tree type, new_stmt;
block_stmt_iterator bsi;
struct occurrence *occ_child;
/* Insert a new reciprocal only if no dominator provides one, the block
   is an allowed insertion point (it has a division, or trapping math
   is off), and enough divisions are counted for it. */
if (!recip_def
&& (occ->bb_has_division || !flag_trapping_math)
&& occ->num_divisions >= threshold)
{
/* Make a variable with the replacement and substitute it. */
type = TREE_TYPE (def);
recip_def = make_rename_temp (type, "reciptmp");
new_stmt = build2 (MODIFY_EXPR, void_type_node, recip_def,
fold_build2 (RDIV_EXPR, type,
build_real (type, dconst1), def));
if (occ->bb_has_division)
{
/* Case 1: insert before an existing division.  Scan forward from
   the first statement after the labels to the first division by
   DEF. */
bsi = bsi_after_labels (occ->bb);
while (!bsi_end_p (bsi) && !is_division_by (bsi_stmt (bsi), def))
bsi_next (&bsi);
bsi_insert_before (&bsi, new_stmt, BSI_SAME_STMT);
}
else if (def_bsi && occ->bb == def_bsi->bb)
{
/* Case 2: insert right after the definition. Note that this will
never happen if the definition statement can throw, because in
that case the sole successor of the statement's basic block will
dominate all the uses as well. */
bsi_insert_after (def_bsi, new_stmt, BSI_NEW_STMT);
}
else
{
/* Case 3: insert in a basic block not containing defs/uses. */
bsi = bsi_after_labels (occ->bb);
bsi_insert_before (&bsi, new_stmt, BSI_SAME_STMT);
}
/* Remember the inserted statement so that replace_reciprocal can leave
   it alone. */
occ->recip_def_stmt = new_stmt;
}
/* Record the reciprocal usable in this block (possibly NULL) and pass
   it down to the dominated blocks. */
occ->recip_def = recip_def;
for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
insert_reciprocals (def_bsi, occ_child, def, recip_def, threshold);
}
/* USE_P is the use of a divisor in a division statement.  If the
   "struct occurrence" of the statement's basic block provides a
   reciprocal, and the statement is not the one computing that
   reciprocal, turn the division into a multiplication by it.  */

static inline void
replace_reciprocal (use_operand_p use_p)
{
  tree stmt = USE_STMT (use_p);
  struct occurrence *occ = (struct occurrence *) bb_for_stmt (stmt)->aux;

  /* Nothing to do without a usable reciprocal, and the statement that
     defines the reciprocal must keep its division.  */
  if (!occ->recip_def || stmt == occ->recip_def_stmt)
    return;

  TREE_SET_CODE (TREE_OPERAND (stmt, 1), MULT_EXPR);
  SET_USE (use_p, occ->recip_def);
  fold_stmt_inplace (stmt);
  update_stmt (stmt);
}
/* Clear OCC->bb->aux, give OCC back to OCC_POOL, and return another
   "struct occurrence" that still has to be freed (or NULL).  When OCC
   has no children, that is its next sibling.  Otherwise all following
   siblings are freed here and the first child is returned.  */

static struct occurrence *
free_bb (struct occurrence *occ)
{
  /* Read both links before OCC goes back to the pool.  */
  struct occurrence *sibling = occ->next;
  struct occurrence *first_child = occ->children;

  occ->bb->aux = NULL;
  pool_free (occ_pool, occ);

  /* Without children the caller can simply continue with the sibling,
     so no recursion is needed.  */
  if (first_child == NULL)
    return sibling;

  while (sibling != NULL)
    sibling = free_bb (sibling);

  return first_child;
}
/* Look for floating-point divisions among DEF's uses, and try to
   replace them by multiplications with the reciprocal. Add
   as many statements computing the reciprocal as needed.
   DEF must be a GIMPLE register of a floating-point type.
   DEF_BSI points at the statement defining DEF, or is NULL when no
   such iterator is available; it is only forwarded to
   insert_reciprocals. */
static void
execute_cse_reciprocals_1 (block_stmt_iterator *def_bsi, tree def)
{
use_operand_p use_p;
imm_use_iterator use_iter;
struct occurrence *occ;
int count = 0, threshold;
gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
/* First pass over the uses: count the divisions by DEF and register
   the basic block of each one in the tree headed by occ_head. */
FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
{
tree use_stmt = USE_STMT (use_p);
if (is_division_by (use_stmt, def))
{
register_division_in (bb_for_stmt (use_stmt));
count++;
}
}
/* Do the expensive part only if we can hope to optimize something.
   The threshold comes from the target hook for DEF's machine mode. */
threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
if (count >= threshold)
{
/* For every top-level occurrence, accumulate the division counts and
   then insert reciprocals where the counts reach the threshold. */
for (occ = occ_head; occ; occ = occ->next)
{
compute_merit (occ);
insert_reciprocals (def_bsi, occ, def, NULL, threshold);
}
/* Second pass over the uses: rewrite the divisions whose block has a
   reciprocal available. */
FOR_EACH_IMM_USE_SAFE (use_p, use_iter, def)
{
tree use_stmt = USE_STMT (use_p);
if (is_division_by (use_stmt, def))
replace_reciprocal (use_p);
}
}
/* Release every "struct occurrence" (free_bb also clears the aux field
   of the corresponding block) and reset the tree for the next DEF. */
for (occ = occ_head; occ; )
occ = free_bb (occ);
occ_head = NULL;
}
static bool
gate_cse_reciprocals (void)
@ -54,135 +444,58 @@ gate_cse_reciprocals (void)
return optimize && !optimize_size && flag_unsafe_math_optimizations;
}
/* Where to put the statement computing a reciprocal. */
enum place_reciprocal
{
PR_BEFORE_BSI, /* Put it using bsi_insert_before. */
PR_AFTER_BSI, /* Put it using bsi_insert_after. */
PR_ON_ENTRY_EDGE /* Put it on the edge between the entry
and the first basic block. */
};
/* Check if DEF's uses include more than one floating-point division,
and if so replace them by multiplications with the reciprocal. Add
the statement computing the reciprocal according to WHERE.
Does not check the type of DEF, nor that DEF is a GIMPLE register.
This is done in the caller for speed, because otherwise this routine
would be called for every definition and phi node. */
static void
execute_cse_reciprocals_1 (block_stmt_iterator *bsi, tree def,
enum place_reciprocal where)
{
use_operand_p use_p;
imm_use_iterator use_iter;
tree t, new_stmt, type;
int count = 0;
bool ok = !flag_trapping_math;
/* Find uses. */
FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
{
tree use_stmt = USE_STMT (use_p);
if (TREE_CODE (use_stmt) == MODIFY_EXPR
&& TREE_CODE (TREE_OPERAND (use_stmt, 1)) == RDIV_EXPR
&& TREE_OPERAND (TREE_OPERAND (use_stmt, 1), 1) == def)
{
++count;
/* Check if this use post-dominates the insertion point. */
if (ok || dominated_by_p (CDI_POST_DOMINATORS, bsi->bb,
bb_for_stmt (use_stmt)))
ok = true;
}
if (count >= 2 && ok)
break;
}
if (count < 2 || !ok)
return;
/* Make a variable with the replacement and substitute it. */
type = TREE_TYPE (def);
t = make_rename_temp (type, "reciptmp");
new_stmt = build2 (MODIFY_EXPR, void_type_node, t,
fold_build2 (RDIV_EXPR, type, build_real (type, dconst1),
def));
if (where == PR_BEFORE_BSI)
bsi_insert_before (bsi, new_stmt, BSI_SAME_STMT);
else if (where == PR_AFTER_BSI)
bsi_insert_after (bsi, new_stmt, BSI_NEW_STMT);
else if (where == PR_ON_ENTRY_EDGE)
bsi_insert_on_edge (single_succ_edge (ENTRY_BLOCK_PTR), new_stmt);
else
gcc_unreachable ();
FOR_EACH_IMM_USE_SAFE (use_p, use_iter, def)
{
tree use_stmt = USE_STMT (use_p);
if (use_stmt != new_stmt
&& TREE_CODE (use_stmt) == MODIFY_EXPR
&& TREE_CODE (TREE_OPERAND (use_stmt, 1)) == RDIV_EXPR
&& TREE_OPERAND (TREE_OPERAND (use_stmt, 1), 1) == def)
{
TREE_SET_CODE (TREE_OPERAND (use_stmt, 1), MULT_EXPR);
SET_USE (use_p, t);
}
}
}
/* Go through all the floating-point SSA_NAMEs, and call
execute_cse_reciprocals_1 on each of them. */
static void
execute_cse_reciprocals (void)
{
basic_block bb;
tree arg;
if (flag_trapping_math)
calculate_dominance_info (CDI_POST_DOMINATORS);
occ_pool = create_alloc_pool ("dominators for recip",
sizeof (struct occurrence),
n_basic_blocks / 3 + 1);
if (single_succ_p (ENTRY_BLOCK_PTR))
for (arg = DECL_ARGUMENTS (cfun->decl); arg; arg = TREE_CHAIN (arg))
if (default_def (arg))
{
block_stmt_iterator bsi;
bsi = bsi_start (single_succ (ENTRY_BLOCK_PTR));
execute_cse_reciprocals_1 (&bsi, default_def (arg),
PR_ON_ENTRY_EDGE);
}
calculate_dominance_info (CDI_DOMINATORS | CDI_POST_DOMINATORS);
#ifdef ENABLE_CHECKING
FOR_EACH_BB (bb)
gcc_assert (!bb->aux);
#endif
for (arg = DECL_ARGUMENTS (cfun->decl); arg; arg = TREE_CHAIN (arg))
if (default_def (arg)
&& FLOAT_TYPE_P (TREE_TYPE (arg))
&& is_gimple_reg (arg))
execute_cse_reciprocals_1 (NULL, default_def (arg));
FOR_EACH_BB (bb)
{
block_stmt_iterator bsi;
tree phi, def;
for (bsi = bsi_start (bb);
!bsi_end_p (bsi) && TREE_CODE (bsi_stmt (bsi)) == LABEL_EXPR;
bsi_next (&bsi))
;
for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
{
def = PHI_RESULT (phi);
if (FLOAT_TYPE_P (TREE_TYPE (def))
&& is_gimple_reg (def))
execute_cse_reciprocals_1 (&bsi, def, PR_BEFORE_BSI);
execute_cse_reciprocals_1 (NULL, def);
}
for (; !bsi_end_p (bsi); bsi_next (&bsi))
for (bsi = bsi_after_labels (bb); !bsi_end_p (bsi); bsi_next (&bsi))
{
tree stmt = bsi_stmt (bsi);
if (TREE_CODE (stmt) == MODIFY_EXPR
&& (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
&& FLOAT_TYPE_P (TREE_TYPE (def))
&& TREE_CODE (def) == SSA_NAME)
execute_cse_reciprocals_1 (&bsi, def, PR_AFTER_BSI);
execute_cse_reciprocals_1 (&bsi, def);
}
}
if (flag_trapping_math)
free_dominance_info (CDI_POST_DOMINATORS);
if (single_succ_p (ENTRY_BLOCK_PTR))
bsi_commit_one_edge_insert (single_succ_edge (ENTRY_BLOCK_PTR), NULL);
free_dominance_info (CDI_DOMINATORS | CDI_POST_DOMINATORS);
free_alloc_pool (occ_pool);
}
struct tree_opt_pass pass_cse_reciprocals =