re PR tree-optimization/50596 (Problems in vectorization of condition expression)

PR tree-optimization/50596
	* tree-vectorizer.h (NUM_PATTERNS): Increase to 7.
	* tree-vect-patterns.c (vect_vect_recog_func_ptrs): Add
	vect_recog_bool_pattern.
	(check_bool_pattern, adjust_bool_pattern_cast,
	adjust_bool_pattern, vect_recog_bool_pattern): New functions.

	* gcc.dg/vect/vect-cond-9.c: New test.

From-SVN: r180057
Jakub Jelinek 2011-10-16 15:10:20 +02:00
parent ea10ca9c75
commit 71c92d1778
5 changed files with 604 additions and 7 deletions
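In brief, the new vect_recog_bool_pattern teaches the vectorizer to handle loops that combine comparison results with bool '&'/'|' and store the widened result into an integer array, rewriting each bool chain into COND_EXPR pattern statements. A minimal sketch of the kind of loop this enables, adapted from the new test below (names and sizes as in the test):

    #define N 1024
    float a[N], b[N], c[N], d[N];
    int j[N];

    void
    f (void)
    {
      int i;
      for (i = 0; i < N; ++i)
        /* bool & bool, widened to int on the store; recognized by the
           new pattern and vectorized using vector compares.  */
        j[i] = (a[i] < b[i]) & (c[i] < d[i]);
    }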

gcc/ChangeLog
@@ -1,4 +1,15 @@
2011-10-16  Jakub Jelinek  <jakub@redhat.com>

	PR tree-optimization/50596
	* tree-vectorizer.h (NUM_PATTERNS): Increase to 7.
	* tree-vect-patterns.c (vect_vect_recog_func_ptrs): Add
	vect_recog_bool_pattern.
	(check_bool_pattern, adjust_bool_pattern_cast,
	adjust_bool_pattern, vect_recog_bool_pattern): New functions.

	* gcc.dg/vect/vect-cond-9.c: New test.

2011-10-16  Ira Rosen  <ira.rosen@linaro.org>

	* tree-vect-stmts.c (vectorizable_load): For SLP without permutation
	treat the first load of the node as the first element in its
gcc/testsuite/ChangeLog
@@ -1,4 +1,9 @@
2011-10-16  Jakub Jelinek  <jakub@redhat.com>

	PR tree-optimization/50596
	* gcc.dg/vect/vect-cond-9.c: New test.

2011-10-16  Ira Rosen  <ira.rosen@linaro.org>

	* gcc.dg/vect/bb-slp-29.c: New test.

gcc/testsuite/gcc.dg/vect/vect-cond-9.c (new file)
@@ -0,0 +1,200 @@
/* { dg-require-effective-target vect_cond_mixed } */

#include "tree-vect.h"

#define N 1024
float a[N], b[N], c[N], d[N];
int j[N];
unsigned char k[N];

__attribute__((noinline, noclone)) void
f1 (void)
{
  int i;
  for (i = 0; i < N; ++i)
    {
      unsigned int x = a[i] < b[i] ? -1 : 0;
      unsigned int y = c[i] < d[i] ? -1 : 0;
      j[i] = (x & y) >> 31;
    }
}

__attribute__((noinline, noclone)) void
f2 (void)
{
  int i;
  for (i = 0; i < N; ++i)
    {
      int x = a[i] < b[i];
      int y = c[i] < d[i];
      j[i] = x & y;
    }
}

__attribute__((noinline, noclone)) void
f3 (void)
{
  int i;
  for (i = 0; i < N; ++i)
    j[i] = (a[i] < b[i]) & (c[i] < d[i]);
}

__attribute__((noinline, noclone)) void
f4 (void)
{
  int i;
  for (i = 0; i < N; ++i)
    {
      int x = a[i] < b[i];
      int y = c[i] < d[i];
      k[i] = x & y;
    }
}

__attribute__((noinline, noclone)) void
f5 (void)
{
  int i;
  for (i = 0; i < N; ++i)
    k[i] = (a[i] < b[i]) & (c[i] < d[i]);
}

__attribute__((noinline, noclone)) void
f6 (void)
{
  int i;
  for (i = 0; i < N; ++i)
    {
      unsigned int x = a[i] < b[i] ? -1 : 0;
      unsigned int y = c[i] < d[i] ? -1 : 0;
      j[i] = (x | y) >> 31;
    }
}

__attribute__((noinline, noclone)) void
f7 (void)
{
  int i;
  for (i = 0; i < N; ++i)
    {
      int x = a[i] < b[i];
      int y = c[i] < d[i];
      j[i] = x | y;
    }
}

__attribute__((noinline, noclone)) void
f8 (void)
{
  int i;
  for (i = 0; i < N; ++i)
    j[i] = (a[i] < b[i]) | (c[i] < d[i]);
}

__attribute__((noinline, noclone)) void
f9 (void)
{
  int i;
  for (i = 0; i < N; ++i)
    {
      int x = a[i] < b[i];
      int y = c[i] < d[i];
      k[i] = x | y;
    }
}

__attribute__((noinline, noclone)) void
f10 (void)
{
  int i;
  for (i = 0; i < N; ++i)
    k[i] = (a[i] < b[i]) | (c[i] < d[i]);
}

int
main ()
{
  int i;

  check_vect ();

  for (i = 0; i < N; i++)
    {
      switch (i % 9)
        {
        case 0: asm (""); a[i] = - i - 1; b[i] = i + 1; break;
        case 1: a[i] = 0; b[i] = 0; break;
        case 2: a[i] = i + 1; b[i] = - i - 1; break;
        case 3: a[i] = i; b[i] = i + 7; break;
        case 4: a[i] = i; b[i] = i; break;
        case 5: a[i] = i + 16; b[i] = i + 3; break;
        case 6: a[i] = - i - 5; b[i] = - i; break;
        case 7: a[i] = - i; b[i] = - i; break;
        case 8: a[i] = - i; b[i] = - i - 7; break;
        }
    }
  for (i = 0; i < N; i++)
    {
      switch ((i / 9) % 3)
        {
        case 0: c[i] = a[i / 9]; d[i] = b[i / 9]; break;
        case 1: c[i] = a[i / 9 + 3]; d[i] = b[i / 9 + 3]; break;
        case 2: c[i] = a[i / 9 + 6]; d[i] = b[i / 9 + 6]; break;
        }
    }
  f1 ();
  for (i = 0; i < N; i++)
    if (j[i] != ((i % 3) == 0 && ((i / 9) % 3) == 0))
      abort ();
  __builtin_memset (j, -6, sizeof (j));
  f2 ();
  for (i = 0; i < N; i++)
    if (j[i] != ((i % 3) == 0 && ((i / 9) % 3) == 0))
      abort ();
  __builtin_memset (j, -6, sizeof (j));
  f3 ();
  for (i = 0; i < N; i++)
    if (j[i] != ((i % 3) == 0 && ((i / 9) % 3) == 0))
      abort ();
  __builtin_memset (j, -6, sizeof (j));
  f4 ();
  for (i = 0; i < N; i++)
    if (k[i] != ((i % 3) == 0 && ((i / 9) % 3) == 0))
      abort ();
  __builtin_memset (k, -6, sizeof (k));
  f5 ();
  for (i = 0; i < N; i++)
    if (k[i] != ((i % 3) == 0 && ((i / 9) % 3) == 0))
      abort ();
  __builtin_memset (k, -6, sizeof (k));
  f6 ();
  for (i = 0; i < N; i++)
    if (j[i] != ((i % 3) == 0 || ((i / 9) % 3) == 0))
      abort ();
  __builtin_memset (j, -6, sizeof (j));
  f7 ();
  for (i = 0; i < N; i++)
    if (j[i] != ((i % 3) == 0 || ((i / 9) % 3) == 0))
      abort ();
  __builtin_memset (j, -6, sizeof (j));
  f8 ();
  for (i = 0; i < N; i++)
    if (j[i] != ((i % 3) == 0 || ((i / 9) % 3) == 0))
      abort ();
  __builtin_memset (j, -6, sizeof (j));
  f9 ();
  for (i = 0; i < N; i++)
    if (k[i] != ((i % 3) == 0 || ((i / 9) % 3) == 0))
      abort ();
  __builtin_memset (k, -6, sizeof (k));
  f10 ();
  for (i = 0; i < N; i++)
    if (k[i] != ((i % 3) == 0 || ((i / 9) % 3) == 0))
      abort ();
  __builtin_memset (k, -6, sizeof (k));
  return 0;
}

/* { dg-final { scan-tree-dump-times "note: vectorized 1 loops" 10 "vect" } } */
/* { dg-final { cleanup-tree-dump "vect" } } */
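The BIT_AND_EXPR case of adjust_bool_pattern (in the tree-vect-patterns.c hunk below) saves a statement by feeding one COND_EXPR's result into the other instead of emitting a separate AND. A rough source-level analogue of what that rewrite computes for an f3-style loop body, reusing the arrays above (illustrative sketch only; the real transform builds GIMPLE pattern statements):

    int
    f3_pattern_view (int i)
    {
      int t = a[i] < b[i] ? 1 : 0;   /* S1': first compare as a 0/1 integer.  */
      return c[i] < d[i] ? t : 0;    /* S3': second compare selects t; no AND.  */
    }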

gcc/tree-vect-patterns.c
@@ -51,13 +51,15 @@ static gimple vect_recog_over_widening_pattern (VEC (gimple, heap) **, tree *,
						  tree *);
static gimple vect_recog_mixed_size_cond_pattern (VEC (gimple, heap) **,
						  tree *, tree *);
static gimple vect_recog_bool_pattern (VEC (gimple, heap) **, tree *, tree *);

static vect_recog_func_ptr vect_vect_recog_func_ptrs[NUM_PATTERNS] = {
	vect_recog_widen_mult_pattern,
	vect_recog_widen_sum_pattern,
	vect_recog_dot_prod_pattern,
	vect_recog_pow_pattern,
	vect_recog_over_widening_pattern,
	vect_recog_mixed_size_cond_pattern,
	vect_recog_bool_pattern};
/* Function widened_name_p
@@ -1068,10 +1070,8 @@ vect_operation_fits_smaller_type (gimple stmt, tree def, tree *new_type,
   constants.
   Check if S3 and S4 can be done on a smaller type than 'TYPE', it can either
   be 'type' or some intermediate type.  For now, we expect S5 to be a type
   demotion operation.  We also check that S3 and S4 have only one use.  */

static gimple
vect_recog_over_widening_pattern (VEC (gimple, heap) **stmts,
                                  tree *type_in, tree *type_out)
@@ -1333,6 +1333,387 @@ vect_recog_mixed_size_cond_pattern (VEC (gimple, heap) **stmts, tree *type_in,
}
/* Helper function of vect_recog_bool_pattern.  Called recursively, return
   true if bool VAR can be optimized that way.  */

static bool
check_bool_pattern (tree var, loop_vec_info loop_vinfo)
{
  gimple def_stmt;
  enum vect_def_type dt;
  tree def, rhs1;
  enum tree_code rhs_code;

  if (!vect_is_simple_use (var, loop_vinfo, NULL, &def_stmt, &def, &dt))
    return false;

  if (dt != vect_internal_def)
    return false;

  if (!is_gimple_assign (def_stmt))
    return false;

  if (!has_single_use (def))
    return false;

  rhs1 = gimple_assign_rhs1 (def_stmt);
  rhs_code = gimple_assign_rhs_code (def_stmt);
  switch (rhs_code)
    {
    case SSA_NAME:
      return check_bool_pattern (rhs1, loop_vinfo);

    CASE_CONVERT:
      if ((TYPE_PRECISION (TREE_TYPE (rhs1)) != 1
	   || !TYPE_UNSIGNED (TREE_TYPE (rhs1)))
	  && TREE_CODE (TREE_TYPE (rhs1)) != BOOLEAN_TYPE)
	return false;
      return check_bool_pattern (rhs1, loop_vinfo);

    case BIT_NOT_EXPR:
      return check_bool_pattern (rhs1, loop_vinfo);

    case BIT_AND_EXPR:
    case BIT_IOR_EXPR:
    case BIT_XOR_EXPR:
      if (!check_bool_pattern (rhs1, loop_vinfo))
	return false;
      return check_bool_pattern (gimple_assign_rhs2 (def_stmt), loop_vinfo);

    default:
      if (TREE_CODE_CLASS (rhs_code) == tcc_comparison)
	{
	  tree vecitype, comp_vectype;

	  comp_vectype = get_vectype_for_scalar_type (TREE_TYPE (rhs1));
	  if (comp_vectype == NULL_TREE)
	    return false;

	  if (TREE_CODE (TREE_TYPE (rhs1)) != INTEGER_TYPE)
	    {
	      enum machine_mode mode = TYPE_MODE (TREE_TYPE (rhs1));
	      tree itype
		= build_nonstandard_integer_type (GET_MODE_BITSIZE (mode), 0);
	      vecitype = get_vectype_for_scalar_type (itype);
	      if (vecitype == NULL_TREE)
		return false;
	    }
	  else
	    vecitype = comp_vectype;
	  return expand_vec_cond_expr_p (vecitype, comp_vectype);
	}
      return false;
    }
}
/* Helper function of adjust_bool_pattern.  Add a cast to TYPE to a previous
   stmt (SSA_NAME_DEF_STMT of VAR) by moving the COND_EXPR from RELATED_STMT
   to PATTERN_DEF_STMT and adding a cast as RELATED_STMT.  */

static tree
adjust_bool_pattern_cast (tree type, tree var)
{
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (SSA_NAME_DEF_STMT (var));
  gimple cast_stmt, pattern_stmt;

  gcc_assert (!STMT_VINFO_PATTERN_DEF_STMT (stmt_vinfo));
  pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_vinfo);
  STMT_VINFO_PATTERN_DEF_STMT (stmt_vinfo) = pattern_stmt;
  cast_stmt
    = gimple_build_assign_with_ops (NOP_EXPR,
				    vect_recog_temp_ssa_var (type, NULL),
				    gimple_assign_lhs (pattern_stmt),
				    NULL_TREE);
  STMT_VINFO_RELATED_STMT (stmt_vinfo) = cast_stmt;
  return gimple_assign_lhs (cast_stmt);
}
/* Helper function of vect_recog_bool_pattern.  Do the actual transformations,
   recursively.  VAR is an SSA_NAME that should be transformed from bool
   to a wider integer type, OUT_TYPE is the desired final integer type of
   the whole pattern, TRUEVAL should be NULL unless optimizing
   BIT_AND_EXPR into a COND_EXPR with one integer from one of the operands
   in the then_clause, STMTS is where statements with added pattern stmts
   should be pushed to.  */

static tree
adjust_bool_pattern (tree var, tree out_type, tree trueval,
		     VEC (gimple, heap) **stmts)
{
  gimple stmt = SSA_NAME_DEF_STMT (var);
  enum tree_code rhs_code, def_rhs_code;
  tree itype, cond_expr, rhs1, rhs2, irhs1, irhs2;
  location_t loc;
  gimple pattern_stmt, def_stmt;

  rhs1 = gimple_assign_rhs1 (stmt);
  rhs2 = gimple_assign_rhs2 (stmt);
  rhs_code = gimple_assign_rhs_code (stmt);
  loc = gimple_location (stmt);
  switch (rhs_code)
    {
    case SSA_NAME:
    CASE_CONVERT:
      irhs1 = adjust_bool_pattern (rhs1, out_type, NULL_TREE, stmts);
      itype = TREE_TYPE (irhs1);
      pattern_stmt
	= gimple_build_assign_with_ops (SSA_NAME,
					vect_recog_temp_ssa_var (itype, NULL),
					irhs1, NULL_TREE);
      break;

    case BIT_NOT_EXPR:
      irhs1 = adjust_bool_pattern (rhs1, out_type, NULL_TREE, stmts);
      itype = TREE_TYPE (irhs1);
      pattern_stmt
	= gimple_build_assign_with_ops (BIT_XOR_EXPR,
					vect_recog_temp_ssa_var (itype, NULL),
					irhs1, build_int_cst (itype, 1));
      break;

    case BIT_AND_EXPR:
      /* Try to optimize x = y & (a < b ? 1 : 0); into
	 x = (a < b ? y : 0);

	 E.g. for:
	   bool a_b, b_b, c_b;
	   TYPE d_T;

	   S1  a_b = x1 CMP1 y1;
	   S2  b_b = x2 CMP2 y2;
	   S3  c_b = a_b & b_b;
	   S4  d_T = (TYPE) c_b;

	 we would normally emit:

	   S1'  a_T = x1 CMP1 y1 ? 1 : 0;
	   S2'  b_T = x2 CMP2 y2 ? 1 : 0;
	   S3'  c_T = a_T & b_T;
	   S4'  d_T = c_T;

	 but we can save one stmt by using the
	 result of one of the COND_EXPRs in the other COND_EXPR and leave
	 the BIT_AND_EXPR stmt out:

	   S1'  a_T = x1 CMP1 y1 ? 1 : 0;
	   S3'  c_T = x2 CMP2 y2 ? a_T : 0;
	   S4'  d_T = c_T;

	 At least when VEC_COND_EXPR is implemented using masks
	 cond ? 1 : 0 is as expensive as cond ? var : 0; in both cases it
	 computes the comparison mask and ANDs it, in one case with an
	 all-ones vector, in the other case with a vector register.
	 Don't do this for BIT_IOR_EXPR, because cond ? 1 : var; is
	 often more expensive.  */
      def_stmt = SSA_NAME_DEF_STMT (rhs2);
      def_rhs_code = gimple_assign_rhs_code (def_stmt);
      if (TREE_CODE_CLASS (def_rhs_code) == tcc_comparison)
	{
	  tree def_rhs1 = gimple_assign_rhs1 (def_stmt);
	  irhs1 = adjust_bool_pattern (rhs1, out_type, NULL_TREE, stmts);
	  if (TYPE_PRECISION (TREE_TYPE (irhs1))
	      == GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (def_rhs1))))
	    {
	      gimple tstmt;
	      stmt_vec_info stmt_def_vinfo = vinfo_for_stmt (def_stmt);
	      irhs2 = adjust_bool_pattern (rhs2, out_type, irhs1, stmts);
	      tstmt = VEC_pop (gimple, *stmts);
	      gcc_assert (tstmt == def_stmt);
	      VEC_quick_push (gimple, *stmts, stmt);
	      STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt))
		= STMT_VINFO_RELATED_STMT (stmt_def_vinfo);
	      gcc_assert (!STMT_VINFO_PATTERN_DEF_STMT (stmt_def_vinfo));
	      STMT_VINFO_RELATED_STMT (stmt_def_vinfo) = NULL;
	      return irhs2;
	    }
	  else
	    irhs2 = adjust_bool_pattern (rhs2, out_type, NULL_TREE, stmts);
	  goto and_ior_xor;
	}
      def_stmt = SSA_NAME_DEF_STMT (rhs1);
      def_rhs_code = gimple_assign_rhs_code (def_stmt);
      if (TREE_CODE_CLASS (def_rhs_code) == tcc_comparison)
	{
	  tree def_rhs1 = gimple_assign_rhs1 (def_stmt);
	  irhs2 = adjust_bool_pattern (rhs2, out_type, NULL_TREE, stmts);
	  if (TYPE_PRECISION (TREE_TYPE (irhs2))
	      == GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (def_rhs1))))
	    {
	      gimple tstmt;
	      stmt_vec_info stmt_def_vinfo = vinfo_for_stmt (def_stmt);
	      irhs1 = adjust_bool_pattern (rhs1, out_type, irhs2, stmts);
	      tstmt = VEC_pop (gimple, *stmts);
	      gcc_assert (tstmt == def_stmt);
	      VEC_quick_push (gimple, *stmts, stmt);
	      STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt))
		= STMT_VINFO_RELATED_STMT (stmt_def_vinfo);
	      gcc_assert (!STMT_VINFO_PATTERN_DEF_STMT (stmt_def_vinfo));
	      STMT_VINFO_RELATED_STMT (stmt_def_vinfo) = NULL;
	      return irhs1;
	    }
	  else
	    irhs1 = adjust_bool_pattern (rhs1, out_type, NULL_TREE, stmts);
	  goto and_ior_xor;
	}
      /* FALLTHRU */
    case BIT_IOR_EXPR:
    case BIT_XOR_EXPR:
      irhs1 = adjust_bool_pattern (rhs1, out_type, NULL_TREE, stmts);
      irhs2 = adjust_bool_pattern (rhs2, out_type, NULL_TREE, stmts);
    and_ior_xor:
      if (TYPE_PRECISION (TREE_TYPE (irhs1))
	  != TYPE_PRECISION (TREE_TYPE (irhs2)))
	{
	  int prec1 = TYPE_PRECISION (TREE_TYPE (irhs1));
	  int prec2 = TYPE_PRECISION (TREE_TYPE (irhs2));
	  int out_prec = TYPE_PRECISION (out_type);
	  if (absu_hwi (out_prec - prec1) < absu_hwi (out_prec - prec2))
	    irhs2 = adjust_bool_pattern_cast (TREE_TYPE (irhs1), rhs2);
	  else if (absu_hwi (out_prec - prec1) > absu_hwi (out_prec - prec2))
	    irhs1 = adjust_bool_pattern_cast (TREE_TYPE (irhs2), rhs1);
	  else
	    {
	      irhs1 = adjust_bool_pattern_cast (out_type, rhs1);
	      irhs2 = adjust_bool_pattern_cast (out_type, rhs2);
	    }
	}
      itype = TREE_TYPE (irhs1);
      pattern_stmt
	= gimple_build_assign_with_ops (rhs_code,
					vect_recog_temp_ssa_var (itype, NULL),
					irhs1, irhs2);
      break;

    default:
      gcc_assert (TREE_CODE_CLASS (rhs_code) == tcc_comparison);
      if (TREE_CODE (TREE_TYPE (rhs1)) != INTEGER_TYPE
	  || TYPE_UNSIGNED (TREE_TYPE (rhs1)))
	{
	  enum machine_mode mode = TYPE_MODE (TREE_TYPE (rhs1));
	  itype
	    = build_nonstandard_integer_type (GET_MODE_BITSIZE (mode), 0);
	}
      else
	itype = TREE_TYPE (rhs1);
      cond_expr = build2_loc (loc, rhs_code, itype, rhs1, rhs2);
      if (trueval == NULL_TREE)
	trueval = build_int_cst (itype, 1);
      else
	gcc_checking_assert (useless_type_conversion_p (itype,
							TREE_TYPE (trueval)));
      pattern_stmt
	= gimple_build_assign_with_ops3 (COND_EXPR,
					 vect_recog_temp_ssa_var (itype, NULL),
					 cond_expr, trueval,
					 build_int_cst (itype, 0));
      break;
    }

  VEC_safe_push (gimple, heap, *stmts, stmt);
  gimple_set_location (pattern_stmt, loc);
  STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt)) = pattern_stmt;
  return gimple_assign_lhs (pattern_stmt);
}
/* Function vect_recog_bool_pattern

   Try to find a pattern like the following:

     bool a_b, b_b, c_b, d_b, e_b;
     TYPE f_T;
   loop:
     S1  a_b = x1 CMP1 y1;
     S2  b_b = x2 CMP2 y2;
     S3  c_b = a_b & b_b;
     S4  d_b = x3 CMP3 y3;
     S5  e_b = c_b | d_b;
     S6  f_T = (TYPE) e_b;

   where type 'TYPE' is an integral type.

   Input:

   * LAST_STMT: A stmt at the end from which the pattern
		search begins, i.e. cast of a bool to
		an integer type.

   Output:

   * TYPE_IN: The type of the input arguments to the pattern.

   * TYPE_OUT: The type of the output of this pattern.

   * Return value: A new stmt that will be used to replace the pattern.

   Assuming the size of TYPE is the same as the size of all comparisons
   (otherwise some casts would be added where needed), for the above
   sequence we create related pattern stmts:

     S1'  a_T = x1 CMP1 y1 ? 1 : 0;
     S3'  c_T = x2 CMP2 y2 ? a_T : 0;
     S4'  d_T = x3 CMP3 y3 ? 1 : 0;
     S5'  e_T = c_T | d_T;
     S6'  f_T = e_T;

   Instead of the above S3' we could emit:

     S2'  b_T = x2 CMP2 y2 ? 1 : 0;
     S3'  c_T = a_T | b_T;

   but the above is more efficient.  */

static gimple
vect_recog_bool_pattern (VEC (gimple, heap) **stmts, tree *type_in,
			 tree *type_out)
{
  gimple last_stmt = VEC_pop (gimple, *stmts);
  enum tree_code rhs_code;
  tree var, lhs, rhs, vectype;
  stmt_vec_info stmt_vinfo = vinfo_for_stmt (last_stmt);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  gimple pattern_stmt;

  if (!is_gimple_assign (last_stmt))
    return NULL;

  var = gimple_assign_rhs1 (last_stmt);
  lhs = gimple_assign_lhs (last_stmt);

  if ((TYPE_PRECISION (TREE_TYPE (var)) != 1
       || !TYPE_UNSIGNED (TREE_TYPE (var)))
      && TREE_CODE (TREE_TYPE (var)) != BOOLEAN_TYPE)
    return NULL;

  rhs_code = gimple_assign_rhs_code (last_stmt);
  if (CONVERT_EXPR_CODE_P (rhs_code))
    {
      if (TREE_CODE (TREE_TYPE (lhs)) != INTEGER_TYPE)
	return NULL;
      vectype = get_vectype_for_scalar_type (TREE_TYPE (lhs));
      if (vectype == NULL_TREE)
	return NULL;

      if (!check_bool_pattern (var, loop_vinfo))
	return NULL;

      rhs = adjust_bool_pattern (var, TREE_TYPE (lhs), NULL_TREE, stmts);
      lhs = vect_recog_temp_ssa_var (TREE_TYPE (lhs), NULL);
      if (useless_type_conversion_p (TREE_TYPE (lhs), TREE_TYPE (rhs)))
	pattern_stmt
	  = gimple_build_assign_with_ops (SSA_NAME, lhs, rhs, NULL_TREE);
      else
	pattern_stmt
	  = gimple_build_assign_with_ops (NOP_EXPR, lhs, rhs, NULL_TREE);
      *type_out = vectype;
      *type_in = vectype;
      VEC_safe_push (gimple, heap, *stmts, last_stmt);
      return pattern_stmt;
    }
  else
    return NULL;
}
/* Mark statements that are involved in a pattern. */
static inline void
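Besides the '&' and '|' chains exercised by vect-cond-9.c, check_bool_pattern above also walks through BIT_NOT_EXPR and BIT_XOR_EXPR, so those bool chains qualify as well. A small hedged example in the test's style, reusing its arrays (not part of the commit's testcase):

    void
    fxor (void)
    {
      int i;
      for (i = 0; i < N; ++i)
        /* XOR of two comparison results, widened to int; covered by the
           BIT_XOR_EXPR cases in check_bool_pattern/adjust_bool_pattern.  */
        j[i] = (a[i] < b[i]) ^ (c[i] < d[i]);
    }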

View File

@ -902,7 +902,7 @@ extern void vect_slp_transform_bb (basic_block);
   Additional pattern recognition functions can (and will) be added
   in the future.  */
typedef gimple (* vect_recog_func_ptr) (VEC (gimple, heap) **, tree *, tree *);
#define NUM_PATTERNS 7
void vect_pattern_recog (loop_vec_info);

/* In tree-vectorizer.c.  */
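One detail worth a sketch: at the and_ior_xor label, adjust_bool_pattern balances the precisions of the two mask operands, casting whichever mask is further from the output precision via adjust_bool_pattern_cast. A hypothetical source shape that would exercise this path (assumption: a compare of short operands yields a 16-bit mask while a compare of int operands yields a 32-bit one, per the build_nonstandard_integer_type calls above):

    #define M 1024
    short sa[M], sb[M];
    int ia[M], ib[M], out[M];

    void
    mixed_width (void)
    {
      int i;
      for (i = 0; i < M; ++i)
        /* The 16-bit mask from the short compare is cast up to 32 bits
           to match the int compare's mask before the bitwise OR.  */
        out[i] = (sa[i] < sb[i]) | (ia[i] < ib[i]);
    }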