re PR tree-optimization/37027 (SLP loop vectorization missing support for reductions)

PR tree-optimization/37027
	* tree-vectorizer.h (struct _loop_vec_info): Add new field reductions 
	and macro to access it.
	(vectorizable_reduction): Add argument.
	(vect_get_slp_defs): Likewise.
	* tree-vect-loop.c (vect_analyze_scalar_cycles_1): Collect reduction
	statements for possible use in SLP.
	(new_loop_vec_info): Initialize LOOP_VINFO_REDUCTIONS.
	(destroy_loop_vec_info): Free LOOP_VINFO_REDUCTIONS.
	(vect_create_epilog_for_reduction): Handle SLP. Modify documentation,
	add new argument.
	(vectorizable_reduction): Likewise.
	* tree-vect-stmts.c (vect_get_vec_defs): Update call to 
	vect_get_slp_defs.
	(vectorizable_type_demotion, vectorizable_type_promotion,
	vectorizable_store): Likewise.
	(vect_analyze_stmt): Update call to vectorizable_reduction.
	(vect_transform_stmt): Likewise.
	* tree-vect-slp.c (vect_get_and_check_slp_defs): Handle reduction.
	(vect_build_slp_tree): Fix indentation. Check that there are no loads
	from different interleaving chains in same node.
	(vect_slp_rearrange_stmts): New function.
	(vect_supported_load_permutation_p): Allow load permutations for 
	reductions. Call vect_slp_rearrange_stmts() to rearrange statements
	inside SLP nodes if necessary.
	(vect_analyze_slp_instance): Handle reductions.
	(vect_analyze_slp): Try to build SLP instances originating from groups
	of reductions.
	(vect_detect_hybrid_slp_stmts): Skip reduction statements.
	(vect_get_constant_vectors): Create initial vectors for reductions
	according to reduction code. Add new argument.
	(vect_get_slp_defs): Add new argument, pass it to 
	vect_get_constant_vectors.
	(vect_schedule_slp_instance): Remove SLP tree root statements.

From-SVN: r158506
Author: Ira Rosen
Date: 2010-04-19 09:10:45 +00:00
Commit: b5aeb3bb3e (parent 5a2fa9e8bf)
15 changed files with 1368 additions and 469 deletions

gcc/ChangeLog

@@ -1,3 +1,40 @@
2010-04-19 Ira Rosen <irar@il.ibm.com>
PR tree-optimization/37027
* tree-vectorizer.h (struct _loop_vec_info): Add new field reductions
and macro to access it.
(vectorizable_reduction): Add argument.
(vect_get_slp_defs): Likewise.
* tree-vect-loop.c (vect_analyze_scalar_cycles_1): Collect reduction
statements for possible use in SLP.
(new_loop_vec_info): Initialize LOOP_VINFO_REDUCTIONS.
(destroy_loop_vec_info): Free LOOP_VINFO_REDUCTIONS.
(vect_create_epilog_for_reduction): Handle SLP. Modify documentation,
add new argument.
(vectorizable_reduction): Likewise.
* tree-vect-stmts.c (vect_get_vec_defs): Update call to
vect_get_slp_defs.
(vectorizable_type_demotion, vectorizable_type_promotion,
vectorizable_store): Likewise.
(vect_analyze_stmt): Update call to vectorizable_reduction.
(vect_transform_stmt): Likewise.
* tree-vect-slp.c (vect_get_and_check_slp_defs): Handle reduction.
(vect_build_slp_tree): Fix indentation. Check that there are no loads
from different interleaving chains in same node.
(vect_slp_rearrange_stmts): New function.
(vect_supported_load_permutation_p): Allow load permutations for
reductions. Call vect_slp_rearrange_stmts() to rearrange statements
inside SLP nodes if necessary.
(vect_analyze_slp_instance): Handle reductions.
(vect_analyze_slp): Try to build SLP instances originating from groups
of reductions.
(vect_detect_hybrid_slp_stmts): Skip reduction statements.
(vect_get_constant_vectors): Create initial vectors for reductions
according to reduction code. Add new argument.
(vect_get_slp_defs): Add new argument, pass it to
vect_get_constant_vectors.
(vect_schedule_slp_instance): Remove SLP tree root statements.
2010-04-19 Jakub Jelinek <jakub@redhat.com>
* tree.h (ENUM_IS_SCOPED): Define.

gcc/testsuite/ChangeLog

@@ -1,3 +1,14 @@
2010-04-19 Ira Rosen <irar@il.ibm.com>
PR tree-optimization/37027
* lib/target-supports.exp
(check_effective_target_vect_widen_sum_hi_to_si_pattern): New.
* gcc.dg/vect/pr37027.c: New test.
* gcc.dg/vect/slp-reduc-1.c, gcc.dg/vect/slp-reduc-2.c,
gcc.dg/vect/slp-reduc-3.c, gcc.dg/vect/slp-reduc-4.c,
gcc.dg/vect/slp-reduc-5.c, gcc.dg/vect/slp-reduc-6.c,
gcc.dg/vect/vect-complex-6.c: Likewise.
2010-04-19 Jakub Jelinek <jakub@redhat.com>
* g++.dg/debug/dwarf2/enum1.C: New test.

gcc/testsuite/gcc.dg/vect/pr37027.c

@@ -0,0 +1,37 @@
/* { dg-do compile } */
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
struct mystr
{
int f1;
int f2;
};
struct mystr a[16];
struct mystr b[16];
int res1, res2;
void
foo (void)
{
int i;
int sum1;
int sum2;
for (i = 0; i < 16; i++)
{
sum1 += a[i].f1 + b[i].f1;
sum2 += a[i].f2 + b[i].f2;
}
res1 = sum1;
res2 = sum2;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_int_add } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_int_add } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */

gcc/testsuite/gcc.dg/vect/slp-reduc-1.c

@@ -0,0 +1,49 @@
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
#define N 16
unsigned int ub[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45};
unsigned int uc[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
/* Vectorization of reduction using loop-aware SLP. */
__attribute__ ((noinline))
int main1 (int n, int res0, int res1, int res2, int res3)
{
int i;
unsigned int udiff0 = 5, udiff1 = 10, udiff2 = 20, udiff3 = 30;
for (i = 0; i < n; i++) {
udiff3 += (ub[4*i + 3] - uc[4*i + 3]);
udiff2 += (ub[4*i + 2] - uc[4*i + 2]);
udiff1 += (ub[4*i + 1] - uc[4*i + 1]);
udiff0 += (ub[4*i] - uc[4*i]);
}
/* Check results: */
if (udiff0 != res0
|| udiff1 != res1
|| udiff2 != res2
|| udiff3 != res3)
abort ();
return 0;
}
int main (void)
{
check_vect ();
main1 (N/4, 53, 66, 84, 102);
main1 (N/4 - 1, 29, 40, 56, 72);
return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_int_add } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_int_add } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */

gcc/testsuite/gcc.dg/vect/slp-reduc-2.c

@@ -0,0 +1,44 @@
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
#define N 16
unsigned int ub[N] = {0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45};
unsigned int uc[N] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
/* Vectorization of reduction using loop-aware SLP (with unrolling). */
__attribute__ ((noinline))
int main1 (int n, int res0, int res1, int res2, int res3)
{
int i;
unsigned int udiff0 = 5, udiff1 = 10;
for (i = 0; i < n; i++) {
udiff1 += (ub[2*i + 1] - uc[2*i + 1]);
udiff0 += (ub[2*i] - uc[2*i]);
}
/* Check results: */
if (udiff0 != res0
|| udiff1 != res1)
abort ();
return 0;
}
int main (void)
{
check_vect ();
main1 (N/2, 117, 138, 84, 102);
return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_int_add } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_int_add } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */

gcc/testsuite/gcc.dg/vect/slp-reduc-3.c

@@ -0,0 +1,62 @@
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include "tree-vect.h"
#define N 64
#define DOT1 21834
#define DOT2 21876
unsigned short X[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
unsigned short Y[N] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
/* short->short->int dot product.
Not detected as a dot-product pattern.
Requires support for non-widening multiplication and widening-summation.
Vectorized with loop-aware SLP. */
__attribute__ ((noinline)) unsigned int
foo1(int len, int *result1, int *result2)
{
int i;
unsigned int res1 = 10, res2 = 20;
unsigned short prod;
for (i=0; i<len; i++) {
prod = X[2*i] * Y[2*i];
res1 += prod;
prod = X[2*i+1] * Y[2*i+1];
res2 += prod;
}
*result1 = res1;
*result2 = res2;
return 0;
}
int main (void)
{
unsigned int dot1, dot2;
unsigned short i;
check_vect ();
for (i=0; i<N; i++) {
X[i] = i;
Y[i] = 64-i;
}
foo1 (N/2, &dot1, &dot2);
if (dot1 != DOT1 || dot2 != DOT2)
abort ();
return 0;
}
/* The initialization loop in main also gets vectorized. */
/* { dg-final { scan-tree-dump-times "vect_recog_dot_prod_pattern: detected" 1 "vect" { xfail *-*-* } } } */
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { target { vect_short_mult && vect_widen_sum_hi_to_si } } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail { vect_widen_sum_hi_to_si_pattern } } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */

gcc/testsuite/gcc.dg/vect/slp-reduc-4.c

@@ -0,0 +1,60 @@
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
#define N 128
unsigned int uc[N];
/* Vectorization of reduction using loop-aware SLP. */
__attribute__ ((noinline))
int main1 (int n, int res0, int res1, int res2, int res3, int res4, int res5, int res6, int res7)
{
int i;
unsigned int max0 = 5, max1 = 10, max2 = 20, max3 = 30, max4 = 2, max5 = 13, max6 = 7, max7 = 313;
for (i = 0; i < n; i++) {
max2 = max2 < uc[8*i+2] ? uc[8*i+2] : max2;
max3 = max3 < uc[8*i+3] ? uc[8*i+3] : max3;
max1 = max1 < uc[8*i+1] ? uc[8*i+1] : max1;
max7 = max7 < uc[8*i+7] ? uc[8*i+7] : max7;
max6 = max6 < uc[8*i+6] ? uc[8*i+6] : max6;
max0 = max0 < uc[8*i] ? uc[8*i] : max0;
max4 = max4 < uc[8*i+4] ? uc[8*i+4] : max4;
max5 = max5 < uc[8*i+5] ? uc[8*i+5] : max5;
}
/* Check results: */
if (max0 != res0
|| max1 != res1
|| max2 != res2
|| max3 != res3
|| max4 != res4
|| max5 != res5
|| max6 != res6
|| max7 != res7)
abort ();
return 0;
}
int main (void)
{
int i;
check_vect ();
for (i = 0; i < N; i++)
uc[i] = i+3;
main1 (N/8, 123, 124, 125, 126, 127, 128, 129, 313);
return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail vect_no_int_max } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_int_max } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */

gcc/testsuite/gcc.dg/vect/slp-reduc-5.c

@@ -0,0 +1,49 @@
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
#define N 128
int c[N];
/* Vectorization of reduction using loop-aware SLP. */
__attribute__ ((noinline))
int main1 (int n, int res0, int res1)
{
int i;
int max0 = -100, max1 = -313;
for (i = 0; i < n; i++) {
max1 = max1 < c[2*i+1] ? c[2*i+1] : max1;
max0 = max0 < c[2*i] ? c[2*i] : max0;
}
/* Check results: */
if (max0 != res0
|| max1 != res1)
abort ();
return 0;
}
int main (void)
{
int i;
check_vect ();
for (i = 0; i < N; i++)
c[i] = (i+3) * -1;
c[0] = c[1] = -100;
main1 (N/2, -5, -6);
return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail vect_no_int_max } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_no_int_max } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */

gcc/testsuite/gcc.dg/vect/slp-reduc-6.c

@@ -0,0 +1,50 @@
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
#define N 128
int a[N], b[N];
/* Vectorization of reduction. Loop-aware SLP is not possible, because of
different arrays. */
__attribute__ ((noinline))
int main1 (int n, int res0, int res1)
{
int i;
int sum0 = 0, sum1 = 0;
for (i = 0; i < n; i++) {
sum1 += a[2*i];
sum0 += b[2*i];
}
/* Check results: */
if (sum0 != res0
|| sum1 != res1)
abort ();
return 0;
}
int main (void)
{
int i;
check_vect ();
for (i = 0; i < N; i++)
a[i] = b[i] = i;
main1 (N/2, 4032, 4032);
return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { xfail vect_no_int_add } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */
/* { dg-final { scan-tree-dump-times "different interleaving chains in one node" 1 "vect" { target { ! vect_no_int_add } } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */

gcc/testsuite/lib/target-supports.exp

@@ -2105,6 +2105,25 @@ proc check_effective_target_vect_perm { } {
return $et_vect_perm_saved
}
# Return 1 if the target plus current options supports a vector
# widening summation of *short* args into *int* result, 0 otherwise.
#
# This won't change for different subtargets so cache the result.
proc check_effective_target_vect_widen_sum_hi_to_si_pattern { } {
global et_vect_widen_sum_hi_to_si_pattern_saved
if [info exists et_vect_widen_sum_hi_to_si_pattern_saved] {
verbose "check_effective_target_vect_widen_sum_hi_to_si_pattern: using cached result" 2
} else {
set et_vect_widen_sum_hi_to_si_pattern_saved 0
if { [istarget powerpc*-*-*] } {
set et_vect_widen_sum_hi_to_si_pattern_saved 1
}
}
verbose "check_effective_target_vect_widen_sum_hi_to_si_pattern: returning $et_vect_widen_sum_hi_to_si_pattern_saved" 2
return $et_vect_widen_sum_hi_to_si_pattern_saved
}
# Return 1 if the target plus current options supports a vector
# widening summation of *short* args into *int* result, 0 otherwise.
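
Tests can then key off the new effective target; slp-reduc-3.c above uses it to XFAIL its SLP scan on targets where the widening-summation pattern is recognized instead:

/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail { vect_widen_sum_hi_to_si_pattern } } } } */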

gcc/tree-vect-loop.c (file diff suppressed because it is too large)

gcc/tree-vect-patterns.c

@@ -670,6 +670,8 @@ vect_pattern_recog_1 (
tree pattern_vectype;
tree type_in, type_out;
enum tree_code code;
int i;
gimple next;
pattern_stmt = (* vect_recog_func) (stmt, &type_in, &type_out);
if (!pattern_stmt)
@@ -735,7 +737,13 @@ vect_pattern_recog_1 (
STMT_VINFO_IN_PATTERN_P (stmt_info) = true;
STMT_VINFO_RELATED_STMT (stmt_info) = pattern_stmt;
return;
/* Patterns cannot be vectorized using SLP, because they change the order of
computation. */
for (i = 0; VEC_iterate (gimple, LOOP_VINFO_REDUCTIONS (loop_vinfo), i,
next);
i++)
if (next == stmt)
VEC_ordered_remove (gimple, LOOP_VINFO_REDUCTIONS (loop_vinfo), i);
}

gcc/tree-vect-slp.c

@@ -273,6 +273,7 @@ vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
break;
case vect_internal_def:
case vect_reduction_def:
if (i == 0)
VEC_safe_push (gimple, heap, *def_stmts0, def_stmt);
else
@@ -332,7 +333,7 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
HOST_WIDE_INT dummy;
bool permutation = false;
unsigned int load_place;
gimple first_load;
gimple first_load, prev_first_load = NULL;
/* For every stmt in NODE find its def stmt/s. */
for (i = 0; VEC_iterate (gimple, stmts, i, stmt); i++)
@@ -485,42 +486,62 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
&pattern0, &pattern1))
return false;
}
else
{
/* Load. */
/* FORNOW: Check that there is no gap between the loads. */
if ((DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) == stmt
&& DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
|| (DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) != stmt
&& DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
{
if (vect_print_dump_info (REPORT_SLP))
{
fprintf (vect_dump, "Build SLP failed: strided "
"loads have gaps ");
print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
}
else
{
/* Load. */
/* FORNOW: Check that there is no gap between the loads. */
if ((DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) == stmt
&& DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
|| (DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) != stmt
&& DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
{
if (vect_print_dump_info (REPORT_SLP))
{
fprintf (vect_dump, "Build SLP failed: strided "
"loads have gaps ");
print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
}
return false;
}
return false;
}
/* Check that the size of interleaved loads group is not
greater than the SLP group size. */
if (DR_GROUP_SIZE (vinfo_for_stmt (stmt))
> ncopies * group_size)
{
if (vect_print_dump_info (REPORT_SLP))
{
fprintf (vect_dump, "Build SLP failed: the number of "
"interleaved loads is greater than"
" the SLP group size ");
print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
}
/* Check that the size of interleaved loads group is not
greater than the SLP group size. */
if (DR_GROUP_SIZE (vinfo_for_stmt (stmt)) > ncopies * group_size)
{
if (vect_print_dump_info (REPORT_SLP))
{
fprintf (vect_dump, "Build SLP failed: the number of "
"interleaved loads is greater than"
" the SLP group size ");
print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
}
return false;
}
return false;
}
first_load = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt));
first_load = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt));
if (prev_first_load)
{
/* Check that there are no loads from different interleaving
chains in the same node. The only exception is complex
numbers. */
if (prev_first_load != first_load
&& rhs_code != REALPART_EXPR
&& rhs_code != IMAGPART_EXPR)
{
if (vect_print_dump_info (REPORT_SLP))
{
fprintf (vect_dump, "Build SLP failed: different "
"interleaving chains in one node ");
print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
}
return false;
}
}
else
prev_first_load = first_load;
if (first_load == stmt)
{
@@ -787,6 +808,39 @@ vect_supported_slp_permutation_p (slp_instance instance)
}
/* Rearrange the statements of NODE according to PERMUTATION. */
static void
vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
VEC (int, heap) *permutation)
{
gimple stmt;
VEC (gimple, heap) *tmp_stmts;
unsigned int index, i;
if (!node)
return;
vect_slp_rearrange_stmts (SLP_TREE_LEFT (node), group_size, permutation);
vect_slp_rearrange_stmts (SLP_TREE_RIGHT (node), group_size, permutation);
gcc_assert (group_size == VEC_length (gimple, SLP_TREE_SCALAR_STMTS (node)));
tmp_stmts = VEC_alloc (gimple, heap, group_size);
for (i = 0; i < group_size; i++)
VEC_safe_push (gimple, heap, tmp_stmts, NULL);
for (i = 0; VEC_iterate (gimple, SLP_TREE_SCALAR_STMTS (node), i, stmt); i++)
{
index = VEC_index (int, permutation, i);
VEC_replace (gimple, tmp_stmts, index, stmt);
}
VEC_free (gimple, heap, SLP_TREE_SCALAR_STMTS (node));
SLP_TREE_SCALAR_STMTS (node) = tmp_stmts;
}
/* Check if the required load permutation is supported.
LOAD_PERMUTATION contains a list of indices of the loads.
In SLP this permutation is relative to the order of strided stores that are
@@ -796,9 +850,11 @@ static bool
vect_supported_load_permutation_p (slp_instance slp_instn, int group_size,
VEC (int, heap) *load_permutation)
{
int i = 0, j, prev = -1, next, k;
bool supported;
int i = 0, j, prev = -1, next, k, number_of_groups;
bool supported, bad_permutation = false;
sbitmap load_index;
slp_tree node;
gimple stmt;
/* FORNOW: permutations are only supported in SLP. */
if (!slp_instn)
@@ -811,9 +867,72 @@ vect_supported_load_permutation_p (slp_instance slp_instn, int group_size,
fprintf (vect_dump, "%d ", next);
}
/* In case of reduction every load permutation is allowed, since the order
of the reduction statements is not important (as opposed to the case of
strided stores). The only condition we need to check is that all the
load nodes are of the same size and have the same permutation (and then
rearrange all the nodes of the SLP instance according to this
permutation). */
/* Check that all the load nodes are of the same size. */
for (i = 0;
VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (slp_instn), i, node);
i++)
if (VEC_length (gimple, SLP_TREE_SCALAR_STMTS (node))
!= (unsigned) group_size)
return false;
node = SLP_INSTANCE_TREE (slp_instn);
stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (node), 0);
/* LOAD_PERMUTATION is a list of indices of all the loads of the SLP
instance, not all the loads belong to the same node or interleaving
group. Hence, we need to divide them into groups according to
GROUP_SIZE. */
number_of_groups = VEC_length (int, load_permutation) / group_size;
/* Reduction (there are no data-refs in the root). */
if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
{
int first_group_load_index;
/* Compare all the permutation sequences to the first one. */
for (i = 1; i < number_of_groups; i++)
{
k = 0;
for (j = i * group_size; j < i * group_size + group_size; j++)
{
next = VEC_index (int, load_permutation, j);
first_group_load_index = VEC_index (int, load_permutation, k);
if (next != first_group_load_index)
{
bad_permutation = true;
break;
}
k++;
}
if (bad_permutation)
break;
}
if (!bad_permutation)
{
/* This permutation is valid for reduction. Since the order of the
statements in the nodes is not important unless they are memory
accesses, we can rearrange the statements in all the nodes
according to the order of the loads. */
vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
load_permutation);
VEC_free (int, heap, SLP_INSTANCE_LOAD_PERMUTATION (slp_instn));
return true;
}
}
/* FORNOW: the only supported permutation is 0..01..1.. of length equal to
GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as
well. */
well (unless it's reduction). */
if (VEC_length (int, load_permutation)
!= (unsigned int) (group_size * group_size))
return false;
@@ -896,17 +1015,28 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
slp_tree node = XNEW (struct _slp_tree);
unsigned int group_size = DR_GROUP_SIZE (vinfo_for_stmt (stmt));
unsigned int unrolling_factor = 1, nunits;
tree vectype, scalar_type;
tree vectype, scalar_type = NULL_TREE;
gimple next;
unsigned int vectorization_factor = 0;
int inside_cost = 0, outside_cost = 0, ncopies_for_cost;
int inside_cost = 0, outside_cost = 0, ncopies_for_cost, i;
unsigned int max_nunits = 0;
VEC (int, heap) *load_permutation;
VEC (slp_tree, heap) *loads;
struct data_reference *dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
if (dr)
{
scalar_type = TREE_TYPE (DR_REF (dr));
vectype = get_vectype_for_scalar_type (scalar_type);
group_size = DR_GROUP_SIZE (vinfo_for_stmt (stmt));
}
else
{
gcc_assert (loop_vinfo);
vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
group_size = VEC_length (gimple, LOOP_VINFO_REDUCTIONS (loop_vinfo));
}
scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (
vinfo_for_stmt (stmt))));
vectype = get_vectype_for_scalar_type (scalar_type);
if (!vectype)
{
if (vect_print_dump_info (REPORT_SLP))
@@ -914,6 +1044,7 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
fprintf (vect_dump, "Build SLP failed: unsupported data-type ");
print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
}
return false;
}
@@ -938,11 +1069,29 @@
/* Create a node (a root of the SLP tree) for the packed strided stores. */
SLP_TREE_SCALAR_STMTS (node) = VEC_alloc (gimple, heap, group_size);
next = stmt;
/* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
while (next)
if (dr)
{
VEC_safe_push (gimple, heap, SLP_TREE_SCALAR_STMTS (node), next);
next = DR_GROUP_NEXT_DR (vinfo_for_stmt (next));
/* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */
while (next)
{
VEC_safe_push (gimple, heap, SLP_TREE_SCALAR_STMTS (node), next);
next = DR_GROUP_NEXT_DR (vinfo_for_stmt (next));
}
}
else
{
/* Collect reduction statements. */
for (i = 0; VEC_iterate (gimple, LOOP_VINFO_REDUCTIONS (loop_vinfo), i,
next);
i++)
{
VEC_safe_push (gimple, heap, SLP_TREE_SCALAR_STMTS (node), next);
if (vect_print_dump_info (REPORT_DETAILS))
{
fprintf (vect_dump, "pushing reduction into node: ");
print_gimple_stmt (vect_dump, next, 0, TDF_SLIM);
}
}
}
SLP_TREE_VEC_STMTS (node) = NULL;
@@ -1035,7 +1184,7 @@ bool
vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
{
unsigned int i;
VEC (gimple, heap) *strided_stores;
VEC (gimple, heap) *strided_stores, *reductions = NULL;
gimple store;
bool ok = false;
@@ -1043,10 +1192,14 @@ vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
fprintf (vect_dump, "=== vect_analyze_slp ===");
if (loop_vinfo)
strided_stores = LOOP_VINFO_STRIDED_STORES (loop_vinfo);
{
strided_stores = LOOP_VINFO_STRIDED_STORES (loop_vinfo);
reductions = LOOP_VINFO_REDUCTIONS (loop_vinfo);
}
else
strided_stores = BB_VINFO_STRIDED_STORES (bb_vinfo);
/* Find SLP sequences starting from groups of strided stores. */
for (i = 0; VEC_iterate (gimple, strided_stores, i, store); i++)
if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, store))
ok = true;
@@ -1059,6 +1212,12 @@ vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
return false;
}
/* Find SLP sequences starting from groups of reductions. */
if (loop_vinfo && VEC_length (gimple, LOOP_VINFO_REDUCTIONS (loop_vinfo))
&& vect_analyze_slp_instance (loop_vinfo, bb_vinfo,
VEC_index (gimple, reductions, 0)))
ok = true;
return true;
}
@@ -1120,7 +1279,10 @@ vect_detect_hybrid_slp_stmts (slp_tree node)
if ((stmt_vinfo = vinfo_for_stmt (use_stmt))
&& !STMT_SLP_TYPE (stmt_vinfo)
&& (STMT_VINFO_RELEVANT (stmt_vinfo)
|| VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_vinfo))))
|| VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_vinfo)))
&& !(gimple_code (use_stmt) == GIMPLE_PHI
&& STMT_VINFO_DEF_TYPE (vinfo_for_stmt (use_stmt))
== vect_reduction_def))
vect_mark_slp_stmts (node, hybrid, i);
vect_detect_hybrid_slp_stmts (SLP_TREE_LEFT (node));
@@ -1429,11 +1591,14 @@ vect_update_slp_costs_according_to_vf (loop_vec_info loop_vinfo)
/* For constant and loop invariant defs of SLP_NODE this function returns
(vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
OP_NUM determines if we gather defs for operand 0 or operand 1 of the scalar
stmts. NUMBER_OF_VECTORS is the number of vector defs to create. */
stmts. NUMBER_OF_VECTORS is the number of vector defs to create.
REDUC_INDEX is the index of the reduction operand in the statements, unless
it is -1. */
static void
vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
unsigned int op_num, unsigned int number_of_vectors)
unsigned int op_num, unsigned int number_of_vectors,
int reduc_index)
{
VEC (gimple, heap) *stmts = SLP_TREE_SCALAR_STMTS (slp_node);
gimple stmt = VEC_index (gimple, stmts, 0);
@@ -1449,6 +1614,50 @@ vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
int number_of_copies = 1;
VEC (tree, heap) *voprnds = VEC_alloc (tree, heap, number_of_vectors);
bool constant_p, is_store;
tree neutral_op = NULL;
if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def)
{
enum tree_code code = gimple_assign_rhs_code (stmt);
if (reduc_index == -1)
{
VEC_free (tree, heap, *vec_oprnds);
return;
}
op_num = reduc_index - 1;
op = gimple_op (stmt, op_num + 1);
/* For additional copies (see the explanation of NUMBER_OF_COPIES below)
we need either neutral operands or the original operands. See
get_initial_def_for_reduction() for details. */
switch (code)
{
case WIDEN_SUM_EXPR:
case DOT_PROD_EXPR:
case PLUS_EXPR:
case MINUS_EXPR:
case BIT_IOR_EXPR:
case BIT_XOR_EXPR:
if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
neutral_op = build_real (TREE_TYPE (op), dconst0);
else
neutral_op = build_int_cst (TREE_TYPE (op), 0);
break;
case MULT_EXPR:
case BIT_AND_EXPR:
if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (op)))
neutral_op = build_real (TREE_TYPE (op), dconst1);
else
neutral_op = build_int_cst (TREE_TYPE (op), 1);
break;
default:
neutral_op = NULL;
}
}
if (STMT_VINFO_DATA_REF (stmt_vinfo))
{
@@ -1499,6 +1708,19 @@ vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
else
op = gimple_op (stmt, op_num + 1);
if (reduc_index != -1)
{
struct loop *loop = (gimple_bb (stmt))->loop_father;
gimple def_stmt = SSA_NAME_DEF_STMT (op);
gcc_assert (loop);
/* Get the def before the loop. */
op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
loop_preheader_edge (loop));
if (j != (number_of_copies - 1) && neutral_op)
op = neutral_op;
}
/* Create 'vect_ = {op0,op1,...,opn}'. */
t = tree_cons (NULL_TREE, op, t);
@@ -1536,8 +1758,25 @@ vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds,
to replicate the vectors. */
while (number_of_vectors > VEC_length (tree, *vec_oprnds))
{
for (i = 0; VEC_iterate (tree, *vec_oprnds, i, vop) && i < vec_num; i++)
VEC_quick_push (tree, *vec_oprnds, vop);
tree neutral_vec = NULL;
if (neutral_op)
{
if (!neutral_vec)
{
t = NULL;
for (i = 0; i < (unsigned) nunits; i++)
t = tree_cons (NULL_TREE, neutral_op, t);
neutral_vec = build_vector (vector_type, t);
}
VEC_quick_push (tree, *vec_oprnds, neutral_vec);
}
else
{
for (i = 0; VEC_iterate (tree, *vec_oprnds, i, vop) && i < vec_num; i++)
VEC_quick_push (tree, *vec_oprnds, vop);
}
}
}
@@ -1576,7 +1815,7 @@ vect_get_slp_vect_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds)
void
vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
VEC (tree,heap) **vec_oprnds1)
VEC (tree,heap) **vec_oprnds1, int reduc_index)
{
gimple first_stmt;
enum tree_code code;
@@ -1607,19 +1846,26 @@ vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
*vec_oprnds0 = VEC_alloc (tree, heap, number_of_vects);
/* SLP_NODE corresponds either to a group of stores or to a group of
unary/binary operations. We don't call this function for loads. */
if (SLP_TREE_LEFT (slp_node))
unary/binary operations. We don't call this function for loads.
For reduction defs we call vect_get_constant_vectors(), since we are
looking for initial loop invariant values. */
if (SLP_TREE_LEFT (slp_node) && reduc_index == -1)
/* The defs are already vectorized. */
vect_get_slp_vect_defs (SLP_TREE_LEFT (slp_node), vec_oprnds0);
else
/* Build vectors from scalar defs. */
vect_get_constant_vectors (slp_node, vec_oprnds0, 0, number_of_vects);
vect_get_constant_vectors (slp_node, vec_oprnds0, 0, number_of_vects,
reduc_index);
if (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)))
/* Since we don't call this function with loads, this is a group of
stores. */
return;
/* For reductions, we only need initial values. */
if (reduc_index != -1)
return;
code = gimple_assign_rhs_code (first_stmt);
if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS || !vec_oprnds1)
return;
@@ -1638,7 +1884,7 @@ vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0,
vect_get_slp_vect_defs (SLP_TREE_RIGHT (slp_node), vec_oprnds1);
else
/* Build vectors from scalar defs. */
vect_get_constant_vectors (slp_node, vec_oprnds1, 1, number_of_vects);
vect_get_constant_vectors (slp_node, vec_oprnds1, 1, number_of_vects, -1);
}
@@ -2027,22 +2273,7 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance,
si = gsi_for_stmt (stmt);
is_store = vect_transform_stmt (stmt, &si, &strided_store, node, instance);
if (is_store)
{
if (DR_GROUP_FIRST_DR (stmt_info))
/* If IS_STORE is TRUE, the vectorization of the
interleaving chain was completed - free all the stores in
the chain. */
vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
else
/* FORNOW: SLP originates only from strided stores. */
gcc_unreachable ();
return true;
}
/* FORNOW: SLP originates only from strided stores. */
return false;
return is_store;
}
@@ -2075,6 +2306,26 @@ vect_schedule_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
fprintf (vect_dump, "vectorizing stmts using SLP.");
}
for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
{
slp_tree root = SLP_INSTANCE_TREE (instance);
gimple store;
unsigned int j;
gimple_stmt_iterator gsi;
for (j = 0; VEC_iterate (gimple, SLP_TREE_SCALAR_STMTS (root), j, store)
&& j < SLP_INSTANCE_GROUP_SIZE (instance); j++)
{
if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (store)))
break;
/* Free the attached stmt_vec_info and remove the stmt. */
gsi = gsi_for_stmt (store);
gsi_remove (&gsi, true);
free_stmt_vec_info (store);
}
}
return is_store;
}
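
To make the neutral-operand handling in vect_get_constant_vectors above concrete: when more vector copies are needed than there are initial values, the extra lanes are filled with the operation's identity element, which leaves the final reduction unchanged. A small freestanding C sketch of the idea (illustrative values, not GCC code):

#include <stdio.h>

int
main (void)
{
  /* Eight lanes but only six real values: the trailing lanes hold the
     neutral element (0 for addition, 1 for multiplication), so reducing
     all eight lanes gives the same answer as reducing the six values.  */
  int lanes_add[8]  = { 3, 5, 7, 2, 4, 6, 0, 0 };
  int lanes_mult[8] = { 3, 5, 7, 2, 4, 6, 1, 1 };
  int sum = 0, prod = 1, i;

  for (i = 0; i < 8; i++)
    {
      sum += lanes_add[i];
      prod *= lanes_mult[i];
    }

  printf ("sum=%d prod=%d\n", sum, prod);  /* sum=27 prod=5040 */
  return 0;
}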

gcc/tree-vect-stmts.c

@@ -1134,7 +1134,7 @@ vect_get_vec_defs (tree op0, tree op1, gimple stmt,
slp_tree slp_node)
{
if (slp_node)
vect_get_slp_defs (slp_node, vec_oprnds0, vec_oprnds1);
vect_get_slp_defs (slp_node, vec_oprnds0, vec_oprnds1, -1);
else
{
tree vec_oprnd;
@@ -2519,7 +2519,7 @@ vectorizable_type_demotion (gimple stmt, gimple_stmt_iterator *gsi,
{
/* Handle uses. */
if (slp_node)
vect_get_slp_defs (slp_node, &vec_oprnds0, NULL);
vect_get_slp_defs (slp_node, &vec_oprnds0, NULL, -1);
else
{
VEC_free (tree, heap, vec_oprnds0);
@@ -2819,7 +2819,7 @@ vectorizable_type_promotion (gimple stmt, gimple_stmt_iterator *gsi,
if (j == 0)
{
if (slp_node)
vect_get_slp_defs (slp_node, &vec_oprnds0, &vec_oprnds1);
vect_get_slp_defs (slp_node, &vec_oprnds0, &vec_oprnds1, -1);
else
{
vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL);
@@ -3105,7 +3105,7 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
if (slp)
{
/* Get vectorized arguments for SLP_NODE. */
vect_get_slp_defs (slp_node, &vec_oprnds, NULL);
vect_get_slp_defs (slp_node, &vec_oprnds, NULL, -1);
vec_oprnd = VEC_index (tree, vec_oprnds, 0);
}
@@ -4049,7 +4049,7 @@ vect_analyze_stmt (gimple stmt, bool *need_to_vectorize, slp_tree node)
|| vectorizable_load (stmt, NULL, NULL, NULL, NULL)
|| vectorizable_call (stmt, NULL, NULL)
|| vectorizable_store (stmt, NULL, NULL, NULL)
|| vectorizable_reduction (stmt, NULL, NULL)
|| vectorizable_reduction (stmt, NULL, NULL, NULL)
|| vectorizable_condition (stmt, NULL, NULL, NULL, 0));
else
{
@@ -4201,8 +4201,7 @@ vect_transform_stmt (gimple stmt, gimple_stmt_iterator *gsi,
break;
case reduc_vec_info_type:
gcc_assert (!slp_node);
done = vectorizable_reduction (stmt, gsi, &vec_stmt);
done = vectorizable_reduction (stmt, gsi, &vec_stmt, slp_node);
gcc_assert (done);
break;

gcc/tree-vectorizer.h

@@ -242,6 +242,9 @@ typedef struct _loop_vec_info {
/* The unrolling factor needed to SLP the loop. In case of that pure SLP is
applied to the loop, i.e., no unrolling is needed, this is 1. */
unsigned slp_unrolling_factor;
/* Reduction cycles detected in the loop. Used in loop-aware SLP. */
VEC (gimple, heap) *reductions;
} *loop_vec_info;
/* Access Functions. */
@@ -266,6 +269,7 @@ typedef struct _loop_vec_info {
#define LOOP_VINFO_STRIDED_STORES(L) (L)->strided_stores
#define LOOP_VINFO_SLP_INSTANCES(L) (L)->slp_instances
#define LOOP_VINFO_SLP_UNROLLING_FACTOR(L) (L)->slp_unrolling_factor
#define LOOP_VINFO_REDUCTIONS(L) (L)->reductions
#define LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT(L) \
VEC_length (gimple, (L)->may_misalign_stmts) > 0
@@ -844,7 +848,8 @@ extern void vect_transform_loop (loop_vec_info);
extern loop_vec_info vect_analyze_loop_form (struct loop *);
extern bool vectorizable_live_operation (gimple, gimple_stmt_iterator *,
gimple *);
extern bool vectorizable_reduction (gimple, gimple_stmt_iterator *, gimple *);
extern bool vectorizable_reduction (gimple, gimple_stmt_iterator *, gimple *,
slp_tree);
extern bool vectorizable_induction (gimple, gimple_stmt_iterator *, gimple *);
extern int vect_estimate_min_profitable_iters (loop_vec_info);
extern tree get_initial_def_for_reduction (gimple, tree, tree *);
@@ -862,7 +867,7 @@ extern bool vect_analyze_slp (loop_vec_info, bb_vec_info);
extern void vect_make_slp_decision (loop_vec_info);
extern void vect_detect_hybrid_slp (loop_vec_info);
extern void vect_get_slp_defs (slp_tree, VEC (tree,heap) **,
VEC (tree,heap) **);
VEC (tree,heap) **, int);
extern LOC find_bb_location (basic_block);
extern bb_vec_info vect_slp_analyze_bb (basic_block);
extern void vect_slp_transform_bb (basic_block);