target.h (struct vectorize): Add new target builtin.

* target.h (struct vectorize): Add new target builtin.
	* tree-vectorizer.c (destroy_loop_vec_info): Call
	vect_free_slp_instance instead of vect_free_slp_node.
	* tree-vectorizer.h (enum slp_load_perm_type): New.
	(struct _slp_instance): Add new fields.
	(SLP_INSTANCE_LOAD_PERMUTATION): New.
	(SLP_INSTANCE_LOADS): New.
	(vect_free_slp_tree): Remove.
	(vect_free_slp_instance): Declare.
	(SLP_TREE_LOADS_PERM_TYPE, TARG_VEC_PERMUTE_COST): New.
	(vectorizable_load): Add argument.
	(vect_transform_slp_perm_load): New.
	* tree-vect-analyze.c (vect_analyze_operations): Add an argument to
	vectorizable_load.
	(vect_get_place_in_interleaving_chain): New function.
	(vect_free_slp_tree): Make static.
	(vect_free_slp_instance): New function.
	(vect_build_slp_tree): Add new arguments. Allow load permutations and
	collect the load location in the interleaving chain.
	(vect_supported_slp_permutation_p): New function.
	(vect_supported_load_permutation_p): Likewise.
	(vect_analyze_slp_instance): In case of loads permutation, call
	vect_supported_load_permutation_p to check that the permutation is
	supported.
	* target-def.h (TARGET_VECTORIZE_BUILTIN_VEC_PERM): New.
	* tree-vect-transform.c (vect_transform_stmt): Add new argument.
	(vect_create_mask_and_perm): New function.
	(vect_get_mask_element, vect_transform_slp_perm_load): Likewise.
	(vectorizable_load): Add an argument. Don't keep the created vectors
	statements in the node if permutation is required. Call
	vect_transform_slp_perm_load to generate the permutation.
	(vect_transform_stmt): Add new argument. Call vectorizable_load with
	additional argument.
	(vect_schedule_slp_instance): In case of loads permutation, allocate
	vectorized statements structure for all the related SLP nodes. Call
	vect_transform_stmt with additional argument.
	(vect_transform_loop): Call vect_transform_stmt with correct arguments.
	* config/spu/spu.c (spu_builtin_vec_perm): New.
	(TARGET_VECTORIZE_BUILTIN_VEC_PERM): Redefine.
	* config/spu/spu.h (TARG_VEC_PERMUTE_COST): Define.
	* config/rs6000/rs6000.c (rs6000_builtin_vec_perm): New.
	(TARGET_VECTORIZE_BUILTIN_VEC_PERM): Redefine.

From-SVN: r139706
This commit is contained in:
Ira Rosen 2008-08-28 11:11:14 +00:00 committed by Ira Rosen
parent b8c41c8ed2
commit 0fca40f598
21 changed files with 1461 additions and 113 deletions

View File

@ -1,3 +1,48 @@
2008-08-28 Ira Rosen <irar@il.ibm.com>
* target.h (struct vectorize): Add new target builtin.
* tree-vectorizer.c (destroy_loop_vec_info): Call
vect_free_slp_instance instead of vect_free_slp_node.
* tree-vectorizer.h (enum slp_load_perm_type): New.
(struct _slp_instance): Add new fields.
(SLP_INSTANCE_LOAD_PERMUTATION): New.
(SLP_INSTANCE_LOADS): New.
(vect_free_slp_tree): Remove.
(vect_free_slp_instance): Declare.
(SLP_TREE_LOADS_PERM_TYPE, TARG_VEC_PERMUTE_COST): New.
(vectorizable_load): Add argument.
(vect_transform_slp_perm_load): New.
* tree-vect-analyze.c (vect_analyze_operations): Add an argument to
vectorizable_load.
(vect_get_place_in_interleaving_chain): New function.
(vect_free_slp_tree): Make static.
(vect_free_slp_instance): New function.
(vect_build_slp_tree): Add new arguments. Allow load permutations and
collect the load location in the interleaving chain.
(vect_supported_slp_permutation_p): New function.
(vect_supported_load_permutation_p): Likewise.
(vect_analyze_slp_instance): In case of loads permutation, call
vect_supported_load_permutation_p to check that the permutation is
supported.
* target-def.h (TARGET_VECTORIZE_BUILTIN_VEC_PERM): New.
* tree-vect-transform.c (vect_transform_stmt): Add new argument.
(vect_create_mask_and_perm): New function.
(vect_get_mask_element, vect_transform_slp_perm_load): Likewise.
(vectorizable_load): Add an argument. Don't keep the created vectors
statements in the node if permutation is required. Call
vect_transform_slp_perm_load to generate the permutation.
(vect_transform_stmt): Add new argument. Call vectorizable_load with
additional argument.
(vect_schedule_slp_instance): In case of loads permutation, allocate
vectorized statements structure for all the related SLP nodes. Call
vect_transform_stmt with additional argument.
(vect_transform_loop): Call vect_transform_stmt with correct arguments.
* config/spu/spu.c (spu_builtin_vec_perm): New.
(TARGET_VECTORIZE_BUILTIN_VEC_PERM): Redefine.
* config/spu/spu.h (TARG_VEC_PERMUTE_COST): Define.
* config/rs6000/rs6000.c (rs6000_builtin_vec_perm): New.
(TARGET_VECTORIZE_BUILTIN_VEC_PERM): Redefine.
2008-08-28 Chris Fairles <chris.fairles@gmail.com>
* gthr-posix.h (__gthread_create, __gthread_join, __gthread_detach,

View File

@ -862,6 +862,7 @@ static tree rs6000_builtin_mask_for_load (void);
static tree rs6000_builtin_mul_widen_even (tree);
static tree rs6000_builtin_mul_widen_odd (tree);
static tree rs6000_builtin_conversion (enum tree_code, tree);
static tree rs6000_builtin_vec_perm (tree, tree *);
static void def_builtin (int, const char *, tree, int);
static bool rs6000_vector_alignment_reachable (const_tree, bool);
@ -1138,6 +1139,8 @@ static const char alt_reg_names[][8] =
#define TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_ODD rs6000_builtin_mul_widen_odd
#undef TARGET_VECTORIZE_BUILTIN_CONVERSION
#define TARGET_VECTORIZE_BUILTIN_CONVERSION rs6000_builtin_conversion
#undef TARGET_VECTORIZE_BUILTIN_VEC_PERM
#define TARGET_VECTORIZE_BUILTIN_VEC_PERM rs6000_builtin_vec_perm
#undef TARGET_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTOR_ALIGNMENT_REACHABLE rs6000_vector_alignment_reachable
@ -2080,6 +2083,40 @@ rs6000_vector_alignment_reachable (const_tree type ATTRIBUTE_UNUSED, bool is_pac
}
}
/* Implement targetm.vectorize.builtin_vec_perm.

   Return the AltiVec vperm builtin decl that matches the machine mode of
   vector TYPE, or NULL_TREE when the mode is not one of the four handled
   here.  *MASK_ELEMENT_TYPE is set to unsigned char in every case, even
   when NULL_TREE is returned.  */
tree
rs6000_builtin_vec_perm (tree type, tree *mask_element_type)
{
  int builtin_code;
  tree perm_decl;

  *mask_element_type = unsigned_char_type_node;

  /* Pick the builtin index first; the decl lookup is shared below.  */
  switch (TYPE_MODE (type))
    {
    case V16QImode:
      builtin_code = ALTIVEC_BUILTIN_VPERM_16QI;
      break;

    case V8HImode:
      builtin_code = ALTIVEC_BUILTIN_VPERM_8HI;
      break;

    case V4SImode:
      builtin_code = ALTIVEC_BUILTIN_VPERM_4SI;
      break;

    case V4SFmode:
      builtin_code = ALTIVEC_BUILTIN_VPERM_4SF;
      break;

    default:
      return NULL_TREE;
    }

  perm_decl = rs6000_builtin_decls[builtin_code];
  gcc_assert (perm_decl);
  return perm_decl;
}
/* Handle generic options of the form -mfoo=yes/no.
NAME is the option name.
VALUE is the option value.

View File

@ -137,6 +137,7 @@ static tree spu_builtin_mul_widen_odd (tree);
static tree spu_builtin_mask_for_load (void);
static int spu_builtin_vectorization_cost (bool);
static bool spu_vector_alignment_reachable (const_tree, bool);
static tree spu_builtin_vec_perm (tree, tree *);
static int spu_sms_res_mii (struct ddg *g);
extern const char *reg_names[];
@ -288,6 +289,9 @@ const struct attribute_spec spu_attribute_table[];
#undef TARGET_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTOR_ALIGNMENT_REACHABLE spu_vector_alignment_reachable
#undef TARGET_VECTORIZE_BUILTIN_VEC_PERM
#define TARGET_VECTORIZE_BUILTIN_VEC_PERM spu_builtin_vec_perm
#undef TARGET_LIBGCC_CMP_RETURN_MODE
#define TARGET_LIBGCC_CMP_RETURN_MODE spu_libgcc_cmp_return_mode
@ -5543,6 +5547,60 @@ spu_vector_alignment_reachable (const_tree type ATTRIBUTE_UNUSED, bool is_packed
return true;
}
/* Implement targetm.vectorize.builtin_vec_perm.

   Return the fndecl of the SPU shuffle builtin that matches vector TYPE,
   or NULL_TREE for a mode that is not handled.  Integer modes select a
   different builtin for unsigned and signed element types; the two
   floating-point modes have one builtin each.  *MASK_ELEMENT_TYPE is
   set to unsigned char in every case.  */
tree
spu_builtin_vec_perm (tree type, tree *mask_element_type)
{
  struct spu_builtin_description *entry;
  int shuffle;

  *mask_element_type = unsigned_char_type_node;

  switch (TYPE_MODE (type))
    {
    case V16QImode:
      shuffle = TYPE_UNSIGNED (type) ? SPU_SHUFFLE_0 : SPU_SHUFFLE_1;
      break;

    case V8HImode:
      shuffle = TYPE_UNSIGNED (type) ? SPU_SHUFFLE_2 : SPU_SHUFFLE_3;
      break;

    case V4SImode:
      shuffle = TYPE_UNSIGNED (type) ? SPU_SHUFFLE_4 : SPU_SHUFFLE_5;
      break;

    case V2DImode:
      shuffle = TYPE_UNSIGNED (type) ? SPU_SHUFFLE_6 : SPU_SHUFFLE_7;
      break;

    case V4SFmode:
      shuffle = SPU_SHUFFLE_8;
      break;

    case V2DFmode:
      shuffle = SPU_SHUFFLE_9;
      break;

    default:
      return NULL_TREE;
    }

  entry = &spu_builtins[shuffle];
  gcc_assert (entry);
  return entry->fndecl;
}
/* Count the total number of instructions in each pipe and return the
maximum, which is used as the Minimum Iteration Interval (MII)
in the modulo scheduler. get_pipe() will return -2, -1, 0, or 1.

View File

@ -572,6 +572,11 @@ targetm.resolve_overloaded_builtin = spu_resolve_overloaded_builtin; \
#undef TARG_VEC_STORE_COST
#define TARG_VEC_STORE_COST 1
/* Cost of vector permutation. */
#ifndef TARG_VEC_PERMUTE_COST
#define TARG_VEC_PERMUTE_COST 1
#endif
/* Misc */

View File

@ -364,6 +364,7 @@
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST 0
#define TARGET_VECTOR_ALIGNMENT_REACHABLE \
default_builtin_vector_alignment_reachable
#define TARGET_VECTORIZE_BUILTIN_VEC_PERM 0
#define TARGET_VECTORIZE \
{ \
@ -373,7 +374,8 @@
TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_EVEN, \
TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_ODD, \
TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST, \
TARGET_VECTOR_ALIGNMENT_REACHABLE \
TARGET_VECTOR_ALIGNMENT_REACHABLE, \
TARGET_VECTORIZE_BUILTIN_VEC_PERM \
}
#define TARGET_DEFAULT_TARGET_FLAGS 0

View File

@ -438,7 +438,10 @@ struct gcc_target
/* Return true if vector alignment is reachable (by peeling N
iterations) for the given type. */
bool (* vector_alignment_reachable) (const_tree, bool);
} vectorize;
/* Target builtin that implements vector permute. */
tree (* builtin_vec_perm) (tree, tree*);
} vectorize;
/* The initial value of target_flags. */
int default_target_flags;

View File

@ -1,3 +1,16 @@
2008-08-28 Ira Rosen <irar@il.ibm.com>
* lib/target-supports.exp (check_effective_target_vect_perm): New.
* gcc.dg/vect/slp-perm-1.c: New testcase.
* gcc.dg/vect/slp-perm-2.c: New testcase.
* gcc.dg/vect/slp-perm-3.c: New testcase.
* gcc.dg/vect/slp-perm-4.c: New testcase.
* gcc.dg/vect/slp-perm-5.c: New testcase.
* gcc.dg/vect/slp-perm-6.c: New testcase.
* gcc.dg/vect/slp-perm-7.c: New testcase.
* gcc.dg/vect/slp-perm-8.c: New testcase.
* gcc.dg/vect/slp-perm-9.c: New testcase.
2008-08-27 Manuel Lopez-Ibanez <manu@gcc.gnu.org>
PR 37217

View File

@ -0,0 +1,60 @@
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
#define M00 100
#define M10 216
#define M20 23
#define M01 1322
#define M11 13
#define M21 27271
#define M02 74
#define M12 191
#define M22 500
#define N 16
/* Treat each group of three consecutive inputs as a vector (a, b, c) and
   store its product with the 3x3 coefficient matrix Mrc (row r, column c)
   as three consecutive outputs.  Runs N / 3 iterations, so with N == 16
   the last input and output elements are left untouched.  */
void foo (unsigned int *__restrict__ pInput, unsigned int *__restrict__ pOutput)
{
unsigned int i, a, b, c;
for (i = 0; i < N / 3; i++)
{
a = *pInput++;
b = *pInput++;
c = *pInput++;
/* Every store uses all three loaded values.  */
*pOutput++ = M00 * a + M01 * b + M02 * c;
*pOutput++ = M10 * a + M11 * b + M12 * c;
*pOutput++ = M20 * a + M21 * b + M22 * c;
}
}
/* Fill the input with 0..N-1, run foo, and compare all N output elements
   with the precomputed expected values; abort on any mismatch.  */
int main (int argc, const char* argv[])
{
unsigned int input[N], output[N], i;
/* The last entry is 0 because foo writes only the first 15 elements.  */
unsigned int check_results[N] = {1470, 395, 28271, 5958, 1655, 111653, 10446, 2915, 195035, 14934, 4175, 278417, 19422, 5435, 361799, 0};
for (i = 0; i < N; i++)
{
input[i] = i%256;
if (input[i] > 200)
abort();
output[i] = 0;
}
foo (input, output);
for (i = 0; i < N; i++)
if (output[i] != check_results[i])
abort ();
return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */

View File

@ -0,0 +1,55 @@
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
#define M00 100
#define M10 216
#define M01 1322
#define M11 13
#define M02 74
#define M12 191
#define N 16
/* Treat each pair of consecutive inputs as a vector (a, b) and store its
   product with the 2x2 coefficient matrix Mrc (row r, column c) as two
   consecutive outputs.  Runs N / 2 iterations.  */
void foo (unsigned int *__restrict__ pInput, unsigned int *__restrict__ pOutput)
{
unsigned int i, a, b;
for (i = 0; i < N / 2; i++)
{
a = *pInput++;
b = *pInput++;
/* Both stores use both loaded values.  */
*pOutput++ = M00 * a + M01 * b;
*pOutput++ = M10 * a + M11 * b;
}
}
/* Fill the input with 0..N-1, run foo, and compare all N output elements
   with the precomputed expected values; abort on any mismatch.  */
int main (int argc, const char* argv[])
{
unsigned int input[N], output[N], i;
unsigned int check_results[N] = {1322, 13, 4166, 471, 7010, 929, 9854, 1387, 12698, 1845, 15542, 2303, 18386, 2761, 21230, 3219};
for (i = 0; i < N; i++)
{
input[i] = i%256;
if (input[i] > 200)
abort();
output[i] = 0;
}
foo (input, output);
for (i = 0; i < N; i++)
if (output[i] != check_results[i])
abort ();
return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */

View File

@ -0,0 +1,70 @@
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
#define M00 100
#define M10 216
#define M20 23
#define M30 237
#define M01 1322
#define M11 13
#define M21 27271
#define M31 2280
#define M02 74
#define M12 191
#define M22 500
#define M32 111
#define M03 134
#define M13 117
#define M23 11
#define M33 771
#define N 16
/* Treat each group of four consecutive inputs as a vector (a, b, c, d)
   and store its product with the 4x4 coefficient matrix Mrc (row r,
   column c) as four consecutive outputs.  Runs N / 4 iterations.  */
void foo (unsigned int *__restrict__ pInput, unsigned int *__restrict__ pOutput)
{
unsigned int i, a, b, c, d;
for (i = 0; i < N / 4; i++)
{
a = *pInput++;
b = *pInput++;
c = *pInput++;
d = *pInput++;
/* Every store uses all four loaded values.  */
*pOutput++ = M00 * a + M01 * b + M02 * c + M03 * d;
*pOutput++ = M10 * a + M11 * b + M12 * c + M13 * d;
*pOutput++ = M20 * a + M21 * b + M22 * c + M23 * d;
*pOutput++ = M30 * a + M31 * b + M32 * c + M33 * d;
}
}
/* Fill the input with 0..N-1, run foo, and verify all N outputs against
   the precomputed 4x4 matrix-product results; abort on any mismatch.  */
int main (int argc, const char* argv[])
{
  unsigned int input[N], output[N], i;
  unsigned int check_results[N] = {1872, 746, 28304, 4815, 8392, 2894, 139524, 18411, 14912, 5042, 250744, 32007, 21432, 7190, 361964, 45603};

  for (i = 0; i < N; i++)
    {
      input[i] = i%256;
      if (input[i] > 200)
        abort();
      output[i] = 0;
    }

  foo (input, output);

  /* Compare every element.  The bound used to be N - N, i.e. zero, so the
     loop body never ran and wrong results went undetected.  */
  for (i = 0; i < N; i++)
    if (output[i] != check_results[i])
      abort ();

  return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */

View File

@ -0,0 +1,85 @@
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
#define M00 100
#define M10 216
#define M20 23
#define M30 237
#define M40 437
#define M01 1322
#define M11 13
#define M21 27271
#define M31 2280
#define M41 284
#define M02 74
#define M12 191
#define M22 500
#define M32 111
#define M42 1114
#define M03 134
#define M13 117
#define M23 11
#define M33 771
#define M43 71
#define M04 334
#define M14 147
#define M24 115
#define M34 7716
#define M44 16
#define N 16
/* Treat each group of five consecutive inputs as a vector (a, b, c, d, e)
   and store its product with the 5x5 coefficient matrix Mrc (row r,
   column c) as five consecutive outputs.  Runs N / 5 iterations, so with
   N == 16 the last input and output elements are left untouched.  */
void foo (unsigned int *__restrict__ pInput, unsigned int *__restrict__ pOutput)
{
unsigned int i, a, b, c, d, e;
for (i = 0; i < N / 5; i++)
{
a = *pInput++;
b = *pInput++;
c = *pInput++;
d = *pInput++;
e = *pInput++;
/* Every store uses all five loaded values.  */
*pOutput++ = M00 * a + M01 * b + M02 * c + M03 * d + M04 * e;
*pOutput++ = M10 * a + M11 * b + M12 * c + M13 * d + M14 * e;
*pOutput++ = M20 * a + M21 * b + M22 * c + M23 * d + M24 * e;
*pOutput++ = M30 * a + M31 * b + M32 * c + M33 * d + M34 * e;
*pOutput++ = M40 * a + M41 * b + M42 * c + M43 * d + M44 * e;
}
}
/* Fill the input with 0..N-1, run foo, and verify all N outputs against
   the precomputed 5x5 matrix-product results; abort on any mismatch.  */
int main (int argc, const char* argv[])
{
  unsigned int input[N], output[N], i;
  /* The last entry is 0 because foo writes only the first 15 elements and
     output[] is cleared before the call.  */
  unsigned int check_results[N] = {3208, 1334, 28764, 35679, 2789, 13028, 4754, 168364, 91254, 12399, 22848, 8174, 307964, 146829, 22009, 0};

  for (i = 0; i < N; i++)
    {
      input[i] = i%256;
      if (input[i] > 200)
        abort();
      output[i] = 0;
    }

  foo (input, output);

  /* Compare every element.  The bound used to be N - N, i.e. zero, so the
     loop body never ran and wrong results went undetected.  */
  for (i = 0; i < N; i++)
    if (output[i] != check_results[i])
      abort ();

  return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 0 "vect" } } */
/* { dg-final { scan-tree-dump-times "permutation requires at least three vectors" 1 "vect" { target vect_perm } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */
/* { dg-final { cleanup-tree-dump "vect" } } */

View File

@ -0,0 +1,77 @@
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
#define M00 100
#define M10 216
#define M20 23
#define M01 1322
#define M11 13
#define M21 27271
#define M02 74
#define M12 191
#define M22 500
#define K00 405
#define K10 112
#define K01 4322
#define K11 135
#define N 16
/* Two independent computations in one loop of N / 3 iterations:
   - pInput/pOutput: each group of three inputs (a, b, c) is multiplied by
     the 3x3 matrix Mrc and stored as three outputs;
   - pInput2/pOutput2: each pair of inputs (d, e) is multiplied by the 2x2
     matrix Krc and stored as two outputs.  */
void foo (int *__restrict__ pInput, int *__restrict__ pOutput,
int *__restrict__ pInput2, int *__restrict__ pOutput2)
{
int i, a, b, c, d, e;
for (i = 0; i < N / 3; i++)
{
a = *pInput++;
b = *pInput++;
c = *pInput++;
d = *pInput2++;
e = *pInput2++;
*pOutput++ = M00 * a + M01 * b + M02 * c;
*pOutput++ = M10 * a + M11 * b + M12 * c;
*pOutput++ = M20 * a + M21 * b + M22 * c;
*pOutput2++ = K00 * d + K01 * e;
*pOutput2++ = K10 * d + K11 * e;
}
}
/* Fill both inputs with 0..N-1, run foo, and compare both output arrays
   element by element with the precomputed expected values.  */
int main (int argc, const char* argv[])
{
int input[N], output[N], i;
/* The last entry is 0 because foo writes only the first 15 elements.  */
int check_results[N] = {1470, 395, 28271, 5958, 1655, 111653, 10446, 2915, 195035, 14934, 4175, 278417, 19422, 5435, 361799, 0};
int input2[N], output2[N];
/* foo writes two elements per iteration, 10 in total; the rest stay 0.  */
int check_results2[N] = {4322, 135, 13776, 629, 23230, 1123, 32684, 1617, 42138, 2111, 0, 0, 0, 0, 0, 0};
for (i = 0; i < N; i++)
{
input[i] = i%256;
input2[i] = i%256;
output[i] = 0;
output2[i] = 0;
/* input[i] is i % 256, so this condition never holds.  */
if (input[i] > 256)
abort ();
}
foo (input, output, input2, output2);
for (i = 0; i < N; i++)
if (output[i] != check_results[i] || output2[i] != check_results2[i])
abort ();
return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target vect_perm } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */

View File

@ -0,0 +1,77 @@
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
#define M00 100
#define M10 216
#define M20 23
#define M01 1322
#define M11 13
#define M21 27271
#define M02 74
#define M12 191
#define M22 500
#define K00 405
#define K10 112
#define K01 4322
#define K11 135
#define N 16
/* Two independent computations in one loop of N / 3 iterations:
   - pInput/pOutput: each group of three inputs (a, b, c) is multiplied by
     the 3x3 matrix Mrc and stored as three outputs;
   - pInput2/pOutput2: each pair of inputs (d, e) is scaled element-wise,
     d by K00 and e by K10, and stored in the same order.  */
void foo (int *__restrict__ pInput, int *__restrict__ pOutput,
int *__restrict__ pInput2, int *__restrict__ pOutput2)
{
int i, a, b, c, d, e;
for (i = 0; i < N / 3; i++)
{
a = *pInput++;
b = *pInput++;
c = *pInput++;
d = *pInput2++;
e = *pInput2++;
*pOutput++ = M00 * a + M01 * b + M02 * c;
*pOutput++ = M10 * a + M11 * b + M12 * c;
*pOutput++ = M20 * a + M21 * b + M22 * c;
/* Regular SLP - no permutation required. */
*pOutput2++ = K00 * d;
*pOutput2++ = K10 * e;
}
}
/* Fill both inputs with 0..N-1, run foo, and compare both output arrays
   element by element with the precomputed expected values.  */
int main (int argc, const char* argv[])
{
int input[N], output[N], i;
/* The last entry is 0 because foo writes only the first 15 elements.  */
int check_results[N] = {1470, 395, 28271, 5958, 1655, 111653, 10446, 2915, 195035, 14934, 4175, 278417, 19422, 5435, 361799, 0};
int input2[N], output2[N];
/* foo writes two elements per iteration, 10 in total; the rest stay 0.  */
int check_results2[N] = {0, 112, 810, 336, 1620, 560, 2430, 784, 3240, 1008, 0, 0, 0, 0, 0, 0};
for (i = 0; i < N; i++)
{
input[i] = i%256;
input2[i] = i%256;
output[i] = 0;
output2[i] = 0;
/* input[i] is i % 256, so this condition never holds.  */
if (input[i] > 256)
abort ();
}
foo (input, output, input2, output2);
for (i = 0; i < N; i++)
if (output[i] != check_results[i] || output2[i] != check_results2[i])
abort ();
return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target vect_perm } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */

View File

@ -0,0 +1,76 @@
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
#define M00 100
#define M10 216
#define M20 23
#define M01 1322
#define M11 13
#define M21 27271
#define M02 74
#define M12 191
#define M22 500
#define K00 405
#define K10 112
#define K01 4322
#define K11 135
#define N 16
/* SLP with load permutation and loop-based vectorization.
   In one loop of N / 3 iterations:
   - pInput/pOutput: each group of three inputs (a, b, c) is multiplied by
     the 3x3 matrix Mrc and stored as three outputs;
   - pInput2/pOutput2: a single input d is scaled by K00 and stored, one
     element per iteration.  */
void foo (int *__restrict__ pInput, int *__restrict__ pOutput,
int *__restrict__ pInput2, int *__restrict__ pOutput2)
{
int i, a, b, c, d;
for (i = 0; i < N / 3; i++)
{
a = *pInput++;
b = *pInput++;
c = *pInput++;
d = *pInput2++;
*pOutput++ = M00 * a + M01 * b + M02 * c;
*pOutput++ = M10 * a + M11 * b + M12 * c;
*pOutput++ = M20 * a + M21 * b + M22 * c;
/* Loop-based vectorization. */
*pOutput2++ = K00 * d;
}
}
/* Fill both inputs with 0..N-1, run foo, and compare both output arrays
   element by element with the precomputed expected values.  */
int main (int argc, const char* argv[])
{
int input[N], output[N], i;
/* The last entry is 0 because foo writes only the first 15 elements.  */
int check_results[N] = {1470, 395, 28271, 5958, 1655, 111653, 10446, 2915, 195035, 14934, 4175, 278417, 19422, 5435, 361799, 0};
int input2[N], output2[N];
/* foo writes one element per iteration, 5 in total; the rest stay 0.  */
int check_results2[N] = {0, 405, 810, 1215, 1620, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
for (i = 0; i < N; i++)
{
input[i] = i%256;
input2[i] = i%256;
output[i] = 0;
output2[i] = 0;
if (input[i] > 200)
abort ();
}
foo (input, output, input2, output2);
for (i = 0; i < N; i++)
if (output[i] != check_results[i] || output2[i] != check_results2[i])
abort ();
return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */

View File

@ -0,0 +1,57 @@
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
#define N 200
/* For each group of three consecutive input bytes (a, b, c), store
   a + b + c plus a per-position constant (3, 12, 1) as three consecutive
   output bytes.  Each result is truncated to unsigned char on store.
   Runs N / 3 iterations.  */
void foo (unsigned char *__restrict__ pInput, unsigned char *__restrict__ pOutput)
{
unsigned char i, a, b, c;
for (i = 0; i < N / 3; i++)
{
a = *pInput++;
b = *pInput++;
c = *pInput++;
*pOutput++ = a + b + c + 3;
*pOutput++ = a + b + c + 12;
*pOutput++ = a + b + c + 1;
}
}
/* Fill the input with 0..N-1, compute the expected results at run time,
   run foo, and compare the elements foo wrote; abort on any mismatch.  */
int main (int argc, const char* argv[])
{
/* The loop counter is an unsigned char; N is 200, so it does not wrap.  */
unsigned char input[N], output[N], i;
unsigned char check_results[N];
for (i = 0; i < N; i++)
{
input[i] = i;
output[i] = 0;
/* An unsigned char never exceeds 256, so this condition never holds.  */
if (input[i] > 256)
abort ();
}
/* For group i the inputs are 3i, 3i+1 and 3i+2, whose sum is 9i + 3;
   adding foo's constants 3, 12 and 1 gives the values below, truncated
   to unsigned char just as foo's stores are.  */
for (i = 0; i < N / 3; i++)
{
check_results[3*i] = 9 * i + 6;
check_results[3*i+1] = 9 * i + 15;
check_results[3*i+2] = 9 * i + 4;
}
foo (input, output);
/* Only the first N - (N % 3) elements are written by foo.  */
for (i = 0; i < N - (N % 3); i++)
if (output[i] != check_results[i])
abort ();
return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { target vect_perm } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */

View File

@ -0,0 +1,58 @@
/* { dg-require-effective-target vect_int } */
#include <stdarg.h>
#include <stdio.h>
#include "tree-vect.h"
#define N 200
/* For each group of three consecutive input values (a, b, c), store
   a + b + c plus a per-position constant (3, 12, 1) as three consecutive
   outputs.  Each result is truncated to unsigned short on store.
   Runs N / 3 iterations.  */
void foo (unsigned short *__restrict__ pInput, unsigned short *__restrict__ pOutput)
{
unsigned short i, a, b, c;
for (i = 0; i < N / 3; i++)
{
a = *pInput++;
b = *pInput++;
c = *pInput++;
*pOutput++ = a + b + c + 3;
*pOutput++ = a + b + c + 12;
*pOutput++ = a + b + c + 1;
}
}
/* Fill the input with 0..N-1, compute the expected results at run time,
   run foo, and compare the elements foo wrote; abort on any mismatch.  */
int main (int argc, const char* argv[])
{
unsigned short input[N], output[N], i;
unsigned short check_results[N];
for (i = 0; i < N; i++)
{
input[i] = i;
output[i] = 0;
/* input[i] is below N (200), so this condition never holds.  */
if (input[i] > 256)
abort ();
}
/* For group i the inputs are 3i, 3i+1 and 3i+2, whose sum is 9i + 3;
   adding foo's constants 3, 12 and 1 gives the values below.  */
for (i = 0; i < N / 3; i++)
{
check_results[3*i] = 9 * i + 6;
check_results[3*i+1] = 9 * i + 15;
check_results[3*i+2] = 9 * i + 4;
}
foo (input, output);
/* Only the first N - (N % 3) elements are written by foo.  */
for (i = 0; i < N - (N % 3); i++)
if (output[i] != check_results[i])
abort ();
return 0;
}
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
/* { dg-final { scan-tree-dump-times "permutation requires at least three vectors" 1 "vect" { target vect_perm } } } */
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */
/* { dg-final { cleanup-tree-dump "vect" } } */

View File

@ -1608,6 +1608,28 @@ proc check_effective_target_vect_no_bitwise { } {
return $et_vect_no_bitwise_saved
}
# Return 1 if the target plus current options supports vector permutation,
# 0 otherwise.
#
# This won't change for different subtargets so cache the result in the
# global et_vect_perm_saved.

proc check_effective_target_vect_perm { } {
    # The global declaration must name the variable that is read and set
    # below.  Declaring a different name leaves et_vect_perm_saved local
    # to each call, so the cached branch is never taken.
    global et_vect_perm_saved

    if [info exists et_vect_perm_saved] {
        verbose "check_effective_target_vect_perm: using cached result" 2
    } else {
        set et_vect_perm_saved 0
        if { [istarget powerpc*-*-*]
             || [istarget spu-*-*] } {
            set et_vect_perm_saved 1
        }
    }
    verbose "check_effective_target_vect_perm: returning $et_vect_perm_saved" 2
    return $et_vect_perm_saved
}
# Return 1 if the target plus current options supports a vector
# widening summation of *short* args into *int* result, 0 otherwise.
# A target can also support this widening summation if it can support

View File

@ -486,7 +486,7 @@ vect_analyze_operations (loop_vec_info loop_vinfo)
|| vectorizable_conversion (stmt, NULL, NULL, NULL)
|| vectorizable_operation (stmt, NULL, NULL, NULL)
|| vectorizable_assignment (stmt, NULL, NULL, NULL)
|| vectorizable_load (stmt, NULL, NULL, NULL)
|| vectorizable_load (stmt, NULL, NULL, NULL, NULL)
|| vectorizable_call (stmt, NULL, NULL)
|| vectorizable_store (stmt, NULL, NULL, NULL)
|| vectorizable_condition (stmt, NULL, NULL)
@ -846,6 +846,31 @@ vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
}
/* Return the zero-based position of the data-ref in STMT within the
   interleaving chain that starts at FIRST_STMT.  Return -1 if STMT's
   group does not start at FIRST_STMT, or if STMT is not reached while
   following the DR_GROUP_NEXT_DR links from FIRST_STMT.  */

static int
vect_get_place_in_interleaving_chain (gimple stmt, gimple first_stmt)
{
  gimple cur;
  int place;

  /* A statement from another chain cannot have a place in this one.  */
  if (DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) != first_stmt)
    return -1;

  for (cur = first_stmt, place = 0;
       cur;
       cur = DR_GROUP_NEXT_DR (vinfo_for_stmt (cur)), place++)
    if (cur == stmt)
      return place;

  /* Ran off the end of the chain without meeting STMT.  */
  return -1;
}
/* Function vect_insert_into_interleaving_chain.
Insert DRA into the interleaving chain of DRB according to DRA's INIT. */
@ -2482,7 +2507,7 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
/* Recursively free the memory allocated for the SLP tree rooted at NODE. */
void
static void
vect_free_slp_tree (slp_tree node)
{
if (!node)
@ -2503,6 +2528,17 @@ vect_free_slp_tree (slp_tree node)
}
/* Free the memory allocated for the SLP instance: the SLP tree rooted at
   SLP_INSTANCE_TREE, the load-permutation vector, and the vector of load
   nodes.
   NOTE(review): the INSTANCE object itself is not freed here; confirm
   who owns it.  */
void
vect_free_slp_instance (slp_instance instance)
{
vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
VEC_free (int, heap, SLP_INSTANCE_LOAD_PERMUTATION (instance));
VEC_free (slp_tree, heap, SLP_INSTANCE_LOADS (instance));
}
/* Get the defs for the rhs of STMT (collect them in DEF_STMTS0/1), check that
they are of a legal type and that they match the defs of the first stmt of
the SLP group (stored in FIRST_STMT_...). */
@ -2705,7 +2741,9 @@ static bool
vect_build_slp_tree (loop_vec_info loop_vinfo, slp_tree *node,
unsigned int group_size,
int *inside_cost, int *outside_cost,
int ncopies_for_cost, unsigned int *max_nunits)
int ncopies_for_cost, unsigned int *max_nunits,
VEC (int, heap) **load_permutation,
VEC (slp_tree, heap) **loads)
{
VEC (gimple, heap) *def_stmts0 = VEC_alloc (gimple, heap, group_size);
VEC (gimple, heap) *def_stmts1 = VEC_alloc (gimple, heap, group_size);
@ -2716,7 +2754,6 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, slp_tree *node,
enum tree_code first_stmt_code = 0, rhs_code;
tree first_stmt_def1_type = NULL_TREE, first_stmt_def0_type = NULL_TREE;
tree lhs;
gimple prev_stmt = NULL;
bool stop_recursion = false, need_same_oprnds = false;
tree vectype, scalar_type, first_op1 = NULL_TREE;
unsigned int vectorization_factor = 0, ncopies;
@ -2728,6 +2765,9 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, slp_tree *node,
struct data_reference *first_dr;
bool pattern0 = false, pattern1 = false;
HOST_WIDE_INT dummy;
bool permutation = false;
unsigned int load_place;
gimple first_load;
/* For every stmt in NODE find its def stmt/s. */
for (i = 0; VEC_iterate (gimple, stmts, i, stmt); i++)
@ -2813,8 +2853,8 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, slp_tree *node,
if (icode == CODE_FOR_nothing)
{
if (vect_print_dump_info (REPORT_SLP))
fprintf (vect_dump,
"Build SLP failed: op not supported by target.");
fprintf (vect_dump, "Build SLP failed: "
"op not supported by target.");
return false;
}
optab_op2_mode = insn_data[icode].operand[2].mode;
@ -2878,70 +2918,60 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, slp_tree *node,
else
{
/* Load. */
if (i == 0)
{
/* First stmt of the SLP group should be the first load of
the interleaving loop if data permutation is not allowed.
Check that there is no gap between the loads. */
if (DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) != stmt
|| DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
{
/* FORNOW: data permutations and gaps in loads are not
supported. */
if (vect_print_dump_info (REPORT_SLP))
{
fprintf (vect_dump, "Build SLP failed: strided "
" loads need permutation or have gaps ");
print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
}
return false;
}
first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
if (vect_supportable_dr_alignment (first_dr)
== dr_unaligned_unsupported)
{
if (vect_print_dump_info (REPORT_SLP))
{
fprintf (vect_dump, "Build SLP failed: unsupported "
" unaligned load ");
print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
}
return false;
}
/* Analyze costs (for the first stmt in the group). */
vect_model_load_cost (vinfo_for_stmt (stmt),
ncopies_for_cost, *node);
}
else
{
/* Check that we have consecutive loads from interleaving
chain and that there is no gap between the loads. */
if (DR_GROUP_NEXT_DR (vinfo_for_stmt (prev_stmt)) != stmt
|| DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 1)
{
/* FORNOW: data permutations and gaps in loads are not
supported. */
if (vect_print_dump_info (REPORT_SLP))
{
fprintf (vect_dump, "Build SLP failed: strided "
" loads need permutation or have gaps ");
print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
}
return false;
}
}
prev_stmt = stmt;
/* We stop the tree when we reach a group of loads. */
stop_recursion = true;
continue;
}
} /* Strided access. */
/* FORNOW: Check that there is no gap between the loads. */
if ((DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) == stmt
&& DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
|| (DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) != stmt
&& DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
{
if (vect_print_dump_info (REPORT_SLP))
{
fprintf (vect_dump, "Build SLP failed: strided "
"loads have gaps ");
print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
}
return false;
}
first_load = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt));
if (first_load == stmt)
{
first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
if (vect_supportable_dr_alignment (first_dr)
== dr_unaligned_unsupported)
{
if (vect_print_dump_info (REPORT_SLP))
{
fprintf (vect_dump, "Build SLP failed: unsupported "
"unaligned load ");
print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
}
return false;
}
/* Analyze costs (for the first stmt in the group). */
vect_model_load_cost (vinfo_for_stmt (stmt),
ncopies_for_cost, *node);
}
/* Store the place of this load in the interleaving chain. In
case that permutation is needed we later decide if a specific
permutation is supported. */
load_place = vect_get_place_in_interleaving_chain (stmt,
first_load);
if (load_place != i)
permutation = true;
VEC_safe_push (int, heap, *load_permutation, load_place);
/* We stop the tree when we reach a group of loads. */
stop_recursion = true;
continue;
}
} /* Strided access. */
else
{
if (TREE_CODE_CLASS (rhs_code) == tcc_reference)
@ -2990,7 +3020,15 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, slp_tree *node,
/* Strided loads were reached - stop the recursion. */
if (stop_recursion)
return true;
{
if (permutation)
{
VEC_safe_push (slp_tree, heap, *loads, *node);
*inside_cost += TARG_VEC_PERMUTE_COST * group_size;
}
return true;
}
/* Create SLP_TREE nodes for the definition node/s. */
if (first_stmt_dt0 == vect_loop_def)
@ -3003,8 +3041,8 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, slp_tree *node,
SLP_TREE_OUTSIDE_OF_LOOP_COST (left_node) = 0;
SLP_TREE_INSIDE_OF_LOOP_COST (left_node) = 0;
if (!vect_build_slp_tree (loop_vinfo, &left_node, group_size,
inside_cost, outside_cost,
ncopies_for_cost, max_nunits))
inside_cost, outside_cost, ncopies_for_cost,
max_nunits, load_permutation, loads))
return false;
SLP_TREE_LEFT (*node) = left_node;
@ -3020,8 +3058,8 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, slp_tree *node,
SLP_TREE_OUTSIDE_OF_LOOP_COST (right_node) = 0;
SLP_TREE_INSIDE_OF_LOOP_COST (right_node) = 0;
if (!vect_build_slp_tree (loop_vinfo, &right_node, group_size,
inside_cost, outside_cost,
ncopies_for_cost, max_nunits))
inside_cost, outside_cost, ncopies_for_cost,
max_nunits, load_permutation, loads))
return false;
SLP_TREE_RIGHT (*node) = right_node;
@ -3076,6 +3114,116 @@ vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
}
/* Check if the permutation required by the SLP INSTANCE is supported.
   Reorganize the SLP nodes stored in SLP_INSTANCE_LOADS if needed.

   One entry of SLP_INSTANCE_LOAD_PERMUTATION is sampled for every node of
   SLP_INSTANCE_LOADS (entries 0, GROUP_SIZE, 2 * GROUP_SIZE, ...) and is used
   as the slot of that node in the sorted vector.  The permutation is rejected
   (FALSE is returned and SLP_INSTANCE_LOADS is left as it was) if
     - a node does not belong to the interleaving chain of the first load, or
     - a sampled slot is outside 0..GROUP_SIZE-1, or
     - two nodes claim the same slot, or a slot is claimed by no node.
   Otherwise SLP_INSTANCE_LOADS is replaced by the sorted vector and the
   result of vect_transform_slp_perm_load in analysis-only mode is
   returned.  */
static bool
vect_supported_slp_permutation_p (slp_instance instance)
{
  slp_tree node = VEC_index (slp_tree, SLP_INSTANCE_LOADS (instance), 0);
  gimple stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (node), 0);
  gimple first_load = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt));
  VEC (slp_tree, heap) *sorted_loads = NULL;
  int index;
  slp_tree *tmp_loads = NULL;
  int group_size = SLP_INSTANCE_GROUP_SIZE (instance), i, j;
  slp_tree load;

  /* FORNOW: The only supported loads permutation is loads from the same
     location in all the loads in the node, when the data-refs in
     nodes of LOADS constitute an interleaving chain.
     Sort the nodes according to the order of accesses in the chain.  */
  tmp_loads = (slp_tree *) xmalloc (sizeof (slp_tree) * group_size);

  /* Start with every slot empty so that unclaimed or doubly claimed slots
     can be detected instead of leaving uninitialized pointers behind.  */
  for (i = 0; i < group_size; i++)
    tmp_loads[i] = NULL;

  for (i = 0, j = 0;
       VEC_iterate (int, SLP_INSTANCE_LOAD_PERMUTATION (instance), i, index)
       && VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (instance), j, load);
       i += group_size, j++)
    {
      gimple scalar_stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (load), 0);

      /* Check that the loads are all in the same interleaving chain, and
         that the slot is a valid, not yet used index into TMP_LOADS.  */
      if (DR_GROUP_FIRST_DR (vinfo_for_stmt (scalar_stmt)) != first_load
          || index < 0
          || index >= group_size
          || tmp_loads[index] != NULL)
        {
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "Build SLP failed: unsupported data "
                                  "permutation ");
              print_gimple_stmt (vect_dump, scalar_stmt, 0, TDF_SLIM);
            }

          free (tmp_loads);
          return false;
        }

      tmp_loads[index] = load;
    }

  /* Every slot must have been claimed; otherwise the sorted vector would
     contain a hole.  */
  for (i = 0; i < group_size; i++)
    if (tmp_loads[i] == NULL)
      {
        if (vect_print_dump_info (REPORT_DETAILS))
          {
            fprintf (vect_dump, "Build SLP failed: unsupported data "
                                "permutation ");
            print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
          }

        free (tmp_loads);
        return false;
      }

  sorted_loads = VEC_alloc (slp_tree, heap, group_size);
  for (i = 0; i < group_size; i++)
    VEC_safe_push (slp_tree, heap, sorted_loads, tmp_loads[i]);

  VEC_free (slp_tree, heap, SLP_INSTANCE_LOADS (instance));
  SLP_INSTANCE_LOADS (instance) = sorted_loads;
  free (tmp_loads);

  /* Analysis only: no data-ref chain and no statement iterator are needed.  */
  if (!vect_transform_slp_perm_load (stmt, NULL, NULL,
                                     SLP_INSTANCE_UNROLLING_FACTOR (instance),
                                     instance, true))
    return false;

  return true;
}
/* Check if the required load permutation is supported.
   LOAD_PERMUTATION contains a list of indices of the loads.
   In SLP this permutation is relative to the order of strided stores that are
   the base of the SLP instance.

   SLP_INSTN is the instance being analyzed; FALSE is returned if it is NULL.
   GROUP_SIZE is the size of the store group of the instance.
   Return TRUE only if LOAD_PERMUTATION has the supported shape (see below)
   and vect_supported_slp_permutation_p accepts SLP_INSTN.  */
static bool
vect_supported_load_permutation_p (slp_instance slp_instn, int group_size,
                                   VEC (int, heap) *load_permutation)
{
  int i, j, k, next, head;
  bool supported;

  /* FORNOW: permutations are only supported for loop-aware SLP.  */
  if (!slp_instn)
    return false;

  if (vect_print_dump_info (REPORT_SLP))
    {
      fprintf (vect_dump, "Load permutation ");
      for (i = 0; VEC_iterate (int, load_permutation, i, next); i++)
        fprintf (vect_dump, "%d ", next);
    }

  /* FORNOW: the only supported permutation is 0..01..1.. of length equal to
     GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as
     well.  That is: exactly GROUP_SIZE runs of GROUP_SIZE identical indices,
     every run index within 0..GROUP_SIZE-1 and no index used by two runs.  */
  if (VEC_length (int, load_permutation)
      != (unsigned int) (group_size * group_size))
    return false;

  supported = true;
  for (j = 0; j < group_size && supported; j++)
    {
      head = VEC_index (int, load_permutation, j * group_size);

      /* The index must name a slot of the group.  */
      if (head < 0 || head >= group_size)
        {
          supported = false;
          break;
        }

      /* All the loads of one run must access the same location.  */
      for (k = 1; k < group_size; k++)
        if (VEC_index (int, load_permutation, j * group_size + k) != head)
          {
            supported = false;
            break;
          }

      /* No earlier run may use the same index.  GROUP_SIZE is small, so a
         quadratic scan is cheaper than allocating a bitmap.  */
      for (k = 0; k < j && supported; k++)
        if (VEC_index (int, load_permutation, k * group_size) == head)
          supported = false;
    }

  if (supported && vect_supported_slp_permutation_p (slp_instn))
    return true;

  return false;
}
/* Analyze an SLP instance starting from a group of strided stores. Call
vect_build_slp_tree to build a tree of packed stmts if possible.
Return FALSE if it's impossible to SLP any stmt in the loop. */
@ -3093,8 +3241,11 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, gimple stmt)
bool slp_impossible = false;
int inside_cost = 0, outside_cost = 0, ncopies_for_cost;
unsigned int max_nunits = 0;
scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))));
VEC (int, heap) *load_permutation;
VEC (slp_tree, heap) *loads;
scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (
vinfo_for_stmt (stmt))));
vectype = get_vectype_for_scalar_type (scalar_type);
if (!vectype)
{
@ -3134,16 +3285,21 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, gimple stmt)
factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
GROUP_SIZE / NUNITS otherwise. */
ncopies_for_cost = unrolling_factor * group_size / nunits;
load_permutation = VEC_alloc (int, heap, group_size * group_size);
loads = VEC_alloc (slp_tree, heap, group_size);
/* Build the tree for the SLP instance. */
if (vect_build_slp_tree (loop_vinfo, &node, group_size, &inside_cost,
&outside_cost, ncopies_for_cost, &max_nunits))
&outside_cost, ncopies_for_cost, &max_nunits,
&load_permutation, &loads))
{
/* Create a new SLP instance. */
new_instance = XNEW (struct _slp_instance);
SLP_INSTANCE_TREE (new_instance) = node;
SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
/* Calculate the unrolling factor based on the smallest type. */
/* Calculate the unrolling factor based on the smallest type in the
loop. */
if (max_nunits > nunits)
unrolling_factor = least_common_multiple (max_nunits, group_size)
/ group_size;
@ -3151,6 +3307,27 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, gimple stmt)
SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (new_instance) = outside_cost;
SLP_INSTANCE_INSIDE_OF_LOOP_COST (new_instance) = inside_cost;
SLP_INSTANCE_LOADS (new_instance) = loads;
SLP_INSTANCE_LOAD_PERMUTATION (new_instance) = load_permutation;
if (VEC_length (slp_tree, loads))
{
if (!vect_supported_load_permutation_p (new_instance, group_size,
load_permutation))
{
if (vect_print_dump_info (REPORT_SLP))
{
fprintf (vect_dump, "Build SLP failed: unsupported load "
"permutation ");
print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
}
vect_free_slp_instance (new_instance);
return false;
}
}
else
VEC_free (int, heap, SLP_INSTANCE_LOAD_PERMUTATION (new_instance));
VEC_safe_push (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo),
new_instance);
if (vect_print_dump_info (REPORT_SLP))
@ -3162,7 +3339,9 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, gimple stmt)
/* Failed to SLP. */
/* Free the allocated memory. */
vect_free_slp_tree (node);
VEC_free (int, heap, load_permutation);
VEC_free (slp_tree, heap, loads);
if (slp_impossible)
return false;

View File

@ -47,7 +47,7 @@ along with GCC; see the file COPYING3. If not see
/* Utility functions for the code transformation. */
static bool vect_transform_stmt (gimple, gimple_stmt_iterator *, bool *,
slp_tree);
slp_tree, slp_instance);
static tree vect_create_destination_var (tree, tree);
static tree vect_create_data_ref_ptr
(gimple, struct loop*, tree, tree *, gimple *, bool, bool *);
@ -936,7 +936,7 @@ vect_create_addr_base_for_vector_ref (gimple stmt,
base_offset = force_gimple_operand (base_offset, &seq, false, tmp);
gimple_seq_add_seq (new_stmt_list, seq);
}
/* base + base_offset */
addr_base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (data_ref_base),
data_ref_base, base_offset);
@ -5962,6 +5962,313 @@ vect_transform_strided_load (gimple stmt, VEC(tree,heap) *dr_chain, int size,
}
/* Create NCOPIES permutation statements using the mask MASK_BYTES (by
   building a vector of type MASK_TYPE from it) and two input vectors placed in
   DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for the first copy and
   shifting by STRIDE elements of DR_CHAIN for every copy.
   (STRIDE is the number of vectorized stmts for NODE divided by the number of
   copies).
   VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE, where
   the created stmts must be inserted.

   MASK_ARRAY holds the MASK_NUNITS mask values, each converted to a constant
   of MASK_ELEMENT_TYPE.  BUILTIN_DECL is the permute function that is called
   with (first vector, second vector, mask).  The generated calls are emitted
   relative to STMT through GSI, and their results are based on a destination
   variable of type VECTYPE created from the lhs of STMT.  The last generated
   call (NULL if NCOPIES is not positive) is recorded as the vectorized stmt
   of NEXT_SCALAR_STMT.  */
static inline void
vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
                           int *mask_array, int mask_nunits,
                           tree mask_element_type, tree mask_type,
                           int first_vec_indx, int second_vec_indx,
                           gimple_stmt_iterator *gsi, slp_tree node,
                           tree builtin_decl, tree vectype,
                           VEC(tree,heap) *dr_chain,
                           int ncopies, int vect_stmts_counter)
{
  tree t = NULL_TREE, mask_vec, mask, perm_dest;
  gimple perm_stmt = NULL;
  stmt_vec_info next_stmt_info;
  int i, stride;
  tree first_vec, second_vec, data_ref;
  tree sym;
  ssa_op_iter iter;
  VEC (tree, heap) *params = NULL;

  /* Create a vector mask.  The list is built back to front so that it ends
     up in MASK_ARRAY order.  */
  for (i = mask_nunits - 1; i >= 0; --i)
    t = tree_cons (NULL_TREE, build_int_cst (mask_element_type, mask_array[i]),
                   t);

  mask_vec = build_vector (mask_type, t);
  mask = vect_init_vector (stmt, mask_vec, mask_type, NULL);

  stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;

  /* Initialize the vect stmts of NODE to properly insert the generated
     stmts later.  */
  for (i = VEC_length (gimple, SLP_TREE_VEC_STMTS (node));
       i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
    VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (node), NULL);

  perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
  for (i = 0; i < ncopies; i++)
    {
      first_vec = VEC_index (tree, dr_chain, first_vec_indx);
      second_vec = VEC_index (tree, dr_chain, second_vec_indx);

      /* Build argument list for the vectorized call.  */
      VEC_free (tree, heap, params);
      params = VEC_alloc (tree, heap, 3);
      VEC_quick_push (tree, params, first_vec);
      VEC_quick_push (tree, params, second_vec);
      VEC_quick_push (tree, params, mask);

      /* Generate the permute statement.  */
      perm_stmt = gimple_build_call_vec (builtin_decl, params);
      data_ref = make_ssa_name (perm_dest, perm_stmt);
      gimple_call_set_lhs (perm_stmt, data_ref);
      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
      FOR_EACH_SSA_TREE_OPERAND (sym, perm_stmt, iter, SSA_OP_ALL_VIRTUALS)
        {
          if (TREE_CODE (sym) == SSA_NAME)
            sym = SSA_NAME_VAR (sym);
          mark_sym_for_renaming (sym);
        }

      /* Store the vector statement in NODE.  */
      VEC_replace (gimple, SLP_TREE_VEC_STMTS (node),
                   stride * i + vect_stmts_counter, perm_stmt);

      first_vec_indx += stride;
      second_vec_indx += stride;
    }

  /* The argument vector is owned by this function (it is released and
     re-created for every copy above), so release the last one as well.  */
  VEC_free (tree, heap, params);

  /* Mark the scalar stmt as vectorized.  */
  next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
  STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
}
/* Given FIRST_MASK_ELEMENT - the mask element in element representation,
   return in CURRENT_MASK_ELEMENT its equivalent in target specific
   representation.  Check that the mask is valid and return FALSE if not.
   Return TRUE in NEED_NEXT_VECTOR if the permutation requires to move to
   the next vector, i.e., the current first vector is not needed.

   M is added to FIRST_MASK_ELEMENT to form the element.  MASK_NUNITS is the
   number of elements in one mask.  ONLY_ONE_VEC tells that there is a single
   input vector, so elements that reach into a second vector are invalid.
   INDEX is the position of the element in the mask being built; entries
   0..INDEX-1 of MASK are adjusted in place when the mask moves to the next
   pair of input vectors.  STMT is used for dump output only.

   The function keeps state between calls while one mask is being built.  The
   state is reset when the last element of a mask is processed, when the
   first element (INDEX == 0) of a mask is requested, and when FALSE is
   returned, so an abandoned mask cannot influence the next one.  */
static bool
vect_get_mask_element (gimple stmt, int first_mask_element, int m,
                       int mask_nunits, bool only_one_vec, int index,
                       int *mask, int *current_mask_element,
                       bool *need_next_vector)
{
  int i;
  static int number_of_mask_fixes = 1;
  static bool mask_fixed = false;
  static bool needs_first_vector = false;

  /* A new mask starts here: drop whatever a previous, unfinished mask may
     have left behind.  This is a no-op after a normally completed mask.  */
  if (index == 0)
    {
      number_of_mask_fixes = 1;
      mask_fixed = false;
      needs_first_vector = false;
    }

  /* Convert to target specific representation.  */
  *current_mask_element = first_mask_element + m;
  /* Adjust the value in case it's a mask for second and third vectors.  */
  *current_mask_element -= mask_nunits * (number_of_mask_fixes - 1);

  if (*current_mask_element < mask_nunits)
    needs_first_vector = true;

  /* We have only one input vector to permute but the mask accesses values in
     the next vector as well.  */
  if (only_one_vec && *current_mask_element >= mask_nunits)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "permutation requires at least two vectors ");
          print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
        }

      /* The mask is abandoned; do not leak its state into the next one.  */
      number_of_mask_fixes = 1;
      mask_fixed = false;
      needs_first_vector = false;
      return false;
    }

  /* The mask requires the next vector.  */
  if (*current_mask_element >= mask_nunits * 2)
    {
      if (needs_first_vector || mask_fixed)
        {
          /* We either need the first vector too or have already moved to the
             next vector.  In both cases, this permutation needs three
             vectors.  */
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "permutation requires at "
                                  "least three vectors ");
              print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
            }

          /* The mask is abandoned; do not leak its state into the next
             one.  */
          number_of_mask_fixes = 1;
          mask_fixed = false;
          needs_first_vector = false;
          return false;
        }

      /* We move to the next vector, dropping the first one and working with
         the second and the third - we need to adjust the values of the mask
         accordingly.  */
      *current_mask_element -= mask_nunits * number_of_mask_fixes;

      for (i = 0; i < index; i++)
        mask[i] -= mask_nunits * number_of_mask_fixes;

      (number_of_mask_fixes)++;
      mask_fixed = true;
    }

  *need_next_vector = mask_fixed;

  /* This was the last element of this mask.  Start a new one.  */
  if (index == mask_nunits - 1)
    {
      number_of_mask_fixes = 1;
      mask_fixed = false;
      needs_first_vector = false;
    }

  return true;
}
/* Generate vector permute statements from a list of loads in DR_CHAIN.
   If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
   permute statements for SLP_NODE_INSTANCE.

   STMT supplies the vector type and is the statement relative to which the
   permute statements are generated through GSI.  VF is the vectorization
   factor; the number of copies of every permute statement is VF divided by
   the unrolling factor of SLP_NODE_INSTANCE.  DR_CHAIN and GSI are not used
   when ANALYZE_ONLY is TRUE.

   Return FALSE if the target provides no permute builtin for the vector
   type, if no vector type exists for the mask elements, or if one of the
   required masks is not valid (see vect_get_mask_element).  */
bool
vect_transform_slp_perm_load (gimple stmt, VEC (tree, heap) *dr_chain,
                              gimple_stmt_iterator *gsi, int vf,
                              slp_instance slp_node_instance, bool analyze_only)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  tree mask_element_type = NULL_TREE, mask_type;
  int i, j, k, m, scale, mask_nunits, nunits, vec_index = 0, scalar_index;
  slp_tree node;
  tree vectype = STMT_VINFO_VECTYPE (stmt_info), builtin_decl;
  gimple next_scalar_stmt;
  int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
  int first_mask_element;
  int index, unroll_factor, *mask, current_mask_element, ncopies;
  bool only_one_vec = false, need_next_vector = false;
  int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;

  if (!targetm.vectorize.builtin_vec_perm)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "no builtin for vect permute for ");
          print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
        }

      return false;
    }

  builtin_decl = targetm.vectorize.builtin_vec_perm (vectype,
                                                     &mask_element_type);
  if (!builtin_decl || !mask_element_type)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "no builtin for vect permute for ");
          print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
        }

      return false;
    }

  /* get_vectype_for_scalar_type may fail; without a mask type no mask can
     be built.  */
  mask_type = get_vectype_for_scalar_type (mask_element_type);
  if (!mask_type)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "no vector type for vect permute mask for ");
          print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
        }

      return false;
    }

  mask_nunits = TYPE_VECTOR_SUBPARTS (mask_type);
  mask = (int *) xmalloc (sizeof (int) * mask_nunits);
  nunits = TYPE_VECTOR_SUBPARTS (vectype);
  /* Number of mask elements that describe one data element.  */
  scale = mask_nunits / nunits;
  unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);

  /* The number of vector stmts to generate based only on SLP_NODE_INSTANCE
     unrolling factor.  */
  orig_vec_stmts_num = group_size * unroll_factor / nunits;
  if (orig_vec_stmts_num == 1)
    only_one_vec = true;

  /* Number of copies is determined by the final vectorization factor
     relatively to SLP_NODE_INSTANCE unrolling factor.  */
  ncopies = vf / unroll_factor;

  /* Generate permutation masks for every NODE.  Number of masks for each NODE
     is equal to GROUP_SIZE.
     E.g., we have a group of three nodes with three loads from the same
     location in each node, and the vector size is 4.  I.e., we have a
     a0b0c0a1b1c1... sequence and we need to create the following vectors:
     for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
     for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
     ...

     The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9} (in target
     specific type, e.g., in bytes for Altivec).
     The last mask is illegal since we assume two operands for permute
     operation, and the mask element values can't be outside that range.  Hence,
     the last mask must be converted into {2,5,5,5}.
     For the first two permutations we need the first and the second input
     vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
     we need the second and the third vectors: {b1,c1,a2,b2} and
     {c2,a3,b3,c3}.  */

  for (i = 0;
       VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (slp_node_instance),
                    i, node);
       i++)
    {
      scalar_index = 0;
      index = 0;
      vect_stmts_counter = 0;
      vec_index = 0;
      first_vec_index = vec_index++;
      if (only_one_vec)
        second_vec_index = first_vec_index;
      else
        second_vec_index = vec_index++;

      for (j = 0; j < unroll_factor; j++)
        {
          for (k = 0; k < group_size; k++)
            {
              first_mask_element = (i + j * group_size) * scale;
              for (m = 0; m < scale; m++)
                {
                  if (!vect_get_mask_element (stmt, first_mask_element, m,
                                   mask_nunits, only_one_vec, index, mask,
                                   &current_mask_element, &need_next_vector))
                    {
                      /* Do not leak the mask buffer on failure.  */
                      free (mask);
                      return false;
                    }

                  mask[index++] = current_mask_element;
                }

              /* A complete mask has been collected.  */
              if (index == mask_nunits)
                {
                  index = 0;
                  if (!analyze_only)
                    {
                      if (need_next_vector)
                        {
                          first_vec_index = second_vec_index;
                          second_vec_index = vec_index;
                        }

                      next_scalar_stmt = VEC_index (gimple,
                                SLP_TREE_SCALAR_STMTS (node), scalar_index++);

                      vect_create_mask_and_perm (stmt, next_scalar_stmt,
                               mask, mask_nunits, mask_element_type, mask_type,
                               first_vec_index, second_vec_index, gsi, node,
                               builtin_decl, vectype, dr_chain, ncopies,
                               vect_stmts_counter++);
                    }
                }
            }
        }
    }

  free (mask);
  return true;
}
/* vectorizable_load.
Check if STMT reads a non scalar data-ref (array/pointer/structure) that
@ -5972,7 +6279,7 @@ vect_transform_strided_load (gimple stmt, VEC(tree,heap) *dr_chain, int size,
bool
vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
slp_tree slp_node)
slp_tree slp_node, slp_instance slp_node_instance)
{
tree scalar_dest;
tree vec_dest = NULL;
@ -6008,6 +6315,7 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
struct loop *at_loop;
int vec_num;
bool slp = (slp_node != NULL);
bool slp_perm = false;
enum tree_code code;
/* Multiple types in SLP are handled by creating the appropriate number of
@ -6028,6 +6336,9 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
return false;
}
if (slp && SLP_INSTANCE_LOAD_PERMUTATION (slp_node_instance))
slp_perm = true;
if (!STMT_VINFO_RELEVANT_P (stmt_info))
return false;
@ -6397,33 +6708,47 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
/* Collect vector loads and later create their permutation in
vect_transform_strided_load (). */
if (strided_load)
if (strided_load || slp_perm)
VEC_quick_push (tree, dr_chain, new_temp);
/* Store vector loads in the corresponding SLP_NODE. */
if (slp)
if (slp && !slp_perm)
VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
}
if (slp)
if (slp && !slp_perm)
continue;
if (strided_load)
{
if (!vect_transform_strided_load (stmt, dr_chain, group_size, gsi))
return false;
*vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
VEC_free (tree, heap, dr_chain);
dr_chain = VEC_alloc (tree, heap, group_size);
}
if (slp_perm)
{
if (!vect_transform_slp_perm_load (stmt, dr_chain, gsi,
LOOP_VINFO_VECT_FACTOR (loop_vinfo),
slp_node_instance, false))
{
VEC_free (tree, heap, dr_chain);
return false;
}
}
else
{
if (j == 0)
STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
else
STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
prev_stmt_info = vinfo_for_stmt (new_stmt);
}
{
if (strided_load)
{
if (!vect_transform_strided_load (stmt, dr_chain, group_size, gsi))
return false;
*vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
VEC_free (tree, heap, dr_chain);
dr_chain = VEC_alloc (tree, heap, group_size);
}
else
{
if (j == 0)
STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
else
STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
prev_stmt_info = vinfo_for_stmt (new_stmt);
}
}
}
if (dr_chain)
@ -6690,7 +7015,8 @@ vectorizable_condition (gimple stmt, gimple_stmt_iterator *gsi,
static bool
vect_transform_stmt (gimple stmt, gimple_stmt_iterator *gsi,
bool *strided_store, slp_tree slp_node)
bool *strided_store, slp_tree slp_node,
slp_instance slp_node_instance)
{
bool is_store = false;
gimple vec_stmt = NULL;
@ -6732,7 +7058,8 @@ vect_transform_stmt (gimple stmt, gimple_stmt_iterator *gsi,
break;
case load_vec_info_type:
done = vectorizable_load (stmt, gsi, &vec_stmt, slp_node);
done = vectorizable_load (stmt, gsi, &vec_stmt, slp_node,
slp_node_instance);
gcc_assert (done);
break;
@ -7807,6 +8134,8 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance,
stmt_vec_info stmt_info;
unsigned int vec_stmts_size, nunits, group_size;
tree vectype;
int i;
slp_tree loads_node;
if (!node)
return false;
@ -7830,8 +8159,28 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance,
size. */
vec_stmts_size = (vectorization_factor * group_size) / nunits;
SLP_TREE_VEC_STMTS (node) = VEC_alloc (gimple, heap, vec_stmts_size);
SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
/* In case of load permutation we have to allocate vectorized statements for
all the nodes that participate in that permutation. */
if (SLP_INSTANCE_LOAD_PERMUTATION (instance))
{
for (i = 0;
VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (instance), i, loads_node);
i++)
{
if (!SLP_TREE_VEC_STMTS (loads_node))
{
SLP_TREE_VEC_STMTS (loads_node) = VEC_alloc (gimple, heap,
vec_stmts_size);
SLP_TREE_NUMBER_OF_VEC_STMTS (loads_node) = vec_stmts_size;
}
}
}
if (!SLP_TREE_VEC_STMTS (node))
{
SLP_TREE_VEC_STMTS (node) = VEC_alloc (gimple, heap, vec_stmts_size);
SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
}
if (vect_print_dump_info (REPORT_DETAILS))
{
@ -7840,7 +8189,7 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance,
}
si = gsi_for_stmt (stmt);
is_store = vect_transform_stmt (stmt, &si, &strided_store, node);
is_store = vect_transform_stmt (stmt, &si, &strided_store, node, instance);
if (is_store)
{
if (DR_GROUP_FIRST_DR (stmt_info))
@ -7980,7 +8329,7 @@ vect_transform_loop (loop_vec_info loop_vinfo)
{
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "transform phi.");
vect_transform_stmt (phi, NULL, NULL, NULL);
vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
}
}
@ -8059,7 +8408,7 @@ vect_transform_loop (loop_vec_info loop_vinfo)
fprintf (vect_dump, "transform statement.");
strided_store = false;
is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL);
is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL);
if (is_store)
{
if (STMT_VINFO_STRIDED_ACCESS (stmt_info))

View File

@ -1802,7 +1802,8 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
for (j = 0; VEC_iterate (slp_instance, slp_instances, j, instance); j++)
vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
vect_free_slp_instance (instance);
VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));

View File

@ -105,6 +105,8 @@ typedef struct _slp_tree {
} cost;
} *slp_tree;
DEF_VEC_P(slp_tree);
DEF_VEC_ALLOC_P(slp_tree, heap);
/* SLP instance is a sequence of stmts in a loop that can be packed into
SIMD stmts. */
@ -124,6 +126,13 @@ typedef struct _slp_instance {
int outside_of_loop; /* Statements generated outside loop. */
int inside_of_loop; /* Statements generated inside loop. */
} cost;
/* Loads permutation relatively to the stores, NULL if there is no
permutation. */
VEC (int, heap) *load_permutation;
/* The group of nodes that contain loads of this SLP instance. */
VEC (slp_tree, heap) *loads;
} *slp_instance;
DEF_VEC_P(slp_instance);
@ -135,6 +144,8 @@ DEF_VEC_ALLOC_P(slp_instance, heap);
#define SLP_INSTANCE_UNROLLING_FACTOR(S) (S)->unrolling_factor
#define SLP_INSTANCE_OUTSIDE_OF_LOOP_COST(S) (S)->cost.outside_of_loop
#define SLP_INSTANCE_INSIDE_OF_LOOP_COST(S) (S)->cost.inside_of_loop
#define SLP_INSTANCE_LOAD_PERMUTATION(S) (S)->load_permutation
#define SLP_INSTANCE_LOADS(S) (S)->loads
#define SLP_TREE_LEFT(S) (S)->left
#define SLP_TREE_RIGHT(S) (S)->right
@ -522,6 +533,11 @@ typedef struct _stmt_vec_info {
#define TARG_VEC_STORE_COST 1
#endif
/* Cost of vector permutation. */
#ifndef TARG_VEC_PERMUTE_COST
#define TARG_VEC_PERMUTE_COST 1
#endif
/* The maximum number of intermediate steps required in multi-step type
conversion. */
#define MAX_INTERM_CVT_STEPS 3
@ -700,7 +716,7 @@ extern void free_stmt_vec_info (gimple stmt);
/** In tree-vect-analyze.c **/
/* Driver for analysis stage. */
extern loop_vec_info vect_analyze_loop (struct loop *);
extern void vect_free_slp_tree (slp_tree);
extern void vect_free_slp_instance (slp_instance);
extern loop_vec_info vect_analyze_loop_form (struct loop *);
extern tree vect_get_smallest_scalar_type (gimple, HOST_WIDE_INT *,
HOST_WIDE_INT *);
@ -716,7 +732,7 @@ void vect_pattern_recog (loop_vec_info);
/** In tree-vect-transform.c **/
extern bool vectorizable_load (gimple, gimple_stmt_iterator *, gimple *,
slp_tree);
slp_tree, slp_instance);
extern bool vectorizable_store (gimple, gimple_stmt_iterator *, gimple *,
slp_tree);
extern bool vectorizable_operation (gimple, gimple_stmt_iterator *, gimple *,
@ -742,6 +758,9 @@ extern void vect_model_simple_cost (stmt_vec_info, int, enum vect_def_type *,
extern void vect_model_store_cost (stmt_vec_info, int, enum vect_def_type,
slp_tree);
extern void vect_model_load_cost (stmt_vec_info, int, slp_tree);
extern bool vect_transform_slp_perm_load (gimple, VEC (tree, heap) *,
gimple_stmt_iterator *, int, slp_instance, bool);
/* Driver for transformation stage. */
extern void vect_transform_loop (loop_vec_info);