target.h (struct vectorize): Add new target builtin.
* target.h (struct vectorize): Add new target builtin. * tree-vectorizer.c (destroy_loop_vec_info): Call vect_free_slp_instance instead of vect_free_slp_node. * tree-vectorizer.h (enum slp_load_perm_type): New. (struct _slp_instance): Add new fields. (SLP_INSTANCE_LOAD_PERMUTATION): New. (SLP_INSTANCE_LOADS): New. (vect_free_slp_tree): Remove. (vect_free_slp_instance): Declare. (SLP_TREE_LOADS_PERM_TYPE, TARG_VEC_PERMUTE_COST): New. (vectorizable_load): Add argument. (vect_transform_slp_perm_load): New. * tree-vect-analyze.c (vect_analyze_operations): Add an argument to vectorizable_load. (vect_get_place_in_interleaving_chain): New function. (vect_free_slp_tree): Make static. (vect_free_slp_instance): New function. (vect_build_slp_tree): Add new arguments. Allow load permutations and collect the load location in the interleaving chain. (vect_supported_slp_permutation_p): New function. (vect_supported_load_permutation_p): Likewise. (vect_analyze_slp_instance): In case of loads permutation, call vect_supported_load_permutation_p to check that the permutation is supported. * target-def.h (TARGET_VECTORIZE_BUILTIN_VEC_PERM): New. * tree-vect-transform.c (vect_transform_stmt): Add new argument. (vect_create_mask_and_perm): New function. (vect_get_mask_element, vect_transform_slp_perm_load): Likewise. (vectorizable_load): Add an argument. Don't keep the created vectors statements in the node if permutation is required. Call vect_transform_slp_perm_load to generate the permutation. (vect_transform_stmt): Add new argument. Call vectorizable_load with additional argument. (vect_schedule_slp_instance): In case of loads permutation, allocate vectorized statements structure for all the related SLP nodes. Call vect_transform_stmt with additional argument. (vect_transform_loop): Call vect_transform_stmt with correct arguments. * config/spu/spu.c (spu_builtin_vec_perm): New. (TARGET_VECTORIZE_BUILTIN_VEC_PERM): Redefine. * config/spu/spu.h (TARG_VEC_PERMUTE_COST): Define. 
* config/rs6000/rs6000.c (rs6000_builtin_vec_perm): New. (TARGET_VECTORIZE_BUILTIN_VEC_PERM): Redefine. From-SVN: r139706
This commit is contained in:
parent
b8c41c8ed2
commit
0fca40f598
|
@ -1,3 +1,48 @@
|
|||
2008-08-28 Ira Rosen <irar@il.ibm.com>
|
||||
|
||||
* target.h (struct vectorize): Add new target builtin.
|
||||
* tree-vectorizer.c (destroy_loop_vec_info): Call
|
||||
vect_free_slp_instance instead of vect_free_slp_node.
|
||||
* tree-vectorizer.h (enum slp_load_perm_type): New.
|
||||
(struct _slp_instance): Add new fields.
|
||||
(SLP_INSTANCE_LOAD_PERMUTATION): New.
|
||||
(SLP_INSTANCE_LOADS): New.
|
||||
(vect_free_slp_tree): Remove.
|
||||
(vect_free_slp_instance): Declare.
|
||||
(SLP_TREE_LOADS_PERM_TYPE, TARG_VEC_PERMUTE_COST): New.
|
||||
(vectorizable_load): Add argument.
|
||||
(vect_transform_slp_perm_load): New.
|
||||
* tree-vect-analyze.c (vect_analyze_operations): Add an argument to
|
||||
vectorizable_load.
|
||||
(vect_get_place_in_interleaving_chain): New function.
|
||||
(vect_free_slp_tree): Make static.
|
||||
(vect_free_slp_instance): New function.
|
||||
(vect_build_slp_tree): Add new arguments. Allow load permutations and
|
||||
collect the load location in the interleaving chain.
|
||||
(vect_supported_slp_permutation_p): New function.
|
||||
(vect_supported_load_permutation_p): Likewise.
|
||||
(vect_analyze_slp_instance): In case of loads permutation, call
|
||||
vect_supported_load_permutation_p to check that the permutation is
|
||||
supported.
|
||||
* target-def.h (TARGET_VECTORIZE_BUILTIN_VEC_PERM): New.
|
||||
* tree-vect-transform.c (vect_transform_stmt): Add new argument.
|
||||
(vect_create_mask_and_perm): New function.
|
||||
(vect_get_mask_element, vect_transform_slp_perm_load): Likewise.
|
||||
(vectorizable_load): Add an argument. Don't keep the created vectors
|
||||
statements in the node if permutation is required. Call
|
||||
vect_transform_slp_perm_load to generate the permutation.
|
||||
(vect_transform_stmt): Add new argument. Call vectorizable_load with
|
||||
additional argument.
|
||||
(vect_schedule_slp_instance): In case of loads permutation, allocate
|
||||
vectorized statements structure for all the related SLP nodes. Call
|
||||
vect_transform_stmt with additional argument.
|
||||
(vect_transform_loop): Call vect_transform_stmt with correct arguments.
|
||||
* config/spu/spu.c (spu_builtin_vec_perm): New.
|
||||
(TARGET_VECTORIZE_BUILTIN_VEC_PERM): Redefine.
|
||||
* config/spu/spu.h (TARG_VEC_PERMUTE_COST): Define.
|
||||
* config/rs6000/rs6000.c (rs6000_builtin_vec_perm): New.
|
||||
(TARGET_VECTORIZE_BUILTIN_VEC_PERM): Redefine.
|
||||
|
||||
2008-08-28 Chris Fairles <chris.fairles@gmail.com>
|
||||
|
||||
* gthr-posix.h (__gthread_create, __gthread_join, __gthread_detach,
|
||||
|
|
|
@ -862,6 +862,7 @@ static tree rs6000_builtin_mask_for_load (void);
|
|||
static tree rs6000_builtin_mul_widen_even (tree);
|
||||
static tree rs6000_builtin_mul_widen_odd (tree);
|
||||
static tree rs6000_builtin_conversion (enum tree_code, tree);
|
||||
static tree rs6000_builtin_vec_perm (tree, tree *);
|
||||
|
||||
static void def_builtin (int, const char *, tree, int);
|
||||
static bool rs6000_vector_alignment_reachable (const_tree, bool);
|
||||
|
@ -1138,6 +1139,8 @@ static const char alt_reg_names[][8] =
|
|||
#define TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_ODD rs6000_builtin_mul_widen_odd
|
||||
#undef TARGET_VECTORIZE_BUILTIN_CONVERSION
|
||||
#define TARGET_VECTORIZE_BUILTIN_CONVERSION rs6000_builtin_conversion
|
||||
#undef TARGET_VECTORIZE_BUILTIN_VEC_PERM
|
||||
#define TARGET_VECTORIZE_BUILTIN_VEC_PERM rs6000_builtin_vec_perm
|
||||
|
||||
#undef TARGET_VECTOR_ALIGNMENT_REACHABLE
|
||||
#define TARGET_VECTOR_ALIGNMENT_REACHABLE rs6000_vector_alignment_reachable
|
||||
|
@ -2080,6 +2083,40 @@ rs6000_vector_alignment_reachable (const_tree type ATTRIBUTE_UNUSED, bool is_pac
|
|||
}
|
||||
}
|
||||
|
||||
/* Implement targetm.vectorize.builtin_vec_perm. */
|
||||
tree
|
||||
rs6000_builtin_vec_perm (tree type, tree *mask_element_type)
|
||||
{
|
||||
tree d;
|
||||
|
||||
*mask_element_type = unsigned_char_type_node;
|
||||
|
||||
switch (TYPE_MODE (type))
|
||||
{
|
||||
case V16QImode:
|
||||
d = rs6000_builtin_decls[ALTIVEC_BUILTIN_VPERM_16QI];
|
||||
break;
|
||||
|
||||
case V8HImode:
|
||||
d = rs6000_builtin_decls[ALTIVEC_BUILTIN_VPERM_8HI];
|
||||
break;
|
||||
|
||||
case V4SImode:
|
||||
d = rs6000_builtin_decls[ALTIVEC_BUILTIN_VPERM_4SI];
|
||||
break;
|
||||
|
||||
case V4SFmode:
|
||||
d = rs6000_builtin_decls[ALTIVEC_BUILTIN_VPERM_4SF];
|
||||
break;
|
||||
|
||||
default:
|
||||
return NULL_TREE;
|
||||
}
|
||||
|
||||
gcc_assert (d);
|
||||
return d;
|
||||
}
|
||||
|
||||
/* Handle generic options of the form -mfoo=yes/no.
|
||||
NAME is the option name.
|
||||
VALUE is the option value.
|
||||
|
|
|
@ -137,6 +137,7 @@ static tree spu_builtin_mul_widen_odd (tree);
|
|||
static tree spu_builtin_mask_for_load (void);
|
||||
static int spu_builtin_vectorization_cost (bool);
|
||||
static bool spu_vector_alignment_reachable (const_tree, bool);
|
||||
static tree spu_builtin_vec_perm (tree, tree *);
|
||||
static int spu_sms_res_mii (struct ddg *g);
|
||||
|
||||
extern const char *reg_names[];
|
||||
|
@ -288,6 +289,9 @@ const struct attribute_spec spu_attribute_table[];
|
|||
#undef TARGET_VECTOR_ALIGNMENT_REACHABLE
|
||||
#define TARGET_VECTOR_ALIGNMENT_REACHABLE spu_vector_alignment_reachable
|
||||
|
||||
#undef TARGET_VECTORIZE_BUILTIN_VEC_PERM
|
||||
#define TARGET_VECTORIZE_BUILTIN_VEC_PERM spu_builtin_vec_perm
|
||||
|
||||
#undef TARGET_LIBGCC_CMP_RETURN_MODE
|
||||
#define TARGET_LIBGCC_CMP_RETURN_MODE spu_libgcc_cmp_return_mode
|
||||
|
||||
|
@ -5543,6 +5547,60 @@ spu_vector_alignment_reachable (const_tree type ATTRIBUTE_UNUSED, bool is_packed
|
|||
return true;
|
||||
}
|
||||
|
||||
/* Implement targetm.vectorize.builtin_vec_perm. */
|
||||
tree
|
||||
spu_builtin_vec_perm (tree type, tree *mask_element_type)
|
||||
{
|
||||
struct spu_builtin_description *d;
|
||||
|
||||
*mask_element_type = unsigned_char_type_node;
|
||||
|
||||
switch (TYPE_MODE (type))
|
||||
{
|
||||
case V16QImode:
|
||||
if (TYPE_UNSIGNED (type))
|
||||
d = &spu_builtins[SPU_SHUFFLE_0];
|
||||
else
|
||||
d = &spu_builtins[SPU_SHUFFLE_1];
|
||||
break;
|
||||
|
||||
case V8HImode:
|
||||
if (TYPE_UNSIGNED (type))
|
||||
d = &spu_builtins[SPU_SHUFFLE_2];
|
||||
else
|
||||
d = &spu_builtins[SPU_SHUFFLE_3];
|
||||
break;
|
||||
|
||||
case V4SImode:
|
||||
if (TYPE_UNSIGNED (type))
|
||||
d = &spu_builtins[SPU_SHUFFLE_4];
|
||||
else
|
||||
d = &spu_builtins[SPU_SHUFFLE_5];
|
||||
break;
|
||||
|
||||
case V2DImode:
|
||||
if (TYPE_UNSIGNED (type))
|
||||
d = &spu_builtins[SPU_SHUFFLE_6];
|
||||
else
|
||||
d = &spu_builtins[SPU_SHUFFLE_7];
|
||||
break;
|
||||
|
||||
case V4SFmode:
|
||||
d = &spu_builtins[SPU_SHUFFLE_8];
|
||||
break;
|
||||
|
||||
case V2DFmode:
|
||||
d = &spu_builtins[SPU_SHUFFLE_9];
|
||||
break;
|
||||
|
||||
default:
|
||||
return NULL_TREE;
|
||||
}
|
||||
|
||||
gcc_assert (d);
|
||||
return d->fndecl;
|
||||
}
|
||||
|
||||
/* Count the total number of instructions in each pipe and return the
|
||||
maximum, which is used as the Minimum Iteration Interval (MII)
|
||||
in the modulo scheduler. get_pipe() will return -2, -1, 0, or 1.
|
||||
|
|
|
@ -572,6 +572,11 @@ targetm.resolve_overloaded_builtin = spu_resolve_overloaded_builtin; \
|
|||
#undef TARG_VEC_STORE_COST
|
||||
#define TARG_VEC_STORE_COST 1
|
||||
|
||||
/* Cost of vector permutation. */
|
||||
#ifndef TARG_VEC_PERMUTE_COST
|
||||
#define TARG_VEC_PERMUTE_COST 1
|
||||
#endif
|
||||
|
||||
|
||||
/* Misc */
|
||||
|
||||
|
|
|
@ -364,6 +364,7 @@
|
|||
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST 0
|
||||
#define TARGET_VECTOR_ALIGNMENT_REACHABLE \
|
||||
default_builtin_vector_alignment_reachable
|
||||
#define TARGET_VECTORIZE_BUILTIN_VEC_PERM 0
|
||||
|
||||
#define TARGET_VECTORIZE \
|
||||
{ \
|
||||
|
@ -373,7 +374,8 @@
|
|||
TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_EVEN, \
|
||||
TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_ODD, \
|
||||
TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST, \
|
||||
TARGET_VECTOR_ALIGNMENT_REACHABLE \
|
||||
TARGET_VECTOR_ALIGNMENT_REACHABLE, \
|
||||
TARGET_VECTORIZE_BUILTIN_VEC_PERM \
|
||||
}
|
||||
|
||||
#define TARGET_DEFAULT_TARGET_FLAGS 0
|
||||
|
|
|
@ -438,6 +438,9 @@ struct gcc_target
|
|||
/* Return true if vector alignment is reachable (by peeling N
|
||||
iterations) for the given type. */
|
||||
bool (* vector_alignment_reachable) (const_tree, bool);
|
||||
|
||||
/* Target builtin that implements vector permute. */
|
||||
tree (* builtin_vec_perm) (tree, tree*);
|
||||
} vectorize;
|
||||
|
||||
/* The initial value of target_flags. */
|
||||
|
|
|
@ -1,3 +1,16 @@
|
|||
2008-08-28 Ira Rosen <irar@il.ibm.com>
|
||||
|
||||
* lib/target-supports.exp (check_effective_target_vect_perm): New.
|
||||
* gcc.dg/vect/slp-perm-1.c: New testcase.
|
||||
* gcc.dg/vect/slp-perm-2.c: New testcase.
|
||||
* gcc.dg/vect/slp-perm-3.c: New testcase.
|
||||
* gcc.dg/vect/slp-perm-4.c: New testcase.
|
||||
* gcc.dg/vect/slp-perm-5.c: New testcase.
|
||||
* gcc.dg/vect/slp-perm-6.c: New testcase.
|
||||
* gcc.dg/vect/slp-perm-7.c: New testcase.
|
||||
* gcc.dg/vect/slp-perm-8.c: New testcase.
|
||||
* gcc.dg/vect/slp-perm-9.c: New testcase.
|
||||
|
||||
2008-08-27 Manuel Lopez-Ibanez <manu@gcc.gnu.org>
|
||||
|
||||
PR 37217
|
||||
|
|
|
@ -0,0 +1,60 @@
|
|||
/* { dg-require-effective-target vect_int } */
|
||||
|
||||
#include <stdarg.h>
|
||||
#include <stdio.h>
|
||||
#include "tree-vect.h"
|
||||
|
||||
#define M00 100
|
||||
#define M10 216
|
||||
#define M20 23
|
||||
#define M01 1322
|
||||
#define M11 13
|
||||
#define M21 27271
|
||||
#define M02 74
|
||||
#define M12 191
|
||||
#define M22 500
|
||||
|
||||
#define N 16
|
||||
|
||||
/* Multiply each group of three consecutive inputs (a, b, c) by the constant
   3x3 matrix M and store the three results consecutively: output k of a
   group is Mk0 * a + Mk1 * b + Mk2 * c, so every store uses all three loads
   of its group.  Runs N / 3 iterations, reading and writing 3 * (N / 3)
   elements through pInput and pOutput. */
void foo (unsigned int *__restrict__ pInput, unsigned int *__restrict__ pOutput)
|
||||
{
|
||||
unsigned int i, a, b, c;
|
||||
|
||||
for (i = 0; i < N / 3; i++)
|
||||
{
|
||||
a = *pInput++;
|
||||
b = *pInput++;
|
||||
c = *pInput++;
|
||||
|
||||
*pOutput++ = M00 * a + M01 * b + M02 * c;
|
||||
*pOutput++ = M10 * a + M11 * b + M12 * c;
|
||||
*pOutput++ = M20 * a + M21 * b + M22 * c;
|
||||
}
|
||||
}
|
||||
|
||||
/* Test driver: fill input with 0..N-1, clear output, run foo and abort if
   any of the N outputs differs from check_results.  foo writes only
   3 * (N / 3) elements; the last element stays 0 and check_results holds 0
   there as well.  Returns 0 on success. */
int main (int argc, const char* argv[])
|
||||
{
|
||||
unsigned int input[N], output[N], i;
|
||||
unsigned int check_results[N] = {1470, 395, 28271, 5958, 1655, 111653, 10446, 2915, 195035, 14934, 4175, 278417, 19422, 5435, 361799, 0};
|
||||
|
||||
for (i = 0; i < N; i++)
|
||||
{
|
||||
input[i] = i%256;
|
||||
/* NOTE(review): never true for the values stored above; presumably kept
   only to stop this initialization loop from being vectorized - confirm. */
if (input[i] > 200)
|
||||
abort();
|
||||
output[i] = 0;
|
||||
}
|
||||
|
||||
foo (input, output);
|
||||
|
||||
for (i = 0; i < N; i++)
|
||||
if (output[i] != check_results[i])
|
||||
abort ();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
|
||||
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm } } } */
|
||||
/* { dg-final { cleanup-tree-dump "vect" } } */
|
||||
|
|
@ -0,0 +1,55 @@
|
|||
/* { dg-require-effective-target vect_int } */
|
||||
|
||||
#include <stdarg.h>
|
||||
#include <stdio.h>
|
||||
#include "tree-vect.h"
|
||||
|
||||
#define M00 100
|
||||
#define M10 216
|
||||
#define M01 1322
|
||||
#define M11 13
|
||||
#define M02 74
|
||||
#define M12 191
|
||||
|
||||
#define N 16
|
||||
|
||||
/* Multiply each pair of consecutive inputs (a, b) by the constant 2x2
   matrix M and store the two results consecutively: output k of a pair is
   Mk0 * a + Mk1 * b.  Runs N / 2 iterations, reading and writing
   2 * (N / 2) elements through pInput and pOutput. */
void foo (unsigned int *__restrict__ pInput, unsigned int *__restrict__ pOutput)
|
||||
{
|
||||
unsigned int i, a, b;
|
||||
|
||||
for (i = 0; i < N / 2; i++)
|
||||
{
|
||||
a = *pInput++;
|
||||
b = *pInput++;
|
||||
|
||||
*pOutput++ = M00 * a + M01 * b;
|
||||
*pOutput++ = M10 * a + M11 * b;
|
||||
}
|
||||
}
|
||||
|
||||
/* Test driver: fill input with 0..N-1, clear output, run foo and abort if
   any of the N outputs differs from check_results.  Returns 0 on
   success. */
int main (int argc, const char* argv[])
|
||||
{
|
||||
unsigned int input[N], output[N], i;
|
||||
unsigned int check_results[N] = {1322, 13, 4166, 471, 7010, 929, 9854, 1387, 12698, 1845, 15542, 2303, 18386, 2761, 21230, 3219};
|
||||
|
||||
for (i = 0; i < N; i++)
|
||||
{
|
||||
input[i] = i%256;
|
||||
/* NOTE(review): never true for the values stored above; presumably kept
   only to stop this initialization loop from being vectorized - confirm. */
if (input[i] > 200)
|
||||
abort();
|
||||
output[i] = 0;
|
||||
}
|
||||
|
||||
foo (input, output);
|
||||
|
||||
for (i = 0; i < N; i++)
|
||||
if (output[i] != check_results[i])
|
||||
abort ();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
|
||||
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm } } } */
|
||||
/* { dg-final { cleanup-tree-dump "vect" } } */
|
||||
|
|
@ -0,0 +1,70 @@
|
|||
/* { dg-require-effective-target vect_int } */
|
||||
|
||||
#include <stdarg.h>
|
||||
#include <stdio.h>
|
||||
#include "tree-vect.h"
|
||||
|
||||
#define M00 100
|
||||
#define M10 216
|
||||
#define M20 23
|
||||
#define M30 237
|
||||
#define M01 1322
|
||||
#define M11 13
|
||||
#define M21 27271
|
||||
#define M31 2280
|
||||
#define M02 74
|
||||
#define M12 191
|
||||
#define M22 500
|
||||
#define M32 111
|
||||
#define M03 134
|
||||
#define M13 117
|
||||
#define M23 11
|
||||
#define M33 771
|
||||
|
||||
#define N 16
|
||||
|
||||
/* Multiply each group of four consecutive inputs (a, b, c, d) by the
   constant 4x4 matrix M and store the four results consecutively: output k
   of a group is Mk0 * a + Mk1 * b + Mk2 * c + Mk3 * d.  Runs N / 4
   iterations, reading and writing 4 * (N / 4) elements through pInput and
   pOutput. */
void foo (unsigned int *__restrict__ pInput, unsigned int *__restrict__ pOutput)
|
||||
{
|
||||
unsigned int i, a, b, c, d;
|
||||
|
||||
for (i = 0; i < N / 4; i++)
|
||||
{
|
||||
a = *pInput++;
|
||||
b = *pInput++;
|
||||
c = *pInput++;
|
||||
d = *pInput++;
|
||||
|
||||
*pOutput++ = M00 * a + M01 * b + M02 * c + M03 * d;
|
||||
*pOutput++ = M10 * a + M11 * b + M12 * c + M13 * d;
|
||||
*pOutput++ = M20 * a + M21 * b + M22 * c + M23 * d;
|
||||
*pOutput++ = M30 * a + M31 * b + M32 * c + M33 * d;
|
||||
}
|
||||
}
|
||||
|
||||
int main (int argc, const char* argv[])
|
||||
{
|
||||
unsigned int input[N], output[N], i;
|
||||
unsigned int check_results[N] = {1872, 746, 28304, 4815, 8392, 2894, 139524, 18411, 14912, 5042, 250744, 32007, 21432, 7190, 361964, 45603};
|
||||
|
||||
for (i = 0; i < N; i++)
|
||||
{
|
||||
input[i] = i%256;
|
||||
if (input[i] > 200)
|
||||
abort();
|
||||
output[i] = 0;
|
||||
}
|
||||
|
||||
foo (input, output);
|
||||
|
||||
for (i = 0; i < N - N; i++)
|
||||
if (output[i] != check_results[i])
|
||||
abort ();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
|
||||
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm } } } */
|
||||
/* { dg-final { cleanup-tree-dump "vect" } } */
|
||||
|
||||
|
|
@ -0,0 +1,85 @@
|
|||
/* { dg-require-effective-target vect_int } */
|
||||
|
||||
#include <stdarg.h>
|
||||
#include <stdio.h>
|
||||
#include "tree-vect.h"
|
||||
|
||||
#define M00 100
|
||||
#define M10 216
|
||||
#define M20 23
|
||||
#define M30 237
|
||||
#define M40 437
|
||||
|
||||
#define M01 1322
|
||||
#define M11 13
|
||||
#define M21 27271
|
||||
#define M31 2280
|
||||
#define M41 284
|
||||
|
||||
#define M02 74
|
||||
#define M12 191
|
||||
#define M22 500
|
||||
#define M32 111
|
||||
#define M42 1114
|
||||
|
||||
#define M03 134
|
||||
#define M13 117
|
||||
#define M23 11
|
||||
#define M33 771
|
||||
#define M43 71
|
||||
|
||||
#define M04 334
|
||||
#define M14 147
|
||||
#define M24 115
|
||||
#define M34 7716
|
||||
#define M44 16
|
||||
|
||||
#define N 16
|
||||
|
||||
/* Multiply each group of five consecutive inputs (a, b, c, d, e) by the
   constant 5x5 matrix M and store the five results consecutively: output k
   of a group is Mk0 * a + Mk1 * b + Mk2 * c + Mk3 * d + Mk4 * e.  Runs
   N / 5 iterations, reading and writing 5 * (N / 5) elements through
   pInput and pOutput. */
void foo (unsigned int *__restrict__ pInput, unsigned int *__restrict__ pOutput)
|
||||
{
|
||||
unsigned int i, a, b, c, d, e;
|
||||
|
||||
for (i = 0; i < N / 5; i++)
|
||||
{
|
||||
a = *pInput++;
|
||||
b = *pInput++;
|
||||
c = *pInput++;
|
||||
d = *pInput++;
|
||||
e = *pInput++;
|
||||
|
||||
*pOutput++ = M00 * a + M01 * b + M02 * c + M03 * d + M04 * e;
|
||||
*pOutput++ = M10 * a + M11 * b + M12 * c + M13 * d + M14 * e;
|
||||
*pOutput++ = M20 * a + M21 * b + M22 * c + M23 * d + M24 * e;
|
||||
*pOutput++ = M30 * a + M31 * b + M32 * c + M33 * d + M34 * e;
|
||||
*pOutput++ = M40 * a + M41 * b + M42 * c + M43 * d + M44 * e;
|
||||
}
|
||||
}
|
||||
|
||||
int main (int argc, const char* argv[])
|
||||
{
|
||||
unsigned int input[N], output[N], i;
|
||||
unsigned int check_results[N] = {3208, 1334, 28764, 35679, 2789, 13028, 4754, 168364, 91254, 12399, 22848, 8174, 307964, 146829, 22009, 0};
|
||||
|
||||
for (i = 0; i < N; i++)
|
||||
{
|
||||
input[i] = i%256;
|
||||
if (input[i] > 200)
|
||||
abort();
|
||||
output[i] = 0;
|
||||
}
|
||||
|
||||
foo (input, output);
|
||||
|
||||
for (i = 0; i < N - N; i++)
|
||||
if (output[i] != check_results[i])
|
||||
abort ();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 0 "vect" } } */
|
||||
/* { dg-final { scan-tree-dump-times "permutation requires at least three vectors" 1 "vect" { target vect_perm } } } */
|
||||
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */
|
||||
/* { dg-final { cleanup-tree-dump "vect" } } */
|
||||
|
|
@ -0,0 +1,77 @@
|
|||
/* { dg-require-effective-target vect_int } */
|
||||
|
||||
#include <stdarg.h>
|
||||
#include <stdio.h>
|
||||
#include "tree-vect.h"
|
||||
|
||||
#define M00 100
|
||||
#define M10 216
|
||||
#define M20 23
|
||||
#define M01 1322
|
||||
#define M11 13
|
||||
#define M21 27271
|
||||
#define M02 74
|
||||
#define M12 191
|
||||
#define M22 500
|
||||
|
||||
#define K00 405
|
||||
#define K10 112
|
||||
#define K01 4322
|
||||
#define K11 135
|
||||
|
||||
#define N 16
|
||||
|
||||
/* Two independent computations per iteration, N / 3 iterations in total:
   - three consecutive values (a, b, c) from pInput are multiplied by the
     constant 3x3 matrix M and the three results are stored to pOutput;
   - two consecutive values (d, e) from pInput2 are multiplied by the
     constant 2x2 matrix K and the two results are stored to pOutput2. */
void foo (int *__restrict__ pInput, int *__restrict__ pOutput,
|
||||
int *__restrict__ pInput2, int *__restrict__ pOutput2)
|
||||
{
|
||||
int i, a, b, c, d, e;
|
||||
|
||||
for (i = 0; i < N / 3; i++)
|
||||
{
|
||||
a = *pInput++;
|
||||
b = *pInput++;
|
||||
c = *pInput++;
|
||||
|
||||
d = *pInput2++;
|
||||
e = *pInput2++;
|
||||
|
||||
*pOutput++ = M00 * a + M01 * b + M02 * c;
|
||||
*pOutput++ = M10 * a + M11 * b + M12 * c;
|
||||
*pOutput++ = M20 * a + M21 * b + M22 * c;
|
||||
|
||||
*pOutput2++ = K00 * d + K01 * e;
|
||||
*pOutput2++ = K10 * d + K11 * e;
|
||||
}
|
||||
}
|
||||
|
||||
/* Test driver: fill both input arrays with 0..N-1, clear both output
   arrays, run foo and abort if any of the N elements of output or output2
   differs from check_results or check_results2.  Elements foo does not
   write stay 0 and the expected arrays hold 0 there.  Returns 0 on
   success. */
int main (int argc, const char* argv[])
|
||||
{
|
||||
int input[N], output[N], i;
|
||||
int check_results[N] = {1470, 395, 28271, 5958, 1655, 111653, 10446, 2915, 195035, 14934, 4175, 278417, 19422, 5435, 361799, 0};
|
||||
int input2[N], output2[N];
|
||||
int check_results2[N] = {4322, 135, 13776, 629, 23230, 1123, 32684, 1617, 42138, 2111, 0, 0, 0, 0, 0, 0};
|
||||
|
||||
for (i = 0; i < N; i++)
|
||||
{
|
||||
input[i] = i%256;
|
||||
input2[i] = i%256;
|
||||
output[i] = 0;
|
||||
output2[i] = 0;
|
||||
/* NOTE(review): never true, since input[i] is i%256; presumably kept only
   to stop this initialization loop from being vectorized - confirm. */
if (input[i] > 256)
|
||||
abort ();
|
||||
}
|
||||
|
||||
foo (input, output, input2, output2);
|
||||
|
||||
for (i = 0; i < N; i++)
|
||||
if (output[i] != check_results[i] || output2[i] != check_results2[i])
|
||||
abort ();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
|
||||
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target vect_perm } } } */
|
||||
/* { dg-final { cleanup-tree-dump "vect" } } */
|
||||
|
||||
|
|
@ -0,0 +1,77 @@
|
|||
/* { dg-require-effective-target vect_int } */
|
||||
|
||||
#include <stdarg.h>
|
||||
#include <stdio.h>
|
||||
#include "tree-vect.h"
|
||||
|
||||
#define M00 100
|
||||
#define M10 216
|
||||
#define M20 23
|
||||
#define M01 1322
|
||||
#define M11 13
|
||||
#define M21 27271
|
||||
#define M02 74
|
||||
#define M12 191
|
||||
#define M22 500
|
||||
|
||||
#define K00 405
|
||||
#define K10 112
|
||||
#define K01 4322
|
||||
#define K11 135
|
||||
|
||||
#define N 16
|
||||
|
||||
/* Two independent computations per iteration, N / 3 iterations in total:
   - three consecutive values (a, b, c) from pInput are multiplied by the
     constant 3x3 matrix M and the three results are stored to pOutput;
   - two consecutive values (d, e) from pInput2 are scaled element-wise
     (K00 * d, K10 * e) and stored to pOutput2 in the same order as they
     were loaded. */
void foo (int *__restrict__ pInput, int *__restrict__ pOutput,
|
||||
int *__restrict__ pInput2, int *__restrict__ pOutput2)
|
||||
{
|
||||
int i, a, b, c, d, e;
|
||||
|
||||
for (i = 0; i < N / 3; i++)
|
||||
{
|
||||
a = *pInput++;
|
||||
b = *pInput++;
|
||||
c = *pInput++;
|
||||
|
||||
d = *pInput2++;
|
||||
e = *pInput2++;
|
||||
|
||||
*pOutput++ = M00 * a + M01 * b + M02 * c;
|
||||
*pOutput++ = M10 * a + M11 * b + M12 * c;
|
||||
*pOutput++ = M20 * a + M21 * b + M22 * c;
|
||||
|
||||
/* Regular SLP - no permutation required. */
|
||||
*pOutput2++ = K00 * d;
|
||||
*pOutput2++ = K10 * e;
|
||||
}
|
||||
}
|
||||
|
||||
/* Test driver: fill both input arrays with 0..N-1, clear both output
   arrays, run foo and abort if any of the N elements of output or output2
   differs from check_results or check_results2.  Elements foo does not
   write stay 0 and the expected arrays hold 0 there.  Returns 0 on
   success. */
int main (int argc, const char* argv[])
|
||||
{
|
||||
int input[N], output[N], i;
|
||||
int check_results[N] = {1470, 395, 28271, 5958, 1655, 111653, 10446, 2915, 195035, 14934, 4175, 278417, 19422, 5435, 361799, 0};
|
||||
int input2[N], output2[N];
|
||||
int check_results2[N] = {0, 112, 810, 336, 1620, 560, 2430, 784, 3240, 1008, 0, 0, 0, 0, 0, 0};
|
||||
|
||||
for (i = 0; i < N; i++)
|
||||
{
|
||||
input[i] = i%256;
|
||||
input2[i] = i%256;
|
||||
output[i] = 0;
|
||||
output2[i] = 0;
|
||||
/* NOTE(review): never true, since input[i] is i%256; presumably kept only
   to stop this initialization loop from being vectorized - confirm. */
if (input[i] > 256)
|
||||
abort ();
|
||||
}
|
||||
|
||||
foo (input, output, input2, output2);
|
||||
|
||||
for (i = 0; i < N; i++)
|
||||
if (output[i] != check_results[i] || output2[i] != check_results2[i])
|
||||
abort ();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
|
||||
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target vect_perm } } } */
|
||||
/* { dg-final { cleanup-tree-dump "vect" } } */
|
||||
|
|
@ -0,0 +1,76 @@
|
|||
/* { dg-require-effective-target vect_int } */
|
||||
|
||||
#include <stdarg.h>
|
||||
#include <stdio.h>
|
||||
#include "tree-vect.h"
|
||||
|
||||
#define M00 100
|
||||
#define M10 216
|
||||
#define M20 23
|
||||
#define M01 1322
|
||||
#define M11 13
|
||||
#define M21 27271
|
||||
#define M02 74
|
||||
#define M12 191
|
||||
#define M22 500
|
||||
|
||||
#define K00 405
|
||||
#define K10 112
|
||||
#define K01 4322
|
||||
#define K11 135
|
||||
|
||||
#define N 16
|
||||
|
||||
/* SLP with load permutation and loop-based vectorization.

   Two independent computations per iteration, N / 3 iterations in total:
   - three consecutive values (a, b, c) from pInput are multiplied by the
     constant 3x3 matrix M and the three results are stored to pOutput;
   - a single value d from pInput2 is scaled by K00 and stored to
     pOutput2. */
|
||||
void foo (int *__restrict__ pInput, int *__restrict__ pOutput,
|
||||
int *__restrict__ pInput2, int *__restrict__ pOutput2)
|
||||
{
|
||||
int i, a, b, c, d;
|
||||
|
||||
for (i = 0; i < N / 3; i++)
|
||||
{
|
||||
a = *pInput++;
|
||||
b = *pInput++;
|
||||
c = *pInput++;
|
||||
d = *pInput2++;
|
||||
|
||||
*pOutput++ = M00 * a + M01 * b + M02 * c;
|
||||
*pOutput++ = M10 * a + M11 * b + M12 * c;
|
||||
*pOutput++ = M20 * a + M21 * b + M22 * c;
|
||||
|
||||
/* Loop-based vectorization. */
|
||||
*pOutput2++ = K00 * d;
|
||||
}
|
||||
}
|
||||
|
||||
/* Test driver: fill both input arrays with 0..N-1, clear both output
   arrays, run foo and abort if any of the N elements of output or output2
   differs from check_results or check_results2.  Elements foo does not
   write stay 0 and the expected arrays hold 0 there.  Returns 0 on
   success. */
int main (int argc, const char* argv[])
|
||||
{
|
||||
int input[N], output[N], i;
|
||||
int check_results[N] = {1470, 395, 28271, 5958, 1655, 111653, 10446, 2915, 195035, 14934, 4175, 278417, 19422, 5435, 361799, 0};
|
||||
int input2[N], output2[N];
|
||||
int check_results2[N] = {0, 405, 810, 1215, 1620, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
|
||||
|
||||
for (i = 0; i < N; i++)
|
||||
{
|
||||
input[i] = i%256;
|
||||
input2[i] = i%256;
|
||||
output[i] = 0;
|
||||
output2[i] = 0;
|
||||
/* NOTE(review): never true for the values stored above; presumably kept
   only to stop this initialization loop from being vectorized - confirm. */
if (input[i] > 200)
|
||||
abort ();
|
||||
}
|
||||
|
||||
foo (input, output, input2, output2);
|
||||
|
||||
for (i = 0; i < N; i++)
|
||||
if (output[i] != check_results[i] || output2[i] != check_results2[i])
|
||||
abort ();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */
|
||||
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm } } } */
|
||||
/* { dg-final { cleanup-tree-dump "vect" } } */
|
||||
|
||||
|
|
@ -0,0 +1,57 @@
|
|||
/* { dg-require-effective-target vect_int } */
|
||||
|
||||
#include <stdarg.h>
|
||||
#include <stdio.h>
|
||||
#include "tree-vect.h"
|
||||
|
||||
#define N 200
|
||||
|
||||
/* For each group of three consecutive inputs (a, b, c) store three values
   a + b + c + 3, a + b + c + 12 and a + b + c + 1, each converted to
   unsigned char on the store.  Runs N / 3 iterations; the loop counter is
   an unsigned char as well. */
void foo (unsigned char *__restrict__ pInput, unsigned char *__restrict__ pOutput)
|
||||
{
|
||||
unsigned char i, a, b, c;
|
||||
|
||||
for (i = 0; i < N / 3; i++)
|
||||
{
|
||||
a = *pInput++;
|
||||
b = *pInput++;
|
||||
c = *pInput++;
|
||||
|
||||
*pOutput++ = a + b + c + 3;
|
||||
*pOutput++ = a + b + c + 12;
|
||||
*pOutput++ = a + b + c + 1;
|
||||
}
|
||||
}
|
||||
|
||||
/* Test driver: set input[i] = i, clear output, compute the expected values
   at run time, run foo and abort if any of the first N - (N % 3) outputs
   differs.  With input[k] = k, group i sums to 9 * i + 3, which gives the
   9 * i + 6 / + 15 / + 4 expectations below (stored as unsigned char).
   Returns 0 on success. */
int main (int argc, const char* argv[])
|
||||
{
|
||||
unsigned char input[N], output[N], i;
|
||||
unsigned char check_results[N];
|
||||
|
||||
for (i = 0; i < N; i++)
|
||||
{
|
||||
input[i] = i;
|
||||
output[i] = 0;
|
||||
/* NOTE(review): an unsigned char compared against 256; presumably kept
   only to influence vectorization of this loop - confirm. */
if (input[i] > 256)
|
||||
abort ();
|
||||
}
|
||||
|
||||
for (i = 0; i < N / 3; i++)
|
||||
{
|
||||
check_results[3*i] = 9 * i + 6;
|
||||
check_results[3*i+1] = 9 * i + 15;
|
||||
check_results[3*i+2] = 9 * i + 4;
|
||||
}
|
||||
|
||||
foo (input, output);
|
||||
|
||||
/* Only the 3 * (N / 3) elements written by foo have expected values. */
for (i = 0; i < N - (N % 3); i++)
|
||||
if (output[i] != check_results[i])
|
||||
abort ();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { target vect_perm } } } */
|
||||
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm } } } */
|
||||
/* { dg-final { cleanup-tree-dump "vect" } } */
|
||||
|
|
@ -0,0 +1,58 @@
|
|||
/* { dg-require-effective-target vect_int } */
|
||||
|
||||
#include <stdarg.h>
|
||||
#include <stdio.h>
|
||||
#include "tree-vect.h"
|
||||
|
||||
#define N 200
|
||||
|
||||
/* For each group of three consecutive inputs (a, b, c) store three values
   a + b + c + 3, a + b + c + 12 and a + b + c + 1, each converted to
   unsigned short on the store.  Runs N / 3 iterations. */
void foo (unsigned short *__restrict__ pInput, unsigned short *__restrict__ pOutput)
|
||||
{
|
||||
unsigned short i, a, b, c;
|
||||
|
||||
for (i = 0; i < N / 3; i++)
|
||||
{
|
||||
a = *pInput++;
|
||||
b = *pInput++;
|
||||
c = *pInput++;
|
||||
|
||||
*pOutput++ = a + b + c + 3;
|
||||
*pOutput++ = a + b + c + 12;
|
||||
*pOutput++ = a + b + c + 1;
|
||||
}
|
||||
}
|
||||
|
||||
/* Test driver: set input[i] = i, clear output, compute the expected values
   at run time, run foo and abort if any of the first N - (N % 3) outputs
   differs.  With input[k] = k, group i sums to 9 * i + 3, which gives the
   9 * i + 6 / + 15 / + 4 expectations below.  Returns 0 on success. */
int main (int argc, const char* argv[])
|
||||
{
|
||||
unsigned short input[N], output[N], i;
|
||||
unsigned short check_results[N];
|
||||
|
||||
for (i = 0; i < N; i++)
|
||||
{
|
||||
input[i] = i;
|
||||
output[i] = 0;
|
||||
/* NOTE(review): never true, since input[i] is i and i < N; presumably kept
   only to stop this initialization loop from being vectorized - confirm. */
if (input[i] > 256)
|
||||
abort ();
|
||||
}
|
||||
|
||||
for (i = 0; i < N / 3; i++)
|
||||
{
|
||||
check_results[3*i] = 9 * i + 6;
|
||||
check_results[3*i+1] = 9 * i + 15;
|
||||
check_results[3*i+2] = 9 * i + 4;
|
||||
}
|
||||
|
||||
foo (input, output);
|
||||
|
||||
/* Only the 3 * (N / 3) elements written by foo have expected values. */
for (i = 0; i < N - (N % 3); i++)
|
||||
if (output[i] != check_results[i])
|
||||
abort ();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
|
||||
/* { dg-final { scan-tree-dump-times "permutation requires at least three vectors" 1 "vect" { target vect_perm } } } */
|
||||
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */
|
||||
/* { dg-final { cleanup-tree-dump "vect" } } */
|
||||
|
|
@ -1608,6 +1608,28 @@ proc check_effective_target_vect_no_bitwise { } {
|
|||
return $et_vect_no_bitwise_saved
|
||||
}
|
||||
|
||||
# Return 1 if the target plus current options supports vector permutation,
# 0 otherwise.
#
# This won't change for different subtargets so cache the result.  The
# cache lives in the global et_vect_perm_saved; it must be declared global
# under exactly that name, otherwise the variable is local to each call
# and the cached branch is never taken.

proc check_effective_target_vect_perm { } {
    global et_vect_perm_saved

    if [info exists et_vect_perm_saved] {
        verbose "check_effective_target_vect_perm: using cached result" 2
    } else {
        set et_vect_perm_saved 0
        if { [istarget powerpc*-*-*]
             || [istarget spu-*-*] } {
            set et_vect_perm_saved 1
        }
    }
    verbose "check_effective_target_vect_perm: returning $et_vect_perm_saved" 2
    return $et_vect_perm_saved
}
|
||||
|
||||
|
||||
# Return 1 if the target plus current options supports a vector
|
||||
# widening summation of *short* args into *int* result, 0 otherwise.
|
||||
# A target can also support this widening summation if it can support
|
||||
|
|
|
@ -486,7 +486,7 @@ vect_analyze_operations (loop_vec_info loop_vinfo)
|
|||
|| vectorizable_conversion (stmt, NULL, NULL, NULL)
|
||||
|| vectorizable_operation (stmt, NULL, NULL, NULL)
|
||||
|| vectorizable_assignment (stmt, NULL, NULL, NULL)
|
||||
|| vectorizable_load (stmt, NULL, NULL, NULL)
|
||||
|| vectorizable_load (stmt, NULL, NULL, NULL, NULL)
|
||||
|| vectorizable_call (stmt, NULL, NULL)
|
||||
|| vectorizable_store (stmt, NULL, NULL, NULL)
|
||||
|| vectorizable_condition (stmt, NULL, NULL)
|
||||
|
@ -846,6 +846,31 @@ vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
|
|||
}
|
||||
|
||||
|
||||
/* Find the place of the data-ref in STMT in the interleaving chain that starts
|
||||
from FIRST_STMT. Return -1 if the data-ref is not a part of the chain. */
|
||||
|
||||
static int
|
||||
vect_get_place_in_interleaving_chain (gimple stmt, gimple first_stmt)
|
||||
{
|
||||
gimple next_stmt = first_stmt;
|
||||
int result = 0;
|
||||
|
||||
if (first_stmt != DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)))
|
||||
return -1;
|
||||
|
||||
while (next_stmt && next_stmt != stmt)
|
||||
{
|
||||
result++;
|
||||
next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt));
|
||||
}
|
||||
|
||||
if (next_stmt)
|
||||
return result;
|
||||
else
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
/* Function vect_insert_into_interleaving_chain.
|
||||
|
||||
Insert DRA into the interleaving chain of DRB according to DRA's INIT. */
|
||||
|
@ -2482,7 +2507,7 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
|
|||
|
||||
/* Recursively free the memory allocated for the SLP tree rooted at NODE. */
|
||||
|
||||
void
|
||||
static void
|
||||
vect_free_slp_tree (slp_tree node)
|
||||
{
|
||||
if (!node)
|
||||
|
@ -2503,6 +2528,17 @@ vect_free_slp_tree (slp_tree node)
|
|||
}
|
||||
|
||||
|
||||
/* Free the memory allocated for the SLP instance INSTANCE: first the SLP
   tree at SLP_INSTANCE_TREE (via vect_free_slp_tree), then the
   SLP_INSTANCE_LOAD_PERMUTATION and SLP_INSTANCE_LOADS vectors.
   NOTE(review): the instance object itself is not released here - confirm
   who owns it. */
|
||||
|
||||
void
|
||||
vect_free_slp_instance (slp_instance instance)
|
||||
{
|
||||
vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
|
||||
VEC_free (int, heap, SLP_INSTANCE_LOAD_PERMUTATION (instance));
|
||||
VEC_free (slp_tree, heap, SLP_INSTANCE_LOADS (instance));
|
||||
}
|
||||
|
||||
|
||||
/* Get the defs for the rhs of STMT (collect them in DEF_STMTS0/1), check that
|
||||
they are of a legal type and that they match the defs of the first stmt of
|
||||
the SLP group (stored in FIRST_STMT_...). */
|
||||
|
@ -2705,7 +2741,9 @@ static bool
|
|||
vect_build_slp_tree (loop_vec_info loop_vinfo, slp_tree *node,
|
||||
unsigned int group_size,
|
||||
int *inside_cost, int *outside_cost,
|
||||
int ncopies_for_cost, unsigned int *max_nunits)
|
||||
int ncopies_for_cost, unsigned int *max_nunits,
|
||||
VEC (int, heap) **load_permutation,
|
||||
VEC (slp_tree, heap) **loads)
|
||||
{
|
||||
VEC (gimple, heap) *def_stmts0 = VEC_alloc (gimple, heap, group_size);
|
||||
VEC (gimple, heap) *def_stmts1 = VEC_alloc (gimple, heap, group_size);
|
||||
|
@ -2716,7 +2754,6 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, slp_tree *node,
|
|||
enum tree_code first_stmt_code = 0, rhs_code;
|
||||
tree first_stmt_def1_type = NULL_TREE, first_stmt_def0_type = NULL_TREE;
|
||||
tree lhs;
|
||||
gimple prev_stmt = NULL;
|
||||
bool stop_recursion = false, need_same_oprnds = false;
|
||||
tree vectype, scalar_type, first_op1 = NULL_TREE;
|
||||
unsigned int vectorization_factor = 0, ncopies;
|
||||
|
@ -2728,6 +2765,9 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, slp_tree *node,
|
|||
struct data_reference *first_dr;
|
||||
bool pattern0 = false, pattern1 = false;
|
||||
HOST_WIDE_INT dummy;
|
||||
bool permutation = false;
|
||||
unsigned int load_place;
|
||||
gimple first_load;
|
||||
|
||||
/* For every stmt in NODE find its def stmt/s. */
|
||||
for (i = 0; VEC_iterate (gimple, stmts, i, stmt); i++)
|
||||
|
@ -2813,8 +2853,8 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, slp_tree *node,
|
|||
if (icode == CODE_FOR_nothing)
|
||||
{
|
||||
if (vect_print_dump_info (REPORT_SLP))
|
||||
fprintf (vect_dump,
|
||||
"Build SLP failed: op not supported by target.");
|
||||
fprintf (vect_dump, "Build SLP failed: "
|
||||
"op not supported by target.");
|
||||
return false;
|
||||
}
|
||||
optab_op2_mode = insn_data[icode].operand[2].mode;
|
||||
|
@ -2878,26 +2918,26 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, slp_tree *node,
|
|||
else
|
||||
{
|
||||
/* Load. */
|
||||
if (i == 0)
|
||||
/* FORNOW: Check that there is no gap between the loads. */
|
||||
if ((DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) == stmt
|
||||
&& DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
|
||||
|| (DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) != stmt
|
||||
&& DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
|
||||
{
|
||||
/* First stmt of the SLP group should be the first load of
|
||||
the interleaving loop if data permutation is not allowed.
|
||||
Check that there is no gap between the loads. */
|
||||
if (DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) != stmt
|
||||
|| DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
|
||||
{
|
||||
/* FORNOW: data permutations and gaps in loads are not
|
||||
supported. */
|
||||
if (vect_print_dump_info (REPORT_SLP))
|
||||
{
|
||||
fprintf (vect_dump, "Build SLP failed: strided "
|
||||
" loads need permutation or have gaps ");
|
||||
"loads have gaps ");
|
||||
print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
first_load = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt));
|
||||
|
||||
if (first_load == stmt)
|
||||
{
|
||||
first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
|
||||
if (vect_supportable_dr_alignment (first_dr)
|
||||
== dr_unaligned_unsupported)
|
||||
|
@ -2916,26 +2956,16 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, slp_tree *node,
|
|||
vect_model_load_cost (vinfo_for_stmt (stmt),
|
||||
ncopies_for_cost, *node);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Check that we have consecutive loads from interleaving
|
||||
chain and that there is no gap between the loads. */
|
||||
if (DR_GROUP_NEXT_DR (vinfo_for_stmt (prev_stmt)) != stmt
|
||||
|| DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 1)
|
||||
{
|
||||
/* FORNOW: data permutations and gaps in loads are not
|
||||
supported. */
|
||||
if (vect_print_dump_info (REPORT_SLP))
|
||||
{
|
||||
fprintf (vect_dump, "Build SLP failed: strided "
|
||||
" loads need permutation or have gaps ");
|
||||
print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
prev_stmt = stmt;
|
||||
/* Store the place of this load in the interleaving chain. In
|
||||
case that permutation is needed we later decide if a specific
|
||||
permutation is supported. */
|
||||
load_place = vect_get_place_in_interleaving_chain (stmt,
|
||||
first_load);
|
||||
if (load_place != i)
|
||||
permutation = true;
|
||||
|
||||
VEC_safe_push (int, heap, *load_permutation, load_place);
|
||||
|
||||
/* We stop the tree when we reach a group of loads. */
|
||||
stop_recursion = true;
|
||||
|
@ -2990,7 +3020,15 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, slp_tree *node,
|
|||
|
||||
/* Strided loads were reached - stop the recursion. */
|
||||
if (stop_recursion)
|
||||
{
|
||||
if (permutation)
|
||||
{
|
||||
VEC_safe_push (slp_tree, heap, *loads, *node);
|
||||
*inside_cost += TARG_VEC_PERMUTE_COST * group_size;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Create SLP_TREE nodes for the definition node/s. */
|
||||
if (first_stmt_dt0 == vect_loop_def)
|
||||
|
@ -3003,8 +3041,8 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, slp_tree *node,
|
|||
SLP_TREE_OUTSIDE_OF_LOOP_COST (left_node) = 0;
|
||||
SLP_TREE_INSIDE_OF_LOOP_COST (left_node) = 0;
|
||||
if (!vect_build_slp_tree (loop_vinfo, &left_node, group_size,
|
||||
inside_cost, outside_cost,
|
||||
ncopies_for_cost, max_nunits))
|
||||
inside_cost, outside_cost, ncopies_for_cost,
|
||||
max_nunits, load_permutation, loads))
|
||||
return false;
|
||||
|
||||
SLP_TREE_LEFT (*node) = left_node;
|
||||
|
@ -3020,8 +3058,8 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, slp_tree *node,
|
|||
SLP_TREE_OUTSIDE_OF_LOOP_COST (right_node) = 0;
|
||||
SLP_TREE_INSIDE_OF_LOOP_COST (right_node) = 0;
|
||||
if (!vect_build_slp_tree (loop_vinfo, &right_node, group_size,
|
||||
inside_cost, outside_cost,
|
||||
ncopies_for_cost, max_nunits))
|
||||
inside_cost, outside_cost, ncopies_for_cost,
|
||||
max_nunits, load_permutation, loads))
|
||||
return false;
|
||||
|
||||
SLP_TREE_RIGHT (*node) = right_node;
|
||||
|
@ -3076,6 +3114,116 @@ vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j)
|
|||
}
|
||||
|
||||
|
||||
/* Check if the permutation required by the SLP INSTANCE is supported.
|
||||
Reorganize the SLP nodes stored in SLP_INSTANCE_LOADS if needed. */
|
||||
|
||||
static bool
|
||||
vect_supported_slp_permutation_p (slp_instance instance)
|
||||
{
|
||||
slp_tree node = VEC_index (slp_tree, SLP_INSTANCE_LOADS (instance), 0);
|
||||
gimple stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (node), 0);
|
||||
gimple first_load = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt));
|
||||
VEC (slp_tree, heap) *sorted_loads = NULL;
|
||||
int index;
|
||||
slp_tree *tmp_loads = NULL;
|
||||
int group_size = SLP_INSTANCE_GROUP_SIZE (instance), i, j;
|
||||
slp_tree load;
|
||||
|
||||
/* FORNOW: The only supported loads permutation is loads from the same
|
||||
location in all the loads in the node, when the data-refs in
|
||||
nodes of LOADS constitute an interleaving chain.
|
||||
Sort the nodes according to the order of accesses in the chain. */
|
||||
tmp_loads = (slp_tree *) xmalloc (sizeof (slp_tree) * group_size);
|
||||
for (i = 0, j = 0;
|
||||
VEC_iterate (int, SLP_INSTANCE_LOAD_PERMUTATION (instance), i, index)
|
||||
&& VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (instance), j, load);
|
||||
i += group_size, j++)
|
||||
{
|
||||
gimple scalar_stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (load), 0);
|
||||
/* Check that the loads are all in the same interleaving chain. */
|
||||
if (DR_GROUP_FIRST_DR (vinfo_for_stmt (scalar_stmt)) != first_load)
|
||||
{
|
||||
if (vect_print_dump_info (REPORT_DETAILS))
|
||||
{
|
||||
fprintf (vect_dump, "Build SLP failed: unsupported data "
|
||||
"permutation ");
|
||||
print_gimple_stmt (vect_dump, scalar_stmt, 0, TDF_SLIM);
|
||||
}
|
||||
|
||||
free (tmp_loads);
|
||||
return false;
|
||||
}
|
||||
|
||||
tmp_loads[index] = load;
|
||||
}
|
||||
|
||||
sorted_loads = VEC_alloc (slp_tree, heap, group_size);
|
||||
for (i = 0; i < group_size; i++)
|
||||
VEC_safe_push (slp_tree, heap, sorted_loads, tmp_loads[i]);
|
||||
|
||||
VEC_free (slp_tree, heap, SLP_INSTANCE_LOADS (instance));
|
||||
SLP_INSTANCE_LOADS (instance) = sorted_loads;
|
||||
free (tmp_loads);
|
||||
|
||||
if (!vect_transform_slp_perm_load (stmt, NULL, NULL,
|
||||
SLP_INSTANCE_UNROLLING_FACTOR (instance),
|
||||
instance, true))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
/* Check if the required load permutation is supported.
|
||||
LOAD_PERMUTATION contains a list of indices of the loads.
|
||||
In SLP this permutation is relative to the order of strided stores that are
|
||||
the base of the SLP instance. */
|
||||
|
||||
static bool
|
||||
vect_supported_load_permutation_p (slp_instance slp_instn, int group_size,
|
||||
VEC (int, heap) *load_permutation)
|
||||
{
|
||||
int i = 0, j, prev = -1, next, k;
|
||||
bool supported;
|
||||
|
||||
/* FORNOW: permutations are only supported for loop-aware SLP. */
|
||||
if (!slp_instn)
|
||||
return false;
|
||||
|
||||
if (vect_print_dump_info (REPORT_SLP))
|
||||
{
|
||||
fprintf (vect_dump, "Load permutation ");
|
||||
for (i = 0; VEC_iterate (int, load_permutation, i, next); i++)
|
||||
fprintf (vect_dump, "%d ", next);
|
||||
}
|
||||
|
||||
/* FORNOW: the only supported permutation is 0..01..1.. of length equal to
|
||||
GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as
|
||||
well. */
|
||||
supported = true;
|
||||
for (j = 0; j < group_size; j++)
|
||||
{
|
||||
for (i = j * group_size, k = 0;
|
||||
VEC_iterate (int, load_permutation, i, next) && k < group_size;
|
||||
i++, k++)
|
||||
{
|
||||
if (i != j * group_size && next != prev)
|
||||
{
|
||||
supported = false;
|
||||
break;
|
||||
}
|
||||
|
||||
prev = next;
|
||||
}
|
||||
}
|
||||
|
||||
if (supported && i == group_size * group_size
|
||||
&& vect_supported_slp_permutation_p (slp_instn))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Analyze an SLP instance starting from a group of strided stores. Call
|
||||
vect_build_slp_tree to build a tree of packed stmts if possible.
|
||||
Return FALSE if it's impossible to SLP any stmt in the loop. */
|
||||
|
@ -3093,8 +3241,11 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, gimple stmt)
|
|||
bool slp_impossible = false;
|
||||
int inside_cost = 0, outside_cost = 0, ncopies_for_cost;
|
||||
unsigned int max_nunits = 0;
|
||||
VEC (int, heap) *load_permutation;
|
||||
VEC (slp_tree, heap) *loads;
|
||||
|
||||
scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))));
|
||||
scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (
|
||||
vinfo_for_stmt (stmt))));
|
||||
vectype = get_vectype_for_scalar_type (scalar_type);
|
||||
if (!vectype)
|
||||
{
|
||||
|
@ -3135,15 +3286,20 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, gimple stmt)
|
|||
GROUP_SIZE / NUNITS otherwise. */
|
||||
ncopies_for_cost = unrolling_factor * group_size / nunits;
|
||||
|
||||
load_permutation = VEC_alloc (int, heap, group_size * group_size);
|
||||
loads = VEC_alloc (slp_tree, heap, group_size);
|
||||
|
||||
/* Build the tree for the SLP instance. */
|
||||
if (vect_build_slp_tree (loop_vinfo, &node, group_size, &inside_cost,
|
||||
&outside_cost, ncopies_for_cost, &max_nunits))
|
||||
&outside_cost, ncopies_for_cost, &max_nunits,
|
||||
&load_permutation, &loads))
|
||||
{
|
||||
/* Create a new SLP instance. */
|
||||
new_instance = XNEW (struct _slp_instance);
|
||||
SLP_INSTANCE_TREE (new_instance) = node;
|
||||
SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
|
||||
/* Calculate the unrolling factor based on the smallest type. */
|
||||
/* Calculate the unrolling factor based on the smallest type in the
|
||||
loop. */
|
||||
if (max_nunits > nunits)
|
||||
unrolling_factor = least_common_multiple (max_nunits, group_size)
|
||||
/ group_size;
|
||||
|
@ -3151,6 +3307,27 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, gimple stmt)
|
|||
SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
|
||||
SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (new_instance) = outside_cost;
|
||||
SLP_INSTANCE_INSIDE_OF_LOOP_COST (new_instance) = inside_cost;
|
||||
SLP_INSTANCE_LOADS (new_instance) = loads;
|
||||
SLP_INSTANCE_LOAD_PERMUTATION (new_instance) = load_permutation;
|
||||
if (VEC_length (slp_tree, loads))
|
||||
{
|
||||
if (!vect_supported_load_permutation_p (new_instance, group_size,
|
||||
load_permutation))
|
||||
{
|
||||
if (vect_print_dump_info (REPORT_SLP))
|
||||
{
|
||||
fprintf (vect_dump, "Build SLP failed: unsupported load "
|
||||
"permutation ");
|
||||
print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
|
||||
}
|
||||
|
||||
vect_free_slp_instance (new_instance);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else
|
||||
VEC_free (int, heap, SLP_INSTANCE_LOAD_PERMUTATION (new_instance));
|
||||
|
||||
VEC_safe_push (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo),
|
||||
new_instance);
|
||||
if (vect_print_dump_info (REPORT_SLP))
|
||||
|
@ -3162,6 +3339,8 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, gimple stmt)
|
|||
/* Failed to SLP. */
|
||||
/* Free the allocated memory. */
|
||||
vect_free_slp_tree (node);
|
||||
VEC_free (int, heap, load_permutation);
|
||||
VEC_free (slp_tree, heap, loads);
|
||||
|
||||
if (slp_impossible)
|
||||
return false;
|
||||
|
|
|
@ -47,7 +47,7 @@ along with GCC; see the file COPYING3. If not see
|
|||
|
||||
/* Utility functions for the code transformation. */
|
||||
static bool vect_transform_stmt (gimple, gimple_stmt_iterator *, bool *,
|
||||
slp_tree);
|
||||
slp_tree, slp_instance);
|
||||
static tree vect_create_destination_var (tree, tree);
|
||||
static tree vect_create_data_ref_ptr
|
||||
(gimple, struct loop*, tree, tree *, gimple *, bool, bool *);
|
||||
|
@ -5962,6 +5962,313 @@ vect_transform_strided_load (gimple stmt, VEC(tree,heap) *dr_chain, int size,
|
|||
}
|
||||
|
||||
|
||||
/* Create NCOPIES permutation statements using the mask MASK_BYTES (by
|
||||
building a vector of type MASK_TYPE from it) and two input vectors placed in
|
||||
DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for the first copy and
|
||||
shifting by STRIDE elements of DR_CHAIN for every copy.
|
||||
(STRIDE is the number of vectorized stmts for NODE divided by the number of
|
||||
copies).
|
||||
VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE, where
|
||||
the created stmts must be inserted. */
|
||||
|
||||
static inline void
|
||||
vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
|
||||
int *mask_array, int mask_nunits,
|
||||
tree mask_element_type, tree mask_type,
|
||||
int first_vec_indx, int second_vec_indx,
|
||||
gimple_stmt_iterator *gsi, slp_tree node,
|
||||
tree builtin_decl, tree vectype,
|
||||
VEC(tree,heap) *dr_chain,
|
||||
int ncopies, int vect_stmts_counter)
|
||||
{
|
||||
tree t = NULL_TREE, mask_vec, mask, perm_dest;
|
||||
gimple perm_stmt = NULL;
|
||||
stmt_vec_info next_stmt_info;
|
||||
int i, group_size, stride, dr_chain_size;
|
||||
tree first_vec, second_vec, data_ref;
|
||||
tree sym;
|
||||
ssa_op_iter iter;
|
||||
VEC (tree, heap) *params = NULL;
|
||||
|
||||
/* Create a vector mask. */
|
||||
for (i = mask_nunits - 1; i >= 0; --i)
|
||||
t = tree_cons (NULL_TREE, build_int_cst (mask_element_type, mask_array[i]),
|
||||
t);
|
||||
|
||||
mask_vec = build_vector (mask_type, t);
|
||||
mask = vect_init_vector (stmt, mask_vec, mask_type, NULL);
|
||||
|
||||
group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (node));
|
||||
stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies;
|
||||
dr_chain_size = VEC_length (tree, dr_chain);
|
||||
|
||||
/* Initialize the vect stmts of NODE to properly insert the generated
|
||||
stmts later. */
|
||||
for (i = VEC_length (gimple, SLP_TREE_VEC_STMTS (node));
|
||||
i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++)
|
||||
VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (node), NULL);
|
||||
|
||||
perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype);
|
||||
for (i = 0; i < ncopies; i++)
|
||||
{
|
||||
first_vec = VEC_index (tree, dr_chain, first_vec_indx);
|
||||
second_vec = VEC_index (tree, dr_chain, second_vec_indx);
|
||||
|
||||
/* Build argument list for the vectorized call. */
|
||||
VEC_free (tree, heap, params);
|
||||
params = VEC_alloc (tree, heap, 3);
|
||||
VEC_quick_push (tree, params, first_vec);
|
||||
VEC_quick_push (tree, params, second_vec);
|
||||
VEC_quick_push (tree, params, mask);
|
||||
|
||||
/* Generate the permute statement. */
|
||||
perm_stmt = gimple_build_call_vec (builtin_decl, params);
|
||||
data_ref = make_ssa_name (perm_dest, perm_stmt);
|
||||
gimple_call_set_lhs (perm_stmt, data_ref);
|
||||
vect_finish_stmt_generation (stmt, perm_stmt, gsi);
|
||||
FOR_EACH_SSA_TREE_OPERAND (sym, perm_stmt, iter, SSA_OP_ALL_VIRTUALS)
|
||||
{
|
||||
if (TREE_CODE (sym) == SSA_NAME)
|
||||
sym = SSA_NAME_VAR (sym);
|
||||
mark_sym_for_renaming (sym);
|
||||
}
|
||||
|
||||
/* Store the vector statement in NODE. */
|
||||
VEC_replace (gimple, SLP_TREE_VEC_STMTS (node),
|
||||
stride * i + vect_stmts_counter, perm_stmt);
|
||||
|
||||
first_vec_indx += stride;
|
||||
second_vec_indx += stride;
|
||||
}
|
||||
|
||||
/* Mark the scalar stmt as vectorized. */
|
||||
next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
|
||||
STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
|
||||
}
|
||||
|
||||
|
||||
/* Given FIRST_MASK_ELEMENT - the mask element in element representation,
|
||||
return in CURRENT_MASK_ELEMENT its equivalent in target specific
|
||||
representation. Check that the mask is valid and return FALSE if not.
|
||||
Return TRUE in NEED_NEXT_VECTOR if the permutation requires to move to
|
||||
the next vector, i.e., the current first vector is not needed. */
|
||||
|
||||
static bool
|
||||
vect_get_mask_element (gimple stmt, int first_mask_element, int m,
|
||||
int mask_nunits, bool only_one_vec, int index,
|
||||
int *mask, int *current_mask_element,
|
||||
bool *need_next_vector)
|
||||
{
|
||||
int i;
|
||||
static int number_of_mask_fixes = 1;
|
||||
static bool mask_fixed = false;
|
||||
static bool needs_first_vector = false;
|
||||
|
||||
/* Convert to target specific representation. */
|
||||
*current_mask_element = first_mask_element + m;
|
||||
/* Adjust the value in case it's a mask for second and third vectors. */
|
||||
*current_mask_element -= mask_nunits * (number_of_mask_fixes - 1);
|
||||
|
||||
if (*current_mask_element < mask_nunits)
|
||||
needs_first_vector = true;
|
||||
|
||||
/* We have only one input vector to permute but the mask accesses values in
|
||||
the next vector as well. */
|
||||
if (only_one_vec && *current_mask_element >= mask_nunits)
|
||||
{
|
||||
if (vect_print_dump_info (REPORT_DETAILS))
|
||||
{
|
||||
fprintf (vect_dump, "permutation requires at least two vectors ");
|
||||
print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/* The mask requires the next vector. */
|
||||
if (*current_mask_element >= mask_nunits * 2)
|
||||
{
|
||||
if (needs_first_vector || mask_fixed)
|
||||
{
|
||||
/* We either need the first vector too or have already moved to the
|
||||
next vector. In both cases, this permutation needs three
|
||||
vectors. */
|
||||
if (vect_print_dump_info (REPORT_DETAILS))
|
||||
{
|
||||
fprintf (vect_dump, "permutation requires at "
|
||||
"least three vectors ");
|
||||
print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/* We move to the next vector, dropping the first one and working with
|
||||
the second and the third - we need to adjust the values of the mask
|
||||
accordingly. */
|
||||
*current_mask_element -= mask_nunits * number_of_mask_fixes;
|
||||
|
||||
for (i = 0; i < index; i++)
|
||||
mask[i] -= mask_nunits * number_of_mask_fixes;
|
||||
|
||||
(number_of_mask_fixes)++;
|
||||
mask_fixed = true;
|
||||
}
|
||||
|
||||
*need_next_vector = mask_fixed;
|
||||
|
||||
/* This was the last element of this mask. Start a new one. */
|
||||
if (index == mask_nunits - 1)
|
||||
{
|
||||
number_of_mask_fixes = 1;
|
||||
mask_fixed = false;
|
||||
needs_first_vector = false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
/* Generate vector permute statements from a list of loads in DR_CHAIN.
|
||||
If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
|
||||
permute statements for SLP_NODE_INSTANCE. */
|
||||
bool
|
||||
vect_transform_slp_perm_load (gimple stmt, VEC (tree, heap) *dr_chain,
|
||||
gimple_stmt_iterator *gsi, int vf,
|
||||
slp_instance slp_node_instance, bool analyze_only)
|
||||
{
|
||||
stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
|
||||
tree mask_element_type = NULL_TREE, mask_type;
|
||||
int i, j, k, m, scale, mask_nunits, nunits, vec_index = 0, scalar_index;
|
||||
slp_tree node;
|
||||
tree vectype = STMT_VINFO_VECTYPE (stmt_info), builtin_decl;
|
||||
gimple next_scalar_stmt;
|
||||
int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
|
||||
int first_mask_element;
|
||||
int index, unroll_factor, *mask, current_mask_element, ncopies;
|
||||
bool only_one_vec = false, need_next_vector = false;
|
||||
int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter;
|
||||
|
||||
if (!targetm.vectorize.builtin_vec_perm)
|
||||
{
|
||||
if (vect_print_dump_info (REPORT_DETAILS))
|
||||
{
|
||||
fprintf (vect_dump, "no builtin for vect permute for ");
|
||||
print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
builtin_decl = targetm.vectorize.builtin_vec_perm (vectype,
|
||||
&mask_element_type);
|
||||
if (!builtin_decl || !mask_element_type)
|
||||
{
|
||||
if (vect_print_dump_info (REPORT_DETAILS))
|
||||
{
|
||||
fprintf (vect_dump, "no builtin for vect permute for ");
|
||||
print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
mask_type = get_vectype_for_scalar_type (mask_element_type);
|
||||
mask_nunits = TYPE_VECTOR_SUBPARTS (mask_type);
|
||||
mask = (int *) xmalloc (sizeof (int) * mask_nunits);
|
||||
nunits = TYPE_VECTOR_SUBPARTS (vectype);
|
||||
scale = mask_nunits / nunits;
|
||||
unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
|
||||
|
||||
/* The number of vector stmts to generate based only on SLP_NODE_INSTANCE
|
||||
unrolling factor. */
|
||||
orig_vec_stmts_num = group_size *
|
||||
SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
|
||||
if (orig_vec_stmts_num == 1)
|
||||
only_one_vec = true;
|
||||
|
||||
/* Number of copies is determined by the final vectorization factor
|
||||
relatively to SLP_NODE_INSTANCE unrolling factor. */
|
||||
ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
|
||||
|
||||
/* Generate permutation masks for every NODE. Number of masks for each NODE
|
||||
is equal to GROUP_SIZE.
|
||||
E.g., we have a group of three nodes with three loads from the same
|
||||
location in each node, and the vector size is 4. I.e., we have a
|
||||
a0b0c0a1b1c1... sequence and we need to create the following vectors:
|
||||
for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
|
||||
for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
|
||||
...
|
||||
|
||||
The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9} (in target
|
||||
scpecific type, e.g., in bytes for Altivec.
|
||||
The last mask is illegal since we assume two operands for permute
|
||||
operation, and the mask element values can't be outside that range. Hence,
|
||||
the last mask must be converted into {2,5,5,5}.
|
||||
For the first two permutations we need the first and the second input
|
||||
vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
|
||||
we need the second and the third vectors: {b1,c1,a2,b2} and
|
||||
{c2,a3,b3,c3}. */
|
||||
|
||||
for (i = 0;
|
||||
VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (slp_node_instance),
|
||||
i, node);
|
||||
i++)
|
||||
{
|
||||
scalar_index = 0;
|
||||
index = 0;
|
||||
vect_stmts_counter = 0;
|
||||
vec_index = 0;
|
||||
first_vec_index = vec_index++;
|
||||
if (only_one_vec)
|
||||
second_vec_index = first_vec_index;
|
||||
else
|
||||
second_vec_index = vec_index++;
|
||||
|
||||
for (j = 0; j < unroll_factor; j++)
|
||||
{
|
||||
for (k = 0; k < group_size; k++)
|
||||
{
|
||||
first_mask_element = (i + j * group_size) * scale;
|
||||
for (m = 0; m < scale; m++)
|
||||
{
|
||||
if (!vect_get_mask_element (stmt, first_mask_element, m,
|
||||
mask_nunits, only_one_vec, index, mask,
|
||||
¤t_mask_element, &need_next_vector))
|
||||
return false;
|
||||
|
||||
mask[index++] = current_mask_element;
|
||||
}
|
||||
|
||||
if (index == mask_nunits)
|
||||
{
|
||||
index = 0;
|
||||
if (!analyze_only)
|
||||
{
|
||||
if (need_next_vector)
|
||||
{
|
||||
first_vec_index = second_vec_index;
|
||||
second_vec_index = vec_index;
|
||||
}
|
||||
|
||||
next_scalar_stmt = VEC_index (gimple,
|
||||
SLP_TREE_SCALAR_STMTS (node), scalar_index++);
|
||||
|
||||
vect_create_mask_and_perm (stmt, next_scalar_stmt,
|
||||
mask, mask_nunits, mask_element_type, mask_type,
|
||||
first_vec_index, second_vec_index, gsi, node,
|
||||
builtin_decl, vectype, dr_chain, ncopies,
|
||||
vect_stmts_counter++);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
free (mask);
|
||||
return true;
|
||||
}
|
||||
|
||||
/* vectorizable_load.
|
||||
|
||||
Check if STMT reads a non scalar data-ref (array/pointer/structure) that
|
||||
|
@ -5972,7 +6279,7 @@ vect_transform_strided_load (gimple stmt, VEC(tree,heap) *dr_chain, int size,
|
|||
|
||||
bool
|
||||
vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
|
||||
slp_tree slp_node)
|
||||
slp_tree slp_node, slp_instance slp_node_instance)
|
||||
{
|
||||
tree scalar_dest;
|
||||
tree vec_dest = NULL;
|
||||
|
@ -6008,6 +6315,7 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
|
|||
struct loop *at_loop;
|
||||
int vec_num;
|
||||
bool slp = (slp_node != NULL);
|
||||
bool slp_perm = false;
|
||||
enum tree_code code;
|
||||
|
||||
/* Multiple types in SLP are handled by creating the appropriate number of
|
||||
|
@ -6028,6 +6336,9 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
|
|||
return false;
|
||||
}
|
||||
|
||||
if (slp && SLP_INSTANCE_LOAD_PERMUTATION (slp_node_instance))
|
||||
slp_perm = true;
|
||||
|
||||
if (!STMT_VINFO_RELEVANT_P (stmt_info))
|
||||
return false;
|
||||
|
||||
|
@ -6397,21 +6708,34 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
|
|||
|
||||
/* Collect vector loads and later create their permutation in
|
||||
vect_transform_strided_load (). */
|
||||
if (strided_load)
|
||||
if (strided_load || slp_perm)
|
||||
VEC_quick_push (tree, dr_chain, new_temp);
|
||||
|
||||
/* Store vector loads in the corresponding SLP_NODE. */
|
||||
if (slp)
|
||||
if (slp && !slp_perm)
|
||||
VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
|
||||
}
|
||||
|
||||
if (slp)
|
||||
if (slp && !slp_perm)
|
||||
continue;
|
||||
|
||||
if (slp_perm)
|
||||
{
|
||||
if (!vect_transform_slp_perm_load (stmt, dr_chain, gsi,
|
||||
LOOP_VINFO_VECT_FACTOR (loop_vinfo),
|
||||
slp_node_instance, false))
|
||||
{
|
||||
VEC_free (tree, heap, dr_chain);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (strided_load)
|
||||
{
|
||||
if (!vect_transform_strided_load (stmt, dr_chain, group_size, gsi))
|
||||
return false;
|
||||
|
||||
*vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
|
||||
VEC_free (tree, heap, dr_chain);
|
||||
dr_chain = VEC_alloc (tree, heap, group_size);
|
||||
|
@ -6425,6 +6749,7 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
|
|||
prev_stmt_info = vinfo_for_stmt (new_stmt);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (dr_chain)
|
||||
VEC_free (tree, heap, dr_chain);
|
||||
|
@ -6690,7 +7015,8 @@ vectorizable_condition (gimple stmt, gimple_stmt_iterator *gsi,
|
|||
|
||||
static bool
|
||||
vect_transform_stmt (gimple stmt, gimple_stmt_iterator *gsi,
|
||||
bool *strided_store, slp_tree slp_node)
|
||||
bool *strided_store, slp_tree slp_node,
|
||||
slp_instance slp_node_instance)
|
||||
{
|
||||
bool is_store = false;
|
||||
gimple vec_stmt = NULL;
|
||||
|
@ -6732,7 +7058,8 @@ vect_transform_stmt (gimple stmt, gimple_stmt_iterator *gsi,
|
|||
break;
|
||||
|
||||
case load_vec_info_type:
|
||||
done = vectorizable_load (stmt, gsi, &vec_stmt, slp_node);
|
||||
done = vectorizable_load (stmt, gsi, &vec_stmt, slp_node,
|
||||
slp_node_instance);
|
||||
gcc_assert (done);
|
||||
break;
|
||||
|
||||
|
@ -7807,6 +8134,8 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance,
|
|||
stmt_vec_info stmt_info;
|
||||
unsigned int vec_stmts_size, nunits, group_size;
|
||||
tree vectype;
|
||||
int i;
|
||||
slp_tree loads_node;
|
||||
|
||||
if (!node)
|
||||
return false;
|
||||
|
@ -7830,8 +8159,28 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance,
|
|||
size. */
|
||||
vec_stmts_size = (vectorization_factor * group_size) / nunits;
|
||||
|
||||
/* In case of load permutation we have to allocate vectorized statements for
|
||||
all the nodes that participate in that permutation. */
|
||||
if (SLP_INSTANCE_LOAD_PERMUTATION (instance))
|
||||
{
|
||||
for (i = 0;
|
||||
VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (instance), i, loads_node);
|
||||
i++)
|
||||
{
|
||||
if (!SLP_TREE_VEC_STMTS (loads_node))
|
||||
{
|
||||
SLP_TREE_VEC_STMTS (loads_node) = VEC_alloc (gimple, heap,
|
||||
vec_stmts_size);
|
||||
SLP_TREE_NUMBER_OF_VEC_STMTS (loads_node) = vec_stmts_size;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!SLP_TREE_VEC_STMTS (node))
|
||||
{
|
||||
SLP_TREE_VEC_STMTS (node) = VEC_alloc (gimple, heap, vec_stmts_size);
|
||||
SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size;
|
||||
}
|
||||
|
||||
if (vect_print_dump_info (REPORT_DETAILS))
|
||||
{
|
||||
|
@ -7840,7 +8189,7 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance,
|
|||
}
|
||||
|
||||
si = gsi_for_stmt (stmt);
|
||||
is_store = vect_transform_stmt (stmt, &si, &strided_store, node);
|
||||
is_store = vect_transform_stmt (stmt, &si, &strided_store, node, instance);
|
||||
if (is_store)
|
||||
{
|
||||
if (DR_GROUP_FIRST_DR (stmt_info))
|
||||
|
@ -7980,7 +8329,7 @@ vect_transform_loop (loop_vec_info loop_vinfo)
|
|||
{
|
||||
if (vect_print_dump_info (REPORT_DETAILS))
|
||||
fprintf (vect_dump, "transform phi.");
|
||||
vect_transform_stmt (phi, NULL, NULL, NULL);
|
||||
vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -8059,7 +8408,7 @@ vect_transform_loop (loop_vec_info loop_vinfo)
|
|||
fprintf (vect_dump, "transform statement.");
|
||||
|
||||
strided_store = false;
|
||||
is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL);
|
||||
is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL);
|
||||
if (is_store)
|
||||
{
|
||||
if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
|
||||
|
|
|
@ -1802,7 +1802,8 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
|
|||
VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
|
||||
slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
|
||||
for (j = 0; VEC_iterate (slp_instance, slp_instances, j, instance); j++)
|
||||
vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
|
||||
vect_free_slp_instance (instance);
|
||||
|
||||
VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
|
||||
VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
|
||||
|
||||
|
|
|
@ -105,6 +105,8 @@ typedef struct _slp_tree {
|
|||
} cost;
|
||||
} *slp_tree;
|
||||
|
||||
DEF_VEC_P(slp_tree);
|
||||
DEF_VEC_ALLOC_P(slp_tree, heap);
|
||||
|
||||
/* SLP instance is a sequence of stmts in a loop that can be packed into
|
||||
SIMD stmts. */
|
||||
|
@ -124,6 +126,13 @@ typedef struct _slp_instance {
|
|||
int outside_of_loop; /* Statements generated outside loop. */
|
||||
int inside_of_loop; /* Statements generated inside loop. */
|
||||
} cost;
|
||||
|
||||
/* Loads permutation relatively to the stores, NULL if there is no
|
||||
permutation. */
|
||||
VEC (int, heap) *load_permutation;
|
||||
|
||||
/* The group of nodes that contain loads of this SLP instance. */
|
||||
VEC (slp_tree, heap) *loads;
|
||||
} *slp_instance;
|
||||
|
||||
DEF_VEC_P(slp_instance);
|
||||
|
@ -135,6 +144,8 @@ DEF_VEC_ALLOC_P(slp_instance, heap);
|
|||
#define SLP_INSTANCE_UNROLLING_FACTOR(S) (S)->unrolling_factor
|
||||
#define SLP_INSTANCE_OUTSIDE_OF_LOOP_COST(S) (S)->cost.outside_of_loop
|
||||
#define SLP_INSTANCE_INSIDE_OF_LOOP_COST(S) (S)->cost.inside_of_loop
|
||||
#define SLP_INSTANCE_LOAD_PERMUTATION(S) (S)->load_permutation
|
||||
#define SLP_INSTANCE_LOADS(S) (S)->loads
|
||||
|
||||
#define SLP_TREE_LEFT(S) (S)->left
|
||||
#define SLP_TREE_RIGHT(S) (S)->right
|
||||
|
@ -522,6 +533,11 @@ typedef struct _stmt_vec_info {
|
|||
#define TARG_VEC_STORE_COST 1
|
||||
#endif
|
||||
|
||||
/* Cost of vector permutation. */
|
||||
#ifndef TARG_VEC_PERMUTE_COST
|
||||
#define TARG_VEC_PERMUTE_COST 1
|
||||
#endif
|
||||
|
||||
/* The maximum number of intermediate steps required in multi-step type
|
||||
conversion. */
|
||||
#define MAX_INTERM_CVT_STEPS 3
|
||||
|
@ -700,7 +716,7 @@ extern void free_stmt_vec_info (gimple stmt);
|
|||
/** In tree-vect-analyze.c **/
|
||||
/* Driver for analysis stage. */
|
||||
extern loop_vec_info vect_analyze_loop (struct loop *);
|
||||
extern void vect_free_slp_tree (slp_tree);
|
||||
extern void vect_free_slp_instance (slp_instance);
|
||||
extern loop_vec_info vect_analyze_loop_form (struct loop *);
|
||||
extern tree vect_get_smallest_scalar_type (gimple, HOST_WIDE_INT *,
|
||||
HOST_WIDE_INT *);
|
||||
|
@ -716,7 +732,7 @@ void vect_pattern_recog (loop_vec_info);
|
|||
|
||||
/** In tree-vect-transform.c **/
|
||||
extern bool vectorizable_load (gimple, gimple_stmt_iterator *, gimple *,
|
||||
slp_tree);
|
||||
slp_tree, slp_instance);
|
||||
extern bool vectorizable_store (gimple, gimple_stmt_iterator *, gimple *,
|
||||
slp_tree);
|
||||
extern bool vectorizable_operation (gimple, gimple_stmt_iterator *, gimple *,
|
||||
|
@ -742,6 +758,9 @@ extern void vect_model_simple_cost (stmt_vec_info, int, enum vect_def_type *,
|
|||
extern void vect_model_store_cost (stmt_vec_info, int, enum vect_def_type,
|
||||
slp_tree);
|
||||
extern void vect_model_load_cost (stmt_vec_info, int, slp_tree);
|
||||
extern bool vect_transform_slp_perm_load (gimple, VEC (tree, heap) *,
|
||||
gimple_stmt_iterator *, int, slp_instance, bool);
|
||||
|
||||
/* Driver for transformation stage. */
|
||||
extern void vect_transform_loop (loop_vec_info);
|
||||
|
||||
|
|
Loading…
Reference in New Issue