tree-vectorizer.h (struct _loop_vec_info): Add scalar_loop field.

* tree-vectorizer.h (struct _loop_vec_info): Add scalar_loop field.
	(LOOP_VINFO_SCALAR_LOOP): Define.
	(slpeel_tree_duplicate_loop_to_edge_cfg): Add scalar_loop argument.
	* config/i386/sse.md (maskload<mode>, maskstore<mode>): New expanders.
	* tree-data-ref.c (get_references_in_stmt): Handle MASK_LOAD and
	MASK_STORE.
	* internal-fn.def (LOOP_VECTORIZED, MASK_LOAD, MASK_STORE): New
	internal fns.
	* tree-if-conv.c: Include expr.h, optabs.h, tree-ssa-loop-ivopts.h and
	tree-ssa-address.h.
	(release_bb_predicate): New function.
	(free_bb_predicate): Use it.
	(reset_bb_predicate): Likewise.  Don't unallocate bb->aux
	just to immediately allocate it again.
	(add_to_predicate_list): Add loop argument.  For basic blocks that
	dominate loop->latch, don't insert any predicate.
	(add_to_dst_predicate_list): Adjust caller.
	(if_convertible_phi_p): Add any_mask_load_store argument, if true,
	handle it like flag_tree_loop_if_convert_stores.
	(insert_gimplified_predicates): Likewise.
	(ifcvt_can_use_mask_load_store): New function.
	(if_convertible_gimple_assign_stmt_p): Add any_mask_load_store
	argument, check if some conditional loads or stores can't be
	converted into MASK_LOAD or MASK_STORE.
	(if_convertible_stmt_p): Add any_mask_load_store argument,
	pass it down to if_convertible_gimple_assign_stmt_p.
	(predicate_bbs): Don't return bool, only check if the last stmt
	of a basic block is GIMPLE_COND and handle that.  Adjust
	add_to_predicate_list caller.
	(if_convertible_loop_p_1): Only call predicate_bbs if
	flag_tree_loop_if_convert_stores is set, and call free_bb_predicate
	afterwards in that case; check gimple_code of stmts here.  Replace
	is_predicated check with dominance check.  Add any_mask_load_store argument,
	pass it down to if_convertible_stmt_p and if_convertible_phi_p,
	call if_convertible_phi_p only after all if_convertible_stmt_p
	calls.
	(if_convertible_loop_p): Add any_mask_load_store argument,
	pass it down to if_convertible_loop_p_1.
	(predicate_mem_writes): Emit MASK_LOAD and/or MASK_STORE calls.
	(combine_blocks): Add any_mask_load_store argument, pass
	it down to insert_gimplified_predicates and call predicate_mem_writes
	if it is set.  Call predicate_bbs.
	(version_loop_for_if_conversion): New function.
	(tree_if_conversion): Adjust if_convertible_loop_p and combine_blocks
	calls.  Return todo flags instead of bool, call
	version_loop_for_if_conversion if if-conversion should be just
	for the vectorized loops and nothing else.
	(main_tree_if_conversion): Adjust caller.  Don't call
	tree_if_conversion for dont_vectorize loops if if-conversion
	isn't explicitly enabled.
	* tree-vect-data-refs.c (vect_check_gather): Handle
	MASK_LOAD/MASK_STORE.
	(vect_analyze_data_refs, vect_supportable_dr_alignment): Likewise.
	* gimple.h (gimple_expr_type): Handle MASK_STORE.
	* internal-fn.c (expand_LOOP_VECTORIZED, expand_MASK_LOAD,
	expand_MASK_STORE): New functions.
	* tree-vectorizer.c: Include tree-cfg.h and gimple-fold.h.
	(vect_loop_vectorized_call, fold_loop_vectorized_call): New functions.
	(vectorize_loops): Don't try to vectorize loops with
	loop->dont_vectorize set.  Set LOOP_VINFO_SCALAR_LOOP for if-converted
	loops, fold LOOP_VECTORIZED internal call depending on if loop
	has been vectorized or not.
	* tree-vect-loop-manip.c (slpeel_duplicate_current_defs_from_edges):
	New function.
	(slpeel_tree_duplicate_loop_to_edge_cfg): Add scalar_loop argument.
	If non-NULL, copy basic blocks from scalar_loop instead of loop, but
	still to loop's entry or exit edge.
	(slpeel_tree_peel_loop_to_edge): Add scalar_loop argument, pass it
	down to slpeel_tree_duplicate_loop_to_edge_cfg.
	(vect_do_peeling_for_loop_bound, vect_do_peeling_for_loop_alignment):
	Adjust callers.
	(vect_loop_versioning): If LOOP_VINFO_SCALAR_LOOP, perform loop
	versioning from that loop instead of LOOP_VINFO_LOOP, move it to the
	right place in the CFG afterwards.
	* tree-vect-loop.c (vect_determine_vectorization_factor): Handle
	MASK_STORE.
	* cfgloop.h (struct loop): Add dont_vectorize field.
	* tree-loop-distribution.c (copy_loop_before): Adjust
	slpeel_tree_duplicate_loop_to_edge_cfg caller.
	* optabs.def (maskload_optab, maskstore_optab): New optabs.
	* passes.def: Add a note that pass_vectorize must immediately follow
	pass_if_conversion.
	* tree-predcom.c (split_data_refs_to_components): Give up if
	DR_STMT is a call.
	* tree-vect-stmts.c (vect_mark_relevant): Don't crash if lhs
	is NULL.
	(exist_non_indexing_operands_for_use_p): Handle MASK_LOAD
	and MASK_STORE.
	(vectorizable_mask_load_store): New function.
	(vectorizable_call): Call it for MASK_LOAD or MASK_STORE.
	(vect_transform_stmt): Handle MASK_STORE.
	* tree-ssa-phiopt.c (cond_if_else_store_replacement): Ignore
	DR_STMT where lhs is NULL.
	* optabs.h (can_vec_perm_p): Fix up comment typo.
	(can_vec_mask_load_store_p): New prototype.
	* optabs.c (can_vec_mask_load_store_p): New function.

	* gcc.dg/vect/vect-cond-11.c: New test.
	* gcc.target/i386/vect-cond-1.c: New test.
	* gcc.target/i386/avx2-gather-5.c: New test.
	* gcc.target/i386/avx2-gather-6.c: New test.
	* gcc.dg/vect/vect-mask-loadstore-1.c: New test.
	* gcc.dg/vect/vect-mask-load-1.c: New test.

From-SVN: r206022
This commit is contained in:
Jakub Jelinek 2013-12-16 19:24:15 +01:00
parent d5be902880
commit 7670d795d5
6 changed files with 293 additions and 0 deletions

View File

@ -0,0 +1,116 @@
#include "tree-vect.h"
/* Trip count of the loops in foo and bar, and size of the payload region
   that baz initialises and verifies.  */
#define N 1024
/* A vector of int that is only 4 bytes wide; with a 4-byte int this is a
   single-element vector.  NOTE(review): presumably chosen so that the c[i]
   update in bar has an unusual vector type -- confirm intent.  */
typedef int V __attribute__((vector_size (4)));
/* Twice N elements each: main passes baz pointers that start 512..543
   elements into these arrays, and baz touches indices -64 .. N + 63
   relative to those pointers, so the surplus acts as guard space on both
   sides of the payload.  The aligned attribute is given without an explicit
   alignment value.  */
unsigned int a[N * 2] __attribute__((aligned));
unsigned int b[N * 2] __attribute__((aligned));
/* In this file only bar accesses c.  */
V c[N];
/* Loop under test.  For each of the first N elements, rewrites a[i] and b[i]
   using one of two formulas selected by the unsigned comparison a[i] < 32,
   and returns the sum of the new a[i] values (unsigned, wrapping
   arithmetic).  The parameters shadow the file-scope arrays of the same
   names.  Marked noinline and noclone.  */
__attribute__((noinline, noclone)) unsigned int
foo (unsigned int *a, unsigned int *b)
{
int i;
/* Running sum of the updated a[i] values.  */
unsigned int r = 0;
for (i = 0; i < N; i++)
{
unsigned int x = a[i], y = b[i];
/* Unsigned compare: inputs that would be negative as int are large
   values here and therefore take the else arm.  */
if (x < 32)
{
x = x + 127;
y = y * 2;
}
else
{
/* x >= 32 on this path, so the subtraction does not wrap.  */
x = x - 16;
y = y + 1;
}
/* Both arms feed the same unconditional stores.  */
a[i] = x;
b[i] = y;
r += x;
}
return r;
}
/* Same loop as foo, with one extra statement per iteration that increments
   the file-scope vector array element c[i].  Returns the sum of the new
   a[i] values (unsigned, wrapping arithmetic).  Marked noinline and
   noclone.  */
__attribute__((noinline, noclone)) unsigned int
bar (unsigned int *a, unsigned int *b)
{
int i;
/* Running sum of the updated a[i] values.  */
unsigned int r = 0;
for (i = 0; i < N; i++)
{
unsigned int x = a[i], y = b[i];
/* Unsigned compare: inputs that would be negative as int are large
   values here and therefore take the else arm.  */
if (x < 32)
{
x = x + 127;
y = y * 2;
}
else
{
/* x >= 32 on this path, so the subtraction does not wrap.  */
x = x - 16;
y = y + 1;
}
a[i] = x;
b[i] = y;
/* Operation on the 4-byte vector type V; the only difference from foo.  */
c[i] = c[i] + 1;
r += x;
}
return r;
}
/* Driver for one kernel invocation.

   Fills A and B over indices -64 .. N + 63: 64 guard elements below the
   payload (a = 19, b = 17), N payload elements (a[i] = i - 512 converted to
   unsigned, b[i] = i) and 64 guard elements above (a = 27, b = 19).  It then
   calls FN (A, B), checks the returned sum, and verifies that the payload
   holds the transformed values while both guard regions are untouched.
   Any mismatch aborts.

   A and B must therefore be valid for indices -64 .. N + 63.  */
void
baz (unsigned int *a, unsigned int *b,
     unsigned int (*fn) (unsigned int *, unsigned int *))
{
  /* Sum of the transformed payload: the inputs i - 512 sum to -512, the
     32 elements below 32 each gain 127 and the other N - 32 lose 16.  */
  const unsigned int expected_sum = -512U - (N - 32) * 16U + 32 * 127U;
  int pos;

  /* One ascending sweep over lower guard, payload and upper guard.  */
  for (pos = -64; pos < N + 64; pos++)
    {
      if (pos < 0)
	{
	  a[pos] = 19;
	  b[pos] = 17;
	}
      else if (pos < N)
	{
	  a[pos] = pos - 512;
	  b[pos] = pos;
	}
      else
	{
	  a[pos] = 27;
	  b[pos] = 19;
	}
    }

  if (fn (a, b) != expected_sum)
    __builtin_abort ();

  /* Re-walk the same range and compare against the expected contents.  */
  for (pos = -64; pos < N + 64; pos++)
    {
      unsigned int want_a, want_b;

      if (pos < 0)
	{
	  want_a = 19;
	  want_b = 17;
	}
      else if (pos < N)
	{
	  /* Original input value of a[pos], in unsigned arithmetic.  */
	  unsigned int input = pos - 512U;

	  if (input < 32U)
	    {
	      want_a = input + 127;
	      want_b = pos * 2U;
	    }
	  else
	    {
	      want_a = input - 16;
	      want_b = pos + 1U;
	    }
	}
      else
	{
	  want_a = 27;
	  want_b = 19;
	}

      if (a[pos] != want_a || b[pos] != want_b)
	__builtin_abort ();
    }
}
int
main ()
{
int i;
check_vect ();
baz (a + 512, b + 512, foo);
baz (a + 512, b + 512, bar);
baz (a + 512 + 1, b + 512 + 1, foo);
baz (a + 512 + 1, b + 512 + 1, bar);
baz (a + 512 + 31, b + 512 + 31, foo);
baz (a + 512 + 31, b + 512 + 31, bar);
baz (a + 512 + 1, b + 512, foo);
baz (a + 512 + 1, b + 512, bar);
baz (a + 512 + 31, b + 512, foo);
baz (a + 512 + 31, b + 512, bar);
baz (a + 512, b + 512 + 1, foo);
baz (a + 512, b + 512 + 1, bar);
baz (a + 512, b + 512 + 31, foo);
baz (a + 512, b + 512 + 31, bar);
return 0;
}
/* { dg-final { cleanup-tree-dump "vect" } } */

View File

@ -0,0 +1,52 @@
/* { dg-do run } */
/* { dg-additional-options "-Ofast -fno-common" } */
/* { dg-additional-options "-Ofast -fno-common -mavx" { target avx_runtime } } */
#include <stdlib.h>
#include "tree-vect.h"
/* Loop under test.  For i in 0 .. 1023, overwrites x[i] with z + 2.0 * h,
   where z and h come from different memory locations depending on the sign
   of x[i]:
     x[i] <  0.0:  z = y[i],        h = y[i] * 7.0 + 3.0
     otherwise:    z = x[i] + 6.0,  h = x[1024 + i]
   so the loads from y[i] and x[1024 + i] each happen on only one arm.

   X must provide at least 2048 doubles and Y at least 1024; both are
   asserted to be 16-byte aligned via __builtin_assume_aligned.  Marked
   noinline and noclone.  */
__attribute__((noinline, noclone)) void
foo (double *x, double *y)
{
double *p = __builtin_assume_aligned (x, 16);
double *q = __builtin_assume_aligned (y, 16);
double z, h;
int i;
for (i = 0; i < 1024; i++)
{
if (p[i] < 0.0)
/* Only this arm reads q[i].  */
z = q[i], h = q[i] * 7.0 + 3.0;
else
/* Only this arm reads the upper half of p.  */
z = p[i] + 6.0, h = p[1024 + i];
/* Single unconditional store fed by both arms.  */
p[i] = z + 2.0 * h;
}
}
/* a holds the 1024 values foo rewrites followed by 1024 values it only
   reads; b is read-only input for foo.  */
double a[2048] __attribute__((aligned (16)));
double b[1024] __attribute__((aligned (16)));

/* Initialises the inputs so that odd indices of a are negative (index 0 and
   the other even indices are not), runs foo once and checks every element:
   the lower half of a against the value foo's formula yields, and b and the
   upper half of a for being unchanged.  Aborts on the first mismatch.  */
int
main ()
{
  int n;

  check_vect ();

  for (n = 0; n < 1024; n++)
    {
      if (n & 1)
	a[n] = -n;
      else
	a[n] = 2 * n;
      a[n + 1024] = n;
      b[n] = 7 * n;
      /* Empty asm kept inside the initialisation loop, as in the original
	 test.  */
      asm ("");
    }

  foo (a, b);

  for (n = 0; n < 1024; n++)
    {
      double want;

      if (n & 1)
	/* Negative input: z = b[n], h = b[n] * 7.0 + 3.0.  */
	want = 7 * n + 2.0 * (7 * n * 7.0 + 3.0);
      else
	/* Non-negative input: z = a[n] + 6.0, h = a[n + 1024].  */
	want = 2 * n + 6.0 + 2.0 * n;

      if (a[n] != want)
	abort ();
      if (b[n] != 7 * n)
	abort ();
      if (a[n + 1024] != n)
	abort ();
    }
  return 0;
}
/* { dg-final { scan-tree-dump-times "note: vectorized 1 loops" 1 "vect" { target avx_runtime } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */

View File

@ -0,0 +1,50 @@
/* { dg-do run } */
/* { dg-additional-options "-Ofast -fno-common" } */
/* { dg-additional-options "-Ofast -fno-common -mavx" { target avx_runtime } } */
#include <stdlib.h>
#include "tree-vect.h"
/* Loop under test.  For i in 0 .. 1023, exactly one of two stores happens
   per iteration, chosen by the sign of x[i]:
     x[i] <  0.0f:  y[i] = x[i] + 2.0f
     otherwise:     x[i] = z[i] + 3.0f
   so both the stores and the load from z[i] are conditional.

   All three pointers are __restrict-qualified and asserted to be 32-byte
   aligned via __builtin_assume_aligned; each must provide at least 1024
   floats.  Marked noinline and noclone.  */
__attribute__((noinline, noclone)) void
foo (float *__restrict x, float *__restrict y, float *__restrict z)
{
float *__restrict p = __builtin_assume_aligned (x, 32);
float *__restrict q = __builtin_assume_aligned (y, 32);
float *__restrict r = __builtin_assume_aligned (z, 32);
int i;
for (i = 0; i < 1024; i++)
{
if (p[i] < 0.0f)
/* Store to q only on this arm; r is not read.  */
q[i] = p[i] + 2.0f;
else
/* Store to p and load from r only on this arm.  */
p[i] = r[i] + 3.0f;
}
}
/* Inputs/outputs for foo, declared with the 32-byte alignment that foo
   asserts for its pointer arguments.  */
float a[1024] __attribute__((aligned (32)));
float b[1024] __attribute__((aligned (32)));
float c[1024] __attribute__((aligned (32)));

/* Initialises a with negative values at odd indices, b with 7 * i and c with
   a[i] - 3.0f, runs foo once, and checks that a ends up with its initial
   values (the else arm stores c[i] + 3.0f), that b changed only at the odd
   indices, and that c is unchanged.  Aborts on the first mismatch.  */
int
main ()
{
  int n;

  check_vect ();

  for (n = 0; n < 1024; n++)
    {
      if (n & 1)
	a[n] = -n;
      else
	a[n] = n;
      b[n] = 7 * n;
      c[n] = a[n] - 3.0f;
      /* Empty asm kept inside the initialisation loop, as in the original
	 test.  */
      asm ("");
    }

  foo (a, b, c);

  for (n = 0; n < 1024; n++)
    {
      int want_a = (n & 1) ? -n : n;
      float want_b;

      if (a[n] != want_a)
	abort ();

      if (n & 1)
	/* Negative a[n]: foo stored a[n] + 2.0f into b[n].  */
	want_b = a[n] + 2.0f;
      else
	/* Otherwise b[n] keeps its initial value.  */
	want_b = 7 * n;
      if (b[n] != want_b)
	abort ();

      if (c[n] != a[n] - 3.0f)
	abort ();
    }
  return 0;
}
/* { dg-final { scan-tree-dump-times "note: vectorized 1 loops" 1 "vect" { target avx_runtime } } } */
/* { dg-final { cleanup-tree-dump "vect" } } */

View File

@ -0,0 +1,47 @@
/* { dg-do run } */
/* { dg-require-effective-target avx2 } */
/* { dg-options "-O3 -mavx2 -fno-common" } */
#include "avx2-check.h"
/* Trip count of the loop in foo.  */
#define N 1024
/* vf1 is the table read through the index array k (it has 16 elements more
   than N); vf3 supplies the per-element condition; vf2 receives the
   result.  */
float vf1[N+16], vf2[N], vf3[N];
/* Index into vf1 for each i.  avx2_test deliberately stores indices that
   lie outside vf1 (negative, or beyond its end) at the positions where
   vf3[i] is non-negative.  */
int k[N];
/* Loop under test.  For i in 0 .. N - 1, sets vf2[i] to vf1[k[i]] when
   vf3[i] is negative and to 7.0f otherwise.  The indexed load from vf1 is
   performed only on the negative arm, so k[i] must be a valid index only
   for those elements.  Marked noinline and noclone.  */
__attribute__((noinline, noclone)) void
foo (void)
{
int i;
for (i = 0; i < N; i++)
{
float f;
if (vf3[i] < 0.0f)
/* Conditional indexed load.  */
f = vf1[k[i]];
else
f = 7.0f;
/* Single unconditional store fed by both arms.  */
vf2[i] = f;
}
}
/* Test body for avx2-gather-5.c; not called from within this file.

   Fills vf1 with 5.5f * i for all N + 16 elements and, for the first N
   indices, sets vf2 to 2.0f, vf3 to a value that is negative exactly at the
   even indices, and k to an index that is within vf1 at the even indices but
   may be out of range at the odd ones.  After running foo it checks that
   vf1, vf3 and k are unchanged and that vf2 holds the loaded value at even
   indices and 7.0f at odd ones.  Aborts on the first mismatch.  */
static void
avx2_test (void)
{
  int n;

  for (n = 0; n < N + 16; n++)
    {
      vf1[n] = 5.5f * n;
      if (n < N)
	{
	  vf2[n] = 2.0f;
	  if (n & 1)
	    {
	      /* Odd index: non-negative condition value, so foo must not
		 use k[n], which may point outside vf1.  */
	      vf3[n] = n;
	      if (n & 2)
		k[n] = -n;
	      else
		k[n] = N / 2 + n;
	    }
	  else
	    {
	      /* Even index: negative condition value and an in-range
		 index.  */
	      vf3[n] = -n - 1;
	      k[n] = (n * 7) % N;
	    }
	  /* Empty asm kept on the same path as in the original test: it is
	     reached only for n < N.  */
	  asm ("");
	}
    }

  foo ();

  for (n = 0; n < N; n++)
    {
      float want_vf2;
      int want_vf3;
      int want_k;

      if (n & 1)
	{
	  want_vf2 = 7.0f;
	  want_vf3 = n;
	  want_k = (n & 2) ? -n : N / 2 + n;
	}
      else
	{
	  want_vf2 = 5.5f * ((n * 7) % N);
	  want_vf3 = -n - 1;
	  want_k = (n * 7) % N;
	}

      if (vf1[n] != 5.5 * n
	  || vf2[n] != want_vf2
	  || vf3[n] != want_vf3
	  || k[n] != want_k)
	abort ();
    }
}

View File

@ -0,0 +1,7 @@
/* { dg-do compile } */
/* { dg-options "-O3 -mavx2 -fno-common -fdump-tree-vect-details" } */
#include "avx2-gather-5.c"
/* { dg-final { scan-tree-dump-times "note: vectorized 1 loops in function" 1 "vect" } } */
/* { dg-final { cleanup-tree-dump "vect" } } */

View File

@ -0,0 +1,21 @@
/* { dg-do compile } */
/* { dg-options "-O2 -ftree-vectorize -mavx2" { target avx2 } } */
/* Array rewritten in place by foo.  */
int a[1024];

/* Compile-only loop under test.  For i in 0 .. 1023, replaces a[i] with *p
   when a[i] < 30 and with a[i] + 12 otherwise.  *p is dereferenced only on
   the first arm and is re-read on every such iteration, so P may point into
   a.  Always returns 0.  */
int
foo (int *p)
{
  int i;
  for (i = 0; i < 1024; i++)
    {
      int t;
      if (a[i] < 30)
	t = *p;
      else
	t = a[i] + 12;
      a[i] = t;
    }
  /* The function is declared to return int; give it a defined result
     instead of falling off the end.  The loop above is unchanged.  */
  return 0;
}
/* { dg-final { cleanup-tree-dump "vect" } } */