re PR tree-optimization/52252 (An opportunity for x86 gcc vectorizer (gain up to 3 times))
gcc/ * tree-vect-data-refs.c (vect_grouped_store_supported): New check for stores group of length 3. (vect_permute_store_chain): New permutations for stores group of length 3. * tree-vect-stmts.c (vect_model_store_cost): Change cost of vec_perm_shuffle for the new permutations. gcc/testsuite/ PR tree-optimization/52252 * gcc.dg/vect/pr52252-st.c: Test on stores group of size 3. From-SVN: r211439
This commit is contained in:
parent
862b3da6a4
commit
e1377713ce
@ -1,3 +1,12 @@
|
|||||||
|
2014-06-11 Evgeny Stupachenko <evstupac@gmail.com>
|
||||||
|
|
||||||
|
* tree-vect-data-refs.c (vect_grouped_store_supported): New
|
||||||
|
check for stores group of length 3.
|
||||||
|
(vect_permute_store_chain): New permutations for stores group of
|
||||||
|
length 3.
|
||||||
|
* tree-vect-stmts.c (vect_model_store_cost): Change cost
|
||||||
|
of vec_perm_shuffle for the new permutations.
|
||||||
|
|
||||||
2014-06-11 Jan Hubicka <hubicka@ucw.cz>
|
2014-06-11 Jan Hubicka <hubicka@ucw.cz>
|
||||||
|
|
||||||
* ipa-visibility.c (function_and_variable_visibility): Disable
|
* ipa-visibility.c (function_and_variable_visibility): Disable
|
||||||
|
@ -1,3 +1,8 @@
|
|||||||
|
2014-06-11 Evgeny Stupachenko <evstupac@gmail.com>
|
||||||
|
|
||||||
|
PR tree-optimization/52252
|
||||||
|
* gcc.dg/vect/pr52252-st.c: Test on stores group of size 3.
|
||||||
|
|
||||||
2014-06-11 Richard Biener <rguenther@suse.de>
|
2014-06-11 Richard Biener <rguenther@suse.de>
|
||||||
|
|
||||||
PR middle-end/61437
|
PR middle-end/61437
|
||||||
|
21
gcc/testsuite/gcc.dg/vect/pr52252-st.c
Normal file
21
gcc/testsuite/gcc.dg/vect/pr52252-st.c
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
/* { dg-do compile } */
|
||||||
|
/* { dg-additional-options "-mssse3" { target { i?86-*-* x86_64-*-* } } } */
|
||||||
|
|
||||||
|
#define byte unsigned char
|
||||||
|
|
||||||
|
void
|
||||||
|
matrix_mul (byte *in, byte *out, int size)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
for (i = 0; i < size; i++)
|
||||||
|
{
|
||||||
|
out[0] = in[0] + in[1] + in[3];
|
||||||
|
out[1] = in[0] + in[2] + in[4];
|
||||||
|
out[2] = in[1] + in[2] + in[4];
|
||||||
|
in += 4;
|
||||||
|
out += 3;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { i?86-*-* x86_64-*-* } } } } */
|
||||||
|
/* { dg-final { cleanup-tree-dump "vect" } } */
|
@ -4364,13 +4364,14 @@ vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
|
|||||||
{
|
{
|
||||||
enum machine_mode mode = TYPE_MODE (vectype);
|
enum machine_mode mode = TYPE_MODE (vectype);
|
||||||
|
|
||||||
/* vect_permute_store_chain requires the group size to be a power of two. */
|
/* vect_permute_store_chain requires the group size to be equal to 3 or
|
||||||
if (exact_log2 (count) == -1)
|
be a power of two. */
|
||||||
|
if (count != 3 && exact_log2 (count) == -1)
|
||||||
{
|
{
|
||||||
if (dump_enabled_p ())
|
if (dump_enabled_p ())
|
||||||
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
|
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
|
||||||
"the size of the group of accesses"
|
"the size of the group of accesses"
|
||||||
" is not a power of 2\n");
|
" is not a power of 2 or not equal to 3\n");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -4379,23 +4380,76 @@ vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
|
|||||||
{
|
{
|
||||||
unsigned int i, nelt = GET_MODE_NUNITS (mode);
|
unsigned int i, nelt = GET_MODE_NUNITS (mode);
|
||||||
unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
|
unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
|
||||||
for (i = 0; i < nelt / 2; i++)
|
|
||||||
|
if (count == 3)
|
||||||
{
|
{
|
||||||
sel[i * 2] = i;
|
unsigned int j0 = 0, j1 = 0, j2 = 0;
|
||||||
sel[i * 2 + 1] = i + nelt;
|
unsigned int i, j;
|
||||||
|
|
||||||
|
for (j = 0; j < 3; j++)
|
||||||
|
{
|
||||||
|
int nelt0 = ((3 - j) * nelt) % 3;
|
||||||
|
int nelt1 = ((3 - j) * nelt + 1) % 3;
|
||||||
|
int nelt2 = ((3 - j) * nelt + 2) % 3;
|
||||||
|
for (i = 0; i < nelt; i++)
|
||||||
|
{
|
||||||
|
if (3 * i + nelt0 < nelt)
|
||||||
|
sel[3 * i + nelt0] = j0++;
|
||||||
|
if (3 * i + nelt1 < nelt)
|
||||||
|
sel[3 * i + nelt1] = nelt + j1++;
|
||||||
|
if (3 * i + nelt2 < nelt)
|
||||||
|
sel[3 * i + nelt2] = 0;
|
||||||
|
}
|
||||||
|
if (!can_vec_perm_p (mode, false, sel))
|
||||||
|
{
|
||||||
|
if (dump_enabled_p ())
|
||||||
|
dump_printf (MSG_MISSED_OPTIMIZATION,
|
||||||
|
"permutation op not supported by target.\n");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (i = 0; i < nelt; i++)
|
||||||
|
{
|
||||||
|
if (3 * i + nelt0 < nelt)
|
||||||
|
sel[3 * i + nelt0] = 3 * i + nelt0;
|
||||||
|
if (3 * i + nelt1 < nelt)
|
||||||
|
sel[3 * i + nelt1] = 3 * i + nelt1;
|
||||||
|
if (3 * i + nelt2 < nelt)
|
||||||
|
sel[3 * i + nelt2] = nelt + j2++;
|
||||||
|
}
|
||||||
|
if (!can_vec_perm_p (mode, false, sel))
|
||||||
|
{
|
||||||
|
if (dump_enabled_p ())
|
||||||
|
dump_printf (MSG_MISSED_OPTIMIZATION,
|
||||||
|
"permutation op not supported by target.\n");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
if (can_vec_perm_p (mode, false, sel))
|
else
|
||||||
{
|
{
|
||||||
for (i = 0; i < nelt; i++)
|
/* If length is not equal to 3 then only power of 2 is supported. */
|
||||||
sel[i] += nelt / 2;
|
gcc_assert (exact_log2 (count) != -1);
|
||||||
if (can_vec_perm_p (mode, false, sel))
|
|
||||||
return true;
|
for (i = 0; i < nelt / 2; i++)
|
||||||
|
{
|
||||||
|
sel[i * 2] = i;
|
||||||
|
sel[i * 2 + 1] = i + nelt;
|
||||||
|
}
|
||||||
|
if (can_vec_perm_p (mode, false, sel))
|
||||||
|
{
|
||||||
|
for (i = 0; i < nelt; i++)
|
||||||
|
sel[i] += nelt / 2;
|
||||||
|
if (can_vec_perm_p (mode, false, sel))
|
||||||
|
return true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (dump_enabled_p ())
|
if (dump_enabled_p ())
|
||||||
dump_printf (MSG_MISSED_OPTIMIZATION,
|
dump_printf (MSG_MISSED_OPTIMIZATION,
|
||||||
"interleave op not supported by target.\n");
|
"permutation op not supported by target.\n");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -4415,9 +4469,9 @@ vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
|
|||||||
/* Function vect_permute_store_chain.
|
/* Function vect_permute_store_chain.
|
||||||
|
|
||||||
Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
|
Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
|
||||||
a power of 2, generate interleave_high/low stmts to reorder the data
|
a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
|
||||||
correctly for the stores. Return the final references for stores in
|
the data correctly for the stores. Return the final references for stores
|
||||||
RESULT_CHAIN.
|
in RESULT_CHAIN.
|
||||||
|
|
||||||
E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
|
E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
|
||||||
The input is 4 vectors each containing 8 elements. We assign a number to
|
The input is 4 vectors each containing 8 elements. We assign a number to
|
||||||
@ -4484,7 +4538,9 @@ vect_permute_store_chain (vec<tree> dr_chain,
|
|||||||
gimple perm_stmt;
|
gimple perm_stmt;
|
||||||
tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
|
tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
|
||||||
tree perm_mask_low, perm_mask_high;
|
tree perm_mask_low, perm_mask_high;
|
||||||
unsigned int i, n;
|
tree data_ref;
|
||||||
|
tree perm3_mask_low, perm3_mask_high;
|
||||||
|
unsigned int i, n, log_length = exact_log2 (length);
|
||||||
unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
|
unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
|
||||||
unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
|
unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
|
||||||
|
|
||||||
@ -4492,47 +4548,116 @@ vect_permute_store_chain (vec<tree> dr_chain,
|
|||||||
memcpy (result_chain->address (), dr_chain.address (),
|
memcpy (result_chain->address (), dr_chain.address (),
|
||||||
length * sizeof (tree));
|
length * sizeof (tree));
|
||||||
|
|
||||||
for (i = 0, n = nelt / 2; i < n; i++)
|
if (length == 3)
|
||||||
{
|
{
|
||||||
sel[i * 2] = i;
|
unsigned int j0 = 0, j1 = 0, j2 = 0;
|
||||||
sel[i * 2 + 1] = i + nelt;
|
|
||||||
}
|
|
||||||
perm_mask_high = vect_gen_perm_mask (vectype, sel);
|
|
||||||
gcc_assert (perm_mask_high != NULL);
|
|
||||||
|
|
||||||
for (i = 0; i < nelt; i++)
|
for (j = 0; j < 3; j++)
|
||||||
sel[i] += nelt / 2;
|
{
|
||||||
perm_mask_low = vect_gen_perm_mask (vectype, sel);
|
int nelt0 = ((3 - j) * nelt) % 3;
|
||||||
gcc_assert (perm_mask_low != NULL);
|
int nelt1 = ((3 - j) * nelt + 1) % 3;
|
||||||
|
int nelt2 = ((3 - j) * nelt + 2) % 3;
|
||||||
|
|
||||||
for (i = 0, n = exact_log2 (length); i < n; i++)
|
for (i = 0; i < nelt; i++)
|
||||||
{
|
{
|
||||||
for (j = 0; j < length/2; j++)
|
if (3 * i + nelt0 < nelt)
|
||||||
{
|
sel[3 * i + nelt0] = j0++;
|
||||||
vect1 = dr_chain[j];
|
if (3 * i + nelt1 < nelt)
|
||||||
vect2 = dr_chain[j+length/2];
|
sel[3 * i + nelt1] = nelt + j1++;
|
||||||
|
if (3 * i + nelt2 < nelt)
|
||||||
|
sel[3 * i + nelt2] = 0;
|
||||||
|
}
|
||||||
|
perm3_mask_low = vect_gen_perm_mask (vectype, sel);
|
||||||
|
gcc_assert (perm3_mask_low != NULL);
|
||||||
|
|
||||||
|
for (i = 0; i < nelt; i++)
|
||||||
|
{
|
||||||
|
if (3 * i + nelt0 < nelt)
|
||||||
|
sel[3 * i + nelt0] = 3 * i + nelt0;
|
||||||
|
if (3 * i + nelt1 < nelt)
|
||||||
|
sel[3 * i + nelt1] = 3 * i + nelt1;
|
||||||
|
if (3 * i + nelt2 < nelt)
|
||||||
|
sel[3 * i + nelt2] = nelt + j2++;
|
||||||
|
}
|
||||||
|
perm3_mask_high = vect_gen_perm_mask (vectype, sel);
|
||||||
|
gcc_assert (perm3_mask_high != NULL);
|
||||||
|
|
||||||
|
vect1 = dr_chain[0];
|
||||||
|
vect2 = dr_chain[1];
|
||||||
|
|
||||||
/* Create interleaving stmt:
|
/* Create interleaving stmt:
|
||||||
high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}> */
|
low = VEC_PERM_EXPR <vect1, vect2,
|
||||||
high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
|
{j, nelt + j, *, j + 1, nelt + j + 1, *,
|
||||||
perm_stmt
|
j + 2, nelt + j + 2, *, ...}> */
|
||||||
= gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
|
data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
|
||||||
vect1, vect2, perm_mask_high);
|
perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
|
||||||
|
vect1, vect2,
|
||||||
|
perm3_mask_low);
|
||||||
vect_finish_stmt_generation (stmt, perm_stmt, gsi);
|
vect_finish_stmt_generation (stmt, perm_stmt, gsi);
|
||||||
(*result_chain)[2*j] = high;
|
|
||||||
|
|
||||||
|
vect1 = data_ref;
|
||||||
|
vect2 = dr_chain[2];
|
||||||
/* Create interleaving stmt:
|
/* Create interleaving stmt:
|
||||||
low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
|
high = VEC_PERM_EXPR <vect1, vect2,
|
||||||
nelt*3/2+1, ...}> */
|
{0, 1, nelt + j, 3, 4, nelt + j + 1,
|
||||||
low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
|
6, 7, nelt + j + 2, ...}> */
|
||||||
perm_stmt
|
data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
|
||||||
= gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
|
perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
|
||||||
vect1, vect2, perm_mask_low);
|
vect1, vect2,
|
||||||
|
perm3_mask_high);
|
||||||
vect_finish_stmt_generation (stmt, perm_stmt, gsi);
|
vect_finish_stmt_generation (stmt, perm_stmt, gsi);
|
||||||
(*result_chain)[2*j+1] = low;
|
(*result_chain)[j] = data_ref;
|
||||||
}
|
}
|
||||||
memcpy (dr_chain.address (), result_chain->address (),
|
}
|
||||||
length * sizeof (tree));
|
else
|
||||||
|
{
|
||||||
|
/* If length is not equal to 3 then only power of 2 is supported. */
|
||||||
|
gcc_assert (exact_log2 (length) != -1);
|
||||||
|
|
||||||
|
for (i = 0, n = nelt / 2; i < n; i++)
|
||||||
|
{
|
||||||
|
sel[i * 2] = i;
|
||||||
|
sel[i * 2 + 1] = i + nelt;
|
||||||
|
}
|
||||||
|
perm_mask_high = vect_gen_perm_mask (vectype, sel);
|
||||||
|
gcc_assert (perm_mask_high != NULL);
|
||||||
|
|
||||||
|
for (i = 0; i < nelt; i++)
|
||||||
|
sel[i] += nelt / 2;
|
||||||
|
perm_mask_low = vect_gen_perm_mask (vectype, sel);
|
||||||
|
gcc_assert (perm_mask_low != NULL);
|
||||||
|
|
||||||
|
for (i = 0, n = log_length; i < n; i++)
|
||||||
|
{
|
||||||
|
for (j = 0; j < length/2; j++)
|
||||||
|
{
|
||||||
|
vect1 = dr_chain[j];
|
||||||
|
vect2 = dr_chain[j+length/2];
|
||||||
|
|
||||||
|
/* Create interleaving stmt:
|
||||||
|
high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
|
||||||
|
...}> */
|
||||||
|
high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
|
||||||
|
perm_stmt
|
||||||
|
= gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
|
||||||
|
vect1, vect2, perm_mask_high);
|
||||||
|
vect_finish_stmt_generation (stmt, perm_stmt, gsi);
|
||||||
|
(*result_chain)[2*j] = high;
|
||||||
|
|
||||||
|
/* Create interleaving stmt:
|
||||||
|
low = VEC_PERM_EXPR <vect1, vect2,
|
||||||
|
{nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
|
||||||
|
...}> */
|
||||||
|
low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
|
||||||
|
perm_stmt
|
||||||
|
= gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
|
||||||
|
vect1, vect2, perm_mask_low);
|
||||||
|
vect_finish_stmt_generation (stmt, perm_stmt, gsi);
|
||||||
|
(*result_chain)[2*j+1] = low;
|
||||||
|
}
|
||||||
|
memcpy (dr_chain.address (), result_chain->address (),
|
||||||
|
length * sizeof (tree));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -975,9 +975,9 @@ vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
|
|||||||
include the cost of the permutes. */
|
include the cost of the permutes. */
|
||||||
if (!store_lanes_p && group_size > 1)
|
if (!store_lanes_p && group_size > 1)
|
||||||
{
|
{
|
||||||
/* Uses a high and low interleave operation for each needed permute. */
|
/* Uses high and low interleave or shuffle operations for each
|
||||||
|
needed permute. */
|
||||||
int nstmts = ncopies * exact_log2 (group_size) * group_size;
|
int nstmts = ncopies * ceil_log2 (group_size) * group_size;
|
||||||
inside_cost = record_stmt_cost (body_cost_vec, nstmts, vec_perm,
|
inside_cost = record_stmt_cost (body_cost_vec, nstmts, vec_perm,
|
||||||
stmt_info, 0, vect_body);
|
stmt_info, 0, vect_body);
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user