[aarch64][vect] Support V8QI->V8HI WIDEN_ patterns
In the case where 8 out of every 16 elements are widened using a widening pattern and the next 8 are skipped, the patterns are not recognized. This is because they are normally used in a pair, such as VEC_WIDEN_MINUS_HI/LO, to achieve a v16qi->v16hi conversion for example. This patch adds support for V8QI->V8HI patterns.

gcc/ChangeLog:
PR tree-optimization/98772
* optabs-tree.c (supportable_half_widening_operation): New function to check for supportable V8QI->V8HI widening patterns.
* optabs-tree.h (supportable_half_widening_operation): New function.
* tree-vect-stmts.c (vect_create_half_widening_stmts): New function to create promotion stmts for V8QI->V8HI widening patterns.
(vectorizable_conversion): Add case for V8QI->V8HI.

gcc/testsuite/ChangeLog:
PR tree-optimization/98772
* gcc.target/aarch64/pr98772.c: New test.
This commit is contained in:
parent
ff6903288d
commit
4af29981ab
@ -277,6 +277,75 @@ optab_for_tree_code (enum tree_code code, const_tree type,
|
||||
}
|
||||
}
|
||||
|
||||
/* Check whether an operation represented by CODE is a 'half' widening operation
|
||||
in which the input vector type has half the number of bits of the output
|
||||
vector type e.g. V8QI->V8HI.
|
||||
|
||||
This is handled by widening the inputs using NOP_EXPRs then using a
|
||||
non-widening stmt e.g. MINUS_EXPR. RTL fusing converts these to the widening
|
||||
hardware instructions if supported.
|
||||
|
||||
The more typical case (handled in supportable_widening_operation) is where
|
||||
the input vector type has the same number of bits as the output vector type.
|
||||
In this case half the elements of the input vectors must be processed at a
|
||||
time into respective vector outputs with elements twice as wide i.e. a
|
||||
'hi'/'lo' pair using codes such as VEC_WIDEN_MINUS_HI/LO.
|
||||
|
||||
Supported widening operations:
|
||||
WIDEN_MINUS_EXPR
|
||||
WIDEN_PLUS_EXPR
|
||||
WIDEN_MULT_EXPR
|
||||
WIDEN_LSHIFT_EXPR
|
||||
|
||||
Output:
|
||||
- CODE1 - The non-widened code, which will be used after the inputs are
|
||||
converted to the wide type. */
|
||||
bool
|
||||
supportable_half_widening_operation (enum tree_code code, tree vectype_out,
|
||||
tree vectype_in, enum tree_code *code1)
|
||||
{
|
||||
machine_mode m1,m2;
|
||||
enum tree_code dummy_code;
|
||||
optab op;
|
||||
|
||||
gcc_assert (VECTOR_TYPE_P (vectype_out) && VECTOR_TYPE_P (vectype_in));
|
||||
|
||||
m1 = TYPE_MODE (vectype_out);
|
||||
m2 = TYPE_MODE (vectype_in);
|
||||
|
||||
if (!VECTOR_MODE_P (m1) || !VECTOR_MODE_P (m2))
|
||||
return false;
|
||||
|
||||
if (maybe_ne (TYPE_VECTOR_SUBPARTS (vectype_in),
|
||||
TYPE_VECTOR_SUBPARTS (vectype_out)))
|
||||
return false;
|
||||
|
||||
switch (code)
|
||||
{
|
||||
case WIDEN_LSHIFT_EXPR:
|
||||
*code1 = LSHIFT_EXPR;
|
||||
break;
|
||||
case WIDEN_MINUS_EXPR:
|
||||
*code1 = MINUS_EXPR;
|
||||
break;
|
||||
case WIDEN_PLUS_EXPR:
|
||||
*code1 = PLUS_EXPR;
|
||||
break;
|
||||
case WIDEN_MULT_EXPR:
|
||||
*code1 = MULT_EXPR;
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!supportable_convert_operation (NOP_EXPR, vectype_out, vectype_in,
|
||||
&dummy_code))
|
||||
return false;
|
||||
|
||||
op = optab_for_tree_code (*code1, vectype_out, optab_vector);
|
||||
return (optab_handler (op, TYPE_MODE (vectype_out)) != CODE_FOR_nothing);
|
||||
}
|
||||
|
||||
/* Function supportable_convert_operation
|
||||
|
||||
Check whether an operation represented by the code CODE is a
|
||||
|
@ -36,6 +36,9 @@ enum optab_subtype
|
||||
the second argument. The third argument distinguishes between the types of
|
||||
vector shifts and rotates. */
|
||||
optab optab_for_tree_code (enum tree_code, const_tree, enum optab_subtype);
|
||||
/* Check whether a widening tree code (first argument) can be done as a
   'half' widening from the input vector type (third argument) to the output
   vector type (second argument); on recognition the matching non-widening
   code is stored through the last argument.  */
bool
|
||||
supportable_half_widening_operation (enum tree_code, tree, tree,
|
||||
enum tree_code *);
|
||||
bool supportable_convert_operation (enum tree_code, tree, tree,
|
||||
enum tree_code *);
|
||||
bool expand_vec_cmp_expr_p (tree, tree, enum tree_code);
|
||||
|
155
gcc/testsuite/gcc.target/aarch64/pr98772.c
Normal file
155
gcc/testsuite/gcc.target/aarch64/pr98772.c
Normal file
@ -0,0 +1,155 @@
|
||||
/* { dg-do run } */
|
||||
/* { dg-options "-O3 -save-temps" } */
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#define DSIZE 16
|
||||
#define PIXSIZE 64
|
||||
|
||||
/* Widening add kernel under test.  For each of 4 rows, adds the first 4
   bytes of PIX1 and PIX2 element-wise and stores the 16-bit sums
   contiguously in D (16 results in total); both source pointers advance by
   16 bytes per row, so 12 of every 16 input bytes are skipped.
   Built with the file's dg-options; the "uaddl" dg-final directive at the
   end of the file presumably matches code generated for this function.  */
extern void
|
||||
wplus (uint16_t *d, uint8_t *restrict pix1, uint8_t *restrict pix2 )
|
||||
{
|
||||
for (int y = 0; y < 4; y++ )
|
||||
{
|
||||
for (int x = 0; x < 4; x++ )
|
||||
d[x + y*4] = pix1[x] + pix2[x];
|
||||
pix1 += 16;
|
||||
pix2 += 16;
|
||||
}
|
||||
}
|
||||
/* Unoptimized reference for wplus: for each of 4 rows spaced 16 bytes apart
   in PIX1 and PIX2, store the sums of the first 4 byte pairs as 16-bit
   values into consecutive slots of D.  */
extern void __attribute__((optimize (0)))
wplus_no_opt (uint16_t *d, uint8_t *restrict pix1, uint8_t *restrict pix2 )
{
  for (int row = 0; row < 4; row++)
    {
      const uint8_t *src1 = pix1 + row * 16;
      const uint8_t *src2 = pix2 + row * 16;
      uint16_t *dst = d + row * 4;

      for (int col = 0; col < 4; col++)
	dst[col] = src1[col] + src2[col];
    }
}
|
||||
|
||||
/* Widening subtract kernel under test.  For each of 4 rows, subtracts the
   first 4 bytes of PIX2 from those of PIX1 element-wise and stores the
   results as uint16_t contiguously in D (16 results in total); both source
   pointers advance by 16 bytes per row.  The difference is computed after
   integer promotion, so a negative difference wraps when stored.
   Built with the file's dg-options; the "usubl" dg-final directive at the
   end of the file presumably matches code generated for this function.  */
extern void
|
||||
wminus (uint16_t *d, uint8_t *restrict pix1, uint8_t *restrict pix2 )
|
||||
{
|
||||
for (int y = 0; y < 4; y++ )
|
||||
{
|
||||
for (int x = 0; x < 4; x++ )
|
||||
d[x + y*4] = pix1[x] - pix2[x];
|
||||
pix1 += 16;
|
||||
pix2 += 16;
|
||||
}
|
||||
}
|
||||
/* Unoptimized reference for wminus: for each of 4 rows spaced 16 bytes
   apart in PIX1 and PIX2, store the differences of the first 4 byte pairs
   (PIX1 minus PIX2) as 16-bit values into consecutive slots of D.  */
extern void __attribute__((optimize (0)))
wminus_no_opt (uint16_t *d, uint8_t *restrict pix1, uint8_t *restrict pix2 )
{
  for (int row = 0; row < 4; row++)
    {
      const uint8_t *src1 = pix1 + row * 16;
      const uint8_t *src2 = pix2 + row * 16;
      uint16_t *dst = d + row * 4;

      for (int col = 0; col < 4; col++)
	dst[col] = src1[col] - src2[col];
    }
}
|
||||
|
||||
/* Widening multiply kernel under test.  For each of 4 rows, multiplies the
   first 4 bytes of PIX1 and PIX2 element-wise and stores the 16-bit
   products contiguously in D (16 results in total); both source pointers
   advance by 16 bytes per row.
   Built with the file's dg-options; the "umull" dg-final directive at the
   end of the file presumably matches code generated for this function.  */
extern void
|
||||
wmult (uint16_t *d, uint8_t *restrict pix1, uint8_t *restrict pix2 )
|
||||
{
|
||||
for (int y = 0; y < 4; y++ )
|
||||
{
|
||||
for (int x = 0; x < 4; x++ )
|
||||
d[x + y*4] = pix1[x] * pix2[x];
|
||||
pix1 += 16;
|
||||
pix2 += 16;
|
||||
}
|
||||
}
|
||||
/* Unoptimized reference for wmult: for each of 4 rows spaced 16 bytes apart
   in PIX1 and PIX2, store the products of the first 4 byte pairs as 16-bit
   values into consecutive slots of D.  */
extern void __attribute__((optimize (0)))
wmult_no_opt (uint16_t *d, uint8_t *restrict pix1, uint8_t *restrict pix2 )
{
  for (int row = 0; row < 4; row++)
    {
      const uint8_t *src1 = pix1 + row * 16;
      const uint8_t *src2 = pix2 + row * 16;
      uint16_t *dst = d + row * 4;

      for (int col = 0; col < 4; col++)
	dst[col] = src1[col] * src2[col];
    }
}
|
||||
|
||||
/* Widening left-shift kernel under test.  For each of 4 rows, shifts the
   first 4 bytes of PIX1 left by 8 bits and stores the 16-bit results
   contiguously in D (16 results in total); the source pointer advances by
   16 bytes per row.
   Built with the file's dg-options; the "shl" dg-final directive at the
   end of the file presumably matches code generated for this function.  */
extern void
|
||||
wlshift (uint16_t *d, uint8_t *restrict pix1)
|
||||
|
||||
{
|
||||
for (int y = 0; y < 4; y++ )
|
||||
{
|
||||
for (int x = 0; x < 4; x++ )
|
||||
d[x + y*4] = pix1[x] << 8;
|
||||
pix1 += 16;
|
||||
}
|
||||
}
|
||||
/* Unoptimized reference for wlshift: for each of 4 rows spaced 16 bytes
   apart in PIX1, store the first 4 bytes shifted left by 8 bits as 16-bit
   values into consecutive slots of D.  */
extern void __attribute__((optimize (0)))
wlshift_no_opt (uint16_t *d, uint8_t *restrict pix1)

{
  for (int row = 0; row < 4; row++)
    {
      const uint8_t *src = pix1 + row * 16;
      uint16_t *dst = d + row * 4;

      for (int col = 0; col < 4; col++)
	dst[col] = src[col] << 8;
    }
}
|
||||
|
||||
void __attribute__((optimize (0)))
|
||||
init_arrays (uint16_t *d_a, uint16_t *d_b, uint8_t *pix1, uint8_t *pix2)
|
||||
{
|
||||
for (int i = 0; i < DSIZE; i++)
|
||||
{
|
||||
d_a[i] = (1074 * i)%17;
|
||||
d_b[i] = (1074 * i)%17;
|
||||
}
|
||||
for (int i = 0; i < PIXSIZE; i++)
|
||||
{
|
||||
pix1[i] = (1024 * i)%17;
|
||||
pix2[i] = (1024 * i)%17;
|
||||
}
|
||||
}
|
||||
|
||||
/* Don't optimize main so we don't get confused over where the vector
|
||||
instructions are generated. */
|
||||
__attribute__((optimize (0)))
|
||||
int main ()
|
||||
{
|
||||
uint16_t d_a[DSIZE];
|
||||
uint16_t d_b[DSIZE];
|
||||
uint8_t pix1[PIXSIZE];
|
||||
uint8_t pix2[PIXSIZE];
|
||||
|
||||
init_arrays (d_a, d_b, pix1, pix2);
|
||||
wplus (d_a, pix1, pix2);
|
||||
wplus_no_opt (d_b, pix1, pix2);
|
||||
if (memcmp (d_a,d_b, DSIZE) != 0)
|
||||
return 1;
|
||||
|
||||
init_arrays (d_a, d_b, pix1, pix2);
|
||||
wminus (d_a, pix1, pix2);
|
||||
wminus_no_opt (d_b, pix1, pix2);
|
||||
if (memcmp (d_a,d_b, DSIZE) != 0)
|
||||
return 2;
|
||||
|
||||
init_arrays (d_a, d_b, pix1, pix2);
|
||||
wmult (d_a, pix1, pix2);
|
||||
wmult_no_opt (d_b, pix1, pix2);
|
||||
if (memcmp (d_a,d_b, DSIZE) != 0)
|
||||
return 3;
|
||||
|
||||
init_arrays (d_a, d_b, pix1, pix2);
|
||||
wlshift (d_a, pix1);
|
||||
wlshift_no_opt (d_b, pix1);
|
||||
if (memcmp (d_a,d_b, DSIZE) != 0)
|
||||
return 4;
|
||||
|
||||
}
|
||||
|
||||
/* { dg-final { scan-assembler-times "uaddl\\tv" 2 } } */
|
||||
/* { dg-final { scan-assembler-times "usubl\\tv" 2 } } */
|
||||
/* { dg-final { scan-assembler-times "umull\\tv" 2 } } */
|
||||
/* { dg-final { scan-assembler-times "shl\\tv" 2 } } */
|
@ -4544,6 +4544,64 @@ vect_create_vectorized_promotion_stmts (vec_info *vinfo,
|
||||
*vec_oprnds0 = vec_tmp;
|
||||
}
|
||||
|
||||
/* Create vectorized promotion stmts for widening stmts using only half the
|
||||
potential vector size for input. */
|
||||
static void
|
||||
vect_create_half_widening_stmts (vec_info *vinfo,
|
||||
vec<tree> *vec_oprnds0,
|
||||
vec<tree> *vec_oprnds1,
|
||||
stmt_vec_info stmt_info, tree vec_dest,
|
||||
gimple_stmt_iterator *gsi,
|
||||
enum tree_code code1,
|
||||
int op_type)
|
||||
{
|
||||
int i;
|
||||
tree vop0, vop1;
|
||||
gimple *new_stmt1;
|
||||
gimple *new_stmt2;
|
||||
gimple *new_stmt3;
|
||||
vec<tree> vec_tmp = vNULL;
|
||||
|
||||
vec_tmp.create (vec_oprnds0->length ());
|
||||
FOR_EACH_VEC_ELT (*vec_oprnds0, i, vop0)
|
||||
{
|
||||
tree new_tmp1, new_tmp2, new_tmp3, out_type;
|
||||
|
||||
gcc_assert (op_type == binary_op);
|
||||
vop1 = (*vec_oprnds1)[i];
|
||||
|
||||
/* Widen the first vector input. */
|
||||
out_type = TREE_TYPE (vec_dest);
|
||||
new_tmp1 = make_ssa_name (out_type);
|
||||
new_stmt1 = gimple_build_assign (new_tmp1, NOP_EXPR, vop0);
|
||||
vect_finish_stmt_generation (vinfo, stmt_info, new_stmt1, gsi);
|
||||
if (VECTOR_TYPE_P (TREE_TYPE (vop1)))
|
||||
{
|
||||
/* Widen the second vector input. */
|
||||
new_tmp2 = make_ssa_name (out_type);
|
||||
new_stmt2 = gimple_build_assign (new_tmp2, NOP_EXPR, vop1);
|
||||
vect_finish_stmt_generation (vinfo, stmt_info, new_stmt2, gsi);
|
||||
/* Perform the operation. With both vector inputs widened. */
|
||||
new_stmt3 = gimple_build_assign (vec_dest, code1, new_tmp1, new_tmp2);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Perform the operation. With the single vector input widened. */
|
||||
new_stmt3 = gimple_build_assign (vec_dest, code1, new_tmp1, vop1);
|
||||
}
|
||||
|
||||
new_tmp3 = make_ssa_name (vec_dest, new_stmt3);
|
||||
gimple_assign_set_lhs (new_stmt3, new_tmp3);
|
||||
vect_finish_stmt_generation (vinfo, stmt_info, new_stmt3, gsi);
|
||||
|
||||
/* Store the results for the next step. */
|
||||
vec_tmp.quick_push (new_tmp3);
|
||||
}
|
||||
|
||||
vec_oprnds0->release ();
|
||||
*vec_oprnds0 = vec_tmp;
|
||||
}
|
||||
|
||||
|
||||
/* Check if STMT_INFO performs a conversion operation that can be vectorized.
|
||||
If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
|
||||
@ -4696,7 +4754,13 @@ vectorizable_conversion (vec_info *vinfo,
|
||||
nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
|
||||
nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
|
||||
if (known_eq (nunits_out, nunits_in))
|
||||
modifier = NONE;
|
||||
if (code == WIDEN_MINUS_EXPR
|
||||
|| code == WIDEN_PLUS_EXPR
|
||||
|| code == WIDEN_LSHIFT_EXPR
|
||||
|| code == WIDEN_MULT_EXPR)
|
||||
modifier = WIDEN;
|
||||
else
|
||||
modifier = NONE;
|
||||
else if (multiple_p (nunits_out, nunits_in))
|
||||
modifier = NARROW;
|
||||
else
|
||||
@ -4742,9 +4806,18 @@ vectorizable_conversion (vec_info *vinfo,
|
||||
return false;
|
||||
|
||||
case WIDEN:
|
||||
if (supportable_widening_operation (vinfo, code, stmt_info, vectype_out,
|
||||
vectype_in, &code1, &code2,
|
||||
&multi_step_cvt, &interm_types))
|
||||
if (known_eq (nunits_in, nunits_out))
|
||||
{
|
||||
if (!supportable_half_widening_operation (code, vectype_out,
|
||||
vectype_in, &code1))
|
||||
goto unsupported;
|
||||
gcc_assert (!(multi_step_cvt && op_type == binary_op));
|
||||
break;
|
||||
}
|
||||
if (supportable_widening_operation (vinfo, code, stmt_info,
|
||||
vectype_out, vectype_in, &code1,
|
||||
&code2, &multi_step_cvt,
|
||||
&interm_types))
|
||||
{
|
||||
/* Binary widening operation can only be supported directly by the
|
||||
architecture. */
|
||||
@ -4980,10 +5053,16 @@ vectorizable_conversion (vec_info *vinfo,
|
||||
c1 = codecvt1;
|
||||
c2 = codecvt2;
|
||||
}
|
||||
vect_create_vectorized_promotion_stmts (vinfo, &vec_oprnds0,
|
||||
&vec_oprnds1, stmt_info,
|
||||
this_dest, gsi,
|
||||
c1, c2, op_type);
|
||||
if (known_eq (nunits_out, nunits_in))
|
||||
vect_create_half_widening_stmts (vinfo, &vec_oprnds0,
|
||||
&vec_oprnds1, stmt_info,
|
||||
this_dest, gsi,
|
||||
c1, op_type);
|
||||
else
|
||||
vect_create_vectorized_promotion_stmts (vinfo, &vec_oprnds0,
|
||||
&vec_oprnds1, stmt_info,
|
||||
this_dest, gsi,
|
||||
c1, c2, op_type);
|
||||
}
|
||||
|
||||
FOR_EACH_VEC_ELT (vec_oprnds0, i, vop0)
|
||||
|
Loading…
Reference in New Issue
Block a user