re PR tree-optimization/24659 (Conversions are not vectorized)

PR tree-optimization/24659
	* tree-vect-transform.c (vectorizable_call): Handle
	(nunits_in == nunits_out / 2) and (nunits_out == nunits_in / 2) cases.

	* config/i386/sse.md (vec_pack_sfix_v2df): New expander.
	* config/i386/i386.c (enum ix86_builtins)
	[IX86_BUILTIN_VEC_PACK_SFIX]: New constant.
	(struct bdesc_2arg) [__builtin_ia32_vec_pack_sfix]: New builtin
	description.
	(ix86_init_mmx_sse_builtins): Define all builtins with 2 arguments as
	const using def_builtin_const.
	(ix86_expand_binop_builtin): Remove bogus assert() that insn wants
	input operands in the same modes as the result.
	(ix86_builtin_vectorized_function): Handle BUILT_IN_LRINT.

testsuite/ChangeLog:

	PR tree-optimization/24659
	* gcc.target/i386/vectorize2.c: New test.
	* gcc.target/i386/sse2-lrint-vec.c: New runtime test.
	* gcc.target/i386/sse2-lrintf-vec.c: Ditto.

From-SVN: r126111
This commit is contained in:
Uros Bizjak 2007-06-29 12:30:06 +02:00 committed by Uros Bizjak
parent 690f48b1b4
commit b40c4f6816
8 changed files with 306 additions and 50 deletions

View File

@ -1,3 +1,20 @@
2007-06-29 Uros Bizjak <ubizjak@gmail.com>
PR tree-optimization/24659
* tree-vect-transform.c (vectorizable_call): Handle
(nunits_in == nunits_out / 2) and (nunits_out == nunits_in / 2) cases.
* config/i386/sse.md (vec_pack_sfix_v2df): New expander.
* config/i386/i386.c (enum ix86_builtins)
[IX86_BUILTIN_VEC_PACK_SFIX]: New constant.
(struct bdesc_2arg) [__builtin_ia32_vec_pack_sfix]: New builtin
description.
(ix86_init_mmx_sse_builtins): Define all builtins with 2 arguments as
const using def_builtin_const.
(ix86_expand_binop_builtin): Remove bogus assert() that insn wants
input operands in the same modes as the result.
(ix86_builtin_vectorized_function): Handle BUILT_IN_LRINT.
2007-06-29 Richard Sandiford <rsandifo@nildram.co.uk>
* df-problems.c (df_set_unused_notes_for_mw): Fix formatting.
@ -264,7 +281,7 @@
* gimplify.c (mark_addressable): New function.
(gimplify_modify_expr_rhs, gimplify_addr_expr, gimplify_expr): Use it.
2007-06-19 Uros Bizjak <ubizjak@gmail.com>
2007-06-22 Uros Bizjak <ubizjak@gmail.com>
PR middle-end/32374
* expr.c (store_constructor): Do not clobber non-zeroed memory.

View File

@ -16820,6 +16820,8 @@ enum ix86_builtins
IX86_BUILTIN_VEC_SET_V4HI,
IX86_BUILTIN_VEC_SET_V16QI,
IX86_BUILTIN_VEC_PACK_SFIX,
/* SSE4.2. */
IX86_BUILTIN_CRC32QI,
IX86_BUILTIN_CRC32HI,
@ -17167,6 +17169,8 @@ static const struct builtin_description bdesc_2arg[] =
{ OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_unpckhpd, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, 0 },
{ OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_unpcklpd, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, 0 },
{ OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, 0 },
/* SSE2 MMX */
{ OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, 0 },
{ OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, 0 },
@ -17563,6 +17567,9 @@ ix86_init_mmx_sse_builtins (void)
= build_function_type_list (V2DF_type_node, V4SI_type_node, NULL_TREE);
tree v4si_ftype_v2df
= build_function_type_list (V4SI_type_node, V2DF_type_node, NULL_TREE);
tree v4si_ftype_v2df_v2df
= build_function_type_list (V4SI_type_node,
V2DF_type_node, V2DF_type_node, NULL_TREE);
tree v2si_ftype_v2df
= build_function_type_list (V2SI_type_node, V2DF_type_node, NULL_TREE);
tree v4sf_ftype_v2df
@ -17906,7 +17913,10 @@ ix86_init_mmx_sse_builtins (void)
|| d->icode == CODE_FOR_sse2_vmmaskcmpv2df3)
type = v2di_ftype_v2df_v2df;
def_builtin (d->mask, d->name, type, d->code);
if (d->icode == CODE_FOR_vec_pack_sfix_v2df)
type = v4si_ftype_v2df_v2df;
def_builtin_const (d->mask, d->name, type, d->code);
}
/* Add all builtins that are more or less simple operations on 1 operand. */
@ -18457,11 +18467,6 @@ ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
op1 = gen_lowpart (TImode, x);
}
/* The insn must want input operands in the same modes as the
result. */
gcc_assert ((GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode)
&& (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode));
if (!(*insn_data[icode].operand[1].predicate) (op0, mode0))
op0 = copy_to_mode_reg (mode0, op0);
if (!(*insn_data[icode].operand[2].predicate) (op1, mode1))
@ -19863,6 +19868,12 @@ ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
return ix86_builtins[IX86_BUILTIN_SQRTPS];
return NULL_TREE;
case BUILT_IN_LRINT:
if (out_mode == SImode && out_n == 4
&& in_mode == DFmode && in_n == 2)
return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX];
return NULL_TREE;
case BUILT_IN_LRINTF:
if (out_mode == SImode && out_n == 4
&& in_mode == SFmode && in_n == 4)

View File

@ -2421,6 +2421,26 @@
DONE;
})
(define_expand "vec_pack_sfix_v2df"
[(match_operand:V4SI 0 "register_operand" "")
(match_operand:V2DF 1 "nonimmediate_operand" "")
(match_operand:V2DF 2 "nonimmediate_operand" "")]
"TARGET_SSE2"
{
/* Expand a V2DF,V2DF -> V4SI conversion: convert each V2DF input
   separately with cvtpd2dq, then merge the two partial results into
   operand 0 with punpcklqdq.  */
rtx r1, r2;
/* Fresh V4SI pseudos that receive the two individual conversions.  */
r1 = gen_reg_rtx (V4SImode);
r2 = gen_reg_rtx (V4SImode);
emit_insn (gen_sse2_cvtpd2dq (r1, operands[1]));
emit_insn (gen_sse2_cvtpd2dq (r2, operands[2]));
/* All three operands are viewed as V2DI for the merge.
   NOTE(review): presumably this takes the low 64 bits of r1 and of r2,
   i.e. the halves that hold the converted values -- confirm against
   the sse2_punpcklqdq and sse2_cvtpd2dq patterns.  */
emit_insn (gen_sse2_punpcklqdq (gen_lowpart (V2DImode, operands[0]),
gen_lowpart (V2DImode, r1),
gen_lowpart (V2DImode, r2)));
DONE;
})
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
;; Parallel double-precision floating point element swizzling

View File

@ -1,3 +1,10 @@
2007-06-29 Uros Bizjak <ubizjak@gmail.com>
PR tree-optimization/24659
* gcc.target/i386/vectorize2.c: New test.
* gcc.target/i386/sse2-lrint-vec.c: New runtime test.
* gcc.target/i386/sse2-lrintf-vec.c: Ditto.
2007-06-29 Eric Botcazou <ebotcazou@adacore.com>
* gcc.dg/pointer-arith-9.c: New test.

View File

@ -0,0 +1,48 @@
/* { dg-do run } */
/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse2" } */
#include "../../gcc.dg/i386-cpuid.h"
extern long lrint (double);
extern void abort (void);
#define N 32
/* Runtime check for lrint over an array of N doubles: one loop stores
   lrint (a[i]) into r[i], a second loop recomputes each value and calls
   abort on any mismatch.  Returns 0 when every element agrees.
   NOTE(review): presumably the first loop is the one expected to be
   vectorized under this test's dg-options -- confirm.  */
int __attribute__((noinline))
main1 ()
{
double a[N] = {0.4,3.5,6.6,9.4,12.5,15.6,18.4,21.5,24.6,27.4,30.5,33.6,36.4,39.5,42.6,45.4,0.5,3.6,6.4,9.5,12.6,15.4,18.5,21.6,24.4,27.5,30.6,33.4,36.5,39.6,42.4,45.5};
long r[N];
int i;
/* Fill r[] with the lrint result of every input element.  */
for (i = 0; i < N; i++)
{
r[i] = lrint (a[i]);
}
/* check results: */
for (i = 0; i < N; i++)
{
if (r[i] != lrint (a[i]))
abort();
}
return 0;
}
/* Test driver: call main1 only when the value returned by i386_cpuid has
   the MMX, SSE, SSE2 and CMOV bits all set; otherwise return 0 so the
   test passes without running the check.  */
int
main ()
{
unsigned long cpu_facilities;
/* i386_cpuid is provided by the included i386-cpuid.h; presumably it
   returns the CPU feature bits -- confirm against that header.  */
cpu_facilities = i386_cpuid ();
/* Every one of the four feature bits must be present.  */
if ((cpu_facilities & (bit_MMX | bit_SSE | bit_SSE2 | bit_CMOV))
!= (bit_MMX | bit_SSE | bit_SSE2 | bit_CMOV))
/* If host has no vector support, pass. */
return 0;
/* main1 calls abort on failure, so its return value is not inspected.  */
main1 ();
return 0;
}

View File

@ -0,0 +1,48 @@
/* { dg-do run } */
/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse2" } */
#include "../../gcc.dg/i386-cpuid.h"
extern long lrintf (float);
extern void abort (void);
#define N 32
/* Runtime check for lrintf over an array of N floats: one loop stores
   lrintf (a[i]) into r[i], a second loop recomputes each value and calls
   abort on any mismatch.  Returns 0 when every element agrees.
   NOTE(review): presumably the first loop is the one expected to be
   vectorized under this test's dg-options -- confirm.  */
int __attribute__((noinline))
main1 ()
{
float a[N] = {0.4,3.5,6.6,9.4,12.5,15.6,18.4,21.5,24.6,27.4,30.5,33.6,36.4,39.5,42.6,45.4,0.5,3.6,6.4,9.5,12.6,15.4,18.5,21.6,24.4,27.5,30.6,33.4,36.5,39.6,42.4,45.5};
long r[N];
int i;
/* Fill r[] with the lrintf result of every input element.  */
for (i = 0; i < N; i++)
{
r[i] = lrintf (a[i]);
}
/* check results: */
for (i = 0; i < N; i++)
{
if (r[i] != lrintf (a[i]))
abort();
}
return 0;
}
/* Test driver: call main1 only when the value returned by i386_cpuid has
   the MMX, SSE, SSE2 and CMOV bits all set; otherwise return 0 so the
   test passes without running the check.  */
int
main ()
{
unsigned long cpu_facilities;
/* i386_cpuid is provided by the included i386-cpuid.h; presumably it
   returns the CPU feature bits -- confirm against that header.  */
cpu_facilities = i386_cpuid ();
/* Every one of the four feature bits must be present.  */
if ((cpu_facilities & (bit_MMX | bit_SSE | bit_SSE2 | bit_CMOV))
!= (bit_MMX | bit_SSE | bit_SSE2 | bit_CMOV))
/* If host has no vector support, pass. */
return 0;
/* main1 calls abort on failure, so its return value is not inspected.  */
main1 ();
return 0;
}

View File

@ -0,0 +1,30 @@
/* { dg-do compile } */
/* { dg-require-effective-target ilp32 } */
/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse2 -mfpmath=sse" } */
double a[256];
int b[256];
unsigned short c[256];
extern long lrint (double);
/* Compile-only vectorization candidate: store lrint of each of the 256
   doubles in a[] into the int array b[].  The dg-final directive of this
   test expects "cvtpd2dq" to appear in the generated assembly.  */
void foo(void)
{
int i;
/* The long result of lrint is assigned to an int element.  */
for (i=0; i<256; ++i)
b[i] = lrint (a[i]);
}
/* Same double -> int lrint loop as foo, with an additional update of the
   unsigned short array c[] in the same loop body.
   NOTE(review): presumably the extra statement is there to mix element
   sizes within one vectorized loop -- confirm against the PR.  */
void bar(void)
{
int i;
for (i=0; i<256; ++i)
{
b[i] = lrint (a[i]);
/* Doubles each unsigned short element in place.  */
c[i] += c[i];
}
}
/* { dg-final { scan-assembler "cvtpd2dq" } } */

View File

@ -2253,13 +2253,19 @@ vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
tree scalar_dest;
tree operation;
tree op, type;
tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE;
stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info;
tree vectype_out, vectype_in;
int nunits_in;
int nunits_out;
loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
tree fndecl, rhs, new_temp, def, def_stmt, rhs_type, lhs_type;
enum vect_def_type dt[2];
tree new_stmt;
int ncopies, j, nargs;
call_expr_arg_iterator iter;
tree vargs;
enum { NARROW, NONE, WIDEN } modifier;
if (!STMT_VINFO_RELEVANT_P (stmt_info))
return false;
@ -2291,12 +2297,10 @@ vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
nargs = 0;
FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
{
++nargs;
/* Bail out if the function has more than two arguments, we
do not have interesting builtin functions to vectorize with
more than two arguments. */
if (nargs > 2)
if (nargs >= 2)
return false;
/* We can only handle calls with arguments of the same type. */
@ -2309,12 +2313,14 @@ vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
}
rhs_type = TREE_TYPE (op);
if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[nargs-1]))
if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[nargs]))
{
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "use not simple.");
return false;
}
++nargs;
}
/* No arguments is also not good. */
@ -2322,15 +2328,20 @@ vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
return false;
vectype_in = get_vectype_for_scalar_type (rhs_type);
nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in);
lhs_type = TREE_TYPE (GIMPLE_STMT_OPERAND (stmt, 0));
vectype_out = get_vectype_for_scalar_type (lhs_type);
nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
/* Only handle the case of vectors with the same number of elements.
FIXME: We need a way to handle for example the SSE2 cvtpd2dq
instruction which converts V2DFmode to V4SImode but only
using the lower half of the V4SImode result. */
if (TYPE_VECTOR_SUBPARTS (vectype_in) != TYPE_VECTOR_SUBPARTS (vectype_out))
/* FORNOW */
if (nunits_in == nunits_out / 2)
modifier = NARROW;
else if (nunits_out == nunits_in)
modifier = NONE;
else if (nunits_out == nunits_in / 2)
modifier = WIDEN;
else
return false;
/* For now, we only vectorize functions if a target specific builtin
@ -2348,8 +2359,14 @@ vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS));
ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
/ TYPE_VECTOR_SUBPARTS (vectype_out));
if (modifier == NARROW)
ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out;
else
ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in;
/* Sanity check: make sure that at least one copy of the vectorized stmt
needs to be generated. */
gcc_assert (ncopies >= 1);
if (!vec_stmt) /* transformation not required. */
{
@ -2365,55 +2382,113 @@ vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt)
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "transform operation.");
gcc_assert (ncopies >= 1);
/* Handle def. */
scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0);
vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
prev_stmt_info = NULL;
for (j = 0; j < ncopies; ++j)
switch (modifier)
{
tree new_stmt, vargs;
tree vec_oprnd[2];
int n;
/* Build argument list for the vectorized call. */
/* FIXME: Rewrite this so that it doesn't construct a temporary
list. */
vargs = NULL_TREE;
n = -1;
FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
case NONE:
for (j = 0; j < ncopies; ++j)
{
++n;
/* Build argument list for the vectorized call. */
/* FIXME: Rewrite this so that it doesn't
construct a temporary list. */
vargs = NULL_TREE;
nargs = 0;
FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
{
if (j == 0)
vec_oprnd0
= vect_get_vec_def_for_operand (op, stmt, NULL);
else
vec_oprnd0
= vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
vargs = tree_cons (NULL_TREE, vec_oprnd0, vargs);
++nargs;
}
vargs = nreverse (vargs);
rhs = build_function_call_expr (fndecl, vargs);
new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
new_temp = make_ssa_name (vec_dest, new_stmt);
GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
vect_finish_stmt_generation (stmt, new_stmt, bsi);
if (j == 0)
vec_oprnd[n] = vect_get_vec_def_for_operand (op, stmt, NULL);
STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
else
vec_oprnd[n] = vect_get_vec_def_for_stmt_copy (dt[n], vec_oprnd[n]);
STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
vargs = tree_cons (NULL_TREE, vec_oprnd[n], vargs);
prev_stmt_info = vinfo_for_stmt (new_stmt);
}
vargs = nreverse (vargs);
rhs = build_function_call_expr (fndecl, vargs);
new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
new_temp = make_ssa_name (vec_dest, new_stmt);
GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
break;
vect_finish_stmt_generation (stmt, new_stmt, bsi);
case NARROW:
for (j = 0; j < ncopies; ++j)
{
/* Build argument list for the vectorized call. */
/* FIXME: Rewrite this so that it doesn't
construct a temporary list. */
vargs = NULL_TREE;
nargs = 0;
FOR_EACH_CALL_EXPR_ARG (op, iter, operation)
{
if (j == 0)
{
vec_oprnd0
= vect_get_vec_def_for_operand (op, stmt, NULL);
vec_oprnd1
= vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
}
else
{
vec_oprnd0
= vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd1);
vec_oprnd1
= vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0);
}
if (j == 0)
STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
else
STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
prev_stmt_info = vinfo_for_stmt (new_stmt);
vargs = tree_cons (NULL_TREE, vec_oprnd0, vargs);
vargs = tree_cons (NULL_TREE, vec_oprnd1, vargs);
++nargs;
}
vargs = nreverse (vargs);
rhs = build_function_call_expr (fndecl, vargs);
new_stmt = build_gimple_modify_stmt (vec_dest, rhs);
new_temp = make_ssa_name (vec_dest, new_stmt);
GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp;
vect_finish_stmt_generation (stmt, new_stmt, bsi);
if (j == 0)
STMT_VINFO_VEC_STMT (stmt_info) = new_stmt;
else
STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
prev_stmt_info = vinfo_for_stmt (new_stmt);
}
*vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
break;
case WIDEN:
/* No current target implements this case. */
return false;
}
/* The call in STMT might prevent it from being removed in dce. We however
cannot remove it here, due to the way the ssa name it defines is mapped
to the new definition. So just replace rhs of the statement with something
harmless. */
/* The call in STMT might prevent it from being removed in dce.
We however cannot remove it here, due to the way the ssa name
it defines is mapped to the new definition. So just replace
rhs of the statement with something harmless. */
type = TREE_TYPE (scalar_dest);
GIMPLE_STMT_OPERAND (stmt, 1) = fold_convert (type, integer_zero_node);
update_stmt (stmt);