From b40c4f68160a4b556135185e8b5e013ab77ba2ec Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Fri, 29 Jun 2007 12:30:06 +0200 Subject: [PATCH] re PR tree-optimization/24659 (Conversions are not vectorized) PR tree-optimization/24659 * tree-vect-transform.c (vectorizable_call): Handle (nunits_in == nunits_out / 2) and (nunits_out == nunits_in / 2) cases. * config/i386/sse.md (vec_pack_sfix_v2df): New expander. * config/i386/i386.c (enum ix86_builtins) [IX86_BUILTIN_VEC_PACK_SFIX]: New constant. (struct bdesc_2arg) [__builtin_ia32_vec_pack_sfix]: New builtin description. (ix86_init_mmx_sse_builtins): Define all builtins with 2 arguments as const using def_builtin_const. (ix86_expand_binop_builtin): Remove bogus assert() that insn wants input operands in the same modes as the result. (ix86_builtin_vectorized_function): Handle BUILT_IN_LRINT. testsuite/ChangeLog: PR tree-optimization/24659 * gcc.target/i386/vectorize2.c: New test. * gcc.target/i386/sse2-lrint-vec.c: New runtime test. * gcc.target/i386/sse2-lrintf-vec.c: Ditto. From-SVN: r126111 --- gcc/ChangeLog | 19 ++- gcc/config/i386/i386.c | 23 ++- gcc/config/i386/sse.md | 20 +++ gcc/testsuite/ChangeLog | 7 + .../gcc.target/i386/sse2-lrint-vec.c | 48 ++++++ .../gcc.target/i386/sse2-lrintf-vec.c | 48 ++++++ gcc/testsuite/gcc.target/i386/vectorize2.c | 30 ++++ gcc/tree-vect-transform.c | 161 +++++++++++++----- 8 files changed, 306 insertions(+), 50 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/sse2-lrint-vec.c create mode 100644 gcc/testsuite/gcc.target/i386/sse2-lrintf-vec.c create mode 100644 gcc/testsuite/gcc.target/i386/vectorize2.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 2629c90f552..082dfa8b5f1 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,20 @@ +2007-06-29 Uros Bizjak + + PR tree-optimization/24659 + * tree-vect-transform.c (vectorizable_call): Handle + (nunits_in == nunits_out / 2) and (nunits_out == nunits_in / 2) cases. + + * config/i386/sse.md (vec_pack_sfix_v2df): New expander. + * config/i386/i386.c (enum ix86_builtins) + [IX86_BUILTIN_VEC_PACK_SFIX]: New constant. + (struct bdesc_2arg) [__builtin_ia32_vec_pack_sfix]: New builtin + description. + (ix86_init_mmx_sse_builtins): Define all builtins with 2 arguments as + const using def_builtin_const. + (ix86_expand_binop_builtin): Remove bogus assert() that insn wants + input operands in the same modes as the result. + (ix86_builtin_vectorized_function): Handle BUILT_IN_LRINT. + 2007-06-29 Richard Sandiford * df-problems.c (df_set_unused_notes_for_mw): Fix formatting. @@ -264,7 +281,7 @@ * gimplify.c (mark_addressable): New function. (gimplify_modify_expr_rhs, gimplify_addr_expr, gimplify_expr): Use it. -2007-06-19 Uros Bizjak +2007-06-22 Uros Bizjak PR middle-end/32374 * expr.c (store_constructor): Do not clobber non-zeroed memory. diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 5c5cb528858..96c948f6dea 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -16820,6 +16820,8 @@ enum ix86_builtins IX86_BUILTIN_VEC_SET_V4HI, IX86_BUILTIN_VEC_SET_V16QI, + IX86_BUILTIN_VEC_PACK_SFIX, + /* SSE4.2. */ IX86_BUILTIN_CRC32QI, IX86_BUILTIN_CRC32HI, @@ -17167,6 +17169,8 @@ static const struct builtin_description bdesc_2arg[] = { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_unpckhpd, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, UNKNOWN, 0 }, { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_unpcklpd, "__builtin_ia32_unpcklpd", IX86_BUILTIN_UNPCKLPD, UNKNOWN, 0 }, + { OPTION_MASK_ISA_SSE2, CODE_FOR_vec_pack_sfix_v2df, "__builtin_ia32_vec_pack_sfix", IX86_BUILTIN_VEC_PACK_SFIX, UNKNOWN, 0 }, + /* SSE2 MMX */ { OPTION_MASK_ISA_SSE2, CODE_FOR_addv16qi3, "__builtin_ia32_paddb128", IX86_BUILTIN_PADDB128, UNKNOWN, 0 }, { OPTION_MASK_ISA_SSE2, CODE_FOR_addv8hi3, "__builtin_ia32_paddw128", IX86_BUILTIN_PADDW128, UNKNOWN, 0 }, @@ -17563,6 +17567,9 @@ ix86_init_mmx_sse_builtins (void) = build_function_type_list (V2DF_type_node, V4SI_type_node, NULL_TREE); tree v4si_ftype_v2df = build_function_type_list (V4SI_type_node, V2DF_type_node, NULL_TREE); + tree v4si_ftype_v2df_v2df + = build_function_type_list (V4SI_type_node, + V2DF_type_node, V2DF_type_node, NULL_TREE); tree v2si_ftype_v2df = build_function_type_list (V2SI_type_node, V2DF_type_node, NULL_TREE); tree v4sf_ftype_v2df @@ -17906,7 +17913,10 @@ ix86_init_mmx_sse_builtins (void) || d->icode == CODE_FOR_sse2_vmmaskcmpv2df3) type = v2di_ftype_v2df_v2df; - def_builtin (d->mask, d->name, type, d->code); + if (d->icode == CODE_FOR_vec_pack_sfix_v2df) + type = v4si_ftype_v2df_v2df; + + def_builtin_const (d->mask, d->name, type, d->code); } /* Add all builtins that are more or less simple operations on 1 operand. */ @@ -18457,11 +18467,6 @@ ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target) op1 = gen_lowpart (TImode, x); } - /* The insn must want input operands in the same modes as the - result. */ - gcc_assert ((GET_MODE (op0) == mode0 || GET_MODE (op0) == VOIDmode) - && (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)); - if (!(*insn_data[icode].operand[1].predicate) (op0, mode0)) op0 = copy_to_mode_reg (mode0, op0); if (!(*insn_data[icode].operand[2].predicate) (op1, mode1)) @@ -19863,6 +19868,12 @@ ix86_builtin_vectorized_function (unsigned int fn, tree type_out, return ix86_builtins[IX86_BUILTIN_SQRTPS]; return NULL_TREE; + case BUILT_IN_LRINT: + if (out_mode == SImode && out_n == 4 + && in_mode == DFmode && in_n == 2) + return ix86_builtins[IX86_BUILTIN_VEC_PACK_SFIX]; + return NULL_TREE; + case BUILT_IN_LRINTF: if (out_mode == SImode && out_n == 4 && in_mode == SFmode && in_n == 4) diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 65abbcf3b69..12b8cc832c8 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -2421,6 +2421,26 @@ DONE; }) +(define_expand "vec_pack_sfix_v2df" + [(match_operand:V4SI 0 "register_operand" "") + (match_operand:V2DF 1 "nonimmediate_operand" "") + (match_operand:V2DF 2 "nonimmediate_operand" "")] + "TARGET_SSE2" +{ + rtx r1, r2; + + r1 = gen_reg_rtx (V4SImode); + r2 = gen_reg_rtx (V4SImode); + + emit_insn (gen_sse2_cvtpd2dq (r1, operands[1])); + emit_insn (gen_sse2_cvtpd2dq (r2, operands[2])); + emit_insn (gen_sse2_punpcklqdq (gen_lowpart (V2DImode, operands[0]), + gen_lowpart (V2DImode, r1), + gen_lowpart (V2DImode, r2))); + DONE; +}) + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ;; Parallel double-precision floating point element swizzling diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index d023d472c98..2ac00bf8914 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,10 @@ +2007-06-29 Uros Bizjak + + PR tree-optimization/24659 + * gcc.target/i386/vectorize2.c: New test. + * gcc.target/i386/sse2-lrint-vec.c: New runtime test. + * gcc.target/i386/sse2-lrintf-vec.c: Ditto. + 2007-06-29 Eric Botcazou * gcc.dg/pointer-arith-9.c: New test. diff --git a/gcc/testsuite/gcc.target/i386/sse2-lrint-vec.c b/gcc/testsuite/gcc.target/i386/sse2-lrint-vec.c new file mode 100644 index 00000000000..35a7ac8dbb0 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse2-lrint-vec.c @@ -0,0 +1,48 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse2" } */ + +#include "../../gcc.dg/i386-cpuid.h" + +extern long lrint (double); +extern void abort (void); + +#define N 32 + +int __attribute__((noinline)) +main1 () +{ + double a[N] = {0.4,3.5,6.6,9.4,12.5,15.6,18.4,21.5,24.6,27.4,30.5,33.6,36.4,39.5,42.6,45.4,0.5,3.6,6.4,9.5,12.6,15.4,18.5,21.6,24.4,27.5,30.6,33.4,36.5,39.6,42.4,45.5}; + long r[N]; + + int i; + + for (i = 0; i < N; i++) + { + r[i] = lrint (a[i]); + } + + /* check results: */ + for (i = 0; i < N; i++) + { + if (r[i] != lrint (a[i])) + abort(); + } + + return 0; +} + +int +main () +{ + unsigned long cpu_facilities; + + cpu_facilities = i386_cpuid (); + + if ((cpu_facilities & (bit_MMX | bit_SSE | bit_SSE2 | bit_CMOV)) + != (bit_MMX | bit_SSE | bit_SSE2 | bit_CMOV)) + /* If host has no vector support, pass. */ + return 0; + + main1 (); + return 0; +} diff --git a/gcc/testsuite/gcc.target/i386/sse2-lrintf-vec.c b/gcc/testsuite/gcc.target/i386/sse2-lrintf-vec.c new file mode 100644 index 00000000000..dc4ae93b7a2 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse2-lrintf-vec.c @@ -0,0 +1,48 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse2" } */ + +#include "../../gcc.dg/i386-cpuid.h" + +extern long lrintf (float); +extern void abort (void); + +#define N 32 + +int __attribute__((noinline)) +main1 () +{ + float a[N] = {0.4,3.5,6.6,9.4,12.5,15.6,18.4,21.5,24.6,27.4,30.5,33.6,36.4,39.5,42.6,45.4,0.5,3.6,6.4,9.5,12.6,15.4,18.5,21.6,24.4,27.5,30.6,33.4,36.5,39.6,42.4,45.5}; + long r[N]; + + int i; + + for (i = 0; i < N; i++) + { + r[i] = lrintf (a[i]); + } + + /* check results: */ + for (i = 0; i < N; i++) + { + if (r[i] != lrintf (a[i])) + abort(); + } + + return 0; +} + +int +main () +{ + unsigned long cpu_facilities; + + cpu_facilities = i386_cpuid (); + + if ((cpu_facilities & (bit_MMX | bit_SSE | bit_SSE2 | bit_CMOV)) + != (bit_MMX | bit_SSE | bit_SSE2 | bit_CMOV)) + /* If host has no vector support, pass. */ + return 0; + + main1 (); + return 0; +} diff --git a/gcc/testsuite/gcc.target/i386/vectorize2.c b/gcc/testsuite/gcc.target/i386/vectorize2.c new file mode 100644 index 00000000000..41964871959 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/vectorize2.c @@ -0,0 +1,30 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target ilp32 } */ +/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse2 -mfpmath=sse" } */ + +double a[256]; +int b[256]; +unsigned short c[256]; + +extern long lrint (double); + +void foo(void) +{ + int i; + + for (i=0; i<256; ++i) + b[i] = lrint (a[i]); +} + +void bar(void) +{ + int i; + + for (i=0; i<256; ++i) + { + b[i] = lrint (a[i]); + c[i] += c[i]; + } +} + +/* { dg-final { scan-assembler "cvtpd2dq" } } */ diff --git a/gcc/tree-vect-transform.c b/gcc/tree-vect-transform.c index 66228f006f1..5fdbbe122be 100644 --- a/gcc/tree-vect-transform.c +++ b/gcc/tree-vect-transform.c @@ -2253,13 +2253,19 @@ vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) tree scalar_dest; tree operation; tree op, type; + tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE; stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info; tree vectype_out, vectype_in; + int nunits_in; + int nunits_out; loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); tree fndecl, rhs, new_temp, def, def_stmt, rhs_type, lhs_type; enum vect_def_type dt[2]; + tree new_stmt; int ncopies, j, nargs; call_expr_arg_iterator iter; + tree vargs; + enum { NARROW, NONE, WIDEN } modifier; if (!STMT_VINFO_RELEVANT_P (stmt_info)) return false; @@ -2291,12 +2297,10 @@ vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) nargs = 0; FOR_EACH_CALL_EXPR_ARG (op, iter, operation) { - ++nargs; - /* Bail out if the function has more than two arguments, we do not have interesting builtin functions to vectorize with more than two arguments. */ - if (nargs > 2) + if (nargs >= 2) return false; /* We can only handle calls with arguments of the same type. */ @@ -2309,12 +2313,14 @@ vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) } rhs_type = TREE_TYPE (op); - if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[nargs-1])) + if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[nargs])) { if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "use not simple."); return false; } + + ++nargs; } /* No arguments is also not good. */ @@ -2322,15 +2328,20 @@ vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) return false; vectype_in = get_vectype_for_scalar_type (rhs_type); + nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in); lhs_type = TREE_TYPE (GIMPLE_STMT_OPERAND (stmt, 0)); vectype_out = get_vectype_for_scalar_type (lhs_type); + nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out); - /* Only handle the case of vectors with the same number of elements. - FIXME: We need a way to handle for example the SSE2 cvtpd2dq - instruction which converts V2DFmode to V4SImode but only - using the lower half of the V4SImode result. */ - if (TYPE_VECTOR_SUBPARTS (vectype_in) != TYPE_VECTOR_SUBPARTS (vectype_out)) + /* FORNOW */ + if (nunits_in == nunits_out / 2) + modifier = NARROW; + else if (nunits_out == nunits_in) + modifier = NONE; + else if (nunits_out == nunits_in / 2) + modifier = WIDEN; + else return false; /* For now, we only vectorize functions if a target specific builtin @@ -2348,8 +2359,14 @@ vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS)); - ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo) - / TYPE_VECTOR_SUBPARTS (vectype_out)); + if (modifier == NARROW) + ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out; + else + ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in; + + /* Sanity check: make sure that at least one copy of the vectorized stmt + needs to be generated. */ + gcc_assert (ncopies >= 1); if (!vec_stmt) /* transformation not required. */ { @@ -2365,55 +2382,113 @@ vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "transform operation."); - gcc_assert (ncopies >= 1); - /* Handle def. */ scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0); vec_dest = vect_create_destination_var (scalar_dest, vectype_out); prev_stmt_info = NULL; - for (j = 0; j < ncopies; ++j) + switch (modifier) { - tree new_stmt, vargs; - tree vec_oprnd[2]; - int n; - - /* Build argument list for the vectorized call. */ - /* FIXME: Rewrite this so that it doesn't construct a temporary - list. */ - vargs = NULL_TREE; - n = -1; - FOR_EACH_CALL_EXPR_ARG (op, iter, operation) + case NONE: + for (j = 0; j < ncopies; ++j) { - ++n; + /* Build argument list for the vectorized call. */ + /* FIXME: Rewrite this so that it doesn't + construct a temporary list. */ + vargs = NULL_TREE; + nargs = 0; + FOR_EACH_CALL_EXPR_ARG (op, iter, operation) + { + if (j == 0) + vec_oprnd0 + = vect_get_vec_def_for_operand (op, stmt, NULL); + else + vec_oprnd0 + = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0); + + vargs = tree_cons (NULL_TREE, vec_oprnd0, vargs); + + ++nargs; + } + vargs = nreverse (vargs); + + rhs = build_function_call_expr (fndecl, vargs); + new_stmt = build_gimple_modify_stmt (vec_dest, rhs); + new_temp = make_ssa_name (vec_dest, new_stmt); + GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp; + + vect_finish_stmt_generation (stmt, new_stmt, bsi); if (j == 0) - vec_oprnd[n] = vect_get_vec_def_for_operand (op, stmt, NULL); + STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt; else - vec_oprnd[n] = vect_get_vec_def_for_stmt_copy (dt[n], vec_oprnd[n]); + STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; - vargs = tree_cons (NULL_TREE, vec_oprnd[n], vargs); + prev_stmt_info = vinfo_for_stmt (new_stmt); } - vargs = nreverse (vargs); - rhs = build_function_call_expr (fndecl, vargs); - new_stmt = build_gimple_modify_stmt (vec_dest, rhs); - new_temp = make_ssa_name (vec_dest, new_stmt); - GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp; + break; - vect_finish_stmt_generation (stmt, new_stmt, bsi); + case NARROW: + for (j = 0; j < ncopies; ++j) + { + /* Build argument list for the vectorized call. */ + /* FIXME: Rewrite this so that it doesn't + construct a temporary list. */ + vargs = NULL_TREE; + nargs = 0; + FOR_EACH_CALL_EXPR_ARG (op, iter, operation) + { + if (j == 0) + { + vec_oprnd0 + = vect_get_vec_def_for_operand (op, stmt, NULL); + vec_oprnd1 + = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0); + } + else + { + vec_oprnd0 + = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd1); + vec_oprnd1 + = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0); + } - if (j == 0) - STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt; - else - STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; - prev_stmt_info = vinfo_for_stmt (new_stmt); + vargs = tree_cons (NULL_TREE, vec_oprnd0, vargs); + vargs = tree_cons (NULL_TREE, vec_oprnd1, vargs); + + ++nargs; + } + vargs = nreverse (vargs); + + rhs = build_function_call_expr (fndecl, vargs); + new_stmt = build_gimple_modify_stmt (vec_dest, rhs); + new_temp = make_ssa_name (vec_dest, new_stmt); + GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp; + + vect_finish_stmt_generation (stmt, new_stmt, bsi); + + if (j == 0) + STMT_VINFO_VEC_STMT (stmt_info) = new_stmt; + else + STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; + + prev_stmt_info = vinfo_for_stmt (new_stmt); + } + + *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info); + + break; + + case WIDEN: + /* No current target implements this case. */ + return false; } - /* The call in STMT might prevent it from being removed in dce. We however - cannot remove it here, due to the way the ssa name it defines is mapped - to the new definition. So just replace rhs of the statement with something - harmless. */ + /* The call in STMT might prevent it from being removed in dce. + We however cannot remove it here, due to the way the ssa name + it defines is mapped to the new definition. So just replace + rhs of the statement with something harmless. */ type = TREE_TYPE (scalar_dest); GIMPLE_STMT_OPERAND (stmt, 1) = fold_convert (type, integer_zero_node); update_stmt (stmt);