From 1fa715db5490fb44668e0a37f9a5927d9030a50e Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Wed, 27 Nov 2019 08:56:23 +0000 Subject: [PATCH] re PR tree-optimization/92645 (Hand written vector code is 450 times slower when compiled with GCC compared to Clang) 2019-11-27 Richard Biener PR tree-optimization/92645 * tree-ssa-forwprop.c (simplify_vector_constructor): Handle CTORs with just a subset of the original vectors. * gcc.target/i386/pr92645-2.c: New testcase. * gcc.target/i386/pr92645-3.c: Likewise. From-SVN: r278758 --- gcc/ChangeLog | 6 + gcc/testsuite/ChangeLog | 6 + gcc/testsuite/gcc.target/i386/pr92645-2.c | 34 ++++++ gcc/testsuite/gcc.target/i386/pr92645-3.c | 37 ++++++ gcc/tree-ssa-forwprop.c | 138 ++++++++++++++-------- 5 files changed, 174 insertions(+), 47 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr92645-2.c create mode 100644 gcc/testsuite/gcc.target/i386/pr92645-3.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 1235b4411c2..78aa3ca8189 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,9 @@ +2019-11-27 Richard Biener + + PR tree-optimization/92645 + * tree-ssa-forwprop.c (simplify_vector_constructor): Handle + CTORs with just a subset of the original vectors. + 2019-11-27 Richard Biener PR middle-end/92674 diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 1b56973b06e..df74ba61243 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,9 @@ +2019-11-27 Richard Biener + + PR tree-optimization/92645 + * gcc.target/i386/pr92645-2.c: New testcase. + * gcc.target/i386/pr92645-3.c: Likewise. + 2019-11-26 Paolo Carlini * g++.dg/cpp1z/bool-increment1.C: Test location(s) too. 
diff --git a/gcc/testsuite/gcc.target/i386/pr92645-2.c b/gcc/testsuite/gcc.target/i386/pr92645-2.c new file mode 100644 index 00000000000..d34ed3aa8e5 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr92645-2.c @@ -0,0 +1,34 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse2 -fdump-tree-cddce1" } */ + +typedef int v4si __attribute__((vector_size(16))); +typedef int v2si __attribute__((vector_size(8))); + +void low (v2si *dst, v4si *srcp) +{ + v4si src = *srcp; + *dst = (v2si) { src[0], src[1] }; +} + +void high (v2si *dst, v4si *srcp) +{ + v4si src = *srcp; + *dst = (v2si) { src[2], src[3] }; +} + +void even (v2si *dst, v4si *srcp) +{ + v4si src = *srcp; + *dst = (v2si) { src[0], src[2] }; +} + +void odd (v2si *dst, v4si *srcp) +{ + v4si src = *srcp; + *dst = (v2si) { src[1], src[3] }; +} + +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "cddce1" } } */ +/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 3 "cddce1" } } */ +/* Ideally highpart extraction would elide the permutation as well. 
*/ +/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 2 "cddce1" { xfail *-*-* } } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr92645-3.c b/gcc/testsuite/gcc.target/i386/pr92645-3.c new file mode 100644 index 00000000000..9c08c9fb632 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr92645-3.c @@ -0,0 +1,37 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx2 -fdump-tree-cddce1" } */ + +typedef int v8si __attribute__((vector_size(32))); +typedef float v4sf __attribute__((vector_size(16))); + +void low (v4sf *dst, v8si *srcp) +{ + v8si src = *srcp; + *dst = (v4sf) { src[0], src[1], src[2], src[3] }; +} + +void high (v4sf *dst, v8si *srcp) +{ + v8si src = *srcp; + *dst = (v4sf) { src[4], src[5], src[6], src[7] }; +} + +void even (v4sf *dst, v8si *srcp) +{ + v8si src = *srcp; + *dst = (v4sf) { src[0], src[2], src[4], src[6] }; +} + +void odd (v4sf *dst, v8si *srcp) +{ + v8si src = *srcp; + *dst = (v4sf) { src[1], src[3], src[5], src[7] }; +} + +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "cddce1" } } */ +/* Four conversions, on the smaller vector type, to not convert excess + elements. */ +/* { dg-final { scan-tree-dump-times " = \\\(vector\\\(4\\\) float\\\)" 4 "cddce1" } } */ +/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 3 "cddce1" } } */ +/* Ideally highpart extraction would elide the VEC_PERM_EXPR as well. 
*/ +/* { dg-final { scan-tree-dump-times "VEC_PERM_EXPR" 2 "cddce1" { xfail *-*-* } } } */ diff --git a/gcc/tree-ssa-forwprop.c b/gcc/tree-ssa-forwprop.c index 386406fbb38..7202d4ac554 100644 --- a/gcc/tree-ssa-forwprop.c +++ b/gcc/tree-ssa-forwprop.c @@ -2037,6 +2037,7 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) tree op, op2, orig[2], type, elem_type; unsigned elem_size, i; unsigned HOST_WIDE_INT nelts; + unsigned HOST_WIDE_INT refnelts; enum tree_code conv_code; constructor_elt *elt; bool maybe_ident; @@ -2052,7 +2053,6 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) elem_type = TREE_TYPE (type); elem_size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type)); - vec_perm_builder sel (nelts, nelts, 1); orig[0] = NULL; orig[1] = NULL; conv_code = ERROR_MARK; @@ -2061,6 +2061,7 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) tree one_nonconstant = NULL_TREE; auto_vec<tree> constants; constants.safe_grow_cleared (nelts); + auto_vec<std::pair<unsigned, unsigned>, 64> elts; FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (op), i, elt) { tree ref, op1; @@ -2079,7 +2080,8 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) TREE_TYPE (TREE_TYPE (ref))) && known_eq (bit_field_size (op1), elem_size) && constant_multiple_p (bit_field_offset (op1), - elem_size, &elem)) + elem_size, &elem) + && TYPE_VECTOR_SUBPARTS (TREE_TYPE (ref)).is_constant (&refnelts)) { unsigned int j; for (j = 0; j < 2; ++j) @@ -2098,11 +2100,9 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) if (j < 2) { orig[j] = ref; - if (j) - elem += nelts; - if (elem != i) + if (elem != i || j != 0) maybe_ident = false; - sel.quick_push (elem); + elts.safe_push (std::make_pair (j, elem)); continue; } /* Else fallthru. */ @@ -2131,28 +2131,41 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) else if (!operand_equal_p (one_nonconstant, elt->value, 0)) return false; } - sel.quick_push (i + nelts); + elts.safe_push (std::make_pair (1, i)); maybe_ident = false; } if (i < nelts) return false; if (! 
orig[0] - || ! VECTOR_TYPE_P (TREE_TYPE (orig[0])) - || maybe_ne (TYPE_VECTOR_SUBPARTS (type), - TYPE_VECTOR_SUBPARTS (TREE_TYPE (orig[0])))) - return false; - - tree tem; - if (conv_code != ERROR_MARK - && (! supportable_convert_operation (conv_code, type, - TREE_TYPE (orig[0]), - &tem, &conv_code) - || conv_code == CALL_EXPR)) + || ! VECTOR_TYPE_P (TREE_TYPE (orig[0]))) return false; + refnelts = TYPE_VECTOR_SUBPARTS (TREE_TYPE (orig[0])).to_constant (); if (maybe_ident) { + tree conv_src_type + = (nelts != refnelts + ? (conv_code != ERROR_MARK + ? build_vector_type (TREE_TYPE (TREE_TYPE (orig[0])), nelts) + : type) + : TREE_TYPE (orig[0])); + tree tem; + if (conv_code != ERROR_MARK + && (!supportable_convert_operation (conv_code, type, conv_src_type, + &tem, &conv_code) + || conv_code == CALL_EXPR)) + return false; + if (nelts != refnelts) + { + gassign *lowpart + = gimple_build_assign (make_ssa_name (conv_src_type), + build3 (BIT_FIELD_REF, conv_src_type, + orig[0], TYPE_SIZE (conv_src_type), + bitsize_zero_node)); + gsi_insert_before (gsi, lowpart, GSI_SAME_STMT); + orig[0] = gimple_assign_lhs (lowpart); + } if (conv_code == ERROR_MARK) gimple_assign_set_rhs_from_tree (gsi, orig[0]); else @@ -2161,34 +2174,66 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) } else { - tree mask_type; + tree mask_type, perm_type, conv_src_type; + if (orig[1] == error_mark_node && conv_code != ERROR_MARK) + { + /* ??? For subsetting a larger vector we need to permute the original + but then the constants are in the converted type already which is + why for that case we first convert and then permute. */ + if (nelts != refnelts) + return false; + conv_src_type = TREE_TYPE (orig[0]); + perm_type = type; + } + else + { + perm_type = TREE_TYPE (orig[0]); + conv_src_type = (nelts == refnelts + ? 
perm_type + : build_vector_type (TREE_TYPE (perm_type), nelts)); + } + tree tem; + if (conv_code != ERROR_MARK + && (!supportable_convert_operation (conv_code, type, conv_src_type, + &tem, &conv_code) + || conv_code == CALL_EXPR)) + return false; - vec_perm_indices indices (sel, orig[1] ? 2 : 1, nelts); - if (!can_vec_perm_const_p (TYPE_MODE (type), indices)) + /* Now that we know the number of elements of the source build the + permute vector. */ + vec_perm_builder sel (refnelts, refnelts, 1); + for (i = 0; i < elts.length (); ++i) + sel.quick_push (elts[i].second + elts[i].first * refnelts); + /* And fill the tail with "something". Those lanes are don't-care, + and ideally we'd allow VEC_PERM to have a smaller destination + vector. */ + for (; i < refnelts; ++i) + sel.quick_push (i - elts.length ()); + vec_perm_indices indices (sel, orig[1] ? 2 : 1, refnelts); + if (!can_vec_perm_const_p (TYPE_MODE (perm_type), indices)) return false; mask_type = build_vector_type (build_nonstandard_integer_type (elem_size, 1), - nelts); + refnelts); if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_VECTOR_INT || maybe_ne (GET_MODE_SIZE (TYPE_MODE (mask_type)), - GET_MODE_SIZE (TYPE_MODE (type)))) + GET_MODE_SIZE (TYPE_MODE (perm_type)))) return false; op2 = vec_perm_indices_to_tree (mask_type, indices); bool convert_orig0 = false; + gimple_seq stmts = NULL; if (!orig[1]) orig[1] = orig[0]; else if (orig[1] == error_mark_node && one_nonconstant) { - gimple_seq seq = NULL; - orig[1] = gimple_build_vector_from_val (&seq, UNKNOWN_LOCATION, - type, one_nonconstant); - gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT); - convert_orig0 = true; + orig[1] = gimple_build_vector_from_val (&stmts, UNKNOWN_LOCATION, + perm_type, one_nonconstant); + convert_orig0 = conv_code != ERROR_MARK; } else if (orig[1] == error_mark_node) { - tree_vector_builder vec (perm_type, nelts, 1); + tree_vector_builder vec (perm_type, nelts, 1); for (unsigned i = 0; i < nelts; ++i) if (constants[i]) vec.quick_push 
(constants[i]); @@ -2196,30 +2241,29 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) /* ??? Push a don't-care value. */ vec.quick_push (one_constant); orig[1] = vec.build (); - convert_orig0 = true; + convert_orig0 = conv_code != ERROR_MARK; } - if (conv_code == ERROR_MARK) - gimple_assign_set_rhs_with_ops (gsi, VEC_PERM_EXPR, orig[0], - orig[1], op2); - else if (convert_orig0) + tree res; + if (convert_orig0) { - gimple *conv - = gimple_build_assign (make_ssa_name (type), conv_code, orig[0]); - orig[0] = gimple_assign_lhs (conv); - gsi_insert_before (gsi, conv, GSI_SAME_STMT); - gimple_assign_set_rhs_with_ops (gsi, VEC_PERM_EXPR, - orig[0], orig[1], op2); + gcc_assert (nelts == refnelts); + res = gimple_build (&stmts, conv_code, type, orig[0]); + res = gimple_build (&stmts, VEC_PERM_EXPR, perm_type, + res, orig[1], op2); } else { - gimple *perm - = gimple_build_assign (make_ssa_name (TREE_TYPE (orig[0])), - VEC_PERM_EXPR, orig[0], orig[1], op2); - orig[0] = gimple_assign_lhs (perm); - gsi_insert_before (gsi, perm, GSI_SAME_STMT); - gimple_assign_set_rhs_with_ops (gsi, conv_code, orig[0], - NULL_TREE, NULL_TREE); + res = gimple_build (&stmts, VEC_PERM_EXPR, perm_type, + orig[0], orig[1], op2); + if (nelts != refnelts) + res = gimple_build (&stmts, BIT_FIELD_REF, + conv_code != ERROR_MARK ? conv_src_type : type, + res, TYPE_SIZE (type), bitsize_zero_node); + if (conv_code != ERROR_MARK) + res = gimple_build (&stmts, conv_code, type, res); } + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); + gimple_assign_set_rhs_with_ops (gsi, SSA_NAME, res); } update_stmt (gsi_stmt (*gsi)); return true;