slp: Support optimizing load distribution

This introduces a post-processing step for the pattern matcher that flattens
permutes introduced by the complex multiplication patterns.

The blend is performed early so that SLP is not cancelled by the LOAD_LANES
permute.  This is a temporary workaround for the fact that loads are not CSEd
during SLP build, and it is required to produce efficient code.

gcc/ChangeLog:

	* tree-vect-slp.c (optimize_load_redistribution_1): New.
	(optimize_load_redistribution, vect_is_slp_load_node): New.
	(vect_match_slp_patterns): Use it.
commit 500600c784
parent fe70119531
Author: Tamar Christina
Date:   2021-01-14 20:50:57 +0000
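
As an illustration (not part of the commit), the kind of input that reaches
this new code is a complex multiplication loop: once the SLP pattern matcher
recognizes it, the rewritten tree contains VEC_PERM nodes blending lanes of
the loads, and those are what optimize_load_redistribution folds back into
plain permuted loads.  A minimal sketch, assuming a target with complex
arithmetic instructions such as Armv8.3-A FCMLA:

/* Hypothetical example, not taken from the commit: a complex multiply
   whose SLP tree the pattern matcher rewrites, introducing the VEC_PERM
   blends that optimize_load_redistribution converts back into loads.  */
void
cmul (_Complex float *restrict c, _Complex float *restrict a,
      _Complex float *restrict b, int n)
{
  for (int i = 0; i < n; i++)
    c[i] = a[i] * b[i];
}

At the tree level, a VEC_PERM node whose children are all leaf load nodes
reading the same DR is rebuilt as a single load node via vect_build_slp_tree
(CSEd through bst_map), turning the lane permutation into a load permutation;
the permute is then elided later if it turns out to be unneeded.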

@@ -2258,6 +2258,123 @@ calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
   return exact_div (common_multiple (nunits, group_size), group_size);
 }
 
+/* Helper that checks to see if a node is a load node.  */
+
+static inline bool
+vect_is_slp_load_node (slp_tree root)
+{
+  return SLP_TREE_DEF_TYPE (root) == vect_internal_def
+         && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
+         && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
+}
+
+/* Helper function of optimize_load_redistribution that performs the operation
+   recursively.  */
+
+static slp_tree
+optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
+                                vec_info *vinfo, unsigned int group_size,
+                                hash_map<slp_tree, slp_tree> *load_map,
+                                slp_tree root)
+{
+  if (slp_tree *leader = load_map->get (root))
+    return *leader;
+
+  load_map->put (root, NULL);
+
+  slp_tree node;
+  unsigned i;
+
+  /* For now, we don't know anything about externals so do not do anything.  */
+  if (SLP_TREE_DEF_TYPE (root) != vect_internal_def)
+    return NULL;
+  else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
+    {
+      /* First convert this node into a load node and add it to the leaves
+	 list and flatten the permute from a lane to a load one.  If it's
+	 unneeded it will be elided later.  */
+      vec<stmt_vec_info> stmts;
+      stmts.create (SLP_TREE_LANES (root));
+      lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
+      for (unsigned j = 0; j < lane_perm.length (); j++)
+	{
+	  std::pair<unsigned, unsigned> perm = lane_perm[j];
+	  node = SLP_TREE_CHILDREN (root)[perm.first];
+
+	  if (!vect_is_slp_load_node (node)
+	      || SLP_TREE_CHILDREN (node).exists ())
+	    {
+	      stmts.release ();
+	      goto next;
+	    }
+
+	  stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
+	}
+
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_NOTE, vect_location,
+			 "converting stmts on permute node %p\n", root);
+
+      bool *matches = XALLOCAVEC (bool, group_size);
+      poly_uint64 max_nunits = 1;
+      unsigned tree_size = 0, limit = 1;
+      node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
+				  matches, &limit, &tree_size, bst_map);
+      if (!node)
+	stmts.release ();
+
+      load_map->put (root, node);
+      return node;
+    }
+
+next:
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node)
+    {
+      slp_tree value
+	= optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
+					  node);
+      if (value)
+	{
+	  SLP_TREE_REF_COUNT (value)++;
+	  SLP_TREE_CHILDREN (root)[i] = value;
+	  vect_free_slp_tree (node);
+	}
+    }
+
+  return NULL;
+}
+
+/* Temporary workaround for loads not being CSEd during SLP build.  This
+   function will traverse the SLP tree rooted in ROOT for INSTANCE and find
+   VEC_PERM nodes that blend vectors from multiple nodes that all read from the
+   same DR such that the final operation is equal to a permuted load.  Such
+   NODES are then directly converted into LOADS themselves.  The nodes are
+   CSEd using BST_MAP.  */
+
+static void
+optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
+			      vec_info *vinfo, unsigned int group_size,
+			      hash_map<slp_tree, slp_tree> *load_map,
+			      slp_tree root)
+{
+  slp_tree node;
+  unsigned i;
+
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node)
+    {
+      slp_tree value
+	= optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
+					  node);
+      if (value)
+	{
+	  SLP_TREE_REF_COUNT (value)++;
+	  SLP_TREE_CHILDREN (root)[i] = value;
+	  vect_free_slp_tree (node);
+	}
+    }
+}
+
 /* Helper function of vect_match_slp_patterns.
 
    Attempts to match patterns against the slp tree rooted in REF_NODE using
@@ -2305,8 +2422,7 @@ vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
 static bool
 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
			  hash_set<slp_tree> *visited,
-			  slp_tree_to_load_perm_map_t *perm_cache,
-			  scalar_stmts_to_slp_tree_map_t * /* bst_map */)
+			  slp_tree_to_load_perm_map_t *perm_cache)
 {
   DUMP_VECT_SCOPE ("vect_match_slp_patterns");
   slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
@@ -2316,20 +2432,7 @@ vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
		      "Analyzing SLP tree %p for patterns\n",
		      SLP_INSTANCE_TREE (instance));
 
-  bool found_p
-    = vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, visited);
-
-  if (found_p)
-    {
-      if (dump_enabled_p ())
-	{
-	  dump_printf_loc (MSG_NOTE, vect_location,
-			   "Pattern matched SLP tree\n");
-	  vect_print_slp_graph (MSG_NOTE, vect_location, *ref_node);
-	}
-    }
-
-  return found_p;
+  return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, visited);
 }
 
 /* Analyze an SLP instance starting from a group of grouped stores.  Call
@@ -2768,10 +2871,25 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
 
       hash_set<slp_tree> visited_patterns;
       slp_tree_to_load_perm_map_t perm_cache;
+      hash_map<slp_tree, slp_tree> load_map;
 
       /* See if any patterns can be found in the SLP tree.  */
       FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
-	vect_match_slp_patterns (instance, vinfo, &visited_patterns, &perm_cache,
-				 bst_map);
+	if (vect_match_slp_patterns (instance, vinfo, &visited_patterns,
+				     &perm_cache))
+	  {
+	    slp_tree root = SLP_INSTANCE_TREE (instance);
+	    optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
+					  &load_map, root);
+	    if (dump_enabled_p ())
+	      {
+		dump_printf_loc (MSG_NOTE, vect_location,
+				 "Pattern matched SLP tree\n");
+		vect_print_slp_graph (MSG_NOTE, vect_location, root);
+	      }
+	  }
 
   /* The map keeps a reference on SLP nodes built, release that.  */
   for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();