diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index 8baa3b76601..6653e9e2142 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -1513,6 +1513,7 @@ OBJS = \
omp-general.o \
omp-low.o \
omp-oacc-kernels-decompose.o \
+ omp-oacc-neuter-broadcast.o \
omp-simd-clone.o \
opt-problem.o \
optabs.o \
diff --git a/gcc/config/gcn/gcn-protos.h b/gcc/config/gcn/gcn-protos.h
index 8bd0b434a84..5d62a845bec 100644
--- a/gcc/config/gcn/gcn-protos.h
+++ b/gcc/config/gcn/gcn-protos.h
@@ -38,9 +38,10 @@ extern rtx gcn_full_exec ();
extern rtx gcn_full_exec_reg ();
extern rtx gcn_gen_undef (machine_mode);
extern bool gcn_global_address_p (rtx);
-extern tree gcn_goacc_adjust_propagation_record (tree record_type, bool sender,
- const char *name);
extern tree gcn_goacc_adjust_private_decl (location_t, tree var, int level);
+extern tree gcn_goacc_create_worker_broadcast_record (tree record_type,
+ bool sender,
+ const char *name);
extern void gcn_goacc_reduction (gcall *call);
extern bool gcn_hard_regno_rename_ok (unsigned int from_reg,
unsigned int to_reg);
diff --git a/gcc/config/gcn/gcn-tree.c b/gcc/config/gcn/gcn-tree.c
index 1eb8882d4bf..f722d2d3c4e 100644
--- a/gcc/config/gcn/gcn-tree.c
+++ b/gcc/config/gcn/gcn-tree.c
@@ -548,35 +548,6 @@ gcn_goacc_reduction (gcall *call)
}
}
-/* Implement TARGET_GOACC_ADJUST_PROPAGATION_RECORD.
-
- Tweak (worker) propagation record, e.g. to put it in shared memory. */
-
-tree
-gcn_goacc_adjust_propagation_record (tree record_type, bool sender,
- const char *name)
-{
- tree type = record_type;
-
- TYPE_ADDR_SPACE (type) = ADDR_SPACE_LDS;
-
- if (!sender)
- type = build_pointer_type (type);
-
- tree decl = create_tmp_var_raw (type, name);
-
- if (sender)
- {
- DECL_CONTEXT (decl) = NULL_TREE;
- TREE_STATIC (decl) = 1;
- }
-
- if (sender)
- varpool_node::finalize_decl (decl);
-
- return decl;
-}
-
tree
gcn_goacc_adjust_private_decl (location_t, tree var, int level)
{
@@ -604,4 +575,33 @@ gcn_goacc_adjust_private_decl (location_t, tree var, int level)
return var;
}
+/* Implement TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD.
+
+ Create an OpenACC worker state propagation record in shared memory. */
+
+tree
+gcn_goacc_create_worker_broadcast_record (tree record_type, bool sender,
+ const char *name)
+{
+ tree type = record_type;
+
+ TYPE_ADDR_SPACE (type) = ADDR_SPACE_LDS;
+
+ if (!sender)
+ type = build_pointer_type (type);
+
+ tree decl = create_tmp_var_raw (type, name);
+
+ if (sender)
+ {
+ DECL_CONTEXT (decl) = NULL_TREE;
+ TREE_STATIC (decl) = 1;
+ }
+
+ if (sender)
+ varpool_node::finalize_decl (decl);
+
+ return decl;
+}
+
/* }}} */
diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c
index d25c4e54e16..87af5d18f42 100644
--- a/gcc/config/gcn/gcn.c
+++ b/gcc/config/gcn/gcn.c
@@ -6513,11 +6513,11 @@ gcn_dwarf_register_span (rtx rtl)
#define TARGET_GIMPLIFY_VA_ARG_EXPR gcn_gimplify_va_arg_expr
#undef TARGET_OMP_DEVICE_KIND_ARCH_ISA
#define TARGET_OMP_DEVICE_KIND_ARCH_ISA gcn_omp_device_kind_arch_isa
-#undef TARGET_GOACC_ADJUST_PROPAGATION_RECORD
-#define TARGET_GOACC_ADJUST_PROPAGATION_RECORD \
- gcn_goacc_adjust_propagation_record
#undef TARGET_GOACC_ADJUST_PRIVATE_DECL
#define TARGET_GOACC_ADJUST_PRIVATE_DECL gcn_goacc_adjust_private_decl
+#undef TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD
+#define TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD \
+ gcn_goacc_create_worker_broadcast_record
#undef TARGET_GOACC_FORK_JOIN
#define TARGET_GOACC_FORK_JOIN gcn_fork_join
#undef TARGET_GOACC_REDUCTION
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index cb015283237..a30fdcbbf3d 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6409,6 +6409,15 @@ private variables at OpenACC device-lowering time using the
@code{TARGET_GOACC_ADJUST_PRIVATE_DECL} target hook.
@end deftypefn
+@deftypefn {Target Hook} tree TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD (tree @var{rec}, bool @var{sender}, const char *@var{name})
+Create a record used to propagate local-variable state from an active
+worker to other workers. A possible implementation might adjust the type
+of REC to place the new variable in shared GPU memory.
+
+Presence of this target hook indicates that middle-end
+neutering/broadcasting should be used.
+@end deftypefn
+
@node Anchored Addresses
@section Anchored Addresses
@cindex anchored addresses
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 4a522ae7e2e..611fc500ac8 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4223,6 +4223,8 @@ address; but often a machine-dependent strategy can generate better code.
@hook TARGET_GOACC_EXPAND_VAR_DECL
+@hook TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD
+
@node Anchored Addresses
@section Anchored Addresses
@cindex anchored addresses
diff --git a/gcc/omp-builtins.def b/gcc/omp-builtins.def
index 4a7e7badd7e..05b555c7fa0 100644
--- a/gcc/omp-builtins.def
+++ b/gcc/omp-builtins.def
@@ -59,6 +59,15 @@ DEF_GOACC_BUILTIN_ONLY (BUILT_IN_GOACC_PARLEVEL_ID, "goacc_parlevel_id",
DEF_GOACC_BUILTIN_ONLY (BUILT_IN_GOACC_PARLEVEL_SIZE, "goacc_parlevel_size",
BT_FN_INT_INT, ATTR_NOTHROW_LEAF_LIST)
+DEF_GOACC_BUILTIN_ONLY (BUILT_IN_GOACC_BARRIER, "GOACC_barrier",
+ BT_FN_VOID, ATTR_NOTHROW_LEAF_LIST)
+DEF_GOACC_BUILTIN_ONLY (BUILT_IN_GOACC_SINGLE_START, "GOACC_single_start",
+ BT_FN_BOOL, ATTR_NOTHROW_LEAF_LIST)
+DEF_GOACC_BUILTIN_ONLY (BUILT_IN_GOACC_SINGLE_COPY_START, "GOACC_single_copy_start",
+ BT_FN_PTR, ATTR_NOTHROW_LEAF_LIST)
+DEF_GOACC_BUILTIN_ONLY (BUILT_IN_GOACC_SINGLE_COPY_END, "GOACC_single_copy_end",
+ BT_FN_VOID_PTR, ATTR_NOTHROW_LEAF_LIST)
+
DEF_GOMP_BUILTIN (BUILT_IN_OMP_GET_THREAD_NUM, "omp_get_thread_num",
BT_FN_INT, ATTR_CONST_NOTHROW_LEAF_LIST)
DEF_GOMP_BUILTIN (BUILT_IN_OMP_GET_NUM_THREADS, "omp_get_num_threads",
diff --git a/gcc/omp-oacc-neuter-broadcast.cc b/gcc/omp-oacc-neuter-broadcast.cc
new file mode 100644
index 00000000000..0f6ba885c6c
--- /dev/null
+++ b/gcc/omp-oacc-neuter-broadcast.cc
@@ -0,0 +1,1515 @@
+/* OpenACC worker partitioning via middle-end neutering/broadcasting scheme
+
+ Copyright (C) 2015-2021 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 3, or (at your
+ option) any later version.
+
+ GCC is distributed in the hope that it will be useful, but WITHOUT
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
+ License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING3. If not see
+ <http://www.gnu.org/licenses/>. */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "rtl.h"
+#include "tree.h"
+#include "gimple.h"
+#include "tree-pass.h"
+#include "ssa.h"
+#include "cgraph.h"
+#include "pretty-print.h"
+#include "fold-const.h"
+#include "gimplify.h"
+#include "gimple-iterator.h"
+#include "gimple-walk.h"
+#include "tree-inline.h"
+#include "langhooks.h"
+#include "omp-general.h"
+#include "omp-low.h"
+#include "gimple-pretty-print.h"
+#include "cfghooks.h"
+#include "insn-config.h"
+#include "recog.h"
+#include "internal-fn.h"
+#include "bitmap.h"
+#include "tree-nested.h"
+#include "stor-layout.h"
+#include "tree-ssa-threadupdate.h"
+#include "tree-into-ssa.h"
+#include "splay-tree.h"
+#include "target.h"
+#include "cfgloop.h"
+#include "tree-cfg.h"
+#include "omp-offload.h"
+#include "attribs.h"
+
+/* Loop structure of the function. The entire function is described as
+ a NULL loop. */
+
+struct parallel_g
+{
+ /* Parent parallel. */
+ parallel_g *parent;
+
+ /* Next sibling parallel. */
+ parallel_g *next;
+
+ /* First child parallel. */
+ parallel_g *inner;
+
+ /* Partitioning mask of the parallel. */
+ unsigned mask;
+
+ /* Partitioning used within inner parallels. */
+ unsigned inner_mask;
+
+ /* Location of the parallel's forked and join blocks. The forked
+ block is the first block in the parallel and the join block is the
+ first block after the partition. */
+ basic_block forked_block;
+ basic_block join_block;
+
+ gimple *forked_stmt;
+ gimple *join_stmt;
+
+ gimple *fork_stmt;
+ gimple *joining_stmt;
+
+ /* Basic blocks in this parallel, but not in child parallels. The
+ FORKED and JOINING blocks are in the partition. The FORK and JOIN
+ blocks are not. */
+ auto_vec<basic_block> blocks;
+
+ tree record_type;
+ tree sender_decl;
+ tree receiver_decl;
+
+public:
+ parallel_g (parallel_g *parent, unsigned mode);
+ ~parallel_g ();
+};
+
+/* Constructor links the new parallel into its parent's chain of
+ children. */
+
+parallel_g::parallel_g (parallel_g *parent_, unsigned mask_)
+ :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
+{
+ forked_block = join_block = 0;
+ forked_stmt = join_stmt = NULL;
+ fork_stmt = joining_stmt = NULL;
+
+ record_type = NULL_TREE;
+ sender_decl = NULL_TREE;
+ receiver_decl = NULL_TREE;
+
+ if (parent)
+ {
+ next = parent->inner;
+ parent->inner = this;
+ }
+}
+
+parallel_g::~parallel_g ()
+{
+ delete inner;
+ delete next;
+}
+
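+/* Return true if DECL is based on a function-local variable: a non-global
+ VAR_DECL, or a component/bit-field/array reference into one. */
+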
+static bool
+local_var_based_p (tree decl)
+{
+ switch (TREE_CODE (decl))
+ {
+ case VAR_DECL:
+ return !is_global_var (decl);
+
+ case COMPONENT_REF:
+ case BIT_FIELD_REF:
+ case ARRAY_REF:
+ return local_var_based_p (TREE_OPERAND (decl, 0));
+
+ default:
+ return false;
+ }
+}
+
+/* Map of basic blocks to gimple stmts. */
+typedef hash_map<basic_block, gimple *> bb_stmt_map_t;
+
+/* Calls to OpenACC routines are made by all workers/wavefronts/warps,
+ since the routine likely contains partitioned loops (otherwise it
+ will do its own neutering and variable propagation). Return TRUE if
+ a function call CALL should instead be made in (worker) single mode,
+ rather than redundant mode. */
+
+static bool
+omp_sese_active_worker_call (gcall *call)
+{
+#define GOMP_DIM_SEQ GOMP_DIM_MAX
+ tree fndecl = gimple_call_fndecl (call);
+
+ if (!fndecl)
+ return true;
+
+ tree attrs = oacc_get_fn_attrib (fndecl);
+
+ if (!attrs)
+ return true;
+
+ int level = oacc_fn_attrib_level (attrs);
+
+ /* Neither regular functions nor "seq" routines should be run by all threads
+ in worker-single mode. */
+ return level == -1 || level == GOMP_DIM_SEQ;
+#undef GOMP_DIM_SEQ
+}
+
+/* Split basic blocks so that each fork or join marker is at the
+ start of its own basic block; thus, afterwards each block will have
+ a single partitioning mode. We do the same for return statements,
+ conditions, switches and (non-internal) calls, as these are executed
+ by every thread. Populate MAP with the statement heading each new
+ block. We also clear the BB visited flag, which is used when
+ finding partitions. */
+
+static void
+omp_sese_split_blocks (bb_stmt_map_t *map)
+{
+ auto_vec<gimple *> worklist;
+ basic_block block;
+
+ /* Locate all the reorg instructions of interest. */
+ FOR_ALL_BB_FN (block, cfun)
+ {
+ /* Clear visited flag, for use by the parallel locator. */
+ block->flags &= ~BB_VISITED;
+
+ for (gimple_stmt_iterator gsi = gsi_start_bb (block);
+ !gsi_end_p (gsi);
+ gsi_next (&gsi))
+ {
+ gimple *stmt = gsi_stmt (gsi);
+
+ if (gimple_call_internal_p (stmt, IFN_UNIQUE))
+ {
+ enum ifn_unique_kind k = ((enum ifn_unique_kind)
+ TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));
+
+ if (k == IFN_UNIQUE_OACC_JOIN)
+ worklist.safe_push (stmt);
+ else if (k == IFN_UNIQUE_OACC_FORK)
+ {
+ gcc_assert (gsi_one_before_end_p (gsi));
+ basic_block forked_block = single_succ (block);
+ gimple_stmt_iterator gsi2 = gsi_start_bb (forked_block);
+
+ /* We push a NOP as a placeholder for the "forked" stmt.
+ This is then recognized in omp_sese_find_par. */
+ gimple *nop = gimple_build_nop ();
+ gsi_insert_before (&gsi2, nop, GSI_SAME_STMT);
+
+ worklist.safe_push (nop);
+ }
+ }
+ else if (gimple_code (stmt) == GIMPLE_RETURN
+ || gimple_code (stmt) == GIMPLE_COND
+ || gimple_code (stmt) == GIMPLE_SWITCH
+ || (gimple_code (stmt) == GIMPLE_CALL
+ && !gimple_call_internal_p (stmt)
+ && !omp_sese_active_worker_call (as_a <gcall *> (stmt))))
+ worklist.safe_push (stmt);
+ else if (is_gimple_assign (stmt))
+ {
+ tree lhs = gimple_assign_lhs (stmt);
+
+ /* Force assignments to components/fields/elements of local
+ aggregates into fully-partitioned (redundant) mode. This
+ avoids having to broadcast the whole aggregate. The RHS of
+ the assignment will be propagated using the normal
+ mechanism. */
+
+ switch (TREE_CODE (lhs))
+ {
+ case COMPONENT_REF:
+ case BIT_FIELD_REF:
+ case ARRAY_REF:
+ {
+ tree aggr = TREE_OPERAND (lhs, 0);
+
+ if (local_var_based_p (aggr))
+ worklist.safe_push (stmt);
+ }
+ break;
+
+ default:
+ ;
+ }
+ }
+ }
+ }
+
+ /* Split blocks on the worklist. */
+ unsigned ix;
+ gimple *stmt;
+
+ for (ix = 0; worklist.iterate (ix, &stmt); ix++)
+ {
+ basic_block block = gimple_bb (stmt);
+
+ if (gimple_code (stmt) == GIMPLE_COND)
+ {
+ gcond *orig_cond = as_a <gcond *> (stmt);
+ tree_code code = gimple_expr_code (orig_cond);
+ tree pred = make_ssa_name (boolean_type_node);
+ gimple *asgn = gimple_build_assign (pred, code,
+ gimple_cond_lhs (orig_cond),
+ gimple_cond_rhs (orig_cond));
+ gcond *new_cond
+ = gimple_build_cond (NE_EXPR, pred, boolean_false_node,
+ gimple_cond_true_label (orig_cond),
+ gimple_cond_false_label (orig_cond));
+
+ gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
+ gsi_insert_before (&gsi, asgn, GSI_SAME_STMT);
+ gsi_replace (&gsi, new_cond, true);
+
+ edge e = split_block (block, asgn);
+ block = e->dest;
+ map->get_or_insert (block) = new_cond;
+ }
+ else if ((gimple_code (stmt) == GIMPLE_CALL
+ && !gimple_call_internal_p (stmt))
+ || is_gimple_assign (stmt))
+ {
+ gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
+ gsi_prev (&gsi);
+
+ edge call = split_block (block, gsi_stmt (gsi));
+
+ gimple *call_stmt = gsi_stmt (gsi_start_bb (call->dest));
+
+ edge call_to_ret = split_block (call->dest, call_stmt);
+
+ map->get_or_insert (call_to_ret->src) = call_stmt;
+ }
+ else
+ {
+ gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
+ gsi_prev (&gsi);
+
+ if (gsi_end_p (gsi))
+ map->get_or_insert (block) = stmt;
+ else
+ {
+ /* Split block before the statement. The statement is in the new block. */
+ edge e = split_block (block, gsi_stmt (gsi));
+
+ block = e->dest;
+ map->get_or_insert (block) = stmt;
+ }
+ }
+ }
+}
+
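+/* Return a descriptive name for partitioning mask MASK, for dump output. */
+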
+static const char *
+mask_name (unsigned mask)
+{
+ switch (mask)
+ {
+ case 0: return "gang redundant";
+ case 1: return "gang partitioned";
+ case 2: return "worker partitioned";
+ case 3: return "gang+worker partitioned";
+ case 4: return "vector partitioned";
+ case 5: return "gang+vector partitioned";
+ case 6: return "worker+vector partitioned";
+ case 7: return "fully partitioned";
+ default: return "";
+ }
+}
+
+/* Dump this parallel and all its inner parallels. */
+
+static void
+omp_sese_dump_pars (parallel_g *par, unsigned depth)
+{
+ fprintf (dump_file, "%u: mask %d (%s) head=%d, tail=%d\n",
+ depth, par->mask, mask_name (par->mask),
+ par->forked_block ? par->forked_block->index : -1,
+ par->join_block ? par->join_block->index : -1);
+
+ fprintf (dump_file, " blocks:");
+
+ basic_block block;
+ for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
+ fprintf (dump_file, " %d", block->index);
+ fprintf (dump_file, "\n");
+ if (par->inner)
+ omp_sese_dump_pars (par->inner, depth + 1);
+
+ if (par->next)
+ omp_sese_dump_pars (par->next, depth);
+}
+
+/* If BLOCK contains a fork/join marker, process it to create or
+ terminate a loop structure. Add this block to the current loop,
+ and then walk successor blocks. */
+
+static parallel_g *
+omp_sese_find_par (bb_stmt_map_t *map, parallel_g *par, basic_block block)
+{
+ if (block->flags & BB_VISITED)
+ return par;
+ block->flags |= BB_VISITED;
+
+ if (gimple **stmtp = map->get (block))
+ {
+ gimple *stmt = *stmtp;
+
+ if (gimple_code (stmt) == GIMPLE_COND
+ || gimple_code (stmt) == GIMPLE_SWITCH
+ || gimple_code (stmt) == GIMPLE_RETURN
+ || (gimple_code (stmt) == GIMPLE_CALL
+ && !gimple_call_internal_p (stmt))
+ || is_gimple_assign (stmt))
+ {
+ /* A single block that is forced to be at the maximum partition
+ level. Make a singleton par for it. */
+ par = new parallel_g (par, GOMP_DIM_MASK (GOMP_DIM_GANG)
+ | GOMP_DIM_MASK (GOMP_DIM_WORKER)
+ | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
+ par->forked_block = block;
+ par->forked_stmt = stmt;
+ par->blocks.safe_push (block);
+ par = par->parent;
+ goto walk_successors;
+ }
+ else if (gimple_nop_p (stmt))
+ {
+ basic_block pred = single_pred (block);
+ gcc_assert (pred);
+ gimple_stmt_iterator gsi = gsi_last_bb (pred);
+ gimple *final_stmt = gsi_stmt (gsi);
+
+ if (gimple_call_internal_p (final_stmt, IFN_UNIQUE))
+ {
+ gcall *call = as_a <gcall *> (final_stmt);
+ enum ifn_unique_kind k = ((enum ifn_unique_kind)
+ TREE_INT_CST_LOW (gimple_call_arg (call, 0)));
+
+ if (k == IFN_UNIQUE_OACC_FORK)
+ {
+ HOST_WIDE_INT dim
+ = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
+ unsigned mask = (dim >= 0) ? GOMP_DIM_MASK (dim) : 0;
+
+ par = new parallel_g (par, mask);
+ par->forked_block = block;
+ par->forked_stmt = final_stmt;
+ par->fork_stmt = stmt;
+ }
+ else
+ gcc_unreachable ();
+ }
+ else
+ gcc_unreachable ();
+ }
+ else if (gimple_call_internal_p (stmt, IFN_UNIQUE))
+ {
+ gcall *call = as_a <gcall *> (stmt);
+ enum ifn_unique_kind k = ((enum ifn_unique_kind)
+ TREE_INT_CST_LOW (gimple_call_arg (call, 0)));
+ if (k == IFN_UNIQUE_OACC_JOIN)
+ {
+ HOST_WIDE_INT dim = TREE_INT_CST_LOW (gimple_call_arg (stmt, 2));
+ unsigned mask = (dim >= 0) ? GOMP_DIM_MASK (dim) : 0;
+
+ gcc_assert (par->mask == mask);
+ par->join_block = block;
+ par->join_stmt = stmt;
+ par = par->parent;
+ }
+ else
+ gcc_unreachable ();
+ }
+ else
+ gcc_unreachable ();
+ }
+
+ if (par)
+ /* Add this block onto the current loop's list of blocks. */
+ par->blocks.safe_push (block);
+ else
+ /* This must be the entry block. Create a NULL parallel. */
+ par = new parallel_g (0, 0);
+
+walk_successors:
+ /* Walk successor blocks. */
+ edge e;
+ edge_iterator ei;
+
+ FOR_EACH_EDGE (e, ei, block->succs)
+ omp_sese_find_par (map, par, e->dest);
+
+ return par;
+}
+
+/* DFS walk the CFG looking for fork & join markers. Construct
+ loop structures as we go. MAP is a mapping of basic blocks
+ to head & tail markers, discovered when splitting blocks. This
+ speeds up the discovery. We rely on the BB visited flag having
+ been cleared when splitting blocks. */
+
+static parallel_g *
+omp_sese_discover_pars (bb_stmt_map_t *map)
+{
+ basic_block block;
+
+ /* Mark exit blocks as visited. */
+ block = EXIT_BLOCK_PTR_FOR_FN (cfun);
+ block->flags |= BB_VISITED;
+
+ /* And entry block as not. */
+ block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
+ block->flags &= ~BB_VISITED;
+
+ parallel_g *par = omp_sese_find_par (map, 0, block);
+
+ if (dump_file)
+ {
+ fprintf (dump_file, "\nLoops\n");
+ omp_sese_dump_pars (par, 0);
+ fprintf (dump_file, "\n");
+ }
+
+ return par;
+}
+
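+/* Walk the parallel structure PAR, and for each basic block record in
+ WORKER_SINGLE (resp. VECTOR_SINGLE) whether the block executes with the
+ worker (resp. vector) dimension still unpartitioned. */
+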
+static void
+populate_single_mode_bitmaps (parallel_g *par, bitmap worker_single,
+ bitmap vector_single, unsigned outer_mask,
+ int depth)
+{
+ unsigned mask = outer_mask | par->mask;
+
+ basic_block block;
+
+ for (unsigned i = 0; par->blocks.iterate (i, &block); i++)
+ {
+ if ((mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0)
+ bitmap_set_bit (worker_single, block->index);
+
+ if ((mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)) == 0)
+ bitmap_set_bit (vector_single, block->index);
+ }
+
+ if (par->inner)
+ populate_single_mode_bitmaps (par->inner, worker_single, vector_single,
+ mask, depth + 1);
+ if (par->next)
+ populate_single_mode_bitmaps (par->next, worker_single, vector_single,
+ outer_mask, depth);
+}
+
+/* A map from SSA names or var decls to record fields. */
+
+typedef hash_map<tree, tree> field_map_t;
+
+/* For each propagation record type, this is a map from SSA names or var decls
+ to propagate, to the field in the record type that should be used for
+ transmission and reception. */
+
+typedef hash_map<tree, field_map_t *> record_field_map_t;
+
+static GTY(()) record_field_map_t *field_map;
+
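+/* Add a field for VAR (an SSA name or a VAR_DECL) to the propagation record
+ RECORD_TYPE, and record the mapping from VAR to the new field. */
+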
+static void
+install_var_field (tree var, tree record_type)
+{
+ field_map_t *fields = *field_map->get (record_type);
+ tree name;
+ char tmp[20];
+
+ if (TREE_CODE (var) == SSA_NAME)
+ {
+ name = SSA_NAME_IDENTIFIER (var);
+ if (!name)
+ {
+ sprintf (tmp, "_%u", (unsigned) SSA_NAME_VERSION (var));
+ name = get_identifier (tmp);
+ }
+ }
+ else if (TREE_CODE (var) == VAR_DECL)
+ {
+ name = DECL_NAME (var);
+ if (!name)
+ {
+ sprintf (tmp, "D_%u", (unsigned) DECL_UID (var));
+ name = get_identifier (tmp);
+ }
+ }
+ else
+ gcc_unreachable ();
+
+ gcc_assert (!fields->get (var));
+
+ tree type = TREE_TYPE (var);
+
+ if (POINTER_TYPE_P (type)
+ && TYPE_RESTRICT (type))
+ type = build_qualified_type (type, TYPE_QUALS (type) & ~TYPE_QUAL_RESTRICT);
+
+ tree field = build_decl (BUILTINS_LOCATION, FIELD_DECL, name, type);
+
+ if (TREE_CODE (var) == VAR_DECL && type == TREE_TYPE (var))
+ {
+ SET_DECL_ALIGN (field, DECL_ALIGN (var));
+ DECL_USER_ALIGN (field) = DECL_USER_ALIGN (var);
+ TREE_THIS_VOLATILE (field) = TREE_THIS_VOLATILE (var);
+ }
+ else
+ SET_DECL_ALIGN (field, TYPE_ALIGN (type));
+
+ fields->put (var, field);
+
+ insert_field_into_struct (record_type, field);
+}
+
+/* Sets of SSA_NAMES or VAR_DECLs to propagate. */
+typedef hash_set<tree> propagation_set;
+
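+/* Find SSA names that are defined in a worker-single block but used in a
+ worker-partitioned block, and add each such name to the propagation set
+ of its defining block in PROP_SET. */
+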
+static void
+find_ssa_names_to_propagate (parallel_g *par, unsigned outer_mask,
+ bitmap worker_single, bitmap vector_single,
+ vec<propagation_set *> *prop_set)
+{
+ unsigned mask = outer_mask | par->mask;
+
+ if (par->inner)
+ find_ssa_names_to_propagate (par->inner, mask, worker_single,
+ vector_single, prop_set);
+ if (par->next)
+ find_ssa_names_to_propagate (par->next, outer_mask, worker_single,
+ vector_single, prop_set);
+
+ if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+ {
+ basic_block block;
+ int ix;
+
+ for (ix = 0; par->blocks.iterate (ix, &block); ix++)
+ {
+ for (gphi_iterator psi = gsi_start_phis (block);
+ !gsi_end_p (psi); gsi_next (&psi))
+ {
+ gphi *phi = psi.phi ();
+ use_operand_p use;
+ ssa_op_iter iter;
+
+ FOR_EACH_PHI_ARG (use, phi, iter, SSA_OP_USE)
+ {
+ tree var = USE_FROM_PTR (use);
+
+ if (TREE_CODE (var) != SSA_NAME)
+ continue;
+
+ gimple *def_stmt = SSA_NAME_DEF_STMT (var);
+
+ if (gimple_nop_p (def_stmt))
+ continue;
+
+ basic_block def_bb = gimple_bb (def_stmt);
+
+ if (bitmap_bit_p (worker_single, def_bb->index))
+ {
+ if (!(*prop_set)[def_bb->index])
+ (*prop_set)[def_bb->index] = new propagation_set;
+
+ propagation_set *ws_prop = (*prop_set)[def_bb->index];
+
+ ws_prop->add (var);
+ }
+ }
+ }
+
+ for (gimple_stmt_iterator gsi = gsi_start_bb (block);
+ !gsi_end_p (gsi); gsi_next (&gsi))
+ {
+ use_operand_p use;
+ ssa_op_iter iter;
+ gimple *stmt = gsi_stmt (gsi);
+
+ FOR_EACH_SSA_USE_OPERAND (use, stmt, iter, SSA_OP_USE)
+ {
+ tree var = USE_FROM_PTR (use);
+
+ gimple *def_stmt = SSA_NAME_DEF_STMT (var);
+
+ if (gimple_nop_p (def_stmt))
+ continue;
+
+ basic_block def_bb = gimple_bb (def_stmt);
+
+ if (bitmap_bit_p (worker_single, def_bb->index))
+ {
+ if (!(*prop_set)[def_bb->index])
+ (*prop_set)[def_bb->index] = new propagation_set;
+
+ propagation_set *ws_prop = (*prop_set)[def_bb->index];
+
+ ws_prop->add (var);
+ }
+ }
+ }
+ }
+ }
+}
+
+/* Callback for walk_gimple_stmt to find RHS VAR_DECLs (uses) in a
+ statement. */
+
+static tree
+find_partitioned_var_uses_1 (tree *node, int *, void *data)
+{
+ walk_stmt_info *wi = (walk_stmt_info *) data;
+ hash_set<tree> *partitioned_var_uses = (hash_set<tree> *) wi->info;
+
+ if (!wi->is_lhs && VAR_P (*node))
+ partitioned_var_uses->add (*node);
+
+ return NULL_TREE;
+}
+
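+/* Record in PARTITIONED_VAR_USES every VAR_DECL read in any
+ worker-partitioned block of PAR. */
+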
+static void
+find_partitioned_var_uses (parallel_g *par, unsigned outer_mask,
+ hash_set<tree> *partitioned_var_uses)
+{
+ unsigned mask = outer_mask | par->mask;
+
+ if (par->inner)
+ find_partitioned_var_uses (par->inner, mask, partitioned_var_uses);
+ if (par->next)
+ find_partitioned_var_uses (par->next, outer_mask, partitioned_var_uses);
+
+ if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+ {
+ basic_block block;
+ int ix;
+
+ for (ix = 0; par->blocks.iterate (ix, &block); ix++)
+ for (gimple_stmt_iterator gsi = gsi_start_bb (block);
+ !gsi_end_p (gsi); gsi_next (&gsi))
+ {
+ walk_stmt_info wi;
+ memset (&wi, 0, sizeof (wi));
+ wi.info = (void *) partitioned_var_uses;
+ walk_gimple_stmt (&gsi, NULL, find_partitioned_var_uses_1, &wi);
+ }
+ }
+}
+
+/* Gang-private variables (typically placed in a GPU's shared memory) do not
+ need to be processed by the worker-propagation mechanism. Populate the
+ GANG_PRIVATE_VARS set with any such variables found in the current
+ function. */
+
+static void
+find_gang_private_vars (hash_set<tree> *gang_private_vars)
+{
+ basic_block block;
+
+ FOR_EACH_BB_FN (block, cfun)
+ {
+ for (gimple_stmt_iterator gsi = gsi_start_bb (block);
+ !gsi_end_p (gsi);
+ gsi_next (&gsi))
+ {
+ gimple *stmt = gsi_stmt (gsi);
+
+ if (gimple_call_internal_p (stmt, IFN_UNIQUE))
+ {
+ enum ifn_unique_kind k = ((enum ifn_unique_kind)
+ TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));
+ if (k == IFN_UNIQUE_OACC_PRIVATE)
+ {
+ HOST_WIDE_INT level
+ = TREE_INT_CST_LOW (gimple_call_arg (stmt, 2));
+ if (level != GOMP_DIM_GANG)
+ continue;
+ for (unsigned i = 3; i < gimple_call_num_args (stmt); i++)
+ {
+ tree arg = gimple_call_arg (stmt, i);
+ gcc_assert (TREE_CODE (arg) == ADDR_EXPR);
+ tree decl = TREE_OPERAND (arg, 0);
+ gang_private_vars->add (decl);
+ }
+ }
+ }
+ }
+ }
+}
+
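+/* Find local variables that may be clobbered in worker-single blocks but
+ are also used in worker-partitioned context, and add them to the
+ per-block propagation sets so that they are broadcast to other workers. */
+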
+static void
+find_local_vars_to_propagate (parallel_g *par, unsigned outer_mask,
+ hash_set<tree> *partitioned_var_uses,
+ hash_set<tree> *gang_private_vars,
+ vec<propagation_set *> *prop_set)
+{
+ unsigned mask = outer_mask | par->mask;
+
+ if (par->inner)
+ find_local_vars_to_propagate (par->inner, mask, partitioned_var_uses,
+ gang_private_vars, prop_set);
+ if (par->next)
+ find_local_vars_to_propagate (par->next, outer_mask, partitioned_var_uses,
+ gang_private_vars, prop_set);
+
+ if (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
+ {
+ basic_block block;
+ int ix;
+
+ for (ix = 0; par->blocks.iterate (ix, &block); ix++)
+ {
+ for (gimple_stmt_iterator gsi = gsi_start_bb (block);
+ !gsi_end_p (gsi); gsi_next (&gsi))
+ {
+ gimple *stmt = gsi_stmt (gsi);
+ tree var;
+ unsigned i;
+
+ FOR_EACH_LOCAL_DECL (cfun, i, var)
+ {
+ if (!VAR_P (var)
+ || is_global_var (var)
+ || AGGREGATE_TYPE_P (TREE_TYPE (var))
+ || !partitioned_var_uses->contains (var)
+ || gang_private_vars->contains (var))
+ continue;
+
+ if (stmt_may_clobber_ref_p (stmt, var))
+ {
+ if (dump_file)
+ {
+ fprintf (dump_file, "bb %u: local variable may be "
+ "clobbered in %s mode: ", block->index,
+ mask_name (mask));
+ print_generic_expr (dump_file, var, TDF_SLIM);
+ fprintf (dump_file, "\n");
+ }
+
+ if (!(*prop_set)[block->index])
+ (*prop_set)[block->index] = new propagation_set;
+
+ propagation_set *ws_prop
+ = (*prop_set)[block->index];
+
+ ws_prop->add (var);
+ }
+ }
+ }
+ }
+ }
+}
+
+/* Transform basic blocks FROM, TO (which may be the same block) into:
+   if (GOACC_single_start ())
+     BLOCK;
+   GOACC_barrier ();
+                             \  |  /
+                             +----+
+                             |    |        (new) predicate block
+                             +----+--
+   \  |  /   \  |  /           |t    \
+   +----+    +----+          +----+  |
+   |    |    |    |   ===>   |    |  | f   (old) from block
+   +----+    +----+          +----+  |
+     |       t/  \f            |    /
+                             +----+/
+  (split   (split before       |    |      skip block
+  at end)   condition)       +----+
+                               t/  \f
+*/
+
+static void
+worker_single_simple (basic_block from, basic_block to,
+ hash_set<tree> *def_escapes_block)
+{
+ gimple *call, *cond;
+ tree lhs, decl;
+ basic_block skip_block;
+
+ gimple_stmt_iterator gsi = gsi_last_bb (to);
+ if (EDGE_COUNT (to->succs) > 1)
+ {
+ gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_COND);
+ gsi_prev (&gsi);
+ }
+ edge e = split_block (to, gsi_stmt (gsi));
+ skip_block = e->dest;
+
+ gimple_stmt_iterator start = gsi_after_labels (from);
+
+ decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_START);
+ lhs = create_tmp_var (TREE_TYPE (TREE_TYPE (decl)));
+ call = gimple_build_call (decl, 0);
+ gimple_call_set_lhs (call, lhs);
+ gsi_insert_before (&start, call, GSI_NEW_STMT);
+ update_stmt (call);
+
+ cond = gimple_build_cond (EQ_EXPR, lhs,
+ fold_convert_loc (UNKNOWN_LOCATION,
+ TREE_TYPE (lhs),
+ boolean_true_node),
+ NULL_TREE, NULL_TREE);
+ gsi_insert_after (&start, cond, GSI_NEW_STMT);
+ update_stmt (cond);
+
+ edge et = split_block (from, cond);
+ et->flags &= ~EDGE_FALLTHRU;
+ et->flags |= EDGE_TRUE_VALUE;
+ /* Make the active worker the more probable path so we prefer fallthrough
+ (letting the idle workers jump around more). */
+ et->probability = profile_probability::likely ();
+
+ edge ef = make_edge (from, skip_block, EDGE_FALSE_VALUE);
+ ef->probability = et->probability.invert ();
+
+ basic_block neutered = split_edge (ef);
+ gimple_stmt_iterator neut_gsi = gsi_last_bb (neutered);
+
+ for (gsi = gsi_start_bb (et->dest); !gsi_end_p (gsi); gsi_next (&gsi))
+ {
+ gimple *stmt = gsi_stmt (gsi);
+ ssa_op_iter iter;
+ tree var;
+
+ FOR_EACH_SSA_TREE_OPERAND (var, stmt, iter, SSA_OP_DEF)
+ {
+ if (def_escapes_block->contains (var))
+ {
+ gphi *join_phi = create_phi_node (NULL_TREE, skip_block);
+ create_new_def_for (var, join_phi,
+ gimple_phi_result_ptr (join_phi));
+ add_phi_arg (join_phi, var, e, UNKNOWN_LOCATION);
+
+ tree neutered_def = copy_ssa_name (var, NULL);
+ /* We really want "don't care" or some value representing
+ undefined here, but optimizers will probably get rid of the
+ zero-assignments anyway. */
+ gassign *zero = gimple_build_assign (neutered_def,
+ build_zero_cst (TREE_TYPE (neutered_def)));
+
+ gsi_insert_after (&neut_gsi, zero, GSI_CONTINUE_LINKING);
+ update_stmt (zero);
+
+ add_phi_arg (join_phi, neutered_def, single_succ_edge (neutered),
+ UNKNOWN_LOCATION);
+ update_stmt (join_phi);
+ }
+ }
+ }
+
+ gsi = gsi_start_bb (skip_block);
+
+ decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
+ gimple *acc_bar = gimple_build_call (decl, 0);
+
+ gsi_insert_before (&gsi, acc_bar, GSI_SAME_STMT);
+ update_stmt (acc_bar);
+}
+
+/* A copy of omp-low.c:omp_build_component_ref, renamed for this file. */
+
+static tree
+oacc_build_component_ref (tree obj, tree field)
+{
+ tree field_type = TREE_TYPE (field);
+ tree obj_type = TREE_TYPE (obj);
+ if (!ADDR_SPACE_GENERIC_P (TYPE_ADDR_SPACE (obj_type)))
+ field_type = build_qualified_type
+ (field_type,
+ KEEP_QUAL_ADDR_SPACE (TYPE_QUALS (obj_type)));
+
+ tree ret = build3 (COMPONENT_REF, field_type, obj, field, NULL);
+ if (TREE_THIS_VOLATILE (field))
+ TREE_THIS_VOLATILE (ret) |= 1;
+ if (TREE_READONLY (field))
+ TREE_READONLY (ret) |= 1;
+ return ret;
+}
+
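+/* Build a reference to the propagation-record field for VAR, read through
+ the receiver pointer RECEIVER_DECL. */
+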
+static tree
+build_receiver_ref (tree record_type, tree var, tree receiver_decl)
+{
+ field_map_t *fields = *field_map->get (record_type);
+ tree x = build_simple_mem_ref (receiver_decl);
+ tree field = *fields->get (var);
+ TREE_THIS_NOTRAP (x) = 1;
+ x = oacc_build_component_ref (x, field);
+ return x;
+}
+
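+/* Build a reference to the propagation-record field for VAR in the sender
+ record SENDER_DECL itself. */
+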
+static tree
+build_sender_ref (tree record_type, tree var, tree sender_decl)
+{
+ field_map_t *fields = *field_map->get (record_type);
+ tree field = *fields->get (var);
+ return oacc_build_component_ref (sender_decl, field);
+}
+
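+/* qsort comparator providing a stable order: SSA names first, ordered by
+ version number, then VAR_DECLs ordered by DECL_UID. */
+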
+static int
+sort_by_ssa_version_or_uid (const void *p1, const void *p2)
+{
+ const tree t1 = *(const tree *)p1;
+ const tree t2 = *(const tree *)p2;
+
+ if (TREE_CODE (t1) == SSA_NAME && TREE_CODE (t2) == SSA_NAME)
+ return SSA_NAME_VERSION (t1) - SSA_NAME_VERSION (t2);
+ else if (TREE_CODE (t1) == SSA_NAME && TREE_CODE (t2) != SSA_NAME)
+ return -1;
+ else if (TREE_CODE (t1) != SSA_NAME && TREE_CODE (t2) == SSA_NAME)
+ return 1;
+ else
+ return DECL_UID (t1) - DECL_UID (t2);
+}
+
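+/* qsort comparator ordering by decreasing type size, with ties broken by
+ sort_by_ssa_version_or_uid. */
+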
+static int
+sort_by_size_then_ssa_version_or_uid (const void *p1, const void *p2)
+{
+ const tree t1 = *(const tree *)p1;
+ const tree t2 = *(const tree *)p2;
+ unsigned HOST_WIDE_INT s1 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (t1)));
+ unsigned HOST_WIDE_INT s2 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (t2)));
+ if (s1 != s2)
+ return s2 - s1;
+ else
+ return sort_by_ssa_version_or_uid (p1, p2);
+}
+
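+/* Transform blocks FROM..TO so that one worker performs the work and the
+ values needed by the other workers are broadcast via a shared-memory
+ record: the active worker (for which GOACC_single_copy_start returns
+ NULL) writes the values into the record, and after a barrier the
+ neutered workers copy them out. */
+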
+static void
+worker_single_copy (basic_block from, basic_block to,
+ hash_set<tree> *def_escapes_block,
+ hash_set<tree> *worker_partitioned_uses,
+ tree record_type)
+{
+ /* If we only have virtual defs, we'll have no record type, but we still want
+ to emit single_copy_start and (particularly) single_copy_end to act as
+ a vdef source on the neutered edge representing memory writes on the
+ non-neutered edge. */
+ if (!record_type)
+ record_type = char_type_node;
+
+ tree sender_decl
+ = targetm.goacc.create_worker_broadcast_record (record_type, true,
+ ".oacc_worker_o");
+ tree receiver_decl
+ = targetm.goacc.create_worker_broadcast_record (record_type, false,
+ ".oacc_worker_i");
+
+ gimple_stmt_iterator gsi = gsi_last_bb (to);
+ if (EDGE_COUNT (to->succs) > 1)
+ gsi_prev (&gsi);
+ edge e = split_block (to, gsi_stmt (gsi));
+ basic_block barrier_block = e->dest;
+
+ gimple_stmt_iterator start = gsi_after_labels (from);
+
+ tree decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_COPY_START);
+
+ tree lhs = create_tmp_var (TREE_TYPE (TREE_TYPE (decl)));
+
+ gimple *call = gimple_build_call (decl, 1,
+ build_fold_addr_expr (sender_decl));
+ gimple_call_set_lhs (call, lhs);
+ gsi_insert_before (&start, call, GSI_NEW_STMT);
+ update_stmt (call);
+
+ tree conv_tmp = make_ssa_name (TREE_TYPE (receiver_decl));
+
+ gimple *conv = gimple_build_assign (conv_tmp,
+ fold_convert (TREE_TYPE (receiver_decl),
+ lhs));
+ update_stmt (conv);
+ gsi_insert_after (&start, conv, GSI_NEW_STMT);
+ gimple *asgn = gimple_build_assign (receiver_decl, conv_tmp);
+ gsi_insert_after (&start, asgn, GSI_NEW_STMT);
+ update_stmt (asgn);
+
+ tree zero_ptr = build_int_cst (TREE_TYPE (receiver_decl), 0);
+
+ tree recv_tmp = make_ssa_name (TREE_TYPE (receiver_decl));
+ asgn = gimple_build_assign (recv_tmp, receiver_decl);
+ gsi_insert_after (&start, asgn, GSI_NEW_STMT);
+ update_stmt (asgn);
+
+ gimple *cond = gimple_build_cond (EQ_EXPR, recv_tmp, zero_ptr, NULL_TREE,
+ NULL_TREE);
+ update_stmt (cond);
+
+ gsi_insert_after (&start, cond, GSI_NEW_STMT);
+
+ edge et = split_block (from, cond);
+ et->flags &= ~EDGE_FALLTHRU;
+ et->flags |= EDGE_TRUE_VALUE;
+ /* Make the active worker the more probable path so we prefer fallthrough
+ (letting the idle workers jump around more). */
+ et->probability = profile_probability::likely ();
+
+ basic_block body = et->dest;
+
+ edge ef = make_edge (from, barrier_block, EDGE_FALSE_VALUE);
+ ef->probability = et->probability.invert ();
+
+ decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
+ gimple *acc_bar = gimple_build_call (decl, 0);
+
+ gimple_stmt_iterator bar_gsi = gsi_start_bb (barrier_block);
+ gsi_insert_before (&bar_gsi, acc_bar, GSI_NEW_STMT);
+
+ cond = gimple_build_cond (NE_EXPR, recv_tmp, zero_ptr, NULL_TREE, NULL_TREE);
+ gsi_insert_after (&bar_gsi, cond, GSI_NEW_STMT);
+
+ edge et2 = split_block (barrier_block, cond);
+ et2->flags &= ~EDGE_FALLTHRU;
+ et2->flags |= EDGE_TRUE_VALUE;
+ et2->probability = profile_probability::unlikely ();
+
+ basic_block exit_block = et2->dest;
+
+ basic_block copyout_block = split_edge (et2);
+ edge ef2 = make_edge (barrier_block, exit_block, EDGE_FALSE_VALUE);
+ ef2->probability = et2->probability.invert ();
+
+ gimple_stmt_iterator copyout_gsi = gsi_start_bb (copyout_block);
+
+ edge copyout_to_exit = single_succ_edge (copyout_block);
+
+ gimple_seq sender_seq = NULL;
+
+ /* Make sure we iterate over definitions in a stable order. */
+ auto_vec<tree> escape_vec (def_escapes_block->elements ());
+ for (hash_set<tree>::iterator it = def_escapes_block->begin ();
+ it != def_escapes_block->end (); ++it)
+ escape_vec.quick_push (*it);
+ escape_vec.qsort (sort_by_ssa_version_or_uid);
+
+ for (unsigned i = 0; i < escape_vec.length (); i++)
+ {
+ tree var = escape_vec[i];
+
+ if (TREE_CODE (var) == SSA_NAME && SSA_NAME_IS_VIRTUAL_OPERAND (var))
+ continue;
+
+ tree barrier_def = 0;
+
+ if (TREE_CODE (var) == SSA_NAME)
+ {
+ gimple *def_stmt = SSA_NAME_DEF_STMT (var);
+
+ if (gimple_nop_p (def_stmt))
+ continue;
+
+ /* The barrier phi takes one result from the actual work of the
+ block we're neutering, and the other result is constant zero of
+ the same type. */
+
+ gphi *barrier_phi = create_phi_node (NULL_TREE, barrier_block);
+ barrier_def = create_new_def_for (var, barrier_phi,
+ gimple_phi_result_ptr (barrier_phi));
+
+ add_phi_arg (barrier_phi, var, e, UNKNOWN_LOCATION);
+ add_phi_arg (barrier_phi, build_zero_cst (TREE_TYPE (var)), ef,
+ UNKNOWN_LOCATION);
+
+ update_stmt (barrier_phi);
+ }
+ else
+ gcc_assert (TREE_CODE (var) == VAR_DECL);
+
+ /* If we had no record type, we will have no fields map. */
+ field_map_t **fields_p = field_map->get (record_type);
+ field_map_t *fields = fields_p ? *fields_p : NULL;
+
+ if (worker_partitioned_uses->contains (var)
+ && fields
+ && fields->get (var))
+ {
+ tree neutered_def = make_ssa_name (TREE_TYPE (var));
+
+ /* Receive definition from shared memory block. */
+
+ tree receiver_ref = build_receiver_ref (record_type, var,
+ receiver_decl);
+ gassign *recv = gimple_build_assign (neutered_def,
+ receiver_ref);
+ gsi_insert_after (&copyout_gsi, recv, GSI_CONTINUE_LINKING);
+ update_stmt (recv);
+
+ if (TREE_CODE (var) == VAR_DECL)
+ {
+ /* If it's a VAR_DECL, we only copied to an SSA temporary. Copy
+ to the final location now. */
+ gassign *asgn = gimple_build_assign (var, neutered_def);
+ gsi_insert_after (&copyout_gsi, asgn, GSI_CONTINUE_LINKING);
+ update_stmt (asgn);
+ }
+ else
+ {
+ /* If it's an SSA name, create a new phi at the join node to
+ represent either the output from the active worker (the
+ barrier) or the inactive workers (the copyout block). */
+ gphi *join_phi = create_phi_node (NULL_TREE, exit_block);
+ create_new_def_for (barrier_def, join_phi,
+ gimple_phi_result_ptr (join_phi));
+ add_phi_arg (join_phi, barrier_def, ef2, UNKNOWN_LOCATION);
+ add_phi_arg (join_phi, neutered_def, copyout_to_exit,
+ UNKNOWN_LOCATION);
+ update_stmt (join_phi);
+ }
+
+ /* Send definition to shared memory block. */
+
+ tree sender_ref = build_sender_ref (record_type, var, sender_decl);
+
+ if (TREE_CODE (var) == SSA_NAME)
+ {
+ gassign *send = gimple_build_assign (sender_ref, var);
+ gimple_seq_add_stmt (&sender_seq, send);
+ update_stmt (send);
+ }
+ else if (TREE_CODE (var) == VAR_DECL)
+ {
+ tree tmp = make_ssa_name (TREE_TYPE (var));
+ gassign *send = gimple_build_assign (tmp, var);
+ gimple_seq_add_stmt (&sender_seq, send);
+ update_stmt (send);
+ send = gimple_build_assign (sender_ref, tmp);
+ gimple_seq_add_stmt (&sender_seq, send);
+ update_stmt (send);
+ }
+ else
+ gcc_unreachable ();
+ }
+ }
+
+ /* It's possible for the ET->DEST block (the work done by the active thread)
+ to finish with a control-flow insn, e.g. a UNIQUE function call. Split
+ the block and add SENDER_SEQ in the latter part to avoid having control
+ flow in the middle of a BB. */
+
+ decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_COPY_END);
+ call = gimple_build_call (decl, 1, build_fold_addr_expr (sender_decl));
+ gimple_seq_add_stmt (&sender_seq, call);
+
+ gsi = gsi_last_bb (body);
+ gimple *last = gsi_stmt (gsi);
+ basic_block sender_block = split_block (body, last)->dest;
+ gsi = gsi_last_bb (sender_block);
+ gsi_insert_seq_after (&gsi, sender_seq, GSI_CONTINUE_LINKING);
+}
+
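+/* Neuter the worker-single blocks of PAR and, recursively, of its inner
+ and sibling parallels. Blocks with definitions that must reach other
+ workers use the broadcast scheme (worker_single_copy); the rest are
+ simply predicated (worker_single_simple). OpenACC routine calls in
+ worker-single mode are additionally bracketed by barriers. */
+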
+static void
+neuter_worker_single (parallel_g *par, unsigned outer_mask,
+ bitmap worker_single, bitmap vector_single,
+ vec<propagation_set *> *prop_set,
+ hash_set<tree> *partitioned_var_uses)
+{
+ unsigned mask = outer_mask | par->mask;
+
+ if ((mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0)
+ {
+ basic_block block;
+
+ for (unsigned i = 0; par->blocks.iterate (i, &block); i++)
+ {
+ bool has_defs = false;
+ hash_set<tree> def_escapes_block;
+ hash_set<tree> worker_partitioned_uses;
+ unsigned j;
+ tree var;
+
+ FOR_EACH_SSA_NAME (j, var, cfun)
+ {
+ if (SSA_NAME_IS_VIRTUAL_OPERAND (var))
+ {
+ has_defs = true;
+ continue;
+ }
+
+ gimple *def_stmt = SSA_NAME_DEF_STMT (var);
+
+ if (gimple_nop_p (def_stmt))
+ continue;
+
+ if (gimple_bb (def_stmt)->index != block->index)
+ continue;
+
+ gimple *use_stmt;
+ imm_use_iterator use_iter;
+ bool uses_outside_block = false;
+ bool worker_partitioned_use = false;
+
+ FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, var)
+ {
+ int blocknum = gimple_bb (use_stmt)->index;
+
+ /* Don't propagate SSA names that are only used in the
+ current block, unless the usage is in a phi node: that
+ means the name left the block, then came back in at the
+ top. */
+ if (blocknum != block->index
+ || gimple_code (use_stmt) == GIMPLE_PHI)
+ uses_outside_block = true;
+ if (!bitmap_bit_p (worker_single, blocknum))
+ worker_partitioned_use = true;
+ }
+
+ if (uses_outside_block)
+ def_escapes_block.add (var);
+
+ if (worker_partitioned_use)
+ {
+ worker_partitioned_uses.add (var);
+ has_defs = true;
+ }
+ }
+
+ propagation_set *ws_prop = (*prop_set)[block->index];
+
+ if (ws_prop)
+ {
+ for (propagation_set::iterator it = ws_prop->begin ();
+ it != ws_prop->end ();
+ ++it)
+ {
+ tree var = *it;
+ if (TREE_CODE (var) == VAR_DECL)
+ {
+ def_escapes_block.add (var);
+ if (partitioned_var_uses->contains (var))
+ {
+ worker_partitioned_uses.add (var);
+ has_defs = true;
+ }
+ }
+ }
+
+ delete ws_prop;
+ (*prop_set)[block->index] = 0;
+ }
+
+ tree record_type = (tree) block->aux;
+
+ if (has_defs)
+ worker_single_copy (block, block, &def_escapes_block,
+ &worker_partitioned_uses, record_type);
+ else
+ worker_single_simple (block, block, &def_escapes_block);
+ }
+ }
+
+ if ((outer_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0)
+ {
+ basic_block block;
+
+ for (unsigned i = 0; par->blocks.iterate (i, &block); i++)
+ for (gimple_stmt_iterator gsi = gsi_start_bb (block);
+ !gsi_end_p (gsi);
+ gsi_next (&gsi))
+ {
+ gimple *stmt = gsi_stmt (gsi);
+
+ if (gimple_code (stmt) == GIMPLE_CALL
+ && !gimple_call_internal_p (stmt)
+ && !omp_sese_active_worker_call (as_a <gcall *> (stmt)))
+ {
+ /* If we have an OpenACC routine call in worker-single mode,
+ place barriers before and afterwards to prevent
+ clobbering re-used shared memory regions (as are used
+ for AMDGCN at present, for example). */
+ tree decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
+ gsi_insert_before (&gsi, gimple_build_call (decl, 0),
+ GSI_SAME_STMT);
+ gsi_insert_after (&gsi, gimple_build_call (decl, 0),
+ GSI_NEW_STMT);
+ }
+ }
+ }
+
+ if (par->inner)
+ neuter_worker_single (par->inner, mask, worker_single, vector_single,
+ prop_set, partitioned_var_uses);
+ if (par->next)
+ neuter_worker_single (par->next, outer_mask, worker_single, vector_single,
+ prop_set, partitioned_var_uses);
+}
+
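+/* Main entry point for the pass: split blocks at partitioning boundaries,
+ discover the parallel structure, compute the sets of names and variables
+ to propagate, build the per-block propagation record types, and neuter
+ the worker-single blocks. */
+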
+static int
+execute_omp_oacc_neuter_broadcast ()
+{
+ bb_stmt_map_t bb_stmt_map;
+ auto_bitmap worker_single, vector_single;
+
+ omp_sese_split_blocks (&bb_stmt_map);
+
+ if (dump_file)
+ {
+ fprintf (dump_file, "\n\nAfter splitting:\n\n");
+ dump_function_to_file (current_function_decl, dump_file, dump_flags);
+ }
+
+ unsigned mask = 0;
+
+ /* If this is a routine, calculate MASK as if the outer levels are already
+ partitioned. */
+ tree attr = oacc_get_fn_attrib (current_function_decl);
+ if (attr)
+ {
+ tree dims = TREE_VALUE (attr);
+ unsigned ix;
+ for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
+ {
+ tree allowed = TREE_PURPOSE (dims);
+ if (allowed && integer_zerop (allowed))
+ mask |= GOMP_DIM_MASK (ix);
+ }
+ }
+
+ parallel_g *par = omp_sese_discover_pars (&bb_stmt_map);
+ populate_single_mode_bitmaps (par, worker_single, vector_single, mask, 0);
+
+ basic_block bb;
+ FOR_ALL_BB_FN (bb, cfun)
+ bb->aux = NULL;
+
+ field_map = record_field_map_t::create_ggc (40);
+
+ vec<propagation_set *> prop_set;
+ prop_set.create (last_basic_block_for_fn (cfun));
+
+ for (int i = 0; i < last_basic_block_for_fn (cfun); i++)
+ prop_set.quick_push (0);
+
+ find_ssa_names_to_propagate (par, mask, worker_single, vector_single,
+ &prop_set);
+
+ hash_set<tree> partitioned_var_uses;
+ hash_set<tree> gang_private_vars;
+
+ find_gang_private_vars (&gang_private_vars);
+ find_partitioned_var_uses (par, mask, &partitioned_var_uses);
+ find_local_vars_to_propagate (par, mask, &partitioned_var_uses,
+ &gang_private_vars, &prop_set);
+
+ FOR_ALL_BB_FN (bb, cfun)
+ {
+ propagation_set *ws_prop = prop_set[bb->index];
+ if (ws_prop)
+ {
+ tree record_type = lang_hooks.types.make_type (RECORD_TYPE);
+ tree name = create_tmp_var_name (".oacc_ws_data_s");
+ name = build_decl (UNKNOWN_LOCATION, TYPE_DECL, name, record_type);
+ DECL_ARTIFICIAL (name) = 1;
+ DECL_NAMELESS (name) = 1;
+ TYPE_NAME (record_type) = name;
+ TYPE_ARTIFICIAL (record_type) = 1;
+
+ auto_vec<tree> field_vec (ws_prop->elements ());
+ for (hash_set<tree>::iterator it = ws_prop->begin ();
+ it != ws_prop->end (); ++it)
+ field_vec.quick_push (*it);
+
+ field_vec.qsort (sort_by_size_then_ssa_version_or_uid);
+
+ field_map->put (record_type, field_map_t::create_ggc (17));
+
+ /* Insert var fields in reverse order, so the last inserted element
+ is the first in the structure. */
+ for (int i = field_vec.length () - 1; i >= 0; i--)
+ install_var_field (field_vec[i], record_type);
+
+ layout_type (record_type);
+
+ bb->aux = (tree) record_type;
+ }
+ }
+
+ neuter_worker_single (par, mask, worker_single, vector_single, &prop_set,
+ &partitioned_var_uses);
+
+ prop_set.release ();
+
+ /* This doesn't seem to make a difference. */
+ loops_state_clear (LOOP_CLOSED_SSA);
+
+ /* Neutering the worker-single blocks will have invalidated dominance
+ info. It may be possible to incrementally update just the affected
+ blocks, but obliterate everything for now. */
+ free_dominance_info (CDI_DOMINATORS);
+ free_dominance_info (CDI_POST_DOMINATORS);
+
+ if (dump_file)
+ {
+ fprintf (dump_file, "\n\nAfter neutering:\n\n");
+ dump_function_to_file (current_function_decl, dump_file, dump_flags);
+ }
+
+ return 0;
+}
+
+namespace {
+
+const pass_data pass_data_omp_oacc_neuter_broadcast =
+{
+ GIMPLE_PASS, /* type */
+ "omp_oacc_neuter_broadcast", /* name */
+ OPTGROUP_OMP, /* optinfo_flags */
+ TV_NONE, /* tv_id */
+ PROP_cfg, /* properties_required */
+ 0, /* properties_provided */
+ 0, /* properties_destroyed */
+ 0, /* todo_flags_start */
+ TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
+};
+
+class pass_omp_oacc_neuter_broadcast : public gimple_opt_pass
+{
+public:
+ pass_omp_oacc_neuter_broadcast (gcc::context *ctxt)
+ : gimple_opt_pass (pass_data_omp_oacc_neuter_broadcast, ctxt)
+ {}
+
+ /* opt_pass methods: */
+ virtual bool gate (function *)
+ {
+ return (flag_openacc
+ && targetm.goacc.create_worker_broadcast_record);
+ };
+
+ virtual unsigned int execute (function *)
+ {
+ return execute_omp_oacc_neuter_broadcast ();
+ }
+
+}; // class pass_omp_oacc_neuter_broadcast
+
+} // anon namespace
+
+gimple_opt_pass *
+make_pass_omp_oacc_neuter_broadcast (gcc::context *ctxt)
+{
+ return new pass_omp_oacc_neuter_broadcast (ctxt);
+}
diff --git a/gcc/passes.def b/gcc/passes.def
index 26d86df2f5a..d7a1f8c97a6 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -184,6 +184,7 @@ along with GCC; see the file COPYING3. If not see
NEXT_PASS (pass_fixup_cfg);
NEXT_PASS (pass_lower_eh_dispatch);
NEXT_PASS (pass_oacc_loop_designation);
+ NEXT_PASS (pass_omp_oacc_neuter_broadcast);
NEXT_PASS (pass_oacc_device_lower);
NEXT_PASS (pass_omp_device_lower);
NEXT_PASS (pass_omp_target_link);
diff --git a/gcc/target.def b/gcc/target.def
index 68a46aaa832..7676d5e626e 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1756,6 +1756,17 @@ private variables at OpenACC device-lowering time using the\n\
rtx, (tree var),
NULL)
+DEFHOOK
+(create_worker_broadcast_record,
+"Create a record used to propagate local-variable state from an active\n\
+worker to other workers. A possible implementation might adjust the type\n\
+of REC to place the new variable in shared GPU memory.\n\
+\n\
+Presence of this target hook indicates that middle-end\n\
+neutering/broadcasting should be used.",
+tree, (tree rec, bool sender, const char *name),
+NULL)
+
HOOK_VECTOR_END (goacc)
/* Functions relating to vectorization. */
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index 5484ad5eac7..83941bc0cee 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -425,6 +425,7 @@ extern gimple_opt_pass *make_pass_expand_omp (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_expand_omp_ssa (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_omp_target_link (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_oacc_loop_designation (gcc::context *ctxt);
+extern gimple_opt_pass *make_pass_omp_oacc_neuter_broadcast (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_oacc_device_lower (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_omp_device_lower (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_object_sizes (gcc::context *ctxt);