openacc: Middle-end worker-partitioning support
This patch implements worker-partitioning support in the middle end, by rewriting gimple. The OpenACC execution model requires that code can run in either "worker single" mode where only a single worker per gang is active, or "worker partitioned" mode, where multiple workers per gang are active. This means we need to do something equivalent to spawning additional workers when transitioning from worker-single to worker-partitioned mode. However, GPUs typically fix the number of threads of invoked kernels at launch time, so we need to do something with the "extra" threads when they are not wanted.

The scheme used is to conditionalise each basic block that executes in "worker single" mode for worker 0 only. Conditional branches are handled specially so "idle" (non-0) workers follow along with worker 0. On transitioning to "worker partitioned" mode, any variables modified by worker 0 are propagated to the other workers via GPU shared memory. Special care is taken for routine calls, writes through pointers, and so forth, as follows:

- There are two types of function calls to consider in worker-single mode: "normal" calls to maths library routines, etc. are called from worker 0 only. OpenACC routines may contain worker-partitioned loops themselves, so are called from all workers, including "idle" ones.

- SSA names set in worker-single mode, but used in worker-partitioned mode, are copied to shared memory in worker 0. Other workers retrieve the value from the appropriate shared-memory location after a barrier, and new phi nodes are introduced at the convergence point to resolve the worker 0/other worker copies of the value.

- Local scalar variables (on the stack) also need special handling. We broadcast any variables that are written in the current worker-single block, and that are read in any worker-partitioned block. (This is believed to be safe, and is flow-insensitive to ease analysis.)

- Local aggregates (arrays and composites) on the stack are *not* broadcast. Instead we force gimple stmts modifying elements/fields of local aggregates into fully-partitioned mode. The RHS of the assignment is a scalar, and is thus subject to broadcasting as above.

- Writes through pointers may affect any local variable that has its address taken. We use points-to analysis to determine the set of potentially-affected variables for a given pointer indirection. We broadcast any such variable which is used in worker-partitioned mode, on a per-block basis for any block containing a write through a pointer.

Some slides about the implementation (from 2018) are available at:

https://jtb20.github.io/gcnworkers.pdf

gcc/
* Makefile.in (OBJS): Add omp-oacc-neuter-broadcast.o.
* doc/tm.texi.in (TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD): Add documentation hook.
* doc/tm.texi: Regenerate.
* omp-oacc-neuter-broadcast.cc: New file.
* omp-builtins.def (BUILT_IN_GOACC_BARRIER) (BUILT_IN_GOACC_SINGLE_START, BUILT_IN_GOACC_SINGLE_COPY_START) (BUILT_IN_GOACC_SINGLE_COPY_END): New builtins.
* passes.def (pass_omp_oacc_neuter_broadcast): Add pass.
* target.def (goacc.create_worker_broadcast_record): Add target hook.
* tree-pass.h (make_pass_omp_oacc_neuter_broadcast): Add prototype.
* config/gcn/gcn-protos.h (gcn_goacc_adjust_propagation_record): Rename prototype to...
(gcn_goacc_create_worker_broadcast_record): ... this.
* config/gcn/gcn-tree.c (gcn_goacc_adjust_propagation_record): Rename function to...
(gcn_goacc_create_worker_broadcast_record): ... this.
* config/gcn/gcn.c (TARGET_GOACC_ADJUST_PROPAGATION_RECORD): Rename to...
(TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD): ... this.

Co-Authored-By: Nathan Sidwell <nathan@codesourcery.com> (via 'gcc/config/nvptx/nvptx.c' master)
Co-Authored-By: Kwok Cheung Yeung <kcy@codesourcery.com>
Co-Authored-By: Thomas Schwinge <thomas@codesourcery.com>
This commit is contained in:
parent
e2e0b85c1e
commit
e2a58ed6dc
@ -1513,6 +1513,7 @@ OBJS = \
|
||||
omp-general.o \
|
||||
omp-low.o \
|
||||
omp-oacc-kernels-decompose.o \
|
||||
omp-oacc-neuter-broadcast.o \
|
||||
omp-simd-clone.o \
|
||||
opt-problem.o \
|
||||
optabs.o \
|
||||
|
@ -38,9 +38,10 @@ extern rtx gcn_full_exec ();
|
||||
extern rtx gcn_full_exec_reg ();
|
||||
extern rtx gcn_gen_undef (machine_mode);
|
||||
extern bool gcn_global_address_p (rtx);
|
||||
extern tree gcn_goacc_adjust_propagation_record (tree record_type, bool sender,
|
||||
const char *name);
|
||||
extern tree gcn_goacc_adjust_private_decl (location_t, tree var, int level);
|
||||
extern tree gcn_goacc_create_worker_broadcast_record (tree record_type,
|
||||
bool sender,
|
||||
const char *name);
|
||||
extern void gcn_goacc_reduction (gcall *call);
|
||||
extern bool gcn_hard_regno_rename_ok (unsigned int from_reg,
|
||||
unsigned int to_reg);
|
||||
|
@ -548,35 +548,6 @@ gcn_goacc_reduction (gcall *call)
|
||||
}
|
||||
}
|
||||
|
||||
/* Implement TARGET_GOACC_ADJUST_PROPAGATION_RECORD.

   Tweak (worker) propagation record, e.g. to put it in shared memory.

   RECORD_TYPE is the record type being propagated; its address space is
   set to ADDR_SPACE_LDS here.
   SENDER selects which variable is built.  When true, the result has type
   RECORD_TYPE, gets DECL_CONTEXT cleared to NULL_TREE, is marked
   TREE_STATIC and is passed to varpool_node::finalize_decl.  When false,
   the result has type pointer-to-RECORD_TYPE and none of that is done.
   NAME is forwarded to create_tmp_var_raw as the name of the variable.

   Returns the decl created by create_tmp_var_raw. */
|
||||
|
||||
tree
|
||||
gcn_goacc_adjust_propagation_record (tree record_type, bool sender,
|
||||
const char *name)
|
||||
{
|
||||
tree type = record_type;
|
||||
|
||||
/* NOTE(review): TYPE is initialised from RECORD_TYPE, so (assuming 'tree'
   is a pointer to the type node) this also changes the caller's type --
   confirm that is intended. */
TYPE_ADDR_SPACE (type) = ADDR_SPACE_LDS;
|
||||
|
||||
/* The non-sender side only gets a pointer to the record. */
if (!sender)
|
||||
type = build_pointer_type (type);
|
||||
|
||||
tree decl = create_tmp_var_raw (type, name);
|
||||
|
||||
/* The sender side owns the record itself: no context, static storage. */
if (sender)
|
||||
{
|
||||
DECL_CONTEXT (decl) = NULL_TREE;
|
||||
TREE_STATIC (decl) = 1;
|
||||
}
|
||||
|
||||
/* Same condition as above; only the sender's variable is finalized. */
if (sender)
|
||||
varpool_node::finalize_decl (decl);
|
||||
|
||||
return decl;
|
||||
}
|
||||
|
||||
tree
|
||||
gcn_goacc_adjust_private_decl (location_t, tree var, int level)
|
||||
{
|
||||
@ -604,4 +575,33 @@ gcn_goacc_adjust_private_decl (location_t, tree var, int level)
|
||||
return var;
|
||||
}
|
||||
|
||||
/* Implement TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD.

   Create OpenACC worker state propagation record in shared memory.

   RECORD_TYPE is the record type being broadcast; its address space is
   set to ADDR_SPACE_LDS here.
   SENDER selects which variable is built.  When true, the result has type
   RECORD_TYPE, gets DECL_CONTEXT cleared to NULL_TREE, is marked
   TREE_STATIC and is passed to varpool_node::finalize_decl.  When false,
   the result has type pointer-to-RECORD_TYPE and none of that is done.
   NAME is forwarded to create_tmp_var_raw as the name of the variable.

   Returns the decl created by create_tmp_var_raw. */
|
||||
|
||||
tree
|
||||
gcn_goacc_create_worker_broadcast_record (tree record_type, bool sender,
|
||||
const char *name)
|
||||
{
|
||||
tree type = record_type;
|
||||
|
||||
/* NOTE(review): TYPE is initialised from RECORD_TYPE, so (assuming 'tree'
   is a pointer to the type node) this also changes the caller's type --
   confirm that is intended. */
TYPE_ADDR_SPACE (type) = ADDR_SPACE_LDS;
|
||||
|
||||
/* The non-sender side only gets a pointer to the record. */
if (!sender)
|
||||
type = build_pointer_type (type);
|
||||
|
||||
tree decl = create_tmp_var_raw (type, name);
|
||||
|
||||
/* The sender side owns the record itself: no context, static storage. */
if (sender)
|
||||
{
|
||||
DECL_CONTEXT (decl) = NULL_TREE;
|
||||
TREE_STATIC (decl) = 1;
|
||||
}
|
||||
|
||||
/* Same condition as above; only the sender's variable is finalized. */
if (sender)
|
||||
varpool_node::finalize_decl (decl);
|
||||
|
||||
return decl;
|
||||
}
|
||||
|
||||
/* }}} */
|
||||
|
@ -6513,11 +6513,11 @@ gcn_dwarf_register_span (rtx rtl)
|
||||
#define TARGET_GIMPLIFY_VA_ARG_EXPR gcn_gimplify_va_arg_expr
|
||||
#undef TARGET_OMP_DEVICE_KIND_ARCH_ISA
|
||||
#define TARGET_OMP_DEVICE_KIND_ARCH_ISA gcn_omp_device_kind_arch_isa
|
||||
#undef TARGET_GOACC_ADJUST_PROPAGATION_RECORD
|
||||
#define TARGET_GOACC_ADJUST_PROPAGATION_RECORD \
|
||||
gcn_goacc_adjust_propagation_record
|
||||
#undef TARGET_GOACC_ADJUST_PRIVATE_DECL
|
||||
#define TARGET_GOACC_ADJUST_PRIVATE_DECL gcn_goacc_adjust_private_decl
|
||||
#undef TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD
|
||||
#define TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD \
|
||||
gcn_goacc_create_worker_broadcast_record
|
||||
#undef TARGET_GOACC_FORK_JOIN
|
||||
#define TARGET_GOACC_FORK_JOIN gcn_fork_join
|
||||
#undef TARGET_GOACC_REDUCTION
|
||||
|
@ -6409,6 +6409,15 @@ private variables at OpenACC device-lowering time using the
|
||||
@code{TARGET_GOACC_ADJUST_PRIVATE_DECL} target hook.
|
||||
@end deftypefn
|
||||
|
||||
@deftypefn {Target Hook} tree TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD (tree @var{rec}, bool @var{sender}, const char *@var{name})
|
||||
Create a record used to propagate local-variable state from an active
|
||||
worker to other workers. A possible implementation might adjust the type
|
||||
of REC to place the new variable in shared GPU memory.
|
||||
|
||||
Presence of this target hook indicates that middle end neutering/broadcasting
|
||||
should be used.
|
||||
@end deftypefn
|
||||
|
||||
@node Anchored Addresses
|
||||
@section Anchored Addresses
|
||||
@cindex anchored addresses
|
||||
|
@ -4223,6 +4223,8 @@ address; but often a machine-dependent strategy can generate better code.
|
||||
|
||||
@hook TARGET_GOACC_EXPAND_VAR_DECL
|
||||
|
||||
@hook TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD
|
||||
|
||||
@node Anchored Addresses
|
||||
@section Anchored Addresses
|
||||
@cindex anchored addresses
|
||||
|
@ -59,6 +59,15 @@ DEF_GOACC_BUILTIN_ONLY (BUILT_IN_GOACC_PARLEVEL_ID, "goacc_parlevel_id",
|
||||
DEF_GOACC_BUILTIN_ONLY (BUILT_IN_GOACC_PARLEVEL_SIZE, "goacc_parlevel_size",
|
||||
BT_FN_INT_INT, ATTR_NOTHROW_LEAF_LIST)
|
||||
|
||||
/* OpenACC builtins added together with the middle-end worker
   neutering/broadcasting support: a barrier, a "single start" query and a
   "single copy" start/end pair.  Each entry gives the builtin code, its
   name string, a function-type code and an attribute list.
   NOTE(review): the precise semantics of each builtin are not visible
   here -- presumably defined by their users in
   omp-oacc-neuter-broadcast.cc; confirm there. */
DEF_GOACC_BUILTIN_ONLY (BUILT_IN_GOACC_BARRIER, "GOACC_barrier",
|
||||
BT_FN_VOID, ATTR_NOTHROW_LEAF_LIST)
|
||||
DEF_GOACC_BUILTIN_ONLY (BUILT_IN_GOACC_SINGLE_START, "GOACC_single_start",
|
||||
BT_FN_BOOL, ATTR_NOTHROW_LEAF_LIST)
|
||||
DEF_GOACC_BUILTIN_ONLY (BUILT_IN_GOACC_SINGLE_COPY_START, "GOACC_single_copy_start",
|
||||
BT_FN_PTR, ATTR_NOTHROW_LEAF_LIST)
|
||||
DEF_GOACC_BUILTIN_ONLY (BUILT_IN_GOACC_SINGLE_COPY_END, "GOACC_single_copy_end",
|
||||
BT_FN_VOID_PTR, ATTR_NOTHROW_LEAF_LIST)
|
||||
|
||||
DEF_GOMP_BUILTIN (BUILT_IN_OMP_GET_THREAD_NUM, "omp_get_thread_num",
|
||||
BT_FN_INT, ATTR_CONST_NOTHROW_LEAF_LIST)
|
||||
DEF_GOMP_BUILTIN (BUILT_IN_OMP_GET_NUM_THREADS, "omp_get_num_threads",
|
||||
|
1515
gcc/omp-oacc-neuter-broadcast.cc
Normal file
1515
gcc/omp-oacc-neuter-broadcast.cc
Normal file
File diff suppressed because it is too large
Load Diff
@ -184,6 +184,7 @@ along with GCC; see the file COPYING3. If not see
|
||||
NEXT_PASS (pass_fixup_cfg);
|
||||
NEXT_PASS (pass_lower_eh_dispatch);
|
||||
NEXT_PASS (pass_oacc_loop_designation);
|
||||
NEXT_PASS (pass_omp_oacc_neuter_broadcast);
|
||||
NEXT_PASS (pass_oacc_device_lower);
|
||||
NEXT_PASS (pass_omp_device_lower);
|
||||
NEXT_PASS (pass_omp_target_link);
|
||||
|
@ -1756,6 +1756,17 @@ private variables at OpenACC device-lowering time using the\n\
|
||||
rtx, (tree var),
|
||||
NULL)
|
||||
|
||||
/* OpenACC hook that creates the worker broadcast record.  It takes the
   record type REC, a SENDER flag and a NAME, and returns a tree.  The doc
   string below is the same text that appears for this hook in
   doc/tm.texi.  NOTE(review): the trailing NULL is presumably the default
   value, i.e. targets that do not define the hook leave it absent --
   confirm against the DEFHOOK definition. */
DEFHOOK
|
||||
(create_worker_broadcast_record,
|
||||
"Create a record used to propagate local-variable state from an active\n\
|
||||
worker to other workers. A possible implementation might adjust the type\n\
|
||||
of REC to place the new variable in shared GPU memory.\n\
|
||||
\n\
|
||||
Presence of this target hook indicates that middle end neutering/broadcasting\n\
|
||||
should be used.",
|
||||
tree, (tree rec, bool sender, const char *name),
|
||||
NULL)
|
||||
|
||||
HOOK_VECTOR_END (goacc)
|
||||
|
||||
/* Functions relating to vectorization. */
|
||||
|
@ -425,6 +425,7 @@ extern gimple_opt_pass *make_pass_expand_omp (gcc::context *ctxt);
|
||||
extern gimple_opt_pass *make_pass_expand_omp_ssa (gcc::context *ctxt);
|
||||
extern gimple_opt_pass *make_pass_omp_target_link (gcc::context *ctxt);
|
||||
extern gimple_opt_pass *make_pass_oacc_loop_designation (gcc::context *ctxt);
|
||||
extern gimple_opt_pass *make_pass_omp_oacc_neuter_broadcast (gcc::context *ctxt);
|
||||
extern gimple_opt_pass *make_pass_oacc_device_lower (gcc::context *ctxt);
|
||||
extern gimple_opt_pass *make_pass_omp_device_lower (gcc::context *ctxt);
|
||||
extern gimple_opt_pass *make_pass_object_sizes (gcc::context *ctxt);
|
||||
|
Loading…
Reference in New Issue
Block a user