diff --git a/gcc/config/gcn/gcn-protos.h b/gcc/config/gcn/gcn-protos.h index dc9331c445d..7ef7ae8af46 100644 --- a/gcc/config/gcn/gcn-protos.h +++ b/gcc/config/gcn/gcn-protos.h @@ -40,7 +40,7 @@ extern rtx gcn_gen_undef (machine_mode); extern bool gcn_global_address_p (rtx); extern tree gcn_goacc_adjust_propagation_record (tree record_type, bool sender, const char *name); -extern void gcn_goacc_adjust_gangprivate_decl (tree var); +extern tree gcn_goacc_adjust_private_decl (tree var, int level); extern void gcn_goacc_reduction (gcall *call); extern bool gcn_hard_regno_rename_ok (unsigned int from_reg, unsigned int to_reg); diff --git a/gcc/config/gcn/gcn-tree.c b/gcc/config/gcn/gcn-tree.c index 8f270991c86..75ea50c59dd 100644 --- a/gcc/config/gcn/gcn-tree.c +++ b/gcc/config/gcn/gcn-tree.c @@ -577,9 +577,12 @@ gcn_goacc_adjust_propagation_record (tree record_type, bool sender, return decl; } -void -gcn_goacc_adjust_gangprivate_decl (tree var) +tree +gcn_goacc_adjust_private_decl (tree var, int level) { + if (level != GOMP_DIM_GANG) + return var; + tree type = TREE_TYPE (var); tree lds_type = build_qualified_type (type, TYPE_QUALS_NO_ADDR_SPACE (type) @@ -597,6 +600,8 @@ gcn_goacc_adjust_gangprivate_decl (tree var) if (machfun) machfun->use_flat_addressing = true; + + return var; } /* }}} */ diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c index 9660ca6eaa4..283a91fe50a 100644 --- a/gcc/config/gcn/gcn.c +++ b/gcc/config/gcn/gcn.c @@ -6320,8 +6320,8 @@ gcn_dwarf_register_span (rtx rtl) #undef TARGET_GOACC_ADJUST_PROPAGATION_RECORD #define TARGET_GOACC_ADJUST_PROPAGATION_RECORD \ gcn_goacc_adjust_propagation_record -#undef TARGET_GOACC_ADJUST_GANGPRIVATE_DECL -#define TARGET_GOACC_ADJUST_GANGPRIVATE_DECL gcn_goacc_adjust_gangprivate_decl +#undef TARGET_GOACC_ADJUST_PRIVATE_DECL +#define TARGET_GOACC_ADJUST_PRIVATE_DECL gcn_goacc_adjust_private_decl #undef TARGET_GOACC_FORK_JOIN #define TARGET_GOACC_FORK_JOIN gcn_fork_join #undef TARGET_GOACC_REDUCTION diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index 722b0faa330..80116e570d6 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -75,6 +75,7 @@ #include "fold-const.h" #include "intl.h" #include "opts.h" +#include "tree-pretty-print.h" /* This file should be included last. */ #include "target-def.h" @@ -167,6 +168,12 @@ static unsigned vector_red_align; static unsigned vector_red_partition; static GTY(()) rtx vector_red_sym; +/* Shared memory block for gang-private variables. */ +static unsigned gang_private_shared_size; +static unsigned gang_private_shared_align; +static GTY(()) rtx gang_private_shared_sym; +static hash_map gang_private_shared_hmap; + /* Global lock variable, needed for 128bit worker & gang reductions. */ static GTY(()) tree global_lock_var; @@ -251,6 +258,10 @@ nvptx_option_override (void) vector_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT; vector_red_partition = 0; + gang_private_shared_sym = gen_rtx_SYMBOL_REF (Pmode, "__gang_private_shared"); + SET_SYMBOL_DATA_AREA (gang_private_shared_sym, DATA_AREA_SHARED); + gang_private_shared_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT; + diagnose_openacc_conflict (TARGET_GOMP, "-mgomp"); diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack"); diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt"); @@ -5435,6 +5446,10 @@ nvptx_file_end (void) write_shared_buffer (asm_out_file, vector_red_sym, vector_red_align, vector_red_size); + if (gang_private_shared_size) + write_shared_buffer (asm_out_file, gang_private_shared_sym, + gang_private_shared_align, gang_private_shared_size); + if (need_softstack_decl) { write_var_marker (asm_out_file, false, true, "__nvptx_stacks"); @@ -6662,6 +6677,64 @@ nvptx_truly_noop_truncation (poly_uint64, poly_uint64) return false; } +/* Implement TARGET_GOACC_ADJUST_PRIVATE_DECL. */ + +static tree +nvptx_goacc_adjust_private_decl (tree decl, int level) +{ + if (level != GOMP_DIM_GANG) + return decl; + + /* Set "oacc gang-private" attribute for gang-private variable + declarations. */ + if (!lookup_attribute ("oacc gang-private", DECL_ATTRIBUTES (decl))) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "Setting 'oacc gang-private' attribute for decl:"); + print_generic_decl (dump_file, decl, TDF_SLIM); + fputc ('\n', dump_file); + } + tree id = get_identifier ("oacc gang-private"); + DECL_ATTRIBUTES (decl) = tree_cons (id, NULL, DECL_ATTRIBUTES (decl)); + } + + return decl; +} + +/* Implement TARGET_GOACC_EXPAND_VAR_DECL. */ + +static rtx +nvptx_goacc_expand_var_decl (tree var) +{ + /* Place "oacc gang-private" variables in shared memory. */ + if (VAR_P (var) + && lookup_attribute ("oacc gang-private", DECL_ATTRIBUTES (var))) + { + unsigned int offset, *poffset; + poffset = gang_private_shared_hmap.get (var); + if (poffset) + offset = *poffset; + else + { + unsigned HOST_WIDE_INT align = DECL_ALIGN (var); + gang_private_shared_size + = (gang_private_shared_size + align - 1) & ~(align - 1); + if (gang_private_shared_align < align) + gang_private_shared_align = align; + + offset = gang_private_shared_size; + bool existed = gang_private_shared_hmap.put (var, offset); + gcc_checking_assert (!existed); + gang_private_shared_size += tree_to_uhwi (DECL_SIZE_UNIT (var)); + } + rtx addr = plus_constant (Pmode, gang_private_shared_sym, offset); + return gen_rtx_MEM (TYPE_MODE (TREE_TYPE (var)), addr); + } + + return NULL_RTX; +} + static GTY(()) tree nvptx_previous_fndecl; static void @@ -6670,6 +6743,7 @@ nvptx_set_current_function (tree fndecl) if (!fndecl || fndecl == nvptx_previous_fndecl) return; + gang_private_shared_hmap.empty (); nvptx_previous_fndecl = fndecl; vector_red_partition = 0; oacc_bcast_partition = 0; @@ -6834,6 +6908,12 @@ nvptx_libc_has_function (enum function_class fn_class, tree type) #undef TARGET_HAVE_SPECULATION_SAFE_VALUE #define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed +#undef TARGET_GOACC_ADJUST_PRIVATE_DECL +#define TARGET_GOACC_ADJUST_PRIVATE_DECL nvptx_goacc_adjust_private_decl + +#undef TARGET_GOACC_EXPAND_VAR_DECL +#define TARGET_GOACC_EXPAND_VAR_DECL nvptx_goacc_expand_var_decl + #undef TARGET_SET_CURRENT_FUNCTION #define TARGET_SET_CURRENT_FUNCTION nvptx_set_current_function diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index 85ea9395560..78c330c292d 100644 --- a/gcc/doc/tm.texi +++ b/gcc/doc/tm.texi @@ -6236,6 +6236,31 @@ like @code{cond_add@var{m}}. The default implementation returns a zero constant of type @var{type}. @end deftypefn +@deftypefn {Target Hook} tree TARGET_GOACC_ADJUST_PRIVATE_DECL (tree @var{var}, int @var{level}) +This hook, if defined, is used by accelerator target back-ends to adjust +OpenACC variable declarations that should be made private to the given +parallelism level (i.e. @code{GOMP_DIM_GANG}, @code{GOMP_DIM_WORKER} or +@code{GOMP_DIM_VECTOR}). A typical use for this hook is to force variable +declarations at the @code{gang} level to reside in GPU shared memory. + +You may also use the @code{TARGET_GOACC_EXPAND_VAR_DECL} hook if the +adjusted variable declaration needs to be expanded to RTL in a non-standard +way. +@end deftypefn + +@deftypefn {Target Hook} rtx TARGET_GOACC_EXPAND_VAR_DECL (tree @var{var}) +This hook, if defined, is used by accelerator target back-ends to expand +specially handled kinds of @code{VAR_DECL} expressions. A particular use is +to place variables with specific attributes inside special accelarator +memories. A return value of @code{NULL} indicates that the target does not +handle this @code{VAR_DECL}, and normal RTL expanding is resumed. + +Only define this hook if your accelerator target needs to expand certain +@code{VAR_DECL} nodes in a way that differs from the default. You can also adjust +private variables at OpenACC device-lowering time using the +@code{TARGET_GOACC_ADJUST_PRIVATE_DECL} target hook. +@end deftypefn + @node Anchored Addresses @section Anchored Addresses @cindex anchored addresses diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index d8e3de14af1..d9fbbe20e6f 100644 --- a/gcc/doc/tm.texi.in +++ b/gcc/doc/tm.texi.in @@ -4221,6 +4221,10 @@ address; but often a machine-dependent strategy can generate better code. @hook TARGET_PREFERRED_ELSE_VALUE +@hook TARGET_GOACC_ADJUST_PRIVATE_DECL + +@hook TARGET_GOACC_EXPAND_VAR_DECL + @node Anchored Addresses @section Anchored Addresses @cindex anchored addresses diff --git a/gcc/expr.c b/gcc/expr.c index ba61eb98b3b..e4660f0e90a 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -10419,8 +10419,19 @@ expand_expr_real_1 (tree exp, rtx target, machine_mode tmode, exp = SSA_NAME_VAR (ssa_name); goto expand_decl_rtl; - case PARM_DECL: case VAR_DECL: + /* Allow accel compiler to handle variables that require special + treatment, e.g. if they have been modified in some way earlier in + compilation by the adjust_private_decl OpenACC hook. */ + if (flag_openacc && targetm.goacc.expand_var_decl) + { + temp = targetm.goacc.expand_var_decl (exp); + if (temp) + return temp; + } + /* ... fall through ... */ + + case PARM_DECL: /* If a static var's type was incomplete when the decl was written, but the type is complete now, lay out the decl now. */ if (DECL_SIZE (exp) == 0 diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c index d209a52f823..d92080c8077 100644 --- a/gcc/internal-fn.c +++ b/gcc/internal-fn.c @@ -2969,6 +2969,8 @@ expand_UNIQUE (internal_fn, gcall *stmt) else gcc_unreachable (); break; + case IFN_UNIQUE_OACC_PRIVATE: + break; } if (pattern) diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h index c6599ce4894..5bc5660c1ff 100644 --- a/gcc/internal-fn.h +++ b/gcc/internal-fn.h @@ -32,11 +32,15 @@ along with GCC; see the file COPYING3. If not see or leaving partitioned execution. DEP_VAR = UNIQUE ({HEAD,TAIL}_MARK, REMAINING_MARKS, ...PRIMARY_FLAGS) - The PRIMARY_FLAGS only occur on the first HEAD_MARK of a sequence. */ + The PRIMARY_FLAGS only occur on the first HEAD_MARK of a sequence. + + PRIVATE captures variables to be made private at the surrounding parallelism + level. */ #define IFN_UNIQUE_CODES \ DEF(UNSPEC), \ DEF(OACC_FORK), DEF(OACC_JOIN), \ - DEF(OACC_HEAD_MARK), DEF(OACC_TAIL_MARK) + DEF(OACC_HEAD_MARK), DEF(OACC_TAIL_MARK), \ + DEF(OACC_PRIVATE) enum ifn_unique_kind { #define DEF(X) IFN_UNIQUE_##X diff --git a/gcc/omp-low.c b/gcc/omp-low.c index d1136d181b3..da827ef2e34 100644 --- a/gcc/omp-low.c +++ b/gcc/omp-low.c @@ -179,6 +179,9 @@ struct omp_context /* Only used for omp target contexts. True if an OpenMP construct other than teams is strictly nested in it. */ bool nonteams_nested_p; + + /* Candidates for adjusting OpenACC privatization level. */ + vec oacc_privatization_candidates; }; static splay_tree all_contexts; @@ -7132,8 +7135,9 @@ lower_lastprivate_clauses (tree clauses, tree predicate, gimple_seq *body_p, static void lower_oacc_reductions (location_t loc, tree clauses, tree level, bool inner, - gcall *fork, gcall *join, gimple_seq *fork_seq, - gimple_seq *join_seq, omp_context *ctx) + gcall *fork, gcall *private_marker, gcall *join, + gimple_seq *fork_seq, gimple_seq *join_seq, + omp_context *ctx) { gimple_seq before_fork = NULL; gimple_seq after_fork = NULL; @@ -7337,6 +7341,8 @@ lower_oacc_reductions (location_t loc, tree clauses, tree level, bool inner, /* Now stitch things together. */ gimple_seq_add_seq (fork_seq, before_fork); + if (private_marker) + gimple_seq_add_stmt (fork_seq, private_marker); if (fork) gimple_seq_add_stmt (fork_seq, fork); gimple_seq_add_seq (fork_seq, after_fork); @@ -8116,7 +8122,7 @@ lower_oacc_loop_marker (location_t loc, tree ddvar, bool head, HEAD and TAIL. */ static void -lower_oacc_head_tail (location_t loc, tree clauses, +lower_oacc_head_tail (location_t loc, tree clauses, gcall *private_marker, gimple_seq *head, gimple_seq *tail, omp_context *ctx) { bool inner = false; @@ -8124,6 +8130,14 @@ lower_oacc_head_tail (location_t loc, tree clauses, gimple_seq_add_stmt (head, gimple_build_assign (ddvar, integer_zero_node)); unsigned count = lower_oacc_head_mark (loc, ddvar, clauses, head, ctx); + + if (private_marker) + { + gimple_set_location (private_marker, loc); + gimple_call_set_lhs (private_marker, ddvar); + gimple_call_set_arg (private_marker, 1, ddvar); + } + tree fork_kind = build_int_cst (unsigned_type_node, IFN_UNIQUE_OACC_FORK); tree join_kind = build_int_cst (unsigned_type_node, IFN_UNIQUE_OACC_JOIN); @@ -8154,7 +8168,8 @@ lower_oacc_head_tail (location_t loc, tree clauses, &join_seq); lower_oacc_reductions (loc, clauses, place, inner, - fork, join, &fork_seq, &join_seq, ctx); + fork, (count == 1) ? private_marker : NULL, + join, &fork_seq, &join_seq, ctx); /* Append this level to head. */ gimple_seq_add_seq (head, fork_seq); @@ -10129,6 +10144,32 @@ lower_omp_for_lastprivate (struct omp_for_data *fd, gimple_seq *body_p, } } +/* Scan CLAUSES for candidates for adjusting OpenACC privatization level in + CTX. */ + +static void +oacc_privatization_scan_clause_chain (omp_context *ctx, tree clauses) +{ + for (tree c = clauses; c; c = OMP_CLAUSE_CHAIN (c)) + if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_PRIVATE) + { + tree decl = OMP_CLAUSE_DECL (c); + if (VAR_P (decl) && TREE_ADDRESSABLE (decl)) + ctx->oacc_privatization_candidates.safe_push (decl); + } +} + +/* Scan DECLS for candidates for adjusting OpenACC privatization level in + CTX. */ + +static void +oacc_privatization_scan_decl_chain (omp_context *ctx, tree decls) +{ + for (tree decl = decls; decl; decl = DECL_CHAIN (decl)) + if (VAR_P (decl) && TREE_ADDRESSABLE (decl)) + ctx->oacc_privatization_candidates.safe_push (decl); +} + /* Callback for walk_gimple_seq. Find #pragma omp scan statement. */ static tree @@ -10958,6 +10999,58 @@ lower_omp_for_scan (gimple_seq *body_p, gimple_seq *dlist, gomp_for *stmt, *dlist = new_dlist; } +/* Build an internal UNIQUE function with type IFN_UNIQUE_OACC_PRIVATE listing + the addresses of variables to be made private at the surrounding + parallelism level. Such functions appear in the gimple code stream in two + forms, e.g. for a partitioned loop: + + .data_dep.6 = .UNIQUE (OACC_HEAD_MARK, .data_dep.6, 1, 68); + .data_dep.6 = .UNIQUE (OACC_PRIVATE, .data_dep.6, -1, &w); + .data_dep.6 = .UNIQUE (OACC_FORK, .data_dep.6, -1); + .data_dep.6 = .UNIQUE (OACC_HEAD_MARK, .data_dep.6); + + or alternatively, OACC_PRIVATE can appear at the top level of a parallel, + not as part of a HEAD_MARK sequence: + + .UNIQUE (OACC_PRIVATE, 0, 0, &w); + + For such stand-alone appearances, the 3rd argument is always 0, denoting + gang partitioning. */ + +static gcall * +lower_oacc_private_marker (omp_context *ctx) +{ + if (ctx->oacc_privatization_candidates.length () == 0) + return NULL; + + auto_vec args; + + args.quick_push (build_int_cst (integer_type_node, IFN_UNIQUE_OACC_PRIVATE)); + args.quick_push (integer_zero_node); + args.quick_push (integer_minus_one_node); + + int i; + tree decl; + FOR_EACH_VEC_ELT (ctx->oacc_privatization_candidates, i, decl) + { + for (omp_context *thisctx = ctx; thisctx; thisctx = thisctx->outer) + { + tree inner_decl = maybe_lookup_decl (decl, thisctx); + if (inner_decl) + { + decl = inner_decl; + break; + } + } + gcc_checking_assert (decl); + + tree addr = build_fold_addr_expr (decl); + args.safe_push (addr); + } + + return gimple_build_call_internal_vec (IFN_UNIQUE, args); +} + /* Lower code for an OMP loop directive. */ static void @@ -10974,6 +11067,8 @@ lower_omp_for (gimple_stmt_iterator *gsi_p, omp_context *ctx) push_gimplify_context (); + oacc_privatization_scan_clause_chain (ctx, gimple_omp_for_clauses (stmt)); + lower_omp (gimple_omp_for_pre_body_ptr (stmt), ctx); block = make_node (BLOCK); @@ -10992,6 +11087,8 @@ lower_omp_for (gimple_stmt_iterator *gsi_p, omp_context *ctx) gbind *inner_bind = as_a (gimple_seq_first_stmt (omp_for_body)); tree vars = gimple_bind_vars (inner_bind); + if (is_gimple_omp_oacc (ctx->stmt)) + oacc_privatization_scan_decl_chain (ctx, vars); gimple_bind_append_vars (new_stmt, vars); /* bind_vars/BLOCK_VARS are being moved to new_stmt/block, don't keep them on the inner_bind and it's block. */ @@ -11105,6 +11202,11 @@ lower_omp_for (gimple_stmt_iterator *gsi_p, omp_context *ctx) lower_omp (gimple_omp_body_ptr (stmt), ctx); + gcall *private_marker = NULL; + if (is_gimple_omp_oacc (ctx->stmt) + && !gimple_seq_empty_p (omp_for_body)) + private_marker = lower_oacc_private_marker (ctx); + /* Lower the header expressions. At this point, we can assume that the header is of the form: @@ -11159,7 +11261,7 @@ lower_omp_for (gimple_stmt_iterator *gsi_p, omp_context *ctx) if (is_gimple_omp_oacc (ctx->stmt) && !ctx_in_oacc_kernels_region (ctx)) lower_oacc_head_tail (gimple_location (stmt), - gimple_omp_for_clauses (stmt), + gimple_omp_for_clauses (stmt), private_marker, &oacc_head, &oacc_tail, ctx); /* Add OpenACC partitioning and reduction markers just before the loop. */ @@ -13156,8 +13258,14 @@ lower_omp_target (gimple_stmt_iterator *gsi_p, omp_context *ctx) them as a dummy GANG loop. */ tree level = build_int_cst (integer_type_node, GOMP_DIM_GANG); + gcall *private_marker = lower_oacc_private_marker (ctx); + + if (private_marker) + gimple_call_set_arg (private_marker, 2, level); + lower_oacc_reductions (gimple_location (ctx->stmt), clauses, level, - false, NULL, NULL, &fork_seq, &join_seq, ctx); + false, NULL, private_marker, NULL, &fork_seq, + &join_seq, ctx); } gimple_seq_add_seq (&new_body, fork_seq); @@ -13399,6 +13507,11 @@ lower_omp_1 (gimple_stmt_iterator *gsi_p, omp_context *ctx) ctx); break; case GIMPLE_BIND: + if (ctx && is_gimple_omp_oacc (ctx->stmt)) + { + tree vars = gimple_bind_vars (as_a (stmt)); + oacc_privatization_scan_decl_chain (ctx, vars); + } lower_omp (gimple_bind_body_ptr (as_a (stmt)), ctx); maybe_remove_omp_member_access_dummy_vars (as_a (stmt)); break; diff --git a/gcc/omp-offload.c b/gcc/omp-offload.c index 16124613fa7..080bdddfe88 100644 --- a/gcc/omp-offload.c +++ b/gcc/omp-offload.c @@ -53,6 +53,7 @@ along with GCC; see the file COPYING3. If not see #include "attribs.h" #include "cfgloop.h" #include "context.h" +#include "convert.h" /* Describe the OpenACC looping structure of a function. The entire function is held in a 'NULL' loop. */ @@ -1357,7 +1358,9 @@ oacc_loop_xform_head_tail (gcall *from, int level) = ((enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (stmt, 0))); - if (k == IFN_UNIQUE_OACC_FORK || k == IFN_UNIQUE_OACC_JOIN) + if (k == IFN_UNIQUE_OACC_FORK + || k == IFN_UNIQUE_OACC_JOIN + || k == IFN_UNIQUE_OACC_PRIVATE) *gimple_call_arg_ptr (stmt, 2) = replacement; else if (k == kind && stmt != from) break; @@ -1774,6 +1777,136 @@ default_goacc_reduction (gcall *call) gsi_replace_with_seq (&gsi, seq, true); } +struct var_decl_rewrite_info +{ + gimple *stmt; + hash_map *adjusted_vars; + bool avoid_pointer_conversion; + bool modified; +}; + +/* Helper function for execute_oacc_device_lower. Rewrite VAR_DECLs (by + themselves or wrapped in various other nodes) according to ADJUSTED_VARS in + the var_decl_rewrite_info pointed to via DATA. Used as part of coercing + gang-private variables in OpenACC offload regions to reside in GPU shared + memory. */ + +static tree +oacc_rewrite_var_decl (tree *tp, int *walk_subtrees, void *data) +{ + walk_stmt_info *wi = (walk_stmt_info *) data; + var_decl_rewrite_info *info = (var_decl_rewrite_info *) wi->info; + + if (TREE_CODE (*tp) == ADDR_EXPR) + { + tree arg = TREE_OPERAND (*tp, 0); + tree *new_arg = info->adjusted_vars->get (arg); + + if (new_arg) + { + if (info->avoid_pointer_conversion) + { + *tp = build_fold_addr_expr (*new_arg); + info->modified = true; + *walk_subtrees = 0; + } + else + { + gimple_stmt_iterator gsi = gsi_for_stmt (info->stmt); + tree repl = build_fold_addr_expr (*new_arg); + gimple *stmt1 + = gimple_build_assign (make_ssa_name (TREE_TYPE (repl)), repl); + tree conv = convert_to_pointer (TREE_TYPE (*tp), + gimple_assign_lhs (stmt1)); + gimple *stmt2 + = gimple_build_assign (make_ssa_name (TREE_TYPE (*tp)), conv); + gsi_insert_before (&gsi, stmt1, GSI_SAME_STMT); + gsi_insert_before (&gsi, stmt2, GSI_SAME_STMT); + *tp = gimple_assign_lhs (stmt2); + info->modified = true; + *walk_subtrees = 0; + } + } + } + else if (TREE_CODE (*tp) == COMPONENT_REF || TREE_CODE (*tp) == ARRAY_REF) + { + tree *base = &TREE_OPERAND (*tp, 0); + + while (TREE_CODE (*base) == COMPONENT_REF + || TREE_CODE (*base) == ARRAY_REF) + base = &TREE_OPERAND (*base, 0); + + if (TREE_CODE (*base) != VAR_DECL) + return NULL; + + tree *new_decl = info->adjusted_vars->get (*base); + if (!new_decl) + return NULL; + + int base_quals = TYPE_QUALS (TREE_TYPE (*new_decl)); + tree field = TREE_OPERAND (*tp, 1); + + /* Adjust the type of the field. */ + int field_quals = TYPE_QUALS (TREE_TYPE (field)); + if (TREE_CODE (field) == FIELD_DECL && field_quals != base_quals) + { + tree *field_type = &TREE_TYPE (field); + while (TREE_CODE (*field_type) == ARRAY_TYPE) + field_type = &TREE_TYPE (*field_type); + field_quals |= base_quals; + *field_type = build_qualified_type (*field_type, field_quals); + } + + /* Adjust the type of the component ref itself. */ + tree comp_type = TREE_TYPE (*tp); + int comp_quals = TYPE_QUALS (comp_type); + if (TREE_CODE (*tp) == COMPONENT_REF && comp_quals != base_quals) + { + comp_quals |= base_quals; + TREE_TYPE (*tp) + = build_qualified_type (comp_type, comp_quals); + } + + *base = *new_decl; + info->modified = true; + } + else if (TREE_CODE (*tp) == VAR_DECL) + { + tree *new_decl = info->adjusted_vars->get (*tp); + if (new_decl) + { + *tp = *new_decl; + info->modified = true; + } + } + + return NULL_TREE; +} + +/* Return TRUE if CALL is a call to a builtin atomic/sync operation. */ + +static bool +is_sync_builtin_call (gcall *call) +{ + tree callee = gimple_call_fndecl (call); + + if (callee != NULL_TREE + && gimple_call_builtin_p (call, BUILT_IN_NORMAL)) + switch (DECL_FUNCTION_CODE (callee)) + { +#undef DEF_SYNC_BUILTIN +#define DEF_SYNC_BUILTIN(ENUM, NAME, TYPE, ATTRS) case ENUM: +#include "sync-builtins.def" +#undef DEF_SYNC_BUILTIN + return true; + + default: + ; + } + + return false; +} + /* Main entry point for oacc transformations which run on the device compiler after LTO, so we know what the target device is at this point (including the host fallback). */ @@ -1923,6 +2056,8 @@ execute_oacc_device_lower () dominance information to update SSA. */ calculate_dominance_info (CDI_DOMINATORS); + hash_map adjusted_vars; + /* Now lower internal loop functions to target-specific code sequences. */ basic_block bb; @@ -1999,6 +2134,45 @@ execute_oacc_device_lower () case IFN_UNIQUE_OACC_TAIL_MARK: remove = true; break; + + case IFN_UNIQUE_OACC_PRIVATE: + { + HOST_WIDE_INT level + = TREE_INT_CST_LOW (gimple_call_arg (call, 2)); + if (level == -1) + break; + for (unsigned i = 3; + i < gimple_call_num_args (call); + i++) + { + tree arg = gimple_call_arg (call, i); + gcc_checking_assert (TREE_CODE (arg) == ADDR_EXPR); + tree decl = TREE_OPERAND (arg, 0); + if (dump_file && (dump_flags & TDF_DETAILS)) + { + static char const *const axes[] = + /* Must be kept in sync with GOMP_DIM + enumeration. */ + { "gang", "worker", "vector" }; + fprintf (dump_file, "Decl UID %u has %s " + "partitioning:", DECL_UID (decl), + axes[level]); + print_generic_decl (dump_file, decl, TDF_SLIM); + fputc ('\n', dump_file); + } + if (targetm.goacc.adjust_private_decl) + { + tree oldtype = TREE_TYPE (decl); + tree newdecl + = targetm.goacc.adjust_private_decl (decl, level); + if (TREE_TYPE (newdecl) != oldtype + || newdecl != decl) + adjusted_vars.put (decl, newdecl); + } + } + remove = true; + } + break; } break; } @@ -2030,6 +2204,55 @@ execute_oacc_device_lower () gsi_next (&gsi); } + /* Make adjustments to gang-private local variables if required by the + target, e.g. forcing them into a particular address space. Afterwards, + ADDR_EXPR nodes which have adjusted variables as their argument need to + be modified in one of two ways: + + 1. They can be recreated, making a pointer to the variable in the new + address space, or + + 2. The address of the variable in the new address space can be taken, + converted to the default (original) address space, and the result of + that conversion subsituted in place of the original ADDR_EXPR node. + + Which of these is done depends on the gimple statement being processed. + At present atomic operations and inline asms use (1), and everything else + uses (2). At least on AMD GCN, there are atomic operations that work + directly in the LDS address space. + + COMPONENT_REFS, ARRAY_REFS and plain VAR_DECLs are also rewritten to use + the new decl, adjusting types of appropriate tree nodes as necessary. */ + + if (targetm.goacc.adjust_private_decl) + { + FOR_ALL_BB_FN (bb, cfun) + for (gimple_stmt_iterator gsi = gsi_start_bb (bb); + !gsi_end_p (gsi); + gsi_next (&gsi)) + { + gimple *stmt = gsi_stmt (gsi); + walk_stmt_info wi; + var_decl_rewrite_info info; + + info.avoid_pointer_conversion + = (is_gimple_call (stmt) + && is_sync_builtin_call (as_a (stmt))) + || gimple_code (stmt) == GIMPLE_ASM; + info.stmt = stmt; + info.modified = false; + info.adjusted_vars = &adjusted_vars; + + memset (&wi, 0, sizeof (wi)); + wi.info = &info; + + walk_gimple_op (stmt, oacc_rewrite_var_decl, &wi); + + if (info.modified) + update_stmt (stmt); + } + } + free_oacc_loop (loops); return 0; diff --git a/gcc/target.def b/gcc/target.def index bbaf6b4f3a0..660b69f5cb5 100644 --- a/gcc/target.def +++ b/gcc/target.def @@ -1726,6 +1726,35 @@ for allocating any storage for reductions when necessary.", void, (gcall *call), default_goacc_reduction) +DEFHOOK +(adjust_private_decl, +"This hook, if defined, is used by accelerator target back-ends to adjust\n\ +OpenACC variable declarations that should be made private to the given\n\ +parallelism level (i.e. @code{GOMP_DIM_GANG}, @code{GOMP_DIM_WORKER} or\n\ +@code{GOMP_DIM_VECTOR}). A typical use for this hook is to force variable\n\ +declarations at the @code{gang} level to reside in GPU shared memory.\n\ +\n\ +You may also use the @code{TARGET_GOACC_EXPAND_VAR_DECL} hook if the\n\ +adjusted variable declaration needs to be expanded to RTL in a non-standard\n\ +way.", +tree, (tree var, int level), +NULL) + +DEFHOOK +(expand_var_decl, +"This hook, if defined, is used by accelerator target back-ends to expand\n\ +specially handled kinds of @code{VAR_DECL} expressions. A particular use is\n\ +to place variables with specific attributes inside special accelarator\n\ +memories. A return value of @code{NULL} indicates that the target does not\n\ +handle this @code{VAR_DECL}, and normal RTL expanding is resumed.\n\ +\n\ +Only define this hook if your accelerator target needs to expand certain\n\ +@code{VAR_DECL} nodes in a way that differs from the default. You can also adjust\n\ +private variables at OpenACC device-lowering time using the\n\ +@code{TARGET_GOACC_ADJUST_PRIVATE_DECL} target hook.", +rtx, (tree var), +NULL) + HOOK_VECTOR_END (goacc) /* Functions relating to vectorization. */ diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/private-atomic-1-gang.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/private-atomic-1-gang.c new file mode 100644 index 00000000000..28222c25da3 --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/private-atomic-1-gang.c @@ -0,0 +1,38 @@ +#include + +int main (void) +{ + int ret; + + #pragma acc parallel num_gangs(1) num_workers(32) copyout(ret) + { + int w = 0; + + #pragma acc loop worker + for (int i = 0; i < 32; i++) + { + #pragma acc atomic update + w++; + } + + ret = (w == 32); + } + assert (ret); + + #pragma acc parallel num_gangs(1) vector_length(32) copyout(ret) + { + int v = 0; + + #pragma acc loop vector + for (int i = 0; i < 32; i++) + { + #pragma acc atomic update + v++; + } + + ret = (v == 32); + } + assert (ret); + + return 0; +} diff --git a/libgomp/testsuite/libgomp.oacc-fortran/private-atomic-1-gang.f90 b/libgomp/testsuite/libgomp.oacc-fortran/private-atomic-1-gang.f90 new file mode 100644 index 00000000000..81487d7a7e0 --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-fortran/private-atomic-1-gang.f90 @@ -0,0 +1,25 @@ +! Test for "oacc gang-private" attribute on gang-private variables + +! { dg-do run } +! { dg-additional-options "-fdump-tree-oaccdevlow-details -w" } + +program main + integer :: w, arr(0:31) + + !$acc parallel num_gangs(32) num_workers(32) copyout(arr) + !$acc loop gang private(w) +! { dg-final { scan-tree-dump-times "Decl UID \[0-9\]+ has gang partitioning: integer\\(kind=4\\) w;" 1 "oaccdevlow" } } */ + do j = 0, 31 + w = 0 + !$acc loop seq + do i = 0, 31 + !$acc atomic update + w = w + 1 + !$acc end atomic + end do + arr(j) = w + end do + !$acc end parallel + + if (any (arr .ne. 32)) stop 1 +end program main diff --git a/libgomp/testsuite/libgomp.oacc-fortran/private-atomic-1-worker.f90 b/libgomp/testsuite/libgomp.oacc-fortran/private-atomic-1-worker.f90 new file mode 100644 index 00000000000..21d13754591 --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-fortran/private-atomic-1-worker.f90 @@ -0,0 +1,32 @@ +! Test for worker-private variables + +! { dg-do run } +! { dg-additional-options "-fdump-tree-oaccdevlow-details" } + +program main + integer :: w, arr(0:31) + + !$acc parallel num_gangs(32) num_workers(32) copyout(arr) + !$acc loop gang worker private(w) +! { dg-final { scan-tree-dump-times "Decl UID \[0-9\]+ has worker partitioning: integer\\(kind=4\\) w;" 1 "oaccdevlow" } } */ + do j = 0, 31 + w = 0 + !$acc loop seq + do i = 0, 31 + !$acc atomic update + w = w + 1 + ! nvptx offloading: PR83812 "operation not supported on global/shared address space". + ! { dg-output "(\n|\r\n|\r)libgomp: cuStreamSynchronize error: operation not supported on global/shared address space(\n|\r\n|\r)$" { target openacc_nvidia_accel_selected } } + ! Scan for what we expect in the "XFAILed" case (without actually XFAILing). + ! { dg-shouldfail "XFAILed" { openacc_nvidia_accel_selected } } + ! ... instead of 'dg-xfail-run-if' so that 'dg-output' is evaluated at all. + ! { dg-final { if { [dg-process-target { xfail openacc_nvidia_accel_selected }] == "F" } { xfail "[testname-for-summary] really is XFAILed" } } } + ! ... so that we still get an XFAIL visible in the log. + !$acc end atomic + end do + arr(j) = w + end do + !$acc end parallel + + if (any (arr .ne. 32)) stop 1 +end program main