1714 lines
46 KiB
C
1714 lines
46 KiB
C
/* Bits of OpenMP and OpenACC handling that is specific to device offloading
|
|
and a lowering pass for OpenACC device directives.
|
|
|
|
Copyright (C) 2005-2017 Free Software Foundation, Inc.
|
|
|
|
This file is part of GCC.
|
|
|
|
GCC is free software; you can redistribute it and/or modify it under
|
|
the terms of the GNU General Public License as published by the Free
|
|
Software Foundation; either version 3, or (at your option) any later
|
|
version.
|
|
|
|
GCC is distributed in the hope that it will be useful, but WITHOUT ANY
|
|
WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with GCC; see the file COPYING3. If not see
|
|
<http://www.gnu.org/licenses/>. */
|
|
|
|
#include "config.h"
|
|
#include "system.h"
|
|
#include "coretypes.h"
|
|
#include "backend.h"
|
|
#include "target.h"
|
|
#include "tree.h"
|
|
#include "gimple.h"
|
|
#include "tree-pass.h"
|
|
#include "ssa.h"
|
|
#include "cgraph.h"
|
|
#include "pretty-print.h"
|
|
#include "diagnostic-core.h"
|
|
#include "fold-const.h"
|
|
#include "internal-fn.h"
|
|
#include "gimplify.h"
|
|
#include "gimple-iterator.h"
|
|
#include "gimplify-me.h"
|
|
#include "gimple-walk.h"
|
|
#include "tree-cfg.h"
|
|
#include "tree-into-ssa.h"
|
|
#include "common/common-target.h"
|
|
#include "omp-general.h"
|
|
#include "omp-offload.h"
|
|
#include "lto-section-names.h"
|
|
#include "gomp-constants.h"
|
|
#include "gimple-pretty-print.h"
|
|
|
|
/* Describe the OpenACC looping structure of a function. The entire
|
|
function is held in a 'NULL' loop. */
|
|
|
|
struct oacc_loop
{
  oacc_loop *parent; /* Containing loop.  */

  oacc_loop *child; /* First inner loop.  */

  oacc_loop *sibling; /* Next loop within same parent.  */

  location_t loc; /* Location of the loop start.  */

  gcall *marker; /* Initial head marker.  */

  /* Per-dimension head/tail marker calls, indexed by GOMP_DIM_*.  */
  gcall *heads[GOMP_DIM_MAX];  /* Head marker functions.  */
  gcall *tails[GOMP_DIM_MAX];  /* Tail marker functions.  */

  tree routine;  /* Pseudo-loop enclosing a routine.  */

  unsigned mask;   /* Partitioning mask.  */
  unsigned inner;  /* Partitioning of inner loops.  */
  unsigned flags;  /* Partitioning flags.  */
  unsigned ifns;   /* Contained loop abstraction functions.  */
  tree chunk_size; /* Chunk size.  */
  gcall *head_end; /* Final marker of head sequence.  */
};
|
|
|
|
/* Holds offload tables with decls: the outlined offload functions and
   the "omp declare target" variables, in declaration order.  */
vec<tree, va_gc> *offload_funcs, *offload_vars;
|
|
|
|
/* Return level at which oacc routine may spawn a partitioned loop, or
|
|
-1 if it is not a routine (i.e. is an offload fn). */
|
|
|
|
static int
|
|
oacc_fn_attrib_level (tree attr)
|
|
{
|
|
tree pos = TREE_VALUE (attr);
|
|
|
|
if (!TREE_PURPOSE (pos))
|
|
return -1;
|
|
|
|
int ix = 0;
|
|
for (ix = 0; ix != GOMP_DIM_MAX;
|
|
ix++, pos = TREE_CHAIN (pos))
|
|
if (!integer_zerop (TREE_PURPOSE (pos)))
|
|
break;
|
|
|
|
return ix;
|
|
}
|
|
|
|
/* Helper function for omp_finish_file routine. Takes decls from V_DECLS and
|
|
adds their addresses and sizes to constructor-vector V_CTOR. */
|
|
|
|
static void
add_decls_addresses_to_decl_constructor (vec<tree, va_gc> *v_decls,
					 vec<constructor_elt, va_gc> *v_ctor)
{
  unsigned len = vec_safe_length (v_decls);
  for (unsigned i = 0; i < len; i++)
    {
      tree it = (*v_decls)[i];
      bool is_var = VAR_P (it);
      /* "omp declare target link" variables are handled specially: on the
	 accelerator side they additionally must have a value expr (the
	 indirection pointer).  */
      bool is_link_var
	= is_var
#ifdef ACCEL_COMPILER
	  && DECL_HAS_VALUE_EXPR_P (it)
#endif
	  && lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (it));

      /* Only variables contribute a size entry; functions contribute their
	 address only.  */
      tree size = NULL_TREE;
      if (is_var)
	size = fold_convert (const_ptr_type_node, DECL_SIZE_UNIT (it));

      tree addr;
      if (!is_link_var)
	addr = build_fold_addr_expr (it);
      else
	{
#ifdef ACCEL_COMPILER
	  /* For "omp declare target link" vars add address of the pointer to
	     the target table, instead of address of the var.  */
	  tree value_expr = DECL_VALUE_EXPR (it);
	  tree link_ptr_decl = TREE_OPERAND (value_expr, 0);
	  varpool_node::finalize_decl (link_ptr_decl);
	  addr = build_fold_addr_expr (link_ptr_decl);
#else
	  addr = build_fold_addr_expr (it);
#endif

	  /* Most significant bit of the size marks "omp declare target link"
	     vars in host and target tables.  */
	  unsigned HOST_WIDE_INT isize = tree_to_uhwi (size);
	  isize |= 1ULL << (int_size_in_bytes (const_ptr_type_node)
			    * BITS_PER_UNIT - 1);
	  size = wide_int_to_tree (const_ptr_type_node, isize);
	}

      CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, addr);
      if (is_var)
	CONSTRUCTOR_APPEND_ELT (v_ctor, NULL_TREE, size);
    }
}
|
|
|
|
/* Create new symbols containing (address, size) pairs for global variables,
|
|
marked with "omp declare target" attribute, as well as addresses for the
|
|
functions, which are outlined offloading regions. */
|
|
void
omp_finish_file (void)
{
  unsigned num_funcs = vec_safe_length (offload_funcs);
  unsigned num_vars = vec_safe_length (offload_vars);

  /* Nothing to emit if no offloaded entities were recorded.  */
  if (num_funcs == 0 && num_vars == 0)
    return;

  if (targetm_common.have_named_sections)
    {
      /* Build two constant arrays: one of function addresses, one of
	 (address, size) pairs for variables, and place them in the
	 dedicated offload table sections.  */
      vec<constructor_elt, va_gc> *v_f, *v_v;
      vec_alloc (v_f, num_funcs);
      vec_alloc (v_v, num_vars * 2);

      add_decls_addresses_to_decl_constructor (offload_funcs, v_f);
      add_decls_addresses_to_decl_constructor (offload_vars, v_v);

      /* Variables get two slots each (address and size); functions one.  */
      tree vars_decl_type = build_array_type_nelts (pointer_sized_int_node,
						    num_vars * 2);
      tree funcs_decl_type = build_array_type_nelts (pointer_sized_int_node,
						     num_funcs);
      SET_TYPE_ALIGN (vars_decl_type, TYPE_ALIGN (pointer_sized_int_node));
      SET_TYPE_ALIGN (funcs_decl_type, TYPE_ALIGN (pointer_sized_int_node));
      tree ctor_v = build_constructor (vars_decl_type, v_v);
      tree ctor_f = build_constructor (funcs_decl_type, v_f);
      TREE_CONSTANT (ctor_v) = TREE_CONSTANT (ctor_f) = 1;
      TREE_STATIC (ctor_v) = TREE_STATIC (ctor_f) = 1;
      tree funcs_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
				    get_identifier (".offload_func_table"),
				    funcs_decl_type);
      tree vars_decl = build_decl (UNKNOWN_LOCATION, VAR_DECL,
				   get_identifier (".offload_var_table"),
				   vars_decl_type);
      TREE_STATIC (funcs_decl) = TREE_STATIC (vars_decl) = 1;
      /* Do not align tables more than TYPE_ALIGN (pointer_sized_int_node),
	 otherwise a joint table in a binary will contain padding between
	 tables from multiple object files.  */
      DECL_USER_ALIGN (funcs_decl) = DECL_USER_ALIGN (vars_decl) = 1;
      SET_DECL_ALIGN (funcs_decl, TYPE_ALIGN (funcs_decl_type));
      SET_DECL_ALIGN (vars_decl, TYPE_ALIGN (vars_decl_type));
      DECL_INITIAL (funcs_decl) = ctor_f;
      DECL_INITIAL (vars_decl) = ctor_v;
      set_decl_section_name (funcs_decl, OFFLOAD_FUNC_TABLE_SECTION_NAME);
      set_decl_section_name (vars_decl, OFFLOAD_VAR_TABLE_SECTION_NAME);

      varpool_node::finalize_decl (vars_decl);
      varpool_node::finalize_decl (funcs_decl);
    }
  else
    {
      /* No named sections: hand each symbol to the target hook
	 individually.  */
      for (unsigned i = 0; i < num_funcs; i++)
	{
	  tree it = (*offload_funcs)[i];
	  targetm.record_offload_symbol (it);
	}
      for (unsigned i = 0; i < num_vars; i++)
	{
	  tree it = (*offload_vars)[i];
	  targetm.record_offload_symbol (it);
	}
    }
}
|
|
|
|
/* Find the number of threads (POS = false), or thread number (POS =
|
|
true) for an OpenACC region partitioned as MASK. Setup code
|
|
required for the calculation is added to SEQ. */
|
|
|
|
static tree
oacc_thread_numbers (bool pos, int mask, gimple_seq *seq)
{
  /* For a size query (POS = false), start with 1 so dimension sizes can
     be multiplied in; for a position query, start empty.  */
  tree res = pos ? NULL_TREE : build_int_cst (unsigned_type_node, 1);
  unsigned ix;

  /* Start at gang level, and examine relevant dimension indices.  */
  for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
    if (GOMP_DIM_MASK (ix) & mask)
      {
	tree arg = build_int_cst (unsigned_type_node, ix);

	if (res)
	  {
	    /* We had an outer index, so scale that by the size of
	       this dimension.  */
	    tree n = create_tmp_var (integer_type_node);
	    gimple *call
	      = gimple_build_call_internal (IFN_GOACC_DIM_SIZE, 1, arg);

	    gimple_call_set_lhs (call, n);
	    gimple_seq_add_stmt (seq, call);
	    res = fold_build2 (MULT_EXPR, integer_type_node, res, n);
	  }
	if (pos)
	  {
	    /* Determine index in this dimension.  */
	    tree id = create_tmp_var (integer_type_node);
	    gimple *call = gimple_build_call_internal
	      (IFN_GOACC_DIM_POS, 1, arg);

	    gimple_call_set_lhs (call, id);
	    gimple_seq_add_stmt (seq, call);
	    if (res)
	      res = fold_build2 (PLUS_EXPR, integer_type_node, res, id);
	    else
	      res = id;
	  }
      }

  /* No dimension selected: position is trivially zero.  */
  if (res == NULL_TREE)
    res = integer_zero_node;

  return res;
}
|
|
|
|
/* Transform IFN_GOACC_LOOP calls to actual code. See
|
|
expand_oacc_for for where these are generated. At the vector
|
|
level, we stride loops, such that each member of a warp will
|
|
operate on adjacent iterations. At the worker and gang level,
|
|
each gang/warp executes a set of contiguous iterations. Chunking
|
|
can override this such that each iteration engine executes a
|
|
contiguous chunk, and then moves on to stride to the next chunk. */
|
|
|
|
static void
oacc_xform_loop (gcall *call)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  enum ifn_goacc_loop_kind code
    = (enum ifn_goacc_loop_kind) TREE_INT_CST_LOW (gimple_call_arg (call, 0));
  tree dir = gimple_call_arg (call, 1);
  tree range = gimple_call_arg (call, 2);
  tree step = gimple_call_arg (call, 3);
  tree chunk_size = NULL_TREE;
  unsigned mask = (unsigned) TREE_INT_CST_LOW (gimple_call_arg (call, 5));
  tree lhs = gimple_call_lhs (call);
  tree type = TREE_TYPE (lhs);
  tree diff_type = TREE_TYPE (range);
  tree r = NULL_TREE;
  gimple_seq seq = NULL;
  bool chunking = false, striding = true;
  /* Isolate the lowest set bit: that is the outermost partitioned
     dimension; the rest (if any) are the inner dimensions.  */
  unsigned outer_mask = mask & (~mask + 1); // Outermost partitioning
  unsigned inner_mask = mask & ~outer_mask; // Inner partitioning (if any)

#ifdef ACCEL_COMPILER
  chunk_size = gimple_call_arg (call, 4);
  if (integer_minus_onep (chunk_size)  /* Force static allocation.  */
      || integer_zerop (chunk_size))   /* Default (also static).  */
    {
      /* If we're at the gang level, we want each to execute a
	 contiguous run of iterations.  Otherwise we want each element
	 to stride.  */
      striding = !(outer_mask & GOMP_DIM_MASK (GOMP_DIM_GANG));
      chunking = false;
    }
  else
    {
      /* Chunk of size 1 is striding.  */
      striding = integer_onep (chunk_size);
      chunking = !striding;
    }
#endif

  /* striding=true, chunking=true
       -> invalid.
     striding=true, chunking=false
       -> chunks=1
     striding=false,chunking=true
       -> chunks=ceil (range/(chunksize*threads*step))
     striding=false,chunking=false
       -> chunk_size=ceil(range/(threads*step)),chunks=1  */
  push_gimplify_context (true);

  switch (code)
    {
    default: gcc_unreachable ();

    case IFN_GOACC_LOOP_CHUNKS:
      if (!chunking)
	r = build_int_cst (type, 1);
      else
	{
	  /* chunk_max
	     = (range - dir) / (chunks * step * num_threads) + dir  */
	  tree per = oacc_thread_numbers (false, mask, &seq);
	  per = fold_convert (type, per);
	  chunk_size = fold_convert (type, chunk_size);
	  per = fold_build2 (MULT_EXPR, type, per, chunk_size);
	  per = fold_build2 (MULT_EXPR, type, per, step);
	  r = build2 (MINUS_EXPR, type, range, dir);
	  r = build2 (PLUS_EXPR, type, r, per);
	  r = build2 (TRUNC_DIV_EXPR, type, r, per);
	}
      break;

    case IFN_GOACC_LOOP_STEP:
      {
	/* If striding, step by the entire compute volume, otherwise
	   step by the inner volume.  */
	unsigned volume = striding ? mask : inner_mask;

	r = oacc_thread_numbers (false, volume, &seq);
	r = build2 (MULT_EXPR, type, fold_convert (type, r), step);
      }
      break;

    case IFN_GOACC_LOOP_OFFSET:
      if (striding)
	{
	  /* Striding: each thread starts at its own linear position.  */
	  r = oacc_thread_numbers (true, mask, &seq);
	  r = fold_convert (diff_type, r);
	}
      else
	{
	  tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
	  tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
	  tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
				     inner_size, outer_size);

	  volume = fold_convert (diff_type, volume);
	  if (chunking)
	    chunk_size = fold_convert (diff_type, chunk_size);
	  else
	    {
	      /* Unchunked: compute the implicit chunk size covering the
		 whole range, ceil (range / (volume * step)).  */
	      tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);

	      chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
	      chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
	      chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
	    }

	  /* offset = outer_pos * span + inner_pos, scaled by step below.  */
	  tree span = build2 (MULT_EXPR, diff_type, chunk_size,
			      fold_convert (diff_type, inner_size));
	  r = oacc_thread_numbers (true, outer_mask, &seq);
	  r = fold_convert (diff_type, r);
	  r = build2 (MULT_EXPR, diff_type, r, span);

	  tree inner = oacc_thread_numbers (true, inner_mask, &seq);
	  inner = fold_convert (diff_type, inner);
	  r = fold_build2 (PLUS_EXPR, diff_type, r, inner);

	  if (chunking)
	    {
	      /* Advance by the chunk number (argument 6) times the volume
		 covered per chunk.  */
	      tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6));
	      tree per
		= fold_build2 (MULT_EXPR, diff_type, volume, chunk_size);
	      per = build2 (MULT_EXPR, diff_type, per, chunk);

	      r = build2 (PLUS_EXPR, diff_type, r, per);
	    }
	}
      r = fold_build2 (MULT_EXPR, diff_type, r, step);
      if (type != diff_type)
	r = fold_convert (type, r);
      break;

    case IFN_GOACC_LOOP_BOUND:
      if (striding)
	r = range;
      else
	{
	  /* Mirrors the OFFSET computation above to derive the span, then
	     clamps offset + span to the loop range.  */
	  tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
	  tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
	  tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
				     inner_size, outer_size);

	  volume = fold_convert (diff_type, volume);
	  if (chunking)
	    chunk_size = fold_convert (diff_type, chunk_size);
	  else
	    {
	      tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);

	      chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
	      chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
	      chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
	    }

	  tree span = build2 (MULT_EXPR, diff_type, chunk_size,
			      fold_convert (diff_type, inner_size));

	  r = fold_build2 (MULT_EXPR, diff_type, span, step);

	  tree offset = gimple_call_arg (call, 6);
	  r = build2 (PLUS_EXPR, diff_type, r,
		      fold_convert (diff_type, offset));
	  /* Clamp against the range; direction selects MIN vs MAX.  */
	  r = build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR,
		      diff_type, r, range);
	}
      if (diff_type != type)
	r = fold_convert (type, r);
      break;
    }

  gimplify_assign (lhs, r, &seq);

  pop_gimplify_context (NULL);

  /* Replace the IFN_GOACC_LOOP call with the computed sequence.  */
  gsi_replace_with_seq (&gsi, seq, true);
}
|
|
|
|
/* Default partitioned and minimum partitioned dimensions.
   oacc_default_dims[] holds the user/target chosen defaults (-1 while
   unset); oacc_min_dims[] holds the smallest permissible sizes.  */

static int oacc_default_dims[GOMP_DIM_MAX];
static int oacc_min_dims[GOMP_DIM_MAX];
|
|
|
|
/* Parse the default dimension parameter. This is a set of
|
|
:-separated optional compute dimensions. Each specified dimension
|
|
is a positive integer. When device type support is added, it is
|
|
planned to be a comma separated list of such compute dimensions,
|
|
with all but the first prefixed by the colon-terminated device
|
|
type. */
|
|
|
|
static void
oacc_parse_default_dims (const char *dims)
{
  int ix;

  /* Reset: defaults unknown (-1), minimums 1; the target hooks below
     fill in real values.  */
  for (ix = GOMP_DIM_MAX; ix--;)
    {
      oacc_default_dims[ix] = -1;
      oacc_min_dims[ix] = 1;
    }

#ifndef ACCEL_COMPILER
  /* Cannot be overridden on the host.  */
  dims = NULL;
#endif
  if (dims)
    {
      const char *pos = dims;

      for (ix = 0; *pos && ix != GOMP_DIM_MAX; ix++)
	{
	  /* Dimensions after the first are preceded by a ':'.  */
	  if (ix)
	    {
	      if (*pos != ':')
		goto malformed;
	      pos++;
	    }

	  /* An empty field (immediately another ':') leaves this
	     dimension at its default.  */
	  if (*pos != ':')
	    {
	      long val;
	      const char *eptr;

	      errno = 0;
	      val = strtol (pos, CONST_CAST (char **, &eptr), 10);
	      /* Reject parse failure, non-positive values, and values
		 that do not fit in an int.  */
	      if (errno || val <= 0 || (int) val != val)
		goto malformed;
	      pos = eptr;
	      oacc_default_dims[ix] = (int) val;
	    }
	}
      if (*pos)
	{
	malformed:
	  error_at (UNKNOWN_LOCATION,
		    "-fopenacc-dim operand is malformed at '%s'", pos);
	}
    }

  /* Allow the backend to validate the dimensions.  */
  targetm.goacc.validate_dims (NULL_TREE, oacc_default_dims, -1);
  targetm.goacc.validate_dims (NULL_TREE, oacc_min_dims, -2);
}
|
|
|
|
/* Validate and update the dimensions for offloaded FN. ATTRS is the
|
|
raw attribute. DIMS is an array of dimensions, which is filled in.
|
|
LEVEL is the partitioning level of a routine, or -1 for an offload
|
|
region itself. USED is the mask of partitioned execution in the
|
|
function. */
|
|
|
|
static void
oacc_validate_dims (tree fn, tree attrs, int *dims, int level, unsigned used)
{
  tree purpose[GOMP_DIM_MAX];
  unsigned ix;
  tree pos = TREE_VALUE (attrs);
  bool is_kernel = oacc_fn_attrib_kernels_p (attrs);

  /* Make sure the attribute creator attached the dimension
     information.  */
  gcc_assert (pos);

  /* Extract the explicit dimensions from the attribute list; -1 marks an
     unspecified dimension.  */
  for (ix = 0; ix != GOMP_DIM_MAX; ix++)
    {
      purpose[ix] = TREE_PURPOSE (pos);
      tree val = TREE_VALUE (pos);
      dims[ix] = val ? TREE_INT_CST_LOW (val) : -1;
      pos = TREE_CHAIN (pos);
    }

  bool changed = targetm.goacc.validate_dims (fn, dims, level);

  /* Default anything left to 1 or a partitioned default.  */
  for (ix = 0; ix != GOMP_DIM_MAX; ix++)
    if (dims[ix] < 0)
      {
	/* The OpenACC spec says 'If the [num_gangs] clause is not
	   specified, an implementation-defined default will be used;
	   the default may depend on the code within the construct.'
	   (2.5.6).  Thus an implementation is free to choose
	   non-unity default for a parallel region that doesn't have
	   any gang-partitioned loops.  However, it appears that there
	   is a sufficient body of user code that expects non-gang
	   partitioned regions to not execute in gang-redundant mode.
	   So we (a) don't warn about the non-portability and (b) pick
	   the minimum permissible dimension size when there is no
	   partitioned execution.  Otherwise we pick the global
	   default for the dimension, which the user can control.  The
	   same wording and logic applies to num_workers and
	   vector_length, however the worker- or vector- single
	   execution doesn't have the same impact as gang-redundant
	   execution.  (If the minimum gang-level partitioning is not 1,
	   the target is probably too confusing.)  */
	dims[ix] = (used & GOMP_DIM_MASK (ix)
		    ? oacc_default_dims[ix] : oacc_min_dims[ix]);
	changed = true;
      }

  if (changed)
    {
      /* Replace the attribute with new values.  Build the TREE_LIST in
	 reverse so the final chain is in dimension order.  */
      pos = NULL_TREE;
      for (ix = GOMP_DIM_MAX; ix--;)
	{
	  pos = tree_cons (purpose[ix],
			   build_int_cst (integer_type_node, dims[ix]),
			   pos);
	  if (is_kernel)
	    TREE_PUBLIC (pos) = 1;
	}
      oacc_replace_fn_attrib (fn, pos);
    }
}
|
|
|
|
/* Create an empty OpenACC loop structure at LOC. */
|
|
|
|
static oacc_loop *
|
|
new_oacc_loop_raw (oacc_loop *parent, location_t loc)
|
|
{
|
|
oacc_loop *loop = XCNEW (oacc_loop);
|
|
|
|
loop->parent = parent;
|
|
loop->child = loop->sibling = NULL;
|
|
|
|
if (parent)
|
|
{
|
|
loop->sibling = parent->child;
|
|
parent->child = loop;
|
|
}
|
|
|
|
loop->loc = loc;
|
|
loop->marker = NULL;
|
|
memset (loop->heads, 0, sizeof (loop->heads));
|
|
memset (loop->tails, 0, sizeof (loop->tails));
|
|
loop->routine = NULL_TREE;
|
|
|
|
loop->mask = loop->flags = loop->inner = 0;
|
|
loop->ifns = 0;
|
|
loop->chunk_size = 0;
|
|
loop->head_end = NULL;
|
|
|
|
return loop;
|
|
}
|
|
|
|
/* Create an outermost, dummy OpenACC loop for offloaded function
|
|
DECL. */
|
|
|
|
static oacc_loop *
|
|
new_oacc_loop_outer (tree decl)
|
|
{
|
|
return new_oacc_loop_raw (NULL, DECL_SOURCE_LOCATION (decl));
|
|
}
|
|
|
|
/* Start a new OpenACC loop structure beginning at head marker HEAD.
|
|
Link into PARENT loop. Return the new loop. */
|
|
|
|
static oacc_loop *
|
|
new_oacc_loop (oacc_loop *parent, gcall *marker)
|
|
{
|
|
oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (marker));
|
|
|
|
loop->marker = marker;
|
|
|
|
/* TODO: This is where device_type flattening would occur for the loop
|
|
flags. */
|
|
|
|
loop->flags = TREE_INT_CST_LOW (gimple_call_arg (marker, 3));
|
|
|
|
tree chunk_size = integer_zero_node;
|
|
if (loop->flags & OLF_GANG_STATIC)
|
|
chunk_size = gimple_call_arg (marker, 4);
|
|
loop->chunk_size = chunk_size;
|
|
|
|
return loop;
|
|
}
|
|
|
|
/* Create a dummy loop encompassing a call to an OpenACC routine.
|
|
Extract the routine's partitioning requirements. */
|
|
|
|
static void
|
|
new_oacc_loop_routine (oacc_loop *parent, gcall *call, tree decl, tree attrs)
|
|
{
|
|
oacc_loop *loop = new_oacc_loop_raw (parent, gimple_location (call));
|
|
int level = oacc_fn_attrib_level (attrs);
|
|
|
|
gcc_assert (level >= 0);
|
|
|
|
loop->marker = call;
|
|
loop->routine = decl;
|
|
loop->mask = ((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1)
|
|
^ (GOMP_DIM_MASK (level) - 1));
|
|
}
|
|
|
|
/* Finish off the current OpenACC loop ending at tail marker TAIL.
|
|
Return the parent loop. */
|
|
|
|
static oacc_loop *
finish_oacc_loop (oacc_loop *loop)
{
  /* A loop containing no IFN_GOACC_LOOP calls was collapsed; drop its
     partitioning so later passes skip it.  */
  if (loop->ifns == 0)
    {
      loop->mask = 0;
      loop->flags = 0;
    }

  return loop->parent;
}
|
|
|
|
/* Free all OpenACC loop structures within LOOP (inclusive). */
|
|
|
|
static void
|
|
free_oacc_loop (oacc_loop *loop)
|
|
{
|
|
if (loop->sibling)
|
|
free_oacc_loop (loop->sibling);
|
|
if (loop->child)
|
|
free_oacc_loop (loop->child);
|
|
|
|
free (loop);
|
|
}
|
|
|
|
/* Dump out the OpenACC loop head or tail beginning at FROM. */
|
|
|
|
static void
dump_oacc_loop_part (FILE *file, gcall *from, int depth,
		     const char *title, int level)
{
  enum ifn_unique_kind kind
    = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));

  fprintf (file, "%*s%s-%d:\n", depth * 2, "", title, level);
  for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
    {
      gimple *stmt = gsi_stmt (gsi);

      if (gimple_call_internal_p (stmt, IFN_UNIQUE))
	{
	  enum ifn_unique_kind k
	    = ((enum ifn_unique_kind) TREE_INT_CST_LOW
	       (gimple_call_arg (stmt, 0)));

	  /* Stop at the next marker of the same kind (but not FROM
	     itself).  */
	  if (k == kind && stmt != from)
	    break;
	}
      print_gimple_stmt (file, stmt, depth * 2 + 2, 0);

      gsi_next (&gsi);
      /* The marked sequence may continue into a single successor
	 block.  */
      while (gsi_end_p (gsi))
	gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
    }
}
|
|
|
|
/* Dump OpenACC loops LOOP, its siblings and its children. */
|
|
|
|
static void
|
|
dump_oacc_loop (FILE *file, oacc_loop *loop, int depth)
|
|
{
|
|
int ix;
|
|
|
|
fprintf (file, "%*sLoop %x(%x) %s:%u\n", depth * 2, "",
|
|
loop->flags, loop->mask,
|
|
LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc));
|
|
|
|
if (loop->marker)
|
|
print_gimple_stmt (file, loop->marker, depth * 2, 0);
|
|
|
|
if (loop->routine)
|
|
fprintf (file, "%*sRoutine %s:%u:%s\n",
|
|
depth * 2, "", DECL_SOURCE_FILE (loop->routine),
|
|
DECL_SOURCE_LINE (loop->routine),
|
|
IDENTIFIER_POINTER (DECL_NAME (loop->routine)));
|
|
|
|
for (ix = GOMP_DIM_GANG; ix != GOMP_DIM_MAX; ix++)
|
|
if (loop->heads[ix])
|
|
dump_oacc_loop_part (file, loop->heads[ix], depth, "Head", ix);
|
|
for (ix = GOMP_DIM_MAX; ix--;)
|
|
if (loop->tails[ix])
|
|
dump_oacc_loop_part (file, loop->tails[ix], depth, "Tail", ix);
|
|
|
|
if (loop->child)
|
|
dump_oacc_loop (file, loop->child, depth + 1);
|
|
if (loop->sibling)
|
|
dump_oacc_loop (file, loop->sibling, depth);
|
|
}
|
|
|
|
void debug_oacc_loop (oacc_loop *);
|
|
|
|
/* Dump loops to stderr. */
|
|
|
|
DEBUG_FUNCTION void
|
|
debug_oacc_loop (oacc_loop *loop)
|
|
{
|
|
dump_oacc_loop (stderr, loop, 0);
|
|
}
|
|
|
|
/* DFS walk of basic blocks BB onwards, creating OpenACC loop
|
|
structures as we go. By construction these loops are properly
|
|
nested. */
|
|
|
|
static void
oacc_loop_discover_walk (oacc_loop *loop, basic_block bb)
{
  /* MARKER counts head/tail markers seen in the current run; REMAINING
     counts the markers still expected for the run.  */
  int marker = 0;
  int remaining = 0;

  if (bb->flags & BB_VISITED)
    return;

 follow:
  bb->flags |= BB_VISITED;

  /* Scan for loop markers.  */
  for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
       gsi_next (&gsi))
    {
      gimple *stmt = gsi_stmt (gsi);

      if (!is_gimple_call (stmt))
	continue;

      gcall *call = as_a <gcall *> (stmt);

      /* If this is a routine, make a dummy loop for it.  */
      if (tree decl = gimple_call_fndecl (call))
	if (tree attrs = oacc_get_fn_attrib (decl))
	  {
	    gcc_assert (!marker);
	    new_oacc_loop_routine (loop, call, decl, attrs);
	  }

      if (!gimple_call_internal_p (call))
	continue;

      switch (gimple_call_internal_fn (call))
	{
	default:
	  break;

	case IFN_GOACC_LOOP:
	  /* Count the goacc loop abstraction fns, to determine if the
	     loop was collapsed already.  */
	  loop->ifns++;
	  break;

	case IFN_UNIQUE:
	  enum ifn_unique_kind kind
	    = (enum ifn_unique_kind) (TREE_INT_CST_LOW
				      (gimple_call_arg (call, 0)));
	  if (kind == IFN_UNIQUE_OACC_HEAD_MARK
	      || kind == IFN_UNIQUE_OACC_TAIL_MARK)
	    {
	      /* A two-argument marker terminates a head/tail run.  */
	      if (gimple_call_num_args (call) == 2)
		{
		  gcc_assert (marker && !remaining);
		  marker = 0;
		  if (kind == IFN_UNIQUE_OACC_TAIL_MARK)
		    loop = finish_oacc_loop (loop);
		  else
		    loop->head_end = call;
		}
	      else
		{
		  /* Argument 2 carries the total marker count of this
		     run; the first head marker opens a new loop.  */
		  int count = TREE_INT_CST_LOW (gimple_call_arg (call, 2));

		  if (!marker)
		    {
		      if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
			loop = new_oacc_loop (loop, call);
		      remaining = count;
		    }
		  gcc_assert (count == remaining);
		  if (remaining)
		    {
		      remaining--;
		      /* Heads are recorded outward-in, tails
			 inward-out.  */
		      if (kind == IFN_UNIQUE_OACC_HEAD_MARK)
			loop->heads[marker] = call;
		      else
			loop->tails[remaining] = call;
		    }
		  marker++;
		}
	    }
	}
    }
  /* An unterminated marker run continues into the (unique) successor
     block; follow it before walking other edges.  */
  if (remaining || marker)
    {
      bb = single_succ (bb);
      gcc_assert (single_pred_p (bb) && !(bb->flags & BB_VISITED));
      goto follow;
    }

  /* Walk successor blocks.  */
  edge e;
  edge_iterator ei;

  FOR_EACH_EDGE (e, ei, bb->succs)
    oacc_loop_discover_walk (loop, e->dest);
}
|
|
|
|
/* LOOP is the first sibling. Reverse the order in place and return
|
|
the new first sibling. Recurse to child loops. */
|
|
|
|
static oacc_loop *
oacc_loop_sibling_nreverse (oacc_loop *loop)
{
  /* Standard in-place list reversal over the sibling chain, recursing
     into each node's children first.  */
  oacc_loop *head = NULL;

  do
    {
      if (loop->child)
	loop->child = oacc_loop_sibling_nreverse (loop->child);

      oacc_loop *rest = loop->sibling;
      loop->sibling = head;
      head = loop;
      loop = rest;
    }
  while (loop);

  return head;
}
|
|
|
|
/* Discover the OpenACC loops marked up by HEAD and TAIL markers for
|
|
the current function. */
|
|
|
|
static oacc_loop *
|
|
oacc_loop_discovery ()
|
|
{
|
|
/* Clear basic block flags, in particular BB_VISITED which we're going to use
|
|
in the following. */
|
|
clear_bb_flags ();
|
|
|
|
oacc_loop *top = new_oacc_loop_outer (current_function_decl);
|
|
oacc_loop_discover_walk (top, ENTRY_BLOCK_PTR_FOR_FN (cfun));
|
|
|
|
/* The siblings were constructed in reverse order, reverse them so
|
|
that diagnostics come out in an unsurprising order. */
|
|
top = oacc_loop_sibling_nreverse (top);
|
|
|
|
return top;
|
|
}
|
|
|
|
/* Transform the abstract internal function markers starting at FROM
|
|
to be for partitioning level LEVEL. Stop when we meet another HEAD
|
|
or TAIL marker. */
|
|
|
|
static void
oacc_loop_xform_head_tail (gcall *from, int level)
{
  enum ifn_unique_kind kind
    = (enum ifn_unique_kind) TREE_INT_CST_LOW (gimple_call_arg (from, 0));
  tree replacement = build_int_cst (unsigned_type_node, level);

  for (gimple_stmt_iterator gsi = gsi_for_stmt (from);;)
    {
      gimple *stmt = gsi_stmt (gsi);

      if (gimple_call_internal_p (stmt, IFN_UNIQUE))
	{
	  enum ifn_unique_kind k
	    = ((enum ifn_unique_kind)
	       TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));

	  /* Patch the dimension argument of fork/join markers; stop at
	     the next marker of the same kind as FROM.  */
	  if (k == IFN_UNIQUE_OACC_FORK || k == IFN_UNIQUE_OACC_JOIN)
	    *gimple_call_arg_ptr (stmt, 2) = replacement;
	  else if (k == kind && stmt != from)
	    break;
	}
      else if (gimple_call_internal_p (stmt, IFN_GOACC_REDUCTION))
	/* Reductions carry their level in argument 3.  */
	*gimple_call_arg_ptr (stmt, 3) = replacement;

      gsi_next (&gsi);
      /* The marked sequence may continue into a single successor
	 block.  */
      while (gsi_end_p (gsi))
	gsi = gsi_start_bb (single_succ (gsi_bb (gsi)));
    }
}
|
|
|
|
/* Transform the IFN_GOACC_LOOP internal functions by providing the
|
|
determined partitioning mask and chunking argument. END_MARKER
|
|
   points at the end IFN_HEAD_TAIL call introducing the loop.  IFNS
|
|
is the number of IFN_GOACC_LOOP calls for the loop. MASK_ARG is
|
|
the replacement partitioning mask and CHUNK_ARG is the replacement
|
|
chunking arg. */
|
|
|
|
static void
oacc_loop_xform_loop (gcall *end_marker, unsigned ifns,
		      tree mask_arg, tree chunk_arg)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (end_marker);

  /* The caller guarantees at least one IFN_GOACC_LOOP call follows.  */
  gcc_checking_assert (ifns);
  for (;;)
    {
      for (; !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);

	  if (!is_gimple_call (stmt))
	    continue;

	  gcall *call = as_a <gcall *> (stmt);

	  if (!gimple_call_internal_p (call))
	    continue;

	  if (gimple_call_internal_fn (call) != IFN_GOACC_LOOP)
	    continue;

	  /* Install the determined partitioning mask (arg 5) and
	     chunking argument (arg 4).  */
	  *gimple_call_arg_ptr (call, 5) = mask_arg;
	  *gimple_call_arg_ptr (call, 4) = chunk_arg;
	  ifns--;
	  if (!ifns)
	    return;
	}

      /* The LOOP_BOUND ifn could be in the single successor
	 block.  */
      basic_block bb = single_succ (gsi_bb (gsi));
      gsi = gsi_start_bb (bb);
    }
}
|
|
|
|
/* Process the discovered OpenACC loops, setting the correct
|
|
partitioning level etc. */
|
|
|
|
static void
oacc_loop_process (oacc_loop *loop)
{
  /* Process innermost loops first.  */
  if (loop->child)
    oacc_loop_process (loop->child);

  /* Routines have no markers of their own; skip unpartitioned loops.  */
  if (loop->mask && !loop->routine)
    {
      int ix;
      unsigned mask = loop->mask;
      unsigned dim = GOMP_DIM_GANG;
      tree mask_arg = build_int_cst (unsigned_type_node, mask);
      tree chunk_arg = loop->chunk_size;

      oacc_loop_xform_loop (loop->head_end, loop->ifns, mask_arg, chunk_arg);

      /* Assign each set bit of the mask, outermost dimension first, to
	 the corresponding head/tail marker pair.  */
      for (ix = 0; ix != GOMP_DIM_MAX && mask; ix++)
	{
	  while (!(GOMP_DIM_MASK (dim) & mask))
	    dim++;

	  oacc_loop_xform_head_tail (loop->heads[ix], dim);
	  oacc_loop_xform_head_tail (loop->tails[ix], dim);

	  mask ^= GOMP_DIM_MASK (dim);
	}
    }

  if (loop->sibling)
    oacc_loop_process (loop->sibling);
}
|
|
|
|
/* Walk the OpenACC loop hierarchy checking and assigning the
|
|
programmer-specified partitionings. OUTER_MASK is the partitioning
|
|
this loop is contained within. Return mask of partitioning
|
|
encountered. If any auto loops are discovered, set GOMP_DIM_MAX
|
|
bit. */
|
|
|
|
static unsigned
oacc_loop_fixed_partitions (oacc_loop *loop, unsigned outer_mask)
{
  unsigned this_mask = loop->mask;
  unsigned mask_all = 0;
  bool noisy = true;

#ifdef ACCEL_COMPILER
  /* When device_type is supported, we want the device compiler to be
     noisy, if the loop parameters are device_type-specific.  */
  noisy = false;
#endif

  if (!loop->routine)
    {
      bool auto_par = (loop->flags & OLF_AUTO) != 0;
      bool seq_par = (loop->flags & OLF_SEQ) != 0;

      /* Extract the explicitly requested dimensions from the flags.  */
      this_mask = ((loop->flags >> OLF_DIM_BASE)
		   & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1));

      /* Explicit dimensions, 'auto' and 'seq' are mutually exclusive;
	 'seq' wins, dropping everything else.  */
      if ((this_mask != 0) + auto_par + seq_par > 1)
	{
	  if (noisy)
	    error_at (loop->loc,
		      seq_par
		      ? "%<seq%> overrides other OpenACC loop specifiers"
		      : "%<auto%> conflicts with other OpenACC loop "
		      "specifiers");
	  auto_par = false;
	  loop->flags &= ~OLF_AUTO;
	  if (seq_par)
	    {
	      loop->flags
		&= ~((GOMP_DIM_MASK (GOMP_DIM_MAX) - 1) << OLF_DIM_BASE);
	      this_mask = 0;
	    }
	}
      /* Independent 'auto' loops are candidates for auto-partitioning;
	 flag that with the GOMP_DIM_MAX bit.  */
      if (auto_par && (loop->flags & OLF_INDEPENDENT))
	mask_all |= GOMP_DIM_MASK (GOMP_DIM_MAX);
    }

  if (this_mask & outer_mask)
    {
      /* This loop requests parallelism already used by (or disallowed
	 for) its context; find the offending containing loop, if any.  */
      const oacc_loop *outer;
      for (outer = loop->parent; outer; outer = outer->parent)
	if (outer->mask & this_mask)
	  break;

      if (noisy)
	{
	  if (outer)
	    {
	      error_at (loop->loc,
			"%s uses same OpenACC parallelism as containing loop",
			loop->routine ? "routine call" : "inner loop");
	      inform (outer->loc, "containing loop here");
	    }
	  else
	    error_at (loop->loc,
		      "%s uses OpenACC parallelism disallowed by containing "
		      "routine", loop->routine ? "routine call" : "loop");

	  if (loop->routine)
	    inform (DECL_SOURCE_LOCATION (loop->routine),
		    "routine %qD declared here", loop->routine);
	}
      /* Drop the conflicting dimensions.  */
      this_mask &= ~outer_mask;
    }
  else
    {
      unsigned outermost = least_bit_hwi (this_mask);

      /* The outermost dimension requested here must be inside (strictly
	 greater than) all dimensions already claimed outside.  */
      if (outermost && outermost <= outer_mask)
	{
	  if (noisy)
	    {
	      error_at (loop->loc,
			"incorrectly nested OpenACC loop parallelism");

	      const oacc_loop *outer;
	      for (outer = loop->parent;
		   outer->flags && outer->flags < outermost;
		   outer = outer->parent)
		continue;
	      inform (outer->loc, "containing loop here");
	    }

	  this_mask &= ~outermost;
	}
    }

  loop->mask = this_mask;
  mask_all |= this_mask;

  if (loop->child)
    {
      loop->inner = oacc_loop_fixed_partitions (loop->child,
						outer_mask | this_mask);
      mask_all |= loop->inner;
    }

  if (loop->sibling)
    mask_all |= oacc_loop_fixed_partitions (loop->sibling, outer_mask);

  return mask_all;
}
|
|
|
|
/* Walk the OpenACC loop hierarchy to assign auto-partitioned loops.
   OUTER_MASK is the partitioning this loop is contained within.
   Return the cumulative partitioning used by this loop, siblings and
   children.  */

static unsigned
oacc_loop_auto_partitions (oacc_loop *loop, unsigned outer_mask)
{
  /* Only independent 'auto' loops get an axis assigned here.  */
  bool assign = (loop->flags & OLF_AUTO) && (loop->flags & OLF_INDEPENDENT);
  bool noisy = true;

#ifdef ACCEL_COMPILER
  /* When device_type is supported, we want the device compiler to be
     noisy, if the loop parameters are device_type-specific.  */
  noisy = false;
#endif

  if (assign && outer_mask < GOMP_DIM_MASK (GOMP_DIM_MAX - 1))
    {
      /* Allocate the outermost loop at the outermost available
	 level.  OUTER_MASK is a contiguous low-bit mask, so adding 1
	 yields the single next-finer axis bit.  */
      unsigned this_mask = outer_mask + 1;

      /* Only take it if no inner loop already uses that axis.  */
      if (!(this_mask & loop->inner))
	loop->mask = this_mask;
    }

  if (loop->child)
    {
      unsigned child_mask = outer_mask | loop->mask;

      /* NOTE: once this loop is (or will be) partitioned, children
	 must not take the GOMP_DIM_MAX "pending auto" slot.  */
      if (loop->mask || assign)
	child_mask |= GOMP_DIM_MASK (GOMP_DIM_MAX);

      loop->inner = oacc_loop_auto_partitions (loop->child, child_mask);
    }

  if (assign && !loop->mask)
    {
      /* Allocate the loop at the innermost available level.  */
      unsigned this_mask = 0;

      /* Determine the outermost partitioning used within this loop.  */
      this_mask = loop->inner | GOMP_DIM_MASK (GOMP_DIM_MAX);
      this_mask = least_bit_hwi (this_mask);

      /* Pick the partitioning just inside that one.  */
      this_mask >>= 1;

      /* And avoid picking one used by an outer loop.  */
      this_mask &= ~outer_mask;

      /* Zero here means every axis was already taken.  */
      if (!this_mask && noisy)
	warning_at (loop->loc, 0,
		    "insufficient partitioning available to parallelize loop");

      loop->mask = this_mask;
    }

  if (assign && dump_file)
    fprintf (dump_file, "Auto loop %s:%d assigned %d\n",
	     LOCATION_FILE (loop->loc), LOCATION_LINE (loop->loc),
	     loop->mask);

  unsigned inner_mask = 0;

  if (loop->sibling)
    inner_mask |= oacc_loop_auto_partitions (loop->sibling, outer_mask);

  /* Report everything used by this loop and its descendants.  */
  inner_mask |= loop->inner | loop->mask;

  return inner_mask;
}
|
|
|
|
/* Walk the OpenACC loop heirarchy to check and assign partitioning
|
|
axes. Return mask of partitioning. */
|
|
|
|
static unsigned
|
|
oacc_loop_partition (oacc_loop *loop, unsigned outer_mask)
|
|
{
|
|
unsigned mask_all = oacc_loop_fixed_partitions (loop, outer_mask);
|
|
|
|
if (mask_all & GOMP_DIM_MASK (GOMP_DIM_MAX))
|
|
{
|
|
mask_all ^= GOMP_DIM_MASK (GOMP_DIM_MAX);
|
|
mask_all |= oacc_loop_auto_partitions (loop, outer_mask);
|
|
}
|
|
return mask_all;
|
|
}
|
|
|
|
/* Default fork/join early expander. Delete the function calls if
|
|
there is no RTL expander. */
|
|
|
|
bool
|
|
default_goacc_fork_join (gcall *ARG_UNUSED (call),
|
|
const int *ARG_UNUSED (dims), bool is_fork)
|
|
{
|
|
if (is_fork)
|
|
return targetm.have_oacc_fork ();
|
|
else
|
|
return targetm.have_oacc_join ();
|
|
}
|
|
|
|
/* Default goacc.reduction early expander.
|
|
|
|
LHS-opt = IFN_REDUCTION (KIND, RES_PTR, VAR, LEVEL, OP, OFFSET)
|
|
If RES_PTR is not integer-zerop:
|
|
SETUP - emit 'LHS = *RES_PTR', LHS = NULL
|
|
TEARDOWN - emit '*RES_PTR = VAR'
|
|
If LHS is not NULL
|
|
emit 'LHS = VAR' */
|
|
|
|
void
|
|
default_goacc_reduction (gcall *call)
|
|
{
|
|
unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
|
|
gimple_stmt_iterator gsi = gsi_for_stmt (call);
|
|
tree lhs = gimple_call_lhs (call);
|
|
tree var = gimple_call_arg (call, 2);
|
|
gimple_seq seq = NULL;
|
|
|
|
if (code == IFN_GOACC_REDUCTION_SETUP
|
|
|| code == IFN_GOACC_REDUCTION_TEARDOWN)
|
|
{
|
|
/* Setup and Teardown need to copy from/to the receiver object,
|
|
if there is one. */
|
|
tree ref_to_res = gimple_call_arg (call, 1);
|
|
|
|
if (!integer_zerop (ref_to_res))
|
|
{
|
|
tree dst = build_simple_mem_ref (ref_to_res);
|
|
tree src = var;
|
|
|
|
if (code == IFN_GOACC_REDUCTION_SETUP)
|
|
{
|
|
src = dst;
|
|
dst = lhs;
|
|
lhs = NULL;
|
|
}
|
|
gimple_seq_add_stmt (&seq, gimple_build_assign (dst, src));
|
|
}
|
|
}
|
|
|
|
/* Copy VAR to LHS, if there is an LHS. */
|
|
if (lhs)
|
|
gimple_seq_add_stmt (&seq, gimple_build_assign (lhs, var));
|
|
|
|
gsi_replace_with_seq (&gsi, seq, true);
|
|
}
|
|
|
|
/* Main entry point for oacc transformations which run on the device
   compiler after LTO, so we know what the target device is at this
   point (including the host fallback).  */

static unsigned int
execute_oacc_device_lower ()
{
  tree attrs = oacc_get_fn_attrib (current_function_decl);

  if (!attrs)
    /* Not an offloaded function.  */
    return 0;

  /* Parse the default dim argument exactly once.  The parsed state is
     recorded by pointing flag_openacc_dims at itself, a value the
     option machinery can never produce.  */
  if ((const void *)flag_openacc_dims != &flag_openacc_dims)
    {
      oacc_parse_default_dims (flag_openacc_dims);
      flag_openacc_dims = (char *)&flag_openacc_dims;
    }

  /* Discover, partition and process the loops.  */
  oacc_loop *loops = oacc_loop_discovery ();
  int fn_level = oacc_fn_attrib_level (attrs);

  if (dump_file)
    fprintf (dump_file, oacc_fn_attrib_kernels_p (attrs)
	     ? "Function is kernels offload\n"
	     : fn_level < 0 ? "Function is parallel offload\n"
	     : "Function is routine level %d\n", fn_level);

  /* A routine at level N may not use axes coarser than N.  */
  unsigned outer_mask = fn_level >= 0 ? GOMP_DIM_MASK (fn_level) - 1 : 0;
  unsigned used_mask = oacc_loop_partition (loops, outer_mask);
  int dims[GOMP_DIM_MAX];

  oacc_validate_dims (current_function_decl, attrs, dims, fn_level, used_mask);

  if (dump_file)
    {
      const char *comma = "Compute dimensions [";
      for (int ix = 0; ix != GOMP_DIM_MAX; ix++, comma = ", ")
	fprintf (dump_file, "%s%d", comma, dims[ix]);
      fprintf (dump_file, "]\n");
    }

  oacc_loop_process (loops);
  if (dump_file)
    {
      fprintf (dump_file, "OpenACC loops\n");
      dump_oacc_loop (dump_file, loops, 0);
      fprintf (dump_file, "\n");
    }

  /* Offloaded targets may introduce new basic blocks, which require
     dominance information to update SSA.  */
  calculate_dominance_info (CDI_DOMINATORS);

  /* Now lower internal loop functions to target-specific code
     sequences.  */
  basic_block bb;
  FOR_ALL_BB_FN (bb, cfun)
    for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
      {
	gimple *stmt = gsi_stmt (gsi);
	if (!is_gimple_call (stmt))
	  {
	    gsi_next (&gsi);
	    continue;
	  }

	gcall *call = as_a <gcall *> (stmt);
	if (!gimple_call_internal_p (call))
	  {
	    gsi_next (&gsi);
	    continue;
	  }

	/* Rewind to allow rescan.  The transforms below may replace
	   CALL with a sequence of new statements; stepping back one
	   statement lets the loop revisit them.  */
	gsi_prev (&gsi);
	bool rescan = false, remove = false;
	enum internal_fn ifn_code = gimple_call_internal_fn (call);

	switch (ifn_code)
	  {
	  default: break;

	  case IFN_GOACC_LOOP:
	    oacc_xform_loop (call);
	    rescan = true;
	    break;

	  case IFN_GOACC_REDUCTION:
	    /* Mark the function for SSA renaming.  */
	    mark_virtual_operands_for_renaming (cfun);

	    /* If the level is -1, this ended up being an unused
	       axis.  Handle as a default.  */
	    if (integer_minus_onep (gimple_call_arg (call, 3)))
	      default_goacc_reduction (call);
	    else
	      targetm.goacc.reduction (call);
	    rescan = true;
	    break;

	  case IFN_UNIQUE:
	    {
	      enum ifn_unique_kind kind
		= ((enum ifn_unique_kind)
		   TREE_INT_CST_LOW (gimple_call_arg (call, 0)));

	      switch (kind)
		{
		default:
		  gcc_unreachable ();

		case IFN_UNIQUE_OACC_FORK:
		case IFN_UNIQUE_OACC_JOIN:
		  /* An axis of -1 marks an unused dimension; drop the
		     marker.  Otherwise keep it only if the target has
		     an RTL expander for it.  */
		  if (integer_minus_onep (gimple_call_arg (call, 2)))
		    remove = true;
		  else if (!targetm.goacc.fork_join
			   (call, dims, kind == IFN_UNIQUE_OACC_FORK))
		    remove = true;
		  break;

		case IFN_UNIQUE_OACC_HEAD_MARK:
		case IFN_UNIQUE_OACC_TAIL_MARK:
		  remove = true;
		  break;
		}
	      break;
	    }
	  }

	if (gsi_end_p (gsi))
	  /* We rewound past the beginning of the BB.  */
	  gsi = gsi_start_bb (bb);
	else
	  /* Undo the rewind.  */
	  gsi_next (&gsi);

	if (remove)
	  {
	    /* Splice the call out of the virtual SSA web first.  */
	    if (gimple_vdef (call))
	      replace_uses_by (gimple_vdef (call), gimple_vuse (call));
	    if (gimple_call_lhs (call))
	      {
		/* Propagate the data dependency var.  */
		gimple *ass = gimple_build_assign (gimple_call_lhs (call),
						   gimple_call_arg (call, 1));
		gsi_replace (&gsi, ass, false);
	      }
	    else
	      gsi_remove (&gsi, true);
	  }
	else if (!rescan)
	  /* If not rescanning, advance over the call.  */
	  gsi_next (&gsi);
      }

  free_oacc_loop (loops);

  return 0;
}
|
|
|
|
/* Default launch dimension validator. Force everything to 1. A
|
|
backend that wants to provide larger dimensions must override this
|
|
hook. */
|
|
|
|
bool
|
|
default_goacc_validate_dims (tree ARG_UNUSED (decl), int *dims,
|
|
int ARG_UNUSED (fn_level))
|
|
{
|
|
bool changed = false;
|
|
|
|
for (unsigned ix = 0; ix != GOMP_DIM_MAX; ix++)
|
|
{
|
|
if (dims[ix] != 1)
|
|
{
|
|
dims[ix] = 1;
|
|
changed = true;
|
|
}
|
|
}
|
|
|
|
return changed;
|
|
}
|
|
|
|
/* Default dimension bound is unknown on accelerator and 1 on host.  */

int
default_goacc_dim_limit (int ARG_UNUSED (axis))
{
#ifdef ACCEL_COMPILER
  /* Zero means "no static bound known" for the device compiler.  */
  return 0;
#else
  /* Host fallback runs with a single gang/worker/vector element.  */
  return 1;
#endif
}
|
|
|
|
namespace {

/* Pass descriptor for the OpenACC device-lowering pass.  Runs over
   GIMPLE with a CFG and schedules an SSA update plus CFG cleanup
   afterwards, since lowering deletes and replaces statements.  */
const pass_data pass_data_oacc_device_lower =
{
  GIMPLE_PASS, /* type */
  "oaccdevlow", /* name */
  OPTGROUP_OPENMP, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_cfg, /* properties_required */
  0 /* Possibly PROP_gimple_eomp.  */, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
};

class pass_oacc_device_lower : public gimple_opt_pass
{
public:
  pass_oacc_device_lower (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_oacc_device_lower, ctxt)
  {}

  /* opt_pass methods: */
  /* Only run when compiling with -fopenacc.  */
  virtual bool gate (function *) { return flag_openacc; };

  virtual unsigned int execute (function *)
    {
      return execute_oacc_device_lower ();
    }

}; // class pass_oacc_device_lower

} // anon namespace
|
|
|
|
/* Instantiate the OpenACC device-lowering pass for the pass manager.  */

gimple_opt_pass *
make_pass_oacc_device_lower (gcc::context *ctxt)
{
  return new pass_oacc_device_lower (ctxt);
}
|
|
|
|
/* Cleanup uses of SIMT placeholder internal functions: on non-SIMT targets,
   VF is 1 and LANE is 0; on SIMT targets, VF is folded to a constant, and
   LANE is kept to be expanded to RTL later on.  Also cleanup all other SIMT
   internal functions on non-SIMT targets, and likewise some SIMD internal
   functions on SIMT targets.  */

static unsigned int
execute_omp_device_lower ()
{
  /* VF == 1 identifies a non-SIMT target (no targetm.simt.vf hook).  */
  int vf = targetm.simt.vf ? targetm.simt.vf () : 1;
  basic_block bb;
  gimple_stmt_iterator gsi;
  FOR_EACH_BB_FN (bb, cfun)
    for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
      {
	gimple *stmt = gsi_stmt (gsi);
	if (!is_gimple_call (stmt) || !gimple_call_internal_p (stmt))
	  continue;
	/* RHS stays NULL_TREE when the call must be kept as-is for
	   later RTL expansion.  */
	tree lhs = gimple_call_lhs (stmt), rhs = NULL_TREE;
	tree type = lhs ? TREE_TYPE (lhs) : integer_type_node;
	switch (gimple_call_internal_fn (stmt))
	  {
	  case IFN_GOMP_USE_SIMT:
	    rhs = vf == 1 ? integer_zero_node : integer_one_node;
	    break;
	  case IFN_GOMP_SIMT_LANE:
	  case IFN_GOMP_SIMT_LAST_LANE:
	    rhs = vf == 1 ? build_zero_cst (type) : NULL_TREE;
	    break;
	  case IFN_GOMP_SIMT_VF:
	    rhs = build_int_cst (type, vf);
	    break;
	  case IFN_GOMP_SIMT_ORDERED_PRED:
	    rhs = vf == 1 ? integer_zero_node : NULL_TREE;
	    /* Drop the call's virtual def when it folds away.  */
	    if (rhs || !lhs)
	      unlink_stmt_vdef (stmt);
	    break;
	  case IFN_GOMP_SIMT_VOTE_ANY:
	  case IFN_GOMP_SIMT_XCHG_BFLY:
	  case IFN_GOMP_SIMT_XCHG_IDX:
	    /* With one lane these are identity operations.  */
	    rhs = vf == 1 ? gimple_call_arg (stmt, 0) : NULL_TREE;
	    break;
	  case IFN_GOMP_SIMD_LANE:
	  case IFN_GOMP_SIMD_LAST_LANE:
	    /* SIMD placeholders fold only on SIMT targets.  */
	    rhs = vf != 1 ? build_zero_cst (type) : NULL_TREE;
	    break;
	  case IFN_GOMP_SIMD_VF:
	    rhs = vf != 1 ? build_one_cst (type) : NULL_TREE;
	    break;
	  default:
	    continue;
	  }
	if (lhs && !rhs)
	  continue;
	/* Replace the call with a plain assignment (or a nop when
	   there is no LHS to preserve).  */
	stmt = lhs ? gimple_build_assign (lhs, rhs) : gimple_build_nop ();
	gsi_replace (&gsi, stmt, false);
      }
  /* On SIMT targets the loops marked force-vectorize are handled by
     SIMT lowering instead of the vectorizer.  */
  if (vf != 1)
    cfun->has_force_vectorize_loops = false;
  return 0;
}
|
|
|
|
namespace {

/* Pass descriptor for the SIMT/SIMD placeholder cleanup pass.  It
   provides PROP_gimple_lomp_dev, which the gate below uses to ensure
   the pass runs at most once per function.  */
const pass_data pass_data_omp_device_lower =
{
  GIMPLE_PASS, /* type */
  "ompdevlow", /* name */
  OPTGROUP_OPENMP, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_cfg, /* properties_required */
  PROP_gimple_lomp_dev, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa, /* todo_flags_finish */
};

class pass_omp_device_lower : public gimple_opt_pass
{
public:
  pass_omp_device_lower (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_omp_device_lower, ctxt)
  {}

  /* opt_pass methods: */
  /* Run only if the function has not been lowered already.  */
  virtual bool gate (function *fun)
    {
      return !(fun->curr_properties & PROP_gimple_lomp_dev);
    }
  virtual unsigned int execute (function *)
    {
      return execute_omp_device_lower ();
    }

}; // class pass_omp_device_lower

} // anon namespace
|
|
|
|
/* Instantiate the SIMT/SIMD placeholder cleanup pass for the pass
   manager.  */

gimple_opt_pass *
make_pass_omp_device_lower (gcc::context *ctxt)
{
  return new pass_omp_device_lower (ctxt);
}
|
|
|
|
/* "omp declare target link" handling pass. */
|
|
|
|
namespace {
|
|
|
|
const pass_data pass_data_omp_target_link =
|
|
{
|
|
GIMPLE_PASS, /* type */
|
|
"omptargetlink", /* name */
|
|
OPTGROUP_OPENMP, /* optinfo_flags */
|
|
TV_NONE, /* tv_id */
|
|
PROP_ssa, /* properties_required */
|
|
0, /* properties_provided */
|
|
0, /* properties_destroyed */
|
|
0, /* todo_flags_start */
|
|
TODO_update_ssa, /* todo_flags_finish */
|
|
};
|
|
|
|
class pass_omp_target_link : public gimple_opt_pass
|
|
{
|
|
public:
|
|
pass_omp_target_link (gcc::context *ctxt)
|
|
: gimple_opt_pass (pass_data_omp_target_link, ctxt)
|
|
{}
|
|
|
|
/* opt_pass methods: */
|
|
virtual bool gate (function *fun)
|
|
{
|
|
#ifdef ACCEL_COMPILER
|
|
tree attrs = DECL_ATTRIBUTES (fun->decl);
|
|
return lookup_attribute ("omp declare target", attrs)
|
|
|| lookup_attribute ("omp target entrypoint", attrs);
|
|
#else
|
|
(void) fun;
|
|
return false;
|
|
#endif
|
|
}
|
|
|
|
virtual unsigned execute (function *);
|
|
};
|
|
|
|
/* Callback for walk_gimple_stmt used to scan for link var operands. */
|
|
|
|
static tree
|
|
find_link_var_op (tree *tp, int *walk_subtrees, void *)
|
|
{
|
|
tree t = *tp;
|
|
|
|
if (VAR_P (t) && DECL_HAS_VALUE_EXPR_P (t)
|
|
&& lookup_attribute ("omp declare target link", DECL_ATTRIBUTES (t)))
|
|
{
|
|
*walk_subtrees = 0;
|
|
return t;
|
|
}
|
|
|
|
return NULL_TREE;
|
|
}
|
|
|
|
unsigned
|
|
pass_omp_target_link::execute (function *fun)
|
|
{
|
|
basic_block bb;
|
|
FOR_EACH_BB_FN (bb, fun)
|
|
{
|
|
gimple_stmt_iterator gsi;
|
|
for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
|
|
if (walk_gimple_stmt (&gsi, NULL, find_link_var_op, NULL))
|
|
gimple_regimplify_operands (gsi_stmt (gsi), &gsi);
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
} // anon namespace
|
|
|
|
/* Instantiate the "omp declare target link" pass for the pass
   manager.  */

gimple_opt_pass *
make_pass_omp_target_link (gcc::context *ctxt)
{
  return new pass_omp_target_link (ctxt);
}
|