b52643ab90
The version of nvprof in CUDA 9.0 causes a hang when used to profile an OpenACC program. This is because it calls acc_get_device_type from a callback called during device initialization, which then attempts to acquire acc_device_lock while it is already taken, resulting in deadlock. This works around the issue by returning acc_device_none from acc_get_device_type without attempting to acquire the lock when initialization has not completed yet. 2020-07-14 Tom de Vries <tom@codesourcery.com> Cesar Philippidis <cesar@codesourcery.com> Thomas Schwinge <thomas@codesourcery.com> Kwok Cheung Yeung <kcy@codesourcery.com> libgomp/ * oacc-init.c (acc_init_state_lock, acc_init_state, acc_init_thread): New variable. (acc_init_1): Set acc_init_thread to pthread_self (). Set acc_init_state to initializing at the start, and to initialized at the end. (self_initializing_p): New function. (acc_get_device_type): Return acc_device_none if called by thread that is currently executing acc_init_1. * libgomp.texi (acc_get_device_type): Update documentation. (Implementation Status and Implementation-Defined Behavior): Likewise. * testsuite/libgomp.oacc-c-c++-common/acc_prof-init-2.c: New.
955 lines
25 KiB
C
955 lines
25 KiB
C
/* OpenACC Runtime initialization routines
|
|
|
|
Copyright (C) 2013-2020 Free Software Foundation, Inc.
|
|
|
|
Contributed by Mentor Embedded.
|
|
|
|
This file is part of the GNU Offloading and Multi Processing Library
|
|
(libgomp).
|
|
|
|
Libgomp is free software; you can redistribute it and/or modify it
|
|
under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation; either version 3, or (at your option)
|
|
any later version.
|
|
|
|
Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
|
|
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
|
FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
|
more details.
|
|
|
|
Under Section 7 of GPL version 3, you are granted additional
|
|
permissions described in the GCC Runtime Library Exception, version
|
|
3.1, as published by the Free Software Foundation.
|
|
|
|
You should have received a copy of the GNU General Public License and
|
|
a copy of the GCC Runtime Library Exception along with this program;
|
|
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
|
|
<http://www.gnu.org/licenses/>. */
|
|
|
|
#include "libgomp.h"
|
|
#include "oacc-int.h"
|
|
#include "openacc.h"
|
|
#include <assert.h>
|
|
#include <stdlib.h>
|
|
#include <strings.h>
|
|
#include <stdbool.h>
|
|
#include <string.h>
|
|
|
|
/* This lock is used to protect access to cached_base_dev, dispatchers and
|
|
the (abstract) initialisation state of attached offloading devices. */
|
|
|
|
static gomp_mutex_t acc_device_lock;
|
|
|
|
static gomp_mutex_t acc_init_state_lock;
|
|
static enum { uninitialized, initializing, initialized } acc_init_state
|
|
= uninitialized;
|
|
static pthread_t acc_init_thread;
|
|
|
|
/* A cached version of the dispatcher for the global "current" accelerator type,
|
|
e.g. used as the default when creating new host threads. This is the
|
|
device-type equivalent of goacc_device_num (which specifies which device to
|
|
use out of potentially several of the same type). If there are several
|
|
devices of a given type, this points at the first one. */
|
|
|
|
static struct gomp_device_descr *cached_base_dev = NULL;
|
|
|
|
#if defined HAVE_TLS || defined USE_EMUTLS
|
|
__thread struct goacc_thread *goacc_tls_data;
|
|
#else
|
|
pthread_key_t goacc_tls_key;
|
|
#endif
|
|
static pthread_key_t goacc_cleanup_key;
|
|
|
|
static struct goacc_thread *goacc_threads;
|
|
static gomp_mutex_t goacc_thread_lock;
|
|
|
|
/* An array of dispatchers for device types, indexed by the type. This array
|
|
only references "base" devices, and other instances of the same type are
|
|
found by simply indexing from each such device (which are stored linearly,
|
|
grouped by device in target.c:devices). */
|
|
static struct gomp_device_descr *dispatchers[_ACC_device_hwm] = { 0 };
|
|
|
|
attribute_hidden void
|
|
goacc_register (struct gomp_device_descr *disp)
|
|
{
|
|
/* Only register the 0th device here. */
|
|
if (disp->target_id != 0)
|
|
return;
|
|
|
|
gomp_mutex_lock (&acc_device_lock);
|
|
|
|
assert (acc_device_type (disp->type) != acc_device_none
|
|
&& acc_device_type (disp->type) != acc_device_default
|
|
&& acc_device_type (disp->type) != acc_device_not_host);
|
|
assert (!dispatchers[disp->type]);
|
|
dispatchers[disp->type] = disp;
|
|
|
|
gomp_mutex_unlock (&acc_device_lock);
|
|
}
|
|
|
|
static bool
|
|
known_device_type_p (acc_device_t d)
|
|
{
|
|
return d >= 0 && d < _ACC_device_hwm;
|
|
}
|
|
|
|
static void
|
|
unknown_device_type_error (acc_device_t invalid_type)
|
|
{
|
|
gomp_fatal ("unknown device type %u", invalid_type);
|
|
}
|
|
|
|
/* Translate the offload target name NAME into the name OpenACC uses for it:
   "gcn" becomes "radeon" and "nvptx" becomes "nvidia".  Any other name is
   returned unchanged (the very same pointer).  */

static const char *
get_openacc_name (const char *name)
{
  if (!strcmp (name, "gcn"))
    return "radeon";

  if (!strcmp (name, "nvptx"))
    return "nvidia";

  return name;
}
|
|
|
|
static const char *
|
|
name_of_acc_device_t (enum acc_device_t type)
|
|
{
|
|
switch (type)
|
|
{
|
|
case acc_device_none: return "none";
|
|
case acc_device_default: return "default";
|
|
case acc_device_host: return "host";
|
|
case acc_device_not_host: return "not_host";
|
|
case acc_device_nvidia: return "nvidia";
|
|
case acc_device_radeon: return "radeon";
|
|
default: unknown_device_type_error (type);
|
|
}
|
|
__builtin_unreachable ();
|
|
}
|
|
|
|
/* ACC_DEVICE_LOCK must be held before calling this function. If FAIL_IS_ERROR
|
|
is true, this function raises an error if there are no devices of type D,
|
|
otherwise it returns NULL in that case. */
|
|
|
|
static struct gomp_device_descr *
|
|
resolve_device (acc_device_t d, bool fail_is_error)
|
|
{
|
|
acc_device_t d_arg = d;
|
|
|
|
switch (d)
|
|
{
|
|
case acc_device_default:
|
|
{
|
|
if (goacc_device_type)
|
|
{
|
|
/* Lookup the named device. */
|
|
while (known_device_type_p (++d))
|
|
if (dispatchers[d]
|
|
&& !strcasecmp (goacc_device_type,
|
|
get_openacc_name (dispatchers[d]->name))
|
|
&& dispatchers[d]->get_num_devices_func () > 0)
|
|
goto found;
|
|
|
|
if (fail_is_error)
|
|
{
|
|
gomp_mutex_unlock (&acc_device_lock);
|
|
gomp_fatal ("device type %s not supported", goacc_device_type);
|
|
}
|
|
else
|
|
return NULL;
|
|
}
|
|
|
|
/* No default device specified, so start scanning for any non-host
|
|
device that is available. */
|
|
d = acc_device_not_host;
|
|
}
|
|
/* FALLTHROUGH */
|
|
|
|
case acc_device_not_host:
|
|
/* Find the first available device after acc_device_not_host. */
|
|
while (known_device_type_p (++d))
|
|
if (dispatchers[d] && dispatchers[d]->get_num_devices_func () > 0)
|
|
goto found;
|
|
if (d_arg == acc_device_default)
|
|
{
|
|
d = acc_device_host;
|
|
goto found;
|
|
}
|
|
if (fail_is_error)
|
|
{
|
|
gomp_mutex_unlock (&acc_device_lock);
|
|
gomp_fatal ("no device found");
|
|
}
|
|
else
|
|
return NULL;
|
|
break;
|
|
|
|
case acc_device_host:
|
|
break;
|
|
|
|
default:
|
|
if (!known_device_type_p (d))
|
|
{
|
|
if (fail_is_error)
|
|
goto unsupported_device;
|
|
else
|
|
return NULL;
|
|
}
|
|
break;
|
|
}
|
|
found:
|
|
|
|
assert (d != acc_device_none
|
|
&& d != acc_device_default
|
|
&& d != acc_device_not_host);
|
|
|
|
if (dispatchers[d] == NULL && fail_is_error)
|
|
{
|
|
unsupported_device:
|
|
gomp_mutex_unlock (&acc_device_lock);
|
|
gomp_fatal ("device type %s not supported", name_of_acc_device_t (d));
|
|
}
|
|
|
|
return dispatchers[d];
|
|
}
|
|
|
|
/* Emit a suitable error if no device of a particular type is available, or
|
|
the given device number is out-of-range. */
|
|
static void
|
|
acc_dev_num_out_of_range (acc_device_t d, int ord, int ndevs)
|
|
{
|
|
if (ndevs == 0)
|
|
gomp_fatal ("no devices of type %s available", name_of_acc_device_t (d));
|
|
else
|
|
gomp_fatal ("device %u out of range", ord);
|
|
}
|
|
|
|
/* This is called when plugins have been initialized, and serves to call
|
|
(indirectly) the target's device_init hook. Calling multiple times without
|
|
an intervening acc_shutdown_1 call is an error. ACC_DEVICE_LOCK must be
|
|
held before calling this function. */
|
|
|
|
static struct gomp_device_descr *
|
|
acc_init_1 (acc_device_t d, acc_construct_t parent_construct, int implicit)
|
|
{
|
|
gomp_mutex_lock (&acc_init_state_lock);
|
|
acc_init_state = initializing;
|
|
acc_init_thread = pthread_self ();
|
|
gomp_mutex_unlock (&acc_init_state_lock);
|
|
|
|
bool check_not_nested_p;
|
|
if (implicit)
|
|
{
|
|
/* In the implicit case, there should (TODO: must?) already be something
|
|
have been set up for an outer construct. */
|
|
check_not_nested_p = false;
|
|
}
|
|
else
|
|
{
|
|
check_not_nested_p = true;
|
|
/* TODO: should we set 'thr->prof_info' etc. in this case ('acc_init')?
|
|
The problem is, that we don't have 'thr' yet? (So,
|
|
'check_not_nested_p = true' also is pointless actually.) */
|
|
}
|
|
bool profiling_p = GOACC_PROFILING_DISPATCH_P (check_not_nested_p);
|
|
|
|
acc_prof_info prof_info;
|
|
if (profiling_p)
|
|
{
|
|
prof_info.event_type = acc_ev_device_init_start;
|
|
prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
|
|
prof_info.version = _ACC_PROF_INFO_VERSION;
|
|
prof_info.device_type = d;
|
|
prof_info.device_number = goacc_device_num;
|
|
prof_info.thread_id = -1;
|
|
prof_info.async = acc_async_sync;
|
|
prof_info.async_queue = prof_info.async;
|
|
prof_info.src_file = NULL;
|
|
prof_info.func_name = NULL;
|
|
prof_info.line_no = -1;
|
|
prof_info.end_line_no = -1;
|
|
prof_info.func_line_no = -1;
|
|
prof_info.func_end_line_no = -1;
|
|
}
|
|
acc_event_info device_init_event_info;
|
|
if (profiling_p)
|
|
{
|
|
device_init_event_info.other_event.event_type = prof_info.event_type;
|
|
device_init_event_info.other_event.valid_bytes
|
|
= _ACC_OTHER_EVENT_INFO_VALID_BYTES;
|
|
device_init_event_info.other_event.parent_construct = parent_construct;
|
|
device_init_event_info.other_event.implicit = implicit;
|
|
device_init_event_info.other_event.tool_info = NULL;
|
|
}
|
|
acc_api_info api_info;
|
|
if (profiling_p)
|
|
{
|
|
api_info.device_api = acc_device_api_none;
|
|
api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
|
|
api_info.device_type = prof_info.device_type;
|
|
api_info.vendor = -1;
|
|
api_info.device_handle = NULL;
|
|
api_info.context_handle = NULL;
|
|
api_info.async_handle = NULL;
|
|
}
|
|
|
|
if (profiling_p)
|
|
goacc_profiling_dispatch (&prof_info, &device_init_event_info, &api_info);
|
|
|
|
struct gomp_device_descr *base_dev, *acc_dev;
|
|
int ndevs;
|
|
|
|
base_dev = resolve_device (d, true);
|
|
|
|
ndevs = base_dev->get_num_devices_func ();
|
|
|
|
if (ndevs <= 0 || goacc_device_num >= ndevs)
|
|
acc_dev_num_out_of_range (d, goacc_device_num, ndevs);
|
|
|
|
acc_dev = &base_dev[goacc_device_num];
|
|
|
|
gomp_mutex_lock (&acc_dev->lock);
|
|
if (acc_dev->state == GOMP_DEVICE_INITIALIZED)
|
|
{
|
|
gomp_mutex_unlock (&acc_dev->lock);
|
|
gomp_fatal ("device already active");
|
|
}
|
|
|
|
gomp_init_device (acc_dev);
|
|
gomp_mutex_unlock (&acc_dev->lock);
|
|
|
|
if (profiling_p)
|
|
{
|
|
prof_info.event_type = acc_ev_device_init_end;
|
|
device_init_event_info.other_event.event_type = prof_info.event_type;
|
|
goacc_profiling_dispatch (&prof_info, &device_init_event_info,
|
|
&api_info);
|
|
}
|
|
|
|
/* We're setting 'initialized' *after* 'goacc_profiling_dispatch', so that a
|
|
nested 'acc_get_device_type' called from a profiling callback still sees
|
|
'initializing', so that we don't deadlock when it then again tries to lock
|
|
'goacc_prof_lock'. See also the discussion in 'acc_get_device_type'. */
|
|
gomp_mutex_lock (&acc_init_state_lock);
|
|
acc_init_state = initialized;
|
|
gomp_mutex_unlock (&acc_init_state_lock);
|
|
|
|
return base_dev;
|
|
}
|
|
|
|
/* ACC_DEVICE_LOCK must be held before calling this function. */
|
|
|
|
static void
|
|
acc_shutdown_1 (acc_device_t d)
|
|
{
|
|
struct gomp_device_descr *base_dev;
|
|
struct goacc_thread *walk;
|
|
int ndevs, i;
|
|
bool devices_active = false;
|
|
|
|
/* Get the base device for this device type. */
|
|
base_dev = resolve_device (d, true);
|
|
|
|
ndevs = base_dev->get_num_devices_func ();
|
|
|
|
/* Unload all the devices of this type that have been opened. */
|
|
for (i = 0; i < ndevs; i++)
|
|
{
|
|
struct gomp_device_descr *acc_dev = &base_dev[i];
|
|
|
|
gomp_mutex_lock (&acc_dev->lock);
|
|
gomp_unload_device (acc_dev);
|
|
gomp_mutex_unlock (&acc_dev->lock);
|
|
}
|
|
|
|
gomp_mutex_lock (&goacc_thread_lock);
|
|
|
|
/* Free target-specific TLS data and close all devices. */
|
|
for (walk = goacc_threads; walk != NULL; walk = walk->next)
|
|
{
|
|
if (walk->target_tls)
|
|
base_dev->openacc.destroy_thread_data_func (walk->target_tls);
|
|
|
|
walk->target_tls = NULL;
|
|
|
|
/* This would mean the user is shutting down OpenACC in the middle of an
|
|
"acc data" pragma. Likely not intentional. */
|
|
if (walk->mapped_data)
|
|
{
|
|
gomp_mutex_unlock (&goacc_thread_lock);
|
|
gomp_fatal ("shutdown in 'acc data' region");
|
|
}
|
|
|
|
/* Similarly, if this happens then user code has done something weird. */
|
|
if (walk->saved_bound_dev)
|
|
{
|
|
gomp_mutex_unlock (&goacc_thread_lock);
|
|
gomp_fatal ("shutdown during host fallback");
|
|
}
|
|
|
|
if (walk->dev)
|
|
{
|
|
gomp_mutex_lock (&walk->dev->lock);
|
|
|
|
while (walk->dev->mem_map.root)
|
|
{
|
|
splay_tree_key k = &walk->dev->mem_map.root->key;
|
|
if (k->aux)
|
|
k->aux->link_key = NULL;
|
|
gomp_remove_var (walk->dev, k);
|
|
}
|
|
|
|
gomp_mutex_unlock (&walk->dev->lock);
|
|
|
|
walk->dev = NULL;
|
|
walk->base_dev = NULL;
|
|
}
|
|
}
|
|
|
|
gomp_mutex_unlock (&goacc_thread_lock);
|
|
|
|
/* Close all the devices of this type that have been opened. */
|
|
bool ret = true;
|
|
for (i = 0; i < ndevs; i++)
|
|
{
|
|
struct gomp_device_descr *acc_dev = &base_dev[i];
|
|
gomp_mutex_lock (&acc_dev->lock);
|
|
if (acc_dev->state == GOMP_DEVICE_INITIALIZED)
|
|
{
|
|
devices_active = true;
|
|
ret &= gomp_fini_device (acc_dev);
|
|
acc_dev->state = GOMP_DEVICE_UNINITIALIZED;
|
|
}
|
|
gomp_mutex_unlock (&acc_dev->lock);
|
|
}
|
|
|
|
if (!ret)
|
|
gomp_fatal ("device finalization failed");
|
|
|
|
if (!devices_active)
|
|
gomp_fatal ("no device initialized");
|
|
}
|
|
|
|
static struct goacc_thread *
|
|
goacc_new_thread (void)
|
|
{
|
|
struct goacc_thread *thr = gomp_malloc (sizeof (struct goacc_thread));
|
|
|
|
#if defined HAVE_TLS || defined USE_EMUTLS
|
|
goacc_tls_data = thr;
|
|
#else
|
|
pthread_setspecific (goacc_tls_key, thr);
|
|
#endif
|
|
|
|
pthread_setspecific (goacc_cleanup_key, thr);
|
|
|
|
gomp_mutex_lock (&goacc_thread_lock);
|
|
thr->next = goacc_threads;
|
|
goacc_threads = thr;
|
|
gomp_mutex_unlock (&goacc_thread_lock);
|
|
|
|
return thr;
|
|
}
|
|
|
|
static void
|
|
goacc_destroy_thread (void *data)
|
|
{
|
|
struct goacc_thread *thr = data, *walk, *prev;
|
|
|
|
gomp_mutex_lock (&goacc_thread_lock);
|
|
|
|
if (thr)
|
|
{
|
|
struct gomp_device_descr *acc_dev = thr->dev;
|
|
|
|
if (acc_dev && thr->target_tls)
|
|
{
|
|
acc_dev->openacc.destroy_thread_data_func (thr->target_tls);
|
|
thr->target_tls = NULL;
|
|
}
|
|
|
|
assert (!thr->mapped_data);
|
|
|
|
/* Remove from thread list. */
|
|
for (prev = NULL, walk = goacc_threads; walk;
|
|
prev = walk, walk = walk->next)
|
|
if (walk == thr)
|
|
{
|
|
if (prev == NULL)
|
|
goacc_threads = walk->next;
|
|
else
|
|
prev->next = walk->next;
|
|
|
|
free (thr);
|
|
|
|
break;
|
|
}
|
|
|
|
assert (walk);
|
|
}
|
|
|
|
gomp_mutex_unlock (&goacc_thread_lock);
|
|
}
|
|
|
|
/* Use the ORD'th device instance for the current host thread (or -1 for the
|
|
current global default). The device (and the runtime) must be initialised
|
|
before calling this function. */
|
|
|
|
void
|
|
goacc_attach_host_thread_to_device (int ord)
|
|
{
|
|
struct goacc_thread *thr = goacc_thread ();
|
|
struct gomp_device_descr *acc_dev = NULL, *base_dev = NULL;
|
|
int num_devices;
|
|
|
|
if (thr && thr->dev && (thr->dev->target_id == ord || ord < 0))
|
|
return;
|
|
|
|
if (ord < 0)
|
|
ord = goacc_device_num;
|
|
|
|
/* Decide which type of device to use. If the current thread has a device
|
|
type already (e.g. set by acc_set_device_type), use that, else use the
|
|
global default. */
|
|
if (thr && thr->base_dev)
|
|
base_dev = thr->base_dev;
|
|
else
|
|
{
|
|
assert (cached_base_dev);
|
|
base_dev = cached_base_dev;
|
|
}
|
|
|
|
num_devices = base_dev->get_num_devices_func ();
|
|
if (num_devices <= 0 || ord >= num_devices)
|
|
acc_dev_num_out_of_range (acc_device_type (base_dev->type), ord,
|
|
num_devices);
|
|
|
|
if (!thr)
|
|
thr = goacc_new_thread ();
|
|
|
|
thr->base_dev = base_dev;
|
|
thr->dev = acc_dev = &base_dev[ord];
|
|
thr->saved_bound_dev = NULL;
|
|
thr->mapped_data = NULL;
|
|
thr->prof_info = NULL;
|
|
thr->api_info = NULL;
|
|
/* Initially, all callbacks for all events are enabled. */
|
|
thr->prof_callbacks_enabled = true;
|
|
|
|
thr->target_tls
|
|
= acc_dev->openacc.create_thread_data_func (ord);
|
|
}
|
|
|
|
/* OpenACC 2.0a (3.2.12, 3.2.13) doesn't specify whether the serialization of
|
|
init/shutdown is per-process or per-thread. We choose per-process. */
|
|
|
|
void
|
|
acc_init (acc_device_t d)
|
|
{
|
|
if (!known_device_type_p (d))
|
|
unknown_device_type_error (d);
|
|
|
|
gomp_init_targets_once ();
|
|
|
|
gomp_mutex_lock (&acc_device_lock);
|
|
cached_base_dev = acc_init_1 (d, acc_construct_runtime_api, 0);
|
|
gomp_mutex_unlock (&acc_device_lock);
|
|
|
|
goacc_attach_host_thread_to_device (-1);
|
|
}
|
|
|
|
ialias (acc_init)
|
|
|
|
void
|
|
acc_shutdown (acc_device_t d)
|
|
{
|
|
if (!known_device_type_p (d))
|
|
unknown_device_type_error (d);
|
|
|
|
gomp_init_targets_once ();
|
|
|
|
gomp_mutex_lock (&acc_device_lock);
|
|
|
|
acc_shutdown_1 (d);
|
|
|
|
gomp_mutex_unlock (&acc_device_lock);
|
|
}
|
|
|
|
ialias (acc_shutdown)
|
|
|
|
int
|
|
acc_get_num_devices (acc_device_t d)
|
|
{
|
|
if (!known_device_type_p (d))
|
|
unknown_device_type_error (d);
|
|
|
|
int n = 0;
|
|
struct gomp_device_descr *acc_dev;
|
|
|
|
if (d == acc_device_none)
|
|
return 0;
|
|
|
|
gomp_init_targets_once ();
|
|
|
|
gomp_mutex_lock (&acc_device_lock);
|
|
acc_dev = resolve_device (d, false);
|
|
gomp_mutex_unlock (&acc_device_lock);
|
|
|
|
if (!acc_dev)
|
|
return 0;
|
|
|
|
n = acc_dev->get_num_devices_func ();
|
|
if (n < 0)
|
|
n = 0;
|
|
|
|
return n;
|
|
}
|
|
|
|
ialias (acc_get_num_devices)
|
|
|
|
/* Set the device type for the current thread only (using the current global
|
|
default device number), initialising that device if necessary. Also set the
|
|
default device type for new threads to D. */
|
|
|
|
void
|
|
acc_set_device_type (acc_device_t d)
|
|
{
|
|
if (!known_device_type_p (d))
|
|
unknown_device_type_error (d);
|
|
|
|
struct gomp_device_descr *base_dev, *acc_dev;
|
|
struct goacc_thread *thr = goacc_thread ();
|
|
|
|
acc_prof_info prof_info;
|
|
acc_api_info api_info;
|
|
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
|
|
if (profiling_p)
|
|
prof_info.device_type = d;
|
|
|
|
gomp_init_targets_once ();
|
|
|
|
gomp_mutex_lock (&acc_device_lock);
|
|
|
|
cached_base_dev = base_dev = resolve_device (d, true);
|
|
acc_dev = &base_dev[goacc_device_num];
|
|
|
|
gomp_mutex_lock (&acc_dev->lock);
|
|
if (acc_dev->state == GOMP_DEVICE_UNINITIALIZED)
|
|
gomp_init_device (acc_dev);
|
|
gomp_mutex_unlock (&acc_dev->lock);
|
|
|
|
gomp_mutex_unlock (&acc_device_lock);
|
|
|
|
/* We're changing device type: invalidate the current thread's dev and
|
|
base_dev pointers. */
|
|
if (thr && thr->base_dev != base_dev)
|
|
{
|
|
thr->base_dev = thr->dev = NULL;
|
|
if (thr->mapped_data)
|
|
gomp_fatal ("acc_set_device_type in 'acc data' region");
|
|
}
|
|
|
|
goacc_attach_host_thread_to_device (-1);
|
|
|
|
if (profiling_p)
|
|
{
|
|
thr->prof_info = NULL;
|
|
thr->api_info = NULL;
|
|
}
|
|
}
|
|
|
|
ialias (acc_set_device_type)
|
|
|
|
static bool
|
|
self_initializing_p (void)
|
|
{
|
|
bool res;
|
|
gomp_mutex_lock (&acc_init_state_lock);
|
|
res = (acc_init_state == initializing
|
|
&& pthread_equal (acc_init_thread, pthread_self ()));
|
|
gomp_mutex_unlock (&acc_init_state_lock);
|
|
return res;
|
|
}
|
|
|
|
acc_device_t
|
|
acc_get_device_type (void)
|
|
{
|
|
acc_device_t res = acc_device_none;
|
|
struct gomp_device_descr *dev;
|
|
struct goacc_thread *thr = goacc_thread ();
|
|
|
|
if (thr && thr->base_dev)
|
|
res = acc_device_type (thr->base_dev->type);
|
|
else if (self_initializing_p ())
|
|
/* The Cuda libaccinj64.so version 9.0+ calls acc_get_device_type during the
|
|
acc_ev_device_init_start event callback, which is dispatched during
|
|
acc_init_1. Trying to lock acc_device_lock during such a call (as we do
|
|
in the else clause below), will result in deadlock, since the lock has
|
|
already been taken by the acc_init_1 caller. We work around this problem
|
|
by using the acc_get_device_type property "If the device type has not yet
|
|
been selected, the value acc_device_none may be returned". */
|
|
;
|
|
else
|
|
{
|
|
acc_prof_info prof_info;
|
|
acc_api_info api_info;
|
|
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
|
|
|
|
gomp_init_targets_once ();
|
|
|
|
gomp_mutex_lock (&acc_device_lock);
|
|
dev = resolve_device (acc_device_default, true);
|
|
gomp_mutex_unlock (&acc_device_lock);
|
|
res = acc_device_type (dev->type);
|
|
|
|
if (profiling_p)
|
|
{
|
|
thr->prof_info = NULL;
|
|
thr->api_info = NULL;
|
|
}
|
|
}
|
|
|
|
assert (res != acc_device_default
|
|
&& res != acc_device_not_host
|
|
&& res != acc_device_current);
|
|
|
|
return res;
|
|
}
|
|
|
|
ialias (acc_get_device_type)
|
|
|
|
int
|
|
acc_get_device_num (acc_device_t d)
|
|
{
|
|
if (!known_device_type_p (d))
|
|
unknown_device_type_error (d);
|
|
|
|
const struct gomp_device_descr *dev;
|
|
struct goacc_thread *thr = goacc_thread ();
|
|
|
|
acc_prof_info prof_info;
|
|
acc_api_info api_info;
|
|
bool profiling_p = GOACC_PROFILING_SETUP_P (thr, &prof_info, &api_info);
|
|
if (profiling_p)
|
|
prof_info.device_type = d;
|
|
|
|
gomp_init_targets_once ();
|
|
|
|
gomp_mutex_lock (&acc_device_lock);
|
|
dev = resolve_device (d, true);
|
|
gomp_mutex_unlock (&acc_device_lock);
|
|
|
|
if (profiling_p)
|
|
{
|
|
thr->prof_info = NULL;
|
|
thr->api_info = NULL;
|
|
}
|
|
|
|
if (thr && thr->base_dev == dev && thr->dev)
|
|
return thr->dev->target_id;
|
|
|
|
return goacc_device_num;
|
|
}
|
|
|
|
ialias (acc_get_device_num)
|
|
|
|
void
|
|
acc_set_device_num (int ord, acc_device_t d)
|
|
{
|
|
if (!known_device_type_p (d))
|
|
unknown_device_type_error (d);
|
|
|
|
struct gomp_device_descr *base_dev, *acc_dev;
|
|
int num_devices;
|
|
|
|
gomp_init_targets_once ();
|
|
|
|
if (ord < 0)
|
|
ord = goacc_device_num;
|
|
|
|
if ((int) d == 0)
|
|
/* Set whatever device is being used by the current host thread to use
|
|
device instance ORD. It's unclear if this is supposed to affect other
|
|
host threads too (OpenACC 2.0 (3.2.4) acc_set_device_num). */
|
|
goacc_attach_host_thread_to_device (ord);
|
|
else
|
|
{
|
|
gomp_mutex_lock (&acc_device_lock);
|
|
|
|
cached_base_dev = base_dev = resolve_device (d, true);
|
|
|
|
num_devices = base_dev->get_num_devices_func ();
|
|
|
|
if (num_devices <= 0 || ord >= num_devices)
|
|
acc_dev_num_out_of_range (d, ord, num_devices);
|
|
|
|
acc_dev = &base_dev[ord];
|
|
|
|
gomp_mutex_lock (&acc_dev->lock);
|
|
if (acc_dev->state == GOMP_DEVICE_UNINITIALIZED)
|
|
gomp_init_device (acc_dev);
|
|
gomp_mutex_unlock (&acc_dev->lock);
|
|
|
|
gomp_mutex_unlock (&acc_device_lock);
|
|
|
|
goacc_attach_host_thread_to_device (ord);
|
|
}
|
|
|
|
goacc_device_num = ord;
|
|
}
|
|
|
|
ialias (acc_set_device_num)
|
|
|
|
static union goacc_property_value
|
|
get_property_any (int ord, acc_device_t d, acc_device_property_t prop)
|
|
{
|
|
goacc_lazy_initialize ();
|
|
struct goacc_thread *thr = goacc_thread ();
|
|
|
|
if (d == acc_device_current && thr && thr->dev)
|
|
return thr->dev->openacc.get_property_func (thr->dev->target_id, prop);
|
|
|
|
gomp_mutex_lock (&acc_device_lock);
|
|
|
|
struct gomp_device_descr *dev = resolve_device (d, true);
|
|
|
|
int num_devices = dev->get_num_devices_func ();
|
|
|
|
if (num_devices <= 0 || ord >= num_devices)
|
|
acc_dev_num_out_of_range (d, ord, num_devices);
|
|
|
|
dev += ord;
|
|
|
|
gomp_mutex_lock (&dev->lock);
|
|
if (dev->state == GOMP_DEVICE_UNINITIALIZED)
|
|
gomp_init_device (dev);
|
|
gomp_mutex_unlock (&dev->lock);
|
|
|
|
gomp_mutex_unlock (&acc_device_lock);
|
|
|
|
assert (dev);
|
|
|
|
return dev->openacc.get_property_func (dev->target_id, prop);
|
|
}
|
|
|
|
size_t
|
|
acc_get_property (int ord, acc_device_t d, acc_device_property_t prop)
|
|
{
|
|
if (!known_device_type_p (d))
|
|
unknown_device_type_error(d);
|
|
|
|
if (prop & GOACC_PROPERTY_STRING_MASK)
|
|
return 0;
|
|
else
|
|
return get_property_any (ord, d, prop).val;
|
|
}
|
|
|
|
ialias (acc_get_property)
|
|
|
|
const char *
|
|
acc_get_property_string (int ord, acc_device_t d, acc_device_property_t prop)
|
|
{
|
|
if (!known_device_type_p (d))
|
|
unknown_device_type_error(d);
|
|
|
|
if (prop & GOACC_PROPERTY_STRING_MASK)
|
|
return get_property_any (ord, d, prop).ptr;
|
|
else
|
|
return NULL;
|
|
}
|
|
|
|
ialias (acc_get_property_string)
|
|
|
|
/* For -O and higher, the compiler always attempts to expand acc_on_device, but
|
|
if the user disables the builtin, or calls it via a pointer, we'll need this
|
|
version.
|
|
|
|
Compile this with optimization, so that the compiler expands
|
|
this, rather than generating infinitely recursive code.
|
|
|
|
The function just forwards its argument to __builtin_acc_on_device. It does
|
|
not verify that the argument is a valid acc_device_t enumeration value. */
|
|
|
|
int __attribute__ ((__optimize__ ("O2")))
|
|
acc_on_device (acc_device_t dev)
|
|
{
|
|
return __builtin_acc_on_device (dev);
|
|
}
|
|
|
|
ialias (acc_on_device)
|
|
|
|
attribute_hidden void
|
|
goacc_runtime_initialize (void)
|
|
{
|
|
gomp_mutex_init (&acc_device_lock);
|
|
|
|
#if !(defined HAVE_TLS || defined USE_EMUTLS)
|
|
pthread_key_create (&goacc_tls_key, NULL);
|
|
#endif
|
|
|
|
pthread_key_create (&goacc_cleanup_key, goacc_destroy_thread);
|
|
|
|
cached_base_dev = NULL;
|
|
|
|
goacc_threads = NULL;
|
|
gomp_mutex_init (&goacc_thread_lock);
|
|
|
|
/* Initialize and register the 'host' device type. */
|
|
goacc_host_init ();
|
|
}
|
|
|
|
static void __attribute__((destructor))
|
|
goacc_runtime_deinitialize (void)
|
|
{
|
|
#if !(defined HAVE_TLS || defined USE_EMUTLS)
|
|
pthread_key_delete (goacc_tls_key);
|
|
#endif
|
|
pthread_key_delete (goacc_cleanup_key);
|
|
}
|
|
|
|
/* Compiler helper functions */
|
|
|
|
attribute_hidden void
|
|
goacc_save_and_set_bind (acc_device_t d)
|
|
{
|
|
struct goacc_thread *thr = goacc_thread ();
|
|
|
|
assert (!thr->saved_bound_dev);
|
|
|
|
thr->saved_bound_dev = thr->dev;
|
|
thr->dev = dispatchers[d];
|
|
}
|
|
|
|
attribute_hidden void
|
|
goacc_restore_bind (void)
|
|
{
|
|
struct goacc_thread *thr = goacc_thread ();
|
|
|
|
thr->dev = thr->saved_bound_dev;
|
|
thr->saved_bound_dev = NULL;
|
|
}
|
|
|
|
/* This is called from any OpenACC support function that may need to implicitly
|
|
initialize the libgomp runtime, either globally or from a new host thread.
|
|
On exit "goacc_thread" will return a valid & populated thread block. */
|
|
|
|
attribute_hidden void
|
|
goacc_lazy_initialize (void)
|
|
{
|
|
struct goacc_thread *thr = goacc_thread ();
|
|
|
|
if (thr && thr->dev)
|
|
return;
|
|
|
|
gomp_init_targets_once ();
|
|
|
|
gomp_mutex_lock (&acc_device_lock);
|
|
if (!cached_base_dev)
|
|
cached_base_dev = acc_init_1 (acc_device_default,
|
|
acc_construct_parallel, 1);
|
|
gomp_mutex_unlock (&acc_device_lock);
|
|
|
|
goacc_attach_host_thread_to_device (-1);
|
|
}
|