OpenMP offloading to NVPTX: libgomp changes

* Makefile.am (libgomp_la_SOURCES): Add atomic.c, icv.c, icv-device.c.
	* Makefile.in: Regenerate.
	* configure.ac [nvptx*-*-*] (libgomp_use_pthreads): Set and use it...
	(LIBGOMP_USE_PTHREADS): ...here; new define.
	* configure: Regenerate.
	* config.h.in: Likewise.
	* config/posix/affinity.c: Move to...
	* affinity.c: ...here (new file).  Guard use of Pthreads-specific
	interface by LIBGOMP_USE_PTHREADS. 
	* critical.c: Split out GOMP_atomic_{start,end} into...
	* atomic.c: ...here (new file).
	* env.c: Split out ICV definitions into...
	* icv.c: ...here (new file) and...
	* icv-device.c: ...here. New file.
	* config/linux/lock.c (gomp_init_lock_30): Move to generic lock.c.
	(gomp_destroy_lock_30): Ditto.
	(gomp_set_lock_30): Ditto.
	(gomp_unset_lock_30): Ditto.
	(gomp_test_lock_30): Ditto.
	(gomp_init_nest_lock_30): Ditto.
	(gomp_destroy_nest_lock_30): Ditto.
	(gomp_set_nest_lock_30): Ditto.
	(gomp_unset_nest_lock_30): Ditto.
	(gomp_test_nest_lock_30): Ditto.
	* lock.c: New.
	* config/nvptx/lock.c: New.
	* config/nvptx/bar.c: New.
	* config/nvptx/bar.h: New.
	* config/nvptx/doacross.h: New.
	* config/nvptx/error.c: New.
	* config/nvptx/icv-device.c: New.
	* config/nvptx/mutex.h: New.
	* config/nvptx/pool.h: New.
	* config/nvptx/proc.c: New.
	* config/nvptx/ptrlock.h: New.
	* config/nvptx/sem.h: New.
	* config/nvptx/simple-bar.h: New.
	* config/nvptx/target.c: New.
	* config/nvptx/task.c: New.
	* config/nvptx/team.c: New.
	* config/nvptx/time.c: New.
	* config/posix/simple-bar.h: New.
	* libgomp.h: Guard pthread.h inclusion.  Include simple-bar.h.
	(gomp_num_teams_var): Declare.
	(struct gomp_thread_pool): Change threads_dock member to
	gomp_simple_barrier_t.
	[__nvptx__] (gomp_thread): New implementation.
	(gomp_thread_attr): Guard by LIBGOMP_USE_PTHREADS.
	(gomp_thread_destructor): Ditto.
	(gomp_init_thread_affinity): Ditto.
	* team.c: Guard uses of Pthreads-specific interfaces by
	LIBGOMP_USE_PTHREADS.  Adjust all uses of threads_dock.
	(gomp_free_thread) [__nvptx__]: Do not call 'free'.

	* config/nvptx/alloc.c: Delete.
	* config/nvptx/barrier.c: Ditto.
	* config/nvptx/fortran.c: Ditto.
	* config/nvptx/iter.c: Ditto.
	* config/nvptx/iter_ull.c: Ditto.
	* config/nvptx/loop.c: Ditto.
	* config/nvptx/loop_ull.c: Ditto.
	* config/nvptx/ordered.c: Ditto.
	* config/nvptx/parallel.c: Ditto.
	* config/nvptx/priority_queue.c: Ditto.
	* config/nvptx/sections.c: Ditto.
	* config/nvptx/single.c: Ditto.
	* config/nvptx/splay-tree.c: Ditto.
	* config/nvptx/work.c: Ditto.

	* testsuite/libgomp.fortran/fortran.exp (lang_link_flags): Pass
	-foffload=-lgfortran in addition to -lgfortran.
	* testsuite/libgomp.oacc-fortran/fortran.exp (lang_link_flags): Ditto.

	* plugin/plugin-nvptx.c: Include <limits.h>.
	(struct targ_fn_descriptor): Add new fields.
	(struct ptx_device): Ditto.  Set them...
	(nvptx_open_device): ...here.
	(nvptx_adjust_launch_bounds): New.
	(nvptx_host2dev): Allow NULL 'nvthd'.
	(nvptx_dev2host): Ditto.
	(GOMP_OFFLOAD_get_caps): Add GOMP_OFFLOAD_CAP_OPENMP_400.
	(link_ptx): Adjust log sizes.
	(nvptx_host2dev): Allow NULL 'nvthd'.
	(nvptx_dev2host): Ditto.
	(nvptx_set_clocktick): New.  Use it...
	(GOMP_OFFLOAD_load_image): ...here.  Set new targ_fn_descriptor
	fields.
	(GOMP_OFFLOAD_dev2dev): New.
	(nvptx_adjust_launch_bounds): New.
	(nvptx_stacks_size): New.
	(nvptx_stacks_alloc): New.
	(nvptx_stacks_free): New.
	(GOMP_OFFLOAD_run): New.
	(GOMP_OFFLOAD_async_run): New (stub).

Co-Authored-By: Dmitry Melnik <dm@ispras.ru>
Co-Authored-By: Jakub Jelinek <jakub@redhat.com>

From-SVN: r242789
This commit is contained in:
Alexander Monakov 2016-11-23 21:36:41 +03:00 committed by Alexander Monakov
parent 6251fe936f
commit 6103184e81
49 changed files with 2214 additions and 447 deletions

View File

@ -1,3 +1,99 @@
2016-11-23 Alexander Monakov <amonakov@ispras.ru>
Jakub Jelinek <jakub@redhat.com>
Dmitry Melnik <dm@ispras.ru>
* Makefile.am (libgomp_la_SOURCES): Add atomic.c, icv.c, icv-device.c.
* Makefile.in: Regenerate.
* configure.ac [nvptx*-*-*] (libgomp_use_pthreads): Set and use it...
(LIBGOMP_USE_PTHREADS): ...here; new define.
* configure: Regenerate.
* config.h.in: Likewise.
* config/posix/affinity.c: Move to...
* affinity.c: ...here (new file). Guard use of Pthreads-specific
interface by LIBGOMP_USE_PTHREADS.
* critical.c: Split out GOMP_atomic_{start,end} into...
* atomic.c: ...here (new file).
* env.c: Split out ICV definitions into...
* icv.c: ...here (new file) and...
* icv-device.c: ...here. New file.
* config/linux/lock.c (gomp_init_lock_30): Move to generic lock.c.
(gomp_destroy_lock_30): Ditto.
(gomp_set_lock_30): Ditto.
(gomp_unset_lock_30): Ditto.
(gomp_test_lock_30): Ditto.
(gomp_init_nest_lock_30): Ditto.
(gomp_destroy_nest_lock_30): Ditto.
(gomp_set_nest_lock_30): Ditto.
(gomp_unset_nest_lock_30): Ditto.
(gomp_test_nest_lock_30): Ditto.
* lock.c: New.
* config/nvptx/lock.c: New.
* config/nvptx/bar.c: New.
* config/nvptx/bar.h: New.
* config/nvptx/doacross.h: New.
* config/nvptx/error.c: New.
* config/nvptx/icv-device.c: New.
* config/nvptx/mutex.h: New.
* config/nvptx/pool.h: New.
* config/nvptx/proc.c: New.
* config/nvptx/ptrlock.h: New.
* config/nvptx/sem.h: New.
* config/nvptx/simple-bar.h: New.
* config/nvptx/target.c: New.
* config/nvptx/task.c: New.
* config/nvptx/team.c: New.
* config/nvptx/time.c: New.
* config/posix/simple-bar.h: New.
* libgomp.h: Guard pthread.h inclusion. Include simple-bar.h.
(gomp_num_teams_var): Declare.
(struct gomp_thread_pool): Change threads_dock member to
gomp_simple_barrier_t.
[__nvptx__] (gomp_thread): New implementation.
(gomp_thread_attr): Guard by LIBGOMP_USE_PTHREADS.
(gomp_thread_destructor): Ditto.
(gomp_init_thread_affinity): Ditto.
* team.c: Guard uses of Pthreads-specific interfaces by
LIBGOMP_USE_PTHREADS. Adjust all uses of threads_dock.
(gomp_free_thread) [__nvptx__]: Do not call 'free'.
* config/nvptx/alloc.c: Delete.
* config/nvptx/barrier.c: Ditto.
* config/nvptx/fortran.c: Ditto.
* config/nvptx/iter.c: Ditto.
* config/nvptx/iter_ull.c: Ditto.
* config/nvptx/loop.c: Ditto.
* config/nvptx/loop_ull.c: Ditto.
* config/nvptx/ordered.c: Ditto.
* config/nvptx/parallel.c: Ditto.
* config/nvptx/priority_queue.c: Ditto.
* config/nvptx/sections.c: Ditto.
* config/nvptx/single.c: Ditto.
* config/nvptx/splay-tree.c: Ditto.
* config/nvptx/work.c: Ditto.
* testsuite/libgomp.fortran/fortran.exp (lang_link_flags): Pass
-foffload=-lgfortran in addition to -lgfortran.
* testsuite/libgomp.oacc-fortran/fortran.exp (lang_link_flags): Ditto.
* plugin/plugin-nvptx.c: Include <limits.h>.
(struct targ_fn_descriptor): Add new fields.
(struct ptx_device): Ditto. Set them...
(nvptx_open_device): ...here.
(nvptx_adjust_launch_bounds): New.
(nvptx_host2dev): Allow NULL 'nvthd'.
(nvptx_dev2host): Ditto.
(GOMP_OFFLOAD_get_caps): Add GOMP_OFFLOAD_CAP_OPENMP_400.
(link_ptx): Adjust log sizes.
(nvptx_host2dev): Allow NULL 'nvthd'.
(nvptx_dev2host): Ditto.
(nvptx_set_clocktick): New. Use it...
(GOMP_OFFLOAD_load_image): ...here. Set new targ_fn_descriptor
fields.
(GOMP_OFFLOAD_dev2dev): New.
(nvptx_adjust_launch_bounds): New.
(nvptx_stacks_size): New.
(nvptx_stacks_alloc): New.
(nvptx_stacks_free): New.
(GOMP_OFFLOAD_run): New.
(GOMP_OFFLOAD_async_run): New (stub).
2016-11-23 Martin Jambor <mjambor@suse.cz>
* testsuite/libgomp.hsa.c/bits-insns.c: New test.

View File

@ -58,12 +58,12 @@ libgomp_la_LDFLAGS = $(libgomp_version_info) $(libgomp_version_script) \
libgomp_la_DEPENDENCIES = $(libgomp_version_dep)
libgomp_la_LINK = $(LINK) $(libgomp_la_LDFLAGS)
libgomp_la_SOURCES = alloc.c barrier.c critical.c env.c error.c iter.c \
iter_ull.c loop.c loop_ull.c ordered.c parallel.c sections.c single.c \
task.c team.c work.c lock.c mutex.c proc.c sem.c bar.c ptrlock.c \
time.c fortran.c affinity.c target.c splay-tree.c libgomp-plugin.c \
oacc-parallel.c oacc-host.c oacc-init.c oacc-mem.c oacc-async.c \
oacc-plugin.c oacc-cuda.c priority_queue.c
libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c error.c \
icv.c icv-device.c iter.c iter_ull.c loop.c loop_ull.c ordered.c \
parallel.c sections.c single.c task.c team.c work.c lock.c mutex.c \
proc.c sem.c bar.c ptrlock.c time.c fortran.c affinity.c target.c \
splay-tree.c libgomp-plugin.c oacc-parallel.c oacc-host.c oacc-init.c \
oacc-mem.c oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c
include $(top_srcdir)/plugin/Makefrag.am

View File

@ -150,14 +150,14 @@ libgomp_plugin_nvptx_la_LINK = $(LIBTOOL) --tag=CC \
@PLUGIN_NVPTX_TRUE@ $(toolexeclibdir)
libgomp_la_LIBADD =
@USE_FORTRAN_TRUE@am__objects_1 = openacc.lo
am_libgomp_la_OBJECTS = alloc.lo barrier.lo critical.lo env.lo \
error.lo iter.lo iter_ull.lo loop.lo loop_ull.lo ordered.lo \
parallel.lo sections.lo single.lo task.lo team.lo work.lo \
lock.lo mutex.lo proc.lo sem.lo bar.lo ptrlock.lo time.lo \
fortran.lo affinity.lo target.lo splay-tree.lo \
libgomp-plugin.lo oacc-parallel.lo oacc-host.lo oacc-init.lo \
oacc-mem.lo oacc-async.lo oacc-plugin.lo oacc-cuda.lo \
priority_queue.lo $(am__objects_1)
am_libgomp_la_OBJECTS = alloc.lo atomic.lo barrier.lo critical.lo \
env.lo error.lo icv.lo icv-device.lo iter.lo iter_ull.lo \
loop.lo loop_ull.lo ordered.lo parallel.lo sections.lo \
single.lo task.lo team.lo work.lo lock.lo mutex.lo proc.lo \
sem.lo bar.lo ptrlock.lo time.lo fortran.lo affinity.lo \
target.lo splay-tree.lo libgomp-plugin.lo oacc-parallel.lo \
oacc-host.lo oacc-init.lo oacc-mem.lo oacc-async.lo \
oacc-plugin.lo oacc-cuda.lo priority_queue.lo $(am__objects_1)
libgomp_la_OBJECTS = $(am_libgomp_la_OBJECTS)
DEFAULT_INCLUDES = -I.@am__isrc@
depcomp = $(SHELL) $(top_srcdir)/../depcomp
@ -400,13 +400,14 @@ libgomp_la_LDFLAGS = $(libgomp_version_info) $(libgomp_version_script) \
libgomp_la_DEPENDENCIES = $(libgomp_version_dep)
libgomp_la_LINK = $(LINK) $(libgomp_la_LDFLAGS)
libgomp_la_SOURCES = alloc.c barrier.c critical.c env.c error.c iter.c \
iter_ull.c loop.c loop_ull.c ordered.c parallel.c sections.c \
single.c task.c team.c work.c lock.c mutex.c proc.c sem.c \
bar.c ptrlock.c time.c fortran.c affinity.c target.c \
splay-tree.c libgomp-plugin.c oacc-parallel.c oacc-host.c \
oacc-init.c oacc-mem.c oacc-async.c oacc-plugin.c oacc-cuda.c \
priority_queue.c $(am__append_3)
libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c \
error.c icv.c icv-device.c iter.c iter_ull.c loop.c loop_ull.c \
ordered.c parallel.c sections.c single.c task.c team.c work.c \
lock.c mutex.c proc.c sem.c bar.c ptrlock.c time.c fortran.c \
affinity.c target.c splay-tree.c libgomp-plugin.c \
oacc-parallel.c oacc-host.c oacc-init.c oacc-mem.c \
oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c \
$(am__append_3)
# Nvidia PTX OpenACC plugin.
@PLUGIN_NVPTX_TRUE@libgomp_plugin_nvptx_version_info = -version-info $(libtool_VERSION)
@ -571,12 +572,15 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/affinity.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/alloc.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/atomic.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/bar.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/barrier.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/critical.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/env.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/error.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/fortran.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/icv-device.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/icv.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/iter.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/iter_ull.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp-plugin.Plo@am__quote@

View File

@ -32,12 +32,14 @@ gomp_init_affinity (void)
{
}
#ifdef LIBGOMP_USE_PTHREADS
/* Stub: this implementation takes a thread attribute object and a place
   number and does nothing with either of them.  */
void
gomp_init_thread_affinity (pthread_attr_t *attr __attribute__((unused)),
                           unsigned int place __attribute__((unused)))
{
}
#endif
void **
gomp_affinity_alloc (unsigned long count, bool quiet)

57
libgomp/atomic.c Normal file
View File

@ -0,0 +1,57 @@
/* Copyright (C) 2005-2016 Free Software Foundation, Inc.
Contributed by Richard Henderson <rth@redhat.com>.
This file is part of the GNU Offloading and Multi Processing Library
(libgomp).
Libgomp is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.
You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
/* This file contains helpers for the ATOMIC construct. */
#include "libgomp.h"
/* This mutex is used when atomic operations don't exist for the target
in the mode requested. The result is not globally atomic, but works so
long as all parallel references are within #pragma omp atomic directives.
According to responses received from omp@openmp.org, appears to be within
spec. Which makes sense, since that's how several other compilers
handle this situation as well. */
static gomp_mutex_t atomic_lock;
/* Enter the fallback ATOMIC region by acquiring the file-local
   atomic_lock mutex.  Paired with GOMP_atomic_end.  */
void
GOMP_atomic_start (void)
{
gomp_mutex_lock (&atomic_lock);
}
/* Leave the fallback ATOMIC region by releasing the file-local
   atomic_lock mutex taken in GOMP_atomic_start.  */
void
GOMP_atomic_end (void)
{
gomp_mutex_unlock (&atomic_lock);
}
/* Only compiled when GOMP_MUTEX_INIT_0 is zero or undefined.
   NOTE(review): presumably that macro means an all-zero gomp_mutex_t is
   already a valid unlocked mutex -- confirm in the target's mutex.h.  */
#if !GOMP_MUTEX_INIT_0
/* Constructor that runs gomp_mutex_init on atomic_lock.  */
static void __attribute__((constructor))
initialize_atomic (void)
{
gomp_mutex_init (&atomic_lock);
}
#endif

View File

@ -115,6 +115,9 @@
/* Define to 1 if GNU symbol versioning is used for libgomp. */
#undef LIBGOMP_GNU_SYMBOL_VERSIONING
/* Define to 1 if libgomp should use POSIX threads. */
#undef LIBGOMP_USE_PTHREADS
/* Define to the sub-directory in which libtool stores uninstalled libraries.
*/
#undef LT_OBJDIR

View File

@ -32,98 +32,8 @@
#include <sys/syscall.h>
#include "wait.h"
/* The internal gomp_mutex_t and the external non-recursive omp_lock_t
have the same form. Re-use it. */
/* Initialize the simple lock LOCK by handing it straight to
   gomp_mutex_init.  */
void
gomp_init_lock_30 (omp_lock_t *lock)
{
gomp_mutex_init (lock);
}
/* Destroy the simple lock LOCK by handing it straight to
   gomp_mutex_destroy.  */
void
gomp_destroy_lock_30 (omp_lock_t *lock)
{
gomp_mutex_destroy (lock);
}
/* Acquire the simple lock LOCK via gomp_mutex_lock.  */
void
gomp_set_lock_30 (omp_lock_t *lock)
{
gomp_mutex_lock (lock);
}
/* Release the simple lock LOCK via gomp_mutex_unlock.  */
void
gomp_unset_lock_30 (omp_lock_t *lock)
{
gomp_mutex_unlock (lock);
}
/* Try to acquire the simple lock LOCK without blocking.  A single strong
   compare-and-swap attempts to change *LOCK from 0 to 1, with acquire
   ordering on success and relaxed ordering on failure.  Returns nonzero
   if the swap succeeded, zero otherwise.  */
int
gomp_test_lock_30 (omp_lock_t *lock)
{
int oldval = 0;
return __atomic_compare_exchange_n (lock, &oldval, 1, false,
MEMMODEL_ACQUIRE, MEMMODEL_RELAXED);
}
/* Initialize the nestable lock LOCK by zero-filling the whole object.  */
void
gomp_init_nest_lock_30 (omp_nest_lock_t *lock)
{
memset (lock, '\0', sizeof (*lock));
}
/* Destroy the nestable lock LOCK.  Intentionally empty: this
   implementation performs no work on destruction.  */
void
gomp_destroy_nest_lock_30 (omp_nest_lock_t *lock)
{
}
/* Acquire the nestable lock LOCK, blocking if another owner holds it.
   The value returned by gomp_icv (true) serves as the caller's identity
   token: a caller that already owns the lock only bumps the nesting
   count, anybody else first takes the underlying mutex.  */
void
gomp_set_nest_lock_30 (omp_nest_lock_t *lock)
{
  void *self = gomp_icv (true);

  if (lock->owner == self)
    {
      /* Re-entry by the current owner.  */
      lock->count++;
      return;
    }

  gomp_mutex_lock (&lock->lock);
  lock->owner = self;
  lock->count++;
}
/* Drop one nesting level of the nestable lock LOCK.  When the nesting
   count reaches zero the owner is cleared and the underlying mutex is
   released.  */
void
gomp_unset_nest_lock_30 (omp_nest_lock_t *lock)
{
  lock->count--;
  if (lock->count != 0)
    return;

  /* Outermost release: give the mutex back.  */
  lock->owner = NULL;
  gomp_mutex_unlock (&lock->lock);
}
/* Try to acquire the nestable lock LOCK without blocking.  Returns the new
   nesting count on success (1 for a fresh acquisition, larger for a
   re-entry by the current owner) and 0 if the lock is held by somebody
   else.  */
int
gomp_test_nest_lock_30 (omp_nest_lock_t *lock)
{
  void *self = gomp_icv (true);
  int expected = 0;

  /* Re-entry by the current owner never touches the mutex word.  */
  if (lock->owner == self)
    return ++lock->count;

  /* One strong compare-and-swap from 0 to 1; acquire ordering on success,
     relaxed on failure.  */
  if (!__atomic_compare_exchange_n (&lock->lock, &expected, 1, false,
                                    MEMMODEL_ACQUIRE, MEMMODEL_RELAXED))
    return 0;

  lock->owner = self;
  lock->count = 1;
  return 1;
}
/* Reuse the generic implementation in terms of gomp_mutex_t. */
#include "../../lock.c"
#ifdef LIBGOMP_GNU_SYMBOL_VERSIONING
/* gomp_mutex_* can be safely locked in one thread and

View File

@ -0,0 +1,206 @@
/* Copyright (C) 2015-2016 Free Software Foundation, Inc.
Contributed by Alexander Monakov <amonakov@ispras.ru>
This file is part of the GNU Offloading and Multi Processing Library
(libgomp).
Libgomp is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.
You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
/* This is an NVPTX specific implementation of a barrier synchronization
mechanism for libgomp. This type is private to the library. This
implementation uses atomic instructions and bar.sync instruction. */
#include <limits.h>
#include "libgomp.h"
/* Complete a barrier wait; STATE is the value produced by the matching
   gomp_barrier_wait_start call.  If STATE carries BAR_WAS_LAST, this
   thread re-arms BAR->awaited and publishes the next generation with a
   release store.  Every caller then executes the PTX "bar.sync"
   instruction on barrier resource 1.  */
void
gomp_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
{
if (__builtin_expect (state & BAR_WAS_LAST, 0))
{
/* Next time we'll be awaiting TOTAL threads again. */
bar->awaited = bar->total;
__atomic_store_n (&bar->generation, bar->generation + BAR_INCR,
MEMMODEL_RELEASE);
}
/* NOTE(review): the thread-count operand is 32 * bar->total, presumably
   because each logical thread occupies a full 32-lane warp -- confirm.  */
asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
}
/* Full barrier wait on BAR: runs gomp_barrier_wait_start and passes its
   result to gomp_barrier_wait_end.  */
void
gomp_barrier_wait (gomp_barrier_t *bar)
{
gomp_barrier_wait_end (bar, gomp_barrier_wait_start (bar));
}
/* Like gomp_barrier_wait, except that if the encountering thread
is not the last one to hit the barrier, it returns immediately.
The intended usage is that a thread which intends to gomp_barrier_destroy
this barrier calls gomp_barrier_wait, while all other threads
call gomp_barrier_wait_last. When gomp_barrier_wait returns,
the barrier can be safely destroyed. */
/* Note that this implementation does not return early: it simply calls
   gomp_barrier_wait, so every caller goes through the same wait path.  */
void
gomp_barrier_wait_last (gomp_barrier_t *bar)
{
/* Deferring to gomp_barrier_wait does not use the optimization opportunity
allowed by the interface contract for all-but-last participants. The
original implementation in config/linux/bar.c handles this better. */
gomp_barrier_wait (bar);
}
/* Wake threads waiting on the team barrier BAR.  COUNT is ignored by this
   implementation: the function unconditionally executes PTX "bar.sync" on
   barrier resource 1 with a thread count of 32 * BAR->total.  */
void
gomp_team_barrier_wake (gomp_barrier_t *bar, int count)
{
asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
}
/* Complete a team barrier wait; STATE is the value produced by the matching
   *_wait_start call.

   The thread that arrived last (BAR_WAS_LAST set in STATE) re-arms
   BAR->awaited and clears the team's work_share_cancelled flag.  If the
   team has no outstanding tasks it publishes the next generation, joins
   the hardware barrier once and returns.  Otherwise it runs tasks and then
   falls into the common wait loop together with all other threads.

   The wait loop alternates between PTX "bar.sync" on barrier resource 1
   and re-reading BAR->generation, running pending tasks when
   BAR_TASK_PENDING is observed, until the generation equals STATE (with
   BAR_CANCELLED masked off) plus BAR_INCR.  */
void
gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
{
  unsigned int gen;

  if (__builtin_expect (state & BAR_WAS_LAST, 0))
    {
      /* Next time we'll be awaiting TOTAL threads again.  */
      struct gomp_thread *thr = gomp_thread ();
      struct gomp_team *team = thr->ts.team;

      bar->awaited = bar->total;
      team->work_share_cancelled = 0;
      if (__builtin_expect (team->task_count, 0))
        {
          gomp_barrier_handle_tasks (state);
          state &= ~BAR_WAS_LAST;
        }
      else
        {
          /* No tasks left: advance the generation, dropping the flag
             bits, and release everybody.  */
          state &= ~BAR_CANCELLED;
          state += BAR_INCR - BAR_WAS_LAST;
          __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE);
          asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
          return;
        }
    }

  /* The loop exit test compares against the generation without the
     cancellation flag.  */
  state &= ~BAR_CANCELLED;
  do
    {
      asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
      gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
      if (__builtin_expect (gen & BAR_TASK_PENDING, 0))
        {
          gomp_barrier_handle_tasks (state);
          /* Task handling may have changed the generation; re-read it.  */
          gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
        }
    }
  while (gen != state + BAR_INCR);
}
/* Full team barrier wait on BAR: runs gomp_barrier_wait_start and passes
   its result to gomp_team_barrier_wait_end.  */
void
gomp_team_barrier_wait (gomp_barrier_t *bar)
{
gomp_team_barrier_wait_end (bar, gomp_barrier_wait_start (bar));
}
/* Team barrier wait that counts arrivals in BAR->awaited_final (through
   gomp_barrier_wait_final_start) instead of BAR->awaited.  The last
   arriving thread re-arms awaited_final before the wait is completed by
   gomp_team_barrier_wait_end.  */
void
gomp_team_barrier_wait_final (gomp_barrier_t *bar)
{
gomp_barrier_state_t state = gomp_barrier_wait_final_start (bar);
if (__builtin_expect (state & BAR_WAS_LAST, 0))
bar->awaited_final = bar->total;
gomp_team_barrier_wait_end (bar, state);
}
/* Complete a cancellable team barrier wait; STATE is the value produced by
   the matching *_wait_start call.  Returns true if the barrier was
   cancelled (BAR_CANCELLED seen in STATE or in BAR->generation) and false
   once the barrier has completed normally.

   The thread that arrived last re-arms BAR->awaited and clears the team's
   work_share_cancelled flag.  If the team has no outstanding tasks it
   publishes the next generation, joins the hardware barrier once and
   returns false; otherwise it runs tasks and then takes part in the
   common wait loop.  */
bool
gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
                                   gomp_barrier_state_t state)
{
  unsigned int gen;

  if (__builtin_expect (state & BAR_WAS_LAST, 0))
    {
      /* Next time we'll be awaiting TOTAL threads again.  */
      /* BAR_CANCELLED should never be set in state here, because
         cancellation means that at least one of the threads has been
         cancelled, thus on a cancellable barrier we should never see
         all threads to arrive.  */
      struct gomp_thread *thr = gomp_thread ();
      struct gomp_team *team = thr->ts.team;

      bar->awaited = bar->total;
      team->work_share_cancelled = 0;
      if (__builtin_expect (team->task_count, 0))
        {
          gomp_barrier_handle_tasks (state);
          state &= ~BAR_WAS_LAST;
        }
      else
        {
          /* No tasks left: advance the generation and release
             everybody.  */
          state += BAR_INCR - BAR_WAS_LAST;
          __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE);
          asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
          return false;
        }
    }

  /* Cancellation was already visible when the wait started.  */
  if (__builtin_expect (state & BAR_CANCELLED, 0))
    return true;

  do
    {
      asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
      gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
      if (__builtin_expect (gen & BAR_CANCELLED, 0))
        return true;
      if (__builtin_expect (gen & BAR_TASK_PENDING, 0))
        {
          gomp_barrier_handle_tasks (state);
          /* Task handling may have changed the generation; re-read it.  */
          gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
        }
    }
  while (gen != state + BAR_INCR);

  return false;
}
/* Full cancellable team barrier wait on BAR: runs gomp_barrier_wait_start
   and passes its result to gomp_team_barrier_wait_cancel_end, whose
   result is returned unchanged.  */
bool
gomp_team_barrier_wait_cancel (gomp_barrier_t *bar)
{
return gomp_team_barrier_wait_cancel_end (bar, gomp_barrier_wait_start (bar));
}
/* Mark TEAM's barrier as cancelled and wake its waiters.  The
   BAR_CANCELLED flag is tested and set under TEAM->task_lock; if it was
   already set the function returns without issuing another wake.  */
void
gomp_team_barrier_cancel (struct gomp_team *team)
{
  bool was_cancelled;

  gomp_mutex_lock (&team->task_lock);
  was_cancelled = (team->barrier.generation & BAR_CANCELLED) != 0;
  if (!was_cancelled)
    team->barrier.generation |= BAR_CANCELLED;
  gomp_mutex_unlock (&team->task_lock);

  /* The wake is issued only after the lock has been dropped.  */
  if (!was_cancelled)
    gomp_team_barrier_wake (&team->barrier, INT_MAX);
}

166
libgomp/config/nvptx/bar.h Normal file
View File

@ -0,0 +1,166 @@
/* Copyright (C) 2015-2016 Free Software Foundation, Inc.
Contributed by Alexander Monakov <amonakov@ispras.ru>
This file is part of the GNU Offloading and Multi Processing Library
(libgomp).
Libgomp is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.
You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
/* This is an NVPTX specific implementation of a barrier synchronization
mechanism for libgomp. This type is private to the library. This
implementation uses atomic instructions and bar.sync instruction. */
#ifndef GOMP_BARRIER_H
#define GOMP_BARRIER_H 1
#include "mutex.h"
/* Barrier object.  */
typedef struct
{
/* Number of participants; written by gomp_barrier_init and
   gomp_barrier_reinit.  */
unsigned total;
/* Generation counter in the high bits, BAR_* flag bits in the low bits.  */
unsigned generation;
/* Arrivals still outstanding; decremented by gomp_barrier_wait_start.  */
unsigned awaited;
/* Like AWAITED, but decremented by gomp_barrier_wait_final_start.  */
unsigned awaited_final;
} gomp_barrier_t;
/* Snapshot of a barrier's generation word, possibly with BAR_WAS_LAST
   added, as passed from the *_wait_start to the *_wait_end functions.  */
typedef unsigned int gomp_barrier_state_t;
/* The generation field contains a counter in the high bits, with a few
low bits dedicated to flags. Note that TASK_PENDING and WAS_LAST can
share space because WAS_LAST is never stored back to generation. */
#define BAR_TASK_PENDING 1
#define BAR_WAS_LAST 1
#define BAR_WAITING_FOR_TASK 2
#define BAR_CANCELLED 4
/* Step of the generation counter; all flag bits lie below it.  */
#define BAR_INCR 8
/* Set up BAR for COUNT participants.  */
static inline void
gomp_barrier_init (gomp_barrier_t *bar, unsigned count)
{
  /* Start at generation zero with no flag bits set.  */
  bar->generation = 0;

  /* The participant total and both arrival counters all start at COUNT.  */
  bar->awaited_final = count;
  bar->awaited = count;
  bar->total = count;
}
/* Change the number of participants of BAR to COUNT.  BAR->awaited is
   adjusted atomically by the difference between COUNT and the old total
   (unsigned arithmetic, so a smaller COUNT wraps around to a decrement),
   and only then is BAR->total overwritten.  */
static inline void gomp_barrier_reinit (gomp_barrier_t *bar, unsigned count)
{
__atomic_add_fetch (&bar->awaited, count - bar->total, MEMMODEL_ACQ_REL);
bar->total = count;
}
/* Destroy BAR.  Intentionally empty: this implementation performs no work
   on destruction.  */
static inline void gomp_barrier_destroy (gomp_barrier_t *bar)
{
}
extern void gomp_barrier_wait (gomp_barrier_t *);
extern void gomp_barrier_wait_last (gomp_barrier_t *);
extern void gomp_barrier_wait_end (gomp_barrier_t *, gomp_barrier_state_t);
extern void gomp_team_barrier_wait (gomp_barrier_t *);
extern void gomp_team_barrier_wait_final (gomp_barrier_t *);
extern void gomp_team_barrier_wait_end (gomp_barrier_t *,
gomp_barrier_state_t);
extern bool gomp_team_barrier_wait_cancel (gomp_barrier_t *);
extern bool gomp_team_barrier_wait_cancel_end (gomp_barrier_t *,
gomp_barrier_state_t);
extern void gomp_team_barrier_wake (gomp_barrier_t *, int);
struct gomp_team;
extern void gomp_team_barrier_cancel (struct gomp_team *);
/* Begin a barrier wait on BAR.  Takes a snapshot of the generation word
   reduced to its counter bits plus BAR_CANCELLED, atomically decrements
   BAR->awaited, and adds BAR_WAS_LAST to the snapshot if that decrement
   brought the counter to zero.  The result is meant to be passed to one
   of the *_wait_end functions.  */
static inline gomp_barrier_state_t
gomp_barrier_wait_start (gomp_barrier_t *bar)
{
unsigned int ret = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
ret &= -BAR_INCR | BAR_CANCELLED;
/* A memory barrier is needed before exiting from the various forms
of gomp_barrier_wait, to satisfy OpenMP API version 3.1 section
2.8.6 flush Construct, which says there is an implicit flush during
a barrier region. This is a convenient place to add the barrier,
so we use MEMMODEL_ACQ_REL here rather than MEMMODEL_ACQUIRE. */
if (__atomic_add_fetch (&bar->awaited, -1, MEMMODEL_ACQ_REL) == 0)
ret |= BAR_WAS_LAST;
return ret;
}
/* Begin a cancellable barrier wait on BAR.  Identical to
   gomp_barrier_wait_start in this implementation.  */
static inline gomp_barrier_state_t
gomp_barrier_wait_cancel_start (gomp_barrier_t *bar)
{
return gomp_barrier_wait_start (bar);
}
/* This is like gomp_barrier_wait_start, except it decrements
bar->awaited_final rather than bar->awaited and should be used
for the gomp_team_end barrier only. */
/* Returns the generation snapshot (counter bits plus BAR_CANCELLED), with
   BAR_WAS_LAST added if this call brought awaited_final to zero.  */
static inline gomp_barrier_state_t
gomp_barrier_wait_final_start (gomp_barrier_t *bar)
{
unsigned int ret = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
ret &= -BAR_INCR | BAR_CANCELLED;
/* See above gomp_barrier_wait_start comment. */
if (__atomic_add_fetch (&bar->awaited_final, -1, MEMMODEL_ACQ_REL) == 0)
ret |= BAR_WAS_LAST;
return ret;
}
/* Return true if STATE has the BAR_WAS_LAST bit set, i.e. the
   *_wait_start call that produced it was the last arrival.  */
static inline bool
gomp_barrier_last_thread (gomp_barrier_state_t state)
{
  return (state & BAR_WAS_LAST) != 0;
}
/* All the inlines below must be called with team->task_lock
held. */
/* Set the BAR_TASK_PENDING flag in BAR's generation word (plain,
   non-atomic read-modify-write).  */
static inline void
gomp_team_barrier_set_task_pending (gomp_barrier_t *bar)
{
bar->generation |= BAR_TASK_PENDING;
}
/* Clear the BAR_TASK_PENDING flag in BAR's generation word (plain,
   non-atomic read-modify-write).  */
static inline void
gomp_team_barrier_clear_task_pending (gomp_barrier_t *bar)
{
bar->generation &= ~BAR_TASK_PENDING;
}
/* Set the BAR_WAITING_FOR_TASK flag in BAR's generation word (plain,
   non-atomic read-modify-write).  */
static inline void
gomp_team_barrier_set_waiting_for_tasks (gomp_barrier_t *bar)
{
bar->generation |= BAR_WAITING_FOR_TASK;
}
/* Return true if the BAR_WAITING_FOR_TASK flag is set in BAR's generation
   word.  */
static inline bool
gomp_team_barrier_waiting_for_tasks (gomp_barrier_t *bar)
{
return (bar->generation & BAR_WAITING_FOR_TASK) != 0;
}
/* Return true if the BAR_CANCELLED flag is set in BAR's generation word.
   The branch hint marks cancellation as the unlikely case.  */
static inline bool
gomp_team_barrier_cancelled (gomp_barrier_t *bar)
{
return __builtin_expect ((bar->generation & BAR_CANCELLED) != 0, 0);
}
/* Finish the barrier round described by STATE: store into BAR->generation
   the counter part of STATE advanced by one step (BAR_INCR), with all
   flag bits cleared.  */
static inline void
gomp_team_barrier_done (gomp_barrier_t *bar, gomp_barrier_state_t state)
{
bar->generation = (state & -BAR_INCR) + BAR_INCR;
}
#endif /* GOMP_BARRIER_H */

View File

@ -0,0 +1,60 @@
/* Copyright (C) 2015-2016 Free Software Foundation, Inc.
Contributed by Alexander Monakov <amonakov@ispras.ru>
This file is part of the GNU Offloading and Multi Processing Library
(libgomp).
Libgomp is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.
You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
/* This is the NVPTX implementation of doacross spinning. */
#ifndef GOMP_DOACROSS_H
#define GOMP_DOACROSS_H 1
#include "libgomp.h"
/* Load target for cpu_relax.  Nothing in this header writes to it, so it
   keeps its static zero initialization.  */
static int zero;
/* Perform one "ld.cg" load from ZERO and return the loaded value.  The asm
   is volatile and clobbers memory, so the compiler can neither drop it nor
   cache memory values across it.  */
static inline int
cpu_relax (void)
{
int r;
/* Here we need a long-latency operation to make the current warp yield.
We could use ld.cv, uncached load from system (host) memory, but that
would require allocating locked memory in the plugin. Alternatively,
we can use ld.cg, which evicts from L1 and caches in L2. */
asm volatile ("ld.cg.s32 %0, [%1];" : "=r" (r) : "i" (&zero) : "memory");
return r;
}
/* Spin until the value read through ADDR is greater than EXPECTED.  The
   incoming value of CUR is not used: it is overwritten by the first load
   before the loop condition is evaluated.  */
static inline void doacross_spin (unsigned long *addr, unsigned long expected,
unsigned long cur)
{
/* Prevent compiler from optimizing based on bounds of containing object. */
asm ("" : "+r" (addr));
do
{
/* NOTE(review): cpu_relax returns the value loaded from the static
   `zero', so I is expected to be 0 and this reads addr[0]; indexing by I
   presumably ties the reload to the relaxing load -- confirm.  */
int i = cpu_relax ();
cur = addr[i];
}
while (cur <= expected);
}
#endif /* GOMP_DOACROSS_H */

View File

@ -0,0 +1,42 @@
/* Copyright (C) 2015-2016 Free Software Foundation, Inc.
Contributed by Alexander Monakov <amonakov@ispras.ru>
This file is part of the GNU Offloading and Multi Processing Library
(libgomp).
Libgomp is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.
You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
/* This file contains routines used to signal errors. On NVPTX, we have
one default output stream (stdout), so redirect everything there. */
#include "libgomp.h"
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
/* Drop any existing macro definitions of the stream output functions
   before redefining them below.  */
#undef vfprintf
#undef fputs
#undef fputc
/* Replacement macros that discard the STREAM argument and route the
   output through vprintf/printf instead.  */
#define vfprintf(stream, fmt, list) vprintf (fmt, list)
#define fputs(s, stream) printf ("%s", s)
#define fputc(c, stream) printf ("%c", c)
/* Pull in the generic implementation with the redirections above in
   effect.  */
#include "../../error.c"

View File

@ -0,0 +1,74 @@
/* Copyright (C) 2015-2016 Free Software Foundation, Inc.
Contributed by Alexander Monakov <amonakov@ispras.ru>
This file is part of the GNU Offloading and Multi Processing Library
(libgomp).
Libgomp is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.
You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
/* This file defines OpenMP API entry points that accelerator targets are
expected to replace. */
#include "libgomp.h"
/* Accepts a device number and ignores it; this implementation keeps no
   default-device state.  */
void
omp_set_default_device (int device_num __attribute__((unused)))
{
}
/* Always reports 0 as the default device number.  */
int
omp_get_default_device (void)
{
return 0;
}
/* Always reports that 0 devices are available.  */
int
omp_get_num_devices (void)
{
return 0;
}
/* Return gomp_num_teams_var plus one.
   NOTE(review): this implies gomp_num_teams_var holds the number of teams
   minus one -- confirm against the code that sets it.  */
int
omp_get_num_teams (void)
{
return gomp_num_teams_var + 1;
}
/* Return the value of the PTX special register %ctaid.x, which is used
   here as the team number.  */
int
omp_get_team_num (void)
{
int ctaid;
asm ("mov.u32 %0, %%ctaid.x;" : "=r" (ctaid));
return ctaid;
}
/* OpenMP API entry point.  Always returns 0 here.  */
int
omp_is_initial_device (void)
{
  /* NVPTX is an accelerator-only target.  */
  enum { on_initial_device = 0 };

  return on_initial_device;
}
/* Apply the ialias macro to every entry point defined above.
   NOTE(review): ialias is defined outside this chunk; presumably it creates
   library-internal aliases -- confirm in libgomp.h.  */
ialias (omp_set_default_device)
ialias (omp_get_default_device)
ialias (omp_get_num_devices)
ialias (omp_get_num_teams)
ialias (omp_get_team_num)
ialias (omp_is_initial_device)

View File

@ -0,0 +1,41 @@
/* Copyright (C) 2016 Free Software Foundation, Inc.
Contributed by Alexander Monakov <amonakov@ispras.ru>.
This file is part of the GNU Offloading and Multi Processing Library
(libgomp).
Libgomp is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.
You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
/* This is a NVPTX specific implementation of the public OpenMP locking
primitives. */
/* Reuse the generic implementation in terms of gomp_mutex_t. */
/* Textually include the generic lock implementation so that it is compiled
   as part of this translation unit.  */
#include "../../lock.c"
/* Apply the ialias macro to each public locking entry point.
   NOTE(review): ialias is defined outside this chunk; presumably it creates
   library-internal aliases -- confirm in libgomp.h.  */
ialias (omp_init_lock)
ialias (omp_init_nest_lock)
ialias (omp_destroy_lock)
ialias (omp_destroy_nest_lock)
ialias (omp_set_lock)
ialias (omp_set_nest_lock)
ialias (omp_unset_lock)
ialias (omp_unset_nest_lock)
ialias (omp_test_lock)
ialias (omp_test_nest_lock)

View File

@ -0,0 +1,60 @@
/* Copyright (C) 2015-2016 Free Software Foundation, Inc.
Contributed by Alexander Monakov <amonakov@ispras.ru>
This file is part of the GNU Offloading and Multi Processing Library
(libgomp).
Libgomp is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.
You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
/* This is an NVPTX specific implementation of a mutex synchronization
mechanism for libgomp. This type is private to the library. This
implementation uses atomic instructions and busy waiting. */
#ifndef GOMP_MUTEX_H
#define GOMP_MUTEX_H 1
/* The mutex is a single integer word: gomp_mutex_init stores 0 into it and
   gomp_mutex_lock stores 1 into it.  */
typedef int gomp_mutex_t;

/* Nonzero: a zero-filled mutex is already in the state gomp_mutex_init
   produces.  NOTE(review): presumably lets users skip run-time
   initialization of static mutexes -- confirm against the users of this
   macro.  */
#define GOMP_MUTEX_INIT_0 1

/* Put MUTEX into the unlocked state.  */
static inline void
gomp_mutex_init (gomp_mutex_t *mutex)
{
  mutex[0] = 0;
}

/* Nothing to release for this representation.  */
static inline void
gomp_mutex_destroy (gomp_mutex_t *mutex)
{
  /* Intentionally empty.  */
}

/* Acquire MUTEX by busy waiting: keep atomically writing 1 until the
   previous value read back is 0.  */
static inline void
gomp_mutex_lock (gomp_mutex_t *mutex)
{
  for (;;)
    {
      int previously_held = __sync_lock_test_and_set (mutex, 1);

      if (!previously_held)
	return;
      /* spin */
    }
}

/* Release MUTEX.  */
static inline void
gomp_mutex_unlock (gomp_mutex_t *mutex)
{
  __sync_lock_release (mutex);
}
#endif /* GOMP_MUTEX_H */

View File

@ -1,8 +1,5 @@
/* OpenACC Runtime Fortran wrapper routines
Copyright (C) 2014-2016 Free Software Foundation, Inc.
Contributed by Mentor Embedded.
/* Copyright (C) 2015-2016 Free Software Foundation, Inc.
Contributed by Alexander Monakov <amonakov@ispras.ru>
This file is part of the GNU Offloading and Multi Processing Library
(libgomp).
@ -26,15 +23,27 @@
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
/* Temporary hack; this will be provided by libgfortran. */
/* This is the NVPTX implementation of the thread pool management
for libgomp. This type is private to the library. */
extern void _gfortran_abort (void);
#ifndef GOMP_POOL_H
#define GOMP_POOL_H 1
/* File-scope inline PTX that declares and then defines a visible function
   named _gfortran_abort whose body executes `trap' followed by `ret'.  */
__asm__ ("// BEGIN GLOBAL FUNCTION DECL: _gfortran_abort\n"
".visible .func _gfortran_abort;\n"
"// BEGIN GLOBAL FUNCTION DEF: _gfortran_abort\n"
".visible .func _gfortran_abort\n"
"{\n"
"trap;\n"
"ret;\n"
"}\n");
#include "libgomp.h"
/* Return the thread pool recorded in THR.  NTHREADS is not consulted by
   this implementation.  */
static inline struct gomp_thread_pool *
gomp_get_thread_pool (struct gomp_thread *thr, unsigned nthreads)
{
  /* NVPTX is running with a fixed pool of pre-started threads.  */
  struct gomp_thread_pool *fixed_pool = thr->thread_pool;

  return fixed_pool;
}
/* Counterpart of gomp_get_thread_pool.  POOL is left untouched.  */
static inline void
gomp_release_thread_pool (struct gomp_thread_pool *pool)
{
  /* Nothing to do.  */
}
#endif /* GOMP_POOL_H */

View File

@ -1 +0,0 @@
/* Empty stub for omp task priority support. */

View File

@ -0,0 +1,41 @@
/* Copyright (C) 2015-2016 Free Software Foundation, Inc.
Contributed by Alexander Monakov <amonakov@ispras.ru>
This file is part of the GNU Offloading and Multi Processing Library
(libgomp).
Libgomp is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.
You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
/* This file contains system specific routines related to counting
online processors and dynamic load balancing. */
#include "libgomp.h"
unsigned
gomp_dynamic_max_threads (void)
{
return gomp_icv (false)->nthreads_var;
}
int
omp_get_num_procs (void)
{
return gomp_icv (false)->nthreads_var;
}

View File

@ -0,0 +1,73 @@
/* Copyright (C) 2015-2016 Free Software Foundation, Inc.
Contributed by Alexander Monakov <amonakov@ispras.ru>
This file is part of the GNU Offloading and Multi Processing Library
(libgomp).
Libgomp is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.
You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
/* This is an NVPTX specific implementation of a ptrlock synchronization
mechanism for libgomp. This type is private to the library. This
implementation uses atomic instructions and busy waiting.
A ptrlock has four states:
0/NULL Initial
1 Owned by me, I get to write a pointer to ptrlock.
2 Some thread is waiting on the ptrlock.
>2 Ptrlock contains a valid pointer.
It is not valid to gain the ptrlock and then write a NULL to it. */
#ifndef GOMP_PTRLOCK_H
#define GOMP_PTRLOCK_H 1
/* A ptrlock is one pointer-sized word.  */
typedef void *gomp_ptrlock_t;

/* Store PTR into PTRLOCK with a plain (non-atomic) assignment.  */
static inline void
gomp_ptrlock_init (gomp_ptrlock_t *ptrlock, void *ptr)
{
  ptrlock[0] = ptr;
}
/* Read the pointer held in PTRLOCK, or claim the right to set it.
   Returns NULL when this caller moved the word from 0 to 1; otherwise
   returns the first value observed that is not 1, busy waiting while the
   word is 1.  */
static inline void *gomp_ptrlock_get (gomp_ptrlock_t *ptrlock)
{
uintptr_t v = (uintptr_t) __atomic_load_n (ptrlock, MEMMODEL_ACQUIRE);
/* Values above 2 are treated as a published pointer.  */
if (v > 2)
return (void *) v;
/* Word is still 0: try to move it to 1.  On failure the builtin writes the
   value it actually found back into V.  */
if (v == 0
&& __atomic_compare_exchange_n (ptrlock, &v, 1, false,
MEMMODEL_ACQUIRE, MEMMODEL_ACQUIRE))
return NULL;
/* Word is 1: spin, reloading until it changes.  */
while (v == 1)
v = (uintptr_t) __atomic_load_n (ptrlock, MEMMODEL_ACQUIRE);
/* NOTE(review): nothing visible in this header ever stores 2; if observed,
   it would be returned as a pointer here.  */
return (void *) v;
}
/* Publish PTR in PTRLOCK with an atomic release store.  */
static inline void
gomp_ptrlock_set (gomp_ptrlock_t *ptrlock, void *ptr)
{
  __atomic_store_n (ptrlock, ptr, MEMMODEL_RELEASE);
}
/* Nothing to release for this representation; PTRLOCK is left untouched.  */
static inline void
gomp_ptrlock_destroy (gomp_ptrlock_t *ptrlock)
{
  /* Intentionally empty.  */
}
#endif /* GOMP_PTRLOCK_H */

View File

@ -0,0 +1,65 @@
/* Copyright (C) 2015-2016 Free Software Foundation, Inc.
Contributed by Alexander Monakov <amonakov@ispras.ru>
This file is part of the GNU Offloading and Multi Processing Library
(libgomp).
Libgomp is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.
You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
/* This is an NVPTX specific implementation of a semaphore synchronization
mechanism for libgomp. This type is private to the library. This
semaphore implementation uses atomic instructions and busy waiting. */
#ifndef GOMP_SEM_H
#define GOMP_SEM_H 1
/* The semaphore is a single integer counter.  */
typedef int gomp_sem_t;

/* Set the counter of SEM to VALUE with a plain assignment.  */
static inline void
gomp_sem_init (gomp_sem_t *sem, int value)
{
  sem[0] = value;
}
/* Nothing to release for this representation; SEM is left untouched.  */
static inline void
gomp_sem_destroy (gomp_sem_t *sem)
{
  /* Intentionally empty.  */
}
/* Decrement the counter of SEM, busy waiting while it is zero.  */
static inline void
gomp_sem_wait (gomp_sem_t *sem)
{
int count = __atomic_load_n (sem, MEMMODEL_ACQUIRE);
for (;;)
{
/* Spin until a nonzero counter value is observed.  */
while (count == 0)
count = __atomic_load_n (sem, MEMMODEL_ACQUIRE);
/* Try to replace COUNT with COUNT - 1.  On failure the builtin stores the
   value it found into COUNT and the outer loop retries with it.  */
if (__atomic_compare_exchange_n (sem, &count, count - 1, false,
MEMMODEL_ACQUIRE, MEMMODEL_RELAXED))
return;
}
}
/* Atomically add one to the counter of SEM with release ordering; the new
   value is not used.  */
static inline void
gomp_sem_post (gomp_sem_t *sem)
{
  __atomic_add_fetch (sem, 1, MEMMODEL_RELEASE);
}
#endif /* GOMP_SEM_H */

View File

@ -0,0 +1,70 @@
/* Copyright (C) 2015-2016 Free Software Foundation, Inc.
Contributed by Alexander Monakov <amonakov@ispras.ru>
This file is part of the GNU Offloading and Multi Processing Library
(libgomp).
Libgomp is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.
You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
/* This is a simplified barrier that is suitable for thread pool
synchronization. Only a subset of the full barrier API (bar.h) is exposed.
Here in the NVPTX-specific implementation, we expect that thread pool
corresponds to a PTX CTA (thread block). */
#ifndef GOMP_SIMPLE_BARRIER_H
#define GOMP_SIMPLE_BARRIER_H 1
/* Barrier state: the value passed as the count operand of the PTX barrier
   instructions used by the wait routines.  */
typedef struct
{
  unsigned count;
} gomp_simple_barrier_t;

/* Record COUNT scaled by 32 in BAR.
   NOTE(review): the factor presumably converts a count of logical threads
   into hardware threads (32 per warp) -- confirm against the PTX ISA.  */
static inline void
gomp_simple_barrier_init (gomp_simple_barrier_t *bar, unsigned count)
{
  unsigned scaled = 32 * count;

  bar->count = scaled;
}
/* Unused on NVPTX.
static inline void
gomp_simple_barrier_reinit (gomp_simple_barrier_t *bar, unsigned count)
{
bar->count = count * 32;
}
*/
/* Nothing to release for this representation; BAR is left untouched.  */
static inline void
gomp_simple_barrier_destroy (gomp_simple_barrier_t *bar)
{
  /* Intentionally empty.  */
}
/* Execute PTX `bar.sync' on barrier 0 with the count stored in BAR.  The
   asm is volatile and clobbers memory.  */
static inline void
gomp_simple_barrier_wait (gomp_simple_barrier_t *bar)
{
  unsigned participants = bar->count;

  asm volatile ("bar.sync 0, %0;" : : "r" (participants) : "memory");
}
/* Execute PTX `bar.arrive' on barrier 0 with the count stored in BAR.  The
   asm is volatile and clobbers memory.  */
static inline void
gomp_simple_barrier_wait_last (gomp_simple_barrier_t *bar)
{
  unsigned participants = bar->count;

  asm volatile ("bar.arrive 0, %0;" : : "r" (participants) : "memory");
}
#endif /* GOMP_SIMPLE_BARRIER_H */

View File

@ -0,0 +1,49 @@
/* Copyright (C) 2013-2016 Free Software Foundation, Inc.
Contributed by Jakub Jelinek <jakub@redhat.com>.
This file is part of the GNU Offloading and Multi Processing Library
(libgomp).
Libgomp is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.
You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
#include "libgomp.h"
#include <limits.h>
/* Set up a league of teams.

   A nonzero THREAD_LIMIT is stored in the thread_limit_var ICV; values above
   INT_MAX are stored as UINT_MAX.

   The team count is derived from the PTX special registers %nctaid.x and
   %ctaid.x.  A NUM_TEAMS of zero, or one not below %nctaid.x, is replaced by
   %nctaid.x.  Otherwise a caller whose %ctaid.x is not below NUM_TEAMS frees
   its thread state and executes the PTX `exit' instruction.  Finally the
   team count minus one is stored in gomp_num_teams_var.  */
void
GOMP_teams (unsigned int num_teams, unsigned int thread_limit)
{
  if (thread_limit != 0)
    {
      struct gomp_task_icv *icv = gomp_icv (true);
      unsigned int stored_limit = thread_limit;

      if (thread_limit > INT_MAX)
	stored_limit = UINT_MAX;
      icv->thread_limit_var = stored_limit;
    }

  unsigned int grid_size, cta_index;

  asm ("mov.u32 %0, %%nctaid.x;" : "=r" (grid_size));
  asm ("mov.u32 %0, %%ctaid.x;" : "=r" (cta_index));

  if (num_teams == 0 || num_teams >= grid_size)
    num_teams = grid_size;
  else if (cta_index >= num_teams)
    {
      gomp_free_thread (nvptx_thrs);
      asm ("exit;");
    }

  gomp_num_teams_var = num_teams - 1;
}

View File

@ -0,0 +1,43 @@
/* Copyright (C) 2015-2016 Free Software Foundation, Inc.
Contributed by Alexander Monakov <amonakov@ispras.ru>.
This file is part of the GNU Offloading and Multi Processing Library
(libgomp).
Libgomp is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.
You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
/* This file handles the maintenance of tasks in response to task
creation and termination. */
#ifdef __nvptx_softstack__
#include "libgomp.h"
/* NVPTX is an accelerator-only target, so this should never be called.
   The body is marked unreachable and DATA is not examined.  */
bool
gomp_target_task_fn (void *data)
{
  __builtin_unreachable ();
}
#include "../../task.c"
#endif

View File

@ -0,0 +1,178 @@
/* Copyright (C) 2015-2016 Free Software Foundation, Inc.
Contributed by Alexander Monakov <amonakov@ispras.ru>
This file is part of the GNU Offloading and Multi Processing Library
(libgomp).
Libgomp is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.
You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
/* This file handles maintenance of threads on NVPTX. */
#if defined __nvptx_softstack__ && defined __nvptx_unisimt__
#include "libgomp.h"
#include <stdlib.h>
#include <string.h>
/* Base of the array of gomp_thread structures; gomp_nvptx_main assigns it
   and indexes it per thread.  Declared with the `shared' and `nocommon'
   attributes.  NOTE(review): `shared' presumably places the pointer in PTX
   .shared memory, giving one copy per thread block -- confirm against the
   GCC nvptx attribute documentation.  */
struct gomp_thread *nvptx_thrs __attribute__((shared,nocommon));
/* Idle loop for non-master threads; defined below.  */
static void gomp_thread_start (struct gomp_thread_pool *);
/* This externally visible function handles target region entry. It
sets up a per-team thread pool and transfers control by calling FN (FN_DATA)
in the master thread or gomp_thread_start in other threads.
The name of this function is part of the interface with the compiler: for
each target region, GCC emits a PTX .kernel function that sets up soft-stack
and uniform-simt state and calls this function, passing in FN the original
function outlined for the target region. */
void
gomp_nvptx_main (void (*fn) (void *), void *fn_data)
{
int tid, ntids;
asm ("mov.u32 %0, %%tid.y;" : "=r" (tid));
asm ("mov.u32 %0, %%ntid.y;" : "=r" (ntids));
if (tid == 0)
{
gomp_global_icv.nthreads_var = ntids;
/* Starting additional threads is not supported. */
gomp_global_icv.dyn_var = true;
nvptx_thrs = alloca (ntids * sizeof (*nvptx_thrs));
memset (nvptx_thrs, 0, ntids * sizeof (*nvptx_thrs));
struct gomp_thread_pool *pool = alloca (sizeof (*pool));
pool->threads = alloca (ntids * sizeof (*pool->threads));
for (tid = 0; tid < ntids; tid++)
pool->threads[tid] = nvptx_thrs + tid;
pool->threads_size = ntids;
pool->threads_used = ntids;
pool->threads_busy = 1;
pool->last_team = NULL;
gomp_simple_barrier_init (&pool->threads_dock, ntids);
nvptx_thrs[0].thread_pool = pool;
asm ("bar.sync 0;");
fn (fn_data);
gomp_free_thread (nvptx_thrs);
}
else
{
asm ("bar.sync 0;");
gomp_thread_start (nvptx_thrs[0].thread_pool);
}
}
/* This function contains the idle loop in which a thread waits
to be called up to become part of a team.
POOL is the thread pool this thread records as its own.  */
static void
gomp_thread_start (struct gomp_thread_pool *pool)
{
struct gomp_thread *thr = gomp_thread ();
gomp_sem_init (&thr->release, 0);
thr->thread_pool = pool;
do
{
/* Dock at the pool barrier; gomp_team_start arrives at the same barrier
   after filling in fn and data for the threads it releases.  */
gomp_simple_barrier_wait (&pool->threads_dock);
/* No work was assigned to this thread: `continue' evaluates the loop
   condition and docks again.  */
if (!thr->fn)
continue;
thr->fn (thr->data);
/* Clear fn so the next pass does not run the same work again.  */
thr->fn = NULL;
/* TASK is read before the final team barrier and finished after it.  */
struct gomp_task *task = thr->task;
gomp_team_barrier_wait_final (&thr->ts.team->barrier);
gomp_finish_task (task);
}
/* Work around an NVIDIA driver bug: when generating sm_50 machine code,
it can trash stack pointer R1 in loops lacking exit edges. Add a cheap
artificial exit that the driver would not be able to optimize out. */
while (nvptx_thrs);
}
/* Launch a team.

   FN, DATA   work function and its argument, handed to threads 1 .. NTHREADS-1.
   NTHREADS   number of threads in TEAM, including the calling thread.
   FLAGS      not examined by this implementation.
   TEAM       team to install in the calling thread and in the pool threads.

   The calling thread becomes team member 0.  The remaining members are taken
   from the calling thread's pool and released by arriving at the pool's
   docking barrier.  */
void
gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
		 unsigned flags, struct gomp_team *team)
{
  struct gomp_thread *master = gomp_thread ();
  struct gomp_thread_pool *pool = master->thread_pool;
  struct gomp_task *parent_task = master->task;
  struct gomp_task_icv *icv
    = parent_task ? &parent_task->icv : &gomp_global_icv;

  /* Always save the previous state, even if this isn't a nested team.
     In particular, we should save any work share state from an outer
     orphaned work share construct.  */
  team->prev_ts = master->ts;

  /* Install the new team state in the calling thread.  */
  master->ts.team = team;
  master->ts.team_id = 0;
  ++master->ts.level;
  if (nthreads > 1)
    ++master->ts.active_level;
  master->ts.work_share = &team->work_shares[0];
  master->ts.last_work_share = NULL;
  master->ts.single_count = 0;
  master->ts.static_trip = 0;
  master->task = &team->implicit_task[0];

  /* Read nthreads_var before initializing the implicit task, then copy it
     into that task's ICVs.  */
  unsigned long inherited_nthreads = icv->nthreads_var;
  gomp_init_task (master->task, parent_task, icv);
  team->implicit_task[0].icv.nthreads_var = inherited_nthreads;

  /* A team of one needs no pool threads.  */
  if (nthreads == 1)
    return;

  /* Release existing idle threads.  */
  for (unsigned id = 1; id < nthreads; ++id)
    {
      struct gomp_thread *worker = pool->threads[id];

      worker->ts.team = team;
      worker->ts.work_share = &team->work_shares[0];
      worker->ts.last_work_share = NULL;
      worker->ts.team_id = id;
      worker->ts.level = team->prev_ts.level + 1;
      worker->ts.active_level = master->ts.active_level;
      worker->ts.single_count = 0;
      worker->ts.static_trip = 0;

      worker->task = &team->implicit_task[id];
      gomp_init_task (worker->task, parent_task, icv);
      team->implicit_task[id].icv.nthreads_var = inherited_nthreads;

      worker->fn = fn;
      worker->data = data;
      team->ordered_release[id] = &worker->release;
    }

  gomp_simple_barrier_wait (&pool->threads_dock);
}
#include "../../team.c"
#endif

View File

@ -0,0 +1,49 @@
/* Copyright (C) 2015-2016 Free Software Foundation, Inc.
Contributed by Dmitry Melnik <dm@ispras.ru>
This file is part of the GNU Offloading and Multi Processing Library
(libgomp).
Libgomp is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.
You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
/* This file implements timer routines for NVPTX. It uses the %clock64 cycle
counter. */
#include "libgomp.h"
/* This is set from host in plugin-nvptx.c. */
/* omp_get_wtime multiplies the raw %clock64 value by this factor and
   omp_get_wtick returns it unchanged.  It is zero until overwritten.  */
double __nvptx_clocktick = 0;
double
omp_get_wtime (void)
{
uint64_t clock;
asm ("mov.u64 %0, %%clock64;" : "=r" (clock));
return clock * __nvptx_clocktick;
}
/* OpenMP API entry point.  Returns __nvptx_clocktick, the factor that
   omp_get_wtime applies to the raw counter.  */
double
omp_get_wtick (void)
{
  double tick = __nvptx_clocktick;

  return tick;
}
/* Apply the ialias macro to both timer entry points.
   NOTE(review): ialias is defined outside this chunk; presumably it creates
   library-internal aliases -- confirm in libgomp.h.  */
ialias (omp_get_wtime)
ialias (omp_get_wtick)

View File

@ -0,0 +1,69 @@
/* Copyright (C) 2015-2016 Free Software Foundation, Inc.
Contributed by Alexander Monakov <amonakov@ispras.ru>
This file is part of the GNU Offloading and Multi Processing Library
(libgomp).
Libgomp is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.
You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
/* This is a simplified barrier that is suitable for thread pool
synchronization. Only a subset of the full barrier API (bar.h) is exposed. */
#ifndef GOMP_SIMPLE_BARRIER_H
#define GOMP_SIMPLE_BARRIER_H 1
#include "bar.h"
typedef struct
{
gomp_barrier_t bar;
} gomp_simple_barrier_t;
static inline void
gomp_simple_barrier_init (gomp_simple_barrier_t *bar, unsigned count)
{
gomp_barrier_init (&bar->bar, count);
}
static inline void
gomp_simple_barrier_reinit (gomp_simple_barrier_t *bar, unsigned count)
{
gomp_barrier_reinit (&bar->bar, count);
}
static inline void
gomp_simple_barrier_destroy (gomp_simple_barrier_t *bar)
{
gomp_barrier_destroy (&bar->bar);
}
static inline void
gomp_simple_barrier_wait (gomp_simple_barrier_t *bar)
{
gomp_barrier_wait (&bar->bar);
}
static inline void
gomp_simple_barrier_wait_last (gomp_simple_barrier_t *bar)
{
gomp_barrier_wait_last (&bar->bar);
}
#endif /* GOMP_SIMPLE_BARRIER_H */

7
libgomp/configure vendored
View File

@ -15070,6 +15070,7 @@ case "$host" in
;;
nvptx*-*-*)
# NVPTX does not support Pthreads, has its own code replacement.
libgomp_use_pthreads=no
;;
*)
# Check to see if -pthread or -lpthread is needed. Prefer the former.
@ -15115,6 +15116,12 @@ rm -f core conftest.err conftest.$ac_objext \
conftest$ac_exeext conftest.$ac_ext
esac
if test x$libgomp_use_pthreads != xno; then
$as_echo "#define LIBGOMP_USE_PTHREADS 1" >>confdefs.h
fi
# Plugins for offload execution, configure.ac fragment. -*- mode: autoconf -*-
#
# Copyright (C) 2014-2016 Free Software Foundation, Inc.

View File

@ -181,6 +181,7 @@ case "$host" in
;;
nvptx*-*-*)
# NVPTX does not support Pthreads, has its own code replacement.
libgomp_use_pthreads=no
;;
*)
# Check to see if -pthread or -lpthread is needed. Prefer the former.
@ -202,6 +203,11 @@ case "$host" in
[AC_MSG_ERROR([Pthreads are required to build libgomp])])])
esac
if test x$libgomp_use_pthreads != xno; then
AC_DEFINE(LIBGOMP_USE_PTHREADS, 1,
[Define to 1 if libgomp should use POSIX threads.])
fi
m4_include([plugin/configfrag.ac])
# Check for functions needed.

View File

@ -115,33 +115,11 @@ GOMP_critical_name_end (void **pptr)
gomp_mutex_unlock (plock);
}
/* Fallback lock for atomic operations that the target cannot perform in the
   requested mode.  The result is not globally atomic, but it works as long
   as all parallel references are made within #pragma omp atomic directives.
   According to responses received from omp@openmp.org this appears to be
   within spec, which makes sense, since several other compilers handle the
   situation the same way.  */
static gomp_mutex_t atomic_lock;

/* Enter the fallback atomic region by taking atomic_lock.  */
void
GOMP_atomic_start (void)
{
  gomp_mutex_t *fallback_lock = &atomic_lock;

  gomp_mutex_lock (fallback_lock);
}

/* Leave the fallback atomic region by releasing atomic_lock.  */
void
GOMP_atomic_end (void)
{
  gomp_mutex_t *fallback_lock = &atomic_lock;

  gomp_mutex_unlock (fallback_lock);
}
#if !GOMP_MUTEX_INIT_0
static void __attribute__((constructor))
initialize_critical (void)
{
gomp_mutex_init (&default_lock);
gomp_mutex_init (&atomic_lock);
#ifndef HAVE_SYNC_BUILTINS
gomp_mutex_init (&create_lock_lock);
#endif

View File

@ -23,8 +23,8 @@
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
/* This file defines the OpenMP internal control variables, and arranges
for them to be initialized from environment variables at startup. */
/* This file arranges for OpenMP internal control variables to be initialized
from environment variables at startup. */
#include "libgomp.h"
#include "libgomp_f.h"
@ -55,35 +55,6 @@
# define strtoull(ptr, eptr, base) strtoul (ptr, eptr, base)
#endif
/* Initial values of the global set of task ICVs.  */
struct gomp_task_icv gomp_global_icv = {
.nthreads_var = 1,
.thread_limit_var = UINT_MAX,
.run_sched_var = GFS_DYNAMIC,
.run_sched_chunk_size = 1,
.default_device_var = 0,
.dyn_var = false,
.nest_var = false,
.bind_var = omp_proc_bind_false,
.target_data = NULL
};
/* Library-wide settings with explicit initial values.  */
unsigned long gomp_max_active_levels_var = INT_MAX;
bool gomp_cancel_var = false;
int gomp_max_task_priority_var = 0;
/* This lock is only defined when HAVE_SYNC_BUILTINS is not defined.  */
#ifndef HAVE_SYNC_BUILTINS
gomp_mutex_t gomp_managed_threads_lock;
#endif
unsigned long gomp_available_cpus = 1, gomp_managed_threads = 1;
/* The remaining globals have no initializer and therefore start out zeroed.
   NOTE(review): the code that fills them in is outside this chunk.  */
unsigned long long gomp_spin_count_var, gomp_throttled_spin_count_var;
unsigned long *gomp_nthreads_var_list, gomp_nthreads_var_list_len;
char *gomp_bind_var_list;
unsigned long gomp_bind_var_list_len;
void **gomp_places_list;
unsigned long gomp_places_list_len;
int gomp_debug_var;
char *goacc_device_type;
int goacc_device_num;
/* Parse the OMP_SCHEDULE environment variable. */
static void
@ -1302,240 +1273,3 @@ initialize_env (void)
goacc_runtime_initialize ();
}
/* The public OpenMP API routines that access these variables. */
/* Store N in the nthreads_var ICV; values below 1 are stored as 1.  */
void
omp_set_num_threads (int n)
{
  struct gomp_task_icv *icv = gomp_icv (true);
  int requested = 1;

  if (n > 0)
    requested = n;
  icv->nthreads_var = requested;
}
/* Store VAL in the dyn_var ICV.  */
void
omp_set_dynamic (int val)
{
  gomp_icv (true)->dyn_var = val;
}
/* Return the dyn_var ICV.  */
int
omp_get_dynamic (void)
{
  return gomp_icv (false)->dyn_var;
}
/* Store VAL in the nest_var ICV.  */
void
omp_set_nested (int val)
{
  gomp_icv (true)->nest_var = val;
}
/* Return the nest_var ICV.  */
int
omp_get_nested (void)
{
  return gomp_icv (false)->nest_var;
}
/* Set the run-time schedule ICVs.

   KIND        omp_sched_static, omp_sched_dynamic, omp_sched_guided or
	       omp_sched_auto; any other value leaves the ICVs unchanged.
   CHUNK_SIZE  stored for static, dynamic and guided.  Values below 1 are
	       stored as 0 for static and as 1 for dynamic and guided.
	       Ignored for auto.  */
void
omp_set_schedule (omp_sched_t kind, int chunk_size)
{
  struct gomp_task_icv *icv = gomp_icv (true);
  int floor_chunk;

  if (kind == omp_sched_static)
    floor_chunk = 0;
  else if (kind == omp_sched_dynamic || kind == omp_sched_guided)
    floor_chunk = 1;
  else if (kind == omp_sched_auto)
    {
      /* Auto keeps the stored chunk size as it is.  */
      icv->run_sched_var = kind;
      return;
    }
  else
    return;

  icv->run_sched_chunk_size = chunk_size < 1 ? floor_chunk : chunk_size;
  icv->run_sched_var = kind;
}
/* Copy the run_sched_var ICV into *KIND and the run_sched_chunk_size ICV
   into *CHUNK_SIZE.  */
void
omp_get_schedule (omp_sched_t *kind, int *chunk_size)
{
  struct gomp_task_icv *current = gomp_icv (false);

  kind[0] = current->run_sched_var;
  chunk_size[0] = current->run_sched_chunk_size;
}
/* Return the nthreads_var ICV.  */
int
omp_get_max_threads (void)
{
  return gomp_icv (false)->nthreads_var;
}
/* Return the thread_limit_var ICV, reporting values above INT_MAX as
   INT_MAX.  */
int
omp_get_thread_limit (void)
{
  struct gomp_task_icv *icv = gomp_icv (false);

  if (icv->thread_limit_var > INT_MAX)
    return INT_MAX;
  return icv->thread_limit_var;
}
/* Store MAX_LEVELS in gomp_max_active_levels_var; negative values are
   ignored.  */
void
omp_set_max_active_levels (int max_levels)
{
  if (max_levels < 0)
    return;
  gomp_max_active_levels_var = max_levels;
}
/* Return gomp_max_active_levels_var.  */
int
omp_get_max_active_levels (void)
{
  return gomp_max_active_levels_var;
}
/* Return gomp_cancel_var.  */
int
omp_get_cancellation (void)
{
  return gomp_cancel_var;
}
/* Return gomp_max_task_priority_var.  */
int
omp_get_max_task_priority (void)
{
  return gomp_max_task_priority_var;
}
/* Return the bind_var ICV.  */
omp_proc_bind_t
omp_get_proc_bind (void)
{
  return gomp_icv (false)->bind_var;
}
/* Store DEVICE_NUM in the default_device_var ICV; negative values are
   stored as 0.  */
void
omp_set_default_device (int device_num)
{
  struct gomp_task_icv *icv = gomp_icv (true);
  int chosen = 0;

  if (device_num >= 0)
    chosen = device_num;
  icv->default_device_var = chosen;
}
/* Return the default_device_var ICV.  */
int
omp_get_default_device (void)
{
  return gomp_icv (false)->default_device_var;
}
/* Return whatever gomp_get_num_devices reports.  */
int
omp_get_num_devices (void)
{
  int device_count = gomp_get_num_devices ();

  return device_count;
}
/* Always reports a single team.  */
int
omp_get_num_teams (void)
{
  /* Hardcoded to 1 on host, MIC, HSAIL?  Maybe variable on PTX.  */
  enum { fixed_team_count = 1 };

  return fixed_team_count;
}
/* Always reports team number 0.  */
int
omp_get_team_num (void)
{
  /* Hardcoded to 0 on host, MIC, HSAIL?  Maybe variable on PTX.  */
  enum { fixed_team_number = 0 };

  return fixed_team_number;
}
/* Always answers 1.  */
int
omp_is_initial_device (void)
{
  /* Hardcoded to 1 on host, should be 0 on MIC, HSAIL, PTX.  */
  enum { on_initial_device = 1 };

  return on_initial_device;
}
/* Return the constant GOMP_DEVICE_HOST_FALLBACK.  */
int
omp_get_initial_device (void)
{
  int initial_device = GOMP_DEVICE_HOST_FALLBACK;

  return initial_device;
}
/* Return gomp_places_list_len.  */
int
omp_get_num_places (void)
{
  return gomp_places_list_len;
}
/* Return the calling thread's place field minus one, or -1 when
   gomp_places_list is NULL.  A place field of zero triggers
   gomp_init_affinity before the field is read for the result.  */
int
omp_get_place_num (void)
{
  if (!gomp_places_list)
    return -1;

  struct gomp_thread *self = gomp_thread ();

  if (self->place == 0)
    gomp_init_affinity ();

  int place = (int) self->place;

  return place - 1;
}
/* Return the calling thread's ts.place_partition_len, or 0 when
   gomp_places_list is NULL.  A place field of zero triggers
   gomp_init_affinity before the length is read.  */
int
omp_get_partition_num_places (void)
{
  if (!gomp_places_list)
    return 0;

  struct gomp_thread *self = gomp_thread ();

  if (self->place == 0)
    gomp_init_affinity ();

  return self->ts.place_partition_len;
}
/* Fill PLACE_NUMS with consecutive numbers starting at the calling thread's
   ts.place_partition_off, writing ts.place_partition_len entries.  Nothing
   is written when gomp_places_list is NULL.  A place field of zero triggers
   gomp_init_affinity first.  */
void
omp_get_partition_place_nums (int *place_nums)
{
  if (!gomp_places_list)
    return;

  struct gomp_thread *self = gomp_thread ();

  if (self->place == 0)
    gomp_init_affinity ();

  for (unsigned int slot = 0; slot < self->ts.place_partition_len; slot++)
    place_nums[slot] = self->ts.place_partition_off + slot;
}
/* Apply the ialias macro to every public entry point defined above.
   NOTE(review): ialias is defined outside this chunk; presumably it creates
   library-internal aliases -- confirm in libgomp.h.  */
ialias (omp_set_dynamic)
ialias (omp_set_nested)
ialias (omp_set_num_threads)
ialias (omp_get_dynamic)
ialias (omp_get_nested)
ialias (omp_set_schedule)
ialias (omp_get_schedule)
ialias (omp_get_max_threads)
ialias (omp_get_thread_limit)
ialias (omp_set_max_active_levels)
ialias (omp_get_max_active_levels)
ialias (omp_get_cancellation)
ialias (omp_get_proc_bind)
ialias (omp_set_default_device)
ialias (omp_get_default_device)
ialias (omp_get_num_devices)
ialias (omp_get_num_teams)
ialias (omp_get_team_num)
ialias (omp_is_initial_device)
ialias (omp_get_initial_device)
ialias (omp_get_max_task_priority)
ialias (omp_get_num_places)
ialias (omp_get_place_num)
ialias (omp_get_partition_num_places)
ialias (omp_get_partition_place_nums)

77
libgomp/icv-device.c Normal file
View File

@ -0,0 +1,77 @@
/* Copyright (C) 2005-2016 Free Software Foundation, Inc.
Contributed by Richard Henderson <rth@redhat.com>.
This file is part of the GNU Offloading and Multi Processing Library
(libgomp).
Libgomp is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.
You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
/* This file defines OpenMP API entry points that accelerator targets are
expected to replace. */
#include "libgomp.h"
/* Set the default_device_var ICV of the current task to DEVICE_NUM.
   Negative values are replaced by 0.  */
void
omp_set_default_device (int device_num)
{
  struct gomp_task_icv *task_icv = gomp_icv (true);

  if (device_num >= 0)
    task_icv->default_device_var = device_num;
  else
    task_icv->default_device_var = 0;
}
int
omp_get_default_device (void)
{
struct gomp_task_icv *icv = gomp_icv (false);
return icv->default_device_var;
}
/* Return the device count reported by gomp_get_num_devices.  */
int
omp_get_num_devices (void)
{
return gomp_get_num_devices ();
}
/* Report the number of teams.  Hardcoded to 1 on host, MIC, HSAIL?
   Maybe variable on PTX.  */
int
omp_get_num_teams (void)
{
  enum { fixed_num_teams = 1 };

  return fixed_num_teams;
}
/* Report the number of the calling team.  Hardcoded to 0 on host, MIC,
   HSAIL?  Maybe variable on PTX.  */
int
omp_get_team_num (void)
{
  enum { fixed_team_num = 0 };

  return fixed_team_num;
}
/* Report whether the caller runs on the initial device.  Hardcoded to 1 on
   host, should be 0 on MIC, HSAIL, PTX.  */
int
omp_is_initial_device (void)
{
  enum { on_initial_device = 1 };

  return on_initial_device;
}
/* One ialias invocation per entry point defined in this file.  The ialias
   macro is defined outside this view; presumably it creates an internal
   alias for each named function -- confirm against libgomp.h.  */
ialias (omp_set_default_device)
ialias (omp_get_default_device)
ialias (omp_get_num_devices)
ialias (omp_get_num_teams)
ialias (omp_get_team_num)
ialias (omp_is_initial_device)

248
libgomp/icv.c Normal file
View File

@ -0,0 +1,248 @@
/* Copyright (C) 2005-2016 Free Software Foundation, Inc.
Contributed by Richard Henderson <rth@redhat.com>.
This file is part of the GNU Offloading and Multi Processing Library
(libgomp).
Libgomp is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.
You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
/* This file defines the OpenMP internal control variables and associated
OpenMP API entry points. */
#include "libgomp.h"
#include "gomp-constants.h"
#include <limits.h>
/* The global ICV set together with its initial values.  gomp_icv can return
   the address of this object, and gomp_new_icv passes it to gomp_init_task
   when it creates a task for the calling thread.  */
struct gomp_task_icv gomp_global_icv = {
.nthreads_var = 1,
.thread_limit_var = UINT_MAX,
.run_sched_var = GFS_DYNAMIC,
.run_sched_chunk_size = 1,
.default_device_var = 0,
.dyn_var = false,
.nest_var = false,
.bind_var = omp_proc_bind_false,
.target_data = NULL
};
/* Written by omp_set_max_active_levels, read by
   omp_get_max_active_levels.  */
unsigned long gomp_max_active_levels_var = INT_MAX;
/* Returned by omp_get_cancellation.  */
bool gomp_cancel_var = false;
/* Returned by omp_get_max_task_priority.  */
int gomp_max_task_priority_var = 0;
/* Only defined when HAVE_SYNC_BUILTINS is not; team.c takes this lock
   around updates of gomp_managed_threads in that configuration.  */
#ifndef HAVE_SYNC_BUILTINS
gomp_mutex_t gomp_managed_threads_lock;
#endif
unsigned long gomp_available_cpus = 1, gomp_managed_threads = 1;
unsigned long long gomp_spin_count_var, gomp_throttled_spin_count_var;
unsigned long *gomp_nthreads_var_list, gomp_nthreads_var_list_len;
char *gomp_bind_var_list;
unsigned long gomp_bind_var_list_len;
/* Place list and its length.  A NULL list makes the omp_get_place_num
   family of functions return early; the length is returned by
   omp_get_num_places.  */
void **gomp_places_list;
unsigned long gomp_places_list_len;
int gomp_debug_var;
/* NOTE(review): no user of gomp_num_teams_var is visible in this file;
   presumably consumed by the teams support elsewhere -- confirm.  */
unsigned int gomp_num_teams_var;
char *goacc_device_type;
int goacc_device_num;
/* Set the nthreads_var ICV of the current task to N.  Values that are not
   positive are replaced by 1.  */
void
omp_set_num_threads (int n)
{
  struct gomp_task_icv *task_icv = gomp_icv (true);

  if (n > 0)
    task_icv->nthreads_var = n;
  else
    task_icv->nthreads_var = 1;
}
void
omp_set_dynamic (int val)
{
struct gomp_task_icv *icv = gomp_icv (true);
icv->dyn_var = val;
}
int
omp_get_dynamic (void)
{
struct gomp_task_icv *icv = gomp_icv (false);
return icv->dyn_var;
}
void
omp_set_nested (int val)
{
struct gomp_task_icv *icv = gomp_icv (true);
icv->nest_var = val;
}
int
omp_get_nested (void)
{
struct gomp_task_icv *icv = gomp_icv (false);
return icv->nest_var;
}
/* Set the run-time schedule ICVs from KIND and CHUNK_SIZE.

   - omp_sched_static: a chunk size below 1 is stored as 0.
   - omp_sched_dynamic, omp_sched_guided: a chunk size below 1 is stored
     as 1.
   - omp_sched_auto: only the kind is stored; the chunk size is left alone.
   - any other KIND: nothing is changed.  */
void
omp_set_schedule (omp_sched_t kind, int chunk_size)
{
  struct gomp_task_icv *task_icv = gomp_icv (true);
  int smallest_chunk;

  if (kind == omp_sched_static)
    smallest_chunk = 0;
  else if (kind == omp_sched_dynamic || kind == omp_sched_guided)
    smallest_chunk = 1;
  else if (kind == omp_sched_auto)
    {
      task_icv->run_sched_var = kind;
      return;
    }
  else
    return;

  task_icv->run_sched_chunk_size
    = chunk_size < 1 ? smallest_chunk : chunk_size;
  task_icv->run_sched_var = kind;
}
/* Copy the run_sched_var ICV into *KIND and the run_sched_chunk_size ICV
   into *CHUNK_SIZE.  */
void
omp_get_schedule (omp_sched_t *kind, int *chunk_size)
{
  const struct gomp_task_icv *task_icv = gomp_icv (false);

  *kind = task_icv->run_sched_var;
  *chunk_size = task_icv->run_sched_chunk_size;
}
int
omp_get_max_threads (void)
{
struct gomp_task_icv *icv = gomp_icv (false);
return icv->nthreads_var;
}
/* Return the thread_limit_var ICV, capped at INT_MAX so that the result
   fits the int return type.  */
int
omp_get_thread_limit (void)
{
  struct gomp_task_icv *task_icv = gomp_icv (false);

  if (task_icv->thread_limit_var > INT_MAX)
    return INT_MAX;
  return task_icv->thread_limit_var;
}
/* Store MAX_LEVELS into gomp_max_active_levels_var.  Negative values are
   ignored and leave the variable unchanged.  */
void
omp_set_max_active_levels (int max_levels)
{
  if (max_levels < 0)
    return;

  gomp_max_active_levels_var = max_levels;
}
/* Return gomp_max_active_levels_var (an unsigned long), converted to
   int.  */
int
omp_get_max_active_levels (void)
{
return gomp_max_active_levels_var;
}
/* Return gomp_cancel_var (a bool), converted to int.  */
int
omp_get_cancellation (void)
{
return gomp_cancel_var;
}
/* Return the value of gomp_max_task_priority_var.  */
int
omp_get_max_task_priority (void)
{
return gomp_max_task_priority_var;
}
omp_proc_bind_t
omp_get_proc_bind (void)
{
struct gomp_task_icv *icv = gomp_icv (false);
return icv->bind_var;
}
/* Return the device number used for the initial device, which is the
   constant GOMP_DEVICE_HOST_FALLBACK.  */
int
omp_get_initial_device (void)
{
return GOMP_DEVICE_HOST_FALLBACK;
}
/* Return gomp_places_list_len (an unsigned long), converted to int.  */
int
omp_get_num_places (void)
{
return gomp_places_list_len;
}
/* Return the zero-based place number of the calling thread, or -1 when
   gomp_places_list is NULL.  If the thread's place field is still 0,
   gomp_init_affinity is run before the field is read back.  */
int
omp_get_place_num (void)
{
  struct gomp_thread *self;

  if (!gomp_places_list)
    return -1;

  self = gomp_thread ();
  if (!self->place)
    gomp_init_affinity ();

  /* The place field is one-based; convert to a zero-based number.  */
  return (int) self->place - 1;
}
/* Return the length of the calling thread's place partition
   (ts.place_partition_len), or 0 when gomp_places_list is NULL.  If the
   thread's place field is still 0, gomp_init_affinity is run first.  */
int
omp_get_partition_num_places (void)
{
  struct gomp_thread *self;

  if (!gomp_places_list)
    return 0;

  self = gomp_thread ();
  if (!self->place)
    gomp_init_affinity ();

  return self->ts.place_partition_len;
}
/* Store into PLACE_NUMS the consecutive place numbers of the calling
   thread's place partition: ts.place_partition_len entries starting at
   ts.place_partition_off.  Nothing is written when gomp_places_list is
   NULL.  If the thread's place field is still 0, gomp_init_affinity is run
   first.  */
void
omp_get_partition_place_nums (int *place_nums)
{
  struct gomp_thread *self;
  unsigned int idx;

  if (!gomp_places_list)
    return;

  self = gomp_thread ();
  if (!self->place)
    gomp_init_affinity ();

  idx = 0;
  while (idx < self->ts.place_partition_len)
    {
      place_nums[idx] = self->ts.place_partition_off + idx;
      idx++;
    }
}
/* One ialias invocation per entry point defined in this file.  The ialias
   macro is defined outside this view; presumably it creates an internal
   alias for each named function -- confirm against libgomp.h.  */
ialias (omp_set_dynamic)
ialias (omp_set_nested)
ialias (omp_set_num_threads)
ialias (omp_get_dynamic)
ialias (omp_get_nested)
ialias (omp_set_schedule)
ialias (omp_get_schedule)
ialias (omp_get_max_threads)
ialias (omp_get_thread_limit)
ialias (omp_set_max_active_levels)
ialias (omp_get_max_active_levels)
ialias (omp_get_cancellation)
ialias (omp_get_proc_bind)
ialias (omp_get_initial_device)
ialias (omp_get_max_task_priority)
ialias (omp_get_num_places)
ialias (omp_get_place_num)
ialias (omp_get_partition_num_places)
ialias (omp_get_partition_place_nums)

View File

@ -45,7 +45,9 @@
#include "gstdint.h"
#include "libgomp-plugin.h"
#ifdef HAVE_PTHREAD_H
#include <pthread.h>
#endif
#include <stdbool.h>
#include <stdlib.h>
#include <stdarg.h>
@ -122,6 +124,7 @@ struct htab;
#include "sem.h"
#include "mutex.h"
#include "bar.h"
#include "simple-bar.h"
#include "ptrlock.h"
@ -360,6 +363,7 @@ extern char *gomp_bind_var_list;
extern unsigned long gomp_bind_var_list_len;
extern void **gomp_places_list;
extern unsigned long gomp_places_list_len;
extern unsigned int gomp_num_teams_var;
extern int gomp_debug_var;
extern int goacc_device_num;
extern char *goacc_device_type;
@ -626,8 +630,8 @@ struct gomp_thread_pool
/* Number of threads running in this contention group. */
unsigned long threads_busy;
/* This barrier holds and releases threads waiting in threads. */
gomp_barrier_t threads_dock;
/* This barrier holds and releases threads waiting in thread pools. */
gomp_simple_barrier_t threads_dock;
};
enum gomp_cancel_kind
@ -642,7 +646,15 @@ enum gomp_cancel_kind
/* ... and here is that TLS data. */
#if defined HAVE_TLS || defined USE_EMUTLS
#if defined __nvptx__
/* On nvptx the gomp_thread structures are reached through nvptx_thrs,
   which is declared with the "shared" attribute, instead of through
   thread-local storage.  */
extern struct gomp_thread *nvptx_thrs __attribute__((shared));
/* Return the calling thread's gomp_thread: the element of nvptx_thrs
   indexed by the value read from the %tid.y special register.  */
static inline struct gomp_thread *gomp_thread (void)
{
int tid;
/* Copy %tid.y into TID.  */
asm ("mov.u32 %0, %%tid.y;" : "=r" (tid));
return nvptx_thrs + tid;
}
#elif defined HAVE_TLS || defined USE_EMUTLS
extern __thread struct gomp_thread gomp_tls_data;
static inline struct gomp_thread *gomp_thread (void)
{
@ -671,17 +683,21 @@ static inline struct gomp_task_icv *gomp_icv (bool write)
return &gomp_global_icv;
}
#ifdef LIBGOMP_USE_PTHREADS
/* The attributes to be used during thread creation. */
extern pthread_attr_t gomp_thread_attr;
extern pthread_key_t gomp_thread_destructor;
#endif
/* Function prototypes. */
/* affinity.c */
extern void gomp_init_affinity (void);
#ifdef LIBGOMP_USE_PTHREADS
extern void gomp_init_thread_affinity (pthread_attr_t *, unsigned int);
#endif
extern void **gomp_affinity_alloc (unsigned long, bool);
extern void gomp_affinity_init_place (void *);
extern bool gomp_affinity_add_cpus (void *, unsigned long, unsigned long,

123
libgomp/lock.c Normal file
View File

@ -0,0 +1,123 @@
/* Copyright (C) 2005-2016 Free Software Foundation, Inc.
Contributed by Richard Henderson <rth@redhat.com>.
This file is part of the GNU Offloading and Multi Processing Library
(libgomp).
Libgomp is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.
You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
/* This is a generic implementation of the public OpenMP locking primitives in
terms of internal gomp_mutex_t. It is not meant to be compiled on its own.
It is #include'd from config/{linux,nvptx}/lock.c. */
#include <string.h>
#include "libgomp.h"
/* The internal gomp_mutex_t and the external non-recursive omp_lock_t
have the same form. Re-use it. */
/* Initialize LOCK by passing it straight to gomp_mutex_init.  */
void
gomp_init_lock_30 (omp_lock_t *lock)
{
gomp_mutex_init (lock);
}
/* Destroy LOCK by passing it straight to gomp_mutex_destroy.  */
void
gomp_destroy_lock_30 (omp_lock_t *lock)
{
gomp_mutex_destroy (lock);
}
/* Acquire LOCK by passing it straight to gomp_mutex_lock.  */
void
gomp_set_lock_30 (omp_lock_t *lock)
{
gomp_mutex_lock (lock);
}
/* Release LOCK by passing it straight to gomp_mutex_unlock.  */
void
gomp_unset_lock_30 (omp_lock_t *lock)
{
gomp_mutex_unlock (lock);
}
/* Try to acquire LOCK without blocking: atomically replace the value 0 by
   1 with acquire ordering.  Return nonzero if the exchange succeeded and
   zero if LOCK held any other value.  */
int
gomp_test_lock_30 (omp_lock_t *lock)
{
  int expected = 0;
  bool acquired
    = __atomic_compare_exchange_n (lock, &expected, 1, false,
				   MEMMODEL_ACQUIRE, MEMMODEL_RELAXED);

  return acquired;
}
/* Initialize the nestable lock LOCK by zero-filling the whole object.  */
void
gomp_init_nest_lock_30 (omp_nest_lock_t *lock)
{
memset (lock, '\0', sizeof (*lock));
}
/* Destroy the nestable lock LOCK.  This implementation has nothing to
   release, so the body is intentionally empty.  */
void
gomp_destroy_nest_lock_30 (omp_nest_lock_t *lock)
{
}
/* Acquire the nestable lock LOCK.  The owner is identified by the pointer
   that gomp_icv (true) returns.  A caller that already owns the lock only
   gets its nesting count bumped; any other caller first takes the inner
   mutex and records itself as the owner.  */
void
gomp_set_nest_lock_30 (omp_nest_lock_t *lock)
{
  void *self = gomp_icv (true);

  if (lock->owner == self)
    {
      lock->count++;
      return;
    }

  gomp_mutex_lock (&lock->lock);
  lock->owner = self;
  lock->count++;
}
/* Drop one nesting level of LOCK.  When the count reaches zero the owner
   is cleared and the inner mutex is released.  */
void
gomp_unset_nest_lock_30 (omp_nest_lock_t *lock)
{
  lock->count--;
  if (lock->count != 0)
    return;

  lock->owner = NULL;
  gomp_mutex_unlock (&lock->lock);
}
/* Try to acquire the nestable lock LOCK without blocking.  Return the new
   nesting count on success (1 for a fresh acquisition) and 0 when the lock
   is held by somebody else.  The owner is identified by the pointer that
   gomp_icv (true) returns.  */
int
gomp_test_nest_lock_30 (omp_nest_lock_t *lock)
{
  void *self = gomp_icv (true);

  /* Re-acquisition by the current owner just deepens the nesting.  */
  if (lock->owner == self)
    {
      lock->count++;
      return lock->count;
    }

  int expected = 0;
  bool acquired
    = __atomic_compare_exchange_n (&lock->lock, &expected, 1, false,
				   MEMMODEL_ACQUIRE, MEMMODEL_RELAXED);
  if (!acquired)
    return 0;

  lock->owner = self;
  lock->count = 1;
  return 1;
}

View File

@ -41,6 +41,7 @@
#include <cuda.h>
#include <stdbool.h>
#include <stdint.h>
#include <limits.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
@ -274,6 +275,8 @@ struct targ_fn_descriptor
{
CUfunction fn;
const struct targ_fn_launch *launch;
int regs_per_thread;
int max_threads_per_block;
};
/* A loaded PTX image. */
@ -307,8 +310,12 @@ struct ptx_device
bool overlap;
bool map;
bool concur;
int mode;
bool mkern;
int mode;
int clock_khz;
int num_sms;
int regs_per_block;
int regs_per_sm;
struct ptx_image_data *images; /* Images loaded on device. */
pthread_mutex_t image_lock; /* Lock for above list. */
@ -658,6 +665,39 @@ nvptx_open_device (int n)
&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
ptx_dev->mkern = pi;
CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
&pi, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
ptx_dev->clock_khz = pi;
CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
&pi, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
ptx_dev->num_sms = pi;
CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
&pi, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
ptx_dev->regs_per_block = pi;
/* CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82 is defined only
in CUDA 6.0 and newer. */
r = cuDeviceGetAttribute (&pi, 82, dev);
/* Fallback: use limit of registers per block, which is usually equal. */
if (r == CUDA_ERROR_INVALID_VALUE)
pi = ptx_dev->regs_per_block;
else if (r != CUDA_SUCCESS)
{
GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
return NULL;
}
ptx_dev->regs_per_sm = pi;
CUDA_CALL_ERET (NULL, cuDeviceGetAttribute,
&pi, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
if (pi != 32)
{
GOMP_PLUGIN_error ("Only warp size 32 is supported");
return NULL;
}
r = cuDeviceGetAttribute (&async_engines,
CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
if (r != CUDA_SUCCESS)
@ -725,10 +765,8 @@ link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
CUjit_option opts[6];
void *optvals[6];
float elapsed = 0.0;
#define LOGSIZE 8192
char elog[LOGSIZE];
char ilog[LOGSIZE];
unsigned long logsize = LOGSIZE;
char elog[1024];
char ilog[16384];
CUlinkState linkstate;
CUresult r;
void *linkout;
@ -741,13 +779,13 @@ link_ptx (CUmodule *module, const struct targ_ptx_obj *ptx_objs,
optvals[1] = &ilog[0];
opts[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
optvals[2] = (void *) logsize;
optvals[2] = (void *) sizeof ilog;
opts[3] = CU_JIT_ERROR_LOG_BUFFER;
optvals[3] = &elog[0];
opts[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
optvals[4] = (void *) logsize;
optvals[4] = (void *) sizeof elog;
opts[5] = CU_JIT_LOG_VERBOSE;
optvals[5] = (void *) 1;
@ -1164,7 +1202,7 @@ nvptx_host2dev (void *d, const void *h, size_t s)
}
#ifndef DISABLE_ASYNC
if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
{
CUevent *e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));
CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
@ -1220,7 +1258,7 @@ nvptx_dev2host (void *h, const void *d, size_t s)
}
#ifndef DISABLE_ASYNC
if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
if (nvthd && nvthd->current_stream != nvthd->ptx_dev->null_stream)
{
CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
CUDA_CALL (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
@ -1518,7 +1556,7 @@ GOMP_OFFLOAD_get_name (void)
/* Return the capability flags of this plugin: both OpenACC
   (GOMP_OFFLOAD_CAP_OPENACC_200) and OpenMP (GOMP_OFFLOAD_CAP_OPENMP_400)
   offloading are advertised.  */
unsigned int
GOMP_OFFLOAD_get_caps (void)
{
  return GOMP_OFFLOAD_CAP_OPENACC_200 | GOMP_OFFLOAD_CAP_OPENMP_400;
}
int
@ -1588,6 +1626,23 @@ GOMP_OFFLOAD_version (void)
return GOMP_VERSION;
}
/* Initialize __nvptx_clocktick, if present in MODULE.  The value written to
   the device is 1e-3 / DEV->clock_khz.  A module without the symbol is left
   untouched; any other CUDA failure is fatal.  */
static void
nvptx_set_clocktick (CUmodule module, struct ptx_device *dev)
{
  CUdeviceptr tick_addr;
  CUresult status
    = cuModuleGetGlobal (&tick_addr, NULL, module, "__nvptx_clocktick");

  switch (status)
    {
    case CUDA_SUCCESS:
      break;
    case CUDA_ERROR_NOT_FOUND:
      /* The image does not define the variable; nothing to set.  */
      return;
    default:
      GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (status));
      break;
    }

  double tick = 1e-3 / dev->clock_khz;
  status = cuMemcpyHtoD (tick_addr, &tick, sizeof (tick));
  if (status != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (status));
}
/* Load the (partial) program described by TARGET_DATA to device
number ORD. Allocate and return TARGET_TABLE. */
@ -1648,12 +1703,19 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
{
CUfunction function;
int nregs, mthrs;
CUDA_CALL_ERET (-1, cuModuleGetFunction, &function, module,
fn_descs[i].fn);
CUDA_CALL_ERET (-1, cuFuncGetAttribute, &nregs,
CU_FUNC_ATTRIBUTE_NUM_REGS, function);
CUDA_CALL_ERET (-1, cuFuncGetAttribute, &mthrs,
CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, function);
targ_fns->fn = function;
targ_fns->launch = &fn_descs[i];
targ_fns->regs_per_thread = nregs;
targ_fns->max_threads_per_block = mthrs;
targ_tbl->start = (uintptr_t) targ_fns;
targ_tbl->end = targ_tbl->start + 1;
@ -1671,6 +1733,8 @@ GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
targ_tbl->end = targ_tbl->start + bytes;
}
nvptx_set_clocktick (module, dev);
return fn_entries + var_entries;
}
@ -1736,6 +1800,15 @@ GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
&& nvptx_host2dev (dst, src, n));
}
bool
GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
{
struct ptx_device *ptx_dev = ptx_devices[ord];
CUDA_CALL (cuMemcpyDtoDAsync, (CUdeviceptr) dst, (CUdeviceptr) src, n,
ptx_dev->null_stream->stream);
return true;
}
void (*device_run) (int n, void *fn_ptr, void *vars) = NULL;
void
@ -1857,3 +1930,123 @@ GOMP_OFFLOAD_openacc_set_cuda_stream (int async, void *stream)
{
return nvptx_set_cuda_stream (async, stream);
}
/* Adjust launch dimensions: pick good values for number of blocks and warps
   and ensure that number of warps does not exceed CUDA limits as well as GCC's
   own limits.

   FN supplies the per-kernel register usage and thread limit, PTX_DEV the
   per-device register file size and multiprocessor count.  On entry
   *TEAMS_P and *THREADS_P hold the requested number of blocks and of warps
   per block (values <= 0 mean "no request"); on return both are at least 1.  */
static void
nvptx_adjust_launch_bounds (struct targ_fn_descriptor *fn,
			    struct ptx_device *ptx_dev,
			    int *teams_p, int *threads_p)
{
  int max_warps_block = fn->max_threads_per_block / 32;
  /* Maximum 32 warps per block is an implementation limit in NVPTX backend
     and libgcc, which matches documented limit of all GPUs as of 2015.  */
  if (max_warps_block > 32)
    max_warps_block = 32;
  /* A kernel limited to fewer than 32 threads per block would give zero
     warps here, and zero warps leads to a division by zero below.  Ask for
     one warp and let the launch itself report the problem.  */
  if (max_warps_block < 1)
    max_warps_block = 1;

  if (*threads_p <= 0)
    *threads_p = 8;
  if (*threads_p > max_warps_block)
    *threads_p = max_warps_block;

  int regs_per_block = fn->regs_per_thread * 32 * *threads_p;
  /* This is an estimate of how many blocks the device can host simultaneously.
     Actual limit, which may be lower, can be queried with "occupancy control"
     driver interface (since CUDA 6.0).  */
  int max_blocks;
  if (regs_per_block > 0)
    max_blocks = ptx_dev->regs_per_sm / regs_per_block * ptx_dev->num_sms;
  else
    /* No register usage was reported, so registers impose no limit; fall
       back to one block per multiprocessor rather than dividing by zero.  */
    max_blocks = ptx_dev->num_sms;
  /* Never hand a zero-sized grid to the launch.  */
  if (max_blocks < 1)
    max_blocks = 1;

  if (*teams_p <= 0 || *teams_p > max_blocks)
    *teams_p = max_blocks;
}
/* Return the size of per-warp stacks (see gcc -msoft-stack) to use for OpenMP
   target regions.  The size is currently a fixed 128 KiB.  */
static size_t
nvptx_stacks_size (void)
{
  return 128 * 1024;
}
/* Return contiguous storage for NUM stacks, each SIZE bytes. */
static void *
nvptx_stacks_alloc (size_t size, int num)
{
CUdeviceptr stacks;
CUresult r = cuMemAlloc (&stacks, size * num);
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuMemAlloc error: %s", cuda_error (r));
return (void *) stacks;
}
/* Release storage previously allocated by nvptx_stacks_alloc. */
static void
nvptx_stacks_free (void *p, int num)
{
CUresult r = cuMemFree ((CUdeviceptr) p);
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuMemFree error: %s", cuda_error (r));
}
/* Run the target function TGT_FN on device number ORD and wait for it to
   finish.  TGT_FN points to a struct targ_fn_descriptor, TGT_VARS is passed
   through to the kernel as its first argument, and ARGS is a NULL-terminated
   list of encoded target arguments from which the requested number of teams
   and the thread limit are extracted.  Every failure is fatal.  */
void
GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
{
CUfunction function = ((struct targ_fn_descriptor *) tgt_fn)->fn;
CUresult r;
struct ptx_device *ptx_dev = ptx_devices[ord];
const char *maybe_abort_msg = "(perhaps abort was called)";
/* Zero means "not requested"; nvptx_adjust_launch_bounds picks a value.  */
int teams = 0, threads = 0;
if (!args)
GOMP_PLUGIN_fatal ("No target arguments provided");
/* Walk the argument list.  Each entry starts with an id word; its value is
   either stored in the following element or packed into the id itself.  */
while (*args)
{
intptr_t id = (intptr_t) *args++, val;
if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
val = (intptr_t) *args++;
else
val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
/* Only arguments addressed to all devices are honored here.  */
if ((id & GOMP_TARGET_ARG_DEVICE_MASK) != GOMP_TARGET_ARG_DEVICE_ALL)
continue;
/* Clamp so that the value fits the int variables below.  */
val = val > INT_MAX ? INT_MAX : val;
id &= GOMP_TARGET_ARG_ID_MASK;
if (id == GOMP_TARGET_ARG_NUM_TEAMS)
teams = val;
else if (id == GOMP_TARGET_ARG_THREAD_LIMIT)
threads = val;
}
nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);
/* One stack per launched warp: TEAMS blocks of THREADS warps each.  */
size_t stack_size = nvptx_stacks_size ();
void *stacks = nvptx_stacks_alloc (stack_size, teams * threads);
/* The kernel arguments are handed over as one packed buffer.  */
void *fn_args[] = {tgt_vars, stacks, (void *) stack_size};
size_t fn_args_size = sizeof fn_args;
void *config[] = {
CU_LAUNCH_PARAM_BUFFER_POINTER, fn_args,
CU_LAUNCH_PARAM_BUFFER_SIZE, &fn_args_size,
CU_LAUNCH_PARAM_END
};
/* Grid is TEAMS x 1 x 1 blocks; each block is 32 x THREADS x 1 threads.  */
r = cuLaunchKernel (function,
teams, 1, 1,
32, threads, 1,
0, ptx_dev->null_stream->stream, NULL, config);
if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));
/* Wait for the kernel; the stacks must stay allocated until it is done.  */
r = cuCtxSynchronize ();
if (r == CUDA_ERROR_LAUNCH_FAILED)
GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
maybe_abort_msg);
else if (r != CUDA_SUCCESS)
GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
nvptx_stacks_free (stacks, teams * threads);
}
/* Asynchronous variant of GOMP_OFFLOAD_run.  It is not implemented for this
   plugin: every call ends in GOMP_PLUGIN_fatal and none of the arguments is
   looked at.  */
void
GOMP_OFFLOAD_async_run (int ord, void *tgt_fn, void *tgt_vars, void **args,
			void *async_data)
{
  (void) ord;
  (void) tgt_fn;
  (void) tgt_vars;
  (void) args;
  (void) async_data;

  GOMP_PLUGIN_fatal ("GOMP_OFFLOAD_async_run unimplemented");
}

View File

@ -31,6 +31,7 @@
#include <stdlib.h>
#include <string.h>
#ifdef LIBGOMP_USE_PTHREADS
/* This attribute contains PTHREAD_CREATE_DETACHED. */
pthread_attr_t gomp_thread_attr;
@ -110,7 +111,7 @@ gomp_thread_start (void *xdata)
{
pool->threads[thr->ts.team_id] = thr;
gomp_barrier_wait (&pool->threads_dock);
gomp_simple_barrier_wait (&pool->threads_dock);
do
{
struct gomp_team *team = thr->ts.team;
@ -120,7 +121,7 @@ gomp_thread_start (void *xdata)
gomp_team_barrier_wait_final (&team->barrier);
gomp_finish_task (task);
gomp_barrier_wait (&pool->threads_dock);
gomp_simple_barrier_wait (&pool->threads_dock);
local_fn = thr->fn;
local_data = thr->data;
@ -134,6 +135,7 @@ gomp_thread_start (void *xdata)
thr->task = NULL;
return NULL;
}
#endif
static inline struct gomp_team *
get_last_team (unsigned nthreads)
@ -224,11 +226,17 @@ gomp_free_pool_helper (void *thread_pool)
struct gomp_thread *thr = gomp_thread ();
struct gomp_thread_pool *pool
= (struct gomp_thread_pool *) thread_pool;
gomp_barrier_wait_last (&pool->threads_dock);
gomp_simple_barrier_wait_last (&pool->threads_dock);
gomp_sem_destroy (&thr->release);
thr->thread_pool = NULL;
thr->task = NULL;
#ifdef LIBGOMP_USE_PTHREADS
pthread_exit (NULL);
#elif defined(__nvptx__)
asm ("exit;");
#else
#error gomp_free_pool_helper must terminate the thread
#endif
}
/* Free a thread pool and release its threads. */
@ -250,12 +258,12 @@ gomp_free_thread (void *arg __attribute__((unused)))
nthr->data = pool;
}
/* This barrier undocks threads docked on pool->threads_dock. */
gomp_barrier_wait (&pool->threads_dock);
gomp_simple_barrier_wait (&pool->threads_dock);
/* And this waits till all threads have called
gomp_simple_barrier_wait_last in gomp_free_pool_helper. */
gomp_barrier_wait (&pool->threads_dock);
gomp_simple_barrier_wait (&pool->threads_dock);
/* Now it is safe to destroy the barrier and free the pool. */
gomp_barrier_destroy (&pool->threads_dock);
gomp_simple_barrier_destroy (&pool->threads_dock);
#ifdef HAVE_SYNC_BUILTINS
__sync_fetch_and_add (&gomp_managed_threads,
@ -266,10 +274,12 @@ gomp_free_thread (void *arg __attribute__((unused)))
gomp_mutex_unlock (&gomp_managed_threads_lock);
#endif
}
free (pool->threads);
if (pool->last_team)
free_team (pool->last_team);
#ifndef __nvptx__
free (pool->threads);
free (pool);
#endif
thr->thread_pool = NULL;
}
if (thr->ts.level == 0 && __builtin_expect (thr->ts.team != NULL, 0))
@ -284,6 +294,7 @@ gomp_free_thread (void *arg __attribute__((unused)))
/* Launch a team. */
#ifdef LIBGOMP_USE_PTHREADS
void
gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
unsigned flags, struct gomp_team *team)
@ -429,7 +440,7 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
else if (old_threads_used == 0)
{
n = 0;
gomp_barrier_init (&pool->threads_dock, nthreads);
gomp_simple_barrier_init (&pool->threads_dock, nthreads);
}
else
{
@ -437,7 +448,7 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
/* Increase the barrier threshold to make sure all new
threads arrive before the team is released. */
gomp_barrier_reinit (&pool->threads_dock, nthreads);
gomp_simple_barrier_reinit (&pool->threads_dock, nthreads);
}
/* Not true yet, but soon will be. We're going to release all
@ -670,8 +681,8 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
threads and all the threads we're going to let die
arrive before the team is released. */
if (affinity_count)
gomp_barrier_reinit (&pool->threads_dock,
nthreads + affinity_count);
gomp_simple_barrier_reinit (&pool->threads_dock,
nthreads + affinity_count);
}
}
@ -812,7 +823,10 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
pthread_attr_destroy (&thread_attr);
do_release:
gomp_barrier_wait (nested ? &team->barrier : &pool->threads_dock);
if (nested)
gomp_barrier_wait (&team->barrier);
else
gomp_simple_barrier_wait (&pool->threads_dock);
/* Decrease the barrier threshold to match the number of threads
that should arrive back at the end of this team. The extra
@ -830,7 +844,7 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
if (affinity_count)
diff = -affinity_count;
gomp_barrier_reinit (&pool->threads_dock, nthreads);
gomp_simple_barrier_reinit (&pool->threads_dock, nthreads);
#ifdef HAVE_SYNC_BUILTINS
__sync_fetch_and_add (&gomp_managed_threads, diff);
@ -844,6 +858,7 @@ gomp_team_start (void (*fn) (void *), void *data, unsigned nthreads,
&& team->prev_ts.place_partition_len > 64)
free (affinity_thr);
}
#endif
/* Terminate the current team. This is only to be called by the master
@ -919,6 +934,7 @@ gomp_team_end (void)
}
}
#ifdef LIBGOMP_USE_PTHREADS
/* Constructors for this file. */
@ -943,6 +959,7 @@ team_destructor (void)
crashes. */
pthread_key_delete (gomp_thread_destructor);
}
#endif
struct gomp_task_icv *
gomp_new_icv (void)
@ -951,6 +968,8 @@ gomp_new_icv (void)
struct gomp_task *task = gomp_malloc (sizeof (struct gomp_task));
gomp_init_task (task, NULL, &gomp_global_icv);
thr->task = task;
#ifdef LIBGOMP_USE_PTHREADS
pthread_setspecific (gomp_thread_destructor, thr);
#endif
return &task->icv;
}

View File

@ -7,7 +7,7 @@ global ALWAYS_CFLAGS
set shlib_ext [get_shlib_extension]
set lang_library_path "../libgfortran/.libs"
set lang_link_flags "-lgfortran"
set lang_link_flags "-lgfortran -foffload=-lgfortran"
if [info exists lang_include_flags] then {
unset lang_include_flags
}

View File

@ -9,7 +9,7 @@ global ALWAYS_CFLAGS
set shlib_ext [get_shlib_extension]
set lang_library_path "../libgfortran/.libs"
set lang_link_flags "-lgfortran"
set lang_link_flags "-lgfortran -foffload=-lgfortran"
if [info exists lang_include_flags] then {
unset lang_include_flags
}