Optimize GCN OpenMP malloc performance

2019-11-13  Andrew Stubbs  <ams@codesourcery.com>

	libgomp/
	* config/gcn/team.c (gomp_gcn_enter_kernel): Set up the team arena
	and use team_malloc variants.
	(gomp_gcn_exit_kernel): Use team_free.
	* libgomp.h (TEAM_ARENA_SIZE): Define.
	(TEAM_ARENA_START): Define.
	(TEAM_ARENA_FREE): Define.
	(TEAM_ARENA_END): Define.
	(team_malloc): New function.
	(team_malloc_cleared): New function.
	(team_free): New function.
	* team.c (gomp_new_team): Initialize and use team_malloc.
	(free_team): Use team_free.
	(gomp_free_thread): Use team_free.
	(gomp_pause_host): Use team_free.
	* work.c (gomp_init_work_share): Use team_malloc.
	(gomp_fini_work_share): Use team_free.

From-SVN: r278136

diff --git a/libgomp/ChangeLog b/libgomp/ChangeLog
--- a/libgomp/ChangeLog
+++ b/libgomp/ChangeLog
@@ -1,3 +1,22 @@
+2019-11-13  Andrew Stubbs  <ams@codesourcery.com>
+
+	* config/gcn/team.c (gomp_gcn_enter_kernel): Set up the team arena
+	and use team_malloc variants.
+	(gomp_gcn_exit_kernel): Use team_free.
+	* libgomp.h (TEAM_ARENA_SIZE): Define.
+	(TEAM_ARENA_START): Define.
+	(TEAM_ARENA_FREE): Define.
+	(TEAM_ARENA_END): Define.
+	(team_malloc): New function.
+	(team_malloc_cleared): New function.
+	(team_free): New function.
+	* team.c (gomp_new_team): Initialize and use team_malloc.
+	(free_team): Use team_free.
+	(gomp_free_thread): Use team_free.
+	(gomp_pause_host): Use team_free.
+	* work.c (gomp_init_work_share): Use team_malloc.
+	(gomp_fini_work_share): Use team_free.
+
 2019-11-13  Andrew Stubbs  <ams@codesourcery.com>
 	    Kwok Cheung Yeung  <kcy@codesourcery.com>
 	    Julian Brown  <julian@codesourcery.com>

diff --git a/libgomp/config/gcn/team.c b/libgomp/config/gcn/team.c
--- a/libgomp/config/gcn/team.c
+++ b/libgomp/config/gcn/team.c
@@ -57,16 +57,28 @@ gomp_gcn_enter_kernel (void)
   /* Starting additional threads is not supported.  */
   gomp_global_icv.dyn_var = true;
 
+  /* Initialize the team arena for optimized memory allocation.
+     The arena has been allocated on the host side, and the address
+     passed in via the kernargs.  Each team takes a small slice of it.  */
+  register void **kernargs asm("s8");
+  void *team_arena = (kernargs[4] + TEAM_ARENA_SIZE*teamid);
+  void * __lds *arena_start = (void * __lds *)TEAM_ARENA_START;
+  void * __lds *arena_free = (void * __lds *)TEAM_ARENA_FREE;
+  void * __lds *arena_end = (void * __lds *)TEAM_ARENA_END;
+  *arena_start = team_arena;
+  *arena_free = team_arena;
+  *arena_end = team_arena + TEAM_ARENA_SIZE;
+
   /* Allocate and initialize the team-local-storage data.  */
-  struct gomp_thread *thrs = gomp_malloc_cleared (sizeof (*thrs)
+  struct gomp_thread *thrs = team_malloc_cleared (sizeof (*thrs)
                                                   * numthreads);
   set_gcn_thrs (thrs);
 
   /* Allocate and initialize a pool of threads in the team.
      The threads are already running, of course, we just need to manage
      the communication between them.  */
-  struct gomp_thread_pool *pool = gomp_malloc (sizeof (*pool));
-  pool->threads = gomp_malloc (sizeof (void *) * numthreads);
+  struct gomp_thread_pool *pool = team_malloc (sizeof (*pool));
+  pool->threads = team_malloc (sizeof (void *) * numthreads);
   for (int tid = 0; tid < numthreads; tid++)
     pool->threads[tid] = &thrs[tid];
   pool->threads_size = numthreads;
@@ -91,7 +103,7 @@ void
 gomp_gcn_exit_kernel (void)
 {
   gomp_free_thread (gcn_thrs ());
-  free (gcn_thrs ());
+  team_free (gcn_thrs ());
 }
 
 /* This function contains the idle loop in which a thread waits
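The arena setup added above is plain slicing arithmetic: the host allocates one contiguous block, its base address arrives as a kernel argument (kernargs[4]), and each team claims the 64 KiB slice starting at teamid * TEAM_ARENA_SIZE, so team 3's slice begins 192 KiB into the block. A minimal sketch of that computation, with invented names standing in for the real kernargs plumbing:

#include <stddef.h>

#define TEAM_ARENA_SIZE (64 * 1024)  /* 64 KiB per team, as in libgomp.h.  */

/* Hypothetical helper: locate this team's slice of the shared arena.  */
static char *
team_arena_slice (char *arena_base, unsigned teamid)
{
  return arena_base + (size_t) teamid * TEAM_ARENA_SIZE;
}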

diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -106,6 +106,69 @@ extern void gomp_aligned_free (void *);
    GCC's builtin alloca().  */
 #define gomp_alloca(x)  __builtin_alloca(x)
 
+/* Optimized allocators for team-specific data that will die with the team.  */
+#ifdef __AMDGCN__
+/* The arena is initialized in config/gcn/team.c.  */
+#define TEAM_ARENA_SIZE  64*1024  /* Must match the value in plugin-gcn.c.  */
+#define TEAM_ARENA_START 16  /* LDS offset of start pointer.  */
+#define TEAM_ARENA_FREE  24  /* LDS offset of free pointer.  */
+#define TEAM_ARENA_END   32  /* LDS offset of end pointer.  */
+
+static inline void * __attribute__((malloc))
+team_malloc (size_t size)
+{
+  /* 4-byte align the size.  */
+  size = (size + 3) & ~3;
+
+  /* Allocate directly from the arena.
+     The compiler does not support DS atomics, yet.  */
+  void *result;
+  asm ("ds_add_rtn_u64 %0, %1, %2\n\ts_waitcnt 0"
+       : "=v"(result) : "v"(TEAM_ARENA_FREE), "v"(size), "e"(1L) : "memory");
+
+  /* Handle OOM.  */
+  if (result + size > *(void * __lds *)TEAM_ARENA_END)
+    {
+      /* While this is experimental, let's make sure we know when OOM
+	 happens.  */
+      const char msg[] = "GCN team arena exhausted\n";
+      write (2, msg, sizeof(msg)-1);
+
+      /* Fall back to using the heap (slowly).  */
+      result = gomp_malloc (size);
+    }
+  return result;
+}
+
+static inline void * __attribute__((malloc))
+team_malloc_cleared (size_t size)
+{
+  char *result = team_malloc (size);
+
+  /* Clear the allocated memory.  */
+  __builtin_memset (result, 0, size);
+
+  return result;
+}
+
+static inline void
+team_free (void *ptr)
+{
+  /* The whole arena is freed when the kernel exits.
+     However, if we fell back to using the heap then we should free it.
+     It would be better if this function could be a no-op, but at least
+     LDS loads are cheap.  */
+  if (ptr < *(void * __lds *)TEAM_ARENA_START
+      || ptr >= *(void * __lds *)TEAM_ARENA_END)
+    free (ptr);
+}
+#else
+#define team_malloc(...) gomp_malloc (__VA_ARGS__)
+#define team_malloc_cleared(...) gomp_malloc_cleared (__VA_ARGS__)
+#define team_free(...) free (__VA_ARGS__)
+#endif
+
 /* error.c */
 extern void gomp_vdebug (int, const char *, va_list);
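The inline asm in team_malloc is the performance-critical piece: ds_add_rtn_u64 performs an atomic 64-bit add on the free pointer held in LDS and returns the old value, so each allocation is a single atomic bump instead of a trip through the heap allocator. As a rough illustration, the allocate/free pair corresponds to the portable C11 sketch below; ordinary memory stands in for LDS, and every name is invented for the example rather than taken from libgomp:

#include <stdatomic.h>
#include <stddef.h>
#include <stdlib.h>

/* Stand-ins for the three LDS slots at TEAM_ARENA_START/FREE/END.  */
static char *arena_start, *arena_end;
static _Atomic (char *) arena_free_ptr;

static void *
arena_malloc (size_t size)
{
  size = (size + 3) & ~(size_t) 3;  /* 4-byte align, as team_malloc does.  */

  /* The portable equivalent of the ds_add_rtn_u64: atomically bump the
     free pointer and keep the old value as the allocation's address.  */
  char *result = atomic_fetch_add (&arena_free_ptr, size);

  if (result + size > arena_end)
    return malloc (size);           /* Heap fallback on exhaustion.  */
  return result;
}

static void
arena_free (void *ptr)
{
  /* Arena memory dies with the kernel; only heap fallbacks need free().  */
  if ((char *) ptr < arena_start || (char *) ptr >= arena_end)
    free (ptr);
}

The range check on free is what keeps the fallback path correct: a pointer inside [start, end) came from the arena and is deliberately leaked until kernel exit, while anything else must have come from gomp_malloc and is released normally.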

diff --git a/libgomp/team.c b/libgomp/team.c
--- a/libgomp/team.c
+++ b/libgomp/team.c
@@ -171,7 +171,7 @@ gomp_new_team (unsigned nthreads)
     {
       size_t extra = sizeof (team->ordered_release[0])
                      + sizeof (team->implicit_task[0]);
-      team = gomp_malloc (sizeof (*team) + nthreads * extra);
+      team = team_malloc (sizeof (*team) + nthreads * extra);
 
 #ifndef HAVE_SYNC_BUILTINS
       gomp_mutex_init (&team->work_share_list_free_lock);
@@ -221,7 +221,7 @@ free_team (struct gomp_team *team)
   gomp_barrier_destroy (&team->barrier);
   gomp_mutex_destroy (&team->task_lock);
   priority_queue_free (&team->task_queue);
-  free (team);
+  team_free (team);
 }
 
 static void
@@ -285,8 +285,8 @@ gomp_free_thread (void *arg __attribute__((unused)))
       if (pool->last_team)
         free_team (pool->last_team);
 #ifndef __nvptx__
-      free (pool->threads);
-      free (pool);
+      team_free (pool->threads);
+      team_free (pool);
 #endif
       thr->thread_pool = NULL;
     }
@@ -1082,8 +1082,8 @@ gomp_pause_host (void)
       if (pool->last_team)
         free_team (pool->last_team);
 #ifndef __nvptx__
-      free (pool->threads);
-      free (pool);
+      team_free (pool->threads);
+      team_free (pool);
 #endif
       thr->thread_pool = NULL;
     }

diff --git a/libgomp/work.c b/libgomp/work.c
--- a/libgomp/work.c
+++ b/libgomp/work.c
@@ -120,7 +120,7 @@ gomp_init_work_share (struct gomp_work_share *ws, size_t ordered,
       else
         ordered = nthreads * sizeof (*ws->ordered_team_ids);
       if (ordered > INLINE_ORDERED_TEAM_IDS_SIZE)
-        ws->ordered_team_ids = gomp_malloc (ordered);
+        ws->ordered_team_ids = team_malloc (ordered);
       else
         ws->ordered_team_ids = ws->inline_ordered_team_ids;
       memset (ws->ordered_team_ids, '\0', ordered);
@@ -142,7 +142,7 @@ gomp_fini_work_share (struct gomp_work_share *ws)
 {
   gomp_mutex_destroy (&ws->lock);
   if (ws->ordered_team_ids != ws->inline_ordered_team_ids)
-    free (ws->ordered_team_ids);
+    team_free (ws->ordered_team_ids);
   gomp_ptrlock_destroy (&ws->next_ws);
 }