gcc/libgomp/work.c
Andrew Stubbs cee1645106 Optimize GCN OpenMP malloc performance
2019-11-13  Andrew Stubbs  <ams@codesourcery.com>

	libgomp/
	* config/gcn/team.c (gomp_gcn_enter_kernel): Set up the team arena
	and use team_malloc variants.
	(gomp_gcn_exit_kernel): Use team_free.
	* libgomp.h (TEAM_ARENA_SIZE): Define.
	(TEAM_ARENA_START): Define.
	(TEAM_ARENA_FREE): Define.
	(TEAM_ARENA_END): Define.
	(team_malloc): New function.
	(team_malloc_cleared): New function.
	(team_free): New function.
	* team.c (gomp_new_team): Initialize and use team_malloc.
	(free_team): Use team_free.
	(gomp_free_thread): Use team_free.
	(gomp_pause_host): Use team_free.
	* work.c (gomp_init_work_share): Use team_malloc.
	(gomp_fini_work_share): Use team_free.

From-SVN: r278136
2019-11-13 12:38:09 +00:00

314 lines
9.0 KiB
C

/* Copyright (C) 2005-2019 Free Software Foundation, Inc.
Contributed by Richard Henderson <rth@redhat.com>.
This file is part of the GNU Offloading and Multi Processing Library
(libgomp).
Libgomp is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.
You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
/* This file contains routines to manage the work-share queue for a team
of threads. */
#include "libgomp.h"
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
/* Allocate a new work share structure, preferably from current team's
free gomp_work_share cache.

Sources are tried in this order:
1. TEAM->work_share_list_alloc, popped here with plain loads and stores.
2. TEAM->work_share_list_free, the list free_work_share pushes onto.
3. A freshly allocated chunk of TEAM->work_share_chunk structures,
obtained after doubling work_share_chunk.

The returned structure is not initialized by this function; the caller
visible in this file (gomp_work_share_start) runs gomp_init_work_share
on it afterwards. */
static struct gomp_work_share *
alloc_work_share (struct gomp_team *team)
{
struct gomp_work_share *ws;
unsigned int i;
/* This is called in a critical section. */
/* Fast path: pop the head of the allocation list. */
if (team->work_share_list_alloc != NULL)
{
ws = team->work_share_list_alloc;
team->work_share_list_alloc = ws->next_free;
return ws;
}
#ifdef HAVE_SYNC_BUILTINS
ws = team->work_share_list_free;
/* We need atomic read from work_share_list_free,
as free_work_share can be called concurrently. */
__asm ("" : "+r" (ws));
/* Only the entries behind the sampled head are taken: the head itself
stays on the free list, the first entry behind it is returned, and the
entries after that become the new allocation list.
NOTE(review): presumably the head is left in place so that this function
never has to write work_share_list_free while free_work_share is doing a
compare-and-swap on it -- confirm. */
if (ws && ws->next_free)
{
struct gomp_work_share *next = ws->next_free;
ws->next_free = NULL;
team->work_share_list_alloc = next->next_free;
return next;
}
#else
/* Without sync builtins the free list is guarded by a mutex, so the whole
list is taken at once: its head is returned and the remainder becomes
the allocation list. */
gomp_mutex_lock (&team->work_share_list_free_lock);
ws = team->work_share_list_free;
if (ws)
{
team->work_share_list_alloc = ws->next_free;
team->work_share_list_free = NULL;
gomp_mutex_unlock (&team->work_share_list_free_lock);
return ws;
}
gomp_mutex_unlock (&team->work_share_list_free_lock);
#endif
/* Nothing could be recycled: double the chunk size and allocate a new
chunk of that many structures. */
team->work_share_chunk *= 2;
/* Allocating gomp_work_share structures aligned is just an
optimization, don't do it when using the fallback method. */
#ifdef GOMP_HAVE_EFFICIENT_ALIGNED_ALLOC
ws = gomp_aligned_alloc (__alignof (struct gomp_work_share),
team->work_share_chunk
* sizeof (struct gomp_work_share));
#else
ws = gomp_malloc (team->work_share_chunk * sizeof (struct gomp_work_share));
#endif
/* Insert the new chunk at the front of the chain hanging off
work_shares[0].next_alloc.
NOTE(review): presumably this chain is what allows the chunks to be
released later; the code that walks it is not in this file -- confirm. */
ws->next_alloc = team->work_shares[0].next_alloc;
team->work_shares[0].next_alloc = ws;
/* ws[0] is handed to the caller; the remaining elements are linked
through next_free to form the new allocation list. */
team->work_share_list_alloc = &ws[1];
for (i = 1; i < team->work_share_chunk - 1; i++)
ws[i].next_free = &ws[i + 1];
/* After the loop I indexes the last element of the chunk, which ends the
list (assumes work_share_chunk was at least 1 before doubling). */
ws[i].next_free = NULL;
return ws;
}
/* Initialize an already allocated struct gomp_work_share.
   This shouldn't touch the next_alloc field.

   WS is the structure to set up and NTHREADS the number of threads
   that will share it.  ORDERED selects how much ordered storage is
   prepared:
     0      - none; ordered_team_ids just points at the inline array.
     1      - room for NTHREADS team ids.
     other  - room for NTHREADS team ids, padded with respect to the
	      alignment of long long, plus ORDERED - 1 further bytes.
   The storage lives in the inline array at the tail of WS when it
   fits there, otherwise it comes from team_malloc; either way it is
   zero-filled.  */
void
gomp_init_work_share (struct gomp_work_share *ws, size_t ordered,
		      unsigned nthreads)
{
/* Bytes available from inline_ordered_team_ids to the end of the
   structure.  */
#define INLINE_ORDERED_TEAM_IDS_SIZE \
  (sizeof (struct gomp_work_share) \
   - offsetof (struct gomp_work_share, inline_ordered_team_ids))

  gomp_mutex_init (&ws->lock);

  if (!__builtin_expect (ordered, 0))
    ws->ordered_team_ids = ws->inline_ordered_team_ids;
  else
    {
      const size_t ids_bytes = nthreads * sizeof (*ws->ordered_team_ids);
      size_t total_bytes;

      if (!__builtin_expect (ordered != 1, 0))
	total_bytes = ids_bytes;
      else
	{
	  const size_t ll_mask = __alignof__ (long long) - 1;
	  size_t padded = ids_bytes + ll_mask;

	  /* When the inline array itself starts on a long long
	     boundary the id area can simply be rounded up; otherwise
	     the full slack is kept.  */
	  if ((offsetof (struct gomp_work_share, inline_ordered_team_ids)
	       & ll_mask) == 0)
	    padded &= ~ll_mask;

	  total_bytes = ordered + (padded - 1);
	}

      ws->ordered_team_ids = (total_bytes > INLINE_ORDERED_TEAM_IDS_SIZE
			      ? team_malloc (total_bytes)
			      : ws->inline_ordered_team_ids);
      memset (ws->ordered_team_ids, 0, total_bytes);

      ws->ordered_num_used = 0;
      ws->ordered_owner = -1;
      ws->ordered_cur = 0;
    }

  gomp_ptrlock_init (&ws->next_ws, NULL);
  ws->threads_completed = 0;
}
/* Tear down the parts of WS that gomp_init_work_share set up.  This
   runs before the structure is put back into the free gomp_work_share
   cache or freed; WS itself is not released here.  */
void
gomp_fini_work_share (struct gomp_work_share *ws)
{
  gomp_mutex_destroy (&ws->lock);

  /* The id array only needs releasing when it does not alias the
     inline storage inside WS.  */
  if (ws->ordered_team_ids != ws->inline_ordered_team_ids)
    {
      team_free (ws->ordered_team_ids);
    }

  gomp_ptrlock_destroy (&ws->next_ws);
}
/* Dispose of work share WS after finalizing it.  With no team (an
   orphaned construct) the structure is handed to free; otherwise it
   is pushed onto the front of TEAM's work_share_list_free so that it
   can be recycled.  */
static inline void
free_work_share (struct gomp_team *team, struct gomp_work_share *ws)
{
  gomp_fini_work_share (ws);

  if (__builtin_expect (team == NULL, 0))
    {
      free (ws);
      return;
    }

#ifdef HAVE_SYNC_BUILTINS
  /* Push with compare-and-swap, retrying whenever the list head
     changed between sampling it and publishing WS.  */
  for (;;)
    {
      struct gomp_work_share *old_head = team->work_share_list_free;

      ws->next_free = old_head;
      if (__sync_bool_compare_and_swap (&team->work_share_list_free,
					old_head, ws))
	break;
    }
#else
  /* No atomics available: do the same push under the list mutex.  */
  gomp_mutex_lock (&team->work_share_list_free_lock);
  ws->next_free = team->work_share_list_free;
  team->work_share_list_free = ws;
  gomp_mutex_unlock (&team->work_share_list_free_lock);
#endif
}
/* The current thread is ready to begin the next work sharing
   construct.  On return thr->ts.work_share points to the structure
   for that construct.  ORDERED is forwarded to gomp_init_work_share
   when a new structure has to be set up.

   Returns true when this thread set up the work share (always the
   case for an orphaned construct), false when it picked up a
   structure that was already published through the previous work
   share's next_ws pointer lock.

   NOTE(review): the historical comment also stated that the
   work_share lock is held on return; nothing in this function takes
   ws->lock, so confirm against the callers before relying on it.  */
bool
gomp_work_share_start (size_t ordered)
{
  struct gomp_thread *self = gomp_thread ();
  struct gomp_team *team = self->ts.team;
  struct gomp_work_share *prev;
  struct gomp_work_share *next;

  /* Work sharing constructs can be orphaned.  */
  if (team == NULL)
    {
      struct gomp_work_share *solo = gomp_malloc (sizeof (*solo));

      gomp_init_work_share (solo, ordered, 1);
      self->ts.work_share = solo;
      return true;
    }

  /* Remember the construct being left, then look for its successor.  */
  prev = self->ts.work_share;
  self->ts.last_work_share = prev;
  next = gomp_ptrlock_get (&prev->next_ws);

  if (next != NULL)
    {
      self->ts.work_share = next;
      return false;
    }

  /* This thread encountered a new ws first.  */
  next = alloc_work_share (team);
  gomp_init_work_share (next, ordered, team->nthreads);
  self->ts.work_share = next;
  return true;
}
/* The current thread has finished its current work sharing
   construct.  This variant ends the construct with a team barrier.
   The thread that the barrier reports as last records the current
   work share in team->work_shares_to_free and recycles the previous
   one before the barrier wait completes.  */
void
gomp_work_share_end (void)
{
  struct gomp_thread *self = gomp_thread ();
  struct gomp_team *team = self->ts.team;
  gomp_barrier_state_t state;

  /* Work sharing constructs can be orphaned.  */
  if (team == NULL)
    {
      free_work_share (NULL, self->ts.work_share);
      self->ts.work_share = NULL;
      return;
    }

  state = gomp_barrier_wait_start (&team->barrier);

  if (gomp_barrier_last_thread (state)
      && __builtin_expect (self->ts.last_work_share != NULL, 1))
    {
      team->work_shares_to_free = self->ts.work_share;
      free_work_share (team, self->ts.last_work_share);
    }

  gomp_team_barrier_wait_end (&team->barrier, state);
  self->ts.last_work_share = NULL;
}
/* The current thread has finished its current work sharing
   construct.  This variant ends the construct with a cancellable
   team barrier and returns whatever
   gomp_team_barrier_wait_cancel_end reports.  The team pointer is
   used without a NULL check because cancellable work sharing
   constructs cannot be orphaned.  */
bool
gomp_work_share_end_cancel (void)
{
  struct gomp_thread *self = gomp_thread ();
  struct gomp_team *team = self->ts.team;
  gomp_barrier_state_t state;

  state = gomp_barrier_wait_cancel_start (&team->barrier);

  if (gomp_barrier_last_thread (state)
      && __builtin_expect (self->ts.last_work_share != NULL, 1))
    {
      team->work_shares_to_free = self->ts.work_share;
      free_work_share (team, self->ts.last_work_share);
    }

  /* Unlike the non-cancellable variant, last_work_share is cleared
     before the wait is completed.  */
  self->ts.last_work_share = NULL;

  return gomp_team_barrier_wait_cancel_end (&team->barrier, state);
}
/* The current thread has finished its current work sharing
   construct.  This variant does NOT end the construct with a
   barrier.  Instead each thread bumps the work share's
   threads_completed counter, and the thread that brings it up to
   team->nthreads records the current work share in
   team->work_shares_to_free and recycles the previous one.  */
void
gomp_work_share_end_nowait (void)
{
  struct gomp_thread *self = gomp_thread ();
  struct gomp_team *team = self->ts.team;
  struct gomp_work_share *cur = self->ts.work_share;
  unsigned finished;

  /* Work sharing constructs can be orphaned.  */
  if (team == NULL)
    {
      free_work_share (NULL, cur);
      self->ts.work_share = NULL;
      return;
    }

  /* Without a previous work share there is nothing to recycle and
     the counter is left alone.  */
  if (__builtin_expect (self->ts.last_work_share == NULL, 0))
    return;

#ifdef HAVE_SYNC_BUILTINS
  finished = __sync_add_and_fetch (&cur->threads_completed, 1);
#else
  gomp_mutex_lock (&cur->lock);
  finished = ++cur->threads_completed;
  gomp_mutex_unlock (&cur->lock);
#endif

  if (finished == team->nthreads)
    {
      team->work_shares_to_free = self->ts.work_share;
      free_work_share (team, self->ts.last_work_share);
    }

  self->ts.last_work_share = NULL;
}