6ce1307231
libgomp/ 2016-05-26 Chung-Lin Tang <cltang@codesourcery.com> * target.c (gomp_device_copy): New function. (gomp_copy_host2dev): Likewise. (gomp_copy_dev2host): Likewise. (gomp_free_device_memory): Likewise. (gomp_map_vars_existing): Adjust to call gomp_copy_host2dev. (gomp_map_pointer): Likewise. (gomp_map_vars): Adjust to call gomp_copy_host2dev, handle NULL value from alloc_func plugin hook. (gomp_unmap_tgt): Adjust to call gomp_free_device_memory. (gomp_copy_from_async): Adjust to call gomp_copy_dev2host. (gomp_unmap_vars): Likewise. (gomp_update): Adjust to call gomp_copy_dev2host and gomp_copy_host2dev functions. (gomp_unload_image_from_device): Handle false value from unload_image_func plugin hook. (gomp_init_device): Handle false value from init_device_func plugin hook. (gomp_exit_data): Adjust to call gomp_copy_dev2host. (omp_target_free): Adjust to call gomp_free_device_memory. (omp_target_memcpy): Handle return values from host2dev_func, dev2host_func, and dev2dev_func plugin hooks. (omp_target_memcpy_rect_worker): Likewise. (gomp_target_fini): Handle false value from fini_device_func plugin hook. * libgomp.h (struct gomp_device_descr): Adjust return type of init_device_func, fini_device_func, unload_image_func, free_func, dev2host_func,host2dev_func, and dev2dev_func plugin hooks to 'bool'. * oacc-init.c (acc_shutdown_1): Handle false value from fini_device_func plugin hook. * oacc-host.c (host_init_device): Change return type to bool. (host_fini_device): Likewise. (host_unload_image): Likewise. (host_free): Likewise. (host_dev2host): Likewise. (host_host2dev): Likewise. * oacc-mem.c (acc_free): Handle plugin hook fatal error case. (acc_memcpy_to_device): Likewise. (acc_memcpy_from_device): Likewise. (delete_copyout): Add libfnname parameter, handle free_func hook fatal error case. (acc_delete): Adjust delete_copyout call. (acc_copyout): Likewise. (update_dev_host): Move gomp_mutex_unlock to after host2dev/dev2host hook calls. * plugin/plugin-hsa.c (hsa_warn): Adjust 'hsa_error' local variable to 'hsa_error_msg', for clarity. (hsa_fatal): Likewise. (hsa_error): New function. (init_hsa_context): Change return type to bool, adjust to return false on error. (GOMP_OFFLOAD_get_num_devices): Adjust to handle init_hsa_context return value. (GOMP_OFFLOAD_init_device): Change return type to bool, adjust to return false on error. (get_agent_info): Adjust to return NULL on error. (destroy_hsa_program): Change return type to bool, adjust to return false on error. (GOMP_OFFLOAD_load_image): Adjust to return -1 on error. (destroy_module): Change return type to bool, adjust to return false on error. (GOMP_OFFLOAD_unload_image): Likewise. (GOMP_OFFLOAD_fini_device): Likewise. (GOMP_OFFLOAD_alloc): Change to return NULL when called. (GOMP_OFFLOAD_free): Change to return false when called. (GOMP_OFFLOAD_dev2host): Likewise. (GOMP_OFFLOAD_host2dev): Likewise. (GOMP_OFFLOAD_dev2dev): Likewise. * plugin/plugin-nvptx.c (CUDA_CALL_ERET): New convenience macro. (CUDA_CALL): Likewise. (CUDA_CALL_ASSERT): Likewise. (map_init): Change return type to bool, use CUDA_CALL* macros. (map_fini): Likewise. (init_streams_for_device): Change return type to bool, adjust call to map_init. (fini_streams_for_device): Change return type to bool, adjust call to map_fini. (select_stream_for_async): Release stream_lock before calls to GOMP_PLUGIN_fatal, adjust call to map_init. (nvptx_init): Use CUDA_CALL* macros. (nvptx_attach_host_thread_to_device): Change return type to bool, use CUDA_CALL* macros. (nvptx_open_device): Use CUDA_CALL* macros. (nvptx_close_device): Change return type to bool, use CUDA_CALL* macros. (nvptx_get_num_devices): Use CUDA_CALL* macros. (link_ptx): Change return type to bool, use CUDA_CALL* macros. (nvptx_exec): Use CUDA_CALL* macros. (nvptx_alloc): Use CUDA_CALL* macros. (nvptx_free): Change return type to bool, use CUDA_CALL* macros. (nvptx_host2dev): Likewise. (nvptx_dev2host): Likewise. (nvptx_wait): Use CUDA_CALL* macros. (nvptx_wait_async): Likewise. (nvptx_wait_all): Likewise. (nvptx_wait_all_async): Likewise. (nvptx_set_cuda_stream): Adjust order of stream_lock acquire, use CUDA_CALL* macros, adjust call to map_fini. (GOMP_OFFLOAD_init_device): Change return type to bool, adjust code accordingly. (GOMP_OFFLOAD_fini_device): Likewise. (GOMP_OFFLOAD_load_image): Adjust calls to nvptx_attach_host_thread_to_device/link_ptx to handle errors, use CUDA_CALL* macros. (GOMP_OFFLOAD_unload_image): Change return type to bool, adjust return code. (GOMP_OFFLOAD_alloc): Adjust calls to code to handle error return. (GOMP_OFFLOAD_free): Change return type to bool, adjust calls to handle error return. (GOMP_OFFLOAD_dev2host): Likewise. (GOMP_OFFLOAD_host2dev): Likewise. (GOMP_OFFLOAD_openacc_register_async_cleanup): Use CUDA_CALL* macros. (GOMP_OFFLOAD_openacc_create_thread_data): Likewise. liboffloadmic/ 2016-05-26 Chung-Lin Tang <cltang@codesourcery.com> * plugin/libgomp-plugin-intelmic.cpp (offload): Change return type to bool, adjust return code. (GOMP_OFFLOAD_init_device): Likewise. (GOMP_OFFLOAD_fini_device): Likewise. (get_target_table): Likewise. (offload_image): Likwise. (GOMP_OFFLOAD_load_image): Adjust call to offload_image(), change to return -1 on error. (GOMP_OFFLOAD_unload_image): Change return type to bool, adjust return code. (GOMP_OFFLOAD_alloc): Likewise. (GOMP_OFFLOAD_free): Likewise. (GOMP_OFFLOAD_host2dev): Likewise. (GOMP_OFFLOAD_dev2host): Likewise. (GOMP_OFFLOAD_dev2dev): Likewise. From-SVN: r236768
1611 lines
48 KiB
C
1611 lines
48 KiB
C
/* Plugin for HSAIL execution.
|
|
|
|
Copyright (C) 2013-2016 Free Software Foundation, Inc.
|
|
|
|
Contributed by Martin Jambor <mjambor@suse.cz> and
|
|
Martin Liska <mliska@suse.cz>.
|
|
|
|
This file is part of the GNU Offloading and Multi Processing Library
|
|
(libgomp).
|
|
|
|
Libgomp is free software; you can redistribute it and/or modify it
|
|
under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation; either version 3, or (at your option)
|
|
any later version.
|
|
|
|
Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
|
|
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
|
FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
|
more details.
|
|
|
|
Under Section 7 of GPL version 3, you are granted additional
|
|
permissions described in the GCC Runtime Library Exception, version
|
|
3.1, as published by the Free Software Foundation.
|
|
|
|
You should have received a copy of the GNU General Public License and
|
|
a copy of the GCC Runtime Library Exception along with this program;
|
|
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
|
|
<http://www.gnu.org/licenses/>. */
|
|
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <pthread.h>
|
|
#include <hsa.h>
|
|
#include <hsa_ext_finalize.h>
|
|
#include <dlfcn.h>
|
|
#include "libgomp-plugin.h"
|
|
#include "gomp-constants.h"
|
|
|
|
/* Keep the following GOMP prefixed structures in sync with respective parts of
|
|
the compiler. */
|
|
|
|
/* Structure describing the run-time and grid properties of an HSA kernel
|
|
lauch. */
|
|
|
|
struct GOMP_kernel_launch_attributes
|
|
{
|
|
/* Number of dimensions the workload has. Maximum number is 3. */
|
|
uint32_t ndim;
|
|
/* Size of the grid in the three respective dimensions. */
|
|
uint32_t gdims[3];
|
|
/* Size of work-groups in the respective dimensions. */
|
|
uint32_t wdims[3];
|
|
};
|
|
|
|
/* Collection of information needed for a dispatch of a kernel from a
|
|
kernel. */
|
|
|
|
struct GOMP_hsa_kernel_dispatch
|
|
{
|
|
/* Pointer to a command queue associated with a kernel dispatch agent. */
|
|
void *queue;
|
|
/* Pointer to reserved memory for OMP data struct copying. */
|
|
void *omp_data_memory;
|
|
/* Pointer to a memory space used for kernel arguments passing. */
|
|
void *kernarg_address;
|
|
/* Kernel object. */
|
|
uint64_t object;
|
|
/* Synchronization signal used for dispatch synchronization. */
|
|
uint64_t signal;
|
|
/* Private segment size. */
|
|
uint32_t private_segment_size;
|
|
/* Group segment size. */
|
|
uint32_t group_segment_size;
|
|
/* Number of children kernel dispatches. */
|
|
uint64_t kernel_dispatch_count;
|
|
/* Debug purpose argument. */
|
|
uint64_t debug;
|
|
/* Levels-var ICV. */
|
|
uint64_t omp_level;
|
|
/* Kernel dispatch structures created for children kernel dispatches. */
|
|
struct GOMP_hsa_kernel_dispatch **children_dispatches;
|
|
/* Number of threads. */
|
|
uint32_t omp_num_threads;
|
|
};
|
|
|
|
/* Part of the libgomp plugin interface. Return the name of the accelerator,
|
|
which is "hsa". */
|
|
|
|
const char *
|
|
GOMP_OFFLOAD_get_name (void)
|
|
{
|
|
return "hsa";
|
|
}
|
|
|
|
/* Part of the libgomp plugin interface. Return the specific capabilities the
|
|
HSA accelerator have. */
|
|
|
|
unsigned int
|
|
GOMP_OFFLOAD_get_caps (void)
|
|
{
|
|
return GOMP_OFFLOAD_CAP_SHARED_MEM | GOMP_OFFLOAD_CAP_OPENMP_400;
|
|
}
|
|
|
|
/* Part of the libgomp plugin interface. Identify as HSA accelerator. */
|
|
|
|
int
|
|
GOMP_OFFLOAD_get_type (void)
|
|
{
|
|
return OFFLOAD_TARGET_TYPE_HSA;
|
|
}
|
|
|
|
/* Return the libgomp version number we're compatible with. There is
|
|
no requirement for cross-version compatibility. */
|
|
|
|
unsigned
|
|
GOMP_OFFLOAD_version (void)
|
|
{
|
|
return GOMP_VERSION;
|
|
}
|
|
|
|
/* Flag to decide whether print to stderr information about what is going on.
|
|
Set in init_debug depending on environment variables. */
|
|
|
|
static bool debug;
|
|
|
|
/* Flag to decide if the runtime should suppress a possible fallback to host
|
|
execution. */
|
|
|
|
static bool suppress_host_fallback;
|
|
|
|
/* Initialize debug and suppress_host_fallback according to the environment. */
|
|
|
|
static void
|
|
init_enviroment_variables (void)
|
|
{
|
|
if (getenv ("HSA_DEBUG"))
|
|
debug = true;
|
|
else
|
|
debug = false;
|
|
|
|
if (getenv ("HSA_SUPPRESS_HOST_FALLBACK"))
|
|
suppress_host_fallback = true;
|
|
else
|
|
suppress_host_fallback = false;
|
|
}
|
|
|
|
/* Print a logging message with PREFIX to stderr if HSA_DEBUG value
|
|
is set to true. */
|
|
|
|
#define HSA_LOG(prefix, ...) \
|
|
do \
|
|
{ \
|
|
if (debug) \
|
|
{ \
|
|
fprintf (stderr, prefix); \
|
|
fprintf (stderr, __VA_ARGS__); \
|
|
} \
|
|
} \
|
|
while (false);
|
|
|
|
/* Print a debugging message to stderr. */
|
|
|
|
#define HSA_DEBUG(...) HSA_LOG ("HSA debug: ", __VA_ARGS__)
|
|
|
|
/* Print a warning message to stderr. */
|
|
|
|
#define HSA_WARNING(...) HSA_LOG ("HSA warning: ", __VA_ARGS__)
|
|
|
|
/* Print HSA warning STR with an HSA STATUS code. */
|
|
|
|
static void
|
|
hsa_warn (const char *str, hsa_status_t status)
|
|
{
|
|
if (!debug)
|
|
return;
|
|
|
|
const char *hsa_error_msg;
|
|
hsa_status_string (status, &hsa_error_msg);
|
|
|
|
fprintf (stderr, "HSA warning: %s\nRuntime message: %s", str, hsa_error_msg);
|
|
}
|
|
|
|
/* Report a fatal error STR together with the HSA error corresponding to STATUS
|
|
and terminate execution of the current process. */
|
|
|
|
static void
|
|
hsa_fatal (const char *str, hsa_status_t status)
|
|
{
|
|
const char *hsa_error_msg;
|
|
hsa_status_string (status, &hsa_error_msg);
|
|
GOMP_PLUGIN_fatal ("HSA fatal error: %s\nRuntime message: %s", str,
|
|
hsa_error_msg);
|
|
}
|
|
|
|
/* Like hsa_fatal, except only report error message, and return FALSE
|
|
for propagating error processing to outside of plugin. */
|
|
|
|
static bool
|
|
hsa_error (const char *str, hsa_status_t status)
|
|
{
|
|
const char *hsa_error_msg;
|
|
hsa_status_string (status, &hsa_error_msg);
|
|
GOMP_PLUGIN_error ("HSA fatal error: %s\nRuntime message: %s", str,
|
|
hsa_error_msg);
|
|
return false;
|
|
}
|
|
|
|
struct hsa_kernel_description
|
|
{
|
|
const char *name;
|
|
unsigned omp_data_size;
|
|
bool gridified_kernel_p;
|
|
unsigned kernel_dependencies_count;
|
|
const char **kernel_dependencies;
|
|
};
|
|
|
|
struct global_var_info
|
|
{
|
|
const char *name;
|
|
void *address;
|
|
};
|
|
|
|
/* Data passed by the static initializer of a compilation unit containing BRIG
|
|
to GOMP_offload_register. */
|
|
|
|
struct brig_image_desc
|
|
{
|
|
hsa_ext_module_t brig_module;
|
|
const unsigned kernel_count;
|
|
struct hsa_kernel_description *kernel_infos;
|
|
const unsigned global_variable_count;
|
|
struct global_var_info *global_variables;
|
|
};
|
|
|
|
struct agent_info;
|
|
|
|
/* Information required to identify, finalize and run any given kernel. */
|
|
|
|
struct kernel_info
|
|
{
|
|
/* Name of the kernel, required to locate it within the brig module. */
|
|
const char *name;
|
|
/* Size of memory space for OMP data. */
|
|
unsigned omp_data_size;
|
|
/* The specific agent the kernel has been or will be finalized for and run
|
|
on. */
|
|
struct agent_info *agent;
|
|
/* The specific module where the kernel takes place. */
|
|
struct module_info *module;
|
|
/* Mutex enforcing that at most once thread ever initializes a kernel for
|
|
use. A thread should have locked agent->modules_rwlock for reading before
|
|
acquiring it. */
|
|
pthread_mutex_t init_mutex;
|
|
/* Flag indicating whether the kernel has been initialized and all fields
|
|
below it contain valid data. */
|
|
bool initialized;
|
|
/* Flag indicating that the kernel has a problem that blocks an execution. */
|
|
bool initialization_failed;
|
|
/* The object to be put into the dispatch queue. */
|
|
uint64_t object;
|
|
/* Required size of kernel arguments. */
|
|
uint32_t kernarg_segment_size;
|
|
/* Required size of group segment. */
|
|
uint32_t group_segment_size;
|
|
/* Required size of private segment. */
|
|
uint32_t private_segment_size;
|
|
/* List of all kernel dependencies. */
|
|
const char **dependencies;
|
|
/* Number of dependencies. */
|
|
unsigned dependencies_count;
|
|
/* Maximum OMP data size necessary for kernel from kernel dispatches. */
|
|
unsigned max_omp_data_size;
|
|
/* True if the kernel is gridified. */
|
|
bool gridified_kernel_p;
|
|
};
|
|
|
|
/* Information about a particular brig module, its image and kernels. */
|
|
|
|
struct module_info
|
|
{
|
|
/* The next and previous module in the linked list of modules of an agent. */
|
|
struct module_info *next, *prev;
|
|
/* The description with which the program has registered the image. */
|
|
struct brig_image_desc *image_desc;
|
|
|
|
/* Number of kernels in this module. */
|
|
int kernel_count;
|
|
/* An array of kernel_info structures describing each kernel in this
|
|
module. */
|
|
struct kernel_info kernels[];
|
|
};
|
|
|
|
/* Information about shared brig library. */
|
|
|
|
struct brig_library_info
|
|
{
|
|
char *file_name;
|
|
hsa_ext_module_t image;
|
|
};
|
|
|
|
/* Description of an HSA GPU agent and the program associated with it. */
|
|
|
|
struct agent_info
|
|
{
|
|
/* The HSA ID of the agent. Assigned when hsa_context is initialized. */
|
|
hsa_agent_t id;
|
|
/* Whether the agent has been initialized. The fields below are usable only
|
|
if it has been. */
|
|
bool initialized;
|
|
/* The HSA ISA of this agent. */
|
|
hsa_isa_t isa;
|
|
/* Command queue of the agent. */
|
|
hsa_queue_t *command_q;
|
|
/* Kernel from kernel dispatch command queue. */
|
|
hsa_queue_t *kernel_dispatch_command_q;
|
|
/* The HSA memory region from which to allocate kernel arguments. */
|
|
hsa_region_t kernarg_region;
|
|
|
|
/* Read-write lock that protects kernels which are running or about to be run
|
|
from interference with loading and unloading of images. Needs to be
|
|
locked for reading while a kernel is being run, and for writing if the
|
|
list of modules is manipulated (and thus the HSA program invalidated). */
|
|
pthread_rwlock_t modules_rwlock;
|
|
/* The first module in a linked list of modules associated with this
|
|
kernel. */
|
|
struct module_info *first_module;
|
|
|
|
/* Mutex enforcing that only one thread will finalize the HSA program. A
|
|
thread should have locked agent->modules_rwlock for reading before
|
|
acquiring it. */
|
|
pthread_mutex_t prog_mutex;
|
|
/* Flag whether the HSA program that consists of all the modules has been
|
|
finalized. */
|
|
bool prog_finalized;
|
|
/* Flag whether the program was finalized but with a failure. */
|
|
bool prog_finalized_error;
|
|
/* HSA executable - the finalized program that is used to locate kernels. */
|
|
hsa_executable_t executable;
|
|
/* List of BRIG libraries. */
|
|
struct brig_library_info **brig_libraries;
|
|
/* Number of loaded shared BRIG libraries. */
|
|
unsigned brig_libraries_count;
|
|
};
|
|
|
|
/* Information about the whole HSA environment and all of its agents. */
|
|
|
|
struct hsa_context_info
|
|
{
|
|
/* Whether the structure has been initialized. */
|
|
bool initialized;
|
|
/* Number of usable GPU HSA agents in the system. */
|
|
int agent_count;
|
|
/* Array of agent_info structures describing the individual HSA agents. */
|
|
struct agent_info *agents;
|
|
};
|
|
|
|
/* Information about the whole HSA environment and all of its agents. */
|
|
|
|
static struct hsa_context_info hsa_context;
|
|
|
|
/* Find kernel for an AGENT by name provided in KERNEL_NAME. */
|
|
|
|
static struct kernel_info *
|
|
get_kernel_for_agent (struct agent_info *agent, const char *kernel_name)
|
|
{
|
|
struct module_info *module = agent->first_module;
|
|
|
|
while (module)
|
|
{
|
|
for (unsigned i = 0; i < module->kernel_count; i++)
|
|
if (strcmp (module->kernels[i].name, kernel_name) == 0)
|
|
return &module->kernels[i];
|
|
|
|
module = module->next;
|
|
}
|
|
|
|
return NULL;
|
|
}
|
|
|
|
/* Return true if the agent is a GPU and acceptable of concurrent submissions
|
|
from different threads. */
|
|
|
|
static bool
|
|
suitable_hsa_agent_p (hsa_agent_t agent)
|
|
{
|
|
hsa_device_type_t device_type;
|
|
hsa_status_t status
|
|
= hsa_agent_get_info (agent, HSA_AGENT_INFO_DEVICE, &device_type);
|
|
if (status != HSA_STATUS_SUCCESS || device_type != HSA_DEVICE_TYPE_GPU)
|
|
return false;
|
|
|
|
uint32_t features = 0;
|
|
status = hsa_agent_get_info (agent, HSA_AGENT_INFO_FEATURE, &features);
|
|
if (status != HSA_STATUS_SUCCESS
|
|
|| !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
|
|
return false;
|
|
hsa_queue_type_t queue_type;
|
|
status = hsa_agent_get_info (agent, HSA_AGENT_INFO_QUEUE_TYPE, &queue_type);
|
|
if (status != HSA_STATUS_SUCCESS
|
|
|| (queue_type != HSA_QUEUE_TYPE_MULTI))
|
|
return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
/* Callback of hsa_iterate_agents, if AGENT is a GPU device, increment
|
|
agent_count in hsa_context. */
|
|
|
|
static hsa_status_t
|
|
count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
|
|
{
|
|
if (suitable_hsa_agent_p (agent))
|
|
hsa_context.agent_count++;
|
|
return HSA_STATUS_SUCCESS;
|
|
}
|
|
|
|
/* Callback of hsa_iterate_agents, if AGENT is a GPU device, assign the agent
|
|
id to the describing structure in the hsa context. The index of the
|
|
structure is pointed to by DATA, increment it afterwards. */
|
|
|
|
static hsa_status_t
|
|
assign_agent_ids (hsa_agent_t agent, void *data)
|
|
{
|
|
if (suitable_hsa_agent_p (agent))
|
|
{
|
|
int *agent_index = (int *) data;
|
|
hsa_context.agents[*agent_index].id = agent;
|
|
++*agent_index;
|
|
}
|
|
return HSA_STATUS_SUCCESS;
|
|
}
|
|
|
|
/* Initialize hsa_context if it has not already been done.
|
|
Return TRUE on success. */
|
|
|
|
static bool
|
|
init_hsa_context (void)
|
|
{
|
|
hsa_status_t status;
|
|
int agent_index = 0;
|
|
|
|
if (hsa_context.initialized)
|
|
return true;
|
|
init_enviroment_variables ();
|
|
status = hsa_init ();
|
|
if (status != HSA_STATUS_SUCCESS)
|
|
return hsa_error ("Run-time could not be initialized", status);
|
|
HSA_DEBUG ("HSA run-time initialized\n");
|
|
status = hsa_iterate_agents (count_gpu_agents, NULL);
|
|
if (status != HSA_STATUS_SUCCESS)
|
|
return hsa_error ("HSA GPU devices could not be enumerated", status);
|
|
HSA_DEBUG ("There are %i HSA GPU devices.\n", hsa_context.agent_count);
|
|
|
|
hsa_context.agents
|
|
= GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
|
|
* sizeof (struct agent_info));
|
|
status = hsa_iterate_agents (assign_agent_ids, &agent_index);
|
|
if (agent_index != hsa_context.agent_count)
|
|
{
|
|
GOMP_PLUGIN_error ("Failed to assign IDs to all HSA agents");
|
|
return false;
|
|
}
|
|
hsa_context.initialized = true;
|
|
return true;
|
|
}
|
|
|
|
/* Callback of dispatch queues to report errors. */
|
|
|
|
static void
|
|
queue_callback (hsa_status_t status,
|
|
hsa_queue_t *queue __attribute__ ((unused)),
|
|
void *data __attribute__ ((unused)))
|
|
{
|
|
hsa_fatal ("Asynchronous queue error", status);
|
|
}
|
|
|
|
/* Callback of hsa_agent_iterate_regions. Determine if a memory REGION can be
|
|
used for kernarg allocations and if so write it to the memory pointed to by
|
|
DATA and break the query. */
|
|
|
|
static hsa_status_t
|
|
get_kernarg_memory_region (hsa_region_t region, void *data)
|
|
{
|
|
hsa_status_t status;
|
|
hsa_region_segment_t segment;
|
|
|
|
status = hsa_region_get_info (region, HSA_REGION_INFO_SEGMENT, &segment);
|
|
if (status != HSA_STATUS_SUCCESS)
|
|
return status;
|
|
if (segment != HSA_REGION_SEGMENT_GLOBAL)
|
|
return HSA_STATUS_SUCCESS;
|
|
|
|
uint32_t flags;
|
|
status = hsa_region_get_info (region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
|
|
if (status != HSA_STATUS_SUCCESS)
|
|
return status;
|
|
if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
|
|
{
|
|
hsa_region_t *ret = (hsa_region_t *) data;
|
|
*ret = region;
|
|
return HSA_STATUS_INFO_BREAK;
|
|
}
|
|
return HSA_STATUS_SUCCESS;
|
|
}
|
|
|
|
/* Part of the libgomp plugin interface. Return the number of HSA devices on
|
|
the system. */
|
|
|
|
int
|
|
GOMP_OFFLOAD_get_num_devices (void)
|
|
{
|
|
if (!init_hsa_context ())
|
|
return 0;
|
|
return hsa_context.agent_count;
|
|
}
|
|
|
|
/* Part of the libgomp plugin interface. Initialize agent number N so that it
|
|
can be used for computation. Return TRUE on success. */
|
|
|
|
bool
|
|
GOMP_OFFLOAD_init_device (int n)
|
|
{
|
|
if (!init_hsa_context ())
|
|
return false;
|
|
if (n >= hsa_context.agent_count)
|
|
{
|
|
GOMP_PLUGIN_error ("Request to initialize non-existing HSA device %i", n);
|
|
return false;
|
|
}
|
|
struct agent_info *agent = &hsa_context.agents[n];
|
|
|
|
if (agent->initialized)
|
|
return true;
|
|
|
|
if (pthread_rwlock_init (&agent->modules_rwlock, NULL))
|
|
{
|
|
GOMP_PLUGIN_error ("Failed to initialize an HSA agent rwlock");
|
|
return false;
|
|
}
|
|
if (pthread_mutex_init (&agent->prog_mutex, NULL))
|
|
{
|
|
GOMP_PLUGIN_error ("Failed to initialize an HSA agent program mutex");
|
|
return false;
|
|
}
|
|
|
|
uint32_t queue_size;
|
|
hsa_status_t status;
|
|
status = hsa_agent_get_info (agent->id, HSA_AGENT_INFO_QUEUE_MAX_SIZE,
|
|
&queue_size);
|
|
if (status != HSA_STATUS_SUCCESS)
|
|
return hsa_error ("Error requesting maximum queue size of the HSA agent",
|
|
status);
|
|
status = hsa_agent_get_info (agent->id, HSA_AGENT_INFO_ISA, &agent->isa);
|
|
if (status != HSA_STATUS_SUCCESS)
|
|
return hsa_error ("Error querying the ISA of the agent", status);
|
|
status = hsa_queue_create (agent->id, queue_size, HSA_QUEUE_TYPE_MULTI,
|
|
queue_callback, NULL, UINT32_MAX, UINT32_MAX,
|
|
&agent->command_q);
|
|
if (status != HSA_STATUS_SUCCESS)
|
|
return hsa_error ("Error creating command queue", status);
|
|
|
|
status = hsa_queue_create (agent->id, queue_size, HSA_QUEUE_TYPE_MULTI,
|
|
queue_callback, NULL, UINT32_MAX, UINT32_MAX,
|
|
&agent->kernel_dispatch_command_q);
|
|
if (status != HSA_STATUS_SUCCESS)
|
|
return hsa_error ("Error creating kernel dispatch command queue", status);
|
|
|
|
agent->kernarg_region.handle = (uint64_t) -1;
|
|
status = hsa_agent_iterate_regions (agent->id, get_kernarg_memory_region,
|
|
&agent->kernarg_region);
|
|
if (agent->kernarg_region.handle == (uint64_t) -1)
|
|
{
|
|
GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
|
|
"arguments");
|
|
return false;
|
|
}
|
|
HSA_DEBUG ("HSA agent initialized, queue has id %llu\n",
|
|
(long long unsigned) agent->command_q->id);
|
|
HSA_DEBUG ("HSA agent initialized, kernel dispatch queue has id %llu\n",
|
|
(long long unsigned) agent->kernel_dispatch_command_q->id);
|
|
agent->initialized = true;
|
|
return true;
|
|
}
|
|
|
|
/* Verify that hsa_context has already been initialized and return the
|
|
agent_info structure describing device number N. Return NULL on error. */
|
|
|
|
static struct agent_info *
|
|
get_agent_info (int n)
|
|
{
|
|
if (!hsa_context.initialized)
|
|
{
|
|
GOMP_PLUGIN_error ("Attempt to use uninitialized HSA context.");
|
|
return NULL;
|
|
}
|
|
if (n >= hsa_context.agent_count)
|
|
{
|
|
GOMP_PLUGIN_error ("Request to operate on anon-existing HSA device %i", n);
|
|
return NULL;
|
|
}
|
|
if (!hsa_context.agents[n].initialized)
|
|
{
|
|
GOMP_PLUGIN_error ("Attempt to use an uninitialized HSA agent.");
|
|
return NULL;
|
|
}
|
|
return &hsa_context.agents[n];
|
|
}
|
|
|
|
/* Insert MODULE to the linked list of modules of AGENT. */
|
|
|
|
static void
|
|
add_module_to_agent (struct agent_info *agent, struct module_info *module)
|
|
{
|
|
if (agent->first_module)
|
|
agent->first_module->prev = module;
|
|
module->next = agent->first_module;
|
|
module->prev = NULL;
|
|
agent->first_module = module;
|
|
}
|
|
|
|
/* Remove MODULE from the linked list of modules of AGENT. */
|
|
|
|
static void
|
|
remove_module_from_agent (struct agent_info *agent, struct module_info *module)
|
|
{
|
|
if (agent->first_module == module)
|
|
agent->first_module = module->next;
|
|
if (module->prev)
|
|
module->prev->next = module->next;
|
|
if (module->next)
|
|
module->next->prev = module->prev;
|
|
}
|
|
|
|
/* Free the HSA program in agent and everything associated with it and set
|
|
agent->prog_finalized and the initialized flags of all kernels to false.
|
|
Return TRUE on success. */
|
|
|
|
static bool
|
|
destroy_hsa_program (struct agent_info *agent)
|
|
{
|
|
if (!agent->prog_finalized || agent->prog_finalized_error)
|
|
return true;
|
|
|
|
hsa_status_t status;
|
|
|
|
HSA_DEBUG ("Destroying the current HSA program.\n");
|
|
|
|
status = hsa_executable_destroy (agent->executable);
|
|
if (status != HSA_STATUS_SUCCESS)
|
|
return hsa_error ("Could not destroy HSA executable", status);
|
|
|
|
struct module_info *module;
|
|
for (module = agent->first_module; module; module = module->next)
|
|
{
|
|
int i;
|
|
for (i = 0; i < module->kernel_count; i++)
|
|
module->kernels[i].initialized = false;
|
|
}
|
|
agent->prog_finalized = false;
|
|
return true;
|
|
}
|
|
|
|
/* Part of the libgomp plugin interface. Load BRIG module described by struct
|
|
brig_image_desc in TARGET_DATA and return references to kernel descriptors
|
|
in TARGET_TABLE. */
|
|
|
|
int
|
|
GOMP_OFFLOAD_load_image (int ord, unsigned version, void *target_data,
|
|
struct addr_pair **target_table)
|
|
{
|
|
if (GOMP_VERSION_DEV (version) > GOMP_VERSION_HSA)
|
|
{
|
|
GOMP_PLUGIN_error ("Offload data incompatible with HSA plugin"
|
|
" (expected %u, received %u)",
|
|
GOMP_VERSION_HSA, GOMP_VERSION_DEV (version));
|
|
return -1;
|
|
}
|
|
|
|
struct brig_image_desc *image_desc = (struct brig_image_desc *) target_data;
|
|
struct agent_info *agent;
|
|
struct addr_pair *pair;
|
|
struct module_info *module;
|
|
struct kernel_info *kernel;
|
|
int kernel_count = image_desc->kernel_count;
|
|
|
|
agent = get_agent_info (ord);
|
|
if (!agent)
|
|
return -1;
|
|
|
|
if (pthread_rwlock_wrlock (&agent->modules_rwlock))
|
|
{
|
|
GOMP_PLUGIN_error ("Unable to write-lock an HSA agent rwlock");
|
|
return -1;
|
|
}
|
|
if (agent->prog_finalized
|
|
&& !destroy_hsa_program (agent))
|
|
return -1;
|
|
|
|
HSA_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
|
|
pair = GOMP_PLUGIN_malloc (kernel_count * sizeof (struct addr_pair));
|
|
*target_table = pair;
|
|
module = (struct module_info *)
|
|
GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
|
|
+ kernel_count * sizeof (struct kernel_info));
|
|
module->image_desc = image_desc;
|
|
module->kernel_count = kernel_count;
|
|
|
|
kernel = &module->kernels[0];
|
|
|
|
/* Allocate memory for kernel dependencies. */
|
|
for (unsigned i = 0; i < kernel_count; i++)
|
|
{
|
|
pair->start = (uintptr_t) kernel;
|
|
pair->end = (uintptr_t) (kernel + 1);
|
|
|
|
struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
|
|
kernel->agent = agent;
|
|
kernel->module = module;
|
|
kernel->name = d->name;
|
|
kernel->omp_data_size = d->omp_data_size;
|
|
kernel->gridified_kernel_p = d->gridified_kernel_p;
|
|
kernel->dependencies_count = d->kernel_dependencies_count;
|
|
kernel->dependencies = d->kernel_dependencies;
|
|
if (pthread_mutex_init (&kernel->init_mutex, NULL))
|
|
{
|
|
GOMP_PLUGIN_error ("Failed to initialize an HSA kernel mutex");
|
|
return -1;
|
|
}
|
|
|
|
kernel++;
|
|
pair++;
|
|
}
|
|
|
|
add_module_to_agent (agent, module);
|
|
if (pthread_rwlock_unlock (&agent->modules_rwlock))
|
|
{
|
|
GOMP_PLUGIN_error ("Unable to unlock an HSA agent rwlock");
|
|
return -1;
|
|
}
|
|
return kernel_count;
|
|
}
|
|
|
|
/* Add a shared BRIG library from a FILE_NAME to an AGENT. */
|
|
|
|
static struct brig_library_info *
|
|
add_shared_library (const char *file_name, struct agent_info *agent)
|
|
{
|
|
struct brig_library_info *library = NULL;
|
|
|
|
void *f = dlopen (file_name, RTLD_NOW);
|
|
void *start = dlsym (f, "__brig_start");
|
|
void *end = dlsym (f, "__brig_end");
|
|
|
|
if (start == NULL || end == NULL)
|
|
return NULL;
|
|
|
|
unsigned size = end - start;
|
|
char *buf = (char *) GOMP_PLUGIN_malloc (size);
|
|
memcpy (buf, start, size);
|
|
|
|
library = GOMP_PLUGIN_malloc (sizeof (struct agent_info));
|
|
library->file_name = (char *) GOMP_PLUGIN_malloc
|
|
((strlen (file_name) + 1));
|
|
strcpy (library->file_name, file_name);
|
|
library->image = (hsa_ext_module_t) buf;
|
|
|
|
return library;
|
|
}
|
|
|
|
/* Release memory used for BRIG shared libraries that correspond
|
|
to an AGENT. */
|
|
|
|
static void
|
|
release_agent_shared_libraries (struct agent_info *agent)
|
|
{
|
|
for (unsigned i = 0; i < agent->brig_libraries_count; i++)
|
|
if (agent->brig_libraries[i])
|
|
{
|
|
free (agent->brig_libraries[i]->file_name);
|
|
free (agent->brig_libraries[i]->image);
|
|
free (agent->brig_libraries[i]);
|
|
}
|
|
|
|
free (agent->brig_libraries);
|
|
}
|
|
|
|
/* Create and finalize the program consisting of all loaded modules. */
|
|
|
|
static void
|
|
create_and_finalize_hsa_program (struct agent_info *agent)
|
|
{
|
|
hsa_status_t status;
|
|
hsa_ext_program_t prog_handle;
|
|
int mi = 0;
|
|
|
|
if (pthread_mutex_lock (&agent->prog_mutex))
|
|
GOMP_PLUGIN_fatal ("Could not lock an HSA agent program mutex");
|
|
if (agent->prog_finalized)
|
|
goto final;
|
|
|
|
status = hsa_ext_program_create (HSA_MACHINE_MODEL_LARGE, HSA_PROFILE_FULL,
|
|
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT,
|
|
NULL, &prog_handle);
|
|
if (status != HSA_STATUS_SUCCESS)
|
|
hsa_fatal ("Could not create an HSA program", status);
|
|
|
|
HSA_DEBUG ("Created a finalized program\n");
|
|
|
|
struct module_info *module = agent->first_module;
|
|
while (module)
|
|
{
|
|
status = hsa_ext_program_add_module (prog_handle,
|
|
module->image_desc->brig_module);
|
|
if (status != HSA_STATUS_SUCCESS)
|
|
hsa_fatal ("Could not add a module to the HSA program", status);
|
|
module = module->next;
|
|
mi++;
|
|
}
|
|
|
|
/* Load all shared libraries. */
|
|
const char *libraries[] = { "libhsamath.so", "libhsastd.so" };
|
|
const unsigned libraries_count = sizeof (libraries) / sizeof (const char *);
|
|
|
|
agent->brig_libraries_count = libraries_count;
|
|
agent->brig_libraries = GOMP_PLUGIN_malloc_cleared
|
|
(sizeof (struct brig_library_info) * libraries_count);
|
|
|
|
for (unsigned i = 0; i < libraries_count; i++)
|
|
{
|
|
struct brig_library_info *library = add_shared_library (libraries[i],
|
|
agent);
|
|
if (library == NULL)
|
|
{
|
|
HSA_WARNING ("Could not open a shared BRIG library: %s\n",
|
|
libraries[i]);
|
|
continue;
|
|
}
|
|
|
|
status = hsa_ext_program_add_module (prog_handle, library->image);
|
|
if (status != HSA_STATUS_SUCCESS)
|
|
hsa_warn ("Could not add a shared BRIG library the HSA program",
|
|
status);
|
|
else
|
|
HSA_DEBUG ("a shared BRIG library has been added to a program: %s\n",
|
|
libraries[i]);
|
|
}
|
|
|
|
hsa_ext_control_directives_t control_directives;
|
|
memset (&control_directives, 0, sizeof (control_directives));
|
|
hsa_code_object_t code_object;
|
|
status = hsa_ext_program_finalize (prog_handle, agent->isa,
|
|
HSA_EXT_FINALIZER_CALL_CONVENTION_AUTO,
|
|
control_directives, "",
|
|
HSA_CODE_OBJECT_TYPE_PROGRAM,
|
|
&code_object);
|
|
if (status != HSA_STATUS_SUCCESS)
|
|
{
|
|
hsa_warn ("Finalization of the HSA program failed", status);
|
|
goto failure;
|
|
}
|
|
|
|
HSA_DEBUG ("Finalization done\n");
|
|
hsa_ext_program_destroy (prog_handle);
|
|
|
|
status
|
|
= hsa_executable_create (HSA_PROFILE_FULL, HSA_EXECUTABLE_STATE_UNFROZEN,
|
|
"", &agent->executable);
|
|
if (status != HSA_STATUS_SUCCESS)
|
|
hsa_fatal ("Could not create HSA executable", status);
|
|
|
|
module = agent->first_module;
|
|
while (module)
|
|
{
|
|
/* Initialize all global variables declared in the module. */
|
|
for (unsigned i = 0; i < module->image_desc->global_variable_count; i++)
|
|
{
|
|
struct global_var_info *var;
|
|
var = &module->image_desc->global_variables[i];
|
|
status
|
|
= hsa_executable_global_variable_define (agent->executable,
|
|
var->name, var->address);
|
|
|
|
HSA_DEBUG ("Defining global variable: %s, address: %p\n", var->name,
|
|
var->address);
|
|
|
|
if (status != HSA_STATUS_SUCCESS)
|
|
hsa_fatal ("Could not define a global variable in the HSA program",
|
|
status);
|
|
}
|
|
|
|
module = module->next;
|
|
}
|
|
|
|
status = hsa_executable_load_code_object (agent->executable, agent->id,
|
|
code_object, "");
|
|
if (status != HSA_STATUS_SUCCESS)
|
|
hsa_fatal ("Could not add a code object to the HSA executable", status);
|
|
status = hsa_executable_freeze (agent->executable, "");
|
|
if (status != HSA_STATUS_SUCCESS)
|
|
hsa_fatal ("Could not freeze the HSA executable", status);
|
|
|
|
HSA_DEBUG ("Froze HSA executable with the finalized code object\n");
|
|
|
|
/* If all goes good, jump to final. */
|
|
goto final;
|
|
|
|
failure:
|
|
agent->prog_finalized_error = true;
|
|
|
|
final:
|
|
agent->prog_finalized = true;
|
|
|
|
if (pthread_mutex_unlock (&agent->prog_mutex))
|
|
GOMP_PLUGIN_fatal ("Could not unlock an HSA agent program mutex");
|
|
}
|
|
|
|
/* Create kernel dispatch data structure for given KERNEL. */
|
|
|
|
static struct GOMP_hsa_kernel_dispatch *
|
|
create_single_kernel_dispatch (struct kernel_info *kernel,
|
|
unsigned omp_data_size)
|
|
{
|
|
struct agent_info *agent = kernel->agent;
|
|
struct GOMP_hsa_kernel_dispatch *shadow
|
|
= GOMP_PLUGIN_malloc_cleared (sizeof (struct GOMP_hsa_kernel_dispatch));
|
|
|
|
shadow->queue = agent->command_q;
|
|
shadow->omp_data_memory
|
|
= omp_data_size > 0 ? GOMP_PLUGIN_malloc (omp_data_size) : NULL;
|
|
unsigned dispatch_count = kernel->dependencies_count;
|
|
shadow->kernel_dispatch_count = dispatch_count;
|
|
|
|
shadow->children_dispatches
|
|
= GOMP_PLUGIN_malloc (dispatch_count * sizeof (shadow));
|
|
|
|
shadow->object = kernel->object;
|
|
|
|
hsa_signal_t sync_signal;
|
|
hsa_status_t status = hsa_signal_create (1, 0, NULL, &sync_signal);
|
|
if (status != HSA_STATUS_SUCCESS)
|
|
hsa_fatal ("Error creating the HSA sync signal", status);
|
|
|
|
shadow->signal = sync_signal.handle;
|
|
shadow->private_segment_size = kernel->private_segment_size;
|
|
shadow->group_segment_size = kernel->group_segment_size;
|
|
|
|
status
|
|
= hsa_memory_allocate (agent->kernarg_region, kernel->kernarg_segment_size,
|
|
&shadow->kernarg_address);
|
|
if (status != HSA_STATUS_SUCCESS)
|
|
hsa_fatal ("Could not allocate memory for HSA kernel arguments", status);
|
|
|
|
return shadow;
|
|
}
|
|
|
|
/* Release data structure created for a kernel dispatch in SHADOW argument. */
|
|
|
|
static void
|
|
release_kernel_dispatch (struct GOMP_hsa_kernel_dispatch *shadow)
|
|
{
|
|
HSA_DEBUG ("Released kernel dispatch: %p has value: %lu (%p)\n", shadow,
|
|
shadow->debug, (void *) shadow->debug);
|
|
|
|
hsa_memory_free (shadow->kernarg_address);
|
|
|
|
hsa_signal_t s;
|
|
s.handle = shadow->signal;
|
|
hsa_signal_destroy (s);
|
|
|
|
free (shadow->omp_data_memory);
|
|
|
|
for (unsigned i = 0; i < shadow->kernel_dispatch_count; i++)
|
|
release_kernel_dispatch (shadow->children_dispatches[i]);
|
|
|
|
free (shadow->children_dispatches);
|
|
free (shadow);
|
|
}
|
|
|
|
/* Initialize a KERNEL without its dependencies. MAX_OMP_DATA_SIZE is used
|
|
to calculate maximum necessary memory for OMP data allocation. */
|
|
|
|
static void
|
|
init_single_kernel (struct kernel_info *kernel, unsigned *max_omp_data_size)
|
|
{
|
|
hsa_status_t status;
|
|
struct agent_info *agent = kernel->agent;
|
|
hsa_executable_symbol_t kernel_symbol;
|
|
status = hsa_executable_get_symbol (agent->executable, NULL, kernel->name,
|
|
agent->id, 0, &kernel_symbol);
|
|
if (status != HSA_STATUS_SUCCESS)
|
|
{
|
|
hsa_warn ("Could not find symbol for kernel in the code object", status);
|
|
goto failure;
|
|
}
|
|
HSA_DEBUG ("Located kernel %s\n", kernel->name);
|
|
status
|
|
= hsa_executable_symbol_get_info (kernel_symbol,
|
|
HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
|
|
&kernel->object);
|
|
if (status != HSA_STATUS_SUCCESS)
|
|
hsa_fatal ("Could not extract a kernel object from its symbol", status);
|
|
status = hsa_executable_symbol_get_info
|
|
(kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
|
|
&kernel->kernarg_segment_size);
|
|
if (status != HSA_STATUS_SUCCESS)
|
|
hsa_fatal ("Could not get info about kernel argument size", status);
|
|
status = hsa_executable_symbol_get_info
|
|
(kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
|
|
&kernel->group_segment_size);
|
|
if (status != HSA_STATUS_SUCCESS)
|
|
hsa_fatal ("Could not get info about kernel group segment size", status);
|
|
status = hsa_executable_symbol_get_info
|
|
(kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
|
|
&kernel->private_segment_size);
|
|
if (status != HSA_STATUS_SUCCESS)
|
|
hsa_fatal ("Could not get info about kernel private segment size",
|
|
status);
|
|
|
|
HSA_DEBUG ("Kernel structure for %s fully initialized with "
|
|
"following segment sizes: \n", kernel->name);
|
|
HSA_DEBUG (" group_segment_size: %u\n",
|
|
(unsigned) kernel->group_segment_size);
|
|
HSA_DEBUG (" private_segment_size: %u\n",
|
|
(unsigned) kernel->private_segment_size);
|
|
HSA_DEBUG (" kernarg_segment_size: %u\n",
|
|
(unsigned) kernel->kernarg_segment_size);
|
|
HSA_DEBUG (" omp_data_size: %u\n", kernel->omp_data_size);
|
|
HSA_DEBUG (" gridified_kernel_p: %u\n", kernel->gridified_kernel_p);
|
|
|
|
if (kernel->omp_data_size > *max_omp_data_size)
|
|
*max_omp_data_size = kernel->omp_data_size;
|
|
|
|
for (unsigned i = 0; i < kernel->dependencies_count; i++)
|
|
{
|
|
struct kernel_info *dependency
|
|
= get_kernel_for_agent (agent, kernel->dependencies[i]);
|
|
|
|
if (dependency == NULL)
|
|
{
|
|
HSA_DEBUG ("Could not find a dependency for a kernel: %s, "
|
|
"dependency name: %s\n", kernel->name,
|
|
kernel->dependencies[i]);
|
|
goto failure;
|
|
}
|
|
|
|
if (dependency->dependencies_count > 0)
|
|
{
|
|
HSA_DEBUG ("HSA does not allow kernel dispatching code with "
|
|
"a depth bigger than one\n")
|
|
goto failure;
|
|
}
|
|
|
|
init_single_kernel (dependency, max_omp_data_size);
|
|
}
|
|
|
|
return;
|
|
|
|
failure:
|
|
kernel->initialization_failed = true;
|
|
}
|
|
|
|
/* Indent stream F by INDENT spaces. */
|
|
|
|
static void
|
|
indent_stream (FILE *f, unsigned indent)
|
|
{
|
|
fprintf (f, "%*s", indent, "");
|
|
}
|
|
|
|
/* Dump kernel DISPATCH data structure and indent it by INDENT spaces. */
|
|
|
|
static void
|
|
print_kernel_dispatch (struct GOMP_hsa_kernel_dispatch *dispatch, unsigned indent)
|
|
{
|
|
indent_stream (stderr, indent);
|
|
fprintf (stderr, "this: %p\n", dispatch);
|
|
indent_stream (stderr, indent);
|
|
fprintf (stderr, "queue: %p\n", dispatch->queue);
|
|
indent_stream (stderr, indent);
|
|
fprintf (stderr, "omp_data_memory: %p\n", dispatch->omp_data_memory);
|
|
indent_stream (stderr, indent);
|
|
fprintf (stderr, "kernarg_address: %p\n", dispatch->kernarg_address);
|
|
indent_stream (stderr, indent);
|
|
fprintf (stderr, "object: %lu\n", dispatch->object);
|
|
indent_stream (stderr, indent);
|
|
fprintf (stderr, "signal: %lu\n", dispatch->signal);
|
|
indent_stream (stderr, indent);
|
|
fprintf (stderr, "private_segment_size: %u\n",
|
|
dispatch->private_segment_size);
|
|
indent_stream (stderr, indent);
|
|
fprintf (stderr, "group_segment_size: %u\n",
|
|
dispatch->group_segment_size);
|
|
indent_stream (stderr, indent);
|
|
fprintf (stderr, "children dispatches: %lu\n",
|
|
dispatch->kernel_dispatch_count);
|
|
indent_stream (stderr, indent);
|
|
fprintf (stderr, "omp_num_threads: %u\n",
|
|
dispatch->omp_num_threads);
|
|
fprintf (stderr, "\n");
|
|
|
|
for (unsigned i = 0; i < dispatch->kernel_dispatch_count; i++)
|
|
print_kernel_dispatch (dispatch->children_dispatches[i], indent + 2);
|
|
}
|
|
|
|
/* Create kernel dispatch data structure for a KERNEL and all its
|
|
dependencies. */
|
|
|
|
static struct GOMP_hsa_kernel_dispatch *
|
|
create_kernel_dispatch (struct kernel_info *kernel, unsigned omp_data_size)
|
|
{
|
|
struct GOMP_hsa_kernel_dispatch *shadow
|
|
= create_single_kernel_dispatch (kernel, omp_data_size);
|
|
shadow->omp_num_threads = 64;
|
|
shadow->debug = 0;
|
|
shadow->omp_level = kernel->gridified_kernel_p ? 1 : 0;
|
|
|
|
/* Create kernel dispatch data structures. We do not allow to have
|
|
a kernel dispatch with depth bigger than one. */
|
|
for (unsigned i = 0; i < kernel->dependencies_count; i++)
|
|
{
|
|
struct kernel_info *dependency
|
|
= get_kernel_for_agent (kernel->agent, kernel->dependencies[i]);
|
|
shadow->children_dispatches[i]
|
|
= create_single_kernel_dispatch (dependency, omp_data_size);
|
|
shadow->children_dispatches[i]->queue
|
|
= kernel->agent->kernel_dispatch_command_q;
|
|
shadow->children_dispatches[i]->omp_level = 1;
|
|
}
|
|
|
|
return shadow;
|
|
}
|
|
|
|
/* Do all the work that is necessary before running KERNEL for the first time.
|
|
The function assumes the program has been created, finalized and frozen by
|
|
create_and_finalize_hsa_program. */
|
|
|
|
static void
|
|
init_kernel (struct kernel_info *kernel)
|
|
{
|
|
if (pthread_mutex_lock (&kernel->init_mutex))
|
|
GOMP_PLUGIN_fatal ("Could not lock an HSA kernel initialization mutex");
|
|
if (kernel->initialized)
|
|
{
|
|
if (pthread_mutex_unlock (&kernel->init_mutex))
|
|
GOMP_PLUGIN_fatal ("Could not unlock an HSA kernel initialization "
|
|
"mutex");
|
|
|
|
return;
|
|
}
|
|
|
|
/* Precomputed maximum size of OMP data necessary for a kernel from kernel
|
|
dispatch operation. */
|
|
init_single_kernel (kernel, &kernel->max_omp_data_size);
|
|
|
|
if (!kernel->initialization_failed)
|
|
HSA_DEBUG ("\n");
|
|
|
|
kernel->initialized = true;
|
|
if (pthread_mutex_unlock (&kernel->init_mutex))
|
|
GOMP_PLUGIN_fatal ("Could not unlock an HSA kernel initialization "
|
|
"mutex");
|
|
}
|
|
|
|
/* Parse the target attributes INPUT provided by the compiler and return true
|
|
if we should run anything all. If INPUT is NULL, fill DEF with default
|
|
values, then store INPUT or DEF into *RESULT. */
|
|
|
|
static bool
|
|
parse_target_attributes (void **input,
|
|
struct GOMP_kernel_launch_attributes *def,
|
|
struct GOMP_kernel_launch_attributes **result)
|
|
{
|
|
if (!input)
|
|
GOMP_PLUGIN_fatal ("No target arguments provided");
|
|
|
|
bool attrs_found = false;
|
|
while (*input)
|
|
{
|
|
uintptr_t id = (uintptr_t) *input;
|
|
if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_HSA
|
|
&& ((id & GOMP_TARGET_ARG_ID_MASK)
|
|
== GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
|
|
{
|
|
input++;
|
|
attrs_found = true;
|
|
break;
|
|
}
|
|
|
|
if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
|
|
input++;
|
|
input++;
|
|
}
|
|
|
|
if (!attrs_found)
|
|
{
|
|
def->ndim = 1;
|
|
def->gdims[0] = 1;
|
|
def->gdims[1] = 1;
|
|
def->gdims[2] = 1;
|
|
def->wdims[0] = 1;
|
|
def->wdims[1] = 1;
|
|
def->wdims[2] = 1;
|
|
*result = def;
|
|
HSA_DEBUG ("GOMP_OFFLOAD_run called with no launch attributes\n");
|
|
return true;
|
|
}
|
|
|
|
struct GOMP_kernel_launch_attributes *kla;
|
|
kla = (struct GOMP_kernel_launch_attributes *) *input;
|
|
*result = kla;
|
|
if (kla->ndim != 1)
|
|
GOMP_PLUGIN_fatal ("HSA does not yet support number of dimensions "
|
|
"different from one.");
|
|
if (kla->gdims[0] == 0)
|
|
return false;
|
|
|
|
HSA_DEBUG ("GOMP_OFFLOAD_run called with grid size %u and group size %u\n",
|
|
kla->gdims[0], kla->wdims[0]);
|
|
|
|
return true;
|
|
}
|
|
|
|
/* Return true if the HSA runtime can run function FN_PTR. */
|
|
|
|
bool
|
|
GOMP_OFFLOAD_can_run (void *fn_ptr)
|
|
{
|
|
struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
|
|
struct agent_info *agent = kernel->agent;
|
|
create_and_finalize_hsa_program (agent);
|
|
|
|
if (agent->prog_finalized_error)
|
|
goto failure;
|
|
|
|
init_kernel (kernel);
|
|
if (kernel->initialization_failed)
|
|
goto failure;
|
|
|
|
return true;
|
|
|
|
failure:
|
|
if (suppress_host_fallback)
|
|
GOMP_PLUGIN_fatal ("HSA host fallback has been suppressed");
|
|
HSA_DEBUG ("HSA target cannot be launched, doing a host fallback\n");
|
|
return false;
|
|
}
|
|
|
|
/* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET. */
|
|
|
|
void
|
|
packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
|
|
{
|
|
__atomic_store_n (packet, header | (rest << 16), __ATOMIC_RELEASE);
|
|
}
|
|
|
|
/* Part of the libgomp plugin interface. Run a kernel on device N and pass it
|
|
an array of pointers in VARS as a parameter. The kernel is identified by
|
|
FN_PTR which must point to a kernel_info structure. */
|
|
|
|
void
|
|
GOMP_OFFLOAD_run (int n, void *fn_ptr, void *vars, void **args)
|
|
{
|
|
struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
|
|
struct agent_info *agent = kernel->agent;
|
|
struct GOMP_kernel_launch_attributes def;
|
|
struct GOMP_kernel_launch_attributes *kla;
|
|
if (!parse_target_attributes (args, &def, &kla))
|
|
{
|
|
HSA_DEBUG ("Will not run HSA kernel because the grid size is zero\n");
|
|
return;
|
|
}
|
|
if (pthread_rwlock_rdlock (&agent->modules_rwlock))
|
|
GOMP_PLUGIN_fatal ("Unable to read-lock an HSA agent rwlock");
|
|
|
|
if (!agent->initialized)
|
|
GOMP_PLUGIN_fatal ("Agent must be initialized");
|
|
|
|
if (!kernel->initialized)
|
|
GOMP_PLUGIN_fatal ("Called kernel must be initialized");
|
|
|
|
struct GOMP_hsa_kernel_dispatch *shadow
|
|
= create_kernel_dispatch (kernel, kernel->max_omp_data_size);
|
|
|
|
if (debug)
|
|
{
|
|
fprintf (stderr, "\nKernel has following dependencies:\n");
|
|
print_kernel_dispatch (shadow, 2);
|
|
}
|
|
|
|
uint64_t index = hsa_queue_add_write_index_release (agent->command_q, 1);
|
|
HSA_DEBUG ("Got AQL index %llu\n", (long long int) index);
|
|
|
|
/* Wait until the queue is not full before writing the packet. */
|
|
while (index - hsa_queue_load_read_index_acquire (agent->command_q)
|
|
>= agent->command_q->size)
|
|
;
|
|
|
|
hsa_kernel_dispatch_packet_t *packet;
|
|
packet = ((hsa_kernel_dispatch_packet_t *) agent->command_q->base_address)
|
|
+ index % agent->command_q->size;
|
|
|
|
memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
|
|
packet->grid_size_x = kla->gdims[0];
|
|
uint32_t wgs = kla->wdims[0];
|
|
if (wgs == 0)
|
|
/* TODO: Provide a default via environment. */
|
|
wgs = 64;
|
|
else if (wgs > kla->gdims[0])
|
|
wgs = kla->gdims[0];
|
|
packet->workgroup_size_x = wgs;
|
|
packet->grid_size_y = 1;
|
|
packet->workgroup_size_y = 1;
|
|
packet->grid_size_z = 1;
|
|
packet->workgroup_size_z = 1;
|
|
packet->private_segment_size = kernel->private_segment_size;
|
|
packet->group_segment_size = kernel->group_segment_size;
|
|
packet->kernel_object = kernel->object;
|
|
packet->kernarg_address = shadow->kernarg_address;
|
|
hsa_signal_t s;
|
|
s.handle = shadow->signal;
|
|
packet->completion_signal = s;
|
|
hsa_signal_store_relaxed (s, 1);
|
|
memcpy (shadow->kernarg_address, &vars, sizeof (vars));
|
|
|
|
/* PR hsa/70337. */
|
|
size_t vars_size = sizeof (vars);
|
|
if (kernel->kernarg_segment_size > vars_size)
|
|
{
|
|
if (kernel->kernarg_segment_size != vars_size
|
|
+ sizeof (struct hsa_kernel_runtime *))
|
|
GOMP_PLUGIN_fatal ("Kernel segment size has an unexpected value");
|
|
memcpy (packet->kernarg_address + vars_size, &shadow,
|
|
sizeof (struct hsa_kernel_runtime *));
|
|
}
|
|
|
|
HSA_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");
|
|
|
|
uint16_t header;
|
|
header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
|
|
header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
|
|
header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
|
|
|
|
HSA_DEBUG ("Going to dispatch kernel %s\n", kernel->name);
|
|
|
|
packet_store_release ((uint32_t *) packet, header,
|
|
1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);
|
|
|
|
hsa_signal_store_release (agent->command_q->doorbell_signal, index);
|
|
|
|
/* TODO: GPU agents in Carrizo APUs cannot properly update L2 cache for
|
|
signal wait and signal load operations on their own and we need to
|
|
periodically call the hsa_signal_load_acquire on completion signals of
|
|
children kernels in the CPU to make that happen. As soon the
|
|
limitation will be resolved, this workaround can be removed. */
|
|
|
|
HSA_DEBUG ("Kernel dispatched, waiting for completion\n");
|
|
|
|
/* Root signal waits with 1ms timeout. */
|
|
while (hsa_signal_wait_acquire (s, HSA_SIGNAL_CONDITION_LT, 1, 1000 * 1000,
|
|
HSA_WAIT_STATE_BLOCKED) != 0)
|
|
for (unsigned i = 0; i < shadow->kernel_dispatch_count; i++)
|
|
{
|
|
hsa_signal_t child_s;
|
|
child_s.handle = shadow->children_dispatches[i]->signal;
|
|
|
|
HSA_DEBUG ("Waiting for children completion signal: %lu\n",
|
|
shadow->children_dispatches[i]->signal);
|
|
hsa_signal_load_acquire (child_s);
|
|
}
|
|
|
|
release_kernel_dispatch (shadow);
|
|
|
|
if (pthread_rwlock_unlock (&agent->modules_rwlock))
|
|
GOMP_PLUGIN_fatal ("Unable to unlock an HSA agent rwlock");
|
|
}
|
|
|
|
/* Information to be passed to a thread running a kernel asycnronously. */
|
|
|
|
struct async_run_info
|
|
{
|
|
int device;
|
|
void *tgt_fn;
|
|
void *tgt_vars;
|
|
void **args;
|
|
void *async_data;
|
|
};
|
|
|
|
/* Thread routine to run a kernel asynchronously. */
|
|
|
|
static void *
|
|
run_kernel_asynchronously (void *thread_arg)
|
|
{
|
|
struct async_run_info *info = (struct async_run_info *) thread_arg;
|
|
int device = info->device;
|
|
void *tgt_fn = info->tgt_fn;
|
|
void *tgt_vars = info->tgt_vars;
|
|
void **args = info->args;
|
|
void *async_data = info->async_data;
|
|
|
|
free (info);
|
|
GOMP_OFFLOAD_run (device, tgt_fn, tgt_vars, args);
|
|
GOMP_PLUGIN_target_task_completion (async_data);
|
|
return NULL;
|
|
}
|
|
|
|
/* Part of the libgomp plugin interface. Run a kernel like GOMP_OFFLOAD_run
|
|
does, but asynchronously and call GOMP_PLUGIN_target_task_completion when it
|
|
has finished. */
|
|
|
|
void
|
|
GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
|
|
void **args, void *async_data)
|
|
{
|
|
pthread_t pt;
|
|
struct async_run_info *info;
|
|
HSA_DEBUG ("GOMP_OFFLOAD_async_run invoked\n")
|
|
info = GOMP_PLUGIN_malloc (sizeof (struct async_run_info));
|
|
|
|
info->device = device;
|
|
info->tgt_fn = tgt_fn;
|
|
info->tgt_vars = tgt_vars;
|
|
info->args = args;
|
|
info->async_data = async_data;
|
|
|
|
int err = pthread_create (&pt, NULL, &run_kernel_asynchronously, info);
|
|
if (err != 0)
|
|
GOMP_PLUGIN_fatal ("HSA asynchronous thread creation failed: %s",
|
|
strerror (err));
|
|
err = pthread_detach (pt);
|
|
if (err != 0)
|
|
GOMP_PLUGIN_fatal ("Failed to detach a thread to run HSA kernel "
|
|
"asynchronously: %s", strerror (err));
|
|
}
|
|
|
|
/* Deinitialize all information associated with MODULE and kernels within
|
|
it. Return TRUE on success. */
|
|
|
|
static bool
|
|
destroy_module (struct module_info *module)
|
|
{
|
|
int i;
|
|
for (i = 0; i < module->kernel_count; i++)
|
|
if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
|
|
{
|
|
GOMP_PLUGIN_error ("Failed to destroy an HSA kernel initialization "
|
|
"mutex");
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
/* Part of the libgomp plugin interface. Unload BRIG module described by
|
|
struct brig_image_desc in TARGET_DATA from agent number N. Return
|
|
TRUE on success. */
|
|
|
|
bool
|
|
GOMP_OFFLOAD_unload_image (int n, unsigned version, void *target_data)
|
|
{
|
|
if (GOMP_VERSION_DEV (version) > GOMP_VERSION_HSA)
|
|
{
|
|
GOMP_PLUGIN_error ("Offload data incompatible with HSA plugin"
|
|
" (expected %u, received %u)",
|
|
GOMP_VERSION_HSA, GOMP_VERSION_DEV (version));
|
|
return false;
|
|
}
|
|
|
|
struct agent_info *agent;
|
|
agent = get_agent_info (n);
|
|
if (!agent)
|
|
return false;
|
|
|
|
if (pthread_rwlock_wrlock (&agent->modules_rwlock))
|
|
{
|
|
GOMP_PLUGIN_error ("Unable to write-lock an HSA agent rwlock");
|
|
return false;
|
|
}
|
|
struct module_info *module = agent->first_module;
|
|
while (module)
|
|
{
|
|
if (module->image_desc == target_data)
|
|
break;
|
|
module = module->next;
|
|
}
|
|
if (!module)
|
|
{
|
|
GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
|
|
"loaded before");
|
|
return false;
|
|
}
|
|
|
|
remove_module_from_agent (agent, module);
|
|
if (!destroy_module (module))
|
|
return false;
|
|
free (module);
|
|
if (!destroy_hsa_program (agent))
|
|
return false;
|
|
if (pthread_rwlock_unlock (&agent->modules_rwlock))
|
|
{
|
|
GOMP_PLUGIN_error ("Unable to unlock an HSA agent rwlock");
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
/* Part of the libgomp plugin interface. Deinitialize all information and
|
|
status associated with agent number N. We do not attempt any
|
|
synchronization, assuming the user and libgomp will not attempt
|
|
deinitialization of a device that is in any way being used at the same
|
|
time. Return TRUE on success. */
|
|
|
|
bool
|
|
GOMP_OFFLOAD_fini_device (int n)
|
|
{
|
|
struct agent_info *agent = get_agent_info (n);
|
|
if (!agent)
|
|
return false;
|
|
|
|
if (!agent->initialized)
|
|
return true;
|
|
|
|
struct module_info *next_module = agent->first_module;
|
|
while (next_module)
|
|
{
|
|
struct module_info *module = next_module;
|
|
next_module = module->next;
|
|
if (!destroy_module (module))
|
|
return false;
|
|
free (module);
|
|
}
|
|
agent->first_module = NULL;
|
|
if (!destroy_hsa_program (agent))
|
|
return false;
|
|
|
|
release_agent_shared_libraries (agent);
|
|
|
|
hsa_status_t status = hsa_queue_destroy (agent->command_q);
|
|
if (status != HSA_STATUS_SUCCESS)
|
|
return hsa_error ("Error destroying command queue", status);
|
|
status = hsa_queue_destroy (agent->kernel_dispatch_command_q);
|
|
if (status != HSA_STATUS_SUCCESS)
|
|
return hsa_error ("Error destroying kernel dispatch command queue", status);
|
|
if (pthread_mutex_destroy (&agent->prog_mutex))
|
|
{
|
|
GOMP_PLUGIN_error ("Failed to destroy an HSA agent program mutex");
|
|
return false;
|
|
}
|
|
if (pthread_rwlock_destroy (&agent->modules_rwlock))
|
|
{
|
|
GOMP_PLUGIN_error ("Failed to destroy an HSA agent rwlock");
|
|
return false;
|
|
}
|
|
agent->initialized = false;
|
|
return true;
|
|
}
|
|
|
|
/* Part of the libgomp plugin interface. Not implemented as it is not required
|
|
for HSA. */
|
|
|
|
void *
|
|
GOMP_OFFLOAD_alloc (int ord, size_t size)
|
|
{
|
|
GOMP_PLUGIN_error ("HSA GOMP_OFFLOAD_alloc is not implemented because "
|
|
"it should never be called");
|
|
return NULL;
|
|
}
|
|
|
|
/* Part of the libgomp plugin interface. Not implemented as it is not required
|
|
for HSA. */
|
|
|
|
bool
|
|
GOMP_OFFLOAD_free (int ord, void *ptr)
|
|
{
|
|
GOMP_PLUGIN_error ("HSA GOMP_OFFLOAD_free is not implemented because "
|
|
"it should never be called");
|
|
return false;
|
|
}
|
|
|
|
/* Part of the libgomp plugin interface. Not implemented as it is not required
|
|
for HSA. */
|
|
|
|
bool
|
|
GOMP_OFFLOAD_dev2host (int ord, void *dst, const void *src, size_t n)
|
|
{
|
|
GOMP_PLUGIN_error ("HSA GOMP_OFFLOAD_dev2host is not implemented because "
|
|
"it should never be called");
|
|
return false;
|
|
}
|
|
|
|
/* Part of the libgomp plugin interface. Not implemented as it is not required
|
|
for HSA. */
|
|
|
|
bool
|
|
GOMP_OFFLOAD_host2dev (int ord, void *dst, const void *src, size_t n)
|
|
{
|
|
GOMP_PLUGIN_error ("HSA GOMP_OFFLOAD_host2dev is not implemented because "
|
|
"it should never be called");
|
|
return false;
|
|
}
|
|
|
|
/* Part of the libgomp plugin interface. Not implemented as it is not required
|
|
for HSA. */
|
|
|
|
bool
|
|
GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n)
|
|
{
|
|
GOMP_PLUGIN_error ("HSA GOMP_OFFLOAD_dev2dev is not implemented because "
|
|
"it should never be called");
|
|
return false;
|
|
}
|