/* Copyright (C) 2013-2015 Free Software Foundation, Inc. Contributed by Jakub Jelinek . This file is part of the GNU Offloading and Multi Processing Library (libgomp). Libgomp is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3, or (at your option) any later version. Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. Under Section 7 of GPL version 3, you are granted additional permissions described in the GCC Runtime Library Exception, version 3.1, as published by the Free Software Foundation. You should have received a copy of the GNU General Public License and a copy of the GCC Runtime Library Exception along with this program; see the files COPYING3 and COPYING.RUNTIME respectively. If not, see . */ /* This file contains the support of offloading. */ #include "config.h" #include "libgomp.h" #include "oacc-plugin.h" #include "oacc-int.h" #include "gomp-constants.h" #include #include #include #ifdef HAVE_INTTYPES_H # include /* For PRIu64. */ #endif #include #include #ifdef PLUGIN_SUPPORT #include #include "plugin-suffix.h" #endif static void gomp_target_init (void); /* The whole initialization code for offloading plugins is only run one. */ static pthread_once_t gomp_is_initialized = PTHREAD_ONCE_INIT; /* Mutex for offload image registration. */ static gomp_mutex_t register_lock; /* This structure describes an offload image. It contains type of the target device, pointer to host table descriptor, and pointer to target data. */ struct offload_image_descr { enum offload_target_type type; const void *host_table; const void *target_data; }; /* Array of descriptors of offload images. */ static struct offload_image_descr *offload_images; /* Total number of offload images. */ static int num_offload_images; /* Array of descriptors for all available devices. */ static struct gomp_device_descr *devices; /* Total number of available devices. */ static int num_devices; /* Number of GOMP_OFFLOAD_CAP_OPENMP_400 devices. */ static int num_devices_openmp; /* Similar to gomp_realloc, but release register_lock before gomp_fatal. */ static void * gomp_realloc_unlock (void *old, size_t size) { void *ret = realloc (old, size); if (ret == NULL) { gomp_mutex_unlock (®ister_lock); gomp_fatal ("Out of memory allocating %lu bytes", (unsigned long) size); } return ret; } /* The comparison function. */ attribute_hidden int splay_compare (splay_tree_key x, splay_tree_key y) { if (x->host_start == x->host_end && y->host_start == y->host_end) return 0; if (x->host_end <= y->host_start) return -1; if (x->host_start >= y->host_end) return 1; return 0; } #include "splay-tree.h" attribute_hidden void gomp_init_targets_once (void) { (void) pthread_once (&gomp_is_initialized, gomp_target_init); } attribute_hidden int gomp_get_num_devices (void) { gomp_init_targets_once (); return num_devices_openmp; } static struct gomp_device_descr * resolve_device (int device_id) { if (device_id == GOMP_DEVICE_ICV) { struct gomp_task_icv *icv = gomp_icv (false); device_id = icv->default_device_var; } if (device_id < 0 || device_id >= gomp_get_num_devices ()) return NULL; return &devices[device_id]; } /* Handle the case where splay_tree_lookup found oldn for newn. Helper function of gomp_map_vars. */ static inline void gomp_map_vars_existing (struct gomp_device_descr *devicep, splay_tree_key oldn, splay_tree_key newn, unsigned char kind) { if ((kind & GOMP_MAP_FLAG_FORCE) || oldn->host_start > newn->host_start || oldn->host_end < newn->host_end) { gomp_mutex_unlock (&devicep->lock); gomp_fatal ("Trying to map into device [%p..%p) object when " "[%p..%p) is already mapped", (void *) newn->host_start, (void *) newn->host_end, (void *) oldn->host_start, (void *) oldn->host_end); } oldn->refcount++; } static int get_kind (bool is_openacc, void *kinds, int idx) { return is_openacc ? ((unsigned short *) kinds)[idx] : ((unsigned char *) kinds)[idx]; } static void gomp_map_pointer (struct target_mem_desc *tgt, uintptr_t host_ptr, uintptr_t target_offset, uintptr_t bias) { struct gomp_device_descr *devicep = tgt->device_descr; struct splay_tree_s *mem_map = &devicep->mem_map; struct splay_tree_key_s cur_node; cur_node.host_start = host_ptr; if (cur_node.host_start == (uintptr_t) NULL) { cur_node.tgt_offset = (uintptr_t) NULL; /* FIXME: see comment about coalescing host/dev transfers below. */ devicep->host2dev_func (devicep->target_id, (void *) (tgt->tgt_start + target_offset), (void *) &cur_node.tgt_offset, sizeof (void *)); return; } /* Add bias to the pointer value. */ cur_node.host_start += bias; cur_node.host_end = cur_node.host_start + 1; splay_tree_key n = splay_tree_lookup (mem_map, &cur_node); if (n == NULL) { /* Could be possibly zero size array section. */ cur_node.host_end--; n = splay_tree_lookup (mem_map, &cur_node); if (n == NULL) { cur_node.host_start--; n = splay_tree_lookup (mem_map, &cur_node); cur_node.host_start++; } } if (n == NULL) { gomp_mutex_unlock (&devicep->lock); gomp_fatal ("Pointer target of array section wasn't mapped"); } cur_node.host_start -= n->host_start; cur_node.tgt_offset = n->tgt->tgt_start + n->tgt_offset + cur_node.host_start; /* At this point tgt_offset is target address of the array section. Now subtract bias to get what we want to initialize the pointer with. */ cur_node.tgt_offset -= bias; /* FIXME: see comment about coalescing host/dev transfers below. */ devicep->host2dev_func (devicep->target_id, (void *) (tgt->tgt_start + target_offset), (void *) &cur_node.tgt_offset, sizeof (void *)); } attribute_hidden struct target_mem_desc * gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum, void **hostaddrs, void **devaddrs, size_t *sizes, void *kinds, bool is_openacc, bool is_target) { size_t i, tgt_align, tgt_size, not_found_cnt = 0; const int rshift = is_openacc ? 8 : 3; const int typemask = is_openacc ? 0xff : 0x7; struct splay_tree_s *mem_map = &devicep->mem_map; struct splay_tree_key_s cur_node; struct target_mem_desc *tgt = gomp_malloc (sizeof (*tgt) + sizeof (tgt->list[0]) * mapnum); tgt->list_count = mapnum; tgt->refcount = 1; tgt->device_descr = devicep; if (mapnum == 0) return tgt; tgt_align = sizeof (void *); tgt_size = 0; if (is_target) { size_t align = 4 * sizeof (void *); tgt_align = align; tgt_size = mapnum * sizeof (void *); } gomp_mutex_lock (&devicep->lock); for (i = 0; i < mapnum; i++) { int kind = get_kind (is_openacc, kinds, i); if (hostaddrs[i] == NULL) { tgt->list[i] = NULL; continue; } cur_node.host_start = (uintptr_t) hostaddrs[i]; if (!GOMP_MAP_POINTER_P (kind & typemask)) cur_node.host_end = cur_node.host_start + sizes[i]; else cur_node.host_end = cur_node.host_start + sizeof (void *); splay_tree_key n = splay_tree_lookup (mem_map, &cur_node); if (n) { tgt->list[i] = n; gomp_map_vars_existing (devicep, n, &cur_node, kind & typemask); } else { tgt->list[i] = NULL; size_t align = (size_t) 1 << (kind >> rshift); not_found_cnt++; if (tgt_align < align) tgt_align = align; tgt_size = (tgt_size + align - 1) & ~(align - 1); tgt_size += cur_node.host_end - cur_node.host_start; if ((kind & typemask) == GOMP_MAP_TO_PSET) { size_t j; for (j = i + 1; j < mapnum; j++) if (!GOMP_MAP_POINTER_P (get_kind (is_openacc, kinds, j) & typemask)) break; else if ((uintptr_t) hostaddrs[j] < cur_node.host_start || ((uintptr_t) hostaddrs[j] + sizeof (void *) > cur_node.host_end)) break; else { tgt->list[j] = NULL; i++; } } } } if (devaddrs) { if (mapnum != 1) { gomp_mutex_unlock (&devicep->lock); gomp_fatal ("unexpected aggregation"); } tgt->to_free = devaddrs[0]; tgt->tgt_start = (uintptr_t) tgt->to_free; tgt->tgt_end = tgt->tgt_start + sizes[0]; } else if (not_found_cnt || is_target) { /* Allocate tgt_align aligned tgt_size block of memory. */ /* FIXME: Perhaps change interface to allocate properly aligned memory. */ tgt->to_free = devicep->alloc_func (devicep->target_id, tgt_size + tgt_align - 1); tgt->tgt_start = (uintptr_t) tgt->to_free; tgt->tgt_start = (tgt->tgt_start + tgt_align - 1) & ~(tgt_align - 1); tgt->tgt_end = tgt->tgt_start + tgt_size; } else { tgt->to_free = NULL; tgt->tgt_start = 0; tgt->tgt_end = 0; } tgt_size = 0; if (is_target) tgt_size = mapnum * sizeof (void *); tgt->array = NULL; if (not_found_cnt) { tgt->array = gomp_malloc (not_found_cnt * sizeof (*tgt->array)); splay_tree_node array = tgt->array; size_t j; for (i = 0; i < mapnum; i++) if (tgt->list[i] == NULL) { int kind = get_kind (is_openacc, kinds, i); if (hostaddrs[i] == NULL) continue; splay_tree_key k = &array->key; k->host_start = (uintptr_t) hostaddrs[i]; if (!GOMP_MAP_POINTER_P (kind & typemask)) k->host_end = k->host_start + sizes[i]; else k->host_end = k->host_start + sizeof (void *); splay_tree_key n = splay_tree_lookup (mem_map, k); if (n) { tgt->list[i] = n; gomp_map_vars_existing (devicep, n, k, kind & typemask); } else { size_t align = (size_t) 1 << (kind >> rshift); tgt->list[i] = k; tgt_size = (tgt_size + align - 1) & ~(align - 1); k->tgt = tgt; k->tgt_offset = tgt_size; tgt_size += k->host_end - k->host_start; k->copy_from = GOMP_MAP_COPY_FROM_P (kind & typemask); k->refcount = 1; k->async_refcount = 0; tgt->refcount++; array->left = NULL; array->right = NULL; splay_tree_insert (mem_map, array); switch (kind & typemask) { case GOMP_MAP_ALLOC: case GOMP_MAP_FROM: case GOMP_MAP_FORCE_ALLOC: case GOMP_MAP_FORCE_FROM: break; case GOMP_MAP_TO: case GOMP_MAP_TOFROM: case GOMP_MAP_FORCE_TO: case GOMP_MAP_FORCE_TOFROM: /* FIXME: Perhaps add some smarts, like if copying several adjacent fields from host to target, use some host buffer to avoid sending each var individually. */ devicep->host2dev_func (devicep->target_id, (void *) (tgt->tgt_start + k->tgt_offset), (void *) k->host_start, k->host_end - k->host_start); break; case GOMP_MAP_POINTER: gomp_map_pointer (tgt, (uintptr_t) *(void **) k->host_start, k->tgt_offset, sizes[i]); break; case GOMP_MAP_TO_PSET: /* FIXME: see above FIXME comment. */ devicep->host2dev_func (devicep->target_id, (void *) (tgt->tgt_start + k->tgt_offset), (void *) k->host_start, k->host_end - k->host_start); for (j = i + 1; j < mapnum; j++) if (!GOMP_MAP_POINTER_P (get_kind (is_openacc, kinds, j) & typemask)) break; else if ((uintptr_t) hostaddrs[j] < k->host_start || ((uintptr_t) hostaddrs[j] + sizeof (void *) > k->host_end)) break; else { tgt->list[j] = k; k->refcount++; gomp_map_pointer (tgt, (uintptr_t) *(void **) hostaddrs[j], k->tgt_offset + ((uintptr_t) hostaddrs[j] - k->host_start), sizes[j]); i++; } break; case GOMP_MAP_FORCE_PRESENT: { /* We already looked up the memory region above and it was missing. */ size_t size = k->host_end - k->host_start; gomp_mutex_unlock (&devicep->lock); #ifdef HAVE_INTTYPES_H gomp_fatal ("present clause: !acc_is_present (%p, " "%"PRIu64" (0x%"PRIx64"))", (void *) k->host_start, (uint64_t) size, (uint64_t) size); #else gomp_fatal ("present clause: !acc_is_present (%p, " "%lu (0x%lx))", (void *) k->host_start, (unsigned long) size, (unsigned long) size); #endif } break; case GOMP_MAP_FORCE_DEVICEPTR: assert (k->host_end - k->host_start == sizeof (void *)); devicep->host2dev_func (devicep->target_id, (void *) (tgt->tgt_start + k->tgt_offset), (void *) k->host_start, sizeof (void *)); break; default: gomp_mutex_unlock (&devicep->lock); gomp_fatal ("%s: unhandled kind 0x%.2x", __FUNCTION__, kind); } array++; } } } if (is_target) { for (i = 0; i < mapnum; i++) { if (tgt->list[i] == NULL) cur_node.tgt_offset = (uintptr_t) NULL; else cur_node.tgt_offset = tgt->list[i]->tgt->tgt_start + tgt->list[i]->tgt_offset; /* FIXME: see above FIXME comment. */ devicep->host2dev_func (devicep->target_id, (void *) (tgt->tgt_start + i * sizeof (void *)), (void *) &cur_node.tgt_offset, sizeof (void *)); } } gomp_mutex_unlock (&devicep->lock); return tgt; } static void gomp_unmap_tgt (struct target_mem_desc *tgt) { /* Deallocate on target the tgt->tgt_start .. tgt->tgt_end region. */ if (tgt->tgt_end) tgt->device_descr->free_func (tgt->device_descr->target_id, tgt->to_free); free (tgt->array); free (tgt); } /* Decrease the refcount for a set of mapped variables, and queue asychronous copies from the device back to the host after any work that has been issued. Because the regions are still "live", increment an asynchronous reference count to indicate that they should not be unmapped from host-side data structures until the asynchronous copy has completed. */ attribute_hidden void gomp_copy_from_async (struct target_mem_desc *tgt) { struct gomp_device_descr *devicep = tgt->device_descr; size_t i; gomp_mutex_lock (&devicep->lock); for (i = 0; i < tgt->list_count; i++) if (tgt->list[i] == NULL) ; else if (tgt->list[i]->refcount > 1) { tgt->list[i]->refcount--; tgt->list[i]->async_refcount++; } else { splay_tree_key k = tgt->list[i]; if (k->copy_from) devicep->dev2host_func (devicep->target_id, (void *) k->host_start, (void *) (k->tgt->tgt_start + k->tgt_offset), k->host_end - k->host_start); } gomp_mutex_unlock (&devicep->lock); } /* Unmap variables described by TGT. If DO_COPYFROM is true, copy relevant variables back from device to host: if it is false, it is assumed that this has been done already, i.e. by gomp_copy_from_async above. */ attribute_hidden void gomp_unmap_vars (struct target_mem_desc *tgt, bool do_copyfrom) { struct gomp_device_descr *devicep = tgt->device_descr; if (tgt->list_count == 0) { free (tgt); return; } gomp_mutex_lock (&devicep->lock); size_t i; for (i = 0; i < tgt->list_count; i++) if (tgt->list[i] == NULL) ; else if (tgt->list[i]->refcount > 1) tgt->list[i]->refcount--; else if (tgt->list[i]->async_refcount > 0) tgt->list[i]->async_refcount--; else { splay_tree_key k = tgt->list[i]; if (k->copy_from && do_copyfrom) devicep->dev2host_func (devicep->target_id, (void *) k->host_start, (void *) (k->tgt->tgt_start + k->tgt_offset), k->host_end - k->host_start); splay_tree_remove (&devicep->mem_map, k); if (k->tgt->refcount > 1) k->tgt->refcount--; else gomp_unmap_tgt (k->tgt); } if (tgt->refcount > 1) tgt->refcount--; else gomp_unmap_tgt (tgt); gomp_mutex_unlock (&devicep->lock); } static void gomp_update (struct gomp_device_descr *devicep, size_t mapnum, void **hostaddrs, size_t *sizes, void *kinds, bool is_openacc) { size_t i; struct splay_tree_key_s cur_node; const int typemask = is_openacc ? 0xff : 0x7; if (!devicep) return; if (mapnum == 0) return; gomp_mutex_lock (&devicep->lock); for (i = 0; i < mapnum; i++) if (sizes[i]) { cur_node.host_start = (uintptr_t) hostaddrs[i]; cur_node.host_end = cur_node.host_start + sizes[i]; splay_tree_key n = splay_tree_lookup (&devicep->mem_map, &cur_node); if (n) { int kind = get_kind (is_openacc, kinds, i); if (n->host_start > cur_node.host_start || n->host_end < cur_node.host_end) { gomp_mutex_unlock (&devicep->lock); gomp_fatal ("Trying to update [%p..%p) object when " "only [%p..%p) is mapped", (void *) cur_node.host_start, (void *) cur_node.host_end, (void *) n->host_start, (void *) n->host_end); } if (GOMP_MAP_COPY_TO_P (kind & typemask)) devicep->host2dev_func (devicep->target_id, (void *) (n->tgt->tgt_start + n->tgt_offset + cur_node.host_start - n->host_start), (void *) cur_node.host_start, cur_node.host_end - cur_node.host_start); if (GOMP_MAP_COPY_FROM_P (kind & typemask)) devicep->dev2host_func (devicep->target_id, (void *) cur_node.host_start, (void *) (n->tgt->tgt_start + n->tgt_offset + cur_node.host_start - n->host_start), cur_node.host_end - cur_node.host_start); } else { gomp_mutex_unlock (&devicep->lock); gomp_fatal ("Trying to update [%p..%p) object that is not mapped", (void *) cur_node.host_start, (void *) cur_node.host_end); } } gomp_mutex_unlock (&devicep->lock); } /* Load image pointed by TARGET_DATA to the device, specified by DEVICEP. And insert to splay tree the mapping between addresses from HOST_TABLE and from loaded target image. We rely in the host and device compiler emitting variable and functions in the same order. */ static void gomp_load_image_to_device (struct gomp_device_descr *devicep, const void *host_table, const void *target_data, bool is_register_lock) { void **host_func_table = ((void ***) host_table)[0]; void **host_funcs_end = ((void ***) host_table)[1]; void **host_var_table = ((void ***) host_table)[2]; void **host_vars_end = ((void ***) host_table)[3]; /* The func table contains only addresses, the var table contains addresses and corresponding sizes. */ int num_funcs = host_funcs_end - host_func_table; int num_vars = (host_vars_end - host_var_table) / 2; /* Load image to device and get target addresses for the image. */ struct addr_pair *target_table = NULL; int i, num_target_entries = devicep->load_image_func (devicep->target_id, target_data, &target_table); if (num_target_entries != num_funcs + num_vars) { gomp_mutex_unlock (&devicep->lock); if (is_register_lock) gomp_mutex_unlock (®ister_lock); gomp_fatal ("Can't map target functions or variables"); } /* Insert host-target address mapping into splay tree. */ struct target_mem_desc *tgt = gomp_malloc (sizeof (*tgt)); tgt->array = gomp_malloc ((num_funcs + num_vars) * sizeof (*tgt->array)); tgt->refcount = 1; tgt->tgt_start = 0; tgt->tgt_end = 0; tgt->to_free = NULL; tgt->prev = NULL; tgt->list_count = 0; tgt->device_descr = devicep; splay_tree_node array = tgt->array; for (i = 0; i < num_funcs; i++) { splay_tree_key k = &array->key; k->host_start = (uintptr_t) host_func_table[i]; k->host_end = k->host_start + 1; k->tgt = tgt; k->tgt_offset = target_table[i].start; k->refcount = 1; k->async_refcount = 0; k->copy_from = false; array->left = NULL; array->right = NULL; splay_tree_insert (&devicep->mem_map, array); array++; } for (i = 0; i < num_vars; i++) { struct addr_pair *target_var = &target_table[num_funcs + i]; if (target_var->end - target_var->start != (uintptr_t) host_var_table[i * 2 + 1]) { gomp_mutex_unlock (&devicep->lock); if (is_register_lock) gomp_mutex_unlock (®ister_lock); gomp_fatal ("Can't map target variables (size mismatch)"); } splay_tree_key k = &array->key; k->host_start = (uintptr_t) host_var_table[i * 2]; k->host_end = k->host_start + (uintptr_t) host_var_table[i * 2 + 1]; k->tgt = tgt; k->tgt_offset = target_var->start; k->refcount = 1; k->async_refcount = 0; k->copy_from = false; array->left = NULL; array->right = NULL; splay_tree_insert (&devicep->mem_map, array); array++; } free (target_table); } /* Unload the mappings described by target_data from device DEVICE_P. The device must be locked. */ static void gomp_unload_image_from_device (struct gomp_device_descr *devicep, const void *host_table, const void *target_data) { void **host_func_table = ((void ***) host_table)[0]; void **host_funcs_end = ((void ***) host_table)[1]; void **host_var_table = ((void ***) host_table)[2]; void **host_vars_end = ((void ***) host_table)[3]; /* The func table contains only addresses, the var table contains addresses and corresponding sizes. */ int num_funcs = host_funcs_end - host_func_table; int num_vars = (host_vars_end - host_var_table) / 2; unsigned j; struct splay_tree_key_s k; splay_tree_key node = NULL; /* Find mapping at start of node array */ if (num_funcs || num_vars) { k.host_start = (num_funcs ? (uintptr_t) host_func_table[0] : (uintptr_t) host_var_table[0]); k.host_end = k.host_start + 1; node = splay_tree_lookup (&devicep->mem_map, &k); } devicep->unload_image_func (devicep->target_id, target_data); /* Remove mappings from splay tree. */ for (j = 0; j < num_funcs; j++) { k.host_start = (uintptr_t) host_func_table[j]; k.host_end = k.host_start + 1; splay_tree_remove (&devicep->mem_map, &k); } for (j = 0; j < num_vars; j++) { k.host_start = (uintptr_t) host_var_table[j * 2]; k.host_end = k.host_start + (uintptr_t) host_var_table[j * 2 + 1]; splay_tree_remove (&devicep->mem_map, &k); } if (node) { free (node->tgt); free (node); } } /* This function should be called from every offload image while loading. It gets the descriptor of the host func and var tables HOST_TABLE, TYPE of the target, and TARGET_DATA needed by target plugin. */ void GOMP_offload_register (const void *host_table, int target_type, const void *target_data) { int i; gomp_mutex_lock (®ister_lock); /* Load image to all initialized devices. */ for (i = 0; i < num_devices; i++) { struct gomp_device_descr *devicep = &devices[i]; gomp_mutex_lock (&devicep->lock); if (devicep->type == target_type && devicep->is_initialized) gomp_load_image_to_device (devicep, host_table, target_data, true); gomp_mutex_unlock (&devicep->lock); } /* Insert image to array of pending images. */ offload_images = gomp_realloc_unlock (offload_images, (num_offload_images + 1) * sizeof (struct offload_image_descr)); offload_images[num_offload_images].type = target_type; offload_images[num_offload_images].host_table = host_table; offload_images[num_offload_images].target_data = target_data; num_offload_images++; gomp_mutex_unlock (®ister_lock); } /* This function should be called from every offload image while unloading. It gets the descriptor of the host func and var tables HOST_TABLE, TYPE of the target, and TARGET_DATA needed by target plugin. */ void GOMP_offload_unregister (const void *host_table, int target_type, const void *target_data) { int i; gomp_mutex_lock (®ister_lock); /* Unload image from all initialized devices. */ for (i = 0; i < num_devices; i++) { struct gomp_device_descr *devicep = &devices[i]; gomp_mutex_lock (&devicep->lock); if (devicep->type == target_type && devicep->is_initialized) gomp_unload_image_from_device (devicep, host_table, target_data); gomp_mutex_unlock (&devicep->lock); } /* Remove image from array of pending images. */ for (i = 0; i < num_offload_images; i++) if (offload_images[i].target_data == target_data) { offload_images[i] = offload_images[--num_offload_images]; break; } gomp_mutex_unlock (®ister_lock); } /* This function initializes the target device, specified by DEVICEP. DEVICEP must be locked on entry, and remains locked on return. */ attribute_hidden void gomp_init_device (struct gomp_device_descr *devicep) { int i; devicep->init_device_func (devicep->target_id); /* Load to device all images registered by the moment. */ for (i = 0; i < num_offload_images; i++) { struct offload_image_descr *image = &offload_images[i]; if (image->type == devicep->type) gomp_load_image_to_device (devicep, image->host_table, image->target_data, false); } devicep->is_initialized = true; } attribute_hidden void gomp_unload_device (struct gomp_device_descr *devicep) { if (devicep->is_initialized) { unsigned i; /* Unload from device all images registered at the moment. */ for (i = 0; i < num_offload_images; i++) { struct offload_image_descr *image = &offload_images[i]; if (image->type == devicep->type) gomp_unload_image_from_device (devicep, image->host_table, image->target_data); } } } /* Free address mapping tables. MM must be locked on entry, and remains locked on return. */ attribute_hidden void gomp_free_memmap (struct splay_tree_s *mem_map) { while (mem_map->root) { struct target_mem_desc *tgt = mem_map->root->key.tgt; splay_tree_remove (mem_map, &mem_map->root->key); free (tgt->array); free (tgt); } } /* This function de-initializes the target device, specified by DEVICEP. DEVICEP must be locked on entry, and remains locked on return. */ attribute_hidden void gomp_fini_device (struct gomp_device_descr *devicep) { if (devicep->is_initialized) devicep->fini_device_func (devicep->target_id); devicep->is_initialized = false; } /* Called when encountering a target directive. If DEVICE is GOMP_DEVICE_ICV, it means use device-var ICV. If it is GOMP_DEVICE_HOST_FALLBACK (or any value larger than last available hw device), use host fallback. FN is address of host code, UNUSED is part of the current ABI, but we're not actually using it. HOSTADDRS, SIZES and KINDS are arrays with MAPNUM entries, with addresses of the host objects, sizes of the host objects (resp. for pointer kind pointer bias and assumed sizeof (void *) size) and kinds. */ void GOMP_target (int device, void (*fn) (void *), const void *unused, size_t mapnum, void **hostaddrs, size_t *sizes, unsigned char *kinds) { struct gomp_device_descr *devicep = resolve_device (device); if (devicep == NULL || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)) { /* Host fallback. */ struct gomp_thread old_thr, *thr = gomp_thread (); old_thr = *thr; memset (thr, '\0', sizeof (*thr)); if (gomp_places_list) { thr->place = old_thr.place; thr->ts.place_partition_len = gomp_places_list_len; } fn (hostaddrs); gomp_free_thread (thr); *thr = old_thr; return; } gomp_mutex_lock (&devicep->lock); if (!devicep->is_initialized) gomp_init_device (devicep); gomp_mutex_unlock (&devicep->lock); void *fn_addr; if (devicep->capabilities & GOMP_OFFLOAD_CAP_NATIVE_EXEC) fn_addr = (void *) fn; else { gomp_mutex_lock (&devicep->lock); struct splay_tree_key_s k; k.host_start = (uintptr_t) fn; k.host_end = k.host_start + 1; splay_tree_key tgt_fn = splay_tree_lookup (&devicep->mem_map, &k); if (tgt_fn == NULL) { gomp_mutex_unlock (&devicep->lock); gomp_fatal ("Target function wasn't mapped"); } gomp_mutex_unlock (&devicep->lock); fn_addr = (void *) tgt_fn->tgt_offset; } struct target_mem_desc *tgt_vars = gomp_map_vars (devicep, mapnum, hostaddrs, NULL, sizes, kinds, false, true); struct gomp_thread old_thr, *thr = gomp_thread (); old_thr = *thr; memset (thr, '\0', sizeof (*thr)); if (gomp_places_list) { thr->place = old_thr.place; thr->ts.place_partition_len = gomp_places_list_len; } devicep->run_func (devicep->target_id, fn_addr, (void *) tgt_vars->tgt_start); gomp_free_thread (thr); *thr = old_thr; gomp_unmap_vars (tgt_vars, true); } void GOMP_target_data (int device, const void *unused, size_t mapnum, void **hostaddrs, size_t *sizes, unsigned char *kinds) { struct gomp_device_descr *devicep = resolve_device (device); if (devicep == NULL || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)) { /* Host fallback. */ struct gomp_task_icv *icv = gomp_icv (false); if (icv->target_data) { /* Even when doing a host fallback, if there are any active #pragma omp target data constructs, need to remember the new #pragma omp target data, otherwise GOMP_target_end_data would get out of sync. */ struct target_mem_desc *tgt = gomp_map_vars (NULL, 0, NULL, NULL, NULL, NULL, false, false); tgt->prev = icv->target_data; icv->target_data = tgt; } return; } gomp_mutex_lock (&devicep->lock); if (!devicep->is_initialized) gomp_init_device (devicep); gomp_mutex_unlock (&devicep->lock); struct target_mem_desc *tgt = gomp_map_vars (devicep, mapnum, hostaddrs, NULL, sizes, kinds, false, false); struct gomp_task_icv *icv = gomp_icv (true); tgt->prev = icv->target_data; icv->target_data = tgt; } void GOMP_target_end_data (void) { struct gomp_task_icv *icv = gomp_icv (false); if (icv->target_data) { struct target_mem_desc *tgt = icv->target_data; icv->target_data = tgt->prev; gomp_unmap_vars (tgt, true); } } void GOMP_target_update (int device, const void *unused, size_t mapnum, void **hostaddrs, size_t *sizes, unsigned char *kinds) { struct gomp_device_descr *devicep = resolve_device (device); if (devicep == NULL || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)) return; gomp_mutex_lock (&devicep->lock); if (!devicep->is_initialized) gomp_init_device (devicep); gomp_mutex_unlock (&devicep->lock); gomp_update (devicep, mapnum, hostaddrs, sizes, kinds, false); } void GOMP_teams (unsigned int num_teams, unsigned int thread_limit) { if (thread_limit) { struct gomp_task_icv *icv = gomp_icv (true); icv->thread_limit_var = thread_limit > INT_MAX ? UINT_MAX : thread_limit; } (void) num_teams; } #ifdef PLUGIN_SUPPORT /* This function tries to load a plugin for DEVICE. Name of plugin is passed in PLUGIN_NAME. The handles of the found functions are stored in the corresponding fields of DEVICE. The function returns TRUE on success and FALSE otherwise. */ static bool gomp_load_plugin_for_device (struct gomp_device_descr *device, const char *plugin_name) { const char *err = NULL, *last_missing = NULL; int optional_present, optional_total; /* Clear any existing error. */ dlerror (); void *plugin_handle = dlopen (plugin_name, RTLD_LAZY); if (!plugin_handle) { err = dlerror (); goto out; } /* Check if all required functions are available in the plugin and store their handlers. */ #define DLSYM(f) \ do \ { \ device->f##_func = dlsym (plugin_handle, "GOMP_OFFLOAD_" #f); \ err = dlerror (); \ if (err != NULL) \ goto out; \ } \ while (0) /* Similar, but missing functions are not an error. */ #define DLSYM_OPT(f, n) \ do \ { \ const char *tmp_err; \ device->f##_func = dlsym (plugin_handle, "GOMP_OFFLOAD_" #n); \ tmp_err = dlerror (); \ if (tmp_err == NULL) \ optional_present++; \ else \ last_missing = #n; \ optional_total++; \ } \ while (0) DLSYM (get_name); DLSYM (get_caps); DLSYM (get_type); DLSYM (get_num_devices); DLSYM (init_device); DLSYM (fini_device); DLSYM (load_image); DLSYM (unload_image); DLSYM (alloc); DLSYM (free); DLSYM (dev2host); DLSYM (host2dev); device->capabilities = device->get_caps_func (); if (device->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400) DLSYM (run); if (device->capabilities & GOMP_OFFLOAD_CAP_OPENACC_200) { optional_present = optional_total = 0; DLSYM_OPT (openacc.exec, openacc_parallel); DLSYM_OPT (openacc.register_async_cleanup, openacc_register_async_cleanup); DLSYM_OPT (openacc.async_test, openacc_async_test); DLSYM_OPT (openacc.async_test_all, openacc_async_test_all); DLSYM_OPT (openacc.async_wait, openacc_async_wait); DLSYM_OPT (openacc.async_wait_async, openacc_async_wait_async); DLSYM_OPT (openacc.async_wait_all, openacc_async_wait_all); DLSYM_OPT (openacc.async_wait_all_async, openacc_async_wait_all_async); DLSYM_OPT (openacc.async_set_async, openacc_async_set_async); DLSYM_OPT (openacc.create_thread_data, openacc_create_thread_data); DLSYM_OPT (openacc.destroy_thread_data, openacc_destroy_thread_data); /* Require all the OpenACC handlers if we have GOMP_OFFLOAD_CAP_OPENACC_200. */ if (optional_present != optional_total) { err = "plugin missing OpenACC handler function"; goto out; } optional_present = optional_total = 0; DLSYM_OPT (openacc.cuda.get_current_device, openacc_get_current_cuda_device); DLSYM_OPT (openacc.cuda.get_current_context, openacc_get_current_cuda_context); DLSYM_OPT (openacc.cuda.get_stream, openacc_get_cuda_stream); DLSYM_OPT (openacc.cuda.set_stream, openacc_set_cuda_stream); /* Make sure all the CUDA functions are there if any of them are. */ if (optional_present && optional_present != optional_total) { err = "plugin missing OpenACC CUDA handler function"; goto out; } } #undef DLSYM #undef DLSYM_OPT out: if (err != NULL) { gomp_error ("while loading %s: %s", plugin_name, err); if (last_missing) gomp_error ("missing function was %s", last_missing); if (plugin_handle) dlclose (plugin_handle); } return err == NULL; } /* This function initializes the runtime needed for offloading. It parses the list of offload targets and tries to load the plugins for these targets. On return, the variables NUM_DEVICES and NUM_DEVICES_OPENMP will be set, and the array DEVICES initialized, containing descriptors for corresponding devices, first the GOMP_OFFLOAD_CAP_OPENMP_400 ones, follows by the others. */ static void gomp_target_init (void) { const char *prefix ="libgomp-plugin-"; const char *suffix = SONAME_SUFFIX (1); const char *cur, *next; char *plugin_name; int i, new_num_devices; num_devices = 0; devices = NULL; cur = OFFLOAD_TARGETS; if (*cur) do { struct gomp_device_descr current_device; next = strchr (cur, ','); plugin_name = (char *) malloc (1 + (next ? next - cur : strlen (cur)) + strlen (prefix) + strlen (suffix)); if (!plugin_name) { num_devices = 0; break; } strcpy (plugin_name, prefix); strncat (plugin_name, cur, next ? next - cur : strlen (cur)); strcat (plugin_name, suffix); if (gomp_load_plugin_for_device (¤t_device, plugin_name)) { new_num_devices = current_device.get_num_devices_func (); if (new_num_devices >= 1) { /* Augment DEVICES and NUM_DEVICES. */ devices = realloc (devices, (num_devices + new_num_devices) * sizeof (struct gomp_device_descr)); if (!devices) { num_devices = 0; free (plugin_name); break; } current_device.name = current_device.get_name_func (); /* current_device.capabilities has already been set. */ current_device.type = current_device.get_type_func (); current_device.mem_map.root = NULL; current_device.is_initialized = false; current_device.openacc.data_environ = NULL; for (i = 0; i < new_num_devices; i++) { current_device.target_id = i; devices[num_devices] = current_device; gomp_mutex_init (&devices[num_devices].lock); num_devices++; } } } free (plugin_name); cur = next + 1; } while (next); /* In DEVICES, sort the GOMP_OFFLOAD_CAP_OPENMP_400 ones first, and set NUM_DEVICES_OPENMP. */ struct gomp_device_descr *devices_s = malloc (num_devices * sizeof (struct gomp_device_descr)); if (!devices_s) { num_devices = 0; free (devices); devices = NULL; } num_devices_openmp = 0; for (i = 0; i < num_devices; i++) if (devices[i].capabilities & GOMP_OFFLOAD_CAP_OPENMP_400) devices_s[num_devices_openmp++] = devices[i]; int num_devices_after_openmp = num_devices_openmp; for (i = 0; i < num_devices; i++) if (!(devices[i].capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)) devices_s[num_devices_after_openmp++] = devices[i]; free (devices); devices = devices_s; for (i = 0; i < num_devices; i++) { /* The 'devices' array can be moved (by the realloc call) until we have found all the plugins, so registering with the OpenACC runtime (which takes a copy of the pointer argument) must be delayed until now. */ if (devices[i].capabilities & GOMP_OFFLOAD_CAP_OPENACC_200) goacc_register (&devices[i]); } } #else /* PLUGIN_SUPPORT */ /* If dlfcn.h is unavailable we always fallback to host execution. GOMP_target* routines are just stubs for this case. */ static void gomp_target_init (void) { } #endif /* PLUGIN_SUPPORT */