fbdec79172
+2013-10-30 Balaji V. Iyer <balaji.v.iyer@intel.com> + + * configure: Reconfigured. + * configure.ac: Add AC_FUNC_ALLOCA. + * runtime/sysdep-unix.c: Added check for alloca.h before + including it. + * config/x86/cilk-abi-vla.c: Likewise. + * runtime/cilk_fiber-unix.cpp: Likewise. + From-SVN: r204232
442 lines
16 KiB
C
442 lines
16 KiB
C
/* cilk-abi-vla.cpp -*-C++-*-
|
|
*
|
|
*************************************************************************
|
|
*
|
|
* @copyright
|
|
* Copyright (C) 2013, Intel Corporation
|
|
* All rights reserved.
|
|
*
|
|
* @copyright
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
*
|
|
* * Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* * Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in
|
|
* the documentation and/or other materials provided with the
|
|
* distribution.
|
|
* * Neither the name of Intel Corporation nor the names of its
|
|
* contributors may be used to endorse or promote products derived
|
|
* from this software without specific prior written permission.
|
|
*
|
|
* @copyright
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
* HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
|
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
|
|
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
|
|
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
|
|
* WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
*
|
|
**************************************************************************/
|
|
|
|
/*
|
|
* Implementation of Variable Length Array (VLA) ABI.
|
|
*
|
|
* __cilkrts_stack_alloc() and __cilkrts_stack_free must be compiled
|
|
* such that ebp/rbp is used for the stack frames. This is done by having
|
|
* each of them use alloca, which forces the special frame types needed on
|
|
* each of the ABIs. Additionally, for some forms of stack frame, special
|
|
* care must be taken because the alloca space may not be at the bottom of the
|
|
* stack frame of the caller. For Intel64 windows, and for some options
|
|
* with other ABIs, a preallocated parameter block may exist on the stack
|
|
* at a lower address than the alloca. If this is the case, the parameter
|
|
* distance_from_sp_to_alloca_area will be non-zero, and will indicate how
|
|
* much pre-allocated parameter space resides in the caller's stack frame
|
|
* between the alloca area, and the bottom of the stack when the call to
|
|
* the cilkrts is made. As such, when non-zero it also includes any space
|
|
* used for passing the cilkrts_stack_alloc or cilkrts_stack_free parameters.
|
|
*/
|
|
|
|
#include <assert.h>
|
|
#include <stdlib.h>
|
|
#include <stdint.h>
|
|
|
|
// Getting a definition for alloca appears to be a pain in the butt. Here's
|
|
// a variant on what's recommended in the autoconf doc
|
|
#if defined _MSC_VER
|
|
# include <malloc.h>
|
|
# define alloca _alloca
|
|
#elif defined HAVE_ALLOCA_H
|
|
# include <alloca.h>
|
|
#elif defined __GNUC__
|
|
# define alloca __builtin_alloca
|
|
#elif defined _AIX
|
|
# define alloca __alloca
|
|
#else
|
|
# include <stddef.h>
|
|
# ifdef __cplusplus
|
|
extern "C"
|
|
# endif
|
|
void *alloca (size_t);
|
|
#endif
|
|
|
|
// INLINE is the storage-class / inline spelling put in front of the small
// file-local helper functions defined below.  Win32 builds spell it with
// __inline; every other build uses the standard inline keyword.
#if defined(_WIN32)
#   define INLINE static __inline
    // Don't whine about zero extending result of unary operation
#   pragma warning(disable:1025)
#else
#   define INLINE static inline
#endif
|
|
|
|
#include "internal/abi.h"
|
|
#include "cilk-abi-vla-internal.h"
|
|
|
|
#if defined(__x86_64) || defined(_M_X64)
|
|
// Intel64 variant: load the stack pointer register (%rsp) with val.
// The asm statement lists rsp as clobbered.
INLINE void setsp(void *val)
{
    __asm__(
        "movq %0, %%rsp"
        : /* no outputs */
        : "r"(val)
        : "rsp");
}
// Intel64 variant: return the value currently held in the stack pointer
// register (%rsp).
INLINE char* getsp(void)
{
    void *sp_now;

    __asm__(
        "movq %%rsp, %0"
        : "=r"(sp_now)
        : /* no inputs */
        : "rsp");
    return sp_now;
}
// Intel64 variant: return the value currently held in the frame pointer
// register (%rbp).
INLINE char* getbp(void)
{
    void *bp_now;

    __asm__(
        "movq %%rbp, %0"
        : "=r"(bp_now)
        : /* no inputs */
        : "rbp");
    return bp_now;
}
// Intel64 variant: copy cpy_bytes bytes of stack frame from src to dst and
// then load %rbp with new_ebp.
//
// Contract expected from the caller:
//   - dst is at a lower address than src, so a copy that walks upwards
//     through memory is safe even when the two regions overlap;
//   - cpy_bytes is a multiple of 8 (the copy moves 64 bit words).
INLINE void copy_frame_down_and_move_bp(
    char *dst,
    char *src,
    size_t cpy_bytes,
    char *new_ebp
)
{
    __asm__(
        "movq %0, %%rdi;"       // rdi <- destination
        "movq %1, %%rsi;"       // rsi <- source
        "movq %2, %%rcx;"       // rcx <- byte count ...
        "shrq $3, %%rcx;"       // ... turned into a count of 8 byte words
        "rep movsq;"            // copy rcx words (direction flag assumed clear)
        "movq %3, %%rbp"        // switch the frame pointer last
        : /* no outputs */
        : "rm"(dst), "rm"(src), "rm"(cpy_bytes), "rm"(new_ebp)
        : "rsi", "rdi", "rcx", "rbp", "memory");
}
// Intel64 variant: copy cpy_bytes bytes of stack frame from src to dst and
// then load %rbp with new_ebp.
//
// Contract expected from the caller:
//   - dst is at a higher address than src, so the copy has to walk downwards
//     through memory to be safe when the two regions overlap;
//   - cpy_bytes is a multiple of 8 (the copy moves 64 bit words).
INLINE void copy_frame_up_and_move_bp(
    char *dst,
    char *src,
    size_t cpy_bytes,
    char *new_ebp
)
{
    // Start at the last 64 bit word of each region; the copy below runs with
    // the direction flag set and therefore moves towards lower addresses.
    dst += cpy_bytes - 8;
    src += cpy_bytes - 8;
    __asm__(
        "movq %0, %%rdi;"           // rdi <- last word of destination
        "movq %1, %%rsi;"           // rsi <- last word of source
        "movq %2, %%rcx;"           // rcx <- byte count ...
        "shrq $3, %%rcx;"           // ... turned into a count of 8 byte words
        "std; rep movsq; cld;"      // copy downwards, then restore the flag
        // %rbp is a 64 bit register, so the move needs the q suffix.  The
        // previous "movl" spelling pairs a 32 bit suffix with a 64 bit
        // register, which the assembler rejects.
        "movq %3, %%rbp"
        : /* no outputs */
        : "rm"(dst), "rm"(src), "rm"(cpy_bytes), "rm"(new_ebp)
        : "rsi", "rdi", "rcx", "rbp", "memory");
}
#else
|
|
// IA-32 variant: load the stack pointer register (%esp) with val.
// The asm statement lists esp as clobbered.
INLINE void setsp(void *val)
{
    __asm__(
        "movl %0, %%esp"
        : /* no outputs */
        : "r"(val)
        : "esp");
}
// IA-32 variant: return the value currently held in the stack pointer
// register (%esp).
INLINE char* getsp(void)
{
    void *sp_now;

    __asm__(
        "movl %%esp, %0"
        : "=r"(sp_now)
        : /* no inputs */
        : "esp");
    return sp_now;
}
// IA-32 variant: return the value currently held in the frame pointer
// register (%ebp).
INLINE char* getbp(void)
{
    void *bp_now;

    __asm__(
        "movl %%ebp, %0"
        : "=r"(bp_now)
        : /* no inputs */
        : "ebp");
    return bp_now;
}
// IA-32 variant: copy cpy_bytes bytes of stack frame from src to dst and
// then load %ebp with new_ebp.
//
// Contract expected from the caller:
//   - dst is at a lower address than src, so a copy that walks upwards
//     through memory is safe even when the two regions overlap;
//   - cpy_bytes is a multiple of 4 (the copy moves 32 bit words).
INLINE void copy_frame_down_and_move_bp(
    char *dst,
    char *src,
    size_t cpy_bytes,
    char *new_ebp
)
{
    __asm__(
        "movl %0, %%edi;"       // edi <- destination
        "movl %1, %%esi;"       // esi <- source
        "movl %2, %%ecx;"       // ecx <- byte count ...
        "shrl $2, %%ecx;"       // ... turned into a count of 4 byte words
        "rep movsd;"            // copy ecx words (direction flag assumed clear)
        "movl %3, %%ebp"        // switch the frame pointer last
        : /* no outputs */
        : "rm"(dst), "rm"(src), "rm"(cpy_bytes), "rm"(new_ebp)
        : "esi", "edi", "ecx", "ebp", "memory");
}
// IA-32 variant: copy cpy_bytes bytes of stack frame from src to dst and
// then load %ebp with new_ebp.
//
// Contract expected from the caller:
//   - dst is at a higher address than src, so the copy has to walk downwards
//     through memory to be safe when the two regions overlap;
//   - cpy_bytes is a multiple of 4 (the copy moves 32 bit words).
INLINE void copy_frame_up_and_move_bp(
    char *dst,
    char *src,
    size_t cpy_bytes,
    char *new_ebp
)
{
    // Offset of the last 32 bit word inside each region; the copy starts
    // there and, with the direction flag set, moves towards lower addresses.
    const size_t last_word = cpy_bytes - 4;

    dst += last_word;
    src += last_word;
    __asm__(
        "movl %0, %%edi;"           // edi <- last word of destination
        "movl %1, %%esi;"           // esi <- last word of source
        "movl %2, %%ecx;"           // ecx <- byte count ...
        "shrl $2, %%ecx;"           // ... turned into a count of 4 byte words
        "std; rep movsd; cld;"      // copy downwards, then restore the flag
        "movl %3, %%ebp"            // switch the frame pointer last
        : /* no outputs */
        : "rm"(dst), "rm"(src), "rm"(cpy_bytes), "rm"(new_ebp)
        : "esi", "edi", "ecx", "ebp", "memory");
}
#endif
|
|
|
|
|
|
// 32 bit markers written immediately below a tagged VLA pointer by
// __cilkrts_stack_alloc() and read back by __cilkrts_stack_free() to tell
// whether the block came from the heap or from the stack.
#define c_cilk_ptr_from_heap    0xC2F2F00D
#define c_cilk_ptr_from_stack   0xC3F30D0F
|
|
CILK_ABI(__cilkrts_void_ptr)
|
|
__cilkrts_stack_alloc(
|
|
__cilkrts_stack_frame *sf,
|
|
size_t size,
|
|
size_t distance_from_sp_to_alloca_area,
|
|
uint32_t align, // align is always >= minimum stack alignment and
|
|
// >= ptr_size as well, and must be a power of 2.
|
|
uint32_t needs_tag // non-zero if the pointer being returned needs to
|
|
// be tagged
|
|
)
|
|
{
|
|
#ifdef __INTEL_COMPILER
|
|
// full_size will be a multiple of align, and contains
|
|
// enough extra space to allocate a marker.
|
|
size_t full_size = (size + align - 1) & ~(align - 1);
|
|
|
|
if (needs_tag) {
|
|
full_size += align;
|
|
}
|
|
|
|
char *t;
|
|
if (sf->worker != 0 &&
|
|
((sf->flags & CILK_FRAME_UNSYNCHED) != 0)) {
|
|
t = vla_internal_heap_alloc(sf, full_size, align);
|
|
if (needs_tag) {
|
|
t += align;
|
|
((uint32_t*)t)[-1] = c_cilk_ptr_from_heap;
|
|
}
|
|
return (void *)t;
|
|
}
|
|
|
|
// stack is still synced, allocate full_size from esp,
|
|
// and record in 32 bits immediately below the space
|
|
// allocated that this was space that this was
|
|
// allocated in the stack.
|
|
char *old_ebp = getbp();
|
|
char *old_esp = getsp();
|
|
|
|
// make top_ptr point to base of first parameter.
|
|
char *top_ptr = ((char *)(_AddressOfReturnAddress()) +
|
|
sizeof(char *));
|
|
size_t param_size = 0;
|
|
|
|
#if defined(__x86_64)
|
|
// For Intel64 linux & MACH ABI, all the parameters were passed in
|
|
// register, so top of the stack frame above the return address
|
|
// is just the size of the return address plus
|
|
// distance_from_sp_to_alloca_area on the chance that the alloca
|
|
// area isn't at the very bottom of the calling functions stack.
|
|
#elif defined(__MACH__)
|
|
// For ia32 MACH, parameter size is always a mutliple of 16
|
|
// bytes to keep the stack 16 byte aligned. So we need to round
|
|
// number of parameters up to multiple of 4.
|
|
param_size = 8 * sizeof(char *);
|
|
#else
|
|
// For both windows Intel64 ABI, and the IA32 windows and
|
|
// linux ABIs, space is reserved on the stack for all these
|
|
// parameters. param_size is 5 * size of a stack slot.
|
|
param_size = 5 * sizeof(char *);
|
|
#endif
|
|
|
|
// now make top_ptr point above the params, or if
|
|
// distance_from_sp_to_alloca_area is not zero, make
|
|
// it point above that area. When non-zero,
|
|
// distance_from_sp_to_alloca area is expected to contain
|
|
// the parameter space, so we only add one or the other,
|
|
// not both.
|
|
top_ptr += (distance_from_sp_to_alloca_area != 0) ?
|
|
distance_from_sp_to_alloca_area : param_size;
|
|
|
|
// t needs to end up at current value of top_ptr less full_size and less
|
|
// distance_from_sp_to_alloca_area and
|
|
// then rounded down to the alignment needed. Then we have to bump
|
|
// esp down by current frame_size, so that when all is done with respect
|
|
// to executing the return sequence, the final value of esp will be the
|
|
// same value as t.
|
|
t = (top_ptr - full_size) - distance_from_sp_to_alloca_area;
|
|
intptr_t temp = (intptr_t)t;
|
|
temp &= ~((intptr_t)(align - 1));
|
|
t = (char *)temp;
|
|
|
|
// ok, the value of t is set where we need it. Now set esp
|
|
// to the value of t less the current frame size.
|
|
// So now when we do regular return esp should be left such
|
|
// that it has moved down by full_size.
|
|
size_t cur_fm_size = (top_ptr - old_esp);
|
|
char *new_esp = t - cur_fm_size;
|
|
char *new_ebp = old_ebp - (old_esp - new_esp);
|
|
|
|
// extend the stack down by at least the difference between where
|
|
// I want it to be and where it currently is. This should take care
|
|
// of touching any pages necessary.
|
|
char *foo = alloca(old_esp - new_esp);
|
|
setsp(foo < new_esp ? foo : new_esp);
|
|
|
|
// Now set esp exactly where I want it.
|
|
// setsp(new_esp);
|
|
|
|
copy_frame_down_and_move_bp(new_esp, old_esp, cur_fm_size, new_ebp);
|
|
|
|
if (needs_tag) {
|
|
t += align;
|
|
((uint32_t*)t)[-1] = c_cilk_ptr_from_stack;
|
|
}
|
|
|
|
return t;
|
|
#else // Not __INTEL_COMPILER
|
|
// Not supported unless we can figure out how to get the size of the frame
|
|
return NULL;
|
|
#endif
|
|
}
|
|
|
|
// This frees the space allocated for a variable length array.
|
|
CILK_ABI(void)
|
|
__cilkrts_stack_free(
|
|
__cilkrts_stack_frame *sf,
|
|
void *p,
|
|
size_t size,
|
|
size_t distance_from_sp_to_alloca_area,
|
|
uint32_t align, // same requirements as for align in allocation,
|
|
// and must match alignment that was passed when
|
|
// doing the allocation
|
|
uint32_t known_from_stack // non-zero if this is known to be allocated
|
|
// on the stack, and therefore has no tag
|
|
)
|
|
{
|
|
#ifdef __INTEL_COMPILER
|
|
uint32_t *t = (uint32_t*)p;
|
|
|
|
// full_size will be a multiple of align, and contains
|
|
// enough extra space to allocate a marker if one was needed.
|
|
size_t full_size = (size + align - 1) & ~(align - 1);
|
|
if (known_from_stack == 0) {
|
|
// if the compiler hasn't told the run-time that this is
|
|
// known to be on the stack, then this pointer must have been
|
|
// tagged such that the run-time can tell.
|
|
assert(t[-1] == c_cilk_ptr_from_stack ||
|
|
t[-1] == c_cilk_ptr_from_heap);
|
|
|
|
known_from_stack = t[-1] == c_cilk_ptr_from_stack;
|
|
full_size += align; // accounts for extra space for marker
|
|
t = (uint32_t *)(((char *)t) - align);
|
|
}
|
|
|
|
if (known_from_stack) {
|
|
// alloca useage forces an ebp/rbp based stack frame even though
|
|
// 0 and unused.
|
|
char *foo = alloca(0);
|
|
if (sf->worker == 0 || (sf->flags & CILK_FRAME_UNSYNCHED) == 0) {
|
|
// p was allocated from current stack frame and we
|
|
// are synced on current stack frame. Return the
|
|
// amount of the stack that needs to be freed.
|
|
char *old_ebp = getbp();
|
|
char *old_esp = getsp();
|
|
|
|
// make top_ptr point to base of first parameter.
|
|
char *top_ptr = ((char *)(_AddressOfReturnAddress()) +
|
|
sizeof(char *));
|
|
size_t param_size = 0;
|
|
|
|
#if defined(__x86_64)
|
|
// For Intel64 linux & MACH ABI, all the parameters were passed in
|
|
// register, so top of the stack frame above the return address
|
|
// is just the size of the return address plus
|
|
// distance_from_sp_to_alloca_area on the chance that the alloca
|
|
// area isn't at the very bottom of the calling functions stack.
|
|
#elif defined(__MACH__)
|
|
// For ia32 MACH, parameter size is always a mutliple of 16
|
|
// bytes to keep the stack 16 byte aligned. So we need to round
|
|
// number of parameters up to multiple of 4.
|
|
param_size = 8 * sizeof(char *);
|
|
#else
|
|
// For both windows Intel64 ABI, and the IA32 windows and
|
|
// linux ABIs, space is reserved on the stack for all these
|
|
// parameters. param_size is 5 * size of a stack slot.
|
|
param_size = 6 * sizeof(char *);
|
|
#endif
|
|
|
|
// now make top_ptr point above the params, or if
|
|
// distance_from_sp_to_alloca_area is not zero, make
|
|
// it point above that area. When non-zero,
|
|
// distance_from_sp_to_alloca area is expected to contain
|
|
// the parameter space, so we only add one or the other,
|
|
// not both.
|
|
top_ptr += (distance_from_sp_to_alloca_area != 0) ?
|
|
distance_from_sp_to_alloca_area : param_size;
|
|
|
|
size_t cur_fm_size = (top_ptr - old_esp);
|
|
char *new_esp = old_esp + full_size;
|
|
char *new_ebp = old_ebp + full_size;
|
|
|
|
copy_frame_up_and_move_bp(new_esp, old_esp, cur_fm_size, new_ebp);
|
|
setsp(new_esp);
|
|
}
|
|
else {
|
|
// p was allocated on stack frame, but that is
|
|
// no longer the current stack frame. Need to adjust the
|
|
// saved esp that is somewhere in the cilk runtime so that
|
|
// on sync, esp will be cut back correctly.
|
|
vla_free_from_original_stack(sf, full_size);
|
|
}
|
|
}
|
|
else {
|
|
vla_internal_heap_free(t, full_size);
|
|
}
|
|
#else // Not __INTEL_COMPILER
|
|
// Not supported unless we can figure out how to get the size of the frame
|
|
#endif
|
|
}
|