0cfe1b4698
2014-02-18  Balaji V. Iyer  <balaji.v.iyer@intel.com>

	* include/cilk/common.h (__CILKRTS_ABI_VERSION): Set the ABI version
	to 1 instead of 0.
	* runtime/cilk-abi-cilk-for.cpp (cilk_for_recursive): Added a fix to
	initialize stack frame correctly.

From-SVN: r207870
/* cilk-abi-cilk-for.cpp                  -*-C++-*-
 *
 *************************************************************************
 *
 *  @copyright
 *  Copyright (C) 2011, 2013, Intel Corporation
 *  All rights reserved.
 *
 *  @copyright
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions
 *  are met:
 *
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in
 *      the documentation and/or other materials provided with the
 *      distribution.
 *    * Neither the name of Intel Corporation nor the names of its
 *      contributors may be used to endorse or promote products derived
 *      from this software without specific prior written permission.
 *
 *  @copyright
 *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 *  OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 *  AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 *  WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *  POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************/

/* Implementation of cilk_for ABI.
 *
 * This file must be C++, not C, in order to handle C++ exceptions correctly
 * from within the body of the cilk_for loop
 */

#include "internal/abi.h"
#include "metacall_impl.h"
#include "global_state.h"

// Icky macros to determine if we're compiled with optimization.  Based on
// the declaration of __CILKRTS_ASSERT in common.h
#if defined(_WIN32)
# if defined(_DEBUG)
#   define CILKRTS_OPTIMIZED 0    // Assumes /MDd is always used with /Od
# else
#   define CILKRTS_OPTIMIZED 1
# endif // defined(_DEBUG)
#else
# if defined(__OPTIMIZE__)
#   define CILKRTS_OPTIMIZED 1
# else
#   define CILKRTS_OPTIMIZED 0
# endif
#endif

template <typename count_t>
static inline int grainsize(int req, count_t count)
{
    // A positive requested grain size comes from the user.  A very high grain
    // size risks losing parallelism, but the user told us what they want for
    // grainsize.  Who are we to argue?
    if (req > 0)
        return req;

    // At present, a negative requested grain size is treated the same way as
    // a zero grain size, i.e., the runtime computes the actual grainsize
    // using a heuristic.  In the future, the compiler may give us additional
    // information about the size of the cilk_for body by passing a negative
    // grain size.

    // Avoid generating a zero grainsize, even for empty loops.
    if (count < 1)
        return 1;

    global_state_t* g = cilkg_get_global_state();
    if (g->under_ptool)
    {
        // Grainsize = 1, when running under PIN, and when the grainsize has
        // not explicitly been set by the user.
        return 1;
    }
    else
    {
        // Divide loop count by 8 times the worker count and round up.
        const int Px8 = g->P * 8;
        count_t n = (count + Px8 - 1) / Px8;
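
        // For example, with count == 1,000,000 and P == 8 workers
        // (illustrative values only), Px8 == 64 and
        // n == ceil(1000000 / 64) == 15625, which the test below clamps
        // to 2048.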

        // 2K should be enough to amortize the cost of the cilk_for.  Any
        // larger grainsize risks losing parallelism.
        if (n > 2048)
            return 2048;
        return (int) n;     // n <= 2048, so no loss of precision on cast to int
    }
}

/*
 * call_cilk_for_loop_body
 *
 * Centralizes the code to call the loop body.  The compiler should be
 * inlining this code
 *
 * low   - Low loop index we're considering in this portion of the algorithm
 * high  - High loop index we're considering in this portion of the algorithm
 * body  - lambda function for the cilk_for loop body
 * data  - data used by the lambda function
 * w     - __cilkrts_worker we're currently executing on
 * loop_root_pedigree - __cilkrts_pedigree node we generated for the root of
 *                      the cilk_for loop to flatten out the internal nodes
 */
template <typename count_t, typename F>
inline static
void call_cilk_for_loop_body(count_t low, count_t high,
                             F body, void *data,
                             __cilkrts_worker *w,
                             __cilkrts_pedigree *loop_root_pedigree)
{
    // Cilkscreen should not report this call in a stack trace
    NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0);

    // The worker is only valid until the first spawn.  Fetch the
    // __cilkrts_stack_frame out of the worker, since it will be stable across
    // steals.  The sf pointer actually points to the *parent's*
    // __cilkrts_stack_frame, since this function is a non-spawning function
    // and therefore has no cilk stack frame of its own.
    __cilkrts_stack_frame *sf = w->current_stack_frame;

    // Save the pedigree node pointed to by the worker.  We'll need to restore
    // that when we exit since the spawn helpers in the cilk_for call tree
    // will assume that it's valid
    const __cilkrts_pedigree *saved_next_pedigree_node = w->pedigree.parent;

    // Add the leaf pedigree node to the chain.  The parent is the root node
    // to flatten the tree regardless of the DAG branches in the cilk_for
    // divide-and-conquer recursion.
    //
    // The rank is initialized to the low index.  The user is expected to call
    // __cilkrts_bump_loop_rank at the end of the cilk_for loop body.
    __cilkrts_pedigree loop_leaf_pedigree;

    loop_leaf_pedigree.rank   = (uint64_t)low;
    loop_leaf_pedigree.parent = loop_root_pedigree;

    // The worker's pedigree always starts with a rank of 0
    w->pedigree.rank   = 0;
    w->pedigree.parent = &loop_leaf_pedigree;
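
    // At this point the chain reads: worker pedigree (rank 0) ->
    // loop_leaf_pedigree (rank == low) -> loop_root_pedigree -> whatever
    // chain was current when the cilk_for started, so every iteration hangs
    // off the same root node.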

    // Call the compiler generated cilk_for loop body lambda function
    body(data, low, high);

    // The loop body may have included spawns, so we must refetch the worker
    // from the __cilkrts_stack_frame, which is stable regardless of which
    // worker we're executing on.
    w = sf->worker;

    // Restore the pedigree chain.  It must be valid because the spawn helpers
    // generated by the cilk_for implementation will access it.
    w->pedigree.parent = saved_next_pedigree_node;
}

/* capture_spawn_arg_stack_frame
 *
 * Efficiently get the address of the caller's __cilkrts_stack_frame.  The
 * preconditions are that 'w' is the worker at the time of the call and
 * 'w->current_stack_frame' points to the __cilkrts_stack_frame within the
 * spawn helper.  This function should be called only within the argument list
 * of a function that is being spawned because that is the only situation in
 * which these preconditions hold.  This function returns the worker
 * (unchanged) after the captured stack frame pointer has been stored in the
 * sf argument.
 *
 * The purpose of this function is to get the caller's stack frame in a
 * context where the caller's worker is known but its stack frame is not
 * necessarily initialized.  The "shrink wrap" optimization delays
 * initializing the contents of a spawning function's '__cilkrts_stack_frame'
 * as well as the 'current_stack_frame' pointer within the worker.  By calling
 * this function within a spawning function's argument list, we can ensure
 * that these initializations have occurred but that a detach (which would
 * invalidate the worker pointer in the caller) has not yet occurred.  Once the
 * '__cilkrts_stack_frame' has been retrieved in this way, it is stable for the
 * remainder of the caller's execution, and becomes an efficient way to get
 * the worker (much more efficient than calling '__cilkrts_get_tls_worker()'),
 * even after a spawn or sync.
 */
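// For an example of this usage, see the non-GCC branch of the _Cilk_spawn in
// cilk_for_recursive() below, which calls this function within the spawned
// function's argument list.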
inline __cilkrts_worker*
capture_spawn_arg_stack_frame(__cilkrts_stack_frame* &sf, __cilkrts_worker* w)
{
    // Get current stack frame
    sf = w->current_stack_frame;
#ifdef __INTEL_COMPILER
# if __INTEL_COMPILER <= 1300 && __INTEL_COMPILER_BUILD_DATE < 20130101
    // In older compilers 'w->current_stack_frame' points to the
    // spawn-helper's stack frame.  In newer compilers, however, it points
    // directly to the spawning function's stack frame.  (This change was made
    // to avoid having the spawn helper in the frame list when evaluating
    // function arguments, thus avoiding corruption when those arguments
    // themselves contain cilk_spawns.)

    // w->current_stack_frame is the spawn helper's stack frame.
    // w->current_stack_frame->call_parent is the caller's stack frame.
    sf = sf->call_parent;
# endif
#endif
    return w;
}

/*
 * cilk_for_recursive
 *
 * Templatized function to implement the recursive divide-and-conquer
 * algorithm used to implement a cilk_for.
 *
 * low   - Low loop index we're considering in this portion of the algorithm
 * high  - High loop index we're considering in this portion of the algorithm
 * body  - lambda function for the cilk_for loop body
 * data  - data used by the lambda function
 * grain - grain size (0 if it should be computed)
 * w     - __cilkrts_worker we're currently executing on
 * loop_root_pedigree - __cilkrts_pedigree node we generated for the root of
 *                      the cilk_for loop to flatten out the internal nodes
 */
template <typename count_t, typename F>
static
void cilk_for_recursive(count_t low, count_t high,
                        F body, void *data, int grain,
                        __cilkrts_worker *w,
                        __cilkrts_pedigree *loop_root_pedigree)
{
tail_recurse:
    // Cilkscreen should not report this call in a stack trace
    // This needs to be done every time the worker resumes
    NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0);

    count_t count = high - low;
    // Invariant: count > 0, grain >= 1
    if (count > grain)
    {
        // Invariant: count >= 2
        count_t mid = low + count / 2;
        // The worker is valid only until the first spawn and is expensive to
        // retrieve (using '__cilkrts_get_tls_worker') after the spawn.  The
        // '__cilkrts_stack_frame' is more stable, but isn't initialized until
        // the first spawn.  Thus, we want to grab the address of the
        // '__cilkrts_stack_frame' after it is initialized but before the
        // spawn detaches.  The only place we can do that is within the
        // argument list of the spawned function, hence the call to
        // capture_spawn_arg_stack_frame().
        __cilkrts_stack_frame *sf;
#if defined(__GNUC__) && ! defined(__INTEL_COMPILER) && ! defined(__clang__)
        // The current version of gcc initializes the sf structure eagerly.
        // We can take advantage of this fact to avoid calling
        // `capture_spawn_arg_stack_frame` when compiling with gcc.
        // Remove this if the "shrink-wrap" optimization is implemented.
        sf = w->current_stack_frame;
        _Cilk_spawn cilk_for_recursive(low, mid, body, data, grain, w,
                                       loop_root_pedigree);
#else
        _Cilk_spawn cilk_for_recursive(low, mid, body, data, grain,
                                       capture_spawn_arg_stack_frame(sf, w),
                                       loop_root_pedigree);
#endif
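        // The spawn above may have been stolen; refetch the worker from the
        // captured __cilkrts_stack_frame, which is stable across steals.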
        w = sf->worker;
        low = mid;
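
        // Continue with the upper half [mid, high) via the tail_recurse label
        // instead of making a second recursive call: each level spawns its
        // lower half and then loops on its upper half.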
        goto tail_recurse;
    }

    // Call the cilk_for loop body lambda function passed in by the compiler to
    // execute one grain
    call_cilk_for_loop_body(low, high, body, data, w, loop_root_pedigree);
}

static void noop() { }

/*
 * cilk_for_root
 *
 * Templatized function to implement the top level of a cilk_for loop.
 *
 * body  - lambda function for the cilk_for loop body
 * data  - data used by the lambda function
 * count - trip count for loop
 * grain - grain size (0 if it should be computed)
 */
template <typename count_t, typename F>
static void cilk_for_root(F body, void *data, count_t count, int grain)
{
    // Cilkscreen should not report this call in a stack trace
    NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0);

    // Pedigree computation:
    //
    //    If the last pedigree node on entry to the _Cilk_for has value X,
    //    then at the start of each iteration of the loop body, the value of
    //    the last pedigree node should be 0, the value of the second-to-last
    //    node should equal the loop counter, and the value of the
    //    third-to-last node should be X.  On return from the _Cilk_for, the
    //    value of the last pedigree should be incremented to X+2.  The
    //    pedigree within the loop is thus flattened, such that the depth of
    //    recursion does not affect the results either inside or outside of
    //    the loop.  Note that the pedigree after the loop exits is the same
    //    as if a single spawn and sync were executed within this function.
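    //
    //    For example, entering with a pedigree ending in <..., X>, iteration
    //    i of the loop body begins with the pedigree <..., X, i, 0>, and
    //    after the loop the last pedigree node has been incremented to X+2.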

    // TBD: Since the shrink-wrap optimization was turned on in the compiler,
    // it is not possible to get the current stack frame without actually
    // forcing a call to bind-thread.  This spurious spawn is a temporary
    // stopgap until the correct intrinsics are added to give us total control
    // over frame initialization.
    _Cilk_spawn noop();

    // Fetch the current worker.  From that we can get the current stack frame
    // which will be constant even if we're stolen
    __cilkrts_worker *w = __cilkrts_get_tls_worker();
    __cilkrts_stack_frame *sf = w->current_stack_frame;

    // Decrement the rank by one to undo the pedigree change from the
    // _Cilk_spawn
    --w->pedigree.rank;

    // Save the current worker pedigree into loop_root_pedigree, which will be
    // the root node for our flattened pedigree.
    __cilkrts_pedigree loop_root_pedigree = w->pedigree;

    // Don't splice the loop_root node in yet.  It will be done when we
    // call the loop body lambda function
    //    w->pedigree.rank = 0;
    //    w->pedigree.next = &loop_root_pedigree;

    /* Spawn is necessary at top-level to force runtime to start up.
     * Runtime must be started in order to call the grainsize() function.
     */
    int gs = grainsize(grain, count);
    cilk_for_recursive((count_t) 0, count, body, data, gs, w,
                       &loop_root_pedigree);

    // Need to refetch the worker after calling a spawning function.
    w = sf->worker;

    // Restore the pedigree in the worker.
    w->pedigree = loop_root_pedigree;

    // Bump the worker pedigree.
    ++w->pedigree.rank;

    // Implicit sync will increment the pedigree leaf rank again, for a total
    // of two increments.  If the noop spawn above is removed, then we'll need
    // to re-enable the following code:
//     // If this is an optimized build, then the compiler will have optimized
//     // out the increment of the worker's pedigree in the implied sync.  We
//     // need to add one to make the pedigree_loop test work correctly.
// #if CILKRTS_OPTIMIZED
//     ++sf->worker->pedigree.rank;
// #endif
}

// Use extern "C" to suppress name mangling of __cilkrts_cilk_for_32 and
// __cilkrts_cilk_for_64.
extern "C" {

/*
 * __cilkrts_cilk_for_32
 *
 * Implementation of cilk_for for 32-bit trip counts (regardless of processor
 * word size).  Assumes that the range is 0 - count.
 *
 * body  - lambda function for the cilk_for loop body
 * data  - data used by the lambda function
 * count - trip count for loop
 * grain - grain size (0 if it should be computed)
 */
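// The body callback here has type __cilk_abi_f32_t (declared in
// internal/abi.h) and is ultimately invoked as body(data, low, high) for
// each grain; see call_cilk_for_loop_body() above.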

CILK_ABI_THROWS_VOID __cilkrts_cilk_for_32(__cilk_abi_f32_t body, void *data,
                                           cilk32_t count, int grain)
{
    // Cilkscreen should not report this call in a stack trace
    NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0);

    // Check for an empty range here as an optimization - don't need to do any
    // __cilkrts_stack_frame initialization
    if (count > 0)
        cilk_for_root(body, data, count, grain);
}

/*
 * __cilkrts_cilk_for_64
 *
 * Implementation of cilk_for for 64-bit trip counts (regardless of processor
 * word size).  Assumes that the range is 0 - count.
 *
 * body  - lambda function for the cilk_for loop body
 * data  - data used by the lambda function
 * count - trip count for loop
 * grain - grain size (0 if it should be computed)
 */
CILK_ABI_THROWS_VOID __cilkrts_cilk_for_64(__cilk_abi_f64_t body, void *data,
                                           cilk64_t count, int grain)
{
    // Check for an empty range here as an optimization - don't need to do any
    // __cilkrts_stack_frame initialization
    if (count > 0)
        cilk_for_root(body, data, count, grain);
}

} // end extern "C"

/* End cilk-abi-cilk-for.cpp */