gcc/libcilkrts/runtime/frame_malloc.c
2013-10-29 11:37:47 -07:00

463 lines
15 KiB
C

/* frame_malloc.c -*-C-*-
*
*************************************************************************
*
* @copyright
* Copyright (C) 2009-2013, Intel Corporation
* All rights reserved.
*
* @copyright
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* * Neither the name of Intel Corporation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* @copyright
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
* WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
**************************************************************************/
#include "frame_malloc.h"
#include "bug.h"
#include "local_state.h"
#include "cilk_malloc.h"
#ifndef __VXWORKS__
#include <memory.h>
#endif
/* #define USE_MMAP 1 */
#if USE_MMAP
#define __USE_MISC 1
#include <sys/mman.h>
#include <errno.h>
#endif
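// When HEADER_FILL_CHAR is defined, push() overwrites the start of each frame
// with this fill byte as the frame goes onto a free list. It is a debugging
// aid: it is defined only for _DEBUG builds and must not be left enabled in a
// checked-in release build.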
#ifdef _DEBUG
#define HEADER_FILL_CHAR 0xbf
#endif
// HEADER_FILL_CHAR should not be defined when checked in, so put out a warning
// message if this is a release build
#if defined(NDEBUG) && defined (HEADER_FILL_CHAR)
#pragma message ("Warning: HEADER_FILL_CHAR defined for a release build")
#endif
/* Forward declaration; the definition is in the per-worker allocator
section further down in this file. */
static void allocate_batch(__cilkrts_worker *w, int bucket, size_t size);
/* Mapping from bucket index to frame size in bytes, plus the largest size
that is served from buckets at all. Two platform-specific variants are
provided; in both, bucket 0 is 64 bytes. */
#ifndef _WIN32
/* Non-Windows variant: the size is looked up in a table indexed by bucket.
NOTE(review): six sizes are listed; FRAME_MALLOC_NBUCKETS is defined
elsewhere and is presumably 6 -- confirm. */
const unsigned short __cilkrts_bucket_sizes[FRAME_MALLOC_NBUCKETS] =
{
64, 128, 256, 512, 1024, 2048
};
#define FRAME_MALLOC_BUCKET_TO_SIZE(bucket) __cilkrts_bucket_sizes[bucket]
/* threshold above which we use slow malloc */
#define FRAME_MALLOC_MAX_SIZE 2048
#else // _WIN32
/* Windows variant: the size is computed as 64 shifted left by the bucket
index. */
/* Note that this must match the implementation of framesz_to_bucket in
* asmilator/layout.ml! */
#define FRAME_MALLOC_BUCKET_TO_SIZE(bucket) ((size_t)(64 << (bucket)))
/* threshold above which we use slow malloc */
#define FRAME_MALLOC_MAX_SIZE \
FRAME_MALLOC_BUCKET_TO_SIZE(FRAME_MALLOC_NBUCKETS - 1)
#endif // _WIN32
/* utility procedures */
/**
 * Prepend node @p p to the singly linked free list whose head pointer is
 * stored at @p b.
 *
 * When HEADER_FILL_CHAR is defined, the first FRAME_MALLOC_BUCKET_TO_SIZE(0)
 * bytes of the node are overwritten with that fill byte before the node is
 * linked in.
 */
static void push(struct free_list **b, struct free_list *p)
{
#ifdef HEADER_FILL_CHAR
    memset(p, HEADER_FILL_CHAR, FRAME_MALLOC_BUCKET_TO_SIZE(0));
#endif

    /* The old head becomes the new node's successor ... */
    p->cdr = *b;
    /* ... and the new node becomes the head. */
    *b = p;
}
/**
 * Detach and return the first node of the free list whose head pointer is
 * stored at @p b.
 *
 * @return The former head node, or a null pointer if the list was empty
 *         (in which case the list is left untouched).
 */
static struct free_list *pop(struct free_list **b)
{
    struct free_list *head = *b;

    if (!head)
        return head;

    /* Advance the list past the node being handed out. */
    *b = head->cdr;
    return head;
}
/*************************************************************
global allocator:
*************************************************************/
/* Size in bytes of each chunk that extend_global_pool() requests for the
global pool. We request slightly less than 2^K (here 32 KiB) so that, after
malloc overhead and alignment, the chunk should end up filling its VM pages
almost completely. 128 is a guess of the total malloc overhead and cache
line alignment. */
#define FRAME_MALLOC_CHUNK (32 * 1024 - 128)
/** Node of the singly linked list that records every chunk obtained by
extend_global_pool(), so that __cilkrts_frame_malloc_global_cleanup() can
walk the list and release each chunk. */
struct pool_cons {
char *p; /**< Start of the chunk recorded by this node */
struct pool_cons *cdr; /**< Remainder of the list */
};
/**
 * Give the global pool a fresh chunk of FRAME_MALLOC_CHUNK bytes.
 *
 * The new chunk becomes [pool_begin, pool_end), allocated_from_os grows by
 * the chunk size, and a pool_cons node recording the chunk is pushed onto
 * g->frame_malloc.pool_list so the chunk can be released at cleanup.
 * Whatever was left of the previous chunk is simply abandoned; this
 * function does no accounting for it.
 *
 * Both allocations are checked before use. A failure is reported through
 * __cilkrts_bug() instead of dereferencing a null pointer.
 * NOTE(review): __cilkrts_bug() is assumed not to return -- confirm in bug.h.
 */
static void extend_global_pool(global_state_t *g)
{
    /* FIXME: memalign to a cache line? */
    struct pool_cons *c = (struct pool_cons *)__cilkrts_malloc(sizeof(*c));
    char *chunk = (char *)__cilkrts_malloc((size_t)FRAME_MALLOC_CHUNK);

    if (!c || !chunk) {
        __cilkrts_bug("\nError. The Cilk runtime could not allocate memory "
                      "for the frame pool.\n");
    }

    g->frame_malloc.pool_begin = chunk;
    g->frame_malloc.pool_end = chunk + FRAME_MALLOC_CHUNK;
    g->frame_malloc.allocated_from_os += FRAME_MALLOC_CHUNK;

    /* Remember the chunk so that cleanup can free it. */
    c->p = chunk;
    c->cdr = g->frame_malloc.pool_list;
    g->frame_malloc.pool_list = c;
}
/**
 * Take one block for @p bucket from the global allocator.
 *
 * The request is already canonicalized at this point: the caller passes a
 * bucket index, and the block size is derived from it. The block comes
 * from the bucket's global free list if that is non-empty; otherwise it is
 * carved off the front of the current pool chunk, extending the pool first
 * when the chunk has fewer than `size` bytes left. The abandoned tail is
 * added to the `wasted` counter.
 *
 * NOTE(review): no locking is done here; the visible caller
 * (allocate_batch) holds g->frame_malloc.lock around the call.
 *
 * @param g      Global runtime state.
 * @param bucket Bucket index, 0 <= bucket < FRAME_MALLOC_NBUCKETS.
 * @return Pointer to a block of FRAME_MALLOC_BUCKET_TO_SIZE(bucket) bytes.
 */
static struct free_list *global_alloc(global_state_t *g, int bucket)
{
    struct free_list *mem;
    size_t size;

    /* Check both bounds before the bucket is used as an array index. */
    CILK_ASSERT(bucket >= 0 && bucket < FRAME_MALLOC_NBUCKETS);
    size = FRAME_MALLOC_BUCKET_TO_SIZE(bucket);
    g->frame_malloc.allocated_from_global_pool += size;

    if (!(mem = pop(&g->frame_malloc.global_free_list[bucket]))) {
        CILK_ASSERT(g->frame_malloc.pool_begin <= g->frame_malloc.pool_end);

        /* Compare the bytes remaining against the request rather than
           computing pool_begin + size, which could point past the end of
           the chunk (or be arithmetic on the initial null pool). */
        if ((size_t)(g->frame_malloc.pool_end - g->frame_malloc.pool_begin)
            < size) {
            /* We waste the fragment of pool. */
            g->frame_malloc.wasted +=
                g->frame_malloc.pool_end - g->frame_malloc.pool_begin;
            extend_global_pool(g);
        }
        mem = (struct free_list *)g->frame_malloc.pool_begin;
        g->frame_malloc.pool_begin += size;
    }

    return mem;
}
/**
 * Return one block of @p bucket's size to the global allocator.
 *
 * The block is pushed onto the bucket's global free list and the
 * allocated_from_global_pool counter is reduced by the bucket size.
 *
 * NOTE(review): no locking is done here; the visible callers hold
 * g->frame_malloc.lock around the call.
 *
 * @param g      Global runtime state.
 * @param mem    Block being returned; it is reused as a free-list node.
 * @param bucket Bucket index, 0 <= bucket < FRAME_MALLOC_NBUCKETS.
 */
static void global_free(global_state_t *g, void *mem, int bucket)
{
    size_t size;

    /* Check both bounds before the bucket is used as an array index. */
    CILK_ASSERT(bucket >= 0 && bucket < FRAME_MALLOC_NBUCKETS);
    size = FRAME_MALLOC_BUCKET_TO_SIZE(bucket);
    g->frame_malloc.allocated_from_global_pool -= size;

    push(&g->frame_malloc.global_free_list[bucket], mem);
}
/**
 * Put the global frame allocator in @p g into its initial, empty state.
 *
 * Initializes the allocator lock, enables leak checking, clears the pool
 * and every per-bucket global free list, zeroes the statistics counters,
 * and sets the batch size (8000) and the potential limit (four batches).
 */
void __cilkrts_frame_malloc_global_init(global_state_t *g)
{
    int bucket;

    __cilkrts_mutex_init(&g->frame_malloc.lock);
    g->frame_malloc.check_for_leaks = 1;

    /* No chunk has been obtained yet. */
    g->frame_malloc.pool_list = 0;
    g->frame_malloc.pool_begin = 0;
    g->frame_malloc.pool_end = 0;

    /* Tuning parameters; the limit is derived from the batch size. */
    g->frame_malloc.batch_size = 8000;
    g->frame_malloc.potential_limit = 4 * g->frame_malloc.batch_size;

    /* Accounting counters start from zero. */
    g->frame_malloc.allocated_from_os = 0;
    g->frame_malloc.allocated_from_global_pool = 0;
    g->frame_malloc.wasted = 0;

    /* Every bucket starts with an empty global free list. */
    bucket = 0;
    while (bucket < FRAME_MALLOC_NBUCKETS) {
        g->frame_malloc.global_free_list[bucket] = 0;
        ++bucket;
    }
}
/**
 * Total number of bytes currently sitting on the global free lists.
 *
 * Walks the global free list of every bucket and adds the bucket's block
 * size once per node found.
 */
static size_t count_memory_in_global_list(global_state_t *g)
{
    size_t total_bytes = 0;
    int bucket;

    for (bucket = 0; bucket < FRAME_MALLOC_NBUCKETS; ++bucket) {
        const struct free_list *node;

        for (node = g->frame_malloc.global_free_list[bucket];
             node;
             node = node->cdr) {
            total_bytes += FRAME_MALLOC_BUCKET_TO_SIZE(bucket);
        }
    }

    return total_bytes;
}
/**
 * Tear down the global frame allocator in @p g.
 *
 * In order: optionally sanity-check the global free lists, release every
 * chunk recorded on pool_list (and the list nodes themselves), destroy the
 * allocator lock, and finally -- if leak checking is enabled -- report a
 * leak when allocated_from_global_pool has not returned to zero.
 */
void __cilkrts_frame_malloc_global_cleanup(global_state_t *g)
{
    struct pool_cons *chunk;

    if (g->frame_malloc.check_for_leaks) {
        size_t bytes_on_free_lists = count_memory_in_global_list(g);

        // TBD: This is a weak check. Short of memory corruption there
        // should be no way for the free lists to hold more memory than
        // was allocated from the OS. Ideally we would count the memory
        // on the global free lists and verify that all of it is there,
        // but the runtime itself is believed to use some memory that is
        // not tracked.
        if (bytes_on_free_lists > g->frame_malloc.allocated_from_os) {
            __cilkrts_bug("\nError. The Cilk runtime data structures may have been corrupted.\n");
        }
    }

    // Unlink and release the chunks one at a time.
    for (chunk = g->frame_malloc.pool_list;
         chunk;
         chunk = g->frame_malloc.pool_list) {
        g->frame_malloc.pool_list = chunk->cdr;
        __cilkrts_free(chunk->p);
        __cilkrts_free(chunk);
    }

    __cilkrts_mutex_destroy(0, &g->frame_malloc.lock);

    // Everything that was moved from the global pool into workers must
    // have been handed back to the global pool by now.
    if (g->frame_malloc.check_for_leaks) {
        if (g->frame_malloc.allocated_from_global_pool != 0) {
            __cilkrts_bug("\n"
                          "---------------------------" "\n"
                          " MEMORY LEAK DETECTED!!! " "\n"
                          "---------------------------" "\n"
                          "\n"
                          );
        }
    }
}
/*************************************************************
per-worker allocator
*************************************************************/
/* Allocate a batch of frames of size SIZE from the global pool and
store them in the worker's free list for BUCKET.

The whole operation runs with g->frame_malloc.lock held. In the default
(non-USE_MMAP) build, blocks are taken from global_alloc() one at a time
until at least batch_size bytes have been moved; at least one block is
always moved because the loop is a do/while. The worker's
bucket_potential is increased by SIZE for every block added. */
static void allocate_batch(__cilkrts_worker *w, int bucket, size_t size)
{
global_state_t *g = w->g;
__cilkrts_mutex_lock(w, &g->frame_malloc.lock); {
#if USE_MMAP
/* Disabled variant (USE_MMAP is commented out at the top of the file):
maps 12288 bytes, makes the first and the last 4096 bytes inaccessible,
and pushes a single frame that ends exactly at offset 8192, i.e. right
before the inaccessible tail.
NOTE(review): presumably guard pages to catch overruns -- confirm.
NOTE(review): assert() is used here but no <assert.h> include is visible
in this file. */
char *p = mmap(0, 12288, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
if (p == MAP_FAILED)
__cilkrts_bug("mmap failed %d", errno);
assert(size < 4096);
assert(p != MAP_FAILED);
mprotect(p, 4096, PROT_NONE);
mprotect(p + 8192, 4096, PROT_NONE);
w->l->bucket_potential[bucket] += size;
push(&w->l->free_list[bucket], (struct free_list *)(p + 8192 - size));
#else
/* Running total of bytes moved to the worker in this batch. */
size_t bytes_allocated = 0;
do {
w->l->bucket_potential[bucket] += size;
bytes_allocated += size;
push(&w->l->free_list[bucket], global_alloc(g, bucket));
} while (bytes_allocated < g->frame_malloc.batch_size);
#endif
} __cilkrts_mutex_unlock(w, &g->frame_malloc.lock);
}
/**
 * Trim the worker's free list for @p bucket and hand the excess back to
 * the global allocator.
 *
 * The list is walked from the head while nodes remain and twice the bytes
 * walked so far is still below the bucket's current potential. The
 * potential is then reset to the bytes walked. If the walk stopped on a
 * node, every node after it is released under the global allocator lock.
 *
 * Amortization: the walk costs at most POT/2 steps and is charged against
 * POT; taking the lock is charged against POT/2; traversing the released
 * tail is charged against the frees that put those nodes on the list.
 *
 * @param w      Worker whose list is trimmed.
 * @param bucket Bucket index of the list.
 * @param size   Block size of that bucket in bytes.
 */
static void gc_bucket(__cilkrts_worker *w, int bucket, size_t size)
{
    global_state_t *g = w->g;
    const size_t old_potential = w->l->bucket_potential[bucket];
    struct free_list *last_kept = w->l->free_list[bucket];
    struct free_list *victim;
    size_t kept_bytes = 0;

    /* Keep up to POT/2 worth of elements at the front of the list. */
    while (last_kept && 2 * kept_bytes < old_potential) {
        last_kept = last_kept->cdr;
        kept_bytes += size;
    }
    w->l->bucket_potential[bucket] = kept_bytes;

    /* The list ended before the limit was reached: nothing to release. */
    if (!last_kept)
        return;

    /* Release everything that follows the node the walk stopped on. */
    __cilkrts_mutex_lock(w, &g->frame_malloc.lock); {
        while ((victim = pop(&last_kept->cdr))) {
#if USE_MMAP
            munmap((char *)victim + size - 8192, 12288);
#else
            global_free(g, victim, bucket);
#endif
        }
    } __cilkrts_mutex_unlock(w, &g->frame_malloc.lock);
}
// Free all the memory in this bucket for the specified worker,
// returning it to the global pool's free list.
//
// Nodes are popped directly through the worker's own list head, so the
// worker's free list for this bucket is left empty afterwards instead of
// still pointing at nodes that now belong to the global allocator. That
// makes a repeated call a harmless no-op rather than a double release.
// The bucket's potential is reset to zero in every case.
static void move_bucket_to_global_free_list(__cilkrts_worker *w,
                                            int bucket)
{
    struct free_list *q;
    global_state_t *g = w->g;

    // Only take the global lock when there is something to move.
    if (w->l->free_list[bucket]) {
        __cilkrts_mutex_lock(w, &g->frame_malloc.lock); {
            while ((q = pop(&w->l->free_list[bucket]))) {
#if USE_MMAP
                size_t size = FRAME_MALLOC_BUCKET_TO_SIZE(bucket);
                munmap((char *)q + size - 8192, 12288);
#else
                global_free(g, q, bucket);
#endif
            }
        } __cilkrts_mutex_unlock(w, &g->frame_malloc.lock);
    }

    // I'm not sure this does anything useful now, since
    // the worker is about to be destroyed. But why not?
    w->l->bucket_potential[bucket] = 0;
}
/**
 * Find the smallest bucket whose block size is at least @p size.
 *
 * @return The bucket index. If no bucket is large enough, the assertion
 *         fires and -1 is returned; callers are expected to have
 *         filtered out sizes above FRAME_MALLOC_MAX_SIZE beforehand.
 */
static int bucket_of_size(size_t size)
{
    int bucket = 0;

    while (bucket < FRAME_MALLOC_NBUCKETS) {
        if (size <= FRAME_MALLOC_BUCKET_TO_SIZE(bucket))
            return bucket;
        ++bucket;
    }

    CILK_ASSERT(0 /* can't happen */);
    return -1;
}
/**
 * Round @p size up to the block size of the bucket that would serve it.
 *
 * Sizes above FRAME_MALLOC_MAX_SIZE are not served from buckets and are
 * returned unchanged.
 */
size_t __cilkrts_frame_malloc_roundup(size_t size)
{
    if (size <= FRAME_MALLOC_MAX_SIZE)
        return FRAME_MALLOC_BUCKET_TO_SIZE(bucket_of_size(size));

    /* Too large for any bucket: leave it alone. */
    return size;
}
/**
 * Block size in bytes of the given bucket.
 *
 * @param bucket Bucket index; asserted to lie in
 *               [0, FRAME_MALLOC_NBUCKETS).
 */
size_t __cilkrts_size_of_bucket(int bucket)
{
    size_t bytes;

    CILK_ASSERT(bucket >= 0 && bucket < FRAME_MALLOC_NBUCKETS);

    bytes = FRAME_MALLOC_BUCKET_TO_SIZE(bucket);
    return bytes;
}
/* Allocate a frame of at least SIZE bytes on behalf of worker W.

Requests larger than FRAME_MALLOC_MAX_SIZE, and requests made without a
worker (W is null), are forwarded to __cilkrts_malloc(). Every other
request is rounded up to its bucket's block size and served from the
worker's free list for that bucket, which is refilled from the global pool
with allocate_batch() whenever it is empty.

The START_INTERVAL / STOP_INTERVAL / NOTE_INTERVAL macros are defined
elsewhere. NOTE(review): presumably timing/statistics hooks -- confirm. */
void *__cilkrts_frame_malloc(__cilkrts_worker *w, size_t size)
{
int bucket;
void *mem;
/* if too large, or if no worker, fall back to __cilkrts_malloc() */
if (!w || size > FRAME_MALLOC_MAX_SIZE) {
NOTE_INTERVAL(w, INTERVAL_FRAME_ALLOC_LARGE);
return __cilkrts_malloc(size);
}
START_INTERVAL(w, INTERVAL_FRAME_ALLOC); {
/* Canonicalize the request: from here on SIZE is the bucket's block size. */
bucket = bucket_of_size(size);
size = FRAME_MALLOC_BUCKET_TO_SIZE(bucket);
/* Retry the pop after each refill until a block is obtained. */
while (!(mem = pop(&w->l->free_list[bucket]))) {
/* get a batch of frames from the global pool */
START_INTERVAL(w, INTERVAL_FRAME_ALLOC_GLOBAL) {
allocate_batch(w, bucket, size);
} STOP_INTERVAL(w, INTERVAL_FRAME_ALLOC_GLOBAL);
}
} STOP_INTERVAL(w, INTERVAL_FRAME_ALLOC);
return mem;
}
/* Release the frame P0 of SIZE bytes on behalf of worker W.

Frames larger than FRAME_MALLOC_MAX_SIZE, and frames released without a
worker (W is null), are forwarded to __cilkrts_free(). Every other frame
is pushed onto the worker's free list for the bucket that SIZE maps to,
and that bucket's potential is increased by the bucket's block size. When
the potential exceeds the global potential_limit, gc_bucket() trims the
list and returns the excess to the global pool.

NOTE(review): SIZE presumably has to map to the same bucket as the size
given when the frame was allocated -- confirm against callers. */
void __cilkrts_frame_free(__cilkrts_worker *w, void *p0, size_t size)
{
int bucket;
struct free_list *p = (struct free_list *)p0;
/* if too large, or if no worker, fall back to __cilkrts_free() */
if (!w || size > FRAME_MALLOC_MAX_SIZE) {
NOTE_INTERVAL(w, INTERVAL_FRAME_FREE_LARGE);
__cilkrts_free(p);
return;
}
#if CILK_LIB_DEBUG
/* Volatile read through W whose result is discarded.
NOTE(review): presumably forces an early fault on a bad worker pointer
-- confirm. */
*(volatile long *)w;
#endif
START_INTERVAL(w, INTERVAL_FRAME_FREE); {
/* Canonicalize: from here on SIZE is the bucket's block size. */
bucket = bucket_of_size(size);
size = FRAME_MALLOC_BUCKET_TO_SIZE(bucket);
w->l->bucket_potential[bucket] += size;
push(&w->l->free_list[bucket], p);
/* Trim the local list once its potential passes the global limit. */
if (w->l->bucket_potential[bucket] >
w->g->frame_malloc.potential_limit) {
START_INTERVAL(w, INTERVAL_FRAME_FREE_GLOBAL) {
gc_bucket(w, bucket, size);
} STOP_INTERVAL(w, INTERVAL_FRAME_FREE_GLOBAL);
}
} STOP_INTERVAL(w, INTERVAL_FRAME_FREE);
}
/**
 * Reset the per-worker part of the frame allocator for @p w: every bucket
 * gets an empty free list and a potential of zero.
 */
void __cilkrts_frame_malloc_per_worker_init(__cilkrts_worker *w)
{
    local_state *state = w->l;
    int bucket = 0;

    while (bucket < FRAME_MALLOC_NBUCKETS) {
        state->free_list[bucket] = 0;
        state->bucket_potential[bucket] = 0;
        ++bucket;
    }
}
/**
 * Hand all memory cached by worker @p w back to the global pool.
 *
 * Each bucket's local free list is moved to the global free list so the
 * memory does not become unreachable (leak) when the worker is destroyed.
 */
void __cilkrts_frame_malloc_per_worker_cleanup(__cilkrts_worker *w)
{
    int bucket = 0;

    while (bucket < FRAME_MALLOC_NBUCKETS) {
        move_bucket_to_global_free_list(w, bucket);
        ++bucket;
    }
}
/*
Local Variables: **
c-file-style:"bsd" **
c-basic-offset:4 **
indent-tabs-mode:nil **
End: **
*/