libgo: reduce overhead for memory/block/mutex profiling

Revise the gccgo version of memory/block/mutex profiling to reduce
    runtime overhead. The main change is to collect raw stack traces
    while the profile is online, then post-process the stacks just
    before the final profile is consumed. Memory profiling (at a very
    low sampling rate) is enabled by default, and the overhead of the
    symbolization / DWARF reading done by backtrace_full was slowing
    things down relative to the main Go runtime.
    
    Reviewed-on: https://go-review.googlesource.com/c/gofrontend/+/171497

From-SVN: r271172
Ian Lance Taylor 2019-05-14 14:59:42 +00:00
parent ce9f305e44
commit 1ac09ef2c6
11 changed files with 370 additions and 103 deletions
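
The change amounts to splitting stack capture from symbolization. The same
split exists in Go's public API: runtime.Callers records bare PCs cheaply,
and runtime.CallersFrames defers the expensive name/file/line lookup until
the data is actually consumed. A minimal sketch of that pattern (an analogy
using the public API only, not the libgo-internal code path touched below):

    package main

    import (
        "fmt"
        "runtime"
    )

    // capture records bare PCs only; this is the cheap part that the
    // patch keeps on the allocation/contention hot path.
    func capture() []uintptr {
        pcs := make([]uintptr, 32)
        n := runtime.Callers(2, pcs) // skip Callers and capture itself
        return pcs[:n]
    }

    // report does the expensive symbolization, deferred until the
    // profile data is actually used.
    func report(pcs []uintptr) {
        frames := runtime.CallersFrames(pcs)
        for {
            f, more := frames.Next()
            fmt.Printf("%s\n\t%s:%d\n", f.Function, f.File, f.Line)
            if !more {
                break
            }
        }
    }

    func main() {
        report(capture())
    }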

View File

@ -1,4 +1,4 @@
3f015e128bf6d1d9279f3d43e26f60f0927019cb
6112f9b8fa9d57d2db8a709cc8b44a94d778d08a
The first line of this file holds the git revision number of the last
merge done from the gofrontend repository.

View File

@ -437,17 +437,15 @@ func dumpmemstats() {
dumpint(uint64(memstats.numgc))
}
func dumpmemprof_callback(b *bucket, nstk uintptr, pstk *location, size, allocs, frees uintptr) {
stk := (*[100000]location)(unsafe.Pointer(pstk))
func dumpmemprof_callback(b *bucket, nstk uintptr, pstk *uintptr, size, allocs, frees uintptr) {
stk := (*[100000]uintptr)(unsafe.Pointer(pstk))
dumpint(tagMemProf)
dumpint(uint64(uintptr(unsafe.Pointer(b))))
dumpint(uint64(size))
dumpint(uint64(nstk))
for i := uintptr(0); i < nstk; i++ {
pc := stk[i].pc
fn := stk[i].function
file := stk[i].filename
line := stk[i].lineno
pc := stk[i]
fn, file, line, _ := funcfileline(pc, -1)
if fn == "" {
var buf [64]byte
n := len(buf)

View File

@ -1085,7 +1085,7 @@ func scanstackblockwithmap(pc, b0, n0 uintptr, ptrmask *uint8, gcw *gcWork) {
span != nil && span.state != mSpanManual &&
(obj < span.base() || obj >= span.limit || span.state != mSpanInUse) {
print("runtime: found in object at *(", hex(b), "+", hex(i), ") = ", hex(obj), ", pc=", hex(pc), "\n")
name, file, line := funcfileline(pc, -1)
name, file, line, _ := funcfileline(pc, -1)
print(name, "\n", file, ":", line, "\n")
//gcDumpObject("object", b, i)
throw("found bad pointer in Go stack (incorrect use of unsafe or cgo?)")

View File

@ -24,6 +24,10 @@ const (
blockProfile
mutexProfile
// a profile bucket from one of the categories above whose stack
// trace has been fixed up / pruned.
prunedProfile
// size of bucket hash table
buckHashSize = 179999
@ -138,11 +142,13 @@ type blockRecord struct {
}
var (
mbuckets *bucket // memory profile buckets
bbuckets *bucket // blocking profile buckets
xbuckets *bucket // mutex profile buckets
buckhash *[179999]*bucket
bucketmem uintptr
mbuckets *bucket // memory profile buckets
bbuckets *bucket // blocking profile buckets
xbuckets *bucket // mutex profile buckets
sbuckets *bucket // pre-symbolization profile buckets (stacks fixed up)
freebuckets *bucket // freelist of unused fixed up profile buckets
buckhash *[179999]*bucket
bucketmem uintptr
mProf struct {
// All fields in mProf are protected by proflock.
@ -158,12 +164,35 @@ var (
const mProfCycleWrap = uint32(len(memRecord{}.future)) * (2 << 24)
// payloadOffset() returns the offset within a bucket of the profile
// payload (past the bucket struct itself and then the stack trace).
func payloadOffset(typ bucketType, nstk uintptr) uintptr {
if typ == prunedProfile {
// To allow reuse of prunedProfile buckets between different
// collections, allocate them with the max stack size (the portion
// of the stack used will vary from trace to trace).
nstk = maxStack
}
return unsafe.Sizeof(bucket{}) + uintptr(nstk)*unsafe.Sizeof(uintptr(0))
}
func max(x, y uintptr) uintptr {
if x > y {
return x
}
return y
}
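
To make the layout concrete, here is a small standalone sketch of the
arithmetic payloadOffset performs (the struct below is a stand-in with
made-up fields, not the runtime's real bucket type): the header comes
first, then nstk raw PC slots, then the memRecord/blockRecord payload.
For prunedProfile buckets the real code uses maxStack slots so the bucket
can be reused across collections regardless of actual stack depth.

    package main

    import (
        "fmt"
        "unsafe"
    )

    // bucketHeader is a stand-in for runtime's bucket struct; only the
    // overall size matters for this illustration.
    type bucketHeader struct {
        next, allnext unsafe.Pointer
        typ           int
        hash, size    uintptr
        nstk          uintptr
    }

    func main() {
        const nstk = 3 // a 3-frame raw stack
        stackOff := unsafe.Sizeof(bucketHeader{})
        payloadOff := stackOff + nstk*unsafe.Sizeof(uintptr(0))
        fmt.Printf("stack slots at +%d, profile record at +%d\n",
            stackOff, payloadOff)
    }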
// newBucket allocates a bucket with the given type and number of stack entries.
func newBucket(typ bucketType, nstk int) *bucket {
size := unsafe.Sizeof(bucket{}) + uintptr(nstk)*unsafe.Sizeof(location{})
size := payloadOffset(typ, uintptr(nstk))
switch typ {
default:
throw("invalid profile bucket type")
case prunedProfile:
// stack-fixed buckets are large enough to accommodate any payload.
size += max(unsafe.Sizeof(memRecord{}), unsafe.Sizeof(blockRecord{}))
case memProfile:
size += unsafe.Sizeof(memRecord{})
case blockProfile, mutexProfile:
@ -178,31 +207,29 @@ func newBucket(typ bucketType, nstk int) *bucket {
}
// stk returns the slice in b holding the stack.
func (b *bucket) stk() []location {
stk := (*[maxStack]location)(add(unsafe.Pointer(b), unsafe.Sizeof(*b)))
func (b *bucket) stk() []uintptr {
stk := (*[maxStack]uintptr)(add(unsafe.Pointer(b), unsafe.Sizeof(*b)))
return stk[:b.nstk:b.nstk]
}
// mp returns the memRecord associated with the memProfile bucket b.
func (b *bucket) mp() *memRecord {
if b.typ != memProfile {
if b.typ != memProfile && b.typ != prunedProfile {
throw("bad use of bucket.mp")
}
data := add(unsafe.Pointer(b), unsafe.Sizeof(*b)+b.nstk*unsafe.Sizeof(location{}))
return (*memRecord)(data)
return (*memRecord)(add(unsafe.Pointer(b), payloadOffset(b.typ, b.nstk)))
}
// bp returns the blockRecord associated with the blockProfile bucket b.
func (b *bucket) bp() *blockRecord {
if b.typ != blockProfile && b.typ != mutexProfile {
if b.typ != blockProfile && b.typ != mutexProfile && b.typ != prunedProfile {
throw("bad use of bucket.bp")
}
data := add(unsafe.Pointer(b), unsafe.Sizeof(*b)+b.nstk*unsafe.Sizeof(location{}))
return (*blockRecord)(data)
return (*blockRecord)(add(unsafe.Pointer(b), payloadOffset(b.typ, b.nstk)))
}
// Return the bucket for stk[0:nstk], allocating new bucket if needed.
func stkbucket(typ bucketType, size uintptr, stk []location, alloc bool) *bucket {
func stkbucket(typ bucketType, size uintptr, stk []uintptr, alloc bool) *bucket {
if buckhash == nil {
buckhash = (*[buckHashSize]*bucket)(sysAlloc(unsafe.Sizeof(*buckhash), &memstats.buckhash_sys))
if buckhash == nil {
@ -212,8 +239,8 @@ func stkbucket(typ bucketType, size uintptr, stk []location, alloc bool) *bucket
// Hash stack.
var h uintptr
for _, loc := range stk {
h += loc.pc
for _, pc := range stk {
h += pc
h += h << 10
h ^= h >> 6
}
@ -249,6 +276,9 @@ func stkbucket(typ bucketType, size uintptr, stk []location, alloc bool) *bucket
} else if typ == mutexProfile {
b.allnext = xbuckets
xbuckets = b
} else if typ == prunedProfile {
b.allnext = sbuckets
sbuckets = b
} else {
b.allnext = bbuckets
bbuckets = b
@ -256,7 +286,7 @@ func stkbucket(typ bucketType, size uintptr, stk []location, alloc bool) *bucket
return b
}
func eqslice(x, y []location) bool {
func eqslice(x, y []uintptr) bool {
if len(x) != len(y) {
return false
}
@ -338,8 +368,8 @@ func mProf_PostSweep() {
// Called by malloc to record a profiled block.
func mProf_Malloc(p unsafe.Pointer, size uintptr) {
var stk [maxStack]location
nstk := callers(4, stk[:])
var stk [maxStack]uintptr
nstk := callersRaw(1, stk[:])
lock(&proflock)
b := stkbucket(memProfile, size, stk[:nstk], true)
c := mProf.cycle
@ -414,13 +444,13 @@ func blocksampled(cycles int64) bool {
func saveblockevent(cycles int64, skip int, which bucketType) {
gp := getg()
var nstk int
var stk [maxStack]location
var stk [maxStack]uintptr
if gp.m.curg == nil || gp.m.curg == gp {
nstk = callers(skip, stk[:])
nstk = callersRaw(skip, stk[:])
} else {
// FIXME: This should get a traceback of gp.m.curg.
// nstk = gcallers(gp.m.curg, skip, stk[:])
nstk = callers(skip, stk[:])
nstk = callersRaw(skip, stk[:])
}
lock(&proflock)
b := stkbucket(which, 0, stk[:nstk], true)
@ -521,6 +551,150 @@ func (r *MemProfileRecord) Stack() []uintptr {
return r.Stack0[0:]
}
// reusebucket tries to pick a prunedProfile bucket off
// the freebuckets list, returning it if one is available or nil
// if the free list is empty.
func reusebucket(nstk int) *bucket {
var b *bucket
if freebuckets != nil {
b = freebuckets
freebuckets = freebuckets.allnext
b.typ = prunedProfile
b.nstk = uintptr(nstk)
mp := b.mp()
// Hack: rely on the fact that memprofile records are
// larger than blockprofile records when clearing.
*mp = memRecord{}
}
return b
}
// freebucket pushes the specified prunedProfile bucket onto the
// free list, removes references to it from the bucket hash, and
// returns the next bucket in the allnext chain.
func freebucket(tofree *bucket) *bucket {
// Thread this bucket into the free list.
ret := tofree.allnext
tofree.allnext = freebuckets
freebuckets = tofree
// Clean up the hash. The hash may point directly to this bucket...
i := int(tofree.hash % buckHashSize)
if buckhash[i] == tofree {
buckhash[i] = tofree.next
} else {
// ... or when this bucket was inserted by stkbucket, it may have been
// chained off some other unrelated bucket.
for b := buckhash[i]; b != nil; b = b.next {
if b.next == tofree {
b.next = tofree.next
break
}
}
}
return ret
}
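
The hash cleanup in freebucket is the only subtle step: the hash slot may
point at the bucket directly, or the bucket may be chained off another
entry in the same slot. A tiny standalone sketch of that unlink, using a
made-up node type rather than the runtime's bucket:

    package main

    import "fmt"

    // node stands in for the bucket's hash chaining ("next" link).
    type node struct {
        next *node
        id   int
    }

    // unlink removes n from the chain headed at *head, the same shape
    // as freebucket's hash cleanup.
    func unlink(head **node, n *node) {
        if *head == n {
            *head = n.next // the slot pointed directly at n
            return
        }
        for b := *head; b != nil; b = b.next {
            if b.next == n {
                b.next = n.next // n was chained off another entry
                return
            }
        }
    }

    func main() {
        c := &node{id: 3}
        b := &node{next: c, id: 2}
        a := &node{next: b, id: 1}
        head := a
        unlink(&head, b)
        for n := head; n != nil; n = n.next {
            fmt.Println(n.id) // prints 1, then 3
        }
    }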
// fixupStack takes a 'raw' stack trace (stack of PCs generated by
// callersRaw) and performs pre-symbolization fixup on it, returning
// the results in 'canonStack'. For each frame we look at the
// file/func/line information, then use that info to decide whether to
// include the frame in the final symbolized stack (removing frames
// corresponding to 'morestack' routines, for example). We also expand
frames if the PC values to which they refer correspond to inlined
// functions to allow for expanded symbolic info to be filled in
// later. Note: there is code in go-callers.c's backtrace_full callback()
// function that performs very similar fixups; these two code paths
// should be kept in sync.
func fixupStack(stk []uintptr, canonStack *[maxStack]uintptr, size uintptr) int {
var cidx int
var termTrace bool
for _, pc := range stk {
// Subtract 1 from PC to undo the 1 we added in callback in
// go-callers.c.
function, file, _, frames := funcfileline(pc-1, -1)
// Skip split-stack functions (match by function name)
skipFrame := false
if hasPrefix(function, "_____morestack_") || hasPrefix(function, "__morestack_") {
skipFrame = true
}
// Skip split-stack functions (match by file)
if hasSuffix(file, "/morestack.S") {
skipFrame = true
}
// Skip thunks and recover functions. There is no equivalent to
// these functions in the gc toolchain.
fcn := function
if hasSuffix(fcn, "..r") {
skipFrame = true
} else {
for fcn != "" && (fcn[len(fcn)-1] >= '0' && fcn[len(fcn)-1] <= '9') {
fcn = fcn[:len(fcn)-1]
}
if hasSuffix(fcn, "..stub") || hasSuffix(fcn, "..thunk") {
skipFrame = true
}
}
if skipFrame {
continue
}
// Terminate the trace if we encounter a frame corresponding to
// runtime.main, runtime.kickoff, makecontext, etc. See the
// corresponding code in go-callers.c, callback function used
// with backtrace_full.
if function == "makecontext" {
termTrace = true
}
if hasSuffix(file, "/proc.c") && function == "runtime_mstart" {
termTrace = true
}
if hasSuffix(file, "/proc.go") &&
(function == "runtime.main" || function == "runtime.kickoff") {
termTrace = true
}
// Expand inline frames.
for i := 0; i < frames; i++ {
(*canonStack)[cidx] = pc
cidx++
if cidx >= maxStack {
termTrace = true
break
}
}
if termTrace {
break
}
}
return cidx
}
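
The skip rules above can be exercised on their own. The sketch below
re-implements them with the standard strings package on made-up frame
names (illustration only; the runtime itself uses its internal
hasPrefix/hasSuffix helpers):

    package main

    import (
        "fmt"
        "strings"
    )

    // keep mirrors fixupStack's frame-filtering decision.
    func keep(function, file string) bool {
        // Split-stack helper frames, matched by name or by file.
        if strings.HasPrefix(function, "_____morestack_") ||
            strings.HasPrefix(function, "__morestack_") ||
            strings.HasSuffix(file, "/morestack.S") {
            return false
        }
        // Recover wrappers end in "..r"; other compiler-generated
        // thunks/stubs end in "..thunk<N>" or "..stub<N>".
        if strings.HasSuffix(function, "..r") {
            return false
        }
        fcn := strings.TrimRight(function, "0123456789")
        if strings.HasSuffix(fcn, "..stub") || strings.HasSuffix(fcn, "..thunk") {
            return false
        }
        return true
    }

    func main() {
        fmt.Println(keep("main.work..thunk12", "work.go"))             // false: compiler thunk
        fmt.Println(keep("__morestack_block_signals", "morestack.S")) // false: split-stack helper
        fmt.Println(keep("main.work", "work.go"))                     // true: ordinary frame
    }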
// fixupBucket takes a raw (mem/block/mutex) profile bucket and creates
// a new bucket in which the stack trace has been fixed up (inline
// frames expanded, unwanted frames stripped out). The original bucket
// is left unmodified; a new prunedProfile bucket may be generated as a
// side effect.
// Payload information from the original bucket is incorporated into
// the new bucket.
func fixupBucket(b *bucket) {
var canonStack [maxStack]uintptr
frames := fixupStack(b.stk(), &canonStack, b.size)
cb := stkbucket(prunedProfile, b.size, canonStack[:frames], true)
switch b.typ {
default:
throw("invalid profile bucket type")
case memProfile:
rawrecord := b.mp()
cb.mp().active.add(&rawrecord.active)
case blockProfile, mutexProfile:
bpcount := b.bp().count
cb.bp().count += bpcount
cb.bp().cycles += bpcount
}
}
// MemProfile returns a profile of memory allocated and freed per allocation
// site.
//
@ -576,15 +750,31 @@ func MemProfile(p []MemProfileRecord, inuseZero bool) (n int, ok bool) {
}
}
if n <= len(p) {
ok = true
idx := 0
for b := mbuckets; b != nil; b = b.allnext {
var bnext *bucket
// Post-process raw buckets to fix up their stack traces
for b := mbuckets; b != nil; b = bnext {
bnext = b.allnext
mp := b.mp()
if inuseZero || mp.active.alloc_bytes != mp.active.free_bytes {
record(&p[idx], b)
idx++
fixupBucket(b)
}
}
// Record pruned/fixed-up buckets
ok = true
idx := 0
for b := sbuckets; b != nil; b = b.allnext {
record(&p[idx], b)
idx++
}
n = idx
// Free up pruned buckets for use in next round
for b := sbuckets; b != nil; b = bnext {
bnext = freebucket(b)
}
sbuckets = nil
}
unlock(&proflock)
return
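
For context, the usual consumer of MemProfile is runtime/pprof's heap
profile, so the fixup/record/free sequence above runs whenever a heap
profile is written. A minimal example using only the public API (not part
of the patch):

    package main

    import (
        "log"
        "os"
        "runtime"
        "runtime/pprof"
    )

    func main() {
        // MemProfileRate is already non-zero by default; lowering it
        // here just makes the sample more complete for the demo.
        runtime.MemProfileRate = 1

        data := make([][]byte, 0, 1024)
        for i := 0; i < 1024; i++ {
            data = append(data, make([]byte, 4096))
        }
        _ = data

        f, err := os.Create("heap.out")
        if err != nil {
            log.Fatal(err)
        }
        defer f.Close()
        // WriteHeapProfile walks the memory-profile records
        // (runtime.MemProfile) under the hood.
        if err := pprof.WriteHeapProfile(f); err != nil {
            log.Fatal(err)
        }
    }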
@ -597,18 +787,18 @@ func record(r *MemProfileRecord, b *bucket) {
r.FreeBytes = int64(mp.active.free_bytes)
r.AllocObjects = int64(mp.active.allocs)
r.FreeObjects = int64(mp.active.frees)
for i, loc := range b.stk() {
for i, pc := range b.stk() {
if i >= len(r.Stack0) {
break
}
r.Stack0[i] = loc.pc
r.Stack0[i] = pc
}
for i := int(b.nstk); i < len(r.Stack0); i++ {
r.Stack0[i] = 0
}
}
func iterate_memprof(fn func(*bucket, uintptr, *location, uintptr, uintptr, uintptr)) {
func iterate_memprof(fn func(*bucket, uintptr, *uintptr, uintptr, uintptr, uintptr)) {
lock(&proflock)
for b := mbuckets; b != nil; b = b.allnext {
mp := b.mp()
@ -625,6 +815,49 @@ type BlockProfileRecord struct {
StackRecord
}
func harvestBlockMutexProfile(buckets *bucket, p []BlockProfileRecord) (n int, ok bool) {
for b := buckets; b != nil; b = b.allnext {
n++
}
if n <= len(p) {
var bnext *bucket
// Post-process raw buckets to create pruned/fixed-up buckets
for b := buckets; b != nil; b = bnext {
bnext = b.allnext
fixupBucket(b)
}
// Record
ok = true
for b := sbuckets; b != nil; b = b.allnext {
bp := b.bp()
r := &p[0]
r.Count = bp.count
r.Cycles = bp.cycles
i := 0
var pc uintptr
for i, pc = range b.stk() {
if i >= len(r.Stack0) {
break
}
r.Stack0[i] = pc
}
for ; i < len(r.Stack0); i++ {
r.Stack0[i] = 0
}
p = p[1:]
}
// Free up pruned buckets for use in next round.
for b := sbuckets; b != nil; b = bnext {
bnext = freebucket(b)
}
sbuckets = nil
}
return
}
// BlockProfile returns n, the number of records in the current blocking profile.
// If len(p) >= n, BlockProfile copies the profile into p and returns n, true.
// If len(p) < n, BlockProfile does not change p and returns n, false.
@ -634,30 +867,7 @@ type BlockProfileRecord struct {
// of calling BlockProfile directly.
func BlockProfile(p []BlockProfileRecord) (n int, ok bool) {
lock(&proflock)
for b := bbuckets; b != nil; b = b.allnext {
n++
}
if n <= len(p) {
ok = true
for b := bbuckets; b != nil; b = b.allnext {
bp := b.bp()
r := &p[0]
r.Count = bp.count
r.Cycles = bp.cycles
i := 0
var loc location
for i, loc = range b.stk() {
if i >= len(r.Stack0) {
break
}
r.Stack0[i] = loc.pc
}
for ; i < len(r.Stack0); i++ {
r.Stack0[i] = 0
}
p = p[1:]
}
}
n, ok = harvestBlockMutexProfile(bbuckets, p)
unlock(&proflock)
return
}
@ -670,30 +880,7 @@ func BlockProfile(p []BlockProfileRecord) (n int, ok bool) {
// instead of calling MutexProfile directly.
func MutexProfile(p []BlockProfileRecord) (n int, ok bool) {
lock(&proflock)
for b := xbuckets; b != nil; b = b.allnext {
n++
}
if n <= len(p) {
ok = true
for b := xbuckets; b != nil; b = b.allnext {
bp := b.bp()
r := &p[0]
r.Count = int64(bp.count)
r.Cycles = bp.cycles
i := 0
var loc location
for i, loc = range b.stk() {
if i >= len(r.Stack0) {
break
}
r.Stack0[i] = loc.pc
}
for ; i < len(r.Stack0); i++ {
r.Stack0[i] = 0
}
p = p[1:]
}
}
n, ok = harvestBlockMutexProfile(xbuckets, p)
unlock(&proflock)
return
}
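
Likewise, the block and mutex harvesting paths above are what
runtime/pprof's "block" and "mutex" profiles ultimately reach. A small
example using only the public API (not part of the patch):

    package main

    import (
        "os"
        "runtime"
        "runtime/pprof"
        "sync"
        "time"
    )

    func main() {
        runtime.SetBlockProfileRate(1)     // record every blocking event
        runtime.SetMutexProfileFraction(1) // record every contention event

        var mu sync.Mutex
        mu.Lock()
        go func() {
            time.Sleep(10 * time.Millisecond)
            mu.Unlock() // contended unlock feeds the mutex profile
        }()
        mu.Lock() // blocks here, feeding the block profile
        mu.Unlock()

        // Both lookups end up in BlockProfile / MutexProfile.
        pprof.Lookup("block").WriteTo(os.Stdout, 1)
        pprof.Lookup("mutex").WriteTo(os.Stdout, 1)
    }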

View File

@ -53,7 +53,7 @@ var indexError = error(errorString("index out of range"))
// entire runtime stack for easier debugging.
func panicindex() {
name, _, _ := funcfileline(getcallerpc()-1, -1)
name, _, _, _ := funcfileline(getcallerpc()-1, -1)
if hasPrefix(name, "runtime.") {
throw(string(indexError.(errorString)))
}
@ -64,7 +64,7 @@ func panicindex() {
var sliceError = error(errorString("slice bounds out of range"))
func panicslice() {
name, _, _ := funcfileline(getcallerpc()-1, -1)
name, _, _, _ := funcfileline(getcallerpc()-1, -1)
if hasPrefix(name, "runtime.") {
throw(string(sliceError.(errorString)))
}

View File

@ -360,6 +360,10 @@ func hasPrefix(s, prefix string) bool {
return len(s) >= len(prefix) && s[:len(prefix)] == prefix
}
func hasSuffix(s, suffix string) bool {
return len(s) >= len(suffix) && s[len(s)-len(suffix):] == suffix
}
const (
maxUint = ^uint(0)
maxInt = int(maxUint >> 1)

View File

@ -79,7 +79,7 @@ func (ci *Frames) Next() (frame Frame, more bool) {
// Subtract 1 from PC to undo the 1 we added in callback in
// go-callers.c.
function, file, line := funcfileline(pc-1, int32(i))
function, file, line, _ := funcfileline(pc-1, int32(i))
if function == "" && file == "" {
return Frame{}, more
}
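
The add-1/subtract-1 dance (the callback in go-callers.c adds 1, the
symbolization call sites subtract 1) keeps stored PCs looking like return
addresses while making line lookups land on the call instruction. The
sketch below shows the same adjustment with the public API; real code
should prefer runtime.CallersFrames, which handles the adjustment itself,
so the direct FuncForPC use here is only to make the off-by-one visible:

    package main

    import (
        "fmt"
        "runtime"
    )

    func main() {
        pcs := make([]uintptr, 8)
        n := runtime.Callers(1, pcs)
        for _, pc := range pcs[:n] {
            // pc is (conceptually) a return address; back up by one so
            // the lookup lands on the call instruction rather than the
            // instruction after it.
            if f := runtime.FuncForPC(pc - 1); f != nil {
                file, line := f.FileLine(pc - 1)
                fmt.Println(f.Name(), file, line)
            }
        }
    }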
@ -158,7 +158,7 @@ const (
// the a *Func describing the innermost function, but with an entry
// of the outermost function.
func FuncForPC(pc uintptr) *Func {
name, _, _ := funcfileline(pc, -1)
name, _, _, _ := funcfileline(pc, -1)
if name == "" {
return nil
}
@ -187,7 +187,7 @@ func (f *Func) Entry() uintptr {
// The result will not be accurate if pc is not a program
// counter within f.
func (f *Func) FileLine(pc uintptr) (file string, line int) {
_, file, line = funcfileline(pc, -1)
_, file, line, _ = funcfileline(pc, -1)
return file, line
}
@ -261,5 +261,5 @@ func demangleSymbol(s string) string {
}
// implemented in go-caller.c
func funcfileline(uintptr, int32) (string, string, int)
func funcfileline(uintptr, int32) (string, string, int, int)
func funcentry(uintptr) uintptr

View File

@ -20,7 +20,7 @@ func printcreatedby(gp *g) {
if entry != 0 && tracepc > entry {
tracepc -= sys.PCQuantum
}
function, file, line := funcfileline(tracepc, -1)
function, file, line, _ := funcfileline(tracepc, -1)
if function != "" && showframe(function, gp, false) && gp.goid != 1 {
printcreatedby1(function, file, line, entry, pc)
}
@ -61,6 +61,16 @@ func callers(skip int, locbuf []location) int {
return int(n)
}
//go:noescape
//extern runtime_callersRaw
func c_callersRaw(skip int32, pcs *uintptr, max int32) int32
// callersRaw returns a raw (PCs only) stack trace of the current goroutine.
func callersRaw(skip int, pcbuf []uintptr) int {
n := c_callersRaw(int32(skip)+1, &pcbuf[0], int32(len(pcbuf)))
return int(n)
}
// traceback prints a traceback of the current goroutine.
// This differs from the gc version, which is given pc, sp, lr and g and
// can print a traceback of any goroutine.
@ -83,7 +93,7 @@ func traceback(skip int32) {
func printAncestorTraceback(ancestor ancestorInfo) {
print("[originating from goroutine ", ancestor.goid, "]:\n")
for fidx, pc := range ancestor.pcs {
function, file, line := funcfileline(pc, -1)
function, file, line, _ := funcfileline(pc, -1)
if showfuncinfo(function, fidx == 0) {
printAncestorTracebackFuncInfo(function, file, line, pc)
}
@ -92,7 +102,7 @@ func printAncestorTraceback(ancestor ancestorInfo) {
print("...additional frames elided...\n")
}
// Show what created goroutine, except main goroutine (goid 1).
function, file, line := funcfileline(ancestor.gopc, -1)
function, file, line, _ := funcfileline(ancestor.gopc, -1)
if function != "" && showfuncinfo(function, false) && ancestor.goid != 1 {
printcreatedby1(function, file, line, funcentry(ancestor.gopc), ancestor.gopc)
}

View File

@ -26,11 +26,13 @@ struct caller
String file;
intgo line;
intgo index;
intgo frames;
};
/* Collect file/line information for a PC value. If this is called
more than once, due to inlined functions, we use the last call, as
that is usually the most useful one. */
more than once, due to inlined functions, we record the number of
inlined frames but return file/func/line for the last call, as
that is usually the most useful one. */
static int
callback (void *data, uintptr_t pc __attribute__ ((unused)),
@ -38,6 +40,8 @@ callback (void *data, uintptr_t pc __attribute__ ((unused)),
{
struct caller *c = (struct caller *) data;
c->frames++;
/* The libbacktrace library says that these strings might disappear,
but with the current implementation they won't. We can't easily
allocate memory here, so for now assume that we can save a
@ -125,18 +129,19 @@ __go_get_backtrace_state ()
return back_state;
}
/* Return function/file/line information for PC. The index parameter
/* Return function/file/line/nframes information for PC. The index parameter
is the entry on the stack of inlined functions; -1 means the last
one. */
one, with *nframes set to the count of inlined frames for this PC. */
static _Bool
__go_file_line (uintptr pc, int index, String *fn, String *file, intgo *line)
__go_file_line (uintptr pc, int index, String *fn, String *file, intgo *line, intgo *nframes)
{
struct caller c;
struct backtrace_state *state;
runtime_memclr (&c, sizeof c);
c.index = index;
c.frames = 0;
runtime_xadd (&__go_runtime_in_callers, 1);
state = __go_get_backtrace_state ();
runtime_xadd (&__go_runtime_in_callers, -1);
@ -144,6 +149,7 @@ __go_file_line (uintptr pc, int index, String *fn, String *file, intgo *line)
*fn = c.fn;
*file = c.file;
*line = c.line;
*nframes = c.frames;
// If backtrace_pcinfo didn't get the function name from the debug
// info, try to get it from the symbol table.
@ -222,7 +228,7 @@ runtime_funcfileline (uintptr targetpc, int32 index)
struct funcfileline_return ret;
if (!__go_file_line (targetpc, index, &ret.retfn, &ret.retfile,
&ret.retline))
&ret.retline, &ret.retframes))
runtime_memclr (&ret, sizeof ret);
return ret;
}

View File

@ -63,7 +63,9 @@ callback (void *data, uintptr_t pc, const char *filename, int lineno,
/* Skip thunks and recover functions. There is no equivalent to
these functions in the gc toolchain, so returning them here means
significantly different results for runtime.Caller(N). */
significantly different results for runtime.Caller(N). See also
similar code in runtime/mprof.go that strips out such functions
for block/mutex/memory profiles. */
if (function != NULL && !arg->keep_thunks)
{
const char *p;
@ -262,3 +264,62 @@ Callers (intgo skip, struct __go_open_array pc)
return ret;
}
struct callersRaw_data
{
uintptr* pcbuf;
int skip;
int index;
int max;
};
// Callback function for backtrace_simple. Just collect pc's.
// Return zero to continue, non-zero to stop.
static int callback_raw (void *data, uintptr_t pc)
{
struct callersRaw_data *arg = (struct callersRaw_data *) data;
if (arg->skip > 0)
{
--arg->skip;
return 0;
}
/* On the call to backtrace_simple the pc value was most likely
decremented if there was a normal call, since the pc referred to
the instruction where the call returned and not the call itself.
This was done so that the line number referred to the call
instruction. To make sure the actual pc from the call stack is
used, it is incremented here.
In the case of a signal, the pc was not decremented by
backtrace_full but still incremented here. That doesn't really
hurt anything since the line number is right and the pc refers to
the same instruction. */
arg->pcbuf[arg->index] = pc + 1;
arg->index++;
return arg->index >= arg->max;
}
/* runtime_callersRaw is similar to runtime_callers() above, but
it returns raw PC values as opposed to file/func/line locations. */
int32
runtime_callersRaw (int32 skip, uintptr *pcbuf, int32 m)
{
struct callersRaw_data data;
struct backtrace_state* state;
data.pcbuf = pcbuf;
data.skip = skip + 1;
data.index = 0;
data.max = m;
runtime_xadd (&__go_runtime_in_callers, 1);
state = __go_get_backtrace_state ();
backtrace_simple (state, 0, callback_raw, error_callback, &data);
runtime_xadd (&__go_runtime_in_callers, -1);
return data.index;
}

View File

@ -485,6 +485,7 @@ struct funcfileline_return
String retfn;
String retfile;
intgo retline;
intgo retframes;
};
struct funcfileline_return