gcc/libgo/go/runtime/mbitmap.go

// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Garbage collector: type and heap bitmaps.
//
// Stack, data, and bss bitmaps
//
// Stack frames and global variables in the data and bss sections are described
// by 1-bit bitmaps in which 0 means uninteresting and 1 means live pointer
// to be visited during GC. The bits in each byte are consumed starting with
// the low bit: 1<<0, 1<<1, and so on.
//
// Heap bitmap
//
// The allocated heap comes from a subset of the memory in the range [start, used),
// where start == mheap_.arena_start and used == mheap_.arena_used.
// The heap bitmap comprises 2 bits for each pointer-sized word in that range,
// stored in bytes indexed backward in memory from start.
// That is, the byte at address start-1 holds the 2-bit entries for the four words
// start through start+3*ptrSize, the byte at start-2 holds the entries for
// start+4*ptrSize through start+7*ptrSize, and so on.
//
// In each 2-bit entry, the lower bit holds the same information as in the 1-bit
// bitmaps: 0 means uninteresting and 1 means live pointer to be visited during GC.
// The meaning of the high bit depends on the position of the word being described
// in its allocated object. In all words *except* the second word, the
// high bit indicates that the object is still being described. In
// these words, if a bit pair with a high bit 0 is encountered, the
// low bit can also be assumed to be 0, and the object description is
// over. This 00 is called the ``dead'' encoding: it signals that the
// rest of the words in the object are uninteresting to the garbage
// collector.
//
// In the second word, the high bit is the GC ``checkmarked'' bit (see below).
//
// The 2-bit entries are split when written into the byte, so that the top half
// of the byte contains 4 high bits and the bottom half contains 4 low (pointer)
// bits.
// This form allows a copy from the 1-bit to the 4-bit form to keep the
// pointer bits contiguous, instead of having to space them out.
//
// The code makes use of the fact that the zero value for a heap bitmap
// has no live pointer bit set and is (depending on position), not used,
// not checkmarked, and is the dead encoding.
// These properties must be preserved when modifying the encoding.
//
// Checkmarks
//
// In a concurrent garbage collector, one worries about failing to mark
// a live object due to mutations without write barriers or bugs in the
// collector implementation. As a sanity check, the GC has a 'checkmark'
// mode that retraverses the object graph with the world stopped, to make
// sure that everything that should be marked is marked.
// In checkmark mode, in the heap bitmap, the high bit of the 2-bit entry
// for the second word of the object holds the checkmark bit.
// When not in checkmark mode, this bit is set to 1.
//
// The smallest possible allocation is 8 bytes. On a 32-bit machine, that
// means every allocated object has two words, so there is room for the
// checkmark bit. On a 64-bit machine, however, the 8-byte allocation is
// just one word, so the second bit pair is not available for encoding the
// checkmark. However, because non-pointer allocations are combined
// into larger 16-byte (maxTinySize) allocations, a plain 8-byte allocation
// must be a pointer, so the type bit in the first word is not actually needed.
// It is still used in general, except in checkmark the type bit is repurposed
// as the checkmark bit and then reinitialized (to 1) as the type bit when
// finished.
//

package runtime

import (
	"runtime/internal/atomic"
	"runtime/internal/sys"
	"unsafe"
)

const (
	// bitPointer is the pointer bit for the first of the four words
	// described by a heap bitmap byte. The four pointer bits occupy
	// the low half of the byte.
	bitPointer = 1 << 0
	// bitScan is the high bit of the 2-bit entry for the first of the
	// four words described by a heap bitmap byte. The four high bits
	// occupy the top half of the byte.
	bitScan    = 1 << 4

	heapBitsShift   = 1                     // shift offset between successive bitPointer or bitScan entries
	heapBitmapScale = sys.PtrSize * (8 / 2) // number of data bytes described by one heap bitmap byte

	// all scan/pointer bits in a byte
	bitScanAll    = bitScan | bitScan<<heapBitsShift | bitScan<<(2*heapBitsShift) | bitScan<<(3*heapBitsShift)
	bitPointerAll = bitPointer | bitPointer<<heapBitsShift | bitPointer<<(2*heapBitsShift) | bitPointer<<(3*heapBitsShift)
)

// addb returns the byte pointer p+n.
// The addition is done on the raw address; no bounds or overflow
// checking is performed.
//go:nowritebarrier
//go:nosplit
func addb(p *byte, n uintptr) *byte {
	// Note: wrote out full expression instead of calling add(p, n)
	// to reduce the number of temporaries generated by the
	// compiler for this trivial expression during inlining.
	return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + n))
}

// subtractb returns the byte pointer p-n.
// subtractb is typically used when traversing the pointer tables referred to by hbits
// which are arranged in reverse order.
// The subtraction is done on the raw address; no bounds or underflow
// checking is performed.
//go:nowritebarrier
//go:nosplit
func subtractb(p *byte, n uintptr) *byte {
	// Note: wrote out full expression instead of calling add(p, -n)
	// to reduce the number of temporaries generated by the
	// compiler for this trivial expression during inlining.
	return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) - n))
}

// add1 returns the byte pointer p+1.
// It is the single-step form of addb and performs no bounds checking.
//go:nowritebarrier
//go:nosplit
func add1(p *byte) *byte {
	// Note: wrote out full expression instead of calling addb(p, 1)
	// to reduce the number of temporaries generated by the
	// compiler for this trivial expression during inlining.
	return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + 1))
}

// subtract1 returns the byte pointer p-1.
// subtract1 is typically used when traversing the pointer tables referred to by hbits
// which are arranged in reverse order.
// It is the single-step form of subtractb and performs no bounds checking.
//go:nowritebarrier
//
// nosplit because it is used during write barriers and must not be preempted.
//go:nosplit
func subtract1(p *byte) *byte {
	// Note: wrote out full expression instead of calling subtractb(p, 1)
	// to reduce the number of temporaries generated by the
	// compiler for this trivial expression during inlining.
	return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) - 1))
}

// mapBits is called each time arena_used is extended.
// It maps any additional bitmap memory needed for the new arena memory.
// It must be called with the expected new value of arena_used,
// *before* h.arena_used has been updated.
// Waiting to update arena_used until after the memory has been mapped
// avoids faults when other threads try to access the bitmap immediately
// after observing the change to arena_used.
//
//go:nowritebarrier
func (h *mheap) mapBits(arena_used uintptr) {
	// Caller has added extra mappings to the arena.
	// Add extra mappings of bitmap words as needed.
	// We allocate extra bitmap pieces in chunks of bitmapChunk.
	const bitmapChunk = 8192

	// n is the number of bitmap bytes needed to describe the arena up to
	// arena_used, rounded up to a whole number of chunks and then to a
	// whole number of physical pages.
	n := (arena_used - mheap_.arena_start) / heapBitmapScale
	n = round(n, bitmapChunk)
	n = round(n, physPageSize)
	if h.bitmap_mapped >= n {
		// Enough bitmap is already mapped.
		return
	}

	// The bitmap is indexed backward from h.bitmap, so the new bytes to
	// map are the n-h.bitmap_mapped bytes starting at h.bitmap-n.
	sysMap(unsafe.Pointer(h.bitmap-n), n-h.bitmap_mapped, h.arena_reserved, &memstats.gc_sys)
	h.bitmap_mapped = n
}

// heapBits provides access to the bitmap bits for a single heap word.
// The methods on heapBits take value receivers so that the compiler
// can more easily inline calls to those methods and registerize the
// struct fields independently.
type heapBits struct {
	bitp  *uint8 // bitmap byte that holds the 2-bit entry for the described word
	shift uint32 // position of that entry within *bitp: 0 through 3*heapBitsShift
}

// markBits provides access to the mark bit for an object in the heap.
// bytep points to the byte holding the mark bit.
// mask is a byte with a single bit set that can be &ed with *bytep
// to see if the bit has been set.
// *m.bytep&m.mask != 0 indicates the mark bit is set.
// index can be used along with span information to generate
// the address of the object in the heap.
// We maintain one set of mark bits for allocation and one for
// marking purposes.
type markBits struct {
	bytep *uint8  // byte holding the bit
	mask  uint8   // single-bit mask selecting the bit within *bytep
	index uintptr // index of the object within its span
}

// allocBitsForIndex returns a markBits that addresses bit allocBitIndex
// of the span's allocation bitmap, s.allocBits. The bitmap stores eight
// objects per byte, lowest bit first.
//
//go:nosplit
func (s *mspan) allocBitsForIndex(allocBitIndex uintptr) markBits {
	return markBits{
		bytep: addb(s.allocBits, allocBitIndex/8),
		mask:  uint8(1) << (allocBitIndex % 8),
		index: allocBitIndex,
	}
}

// refillAllocCache loads the 8 bytes of s.allocBits that start at byte
// offset whichByte, assembles them into a 64-bit value with the first
// byte in the least significant position, and stores the bitwise
// complement in s.allocCache. Complementing lets ctz (count trailing
// zeros) be used to locate the next entry.
func (s *mspan) refillAllocCache(whichByte uintptr) {
	src := (*[8]uint8)(unsafe.Pointer(addb(s.allocBits, whichByte)))
	var packed uint64
	for i, b := range src {
		packed |= uint64(b) << (uint(i) * 8)
	}
	s.allocCache = ^packed
}

// nextFreeIndex returns the index of the next free object in s at
// or after s.freeindex.
// There are hardware instructions that can be used to make this
// faster if profiling warrants it.
//
// As a side effect it advances s.freeindex past the returned object and
// consumes the corresponding bits of s.allocCache. If no free object
// remains it returns s.nelems, leaving s.freeindex equal to s.nelems.
// It throws if s.freeindex is already greater than s.nelems.
func (s *mspan) nextFreeIndex() uintptr {
	sfreeindex := s.freeindex
	snelems := s.nelems
	if sfreeindex == snelems {
		return sfreeindex
	}
	if sfreeindex > snelems {
		throw("s.freeindex > s.nelems")
	}

	aCache := s.allocCache

	// s.allocCache holds the complement of the alloc bits, so the
	// lowest set bit identifies the first candidate object.
	bitIndex := sys.Ctz64(aCache)
	for bitIndex == 64 {
		// Move index to start of next cached bits.
		sfreeindex = (sfreeindex + 64) &^ (64 - 1)
		if sfreeindex >= snelems {
			s.freeindex = snelems
			return snelems
		}
		whichByte := sfreeindex / 8
		// Refill s.allocCache with the next 64 alloc bits.
		s.refillAllocCache(whichByte)
		aCache = s.allocCache
		bitIndex = sys.Ctz64(aCache)
		// nothing available in cached bits
		// grab the next 8 bytes and try again.
	}
	result := sfreeindex + uintptr(bitIndex)
	if result >= snelems {
		// The bit found lies beyond the last object in the span.
		s.freeindex = snelems
		return snelems
	}

	// Shift away the bits up to and including the one just claimed.
	s.allocCache >>= (bitIndex + 1)
	sfreeindex = result + 1

	if sfreeindex%64 == 0 && sfreeindex != snelems {
		// We just incremented s.freeindex so it isn't 0.
		// As each 1 in s.allocCache was encountered and used for allocation
		// it was shifted away. At this point s.allocCache contains all 0s.
		// Refill s.allocCache so that it corresponds
		// to the bits at s.allocBits starting at s.freeindex.
		whichByte := sfreeindex / 8
		s.refillAllocCache(whichByte)
	}
	s.freeindex = sfreeindex
	return result
}

// isFree reports whether the index'th object in s is unallocated.
// Every object below s.freeindex is treated as allocated; for the
// others the answer comes from the object's bit in s.allocBits.
func (s *mspan) isFree(index uintptr) bool {
	if index < s.freeindex {
		return false
	}
	mask := uint8(1) << (index % 8)
	return *addb(s.allocBits, index/8)&mask == 0
}

// objIndex returns the index within span s of the object containing
// address p, computed from p's byte offset from s.base().
func (s *mspan) objIndex(p uintptr) uintptr {
	off := p - s.base()
	switch {
	case off == 0:
		return 0
	case s.baseMask != 0:
		// s.baseMask is non-zero, elemsize is a power of two, so
		// dividing by it is just a shift by s.divShift.
		return off >> s.divShift
	}
	// General case: divide by the element size using a shift, a
	// multiplication by s.divMul, and a final shift by s.divShift2.
	scaled := (uint64(off) >> s.divShift) * uint64(s.divMul)
	return uintptr(scaled >> s.divShift2)
}

// markBitsForAddr returns the markBits for the object containing
// address p. It looks up p's span with spanOf and then resolves the
// object's index within that span.
func markBitsForAddr(p uintptr) markBits {
	span := spanOf(p)
	return span.markBitsForIndex(span.objIndex(p))
}

// markBitsForIndex returns a markBits that addresses bit objIndex of
// the span's mark bitmap, s.gcmarkBits. The bitmap stores eight objects
// per byte, lowest bit first.
func (s *mspan) markBitsForIndex(objIndex uintptr) markBits {
	// The low 3 bits of objIndex select the bit; the rest select the byte.
	return markBits{
		bytep: addb(s.gcmarkBits, objIndex/8),
		mask:  uint8(1) << (objIndex % 8),
		index: objIndex,
	}
}

// markBitsForBase returns the markBits for the first object in the
// span: bit 0 of the first byte of s.gcmarkBits, index 0.
func (s *mspan) markBitsForBase() markBits {
	return markBits{bytep: s.gcmarkBits, mask: 1, index: 0}
}

// isMarked reports whether mark bit m is set, i.e. whether the bit
// selected by m.mask is 1 in the byte m.bytep points at.
func (m markBits) isMarked() bool {
	selected := *m.bytep & m.mask
	return selected != 0
}

// setMarked sets the marked bit in the markbits, atomically, by OR-ing
// m.mask into *m.bytep. Some compilers are not able to inline the
// atomic.Or8 function, so if it appears as a hot spot, consider
// inlining it manually.
func (m markBits) setMarked() {
	// Might be racing with other updates, so use atomic update always.
	// We used to be clever here and use a non-atomic update in certain
	// cases, but it's not worth the risk.
	atomic.Or8(m.bytep, m.mask)
}

// setMarkedNonAtomic sets the marked bit in the markbits with a plain
// (non-atomic) read-modify-write of *m.bytep.
func (m markBits) setMarkedNonAtomic() {
	*m.bytep = *m.bytep | m.mask
}

// clearMarked clears the marked bit in the markbits, atomically, by
// AND-ing *m.bytep with the complement of m.mask.
func (m markBits) clearMarked() {
	// Might be racing with other updates, so use atomic update always.
	// We used to be clever here and use a non-atomic update in certain
	// cases, but it's not worth the risk.
	atomic.And8(m.bytep, ^m.mask)
}

// clearMarkedNonAtomic clears the marked bit in the markbits with a
// plain (non-atomic) read-modify-write of *m.bytep. The other bits in
// the byte are left unchanged, and clearing an already-clear bit is a
// no-op.
func (m markBits) clearMarkedNonAtomic() {
	// Use AND NOT rather than XOR: XOR toggles the bit, so it would
	// set the mark whenever the bit happened to be clear already.
	*m.bytep &^= m.mask
}

// markBitsForSpan returns the markBits for the span base address base.
// It throws if base lies outside [mheap_.arena_start, mheap_.arena_used)
// or if the resulting mark bit is not the first bit of its byte.
func markBitsForSpan(base uintptr) (mbits markBits) {
	inArena := base >= mheap_.arena_start && base < mheap_.arena_used
	if !inArena {
		throw("markBitsForSpan: base out of range")
	}
	mbits = markBitsForAddr(base)
	if mbits.mask == 1 {
		return mbits
	}
	throw("markBitsForSpan: unaligned start")
	return mbits
}

// advance moves m to the mark bit of the next object in the span:
// the next higher bit of the same byte, or the lowest bit of the
// following byte when m was on the top bit. m.index is incremented.
func (m *markBits) advance() {
	m.index++
	if m.mask != 1<<7 {
		m.mask <<= 1
		return
	}
	// Top bit of this byte: wrap around to bit 0 of the next byte.
	m.bytep = (*uint8)(unsafe.Pointer(uintptr(unsafe.Pointer(m.bytep)) + 1))
	m.mask = 1
}

// heapBitsForAddr returns the heapBits for the address addr.
// The caller must have already checked that addr is in the range [mheap_.arena_start, mheap_.arena_used).
//
// nosplit because it is used during write barriers and must not be preempted.
//go:nosplit
func heapBitsForAddr(addr uintptr) heapBits {
	// 2 bits per word, 4 pairs per byte, and a mask is hard coded.
	// off is the index of addr's word within the arena. The bitmap is
	// indexed backward from mheap_.bitmap, so word off lives in the byte
	// at mheap_.bitmap - off/4 - 1, at position off&3 within that byte.
	off := (addr - mheap_.arena_start) / sys.PtrSize
	return heapBits{(*uint8)(unsafe.Pointer(mheap_.bitmap - off/4 - 1)), uint32(off & 3)}
}

// heapBitsForSpan returns the heapBits for the span base address base.
// It throws if base lies outside [mheap_.arena_start, mheap_.arena_used).
func heapBitsForSpan(base uintptr) (hbits heapBits) {
	if inArena := base >= mheap_.arena_start && base < mheap_.arena_used; !inArena {
		throw("heapBitsForSpan: base out of range")
	}
	hbits = heapBitsForAddr(base)
	return hbits
}

// heapBitsForObject returns the base address for the heap object
// containing the address p, the heapBits for base,
// the object's span, and the index of the object in s.
// If p does not point into a heap object,
// return base == 0
// otherwise return the base of the object.
//
// For gccgo, the forStack parameter is true if the value came from the stack.
// The stack is collected conservatively and may contain invalid pointers.
//
// refBase and refOff optionally give the base address of the object
// in which the pointer p was found and the byte offset at which it
// was found. These are used for error reporting.
//
// When p falls in a span that is not in use (or outside the span's
// [base, limit) range), the pointer did not come from the stack, and
// debug.invalidptr is non-zero, the function prints a diagnostic and throws.
func heapBitsForObject(p, refBase, refOff uintptr, forStack bool) (base uintptr, hbits heapBits, s *mspan, objIndex uintptr) {
	arenaStart := mheap_.arena_start
	if p < arenaStart || p >= mheap_.arena_used {
		// Not a heap address at all.
		return
	}
	off := p - arenaStart
	idx := off >> _PageShift
	// p points into the heap, but possibly to the middle of an object.
	// Consult the span table to find the block beginning.
	s = mheap_.spans[idx]
	if s == nil || p < s.base() || p >= s.limit || s.state != mSpanInUse {
		if s == nil || s.state == _MSpanStack || forStack {
			// If s is nil, the virtual address has never been part of the heap.
			// This pointer may be to some mmap'd region, so we allow it.
			// Pointers into stacks are also ok, the runtime manages these explicitly.
			return
		}

		// The following ensures that we are rigorous about what data
		// structures hold valid pointers.
		if debug.invalidptr != 0 {
			// Typically this indicates an incorrect use
			// of unsafe or cgo to store a bad pointer in
			// the Go heap. It may also indicate a runtime
			// bug.
			//
			// TODO(austin): We could be more aggressive
			// and detect pointers to unallocated objects
			// in allocated spans.
			printlock()
			print("runtime: pointer ", hex(p))
			if s.state != mSpanInUse {
				print(" to unallocated span")
			} else {
				print(" to unused region of span")
			}
			print(" idx=", hex(idx), " span.base()=", hex(s.base()), " span.limit=", hex(s.limit), " span.state=", s.state, "\n")
			if refBase != 0 {
				print("runtime: found in object at *(", hex(refBase), "+", hex(refOff), ")\n")
				gcDumpObject("object", refBase, refOff)
			}
			throw("found bad pointer in Go heap (incorrect use of unsafe or cgo?)")
		}
		return
	}

	if forStack {
		// A span can be entered in mheap_.spans, and be set
		// to mSpanInUse, before it is fully initialized.
		// All we need in practice is allocBits and gcmarkBits,
		// so make sure they are set.
		if s.allocBits == nil || s.gcmarkBits == nil {
			return
		}
	}

	// If this span holds object of a power of 2 size, just mask off the bits to
	// the interior of the object. Otherwise use the size to get the base.
	if s.baseMask != 0 {
		// optimize for power of 2 sized objects.
		base = s.base()
		base = base + (p-base)&uintptr(s.baseMask)
		objIndex = (base - s.base()) >> s.divShift
		// base = p & s.baseMask is faster for small spans,
		// but doesn't work for large spans.
		// Overall, it's faster to use the more general computation above.
	} else {
		base = s.base()
		// If p lies within the first element, base and objIndex (0)
		// are already correct.
		if p-base >= s.elemsize {
			// n := (p - base) / s.elemsize, using division by multiplication
			objIndex = uintptr(p-base) >> s.divShift * uintptr(s.divMul) >> s.divShift2
			base += objIndex * s.elemsize
		}
	}
	// Now that we know the actual base, compute heapBits to return to caller.
	hbits = heapBitsForAddr(base)
	return
}

// prefetch passes the address of the bitmap byte that h refers to
// (h.bitp) to prefetchnta.
func (h heapBits) prefetch() {
	prefetchnta(uintptr(unsafe.Pointer((h.bitp))))
}

// next returns the heapBits describing the next pointer-sized word in memory.
// That is, if h describes address p, h.next() describes p+ptrSize.
// Note that next does not modify h. The caller must record the result.
//
// nosplit because it is used during write barriers and must not be preempted.
//go:nosplit
func (h heapBits) next() heapBits {
	if h.shift >= 3*heapBitsShift {
		// h is the last entry in its byte; the following word is
		// described by the first entry of the preceding bitmap byte.
		return heapBits{subtract1(h.bitp), 0}
	}
	return heapBits{h.bitp, h.shift + heapBitsShift}
}

// forward returns the heapBits describing n pointer-sized words ahead of h in memory.
// That is, if h describes address p, h.forward(n) describes p+n*ptrSize.
// h.forward(1) is equivalent to h.next(), just slower.
// Note that forward does not modify h. The caller must record the result.
func (h heapBits) forward(n uintptr) heapBits {
	// Count entries from the start of h's byte, then split the count into
	// whole bitmap bytes to step back over and a position within the byte.
	entries := n + uintptr(h.shift)/heapBitsShift
	bytesBack, slot := entries/4, entries%4
	return heapBits{subtractb(h.bitp, bytesBack), uint32(slot) * heapBitsShift}
}

// bits returns the heap bits for the current word.
// The caller can test morePointers and isPointer by &-ing with bitScan and bitPointer.
// The result includes in its higher bits the bits for subsequent words
// described by the same bitmap byte.
func (h heapBits) bits() uint32 {
	// The (shift & 31) eliminates a test and conditional branch
	// from the generated code.
	return uint32(*h.bitp) >> (h.shift & 31)
}

// morePointers reports whether the high (scan) bit is set in the entry
// for the word h describes, meaning the object is still being described
// and this word or a later one may hold a pointer.
// A false result is the dead encoding: this word and all remaining
// words in the object are scalars.
// h must not describe the second word of the object, because there the
// high bit holds the checkmark bit instead.
func (h heapBits) morePointers() bool {
	return h.bits()&bitScan != 0
}

// isPointer reports whether the heap bits describe a pointer word,
// that is, whether the low (pointer) bit of h's entry is set.
//
// nosplit because it is used during write barriers and must not be preempted.
//go:nosplit
func (h heapBits) isPointer() bool {
	return h.bits()&bitPointer != 0
}

// hasPointers reports whether the given object has any pointers.
// It must be told how large the object at h is for efficiency.
// h must describe the initial word of the object.
func (h heapBits) hasPointers(size uintptr) bool {
	// 1-word objects are always pointers, so the bitmap is consulted
	// only for larger objects, where the scan bit of the first word
	// gives the answer.
	oneWord := size == sys.PtrSize
	return oneWord || (*h.bitp>>h.shift)&bitScan != 0
}

// isCheckmarked reports whether the heap bits have the checkmarked bit set.
// It must be told how large the object at h is, because the encoding of the
// checkmark bit varies by size.
// h must describe the initial word of the object.
func (h heapBits) isCheckmarked(size uintptr) bool {
	entry := *h.bitp
	if size != sys.PtrSize {
		// All multiword objects are 2-word aligned,
		// so we know that the initial word's 2-bit pair
		// and the second word's 2-bit pair are in the
		// same heap bitmap byte, *h.bitp.
		// The checkmark is the high bit of the second word's pair.
		return (entry>>(heapBitsShift+h.shift))&bitScan != 0
	}
	// One-word object: the pointer bit doubles as the checkmark bit.
	return (entry>>h.shift)&bitPointer != 0
}

// setCheckmarked sets the checkmarked bit, using an atomic OR on the
// bitmap byte.
// It must be told how large the object at h is, because the encoding of the
// checkmark bit varies by size.
// h must describe the initial word of the object.
func (h heapBits) setCheckmarked(size uintptr) {
	if size == sys.PtrSize {
		// One-word object: the pointer bit doubles as the checkmark bit.
		atomic.Or8(h.bitp, bitPointer<<h.shift)
		return
	}
	// Otherwise the checkmark is the high bit of the second word's entry.
	atomic.Or8(h.bitp, bitScan<<(heapBitsShift+h.shift))
}

// bulkBarrierPreWrite executes writebarrierptr_prewrite1
// for every pointer slot in the memory range [src, src+size),
// using pointer/scalar information from [dst, dst+size).
// This executes the write barriers necessary before a memmove.
// src, dst, and size must be pointer-aligned.
// The range [dst, dst+size) must lie within a single object.
//
// As a special case, src == 0 indicates that this is being used for a
// memclr. bulkBarrierPreWrite will pass 0 for the src of each write
// barrier.
//
// Callers should call bulkBarrierPreWrite immediately before
// calling memmove(dst, src, size). This function is marked nosplit
// to avoid being preempted; the GC must not stop the goroutine
// between the memmove and the execution of the barriers.
// The caller is also responsible for cgo pointer checks if this
// may be writing Go pointers into non-Go memory.
//
// The pointer bitmap is not maintained for allocations containing
// no pointers at all; any caller of bulkBarrierPreWrite must first
// make sure the underlying allocation contains pointers, usually
// by checking typ.kind&kindNoPointers.
//
// The function throws if dst, src, or size is not pointer-aligned, and
// does nothing when the write barrier is not needed.
//
//go:nosplit
func bulkBarrierPreWrite(dst, src, size uintptr) {
	if (dst|src|size)&(sys.PtrSize-1) != 0 {
		throw("bulkBarrierPreWrite: unaligned arguments")
	}
	if !writeBarrier.needed {
		return
	}
	if !inheap(dst) {
		// If dst is a global, use the data or BSS bitmaps to
		// execute write barriers.
		// Walk the registered root lists looking for the root
		// variable whose extent contains dst.
		roots := gcRoots
		for roots != nil {
			for i := 0; i < roots.count; i++ {
				pr := roots.roots[i]
				addr := uintptr(pr.decl)
				if addr <= dst && dst < addr+pr.size {
					// Only the first pr.ptrdata bytes are covered
					// by the bitmap; past that no barrier is issued.
					if dst < addr+pr.ptrdata {
						bulkBarrierBitmap(dst, src, size, dst-addr, pr.gcdata)
					}
					return
				}
			}
			roots = roots.next
		}
		// dst is neither in the heap nor in a known root.
		return
	}

	// dst is in the heap: walk the heap bitmap one word at a time.
	h := heapBitsForAddr(dst)
	if src == 0 {
		for i := uintptr(0); i < size; i += sys.PtrSize {
			if h.isPointer() {
				dstx := (*uintptr)(unsafe.Pointer(dst + i))
				writebarrierptr_prewrite1(dstx, 0)
			}
			h = h.next()
		}
	} else {
		for i := uintptr(0); i < size; i += sys.PtrSize {
			if h.isPointer() {
				dstx := (*uintptr)(unsafe.Pointer(dst + i))
				srcx := (*uintptr)(unsafe.Pointer(src + i))
				writebarrierptr_prewrite1(dstx, *srcx)
			}
			h = h.next()
		}
	}
}

// bulkBarrierBitmap executes write barriers for copying from [src,
// src+size) to [dst, dst+size) using a 1-bit pointer bitmap. dst is
// assumed to start maskOffset bytes into the data covered by the
// bitmap in bits (which may not be a multiple of 8).
// If src is 0, each write barrier is passed 0 as the new value.
//
// This is used by bulkBarrierPreWrite for writes to data and BSS.
//
//go:nosplit
func bulkBarrierBitmap(dst, src, size, maskOffset uintptr, bits *uint8) {
	// Locate the bitmap byte and bit for the first word of dst.
	word := maskOffset / sys.PtrSize
	bits = addb(bits, word/8)
	mask := uint8(1) << (word % 8)

	for i := uintptr(0); i < size; i += sys.PtrSize {
		if mask == 0 {
			// mask was shifted off the top of the byte:
			// move on to the next bitmap byte.
			bits = addb(bits, 1)
			if *bits == 0 {
				// Skip 8 words.
				// (The loop increment supplies the eighth; mask stays 0
				// so the next iteration advances to the following byte.)
				i += 7 * sys.PtrSize
				continue
			}
			mask = 1
		}
		if *bits&mask != 0 {
			dstx := (*uintptr)(unsafe.Pointer(dst + i))
			if src == 0 {
				writebarrierptr_prewrite1(dstx, 0)
			} else {
				srcx := (*uintptr)(unsafe.Pointer(src + i))
				writebarrierptr_prewrite1(dstx, *srcx)
			}
		}
		mask <<= 1
	}
}

// typeBitsBulkBarrier executes writebarrierptr_prewrite for every
// pointer that would be copied from [src, src+size) to [dst,
// dst+size) by a memmove using the type bitmap to locate those
// pointer slots.
//
// The type typ must correspond exactly to [src, src+size) and [dst, dst+size).
// dst, src, and size must be pointer-aligned.
// The type typ must have a plain bitmap, not a GC program.
// The only use of this function is in channel sends, and the
// 64 kB channel element limit takes care of this for us.
//
// Must not be preempted because it typically runs right before memmove,
// and the GC must observe them as an atomic action.
//
// It throws if typ is nil, if typ.size differs from size, or if typ
// uses a GC program. It does nothing when the write barrier is not needed.
//
//go:nosplit
func typeBitsBulkBarrier(typ *_type, dst, src, size uintptr) {
	if typ == nil {
		throw("runtime: typeBitsBulkBarrier without type")
	}
	if typ.size != size {
		println("runtime: typeBitsBulkBarrier with type ", *typ.string, " of size ", typ.size, " but memory size", size)
		throw("runtime: invalid typeBitsBulkBarrier")
	}
	if typ.kind&kindGCProg != 0 {
		println("runtime: typeBitsBulkBarrier with type ", *typ.string, " with GC prog")
		throw("runtime: invalid typeBitsBulkBarrier")
	}
	if !writeBarrier.needed {
		return
	}
	ptrmask := typ.gcdata
	var bits uint32
	// Only the first typ.ptrdata bytes are examined.
	for i := uintptr(0); i < typ.ptrdata; i += sys.PtrSize {
		if i&(sys.PtrSize*8-1) == 0 {
			// Every 8 words, load the next byte of the 1-bit mask.
			bits = uint32(*ptrmask)
			ptrmask = addb(ptrmask, 1)
		} else {
			bits = bits >> 1
		}
		if bits&1 != 0 {
			dstx := (*uintptr)(unsafe.Pointer(dst + i))
			srcx := (*uintptr)(unsafe.Pointer(src + i))
			writebarrierptr_prewrite(dstx, *srcx)
		}
	}
}

// The methods operating on spans all require that h has been returned
// by heapBitsForSpan and that size, n, total are the span layout description
// returned by the mspan's layout method.
// If total > size*n, it means that there is extra leftover memory in the span,
// usually due to rounding.
//
// TODO(rsc): Perhaps introduce a different heapBitsSpan type.

// initSpan initializes the heap bitmap for a span.
// It clears all checkmark bits.
// If this is a span of pointer-sized objects, it initializes all
// words to pointer/scan.
// Otherwise, it initializes all words to scalar/dead.
//
// It also resets the span's allocation state: freeindex, allocCache,
// nelems, and freshly obtained allocBits and gcmarkBits.
// It throws if the span's total size is not a multiple of heapBitmapScale.
func (h heapBits) initSpan(s *mspan) {
	size, n, total := s.layout()

	// Init the markbit structures
	s.freeindex = 0
	s.allocCache = ^uint64(0) // all 1s indicating all free.
	s.nelems = n
	s.allocBits = nil
	s.gcmarkBits = nil
	s.gcmarkBits = newMarkBits(s.nelems)
	s.allocBits = newAllocBits(s.nelems)

	// Clear bits corresponding to objects.
	if total%heapBitmapScale != 0 {
		throw("initSpan: unaligned length")
	}
	// nbyte is the number of bitmap bytes describing the span. Because
	// the bitmap is indexed backward, they occupy the range
	// [h.bitp-(nbyte-1), h.bitp].
	nbyte := total / heapBitmapScale
	if sys.PtrSize == 8 && size == sys.PtrSize {
		// Span of one-word objects on a 64-bit system: mark every
		// word as pointer/scan.
		end := h.bitp
		bitp := subtractb(end, nbyte-1)
		for {
			*bitp = bitPointerAll | bitScanAll
			if bitp == end {
				break
			}
			bitp = add1(bitp)
		}
		return
	}
	memclrNoHeapPointers(unsafe.Pointer(subtractb(h.bitp, nbyte-1)), nbyte)
}

// initCheckmarkSpan initializes a span for being checkmarked.
// It clears the checkmark bits, which are set to 1 in normal operation.
// size is the object size and n the number of objects in the span;
// total is not used by this function.
func (h heapBits) initCheckmarkSpan(size, n, total uintptr) {
	// The ptrSize == 8 is a compile-time constant false on 32-bit and eliminates this code entirely.
	if sys.PtrSize == 8 && size == sys.PtrSize {
		// Checkmark bit is type bit, bottom bit of every 2-bit entry.
		// Only possible on 64-bit system, since minimum size is 8.
		// Must clear type bit (checkmark bit) of every word.
		// The type bit is the lower of every two-bit pair.
		bitp := h.bitp
		// Each bitmap byte covers four one-word objects.
		for i := uintptr(0); i < n; i += 4 {
			*bitp &^= bitPointerAll
			bitp = subtract1(bitp)
		}
		return
	}
	// Larger objects: clear the high bit of each object's second word,
	// stepping h from one object to the next.
	for i := uintptr(0); i < n; i++ {
		*h.bitp &^= bitScan << (heapBitsShift + h.shift)
		h = h.forward(size / sys.PtrSize)
	}
}

// clearCheckmarkSpan undoes all the checkmarking in a span.
// The actual checkmark bits are ignored, so the only work to do
// is to fix the pointer bits. (Pointer bits are ignored by scanobject
// but consulted by typedmemmove.)
// size is the object size and n the number of objects in the span;
// total is not used by this function.
func (h heapBits) clearCheckmarkSpan(size, n, total uintptr) {
	// The ptrSize == 8 is a compile-time constant false on 32-bit and eliminates this code entirely.
	if sys.PtrSize == 8 && size == sys.PtrSize {
		// Checkmark bit is type bit, bottom bit of every 2-bit entry.
		// Only possible on 64-bit system, since minimum size is 8.
		// Must set the type bit (checkmark bit) of every word back to 1,
		// restoring its meaning as the pointer bit.
		// The type bit is the lower of every two-bit pair.
		bitp := h.bitp
		// Each bitmap byte covers four one-word objects.
		for i := uintptr(0); i < n; i += 4 {
			*bitp |= bitPointerAll
			bitp = subtract1(bitp)
		}
	}
}

// oneBitCount is indexed by byte and produces the
// number of 1 bits in that byte. For example 128 has 1 bit set
// and oneBitCount[128] holds 1.
var oneBitCount = [256]uint8{
	0, 1, 1, 2, 1, 2, 2, 3,
	1, 2, 2, 3, 2, 3, 3, 4,
	1, 2, 2, 3, 2, 3, 3, 4,
	2, 3, 3, 4, 3, 4, 4, 5,
	1, 2, 2, 3, 2, 3, 3, 4,
	2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5,
	3, 4, 4, 5, 4, 5, 5, 6,
	1, 2, 2, 3, 2, 3, 3, 4,
	2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5,
	3, 4, 4, 5, 4, 5, 5, 6,
	2, 3, 3, 4, 3, 4, 4, 5,
	3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6,
	4, 5, 5, 6, 5, 6, 6, 7,
	1, 2, 2, 3, 2, 3, 3, 4,
	2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5,
	3, 4, 4, 5, 4, 5, 5, 6,
	2, 3, 3, 4, 3, 4, 4, 5,
	3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6,
	4, 5, 5, 6, 5, 6, 6, 7,
	2, 3, 3, 4, 3, 4, 4, 5,
	3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6,
	4, 5, 5, 6, 5, 6, 6, 7,
	3, 4, 4, 5, 4, 5, 5, 6,
	4, 5, 5, 6, 5, 6, 6, 7,
	4, 5, 5, 6, 5, 6, 6, 7,
	5, 6, 6, 7, 6, 7, 7, 8}

// countFree returns the number of free objects in the span: s.nelems
// minus the number of bits set in s.gcmarkBits among the first
// s.nelems bits.
// TODO:(rlh) Use popcount intrinsic.
func (s *mspan) countFree() int {
	marked := 0
	fullBytes := s.nelems / 8
	for i := uintptr(0); i < fullBytes; i++ {
		marked += int(oneBitCount[*addb(s.gcmarkBits, i)])
	}
	// Only the low bits of a trailing partial byte belong to objects
	// in the span; mask off the rest before counting.
	if tail := s.nelems % 8; tail != 0 {
		partial := *addb(s.gcmarkBits, fullBytes) & uint8((1<<tail)-1)
		marked += int(oneBitCount[partial])
	}
	return int(s.nelems) - marked
}

// heapBitsSetType records that the new allocation [x, x+size)
// holds in [x, x+dataSize) one or more values of type typ.
// (The number of values is given by dataSize / typ.size.)
// If dataSize < size, the fragment [x+dataSize, x+size) is
// recorded as non-pointer data.
// It is known that the type has pointers somewhere;
// malloc does not call heapBitsSetType when there are no pointers,
// because all free objects are marked as noscan during
// heapBitsSweepSpan.
//
// There can only be one allocation from a given span active at a time,
// and the bitmap for a span always falls on byte boundaries,
// so there are no write-write races for access to the heap bitmap.
// Hence, heapBitsSetType can access the bitmap without atomics.
//
// There can be read-write races between heapBitsSetType and things
// that read the heap bitmap like scanobject. However, since
// heapBitsSetType is only used for objects that have not yet been
// made reachable, readers will ignore bits being modified by this
// function. This does mean this function cannot transiently modify
// bits that belong to neighboring objects. Also, on weakly-ordered
// machines, callers must execute a store/store (publication) barrier
// between calling this function and making the object reachable.
//
// The work is organized in phases: small (1- and 2-word) objects and
// GC-program types are handled up front; otherwise Phase 1 writes the
// leading byte or half-byte, Phase 2 writes full bitmap bytes, Phase 3
// writes the last (possibly partial) byte and zeroes the remainder,
// and Phase 4 optionally double-checks the result.
func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
	const doubleCheck = false // slow but helpful; enable to test modifications to this code

	// size is always dataSize rounded up to the next malloc size class,
	// except in the case of allocating a defer block, in which case
	// size is sizeof(_defer{}) (at least 6 words) and dataSize may be
	// arbitrarily larger.
	//
	// The checks for size == sys.PtrSize and size == 2*sys.PtrSize can therefore
	// assume that dataSize == size without checking it explicitly.

	if sys.PtrSize == 8 && size == sys.PtrSize {
		// It's one word and it has pointers, it must be a pointer.
		// Since all allocated one-word objects are pointers
		// (non-pointers are aggregated into tinySize allocations),
		// initSpan sets the pointer bits for us. Nothing to do here.
		if doubleCheck {
			h := heapBitsForAddr(x)
			if !h.isPointer() {
				throw("heapBitsSetType: pointer bit missing")
			}
			if !h.morePointers() {
				throw("heapBitsSetType: scan bit missing")
			}
		}
		return
	}

	h := heapBitsForAddr(x)
	ptrmask := typ.gcdata // start of 1-bit pointer mask (or GC program, handled below)

	// Heap bitmap bits for 2-word object are only 4 bits,
	// so also shared with objects next to it.
	// This is called out as a special case primarily for 32-bit systems,
	// so that on 32-bit systems the code below can assume all objects
	// are 4-word aligned (because they're all 16-byte aligned).
	if size == 2*sys.PtrSize {
		if typ.size == sys.PtrSize {
			// We're allocating a block big enough to hold two pointers.
			// On 64-bit, that means the actual object must be two pointers,
			// or else we'd have used the one-pointer-sized block.
			// On 32-bit, however, this is the 8-byte block, the smallest one.
			// So it could be that we're allocating one pointer and this was
			// just the smallest block available. Distinguish by checking dataSize.
			// (In general the number of instances of typ being allocated is
			// dataSize/typ.size.)
			if sys.PtrSize == 4 && dataSize == sys.PtrSize {
				// 1 pointer object. On 32-bit machines clear the bit for the
				// unused second word.
				*h.bitp &^= (bitPointer | bitScan | ((bitPointer | bitScan) << heapBitsShift)) << h.shift
				*h.bitp |= (bitPointer | bitScan) << h.shift
			} else {
				// 2-element slice of pointer.
				*h.bitp |= (bitPointer | bitScan | bitPointer<<heapBitsShift) << h.shift
			}
			return
		}
		// Otherwise typ.size must be 2*sys.PtrSize,
		// and typ.kind&kindGCProg == 0.
		if doubleCheck {
			if typ.size != 2*sys.PtrSize || typ.kind&kindGCProg != 0 {
				print("runtime: heapBitsSetType size=", size, " but typ.size=", typ.size, " gcprog=", typ.kind&kindGCProg != 0, "\n")
				throw("heapBitsSetType")
			}
		}
		b := uint32(*ptrmask)
		hb := (b & 3) | bitScan
		// bitPointer == 1, bitScan is 1 << 4, heapBitsShift is 1.
		// 110011 is shifted h.shift and complemented.
		// This clears out the bits that are about to be
		// ored into *h.bitp in the next instructions.
		*h.bitp &^= (bitPointer | bitScan | ((bitPointer | bitScan) << heapBitsShift)) << h.shift
		*h.bitp |= uint8(hb << h.shift)
		return
	}

	// Copy from 1-bit ptrmask into 2-bit bitmap.
	// The basic approach is to use a single uintptr as a bit buffer,
	// alternating between reloading the buffer and writing bitmap bytes.
	// In general, one load can supply two bitmap byte writes.
	// This is a lot of lines of code, but it compiles into relatively few
	// machine instructions.

	var (
		// Ptrmask input.
		p     *byte   // last ptrmask byte read
		b     uintptr // ptrmask bits already loaded
		nb    uintptr // number of bits in b at next read
		endp  *byte   // final ptrmask byte to read (then repeat)
		endnb uintptr // number of valid bits in *endp
		pbits uintptr // alternate source of bits

		// Heap bitmap output.
		w     uintptr // words processed
		nw    uintptr // number of words to process
		hbitp *byte   // next heap bitmap byte to write
		hb    uintptr // bits being prepared for *hbitp
	)

	hbitp = h.bitp

	// Handle GC program. Delayed until this part of the code
	// so that we can use the same double-checking mechanism
	// as the 1-bit case. Nothing above could have encountered
	// GC programs: the cases were all too small.
	if typ.kind&kindGCProg != 0 {
		heapBitsSetTypeGCProg(h, typ.ptrdata, typ.size, dataSize, size, addb(typ.gcdata, 4))
		if doubleCheck {
			// Double-check the heap bits written by GC program
			// by running the GC program to create a 1-bit pointer mask
			// and then jumping to the double-check code below.
			// This doesn't catch bugs shared between the 1-bit and 4-bit
			// GC program execution, but it does catch mistakes specific
			// to just one of those and bugs in heapBitsSetTypeGCProg's
			// implementation of arrays.
			//
			// The lock taken here is released at the end of Phase 4.
			lock(&debugPtrmask.lock)
			if debugPtrmask.data == nil {
				debugPtrmask.data = (*byte)(persistentalloc(1<<20, 1, &memstats.other_sys))
			}
			ptrmask = debugPtrmask.data
			runGCProg(addb(typ.gcdata, 4), nil, ptrmask, 1)
			goto Phase4
		}
		return
	}

	// Note about sizes:
	//
	// typ.size is the size of the object in bytes,
	// and typ.ptrdata is the size in bytes of the prefix
	// of the object that contains pointers. That is, the final
	// typ.size - typ.ptrdata bytes contain no pointers.
	// This allows optimization of a common pattern where
	// an object has a small header followed by a large scalar
	// buffer. If we know the pointers are over, we don't have
	// to scan the buffer's heap bitmap at all.
	// The 1-bit ptrmasks are sized to contain only bits for
	// the typ.ptrdata prefix, zero padded out to a full byte
	// of bitmap. This code sets nw (below) so that heap bitmap
	// bits are only written for the typ.ptrdata prefix; if there is
	// more room in the allocated object, the next heap bitmap
	// entry is a 00, indicating that there are no more pointers
	// to scan. So only the ptrmask for the ptrdata bytes is needed.
	//
	// Replicated copies are not as nice: if there is an array of
	// objects with scalar tails, all but the last tail does have to
	// be initialized, because there is no way to say "skip forward".
	// However, because of the possibility of a repeated type with
	// size not a multiple of 4 pointers (one heap bitmap byte),
	// the code already must handle the last ptrmask byte specially
	// by treating it as containing only the bits for endnb pointers,
	// where endnb <= 4. We represent large scalar tails that must
	// be expanded in the replication by setting endnb larger than 4.
	// This will have the effect of reading many bits out of b,
	// but once the real bits are shifted out, b will supply as many
	// zero bits as we try to read, which is exactly what we need.

	p = ptrmask
	if typ.size < dataSize {
		// Filling in bits for an array of typ.
		// Set up for repetition of ptrmask during main loop.
		// Note that ptrmask describes only a prefix of typ
		// (the first typ.ptrdata bytes), not the whole element.
		const maxBits = sys.PtrSize*8 - 7
		if typ.ptrdata/sys.PtrSize <= maxBits {
			// Entire ptrmask fits in uintptr with room for a byte fragment.
			// Load into pbits and never read from ptrmask again.
			// This is especially important when the ptrmask has
			// fewer than 8 bits in it; otherwise the reload in the middle
			// of the Phase 2 loop would itself need to loop to gather
			// at least 8 bits.

			// Accumulate ptrmask into b.
			// ptrmask is sized to describe only typ.ptrdata, but we record
			// it as describing typ.size bytes, since all the high bits are zero.
			nb = typ.ptrdata / sys.PtrSize
			for i := uintptr(0); i < nb; i += 8 {
				b |= uintptr(*p) << i
				p = add1(p)
			}
			nb = typ.size / sys.PtrSize

			// Replicate ptrmask to fill entire pbits uintptr.
			// Doubling and truncating is fewer steps than
			// iterating by nb each time. (nb could be 1.)
			// Since we loaded typ.ptrdata/sys.PtrSize bits
			// but are pretending to have typ.size/sys.PtrSize,
			// there might be no replication necessary/possible.
			pbits = b
			endnb = nb
			if nb+nb <= maxBits {
				for endnb <= sys.PtrSize*8 {
					pbits |= pbits << endnb
					endnb += endnb
				}
				// Truncate to a multiple of original ptrmask.
				endnb = maxBits / nb * nb
				pbits &= 1<<endnb - 1
				b = pbits
				nb = endnb
			}

			// Clear p and endp as sentinel for using pbits.
			// Checked during Phase 2 loop.
			p = nil
			endp = nil
		} else {
			// Ptrmask is larger. Read it multiple times.
			n := (typ.ptrdata/sys.PtrSize+7)/8 - 1
			endp = addb(ptrmask, n)
			endnb = typ.size/sys.PtrSize - n*8
		}
	}
	if p != nil {
		// Reading directly from ptrmask: prime b with its first byte.
		b = uintptr(*p)
		p = add1(p)
		nb = 8
	}

	if typ.size == dataSize {
		// Single entry: can stop once we reach the non-pointer data.
		nw = typ.ptrdata / sys.PtrSize
	} else {
		// Repeated instances of typ in an array.
		// Have to process first N-1 entries in full, but can stop
		// once we reach the non-pointer data in the final entry.
		nw = ((dataSize/typ.size-1)*typ.size + typ.ptrdata) / sys.PtrSize
	}
	if nw == 0 {
		// No pointers! Caller was supposed to check.
		println("runtime: invalid type ", *typ.string)
		throw("heapBitsSetType: called with non-pointer type")
		return
	}
	if nw < 2 {
		// Must write at least 2 words, because the "no scan"
		// encoding doesn't take effect until the third word.
		nw = 2
	}

	// Phase 1: Special case for leading byte (shift==0) or half-byte (shift==4).
	// The leading byte is special because it contains the bits for word 1,
	// which does not have the scan bit set.
	// The leading half-byte is special because it's a half a byte,
	// so we have to be careful with the bits already there.
	switch {
	default:
		throw("heapBitsSetType: unexpected shift")

	case h.shift == 0:
		// Ptrmask and heap bitmap are aligned.
		// Handle first byte of bitmap specially.
		//
		// The first byte we write out covers the first four
		// words of the object. The scan/dead bit on the first
		// word must be set to scan since there are pointers
		// somewhere in the object. The scan/dead bit on the
		// second word is the checkmark, so we don't set it.
		// In all following words, we set the scan/dead
		// appropriately to indicate that the object continues
		// to the next 2-bit entry in the bitmap.
		//
		// TODO: It doesn't matter if we set the checkmark, so
		// maybe this case isn't needed any more.
		hb = b & bitPointerAll
		hb |= bitScan | bitScan<<(2*heapBitsShift) | bitScan<<(3*heapBitsShift)
		if w += 4; w >= nw {
			goto Phase3
		}
		*hbitp = uint8(hb)
		hbitp = subtract1(hbitp)
		b >>= 4
		nb -= 4

	case sys.PtrSize == 8 && h.shift == 2:
		// Ptrmask and heap bitmap are misaligned.
		// The bits for the first two words are in a byte shared
		// with another object, so we must be careful with the bits
		// already there.
		// We took care of 1-word and 2-word objects above,
		// so this is at least a 6-word object.
		hb = (b & (bitPointer | bitPointer<<heapBitsShift)) << (2 * heapBitsShift)
		// This is not noscan, so set the scan bit in the
		// first word.
		hb |= bitScan << (2 * heapBitsShift)
		b >>= 2
		nb -= 2
		// Note: no bitScan for second word because that's
		// the checkmark.
		*hbitp &^= uint8((bitPointer | bitScan | (bitPointer << heapBitsShift)) << (2 * heapBitsShift))
		*hbitp |= uint8(hb)
		hbitp = subtract1(hbitp)
		if w += 2; w >= nw {
			// We know that there is more data, because we handled 2-word objects above.
			// This must be at least a 6-word object. If we're out of pointer words,
			// mark no scan in next bitmap byte and finish.
			hb = 0
			w += 4
			goto Phase3
		}
	}

	// Phase 2: Full bytes in bitmap, up to but not including write to last byte (full or partial) in bitmap.
	// The loop computes the bits for that last write but does not execute the write;
	// it leaves the bits in hb for processing by phase 3.
	// To avoid repeated adjustment of nb, we subtract out the 4 bits we're going to
	// use in the first half of the loop right now, and then we only adjust nb explicitly
	// if the 8 bits used by each iteration isn't balanced by 8 bits loaded mid-loop.
	nb -= 4
	for {
		// Emit bitmap byte.
		// b has at least nb+4 bits, with one exception:
		// if w+4 >= nw, then b has only nw-w bits,
		// but we'll stop at the break and then truncate
		// appropriately in Phase 3.
		hb = b & bitPointerAll
		hb |= bitScanAll
		if w += 4; w >= nw {
			break
		}
		*hbitp = uint8(hb)
		hbitp = subtract1(hbitp)
		b >>= 4

		// Load more bits. b has nb right now.
		if p != endp {
			// Fast path: keep reading from ptrmask.
			// nb unmodified: we just loaded 8 bits,
			// and the next iteration will consume 8 bits,
			// leaving us with the same nb the next time we're here.
			if nb < 8 {
				b |= uintptr(*p) << nb
				p = add1(p)
			} else {
				// Reduce the number of bits in b.
				// This is important if we skipped
				// over a scalar tail, since nb could
				// be larger than the bit width of b.
				nb -= 8
			}
		} else if p == nil {
			// Almost as fast path: track bit count and refill from pbits.
			// For short repetitions.
			if nb < 8 {
				b |= pbits << nb
				nb += endnb
			}
			nb -= 8 // for next iteration
		} else {
			// Slow path: reached end of ptrmask.
			// Process final partial byte and rewind to start.
			b |= uintptr(*p) << nb
			nb += endnb
			if nb < 8 {
				b |= uintptr(*ptrmask) << nb
				p = add1(ptrmask)
			} else {
				nb -= 8
				p = ptrmask
			}
		}

		// Emit bitmap byte.
		hb = b & bitPointerAll
		hb |= bitScanAll
		if w += 4; w >= nw {
			break
		}
		*hbitp = uint8(hb)
		hbitp = subtract1(hbitp)
		b >>= 4
	}

Phase3:
	// Phase 3: Write last byte or partial byte and zero the rest of the bitmap entries.
	if w > nw {
		// Counting the 4 entries in hb not yet written to memory,
		// there are more entries than possible pointer slots.
		// Discard the excess entries (can't be more than 3).
		mask := uintptr(1)<<(4-(w-nw)) - 1
		hb &= mask | mask<<4 // apply mask to both pointer bits and scan bits
	}

	// Change nw from counting possibly-pointer words to total words in allocation.
	nw = size / sys.PtrSize

	// Write whole bitmap bytes.
	// The first is hb, the rest are zero.
	if w <= nw {
		*hbitp = uint8(hb)
		hbitp = subtract1(hbitp)
		hb = 0 // for possible final half-byte below
		for w += 4; w <= nw; w += 4 {
			*hbitp = 0
			hbitp = subtract1(hbitp)
		}
	}

	// Write final partial bitmap byte if any.
	// We know w > nw, or else we'd still be in the loop above.
	// It can be bigger only due to the 4 entries in hb that it counts.
	// If w == nw+4 then there's nothing left to do: we wrote all nw entries
	// and can discard the 4 sitting in hb.
	// But if w == nw+2, we need to write first two in hb.
	// The byte is shared with the next object, so be careful with
	// existing bits.
	if w == nw+2 {
		*hbitp = *hbitp&^(bitPointer|bitScan|(bitPointer|bitScan)<<heapBitsShift) | uint8(hb)
	}

Phase4:
	// Phase 4: all done, but perhaps double check.
	if doubleCheck {
		end := heapBitsForAddr(x + size)
		if typ.kind&kindGCProg == 0 && (hbitp != end.bitp || (w == nw+2) != (end.shift == 2)) {
			println("ended at wrong bitmap byte for", *typ.string, "x", dataSize/typ.size)
			print("typ.size=", typ.size, " typ.ptrdata=", typ.ptrdata, " dataSize=", dataSize, " size=", size, "\n")
			print("w=", w, " nw=", nw, " b=", hex(b), " nb=", nb, " hb=", hex(hb), "\n")
			h0 := heapBitsForAddr(x)
			print("initial bits h0.bitp=", h0.bitp, " h0.shift=", h0.shift, "\n")
			print("ended at hbitp=", hbitp, " but next starts at bitp=", end.bitp, " shift=", end.shift, "\n")
			throw("bad heapBitsSetType")
		}

		// Double-check that bits to be written were written correctly.
		// Does not check that other bits were not written, unfortunately.
		h := heapBitsForAddr(x)
		nptr := typ.ptrdata / sys.PtrSize
		ndata := typ.size / sys.PtrSize
		count := dataSize / typ.size
		totalptr := ((count-1)*typ.size + typ.ptrdata) / sys.PtrSize
		for i := uintptr(0); i < size/sys.PtrSize; i++ {
			j := i % ndata // word index within the current element
			var have, want uint8
			have = (*h.bitp >> h.shift) & (bitPointer | bitScan)
			if i >= totalptr {
				want = 0 // deadmarker
				if typ.kind&kindGCProg != 0 && i < (totalptr+3)/4*4 {
					want = bitScan
				}
			} else {
				if j < nptr && (*addb(ptrmask, j/8)>>(j%8))&1 != 0 {
					want |= bitPointer
				}
				if i != 1 {
					want |= bitScan
				} else {
					// Word 1's high bit is the checkmark bit; ignore it.
					have &^= bitScan
				}
			}
			if have != want {
				println("mismatch writing bits for", *typ.string, "x", dataSize/typ.size)
				print("typ.size=", typ.size, " typ.ptrdata=", typ.ptrdata, " dataSize=", dataSize, " size=", size, "\n")
				print("kindGCProg=", typ.kind&kindGCProg != 0, "\n")
				print("w=", w, " nw=", nw, " b=", hex(b), " nb=", nb, " hb=", hex(hb), "\n")
				h0 := heapBitsForAddr(x)
				print("initial bits h0.bitp=", h0.bitp, " h0.shift=", h0.shift, "\n")
				print("current bits h.bitp=", h.bitp, " h.shift=", h.shift, " *h.bitp=", hex(*h.bitp), "\n")
				print("ptrmask=", ptrmask, " p=", p, " endp=", endp, " endnb=", endnb, " pbits=", hex(pbits), " b=", hex(b), " nb=", nb, "\n")
				println("at word", i, "offset", i*sys.PtrSize, "have", have, "want", want)
				if typ.kind&kindGCProg != 0 {
					println("GC program:")
					dumpGCProg(addb(typ.gcdata, 4))
				}
				throw("bad heapBitsSetType")
			}
			h = h.next()
		}
		// Release the lock taken in the GC program double-check path above.
		if ptrmask == debugPtrmask.data {
			unlock(&debugPtrmask.lock)
		}
	}
}

// heapBitsSetTypeNoScan records x as noscan: it clears both the pointer
// bit and the scan bit of x's first word in the heap bitmap, leaving that
// entry in the scalar/dead encoding.
func heapBitsSetTypeNoScan(x uintptr) {
	entry := heapBitsForAddr(x)
	*entry.bitp = *entry.bitp &^ ((bitPointer | bitScan) << entry.shift)
}

// debugPtrmask is scratch space used only by the doubleCheck path of
// heapBitsSetType for types described by a GC program. data is a buffer
// (allocated on first use with persistentalloc) into which the GC program
// is expanded as a 1-bit pointer mask; lock is acquired before data is
// filled and released once the double check has finished reading it.
var debugPtrmask struct {
	lock mutex
	data *byte
}

// heapBitsSetTypeGCProg writes the heap bitmap for an allocation whose
// element type is described by a GC program rather than a pointer mask.
//
//	h         heap bitmap position of the start of the allocation
//	progSize  size of the memory described by the program
//	elemSize  size of the element that the program describes (a prefix of)
//	dataSize  total size of the intended data, a multiple of elemSize
//	allocSize total size of the allocated memory
//	prog      the GC program itself
//
// GC programs are only used for large allocations.
// heapBitsSetTypeGCProg requires that allocSize is a multiple of 4 words
// (checked when sys.PtrSize == 8), so that the relevant bitmap bytes are
// not shared with surrounding objects.
func heapBitsSetTypeGCProg(h heapBits, progSize, elemSize, dataSize, allocSize uintptr, prog *byte) {
	if sys.PtrSize == 8 && allocSize%(4*sys.PtrSize) != 0 {
		// Alignment will be wrong.
		throw("heapBitsSetTypeGCProg: small allocation")
	}

	var totalBits uintptr
	if elemSize != dataSize {
		// Array of elements: run prog followed by a trailer program that does
		//	literal(0)
		//	repeat(1, elemSize-progSize-1) // zeros to fill element size
		//	repeat(elemSize, count-1) // repeat that element for count
		// The first two instructions zero-pad the remainder of the first
		// element; the last one replicates that element across the array.
		nelem := dataSize / elemSize
		elemWords := elemSize / sys.PtrSize

		var trailer [40]byte // 3 varints (max 10 each) + some bytes
		pos := 0

		if pad := elemWords - progSize/sys.PtrSize; pad > 0 {
			// literal(0)
			trailer[pos] = 0x01
			pos++
			trailer[pos] = 0
			pos++
			if pad > 1 {
				// repeat(1, pad-1), with the count encoded as a varint.
				trailer[pos] = 0x81
				pos++
				v := pad - 1
				for v >= 0x80 {
					trailer[pos] = byte(v | 0x80)
					pos++
					v >>= 7
				}
				trailer[pos] = byte(v)
				pos++
			}
		}

		// repeat(elemSize/ptrSize, count-1): opcode, then two varints.
		trailer[pos] = 0x80
		pos++
		v := elemWords
		for v >= 0x80 {
			trailer[pos] = byte(v | 0x80)
			pos++
			v >>= 7
		}
		trailer[pos] = byte(v)
		pos++
		v = nelem - 1
		for v >= 0x80 {
			trailer[pos] = byte(v | 0x80)
			pos++
			v >>= 7
		}
		trailer[pos] = byte(v)
		pos++

		// stop
		trailer[pos] = 0
		pos++

		runGCProg(prog, &trailer[0], h.bitp, 2)

		// Even though we filled in the full array just now,
		// record that we only filled in up to the ptrdata of the
		// last element. This will cause the code below to
		// memclr the dead section of the final array element,
		// so that scanobject can stop early in the final element.
		totalBits = (elemSize*(nelem-1) + progSize) / sys.PtrSize
	} else {
		// Single element: the program alone describes the data.
		totalBits = runGCProg(prog, nil, h.bitp, 2)
		if totalBits*sys.PtrSize != progSize {
			println("runtime: heapBitsSetTypeGCProg: total bits", totalBits, "but progSize", progSize)
			throw("heapBitsSetTypeGCProg: unexpected bit count")
		}
	}

	// Zero the bitmap bytes between the end of the described data and
	// the end of the allocation (the bitmap grows toward lower addresses).
	progEnd := unsafe.Pointer(subtractb(h.bitp, (totalBits+3)/4))
	allocEnd := unsafe.Pointer(subtractb(h.bitp, allocSize/heapBitmapScale))
	memclrNoHeapPointers(add(allocEnd, 1), uintptr(progEnd)-uintptr(allocEnd))
}

// progToPointerMask expands the GC program prog into a 1-bit pointer mask
// and returns it as a bitvector.
// size is the size of the region described by prog, in bytes.
// The resulting bitvector will have no more than size/sys.PtrSize bits.
// The mask buffer comes from persistentalloc with one extra sentinel byte
// at the end; if running the program disturbs the sentinel, the function
// throws.
func progToPointerMask(prog *byte, size uintptr) bitvector {
	const sentinel = 0xa1 // overflow check sentinel

	maskBytes := (size/sys.PtrSize + 7) / 8
	buf := (*[1 << 30]byte)(persistentalloc(maskBytes+1, 1, &memstats.buckhash_sys))[:maskBytes+1]
	guard := &buf[len(buf)-1]
	*guard = sentinel

	nbits := runGCProg(prog, nil, &buf[0], 1)
	if *guard != sentinel {
		throw("progToPointerMask: overflow")
	}
	return bitvector{int32(nbits), &buf[0]}
}

// Packed GC pointer bitmaps, aka GC programs.
//
// For large types containing arrays, the type information has a
// natural repetition that can be encoded to save space in the
// binary and in the memory representation of the type information.
//
// The encoding is a simple Lempel-Ziv style bytecode machine
// with the following instructions:
//
//	00000000: stop
//	0nnnnnnn: emit n bits copied from the next (n+7)/8 bytes
//	10000000 n c: repeat the previous n bits c times; n, c are varints
//	1nnnnnnn c: repeat the previous n bits c times; c is a varint

// runGCProg executes the GC program prog, and then trailer if non-nil,
// writing to dst with entries of the given size.
// If size == 1, dst is a 1-bit pointer mask laid out moving forward from dst.
// If size == 2, dst is the 2-bit heap bitmap, and writes move backward
// starting at dst (because the heap bitmap does). In this case, the caller guarantees
// that only whole bytes in dst need to be written.
//
// In the size == 2 case every byte written carries four pointer bits in
// its low half and has bitScanAll ORed in.
//
// runGCProg returns the number of 1- or 2-bit entries written to memory.
func runGCProg(prog, trailer, dst *byte, size int) uintptr {
	dstStart := dst

	// Bits waiting to be written to memory.
	var bits uintptr
	var nbits uintptr

	p := prog
Run:
	for {
		// Flush accumulated full bytes.
		// The rest of the loop assumes that nbits <= 7.
		for ; nbits >= 8; nbits -= 8 {
			if size == 1 {
				*dst = uint8(bits)
				dst = add1(dst)
				bits >>= 8
			} else {
				// 8 input bits become two heap bitmap bytes of 4 entries each.
				v := bits&bitPointerAll | bitScanAll
				*dst = uint8(v)
				dst = subtract1(dst)
				bits >>= 4
				v = bits&bitPointerAll | bitScanAll
				*dst = uint8(v)
				dst = subtract1(dst)
				bits >>= 4
			}
		}

		// Process one instruction.
		inst := uintptr(*p)
		p = add1(p)
		n := inst & 0x7F
		if inst&0x80 == 0 {
			// Literal bits; n == 0 means end of program.
			if n == 0 {
				// Program is over; continue in trailer if present.
				if trailer != nil {
					//println("trailer")
					p = trailer
					trailer = nil
					continue
				}
				//println("done")
				break Run
			}
			//println("lit", n, dst)
			// Whole literal bytes: merge each into the bit buffer and
			// immediately flush 8 bits, so nbits is unchanged.
			nbyte := n / 8
			for i := uintptr(0); i < nbyte; i++ {
				bits |= uintptr(*p) << nbits
				p = add1(p)
				if size == 1 {
					*dst = uint8(bits)
					dst = add1(dst)
					bits >>= 8
				} else {
					v := bits&0xf | bitScanAll
					*dst = uint8(v)
					dst = subtract1(dst)
					bits >>= 4
					v = bits&0xf | bitScanAll
					*dst = uint8(v)
					dst = subtract1(dst)
					bits >>= 4
				}
			}
			// Trailing partial literal byte: leave its n%8 bits in the buffer.
			if n %= 8; n > 0 {
				bits |= uintptr(*p) << nbits
				p = add1(p)
				nbits += n
			}
			continue Run
		}

		// Repeat. If n == 0, it is encoded in a varint in the next bytes.
		if n == 0 {
			for off := uint(0); ; off += 7 {
				x := uintptr(*p)
				p = add1(p)
				n |= (x & 0x7F) << off
				if x&0x80 == 0 {
					break
				}
			}
		}

		// Count is encoded in a varint in the next bytes.
		c := uintptr(0)
		for off := uint(0); ; off += 7 {
			x := uintptr(*p)
			p = add1(p)
			c |= (x & 0x7F) << off
			if x&0x80 == 0 {
				break
			}
		}
		c *= n // now total number of bits to copy

		// If the number of bits being repeated is small, load them
		// into a register and use that register for the entire loop
		// instead of repeatedly reading from memory.
		// Handling fewer than 8 bits here makes the general loop simpler.
		// The cutoff is sys.PtrSize*8 - 7 to guarantee that when we add
		// the pattern to a bit buffer holding at most 7 bits (a partial byte)
		// it will not overflow.
		//
		// src is used to read back entries that were already written to dst.
		src := dst
		const maxBits = sys.PtrSize*8 - 7
		if n <= maxBits {
			// Start with bits in output buffer.
			pattern := bits
			npattern := nbits

			// If we need more bits, fetch them from memory.
			// The most recently written byte is read first, so each
			// older byte is shifted in below the bits gathered so far.
			if size == 1 {
				src = subtract1(src)
				for npattern < n {
					pattern <<= 8
					pattern |= uintptr(*src)
					src = subtract1(src)
					npattern += 8
				}
			} else {
				src = add1(src)
				for npattern < n {
					pattern <<= 4
					pattern |= uintptr(*src) & 0xf
					src = add1(src)
					npattern += 4
				}
			}

			// We started with the whole bit output buffer,
			// and then we loaded bits from whole bytes.
			// Either way, we might now have too many instead of too few.
			// Discard the extra.
			if npattern > n {
				pattern >>= npattern - n
				npattern = n
			}

			// Replicate pattern to at most maxBits.
			if npattern == 1 {
				// One bit being repeated.
				// If the bit is 1, make the pattern all 1s.
				// If the bit is 0, the pattern is already all 0s,
				// but we can claim that the number of bits
				// in the word is equal to the number we need (c),
				// because right shift of bits will zero fill.
				if pattern == 1 {
					pattern = 1<<maxBits - 1
					npattern = maxBits
				} else {
					npattern = c
				}
			} else {
				b := pattern
				nb := npattern
				if nb+nb <= maxBits {
					// Double pattern until the whole uintptr is filled.
					for nb <= sys.PtrSize*8 {
						b |= b << nb
						nb += nb
					}
					// Trim away incomplete copy of original pattern in high bits.
					// TODO(rsc): Replace with table lookup or loop on systems without divide?
					nb = maxBits / npattern * npattern
					b &= 1<<nb - 1
					pattern = b
					npattern = nb
				}
			}

			// Add pattern to bit buffer and flush bit buffer, c/npattern times.
			// Since pattern contains >8 bits, there will be full bytes to flush
			// on each iteration.
			for ; c >= npattern; c -= npattern {
				bits |= pattern << nbits
				nbits += npattern
				if size == 1 {
					for nbits >= 8 {
						*dst = uint8(bits)
						dst = add1(dst)
						bits >>= 8
						nbits -= 8
					}
				} else {
					for nbits >= 4 {
						*dst = uint8(bits&0xf | bitScanAll)
						dst = subtract1(dst)
						bits >>= 4
						nbits -= 4
					}
				}
			}

			// Add final fragment to bit buffer.
			if c > 0 {
				pattern &= 1<<c - 1
				bits |= pattern << nbits
				nbits += c
			}
			continue Run
		}

		// Repeat; n too large to fit in a register.
		// Since nbits <= 7, we know the first few bytes of repeated data
		// are already written to memory.
		off := n - nbits // n > nbits because n > maxBits and nbits <= 7
		if size == 1 {
			// Leading src fragment.
			src = subtractb(src, (off+7)/8)
			if frag := off & 7; frag != 0 {
				bits |= uintptr(*src) >> (8 - frag) << nbits
				src = add1(src)
				nbits += frag
				c -= frag
			}
			// Main loop: load one byte, write another.
			// The bits are rotating through the bit buffer.
			for i := c / 8; i > 0; i-- {
				bits |= uintptr(*src) << nbits
				src = add1(src)
				*dst = uint8(bits)
				dst = add1(dst)
				bits >>= 8
			}
			// Final src fragment.
			if c %= 8; c > 0 {
				bits |= (uintptr(*src) & (1<<c - 1)) << nbits
				nbits += c
			}
		} else {
			// Leading src fragment.
			// Same as above, but in units of 4 entries per byte and
			// with src and dst moving backward through memory.
			src = addb(src, (off+3)/4)
			if frag := off & 3; frag != 0 {
				bits |= (uintptr(*src) & 0xf) >> (4 - frag) << nbits
				src = subtract1(src)
				nbits += frag
				c -= frag
			}
			// Main loop: load one byte, write another.
			// The bits are rotating through the bit buffer.
			for i := c / 4; i > 0; i-- {
				bits |= (uintptr(*src) & 0xf) << nbits
				src = subtract1(src)
				*dst = uint8(bits&0xf | bitScanAll)
				dst = subtract1(dst)
				bits >>= 4
			}
			// Final src fragment.
			if c %= 4; c > 0 {
				bits |= (uintptr(*src) & (1<<c - 1)) << nbits
				nbits += c
			}
		}
	}

	// Write any final bits out, using full-byte writes, even for the final byte.
	// totalBits is computed before nbits is rounded up, so the padding
	// entries are written but not counted in the result.
	var totalBits uintptr
	if size == 1 {
		totalBits = (uintptr(unsafe.Pointer(dst))-uintptr(unsafe.Pointer(dstStart)))*8 + nbits
		nbits += -nbits & 7 // round up to a multiple of 8
		for ; nbits > 0; nbits -= 8 {
			*dst = uint8(bits)
			dst = add1(dst)
			bits >>= 8
		}
	} else {
		totalBits = (uintptr(unsafe.Pointer(dstStart))-uintptr(unsafe.Pointer(dst)))*4 + nbits
		nbits += -nbits & 3 // round up to a multiple of 4
		for ; nbits > 0; nbits -= 4 {
			v := bits&0xf | bitScanAll
			*dst = uint8(v)
			dst = subtract1(dst)
			bits >>= 4
		}
	}
	return totalBits
}

// dumpGCProg prints a listing of the GC program starting at p, one
// instruction per line. Each line is prefixed with the number of bitmap
// entries described by the instructions before it. Literal instructions
// are shown with their data bytes in hex, repeat instructions with the
// width and count decoded from their varints. Printing stops at the
// first zero instruction byte.
func dumpGCProg(p *byte) {
	pos := 0 // entries described so far
	for {
		op := *p
		p = add1(p)

		switch {
		case op == 0:
			print("\t", pos, " end\n")
			return

		case op&0x80 == 0:
			// Literal: op bits follow, packed into (op+7)/8 bytes.
			print("\t", pos, " lit ", op, ":")
			for left := int(op+7) / 8; left > 0; left-- {
				print(" ", hex(*p))
				p = add1(p)
			}
			print("\n")
			pos += int(op)

		default:
			// Repeat: width is in the low 7 bits, or in a varint if those are zero.
			width := int(op &^ 0x80)
			if width == 0 {
				for shift := uint(0); ; shift += 7 {
					b := *p
					p = add1(p)
					width |= int(b&0x7f) << shift
					if b&0x80 == 0 {
						break
					}
				}
			}
			// The repetition count always follows as a varint.
			reps := 0
			for shift := uint(0); ; shift += 7 {
				b := *p
				p = add1(p)
				reps |= int(b&0x7f) << shift
				if b&0x80 == 0 {
					break
				}
			}
			print("\t", pos, " repeat ", width, " × ", reps, "\n")
			pos += width * reps
		}
	}
}

// Testing.

// reflect_gcbits is the implementation of reflect.gcbits, for testing.
// It returns the GC type info for the value x points to as bitmap
// entries (0 or 1), one entry per byte. Trailing zero entries are
// trimmed, but never below the number of words covered by the element
// type's ptrdata.
//go:linkname reflect_gcbits reflect.gcbits
func reflect_gcbits(x interface{}) []byte {
	mask := getgcmask(x)
	elem := (*ptrtype)(unsafe.Pointer(efaceOf(&x)._type)).elem
	minLen := elem.ptrdata / sys.PtrSize

	end := uintptr(len(mask))
	for end > minLen && mask[end-1] == 0 {
		end--
	}
	return mask[:end]
}

// getgcmask returns GC type info for the object that ep points to, for testing.
//
// ep must hold a pointer; its dynamic type is treated as a pointer type
// and the pointed-to element is the object described. The object is
// looked up first among the registered data/bss roots and then in the
// heap. For a heap object the mask has one byte per pointer-sized word
// (1 for a pointer word) and is cut off at the first word, other than
// word 1, whose heap bits say there are no more pointers.
// If the object is in neither place the result is nil.
func getgcmask(ep interface{}) (mask []byte) {
	e := *efaceOf(&ep)
	p := e.data
	t := e._type

	// data or bss
	for roots := gcRoots; roots != nil; roots = roots.next {
		for i := 0; i < roots.count; i++ {
			pr := roots.roots[i]
			addr := uintptr(pr.decl)
			if addr <= uintptr(p) && uintptr(p) < addr+pr.size {
				n := (*ptrtype)(unsafe.Pointer(t)).elem.size
				mask = make([]byte, n/sys.PtrSize)
				// NOTE(review): this copies pr.gcdata byte for byte; confirm
				// that root gcdata uses the same one-entry-per-byte layout.
				copy(mask, (*[1 << 29]uint8)(unsafe.Pointer(pr.gcdata))[:pr.ptrdata])
				// Return only once the root containing p has been found.
				// Returning unconditionally after the first root would skip
				// every other root and make the heap lookup below unreachable
				// whenever any root is registered.
				return
			}
		}
	}

	// heap
	var n uintptr
	var base uintptr
	if mlookup(uintptr(p), &base, &n, nil) != 0 {
		mask = make([]byte, n/sys.PtrSize)
		for i := uintptr(0); i < n; i += sys.PtrSize {
			hbits := heapBitsForAddr(base + i)
			if hbits.isPointer() {
				mask[i/sys.PtrSize] = 1
			}
			// The high bit of word 1 is the checkmark bit, not a
			// "more pointers" indication, so it is skipped here.
			if i != 1*sys.PtrSize && !hbits.morePointers() {
				mask = mask[:i/sys.PtrSize]
				break
			}
		}
		return
	}

	// otherwise, not something the GC knows about.
	// possibly read-only data, like malloc(0).
	// must not have pointers
	// For gccgo, may live on the stack, which is collected conservatively.
	return
}
-												Big merge of changes to gofrontend repo that were postponed due to the
GCC release freeze.

	* go-backend.c: Include "go-c.h".
	* go-gcc.cc (Gcc_backend::write_export_data): New method.

	* go-gcc.cc (Gcc_backend::Gcc_backend): Declare
	__builtin_prefetch.
	* Make-lang.in (GO_OBJS): Add go/wb.o.

commit 884c9f2cafb3fc1decaca70f1817ae269e4c6889
Author: Than McIntosh <thanm@google.com>
Date:   Mon Jan 23 15:07:07 2017 -0500

    compiler: insert additional conversion for type desc ptr expr
    
    Change the method Type::type_descriptor_pointer to apply an additional
    type conversion to its result Bexpression, to avoid type clashes in
    the back end. The backend expression for a given type descriptor var
    is given a type of "_type", however the virtual calls that create the
    variable use types derived from _type, hence the need to force a
    conversion.
    
    Reviewed-on: https://go-review.googlesource.com/35506


commit 5f0647c71e3b29eddcd0eecc44e7ba44ae7fc8dd
Author: Than McIntosh <thanm@google.com>
Date:   Mon Jan 23 15:22:26 2017 -0500

    compiler: insure tree integrity in Call_expression::set_result
    
    Depending on the back end, it can be problematic to reuse Bexpressions
    (passing the same Bexpression to more than one Backend call to create
    additional Bexpressions or Bstatements). The Call_expression::set_result
    method was reusing its Bexpression input in more than one tree
    context; the fix is to pass in an Expression instead and generate
    multiple Bexpression references to it within the method.
    
    Reviewed-on: https://go-review.googlesource.com/35505


commit 7a8e49870885af898c3c790275e513d1764a2828
Author: Ian Lance Taylor <iant@golang.org>
Date:   Tue Jan 24 21:19:06 2017 -0800

    runtime: copy more of the scheduler from the Go 1.8 runtime
    
    Copies mstart, newm, m0, g0, and friends.
    
    Reviewed-on: https://go-review.googlesource.com/35645


commit 3546e2f002d0277d805ec59c5403bc1d4eda4ed9
Author: Ian Lance Taylor <iant@golang.org>
Date:   Thu Jan 26 19:47:37 2017 -0800

    runtime: remove a few C functions that are no longer used
    
    Reviewed-on: https://go-review.googlesource.com/35849


commit a71b835254f6d3164a0e6beaf54f2b175d1a6a92
Author: Ian Lance Taylor <iant@golang.org>
Date:   Thu Jan 26 16:51:16 2017 -0800

    runtime: copy over more of the Go 1.8 scheduler
    
    In particular __go_go (aka newproc) and goexit[01].
    
    Reviewed-on: https://go-review.googlesource.com/35847


commit c3ffff725adbe54d8283c373b6aa7dc95d6fc27f
Author: Ian Lance Taylor <iant@golang.org>
Date:   Fri Jan 27 16:58:20 2017 -0800

    runtime: copy syscall handling from Go 1.8 runtime
    
    Entering a syscall still has to start in C, to save the registers.
    Fix entersyscallblock to save them more reliably.
    
    This copies over the tracing code for syscalls, which we previously
    weren't doing, and lets us turn on runtime/trace/check.
    
    Reviewed-on: https://go-review.googlesource.com/35912


commit d5b921de4a28b04000fc4c8dac7f529a4a624dfc
Author: Ian Lance Taylor <iant@golang.org>
Date:   Fri Jan 27 18:34:11 2017 -0800

    runtime: copy SIGPROF handling from Go 1.8 runtime
    
    Also copy over Breakpoint.
    
    Fix Func.Name and Func.Entry to not crash on a nil Func.
    
    Reviewed-on: https://go-review.googlesource.com/35913


commit cc60235e55aef14b15c3d2114030245beb3adfef
Author: Than McIntosh <thanm@google.com>
Date:   Mon Feb 6 11:12:12 2017 -0500

    compiler: convert go_write_export_data to Backend method.
    
    Convert the helper function 'go_write_export_data' into a Backend
    class method, to allow for an implementation of this function that
    needs to access backend state.
    
    Reviewed-on: https://go-review.googlesource.com/36357


commit e387439bfd24d5e142874b8e68e7039f74c744d7
Author: Than McIntosh <thanm@google.com>
Date:   Wed Feb 8 11:13:46 2017 -0500

    compiler: insert backend conversion in temporary statement init
    
    Insert an additional type conversion in Temporary_statement::do_get_backend
    when assigning a Bexpression initializer to the temporary variable, to
    avoid potential clashes in the back end. This can come up when assigning
    something of concrete pointer-to-function type to a variable of generic
    pointer-to-function type.
    
    Reviewed-on: https://go-review.googlesource.com/36591


commit c5acf0ce09e61ff623847a35a99da465b8571609
Author: Matthieu Sarter <matthieu.sarter.external@atos.net>
Date:   Wed Mar 1 17:57:53 2017 +0100

    libgo: build tags for aix
    
    Build tags for the libgo source files required to build
    libgo on AIX.
    
    Issue golang/go#19200
    
    Reviewed-on: https://go-review.googlesource.com/37633


commit 67ed19616898ea18a101ec9325b82d028cd395d9
Author: Matthieu Sarter <matthieu.sarter.external@atos.net>
Date:   Thu Mar 2 15:41:31 2017 +0100

    libgo: handle AIX tag in match.sh and gotest
    
    Issue golang/go#19200
    
    Reviewed-on: https://go-review.googlesource.com/37638


commit 83ea2d694c10b2dd83fc8620c43da13d20db754e
Author: Matthieu Sarter <matthieu.sarter.external@atos.net>
Date:   Wed Mar 1 17:48:16 2017 +0100

    libgo: add AIX support in configure and Makefile
    
    - support for GOOS=aix
    - CFLAGS/GOCFLAGS/LDFLAGS for AIX
    
    Issue golang/go#19200
    
    Reviewed-on: https://go-review.googlesource.com/37632


commit 35d577fe22ffa16a3ccaadf5dae9f6f425c8ec8c
Author: Matthieu Sarter <matthieu.sarter.external@atos.net>
Date:   Mon Mar 6 15:00:15 2017 +0100

    runtime: adapt memory management to AIX mmap
    
    On AIX:
    * mmap does not allow to map an already mapped range,
    * mmap range start at 0x30000000 for 32 bits processes,
    * mmap range start at 0x70000000_00000000 for 64 bits processes
    
    Issue golang/go#19200
    
    Reviewed-on: https://go-review.googlesource.com/37845


commit 4e49e56a5fd4072b4ca7fcefe4158d6885d9ee62
Author: Matthieu Sarter <matthieu.sarter.external@atos.net>
Date:   Mon Mar 6 13:42:26 2017 +0100

    runtime: add getproccount implementation for AIX
    
    Issue golang/go#19200
    
    Reviewed-on: https://go-review.googlesource.com/37844


commit ff626470294237ac664127894826614edc46a3d0
Author: Matthieu Sarter <matthieu.sarter.external@atos.net>
Date:   Mon Mar 6 17:31:21 2017 +0100

    runtime: handle ERESTART errno with AIX's wait4
    
    On AIX, wait4 may return with errno set to ERESTART, which causes unexpected
    behavior (for instance, go build may exit with the message "wait: restart
    system call" after running a command, even if it was successful).
    
    Issue golang/go#19200
    
    Reviewed-on: https://go-review.googlesource.com/37846


commit 37daabbfc83d533b826ef9ab10e2dee7406e7198
Author: Matthieu Sarter <matthieu.sarter.external@atos.net>
Date:   Mon Mar 6 11:02:58 2017 +0100

    runtime: support for AIX's procfs tree
    
    On AIX, the process executable file is available under /proc/<pid>/object/a.out
    
    Issue golang/go#19200
    
    Reviewed-on: https://go-review.googlesource.com/37842


commit a0275c039d56acf4bf48151978c1a4ec5758cc2c
Author: Ian Lance Taylor <iant@golang.org>
Date:   Wed Mar 8 07:00:05 2017 -0800

    libgo/Makefile.am: don't use nonportable \n or \t in sed expression
    
    The resulting zstdpktlist.go is less pretty, but it works.
    
    Reviewed-on: https://go-review.googlesource.com/37940


commit 29b190f76105aafa2b50b48249afdafecc97a4be
Author: Matthieu Sarter <matthieu.sarter.external@atos.net>
Date:   Thu Mar 9 16:02:34 2017 +0100

    runtime: netpoll and semaphores for AIX
    
    semaphore implementation based on Solaris implementation in
    libgo/go/runtime/os_solaris.go
    
    netpoll is just a stub to avoid build failure on AIX.
    
    Issue golang/go#19200
    
    Reviewed-on: https://go-review.googlesource.com/37966


commit 55ca6d3f3cddf0ff9ccb074b2694da9fc54de7ec
Author: Matthieu Sarter <matthieu.sarter.external@atos.net>
Date:   Thu Mar 9 15:38:30 2017 +0100

    libmain: ensure initfn is called when loading a go library
    
    AIX does not support .init_array.
    The alternative is to export the __go_init function and tell the linker
    it is an init function with the -Wl,-binitfini:__go_init option.
    
    Issue golang/go#19200
    
    Reviewed-on: https://go-review.googlesource.com/37965


commit 349a30d17d880ac8bc1a35e1a2ffee6d6e870ae9
Author: Matthieu Sarter <matthieu.sarter.external@atos.net>
Date:   Fri Mar 10 11:15:08 2017 +0100

    libgo: use an import list for missing symbols
    
    libgo depends on symbols provided by Go programs at runtime. On AIX,
    this requires either to build libgo with -Wl,-berok linker option and
    the programs with -Wl,-brtl, or to provide a list of imported symbols
    when building libgo. The second option seems preferable, to avoid
    requiring an additional option for every Go program.
    
    There are also some symbols that are specific to GNU ld and do not
    exist when linking with AIX ld (__data_start, __edata, __etext and
    __bss_start).
    
    Issue golang/go#19200
    
    Reviewed-on: https://go-review.googlesource.com/37969


commit 91db0ea1ff068ca1d97b9c99612100ea5b96ddb2
Author: Matthieu Sarter <matthieu.sarter.external@atos.net>
Date:   Wed Mar 8 15:34:45 2017 +0100

    crypto/x509: add certificate files locations for AIX
    
    Issue golang/go#19200
    
    Reviewed-on: https://go-review.googlesource.com/37952


commit 92e521c854e91709b949548c47e267377850f26a
Author: Ian Lance Taylor <iant@golang.org>
Date:   Fri Mar 10 14:10:11 2017 -0800

    compiler: fix check for pointer in Temporary_reference_expression
    
    The check for an unrepresentable pointer in
    Temporary_reference_expression::do_get_backend was incorrectly
    translated from C to Go in https://golang.org/cl/14346043.  Fix the
    check to use points_to rather than has_pointer and deref.  This should
    not make any difference in practice as either way the condition will
    only be true for a pointer to void, but points_to is correct and more
    efficient.
    
    Reviewed-on: https://go-review.googlesource.com/38009


commit 9a0b676e59e7171a630c48fdc3d4de6712bad0ca
Author: Matthieu Sarter <matthieu.sarter.external@atos.net>
Date:   Thu Mar 16 16:51:53 2017 +0100

    libgo: add missing _arpcom struct to *sysinfo.go
    
    This struct is filtered due to having a field of type _in6_addr,
    but other types exported to *sysinfo.go are depending on it.
    
    Issue golang/go#19200
    
    Reviewed-on: https://go-review.googlesource.com/38251


commit 61262a757bdd3d9a595ab6a90f68c0c4ebed7bc1
Author: Matthieu Sarter <matthieu.sarter.external@atos.net>
Date:   Thu Mar 16 18:27:46 2017 +0100

    syscall: raw_ptrace stub for AIX
    
    Issue golang/go#19200
    
    Reviewed-on: https://go-review.googlesource.com/38253


commit 8029632b50880fd9b5e39299c738b38e3386595f
Author: Matthieu Sarter <matthieu.sarter.external@atos.net>
Date:   Wed Mar 15 16:58:37 2017 +0100

    libgo: adapt runtime.inc to AIX
    
    * Two AIX types are wrongfully exported to runtime.inc as their names
      make them look like a Go type.
    * The sigset go type conflicts with a system sigset type.
    
    Issue golang/go#19200
    
    Reviewed-on: https://go-review.googlesource.com/38192


commit 25f3a90d14bc268479369ecc0eada72791612f86
Author: Matthieu Sarter <matthieu.sarter.external@atos.net>
Date:   Wed Mar 15 16:58:37 2017 +0100

    libgo: update Makefile.in, accidentally omitted from last change
    
    Issue golang/go#19200
    
    Reviewed-on: https://go-review.googlesource.com/38310


commit d52b4895616b66f93b460366527e74336829aaa5
Author: Matthieu Sarter <matthieu.sarter.external@atos.net>
Date:   Thu Mar 16 18:39:26 2017 +0100

    syscall: TIOCSCTTY does not exist on AIX
    
    Issue golang/go#19200
    
    Reviewed-on: https://go-review.googlesource.com/38254


commit ff1ec3847a4472008e5d53a98b6694b1e54ca322
Author: Matthieu Sarter <matthieu.sarter.external@atos.net>
Date:   Thu Mar 16 18:07:34 2017 +0100

    syscall: syscall does not exist on AIX
    
    Issue golang/go#19200
    
    Reviewed-on: https://go-review.googlesource.com/38252


commit c1ee60dabf0b243a0b0286215481a5d326c34596
Author: Matthieu Sarter <matthieu.sarter.external@atos.net>
Date:   Fri Mar 17 17:18:18 2017 +0100

    net: EAI_OVERFLOW does not exist on AIX
    
    Issue golang/go#19200
    
    Reviewed-on: https://go-review.googlesource.com/38266


commit ad4ad29aed9f70b14b39b488bfeb9ee745382ec4
Author: Matthieu Sarter <matthieu.sarter.external@atos.net>
Date:   Fri Mar 17 17:23:56 2017 +0100

    net: sockopt/sockoptip stubs for AIX
    
    Issue golang/go#19200
    
    Reviewed-on: https://go-review.googlesource.com/38267


commit 5d7db2d7542fe7082f426d42f8c2ce14aad6df55
Author: Matthieu Sarter <matthieu.sarter.external@atos.net>
Date:   Fri Mar 17 16:35:05 2017 +0100

    os/user: add listgroups stub for AIX
    
    This is required to build os/user.
    
    Issue golang/go#19200
    
    Reviewed-on: https://go-review.googlesource.com/38263


commit 4e57a7973e9fa4cb5ab977c6d792e62a8f7c5795
Author: Matthieu Sarter <matthieu.sarter.external@atos.net>
Date:   Wed Mar 22 11:11:30 2017 +0100

    os: fix readdirnames for AIX
    
    Largefile implementation should be used on AIX.
    
    readdir64_r function returns 9 and sets result to NULL when
    reaching end of directory, so this return code should not
    always be considered as an error.
    
    Issue golang/go#19200
    
    Reviewed-on: https://go-review.googlesource.com/38359


commit b34036967d1ec57b25e3debe077439b4210a1d4a
Author: Matthieu Sarter <matthieu.sarter.external@atos.net>
Date:   Fri Mar 17 17:39:31 2017 +0100

    libgo: adapt sigtab.go to AIX
    
    On AIX, _NSIG is not directly defined to its integer value in
    gen-sysinfo.go.
    The real value is _SIGMAX32+1 or _SIGMAX64+1, depending if we are
    building a 32bit libgo or a 64bit libgo, so we need to read one of
    those constants to set nsig value in mksigtab.sh
    
    This change also ensures that all signal numbers from 0 to nsig-1
    are referenced in sigtable.
    
    Reviewed-on: https://go-review.googlesource.com/38268


commit 20991c32671a183ec859b4f285df37fdd4634247
Author: Matthieu Sarter <matthieu.sarter.external@atos.net>
Date:   Thu Mar 23 17:28:09 2017 +0100

    syscall: missing import in socket_bsd.go
    
    Issue golang/go#19200
    
    Reviewed-on: https://go-review.googlesource.com/38369


commit c34754bd9adf5496c4c26257eaa50793553c11e8
Author: Matthieu Sarter <matthieu.sarter.external@atos.net>
Date:   Wed Mar 22 17:57:01 2017 +0100

    syscall: WCOREDUMP macro is not defined on AIX
    
    Issue golang/go#19200
    
    Reviewed-on: https://go-review.googlesource.com/38363


commit 4f38813482227b12ea0ac6ac1b981ff9ef9853ef
Author: Matthieu Sarter <matthieu.sarter.external@atos.net>
Date:   Thu Mar 23 17:44:43 2017 +0100

    libgo: additional build tags for AIX
    
    Issue golang/go#19200
    
    Reviewed-on: https://go-review.googlesource.com/38510


commit d117ede6ff5a7083e9c40eba28a0f94f3535d773
Author: Matthieu Sarter <matthieu.sarter.external@atos.net>
Date:   Thu Mar 23 17:48:46 2017 +0100

    go/build: add AIX to "go build" command known OS
    
    Issue golang/go#19200
    
    Reviewed-on: https://go-review.googlesource.com/38511


commit 7b0ddaa6a6a71f9eb1c374122d29775b13c2cac5
Author: Ian Lance Taylor <iant@golang.org>
Date:   Thu Mar 23 09:57:01 2017 -0700

    compiler: don't crash if imported package imports this one
    
    When building a test it's OK if test code imports a package that
    imports this one. The go tool is supposed to catch cases where this
    creates an impossible initialization order. The compiler already has
    code to permit this in Gogo::add_import_init_fn. This CL avoids a
    compiler crash on a similar case when writing out the export data.
    
    I have no test case for this. Basically it pushes a compiler crash
    into an error reported elsewhere.
    
    Problem was reported by Tony Reix.
    
    Reviewed-on: https://go-review.googlesource.com/38462


commit 925636975d075e3e3353823b09db3f933f23cb03
Author: Ian Lance Taylor <iant@golang.org>
Date:   Wed Mar 29 14:14:18 2017 -0700

    runtime: copy finalizer support from Go 1.8 runtime
    
    Reviewed-on: https://go-review.googlesource.com/38794


commit 1ccb22b96cb3b1011db0e427877d9ddecb577fa9
Author: Matthieu Sarter <matthieu.sarter.external@atos.net>
Date:   Thu Mar 30 15:21:06 2017 +0200

    runtime: initcontext and setcontext stubs for AIX
    
    Further investigations are required to understand the clobbering
    issue and implement a proper fix. Until then, those stubs are
    required to allow the build to complete.
    
    Issue golang/go#19200
    
    Reviewed-on: https://go-review.googlesource.com/38930


commit 27db481f369b54256063c72b911d22390c59199c
Author: Matthieu Sarter <matthieu.sarter.external@atos.net>
Date:   Wed Mar 29 18:07:25 2017 +0200

    os: fix Readlink failure on AIX
    
    AIX readlink routine returns an error if the link is longer
    than the buffer, instead of truncating the link.
    
    Issue golang/go#19200
    
    Reviewed-on: https://go-review.googlesource.com/38700


commit c93babbf48eddd0bc34d4179ffb302dc60087299
Author: Matthieu Sarter <matthieu.sarter.external@atos.net>
Date:   Wed Mar 29 17:26:35 2017 +0200

    compiler: implement support for reading AIX big archives
    
    This is required to read go export from a Go library.
    
    Code courtesy of Damien Bergamini from Atos Infogérance.
    
    Issue golang/go#19200
    Reviewed-on: https://go-review.googlesource.com/38698


commit 930dd53482bdee3a9074850d168d0b9d7819c135
Author: Ian Lance Taylor <iant@golang.org>
Date:   Thu Apr 6 18:50:11 2017 -0700

    compiler: fix whether conversions are static initializers
    
    The compiler was incorrectly treating type conversions from string to
    int or vice-versa as static initializers.  That doesn't work, as those
    conversions are implemented via a function call.
    
    This case may never actually arise but it seems like the right thing to do.
    
    Reviewed-on: https://go-review.googlesource.com/39872


commit f02691e4195728dbf06f4dde0853c6bccc922183
Author: Ian Lance Taylor <iant@golang.org>
Date:   Thu Apr 6 17:24:08 2017 -0700

    compiler, runtime: don't let slices point past end of memory block
    
    When Go code uses a slice expression like [:len(str)] or [:cap(slice)],
    it's natural for the resulting pointer to point just past the end of
    the memory block.  If the next memory block is not live, we now have a
    live pointer to a dead block, which will unnecessarily keep the block
    alive.  That wastes space, and with the new Go 1.8 GC (not yet
    committed) will trigger an error when using GODEBUG=gccheckmark=1.
    
    This changes the implementation of slice expressions to not move the
    pointer if the resulting string length or slice capacity is 0.  When
    the length/capacity is zero, the pointer is never used anyhow.
    
    Reviewed-on: https://go-review.googlesource.com/39870


commit 17527c35b027e1afcc318faf5563909e1e9d44a6
Author: Ian Lance Taylor <iant@golang.org>
Date:   Thu Apr 6 15:30:11 2017 -0700

    compiler: emit write barriers
    
    The Go 1.8 concurrent GC requires optional write barriers for all
    assignments that may change pointer values in the heap or in a global
    variable.  For details see https://blog.golang.org/go15gc.
    
    This changes the gofrontend code to emit write barriers as needed.
    This is in preparation for future changes.  At the moment the write
    barriers will do nothing.  They test runtime.writeBarrier.enabled,
    which will never be non-zero.  They call simple functions which just
    do a move without doing any of the other operations required by the
    write barrier.
    
    Reviewed-on: https://go-review.googlesource.com/39852


commit c0b00f072bf34b2c288e1271ec8118b88c4f6f6f
Author: Matthieu Sarter <matthieu.sarter.external@atos.net>
Date:   Tue Apr 11 17:47:29 2017 +0200

    libgo: allow building gox files from PIC objects
    
    libtool builds non-PIC objects in the same directory as .lo files
    and PIC objects in a .libs subdirectory.
    BUILDGOX rule uses the non-PIC objects to build the gox files,
    but on AIX only the PIC objects are built.
    
    Issue golang/go#19200
    
    Reviewed-on: https://go-review.googlesource.com/40355


commit ea0f3da174c5503a209043f14ddda34871cfec52
Author: Ian Lance Taylor <iant@golang.org>
Date:   Thu Apr 6 19:06:14 2017 -0700

    compiler: add code to generate a ptrmask for a type
    
    The Go 1.8 garbage collector uses a ptrmask for all types below a
    certain size.  A ptrmask is simply a bit vector with a single bit for
    each pointer-sized word in the value.  The bit is 1 if the type has a
    pointer in that position, 0 if it does not.
    
    This change adds code to the compiler to generate a ptrmask.  The code
    is not used by anything yet, it is just compiled.  It will be used
    when we switch over to the Go 1.8 garbage collector.
    
    The new Array_type::int_length method, and the new memory_size
    methods, will also be used by other patches coming later.
    
    Reviewed-on: https://go-review.googlesource.com/39873


commit 3029e1df3be3614d196a03c15e50e68ff850aa4c
Author: Ian Lance Taylor <iant@golang.org>
Date:   Fri Apr 7 10:31:39 2017 -0700

    compiler: add code to generate a gcprog for a type
    
    The Go 1.8 garbage collector uses a gcprog for all types above a
    certain size.  A gcprog describes where the pointers are in the type,
    using a simple bytecode machine that supports repeating bits.  The
    effect is to permit using much less space to describe arrays.  The
    format is described in runtime/mbitmap.go in the docs for runGCProg.
    This is not yet added to the gofrontend, but can be seen in the gc sources.
    
    This change adds code to the compiler to generate a gcprog.  The code
    is not used by anything yet, it is just compiled.  It will be used
    when we switch over to the Go 1.8 garbage collector.
    
    Reviewed-on: https://go-review.googlesource.com/39923


commit 8b01ef1e9176d20f4c9e667972fe031069a4d057
Author: Ian Lance Taylor <iant@golang.org>
Date:   Thu Apr 13 07:00:35 2017 -0700

    compiler: add ptrdata computations and expressions
    
    For the upcoming Go 1.8 GC we need to compute the "ptrdata" of a type:
    the number of bytes in the type that can contain pointers.  For types
    that do not contain pointers this number is zero.  For many types it
    is a number larger than zero but smaller than the total size of the
    type.  The Go 1.8 GC uses this number to make loops looking for
    pointers run faster by not scanning the suffix of a value that can not
    contain a pointer.
    
    Unfortunately there are two subtly different definitions of ptrdata,
    and we need both.  The first is the simple one: the prefix that can
    contain pointers.  The second is the number of bytes described by the
    gcprog for the type.  Recall that we describe the exact position of
    pointers in a type using either a ptrmask or a gcprog.  The ptrmask is
    simpler, the gcprog uses less space.  We use the gcprog for large
    types, currently defined as types that are more than 2048 bytes.  When
    the Go 1.8 runtime expands a gcprog, it verifies that the gcprog
    describes exactly the same number of bytes as the ptrdata field in the
    type descriptor.  If the last pointer-containing portion of a type is
    an array, and if the elements of the array have a ptrdata that is less
    than the size of the element type, then the simple definition of the
    ptrdata will not include the final non-pointer-containing bytes of the
    last element of the array.  However, the gcprog will define the array
    using a repeat count, and will therefore include the full size of the
    last element of the array.  So for a type that needs a gcprog, the
    ptrdata field in the type descriptor must be the size of the data
    described by the gcprog, and that is not necessarily the same as the
    simple ptrdata.
    
    It might seem that we can always use the gcprog version of the ptrdata
    calculation, since that is what will appear in a type descriptor, but
    it turns out that for global variables we always use a ptrmask, not a
    gcprog, even if the global variable is large.  This is because gcprogs
    are handled by expanding them into a ptrmask at runtime, and for a
    global variable there is no natural place to put the ptrmask.  Simpler
    to always use the ptrmask.  That means that we need to describe the
    size of the ptrmask, and that means that we need an expression for the
    simple form of the ptrdata.
    
    This CL implements the ptrdata calculation.  This code is not actually
    used yet.  It will be used later when the Go 1.8 GC is committed.
    
    Reviewed-on: https://go-review.googlesource.com/40573


commit 7a37331303b572412179a08141f1dd35339d40c8
Author: Ian Lance Taylor <iant@golang.org>
Date:   Fri Apr 14 06:55:48 2017 -0700

    compiler: zero length arrays never contain pointers
    
    Reviewed-on: https://go-review.googlesource.com/40696


commit c242f0508a64d3d74a28d498cbaeda785ff76258
Author: Ian Lance Taylor <iant@golang.org>
Date:   Fri Apr 14 07:26:54 2017 -0700

    bytes: disable allocations test on gccgo
    
    It turns out that testing.AllocsPerRun has not been producing correct
    results with the current gccgo memory allocator.  When we update to
    the Go 1.8 memory allocator, testing.AllocsPerRun will work again, and
    this test will fail due to lack of escape analysis.
    
    Reviewed-on: https://go-review.googlesource.com/40700


commit 0dc369f1d63376a36bfb0999a1b0377fd444bfab
Author: Matthieu Sarter <matthieu.sarter.external@atos.net>
Date:   Tue Apr 11 16:22:38 2017 +0200

    os: alternative way to find executable path, using Args[0]
    
    AIX does not provide a proper way to find the original
    executable path from procfs, which contains just an
    hardlink.
    Executable path can be found using Args[0], Getcwd and
    $PATH.
    
    Issue golang/go#19200
    
    Reviewed-on: https://go-review.googlesource.com/40353


commit f9bad1342569b338e3b2ea9f12ffc6d3d3fa3028
Author: Ian Lance Taylor <iant@golang.org>
Date:   Fri Apr 14 08:01:19 2017 -0700

    compiler: don't write struct with multiple sink fields to C header file
    
    When writing a struct to the C header file used by the C runtime code,
    a single sink field is fine: it will be called "_", which is valid C.
    There are structs with single sink fields that we want to write out,
    such as finblock.  As it happens, though, the Go 1.8 runtime has a
    struct with two sink fields, gcControllerState, which will produce a C
    definition with two fields named "_", which will fail.  Since we don't
    need to know that struct in C, rather than fix the general case, just
    punt if the struct has multiple sink fields.
    
    After the conversion to the Go 1.8 GC, we may be able to get rid of
    the C header file anyhow.  I'm not sure yet.
    
    Reviewed-on: https://go-review.googlesource.com/40701


commit cfc28901a572aeb15b2f10a38f79eec04c64dfb2
Author: Ian Lance Taylor <iant@golang.org>
Date:   Fri Apr 14 10:07:23 2017 -0700

    runtime: disable allocations test on gccgo
    
    It turns out that testing.AllocsPerRun has not been producing correct
    results with the current gccgo memory allocator.  When we update to
    the Go 1.8 memory allocator, testing.AllocsPerRun will work again, and
    these tests will fail due to lack of escape analysis.
    
    Reviewed-on: https://go-review.googlesource.com/40703


commit 36fedd76edaa48b9ec09709a70d9e4abaddf0caf
Author: Ian Lance Taylor <iant@golang.org>
Date:   Fri Apr 14 10:47:06 2017 -0700

    runtime: remove unused size argument from hash/equal fns
    
    The size argument was removed from hash and equal functions in CL
    34983.  Somehow I missed removing them from three of the predefined
    functions.
    
    Reviewed-on: https://go-review.googlesource.com/40770


commit 90f6accb48d2e78cad8955b9292933f6ce3fe4c8
Author: Ian Lance Taylor <iant@golang.org>
Date:   Fri Apr 14 13:23:05 2017 -0700

    runtime: remove unused stack.go
    
    We're never going to use stack.go for gccgo.  Although a build tag
    keeps it from being built, even having it around can be confusing.
    Remove it.
    
    Reviewed-on: https://go-review.googlesource.com/40774


commit befa71603fc66a214e01ac219f2bba36e19f136f
Author: Ian Lance Taylor <iant@golang.org>
Date:   Fri Apr 14 13:18:34 2017 -0700

    runtime: build fastlog
    
    Take out the build tags which were preventing fastlog2 from being
    built.  It's used by the upcoming Go 1.8 GC.
    
    Reviewed-on: https://go-review.googlesource.com/40773


commit b7e19e9be4ab4c3cd8f4c9506d79a8cd56bace40
Author: Ian Lance Taylor <iant@golang.org>
Date:   Fri Apr 14 10:04:23 2017 -0700

    runtime: add tests from Go 1.8
    
    Some runtime package tests never made it into the gofrontend repo for
    some reason.  Add them now.
    Reviewed-on: https://go-review.googlesource.com/40869


commit 1feef185aebd71bc2a09b9a04287461806096610
Author: Ian Lance Taylor <iant@golang.org>
Date:   Mon Apr 17 16:26:11 2017 -0700

    runtime: change mcall to take a Go function value
    
    For future work in bringing in the Go 1.8 GC, change the mcall
    function to take a Go function value, which means that mcall can take
    a closure rather than just a straight C function pointer.
    
    As part of this change move kickoff from C to Go, which we want to do
    anyhow so that we run the write barriers that it generates.
    
    Reviewed-on: https://go-review.googlesource.com/40935


commit c3db34f4efc2d610f74a01dd2ad7775f48889b29
Author: Matthieu Sarter <matthieu.sarter.external@atos.net>
Date:   Tue Apr 11 16:11:26 2017 +0200

    runtime: netpoll implementation for AIX
    
    Code courtesy of Damien Bergamini from Atos Infogérance.
    
    Issue golang/go#19200
    
    Reviewed-on: https://go-review.googlesource.com/40352


commit f5634dff40e53ad9ce61afd67fd07334e3af9d1f
Author: Ian Lance Taylor <iant@golang.org>
Date:   Tue Apr 18 22:06:07 2017 -0700

    runtime: move mstart from Go to C
    
    The assignments done in mstart must be done without write barriers, as
    mstart is running without an m or p.  In the gc toolchain the
    equivalent code to initialize g and g->m is written in assembler;
    on GNU/Linux, it's in the clone function.
    
    Reviewed-on: https://go-review.googlesource.com/40989


commit 671d7c74592f4b6fe3665af279482ba0ea47ca2d
Author: Ian Lance Taylor <iant@golang.org>
Date:   Tue Apr 18 17:47:28 2017 -0700

    compiler: varargs slices do not escape in runtime
    
    Also, don't try to allocate an empty slice on the stack, as it will
    confuse the GCC backend.
    
    Also add a few trivial style, code formatting, and debug output fixes.
    
    Updates golang/go#17431
    
    Reviewed-on: https://go-review.googlesource.com/40983


commit 94699d25f31353bf03419eda56b15993a39f3275
Author: Ian Lance Taylor <iant@golang.org>
Date:   Tue Apr 18 17:30:09 2017 -0700

    compiler: add Ptrmask_symbol_expression
    
    Add an expression to evaluate to the ptrmask for a type.  This will be
    used for global variables, which always use a ptrmask no matter how
    large they are.
    
    Reviewed-on: https://go-review.googlesource.com/40981


commit bfff1654eac5b9288fa6c431e66cba8c9da6a660
Author: Ian Lance Taylor <iant@golang.org>
Date:   Mon Apr 17 10:51:16 2017 -0700

    runtime: change g's in systemstack
    
    The systemstack function in the gc toolchain changes to a different g.
    This is often used to get more stack space; the gofrontend uses a
    different stack growth mechanism that does not require changing g's,
    so we've been running with a version of systemstack that keeps the
    same g.  However, the garbage collector has various tests to verify
    that it is running on g0 rather than on a normal g.  For simplicity,
    change the gofrontend version of systemstack to change to a different
    g just as the gc toolchain does.
    
    This permits us to uncomment some sanity checks in notetsleep.
    Doing that requires us to fix up a couple of places where C code calls
    {start,stop}TheWorldWithSema while not on g0.
    
    Note that this does slow down some code in the runtime package unnecessarily.
    It may be useful to find some places where the runtime calls
    systemstack only to get more stack space and change it to use some
    other function.  That other function would act like systemstack in the
    gc toolchain but simply call the argument in the gofrontend.
    
    Reviewed-on: https://go-review.googlesource.com/40973


commit b2ccc7601ce71a7c5732154cf9b2eeea64681469
Author: Ian Lance Taylor <iant@golang.org>
Date:   Wed Apr 19 10:36:12 2017 -0700

    compiler, runtime: include ptrmask in GC roots
    
    Change the list of registered GC roots to include a ptrmask,
    and change the data structures to be easily used from Go code.
    The new ptrmask will be used by the Go 1.8 GC to only scan pointers.
    Tweak the current GC to use the new structures, but ignore the new
    ptrmask information for now.
    
    The new GC root data includes the size of the variable.  The size is
    not currently used, but will be used later by the cgo checking code.
    
    Reviewed-on: https://go-review.googlesource.com/41075


commit 9e065149970bc180e4ca83bb99c74d9c4f43b47b
Author: Ian Lance Taylor <iant@golang.org>
Date:   Wed Apr 19 12:23:16 2017 -0700

    compiler, runtime: don't pass size to __go_new
    
    There is no reason to pass the size to __go_new, as the type
    descriptor includes the size anyhow.  This makes the function
    correspond to the Go 1.8 function runtime.newobject, which is what we
    will use when we update to the Go 1.8 memory allocator.
    
    Reviewed-on: https://go-review.googlesource.com/41080


commit c321de7b738c4a3387c1842919c9305acfa04c57
Author: Ian Lance Taylor <iant@golang.org>
Date:   Wed Apr 19 13:13:56 2017 -0700

    compiler, runtime, reflect: make type descriptors more like Go 1.8
    
    Change the type descriptor structure to be more like the one in the Go
    1.8 runtime.  Specifically we add the ptrdata field, rename the gc
    field to gcdata and change the type to *byte, and rearrange a few of
    the fields.  The structure is still not identical to the Go 1.8
    structure--we don't use any of the tricks to reduce overall executable
    size--but it is more similar.
    
    For now we don't use the new ptrdata field, and the gcdata field is
    still the old format rather than the new Go 1.8 ptrmask/gcprog format.
    
    Reviewed-on: https://go-review.googlesource.com/41081


commit 7b70c52cddeebea9ebeac003f8c6aad59497e5f0
Author: Ian Lance Taylor <iant@golang.org>
Date:   Wed Apr 19 14:54:29 2017 -0700

    reflect: make sure to clear unusable hash/equal function
    
    Otherwise we wind up copying the one from the prototype, which is wrong.
    
    Also rewrite the hash/equal functions to look like the ones in Go 1.8,
    mainly a matter of changing names and using arrayAt.
    
    Reviewed-on: https://go-review.googlesource.com/41133


commit 84d26f467f7de8bdbb0d230458135fe1b6b2a99d
Author: Ian Lance Taylor <iant@golang.org>
Date:   Wed Apr 19 14:59:13 2017 -0700

    runtime: remove duplicate declarations of SetFinalizer/KeepAlive
    
    These should have been removed in CL 38794.  It's a bug that the
    compiler even permits these duplicate declarations.
    
    Reviewed-on: https://go-review.googlesource.com/41134


commit f85ff7e64c24031f6d0bd7c9c426b6176cb95160
Author: Ian Lance Taylor <iant@golang.org>
Date:   Wed Apr 19 15:56:32 2017 -0700

    runtime: don't crash if panicstring called with no m
    
    It's possible for runtime_panicstring to be called with no m if a
    signal handler, or scheduler innards, do something wrong.  If that
    happens carry on with the panic rather than crashing.
    
    Reviewed-on: https://go-review.googlesource.com/41137


commit 5b362b04f642afb8b20715930416fc3b7d91bb12
Author: Than McIntosh <thanm@google.com>
Date:   Fri Mar 31 14:35:48 2017 -0400

    compiler: fix for expr sharing introduced by Order_eval::statement.
    
    When processing an expression statement with a top-level call
    that returns multiple results, Order_eval::statement can wind up
    creating a tree that has multiple references to the same call,
    which results in a confusing AST dump. Change the implementation
    to avoid introducing this unwanted sharing.
    
    Reviewed-on: https://go-review.googlesource.com/39210


commit b05b4260a68695bf9c9cc29e14ae86ca2699458a
Author: Ian Lance Taylor <iant@golang.org>
Date:   Wed Apr 19 16:00:28 2017 -0700

    runtime: restore correct m in gtraceback
    
    If gtraceback is used to get a stack trace of a g running in the same m,
    as can happen if we collect a stack trace from a g0, then restore the
    old m value, don't clear it.
    
    Reviewed-on: https://go-review.googlesource.com/41138


commit ca8bbf4dfac19b3f4f7ce21a688b96a418c75031
Author: Ian Lance Taylor <iant@golang.org>
Date:   Wed Apr 19 16:03:24 2017 -0700

    runtime: set startpc field when starting a new goroutine
    
    This puts the right value in a trace--previously it was always zero.
    
    Reviewed-on: https://go-review.googlesource.com/41139


commit ca8bbf4dfac19b3f4f7ce21a688b96a418c75031
Author: Ian Lance Taylor <iant@golang.org>
Date:   Wed Apr 19 16:03:24 2017 -0700

    runtime: set startpc field when starting a new goroutine
    
    This puts the right value in a trace--previously it was always zero.
    
    Reviewed-on: https://go-review.googlesource.com/41139


commit 887690dce42d7bf8f711f8ea082e4928fb70f2a5
Author: Ian Lance Taylor <iant@golang.org>
Date:   Wed Apr 19 17:06:11 2017 -0700

    runtime: add prefetch functions
    
    The Go 1.8 GC uses prefetch functions.  Add versions for gccgo that
    call __builtin_prefetch.  Uncomment the test for them in testAtomic64.
    Don't force the check function to return early, as now taking the
    address of a local variable in the runtime package does not force it
    onto the heap.
    
    Reviewed-on: https://go-review.googlesource.com/41144


commit 4269db69f9184e5a45c54aaee7352425a1f88bff
Author: Ian Lance Taylor <iant@golang.org>
Date:   Wed Apr 19 17:55:21 2017 -0700

    runtime: split up ticks to get correct alignment
    
    On 32-bit x86 a uint64 variable by itself is aligned to an 8-byte boundary.
    A uint64 field in a struct is aligned to a 4-byte boundary.
    The runtime.ticks variable has a uint64 field that must be aligned
    to an 8-byte boundary.  Rather than rely on luck, split up the struct
    into separate vars so that the required alignment happens reliably.
    
    It would be much nicer if issue golang/go#19057 were fixed somehow,
    but that is for another day.
    
    Reviewed-on: https://go-review.googlesource.com/41143


commit 66926cabdbdbf3431b4f172f7756e195c1c6c513
Author: Matthieu Sarter <matthieu.sarter.external@atos.net>
Date:   Thu Apr 20 17:15:38 2017 +0200

    libgo: fix bad value for O_CLOEXEC on AIX 7.1
    
    On AIX 7.1, O_CLOEXEC is defined as 0x0000001000000000, which
    creates an integer constant overflow error when building libgo.
    
    This affects only 7.1, O_CLOEXEC is not defined on 6.1 (and
    defaults to 0 in sysinfo.go) and is defined as 0x00800000 on
    AIX 7.2.
    
    Issue golang/go#19200
    
    Reviewed-on: https://go-review.googlesource.com/41214


commit af288ff10aeafc47651f5def327ed56425d5be19
Author: Ian Lance Taylor <iant@golang.org>
Date:   Thu Apr 20 17:15:02 2017 -0700

    runtime: preserve stack context in tracebackothers
    
    The tracebackothers function works by saving the current stack context
    in the goroutine's context field and then calling gogo to switch to a
    new goroutine.  The new goroutine will collect its own stack trace and
    then call gogo to switch back to the original goroutine.  This works
    fine, but if the original goroutine was called by mcall then the
    contents of its context field are needed to return from the mcall.
    Fix this by saving the stack context across the calls to the other
    goroutines.
    
    Reviewed-on: https://go-review.googlesource.com/41293


commit 43101e5956e793f1b4de05c15d7738c785e927df
Author: Matthieu Sarter <matthieu.sarter.external@atos.net>
Date:   Fri Apr 21 10:58:52 2017 +0200

    os/user: use _posix_* libc functions
    
    libc getpwnam_r function has a different signature, we must use
    _posix_getpwnam_r instead (by default, the pwd.h system include
     file defines getpwnam_r as a static function calling
    _posix_getpwnam_r, so a C program calling getpwnam_r will indeed
    reference the _posix_getpwnam_r symbol).
    
    Idem for getpwuid_r, getgrnam_r and getgrgid_r.
    
    Issue golang/go#19200
    
    Reviewed-on: https://go-review.googlesource.com/41334


commit 71e1fec4d2a536591ea6657a06916a17b5127071
Author: Ian Lance Taylor <iant@golang.org>
Date:   Wed Apr 19 21:24:48 2017 -0700

    runtime: don't use pointers in g_ucontext_t or stackcontext
    
    The g_ucontext_t type holds registers saved for a goroutine.  We have
    to scan it for pointers, but since registers don't necessarily hold
    pointers we have to scan it conservatively.  That means that it should
    not have a pointer type, since the GC will always scan pointers.
    Instead it needs special treatment to be scanned conservatively.
    The current GC doesn't care when a pointer type holds a non-pointer,
    but the Go 1.8 GC does.
    
    For the current GC this means we have to explicitly scan the
    g_ucontext_t values in a G.
    
    While we're at it change stackcontext to be uintptr too.  The entries
    in stackcontext never hold pointers that the Go GC cares about.
    
    Reviewed-on: https://go-review.googlesource.com/41270


commit eab2960aee91d3e3a6baa5b1bce01262d24c714f
Author: Ian Lance Taylor <iant@golang.org>
Date:   Thu Apr 20 17:08:19 2017 -0700

    runtime/internal/sys: define Goexperiment
    
    The gc toolchain defines Goexperiment based on the environment
    variable GOEXPERIMENT when the toolchain is built.  We just always set
    Goexperiment to the empty string.
    
    Reviewed-on: https://go-review.googlesource.com/41292


commit be4a751943265c0637da859d15a4faf162f5c478
Author: Matthieu Sarter <matthieu.sarter.external@atos.net>
Date:   Thu Apr 20 14:04:35 2017 +0200

    net: sockopt implementation for AIX
    
    This is a copy of the Linux implementation, it allows to
    run some simple client/server applications on AIX, while
    the current sockopt stubs don't.
    
    Issue golang/go#19200
    
    Reviewed-on: https://go-review.googlesource.com/41213


commit 46a669c4ca5b80fd6f6a0a42095804d9f704611d
Author: Matthieu Sarter <matthieu.sarter.external@atos.net>
Date:   Wed Mar 29 17:55:06 2017 +0200

    math: fix sign for atan/expm1/log1p(-0)
    
    AIX libc returns +0 for atan(-0), expm1(-0) and log1p(-0),
    while matching Go functions must return -0.
    
    Code courtesy of Tony Reix.
    
    Issue golang/go#19200
    
    Reviewed-on: https://go-review.googlesource.com/38699


commit 53b0e809130038a46f0a3d2870e3905f44ab888d
Author: Matthieu Sarter <matthieu.sarter.external@atos.net>
Date:   Wed Apr 26 17:29:22 2017 +0200

    runtime: fix context clobbering on AIX
    
    On AIX 64-bits, r13 is a pointer to thread data.
    setcontext() overwrites r13 with the value saved by getcontext().
    So, when a goroutine is scheduled on a new thread, r13 will point
    to the old thread data after calling setcontext().
    
    Code courtesy of Damien Bergamini.
    
    Issue golang/go#19200
    
    Reviewed-on: https://go-review.googlesource.com/41854


commit f8d5ebd71c71e6e777200530d8204b92619157f8
Author: Matthieu Sarter <matthieu.sarter.external@atos.net>
Date:   Wed Apr 26 18:01:19 2017 +0200

    runtime: fix wrong time calculation in semasleep
    
    tv_nsec is added twice when calculating the sleep end time.
    
    Issue golang/go#19200
    
    Reviewed-on: https://go-review.googlesource.com/41855


commit ef56097f4ea848d48fbf61eba1c757fe7fce99d3
Author: Matthieu Sarter <matthieu.sarter.external@atos.net>
Date:   Fri Apr 28 10:27:32 2017 +0200

    libgo: pass $(NM) value when running benchmarks
    
    On AIX, we need to use "nm -B" instead of "nm", to have the
    expected output format, so the configured $(NM) value from
    the Makefile should be exported before running gotest, which
    defaults to "nm" if $NM is not set.
    
    Issue golang/go#19200
    
    Reviewed-on: https://go-review.googlesource.com/42051


commit 0fb550083ae474fb964435927b899ec8e4b62771
Author: Ian Lance Taylor <iant@golang.org>
Date:   Wed Nov 16 21:12:53 2016 -0800

    runtime: copy garbage collector from Go 1.8 runtime
    
    This giant patch replaces the old Go 1.4 memory allocator and garbage
    collector with the new Go 1.8 code.  The memory allocator is fairly
    similar, though now written in Go rather than C.  The garbage
    collector is completely different.  It now uses ptrmask and gcprog
    information, which requires changes in the compiler and the reflect
    package as well as the runtime.  And, of course, the garbage collector
    now runs concurrently with program execution.
    
    In the gc toolchain the garbage collector is strict and precise at all
    levels.  In the gofrontend we do not have stack maps, so stacks, and
    register values, are collected conservatively.  That means that an
    old, no longer used, pointer on a stack or in a register can cause a
    memory object to live longer than it should.  That in turn means that
    we must disable some checks for invalid pointers in the garbage
    collection code.  Not only can we get an invalid pointer on the stack;
    the concurrent nature of the collector means that we can in effect
    resurrect a block that was already unmarked but that the collector had
    not yet gotten around to freeing, and that block can in turn point to
    other blocks that the collector had managed to already free.  So we
    must disable pointer checks in general.  In effect we are relying on
    the fact that the strict pointer checks in the gc toolchain ensure
    that the garbage collector is correct, and we just assume that it is
    correct for the gofrontend since we are using the same code.
    
    Reviewed-on: https://go-review.googlesource.com/41307


commit a95078d501175240d095500a8c5fbfb21bec65cb
Author: Ian Lance Taylor <iant@golang.org>
Date:   Mon Apr 24 16:33:47 2017 -0700

    libgo/Makefile: clean more files
    
    Fix up the mostlyclean, clean, and distclean targets to better follow
    https://www.gnu.org/prep/standards/html_node/Standard-Targets.html.
    
    Reviewed-on: https://go-review.googlesource.com/41625


commit 5956bf1055451cf4239cdfeca259c23b1ded54d8
Author: Ian Lance Taylor <iant@golang.org>
Date:   Mon May 8 13:35:11 2017 -0700

    libgo: delete goc2c
    
    The last .goc file has been removed, so remove goc2c.
    
    The goc2c program was my first contribution to the gc repository that
    was more than 100 lines:
    https://github.com/golang/go/commit/2b57a1124e87b0dc8bc1ff6899297b4d7d6e74f2
    The program was used in gc for a few years under various guises but
    was finally removed in https://golang.org/cl/132680043.  Now we can
    remove it from gofrontend as well.
    
    Reviewed-on: https://go-review.googlesource.com/42911


commit a222e35d041de0cd42506b61c93b8209e07702b9
Author: Than McIntosh <thanm@google.com>
Date:   Tue May 9 10:33:10 2017 -0400

    compiler: set "need_init_fn" when adding gc root
    
    Variables that back slice initializers in certain cases have to be
    added to the gc roots list, since they can be modified at runtime. The
    code that was doing this addition did not update the flag that tracks
    whether the package being compiled needs an initializer function,
    which resulted in the call in question being left out of the final
    generated code in certain cases. Fix is to change Gogo::add_gc_root()
    to update the "needs init" flag.
    
    Reviewed-on: https://go-review.googlesource.com/43030


commit 822ab419bf7d1c705cdce1c12133e7a11f56be2e
Author: Than McIntosh <thanm@google.com>
Date:   Tue May 9 11:36:51 2017 -0400

    compiler: fix variable context nit in write barrier generation
    
    Update the write barrier generation code to insure that the "lvalue
    context" tag on the space var expression is set only in the case where
    the expr feeds directly into an assignment. This is somewhat
    counter-intuitive, but needed in the case where the backend looks at
    context tags.
    
    Reviewed-on: https://go-review.googlesource.com/43031

From-SVN: r247848

											
										
										
											2017-05-10 19:26:09 +02:00
+								// Copyright 2009 The Go Authors. All rights reserved.
 								// Use of this source code is governed by a BSD-style
 								// license that can be found in the LICENSE file.
 								// Garbage collector: type and heap bitmaps.
 								//
 								// Stack, data, and bss bitmaps
 								//
 								// Stack frames and global variables in the data and bss sections are described
 								// by 1-bit bitmaps in which 0 means uninteresting and 1 means live pointer
 								// to be visited during GC. The bits in each byte are consumed starting with
 								// the low bit: 1<<0, 1<<1, and so on.
 								//
 								// Heap bitmap
 								//
 								// The allocated heap comes from a subset of the memory in the range [start, used),
 								// where start == mheap_.arena_start and used == mheap_.arena_used.
 								// The heap bitmap comprises 2 bits for each pointer-sized word in that range,
 								// stored in bytes indexed backward in memory from start.
 								// That is, the byte at address start-1 holds the 2-bit entries for the four words
 								// start through start+3*ptrSize, the byte at start-2 holds the entries for
 								// start+4*ptrSize through start+7*ptrSize, and so on.
 								//
 								// In each 2-bit entry, the lower bit holds the same information as in the 1-bit
 								// bitmaps: 0 means uninteresting and 1 means live pointer to be visited during GC.
 								// The meaning of the high bit depends on the position of the word being described
 								// in its allocated object. In all words *except* the second word, the
 								// high bit indicates that the object is still being described. In
 								// these words, if a bit pair with a high bit 0 is encountered, the
 								// low bit can also be assumed to be 0, and the object description is
 								// over. This 00 is called the ``dead'' encoding: it signals that the
 								// rest of the words in the object are uninteresting to the garbage
 								// collector.
 								//
 								// In the second word, the high bit is the GC ``checkmarked'' bit (see below).
 								//
 								// The 2-bit entries are split when written into the byte, so that the top half
 								// of the byte contains 4 high bits and the bottom half contains 4 low (pointer)
 								// bits.
 								// This form allows a copy from the 1-bit to the 4-bit form to keep the
 								// pointer bits contiguous, instead of having to space them out.
 								//
 								// The code makes use of the fact that the zero value for a heap bitmap
 								// has no live pointer bit set and is (depending on position), not used,
 								// not checkmarked, and is the dead encoding.
 								// These properties must be preserved when modifying the encoding.
 								//
 								// Checkmarks
 								//
 								// In a concurrent garbage collector, one worries about failing to mark
 								// a live object due to mutations without write barriers or bugs in the
 								// collector implementation. As a sanity check, the GC has a 'checkmark'
 								// mode that retraverses the object graph with the world stopped, to make
 								// sure that everything that should be marked is marked.
 								// In checkmark mode, in the heap bitmap, the high bit of the 2-bit entry
 								// for the second word of the object holds the checkmark bit.
 								// When not in checkmark mode, this bit is set to 1.
 								//
 								// The smallest possible allocation is 8 bytes. On a 32-bit machine, that
 								// means every allocated object has two words, so there is room for the
 								// checkmark bit. On a 64-bit machine, however, the 8-byte allocation is
 								// just one word, so the second bit pair is not available for encoding the
 								// checkmark. However, because non-pointer allocations are combined
 								// into larger 16-byte (maxTinySize) allocations, a plain 8-byte allocation
 								// must be a pointer, so the type bit in the first word is not actually needed.
 								// It is still used in general, except in checkmark the type bit is repurposed
 								// as the checkmark bit and then reinitialized (to 1) as the type bit when
 								// finished.
 								//
 								package runtime
 								import (
 									"runtime/internal/atomic"
 									"runtime/internal/sys"
 									"unsafe"
 								)
 								const (
 									// bitPointer is the low (pointer) bit of the first 2-bit entry in a
 									// heap bitmap byte; pointer bits occupy the bottom half of the byte.
 									bitPointer = 1 << 0
 									// bitScan is the high bit of the first 2-bit entry in a heap bitmap
 									// byte; high bits occupy the top half of the byte.
 									bitScan    = 1 << 4
 									heapBitsShift   = 1                     // shift offset between successive bitPointer or bitScan entries
 									// One bitmap byte has 8 bits at 2 bits per word, so it describes
 									// 8/2 = 4 pointer-sized words of heap.
 									heapBitmapScale = sys.PtrSize * (8 / 2) // number of data bytes described by one heap bitmap byte
 									// all scan/pointer bits in a byte
 									bitScanAll    = bitScan | bitScan<<heapBitsShift | bitScan<<(2*heapBitsShift) | bitScan<<(3*heapBitsShift)
 									bitPointerAll = bitPointer | bitPointer<<heapBitsShift | bitPointer<<(2*heapBitsShift) | bitPointer<<(3*heapBitsShift)
 								)
 								// addb returns the byte pointer p+n.
 								// The addition is performed on the raw address (via uintptr); no bounds
 								// or overflow check is made here.
 								//go:nowritebarrier
 								//go:nosplit
 								func addb(p *byte, n uintptr) *byte {
 									// Note: wrote out full expression instead of calling add(p, n)
 									// to reduce the number of temporaries generated by the
 									// compiler for this trivial expression during inlining.
 									return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + n))
 								}
 								// subtractb returns the byte pointer p-n.
 								// subtractb is typically used when traversing the pointer tables referred to by hbits
 								// which are arranged in reverse order.
 								// The subtraction is performed on the raw address (via uintptr); no bounds
 								// or underflow check is made here.
 								//go:nowritebarrier
 								//go:nosplit
 								func subtractb(p *byte, n uintptr) *byte {
 									// Note: wrote out full expression instead of calling add(p, -n)
 									// to reduce the number of temporaries generated by the
 									// compiler for this trivial expression during inlining.
 									return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) - n))
 								}
 								// add1 returns the byte pointer p+1.
 								// It is the n == 1 special case of addb, written out separately.
 								//go:nowritebarrier
 								//go:nosplit
 								func add1(p *byte) *byte {
 									// Note: wrote out full expression instead of calling addb(p, 1)
 									// to reduce the number of temporaries generated by the
 									// compiler for this trivial expression during inlining.
 									return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + 1))
 								}
 								// subtract1 returns the byte pointer p-1.
 								// subtract1 is typically used when traversing the pointer tables referred to by hbits
 								// which are arranged in reverse order.
 								// It is the n == 1 special case of subtractb, written out separately.
 								//go:nowritebarrier
 								//
 								// nosplit because it is used during write barriers and must not be preempted.
 								//go:nosplit
 								func subtract1(p *byte) *byte {
 									// Note: wrote out full expression instead of calling subtractb(p, 1)
 									// to reduce the number of temporaries generated by the
 									// compiler for this trivial expression during inlining.
 									return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) - 1))
 								}
 								// mapBits is called each time arena_used is extended.
 								// It maps any additional bitmap memory needed for the new arena memory.
 								// It must be called with the expected new value of arena_used,
 								// *before* h.arena_used has been updated.
 								// Waiting to update arena_used until after the memory has been mapped
 								// avoids faults when other threads try to access the bitmap immediately
 								// after observing the change to arena_used.
 								//
 								//go:nowritebarrier
 								func (h *mheap) mapBits(arena_used uintptr) {
 									// Caller has added extra mappings to the arena.
 									// Add extra mappings of bitmap words as needed.
 									// We allocate extra bitmap pieces in chunks of bitmapChunk.
 									const bitmapChunk = 8192
 									// n is the number of bitmap bytes needed to describe
 									// [arena_start, arena_used), passed through round first with
 									// bitmapChunk and then with physPageSize.
 									n := (arena_used - mheap_.arena_start) / heapBitmapScale
 									n = round(n, bitmapChunk)
 									n = round(n, physPageSize)
 									if h.bitmap_mapped >= n {
 										// Enough bitmap is already mapped.
 										return
 									}
 									// The bitmap is indexed backward from h.bitmap, so the missing piece
 									// is the n-h.bitmap_mapped bytes starting at h.bitmap-n.
 									sysMap(unsafe.Pointer(h.bitmap-n), n-h.bitmap_mapped, h.arena_reserved, &memstats.gc_sys)
 									h.bitmap_mapped = n
 								}
 								// heapBits provides access to the bitmap bits for a single heap word.
 								// The methods on heapBits take value receivers so that the compiler
 								// can more easily inline calls to those methods and registerize the
 								// struct fields independently.
 								type heapBits struct {
 									// NOTE(review): presumably the heap bitmap byte that holds this
 									// word's 2-bit entry — confirm against the code that builds heapBits.
 									bitp  *uint8
 									// NOTE(review): presumably the position of this word's entry within
 									// *bitp — confirm against the code that builds heapBits.
 									shift uint32
 								}
 								// markBits provides access to the mark bit for an object in the heap.
 								// bytep points to the byte holding the mark bit.
 								// mask is a byte with a single bit set that can be &ed with *bytep
 								// to see if the bit has been set.
 								// *m.bytep&m.mask != 0 indicates the mark bit is set.
 								// index can be used along with span information to generate
 								// the address of the object in the heap.
 								// We maintain one set of mark bits for allocation and one for
 								// marking purposes.
 								type markBits struct {
 									// bytep points to the byte that holds the bit.
 									bytep *uint8
 									// mask has exactly one bit set, selecting the bit within *bytep.
 									mask  uint8
 									// index is the object index the bit belongs to.
 									index uintptr
 								}
 								// allocBitsForIndex returns the markBits describing the allocation bit
 								// of the object at index allocBitIndex: the byte of s.allocBits that
 								// holds the bit, a one-bit mask selecting it, and the index itself.
 								//go:nosplit
 								func (s *mspan) allocBitsForIndex(allocBitIndex uintptr) markBits {
 									return markBits{
 										bytep: addb(s.allocBits, allocBitIndex>>3),
 										mask:  uint8(1) << (allocBitIndex & 7),
 										index: allocBitIndex,
 									}
 								}
 								// refillAllocCache takes 8 bytes of s.allocBits starting at whichByte
 								// and negates them so that ctz (count trailing zeros) instructions
 								// can be used. It then places these 8 bytes into the cached 64 bit
 								// s.allocCache.
 								func (s *mspan) refillAllocCache(whichByte uintptr) {
 									bytes := (*[8]uint8)(unsafe.Pointer(addb(s.allocBits, whichByte)))
 									// Assemble the 8 bytes with bytes[0] in the low-order position, so
 									// that bit i of aCache is bit i%8 of bytes[i/8].
 									aCache := uint64(0)
 									aCache |= uint64(bytes[0])
 									aCache |= uint64(bytes[1]) << (1 * 8)
 									aCache |= uint64(bytes[2]) << (2 * 8)
 									aCache |= uint64(bytes[3]) << (3 * 8)
 									aCache |= uint64(bytes[4]) << (4 * 8)
 									aCache |= uint64(bytes[5]) << (5 * 8)
 									aCache |= uint64(bytes[6]) << (6 * 8)
 									aCache |= uint64(bytes[7]) << (7 * 8)
 									// Invert: a 0 bit in allocBits becomes a 1 bit in the cache.
 									s.allocCache = ^aCache
 								}
 								// nextFreeIndex returns the index of the next free object in s at
 								// or after s.freeindex.
 								// There are hardware instructions that can be used to make this
 								// faster if profiling warrants it.
 								//
 								// On success s.freeindex is advanced to one past the returned index and
 								// the consumed bits are shifted out of s.allocCache. If no free object
 								// is found, s.nelems is returned.
 								func (s *mspan) nextFreeIndex() uintptr {
 									sfreeindex := s.freeindex
 									snelems := s.nelems
 									if sfreeindex == snelems {
 										// The span is already exhausted.
 										return sfreeindex
 									}
 									if sfreeindex > snelems {
 										throw("s.freeindex > s.nelems")
 									}
 									aCache := s.allocCache
 									bitIndex := sys.Ctz64(aCache)
 									// bitIndex == 64 is treated as "no bit set": nothing is available
 									// in the cached bits, so grab the next 8 bytes and try again.
 									for bitIndex == 64 {
 										// Move index to start of next cached bits.
 										sfreeindex = (sfreeindex + 64) &^ (64 - 1)
 										if sfreeindex >= snelems {
 											s.freeindex = snelems
 											return snelems
 										}
 										whichByte := sfreeindex / 8
 										// Refill s.allocCache with the next 64 alloc bits.
 										s.refillAllocCache(whichByte)
 										aCache = s.allocCache
 										bitIndex = sys.Ctz64(aCache)
 									}
 									result := sfreeindex + uintptr(bitIndex)
 									if result >= snelems {
 										// The bit found lies at or beyond s.nelems; report exhaustion.
 										s.freeindex = snelems
 										return snelems
 									}
 									// Drop the bit just found, and everything below it, from the cache.
 									s.allocCache >>= (bitIndex + 1)
 									sfreeindex = result + 1
 									if sfreeindex%64 == 0 && sfreeindex != snelems {
 										// We just incremented s.freeindex so it isn't 0.
 										// As each 1 in s.allocCache was encountered and used for allocation
 										// it was shifted away. At this point s.allocCache contains all 0s.
 										// Refill s.allocCache so that it corresponds
 										// to the bits at s.allocBits starting at s.freeindex.
 										whichByte := sfreeindex / 8
 										s.refillAllocCache(whichByte)
 									}
 									s.freeindex = sfreeindex
 									return result
 								}
 								// isFree reports whether the object at position index in s is
 								// unallocated, i.e. its bit in s.allocBits is clear.
 								func (s *mspan) isFree(index uintptr) bool {
 									if index < s.freeindex {
 										// Objects below s.freeindex are reported as allocated
 										// without consulting the bitmap.
 										return false
 									}
 									mask := uint8(1) << (index & 7)
 									return *addb(s.allocBits, index>>3)&mask == 0
 								}
 								// objIndex converts the address p into an index relative to s.base(),
 								// using the span's precomputed divShift/divMul/divShift2 values.
 								// NOTE(review): presumably the result is the index of the object
 								// containing p (byte offset divided by the element size) — confirm
 								// against where the div* fields are initialized.
 								func (s *mspan) objIndex(p uintptr) uintptr {
 									byteOffset := p - s.base()
 									if byteOffset == 0 {
 										return 0
 									}
 									if s.baseMask != 0 {
 										// s.baseMask is non-0, elemsize is a power of two, so shift by s.divShift
 										return byteOffset >> s.divShift
 									}
 									// Otherwise shift, multiply by s.divMul in 64 bits, and shift again.
 									return uintptr(((uint64(byteOffset) >> s.divShift) * uint64(s.divMul)) >> s.divShift2)
 								}
 								// markBitsForAddr returns the markBits for address p: it looks up the
 								// span with spanOf, converts p to an object index in that span, and
 								// returns the span's mark bits for that index.
 								func markBitsForAddr(p uintptr) markBits {
 									span := spanOf(p)
 									return span.markBitsForIndex(span.objIndex(p))
 								}
 								// markBitsForIndex returns the markBits describing object number
 								// objIndex of span s: a pointer to the byte of s.gcmarkBits that holds
 								// the object's bit, a mask selecting that bit, and the index itself.
 								func (s *mspan) markBitsForIndex(objIndex uintptr) markBits {
 									return markBits{
 										addb(s.gcmarkBits, objIndex/8), // byte holding the bit
 										uint8(1 << (objIndex % 8)),     // low 3 bits of the index select the bit
 										objIndex,
 									}
 								}
 								// markBitsForBase returns the markBits for the first object of span s:
 								// the first byte of s.gcmarkBits, bit 0, index 0.
 								func (s *mspan) markBitsForBase() markBits {
 									const firstBit uint8 = 1
 									return markBits{s.gcmarkBits, firstBit, 0}
 								}
 								// isMarked reports whether the bit selected by m.mask is set in the
 								// mark byte m.bytep points to.
 								func (m markBits) isMarked() bool {
 									selected := *m.bytep & m.mask
 									return selected != 0
 								}
 								// setMarked atomically turns on the mark bit described by m.
 								//
 								// Some compilers are not able to inline atomic.Or8, so if this shows
 								// up as a hot spot consider inlining it by hand.
 								func (m markBits) setMarked() {
 									// Other updates to the same byte may be in flight, so the
 									// update is always atomic. A non-atomic fast path was used here
 									// in the past for certain cases, but it is not worth the risk.
 									bytep, mask := m.bytep, m.mask
 									atomic.Or8(bytep, mask)
 								}
 								// setMarkedNonAtomic turns on the mark bit described by m using a
 								// plain (non-atomic) read-modify-write of the mark byte.
 								func (m markBits) setMarkedNonAtomic() {
 									*m.bytep = *m.bytep | m.mask
 								}
 								// clearMarked atomically turns off the mark bit described by m.
 								func (m markBits) clearMarked() {
 									// Other updates to the same byte may be in flight, so the
 									// update is always atomic. A non-atomic fast path was used here
 									// in the past for certain cases, but it is not worth the risk.
 									keep := ^m.mask // every bit except the one being cleared
 									atomic.And8(m.bytep, keep)
 								}
 								// clearMarkedNonAtomic clears the marked bit non-atomically.
 								//
 								// The bit selected by m.mask is forced to 0 and all other bits of the
 								// mark byte are left untouched. The operation is idempotent: clearing
 								// a bit that is already clear is a no-op.
 								func (m markBits) clearMarkedNonAtomic() {
 									// Use bit-clear (&^=), not XOR: XOR would toggle the bit and so
 									// would *set* the mark when the object was not marked to begin with.
 									*m.bytep &^= m.mask
 								}
 								// markBitsForSpan returns the markBits for the span whose base address
 								// is base.
 								//
 								// It throws if base lies outside [mheap_.arena_start, mheap_.arena_used)
 								// or if the resulting mark bit is not the first bit of its byte.
 								func markBitsForSpan(base uintptr) (mbits markBits) {
 									inArena := mheap_.arena_start <= base && base < mheap_.arena_used
 									if !inArena {
 										throw("markBitsForSpan: base out of range")
 									}
 									mbits = markBitsForAddr(base)
 									if mbits.mask == 1 {
 										return mbits
 									}
 									throw("markBitsForSpan: unaligned start")
 									return mbits
 								}
 								// advance moves m forward by one object: the index is incremented and
 								// the mask shifts one bit to the left, rolling over to bit 0 of the
 								// following byte once the top bit of the current byte has been used.
 								func (m *markBits) advance() {
 									m.index++
 									if m.mask != 1<<7 {
 										// Still inside the current byte.
 										m.mask <<= 1
 										return
 									}
 									// Last bit of this byte: continue with the first bit of the next one.
 									m.mask = 1
 									m.bytep = (*uint8)(unsafe.Pointer(uintptr(unsafe.Pointer(m.bytep)) + 1))
 								}
 								// heapBitsForAddr returns the heapBits for the address addr.
 								// The caller must have already checked that addr is in the range
 								// [mheap_.arena_start, mheap_.arena_used).
 								//
 								// nosplit because it is used during write barriers and must not be preempted.
 								//go:nosplit
 								func heapBitsForAddr(addr uintptr) heapBits {
 									// 2 bits per word, 4 pairs per byte, and a mask is hard coded.
 									wordIdx := (addr - mheap_.arena_start) / sys.PtrSize
 									// Bitmap bytes are indexed backward from mheap_.bitmap.
 									bytep := (*uint8)(unsafe.Pointer(mheap_.bitmap - wordIdx/4 - 1))
 									return heapBits{bytep, uint32(wordIdx & 3)}
 								}
 								// heapBitsForSpan returns the heapBits for the span whose base address
 								// is base. It throws if base lies outside
 								// [mheap_.arena_start, mheap_.arena_used).
 								func heapBitsForSpan(base uintptr) (hbits heapBits) {
 									inArena := mheap_.arena_start <= base && base < mheap_.arena_used
 									if !inArena {
 										throw("heapBitsForSpan: base out of range")
 									}
 									return heapBitsForAddr(base)
 								}
 								// heapBitsForObject returns the base address for the heap object
 								// containing the address p, the heapBits for base,
 								// the object's span, and the index of the object in s.
 								// If p does not point into a heap object,
 								// return base == 0
 								// otherwise return the base of the object.
 								//
 								// All results are named, and the early-return paths use a bare return.
 								// On those paths base, hbits and objIndex are zero, but s may already
 								// hold the span found in mheap_.spans; callers should test base, not s.
 								//
 								// For gccgo, the forStack parameter is true if the value came from the stack.
 								// The stack is collected conservatively and may contain invalid pointers.
 								//
 								// refBase and refOff optionally give the base address of the object
 								// in which the pointer p was found and the byte offset at which it
 								// was found. These are used for error reporting.
 								func heapBitsForObject(p, refBase, refOff uintptr, forStack bool) (base uintptr, hbits heapBits, s *mspan, objIndex uintptr) {
 									arenaStart := mheap_.arena_start
 									// Pointers outside the arena are not heap pointers.
 									if p < arenaStart || p >= mheap_.arena_used {
 										return
 									}
 									off := p - arenaStart
 									idx := off >> _PageShift
 									// p points into the heap, but possibly to the middle of an object.
 									// Consult the span table to find the block beginning.
 									s = mheap_.spans[idx]
 									if s == nil || p < s.base() || p >= s.limit || s.state != mSpanInUse {
 										if s == nil || s.state == _MSpanStack || forStack {
 											// If s is nil, the virtual address has never been part of the heap.
 											// This pointer may be to some mmap'd region, so we allow it.
 											// Pointers into stacks are also ok, the runtime manages these explicitly.
 											// Values found on the (conservatively scanned) stack are
 											// likewise ignored silently.
 											return
 										}
 										// The following ensures that we are rigorous about what data
 										// structures hold valid pointers.
 										if debug.invalidptr != 0 {
 											// Typically this indicates an incorrect use
 											// of unsafe or cgo to store a bad pointer in
 											// the Go heap. It may also indicate a runtime
 											// bug.
 											//
 											// TODO(austin): We could be more aggressive
 											// and detect pointers to unallocated objects
 											// in allocated spans.
 											printlock()
 											print("runtime: pointer ", hex(p))
 											if s.state != mSpanInUse {
 												print(" to unallocated span")
 											} else {
 												print(" to unused region of span")
 											}
 											print(" idx=", hex(idx), " span.base()=", hex(s.base()), " span.limit=", hex(s.limit), " span.state=", s.state, "\n")
 											if refBase != 0 {
 												print("runtime: found in object at *(", hex(refBase), "+", hex(refOff), ")\n")
 												gcDumpObject("object", refBase, refOff)
 											}
 											throw("found bad pointer in Go heap (incorrect use of unsafe or cgo?)")
 										}
 										// Invalid-pointer checking is disabled: treat p as a non-heap pointer.
 										return
 									}
 									if forStack {
 										// A span can be entered in mheap_.spans, and be set
 										// to mSpanInUse, before it is fully initialized.
 										// All we need in practice is allocBits and gcmarkBits,
 										// so make sure they are set.
 										if s.allocBits == nil || s.gcmarkBits == nil {
 											return
 										}
 									}
 									// If this span holds object of a power of 2 size, just mask off the bits to
 									// the interior of the object. Otherwise use the size to get the base.
 									if s.baseMask != 0 {
 										// optimize for power of 2 sized objects.
 										base = s.base()
 										base = base + (p-base)&uintptr(s.baseMask)
 										objIndex = (base - s.base()) >> s.divShift
 										// base = p & s.baseMask is faster for small spans,
 										// but doesn't work for large spans.
 										// Overall, it's faster to use the more general computation above.
 									} else {
 										base = s.base()
 										// When p lies within the first element, base is already
 										// correct and objIndex stays 0.
 										if p-base >= s.elemsize {
 											// n := (p - base) / s.elemsize, using division by multiplication
 											objIndex = uintptr(p-base) >> s.divShift * uintptr(s.divMul) >> s.divShift2
 											base += objIndex * s.elemsize
 										}
 									}
 									// Now that we know the actual base, compute heapBits to return to caller.
 									hbits = heapBitsForAddr(base)
 									return
 								}
 								// prefetch issues a prefetchnta for the heap bitmap byte that h refers to.
 								func (h heapBits) prefetch() {
 									addr := uintptr(unsafe.Pointer(h.bitp))
 									prefetchnta(addr)
 								}
 								// next returns the heapBits describing the next pointer-sized word in memory.
 								// That is, if h describes address p, h.next() describes p+ptrSize.
 								// h itself is left unchanged; the caller must keep the returned value.
 								//
 								// nosplit because it is used during write barriers and must not be preempted.
 								//go:nosplit
 								func (h heapBits) next() heapBits {
 									if h.shift >= 3*heapBitsShift {
 										// h was the last entry of its byte: step to the preceding
 										// bitmap byte (the bitmap grows backward) and restart at shift 0.
 										return heapBits{subtract1(h.bitp), 0}
 									}
 									return heapBits{h.bitp, h.shift + heapBitsShift}
 								}
 								// forward returns the heapBits describing n pointer-sized words ahead of h in memory.
 								// That is, if h describes address p, h.forward(n) describes p+n*ptrSize.
 								// h.forward(1) is equivalent to h.next(), just slower.
 								// h itself is left unchanged; the caller must keep the returned value.
 								func (h heapBits) forward(n uintptr) heapBits {
 									// Count words from the start of h's bitmap byte rather than from h.
 									words := n + uintptr(h.shift)/heapBitsShift
 									wholeBytes, entry := words/4, words%4
 									return heapBits{subtractb(h.bitp, wholeBytes), uint32(entry) * heapBitsShift}
 								}
 								// bits returns the heap bits for the current word.
 								// The caller can test morePointers and isPointer by &-ing with bitScan and bitPointer.
 								// The result includes in its higher bits the bits for subsequent words
 								// described by the same bitmap byte.
 								func (h heapBits) bits() uint32 {
 									raw := uint32(*h.bitp)
 									// Masking the shift count with 31 eliminates a test and
 									// conditional branch from the generated code.
 									return raw >> (h.shift & 31)
 								}
 								// morePointers reports whether the scan bit (bitScan) is set in the
 								// entry h describes, i.e. whether the object is still being described
 								// and this word or a later one may hold a pointer.
 								// A false result means this word and all remaining words in the
 								// object are scalars.
 								// h must not describe the second word of the object.
 								func (h heapBits) morePointers() bool {
 									scan := h.bits() & bitScan
 									return scan != 0
 								}
 								// isPointer reports whether the pointer bit (bitPointer) is set in the
 								// entry h describes, i.e. whether the heap bits describe a pointer word.
 								//
 								// nosplit because it is used during write barriers and must not be preempted.
 								//go:nosplit
 								func (h heapBits) isPointer() bool {
 									ptr := h.bits() & bitPointer
 									return ptr != 0
 								}
 								// hasPointers reports whether the given object has any pointers.
 								// size is the size of the object at h; passing it in lets the one-word
 								// case be answered without reading the bitmap.
 								// h must describe the initial word of the object.
 								func (h heapBits) hasPointers(size uintptr) bool {
 									// 1-word objects are always pointers; for anything larger the
 									// scan bit of the first word decides.
 									return size == sys.PtrSize || (*h.bitp>>h.shift)&bitScan != 0
 								}
 								// isCheckmarked reports whether the heap bits have the checkmarked bit set.
 								// The size of the object at h is required because the checkmark bit is
 								// encoded differently for one-word and multiword objects.
 								// h must describe the initial word of the object.
 								func (h heapBits) isCheckmarked(size uintptr) bool {
 									if size != sys.PtrSize {
 										// All multiword objects are 2-word aligned,
 										// so the initial word's 2-bit pair and the second
 										// word's 2-bit pair live in the same heap bitmap
 										// byte, *h.bitp. The checkmark is the second word's
 										// high (scan) bit.
 										return (*h.bitp>>(heapBitsShift+h.shift))&bitScan != 0
 									}
 									// One-word object: the pointer bit of the word itself is used.
 									return (*h.bitp>>h.shift)&bitPointer != 0
 								}
 								// setCheckmarked sets the checkmarked bit, atomically.
 								// The size of the object at h is required because the checkmark bit is
 								// encoded differently for one-word and multiword objects.
 								// h must describe the initial word of the object.
 								func (h heapBits) setCheckmarked(size uintptr) {
 									var mask uint8
 									if size == sys.PtrSize {
 										// One-word object: the word's own pointer bit.
 										mask = bitPointer << h.shift
 									} else {
 										// Multiword object: the scan bit of the second word.
 										mask = bitScan << (heapBitsShift + h.shift)
 									}
 									atomic.Or8(h.bitp, mask)
 								}
 								// bulkBarrierPreWrite executes writebarrierptr_prewrite1
 								// for every pointer slot in the memory range [src, src+size),
 								// using pointer/scalar information from [dst, dst+size).
 								// This executes the write barriers necessary before a memmove.
 								// src, dst, and size must be pointer-aligned.
 								// The range [dst, dst+size) must lie within a single object.
 								//
 								// As a special case, src == 0 indicates that this is being used for a
 								// memclr. bulkBarrierPreWrite will pass 0 for the src of each write
 								// barrier.
 								//
 								// If dst is not in the heap, the gcRoots lists are searched for a root
 								// containing dst and that root's 1-bit bitmap is used instead; if no
 								// root contains dst, no barriers are executed.
 								//
 								// Callers should call bulkBarrierPreWrite immediately before
 								// calling memmove(dst, src, size). This function is marked nosplit
 								// to avoid being preempted; the GC must not stop the goroutine
 								// between the memmove and the execution of the barriers.
 								// The caller is also responsible for cgo pointer checks if this
 								// may be writing Go pointers into non-Go memory.
 								//
 								// The pointer bitmap is not maintained for allocations containing
 								// no pointers at all; any caller of bulkBarrierPreWrite must first
 								// make sure the underlying allocation contains pointers, usually
 								// by checking typ.kind&kindNoPointers.
 								//
 								//go:nosplit
 								func bulkBarrierPreWrite(dst, src, size uintptr) {
 									// The alignment check runs even when write barriers are off.
 									if (dst|src|size)&(sys.PtrSize-1) != 0 {
 										throw("bulkBarrierPreWrite: unaligned arguments")
 									}
 									if !writeBarrier.needed {
 										return
 									}
 									if !inheap(dst) {
 										// If dst is a global, use the data or BSS bitmaps to
 										// execute write barriers.
 										roots := gcRoots
 										for roots != nil {
 											for i := 0; i < roots.count; i++ {
 												pr := roots.roots[i]
 												addr := uintptr(pr.decl)
 												if addr <= dst && dst < addr+pr.size {
 													// dst lies in this root. Barriers are only needed
 													// when dst starts inside the root's pointer-bearing
 													// prefix (the first pr.ptrdata bytes).
 													if dst < addr+pr.ptrdata {
 														bulkBarrierBitmap(dst, src, size, dst-addr, pr.gcdata)
 													}
 													return
 												}
 											}
 											roots = roots.next
 										}
 										// dst is in neither the heap nor any registered root.
 										return
 									}
 									// Heap destination: walk the heap bitmap one word at a time.
 									h := heapBitsForAddr(dst)
 									if src == 0 {
 										// memclr case: every pointer slot is about to be overwritten with 0.
 										for i := uintptr(0); i < size; i += sys.PtrSize {
 											if h.isPointer() {
 												dstx := (*uintptr)(unsafe.Pointer(dst + i))
 												writebarrierptr_prewrite1(dstx, 0)
 											}
 											h = h.next()
 										}
 									} else {
 										// memmove case: pass the value that will be copied into each slot.
 										for i := uintptr(0); i < size; i += sys.PtrSize {
 											if h.isPointer() {
 												dstx := (*uintptr)(unsafe.Pointer(dst + i))
 												srcx := (*uintptr)(unsafe.Pointer(src + i))
 												writebarrierptr_prewrite1(dstx, *srcx)
 											}
 											h = h.next()
 										}
 									}
 								}
 								// bulkBarrierBitmap executes write barriers for copying from [src,
 								// src+size) to [dst, dst+size) using a 1-bit pointer bitmap. dst is
 								// assumed to start maskOffset bytes into the data covered by the
 								// bitmap in bits (which may not be a multiple of 8).
 								// (bulkBarrierPreWrite passes dst minus the root's base address as
 								// maskOffset.)
 								//
 								// As in bulkBarrierPreWrite, src == 0 means 0 is passed as the new
 								// value of every pointer slot.
 								//
 								// This is used by bulkBarrierPreWrite for writes to data and BSS.
 								//
 								//go:nosplit
 								func bulkBarrierBitmap(dst, src, size, maskOffset uintptr, bits *uint8) {
 									// Locate the bitmap byte and the bit within it for the first word.
 									word := maskOffset / sys.PtrSize
 									bits = addb(bits, word/8)
 									mask := uint8(1) << (word % 8)
 									for i := uintptr(0); i < size; i += sys.PtrSize {
 										// mask becomes 0 once it has been shifted past bit 7,
 										// meaning the current bitmap byte is exhausted.
 										if mask == 0 {
 											bits = addb(bits, 1)
 											if *bits == 0 {
 												// Skip 8 words.
 												// (7 here plus the loop increment; mask stays 0, so
 												// the next iteration moves on to the following byte.)
 												i += 7 * sys.PtrSize
 												continue
 											}
 											mask = 1
 										}
 										if *bits&mask != 0 {
 											dstx := (*uintptr)(unsafe.Pointer(dst + i))
 											if src == 0 {
 												writebarrierptr_prewrite1(dstx, 0)
 											} else {
 												srcx := (*uintptr)(unsafe.Pointer(src + i))
 												writebarrierptr_prewrite1(dstx, *srcx)
 											}
 										}
 										mask <<= 1
 									}
 								}
 								// typeBitsBulkBarrier executes writebarrierptr_prewrite for every
 								// pointer that would be copied from [src, src+size) to [dst,
 								// dst+size) by a memmove, locating the pointer slots with the
 								// type's 1-bit pointer bitmap (typ.gcdata).
 								//
 								// The type typ must correspond exactly to [src, src+size) and [dst, dst+size).
 								// dst, src, and size must be pointer-aligned.
 								// The type typ must have a plain bitmap, not a GC program.
 								// The only use of this function is in channel sends, and the
 								// 64 kB channel element limit takes care of this for us.
 								//
 								// It throws if typ is nil, if typ.size differs from size, or if typ
 								// uses a GC program. These checks run even when write barriers are off.
 								//
 								// Must not be preempted because it typically runs right before memmove,
 								// and the GC must observe them as an atomic action.
 								//
 								//go:nosplit
 								func typeBitsBulkBarrier(typ *_type, dst, src, size uintptr) {
 									if typ == nil {
 										throw("runtime: typeBitsBulkBarrier without type")
 									}
 									if size != typ.size {
 										println("runtime: typeBitsBulkBarrier with type ", *typ.string, " of size ", typ.size, " but memory size", size)
 										throw("runtime: invalid typeBitsBulkBarrier")
 									}
 									if usesProg := typ.kind&kindGCProg != 0; usesProg {
 										println("runtime: typeBitsBulkBarrier with type ", *typ.string, " with GC prog")
 										throw("runtime: invalid typeBitsBulkBarrier")
 									}
 									if !writeBarrier.needed {
 										return
 									}
 									maskp := typ.gcdata
 									var cur uint32
 									// Only the first typ.ptrdata bytes can contain pointers.
 									for off := uintptr(0); off < typ.ptrdata; off += sys.PtrSize {
 										if off&(sys.PtrSize*8-1) != 0 {
 											// Same mask byte: move on to the next word's bit.
 											cur >>= 1
 										} else {
 											// Every 8 words, load the next byte of the mask.
 											cur = uint32(*maskp)
 											maskp = addb(maskp, 1)
 										}
 										if cur&1 == 0 {
 											continue
 										}
 										dstx := (*uintptr)(unsafe.Pointer(dst + off))
 										srcx := (*uintptr)(unsafe.Pointer(src + off))
 										writebarrierptr_prewrite(dstx, *srcx)
 									}
 								}
 								// The methods operating on spans all require that h has been returned
 								// by heapBitsForSpan and that size, n, total are the span layout description
 								// returned by the mspan's layout method.
 								// If total > size*n, it means that there is extra leftover memory in the span,
 								// usually due to rounding.
 								//
 								// TODO(rsc): Perhaps introduce a different heapBitsSpan type.
 								// initSpan initializes the heap bitmap for a span and resets the
 								// span's allocation state (freeindex, allocCache, nelems, allocBits,
 								// gcmarkBits).
 								// It clears all checkmark bits.
 								// If this is a span of pointer-sized objects, it initializes all
 								// words to pointer/scan.
 								// Otherwise, it initializes all words to scalar/dead.
 								//
 								// It throws if the span's total size is not a multiple of
 								// heapBitmapScale.
 								func (h heapBits) initSpan(s *mspan) {
 									size, n, total := s.layout()

 									// Init the markbit structures.
 									s.freeindex = 0
 									s.allocCache = ^uint64(0) // all 1s indicating all free.
 									s.nelems = n
 									s.allocBits = nil
 									s.gcmarkBits = nil
 									s.gcmarkBits = newMarkBits(s.nelems)
 									s.allocBits = newAllocBits(s.nelems)

 									// Clear bits corresponding to objects.
 									if total%heapBitmapScale != 0 {
 										throw("initSpan: unaligned length")
 									}
 									nbyte := total / heapBitmapScale
 									// The span's bitmap occupies the nbyte bytes that end at h.bitp.
 									first := subtractb(h.bitp, nbyte-1)
 									if sys.PtrSize != 8 || size != sys.PtrSize {
 										memclrNoHeapPointers(unsafe.Pointer(first), nbyte)
 										return
 									}
 									// Span of pointer-sized objects: mark every word pointer/scan.
 									for bitp := first; ; bitp = add1(bitp) {
 										*bitp = bitPointerAll | bitScanAll
 										if bitp == h.bitp {
 											break
 										}
 									}
 								}
 								// initCheckmarkSpan initializes a span for being checkmarked.
 								// It clears the checkmark bits, which are set to 1 in normal operation.
 								// size and n describe the span's object size and object count; total
 								// is accepted but not used here.
 								func (h heapBits) initCheckmarkSpan(size, n, total uintptr) {
 									// The ptrSize != 8 test is a compile-time constant true on 32-bit,
 									// which leaves only this branch there.
 									if sys.PtrSize != 8 || size != sys.PtrSize {
 										// The checkmark is the scan bit of each object's second
 										// word: clear it, then hop to the next object.
 										stride := size / sys.PtrSize
 										for left := n; left > 0; left-- {
 											*h.bitp &^= bitScan << (heapBitsShift + h.shift)
 											h = h.forward(stride)
 										}
 										return
 									}
 									// Checkmark bit is type bit, bottom bit of every 2-bit entry.
 									// Only possible on 64-bit system, since minimum size is 8.
 									// Must clear type bit (checkmark bit) of every word.
 									// The type bit is the lower of every two-bit pair.
 									// One bitmap byte covers four objects.
 									for bitp, i := h.bitp, uintptr(0); i < n; i, bitp = i+4, subtract1(bitp) {
 										*bitp &^= bitPointerAll
 									}
 								}
 								// clearCheckmarkSpan undoes all the checkmarking in a span.
 								// The actual checkmark bits are ignored, so the only work to do
 								// is to fix the pointer bits. (Pointer bits are ignored by scanobject
 								// but consulted by typedmemmove.)
 								// Only spans of pointer-sized objects on 64-bit systems need any
 								// work; for every other span this is a no-op.
 								func (h heapBits) clearCheckmarkSpan(size, n, total uintptr) {
 									// The ptrSize != 8 test is a compile-time constant true on 32-bit
 									// and eliminates the rest of this function entirely.
 									if sys.PtrSize != 8 || size != sys.PtrSize {
 										return
 									}
 									// Checkmark bit is type bit, bottom bit of every 2-bit entry.
 									// Only possible on 64-bit system, since minimum size is 8.
 									// Must set the type bit (checkmark bit) of every word again.
 									// The type bit is the lower of every two-bit pair.
 									cur := h.bitp
 									for done := uintptr(0); done < n; done += 4 {
 										*cur |= bitPointerAll
 										cur = subtract1(cur)
 									}
 								}
 								// oneBitCount is indexed by byte and produces the
 								// number of 1 bits in that byte. For example 128 has 1 bit set
 								// and oneBitCount[128] holds 1.
 								//
 								// Each row below covers eight consecutive byte values; the table
 								// has exactly 256 entries.
 								var oneBitCount = [256]uint8{
 									0, 1, 1, 2, 1, 2, 2, 3,
 									1, 2, 2, 3, 2, 3, 3, 4,
 									1, 2, 2, 3, 2, 3, 3, 4,
 									2, 3, 3, 4, 3, 4, 4, 5,
 									1, 2, 2, 3, 2, 3, 3, 4,
 									2, 3, 3, 4, 3, 4, 4, 5,
 									2, 3, 3, 4, 3, 4, 4, 5,
 									3, 4, 4, 5, 4, 5, 5, 6,
 									1, 2, 2, 3, 2, 3, 3, 4,
 									2, 3, 3, 4, 3, 4, 4, 5,
 									2, 3, 3, 4, 3, 4, 4, 5,
 									3, 4, 4, 5, 4, 5, 5, 6,
 									2, 3, 3, 4, 3, 4, 4, 5,
 									3, 4, 4, 5, 4, 5, 5, 6,
 									3, 4, 4, 5, 4, 5, 5, 6,
 									4, 5, 5, 6, 5, 6, 6, 7,
 									1, 2, 2, 3, 2, 3, 3, 4,
 									2, 3, 3, 4, 3, 4, 4, 5,
 									2, 3, 3, 4, 3, 4, 4, 5,
 									3, 4, 4, 5, 4, 5, 5, 6,
 									2, 3, 3, 4, 3, 4, 4, 5,
 									3, 4, 4, 5, 4, 5, 5, 6,
 									3, 4, 4, 5, 4, 5, 5, 6,
 									4, 5, 5, 6, 5, 6, 6, 7,
 									2, 3, 3, 4, 3, 4, 4, 5,
 									3, 4, 4, 5, 4, 5, 5, 6,
 									3, 4, 4, 5, 4, 5, 5, 6,
 									4, 5, 5, 6, 5, 6, 6, 7,
 									3, 4, 4, 5, 4, 5, 5, 6,
 									4, 5, 5, 6, 5, 6, 6, 7,
 									4, 5, 5, 6, 5, 6, 6, 7,
 									5, 6, 6, 7, 6, 7, 7, 8}
 								// countFree runs through the mark bits in a span and counts the number
 								// of free objects in the span, computed as s.nelems minus the number
 								// of set bits among the first s.nelems bits of s.gcmarkBits.
 								// TODO:(rlh) Use popcount intrinsic.
 								func (s *mspan) countFree() int {
 									fullBytes, tailBits := s.nelems/8, s.nelems%8
 									marked := 0
 									for i := uintptr(0); i < fullBytes; i++ {
 										marked += int(oneBitCount[*addb(s.gcmarkBits, i)])
 									}
 									if tailBits != 0 {
 										// Only the low tailBits bits of the final byte belong
 										// to this span's objects; mask off the rest.
 										tailMask := uint8((1 << tailBits) - 1)
 										marked += int(oneBitCount[*addb(s.gcmarkBits, fullBytes)&tailMask])
 									}
 									return int(s.nelems) - marked
 								}
 								// heapBitsSetType records that the new allocation [x, x+size)
 								// holds in [x, x+dataSize) one or more values of type typ.
 								// (The number of values is given by dataSize / typ.size.)
 								// If dataSize < size, the fragment [x+dataSize, x+size) is
 								// recorded as non-pointer data.
 								// It is known that the type has pointers somewhere;
 								// malloc does not call heapBitsSetType when there are no pointers,
 								// because all free objects are marked as noscan during
 								// heapBitsSweepSpan.
 								//
 								// There can only be one allocation from a given span active at a time,
 								// and the bitmap for a span always falls on byte boundaries,
 								// so there are no write-write races for access to the heap bitmap.
 								// Hence, heapBitsSetType can access the bitmap without atomics.
 								//
 								// There can be read-write races between heapBitsSetType and things
 								// that read the heap bitmap like scanobject. However, since
 								// heapBitsSetType is only used for objects that have not yet been
 								// made reachable, readers will ignore bits being modified by this
 								// function. This does mean this function cannot transiently modify
 								// bits that belong to neighboring objects. Also, on weakly-ordered
 								// machines, callers must execute a store/store (publication) barrier
 								// between calling this function and making the object reachable.
 								func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
 									const doubleCheck = false // slow but helpful; enable to test modifications to this code
 									// dataSize is always size rounded up to the next malloc size class,
 									// except in the case of allocating a defer block, in which case
 									// size is sizeof(_defer{}) (at least 6 words) and dataSize may be
 									// arbitrarily larger.
 									//
 									// The checks for size == sys.PtrSize and size == 2*sys.PtrSize can therefore
 									// assume that dataSize == size without checking it explicitly.
 									if sys.PtrSize == 8 && size == sys.PtrSize {
 										// It's one word and it has pointers, it must be a pointer.
 										// Since all allocated one-word objects are pointers
 										// (non-pointers are aggregated into tinySize allocations),
 										// initSpan sets the pointer bits for us. Nothing to do here.
 										if doubleCheck {
 											h := heapBitsForAddr(x)
 											if !h.isPointer() {
 												throw("heapBitsSetType: pointer bit missing")
 											}
 											if !h.morePointers() {
 												throw("heapBitsSetType: scan bit missing")
 											}
 										}
 										return
 									}
 									h := heapBitsForAddr(x)
 									ptrmask := typ.gcdata // start of 1-bit pointer mask (or GC program, handled below)
 									// Heap bitmap bits for 2-word object are only 4 bits,
 									// so also shared with objects next to it.
 									// This is called out as a special case primarily for 32-bit systems,
 									// so that on 32-bit systems the code below can assume all objects
 									// are 4-word aligned (because they're all 16-byte aligned).
 									if size == 2*sys.PtrSize {
 										if typ.size == sys.PtrSize {
 											// We're allocating a block big enough to hold two pointers.
 											// On 64-bit, that means the actual object must be two pointers,
 											// or else we'd have used the one-pointer-sized block.
 											// On 32-bit, however, this is the 8-byte block, the smallest one.
 											// So it could be that we're allocating one pointer and this was
 											// just the smallest block available. Distinguish by checking dataSize.
 											// (In general the number of instances of typ being allocated is
 											// dataSize/typ.size.)
 											if sys.PtrSize == 4 && dataSize == sys.PtrSize {
 												// 1 pointer object. On 32-bit machines clear the bit for the
 												// unused second word.
 												*h.bitp &^= (bitPointer | bitScan | ((bitPointer | bitScan) << heapBitsShift)) << h.shift
 												*h.bitp |= (bitPointer | bitScan) << h.shift
 											} else {
 												// 2-element slice of pointer.
 												*h.bitp |= (bitPointer | bitScan | bitPointer<<heapBitsShift) << h.shift
 											}
 											return
 										}
 										// Otherwise typ.size must be 2*sys.PtrSize,
 										// and typ.kind&kindGCProg == 0.
 										if doubleCheck {
 											if typ.size != 2*sys.PtrSize || typ.kind&kindGCProg != 0 {
 												print("runtime: heapBitsSetType size=", size, " but typ.size=", typ.size, " gcprog=", typ.kind&kindGCProg != 0, "\n")
 												throw("heapBitsSetType")
 											}
 										}
 										b := uint32(*ptrmask)
 										hb := (b & 3) | bitScan
 										// bitPointer == 1, bitScan is 1 << 4, heapBitsShift is 1.
 										// 110011 is shifted h.shift and complemented.
 										// This clears out the bits that are about to be
 										// ored into *h.hbitp in the next instructions.
 										*h.bitp &^= (bitPointer | bitScan | ((bitPointer | bitScan) << heapBitsShift)) << h.shift
 										*h.bitp |= uint8(hb << h.shift)
 										return
 									}
 									// Copy from 1-bit ptrmask into 2-bit bitmap.
 									// The basic approach is to use a single uintptr as a bit buffer,
 									// alternating between reloading the buffer and writing bitmap bytes.
 									// In general, one load can supply two bitmap byte writes.
 									// This is a lot of lines of code, but it compiles into relatively few
 									// machine instructions.
 									var (
 										// Ptrmask input.
 										p     *byte   // last ptrmask byte read
 										b     uintptr // ptrmask bits already loaded
 										nb    uintptr // number of bits in b at next read
 										endp  *byte   // final ptrmask byte to read (then repeat)
 										endnb uintptr // number of valid bits in *endp
 										pbits uintptr // alternate source of bits
 										// Heap bitmap output.
 										w     uintptr // words processed
 										nw    uintptr // number of words to process
 										hbitp *byte   // next heap bitmap byte to write
 										hb    uintptr // bits being prepared for *hbitp
 									)
 									hbitp = h.bitp
 									// Handle GC program. Delayed until this part of the code
 									// so that we can use the same double-checking mechanism
 									// as the 1-bit case. Nothing above could have encountered
 									// GC programs: the cases were all too small.
 									if typ.kind&kindGCProg != 0 {
 										heapBitsSetTypeGCProg(h, typ.ptrdata, typ.size, dataSize, size, addb(typ.gcdata, 4))
 										if doubleCheck {
 											// Double-check the heap bits written by GC program
 											// by running the GC program to create a 1-bit pointer mask
 											// and then jumping to the double-check code below.
 											// This doesn't catch bugs shared between the 1-bit and 4-bit
 											// GC program execution, but it does catch mistakes specific
 											// to just one of those and bugs in heapBitsSetTypeGCProg's
 											// implementation of arrays.
 											lock(&debugPtrmask.lock)
 											if debugPtrmask.data == nil {
 												debugPtrmask.data = (*byte)(persistentalloc(1<<20, 1, &memstats.other_sys))
 											}
 											ptrmask = debugPtrmask.data
 											runGCProg(addb(typ.gcdata, 4), nil, ptrmask, 1)
 											goto Phase4
 										}
 										return
 									}
 									// Note about sizes:
 									//
 									// typ.size is the number of words in the object,
 									// and typ.ptrdata is the number of words in the prefix
 									// of the object that contains pointers. That is, the final
 									// typ.size - typ.ptrdata words contain no pointers.
 									// This allows optimization of a common pattern where
 									// an object has a small header followed by a large scalar
 									// buffer. If we know the pointers are over, we don't have
 									// to scan the buffer's heap bitmap at all.
 									// The 1-bit ptrmasks are sized to contain only bits for
 									// the typ.ptrdata prefix, zero padded out to a full byte
 									// of bitmap. This code sets nw (below) so that heap bitmap
 									// bits are only written for the typ.ptrdata prefix; if there is
 									// more room in the allocated object, the next heap bitmap
 									// entry is a 00, indicating that there are no more pointers
 									// to scan. So only the ptrmask for the ptrdata bytes is needed.
 									//
 									// Replicated copies are not as nice: if there is an array of
 									// objects with scalar tails, all but the last tail does have to
 									// be initialized, because there is no way to say "skip forward".
 									// However, because of the possibility of a repeated type with
 									// size not a multiple of 4 pointers (one heap bitmap byte),
 									// the code already must handle the last ptrmask byte specially
 									// by treating it as containing only the bits for endnb pointers,
 									// where endnb <= 4. We represent large scalar tails that must
 									// be expanded in the replication by setting endnb larger than 4.
 									// This will have the effect of reading many bits out of b,
 									// but once the real bits are shifted out, b will supply as many
 									// zero bits as we try to read, which is exactly what we need.
 									p = ptrmask
 									if typ.size < dataSize {
 										// Filling in bits for an array of typ.
 										// Set up for repetition of ptrmask during main loop.
 										// Note that ptrmask describes only a prefix of
 										const maxBits = sys.PtrSize*8 - 7
 										if typ.ptrdata/sys.PtrSize <= maxBits {
 											// Entire ptrmask fits in uintptr with room for a byte fragment.
 											// Load into pbits and never read from ptrmask again.
 											// This is especially important when the ptrmask has
 											// fewer than 8 bits in it; otherwise the reload in the middle
 											// of the Phase 2 loop would itself need to loop to gather
 											// at least 8 bits.
 											// Accumulate ptrmask into b.
 											// ptrmask is sized to describe only typ.ptrdata, but we record
 											// it as describing typ.size bytes, since all the high bits are zero.
 											nb = typ.ptrdata / sys.PtrSize
 											for i := uintptr(0); i < nb; i += 8 {
 												b |= uintptr(*p) << i
 												p = add1(p)
 											}
 											nb = typ.size / sys.PtrSize
 											// Replicate ptrmask to fill entire pbits uintptr.
 											// Doubling and truncating is fewer steps than
 											// iterating by nb each time. (nb could be 1.)
 											// Since we loaded typ.ptrdata/sys.PtrSize bits
 											// but are pretending to have typ.size/sys.PtrSize,
 											// there might be no replication necessary/possible.
 											pbits = b
 											endnb = nb
 											if nb+nb <= maxBits {
 												for endnb <= sys.PtrSize*8 {
 													pbits |= pbits << endnb
 													endnb += endnb
 												}
 												// Truncate to a multiple of original ptrmask.
 												endnb = maxBits / nb * nb
 												pbits &= 1<<endnb - 1
 												b = pbits
 												nb = endnb
 											}
 											// Clear p and endp as sentinel for using pbits.
 											// Checked during Phase 2 loop.
 											p = nil
 											endp = nil
 										} else {
 											// Ptrmask is larger. Read it multiple times.
 											n := (typ.ptrdata/sys.PtrSize+7)/8 - 1
 											endp = addb(ptrmask, n)
 											endnb = typ.size/sys.PtrSize - n*8
 										}
 									}
 									if p != nil {
 										b = uintptr(*p)
 										p = add1(p)
 										nb = 8
 									}
 									if typ.size == dataSize {
 										// Single entry: can stop once we reach the non-pointer data.
 										nw = typ.ptrdata / sys.PtrSize
 									} else {
 										// Repeated instances of typ in an array.
 										// Have to process first N-1 entries in full, but can stop
 										// once we reach the non-pointer data in the final entry.
 										nw = ((dataSize/typ.size-1)*typ.size + typ.ptrdata) / sys.PtrSize
 									}
 									if nw == 0 {
 										// No pointers! Caller was supposed to check.
 										println("runtime: invalid type ", *typ.string)
 										throw("heapBitsSetType: called with non-pointer type")
 										return
 									}
 									if nw < 2 {
 										// Must write at least 2 words, because the "no scan"
 										// encoding doesn't take effect until the third word.
 										nw = 2
 									}
 									// Phase 1: Special case for leading byte (shift==0) or half-byte (shift==4).
 									// The leading byte is special because it contains the bits for word 1,
 									// which does not have the scan bit set.
 									// The leading half-byte is special because it's a half a byte,
 									// so we have to be careful with the bits already there.
 									switch {
 									default:
 										throw("heapBitsSetType: unexpected shift")
 									case h.shift == 0:
 										// Ptrmask and heap bitmap are aligned.
 										// Handle first byte of bitmap specially.
 										//
 										// The first byte we write out covers the first four
 										// words of the object. The scan/dead bit on the first
 										// word must be set to scan since there are pointers
 										// somewhere in the object. The scan/dead bit on the
 										// second word is the checkmark, so we don't set it.
 										// In all following words, we set the scan/dead
 										// appropriately to indicate that the object contains
 										// to the next 2-bit entry in the bitmap.
 										//
 										// TODO: It doesn't matter if we set the checkmark, so
 										// maybe this case isn't needed any more.
 										hb = b & bitPointerAll
 										hb |= bitScan | bitScan<<(2*heapBitsShift) | bitScan<<(3*heapBitsShift)
 										if w += 4; w >= nw {
 											goto Phase3
 										}
 										*hbitp = uint8(hb)
 										hbitp = subtract1(hbitp)
 										b >>= 4
 										nb -= 4
 									case sys.PtrSize == 8 && h.shift == 2:
 										// Ptrmask and heap bitmap are misaligned.
 										// The bits for the first two words are in a byte shared
 										// with another object, so we must be careful with the bits
 										// already there.
 										// We took care of 1-word and 2-word objects above,
 										// so this is at least a 6-word object.
 										hb = (b & (bitPointer | bitPointer<<heapBitsShift)) << (2 * heapBitsShift)
 										// This is not noscan, so set the scan bit in the
 										// first word.
 										hb |= bitScan << (2 * heapBitsShift)
 										b >>= 2
 										nb -= 2
 										// Note: no bitScan for second word because that's
 										// the checkmark.
 										*hbitp &^= uint8((bitPointer | bitScan | (bitPointer << heapBitsShift)) << (2 * heapBitsShift))
 										*hbitp |= uint8(hb)
 										hbitp = subtract1(hbitp)
 										if w += 2; w >= nw {
 											// We know that there is more data, because we handled 2-word objects above.
 											// This must be at least a 6-word object. If we're out of pointer words,
 											// mark no scan in next bitmap byte and finish.
 											hb = 0
 											w += 4
 											goto Phase3
 										}
 									}
 									// Phase 2: Full bytes in bitmap, up to but not including write to last byte (full or partial) in bitmap.
 									// The loop computes the bits for that last write but does not execute the write;
 									// it leaves the bits in hb for processing by phase 3.
 									// To avoid repeated adjustment of nb, we subtract out the 4 bits we're going to
 									// use in the first half of the loop right now, and then we only adjust nb explicitly
 									// if the 8 bits used by each iteration isn't balanced by 8 bits loaded mid-loop.
 									nb -= 4
 									for {
 										// Emit bitmap byte.
 										// b has at least nb+4 bits, with one exception:
 										// if w+4 >= nw, then b has only nw-w bits,
 										// but we'll stop at the break and then truncate
 										// appropriately in Phase 3.
 										hb = b & bitPointerAll
 										hb |= bitScanAll
 										if w += 4; w >= nw {
 											break
 										}
 										*hbitp = uint8(hb)
 										hbitp = subtract1(hbitp)
 										b >>= 4
 										// Load more bits. b has nb right now.
 										if p != endp {
 											// Fast path: keep reading from ptrmask.
 											// nb unmodified: we just loaded 8 bits,
 											// and the next iteration will consume 8 bits,
 											// leaving us with the same nb the next time we're here.
 											if nb < 8 {
 												b |= uintptr(*p) << nb
 												p = add1(p)
 											} else {
 												// Reduce the number of bits in b.
 												// This is important if we skipped
 												// over a scalar tail, since nb could
 												// be larger than the bit width of b.
 												nb -= 8
 											}
 										} else if p == nil {
 											// Almost as fast path: track bit count and refill from pbits.
 											// For short repetitions.
 											if nb < 8 {
 												b |= pbits << nb
 												nb += endnb
 											}
 											nb -= 8 // for next iteration
 										} else {
 											// Slow path: reached end of ptrmask.
 											// Process final partial byte and rewind to start.
 											b |= uintptr(*p) << nb
 											nb += endnb
 											if nb < 8 {
 												b |= uintptr(*ptrmask) << nb
 												p = add1(ptrmask)
 											} else {
 												nb -= 8
 												p = ptrmask
 											}
 										}
 										// Emit bitmap byte.
 										hb = b & bitPointerAll
 										hb |= bitScanAll
 										if w += 4; w >= nw {
 											break
 										}
 										*hbitp = uint8(hb)
 										hbitp = subtract1(hbitp)
 										b >>= 4
 									}
 								Phase3:
 									// Phase 3: Write last byte or partial byte and zero the rest of the bitmap entries.
 									if w > nw {
 										// Counting the 4 entries in hb not yet written to memory,
 										// there are more entries than possible pointer slots.
 										// Discard the excess entries (can't be more than 3).
 										mask := uintptr(1)<<(4-(w-nw)) - 1
 										hb &= mask | mask<<4 // apply mask to both pointer bits and scan bits
 									}
 									// Change nw from counting possibly-pointer words to total words in allocation.
 									nw = size / sys.PtrSize
 									// Write whole bitmap bytes.
 									// The first is hb, the rest are zero.
 									if w <= nw {
 										*hbitp = uint8(hb)
 										hbitp = subtract1(hbitp)
 										hb = 0 // for possible final half-byte below
 										for w += 4; w <= nw; w += 4 {
 											*hbitp = 0
 											hbitp = subtract1(hbitp)
 										}
 									}
 									// Write final partial bitmap byte if any.
 									// We know w > nw, or else we'd still be in the loop above.
 									// It can be bigger only due to the 4 entries in hb that it counts.
 									// If w == nw+4 then there's nothing left to do: we wrote all nw entries
 									// and can discard the 4 sitting in hb.
 									// But if w == nw+2, we need to write first two in hb.
 									// The byte is shared with the next object, so be careful with
 									// existing bits.
 									if w == nw+2 {
 										*hbitp = *hbitp&^(bitPointer|bitScan|(bitPointer|bitScan)<<heapBitsShift) | uint8(hb)
 									}
 								Phase4:
 									// Phase 4: all done, but perhaps double check.
 									if doubleCheck {
 										end := heapBitsForAddr(x + size)
 										if typ.kind&kindGCProg == 0 && (hbitp != end.bitp || (w == nw+2) != (end.shift == 2)) {
 											println("ended at wrong bitmap byte for", *typ.string, "x", dataSize/typ.size)
 											print("typ.size=", typ.size, " typ.ptrdata=", typ.ptrdata, " dataSize=", dataSize, " size=", size, "\n")
 											print("w=", w, " nw=", nw, " b=", hex(b), " nb=", nb, " hb=", hex(hb), "\n")
 											h0 := heapBitsForAddr(x)
 											print("initial bits h0.bitp=", h0.bitp, " h0.shift=", h0.shift, "\n")
 											print("ended at hbitp=", hbitp, " but next starts at bitp=", end.bitp, " shift=", end.shift, "\n")
 											throw("bad heapBitsSetType")
 										}
 										// Double-check that bits to be written were written correctly.
 										// Does not check that other bits were not written, unfortunately.
 										h := heapBitsForAddr(x)
 										nptr := typ.ptrdata / sys.PtrSize
 										ndata := typ.size / sys.PtrSize
 										count := dataSize / typ.size
 										totalptr := ((count-1)*typ.size + typ.ptrdata) / sys.PtrSize
 										for i := uintptr(0); i < size/sys.PtrSize; i++ {
 											j := i % ndata
 											var have, want uint8
 											have = (*h.bitp >> h.shift) & (bitPointer | bitScan)
 											if i >= totalptr {
 												want = 0 // deadmarker
 												if typ.kind&kindGCProg != 0 && i < (totalptr+3)/4*4 {
 													want = bitScan
 												}
 											} else {
 												if j < nptr && (*addb(ptrmask, j/8)>>(j%8))&1 != 0 {
 													want |= bitPointer
 												}
 												if i != 1 {
 													want |= bitScan
 												} else {
 													have &^= bitScan
 												}
 											}
 											if have != want {
 												println("mismatch writing bits for", *typ.string, "x", dataSize/typ.size)
 												print("typ.size=", typ.size, " typ.ptrdata=", typ.ptrdata, " dataSize=", dataSize, " size=", size, "\n")
 												print("kindGCProg=", typ.kind&kindGCProg != 0, "\n")
 												print("w=", w, " nw=", nw, " b=", hex(b), " nb=", nb, " hb=", hex(hb), "\n")
 												h0 := heapBitsForAddr(x)
 												print("initial bits h0.bitp=", h0.bitp, " h0.shift=", h0.shift, "\n")
 												print("current bits h.bitp=", h.bitp, " h.shift=", h.shift, " *h.bitp=", hex(*h.bitp), "\n")
 												print("ptrmask=", ptrmask, " p=", p, " endp=", endp, " endnb=", endnb, " pbits=", hex(pbits), " b=", hex(b), " nb=", nb, "\n")
 												println("at word", i, "offset", i*sys.PtrSize, "have", have, "want", want)
 												if typ.kind&kindGCProg != 0 {
 													println("GC program:")
 													dumpGCProg(addb(typ.gcdata, 4))
 												}
 												throw("bad heapBitsSetType")
 											}
 											h = h.next()
 										}
 										if ptrmask == debugPtrmask.data {
 											unlock(&debugPtrmask.lock)
 										}
 									}
 								}
 								// heapBitsSetTypeNoScan records that the object at x is noscan.
 								// It clears both the pointer bit and the scan bit in the heap bitmap
 								// entry for the first word of x, leaving that entry scalar/dead.
 								func heapBitsSetTypeNoScan(x uintptr) {
 									entry := heapBitsForAddr(x)
 									clearMask := uint8(bitPointer|bitScan) << entry.shift
 									*entry.bitp &^= clearMask
 								}
 								// debugPtrmask is scratch space used only by the doubleCheck path of
 								// heapBitsSetType when the type is described by a GC program: the
 								// program is expanded into a 1-bit pointer mask stored here so the
 								// heap bits just written can be verified against it.
 								var debugPtrmask struct {
 									// lock is acquired before data is filled in and released once
 									// the double check that reads data has finished.
 									lock mutex
 									// data is allocated on first use (1<<20 bytes via persistentalloc)
 									// and reused afterwards.
 									data *byte
 								}
 								// heapBitsSetTypeGCProg implements heapBitsSetType using a GC program.
 								//
 								//	progSize  - size of the memory described by the program
 								//	elemSize  - size of the element that the program describes (a prefix of)
 								//	dataSize  - total size of the intended data, a multiple of elemSize
 								//	allocSize - total size of the allocated memory
 								//
 								// GC programs are only used for large allocations.
 								// allocSize must be a multiple of 4 words (checked on 64-bit systems),
 								// so that the relevant bitmap bytes are not shared with surrounding
 								// objects.
 								func heapBitsSetTypeGCProg(h heapBits, progSize, elemSize, dataSize, allocSize uintptr, prog *byte) {
 									if sys.PtrSize == 8 && allocSize%(4*sys.PtrSize) != 0 {
 										// Alignment will be wrong.
 										throw("heapBitsSetTypeGCProg: small allocation")
 									}

 									var totalBits uintptr
 									if elemSize != dataSize {
 										// Array of elements. Run prog followed by a trailer that does:
 										//	literal(0)
 										//	repeat(1, elemSize-progSize-1) // zeros to fill element size
 										//	repeat(elemSize, count-1) // repeat that element for count
 										// The first two steps zero-pad what remains of the first element;
 										// the last one copies that element across the rest of the array.
 										count := dataSize / elemSize
 										elemWords := elemSize / sys.PtrSize

 										var trailer [40]byte // 3 varints (max 10 each) + some bytes
 										pos := 0
 										if pad := elemWords - progSize/sys.PtrSize; pad > 0 {
 											// literal(0)
 											trailer[0], trailer[1] = 0x01, 0
 											pos = 2
 											if pad > 1 {
 												// repeat(1, pad-1)
 												trailer[pos] = 0x81
 												pos = gcProgTrailerVarint(&trailer, pos+1, pad-1)
 											}
 										}
 										// repeat(elemSize/ptrSize, count-1)
 										trailer[pos] = 0x80
 										pos = gcProgTrailerVarint(&trailer, pos+1, elemWords)
 										pos = gcProgTrailerVarint(&trailer, pos, count-1)
 										// stop
 										trailer[pos] = 0

 										runGCProg(prog, &trailer[0], h.bitp, 2)

 										// The whole array has been filled in, but report only the
 										// bits up to the ptrdata of the last element. The clear at
 										// the end of this function then wipes the dead section of
 										// the final element, so that scanobject can stop early there.
 										totalBits = (elemSize*(count-1) + progSize) / sys.PtrSize
 									} else {
 										// Single element: the program alone describes everything.
 										totalBits = runGCProg(prog, nil, h.bitp, 2)
 										if totalBits*sys.PtrSize != progSize {
 											println("runtime: heapBitsSetTypeGCProg: total bits", totalBits, "but progSize", progSize)
 											throw("heapBitsSetTypeGCProg: unexpected bit count")
 										}
 									}

 									// Zero the bitmap bytes between the end of the described bits and
 									// the end of the allocation. The bitmap grows toward lower
 									// addresses, so the allocation's last bitmap byte is the lowest.
 									progEnd := unsafe.Pointer(subtractb(h.bitp, (totalBits+3)/4))
 									allocEnd := unsafe.Pointer(subtractb(h.bitp, allocSize/heapBitmapScale))
 									memclrNoHeapPointers(add(allocEnd, 1), uintptr(progEnd)-uintptr(allocEnd))
 								}

 								// gcProgTrailerVarint stores v into buf starting at index pos using the
 								// GC program varint encoding (7 bits per byte, low bits first, high bit
 								// set on every byte but the last) and returns the index just past it.
 								func gcProgTrailerVarint(buf *[40]byte, pos int, v uintptr) int {
 									for v >= 0x80 {
 										buf[pos] = byte(v | 0x80)
 										pos++
 										v >>= 7
 									}
 									buf[pos] = byte(v)
 									return pos + 1
 								}
 								// progToPointerMask runs the GC program prog and returns the 1-bit
 								// pointer mask it produces.
 								// size is the size, in bytes, of the region described by prog.
 								// The resulting bitvector will have no more than size/sys.PtrSize bits.
 								// The mask is stored in memory obtained from persistentalloc, with one
 								// extra guard byte at the end that is checked after the program has run.
 								func progToPointerMask(prog *byte, size uintptr) bitvector {
 									const sentinel = 0xa1 // overflow check sentinel

 									nbytes := (size/sys.PtrSize + 7) / 8
 									buf := (*[1 << 30]byte)(persistentalloc(nbytes+1, 1, &memstats.buckhash_sys))[:nbytes+1]
 									guard := &buf[nbytes]
 									*guard = sentinel

 									nbits := runGCProg(prog, nil, &buf[0], 1)
 									if *guard != sentinel {
 										throw("progToPointerMask: overflow")
 									}
 									return bitvector{int32(nbits), &buf[0]}
 								}
 								// Packed GC pointer bitmaps, aka GC programs.
 								//
 								// For large types containing arrays, the type information has a
 								// natural repetition that can be encoded to save space in the
 								// binary and in the memory representation of the type information.
 								//
 								// The encoding is a simple Lempel-Ziv style bytecode machine
 								// with the following instructions:
 								//
 								//	00000000: stop
 								//	0nnnnnnn: emit n bits copied from the next (n+7)/8 bytes
 								//	10000000 n c: repeat the previous n bits c times; n, c are varints
 								//	1nnnnnnn c: repeat the previous n bits c times; c is a varint
 								// runGCProg executes the GC program prog, and then trailer if non-nil,
 								// writing to dst with entries of the given size.
 								// If size == 1, dst is a 1-bit pointer mask laid out moving forward from dst.
 								// If size == 2, dst is the 2-bit heap bitmap, and writes move backward
 								// starting at dst (because the heap bitmap does). In this case, the caller guarantees
 								// that only whole bytes in dst need to be written.
 								//
 								// Both prog and trailer are instruction streams in the packed GC program
 								// encoding. Execution only ends at a stop instruction (a zero byte), so
 								// each stream must contain one.
 								// When size == 2, every byte written carries four pointer bits in its
 								// low half and has bitScanAll ORed in.
 								//
 								// runGCProg returns the number of 1- or 2-bit entries written to memory.
 								// The count is computed before the final partial byte is padded out, so
 								// padding entries are not included in it.
 								func runGCProg(prog, trailer, dst *byte, size int) uintptr {
 									// Remember the starting position to compute the entry count at the end.
 									dstStart := dst
 									// Bits waiting to be written to memory.
 									var bits uintptr
 									var nbits uintptr
 									p := prog
 								Run:
 									for {
 										// Flush accumulated full bytes.
 										// The rest of the loop assumes that nbits <= 7.
 										for ; nbits >= 8; nbits -= 8 {
 											if size == 1 {
 												*dst = uint8(bits)
 												dst = add1(dst)
 												bits >>= 8
 											} else {
 												// 8 pending bits become two heap bitmap bytes of 4 entries each.
 												v := bits&bitPointerAll | bitScanAll
 												*dst = uint8(v)
 												dst = subtract1(dst)
 												bits >>= 4
 												v = bits&bitPointerAll | bitScanAll
 												*dst = uint8(v)
 												dst = subtract1(dst)
 												bits >>= 4
 											}
 										}
 										// Process one instruction.
 										inst := uintptr(*p)
 										p = add1(p)
 										n := inst & 0x7F
 										if inst&0x80 == 0 {
 											// Literal bits; n == 0 means end of program.
 											if n == 0 {
 												// Program is over; continue in trailer if present.
 												if trailer != nil {
 													//println("trailer")
 													p = trailer
 													// Clear trailer so that its own stop instruction ends the run.
 													trailer = nil
 													continue
 												}
 												//println("done")
 												break Run
 											}
 											//println("lit", n, dst)
 											// Whole bytes of literal data pass through the bit buffer:
 											// 8 bits enter and 8 bits leave, so nbits is unchanged.
 											nbyte := n / 8
 											for i := uintptr(0); i < nbyte; i++ {
 												bits |= uintptr(*p) << nbits
 												p = add1(p)
 												if size == 1 {
 													*dst = uint8(bits)
 													dst = add1(dst)
 													bits >>= 8
 												} else {
 													v := bits&0xf | bitScanAll
 													*dst = uint8(v)
 													dst = subtract1(dst)
 													bits >>= 4
 													v = bits&0xf | bitScanAll
 													*dst = uint8(v)
 													dst = subtract1(dst)
 													bits >>= 4
 												}
 											}
 											// Remaining n%8 literal bits stay pending in the bit buffer.
 											if n %= 8; n > 0 {
 												bits |= uintptr(*p) << nbits
 												p = add1(p)
 												nbits += n
 											}
 											continue Run
 										}
 										// Repeat. If n == 0, it is encoded in a varint in the next bytes.
 										if n == 0 {
 											for off := uint(0); ; off += 7 {
 												x := uintptr(*p)
 												p = add1(p)
 												n |= (x & 0x7F) << off
 												if x&0x80 == 0 {
 													break
 												}
 											}
 										}
 										// Count is encoded in a varint in the next bytes.
 										c := uintptr(0)
 										for off := uint(0); ; off += 7 {
 											x := uintptr(*p)
 											p = add1(p)
 											c |= (x & 0x7F) << off
 											if x&0x80 == 0 {
 												break
 											}
 										}
 										c *= n // now total number of bits to copy
 										// If the number of bits being repeated is small, load them
 										// into a register and use that register for the entire loop
 										// instead of repeatedly reading from memory.
 										// Handling fewer than 8 bits here makes the general loop simpler.
 										// The cutoff is sys.PtrSize*8 - 7 to guarantee that when we add
 										// the pattern to a bit buffer holding at most 7 bits (a partial byte)
 										// it will not overflow.
 										src := dst
 										const maxBits = sys.PtrSize*8 - 7
 										if n <= maxBits {
 											// Start with bits in output buffer.
 											pattern := bits
 											npattern := nbits
 											// If we need more bits, fetch them from memory.
 											// src steps in the opposite direction to dst, so the most
 											// recently written output is read first and ends up in the
 											// low-order end of the bits gathered so far.
 											if size == 1 {
 												src = subtract1(src)
 												for npattern < n {
 													pattern <<= 8
 													pattern |= uintptr(*src)
 													src = subtract1(src)
 													npattern += 8
 												}
 											} else {
 												src = add1(src)
 												for npattern < n {
 													pattern <<= 4
 													pattern |= uintptr(*src) & 0xf
 													src = add1(src)
 													npattern += 4
 												}
 											}
 											// We started with the whole bit output buffer,
 											// and then we loaded bits from whole bytes.
 											// Either way, we might now have too many instead of too few.
 											// Discard the extra.
 											if npattern > n {
 												pattern >>= npattern - n
 												npattern = n
 											}
 											// Replicate pattern to at most maxBits.
 											if npattern == 1 {
 												// One bit being repeated.
 												// If the bit is 1, make the pattern all 1s.
 												// If the bit is 0, the pattern is already all 0s,
 												// but we can claim that the number of bits
 												// in the word is equal to the number we need (c),
 												// because right shift of bits will zero fill.
 												if pattern == 1 {
 													pattern = 1<<maxBits - 1
 													npattern = maxBits
 												} else {
 													npattern = c
 												}
 											} else {
 												b := pattern
 												nb := npattern
 												if nb+nb <= maxBits {
 													// Double pattern until the whole uintptr is filled.
 													for nb <= sys.PtrSize*8 {
 														b |= b << nb
 														nb += nb
 													}
 													// Trim away incomplete copy of original pattern in high bits.
 													// TODO(rsc): Replace with table lookup or loop on systems without divide?
 													nb = maxBits / npattern * npattern
 													b &= 1<<nb - 1
 													pattern = b
 													npattern = nb
 												}
 											}
 											// Add pattern to bit buffer and flush bit buffer, c/npattern times.
 											// Since pattern contains >8 bits, there will be full bytes to flush
 											// on each iteration.
 											for ; c >= npattern; c -= npattern {
 												bits |= pattern << nbits
 												nbits += npattern
 												if size == 1 {
 													for nbits >= 8 {
 														*dst = uint8(bits)
 														dst = add1(dst)
 														bits >>= 8
 														nbits -= 8
 													}
 												} else {
 													for nbits >= 4 {
 														*dst = uint8(bits&0xf | bitScanAll)
 														dst = subtract1(dst)
 														bits >>= 4
 														nbits -= 4
 													}
 												}
 											}
 											// Add final fragment to bit buffer.
 											if c > 0 {
 												pattern &= 1<<c - 1
 												bits |= pattern << nbits
 												nbits += c
 											}
 											continue Run
 										}
 										// Repeat; n too large to fit in a register.
 										// Since nbits <= 7, we know the first few bytes of repeated data
 										// are already written to memory.
 										off := n - nbits // n > nbits because n > maxBits and nbits <= 7
 										if size == 1 {
 											// Leading src fragment.
 											src = subtractb(src, (off+7)/8)
 											if frag := off & 7; frag != 0 {
 												bits |= uintptr(*src) >> (8 - frag) << nbits
 												src = add1(src)
 												nbits += frag
 												c -= frag
 											}
 											// Main loop: load one byte, write another.
 											// The bits are rotating through the bit buffer.
 											for i := c / 8; i > 0; i-- {
 												bits |= uintptr(*src) << nbits
 												src = add1(src)
 												*dst = uint8(bits)
 												dst = add1(dst)
 												bits >>= 8
 											}
 											// Final src fragment.
 											if c %= 8; c > 0 {
 												bits |= (uintptr(*src) & (1<<c - 1)) << nbits
 												nbits += c
 											}
 										} else {
 											// Leading src fragment.
 											src = addb(src, (off+3)/4)
 											if frag := off & 3; frag != 0 {
 												bits |= (uintptr(*src) & 0xf) >> (4 - frag) << nbits
 												src = subtract1(src)
 												nbits += frag
 												c -= frag
 											}
 											// Main loop: load one byte, write another.
 											// The bits are rotating through the bit buffer.
 											for i := c / 4; i > 0; i-- {
 												bits |= (uintptr(*src) & 0xf) << nbits
 												src = subtract1(src)
 												*dst = uint8(bits&0xf | bitScanAll)
 												dst = subtract1(dst)
 												bits >>= 4
 											}
 											// Final src fragment.
 											if c %= 4; c > 0 {
 												bits |= (uintptr(*src) & (1<<c - 1)) << nbits
 												nbits += c
 											}
 										}
 									}
 									// Write any final bits out, using full-byte writes, even for the final byte.
 									var totalBits uintptr
 									if size == 1 {
 										// Bytes already written hold 8 entries each; add the pending ones.
 										totalBits = (uintptr(unsafe.Pointer(dst))-uintptr(unsafe.Pointer(dstStart)))*8 + nbits
 										// Round nbits up to a multiple of 8 so the loop emits the partial byte too.
 										nbits += -nbits & 7
 										for ; nbits > 0; nbits -= 8 {
 											*dst = uint8(bits)
 											dst = add1(dst)
 											bits >>= 8
 										}
 									} else {
 										// dst moved backward, and each byte written holds 4 entries.
 										totalBits = (uintptr(unsafe.Pointer(dstStart))-uintptr(unsafe.Pointer(dst)))*4 + nbits
 										// Round nbits up to a multiple of 4 so the loop emits the partial byte too.
 										nbits += -nbits & 3
 										for ; nbits > 0; nbits -= 4 {
 											v := bits&0xf | bitScanAll
 											*dst = uint8(v)
 											dst = subtract1(dst)
 											bits >>= 4
 										}
 									}
 									return totalBits
 								}
 								// dumpGCProg prints a listing of the GC program starting at p, one
 								// instruction per line. Each line begins with the running count of
 								// bits described by the instructions before it. Printing stops after
 								// the first stop instruction (a zero byte).
 								func dumpGCProg(p *byte) {
 									nptr := 0
 									for {
 										op := *p
 										p = add1(p)
 										switch {
 										case op == 0:
 											// Stop instruction.
 											print("\t", nptr, " end\n")
 											return

 										case op&0x80 == 0:
 											// Literal: op bits follow, packed into (op+7)/8 bytes.
 											print("\t", nptr, " lit ", op, ":")
 											for left := int(op+7) / 8; left > 0; left-- {
 												print(" ", hex(*p))
 												p = add1(p)
 											}
 											print("\n")
 											nptr += int(op)

 										default:
 											// Repeat: the width is in the low 7 bits of op, or in a
 											// varint that follows when those bits are zero.
 											nbit := int(op & 0x7f)
 											if nbit == 0 {
 												shift := uint(0)
 												for {
 													b := *p
 													p = add1(p)
 													nbit |= int(b&0x7f) << shift
 													if b&0x80 == 0 {
 														break
 													}
 													shift += 7
 												}
 											}
 											// The repeat count is always a varint.
 											count := 0
 											shift := uint(0)
 											for {
 												b := *p
 												p = add1(p)
 												count |= int(b&0x7f) << shift
 												if b&0x80 == 0 {
 													break
 												}
 												shift += 7
 											}
 											print("\t", nptr, " repeat ", nbit, " × ", count, "\n")
 											nptr += nbit * count
 										}
 									}
 								}
 								// Testing.
 								// reflect_gcbits implements reflect.gcbits, which exists for testing.
 								// It returns the GC type info for x as bitmap entries (0 or 1), one
 								// entry per byte. Trailing zero entries are dropped, but never below
 								// the number of words covered by the element type's ptrdata.
 								//go:linkname reflect_gcbits reflect.gcbits
 								func reflect_gcbits(x interface{}) []byte {
 									mask := getgcmask(x)
 									elem := (*ptrtype)(unsafe.Pointer(efaceOf(&x)._type)).elem
 									minLen := elem.ptrdata / sys.PtrSize
 									end := len(mask)
 									for uintptr(end) > minLen && mask[end-1] == 0 {
 										end--
 									}
 									return mask[:end]
 								}
 								// getgcmask returns GC type info for the object that ep points to, for testing.
 								// ep's dynamic type is treated as a pointer type and its data word as the
 								// address to look up.
 								//
 								// The result holds one byte per pointer-sized word:
 								//   - if the address lies inside a registered root (data or bss), the mask
 								//     is sized from the pointed-to type and filled from the root's gcdata;
 								//   - if the address is in the heap, each entry is 1 when the heap bitmap
 								//     marks that word as a pointer, and the mask is cut off at the first
 								//     word (other than the second) whose bitmap entry says no more
 								//     pointers follow;
 								//   - otherwise the result is nil.
 								func getgcmask(ep interface{}) (mask []byte) {
 									e := *efaceOf(&ep)
 									p := e.data
 									t := e._type
 									// data or bss
 									for roots := gcRoots; roots != nil; roots = roots.next {
 										for i := 0; i < roots.count; i++ {
 											pr := roots.roots[i]
 											addr := uintptr(pr.decl)
 											if addr <= uintptr(p) && uintptr(p) < addr+pr.size {
 												n := (*ptrtype)(unsafe.Pointer(t)).elem.size
 												mask = make([]byte, n/sys.PtrSize)
 												// NOTE(review): this copies up to pr.ptrdata raw bytes of
 												// pr.gcdata, starting at the root's beginning rather than
 												// at p's offset within it; confirm that this matches the
 												// one-entry-per-byte format callers expect.
 												copy(mask, (*[1 << 29]uint8)(unsafe.Pointer(pr.gcdata))[:pr.ptrdata])
 												// Return only on a match, so that non-matching roots fall
 												// through to the remaining roots and then the heap lookup.
 												return
 											}
 										}
 									}
 									// heap
 									var n uintptr
 									var base uintptr
 									if mlookup(uintptr(p), &base, &n, nil) != 0 {
 										mask = make([]byte, n/sys.PtrSize)
 										for i := uintptr(0); i < n; i += sys.PtrSize {
 											hbits := heapBitsForAddr(base + i)
 											if hbits.isPointer() {
 												mask[i/sys.PtrSize] = 1
 											}
 											// The second word is exempt: its high bit is the checkmark
 											// bit, not the scan/dead bit.
 											if i != 1*sys.PtrSize && !hbits.morePointers() {
 												mask = mask[:i/sys.PtrSize]
 												break
 											}
 										}
 										return
 									}
 									// otherwise, not something the GC knows about.
 									// possibly read-only data, like malloc(0).
 									// must not have pointers
 									// For gccgo, may live on the stack, which is collected conservatively.
 									return
 								}