runtime: simpler and faster GC

Implement the design described in: https://docs.google.com/document/d/1v4Oqa0WwHunqlb8C3ObL_uNQw3DfSY-ztoA-4wWbKcg/pub Summary of the changes: GC uses "2-bits per word" pointer type info embedded directly into the bitmap. Scanning of stacks/data/heap is unified. The old span types go away. Compiler generates "sparse" 4-bit type info for GC (directly for GC bitmap). Linker generates "dense" 2-bit type info for data/bss (the same as stacks use). Summary of results: -1680 lines of code total (-1000+ in mgc0.c only) -25% memory consumption -3-7% binary size -15% GC pause reduction -7% run time reduction LGTM=khr R=golang-codereviews, rsc, christoph, khr CC=golang-codereviews, rlh https://golang.org/cl/106260045
2025-12-08 06:10:04 +00:00 · 2014-07-29 11:01:02 +04:00 · 2014-07-29 11:01:02 +04:00 · cd17a717f9
commit cd17a717f9
parent 0100afbdcc
27 changed files with 1632 additions and 2413 deletions
--- a/src/cmd/gc/go.h
+++ b/src/cmd/gc/go.h
@ -381,7 +381,6 @@ enum
 	SymExported	= 1<<2,	// already written out by export
 	SymUniq		= 1<<3,
 	SymSiggen	= 1<<4,
-	SymGcgen	= 1<<5,
 };

 struct	Sym
@ -1515,6 +1514,7 @@ void	movelarge(NodeList*);
 int	isfat(Type*);
 void	linkarchinit(void);
 void	liveness(Node*, Prog*, Sym*, Sym*);
+void	twobitwalktype1(Type*, vlong*, Bvec*);
 void	markautoused(Prog*);
 Plist*	newplist(void);
 Node*	nodarg(Type*, int);
--- a/src/cmd/gc/plive.c
+++ b/src/cmd/gc/plive.c
@ -19,8 +19,7 @@
 #include "opt.h"
 #include "../ld/textflag.h"
 #include "../../pkg/runtime/funcdata.h"
-
-enum { BitsPerPointer = 2 };
+#include "../../pkg/runtime/mgc0.h"

 enum {
 	UNVISITED = 0,
@ -1040,7 +1039,7 @@ checkptxt(Node *fn, Prog *firstp)
 // and then simply copied into bv at the correct offset on future calls with
 // the same type t. On https://rsc.googlecode.com/hg/testdata/slow.go, twobitwalktype1
 // accounts for 40% of the 6g execution time.
-static void
+void
 twobitwalktype1(Type *t, vlong *xoffset, Bvec *bv)
 {
 	vlong fieldoffset;
--- a/src/cmd/gc/reflect.c
+++ b/src/cmd/gc/reflect.c
@ -7,6 +7,7 @@
 #include "go.h"
 #include "../ld/textflag.h"
 #include "../../pkg/runtime/mgc0.h"
+#include "../../pkg/runtime/typekind.h"

 /*
 * runtime interface and reflection data structures
@ -16,7 +17,9 @@ static	NodeList*	signatlist;
 static	Sym*	dtypesym(Type*);
 static	Sym*	weaktypesym(Type*);
 static	Sym*	dalgsym(Type*);
-static	Sym*	dgcsym(Type*);
+static	int	usegcprog(Type*);
+static	void	gengcprog(Type*, Sym**, Sym**);
+static	void	gengcmask(Type*, uint8[16]);

 static int
 sigcmp(Sig *a, Sig *b)
@ -612,37 +615,6 @@ dextratype(Sym *sym, int off, Type *t, int ptroff)
 	return ot;
 }

-enum {
-	KindBool = 1,
-	KindInt,
-	KindInt8,
-	KindInt16,
-	KindInt32,
-	KindInt64,
-	KindUint,
-	KindUint8,
-	KindUint16,
-	KindUint32,
-	KindUint64,
-	KindUintptr,
-	KindFloat32,
-	KindFloat64,
-	KindComplex64,
-	KindComplex128,
-	KindArray,
-	KindChan,
-	KindFunc,
-	KindInterface,
-	KindMap,
-	KindPtr,
-	KindSlice,
-	KindString,
-	KindStruct,
-	KindUnsafePointer,
-
-	KindNoPointers = 1<<7,
-};
-
 static int
 kinds[] =
 {
@ -746,8 +718,9 @@ haspointers(Type *t)
 static int
 dcommontype(Sym *s, int ot, Type *t)
 {
-	int i, alg, sizeofAlg;
-	Sym *sptr, *algsym, *zero;
+	int i, alg, sizeofAlg, gcprog;
+	Sym *sptr, *algsym, *zero, *gcprog0, *gcprog1;
+	uint8 gcmask[16];
 	static Sym *algarray;
 	char *p;
 	
@ -809,17 +782,32 @@ dcommontype(Sym *s, int ot, Type *t)
 	ot = duint8(s, ot, t->align);	// align
 	ot = duint8(s, ot, t->align);	// fieldAlign

+	gcprog = usegcprog(t);
 	i = kinds[t->etype];
 	if(t->etype == TARRAY && t->bound < 0)
 		i = KindSlice;
 	if(!haspointers(t))
 		i |= KindNoPointers;
+	if(gcprog)
+		i |= KindGCProg;
 	ot = duint8(s, ot, i);  // kind
 	if(alg >= 0)
 		ot = dsymptr(s, ot, algarray, alg*sizeofAlg);
 	else
 		ot = dsymptr(s, ot, algsym, 0);
-	ot = dsymptr(s, ot, dgcsym(t), 0);  // gc
+	// gc
+	if(gcprog) {
+		gengcprog(t, &gcprog0, &gcprog1);
+		if(gcprog0 != S)
+			ot = dsymptr(s, ot, gcprog0, 0);
+		else
+			ot = duintptr(s, ot, 0);
+		ot = dsymptr(s, ot, gcprog1, 0);
+	} else {
+		gengcmask(t, gcmask);
+		for(i = 0; i < 2*widthptr; i++)
+			ot = duint8(s, ot, gcmask[i]);
+	}
 	p = smprint("%-uT", t);
 	//print("dcommontype: %s\n", p);
 	ot = dgostringptr(s, ot, p);	// string
@ -1275,30 +1263,206 @@ dalgsym(Type *t)
 }

 static int
-gcinline(Type *t)
+usegcprog(Type *t)
 {
-	switch(t->etype) {
-	case TARRAY:
-		if(t->bound == 1)
-			return 1;
-		if(t->width <= 4*widthptr)
-			return 1;
-		break;
-	}
+	vlong size, nptr;
+
+	if(!haspointers(t))
 		return 0;
-}
-
-static int
-dgcsym1(Sym *s, int ot, Type *t, vlong *off, int stack_size)
-{
-	Type *t1;
-	vlong o, off2, fieldoffset, i;
-
-	if(t->align > 0 && (*off % t->align) != 0)
-		fatal("dgcsym1: invalid initial alignment, %T", t);
-
 	if(t->width == BADWIDTH)
 		dowidth(t);
+	// Calculate the size in bytes of the unrolled GC mask.
+	nptr = (t->width+widthptr-1)/widthptr;
+	size = nptr;
+	if(size%2)
+		size *= 2;	// odd word count: the mask is stored repeated
+	size = size*gcBits/8;	// 4 bits per word
+	// Decide whether to use the unrolled GC mask or a GC program.
+	// We could use a more elaborate condition, but this seems to work well in practice.
+	// For small objects a GC program can't give a significant reduction,
+	// while large objects usually contain arrays; and even if they don't,
+	// the program uses 2 bits per word while the mask uses 4 bits per word,
+	// so the program is still smaller.
+	return size > 2*widthptr;	// an inline mask must fit in the 2*widthptr bytes dcommontype emits for it
+}
+
+// Fills gcmask with the sparse GC bitmask for t (4 bits per word, two words per byte); the mask stays all-zero when t has no pointers.
+static void
+gengcmask(Type *t, uint8 gcmask[16])
+{
+	Bvec *vec;
+	vlong xoffset, nptr, i, j;
+	int  half, mw;
+	uint8 bits, *pos;
+
+	memset(gcmask, 0, 16);
+	if(!haspointers(t))
+		return;
+
+	// Generate the compact mask (BitsPerPointer bits per word), the same format stacks use.
+	xoffset = 0;
+	vec = bvalloc(2*widthptr*8);
+	twobitwalktype1(t, &xoffset, vec);
+
+	// Unfold the mask for the GC bitmap format:
+	// 4 bits per word, 2 high bits encode pointer info.
+	pos = (uint8*)gcmask;
+	nptr = (t->width+widthptr-1)/widthptr;
+	half = 0;
+	mw = 0;
+	// If the number of words is odd, repeat the mask.
+	// This simplifies the handling of arrays in the runtime.
+	for(j=0; j<=(nptr%2); j++) {
+		for(i=0; i<nptr; i++) {
+			bits = bvget(vec, i*BitsPerPointer) | bvget(vec, i*BitsPerPointer+1)<<1;
+			// Some fake types (e.g. Hmap) have missing fields.
+			// twobitwalktype1 generates BitsDead for those holes;
+			// replace BitsDead with BitsScalar.
+			if(!mw && bits == BitsDead)
+				bits = BitsScalar;
+			mw = !mw && bits == BitsMultiWord;	// the word after BitsMultiWord is exempt from the BitsDead rewrite above
+			bits <<= 2;	// pointer info occupies the 2 high bits of each 4-bit group
+			if(half)
+				bits <<= 4;	// odd-numbered word: use the upper nibble of the byte
+			*pos |= bits;
+			half = !half;
+			if(!half)
+				pos++;	// both nibbles written, move to the next byte
+		}
+	}
+}
+
+// Helper object for generation of GC programs: buffers pointer-info entries and writes program bytes to a symbol.
+typedef struct ProgGen ProgGen;
+struct ProgGen
+{
+	Sym*	s;	// symbol the program bytes are written to
+	int32	datasize;	// number of entries buffered in data and not yet flushed
+	uint8	data[256/PointersPerByte];	// buffered entries, BitsPerPointer bits each, PointersPerByte per byte
+	vlong	ot;	// current write offset in s, as returned by duint8
+};
+
+static void
+proggeninit(ProgGen *g, Sym *s)
+{
+	g->s = s;	// all subsequent output goes to symbol s
+	g->datasize = 0;	// nothing buffered yet
+	g->ot = 0;	// start writing at offset 0 of s
+	memset(g->data, 0, sizeof(g->data));
+}
+
+static void
+proggenemit(ProgGen *g, uint8 v)
+{
+	g->ot = duint8(g->s, g->ot, v);	// write one byte at g->ot and keep the offset duint8 returns
+}
+
+// Emits the buffered entries as one insData block (opcode, entry count, packed entry bytes) and clears the buffer; does nothing when the buffer is empty.
+static void
+proggendataflush(ProgGen *g)
+{
+	int32 i, s;
+
+	if(g->datasize == 0)
+		return;
+	proggenemit(g, insData);
+	proggenemit(g, g->datasize);
+	s = (g->datasize + PointersPerByte - 1)/PointersPerByte;	// bytes occupied by datasize entries, rounded up
+	for(i = 0; i < s; i++)
+		proggenemit(g, g->data[i]);
+	g->datasize = 0;
+	memset(g->data, 0, sizeof(g->data));	// proggendata ORs bits in, so the buffer must be zero again
+}
+
+static void
+proggendata(ProgGen *g, uint8 d)
+{
+	g->data[g->datasize/PointersPerByte] |= d << ((g->datasize%PointersPerByte)*BitsPerPointer);	// pack entry d into the next free BitsPerPointer-bit slot
+	g->datasize++;
+	if(g->datasize == 255)	// the entry count is emitted as a single byte, so flush at 255 entries
+		proggendataflush(g);
+}
+
+// Skips the v bytes starting at offset off (alignment padding, scalars, etc.), buffering one BitsScalar entry for every pointer-aligned offset in [off, off+v).
+static void
+proggenskip(ProgGen *g, vlong off, vlong v)
+{
+	vlong i;
+
+	for(i = off; i < off+v; i++) {
+		if((i%widthptr) == 0)	// a new word starts at this byte
+			proggendata(g, BitsScalar);
+	}
+}
+
+// Flushes buffered data, then emits an insArray instruction followed by len encoded in widthptr bytes, least significant byte first.
+static void
+proggenarray(ProgGen *g, vlong len)
+{
+	int32 i;
+
+	proggendataflush(g);
+	proggenemit(g, insArray);
+	for(i = 0; i < widthptr; i++, len >>= 8)
+		proggenemit(g, len);	// proggenemit takes uint8, so only the low byte of len is written
+}
+
+static void
+proggenarrayend(ProgGen *g)
+{
+	proggendataflush(g);	// buffered element data must be written before the array is closed
+	proggenemit(g, insArrayEnd);
+}
+
+static vlong
+proggenfini(ProgGen *g)
+{
+	proggendataflush(g);	// write out any entries still buffered
+	proggenemit(g, insEnd);	// program terminator
+	return g->ot;	// total number of bytes written to g->s
+}
+
+static void gengcprog1(ProgGen *g, Type *t, vlong *xoffset);
+
+// Generates the GC program for large type t: *pgc1 receives the RODATA symbol holding the program, *pgc0 the symbol reserved for the unrolled mask (S when the mask would exceed MaxGCMask bytes).
+static void
+gengcprog(Type *t, Sym **pgc0, Sym **pgc1)
+{
+	Sym *gc0, *gc1;
+	vlong nptr, size, ot, xoffset;
+	ProgGen g;
+
+	nptr = (t->width+widthptr-1)/widthptr;
+	size = nptr;
+	if(size%2)
+		size *= 2;	// repeated twice
+	size = size*gcBits/8;	// 4 bits per word (gcBits, the same constant usegcprog uses)
+	size++;	// unroll flag in the beginning, used by runtime (see runtime.markallocated)
+	// emit space in BSS for unrolled program
+	*pgc0 = S;
+	// Don't generate it if it's too large, runtime will unroll directly into GC bitmap.
+	if(size <= MaxGCMask) {
+		gc0 = typesymprefix(".gc", t);
+		ggloblsym(gc0, size, DUPOK|NOPTR);
+		*pgc0 = gc0;
+	}
+
+	// program in RODATA
+	gc1 = typesymprefix(".gcprog", t);
+	proggeninit(&g, gc1);
+	xoffset = 0;
+	gengcprog1(&g, t, &xoffset);
+	ot = proggenfini(&g);	// flushes pending data, appends insEnd, returns the program length
+	ggloblsym(gc1, ot, DUPOK|RODATA);
+	*pgc1 = gc1;
+}
+
+// Recursively walks type t and writes GC program into g.
+static void
+gengcprog1(ProgGen *g, Type *t, vlong *xoffset)
+{
+	vlong fieldoffset, i, o, n;
+	Type *t1;

 	switch(t->etype) {
 	case TINT8:
@ -1317,187 +1481,71 @@ dgcsym1(Sym *s, int ot, Type *t, vlong *off, int stack_size)
 	case TFLOAT64:
 	case TCOMPLEX64:
 	case TCOMPLEX128:
-		*off += t->width;
+		proggenskip(g, *xoffset, t->width);
+		*xoffset += t->width;
 		break;
-
 	case TPTR32:
 	case TPTR64:
-		// NOTE: Any changes here need to be made to reflect.PtrTo as well.
-		if(*off % widthptr != 0)
-			fatal("dgcsym1: invalid alignment, %T", t);
-
-		// NOTE(rsc): Emitting GC_APTR here for *nonptrtype
-		// (pointer to non-pointer-containing type) means that
-		// we do not record 'nonptrtype' and instead tell the 
-		// garbage collector to look up the type of the memory in
-		// type information stored in the heap. In effect we are telling
-		// the collector "we don't trust our information - use yours".
-		// It's not completely clear why we want to do this.
-		// It does have the effect that if you have a *SliceHeader and a *[]int
-		// pointing at the same actual slice header, *SliceHeader will not be
-		// used as an authoritative type for the memory, which is good:
-		// if the collector scanned the memory as type *SliceHeader, it would
-		// see no pointers inside but mark the block as scanned, preventing
-		// the seeing of pointers when we followed the *[]int pointer.
-		// Perhaps that kind of situation is the rationale.
-		if(!haspointers(t->type)) {
-			ot = duintptr(s, ot, GC_APTR);
-			ot = duintptr(s, ot, *off);
-		} else {
-			ot = duintptr(s, ot, GC_PTR);
-			ot = duintptr(s, ot, *off);
-			ot = dsymptr(s, ot, dgcsym(t->type), 0);
-		}
-		*off += t->width;
-		break;
-
 	case TUNSAFEPTR:
 	case TFUNC:
-		if(*off % widthptr != 0)
-			fatal("dgcsym1: invalid alignment, %T", t);
-		ot = duintptr(s, ot, GC_APTR);
-		ot = duintptr(s, ot, *off);
-		*off += t->width;
-		break;
-
-	// struct Hchan*
 	case TCHAN:
-		// NOTE: Any changes here need to be made to reflect.ChanOf as well.
-		if(*off % widthptr != 0)
-			fatal("dgcsym1: invalid alignment, %T", t);
-		ot = duintptr(s, ot, GC_CHAN_PTR);
-		ot = duintptr(s, ot, *off);
-		ot = dsymptr(s, ot, dtypesym(t), 0);
-		*off += t->width;
-		break;
-
-	// struct Hmap*
 	case TMAP:
-		// NOTE: Any changes here need to be made to reflect.MapOf as well.
-		if(*off % widthptr != 0)
-			fatal("dgcsym1: invalid alignment, %T", t);
-		ot = duintptr(s, ot, GC_PTR);
-		ot = duintptr(s, ot, *off);
-		ot = dsymptr(s, ot, dgcsym(hmap(t)), 0);
-		*off += t->width;
+		proggendata(g, BitsPointer);
+		*xoffset += t->width;
 		break;
-
-	// struct { byte *str; int32 len; }
 	case TSTRING:
-		if(*off % widthptr != 0)
-			fatal("dgcsym1: invalid alignment, %T", t);
-		ot = duintptr(s, ot, GC_STRING);
-		ot = duintptr(s, ot, *off);
-		*off += t->width;
+		proggendata(g, BitsMultiWord);
+		proggendata(g, BitsString);
+		*xoffset += t->width;
 		break;
-
-	// struct { Itab* tab;  void* data; }
-	// struct { Type* type; void* data; }	// When isnilinter(t)==true
 	case TINTER:
-		if(*off % widthptr != 0)
-			fatal("dgcsym1: invalid alignment, %T", t);
-		if(isnilinter(t)) {
-			ot = duintptr(s, ot, GC_EFACE);
-			ot = duintptr(s, ot, *off);
-		} else {
-			ot = duintptr(s, ot, GC_IFACE);
-			ot = duintptr(s, ot, *off);
-		}
-		*off += t->width;
+		proggendata(g, BitsMultiWord);
+		if(isnilinter(t))
+			proggendata(g, BitsEface);
+		else
+			proggendata(g, BitsIface);
+		*xoffset += t->width;
 		break;
-
 	case TARRAY:
-		if(t->bound < -1)
-			fatal("dgcsym1: invalid bound, %T", t);
-		if(t->type->width == BADWIDTH)
-			dowidth(t->type);
 		if(isslice(t)) {
-			// NOTE: Any changes here need to be made to reflect.SliceOf as well.
-			// struct { byte* array; uint32 len; uint32 cap; }
-			if(*off % widthptr != 0)
-				fatal("dgcsym1: invalid alignment, %T", t);
-			if(t->type->width != 0) {
-				ot = duintptr(s, ot, GC_SLICE);
-				ot = duintptr(s, ot, *off);
-				ot = dsymptr(s, ot, dgcsym(t->type), 0);
+			proggendata(g, BitsMultiWord);
+			proggendata(g, BitsSlice);
+			proggendata(g, BitsScalar);
 		} else {
-				ot = duintptr(s, ot, GC_APTR);
-				ot = duintptr(s, ot, *off);
-			}
-			*off += t->width;
-		} else {
-			// NOTE: Any changes here need to be made to reflect.ArrayOf as well,
-			// at least once ArrayOf's gc info is implemented and ArrayOf is exported.
-			// struct { byte* array; uint32 len; uint32 cap; }
-			if(t->bound < 1 || !haspointers(t->type)) {
-				*off += t->width;
-			} else if(gcinline(t)) {
+			t1 = t->type;
+			if(t1->width == 0) {
+				// ignore
+			} if(t->bound <= 1 || t->bound*t1->width < 32*widthptr) {
 				for(i = 0; i < t->bound; i++)
-					ot = dgcsym1(s, ot, t->type, off, stack_size);  // recursive call of dgcsym1
+					gengcprog1(g, t1, xoffset);
+			} else if(!haspointers(t1)) {
+				n = t->width;
+				n -= -*xoffset&(widthptr-1); // skip to next ptr boundary
+				proggenarray(g, (n+widthptr-1)/widthptr);
+				proggendata(g, BitsScalar);
+				proggenarrayend(g);
+				*xoffset -= (n+widthptr-1)/widthptr*widthptr - t->width;
 			} else {
-				if(stack_size < GC_STACK_CAPACITY) {
-					ot = duintptr(s, ot, GC_ARRAY_START);  // a stack push during GC
-					ot = duintptr(s, ot, *off);
-					ot = duintptr(s, ot, t->bound);
-					ot = duintptr(s, ot, t->type->width);
-					off2 = 0;
-					ot = dgcsym1(s, ot, t->type, &off2, stack_size+1);  // recursive call of dgcsym1
-					ot = duintptr(s, ot, GC_ARRAY_NEXT);  // a stack pop during GC
-				} else {
-					ot = duintptr(s, ot, GC_REGION);
-					ot = duintptr(s, ot, *off);
-					ot = duintptr(s, ot, t->width);
-					ot = dsymptr(s, ot, dgcsym(t), 0);
-				}
-				*off += t->width;
+				proggenarray(g, t->bound);
+				gengcprog1(g, t1, xoffset);
+				*xoffset += (t->bound-1)*t1->width;
+				proggenarrayend(g);
 			}
 		}
 		break;
-
 	case TSTRUCT:
 		o = 0;
 		for(t1 = t->type; t1 != T; t1 = t1->down) {
 			fieldoffset = t1->width;
-			*off += fieldoffset - o;
-			ot = dgcsym1(s, ot, t1->type, off, stack_size);  // recursive call of dgcsym1
+			proggenskip(g, *xoffset, fieldoffset - o);
+			*xoffset += fieldoffset - o;
+			gengcprog1(g, t1->type, xoffset);
 			o = fieldoffset + t1->type->width;
 		}
-		*off += t->width - o;
+		proggenskip(g, *xoffset, t->width - o);
+		*xoffset += t->width - o;
 		break;
-
 	default:
-		fatal("dgcsym1: unexpected type %T", t);
+		fatal("gengcprog1: unexpected type, %T", t);
 	}
-
-	return ot;
-}
-
-static Sym*
-dgcsym(Type *t)
-{
-	int ot;
-	vlong off;
-	Sym *s;
-
-	s = typesymprefix(".gc", t);
-	if(s->flags & SymGcgen)
-		return s;
-	s->flags |= SymGcgen;
-
-	if(t->width == BADWIDTH)
-		dowidth(t);
-
-	ot = 0;
-	off = 0;
-	ot = duintptr(s, ot, t->width);
-	ot = dgcsym1(s, ot, t, &off, 0);
-	ot = duintptr(s, ot, GC_END);
-	ggloblsym(s, ot, DUPOK|RODATA);
-
-	if(t->align > 0)
-		off = rnd(off, t->align);
-	if(off != t->width)
-		fatal("dgcsym: off=%lld, size=%lld, type %T", off, t->width, t);
-
-	return s;
 }
--- a/src/cmd/ld/data.c
+++ b/src/cmd/ld/data.c
@ -706,31 +706,165 @@ maxalign(LSym *s, int type)
 	return max;
 }

-static void
-gcaddsym(LSym *gc, LSym *s, vlong off)
+// Helper object for building GC type programs: buffers pointer-info entries and appends program bytes to a symbol.
+typedef struct ProgGen ProgGen;
+struct ProgGen
 {
-	vlong a;
-	LSym *gotype;
+	LSym*	s;	// symbol the program bytes are appended to
+	int32	datasize;	// number of entries buffered in data and not yet flushed
+	uint8	data[256/PointersPerByte];	// buffered entries, BitsPerPointer bits each, PointersPerByte per byte
+	vlong	pos;	// section offset up to which pointer info has been generated
+};

-	if(s->size < PtrSize)
-		return;
-	if(strcmp(s->name, ".string") == 0)
-		return;
-
-	gotype = s->gotype;
-	if(gotype != nil) {
-		//print("gcaddsym:    %s    %d    %s\n", s->name, s->size, gotype->name);
-		adduintxx(ctxt, gc, GC_CALL, PtrSize);
-		adduintxx(ctxt, gc, off, PtrSize);
-		addpcrelplus(ctxt, gc, decodetype_gc(gotype), 3*PtrSize+4);
-		if(PtrSize == 8)
-			adduintxx(ctxt, gc, 0, 4);
-	} else {
-		//print("gcaddsym:    %s    %d    <unknown type>\n", s->name, s->size);
-		for(a = -off&(PtrSize-1); a+PtrSize<=s->size; a+=PtrSize) {
-			adduintxx(ctxt, gc, GC_APTR, PtrSize);
-			adduintxx(ctxt, gc, off+a, PtrSize);
+static void
+proggeninit(ProgGen *g, LSym *s)
+{
+	g->s = s;	// all subsequent output is appended to symbol s
+	g->datasize = 0;	// nothing buffered yet
+	g->pos = 0;	// no part of the section described yet
+	memset(g->data, 0, sizeof(g->data));
 }
+
+static void
+proggenemit(ProgGen *g, uint8 v)
+{
+	adduint8(ctxt, g->s, v);	// append one program byte to g->s
+}
+
+// Writes the buffered entries as one insData block (opcode, entry count, packed entry bytes) and clears the buffer; does nothing when the buffer is empty.
+static void
+proggendataflush(ProgGen *g)
+{
+	int32 i, s;
+
+	if(g->datasize == 0)
+		return;
+	proggenemit(g, insData);
+	proggenemit(g, g->datasize);
+	s = (g->datasize + PointersPerByte - 1)/PointersPerByte;	// bytes occupied by datasize entries, rounded up
+	for(i = 0; i < s; i++)
+		proggenemit(g, g->data[i]);
+	g->datasize = 0;
+	memset(g->data, 0, sizeof(g->data));	// proggendata ORs bits in, so the buffer must be zero again
+}
+
+static void
+proggendata(ProgGen *g, uint8 d)
+{
+	g->data[g->datasize/PointersPerByte] |= d << ((g->datasize%PointersPerByte)*BitsPerPointer);	// pack entry d into the next free BitsPerPointer-bit slot
+	g->datasize++;
+	if(g->datasize == 255)	// the entry count is emitted as a single byte, so flush at 255 entries
+		proggendataflush(g);
+}
+
+// Skips the v bytes starting at offset off (alignment padding, etc.), buffering one BitsScalar entry for every PtrSize-aligned offset in [off, off+v). Does not update g->pos; callers do.
+static void
+proggenskip(ProgGen *g, vlong off, vlong v)
+{
+	vlong i;
+
+	for(i = off; i < off+v; i++) {
+		if((i%PtrSize) == 0)	// a new word starts at this byte
+			proggendata(g, BitsScalar);
+	}
+}
+
+// Flushes buffered data, then emits an insArray instruction followed by len encoded in PtrSize bytes, least significant byte first.
+static void
+proggenarray(ProgGen *g, vlong len)
+{
+	int32 i;
+
+	proggendataflush(g);
+	proggenemit(g, insArray);
+	for(i = 0; i < PtrSize; i++, len >>= 8)
+		proggenemit(g, len);	// proggenemit takes uint8, so only the low byte of len is written
+}
+
+static void
+proggenarrayend(ProgGen *g)
+{
+	proggendataflush(g);	// buffered element data must be written before the array is closed
+	proggenemit(g, insArrayEnd);
+}
+
+static void
+proggenfini(ProgGen *g, vlong size)
+{
+	proggenskip(g, g->pos, size - g->pos);	// describe the tail between the last symbol and the given size as scalars
+	proggendataflush(g);
+	proggenemit(g, insEnd);	// program terminator
+}
+
+
+// Appends GC pointer info for global symbol s to the program g. Reads s->value, so it must be set before the call; zero-sized symbols are ignored.
+static void
+proggenaddsym(ProgGen *g, LSym *s)
+{
+	LSym *gcprog;
+	uint8 *mask;
+	vlong i, size;
+
+	if(s->size == 0)
+		return;
+
+	// Skip the alignment hole left after the previous symbol.
+	proggenskip(g, g->pos, s->value - g->pos);
+	g->pos += s->value - g->pos;
+
+	if(s->gotype == nil && s->size >= PtrSize) {
+		// conservative scan: no type info, so mark every word as a pointer
+		if((s->size%PtrSize) || (g->pos%PtrSize))
+			diag("proggenaddsym: unaligned symbol");
+		size = (s->size+PtrSize-1)/PtrSize*PtrSize;
+		if(size < 32*PtrSize) {
+			// Emit small symbols as data.
+			for(i = 0; i < size/PtrSize; i++)
+				proggendata(g, BitsPointer);
+		} else {
+			// Emit large symbols as array.
+			proggenarray(g, size/PtrSize);
+			proggendata(g, BitsPointer);
+			proggenarrayend(g);
+		}
+		g->pos = s->value + size;
+	} else if(s->gotype == nil || decodetype_noptr(s->gotype) || s->size < PtrSize) {
+		// no scan: untyped symbol smaller than a word, or a type flagged as pointer-free
+		if(s->size < 32*PtrSize) {
+			// Emit small symbols as data.
+			// This case also handles unaligned and tiny symbols, so tread carefully.
+			for(i = s->value; i < s->value+s->size; i++) {
+				if((i%PtrSize) == 0)
+					proggendata(g, BitsScalar);
+			}
+		} else {
+			// Emit large symbols as array.
+			if((s->size%PtrSize) || (g->pos%PtrSize))
+				diag("proggenaddsym: unaligned symbol");
+			proggenarray(g, s->size/PtrSize);
+			proggendata(g, BitsScalar);
+			proggenarrayend(g);
+		}
+		g->pos = s->value + s->size;
+	} else if(decodetype_usegcprog(s->gotype)) {
+		// gc program, copy directly
+		proggendataflush(g);
+		gcprog = decodetype_gcprog(s->gotype);
+		size = decodetype_size(s->gotype);
+		if((size%PtrSize) || (g->pos%PtrSize))
+			diag("proggenaddsym: unaligned symbol");
+		for(i = 0; i < gcprog->np-1; i++)	// copy all but the last byte (presumably the type program's own insEnd)
+			proggenemit(g, gcprog->p[i]);
+		g->pos = s->value + size;
+	} else {
+		// gc mask, it's small so emit as data
+		mask = decodetype_gcmask(s->gotype);
+		size = decodetype_size(s->gotype);
+		if((size%PtrSize) || (g->pos%PtrSize))
+			diag("proggenaddsym: unaligned symbol");
+		for(i = 0; i < size; i += PtrSize)
+			proggendata(g, (mask[i/PtrSize/2]>>((i/PtrSize%2)*4+2))&BitsMask);	// word's 4-bit group in the mask, 2 high bits
+		g->pos = s->value + size;
 	}
 }

@ -755,19 +889,13 @@ dodata(void)
 	Section *sect;
 	Segment *segro;
 	LSym *s, *last, **l;
-	LSym *gcdata1, *gcbss1;
+	LSym *gcdata, *gcbss;
+	ProgGen gen;

 	if(debug['v'])
 		Bprint(&bso, "%5.2f dodata\n", cputime());
 	Bflush(&bso);

-	gcdata1 = linklookup(ctxt, "gcdata", 0);
-	gcbss1 = linklookup(ctxt, "gcbss", 0);
-
-	// size of .data and .bss section. the zero value is later replaced by the actual size of the section.
-	adduintxx(ctxt, gcdata1, 0, PtrSize);
-	adduintxx(ctxt, gcbss1, 0, PtrSize);
-
 	last = nil;
 	datap = nil;

@ -884,6 +1012,8 @@ dodata(void)
 	sect->vaddr = datsize;
 	linklookup(ctxt, "data", 0)->sect = sect;
 	linklookup(ctxt, "edata", 0)->sect = sect;
+	gcdata = linklookup(ctxt, "gcdata", 0);
+	proggeninit(&gen, gcdata);
 	for(; s != nil && s->type < SBSS; s = s->next) {
 		if(s->type == SINITARR) {
 			ctxt->cursym = s;
@ -893,13 +1023,11 @@ dodata(void)
 		s->type = SDATA;
 		datsize = aligndatsize(datsize, s);
 		s->value = datsize - sect->vaddr;
-		gcaddsym(gcdata1, s, datsize - sect->vaddr);  // gc
+		proggenaddsym(&gen, s);  // gc
 		growdatsize(&datsize, s);
 	}
 	sect->len = datsize - sect->vaddr;
-
-	adduintxx(ctxt, gcdata1, GC_END, PtrSize);
-	setuintxx(ctxt, gcdata1, 0, sect->len, PtrSize);
+	proggenfini(&gen, sect->len);  // gc

 	/* bss */
 	sect = addsection(&segdata, ".bss", 06);
@ -908,17 +1036,17 @@ dodata(void)
 	sect->vaddr = datsize;
 	linklookup(ctxt, "bss", 0)->sect = sect;
 	linklookup(ctxt, "ebss", 0)->sect = sect;
+	gcbss = linklookup(ctxt, "gcbss", 0);
+	proggeninit(&gen, gcbss);
 	for(; s != nil && s->type < SNOPTRBSS; s = s->next) {
 		s->sect = sect;
 		datsize = aligndatsize(datsize, s);
 		s->value = datsize - sect->vaddr;
-		gcaddsym(gcbss1, s, datsize - sect->vaddr);  // gc
+		proggenaddsym(&gen, s);  // gc
 		growdatsize(&datsize, s);
 	}
 	sect->len = datsize - sect->vaddr;
-
-	adduintxx(ctxt, gcbss1, GC_END, PtrSize);
-	setuintxx(ctxt, gcbss1, 0, sect->len, PtrSize);
+	proggenfini(&gen, sect->len);  // gc

 	/* pointer-free bss */
 	sect = addsection(&segdata, ".noptrbss", 06);
--- a/src/cmd/ld/decodesym.c
+++ b/src/cmd/ld/decodesym.c
@ -70,14 +70,28 @@ decode_inuxi(uchar* p, int sz)
 static int
 commonsize(void)
 {
-	return 7*PtrSize + 8;
+	return 8*PtrSize + 8;
 }

 // Type.commonType.kind
 uint8
 decodetype_kind(LSym *s)
 {
-	return s->p[1*PtrSize + 7] & ~KindNoPointers;	//  0x13 / 0x1f
+	return s->p[1*PtrSize + 7] & KindMask;	//  0x13 / 0x1f
+}
+
+// Type.commonType.kind & KindNoPointers: non-zero when the type is flagged as containing no pointers.
+uint8
+decodetype_noptr(LSym *s)
+{
+	return s->p[1*PtrSize + 7] & KindNoPointers;	//  0x13 / 0x1f
+}
+
+// Type.commonType.kind & KindGCProg: non-zero when the type's gc field holds a GC program rather than an inline mask.
+uint8
+decodetype_usegcprog(LSym *s)
+{
+	return s->p[1*PtrSize + 7] & KindGCProg;	//  0x13 / 0x1f
+}

 // Type.commonType.size
@ -89,9 +103,15 @@ decodetype_size(LSym *s)

 // Type.commonType.gc
 LSym*
-decodetype_gc(LSym *s)
+decodetype_gcprog(LSym *s)
 {
-	return decode_reloc_sym(s, 1*PtrSize + 8 + 1*PtrSize);
+	return decode_reloc_sym(s, 1*PtrSize + 8 + 2*PtrSize);	// symbol referenced from the second word of the gc field
+}
+
+uint8*
+decodetype_gcmask(LSym *s)
+{
+	return (uint8*)(s->p + 1*PtrSize + 8 + 1*PtrSize);	// inline mask bytes start at the first word of the gc field
 }

 // Type.ArrayType.elem and Type.SliceType.Elem
--- a/src/cmd/ld/lib.h
+++ b/src/cmd/ld/lib.h
@ -196,9 +196,12 @@ int	decodetype_funcincount(LSym *s);
 LSym*	decodetype_funcintype(LSym *s, int i);
 int	decodetype_funcoutcount(LSym *s);
 LSym*	decodetype_funcouttype(LSym *s, int i);
-LSym*	decodetype_gc(LSym *s);
+LSym*	decodetype_gcprog(LSym *s);
+uint8*	decodetype_gcmask(LSym *s);
 vlong	decodetype_ifacemethodcount(LSym *s);
 uint8	decodetype_kind(LSym *s);
+uint8	decodetype_noptr(LSym *s);
+uint8	decodetype_usegcprog(LSym *s);
 LSym*	decodetype_mapkey(LSym *s);
 LSym*	decodetype_mapvalue(LSym *s);
 LSym*	decodetype_ptrelem(LSym *s);
--- a/src/pkg/reflect/type.go
+++ b/src/pkg/reflect/type.go
@ -249,7 +249,7 @@ type rtype struct {
 	fieldAlign    uint8             // alignment of struct field with this type
 	kind          uint8             // enumeration for C
 	alg           *uintptr          // algorithm table (../runtime/runtime.h:/Alg)
-	gc            unsafe.Pointer // garbage collection data
+	gc            [2]unsafe.Pointer // garbage collection data
 	string        *string           // string form; unnecessary but undeniably useful
 	*uncommonType                   // (relatively) uncommon fields
 	ptrToThis     *rtype            // type for pointer to this type, if used in binary or has methods
@ -357,24 +357,6 @@ type structType struct {
 	fields []structField // sorted by offset
 }

-// NOTE: These are copied from ../runtime/mgc0.h.
-// They must be kept in sync.
-const (
-	_GC_END = iota
-	_GC_PTR
-	_GC_APTR
-	_GC_ARRAY_START
-	_GC_ARRAY_NEXT
-	_GC_CALL
-	_GC_CHAN_PTR
-	_GC_STRING
-	_GC_EFACE
-	_GC_IFACE
-	_GC_SLICE
-	_GC_REGION
-	_GC_NUM_INSTR
-)
-
 /*
 * The compiler knows the exact layout of all the data structures above.
 * The compiler does not know about the data structures and methods below.
@ -399,7 +381,8 @@ type Method struct {
 // High bit says whether type has
 // embedded pointers,to help garbage collector.
 const (
-	kindMask       = 0x7f
+	kindMask       = 0x3f
+	kindGCProg     = 0x40
 	kindNoPointers = 0x80
 )

@ -1013,32 +996,6 @@ var ptrMap struct {
 	m map[*rtype]*ptrType
 }

-// garbage collection bytecode program for pointer to memory without pointers.
-// See ../../cmd/gc/reflect.c:/^dgcsym1 and :/^dgcsym.
-type ptrDataGC struct {
-	width uintptr // sizeof(ptr)
-	op    uintptr // _GC_APTR
-	off   uintptr // 0
-	end   uintptr // _GC_END
-}
-
-var ptrDataGCProg = ptrDataGC{
-	width: unsafe.Sizeof((*byte)(nil)),
-	op:    _GC_APTR,
-	off:   0,
-	end:   _GC_END,
-}
-
-// garbage collection bytecode program for pointer to memory with pointers.
-// See ../../cmd/gc/reflect.c:/^dgcsym1 and :/^dgcsym.
-type ptrGC struct {
-	width  uintptr        // sizeof(ptr)
-	op     uintptr        // _GC_PTR
-	off    uintptr        // 0
-	elemgc unsafe.Pointer // element gc type
-	end    uintptr        // _GC_END
-}
-
 // PtrTo returns the pointer type with element t.
 // For example, if t represents type Foo, PtrTo(t) represents *Foo.
 func PtrTo(t Type) Type {
@ -1096,20 +1053,6 @@ func (t *rtype) ptrTo() *rtype {
 	p.zero = unsafe.Pointer(&make([]byte, p.size)[0])
 	p.elem = t

-	if t.kind&kindNoPointers != 0 {
-		p.gc = unsafe.Pointer(&ptrDataGCProg)
-	} else {
-		p.gc = unsafe.Pointer(&ptrGC{
-			width:  p.size,
-			op:     _GC_PTR,
-			off:    0,
-			elemgc: t.gc,
-			end:    _GC_END,
-		})
-	}
-	// INCORRECT. Uncomment to check that TestPtrToGC fails when p.gc is wrong.
-	//p.gc = unsafe.Pointer(&badGC{width: p.size, end: _GC_END})
-
 	ptrMap.m[t] = p
 	ptrMap.Unlock()
 	return &p.rtype
@ -1414,21 +1357,6 @@ func cachePut(k cacheKey, t *rtype) Type {
 	return t
 }

-// garbage collection bytecode program for chan.
-// See ../../cmd/gc/reflect.c:/^dgcsym1 and :/^dgcsym.
-type chanGC struct {
-	width uintptr // sizeof(map)
-	op    uintptr // _GC_CHAN_PTR
-	off   uintptr // 0
-	typ   *rtype  // map type
-	end   uintptr // _GC_END
-}
-
-type badGC struct {
-	width uintptr
-	end   uintptr
-}
-
 // ChanOf returns the channel type with the given direction and element type.
 // For example, if t represents int, ChanOf(RecvDir, t) represents <-chan int.
 //
@ -1482,17 +1410,6 @@ func ChanOf(dir ChanDir, t Type) Type {
 	ch.ptrToThis = nil
 	ch.zero = unsafe.Pointer(&make([]byte, ch.size)[0])

-	ch.gc = unsafe.Pointer(&chanGC{
-		width: ch.size,
-		op:    _GC_CHAN_PTR,
-		off:   0,
-		typ:   &ch.rtype,
-		end:   _GC_END,
-	})
-
-	// INCORRECT. Uncomment to check that TestChanOfGC fails when ch.gc is wrong.
-	//ch.gc = unsafe.Pointer(&badGC{width: ch.size, end: _GC_END})
-
 	return cachePut(ckey, &ch.rtype)
 }

@ -1537,166 +1454,141 @@ func MapOf(key, elem Type) Type {
 	mt.key = ktyp
 	mt.elem = etyp
 	mt.bucket = bucketOf(ktyp, etyp)
-	mt.hmap = hMapOf(mt.bucket)
 	mt.uncommonType = nil
 	mt.ptrToThis = nil
 	mt.zero = unsafe.Pointer(&make([]byte, mt.size)[0])
-	mt.gc = unsafe.Pointer(&ptrGC{
-		width:  unsafe.Sizeof(uintptr(0)),
-		op:     _GC_PTR,
-		off:    0,
-		elemgc: mt.hmap.gc,
-		end:    _GC_END,
-	})
-
-	// INCORRECT. Uncomment to check that TestMapOfGC and TestMapOfGCValues
-	// fail when mt.gc is wrong.
-	//mt.gc = unsafe.Pointer(&badGC{width: mt.size, end: _GC_END})

 	return cachePut(ckey, &mt.rtype)
 }

+// gcProg is a helper type for generation of GC pointer info.
+type gcProg struct {
+	gc   []byte
+	size uintptr // size of type in bytes
+}
+
+func (gc *gcProg) append(v byte) {
+	gc.align(unsafe.Sizeof(uintptr(0)))
+	gc.appendWord(v)
+}
+
+// Appends t's type info to the current program.
+func (gc *gcProg) appendProg(t *rtype) {
+	gc.align(uintptr(t.align))
+	if !t.pointers() {
+		gc.size += t.size
+		return
+	}
+	nptr := t.size / unsafe.Sizeof(uintptr(0))
+	var prog []byte
+	if t.kind&kindGCProg != 0 {
+		// Ensure that the runtime has unrolled GC program.
+		unsafe_New(t)
+		// The program is stored in t.gc[0], skip unroll flag.
+		prog = (*[1 << 30]byte)(unsafe.Pointer(t.gc[0]))[1:]
+	} else {
+		// The mask is embedded directly in t.gc.
+		prog = (*[1 << 30]byte)(unsafe.Pointer(&t.gc[0]))[:]
+	}
+	for i := uintptr(0); i < nptr; i++ {
+		gc.appendWord(extractGCWord(prog, i))
+	}
+}
+
+func (gc *gcProg) appendWord(v byte) {
+	ptrsize := unsafe.Sizeof(uintptr(0))
+	if gc.size%ptrsize != 0 {
+		panic("reflect: unaligned GC program")
+	}
+	nptr := gc.size / ptrsize
+	for uintptr(len(gc.gc)) < nptr/2+1 {
+		gc.gc = append(gc.gc, 0x44) // BitsScalar
+	}
+	gc.gc[nptr/2] &= ^(3 << ((nptr%2)*4 + 2))
+	gc.gc[nptr/2] |= v << ((nptr%2)*4 + 2)
+	gc.size += ptrsize
+}
+
+func (gc *gcProg) finalize() unsafe.Pointer {
+	if gc.size == 0 {
+		return nil
+	}
+	ptrsize := unsafe.Sizeof(uintptr(0))
+	gc.align(ptrsize)
+	nptr := gc.size / ptrsize
+	for uintptr(len(gc.gc)) < nptr/2+1 {
+		gc.gc = append(gc.gc, 0x44) // BitsScalar
+	}
+	// If number of words is odd, repeat the mask twice.
+	// Compiler does the same.
+	if nptr%2 != 0 {
+		for i := uintptr(0); i < nptr; i++ {
+			gc.appendWord(extractGCWord(gc.gc, i))
+		}
+	}
+	gc.gc = append([]byte{1}, gc.gc...) // prepend unroll flag
+	return unsafe.Pointer(&gc.gc[0])
+}
+
+func extractGCWord(gc []byte, i uintptr) byte {
+	return (gc[i/2] >> ((i%2)*4 + 2)) & 3
+}
+
+func (gc *gcProg) align(a uintptr) {
+	gc.size = align(gc.size, a)
+}
+
+const (
+	bitsScalar  = 1
+	bitsPointer = 2
+)
+
 // Make sure these routines stay in sync with ../../pkg/runtime/hashmap.c!
 // These types exist only for GC, so we only fill out GC relevant info.
 // Currently, that's just size and the GC program.  We also fill in string
 // for possible debugging use.
 const (
-	_BUCKETSIZE = 8
-	_MAXKEYSIZE = 128
-	_MAXVALSIZE = 128
+	bucketSize = 8
+	maxKeySize = 128
+	maxValSize = 128
 )

 func bucketOf(ktyp, etyp *rtype) *rtype {
-	if ktyp.size > _MAXKEYSIZE {
+	if ktyp.size > maxKeySize {
 		ktyp = PtrTo(ktyp).(*rtype)
 	}
-	if etyp.size > _MAXVALSIZE {
+	if etyp.size > maxValSize {
 		etyp = PtrTo(etyp).(*rtype)
 	}
 	ptrsize := unsafe.Sizeof(uintptr(0))

-	gc := make([]uintptr, 1)                                       // first entry is size, filled in at the end
-	offset := _BUCKETSIZE * unsafe.Sizeof(uint8(0))                // topbits
-	gc = append(gc, _GC_PTR, offset, 0 /*self pointer set below*/) // overflow
-	offset += ptrsize
-
+	var gc gcProg
+	// topbits
+	for i := 0; i < int(bucketSize*unsafe.Sizeof(uint8(0))/ptrsize); i++ {
+		gc.append(bitsScalar)
+	}
+	gc.append(bitsPointer) // overflow
 	if runtime.GOARCH == "amd64p32" {
-		offset += 4
+		gc.append(bitsScalar)
 	}
-
 	// keys
-	if ktyp.kind&kindNoPointers == 0 {
-		gc = append(gc, _GC_ARRAY_START, offset, _BUCKETSIZE, ktyp.size)
-		gc = appendGCProgram(gc, ktyp)
-		gc = append(gc, _GC_ARRAY_NEXT)
+	for i := 0; i < bucketSize; i++ {
+		gc.appendProg(ktyp)
 	}
-	offset += _BUCKETSIZE * ktyp.size
-
 	// values
-	if etyp.kind&kindNoPointers == 0 {
-		gc = append(gc, _GC_ARRAY_START, offset, _BUCKETSIZE, etyp.size)
-		gc = appendGCProgram(gc, etyp)
-		gc = append(gc, _GC_ARRAY_NEXT)
+	for i := 0; i < bucketSize; i++ {
+		gc.appendProg(etyp)
 	}
-	offset += _BUCKETSIZE * etyp.size
-
-	gc = append(gc, _GC_END)
-	gc[0] = offset
-	gc[3] = uintptr(unsafe.Pointer(&gc[0])) // set self pointer

 	b := new(rtype)
-	b.size = offset
-	b.gc = unsafe.Pointer(&gc[0])
+	b.size = gc.size
+	b.gc[0] = gc.finalize()
+	b.kind |= kindGCProg
 	s := "bucket(" + *ktyp.string + "," + *etyp.string + ")"
 	b.string = &s
 	return b
 }

-// Take the GC program for "t" and append it to the GC program "gc".
-func appendGCProgram(gc []uintptr, t *rtype) []uintptr {
-	p := t.gc
-	p = unsafe.Pointer(uintptr(p) + unsafe.Sizeof(uintptr(0))) // skip size
-loop:
-	for {
-		var argcnt int
-		switch *(*uintptr)(p) {
-		case _GC_END:
-			// Note: _GC_END not included in append
-			break loop
-		case _GC_ARRAY_NEXT:
-			argcnt = 0
-		case _GC_APTR, _GC_STRING, _GC_EFACE, _GC_IFACE:
-			argcnt = 1
-		case _GC_PTR, _GC_CALL, _GC_CHAN_PTR, _GC_SLICE:
-			argcnt = 2
-		case _GC_ARRAY_START, _GC_REGION:
-			argcnt = 3
-		default:
-			panic("unknown GC program op for " + *t.string + ": " + strconv.FormatUint(*(*uint64)(p), 10))
-		}
-		for i := 0; i < argcnt+1; i++ {
-			gc = append(gc, *(*uintptr)(p))
-			p = unsafe.Pointer(uintptr(p) + unsafe.Sizeof(uintptr(0)))
-		}
-	}
-	return gc
-}
-func hMapOf(bucket *rtype) *rtype {
-	ptrsize := unsafe.Sizeof(uintptr(0))
-
-	// make gc program & compute hmap size
-	gc := make([]uintptr, 1)           // first entry is size, filled in at the end
-	offset := unsafe.Sizeof(uint(0))   // count
-	offset += unsafe.Sizeof(uint32(0)) // flags
-	offset += unsafe.Sizeof(uint32(0)) // hash0
-	offset += unsafe.Sizeof(uint8(0))  // B
-	offset += unsafe.Sizeof(uint8(0))  // keysize
-	offset += unsafe.Sizeof(uint8(0))  // valuesize
-	offset = (offset + 1) / 2 * 2
-	offset += unsafe.Sizeof(uint16(0)) // bucketsize
-	offset = (offset + ptrsize - 1) / ptrsize * ptrsize
-	gc = append(gc, _GC_PTR, offset, uintptr(bucket.gc)) // buckets
-	offset += ptrsize
-	gc = append(gc, _GC_PTR, offset, uintptr(bucket.gc)) // oldbuckets
-	offset += ptrsize
-	offset += ptrsize // nevacuate
-	gc = append(gc, _GC_END)
-	gc[0] = offset
-
-	h := new(rtype)
-	h.size = offset
-	h.gc = unsafe.Pointer(&gc[0])
-	s := "hmap(" + *bucket.string + ")"
-	h.string = &s
-	return h
-}
-
-// garbage collection bytecode program for slice of non-zero-length values.
-// See ../../cmd/gc/reflect.c:/^dgcsym1 and :/^dgcsym.
-type sliceGC struct {
-	width  uintptr        // sizeof(slice)
-	op     uintptr        // _GC_SLICE
-	off    uintptr        // 0
-	elemgc unsafe.Pointer // element gc program
-	end    uintptr        // _GC_END
-}
-
-// garbage collection bytecode program for slice of zero-length values.
-// See ../../cmd/gc/reflect.c:/^dgcsym1 and :/^dgcsym.
-type sliceEmptyGC struct {
-	width uintptr // sizeof(slice)
-	op    uintptr // _GC_APTR
-	off   uintptr // 0
-	end   uintptr // _GC_END
-}
-
-var sliceEmptyGCProg = sliceEmptyGC{
-	width: unsafe.Sizeof([]byte(nil)),
-	op:    _GC_APTR,
-	off:   0,
-	end:   _GC_END,
-}
-
 // SliceOf returns the slice type with element type t.
 // For example, if t represents int, SliceOf(t) represents []int.
 func SliceOf(t Type) Type {
@ -1729,21 +1621,6 @@ func SliceOf(t Type) Type {
 	slice.ptrToThis = nil
 	slice.zero = unsafe.Pointer(&make([]byte, slice.size)[0])

-	if typ.size == 0 {
-		slice.gc = unsafe.Pointer(&sliceEmptyGCProg)
-	} else {
-		slice.gc = unsafe.Pointer(&sliceGC{
-			width:  slice.size,
-			op:     _GC_SLICE,
-			off:    0,
-			elemgc: typ.gc,
-			end:    _GC_END,
-		})
-	}
-
-	// INCORRECT. Uncomment to check that TestSliceOfOfGC fails when slice.gc is wrong.
-	//slice.gc = unsafe.Pointer(&badGC{width: slice.size, end: _GC_END})
-
 	return cachePut(ckey, &slice.rtype)
 }

@ -1861,49 +1738,41 @@ func funcLayout(t *rtype, rcvr *rtype) (frametype *rtype, argSize, retOffset uin
 	tt := (*funcType)(unsafe.Pointer(t))

 	// compute gc program for arguments
-	gc := make([]uintptr, 1) // first entry is size, filled in at the end
-	offset := uintptr(0)
+	var gc gcProg
 	if rcvr != nil {
 		// Reflect uses the "interface" calling convention for
 		// methods, where receivers take one word of argument
 		// space no matter how big they actually are.
 		if rcvr.size > ptrSize {
 			// we pass a pointer to the receiver.
-			gc = append(gc, _GC_PTR, offset, uintptr(rcvr.gc))
+			gc.append(bitsPointer)
 		} else if rcvr.pointers() {
 			// rcvr is a one-word pointer object.  Its gc program
 			// is just what we need here.
-			gc = appendGCProgram(gc, rcvr)
+			gc.append(bitsPointer)
+		} else {
+			gc.append(bitsScalar)
 		}
-		offset += ptrSize
 	}
 	for _, arg := range tt.in {
-		offset = align(offset, uintptr(arg.align))
-		if arg.pointers() {
-			gc = append(gc, _GC_REGION, offset, arg.size, uintptr(arg.gc))
+		gc.appendProg(arg)
 	}
-		offset += arg.size
-	}
-	argSize = offset
+	argSize = gc.size
 	if runtime.GOARCH == "amd64p32" {
-		offset = align(offset, 8)
+		gc.align(8)
 	}
-	offset = align(offset, ptrSize)
-	retOffset = offset
+	gc.align(ptrSize)
+	retOffset = gc.size
 	for _, res := range tt.out {
-		offset = align(offset, uintptr(res.align))
-		if res.pointers() {
-			gc = append(gc, _GC_REGION, offset, res.size, uintptr(res.gc))
+		gc.appendProg(res)
 	}
-		offset += res.size
-	}
-	gc = append(gc, _GC_END)
-	gc[0] = offset
+	gc.align(ptrSize)

 	// build dummy rtype holding gc program
 	x := new(rtype)
-	x.size = offset
-	x.gc = unsafe.Pointer(&gc[0])
+	x.size = gc.size
+	x.gc[0] = gc.finalize()
+	x.kind |= kindGCProg
 	var s string
 	if rcvr != nil {
 		s = "methodargs(" + *rcvr.string + ")(" + *t.string + ")"
--- a/src/pkg/runtime/chan.goc
+++ b/src/pkg/runtime/chan.goc
@ -37,7 +37,7 @@ makechan(ChanType *t, int64 hint)
 		runtime·panicstring("makechan: size out of range");

 	// allocate memory in one call
-	c = (Hchan*)runtime·mallocgc(sizeof(*c) + hint*elem->size, (uintptr)t | TypeInfo_Chan, 0);
+	c = (Hchan*)runtime·mallocgc(sizeof(*c) + hint*elem->size, nil, 0);
 	c->elemsize = elem->size;
 	c->elemtype = elem;
 	c->dataqsiz = hint;
--- a/src/pkg/runtime/export_test.go
+++ b/src/pkg/runtime/export_test.go
@ -62,6 +62,9 @@ func ParForIters(desc *ParFor, tid uint32) (uint32, uint32) {
 	return uint32(begin), uint32(end)
 }

+//go:noescape
+func GCMask(x interface{}) []byte
+
 func testSchedLocalQueue()
 func testSchedLocalQueueSteal()

--- a/src/pkg/runtime/gc_test.go
+++ b/src/pkg/runtime/gc_test.go
@ -10,6 +10,7 @@ import (
 	"runtime/debug"
 	"testing"
 	"time"
+	"unsafe"
 )

 func TestGcSys(t *testing.T) {
@ -165,6 +166,29 @@ func TestGcLastTime(t *testing.T) {
 	}
 }

+var hugeSink interface{}
+
+func TestHugeGCInfo(t *testing.T) {
+	// The test ensures that compiler can chew these huge types even on weakest machines.
+	// The types are not allocated at runtime.
+	if hugeSink != nil {
+		// 400MB on 32-bits, 4TB on 64-bits.
+		const n = (400 << 20) + (unsafe.Sizeof(uintptr(0))-4)<<40
+		hugeSink = new([n]*byte)
+		hugeSink = new([n]uintptr)
+		hugeSink = new(struct {
+			x float64
+			y [n]*byte
+			z []string
+		})
+		hugeSink = new(struct {
+			x float64
+			y [n]uintptr
+			z []string
+		})
+	}
+}
+
 func BenchmarkSetTypeNoPtr1(b *testing.B) {
 	type NoPtr1 struct {
 		p uintptr
--- a/src/pkg/runtime/gcinfo_test.go
+++ b/src/pkg/runtime/gcinfo_test.go
@ -0,0 +1,147 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+	"bytes"
+	"runtime"
+	"testing"
+)
+
+// TestGCInfo tests that various objects in heap, data and bss receive correct GC pointer type info.
+func TestGCInfo(t *testing.T) {
+	verifyGCInfo(t, "bss ScalarPtr", &bssScalarPtr, infoScalarPtr)
+	verifyGCInfo(t, "bss PtrScalar", &bssPtrScalar, infoPtrScalar)
+	verifyGCInfo(t, "bss Complex", &bssComplex, infoComplex())
+	verifyGCInfo(t, "bss string", &bssString, infoString)
+	verifyGCInfo(t, "bss eface", &bssEface, infoEface)
+
+	verifyGCInfo(t, "data ScalarPtr", &dataScalarPtr, infoScalarPtr)
+	verifyGCInfo(t, "data PtrScalar", &dataPtrScalar, infoPtrScalar)
+	verifyGCInfo(t, "data Complex", &dataComplex, infoComplex())
+	verifyGCInfo(t, "data string", &dataString, infoString)
+	verifyGCInfo(t, "data eface", &dataEface, infoEface)
+
+	for i := 0; i < 3; i++ {
+		verifyGCInfo(t, "heap ScalarPtr", escape(new(ScalarPtr)), infoScalarPtr)
+		verifyGCInfo(t, "heap PtrScalar", escape(new(PtrScalar)), infoPtrScalar)
+		verifyGCInfo(t, "heap Complex", escape(new(Complex)), infoComplex())
+		verifyGCInfo(t, "heap string", escape(new(string)), infoString)
+		verifyGCInfo(t, "heap eface", escape(new(interface{})), infoEface)
+	}
+
+}
+
+func verifyGCInfo(t *testing.T, name string, p interface{}, mask0 []byte) {
+	mask := runtime.GCMask(p)
+	if len(mask) > len(mask0) {
+		mask0 = append(mask0, BitsDead)
+		mask = mask[:len(mask0)]
+	}
+	if bytes.Compare(mask, mask0) != 0 {
+		t.Errorf("bad GC program for %v:\nwant %+v\ngot  %+v", name, mask0, mask)
+		return
+	}
+}
+
+var gcinfoSink interface{}
+
+func escape(p interface{}) interface{} {
+	gcinfoSink = p
+	return p
+}
+
+const (
+	BitsDead = iota
+	BitsScalar
+	BitsPointer
+	BitsMultiWord
+)
+
+const (
+	BitsString = iota
+	BitsSlice
+	BitsIface
+	BitsEface
+)
+
+type ScalarPtr struct {
+	q int
+	w *int
+	e int
+	r *int
+	t int
+	y *int
+}
+
+var infoScalarPtr = []byte{BitsScalar, BitsPointer, BitsScalar, BitsPointer, BitsScalar, BitsPointer}
+
+type PtrScalar struct {
+	q *int
+	w int
+	e *int
+	r int
+	t *int
+	y int
+}
+
+var infoPtrScalar = []byte{BitsPointer, BitsScalar, BitsPointer, BitsScalar, BitsPointer, BitsScalar}
+
+type Complex struct {
+	q *int
+	w byte
+	e [17]byte
+	r []byte
+	t int
+	y uint16
+	u uint64
+	i string
+}
+
+func infoComplex() []byte {
+	switch runtime.GOARCH {
+	case "386", "arm":
+		return []byte{
+			BitsPointer, BitsScalar, BitsScalar, BitsScalar,
+			BitsScalar, BitsScalar, BitsMultiWord, BitsSlice,
+			BitsScalar, BitsScalar, BitsScalar, BitsScalar,
+			BitsScalar, BitsMultiWord, BitsString,
+		}
+	case "amd64":
+		return []byte{
+			BitsPointer, BitsScalar, BitsScalar, BitsScalar,
+			BitsMultiWord, BitsSlice, BitsScalar, BitsScalar,
+			BitsScalar, BitsScalar, BitsMultiWord, BitsString,
+		}
+	case "amd64p32":
+		return []byte{
+			BitsPointer, BitsScalar, BitsScalar, BitsScalar,
+			BitsScalar, BitsScalar, BitsMultiWord, BitsSlice,
+			BitsScalar, BitsScalar, BitsScalar, BitsScalar,
+			BitsScalar, BitsScalar, BitsMultiWord, BitsString,
+		}
+	default:
+		panic("unknown arch")
+	}
+}
+
+var (
+	// BSS
+	bssScalarPtr ScalarPtr
+	bssPtrScalar PtrScalar
+	bssComplex   Complex
+	bssString    string
+	bssEface     interface{}
+
+	// DATA
+	dataScalarPtr             = ScalarPtr{q: 1}
+	dataPtrScalar             = PtrScalar{w: 1}
+	dataComplex               = Complex{w: 1}
+	dataString                = "foo"
+	dataEface     interface{} = 42
+
+	infoString = []byte{BitsMultiWord, BitsString}
+	infoEface  = []byte{BitsMultiWord, BitsEface}
+)
--- a/src/pkg/runtime/heapdump.c
+++ b/src/pkg/runtime/heapdump.c
@ -52,17 +52,17 @@ enum {
 	TagPanic = 15,
 	TagMemProf = 16,
 	TagAllocSample = 17,
-
-	TypeInfo_Conservative = 127,
 };

 static uintptr* playgcprog(uintptr offset, uintptr *prog, void (*callback)(void*,uintptr,uintptr), void *arg);
-static void dumpfields(uintptr *prog);
-static void dumpefacetypes(void *obj, uintptr size, Type *type, uintptr kind);
+static void dumpfields(BitVector bv);
 static void dumpbvtypes(BitVector *bv, byte *base);
+static BitVector makeheapobjbv(byte *p, uintptr size);

 // fd to write the dump to.
 static uintptr	dumpfd;
+static byte	*tmpbuf;
+static uintptr	tmpbufsize;

 // buffer of pending write data
 enum {
@ -199,34 +199,18 @@ dumptype(Type *t)
 		write(t->x->name->str, t->x->name->len);
 	}
 	dumpbool(t->size > PtrSize || (t->kind & KindNoPointers) == 0);
-	dumpfields((uintptr*)t->gc + 1);
-}
-
-// returns true if object is scannable
-static bool
-scannable(byte *obj)
-{
-	uintptr *b, off, shift;
-
-	off = (uintptr*)obj - (uintptr*)runtime·mheap.arena_start;  // word offset
-	b = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1;
-	shift = off % wordsPerBitmapWord;
-	return ((*b >> shift) & bitScan) != 0;
+	dumpfields((BitVector){0, nil});
 }

 // dump an object
 static void
-dumpobj(byte *obj, uintptr size, Type *type, uintptr kind)
+dumpobj(byte *obj, uintptr size, BitVector bv)
 {
-	if(type != nil) {
-		dumptype(type);
-		dumpefacetypes(obj, size, type, kind);
-	}
-
+	dumpbvtypes(&bv, obj);
 	dumpint(TagObject);
 	dumpint((uintptr)obj);
-	dumpint((uintptr)type);
-	dumpint(kind);
+	dumpint(0); // Type*
+	dumpint(0); // kind
 	dumpmemrange(obj, size);
 }

@ -513,33 +497,19 @@ dumproots(void)
 	dumpint(TagData);
 	dumpint((uintptr)data);
 	dumpmemrange(data, edata - data);
-	dumpfields((uintptr*)gcdata + 1);
+	dumpfields((BitVector){(edata - data)*8, (uint32*)gcdata});

 	// bss segment
 	dumpint(TagBss);
 	dumpint((uintptr)bss);
 	dumpmemrange(bss, ebss - bss);
-	dumpfields((uintptr*)gcbss + 1);
+	dumpfields((BitVector){(ebss - bss)*8, (uint32*)gcbss});

 	// MSpan.types
 	allspans = runtime·mheap.allspans;
 	for(spanidx=0; spanidx<runtime·mheap.nspan; spanidx++) {
 		s = allspans[spanidx];
 		if(s->state == MSpanInUse) {
-			// The garbage collector ignores type pointers stored in MSpan.types:
-			//  - Compiler-generated types are stored outside of heap.
-			//  - The reflect package has runtime-generated types cached in its data structures.
-			//    The garbage collector relies on finding the references via that cache.
-			switch(s->types.compression) {
-			case MTypes_Empty:
-			case MTypes_Single:
-				break;
-			case MTypes_Words:
-			case MTypes_Bytes:
-				dumpotherroot("runtime type info", (byte*)s->types.data);
-				break;
-			}
-
 			// Finalizers
 			for(sp = s->specials; sp != nil; sp = sp->next) {
 				if(sp->kind != KindSpecialFinalizer)
@ -555,18 +525,12 @@ dumproots(void)
 	runtime·iterate_finq(finq_callback);
 }

-// Bit vector of free marks.
-// Needs to be as big as the largest number of objects per span.
-static byte free[PageSize/8];
-
 static void
 dumpobjs(void)
 {
-	uintptr i, j, size, n, off, shift, *bitp, bits, ti, kind;
+	uintptr i, j, size, n, off, shift, *bitp, bits;
 	MSpan *s;
-	MLink *l;
 	byte *p;
-	Type *t;

 	for(i = 0; i < runtime·mheap.nspan; i++) {
 		s = runtime·mheap.allspans[i];
@ -575,36 +539,16 @@ dumpobjs(void)
 		p = (byte*)(s->start << PageShift);
 		size = s->elemsize;
 		n = (s->npages << PageShift) / size;
-		if(n > PageSize/8)
-			runtime·throw("free array doesn't have enough entries");
-		for(l = s->freelist; l != nil; l = l->next) {
-			free[((byte*)l - p) / size] = true;
-		}
 		for(j = 0; j < n; j++, p += size) {
-			if(free[j]) {
-				free[j] = false;
-				continue;
-			}
 			off = (uintptr*)p - (uintptr*)runtime·mheap.arena_start;
 			bitp = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1;
-			shift = off % wordsPerBitmapWord;
-			bits = *bitp >> shift;
+			shift = (off % wordsPerBitmapWord) * gcBits;
+			bits = (*bitp >> shift) & bitMask;

 			// Skip FlagNoGC allocations (stacks)
-			if((bits & bitAllocated) == 0)
+			if(bits != bitAllocated)
 				continue;
-
-			// extract type and kind
-			ti = runtime·gettype(p);
-			t = (Type*)(ti & ~(uintptr)(PtrSize-1));
-			kind = ti & (PtrSize-1);
-			
-			// dump it
-			if(kind == TypeInfo_Chan)
-				t = ((ChanType*)t)->elem; // use element type for chan encoding
-			if(t == nil && scannable(p))
-				kind = TypeInfo_Conservative; // special kind for conservatively scanned objects
-			dumpobj(p, size, t, kind);
+			dumpobj(p, size, makeheapobjbv(p, size));
 		}
 	}
 }
@ -621,7 +565,6 @@ dumpparams(void)
 	else
 		dumpbool(true); // big-endian ptrs
 	dumpint(PtrSize);
-	dumpint(runtime·Hchansize);
 	dumpint((uintptr)runtime·mheap.arena_start);
 	dumpint((uintptr)runtime·mheap.arena_used);
 	dumpint(thechar);
@ -819,6 +762,11 @@ runtime∕debug·WriteHeapDump(uintptr fd)

 	// Reset dump file.
 	dumpfd = 0;
+	if(tmpbuf != nil) {
+		runtime·SysFree(tmpbuf, tmpbufsize, &mstats.other_sys);
+		tmpbuf = nil;
+		tmpbufsize = 0;
+	}

 	// Start up the world again.
 	g->m->gcing = 0;
@ -827,132 +775,17 @@ runtime∕debug·WriteHeapDump(uintptr fd)
 	g->m->locks--;
 }

-// Runs the specified gc program.  Calls the callback for every
-// pointer-like field specified by the program and passes to the
-// callback the kind and offset of that field within the object.
-// offset is the offset in the object of the start of the program.
-// Returns a pointer to the opcode that ended the gc program (either
-// GC_END or GC_ARRAY_NEXT).
-static uintptr*
-playgcprog(uintptr offset, uintptr *prog, void (*callback)(void*,uintptr,uintptr), void *arg)
-{
-	uintptr len, elemsize, i, *end;
-
-	for(;;) {
-		switch(prog[0]) {
-		case GC_END:
-			return prog;
-		case GC_PTR:
-			callback(arg, FieldKindPtr, offset + prog[1]);
-			prog += 3;
-			break;
-		case GC_APTR:
-			callback(arg, FieldKindPtr, offset + prog[1]);
-			prog += 2;
-			break;
-		case GC_ARRAY_START:
-			len = prog[2];
-			elemsize = prog[3];
-			end = nil;
-			for(i = 0; i < len; i++) {
-				end = playgcprog(offset + prog[1] + i * elemsize, prog + 4, callback, arg);
-				if(end[0] != GC_ARRAY_NEXT)
-					runtime·throw("GC_ARRAY_START did not have matching GC_ARRAY_NEXT");
-			}
-			prog = end + 1;
-			break;
-		case GC_ARRAY_NEXT:
-			return prog;
-		case GC_CALL:
-			playgcprog(offset + prog[1], (uintptr*)((byte*)prog + *(int32*)&prog[2]), callback, arg);
-			prog += 3;
-			break;
-		case GC_CHAN_PTR:
-			callback(arg, FieldKindPtr, offset + prog[1]);
-			prog += 3;
-			break;
-		case GC_STRING:
-			callback(arg, FieldKindString, offset + prog[1]);
-			prog += 2;
-			break;
-		case GC_EFACE:
-			callback(arg, FieldKindEface, offset + prog[1]);
-			prog += 2;
-			break;
-		case GC_IFACE:
-			callback(arg, FieldKindIface, offset + prog[1]);
-			prog += 2;
-			break;
-		case GC_SLICE:
-			callback(arg, FieldKindSlice, offset + prog[1]);
-			prog += 3;
-			break;
-		case GC_REGION:
-			playgcprog(offset + prog[1], (uintptr*)prog[3] + 1, callback, arg);
-			prog += 4;
-			break;
-		default:
-			runtime·printf("%D\n", (uint64)prog[0]);
-			runtime·throw("bad gc op");
-		}
-	}
-}
-
-static void
-dump_callback(void *p, uintptr kind, uintptr offset)
-{
-	USED(&p);
-	dumpint(kind);
-	dumpint(offset);
-}
-
 // dumpint() the kind & offset of each field in an object.
 static void
-dumpfields(uintptr *prog)
+dumpfields(BitVector bv)
 {
-	playgcprog(0, prog, dump_callback, nil);
+	dumpbv(&bv, 0);
 	dumpint(FieldKindEol);
 }

-static void
-dumpeface_callback(void *p, uintptr kind, uintptr offset)
-{
-	Eface *e;
-
-	if(kind != FieldKindEface)
-		return;
-	e = (Eface*)((byte*)p + offset);
-	dumptype(e->type);
-}
-
 // The heap dump reader needs to be able to disambiguate
 // Eface entries.  So it needs to know every type that might
-// appear in such an entry.  The following two routines accomplish
-// that.
-
-// Dump all the types that appear in the type field of
-// any Eface contained in obj.
-static void
-dumpefacetypes(void *obj, uintptr size, Type *type, uintptr kind)
-{
-	uintptr i;
-
-	switch(kind) {
-	case TypeInfo_SingleObject:
-		playgcprog(0, (uintptr*)type->gc + 1, dumpeface_callback, obj);
-		break;
-	case TypeInfo_Array:
-		for(i = 0; i <= size - type->size; i += type->size)
-			playgcprog(i, (uintptr*)type->gc + 1, dumpeface_callback, obj);
-		break;
-	case TypeInfo_Chan:
-		if(type->size == 0) // channels may have zero-sized objects in them
-			break;
-		for(i = runtime·Hchansize; i <= size - type->size; i += type->size)
-			playgcprog(i, (uintptr*)type->gc + 1, dumpeface_callback, obj);
-		break;
-	}
-}
+// appear in such an entry.  The following routine accomplishes that.

 // Dump all the types that appear in the type field of
 // any Eface described by this bit vector.
@ -979,3 +812,36 @@ dumpbvtypes(BitVector *bv, byte *base)
 		}
 	}
 }
+
+static BitVector
+makeheapobjbv(byte *p, uintptr size)
+{
+	uintptr off, shift, *bitp, bits, nptr, i;
+	bool mw;
+
+	// Extend the temp buffer if necessary.
+	nptr = size/PtrSize;
+	if(tmpbufsize < nptr*BitsPerPointer/8+1) {
+		if(tmpbuf != nil)
+			runtime·SysFree(tmpbuf, tmpbufsize, &mstats.other_sys);
+		tmpbufsize = nptr*BitsPerPointer/8+1;
+		tmpbuf = runtime·SysAlloc(tmpbufsize, &mstats.other_sys);
+		if(tmpbuf == nil)
+			runtime·throw("heapdump: out of memory");
+	}
+
+	// Copy and compact the bitmap.
+	mw = false;
+	for(i = 0; i < nptr; i++) {
+		off = (uintptr*)(p + i*PtrSize) - (uintptr*)runtime·mheap.arena_start;
+		bitp = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1;
+		shift = (off % wordsPerBitmapWord) * gcBits;
+		bits = (*bitp >> (shift + 2)) & 3;
+		if(!mw && bits == BitsDead)
+			break;  // end of heap object
+		mw = !mw && bits == BitsMultiWord;
+		tmpbuf[i*BitsPerPointer/8] &= ~(3<<((i*BitsPerPointer)%8));
+		tmpbuf[i*BitsPerPointer/8] |= bits<<((i*BitsPerPointer)%8);
+	}
+	return (BitVector){i*BitsPerPointer, (uint32*)tmpbuf};
+}
--- a/src/pkg/runtime/malloc.goc
+++ b/src/pkg/runtime/malloc.goc
@ -22,8 +22,6 @@ MHeap runtime·mheap;
 #pragma dataflag NOPTR
 MStats mstats;

-int32	runtime·checking;
-
 extern MStats mstats;	// defined in zruntime_def_$GOOS_$GOARCH.go

 extern volatile intgo runtime·MemProfileRate;
@ -37,10 +35,10 @@ static void settype(MSpan *s, void *v, uintptr typ);
 // Large objects (> 32 kB) are allocated straight from the heap.
 // If the block will be freed with runtime·free(), typ must be 0.
 void*
-runtime·mallocgc(uintptr size, uintptr typ, uint32 flag)
+runtime·mallocgc(uintptr size, Type *typ, uint32 flag)
 {
 	int32 sizeclass;
-	uintptr tinysize, size1;
+	uintptr tinysize, size0, size1;
 	intgo rate;
 	MCache *c;
 	MSpan *s;
@ -60,9 +58,7 @@ runtime·mallocgc(uintptr size, uintptr typ, uint32 flag)
 	g->m->locks++;
 	g->m->mallocing = 1;

-	if(DebugTypeAtBlockEnd)
-		size += sizeof(uintptr);
-
+	size0 = size;
 	c = g->m->mcache;
 	if(!runtime·debug.efence && size <= MaxSmallSize) {
 		if((flag&(FlagNoScan|FlagNoGC)) == FlagNoScan && size < TinySize) {
@ -170,19 +166,10 @@ runtime·mallocgc(uintptr size, uintptr typ, uint32 flag)
 		v = (void*)(s->start << PageShift);
 	}

-	if(flag & FlagNoGC)
-		runtime·marknogc(v);
-	else if(!(flag & FlagNoScan))
-		runtime·markscan(v);
-
-	if(DebugTypeAtBlockEnd)
-		*(uintptr*)((uintptr)v+size-sizeof(uintptr)) = typ;
+	if(!(flag & FlagNoGC))
+		runtime·markallocated(v, size, size0, typ, !(flag&FlagNoScan));

 	g->m->mallocing = 0;
-	// TODO: save type even if FlagNoScan?  Potentially expensive but might help
-	// heap profiling/tracing.
-	if(UseSpanType && !(flag & FlagNoScan) && typ != 0)
-		settype(s, v, typ);

 	if(raceenabled)
 		runtime·racemalloc(v, size);
@ -261,7 +248,7 @@ profilealloc(void *v, uintptr size)
 void*
 runtime·malloc(uintptr size)
 {
-	return runtime·mallocgc(size, 0, FlagNoInvokeGC);
+	return runtime·mallocgc(size, nil, FlagNoInvokeGC);
 }

 // Free the object whose base pointer is v.
@ -311,7 +298,7 @@ runtime·free(void *v)
 		// Must mark v freed before calling unmarkspan and MHeap_Free:
 		// they might coalesce v into other spans and change the bitmap further.
 		runtime·markfreed(v);
-		runtime·unmarkspan(v, 1<<PageShift);
+		runtime·unmarkspan(v, s->npages<<PageShift);
 		// NOTE(rsc,dvyukov): The original implementation of efence
 		// in CL 22060046 used SysFree instead of SysFault, so that
 		// the operating system would eventually give the memory
@ -326,9 +313,10 @@ runtime·free(void *v)
 		// have mysterious crashes due to confused memory reuse.
 		// It should be possible to switch back to SysFree if we also 
 		// implement and then call some kind of MHeap_DeleteSpan.
-		if(runtime·debug.efence)
+		if(runtime·debug.efence) {
+			s->limit = nil;	// prevent mlookup from finding this span
 			runtime·SysFault((void*)(s->start<<PageShift), size);
-		else
+		} else
 			runtime·MHeap_Free(&runtime·mheap, s, 1);
 		c->local_nlargefree++;
 		c->local_largefree += size;
@ -376,7 +364,6 @@ runtime·mlookup(void *v, byte **base, uintptr *size, MSpan **sp)
 	if(sp)
 		*sp = s;
 	if(s == nil) {
-		runtime·checkfreed(v, 1);
 		if(base)
 			*base = nil;
 		if(size)
@ -713,140 +700,38 @@ runtime·persistentalloc(uintptr size, uintptr align, uint64 *stat)
 	return p;
 }

-static void
-settype(MSpan *s, void *v, uintptr typ)
-{
-	uintptr size, ofs, j, t;
-	uintptr ntypes, nbytes2, nbytes3;
-	uintptr *data2;
-	byte *data3;
-
-	if(s->sizeclass == 0) {
-		s->types.compression = MTypes_Single;
-		s->types.data = typ;
-		return;
-	}
-	size = s->elemsize;
-	ofs = ((uintptr)v - (s->start<<PageShift)) / size;
-
-	switch(s->types.compression) {
-	case MTypes_Empty:
-		ntypes = (s->npages << PageShift) / size;
-		nbytes3 = 8*sizeof(uintptr) + 1*ntypes;
-		data3 = runtime·mallocgc(nbytes3, 0, FlagNoProfiling|FlagNoScan|FlagNoInvokeGC);
-		s->types.compression = MTypes_Bytes;
-		s->types.data = (uintptr)data3;
-		((uintptr*)data3)[1] = typ;
-		data3[8*sizeof(uintptr) + ofs] = 1;
-		break;
-		
-	case MTypes_Words:
-		((uintptr*)s->types.data)[ofs] = typ;
-		break;
-		
-	case MTypes_Bytes:
-		data3 = (byte*)s->types.data;
-		for(j=1; j<8; j++) {
-			if(((uintptr*)data3)[j] == typ) {
-				break;
-			}
-			if(((uintptr*)data3)[j] == 0) {
-				((uintptr*)data3)[j] = typ;
-				break;
-			}
-		}
-		if(j < 8) {
-			data3[8*sizeof(uintptr) + ofs] = j;
-		} else {
-			ntypes = (s->npages << PageShift) / size;
-			nbytes2 = ntypes * sizeof(uintptr);
-			data2 = runtime·mallocgc(nbytes2, 0, FlagNoProfiling|FlagNoScan|FlagNoInvokeGC);
-			s->types.compression = MTypes_Words;
-			s->types.data = (uintptr)data2;
-			
-			// Move the contents of data3 to data2. Then deallocate data3.
-			for(j=0; j<ntypes; j++) {
-				t = data3[8*sizeof(uintptr) + j];
-				t = ((uintptr*)data3)[t];
-				data2[j] = t;
-			}
-			data2[ofs] = typ;
-		}
-		break;
-	}
-}
-
-uintptr
-runtime·gettype(void *v)
-{
-	MSpan *s;
-	uintptr t, ofs;
-	byte *data;
-
-	s = runtime·MHeap_LookupMaybe(&runtime·mheap, v);
-	if(s != nil) {
-		t = 0;
-		switch(s->types.compression) {
-		case MTypes_Empty:
-			break;
-		case MTypes_Single:
-			t = s->types.data;
-			break;
-		case MTypes_Words:
-			ofs = (uintptr)v - (s->start<<PageShift);
-			t = ((uintptr*)s->types.data)[ofs/s->elemsize];
-			break;
-		case MTypes_Bytes:
-			ofs = (uintptr)v - (s->start<<PageShift);
-			data = (byte*)s->types.data;
-			t = data[8*sizeof(uintptr) + ofs/s->elemsize];
-			t = ((uintptr*)data)[t];
-			break;
-		default:
-			runtime·throw("runtime·gettype: invalid compression kind");
-		}
-		if(0) {
-			runtime·printf("%p -> %d,%X\n", v, (int32)s->types.compression, (int64)t);
-		}
-		return t;
-	}
-	return 0;
-}
-
 // Runtime stubs.

 void*
 runtime·mal(uintptr n)
 {
-	return runtime·mallocgc(n, 0, 0);
+	return runtime·mallocgc(n, nil, 0);
 }

 #pragma textflag NOSPLIT
 func new(typ *Type) (ret *uint8) {
-	ret = runtime·mallocgc(typ->size, (uintptr)typ | TypeInfo_SingleObject, typ->kind&KindNoPointers ? FlagNoScan : 0);
+	ret = runtime·mallocgc(typ->size, typ, typ->kind&KindNoPointers ? FlagNoScan : 0);
 }

 static void*
-cnew(Type *typ, intgo n, int32 objtyp)
+cnew(Type *typ, intgo n)
 {
-	if((objtyp&(PtrSize-1)) != objtyp)
-		runtime·throw("runtime: invalid objtyp");
 	if(n < 0 || (typ->size > 0 && n > MaxMem/typ->size))
 		runtime·panicstring("runtime: allocation size out of range");
-	return runtime·mallocgc(typ->size*n, (uintptr)typ | objtyp, typ->kind&KindNoPointers ? FlagNoScan : 0);
+	return runtime·mallocgc(typ->size*n, typ, typ->kind&KindNoPointers ? FlagNoScan : 0);
 }

 // same as runtime·new, but callable from C
 void*
 runtime·cnew(Type *typ)
 {
-	return cnew(typ, 1, TypeInfo_SingleObject);
+	return cnew(typ, 1);
 }

 void*
 runtime·cnewarray(Type *typ, intgo n)
 {
-	return cnew(typ, n, TypeInfo_Array);
+	return cnew(typ, n);
 }

 func GC() {
@ -868,7 +753,7 @@ func SetFinalizer(obj Eface, finalizer Eface) {
 		runtime·printf("runtime.SetFinalizer: first argument is nil interface\n");
 		goto throw;
 	}
-	if(obj.type->kind != KindPtr) {
+	if((obj.type->kind&KindMask) != KindPtr) {
 		runtime·printf("runtime.SetFinalizer: first argument is %S, not pointer\n", *obj.type->string);
 		goto throw;
 	}
@ -937,3 +822,9 @@ badfunc:
 throw:
 	runtime·throw("runtime.SetFinalizer");
 }
+
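+// GCMask is for testing: it returns the mask that runtime·getgcmask reports for x, as a slice with cap equal to len.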
+func GCMask(x Eface) (mask Slice) {
+	runtime·getgcmask(x.data, x.type, &mask.array, &mask.len);
+	mask.cap = mask.len;
+}
--- a/src/pkg/runtime/malloc.h
+++ b/src/pkg/runtime/malloc.h
@ -85,7 +85,6 @@ typedef struct MHeap	MHeap;
 typedef struct MSpan	MSpan;
 typedef struct MStats	MStats;
 typedef struct MLink	MLink;
-typedef struct MTypes	MTypes;
 typedef struct GCStats	GCStats;

 enum
@ -348,43 +347,6 @@ void	runtime·MCache_Free(MCache *c, MLink *p, int32 sizeclass, uintptr size);
 void	runtime·MCache_ReleaseAll(MCache *c);
 void	runtime·stackcache_clear(MCache *c);

-// MTypes describes the types of blocks allocated within a span.
-// The compression field describes the layout of the data.
-//
-// MTypes_Empty:
-//     All blocks are free, or no type information is available for
-//     allocated blocks.
-//     The data field has no meaning.
-// MTypes_Single:
-//     The span contains just one block.
-//     The data field holds the type information.
-//     The sysalloc field has no meaning.
-// MTypes_Words:
-//     The span contains multiple blocks.
-//     The data field points to an array of type [NumBlocks]uintptr,
-//     and each element of the array holds the type of the corresponding
-//     block.
-// MTypes_Bytes:
-//     The span contains at most seven different types of blocks.
-//     The data field points to the following structure:
-//         struct {
-//             type  [8]uintptr       // type[0] is always 0
-//             index [NumBlocks]byte
-//         }
-//     The type of the i-th block is: data.type[data.index[i]]
-enum
-{
-	MTypes_Empty = 0,
-	MTypes_Single = 1,
-	MTypes_Words = 2,
-	MTypes_Bytes = 3,
-};
-struct MTypes
-{
-	byte	compression;	// one of MTypes_*
-	uintptr	data;
-};
-
 enum
 {
 	KindSpecialFinalizer = 1,
@ -454,7 +416,6 @@ struct MSpan
 	int64   unusedsince;	// First time spotted by GC in MSpanFree state
 	uintptr npreleased;	// number of pages released to the OS
 	byte	*limit;		// end of data in span
-	MTypes	types;		// types of allocated objects in this span
 	Lock	specialLock;	// guards specials list
 	Special	*specials;	// linked list of special records sorted by offset.
 	MLink	*freebuf;	// objects freed explicitly, not incorporated into freelist yet
@ -554,28 +515,22 @@ void	runtime·MHeap_MapBits(MHeap *h);
 void	runtime·MHeap_MapSpans(MHeap *h);
 void	runtime·MHeap_Scavenger(void);

-void*	runtime·mallocgc(uintptr size, uintptr typ, uint32 flag);
+void*	runtime·mallocgc(uintptr size, Type* typ, uint32 flag);
 void*	runtime·persistentalloc(uintptr size, uintptr align, uint64 *stat);
 int32	runtime·mlookup(void *v, byte **base, uintptr *size, MSpan **s);
 void	runtime·gc(int32 force);
 uintptr	runtime·sweepone(void);
-void	runtime·markscan(void *v);
-void	runtime·marknogc(void *v);
-void	runtime·checkallocated(void *v, uintptr n);
+void	runtime·markallocated(void *v, uintptr size, uintptr size0, Type* typ, bool scan);
 void	runtime·markfreed(void *v);
-void	runtime·checkfreed(void *v, uintptr n);
-extern	int32	runtime·checking;
 void	runtime·markspan(void *v, uintptr size, uintptr n, bool leftover);
 void	runtime·unmarkspan(void *v, uintptr size);
 void	runtime·purgecachedstats(MCache*);
 void*	runtime·cnew(Type*);
 void*	runtime·cnewarray(Type*, intgo);
-void	runtime·tracealloc(void*, uintptr, uintptr);
+void	runtime·tracealloc(void*, uintptr, Type*);
 void	runtime·tracefree(void*, uintptr);
 void	runtime·tracegc(void);

-uintptr	runtime·gettype(void*);
-
 enum
 {
 	// flags to malloc
@ -595,6 +550,7 @@ void	runtime·helpgc(int32 nproc);
 void	runtime·gchelper(void);
 void	runtime·createfing(void);
 G*	runtime·wakefing(void);
+void	runtime·getgcmask(byte*, Type*, byte**, uintptr*);
 extern bool	runtime·fingwait;
 extern bool	runtime·fingwake;

@ -607,16 +563,6 @@ void	runtime·queuefinalizer(byte *p, FuncVal *fn, uintptr nret, Type *fint, Ptr
 void	runtime·freeallspecials(MSpan *span, void *p, uintptr size);
 bool	runtime·freespecial(Special *s, void *p, uintptr size, bool freed);

-enum
-{
-	TypeInfo_SingleObject = 0,
-	TypeInfo_Array = 1,
-	TypeInfo_Chan = 2,
-
-	// Enables type information at the end of blocks allocated from heap	
-	DebugTypeAtBlockEnd = 0,
-};
-
 // Information from the compiler about the layout of stack frames.
 typedef struct BitVector BitVector;
 struct BitVector
@ -631,20 +577,6 @@ struct StackMap
 	int32 nbit; // number of bits in each bitmap
 	uint32 data[];
 };
-enum {
-	// Pointer map
-	BitsPerPointer = 2,
-	BitsDead = 0,
-	BitsScalar = 1,
-	BitsPointer = 2,
-	BitsMultiWord = 3,
-	// BitsMultiWord will be set for the first word of a multi-word item.
-	// When it is set, one of the following will be set for the second word.
-	BitsString = 0,
-	BitsSlice = 1,
-	BitsIface = 2,
-	BitsEface = 3,
-};
 // Returns pointer map data for the given stackmap index
 // (the index is encoded in PCDATA_StackMapIndex).
 BitVector	runtime·stackmapdata(StackMap *stackmap, int32 n);
@ -654,7 +586,6 @@ void	runtime·gc_m_ptr(Eface*);
 void	runtime·gc_g_ptr(Eface*);
 void	runtime·gc_itab_ptr(Eface*);

-void	runtime·memorydump(void);
 int32	runtime·setgcpercent(int32);

 // Value we use to mark dead pointers when GODEBUG=gcdead=1.
--- a/src/pkg/runtime/malloc_test.go
+++ b/src/pkg/runtime/malloc_test.go
@ -68,6 +68,19 @@ func BenchmarkMallocTypeInfo16(b *testing.B) {
 	mallocSink = x
 }

+type LargeStruct struct {
+	x [16][]byte
+}
+
+func BenchmarkMallocLargeStruct(b *testing.B) {
+	var x uintptr
+	for i := 0; i < b.N; i++ {
+		p := make([]LargeStruct, 2)
+		x ^= uintptr(unsafe.Pointer(&p[0]))
+	}
+	mallocSink = x
+}
+
 var n = flag.Int("n", 1000, "number of goroutines")

 func BenchmarkGoroutineSelect(b *testing.B) {
--- a/src/pkg/runtime/mgc0.c
+++ b/src/pkg/runtime/mgc0.c
--- a/src/pkg/runtime/mgc0.h
+++ b/src/pkg/runtime/mgc0.h
@ -4,84 +4,76 @@

 // Garbage collector (GC)

-// GC instruction opcodes.
-//
-// The opcode of an instruction is followed by zero or more
-// arguments to the instruction.
-//
-// Meaning of arguments:
-//   off      Offset (in bytes) from the start of the current object
-//   objgc    Pointer to GC info of an object
-//   objgcrel Offset to GC info of an object
-//   len      Length of an array
-//   elemsize Size (in bytes) of an element
-//   size     Size (in bytes)
-//
-// NOTE: There is a copy of these in ../reflect/type.go.
-// They must be kept in sync.
-enum {
-	GC_END,         // End of object, loop or subroutine. Args: none
-	GC_PTR,         // A typed pointer. Args: (off, objgc)
-	GC_APTR,        // Pointer to an arbitrary object. Args: (off)
-	GC_ARRAY_START, // Start an array with a fixed length. Args: (off, len, elemsize)
-	GC_ARRAY_NEXT,  // The next element of an array. Args: none
-	GC_CALL,        // Call a subroutine. Args: (off, objgcrel)
-	GC_CHAN_PTR,    // Go channel. Args: (off, ChanType*)
-	GC_STRING,      // Go string. Args: (off)
-	GC_EFACE,       // interface{}. Args: (off)
-	GC_IFACE,       // interface{...}. Args: (off)
-	GC_SLICE,       // Go slice. Args: (off, objgc)
-	GC_REGION,      // A region/part of the current object. Args: (off, size, objgc)
-
-	GC_NUM_INSTR,   // Number of instruction opcodes
-};
-
-enum {
-	// Size of GC's fixed stack.
-	//
-	// The current GC implementation permits:
-	//  - at most 1 stack allocation because of GC_CALL
-	//  - at most GC_STACK_CAPACITY allocations because of GC_ARRAY_START
-	GC_STACK_CAPACITY = 8,	
-};
-
 enum {
 	ScanStackByFrames = 1,
-	IgnorePreciseGC = 0,

 	// Four bits per word (see #defines below).
 	wordsPerBitmapWord = sizeof(void*)*8/4,
-	bitShift = sizeof(void*)*8/4,
+	gcBits = 4,
+
+	// GC type info programs.
+	// The programs allow type info required for GC to be stored in a compact form.
+	// Most importantly, arrays take O(1) space instead of O(n).
+	// The program grammar is:
+	//
+	// Program = {Block} "insEnd"
+	// Block = Data | Array
+	// Data = "insData" DataSize DataBlock
+	// DataSize = int // size of the DataBlock in bit pairs, 1 byte
+	// DataBlock = binary // dense GC mask (2 bits per word) of size ceil(DataSize/4) bytes
+	// Array = "insArray" ArrayLen Block "insArrayEnd"
+	// ArrayLen = int // length of the array, 8 bytes (4 bytes for 32-bit arch)
+	//
+	// Each instruction (insData, insArray, etc) is 1 byte.
+	// For example, for type struct { x []byte; y [20]struct{ z int; w *byte }; }
+	// the program looks as:
+	//
+	// insData 3 (BitsMultiWord BitsSlice BitsScalar)
+	//	insArray 20 insData 2 (BitsScalar BitsPointer) insArrayEnd insEnd
+	//
+	// Total size of the program is 17 bytes (13 bytes on 32-bit systems).
+	// The corresponding GC mask would take 43 bytes (it would be repeated
+	// because the type has an odd number of words).
+	insData = 1,
+	insArray,
+	insArrayEnd,
+	insEnd,
+
+	// Pointer map
+	BitsPerPointer	= 2,
+	BitsMask	= (1<<BitsPerPointer)-1,
+	PointersPerByte	= 8/BitsPerPointer,
+
+	BitsDead	= 0,
+	BitsScalar	= 1,
+	BitsPointer	= 2,
+	BitsMultiWord	= 3,
+	// BitsMultiWord will be set for the first word of a multi-word item.
+	// When it is set, one of the following will be set for the second word.
+	BitsString	= 0,
+	BitsSlice	= 1,
+	BitsIface	= 2,
+	BitsEface	= 3,
+
+	MaxGCMask	= 0,	// disabled because it wastes several bytes of memory
 };

 // Bits in per-word bitmap.
-// #defines because enum might not be able to hold the values.
+// #defines because we shift the values beyond 32 bits.
 //
 // Each word in the bitmap describes wordsPerBitmapWord words
 // of heap memory.  There are 4 bitmap bits dedicated to each heap word,
 // so on a 64-bit system there is one bitmap word per 16 heap words.
-// The bits in the word are packed together by type first, then by
-// heap location, so each 64-bit bitmap word consists of, from top to bottom,
-// the 16 bitMarked bits for the corresponding heap words,
-// then the 16 bitScan/bitBlockBoundary bits, then the 16 bitAllocated bits.
-// This layout makes it easier to iterate over the bits of a given type.
 //
 // The bitmap starts at mheap.arena_start and extends *backward* from
 // there.  On a 64-bit system the off'th word in the arena is tracked by
 // the off/16+1'th word before mheap.arena_start.  (On a 32-bit system,
 // the only difference is that the divisor is 8.)
-//
-// To pull out the bits corresponding to a given pointer p, we use:
-//
-//	off = p - (uintptr*)mheap.arena_start;  // word offset
-//	b = (uintptr*)mheap.arena_start - off/wordsPerBitmapWord - 1;
-//	shift = off % wordsPerBitmapWord
-//	bits = *b >> shift;
-//	/* then test bits & bitAllocated, bits & bitMarked, etc. */
-//
-#define bitAllocated		((uintptr)1<<(bitShift*0))	/* block start; eligible for garbage collection */
-#define bitScan			((uintptr)1<<(bitShift*1))	/* when bitAllocated is set */
-#define bitMarked		((uintptr)1<<(bitShift*2))	/* when bitAllocated is set */
-#define bitBlockBoundary	((uintptr)1<<(bitShift*1))	/* when bitAllocated is NOT set - mark for FlagNoGC objects */

-#define bitMask (bitAllocated | bitScan | bitMarked)
+#define bitMiddle	((uintptr)0) // middle of an object
+#define bitBoundary	((uintptr)1) // boundary on a non-allocated object
+#define bitAllocated	((uintptr)2) // boundary on an allocated object
+#define bitMarked	((uintptr)3) // boundary on an allocated and marked object
+
+#define bitMask		((uintptr)bitMiddle|bitBoundary|bitAllocated|bitMarked)
+#define bitPtrMask	((uintptr)BitsMask<<2)
--- a/src/pkg/runtime/mheap.c
+++ b/src/pkg/runtime/mheap.c
@ -195,7 +195,6 @@ mheap_alloc(MHeap *h, uintptr npage, int32 sizeclass, bool large)
 		s->ref = 0;
 		s->sizeclass = sizeclass;
 		s->elemsize = (sizeclass==0 ? s->npages<<PageShift : runtime·class_to_size[sizeclass]);
-		s->types.compression = MTypes_Empty;

 		// update stats, sweep lists
 		if(large) {
@ -468,7 +467,6 @@ mheap_free(MHeap *h, MSpan *s, int32 acct)
 		mstats.heap_alloc -= s->npages<<PageShift;
 		mstats.heap_objects--;
 	}
-	s->types.compression = MTypes_Empty;
 	MHeap_FreeSpanLocked(h, s);
 	runtime·unlock(h);
 }
@ -713,7 +711,6 @@ runtime·MSpan_Init(MSpan *span, PageID start, uintptr npages)
 	span->state = MSpanDead;
 	span->unusedsince = 0;
 	span->npreleased = 0;
-	span->types.compression = MTypes_Empty;
 	span->specialLock.key = 0;
 	span->specials = nil;
 	span->needzero = 0;
--- a/src/pkg/runtime/mprof.goc
+++ b/src/pkg/runtime/mprof.goc
@ -409,33 +409,15 @@ func GoroutineProfile(b Slice) (n int, ok bool) {

 static Lock tracelock;

-static int8*
-typeinfoname(int32 typeinfo)
-{
-	if(typeinfo == TypeInfo_SingleObject)
-		return "single object";
-	else if(typeinfo == TypeInfo_Array)
-		return "array";
-	else if(typeinfo == TypeInfo_Chan)
-		return "channel";
-	runtime·throw("typinfoname: unknown type info");
-	return nil;
-}
-
 void
-runtime·tracealloc(void *p, uintptr size, uintptr typ)
+runtime·tracealloc(void *p, uintptr size, Type *type)
 {
-	int8 *name;
-	Type *type;
-
 	runtime·lock(&tracelock);
 	g->m->traceback = 2;
-	type = (Type*)(typ & ~3);
-	name = typeinfoname(typ & 3);
 	if(type == nil)
-		runtime·printf("tracealloc(%p, %p, %s)\n", p, size, name);
+		runtime·printf("tracealloc(%p, %p)\n", p, size);
 	else	
-		runtime·printf("tracealloc(%p, %p, %s of %S)\n", p, size, name, *type->string);
+		runtime·printf("tracealloc(%p, %p, %S)\n", p, size, *type->string);
 	if(g->m->curg == nil || g == g->m->curg) {
 		runtime·goroutineheader(g);
 		runtime·traceback((uintptr)runtime·getcallerpc(&p), (uintptr)runtime·getcallersp(&p), 0, g);
--- a/src/pkg/runtime/proc.c
+++ b/src/pkg/runtime/proc.c
@ -9,6 +9,7 @@
 #include "stack.h"
 #include "race.h"
 #include "type.h"
+#include "mgc0.h"
 #include "../../cmd/ld/textflag.h"

 // Goroutine scheduler
--- a/src/pkg/runtime/race.c
+++ b/src/pkg/runtime/race.c
@ -152,7 +152,7 @@ runtime·racewriteobjectpc(void *addr, Type *t, void *callpc, void *pc)
 {
 	uint8 kind;

-	kind = t->kind & ~KindNoPointers;
+	kind = t->kind & KindMask;
 	if(kind == KindArray || kind == KindStruct)
 		runtime·racewriterangepc(addr, t->size, callpc, pc);
 	else
@ -164,7 +164,7 @@ runtime·racereadobjectpc(void *addr, Type *t, void *callpc, void *pc)
 {
 	uint8 kind;

-	kind = t->kind & ~KindNoPointers;
+	kind = t->kind & KindMask;
 	if(kind == KindArray || kind == KindStruct)
 		runtime·racereadrangepc(addr, t->size, callpc, pc);
 	else
--- a/src/pkg/runtime/runtime.h
+++ b/src/pkg/runtime/runtime.h
@ -756,7 +756,6 @@ extern	int32	runtime·ncpu;
 extern	bool	runtime·iscgo;
 extern 	void	(*runtime·sysargs)(int32, uint8**);
 extern	uintptr	runtime·maxstring;
-extern	uint32	runtime·Hchansize;
 extern	uint32	runtime·cpuid_ecx;
 extern	uint32	runtime·cpuid_edx;
 extern	DebugVars	runtime·debug;
--- a/src/pkg/runtime/slice.goc
+++ b/src/pkg/runtime/slice.goc
@ -126,7 +126,7 @@ growslice1(SliceType *t, Slice x, intgo newcap, Slice *ret)
 	// Can't use FlagNoZero w/o FlagNoScan, because otherwise GC can scan unitialized memory.
 	if(typ->kind&KindNoPointers)
 		flag = FlagNoScan|FlagNoZero;
-	ret->array = runtime·mallocgc(capmem, (uintptr)typ|TypeInfo_Array, flag);
+	ret->array = runtime·mallocgc(capmem, typ, flag);
 	ret->len = x.len;
 	ret->cap = newcap1;
 	lenmem = x.len*typ->size;
--- a/src/pkg/runtime/stack.c
+++ b/src/pkg/runtime/stack.c
@ -10,6 +10,7 @@
 #include "typekind.h"
 #include "type.h"
 #include "race.h"
+#include "mgc0.h"
 #include "../../cmd/ld/textflag.h"

 enum
--- a/src/pkg/runtime/type.go
+++ b/src/pkg/runtime/type.go
@ -22,7 +22,7 @@ type rtype struct {
 	fieldAlign uint8
 	kind       uint8
 	alg        unsafe.Pointer
-	gc         unsafe.Pointer
+	gc         [2]unsafe.Pointer
 	string     *string
 	*uncommonType
 	ptrToThis *rtype
--- a/src/pkg/runtime/type.h
+++ b/src/pkg/runtime/type.h
@ -16,7 +16,8 @@ typedef struct IMethod IMethod;
 typedef struct SliceType SliceType;
 typedef struct FuncType FuncType;

-// Needs to be in sync with ../../cmd/ld/decodesym.c:/^commonsize
+// Needs to be in sync with ../../cmd/ld/decodesym.c:/^commonsize,
+// pkg/reflect/type.go:/type and type.go:/rtype
 struct Type
 {
 	uintptr size;
@ -26,7 +27,17 @@ struct Type
 	uint8 fieldAlign;
 	uint8 kind;
 	Alg *alg;
-	void *gc;
+	// gc stores the type info required by the garbage collector.
+	// If (kind&KindGCProg)==0, then gc directly contains a sparse GC bitmap
+	// (no indirection), 4 bits per word.
+	// If (kind&KindGCProg)!=0, then gc[1] points to a compiler-generated
+	// read-only GC program, and gc[0] points to BSS space for the sparse GC bitmap.
+	// For huge types (>MaxGCMask), the runtime unrolls the program directly into
+	// the GC bitmap and gc[0] is not used. For moderately-sized types, the runtime
+	// unrolls the program into the gc[0] space on first use. The first byte of gc[0]
+	// (gc[0][0]) contains the 'unroll' flag saying whether the program is already
+	// unrolled into gc[0] or not.
+	uintptr gc[2];
 	String *string;
 	UncommonType *x;
 	Type *ptrto;
--- a/src/pkg/runtime/typekind.h
+++ b/src/pkg/runtime/typekind.h
@ -33,6 +33,8 @@ enum {
 	KindStruct,
 	KindUnsafePointer,

+	KindGCProg = 1<<6,	// Type.gc points to GC program
 	KindNoPointers = 1<<7,
+	KindMask = (1<<6)-1,
 };