pprof: add goroutine blocking profiling

The profiler collects goroutine blocking information similar to Google Perf Tools. You may see an example of the profile (converted to svg) attached to http://code.google.com/p/go/issues/detail?id=3946 The public API changes are: +pkg runtime, func BlockProfile([]BlockProfileRecord) (int, bool) +pkg runtime, func SetBlockProfileRate(int) +pkg runtime, method (*BlockProfileRecord) Stack() []uintptr +pkg runtime, type BlockProfileRecord struct +pkg runtime, type BlockProfileRecord struct, Count int64 +pkg runtime, type BlockProfileRecord struct, Cycles int64 +pkg runtime, type BlockProfileRecord struct, embedded StackRecord R=rsc, dave, minux.ma, r CC=gobot, golang-dev, r, remyoudompheng https://golang.org/cl/6443115
2025-12-08 06:10:04 +00:00 · 2012-10-06 12:56:04 +04:00 · 2012-10-06 12:56:04 +04:00 · 4cc7bf326a
commit 4cc7bf326a
parent ebb0e5db75
12 changed files with 350 additions and 52 deletions
--- a/src/cmd/go/test.go
+++ b/src/cmd/go/test.go
@ -112,6 +112,18 @@ directory containing the package sources, has its own flags:
 	    garbage collector, provided the test can run in the available
 	    memory without garbage collection.
 	-test.blockprofile block.out
 	    Write a goroutine blocking profile to the specified file
 	    when all tests are complete.
 	-test.blockprofilerate n
 	    Control the detail provided in goroutine blocking profiles by calling
 	    runtime.SetBlockProfileRate with n.  See 'godoc runtime SetBlockProfileRate'.
 	    The profiler aims to sample, on average, one blocking event every
 	    n nanoseconds the program spends blocked.  By default,
 	    if -test.blockprofile is set without this flag, all blocking events
 	    are recorded, equivalent to -test.blockprofilerate=1.
 	-test.parallel n
 	    Allow parallel execution of test functions that call t.Parallel.
 	    The value of this flag is the maximum number of tests to run
--- a/src/cmd/go/testflag.go
+++ b/src/cmd/go/testflag.go
@ -31,6 +31,8 @@ var usageMessage = `Usage of go test:
  -cpuprofile="": passes -test.cpuprofile to test
  -memprofile="": passes -test.memprofile to test
  -memprofilerate=0: passes -test.memprofilerate to test
  -blockprofile="": passes -test.blockprofile to test
  -blockprofilerate=0: passes -test.blockprofilerate to test
  -parallel=0: passes -test.parallel to test
  -run="": passes -test.run to test
  -short=false: passes -test.short to test
@ -82,6 +84,8 @@ var testFlagDefn = []*testFlagSpec{
 	{name: "cpuprofile", passToTest: true},
 	{name: "memprofile", passToTest: true},
 	{name: "memprofilerate", passToTest: true},
 	{name: "blockprofile", passToTest: true},
 	{name: "blockprofilerate", passToTest: true},
 	{name: "parallel", passToTest: true},
 	{name: "run", passToTest: true},
 	{name: "short", boolVar: new(bool), passToTest: true},
--- a/src/pkg/net/http/pprof/pprof.go
+++ b/src/pkg/net/http/pprof/pprof.go
@ -30,6 +30,10 @@
 //
 //	go tool pprof http://localhost:6060/debug/pprof/profile
 //
 // Or to look at the goroutine blocking profile:
 //
 //	go tool pprof http://localhost:6060/debug/pprof/block
 //
 // Or to view all available profiles:
 //
 //	go tool pprof http://localhost:6060/debug/pprof/
--- a/src/pkg/runtime/chan.c
+++ b/src/pkg/runtime/chan.c
@ -22,6 +22,7 @@ struct	SudoG
 	G*	g;		// g and selgen constitute
 	uint32	selgen;		// a weak pointer to g
 	SudoG*	link;
 	int64	releasetime;
 	byte*	elem;		// data element
 };
@ -154,6 +155,7 @@ runtime·chansend(ChanType *t, Hchan *c, byte *ep, bool *pres)
 	SudoG *sg;
 	SudoG mysg;
 	G* gp;
 	int64 t0;
 	if(c == nil) {
 		USED(t);
@ -174,6 +176,13 @@ runtime·chansend(ChanType *t, Hchan *c, byte *ep, bool *pres)
 		runtime·prints("\n");
 	}
 	t0 = 0;
 	mysg.releasetime = 0;
 	if(runtime·blockprofilerate > 0) {
 		t0 = runtime·cputicks();
 		mysg.releasetime = -1;
 	}
 	runtime·lock(c);
 	if(c->closed)
 		goto closed;
@ -189,6 +198,8 @@ runtime·chansend(ChanType *t, Hchan *c, byte *ep, bool *pres)
 		gp->param = sg;
 		if(sg->elem != nil)
 			c->elemalg->copy(c->elemsize, sg->elem, ep);
 		if(sg->releasetime)
 			sg->releasetime = runtime·cputicks();
 		runtime·ready(gp);
 		if(pres != nil)
@ -216,6 +227,9 @@ runtime·chansend(ChanType *t, Hchan *c, byte *ep, bool *pres)
 		goto closed;
 	}
 	if(mysg.releasetime > 0)
 		runtime·blockevent(mysg.releasetime - t0, 2);
 	return;
 asynch:
@ -246,11 +260,15 @@ asynch:
 	if(sg != nil) {
 		gp = sg->g;
 		runtime·unlock(c);
 		if(sg->releasetime)
 			sg->releasetime = runtime·cputicks();
 		runtime·ready(gp);
 	} else
 		runtime·unlock(c);
 	if(pres != nil)
 		*pres = true;
 	if(mysg.releasetime > 0)
 		runtime·blockevent(mysg.releasetime - t0, 2);
 	return;
 closed:
@ -265,6 +283,7 @@ runtime·chanrecv(ChanType *t, Hchan* c, byte *ep, bool *selected, bool *receive
 	SudoG *sg;
 	SudoG mysg;
 	G *gp;
 	int64 t0;
 	if(runtime·gcwaiting)
 		runtime·gosched();
@ -282,6 +301,13 @@ runtime·chanrecv(ChanType *t, Hchan* c, byte *ep, bool *selected, bool *receive
 		return;  // not reached
 	}
 	t0 = 0;
 	mysg.releasetime = 0;
 	if(runtime·blockprofilerate > 0) {
 		t0 = runtime·cputicks();
 		mysg.releasetime = -1;
 	}
 	runtime·lock(c);
 	if(c->dataqsiz > 0)
 		goto asynch;
@ -297,6 +323,8 @@ runtime·chanrecv(ChanType *t, Hchan* c, byte *ep, bool *selected, bool *receive
 			c->elemalg->copy(c->elemsize, ep, sg->elem);
 		gp = sg->g;
 		gp->param = sg;
 		if(sg->releasetime)
 			sg->releasetime = runtime·cputicks();
 		runtime·ready(gp);
 		if(selected != nil)
@ -328,6 +356,8 @@ runtime·chanrecv(ChanType *t, Hchan* c, byte *ep, bool *selected, bool *receive
 	if(received != nil)
 		*received = true;
 	if(mysg.releasetime > 0)
 		runtime·blockevent(mysg.releasetime - t0, 2);
 	return;
 asynch:
@ -362,6 +392,8 @@ asynch:
 	if(sg != nil) {
 		gp = sg->g;
 		runtime·unlock(c);
 		if(sg->releasetime)
 			sg->releasetime = runtime·cputicks();
 		runtime·ready(gp);
 	} else
 		runtime·unlock(c);
@ -370,6 +402,8 @@ asynch:
 		*selected = true;
 	if(received != nil)
 		*received = true;
 	if(mysg.releasetime > 0)
 		runtime·blockevent(mysg.releasetime - t0, 2);
 	return;
 closed:
@ -380,6 +414,8 @@ closed:
 	if(received != nil)
 		*received = false;
 	runtime·unlock(c);
 	if(mysg.releasetime > 0)
 		runtime·blockevent(mysg.releasetime - t0, 2);
 }
 // chansend1(hchan *chan any, elem any);
--- a/src/pkg/runtime/debug.go
+++ b/src/pkg/runtime/debug.go
@ -138,6 +138,31 @@ func CPUProfile() []byte
 // SetCPUProfileRate directly.
 func SetCPUProfileRate(hz int)
 // SetBlockProfileRate controls the fraction of goroutine blocking events
 // that are reported in the blocking profile.  The profiler aims to sample
 // an average of one blocking event per rate nanoseconds spent blocked.
 //
 // To include every blocking event in the profile, pass rate = 1.
 // To turn off profiling entirely, pass rate <= 0.
 func SetBlockProfileRate(rate int)
 // BlockProfileRecord describes blocking events originated
 // at a particular call sequence (stack trace).
 // Its layout must stay in sync with the runtime's BRecord in mprof.goc,
 // which fills these records in BlockProfile.
 type BlockProfileRecord struct {
 	Count  int64 // number of blocking events recorded for this stack
 	Cycles int64 // total CPU ticks spent blocked across those events
 	StackRecord
 }
 // BlockProfile returns n, the number of records in the current blocking profile.
 // If len(p) >= n, BlockProfile copies the profile into p and returns n, true.
 // If len(p) < n, BlockProfile does not change p and returns n, false.
 //
 // Most clients should use the runtime/pprof package or
 // the testing package's -test.blockprofile flag instead
 // of calling BlockProfile directly.
 func BlockProfile(p []BlockProfileRecord) (n int, ok bool)
 // Stack formats a stack trace of the calling goroutine into buf
 // and returns the number of bytes written to buf.
 // If all is true, Stack formats stack traces of all other goroutines
--- a/src/pkg/runtime/mprof.goc
+++ b/src/pkg/runtime/mprof.goc
@ -15,21 +15,35 @@ package runtime
 // NOTE(rsc): Everything here could use cas if contention became an issue.
 static Lock proflock;
-// Per-call-stack allocation information.
+enum { MProf, BProf };  // profile types
 // Per-call-stack profiling information.
 // Lookup by hashing call stack into a linked-list hash table.
 typedef struct Bucket Bucket;
 struct Bucket
 {
 	Bucket	*next;	// next in hash list
-	Bucket	*allnext;	// next in list of all buckets
+	Bucket	*allnext;	// next in list of all mbuckets/bbuckets
-	uintptr	allocs;
+	int32	typ;
-	uintptr	frees;
+	union
-	uintptr	alloc_bytes;
+	{
-	uintptr	free_bytes;
+		struct  // typ == MProf
-	uintptr	recent_allocs;  // since last gc
+		{
-	uintptr	recent_frees;
+			uintptr	allocs;
-	uintptr	recent_alloc_bytes;
+			uintptr	frees;
-	uintptr	recent_free_bytes;
+			uintptr	alloc_bytes;
 			uintptr	free_bytes;
 			uintptr	recent_allocs;  // since last gc
 			uintptr	recent_frees;
 			uintptr	recent_alloc_bytes;
 			uintptr	recent_free_bytes;
 		};
 		struct  // typ == BProf
 		{
 			int64	count;
 			int64	cycles;
 		};
 	};
 	uintptr	hash;
 	uintptr	nstk;
 	uintptr	stk[1];
@ -38,12 +52,13 @@ enum {
 	BuckHashSize = 179999,
 };
 static Bucket **buckhash;
-static Bucket *buckets;
+static Bucket *mbuckets;  // memory profile buckets
 static Bucket *bbuckets;  // blocking profile buckets
 static uintptr bucketmem;
 // Return the bucket for stk[0:nstk], allocating new bucket if needed.
 static Bucket*
-stkbucket(uintptr *stk, int32 nstk, bool alloc)
+stkbucket(int32 typ, uintptr *stk, int32 nstk, bool alloc)
 {
 	int32 i;
 	uintptr h;
@ -66,7 +81,7 @@ stkbucket(uintptr *stk, int32 nstk, bool alloc)
 	i = h%BuckHashSize;
 	for(b = buckhash[i]; b; b=b->next)
-		if(b->hash == h && b->nstk == nstk &&
+		if(b->typ == typ && b->hash == h && b->nstk == nstk &&
 		   runtime·mcmp((byte*)b->stk, (byte*)stk, nstk*sizeof stk[0]) == 0)
 			return b;
@ -76,12 +91,18 @@ stkbucket(uintptr *stk, int32 nstk, bool alloc)
 	b = runtime·mallocgc(sizeof *b + nstk*sizeof stk[0], FlagNoProfiling, 0, 1);
 	bucketmem += sizeof *b + nstk*sizeof stk[0];
 	runtime·memmove(b->stk, stk, nstk*sizeof stk[0]);
 	b->typ = typ;
 	b->hash = h;
 	b->nstk = nstk;
 	b->next = buckhash[i];
 	buckhash[i] = b;
-	b->allnext = buckets;
+	if(typ == MProf) {
-	buckets = b;
+		b->allnext = mbuckets;
 		mbuckets = b;
 	} else {
 		b->allnext = bbuckets;
 		bbuckets = b;
 	}
 	return b;
 }
@ -92,7 +113,7 @@ runtime·MProf_GC(void)
 	Bucket *b;
 	runtime·lock(&proflock);
-	for(b=buckets; b; b=b->allnext) {
+	for(b=mbuckets; b; b=b->allnext) {
 		b->allocs += b->recent_allocs;
 		b->frees += b->recent_frees;
 		b->alloc_bytes += b->recent_alloc_bytes;
@ -228,7 +249,7 @@ runtime·MProf_Malloc(void *p, uintptr size)
 	m->nomemprof++;
 	nstk = runtime·callers(1, stk, 32);
 	runtime·lock(&proflock);
-	b = stkbucket(stk, nstk, true);
+	b = stkbucket(MProf, stk, nstk, true);
 	b->recent_allocs++;
 	b->recent_alloc_bytes += size;
 	setaddrbucket((uintptr)p, b);
@ -256,6 +277,35 @@ runtime·MProf_Free(void *p, uintptr size)
 	m->nomemprof--;
 }
 int64 runtime·blockprofilerate;  // in CPU ticks
 // runtime·SetBlockProfileRate implements runtime.SetBlockProfileRate.
 // rate is expressed in nanoseconds spent blocked; it is converted to CPU
 // ticks and published atomically in runtime·blockprofilerate, which the
 // blocking paths read to decide whether to time themselves.
 // rate <= 0 turns profiling off (stores 0).
 void
 runtime·SetBlockProfileRate(intgo rate)
 {
 	int64 r;
 	if(rate <= 0)
 		r = 0;	// disable profiling
 	else {
 		// Convert nanoseconds to CPU ticks.  Use float64 so the
 		// intermediate product cannot overflow for large rates.
 		r = (float64)rate*runtime·tickspersecond()/(1000*1000*1000);
 		// A small positive rate on a clock slower than 1 tick/ns would
 		// otherwise truncate to 0 and silently disable profiling,
 		// contradicting the documented "rate = 1 records every event".
 		if(r == 0)
 			r = 1;
 	}
 	runtime·atomicstore64((uint64*)&runtime·blockprofilerate, r);
 }
 // runtime·blockevent records one blocking event that lasted the given
 // number of CPU ticks, attributing it to the current call stack.
 // skip is forwarded to runtime·callers, so it is relative to this
 // function's frame; callers pick it so that the recorded stack starts at
 // the frame they want to blame.
 void
 runtime·blockevent(int64 cycles, int32 skip)
 {
 	int32 nstk;
 	int64 rate;
 	uintptr stk[32];
 	Bucket *b;
 	// Non-positive durations carry no information; drop them.
 	if(cycles <= 0)
 		return;
 	rate = runtime·atomicload64((uint64*)&runtime·blockprofilerate);
 	// Sampling: profiling is off when rate <= 0.  Events lasting at least
 	// rate ticks are always kept; shorter ones are kept only when
 	// fastrand1()%rate does not exceed cycles, so short events are
 	// sampled roughly in proportion to their duration.
 	if(rate <= 0 || (rate > cycles && runtime·fastrand1()%rate > cycles))
 		return;
 	nstk = runtime·callers(skip, stk, 32);
 	// proflock guards the bucket hash table and the per-bucket counters.
 	runtime·lock(&proflock);
 	b = stkbucket(BProf, stk, nstk, true);
 	b->count++;
 	b->cycles += cycles;
 	runtime·unlock(&proflock);
 }
 // Go interface to profile data.  (Declared in extern.go)
 // Assumes Go sizeof(int) == sizeof(int32)
@ -290,20 +340,53 @@ func MemProfile(p Slice, include_inuse_zero bool) (n int, ok bool) {
 	runtime·lock(&proflock);
 	n = 0;
-	for(b=buckets; b; b=b->allnext)
+	for(b=mbuckets; b; b=b->allnext)
 		if(include_inuse_zero || b->alloc_bytes != b->free_bytes)
 			n++;
 	ok = false;
 	if(n <= p.len) {
 		ok = true;
 		r = (Record*)p.array;
-		for(b=buckets; b; b=b->allnext)
+		for(b=mbuckets; b; b=b->allnext)
 			if(include_inuse_zero || b->alloc_bytes != b->free_bytes)
 				record(r++, b);
 	}
 	runtime·unlock(&proflock);
 }
 // Must match BlockProfileRecord in debug.go.
 // BlockProfile casts the caller's []BlockProfileRecord backing array to
 // BRecord*, so field order and sizes here define the Go-visible layout.
 typedef struct BRecord BRecord;
 struct BRecord {
 	int64 count;	// number of blocking events (BlockProfileRecord.Count)
 	int64 cycles;	// total ticks blocked (BlockProfileRecord.Cycles)
 	uintptr stk[32];	// call stack, zero-padded; presumably mirrors the embedded StackRecord
 };
 // BlockProfile backs runtime.BlockProfile (declared in debug.go).
 // It sets n to the number of blocking-profile buckets.  When p has room
 // for all of them (n <= p.len) it copies one record per bucket into p and
 // sets ok to true; otherwise p is left untouched and ok is false.
 // The whole operation runs under proflock.
 func BlockProfile(p Slice) (n int, ok bool) {
 	Bucket *bucket;
 	BRecord *rec;
 	int32 i;
 	runtime·lock(&proflock);
 	// First pass: count the buckets so the caller learns the needed size.
 	n = 0;
 	for(bucket=bbuckets; bucket; bucket=bucket->allnext)
 		n++;
 	ok = n <= p.len;
 	if(ok) {
 		// Second pass: fill the caller's slice, one record per bucket.
 		rec = (BRecord*)p.array;
 		for(bucket=bbuckets; bucket; bucket=bucket->allnext) {
 			rec->count = bucket->count;
 			rec->cycles = bucket->cycles;
 			// Copy the recorded frames and zero the unused tail.
 			for(i=0; i<nelem(rec->stk); i++)
 				rec->stk[i] = i<bucket->nstk ? bucket->stk[i] : 0;
 			rec++;
 		}
 	}
 	runtime·unlock(&proflock);
 }
 // Must match StackRecord in debug.go.
 typedef struct TRecord TRecord;
 struct TRecord {
--- a/src/pkg/runtime/pprof/pprof.go
+++ b/src/pkg/runtime/pprof/pprof.go
@ -36,6 +36,7 @@ import (
 //	goroutine    - stack traces of all current goroutines
 //	heap         - a sampling of all heap allocations
 //	threadcreate - stack traces that led to the creation of new OS threads
 //	block        - stack traces that led to blocking on synchronization primitives
 //
 // These predefined profiles maintain themselves and panic on an explicit
 // Add or Remove method call.
@ -76,6 +77,12 @@ var heapProfile = &Profile{
 	write: writeHeap,
 }
 // blockProfile is the predefined "block" profile.  Its record count comes
 // from countBlock and its output is produced by writeBlock.
 var blockProfile = &Profile{
 	name:  "block",
 	count: countBlock,
 	write: writeBlock,
 }
 func lockProfiles() {
 	profiles.mu.Lock()
 	if profiles.m == nil {
@ -84,6 +91,7 @@ func lockProfiles() {
 			"goroutine":    goroutineProfile,
 			"threadcreate": threadcreateProfile,
 			"heap":         heapProfile,
 			"block":        blockProfile,
 		}
 	}
 }
@ -600,3 +608,60 @@ func StopCPUProfile() {
 	runtime.SetCPUProfileRate(0)
 	<-cpu.done
 }
 // byCycles orders blocking profile records by descending Cycles, so the
 // stacks that spent the most time blocked come first when sorted.
 type byCycles []runtime.BlockProfileRecord
 // Len reports the number of records.
 func (recs byCycles) Len() int {
 	return len(recs)
 }
 // Swap exchanges the records at positions i and j.
 func (recs byCycles) Swap(i, j int) {
 	recs[j], recs[i] = recs[i], recs[j]
 }
 // Less reports whether record i was blocked for more cycles than record j.
 func (recs byCycles) Less(i, j int) bool {
 	return recs[j].Cycles < recs[i].Cycles
 }
 // countBlock reports how many records the runtime's blocking profile
 // currently holds.  Passing a nil slice asks only for the count.
 func countBlock() int {
 	total, _ := runtime.BlockProfile(nil)
 	return total
 }
 // writeBlock writes the current blocking profile to w.
 //
 // Records are emitted in descending order of cycles spent blocked.  The
 // output starts with a "--- contention:" header and the cycles/second
 // calibration, followed by one "cycles count @ pc..." line per record.
 // With debug > 0 the output goes through a tabwriter and each record is
 // followed by its symbolized stack.  The returned error is that of the
 // final flush of the buffered writer.
 func writeBlock(w io.Writer, debug int) error {
 	// The profile can grow between the size query and the copy, so
 	// retry with 50 records of slack until everything fits.
 	n, _ := runtime.BlockProfile(nil)
 	var records []runtime.BlockProfileRecord
 	for {
 		records = make([]runtime.BlockProfileRecord, n+50)
 		var ok bool
 		n, ok = runtime.BlockProfile(records)
 		if ok {
 			records = records[:n]
 			break
 		}
 	}
 	sort.Sort(byCycles(records))
 	buf := bufio.NewWriter(w)
 	var out io.Writer = buf
 	var tw *tabwriter.Writer
 	if debug > 0 {
 		tw = tabwriter.NewWriter(buf, 1, 8, 1, '\t', 0)
 		out = tw
 	}
 	fmt.Fprintf(out, "--- contention:\n")
 	fmt.Fprintf(out, "cycles/second=%v\n", runtime_cyclesPerSecond())
 	for i := range records {
 		rec := &records[i]
 		stk := rec.Stack()
 		fmt.Fprintf(out, "%v %v @", rec.Cycles, rec.Count)
 		for _, pc := range stk {
 			fmt.Fprintf(out, " %#x", pc)
 		}
 		fmt.Fprint(out, "\n")
 		if debug > 0 {
 			printStackRecord(out, stk, false)
 		}
 	}
 	// The tabwriter sits on top of buf, so it must be drained first.
 	if tw != nil {
 		tw.Flush()
 	}
 	return buf.Flush()
 }
 // runtime_cyclesPerSecond reports the CPU tick frequency used to scale the
 // Cycles values in the blocking profile.  It has no Go body: the
 // implementation lives in the runtime (runtime.c), which returns
 // runtime·tickspersecond().
 func runtime_cyclesPerSecond() int64
--- a/src/pkg/runtime/runtime.c
+++ b/src/pkg/runtime/runtime.c
@ -358,3 +358,40 @@ runtime·fastrand1(void)
 	m->fastrand = x;
 	return x;
 }
 static Lock ticksLock;	// serializes the one-time calibration below
 static int64 ticks;	// cached ticks per second; 0 until calibrated
 // runtime·tickspersecond returns an estimate of how many runtime·cputicks
 // units elapse per second.  The first call calibrates by comparing
 // runtime·nanotime and runtime·cputicks across a runtime·usleep(100*1000)
 // and caches the result; later calls return the cached value without
 // taking the lock.  The result is never 0.
 int64
 runtime·tickspersecond(void)
 {
 	int64 res, t0, t1, c0, c1;
 	// Fast path: already calibrated.  Return the value that was loaded
 	// atomically rather than re-reading the global without synchronization.
 	res = (int64)runtime·atomicload64((uint64*)&ticks);
 	if(res != 0)
 		return res;
 	runtime·lock(&ticksLock);
 	// Re-check under the lock: another thread may have calibrated meanwhile.
 	res = ticks;
 	if(res == 0) {
 		t0 = runtime·nanotime();
 		c0 = runtime·cputicks();
 		runtime·usleep(100*1000);
 		t1 = runtime·nanotime();
 		c1 = runtime·cputicks();
 		// Avoid dividing by zero if the clock did not advance.
 		if(t1 == t0)
 			t1++;
 		res = (c1-c0)*1000*1000*1000/(t1-t0);
 		// 0 means "not calibrated yet", so never cache it.
 		if(res == 0)
 			res++;
 		runtime·atomicstore64((uint64*)&ticks, res);
 	}
 	runtime·unlock(&ticksLock);
 	return res;
 }
 // runtime∕pprof·runtime_cyclesPerSecond is the runtime-side body of
 // runtime/pprof's bodyless runtime_cyclesPerSecond declaration.
 // res is the Go result slot; it receives runtime·tickspersecond().
 void
 runtime∕pprof·runtime_cyclesPerSecond(int64 res)
 {
 	res = runtime·tickspersecond();
 	// NOTE(review): FLUSH presumably forces the store to the result slot
 	// so the compiler does not discard it — confirm against runtime.h.
 	FLUSH(&res);
 }
--- a/src/pkg/runtime/runtime.h
+++ b/src/pkg/runtime/runtime.h
@ -642,6 +642,9 @@ void	runtime·resetcpuprofiler(int32);
 void	runtime·setcpuprofilerate(void(*)(uintptr*, int32), int32);
 void	runtime·usleep(uint32);
 int64	runtime·cputicks(void);
 int64	runtime·tickspersecond(void);
 void	runtime·blockevent(int64, int32);
 extern int64 runtime·blockprofilerate;
 #pragma	varargck	argpos	runtime·printf	1
 #pragma	varargck	type	"d"	int32
--- a/src/pkg/runtime/sema.goc
+++ b/src/pkg/runtime/sema.goc
@ -24,20 +24,21 @@ package sync
 typedef struct Sema Sema;
 struct Sema
 {
-	uint32 volatile *addr;
+	uint32 volatile*	addr;
-	G *g;
+	G*	g;
-	Sema *prev;
+	int64	releasetime;
-	Sema *next;
+	Sema*	prev;
 	Sema*	next;
 };
 typedef struct SemaRoot SemaRoot;
 struct SemaRoot
 {
-        Lock;
+	Lock;
-	Sema *head;
+	Sema*	head;
-	Sema *tail;
+	Sema*	tail;
 	// Number of waiters. Read w/o the lock.
-	uint32 volatile nwait;
+	uint32 volatile	nwait;
 };
 // Prime to not correlate with any user patterns.
@ -97,12 +98,13 @@ cansemacquire(uint32 *addr)
 	return 0;
 }
-void
+static void
-runtime·semacquire(uint32 volatile *addr)
+semacquireimpl(uint32 volatile *addr, int32 profile)
 {
 	Sema s;	// Needs to be allocated on stack, otherwise garbage collector could deallocate it
 	SemaRoot *root;
-
+	int64 t0;
 	// Easy case.
 	if(cansemacquire(addr))
 		return;
@ -114,6 +116,12 @@ runtime·semacquire(uint32 volatile *addr)
 	//	sleep
 	//	(waiter descriptor is dequeued by signaler)
 	root = semroot(addr);
 	t0 = 0;
 	s.releasetime = 0;
 	if(profile && runtime·blockprofilerate > 0) {
 		t0 = runtime·cputicks();
 		s.releasetime = -1;
 	}
 	for(;;) {
 		runtime·lock(root);
 		// Add ourselves to nwait to disable "easy case" in semrelease.
@ -128,11 +136,20 @@ runtime·semacquire(uint32 volatile *addr)
 		// (we set nwait above), so go to sleep.
 		semqueue(root, addr, &s);
 		runtime·park(runtime·unlock, root, "semacquire");
-		if(cansemacquire(addr))
+		if(cansemacquire(addr)) {
 			if(t0)
 				runtime·blockevent(s.releasetime - t0, 3);
 			return;
 		}
 	}
 }
 // runtime·semacquire is the runtime-internal semaphore acquire.
 // It passes profile=0, so semacquireimpl never starts the blocking
 // profiler's timing for these waits; only the Go-visible
 // runtime_Semacquire entry point requests profiling.
 void
 runtime·semacquire(uint32 volatile *addr)
 {
 	semacquireimpl(addr, 0);
 }
 void
 runtime·semrelease(uint32 volatile *addr)
 {
@ -164,12 +181,15 @@ runtime·semrelease(uint32 volatile *addr)
 		}
 	}
 	runtime·unlock(root);
-	if(s)
+	if(s) {
 		if(s->releasetime)
 			s->releasetime = runtime·cputicks();
 		runtime·ready(s->g);
 	}
 }
 func runtime_Semacquire(addr *uint32) {
-	runtime·semacquire(addr);
+	semacquireimpl(addr, 1);
 }
 func runtime_Semrelease(addr *uint32) {
--- a/src/pkg/runtime/signal_linux_arm.c
+++ b/src/pkg/runtime/signal_linux_arm.c
@ -206,14 +206,8 @@ runtime·setup_auxv(int32 argc, void *argv_list)
 #pragma textflag 7
 int64
 runtime·cputicks() {
-	// copied from runtime.c:/^fastrand1
+	// Currently cputicks() is used in blocking profiler and to seed runtime·fastrand1().
-	uint32 x;
+	// runtime·nanotime() is a poor approximation of CPU ticks that is enough for the profiler.
-
+	// runtime·randomNumber provides better seeding of fastrand1.
-	x = runtime·randomNumber;
+	return runtime·nanotime() + runtime·randomNumber;
 	x += x;
 	if(x & 0x80000000L)
 		x ^= 0x88888eefUL;
 	runtime·randomNumber = x;
 	return ((int64)x) << 32 | x;
 }
--- a/src/pkg/testing/testing.go
+++ b/src/pkg/testing/testing.go
@ -102,14 +102,16 @@ var (
 	short = flag.Bool("test.short", false, "run smaller test suite to save time")
 	// Report as tests are run; default is silent for success.
-	chatty         = flag.Bool("test.v", false, "verbose: print additional output")
+	chatty           = flag.Bool("test.v", false, "verbose: print additional output")
-	match          = flag.String("test.run", "", "regular expression to select tests and examples to run")
+	match            = flag.String("test.run", "", "regular expression to select tests and examples to run")
-	memProfile     = flag.String("test.memprofile", "", "write a memory profile to the named file after execution")
+	memProfile       = flag.String("test.memprofile", "", "write a memory profile to the named file after execution")
-	memProfileRate = flag.Int("test.memprofilerate", 0, "if >=0, sets runtime.MemProfileRate")
+	memProfileRate   = flag.Int("test.memprofilerate", 0, "if >=0, sets runtime.MemProfileRate")
-	cpuProfile     = flag.String("test.cpuprofile", "", "write a cpu profile to the named file during execution")
+	cpuProfile       = flag.String("test.cpuprofile", "", "write a cpu profile to the named file during execution")
-	timeout        = flag.Duration("test.timeout", 0, "if positive, sets an aggregate time limit for all tests")
+	blockProfile     = flag.String("test.blockprofile", "", "write a goroutine blocking profile to the named file after execution")
-	cpuListStr     = flag.String("test.cpu", "", "comma-separated list of number of CPUs to use for each test")
+	blockProfileRate = flag.Int("test.blockprofilerate", 1, "if >= 0, calls runtime.SetBlockProfileRate()")
-	parallel       = flag.Int("test.parallel", runtime.GOMAXPROCS(0), "maximum test parallelism")
+	timeout          = flag.Duration("test.timeout", 0, "if positive, sets an aggregate time limit for all tests")
 	cpuListStr       = flag.String("test.cpu", "", "comma-separated list of number of CPUs to use for each test")
 	parallel         = flag.Int("test.parallel", runtime.GOMAXPROCS(0), "maximum test parallelism")
 	haveExamples bool // are there examples?
@ -420,7 +422,9 @@ func before() {
 		}
 		// Could save f so after can call f.Close; not worth the effort.
 	}
-
+	if *blockProfile != "" && *blockProfileRate >= 0 {
 		runtime.SetBlockProfileRate(*blockProfileRate)
 	}
 }
 // after runs after all testing.
@ -439,6 +443,17 @@ func after() {
 		}
 		f.Close()
 	}
 	if *blockProfile != "" && *blockProfileRate >= 0 {
 		f, err := os.Create(*blockProfile)
 		if err != nil {
 			fmt.Fprintf(os.Stderr, "testing: %s", err)
 			return
 		}
 		if err = pprof.Lookup("block").WriteTo(f, 0); err != nil {
 			fmt.Fprintf(os.Stderr, "testing: can't write %s: %s", *blockProfile, err)
 		}
 		f.Close()
 	}
 }
 var timer *time.Timer