pprof: improve sampling for heap profiling
The current heap sampling introduces some bias that interferes with
unsampling, producing unexpected heap profiles. The solution is to use
a Poisson process to generate the sampling points, using the formulas
described at https://en.wikipedia.org/wiki/Poisson_process

This fixes #12620

Change-Id: If2400809ed3c41de504dd6cff06be14e476ff96c
Reviewed-on: https://go-review.googlesource.com/14590
Reviewed-by: Keith Randall <khr@golang.org>
Reviewed-by: Minux Ma <minux@golang.org>
Run-TryBot: Minux Ma <minux@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Parent: 0357c38adf
Commit: 27ee719fb3
9 changed files with 361 additions and 28 deletions
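In outline, the old sampler picked the next sampling point uniformly in [0, 2*MemProfileRate), which biases the gaps between samples; the new one draws each gap from an exponential distribution with mean MemProfileRate, so the sampling points form a Poisson process over the allocated bytes. A minimal standalone sketch of that idea (illustrative only, not runtime code: it uses math.Log and math/rand where the runtime uses fastlog2 and fastrand1, and the rate constant is just an example value):

package main

import (
	"fmt"
	"math"
	"math/rand"
)

// nextGap draws the number of bytes until the next sample from an
// exponential distribution with the given mean, via the inverse CDF.
func nextGap(mean float64) float64 {
	u := rand.Float64() // uniform in [0, 1)
	return -math.Log(1-u) * mean
}

func main() {
	const rate = 512 * 1024 // example sampling rate in bytes
	const n = 100000
	var sum float64
	for i := 0; i < n; i++ {
		sum += nextGap(rate)
	}
	// The average gap should come out close to the rate, i.e. one
	// sample per "rate" allocated bytes on average.
	fmt.Printf("mean gap ~ %.0f bytes (want ~ %d)\n", sum/n, rate)
}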

@@ -26,6 +26,8 @@ var Xadduintptr = xadduintptr
 
 var FuncPC = funcPC
 
+var Fastlog2 = fastlog2
+
 type LFNode struct {
 	Next    uint64
 	Pushcnt uintptr

src/runtime/fastlog2.go (new file, 33 lines)
@@ -0,0 +1,33 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package runtime

import "unsafe"

// fastlog2 implements a fast approximation to the base 2 log of a
// float64. This is used to compute a geometric distribution for heap
// sampling, without introducing dependences into package math. This
// uses a very rough approximation using the float64 exponent and the
// first 25 bits of the mantissa. The top 5 bits of the mantissa are
// used to load limits from a table of constants and the rest are used
// to scale linearly between them.
func fastlog2(x float64) float64 {
	const fastlogScaleBits = 20
	const fastlogScaleRatio = 1.0 / (1 << fastlogScaleBits)

	xBits := float64bits(x)
	// Extract the exponent from the IEEE float64, and index a constant
	// table with the first 10 bits from the mantissa.
	xExp := int64((xBits>>52)&0x7FF) - 1023
	xManIndex := (xBits >> (52 - fastlogNumBits)) % (1 << fastlogNumBits)
	xManScale := (xBits >> (52 - fastlogNumBits - fastlogScaleBits)) % (1 << fastlogScaleBits)

	low, high := fastlog2Table[xManIndex], fastlog2Table[xManIndex+1]
	return float64(xExp) + low + (high-low)*float64(xManScale)*fastlogScaleRatio
}

// float64bits returns the IEEE 754 binary representation of f.
// Taken from math.Float64bits to avoid dependences into package math.
func float64bits(f float64) uint64 { return *(*uint64)(unsafe.Pointer(&f)) }
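As a standalone illustration of the decomposition described in the comment above (not part of the commit; it substitutes math.Float64bits and a locally computed table for the runtime's hand-written helpers), the approximation is exponent plus table lookup plus linear interpolation:

package main

import (
	"fmt"
	"math"
)

const numBits = 5 // index the table with the top 5 mantissa bits

var table [1<<numBits + 1]float64

func init() {
	for i := range table {
		table[i] = math.Log2(1 + float64(i)/(1<<numBits))
	}
}

// approxLog2 approximates log2 for positive, normal inputs:
// log2(x) = exponent + log2(1 + mantissa), with the fractional part
// looked up in a small table and linearly interpolated.
func approxLog2(x float64) float64 {
	bits := math.Float64bits(x)
	exp := int64((bits>>52)&0x7FF) - 1023
	idx := (bits >> (52 - numBits)) % (1 << numBits)
	// Use the next 20 mantissa bits to interpolate between table entries.
	const scaleBits = 20
	frac := float64((bits>>(52-numBits-scaleBits))%(1<<scaleBits)) / (1 << scaleBits)
	lo, hi := table[idx], table[idx+1]
	return float64(exp) + lo + (hi-lo)*frac
}

func main() {
	for _, x := range []float64{1.5, 10, 123456, 1 << 20} {
		fmt.Printf("x=%-8g approx=%.6f math.Log2=%.6f\n", x, approxLog2(x), math.Log2(x))
	}
}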

src/runtime/fastlog2_test.go (new file, 28 lines)
@@ -0,0 +1,28 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package runtime_test

import (
	"math"
	"runtime"
	"testing"
)

func TestFastLog2(t *testing.T) {
	// Compute the euclidean distance between math.Log2 and the FastLog2
	// implementation over the range of interest for heap sampling.
	const randomBitCount = 26
	var e float64
	for i := 1; i < 1<<randomBitCount; i++ {
		l, fl := math.Log2(float64(i)), runtime.Fastlog2(float64(i))
		d := l - fl
		e += d * d
	}
	e = math.Sqrt(e)

	if e > 1.0 {
		t.Fatalf("imprecision on fastlog2 implementation, want <=1.0, got %f", e)
	}
}

src/runtime/fastlog2table.go (new file, 43 lines)
@@ -0,0 +1,43 @@
// AUTO-GENERATED by mkfastlog2table.go
// Run go generate from src/runtime to update.
// See mkfastlog2table.go for comments.

package runtime

const fastlogNumBits = 5

var fastlog2Table = [1<<fastlogNumBits + 1]float64{
	0,
	0.0443941193584535,
	0.08746284125033943,
	0.12928301694496647,
	0.16992500144231248,
	0.2094533656289499,
	0.24792751344358555,
	0.28540221886224837,
	0.3219280948873623,
	0.3575520046180837,
	0.39231742277876036,
	0.4262647547020979,
	0.4594316186372973,
	0.4918530963296748,
	0.5235619560570128,
	0.5545888516776374,
	0.5849625007211563,
	0.6147098441152082,
	0.6438561897747247,
	0.6724253419714956,
	0.7004397181410922,
	0.7279204545631992,
	0.7548875021634686,
	0.7813597135246596,
	0.8073549220576042,
	0.8328900141647417,
	0.8579809951275721,
	0.8826430493618412,
	0.9068905956085185,
	0.9307373375628862,
	0.9541963103868752,
	0.9772799234999164,
	1,
}

@@ -792,26 +792,43 @@ func rawmem(size uintptr) unsafe.Pointer {
 }
 
 func profilealloc(mp *m, x unsafe.Pointer, size uintptr) {
-	c := mp.mcache
-	rate := MemProfileRate
-	if size < uintptr(rate) {
-		// pick next profile time
-		// If you change this, also change allocmcache.
-		if rate > 0x3fffffff { // make 2*rate not overflow
-			rate = 0x3fffffff
-		}
-		next := int32(fastrand1()) % (2 * int32(rate))
-		// Subtract the "remainder" of the current allocation.
-		// Otherwise objects that are close in size to sampling rate
-		// will be under-sampled, because we consistently discard this remainder.
-		next -= (int32(size) - c.next_sample)
-		if next < 0 {
-			next = 0
-		}
-		c.next_sample = next
-	}
-
-	mProf_Malloc(x, size)
+	mp.mcache.next_sample = nextSample()
+	mProf_Malloc(x, size)
+}
+
+// nextSample returns the next sampling point for heap profiling.
+// It produces a random variable with a geometric distribution and
+// mean MemProfileRate. This is done by generating a uniformly
+// distributed random number and applying the cumulative distribution
+// function for an exponential.
+func nextSample() int32 {
+	period := MemProfileRate
+
+	// make nextSample not overflow. Maximum possible step is
+	// -ln(1/(1<<kRandomBitCount)) * period, approximately 20 * period.
+	switch {
+	case period > 0x7000000:
+		period = 0x7000000
+	case period == 0:
+		return 0
+	}
+
+	// Let m be the sample rate,
+	// the probability distribution function is m*exp(-mx), so the CDF is
+	// p = 1 - exp(-mx), so
+	// q = 1 - p == exp(-mx)
+	// log_e(q) = -mx
+	// -log_e(q)/m = x
+	// x = -log_e(q) * period
+	// x = log_2(q) * (-log_e(2)) * period    ; Using log_2 for efficiency
+	const randomBitCount = 26
+	q := uint32(fastrand1())%(1<<randomBitCount) + 1
+	qlog := fastlog2(float64(q)) - randomBitCount
+	if qlog > 0 {
+		qlog = 0
+	}
+	const minusLog2 = -0.6931471805599453 // -ln(2)
+	return int32(qlog*(minusLog2*float64(period))) + 1
 }
 
 type persistentAlloc struct {
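The inverse-CDF derivation in the comments above can be sanity-checked outside the runtime. A small simulation (illustrative only; math.Log2 and math/rand stand in for the runtime's fastlog2 and fastrand1) shows that the mean gap between samples comes out close to the period:

package main

import (
	"fmt"
	"math"
	"math/rand"
)

// nextSampleSim mirrors the nextSample formula using portable stand-ins.
func nextSampleSim(period int32) int32 {
	const randomBitCount = 26
	q := rand.Uint32()%(1<<randomBitCount) + 1
	qlog := math.Log2(float64(q)) - randomBitCount
	if qlog > 0 {
		qlog = 0
	}
	const minusLog2 = -0.6931471805599453 // -ln(2)
	return int32(qlog*(minusLog2*float64(period))) + 1
}

func main() {
	const period = 512 * 1024 // the default MemProfileRate
	const n = 1000000
	var sum float64
	for i := 0; i < n; i++ {
		sum += float64(nextSampleSim(period))
	}
	// Exponential gaps with mean "period": the average should be close
	// to 512 KiB, so allocations are sampled about once per period bytes.
	fmt.Printf("mean sampling gap ~ %.0f bytes (period = %d)\n", sum/n, period)
}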

@@ -69,16 +69,7 @@ func allocmcache() *mcache {
 	for i := 0; i < _NumSizeClasses; i++ {
 		c.alloc[i] = &emptymspan
 	}
 
-	// Set first allocation sample size.
-	rate := MemProfileRate
-	if rate > 0x3fffffff { // make 2*rate not overflow
-		rate = 0x3fffffff
-	}
-	if rate != 0 {
-		c.next_sample = int32(int(fastrand1()) % (2 * rate))
-	}
-
+	c.next_sample = nextSample()
 	return c
 }

src/runtime/mkfastlog2table.go (new file, 52 lines)
@@ -0,0 +1,52 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build ignore

// fastlog2Table contains log2 approximations for 5 binary digits.
// This is used to implement fastlog2, which is used for heap sampling.

package main

import (
	"bytes"
	"fmt"
	"io/ioutil"
	"log"
	"math"
)

func main() {
	var buf bytes.Buffer

	fmt.Fprintln(&buf, "// AUTO-GENERATED by mkfastlog2table.go")
	fmt.Fprintln(&buf, "// Run go generate from src/runtime to update.")
	fmt.Fprintln(&buf, "// See mkfastlog2table.go for comments.")
	fmt.Fprintln(&buf)
	fmt.Fprintln(&buf, "package runtime")
	fmt.Fprintln(&buf)
	fmt.Fprintln(&buf, "const fastlogNumBits =", fastlogNumBits)
	fmt.Fprintln(&buf)

	fmt.Fprintln(&buf, "var fastlog2Table = [1<<fastlogNumBits + 1]float64{")
	table := computeTable()
	for _, t := range table {
		fmt.Fprintf(&buf, "\t%v,\n", t)
	}
	fmt.Fprintln(&buf, "}")

	if err := ioutil.WriteFile("fastlog2table.go", buf.Bytes(), 0644); err != nil {
		log.Fatalln(err)
	}
}

const fastlogNumBits = 5

func computeTable() []float64 {
	fastlog2Table := make([]float64, 1<<fastlogNumBits+1)
	for i := 0; i <= (1 << fastlogNumBits); i++ {
		fastlog2Table[i] = math.Log2(1.0 + float64(i)/(1<<fastlogNumBits))
	}
	return fastlog2Table
}

@@ -8,6 +8,7 @@ import _ "unsafe" // for go:linkname
 
 //go:generate go run wincallback.go
 //go:generate go run mkduff.go
+//go:generate go run mkfastlog2table.go
 
 var ticks struct {
 	lock mutex

test/heapsampling.go (new file, 166 lines)
@@ -0,0 +1,166 @@
// run

// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Test heap sampling logic.

package main

import (
	"fmt"
	"math"
	"runtime"
)

var a16 *[16]byte
var a512 *[512]byte
var a256 *[256]byte
var a1k *[1024]byte
var a64k *[64 * 1024]byte

// This test checks that heap sampling produces reasonable
// results. Note that heap sampling uses randomization, so the results
// vary for run to run. This test only checks that the resulting
// values appear reasonable.
func main() {
	const countInterleaved = 10000
	allocInterleaved(countInterleaved)
	checkAllocations(getMemProfileRecords(), "main.allocInterleaved", countInterleaved, []int64{256 * 1024, 1024, 256 * 1024, 512, 256 * 1024, 256})

	const count = 100000
	alloc(count)
	checkAllocations(getMemProfileRecords(), "main.alloc", count, []int64{1024, 512, 256})
}

// allocInterleaved stress-tests the heap sampling logic by
// interleaving large and small allocations.
func allocInterleaved(n int) {
	for i := 0; i < n; i++ {
		// Test verification depends on these lines being contiguous.
		a64k = new([64 * 1024]byte)
		a1k = new([1024]byte)
		a64k = new([64 * 1024]byte)
		a512 = new([512]byte)
		a64k = new([64 * 1024]byte)
		a256 = new([256]byte)
	}
}

// alloc performs only small allocations for sanity testing.
func alloc(n int) {
	for i := 0; i < n; i++ {
		// Test verification depends on these lines being contiguous.
		a1k = new([1024]byte)
		a512 = new([512]byte)
		a256 = new([256]byte)
	}
}

// checkAllocations validates that the profile records collected for
// the named function are consistent with count contiguous allocations
// of the specified sizes.
func checkAllocations(records []runtime.MemProfileRecord, fname string, count int64, size []int64) {
	a := allocObjects(records, fname)
	firstLine := 0
	for ln := range a {
		if firstLine == 0 || firstLine > ln {
			firstLine = ln
		}
	}
	var totalcount int64
	for i, w := range size {
		ln := firstLine + i
		s := a[ln]
		checkValue(fname, ln, "objects", count, s.objects)
		checkValue(fname, ln, "bytes", count*w, s.bytes)
		totalcount += s.objects
	}
	// Check the total number of allocations, to ensure some sampling occurred.
	if totalwant := count * int64(len(size)); totalcount <= 0 || totalcount > totalwant*1024 {
		panic(fmt.Sprintf("%s want total count > 0 && <= %d, got %d", fname, totalwant*1024, totalcount))
	}
}

// checkValue checks an unsampled value against a range.
func checkValue(fname string, ln int, name string, want, got int64) {
	if got < 0 || got > 1024*want {
		panic(fmt.Sprintf("%s:%d want %s >= 0 && <= %d, got %d", fname, ln, name, 1024*want, got))
	}
}

func getMemProfileRecords() []runtime.MemProfileRecord {
	// Find out how many records there are (MemProfile(nil, true)),
	// allocate that many records, and get the data.
	// There's a race—more records might be added between
	// the two calls—so allocate a few extra records for safety
	// and also try again if we're very unlucky.
	// The loop should only execute one iteration in the common case.
	var p []runtime.MemProfileRecord
	n, ok := runtime.MemProfile(nil, true)
	for {
		// Allocate room for a slightly bigger profile,
		// in case a few more entries have been added
		// since the call to MemProfile.
		p = make([]runtime.MemProfileRecord, n+50)
		n, ok = runtime.MemProfile(p, true)
		if ok {
			p = p[0:n]
			break
		}
		// Profile grew; try again.
	}
	return p
}

type allocStat struct {
	bytes, objects int64
}

// allocObjects examines the profile records for the named function
// and returns the allocation stats aggregated by source line number.
func allocObjects(records []runtime.MemProfileRecord, function string) map[int]allocStat {
	a := make(map[int]allocStat)
	for _, r := range records {
		for _, s := range r.Stack0 {
			if s == 0 {
				break
			}
			if f := runtime.FuncForPC(s); f != nil {
				name := f.Name()
				_, line := f.FileLine(s)
				if name == function {
					allocStat := a[line]
					allocStat.bytes += r.AllocBytes
					allocStat.objects += r.AllocObjects
					a[line] = allocStat
				}
			}
		}
	}
	for line, stats := range a {
		objects, bytes := scaleHeapSample(stats.objects, stats.bytes, int64(runtime.MemProfileRate))
		a[line] = allocStat{bytes, objects}
	}
	return a
}

// scaleHeapSample unsamples heap allocations.
// Taken from src/cmd/pprof/internal/profile/legacy_profile.go
func scaleHeapSample(count, size, rate int64) (int64, int64) {
	if count == 0 || size == 0 {
		return 0, 0
	}

	if rate <= 1 {
		// if rate==1 all samples were collected so no adjustment is needed.
		// if rate<1 treat as unknown and skip scaling.
		return count, size
	}

	avgSize := float64(size) / float64(count)
	scale := 1 / (1 - math.Exp(-avgSize/float64(rate)))

	return int64(float64(count) * scale), int64(float64(size) * scale)
}
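The scale factor in scaleHeapSample follows from the sampling model: at a rate of R bytes, an allocation of average size s is caught with probability 1 - exp(-s/R), so dividing the sampled totals by that probability estimates the true totals. A small worked example with hypothetical numbers (80 sampled 64 KiB objects at the default 512 KiB rate unsample to roughly 680 objects):

package main

import (
	"fmt"
	"math"
)

// scaleHeapSample unsamples a (count, bytes) pair collected at the given
// sampling rate, as in the test above.
func scaleHeapSample(count, size, rate int64) (int64, int64) {
	if count == 0 || size == 0 {
		return 0, 0
	}
	if rate <= 1 {
		return count, size
	}
	avgSize := float64(size) / float64(count)
	scale := 1 / (1 - math.Exp(-avgSize/float64(rate)))
	return int64(float64(count) * scale), int64(float64(size) * scale)
}

func main() {
	// Hypothetical profile: 80 sampled 64 KiB objects at a 512 KiB rate.
	// Each object is sampled with probability 1-exp(-64/512) ~ 0.118,
	// so ~80 samples imply roughly 680 real objects.
	count, bytes := scaleHeapSample(80, 80*64*1024, 512*1024)
	fmt.Println(count, bytes)
}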