pprof: improve sampling for heap profiling
The current heap sampling introduces some bias that interferes with
unsampling, producing unexpected heap profiles. The solution is to use
a Poisson process to generate the sampling points, using the formulas
described at https://en.wikipedia.org/wiki/Poisson_process

This fixes #12620

Change-Id: If2400809ed3c41de504dd6cff06be14e476ff96c
Reviewed-on: https://go-review.googlesource.com/14590
Reviewed-by: Keith Randall <khr@golang.org>
Reviewed-by: Minux Ma <minux@golang.org>
Run-TryBot: Minux Ma <minux@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Parent: 0357c38adf
Commit: 27ee719fb3
9 changed files with 361 additions and 28 deletions
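In outline, the old sampler picked the next sampling point uniformly in [0, 2*MemProfileRate), which biases the gaps between samples; the new one draws each gap from an exponential distribution with mean MemProfileRate, so the sampling points form a Poisson process over the allocated bytes. A minimal standalone sketch of that idea (illustrative only, not runtime code: it uses math.Log and math/rand where the runtime uses fastlog2 and fastrand1, and the rate constant is just an example value):

package main

import (
	"fmt"
	"math"
	"math/rand"
)

// nextGap draws the number of bytes until the next sample from an
// exponential distribution with the given mean, via the inverse CDF.
func nextGap(mean float64) float64 {
	u := rand.Float64() // uniform in [0, 1)
	return -math.Log(1-u) * mean
}

func main() {
	const rate = 512 * 1024 // example sampling rate in bytes
	const n = 100000
	var sum float64
	for i := 0; i < n; i++ {
		sum += nextGap(rate)
	}
	// The average gap should come out close to the rate, i.e. one
	// sample per "rate" allocated bytes on average.
	fmt.Printf("mean gap ~ %.0f bytes (want ~ %d)\n", sum/n, rate)
}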

@@ -26,6 +26,8 @@ var Xadduintptr = xadduintptr
 
 var FuncPC = funcPC
 
+var Fastlog2 = fastlog2
+
 type LFNode struct {
 	Next    uint64
 	Pushcnt uintptr

src/runtime/fastlog2.go (new file, 33 lines)
@@ -0,0 +1,33 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package runtime

import "unsafe"

// fastlog2 implements a fast approximation to the base 2 log of a
// float64. This is used to compute a geometric distribution for heap
// sampling, without introducing dependences into package math. This
// uses a very rough approximation using the float64 exponent and the
// first 25 bits of the mantissa. The top 5 bits of the mantissa are
// used to load limits from a table of constants and the rest are used
// to scale linearly between them.
func fastlog2(x float64) float64 {
	const fastlogScaleBits = 20
	const fastlogScaleRatio = 1.0 / (1 << fastlogScaleBits)

	xBits := float64bits(x)
	// Extract the exponent from the IEEE float64, and index a constant
	// table with the first 10 bits from the mantissa.
	xExp := int64((xBits>>52)&0x7FF) - 1023
	xManIndex := (xBits >> (52 - fastlogNumBits)) % (1 << fastlogNumBits)
	xManScale := (xBits >> (52 - fastlogNumBits - fastlogScaleBits)) % (1 << fastlogScaleBits)

	low, high := fastlog2Table[xManIndex], fastlog2Table[xManIndex+1]
	return float64(xExp) + low + (high-low)*float64(xManScale)*fastlogScaleRatio
}

// float64bits returns the IEEE 754 binary representation of f.
// Taken from math.Float64bits to avoid dependences into package math.
func float64bits(f float64) uint64 { return *(*uint64)(unsafe.Pointer(&f)) }
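As a standalone illustration of the decomposition described in the comment above (not part of the commit; it substitutes math.Float64bits and a locally computed table for the runtime's hand-written helpers), the approximation is exponent plus table lookup plus linear interpolation:

package main

import (
	"fmt"
	"math"
)

const numBits = 5 // index the table with the top 5 mantissa bits

var table [1<<numBits + 1]float64

func init() {
	for i := range table {
		table[i] = math.Log2(1 + float64(i)/(1<<numBits))
	}
}

// approxLog2 approximates log2 for positive, normal inputs:
// log2(x) = exponent + log2(1 + mantissa), with the fractional part
// looked up in a small table and linearly interpolated.
func approxLog2(x float64) float64 {
	bits := math.Float64bits(x)
	exp := int64((bits>>52)&0x7FF) - 1023
	idx := (bits >> (52 - numBits)) % (1 << numBits)
	// Use the next 20 mantissa bits to interpolate between table entries.
	const scaleBits = 20
	frac := float64((bits>>(52-numBits-scaleBits))%(1<<scaleBits)) / (1 << scaleBits)
	lo, hi := table[idx], table[idx+1]
	return float64(exp) + lo + (hi-lo)*frac
}

func main() {
	for _, x := range []float64{1.5, 10, 123456, 1 << 20} {
		fmt.Printf("x=%-8g approx=%.6f math.Log2=%.6f\n", x, approxLog2(x), math.Log2(x))
	}
}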

src/runtime/fastlog2_test.go (new file, 28 lines)
@@ -0,0 +1,28 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package runtime_test

import (
	"math"
	"runtime"
	"testing"
)

func TestFastLog2(t *testing.T) {
	// Compute the euclidean distance between math.Log2 and the FastLog2
	// implementation over the range of interest for heap sampling.
	const randomBitCount = 26
	var e float64
	for i := 1; i < 1<<randomBitCount; i++ {
		l, fl := math.Log2(float64(i)), runtime.Fastlog2(float64(i))
		d := l - fl
		e += d * d
	}
	e = math.Sqrt(e)

	if e > 1.0 {
		t.Fatalf("imprecision on fastlog2 implementation, want <=1.0, got %f", e)
	}
}

src/runtime/fastlog2table.go (new file, 43 lines)
@@ -0,0 +1,43 @@
// AUTO-GENERATED by mkfastlog2table.go
// Run go generate from src/runtime to update.
// See mkfastlog2table.go for comments.

package runtime

const fastlogNumBits = 5

var fastlog2Table = [1<<fastlogNumBits + 1]float64{
	0,
	0.0443941193584535,
	0.08746284125033943,
	0.12928301694496647,
	0.16992500144231248,
	0.2094533656289499,
	0.24792751344358555,
	0.28540221886224837,
	0.3219280948873623,
	0.3575520046180837,
	0.39231742277876036,
	0.4262647547020979,
	0.4594316186372973,
	0.4918530963296748,
	0.5235619560570128,
	0.5545888516776374,
	0.5849625007211563,
	0.6147098441152082,
	0.6438561897747247,
	0.6724253419714956,
	0.7004397181410922,
	0.7279204545631992,
	0.7548875021634686,
	0.7813597135246596,
	0.8073549220576042,
	0.8328900141647417,
	0.8579809951275721,
	0.8826430493618412,
	0.9068905956085185,
	0.9307373375628862,
	0.9541963103868752,
	0.9772799234999164,
	1,
}

@@ -792,26 +792,43 @@ func rawmem(size uintptr) unsafe.Pointer {
 }
 
 func profilealloc(mp *m, x unsafe.Pointer, size uintptr) {
-	c := mp.mcache
-	rate := MemProfileRate
-	if size < uintptr(rate) {
-		// pick next profile time
-		// If you change this, also change allocmcache.
-		if rate > 0x3fffffff { // make 2*rate not overflow
-			rate = 0x3fffffff
-		}
-		next := int32(fastrand1()) % (2 * int32(rate))
-		// Subtract the "remainder" of the current allocation.
-		// Otherwise objects that are close in size to sampling rate
-		// will be under-sampled, because we consistently discard this remainder.
-		next -= (int32(size) - c.next_sample)
-		if next < 0 {
-			next = 0
-		}
-		c.next_sample = next
-	}
-
-	mProf_Malloc(x, size)
+	mp.mcache.next_sample = nextSample()
+	mProf_Malloc(x, size)
+}
+
+// nextSample returns the next sampling point for heap profiling.
+// It produces a random variable with a geometric distribution and
+// mean MemProfileRate. This is done by generating a uniformly
+// distributed random number and applying the cumulative distribution
+// function for an exponential.
+func nextSample() int32 {
+	period := MemProfileRate
+
+	// make nextSample not overflow. Maximum possible step is
+	// -ln(1/(1<<kRandomBitCount)) * period, approximately 20 * period.
+	switch {
+	case period > 0x7000000:
+		period = 0x7000000
+	case period == 0:
+		return 0
+	}
+
+	// Let m be the sample rate,
+	// the probability distribution function is m*exp(-mx), so the CDF is
+	// p = 1 - exp(-mx), so
+	// q = 1 - p == exp(-mx)
+	// log_e(q) = -mx
+	// -log_e(q)/m = x
+	// x = -log_e(q) * period
+	// x = log_2(q) * (-log_e(2)) * period    ; Using log_2 for efficiency
+	const randomBitCount = 26
+	q := uint32(fastrand1())%(1<<randomBitCount) + 1
+	qlog := fastlog2(float64(q)) - randomBitCount
+	if qlog > 0 {
+		qlog = 0
+	}
+	const minusLog2 = -0.6931471805599453 // -ln(2)
+	return int32(qlog*(minusLog2*float64(period))) + 1
 }
 
 type persistentAlloc struct {
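The inverse-CDF derivation in the comments above can be sanity-checked outside the runtime. A small simulation (illustrative only; math.Log2 and math/rand stand in for the runtime's fastlog2 and fastrand1) shows that the mean gap between samples comes out close to the period:

package main

import (
	"fmt"
	"math"
	"math/rand"
)

// nextSampleSim mirrors the nextSample formula using portable stand-ins.
func nextSampleSim(period int32) int32 {
	const randomBitCount = 26
	q := rand.Uint32()%(1<<randomBitCount) + 1
	qlog := math.Log2(float64(q)) - randomBitCount
	if qlog > 0 {
		qlog = 0
	}
	const minusLog2 = -0.6931471805599453 // -ln(2)
	return int32(qlog*(minusLog2*float64(period))) + 1
}

func main() {
	const period = 512 * 1024 // the default MemProfileRate
	const n = 1000000
	var sum float64
	for i := 0; i < n; i++ {
		sum += float64(nextSampleSim(period))
	}
	// Exponential gaps with mean "period": the average should be close
	// to 512 KiB, so allocations are sampled about once per period bytes.
	fmt.Printf("mean sampling gap ~ %.0f bytes (period = %d)\n", sum/n, period)
}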

@@ -69,16 +69,7 @@ func allocmcache() *mcache {
 	for i := 0; i < _NumSizeClasses; i++ {
 		c.alloc[i] = &emptymspan
 	}
 
-	// Set first allocation sample size.
-	rate := MemProfileRate
-	if rate > 0x3fffffff { // make 2*rate not overflow
-		rate = 0x3fffffff
-	}
-	if rate != 0 {
-		c.next_sample = int32(int(fastrand1()) % (2 * rate))
-	}
-
+	c.next_sample = nextSample()
 	return c
 }

src/runtime/mkfastlog2table.go (new file, 52 lines)
@@ -0,0 +1,52 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// +build ignore

// fastlog2Table contains log2 approximations for 5 binary digits.
// This is used to implement fastlog2, which is used for heap sampling.

package main

import (
	"bytes"
	"fmt"
	"io/ioutil"
	"log"
	"math"
)

func main() {
	var buf bytes.Buffer

	fmt.Fprintln(&buf, "// AUTO-GENERATED by mkfastlog2table.go")
	fmt.Fprintln(&buf, "// Run go generate from src/runtime to update.")
	fmt.Fprintln(&buf, "// See mkfastlog2table.go for comments.")
	fmt.Fprintln(&buf)
	fmt.Fprintln(&buf, "package runtime")
	fmt.Fprintln(&buf)
	fmt.Fprintln(&buf, "const fastlogNumBits =", fastlogNumBits)
	fmt.Fprintln(&buf)

	fmt.Fprintln(&buf, "var fastlog2Table = [1<<fastlogNumBits + 1]float64{")
	table := computeTable()
	for _, t := range table {
		fmt.Fprintf(&buf, "\t%v,\n", t)
	}
	fmt.Fprintln(&buf, "}")

	if err := ioutil.WriteFile("fastlog2table.go", buf.Bytes(), 0644); err != nil {
		log.Fatalln(err)
	}
}

const fastlogNumBits = 5

func computeTable() []float64 {
	fastlog2Table := make([]float64, 1<<fastlogNumBits+1)
	for i := 0; i <= (1 << fastlogNumBits); i++ {
		fastlog2Table[i] = math.Log2(1.0 + float64(i)/(1<<fastlogNumBits))
	}
	return fastlog2Table
}

@@ -8,6 +8,7 @@ import _ "unsafe" // for go:linkname
 
 //go:generate go run wincallback.go
 //go:generate go run mkduff.go
+//go:generate go run mkfastlog2table.go
 
 var ticks struct {
 	lock mutex

test/heapsampling.go (new file, 166 lines)
@@ -0,0 +1,166 @@
// run

// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Test heap sampling logic.

package main

import (
	"fmt"
	"math"
	"runtime"
)

var a16 *[16]byte
var a512 *[512]byte
var a256 *[256]byte
var a1k *[1024]byte
var a64k *[64 * 1024]byte

// This test checks that heap sampling produces reasonable
// results. Note that heap sampling uses randomization, so the results
// vary for run to run. This test only checks that the resulting
// values appear reasonable.
func main() {
	const countInterleaved = 10000
	allocInterleaved(countInterleaved)
	checkAllocations(getMemProfileRecords(), "main.allocInterleaved", countInterleaved, []int64{256 * 1024, 1024, 256 * 1024, 512, 256 * 1024, 256})

	const count = 100000
	alloc(count)
	checkAllocations(getMemProfileRecords(), "main.alloc", count, []int64{1024, 512, 256})
}

// allocInterleaved stress-tests the heap sampling logic by
// interleaving large and small allocations.
func allocInterleaved(n int) {
	for i := 0; i < n; i++ {
		// Test verification depends on these lines being contiguous.
		a64k = new([64 * 1024]byte)
		a1k = new([1024]byte)
		a64k = new([64 * 1024]byte)
		a512 = new([512]byte)
		a64k = new([64 * 1024]byte)
		a256 = new([256]byte)
	}
}

// alloc performs only small allocations for sanity testing.
func alloc(n int) {
	for i := 0; i < n; i++ {
		// Test verification depends on these lines being contiguous.
		a1k = new([1024]byte)
		a512 = new([512]byte)
		a256 = new([256]byte)
	}
}

// checkAllocations validates that the profile records collected for
// the named function are consistent with count contiguous allocations
// of the specified sizes.
func checkAllocations(records []runtime.MemProfileRecord, fname string, count int64, size []int64) {
	a := allocObjects(records, fname)
	firstLine := 0
	for ln := range a {
		if firstLine == 0 || firstLine > ln {
			firstLine = ln
		}
	}
	var totalcount int64
	for i, w := range size {
		ln := firstLine + i
		s := a[ln]
		checkValue(fname, ln, "objects", count, s.objects)
		checkValue(fname, ln, "bytes", count*w, s.bytes)
		totalcount += s.objects
	}
	// Check the total number of allocations, to ensure some sampling occurred.
	if totalwant := count * int64(len(size)); totalcount <= 0 || totalcount > totalwant*1024 {
		panic(fmt.Sprintf("%s want total count > 0 && <= %d, got %d", fname, totalwant*1024, totalcount))
	}
}

// checkValue checks an unsampled value against a range.
func checkValue(fname string, ln int, name string, want, got int64) {
	if got < 0 || got > 1024*want {
		panic(fmt.Sprintf("%s:%d want %s >= 0 && <= %d, got %d", fname, ln, name, 1024*want, got))
	}
}

func getMemProfileRecords() []runtime.MemProfileRecord {
	// Find out how many records there are (MemProfile(nil, true)),
	// allocate that many records, and get the data.
	// There's a race—more records might be added between
	// the two calls—so allocate a few extra records for safety
	// and also try again if we're very unlucky.
	// The loop should only execute one iteration in the common case.
	var p []runtime.MemProfileRecord
	n, ok := runtime.MemProfile(nil, true)
	for {
		// Allocate room for a slightly bigger profile,
		// in case a few more entries have been added
		// since the call to MemProfile.
		p = make([]runtime.MemProfileRecord, n+50)
		n, ok = runtime.MemProfile(p, true)
		if ok {
			p = p[0:n]
			break
		}
		// Profile grew; try again.
	}
	return p
}

type allocStat struct {
	bytes, objects int64
}

// allocObjects examines the profile records for the named function
// and returns the allocation stats aggregated by source line number.
func allocObjects(records []runtime.MemProfileRecord, function string) map[int]allocStat {
	a := make(map[int]allocStat)
	for _, r := range records {
		for _, s := range r.Stack0 {
			if s == 0 {
				break
			}
			if f := runtime.FuncForPC(s); f != nil {
				name := f.Name()
				_, line := f.FileLine(s)
				if name == function {
					allocStat := a[line]
					allocStat.bytes += r.AllocBytes
					allocStat.objects += r.AllocObjects
					a[line] = allocStat
				}
			}
		}
	}
	for line, stats := range a {
		objects, bytes := scaleHeapSample(stats.objects, stats.bytes, int64(runtime.MemProfileRate))
		a[line] = allocStat{bytes, objects}
	}
	return a
}

// scaleHeapSample unsamples heap allocations.
// Taken from src/cmd/pprof/internal/profile/legacy_profile.go
func scaleHeapSample(count, size, rate int64) (int64, int64) {
	if count == 0 || size == 0 {
		return 0, 0
	}

	if rate <= 1 {
		// if rate==1 all samples were collected so no adjustment is needed.
		// if rate<1 treat as unknown and skip scaling.
		return count, size
	}

	avgSize := float64(size) / float64(count)
	scale := 1 / (1 - math.Exp(-avgSize/float64(rate)))

	return int64(float64(count) * scale), int64(float64(size) * scale)
}
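The scale factor in scaleHeapSample follows from the sampling model: at a rate of R bytes, an allocation of average size s is caught with probability 1 - exp(-s/R), so dividing the sampled totals by that probability estimates the true totals. A small worked example with hypothetical numbers (80 sampled 64 KiB objects at the default 512 KiB rate unsample to roughly 680 objects):

package main

import (
	"fmt"
	"math"
)

// scaleHeapSample unsamples a (count, bytes) pair collected at the given
// sampling rate, as in the test above.
func scaleHeapSample(count, size, rate int64) (int64, int64) {
	if count == 0 || size == 0 {
		return 0, 0
	}
	if rate <= 1 {
		return count, size
	}
	avgSize := float64(size) / float64(count)
	scale := 1 / (1 - math.Exp(-avgSize/float64(rate)))
	return int64(float64(count) * scale), int64(float64(size) * scale)
}

func main() {
	// Hypothetical profile: 80 sampled 64 KiB objects at a 512 KiB rate.
	// Each object is sampled with probability 1-exp(-64/512) ~ 0.118,
	// so ~80 samples imply roughly 680 real objects.
	count, bytes := scaleHeapSample(80, 80*64*1024, 512*1024)
	fmt.Println(count, bytes)
}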