mirror of
https://github.com/golang/go.git
synced 2025-12-08 06:10:04 +00:00
runtime: use bit-parallel operations to compute heap bit summaries
The new implementation is much faster in all cases. name old time/op new time/op delta PallocBitsSummarize/Unpacked00-16 142ns ± 1% 7ns ± 2% -94.75% (p=0.000 n=10+9) PallocBitsSummarize/UnpackedFFFFFFFFFFFFFFFF-16 172ns ± 0% 24ns ± 0% -86.02% (p=0.000 n=9+9) PallocBitsSummarize/UnpackedAA-16 145ns ± 0% 32ns ± 0% -78.16% (p=0.000 n=8+10) PallocBitsSummarize/UnpackedAAAAAAAAAAAAAAAA-16 172ns ± 0% 33ns ± 0% -80.95% (p=0.000 n=9+9) PallocBitsSummarize/Unpacked80000000AAAAAAAA-16 162ns ± 1% 60ns ± 0% -62.69% (p=0.000 n=10+9) PallocBitsSummarize/UnpackedAAAAAAAA00000001-16 163ns ± 0% 68ns ± 1% -58.47% (p=0.000 n=8+10) PallocBitsSummarize/UnpackedBBBBBBBBBBBBBBBB-16 172ns ± 0% 35ns ± 0% -79.70% (p=0.000 n=9+9) PallocBitsSummarize/Unpacked80000000BBBBBBBB-16 161ns ± 0% 63ns ± 0% -60.61% (p=0.000 n=8+10) PallocBitsSummarize/UnpackedBBBBBBBB00000001-16 163ns ± 0% 60ns ± 0% -63.14% (p=0.000 n=9+10) PallocBitsSummarize/UnpackedCCCCCCCCCCCCCCCC-16 172ns ± 0% 39ns ± 0% -77.41% (p=0.000 n=7+10) PallocBitsSummarize/Unpacked4444444444444444-16 172ns ± 0% 39ns ± 0% -77.42% (p=0.000 n=7+10) PallocBitsSummarize/Unpacked4040404040404040-16 173ns ± 2% 51ns ± 1% -70.55% (p=0.000 n=10+10) PallocBitsSummarize/Unpacked4000400040004000-16 160ns ± 1% 53ns ± 0% -66.78% (p=0.000 n=10+10) PallocBitsSummarize/Unpacked1000404044CCAAFF-16 169ns ± 1% 59ns ± 1% -65.28% (p=0.000 n=10+10) Change-Id: I94daa645b76a9cf9c93edeb2058d7132216fcb72 Reviewed-on: https://go-review.googlesource.com/c/go/+/240900 Run-TryBot: Keith Randall <khr@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Michael Knyszek <mknyszek@google.com>
This commit is contained in:
parent
88c094c96a
commit
4e5ed83e8d
2 changed files with 107 additions and 74 deletions
|
|
@ -120,78 +120,99 @@ func (b *pageBits) popcntRange(i, n uint) (s uint) {
|
||||||
// sake of documentation, 0s are free pages and 1s are allocated pages.
|
// sake of documentation, 0s are free pages and 1s are allocated pages.
|
||||||
type pallocBits pageBits
|
type pallocBits pageBits
|
||||||
|
|
||||||
// consec8tab is a table containing the number of consecutive
|
|
||||||
// zero bits for any uint8 value.
|
|
||||||
//
|
|
||||||
// The table is generated by calling consec8(i) for each
|
|
||||||
// possible uint8 value, which is defined as:
|
|
||||||
//
|
|
||||||
// // consec8 counts the maximum number of consecutive 0 bits
|
|
||||||
// // in a uint8.
|
|
||||||
// func consec8(n uint8) int {
|
|
||||||
// n = ^n
|
|
||||||
// i := 0
|
|
||||||
// for n != 0 {
|
|
||||||
// n &= (n << 1)
|
|
||||||
// i++
|
|
||||||
// }
|
|
||||||
// return i
|
|
||||||
// }
|
|
||||||
var consec8tab = [256]uint{
|
|
||||||
8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
|
|
||||||
4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
|
|
||||||
5, 4, 3, 3, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2,
|
|
||||||
4, 3, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2,
|
|
||||||
6, 5, 4, 4, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2,
|
|
||||||
4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 1,
|
|
||||||
5, 4, 3, 3, 2, 2, 2, 2, 3, 2, 1, 1, 2, 1, 1, 1,
|
|
||||||
4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 1,
|
|
||||||
7, 6, 5, 5, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3,
|
|
||||||
4, 3, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2,
|
|
||||||
5, 4, 3, 3, 2, 2, 2, 2, 3, 2, 1, 1, 2, 1, 1, 1,
|
|
||||||
4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 1,
|
|
||||||
6, 5, 4, 4, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2,
|
|
||||||
4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 1,
|
|
||||||
5, 4, 3, 3, 2, 2, 2, 2, 3, 2, 1, 1, 2, 1, 1, 1,
|
|
||||||
4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 0,
|
|
||||||
}
|
|
||||||
|
|
||||||
// summarize returns a packed summary of the bitmap in pallocBits.
|
// summarize returns a packed summary of the bitmap in pallocBits.
|
||||||
func (b *pallocBits) summarize() pallocSum {
|
func (b *pallocBits) summarize() pallocSum {
|
||||||
// TODO(mknyszek): There may be something more clever to be done
|
var start, max, cur uint
|
||||||
// here to make the summarize operation more efficient. For example,
|
const notSetYet = ^uint(0) // sentinel for start value
|
||||||
// we can compute start and end with 64-bit wide operations easily,
|
start = notSetYet
|
||||||
// but max is a bit more complex. Perhaps there exists some way to
|
|
||||||
// leverage the 64-bit start and end to our advantage?
|
|
||||||
var start, max, end uint
|
|
||||||
for i := 0; i < len(b); i++ {
|
for i := 0; i < len(b); i++ {
|
||||||
a := b[i]
|
x := b[i]
|
||||||
for j := 0; j < 64; j += 8 {
|
if x == 0 {
|
||||||
k := uint8(a >> j)
|
cur += 64
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
t := uint(sys.TrailingZeros64(x))
|
||||||
|
l := uint(sys.LeadingZeros64(x))
|
||||||
|
|
||||||
// Compute start.
|
// Finish any region spanning the uint64s
|
||||||
si := uint(sys.TrailingZeros8(k))
|
cur += t
|
||||||
if start == uint(i*64+j) {
|
if start == notSetYet {
|
||||||
start += si
|
start = cur
|
||||||
|
}
|
||||||
|
if cur > max {
|
||||||
|
max = cur
|
||||||
|
}
|
||||||
|
// Final region that might span to next uint64
|
||||||
|
cur = l
|
||||||
|
}
|
||||||
|
if start == notSetYet {
|
||||||
|
// Made it all the way through without finding a single 1 bit.
|
||||||
|
const n = uint(64 * len(b))
|
||||||
|
return packPallocSum(n, n, n)
|
||||||
|
}
|
||||||
|
if cur > max {
|
||||||
|
max = cur
|
||||||
|
}
|
||||||
|
if max >= 64-2 {
|
||||||
|
// There is no way an internal run of zeros could beat max.
|
||||||
|
return packPallocSum(start, max, cur)
|
||||||
|
}
|
||||||
|
// Now look inside each uint64 for runs of zeros.
|
||||||
|
// All uint64s must be nonzero, or we would have aborted above.
|
||||||
|
outer:
|
||||||
|
for i := 0; i < len(b); i++ {
|
||||||
|
x := b[i]
|
||||||
|
|
||||||
|
// Look inside this uint64. We have a pattern like
|
||||||
|
// 000000 1xxxxx1 000000
|
||||||
|
// We need to look inside the 1xxxxx1 for any contiguous
|
||||||
|
// region of zeros.
|
||||||
|
|
||||||
|
// We already know the trailing zeros are no larger than max. Remove them.
|
||||||
|
x >>= sys.TrailingZeros64(x) & 63
|
||||||
|
if x&(x+1) == 0 { // no more zeros (except at the top).
|
||||||
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
// Compute max.
|
// Strategy: shrink all runs of zeros by max. If any runs of zero
|
||||||
if end+si > max {
|
// remain, then we've identified a larger maxiumum zero run.
|
||||||
max = end + si
|
p := max // number of zeros we still need to shrink by.
|
||||||
|
k := uint(1) // current minimum length of runs of ones in x.
|
||||||
|
for {
|
||||||
|
// Shrink all runs of zeros by p places (except the top zeros).
|
||||||
|
for p > 0 {
|
||||||
|
if p <= k {
|
||||||
|
// Shift p ones down into the top of each run of zeros.
|
||||||
|
x |= x >> (p & 63)
|
||||||
|
if x&(x+1) == 0 { // no more zeros (except at the top).
|
||||||
|
continue outer
|
||||||
}
|
}
|
||||||
if mi := consec8tab[k]; mi > max {
|
break
|
||||||
max = mi
|
}
|
||||||
|
// Shift k ones down into the top of each run of zeros.
|
||||||
|
x |= x >> (k & 63)
|
||||||
|
if x&(x+1) == 0 { // no more zeros (except at the top).
|
||||||
|
continue outer
|
||||||
|
}
|
||||||
|
p -= k
|
||||||
|
// We've just doubled the minimum length of 1-runs.
|
||||||
|
// This allows us to shift farther in the next iteration.
|
||||||
|
k *= 2
|
||||||
}
|
}
|
||||||
|
|
||||||
// Compute end.
|
// The length of the lowest-order zero run is an increment to our maximum.
|
||||||
if k == 0 {
|
j := uint(sys.TrailingZeros64(^x)) // count contiguous trailing ones
|
||||||
end += 8
|
x >>= j & 63 // remove trailing ones
|
||||||
} else {
|
j = uint(sys.TrailingZeros64(x)) // count contiguous trailing zeros
|
||||||
end = uint(sys.LeadingZeros8(k))
|
x >>= j & 63 // remove zeros
|
||||||
|
max += j // we have a new maximum!
|
||||||
|
if x&(x+1) == 0 { // no more zeros (except at the top).
|
||||||
|
continue outer
|
||||||
|
}
|
||||||
|
p = j // remove j more zeros from each zero run.
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
return packPallocSum(start, max, cur)
|
||||||
return packPallocSum(start, max, end)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// find searches for npages contiguous free pages in pallocBits and returns
|
// find searches for npages contiguous free pages in pallocBits and returns
|
||||||
|
|
|
||||||
|
|
@ -101,7 +101,7 @@ func invertPallocBits(b *PallocBits) {
|
||||||
|
|
||||||
// Ensures two packed summaries are identical, and reports a detailed description
|
// Ensures two packed summaries are identical, and reports a detailed description
|
||||||
// of the difference if they're not.
|
// of the difference if they're not.
|
||||||
func checkPallocSum(t *testing.T, got, want PallocSum) {
|
func checkPallocSum(t testing.TB, got, want PallocSum) {
|
||||||
if got.Start() != want.Start() {
|
if got.Start() != want.Start() {
|
||||||
t.Errorf("inconsistent start: got %d, want %d", got.Start(), want.Start())
|
t.Errorf("inconsistent start: got %d, want %d", got.Start(), want.Start())
|
||||||
}
|
}
|
||||||
|
|
@ -297,17 +297,29 @@ func TestPallocBitsSummarize(t *testing.T) {
|
||||||
|
|
||||||
// Benchmarks how quickly we can summarize a PallocBits.
|
// Benchmarks how quickly we can summarize a PallocBits.
|
||||||
func BenchmarkPallocBitsSummarize(b *testing.B) {
|
func BenchmarkPallocBitsSummarize(b *testing.B) {
|
||||||
buf0 := new(PallocBits)
|
patterns := []uint64{
|
||||||
buf1 := new(PallocBits)
|
0,
|
||||||
for i := 0; i < len(buf1); i++ {
|
^uint64(0),
|
||||||
buf1[i] = ^uint64(0)
|
0xaa,
|
||||||
|
0xaaaaaaaaaaaaaaaa,
|
||||||
|
0x80000000aaaaaaaa,
|
||||||
|
0xaaaaaaaa00000001,
|
||||||
|
0xbbbbbbbbbbbbbbbb,
|
||||||
|
0x80000000bbbbbbbb,
|
||||||
|
0xbbbbbbbb00000001,
|
||||||
|
0xcccccccccccccccc,
|
||||||
|
0x4444444444444444,
|
||||||
|
0x4040404040404040,
|
||||||
|
0x4000400040004000,
|
||||||
|
0x1000404044ccaaff,
|
||||||
}
|
}
|
||||||
bufa := new(PallocBits)
|
for _, p := range patterns {
|
||||||
for i := 0; i < len(bufa); i++ {
|
buf := new(PallocBits)
|
||||||
bufa[i] = 0xaa
|
for i := 0; i < len(buf); i++ {
|
||||||
|
buf[i] = p
|
||||||
}
|
}
|
||||||
for _, buf := range []*PallocBits{buf0, buf1, bufa} {
|
b.Run(fmt.Sprintf("Unpacked%02X", p), func(b *testing.B) {
|
||||||
b.Run(fmt.Sprintf("Unpacked%02X", buf[0]), func(b *testing.B) {
|
checkPallocSum(b, buf.Summarize(), SummarizeSlow(buf))
|
||||||
for i := 0; i < b.N; i++ {
|
for i := 0; i < b.N; i++ {
|
||||||
buf.Summarize()
|
buf.Summarize()
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue