mirror of
https://github.com/golang/go.git
synced 2025-12-08 06:10:04 +00:00
internal/runtime/gc/scan: add AVX512 impl of filterNil.
Benchmark results:

ScanSpanPacked/cache=ram/pages=10368/sizeclass=25/pct=60-48    6.356Gi ± ∞ ¹   7.332Gi ± ∞ ¹   ~ (p=1.000 n=1) ²
ScanSpanPacked/cache=ram/pages=10368/sizeclass=25/pct=70-48    6.756Gi ± ∞ ¹   8.302Gi ± ∞ ¹   ~ (p=1.000 n=1) ²
ScanSpanPacked/cache=ram/pages=10368/sizeclass=25/pct=80-48    7.018Gi ± ∞ ¹   8.658Gi ± ∞ ¹   ~ (p=1.000 n=1) ²
ScanSpanPacked/cache=ram/pages=10368/sizeclass=25/pct=90-48    7.313Gi ± ∞ ¹   9.055Gi ± ∞ ¹   ~ (p=1.000 n=1) ²
ScanSpanPacked/cache=ram/pages=10368/sizeclass=25/pct=100-48   7.583Gi ± ∞ ¹   9.557Gi ± ∞ ¹   ~ (p=1.000 n=1) ²
geomean                                                        10.61Gi         14.83Gi         +39.81%

Almost a 40% improvement in throughput.

Change-Id: I6f31a0f0202ec7f3c9d2bbffca5d6e377306fc25
Reviewed-on: https://go-review.googlesource.com/c/go/+/722040
Reviewed-by: Michael Knyszek <mknyszek@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
This commit is contained in:
parent
ccd389036a
commit
a9704f89ea
6 changed files with 113 additions and 19 deletions
|
|
@ -9,8 +9,6 @@ import "unsafe"
|
|||
// FilterNil packs non-nil (non-zero) values in bufp together
|
||||
// at the beginning of bufp, returning the length of the
|
||||
// packed buffer. It treats bufp as an array of size n.
|
||||
//
|
||||
// TODO(mknyszek): Add a faster SIMD-based implementation.
|
||||
func FilterNil(bufp *uintptr, n int32) int32 {
|
||||
buf := unsafe.Slice(bufp, int(n))
|
||||
lo := 0
|
||||
|
|
|
|||
9
src/internal/runtime/gc/scan/filter_amd64.go
Normal file
9
src/internal/runtime/gc/scan/filter_amd64.go
Normal file
|
|
@ -0,0 +1,9 @@
|
|||
// Copyright 2025 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package scan
|
||||
|
||||
// FilterNilAVX512 is the SIMD (AVX-512) version of FilterNil: it packs the
|
||||
// non-zero values of the n-element array at bufp together at the beginning
// of the array and returns the packed length. It is implemented in assembly
// (filter_amd64.s) and uses AVX-512 instructions, so it must only be called
// on CPUs that support them (the test gates on CanAVX512).
|
||||
func FilterNilAVX512(bufp *uintptr, n int32) int32
|
||||
64
src/internal/runtime/gc/scan/filter_amd64.s
Normal file
64
src/internal/runtime/gc/scan/filter_amd64.s
Normal file
|
|
@ -0,0 +1,64 @@
|
|||
// Copyright 2025 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "go_asm.h"
|
||||
#include "textflag.h"
|
||||
|
||||
// func FilterNilAVX512(bufp *uintptr, n int32) int32
//
// FilterNilAVX512 packs the non-zero 64-bit words of bufp[0:n] together at
// the start of the buffer, preserving their order, and returns the number
// of words kept. A non-positive n returns 0 without touching memory.
//
// Register use:
//	R8  = bufp
//	R9  = n
//	R10 = scanned (read index)
//	R11 = cnt (write index; always <= scanned)
//	Z0  = all-zero vector used as the comparison operand
//	Z1  = the 8 words currently being filtered
//	K1  = per-word "is non-zero" mask
TEXT ·FilterNilAVX512(SB), NOSPLIT, $0-20
	// Load arguments.
	MOVQ	bufp+0(FP), R8		// R8 = bufp (start of the uintptr array)
	MOVL	n+8(FP), R9		// R9 = n (total length)
	XORL	R10, R10		// R10 = 0 (scanned = 0)
	XORL	R11, R11		// R11 = 0 (cnt = 0)

	CMPL	R9, $8			// scanned is 0, so n - scanned == n
	JLT	scalar_loop		// fewer than 8 words: scalar path only

	// Zero the comparison operand explicitly instead of relying on the
	// ambient contents of a vector register: all 512 bits must be zero
	// for the compare below to be correct.
	VPXORQ	Z0, Z0, Z0

vector_loop:
	VMOVDQU64	(R8)(R10*8), Z1	// Z1 = buf[scanned : scanned+8]
	VPCMPUQ	$4, Z1, Z0, K1		// K1 bit i set iff word i != 0 (predicate 4 = NEQ)

	// Move the selected words to the low lanes of Z1 and store all 64
	// bytes at buf[cnt:]. The full-width store is safe: cnt <= scanned,
	// so it ends at or before buf[scanned+8] and only touches words
	// that have already been read.
	VPCOMPRESSQ	Z1, K1, Z1
	VMOVDQU64	Z1, (R8)(R11*8)

	KMOVW	K1, R15
	POPCNTL	R15, R15		// R15 = number of non-zero words in this chunk

	ADDL	R15, R11		// cnt += popcount(K1)
	ADDL	$8, R10			// scanned += 8

	MOVL	R9, R12
	SUBL	R10, R12		// R12 = n - scanned
	CMPL	R12, $8
	JGE	vector_loop		// at least 8 words left: keep vectorizing

	// Clear the upper vector state before returning to code that may use
	// legacy SSE instructions.
	VZEROUPPER

scalar_loop:
	CMPL	R10, R9
	JGE	end			// scanned >= n: nothing left

scalar_next:
	MOVQ	(R8)(R10*8), R14	// R14 = buf[scanned]
	TESTQ	R14, R14
	JEQ	scalar_skip		// zero word: drop it

	MOVQ	R14, (R8)(R11*8)	// buf[cnt] = buf[scanned]
	ADDL	$1, R11			// cnt++

scalar_skip:
	ADDL	$1, R10			// scanned++
	CMPL	R10, R9
	JLT	scalar_next		// scanned < n: continue

end:
	MOVL	R11, ret+16(FP)
	RET
|
||||
19
src/internal/runtime/gc/scan/filter_amd64_test.go
Normal file
19
src/internal/runtime/gc/scan/filter_amd64_test.go
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
// Copyright 2025 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:build amd64
|
||||
|
||||
package scan_test
|
||||
|
||||
import (
|
||||
"internal/runtime/gc/scan"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestFilterNilAVX512 runs the shared FilterNil test cases against the
// assembly implementation scan.FilterNilAVX512. It is skipped when
// scan.CanAVX512 reports false.
func TestFilterNilAVX512(t *testing.T) {
|
||||
if !scan.CanAVX512() {
|
||||
t.Skip("AVX512 is required for TestFilterNilAVX512")
|
||||
}
|
||||
runTestFilterNil(t, scan.FilterNilAVX512)
|
||||
}
|
||||
|
|
@ -10,56 +10,60 @@ import (
|
|||
)
|
||||
|
||||
// TestFilterNil runs the shared FilterNil test cases against scan.FilterNil.
func TestFilterNil(t *testing.T) {
|
||||
runTestFilterNil(t, scan.FilterNil)
|
||||
}
|
||||
|
||||
func runTestFilterNil(t *testing.T, filterNil func(*uintptr, int32) int32) {
|
||||
t.Run("empty", func(t *testing.T) {
|
||||
testFilterNil(t, []uintptr{}, []uintptr{})
|
||||
testFilterNil(t, []uintptr{}, []uintptr{}, filterNil)
|
||||
})
|
||||
t.Run("one", func(t *testing.T) {
|
||||
testFilterNil(t, []uintptr{4}, []uintptr{4})
|
||||
testFilterNil(t, []uintptr{4}, []uintptr{4}, filterNil)
|
||||
})
|
||||
t.Run("elimOne", func(t *testing.T) {
|
||||
testFilterNil(t, []uintptr{0}, []uintptr{})
|
||||
testFilterNil(t, []uintptr{0}, []uintptr{}, filterNil)
|
||||
})
|
||||
t.Run("oneElimBegin", func(t *testing.T) {
|
||||
testFilterNil(t, []uintptr{0, 4}, []uintptr{4})
|
||||
testFilterNil(t, []uintptr{0, 4}, []uintptr{4}, filterNil)
|
||||
})
|
||||
t.Run("oneElimEnd", func(t *testing.T) {
|
||||
testFilterNil(t, []uintptr{4, 0}, []uintptr{4})
|
||||
testFilterNil(t, []uintptr{4, 0}, []uintptr{4}, filterNil)
|
||||
})
|
||||
t.Run("oneElimMultiBegin", func(t *testing.T) {
|
||||
testFilterNil(t, []uintptr{0, 0, 0, 4}, []uintptr{4})
|
||||
testFilterNil(t, []uintptr{0, 0, 0, 4}, []uintptr{4}, filterNil)
|
||||
})
|
||||
t.Run("oneElimMultiEnd", func(t *testing.T) {
|
||||
testFilterNil(t, []uintptr{4, 0, 0, 0}, []uintptr{4})
|
||||
testFilterNil(t, []uintptr{4, 0, 0, 0}, []uintptr{4}, filterNil)
|
||||
})
|
||||
t.Run("oneElimMulti", func(t *testing.T) {
|
||||
testFilterNil(t, []uintptr{0, 0, 0, 4, 0}, []uintptr{4})
|
||||
testFilterNil(t, []uintptr{0, 0, 0, 4, 0}, []uintptr{4}, filterNil)
|
||||
})
|
||||
t.Run("two", func(t *testing.T) {
|
||||
testFilterNil(t, []uintptr{5, 12}, []uintptr{5, 12})
|
||||
testFilterNil(t, []uintptr{5, 12}, []uintptr{5, 12}, filterNil)
|
||||
})
|
||||
t.Run("twoElimBegin", func(t *testing.T) {
|
||||
testFilterNil(t, []uintptr{0, 5, 12}, []uintptr{5, 12})
|
||||
testFilterNil(t, []uintptr{0, 5, 12}, []uintptr{5, 12}, filterNil)
|
||||
})
|
||||
t.Run("twoElimMid", func(t *testing.T) {
|
||||
testFilterNil(t, []uintptr{5, 0, 12}, []uintptr{5, 12})
|
||||
testFilterNil(t, []uintptr{5, 0, 12}, []uintptr{5, 12}, filterNil)
|
||||
})
|
||||
t.Run("twoElimEnd", func(t *testing.T) {
|
||||
testFilterNil(t, []uintptr{5, 12, 0}, []uintptr{5, 12})
|
||||
testFilterNil(t, []uintptr{5, 12, 0}, []uintptr{5, 12}, filterNil)
|
||||
})
|
||||
t.Run("twoElimMulti", func(t *testing.T) {
|
||||
testFilterNil(t, []uintptr{0, 5, 0, 12, 0}, []uintptr{5, 12})
|
||||
testFilterNil(t, []uintptr{0, 5, 0, 12, 0}, []uintptr{5, 12}, filterNil)
|
||||
})
|
||||
t.Run("Multi", func(t *testing.T) {
|
||||
testFilterNil(t, []uintptr{1, 5, 5, 0, 0, 0, 12, 0, 121, 5, 0}, []uintptr{1, 5, 5, 12, 121, 5})
|
||||
testFilterNil(t, []uintptr{1, 5, 5, 0, 0, 0, 12, 0, 121, 5, 0}, []uintptr{1, 5, 5, 12, 121, 5}, filterNil)
|
||||
})
|
||||
}
|
||||
|
||||
func testFilterNil(t *testing.T, buf, want []uintptr) {
|
||||
func testFilterNil(t *testing.T, buf, want []uintptr, filterNil func(*uintptr, int32) int32) {
|
||||
var bufp *uintptr
|
||||
if len(buf) != 0 {
|
||||
bufp = &buf[0]
|
||||
}
|
||||
n := scan.FilterNil(bufp, int32(len(buf)))
|
||||
n := filterNil(bufp, int32(len(buf)))
|
||||
if n > int32(len(buf)) {
|
||||
t.Errorf("bogus new length returned: %d > %d", n, len(buf))
|
||||
return
|
||||
|
|
|
|||
|
|
@ -28,7 +28,7 @@ func CanAVX512() bool {
|
|||
}
|
||||
|
||||
func ScanSpanPackedAVX512(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) {
|
||||
return FilterNil(bufp, scanSpanPackedAVX512(mem, bufp, objMarks, sizeClass, ptrMask))
|
||||
return FilterNilAVX512(bufp, scanSpanPackedAVX512(mem, bufp, objMarks, sizeClass, ptrMask))
|
||||
}
|
||||
|
||||
//go:noescape
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue