diff --git a/src/internal/runtime/gc/scan/filter.go b/src/internal/runtime/gc/scan/filter.go index 63cee9abf06..08b5fd5b545 100644 --- a/src/internal/runtime/gc/scan/filter.go +++ b/src/internal/runtime/gc/scan/filter.go @@ -9,8 +9,6 @@ import "unsafe" // FilterNil packs non-nil (non-zero) values in bufp together // at the beginning of bufp, returning the length of the // packed buffer. It treats bufp as an array of size n. -// -// TODO(mknyszek): Add a faster SIMD-based implementation. func FilterNil(bufp *uintptr, n int32) int32 { buf := unsafe.Slice(bufp, int(n)) lo := 0 diff --git a/src/internal/runtime/gc/scan/filter_amd64.go b/src/internal/runtime/gc/scan/filter_amd64.go new file mode 100644 index 00000000000..c750741f7f4 --- /dev/null +++ b/src/internal/runtime/gc/scan/filter_amd64.go @@ -0,0 +1,9 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package scan + +// FilterNilAVX512 is the simd version of FilterNil, +// it is implemented in assembly. +func FilterNilAVX512(bufp *uintptr, n int32) int32 diff --git a/src/internal/runtime/gc/scan/filter_amd64.s b/src/internal/runtime/gc/scan/filter_amd64.s new file mode 100644 index 00000000000..47330b62eea --- /dev/null +++ b/src/internal/runtime/gc/scan/filter_amd64.s @@ -0,0 +1,64 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "go_asm.h" +#include "textflag.h" + +TEXT ·FilterNilAVX512(SB), NOSPLIT, $0-20 + // Load arguments + MOVQ bufp+0(FP), R8 // R8 = bufp (start of the uint64 array) + MOVL n+8(FP), R9 // R9 = n (total length) + XORL R10, R10 // R10 = 0 (scanned = 0) + XORL R11, R11 // R11 = 0 (cnt = 0) + + MOVL R9, R12 // R12 = n + SUBL R10, R12 // R12 = n - scanned + CMPL R12, $8 // Compare (n - scanned) with 8 + JLT scalar_loop // If (n - scanned) < 8, jump to the scalar cleanup + +vector_loop: + LEAQ (R8)(R10*8), R13 // R13 = buf[scanned:] address + VMOVDQU64 (R13), Z1 // Z1 = v (Load 8 uint64s) + VPCMPUQ $4, Z1, Z15, K1 // Z15 is always 0, compare Z1 with 0, results in K1. + + LEAQ (R8)(R11*8), R14 // R14 = buf[cnt:] address + VPCOMPRESSQ Z1, K1, Z1 // compress v + VMOVDQU64 Z1, (R14) // store v to buf[cnt:] + + KMOVW K1, R15 + POPCNTL R15, R15 // R15 = popcount(K1) + + ADDL R15, R11 // cnt += popcount(K1) + ADDL $8, R10 // scanned += 8 + + MOVL R9, R12 // R12 = n + SUBL R10, R12 // R12 = n - scanned + CMPL R12, $8 // Compare (n - scanned) with 8 + JGE vector_loop // If (n - scanned) >= 8, continue loop + +scalar_loop: + CMPL R10, R9 // Compare scanned with n + JGE end // If scanned >= n, loop is done + +scalar_next_i: + LEAQ (R8)(R10*8), R13 // R13 = &buf[scanned] + MOVQ (R13), R14 // R14 = buf[scanned] + + CMPQ R14, $0 + JE scalar_increment_i // If buf[i] == 0, skip to increment i + + LEAQ (R8)(R11*8), R15 // R15 = &buf[cnt] + MOVQ R14, (R15) // buf[cnt] = buf[scanned] + + ADDL $1, R11 // cnt++ + +scalar_increment_i: + ADDL $1, R10 // scanned++ + + CMPL R10, R9 + JL scalar_next_i // if scanned < n, continue + +end: + MOVL R11, ret+16(FP) + RET diff --git a/src/internal/runtime/gc/scan/filter_amd64_test.go b/src/internal/runtime/gc/scan/filter_amd64_test.go new file mode 100644 index 00000000000..f5426385f63 --- /dev/null +++ b/src/internal/runtime/gc/scan/filter_amd64_test.go @@ -0,0 +1,19 @@ +// Copyright 2025 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build amd64 + +package scan_test + +import ( + "internal/runtime/gc/scan" + "testing" +) + +func TestFilterNilAVX512(t *testing.T) { + if !scan.CanAVX512() { + t.Skip("AVX512 is required for TestFilterNilAVX512") + } + runTestFilterNil(t, scan.FilterNilAVX512) +} diff --git a/src/internal/runtime/gc/scan/filter_test.go b/src/internal/runtime/gc/scan/filter_test.go index 115fbfb8bcf..7b5dd32db3b 100644 --- a/src/internal/runtime/gc/scan/filter_test.go +++ b/src/internal/runtime/gc/scan/filter_test.go @@ -10,56 +10,60 @@ import ( ) func TestFilterNil(t *testing.T) { + runTestFilterNil(t, scan.FilterNil) +} + +func runTestFilterNil(t *testing.T, filterNil func(*uintptr, int32) int32) { t.Run("empty", func(t *testing.T) { - testFilterNil(t, []uintptr{}, []uintptr{}) + testFilterNil(t, []uintptr{}, []uintptr{}, filterNil) }) t.Run("one", func(t *testing.T) { - testFilterNil(t, []uintptr{4}, []uintptr{4}) + testFilterNil(t, []uintptr{4}, []uintptr{4}, filterNil) }) t.Run("elimOne", func(t *testing.T) { - testFilterNil(t, []uintptr{0}, []uintptr{}) + testFilterNil(t, []uintptr{0}, []uintptr{}, filterNil) }) t.Run("oneElimBegin", func(t *testing.T) { - testFilterNil(t, []uintptr{0, 4}, []uintptr{4}) + testFilterNil(t, []uintptr{0, 4}, []uintptr{4}, filterNil) }) t.Run("oneElimEnd", func(t *testing.T) { - testFilterNil(t, []uintptr{4, 0}, []uintptr{4}) + testFilterNil(t, []uintptr{4, 0}, []uintptr{4}, filterNil) }) t.Run("oneElimMultiBegin", func(t *testing.T) { - testFilterNil(t, []uintptr{0, 0, 0, 4}, []uintptr{4}) + testFilterNil(t, []uintptr{0, 0, 0, 4}, []uintptr{4}, filterNil) }) t.Run("oneElimMultiEnd", func(t *testing.T) { - testFilterNil(t, []uintptr{4, 0, 0, 0}, []uintptr{4}) + testFilterNil(t, []uintptr{4, 0, 0, 0}, []uintptr{4}, filterNil) }) t.Run("oneElimMulti", func(t *testing.T) { - testFilterNil(t, []uintptr{0, 0, 0, 4, 0}, []uintptr{4}) + testFilterNil(t, []uintptr{0, 0, 0, 4, 0}, []uintptr{4}, filterNil) }) t.Run("two", func(t *testing.T) { - testFilterNil(t, []uintptr{5, 12}, []uintptr{5, 12}) + testFilterNil(t, []uintptr{5, 12}, []uintptr{5, 12}, filterNil) }) t.Run("twoElimBegin", func(t *testing.T) { - testFilterNil(t, []uintptr{0, 5, 12}, []uintptr{5, 12}) + testFilterNil(t, []uintptr{0, 5, 12}, []uintptr{5, 12}, filterNil) }) t.Run("twoElimMid", func(t *testing.T) { - testFilterNil(t, []uintptr{5, 0, 12}, []uintptr{5, 12}) + testFilterNil(t, []uintptr{5, 0, 12}, []uintptr{5, 12}, filterNil) }) t.Run("twoElimEnd", func(t *testing.T) { - testFilterNil(t, []uintptr{5, 12, 0}, []uintptr{5, 12}) + testFilterNil(t, []uintptr{5, 12, 0}, []uintptr{5, 12}, filterNil) }) t.Run("twoElimMulti", func(t *testing.T) { - testFilterNil(t, []uintptr{0, 5, 0, 12, 0}, []uintptr{5, 12}) + testFilterNil(t, []uintptr{0, 5, 0, 12, 0}, []uintptr{5, 12}, filterNil) }) t.Run("Multi", func(t *testing.T) { - testFilterNil(t, []uintptr{1, 5, 5, 0, 0, 0, 12, 0, 121, 5, 0}, []uintptr{1, 5, 5, 12, 121, 5}) + testFilterNil(t, []uintptr{1, 5, 5, 0, 0, 0, 12, 0, 121, 5, 0}, []uintptr{1, 5, 5, 12, 121, 5}, filterNil) }) } -func testFilterNil(t *testing.T, buf, want []uintptr) { +func testFilterNil(t *testing.T, buf, want []uintptr, filterNil func(*uintptr, int32) int32) { var bufp *uintptr if len(buf) != 0 { bufp = &buf[0] } - n := scan.FilterNil(bufp, int32(len(buf))) + n := filterNil(bufp, int32(len(buf))) if n > int32(len(buf)) { t.Errorf("bogus new length returned: %d > %d", n, len(buf)) return diff --git a/src/internal/runtime/gc/scan/scan_amd64.go b/src/internal/runtime/gc/scan/scan_amd64.go index 2ac181f97e5..0151804fa53 100644 --- a/src/internal/runtime/gc/scan/scan_amd64.go +++ b/src/internal/runtime/gc/scan/scan_amd64.go @@ -28,7 +28,7 @@ func CanAVX512() bool { } func ScanSpanPackedAVX512(mem unsafe.Pointer, bufp *uintptr, objMarks *gc.ObjMask, sizeClass uintptr, ptrMask *gc.PtrMask) (count int32) { - return FilterNil(bufp, scanSpanPackedAVX512(mem, bufp, objMarks, sizeClass, ptrMask)) + return FilterNilAVX512(bufp, scanSpanPackedAVX512(mem, bufp, objMarks, sizeClass, ptrMask)) } //go:noescape