From d2270bccbda381a542b77157c9960e4ae90df8ad Mon Sep 17 00:00:00 2001
From: David Chase
Date: Fri, 5 Sep 2025 19:05:18 -0400
Subject: [PATCH] [dev.simd] cmd/compile: track which CPU features are in scope

analysis for
- is this block only reached through feature checks?
- does the function signature imply AVX-something?
- is there an instruction in this block which implies AVX-something?

and keep track of which features those are.
Features = AVX, AVX2, AVX512, etc.

Has a test.

Change-Id: I0b6f2e87d01ec587818db11cf71fac1e4d500650
Reviewed-on: https://go-review.googlesource.com/c/go/+/706337
LUCI-TryBot-Result: Go LUCI
Reviewed-by: Junyang Shao
---
 src/cmd/compile/internal/ssa/block.go       |  59 ++++
 src/cmd/compile/internal/ssa/compile.go     |   3 +
 src/cmd/compile/internal/ssa/cpufeatures.go | 261 ++++++++++++++++++++
 src/cmd/compile/internal/ssa/sizeof_test.go |   2 +-
 src/cmd/compile/internal/types/type.go      |   2 +
 test/simd.go                                |  97 ++++++++
 6 files changed, 423 insertions(+), 1 deletion(-)
 create mode 100644 src/cmd/compile/internal/ssa/cpufeatures.go
 create mode 100644 test/simd.go

diff --git a/src/cmd/compile/internal/ssa/block.go b/src/cmd/compile/internal/ssa/block.go
index 1240bfd6556..f457e66f16e 100644
--- a/src/cmd/compile/internal/ssa/block.go
+++ b/src/cmd/compile/internal/ssa/block.go
@@ -18,6 +18,9 @@ type Block struct {
 	// Source position for block's control operation
 	Pos src.XPos
 
+	// What cpu features (AVXnnn, SVEyyy) are implied to reach/execute this block?
+	CPUfeatures CPUfeatures
+
 	// The kind of block this is.
 	Kind BlockKind
 
@@ -449,3 +452,59 @@ const (
 	HotPgoInitial          = HotPgo | HotInitial                // special case; single block loop, initial block is header block has a flow-in entry, but PGO says it is hot
 	HotPgoInitialNotFLowIn = HotPgo | HotInitial | HotNotFlowIn // PGO says it is hot, and the loop is rotated so flow enters loop with a branch
 )
+
+// CPUfeatures is a bit set of CPU features (AVX, AVX2, AVX512, ...),
+// one bit per feature; CPUNone is the empty set and CPUAll the full set.
+type CPUfeatures uint32
+
+const (
+	CPUNone CPUfeatures = 0
+	CPUAll  CPUfeatures = ^CPUfeatures(0)
+	CPUavx  CPUfeatures = 1 << iota // iota is 2 here, so feature bits start at 1<<2
+	CPUavx2
+	CPUavxvnni
+	CPUavx512
+	CPUbitalg
+	CPUgfni
+	CPUvbmi
+	CPUvbmi2
+	CPUvpopcntdq
+	CPUavx512vnni
+
+	CPUneon
+	CPUsve2
+)
+
+// String returns "none" for CPUNone, "all" for CPUAll, and otherwise
+// the names of the features contained in f, joined by "+".
+func (f CPUfeatures) String() string {
+	if f == CPUNone {
+		return "none"
+	}
+	if f == CPUAll {
+		return "all"
+	}
+	s := ""
+	add := func(what string, feat CPUfeatures) {
+		if feat&f != 0 {
+			if s != "" {
+				s += "+"
+			}
+			s += what
+		}
+	}
+	add("avx", CPUavx)
+	add("avx2", CPUavx2)
+	add("avx512", CPUavx512)
+	add("avxvnni", CPUavxvnni)
+	add("bitalg", CPUbitalg)
+	add("gfni", CPUgfni)
+	add("vbmi", CPUvbmi)
+	add("vbmi2", CPUvbmi2)
+	add("popcntdq", CPUvpopcntdq)
+	add("avx512vnni", CPUavx512vnni)
+	add("neon", CPUneon)
+	add("sve2", CPUsve2)
+
+	return s
+}
diff --git a/src/cmd/compile/internal/ssa/compile.go b/src/cmd/compile/internal/ssa/compile.go
index 1f473625833..be1a6f158e6 100644
--- a/src/cmd/compile/internal/ssa/compile.go
+++ b/src/cmd/compile/internal/ssa/compile.go
@@ -485,6 +485,7 @@ var passes = [...]pass{
 	{name: "writebarrier", fn: writebarrier, required: true}, // expand write barrier ops
 	{name: "insert resched checks", fn: insertLoopReschedChecks,
 		disabled: !buildcfg.Experiment.PreemptibleLoops}, // insert resched checks in loops.
+	{name: "cpufeatures", fn: cpufeatures, required: buildcfg.Experiment.SIMD, disabled: !buildcfg.Experiment.SIMD},
 	{name: "lower", fn: lower, required: true},
 	{name: "addressing modes", fn: addressingModes, required: false},
 	{name: "late lower", fn: lateLower, required: true},
@@ -587,6 +588,8 @@ var passOrder = [...]constraint{
 	{"branchelim", "late opt"},
 	// ranchelim is an arch-independent pass.
 	{"branchelim", "lower"},
+	// lower needs cpu feature information (for SIMD), so cpufeatures must run first
+	{"cpufeatures", "lower"},
 }
 
 func init() {
diff --git a/src/cmd/compile/internal/ssa/cpufeatures.go b/src/cmd/compile/internal/ssa/cpufeatures.go
new file mode 100644
index 00000000000..77b1db552d2
--- /dev/null
+++ b/src/cmd/compile/internal/ssa/cpufeatures.go
@@ -0,0 +1,261 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package ssa
+
+import (
+	"cmd/compile/internal/types"
+	"cmd/internal/obj"
+	"fmt"
+	"internal/goarch"
+)
+
+type localEffect struct { // per-block state of the cpufeatures analysis
+	start    CPUfeatures    // features present at beginning of block
+	internal CPUfeatures    // features implied by execution of block
+	end      [2]CPUfeatures // features present on outgoing edges; only a BlockIf distinguishes end[0] from end[1]
+	visited  bool           // set once the block has been processed; in the first pass it is still false for back-edge predecessors
+}
+
+func (e localEffect) String() string {
+	return fmt.Sprintf("visited=%v, start=%v, internal=%v, end[0]=%v, end[1]=%v", e.visited, e.start, e.internal, e.end[0], e.end[1])
+}
+
+// ifEffect matches a BlockIf whose control is a load (possibly wrapped in
+// OpNot) of a field of internal/cpu.X86. It returns the features implied by
+// that field and the outgoing edge (0, or 1 if negated) on which they hold.
+func ifEffect(b *Block) (features CPUfeatures, taken int) {
+	// TODO generalize for other architectures.
+	if b.Kind != BlockIf {
+		return
+	}
+	c := b.Controls[0]
+
+	if c.Op == OpNot {
+		taken = 1 // negated check: the feature holds on outgoing edge 1
+		c = c.Args[0]
+	}
+	if c.Op != OpLoad {
+		return
+	}
+	offPtr := c.Args[0]
+	if offPtr.Op != OpOffPtr {
+		return
+	}
+	addr := offPtr.Args[0]
+	if addr.Op != OpAddr || addr.Args[0].Op != OpSB {
+		return
+	}
+	sym := addr.Aux.(*obj.LSym)
+	if sym.Name != "internal/cpu.X86" {
+		return
+	}
+	o := offPtr.AuxInt // offset of the loaded field within the struct
+	t := addr.Type
+	if !t.IsPtr() {
+		b.Func.Fatalf("The symbol %s is not a pointer, found %v instead", sym.Name, t)
+	}
+	t = t.Elem()
+	if !t.IsStruct() {
+		b.Func.Fatalf("The referent of symbol %s is not a struct, found %v instead", sym.Name, t)
+	}
+	match := "" // name of the field at offset o, if any
+	for _, f := range t.Fields() {
+		if o == f.Offset && f.Sym != nil {
+			match = f.Sym.Name
+			break
+		}
+	}
+
+	switch match {
+
+	case "HasAVX":
+		features = CPUavx
+	case "HasAVXVNNI":
+		features = CPUavx | CPUavxvnni
+	case "HasAVX2":
+		features = CPUavx2 | CPUavx
+
+	// Compiler currently treats these all alike.
+	case "HasAVX512", "HasAVX512F", "HasAVX512CD", "HasAVX512BW",
+		"HasAVX512DQ", "HasAVX512VL", "HasAVX512VPCLMULQDQ":
+		features = CPUavx512 | CPUavx2 | CPUavx
+
+	case "HasAVX512GFNI":
+		features = CPUavx512 | CPUgfni | CPUavx2 | CPUavx
+	case "HasAVX512VNNI":
+		features = CPUavx512 | CPUavx512vnni | CPUavx2 | CPUavx
+	case "HasAVX512VBMI":
+		features = CPUavx512 | CPUvbmi | CPUavx2 | CPUavx
+	case "HasAVX512VBMI2":
+		features = CPUavx512 | CPUvbmi2 | CPUavx2 | CPUavx
+	case "HasAVX512BITALG":
+		features = CPUavx512 | CPUbitalg | CPUavx2 | CPUavx
+	case "HasAVX512VPOPCNTDQ":
+		features = CPUavx512 | CPUvpopcntdq | CPUavx2 | CPUavx
+
+	// BMI1/BMI2 are scalar bit-manipulation extensions, unrelated to
+	// AVX512-VBMI/VBMI2; a check for them implies no vector features.
+	case "HasBMI1", "HasBMI2":
+		features = CPUNone
+
+	// Features that are not currently interesting to the compiler.
+	case "HasAES", "HasADX", "HasERMS", "HasFSRM", "HasFMA", "HasGFNI", "HasOSXSAVE",
+		"HasPCLMULQDQ", "HasPOPCNT", "HasRDTSCP", "HasSHA",
+		"HasSSE3", "HasSSSE3", "HasSSE41", "HasSSE42":
+
+	}
+	if b.Func.pass.debug > 2 {
+		b.Func.Warnl(b.Pos, "%s, block b%v has features offset %d, match is %s, features is %v", b.Func.Name, b.ID, o, match, features)
+	}
+	return
+}
+
+// cpufeatures records in each block's CPUfeatures field the set of CPU features known to be available in that block.
+func cpufeatures(f *Func) {
+	// TODO there are other SIMD architectures
+	if f.Config.Ctxt().Arch.Family != goarch.AMD64 {
+		return
+	}
+
+	po := f.Postorder()
+
+	effects := make([]localEffect, 1+f.NumBlocks()) // indexed by block ID
+
+	features := func(t *types.Type) CPUfeatures { // features implied by the presence of a value of type t
+		if t.IsSIMD() {
+			switch t.Size() { // size in bytes
+			case 16, 32:
+				return CPUavx
+			case 64:
+				return CPUavx512 | CPUavx2 | CPUavx
+			}
+		}
+		return CPUNone
+	}
+
+	// visit blocks in reverse post order
+	// when b is visited, all of its predecessors (except for loop back edges)
+	// will have been visited
+	for i := len(po) - 1; i >= 0; i-- {
+		b := po[i]
+
+		var feat CPUfeatures
+
+		if b == f.Entry {
+			// Check the types of inputs and outputs, as well as annotations.
+			// Start with none and union all that is implied by all the types seen.
+			if f.Type != nil { // a problem for SSA tests
+				for _, field := range f.Type.RecvParamsResults() {
+					feat |= features(field.Type)
+				}
+			}
+
+		} else {
+			// Start with all and intersect over predecessors
+			feat = CPUAll
+			for _, p := range b.Preds {
+				pb := p.Block()
+				if !effects[pb.ID].visited {
+					// back edge: predecessor not processed yet, leave it out of the intersection
+					continue
+				}
+				pi := p.Index()
+				if pb.Kind != BlockIf {
+					pi = 0 // only a BlockIf has distinct per-edge effects
+				}
+
+				feat &= effects[pb.ID].end[pi]
+			}
+		}
+
+		e := localEffect{start: feat, visited: true}
+
+		// Separately capture the internal effects of this block
+		var internal CPUfeatures
+		for _, v := range b.Values {
+			// the rule applied here is, if the block contains any
+			// instruction that would fault if the feature (avx, avx512)
+			// were not present, then assume that the feature is present
+			// for all the instructions in the block, a fault is a fault.
+			t := v.Type
+			if t.IsResults() {
+				for i := 0; i < t.NumFields(); i++ {
+					internal |= features(t.FieldType(i)) // into internal, so the fixpoint pass (which recomputes from e.internal) keeps it
+				}
+			} else {
+				internal |= features(v.Type)
+			}
+		}
+		e.internal = internal
+		feat |= internal
+
+		branchEffect, taken := ifEffect(b)
+		e.end = [2]CPUfeatures{feat, feat}
+		e.end[taken] |= branchEffect
+
+		effects[b.ID] = e
+		if f.pass.debug > 1 && feat != CPUNone {
+			f.Warnl(b.Pos, "%s, block b%v has features %v", b.Func.Name, b.ID, feat)
+		}
+
+		b.CPUfeatures = feat
+	}
+
+	// If the flow graph is irreducible, things can still change on backedges.
+	change := true
+	for change {
+		change = false
+		for i := len(po) - 1; i >= 0; i-- {
+			b := po[i]
+
+			if b == f.Entry {
+				continue // cannot change
+			}
+			feat := CPUAll
+			for _, p := range b.Preds {
+				pb := p.Block()
+				pi := p.Index()
+				if pb.Kind != BlockIf {
+					pi = 0
+				}
+				feat &= effects[pb.ID].end[pi]
+			}
+			e := effects[b.ID]
+			if feat == e.start {
+				continue
+			}
+			e.start = feat
+			effects[b.ID] = e
+			// the intersection over predecessors changed; see whether the outgoing sets change too
+			if f.pass.debug > 1 {
+				f.Warnl(b.Pos, "%s, block b%v saw predecessor feature change", b.Func.Name, b.ID)
+			}
+
+			feat |= e.internal
+			if feat == e.end[0]&e.end[1] {
+				continue
+			}
+
+			branchEffect, taken := ifEffect(b)
+			e.end = [2]CPUfeatures{feat, feat}
+			e.end[taken] |= branchEffect
+
+			effects[b.ID] = e
+			b.CPUfeatures = feat
+			if f.pass.debug > 1 {
+				f.Warnl(b.Pos, "%s, block b%v has new features %v", b.Func.Name, b.ID, feat)
+			}
+			change = true
+		}
+	}
+	if f.pass.debug > 0 {
+		for _, b := range f.Blocks {
+			if b.CPUfeatures != CPUNone {
+				f.Warnl(b.Pos, "%s, block b%v has features %v", b.Func.Name, b.ID, b.CPUfeatures)
+			}
+
+		}
+	}
+}
diff --git a/src/cmd/compile/internal/ssa/sizeof_test.go b/src/cmd/compile/internal/ssa/sizeof_test.go
index a27002ee3ac..9a58197925c 100644
--- a/src/cmd/compile/internal/ssa/sizeof_test.go
+++ b/src/cmd/compile/internal/ssa/sizeof_test.go
@@ -21,7 +21,7 @@ func TestSizeof(t *testing.T) {
 		_64bit uintptr     // size on 64bit platforms
 	}{
 		{Value{}, 72, 112},
-		{Block{}, 164, 304},
+		{Block{}, 168, 312},
 		{LocalSlot{}, 28, 40},
 		{valState{}, 28, 40},
 	}
diff --git a/src/cmd/compile/internal/types/type.go b/src/cmd/compile/internal/types/type.go
index 652d4362ce7..fc2c0435bdf 100644
--- a/src/cmd/compile/internal/types/type.go
+++ b/src/cmd/compile/internal/types/type.go
@@ -989,6 +989,7 @@ func (t *Type) ArgWidth() int64 {
 	return t.extra.(*Func).Argwid
 }
 
+// Size returns the width of t in bytes.
 func (t *Type) Size() int64 {
 	if t.kind == TSSA {
 		return t.width
@@ -997,6 +998,7 @@ func (t *Type) Size() int64 {
 	return t.width
 }
 
+// Alignment returns the alignment of t in bytes.
 func (t *Type) Alignment() int64 {
 	CalcSize(t)
 	return int64(t.align)
diff --git a/test/simd.go b/test/simd.go
new file mode 100644
index 00000000000..b1695fa514d
--- /dev/null
+++ b/test/simd.go
@@ -0,0 +1,97 @@
+// errorcheck -0 -d=ssa/cpufeatures/debug=1
+
+//go:build goexperiment.simd && amd64
+
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package foo
+
+import "simd"
+
+func f1(x simd.Int8x16) {
+	return // ERROR "has features avx$"
+}
+
+func g1() simd.Int8x16 {
+	var x simd.Int8x16
+	return x // ERROR "has features avx$"
+}
+
+type T1 simd.Int8x16
+
+func (x T1) h() {
+	return // ERROR "has features avx$"
+}
+
+func f2(x simd.Int8x64) {
+	return // ERROR "has features avx[+]avx2[+]avx512$"
+}
+
+func g2() simd.Int8x64 {
+	var x simd.Int8x64
+	return x // ERROR "has features avx[+]avx2[+]avx512$"
+}
+
+type T2 simd.Int8x64
+
+func (x T2) h() {
+	return // ERROR "has features avx[+]avx2[+]avx512$"
+}
+
+var a int
+
+func f() {
+	if a == 0 {
+		if !simd.HasAVX512() {
+			return
+		}
+		println("has avx512") // ERROR "has features avx[+]avx2[+]avx512$"
+	} else {
+		if !simd.HasAVX2() {
+			return
+		}
+		println("has avx2") // ERROR "has features avx[+]avx2$"
+	}
+	println("has something")
+} // ERROR "has features avx[+]avx2$"
+
+func g() {
+	if simd.HasAVX2() { // ERROR "has features avx[+]avx2$"
+		for range 5 { // ERROR "has features avx[+]avx2$"
+			if a < 0 { // ERROR "has features avx[+]avx2$"
+				a++ // ERROR "has features avx[+]avx2$"
+			}
+		}
+	}
+	println("ahoy!") // ERROR "has features avx[+]avx2$" // this is an artifact of flaky block numbering and why isn't it fused?
+	if a > 0 {
+		a--
+	}
+}
+
+//go:noinline
+func p() bool {
+	return true
+}
+
+func hasIrreducibleLoop() { // the a/b cycle below is entered at both a and b
+	if simd.HasAVX2() {
+		goto a // ERROR "has features avx[+]avx2$"
+	} else {
+		goto b
+	}
+a:
+	println("a")
+	if p() {
+		goto c
+	}
+b:
+	println("b")
+	if p() {
+		goto a
+	}
+c:
+	println("c")
+}