[dev.simd] cmd/compile: adapters for simd

This combines several CLs into a single patch of "glue"
for the generated SIMD extensions.

This glue includes GOEXPERIMENT checks that disable
the creation of user-visible "simd" types and
that disable the registration of "simd" intrinsics.

The simd type checks were changed to work for either
package "simd" or "internal/simd" so that moving that
package won't be quite so fragile.
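For orientation, a minimal sketch of the kind of code these gates enable,
built with GOEXPERIMENT=simd (assuming that spelling of the experiment).
The API names are taken from the vendored testdata sample at the end of
this patch, which sits where "internal/simd" is importable; everything
else in the sketch is illustrative only.

	package sample

	import "internal/simd"

	// AddFour adds the first four elements of a and b with a single
	// 256-bit vector add and stores the sums into dst[0:4].
	func AddFour(a, b, dst []float64) {
		va := simd.LoadFloat64x4FromSlice(a)
		vb := simd.LoadFloat64x4FromSlice(b)
		va.Add(vb).Store(dst)
	}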

The combined CLs:
cmd/compile, internal/simd: glue for adding SIMD extensions to Go
cmd/compile: theft of Cherry's sample SIMD compilation

Change-Id: Id44e2f4bafe74032c26de576a8691b6f7d977e01
Reviewed-on: https://go-review.googlesource.com/c/go/+/675598
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Junyang Shao <shaojunyang@google.com>
David Chase 2025-03-31 10:45:23 +11:00
parent 2ef7106881
commit 04b1030ae4
26 changed files with 2196 additions and 675 deletions


@ -150,12 +150,12 @@ func appendParamTypes(rts []*types.Type, t *types.Type) []*types.Type {
if w == 0 {
return rts
}
if t.IsScalar() || t.IsPtrShaped() {
if t.IsScalar() || t.IsPtrShaped() || t.IsSIMD() {
if t.IsComplex() {
c := types.FloatForComplex(t)
return append(rts, c, c)
} else {
if int(t.Size()) <= types.RegSize {
if int(t.Size()) <= types.RegSize || t.IsSIMD() {
return append(rts, t)
}
// assume 64bit int on 32-bit machine
@ -199,6 +199,9 @@ func appendParamOffsets(offsets []int64, at int64, t *types.Type) ([]int64, int6
if w == 0 {
return offsets, at
}
if t.IsSIMD() {
return append(offsets, at), at + w
}
if t.IsScalar() || t.IsPtrShaped() {
if t.IsComplex() || int(t.Size()) > types.RegSize { // complex and *int64 on 32-bit
s := w / 2
@ -521,11 +524,11 @@ func (state *assignState) allocateRegs(regs []RegIndex, t *types.Type) []RegInde
}
ri := state.rUsed.intRegs
rf := state.rUsed.floatRegs
if t.IsScalar() || t.IsPtrShaped() {
if t.IsScalar() || t.IsPtrShaped() || t.IsSIMD() {
if t.IsComplex() {
regs = append(regs, RegIndex(rf+state.rTotal.intRegs), RegIndex(rf+1+state.rTotal.intRegs))
rf += 2
} else if t.IsFloat() {
} else if t.IsFloat() || t.IsSIMD() {
regs = append(regs, RegIndex(rf+state.rTotal.intRegs))
rf += 1
} else {


@ -0,0 +1,19 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Placeholder for generated glue to come later
package amd64
import (
"cmd/compile/internal/ssa"
"cmd/compile/internal/ssagen"
)
func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
switch v.Op {
default:
return false
}
return true
}
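For illustration, a sketch of the kind of case the generated glue is
expected to drop into this switch, modeled on the hand-written VPADDD4
case in ssa.go below. The VPADDB op only appears as a commented
placeholder in simdAMD64ops.go, so treat the name as hypothetical; the
file would also need to import cmd/internal/obj.

	// Hypothetical generated case, same shape as the VPADDD4 case in ssa.go:
	case ssa.OpAMD64VPADDB:
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = simdReg(v.Args[0])
		p.AddRestSourceReg(simdReg(v.Args[1]))
		p.To.Type = obj.TYPE_REG
		p.To.Reg = simdReg(v)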


@ -67,6 +67,8 @@ func storeByType(t *types.Type) obj.As {
case 8:
return x86.AMOVSD
}
} else if t.IsSIMD() {
return simdMov(width)
} else {
switch width {
case 1:
@ -92,6 +94,8 @@ func moveByType(t *types.Type) obj.As {
// There is no xmm->xmm move with 1 byte opcode,
// so use movups, which has 2 byte opcode.
return x86.AMOVUPS
} else if t.IsSIMD() {
return simdMov(t.Size())
} else {
switch t.Size() {
case 1:
@ -1038,6 +1042,10 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
}
x := v.Args[0].Reg()
y := v.Reg()
if v.Type.IsSIMD() {
x = simdReg(v.Args[0])
y = simdReg(v)
}
if x != y {
opregreg(s, moveByType(v.Type), y, x)
}
@ -1049,16 +1057,24 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
p := s.Prog(loadByType(v.Type))
ssagen.AddrAuto(&p.From, v.Args[0])
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
r := v.Reg()
if v.Type.IsSIMD() {
r = simdReg(v)
}
p.To.Reg = r
case ssa.OpStoreReg:
if v.Type.IsFlags() {
v.Fatalf("store flags not implemented: %v", v.LongString())
return
}
r := v.Args[0].Reg()
if v.Type.IsSIMD() {
r = simdReg(v.Args[0])
}
p := s.Prog(storeByType(v.Type))
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[0].Reg()
p.From.Reg = r
ssagen.AddrAuto(&p.To, v)
case ssa.OpAMD64LoweredHasCPUFeature:
p := s.Prog(x86.AMOVBLZX)
@ -1426,10 +1442,124 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
p.From.Offset = int64(x)
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
// XXX SIMD
// XXX may change depending on how we handle aliased registers
case ssa.OpAMD64Zero128, ssa.OpAMD64Zero256, ssa.OpAMD64Zero512:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = simdReg(v)
p.AddRestSourceReg(simdReg(v))
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
case ssa.OpAMD64VPADDD4:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = simdReg(v.Args[0])
p.AddRestSourceReg(simdReg(v.Args[1]))
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
case ssa.OpAMD64VMOVDQUload128, ssa.OpAMD64VMOVDQUload256, ssa.OpAMD64VMOVDQUload512:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_MEM
p.From.Reg = v.Args[0].Reg()
ssagen.AddAux(&p.From, v)
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
case ssa.OpAMD64VMOVDQUstore128, ssa.OpAMD64VMOVDQUstore256, ssa.OpAMD64VMOVDQUstore512:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = simdReg(v.Args[1])
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
ssagen.AddAux(&p.To, v)
case ssa.OpAMD64VPMOVMToVec8x16,
ssa.OpAMD64VPMOVMToVec8x32,
ssa.OpAMD64VPMOVMToVec8x64,
ssa.OpAMD64VPMOVMToVec16x8,
ssa.OpAMD64VPMOVMToVec16x16,
ssa.OpAMD64VPMOVMToVec16x32,
ssa.OpAMD64VPMOVMToVec32x4,
ssa.OpAMD64VPMOVMToVec32x8,
ssa.OpAMD64VPMOVMToVec32x16,
ssa.OpAMD64VPMOVMToVec64x2,
ssa.OpAMD64VPMOVMToVec64x4,
ssa.OpAMD64VPMOVMToVec64x8:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[0].Reg()
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
case ssa.OpAMD64VPMOVVec8x16ToM,
ssa.OpAMD64VPMOVVec8x32ToM,
ssa.OpAMD64VPMOVVec8x64ToM,
ssa.OpAMD64VPMOVVec16x8ToM,
ssa.OpAMD64VPMOVVec16x16ToM,
ssa.OpAMD64VPMOVVec16x32ToM,
ssa.OpAMD64VPMOVVec32x4ToM,
ssa.OpAMD64VPMOVVec32x8ToM,
ssa.OpAMD64VPMOVVec32x16ToM,
ssa.OpAMD64VPMOVVec64x2ToM,
ssa.OpAMD64VPMOVVec64x4ToM,
ssa.OpAMD64VPMOVVec64x8ToM:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = simdReg(v.Args[0])
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
default:
if !ssaGenSIMDValue(s, v) {
v.Fatalf("genValue not implemented: %s", v.LongString())
}
}
}
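// simdGenUnary emits a register-to-register instruction with one vector
// source (arg0) and a vector destination.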
func simdGenUnary(s *ssagen.State, v *ssa.Value) {
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = simdReg(v.Args[0])
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
}
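// simdGenBinary emits an instruction with two vector sources (arg0, arg1)
// and a vector destination.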
func simdGenBinary(s *ssagen.State, v *ssa.Value) {
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = simdReg(v.Args[0])
p.AddRestSourceReg(simdReg(v.Args[1]))
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
}
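// simdGenUnaryImmUint8 emits an instruction with a uint8 immediate taken
// from AuxInt and one vector source (arg0).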
func simdGenUnaryImmUint8(s *ssagen.State, v *ssa.Value) {
p := s.Prog(v.Op.Asm())
imm := v.AuxInt
if imm < 0 || imm > 255 {
v.Fatalf("Invalid source selection immediate")
}
p.From.Offset = imm
p.From.Type = obj.TYPE_CONST
p.AddRestSourceReg(simdReg(v.Args[0]))
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
}
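// simdGenBinaryImmUint8 emits an instruction with a uint8 immediate taken
// from AuxInt and two vector sources (arg0, arg1).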
func simdGenBinaryImmUint8(s *ssagen.State, v *ssa.Value) {
p := s.Prog(v.Op.Asm())
imm := v.AuxInt
if imm < 0 || imm > 255 {
v.Fatalf("Invalid source selection immediate")
}
p.From.Offset = imm
p.From.Type = obj.TYPE_CONST
p.AddRestSourceReg(simdReg(v.Args[0]))
p.AddRestSourceReg(simdReg(v.Args[1]))
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
}
var blockJump = [...]struct {
asm, invasm obj.As
@ -1532,3 +1662,30 @@ func spillArgReg(pp *objw.Progs, p *obj.Prog, f *ssa.Func, t *types.Type, reg in
p.Pos = p.Pos.WithNotStmt()
return p
}
// XXX maybe make this part of v.Reg?
// On the other hand, it is architecture-specific.
func simdReg(v *ssa.Value) int16 {
t := v.Type
if !t.IsSIMD() {
panic("simdReg: not a simd type")
}
switch t.Size() {
case 16:
return v.Reg()
case 32:
return v.Reg() + (x86.REG_Y0 - x86.REG_X0)
case 64:
return v.Reg() + (x86.REG_Z0 - x86.REG_X0)
}
panic("unreachable")
}
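// simdMov picks an unaligned vector move for the given width in bytes:
// VMOVDQU64 for 512-bit values, VMOVDQU for 128- and 256-bit values, and
// KMOVQ for anything smaller (mask values).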
func simdMov(width int64) obj.As {
if width >= 64 {
return x86.AVMOVDQU64
} else if width >= 16 {
return x86.AVMOVDQU
}
return x86.AKMOVQ
}


@ -1680,3 +1680,36 @@
// If we don't use the flags any more, just use the standard op.
(Select0 a:(ADD(Q|L)constflags [c] x)) && a.Uses == 1 => (ADD(Q|L)const [c] x)
// XXX SIMD
(Load <t> ptr mem) && t.Size() == 16 => (VMOVDQUload128 ptr mem)
(Store {t} ptr val mem) && t.Size() == 16 => (VMOVDQUstore128 ptr val mem)
(Load <t> ptr mem) && t.Size() == 32 => (VMOVDQUload256 ptr mem)
(Store {t} ptr val mem) && t.Size() == 32 => (VMOVDQUstore256 ptr val mem)
(Load <t> ptr mem) && t.Size() == 64 => (VMOVDQUload512 ptr mem)
(Store {t} ptr val mem) && t.Size() == 64 => (VMOVDQUstore512 ptr val mem)
(ZeroSIMD <t>) && t.Size() == 16 => (Zero128 <t>)
(ZeroSIMD <t>) && t.Size() == 32 => (Zero256 <t>)
(ZeroSIMD <t>) && t.Size() == 64 => (Zero512 <t>)
(VPMOVVec8x16ToM (VPMOVMToVec8x16 x)) => x
(VPMOVVec8x32ToM (VPMOVMToVec8x32 x)) => x
(VPMOVVec8x64ToM (VPMOVMToVec8x64 x)) => x
(VPMOVVec16x8ToM (VPMOVMToVec16x8 x)) => x
(VPMOVVec16x16ToM (VPMOVMToVec16x16 x)) => x
(VPMOVVec16x32ToM (VPMOVMToVec16x32 x)) => x
(VPMOVVec32x4ToM (VPMOVMToVec32x4 x)) => x
(VPMOVVec32x8ToM (VPMOVMToVec32x8 x)) => x
(VPMOVVec32x16ToM (VPMOVMToVec32x16 x)) => x
(VPMOVVec64x2ToM (VPMOVMToVec64x2 x)) => x
(VPMOVVec64x4ToM (VPMOVMToVec64x4 x)) => x
(VPMOVVec64x8ToM (VPMOVMToVec64x8 x)) => x


@ -63,6 +63,16 @@ var regNamesAMD64 = []string{
"X14",
"X15", // constant 0 in ABIInternal
// TODO: update asyncPreempt for K registers.
// asyncPreempt also needs to store Z0-Z15 properly.
"K0",
"K1",
"K2",
"K3",
"K4",
"K5",
"K6",
"K7",
// If you add registers, update asyncPreempt in runtime
// pseudo-registers
@ -100,6 +110,7 @@ func init() {
g = buildReg("g")
fp = buildReg("X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14")
x15 = buildReg("X15")
mask = buildReg("K1 K2 K3 K4 K5 K6 K7")
gpsp = gp | buildReg("SP")
gpspsb = gpsp | buildReg("SB")
gpspsbg = gpspsb | g
@ -109,6 +120,7 @@ func init() {
var (
gponly = []regMask{gp}
fponly = []regMask{fp}
maskonly = []regMask{mask}
)
// Common regInfo
@ -170,6 +182,12 @@ func init() {
fpstore = regInfo{inputs: []regMask{gpspsb, fp, 0}}
fpstoreidx = regInfo{inputs: []regMask{gpspsb, gpsp, fp, 0}}
fp1m1 = regInfo{inputs: fponly, outputs: maskonly}
m1fp1 = regInfo{inputs: maskonly, outputs: fponly}
fp2m1 = regInfo{inputs: []regMask{fp, fp}, outputs: maskonly}
fp2m1fp1 = regInfo{inputs: []regMask{fp, fp, mask}, outputs: fponly}
fp2m1m1 = regInfo{inputs: []regMask{fp, fp, mask}, outputs: maskonly}
prefreg = regInfo{inputs: []regMask{gpspsbg}}
)
@ -1199,6 +1217,54 @@ func init() {
//
// output[i] = (input[i] >> 7) & 1
{name: "PMOVMSKB", argLength: 1, reg: fpgp, asm: "PMOVMSKB"},
// XXX SIMD
{name: "VPADDD4", argLength: 2, reg: fp21, asm: "VPADDD", commutative: true}, // arg0 + arg1
{name: "VMOVDQUload128", argLength: 2, reg: fpload, asm: "VMOVDQU", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0+auxint+aux, arg1 = mem
{name: "VMOVDQUstore128", argLength: 3, reg: fpstore, asm: "VMOVDQU", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"}, // store, *(arg0+auxint+aux) = arg1, arg2 = mem
{name: "VMOVDQUload256", argLength: 2, reg: fpload, asm: "VMOVDQU", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0+auxint+aux, arg1 = mem
{name: "VMOVDQUstore256", argLength: 3, reg: fpstore, asm: "VMOVDQU", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"}, // store, *(arg0+auxint+aux) = arg1, arg2 = mem
{name: "VMOVDQUload512", argLength: 2, reg: fpload, asm: "VMOVDQU64", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0+auxint+aux, arg1 = mem
{name: "VMOVDQUstore512", argLength: 3, reg: fpstore, asm: "VMOVDQU64", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"}, // store, *(arg0+auxint+aux) = arg1, arg2 = mem
{name: "VPMOVMToVec8x16", argLength: 1, reg: m1fp1, asm: "VPMOVM2B"},
{name: "VPMOVMToVec8x32", argLength: 1, reg: m1fp1, asm: "VPMOVM2B"},
{name: "VPMOVMToVec8x64", argLength: 1, reg: m1fp1, asm: "VPMOVM2B"},
{name: "VPMOVMToVec16x8", argLength: 1, reg: m1fp1, asm: "VPMOVM2W"},
{name: "VPMOVMToVec16x16", argLength: 1, reg: m1fp1, asm: "VPMOVM2W"},
{name: "VPMOVMToVec16x32", argLength: 1, reg: m1fp1, asm: "VPMOVM2W"},
{name: "VPMOVMToVec32x4", argLength: 1, reg: m1fp1, asm: "VPMOVM2D"},
{name: "VPMOVMToVec32x8", argLength: 1, reg: m1fp1, asm: "VPMOVM2D"},
{name: "VPMOVMToVec32x16", argLength: 1, reg: m1fp1, asm: "VPMOVM2D"},
{name: "VPMOVMToVec64x2", argLength: 1, reg: m1fp1, asm: "VPMOVM2Q"},
{name: "VPMOVMToVec64x4", argLength: 1, reg: m1fp1, asm: "VPMOVM2Q"},
{name: "VPMOVMToVec64x8", argLength: 1, reg: m1fp1, asm: "VPMOVM2Q"},
{name: "VPMOVVec8x16ToM", argLength: 1, reg: fp1m1, asm: "VPMOVB2M"},
{name: "VPMOVVec8x32ToM", argLength: 1, reg: fp1m1, asm: "VPMOVB2M"},
{name: "VPMOVVec8x64ToM", argLength: 1, reg: fp1m1, asm: "VPMOVB2M"},
{name: "VPMOVVec16x8ToM", argLength: 1, reg: fp1m1, asm: "VPMOVW2M"},
{name: "VPMOVVec16x16ToM", argLength: 1, reg: fp1m1, asm: "VPMOVW2M"},
{name: "VPMOVVec16x32ToM", argLength: 1, reg: fp1m1, asm: "VPMOVW2M"},
{name: "VPMOVVec32x4ToM", argLength: 1, reg: fp1m1, asm: "VPMOVD2M"},
{name: "VPMOVVec32x8ToM", argLength: 1, reg: fp1m1, asm: "VPMOVD2M"},
{name: "VPMOVVec32x16ToM", argLength: 1, reg: fp1m1, asm: "VPMOVD2M"},
{name: "VPMOVVec64x2ToM", argLength: 1, reg: fp1m1, asm: "VPMOVQ2M"},
{name: "VPMOVVec64x4ToM", argLength: 1, reg: fp1m1, asm: "VPMOVQ2M"},
{name: "VPMOVVec64x8ToM", argLength: 1, reg: fp1m1, asm: "VPMOVQ2M"},
{name: "Zero128", argLength: 0, reg: fp01, asm: "VPXOR"},
{name: "Zero256", argLength: 0, reg: fp01, asm: "VPXOR"},
{name: "Zero512", argLength: 0, reg: fp01, asm: "VPXORQ"},
}
var AMD64blocks = []blockData{
@ -1230,14 +1296,15 @@ func init() {
name: "AMD64",
pkg: "cmd/internal/obj/x86",
genfile: "../../amd64/ssa.go",
ops: AMD64ops,
genSIMDfile: "../../amd64/simdssa.go",
ops: append(AMD64ops, simdAMD64Ops(fp11, fp21, fp2m1, fp2m1fp1, fp2m1m1)...), // AMD64ops,
blocks: AMD64blocks,
regnames: regNamesAMD64,
ParamIntRegNames: "AX BX CX DI SI R8 R9 R10 R11",
ParamFloatRegNames: "X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14",
gpregmask: gp,
fpregmask: fp,
specialregmask: x15,
specialregmask: x15 | mask,
framepointerreg: int8(num["BP"]),
linkreg: -1, // not used
})


@ -910,7 +910,7 @@
// struct operations
(StructSelect [i] x:(StructMake ___)) => x.Args[i]
(Load <t> _ _) && t.IsStruct() && CanSSA(t) => rewriteStructLoad(v)
(Load <t> _ _) && t.IsStruct() && CanSSA(t) && !t.IsSIMD() => rewriteStructLoad(v)
(Store _ (StructMake ___) _) => rewriteStructStore(v)
(StructSelect [i] x:(Load <t> ptr mem)) && !CanSSA(t) =>


@ -662,6 +662,10 @@ var genericOps = []opData{
// Prefetch instruction
{name: "PrefetchCache", argLength: 2, hasSideEffects: true}, // Do prefetch arg0 to cache. arg0=addr, arg1=memory.
{name: "PrefetchCacheStreamed", argLength: 2, hasSideEffects: true}, // Do non-temporal or streamed prefetch arg0 to cache. arg0=addr, arg1=memory.
// XXX SIMD
{name: "Add32x4", argLength: 2}, // arg0 + arg1
{name: "ZeroSIMD", argLength: 0},
}
// kind controls successors implicit exit
@ -689,6 +693,7 @@ var genericBlocks = []blockData{
}
func init() {
genericOps = append(genericOps, simdGenericOps()...)
archs = append(archs, arch{
name: "generic",
ops: genericOps,


@ -32,6 +32,7 @@ type arch struct {
name string
pkg string // obj package to import for this arch.
genfile string // source file containing opcode code generation.
genSIMDfile string // source file containing opcode code generation for SIMD.
ops []opData
blocks []blockData
regnames []string
@ -525,6 +526,15 @@ func genOp() {
if err != nil {
log.Fatalf("can't read %s: %v", a.genfile, err)
}
// Append the file of simd operations, too
if a.genSIMDfile != "" {
simdSrc, err := os.ReadFile(a.genSIMDfile)
if err != nil {
log.Fatalf("can't read %s: %v", a.genSIMDfile, err)
}
src = append(src, simdSrc...)
}
seen := make(map[string]bool, len(a.ops))
for _, m := range rxOp.FindAllSubmatch(src, -1) {
seen[string(m[1])] = true


@ -95,6 +95,7 @@ func genLateLowerRules(arch arch) { genRulesSuffix(arch, "latelower") }
func genRulesSuffix(arch arch, suff string) {
// Open input file.
var text io.Reader
text, err := os.Open(arch.name + suff + ".rules")
if err != nil {
if suff == "" {
@ -105,6 +106,14 @@ func genRulesSuffix(arch arch, suff string) {
return
}
// Check for file of SIMD rules to add
if suff == "" {
simdtext, err := os.Open("simd" + arch.name + ".rules")
if err == nil {
text = io.MultiReader(text, simdtext)
}
}
// oprules contains a list of rules for each block and opcode
blockrules := map[string][]Rule{}
oprules := map[string][]Rule{}


@ -0,0 +1,4 @@
// Code generated by internal/simd/_gen using 'go run .'; DO NOT EDIT.
// (AddInt8x16 ...) => (VPADDB ...)
// etc


@ -0,0 +1,10 @@
// Code generated by internal/simd/_gen using 'go run .'; DO NOT EDIT.
package main
func simdAMD64Ops(fp11, fp21, fp2m1, fp2m1fp1, fp2m1m1 regInfo) []opData {
return []opData{
// {name: "VPADDB", argLength: 2, reg: fp21, asm: "VPADDB", commutative: true},
// etc, generated
}
}


@ -0,0 +1,10 @@
// Code generated by internal/simd/_gen using 'go run .'; DO NOT EDIT.
package main
func simdGenericOps() []opData {
return []opData{
// {name: "AddInt8x16", argLength: 2, commutative: true},
// etc
}
}


@ -89,6 +89,10 @@ type Types struct {
Float32Ptr *types.Type
Float64Ptr *types.Type
BytePtrPtr *types.Type
Vec128 *types.Type
Vec256 *types.Type
Vec512 *types.Type
Mask *types.Type
}
// NewTypes creates and populates a Types.
@ -123,6 +127,10 @@ func (t *Types) SetTypPtrs() {
t.Float32Ptr = types.NewPtr(types.Types[types.TFLOAT32])
t.Float64Ptr = types.NewPtr(types.Types[types.TFLOAT64])
t.BytePtrPtr = types.NewPtr(types.NewPtr(types.Types[types.TUINT8]))
t.Vec128 = types.TypeVec128
t.Vec256 = types.TypeVec256
t.Vec512 = types.TypeVec512
t.Mask = types.TypeMask
}
type Logger interface {


@ -100,7 +100,7 @@ func decomposeBuiltIn(f *Func) {
}
case t.IsFloat():
// floats are never decomposed, even ones bigger than RegSize
case t.Size() > f.Config.RegSize:
case t.Size() > f.Config.RegSize && !t.IsSIMD():
f.Fatalf("undecomposed named type %s %v", name, t)
}
}
@ -135,7 +135,7 @@ func decomposeBuiltInPhi(v *Value) {
decomposeInterfacePhi(v)
case v.Type.IsFloat():
// floats are never decomposed, even ones bigger than RegSize
case v.Type.Size() > v.Block.Func.Config.RegSize:
case v.Type.Size() > v.Block.Func.Config.RegSize && !v.Type.IsSIMD():
v.Fatalf("%v undecomposed type %v", v, v.Type)
}
}
@ -248,7 +248,7 @@ func decomposeUser(f *Func) {
for _, name := range f.Names {
t := name.Type
switch {
case t.IsStruct():
case isStructNotSIMD(t):
newNames = decomposeUserStructInto(f, name, newNames)
case t.IsArray():
newNames = decomposeUserArrayInto(f, name, newNames)
@ -293,7 +293,7 @@ func decomposeUserArrayInto(f *Func, name *LocalSlot, slots []*LocalSlot) []*Loc
if t.Elem().IsArray() {
return decomposeUserArrayInto(f, elemName, slots)
} else if t.Elem().IsStruct() {
} else if isStructNotSIMD(t.Elem()) {
return decomposeUserStructInto(f, elemName, slots)
}
@ -313,7 +313,7 @@ func decomposeUserStructInto(f *Func, name *LocalSlot, slots []*LocalSlot) []*Lo
fnames = append(fnames, fs)
// arrays and structs will be decomposed further, so
// there's no need to record a name
if !fs.Type.IsArray() && !fs.Type.IsStruct() {
if !fs.Type.IsArray() && !isStructNotSIMD(fs.Type) {
slots = maybeAppend(f, slots, fs)
}
}
@ -339,7 +339,7 @@ func decomposeUserStructInto(f *Func, name *LocalSlot, slots []*LocalSlot) []*Lo
// now that this f.NamedValues contains values for the struct
// fields, recurse into nested structs
for i := 0; i < n; i++ {
if name.Type.FieldType(i).IsStruct() {
if isStructNotSIMD(name.Type.FieldType(i)) {
slots = decomposeUserStructInto(f, fnames[i], slots)
delete(f.NamedValues, *fnames[i])
} else if name.Type.FieldType(i).IsArray() {
@ -351,7 +351,7 @@ func decomposeUserStructInto(f *Func, name *LocalSlot, slots []*LocalSlot) []*Lo
}
func decomposeUserPhi(v *Value) {
switch {
case v.Type.IsStruct():
case isStructNotSIMD(v.Type):
decomposeStructPhi(v)
case v.Type.IsArray():
decomposeArrayPhi(v)
@ -458,3 +458,7 @@ func deleteNamedVals(f *Func, toDelete []namedVal) {
}
f.Names = f.Names[:end]
}
func isStructNotSIMD(t *types.Type) bool {
return t.IsStruct() && !t.IsSIMD()
}


@ -399,6 +399,9 @@ func (x *expandState) decomposeAsNecessary(pos src.XPos, b *Block, a, m0 *Value,
return mem
case types.TSTRUCT:
if at.IsSIMD() {
break // XXX
}
for i := 0; i < at.NumFields(); i++ {
et := at.Field(i).Type // might need to read offsets from the fields
e := b.NewValue1I(pos, OpStructSelect, et, int64(i), a)
@ -547,6 +550,9 @@ func (x *expandState) rewriteSelectOrArg(pos src.XPos, b *Block, container, a, m
case types.TSTRUCT:
// Assume ssagen/ssa.go (in buildssa) spills large aggregates so they won't appear here.
if at.IsSIMD() {
break // XXX
}
for i := 0; i < at.NumFields(); i++ {
et := at.Field(i).Type
e := x.rewriteSelectOrArg(pos, b, container, nil, m0, et, rc.next(et))
@ -713,6 +719,9 @@ func (x *expandState) rewriteWideSelectToStores(pos src.XPos, b *Block, containe
case types.TSTRUCT:
// Assume ssagen/ssa.go (in buildssa) spills large aggregates so they won't appear here.
if at.IsSIMD() {
break // XXX
}
for i := 0; i < at.NumFields(); i++ {
et := at.Field(i).Type
m0 = x.rewriteWideSelectToStores(pos, b, container, m0, et, rc.next(et))
@ -859,7 +868,7 @@ func (c *registerCursor) at(t *types.Type, i int) registerCursor {
rc.nextSlice += Abi1RO(i * w)
return rc
}
if t.IsStruct() {
if isStructNotSIMD(t) {
for j := 0; j < i; j++ {
rc.next(t.FieldType(j))
}
@ -973,7 +982,7 @@ func (x *expandState) regOffset(t *types.Type, i int) Abi1RO {
if t.IsArray() {
return Abi1RO(i) * x.regWidth(t.Elem())
}
if t.IsStruct() {
if isStructNotSIMD(t) {
k := Abi1RO(0)
for j := 0; j < i; j++ {
k += x.regWidth(t.FieldType(j))

File diff suppressed because it is too large.


@ -501,6 +501,30 @@ func rewriteValueAMD64(v *Value) bool {
return rewriteValueAMD64_OpAMD64TESTW(v)
case OpAMD64TESTWconst:
return rewriteValueAMD64_OpAMD64TESTWconst(v)
case OpAMD64VPMOVVec16x16ToM:
return rewriteValueAMD64_OpAMD64VPMOVVec16x16ToM(v)
case OpAMD64VPMOVVec16x32ToM:
return rewriteValueAMD64_OpAMD64VPMOVVec16x32ToM(v)
case OpAMD64VPMOVVec16x8ToM:
return rewriteValueAMD64_OpAMD64VPMOVVec16x8ToM(v)
case OpAMD64VPMOVVec32x16ToM:
return rewriteValueAMD64_OpAMD64VPMOVVec32x16ToM(v)
case OpAMD64VPMOVVec32x4ToM:
return rewriteValueAMD64_OpAMD64VPMOVVec32x4ToM(v)
case OpAMD64VPMOVVec32x8ToM:
return rewriteValueAMD64_OpAMD64VPMOVVec32x8ToM(v)
case OpAMD64VPMOVVec64x2ToM:
return rewriteValueAMD64_OpAMD64VPMOVVec64x2ToM(v)
case OpAMD64VPMOVVec64x4ToM:
return rewriteValueAMD64_OpAMD64VPMOVVec64x4ToM(v)
case OpAMD64VPMOVVec64x8ToM:
return rewriteValueAMD64_OpAMD64VPMOVVec64x8ToM(v)
case OpAMD64VPMOVVec8x16ToM:
return rewriteValueAMD64_OpAMD64VPMOVVec8x16ToM(v)
case OpAMD64VPMOVVec8x32ToM:
return rewriteValueAMD64_OpAMD64VPMOVVec8x32ToM(v)
case OpAMD64VPMOVVec8x64ToM:
return rewriteValueAMD64_OpAMD64VPMOVVec8x64ToM(v)
case OpAMD64XADDLlock:
return rewriteValueAMD64_OpAMD64XADDLlock(v)
case OpAMD64XADDQlock:
@ -1198,6 +1222,8 @@ func rewriteValueAMD64(v *Value) bool {
case OpZeroExt8to64:
v.Op = OpAMD64MOVBQZX
return true
case OpZeroSIMD:
return rewriteValueAMD64_OpZeroSIMD(v)
}
return false
}
@ -22812,6 +22838,174 @@ func rewriteValueAMD64_OpAMD64TESTWconst(v *Value) bool {
}
return false
}
func rewriteValueAMD64_OpAMD64VPMOVVec16x16ToM(v *Value) bool {
v_0 := v.Args[0]
// match: (VPMOVVec16x16ToM (VPMOVMToVec16x16 x))
// result: x
for {
if v_0.Op != OpAMD64VPMOVMToVec16x16 {
break
}
x := v_0.Args[0]
v.copyOf(x)
return true
}
return false
}
func rewriteValueAMD64_OpAMD64VPMOVVec16x32ToM(v *Value) bool {
v_0 := v.Args[0]
// match: (VPMOVVec16x32ToM (VPMOVMToVec16x32 x))
// result: x
for {
if v_0.Op != OpAMD64VPMOVMToVec16x32 {
break
}
x := v_0.Args[0]
v.copyOf(x)
return true
}
return false
}
func rewriteValueAMD64_OpAMD64VPMOVVec16x8ToM(v *Value) bool {
v_0 := v.Args[0]
// match: (VPMOVVec16x8ToM (VPMOVMToVec16x8 x))
// result: x
for {
if v_0.Op != OpAMD64VPMOVMToVec16x8 {
break
}
x := v_0.Args[0]
v.copyOf(x)
return true
}
return false
}
func rewriteValueAMD64_OpAMD64VPMOVVec32x16ToM(v *Value) bool {
v_0 := v.Args[0]
// match: (VPMOVVec32x16ToM (VPMOVMToVec32x16 x))
// result: x
for {
if v_0.Op != OpAMD64VPMOVMToVec32x16 {
break
}
x := v_0.Args[0]
v.copyOf(x)
return true
}
return false
}
func rewriteValueAMD64_OpAMD64VPMOVVec32x4ToM(v *Value) bool {
v_0 := v.Args[0]
// match: (VPMOVVec32x4ToM (VPMOVMToVec32x4 x))
// result: x
for {
if v_0.Op != OpAMD64VPMOVMToVec32x4 {
break
}
x := v_0.Args[0]
v.copyOf(x)
return true
}
return false
}
func rewriteValueAMD64_OpAMD64VPMOVVec32x8ToM(v *Value) bool {
v_0 := v.Args[0]
// match: (VPMOVVec32x8ToM (VPMOVMToVec32x8 x))
// result: x
for {
if v_0.Op != OpAMD64VPMOVMToVec32x8 {
break
}
x := v_0.Args[0]
v.copyOf(x)
return true
}
return false
}
func rewriteValueAMD64_OpAMD64VPMOVVec64x2ToM(v *Value) bool {
v_0 := v.Args[0]
// match: (VPMOVVec64x2ToM (VPMOVMToVec64x2 x))
// result: x
for {
if v_0.Op != OpAMD64VPMOVMToVec64x2 {
break
}
x := v_0.Args[0]
v.copyOf(x)
return true
}
return false
}
func rewriteValueAMD64_OpAMD64VPMOVVec64x4ToM(v *Value) bool {
v_0 := v.Args[0]
// match: (VPMOVVec64x4ToM (VPMOVMToVec64x4 x))
// result: x
for {
if v_0.Op != OpAMD64VPMOVMToVec64x4 {
break
}
x := v_0.Args[0]
v.copyOf(x)
return true
}
return false
}
func rewriteValueAMD64_OpAMD64VPMOVVec64x8ToM(v *Value) bool {
v_0 := v.Args[0]
// match: (VPMOVVec64x8ToM (VPMOVMToVec64x8 x))
// result: x
for {
if v_0.Op != OpAMD64VPMOVMToVec64x8 {
break
}
x := v_0.Args[0]
v.copyOf(x)
return true
}
return false
}
func rewriteValueAMD64_OpAMD64VPMOVVec8x16ToM(v *Value) bool {
v_0 := v.Args[0]
// match: (VPMOVVec8x16ToM (VPMOVMToVec8x16 x))
// result: x
for {
if v_0.Op != OpAMD64VPMOVMToVec8x16 {
break
}
x := v_0.Args[0]
v.copyOf(x)
return true
}
return false
}
func rewriteValueAMD64_OpAMD64VPMOVVec8x32ToM(v *Value) bool {
v_0 := v.Args[0]
// match: (VPMOVVec8x32ToM (VPMOVMToVec8x32 x))
// result: x
for {
if v_0.Op != OpAMD64VPMOVMToVec8x32 {
break
}
x := v_0.Args[0]
v.copyOf(x)
return true
}
return false
}
func rewriteValueAMD64_OpAMD64VPMOVVec8x64ToM(v *Value) bool {
v_0 := v.Args[0]
// match: (VPMOVVec8x64ToM (VPMOVMToVec8x64 x))
// result: x
for {
if v_0.Op != OpAMD64VPMOVMToVec8x64 {
break
}
x := v_0.Args[0]
v.copyOf(x)
return true
}
return false
}
func rewriteValueAMD64_OpAMD64XADDLlock(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
@ -26215,6 +26409,48 @@ func rewriteValueAMD64_OpLoad(v *Value) bool {
v.AddArg2(ptr, mem)
return true
}
// match: (Load <t> ptr mem)
// cond: t.Size() == 16
// result: (VMOVDQUload128 ptr mem)
for {
t := v.Type
ptr := v_0
mem := v_1
if !(t.Size() == 16) {
break
}
v.reset(OpAMD64VMOVDQUload128)
v.AddArg2(ptr, mem)
return true
}
// match: (Load <t> ptr mem)
// cond: t.Size() == 32
// result: (VMOVDQUload256 ptr mem)
for {
t := v.Type
ptr := v_0
mem := v_1
if !(t.Size() == 32) {
break
}
v.reset(OpAMD64VMOVDQUload256)
v.AddArg2(ptr, mem)
return true
}
// match: (Load <t> ptr mem)
// cond: t.Size() == 64
// result: (VMOVDQUload512 ptr mem)
for {
t := v.Type
ptr := v_0
mem := v_1
if !(t.Size() == 64) {
break
}
v.reset(OpAMD64VMOVDQUload512)
v.AddArg2(ptr, mem)
return true
}
return false
}
func rewriteValueAMD64_OpLocalAddr(v *Value) bool {
@ -29764,6 +30000,51 @@ func rewriteValueAMD64_OpStore(v *Value) bool {
v.AddArg3(ptr, val, mem)
return true
}
// match: (Store {t} ptr val mem)
// cond: t.Size() == 16
// result: (VMOVDQUstore128 ptr val mem)
for {
t := auxToType(v.Aux)
ptr := v_0
val := v_1
mem := v_2
if !(t.Size() == 16) {
break
}
v.reset(OpAMD64VMOVDQUstore128)
v.AddArg3(ptr, val, mem)
return true
}
// match: (Store {t} ptr val mem)
// cond: t.Size() == 32
// result: (VMOVDQUstore256 ptr val mem)
for {
t := auxToType(v.Aux)
ptr := v_0
val := v_1
mem := v_2
if !(t.Size() == 32) {
break
}
v.reset(OpAMD64VMOVDQUstore256)
v.AddArg3(ptr, val, mem)
return true
}
// match: (Store {t} ptr val mem)
// cond: t.Size() == 64
// result: (VMOVDQUstore512 ptr val mem)
for {
t := auxToType(v.Aux)
ptr := v_0
val := v_1
mem := v_2
if !(t.Size() == 64) {
break
}
v.reset(OpAMD64VMOVDQUstore512)
v.AddArg3(ptr, val, mem)
return true
}
return false
}
func rewriteValueAMD64_OpTrunc(v *Value) bool {
@ -30117,6 +30398,45 @@ func rewriteValueAMD64_OpZero(v *Value) bool {
}
return false
}
func rewriteValueAMD64_OpZeroSIMD(v *Value) bool {
// match: (ZeroSIMD <t>)
// cond: t.Size() == 16
// result: (Zero128 <t>)
for {
t := v.Type
if !(t.Size() == 16) {
break
}
v.reset(OpAMD64Zero128)
v.Type = t
return true
}
// match: (ZeroSIMD <t>)
// cond: t.Size() == 32
// result: (Zero256 <t>)
for {
t := v.Type
if !(t.Size() == 32) {
break
}
v.reset(OpAMD64Zero256)
v.Type = t
return true
}
// match: (ZeroSIMD <t>)
// cond: t.Size() == 64
// result: (Zero512 <t>)
for {
t := v.Type
if !(t.Size() == 64) {
break
}
v.reset(OpAMD64Zero512)
v.Type = t
return true
}
return false
}
func rewriteBlockAMD64(b *Block) bool {
typ := &b.Func.Config.Types
switch b.Kind {


@ -14149,11 +14149,11 @@ func rewriteValuegeneric_OpLoad(v *Value) bool {
return true
}
// match: (Load <t> _ _)
// cond: t.IsStruct() && CanSSA(t)
// cond: t.IsStruct() && CanSSA(t) && !t.IsSIMD()
// result: rewriteStructLoad(v)
for {
t := v.Type
if !(t.IsStruct() && CanSSA(t)) {
if !(t.IsStruct() && CanSSA(t) && !t.IsSIMD()) {
break
}
v.copyOf(rewriteStructLoad(v))


@ -596,6 +596,9 @@ func AutoVar(v *Value) (*ir.Name, int64) {
// CanSSA reports whether values of type t can be represented as a Value.
func CanSSA(t *types.Type) bool {
types.CalcSize(t)
if t.IsSIMD() {
return true
}
if t.Size() > int64(4*types.PtrSize) {
// 4*Widthptr is an arbitrary constant. We want it
// to be at least 3*Widthptr so slices can be registerized.


@ -1602,6 +1602,104 @@ func initIntrinsics(cfg *intrinsicBuildConfig) {
return s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], out)
},
sys.AMD64)
if buildcfg.Experiment.SIMD {
// Only enable the intrinsics under the SIMD experiment.
simdIntrinsics(addF)
}
}
// simdLoadSliceMethod does intrinsic for method form of Load-from-slice
func simdLoadSliceMethod(nElts int64) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
// args[0] is unused except for its type.
t := args[0].Type
slice := args[1]
arrlen := s.constInt(types.Types[types.TINT], nElts)
cap := s.newValue1(ssa.OpSliceLen, types.Types[types.TINT], slice)
s.boundsCheck(arrlen, cap, ssa.BoundsConvert, false)
ptr := s.newValue1(ssa.OpSlicePtr, t.PtrTo(), slice) // is this the right type? Does it need a convert?
return s.newValue2(ssa.OpLoad, t, ptr, s.mem())
}
}
// simdLoadSlice does intrinsic for function form of Load-from-slice
func simdLoadSlice(nElts int64) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
// args[0] is unused except for its type.
t := n.Type()
slice := args[0]
arrlen := s.constInt(types.Types[types.TINT], nElts)
cap := s.newValue1(ssa.OpSliceLen, types.Types[types.TINT], slice)
s.boundsCheck(arrlen, cap, ssa.BoundsConvert, false)
ptr := s.newValue1(ssa.OpSlicePtr, t.PtrTo(), slice) // is this the right type? Does it need a convert?
return s.newValue2(ssa.OpLoad, t, ptr, s.mem())
}
}
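// simdStoreSlice does intrinsic for the Store-to-slice forms: it checks
// that the slice holds at least nElts elements, then stores the vector
// (args[0]) through the slice's data pointer.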
func simdStoreSlice(nElts int64) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
x := args[0]
t := x.Type
slice := args[1]
arrlen := s.constInt(types.Types[types.TINT], nElts)
cap := s.newValue1(ssa.OpSliceLen, types.Types[types.TINT], slice)
s.boundsCheck(arrlen, cap, ssa.BoundsConvert, false)
ptr := s.newValue1(ssa.OpSlicePtr, t.PtrTo(), slice) // is this the right type? Does it need a convert?
s.store(t, ptr, x)
return nil
}
}
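// simdLoadSliceMethodPart does intrinsic for a partial load from a slice
// that may be shorter than the vector; only the full-width path is wired
// up, the masked path is not implemented yet.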
func simdLoadSliceMethodPart(nElts int64) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
// args[0] is unused except for its type.
t := args[0].Type
slice := args[1]
arrLen := s.constInt(types.Types[types.TINT], nElts)
cap := s.newValue1(ssa.OpSliceLen, types.Types[types.TINT], slice)
/*
if off := vec.Len() - len(slice) ; off <= 0 {
plain load
} else {
load mask[off] into a scratch vector
masked load/store
}
*/
// TODO SIMD support on a 32-bit processor
off := s.newValue2(ssa.OpSub64, types.Types[types.TINT], arrLen, cap)
cond := s.newValue2(ssa.OpLeq64, types.Types[types.TBOOL], off, s.zeroVal(types.Types[types.TINT]))
b := s.endBlock()
b.Kind = ssa.BlockIf
b.SetControl(cond)
bTrue := s.f.NewBlock(ssa.BlockPlain)
bFalse := s.f.NewBlock(ssa.BlockPlain)
bEnd := s.f.NewBlock(ssa.BlockPlain)
b.AddEdgeTo(bTrue)
b.AddEdgeTo(bFalse)
simdRes := ssaMarker("simdload")
// The slice covers the whole vector - do a plain full-width load.
s.startBlock(bTrue)
ptr := s.newValue1(ssa.OpSlicePtr, t.PtrTo(), slice)
s.vars[simdRes] = s.newValue2(ssa.OpLoad, t, ptr, s.mem())
s.endBlock().AddEdgeTo(bEnd)
// The slice is shorter than the vector - this needs a masked load.
s.startBlock(bFalse)
// NOT IMPLEMENTED, NEED TO ADD GENERIC PARTIAL LOAD/STORE
// MASK REGISTER DEPENDS ON ARCH AND ITS SIMD VERSION.
s.endBlock().AddEdgeTo(bEnd)
// Merge results.
s.startBlock(bEnd)
return s.variable(simdRes, t)
}
}
// findIntrinsic returns a function which builds the SSA equivalent of the
@ -1627,7 +1725,8 @@ func findIntrinsic(sym *types.Sym) intrinsicBuilder {
fn := sym.Name
if ssa.IntrinsicsDisable {
if pkg == "internal/runtime/sys" && (fn == "GetCallerPC" || fn == "GrtCallerSP" || fn == "GetClosurePtr") {
if pkg == "internal/runtime/sys" && (fn == "GetCallerPC" || fn == "GrtCallerSP" || fn == "GetClosurePtr") ||
pkg == "internal/simd" || pkg == "simd" { // TODO after simd has been moved to package simd, remove internal/simd
// These runtime functions don't have definitions, must be intrinsics.
} else {
return nil


@ -0,0 +1,15 @@
// Code generated by internal/simd/_gen using 'go run .'; DO NOT EDIT.
package ssagen
import (
// "cmd/compile/internal/ir"
// "cmd/compile/internal/ssa"
// "cmd/compile/internal/types"
"cmd/internal/sys"
)
func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies ...sys.ArchFamily)) {
// addF("internal/simd", "Int32x4.Uint32x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
// etc
}


@ -623,6 +623,9 @@ func buildssa(fn *ir.Func, worker int, isPgoHot bool) *ssa.Func {
// TODO figure out exactly what's unused, don't spill it. Make liveness fine-grained, also.
for _, p := range params.InParams() {
typs, offs := p.RegisterTypesAndOffsets()
if len(offs) < len(typs) {
s.Fatalf("len(offs)=%d < len(typs)=%d, params=\n%s", len(offs), len(typs), params)
}
for i, t := range typs {
o := offs[i] // offset within parameter
fo := p.FrameOffset(params) // offset of parameter in frame
@ -1399,7 +1402,7 @@ func (s *state) instrument(t *types.Type, addr *ssa.Value, kind instrumentKind)
// If it is instrumenting for MSAN or ASAN and t is a struct type, it instruments
// operation for each field, instead of for the whole struct.
func (s *state) instrumentFields(t *types.Type, addr *ssa.Value, kind instrumentKind) {
if !(base.Flag.MSan || base.Flag.ASan) || !t.IsStruct() {
if !(base.Flag.MSan || base.Flag.ASan) || !isStructNotSIMD(t) {
s.instrument(t, addr, kind)
return
}
@ -4335,7 +4338,7 @@ func (s *state) zeroVal(t *types.Type) *ssa.Value {
return s.constInterface(t)
case t.IsSlice():
return s.constSlice(t)
case t.IsStruct():
case isStructNotSIMD(t):
n := t.NumFields()
v := s.entryNewValue0(ssa.OpStructMake, t)
for i := 0; i < n; i++ {
@ -4349,6 +4352,8 @@ func (s *state) zeroVal(t *types.Type) *ssa.Value {
case 1:
return s.entryNewValue1(ssa.OpArrayMake1, t, s.zeroVal(t.Elem()))
}
case t.IsSIMD():
return s.newValue0(ssa.OpZeroSIMD, t)
}
s.Fatalf("zero for type %v not implemented", t)
return nil
@ -5328,7 +5333,7 @@ func (s *state) storeType(t *types.Type, left, right *ssa.Value, skip skipMask,
// do *left = right for all scalar (non-pointer) parts of t.
func (s *state) storeTypeScalars(t *types.Type, left, right *ssa.Value, skip skipMask) {
switch {
case t.IsBoolean() || t.IsInteger() || t.IsFloat() || t.IsComplex():
case t.IsBoolean() || t.IsInteger() || t.IsFloat() || t.IsComplex() || t.IsSIMD():
s.store(t, left, right)
case t.IsPtrShaped():
if t.IsPtr() && t.Elem().NotInHeap() {
@ -5357,7 +5362,7 @@ func (s *state) storeTypeScalars(t *types.Type, left, right *ssa.Value, skip ski
// itab field doesn't need a write barrier (even though it is a pointer).
itab := s.newValue1(ssa.OpITab, s.f.Config.Types.BytePtr, right)
s.store(types.Types[types.TUINTPTR], left, itab)
case t.IsStruct():
case isStructNotSIMD(t):
n := t.NumFields()
for i := 0; i < n; i++ {
ft := t.FieldType(i)
@ -5394,7 +5399,7 @@ func (s *state) storeTypePtrs(t *types.Type, left, right *ssa.Value) {
idata := s.newValue1(ssa.OpIData, s.f.Config.Types.BytePtr, right)
idataAddr := s.newValue1I(ssa.OpOffPtr, s.f.Config.Types.BytePtrPtr, s.config.PtrSize, left)
s.store(s.f.Config.Types.BytePtr, idataAddr, idata)
case t.IsStruct():
case isStructNotSIMD(t):
n := t.NumFields()
for i := 0; i < n; i++ {
ft := t.FieldType(i)
@ -6477,7 +6482,7 @@ func EmitArgInfo(f *ir.Func, abiInfo *abi.ABIParamResultInfo) *obj.LSym {
uintptrTyp := types.Types[types.TUINTPTR]
isAggregate := func(t *types.Type) bool {
return t.IsStruct() || t.IsArray() || t.IsComplex() || t.IsInterface() || t.IsString() || t.IsSlice()
return isStructNotSIMD(t) || t.IsArray() || t.IsComplex() || t.IsInterface() || t.IsString() || t.IsSlice()
}
wOff := 0
@ -6537,7 +6542,7 @@ func EmitArgInfo(f *ir.Func, abiInfo *abi.ABIParamResultInfo) *obj.LSym {
}
baseOffset += t.Elem().Size()
}
case t.IsStruct():
case isStructNotSIMD(t):
if t.NumFields() == 0 {
n++ // {} counts as a component
break
@ -7554,7 +7559,7 @@ func (s *State) UseArgs(n int64) {
// fieldIdx finds the index of the field referred to by the ODOT node n.
func fieldIdx(n *ir.SelectorExpr) int {
t := n.X.Type()
if !t.IsStruct() {
if !isStructNotSIMD(t) {
panic("ODOT's LHS is not a struct")
}
@ -7762,6 +7767,10 @@ func SpillSlotAddr(spill ssa.Spill, baseReg int16, extraOffset int64) obj.Addr {
}
}
func isStructNotSIMD(t *types.Type) bool {
return t.IsStruct() && !t.IsSIMD()
}
var (
BoundsCheckFunc [ssa.BoundsKindCount]*obj.LSym
ExtendCheckFunc [ssa.BoundsKindCount]*obj.LSym


@ -10,6 +10,7 @@ import (
"cmd/compile/internal/base"
"cmd/internal/src"
"internal/buildcfg"
"internal/types/errors"
)
@ -410,6 +411,10 @@ func CalcSize(t *Type) {
}
CalcStructSize(t)
w = t.width
if t.IsSIMD() { // XXX
t.intRegs = 0
t.floatRegs = 1
}
// make fake type to check later to
// trigger function argument computation.
@ -452,6 +457,31 @@ func CalcSize(t *Type) {
ResumeCheckSize()
}
// simdify marks a type as "SIMD", either as a tag field,
// or having the SIMD attribute. The tag field is a marker
// type used to identify a struct that is not really a struct.
// A SIMD type is allocated to a vector register (on amd64,
// xmm, ymm, or zmm). The fields of a SIMD type are ignored
// by the compiler except for the space that they reserve.
func simdify(st *Type, isTag bool) {
st.align = 8
st.alg = AMEM
st.intRegs = 0
st.isSIMD = true
if isTag {
st.width = 0
st.isSIMDTag = true
st.floatRegs = 0
} else {
st.floatRegs = 1
}
// if st.Sym() != nil {
// base.Warn("Simdify %s, %v, %d", st.Sym().Name, isTag, st.width)
// } else {
// base.Warn("Simdify %v, %v, %d", st, isTag, st.width)
// }
}
// CalcStructSize calculates the size of t,
// filling in t.width, t.align, t.intRegs, and t.floatRegs,
// even if size calculation is otherwise disabled.
@ -464,10 +494,27 @@ func CalcStructSize(t *Type) {
switch {
case sym.Name == "align64" && isAtomicStdPkg(sym.Pkg):
maxAlign = 8
case buildcfg.Experiment.SIMD && (sym.Pkg.Path == "internal/simd" || sym.Pkg.Path == "simd") && len(t.Fields()) >= 1:
// This gates the experiment -- without it, no user-visible types can be "simd".
// The SSA-visible SIMD types remain.
// TODO after simd has been moved to package simd, remove internal/simd.
switch sym.Name {
case "v128":
simdify(t, true)
return
case "v256":
simdify(t, true)
return
case "v512":
simdify(t, true)
return
}
}
}
fields := t.Fields()
size := calcStructOffset(t, fields, 0)
// For non-zero-sized structs which end in a zero-sized field, we
@ -540,6 +587,11 @@ func CalcStructSize(t *Type) {
break
}
}
if len(t.Fields()) >= 1 && t.Fields()[0].Type.isSIMDTag {
// this catches `type Foo simd.Whatever` -- Foo is also SIMD.
simdify(t, false)
}
}
// CalcArraySize calculates the size of t,
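To make the marker mechanism above concrete, here is a sketch of how the
user-visible types in package "simd" (or "internal/simd") are presumably
declared; only the v128/v256/v512 names, the package path, and the
"marker as first field" rule come from this patch, the field layout itself
is an assumption.

	package simd

	// v256 is the marker ("tag") type: CalcStructSize recognizes a struct
	// with this name in this package and calls simdify(t, true), giving it
	// width 0.
	type v256 struct {
		_ struct{} // at least one field, so the CalcStructSize case applies
	}

	// Float64x4 puts the marker first, so CalcStructSize calls
	// simdify(t, false): the whole struct lives in one 256-bit vector
	// register and its fields only reserve space. The vals field is an
	// assumed placeholder for the four lanes.
	type Float64x4 struct {
		tag  v256
		vals [4]float64
	}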


@ -203,6 +203,7 @@ type Type struct {
flags bitset8
alg AlgKind // valid if Align > 0
isSIMDTag, isSIMD bool // tag is the marker type, isSIMD means has marker type
// size of prefix of object that contains all pointers. valid if Align > 0.
// Note that for pointers, this is always PtrSize even if the element type
@ -605,6 +606,12 @@ func newSSA(name string) *Type {
return t
}
func newSIMD(name string) *Type {
t := newSSA(name)
t.isSIMD = true
return t
}
// NewMap returns a new map Type with key type k and element (aka value) type v.
func NewMap(k, v *Type) *Type {
t := newType(TMAP)
@ -995,10 +1002,7 @@ func (t *Type) ArgWidth() int64 {
func (t *Type) Size() int64 {
if t.kind == TSSA {
if t == TypeInt128 {
return 16
}
return 0
return t.width
}
CalcSize(t)
return t.width
@ -1626,12 +1630,26 @@ var (
TypeFlags = newSSA("flags")
TypeVoid = newSSA("void")
TypeInt128 = newSSA("int128")
TypeVec128 = newSIMD("vec128")
TypeVec256 = newSIMD("vec256")
TypeVec512 = newSIMD("vec512")
TypeMask = newSSA("mask") // not a vector, not 100% sure what this should be.
TypeResultMem = newResults([]*Type{TypeMem})
)
func init() {
TypeInt128.width = 16
TypeInt128.align = 8
TypeVec128.width = 16
TypeVec128.align = 8
TypeVec256.width = 32
TypeVec256.align = 8
TypeVec512.width = 64
TypeVec512.align = 8
TypeMask.width = 8 // This will depend on the architecture; spilling will be "interesting".
TypeMask.align = 8
}
// NewNamed returns a new named type for the given type name. obj should be an
@ -2017,3 +2035,7 @@ var SimType [NTYPE]Kind
// Fake package for shape types (see typecheck.Shapify()).
var ShapePkg = NewPkg("go.shape", "go.shape")
func (t *Type) IsSIMD() bool {
return t.isSIMD
}


@ -0,0 +1,7 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build amd64
// Empty file to allow bodyless functions.

src/internal/simd/testdata/sample.go (new file, 145 lines)

@ -0,0 +1,145 @@
package sample
import (
"internal/simd"
"os"
"unsafe"
)
type S1 = simd.Float64x4
type S2 simd.Float64x4
func (s S2) Len() int {
return simd.Float64x4(s).Len()
}
func (s S2) Load(a []float64) S2 {
return S2(simd.LoadFloat64x4FromSlice(a))
}
func (s S2) Store(a []float64) {
simd.Float64x4(s).Store(a)
}
func (s S2) Add(a S2) S2 {
return S2(simd.Float64x4(s).Add(simd.Float64x4(a)))
}
func (s S2) Mul(a S2) S2 {
return S2(simd.Float64x4(s).Mul(simd.Float64x4(a)))
}
type S3 struct {
simd.Float64x4
}
func ip64_0(a, b []float64) float64 {
s := 0.0
for i := range a {
s += a[i] * b[i]
}
return s
}
func ip64_1(a, b []float64) float64 {
var z S1
sum := z
var i int
stride := z.Len()
for ; i <= len(a)-stride; i += stride {
va := simd.LoadFloat64x4FromSlice(a[i:])
vb := simd.LoadFloat64x4FromSlice(b[i:])
sum = sum.Add(va.Mul(vb))
}
var tmp [4]float64
sum.Store(tmp[:])
return tmp[0] + tmp[1] + tmp[2] + tmp[3]
}
func ip64_1a(a, b []float64) float64 {
var z S1
sum := z
var i int
stride := z.Len()
for ; i <= len(a)-stride; i += stride {
va := simd.LoadFloat64x4FromSlice(a[i:])
vb := simd.LoadFloat64x4FromSlice(b[i:])
sum = FMA(sum, va, vb)
}
var tmp [4]float64
sum.Store(tmp[:])
return tmp[0] + tmp[1] + tmp[2] + tmp[3]
}
//go:noinline
func FMA(a, b, c simd.Float64x4) simd.Float64x4 {
return a.Add(b.Mul(c))
}
func ip64_2(a, b []float64) float64 {
var z S2
sum := z
var i int
stride := z.Len()
for ; i <= len(a)-stride; i += stride {
va := z.Load(a[i:])
vb := z.Load(b[i:])
sum = sum.Add(va.Mul(vb))
}
var tmp [4]float64
sum.Store(tmp[:])
return tmp[0] + tmp[1] + tmp[2] + tmp[3]
}
func ip64_3(a, b []float64) float64 {
var z S3
sum := z
var i int
stride := z.Len()
for ; i <= len(a)-stride; i += stride {
va := simd.LoadFloat64x4FromSlice(a[i:])
vb := simd.LoadFloat64x4FromSlice(b[i:])
sum = S3{sum.Add(va.Mul(vb))}
}
var tmp [4]float64
sum.Store(tmp[:])
return tmp[0] + tmp[1] + tmp[2] + tmp[3]
}
func main() {
a := []float64{1, 2, 3, 4, 5, 6, 7, 8}
ip0 := ip64_0(a, a)
ip1 := ip64_1(a, a)
ip1a := ip64_1a(a, a)
ip2 := ip64_2(a, a)
ip3 := ip64_3(a, a)
fmt.Printf("Test IP = %f\n", ip0)
fmt.Printf("SIMD IP 1 = %f\n", ip1)
fmt.Printf("SIMD IP 1a = %f\n", ip1a)
fmt.Printf("SIMD IP 2 = %f\n", ip2)
fmt.Printf("SIMD IP 3 = %f\n", ip3)
var z1 S1
var z2 S2
var z3 S3
s1, s2, s3 := unsafe.Sizeof(z1), unsafe.Sizeof(z2), unsafe.Sizeof(z3)
fmt.Printf("unsafe.Sizeof(z1, z2, z3)=%d, %d, %d\n", s1, s2, s3)
fail := false
if s1 != 32 || s2 != 32 || s3 != 32 {
fmt.Println("Failed a sizeof check, should all be 32")
fail = true
}
if ip1 != ip0 || ip1a != ip0 || ip2 != ip0 || ip3 != ip0 {
fmt.Println("Failed an inner product check, should all be", ip0)
fail = true
}
if fail {
os.Exit(1)
}
}