[dev.simd] cmd/compile: adapters for simd

This combines several CLs into a single patch of "glue"
for the generated SIMD extensions.

This glue includes GOEXPERIMENT checks that disable
the creation of user-visible "simd" types and
that disable the registration of "simd" intrinsics.

The simd type checks were changed to work for either
package "simd" or "internal/simd" so that moving that
package won't be quite so fragile.
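For orientation, a minimal sketch of the kind of code these gates enable,
built with GOEXPERIMENT=simd (assuming that spelling of the experiment).
The API names are taken from the vendored testdata sample at the end of
this patch, which sits where "internal/simd" is importable; everything
else in the sketch is illustrative only.

	package sample

	import "internal/simd"

	// AddFour adds the first four elements of a and b with a single
	// 256-bit vector add and stores the sums into dst[0:4].
	func AddFour(a, b, dst []float64) {
		va := simd.LoadFloat64x4FromSlice(a)
		vb := simd.LoadFloat64x4FromSlice(b)
		va.Add(vb).Store(dst)
	}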

The combined CLs:
cmd/compile, internal/simd: glue for adding SIMD extensions to Go
cmd/compile: theft of Cherry's sample SIMD compilation

Change-Id: Id44e2f4bafe74032c26de576a8691b6f7d977e01
Reviewed-on: https://go-review.googlesource.com/c/go/+/675598
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Junyang Shao <shaojunyang@google.com>
David Chase 2025-03-31 10:45:23 +11:00
parent 2ef7106881
commit 04b1030ae4
26 changed files with 2196 additions and 675 deletions


@ -150,12 +150,12 @@ func appendParamTypes(rts []*types.Type, t *types.Type) []*types.Type {
if w == 0 {
return rts
}
if t.IsScalar() || t.IsPtrShaped() {
if t.IsScalar() || t.IsPtrShaped() || t.IsSIMD() {
if t.IsComplex() {
c := types.FloatForComplex(t)
return append(rts, c, c)
} else {
if int(t.Size()) <= types.RegSize {
if int(t.Size()) <= types.RegSize || t.IsSIMD() {
return append(rts, t)
}
// assume 64bit int on 32-bit machine
@ -199,6 +199,9 @@ func appendParamOffsets(offsets []int64, at int64, t *types.Type) ([]int64, int6
if w == 0 {
return offsets, at
}
if t.IsSIMD() {
return append(offsets, at), at + w
}
if t.IsScalar() || t.IsPtrShaped() {
if t.IsComplex() || int(t.Size()) > types.RegSize { // complex and *int64 on 32-bit
s := w / 2
@ -521,11 +524,11 @@ func (state *assignState) allocateRegs(regs []RegIndex, t *types.Type) []RegInde
}
ri := state.rUsed.intRegs
rf := state.rUsed.floatRegs
if t.IsScalar() || t.IsPtrShaped() {
if t.IsScalar() || t.IsPtrShaped() || t.IsSIMD() {
if t.IsComplex() {
regs = append(regs, RegIndex(rf+state.rTotal.intRegs), RegIndex(rf+1+state.rTotal.intRegs))
rf += 2
} else if t.IsFloat() {
} else if t.IsFloat() || t.IsSIMD() {
regs = append(regs, RegIndex(rf+state.rTotal.intRegs))
rf += 1
} else {


@ -0,0 +1,19 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Placeholder for generated glue to come later
package amd64
import (
"cmd/compile/internal/ssa"
"cmd/compile/internal/ssagen"
)
func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
switch v.Op {
default:
return false
}
return true
}
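For illustration, a sketch of the kind of case the generated glue is
expected to drop into this switch, modeled on the hand-written VPADDD4
case in ssa.go below. The VPADDB op only appears as a commented
placeholder in simdAMD64ops.go, so treat the name as hypothetical; the
file would also need to import cmd/internal/obj.

	// Hypothetical generated case, same shape as the VPADDD4 case in ssa.go:
	case ssa.OpAMD64VPADDB:
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = simdReg(v.Args[0])
		p.AddRestSourceReg(simdReg(v.Args[1]))
		p.To.Type = obj.TYPE_REG
		p.To.Reg = simdReg(v)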


@ -67,6 +67,8 @@ func storeByType(t *types.Type) obj.As {
case 8:
return x86.AMOVSD
}
} else if t.IsSIMD() {
return simdMov(width)
} else {
switch width {
case 1:
@ -92,6 +94,8 @@ func moveByType(t *types.Type) obj.As {
// There is no xmm->xmm move with 1 byte opcode,
// so use movups, which has 2 byte opcode.
return x86.AMOVUPS
} else if t.IsSIMD() {
return simdMov(t.Size())
} else {
switch t.Size() {
case 1:
@ -1038,6 +1042,10 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
}
x := v.Args[0].Reg()
y := v.Reg()
if v.Type.IsSIMD() {
x = simdReg(v.Args[0])
y = simdReg(v)
}
if x != y {
opregreg(s, moveByType(v.Type), y, x)
}
@ -1049,16 +1057,24 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
p := s.Prog(loadByType(v.Type))
ssagen.AddrAuto(&p.From, v.Args[0])
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
r := v.Reg()
if v.Type.IsSIMD() {
r = simdReg(v)
}
p.To.Reg = r
case ssa.OpStoreReg:
if v.Type.IsFlags() {
v.Fatalf("store flags not implemented: %v", v.LongString())
return
}
r := v.Args[0].Reg()
if v.Type.IsSIMD() {
r = simdReg(v.Args[0])
}
p := s.Prog(storeByType(v.Type))
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[0].Reg()
p.From.Reg = r
ssagen.AddrAuto(&p.To, v)
case ssa.OpAMD64LoweredHasCPUFeature:
p := s.Prog(x86.AMOVBLZX)
@ -1426,10 +1442,124 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
p.From.Offset = int64(x)
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
// XXX SIMD
// XXX may change depending on how we handle aliased registers
case ssa.OpAMD64Zero128, ssa.OpAMD64Zero256, ssa.OpAMD64Zero512:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = simdReg(v)
p.AddRestSourceReg(simdReg(v))
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
case ssa.OpAMD64VPADDD4:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = simdReg(v.Args[0])
p.AddRestSourceReg(simdReg(v.Args[1]))
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
case ssa.OpAMD64VMOVDQUload128, ssa.OpAMD64VMOVDQUload256, ssa.OpAMD64VMOVDQUload512:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_MEM
p.From.Reg = v.Args[0].Reg()
ssagen.AddAux(&p.From, v)
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
case ssa.OpAMD64VMOVDQUstore128, ssa.OpAMD64VMOVDQUstore256, ssa.OpAMD64VMOVDQUstore512:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = simdReg(v.Args[1])
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
ssagen.AddAux(&p.To, v)
case ssa.OpAMD64VPMOVMToVec8x16,
ssa.OpAMD64VPMOVMToVec8x32,
ssa.OpAMD64VPMOVMToVec8x64,
ssa.OpAMD64VPMOVMToVec16x8,
ssa.OpAMD64VPMOVMToVec16x16,
ssa.OpAMD64VPMOVMToVec16x32,
ssa.OpAMD64VPMOVMToVec32x4,
ssa.OpAMD64VPMOVMToVec32x8,
ssa.OpAMD64VPMOVMToVec32x16,
ssa.OpAMD64VPMOVMToVec64x2,
ssa.OpAMD64VPMOVMToVec64x4,
ssa.OpAMD64VPMOVMToVec64x8:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[0].Reg()
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
case ssa.OpAMD64VPMOVVec8x16ToM,
ssa.OpAMD64VPMOVVec8x32ToM,
ssa.OpAMD64VPMOVVec8x64ToM,
ssa.OpAMD64VPMOVVec16x8ToM,
ssa.OpAMD64VPMOVVec16x16ToM,
ssa.OpAMD64VPMOVVec16x32ToM,
ssa.OpAMD64VPMOVVec32x4ToM,
ssa.OpAMD64VPMOVVec32x8ToM,
ssa.OpAMD64VPMOVVec32x16ToM,
ssa.OpAMD64VPMOVVec64x2ToM,
ssa.OpAMD64VPMOVVec64x4ToM,
ssa.OpAMD64VPMOVVec64x8ToM:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = simdReg(v.Args[0])
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
default:
if !ssaGenSIMDValue(s, v) {
v.Fatalf("genValue not implemented: %s", v.LongString())
}
}
}
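// simdGenUnary emits a register-to-register instruction with one vector
// source (arg0) and a vector destination.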
func simdGenUnary(s *ssagen.State, v *ssa.Value) {
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = simdReg(v.Args[0])
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
}
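// simdGenBinary emits an instruction with two vector sources (arg0, arg1)
// and a vector destination.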
func simdGenBinary(s *ssagen.State, v *ssa.Value) {
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = simdReg(v.Args[0])
p.AddRestSourceReg(simdReg(v.Args[1]))
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
}
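// simdGenUnaryImmUint8 emits an instruction with a uint8 immediate taken
// from AuxInt and one vector source (arg0).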
func simdGenUnaryImmUint8(s *ssagen.State, v *ssa.Value) {
p := s.Prog(v.Op.Asm())
imm := v.AuxInt
if imm < 0 || imm > 255 {
v.Fatalf("Invalid source selection immediate")
}
p.From.Offset = imm
p.From.Type = obj.TYPE_CONST
p.AddRestSourceReg(simdReg(v.Args[0]))
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
}
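// simdGenBinaryImmUint8 emits an instruction with a uint8 immediate taken
// from AuxInt and two vector sources (arg0, arg1).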
func simdGenBinaryImmUint8(s *ssagen.State, v *ssa.Value) {
p := s.Prog(v.Op.Asm())
imm := v.AuxInt
if imm < 0 || imm > 255 {
v.Fatalf("Invalid source selection immediate")
}
p.From.Offset = imm
p.From.Type = obj.TYPE_CONST
p.AddRestSourceReg(simdReg(v.Args[0]))
p.AddRestSourceReg(simdReg(v.Args[1]))
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
}
var blockJump = [...]struct {
asm, invasm obj.As
@ -1532,3 +1662,30 @@ func spillArgReg(pp *objw.Progs, p *obj.Prog, f *ssa.Func, t *types.Type, reg in
p.Pos = p.Pos.WithNotStmt()
return p
}
// XXX maybe make this part of v.Reg?
// On the other hand, it is architecture-specific.
func simdReg(v *ssa.Value) int16 {
t := v.Type
if !t.IsSIMD() {
panic("simdReg: not a simd type")
}
switch t.Size() {
case 16:
return v.Reg()
case 32:
return v.Reg() + (x86.REG_Y0 - x86.REG_X0)
case 64:
return v.Reg() + (x86.REG_Z0 - x86.REG_X0)
}
panic("unreachable")
}
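// simdMov picks an unaligned vector move for the given width in bytes:
// VMOVDQU64 for 512-bit values, VMOVDQU for 128- and 256-bit values, and
// KMOVQ for anything smaller (mask values).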
func simdMov(width int64) obj.As {
if width >= 64 {
return x86.AVMOVDQU64
} else if width >= 16 {
return x86.AVMOVDQU
}
return x86.AKMOVQ
}


@ -1680,3 +1680,36 @@
// If we don't use the flags any more, just use the standard op.
(Select0 a:(ADD(Q|L)constflags [c] x)) && a.Uses == 1 => (ADD(Q|L)const [c] x)
// XXX SIMD
(Load <t> ptr mem) && t.Size() == 16 => (VMOVDQUload128 ptr mem)
(Store {t} ptr val mem) && t.Size() == 16 => (VMOVDQUstore128 ptr val mem)
(Load <t> ptr mem) && t.Size() == 32 => (VMOVDQUload256 ptr mem)
(Store {t} ptr val mem) && t.Size() == 32 => (VMOVDQUstore256 ptr val mem)
(Load <t> ptr mem) && t.Size() == 64 => (VMOVDQUload512 ptr mem)
(Store {t} ptr val mem) && t.Size() == 64 => (VMOVDQUstore512 ptr val mem)
(ZeroSIMD <t>) && t.Size() == 16 => (Zero128 <t>)
(ZeroSIMD <t>) && t.Size() == 32 => (Zero256 <t>)
(ZeroSIMD <t>) && t.Size() == 64 => (Zero512 <t>)
(VPMOVVec8x16ToM (VPMOVMToVec8x16 x)) => x
(VPMOVVec8x32ToM (VPMOVMToVec8x32 x)) => x
(VPMOVVec8x64ToM (VPMOVMToVec8x64 x)) => x
(VPMOVVec16x8ToM (VPMOVMToVec16x8 x)) => x
(VPMOVVec16x16ToM (VPMOVMToVec16x16 x)) => x
(VPMOVVec16x32ToM (VPMOVMToVec16x32 x)) => x
(VPMOVVec32x4ToM (VPMOVMToVec32x4 x)) => x
(VPMOVVec32x8ToM (VPMOVMToVec32x8 x)) => x
(VPMOVVec32x16ToM (VPMOVMToVec32x16 x)) => x
(VPMOVVec64x2ToM (VPMOVMToVec64x2 x)) => x
(VPMOVVec64x4ToM (VPMOVMToVec64x4 x)) => x
(VPMOVVec64x8ToM (VPMOVMToVec64x8 x)) => x


@ -63,6 +63,16 @@ var regNamesAMD64 = []string{
"X14",
"X15", // constant 0 in ABIInternal
// TODO: update asyncPreempt for K registers.
// asyncPreempt also needs to store Z0-Z15 properly.
"K0",
"K1",
"K2",
"K3",
"K4",
"K5",
"K6",
"K7",
// If you add registers, update asyncPreempt in runtime
// pseudo-registers
@ -100,6 +110,7 @@ func init() {
g = buildReg("g")
fp = buildReg("X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14")
x15 = buildReg("X15")
mask = buildReg("K1 K2 K3 K4 K5 K6 K7")
gpsp = gp | buildReg("SP")
gpspsb = gpsp | buildReg("SB")
gpspsbg = gpspsb | g
@ -109,6 +120,7 @@ func init() {
var (
gponly = []regMask{gp}
fponly = []regMask{fp}
maskonly = []regMask{mask}
)
// Common regInfo
@ -170,6 +182,12 @@ func init() {
fpstore = regInfo{inputs: []regMask{gpspsb, fp, 0}}
fpstoreidx = regInfo{inputs: []regMask{gpspsb, gpsp, fp, 0}}
fp1m1 = regInfo{inputs: fponly, outputs: maskonly}
m1fp1 = regInfo{inputs: maskonly, outputs: fponly}
fp2m1 = regInfo{inputs: []regMask{fp, fp}, outputs: maskonly}
fp2m1fp1 = regInfo{inputs: []regMask{fp, fp, mask}, outputs: fponly}
fp2m1m1 = regInfo{inputs: []regMask{fp, fp, mask}, outputs: maskonly}
prefreg = regInfo{inputs: []regMask{gpspsbg}}
)
@ -1199,6 +1217,54 @@ func init() {
//
// output[i] = (input[i] >> 7) & 1
{name: "PMOVMSKB", argLength: 1, reg: fpgp, asm: "PMOVMSKB"},
// XXX SIMD
{name: "VPADDD4", argLength: 2, reg: fp21, asm: "VPADDD", commutative: true}, // arg0 + arg1
{name: "VMOVDQUload128", argLength: 2, reg: fpload, asm: "VMOVDQU", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0+auxint+aux, arg1 = mem
{name: "VMOVDQUstore128", argLength: 3, reg: fpstore, asm: "VMOVDQU", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"}, // store, *(arg0+auxint+aux) = arg1, arg2 = mem
{name: "VMOVDQUload256", argLength: 2, reg: fpload, asm: "VMOVDQU", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0+auxint+aux, arg1 = mem
{name: "VMOVDQUstore256", argLength: 3, reg: fpstore, asm: "VMOVDQU", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"}, // store, *(arg0+auxint+aux) = arg1, arg2 = mem
{name: "VMOVDQUload512", argLength: 2, reg: fpload, asm: "VMOVDQU64", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0+auxint+aux, arg1 = mem
{name: "VMOVDQUstore512", argLength: 3, reg: fpstore, asm: "VMOVDQU64", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"}, // store, *(arg0+auxint+aux) = arg1, arg2 = mem
{name: "VPMOVMToVec8x16", argLength: 1, reg: m1fp1, asm: "VPMOVM2B"},
{name: "VPMOVMToVec8x32", argLength: 1, reg: m1fp1, asm: "VPMOVM2B"},
{name: "VPMOVMToVec8x64", argLength: 1, reg: m1fp1, asm: "VPMOVM2B"},
{name: "VPMOVMToVec16x8", argLength: 1, reg: m1fp1, asm: "VPMOVM2W"},
{name: "VPMOVMToVec16x16", argLength: 1, reg: m1fp1, asm: "VPMOVM2W"},
{name: "VPMOVMToVec16x32", argLength: 1, reg: m1fp1, asm: "VPMOVM2W"},
{name: "VPMOVMToVec32x4", argLength: 1, reg: m1fp1, asm: "VPMOVM2D"},
{name: "VPMOVMToVec32x8", argLength: 1, reg: m1fp1, asm: "VPMOVM2D"},
{name: "VPMOVMToVec32x16", argLength: 1, reg: m1fp1, asm: "VPMOVM2D"},
{name: "VPMOVMToVec64x2", argLength: 1, reg: m1fp1, asm: "VPMOVM2Q"},
{name: "VPMOVMToVec64x4", argLength: 1, reg: m1fp1, asm: "VPMOVM2Q"},
{name: "VPMOVMToVec64x8", argLength: 1, reg: m1fp1, asm: "VPMOVM2Q"},
{name: "VPMOVVec8x16ToM", argLength: 1, reg: fp1m1, asm: "VPMOVB2M"},
{name: "VPMOVVec8x32ToM", argLength: 1, reg: fp1m1, asm: "VPMOVB2M"},
{name: "VPMOVVec8x64ToM", argLength: 1, reg: fp1m1, asm: "VPMOVB2M"},
{name: "VPMOVVec16x8ToM", argLength: 1, reg: fp1m1, asm: "VPMOVW2M"},
{name: "VPMOVVec16x16ToM", argLength: 1, reg: fp1m1, asm: "VPMOVW2M"},
{name: "VPMOVVec16x32ToM", argLength: 1, reg: fp1m1, asm: "VPMOVW2M"},
{name: "VPMOVVec32x4ToM", argLength: 1, reg: fp1m1, asm: "VPMOVD2M"},
{name: "VPMOVVec32x8ToM", argLength: 1, reg: fp1m1, asm: "VPMOVD2M"},
{name: "VPMOVVec32x16ToM", argLength: 1, reg: fp1m1, asm: "VPMOVD2M"},
{name: "VPMOVVec64x2ToM", argLength: 1, reg: fp1m1, asm: "VPMOVQ2M"},
{name: "VPMOVVec64x4ToM", argLength: 1, reg: fp1m1, asm: "VPMOVQ2M"},
{name: "VPMOVVec64x8ToM", argLength: 1, reg: fp1m1, asm: "VPMOVQ2M"},
{name: "Zero128", argLength: 0, reg: fp01, asm: "VPXOR"},
{name: "Zero256", argLength: 0, reg: fp01, asm: "VPXOR"},
{name: "Zero512", argLength: 0, reg: fp01, asm: "VPXORQ"},
}
var AMD64blocks = []blockData{
@ -1230,14 +1296,15 @@ func init() {
name: "AMD64",
pkg: "cmd/internal/obj/x86",
genfile: "../../amd64/ssa.go",
ops: AMD64ops,
genSIMDfile: "../../amd64/simdssa.go",
ops: append(AMD64ops, simdAMD64Ops(fp11, fp21, fp2m1, fp2m1fp1, fp2m1m1)...), // AMD64ops,
blocks: AMD64blocks,
regnames: regNamesAMD64,
ParamIntRegNames: "AX BX CX DI SI R8 R9 R10 R11",
ParamFloatRegNames: "X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14",
gpregmask: gp,
fpregmask: fp,
specialregmask: x15,
specialregmask: x15 | mask,
framepointerreg: int8(num["BP"]),
linkreg: -1, // not used
})


@ -910,7 +910,7 @@
// struct operations
(StructSelect [i] x:(StructMake ___)) => x.Args[i]
(Load <t> _ _) && t.IsStruct() && CanSSA(t) => rewriteStructLoad(v)
(Load <t> _ _) && t.IsStruct() && CanSSA(t) && !t.IsSIMD() => rewriteStructLoad(v)
(Store _ (StructMake ___) _) => rewriteStructStore(v)
(StructSelect [i] x:(Load <t> ptr mem)) && !CanSSA(t) =>


@ -662,6 +662,10 @@ var genericOps = []opData{
// Prefetch instruction
{name: "PrefetchCache", argLength: 2, hasSideEffects: true}, // Do prefetch arg0 to cache. arg0=addr, arg1=memory.
{name: "PrefetchCacheStreamed", argLength: 2, hasSideEffects: true}, // Do non-temporal or streamed prefetch arg0 to cache. arg0=addr, arg1=memory.
// XXX SIMD
{name: "Add32x4", argLength: 2}, // arg0 + arg1
{name: "ZeroSIMD", argLength: 0},
}
// kind controls successors implicit exit
@ -689,6 +693,7 @@ var genericBlocks = []blockData{
}
func init() {
genericOps = append(genericOps, simdGenericOps()...)
archs = append(archs, arch{
name: "generic",
ops: genericOps,


@ -32,6 +32,7 @@ type arch struct {
name string
pkg string // obj package to import for this arch.
genfile string // source file containing opcode code generation.
genSIMDfile string // source file containing opcode code generation for SIMD.
ops []opData
blocks []blockData
regnames []string
@ -525,6 +526,15 @@ func genOp() {
if err != nil {
log.Fatalf("can't read %s: %v", a.genfile, err)
}
// Append the file of simd operations, too
if a.genSIMDfile != "" {
simdSrc, err := os.ReadFile(a.genSIMDfile)
if err != nil {
log.Fatalf("can't read %s: %v", a.genSIMDfile, err)
}
src = append(src, simdSrc...)
}
seen := make(map[string]bool, len(a.ops))
for _, m := range rxOp.FindAllSubmatch(src, -1) {
seen[string(m[1])] = true


@ -95,6 +95,7 @@ func genLateLowerRules(arch arch) { genRulesSuffix(arch, "latelower") }
func genRulesSuffix(arch arch, suff string) {
// Open input file.
var text io.Reader
text, err := os.Open(arch.name + suff + ".rules")
if err != nil {
if suff == "" {
@ -105,6 +106,14 @@ func genRulesSuffix(arch arch, suff string) {
return
}
// Check for file of SIMD rules to add
if suff == "" {
simdtext, err := os.Open("simd" + arch.name + ".rules")
if err == nil {
text = io.MultiReader(text, simdtext)
}
}
// oprules contains a list of rules for each block and opcode
blockrules := map[string][]Rule{}
oprules := map[string][]Rule{}


@ -0,0 +1,4 @@
// Code generated by internal/simd/_gen using 'go run .'; DO NOT EDIT.
// (AddInt8x16 ...) => (VPADDB ...)
// etc


@ -0,0 +1,10 @@
// Code generated by internal/simd/_gen using 'go run .'; DO NOT EDIT.
package main
func simdAMD64Ops(fp11, fp21, fp2m1, fp2m1fp1, fp2m1m1 regInfo) []opData {
return []opData{
// {name: "VPADDB", argLength: 2, reg: fp21, asm: "VPADDB", commutative: true},
// etc, generated
}
}


@ -0,0 +1,10 @@
// Code generated by internal/simd/_gen using 'go run .'; DO NOT EDIT.
package main
func simdGenericOps() []opData {
return []opData{
// {name: "AddInt8x16", argLength: 2, commutative: true},
// etc
}
}


@ -89,6 +89,10 @@ type Types struct {
Float32Ptr *types.Type
Float64Ptr *types.Type
BytePtrPtr *types.Type
Vec128 *types.Type
Vec256 *types.Type
Vec512 *types.Type
Mask *types.Type
}
// NewTypes creates and populates a Types.
@ -123,6 +127,10 @@ func (t *Types) SetTypPtrs() {
t.Float32Ptr = types.NewPtr(types.Types[types.TFLOAT32])
t.Float64Ptr = types.NewPtr(types.Types[types.TFLOAT64])
t.BytePtrPtr = types.NewPtr(types.NewPtr(types.Types[types.TUINT8]))
t.Vec128 = types.TypeVec128
t.Vec256 = types.TypeVec256
t.Vec512 = types.TypeVec512
t.Mask = types.TypeMask
}
type Logger interface {


@ -100,7 +100,7 @@ func decomposeBuiltIn(f *Func) {
}
case t.IsFloat():
// floats are never decomposed, even ones bigger than RegSize
case t.Size() > f.Config.RegSize:
case t.Size() > f.Config.RegSize && !t.IsSIMD():
f.Fatalf("undecomposed named type %s %v", name, t)
}
}
@ -135,7 +135,7 @@ func decomposeBuiltInPhi(v *Value) {
decomposeInterfacePhi(v)
case v.Type.IsFloat():
// floats are never decomposed, even ones bigger than RegSize
case v.Type.Size() > v.Block.Func.Config.RegSize:
case v.Type.Size() > v.Block.Func.Config.RegSize && !v.Type.IsSIMD():
v.Fatalf("%v undecomposed type %v", v, v.Type)
}
}
@ -248,7 +248,7 @@ func decomposeUser(f *Func) {
for _, name := range f.Names {
t := name.Type
switch {
case t.IsStruct():
case isStructNotSIMD(t):
newNames = decomposeUserStructInto(f, name, newNames)
case t.IsArray():
newNames = decomposeUserArrayInto(f, name, newNames)
@ -293,7 +293,7 @@ func decomposeUserArrayInto(f *Func, name *LocalSlot, slots []*LocalSlot) []*Loc
if t.Elem().IsArray() {
return decomposeUserArrayInto(f, elemName, slots)
} else if t.Elem().IsStruct() {
} else if isStructNotSIMD(t.Elem()) {
return decomposeUserStructInto(f, elemName, slots)
}
@ -313,7 +313,7 @@ func decomposeUserStructInto(f *Func, name *LocalSlot, slots []*LocalSlot) []*Lo
fnames = append(fnames, fs)
// arrays and structs will be decomposed further, so
// there's no need to record a name
if !fs.Type.IsArray() && !fs.Type.IsStruct() {
if !fs.Type.IsArray() && !isStructNotSIMD(fs.Type) {
slots = maybeAppend(f, slots, fs)
}
}
@ -339,7 +339,7 @@ func decomposeUserStructInto(f *Func, name *LocalSlot, slots []*LocalSlot) []*Lo
// now that this f.NamedValues contains values for the struct
// fields, recurse into nested structs
for i := 0; i < n; i++ {
if name.Type.FieldType(i).IsStruct() {
if isStructNotSIMD(name.Type.FieldType(i)) {
slots = decomposeUserStructInto(f, fnames[i], slots)
delete(f.NamedValues, *fnames[i])
} else if name.Type.FieldType(i).IsArray() {
@ -351,7 +351,7 @@ func decomposeUserStructInto(f *Func, name *LocalSlot, slots []*LocalSlot) []*Lo
}
func decomposeUserPhi(v *Value) {
switch {
case v.Type.IsStruct():
case isStructNotSIMD(v.Type):
decomposeStructPhi(v)
case v.Type.IsArray():
decomposeArrayPhi(v)
@ -458,3 +458,7 @@ func deleteNamedVals(f *Func, toDelete []namedVal) {
}
f.Names = f.Names[:end]
}
func isStructNotSIMD(t *types.Type) bool {
return t.IsStruct() && !t.IsSIMD()
}


@ -399,6 +399,9 @@ func (x *expandState) decomposeAsNecessary(pos src.XPos, b *Block, a, m0 *Value,
return mem
case types.TSTRUCT:
if at.IsSIMD() {
break // XXX
}
for i := 0; i < at.NumFields(); i++ {
et := at.Field(i).Type // might need to read offsets from the fields
e := b.NewValue1I(pos, OpStructSelect, et, int64(i), a)
@ -547,6 +550,9 @@ func (x *expandState) rewriteSelectOrArg(pos src.XPos, b *Block, container, a, m
case types.TSTRUCT:
// Assume ssagen/ssa.go (in buildssa) spills large aggregates so they won't appear here.
if at.IsSIMD() {
break // XXX
}
for i := 0; i < at.NumFields(); i++ {
et := at.Field(i).Type
e := x.rewriteSelectOrArg(pos, b, container, nil, m0, et, rc.next(et))
@ -713,6 +719,9 @@ func (x *expandState) rewriteWideSelectToStores(pos src.XPos, b *Block, containe
case types.TSTRUCT:
// Assume ssagen/ssa.go (in buildssa) spills large aggregates so they won't appear here.
if at.IsSIMD() {
break // XXX
}
for i := 0; i < at.NumFields(); i++ {
et := at.Field(i).Type
m0 = x.rewriteWideSelectToStores(pos, b, container, m0, et, rc.next(et))
@ -859,7 +868,7 @@ func (c *registerCursor) at(t *types.Type, i int) registerCursor {
rc.nextSlice += Abi1RO(i * w)
return rc
}
if t.IsStruct() {
if isStructNotSIMD(t) {
for j := 0; j < i; j++ {
rc.next(t.FieldType(j))
}
@ -973,7 +982,7 @@ func (x *expandState) regOffset(t *types.Type, i int) Abi1RO {
if t.IsArray() {
return Abi1RO(i) * x.regWidth(t.Elem())
}
if t.IsStruct() {
if isStructNotSIMD(t) {
k := Abi1RO(0)
for j := 0; j < i; j++ {
k += x.regWidth(t.FieldType(j))

File diff suppressed because it is too large.


@ -501,6 +501,30 @@ func rewriteValueAMD64(v *Value) bool {
return rewriteValueAMD64_OpAMD64TESTW(v)
case OpAMD64TESTWconst:
return rewriteValueAMD64_OpAMD64TESTWconst(v)
case OpAMD64VPMOVVec16x16ToM:
return rewriteValueAMD64_OpAMD64VPMOVVec16x16ToM(v)
case OpAMD64VPMOVVec16x32ToM:
return rewriteValueAMD64_OpAMD64VPMOVVec16x32ToM(v)
case OpAMD64VPMOVVec16x8ToM:
return rewriteValueAMD64_OpAMD64VPMOVVec16x8ToM(v)
case OpAMD64VPMOVVec32x16ToM:
return rewriteValueAMD64_OpAMD64VPMOVVec32x16ToM(v)
case OpAMD64VPMOVVec32x4ToM:
return rewriteValueAMD64_OpAMD64VPMOVVec32x4ToM(v)
case OpAMD64VPMOVVec32x8ToM:
return rewriteValueAMD64_OpAMD64VPMOVVec32x8ToM(v)
case OpAMD64VPMOVVec64x2ToM:
return rewriteValueAMD64_OpAMD64VPMOVVec64x2ToM(v)
case OpAMD64VPMOVVec64x4ToM:
return rewriteValueAMD64_OpAMD64VPMOVVec64x4ToM(v)
case OpAMD64VPMOVVec64x8ToM:
return rewriteValueAMD64_OpAMD64VPMOVVec64x8ToM(v)
case OpAMD64VPMOVVec8x16ToM:
return rewriteValueAMD64_OpAMD64VPMOVVec8x16ToM(v)
case OpAMD64VPMOVVec8x32ToM:
return rewriteValueAMD64_OpAMD64VPMOVVec8x32ToM(v)
case OpAMD64VPMOVVec8x64ToM:
return rewriteValueAMD64_OpAMD64VPMOVVec8x64ToM(v)
case OpAMD64XADDLlock:
return rewriteValueAMD64_OpAMD64XADDLlock(v)
case OpAMD64XADDQlock:
@ -1198,6 +1222,8 @@ func rewriteValueAMD64(v *Value) bool {
case OpZeroExt8to64:
v.Op = OpAMD64MOVBQZX
return true
case OpZeroSIMD:
return rewriteValueAMD64_OpZeroSIMD(v)
}
return false
}
@ -22812,6 +22838,174 @@ func rewriteValueAMD64_OpAMD64TESTWconst(v *Value) bool {
}
return false
}
func rewriteValueAMD64_OpAMD64VPMOVVec16x16ToM(v *Value) bool {
v_0 := v.Args[0]
// match: (VPMOVVec16x16ToM (VPMOVMToVec16x16 x))
// result: x
for {
if v_0.Op != OpAMD64VPMOVMToVec16x16 {
break
}
x := v_0.Args[0]
v.copyOf(x)
return true
}
return false
}
func rewriteValueAMD64_OpAMD64VPMOVVec16x32ToM(v *Value) bool {
v_0 := v.Args[0]
// match: (VPMOVVec16x32ToM (VPMOVMToVec16x32 x))
// result: x
for {
if v_0.Op != OpAMD64VPMOVMToVec16x32 {
break
}
x := v_0.Args[0]
v.copyOf(x)
return true
}
return false
}
func rewriteValueAMD64_OpAMD64VPMOVVec16x8ToM(v *Value) bool {
v_0 := v.Args[0]
// match: (VPMOVVec16x8ToM (VPMOVMToVec16x8 x))
// result: x
for {
if v_0.Op != OpAMD64VPMOVMToVec16x8 {
break
}
x := v_0.Args[0]
v.copyOf(x)
return true
}
return false
}
func rewriteValueAMD64_OpAMD64VPMOVVec32x16ToM(v *Value) bool {
v_0 := v.Args[0]
// match: (VPMOVVec32x16ToM (VPMOVMToVec32x16 x))
// result: x
for {
if v_0.Op != OpAMD64VPMOVMToVec32x16 {
break
}
x := v_0.Args[0]
v.copyOf(x)
return true
}
return false
}
func rewriteValueAMD64_OpAMD64VPMOVVec32x4ToM(v *Value) bool {
v_0 := v.Args[0]
// match: (VPMOVVec32x4ToM (VPMOVMToVec32x4 x))
// result: x
for {
if v_0.Op != OpAMD64VPMOVMToVec32x4 {
break
}
x := v_0.Args[0]
v.copyOf(x)
return true
}
return false
}
func rewriteValueAMD64_OpAMD64VPMOVVec32x8ToM(v *Value) bool {
v_0 := v.Args[0]
// match: (VPMOVVec32x8ToM (VPMOVMToVec32x8 x))
// result: x
for {
if v_0.Op != OpAMD64VPMOVMToVec32x8 {
break
}
x := v_0.Args[0]
v.copyOf(x)
return true
}
return false
}
func rewriteValueAMD64_OpAMD64VPMOVVec64x2ToM(v *Value) bool {
v_0 := v.Args[0]
// match: (VPMOVVec64x2ToM (VPMOVMToVec64x2 x))
// result: x
for {
if v_0.Op != OpAMD64VPMOVMToVec64x2 {
break
}
x := v_0.Args[0]
v.copyOf(x)
return true
}
return false
}
func rewriteValueAMD64_OpAMD64VPMOVVec64x4ToM(v *Value) bool {
v_0 := v.Args[0]
// match: (VPMOVVec64x4ToM (VPMOVMToVec64x4 x))
// result: x
for {
if v_0.Op != OpAMD64VPMOVMToVec64x4 {
break
}
x := v_0.Args[0]
v.copyOf(x)
return true
}
return false
}
func rewriteValueAMD64_OpAMD64VPMOVVec64x8ToM(v *Value) bool {
v_0 := v.Args[0]
// match: (VPMOVVec64x8ToM (VPMOVMToVec64x8 x))
// result: x
for {
if v_0.Op != OpAMD64VPMOVMToVec64x8 {
break
}
x := v_0.Args[0]
v.copyOf(x)
return true
}
return false
}
func rewriteValueAMD64_OpAMD64VPMOVVec8x16ToM(v *Value) bool {
v_0 := v.Args[0]
// match: (VPMOVVec8x16ToM (VPMOVMToVec8x16 x))
// result: x
for {
if v_0.Op != OpAMD64VPMOVMToVec8x16 {
break
}
x := v_0.Args[0]
v.copyOf(x)
return true
}
return false
}
func rewriteValueAMD64_OpAMD64VPMOVVec8x32ToM(v *Value) bool {
v_0 := v.Args[0]
// match: (VPMOVVec8x32ToM (VPMOVMToVec8x32 x))
// result: x
for {
if v_0.Op != OpAMD64VPMOVMToVec8x32 {
break
}
x := v_0.Args[0]
v.copyOf(x)
return true
}
return false
}
func rewriteValueAMD64_OpAMD64VPMOVVec8x64ToM(v *Value) bool {
v_0 := v.Args[0]
// match: (VPMOVVec8x64ToM (VPMOVMToVec8x64 x))
// result: x
for {
if v_0.Op != OpAMD64VPMOVMToVec8x64 {
break
}
x := v_0.Args[0]
v.copyOf(x)
return true
}
return false
}
func rewriteValueAMD64_OpAMD64XADDLlock(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
@ -26215,6 +26409,48 @@ func rewriteValueAMD64_OpLoad(v *Value) bool {
v.AddArg2(ptr, mem)
return true
}
// match: (Load <t> ptr mem)
// cond: t.Size() == 16
// result: (VMOVDQUload128 ptr mem)
for {
t := v.Type
ptr := v_0
mem := v_1
if !(t.Size() == 16) {
break
}
v.reset(OpAMD64VMOVDQUload128)
v.AddArg2(ptr, mem)
return true
}
// match: (Load <t> ptr mem)
// cond: t.Size() == 32
// result: (VMOVDQUload256 ptr mem)
for {
t := v.Type
ptr := v_0
mem := v_1
if !(t.Size() == 32) {
break
}
v.reset(OpAMD64VMOVDQUload256)
v.AddArg2(ptr, mem)
return true
}
// match: (Load <t> ptr mem)
// cond: t.Size() == 64
// result: (VMOVDQUload512 ptr mem)
for {
t := v.Type
ptr := v_0
mem := v_1
if !(t.Size() == 64) {
break
}
v.reset(OpAMD64VMOVDQUload512)
v.AddArg2(ptr, mem)
return true
}
return false
}
func rewriteValueAMD64_OpLocalAddr(v *Value) bool {
@ -29764,6 +30000,51 @@ func rewriteValueAMD64_OpStore(v *Value) bool {
v.AddArg3(ptr, val, mem)
return true
}
// match: (Store {t} ptr val mem)
// cond: t.Size() == 16
// result: (VMOVDQUstore128 ptr val mem)
for {
t := auxToType(v.Aux)
ptr := v_0
val := v_1
mem := v_2
if !(t.Size() == 16) {
break
}
v.reset(OpAMD64VMOVDQUstore128)
v.AddArg3(ptr, val, mem)
return true
}
// match: (Store {t} ptr val mem)
// cond: t.Size() == 32
// result: (VMOVDQUstore256 ptr val mem)
for {
t := auxToType(v.Aux)
ptr := v_0
val := v_1
mem := v_2
if !(t.Size() == 32) {
break
}
v.reset(OpAMD64VMOVDQUstore256)
v.AddArg3(ptr, val, mem)
return true
}
// match: (Store {t} ptr val mem)
// cond: t.Size() == 64
// result: (VMOVDQUstore512 ptr val mem)
for {
t := auxToType(v.Aux)
ptr := v_0
val := v_1
mem := v_2
if !(t.Size() == 64) {
break
}
v.reset(OpAMD64VMOVDQUstore512)
v.AddArg3(ptr, val, mem)
return true
}
return false
}
func rewriteValueAMD64_OpTrunc(v *Value) bool {
@ -30117,6 +30398,45 @@ func rewriteValueAMD64_OpZero(v *Value) bool {
}
return false
}
func rewriteValueAMD64_OpZeroSIMD(v *Value) bool {
// match: (ZeroSIMD <t>)
// cond: t.Size() == 16
// result: (Zero128 <t>)
for {
t := v.Type
if !(t.Size() == 16) {
break
}
v.reset(OpAMD64Zero128)
v.Type = t
return true
}
// match: (ZeroSIMD <t>)
// cond: t.Size() == 32
// result: (Zero256 <t>)
for {
t := v.Type
if !(t.Size() == 32) {
break
}
v.reset(OpAMD64Zero256)
v.Type = t
return true
}
// match: (ZeroSIMD <t>)
// cond: t.Size() == 64
// result: (Zero512 <t>)
for {
t := v.Type
if !(t.Size() == 64) {
break
}
v.reset(OpAMD64Zero512)
v.Type = t
return true
}
return false
}
func rewriteBlockAMD64(b *Block) bool {
typ := &b.Func.Config.Types
switch b.Kind {


@ -14149,11 +14149,11 @@ func rewriteValuegeneric_OpLoad(v *Value) bool {
return true
}
// match: (Load <t> _ _)
// cond: t.IsStruct() && CanSSA(t)
// cond: t.IsStruct() && CanSSA(t) && !t.IsSIMD()
// result: rewriteStructLoad(v)
for {
t := v.Type
if !(t.IsStruct() && CanSSA(t)) {
if !(t.IsStruct() && CanSSA(t) && !t.IsSIMD()) {
break
}
v.copyOf(rewriteStructLoad(v))


@ -596,6 +596,9 @@ func AutoVar(v *Value) (*ir.Name, int64) {
// CanSSA reports whether values of type t can be represented as a Value.
func CanSSA(t *types.Type) bool {
types.CalcSize(t)
if t.IsSIMD() {
return true
}
if t.Size() > int64(4*types.PtrSize) {
// 4*Widthptr is an arbitrary constant. We want it
// to be at least 3*Widthptr so slices can be registerized.


@ -1602,6 +1602,104 @@ func initIntrinsics(cfg *intrinsicBuildConfig) {
return s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], out)
},
sys.AMD64)
if buildcfg.Experiment.SIMD {
// Only enable the intrinsics under the SIMD experiment.
simdIntrinsics(addF)
}
}
// simdLoadSliceMethod does intrinsic for method form of Load-from-slice
func simdLoadSliceMethod(nElts int64) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
// args[0] is unused except for its type.
t := args[0].Type
slice := args[1]
arrlen := s.constInt(types.Types[types.TINT], nElts)
cap := s.newValue1(ssa.OpSliceLen, types.Types[types.TINT], slice)
s.boundsCheck(arrlen, cap, ssa.BoundsConvert, false)
ptr := s.newValue1(ssa.OpSlicePtr, t.PtrTo(), slice) // is this the right type? Does it need a convert?
return s.newValue2(ssa.OpLoad, t, ptr, s.mem())
}
}
// simdLoadSlice does intrinsic for function form of Load-from-slice
func simdLoadSlice(nElts int64) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
// args[0] is unused except for its type.
t := n.Type()
slice := args[0]
arrlen := s.constInt(types.Types[types.TINT], nElts)
cap := s.newValue1(ssa.OpSliceLen, types.Types[types.TINT], slice)
s.boundsCheck(arrlen, cap, ssa.BoundsConvert, false)
ptr := s.newValue1(ssa.OpSlicePtr, t.PtrTo(), slice) // is this the right type? Does it need a convert?
return s.newValue2(ssa.OpLoad, t, ptr, s.mem())
}
}
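// simdStoreSlice does intrinsic for the Store-to-slice forms: it checks
// that the slice holds at least nElts elements, then stores the vector
// (args[0]) through the slice's data pointer.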
func simdStoreSlice(nElts int64) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
x := args[0]
t := x.Type
slice := args[1]
arrlen := s.constInt(types.Types[types.TINT], nElts)
cap := s.newValue1(ssa.OpSliceLen, types.Types[types.TINT], slice)
s.boundsCheck(arrlen, cap, ssa.BoundsConvert, false)
ptr := s.newValue1(ssa.OpSlicePtr, t.PtrTo(), slice) // is this the right type? Does it need a convert?
s.store(t, ptr, x)
return nil
}
}
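// simdLoadSliceMethodPart does intrinsic for a partial load from a slice
// that may be shorter than the vector; only the full-width path is wired
// up, the masked path is not implemented yet.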
func simdLoadSliceMethodPart(nElts int64) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
// args[0] is unused except for its type.
t := args[0].Type
slice := args[1]
arrLen := s.constInt(types.Types[types.TINT], nElts)
cap := s.newValue1(ssa.OpSliceLen, types.Types[types.TINT], slice)
/*
if off := vec.Len() - len(slice) ; off <= 0 {
plain load
} else {
load mask[off] into a scratch vector
masked load/store
}
*/
// TODO SIMD support on a 32-bit processor
off := s.newValue2(ssa.OpSub64, types.Types[types.TINT], arrLen, cap)
cond := s.newValue2(ssa.OpLeq64, types.Types[types.TBOOL], off, s.zeroVal(types.Types[types.TINT]))
b := s.endBlock()
b.Kind = ssa.BlockIf
b.SetControl(cond)
bTrue := s.f.NewBlock(ssa.BlockPlain)
bFalse := s.f.NewBlock(ssa.BlockPlain)
bEnd := s.f.NewBlock(ssa.BlockPlain)
b.AddEdgeTo(bTrue)
b.AddEdgeTo(bFalse)
simdRes := ssaMarker("simdload")
// The slice covers the whole vector - do a plain full-width load.
s.startBlock(bTrue)
ptr := s.newValue1(ssa.OpSlicePtr, t.PtrTo(), slice)
s.vars[simdRes] = s.newValue2(ssa.OpLoad, t, ptr, s.mem())
s.endBlock().AddEdgeTo(bEnd)
// The slice is shorter than the vector - this needs a masked load.
s.startBlock(bFalse)
// NOT IMPLEMENTED, NEED TO ADD GENERIC PARTIAL LOAD/STORE
// MASK REGISTER DEPENDS ON ARCH AND ITS SIMD VERSION.
s.endBlock().AddEdgeTo(bEnd)
// Merge results.
s.startBlock(bEnd)
return s.variable(simdRes, t)
}
}
// findIntrinsic returns a function which builds the SSA equivalent of the
@ -1627,7 +1725,8 @@ func findIntrinsic(sym *types.Sym) intrinsicBuilder {
fn := sym.Name
if ssa.IntrinsicsDisable {
if pkg == "internal/runtime/sys" && (fn == "GetCallerPC" || fn == "GrtCallerSP" || fn == "GetClosurePtr") {
if pkg == "internal/runtime/sys" && (fn == "GetCallerPC" || fn == "GrtCallerSP" || fn == "GetClosurePtr") ||
pkg == "internal/simd" || pkg == "simd" { // TODO after simd has been moved to package simd, remove internal/simd
// These runtime functions don't have definitions, must be intrinsics.
} else {
return nil


@ -0,0 +1,15 @@
// Code generated by internal/simd/_gen using 'go run .'; DO NOT EDIT.
package ssagen
import (
// "cmd/compile/internal/ir"
// "cmd/compile/internal/ssa"
// "cmd/compile/internal/types"
"cmd/internal/sys"
)
func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies ...sys.ArchFamily)) {
// addF("internal/simd", "Int32x4.Uint32x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
// etc
}


@ -623,6 +623,9 @@ func buildssa(fn *ir.Func, worker int, isPgoHot bool) *ssa.Func {
// TODO figure out exactly what's unused, don't spill it. Make liveness fine-grained, also.
for _, p := range params.InParams() {
typs, offs := p.RegisterTypesAndOffsets()
if len(offs) < len(typs) {
s.Fatalf("len(offs)=%d < len(typs)=%d, params=\n%s", len(offs), len(typs), params)
}
for i, t := range typs {
o := offs[i] // offset within parameter
fo := p.FrameOffset(params) // offset of parameter in frame
@ -1399,7 +1402,7 @@ func (s *state) instrument(t *types.Type, addr *ssa.Value, kind instrumentKind)
// If it is instrumenting for MSAN or ASAN and t is a struct type, it instruments
// operation for each field, instead of for the whole struct.
func (s *state) instrumentFields(t *types.Type, addr *ssa.Value, kind instrumentKind) {
if !(base.Flag.MSan || base.Flag.ASan) || !t.IsStruct() {
if !(base.Flag.MSan || base.Flag.ASan) || !isStructNotSIMD(t) {
s.instrument(t, addr, kind)
return
}
@ -4335,7 +4338,7 @@ func (s *state) zeroVal(t *types.Type) *ssa.Value {
return s.constInterface(t)
case t.IsSlice():
return s.constSlice(t)
case t.IsStruct():
case isStructNotSIMD(t):
n := t.NumFields()
v := s.entryNewValue0(ssa.OpStructMake, t)
for i := 0; i < n; i++ {
@ -4349,6 +4352,8 @@ func (s *state) zeroVal(t *types.Type) *ssa.Value {
case 1:
return s.entryNewValue1(ssa.OpArrayMake1, t, s.zeroVal(t.Elem()))
}
case t.IsSIMD():
return s.newValue0(ssa.OpZeroSIMD, t)
}
s.Fatalf("zero for type %v not implemented", t)
return nil
@ -5328,7 +5333,7 @@ func (s *state) storeType(t *types.Type, left, right *ssa.Value, skip skipMask,
// do *left = right for all scalar (non-pointer) parts of t.
func (s *state) storeTypeScalars(t *types.Type, left, right *ssa.Value, skip skipMask) {
switch {
case t.IsBoolean() || t.IsInteger() || t.IsFloat() || t.IsComplex():
case t.IsBoolean() || t.IsInteger() || t.IsFloat() || t.IsComplex() || t.IsSIMD():
s.store(t, left, right)
case t.IsPtrShaped():
if t.IsPtr() && t.Elem().NotInHeap() {
@ -5357,7 +5362,7 @@ func (s *state) storeTypeScalars(t *types.Type, left, right *ssa.Value, skip ski
// itab field doesn't need a write barrier (even though it is a pointer).
itab := s.newValue1(ssa.OpITab, s.f.Config.Types.BytePtr, right)
s.store(types.Types[types.TUINTPTR], left, itab)
case t.IsStruct():
case isStructNotSIMD(t):
n := t.NumFields()
for i := 0; i < n; i++ {
ft := t.FieldType(i)
@ -5394,7 +5399,7 @@ func (s *state) storeTypePtrs(t *types.Type, left, right *ssa.Value) {
idata := s.newValue1(ssa.OpIData, s.f.Config.Types.BytePtr, right)
idataAddr := s.newValue1I(ssa.OpOffPtr, s.f.Config.Types.BytePtrPtr, s.config.PtrSize, left)
s.store(s.f.Config.Types.BytePtr, idataAddr, idata)
case t.IsStruct():
case isStructNotSIMD(t):
n := t.NumFields()
for i := 0; i < n; i++ {
ft := t.FieldType(i)
@ -6477,7 +6482,7 @@ func EmitArgInfo(f *ir.Func, abiInfo *abi.ABIParamResultInfo) *obj.LSym {
uintptrTyp := types.Types[types.TUINTPTR]
isAggregate := func(t *types.Type) bool {
return t.IsStruct() || t.IsArray() || t.IsComplex() || t.IsInterface() || t.IsString() || t.IsSlice()
return isStructNotSIMD(t) || t.IsArray() || t.IsComplex() || t.IsInterface() || t.IsString() || t.IsSlice()
}
wOff := 0
@ -6537,7 +6542,7 @@ func EmitArgInfo(f *ir.Func, abiInfo *abi.ABIParamResultInfo) *obj.LSym {
}
baseOffset += t.Elem().Size()
}
case t.IsStruct():
case isStructNotSIMD(t):
if t.NumFields() == 0 {
n++ // {} counts as a component
break
@ -7554,7 +7559,7 @@ func (s *State) UseArgs(n int64) {
// fieldIdx finds the index of the field referred to by the ODOT node n.
func fieldIdx(n *ir.SelectorExpr) int {
t := n.X.Type()
if !t.IsStruct() {
if !isStructNotSIMD(t) {
panic("ODOT's LHS is not a struct")
}
@ -7762,6 +7767,10 @@ func SpillSlotAddr(spill ssa.Spill, baseReg int16, extraOffset int64) obj.Addr {
}
}
func isStructNotSIMD(t *types.Type) bool {
return t.IsStruct() && !t.IsSIMD()
}
var (
BoundsCheckFunc [ssa.BoundsKindCount]*obj.LSym
ExtendCheckFunc [ssa.BoundsKindCount]*obj.LSym


@ -10,6 +10,7 @@ import (
"cmd/compile/internal/base"
"cmd/internal/src"
"internal/buildcfg"
"internal/types/errors"
)
@ -410,6 +411,10 @@ func CalcSize(t *Type) {
}
CalcStructSize(t)
w = t.width
if t.IsSIMD() { // XXX
t.intRegs = 0
t.floatRegs = 1
}
// make fake type to check later to
// trigger function argument computation.
@ -452,6 +457,31 @@ func CalcSize(t *Type) {
ResumeCheckSize()
}
// simdify marks a type as "SIMD", either as a tag field,
// or having the SIMD attribute. The tag field is a marker
// type used to identify a struct that is not really a struct.
// A SIMD type is allocated to a vector register (on amd64,
// xmm, ymm, or zmm). The fields of a SIMD type are ignored
// by the compiler except for the space that they reserve.
func simdify(st *Type, isTag bool) {
st.align = 8
st.alg = AMEM
st.intRegs = 0
st.isSIMD = true
if isTag {
st.width = 0
st.isSIMDTag = true
st.floatRegs = 0
} else {
st.floatRegs = 1
}
// if st.Sym() != nil {
// base.Warn("Simdify %s, %v, %d", st.Sym().Name, isTag, st.width)
// } else {
// base.Warn("Simdify %v, %v, %d", st, isTag, st.width)
// }
}
// CalcStructSize calculates the size of t,
// filling in t.width, t.align, t.intRegs, and t.floatRegs,
// even if size calculation is otherwise disabled.
@ -464,10 +494,27 @@ func CalcStructSize(t *Type) {
switch {
case sym.Name == "align64" && isAtomicStdPkg(sym.Pkg):
maxAlign = 8
case buildcfg.Experiment.SIMD && (sym.Pkg.Path == "internal/simd" || sym.Pkg.Path == "simd") && len(t.Fields()) >= 1:
// This gates the experiment -- without it, no user-visible types can be "simd".
// The SSA-visible SIMD types remain.
// TODO after simd has been moved to package simd, remove internal/simd.
switch sym.Name {
case "v128":
simdify(t, true)
return
case "v256":
simdify(t, true)
return
case "v512":
simdify(t, true)
return
}
}
}
fields := t.Fields()
size := calcStructOffset(t, fields, 0)
// For non-zero-sized structs which end in a zero-sized field, we
@ -540,6 +587,11 @@ func CalcStructSize(t *Type) {
break
}
}
if len(t.Fields()) >= 1 && t.Fields()[0].Type.isSIMDTag {
// this catches `type Foo simd.Whatever` -- Foo is also SIMD.
simdify(t, false)
}
}
// CalcArraySize calculates the size of t,
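To make the marker mechanism above concrete, here is a sketch of how the
user-visible types in package "simd" (or "internal/simd") are presumably
declared; only the v128/v256/v512 names, the package path, and the
"marker as first field" rule come from this patch, the field layout itself
is an assumption.

	package simd

	// v256 is the marker ("tag") type: CalcStructSize recognizes a struct
	// with this name in this package and calls simdify(t, true), giving it
	// width 0.
	type v256 struct {
		_ struct{} // at least one field, so the CalcStructSize case applies
	}

	// Float64x4 puts the marker first, so CalcStructSize calls
	// simdify(t, false): the whole struct lives in one 256-bit vector
	// register and its fields only reserve space. The vals field is an
	// assumed placeholder for the four lanes.
	type Float64x4 struct {
		tag  v256
		vals [4]float64
	}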


@ -203,6 +203,7 @@ type Type struct {
flags bitset8
alg AlgKind // valid if Align > 0
isSIMDTag, isSIMD bool // tag is the marker type, isSIMD means has marker type
// size of prefix of object that contains all pointers. valid if Align > 0.
// Note that for pointers, this is always PtrSize even if the element type
@ -605,6 +606,12 @@ func newSSA(name string) *Type {
return t
}
func newSIMD(name string) *Type {
t := newSSA(name)
t.isSIMD = true
return t
}
// NewMap returns a new map Type with key type k and element (aka value) type v.
func NewMap(k, v *Type) *Type {
t := newType(TMAP)
@ -995,10 +1002,7 @@ func (t *Type) ArgWidth() int64 {
func (t *Type) Size() int64 {
if t.kind == TSSA {
if t == TypeInt128 {
return 16
}
return 0
return t.width
}
CalcSize(t)
return t.width
@ -1626,12 +1630,26 @@ var (
TypeFlags = newSSA("flags")
TypeVoid = newSSA("void")
TypeInt128 = newSSA("int128")
TypeVec128 = newSIMD("vec128")
TypeVec256 = newSIMD("vec256")
TypeVec512 = newSIMD("vec512")
TypeMask = newSSA("mask") // not a vector, not 100% sure what this should be.
TypeResultMem = newResults([]*Type{TypeMem})
)
func init() {
TypeInt128.width = 16
TypeInt128.align = 8
TypeVec128.width = 16
TypeVec128.align = 8
TypeVec256.width = 32
TypeVec256.align = 8
TypeVec512.width = 64
TypeVec512.align = 8
TypeMask.width = 8 // This will depend on the architecture; spilling will be "interesting".
TypeMask.align = 8
}
// NewNamed returns a new named type for the given type name. obj should be an
@ -2017,3 +2035,7 @@ var SimType [NTYPE]Kind
// Fake package for shape types (see typecheck.Shapify()).
var ShapePkg = NewPkg("go.shape", "go.shape")
func (t *Type) IsSIMD() bool {
return t.isSIMD
}


@ -0,0 +1,7 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build amd64
// Empty file to allow bodyless functions.

src/internal/simd/testdata/sample.go (new file, 145 lines)

@ -0,0 +1,145 @@
package sample
import (
"internal/simd"
"os"
"unsafe"
)
type S1 = simd.Float64x4
type S2 simd.Float64x4
func (s S2) Len() int {
return simd.Float64x4(s).Len()
}
func (s S2) Load(a []float64) S2 {
return S2(simd.LoadFloat64x4FromSlice(a))
}
func (s S2) Store(a []float64) {
simd.Float64x4(s).Store(a)
}
func (s S2) Add(a S2) S2 {
return S2(simd.Float64x4(s).Add(simd.Float64x4(a)))
}
func (s S2) Mul(a S2) S2 {
return S2(simd.Float64x4(s).Mul(simd.Float64x4(a)))
}
type S3 struct {
simd.Float64x4
}
func ip64_0(a, b []float64) float64 {
s := 0.0
for i := range a {
s += a[i] * b[i]
}
return s
}
func ip64_1(a, b []float64) float64 {
var z S1
sum := z
var i int
stride := z.Len()
for ; i <= len(a)-stride; i += stride {
va := simd.LoadFloat64x4FromSlice(a[i:])
vb := simd.LoadFloat64x4FromSlice(b[i:])
sum = sum.Add(va.Mul(vb))
}
var tmp [4]float64
sum.Store(tmp[:])
return tmp[0] + tmp[1] + tmp[2] + tmp[3]
}
func ip64_1a(a, b []float64) float64 {
var z S1
sum := z
var i int
stride := z.Len()
for ; i <= len(a)-stride; i += stride {
va := simd.LoadFloat64x4FromSlice(a[i:])
vb := simd.LoadFloat64x4FromSlice(b[i:])
sum = FMA(sum, va, vb)
}
var tmp [4]float64
sum.Store(tmp[:])
return tmp[0] + tmp[1] + tmp[2] + tmp[3]
}
//go:noinline
func FMA(a, b, c simd.Float64x4) simd.Float64x4 {
return a.Add(b.Mul(c))
}
func ip64_2(a, b []float64) float64 {
var z S2
sum := z
var i int
stride := z.Len()
for ; i <= len(a)-stride; i += stride {
va := z.Load(a[i:])
vb := z.Load(b[i:])
sum = sum.Add(va.Mul(vb))
}
var tmp [4]float64
sum.Store(tmp[:])
return tmp[0] + tmp[1] + tmp[2] + tmp[3]
}
func ip64_3(a, b []float64) float64 {
var z S3
sum := z
var i int
stride := z.Len()
for ; i <= len(a)-stride; i += stride {
va := simd.LoadFloat64x4FromSlice(a[i:])
vb := simd.LoadFloat64x4FromSlice(b[i:])
sum = S3{sum.Add(va.Mul(vb))}
}
var tmp [4]float64
sum.Store(tmp[:])
return tmp[0] + tmp[1] + tmp[2] + tmp[3]
}
func main() {
a := []float64{1, 2, 3, 4, 5, 6, 7, 8}
ip0 := ip64_0(a, a)
ip1 := ip64_1(a, a)
ip1a := ip64_1a(a, a)
ip2 := ip64_2(a, a)
ip3 := ip64_3(a, a)
fmt.Printf("Test IP = %f\n", ip0)
fmt.Printf("SIMD IP 1 = %f\n", ip1)
fmt.Printf("SIMD IP 1a = %f\n", ip1a)
fmt.Printf("SIMD IP 2 = %f\n", ip2)
fmt.Printf("SIMD IP 3 = %f\n", ip3)
var z1 S1
var z2 S2
var z3 S3
s1, s2, s3 := unsafe.Sizeof(z1), unsafe.Sizeof(z2), unsafe.Sizeof(z3)
fmt.Printf("unsafe.Sizeof(z1, z2, z3)=%d, %d, %d\n", s1, s2, s3)
fail := false
if s1 != 32 || s2 != 32 || s3 != 32 {
fmt.Println("Failed a sizeof check, should all be 32")
fail = true
}
if ip1 != ip0 || ip1a != ip0 || ip2 != ip0 || ip3 != ip0 {
fmt.Println("Failed an inner product check, should all be", ip0)
fail = true
}
if fail {
os.Exit(1)
}
}