[dev.simd] cmd/compile, simd: add SHA features

This CL also fixed some bugs left in CL 712181. Change-Id: I9cb6cd9fbaef307f352809bf21b8fec3eb62721a Reviewed-on: https://go-review.googlesource.com/c/go/+/712361 Reviewed-by: David Chase <drchase@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
2025-12-08 06:10:04 +00:00 · 2025-10-16 16:07:32 +00:00 · 2025-10-16 16:07:32 +00:00 · cf7c1a4cbb
commit cf7c1a4cbb
parent 2b8eded4f4
22 changed files with 843 additions and 235 deletions
--- a/src/cmd/compile/internal/amd64/simdssa.go
+++ b/src/cmd/compile/internal/amd64/simdssa.go
@ -1955,6 +1955,18 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VPTERNLOGQ512load:
 		p = simdV31loadResultInArg0Imm8(s, v)

+	case ssa.OpAMD64SHA1MSG1128,
+		ssa.OpAMD64SHA1MSG2128,
+		ssa.OpAMD64SHA1NEXTE128,
+		ssa.OpAMD64SHA256MSG1128:
+		p = simdV21ResultInArg0(s, v)
+
+	case ssa.OpAMD64SHA1RNDS4128:
+		p = simdV21ResultInArg0Imm8(s, v)
+
+	case ssa.OpAMD64SHA256RNDS2128:
+		p = simdV31x0AtIn2ResultInArg0(s, v)
+
 	default:
 		// Unknown reg shape
 		return false
--- a/src/cmd/compile/internal/amd64/ssa.go
+++ b/src/cmd/compile/internal/amd64/ssa.go
@ -2349,6 +2349,32 @@ func simdV2kvloadImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
 	return p
 }

+// simdV21ResultInArg0 emits a two-input vector instruction for ops marked
+// resultInArg0: only Args[1] is encoded as an explicit source, and the
+// value's own register is the destination.
+//
+// Example instruction: SHA1NEXTE X2, X2
+func simdV21ResultInArg0(s *ssagen.State, v *ssa.Value) *obj.Prog {
+	prog := s.Prog(v.Op.Asm())
+	// Source operand: the second SSA argument.
+	prog.From.Type, prog.From.Reg = obj.TYPE_REG, simdReg(v.Args[1])
+	// Destination operand: the register assigned to the result.
+	prog.To.Type, prog.To.Reg = obj.TYPE_REG, simdReg(v)
+	return prog
+}
+
+// simdV21ResultInArg0Imm8 emits a two-input vector instruction that also
+// carries an 8-bit immediate taken from the value's aux field. The immediate
+// is the first operand, Args[1] follows as an additional source register, and
+// the value's own register is the destination.
+//
+// Example instruction: SHA1RNDS4 $1, X2, X2
+func simdV21ResultInArg0Imm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
+	prog := s.Prog(v.Op.Asm())
+	imm := int64(v.AuxUInt8())
+	prog.From.Type, prog.From.Offset = obj.TYPE_CONST, imm
+	prog.AddRestSourceReg(simdReg(v.Args[1]))
+	prog.To.Type, prog.To.Reg = obj.TYPE_REG, simdReg(v)
+	return prog
+}
+
+// simdV31x0AtIn2ResultInArg0 generates code exactly as simdV31ResultInArg0
+// does; it exists as a separate entry point for the v31x0AtIn2 register shape.
+// NOTE(review): the requirement that input 2 lives in X0 is presumably
+// guaranteed by the v31x0AtIn2 register constraints rather than checked
+// here — confirm.
+//
+// Example instruction: SHA256RNDS2 X0, X11, X2
+func simdV31x0AtIn2ResultInArg0(s *ssagen.State, v *ssa.Value) *obj.Prog {
+	return simdV31ResultInArg0(s, v)
+}
+
 var blockJump = [...]struct {
 	asm, invasm obj.As
 }{
--- a/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/AMD64Ops.go
@ -135,6 +135,7 @@ func init() {

 		vz = v | x15
 		wz = w | x15
+		x0 = buildReg("X0")
 	)
 	// Common slices of register masks
 	var (
@ -213,7 +214,7 @@ func init() {
 		vstorek = regInfo{inputs: []regMask{gpspsb, mask, v, 0}}

 		v11     = regInfo{inputs: vzonly, outputs: vonly}
-		v21     = regInfo{inputs: []regMask{vz, vz}, outputs: vonly}
+		v21     = regInfo{inputs: []regMask{v, vz}, outputs: vonly} // used in resultInArg0 ops, arg0 must not be x15
 		vk      = regInfo{inputs: vzonly, outputs: maskonly}
 		kv      = regInfo{inputs: maskonly, outputs: vonly}
 		v2k     = regInfo{inputs: []regMask{vz, vz}, outputs: maskonly}
@ -247,17 +248,18 @@ func init() {

 		// These register masks are used by SIMD only, they follow the pattern:
 		// Mem last, k mask second to last (if any), address right before mem and k mask.
-		wkwload  = regInfo{inputs: []regMask{gpspsb, mask, 0}, outputs: wonly}
-		v21load  = regInfo{inputs: []regMask{vz, gpspsb, 0}, outputs: vonly}
-		v31load  = regInfo{inputs: []regMask{v, vz, gpspsb, 0}, outputs: vonly} // used in resultInArg0 ops, arg0 must not be x15
-		v11load  = regInfo{inputs: []regMask{gpspsb, 0}, outputs: vonly}
-		w21load  = regInfo{inputs: []regMask{wz, gpspsb, 0}, outputs: wonly}
-		w31load  = regInfo{inputs: []regMask{w, wz, gpspsb, 0}, outputs: wonly} // used in resultInArg0 ops, arg0 must not be x15
-		w2kload  = regInfo{inputs: []regMask{wz, gpspsb, 0}, outputs: maskonly}
-		w2kwload = regInfo{inputs: []regMask{wz, gpspsb, mask, 0}, outputs: wonly}
-		w11load  = regInfo{inputs: []regMask{gpspsb, 0}, outputs: wonly}
-		w3kwload = regInfo{inputs: []regMask{w, wz, gpspsb, mask, 0}, outputs: wonly} // used in resultInArg0 ops, arg0 must not be x15
-		w2kkload = regInfo{inputs: []regMask{wz, gpspsb, mask, 0}, outputs: maskonly}
+		wkwload    = regInfo{inputs: []regMask{gpspsb, mask, 0}, outputs: wonly}
+		v21load    = regInfo{inputs: []regMask{v, gpspsb, 0}, outputs: vonly}     // used in resultInArg0 ops, arg0 must not be x15
+		v31load    = regInfo{inputs: []regMask{v, vz, gpspsb, 0}, outputs: vonly} // used in resultInArg0 ops, arg0 must not be x15
+		v11load    = regInfo{inputs: []regMask{gpspsb, 0}, outputs: vonly}
+		w21load    = regInfo{inputs: []regMask{wz, gpspsb, 0}, outputs: wonly}
+		w31load    = regInfo{inputs: []regMask{w, wz, gpspsb, 0}, outputs: wonly} // used in resultInArg0 ops, arg0 must not be x15
+		w2kload    = regInfo{inputs: []regMask{wz, gpspsb, 0}, outputs: maskonly}
+		w2kwload   = regInfo{inputs: []regMask{wz, gpspsb, mask, 0}, outputs: wonly}
+		w11load    = regInfo{inputs: []regMask{gpspsb, 0}, outputs: wonly}
+		w3kwload   = regInfo{inputs: []regMask{w, wz, gpspsb, mask, 0}, outputs: wonly} // used in resultInArg0 ops, arg0 must not be x15
+		w2kkload   = regInfo{inputs: []regMask{wz, gpspsb, mask, 0}, outputs: maskonly}
+		v31x0AtIn2 = regInfo{inputs: []regMask{v, vz, x0}, outputs: vonly} // used in resultInArg0 ops, arg0 must not be x15

 		kload  = regInfo{inputs: []regMask{gpspsb, 0}, outputs: maskonly}
 		kstore = regInfo{inputs: []regMask{gpspsb, mask, 0}}
@ -1477,7 +1479,7 @@ func init() {
 		genSIMDfile: "../../amd64/simdssa.go",
 		ops: append(AMD64ops, simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vfpkv,
 			w11, w21, w2k, wkw, w2kw, w2kk, w31, w3kw, wgpw, wgp, wfpw, wfpkw, wkwload, v21load, v31load, v11load,
-			w21load, w31load, w2kload, w2kwload, w11load, w3kwload, w2kkload)...), // AMD64ops,
+			w21load, w31load, w2kload, w2kwload, w11load, w3kwload, w2kkload, v31x0AtIn2)...), // AMD64ops,
 		blocks:             AMD64blocks,
 		regnames:           regNamesAMD64,
 		ParamIntRegNames:   "AX BX CX DI SI R8 R9 R10 R11",
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
@ -939,6 +939,20 @@
 (RoundToEvenScaledResidueFloat64x2 [a] x) => (VREDUCEPD128 [a+0] x)
 (RoundToEvenScaledResidueFloat64x4 [a] x) => (VREDUCEPD256 [a+0] x)
 (RoundToEvenScaledResidueFloat64x8 [a] x) => (VREDUCEPD512 [a+0] x)
+(SHA1Msg1Int32x4 ...) => (SHA1MSG1128 ...)
+(SHA1Msg1Uint32x4 ...) => (SHA1MSG1128 ...)
+(SHA1Msg2Int32x4 ...) => (SHA1MSG2128 ...)
+(SHA1Msg2Uint32x4 ...) => (SHA1MSG2128 ...)
+(SHA1NextEInt32x4 ...) => (SHA1NEXTE128 ...)
+(SHA1NextEUint32x4 ...) => (SHA1NEXTE128 ...)
+(SHA1Round4Int32x4 ...) => (SHA1RNDS4128 ...)
+(SHA1Round4Uint32x4 ...) => (SHA1RNDS4128 ...)
+(SHA256Msg1Int32x4 ...) => (SHA256MSG1128 ...)
+(SHA256Msg1Uint32x4 ...) => (SHA256MSG1128 ...)
+(SHA256Msg2Int32x4 ...) => (SHA256MSG1128 ...)
+(SHA256Msg2Uint32x4 ...) => (SHA256MSG1128 ...)
+(SHA256Rounds2Int32x4 ...) => (SHA256RNDS2128 ...)
+(SHA256Rounds2Uint32x4 ...) => (SHA256RNDS2128 ...)
 (ScaleFloat32x4 ...) => (VSCALEFPS128 ...)
 (ScaleFloat32x8 ...) => (VSCALEFPS256 ...)
 (ScaleFloat32x16 ...) => (VSCALEFPS512 ...)
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
@ -3,8 +3,13 @@
 package main

 func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vfpkv, w11, w21, w2k, wkw, w2kw, w2kk, w31, w3kw, wgpw, wgp, wfpw, wfpkw,
-	wkwload, v21load, v31load, v11load, w21load, w31load, w2kload, w2kwload, w11load, w3kwload, w2kkload regInfo) []opData {
+	wkwload, v21load, v31load, v11load, w21load, w31load, w2kload, w2kwload, w11load, w3kwload, w2kkload, v31x0AtIn2 regInfo) []opData {
 	return []opData{
+		{name: "SHA1MSG1128", argLength: 2, reg: v21, asm: "SHA1MSG1", commutative: false, typ: "Vec128", resultInArg0: true},
+		{name: "SHA1MSG2128", argLength: 2, reg: v21, asm: "SHA1MSG2", commutative: false, typ: "Vec128", resultInArg0: true},
+		{name: "SHA1NEXTE128", argLength: 2, reg: v21, asm: "SHA1NEXTE", commutative: false, typ: "Vec128", resultInArg0: true},
+		{name: "SHA256MSG1128", argLength: 2, reg: v21, asm: "SHA256MSG1", commutative: false, typ: "Vec128", resultInArg0: true},
+		{name: "SHA256RNDS2128", argLength: 3, reg: v31x0AtIn2, asm: "SHA256RNDS2", commutative: false, typ: "Vec128", resultInArg0: true},
 		{name: "VADDPD128", argLength: 2, reg: v21, asm: "VADDPD", commutative: true, typ: "Vec128", resultInArg0: false},
 		{name: "VADDPD256", argLength: 2, reg: v21, asm: "VADDPD", commutative: true, typ: "Vec256", resultInArg0: false},
 		{name: "VADDPD512", argLength: 2, reg: w21, asm: "VADDPD", commutative: true, typ: "Vec512", resultInArg0: false},
@ -1216,6 +1221,7 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
 		{name: "VPRORQMasked128", argLength: 2, reg: wkw, asm: "VPRORQ", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPRORQMasked256", argLength: 2, reg: wkw, asm: "VPRORQ", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPRORQMasked512", argLength: 2, reg: wkw, asm: "VPRORQ", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
+		{name: "SHA1RNDS4128", argLength: 2, reg: v21, asm: "SHA1RNDS4", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: true},
 		{name: "VPERM2F128256", argLength: 2, reg: v21, asm: "VPERM2F128", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPERM2I128256", argLength: 2, reg: v21, asm: "VPERM2I128", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPINSRD128", argLength: 2, reg: vgpv, asm: "VPINSRD", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
--- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
@ -844,6 +844,18 @@ func simdGenericOps() []opData {
 		{name: "RoundToEvenFloat32x8", argLength: 1, commutative: false},
 		{name: "RoundToEvenFloat64x2", argLength: 1, commutative: false},
 		{name: "RoundToEvenFloat64x4", argLength: 1, commutative: false},
+		{name: "SHA1Msg1Int32x4", argLength: 2, commutative: false},
+		{name: "SHA1Msg1Uint32x4", argLength: 2, commutative: false},
+		{name: "SHA1Msg2Int32x4", argLength: 2, commutative: false},
+		{name: "SHA1Msg2Uint32x4", argLength: 2, commutative: false},
+		{name: "SHA1NextEInt32x4", argLength: 2, commutative: false},
+		{name: "SHA1NextEUint32x4", argLength: 2, commutative: false},
+		{name: "SHA256Msg1Int32x4", argLength: 2, commutative: false},
+		{name: "SHA256Msg1Uint32x4", argLength: 2, commutative: false},
+		{name: "SHA256Msg2Int32x4", argLength: 2, commutative: false},
+		{name: "SHA256Msg2Uint32x4", argLength: 2, commutative: false},
+		{name: "SHA256Rounds2Int32x4", argLength: 3, commutative: false},
+		{name: "SHA256Rounds2Uint32x4", argLength: 3, commutative: false},
 		{name: "ScaleFloat32x4", argLength: 2, commutative: false},
 		{name: "ScaleFloat32x8", argLength: 2, commutative: false},
 		{name: "ScaleFloat32x16", argLength: 2, commutative: false},
@ -1206,6 +1218,8 @@ func simdGenericOps() []opData {
 		{name: "RoundToEvenScaledResidueFloat64x2", argLength: 1, commutative: false, aux: "UInt8"},
 		{name: "RoundToEvenScaledResidueFloat64x4", argLength: 1, commutative: false, aux: "UInt8"},
 		{name: "RoundToEvenScaledResidueFloat64x8", argLength: 1, commutative: false, aux: "UInt8"},
+		{name: "SHA1Round4Int32x4", argLength: 2, commutative: false, aux: "UInt8"},
+		{name: "SHA1Round4Uint32x4", argLength: 2, commutative: false, aux: "UInt8"},
 		{name: "Select128FromPairFloat32x8", argLength: 2, commutative: false, aux: "UInt8"},
 		{name: "Select128FromPairFloat64x4", argLength: 2, commutative: false, aux: "UInt8"},
 		{name: "Select128FromPairInt32x8", argLength: 2, commutative: false, aux: "UInt8"},
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@ -4978,6 +4978,48 @@ func rewriteValueAMD64(v *Value) bool {
 		return rewriteValueAMD64_OpRsh8x64(v)
 	case OpRsh8x8:
 		return rewriteValueAMD64_OpRsh8x8(v)
+	case OpSHA1Msg1Int32x4:
+		v.Op = OpAMD64SHA1MSG1128
+		return true
+	case OpSHA1Msg1Uint32x4:
+		v.Op = OpAMD64SHA1MSG1128
+		return true
+	case OpSHA1Msg2Int32x4:
+		v.Op = OpAMD64SHA1MSG2128
+		return true
+	case OpSHA1Msg2Uint32x4:
+		v.Op = OpAMD64SHA1MSG2128
+		return true
+	case OpSHA1NextEInt32x4:
+		v.Op = OpAMD64SHA1NEXTE128
+		return true
+	case OpSHA1NextEUint32x4:
+		v.Op = OpAMD64SHA1NEXTE128
+		return true
+	case OpSHA1Round4Int32x4:
+		v.Op = OpAMD64SHA1RNDS4128
+		return true
+	case OpSHA1Round4Uint32x4:
+		v.Op = OpAMD64SHA1RNDS4128
+		return true
+	case OpSHA256Msg1Int32x4:
+		v.Op = OpAMD64SHA256MSG1128
+		return true
+	case OpSHA256Msg1Uint32x4:
+		v.Op = OpAMD64SHA256MSG1128
+		return true
+	case OpSHA256Msg2Int32x4:
+		v.Op = OpAMD64SHA256MSG1128
+		return true
+	case OpSHA256Msg2Uint32x4:
+		v.Op = OpAMD64SHA256MSG1128
+		return true
+	case OpSHA256Rounds2Int32x4:
+		v.Op = OpAMD64SHA256RNDS2128
+		return true
+	case OpSHA256Rounds2Uint32x4:
+		v.Op = OpAMD64SHA256RNDS2128
+		return true
 	case OpScaleFloat32x16:
 		v.Op = OpAMD64VSCALEFPS512
 		return true
--- a/src/cmd/compile/internal/ssagen/intrinsics.go
+++ b/src/cmd/compile/internal/ssagen/intrinsics.go
@ -1987,6 +1987,19 @@ func opLen2Imm8_II(op ssa.Op, t *types.Type, _ int) func(s *state, n *ir.CallExp
 	}
 }

+// The assembler requires the imm value of a SHA1RNDS4 instruction to be one of 0,1,2,3...
+func opLen2Imm8_SHA1RNDS4(op ssa.Op, t *types.Type, offset int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
+		if args[1].Op == ssa.OpConst8 {
+			return s.newValue2I(op, t, (args[1].AuxInt<<int64(offset))&0b11, args[0], args[2])
+		}
+		return immJumpTable(s, args[1], n, func(sNew *state, idx int) {
+			// Encode as int8 due to requirement of AuxInt, check its comment for details.
+			s.vars[n] = sNew.newValue2I(op, t, int64(int8(idx<<offset))&0b11, args[0], args[2])
+		})
+	}
+}
+
 func opLen3Imm8_2I(op ssa.Op, t *types.Type, offset int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
 	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
 		if args[2].Op == ssa.OpConst8 {
--- a/src/cmd/compile/internal/ssagen/simdintrinsics.go
+++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go
@ -951,6 +951,20 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
 	addF(simdPackage, "Float64x2.RoundToEvenScaledResidue", opLen1Imm8(ssa.OpRoundToEvenScaledResidueFloat64x2, types.TypeVec128, 4), sys.AMD64)
 	addF(simdPackage, "Float64x4.RoundToEvenScaledResidue", opLen1Imm8(ssa.OpRoundToEvenScaledResidueFloat64x4, types.TypeVec256, 4), sys.AMD64)
 	addF(simdPackage, "Float64x8.RoundToEvenScaledResidue", opLen1Imm8(ssa.OpRoundToEvenScaledResidueFloat64x8, types.TypeVec512, 4), sys.AMD64)
+	addF(simdPackage, "Int32x4.SHA1Msg1", opLen2(ssa.OpSHA1Msg1Int32x4, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint32x4.SHA1Msg1", opLen2(ssa.OpSHA1Msg1Uint32x4, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int32x4.SHA1Msg2", opLen2(ssa.OpSHA1Msg2Int32x4, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint32x4.SHA1Msg2", opLen2(ssa.OpSHA1Msg2Uint32x4, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int32x4.SHA1NextE", opLen2(ssa.OpSHA1NextEInt32x4, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint32x4.SHA1NextE", opLen2(ssa.OpSHA1NextEUint32x4, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int32x4.SHA1Round4", opLen2Imm8_SHA1RNDS4(ssa.OpSHA1Round4Int32x4, types.TypeVec128, 0), sys.AMD64)
+	addF(simdPackage, "Uint32x4.SHA1Round4", opLen2Imm8_SHA1RNDS4(ssa.OpSHA1Round4Uint32x4, types.TypeVec128, 0), sys.AMD64)
+	addF(simdPackage, "Int32x4.SHA256Msg1", opLen2(ssa.OpSHA256Msg1Int32x4, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint32x4.SHA256Msg1", opLen2(ssa.OpSHA256Msg1Uint32x4, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int32x4.SHA256Msg2", opLen2(ssa.OpSHA256Msg2Int32x4, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint32x4.SHA256Msg2", opLen2(ssa.OpSHA256Msg2Uint32x4, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Int32x4.SHA256Rounds2", opLen3(ssa.OpSHA256Rounds2Int32x4, types.TypeVec128), sys.AMD64)
+	addF(simdPackage, "Uint32x4.SHA256Rounds2", opLen3(ssa.OpSHA256Rounds2Uint32x4, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Float32x4.Scale", opLen2(ssa.OpScaleFloat32x4, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Float32x8.Scale", opLen2(ssa.OpScaleFloat32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Float32x16.Scale", opLen2(ssa.OpScaleFloat32x16, types.TypeVec512), sys.AMD64)
--- a/src/simd/_gen/simdgen/gen_simdIntrinsics.go
+++ b/src/simd/_gen/simdgen/gen_simdIntrinsics.go
@ -58,6 +58,8 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
 {{end}}
 {{define "op2Imm8_II"}}	addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen2Imm8_II(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64)
 {{end}}
+{{define "op2Imm8_SHA1RNDS4"}}	addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen2Imm8_SHA1RNDS4(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64)
+{{end}}
 {{define "op3Imm8"}}	addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen3Imm8(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64)
 {{end}}
 {{define "op3Imm8_2I"}}	addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen3Imm8_2I(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64)
--- a/src/simd/_gen/simdgen/gen_simdMachineOps.go
+++ b/src/simd/_gen/simdgen/gen_simdMachineOps.go
@ -16,7 +16,7 @@ const simdMachineOpsTmpl = `
 package main

 func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vfpkv, w11, w21, w2k, wkw, w2kw, w2kk, w31, w3kw, wgpw, wgp, wfpw, wfpkw,
-	wkwload, v21load, v31load, v11load, w21load, w31load, w2kload, w2kwload, w11load, w3kwload, w2kkload regInfo) []opData {
+	wkwload, v21load, v31load, v11load, w21load, w31load, w2kload, w2kwload, w11load, w3kwload, w2kkload, v31x0AtIn2 regInfo) []opData {
 	return []opData{
 {{- range .OpsData }}
 		{name: "{{.OpName}}", argLength: {{.OpInLen}}, reg: {{.RegInfo}}, asm: "{{.Asm}}", commutative: {{.Comm}}, typ: "{{.Type}}", resultInArg0: {{.ResultInArg0}}},
@ -61,7 +61,7 @@ func writeSIMDMachineOps(ops []Operation) *bytes.Buffer {
 		"v11": true, "v21": true, "v2k": true, "v2kv": true, "v2kk": true, "vkv": true, "v31": true, "v3kv": true, "vgpv": true, "vgp": true, "vfpv": true, "vfpkv": true,
 		"w11": true, "w21": true, "w2k": true, "w2kw": true, "w2kk": true, "wkw": true, "w31": true, "w3kw": true, "wgpw": true, "wgp": true, "wfpw": true, "wfpkw": true,
 		"wkwload": true, "v21load": true, "v31load": true, "v11load": true, "w21load": true, "w31load": true, "w2kload": true, "w2kwload": true, "w11load": true,
-		"w3kwload": true, "w2kkload": true}
+		"w3kwload": true, "w2kkload": true, "v31x0AtIn2": true}
 	opsData := make([]opData, 0)
 	opsDataImm := make([]opData, 0)
 	opsDataLoad := make([]opData, 0)
--- a/src/simd/_gen/simdgen/gen_simdTypes.go
+++ b/src/simd/_gen/simdgen/gen_simdTypes.go
@ -352,6 +352,15 @@ func ({{.Op1NameAndType "x"}}) {{.Go}}({{.Op2NameAndType "y"}}, {{.ImmName}} uin
 func ({{.Op1NameAndType "x"}}) {{.Go}}({{.ImmName}} uint8, {{.Op2NameAndType "y"}}) {{.GoType}}
 {{end}}

+{{define "op2Imm8_SHA1RNDS4"}}
+{{if .Documentation}}{{.Documentation}}
+//{{end}}
+// {{.ImmName}} results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
+func ({{.Op1NameAndType "x"}}) {{.Go}}({{.ImmName}} uint8, {{.Op2NameAndType "y"}}) {{.GoType}}
+{{end}}
+
 {{define "op3Imm8"}}
 {{if .Documentation}}{{.Documentation}}
 //{{end}}
--- a/src/simd/_gen/simdgen/gen_simdssa.go
+++ b/src/simd/_gen/simdgen/gen_simdssa.go
@ -96,6 +96,9 @@ func writeSIMDSSA(ops []Operation) *bytes.Buffer {
 		"v2kvloadImm8",
 		"v31ResultInArg0Imm8",
 		"v31loadResultInArg0Imm8",
+		"v21ResultInArg0",
+		"v21ResultInArg0Imm8",
+		"v31x0AtIn2ResultInArg0",
 	}
 	regInfoSet := map[string][]string{}
 	for _, key := range regInfoKeys {
--- a/src/simd/_gen/simdgen/gen_utility.go
+++ b/src/simd/_gen/simdgen/gen_utility.go
@ -236,9 +236,9 @@ func (op *Operation) shape() (shapeIn inShape, shapeOut outShape, maskType maskS
 // regShape returns a string representation of the register shape.
 func (op *Operation) regShape(mem memShape) (string, error) {
 	_, _, _, _, gOp := op.shape()
-	var regInfo string
+	var regInfo, fixedName string
 	var vRegInCnt, gRegInCnt, kMaskInCnt, vRegOutCnt, gRegOutCnt, kMaskOutCnt, memInCnt, memOutCnt int
-	for _, in := range gOp.In {
+	for i, in := range gOp.In {
 		switch in.Class {
 		case "vreg":
 			vRegInCnt++
@ -253,8 +253,11 @@ func (op *Operation) regShape(mem memShape) (string, error) {
 			memInCnt++
 			vRegInCnt++
 		}
+		if in.FixedReg != nil {
+			fixedName = fmt.Sprintf("%sAtIn%d", *in.FixedReg, i)
+		}
 	}
-	for _, out := range gOp.Out {
+	for i, out := range gOp.Out {
 		// If class overwrite is happening, that's not really a mask but a vreg.
 		if out.Class == "vreg" || out.OverwriteClass != nil {
 			vRegOutCnt++
@ -269,6 +272,9 @@ func (op *Operation) regShape(mem memShape) (string, error) {
 			vRegOutCnt++
 			memOutCnt++
 		}
+		if out.FixedReg != nil {
+			fixedName = fmt.Sprintf("%sAtIn%d", *out.FixedReg, i)
+		}
 	}
 	var inRegs, inMasks, outRegs, outMasks string

@ -309,6 +315,7 @@ func (op *Operation) regShape(mem memShape) (string, error) {
 	if memOutCnt > 0 {
 		panic("simdgen does not understand memory as output as of now")
 	}
+	regInfo += fixedName
 	return regInfo, nil
 }

--- a/src/simd/_gen/simdgen/godefs.go
+++ b/src/simd/_gen/simdgen/godefs.go
@ -256,6 +256,8 @@ type Operand struct {
 	// because Intel's XED data is inconsistent. e.g. AVX512 VPMADDUBSW marks its operand
 	// elemBits 16, which should be 8.
 	OverwriteElementBits *int
+	// FixedReg is the name of the fixed register, if any.
+	FixedReg *string
 }

 // isDigit returns true if the byte is an ASCII digit.
--- a/src/simd/_gen/simdgen/main.go
+++ b/src/simd/_gen/simdgen/main.go
@ -92,8 +92,9 @@ import (
 	"slices"
 	"strings"

-	"gopkg.in/yaml.v3"
 	"simd/_gen/unify"
+
+	"gopkg.in/yaml.v3"
 )

 var (
@ -199,6 +200,15 @@ func main() {
 		log.Fatal(err)
 	}

+	// Validate results.
+	//
+	// Don't validate if this is a command-line query because that tends to
+	// eliminate lots of required defs and is used in cases where maybe defs
+	// aren't enumerable anyway.
+	if *flagQ == "" && len(must) > 0 {
+		validate(unified, must)
+	}
+
 	// Print results.
 	switch *flagO {
 	case "yaml":
@ -228,15 +238,6 @@ func main() {
 			fmt.Fprintf(os.Stderr, "XED decoding generated %d \"errors\" which is not cause for alarm, use -v for details.\n", operandRemarks)
 		}
 	}
-
-	// Validate results.
-	//
-	// Don't validate if this is a command-line query because that tends to
-	// eliminate lots of required defs and is used in cases where maybe defs
-	// aren't enumerable anyway.
-	if *flagQ == "" && len(must) > 0 {
-		validate(unified, must)
-	}
 }

 func validate(cl unify.Closure, required map[*unify.Value]struct{}) {
--- a/src/simd/_gen/simdgen/ops/Others/categories.yaml
+++ b/src/simd/_gen/simdgen/ops/Others/categories.yaml
@ -46,4 +46,63 @@
  documentation: !string |-
    // NAME performs the InvMixColumns operation in AES cipher algorithm defined in FIPS 197.
    // x is the chunk of w array in use.
-    // result = InvMixColumns(x)
+    // result = InvMixColumns(x)
+- go: SHA1Round4
+  commutative: false
+  documentation: !string |-
+    // NAME performs 4 rounds of B loop in SHA1 algorithm defined in FIPS 180-4.
+    // x contains the state variables a, b, c and d from upper to lower order.
+    // y contains the W array elements (with the state variable e added to the upper element) from upper to lower order.
+    // result = the state variables a', b', c', d' updated after 4 rounds.
+    // constant = 0 for the first 20 rounds of the loop, 1 for the next 20 rounds of the loop..., 3 for the last 20 rounds of the loop.
+- go: SHA1NextE
+  commutative: false
+  documentation: !string |-
+    // NAME calculates the state variable e' updated after 4 rounds in SHA1 algorithm defined in FIPS 180-4.
+    // x contains the state variable a (before the 4 rounds), placed in the upper element.
+    // y is the elements of W array for next 4 rounds from upper to lower order.
+    // result = the elements of the W array for the next 4 rounds, with the updated state variable e' added to the upper element,
+    // from upper to lower order.
+    // For the last round of the loop, you can specify zero for y to obtain the e' value itself, or better off specifying H4:0:0:0
+    // for y to get e' added to H4. (Note that the value of e' is computed only from x, and values of y don't affect the
+    // computation of the value of e'.)
+- go: SHA1Msg1
+  commutative: false
+  documentation: !string |-
+    // NAME does the XORing of 1 in SHA1 algorithm defined in FIPS 180-4.
+    // x = {W3, W2, W1, W0}
+    // y = {0, 0, W5, W4}
+    // result = {W3^W5, W2^W4, W1^W3, W0^W2}.
+- go: SHA1Msg2
+  commutative: false
+  documentation: !string |-
+    // NAME does the calculation of 3 and 4 in SHA1 algorithm defined in FIPS 180-4.
+    // x = result of 2.
+    // y = {W15, W14, W13}
+    // result = {W19, W18, W17, W16}
+- go: SHA256Rounds2
+  commutative: false
+  documentation: !string |-
+    // NAME does 2 rounds of B loop to calculate updated state variables in SHA256 algorithm defined in FIPS 180-4.
+    // x = {h, g, d, c}
+    // y = {f, e, b, a}
+    // z = {W0+K0, W1+K1}
+    // result = {f', e', b', a'}
+    // The K array is a 64-DWORD constant array defined in page 11 of FIPS 180-4. Each element of the K array is to be added to
+    // the corresponding element of the W array to make the input data z.
+    // The updated state variables c', d', g', h' are not returned by this instruction, because they are equal to the input data
+    // y (the state variables a, b, e, f before the 2 rounds).
+- go: SHA256Msg1
+  commutative: false
+  documentation: !string |-
+    // NAME does the sigma and addition of 1 in SHA256 algorithm defined in FIPS 180-4.
+    // x = {W0, W1, W2, W3}
+    // y = {W4, 0, 0, 0}
+    // result = {W0+σ(W1), W1+σ(W2), W2+σ(W3), W3+σ(W4)}
+- go: SHA256Msg2
+  commutative: false
+  documentation: !string |-
+    // NAME does the sigma and addition of 3 in SHA256 algorithm defined in FIPS 180-4.
+    // x = result of 2
+    // y = {0, 0, W14, W15}
+    // result = {W16, W17, W18, W19}
--- a/src/simd/_gen/simdgen/ops/Others/go.yaml
+++ b/src/simd/_gen/simdgen/ops/Others/go.yaml
@ -52,4 +52,45 @@
  in:
  - *uint32s
  out:
-  - *uint32s
+  - *uint32s
+- go: SHA1Round4
+  asm: SHA1RNDS4
+  operandOrder: "SHA1RNDS4"
+  in: &2any1imm
+  - *any
+  - *any
+  - class: immediate
+    immOffset: 0
+  out: &1any
+  - *any
+- go: SHA1NextE
+  asm: SHA1NEXTE
+  in: &2any
+  - *any
+  - *any
+  out: *1any
+- go: SHA1Msg1
+  asm: SHA1MSG1
+  in: *2any
+  out: *1any
+- go: SHA1Msg2
+  asm: SHA1MSG2
+  in: *2any
+  out: *1any
+- go: SHA256Rounds2
+  asm: SHA256RNDS2
+  in:
+  - base: $t
+  - base: $t
+  - base: $t
+    overwriteElementBits: 32
+  out:
+  - base: $t
+- go: SHA256Msg1
+  asm: SHA256MSG1
+  in: *2any
+  out: *1any
+- go: SHA256Msg2
+  # Must bind to SHA256MSG2: SHA256MSG1 is a different instruction and already backs SHA256Msg1 above.
+  asm: SHA256MSG2
+  in: *2any
+  out: *1any
--- a/src/simd/_gen/simdgen/xed.go
+++ b/src/simd/_gen/simdgen/xed.go
@ -25,7 +25,6 @@ const (
 	NOT_REG_CLASS = iota // not a register
 	VREG_CLASS           // classify as a vector register; see
 	GREG_CLASS           // classify as a general register
-	REG_FIXED            // classify as a fixed  register
 )

 // instVariant is a bitmap indicating a variant of an instruction that has
@ -852,7 +851,7 @@ type fixedReg struct {
 }

 var fixedRegMap = map[string]fixedReg{
-	"XED_REG_XMM0": {REG_FIXED, "XMM0", 128},
+	"XED_REG_XMM0": {VREG_CLASS, "x0", 128},
 }

 // decodeReg returns class (NOT_REG_CLASS, VREG_CLASS, GREG_CLASS, VREG_CLASS_FIXED,
--- a/src/simd/cpu.go
+++ b/src/simd/cpu.go
@ -106,3 +106,11 @@ func HasAVX512VPOPCNTDQ() bool {
 func HasAVXVNNI() bool {
 	return cpu.X86.HasAVXVNNI
 }
+
+// HasSHA returns whether the CPU supports the SHA feature, which the
+// SHA1* and SHA256* operations in this package list as their CPU feature.
+//
+// HasSHA is defined on all GOARCHes, but will only return true on
+// GOARCH amd64.
+func HasSHA() bool {
+	return cpu.X86.HasSHA
+}
--- a/src/simd/ops_amd64.go
+++ b/src/simd/ops_amd64.go
@ -5623,6 +5623,156 @@ func (x Float64x4) RoundToEvenScaledResidue(prec uint8) Float64x4
 // Asm: VREDUCEPD, CPU Feature: AVX512
 func (x Float64x8) RoundToEvenScaledResidue(prec uint8) Float64x8

+/* SHA1Msg1 */
+
+// SHA1Msg1 does the XORing of 1 in SHA1 algorithm defined in FIPS 180-4.
+// x = {W3, W2, W1, W0}
+// y = {0, 0, W5, W4}
+// result = {W3^W5, W2^W4, W1^W3, W0^W2}.
+//
+// Asm: SHA1MSG1, CPU Feature: SHA
+func (x Int32x4) SHA1Msg1(y Int32x4) Int32x4
+
+// SHA1Msg1 does the XORing of 1 in SHA1 algorithm defined in FIPS 180-4.
+// x = {W3, W2, W1, W0}
+// y = {0, 0, W5, W4}
+// result = {W3^W5, W2^W4, W1^W3, W0^W2}.
+//
+// Asm: SHA1MSG1, CPU Feature: SHA
+func (x Uint32x4) SHA1Msg1(y Uint32x4) Uint32x4
+
+/* SHA1Msg2 */
+
+// SHA1Msg2 does the calculation of 3 and 4 in SHA1 algorithm defined in FIPS 180-4.
+// x = result of 2.
+// y = {W15, W14, W13}
+// result = {W19, W18, W17, W16}
+//
+// Asm: SHA1MSG2, CPU Feature: SHA
+func (x Int32x4) SHA1Msg2(y Int32x4) Int32x4
+
+// SHA1Msg2 does the calculation of 3 and 4 in SHA1 algorithm defined in FIPS 180-4.
+// x = result of 2.
+// y = {W15, W14, W13}
+// result = {W19, W18, W17, W16}
+//
+// Asm: SHA1MSG2, CPU Feature: SHA
+func (x Uint32x4) SHA1Msg2(y Uint32x4) Uint32x4
+
+/* SHA1NextE */
+
+// SHA1NextE calculates the state variable e' updated after 4 rounds in SHA1 algorithm defined in FIPS 180-4.
+// x contains the state variable a (before the 4 rounds), placed in the upper element.
+// y is the elements of W array for next 4 rounds from upper to lower order.
+// result = the elements of the W array for the next 4 rounds, with the updated state variable e' added to the upper element,
+// from upper to lower order.
+// For the last round of the loop, you can specify zero for y to obtain the e' value itself, or better off specifying H4:0:0:0
+// for y to get e' added to H4. (Note that the value of e' is computed only from x, and values of y don't affect the
+// computation of the value of e'.)
+//
+// Asm: SHA1NEXTE, CPU Feature: SHA
+func (x Int32x4) SHA1NextE(y Int32x4) Int32x4
+
+// SHA1NextE calculates the state variable e' updated after 4 rounds in SHA1 algorithm defined in FIPS 180-4.
+// x contains the state variable a (before the 4 rounds), placed in the upper element.
+// y is the elements of W array for next 4 rounds from upper to lower order.
+// result = the elements of the W array for the next 4 rounds, with the updated state variable e' added to the upper element,
+// from upper to lower order.
+// For the last round of the loop, you can specify zero for y to obtain the e' value itself, or better off specifying H4:0:0:0
+// for y to get e' added to H4. (Note that the value of e' is computed only from x, and values of y don't affect the
+// computation of the value of e'.)
+//
+// Asm: SHA1NEXTE, CPU Feature: SHA
+func (x Uint32x4) SHA1NextE(y Uint32x4) Uint32x4
+
+/* SHA1Round4 */
+
+// SHA1Round4 performs 4 rounds of B loop in SHA1 algorithm defined in FIPS 180-4.
+// x contains the state variables a, b, c and d from upper to lower order.
+// y contains the W array elements (with the state variable e added to the upper element) from upper to lower order.
+// result = the state variables a', b', c', d' updated after 4 rounds.
+// constant = 0 for the first 20 rounds of the loop, 1 for the next 20 rounds of the loop..., 3 for the last 20 rounds of the loop.
+//
+// constant results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: SHA1RNDS4, CPU Feature: SHA
+func (x Int32x4) SHA1Round4(constant uint8, y Int32x4) Int32x4
+
+// SHA1Round4 performs 4 rounds of B loop in SHA1 algorithm defined in FIPS 180-4.
+// x contains the state variables a, b, c and d from upper to lower order.
+// y contains the W array elements (with the state variable e added to the upper element) from upper to lower order.
+// result = the state variables a', b', c', d' updated after 4 rounds.
+// constant = 0 for the first 20 rounds of the loop, 1 for the next 20 rounds of the loop..., 3 for the last 20 rounds of the loop.
+//
+// constant results in better performance when it's a constant, a non-constant value will be translated into a jump table.
+//
+// Asm: SHA1RNDS4, CPU Feature: SHA
+func (x Uint32x4) SHA1Round4(constant uint8, y Uint32x4) Uint32x4
+
+/* SHA256Msg1 */
+
+// SHA256Msg1 does the sigma and addition of 1 in SHA256 algorithm defined in FIPS 180-4.
+// x = {W0, W1, W2, W3}
+// y = {W4, 0, 0, 0}
+// result = {W0+σ(W1), W1+σ(W2), W2+σ(W3), W3+σ(W4)}
+//
+// Asm: SHA256MSG1, CPU Feature: SHA
+func (x Int32x4) SHA256Msg1(y Int32x4) Int32x4
+
+// SHA256Msg1 does the sigma and addition of 1 in SHA256 algorithm defined in FIPS 180-4.
+// x = {W0, W1, W2, W3}
+// y = {W4, 0, 0, 0}
+// result = {W0+σ(W1), W1+σ(W2), W2+σ(W3), W3+σ(W4)}
+//
+// Asm: SHA256MSG1, CPU Feature: SHA
+func (x Uint32x4) SHA256Msg1(y Uint32x4) Uint32x4
+
+/* SHA256Msg2 */
+
+// SHA256Msg2 does the sigma and addition of 3 in SHA256 algorithm defined in FIPS 180-4.
+// x = result of 2
+// y = {0, 0, W14, W15}
+// result = {W16, W17, W18, W19}
+//
+// Asm: SHA256MSG1, CPU Feature: SHA
+func (x Int32x4) SHA256Msg2(y Int32x4) Int32x4
+
+// SHA256Msg2 does the sigma and addition of 3 in SHA256 algorithm defined in FIPS 180-4.
+// x = result of 2
+// y = {0, 0, W14, W15}
+// result = {W16, W17, W18, W19}
+//
+// Asm: SHA256MSG1, CPU Feature: SHA
+func (x Uint32x4) SHA256Msg2(y Uint32x4) Uint32x4
+
+/* SHA256Rounds2 */
+
+// SHA256Rounds2 does 2 rounds of B loop to calculate updated state variables in SHA256 algorithm defined in FIPS 180-4.
+// x = {h, g, d, c}
+// y = {f, e, b, a}
+// z = {W0+K0, W1+K1}
+// result = {f', e', b', a'}
+// The K array is a 64-DWORD constant array defined in page 11 of FIPS 180-4. Each element of the K array is to be added to
+// the corresponding element of the W array to make the input data z.
+// The updated state variables c', d', g', h' are not returned by this instruction, because they are equal to the input data
+// y (the state variables a, b, e, f before the 2 rounds).
+//
+// Asm: SHA256RNDS2, CPU Feature: SHA
+func (x Int32x4) SHA256Rounds2(y Int32x4, z Int32x4) Int32x4
+
+// SHA256Rounds2 does 2 rounds of B loop to calculate updated state variables in SHA256 algorithm defined in FIPS 180-4.
+// x = {h, g, d, c}
+// y = {f, e, b, a}
+// z = {W0+K0, W1+K1}
+// result = {f', e', b', a'}
+// The K array is a 64-DWORD constant array defined in page 11 of FIPS 180-4. Each element of the K array is to be added to
+// the corresponding element of the W array to make the input data z.
+// The updated state variables c', d', g', h' are not returned by this instruction, because they are equal to the input data
+// y (the state variables a, b, e, f before the 2 rounds).
+//
+// Asm: SHA256RNDS2, CPU Feature: SHA
+func (x Uint32x4) SHA256Rounds2(y Uint32x4, z Uint32x4) Uint32x4
+
 /* Scale */

 // Scale multiplies elements by a power of 2.