[dev.simd] cmd/compile, simd: add SHA features

This CL also fixed some bugs left in CL 712181.

Change-Id: I9cb6cd9fbaef307f352809bf21b8fec3eb62721a
Reviewed-on: https://go-review.googlesource.com/c/go/+/712361
Reviewed-by: David Chase <drchase@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
This commit is contained in:
Junyang Shao 2025-10-16 16:07:32 +00:00
parent 2b8eded4f4
commit cf7c1a4cbb
22 changed files with 843 additions and 235 deletions

View file

@ -1955,6 +1955,18 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPTERNLOGQ512load:
p = simdV31loadResultInArg0Imm8(s, v)
case ssa.OpAMD64SHA1MSG1128,
ssa.OpAMD64SHA1MSG2128,
ssa.OpAMD64SHA1NEXTE128,
ssa.OpAMD64SHA256MSG1128:
p = simdV21ResultInArg0(s, v)
case ssa.OpAMD64SHA1RNDS4128:
p = simdV21ResultInArg0Imm8(s, v)
case ssa.OpAMD64SHA256RNDS2128:
p = simdV31x0AtIn2ResultInArg0(s, v)
default:
// Unknown reg shape
return false

View file

@ -2349,6 +2349,32 @@ func simdV2kvloadImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
return p
}
// simdV21ResultInArg0 emits a two-input vector instruction for a
// result-in-arg0 op: the source operand is the register holding v.Args[1]
// and the destination operand is the register assigned to v.
//
// Example instruction: SHA1NEXTE X2, X2
func simdV21ResultInArg0(s *ssagen.State, v *ssa.Value) *obj.Prog {
	prog := s.Prog(v.Op.Asm())
	src, dst := &prog.From, &prog.To
	// Source: the second SSA argument.
	src.Type = obj.TYPE_REG
	src.Reg = simdReg(v.Args[1])
	// Destination: the value's own register.
	dst.Type = obj.TYPE_REG
	dst.Reg = simdReg(v)
	return prog
}
// simdV21ResultInArg0Imm8 emits a two-input vector instruction that also
// carries an 8-bit immediate: the immediate (v's UInt8 aux value) is the
// first operand, the register of v.Args[1] is appended as an extra source,
// and the destination is the register assigned to v.
//
// Example instruction: SHA1RNDS4 $1, X2, X2
func simdV21ResultInArg0Imm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
	prog := s.Prog(v.Op.Asm())
	// Immediate operand taken from the op's aux field.
	imm := &prog.From
	imm.Type = obj.TYPE_CONST
	imm.Offset = int64(v.AuxUInt8())
	// Second SSA argument goes in as the additional register source.
	prog.AddRestSourceReg(simdReg(v.Args[1]))
	dst := &prog.To
	dst.Type = obj.TYPE_REG
	dst.Reg = simdReg(v)
	return prog
}
// simdV31x0AtIn2ResultInArg0 emits a three-input, result-in-arg0 vector
// instruction by delegating, unchanged, to simdV31ResultInArg0.
// NOTE(review): presumably no extra work is needed here because the op's
// register constraints already force the third input into X0 — confirm
// against the regInfo used by the op.
//
// Example instruction: SHA256RNDS2 X0, X11, X2
func simdV31x0AtIn2ResultInArg0(s *ssagen.State, v *ssa.Value) *obj.Prog {
return simdV31ResultInArg0(s, v)
}
var blockJump = [...]struct {
asm, invasm obj.As
}{

View file

@ -135,6 +135,7 @@ func init() {
vz = v | x15
wz = w | x15
x0 = buildReg("X0")
)
// Common slices of register masks
var (
@ -213,7 +214,7 @@ func init() {
vstorek = regInfo{inputs: []regMask{gpspsb, mask, v, 0}}
v11 = regInfo{inputs: vzonly, outputs: vonly}
v21 = regInfo{inputs: []regMask{vz, vz}, outputs: vonly}
v21 = regInfo{inputs: []regMask{v, vz}, outputs: vonly} // used in resultInArg0 ops, arg0 must not be x15
vk = regInfo{inputs: vzonly, outputs: maskonly}
kv = regInfo{inputs: maskonly, outputs: vonly}
v2k = regInfo{inputs: []regMask{vz, vz}, outputs: maskonly}
@ -247,17 +248,18 @@ func init() {
// These register masks are used by SIMD only, they follow the pattern:
// Mem last, k mask second to last (if any), address right before mem and k mask.
wkwload = regInfo{inputs: []regMask{gpspsb, mask, 0}, outputs: wonly}
v21load = regInfo{inputs: []regMask{vz, gpspsb, 0}, outputs: vonly}
v31load = regInfo{inputs: []regMask{v, vz, gpspsb, 0}, outputs: vonly} // used in resultInArg0 ops, arg0 must not be x15
v11load = regInfo{inputs: []regMask{gpspsb, 0}, outputs: vonly}
w21load = regInfo{inputs: []regMask{wz, gpspsb, 0}, outputs: wonly}
w31load = regInfo{inputs: []regMask{w, wz, gpspsb, 0}, outputs: wonly} // used in resultInArg0 ops, arg0 must not be x15
w2kload = regInfo{inputs: []regMask{wz, gpspsb, 0}, outputs: maskonly}
w2kwload = regInfo{inputs: []regMask{wz, gpspsb, mask, 0}, outputs: wonly}
w11load = regInfo{inputs: []regMask{gpspsb, 0}, outputs: wonly}
w3kwload = regInfo{inputs: []regMask{w, wz, gpspsb, mask, 0}, outputs: wonly} // used in resultInArg0 ops, arg0 must not be x15
w2kkload = regInfo{inputs: []regMask{wz, gpspsb, mask, 0}, outputs: maskonly}
wkwload = regInfo{inputs: []regMask{gpspsb, mask, 0}, outputs: wonly}
v21load = regInfo{inputs: []regMask{v, gpspsb, 0}, outputs: vonly} // used in resultInArg0 ops, arg0 must not be x15
v31load = regInfo{inputs: []regMask{v, vz, gpspsb, 0}, outputs: vonly} // used in resultInArg0 ops, arg0 must not be x15
v11load = regInfo{inputs: []regMask{gpspsb, 0}, outputs: vonly}
w21load = regInfo{inputs: []regMask{wz, gpspsb, 0}, outputs: wonly}
w31load = regInfo{inputs: []regMask{w, wz, gpspsb, 0}, outputs: wonly} // used in resultInArg0 ops, arg0 must not be x15
w2kload = regInfo{inputs: []regMask{wz, gpspsb, 0}, outputs: maskonly}
w2kwload = regInfo{inputs: []regMask{wz, gpspsb, mask, 0}, outputs: wonly}
w11load = regInfo{inputs: []regMask{gpspsb, 0}, outputs: wonly}
w3kwload = regInfo{inputs: []regMask{w, wz, gpspsb, mask, 0}, outputs: wonly} // used in resultInArg0 ops, arg0 must not be x15
w2kkload = regInfo{inputs: []regMask{wz, gpspsb, mask, 0}, outputs: maskonly}
v31x0AtIn2 = regInfo{inputs: []regMask{v, vz, x0}, outputs: vonly} // used in resultInArg0 ops, arg0 must not be x15
kload = regInfo{inputs: []regMask{gpspsb, 0}, outputs: maskonly}
kstore = regInfo{inputs: []regMask{gpspsb, mask, 0}}
@ -1477,7 +1479,7 @@ func init() {
genSIMDfile: "../../amd64/simdssa.go",
ops: append(AMD64ops, simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vfpkv,
w11, w21, w2k, wkw, w2kw, w2kk, w31, w3kw, wgpw, wgp, wfpw, wfpkw, wkwload, v21load, v31load, v11load,
w21load, w31load, w2kload, w2kwload, w11load, w3kwload, w2kkload)...), // AMD64ops,
w21load, w31load, w2kload, w2kwload, w11load, w3kwload, w2kkload, v31x0AtIn2)...), // AMD64ops,
blocks: AMD64blocks,
regnames: regNamesAMD64,
ParamIntRegNames: "AX BX CX DI SI R8 R9 R10 R11",

View file

@ -939,6 +939,20 @@
(RoundToEvenScaledResidueFloat64x2 [a] x) => (VREDUCEPD128 [a+0] x)
(RoundToEvenScaledResidueFloat64x4 [a] x) => (VREDUCEPD256 [a+0] x)
(RoundToEvenScaledResidueFloat64x8 [a] x) => (VREDUCEPD512 [a+0] x)
(SHA1Msg1Int32x4 ...) => (SHA1MSG1128 ...)
(SHA1Msg1Uint32x4 ...) => (SHA1MSG1128 ...)
(SHA1Msg2Int32x4 ...) => (SHA1MSG2128 ...)
(SHA1Msg2Uint32x4 ...) => (SHA1MSG2128 ...)
(SHA1NextEInt32x4 ...) => (SHA1NEXTE128 ...)
(SHA1NextEUint32x4 ...) => (SHA1NEXTE128 ...)
(SHA1Round4Int32x4 ...) => (SHA1RNDS4128 ...)
(SHA1Round4Uint32x4 ...) => (SHA1RNDS4128 ...)
(SHA256Msg1Int32x4 ...) => (SHA256MSG1128 ...)
(SHA256Msg1Uint32x4 ...) => (SHA256MSG1128 ...)
(SHA256Msg2Int32x4 ...) => (SHA256MSG1128 ...)
(SHA256Msg2Uint32x4 ...) => (SHA256MSG1128 ...)
(SHA256Rounds2Int32x4 ...) => (SHA256RNDS2128 ...)
(SHA256Rounds2Uint32x4 ...) => (SHA256RNDS2128 ...)
(ScaleFloat32x4 ...) => (VSCALEFPS128 ...)
(ScaleFloat32x8 ...) => (VSCALEFPS256 ...)
(ScaleFloat32x16 ...) => (VSCALEFPS512 ...)

View file

@ -3,8 +3,13 @@
package main
func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vfpkv, w11, w21, w2k, wkw, w2kw, w2kk, w31, w3kw, wgpw, wgp, wfpw, wfpkw,
wkwload, v21load, v31load, v11load, w21load, w31load, w2kload, w2kwload, w11load, w3kwload, w2kkload regInfo) []opData {
wkwload, v21load, v31load, v11load, w21load, w31load, w2kload, w2kwload, w11load, w3kwload, w2kkload, v31x0AtIn2 regInfo) []opData {
return []opData{
{name: "SHA1MSG1128", argLength: 2, reg: v21, asm: "SHA1MSG1", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "SHA1MSG2128", argLength: 2, reg: v21, asm: "SHA1MSG2", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "SHA1NEXTE128", argLength: 2, reg: v21, asm: "SHA1NEXTE", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "SHA256MSG1128", argLength: 2, reg: v21, asm: "SHA256MSG1", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "SHA256RNDS2128", argLength: 3, reg: v31x0AtIn2, asm: "SHA256RNDS2", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "VADDPD128", argLength: 2, reg: v21, asm: "VADDPD", commutative: true, typ: "Vec128", resultInArg0: false},
{name: "VADDPD256", argLength: 2, reg: v21, asm: "VADDPD", commutative: true, typ: "Vec256", resultInArg0: false},
{name: "VADDPD512", argLength: 2, reg: w21, asm: "VADDPD", commutative: true, typ: "Vec512", resultInArg0: false},
@ -1216,6 +1221,7 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPRORQMasked128", argLength: 2, reg: wkw, asm: "VPRORQ", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPRORQMasked256", argLength: 2, reg: wkw, asm: "VPRORQ", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPRORQMasked512", argLength: 2, reg: wkw, asm: "VPRORQ", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "SHA1RNDS4128", argLength: 2, reg: v21, asm: "SHA1RNDS4", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "VPERM2F128256", argLength: 2, reg: v21, asm: "VPERM2F128", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPERM2I128256", argLength: 2, reg: v21, asm: "VPERM2I128", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPINSRD128", argLength: 2, reg: vgpv, asm: "VPINSRD", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false},

View file

@ -844,6 +844,18 @@ func simdGenericOps() []opData {
{name: "RoundToEvenFloat32x8", argLength: 1, commutative: false},
{name: "RoundToEvenFloat64x2", argLength: 1, commutative: false},
{name: "RoundToEvenFloat64x4", argLength: 1, commutative: false},
{name: "SHA1Msg1Int32x4", argLength: 2, commutative: false},
{name: "SHA1Msg1Uint32x4", argLength: 2, commutative: false},
{name: "SHA1Msg2Int32x4", argLength: 2, commutative: false},
{name: "SHA1Msg2Uint32x4", argLength: 2, commutative: false},
{name: "SHA1NextEInt32x4", argLength: 2, commutative: false},
{name: "SHA1NextEUint32x4", argLength: 2, commutative: false},
{name: "SHA256Msg1Int32x4", argLength: 2, commutative: false},
{name: "SHA256Msg1Uint32x4", argLength: 2, commutative: false},
{name: "SHA256Msg2Int32x4", argLength: 2, commutative: false},
{name: "SHA256Msg2Uint32x4", argLength: 2, commutative: false},
{name: "SHA256Rounds2Int32x4", argLength: 3, commutative: false},
{name: "SHA256Rounds2Uint32x4", argLength: 3, commutative: false},
{name: "ScaleFloat32x4", argLength: 2, commutative: false},
{name: "ScaleFloat32x8", argLength: 2, commutative: false},
{name: "ScaleFloat32x16", argLength: 2, commutative: false},
@ -1206,6 +1218,8 @@ func simdGenericOps() []opData {
{name: "RoundToEvenScaledResidueFloat64x2", argLength: 1, commutative: false, aux: "UInt8"},
{name: "RoundToEvenScaledResidueFloat64x4", argLength: 1, commutative: false, aux: "UInt8"},
{name: "RoundToEvenScaledResidueFloat64x8", argLength: 1, commutative: false, aux: "UInt8"},
{name: "SHA1Round4Int32x4", argLength: 2, commutative: false, aux: "UInt8"},
{name: "SHA1Round4Uint32x4", argLength: 2, commutative: false, aux: "UInt8"},
{name: "Select128FromPairFloat32x8", argLength: 2, commutative: false, aux: "UInt8"},
{name: "Select128FromPairFloat64x4", argLength: 2, commutative: false, aux: "UInt8"},
{name: "Select128FromPairInt32x8", argLength: 2, commutative: false, aux: "UInt8"},

File diff suppressed because it is too large Load diff

View file

@ -4978,6 +4978,48 @@ func rewriteValueAMD64(v *Value) bool {
return rewriteValueAMD64_OpRsh8x64(v)
case OpRsh8x8:
return rewriteValueAMD64_OpRsh8x8(v)
case OpSHA1Msg1Int32x4:
v.Op = OpAMD64SHA1MSG1128
return true
case OpSHA1Msg1Uint32x4:
v.Op = OpAMD64SHA1MSG1128
return true
case OpSHA1Msg2Int32x4:
v.Op = OpAMD64SHA1MSG2128
return true
case OpSHA1Msg2Uint32x4:
v.Op = OpAMD64SHA1MSG2128
return true
case OpSHA1NextEInt32x4:
v.Op = OpAMD64SHA1NEXTE128
return true
case OpSHA1NextEUint32x4:
v.Op = OpAMD64SHA1NEXTE128
return true
case OpSHA1Round4Int32x4:
v.Op = OpAMD64SHA1RNDS4128
return true
case OpSHA1Round4Uint32x4:
v.Op = OpAMD64SHA1RNDS4128
return true
case OpSHA256Msg1Int32x4:
v.Op = OpAMD64SHA256MSG1128
return true
case OpSHA256Msg1Uint32x4:
v.Op = OpAMD64SHA256MSG1128
return true
case OpSHA256Msg2Int32x4:
v.Op = OpAMD64SHA256MSG1128
return true
case OpSHA256Msg2Uint32x4:
v.Op = OpAMD64SHA256MSG1128
return true
case OpSHA256Rounds2Int32x4:
v.Op = OpAMD64SHA256RNDS2128
return true
case OpSHA256Rounds2Uint32x4:
v.Op = OpAMD64SHA256RNDS2128
return true
case OpScaleFloat32x16:
v.Op = OpAMD64VSCALEFPS512
return true

View file

@ -1987,6 +1987,19 @@ func opLen2Imm8_II(op ssa.Op, t *types.Type, _ int) func(s *state, n *ir.CallExp
}
}
// opLen2Imm8_SHA1RNDS4 returns an intrinsic builder for a two-vector op that
// takes an 8-bit immediate, where args[0] and args[2] are the vector operands
// and args[1] is the immediate.
//
// The assembler requires the imm value of a SHA1RNDS4 instruction to be one
// of 0, 1, 2 or 3, so the (shifted) immediate is always masked to its low two
// bits. A constant immediate is encoded directly; any other value is
// dispatched through immJumpTable.
func opLen2Imm8_SHA1RNDS4(op ssa.Op, t *types.Type, offset int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
	// Only these bits of the immediate are accepted by the assembler.
	const immMask = 0b11
	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		x, imm, y := args[0], args[1], args[2]
		if imm.Op != ssa.OpConst8 {
			return immJumpTable(s, imm, n, func(sNew *state, idx int) {
				// Encode as int8 due to requirement of AuxInt, check its comment for details.
				aux := int64(int8(idx<<offset)) & immMask
				s.vars[n] = sNew.newValue2I(op, t, aux, x, y)
			})
		}
		return s.newValue2I(op, t, (imm.AuxInt<<int64(offset))&immMask, x, y)
	}
}
func opLen3Imm8_2I(op ssa.Op, t *types.Type, offset int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
if args[2].Op == ssa.OpConst8 {

View file

@ -951,6 +951,20 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Float64x2.RoundToEvenScaledResidue", opLen1Imm8(ssa.OpRoundToEvenScaledResidueFloat64x2, types.TypeVec128, 4), sys.AMD64)
addF(simdPackage, "Float64x4.RoundToEvenScaledResidue", opLen1Imm8(ssa.OpRoundToEvenScaledResidueFloat64x4, types.TypeVec256, 4), sys.AMD64)
addF(simdPackage, "Float64x8.RoundToEvenScaledResidue", opLen1Imm8(ssa.OpRoundToEvenScaledResidueFloat64x8, types.TypeVec512, 4), sys.AMD64)
addF(simdPackage, "Int32x4.SHA1Msg1", opLen2(ssa.OpSHA1Msg1Int32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint32x4.SHA1Msg1", opLen2(ssa.OpSHA1Msg1Uint32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int32x4.SHA1Msg2", opLen2(ssa.OpSHA1Msg2Int32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint32x4.SHA1Msg2", opLen2(ssa.OpSHA1Msg2Uint32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int32x4.SHA1NextE", opLen2(ssa.OpSHA1NextEInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint32x4.SHA1NextE", opLen2(ssa.OpSHA1NextEUint32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int32x4.SHA1Round4", opLen2Imm8_SHA1RNDS4(ssa.OpSHA1Round4Int32x4, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Uint32x4.SHA1Round4", opLen2Imm8_SHA1RNDS4(ssa.OpSHA1Round4Uint32x4, types.TypeVec128, 0), sys.AMD64)
addF(simdPackage, "Int32x4.SHA256Msg1", opLen2(ssa.OpSHA256Msg1Int32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint32x4.SHA256Msg1", opLen2(ssa.OpSHA256Msg1Uint32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int32x4.SHA256Msg2", opLen2(ssa.OpSHA256Msg2Int32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint32x4.SHA256Msg2", opLen2(ssa.OpSHA256Msg2Uint32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int32x4.SHA256Rounds2", opLen3(ssa.OpSHA256Rounds2Int32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint32x4.SHA256Rounds2", opLen3(ssa.OpSHA256Rounds2Uint32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float32x4.Scale", opLen2(ssa.OpScaleFloat32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float32x8.Scale", opLen2(ssa.OpScaleFloat32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float32x16.Scale", opLen2(ssa.OpScaleFloat32x16, types.TypeVec512), sys.AMD64)

View file

@ -58,6 +58,8 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
{{end}}
{{define "op2Imm8_II"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen2Imm8_II(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64)
{{end}}
{{define "op2Imm8_SHA1RNDS4"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen2Imm8_SHA1RNDS4(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64)
{{end}}
{{define "op3Imm8"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen3Imm8(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64)
{{end}}
{{define "op3Imm8_2I"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen3Imm8_2I(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64)

View file

@ -16,7 +16,7 @@ const simdMachineOpsTmpl = `
package main
func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vfpkv, w11, w21, w2k, wkw, w2kw, w2kk, w31, w3kw, wgpw, wgp, wfpw, wfpkw,
wkwload, v21load, v31load, v11load, w21load, w31load, w2kload, w2kwload, w11load, w3kwload, w2kkload regInfo) []opData {
wkwload, v21load, v31load, v11load, w21load, w31load, w2kload, w2kwload, w11load, w3kwload, w2kkload, v31x0AtIn2 regInfo) []opData {
return []opData{
{{- range .OpsData }}
{name: "{{.OpName}}", argLength: {{.OpInLen}}, reg: {{.RegInfo}}, asm: "{{.Asm}}", commutative: {{.Comm}}, typ: "{{.Type}}", resultInArg0: {{.ResultInArg0}}},
@ -61,7 +61,7 @@ func writeSIMDMachineOps(ops []Operation) *bytes.Buffer {
"v11": true, "v21": true, "v2k": true, "v2kv": true, "v2kk": true, "vkv": true, "v31": true, "v3kv": true, "vgpv": true, "vgp": true, "vfpv": true, "vfpkv": true,
"w11": true, "w21": true, "w2k": true, "w2kw": true, "w2kk": true, "wkw": true, "w31": true, "w3kw": true, "wgpw": true, "wgp": true, "wfpw": true, "wfpkw": true,
"wkwload": true, "v21load": true, "v31load": true, "v11load": true, "w21load": true, "w31load": true, "w2kload": true, "w2kwload": true, "w11load": true,
"w3kwload": true, "w2kkload": true}
"w3kwload": true, "w2kkload": true, "v31x0AtIn2": true}
opsData := make([]opData, 0)
opsDataImm := make([]opData, 0)
opsDataLoad := make([]opData, 0)

View file

@ -352,6 +352,15 @@ func ({{.Op1NameAndType "x"}}) {{.Go}}({{.Op2NameAndType "y"}}, {{.ImmName}} uin
func ({{.Op1NameAndType "x"}}) {{.Go}}({{.ImmName}} uint8, {{.Op2NameAndType "y"}}) {{.GoType}}
{{end}}
{{define "op2Imm8_SHA1RNDS4"}}
{{if .Documentation}}{{.Documentation}}
//{{end}}
// {{.ImmName}} results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}}
func ({{.Op1NameAndType "x"}}) {{.Go}}({{.ImmName}} uint8, {{.Op2NameAndType "y"}}) {{.GoType}}
{{end}}
{{define "op3Imm8"}}
{{if .Documentation}}{{.Documentation}}
//{{end}}

View file

@ -96,6 +96,9 @@ func writeSIMDSSA(ops []Operation) *bytes.Buffer {
"v2kvloadImm8",
"v31ResultInArg0Imm8",
"v31loadResultInArg0Imm8",
"v21ResultInArg0",
"v21ResultInArg0Imm8",
"v31x0AtIn2ResultInArg0",
}
regInfoSet := map[string][]string{}
for _, key := range regInfoKeys {

View file

@ -236,9 +236,9 @@ func (op *Operation) shape() (shapeIn inShape, shapeOut outShape, maskType maskS
// regShape returns a string representation of the register shape.
func (op *Operation) regShape(mem memShape) (string, error) {
_, _, _, _, gOp := op.shape()
var regInfo string
var regInfo, fixedName string
var vRegInCnt, gRegInCnt, kMaskInCnt, vRegOutCnt, gRegOutCnt, kMaskOutCnt, memInCnt, memOutCnt int
for _, in := range gOp.In {
for i, in := range gOp.In {
switch in.Class {
case "vreg":
vRegInCnt++
@ -253,8 +253,11 @@ func (op *Operation) regShape(mem memShape) (string, error) {
memInCnt++
vRegInCnt++
}
if in.FixedReg != nil {
fixedName = fmt.Sprintf("%sAtIn%d", *in.FixedReg, i)
}
}
for _, out := range gOp.Out {
for i, out := range gOp.Out {
// If class overwrite is happening, that's not really a mask but a vreg.
if out.Class == "vreg" || out.OverwriteClass != nil {
vRegOutCnt++
@ -269,6 +272,9 @@ func (op *Operation) regShape(mem memShape) (string, error) {
vRegOutCnt++
memOutCnt++
}
if out.FixedReg != nil {
fixedName = fmt.Sprintf("%sAtIn%d", *out.FixedReg, i)
}
}
var inRegs, inMasks, outRegs, outMasks string
@ -309,6 +315,7 @@ func (op *Operation) regShape(mem memShape) (string, error) {
if memOutCnt > 0 {
panic("simdgen does not understand memory as output as of now")
}
regInfo += fixedName
return regInfo, nil
}

View file

@ -256,6 +256,8 @@ type Operand struct {
// because Intel's XED data is inconsistent. e.g. AVX512 VPMADDUBSW marks its operand
// elemBits 16, which should be 8.
OverwriteElementBits *int
// FixedReg, if non-nil, is the name of the fixed register this operand must use (e.g. "x0").
FixedReg *string
}
// isDigit returns true if the byte is an ASCII digit.

View file

@ -92,8 +92,9 @@ import (
"slices"
"strings"
"gopkg.in/yaml.v3"
"simd/_gen/unify"
"gopkg.in/yaml.v3"
)
var (
@ -199,6 +200,15 @@ func main() {
log.Fatal(err)
}
// Validate results.
//
// Don't validate if this is a command-line query because that tends to
// eliminate lots of required defs and is used in cases where maybe defs
// aren't enumerable anyway.
if *flagQ == "" && len(must) > 0 {
validate(unified, must)
}
// Print results.
switch *flagO {
case "yaml":
@ -228,15 +238,6 @@ func main() {
fmt.Fprintf(os.Stderr, "XED decoding generated %d \"errors\" which is not cause for alarm, use -v for details.\n", operandRemarks)
}
}
// Validate results.
//
// Don't validate if this is a command-line query because that tends to
// eliminate lots of required defs and is used in cases where maybe defs
// aren't enumerable anyway.
if *flagQ == "" && len(must) > 0 {
validate(unified, must)
}
}
func validate(cl unify.Closure, required map[*unify.Value]struct{}) {

View file

@ -46,4 +46,63 @@
documentation: !string |-
// NAME performs the InvMixColumns operation in AES cipher algorithm defined in FIPS 197.
// x is the chunk of w array in use.
// result = InvMixColumns(x)
// result = InvMixColumns(x)
- go: SHA1Round4
commutative: false
documentation: !string |-
// NAME performs 4 rounds of B loop in SHA1 algorithm defined in FIPS 180-4.
// x contains the state variables a, b, c and d from upper to lower order.
// y contains the W array elements (with the state variable e added to the upper element) from upper to lower order.
// result = the state variables a', b', c', d' updated after 4 rounds.
// constant = 0 for the first 20 rounds of the loop, 1 for the next 20 rounds of the loop..., 3 for the last 20 rounds of the loop.
- go: SHA1NextE
commutative: false
documentation: !string |-
// NAME calculates the state variable e' updated after 4 rounds in SHA1 algorithm defined in FIPS 180-4.
// x contains the state variable a (before the 4 rounds), placed in the upper element.
// y is the elements of W array for next 4 rounds from upper to lower order.
// result = the elements of the W array for the next 4 rounds, with the updated state variable e' added to the upper element,
// from upper to lower order.
// For the last round of the loop, you can specify zero for y to obtain the e' value itself, or better off specifying H4:0:0:0
// for y to get e' added to H4. (Note that the value of e' is computed only from x, and values of y don't affect the
// computation of the value of e'.)
- go: SHA1Msg1
commutative: false
documentation: !string |-
// NAME does the XORing of 1 in SHA1 algorithm defined in FIPS 180-4.
// x = {W3, W2, W1, W0}
// y = {0, 0, W5, W4}
// result = {W3^W5, W2^W4, W1^W3, W0^W2}.
- go: SHA1Msg2
commutative: false
documentation: !string |-
// NAME does the calculation of 3 and 4 in SHA1 algorithm defined in FIPS 180-4.
// x = result of 2.
// y = {W15, W14, W13}
// result = {W19, W18, W17, W16}
- go: SHA256Rounds2
commutative: false
documentation: !string |-
// NAME does 2 rounds of B loop to calculate updated state variables in SHA256 algorithm defined in FIPS 180-4.
// x = {h, g, d, c}
// y = {f, e, b, a}
// z = {W0+K0, W1+K1}
// result = {f', e', b', a'}
// The K array is a 64-DWORD constant array defined in page 11 of FIPS 180-4. Each element of the K array is to be added to
// the corresponding element of the W array to make the input data z.
// The updated state variables c', d', g', h' are not returned by this instruction, because they are equal to the input data
// y (the state variables a, b, e, f before the 2 rounds).
- go: SHA256Msg1
commutative: false
documentation: !string |-
// NAME does the sigma and addition of 1 in SHA256 algorithm defined in FIPS 180-4.
// x = {W0, W1, W2, W3}
// y = {W4, 0, 0, 0}
// result = {W0+σ(W1), W1+σ(W2), W2+σ(W3), W3+σ(W4)}
- go: SHA256Msg2
commutative: false
documentation: !string |-
// NAME does the sigma and addition of 3 in SHA256 algorithm defined in FIPS 180-4.
// x = result of 2
// y = {0, 0, W14, W15}
// result = {W16, W17, W18, W19}

View file

@ -52,4 +52,45 @@
in:
- *uint32s
out:
- *uint32s
- *uint32s
- go: SHA1Round4
asm: SHA1RNDS4
operandOrder: "SHA1RNDS4"
in: &2any1imm
- *any
- *any
- class: immediate
immOffset: 0
out: &1any
- *any
- go: SHA1NextE
asm: SHA1NEXTE
in: &2any
- *any
- *any
out: *1any
- go: SHA1Msg1
asm: SHA1MSG1
in: *2any
out: *1any
- go: SHA1Msg2
asm: SHA1MSG2
in: *2any
out: *1any
- go: SHA256Rounds2
asm: SHA256RNDS2
in:
- base: $t
- base: $t
- base: $t
overwriteElementBits: 32
out:
- base: $t
- go: SHA256Msg1
asm: SHA256MSG1
in: *2any
out: *1any
# SHA256Msg2 must lower to SHA256MSG2; it previously reused SHA256MSG1, which
# made it indistinguishable from SHA256Msg1. Regenerate the SIMD files after
# changing this entry.
- go: SHA256Msg2
  asm: SHA256MSG2
  in: *2any
  out: *1any

View file

@ -25,7 +25,6 @@ const (
NOT_REG_CLASS = iota // not a register
VREG_CLASS // classify as a vector register; see
GREG_CLASS // classify as a general register
REG_FIXED // classify as a fixed register
)
// instVariant is a bitmap indicating a variant of an instruction that has
@ -852,7 +851,7 @@ type fixedReg struct {
}
var fixedRegMap = map[string]fixedReg{
"XED_REG_XMM0": {REG_FIXED, "XMM0", 128},
"XED_REG_XMM0": {VREG_CLASS, "x0", 128},
}
// decodeReg returns class (NOT_REG_CLASS, VREG_CLASS, GREG_CLASS, VREG_CLASS_FIXED,

View file

@ -106,3 +106,11 @@ func HasAVX512VPOPCNTDQ() bool {
func HasAVXVNNI() bool {
return cpu.X86.HasAVXVNNI
}
// HasSHA reports whether the CPU supports the SHA feature.
//
// HasSHA is defined on all GOARCHes, but will only return true on
// GOARCH amd64.
func HasSHA() bool {
return cpu.X86.HasSHA
}

View file

@ -5623,6 +5623,156 @@ func (x Float64x4) RoundToEvenScaledResidue(prec uint8) Float64x4
// Asm: VREDUCEPD, CPU Feature: AVX512
func (x Float64x8) RoundToEvenScaledResidue(prec uint8) Float64x8
/* SHA1Msg1 */
// SHA1Msg1 does the XORing of 1 in SHA1 algorithm defined in FIPS 180-4.
// x = {W3, W2, W1, W0}
// y = {0, 0, W5, W4}
// result = {W3^W5, W2^W4, W1^W3, W0^W2}.
//
// Asm: SHA1MSG1, CPU Feature: SHA
func (x Int32x4) SHA1Msg1(y Int32x4) Int32x4
// SHA1Msg1 does the XORing of 1 in SHA1 algorithm defined in FIPS 180-4.
// x = {W3, W2, W1, W0}
// y = {0, 0, W5, W4}
// result = {W3^W5, W2^W4, W1^W3, W0^W2}.
//
// Asm: SHA1MSG1, CPU Feature: SHA
func (x Uint32x4) SHA1Msg1(y Uint32x4) Uint32x4
/* SHA1Msg2 */
// SHA1Msg2 does the calculation of 3 and 4 in SHA1 algorithm defined in FIPS 180-4.
// x = result of 2.
// y = {W15, W14, W13}
// result = {W19, W18, W17, W16}
//
// Asm: SHA1MSG2, CPU Feature: SHA
func (x Int32x4) SHA1Msg2(y Int32x4) Int32x4
// SHA1Msg2 does the calculation of 3 and 4 in SHA1 algorithm defined in FIPS 180-4.
// x = result of 2.
// y = {W15, W14, W13}
// result = {W19, W18, W17, W16}
//
// Asm: SHA1MSG2, CPU Feature: SHA
func (x Uint32x4) SHA1Msg2(y Uint32x4) Uint32x4
/* SHA1NextE */
// SHA1NextE calculates the state variable e' updated after 4 rounds in SHA1 algorithm defined in FIPS 180-4.
// x contains the state variable a (before the 4 rounds), placed in the upper element.
// y is the elements of W array for next 4 rounds from upper to lower order.
// result = the elements of the W array for the next 4 rounds, with the updated state variable e' added to the upper element,
// from upper to lower order.
// For the last round of the loop, you can specify zero for y to obtain the e' value itself, or better off specifying H4:0:0:0
// for y to get e' added to H4. (Note that the value of e' is computed only from x, and values of y don't affect the
// computation of the value of e'.)
//
// Asm: SHA1NEXTE, CPU Feature: SHA
func (x Int32x4) SHA1NextE(y Int32x4) Int32x4
// SHA1NextE calculates the state variable e' updated after 4 rounds in SHA1 algorithm defined in FIPS 180-4.
// x contains the state variable a (before the 4 rounds), placed in the upper element.
// y is the elements of W array for next 4 rounds from upper to lower order.
// result = the elements of the W array for the next 4 rounds, with the updated state variable e' added to the upper element,
// from upper to lower order.
// For the last round of the loop, you can specify zero for y to obtain the e' value itself, or better off specifying H4:0:0:0
// for y to get e' added to H4. (Note that the value of e' is computed only from x, and values of y don't affect the
// computation of the value of e'.)
//
// Asm: SHA1NEXTE, CPU Feature: SHA
func (x Uint32x4) SHA1NextE(y Uint32x4) Uint32x4
/* SHA1Round4 */
// SHA1Round4 performs 4 rounds of B loop in SHA1 algorithm defined in FIPS 180-4.
// x contains the state variables a, b, c and d from upper to lower order.
// y contains the W array elements (with the state variable e added to the upper element) from upper to lower order.
// result = the state variables a', b', c', d' updated after 4 rounds.
// constant = 0 for the first 20 rounds of the loop, 1 for the next 20 rounds of the loop..., 3 for the last 20 rounds of the loop.
//
// constant results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: SHA1RNDS4, CPU Feature: SHA
func (x Int32x4) SHA1Round4(constant uint8, y Int32x4) Int32x4
// SHA1Round4 performs 4 rounds of B loop in SHA1 algorithm defined in FIPS 180-4.
// x contains the state variables a, b, c and d from upper to lower order.
// y contains the W array elements (with the state variable e added to the upper element) from upper to lower order.
// result = the state variables a', b', c', d' updated after 4 rounds.
// constant = 0 for the first 20 rounds of the loop, 1 for the next 20 rounds of the loop..., 3 for the last 20 rounds of the loop.
//
// constant results in better performance when it's a constant, a non-constant value will be translated into a jump table.
//
// Asm: SHA1RNDS4, CPU Feature: SHA
func (x Uint32x4) SHA1Round4(constant uint8, y Uint32x4) Uint32x4
/* SHA256Msg1 */
// SHA256Msg1 does the sigma and addition of 1 in SHA256 algorithm defined in FIPS 180-4.
// x = {W0, W1, W2, W3}
// y = {W4, 0, 0, 0}
// result = {W0+σ(W1), W1+σ(W2), W2+σ(W3), W3+σ(W4)}
//
// Asm: SHA256MSG1, CPU Feature: SHA
func (x Int32x4) SHA256Msg1(y Int32x4) Int32x4
// SHA256Msg1 does the sigma and addition of 1 in SHA256 algorithm defined in FIPS 180-4.
// x = {W0, W1, W2, W3}
// y = {W4, 0, 0, 0}
// result = {W0+σ(W1), W1+σ(W2), W2+σ(W3), W3+σ(W4)}
//
// Asm: SHA256MSG1, CPU Feature: SHA
func (x Uint32x4) SHA256Msg1(y Uint32x4) Uint32x4
/* SHA256Msg2 */
// SHA256Msg2 does the sigma and addition of 3 in SHA256 algorithm defined in FIPS 180-4.
// x = result of 2
// y = {0, 0, W14, W15}
// result = {W16, W17, W18, W19}
//
// Asm: SHA256MSG1, CPU Feature: SHA
func (x Int32x4) SHA256Msg2(y Int32x4) Int32x4
// SHA256Msg2 does the sigma and addition of 3 in SHA256 algorithm defined in FIPS 180-4.
// x = result of 2
// y = {0, 0, W14, W15}
// result = {W16, W17, W18, W19}
//
// Asm: SHA256MSG1, CPU Feature: SHA
func (x Uint32x4) SHA256Msg2(y Uint32x4) Uint32x4
/* SHA256Rounds2 */
// SHA256Rounds2 does 2 rounds of B loop to calculate updated state variables in SHA256 algorithm defined in FIPS 180-4.
// x = {h, g, d, c}
// y = {f, e, b, a}
// z = {W0+K0, W1+K1}
// result = {f', e', b', a'}
// The K array is a 64-DWORD constant array defined in page 11 of FIPS 180-4. Each element of the K array is to be added to
// the corresponding element of the W array to make the input data z.
// The updated state variables c', d', g', h' are not returned by this instruction, because they are equal to the input data
// y (the state variables a, b, e, f before the 2 rounds).
//
// Asm: SHA256RNDS2, CPU Feature: SHA
func (x Int32x4) SHA256Rounds2(y Int32x4, z Int32x4) Int32x4
// SHA256Rounds2 does 2 rounds of B loop to calculate updated state variables in SHA256 algorithm defined in FIPS 180-4.
// x = {h, g, d, c}
// y = {f, e, b, a}
// z = {W0+K0, W1+K1}
// result = {f', e', b', a'}
// The K array is a 64-DWORD constant array defined in page 11 of FIPS 180-4. Each element of the K array is to be added to
// the corresponding element of the W array to make the input data z.
// The updated state variables c', d', g', h' are not returned by this instruction, because they are equal to the input data
// y (the state variables a, b, e, f before the 2 rounds).
//
// Asm: SHA256RNDS2, CPU Feature: SHA
func (x Uint32x4) SHA256Rounds2(y Uint32x4, z Uint32x4) Uint32x4
/* Scale */
// Scale multiplies elements by a power of 2.