diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go index fe2ae019acd..86d44c12452 100644 --- a/src/cmd/compile/internal/amd64/simdssa.go +++ b/src/cmd/compile/internal/amd64/simdssa.go @@ -1939,6 +1939,22 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VPSHRDQMasked512load: p = simdV2kvloadImm8(s, v) + case ssa.OpAMD64VPTERNLOGD128, + ssa.OpAMD64VPTERNLOGD256, + ssa.OpAMD64VPTERNLOGD512, + ssa.OpAMD64VPTERNLOGQ128, + ssa.OpAMD64VPTERNLOGQ256, + ssa.OpAMD64VPTERNLOGQ512: + p = simdV31ResultInArg0Imm8(s, v) + + case ssa.OpAMD64VPTERNLOGD128load, + ssa.OpAMD64VPTERNLOGD256load, + ssa.OpAMD64VPTERNLOGD512load, + ssa.OpAMD64VPTERNLOGQ128load, + ssa.OpAMD64VPTERNLOGQ256load, + ssa.OpAMD64VPTERNLOGQ512load: + p = simdV31loadResultInArg0Imm8(s, v) + default: // Unknown reg shape return false diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go index 25fa7b695a2..b3f8191609f 100644 --- a/src/cmd/compile/internal/amd64/ssa.go +++ b/src/cmd/compile/internal/amd64/ssa.go @@ -2095,6 +2095,39 @@ func simdV31ResultInArg0(s *ssagen.State, v *ssa.Value) *obj.Prog { return p } +func simdV31ResultInArg0Imm8(s *ssagen.State, v *ssa.Value) *obj.Prog { + p := s.Prog(v.Op.Asm()) + p.From.Offset = int64(v.AuxUInt8()) + p.From.Type = obj.TYPE_CONST + + p.AddRestSourceReg(simdReg(v.Args[2])) + p.AddRestSourceReg(simdReg(v.Args[1])) + // p.AddRestSourceReg(x86.REG_K0) + p.To.Type = obj.TYPE_REG + p.To.Reg = simdReg(v) + return p +} + +// v31loadResultInArg0Imm8 +// Example instruction: +// for (VPTERNLOGD128load {sym} [makeValAndOff(int32(int8(c)),off)] x y ptr mem) +func simdV31loadResultInArg0Imm8(s *ssagen.State, v *ssa.Value) *obj.Prog { + sc := v.AuxValAndOff() + p := s.Prog(v.Op.Asm()) + + p.From.Type = obj.TYPE_CONST + p.From.Offset = sc.Val64() + + m := obj.Addr{Type: obj.TYPE_MEM, Reg: v.Args[2].Reg()} + ssagen.AddAux2(&m, v, sc.Off64()) + p.AddRestSource(m) + + // NOTE(review): set the destination register, mirroring simdV31ResultInArg0Imm8; + // without p.To the generated Prog has no output operand for the resultInArg0 op. + p.To.Type = obj.TYPE_REG + p.To.Reg = simdReg(v) + 
p.AddRestSourceReg(simdReg(v.Args[1])) + return p +} + // Example instruction: VFMADD213PD Z2, Z1, K1, Z0 func simdV3kvResultInArg0(s *ssagen.State, v *ssa.Value) *obj.Prog { p := s.Prog(v.Op.Asm()) diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules index 9e34d4b8816..2cda679f2dd 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules @@ -1320,6 +1320,18 @@ (moveMaskedUint16x32 x mask) => (VMOVDQU16Masked512 x (VPMOVVec16x32ToM mask)) (moveMaskedUint32x16 x mask) => (VMOVDQU32Masked512 x (VPMOVVec32x16ToM mask)) (moveMaskedUint64x8 x mask) => (VMOVDQU64Masked512 x (VPMOVVec64x8ToM mask)) +(ternInt32x4 ...) => (VPTERNLOGD128 ...) +(ternInt32x8 ...) => (VPTERNLOGD256 ...) +(ternInt32x16 ...) => (VPTERNLOGD512 ...) +(ternInt64x2 ...) => (VPTERNLOGQ128 ...) +(ternInt64x4 ...) => (VPTERNLOGQ256 ...) +(ternInt64x8 ...) => (VPTERNLOGQ512 ...) +(ternUint32x4 ...) => (VPTERNLOGD128 ...) +(ternUint32x8 ...) => (VPTERNLOGD256 ...) +(ternUint32x16 ...) => (VPTERNLOGD512 ...) +(ternUint64x2 ...) => (VPTERNLOGQ128 ...) +(ternUint64x4 ...) => (VPTERNLOGQ256 ...) +(ternUint64x8 ...) => (VPTERNLOGQ512 ...) 
(VMOVDQU8Masked512 (VPABSB512 x) mask) => (VPABSBMasked512 x mask) (VMOVDQU16Masked512 (VPABSW512 x) mask) => (VPABSWMasked512 x mask) (VMOVDQU32Masked512 (VPABSD512 x) mask) => (VPABSDMasked512 x mask) @@ -2047,3 +2059,9 @@ (VPSRAQMasked128const [c] l:(VMOVDQUload128 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRAQMasked128constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) (VPSRAQMasked256const [c] l:(VMOVDQUload256 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRAQMasked256constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) (VPSRAQMasked512const [c] l:(VMOVDQUload512 {sym} [off] ptr mem) mask) && canMergeLoad(v, l) && clobber(l) => (VPSRAQMasked512constload {sym} [makeValAndOff(int32(int8(c)),off)] ptr mask mem) +(VPTERNLOGD128 [c] x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPTERNLOGD128load {sym} [makeValAndOff(int32(int8(c)),off)] x y ptr mem) +(VPTERNLOGD256 [c] x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPTERNLOGD256load {sym} [makeValAndOff(int32(int8(c)),off)] x y ptr mem) +(VPTERNLOGD512 [c] x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPTERNLOGD512load {sym} [makeValAndOff(int32(int8(c)),off)] x y ptr mem) +(VPTERNLOGQ128 [c] x y l:(VMOVDQUload128 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPTERNLOGQ128load {sym} [makeValAndOff(int32(int8(c)),off)] x y ptr mem) +(VPTERNLOGQ256 [c] x y l:(VMOVDQUload256 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPTERNLOGQ256load {sym} [makeValAndOff(int32(int8(c)),off)] x y ptr mem) +(VPTERNLOGQ512 [c] x y l:(VMOVDQUload512 {sym} [off] ptr mem)) && canMergeLoad(v, l) && clobber(l) => (VPTERNLOGQ512load {sym} [makeValAndOff(int32(int8(c)),off)] x y ptr mem) diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go index 
2cdf80c1ba1..add281c6b92 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go @@ -1322,6 +1322,12 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPSRAQMasked128const", argLength: 2, reg: wkw, asm: "VPSRAQ", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPSRAQMasked256const", argLength: 2, reg: wkw, asm: "VPSRAQ", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPSRAQMasked512const", argLength: 2, reg: wkw, asm: "VPSRAQ", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPTERNLOGD128", argLength: 3, reg: w31, asm: "VPTERNLOGD", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: true}, + {name: "VPTERNLOGD256", argLength: 3, reg: w31, asm: "VPTERNLOGD", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: true}, + {name: "VPTERNLOGD512", argLength: 3, reg: w31, asm: "VPTERNLOGD", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: true}, + {name: "VPTERNLOGQ128", argLength: 3, reg: w31, asm: "VPTERNLOGQ", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: true}, + {name: "VPTERNLOGQ256", argLength: 3, reg: w31, asm: "VPTERNLOGQ", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: true}, + {name: "VPTERNLOGQ512", argLength: 3, reg: w31, asm: "VPTERNLOGQ", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: true}, {name: "VPABSD512load", argLength: 2, reg: w11load, asm: "VPABSD", commutative: false, typ: "Vec512", aux: "SymOff", symEffect: "Read", resultInArg0: false}, {name: "VPABSQ128load", argLength: 2, reg: w11load, asm: "VPABSQ", commutative: false, typ: "Vec128", aux: "SymOff", symEffect: "Read", resultInArg0: false}, {name: "VPABSQ256load", argLength: 2, reg: w11load, asm: "VPABSQ", commutative: false, typ: "Vec256", aux: "SymOff", symEffect: "Read", resultInArg0: false}, @@ -1870,5 +1876,11 
@@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPSRAQMasked128constload", argLength: 3, reg: wkwload, asm: "VPSRAQ", commutative: false, typ: "Vec128", aux: "SymValAndOff", symEffect: "Read", resultInArg0: false}, {name: "VPSRAQMasked256constload", argLength: 3, reg: wkwload, asm: "VPSRAQ", commutative: false, typ: "Vec256", aux: "SymValAndOff", symEffect: "Read", resultInArg0: false}, {name: "VPSRAQMasked512constload", argLength: 3, reg: wkwload, asm: "VPSRAQ", commutative: false, typ: "Vec512", aux: "SymValAndOff", symEffect: "Read", resultInArg0: false}, + {name: "VPTERNLOGD128load", argLength: 4, reg: w31load, asm: "VPTERNLOGD", commutative: false, typ: "Vec128", aux: "SymValAndOff", symEffect: "Read", resultInArg0: true}, + {name: "VPTERNLOGD256load", argLength: 4, reg: w31load, asm: "VPTERNLOGD", commutative: false, typ: "Vec256", aux: "SymValAndOff", symEffect: "Read", resultInArg0: true}, + {name: "VPTERNLOGD512load", argLength: 4, reg: w31load, asm: "VPTERNLOGD", commutative: false, typ: "Vec512", aux: "SymValAndOff", symEffect: "Read", resultInArg0: true}, + {name: "VPTERNLOGQ128load", argLength: 4, reg: w31load, asm: "VPTERNLOGQ", commutative: false, typ: "Vec128", aux: "SymValAndOff", symEffect: "Read", resultInArg0: true}, + {name: "VPTERNLOGQ256load", argLength: 4, reg: w31load, asm: "VPTERNLOGQ", commutative: false, typ: "Vec256", aux: "SymValAndOff", symEffect: "Read", resultInArg0: true}, + {name: "VPTERNLOGQ512load", argLength: 4, reg: w31load, asm: "VPTERNLOGQ", commutative: false, typ: "Vec512", aux: "SymValAndOff", symEffect: "Read", resultInArg0: true}, } } diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go index f5eb9075d71..546f6c0bc58 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go +++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go @@ -1288,5 +1288,17 @@ func simdGenericOps() []opData { {name: 
"concatSelectedConstantInt64x2", argLength: 2, commutative: false, aux: "UInt8"}, {name: "concatSelectedConstantUint32x4", argLength: 2, commutative: false, aux: "UInt8"}, {name: "concatSelectedConstantUint64x2", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "ternInt32x4", argLength: 3, commutative: false, aux: "UInt8"}, + {name: "ternInt32x8", argLength: 3, commutative: false, aux: "UInt8"}, + {name: "ternInt32x16", argLength: 3, commutative: false, aux: "UInt8"}, + {name: "ternInt64x2", argLength: 3, commutative: false, aux: "UInt8"}, + {name: "ternInt64x4", argLength: 3, commutative: false, aux: "UInt8"}, + {name: "ternInt64x8", argLength: 3, commutative: false, aux: "UInt8"}, + {name: "ternUint32x4", argLength: 3, commutative: false, aux: "UInt8"}, + {name: "ternUint32x8", argLength: 3, commutative: false, aux: "UInt8"}, + {name: "ternUint32x16", argLength: 3, commutative: false, aux: "UInt8"}, + {name: "ternUint64x2", argLength: 3, commutative: false, aux: "UInt8"}, + {name: "ternUint64x4", argLength: 3, commutative: false, aux: "UInt8"}, + {name: "ternUint64x8", argLength: 3, commutative: false, aux: "UInt8"}, } } diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 6dd7082e100..91873744602 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -2562,6 +2562,12 @@ const ( OpAMD64VPSRAQMasked128const OpAMD64VPSRAQMasked256const OpAMD64VPSRAQMasked512const + OpAMD64VPTERNLOGD128 + OpAMD64VPTERNLOGD256 + OpAMD64VPTERNLOGD512 + OpAMD64VPTERNLOGQ128 + OpAMD64VPTERNLOGQ256 + OpAMD64VPTERNLOGQ512 OpAMD64VPABSD512load OpAMD64VPABSQ128load OpAMD64VPABSQ256load @@ -3110,6 +3116,12 @@ const ( OpAMD64VPSRAQMasked128constload OpAMD64VPSRAQMasked256constload OpAMD64VPSRAQMasked512constload + OpAMD64VPTERNLOGD128load + OpAMD64VPTERNLOGD256load + OpAMD64VPTERNLOGD512load + OpAMD64VPTERNLOGQ128load + OpAMD64VPTERNLOGQ256load + OpAMD64VPTERNLOGQ512load OpARMADD OpARMADDconst @@ 
-6669,6 +6681,18 @@ const ( OpconcatSelectedConstantInt64x2 OpconcatSelectedConstantUint32x4 OpconcatSelectedConstantUint64x2 + OpternInt32x4 + OpternInt32x8 + OpternInt32x16 + OpternInt64x2 + OpternInt64x4 + OpternInt64x8 + OpternUint32x4 + OpternUint32x8 + OpternUint32x16 + OpternUint64x2 + OpternUint64x4 + OpternUint64x8 ) var opcodeTable = [...]opInfo{ @@ -39366,6 +39390,108 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPTERNLOGD128", + auxType: auxUInt8, + argLen: 3, + resultInArg0: true, + asm: x86.AVPTERNLOGD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {2, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPTERNLOGD256", + auxType: auxUInt8, + argLen: 3, + resultInArg0: true, + asm: x86.AVPTERNLOGD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {2, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPTERNLOGD512", + auxType: 
auxUInt8, + argLen: 3, + resultInArg0: true, + asm: x86.AVPTERNLOGD, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {2, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPTERNLOGQ128", + auxType: auxUInt8, + argLen: 3, + resultInArg0: true, + asm: x86.AVPTERNLOGQ, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {2, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPTERNLOGQ256", + auxType: auxUInt8, + argLen: 3, + resultInArg0: true, + asm: x86.AVPTERNLOGQ, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {2, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 
X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPTERNLOGQ512", + auxType: auxUInt8, + argLen: 3, + resultInArg0: true, + asm: x86.AVPTERNLOGQ, + reg: regInfo{ + inputs: []inputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {2, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, { name: "VPABSD512load", auxType: auxSymOff, @@ -48504,6 +48630,114 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPTERNLOGD128load", + auxType: auxSymValAndOff, + argLen: 4, + resultInArg0: true, + symEffect: SymRead, + asm: x86.AVPTERNLOGD, + reg: regInfo{ + inputs: []inputInfo{ + {2, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPTERNLOGD256load", + auxType: auxSymValAndOff, + argLen: 4, + resultInArg0: true, + symEffect: SymRead, + asm: 
x86.AVPTERNLOGD, + reg: regInfo{ + inputs: []inputInfo{ + {2, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPTERNLOGD512load", + auxType: auxSymValAndOff, + argLen: 4, + resultInArg0: true, + symEffect: SymRead, + asm: x86.AVPTERNLOGD, + reg: regInfo{ + inputs: []inputInfo{ + {2, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPTERNLOGQ128load", + auxType: auxSymValAndOff, + argLen: 4, + resultInArg0: true, + symEffect: SymRead, + asm: x86.AVPTERNLOGQ, + reg: regInfo{ + inputs: []inputInfo{ + {2, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 
X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPTERNLOGQ256load", + auxType: auxSymValAndOff, + argLen: 4, + resultInArg0: true, + symEffect: SymRead, + asm: x86.AVPTERNLOGQ, + reg: regInfo{ + inputs: []inputInfo{ + {2, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, + { + name: "VPTERNLOGQ512load", + auxType: auxSymValAndOff, + argLen: 4, + resultInArg0: true, + symEffect: SymRead, + asm: x86.AVPTERNLOGQ, + reg: regInfo{ + inputs: []inputInfo{ + {2, 72057594037977087}, // AX CX DX BX SP BP SI DI R8 R9 R10 R11 R12 R13 R15 SB + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + {1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + outputs: []outputInfo{ + {0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31 + }, + }, + }, { name: "ADD", @@ -82840,6 +83074,78 @@ var opcodeTable = [...]opInfo{ argLen: 2, generic: true, }, + { + name: "ternInt32x4", + auxType: auxUInt8, + argLen: 3, + generic: true, + }, + { + name: "ternInt32x8", + auxType: auxUInt8, + argLen: 3, + generic: true, + }, + { + name: "ternInt32x16", + auxType: auxUInt8, + argLen: 3, + generic: true, + }, + { + name: "ternInt64x2", + auxType: auxUInt8, + argLen: 3, + generic: true, + }, + { + name: 
"ternInt64x4", + auxType: auxUInt8, + argLen: 3, + generic: true, + }, + { + name: "ternInt64x8", + auxType: auxUInt8, + argLen: 3, + generic: true, + }, + { + name: "ternUint32x4", + auxType: auxUInt8, + argLen: 3, + generic: true, + }, + { + name: "ternUint32x8", + auxType: auxUInt8, + argLen: 3, + generic: true, + }, + { + name: "ternUint32x16", + auxType: auxUInt8, + argLen: 3, + generic: true, + }, + { + name: "ternUint64x2", + auxType: auxUInt8, + argLen: 3, + generic: true, + }, + { + name: "ternUint64x4", + auxType: auxUInt8, + argLen: 3, + generic: true, + }, + { + name: "ternUint64x8", + auxType: auxUInt8, + argLen: 3, + generic: true, + }, } func (o Op) Asm() obj.As { return opcodeTable[o].asm } diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index 42814029144..89b6d1600b3 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -1609,6 +1609,18 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpAMD64VPSUBQMasked256(v) case OpAMD64VPSUBQMasked512: return rewriteValueAMD64_OpAMD64VPSUBQMasked512(v) + case OpAMD64VPTERNLOGD128: + return rewriteValueAMD64_OpAMD64VPTERNLOGD128(v) + case OpAMD64VPTERNLOGD256: + return rewriteValueAMD64_OpAMD64VPTERNLOGD256(v) + case OpAMD64VPTERNLOGD512: + return rewriteValueAMD64_OpAMD64VPTERNLOGD512(v) + case OpAMD64VPTERNLOGQ128: + return rewriteValueAMD64_OpAMD64VPTERNLOGQ128(v) + case OpAMD64VPTERNLOGQ256: + return rewriteValueAMD64_OpAMD64VPTERNLOGQ256(v) + case OpAMD64VPTERNLOGQ512: + return rewriteValueAMD64_OpAMD64VPTERNLOGQ512(v) case OpAMD64VPUNPCKHDQ512: return rewriteValueAMD64_OpAMD64VPUNPCKHDQ512(v) case OpAMD64VPUNPCKHQDQ512: @@ -6061,6 +6073,42 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpmoveMaskedUint64x8(v) case OpmoveMaskedUint8x64: return rewriteValueAMD64_OpmoveMaskedUint8x64(v) + case OpternInt32x16: + v.Op = OpAMD64VPTERNLOGD512 + return true + 
case OpternInt32x4: + v.Op = OpAMD64VPTERNLOGD128 + return true + case OpternInt32x8: + v.Op = OpAMD64VPTERNLOGD256 + return true + case OpternInt64x2: + v.Op = OpAMD64VPTERNLOGQ128 + return true + case OpternInt64x4: + v.Op = OpAMD64VPTERNLOGQ256 + return true + case OpternInt64x8: + v.Op = OpAMD64VPTERNLOGQ512 + return true + case OpternUint32x16: + v.Op = OpAMD64VPTERNLOGD512 + return true + case OpternUint32x4: + v.Op = OpAMD64VPTERNLOGD128 + return true + case OpternUint32x8: + v.Op = OpAMD64VPTERNLOGD256 + return true + case OpternUint64x2: + v.Op = OpAMD64VPTERNLOGQ128 + return true + case OpternUint64x4: + v.Op = OpAMD64VPTERNLOGQ256 + return true + case OpternUint64x8: + v.Op = OpAMD64VPTERNLOGQ512 + return true } return false } @@ -45655,6 +45703,186 @@ func rewriteValueAMD64_OpAMD64VPSUBQMasked512(v *Value) bool { } return false } +func rewriteValueAMD64_OpAMD64VPTERNLOGD128(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (VPTERNLOGD128 [c] x y l:(VMOVDQUload128 {sym} [off] ptr mem)) + // cond: canMergeLoad(v, l) && clobber(l) + // result: (VPTERNLOGD128load {sym} [makeValAndOff(int32(int8(c)),off)] x y ptr mem) + for { + c := auxIntToUint8(v.AuxInt) + x := v_0 + y := v_1 + l := v_2 + if l.Op != OpAMD64VMOVDQUload128 { + break + } + off := auxIntToInt32(l.AuxInt) + sym := auxToSym(l.Aux) + mem := l.Args[1] + ptr := l.Args[0] + if !(canMergeLoad(v, l) && clobber(l)) { + break + } + v.reset(OpAMD64VPTERNLOGD128load) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.Aux = symToAux(sym) + v.AddArg4(x, y, ptr, mem) + return true + } + return false +} +func rewriteValueAMD64_OpAMD64VPTERNLOGD256(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (VPTERNLOGD256 [c] x y l:(VMOVDQUload256 {sym} [off] ptr mem)) + // cond: canMergeLoad(v, l) && clobber(l) + // result: (VPTERNLOGD256load {sym} [makeValAndOff(int32(int8(c)),off)] x y ptr mem) + for { + c := 
auxIntToUint8(v.AuxInt) + x := v_0 + y := v_1 + l := v_2 + if l.Op != OpAMD64VMOVDQUload256 { + break + } + off := auxIntToInt32(l.AuxInt) + sym := auxToSym(l.Aux) + mem := l.Args[1] + ptr := l.Args[0] + if !(canMergeLoad(v, l) && clobber(l)) { + break + } + v.reset(OpAMD64VPTERNLOGD256load) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.Aux = symToAux(sym) + v.AddArg4(x, y, ptr, mem) + return true + } + return false +} +func rewriteValueAMD64_OpAMD64VPTERNLOGD512(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (VPTERNLOGD512 [c] x y l:(VMOVDQUload512 {sym} [off] ptr mem)) + // cond: canMergeLoad(v, l) && clobber(l) + // result: (VPTERNLOGD512load {sym} [makeValAndOff(int32(int8(c)),off)] x y ptr mem) + for { + c := auxIntToUint8(v.AuxInt) + x := v_0 + y := v_1 + l := v_2 + if l.Op != OpAMD64VMOVDQUload512 { + break + } + off := auxIntToInt32(l.AuxInt) + sym := auxToSym(l.Aux) + mem := l.Args[1] + ptr := l.Args[0] + if !(canMergeLoad(v, l) && clobber(l)) { + break + } + v.reset(OpAMD64VPTERNLOGD512load) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.Aux = symToAux(sym) + v.AddArg4(x, y, ptr, mem) + return true + } + return false +} +func rewriteValueAMD64_OpAMD64VPTERNLOGQ128(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (VPTERNLOGQ128 [c] x y l:(VMOVDQUload128 {sym} [off] ptr mem)) + // cond: canMergeLoad(v, l) && clobber(l) + // result: (VPTERNLOGQ128load {sym} [makeValAndOff(int32(int8(c)),off)] x y ptr mem) + for { + c := auxIntToUint8(v.AuxInt) + x := v_0 + y := v_1 + l := v_2 + if l.Op != OpAMD64VMOVDQUload128 { + break + } + off := auxIntToInt32(l.AuxInt) + sym := auxToSym(l.Aux) + mem := l.Args[1] + ptr := l.Args[0] + if !(canMergeLoad(v, l) && clobber(l)) { + break + } + v.reset(OpAMD64VPTERNLOGQ128load) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.Aux = symToAux(sym) + v.AddArg4(x, y, ptr, mem) + return 
true + } + return false +} +func rewriteValueAMD64_OpAMD64VPTERNLOGQ256(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (VPTERNLOGQ256 [c] x y l:(VMOVDQUload256 {sym} [off] ptr mem)) + // cond: canMergeLoad(v, l) && clobber(l) + // result: (VPTERNLOGQ256load {sym} [makeValAndOff(int32(int8(c)),off)] x y ptr mem) + for { + c := auxIntToUint8(v.AuxInt) + x := v_0 + y := v_1 + l := v_2 + if l.Op != OpAMD64VMOVDQUload256 { + break + } + off := auxIntToInt32(l.AuxInt) + sym := auxToSym(l.Aux) + mem := l.Args[1] + ptr := l.Args[0] + if !(canMergeLoad(v, l) && clobber(l)) { + break + } + v.reset(OpAMD64VPTERNLOGQ256load) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.Aux = symToAux(sym) + v.AddArg4(x, y, ptr, mem) + return true + } + return false +} +func rewriteValueAMD64_OpAMD64VPTERNLOGQ512(v *Value) bool { + v_2 := v.Args[2] + v_1 := v.Args[1] + v_0 := v.Args[0] + // match: (VPTERNLOGQ512 [c] x y l:(VMOVDQUload512 {sym} [off] ptr mem)) + // cond: canMergeLoad(v, l) && clobber(l) + // result: (VPTERNLOGQ512load {sym} [makeValAndOff(int32(int8(c)),off)] x y ptr mem) + for { + c := auxIntToUint8(v.AuxInt) + x := v_0 + y := v_1 + l := v_2 + if l.Op != OpAMD64VMOVDQUload512 { + break + } + off := auxIntToInt32(l.AuxInt) + sym := auxToSym(l.Aux) + mem := l.Args[1] + ptr := l.Args[0] + if !(canMergeLoad(v, l) && clobber(l)) { + break + } + v.reset(OpAMD64VPTERNLOGQ512load) + v.AuxInt = valAndOffToAuxInt(makeValAndOff(int32(int8(c)), off)) + v.Aux = symToAux(sym) + v.AddArg4(x, y, ptr, mem) + return true + } + return false +} func rewriteValueAMD64_OpAMD64VPUNPCKHDQ512(v *Value) bool { v_1 := v.Args[1] v_0 := v.Args[0] diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go index d4fb524b247..5b6b25fb70d 100644 --- a/src/cmd/compile/internal/ssagen/simdintrinsics.go +++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go @@ -1296,6 +1296,18 @@ func 
simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Uint16x32.moveMasked", opLen2(ssa.OpmoveMaskedUint16x32, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint32x16.moveMasked", opLen2(ssa.OpmoveMaskedUint32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint64x8.moveMasked", opLen2(ssa.OpmoveMaskedUint64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Int32x4.tern", opLen3Imm8(ssa.OpternInt32x4, types.TypeVec128, 0), sys.AMD64) + addF(simdPackage, "Int32x8.tern", opLen3Imm8(ssa.OpternInt32x8, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Int32x16.tern", opLen3Imm8(ssa.OpternInt32x16, types.TypeVec512, 0), sys.AMD64) + addF(simdPackage, "Int64x2.tern", opLen3Imm8(ssa.OpternInt64x2, types.TypeVec128, 0), sys.AMD64) + addF(simdPackage, "Int64x4.tern", opLen3Imm8(ssa.OpternInt64x4, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Int64x8.tern", opLen3Imm8(ssa.OpternInt64x8, types.TypeVec512, 0), sys.AMD64) + addF(simdPackage, "Uint32x4.tern", opLen3Imm8(ssa.OpternUint32x4, types.TypeVec128, 0), sys.AMD64) + addF(simdPackage, "Uint32x8.tern", opLen3Imm8(ssa.OpternUint32x8, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Uint32x16.tern", opLen3Imm8(ssa.OpternUint32x16, types.TypeVec512, 0), sys.AMD64) + addF(simdPackage, "Uint64x2.tern", opLen3Imm8(ssa.OpternUint64x2, types.TypeVec128, 0), sys.AMD64) + addF(simdPackage, "Uint64x4.tern", opLen3Imm8(ssa.OpternUint64x4, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Uint64x8.tern", opLen3Imm8(ssa.OpternUint64x8, types.TypeVec512, 0), sys.AMD64) addF(simdPackage, "Float32x4.AsFloat64x2", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Float32x4.AsInt8x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Float32x4.AsInt16x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) 
diff --git a/src/simd/_gen/simdgen/gen_simdssa.go b/src/simd/_gen/simdgen/gen_simdssa.go index b48f5ce831e..c1ce5845493 100644 --- a/src/simd/_gen/simdgen/gen_simdssa.go +++ b/src/simd/_gen/simdgen/gen_simdssa.go @@ -94,6 +94,8 @@ func writeSIMDSSA(ops []Operation) *bytes.Buffer { "v2kloadImm8", "v2kkloadImm8", "v2kvloadImm8", + "v31ResultInArg0Imm8", + "v31loadResultInArg0Imm8", } regInfoSet := map[string][]string{} for _, key := range regInfoKeys { diff --git a/src/simd/_gen/simdgen/ops/BitwiseLogic/categories.yaml b/src/simd/_gen/simdgen/ops/BitwiseLogic/categories.yaml index 3142d1910d3..197e994b54c 100644 --- a/src/simd/_gen/simdgen/ops/BitwiseLogic/categories.yaml +++ b/src/simd/_gen/simdgen/ops/BitwiseLogic/categories.yaml @@ -15,6 +15,11 @@ commutative: true documentation: !string |- // NAME performs a bitwise XOR operation between two vectors. +- go: tern + commutative: false + documentation: !string |- + // NAME performs a logical operation on three vectors based on the 8-bit truth table. + // Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) # We also have PTEST and VPTERNLOG, those should be hidden from the users # and only appear in rewrite rules. 
diff --git a/src/simd/_gen/simdgen/ops/BitwiseLogic/go.yaml b/src/simd/_gen/simdgen/ops/BitwiseLogic/go.yaml index ab344438fb2..ad46115462f 100644 --- a/src/simd/_gen/simdgen/ops/BitwiseLogic/go.yaml +++ b/src/simd/_gen/simdgen/ops/BitwiseLogic/go.yaml @@ -125,4 +125,18 @@ asm: "VPXORD" # Fill in the gap, Or is missing for Uint16x32 and Int16x32 inVariant: [] in: *twoI16x32 - out: *oneI16x32 \ No newline at end of file + out: *oneI16x32 + +- go: tern + asm: "VPTERNLOGD|VPTERNLOGQ" + in: + - &tern_op + go: $t + - *tern_op + - *tern_op + - class: immediate + immOffset: 0 + name: table + inVariant: [] + out: + - *tern_op diff --git a/src/simd/ops_amd64.go b/src/simd/ops_amd64.go index 23316223617..49c387aea9c 100644 --- a/src/simd/ops_amd64.go +++ b/src/simd/ops_amd64.go @@ -7872,6 +7872,104 @@ func (x Uint32x16) moveMasked(mask Mask32x16) Uint32x16 // Asm: VMOVDQU64, CPU Feature: AVX512 func (x Uint64x8) moveMasked(mask Mask64x8) Uint64x8 +/* tern */ + +// tern performs a logical operation on three vectors based on the 8-bit truth table. +// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) +// +// table results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPTERNLOGD, CPU Feature: AVX512 +func (x Int32x4) tern(table uint8, y Int32x4, z Int32x4) Int32x4 + +// tern performs a logical operation on three vectors based on the 8-bit truth table. +// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) +// +// table results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPTERNLOGD, CPU Feature: AVX512 +func (x Int32x8) tern(table uint8, y Int32x8, z Int32x8) Int32x8 + +// tern performs a logical operation on three vectors based on the 8-bit truth table. 
+// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) +// +// table results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPTERNLOGD, CPU Feature: AVX512 +func (x Int32x16) tern(table uint8, y Int32x16, z Int32x16) Int32x16 + +// tern performs a logical operation on three vectors based on the 8-bit truth table. +// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) +// +// table results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPTERNLOGQ, CPU Feature: AVX512 +func (x Int64x2) tern(table uint8, y Int64x2, z Int64x2) Int64x2 + +// tern performs a logical operation on three vectors based on the 8-bit truth table. +// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) +// +// table results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPTERNLOGQ, CPU Feature: AVX512 +func (x Int64x4) tern(table uint8, y Int64x4, z Int64x4) Int64x4 + +// tern performs a logical operation on three vectors based on the 8-bit truth table. +// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) +// +// table results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPTERNLOGQ, CPU Feature: AVX512 +func (x Int64x8) tern(table uint8, y Int64x8, z Int64x8) Int64x8 + +// tern performs a logical operation on three vectors based on the 8-bit truth table. +// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) +// +// table results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPTERNLOGD, CPU Feature: AVX512 +func (x Uint32x4) tern(table uint8, y Uint32x4, z Uint32x4) Uint32x4 + +// tern performs a logical operation on three vectors based on the 8-bit truth table. 
+// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) +// +// table results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPTERNLOGD, CPU Feature: AVX512 +func (x Uint32x8) tern(table uint8, y Uint32x8, z Uint32x8) Uint32x8 + +// tern performs a logical operation on three vectors based on the 8-bit truth table. +// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) +// +// table results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPTERNLOGD, CPU Feature: AVX512 +func (x Uint32x16) tern(table uint8, y Uint32x16, z Uint32x16) Uint32x16 + +// tern performs a logical operation on three vectors based on the 8-bit truth table. +// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) +// +// table results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPTERNLOGQ, CPU Feature: AVX512 +func (x Uint64x2) tern(table uint8, y Uint64x2, z Uint64x2) Uint64x2 + +// tern performs a logical operation on three vectors based on the 8-bit truth table. +// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) +// +// table results in better performance when it's a constant, a non-constant value will be translated into a jump table. +// +// Asm: VPTERNLOGQ, CPU Feature: AVX512 +func (x Uint64x4) tern(table uint8, y Uint64x4, z Uint64x4) Uint64x4 + +// tern performs a logical operation on three vectors based on the 8-bit truth table. +// Bitwise, the result is equal to 1 & (table >> (x<<2 + y<<1 + z)) +// +// table results in better performance when it's a constant, a non-constant value will be translated into a jump table. 
+// +// Asm: VPTERNLOGQ, CPU Feature: AVX512 +func (x Uint64x8) tern(table uint8, y Uint64x8, z Uint64x8) Uint64x8 + // Float64x2 converts from Float32x4 to Float64x2 func (from Float32x4) AsFloat64x2() (to Float64x2) diff --git a/src/simd/pkginternal_test.go b/src/simd/pkginternal_test.go index 632e24d9d9a..c5b46eb0d96 100644 --- a/src/simd/pkginternal_test.go +++ b/src/simd/pkginternal_test.go @@ -47,6 +47,31 @@ func TestConcatSelectedConstantGrouped32(t *testing.T) { test_helpers.CheckSlices[uint32](t, a, []uint32{2, 0, 5, 7, 10, 8, 13, 15}) } +func TestTern(t *testing.T) { + if !HasAVX512() { + t.Skip("This test needs AVX512") + } + x := LoadInt32x8Slice([]int32{0, 0, 0, 0, 1, 1, 1, 1}) + y := LoadInt32x8Slice([]int32{0, 0, 1, 1, 0, 0, 1, 1}) + z := LoadInt32x8Slice([]int32{0, 1, 0, 1, 0, 1, 0, 1}) + + foo := func(w Int32x8, k uint8) { + a := make([]int32, 8) + w.StoreSlice(a) + t.Logf("For k=%0b, w=%v", k, a) + for i, b := range a { + if (int32(k)>>i)&1 != b { + t.Errorf("Element %d of stored slice (=%d) did not match corresponding bit in 0b%b", + i, b, k) + } + } + } + + foo(x.tern(0b1111_0000, y, z), 0b1111_0000) + foo(x.tern(0b1100_1100, y, z), 0b1100_1100) + foo(x.tern(0b1010_1010, y, z), 0b1010_1010) +} + func TestSelect2x4x32(t *testing.T) { for a := range uint8(8) { for b := range uint8(8) {