[dev.simd] cmd/compile, simd: add VPTEST

Change-Id: Ia5103100eca2747fd10917ee2f32e3403e68e844
Reviewed-on: https://go-review.googlesource.com/c/go/+/702175
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Junyang Shao <shaojunyang@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Rob Lee <leerob7691@gmail.com>
Junyang Shao 2025-09-09 16:29:38 +00:00
parent d9751166a6
commit f1e3651c33
9 changed files with 236 additions and 16 deletions

@@ -1845,6 +1845,14 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 		p.From.Reg = v.Args[0].Reg()
 		p.To.Type = obj.TYPE_REG
 		p.To.Reg = v.Reg()
+	case ssa.OpAMD64VPTEST:
+		// Some instructions setting flags put their second operand into the destination reg.
+		// See also CMP[BWDQ].
+		p := s.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = simdReg(v.Args[0])
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = simdReg(v.Args[1])
 	default:
 		if !ssaGenSIMDValue(s, v) {
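For reference, VPTEST performs a non-destructive AND of its two operands and produces only flags: ZF is set when the AND is all zero, CF when the AND-NOT is all zero. A minimal scalar sketch of those semantics on a 2x64-bit vector (illustrative names, not part of the patch):

	// ptestFlags models VPTEST with destination a and source b,
	// per the Intel description: ZF = ((a AND b) == 0),
	// CF = ((NOT a AND b) == 0).
	func ptestFlags(a, b [2]uint64) (zf, cf bool) {
		and := a[0]&b[0] | a[1]&b[1]    // OR of the AND lanes
		andn := ^a[0]&b[0] | ^a[1]&b[1] // OR of the ANDN lanes
		return and == 0, andn == 0
	}

With both arguments equal, ZF is set exactly when the vector is zero; the lowering rule below builds on that.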

@@ -1732,6 +1732,9 @@
 (StoreMasked64 {t} ptr mask val mem) && t.Size() == 16 => (VPMASK64store128 ptr mask val mem)
 (StoreMasked64 {t} ptr mask val mem) && t.Size() == 32 => (VPMASK64store256 ptr mask val mem)
+
+// Misc
+(IsZeroVec x) => (SETEQ (VPTEST x x))
 // SIMD vector K-masked loads and stores
 (LoadMasked64 <t> ptr mask mem) && t.Size() == 64 => (VPMASK64load512 ptr (VPMOVVec64x8ToM <types.TypeMask> mask) mem)
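Since VPTEST x x computes x AND x = x, ZF ends up set exactly when every bit of x is zero, and SETEQ materializes ZF as a bool. A whole-vector zero test therefore costs a single VPTEST plus a SETcc, with no lane-by-lane extraction.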

@@ -212,22 +212,23 @@ func init() {
 	vloadk  = regInfo{inputs: []regMask{gpspsb, mask, 0}, outputs: vonly}
 	vstorek = regInfo{inputs: []regMask{gpspsb, mask, v, 0}}
 	v11     = regInfo{inputs: vzonly, outputs: vonly}
 	v21     = regInfo{inputs: []regMask{vz, vz}, outputs: vonly}
 	vk      = regInfo{inputs: vzonly, outputs: maskonly}
 	kv      = regInfo{inputs: maskonly, outputs: vonly}
 	v2k     = regInfo{inputs: []regMask{vz, vz}, outputs: maskonly}
 	vkv     = regInfo{inputs: []regMask{vz, mask}, outputs: vonly}
 	v2kv    = regInfo{inputs: []regMask{vz, vz, mask}, outputs: vonly}
 	v2kk    = regInfo{inputs: []regMask{vz, vz, mask}, outputs: maskonly}
 	v31     = regInfo{inputs: []regMask{v, vz, vz}, outputs: vonly} // used in resultInArg0 ops, arg0 must not be x15
 	v3kv    = regInfo{inputs: []regMask{v, vz, vz, mask}, outputs: vonly} // used in resultInArg0 ops, arg0 must not be x15
 	vgpv    = regInfo{inputs: []regMask{vz, gp}, outputs: vonly}
 	vgp     = regInfo{inputs: vonly, outputs: gponly}
 	vfpv    = regInfo{inputs: []regMask{vz, fp}, outputs: vonly}
 	vfpkv   = regInfo{inputs: []regMask{vz, fp, mask}, outputs: vonly}
 	fpv     = regInfo{inputs: []regMask{fp}, outputs: vonly}
 	gpv     = regInfo{inputs: []regMask{gp}, outputs: vonly}
+	v2flags = regInfo{inputs: []regMask{vz, vz}}
 	w11     = regInfo{inputs: wzonly, outputs: wonly}
 	w21     = regInfo{inputs: []regMask{wz, wz}, outputs: wonly}
@@ -1426,6 +1427,9 @@ func init() {
 		{name: "KMOVDi", argLength: 1, reg: kgp, asm: "KMOVD"},
 		{name: "KMOVWi", argLength: 1, reg: kgp, asm: "KMOVW"},
 		{name: "KMOVBi", argLength: 1, reg: kgp, asm: "KMOVB"},
+
+		// VPTEST
+		{name: "VPTEST", asm: "VPTEST", argLength: 2, reg: v2flags, clobberFlags: true, typ: "Flags"},
 	}
 	var AMD64blocks = []blockData{
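The new v2flags shape takes two vector registers and lists no register outputs: the op's result is the flags state itself (hence typ: "Flags" on the opData entry), the same way scalar flag-setting comparisons are modeled.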

@@ -731,6 +731,9 @@ var genericOps = []opData{
 	{name: "CvtMask64x2to8", argLength: 1}, // arg0 = mask
 	{name: "CvtMask64x4to8", argLength: 1}, // arg0 = mask
 	{name: "CvtMask64x8to8", argLength: 1}, // arg0 = mask
+
+	// Returns true if arg0 is all zero.
+	{name: "IsZeroVec", argLength: 1},
 }

 //    kind          controls        successors   implicit exit

@@ -1236,6 +1236,7 @@ const (
 	OpAMD64KMOVDi
 	OpAMD64KMOVWi
 	OpAMD64KMOVBi
+	OpAMD64VPTEST
 	OpAMD64VADDPD128
 	OpAMD64VADDPD256
 	OpAMD64VADDPD512
@@ -5390,6 +5391,7 @@ const (
 	OpCvtMask64x2to8
 	OpCvtMask64x4to8
 	OpCvtMask64x8to8
+	OpIsZeroVec
 	OpAbsInt8x16
 	OpAbsInt8x32
 	OpAbsInt8x64
@@ -19799,6 +19801,18 @@ var opcodeTable = [...]opInfo{
 			},
 		},
 	},
+	{
+		name:         "VPTEST",
+		argLen:       2,
+		clobberFlags: true,
+		asm:          x86.AVPTEST,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+				{1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
+			},
+		},
+	},
 	{
 		name:    "VADDPD128",
 		argLen:  2,
@@ -75862,6 +75876,11 @@ var opcodeTable = [...]opInfo{
 		argLen:  1,
 		generic: true,
 	},
+	{
+		name:    "IsZeroVec",
+		argLen:  1,
+		generic: true,
+	},
 	{
 		name:    "AbsInt8x16",
 		argLen:  1,
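In the generated table, 4294901760 is 0xffff0000: bit positions 16 through 31 of the AMD64 register mask, which are exactly X0 through X15 as the comments note. VPTEST is a VEX-encoded (AVX) instruction with no EVEX form, so the AVX-512-only registers X16 to X31 are out of reach for it in any case.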

@@ -3599,6 +3599,8 @@ func rewriteValueAMD64(v *Value) bool {
 		return rewriteValueAMD64_OpIsNonNil(v)
 	case OpIsSliceInBounds:
 		return rewriteValueAMD64_OpIsSliceInBounds(v)
+	case OpIsZeroVec:
+		return rewriteValueAMD64_OpIsZeroVec(v)
 	case OpLeadingZerosInt32x16:
 		v.Op = OpAMD64VPLZCNTD512
 		return true
@@ -53712,6 +53714,20 @@ func rewriteValueAMD64_OpIsSliceInBounds(v *Value) bool {
 		return true
 	}
 }
+func rewriteValueAMD64_OpIsZeroVec(v *Value) bool {
+	v_0 := v.Args[0]
+	b := v.Block
+	// match: (IsZeroVec x)
+	// result: (SETEQ (VPTEST x x))
+	for {
+		x := v_0
+		v.reset(OpAMD64SETEQ)
+		v0 := b.NewValue0(v.Pos, OpAMD64VPTEST, types.TypeFlags)
+		v0.AddArg2(x, x)
+		v.AddArg(v0)
+		return true
+	}
+}
 func rewriteValueAMD64_OpLeq16(v *Value) bool {
 	v_1 := v.Args[1]
 	v_0 := v.Args[0]

@@ -1614,6 +1614,22 @@ func initIntrinsics(cfg *intrinsicBuildConfig) {
 			return nil
 		},
 		sys.AMD64)
+	addF(simdPackage, "Int8x16.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
+	addF(simdPackage, "Int16x8.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
+	addF(simdPackage, "Int32x4.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
+	addF(simdPackage, "Int64x2.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
+	addF(simdPackage, "Uint8x16.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
+	addF(simdPackage, "Uint16x8.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
+	addF(simdPackage, "Uint32x4.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
+	addF(simdPackage, "Uint64x2.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
+	addF(simdPackage, "Int8x32.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
+	addF(simdPackage, "Int16x16.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
+	addF(simdPackage, "Int32x8.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
+	addF(simdPackage, "Int64x4.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
+	addF(simdPackage, "Uint8x32.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
+	addF(simdPackage, "Uint16x16.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
+	addF(simdPackage, "Uint32x8.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
+	addF(simdPackage, "Uint64x4.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
 	}
 }
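With the intrinsics registered for every 128- and 256-bit element type, IsZero is callable from ordinary Go. A small usage sketch (assumes a GOEXPERIMENT=simd toolchain; LoadUint64x2Slice is the same helper the test below uses):

	package main

	import (
		"fmt"
		"simd"
	)

	func main() {
		v := simd.LoadUint64x2Slice([]uint64{0, 0})
		fmt.Println(v.IsZero()) // true; lowers to VPTEST + SETEQ on AMD64
	}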

@@ -15,3 +15,131 @@ package simd
 //
 // Asm: VZEROUPPER, CPU Feature: AVX
 func ClearAVXUpperBits()
+
+// IsZero returns true if all elements of x are zeros.
+//
+// This method compiles to VPTEST x, x.
+// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y.
+//
+// Asm: VPTEST, CPU Feature: AVX
+func (x Int8x16) IsZero() bool
+
+// IsZero returns true if all elements of x are zeros.
+//
+// This method compiles to VPTEST x, x.
+// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y.
+//
+// Asm: VPTEST, CPU Feature: AVX
+func (x Int8x32) IsZero() bool
+
+// IsZero returns true if all elements of x are zeros.
+//
+// This method compiles to VPTEST x, x.
+// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y.
+//
+// Asm: VPTEST, CPU Feature: AVX
+func (x Int16x8) IsZero() bool
+
+// IsZero returns true if all elements of x are zeros.
+//
+// This method compiles to VPTEST x, x.
+// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y.
+//
+// Asm: VPTEST, CPU Feature: AVX
+func (x Int16x16) IsZero() bool
+
+// IsZero returns true if all elements of x are zeros.
+//
+// This method compiles to VPTEST x, x.
+// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y.
+//
+// Asm: VPTEST, CPU Feature: AVX
+func (x Int32x4) IsZero() bool
+
+// IsZero returns true if all elements of x are zeros.
+//
+// This method compiles to VPTEST x, x.
+// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y.
+//
+// Asm: VPTEST, CPU Feature: AVX
+func (x Int32x8) IsZero() bool
+
+// IsZero returns true if all elements of x are zeros.
+//
+// This method compiles to VPTEST x, x.
+// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y.
+//
+// Asm: VPTEST, CPU Feature: AVX
+func (x Int64x2) IsZero() bool
+
+// IsZero returns true if all elements of x are zeros.
+//
+// This method compiles to VPTEST x, x.
+// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y.
+//
+// Asm: VPTEST, CPU Feature: AVX
+func (x Int64x4) IsZero() bool
+
+// IsZero returns true if all elements of x are zeros.
+//
+// This method compiles to VPTEST x, x.
+// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y.
+//
+// Asm: VPTEST, CPU Feature: AVX
+func (x Uint8x16) IsZero() bool
+
+// IsZero returns true if all elements of x are zeros.
+//
+// This method compiles to VPTEST x, x.
+// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y.
+//
+// Asm: VPTEST, CPU Feature: AVX
+func (x Uint8x32) IsZero() bool
+
+// IsZero returns true if all elements of x are zeros.
+//
+// This method compiles to VPTEST x, x.
+// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y.
+//
+// Asm: VPTEST, CPU Feature: AVX
+func (x Uint16x8) IsZero() bool
+
+// IsZero returns true if all elements of x are zeros.
+//
+// This method compiles to VPTEST x, x.
+// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y.
+//
+// Asm: VPTEST, CPU Feature: AVX
+func (x Uint16x16) IsZero() bool
+
+// IsZero returns true if all elements of x are zeros.
+//
+// This method compiles to VPTEST x, x.
+// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y.
+//
+// Asm: VPTEST, CPU Feature: AVX
+func (x Uint32x4) IsZero() bool
+
+// IsZero returns true if all elements of x are zeros.
+//
+// This method compiles to VPTEST x, x.
+// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y.
+//
+// Asm: VPTEST, CPU Feature: AVX
+func (x Uint32x8) IsZero() bool
+
+// IsZero returns true if all elements of x are zeros.
+//
+// This method compiles to VPTEST x, x.
+// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y.
+//
+// Asm: VPTEST, CPU Feature: AVX
+func (x Uint64x2) IsZero() bool
+
+// IsZero returns true if all elements of x are zeros.
+//
+// This method compiles to VPTEST x, x.
+// x.And(y).IsZero() and x.AndNot(y).IsZero() will be optimized to VPTEST x, y.
+//
+// Asm: VPTEST, CPU Feature: AVX
+func (x Uint64x4) IsZero() bool
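The fused forms promised in these doc comments follow from VPTEST's two flag outputs: ZF covers the And zero test and CF covers the AndNot zero test, so either pattern fits a single instruction. A hedged sketch of the intended shape (this commit wires up only the plain x.IsZero() lowering; the fusion is what the comments describe, assuming import "simd"):

	// disjoint reports whether x and y share no set bits.
	// Per the doc comment, x.And(y).IsZero() should eventually
	// compile to a single VPTEST x, y rather than VPAND + VPTEST.
	func disjoint(x, y simd.Uint64x2) bool {
		return x.And(y).IsZero()
	}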

@@ -557,3 +557,26 @@ func TestLeadingZeros(t *testing.T) {
 		}
 	}
 }
+
+func TestIsZero(t *testing.T) {
+	v1 := simd.LoadUint64x2Slice([]uint64{0, 1})
+	v2 := simd.LoadUint64x2Slice([]uint64{0, 0})
+	if v1.IsZero() {
+		t.Errorf("Result incorrect, want false, got true")
+	}
+	if !v2.IsZero() {
+		t.Errorf("Result incorrect, want true, got false")
+	}
+	if !v1.And(v2).IsZero() {
+		t.Errorf("Result incorrect, want true, got false")
+	}
+	if v1.AndNot(v2).IsZero() {
+		t.Errorf("Result incorrect, want false, got true")
+	}
+	if !v2.And(v1).IsZero() {
+		t.Errorf("Result incorrect, want true, got false")
+	}
+	if !v2.AndNot(v1).IsZero() {
+		t.Errorf("Result incorrect, want true, got false")
+	}
+}