[dev.simd] cmd/compile, simd: add VPLZCNT[DQ]

Change-Id: Ifd6d8c12deac9c41722fdf2511d860a334e83438
Reviewed-on: https://go-review.googlesource.com/c/go/+/701915
Reviewed-by: Cherry Mui <cherryyz@google.com>
TryBot-Bypass: Junyang Shao <shaojunyang@google.com>
This commit is contained in:
Junyang Shao 2025-09-08 19:38:56 +00:00
parent 832c1f76dc
commit c39b2fdd1e
11 changed files with 466 additions and 0 deletions

View file

@ -110,6 +110,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPMOVZXBQ256, ssa.OpAMD64VPMOVZXBQ256,
ssa.OpAMD64VPMOVZXWQ256, ssa.OpAMD64VPMOVZXWQ256,
ssa.OpAMD64VPMOVZXBQ512, ssa.OpAMD64VPMOVZXBQ512,
ssa.OpAMD64VPLZCNTD128,
ssa.OpAMD64VPLZCNTD256,
ssa.OpAMD64VPLZCNTD512,
ssa.OpAMD64VPLZCNTQ128,
ssa.OpAMD64VPLZCNTQ256,
ssa.OpAMD64VPLZCNTQ512,
ssa.OpAMD64VPOPCNTB128, ssa.OpAMD64VPOPCNTB128,
ssa.OpAMD64VPOPCNTB256, ssa.OpAMD64VPOPCNTB256,
ssa.OpAMD64VPOPCNTB512, ssa.OpAMD64VPOPCNTB512,
@ -863,6 +869,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPEXPANDQMasked128, ssa.OpAMD64VPEXPANDQMasked128,
ssa.OpAMD64VPEXPANDQMasked256, ssa.OpAMD64VPEXPANDQMasked256,
ssa.OpAMD64VPEXPANDQMasked512, ssa.OpAMD64VPEXPANDQMasked512,
ssa.OpAMD64VPLZCNTDMasked128,
ssa.OpAMD64VPLZCNTDMasked256,
ssa.OpAMD64VPLZCNTDMasked512,
ssa.OpAMD64VPLZCNTQMasked128,
ssa.OpAMD64VPLZCNTQMasked256,
ssa.OpAMD64VPLZCNTQMasked512,
ssa.OpAMD64VPOPCNTBMasked128, ssa.OpAMD64VPOPCNTBMasked128,
ssa.OpAMD64VPOPCNTBMasked256, ssa.OpAMD64VPOPCNTBMasked256,
ssa.OpAMD64VPOPCNTBMasked512, ssa.OpAMD64VPOPCNTBMasked512,
@ -1581,6 +1593,12 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VGF2P8MULBMasked128, ssa.OpAMD64VGF2P8MULBMasked128,
ssa.OpAMD64VGF2P8MULBMasked256, ssa.OpAMD64VGF2P8MULBMasked256,
ssa.OpAMD64VGF2P8MULBMasked512, ssa.OpAMD64VGF2P8MULBMasked512,
ssa.OpAMD64VPLZCNTDMasked128,
ssa.OpAMD64VPLZCNTDMasked256,
ssa.OpAMD64VPLZCNTDMasked512,
ssa.OpAMD64VPLZCNTQMasked128,
ssa.OpAMD64VPLZCNTQMasked256,
ssa.OpAMD64VPLZCNTQMasked512,
ssa.OpAMD64VMAXPSMasked128, ssa.OpAMD64VMAXPSMasked128,
ssa.OpAMD64VMAXPSMasked256, ssa.OpAMD64VMAXPSMasked256,
ssa.OpAMD64VMAXPSMasked512, ssa.OpAMD64VMAXPSMasked512,

View file

@ -562,6 +562,18 @@
(IsNanFloat64x2 x y) => (VCMPPD128 [3] x y) (IsNanFloat64x2 x y) => (VCMPPD128 [3] x y)
(IsNanFloat64x4 x y) => (VCMPPD256 [3] x y) (IsNanFloat64x4 x y) => (VCMPPD256 [3] x y)
(IsNanFloat64x8 x y) => (VPMOVMToVec64x8 (VCMPPD512 [3] x y)) (IsNanFloat64x8 x y) => (VPMOVMToVec64x8 (VCMPPD512 [3] x y))
(LeadingZerosInt32x4 ...) => (VPLZCNTD128 ...)
(LeadingZerosInt32x8 ...) => (VPLZCNTD256 ...)
(LeadingZerosInt32x16 ...) => (VPLZCNTD512 ...)
(LeadingZerosInt64x2 ...) => (VPLZCNTQ128 ...)
(LeadingZerosInt64x4 ...) => (VPLZCNTQ256 ...)
(LeadingZerosInt64x8 ...) => (VPLZCNTQ512 ...)
(LeadingZerosUint32x4 ...) => (VPLZCNTD128 ...)
(LeadingZerosUint32x8 ...) => (VPLZCNTD256 ...)
(LeadingZerosUint32x16 ...) => (VPLZCNTD512 ...)
(LeadingZerosUint64x2 ...) => (VPLZCNTQ128 ...)
(LeadingZerosUint64x4 ...) => (VPLZCNTQ256 ...)
(LeadingZerosUint64x8 ...) => (VPLZCNTQ512 ...)
(LessFloat32x4 x y) => (VCMPPS128 [1] x y) (LessFloat32x4 x y) => (VCMPPS128 [1] x y)
(LessFloat32x8 x y) => (VCMPPS256 [1] x y) (LessFloat32x8 x y) => (VCMPPS256 [1] x y)
(LessFloat32x16 x y) => (VPMOVMToVec32x16 (VCMPPS512 [1] x y)) (LessFloat32x16 x y) => (VPMOVMToVec32x16 (VCMPPS512 [1] x y))
@ -1334,6 +1346,8 @@
(VMOVDQU8Masked512 (VGF2P8AFFINEINVQB512 [a] x y) mask) => (VGF2P8AFFINEINVQBMasked512 [a] x y mask) (VMOVDQU8Masked512 (VGF2P8AFFINEINVQB512 [a] x y) mask) => (VGF2P8AFFINEINVQBMasked512 [a] x y mask)
(VMOVDQU8Masked512 (VGF2P8AFFINEQB512 [a] x y) mask) => (VGF2P8AFFINEQBMasked512 [a] x y mask) (VMOVDQU8Masked512 (VGF2P8AFFINEQB512 [a] x y) mask) => (VGF2P8AFFINEQBMasked512 [a] x y mask)
(VMOVDQU8Masked512 (VGF2P8MULB512 x y) mask) => (VGF2P8MULBMasked512 x y mask) (VMOVDQU8Masked512 (VGF2P8MULB512 x y) mask) => (VGF2P8MULBMasked512 x y mask)
(VMOVDQU32Masked512 (VPLZCNTD512 x) mask) => (VPLZCNTDMasked512 x mask)
(VMOVDQU64Masked512 (VPLZCNTQ512 x) mask) => (VPLZCNTQMasked512 x mask)
(VMOVDQU32Masked512 (VMAXPS512 x y) mask) => (VMAXPSMasked512 x y mask) (VMOVDQU32Masked512 (VMAXPS512 x y) mask) => (VMAXPSMasked512 x y mask)
(VMOVDQU64Masked512 (VMAXPD512 x y) mask) => (VMAXPDMasked512 x y mask) (VMOVDQU64Masked512 (VMAXPD512 x y) mask) => (VMAXPDMasked512 x y mask)
(VMOVDQU8Masked512 (VPMAXSB512 x y) mask) => (VPMAXSBMasked512 x y mask) (VMOVDQU8Masked512 (VPMAXSB512 x y) mask) => (VPMAXSBMasked512 x y mask)

View file

@ -450,6 +450,18 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPHSUBSW256", argLength: 2, reg: v21, asm: "VPHSUBSW", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPHSUBSW256", argLength: 2, reg: v21, asm: "VPHSUBSW", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPHSUBW128", argLength: 2, reg: v21, asm: "VPHSUBW", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPHSUBW128", argLength: 2, reg: v21, asm: "VPHSUBW", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPHSUBW256", argLength: 2, reg: v21, asm: "VPHSUBW", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPHSUBW256", argLength: 2, reg: v21, asm: "VPHSUBW", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPLZCNTD128", argLength: 1, reg: w11, asm: "VPLZCNTD", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPLZCNTD256", argLength: 1, reg: w11, asm: "VPLZCNTD", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPLZCNTD512", argLength: 1, reg: w11, asm: "VPLZCNTD", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPLZCNTDMasked128", argLength: 2, reg: wkw, asm: "VPLZCNTD", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPLZCNTDMasked256", argLength: 2, reg: wkw, asm: "VPLZCNTD", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPLZCNTDMasked512", argLength: 2, reg: wkw, asm: "VPLZCNTD", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPLZCNTQ128", argLength: 1, reg: w11, asm: "VPLZCNTQ", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPLZCNTQ256", argLength: 1, reg: w11, asm: "VPLZCNTQ", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPLZCNTQ512", argLength: 1, reg: w11, asm: "VPLZCNTQ", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPLZCNTQMasked128", argLength: 2, reg: wkw, asm: "VPLZCNTQ", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPLZCNTQMasked256", argLength: 2, reg: wkw, asm: "VPLZCNTQ", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPLZCNTQMasked512", argLength: 2, reg: wkw, asm: "VPLZCNTQ", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPMADDUBSW128", argLength: 2, reg: v21, asm: "VPMADDUBSW", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPMADDUBSW128", argLength: 2, reg: v21, asm: "VPMADDUBSW", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPMADDUBSW256", argLength: 2, reg: v21, asm: "VPMADDUBSW", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPMADDUBSW256", argLength: 2, reg: v21, asm: "VPMADDUBSW", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPMADDUBSW512", argLength: 2, reg: w21, asm: "VPMADDUBSW", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPMADDUBSW512", argLength: 2, reg: w21, asm: "VPMADDUBSW", commutative: false, typ: "Vec512", resultInArg0: false},

View file

@ -526,6 +526,18 @@ func simdGenericOps() []opData {
{name: "IsNanFloat64x2", argLength: 2, commutative: true}, {name: "IsNanFloat64x2", argLength: 2, commutative: true},
{name: "IsNanFloat64x4", argLength: 2, commutative: true}, {name: "IsNanFloat64x4", argLength: 2, commutative: true},
{name: "IsNanFloat64x8", argLength: 2, commutative: true}, {name: "IsNanFloat64x8", argLength: 2, commutative: true},
{name: "LeadingZerosInt32x4", argLength: 1, commutative: false},
{name: "LeadingZerosInt32x8", argLength: 1, commutative: false},
{name: "LeadingZerosInt32x16", argLength: 1, commutative: false},
{name: "LeadingZerosInt64x2", argLength: 1, commutative: false},
{name: "LeadingZerosInt64x4", argLength: 1, commutative: false},
{name: "LeadingZerosInt64x8", argLength: 1, commutative: false},
{name: "LeadingZerosUint32x4", argLength: 1, commutative: false},
{name: "LeadingZerosUint32x8", argLength: 1, commutative: false},
{name: "LeadingZerosUint32x16", argLength: 1, commutative: false},
{name: "LeadingZerosUint64x2", argLength: 1, commutative: false},
{name: "LeadingZerosUint64x4", argLength: 1, commutative: false},
{name: "LeadingZerosUint64x8", argLength: 1, commutative: false},
{name: "LessEqualFloat32x4", argLength: 2, commutative: false}, {name: "LessEqualFloat32x4", argLength: 2, commutative: false},
{name: "LessEqualFloat32x8", argLength: 2, commutative: false}, {name: "LessEqualFloat32x8", argLength: 2, commutative: false},
{name: "LessEqualFloat32x16", argLength: 2, commutative: false}, {name: "LessEqualFloat32x16", argLength: 2, commutative: false},

View file

@ -1682,6 +1682,18 @@ const (
OpAMD64VPHSUBSW256 OpAMD64VPHSUBSW256
OpAMD64VPHSUBW128 OpAMD64VPHSUBW128
OpAMD64VPHSUBW256 OpAMD64VPHSUBW256
OpAMD64VPLZCNTD128
OpAMD64VPLZCNTD256
OpAMD64VPLZCNTD512
OpAMD64VPLZCNTDMasked128
OpAMD64VPLZCNTDMasked256
OpAMD64VPLZCNTDMasked512
OpAMD64VPLZCNTQ128
OpAMD64VPLZCNTQ256
OpAMD64VPLZCNTQ512
OpAMD64VPLZCNTQMasked128
OpAMD64VPLZCNTQMasked256
OpAMD64VPLZCNTQMasked512
OpAMD64VPMADDUBSW128 OpAMD64VPMADDUBSW128
OpAMD64VPMADDUBSW256 OpAMD64VPMADDUBSW256
OpAMD64VPMADDUBSW512 OpAMD64VPMADDUBSW512
@ -5343,6 +5355,18 @@ const (
OpIsNanFloat64x2 OpIsNanFloat64x2
OpIsNanFloat64x4 OpIsNanFloat64x4
OpIsNanFloat64x8 OpIsNanFloat64x8
OpLeadingZerosInt32x4
OpLeadingZerosInt32x8
OpLeadingZerosInt32x16
OpLeadingZerosInt64x2
OpLeadingZerosInt64x4
OpLeadingZerosInt64x8
OpLeadingZerosUint32x4
OpLeadingZerosUint32x8
OpLeadingZerosUint32x16
OpLeadingZerosUint64x2
OpLeadingZerosUint64x4
OpLeadingZerosUint64x8
OpLessEqualFloat32x4 OpLessEqualFloat32x4
OpLessEqualFloat32x8 OpLessEqualFloat32x8
OpLessEqualFloat32x16 OpLessEqualFloat32x16
@ -25897,6 +25921,168 @@ var opcodeTable = [...]opInfo{
}, },
}, },
}, },
{
name: "VPLZCNTD128",
argLen: 1,
asm: x86.AVPLZCNTD,
reg: regInfo{
inputs: []inputInfo{
{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPLZCNTD256",
argLen: 1,
asm: x86.AVPLZCNTD,
reg: regInfo{
inputs: []inputInfo{
{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPLZCNTD512",
argLen: 1,
asm: x86.AVPLZCNTD,
reg: regInfo{
inputs: []inputInfo{
{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPLZCNTDMasked128",
argLen: 2,
asm: x86.AVPLZCNTD,
reg: regInfo{
inputs: []inputInfo{
{1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPLZCNTDMasked256",
argLen: 2,
asm: x86.AVPLZCNTD,
reg: regInfo{
inputs: []inputInfo{
{1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPLZCNTDMasked512",
argLen: 2,
asm: x86.AVPLZCNTD,
reg: regInfo{
inputs: []inputInfo{
{1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPLZCNTQ128",
argLen: 1,
asm: x86.AVPLZCNTQ,
reg: regInfo{
inputs: []inputInfo{
{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPLZCNTQ256",
argLen: 1,
asm: x86.AVPLZCNTQ,
reg: regInfo{
inputs: []inputInfo{
{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPLZCNTQ512",
argLen: 1,
asm: x86.AVPLZCNTQ,
reg: regInfo{
inputs: []inputInfo{
{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPLZCNTQMasked128",
argLen: 2,
asm: x86.AVPLZCNTQ,
reg: regInfo{
inputs: []inputInfo{
{1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPLZCNTQMasked256",
argLen: 2,
asm: x86.AVPLZCNTQ,
reg: regInfo{
inputs: []inputInfo{
{1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPLZCNTQMasked512",
argLen: 2,
asm: x86.AVPLZCNTQ,
reg: regInfo{
inputs: []inputInfo{
{1, 71494644084506624}, // K1 K2 K3 K4 K5 K6 K7
{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{ {
name: "VPMADDUBSW128", name: "VPMADDUBSW128",
argLen: 2, argLen: 2,
@ -68572,6 +68758,66 @@ var opcodeTable = [...]opInfo{
commutative: true, commutative: true,
generic: true, generic: true,
}, },
{
name: "LeadingZerosInt32x4",
argLen: 1,
generic: true,
},
{
name: "LeadingZerosInt32x8",
argLen: 1,
generic: true,
},
{
name: "LeadingZerosInt32x16",
argLen: 1,
generic: true,
},
{
name: "LeadingZerosInt64x2",
argLen: 1,
generic: true,
},
{
name: "LeadingZerosInt64x4",
argLen: 1,
generic: true,
},
{
name: "LeadingZerosInt64x8",
argLen: 1,
generic: true,
},
{
name: "LeadingZerosUint32x4",
argLen: 1,
generic: true,
},
{
name: "LeadingZerosUint32x8",
argLen: 1,
generic: true,
},
{
name: "LeadingZerosUint32x16",
argLen: 1,
generic: true,
},
{
name: "LeadingZerosUint64x2",
argLen: 1,
generic: true,
},
{
name: "LeadingZerosUint64x4",
argLen: 1,
generic: true,
},
{
name: "LeadingZerosUint64x8",
argLen: 1,
generic: true,
},
{ {
name: "LessEqualFloat32x4", name: "LessEqualFloat32x4",
argLen: 2, argLen: 2,

View file

@ -2489,6 +2489,42 @@ func rewriteValueAMD64(v *Value) bool {
return rewriteValueAMD64_OpIsNonNil(v) return rewriteValueAMD64_OpIsNonNil(v)
case OpIsSliceInBounds: case OpIsSliceInBounds:
return rewriteValueAMD64_OpIsSliceInBounds(v) return rewriteValueAMD64_OpIsSliceInBounds(v)
case OpLeadingZerosInt32x16:
v.Op = OpAMD64VPLZCNTD512
return true
case OpLeadingZerosInt32x4:
v.Op = OpAMD64VPLZCNTD128
return true
case OpLeadingZerosInt32x8:
v.Op = OpAMD64VPLZCNTD256
return true
case OpLeadingZerosInt64x2:
v.Op = OpAMD64VPLZCNTQ128
return true
case OpLeadingZerosInt64x4:
v.Op = OpAMD64VPLZCNTQ256
return true
case OpLeadingZerosInt64x8:
v.Op = OpAMD64VPLZCNTQ512
return true
case OpLeadingZerosUint32x16:
v.Op = OpAMD64VPLZCNTD512
return true
case OpLeadingZerosUint32x4:
v.Op = OpAMD64VPLZCNTD128
return true
case OpLeadingZerosUint32x8:
v.Op = OpAMD64VPLZCNTD256
return true
case OpLeadingZerosUint64x2:
v.Op = OpAMD64VPLZCNTQ128
return true
case OpLeadingZerosUint64x4:
v.Op = OpAMD64VPLZCNTQ256
return true
case OpLeadingZerosUint64x8:
v.Op = OpAMD64VPLZCNTQ512
return true
case OpLeq16: case OpLeq16:
return rewriteValueAMD64_OpLeq16(v) return rewriteValueAMD64_OpLeq16(v)
case OpLeq16U: case OpLeq16U:
@ -27364,6 +27400,18 @@ func rewriteValueAMD64_OpAMD64VMOVDQU32Masked512(v *Value) bool {
v.AddArg3(x, y, mask) v.AddArg3(x, y, mask)
return true return true
} }
// match: (VMOVDQU32Masked512 (VPLZCNTD512 x) mask)
// result: (VPLZCNTDMasked512 x mask)
for {
if v_0.Op != OpAMD64VPLZCNTD512 {
break
}
x := v_0.Args[0]
mask := v_1
v.reset(OpAMD64VPLZCNTDMasked512)
v.AddArg2(x, mask)
return true
}
// match: (VMOVDQU32Masked512 (VMAXPS512 x y) mask) // match: (VMOVDQU32Masked512 (VMAXPS512 x y) mask)
// result: (VMAXPSMasked512 x y mask) // result: (VMAXPSMasked512 x y mask)
for { for {
@ -28057,6 +28105,18 @@ func rewriteValueAMD64_OpAMD64VMOVDQU64Masked512(v *Value) bool {
v.AddArg3(x, y, mask) v.AddArg3(x, y, mask)
return true return true
} }
// match: (VMOVDQU64Masked512 (VPLZCNTQ512 x) mask)
// result: (VPLZCNTQMasked512 x mask)
for {
if v_0.Op != OpAMD64VPLZCNTQ512 {
break
}
x := v_0.Args[0]
mask := v_1
v.reset(OpAMD64VPLZCNTQMasked512)
v.AddArg2(x, mask)
return true
}
// match: (VMOVDQU64Masked512 (VMAXPD512 x y) mask) // match: (VMOVDQU64Masked512 (VMAXPD512 x y) mask)
// result: (VMAXPDMasked512 x y mask) // result: (VMAXPDMasked512 x y mask)
for { for {

View file

@ -574,6 +574,18 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Float64x2.IsNan", opLen2(ssa.OpIsNanFloat64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float64x2.IsNan", opLen2(ssa.OpIsNanFloat64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float64x4.IsNan", opLen2(ssa.OpIsNanFloat64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x4.IsNan", opLen2(ssa.OpIsNanFloat64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float64x8.IsNan", opLen2(ssa.OpIsNanFloat64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float64x8.IsNan", opLen2(ssa.OpIsNanFloat64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int32x4.LeadingZeros", opLen1(ssa.OpLeadingZerosInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int32x8.LeadingZeros", opLen1(ssa.OpLeadingZerosInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int32x16.LeadingZeros", opLen1(ssa.OpLeadingZerosInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int64x2.LeadingZeros", opLen1(ssa.OpLeadingZerosInt64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int64x4.LeadingZeros", opLen1(ssa.OpLeadingZerosInt64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int64x8.LeadingZeros", opLen1(ssa.OpLeadingZerosInt64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint32x4.LeadingZeros", opLen1(ssa.OpLeadingZerosUint32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint32x8.LeadingZeros", opLen1(ssa.OpLeadingZerosUint32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint32x16.LeadingZeros", opLen1(ssa.OpLeadingZerosUint32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint64x2.LeadingZeros", opLen1(ssa.OpLeadingZerosUint64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint64x4.LeadingZeros", opLen1(ssa.OpLeadingZerosUint64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x8.LeadingZeros", opLen1(ssa.OpLeadingZerosUint64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float32x4.Less", opLen2(ssa.OpLessFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x4.Less", opLen2(ssa.OpLessFloat32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float32x8.Less", opLen2(ssa.OpLessFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x8.Less", opLen2(ssa.OpLessFloat32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float32x16.Less", opLen2(ssa.OpLessFloat32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x16.Less", opLen2(ssa.OpLessFloat32x16, types.TypeVec512), sys.AMD64)

View file

@ -0,0 +1,5 @@
!sum
- go: LeadingZeros
commutative: false
documentation: !string |-
// NAME counts the leading zeros of each element in x.

View file

@ -0,0 +1,8 @@
!sum
- go: LeadingZeros
asm: "VPLZCNT[DQ]"
in:
- &any
go: $t
out:
- *any

View file

@ -540,3 +540,20 @@ func TestClearAVXUpperBits(t *testing.T) {
checkSlices[int64](t, r, []int64{11, 22, 33, 44}) checkSlices[int64](t, r, []int64{11, 22, 33, 44})
checkSlices[int64](t, s, []int64{9, 18, 27, 36}) checkSlices[int64](t, s, []int64{9, 18, 27, 36})
} }
// TestLeadingZeros exercises Uint64x2.LeadingZeros on a two-element vector.
// The input 0b1111 is expected to produce 60 and the input 0 is expected to
// produce 64. The test is skipped when simd.HasAVX512 reports false.
func TestLeadingZeros(t *testing.T) {
	if !simd.HasAVX512() {
		// t.Skip stops this test's execution, so nothing below runs.
		t.Skip("Test requires HasAVX512, not available on this hardware")
	}

	input := []uint64{0b1111, 0}
	expected := []uint64{60, 64}
	result := make([]uint64, len(expected))

	vec := simd.LoadUint64x2Slice(input)
	vec.LeadingZeros().StoreSlice(result)

	// Compare lane by lane so every mismatching index is reported.
	for idx, w := range expected {
		if g := result[idx]; g != w {
			t.Errorf("Result incorrect at %d: want %d, got %d", idx, w, g)
		}
	}
}

View file

@ -3298,6 +3298,68 @@ func (x Float64x4) IsNan(y Float64x4) Mask64x4
// Asm: VCMPPD, CPU Feature: AVX512 // Asm: VCMPPD, CPU Feature: AVX512
func (x Float64x8) IsNan(y Float64x8) Mask64x8 func (x Float64x8) IsNan(y Float64x8) Mask64x8
/* LeadingZeros */
// LeadingZeros counts the leading zero bits of each element in x
// and returns the per-element counts as an Int32x4.
//
// Asm: VPLZCNTD, CPU Feature: AVX512
func (x Int32x4) LeadingZeros() Int32x4
// LeadingZeros counts the leading zero bits of each element in x
// and returns the per-element counts as an Int32x8.
//
// Asm: VPLZCNTD, CPU Feature: AVX512
func (x Int32x8) LeadingZeros() Int32x8
// LeadingZeros counts the leading zero bits of each element in x
// and returns the per-element counts as an Int32x16.
//
// Asm: VPLZCNTD, CPU Feature: AVX512
func (x Int32x16) LeadingZeros() Int32x16
// LeadingZeros counts the leading zero bits of each element in x
// and returns the per-element counts as an Int64x2.
//
// Asm: VPLZCNTQ, CPU Feature: AVX512
func (x Int64x2) LeadingZeros() Int64x2
// LeadingZeros counts the leading zero bits of each element in x
// and returns the per-element counts as an Int64x4.
//
// Asm: VPLZCNTQ, CPU Feature: AVX512
func (x Int64x4) LeadingZeros() Int64x4
// LeadingZeros counts the leading zero bits of each element in x
// and returns the per-element counts as an Int64x8.
//
// Asm: VPLZCNTQ, CPU Feature: AVX512
func (x Int64x8) LeadingZeros() Int64x8
// LeadingZeros counts the leading zero bits of each element in x
// and returns the per-element counts as a Uint32x4.
//
// Asm: VPLZCNTD, CPU Feature: AVX512
func (x Uint32x4) LeadingZeros() Uint32x4
// LeadingZeros counts the leading zero bits of each element in x
// and returns the per-element counts as a Uint32x8.
//
// Asm: VPLZCNTD, CPU Feature: AVX512
func (x Uint32x8) LeadingZeros() Uint32x8
// LeadingZeros counts the leading zero bits of each element in x
// and returns the per-element counts as a Uint32x16.
//
// Asm: VPLZCNTD, CPU Feature: AVX512
func (x Uint32x16) LeadingZeros() Uint32x16
// LeadingZeros counts the leading zero bits of each element in x
// and returns the per-element counts as a Uint64x2.
// For example, an element equal to 0b1111 yields 60 and a zero element
// yields 64.
//
// Asm: VPLZCNTQ, CPU Feature: AVX512
func (x Uint64x2) LeadingZeros() Uint64x2
// LeadingZeros counts the leading zero bits of each element in x
// and returns the per-element counts as a Uint64x4.
//
// Asm: VPLZCNTQ, CPU Feature: AVX512
func (x Uint64x4) LeadingZeros() Uint64x4
// LeadingZeros counts the leading zero bits of each element in x
// and returns the per-element counts as a Uint64x8.
//
// Asm: VPLZCNTQ, CPU Feature: AVX512
func (x Uint64x8) LeadingZeros() Uint64x8
/* Less */ /* Less */
// Less compares for less than. // Less compares for less than.