[dev.simd] cmd/compile, simd: support store to bits for mask

This CL was partially generated by CL 689775.

Change-Id: I0c36fd2a44706c88db1a1d5ea4a6d0b9f891d85f
Reviewed-on: https://go-review.googlesource.com/c/go/+/689795
Reviewed-by: David Chase <drchase@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Junyang Shao 2025-07-23 07:37:14 +00:00
parent 41054cdb1c
commit 6f7a1164e7
15 changed files with 1192 additions and 523 deletions
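
For context, here is a minimal sketch of the round trip this CL completes at the API level. It assumes the dev.simd surface referenced in this diff (LoadMask8x16FromBits already lowers through KMOVQload; Mask8x16.StoreToBits is the new intrinsic registered below); the pointer-based Go signatures and the import path are assumptions, not taken from this CL:

package main

import "simd" // experimental dev.simd package; import path assumed

func roundTrip() uint64 {
	bits := uint64(0b1010_1010_1010_1010)
	m := simd.LoadMask8x16FromBits(&bits) // bits -> mask, via KMOVQload
	var out uint64
	m.StoreToBits(&out) // mask -> bits, via the new KMOVQstore lowering
	return out          // expected: the low 16 bits of bits
}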

@@ -24,8 +24,8 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPABSQ128,
ssa.OpAMD64VPABSQ256,
ssa.OpAMD64VPABSQ512,
ssa.OpAMD64VRCP14PS128,
ssa.OpAMD64VRCP14PS256,
ssa.OpAMD64VRCPPS128,
ssa.OpAMD64VRCPPS256,
ssa.OpAMD64VRCP14PS512,
ssa.OpAMD64VRCP14PD128,
ssa.OpAMD64VRCP14PD256,
@@ -335,6 +335,16 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPXORQ512:
p = simdV21(s, v)
case ssa.OpAMD64VPCMPEQB512,
ssa.OpAMD64VPCMPEQW512,
ssa.OpAMD64VPCMPEQD512,
ssa.OpAMD64VPCMPEQQ512,
ssa.OpAMD64VPCMPGTB512,
ssa.OpAMD64VPCMPGTW512,
ssa.OpAMD64VPCMPGTD512,
ssa.OpAMD64VPCMPGTQ512:
p = simdV2k(s, v)
case ssa.OpAMD64VADDPSMasked128,
ssa.OpAMD64VADDPSMasked256,
ssa.OpAMD64VADDPSMasked512,
@@ -733,30 +743,30 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
case ssa.OpAMD64VCMPPS512,
ssa.OpAMD64VCMPPD512,
ssa.OpAMD64VPCMPB512,
ssa.OpAMD64VPCMPW512,
ssa.OpAMD64VPCMPD512,
ssa.OpAMD64VPCMPQ512,
ssa.OpAMD64VPCMPUB512,
ssa.OpAMD64VPCMPUW512,
ssa.OpAMD64VPCMPUD512,
ssa.OpAMD64VPCMPUQ512,
ssa.OpAMD64VPCMPUB128,
ssa.OpAMD64VPCMPUB256,
ssa.OpAMD64VPCMPUB512,
ssa.OpAMD64VPCMPUW128,
ssa.OpAMD64VPCMPUW256,
ssa.OpAMD64VPCMPUW512,
ssa.OpAMD64VPCMPUD128,
ssa.OpAMD64VPCMPUD256,
ssa.OpAMD64VPCMPUD512,
ssa.OpAMD64VPCMPUQ128,
ssa.OpAMD64VPCMPUQ256,
ssa.OpAMD64VPCMPUQ512,
ssa.OpAMD64VPCMPB128,
ssa.OpAMD64VPCMPB256,
ssa.OpAMD64VPCMPB512,
ssa.OpAMD64VPCMPW128,
ssa.OpAMD64VPCMPW256,
ssa.OpAMD64VPCMPW512,
ssa.OpAMD64VPCMPD128,
ssa.OpAMD64VPCMPD256,
ssa.OpAMD64VPCMPD512,
ssa.OpAMD64VPCMPQ128,
ssa.OpAMD64VPCMPQ256:
ssa.OpAMD64VPCMPQ256,
ssa.OpAMD64VPCMPQ512:
p = simdV2kImm8(s, v)
case ssa.OpAMD64VCMPPSMasked128,

@@ -1468,10 +1468,10 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
ssagen.AddAux(&p.From, v)
p.To.Type = obj.TYPE_REG
p.To.Reg = simdOrMaskReg(v)
case ssa.OpAMD64VMOVDQUstore128, ssa.OpAMD64VMOVDQUstore256, ssa.OpAMD64VMOVDQUstore512:
case ssa.OpAMD64VMOVDQUstore128, ssa.OpAMD64VMOVDQUstore256, ssa.OpAMD64VMOVDQUstore512, ssa.OpAMD64KMOVQstore:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = simdReg(v.Args[1])
p.From.Reg = simdOrMaskReg(v.Args[1])
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
ssagen.AddAux(&p.To, v)
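
Note the switch from simdReg to simdOrMaskReg for the source operand: KMOVQstore's data lives in a K (mask) register rather than an X/Y/Z vector register, mirroring the simdOrMaskReg already used on the load path above. The emitted instruction is, roughly (registers illustrative):

// KMOVQ K1, sym+off(AX) // p.From is a K register (hence simdOrMaskReg); p.To is memory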

@@ -1698,6 +1698,22 @@
(LoadMask64x4 <t> ptr mem) => (VPMOVMToVec64x4 <types.TypeVec256> (KMOVQload <t> ptr mem))
(LoadMask64x8 <t> ptr mem) => (VPMOVMToVec64x8 <types.TypeVec512> (KMOVQload <t> ptr mem))
(StoreMask8x16 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec8x16ToM <t> val) mem)
(StoreMask8x32 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec8x32ToM <t> val) mem)
(StoreMask8x64 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec8x64ToM <t> val) mem)
(StoreMask16x8 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec16x8ToM <t> val) mem)
(StoreMask16x16 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec16x16ToM <t> val) mem)
(StoreMask16x32 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec16x32ToM <t> val) mem)
(StoreMask32x4 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec32x4ToM <t> val) mem)
(StoreMask32x8 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec32x8ToM <t> val) mem)
(StoreMask32x16 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec32x16ToM <t> val) mem)
(StoreMask64x2 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec64x2ToM <t> val) mem)
(StoreMask64x4 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec64x4ToM <t> val) mem)
(StoreMask64x8 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec64x8ToM <t> val) mem)
(Load <t> ptr mem) && t.Size() == 16 => (VMOVDQUload128 ptr mem)
(Store {t} ptr val mem) && t.Size() == 16 => (VMOVDQUstore128 ptr val mem)
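
Each StoreMask rule materializes the mask's K-register form and then spills it with a single 64-bit KMOVQ. For StoreMask8x64, say, the expected machine sequence is roughly the following (registers illustrative; reading VPMOVB2M as the assembly behind VPMOVVec8x64ToM is an inference from the op naming, not stated in this diff):

// VPMOVB2M Z1, K1   // VPMOVVec8x64ToM: one bit per lane into K1
// KMOVQ    K1, (AX) // KMOVQstore: write the 64-bit mask word to memory

Every shape shares the 8-byte KMOVQ store; narrower masks occupy the low bits, and the mask-to-K conversion zeroes the rest of the stored word.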

@@ -234,7 +234,8 @@ func init() {
wfpw = regInfo{inputs: []regMask{w, fp}, outputs: wonly}
wfpkw = regInfo{inputs: []regMask{w, fp, mask}, outputs: wonly}
kload = regInfo{inputs: []regMask{gpspsb, 0}, outputs: maskonly}
kstore = regInfo{inputs: []regMask{gpspsb, mask, 0}}
prefreg = regInfo{inputs: []regMask{gpspsbg}}
)
@@ -1318,6 +1319,7 @@ func init() {
{name: "VZEROALL", argLength: 0, asm: "VZEROALL"},
{name: "KMOVQload", argLength: 2, reg: kload, asm: "KMOVQ", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"},
{name: "KMOVQstore", argLength: 3, reg: kstore, asm: "KMOVQ", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"},
}
var AMD64blocks = []blockData{

@@ -678,6 +678,19 @@ var genericOps = []opData{
{name: "LoadMask64x2", argLength: 2}, // arg0 = ptr, arg1 = mem
{name: "LoadMask64x4", argLength: 2}, // arg0 = ptr, arg1 = mem
{name: "LoadMask64x8", argLength: 2}, // arg0 = ptr, arg1 = mem
{name: "StoreMask8x16", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask8x32", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask8x64", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask16x8", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask16x16", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask16x32", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask32x4", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask32x8", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask32x16", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask64x2", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask64x4", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask64x8", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
}
// kind controls successors implicit exit
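
These StoreMask ops follow the store shape genericOps already uses for Store itself: three arguments (ptr, value, mem), a memory-typed result, and the stored type carried in aux. A builder therefore constructs one roughly as the simdStoreMask intrinsic later in this CL does:

// v := s.newValue3A(ssa.OpStoreMask8x16, types.TypeMem, types.TypeMask,
//     ptr, maskVal, s.mem()) // aux: the mask's type; result: memory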

@@ -152,8 +152,8 @@
(AndNotMaskedUint64x2 x y mask) => (VPANDNQMasked128 x y (VPMOVVec64x2ToM <types.TypeMask> mask))
(AndNotMaskedUint64x4 x y mask) => (VPANDNQMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask))
(AndNotMaskedUint64x8 x y mask) => (VPANDNQMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
(ApproximateReciprocalFloat32x4 ...) => (VRCP14PS128 ...)
(ApproximateReciprocalFloat32x8 ...) => (VRCP14PS256 ...)
(ApproximateReciprocalFloat32x4 ...) => (VRCPPS128 ...)
(ApproximateReciprocalFloat32x8 ...) => (VRCPPS256 ...)
(ApproximateReciprocalFloat32x16 ...) => (VRCP14PS512 ...)
(ApproximateReciprocalFloat64x2 ...) => (VRCP14PD128 ...)
(ApproximateReciprocalFloat64x4 ...) => (VRCP14PD256 ...)
@@ -305,28 +305,28 @@
(EqualFloat64x8 x y) => (VPMOVMToVec64x8 (VCMPPD512 [0] x y))
(EqualInt8x16 ...) => (VPCMPEQB128 ...)
(EqualInt8x32 ...) => (VPCMPEQB256 ...)
(EqualInt8x64 x y) => (VPMOVMToVec8x64 (VPCMPB512 [0] x y))
(EqualInt8x64 x y) => (VPMOVMToVec8x64 (VPCMPEQB512 x y))
(EqualInt16x8 ...) => (VPCMPEQW128 ...)
(EqualInt16x16 ...) => (VPCMPEQW256 ...)
(EqualInt16x32 x y) => (VPMOVMToVec16x32 (VPCMPW512 [0] x y))
(EqualInt16x32 x y) => (VPMOVMToVec16x32 (VPCMPEQW512 x y))
(EqualInt32x4 ...) => (VPCMPEQD128 ...)
(EqualInt32x8 ...) => (VPCMPEQD256 ...)
(EqualInt32x16 x y) => (VPMOVMToVec32x16 (VPCMPD512 [0] x y))
(EqualInt32x16 x y) => (VPMOVMToVec32x16 (VPCMPEQD512 x y))
(EqualInt64x2 ...) => (VPCMPEQQ128 ...)
(EqualInt64x4 ...) => (VPCMPEQQ256 ...)
(EqualInt64x8 x y) => (VPMOVMToVec64x8 (VPCMPQ512 [0] x y))
(EqualInt64x8 x y) => (VPMOVMToVec64x8 (VPCMPEQQ512 x y))
(EqualUint8x16 ...) => (VPCMPEQB128 ...)
(EqualUint8x32 ...) => (VPCMPEQB256 ...)
(EqualUint8x64 x y) => (VPMOVMToVec8x64 (VPCMPUB512 [0] x y))
(EqualUint8x64 x y) => (VPMOVMToVec8x64 (VPCMPEQB512 x y))
(EqualUint16x8 ...) => (VPCMPEQW128 ...)
(EqualUint16x16 ...) => (VPCMPEQW256 ...)
(EqualUint16x32 x y) => (VPMOVMToVec16x32 (VPCMPUW512 [0] x y))
(EqualUint16x32 x y) => (VPMOVMToVec16x32 (VPCMPEQW512 x y))
(EqualUint32x4 ...) => (VPCMPEQD128 ...)
(EqualUint32x8 ...) => (VPCMPEQD256 ...)
(EqualUint32x16 x y) => (VPMOVMToVec32x16 (VPCMPUD512 [0] x y))
(EqualUint32x16 x y) => (VPMOVMToVec32x16 (VPCMPEQD512 x y))
(EqualUint64x2 ...) => (VPCMPEQQ128 ...)
(EqualUint64x4 ...) => (VPCMPEQQ256 ...)
(EqualUint64x8 x y) => (VPMOVMToVec64x8 (VPCMPUQ512 [0] x y))
(EqualUint64x8 x y) => (VPMOVMToVec64x8 (VPCMPEQQ512 x y))
(EqualMaskedFloat32x4 x y mask) => (VPMOVMToVec32x4 (VCMPPSMasked128 [0] x y (VPMOVVec32x4ToM <types.TypeMask> mask)))
(EqualMaskedFloat32x8 x y mask) => (VPMOVMToVec32x8 (VCMPPSMasked256 [0] x y (VPMOVVec32x8ToM <types.TypeMask> mask)))
(EqualMaskedFloat32x16 x y mask) => (VPMOVMToVec32x16 (VCMPPSMasked512 [0] x y (VPMOVVec32x16ToM <types.TypeMask> mask)))
@@ -453,16 +453,16 @@
(GreaterFloat64x8 x y) => (VPMOVMToVec64x8 (VCMPPD512 [14] x y))
(GreaterInt8x16 ...) => (VPCMPGTB128 ...)
(GreaterInt8x32 ...) => (VPCMPGTB256 ...)
(GreaterInt8x64 x y) => (VPMOVMToVec8x64 (VPCMPB512 [14] x y))
(GreaterInt8x64 x y) => (VPMOVMToVec8x64 (VPCMPGTB512 x y))
(GreaterInt16x8 ...) => (VPCMPGTW128 ...)
(GreaterInt16x16 ...) => (VPCMPGTW256 ...)
(GreaterInt16x32 x y) => (VPMOVMToVec16x32 (VPCMPW512 [14] x y))
(GreaterInt16x32 x y) => (VPMOVMToVec16x32 (VPCMPGTW512 x y))
(GreaterInt32x4 ...) => (VPCMPGTD128 ...)
(GreaterInt32x8 ...) => (VPCMPGTD256 ...)
(GreaterInt32x16 x y) => (VPMOVMToVec32x16 (VPCMPD512 [14] x y))
(GreaterInt32x16 x y) => (VPMOVMToVec32x16 (VPCMPGTD512 x y))
(GreaterInt64x2 ...) => (VPCMPGTQ128 ...)
(GreaterInt64x4 ...) => (VPCMPGTQ256 ...)
(GreaterInt64x8 x y) => (VPMOVMToVec64x8 (VPCMPQ512 [14] x y))
(GreaterInt64x8 x y) => (VPMOVMToVec64x8 (VPCMPGTQ512 x y))
(GreaterUint8x16 x y) => (VPMOVMToVec8x16 (VPCMPUB128 [14] x y))
(GreaterUint8x32 x y) => (VPMOVMToVec8x32 (VPCMPUB256 [14] x y))
(GreaterUint8x64 x y) => (VPMOVMToVec8x64 (VPCMPUB512 [14] x y))

@@ -33,7 +33,7 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VADDPS128", argLength: 2, reg: v21, asm: "VADDPS", commutative: true, typ: "Vec128", resultInArg0: false},
{name: "VADDPSMasked128", argLength: 3, reg: w2kw, asm: "VADDPS", commutative: true, typ: "Vec128", resultInArg0: false},
{name: "VADDSUBPS128", argLength: 2, reg: v21, asm: "VADDSUBPS", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VRCP14PS128", argLength: 1, reg: w11, asm: "VRCP14PS", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VRCPPS128", argLength: 1, reg: v11, asm: "VRCPPS", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VRCP14PSMasked128", argLength: 2, reg: wkw, asm: "VRCP14PS", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VRSQRTPS128", argLength: 1, reg: v11, asm: "VRSQRTPS", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VRSQRT14PSMasked128", argLength: 2, reg: wkw, asm: "VRSQRT14PS", commutative: false, typ: "Vec128", resultInArg0: false},
@@ -63,7 +63,7 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VADDPS256", argLength: 2, reg: v21, asm: "VADDPS", commutative: true, typ: "Vec256", resultInArg0: false},
{name: "VADDPSMasked256", argLength: 3, reg: w2kw, asm: "VADDPS", commutative: true, typ: "Vec256", resultInArg0: false},
{name: "VADDSUBPS256", argLength: 2, reg: v21, asm: "VADDSUBPS", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VRCP14PS256", argLength: 1, reg: w11, asm: "VRCP14PS", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VRCPPS256", argLength: 1, reg: v11, asm: "VRCPPS", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VRCP14PSMasked256", argLength: 2, reg: wkw, asm: "VRCP14PS", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VRSQRTPS256", argLength: 1, reg: v11, asm: "VRSQRTPS", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VRSQRT14PSMasked256", argLength: 2, reg: wkw, asm: "VRSQRT14PS", commutative: false, typ: "Vec256", resultInArg0: false},
@@ -224,6 +224,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPADDW512", argLength: 2, reg: w21, asm: "VPADDW", commutative: true, typ: "Vec512", resultInArg0: false},
{name: "VPADDWMasked512", argLength: 3, reg: w2kw, asm: "VPADDW", commutative: true, typ: "Vec512", resultInArg0: false},
{name: "VPCOMPRESSWMasked512", argLength: 2, reg: wkw, asm: "VPCOMPRESSW", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPCMPEQW512", argLength: 2, reg: w2k, asm: "VPCMPEQW", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPGTW512", argLength: 2, reg: w2k, asm: "VPCMPGTW", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPMAXSW512", argLength: 2, reg: w21, asm: "VPMAXSW", commutative: true, typ: "Vec512", resultInArg0: false},
{name: "VPMAXSWMasked512", argLength: 3, reg: w2kw, asm: "VPMAXSW", commutative: true, typ: "Vec512", resultInArg0: false},
{name: "VPMINSW512", argLength: 2, reg: w21, asm: "VPMINSW", commutative: true, typ: "Vec512", resultInArg0: false},
@@ -305,6 +307,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPANDND512", argLength: 2, reg: w21, asm: "VPANDND", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPANDNDMasked512", argLength: 3, reg: w2kw, asm: "VPANDND", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPCOMPRESSDMasked512", argLength: 2, reg: wkw, asm: "VPCOMPRESSD", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPCMPEQD512", argLength: 2, reg: w2k, asm: "VPCMPEQD", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPGTD512", argLength: 2, reg: w2k, asm: "VPCMPGTD", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPMAXSD512", argLength: 2, reg: w21, asm: "VPMAXSD", commutative: true, typ: "Vec512", resultInArg0: false},
{name: "VPMAXSDMasked512", argLength: 3, reg: w2kw, asm: "VPMAXSD", commutative: true, typ: "Vec512", resultInArg0: false},
{name: "VPMINSD512", argLength: 2, reg: w21, asm: "VPMINSD", commutative: true, typ: "Vec512", resultInArg0: false},
@@ -526,6 +530,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPANDNQ512", argLength: 2, reg: w21, asm: "VPANDNQ", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPANDNQMasked512", argLength: 3, reg: w2kw, asm: "VPANDNQ", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPCOMPRESSQMasked512", argLength: 2, reg: wkw, asm: "VPCOMPRESSQ", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPCMPEQQ512", argLength: 2, reg: w2k, asm: "VPCMPEQQ", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPGTQ512", argLength: 2, reg: w2k, asm: "VPCMPGTQ", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPMAXSQ512", argLength: 2, reg: w21, asm: "VPMAXSQ", commutative: true, typ: "Vec512", resultInArg0: false},
{name: "VPMAXSQMasked512", argLength: 3, reg: w2kw, asm: "VPMAXSQ", commutative: true, typ: "Vec512", resultInArg0: false},
{name: "VPMINSQ512", argLength: 2, reg: w21, asm: "VPMINSQ", commutative: true, typ: "Vec512", resultInArg0: false},
@@ -611,6 +617,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPADDB512", argLength: 2, reg: w21, asm: "VPADDB", commutative: true, typ: "Vec512", resultInArg0: false},
{name: "VPADDBMasked512", argLength: 3, reg: w2kw, asm: "VPADDB", commutative: true, typ: "Vec512", resultInArg0: false},
{name: "VPCOMPRESSBMasked512", argLength: 2, reg: wkw, asm: "VPCOMPRESSB", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPCMPEQB512", argLength: 2, reg: w2k, asm: "VPCMPEQB", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPGTB512", argLength: 2, reg: w2k, asm: "VPCMPGTB", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPMAXSB512", argLength: 2, reg: w21, asm: "VPMAXSB", commutative: true, typ: "Vec512", resultInArg0: false},
{name: "VPMAXSBMasked512", argLength: 3, reg: w2kw, asm: "VPMAXSB", commutative: true, typ: "Vec512", resultInArg0: false},
{name: "VPMINSB512", argLength: 2, reg: w21, asm: "VPMINSB", commutative: true, typ: "Vec512", resultInArg0: false},
@@ -692,10 +700,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPMINUD128", argLength: 2, reg: v21, asm: "VPMINUD", commutative: true, typ: "Vec128", resultInArg0: false},
{name: "VPMINUDMasked128", argLength: 3, reg: w2kw, asm: "VPMINUD", commutative: true, typ: "Vec128", resultInArg0: false},
{name: "VPMULUDQ128", argLength: 2, reg: v21, asm: "VPMULUDQ", commutative: true, typ: "Vec128", resultInArg0: false},
{name: "VPERMI2D128", argLength: 3, reg: w31, asm: "VPERMI2D", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "VPERMI2PS128", argLength: 3, reg: w31, asm: "VPERMI2PS", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "VPERMI2PSMasked128", argLength: 4, reg: w3kw, asm: "VPERMI2PS", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "VPERMI2D128", argLength: 3, reg: w31, asm: "VPERMI2D", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "VPERMI2DMasked128", argLength: 4, reg: w3kw, asm: "VPERMI2D", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "VPERMI2PSMasked128", argLength: 4, reg: w3kw, asm: "VPERMI2PS", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "VPSRLD128", argLength: 2, reg: vfpv, asm: "VPSRLD", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPSRLDMasked128", argLength: 3, reg: wfpkw, asm: "VPSRLD", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPSRLVD128", argLength: 2, reg: v21, asm: "VPSRLVD", commutative: false, typ: "Vec128", resultInArg0: false},
@@ -705,12 +713,12 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPMINUD256", argLength: 2, reg: v21, asm: "VPMINUD", commutative: true, typ: "Vec256", resultInArg0: false},
{name: "VPMINUDMasked256", argLength: 3, reg: w2kw, asm: "VPMINUD", commutative: true, typ: "Vec256", resultInArg0: false},
{name: "VPMULUDQ256", argLength: 2, reg: v21, asm: "VPMULUDQ", commutative: true, typ: "Vec256", resultInArg0: false},
{name: "VPERMPS256", argLength: 2, reg: v21, asm: "VPERMPS", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPERMD256", argLength: 2, reg: v21, asm: "VPERMD", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPERMI2D256", argLength: 3, reg: w31, asm: "VPERMI2D", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPERMPS256", argLength: 2, reg: v21, asm: "VPERMPS", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPERMI2PS256", argLength: 3, reg: w31, asm: "VPERMI2PS", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPERMI2DMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2D", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPERMI2D256", argLength: 3, reg: w31, asm: "VPERMI2D", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPERMI2PSMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2PS", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPERMI2DMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2D", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPERMPSMasked256", argLength: 3, reg: w2kw, asm: "VPERMPS", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPERMDMasked256", argLength: 3, reg: w2kw, asm: "VPERMD", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPSRLD256", argLength: 2, reg: vfpv, asm: "VPSRLD", commutative: false, typ: "Vec256", resultInArg0: false},
@@ -735,10 +743,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPMINUQ256", argLength: 2, reg: w21, asm: "VPMINUQ", commutative: true, typ: "Vec256", resultInArg0: false},
{name: "VPMINUQMasked256", argLength: 3, reg: w2kw, asm: "VPMINUQ", commutative: true, typ: "Vec256", resultInArg0: false},
{name: "VPMULUDQMasked256", argLength: 3, reg: w2kw, asm: "VPMULUDQ", commutative: true, typ: "Vec256", resultInArg0: false},
{name: "VPERMQ256", argLength: 2, reg: w21, asm: "VPERMQ", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPERMPD256", argLength: 2, reg: w21, asm: "VPERMPD", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPERMI2Q256", argLength: 3, reg: w31, asm: "VPERMI2Q", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPERMQ256", argLength: 2, reg: w21, asm: "VPERMQ", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPERMI2PD256", argLength: 3, reg: w31, asm: "VPERMI2PD", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPERMI2Q256", argLength: 3, reg: w31, asm: "VPERMI2Q", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPERMI2PDMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2PD", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPERMI2QMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2Q", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPERMQMasked256", argLength: 3, reg: w2kw, asm: "VPERMQ", commutative: false, typ: "Vec256", resultInArg0: false},
@@ -759,8 +767,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPERMI2PD512", argLength: 3, reg: w31, asm: "VPERMI2PD", commutative: false, typ: "Vec512", resultInArg0: true},
{name: "VPERMI2QMasked512", argLength: 4, reg: w3kw, asm: "VPERMI2Q", commutative: false, typ: "Vec512", resultInArg0: true},
{name: "VPERMI2PDMasked512", argLength: 4, reg: w3kw, asm: "VPERMI2PD", commutative: false, typ: "Vec512", resultInArg0: true},
{name: "VPERMQMasked512", argLength: 3, reg: w2kw, asm: "VPERMQ", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPERMPDMasked512", argLength: 3, reg: w2kw, asm: "VPERMPD", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPERMQMasked512", argLength: 3, reg: w2kw, asm: "VPERMQ", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPSRLQ512", argLength: 2, reg: wfpw, asm: "VPSRLQ", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPSRLQMasked512", argLength: 3, reg: wfpkw, asm: "VPSRLQ", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPSRLVQ512", argLength: 2, reg: w21, asm: "VPSRLVQ", commutative: false, typ: "Vec512", resultInArg0: false},
@@ -858,8 +866,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPSHLDWMasked256", argLength: 3, reg: w2kw, asm: "VPSHLDW", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPSHRDW256", argLength: 2, reg: w21, asm: "VPSHRDW", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPSHRDWMasked256", argLength: 3, reg: w2kw, asm: "VPSHRDW", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPCMPW512", argLength: 2, reg: w2k, asm: "VPCMPW", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPWMasked512", argLength: 3, reg: w2kk, asm: "VPCMPW", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPW512", argLength: 2, reg: w2k, asm: "VPCMPW", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPSHLDW512", argLength: 2, reg: w21, asm: "VPSHLDW", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPSHLDWMasked512", argLength: 3, reg: w2kw, asm: "VPSHLDW", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPSHRDW512", argLength: 2, reg: w21, asm: "VPSHRDW", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false},
@@ -872,8 +880,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPSHLDWMasked128", argLength: 3, reg: w2kw, asm: "VPSHLDW", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPSHRDW128", argLength: 2, reg: w21, asm: "VPSHRDW", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPSHRDWMasked128", argLength: 3, reg: w2kw, asm: "VPSHRDW", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPCMPD512", argLength: 2, reg: w2k, asm: "VPCMPD", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPDMasked512", argLength: 3, reg: w2kk, asm: "VPCMPD", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPD512", argLength: 2, reg: w2k, asm: "VPCMPD", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPROLD512", argLength: 1, reg: w11, asm: "VPROLD", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPROLDMasked512", argLength: 2, reg: wkw, asm: "VPROLD", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPRORD512", argLength: 1, reg: w11, asm: "VPRORD", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false},
@@ -926,8 +934,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPSHLDQMasked256", argLength: 3, reg: w2kw, asm: "VPSHLDQ", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPSHRDQ256", argLength: 2, reg: w21, asm: "VPSHRDQ", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPSHRDQMasked256", argLength: 3, reg: w2kw, asm: "VPSHRDQ", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPCMPQ512", argLength: 2, reg: w2k, asm: "VPCMPQ", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPQMasked512", argLength: 3, reg: w2kk, asm: "VPCMPQ", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPQ512", argLength: 2, reg: w2k, asm: "VPCMPQ", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPROLQ512", argLength: 1, reg: w11, asm: "VPROLQ", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPROLQMasked512", argLength: 2, reg: wkw, asm: "VPROLQ", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPRORQ512", argLength: 1, reg: w11, asm: "VPRORQ", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false},
@@ -944,16 +952,16 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VEXTRACTI128128", argLength: 1, reg: v11, asm: "VEXTRACTI128", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPCMPB256", argLength: 2, reg: w2k, asm: "VPCMPB", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VINSERTI128256", argLength: 2, reg: v21, asm: "VINSERTI128", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPCMPB512", argLength: 2, reg: w2k, asm: "VPCMPB", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPBMasked512", argLength: 3, reg: w2kk, asm: "VPCMPB", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPB512", argLength: 2, reg: w2k, asm: "VPCMPB", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPCMPUWMasked256", argLength: 3, reg: w2kk, asm: "VPCMPUW", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPUW256", argLength: 2, reg: w2k, asm: "VPCMPUW", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPCMPUW512", argLength: 2, reg: w2k, asm: "VPCMPUW", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPUWMasked512", argLength: 3, reg: w2kk, asm: "VPCMPUW", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPUW512", argLength: 2, reg: w2k, asm: "VPCMPUW", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPCMPUWMasked128", argLength: 3, reg: w2kk, asm: "VPCMPUW", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPUW128", argLength: 2, reg: w2k, asm: "VPCMPUW", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPCMPUD512", argLength: 2, reg: w2k, asm: "VPCMPUD", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPUDMasked512", argLength: 3, reg: w2kk, asm: "VPCMPUD", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPUD512", argLength: 2, reg: w2k, asm: "VPCMPUD", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPCMPUDMasked128", argLength: 3, reg: w2kk, asm: "VPCMPUD", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPUD128", argLength: 2, reg: w2k, asm: "VPCMPUD", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPCMPUDMasked256", argLength: 3, reg: w2kk, asm: "VPCMPUD", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
@@ -962,8 +970,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPCMPUQ128", argLength: 2, reg: w2k, asm: "VPCMPUQ", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPCMPUQMasked256", argLength: 3, reg: w2kk, asm: "VPCMPUQ", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPUQ256", argLength: 2, reg: w2k, asm: "VPCMPUQ", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPCMPUQ512", argLength: 2, reg: w2k, asm: "VPCMPUQ", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPUQMasked512", argLength: 3, reg: w2kk, asm: "VPCMPUQ", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPUQ512", argLength: 2, reg: w2k, asm: "VPCMPUQ", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPCMPUBMasked128", argLength: 3, reg: w2kk, asm: "VPCMPUB", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VGF2P8AFFINEQB128", argLength: 2, reg: w21, asm: "VGF2P8AFFINEQB", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VGF2P8AFFINEINVQB128", argLength: 2, reg: w21, asm: "VGF2P8AFFINEINVQB", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false},
@@ -976,11 +984,11 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VGF2P8AFFINEINVQBMasked256", argLength: 3, reg: w2kw, asm: "VGF2P8AFFINEINVQB", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VGF2P8AFFINEQBMasked256", argLength: 3, reg: w2kw, asm: "VGF2P8AFFINEQB", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPCMPUB256", argLength: 2, reg: w2k, asm: "VPCMPUB", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPCMPUB512", argLength: 2, reg: w2k, asm: "VPCMPUB", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPUBMasked512", argLength: 3, reg: w2kk, asm: "VPCMPUB", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VGF2P8AFFINEQB512", argLength: 2, reg: w21, asm: "VGF2P8AFFINEQB", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VGF2P8AFFINEINVQB512", argLength: 2, reg: w21, asm: "VGF2P8AFFINEINVQB", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VGF2P8AFFINEINVQBMasked512", argLength: 3, reg: w2kw, asm: "VGF2P8AFFINEINVQB", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VGF2P8AFFINEQBMasked512", argLength: 3, reg: w2kw, asm: "VGF2P8AFFINEQB", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPCMPUB512", argLength: 2, reg: w2k, asm: "VPCMPUB", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
}
}

@@ -912,10 +912,10 @@ func simdGenericOps() []opData {
{name: "PermuteUint16x16", argLength: 2, commutative: false},
{name: "Permute2Uint16x16", argLength: 3, commutative: false},
{name: "Permute2Int16x16", argLength: 3, commutative: false},
{name: "Permute2MaskedInt16x16", argLength: 4, commutative: false},
{name: "Permute2MaskedUint16x16", argLength: 4, commutative: false},
{name: "PermuteMaskedUint16x16", argLength: 3, commutative: false},
{name: "Permute2MaskedInt16x16", argLength: 4, commutative: false},
{name: "PermuteMaskedInt16x16", argLength: 3, commutative: false},
{name: "PermuteMaskedUint16x16", argLength: 3, commutative: false},
{name: "PopCountUint16x16", argLength: 1, commutative: false},
{name: "PopCountMaskedUint16x16", argLength: 2, commutative: false},
{name: "SaturatedAddUint16x16", argLength: 2, commutative: true},
@@ -966,8 +966,8 @@ func simdGenericOps() []opData {
{name: "Permute2Int16x32", argLength: 3, commutative: false},
{name: "Permute2MaskedUint16x32", argLength: 4, commutative: false},
{name: "Permute2MaskedInt16x32", argLength: 4, commutative: false},
{name: "PermuteMaskedUint16x32", argLength: 3, commutative: false},
{name: "PermuteMaskedInt16x32", argLength: 3, commutative: false},
{name: "PermuteMaskedUint16x32", argLength: 3, commutative: false},
{name: "PopCountUint16x32", argLength: 1, commutative: false},
{name: "PopCountMaskedUint16x32", argLength: 2, commutative: false},
{name: "SaturatedAddUint16x32", argLength: 2, commutative: true},
@@ -1018,12 +1018,12 @@ func simdGenericOps() []opData {
{name: "PairwiseSubUint16x8", argLength: 2, commutative: false},
{name: "PermuteInt16x8", argLength: 2, commutative: false},
{name: "PermuteUint16x8", argLength: 2, commutative: false},
{name: "Permute2Int16x8", argLength: 3, commutative: false},
{name: "Permute2Uint16x8", argLength: 3, commutative: false},
{name: "Permute2Int16x8", argLength: 3, commutative: false},
{name: "Permute2MaskedInt16x8", argLength: 4, commutative: false},
{name: "Permute2MaskedUint16x8", argLength: 4, commutative: false},
{name: "PermuteMaskedUint16x8", argLength: 3, commutative: false},
{name: "PermuteMaskedInt16x8", argLength: 3, commutative: false},
{name: "PermuteMaskedUint16x8", argLength: 3, commutative: false},
{name: "PopCountUint16x8", argLength: 1, commutative: false},
{name: "PopCountMaskedUint16x8", argLength: 2, commutative: false},
{name: "SaturatedAddUint16x8", argLength: 2, commutative: true},
@@ -1070,17 +1070,17 @@ func simdGenericOps() []opData {
{name: "NotEqualMaskedUint32x16", argLength: 3, commutative: true},
{name: "OrUint32x16", argLength: 2, commutative: true},
{name: "OrMaskedUint32x16", argLength: 3, commutative: true},
{name: "PermuteFloat32x16", argLength: 2, commutative: false},
{name: "PermuteInt32x16", argLength: 2, commutative: false},
{name: "PermuteFloat32x16", argLength: 2, commutative: false},
{name: "PermuteUint32x16", argLength: 2, commutative: false},
{name: "Permute2Uint32x16", argLength: 3, commutative: false},
{name: "Permute2Float32x16", argLength: 3, commutative: false},
{name: "Permute2Int32x16", argLength: 3, commutative: false},
{name: "Permute2MaskedUint32x16", argLength: 4, commutative: false},
{name: "Permute2MaskedInt32x16", argLength: 4, commutative: false},
{name: "Permute2MaskedFloat32x16", argLength: 4, commutative: false},
{name: "Permute2MaskedUint32x16", argLength: 4, commutative: false},
{name: "PermuteMaskedInt32x16", argLength: 3, commutative: false},
{name: "PermuteMaskedFloat32x16", argLength: 3, commutative: false},
{name: "PermuteMaskedInt32x16", argLength: 3, commutative: false},
{name: "PermuteMaskedUint32x16", argLength: 3, commutative: false},
{name: "PopCountUint32x16", argLength: 1, commutative: false},
{name: "PopCountMaskedUint32x16", argLength: 2, commutative: false},
@@ -1307,15 +1307,15 @@ func simdGenericOps() []opData {
{name: "PermuteUint64x4", argLength: 2, commutative: false},
{name: "PermuteInt64x4", argLength: 2, commutative: false},
{name: "PermuteFloat64x4", argLength: 2, commutative: false},
{name: "Permute2Float64x4", argLength: 3, commutative: false},
{name: "Permute2Int64x4", argLength: 3, commutative: false},
{name: "Permute2Uint64x4", argLength: 3, commutative: false},
{name: "Permute2MaskedFloat64x4", argLength: 4, commutative: false},
{name: "Permute2Int64x4", argLength: 3, commutative: false},
{name: "Permute2Float64x4", argLength: 3, commutative: false},
{name: "Permute2MaskedUint64x4", argLength: 4, commutative: false},
{name: "Permute2MaskedFloat64x4", argLength: 4, commutative: false},
{name: "Permute2MaskedInt64x4", argLength: 4, commutative: false},
{name: "PermuteMaskedUint64x4", argLength: 3, commutative: false},
{name: "PermuteMaskedFloat64x4", argLength: 3, commutative: false},
{name: "PermuteMaskedInt64x4", argLength: 3, commutative: false},
{name: "PermuteMaskedUint64x4", argLength: 3, commutative: false},
{name: "PopCountUint64x4", argLength: 1, commutative: false},
{name: "PopCountMaskedUint64x4", argLength: 2, commutative: false},
{name: "RotateLeftUint64x4", argLength: 2, commutative: false},
@@ -1365,18 +1365,18 @@ func simdGenericOps() []opData {
{name: "NotEqualMaskedUint64x8", argLength: 3, commutative: true},
{name: "OrUint64x8", argLength: 2, commutative: true},
{name: "OrMaskedUint64x8", argLength: 3, commutative: true},
{name: "PermuteUint64x8", argLength: 2, commutative: false},
{name: "PermuteFloat64x8", argLength: 2, commutative: false},
{name: "PermuteInt64x8", argLength: 2, commutative: false},
{name: "PermuteUint64x8", argLength: 2, commutative: false},
{name: "Permute2Int64x8", argLength: 3, commutative: false},
{name: "Permute2Float64x8", argLength: 3, commutative: false},
{name: "Permute2Uint64x8", argLength: 3, commutative: false},
{name: "Permute2Int64x8", argLength: 3, commutative: false},
{name: "Permute2MaskedFloat64x8", argLength: 4, commutative: false},
{name: "Permute2MaskedUint64x8", argLength: 4, commutative: false},
{name: "Permute2MaskedInt64x8", argLength: 4, commutative: false},
{name: "Permute2MaskedFloat64x8", argLength: 4, commutative: false},
{name: "PermuteMaskedUint64x8", argLength: 3, commutative: false},
{name: "PermuteMaskedFloat64x8", argLength: 3, commutative: false},
{name: "PermuteMaskedInt64x8", argLength: 3, commutative: false},
{name: "PermuteMaskedFloat64x8", argLength: 3, commutative: false},
{name: "PermuteMaskedUint64x8", argLength: 3, commutative: false},
{name: "PopCountUint64x8", argLength: 1, commutative: false},
{name: "PopCountMaskedUint64x8", argLength: 2, commutative: false},
{name: "RotateLeftUint64x8", argLength: 2, commutative: false},

File diff suppressed because it is too large.

@@ -985,10 +985,10 @@ func rewriteValueAMD64(v *Value) bool {
v.Op = OpAMD64VRCP14PS512
return true
case OpApproximateReciprocalFloat32x4:
v.Op = OpAMD64VRCP14PS128
v.Op = OpAMD64VRCPPS128
return true
case OpApproximateReciprocalFloat32x8:
v.Op = OpAMD64VRCP14PS256
v.Op = OpAMD64VRCPPS256
return true
case OpApproximateReciprocalFloat64x2:
v.Op = OpAMD64VRCP14PD128
@@ -5184,6 +5184,30 @@ func rewriteValueAMD64(v *Value) bool {
return true
case OpStore:
return rewriteValueAMD64_OpStore(v)
case OpStoreMask16x16:
return rewriteValueAMD64_OpStoreMask16x16(v)
case OpStoreMask16x32:
return rewriteValueAMD64_OpStoreMask16x32(v)
case OpStoreMask16x8:
return rewriteValueAMD64_OpStoreMask16x8(v)
case OpStoreMask32x16:
return rewriteValueAMD64_OpStoreMask32x16(v)
case OpStoreMask32x4:
return rewriteValueAMD64_OpStoreMask32x4(v)
case OpStoreMask32x8:
return rewriteValueAMD64_OpStoreMask32x8(v)
case OpStoreMask64x2:
return rewriteValueAMD64_OpStoreMask64x2(v)
case OpStoreMask64x4:
return rewriteValueAMD64_OpStoreMask64x4(v)
case OpStoreMask64x8:
return rewriteValueAMD64_OpStoreMask64x8(v)
case OpStoreMask8x16:
return rewriteValueAMD64_OpStoreMask8x16(v)
case OpStoreMask8x32:
return rewriteValueAMD64_OpStoreMask8x32(v)
case OpStoreMask8x64:
return rewriteValueAMD64_OpStoreMask8x64(v)
case OpSub16:
v.Op = OpAMD64SUBL
return true
@@ -33388,13 +33412,12 @@ func rewriteValueAMD64_OpEqualInt16x32(v *Value) bool {
b := v.Block
typ := &b.Func.Config.Types
// match: (EqualInt16x32 x y)
// result: (VPMOVMToVec16x32 (VPCMPW512 [0] x y))
// result: (VPMOVMToVec16x32 (VPCMPEQW512 x y))
for {
x := v_0
y := v_1
v.reset(OpAMD64VPMOVMToVec16x32)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPW512, typ.Mask)
v0.AuxInt = int8ToAuxInt(0)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQW512, typ.Mask)
v0.AddArg2(x, y)
v.AddArg(v0)
return true
@@ -33406,13 +33429,12 @@ func rewriteValueAMD64_OpEqualInt32x16(v *Value) bool {
b := v.Block
typ := &b.Func.Config.Types
// match: (EqualInt32x16 x y)
// result: (VPMOVMToVec32x16 (VPCMPD512 [0] x y))
// result: (VPMOVMToVec32x16 (VPCMPEQD512 x y))
for {
x := v_0
y := v_1
v.reset(OpAMD64VPMOVMToVec32x16)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPD512, typ.Mask)
v0.AuxInt = int8ToAuxInt(0)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQD512, typ.Mask)
v0.AddArg2(x, y)
v.AddArg(v0)
return true
@@ -33424,13 +33446,12 @@ func rewriteValueAMD64_OpEqualInt64x8(v *Value) bool {
b := v.Block
typ := &b.Func.Config.Types
// match: (EqualInt64x8 x y)
// result: (VPMOVMToVec64x8 (VPCMPQ512 [0] x y))
// result: (VPMOVMToVec64x8 (VPCMPEQQ512 x y))
for {
x := v_0
y := v_1
v.reset(OpAMD64VPMOVMToVec64x8)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPQ512, typ.Mask)
v0.AuxInt = int8ToAuxInt(0)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQQ512, typ.Mask)
v0.AddArg2(x, y)
v.AddArg(v0)
return true
@@ -33442,13 +33463,12 @@ func rewriteValueAMD64_OpEqualInt8x64(v *Value) bool {
b := v.Block
typ := &b.Func.Config.Types
// match: (EqualInt8x64 x y)
// result: (VPMOVMToVec8x64 (VPCMPB512 [0] x y))
// result: (VPMOVMToVec8x64 (VPCMPEQB512 x y))
for {
x := v_0
y := v_1
v.reset(OpAMD64VPMOVMToVec8x64)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPB512, typ.Mask)
v0.AuxInt = int8ToAuxInt(0)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQB512, typ.Mask)
v0.AddArg2(x, y)
v.AddArg(v0)
return true
@@ -34120,13 +34140,12 @@ func rewriteValueAMD64_OpEqualUint16x32(v *Value) bool {
b := v.Block
typ := &b.Func.Config.Types
// match: (EqualUint16x32 x y)
// result: (VPMOVMToVec16x32 (VPCMPUW512 [0] x y))
// result: (VPMOVMToVec16x32 (VPCMPEQW512 x y))
for {
x := v_0
y := v_1
v.reset(OpAMD64VPMOVMToVec16x32)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPUW512, typ.Mask)
v0.AuxInt = int8ToAuxInt(0)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQW512, typ.Mask)
v0.AddArg2(x, y)
v.AddArg(v0)
return true
@@ -34138,13 +34157,12 @@ func rewriteValueAMD64_OpEqualUint32x16(v *Value) bool {
b := v.Block
typ := &b.Func.Config.Types
// match: (EqualUint32x16 x y)
// result: (VPMOVMToVec32x16 (VPCMPUD512 [0] x y))
// result: (VPMOVMToVec32x16 (VPCMPEQD512 x y))
for {
x := v_0
y := v_1
v.reset(OpAMD64VPMOVMToVec32x16)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPUD512, typ.Mask)
v0.AuxInt = int8ToAuxInt(0)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQD512, typ.Mask)
v0.AddArg2(x, y)
v.AddArg(v0)
return true
@@ -34156,13 +34174,12 @@ func rewriteValueAMD64_OpEqualUint64x8(v *Value) bool {
b := v.Block
typ := &b.Func.Config.Types
// match: (EqualUint64x8 x y)
// result: (VPMOVMToVec64x8 (VPCMPUQ512 [0] x y))
// result: (VPMOVMToVec64x8 (VPCMPEQQ512 x y))
for {
x := v_0
y := v_1
v.reset(OpAMD64VPMOVMToVec64x8)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPUQ512, typ.Mask)
v0.AuxInt = int8ToAuxInt(0)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQQ512, typ.Mask)
v0.AddArg2(x, y)
v.AddArg(v0)
return true
@@ -34174,13 +34191,12 @@ func rewriteValueAMD64_OpEqualUint8x64(v *Value) bool {
b := v.Block
typ := &b.Func.Config.Types
// match: (EqualUint8x64 x y)
// result: (VPMOVMToVec8x64 (VPCMPUB512 [0] x y))
// result: (VPMOVMToVec8x64 (VPCMPEQB512 x y))
for {
x := v_0
y := v_1
v.reset(OpAMD64VPMOVMToVec8x64)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPUB512, typ.Mask)
v0.AuxInt = int8ToAuxInt(0)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQB512, typ.Mask)
v0.AddArg2(x, y)
v.AddArg(v0)
return true
@@ -36279,13 +36295,12 @@ func rewriteValueAMD64_OpGreaterInt16x32(v *Value) bool {
b := v.Block
typ := &b.Func.Config.Types
// match: (GreaterInt16x32 x y)
// result: (VPMOVMToVec16x32 (VPCMPW512 [14] x y))
// result: (VPMOVMToVec16x32 (VPCMPGTW512 x y))
for {
x := v_0
y := v_1
v.reset(OpAMD64VPMOVMToVec16x32)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPW512, typ.Mask)
v0.AuxInt = int8ToAuxInt(14)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPGTW512, typ.Mask)
v0.AddArg2(x, y)
v.AddArg(v0)
return true
@@ -36297,13 +36312,12 @@ func rewriteValueAMD64_OpGreaterInt32x16(v *Value) bool {
b := v.Block
typ := &b.Func.Config.Types
// match: (GreaterInt32x16 x y)
// result: (VPMOVMToVec32x16 (VPCMPD512 [14] x y))
// result: (VPMOVMToVec32x16 (VPCMPGTD512 x y))
for {
x := v_0
y := v_1
v.reset(OpAMD64VPMOVMToVec32x16)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPD512, typ.Mask)
v0.AuxInt = int8ToAuxInt(14)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPGTD512, typ.Mask)
v0.AddArg2(x, y)
v.AddArg(v0)
return true
@@ -36315,13 +36329,12 @@ func rewriteValueAMD64_OpGreaterInt64x8(v *Value) bool {
b := v.Block
typ := &b.Func.Config.Types
// match: (GreaterInt64x8 x y)
// result: (VPMOVMToVec64x8 (VPCMPQ512 [14] x y))
// result: (VPMOVMToVec64x8 (VPCMPGTQ512 x y))
for {
x := v_0
y := v_1
v.reset(OpAMD64VPMOVMToVec64x8)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPQ512, typ.Mask)
v0.AuxInt = int8ToAuxInt(14)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPGTQ512, typ.Mask)
v0.AddArg2(x, y)
v.AddArg(v0)
return true
@@ -36333,13 +36346,12 @@ func rewriteValueAMD64_OpGreaterInt8x64(v *Value) bool {
b := v.Block
typ := &b.Func.Config.Types
// match: (GreaterInt8x64 x y)
// result: (VPMOVMToVec8x64 (VPCMPB512 [14] x y))
// result: (VPMOVMToVec8x64 (VPCMPGTB512 x y))
for {
x := v_0
y := v_1
v.reset(OpAMD64VPMOVMToVec8x64)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPB512, typ.Mask)
v0.AuxInt = int8ToAuxInt(14)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPGTB512, typ.Mask)
v0.AddArg2(x, y)
v.AddArg(v0)
return true
@@ -53277,6 +53289,234 @@ func rewriteValueAMD64_OpStore(v *Value) bool {
}
return false
}
func rewriteValueAMD64_OpStoreMask16x16(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (StoreMask16x16 {t} ptr val mem)
// result: (KMOVQstore ptr (VPMOVVec16x16ToM <t> val) mem)
for {
t := auxToType(v.Aux)
ptr := v_0
val := v_1
mem := v_2
v.reset(OpAMD64KMOVQstore)
v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, t)
v0.AddArg(val)
v.AddArg3(ptr, v0, mem)
return true
}
}
func rewriteValueAMD64_OpStoreMask16x32(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (StoreMask16x32 {t} ptr val mem)
// result: (KMOVQstore ptr (VPMOVVec16x32ToM <t> val) mem)
for {
t := auxToType(v.Aux)
ptr := v_0
val := v_1
mem := v_2
v.reset(OpAMD64KMOVQstore)
v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, t)
v0.AddArg(val)
v.AddArg3(ptr, v0, mem)
return true
}
}
func rewriteValueAMD64_OpStoreMask16x8(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (StoreMask16x8 {t} ptr val mem)
// result: (KMOVQstore ptr (VPMOVVec16x8ToM <t> val) mem)
for {
t := auxToType(v.Aux)
ptr := v_0
val := v_1
mem := v_2
v.reset(OpAMD64KMOVQstore)
v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, t)
v0.AddArg(val)
v.AddArg3(ptr, v0, mem)
return true
}
}
func rewriteValueAMD64_OpStoreMask32x16(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (StoreMask32x16 {t} ptr val mem)
// result: (KMOVQstore ptr (VPMOVVec32x16ToM <t> val) mem)
for {
t := auxToType(v.Aux)
ptr := v_0
val := v_1
mem := v_2
v.reset(OpAMD64KMOVQstore)
v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, t)
v0.AddArg(val)
v.AddArg3(ptr, v0, mem)
return true
}
}
func rewriteValueAMD64_OpStoreMask32x4(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (StoreMask32x4 {t} ptr val mem)
// result: (KMOVQstore ptr (VPMOVVec32x4ToM <t> val) mem)
for {
t := auxToType(v.Aux)
ptr := v_0
val := v_1
mem := v_2
v.reset(OpAMD64KMOVQstore)
v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, t)
v0.AddArg(val)
v.AddArg3(ptr, v0, mem)
return true
}
}
func rewriteValueAMD64_OpStoreMask32x8(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (StoreMask32x8 {t} ptr val mem)
// result: (KMOVQstore ptr (VPMOVVec32x8ToM <t> val) mem)
for {
t := auxToType(v.Aux)
ptr := v_0
val := v_1
mem := v_2
v.reset(OpAMD64KMOVQstore)
v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, t)
v0.AddArg(val)
v.AddArg3(ptr, v0, mem)
return true
}
}
func rewriteValueAMD64_OpStoreMask64x2(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (StoreMask64x2 {t} ptr val mem)
// result: (KMOVQstore ptr (VPMOVVec64x2ToM <t> val) mem)
for {
t := auxToType(v.Aux)
ptr := v_0
val := v_1
mem := v_2
v.reset(OpAMD64KMOVQstore)
v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, t)
v0.AddArg(val)
v.AddArg3(ptr, v0, mem)
return true
}
}
func rewriteValueAMD64_OpStoreMask64x4(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (StoreMask64x4 {t} ptr val mem)
// result: (KMOVQstore ptr (VPMOVVec64x4ToM <t> val) mem)
for {
t := auxToType(v.Aux)
ptr := v_0
val := v_1
mem := v_2
v.reset(OpAMD64KMOVQstore)
v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, t)
v0.AddArg(val)
v.AddArg3(ptr, v0, mem)
return true
}
}
func rewriteValueAMD64_OpStoreMask64x8(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (StoreMask64x8 {t} ptr val mem)
// result: (KMOVQstore ptr (VPMOVVec64x8ToM <t> val) mem)
for {
t := auxToType(v.Aux)
ptr := v_0
val := v_1
mem := v_2
v.reset(OpAMD64KMOVQstore)
v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, t)
v0.AddArg(val)
v.AddArg3(ptr, v0, mem)
return true
}
}
func rewriteValueAMD64_OpStoreMask8x16(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (StoreMask8x16 {t} ptr val mem)
// result: (KMOVQstore ptr (VPMOVVec8x16ToM <t> val) mem)
for {
t := auxToType(v.Aux)
ptr := v_0
val := v_1
mem := v_2
v.reset(OpAMD64KMOVQstore)
v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, t)
v0.AddArg(val)
v.AddArg3(ptr, v0, mem)
return true
}
}
func rewriteValueAMD64_OpStoreMask8x32(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (StoreMask8x32 {t} ptr val mem)
// result: (KMOVQstore ptr (VPMOVVec8x32ToM <t> val) mem)
for {
t := auxToType(v.Aux)
ptr := v_0
val := v_1
mem := v_2
v.reset(OpAMD64KMOVQstore)
v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, t)
v0.AddArg(val)
v.AddArg3(ptr, v0, mem)
return true
}
}
func rewriteValueAMD64_OpStoreMask8x64(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (StoreMask8x64 {t} ptr val mem)
// result: (KMOVQstore ptr (VPMOVVec8x64ToM <t> val) mem)
for {
t := auxToType(v.Aux)
ptr := v_0
val := v_1
mem := v_2
v.reset(OpAMD64KMOVQstore)
v0 := b.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, t)
v0.AddArg(val)
v.AddArg3(ptr, v0, mem)
return true
}
}
func rewriteValueAMD64_OpSubMaskedFloat32x16(v *Value) bool {
v_2 := v.Args[2]
v_1 := v.Args[1]

@@ -1791,6 +1791,23 @@ func simdLoadMask(elemBits, lanes int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
}
}
func simdStoreMask(elemBits, lanes int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
opCodes := map[int]map[int]ssa.Op{
8: {16: ssa.OpStoreMask8x16, 32: ssa.OpStoreMask8x32, 64: ssa.OpStoreMask8x64},
16: {8: ssa.OpStoreMask16x8, 16: ssa.OpStoreMask16x16, 32: ssa.OpStoreMask16x32},
32: {4: ssa.OpStoreMask32x4, 8: ssa.OpStoreMask32x8, 16: ssa.OpStoreMask32x16},
64: {2: ssa.OpStoreMask64x2, 4: ssa.OpStoreMask64x4, 8: ssa.OpStoreMask64x8},
}
op := opCodes[elemBits][lanes]
if op == 0 {
panic(fmt.Sprintf("Unknown mask shape: Mask%dx%d", elemBits, lanes))
}
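// args[0] is the mask receiver and args[1] the destination pointer from the
// Mask.StoreToBits call, so the op's (ptr, val, mem) is (args[1], args[0], s.mem()).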
s.vars[memVar] = s.newValue3A(op, types.TypeMem, types.TypeMask, args[1], args[0], s.mem())
return nil
}
}
// findIntrinsic returns a function which builds the SSA equivalent of the
// function identified by the symbol sym. If sym is not an intrinsic call, returns nil.
func findIntrinsic(sym *types.Sym) intrinsicBuilder {

@@ -310,34 +310,34 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Float64x2.DotProdBroadcast", opLen2(ssa.OpDotProdBroadcastFloat64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int8x16.Equal", opLen2(ssa.OpEqualInt8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int8x32.Equal", opLen2(ssa.OpEqualInt8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x64.Equal", opLen2(ssa.OpEqualInt8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int16x8.Equal", opLen2(ssa.OpEqualInt16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int16x16.Equal", opLen2(ssa.OpEqualInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x32.Equal", opLen2(ssa.OpEqualInt16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int32x4.Equal", opLen2(ssa.OpEqualInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int32x8.Equal", opLen2(ssa.OpEqualInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int32x16.Equal", opLen2(ssa.OpEqualInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int64x2.Equal", opLen2(ssa.OpEqualInt64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int64x4.Equal", opLen2(ssa.OpEqualInt64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int64x8.Equal", opLen2(ssa.OpEqualInt64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint8x16.Equal", opLen2(ssa.OpEqualUint8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint8x32.Equal", opLen2(ssa.OpEqualUint8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint8x64.Equal", opLen2(ssa.OpEqualUint8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint16x8.Equal", opLen2(ssa.OpEqualUint16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint16x16.Equal", opLen2(ssa.OpEqualUint16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint16x32.Equal", opLen2(ssa.OpEqualUint16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint32x4.Equal", opLen2(ssa.OpEqualUint32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint32x8.Equal", opLen2(ssa.OpEqualUint32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint32x16.Equal", opLen2(ssa.OpEqualUint32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint64x2.Equal", opLen2(ssa.OpEqualUint64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint64x4.Equal", opLen2(ssa.OpEqualUint64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x8.Equal", opLen2(ssa.OpEqualUint64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float32x4.Equal", opLen2(ssa.OpEqualFloat32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float32x8.Equal", opLen2(ssa.OpEqualFloat32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float32x16.Equal", opLen2(ssa.OpEqualFloat32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float64x2.Equal", opLen2(ssa.OpEqualFloat64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float64x4.Equal", opLen2(ssa.OpEqualFloat64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float64x8.Equal", opLen2(ssa.OpEqualFloat64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int8x64.Equal", opLen2(ssa.OpEqualInt8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int16x32.Equal", opLen2(ssa.OpEqualInt16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int32x16.Equal", opLen2(ssa.OpEqualInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int64x8.Equal", opLen2(ssa.OpEqualInt64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint8x64.Equal", opLen2(ssa.OpEqualUint8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint16x32.Equal", opLen2(ssa.OpEqualUint16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint32x16.Equal", opLen2(ssa.OpEqualUint32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint64x8.Equal", opLen2(ssa.OpEqualUint64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float32x4.EqualMasked", opLen3(ssa.OpEqualMaskedFloat32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float32x8.EqualMasked", opLen3(ssa.OpEqualMaskedFloat32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float32x16.EqualMasked", opLen3(ssa.OpEqualMaskedFloat32x16, types.TypeVec512), sys.AMD64)
@@ -458,22 +458,22 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Uint64x2.GetElem", opLen1Imm8(ssa.OpGetElemUint64x2, types.Types[types.TUINT64], 0), sys.AMD64)
addF(simdPackage, "Int8x16.Greater", opLen2(ssa.OpGreaterInt8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int8x32.Greater", opLen2(ssa.OpGreaterInt8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x64.Greater", opLen2(ssa.OpGreaterInt8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int16x8.Greater", opLen2(ssa.OpGreaterInt16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int16x16.Greater", opLen2(ssa.OpGreaterInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x32.Greater", opLen2(ssa.OpGreaterInt16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int32x4.Greater", opLen2(ssa.OpGreaterInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int32x8.Greater", opLen2(ssa.OpGreaterInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int32x16.Greater", opLen2(ssa.OpGreaterInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int64x2.Greater", opLen2(ssa.OpGreaterInt64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int64x4.Greater", opLen2(ssa.OpGreaterInt64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int64x8.Greater", opLen2(ssa.OpGreaterInt64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float32x4.Greater", opLen2(ssa.OpGreaterFloat32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float32x8.Greater", opLen2(ssa.OpGreaterFloat32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float32x16.Greater", opLen2(ssa.OpGreaterFloat32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float64x2.Greater", opLen2(ssa.OpGreaterFloat64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float64x4.Greater", opLen2(ssa.OpGreaterFloat64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float64x8.Greater", opLen2(ssa.OpGreaterFloat64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int8x64.Greater", opLen2(ssa.OpGreaterInt8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int16x32.Greater", opLen2(ssa.OpGreaterInt16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int32x16.Greater", opLen2(ssa.OpGreaterInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int64x8.Greater", opLen2(ssa.OpGreaterInt64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint8x16.Greater", opLen2(ssa.OpGreaterUint8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint8x32.Greater", opLen2(ssa.OpGreaterUint8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint8x64.Greater", opLen2(ssa.OpGreaterUint8x64, types.TypeVec512), sys.AMD64)
@ -2137,59 +2137,71 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Mask8x16.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Mask8x16.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "LoadMask8x16FromBits", simdLoadMask(8, 16), sys.AMD64)
addF(simdPackage, "Mask8x16.StoreToBits", simdStoreMask(8, 16), sys.AMD64)
addF(simdPackage, "Mask8x32.AsInt8x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int8x32.AsMask8x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask8x32.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Mask8x32.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "LoadMask8x32FromBits", simdLoadMask(8, 32), sys.AMD64)
addF(simdPackage, "Mask8x32.StoreToBits", simdStoreMask(8, 32), sys.AMD64)
addF(simdPackage, "Mask8x64.AsInt8x64", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int8x64.AsMask8x64", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask8x64.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Mask8x64.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "LoadMask8x64FromBits", simdLoadMask(8, 64), sys.AMD64)
addF(simdPackage, "Mask8x64.StoreToBits", simdStoreMask(8, 64), sys.AMD64)
addF(simdPackage, "Mask16x8.AsInt16x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int16x8.AsMask16x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask16x8.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Mask16x8.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "LoadMask16x8FromBits", simdLoadMask(16, 8), sys.AMD64)
addF(simdPackage, "Mask16x8.StoreToBits", simdStoreMask(16, 8), sys.AMD64)
addF(simdPackage, "Mask16x16.AsInt16x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int16x16.AsMask16x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask16x16.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Mask16x16.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "LoadMask16x16FromBits", simdLoadMask(16, 16), sys.AMD64)
addF(simdPackage, "Mask16x16.StoreToBits", simdStoreMask(16, 16), sys.AMD64)
addF(simdPackage, "Mask16x32.AsInt16x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int16x32.AsMask16x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask16x32.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Mask16x32.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "LoadMask16x32FromBits", simdLoadMask(16, 32), sys.AMD64)
addF(simdPackage, "Mask16x32.StoreToBits", simdStoreMask(16, 32), sys.AMD64)
addF(simdPackage, "Mask32x4.AsInt32x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int32x4.AsMask32x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask32x4.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Mask32x4.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "LoadMask32x4FromBits", simdLoadMask(32, 4), sys.AMD64)
addF(simdPackage, "Mask32x4.StoreToBits", simdStoreMask(32, 4), sys.AMD64)
addF(simdPackage, "Mask32x8.AsInt32x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int32x8.AsMask32x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask32x8.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Mask32x8.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "LoadMask32x8FromBits", simdLoadMask(32, 8), sys.AMD64)
addF(simdPackage, "Mask32x8.StoreToBits", simdStoreMask(32, 8), sys.AMD64)
addF(simdPackage, "Mask32x16.AsInt32x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int32x16.AsMask32x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask32x16.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Mask32x16.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "LoadMask32x16FromBits", simdLoadMask(32, 16), sys.AMD64)
addF(simdPackage, "Mask32x16.StoreToBits", simdStoreMask(32, 16), sys.AMD64)
addF(simdPackage, "Mask64x2.AsInt64x2", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int64x2.AsMask64x2", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask64x2.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Mask64x2.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "LoadMask64x2FromBits", simdLoadMask(64, 2), sys.AMD64)
addF(simdPackage, "Mask64x2.StoreToBits", simdStoreMask(64, 2), sys.AMD64)
addF(simdPackage, "Mask64x4.AsInt64x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int64x4.AsMask64x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask64x4.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Mask64x4.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "LoadMask64x4FromBits", simdLoadMask(64, 4), sys.AMD64)
addF(simdPackage, "Mask64x4.StoreToBits", simdStoreMask(64, 4), sys.AMD64)
addF(simdPackage, "Mask64x8.AsInt64x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int64x8.AsMask64x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask64x8.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Mask64x8.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "LoadMask64x8FromBits", simdLoadMask(64, 8), sys.AMD64)
addF(simdPackage, "Mask64x8.StoreToBits", simdStoreMask(64, 8), sys.AMD64)
}
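Taken together, the registrations above give each mask type a reinterpreting conversion to and from its equally shaped integer vector, bitwise And/Or (lowered to the integer vector ops named in the table), and the bitmap load/store intrinsics. A minimal sketch of the user-facing surface this enables, assuming the simd package and the LoadInt32x4Slice helper exercised by the tests later in this CL:

	package main

	import "simd"

	func main() {
		a := simd.LoadInt32x4Slice([]int32{1, 2, 3, 4})
		b := simd.LoadInt32x4Slice([]int32{4, 3, 2, 1})
		gt := a.Greater(b) // Mask32x4: lanes where a > b
		lt := b.Greater(a) // Mask32x4: lanes where a < b
		ne := gt.Or(lt)    // Mask32x4.Or lowers to a 32x4 integer OR per the table above
		_ = ne.AsInt32x4() // masks reinterpret freely as equally shaped int vectors
	}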

View file

@ -918,12 +918,12 @@ func (x Uint64x8) AndNotMasked(y Uint64x8, mask Mask64x8) Uint64x8
// ApproximateReciprocal computes an approximate reciprocal of each element.
//
// Asm: VRCP14PS, CPU Feature: AVX512F
// Asm: VRCPPS, CPU Feature: AVX
func (x Float32x4) ApproximateReciprocal() Float32x4
// ApproximateReciprocal computes an approximate reciprocal of each element.
//
// Asm: VRCP14PS, CPU Feature: AVX512F
// Asm: VRCPPS, CPU Feature: AVX
func (x Float32x8) ApproximateReciprocal() Float32x8
// ApproximateReciprocal computes an approximate reciprocal of each element.
@ -1951,6 +1951,11 @@ func (x Int8x16) Equal(y Int8x16) Mask8x16
// Asm: VPCMPEQB, CPU Feature: AVX2
func (x Int8x32) Equal(y Int8x32) Mask8x32
// Equal compares for equality.
//
// Asm: VPCMPEQB, CPU Feature: AVX512BW
func (x Int8x64) Equal(y Int8x64) Mask8x64
// Equal compares for equality.
//
// Asm: VPCMPEQW, CPU Feature: AVX
@ -1961,6 +1966,11 @@ func (x Int16x8) Equal(y Int16x8) Mask16x8
// Asm: VPCMPEQW, CPU Feature: AVX2
func (x Int16x16) Equal(y Int16x16) Mask16x16
// Equal compares for equality.
//
// Asm: VPCMPEQW, CPU Feature: AVX512BW
func (x Int16x32) Equal(y Int16x32) Mask16x32
// Equal compares for equality.
//
// Asm: VPCMPEQD, CPU Feature: AVX
@ -1971,6 +1981,11 @@ func (x Int32x4) Equal(y Int32x4) Mask32x4
// Asm: VPCMPEQD, CPU Feature: AVX2
func (x Int32x8) Equal(y Int32x8) Mask32x8
// Equal compares for equality.
//
// Asm: VPCMPEQD, CPU Feature: AVX512F
func (x Int32x16) Equal(y Int32x16) Mask32x16
// Equal compares for equality.
//
// Asm: VPCMPEQQ, CPU Feature: AVX
@ -1981,6 +1996,11 @@ func (x Int64x2) Equal(y Int64x2) Mask64x2
// Asm: VPCMPEQQ, CPU Feature: AVX2
func (x Int64x4) Equal(y Int64x4) Mask64x4
// Equal compares for equality.
//
// Asm: VPCMPEQQ, CPU Feature: AVX512F
func (x Int64x8) Equal(y Int64x8) Mask64x8
// Equal compares for equality.
//
// Asm: VPCMPEQB, CPU Feature: AVX
@ -1991,6 +2011,11 @@ func (x Uint8x16) Equal(y Uint8x16) Mask8x16
// Asm: VPCMPEQB, CPU Feature: AVX2
func (x Uint8x32) Equal(y Uint8x32) Mask8x32
// Equal compares for equality.
//
// Asm: VPCMPEQB, CPU Feature: AVX512BW
func (x Uint8x64) Equal(y Uint8x64) Mask8x64
// Equal compares for equality.
//
// Asm: VPCMPEQW, CPU Feature: AVX
@ -2001,6 +2026,11 @@ func (x Uint16x8) Equal(y Uint16x8) Mask16x8
// Asm: VPCMPEQW, CPU Feature: AVX2
func (x Uint16x16) Equal(y Uint16x16) Mask16x16
// Equal compares for equality.
//
// Asm: VPCMPEQW, CPU Feature: AVX512BW
func (x Uint16x32) Equal(y Uint16x32) Mask16x32
// Equal compares for equality.
//
// Asm: VPCMPEQD, CPU Feature: AVX
@ -2011,6 +2041,11 @@ func (x Uint32x4) Equal(y Uint32x4) Mask32x4
// Asm: VPCMPEQD, CPU Feature: AVX2
func (x Uint32x8) Equal(y Uint32x8) Mask32x8
// Equal compares for equality.
//
// Asm: VPCMPEQD, CPU Feature: AVX512F
func (x Uint32x16) Equal(y Uint32x16) Mask32x16
// Equal compares for equality.
//
// Asm: VPCMPEQQ, CPU Feature: AVX
@ -2021,6 +2056,11 @@ func (x Uint64x2) Equal(y Uint64x2) Mask64x2
// Asm: VPCMPEQQ, CPU Feature: AVX2
func (x Uint64x4) Equal(y Uint64x4) Mask64x4
// Equal compares for equality.
//
// Asm: VPCMPEQQ, CPU Feature: AVX512F
func (x Uint64x8) Equal(y Uint64x8) Mask64x8
// Equal compares for equality.
//
// Asm: VCMPPS, CPU Feature: AVX
@ -2051,46 +2091,6 @@ func (x Float64x4) Equal(y Float64x4) Mask64x4
// Asm: VCMPPD, CPU Feature: AVX512F
func (x Float64x8) Equal(y Float64x8) Mask64x8
// Equal compares for equality.
//
// Asm: VPCMPB, CPU Feature: AVX512BW
func (x Int8x64) Equal(y Int8x64) Mask8x64
// Equal compares for equality.
//
// Asm: VPCMPW, CPU Feature: AVX512BW
func (x Int16x32) Equal(y Int16x32) Mask16x32
// Equal compares for equality.
//
// Asm: VPCMPD, CPU Feature: AVX512F
func (x Int32x16) Equal(y Int32x16) Mask32x16
// Equal compares for equality.
//
// Asm: VPCMPQ, CPU Feature: AVX512F
func (x Int64x8) Equal(y Int64x8) Mask64x8
// Equal compares for equality.
//
// Asm: VPCMPUB, CPU Feature: AVX512BW
func (x Uint8x64) Equal(y Uint8x64) Mask8x64
// Equal compares for equality.
//
// Asm: VPCMPUW, CPU Feature: AVX512BW
func (x Uint16x32) Equal(y Uint16x32) Mask16x32
// Equal compares for equality.
//
// Asm: VPCMPUD, CPU Feature: AVX512F
func (x Uint32x16) Equal(y Uint32x16) Mask32x16
// Equal compares for equality.
//
// Asm: VPCMPUQ, CPU Feature: AVX512F
func (x Uint64x8) Equal(y Uint64x8) Mask64x8
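The 512-bit Equal methods above now advertise the VPCMPEQ forms rather than the immediate-operand VPCMP encodings they replaced. A hedged sketch of the new 512-bit path from user code, assuming a LoadInt8x64Slice helper by analogy with the Load*Slice functions used in the tests:

	za := simd.LoadInt8x64Slice(make([]int8, 64))
	zb := simd.LoadInt8x64Slice(make([]int8, 64))
	m := za.Equal(zb)  // Mask8x64; all 64 lanes true for equal inputs
	var bm uint64
	m.StoreToBits(&bm) // one bit per lane, so bm == ^uint64(0) here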
/* EqualMasked */
// EqualMasked compares for equality.
@ -2733,7 +2733,7 @@ func (x Uint8x64) GaloisFieldAffineTransformInverse(y Uint64x8, b uint8) Uint8x6
// b is expected to be a constant, non-constant value will trigger a runtime panic.
//
// Asm: VGF2P8AFFINEINVQB, CPU Feature: AVX512GFNI
func (x Uint8x16) GaloisFieldAffineTransformInverseMasked(y Uint64x2, b uint8, m Mask8x16) Uint8x16
func (x Uint8x16) GaloisFieldAffineTransformInverseMasked(y Uint64x2, b uint8, mask Mask8x16) Uint8x16
// GaloisFieldAffineTransformInverseMasked computes an affine transformation in GF(2^8),
// with x inverted with respect to reduction polynomial x^8 + x^4 + x^3 + x + 1:
@ -2746,7 +2746,7 @@ func (x Uint8x16) GaloisFieldAffineTransformInverseMasked(y Uint64x2, b uint8, m
// b is expected to be a constant, non-constant value will trigger a runtime panic.
//
// Asm: VGF2P8AFFINEINVQB, CPU Feature: AVX512GFNI
func (x Uint8x32) GaloisFieldAffineTransformInverseMasked(y Uint64x4, b uint8, m Mask8x32) Uint8x32
func (x Uint8x32) GaloisFieldAffineTransformInverseMasked(y Uint64x4, b uint8, mask Mask8x32) Uint8x32
// GaloisFieldAffineTransformInverseMasked computes an affine transformation in GF(2^8),
// with x inverted with respect to reduction polynomial x^8 + x^4 + x^3 + x + 1:
@ -2759,7 +2759,7 @@ func (x Uint8x32) GaloisFieldAffineTransformInverseMasked(y Uint64x4, b uint8, m
// b is expected to be a constant, non-constant value will trigger a runtime panic.
//
// Asm: VGF2P8AFFINEINVQB, CPU Feature: AVX512GFNI
func (x Uint8x64) GaloisFieldAffineTransformInverseMasked(y Uint64x8, b uint8, m Mask8x64) Uint8x64
func (x Uint8x64) GaloisFieldAffineTransformInverseMasked(y Uint64x8, b uint8, mask Mask8x64) Uint8x64
/* GaloisFieldAffineTransformMasked */
@ -2773,7 +2773,7 @@ func (x Uint8x64) GaloisFieldAffineTransformInverseMasked(y Uint64x8, b uint8, m
// b is expected to be a constant, non-constant value will trigger a runtime panic.
//
// Asm: VGF2P8AFFINEQB, CPU Feature: AVX512GFNI
func (x Uint8x16) GaloisFieldAffineTransformMasked(y Uint64x2, b uint8, m Mask8x16) Uint8x16
func (x Uint8x16) GaloisFieldAffineTransformMasked(y Uint64x2, b uint8, mask Mask8x16) Uint8x16
// GaloisFieldAffineTransformMasked computes an affine transformation in GF(2^8):
// x is a vector of 8-bit vectors, with each adjacent 8 as a group; y is a vector of 8x8 1-bit matrixes;
@ -2785,7 +2785,7 @@ func (x Uint8x16) GaloisFieldAffineTransformMasked(y Uint64x2, b uint8, m Mask8x
// b is expected to be a constant, non-constant value will trigger a runtime panic.
//
// Asm: VGF2P8AFFINEQB, CPU Feature: AVX512GFNI
func (x Uint8x32) GaloisFieldAffineTransformMasked(y Uint64x4, b uint8, m Mask8x32) Uint8x32
func (x Uint8x32) GaloisFieldAffineTransformMasked(y Uint64x4, b uint8, mask Mask8x32) Uint8x32
// GaloisFieldAffineTransformMasked computes an affine transformation in GF(2^8):
// x is a vector of 8-bit vectors, with each adjacent 8 as a group; y is a vector of 8x8 1-bit matrixes;
@ -2797,7 +2797,7 @@ func (x Uint8x32) GaloisFieldAffineTransformMasked(y Uint64x4, b uint8, m Mask8x
// b is expected to be a constant, non-constant value will trigger a runtime panic.
//
// Asm: VGF2P8AFFINEQB, CPU Feature: AVX512GFNI
func (x Uint8x64) GaloisFieldAffineTransformMasked(y Uint64x8, b uint8, m Mask8x64) Uint8x64
func (x Uint8x64) GaloisFieldAffineTransformMasked(y Uint64x8, b uint8, mask Mask8x64) Uint8x64
/* GaloisFieldMul */
@ -2987,6 +2987,11 @@ func (x Int8x16) Greater(y Int8x16) Mask8x16
// Asm: VPCMPGTB, CPU Feature: AVX2
func (x Int8x32) Greater(y Int8x32) Mask8x32
// Greater compares for greater than.
//
// Asm: VPCMPGTB, CPU Feature: AVX512BW
func (x Int8x64) Greater(y Int8x64) Mask8x64
// Greater compares for greater than.
//
// Asm: VPCMPGTW, CPU Feature: AVX
@ -2997,6 +3002,11 @@ func (x Int16x8) Greater(y Int16x8) Mask16x8
// Asm: VPCMPGTW, CPU Feature: AVX2
func (x Int16x16) Greater(y Int16x16) Mask16x16
// Greater compares for greater than.
//
// Asm: VPCMPGTW, CPU Feature: AVX512BW
func (x Int16x32) Greater(y Int16x32) Mask16x32
// Greater compares for greater than.
//
// Asm: VPCMPGTD, CPU Feature: AVX
@ -3007,6 +3017,11 @@ func (x Int32x4) Greater(y Int32x4) Mask32x4
// Asm: VPCMPGTD, CPU Feature: AVX2
func (x Int32x8) Greater(y Int32x8) Mask32x8
// Greater compares for greater than.
//
// Asm: VPCMPGTD, CPU Feature: AVX512F
func (x Int32x16) Greater(y Int32x16) Mask32x16
// Greater compares for greater than.
//
// Asm: VPCMPGTQ, CPU Feature: AVX
@ -3017,6 +3032,11 @@ func (x Int64x2) Greater(y Int64x2) Mask64x2
// Asm: VPCMPGTQ, CPU Feature: AVX2
func (x Int64x4) Greater(y Int64x4) Mask64x4
// Greater compares for greater than.
//
// Asm: VPCMPGTQ, CPU Feature: AVX512F
func (x Int64x8) Greater(y Int64x8) Mask64x8
// Greater compares for greater than.
//
// Asm: VCMPPS, CPU Feature: AVX
@ -3047,26 +3067,6 @@ func (x Float64x4) Greater(y Float64x4) Mask64x4
// Asm: VCMPPD, CPU Feature: AVX512F
func (x Float64x8) Greater(y Float64x8) Mask64x8
// Greater compares for greater than.
//
// Asm: VPCMPB, CPU Feature: AVX512BW
func (x Int8x64) Greater(y Int8x64) Mask8x64
// Greater compares for greater than.
//
// Asm: VPCMPW, CPU Feature: AVX512BW
func (x Int16x32) Greater(y Int16x32) Mask16x32
// Greater compares for greater than.
//
// Asm: VPCMPD, CPU Feature: AVX512F
func (x Int32x16) Greater(y Int32x16) Mask32x16
// Greater compares for greater than.
//
// Asm: VPCMPQ, CPU Feature: AVX512F
func (x Int64x8) Greater(y Int64x8) Mask64x8
// Greater compares for greater than.
//
// Asm: VPCMPUB, CPU Feature: AVX512BW
@ -6475,84 +6475,84 @@ func (x Uint32x8) PairwiseSub(y Uint32x8) Uint32x8
/* Permute */
// Permute performs a full permutation of vector y using indices:
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
// Asm: VPERMB, CPU Feature: AVX512VBMI
func (x Int8x16) Permute(indices Uint8x16) Int8x16
// Permute performs a full permutation of vector y using indices:
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
// Asm: VPERMB, CPU Feature: AVX512VBMI
func (x Uint8x16) Permute(indices Uint8x16) Uint8x16
// Permute performs a full permutation of vector y using indices:
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
// Asm: VPERMB, CPU Feature: AVX512VBMI
func (x Int8x32) Permute(indices Uint8x32) Int8x32
// Permute performs a full permutation of vector y using indices:
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
// Asm: VPERMB, CPU Feature: AVX512VBMI
func (x Uint8x32) Permute(indices Uint8x32) Uint8x32
// Permute performs a full permutation of vector y using indices:
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
// Asm: VPERMB, CPU Feature: AVX512VBMI
func (x Int8x64) Permute(indices Uint8x64) Int8x64
// Permute performs a full permutation of vector y using indices:
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
// Asm: VPERMB, CPU Feature: AVX512VBMI
func (x Uint8x64) Permute(indices Uint8x64) Uint8x64
// Permute performs a full permutation of vector y using indices:
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
// Asm: VPERMW, CPU Feature: AVX512BW
func (x Int16x8) Permute(indices Uint16x8) Int16x8
// Permute performs a full permutation of vector y using indices:
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
// Asm: VPERMW, CPU Feature: AVX512BW
func (x Uint16x8) Permute(indices Uint16x8) Uint16x8
// Permute performs a full permutation of vector y using indices:
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
// Asm: VPERMW, CPU Feature: AVX512BW
func (x Int16x16) Permute(indices Uint16x16) Int16x16
// Permute performs a full permutation of vector y using indices:
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
// Asm: VPERMW, CPU Feature: AVX512BW
func (x Uint16x16) Permute(indices Uint16x16) Uint16x16
// Permute performs a full permutation of vector y using indices:
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
// Asm: VPERMW, CPU Feature: AVX512BW
func (x Int16x32) Permute(indices Uint16x32) Int16x32
// Permute performs a full permutation of vector y using indices:
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
@ -6580,63 +6580,63 @@ func (x Int32x8) Permute(indices Uint32x8) Int32x8
// Asm: VPERMD, CPU Feature: AVX2
func (x Uint32x8) Permute(indices Uint32x8) Uint32x8
// Permute performs a full permutation of vector y using indices:
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
// Asm: VPERMPS, CPU Feature: AVX512F
func (x Float32x16) Permute(indices Uint32x16) Float32x16
// Permute performs a full permutation of vector y using indices:
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
// Asm: VPERMD, CPU Feature: AVX512F
func (x Int32x16) Permute(indices Uint32x16) Int32x16
// Permute performs a full permutation of vector y using indices:
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
// Asm: VPERMD, CPU Feature: AVX512F
func (x Uint32x16) Permute(indices Uint32x16) Uint32x16
// Permute performs a full permutation of vector y using indices:
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
// Asm: VPERMPD, CPU Feature: AVX512F
func (x Float64x4) Permute(indices Uint64x4) Float64x4
// Permute performs a full permutation of vector y using indices:
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
// Asm: VPERMQ, CPU Feature: AVX512F
func (x Int64x4) Permute(indices Uint64x4) Int64x4
// Permute performs a full permutation of vector y using indices:
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
// Asm: VPERMQ, CPU Feature: AVX512F
func (x Uint64x4) Permute(indices Uint64x4) Uint64x4
// Permute performs a full permutation of vector y using indices:
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
// Asm: VPERMPD, CPU Feature: AVX512F
func (x Float64x8) Permute(indices Uint64x8) Float64x8
// Permute performs a full permutation of vector y using indices:
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
// Asm: VPERMQ, CPU Feature: AVX512F
func (x Int64x8) Permute(indices Uint64x8) Int64x8
// Permute performs a full permutation of vector y using indices:
// Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
@ -7189,7 +7189,7 @@ func (x Uint64x8) Permute2Masked(y Uint64x8, indices Uint64x8, mask Mask64x8) Ui
/* PermuteMasked */
// PermuteMasked performs a full permutation of vector y using indices:
// PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
@ -7198,7 +7198,7 @@ func (x Uint64x8) Permute2Masked(y Uint64x8, indices Uint64x8, mask Mask64x8) Ui
// Asm: VPERMB, CPU Feature: AVX512VBMI
func (x Int8x16) PermuteMasked(indices Uint8x16, mask Mask8x16) Int8x16
// PermuteMasked performs a full permutation of vector y using indices:
// PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
@ -7207,7 +7207,7 @@ func (x Int8x16) PermuteMasked(indices Uint8x16, mask Mask8x16) Int8x16
// Asm: VPERMB, CPU Feature: AVX512VBMI
func (x Uint8x16) PermuteMasked(indices Uint8x16, mask Mask8x16) Uint8x16
// PermuteMasked performs a full permutation of vector y using indices:
// PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
@ -7216,7 +7216,7 @@ func (x Uint8x16) PermuteMasked(indices Uint8x16, mask Mask8x16) Uint8x16
// Asm: VPERMB, CPU Feature: AVX512VBMI
func (x Int8x32) PermuteMasked(indices Uint8x32, mask Mask8x32) Int8x32
// PermuteMasked performs a full permutation of vector y using indices:
// PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
@ -7225,7 +7225,7 @@ func (x Int8x32) PermuteMasked(indices Uint8x32, mask Mask8x32) Int8x32
// Asm: VPERMB, CPU Feature: AVX512VBMI
func (x Uint8x32) PermuteMasked(indices Uint8x32, mask Mask8x32) Uint8x32
// PermuteMasked performs a full permutation of vector y using indices:
// PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
@ -7234,7 +7234,7 @@ func (x Uint8x32) PermuteMasked(indices Uint8x32, mask Mask8x32) Uint8x32
// Asm: VPERMB, CPU Feature: AVX512VBMI
func (x Int8x64) PermuteMasked(indices Uint8x64, mask Mask8x64) Int8x64
// PermuteMasked performs a full permutation of vector y using indices:
// PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
@ -7243,7 +7243,7 @@ func (x Int8x64) PermuteMasked(indices Uint8x64, mask Mask8x64) Int8x64
// Asm: VPERMB, CPU Feature: AVX512VBMI
func (x Uint8x64) PermuteMasked(indices Uint8x64, mask Mask8x64) Uint8x64
// PermuteMasked performs a full permutation of vector y using indices:
// PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
@ -7252,7 +7252,7 @@ func (x Uint8x64) PermuteMasked(indices Uint8x64, mask Mask8x64) Uint8x64
// Asm: VPERMW, CPU Feature: AVX512BW
func (x Int16x8) PermuteMasked(indices Uint16x8, mask Mask16x8) Int16x8
// PermuteMasked performs a full permutation of vector y using indices:
// PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
@ -7261,7 +7261,7 @@ func (x Int16x8) PermuteMasked(indices Uint16x8, mask Mask16x8) Int16x8
// Asm: VPERMW, CPU Feature: AVX512BW
func (x Uint16x8) PermuteMasked(indices Uint16x8, mask Mask16x8) Uint16x8
// PermuteMasked performs a full permutation of vector y using indices:
// PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
@ -7270,7 +7270,7 @@ func (x Uint16x8) PermuteMasked(indices Uint16x8, mask Mask16x8) Uint16x8
// Asm: VPERMW, CPU Feature: AVX512BW
func (x Int16x16) PermuteMasked(indices Uint16x16, mask Mask16x16) Int16x16
// PermuteMasked performs a full permutation of vector y using indices:
// PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
@ -7279,7 +7279,7 @@ func (x Int16x16) PermuteMasked(indices Uint16x16, mask Mask16x16) Int16x16
// Asm: VPERMW, CPU Feature: AVX512BW
func (x Uint16x16) PermuteMasked(indices Uint16x16, mask Mask16x16) Uint16x16
// PermuteMasked performs a full permutation of vector y using indices:
// PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
@ -7288,7 +7288,7 @@ func (x Uint16x16) PermuteMasked(indices Uint16x16, mask Mask16x16) Uint16x16
// Asm: VPERMW, CPU Feature: AVX512BW
func (x Int16x32) PermuteMasked(indices Uint16x32, mask Mask16x32) Int16x32
// PermuteMasked performs a full permutation of vector y using indices:
// PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
@ -7297,7 +7297,7 @@ func (x Int16x32) PermuteMasked(indices Uint16x32, mask Mask16x32) Int16x32
// Asm: VPERMW, CPU Feature: AVX512BW
func (x Uint16x32) PermuteMasked(indices Uint16x32, mask Mask16x32) Uint16x32
// PermuteMasked performs a full permutation of vector y using indices:
// PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
@ -7306,7 +7306,7 @@ func (x Uint16x32) PermuteMasked(indices Uint16x32, mask Mask16x32) Uint16x32
// Asm: VPERMPS, CPU Feature: AVX512F
func (x Float32x8) PermuteMasked(indices Uint32x8, mask Mask32x8) Float32x8
// PermuteMasked performs a full permutation of vector y using indices:
// PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
@ -7315,7 +7315,7 @@ func (x Float32x8) PermuteMasked(indices Uint32x8, mask Mask32x8) Float32x8
// Asm: VPERMD, CPU Feature: AVX512F
func (x Int32x8) PermuteMasked(indices Uint32x8, mask Mask32x8) Int32x8
// PermuteMasked performs a full permutation of vector y using indices:
// PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
@ -7324,7 +7324,7 @@ func (x Int32x8) PermuteMasked(indices Uint32x8, mask Mask32x8) Int32x8
// Asm: VPERMD, CPU Feature: AVX512F
func (x Uint32x8) PermuteMasked(indices Uint32x8, mask Mask32x8) Uint32x8
// PermuteMasked performs a full permutation of vector y using indices:
// PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
@ -7333,7 +7333,7 @@ func (x Uint32x8) PermuteMasked(indices Uint32x8, mask Mask32x8) Uint32x8
// Asm: VPERMPS, CPU Feature: AVX512F
func (x Float32x16) PermuteMasked(indices Uint32x16, mask Mask32x16) Float32x16
// PermuteMasked performs a full permutation of vector y using indices:
// PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
@ -7342,7 +7342,7 @@ func (x Float32x16) PermuteMasked(indices Uint32x16, mask Mask32x16) Float32x16
// Asm: VPERMD, CPU Feature: AVX512F
func (x Int32x16) PermuteMasked(indices Uint32x16, mask Mask32x16) Int32x16
// PermuteMasked performs a full permutation of vector y using indices:
// PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
@ -7351,7 +7351,7 @@ func (x Int32x16) PermuteMasked(indices Uint32x16, mask Mask32x16) Int32x16
// Asm: VPERMD, CPU Feature: AVX512F
func (x Uint32x16) PermuteMasked(indices Uint32x16, mask Mask32x16) Uint32x16
// PermuteMasked performs a full permutation of vector y using indices:
// PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
@ -7360,7 +7360,7 @@ func (x Uint32x16) PermuteMasked(indices Uint32x16, mask Mask32x16) Uint32x16
// Asm: VPERMPD, CPU Feature: AVX512F
func (x Float64x4) PermuteMasked(indices Uint64x4, mask Mask64x4) Float64x4
// PermuteMasked performs a full permutation of vector y using indices:
// PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
@ -7369,7 +7369,7 @@ func (x Float64x4) PermuteMasked(indices Uint64x4, mask Mask64x4) Float64x4
// Asm: VPERMQ, CPU Feature: AVX512F
func (x Int64x4) PermuteMasked(indices Uint64x4, mask Mask64x4) Int64x4
// PermuteMasked performs a full permutation of vector y using indices:
// PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
@ -7378,7 +7378,7 @@ func (x Int64x4) PermuteMasked(indices Uint64x4, mask Mask64x4) Int64x4
// Asm: VPERMQ, CPU Feature: AVX512F
func (x Uint64x4) PermuteMasked(indices Uint64x4, mask Mask64x4) Uint64x4
// PermuteMasked performs a full permutation of vector y using indices:
// PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
@ -7387,7 +7387,7 @@ func (x Uint64x4) PermuteMasked(indices Uint64x4, mask Mask64x4) Uint64x4
// Asm: VPERMPD, CPU Feature: AVX512F
func (x Float64x8) PermuteMasked(indices Uint64x8, mask Mask64x8) Float64x8
// PermuteMasked performs a full permutation of vector y using indices:
// PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
@ -7396,7 +7396,7 @@ func (x Float64x8) PermuteMasked(indices Uint64x8, mask Mask64x8) Float64x8
// Asm: VPERMQ, CPU Feature: AVX512F
func (x Int64x8) PermuteMasked(indices Uint64x8, mask Mask64x8) Int64x8
// PermuteMasked performs a full permutation of vector y using indices:
// PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements.
//
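Per the corrected Permute/PermuteMasked docs, the indices select elements of the receiver x, not of y. A small sketch, assuming a LoadUint32x8Slice helper by analogy with the other Load*Slice functions:

	x := simd.LoadInt32x8Slice([]int32{0, 1, 2, 3, 4, 5, 6, 7})
	idx := simd.LoadUint32x8Slice([]uint32{7, 6, 5, 4, 3, 2, 1, 0})
	rev := x.Permute(idx) // result[i] = x[idx[i]], so rev is {7, 6, 5, 4, 3, 2, 1, 0}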

View file

@ -461,7 +461,7 @@ func testMergeLocalswrapper(t *testing.T, op func(simd.Int64x4, simd.Int64x4) si
}
}
func TestBitMask(t *testing.T) {
func TestBitMaskLoad(t *testing.T) {
if !simd.HasAVX512() {
t.Skip("Test requires HasAVX512, not available on this hardware")
return
@ -477,3 +477,19 @@ func TestBitMask(t *testing.T) {
}
}
}
func TestBitMaskStore(t *testing.T) {
if !simd.HasAVX512() {
t.Skip("Test requires HasAVX512, not available on this hardware")
return
}
var want uint64 = 0b101
var got uint64
x := simd.LoadInt32x4Slice([]int32{1, 2, 3, 4})
y := simd.LoadInt32x4Slice([]int32{5, 0, 5, 0})
m := y.Greater(x)
m.StoreToBits(&got)
if got != want {
t.Errorf("Result incorrect: want %b, got %b", want, got)
}
}
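(Lane i of the mask maps to bit i of the bitmap: only lanes 0 and 2 satisfy 5 > 1 and 5 > 3, hence the expected value 0b101.)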

View file

@ -205,48 +205,88 @@ type Mask8x16 struct {
vals [16]int8
}
// Mask8x16FromBits constructs a Mask8x16 from an a bitmap, where 1 means set for the indexed element, 0 means unset.
// LoadMask8x16FromBits constructs a Mask8x16 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func LoadMask8x16FromBits(y *uint64) Mask8x16
// StoreToBits stores a Mask8x16 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask8x16) StoreToBits(y *uint64)
// Mask16x8 is a 128-bit SIMD vector of 8 int16
type Mask16x8 struct {
int16x8 v128
vals [8]int16
}
// Mask16x8FromBits constructs a Mask16x8 from an a bitmap, where 1 means set for the indexed element, 0 means unset.
// LoadMask16x8FromBits constructs a Mask16x8 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func LoadMask16x8FromBits(y *uint64) Mask16x8
// StoreToBits stores a Mask16x8 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask16x8) StoreToBits(y *uint64)
// Mask32x4 is a 128-bit SIMD vector of 4 int32
type Mask32x4 struct {
int32x4 v128
vals [4]int32
}
// Mask32x4FromBits constructs a Mask32x4 from an a bitmap, where 1 means set for the indexed element, 0 means unset.
// LoadMask32x4FromBits constructs a Mask32x4 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 4 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func LoadMask32x4FromBits(y *uint64) Mask32x4
// StoreToBits stores a Mask32x4 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 4 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask32x4) StoreToBits(y *uint64)
// Mask64x2 is a 128-bit SIMD vector of 2 int64
type Mask64x2 struct {
int64x2 v128
vals [2]int64
}
// Mask64x2FromBits constructs a Mask64x2 from an a bitmap, where 1 means set for the indexed element, 0 means unset.
// LoadMask64x2FromBits constructs a Mask64x2 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 2 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func LoadMask64x2FromBits(y *uint64) Mask64x2
// StoreToBits stores a Mask64x2 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 2 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask64x2) StoreToBits(y *uint64)
// v256 is a tag type that tells the compiler that this is really 256-bit SIMD
type v256 struct {
_256 struct{}
@ -448,48 +488,88 @@ type Mask8x32 struct {
vals [32]int8
}
// Mask8x32FromBits constructs a Mask8x32 from an a bitmap, where 1 means set for the indexed element, 0 means unset.
// LoadMask8x32FromBits constructs a Mask8x32 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 32 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func LoadMask8x32FromBits(y *uint64) Mask8x32
// StoreToBits stores a Mask8x32 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 32 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask8x32) StoreToBits(y *uint64)
// Mask16x16 is a 256-bit SIMD vector of 16 int16
type Mask16x16 struct {
int16x16 v256
vals [16]int16
}
// Mask16x16FromBits constructs a Mask16x16 from an a bitmap, where 1 means set for the indexed element, 0 means unset.
// LoadMask16x16FromBits constructs a Mask16x16 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func LoadMask16x16FromBits(y *uint64) Mask16x16
// StoreToBits stores a Mask16x16 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask16x16) StoreToBits(y *uint64)
// Mask32x8 is a 256-bit SIMD vector of 8 int32
type Mask32x8 struct {
int32x8 v256
vals [8]int32
}
// Mask32x8FromBits constructs a Mask32x8 from an a bitmap, where 1 means set for the indexed element, 0 means unset.
// LoadMask32x8FromBits constructs a Mask32x8 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func LoadMask32x8FromBits(y *uint64) Mask32x8
// StoreToBits stores a Mask32x8 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask32x8) StoreToBits(y *uint64)
// Mask64x4 is a 256-bit SIMD vector of 4 int64
type Mask64x4 struct {
int64x4 v256
vals [4]int64
}
// Mask64x4FromBits constructs a Mask64x4 from an a bitmap, where 1 means set for the indexed element, 0 means unset.
// LoadMask64x4FromBits constructs a Mask64x4 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 4 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func LoadMask64x4FromBits(y *uint64) Mask64x4
// StoreToBits stores a Mask64x4 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 4 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask64x4) StoreToBits(y *uint64)
// v512 is a tag type that tells the compiler that this is really 512-bit SIMD
type v512 struct {
_512 struct{}
@ -691,44 +771,84 @@ type Mask8x64 struct {
vals [64]int8
}
// Mask8x64FromBits constructs a Mask8x64 from an a bitmap, where 1 means set for the indexed element, 0 means unset.
// LoadMask8x64FromBits constructs a Mask8x64 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 64 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func LoadMask8x64FromBits(y *uint64) Mask8x64
// StoreToBits stores a Mask8x64 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 64 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask8x64) StoreToBits(y *uint64)
// Mask16x32 is a 512-bit SIMD vector of 32 int16
type Mask16x32 struct {
int16x32 v512
vals [32]int16
}
// Mask16x32FromBits constructs a Mask16x32 from an a bitmap, where 1 means set for the indexed element, 0 means unset.
// LoadMask16x32FromBits constructs a Mask16x32 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 32 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func LoadMask16x32FromBits(y *uint64) Mask16x32
// StoreToBits stores a Mask16x32 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 32 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask16x32) StoreToBits(y *uint64)
// Mask32x16 is a 512-bit SIMD vector of 16 int32
type Mask32x16 struct {
int32x16 v512
vals [16]int32
}
// Mask32x16FromBits constructs a Mask32x16 from an a bitmap, where 1 means set for the indexed element, 0 means unset.
// LoadMask32x16FromBits constructs a Mask32x16 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func LoadMask32x16FromBits(y *uint64) Mask32x16
// StoreToBits stores a Mask32x16 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask32x16) StoreToBits(y *uint64)
// Mask64x8 is a 512-bit SIMD vector of 8 int64
type Mask64x8 struct {
int64x8 v512
vals [8]int64
}
// Mask64x8FromBits constructs a Mask64x8 from an a bitmap, where 1 means set for the indexed element, 0 means unset.
// LoadMask64x8FromBits constructs a Mask64x8 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func LoadMask64x8FromBits(y *uint64) Mask64x8
// StoreToBits stores a Mask64x8 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask64x8) StoreToBits(y *uint64)
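With both load and store available, a mask can round-trip through a plain uint64, which also makes the standard bit-twiddling helpers usable on lane results. A minimal sketch, assuming the API declared above and the HasAVX512 gate used by the tests:

	package main

	import (
		"fmt"
		"math/bits"
		"simd"
	)

	func main() {
		if !simd.HasAVX512() {
			return // the *FromBits/StoreToBits intrinsics require AVX512 per the comments above
		}
		x := simd.LoadInt32x4Slice([]int32{1, 2, 3, 4})
		y := simd.LoadInt32x4Slice([]int32{0, 5, 0, 5})
		var bm uint64
		x.Greater(y).StoreToBits(&bm)     // lanes 0 and 2 match: bm == 0b0101
		fmt.Println(bits.OnesCount64(bm)) // prints 2
	}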