[dev.simd] cmd/compile, simd: support store to bits for mask

This CL is partially generated by CL 689775.

Change-Id: I0c36fd2a44706c88db1a1d5ea4a6d0b9f891d85f
Reviewed-on: https://go-review.googlesource.com/c/go/+/689795
Reviewed-by: David Chase <drchase@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
This commit is contained in:
Junyang Shao 2025-07-23 07:37:14 +00:00
parent 41054cdb1c
commit 6f7a1164e7
15 changed files with 1192 additions and 523 deletions

View file

@@ -24,8 +24,8 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPABSQ128, ssa.OpAMD64VPABSQ128,
ssa.OpAMD64VPABSQ256, ssa.OpAMD64VPABSQ256,
ssa.OpAMD64VPABSQ512, ssa.OpAMD64VPABSQ512,
ssa.OpAMD64VRCP14PS128, ssa.OpAMD64VRCPPS128,
ssa.OpAMD64VRCP14PS256, ssa.OpAMD64VRCPPS256,
ssa.OpAMD64VRCP14PS512, ssa.OpAMD64VRCP14PS512,
ssa.OpAMD64VRCP14PD128, ssa.OpAMD64VRCP14PD128,
ssa.OpAMD64VRCP14PD256, ssa.OpAMD64VRCP14PD256,
@@ -335,6 +335,16 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPXORQ512: ssa.OpAMD64VPXORQ512:
p = simdV21(s, v) p = simdV21(s, v)
case ssa.OpAMD64VPCMPEQB512,
ssa.OpAMD64VPCMPEQW512,
ssa.OpAMD64VPCMPEQD512,
ssa.OpAMD64VPCMPEQQ512,
ssa.OpAMD64VPCMPGTB512,
ssa.OpAMD64VPCMPGTW512,
ssa.OpAMD64VPCMPGTD512,
ssa.OpAMD64VPCMPGTQ512:
p = simdV2k(s, v)
case ssa.OpAMD64VADDPSMasked128, case ssa.OpAMD64VADDPSMasked128,
ssa.OpAMD64VADDPSMasked256, ssa.OpAMD64VADDPSMasked256,
ssa.OpAMD64VADDPSMasked512, ssa.OpAMD64VADDPSMasked512,
@@ -733,30 +743,30 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
case ssa.OpAMD64VCMPPS512, case ssa.OpAMD64VCMPPS512,
ssa.OpAMD64VCMPPD512, ssa.OpAMD64VCMPPD512,
ssa.OpAMD64VPCMPB512,
ssa.OpAMD64VPCMPW512,
ssa.OpAMD64VPCMPD512,
ssa.OpAMD64VPCMPQ512,
ssa.OpAMD64VPCMPUB512,
ssa.OpAMD64VPCMPUW512,
ssa.OpAMD64VPCMPUD512,
ssa.OpAMD64VPCMPUQ512,
ssa.OpAMD64VPCMPUB128, ssa.OpAMD64VPCMPUB128,
ssa.OpAMD64VPCMPUB256, ssa.OpAMD64VPCMPUB256,
ssa.OpAMD64VPCMPUB512,
ssa.OpAMD64VPCMPUW128, ssa.OpAMD64VPCMPUW128,
ssa.OpAMD64VPCMPUW256, ssa.OpAMD64VPCMPUW256,
ssa.OpAMD64VPCMPUW512,
ssa.OpAMD64VPCMPUD128, ssa.OpAMD64VPCMPUD128,
ssa.OpAMD64VPCMPUD256, ssa.OpAMD64VPCMPUD256,
ssa.OpAMD64VPCMPUD512,
ssa.OpAMD64VPCMPUQ128, ssa.OpAMD64VPCMPUQ128,
ssa.OpAMD64VPCMPUQ256, ssa.OpAMD64VPCMPUQ256,
ssa.OpAMD64VPCMPUQ512,
ssa.OpAMD64VPCMPB128, ssa.OpAMD64VPCMPB128,
ssa.OpAMD64VPCMPB256, ssa.OpAMD64VPCMPB256,
ssa.OpAMD64VPCMPB512,
ssa.OpAMD64VPCMPW128, ssa.OpAMD64VPCMPW128,
ssa.OpAMD64VPCMPW256, ssa.OpAMD64VPCMPW256,
ssa.OpAMD64VPCMPW512,
ssa.OpAMD64VPCMPD128, ssa.OpAMD64VPCMPD128,
ssa.OpAMD64VPCMPD256, ssa.OpAMD64VPCMPD256,
ssa.OpAMD64VPCMPD512,
ssa.OpAMD64VPCMPQ128, ssa.OpAMD64VPCMPQ128,
ssa.OpAMD64VPCMPQ256: ssa.OpAMD64VPCMPQ256,
ssa.OpAMD64VPCMPQ512:
p = simdV2kImm8(s, v) p = simdV2kImm8(s, v)
case ssa.OpAMD64VCMPPSMasked128, case ssa.OpAMD64VCMPPSMasked128,

View file

@@ -1468,10 +1468,10 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
ssagen.AddAux(&p.From, v) ssagen.AddAux(&p.From, v)
p.To.Type = obj.TYPE_REG p.To.Type = obj.TYPE_REG
p.To.Reg = simdOrMaskReg(v) p.To.Reg = simdOrMaskReg(v)
case ssa.OpAMD64VMOVDQUstore128, ssa.OpAMD64VMOVDQUstore256, ssa.OpAMD64VMOVDQUstore512: case ssa.OpAMD64VMOVDQUstore128, ssa.OpAMD64VMOVDQUstore256, ssa.OpAMD64VMOVDQUstore512, ssa.OpAMD64KMOVQstore:
p := s.Prog(v.Op.Asm()) p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG p.From.Type = obj.TYPE_REG
p.From.Reg = simdReg(v.Args[1]) p.From.Reg = simdOrMaskReg(v.Args[1])
p.To.Type = obj.TYPE_MEM p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg() p.To.Reg = v.Args[0].Reg()
ssagen.AddAux(&p.To, v) ssagen.AddAux(&p.To, v)

View file

@@ -1698,6 +1698,22 @@
(LoadMask64x4 <t> ptr mem) => (VPMOVMToVec64x4 <types.TypeVec256> (KMOVQload <t> ptr mem)) (LoadMask64x4 <t> ptr mem) => (VPMOVMToVec64x4 <types.TypeVec256> (KMOVQload <t> ptr mem))
(LoadMask64x8 <t> ptr mem) => (VPMOVMToVec64x8 <types.TypeVec512> (KMOVQload <t> ptr mem)) (LoadMask64x8 <t> ptr mem) => (VPMOVMToVec64x8 <types.TypeVec512> (KMOVQload <t> ptr mem))
(StoreMask8x16 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec8x16ToM <t> val) mem)
(StoreMask8x32 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec8x32ToM <t> val) mem)
(StoreMask8x64 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec8x64ToM <t> val) mem)
(StoreMask16x8 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec16x8ToM <t> val) mem)
(StoreMask16x16 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec16x16ToM <t> val) mem)
(StoreMask16x32 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec16x32ToM <t> val) mem)
(StoreMask32x4 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec32x4ToM <t> val) mem)
(StoreMask32x8 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec32x8ToM <t> val) mem)
(StoreMask32x16 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec32x16ToM <t> val) mem)
(StoreMask64x2 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec64x2ToM <t> val) mem)
(StoreMask64x4 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec64x4ToM <t> val) mem)
(StoreMask64x8 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec64x8ToM <t> val) mem)
(Load <t> ptr mem) && t.Size() == 16 => (VMOVDQUload128 ptr mem) (Load <t> ptr mem) && t.Size() == 16 => (VMOVDQUload128 ptr mem)
(Store {t} ptr val mem) && t.Size() == 16 => (VMOVDQUstore128 ptr val mem) (Store {t} ptr val mem) && t.Size() == 16 => (VMOVDQUstore128 ptr val mem)

View file

@@ -235,6 +235,7 @@ func init() {
wfpkw = regInfo{inputs: []regMask{w, fp, mask}, outputs: wonly} wfpkw = regInfo{inputs: []regMask{w, fp, mask}, outputs: wonly}
kload = regInfo{inputs: []regMask{gpspsb, 0}, outputs: maskonly} kload = regInfo{inputs: []regMask{gpspsb, 0}, outputs: maskonly}
kstore = regInfo{inputs: []regMask{gpspsb, mask, 0}}
prefreg = regInfo{inputs: []regMask{gpspsbg}} prefreg = regInfo{inputs: []regMask{gpspsbg}}
) )
@@ -1318,6 +1319,7 @@ func init() {
{name: "VZEROALL", argLength: 0, asm: "VZEROALL"}, {name: "VZEROALL", argLength: 0, asm: "VZEROALL"},
{name: "KMOVQload", argLength: 2, reg: kload, asm: "KMOVQ", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, {name: "KMOVQload", argLength: 2, reg: kload, asm: "KMOVQ", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"},
{name: "KMOVQstore", argLength: 3, reg: kstore, asm: "KMOVQ", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"},
} }
var AMD64blocks = []blockData{ var AMD64blocks = []blockData{

View file

@@ -678,6 +678,19 @@ var genericOps = []opData{
{name: "LoadMask64x2", argLength: 2}, // arg0 = ptr, arg1 = mem {name: "LoadMask64x2", argLength: 2}, // arg0 = ptr, arg1 = mem
{name: "LoadMask64x4", argLength: 2}, // arg0 = ptr, arg1 = mem {name: "LoadMask64x4", argLength: 2}, // arg0 = ptr, arg1 = mem
{name: "LoadMask64x8", argLength: 2}, // arg0 = ptr, arg1 = mem {name: "LoadMask64x8", argLength: 2}, // arg0 = ptr, arg1 = mem
{name: "StoreMask8x16", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask8x32", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask8x64", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask16x8", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask16x16", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask16x32", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask32x4", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask32x8", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask32x16", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask64x2", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask64x4", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask64x8", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
} }
// kind controls successors implicit exit // kind controls successors implicit exit

View file

@@ -152,8 +152,8 @@
(AndNotMaskedUint64x2 x y mask) => (VPANDNQMasked128 x y (VPMOVVec64x2ToM <types.TypeMask> mask)) (AndNotMaskedUint64x2 x y mask) => (VPANDNQMasked128 x y (VPMOVVec64x2ToM <types.TypeMask> mask))
(AndNotMaskedUint64x4 x y mask) => (VPANDNQMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask)) (AndNotMaskedUint64x4 x y mask) => (VPANDNQMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask))
(AndNotMaskedUint64x8 x y mask) => (VPANDNQMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask)) (AndNotMaskedUint64x8 x y mask) => (VPANDNQMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
(ApproximateReciprocalFloat32x4 ...) => (VRCP14PS128 ...) (ApproximateReciprocalFloat32x4 ...) => (VRCPPS128 ...)
(ApproximateReciprocalFloat32x8 ...) => (VRCP14PS256 ...) (ApproximateReciprocalFloat32x8 ...) => (VRCPPS256 ...)
(ApproximateReciprocalFloat32x16 ...) => (VRCP14PS512 ...) (ApproximateReciprocalFloat32x16 ...) => (VRCP14PS512 ...)
(ApproximateReciprocalFloat64x2 ...) => (VRCP14PD128 ...) (ApproximateReciprocalFloat64x2 ...) => (VRCP14PD128 ...)
(ApproximateReciprocalFloat64x4 ...) => (VRCP14PD256 ...) (ApproximateReciprocalFloat64x4 ...) => (VRCP14PD256 ...)
@@ -305,28 +305,28 @@
(EqualFloat64x8 x y) => (VPMOVMToVec64x8 (VCMPPD512 [0] x y)) (EqualFloat64x8 x y) => (VPMOVMToVec64x8 (VCMPPD512 [0] x y))
(EqualInt8x16 ...) => (VPCMPEQB128 ...) (EqualInt8x16 ...) => (VPCMPEQB128 ...)
(EqualInt8x32 ...) => (VPCMPEQB256 ...) (EqualInt8x32 ...) => (VPCMPEQB256 ...)
(EqualInt8x64 x y) => (VPMOVMToVec8x64 (VPCMPB512 [0] x y)) (EqualInt8x64 x y) => (VPMOVMToVec8x64 (VPCMPEQB512 x y))
(EqualInt16x8 ...) => (VPCMPEQW128 ...) (EqualInt16x8 ...) => (VPCMPEQW128 ...)
(EqualInt16x16 ...) => (VPCMPEQW256 ...) (EqualInt16x16 ...) => (VPCMPEQW256 ...)
(EqualInt16x32 x y) => (VPMOVMToVec16x32 (VPCMPW512 [0] x y)) (EqualInt16x32 x y) => (VPMOVMToVec16x32 (VPCMPEQW512 x y))
(EqualInt32x4 ...) => (VPCMPEQD128 ...) (EqualInt32x4 ...) => (VPCMPEQD128 ...)
(EqualInt32x8 ...) => (VPCMPEQD256 ...) (EqualInt32x8 ...) => (VPCMPEQD256 ...)
(EqualInt32x16 x y) => (VPMOVMToVec32x16 (VPCMPD512 [0] x y)) (EqualInt32x16 x y) => (VPMOVMToVec32x16 (VPCMPEQD512 x y))
(EqualInt64x2 ...) => (VPCMPEQQ128 ...) (EqualInt64x2 ...) => (VPCMPEQQ128 ...)
(EqualInt64x4 ...) => (VPCMPEQQ256 ...) (EqualInt64x4 ...) => (VPCMPEQQ256 ...)
(EqualInt64x8 x y) => (VPMOVMToVec64x8 (VPCMPQ512 [0] x y)) (EqualInt64x8 x y) => (VPMOVMToVec64x8 (VPCMPEQQ512 x y))
(EqualUint8x16 ...) => (VPCMPEQB128 ...) (EqualUint8x16 ...) => (VPCMPEQB128 ...)
(EqualUint8x32 ...) => (VPCMPEQB256 ...) (EqualUint8x32 ...) => (VPCMPEQB256 ...)
(EqualUint8x64 x y) => (VPMOVMToVec8x64 (VPCMPUB512 [0] x y)) (EqualUint8x64 x y) => (VPMOVMToVec8x64 (VPCMPEQB512 x y))
(EqualUint16x8 ...) => (VPCMPEQW128 ...) (EqualUint16x8 ...) => (VPCMPEQW128 ...)
(EqualUint16x16 ...) => (VPCMPEQW256 ...) (EqualUint16x16 ...) => (VPCMPEQW256 ...)
(EqualUint16x32 x y) => (VPMOVMToVec16x32 (VPCMPUW512 [0] x y)) (EqualUint16x32 x y) => (VPMOVMToVec16x32 (VPCMPEQW512 x y))
(EqualUint32x4 ...) => (VPCMPEQD128 ...) (EqualUint32x4 ...) => (VPCMPEQD128 ...)
(EqualUint32x8 ...) => (VPCMPEQD256 ...) (EqualUint32x8 ...) => (VPCMPEQD256 ...)
(EqualUint32x16 x y) => (VPMOVMToVec32x16 (VPCMPUD512 [0] x y)) (EqualUint32x16 x y) => (VPMOVMToVec32x16 (VPCMPEQD512 x y))
(EqualUint64x2 ...) => (VPCMPEQQ128 ...) (EqualUint64x2 ...) => (VPCMPEQQ128 ...)
(EqualUint64x4 ...) => (VPCMPEQQ256 ...) (EqualUint64x4 ...) => (VPCMPEQQ256 ...)
(EqualUint64x8 x y) => (VPMOVMToVec64x8 (VPCMPUQ512 [0] x y)) (EqualUint64x8 x y) => (VPMOVMToVec64x8 (VPCMPEQQ512 x y))
(EqualMaskedFloat32x4 x y mask) => (VPMOVMToVec32x4 (VCMPPSMasked128 [0] x y (VPMOVVec32x4ToM <types.TypeMask> mask))) (EqualMaskedFloat32x4 x y mask) => (VPMOVMToVec32x4 (VCMPPSMasked128 [0] x y (VPMOVVec32x4ToM <types.TypeMask> mask)))
(EqualMaskedFloat32x8 x y mask) => (VPMOVMToVec32x8 (VCMPPSMasked256 [0] x y (VPMOVVec32x8ToM <types.TypeMask> mask))) (EqualMaskedFloat32x8 x y mask) => (VPMOVMToVec32x8 (VCMPPSMasked256 [0] x y (VPMOVVec32x8ToM <types.TypeMask> mask)))
(EqualMaskedFloat32x16 x y mask) => (VPMOVMToVec32x16 (VCMPPSMasked512 [0] x y (VPMOVVec32x16ToM <types.TypeMask> mask))) (EqualMaskedFloat32x16 x y mask) => (VPMOVMToVec32x16 (VCMPPSMasked512 [0] x y (VPMOVVec32x16ToM <types.TypeMask> mask)))
@@ -453,16 +453,16 @@
(GreaterFloat64x8 x y) => (VPMOVMToVec64x8 (VCMPPD512 [14] x y)) (GreaterFloat64x8 x y) => (VPMOVMToVec64x8 (VCMPPD512 [14] x y))
(GreaterInt8x16 ...) => (VPCMPGTB128 ...) (GreaterInt8x16 ...) => (VPCMPGTB128 ...)
(GreaterInt8x32 ...) => (VPCMPGTB256 ...) (GreaterInt8x32 ...) => (VPCMPGTB256 ...)
(GreaterInt8x64 x y) => (VPMOVMToVec8x64 (VPCMPB512 [14] x y)) (GreaterInt8x64 x y) => (VPMOVMToVec8x64 (VPCMPGTB512 x y))
(GreaterInt16x8 ...) => (VPCMPGTW128 ...) (GreaterInt16x8 ...) => (VPCMPGTW128 ...)
(GreaterInt16x16 ...) => (VPCMPGTW256 ...) (GreaterInt16x16 ...) => (VPCMPGTW256 ...)
(GreaterInt16x32 x y) => (VPMOVMToVec16x32 (VPCMPW512 [14] x y)) (GreaterInt16x32 x y) => (VPMOVMToVec16x32 (VPCMPGTW512 x y))
(GreaterInt32x4 ...) => (VPCMPGTD128 ...) (GreaterInt32x4 ...) => (VPCMPGTD128 ...)
(GreaterInt32x8 ...) => (VPCMPGTD256 ...) (GreaterInt32x8 ...) => (VPCMPGTD256 ...)
(GreaterInt32x16 x y) => (VPMOVMToVec32x16 (VPCMPD512 [14] x y)) (GreaterInt32x16 x y) => (VPMOVMToVec32x16 (VPCMPGTD512 x y))
(GreaterInt64x2 ...) => (VPCMPGTQ128 ...) (GreaterInt64x2 ...) => (VPCMPGTQ128 ...)
(GreaterInt64x4 ...) => (VPCMPGTQ256 ...) (GreaterInt64x4 ...) => (VPCMPGTQ256 ...)
(GreaterInt64x8 x y) => (VPMOVMToVec64x8 (VPCMPQ512 [14] x y)) (GreaterInt64x8 x y) => (VPMOVMToVec64x8 (VPCMPGTQ512 x y))
(GreaterUint8x16 x y) => (VPMOVMToVec8x16 (VPCMPUB128 [14] x y)) (GreaterUint8x16 x y) => (VPMOVMToVec8x16 (VPCMPUB128 [14] x y))
(GreaterUint8x32 x y) => (VPMOVMToVec8x32 (VPCMPUB256 [14] x y)) (GreaterUint8x32 x y) => (VPMOVMToVec8x32 (VPCMPUB256 [14] x y))
(GreaterUint8x64 x y) => (VPMOVMToVec8x64 (VPCMPUB512 [14] x y)) (GreaterUint8x64 x y) => (VPMOVMToVec8x64 (VPCMPUB512 [14] x y))

View file

@@ -33,7 +33,7 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VADDPS128", argLength: 2, reg: v21, asm: "VADDPS", commutative: true, typ: "Vec128", resultInArg0: false}, {name: "VADDPS128", argLength: 2, reg: v21, asm: "VADDPS", commutative: true, typ: "Vec128", resultInArg0: false},
{name: "VADDPSMasked128", argLength: 3, reg: w2kw, asm: "VADDPS", commutative: true, typ: "Vec128", resultInArg0: false}, {name: "VADDPSMasked128", argLength: 3, reg: w2kw, asm: "VADDPS", commutative: true, typ: "Vec128", resultInArg0: false},
{name: "VADDSUBPS128", argLength: 2, reg: v21, asm: "VADDSUBPS", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VADDSUBPS128", argLength: 2, reg: v21, asm: "VADDSUBPS", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VRCP14PS128", argLength: 1, reg: w11, asm: "VRCP14PS", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VRCPPS128", argLength: 1, reg: v11, asm: "VRCPPS", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VRCP14PSMasked128", argLength: 2, reg: wkw, asm: "VRCP14PS", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VRCP14PSMasked128", argLength: 2, reg: wkw, asm: "VRCP14PS", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VRSQRTPS128", argLength: 1, reg: v11, asm: "VRSQRTPS", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VRSQRTPS128", argLength: 1, reg: v11, asm: "VRSQRTPS", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VRSQRT14PSMasked128", argLength: 2, reg: wkw, asm: "VRSQRT14PS", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VRSQRT14PSMasked128", argLength: 2, reg: wkw, asm: "VRSQRT14PS", commutative: false, typ: "Vec128", resultInArg0: false},
@@ -63,7 +63,7 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VADDPS256", argLength: 2, reg: v21, asm: "VADDPS", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VADDPS256", argLength: 2, reg: v21, asm: "VADDPS", commutative: true, typ: "Vec256", resultInArg0: false},
{name: "VADDPSMasked256", argLength: 3, reg: w2kw, asm: "VADDPS", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VADDPSMasked256", argLength: 3, reg: w2kw, asm: "VADDPS", commutative: true, typ: "Vec256", resultInArg0: false},
{name: "VADDSUBPS256", argLength: 2, reg: v21, asm: "VADDSUBPS", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VADDSUBPS256", argLength: 2, reg: v21, asm: "VADDSUBPS", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VRCP14PS256", argLength: 1, reg: w11, asm: "VRCP14PS", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VRCPPS256", argLength: 1, reg: v11, asm: "VRCPPS", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VRCP14PSMasked256", argLength: 2, reg: wkw, asm: "VRCP14PS", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VRCP14PSMasked256", argLength: 2, reg: wkw, asm: "VRCP14PS", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VRSQRTPS256", argLength: 1, reg: v11, asm: "VRSQRTPS", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VRSQRTPS256", argLength: 1, reg: v11, asm: "VRSQRTPS", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VRSQRT14PSMasked256", argLength: 2, reg: wkw, asm: "VRSQRT14PS", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VRSQRT14PSMasked256", argLength: 2, reg: wkw, asm: "VRSQRT14PS", commutative: false, typ: "Vec256", resultInArg0: false},
@@ -224,6 +224,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPADDW512", argLength: 2, reg: w21, asm: "VPADDW", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPADDW512", argLength: 2, reg: w21, asm: "VPADDW", commutative: true, typ: "Vec512", resultInArg0: false},
{name: "VPADDWMasked512", argLength: 3, reg: w2kw, asm: "VPADDW", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPADDWMasked512", argLength: 3, reg: w2kw, asm: "VPADDW", commutative: true, typ: "Vec512", resultInArg0: false},
{name: "VPCOMPRESSWMasked512", argLength: 2, reg: wkw, asm: "VPCOMPRESSW", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPCOMPRESSWMasked512", argLength: 2, reg: wkw, asm: "VPCOMPRESSW", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPCMPEQW512", argLength: 2, reg: w2k, asm: "VPCMPEQW", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPGTW512", argLength: 2, reg: w2k, asm: "VPCMPGTW", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPMAXSW512", argLength: 2, reg: w21, asm: "VPMAXSW", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPMAXSW512", argLength: 2, reg: w21, asm: "VPMAXSW", commutative: true, typ: "Vec512", resultInArg0: false},
{name: "VPMAXSWMasked512", argLength: 3, reg: w2kw, asm: "VPMAXSW", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPMAXSWMasked512", argLength: 3, reg: w2kw, asm: "VPMAXSW", commutative: true, typ: "Vec512", resultInArg0: false},
{name: "VPMINSW512", argLength: 2, reg: w21, asm: "VPMINSW", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPMINSW512", argLength: 2, reg: w21, asm: "VPMINSW", commutative: true, typ: "Vec512", resultInArg0: false},
@@ -305,6 +307,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPANDND512", argLength: 2, reg: w21, asm: "VPANDND", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPANDND512", argLength: 2, reg: w21, asm: "VPANDND", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPANDNDMasked512", argLength: 3, reg: w2kw, asm: "VPANDND", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPANDNDMasked512", argLength: 3, reg: w2kw, asm: "VPANDND", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPCOMPRESSDMasked512", argLength: 2, reg: wkw, asm: "VPCOMPRESSD", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPCOMPRESSDMasked512", argLength: 2, reg: wkw, asm: "VPCOMPRESSD", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPCMPEQD512", argLength: 2, reg: w2k, asm: "VPCMPEQD", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPGTD512", argLength: 2, reg: w2k, asm: "VPCMPGTD", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPMAXSD512", argLength: 2, reg: w21, asm: "VPMAXSD", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPMAXSD512", argLength: 2, reg: w21, asm: "VPMAXSD", commutative: true, typ: "Vec512", resultInArg0: false},
{name: "VPMAXSDMasked512", argLength: 3, reg: w2kw, asm: "VPMAXSD", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPMAXSDMasked512", argLength: 3, reg: w2kw, asm: "VPMAXSD", commutative: true, typ: "Vec512", resultInArg0: false},
{name: "VPMINSD512", argLength: 2, reg: w21, asm: "VPMINSD", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPMINSD512", argLength: 2, reg: w21, asm: "VPMINSD", commutative: true, typ: "Vec512", resultInArg0: false},
@@ -526,6 +530,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPANDNQ512", argLength: 2, reg: w21, asm: "VPANDNQ", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPANDNQ512", argLength: 2, reg: w21, asm: "VPANDNQ", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPANDNQMasked512", argLength: 3, reg: w2kw, asm: "VPANDNQ", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPANDNQMasked512", argLength: 3, reg: w2kw, asm: "VPANDNQ", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPCOMPRESSQMasked512", argLength: 2, reg: wkw, asm: "VPCOMPRESSQ", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPCOMPRESSQMasked512", argLength: 2, reg: wkw, asm: "VPCOMPRESSQ", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPCMPEQQ512", argLength: 2, reg: w2k, asm: "VPCMPEQQ", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPGTQ512", argLength: 2, reg: w2k, asm: "VPCMPGTQ", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPMAXSQ512", argLength: 2, reg: w21, asm: "VPMAXSQ", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPMAXSQ512", argLength: 2, reg: w21, asm: "VPMAXSQ", commutative: true, typ: "Vec512", resultInArg0: false},
{name: "VPMAXSQMasked512", argLength: 3, reg: w2kw, asm: "VPMAXSQ", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPMAXSQMasked512", argLength: 3, reg: w2kw, asm: "VPMAXSQ", commutative: true, typ: "Vec512", resultInArg0: false},
{name: "VPMINSQ512", argLength: 2, reg: w21, asm: "VPMINSQ", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPMINSQ512", argLength: 2, reg: w21, asm: "VPMINSQ", commutative: true, typ: "Vec512", resultInArg0: false},
@@ -611,6 +617,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPADDB512", argLength: 2, reg: w21, asm: "VPADDB", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPADDB512", argLength: 2, reg: w21, asm: "VPADDB", commutative: true, typ: "Vec512", resultInArg0: false},
{name: "VPADDBMasked512", argLength: 3, reg: w2kw, asm: "VPADDB", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPADDBMasked512", argLength: 3, reg: w2kw, asm: "VPADDB", commutative: true, typ: "Vec512", resultInArg0: false},
{name: "VPCOMPRESSBMasked512", argLength: 2, reg: wkw, asm: "VPCOMPRESSB", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPCOMPRESSBMasked512", argLength: 2, reg: wkw, asm: "VPCOMPRESSB", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPCMPEQB512", argLength: 2, reg: w2k, asm: "VPCMPEQB", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPGTB512", argLength: 2, reg: w2k, asm: "VPCMPGTB", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPMAXSB512", argLength: 2, reg: w21, asm: "VPMAXSB", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPMAXSB512", argLength: 2, reg: w21, asm: "VPMAXSB", commutative: true, typ: "Vec512", resultInArg0: false},
{name: "VPMAXSBMasked512", argLength: 3, reg: w2kw, asm: "VPMAXSB", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPMAXSBMasked512", argLength: 3, reg: w2kw, asm: "VPMAXSB", commutative: true, typ: "Vec512", resultInArg0: false},
{name: "VPMINSB512", argLength: 2, reg: w21, asm: "VPMINSB", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPMINSB512", argLength: 2, reg: w21, asm: "VPMINSB", commutative: true, typ: "Vec512", resultInArg0: false},
@@ -692,10 +700,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPMINUD128", argLength: 2, reg: v21, asm: "VPMINUD", commutative: true, typ: "Vec128", resultInArg0: false}, {name: "VPMINUD128", argLength: 2, reg: v21, asm: "VPMINUD", commutative: true, typ: "Vec128", resultInArg0: false},
{name: "VPMINUDMasked128", argLength: 3, reg: w2kw, asm: "VPMINUD", commutative: true, typ: "Vec128", resultInArg0: false}, {name: "VPMINUDMasked128", argLength: 3, reg: w2kw, asm: "VPMINUD", commutative: true, typ: "Vec128", resultInArg0: false},
{name: "VPMULUDQ128", argLength: 2, reg: v21, asm: "VPMULUDQ", commutative: true, typ: "Vec128", resultInArg0: false}, {name: "VPMULUDQ128", argLength: 2, reg: v21, asm: "VPMULUDQ", commutative: true, typ: "Vec128", resultInArg0: false},
{name: "VPERMI2D128", argLength: 3, reg: w31, asm: "VPERMI2D", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "VPERMI2PS128", argLength: 3, reg: w31, asm: "VPERMI2PS", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPERMI2PS128", argLength: 3, reg: w31, asm: "VPERMI2PS", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "VPERMI2PSMasked128", argLength: 4, reg: w3kw, asm: "VPERMI2PS", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPERMI2D128", argLength: 3, reg: w31, asm: "VPERMI2D", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "VPERMI2DMasked128", argLength: 4, reg: w3kw, asm: "VPERMI2D", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPERMI2DMasked128", argLength: 4, reg: w3kw, asm: "VPERMI2D", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "VPERMI2PSMasked128", argLength: 4, reg: w3kw, asm: "VPERMI2PS", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "VPSRLD128", argLength: 2, reg: vfpv, asm: "VPSRLD", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPSRLD128", argLength: 2, reg: vfpv, asm: "VPSRLD", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPSRLDMasked128", argLength: 3, reg: wfpkw, asm: "VPSRLD", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPSRLDMasked128", argLength: 3, reg: wfpkw, asm: "VPSRLD", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPSRLVD128", argLength: 2, reg: v21, asm: "VPSRLVD", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPSRLVD128", argLength: 2, reg: v21, asm: "VPSRLVD", commutative: false, typ: "Vec128", resultInArg0: false},
@@ -705,12 +713,12 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPMINUD256", argLength: 2, reg: v21, asm: "VPMINUD", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VPMINUD256", argLength: 2, reg: v21, asm: "VPMINUD", commutative: true, typ: "Vec256", resultInArg0: false},
{name: "VPMINUDMasked256", argLength: 3, reg: w2kw, asm: "VPMINUD", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VPMINUDMasked256", argLength: 3, reg: w2kw, asm: "VPMINUD", commutative: true, typ: "Vec256", resultInArg0: false},
{name: "VPMULUDQ256", argLength: 2, reg: v21, asm: "VPMULUDQ", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VPMULUDQ256", argLength: 2, reg: v21, asm: "VPMULUDQ", commutative: true, typ: "Vec256", resultInArg0: false},
{name: "VPERMPS256", argLength: 2, reg: v21, asm: "VPERMPS", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPERMD256", argLength: 2, reg: v21, asm: "VPERMD", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPERMD256", argLength: 2, reg: v21, asm: "VPERMD", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPERMI2D256", argLength: 3, reg: w31, asm: "VPERMI2D", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPERMPS256", argLength: 2, reg: v21, asm: "VPERMPS", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPERMI2PS256", argLength: 3, reg: w31, asm: "VPERMI2PS", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPERMI2PS256", argLength: 3, reg: w31, asm: "VPERMI2PS", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPERMI2DMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2D", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPERMI2D256", argLength: 3, reg: w31, asm: "VPERMI2D", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPERMI2PSMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2PS", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPERMI2PSMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2PS", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPERMI2DMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2D", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPERMPSMasked256", argLength: 3, reg: w2kw, asm: "VPERMPS", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPERMPSMasked256", argLength: 3, reg: w2kw, asm: "VPERMPS", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPERMDMasked256", argLength: 3, reg: w2kw, asm: "VPERMD", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPERMDMasked256", argLength: 3, reg: w2kw, asm: "VPERMD", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPSRLD256", argLength: 2, reg: vfpv, asm: "VPSRLD", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPSRLD256", argLength: 2, reg: vfpv, asm: "VPSRLD", commutative: false, typ: "Vec256", resultInArg0: false},
@@ -735,10 +743,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPMINUQ256", argLength: 2, reg: w21, asm: "VPMINUQ", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VPMINUQ256", argLength: 2, reg: w21, asm: "VPMINUQ", commutative: true, typ: "Vec256", resultInArg0: false},
{name: "VPMINUQMasked256", argLength: 3, reg: w2kw, asm: "VPMINUQ", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VPMINUQMasked256", argLength: 3, reg: w2kw, asm: "VPMINUQ", commutative: true, typ: "Vec256", resultInArg0: false},
{name: "VPMULUDQMasked256", argLength: 3, reg: w2kw, asm: "VPMULUDQ", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VPMULUDQMasked256", argLength: 3, reg: w2kw, asm: "VPMULUDQ", commutative: true, typ: "Vec256", resultInArg0: false},
{name: "VPERMQ256", argLength: 2, reg: w21, asm: "VPERMQ", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPERMPD256", argLength: 2, reg: w21, asm: "VPERMPD", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPERMPD256", argLength: 2, reg: w21, asm: "VPERMPD", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPERMI2Q256", argLength: 3, reg: w31, asm: "VPERMI2Q", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPERMQ256", argLength: 2, reg: w21, asm: "VPERMQ", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPERMI2PD256", argLength: 3, reg: w31, asm: "VPERMI2PD", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPERMI2PD256", argLength: 3, reg: w31, asm: "VPERMI2PD", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPERMI2Q256", argLength: 3, reg: w31, asm: "VPERMI2Q", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPERMI2PDMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2PD", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPERMI2PDMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2PD", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPERMI2QMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2Q", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPERMI2QMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2Q", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPERMQMasked256", argLength: 3, reg: w2kw, asm: "VPERMQ", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPERMQMasked256", argLength: 3, reg: w2kw, asm: "VPERMQ", commutative: false, typ: "Vec256", resultInArg0: false},
@ -759,8 +767,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPERMI2PD512", argLength: 3, reg: w31, asm: "VPERMI2PD", commutative: false, typ: "Vec512", resultInArg0: true}, {name: "VPERMI2PD512", argLength: 3, reg: w31, asm: "VPERMI2PD", commutative: false, typ: "Vec512", resultInArg0: true},
{name: "VPERMI2QMasked512", argLength: 4, reg: w3kw, asm: "VPERMI2Q", commutative: false, typ: "Vec512", resultInArg0: true}, {name: "VPERMI2QMasked512", argLength: 4, reg: w3kw, asm: "VPERMI2Q", commutative: false, typ: "Vec512", resultInArg0: true},
{name: "VPERMI2PDMasked512", argLength: 4, reg: w3kw, asm: "VPERMI2PD", commutative: false, typ: "Vec512", resultInArg0: true}, {name: "VPERMI2PDMasked512", argLength: 4, reg: w3kw, asm: "VPERMI2PD", commutative: false, typ: "Vec512", resultInArg0: true},
{name: "VPERMQMasked512", argLength: 3, reg: w2kw, asm: "VPERMQ", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPERMPDMasked512", argLength: 3, reg: w2kw, asm: "VPERMPD", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPERMPDMasked512", argLength: 3, reg: w2kw, asm: "VPERMPD", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPERMQMasked512", argLength: 3, reg: w2kw, asm: "VPERMQ", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPSRLQ512", argLength: 2, reg: wfpw, asm: "VPSRLQ", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPSRLQ512", argLength: 2, reg: wfpw, asm: "VPSRLQ", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPSRLQMasked512", argLength: 3, reg: wfpkw, asm: "VPSRLQ", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPSRLQMasked512", argLength: 3, reg: wfpkw, asm: "VPSRLQ", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPSRLVQ512", argLength: 2, reg: w21, asm: "VPSRLVQ", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPSRLVQ512", argLength: 2, reg: w21, asm: "VPSRLVQ", commutative: false, typ: "Vec512", resultInArg0: false},
@ -858,8 +866,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPSHLDWMasked256", argLength: 3, reg: w2kw, asm: "VPSHLDW", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPSHLDWMasked256", argLength: 3, reg: w2kw, asm: "VPSHLDW", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPSHRDW256", argLength: 2, reg: w21, asm: "VPSHRDW", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPSHRDW256", argLength: 2, reg: w21, asm: "VPSHRDW", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPSHRDWMasked256", argLength: 3, reg: w2kw, asm: "VPSHRDW", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPSHRDWMasked256", argLength: 3, reg: w2kw, asm: "VPSHRDW", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPCMPW512", argLength: 2, reg: w2k, asm: "VPCMPW", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPWMasked512", argLength: 3, reg: w2kk, asm: "VPCMPW", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, {name: "VPCMPWMasked512", argLength: 3, reg: w2kk, asm: "VPCMPW", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPW512", argLength: 2, reg: w2k, asm: "VPCMPW", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPSHLDW512", argLength: 2, reg: w21, asm: "VPSHLDW", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPSHLDW512", argLength: 2, reg: w21, asm: "VPSHLDW", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPSHLDWMasked512", argLength: 3, reg: w2kw, asm: "VPSHLDW", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPSHLDWMasked512", argLength: 3, reg: w2kw, asm: "VPSHLDW", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPSHRDW512", argLength: 2, reg: w21, asm: "VPSHRDW", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPSHRDW512", argLength: 2, reg: w21, asm: "VPSHRDW", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false},
@ -872,8 +880,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPSHLDWMasked128", argLength: 3, reg: w2kw, asm: "VPSHLDW", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPSHLDWMasked128", argLength: 3, reg: w2kw, asm: "VPSHLDW", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPSHRDW128", argLength: 2, reg: w21, asm: "VPSHRDW", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPSHRDW128", argLength: 2, reg: w21, asm: "VPSHRDW", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPSHRDWMasked128", argLength: 3, reg: w2kw, asm: "VPSHRDW", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPSHRDWMasked128", argLength: 3, reg: w2kw, asm: "VPSHRDW", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPCMPD512", argLength: 2, reg: w2k, asm: "VPCMPD", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPDMasked512", argLength: 3, reg: w2kk, asm: "VPCMPD", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, {name: "VPCMPDMasked512", argLength: 3, reg: w2kk, asm: "VPCMPD", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPD512", argLength: 2, reg: w2k, asm: "VPCMPD", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPROLD512", argLength: 1, reg: w11, asm: "VPROLD", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPROLD512", argLength: 1, reg: w11, asm: "VPROLD", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPROLDMasked512", argLength: 2, reg: wkw, asm: "VPROLD", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPROLDMasked512", argLength: 2, reg: wkw, asm: "VPROLD", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPRORD512", argLength: 1, reg: w11, asm: "VPRORD", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPRORD512", argLength: 1, reg: w11, asm: "VPRORD", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false},
@ -926,8 +934,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPSHLDQMasked256", argLength: 3, reg: w2kw, asm: "VPSHLDQ", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPSHLDQMasked256", argLength: 3, reg: w2kw, asm: "VPSHLDQ", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPSHRDQ256", argLength: 2, reg: w21, asm: "VPSHRDQ", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPSHRDQ256", argLength: 2, reg: w21, asm: "VPSHRDQ", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPSHRDQMasked256", argLength: 3, reg: w2kw, asm: "VPSHRDQ", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPSHRDQMasked256", argLength: 3, reg: w2kw, asm: "VPSHRDQ", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPCMPQ512", argLength: 2, reg: w2k, asm: "VPCMPQ", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPQMasked512", argLength: 3, reg: w2kk, asm: "VPCMPQ", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, {name: "VPCMPQMasked512", argLength: 3, reg: w2kk, asm: "VPCMPQ", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPQ512", argLength: 2, reg: w2k, asm: "VPCMPQ", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPROLQ512", argLength: 1, reg: w11, asm: "VPROLQ", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPROLQ512", argLength: 1, reg: w11, asm: "VPROLQ", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPROLQMasked512", argLength: 2, reg: wkw, asm: "VPROLQ", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPROLQMasked512", argLength: 2, reg: wkw, asm: "VPROLQ", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPRORQ512", argLength: 1, reg: w11, asm: "VPRORQ", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPRORQ512", argLength: 1, reg: w11, asm: "VPRORQ", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false},
@ -944,16 +952,16 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VEXTRACTI128128", argLength: 1, reg: v11, asm: "VEXTRACTI128", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VEXTRACTI128128", argLength: 1, reg: v11, asm: "VEXTRACTI128", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPCMPB256", argLength: 2, reg: w2k, asm: "VPCMPB", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false}, {name: "VPCMPB256", argLength: 2, reg: w2k, asm: "VPCMPB", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VINSERTI128256", argLength: 2, reg: v21, asm: "VINSERTI128", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VINSERTI128256", argLength: 2, reg: v21, asm: "VINSERTI128", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPCMPB512", argLength: 2, reg: w2k, asm: "VPCMPB", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPBMasked512", argLength: 3, reg: w2kk, asm: "VPCMPB", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, {name: "VPCMPBMasked512", argLength: 3, reg: w2kk, asm: "VPCMPB", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPB512", argLength: 2, reg: w2k, asm: "VPCMPB", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPCMPUWMasked256", argLength: 3, reg: w2kk, asm: "VPCMPUW", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, {name: "VPCMPUWMasked256", argLength: 3, reg: w2kk, asm: "VPCMPUW", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPUW256", argLength: 2, reg: w2k, asm: "VPCMPUW", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false}, {name: "VPCMPUW256", argLength: 2, reg: w2k, asm: "VPCMPUW", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPCMPUW512", argLength: 2, reg: w2k, asm: "VPCMPUW", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPUWMasked512", argLength: 3, reg: w2kk, asm: "VPCMPUW", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, {name: "VPCMPUWMasked512", argLength: 3, reg: w2kk, asm: "VPCMPUW", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPUW512", argLength: 2, reg: w2k, asm: "VPCMPUW", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPCMPUWMasked128", argLength: 3, reg: w2kk, asm: "VPCMPUW", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, {name: "VPCMPUWMasked128", argLength: 3, reg: w2kk, asm: "VPCMPUW", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPUW128", argLength: 2, reg: w2k, asm: "VPCMPUW", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false}, {name: "VPCMPUW128", argLength: 2, reg: w2k, asm: "VPCMPUW", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPCMPUD512", argLength: 2, reg: w2k, asm: "VPCMPUD", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPUDMasked512", argLength: 3, reg: w2kk, asm: "VPCMPUD", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, {name: "VPCMPUDMasked512", argLength: 3, reg: w2kk, asm: "VPCMPUD", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPUD512", argLength: 2, reg: w2k, asm: "VPCMPUD", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPCMPUDMasked128", argLength: 3, reg: w2kk, asm: "VPCMPUD", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, {name: "VPCMPUDMasked128", argLength: 3, reg: w2kk, asm: "VPCMPUD", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPUD128", argLength: 2, reg: w2k, asm: "VPCMPUD", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false}, {name: "VPCMPUD128", argLength: 2, reg: w2k, asm: "VPCMPUD", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPCMPUDMasked256", argLength: 3, reg: w2kk, asm: "VPCMPUD", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, {name: "VPCMPUDMasked256", argLength: 3, reg: w2kk, asm: "VPCMPUD", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
@ -962,8 +970,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPCMPUQ128", argLength: 2, reg: w2k, asm: "VPCMPUQ", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false}, {name: "VPCMPUQ128", argLength: 2, reg: w2k, asm: "VPCMPUQ", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPCMPUQMasked256", argLength: 3, reg: w2kk, asm: "VPCMPUQ", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, {name: "VPCMPUQMasked256", argLength: 3, reg: w2kk, asm: "VPCMPUQ", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPUQ256", argLength: 2, reg: w2k, asm: "VPCMPUQ", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false}, {name: "VPCMPUQ256", argLength: 2, reg: w2k, asm: "VPCMPUQ", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPCMPUQ512", argLength: 2, reg: w2k, asm: "VPCMPUQ", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPUQMasked512", argLength: 3, reg: w2kk, asm: "VPCMPUQ", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, {name: "VPCMPUQMasked512", argLength: 3, reg: w2kk, asm: "VPCMPUQ", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPUQ512", argLength: 2, reg: w2k, asm: "VPCMPUQ", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPCMPUBMasked128", argLength: 3, reg: w2kk, asm: "VPCMPUB", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, {name: "VPCMPUBMasked128", argLength: 3, reg: w2kk, asm: "VPCMPUB", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VGF2P8AFFINEQB128", argLength: 2, reg: w21, asm: "VGF2P8AFFINEQB", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VGF2P8AFFINEQB128", argLength: 2, reg: w21, asm: "VGF2P8AFFINEQB", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VGF2P8AFFINEINVQB128", argLength: 2, reg: w21, asm: "VGF2P8AFFINEINVQB", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VGF2P8AFFINEINVQB128", argLength: 2, reg: w21, asm: "VGF2P8AFFINEINVQB", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false},
@ -976,11 +984,11 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VGF2P8AFFINEINVQBMasked256", argLength: 3, reg: w2kw, asm: "VGF2P8AFFINEINVQB", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VGF2P8AFFINEINVQBMasked256", argLength: 3, reg: w2kw, asm: "VGF2P8AFFINEINVQB", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VGF2P8AFFINEQBMasked256", argLength: 3, reg: w2kw, asm: "VGF2P8AFFINEQB", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VGF2P8AFFINEQBMasked256", argLength: 3, reg: w2kw, asm: "VGF2P8AFFINEQB", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPCMPUB256", argLength: 2, reg: w2k, asm: "VPCMPUB", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false}, {name: "VPCMPUB256", argLength: 2, reg: w2k, asm: "VPCMPUB", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPCMPUB512", argLength: 2, reg: w2k, asm: "VPCMPUB", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPUBMasked512", argLength: 3, reg: w2kk, asm: "VPCMPUB", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, {name: "VPCMPUBMasked512", argLength: 3, reg: w2kk, asm: "VPCMPUB", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VGF2P8AFFINEQB512", argLength: 2, reg: w21, asm: "VGF2P8AFFINEQB", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VGF2P8AFFINEQB512", argLength: 2, reg: w21, asm: "VGF2P8AFFINEQB", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VGF2P8AFFINEINVQB512", argLength: 2, reg: w21, asm: "VGF2P8AFFINEINVQB", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VGF2P8AFFINEINVQB512", argLength: 2, reg: w21, asm: "VGF2P8AFFINEINVQB", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VGF2P8AFFINEINVQBMasked512", argLength: 3, reg: w2kw, asm: "VGF2P8AFFINEINVQB", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VGF2P8AFFINEINVQBMasked512", argLength: 3, reg: w2kw, asm: "VGF2P8AFFINEINVQB", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VGF2P8AFFINEQBMasked512", argLength: 3, reg: w2kw, asm: "VGF2P8AFFINEQB", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VGF2P8AFFINEQBMasked512", argLength: 3, reg: w2kw, asm: "VGF2P8AFFINEQB", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPCMPUB512", argLength: 2, reg: w2k, asm: "VPCMPUB", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
} }
} }

View file

@ -912,10 +912,10 @@ func simdGenericOps() []opData {
{name: "PermuteUint16x16", argLength: 2, commutative: false}, {name: "PermuteUint16x16", argLength: 2, commutative: false},
{name: "Permute2Uint16x16", argLength: 3, commutative: false}, {name: "Permute2Uint16x16", argLength: 3, commutative: false},
{name: "Permute2Int16x16", argLength: 3, commutative: false}, {name: "Permute2Int16x16", argLength: 3, commutative: false},
{name: "Permute2MaskedInt16x16", argLength: 4, commutative: false},
{name: "Permute2MaskedUint16x16", argLength: 4, commutative: false}, {name: "Permute2MaskedUint16x16", argLength: 4, commutative: false},
{name: "PermuteMaskedUint16x16", argLength: 3, commutative: false}, {name: "Permute2MaskedInt16x16", argLength: 4, commutative: false},
{name: "PermuteMaskedInt16x16", argLength: 3, commutative: false}, {name: "PermuteMaskedInt16x16", argLength: 3, commutative: false},
{name: "PermuteMaskedUint16x16", argLength: 3, commutative: false},
{name: "PopCountUint16x16", argLength: 1, commutative: false}, {name: "PopCountUint16x16", argLength: 1, commutative: false},
{name: "PopCountMaskedUint16x16", argLength: 2, commutative: false}, {name: "PopCountMaskedUint16x16", argLength: 2, commutative: false},
{name: "SaturatedAddUint16x16", argLength: 2, commutative: true}, {name: "SaturatedAddUint16x16", argLength: 2, commutative: true},
@ -966,8 +966,8 @@ func simdGenericOps() []opData {
{name: "Permute2Int16x32", argLength: 3, commutative: false}, {name: "Permute2Int16x32", argLength: 3, commutative: false},
{name: "Permute2MaskedUint16x32", argLength: 4, commutative: false}, {name: "Permute2MaskedUint16x32", argLength: 4, commutative: false},
{name: "Permute2MaskedInt16x32", argLength: 4, commutative: false}, {name: "Permute2MaskedInt16x32", argLength: 4, commutative: false},
{name: "PermuteMaskedUint16x32", argLength: 3, commutative: false},
{name: "PermuteMaskedInt16x32", argLength: 3, commutative: false}, {name: "PermuteMaskedInt16x32", argLength: 3, commutative: false},
{name: "PermuteMaskedUint16x32", argLength: 3, commutative: false},
{name: "PopCountUint16x32", argLength: 1, commutative: false}, {name: "PopCountUint16x32", argLength: 1, commutative: false},
{name: "PopCountMaskedUint16x32", argLength: 2, commutative: false}, {name: "PopCountMaskedUint16x32", argLength: 2, commutative: false},
{name: "SaturatedAddUint16x32", argLength: 2, commutative: true}, {name: "SaturatedAddUint16x32", argLength: 2, commutative: true},
@ -1018,12 +1018,12 @@ func simdGenericOps() []opData {
{name: "PairwiseSubUint16x8", argLength: 2, commutative: false}, {name: "PairwiseSubUint16x8", argLength: 2, commutative: false},
{name: "PermuteInt16x8", argLength: 2, commutative: false}, {name: "PermuteInt16x8", argLength: 2, commutative: false},
{name: "PermuteUint16x8", argLength: 2, commutative: false}, {name: "PermuteUint16x8", argLength: 2, commutative: false},
{name: "Permute2Int16x8", argLength: 3, commutative: false},
{name: "Permute2Uint16x8", argLength: 3, commutative: false}, {name: "Permute2Uint16x8", argLength: 3, commutative: false},
{name: "Permute2Int16x8", argLength: 3, commutative: false},
{name: "Permute2MaskedInt16x8", argLength: 4, commutative: false}, {name: "Permute2MaskedInt16x8", argLength: 4, commutative: false},
{name: "Permute2MaskedUint16x8", argLength: 4, commutative: false}, {name: "Permute2MaskedUint16x8", argLength: 4, commutative: false},
{name: "PermuteMaskedUint16x8", argLength: 3, commutative: false},
{name: "PermuteMaskedInt16x8", argLength: 3, commutative: false}, {name: "PermuteMaskedInt16x8", argLength: 3, commutative: false},
{name: "PermuteMaskedUint16x8", argLength: 3, commutative: false},
{name: "PopCountUint16x8", argLength: 1, commutative: false}, {name: "PopCountUint16x8", argLength: 1, commutative: false},
{name: "PopCountMaskedUint16x8", argLength: 2, commutative: false}, {name: "PopCountMaskedUint16x8", argLength: 2, commutative: false},
{name: "SaturatedAddUint16x8", argLength: 2, commutative: true}, {name: "SaturatedAddUint16x8", argLength: 2, commutative: true},
@ -1070,17 +1070,17 @@ func simdGenericOps() []opData {
{name: "NotEqualMaskedUint32x16", argLength: 3, commutative: true}, {name: "NotEqualMaskedUint32x16", argLength: 3, commutative: true},
{name: "OrUint32x16", argLength: 2, commutative: true}, {name: "OrUint32x16", argLength: 2, commutative: true},
{name: "OrMaskedUint32x16", argLength: 3, commutative: true}, {name: "OrMaskedUint32x16", argLength: 3, commutative: true},
{name: "PermuteFloat32x16", argLength: 2, commutative: false},
{name: "PermuteInt32x16", argLength: 2, commutative: false}, {name: "PermuteInt32x16", argLength: 2, commutative: false},
{name: "PermuteFloat32x16", argLength: 2, commutative: false},
{name: "PermuteUint32x16", argLength: 2, commutative: false}, {name: "PermuteUint32x16", argLength: 2, commutative: false},
{name: "Permute2Uint32x16", argLength: 3, commutative: false}, {name: "Permute2Uint32x16", argLength: 3, commutative: false},
{name: "Permute2Float32x16", argLength: 3, commutative: false}, {name: "Permute2Float32x16", argLength: 3, commutative: false},
{name: "Permute2Int32x16", argLength: 3, commutative: false}, {name: "Permute2Int32x16", argLength: 3, commutative: false},
{name: "Permute2MaskedUint32x16", argLength: 4, commutative: false},
{name: "Permute2MaskedInt32x16", argLength: 4, commutative: false}, {name: "Permute2MaskedInt32x16", argLength: 4, commutative: false},
{name: "Permute2MaskedFloat32x16", argLength: 4, commutative: false}, {name: "Permute2MaskedFloat32x16", argLength: 4, commutative: false},
{name: "Permute2MaskedUint32x16", argLength: 4, commutative: false},
{name: "PermuteMaskedInt32x16", argLength: 3, commutative: false},
{name: "PermuteMaskedFloat32x16", argLength: 3, commutative: false}, {name: "PermuteMaskedFloat32x16", argLength: 3, commutative: false},
{name: "PermuteMaskedInt32x16", argLength: 3, commutative: false},
{name: "PermuteMaskedUint32x16", argLength: 3, commutative: false}, {name: "PermuteMaskedUint32x16", argLength: 3, commutative: false},
{name: "PopCountUint32x16", argLength: 1, commutative: false}, {name: "PopCountUint32x16", argLength: 1, commutative: false},
{name: "PopCountMaskedUint32x16", argLength: 2, commutative: false}, {name: "PopCountMaskedUint32x16", argLength: 2, commutative: false},
@ -1307,15 +1307,15 @@ func simdGenericOps() []opData {
{name: "PermuteUint64x4", argLength: 2, commutative: false}, {name: "PermuteUint64x4", argLength: 2, commutative: false},
{name: "PermuteInt64x4", argLength: 2, commutative: false}, {name: "PermuteInt64x4", argLength: 2, commutative: false},
{name: "PermuteFloat64x4", argLength: 2, commutative: false}, {name: "PermuteFloat64x4", argLength: 2, commutative: false},
{name: "Permute2Float64x4", argLength: 3, commutative: false},
{name: "Permute2Int64x4", argLength: 3, commutative: false},
{name: "Permute2Uint64x4", argLength: 3, commutative: false}, {name: "Permute2Uint64x4", argLength: 3, commutative: false},
{name: "Permute2MaskedFloat64x4", argLength: 4, commutative: false}, {name: "Permute2Int64x4", argLength: 3, commutative: false},
{name: "Permute2Float64x4", argLength: 3, commutative: false},
{name: "Permute2MaskedUint64x4", argLength: 4, commutative: false}, {name: "Permute2MaskedUint64x4", argLength: 4, commutative: false},
{name: "Permute2MaskedFloat64x4", argLength: 4, commutative: false},
{name: "Permute2MaskedInt64x4", argLength: 4, commutative: false}, {name: "Permute2MaskedInt64x4", argLength: 4, commutative: false},
{name: "PermuteMaskedUint64x4", argLength: 3, commutative: false},
{name: "PermuteMaskedFloat64x4", argLength: 3, commutative: false}, {name: "PermuteMaskedFloat64x4", argLength: 3, commutative: false},
{name: "PermuteMaskedInt64x4", argLength: 3, commutative: false}, {name: "PermuteMaskedInt64x4", argLength: 3, commutative: false},
{name: "PermuteMaskedUint64x4", argLength: 3, commutative: false},
{name: "PopCountUint64x4", argLength: 1, commutative: false}, {name: "PopCountUint64x4", argLength: 1, commutative: false},
{name: "PopCountMaskedUint64x4", argLength: 2, commutative: false}, {name: "PopCountMaskedUint64x4", argLength: 2, commutative: false},
{name: "RotateLeftUint64x4", argLength: 2, commutative: false}, {name: "RotateLeftUint64x4", argLength: 2, commutative: false},
@ -1365,18 +1365,18 @@ func simdGenericOps() []opData {
{name: "NotEqualMaskedUint64x8", argLength: 3, commutative: true}, {name: "NotEqualMaskedUint64x8", argLength: 3, commutative: true},
{name: "OrUint64x8", argLength: 2, commutative: true}, {name: "OrUint64x8", argLength: 2, commutative: true},
{name: "OrMaskedUint64x8", argLength: 3, commutative: true}, {name: "OrMaskedUint64x8", argLength: 3, commutative: true},
{name: "PermuteUint64x8", argLength: 2, commutative: false},
{name: "PermuteFloat64x8", argLength: 2, commutative: false}, {name: "PermuteFloat64x8", argLength: 2, commutative: false},
{name: "PermuteInt64x8", argLength: 2, commutative: false}, {name: "PermuteInt64x8", argLength: 2, commutative: false},
{name: "PermuteUint64x8", argLength: 2, commutative: false},
{name: "Permute2Int64x8", argLength: 3, commutative: false},
{name: "Permute2Float64x8", argLength: 3, commutative: false}, {name: "Permute2Float64x8", argLength: 3, commutative: false},
{name: "Permute2Uint64x8", argLength: 3, commutative: false}, {name: "Permute2Uint64x8", argLength: 3, commutative: false},
{name: "Permute2Int64x8", argLength: 3, commutative: false},
{name: "Permute2MaskedFloat64x8", argLength: 4, commutative: false},
{name: "Permute2MaskedUint64x8", argLength: 4, commutative: false}, {name: "Permute2MaskedUint64x8", argLength: 4, commutative: false},
{name: "Permute2MaskedInt64x8", argLength: 4, commutative: false}, {name: "Permute2MaskedInt64x8", argLength: 4, commutative: false},
{name: "Permute2MaskedFloat64x8", argLength: 4, commutative: false},
{name: "PermuteMaskedUint64x8", argLength: 3, commutative: false},
{name: "PermuteMaskedFloat64x8", argLength: 3, commutative: false},
{name: "PermuteMaskedInt64x8", argLength: 3, commutative: false}, {name: "PermuteMaskedInt64x8", argLength: 3, commutative: false},
{name: "PermuteMaskedFloat64x8", argLength: 3, commutative: false},
{name: "PermuteMaskedUint64x8", argLength: 3, commutative: false},
{name: "PopCountUint64x8", argLength: 1, commutative: false}, {name: "PopCountUint64x8", argLength: 1, commutative: false},
{name: "PopCountMaskedUint64x8", argLength: 2, commutative: false}, {name: "PopCountMaskedUint64x8", argLength: 2, commutative: false},
{name: "RotateLeftUint64x8", argLength: 2, commutative: false}, {name: "RotateLeftUint64x8", argLength: 2, commutative: false},

File diff suppressed because it is too large Load diff

View file

@ -985,10 +985,10 @@ func rewriteValueAMD64(v *Value) bool {
v.Op = OpAMD64VRCP14PS512 v.Op = OpAMD64VRCP14PS512
return true return true
case OpApproximateReciprocalFloat32x4: case OpApproximateReciprocalFloat32x4:
v.Op = OpAMD64VRCP14PS128 v.Op = OpAMD64VRCPPS128
return true return true
case OpApproximateReciprocalFloat32x8: case OpApproximateReciprocalFloat32x8:
v.Op = OpAMD64VRCP14PS256 v.Op = OpAMD64VRCPPS256
return true return true
case OpApproximateReciprocalFloat64x2: case OpApproximateReciprocalFloat64x2:
v.Op = OpAMD64VRCP14PD128 v.Op = OpAMD64VRCP14PD128
@ -5184,6 +5184,30 @@ func rewriteValueAMD64(v *Value) bool {
return true return true
case OpStore: case OpStore:
return rewriteValueAMD64_OpStore(v) return rewriteValueAMD64_OpStore(v)
case OpStoreMask16x16:
return rewriteValueAMD64_OpStoreMask16x16(v)
case OpStoreMask16x32:
return rewriteValueAMD64_OpStoreMask16x32(v)
case OpStoreMask16x8:
return rewriteValueAMD64_OpStoreMask16x8(v)
case OpStoreMask32x16:
return rewriteValueAMD64_OpStoreMask32x16(v)
case OpStoreMask32x4:
return rewriteValueAMD64_OpStoreMask32x4(v)
case OpStoreMask32x8:
return rewriteValueAMD64_OpStoreMask32x8(v)
case OpStoreMask64x2:
return rewriteValueAMD64_OpStoreMask64x2(v)
case OpStoreMask64x4:
return rewriteValueAMD64_OpStoreMask64x4(v)
case OpStoreMask64x8:
return rewriteValueAMD64_OpStoreMask64x8(v)
case OpStoreMask8x16:
return rewriteValueAMD64_OpStoreMask8x16(v)
case OpStoreMask8x32:
return rewriteValueAMD64_OpStoreMask8x32(v)
case OpStoreMask8x64:
return rewriteValueAMD64_OpStoreMask8x64(v)
case OpSub16: case OpSub16:
v.Op = OpAMD64SUBL v.Op = OpAMD64SUBL
return true return true
@ -33388,13 +33412,12 @@ func rewriteValueAMD64_OpEqualInt16x32(v *Value) bool {
b := v.Block b := v.Block
typ := &b.Func.Config.Types typ := &b.Func.Config.Types
// match: (EqualInt16x32 x y) // match: (EqualInt16x32 x y)
// result: (VPMOVMToVec16x32 (VPCMPW512 [0] x y)) // result: (VPMOVMToVec16x32 (VPCMPEQW512 x y))
for { for {
x := v_0 x := v_0
y := v_1 y := v_1
v.reset(OpAMD64VPMOVMToVec16x32) v.reset(OpAMD64VPMOVMToVec16x32)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPW512, typ.Mask) v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQW512, typ.Mask)
v0.AuxInt = int8ToAuxInt(0)
v0.AddArg2(x, y) v0.AddArg2(x, y)
v.AddArg(v0) v.AddArg(v0)
return true return true
@ -33406,13 +33429,12 @@ func rewriteValueAMD64_OpEqualInt32x16(v *Value) bool {
b := v.Block b := v.Block
typ := &b.Func.Config.Types typ := &b.Func.Config.Types
// match: (EqualInt32x16 x y) // match: (EqualInt32x16 x y)
// result: (VPMOVMToVec32x16 (VPCMPD512 [0] x y)) // result: (VPMOVMToVec32x16 (VPCMPEQD512 x y))
for { for {
x := v_0 x := v_0
y := v_1 y := v_1
v.reset(OpAMD64VPMOVMToVec32x16) v.reset(OpAMD64VPMOVMToVec32x16)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPD512, typ.Mask) v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQD512, typ.Mask)
v0.AuxInt = int8ToAuxInt(0)
v0.AddArg2(x, y) v0.AddArg2(x, y)
v.AddArg(v0) v.AddArg(v0)
return true return true
@ -33424,13 +33446,12 @@ func rewriteValueAMD64_OpEqualInt64x8(v *Value) bool {
b := v.Block b := v.Block
typ := &b.Func.Config.Types typ := &b.Func.Config.Types
// match: (EqualInt64x8 x y) // match: (EqualInt64x8 x y)
// result: (VPMOVMToVec64x8 (VPCMPQ512 [0] x y)) // result: (VPMOVMToVec64x8 (VPCMPEQQ512 x y))
for { for {
x := v_0 x := v_0
y := v_1 y := v_1
v.reset(OpAMD64VPMOVMToVec64x8) v.reset(OpAMD64VPMOVMToVec64x8)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPQ512, typ.Mask) v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQQ512, typ.Mask)
v0.AuxInt = int8ToAuxInt(0)
v0.AddArg2(x, y) v0.AddArg2(x, y)
v.AddArg(v0) v.AddArg(v0)
return true return true
@ -33442,13 +33463,12 @@ func rewriteValueAMD64_OpEqualInt8x64(v *Value) bool {
b := v.Block b := v.Block
typ := &b.Func.Config.Types typ := &b.Func.Config.Types
// match: (EqualInt8x64 x y) // match: (EqualInt8x64 x y)
// result: (VPMOVMToVec8x64 (VPCMPB512 [0] x y)) // result: (VPMOVMToVec8x64 (VPCMPEQB512 x y))
for { for {
x := v_0 x := v_0
y := v_1 y := v_1
v.reset(OpAMD64VPMOVMToVec8x64) v.reset(OpAMD64VPMOVMToVec8x64)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPB512, typ.Mask) v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQB512, typ.Mask)
v0.AuxInt = int8ToAuxInt(0)
v0.AddArg2(x, y) v0.AddArg2(x, y)
v.AddArg(v0) v.AddArg(v0)
return true return true
@ -34120,13 +34140,12 @@ func rewriteValueAMD64_OpEqualUint16x32(v *Value) bool {
b := v.Block b := v.Block
typ := &b.Func.Config.Types typ := &b.Func.Config.Types
// match: (EqualUint16x32 x y) // match: (EqualUint16x32 x y)
// result: (VPMOVMToVec16x32 (VPCMPUW512 [0] x y)) // result: (VPMOVMToVec16x32 (VPCMPEQW512 x y))
for { for {
x := v_0 x := v_0
y := v_1 y := v_1
v.reset(OpAMD64VPMOVMToVec16x32) v.reset(OpAMD64VPMOVMToVec16x32)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPUW512, typ.Mask) v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQW512, typ.Mask)
v0.AuxInt = int8ToAuxInt(0)
v0.AddArg2(x, y) v0.AddArg2(x, y)
v.AddArg(v0) v.AddArg(v0)
return true return true
@ -34138,13 +34157,12 @@ func rewriteValueAMD64_OpEqualUint32x16(v *Value) bool {
b := v.Block b := v.Block
typ := &b.Func.Config.Types typ := &b.Func.Config.Types
// match: (EqualUint32x16 x y) // match: (EqualUint32x16 x y)
// result: (VPMOVMToVec32x16 (VPCMPUD512 [0] x y)) // result: (VPMOVMToVec32x16 (VPCMPEQD512 x y))
for { for {
x := v_0 x := v_0
y := v_1 y := v_1
v.reset(OpAMD64VPMOVMToVec32x16) v.reset(OpAMD64VPMOVMToVec32x16)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPUD512, typ.Mask) v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQD512, typ.Mask)
v0.AuxInt = int8ToAuxInt(0)
v0.AddArg2(x, y) v0.AddArg2(x, y)
v.AddArg(v0) v.AddArg(v0)
return true return true
@ -34156,13 +34174,12 @@ func rewriteValueAMD64_OpEqualUint64x8(v *Value) bool {
b := v.Block b := v.Block
typ := &b.Func.Config.Types typ := &b.Func.Config.Types
// match: (EqualUint64x8 x y) // match: (EqualUint64x8 x y)
// result: (VPMOVMToVec64x8 (VPCMPUQ512 [0] x y)) // result: (VPMOVMToVec64x8 (VPCMPEQQ512 x y))
for { for {
x := v_0 x := v_0
y := v_1 y := v_1
v.reset(OpAMD64VPMOVMToVec64x8) v.reset(OpAMD64VPMOVMToVec64x8)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPUQ512, typ.Mask) v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQQ512, typ.Mask)
v0.AuxInt = int8ToAuxInt(0)
v0.AddArg2(x, y) v0.AddArg2(x, y)
v.AddArg(v0) v.AddArg(v0)
return true return true
@ -34174,13 +34191,12 @@ func rewriteValueAMD64_OpEqualUint8x64(v *Value) bool {
b := v.Block b := v.Block
typ := &b.Func.Config.Types typ := &b.Func.Config.Types
// match: (EqualUint8x64 x y) // match: (EqualUint8x64 x y)
// result: (VPMOVMToVec8x64 (VPCMPUB512 [0] x y)) // result: (VPMOVMToVec8x64 (VPCMPEQB512 x y))
for { for {
x := v_0 x := v_0
y := v_1 y := v_1
v.reset(OpAMD64VPMOVMToVec8x64) v.reset(OpAMD64VPMOVMToVec8x64)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPUB512, typ.Mask) v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQB512, typ.Mask)
v0.AuxInt = int8ToAuxInt(0)
v0.AddArg2(x, y) v0.AddArg2(x, y)
v.AddArg(v0) v.AddArg(v0)
return true return true
@ -36279,13 +36295,12 @@ func rewriteValueAMD64_OpGreaterInt16x32(v *Value) bool {
b := v.Block b := v.Block
typ := &b.Func.Config.Types typ := &b.Func.Config.Types
// match: (GreaterInt16x32 x y) // match: (GreaterInt16x32 x y)
// result: (VPMOVMToVec16x32 (VPCMPW512 [14] x y)) // result: (VPMOVMToVec16x32 (VPCMPGTW512 x y))
for { for {
x := v_0 x := v_0
y := v_1 y := v_1
v.reset(OpAMD64VPMOVMToVec16x32) v.reset(OpAMD64VPMOVMToVec16x32)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPW512, typ.Mask) v0 := b.NewValue0(v.Pos, OpAMD64VPCMPGTW512, typ.Mask)
v0.AuxInt = int8ToAuxInt(14)
v0.AddArg2(x, y) v0.AddArg2(x, y)
v.AddArg(v0) v.AddArg(v0)
return true return true
@ -36297,13 +36312,12 @@ func rewriteValueAMD64_OpGreaterInt32x16(v *Value) bool {
b := v.Block b := v.Block
typ := &b.Func.Config.Types typ := &b.Func.Config.Types
// match: (GreaterInt32x16 x y) // match: (GreaterInt32x16 x y)
// result: (VPMOVMToVec32x16 (VPCMPD512 [14] x y)) // result: (VPMOVMToVec32x16 (VPCMPGTD512 x y))
for { for {
x := v_0 x := v_0
y := v_1 y := v_1
v.reset(OpAMD64VPMOVMToVec32x16) v.reset(OpAMD64VPMOVMToVec32x16)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPD512, typ.Mask) v0 := b.NewValue0(v.Pos, OpAMD64VPCMPGTD512, typ.Mask)
v0.AuxInt = int8ToAuxInt(14)
v0.AddArg2(x, y) v0.AddArg2(x, y)
v.AddArg(v0) v.AddArg(v0)
return true return true
@ -36315,13 +36329,12 @@ func rewriteValueAMD64_OpGreaterInt64x8(v *Value) bool {
b := v.Block b := v.Block
typ := &b.Func.Config.Types typ := &b.Func.Config.Types
// match: (GreaterInt64x8 x y) // match: (GreaterInt64x8 x y)
// result: (VPMOVMToVec64x8 (VPCMPQ512 [14] x y)) // result: (VPMOVMToVec64x8 (VPCMPGTQ512 x y))
for { for {
x := v_0 x := v_0
y := v_1 y := v_1
v.reset(OpAMD64VPMOVMToVec64x8) v.reset(OpAMD64VPMOVMToVec64x8)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPQ512, typ.Mask) v0 := b.NewValue0(v.Pos, OpAMD64VPCMPGTQ512, typ.Mask)
v0.AuxInt = int8ToAuxInt(14)
v0.AddArg2(x, y) v0.AddArg2(x, y)
v.AddArg(v0) v.AddArg(v0)
return true return true
@ -36333,13 +36346,12 @@ func rewriteValueAMD64_OpGreaterInt8x64(v *Value) bool {
b := v.Block b := v.Block
typ := &b.Func.Config.Types typ := &b.Func.Config.Types
// match: (GreaterInt8x64 x y) // match: (GreaterInt8x64 x y)
// result: (VPMOVMToVec8x64 (VPCMPB512 [14] x y)) // result: (VPMOVMToVec8x64 (VPCMPGTB512 x y))
for { for {
x := v_0 x := v_0
y := v_1 y := v_1
v.reset(OpAMD64VPMOVMToVec8x64) v.reset(OpAMD64VPMOVMToVec8x64)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPB512, typ.Mask) v0 := b.NewValue0(v.Pos, OpAMD64VPCMPGTB512, typ.Mask)
v0.AuxInt = int8ToAuxInt(14)
v0.AddArg2(x, y) v0.AddArg2(x, y)
v.AddArg(v0) v.AddArg(v0)
return true return true
@ -53277,6 +53289,234 @@ func rewriteValueAMD64_OpStore(v *Value) bool {
} }
return false return false
} }
// rewriteValueAMD64_OpStoreMask16x16 lowers StoreMask16x16 to a KMOVQstore of
// the vector-shaped mask converted into a K register.
func rewriteValueAMD64_OpStoreMask16x16(v *Value) bool {
	// match: (StoreMask16x16 {t} ptr val mem)
	// result: (KMOVQstore ptr (VPMOVVec16x16ToM <t> val) mem)
	ptr, val, mem := v.Args[0], v.Args[1], v.Args[2]
	t := auxToType(v.Aux) // read Aux before reset clears it
	blk := v.Block
	v.reset(OpAMD64KMOVQstore)
	mask := blk.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, t)
	mask.AddArg(val)
	v.AddArg3(ptr, mask, mem)
	return true
}
// rewriteValueAMD64_OpStoreMask16x32 lowers StoreMask16x32 to a KMOVQstore of
// the vector-shaped mask converted into a K register.
func rewriteValueAMD64_OpStoreMask16x32(v *Value) bool {
	// match: (StoreMask16x32 {t} ptr val mem)
	// result: (KMOVQstore ptr (VPMOVVec16x32ToM <t> val) mem)
	ptr, val, mem := v.Args[0], v.Args[1], v.Args[2]
	t := auxToType(v.Aux) // read Aux before reset clears it
	blk := v.Block
	v.reset(OpAMD64KMOVQstore)
	mask := blk.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, t)
	mask.AddArg(val)
	v.AddArg3(ptr, mask, mem)
	return true
}
// rewriteValueAMD64_OpStoreMask16x8 lowers StoreMask16x8 to a KMOVQstore of
// the vector-shaped mask converted into a K register.
func rewriteValueAMD64_OpStoreMask16x8(v *Value) bool {
	// match: (StoreMask16x8 {t} ptr val mem)
	// result: (KMOVQstore ptr (VPMOVVec16x8ToM <t> val) mem)
	ptr, val, mem := v.Args[0], v.Args[1], v.Args[2]
	t := auxToType(v.Aux) // read Aux before reset clears it
	blk := v.Block
	v.reset(OpAMD64KMOVQstore)
	mask := blk.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, t)
	mask.AddArg(val)
	v.AddArg3(ptr, mask, mem)
	return true
}
// rewriteValueAMD64_OpStoreMask32x16 lowers StoreMask32x16 to a KMOVQstore of
// the vector-shaped mask converted into a K register.
func rewriteValueAMD64_OpStoreMask32x16(v *Value) bool {
	// match: (StoreMask32x16 {t} ptr val mem)
	// result: (KMOVQstore ptr (VPMOVVec32x16ToM <t> val) mem)
	ptr, val, mem := v.Args[0], v.Args[1], v.Args[2]
	t := auxToType(v.Aux) // read Aux before reset clears it
	blk := v.Block
	v.reset(OpAMD64KMOVQstore)
	mask := blk.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, t)
	mask.AddArg(val)
	v.AddArg3(ptr, mask, mem)
	return true
}
// rewriteValueAMD64_OpStoreMask32x4 lowers StoreMask32x4 to a KMOVQstore of
// the vector-shaped mask converted into a K register.
func rewriteValueAMD64_OpStoreMask32x4(v *Value) bool {
	// match: (StoreMask32x4 {t} ptr val mem)
	// result: (KMOVQstore ptr (VPMOVVec32x4ToM <t> val) mem)
	ptr, val, mem := v.Args[0], v.Args[1], v.Args[2]
	t := auxToType(v.Aux) // read Aux before reset clears it
	blk := v.Block
	v.reset(OpAMD64KMOVQstore)
	mask := blk.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, t)
	mask.AddArg(val)
	v.AddArg3(ptr, mask, mem)
	return true
}
// rewriteValueAMD64_OpStoreMask32x8 lowers StoreMask32x8 to a KMOVQstore of
// the vector-shaped mask converted into a K register.
func rewriteValueAMD64_OpStoreMask32x8(v *Value) bool {
	// match: (StoreMask32x8 {t} ptr val mem)
	// result: (KMOVQstore ptr (VPMOVVec32x8ToM <t> val) mem)
	ptr, val, mem := v.Args[0], v.Args[1], v.Args[2]
	t := auxToType(v.Aux) // read Aux before reset clears it
	blk := v.Block
	v.reset(OpAMD64KMOVQstore)
	mask := blk.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, t)
	mask.AddArg(val)
	v.AddArg3(ptr, mask, mem)
	return true
}
// rewriteValueAMD64_OpStoreMask64x2 lowers StoreMask64x2 to a KMOVQstore of
// the vector-shaped mask converted into a K register.
func rewriteValueAMD64_OpStoreMask64x2(v *Value) bool {
	// match: (StoreMask64x2 {t} ptr val mem)
	// result: (KMOVQstore ptr (VPMOVVec64x2ToM <t> val) mem)
	ptr, val, mem := v.Args[0], v.Args[1], v.Args[2]
	t := auxToType(v.Aux) // read Aux before reset clears it
	blk := v.Block
	v.reset(OpAMD64KMOVQstore)
	mask := blk.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, t)
	mask.AddArg(val)
	v.AddArg3(ptr, mask, mem)
	return true
}
// rewriteValueAMD64_OpStoreMask64x4 lowers StoreMask64x4 to a KMOVQstore of
// the vector-shaped mask converted into a K register.
func rewriteValueAMD64_OpStoreMask64x4(v *Value) bool {
	// match: (StoreMask64x4 {t} ptr val mem)
	// result: (KMOVQstore ptr (VPMOVVec64x4ToM <t> val) mem)
	ptr, val, mem := v.Args[0], v.Args[1], v.Args[2]
	t := auxToType(v.Aux) // read Aux before reset clears it
	blk := v.Block
	v.reset(OpAMD64KMOVQstore)
	mask := blk.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, t)
	mask.AddArg(val)
	v.AddArg3(ptr, mask, mem)
	return true
}
// rewriteValueAMD64_OpStoreMask64x8 lowers StoreMask64x8 to a KMOVQstore of
// the vector-shaped mask converted into a K register.
func rewriteValueAMD64_OpStoreMask64x8(v *Value) bool {
	// match: (StoreMask64x8 {t} ptr val mem)
	// result: (KMOVQstore ptr (VPMOVVec64x8ToM <t> val) mem)
	ptr, val, mem := v.Args[0], v.Args[1], v.Args[2]
	t := auxToType(v.Aux) // read Aux before reset clears it
	blk := v.Block
	v.reset(OpAMD64KMOVQstore)
	mask := blk.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, t)
	mask.AddArg(val)
	v.AddArg3(ptr, mask, mem)
	return true
}
// rewriteValueAMD64_OpStoreMask8x16 lowers StoreMask8x16 to a KMOVQstore of
// the vector-shaped mask converted into a K register.
func rewriteValueAMD64_OpStoreMask8x16(v *Value) bool {
	// match: (StoreMask8x16 {t} ptr val mem)
	// result: (KMOVQstore ptr (VPMOVVec8x16ToM <t> val) mem)
	ptr, val, mem := v.Args[0], v.Args[1], v.Args[2]
	t := auxToType(v.Aux) // read Aux before reset clears it
	blk := v.Block
	v.reset(OpAMD64KMOVQstore)
	mask := blk.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, t)
	mask.AddArg(val)
	v.AddArg3(ptr, mask, mem)
	return true
}
// rewriteValueAMD64_OpStoreMask8x32 lowers StoreMask8x32 to a KMOVQstore of
// the vector-shaped mask converted into a K register.
func rewriteValueAMD64_OpStoreMask8x32(v *Value) bool {
	// match: (StoreMask8x32 {t} ptr val mem)
	// result: (KMOVQstore ptr (VPMOVVec8x32ToM <t> val) mem)
	ptr, val, mem := v.Args[0], v.Args[1], v.Args[2]
	t := auxToType(v.Aux) // read Aux before reset clears it
	blk := v.Block
	v.reset(OpAMD64KMOVQstore)
	mask := blk.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, t)
	mask.AddArg(val)
	v.AddArg3(ptr, mask, mem)
	return true
}
// rewriteValueAMD64_OpStoreMask8x64 lowers StoreMask8x64 to a KMOVQstore of
// the vector-shaped mask converted into a K register.
func rewriteValueAMD64_OpStoreMask8x64(v *Value) bool {
	// match: (StoreMask8x64 {t} ptr val mem)
	// result: (KMOVQstore ptr (VPMOVVec8x64ToM <t> val) mem)
	ptr, val, mem := v.Args[0], v.Args[1], v.Args[2]
	t := auxToType(v.Aux) // read Aux before reset clears it
	blk := v.Block
	v.reset(OpAMD64KMOVQstore)
	mask := blk.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, t)
	mask.AddArg(val)
	v.AddArg3(ptr, mask, mem)
	return true
}
func rewriteValueAMD64_OpSubMaskedFloat32x16(v *Value) bool { func rewriteValueAMD64_OpSubMaskedFloat32x16(v *Value) bool {
v_2 := v.Args[2] v_2 := v.Args[2]
v_1 := v.Args[1] v_1 := v.Args[1]

View file

@ -1791,6 +1791,23 @@ func simdLoadMask(elemBits, lanes int) func(s *state, n *ir.CallExpr, args []*ss
} }
} }
// simdStoreMask returns an intrinsic builder for the simd Mask{elemBits}x{lanes}
// StoreToBits intrinsics: it emits a StoreMask op that writes the mask's bit
// pattern through the pointer argument. In the builder, args[0] is the mask
// value and args[1] is the destination pointer.
//
// The opcode is resolved once, when the intrinsic table is constructed, rather
// than rebuilding the lookup map on every expansion of the intrinsic; an
// unknown shape therefore panics eagerly at table-construction time.
func simdStoreMask(elemBits, lanes int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
	opCodes := map[int]map[int]ssa.Op{
		8:  {16: ssa.OpStoreMask8x16, 32: ssa.OpStoreMask8x32, 64: ssa.OpStoreMask8x64},
		16: {8: ssa.OpStoreMask16x8, 16: ssa.OpStoreMask16x16, 32: ssa.OpStoreMask16x32},
		32: {4: ssa.OpStoreMask32x4, 8: ssa.OpStoreMask32x8, 16: ssa.OpStoreMask32x16},
		64: {2: ssa.OpStoreMask64x2, 4: ssa.OpStoreMask64x4, 8: ssa.OpStoreMask64x8},
	}
	op := opCodes[elemBits][lanes]
	if op == 0 {
		panic(fmt.Sprintf("Unknown mask shape: Mask%dx%d", elemBits, lanes))
	}
	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		// The store threads the memory state: pointer first, then the mask value.
		s.vars[memVar] = s.newValue3A(op, types.TypeMem, types.TypeMask, args[1], args[0], s.mem())
		return nil
	}
}
// findIntrinsic returns a function which builds the SSA equivalent of the // findIntrinsic returns a function which builds the SSA equivalent of the
// function identified by the symbol sym. If sym is not an intrinsic call, returns nil. // function identified by the symbol sym. If sym is not an intrinsic call, returns nil.
func findIntrinsic(sym *types.Sym) intrinsicBuilder { func findIntrinsic(sym *types.Sym) intrinsicBuilder {

View file

@ -310,34 +310,34 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Float64x2.DotProdBroadcast", opLen2(ssa.OpDotProdBroadcastFloat64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float64x2.DotProdBroadcast", opLen2(ssa.OpDotProdBroadcastFloat64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int8x16.Equal", opLen2(ssa.OpEqualInt8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int8x16.Equal", opLen2(ssa.OpEqualInt8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int8x32.Equal", opLen2(ssa.OpEqualInt8x32, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int8x32.Equal", opLen2(ssa.OpEqualInt8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x64.Equal", opLen2(ssa.OpEqualInt8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int16x8.Equal", opLen2(ssa.OpEqualInt16x8, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int16x8.Equal", opLen2(ssa.OpEqualInt16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int16x16.Equal", opLen2(ssa.OpEqualInt16x16, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int16x16.Equal", opLen2(ssa.OpEqualInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x32.Equal", opLen2(ssa.OpEqualInt16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int32x4.Equal", opLen2(ssa.OpEqualInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int32x4.Equal", opLen2(ssa.OpEqualInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int32x8.Equal", opLen2(ssa.OpEqualInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int32x8.Equal", opLen2(ssa.OpEqualInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int32x16.Equal", opLen2(ssa.OpEqualInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int64x2.Equal", opLen2(ssa.OpEqualInt64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int64x2.Equal", opLen2(ssa.OpEqualInt64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int64x4.Equal", opLen2(ssa.OpEqualInt64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int64x4.Equal", opLen2(ssa.OpEqualInt64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int64x8.Equal", opLen2(ssa.OpEqualInt64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint8x16.Equal", opLen2(ssa.OpEqualUint8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint8x16.Equal", opLen2(ssa.OpEqualUint8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint8x32.Equal", opLen2(ssa.OpEqualUint8x32, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint8x32.Equal", opLen2(ssa.OpEqualUint8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint8x64.Equal", opLen2(ssa.OpEqualUint8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint16x8.Equal", opLen2(ssa.OpEqualUint16x8, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint16x8.Equal", opLen2(ssa.OpEqualUint16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint16x16.Equal", opLen2(ssa.OpEqualUint16x16, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint16x16.Equal", opLen2(ssa.OpEqualUint16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint16x32.Equal", opLen2(ssa.OpEqualUint16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint32x4.Equal", opLen2(ssa.OpEqualUint32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint32x4.Equal", opLen2(ssa.OpEqualUint32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint32x8.Equal", opLen2(ssa.OpEqualUint32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint32x8.Equal", opLen2(ssa.OpEqualUint32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint32x16.Equal", opLen2(ssa.OpEqualUint32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint64x2.Equal", opLen2(ssa.OpEqualUint64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x2.Equal", opLen2(ssa.OpEqualUint64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint64x4.Equal", opLen2(ssa.OpEqualUint64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint64x4.Equal", opLen2(ssa.OpEqualUint64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x8.Equal", opLen2(ssa.OpEqualUint64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float32x4.Equal", opLen2(ssa.OpEqualFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x4.Equal", opLen2(ssa.OpEqualFloat32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float32x8.Equal", opLen2(ssa.OpEqualFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x8.Equal", opLen2(ssa.OpEqualFloat32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float32x16.Equal", opLen2(ssa.OpEqualFloat32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x16.Equal", opLen2(ssa.OpEqualFloat32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float64x2.Equal", opLen2(ssa.OpEqualFloat64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float64x2.Equal", opLen2(ssa.OpEqualFloat64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float64x4.Equal", opLen2(ssa.OpEqualFloat64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x4.Equal", opLen2(ssa.OpEqualFloat64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float64x8.Equal", opLen2(ssa.OpEqualFloat64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float64x8.Equal", opLen2(ssa.OpEqualFloat64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int8x64.Equal", opLen2(ssa.OpEqualInt8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int16x32.Equal", opLen2(ssa.OpEqualInt16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int32x16.Equal", opLen2(ssa.OpEqualInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int64x8.Equal", opLen2(ssa.OpEqualInt64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint8x64.Equal", opLen2(ssa.OpEqualUint8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint16x32.Equal", opLen2(ssa.OpEqualUint16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint32x16.Equal", opLen2(ssa.OpEqualUint32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint64x8.Equal", opLen2(ssa.OpEqualUint64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float32x4.EqualMasked", opLen3(ssa.OpEqualMaskedFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x4.EqualMasked", opLen3(ssa.OpEqualMaskedFloat32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float32x8.EqualMasked", opLen3(ssa.OpEqualMaskedFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x8.EqualMasked", opLen3(ssa.OpEqualMaskedFloat32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float32x16.EqualMasked", opLen3(ssa.OpEqualMaskedFloat32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x16.EqualMasked", opLen3(ssa.OpEqualMaskedFloat32x16, types.TypeVec512), sys.AMD64)
@ -458,22 +458,22 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Uint64x2.GetElem", opLen1Imm8(ssa.OpGetElemUint64x2, types.Types[types.TUINT64], 0), sys.AMD64) addF(simdPackage, "Uint64x2.GetElem", opLen1Imm8(ssa.OpGetElemUint64x2, types.Types[types.TUINT64], 0), sys.AMD64)
addF(simdPackage, "Int8x16.Greater", opLen2(ssa.OpGreaterInt8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int8x16.Greater", opLen2(ssa.OpGreaterInt8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int8x32.Greater", opLen2(ssa.OpGreaterInt8x32, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int8x32.Greater", opLen2(ssa.OpGreaterInt8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x64.Greater", opLen2(ssa.OpGreaterInt8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int16x8.Greater", opLen2(ssa.OpGreaterInt16x8, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int16x8.Greater", opLen2(ssa.OpGreaterInt16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int16x16.Greater", opLen2(ssa.OpGreaterInt16x16, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int16x16.Greater", opLen2(ssa.OpGreaterInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x32.Greater", opLen2(ssa.OpGreaterInt16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int32x4.Greater", opLen2(ssa.OpGreaterInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int32x4.Greater", opLen2(ssa.OpGreaterInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int32x8.Greater", opLen2(ssa.OpGreaterInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int32x8.Greater", opLen2(ssa.OpGreaterInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int32x16.Greater", opLen2(ssa.OpGreaterInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int64x2.Greater", opLen2(ssa.OpGreaterInt64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int64x2.Greater", opLen2(ssa.OpGreaterInt64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int64x4.Greater", opLen2(ssa.OpGreaterInt64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int64x4.Greater", opLen2(ssa.OpGreaterInt64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int64x8.Greater", opLen2(ssa.OpGreaterInt64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float32x4.Greater", opLen2(ssa.OpGreaterFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x4.Greater", opLen2(ssa.OpGreaterFloat32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float32x8.Greater", opLen2(ssa.OpGreaterFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x8.Greater", opLen2(ssa.OpGreaterFloat32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float32x16.Greater", opLen2(ssa.OpGreaterFloat32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x16.Greater", opLen2(ssa.OpGreaterFloat32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float64x2.Greater", opLen2(ssa.OpGreaterFloat64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float64x2.Greater", opLen2(ssa.OpGreaterFloat64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float64x4.Greater", opLen2(ssa.OpGreaterFloat64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x4.Greater", opLen2(ssa.OpGreaterFloat64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float64x8.Greater", opLen2(ssa.OpGreaterFloat64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float64x8.Greater", opLen2(ssa.OpGreaterFloat64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int8x64.Greater", opLen2(ssa.OpGreaterInt8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int16x32.Greater", opLen2(ssa.OpGreaterInt16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int32x16.Greater", opLen2(ssa.OpGreaterInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int64x8.Greater", opLen2(ssa.OpGreaterInt64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint8x16.Greater", opLen2(ssa.OpGreaterUint8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint8x16.Greater", opLen2(ssa.OpGreaterUint8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint8x32.Greater", opLen2(ssa.OpGreaterUint8x32, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint8x32.Greater", opLen2(ssa.OpGreaterUint8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint8x64.Greater", opLen2(ssa.OpGreaterUint8x64, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint8x64.Greater", opLen2(ssa.OpGreaterUint8x64, types.TypeVec512), sys.AMD64)
@ -2137,59 +2137,71 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Mask8x16.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Mask8x16.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Mask8x16.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Mask8x16.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "LoadMask8x16FromBits", simdLoadMask(8, 16), sys.AMD64) addF(simdPackage, "LoadMask8x16FromBits", simdLoadMask(8, 16), sys.AMD64)
addF(simdPackage, "Mask8x16.StoreToBits", simdStoreMask(8, 16), sys.AMD64)
addF(simdPackage, "Mask8x32.AsInt8x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask8x32.AsInt8x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int8x32.AsMask8x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int8x32.AsMask8x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask8x32.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Mask8x32.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Mask8x32.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Mask8x32.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "LoadMask8x32FromBits", simdLoadMask(8, 32), sys.AMD64) addF(simdPackage, "LoadMask8x32FromBits", simdLoadMask(8, 32), sys.AMD64)
addF(simdPackage, "Mask8x32.StoreToBits", simdStoreMask(8, 32), sys.AMD64)
addF(simdPackage, "Mask8x64.AsInt8x64", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask8x64.AsInt8x64", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int8x64.AsMask8x64", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int8x64.AsMask8x64", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask8x64.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Mask8x64.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Mask8x64.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Mask8x64.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "LoadMask8x64FromBits", simdLoadMask(8, 64), sys.AMD64) addF(simdPackage, "LoadMask8x64FromBits", simdLoadMask(8, 64), sys.AMD64)
addF(simdPackage, "Mask8x64.StoreToBits", simdStoreMask(8, 64), sys.AMD64)
addF(simdPackage, "Mask16x8.AsInt16x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask16x8.AsInt16x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int16x8.AsMask16x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int16x8.AsMask16x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask16x8.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Mask16x8.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Mask16x8.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Mask16x8.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "LoadMask16x8FromBits", simdLoadMask(16, 8), sys.AMD64) addF(simdPackage, "LoadMask16x8FromBits", simdLoadMask(16, 8), sys.AMD64)
addF(simdPackage, "Mask16x8.StoreToBits", simdStoreMask(16, 8), sys.AMD64)
addF(simdPackage, "Mask16x16.AsInt16x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask16x16.AsInt16x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int16x16.AsMask16x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int16x16.AsMask16x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask16x16.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Mask16x16.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Mask16x16.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Mask16x16.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "LoadMask16x16FromBits", simdLoadMask(16, 16), sys.AMD64) addF(simdPackage, "LoadMask16x16FromBits", simdLoadMask(16, 16), sys.AMD64)
addF(simdPackage, "Mask16x16.StoreToBits", simdStoreMask(16, 16), sys.AMD64)
addF(simdPackage, "Mask16x32.AsInt16x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask16x32.AsInt16x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int16x32.AsMask16x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int16x32.AsMask16x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask16x32.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Mask16x32.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Mask16x32.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Mask16x32.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "LoadMask16x32FromBits", simdLoadMask(16, 32), sys.AMD64) addF(simdPackage, "LoadMask16x32FromBits", simdLoadMask(16, 32), sys.AMD64)
addF(simdPackage, "Mask16x32.StoreToBits", simdStoreMask(16, 32), sys.AMD64)
addF(simdPackage, "Mask32x4.AsInt32x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask32x4.AsInt32x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int32x4.AsMask32x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int32x4.AsMask32x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask32x4.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Mask32x4.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Mask32x4.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Mask32x4.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "LoadMask32x4FromBits", simdLoadMask(32, 4), sys.AMD64) addF(simdPackage, "LoadMask32x4FromBits", simdLoadMask(32, 4), sys.AMD64)
addF(simdPackage, "Mask32x4.StoreToBits", simdStoreMask(32, 4), sys.AMD64)
addF(simdPackage, "Mask32x8.AsInt32x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask32x8.AsInt32x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int32x8.AsMask32x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int32x8.AsMask32x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask32x8.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Mask32x8.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Mask32x8.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Mask32x8.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "LoadMask32x8FromBits", simdLoadMask(32, 8), sys.AMD64) addF(simdPackage, "LoadMask32x8FromBits", simdLoadMask(32, 8), sys.AMD64)
addF(simdPackage, "Mask32x8.StoreToBits", simdStoreMask(32, 8), sys.AMD64)
addF(simdPackage, "Mask32x16.AsInt32x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask32x16.AsInt32x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int32x16.AsMask32x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int32x16.AsMask32x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask32x16.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Mask32x16.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Mask32x16.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Mask32x16.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "LoadMask32x16FromBits", simdLoadMask(32, 16), sys.AMD64) addF(simdPackage, "LoadMask32x16FromBits", simdLoadMask(32, 16), sys.AMD64)
addF(simdPackage, "Mask32x16.StoreToBits", simdStoreMask(32, 16), sys.AMD64)
addF(simdPackage, "Mask64x2.AsInt64x2", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask64x2.AsInt64x2", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int64x2.AsMask64x2", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int64x2.AsMask64x2", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask64x2.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Mask64x2.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Mask64x2.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Mask64x2.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "LoadMask64x2FromBits", simdLoadMask(64, 2), sys.AMD64) addF(simdPackage, "LoadMask64x2FromBits", simdLoadMask(64, 2), sys.AMD64)
addF(simdPackage, "Mask64x2.StoreToBits", simdStoreMask(64, 2), sys.AMD64)
addF(simdPackage, "Mask64x4.AsInt64x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask64x4.AsInt64x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int64x4.AsMask64x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int64x4.AsMask64x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask64x4.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Mask64x4.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Mask64x4.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Mask64x4.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "LoadMask64x4FromBits", simdLoadMask(64, 4), sys.AMD64) addF(simdPackage, "LoadMask64x4FromBits", simdLoadMask(64, 4), sys.AMD64)
addF(simdPackage, "Mask64x4.StoreToBits", simdStoreMask(64, 4), sys.AMD64)
addF(simdPackage, "Mask64x8.AsInt64x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask64x8.AsInt64x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int64x8.AsMask64x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int64x8.AsMask64x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask64x8.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Mask64x8.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Mask64x8.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Mask64x8.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "LoadMask64x8FromBits", simdLoadMask(64, 8), sys.AMD64) addF(simdPackage, "LoadMask64x8FromBits", simdLoadMask(64, 8), sys.AMD64)
addF(simdPackage, "Mask64x8.StoreToBits", simdStoreMask(64, 8), sys.AMD64)
} }

View file

@ -918,12 +918,12 @@ func (x Uint64x8) AndNotMasked(y Uint64x8, mask Mask64x8) Uint64x8
// ApproximateReciprocal computes an approximate reciprocal of each element. // ApproximateReciprocal computes an approximate reciprocal of each element.
// //
// Asm: VRCP14PS, CPU Feature: AVX512F // Asm: VRCPPS, CPU Feature: AVX
func (x Float32x4) ApproximateReciprocal() Float32x4 func (x Float32x4) ApproximateReciprocal() Float32x4
// ApproximateReciprocal computes an approximate reciprocal of each element. // ApproximateReciprocal computes an approximate reciprocal of each element.
// //
// Asm: VRCP14PS, CPU Feature: AVX512F // Asm: VRCPPS, CPU Feature: AVX
func (x Float32x8) ApproximateReciprocal() Float32x8 func (x Float32x8) ApproximateReciprocal() Float32x8
// ApproximateReciprocal computes an approximate reciprocal of each element. // ApproximateReciprocal computes an approximate reciprocal of each element.
@ -1951,6 +1951,11 @@ func (x Int8x16) Equal(y Int8x16) Mask8x16
// Asm: VPCMPEQB, CPU Feature: AVX2 // Asm: VPCMPEQB, CPU Feature: AVX2
func (x Int8x32) Equal(y Int8x32) Mask8x32 func (x Int8x32) Equal(y Int8x32) Mask8x32
// Equal compares for equality.
//
// Asm: VPCMPEQB, CPU Feature: AVX512BW
func (x Int8x64) Equal(y Int8x64) Mask8x64
// Equal compares for equality. // Equal compares for equality.
// //
// Asm: VPCMPEQW, CPU Feature: AVX // Asm: VPCMPEQW, CPU Feature: AVX
@ -1961,6 +1966,11 @@ func (x Int16x8) Equal(y Int16x8) Mask16x8
// Asm: VPCMPEQW, CPU Feature: AVX2 // Asm: VPCMPEQW, CPU Feature: AVX2
func (x Int16x16) Equal(y Int16x16) Mask16x16 func (x Int16x16) Equal(y Int16x16) Mask16x16
// Equal compares for equality.
//
// Asm: VPCMPEQW, CPU Feature: AVX512BW
func (x Int16x32) Equal(y Int16x32) Mask16x32
// Equal compares for equality. // Equal compares for equality.
// //
// Asm: VPCMPEQD, CPU Feature: AVX // Asm: VPCMPEQD, CPU Feature: AVX
@ -1971,6 +1981,11 @@ func (x Int32x4) Equal(y Int32x4) Mask32x4
// Asm: VPCMPEQD, CPU Feature: AVX2 // Asm: VPCMPEQD, CPU Feature: AVX2
func (x Int32x8) Equal(y Int32x8) Mask32x8 func (x Int32x8) Equal(y Int32x8) Mask32x8
// Equal compares for equality.
//
// Asm: VPCMPEQD, CPU Feature: AVX512F
func (x Int32x16) Equal(y Int32x16) Mask32x16
// Equal compares for equality. // Equal compares for equality.
// //
// Asm: VPCMPEQQ, CPU Feature: AVX // Asm: VPCMPEQQ, CPU Feature: AVX
@ -1981,6 +1996,11 @@ func (x Int64x2) Equal(y Int64x2) Mask64x2
// Asm: VPCMPEQQ, CPU Feature: AVX2 // Asm: VPCMPEQQ, CPU Feature: AVX2
func (x Int64x4) Equal(y Int64x4) Mask64x4 func (x Int64x4) Equal(y Int64x4) Mask64x4
// Equal compares for equality.
//
// Asm: VPCMPEQQ, CPU Feature: AVX512F
func (x Int64x8) Equal(y Int64x8) Mask64x8
// Equal compares for equality. // Equal compares for equality.
// //
// Asm: VPCMPEQB, CPU Feature: AVX // Asm: VPCMPEQB, CPU Feature: AVX
@ -1991,6 +2011,11 @@ func (x Uint8x16) Equal(y Uint8x16) Mask8x16
// Asm: VPCMPEQB, CPU Feature: AVX2 // Asm: VPCMPEQB, CPU Feature: AVX2
func (x Uint8x32) Equal(y Uint8x32) Mask8x32 func (x Uint8x32) Equal(y Uint8x32) Mask8x32
// Equal compares for equality.
//
// Asm: VPCMPEQB, CPU Feature: AVX512BW
func (x Uint8x64) Equal(y Uint8x64) Mask8x64
// Equal compares for equality. // Equal compares for equality.
// //
// Asm: VPCMPEQW, CPU Feature: AVX // Asm: VPCMPEQW, CPU Feature: AVX
@ -2001,6 +2026,11 @@ func (x Uint16x8) Equal(y Uint16x8) Mask16x8
// Asm: VPCMPEQW, CPU Feature: AVX2 // Asm: VPCMPEQW, CPU Feature: AVX2
func (x Uint16x16) Equal(y Uint16x16) Mask16x16 func (x Uint16x16) Equal(y Uint16x16) Mask16x16
// Equal compares for equality.
//
// Asm: VPCMPEQW, CPU Feature: AVX512BW
func (x Uint16x32) Equal(y Uint16x32) Mask16x32
// Equal compares for equality. // Equal compares for equality.
// //
// Asm: VPCMPEQD, CPU Feature: AVX // Asm: VPCMPEQD, CPU Feature: AVX
@ -2011,6 +2041,11 @@ func (x Uint32x4) Equal(y Uint32x4) Mask32x4
// Asm: VPCMPEQD, CPU Feature: AVX2 // Asm: VPCMPEQD, CPU Feature: AVX2
func (x Uint32x8) Equal(y Uint32x8) Mask32x8 func (x Uint32x8) Equal(y Uint32x8) Mask32x8
// Equal compares for equality.
//
// Asm: VPCMPEQD, CPU Feature: AVX512F
func (x Uint32x16) Equal(y Uint32x16) Mask32x16
// Equal compares for equality. // Equal compares for equality.
// //
// Asm: VPCMPEQQ, CPU Feature: AVX // Asm: VPCMPEQQ, CPU Feature: AVX
@ -2021,6 +2056,11 @@ func (x Uint64x2) Equal(y Uint64x2) Mask64x2
// Asm: VPCMPEQQ, CPU Feature: AVX2 // Asm: VPCMPEQQ, CPU Feature: AVX2
func (x Uint64x4) Equal(y Uint64x4) Mask64x4 func (x Uint64x4) Equal(y Uint64x4) Mask64x4
// Equal compares for equality.
//
// Asm: VPCMPEQQ, CPU Feature: AVX512F
func (x Uint64x8) Equal(y Uint64x8) Mask64x8
// Equal compares for equality. // Equal compares for equality.
// //
// Asm: VCMPPS, CPU Feature: AVX // Asm: VCMPPS, CPU Feature: AVX
@ -2051,46 +2091,6 @@ func (x Float64x4) Equal(y Float64x4) Mask64x4
// Asm: VCMPPD, CPU Feature: AVX512F // Asm: VCMPPD, CPU Feature: AVX512F
func (x Float64x8) Equal(y Float64x8) Mask64x8 func (x Float64x8) Equal(y Float64x8) Mask64x8
// Equal compares for equality.
//
// Asm: VPCMPB, CPU Feature: AVX512BW
func (x Int8x64) Equal(y Int8x64) Mask8x64
// Equal compares for equality.
//
// Asm: VPCMPW, CPU Feature: AVX512BW
func (x Int16x32) Equal(y Int16x32) Mask16x32
// Equal compares for equality.
//
// Asm: VPCMPD, CPU Feature: AVX512F
func (x Int32x16) Equal(y Int32x16) Mask32x16
// Equal compares for equality.
//
// Asm: VPCMPQ, CPU Feature: AVX512F
func (x Int64x8) Equal(y Int64x8) Mask64x8
// Equal compares for equality.
//
// Asm: VPCMPUB, CPU Feature: AVX512BW
func (x Uint8x64) Equal(y Uint8x64) Mask8x64
// Equal compares for equality.
//
// Asm: VPCMPUW, CPU Feature: AVX512BW
func (x Uint16x32) Equal(y Uint16x32) Mask16x32
// Equal compares for equality.
//
// Asm: VPCMPUD, CPU Feature: AVX512F
func (x Uint32x16) Equal(y Uint32x16) Mask32x16
// Equal compares for equality.
//
// Asm: VPCMPUQ, CPU Feature: AVX512F
func (x Uint64x8) Equal(y Uint64x8) Mask64x8
/* EqualMasked */ /* EqualMasked */
// EqualMasked compares for equality. // EqualMasked compares for equality.
@ -2733,7 +2733,7 @@ func (x Uint8x64) GaloisFieldAffineTransformInverse(y Uint64x8, b uint8) Uint8x6
// b is expected to be a constant, non-constant value will trigger a runtime panic. // b is expected to be a constant, non-constant value will trigger a runtime panic.
// //
// Asm: VGF2P8AFFINEINVQB, CPU Feature: AVX512GFNI // Asm: VGF2P8AFFINEINVQB, CPU Feature: AVX512GFNI
func (x Uint8x16) GaloisFieldAffineTransformInverseMasked(y Uint64x2, b uint8, m Mask8x16) Uint8x16 func (x Uint8x16) GaloisFieldAffineTransformInverseMasked(y Uint64x2, b uint8, mask Mask8x16) Uint8x16
// GaloisFieldAffineTransformInverseMasked computes an affine transformation in GF(2^8), // GaloisFieldAffineTransformInverseMasked computes an affine transformation in GF(2^8),
// with x inverted with respect to reduction polynomial x^8 + x^4 + x^3 + x + 1: // with x inverted with respect to reduction polynomial x^8 + x^4 + x^3 + x + 1:
@ -2746,7 +2746,7 @@ func (x Uint8x16) GaloisFieldAffineTransformInverseMasked(y Uint64x2, b uint8, m
// b is expected to be a constant, non-constant value will trigger a runtime panic. // b is expected to be a constant, non-constant value will trigger a runtime panic.
// //
// Asm: VGF2P8AFFINEINVQB, CPU Feature: AVX512GFNI // Asm: VGF2P8AFFINEINVQB, CPU Feature: AVX512GFNI
func (x Uint8x32) GaloisFieldAffineTransformInverseMasked(y Uint64x4, b uint8, m Mask8x32) Uint8x32 func (x Uint8x32) GaloisFieldAffineTransformInverseMasked(y Uint64x4, b uint8, mask Mask8x32) Uint8x32
// GaloisFieldAffineTransformInverseMasked computes an affine transformation in GF(2^8), // GaloisFieldAffineTransformInverseMasked computes an affine transformation in GF(2^8),
// with x inverted with respect to reduction polynomial x^8 + x^4 + x^3 + x + 1: // with x inverted with respect to reduction polynomial x^8 + x^4 + x^3 + x + 1:
@ -2759,7 +2759,7 @@ func (x Uint8x32) GaloisFieldAffineTransformInverseMasked(y Uint64x4, b uint8, m
// b is expected to be a constant, non-constant value will trigger a runtime panic. // b is expected to be a constant, non-constant value will trigger a runtime panic.
// //
// Asm: VGF2P8AFFINEINVQB, CPU Feature: AVX512GFNI // Asm: VGF2P8AFFINEINVQB, CPU Feature: AVX512GFNI
func (x Uint8x64) GaloisFieldAffineTransformInverseMasked(y Uint64x8, b uint8, m Mask8x64) Uint8x64 func (x Uint8x64) GaloisFieldAffineTransformInverseMasked(y Uint64x8, b uint8, mask Mask8x64) Uint8x64
/* GaloisFieldAffineTransformMasked */ /* GaloisFieldAffineTransformMasked */
@ -2773,7 +2773,7 @@ func (x Uint8x64) GaloisFieldAffineTransformInverseMasked(y Uint64x8, b uint8, m
// b is expected to be a constant, non-constant value will trigger a runtime panic. // b is expected to be a constant, non-constant value will trigger a runtime panic.
// //
// Asm: VGF2P8AFFINEQB, CPU Feature: AVX512GFNI // Asm: VGF2P8AFFINEQB, CPU Feature: AVX512GFNI
func (x Uint8x16) GaloisFieldAffineTransformMasked(y Uint64x2, b uint8, m Mask8x16) Uint8x16 func (x Uint8x16) GaloisFieldAffineTransformMasked(y Uint64x2, b uint8, mask Mask8x16) Uint8x16
// GaloisFieldAffineTransformMasked computes an affine transformation in GF(2^8): // GaloisFieldAffineTransformMasked computes an affine transformation in GF(2^8):
// x is a vector of 8-bit vectors, with each adjacent 8 as a group; y is a vector of 8x8 1-bit matrixes; // x is a vector of 8-bit vectors, with each adjacent 8 as a group; y is a vector of 8x8 1-bit matrixes;
@ -2785,7 +2785,7 @@ func (x Uint8x16) GaloisFieldAffineTransformMasked(y Uint64x2, b uint8, m Mask8x
// b is expected to be a constant, non-constant value will trigger a runtime panic. // b is expected to be a constant, non-constant value will trigger a runtime panic.
// //
// Asm: VGF2P8AFFINEQB, CPU Feature: AVX512GFNI // Asm: VGF2P8AFFINEQB, CPU Feature: AVX512GFNI
func (x Uint8x32) GaloisFieldAffineTransformMasked(y Uint64x4, b uint8, m Mask8x32) Uint8x32 func (x Uint8x32) GaloisFieldAffineTransformMasked(y Uint64x4, b uint8, mask Mask8x32) Uint8x32
// GaloisFieldAffineTransformMasked computes an affine transformation in GF(2^8): // GaloisFieldAffineTransformMasked computes an affine transformation in GF(2^8):
// x is a vector of 8-bit vectors, with each adjacent 8 as a group; y is a vector of 8x8 1-bit matrixes; // x is a vector of 8-bit vectors, with each adjacent 8 as a group; y is a vector of 8x8 1-bit matrixes;
@ -2797,7 +2797,7 @@ func (x Uint8x32) GaloisFieldAffineTransformMasked(y Uint64x4, b uint8, m Mask8x
// b is expected to be a constant, non-constant value will trigger a runtime panic. // b is expected to be a constant, non-constant value will trigger a runtime panic.
// //
// Asm: VGF2P8AFFINEQB, CPU Feature: AVX512GFNI // Asm: VGF2P8AFFINEQB, CPU Feature: AVX512GFNI
func (x Uint8x64) GaloisFieldAffineTransformMasked(y Uint64x8, b uint8, m Mask8x64) Uint8x64 func (x Uint8x64) GaloisFieldAffineTransformMasked(y Uint64x8, b uint8, mask Mask8x64) Uint8x64
/* GaloisFieldMul */ /* GaloisFieldMul */
@ -2987,6 +2987,11 @@ func (x Int8x16) Greater(y Int8x16) Mask8x16
// Asm: VPCMPGTB, CPU Feature: AVX2 // Asm: VPCMPGTB, CPU Feature: AVX2
func (x Int8x32) Greater(y Int8x32) Mask8x32 func (x Int8x32) Greater(y Int8x32) Mask8x32
// Greater compares for greater than.
//
// Asm: VPCMPGTB, CPU Feature: AVX512BW
func (x Int8x64) Greater(y Int8x64) Mask8x64
// Greater compares for greater than. // Greater compares for greater than.
// //
// Asm: VPCMPGTW, CPU Feature: AVX // Asm: VPCMPGTW, CPU Feature: AVX
@ -2997,6 +3002,11 @@ func (x Int16x8) Greater(y Int16x8) Mask16x8
// Asm: VPCMPGTW, CPU Feature: AVX2 // Asm: VPCMPGTW, CPU Feature: AVX2
func (x Int16x16) Greater(y Int16x16) Mask16x16 func (x Int16x16) Greater(y Int16x16) Mask16x16
// Greater compares for greater than.
//
// Asm: VPCMPGTW, CPU Feature: AVX512BW
func (x Int16x32) Greater(y Int16x32) Mask16x32
// Greater compares for greater than. // Greater compares for greater than.
// //
// Asm: VPCMPGTD, CPU Feature: AVX // Asm: VPCMPGTD, CPU Feature: AVX
@ -3007,6 +3017,11 @@ func (x Int32x4) Greater(y Int32x4) Mask32x4
// Asm: VPCMPGTD, CPU Feature: AVX2 // Asm: VPCMPGTD, CPU Feature: AVX2
func (x Int32x8) Greater(y Int32x8) Mask32x8 func (x Int32x8) Greater(y Int32x8) Mask32x8
// Greater compares for greater than.
//
// Asm: VPCMPGTD, CPU Feature: AVX512F
func (x Int32x16) Greater(y Int32x16) Mask32x16
// Greater compares for greater than. // Greater compares for greater than.
// //
// Asm: VPCMPGTQ, CPU Feature: AVX // Asm: VPCMPGTQ, CPU Feature: AVX
@ -3017,6 +3032,11 @@ func (x Int64x2) Greater(y Int64x2) Mask64x2
// Asm: VPCMPGTQ, CPU Feature: AVX2 // Asm: VPCMPGTQ, CPU Feature: AVX2
func (x Int64x4) Greater(y Int64x4) Mask64x4 func (x Int64x4) Greater(y Int64x4) Mask64x4
// Greater compares for greater than.
//
// Asm: VPCMPGTQ, CPU Feature: AVX512F
func (x Int64x8) Greater(y Int64x8) Mask64x8
// Greater compares for greater than. // Greater compares for greater than.
// //
// Asm: VCMPPS, CPU Feature: AVX // Asm: VCMPPS, CPU Feature: AVX
@ -3047,26 +3067,6 @@ func (x Float64x4) Greater(y Float64x4) Mask64x4
// Asm: VCMPPD, CPU Feature: AVX512F // Asm: VCMPPD, CPU Feature: AVX512F
func (x Float64x8) Greater(y Float64x8) Mask64x8 func (x Float64x8) Greater(y Float64x8) Mask64x8
// Greater compares for greater than.
//
// Asm: VPCMPB, CPU Feature: AVX512BW
func (x Int8x64) Greater(y Int8x64) Mask8x64
// Greater compares for greater than.
//
// Asm: VPCMPW, CPU Feature: AVX512BW
func (x Int16x32) Greater(y Int16x32) Mask16x32
// Greater compares for greater than.
//
// Asm: VPCMPD, CPU Feature: AVX512F
func (x Int32x16) Greater(y Int32x16) Mask32x16
// Greater compares for greater than.
//
// Asm: VPCMPQ, CPU Feature: AVX512F
func (x Int64x8) Greater(y Int64x8) Mask64x8
// Greater compares for greater than. // Greater compares for greater than.
// //
// Asm: VPCMPUB, CPU Feature: AVX512BW // Asm: VPCMPUB, CPU Feature: AVX512BW
@ -6475,84 +6475,84 @@ func (x Uint32x8) PairwiseSub(y Uint32x8) Uint32x8
/* Permute */ /* Permute */
// Permute performs a full permutation of vector y using indices: // Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
// Asm: VPERMB, CPU Feature: AVX512VBMI // Asm: VPERMB, CPU Feature: AVX512VBMI
func (x Int8x16) Permute(indices Uint8x16) Int8x16 func (x Int8x16) Permute(indices Uint8x16) Int8x16
// Permute performs a full permutation of vector y using indices: // Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
// Asm: VPERMB, CPU Feature: AVX512VBMI // Asm: VPERMB, CPU Feature: AVX512VBMI
func (x Uint8x16) Permute(indices Uint8x16) Uint8x16 func (x Uint8x16) Permute(indices Uint8x16) Uint8x16
// Permute performs a full permutation of vector y using indices: // Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
// Asm: VPERMB, CPU Feature: AVX512VBMI // Asm: VPERMB, CPU Feature: AVX512VBMI
func (x Int8x32) Permute(indices Uint8x32) Int8x32 func (x Int8x32) Permute(indices Uint8x32) Int8x32
// Permute performs a full permutation of vector y using indices: // Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
// Asm: VPERMB, CPU Feature: AVX512VBMI // Asm: VPERMB, CPU Feature: AVX512VBMI
func (x Uint8x32) Permute(indices Uint8x32) Uint8x32 func (x Uint8x32) Permute(indices Uint8x32) Uint8x32
// Permute performs a full permutation of vector y using indices: // Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
// Asm: VPERMB, CPU Feature: AVX512VBMI // Asm: VPERMB, CPU Feature: AVX512VBMI
func (x Int8x64) Permute(indices Uint8x64) Int8x64 func (x Int8x64) Permute(indices Uint8x64) Int8x64
// Permute performs a full permutation of vector y using indices: // Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
// Asm: VPERMB, CPU Feature: AVX512VBMI // Asm: VPERMB, CPU Feature: AVX512VBMI
func (x Uint8x64) Permute(indices Uint8x64) Uint8x64 func (x Uint8x64) Permute(indices Uint8x64) Uint8x64
// Permute performs a full permutation of vector y using indices: // Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
// Asm: VPERMW, CPU Feature: AVX512BW // Asm: VPERMW, CPU Feature: AVX512BW
func (x Int16x8) Permute(indices Uint16x8) Int16x8 func (x Int16x8) Permute(indices Uint16x8) Int16x8
// Permute performs a full permutation of vector y using indices: // Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
// Asm: VPERMW, CPU Feature: AVX512BW // Asm: VPERMW, CPU Feature: AVX512BW
func (x Uint16x8) Permute(indices Uint16x8) Uint16x8 func (x Uint16x8) Permute(indices Uint16x8) Uint16x8
// Permute performs a full permutation of vector y using indices: // Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
// Asm: VPERMW, CPU Feature: AVX512BW // Asm: VPERMW, CPU Feature: AVX512BW
func (x Int16x16) Permute(indices Uint16x16) Int16x16 func (x Int16x16) Permute(indices Uint16x16) Int16x16
// Permute performs a full permutation of vector y using indices: // Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
// Asm: VPERMW, CPU Feature: AVX512BW // Asm: VPERMW, CPU Feature: AVX512BW
func (x Uint16x16) Permute(indices Uint16x16) Uint16x16 func (x Uint16x16) Permute(indices Uint16x16) Uint16x16
// Permute performs a full permutation of vector y using indices: // Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
// Asm: VPERMW, CPU Feature: AVX512BW // Asm: VPERMW, CPU Feature: AVX512BW
func (x Int16x32) Permute(indices Uint16x32) Int16x32 func (x Int16x32) Permute(indices Uint16x32) Int16x32
// Permute performs a full permutation of vector y using indices: // Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -6580,63 +6580,63 @@ func (x Int32x8) Permute(indices Uint32x8) Int32x8
// Asm: VPERMD, CPU Feature: AVX2 // Asm: VPERMD, CPU Feature: AVX2
func (x Uint32x8) Permute(indices Uint32x8) Uint32x8 func (x Uint32x8) Permute(indices Uint32x8) Uint32x8
// Permute performs a full permutation of vector y using indices: // Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
// Asm: VPERMPS, CPU Feature: AVX512F // Asm: VPERMPS, CPU Feature: AVX512F
func (x Float32x16) Permute(indices Uint32x16) Float32x16 func (x Float32x16) Permute(indices Uint32x16) Float32x16
// Permute performs a full permutation of vector y using indices: // Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
// Asm: VPERMD, CPU Feature: AVX512F // Asm: VPERMD, CPU Feature: AVX512F
func (x Int32x16) Permute(indices Uint32x16) Int32x16 func (x Int32x16) Permute(indices Uint32x16) Int32x16
// Permute performs a full permutation of vector y using indices: // Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
// Asm: VPERMD, CPU Feature: AVX512F // Asm: VPERMD, CPU Feature: AVX512F
func (x Uint32x16) Permute(indices Uint32x16) Uint32x16 func (x Uint32x16) Permute(indices Uint32x16) Uint32x16
// Permute performs a full permutation of vector y using indices: // Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
// Asm: VPERMPD, CPU Feature: AVX512F // Asm: VPERMPD, CPU Feature: AVX512F
func (x Float64x4) Permute(indices Uint64x4) Float64x4 func (x Float64x4) Permute(indices Uint64x4) Float64x4
// Permute performs a full permutation of vector y using indices: // Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
// Asm: VPERMQ, CPU Feature: AVX512F // Asm: VPERMQ, CPU Feature: AVX512F
func (x Int64x4) Permute(indices Uint64x4) Int64x4 func (x Int64x4) Permute(indices Uint64x4) Int64x4
// Permute performs a full permutation of vector y using indices: // Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
// Asm: VPERMQ, CPU Feature: AVX512F // Asm: VPERMQ, CPU Feature: AVX512F
func (x Uint64x4) Permute(indices Uint64x4) Uint64x4 func (x Uint64x4) Permute(indices Uint64x4) Uint64x4
// Permute performs a full permutation of vector y using indices: // Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
// Asm: VPERMPD, CPU Feature: AVX512F // Asm: VPERMPD, CPU Feature: AVX512F
func (x Float64x8) Permute(indices Uint64x8) Float64x8 func (x Float64x8) Permute(indices Uint64x8) Float64x8
// Permute performs a full permutation of vector y using indices: // Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
// Asm: VPERMQ, CPU Feature: AVX512F // Asm: VPERMQ, CPU Feature: AVX512F
func (x Int64x8) Permute(indices Uint64x8) Int64x8 func (x Int64x8) Permute(indices Uint64x8) Int64x8
// Permute performs a full permutation of vector y using indices: // Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7189,7 +7189,7 @@ func (x Uint64x8) Permute2Masked(y Uint64x8, indices Uint64x8, mask Mask64x8) Ui
/* PermuteMasked */ /* PermuteMasked */
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7198,7 +7198,7 @@ func (x Uint64x8) Permute2Masked(y Uint64x8, indices Uint64x8, mask Mask64x8) Ui
// Asm: VPERMB, CPU Feature: AVX512VBMI // Asm: VPERMB, CPU Feature: AVX512VBMI
func (x Int8x16) PermuteMasked(indices Uint8x16, mask Mask8x16) Int8x16 func (x Int8x16) PermuteMasked(indices Uint8x16, mask Mask8x16) Int8x16
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7207,7 +7207,7 @@ func (x Int8x16) PermuteMasked(indices Uint8x16, mask Mask8x16) Int8x16
// Asm: VPERMB, CPU Feature: AVX512VBMI // Asm: VPERMB, CPU Feature: AVX512VBMI
func (x Uint8x16) PermuteMasked(indices Uint8x16, mask Mask8x16) Uint8x16 func (x Uint8x16) PermuteMasked(indices Uint8x16, mask Mask8x16) Uint8x16
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7216,7 +7216,7 @@ func (x Uint8x16) PermuteMasked(indices Uint8x16, mask Mask8x16) Uint8x16
// Asm: VPERMB, CPU Feature: AVX512VBMI // Asm: VPERMB, CPU Feature: AVX512VBMI
func (x Int8x32) PermuteMasked(indices Uint8x32, mask Mask8x32) Int8x32 func (x Int8x32) PermuteMasked(indices Uint8x32, mask Mask8x32) Int8x32
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7225,7 +7225,7 @@ func (x Int8x32) PermuteMasked(indices Uint8x32, mask Mask8x32) Int8x32
// Asm: VPERMB, CPU Feature: AVX512VBMI // Asm: VPERMB, CPU Feature: AVX512VBMI
func (x Uint8x32) PermuteMasked(indices Uint8x32, mask Mask8x32) Uint8x32 func (x Uint8x32) PermuteMasked(indices Uint8x32, mask Mask8x32) Uint8x32
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7234,7 +7234,7 @@ func (x Uint8x32) PermuteMasked(indices Uint8x32, mask Mask8x32) Uint8x32
// Asm: VPERMB, CPU Feature: AVX512VBMI // Asm: VPERMB, CPU Feature: AVX512VBMI
func (x Int8x64) PermuteMasked(indices Uint8x64, mask Mask8x64) Int8x64 func (x Int8x64) PermuteMasked(indices Uint8x64, mask Mask8x64) Int8x64
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7243,7 +7243,7 @@ func (x Int8x64) PermuteMasked(indices Uint8x64, mask Mask8x64) Int8x64
// Asm: VPERMB, CPU Feature: AVX512VBMI // Asm: VPERMB, CPU Feature: AVX512VBMI
func (x Uint8x64) PermuteMasked(indices Uint8x64, mask Mask8x64) Uint8x64 func (x Uint8x64) PermuteMasked(indices Uint8x64, mask Mask8x64) Uint8x64
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7252,7 +7252,7 @@ func (x Uint8x64) PermuteMasked(indices Uint8x64, mask Mask8x64) Uint8x64
// Asm: VPERMW, CPU Feature: AVX512BW // Asm: VPERMW, CPU Feature: AVX512BW
func (x Int16x8) PermuteMasked(indices Uint16x8, mask Mask16x8) Int16x8 func (x Int16x8) PermuteMasked(indices Uint16x8, mask Mask16x8) Int16x8
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7261,7 +7261,7 @@ func (x Int16x8) PermuteMasked(indices Uint16x8, mask Mask16x8) Int16x8
// Asm: VPERMW, CPU Feature: AVX512BW // Asm: VPERMW, CPU Feature: AVX512BW
func (x Uint16x8) PermuteMasked(indices Uint16x8, mask Mask16x8) Uint16x8 func (x Uint16x8) PermuteMasked(indices Uint16x8, mask Mask16x8) Uint16x8
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7270,7 +7270,7 @@ func (x Uint16x8) PermuteMasked(indices Uint16x8, mask Mask16x8) Uint16x8
// Asm: VPERMW, CPU Feature: AVX512BW // Asm: VPERMW, CPU Feature: AVX512BW
func (x Int16x16) PermuteMasked(indices Uint16x16, mask Mask16x16) Int16x16 func (x Int16x16) PermuteMasked(indices Uint16x16, mask Mask16x16) Int16x16
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7279,7 +7279,7 @@ func (x Int16x16) PermuteMasked(indices Uint16x16, mask Mask16x16) Int16x16
// Asm: VPERMW, CPU Feature: AVX512BW // Asm: VPERMW, CPU Feature: AVX512BW
func (x Uint16x16) PermuteMasked(indices Uint16x16, mask Mask16x16) Uint16x16 func (x Uint16x16) PermuteMasked(indices Uint16x16, mask Mask16x16) Uint16x16
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7288,7 +7288,7 @@ func (x Uint16x16) PermuteMasked(indices Uint16x16, mask Mask16x16) Uint16x16
// Asm: VPERMW, CPU Feature: AVX512BW // Asm: VPERMW, CPU Feature: AVX512BW
func (x Int16x32) PermuteMasked(indices Uint16x32, mask Mask16x32) Int16x32 func (x Int16x32) PermuteMasked(indices Uint16x32, mask Mask16x32) Int16x32
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7297,7 +7297,7 @@ func (x Int16x32) PermuteMasked(indices Uint16x32, mask Mask16x32) Int16x32
// Asm: VPERMW, CPU Feature: AVX512BW // Asm: VPERMW, CPU Feature: AVX512BW
func (x Uint16x32) PermuteMasked(indices Uint16x32, mask Mask16x32) Uint16x32 func (x Uint16x32) PermuteMasked(indices Uint16x32, mask Mask16x32) Uint16x32
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7306,7 +7306,7 @@ func (x Uint16x32) PermuteMasked(indices Uint16x32, mask Mask16x32) Uint16x32
// Asm: VPERMPS, CPU Feature: AVX512F // Asm: VPERMPS, CPU Feature: AVX512F
func (x Float32x8) PermuteMasked(indices Uint32x8, mask Mask32x8) Float32x8 func (x Float32x8) PermuteMasked(indices Uint32x8, mask Mask32x8) Float32x8
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7315,7 +7315,7 @@ func (x Float32x8) PermuteMasked(indices Uint32x8, mask Mask32x8) Float32x8
// Asm: VPERMD, CPU Feature: AVX512F // Asm: VPERMD, CPU Feature: AVX512F
func (x Int32x8) PermuteMasked(indices Uint32x8, mask Mask32x8) Int32x8 func (x Int32x8) PermuteMasked(indices Uint32x8, mask Mask32x8) Int32x8
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7324,7 +7324,7 @@ func (x Int32x8) PermuteMasked(indices Uint32x8, mask Mask32x8) Int32x8
// Asm: VPERMD, CPU Feature: AVX512F // Asm: VPERMD, CPU Feature: AVX512F
func (x Uint32x8) PermuteMasked(indices Uint32x8, mask Mask32x8) Uint32x8 func (x Uint32x8) PermuteMasked(indices Uint32x8, mask Mask32x8) Uint32x8
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7333,7 +7333,7 @@ func (x Uint32x8) PermuteMasked(indices Uint32x8, mask Mask32x8) Uint32x8
// Asm: VPERMPS, CPU Feature: AVX512F // Asm: VPERMPS, CPU Feature: AVX512F
func (x Float32x16) PermuteMasked(indices Uint32x16, mask Mask32x16) Float32x16 func (x Float32x16) PermuteMasked(indices Uint32x16, mask Mask32x16) Float32x16
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7342,7 +7342,7 @@ func (x Float32x16) PermuteMasked(indices Uint32x16, mask Mask32x16) Float32x16
// Asm: VPERMD, CPU Feature: AVX512F // Asm: VPERMD, CPU Feature: AVX512F
func (x Int32x16) PermuteMasked(indices Uint32x16, mask Mask32x16) Int32x16 func (x Int32x16) PermuteMasked(indices Uint32x16, mask Mask32x16) Int32x16
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7351,7 +7351,7 @@ func (x Int32x16) PermuteMasked(indices Uint32x16, mask Mask32x16) Int32x16
// Asm: VPERMD, CPU Feature: AVX512F // Asm: VPERMD, CPU Feature: AVX512F
func (x Uint32x16) PermuteMasked(indices Uint32x16, mask Mask32x16) Uint32x16 func (x Uint32x16) PermuteMasked(indices Uint32x16, mask Mask32x16) Uint32x16
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7360,7 +7360,7 @@ func (x Uint32x16) PermuteMasked(indices Uint32x16, mask Mask32x16) Uint32x16
// Asm: VPERMPD, CPU Feature: AVX512F // Asm: VPERMPD, CPU Feature: AVX512F
func (x Float64x4) PermuteMasked(indices Uint64x4, mask Mask64x4) Float64x4 func (x Float64x4) PermuteMasked(indices Uint64x4, mask Mask64x4) Float64x4
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7369,7 +7369,7 @@ func (x Float64x4) PermuteMasked(indices Uint64x4, mask Mask64x4) Float64x4
// Asm: VPERMQ, CPU Feature: AVX512F // Asm: VPERMQ, CPU Feature: AVX512F
func (x Int64x4) PermuteMasked(indices Uint64x4, mask Mask64x4) Int64x4 func (x Int64x4) PermuteMasked(indices Uint64x4, mask Mask64x4) Int64x4
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7378,7 +7378,7 @@ func (x Int64x4) PermuteMasked(indices Uint64x4, mask Mask64x4) Int64x4
// Asm: VPERMQ, CPU Feature: AVX512F // Asm: VPERMQ, CPU Feature: AVX512F
func (x Uint64x4) PermuteMasked(indices Uint64x4, mask Mask64x4) Uint64x4 func (x Uint64x4) PermuteMasked(indices Uint64x4, mask Mask64x4) Uint64x4
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7387,7 +7387,7 @@ func (x Uint64x4) PermuteMasked(indices Uint64x4, mask Mask64x4) Uint64x4
// Asm: VPERMPD, CPU Feature: AVX512F // Asm: VPERMPD, CPU Feature: AVX512F
func (x Float64x8) PermuteMasked(indices Uint64x8, mask Mask64x8) Float64x8 func (x Float64x8) PermuteMasked(indices Uint64x8, mask Mask64x8) Float64x8
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7396,7 +7396,7 @@ func (x Float64x8) PermuteMasked(indices Uint64x8, mask Mask64x8) Float64x8
// Asm: VPERMQ, CPU Feature: AVX512F // Asm: VPERMQ, CPU Feature: AVX512F
func (x Int64x8) PermuteMasked(indices Uint64x8, mask Mask64x8) Int64x8 func (x Int64x8) PermuteMasked(indices Uint64x8, mask Mask64x8) Int64x8
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //

View file

@ -461,7 +461,7 @@ func testMergeLocalswrapper(t *testing.T, op func(simd.Int64x4, simd.Int64x4) si
} }
} }
func TestBitMask(t *testing.T) { func TestBitMaskLoad(t *testing.T) {
if !simd.HasAVX512() { if !simd.HasAVX512() {
t.Skip("Test requires HasAVX512, not available on this hardware") t.Skip("Test requires HasAVX512, not available on this hardware")
return return
@ -477,3 +477,19 @@ func TestBitMask(t *testing.T) {
} }
} }
} }
func TestBitMaskStore(t *testing.T) {
if !simd.HasAVX512() {
t.Skip("Test requires HasAVX512, not available on this hardware")
return
}
var want uint64 = 0b101
var got uint64
x := simd.LoadInt32x4Slice([]int32{1, 2, 3, 4})
y := simd.LoadInt32x4Slice([]int32{5, 0, 5, 0})
m := y.Greater(x)
m.StoreToBits(&got)
if got != want {
t.Errorf("Result incorrect: want %b, got %b", want, got)
}
}

View file

@ -205,48 +205,88 @@ type Mask8x16 struct {
vals [16]int8 vals [16]int8
} }
// Mask8x16FromBits constructs a Mask8x16 from an a bitmap, where 1 means set for the indexed element, 0 means unset. // LoadMask8x16FromBits constructs a Mask8x16 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used. // Only the lower 16 bits of y are used.
// //
// CPU Features: AVX512
//
//go:noescape //go:noescape
func LoadMask8x16FromBits(y *uint64) Mask8x16 func LoadMask8x16FromBits(y *uint64) Mask8x16
// StoreToBits stores a Mask8x16 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask8x16) StoreToBits(y *uint64)
// Mask16x8 is a 128-bit SIMD vector of 8 int16 // Mask16x8 is a 128-bit SIMD vector of 8 int16
type Mask16x8 struct { type Mask16x8 struct {
int16x8 v128 int16x8 v128
vals [8]int16 vals [8]int16
} }
// Mask16x8FromBits constructs a Mask16x8 from an a bitmap, where 1 means set for the indexed element, 0 means unset. // LoadMask16x8FromBits constructs a Mask16x8 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used. // Only the lower 8 bits of y are used.
// //
// CPU Features: AVX512
//
//go:noescape //go:noescape
func LoadMask16x8FromBits(y *uint64) Mask16x8 func LoadMask16x8FromBits(y *uint64) Mask16x8
// StoreToBits stores a Mask16x8 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask16x8) StoreToBits(y *uint64)
// Mask32x4 is a 128-bit SIMD vector of 4 int32 // Mask32x4 is a 128-bit SIMD vector of 4 int32
type Mask32x4 struct { type Mask32x4 struct {
int32x4 v128 int32x4 v128
vals [4]int32 vals [4]int32
} }
// Mask32x4FromBits constructs a Mask32x4 from an a bitmap, where 1 means set for the indexed element, 0 means unset. // LoadMask32x4FromBits constructs a Mask32x4 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 4 bits of y are used. // Only the lower 4 bits of y are used.
// //
// CPU Features: AVX512
//
//go:noescape //go:noescape
func LoadMask32x4FromBits(y *uint64) Mask32x4 func LoadMask32x4FromBits(y *uint64) Mask32x4
// StoreToBits stores a Mask32x4 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 4 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask32x4) StoreToBits(y *uint64)
// Mask64x2 is a 128-bit SIMD vector of 2 int64 // Mask64x2 is a 128-bit SIMD vector of 2 int64
type Mask64x2 struct { type Mask64x2 struct {
int64x2 v128 int64x2 v128
vals [2]int64 vals [2]int64
} }
// Mask64x2FromBits constructs a Mask64x2 from an a bitmap, where 1 means set for the indexed element, 0 means unset. // LoadMask64x2FromBits constructs a Mask64x2 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 2 bits of y are used. // Only the lower 2 bits of y are used.
// //
// CPU Features: AVX512
//
//go:noescape //go:noescape
func LoadMask64x2FromBits(y *uint64) Mask64x2 func LoadMask64x2FromBits(y *uint64) Mask64x2
// StoreToBits stores a Mask64x2 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 2 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask64x2) StoreToBits(y *uint64)
// v256 is a tag type that tells the compiler that this is really 256-bit SIMD // v256 is a tag type that tells the compiler that this is really 256-bit SIMD
type v256 struct { type v256 struct {
_256 struct{} _256 struct{}
@ -448,48 +488,88 @@ type Mask8x32 struct {
vals [32]int8 vals [32]int8
} }
// Mask8x32FromBits constructs a Mask8x32 from an a bitmap, where 1 means set for the indexed element, 0 means unset. // LoadMask8x32FromBits constructs a Mask8x32 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 32 bits of y are used. // Only the lower 32 bits of y are used.
// //
// CPU Features: AVX512
//
//go:noescape //go:noescape
func LoadMask8x32FromBits(y *uint64) Mask8x32 func LoadMask8x32FromBits(y *uint64) Mask8x32
// StoreToBits stores a Mask8x32 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 32 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask8x32) StoreToBits(y *uint64)
// Mask16x16 is a 256-bit SIMD vector of 16 int16 // Mask16x16 is a 256-bit SIMD vector of 16 int16
type Mask16x16 struct { type Mask16x16 struct {
int16x16 v256 int16x16 v256
vals [16]int16 vals [16]int16
} }
// Mask16x16FromBits constructs a Mask16x16 from an a bitmap, where 1 means set for the indexed element, 0 means unset. // LoadMask16x16FromBits constructs a Mask16x16 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used. // Only the lower 16 bits of y are used.
// //
// CPU Features: AVX512
//
//go:noescape //go:noescape
func LoadMask16x16FromBits(y *uint64) Mask16x16 func LoadMask16x16FromBits(y *uint64) Mask16x16
// StoreToBits stores a Mask16x16 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask16x16) StoreToBits(y *uint64)
// Mask32x8 is a 256-bit SIMD vector of 8 int32 // Mask32x8 is a 256-bit SIMD vector of 8 int32
type Mask32x8 struct { type Mask32x8 struct {
int32x8 v256 int32x8 v256
vals [8]int32 vals [8]int32
} }
// Mask32x8FromBits constructs a Mask32x8 from an a bitmap, where 1 means set for the indexed element, 0 means unset. // LoadMask32x8FromBits constructs a Mask32x8 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used. // Only the lower 8 bits of y are used.
// //
// CPU Features: AVX512
//
//go:noescape //go:noescape
func LoadMask32x8FromBits(y *uint64) Mask32x8 func LoadMask32x8FromBits(y *uint64) Mask32x8
// StoreToBits stores a Mask32x8 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask32x8) StoreToBits(y *uint64)
// Mask64x4 is a 256-bit SIMD vector of 4 int64 // Mask64x4 is a 256-bit SIMD vector of 4 int64
type Mask64x4 struct { type Mask64x4 struct {
int64x4 v256 int64x4 v256
vals [4]int64 vals [4]int64
} }
// Mask64x4FromBits constructs a Mask64x4 from an a bitmap, where 1 means set for the indexed element, 0 means unset. // LoadMask64x4FromBits constructs a Mask64x4 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 4 bits of y are used. // Only the lower 4 bits of y are used.
// //
// CPU Features: AVX512
//
//go:noescape //go:noescape
func LoadMask64x4FromBits(y *uint64) Mask64x4 func LoadMask64x4FromBits(y *uint64) Mask64x4
// StoreToBits stores a Mask64x4 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 4 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask64x4) StoreToBits(y *uint64)
// v512 is a tag type that tells the compiler that this is really 512-bit SIMD // v512 is a tag type that tells the compiler that this is really 512-bit SIMD
type v512 struct { type v512 struct {
_512 struct{} _512 struct{}
@ -691,44 +771,84 @@ type Mask8x64 struct {
vals [64]int8 vals [64]int8
} }
// Mask8x64FromBits constructs a Mask8x64 from an a bitmap, where 1 means set for the indexed element, 0 means unset. // LoadMask8x64FromBits constructs a Mask8x64 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 64 bits of y are used. // Only the lower 64 bits of y are used.
// //
// CPU Features: AVX512
//
//go:noescape //go:noescape
func LoadMask8x64FromBits(y *uint64) Mask8x64 func LoadMask8x64FromBits(y *uint64) Mask8x64
// StoreToBits stores a Mask8x64 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 64 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask8x64) StoreToBits(y *uint64)
// Mask16x32 is a 512-bit SIMD vector of 32 int16 // Mask16x32 is a 512-bit SIMD vector of 32 int16
type Mask16x32 struct { type Mask16x32 struct {
int16x32 v512 int16x32 v512
vals [32]int16 vals [32]int16
} }
// Mask16x32FromBits constructs a Mask16x32 from an a bitmap, where 1 means set for the indexed element, 0 means unset. // LoadMask16x32FromBits constructs a Mask16x32 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 32 bits of y are used. // Only the lower 32 bits of y are used.
// //
// CPU Features: AVX512
//
//go:noescape //go:noescape
func LoadMask16x32FromBits(y *uint64) Mask16x32 func LoadMask16x32FromBits(y *uint64) Mask16x32
// StoreToBits stores a Mask16x32 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 32 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask16x32) StoreToBits(y *uint64)
// Mask32x16 is a 512-bit SIMD vector of 16 int32 // Mask32x16 is a 512-bit SIMD vector of 16 int32
type Mask32x16 struct { type Mask32x16 struct {
int32x16 v512 int32x16 v512
vals [16]int32 vals [16]int32
} }
// Mask32x16FromBits constructs a Mask32x16 from an a bitmap, where 1 means set for the indexed element, 0 means unset. // LoadMask32x16FromBits constructs a Mask32x16 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used. // Only the lower 16 bits of y are used.
// //
// CPU Features: AVX512
//
//go:noescape //go:noescape
func LoadMask32x16FromBits(y *uint64) Mask32x16 func LoadMask32x16FromBits(y *uint64) Mask32x16
// StoreToBits stores a Mask32x16 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask32x16) StoreToBits(y *uint64)
// Mask64x8 is a 512-bit SIMD vector of 8 int64 // Mask64x8 is a 512-bit SIMD vector of 8 int64
type Mask64x8 struct { type Mask64x8 struct {
int64x8 v512 int64x8 v512
vals [8]int64 vals [8]int64
} }
// Mask64x8FromBits constructs a Mask64x8 from an a bitmap, where 1 means set for the indexed element, 0 means unset. // LoadMask64x8FromBits constructs a Mask64x8 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used. // Only the lower 8 bits of y are used.
// //
// CPU Features: AVX512
//
//go:noescape //go:noescape
func LoadMask64x8FromBits(y *uint64) Mask64x8 func LoadMask64x8FromBits(y *uint64) Mask64x8
// StoreToBits stores a Mask64x8 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask64x8) StoreToBits(y *uint64)