[dev.simd] cmd/compile, simd: support store to bits for mask

This CL is partially generated by CL 689775.

Change-Id: I0c36fd2a44706c88db1a1d5ea4a6d0b9f891d85f
Reviewed-on: https://go-review.googlesource.com/c/go/+/689795
Reviewed-by: David Chase <drchase@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
This commit is contained in:
Junyang Shao 2025-07-23 07:37:14 +00:00
parent 41054cdb1c
commit 6f7a1164e7
15 changed files with 1192 additions and 523 deletions

View file

@@ -24,8 +24,8 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPABSQ128, ssa.OpAMD64VPABSQ128,
ssa.OpAMD64VPABSQ256, ssa.OpAMD64VPABSQ256,
ssa.OpAMD64VPABSQ512, ssa.OpAMD64VPABSQ512,
ssa.OpAMD64VRCP14PS128, ssa.OpAMD64VRCPPS128,
ssa.OpAMD64VRCP14PS256, ssa.OpAMD64VRCPPS256,
ssa.OpAMD64VRCP14PS512, ssa.OpAMD64VRCP14PS512,
ssa.OpAMD64VRCP14PD128, ssa.OpAMD64VRCP14PD128,
ssa.OpAMD64VRCP14PD256, ssa.OpAMD64VRCP14PD256,
@@ -335,6 +335,16 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPXORQ512: ssa.OpAMD64VPXORQ512:
p = simdV21(s, v) p = simdV21(s, v)
case ssa.OpAMD64VPCMPEQB512,
ssa.OpAMD64VPCMPEQW512,
ssa.OpAMD64VPCMPEQD512,
ssa.OpAMD64VPCMPEQQ512,
ssa.OpAMD64VPCMPGTB512,
ssa.OpAMD64VPCMPGTW512,
ssa.OpAMD64VPCMPGTD512,
ssa.OpAMD64VPCMPGTQ512:
p = simdV2k(s, v)
case ssa.OpAMD64VADDPSMasked128, case ssa.OpAMD64VADDPSMasked128,
ssa.OpAMD64VADDPSMasked256, ssa.OpAMD64VADDPSMasked256,
ssa.OpAMD64VADDPSMasked512, ssa.OpAMD64VADDPSMasked512,
@@ -733,30 +743,30 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
case ssa.OpAMD64VCMPPS512, case ssa.OpAMD64VCMPPS512,
ssa.OpAMD64VCMPPD512, ssa.OpAMD64VCMPPD512,
ssa.OpAMD64VPCMPB512,
ssa.OpAMD64VPCMPW512,
ssa.OpAMD64VPCMPD512,
ssa.OpAMD64VPCMPQ512,
ssa.OpAMD64VPCMPUB512,
ssa.OpAMD64VPCMPUW512,
ssa.OpAMD64VPCMPUD512,
ssa.OpAMD64VPCMPUQ512,
ssa.OpAMD64VPCMPUB128, ssa.OpAMD64VPCMPUB128,
ssa.OpAMD64VPCMPUB256, ssa.OpAMD64VPCMPUB256,
ssa.OpAMD64VPCMPUB512,
ssa.OpAMD64VPCMPUW128, ssa.OpAMD64VPCMPUW128,
ssa.OpAMD64VPCMPUW256, ssa.OpAMD64VPCMPUW256,
ssa.OpAMD64VPCMPUW512,
ssa.OpAMD64VPCMPUD128, ssa.OpAMD64VPCMPUD128,
ssa.OpAMD64VPCMPUD256, ssa.OpAMD64VPCMPUD256,
ssa.OpAMD64VPCMPUD512,
ssa.OpAMD64VPCMPUQ128, ssa.OpAMD64VPCMPUQ128,
ssa.OpAMD64VPCMPUQ256, ssa.OpAMD64VPCMPUQ256,
ssa.OpAMD64VPCMPUQ512,
ssa.OpAMD64VPCMPB128, ssa.OpAMD64VPCMPB128,
ssa.OpAMD64VPCMPB256, ssa.OpAMD64VPCMPB256,
ssa.OpAMD64VPCMPB512,
ssa.OpAMD64VPCMPW128, ssa.OpAMD64VPCMPW128,
ssa.OpAMD64VPCMPW256, ssa.OpAMD64VPCMPW256,
ssa.OpAMD64VPCMPW512,
ssa.OpAMD64VPCMPD128, ssa.OpAMD64VPCMPD128,
ssa.OpAMD64VPCMPD256, ssa.OpAMD64VPCMPD256,
ssa.OpAMD64VPCMPD512,
ssa.OpAMD64VPCMPQ128, ssa.OpAMD64VPCMPQ128,
ssa.OpAMD64VPCMPQ256: ssa.OpAMD64VPCMPQ256,
ssa.OpAMD64VPCMPQ512:
p = simdV2kImm8(s, v) p = simdV2kImm8(s, v)
case ssa.OpAMD64VCMPPSMasked128, case ssa.OpAMD64VCMPPSMasked128,

View file

@@ -1468,10 +1468,10 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
ssagen.AddAux(&p.From, v) ssagen.AddAux(&p.From, v)
p.To.Type = obj.TYPE_REG p.To.Type = obj.TYPE_REG
p.To.Reg = simdOrMaskReg(v) p.To.Reg = simdOrMaskReg(v)
case ssa.OpAMD64VMOVDQUstore128, ssa.OpAMD64VMOVDQUstore256, ssa.OpAMD64VMOVDQUstore512: case ssa.OpAMD64VMOVDQUstore128, ssa.OpAMD64VMOVDQUstore256, ssa.OpAMD64VMOVDQUstore512, ssa.OpAMD64KMOVQstore:
p := s.Prog(v.Op.Asm()) p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG p.From.Type = obj.TYPE_REG
p.From.Reg = simdReg(v.Args[1]) p.From.Reg = simdOrMaskReg(v.Args[1])
p.To.Type = obj.TYPE_MEM p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg() p.To.Reg = v.Args[0].Reg()
ssagen.AddAux(&p.To, v) ssagen.AddAux(&p.To, v)

View file

@@ -1698,6 +1698,22 @@
(LoadMask64x4 <t> ptr mem) => (VPMOVMToVec64x4 <types.TypeVec256> (KMOVQload <t> ptr mem)) (LoadMask64x4 <t> ptr mem) => (VPMOVMToVec64x4 <types.TypeVec256> (KMOVQload <t> ptr mem))
(LoadMask64x8 <t> ptr mem) => (VPMOVMToVec64x8 <types.TypeVec512> (KMOVQload <t> ptr mem)) (LoadMask64x8 <t> ptr mem) => (VPMOVMToVec64x8 <types.TypeVec512> (KMOVQload <t> ptr mem))
(StoreMask8x16 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec8x16ToM <t> val) mem)
(StoreMask8x32 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec8x32ToM <t> val) mem)
(StoreMask8x64 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec8x64ToM <t> val) mem)
(StoreMask16x8 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec16x8ToM <t> val) mem)
(StoreMask16x16 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec16x16ToM <t> val) mem)
(StoreMask16x32 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec16x32ToM <t> val) mem)
(StoreMask32x4 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec32x4ToM <t> val) mem)
(StoreMask32x8 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec32x8ToM <t> val) mem)
(StoreMask32x16 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec32x16ToM <t> val) mem)
(StoreMask64x2 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec64x2ToM <t> val) mem)
(StoreMask64x4 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec64x4ToM <t> val) mem)
(StoreMask64x8 {t} ptr val mem) => (KMOVQstore ptr (VPMOVVec64x8ToM <t> val) mem)
(Load <t> ptr mem) && t.Size() == 16 => (VMOVDQUload128 ptr mem) (Load <t> ptr mem) && t.Size() == 16 => (VMOVDQUload128 ptr mem)
(Store {t} ptr val mem) && t.Size() == 16 => (VMOVDQUstore128 ptr val mem) (Store {t} ptr val mem) && t.Size() == 16 => (VMOVDQUstore128 ptr val mem)

View file

@@ -235,6 +235,7 @@ func init() {
wfpkw = regInfo{inputs: []regMask{w, fp, mask}, outputs: wonly} wfpkw = regInfo{inputs: []regMask{w, fp, mask}, outputs: wonly}
kload = regInfo{inputs: []regMask{gpspsb, 0}, outputs: maskonly} kload = regInfo{inputs: []regMask{gpspsb, 0}, outputs: maskonly}
kstore = regInfo{inputs: []regMask{gpspsb, mask, 0}}
prefreg = regInfo{inputs: []regMask{gpspsbg}} prefreg = regInfo{inputs: []regMask{gpspsbg}}
) )
@@ -1318,6 +1319,7 @@ func init() {
{name: "VZEROALL", argLength: 0, asm: "VZEROALL"}, {name: "VZEROALL", argLength: 0, asm: "VZEROALL"},
{name: "KMOVQload", argLength: 2, reg: kload, asm: "KMOVQ", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, {name: "KMOVQload", argLength: 2, reg: kload, asm: "KMOVQ", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"},
{name: "KMOVQstore", argLength: 3, reg: kstore, asm: "KMOVQ", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"},
} }
var AMD64blocks = []blockData{ var AMD64blocks = []blockData{

View file

@@ -678,6 +678,19 @@ var genericOps = []opData{
{name: "LoadMask64x2", argLength: 2}, // arg0 = ptr, arg1 = mem {name: "LoadMask64x2", argLength: 2}, // arg0 = ptr, arg1 = mem
{name: "LoadMask64x4", argLength: 2}, // arg0 = ptr, arg1 = mem {name: "LoadMask64x4", argLength: 2}, // arg0 = ptr, arg1 = mem
{name: "LoadMask64x8", argLength: 2}, // arg0 = ptr, arg1 = mem {name: "LoadMask64x8", argLength: 2}, // arg0 = ptr, arg1 = mem
{name: "StoreMask8x16", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask8x32", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask8x64", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask16x8", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask16x16", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask16x32", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask32x4", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask32x8", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask32x16", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask64x2", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask64x4", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
{name: "StoreMask64x8", argLength: 3, typ: "Mem", aux: "Typ"}, // Store arg1 to arg0. arg2=memory, aux=type. Returns memory.
} }
// kind controls successors implicit exit // kind controls successors implicit exit

View file

@@ -152,8 +152,8 @@
(AndNotMaskedUint64x2 x y mask) => (VPANDNQMasked128 x y (VPMOVVec64x2ToM <types.TypeMask> mask)) (AndNotMaskedUint64x2 x y mask) => (VPANDNQMasked128 x y (VPMOVVec64x2ToM <types.TypeMask> mask))
(AndNotMaskedUint64x4 x y mask) => (VPANDNQMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask)) (AndNotMaskedUint64x4 x y mask) => (VPANDNQMasked256 x y (VPMOVVec64x4ToM <types.TypeMask> mask))
(AndNotMaskedUint64x8 x y mask) => (VPANDNQMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask)) (AndNotMaskedUint64x8 x y mask) => (VPANDNQMasked512 x y (VPMOVVec64x8ToM <types.TypeMask> mask))
(ApproximateReciprocalFloat32x4 ...) => (VRCP14PS128 ...) (ApproximateReciprocalFloat32x4 ...) => (VRCPPS128 ...)
(ApproximateReciprocalFloat32x8 ...) => (VRCP14PS256 ...) (ApproximateReciprocalFloat32x8 ...) => (VRCPPS256 ...)
(ApproximateReciprocalFloat32x16 ...) => (VRCP14PS512 ...) (ApproximateReciprocalFloat32x16 ...) => (VRCP14PS512 ...)
(ApproximateReciprocalFloat64x2 ...) => (VRCP14PD128 ...) (ApproximateReciprocalFloat64x2 ...) => (VRCP14PD128 ...)
(ApproximateReciprocalFloat64x4 ...) => (VRCP14PD256 ...) (ApproximateReciprocalFloat64x4 ...) => (VRCP14PD256 ...)
@@ -305,28 +305,28 @@
(EqualFloat64x8 x y) => (VPMOVMToVec64x8 (VCMPPD512 [0] x y)) (EqualFloat64x8 x y) => (VPMOVMToVec64x8 (VCMPPD512 [0] x y))
(EqualInt8x16 ...) => (VPCMPEQB128 ...) (EqualInt8x16 ...) => (VPCMPEQB128 ...)
(EqualInt8x32 ...) => (VPCMPEQB256 ...) (EqualInt8x32 ...) => (VPCMPEQB256 ...)
(EqualInt8x64 x y) => (VPMOVMToVec8x64 (VPCMPB512 [0] x y)) (EqualInt8x64 x y) => (VPMOVMToVec8x64 (VPCMPEQB512 x y))
(EqualInt16x8 ...) => (VPCMPEQW128 ...) (EqualInt16x8 ...) => (VPCMPEQW128 ...)
(EqualInt16x16 ...) => (VPCMPEQW256 ...) (EqualInt16x16 ...) => (VPCMPEQW256 ...)
(EqualInt16x32 x y) => (VPMOVMToVec16x32 (VPCMPW512 [0] x y)) (EqualInt16x32 x y) => (VPMOVMToVec16x32 (VPCMPEQW512 x y))
(EqualInt32x4 ...) => (VPCMPEQD128 ...) (EqualInt32x4 ...) => (VPCMPEQD128 ...)
(EqualInt32x8 ...) => (VPCMPEQD256 ...) (EqualInt32x8 ...) => (VPCMPEQD256 ...)
(EqualInt32x16 x y) => (VPMOVMToVec32x16 (VPCMPD512 [0] x y)) (EqualInt32x16 x y) => (VPMOVMToVec32x16 (VPCMPEQD512 x y))
(EqualInt64x2 ...) => (VPCMPEQQ128 ...) (EqualInt64x2 ...) => (VPCMPEQQ128 ...)
(EqualInt64x4 ...) => (VPCMPEQQ256 ...) (EqualInt64x4 ...) => (VPCMPEQQ256 ...)
(EqualInt64x8 x y) => (VPMOVMToVec64x8 (VPCMPQ512 [0] x y)) (EqualInt64x8 x y) => (VPMOVMToVec64x8 (VPCMPEQQ512 x y))
(EqualUint8x16 ...) => (VPCMPEQB128 ...) (EqualUint8x16 ...) => (VPCMPEQB128 ...)
(EqualUint8x32 ...) => (VPCMPEQB256 ...) (EqualUint8x32 ...) => (VPCMPEQB256 ...)
(EqualUint8x64 x y) => (VPMOVMToVec8x64 (VPCMPUB512 [0] x y)) (EqualUint8x64 x y) => (VPMOVMToVec8x64 (VPCMPEQB512 x y))
(EqualUint16x8 ...) => (VPCMPEQW128 ...) (EqualUint16x8 ...) => (VPCMPEQW128 ...)
(EqualUint16x16 ...) => (VPCMPEQW256 ...) (EqualUint16x16 ...) => (VPCMPEQW256 ...)
(EqualUint16x32 x y) => (VPMOVMToVec16x32 (VPCMPUW512 [0] x y)) (EqualUint16x32 x y) => (VPMOVMToVec16x32 (VPCMPEQW512 x y))
(EqualUint32x4 ...) => (VPCMPEQD128 ...) (EqualUint32x4 ...) => (VPCMPEQD128 ...)
(EqualUint32x8 ...) => (VPCMPEQD256 ...) (EqualUint32x8 ...) => (VPCMPEQD256 ...)
(EqualUint32x16 x y) => (VPMOVMToVec32x16 (VPCMPUD512 [0] x y)) (EqualUint32x16 x y) => (VPMOVMToVec32x16 (VPCMPEQD512 x y))
(EqualUint64x2 ...) => (VPCMPEQQ128 ...) (EqualUint64x2 ...) => (VPCMPEQQ128 ...)
(EqualUint64x4 ...) => (VPCMPEQQ256 ...) (EqualUint64x4 ...) => (VPCMPEQQ256 ...)
(EqualUint64x8 x y) => (VPMOVMToVec64x8 (VPCMPUQ512 [0] x y)) (EqualUint64x8 x y) => (VPMOVMToVec64x8 (VPCMPEQQ512 x y))
(EqualMaskedFloat32x4 x y mask) => (VPMOVMToVec32x4 (VCMPPSMasked128 [0] x y (VPMOVVec32x4ToM <types.TypeMask> mask))) (EqualMaskedFloat32x4 x y mask) => (VPMOVMToVec32x4 (VCMPPSMasked128 [0] x y (VPMOVVec32x4ToM <types.TypeMask> mask)))
(EqualMaskedFloat32x8 x y mask) => (VPMOVMToVec32x8 (VCMPPSMasked256 [0] x y (VPMOVVec32x8ToM <types.TypeMask> mask))) (EqualMaskedFloat32x8 x y mask) => (VPMOVMToVec32x8 (VCMPPSMasked256 [0] x y (VPMOVVec32x8ToM <types.TypeMask> mask)))
(EqualMaskedFloat32x16 x y mask) => (VPMOVMToVec32x16 (VCMPPSMasked512 [0] x y (VPMOVVec32x16ToM <types.TypeMask> mask))) (EqualMaskedFloat32x16 x y mask) => (VPMOVMToVec32x16 (VCMPPSMasked512 [0] x y (VPMOVVec32x16ToM <types.TypeMask> mask)))
@@ -453,16 +453,16 @@
(GreaterFloat64x8 x y) => (VPMOVMToVec64x8 (VCMPPD512 [14] x y)) (GreaterFloat64x8 x y) => (VPMOVMToVec64x8 (VCMPPD512 [14] x y))
(GreaterInt8x16 ...) => (VPCMPGTB128 ...) (GreaterInt8x16 ...) => (VPCMPGTB128 ...)
(GreaterInt8x32 ...) => (VPCMPGTB256 ...) (GreaterInt8x32 ...) => (VPCMPGTB256 ...)
(GreaterInt8x64 x y) => (VPMOVMToVec8x64 (VPCMPB512 [14] x y)) (GreaterInt8x64 x y) => (VPMOVMToVec8x64 (VPCMPGTB512 x y))
(GreaterInt16x8 ...) => (VPCMPGTW128 ...) (GreaterInt16x8 ...) => (VPCMPGTW128 ...)
(GreaterInt16x16 ...) => (VPCMPGTW256 ...) (GreaterInt16x16 ...) => (VPCMPGTW256 ...)
(GreaterInt16x32 x y) => (VPMOVMToVec16x32 (VPCMPW512 [14] x y)) (GreaterInt16x32 x y) => (VPMOVMToVec16x32 (VPCMPGTW512 x y))
(GreaterInt32x4 ...) => (VPCMPGTD128 ...) (GreaterInt32x4 ...) => (VPCMPGTD128 ...)
(GreaterInt32x8 ...) => (VPCMPGTD256 ...) (GreaterInt32x8 ...) => (VPCMPGTD256 ...)
(GreaterInt32x16 x y) => (VPMOVMToVec32x16 (VPCMPD512 [14] x y)) (GreaterInt32x16 x y) => (VPMOVMToVec32x16 (VPCMPGTD512 x y))
(GreaterInt64x2 ...) => (VPCMPGTQ128 ...) (GreaterInt64x2 ...) => (VPCMPGTQ128 ...)
(GreaterInt64x4 ...) => (VPCMPGTQ256 ...) (GreaterInt64x4 ...) => (VPCMPGTQ256 ...)
(GreaterInt64x8 x y) => (VPMOVMToVec64x8 (VPCMPQ512 [14] x y)) (GreaterInt64x8 x y) => (VPMOVMToVec64x8 (VPCMPGTQ512 x y))
(GreaterUint8x16 x y) => (VPMOVMToVec8x16 (VPCMPUB128 [14] x y)) (GreaterUint8x16 x y) => (VPMOVMToVec8x16 (VPCMPUB128 [14] x y))
(GreaterUint8x32 x y) => (VPMOVMToVec8x32 (VPCMPUB256 [14] x y)) (GreaterUint8x32 x y) => (VPMOVMToVec8x32 (VPCMPUB256 [14] x y))
(GreaterUint8x64 x y) => (VPMOVMToVec8x64 (VPCMPUB512 [14] x y)) (GreaterUint8x64 x y) => (VPMOVMToVec8x64 (VPCMPUB512 [14] x y))

View file

@@ -33,7 +33,7 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VADDPS128", argLength: 2, reg: v21, asm: "VADDPS", commutative: true, typ: "Vec128", resultInArg0: false}, {name: "VADDPS128", argLength: 2, reg: v21, asm: "VADDPS", commutative: true, typ: "Vec128", resultInArg0: false},
{name: "VADDPSMasked128", argLength: 3, reg: w2kw, asm: "VADDPS", commutative: true, typ: "Vec128", resultInArg0: false}, {name: "VADDPSMasked128", argLength: 3, reg: w2kw, asm: "VADDPS", commutative: true, typ: "Vec128", resultInArg0: false},
{name: "VADDSUBPS128", argLength: 2, reg: v21, asm: "VADDSUBPS", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VADDSUBPS128", argLength: 2, reg: v21, asm: "VADDSUBPS", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VRCP14PS128", argLength: 1, reg: w11, asm: "VRCP14PS", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VRCPPS128", argLength: 1, reg: v11, asm: "VRCPPS", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VRCP14PSMasked128", argLength: 2, reg: wkw, asm: "VRCP14PS", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VRCP14PSMasked128", argLength: 2, reg: wkw, asm: "VRCP14PS", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VRSQRTPS128", argLength: 1, reg: v11, asm: "VRSQRTPS", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VRSQRTPS128", argLength: 1, reg: v11, asm: "VRSQRTPS", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VRSQRT14PSMasked128", argLength: 2, reg: wkw, asm: "VRSQRT14PS", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VRSQRT14PSMasked128", argLength: 2, reg: wkw, asm: "VRSQRT14PS", commutative: false, typ: "Vec128", resultInArg0: false},
@@ -63,7 +63,7 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VADDPS256", argLength: 2, reg: v21, asm: "VADDPS", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VADDPS256", argLength: 2, reg: v21, asm: "VADDPS", commutative: true, typ: "Vec256", resultInArg0: false},
{name: "VADDPSMasked256", argLength: 3, reg: w2kw, asm: "VADDPS", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VADDPSMasked256", argLength: 3, reg: w2kw, asm: "VADDPS", commutative: true, typ: "Vec256", resultInArg0: false},
{name: "VADDSUBPS256", argLength: 2, reg: v21, asm: "VADDSUBPS", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VADDSUBPS256", argLength: 2, reg: v21, asm: "VADDSUBPS", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VRCP14PS256", argLength: 1, reg: w11, asm: "VRCP14PS", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VRCPPS256", argLength: 1, reg: v11, asm: "VRCPPS", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VRCP14PSMasked256", argLength: 2, reg: wkw, asm: "VRCP14PS", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VRCP14PSMasked256", argLength: 2, reg: wkw, asm: "VRCP14PS", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VRSQRTPS256", argLength: 1, reg: v11, asm: "VRSQRTPS", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VRSQRTPS256", argLength: 1, reg: v11, asm: "VRSQRTPS", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VRSQRT14PSMasked256", argLength: 2, reg: wkw, asm: "VRSQRT14PS", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VRSQRT14PSMasked256", argLength: 2, reg: wkw, asm: "VRSQRT14PS", commutative: false, typ: "Vec256", resultInArg0: false},
@@ -224,6 +224,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPADDW512", argLength: 2, reg: w21, asm: "VPADDW", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPADDW512", argLength: 2, reg: w21, asm: "VPADDW", commutative: true, typ: "Vec512", resultInArg0: false},
{name: "VPADDWMasked512", argLength: 3, reg: w2kw, asm: "VPADDW", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPADDWMasked512", argLength: 3, reg: w2kw, asm: "VPADDW", commutative: true, typ: "Vec512", resultInArg0: false},
{name: "VPCOMPRESSWMasked512", argLength: 2, reg: wkw, asm: "VPCOMPRESSW", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPCOMPRESSWMasked512", argLength: 2, reg: wkw, asm: "VPCOMPRESSW", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPCMPEQW512", argLength: 2, reg: w2k, asm: "VPCMPEQW", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPGTW512", argLength: 2, reg: w2k, asm: "VPCMPGTW", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPMAXSW512", argLength: 2, reg: w21, asm: "VPMAXSW", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPMAXSW512", argLength: 2, reg: w21, asm: "VPMAXSW", commutative: true, typ: "Vec512", resultInArg0: false},
{name: "VPMAXSWMasked512", argLength: 3, reg: w2kw, asm: "VPMAXSW", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPMAXSWMasked512", argLength: 3, reg: w2kw, asm: "VPMAXSW", commutative: true, typ: "Vec512", resultInArg0: false},
{name: "VPMINSW512", argLength: 2, reg: w21, asm: "VPMINSW", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPMINSW512", argLength: 2, reg: w21, asm: "VPMINSW", commutative: true, typ: "Vec512", resultInArg0: false},
@@ -305,6 +307,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPANDND512", argLength: 2, reg: w21, asm: "VPANDND", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPANDND512", argLength: 2, reg: w21, asm: "VPANDND", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPANDNDMasked512", argLength: 3, reg: w2kw, asm: "VPANDND", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPANDNDMasked512", argLength: 3, reg: w2kw, asm: "VPANDND", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPCOMPRESSDMasked512", argLength: 2, reg: wkw, asm: "VPCOMPRESSD", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPCOMPRESSDMasked512", argLength: 2, reg: wkw, asm: "VPCOMPRESSD", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPCMPEQD512", argLength: 2, reg: w2k, asm: "VPCMPEQD", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPGTD512", argLength: 2, reg: w2k, asm: "VPCMPGTD", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPMAXSD512", argLength: 2, reg: w21, asm: "VPMAXSD", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPMAXSD512", argLength: 2, reg: w21, asm: "VPMAXSD", commutative: true, typ: "Vec512", resultInArg0: false},
{name: "VPMAXSDMasked512", argLength: 3, reg: w2kw, asm: "VPMAXSD", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPMAXSDMasked512", argLength: 3, reg: w2kw, asm: "VPMAXSD", commutative: true, typ: "Vec512", resultInArg0: false},
{name: "VPMINSD512", argLength: 2, reg: w21, asm: "VPMINSD", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPMINSD512", argLength: 2, reg: w21, asm: "VPMINSD", commutative: true, typ: "Vec512", resultInArg0: false},
@@ -526,6 +530,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPANDNQ512", argLength: 2, reg: w21, asm: "VPANDNQ", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPANDNQ512", argLength: 2, reg: w21, asm: "VPANDNQ", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPANDNQMasked512", argLength: 3, reg: w2kw, asm: "VPANDNQ", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPANDNQMasked512", argLength: 3, reg: w2kw, asm: "VPANDNQ", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPCOMPRESSQMasked512", argLength: 2, reg: wkw, asm: "VPCOMPRESSQ", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPCOMPRESSQMasked512", argLength: 2, reg: wkw, asm: "VPCOMPRESSQ", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPCMPEQQ512", argLength: 2, reg: w2k, asm: "VPCMPEQQ", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPGTQ512", argLength: 2, reg: w2k, asm: "VPCMPGTQ", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPMAXSQ512", argLength: 2, reg: w21, asm: "VPMAXSQ", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPMAXSQ512", argLength: 2, reg: w21, asm: "VPMAXSQ", commutative: true, typ: "Vec512", resultInArg0: false},
{name: "VPMAXSQMasked512", argLength: 3, reg: w2kw, asm: "VPMAXSQ", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPMAXSQMasked512", argLength: 3, reg: w2kw, asm: "VPMAXSQ", commutative: true, typ: "Vec512", resultInArg0: false},
{name: "VPMINSQ512", argLength: 2, reg: w21, asm: "VPMINSQ", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPMINSQ512", argLength: 2, reg: w21, asm: "VPMINSQ", commutative: true, typ: "Vec512", resultInArg0: false},
@@ -611,6 +617,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPADDB512", argLength: 2, reg: w21, asm: "VPADDB", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPADDB512", argLength: 2, reg: w21, asm: "VPADDB", commutative: true, typ: "Vec512", resultInArg0: false},
{name: "VPADDBMasked512", argLength: 3, reg: w2kw, asm: "VPADDB", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPADDBMasked512", argLength: 3, reg: w2kw, asm: "VPADDB", commutative: true, typ: "Vec512", resultInArg0: false},
{name: "VPCOMPRESSBMasked512", argLength: 2, reg: wkw, asm: "VPCOMPRESSB", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPCOMPRESSBMasked512", argLength: 2, reg: wkw, asm: "VPCOMPRESSB", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPCMPEQB512", argLength: 2, reg: w2k, asm: "VPCMPEQB", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPGTB512", argLength: 2, reg: w2k, asm: "VPCMPGTB", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPMAXSB512", argLength: 2, reg: w21, asm: "VPMAXSB", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPMAXSB512", argLength: 2, reg: w21, asm: "VPMAXSB", commutative: true, typ: "Vec512", resultInArg0: false},
{name: "VPMAXSBMasked512", argLength: 3, reg: w2kw, asm: "VPMAXSB", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPMAXSBMasked512", argLength: 3, reg: w2kw, asm: "VPMAXSB", commutative: true, typ: "Vec512", resultInArg0: false},
{name: "VPMINSB512", argLength: 2, reg: w21, asm: "VPMINSB", commutative: true, typ: "Vec512", resultInArg0: false}, {name: "VPMINSB512", argLength: 2, reg: w21, asm: "VPMINSB", commutative: true, typ: "Vec512", resultInArg0: false},
@@ -692,10 +700,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPMINUD128", argLength: 2, reg: v21, asm: "VPMINUD", commutative: true, typ: "Vec128", resultInArg0: false}, {name: "VPMINUD128", argLength: 2, reg: v21, asm: "VPMINUD", commutative: true, typ: "Vec128", resultInArg0: false},
{name: "VPMINUDMasked128", argLength: 3, reg: w2kw, asm: "VPMINUD", commutative: true, typ: "Vec128", resultInArg0: false}, {name: "VPMINUDMasked128", argLength: 3, reg: w2kw, asm: "VPMINUD", commutative: true, typ: "Vec128", resultInArg0: false},
{name: "VPMULUDQ128", argLength: 2, reg: v21, asm: "VPMULUDQ", commutative: true, typ: "Vec128", resultInArg0: false}, {name: "VPMULUDQ128", argLength: 2, reg: v21, asm: "VPMULUDQ", commutative: true, typ: "Vec128", resultInArg0: false},
{name: "VPERMI2D128", argLength: 3, reg: w31, asm: "VPERMI2D", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "VPERMI2PS128", argLength: 3, reg: w31, asm: "VPERMI2PS", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPERMI2PS128", argLength: 3, reg: w31, asm: "VPERMI2PS", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "VPERMI2PSMasked128", argLength: 4, reg: w3kw, asm: "VPERMI2PS", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPERMI2D128", argLength: 3, reg: w31, asm: "VPERMI2D", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "VPERMI2DMasked128", argLength: 4, reg: w3kw, asm: "VPERMI2D", commutative: false, typ: "Vec128", resultInArg0: true}, {name: "VPERMI2DMasked128", argLength: 4, reg: w3kw, asm: "VPERMI2D", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "VPERMI2PSMasked128", argLength: 4, reg: w3kw, asm: "VPERMI2PS", commutative: false, typ: "Vec128", resultInArg0: true},
{name: "VPSRLD128", argLength: 2, reg: vfpv, asm: "VPSRLD", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPSRLD128", argLength: 2, reg: vfpv, asm: "VPSRLD", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPSRLDMasked128", argLength: 3, reg: wfpkw, asm: "VPSRLD", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPSRLDMasked128", argLength: 3, reg: wfpkw, asm: "VPSRLD", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPSRLVD128", argLength: 2, reg: v21, asm: "VPSRLVD", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPSRLVD128", argLength: 2, reg: v21, asm: "VPSRLVD", commutative: false, typ: "Vec128", resultInArg0: false},
@@ -705,12 +713,12 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPMINUD256", argLength: 2, reg: v21, asm: "VPMINUD", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VPMINUD256", argLength: 2, reg: v21, asm: "VPMINUD", commutative: true, typ: "Vec256", resultInArg0: false},
{name: "VPMINUDMasked256", argLength: 3, reg: w2kw, asm: "VPMINUD", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VPMINUDMasked256", argLength: 3, reg: w2kw, asm: "VPMINUD", commutative: true, typ: "Vec256", resultInArg0: false},
{name: "VPMULUDQ256", argLength: 2, reg: v21, asm: "VPMULUDQ", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VPMULUDQ256", argLength: 2, reg: v21, asm: "VPMULUDQ", commutative: true, typ: "Vec256", resultInArg0: false},
{name: "VPERMPS256", argLength: 2, reg: v21, asm: "VPERMPS", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPERMD256", argLength: 2, reg: v21, asm: "VPERMD", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPERMD256", argLength: 2, reg: v21, asm: "VPERMD", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPERMI2D256", argLength: 3, reg: w31, asm: "VPERMI2D", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPERMPS256", argLength: 2, reg: v21, asm: "VPERMPS", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPERMI2PS256", argLength: 3, reg: w31, asm: "VPERMI2PS", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPERMI2PS256", argLength: 3, reg: w31, asm: "VPERMI2PS", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPERMI2DMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2D", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPERMI2D256", argLength: 3, reg: w31, asm: "VPERMI2D", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPERMI2PSMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2PS", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPERMI2PSMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2PS", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPERMI2DMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2D", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPERMPSMasked256", argLength: 3, reg: w2kw, asm: "VPERMPS", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPERMPSMasked256", argLength: 3, reg: w2kw, asm: "VPERMPS", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPERMDMasked256", argLength: 3, reg: w2kw, asm: "VPERMD", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPERMDMasked256", argLength: 3, reg: w2kw, asm: "VPERMD", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPSRLD256", argLength: 2, reg: vfpv, asm: "VPSRLD", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPSRLD256", argLength: 2, reg: vfpv, asm: "VPSRLD", commutative: false, typ: "Vec256", resultInArg0: false},
@@ -735,10 +743,10 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPMINUQ256", argLength: 2, reg: w21, asm: "VPMINUQ", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VPMINUQ256", argLength: 2, reg: w21, asm: "VPMINUQ", commutative: true, typ: "Vec256", resultInArg0: false},
{name: "VPMINUQMasked256", argLength: 3, reg: w2kw, asm: "VPMINUQ", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VPMINUQMasked256", argLength: 3, reg: w2kw, asm: "VPMINUQ", commutative: true, typ: "Vec256", resultInArg0: false},
{name: "VPMULUDQMasked256", argLength: 3, reg: w2kw, asm: "VPMULUDQ", commutative: true, typ: "Vec256", resultInArg0: false}, {name: "VPMULUDQMasked256", argLength: 3, reg: w2kw, asm: "VPMULUDQ", commutative: true, typ: "Vec256", resultInArg0: false},
{name: "VPERMQ256", argLength: 2, reg: w21, asm: "VPERMQ", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPERMPD256", argLength: 2, reg: w21, asm: "VPERMPD", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPERMPD256", argLength: 2, reg: w21, asm: "VPERMPD", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPERMI2Q256", argLength: 3, reg: w31, asm: "VPERMI2Q", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPERMQ256", argLength: 2, reg: w21, asm: "VPERMQ", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPERMI2PD256", argLength: 3, reg: w31, asm: "VPERMI2PD", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPERMI2PD256", argLength: 3, reg: w31, asm: "VPERMI2PD", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPERMI2Q256", argLength: 3, reg: w31, asm: "VPERMI2Q", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPERMI2PDMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2PD", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPERMI2PDMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2PD", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPERMI2QMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2Q", commutative: false, typ: "Vec256", resultInArg0: true}, {name: "VPERMI2QMasked256", argLength: 4, reg: w3kw, asm: "VPERMI2Q", commutative: false, typ: "Vec256", resultInArg0: true},
{name: "VPERMQMasked256", argLength: 3, reg: w2kw, asm: "VPERMQ", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPERMQMasked256", argLength: 3, reg: w2kw, asm: "VPERMQ", commutative: false, typ: "Vec256", resultInArg0: false},
@ -759,8 +767,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPERMI2PD512", argLength: 3, reg: w31, asm: "VPERMI2PD", commutative: false, typ: "Vec512", resultInArg0: true}, {name: "VPERMI2PD512", argLength: 3, reg: w31, asm: "VPERMI2PD", commutative: false, typ: "Vec512", resultInArg0: true},
{name: "VPERMI2QMasked512", argLength: 4, reg: w3kw, asm: "VPERMI2Q", commutative: false, typ: "Vec512", resultInArg0: true}, {name: "VPERMI2QMasked512", argLength: 4, reg: w3kw, asm: "VPERMI2Q", commutative: false, typ: "Vec512", resultInArg0: true},
{name: "VPERMI2PDMasked512", argLength: 4, reg: w3kw, asm: "VPERMI2PD", commutative: false, typ: "Vec512", resultInArg0: true}, {name: "VPERMI2PDMasked512", argLength: 4, reg: w3kw, asm: "VPERMI2PD", commutative: false, typ: "Vec512", resultInArg0: true},
{name: "VPERMQMasked512", argLength: 3, reg: w2kw, asm: "VPERMQ", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPERMPDMasked512", argLength: 3, reg: w2kw, asm: "VPERMPD", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPERMPDMasked512", argLength: 3, reg: w2kw, asm: "VPERMPD", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPERMQMasked512", argLength: 3, reg: w2kw, asm: "VPERMQ", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPSRLQ512", argLength: 2, reg: wfpw, asm: "VPSRLQ", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPSRLQ512", argLength: 2, reg: wfpw, asm: "VPSRLQ", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPSRLQMasked512", argLength: 3, reg: wfpkw, asm: "VPSRLQ", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPSRLQMasked512", argLength: 3, reg: wfpkw, asm: "VPSRLQ", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPSRLVQ512", argLength: 2, reg: w21, asm: "VPSRLVQ", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPSRLVQ512", argLength: 2, reg: w21, asm: "VPSRLVQ", commutative: false, typ: "Vec512", resultInArg0: false},
@ -858,8 +866,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPSHLDWMasked256", argLength: 3, reg: w2kw, asm: "VPSHLDW", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPSHLDWMasked256", argLength: 3, reg: w2kw, asm: "VPSHLDW", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPSHRDW256", argLength: 2, reg: w21, asm: "VPSHRDW", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPSHRDW256", argLength: 2, reg: w21, asm: "VPSHRDW", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPSHRDWMasked256", argLength: 3, reg: w2kw, asm: "VPSHRDW", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPSHRDWMasked256", argLength: 3, reg: w2kw, asm: "VPSHRDW", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPCMPW512", argLength: 2, reg: w2k, asm: "VPCMPW", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPWMasked512", argLength: 3, reg: w2kk, asm: "VPCMPW", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, {name: "VPCMPWMasked512", argLength: 3, reg: w2kk, asm: "VPCMPW", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPW512", argLength: 2, reg: w2k, asm: "VPCMPW", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPSHLDW512", argLength: 2, reg: w21, asm: "VPSHLDW", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPSHLDW512", argLength: 2, reg: w21, asm: "VPSHLDW", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPSHLDWMasked512", argLength: 3, reg: w2kw, asm: "VPSHLDW", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPSHLDWMasked512", argLength: 3, reg: w2kw, asm: "VPSHLDW", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPSHRDW512", argLength: 2, reg: w21, asm: "VPSHRDW", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPSHRDW512", argLength: 2, reg: w21, asm: "VPSHRDW", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false},
@ -872,8 +880,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPSHLDWMasked128", argLength: 3, reg: w2kw, asm: "VPSHLDW", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPSHLDWMasked128", argLength: 3, reg: w2kw, asm: "VPSHLDW", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPSHRDW128", argLength: 2, reg: w21, asm: "VPSHRDW", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPSHRDW128", argLength: 2, reg: w21, asm: "VPSHRDW", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPSHRDWMasked128", argLength: 3, reg: w2kw, asm: "VPSHRDW", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPSHRDWMasked128", argLength: 3, reg: w2kw, asm: "VPSHRDW", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPCMPD512", argLength: 2, reg: w2k, asm: "VPCMPD", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPDMasked512", argLength: 3, reg: w2kk, asm: "VPCMPD", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, {name: "VPCMPDMasked512", argLength: 3, reg: w2kk, asm: "VPCMPD", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPD512", argLength: 2, reg: w2k, asm: "VPCMPD", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPROLD512", argLength: 1, reg: w11, asm: "VPROLD", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPROLD512", argLength: 1, reg: w11, asm: "VPROLD", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPROLDMasked512", argLength: 2, reg: wkw, asm: "VPROLD", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPROLDMasked512", argLength: 2, reg: wkw, asm: "VPROLD", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPRORD512", argLength: 1, reg: w11, asm: "VPRORD", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPRORD512", argLength: 1, reg: w11, asm: "VPRORD", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false},
@ -926,8 +934,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPSHLDQMasked256", argLength: 3, reg: w2kw, asm: "VPSHLDQ", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPSHLDQMasked256", argLength: 3, reg: w2kw, asm: "VPSHLDQ", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPSHRDQ256", argLength: 2, reg: w21, asm: "VPSHRDQ", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPSHRDQ256", argLength: 2, reg: w21, asm: "VPSHRDQ", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPSHRDQMasked256", argLength: 3, reg: w2kw, asm: "VPSHRDQ", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPSHRDQMasked256", argLength: 3, reg: w2kw, asm: "VPSHRDQ", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPCMPQ512", argLength: 2, reg: w2k, asm: "VPCMPQ", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPQMasked512", argLength: 3, reg: w2kk, asm: "VPCMPQ", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, {name: "VPCMPQMasked512", argLength: 3, reg: w2kk, asm: "VPCMPQ", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPQ512", argLength: 2, reg: w2k, asm: "VPCMPQ", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPROLQ512", argLength: 1, reg: w11, asm: "VPROLQ", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPROLQ512", argLength: 1, reg: w11, asm: "VPROLQ", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPROLQMasked512", argLength: 2, reg: wkw, asm: "VPROLQ", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPROLQMasked512", argLength: 2, reg: wkw, asm: "VPROLQ", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPRORQ512", argLength: 1, reg: w11, asm: "VPRORQ", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VPRORQ512", argLength: 1, reg: w11, asm: "VPRORQ", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false},
@ -944,16 +952,16 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VEXTRACTI128128", argLength: 1, reg: v11, asm: "VEXTRACTI128", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VEXTRACTI128128", argLength: 1, reg: v11, asm: "VEXTRACTI128", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPCMPB256", argLength: 2, reg: w2k, asm: "VPCMPB", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false}, {name: "VPCMPB256", argLength: 2, reg: w2k, asm: "VPCMPB", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VINSERTI128256", argLength: 2, reg: v21, asm: "VINSERTI128", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VINSERTI128256", argLength: 2, reg: v21, asm: "VINSERTI128", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPCMPB512", argLength: 2, reg: w2k, asm: "VPCMPB", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPBMasked512", argLength: 3, reg: w2kk, asm: "VPCMPB", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, {name: "VPCMPBMasked512", argLength: 3, reg: w2kk, asm: "VPCMPB", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPB512", argLength: 2, reg: w2k, asm: "VPCMPB", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPCMPUWMasked256", argLength: 3, reg: w2kk, asm: "VPCMPUW", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, {name: "VPCMPUWMasked256", argLength: 3, reg: w2kk, asm: "VPCMPUW", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPUW256", argLength: 2, reg: w2k, asm: "VPCMPUW", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false}, {name: "VPCMPUW256", argLength: 2, reg: w2k, asm: "VPCMPUW", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPCMPUW512", argLength: 2, reg: w2k, asm: "VPCMPUW", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPUWMasked512", argLength: 3, reg: w2kk, asm: "VPCMPUW", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, {name: "VPCMPUWMasked512", argLength: 3, reg: w2kk, asm: "VPCMPUW", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPUW512", argLength: 2, reg: w2k, asm: "VPCMPUW", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPCMPUWMasked128", argLength: 3, reg: w2kk, asm: "VPCMPUW", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, {name: "VPCMPUWMasked128", argLength: 3, reg: w2kk, asm: "VPCMPUW", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPUW128", argLength: 2, reg: w2k, asm: "VPCMPUW", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false}, {name: "VPCMPUW128", argLength: 2, reg: w2k, asm: "VPCMPUW", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPCMPUD512", argLength: 2, reg: w2k, asm: "VPCMPUD", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPUDMasked512", argLength: 3, reg: w2kk, asm: "VPCMPUD", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, {name: "VPCMPUDMasked512", argLength: 3, reg: w2kk, asm: "VPCMPUD", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPUD512", argLength: 2, reg: w2k, asm: "VPCMPUD", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPCMPUDMasked128", argLength: 3, reg: w2kk, asm: "VPCMPUD", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, {name: "VPCMPUDMasked128", argLength: 3, reg: w2kk, asm: "VPCMPUD", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPUD128", argLength: 2, reg: w2k, asm: "VPCMPUD", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false}, {name: "VPCMPUD128", argLength: 2, reg: w2k, asm: "VPCMPUD", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPCMPUDMasked256", argLength: 3, reg: w2kk, asm: "VPCMPUD", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, {name: "VPCMPUDMasked256", argLength: 3, reg: w2kk, asm: "VPCMPUD", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
@ -962,8 +970,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPCMPUQ128", argLength: 2, reg: w2k, asm: "VPCMPUQ", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false}, {name: "VPCMPUQ128", argLength: 2, reg: w2k, asm: "VPCMPUQ", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPCMPUQMasked256", argLength: 3, reg: w2kk, asm: "VPCMPUQ", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, {name: "VPCMPUQMasked256", argLength: 3, reg: w2kk, asm: "VPCMPUQ", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPUQ256", argLength: 2, reg: w2k, asm: "VPCMPUQ", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false}, {name: "VPCMPUQ256", argLength: 2, reg: w2k, asm: "VPCMPUQ", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPCMPUQ512", argLength: 2, reg: w2k, asm: "VPCMPUQ", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPUQMasked512", argLength: 3, reg: w2kk, asm: "VPCMPUQ", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, {name: "VPCMPUQMasked512", argLength: 3, reg: w2kk, asm: "VPCMPUQ", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPUQ512", argLength: 2, reg: w2k, asm: "VPCMPUQ", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPCMPUBMasked128", argLength: 3, reg: w2kk, asm: "VPCMPUB", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, {name: "VPCMPUBMasked128", argLength: 3, reg: w2kk, asm: "VPCMPUB", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VGF2P8AFFINEQB128", argLength: 2, reg: w21, asm: "VGF2P8AFFINEQB", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VGF2P8AFFINEQB128", argLength: 2, reg: w21, asm: "VGF2P8AFFINEQB", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VGF2P8AFFINEINVQB128", argLength: 2, reg: w21, asm: "VGF2P8AFFINEINVQB", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VGF2P8AFFINEINVQB128", argLength: 2, reg: w21, asm: "VGF2P8AFFINEINVQB", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false},
@ -976,11 +984,11 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VGF2P8AFFINEINVQBMasked256", argLength: 3, reg: w2kw, asm: "VGF2P8AFFINEINVQB", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VGF2P8AFFINEINVQBMasked256", argLength: 3, reg: w2kw, asm: "VGF2P8AFFINEINVQB", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VGF2P8AFFINEQBMasked256", argLength: 3, reg: w2kw, asm: "VGF2P8AFFINEQB", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VGF2P8AFFINEQBMasked256", argLength: 3, reg: w2kw, asm: "VGF2P8AFFINEQB", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPCMPUB256", argLength: 2, reg: w2k, asm: "VPCMPUB", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false}, {name: "VPCMPUB256", argLength: 2, reg: w2k, asm: "VPCMPUB", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
{name: "VPCMPUB512", argLength: 2, reg: w2k, asm: "VPCMPUB", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VPCMPUBMasked512", argLength: 3, reg: w2kk, asm: "VPCMPUB", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false}, {name: "VPCMPUBMasked512", argLength: 3, reg: w2kk, asm: "VPCMPUB", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
{name: "VGF2P8AFFINEQB512", argLength: 2, reg: w21, asm: "VGF2P8AFFINEQB", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VGF2P8AFFINEQB512", argLength: 2, reg: w21, asm: "VGF2P8AFFINEQB", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VGF2P8AFFINEINVQB512", argLength: 2, reg: w21, asm: "VGF2P8AFFINEINVQB", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VGF2P8AFFINEINVQB512", argLength: 2, reg: w21, asm: "VGF2P8AFFINEINVQB", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VGF2P8AFFINEINVQBMasked512", argLength: 3, reg: w2kw, asm: "VGF2P8AFFINEINVQB", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VGF2P8AFFINEINVQBMasked512", argLength: 3, reg: w2kw, asm: "VGF2P8AFFINEINVQB", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VGF2P8AFFINEQBMasked512", argLength: 3, reg: w2kw, asm: "VGF2P8AFFINEQB", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false}, {name: "VGF2P8AFFINEQBMasked512", argLength: 3, reg: w2kw, asm: "VGF2P8AFFINEQB", aux: "Int8", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPCMPUB512", argLength: 2, reg: w2k, asm: "VPCMPUB", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
} }
} }

View file

@ -912,10 +912,10 @@ func simdGenericOps() []opData {
{name: "PermuteUint16x16", argLength: 2, commutative: false}, {name: "PermuteUint16x16", argLength: 2, commutative: false},
{name: "Permute2Uint16x16", argLength: 3, commutative: false}, {name: "Permute2Uint16x16", argLength: 3, commutative: false},
{name: "Permute2Int16x16", argLength: 3, commutative: false}, {name: "Permute2Int16x16", argLength: 3, commutative: false},
{name: "Permute2MaskedInt16x16", argLength: 4, commutative: false},
{name: "Permute2MaskedUint16x16", argLength: 4, commutative: false}, {name: "Permute2MaskedUint16x16", argLength: 4, commutative: false},
{name: "PermuteMaskedUint16x16", argLength: 3, commutative: false}, {name: "Permute2MaskedInt16x16", argLength: 4, commutative: false},
{name: "PermuteMaskedInt16x16", argLength: 3, commutative: false}, {name: "PermuteMaskedInt16x16", argLength: 3, commutative: false},
{name: "PermuteMaskedUint16x16", argLength: 3, commutative: false},
{name: "PopCountUint16x16", argLength: 1, commutative: false}, {name: "PopCountUint16x16", argLength: 1, commutative: false},
{name: "PopCountMaskedUint16x16", argLength: 2, commutative: false}, {name: "PopCountMaskedUint16x16", argLength: 2, commutative: false},
{name: "SaturatedAddUint16x16", argLength: 2, commutative: true}, {name: "SaturatedAddUint16x16", argLength: 2, commutative: true},
@ -966,8 +966,8 @@ func simdGenericOps() []opData {
{name: "Permute2Int16x32", argLength: 3, commutative: false}, {name: "Permute2Int16x32", argLength: 3, commutative: false},
{name: "Permute2MaskedUint16x32", argLength: 4, commutative: false}, {name: "Permute2MaskedUint16x32", argLength: 4, commutative: false},
{name: "Permute2MaskedInt16x32", argLength: 4, commutative: false}, {name: "Permute2MaskedInt16x32", argLength: 4, commutative: false},
{name: "PermuteMaskedUint16x32", argLength: 3, commutative: false},
{name: "PermuteMaskedInt16x32", argLength: 3, commutative: false}, {name: "PermuteMaskedInt16x32", argLength: 3, commutative: false},
{name: "PermuteMaskedUint16x32", argLength: 3, commutative: false},
{name: "PopCountUint16x32", argLength: 1, commutative: false}, {name: "PopCountUint16x32", argLength: 1, commutative: false},
{name: "PopCountMaskedUint16x32", argLength: 2, commutative: false}, {name: "PopCountMaskedUint16x32", argLength: 2, commutative: false},
{name: "SaturatedAddUint16x32", argLength: 2, commutative: true}, {name: "SaturatedAddUint16x32", argLength: 2, commutative: true},
@ -1018,12 +1018,12 @@ func simdGenericOps() []opData {
{name: "PairwiseSubUint16x8", argLength: 2, commutative: false}, {name: "PairwiseSubUint16x8", argLength: 2, commutative: false},
{name: "PermuteInt16x8", argLength: 2, commutative: false}, {name: "PermuteInt16x8", argLength: 2, commutative: false},
{name: "PermuteUint16x8", argLength: 2, commutative: false}, {name: "PermuteUint16x8", argLength: 2, commutative: false},
{name: "Permute2Int16x8", argLength: 3, commutative: false},
{name: "Permute2Uint16x8", argLength: 3, commutative: false}, {name: "Permute2Uint16x8", argLength: 3, commutative: false},
{name: "Permute2Int16x8", argLength: 3, commutative: false},
{name: "Permute2MaskedInt16x8", argLength: 4, commutative: false}, {name: "Permute2MaskedInt16x8", argLength: 4, commutative: false},
{name: "Permute2MaskedUint16x8", argLength: 4, commutative: false}, {name: "Permute2MaskedUint16x8", argLength: 4, commutative: false},
{name: "PermuteMaskedUint16x8", argLength: 3, commutative: false},
{name: "PermuteMaskedInt16x8", argLength: 3, commutative: false}, {name: "PermuteMaskedInt16x8", argLength: 3, commutative: false},
{name: "PermuteMaskedUint16x8", argLength: 3, commutative: false},
{name: "PopCountUint16x8", argLength: 1, commutative: false}, {name: "PopCountUint16x8", argLength: 1, commutative: false},
{name: "PopCountMaskedUint16x8", argLength: 2, commutative: false}, {name: "PopCountMaskedUint16x8", argLength: 2, commutative: false},
{name: "SaturatedAddUint16x8", argLength: 2, commutative: true}, {name: "SaturatedAddUint16x8", argLength: 2, commutative: true},
@ -1070,17 +1070,17 @@ func simdGenericOps() []opData {
{name: "NotEqualMaskedUint32x16", argLength: 3, commutative: true}, {name: "NotEqualMaskedUint32x16", argLength: 3, commutative: true},
{name: "OrUint32x16", argLength: 2, commutative: true}, {name: "OrUint32x16", argLength: 2, commutative: true},
{name: "OrMaskedUint32x16", argLength: 3, commutative: true}, {name: "OrMaskedUint32x16", argLength: 3, commutative: true},
{name: "PermuteFloat32x16", argLength: 2, commutative: false},
{name: "PermuteInt32x16", argLength: 2, commutative: false}, {name: "PermuteInt32x16", argLength: 2, commutative: false},
{name: "PermuteFloat32x16", argLength: 2, commutative: false},
{name: "PermuteUint32x16", argLength: 2, commutative: false}, {name: "PermuteUint32x16", argLength: 2, commutative: false},
{name: "Permute2Uint32x16", argLength: 3, commutative: false}, {name: "Permute2Uint32x16", argLength: 3, commutative: false},
{name: "Permute2Float32x16", argLength: 3, commutative: false}, {name: "Permute2Float32x16", argLength: 3, commutative: false},
{name: "Permute2Int32x16", argLength: 3, commutative: false}, {name: "Permute2Int32x16", argLength: 3, commutative: false},
{name: "Permute2MaskedUint32x16", argLength: 4, commutative: false},
{name: "Permute2MaskedInt32x16", argLength: 4, commutative: false}, {name: "Permute2MaskedInt32x16", argLength: 4, commutative: false},
{name: "Permute2MaskedFloat32x16", argLength: 4, commutative: false}, {name: "Permute2MaskedFloat32x16", argLength: 4, commutative: false},
{name: "Permute2MaskedUint32x16", argLength: 4, commutative: false},
{name: "PermuteMaskedInt32x16", argLength: 3, commutative: false},
{name: "PermuteMaskedFloat32x16", argLength: 3, commutative: false}, {name: "PermuteMaskedFloat32x16", argLength: 3, commutative: false},
{name: "PermuteMaskedInt32x16", argLength: 3, commutative: false},
{name: "PermuteMaskedUint32x16", argLength: 3, commutative: false}, {name: "PermuteMaskedUint32x16", argLength: 3, commutative: false},
{name: "PopCountUint32x16", argLength: 1, commutative: false}, {name: "PopCountUint32x16", argLength: 1, commutative: false},
{name: "PopCountMaskedUint32x16", argLength: 2, commutative: false}, {name: "PopCountMaskedUint32x16", argLength: 2, commutative: false},
@ -1307,15 +1307,15 @@ func simdGenericOps() []opData {
{name: "PermuteUint64x4", argLength: 2, commutative: false}, {name: "PermuteUint64x4", argLength: 2, commutative: false},
{name: "PermuteInt64x4", argLength: 2, commutative: false}, {name: "PermuteInt64x4", argLength: 2, commutative: false},
{name: "PermuteFloat64x4", argLength: 2, commutative: false}, {name: "PermuteFloat64x4", argLength: 2, commutative: false},
{name: "Permute2Float64x4", argLength: 3, commutative: false},
{name: "Permute2Int64x4", argLength: 3, commutative: false},
{name: "Permute2Uint64x4", argLength: 3, commutative: false}, {name: "Permute2Uint64x4", argLength: 3, commutative: false},
{name: "Permute2MaskedFloat64x4", argLength: 4, commutative: false}, {name: "Permute2Int64x4", argLength: 3, commutative: false},
{name: "Permute2Float64x4", argLength: 3, commutative: false},
{name: "Permute2MaskedUint64x4", argLength: 4, commutative: false}, {name: "Permute2MaskedUint64x4", argLength: 4, commutative: false},
{name: "Permute2MaskedFloat64x4", argLength: 4, commutative: false},
{name: "Permute2MaskedInt64x4", argLength: 4, commutative: false}, {name: "Permute2MaskedInt64x4", argLength: 4, commutative: false},
{name: "PermuteMaskedUint64x4", argLength: 3, commutative: false},
{name: "PermuteMaskedFloat64x4", argLength: 3, commutative: false}, {name: "PermuteMaskedFloat64x4", argLength: 3, commutative: false},
{name: "PermuteMaskedInt64x4", argLength: 3, commutative: false}, {name: "PermuteMaskedInt64x4", argLength: 3, commutative: false},
{name: "PermuteMaskedUint64x4", argLength: 3, commutative: false},
{name: "PopCountUint64x4", argLength: 1, commutative: false}, {name: "PopCountUint64x4", argLength: 1, commutative: false},
{name: "PopCountMaskedUint64x4", argLength: 2, commutative: false}, {name: "PopCountMaskedUint64x4", argLength: 2, commutative: false},
{name: "RotateLeftUint64x4", argLength: 2, commutative: false}, {name: "RotateLeftUint64x4", argLength: 2, commutative: false},
@ -1365,18 +1365,18 @@ func simdGenericOps() []opData {
{name: "NotEqualMaskedUint64x8", argLength: 3, commutative: true}, {name: "NotEqualMaskedUint64x8", argLength: 3, commutative: true},
{name: "OrUint64x8", argLength: 2, commutative: true}, {name: "OrUint64x8", argLength: 2, commutative: true},
{name: "OrMaskedUint64x8", argLength: 3, commutative: true}, {name: "OrMaskedUint64x8", argLength: 3, commutative: true},
{name: "PermuteUint64x8", argLength: 2, commutative: false},
{name: "PermuteFloat64x8", argLength: 2, commutative: false}, {name: "PermuteFloat64x8", argLength: 2, commutative: false},
{name: "PermuteInt64x8", argLength: 2, commutative: false}, {name: "PermuteInt64x8", argLength: 2, commutative: false},
{name: "PermuteUint64x8", argLength: 2, commutative: false},
{name: "Permute2Int64x8", argLength: 3, commutative: false},
{name: "Permute2Float64x8", argLength: 3, commutative: false}, {name: "Permute2Float64x8", argLength: 3, commutative: false},
{name: "Permute2Uint64x8", argLength: 3, commutative: false}, {name: "Permute2Uint64x8", argLength: 3, commutative: false},
{name: "Permute2Int64x8", argLength: 3, commutative: false},
{name: "Permute2MaskedFloat64x8", argLength: 4, commutative: false},
{name: "Permute2MaskedUint64x8", argLength: 4, commutative: false}, {name: "Permute2MaskedUint64x8", argLength: 4, commutative: false},
{name: "Permute2MaskedInt64x8", argLength: 4, commutative: false}, {name: "Permute2MaskedInt64x8", argLength: 4, commutative: false},
{name: "Permute2MaskedFloat64x8", argLength: 4, commutative: false},
{name: "PermuteMaskedUint64x8", argLength: 3, commutative: false},
{name: "PermuteMaskedFloat64x8", argLength: 3, commutative: false},
{name: "PermuteMaskedInt64x8", argLength: 3, commutative: false}, {name: "PermuteMaskedInt64x8", argLength: 3, commutative: false},
{name: "PermuteMaskedFloat64x8", argLength: 3, commutative: false},
{name: "PermuteMaskedUint64x8", argLength: 3, commutative: false},
{name: "PopCountUint64x8", argLength: 1, commutative: false}, {name: "PopCountUint64x8", argLength: 1, commutative: false},
{name: "PopCountMaskedUint64x8", argLength: 2, commutative: false}, {name: "PopCountMaskedUint64x8", argLength: 2, commutative: false},
{name: "RotateLeftUint64x8", argLength: 2, commutative: false}, {name: "RotateLeftUint64x8", argLength: 2, commutative: false},

File diff suppressed because it is too large Load diff

View file

@ -985,10 +985,10 @@ func rewriteValueAMD64(v *Value) bool {
v.Op = OpAMD64VRCP14PS512 v.Op = OpAMD64VRCP14PS512
return true return true
case OpApproximateReciprocalFloat32x4: case OpApproximateReciprocalFloat32x4:
v.Op = OpAMD64VRCP14PS128 v.Op = OpAMD64VRCPPS128
return true return true
case OpApproximateReciprocalFloat32x8: case OpApproximateReciprocalFloat32x8:
v.Op = OpAMD64VRCP14PS256 v.Op = OpAMD64VRCPPS256
return true return true
case OpApproximateReciprocalFloat64x2: case OpApproximateReciprocalFloat64x2:
v.Op = OpAMD64VRCP14PD128 v.Op = OpAMD64VRCP14PD128
@ -5184,6 +5184,30 @@ func rewriteValueAMD64(v *Value) bool {
return true return true
case OpStore: case OpStore:
return rewriteValueAMD64_OpStore(v) return rewriteValueAMD64_OpStore(v)
case OpStoreMask16x16:
return rewriteValueAMD64_OpStoreMask16x16(v)
case OpStoreMask16x32:
return rewriteValueAMD64_OpStoreMask16x32(v)
case OpStoreMask16x8:
return rewriteValueAMD64_OpStoreMask16x8(v)
case OpStoreMask32x16:
return rewriteValueAMD64_OpStoreMask32x16(v)
case OpStoreMask32x4:
return rewriteValueAMD64_OpStoreMask32x4(v)
case OpStoreMask32x8:
return rewriteValueAMD64_OpStoreMask32x8(v)
case OpStoreMask64x2:
return rewriteValueAMD64_OpStoreMask64x2(v)
case OpStoreMask64x4:
return rewriteValueAMD64_OpStoreMask64x4(v)
case OpStoreMask64x8:
return rewriteValueAMD64_OpStoreMask64x8(v)
case OpStoreMask8x16:
return rewriteValueAMD64_OpStoreMask8x16(v)
case OpStoreMask8x32:
return rewriteValueAMD64_OpStoreMask8x32(v)
case OpStoreMask8x64:
return rewriteValueAMD64_OpStoreMask8x64(v)
case OpSub16: case OpSub16:
v.Op = OpAMD64SUBL v.Op = OpAMD64SUBL
return true return true
@ -33388,13 +33412,12 @@ func rewriteValueAMD64_OpEqualInt16x32(v *Value) bool {
b := v.Block b := v.Block
typ := &b.Func.Config.Types typ := &b.Func.Config.Types
// match: (EqualInt16x32 x y) // match: (EqualInt16x32 x y)
// result: (VPMOVMToVec16x32 (VPCMPW512 [0] x y)) // result: (VPMOVMToVec16x32 (VPCMPEQW512 x y))
for { for {
x := v_0 x := v_0
y := v_1 y := v_1
v.reset(OpAMD64VPMOVMToVec16x32) v.reset(OpAMD64VPMOVMToVec16x32)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPW512, typ.Mask) v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQW512, typ.Mask)
v0.AuxInt = int8ToAuxInt(0)
v0.AddArg2(x, y) v0.AddArg2(x, y)
v.AddArg(v0) v.AddArg(v0)
return true return true
@ -33406,13 +33429,12 @@ func rewriteValueAMD64_OpEqualInt32x16(v *Value) bool {
b := v.Block b := v.Block
typ := &b.Func.Config.Types typ := &b.Func.Config.Types
// match: (EqualInt32x16 x y) // match: (EqualInt32x16 x y)
// result: (VPMOVMToVec32x16 (VPCMPD512 [0] x y)) // result: (VPMOVMToVec32x16 (VPCMPEQD512 x y))
for { for {
x := v_0 x := v_0
y := v_1 y := v_1
v.reset(OpAMD64VPMOVMToVec32x16) v.reset(OpAMD64VPMOVMToVec32x16)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPD512, typ.Mask) v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQD512, typ.Mask)
v0.AuxInt = int8ToAuxInt(0)
v0.AddArg2(x, y) v0.AddArg2(x, y)
v.AddArg(v0) v.AddArg(v0)
return true return true
@ -33424,13 +33446,12 @@ func rewriteValueAMD64_OpEqualInt64x8(v *Value) bool {
b := v.Block b := v.Block
typ := &b.Func.Config.Types typ := &b.Func.Config.Types
// match: (EqualInt64x8 x y) // match: (EqualInt64x8 x y)
// result: (VPMOVMToVec64x8 (VPCMPQ512 [0] x y)) // result: (VPMOVMToVec64x8 (VPCMPEQQ512 x y))
for { for {
x := v_0 x := v_0
y := v_1 y := v_1
v.reset(OpAMD64VPMOVMToVec64x8) v.reset(OpAMD64VPMOVMToVec64x8)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPQ512, typ.Mask) v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQQ512, typ.Mask)
v0.AuxInt = int8ToAuxInt(0)
v0.AddArg2(x, y) v0.AddArg2(x, y)
v.AddArg(v0) v.AddArg(v0)
return true return true
@ -33442,13 +33463,12 @@ func rewriteValueAMD64_OpEqualInt8x64(v *Value) bool {
b := v.Block b := v.Block
typ := &b.Func.Config.Types typ := &b.Func.Config.Types
// match: (EqualInt8x64 x y) // match: (EqualInt8x64 x y)
// result: (VPMOVMToVec8x64 (VPCMPB512 [0] x y)) // result: (VPMOVMToVec8x64 (VPCMPEQB512 x y))
for { for {
x := v_0 x := v_0
y := v_1 y := v_1
v.reset(OpAMD64VPMOVMToVec8x64) v.reset(OpAMD64VPMOVMToVec8x64)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPB512, typ.Mask) v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQB512, typ.Mask)
v0.AuxInt = int8ToAuxInt(0)
v0.AddArg2(x, y) v0.AddArg2(x, y)
v.AddArg(v0) v.AddArg(v0)
return true return true
@ -34120,13 +34140,12 @@ func rewriteValueAMD64_OpEqualUint16x32(v *Value) bool {
b := v.Block b := v.Block
typ := &b.Func.Config.Types typ := &b.Func.Config.Types
// match: (EqualUint16x32 x y) // match: (EqualUint16x32 x y)
// result: (VPMOVMToVec16x32 (VPCMPUW512 [0] x y)) // result: (VPMOVMToVec16x32 (VPCMPEQW512 x y))
for { for {
x := v_0 x := v_0
y := v_1 y := v_1
v.reset(OpAMD64VPMOVMToVec16x32) v.reset(OpAMD64VPMOVMToVec16x32)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPUW512, typ.Mask) v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQW512, typ.Mask)
v0.AuxInt = int8ToAuxInt(0)
v0.AddArg2(x, y) v0.AddArg2(x, y)
v.AddArg(v0) v.AddArg(v0)
return true return true
@ -34138,13 +34157,12 @@ func rewriteValueAMD64_OpEqualUint32x16(v *Value) bool {
b := v.Block b := v.Block
typ := &b.Func.Config.Types typ := &b.Func.Config.Types
// match: (EqualUint32x16 x y) // match: (EqualUint32x16 x y)
// result: (VPMOVMToVec32x16 (VPCMPUD512 [0] x y)) // result: (VPMOVMToVec32x16 (VPCMPEQD512 x y))
for { for {
x := v_0 x := v_0
y := v_1 y := v_1
v.reset(OpAMD64VPMOVMToVec32x16) v.reset(OpAMD64VPMOVMToVec32x16)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPUD512, typ.Mask) v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQD512, typ.Mask)
v0.AuxInt = int8ToAuxInt(0)
v0.AddArg2(x, y) v0.AddArg2(x, y)
v.AddArg(v0) v.AddArg(v0)
return true return true
@ -34156,13 +34174,12 @@ func rewriteValueAMD64_OpEqualUint64x8(v *Value) bool {
b := v.Block b := v.Block
typ := &b.Func.Config.Types typ := &b.Func.Config.Types
// match: (EqualUint64x8 x y) // match: (EqualUint64x8 x y)
// result: (VPMOVMToVec64x8 (VPCMPUQ512 [0] x y)) // result: (VPMOVMToVec64x8 (VPCMPEQQ512 x y))
for { for {
x := v_0 x := v_0
y := v_1 y := v_1
v.reset(OpAMD64VPMOVMToVec64x8) v.reset(OpAMD64VPMOVMToVec64x8)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPUQ512, typ.Mask) v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQQ512, typ.Mask)
v0.AuxInt = int8ToAuxInt(0)
v0.AddArg2(x, y) v0.AddArg2(x, y)
v.AddArg(v0) v.AddArg(v0)
return true return true
@ -34174,13 +34191,12 @@ func rewriteValueAMD64_OpEqualUint8x64(v *Value) bool {
b := v.Block b := v.Block
typ := &b.Func.Config.Types typ := &b.Func.Config.Types
// match: (EqualUint8x64 x y) // match: (EqualUint8x64 x y)
// result: (VPMOVMToVec8x64 (VPCMPUB512 [0] x y)) // result: (VPMOVMToVec8x64 (VPCMPEQB512 x y))
for { for {
x := v_0 x := v_0
y := v_1 y := v_1
v.reset(OpAMD64VPMOVMToVec8x64) v.reset(OpAMD64VPMOVMToVec8x64)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPUB512, typ.Mask) v0 := b.NewValue0(v.Pos, OpAMD64VPCMPEQB512, typ.Mask)
v0.AuxInt = int8ToAuxInt(0)
v0.AddArg2(x, y) v0.AddArg2(x, y)
v.AddArg(v0) v.AddArg(v0)
return true return true
@ -36279,13 +36295,12 @@ func rewriteValueAMD64_OpGreaterInt16x32(v *Value) bool {
b := v.Block b := v.Block
typ := &b.Func.Config.Types typ := &b.Func.Config.Types
// match: (GreaterInt16x32 x y) // match: (GreaterInt16x32 x y)
// result: (VPMOVMToVec16x32 (VPCMPW512 [14] x y)) // result: (VPMOVMToVec16x32 (VPCMPGTW512 x y))
for { for {
x := v_0 x := v_0
y := v_1 y := v_1
v.reset(OpAMD64VPMOVMToVec16x32) v.reset(OpAMD64VPMOVMToVec16x32)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPW512, typ.Mask) v0 := b.NewValue0(v.Pos, OpAMD64VPCMPGTW512, typ.Mask)
v0.AuxInt = int8ToAuxInt(14)
v0.AddArg2(x, y) v0.AddArg2(x, y)
v.AddArg(v0) v.AddArg(v0)
return true return true
@ -36297,13 +36312,12 @@ func rewriteValueAMD64_OpGreaterInt32x16(v *Value) bool {
b := v.Block b := v.Block
typ := &b.Func.Config.Types typ := &b.Func.Config.Types
// match: (GreaterInt32x16 x y) // match: (GreaterInt32x16 x y)
// result: (VPMOVMToVec32x16 (VPCMPD512 [14] x y)) // result: (VPMOVMToVec32x16 (VPCMPGTD512 x y))
for { for {
x := v_0 x := v_0
y := v_1 y := v_1
v.reset(OpAMD64VPMOVMToVec32x16) v.reset(OpAMD64VPMOVMToVec32x16)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPD512, typ.Mask) v0 := b.NewValue0(v.Pos, OpAMD64VPCMPGTD512, typ.Mask)
v0.AuxInt = int8ToAuxInt(14)
v0.AddArg2(x, y) v0.AddArg2(x, y)
v.AddArg(v0) v.AddArg(v0)
return true return true
@ -36315,13 +36329,12 @@ func rewriteValueAMD64_OpGreaterInt64x8(v *Value) bool {
b := v.Block b := v.Block
typ := &b.Func.Config.Types typ := &b.Func.Config.Types
// match: (GreaterInt64x8 x y) // match: (GreaterInt64x8 x y)
// result: (VPMOVMToVec64x8 (VPCMPQ512 [14] x y)) // result: (VPMOVMToVec64x8 (VPCMPGTQ512 x y))
for { for {
x := v_0 x := v_0
y := v_1 y := v_1
v.reset(OpAMD64VPMOVMToVec64x8) v.reset(OpAMD64VPMOVMToVec64x8)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPQ512, typ.Mask) v0 := b.NewValue0(v.Pos, OpAMD64VPCMPGTQ512, typ.Mask)
v0.AuxInt = int8ToAuxInt(14)
v0.AddArg2(x, y) v0.AddArg2(x, y)
v.AddArg(v0) v.AddArg(v0)
return true return true
@ -36333,13 +36346,12 @@ func rewriteValueAMD64_OpGreaterInt8x64(v *Value) bool {
b := v.Block b := v.Block
typ := &b.Func.Config.Types typ := &b.Func.Config.Types
// match: (GreaterInt8x64 x y) // match: (GreaterInt8x64 x y)
// result: (VPMOVMToVec8x64 (VPCMPB512 [14] x y)) // result: (VPMOVMToVec8x64 (VPCMPGTB512 x y))
for { for {
x := v_0 x := v_0
y := v_1 y := v_1
v.reset(OpAMD64VPMOVMToVec8x64) v.reset(OpAMD64VPMOVMToVec8x64)
v0 := b.NewValue0(v.Pos, OpAMD64VPCMPB512, typ.Mask) v0 := b.NewValue0(v.Pos, OpAMD64VPCMPGTB512, typ.Mask)
v0.AuxInt = int8ToAuxInt(14)
v0.AddArg2(x, y) v0.AddArg2(x, y)
v.AddArg(v0) v.AddArg(v0)
return true return true
@ -53277,6 +53289,234 @@ func rewriteValueAMD64_OpStore(v *Value) bool {
} }
return false return false
} }
// rewriteValueAMD64_OpStoreMask16x16 lowers StoreMask16x16 to a KMOVQstore of
// the vector-shaped mask converted into a K register.
func rewriteValueAMD64_OpStoreMask16x16(v *Value) bool {
	// match: (StoreMask16x16 {t} ptr val mem)
	// result: (KMOVQstore ptr (VPMOVVec16x16ToM <t> val) mem)
	ptr, val, mem := v.Args[0], v.Args[1], v.Args[2]
	t := auxToType(v.Aux) // read Aux before reset clears it
	blk := v.Block
	v.reset(OpAMD64KMOVQstore)
	mask := blk.NewValue0(v.Pos, OpAMD64VPMOVVec16x16ToM, t)
	mask.AddArg(val)
	v.AddArg3(ptr, mask, mem)
	return true
}
// rewriteValueAMD64_OpStoreMask16x32 lowers StoreMask16x32 to a KMOVQstore of
// the vector-shaped mask converted into a K register.
func rewriteValueAMD64_OpStoreMask16x32(v *Value) bool {
	// match: (StoreMask16x32 {t} ptr val mem)
	// result: (KMOVQstore ptr (VPMOVVec16x32ToM <t> val) mem)
	ptr, val, mem := v.Args[0], v.Args[1], v.Args[2]
	t := auxToType(v.Aux) // read Aux before reset clears it
	blk := v.Block
	v.reset(OpAMD64KMOVQstore)
	mask := blk.NewValue0(v.Pos, OpAMD64VPMOVVec16x32ToM, t)
	mask.AddArg(val)
	v.AddArg3(ptr, mask, mem)
	return true
}
// rewriteValueAMD64_OpStoreMask16x8 lowers StoreMask16x8 to a KMOVQstore of
// the vector-shaped mask converted into a K register.
func rewriteValueAMD64_OpStoreMask16x8(v *Value) bool {
	// match: (StoreMask16x8 {t} ptr val mem)
	// result: (KMOVQstore ptr (VPMOVVec16x8ToM <t> val) mem)
	ptr, val, mem := v.Args[0], v.Args[1], v.Args[2]
	t := auxToType(v.Aux) // read Aux before reset clears it
	blk := v.Block
	v.reset(OpAMD64KMOVQstore)
	mask := blk.NewValue0(v.Pos, OpAMD64VPMOVVec16x8ToM, t)
	mask.AddArg(val)
	v.AddArg3(ptr, mask, mem)
	return true
}
// rewriteValueAMD64_OpStoreMask32x16 lowers StoreMask32x16 to a KMOVQstore of
// the vector-shaped mask converted into a K register.
func rewriteValueAMD64_OpStoreMask32x16(v *Value) bool {
	// match: (StoreMask32x16 {t} ptr val mem)
	// result: (KMOVQstore ptr (VPMOVVec32x16ToM <t> val) mem)
	ptr, val, mem := v.Args[0], v.Args[1], v.Args[2]
	t := auxToType(v.Aux) // read Aux before reset clears it
	blk := v.Block
	v.reset(OpAMD64KMOVQstore)
	mask := blk.NewValue0(v.Pos, OpAMD64VPMOVVec32x16ToM, t)
	mask.AddArg(val)
	v.AddArg3(ptr, mask, mem)
	return true
}
// rewriteValueAMD64_OpStoreMask32x4 lowers StoreMask32x4 to a KMOVQstore of
// the vector-shaped mask converted into a K register.
func rewriteValueAMD64_OpStoreMask32x4(v *Value) bool {
	// match: (StoreMask32x4 {t} ptr val mem)
	// result: (KMOVQstore ptr (VPMOVVec32x4ToM <t> val) mem)
	ptr, val, mem := v.Args[0], v.Args[1], v.Args[2]
	t := auxToType(v.Aux) // read Aux before reset clears it
	blk := v.Block
	v.reset(OpAMD64KMOVQstore)
	mask := blk.NewValue0(v.Pos, OpAMD64VPMOVVec32x4ToM, t)
	mask.AddArg(val)
	v.AddArg3(ptr, mask, mem)
	return true
}
// rewriteValueAMD64_OpStoreMask32x8 lowers StoreMask32x8 to a KMOVQstore of
// the vector-shaped mask converted into a K register.
func rewriteValueAMD64_OpStoreMask32x8(v *Value) bool {
	// match: (StoreMask32x8 {t} ptr val mem)
	// result: (KMOVQstore ptr (VPMOVVec32x8ToM <t> val) mem)
	ptr, val, mem := v.Args[0], v.Args[1], v.Args[2]
	t := auxToType(v.Aux) // read Aux before reset clears it
	blk := v.Block
	v.reset(OpAMD64KMOVQstore)
	mask := blk.NewValue0(v.Pos, OpAMD64VPMOVVec32x8ToM, t)
	mask.AddArg(val)
	v.AddArg3(ptr, mask, mem)
	return true
}
// rewriteValueAMD64_OpStoreMask64x2 lowers StoreMask64x2 to a KMOVQstore of
// the vector-shaped mask converted into a K register.
func rewriteValueAMD64_OpStoreMask64x2(v *Value) bool {
	// match: (StoreMask64x2 {t} ptr val mem)
	// result: (KMOVQstore ptr (VPMOVVec64x2ToM <t> val) mem)
	ptr, val, mem := v.Args[0], v.Args[1], v.Args[2]
	t := auxToType(v.Aux) // read Aux before reset clears it
	blk := v.Block
	v.reset(OpAMD64KMOVQstore)
	mask := blk.NewValue0(v.Pos, OpAMD64VPMOVVec64x2ToM, t)
	mask.AddArg(val)
	v.AddArg3(ptr, mask, mem)
	return true
}
// rewriteValueAMD64_OpStoreMask64x4 lowers StoreMask64x4 to a KMOVQstore of
// the vector-shaped mask converted into a K register.
func rewriteValueAMD64_OpStoreMask64x4(v *Value) bool {
	// match: (StoreMask64x4 {t} ptr val mem)
	// result: (KMOVQstore ptr (VPMOVVec64x4ToM <t> val) mem)
	ptr, val, mem := v.Args[0], v.Args[1], v.Args[2]
	t := auxToType(v.Aux) // read Aux before reset clears it
	blk := v.Block
	v.reset(OpAMD64KMOVQstore)
	mask := blk.NewValue0(v.Pos, OpAMD64VPMOVVec64x4ToM, t)
	mask.AddArg(val)
	v.AddArg3(ptr, mask, mem)
	return true
}
// rewriteValueAMD64_OpStoreMask64x8 lowers StoreMask64x8 to a KMOVQstore of
// the vector-shaped mask converted into a K register.
func rewriteValueAMD64_OpStoreMask64x8(v *Value) bool {
	// match: (StoreMask64x8 {t} ptr val mem)
	// result: (KMOVQstore ptr (VPMOVVec64x8ToM <t> val) mem)
	ptr, val, mem := v.Args[0], v.Args[1], v.Args[2]
	t := auxToType(v.Aux) // read Aux before reset clears it
	blk := v.Block
	v.reset(OpAMD64KMOVQstore)
	mask := blk.NewValue0(v.Pos, OpAMD64VPMOVVec64x8ToM, t)
	mask.AddArg(val)
	v.AddArg3(ptr, mask, mem)
	return true
}
// rewriteValueAMD64_OpStoreMask8x16 lowers StoreMask8x16 to a KMOVQstore of
// the vector-shaped mask converted into a K register.
func rewriteValueAMD64_OpStoreMask8x16(v *Value) bool {
	// match: (StoreMask8x16 {t} ptr val mem)
	// result: (KMOVQstore ptr (VPMOVVec8x16ToM <t> val) mem)
	ptr, val, mem := v.Args[0], v.Args[1], v.Args[2]
	t := auxToType(v.Aux) // read Aux before reset clears it
	blk := v.Block
	v.reset(OpAMD64KMOVQstore)
	mask := blk.NewValue0(v.Pos, OpAMD64VPMOVVec8x16ToM, t)
	mask.AddArg(val)
	v.AddArg3(ptr, mask, mem)
	return true
}
// rewriteValueAMD64_OpStoreMask8x32 lowers StoreMask8x32 to a KMOVQstore of
// the vector-shaped mask converted into a K register.
func rewriteValueAMD64_OpStoreMask8x32(v *Value) bool {
	// match: (StoreMask8x32 {t} ptr val mem)
	// result: (KMOVQstore ptr (VPMOVVec8x32ToM <t> val) mem)
	ptr, val, mem := v.Args[0], v.Args[1], v.Args[2]
	t := auxToType(v.Aux) // read Aux before reset clears it
	blk := v.Block
	v.reset(OpAMD64KMOVQstore)
	mask := blk.NewValue0(v.Pos, OpAMD64VPMOVVec8x32ToM, t)
	mask.AddArg(val)
	v.AddArg3(ptr, mask, mem)
	return true
}
// rewriteValueAMD64_OpStoreMask8x64 lowers StoreMask8x64 to a KMOVQstore of
// the vector-shaped mask converted into a K register.
func rewriteValueAMD64_OpStoreMask8x64(v *Value) bool {
	// match: (StoreMask8x64 {t} ptr val mem)
	// result: (KMOVQstore ptr (VPMOVVec8x64ToM <t> val) mem)
	ptr, val, mem := v.Args[0], v.Args[1], v.Args[2]
	t := auxToType(v.Aux) // read Aux before reset clears it
	blk := v.Block
	v.reset(OpAMD64KMOVQstore)
	mask := blk.NewValue0(v.Pos, OpAMD64VPMOVVec8x64ToM, t)
	mask.AddArg(val)
	v.AddArg3(ptr, mask, mem)
	return true
}
func rewriteValueAMD64_OpSubMaskedFloat32x16(v *Value) bool { func rewriteValueAMD64_OpSubMaskedFloat32x16(v *Value) bool {
v_2 := v.Args[2] v_2 := v.Args[2]
v_1 := v.Args[1] v_1 := v.Args[1]

View file

@ -1791,6 +1791,23 @@ func simdLoadMask(elemBits, lanes int) func(s *state, n *ir.CallExpr, args []*ss
} }
} }
// simdStoreMask returns an intrinsic builder for the simd Mask{elemBits}x{lanes}
// StoreToBits intrinsics: it emits a StoreMask op that writes the mask's bit
// pattern through the pointer argument. In the builder, args[0] is the mask
// value and args[1] is the destination pointer.
//
// The opcode is resolved once, when the intrinsic table is constructed, rather
// than rebuilding the lookup map on every expansion of the intrinsic; an
// unknown shape therefore panics eagerly at table-construction time.
func simdStoreMask(elemBits, lanes int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
	opCodes := map[int]map[int]ssa.Op{
		8:  {16: ssa.OpStoreMask8x16, 32: ssa.OpStoreMask8x32, 64: ssa.OpStoreMask8x64},
		16: {8: ssa.OpStoreMask16x8, 16: ssa.OpStoreMask16x16, 32: ssa.OpStoreMask16x32},
		32: {4: ssa.OpStoreMask32x4, 8: ssa.OpStoreMask32x8, 16: ssa.OpStoreMask32x16},
		64: {2: ssa.OpStoreMask64x2, 4: ssa.OpStoreMask64x4, 8: ssa.OpStoreMask64x8},
	}
	op := opCodes[elemBits][lanes]
	if op == 0 {
		panic(fmt.Sprintf("Unknown mask shape: Mask%dx%d", elemBits, lanes))
	}
	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		// The store threads the memory state: pointer first, then the mask value.
		s.vars[memVar] = s.newValue3A(op, types.TypeMem, types.TypeMask, args[1], args[0], s.mem())
		return nil
	}
}
// findIntrinsic returns a function which builds the SSA equivalent of the // findIntrinsic returns a function which builds the SSA equivalent of the
// function identified by the symbol sym. If sym is not an intrinsic call, returns nil. // function identified by the symbol sym. If sym is not an intrinsic call, returns nil.
func findIntrinsic(sym *types.Sym) intrinsicBuilder { func findIntrinsic(sym *types.Sym) intrinsicBuilder {

View file

@ -310,34 +310,34 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Float64x2.DotProdBroadcast", opLen2(ssa.OpDotProdBroadcastFloat64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float64x2.DotProdBroadcast", opLen2(ssa.OpDotProdBroadcastFloat64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int8x16.Equal", opLen2(ssa.OpEqualInt8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int8x16.Equal", opLen2(ssa.OpEqualInt8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int8x32.Equal", opLen2(ssa.OpEqualInt8x32, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int8x32.Equal", opLen2(ssa.OpEqualInt8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x64.Equal", opLen2(ssa.OpEqualInt8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int16x8.Equal", opLen2(ssa.OpEqualInt16x8, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int16x8.Equal", opLen2(ssa.OpEqualInt16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int16x16.Equal", opLen2(ssa.OpEqualInt16x16, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int16x16.Equal", opLen2(ssa.OpEqualInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x32.Equal", opLen2(ssa.OpEqualInt16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int32x4.Equal", opLen2(ssa.OpEqualInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int32x4.Equal", opLen2(ssa.OpEqualInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int32x8.Equal", opLen2(ssa.OpEqualInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int32x8.Equal", opLen2(ssa.OpEqualInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int32x16.Equal", opLen2(ssa.OpEqualInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int64x2.Equal", opLen2(ssa.OpEqualInt64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int64x2.Equal", opLen2(ssa.OpEqualInt64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int64x4.Equal", opLen2(ssa.OpEqualInt64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int64x4.Equal", opLen2(ssa.OpEqualInt64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int64x8.Equal", opLen2(ssa.OpEqualInt64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint8x16.Equal", opLen2(ssa.OpEqualUint8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint8x16.Equal", opLen2(ssa.OpEqualUint8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint8x32.Equal", opLen2(ssa.OpEqualUint8x32, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint8x32.Equal", opLen2(ssa.OpEqualUint8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint8x64.Equal", opLen2(ssa.OpEqualUint8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint16x8.Equal", opLen2(ssa.OpEqualUint16x8, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint16x8.Equal", opLen2(ssa.OpEqualUint16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint16x16.Equal", opLen2(ssa.OpEqualUint16x16, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint16x16.Equal", opLen2(ssa.OpEqualUint16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint16x32.Equal", opLen2(ssa.OpEqualUint16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint32x4.Equal", opLen2(ssa.OpEqualUint32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint32x4.Equal", opLen2(ssa.OpEqualUint32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint32x8.Equal", opLen2(ssa.OpEqualUint32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint32x8.Equal", opLen2(ssa.OpEqualUint32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint32x16.Equal", opLen2(ssa.OpEqualUint32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint64x2.Equal", opLen2(ssa.OpEqualUint64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint64x2.Equal", opLen2(ssa.OpEqualUint64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint64x4.Equal", opLen2(ssa.OpEqualUint64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint64x4.Equal", opLen2(ssa.OpEqualUint64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x8.Equal", opLen2(ssa.OpEqualUint64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float32x4.Equal", opLen2(ssa.OpEqualFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x4.Equal", opLen2(ssa.OpEqualFloat32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float32x8.Equal", opLen2(ssa.OpEqualFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x8.Equal", opLen2(ssa.OpEqualFloat32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float32x16.Equal", opLen2(ssa.OpEqualFloat32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x16.Equal", opLen2(ssa.OpEqualFloat32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float64x2.Equal", opLen2(ssa.OpEqualFloat64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float64x2.Equal", opLen2(ssa.OpEqualFloat64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float64x4.Equal", opLen2(ssa.OpEqualFloat64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x4.Equal", opLen2(ssa.OpEqualFloat64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float64x8.Equal", opLen2(ssa.OpEqualFloat64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float64x8.Equal", opLen2(ssa.OpEqualFloat64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int8x64.Equal", opLen2(ssa.OpEqualInt8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int16x32.Equal", opLen2(ssa.OpEqualInt16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int32x16.Equal", opLen2(ssa.OpEqualInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int64x8.Equal", opLen2(ssa.OpEqualInt64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint8x64.Equal", opLen2(ssa.OpEqualUint8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint16x32.Equal", opLen2(ssa.OpEqualUint16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint32x16.Equal", opLen2(ssa.OpEqualUint32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint64x8.Equal", opLen2(ssa.OpEqualUint64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float32x4.EqualMasked", opLen3(ssa.OpEqualMaskedFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x4.EqualMasked", opLen3(ssa.OpEqualMaskedFloat32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float32x8.EqualMasked", opLen3(ssa.OpEqualMaskedFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x8.EqualMasked", opLen3(ssa.OpEqualMaskedFloat32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float32x16.EqualMasked", opLen3(ssa.OpEqualMaskedFloat32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x16.EqualMasked", opLen3(ssa.OpEqualMaskedFloat32x16, types.TypeVec512), sys.AMD64)
@ -458,22 +458,22 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Uint64x2.GetElem", opLen1Imm8(ssa.OpGetElemUint64x2, types.Types[types.TUINT64], 0), sys.AMD64) addF(simdPackage, "Uint64x2.GetElem", opLen1Imm8(ssa.OpGetElemUint64x2, types.Types[types.TUINT64], 0), sys.AMD64)
addF(simdPackage, "Int8x16.Greater", opLen2(ssa.OpGreaterInt8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int8x16.Greater", opLen2(ssa.OpGreaterInt8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int8x32.Greater", opLen2(ssa.OpGreaterInt8x32, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int8x32.Greater", opLen2(ssa.OpGreaterInt8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int8x64.Greater", opLen2(ssa.OpGreaterInt8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int16x8.Greater", opLen2(ssa.OpGreaterInt16x8, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int16x8.Greater", opLen2(ssa.OpGreaterInt16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int16x16.Greater", opLen2(ssa.OpGreaterInt16x16, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int16x16.Greater", opLen2(ssa.OpGreaterInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x32.Greater", opLen2(ssa.OpGreaterInt16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int32x4.Greater", opLen2(ssa.OpGreaterInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int32x4.Greater", opLen2(ssa.OpGreaterInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int32x8.Greater", opLen2(ssa.OpGreaterInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int32x8.Greater", opLen2(ssa.OpGreaterInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int32x16.Greater", opLen2(ssa.OpGreaterInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int64x2.Greater", opLen2(ssa.OpGreaterInt64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Int64x2.Greater", opLen2(ssa.OpGreaterInt64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int64x4.Greater", opLen2(ssa.OpGreaterInt64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Int64x4.Greater", opLen2(ssa.OpGreaterInt64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int64x8.Greater", opLen2(ssa.OpGreaterInt64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float32x4.Greater", opLen2(ssa.OpGreaterFloat32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float32x4.Greater", opLen2(ssa.OpGreaterFloat32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float32x8.Greater", opLen2(ssa.OpGreaterFloat32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float32x8.Greater", opLen2(ssa.OpGreaterFloat32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float32x16.Greater", opLen2(ssa.OpGreaterFloat32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x16.Greater", opLen2(ssa.OpGreaterFloat32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float64x2.Greater", opLen2(ssa.OpGreaterFloat64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float64x2.Greater", opLen2(ssa.OpGreaterFloat64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float64x4.Greater", opLen2(ssa.OpGreaterFloat64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x4.Greater", opLen2(ssa.OpGreaterFloat64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float64x8.Greater", opLen2(ssa.OpGreaterFloat64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float64x8.Greater", opLen2(ssa.OpGreaterFloat64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int8x64.Greater", opLen2(ssa.OpGreaterInt8x64, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int16x32.Greater", opLen2(ssa.OpGreaterInt16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int32x16.Greater", opLen2(ssa.OpGreaterInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int64x8.Greater", opLen2(ssa.OpGreaterInt64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint8x16.Greater", opLen2(ssa.OpGreaterUint8x16, types.TypeVec128), sys.AMD64) addF(simdPackage, "Uint8x16.Greater", opLen2(ssa.OpGreaterUint8x16, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint8x32.Greater", opLen2(ssa.OpGreaterUint8x32, types.TypeVec256), sys.AMD64) addF(simdPackage, "Uint8x32.Greater", opLen2(ssa.OpGreaterUint8x32, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint8x64.Greater", opLen2(ssa.OpGreaterUint8x64, types.TypeVec512), sys.AMD64) addF(simdPackage, "Uint8x64.Greater", opLen2(ssa.OpGreaterUint8x64, types.TypeVec512), sys.AMD64)
@ -2137,59 +2137,71 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Mask8x16.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Mask8x16.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Mask8x16.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Mask8x16.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "LoadMask8x16FromBits", simdLoadMask(8, 16), sys.AMD64) addF(simdPackage, "LoadMask8x16FromBits", simdLoadMask(8, 16), sys.AMD64)
addF(simdPackage, "Mask8x16.StoreToBits", simdStoreMask(8, 16), sys.AMD64)
addF(simdPackage, "Mask8x32.AsInt8x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask8x32.AsInt8x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int8x32.AsMask8x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int8x32.AsMask8x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask8x32.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Mask8x32.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Mask8x32.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Mask8x32.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "LoadMask8x32FromBits", simdLoadMask(8, 32), sys.AMD64) addF(simdPackage, "LoadMask8x32FromBits", simdLoadMask(8, 32), sys.AMD64)
addF(simdPackage, "Mask8x32.StoreToBits", simdStoreMask(8, 32), sys.AMD64)
addF(simdPackage, "Mask8x64.AsInt8x64", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask8x64.AsInt8x64", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int8x64.AsMask8x64", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int8x64.AsMask8x64", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask8x64.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Mask8x64.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Mask8x64.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Mask8x64.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "LoadMask8x64FromBits", simdLoadMask(8, 64), sys.AMD64) addF(simdPackage, "LoadMask8x64FromBits", simdLoadMask(8, 64), sys.AMD64)
addF(simdPackage, "Mask8x64.StoreToBits", simdStoreMask(8, 64), sys.AMD64)
addF(simdPackage, "Mask16x8.AsInt16x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask16x8.AsInt16x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int16x8.AsMask16x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int16x8.AsMask16x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask16x8.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Mask16x8.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Mask16x8.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Mask16x8.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "LoadMask16x8FromBits", simdLoadMask(16, 8), sys.AMD64) addF(simdPackage, "LoadMask16x8FromBits", simdLoadMask(16, 8), sys.AMD64)
addF(simdPackage, "Mask16x8.StoreToBits", simdStoreMask(16, 8), sys.AMD64)
addF(simdPackage, "Mask16x16.AsInt16x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask16x16.AsInt16x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int16x16.AsMask16x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int16x16.AsMask16x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask16x16.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Mask16x16.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Mask16x16.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Mask16x16.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "LoadMask16x16FromBits", simdLoadMask(16, 16), sys.AMD64) addF(simdPackage, "LoadMask16x16FromBits", simdLoadMask(16, 16), sys.AMD64)
addF(simdPackage, "Mask16x16.StoreToBits", simdStoreMask(16, 16), sys.AMD64)
addF(simdPackage, "Mask16x32.AsInt16x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask16x32.AsInt16x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int16x32.AsMask16x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int16x32.AsMask16x32", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask16x32.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Mask16x32.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Mask16x32.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Mask16x32.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "LoadMask16x32FromBits", simdLoadMask(16, 32), sys.AMD64) addF(simdPackage, "LoadMask16x32FromBits", simdLoadMask(16, 32), sys.AMD64)
addF(simdPackage, "Mask16x32.StoreToBits", simdStoreMask(16, 32), sys.AMD64)
addF(simdPackage, "Mask32x4.AsInt32x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask32x4.AsInt32x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int32x4.AsMask32x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int32x4.AsMask32x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask32x4.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Mask32x4.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Mask32x4.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Mask32x4.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "LoadMask32x4FromBits", simdLoadMask(32, 4), sys.AMD64) addF(simdPackage, "LoadMask32x4FromBits", simdLoadMask(32, 4), sys.AMD64)
addF(simdPackage, "Mask32x4.StoreToBits", simdStoreMask(32, 4), sys.AMD64)
addF(simdPackage, "Mask32x8.AsInt32x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask32x8.AsInt32x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int32x8.AsMask32x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int32x8.AsMask32x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask32x8.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Mask32x8.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Mask32x8.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Mask32x8.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "LoadMask32x8FromBits", simdLoadMask(32, 8), sys.AMD64) addF(simdPackage, "LoadMask32x8FromBits", simdLoadMask(32, 8), sys.AMD64)
addF(simdPackage, "Mask32x8.StoreToBits", simdStoreMask(32, 8), sys.AMD64)
addF(simdPackage, "Mask32x16.AsInt32x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask32x16.AsInt32x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int32x16.AsMask32x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int32x16.AsMask32x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask32x16.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Mask32x16.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Mask32x16.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Mask32x16.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "LoadMask32x16FromBits", simdLoadMask(32, 16), sys.AMD64) addF(simdPackage, "LoadMask32x16FromBits", simdLoadMask(32, 16), sys.AMD64)
addF(simdPackage, "Mask32x16.StoreToBits", simdStoreMask(32, 16), sys.AMD64)
addF(simdPackage, "Mask64x2.AsInt64x2", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask64x2.AsInt64x2", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int64x2.AsMask64x2", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int64x2.AsMask64x2", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask64x2.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Mask64x2.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Mask64x2.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64) addF(simdPackage, "Mask64x2.Or", opLen2(ssa.OpOrInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "LoadMask64x2FromBits", simdLoadMask(64, 2), sys.AMD64) addF(simdPackage, "LoadMask64x2FromBits", simdLoadMask(64, 2), sys.AMD64)
addF(simdPackage, "Mask64x2.StoreToBits", simdStoreMask(64, 2), sys.AMD64)
addF(simdPackage, "Mask64x4.AsInt64x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask64x4.AsInt64x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int64x4.AsMask64x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int64x4.AsMask64x4", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask64x4.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Mask64x4.And", opLen2(ssa.OpAndInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Mask64x4.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64) addF(simdPackage, "Mask64x4.Or", opLen2(ssa.OpOrInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "LoadMask64x4FromBits", simdLoadMask(64, 4), sys.AMD64) addF(simdPackage, "LoadMask64x4FromBits", simdLoadMask(64, 4), sys.AMD64)
addF(simdPackage, "Mask64x4.StoreToBits", simdStoreMask(64, 4), sys.AMD64)
addF(simdPackage, "Mask64x8.AsInt64x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Mask64x8.AsInt64x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int64x8.AsMask64x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64) addF(simdPackage, "Int64x8.AsMask64x8", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask64x8.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Mask64x8.And", opLen2(ssa.OpAndInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Mask64x8.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64) addF(simdPackage, "Mask64x8.Or", opLen2(ssa.OpOrInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "LoadMask64x8FromBits", simdLoadMask(64, 8), sys.AMD64) addF(simdPackage, "LoadMask64x8FromBits", simdLoadMask(64, 8), sys.AMD64)
addF(simdPackage, "Mask64x8.StoreToBits", simdStoreMask(64, 8), sys.AMD64)
} }

View file

@ -918,12 +918,12 @@ func (x Uint64x8) AndNotMasked(y Uint64x8, mask Mask64x8) Uint64x8
// ApproximateReciprocal computes an approximate reciprocal of each element. // ApproximateReciprocal computes an approximate reciprocal of each element.
// //
// Asm: VRCP14PS, CPU Feature: AVX512F // Asm: VRCPPS, CPU Feature: AVX
func (x Float32x4) ApproximateReciprocal() Float32x4 func (x Float32x4) ApproximateReciprocal() Float32x4
// ApproximateReciprocal computes an approximate reciprocal of each element. // ApproximateReciprocal computes an approximate reciprocal of each element.
// //
// Asm: VRCP14PS, CPU Feature: AVX512F // Asm: VRCPPS, CPU Feature: AVX
func (x Float32x8) ApproximateReciprocal() Float32x8 func (x Float32x8) ApproximateReciprocal() Float32x8
// ApproximateReciprocal computes an approximate reciprocal of each element. // ApproximateReciprocal computes an approximate reciprocal of each element.
@ -1951,6 +1951,11 @@ func (x Int8x16) Equal(y Int8x16) Mask8x16
// Asm: VPCMPEQB, CPU Feature: AVX2 // Asm: VPCMPEQB, CPU Feature: AVX2
func (x Int8x32) Equal(y Int8x32) Mask8x32 func (x Int8x32) Equal(y Int8x32) Mask8x32
// Equal compares for equality.
//
// Asm: VPCMPEQB, CPU Feature: AVX512BW
func (x Int8x64) Equal(y Int8x64) Mask8x64
// Equal compares for equality. // Equal compares for equality.
// //
// Asm: VPCMPEQW, CPU Feature: AVX // Asm: VPCMPEQW, CPU Feature: AVX
@ -1961,6 +1966,11 @@ func (x Int16x8) Equal(y Int16x8) Mask16x8
// Asm: VPCMPEQW, CPU Feature: AVX2 // Asm: VPCMPEQW, CPU Feature: AVX2
func (x Int16x16) Equal(y Int16x16) Mask16x16 func (x Int16x16) Equal(y Int16x16) Mask16x16
// Equal compares for equality.
//
// Asm: VPCMPEQW, CPU Feature: AVX512BW
func (x Int16x32) Equal(y Int16x32) Mask16x32
// Equal compares for equality. // Equal compares for equality.
// //
// Asm: VPCMPEQD, CPU Feature: AVX // Asm: VPCMPEQD, CPU Feature: AVX
@ -1971,6 +1981,11 @@ func (x Int32x4) Equal(y Int32x4) Mask32x4
// Asm: VPCMPEQD, CPU Feature: AVX2 // Asm: VPCMPEQD, CPU Feature: AVX2
func (x Int32x8) Equal(y Int32x8) Mask32x8 func (x Int32x8) Equal(y Int32x8) Mask32x8
// Equal compares for equality.
//
// Asm: VPCMPEQD, CPU Feature: AVX512F
func (x Int32x16) Equal(y Int32x16) Mask32x16
// Equal compares for equality. // Equal compares for equality.
// //
// Asm: VPCMPEQQ, CPU Feature: AVX // Asm: VPCMPEQQ, CPU Feature: AVX
@ -1981,6 +1996,11 @@ func (x Int64x2) Equal(y Int64x2) Mask64x2
// Asm: VPCMPEQQ, CPU Feature: AVX2 // Asm: VPCMPEQQ, CPU Feature: AVX2
func (x Int64x4) Equal(y Int64x4) Mask64x4 func (x Int64x4) Equal(y Int64x4) Mask64x4
// Equal compares for equality.
//
// Asm: VPCMPEQQ, CPU Feature: AVX512F
func (x Int64x8) Equal(y Int64x8) Mask64x8
// Equal compares for equality. // Equal compares for equality.
// //
// Asm: VPCMPEQB, CPU Feature: AVX // Asm: VPCMPEQB, CPU Feature: AVX
@ -1991,6 +2011,11 @@ func (x Uint8x16) Equal(y Uint8x16) Mask8x16
// Asm: VPCMPEQB, CPU Feature: AVX2 // Asm: VPCMPEQB, CPU Feature: AVX2
func (x Uint8x32) Equal(y Uint8x32) Mask8x32 func (x Uint8x32) Equal(y Uint8x32) Mask8x32
// Equal compares for equality.
//
// Asm: VPCMPEQB, CPU Feature: AVX512BW
func (x Uint8x64) Equal(y Uint8x64) Mask8x64
// Equal compares for equality. // Equal compares for equality.
// //
// Asm: VPCMPEQW, CPU Feature: AVX // Asm: VPCMPEQW, CPU Feature: AVX
@ -2001,6 +2026,11 @@ func (x Uint16x8) Equal(y Uint16x8) Mask16x8
// Asm: VPCMPEQW, CPU Feature: AVX2 // Asm: VPCMPEQW, CPU Feature: AVX2
func (x Uint16x16) Equal(y Uint16x16) Mask16x16 func (x Uint16x16) Equal(y Uint16x16) Mask16x16
// Equal compares for equality.
//
// Asm: VPCMPEQW, CPU Feature: AVX512BW
func (x Uint16x32) Equal(y Uint16x32) Mask16x32
// Equal compares for equality. // Equal compares for equality.
// //
// Asm: VPCMPEQD, CPU Feature: AVX // Asm: VPCMPEQD, CPU Feature: AVX
@ -2011,6 +2041,11 @@ func (x Uint32x4) Equal(y Uint32x4) Mask32x4
// Asm: VPCMPEQD, CPU Feature: AVX2 // Asm: VPCMPEQD, CPU Feature: AVX2
func (x Uint32x8) Equal(y Uint32x8) Mask32x8 func (x Uint32x8) Equal(y Uint32x8) Mask32x8
// Equal compares for equality.
//
// Asm: VPCMPEQD, CPU Feature: AVX512F
func (x Uint32x16) Equal(y Uint32x16) Mask32x16
// Equal compares for equality. // Equal compares for equality.
// //
// Asm: VPCMPEQQ, CPU Feature: AVX // Asm: VPCMPEQQ, CPU Feature: AVX
@ -2021,6 +2056,11 @@ func (x Uint64x2) Equal(y Uint64x2) Mask64x2
// Asm: VPCMPEQQ, CPU Feature: AVX2 // Asm: VPCMPEQQ, CPU Feature: AVX2
func (x Uint64x4) Equal(y Uint64x4) Mask64x4 func (x Uint64x4) Equal(y Uint64x4) Mask64x4
// Equal compares for equality.
//
// Asm: VPCMPEQQ, CPU Feature: AVX512F
func (x Uint64x8) Equal(y Uint64x8) Mask64x8
// Equal compares for equality. // Equal compares for equality.
// //
// Asm: VCMPPS, CPU Feature: AVX // Asm: VCMPPS, CPU Feature: AVX
@ -2051,46 +2091,6 @@ func (x Float64x4) Equal(y Float64x4) Mask64x4
// Asm: VCMPPD, CPU Feature: AVX512F // Asm: VCMPPD, CPU Feature: AVX512F
func (x Float64x8) Equal(y Float64x8) Mask64x8 func (x Float64x8) Equal(y Float64x8) Mask64x8
// Equal compares for equality.
//
// Asm: VPCMPB, CPU Feature: AVX512BW
func (x Int8x64) Equal(y Int8x64) Mask8x64
// Equal compares for equality.
//
// Asm: VPCMPW, CPU Feature: AVX512BW
func (x Int16x32) Equal(y Int16x32) Mask16x32
// Equal compares for equality.
//
// Asm: VPCMPD, CPU Feature: AVX512F
func (x Int32x16) Equal(y Int32x16) Mask32x16
// Equal compares for equality.
//
// Asm: VPCMPQ, CPU Feature: AVX512F
func (x Int64x8) Equal(y Int64x8) Mask64x8
// Equal compares for equality.
//
// Asm: VPCMPUB, CPU Feature: AVX512BW
func (x Uint8x64) Equal(y Uint8x64) Mask8x64
// Equal compares for equality.
//
// Asm: VPCMPUW, CPU Feature: AVX512BW
func (x Uint16x32) Equal(y Uint16x32) Mask16x32
// Equal compares for equality.
//
// Asm: VPCMPUD, CPU Feature: AVX512F
func (x Uint32x16) Equal(y Uint32x16) Mask32x16
// Equal compares for equality.
//
// Asm: VPCMPUQ, CPU Feature: AVX512F
func (x Uint64x8) Equal(y Uint64x8) Mask64x8
/* EqualMasked */ /* EqualMasked */
// EqualMasked compares for equality. // EqualMasked compares for equality.
@ -2733,7 +2733,7 @@ func (x Uint8x64) GaloisFieldAffineTransformInverse(y Uint64x8, b uint8) Uint8x6
// b is expected to be a constant, non-constant value will trigger a runtime panic. // b is expected to be a constant, non-constant value will trigger a runtime panic.
// //
// Asm: VGF2P8AFFINEINVQB, CPU Feature: AVX512GFNI // Asm: VGF2P8AFFINEINVQB, CPU Feature: AVX512GFNI
func (x Uint8x16) GaloisFieldAffineTransformInverseMasked(y Uint64x2, b uint8, m Mask8x16) Uint8x16 func (x Uint8x16) GaloisFieldAffineTransformInverseMasked(y Uint64x2, b uint8, mask Mask8x16) Uint8x16
// GaloisFieldAffineTransformInverseMasked computes an affine transformation in GF(2^8), // GaloisFieldAffineTransformInverseMasked computes an affine transformation in GF(2^8),
// with x inverted with respect to reduction polynomial x^8 + x^4 + x^3 + x + 1: // with x inverted with respect to reduction polynomial x^8 + x^4 + x^3 + x + 1:
@ -2746,7 +2746,7 @@ func (x Uint8x16) GaloisFieldAffineTransformInverseMasked(y Uint64x2, b uint8, m
// b is expected to be a constant, non-constant value will trigger a runtime panic. // b is expected to be a constant, non-constant value will trigger a runtime panic.
// //
// Asm: VGF2P8AFFINEINVQB, CPU Feature: AVX512GFNI // Asm: VGF2P8AFFINEINVQB, CPU Feature: AVX512GFNI
func (x Uint8x32) GaloisFieldAffineTransformInverseMasked(y Uint64x4, b uint8, m Mask8x32) Uint8x32 func (x Uint8x32) GaloisFieldAffineTransformInverseMasked(y Uint64x4, b uint8, mask Mask8x32) Uint8x32
// GaloisFieldAffineTransformInverseMasked computes an affine transformation in GF(2^8), // GaloisFieldAffineTransformInverseMasked computes an affine transformation in GF(2^8),
// with x inverted with respect to reduction polynomial x^8 + x^4 + x^3 + x + 1: // with x inverted with respect to reduction polynomial x^8 + x^4 + x^3 + x + 1:
@ -2759,7 +2759,7 @@ func (x Uint8x32) GaloisFieldAffineTransformInverseMasked(y Uint64x4, b uint8, m
// b is expected to be a constant, non-constant value will trigger a runtime panic. // b is expected to be a constant, non-constant value will trigger a runtime panic.
// //
// Asm: VGF2P8AFFINEINVQB, CPU Feature: AVX512GFNI // Asm: VGF2P8AFFINEINVQB, CPU Feature: AVX512GFNI
func (x Uint8x64) GaloisFieldAffineTransformInverseMasked(y Uint64x8, b uint8, m Mask8x64) Uint8x64 func (x Uint8x64) GaloisFieldAffineTransformInverseMasked(y Uint64x8, b uint8, mask Mask8x64) Uint8x64
/* GaloisFieldAffineTransformMasked */ /* GaloisFieldAffineTransformMasked */
@ -2773,7 +2773,7 @@ func (x Uint8x64) GaloisFieldAffineTransformInverseMasked(y Uint64x8, b uint8, m
// b is expected to be a constant, non-constant value will trigger a runtime panic. // b is expected to be a constant, non-constant value will trigger a runtime panic.
// //
// Asm: VGF2P8AFFINEQB, CPU Feature: AVX512GFNI // Asm: VGF2P8AFFINEQB, CPU Feature: AVX512GFNI
func (x Uint8x16) GaloisFieldAffineTransformMasked(y Uint64x2, b uint8, m Mask8x16) Uint8x16 func (x Uint8x16) GaloisFieldAffineTransformMasked(y Uint64x2, b uint8, mask Mask8x16) Uint8x16
// GaloisFieldAffineTransformMasked computes an affine transformation in GF(2^8): // GaloisFieldAffineTransformMasked computes an affine transformation in GF(2^8):
// x is a vector of 8-bit vectors, with each adjacent 8 as a group; y is a vector of 8x8 1-bit matrixes; // x is a vector of 8-bit vectors, with each adjacent 8 as a group; y is a vector of 8x8 1-bit matrixes;
@ -2785,7 +2785,7 @@ func (x Uint8x16) GaloisFieldAffineTransformMasked(y Uint64x2, b uint8, m Mask8x
// b is expected to be a constant, non-constant value will trigger a runtime panic. // b is expected to be a constant, non-constant value will trigger a runtime panic.
// //
// Asm: VGF2P8AFFINEQB, CPU Feature: AVX512GFNI // Asm: VGF2P8AFFINEQB, CPU Feature: AVX512GFNI
func (x Uint8x32) GaloisFieldAffineTransformMasked(y Uint64x4, b uint8, m Mask8x32) Uint8x32 func (x Uint8x32) GaloisFieldAffineTransformMasked(y Uint64x4, b uint8, mask Mask8x32) Uint8x32
// GaloisFieldAffineTransformMasked computes an affine transformation in GF(2^8): // GaloisFieldAffineTransformMasked computes an affine transformation in GF(2^8):
// x is a vector of 8-bit vectors, with each adjacent 8 as a group; y is a vector of 8x8 1-bit matrixes; // x is a vector of 8-bit vectors, with each adjacent 8 as a group; y is a vector of 8x8 1-bit matrixes;
@ -2797,7 +2797,7 @@ func (x Uint8x32) GaloisFieldAffineTransformMasked(y Uint64x4, b uint8, m Mask8x
// b is expected to be a constant, non-constant value will trigger a runtime panic. // b is expected to be a constant, non-constant value will trigger a runtime panic.
// //
// Asm: VGF2P8AFFINEQB, CPU Feature: AVX512GFNI // Asm: VGF2P8AFFINEQB, CPU Feature: AVX512GFNI
func (x Uint8x64) GaloisFieldAffineTransformMasked(y Uint64x8, b uint8, m Mask8x64) Uint8x64 func (x Uint8x64) GaloisFieldAffineTransformMasked(y Uint64x8, b uint8, mask Mask8x64) Uint8x64
/* GaloisFieldMul */ /* GaloisFieldMul */
@ -2987,6 +2987,11 @@ func (x Int8x16) Greater(y Int8x16) Mask8x16
// Asm: VPCMPGTB, CPU Feature: AVX2 // Asm: VPCMPGTB, CPU Feature: AVX2
func (x Int8x32) Greater(y Int8x32) Mask8x32 func (x Int8x32) Greater(y Int8x32) Mask8x32
// Greater compares for greater than.
//
// Asm: VPCMPGTB, CPU Feature: AVX512BW
func (x Int8x64) Greater(y Int8x64) Mask8x64
// Greater compares for greater than. // Greater compares for greater than.
// //
// Asm: VPCMPGTW, CPU Feature: AVX // Asm: VPCMPGTW, CPU Feature: AVX
@ -2997,6 +3002,11 @@ func (x Int16x8) Greater(y Int16x8) Mask16x8
// Asm: VPCMPGTW, CPU Feature: AVX2 // Asm: VPCMPGTW, CPU Feature: AVX2
func (x Int16x16) Greater(y Int16x16) Mask16x16 func (x Int16x16) Greater(y Int16x16) Mask16x16
// Greater compares for greater than.
//
// Asm: VPCMPGTW, CPU Feature: AVX512BW
func (x Int16x32) Greater(y Int16x32) Mask16x32
// Greater compares for greater than. // Greater compares for greater than.
// //
// Asm: VPCMPGTD, CPU Feature: AVX // Asm: VPCMPGTD, CPU Feature: AVX
@ -3007,6 +3017,11 @@ func (x Int32x4) Greater(y Int32x4) Mask32x4
// Asm: VPCMPGTD, CPU Feature: AVX2 // Asm: VPCMPGTD, CPU Feature: AVX2
func (x Int32x8) Greater(y Int32x8) Mask32x8 func (x Int32x8) Greater(y Int32x8) Mask32x8
// Greater compares for greater than.
//
// Asm: VPCMPGTD, CPU Feature: AVX512F
func (x Int32x16) Greater(y Int32x16) Mask32x16
// Greater compares for greater than. // Greater compares for greater than.
// //
// Asm: VPCMPGTQ, CPU Feature: AVX // Asm: VPCMPGTQ, CPU Feature: AVX
@ -3017,6 +3032,11 @@ func (x Int64x2) Greater(y Int64x2) Mask64x2
// Asm: VPCMPGTQ, CPU Feature: AVX2 // Asm: VPCMPGTQ, CPU Feature: AVX2
func (x Int64x4) Greater(y Int64x4) Mask64x4 func (x Int64x4) Greater(y Int64x4) Mask64x4
// Greater compares for greater than.
//
// Asm: VPCMPGTQ, CPU Feature: AVX512F
func (x Int64x8) Greater(y Int64x8) Mask64x8
// Greater compares for greater than. // Greater compares for greater than.
// //
// Asm: VCMPPS, CPU Feature: AVX // Asm: VCMPPS, CPU Feature: AVX
@ -3047,26 +3067,6 @@ func (x Float64x4) Greater(y Float64x4) Mask64x4
// Asm: VCMPPD, CPU Feature: AVX512F // Asm: VCMPPD, CPU Feature: AVX512F
func (x Float64x8) Greater(y Float64x8) Mask64x8 func (x Float64x8) Greater(y Float64x8) Mask64x8
// Greater compares for greater than.
//
// Asm: VPCMPB, CPU Feature: AVX512BW
func (x Int8x64) Greater(y Int8x64) Mask8x64
// Greater compares for greater than.
//
// Asm: VPCMPW, CPU Feature: AVX512BW
func (x Int16x32) Greater(y Int16x32) Mask16x32
// Greater compares for greater than.
//
// Asm: VPCMPD, CPU Feature: AVX512F
func (x Int32x16) Greater(y Int32x16) Mask32x16
// Greater compares for greater than.
//
// Asm: VPCMPQ, CPU Feature: AVX512F
func (x Int64x8) Greater(y Int64x8) Mask64x8
// Greater compares for greater than. // Greater compares for greater than.
// //
// Asm: VPCMPUB, CPU Feature: AVX512BW // Asm: VPCMPUB, CPU Feature: AVX512BW
@ -6475,84 +6475,84 @@ func (x Uint32x8) PairwiseSub(y Uint32x8) Uint32x8
/* Permute */ /* Permute */
// Permute performs a full permutation of vector y using indices: // Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
// Asm: VPERMB, CPU Feature: AVX512VBMI // Asm: VPERMB, CPU Feature: AVX512VBMI
func (x Int8x16) Permute(indices Uint8x16) Int8x16 func (x Int8x16) Permute(indices Uint8x16) Int8x16
// Permute performs a full permutation of vector y using indices: // Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
// Asm: VPERMB, CPU Feature: AVX512VBMI // Asm: VPERMB, CPU Feature: AVX512VBMI
func (x Uint8x16) Permute(indices Uint8x16) Uint8x16 func (x Uint8x16) Permute(indices Uint8x16) Uint8x16
// Permute performs a full permutation of vector y using indices: // Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
// Asm: VPERMB, CPU Feature: AVX512VBMI // Asm: VPERMB, CPU Feature: AVX512VBMI
func (x Int8x32) Permute(indices Uint8x32) Int8x32 func (x Int8x32) Permute(indices Uint8x32) Int8x32
// Permute performs a full permutation of vector y using indices: // Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
// Asm: VPERMB, CPU Feature: AVX512VBMI // Asm: VPERMB, CPU Feature: AVX512VBMI
func (x Uint8x32) Permute(indices Uint8x32) Uint8x32 func (x Uint8x32) Permute(indices Uint8x32) Uint8x32
// Permute performs a full permutation of vector y using indices: // Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
// Asm: VPERMB, CPU Feature: AVX512VBMI // Asm: VPERMB, CPU Feature: AVX512VBMI
func (x Int8x64) Permute(indices Uint8x64) Int8x64 func (x Int8x64) Permute(indices Uint8x64) Int8x64
// Permute performs a full permutation of vector y using indices: // Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
// Asm: VPERMB, CPU Feature: AVX512VBMI // Asm: VPERMB, CPU Feature: AVX512VBMI
func (x Uint8x64) Permute(indices Uint8x64) Uint8x64 func (x Uint8x64) Permute(indices Uint8x64) Uint8x64
// Permute performs a full permutation of vector y using indices: // Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
// Asm: VPERMW, CPU Feature: AVX512BW // Asm: VPERMW, CPU Feature: AVX512BW
func (x Int16x8) Permute(indices Uint16x8) Int16x8 func (x Int16x8) Permute(indices Uint16x8) Int16x8
// Permute performs a full permutation of vector y using indices: // Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
// Asm: VPERMW, CPU Feature: AVX512BW // Asm: VPERMW, CPU Feature: AVX512BW
func (x Uint16x8) Permute(indices Uint16x8) Uint16x8 func (x Uint16x8) Permute(indices Uint16x8) Uint16x8
// Permute performs a full permutation of vector y using indices: // Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
// Asm: VPERMW, CPU Feature: AVX512BW // Asm: VPERMW, CPU Feature: AVX512BW
func (x Int16x16) Permute(indices Uint16x16) Int16x16 func (x Int16x16) Permute(indices Uint16x16) Int16x16
// Permute performs a full permutation of vector y using indices: // Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
// Asm: VPERMW, CPU Feature: AVX512BW // Asm: VPERMW, CPU Feature: AVX512BW
func (x Uint16x16) Permute(indices Uint16x16) Uint16x16 func (x Uint16x16) Permute(indices Uint16x16) Uint16x16
// Permute performs a full permutation of vector y using indices: // Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
// Asm: VPERMW, CPU Feature: AVX512BW // Asm: VPERMW, CPU Feature: AVX512BW
func (x Int16x32) Permute(indices Uint16x32) Int16x32 func (x Int16x32) Permute(indices Uint16x32) Int16x32
// Permute performs a full permutation of vector y using indices: // Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -6580,63 +6580,63 @@ func (x Int32x8) Permute(indices Uint32x8) Int32x8
// Asm: VPERMD, CPU Feature: AVX2 // Asm: VPERMD, CPU Feature: AVX2
func (x Uint32x8) Permute(indices Uint32x8) Uint32x8 func (x Uint32x8) Permute(indices Uint32x8) Uint32x8
// Permute performs a full permutation of vector y using indices: // Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
// Asm: VPERMPS, CPU Feature: AVX512F // Asm: VPERMPS, CPU Feature: AVX512F
func (x Float32x16) Permute(indices Uint32x16) Float32x16 func (x Float32x16) Permute(indices Uint32x16) Float32x16
// Permute performs a full permutation of vector y using indices: // Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
// Asm: VPERMD, CPU Feature: AVX512F // Asm: VPERMD, CPU Feature: AVX512F
func (x Int32x16) Permute(indices Uint32x16) Int32x16 func (x Int32x16) Permute(indices Uint32x16) Int32x16
// Permute performs a full permutation of vector y using indices: // Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
// Asm: VPERMD, CPU Feature: AVX512F // Asm: VPERMD, CPU Feature: AVX512F
func (x Uint32x16) Permute(indices Uint32x16) Uint32x16 func (x Uint32x16) Permute(indices Uint32x16) Uint32x16
// Permute performs a full permutation of vector y using indices: // Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
// Asm: VPERMPD, CPU Feature: AVX512F // Asm: VPERMPD, CPU Feature: AVX512F
func (x Float64x4) Permute(indices Uint64x4) Float64x4 func (x Float64x4) Permute(indices Uint64x4) Float64x4
// Permute performs a full permutation of vector y using indices: // Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
// Asm: VPERMQ, CPU Feature: AVX512F // Asm: VPERMQ, CPU Feature: AVX512F
func (x Int64x4) Permute(indices Uint64x4) Int64x4 func (x Int64x4) Permute(indices Uint64x4) Int64x4
// Permute performs a full permutation of vector y using indices: // Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
// Asm: VPERMQ, CPU Feature: AVX512F // Asm: VPERMQ, CPU Feature: AVX512F
func (x Uint64x4) Permute(indices Uint64x4) Uint64x4 func (x Uint64x4) Permute(indices Uint64x4) Uint64x4
// Permute performs a full permutation of vector y using indices: // Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
// Asm: VPERMPD, CPU Feature: AVX512F // Asm: VPERMPD, CPU Feature: AVX512F
func (x Float64x8) Permute(indices Uint64x8) Float64x8 func (x Float64x8) Permute(indices Uint64x8) Float64x8
// Permute performs a full permutation of vector y using indices: // Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
// Asm: VPERMQ, CPU Feature: AVX512F // Asm: VPERMQ, CPU Feature: AVX512F
func (x Int64x8) Permute(indices Uint64x8) Int64x8 func (x Int64x8) Permute(indices Uint64x8) Int64x8
// Permute performs a full permutation of vector y using indices: // Permute performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7189,7 +7189,7 @@ func (x Uint64x8) Permute2Masked(y Uint64x8, indices Uint64x8, mask Mask64x8) Ui
/* PermuteMasked */ /* PermuteMasked */
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7198,7 +7198,7 @@ func (x Uint64x8) Permute2Masked(y Uint64x8, indices Uint64x8, mask Mask64x8) Ui
// Asm: VPERMB, CPU Feature: AVX512VBMI // Asm: VPERMB, CPU Feature: AVX512VBMI
func (x Int8x16) PermuteMasked(indices Uint8x16, mask Mask8x16) Int8x16 func (x Int8x16) PermuteMasked(indices Uint8x16, mask Mask8x16) Int8x16
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7207,7 +7207,7 @@ func (x Int8x16) PermuteMasked(indices Uint8x16, mask Mask8x16) Int8x16
// Asm: VPERMB, CPU Feature: AVX512VBMI // Asm: VPERMB, CPU Feature: AVX512VBMI
func (x Uint8x16) PermuteMasked(indices Uint8x16, mask Mask8x16) Uint8x16 func (x Uint8x16) PermuteMasked(indices Uint8x16, mask Mask8x16) Uint8x16
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7216,7 +7216,7 @@ func (x Uint8x16) PermuteMasked(indices Uint8x16, mask Mask8x16) Uint8x16
// Asm: VPERMB, CPU Feature: AVX512VBMI // Asm: VPERMB, CPU Feature: AVX512VBMI
func (x Int8x32) PermuteMasked(indices Uint8x32, mask Mask8x32) Int8x32 func (x Int8x32) PermuteMasked(indices Uint8x32, mask Mask8x32) Int8x32
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7225,7 +7225,7 @@ func (x Int8x32) PermuteMasked(indices Uint8x32, mask Mask8x32) Int8x32
// Asm: VPERMB, CPU Feature: AVX512VBMI // Asm: VPERMB, CPU Feature: AVX512VBMI
func (x Uint8x32) PermuteMasked(indices Uint8x32, mask Mask8x32) Uint8x32 func (x Uint8x32) PermuteMasked(indices Uint8x32, mask Mask8x32) Uint8x32
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7234,7 +7234,7 @@ func (x Uint8x32) PermuteMasked(indices Uint8x32, mask Mask8x32) Uint8x32
// Asm: VPERMB, CPU Feature: AVX512VBMI // Asm: VPERMB, CPU Feature: AVX512VBMI
func (x Int8x64) PermuteMasked(indices Uint8x64, mask Mask8x64) Int8x64 func (x Int8x64) PermuteMasked(indices Uint8x64, mask Mask8x64) Int8x64
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7243,7 +7243,7 @@ func (x Int8x64) PermuteMasked(indices Uint8x64, mask Mask8x64) Int8x64
// Asm: VPERMB, CPU Feature: AVX512VBMI // Asm: VPERMB, CPU Feature: AVX512VBMI
func (x Uint8x64) PermuteMasked(indices Uint8x64, mask Mask8x64) Uint8x64 func (x Uint8x64) PermuteMasked(indices Uint8x64, mask Mask8x64) Uint8x64
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7252,7 +7252,7 @@ func (x Uint8x64) PermuteMasked(indices Uint8x64, mask Mask8x64) Uint8x64
// Asm: VPERMW, CPU Feature: AVX512BW // Asm: VPERMW, CPU Feature: AVX512BW
func (x Int16x8) PermuteMasked(indices Uint16x8, mask Mask16x8) Int16x8 func (x Int16x8) PermuteMasked(indices Uint16x8, mask Mask16x8) Int16x8
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7261,7 +7261,7 @@ func (x Int16x8) PermuteMasked(indices Uint16x8, mask Mask16x8) Int16x8
// Asm: VPERMW, CPU Feature: AVX512BW // Asm: VPERMW, CPU Feature: AVX512BW
func (x Uint16x8) PermuteMasked(indices Uint16x8, mask Mask16x8) Uint16x8 func (x Uint16x8) PermuteMasked(indices Uint16x8, mask Mask16x8) Uint16x8
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7270,7 +7270,7 @@ func (x Uint16x8) PermuteMasked(indices Uint16x8, mask Mask16x8) Uint16x8
// Asm: VPERMW, CPU Feature: AVX512BW // Asm: VPERMW, CPU Feature: AVX512BW
func (x Int16x16) PermuteMasked(indices Uint16x16, mask Mask16x16) Int16x16 func (x Int16x16) PermuteMasked(indices Uint16x16, mask Mask16x16) Int16x16
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7279,7 +7279,7 @@ func (x Int16x16) PermuteMasked(indices Uint16x16, mask Mask16x16) Int16x16
// Asm: VPERMW, CPU Feature: AVX512BW // Asm: VPERMW, CPU Feature: AVX512BW
func (x Uint16x16) PermuteMasked(indices Uint16x16, mask Mask16x16) Uint16x16 func (x Uint16x16) PermuteMasked(indices Uint16x16, mask Mask16x16) Uint16x16
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7288,7 +7288,7 @@ func (x Uint16x16) PermuteMasked(indices Uint16x16, mask Mask16x16) Uint16x16
// Asm: VPERMW, CPU Feature: AVX512BW // Asm: VPERMW, CPU Feature: AVX512BW
func (x Int16x32) PermuteMasked(indices Uint16x32, mask Mask16x32) Int16x32 func (x Int16x32) PermuteMasked(indices Uint16x32, mask Mask16x32) Int16x32
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7297,7 +7297,7 @@ func (x Int16x32) PermuteMasked(indices Uint16x32, mask Mask16x32) Int16x32
// Asm: VPERMW, CPU Feature: AVX512BW // Asm: VPERMW, CPU Feature: AVX512BW
func (x Uint16x32) PermuteMasked(indices Uint16x32, mask Mask16x32) Uint16x32 func (x Uint16x32) PermuteMasked(indices Uint16x32, mask Mask16x32) Uint16x32
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7306,7 +7306,7 @@ func (x Uint16x32) PermuteMasked(indices Uint16x32, mask Mask16x32) Uint16x32
// Asm: VPERMPS, CPU Feature: AVX512F // Asm: VPERMPS, CPU Feature: AVX512F
func (x Float32x8) PermuteMasked(indices Uint32x8, mask Mask32x8) Float32x8 func (x Float32x8) PermuteMasked(indices Uint32x8, mask Mask32x8) Float32x8
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7315,7 +7315,7 @@ func (x Float32x8) PermuteMasked(indices Uint32x8, mask Mask32x8) Float32x8
// Asm: VPERMD, CPU Feature: AVX512F // Asm: VPERMD, CPU Feature: AVX512F
func (x Int32x8) PermuteMasked(indices Uint32x8, mask Mask32x8) Int32x8 func (x Int32x8) PermuteMasked(indices Uint32x8, mask Mask32x8) Int32x8
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7324,7 +7324,7 @@ func (x Int32x8) PermuteMasked(indices Uint32x8, mask Mask32x8) Int32x8
// Asm: VPERMD, CPU Feature: AVX512F // Asm: VPERMD, CPU Feature: AVX512F
func (x Uint32x8) PermuteMasked(indices Uint32x8, mask Mask32x8) Uint32x8 func (x Uint32x8) PermuteMasked(indices Uint32x8, mask Mask32x8) Uint32x8
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7333,7 +7333,7 @@ func (x Uint32x8) PermuteMasked(indices Uint32x8, mask Mask32x8) Uint32x8
// Asm: VPERMPS, CPU Feature: AVX512F // Asm: VPERMPS, CPU Feature: AVX512F
func (x Float32x16) PermuteMasked(indices Uint32x16, mask Mask32x16) Float32x16 func (x Float32x16) PermuteMasked(indices Uint32x16, mask Mask32x16) Float32x16
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7342,7 +7342,7 @@ func (x Float32x16) PermuteMasked(indices Uint32x16, mask Mask32x16) Float32x16
// Asm: VPERMD, CPU Feature: AVX512F // Asm: VPERMD, CPU Feature: AVX512F
func (x Int32x16) PermuteMasked(indices Uint32x16, mask Mask32x16) Int32x16 func (x Int32x16) PermuteMasked(indices Uint32x16, mask Mask32x16) Int32x16
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7351,7 +7351,7 @@ func (x Int32x16) PermuteMasked(indices Uint32x16, mask Mask32x16) Int32x16
// Asm: VPERMD, CPU Feature: AVX512F // Asm: VPERMD, CPU Feature: AVX512F
func (x Uint32x16) PermuteMasked(indices Uint32x16, mask Mask32x16) Uint32x16 func (x Uint32x16) PermuteMasked(indices Uint32x16, mask Mask32x16) Uint32x16
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7360,7 +7360,7 @@ func (x Uint32x16) PermuteMasked(indices Uint32x16, mask Mask32x16) Uint32x16
// Asm: VPERMPD, CPU Feature: AVX512F // Asm: VPERMPD, CPU Feature: AVX512F
func (x Float64x4) PermuteMasked(indices Uint64x4, mask Mask64x4) Float64x4 func (x Float64x4) PermuteMasked(indices Uint64x4, mask Mask64x4) Float64x4
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7369,7 +7369,7 @@ func (x Float64x4) PermuteMasked(indices Uint64x4, mask Mask64x4) Float64x4
// Asm: VPERMQ, CPU Feature: AVX512F // Asm: VPERMQ, CPU Feature: AVX512F
func (x Int64x4) PermuteMasked(indices Uint64x4, mask Mask64x4) Int64x4 func (x Int64x4) PermuteMasked(indices Uint64x4, mask Mask64x4) Int64x4
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7378,7 +7378,7 @@ func (x Int64x4) PermuteMasked(indices Uint64x4, mask Mask64x4) Int64x4
// Asm: VPERMQ, CPU Feature: AVX512F // Asm: VPERMQ, CPU Feature: AVX512F
func (x Uint64x4) PermuteMasked(indices Uint64x4, mask Mask64x4) Uint64x4 func (x Uint64x4) PermuteMasked(indices Uint64x4, mask Mask64x4) Uint64x4
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7387,7 +7387,7 @@ func (x Uint64x4) PermuteMasked(indices Uint64x4, mask Mask64x4) Uint64x4
// Asm: VPERMPD, CPU Feature: AVX512F // Asm: VPERMPD, CPU Feature: AVX512F
func (x Float64x8) PermuteMasked(indices Uint64x8, mask Mask64x8) Float64x8 func (x Float64x8) PermuteMasked(indices Uint64x8, mask Mask64x8) Float64x8
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //
@ -7396,7 +7396,7 @@ func (x Float64x8) PermuteMasked(indices Uint64x8, mask Mask64x8) Float64x8
// Asm: VPERMQ, CPU Feature: AVX512F // Asm: VPERMQ, CPU Feature: AVX512F
func (x Int64x8) PermuteMasked(indices Uint64x8, mask Mask64x8) Int64x8 func (x Int64x8) PermuteMasked(indices Uint64x8, mask Mask64x8) Int64x8
// PermuteMasked performs a full permutation of vector y using indices: // PermuteMasked performs a full permutation of vector x using indices:
// result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]} // result := {x[indices[0]], x[indices[1]], ..., x[indices[n]]}
// Only the needed bits to represent x's index are used in indices' elements. // Only the needed bits to represent x's index are used in indices' elements.
// //

View file

@ -461,7 +461,7 @@ func testMergeLocalswrapper(t *testing.T, op func(simd.Int64x4, simd.Int64x4) si
} }
} }
func TestBitMask(t *testing.T) { func TestBitMaskLoad(t *testing.T) {
if !simd.HasAVX512() { if !simd.HasAVX512() {
t.Skip("Test requires HasAVX512, not available on this hardware") t.Skip("Test requires HasAVX512, not available on this hardware")
return return
@ -477,3 +477,19 @@ func TestBitMask(t *testing.T) {
} }
} }
} }
func TestBitMaskStore(t *testing.T) {
if !simd.HasAVX512() {
t.Skip("Test requires HasAVX512, not available on this hardware")
return
}
var want uint64 = 0b101
var got uint64
x := simd.LoadInt32x4Slice([]int32{1, 2, 3, 4})
y := simd.LoadInt32x4Slice([]int32{5, 0, 5, 0})
m := y.Greater(x)
m.StoreToBits(&got)
if got != want {
t.Errorf("Result incorrect: want %b, got %b", want, got)
}
}

View file

@ -205,48 +205,88 @@ type Mask8x16 struct {
vals [16]int8 vals [16]int8
} }
// Mask8x16FromBits constructs a Mask8x16 from an a bitmap, where 1 means set for the indexed element, 0 means unset. // LoadMask8x16FromBits constructs a Mask8x16 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used. // Only the lower 16 bits of y are used.
// //
// CPU Features: AVX512
//
//go:noescape //go:noescape
func LoadMask8x16FromBits(y *uint64) Mask8x16 func LoadMask8x16FromBits(y *uint64) Mask8x16
// StoreToBits stores a Mask8x16 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask8x16) StoreToBits(y *uint64)
// Mask16x8 is a 128-bit SIMD vector of 8 int16 // Mask16x8 is a 128-bit SIMD vector of 8 int16
type Mask16x8 struct { type Mask16x8 struct {
int16x8 v128 int16x8 v128
vals [8]int16 vals [8]int16
} }
// Mask16x8FromBits constructs a Mask16x8 from an a bitmap, where 1 means set for the indexed element, 0 means unset. // LoadMask16x8FromBits constructs a Mask16x8 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used. // Only the lower 8 bits of y are used.
// //
// CPU Features: AVX512
//
//go:noescape //go:noescape
func LoadMask16x8FromBits(y *uint64) Mask16x8 func LoadMask16x8FromBits(y *uint64) Mask16x8
// StoreToBits stores a Mask16x8 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask16x8) StoreToBits(y *uint64)
// Mask32x4 is a 128-bit SIMD vector of 4 int32 // Mask32x4 is a 128-bit SIMD vector of 4 int32
type Mask32x4 struct { type Mask32x4 struct {
int32x4 v128 int32x4 v128
vals [4]int32 vals [4]int32
} }
// Mask32x4FromBits constructs a Mask32x4 from an a bitmap, where 1 means set for the indexed element, 0 means unset. // LoadMask32x4FromBits constructs a Mask32x4 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 4 bits of y are used. // Only the lower 4 bits of y are used.
// //
// CPU Features: AVX512
//
//go:noescape //go:noescape
func LoadMask32x4FromBits(y *uint64) Mask32x4 func LoadMask32x4FromBits(y *uint64) Mask32x4
// StoreToBits stores a Mask32x4 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 4 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask32x4) StoreToBits(y *uint64)
// Mask64x2 is a 128-bit SIMD vector of 2 int64 // Mask64x2 is a 128-bit SIMD vector of 2 int64
type Mask64x2 struct { type Mask64x2 struct {
int64x2 v128 int64x2 v128
vals [2]int64 vals [2]int64
} }
// Mask64x2FromBits constructs a Mask64x2 from an a bitmap, where 1 means set for the indexed element, 0 means unset. // LoadMask64x2FromBits constructs a Mask64x2 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 2 bits of y are used. // Only the lower 2 bits of y are used.
// //
// CPU Features: AVX512
//
//go:noescape //go:noescape
func LoadMask64x2FromBits(y *uint64) Mask64x2 func LoadMask64x2FromBits(y *uint64) Mask64x2
// StoreToBits stores a Mask64x2 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 2 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask64x2) StoreToBits(y *uint64)
// v256 is a tag type that tells the compiler that this is really 256-bit SIMD // v256 is a tag type that tells the compiler that this is really 256-bit SIMD
type v256 struct { type v256 struct {
_256 struct{} _256 struct{}
@ -448,48 +488,88 @@ type Mask8x32 struct {
vals [32]int8 vals [32]int8
} }
// Mask8x32FromBits constructs a Mask8x32 from an a bitmap, where 1 means set for the indexed element, 0 means unset. // LoadMask8x32FromBits constructs a Mask8x32 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 32 bits of y are used. // Only the lower 32 bits of y are used.
// //
// CPU Features: AVX512
//
//go:noescape //go:noescape
func LoadMask8x32FromBits(y *uint64) Mask8x32 func LoadMask8x32FromBits(y *uint64) Mask8x32
// StoreToBits stores a Mask8x32 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 32 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask8x32) StoreToBits(y *uint64)
// Mask16x16 is a 256-bit SIMD vector of 16 int16 // Mask16x16 is a 256-bit SIMD vector of 16 int16
type Mask16x16 struct { type Mask16x16 struct {
int16x16 v256 int16x16 v256
vals [16]int16 vals [16]int16
} }
// Mask16x16FromBits constructs a Mask16x16 from an a bitmap, where 1 means set for the indexed element, 0 means unset. // LoadMask16x16FromBits constructs a Mask16x16 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used. // Only the lower 16 bits of y are used.
// //
// CPU Features: AVX512
//
//go:noescape //go:noescape
func LoadMask16x16FromBits(y *uint64) Mask16x16 func LoadMask16x16FromBits(y *uint64) Mask16x16
// StoreToBits stores a Mask16x16 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask16x16) StoreToBits(y *uint64)
// Mask32x8 is a 256-bit SIMD vector of 8 int32 // Mask32x8 is a 256-bit SIMD vector of 8 int32
type Mask32x8 struct { type Mask32x8 struct {
int32x8 v256 int32x8 v256
vals [8]int32 vals [8]int32
} }
// Mask32x8FromBits constructs a Mask32x8 from an a bitmap, where 1 means set for the indexed element, 0 means unset. // LoadMask32x8FromBits constructs a Mask32x8 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used. // Only the lower 8 bits of y are used.
// //
// CPU Features: AVX512
//
//go:noescape //go:noescape
func LoadMask32x8FromBits(y *uint64) Mask32x8 func LoadMask32x8FromBits(y *uint64) Mask32x8
// StoreToBits stores a Mask32x8 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask32x8) StoreToBits(y *uint64)
// Mask64x4 is a 256-bit SIMD vector of 4 int64 // Mask64x4 is a 256-bit SIMD vector of 4 int64
type Mask64x4 struct { type Mask64x4 struct {
int64x4 v256 int64x4 v256
vals [4]int64 vals [4]int64
} }
// Mask64x4FromBits constructs a Mask64x4 from an a bitmap, where 1 means set for the indexed element, 0 means unset. // LoadMask64x4FromBits constructs a Mask64x4 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 4 bits of y are used. // Only the lower 4 bits of y are used.
// //
// CPU Features: AVX512
//
//go:noescape //go:noescape
func LoadMask64x4FromBits(y *uint64) Mask64x4 func LoadMask64x4FromBits(y *uint64) Mask64x4
// StoreToBits stores a Mask64x4 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 4 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask64x4) StoreToBits(y *uint64)
// v512 is a tag type that tells the compiler that this is really 512-bit SIMD // v512 is a tag type that tells the compiler that this is really 512-bit SIMD
type v512 struct { type v512 struct {
_512 struct{} _512 struct{}
@ -691,44 +771,84 @@ type Mask8x64 struct {
vals [64]int8 vals [64]int8
} }
// Mask8x64FromBits constructs a Mask8x64 from an a bitmap, where 1 means set for the indexed element, 0 means unset. // LoadMask8x64FromBits constructs a Mask8x64 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 64 bits of y are used. // Only the lower 64 bits of y are used.
// //
// CPU Features: AVX512
//
//go:noescape //go:noescape
func LoadMask8x64FromBits(y *uint64) Mask8x64 func LoadMask8x64FromBits(y *uint64) Mask8x64
// StoreToBits stores a Mask8x64 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 64 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask8x64) StoreToBits(y *uint64)
// Mask16x32 is a 512-bit SIMD vector of 32 int16 // Mask16x32 is a 512-bit SIMD vector of 32 int16
type Mask16x32 struct { type Mask16x32 struct {
int16x32 v512 int16x32 v512
vals [32]int16 vals [32]int16
} }
// Mask16x32FromBits constructs a Mask16x32 from an a bitmap, where 1 means set for the indexed element, 0 means unset. // LoadMask16x32FromBits constructs a Mask16x32 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 32 bits of y are used. // Only the lower 32 bits of y are used.
// //
// CPU Features: AVX512
//
//go:noescape //go:noescape
func LoadMask16x32FromBits(y *uint64) Mask16x32 func LoadMask16x32FromBits(y *uint64) Mask16x32
// StoreToBits stores a Mask16x32 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 32 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask16x32) StoreToBits(y *uint64)
// Mask32x16 is a 512-bit SIMD vector of 16 int32 // Mask32x16 is a 512-bit SIMD vector of 16 int32
type Mask32x16 struct { type Mask32x16 struct {
int32x16 v512 int32x16 v512
vals [16]int32 vals [16]int32
} }
// Mask32x16FromBits constructs a Mask32x16 from an a bitmap, where 1 means set for the indexed element, 0 means unset. // LoadMask32x16FromBits constructs a Mask32x16 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used. // Only the lower 16 bits of y are used.
// //
// CPU Features: AVX512
//
//go:noescape //go:noescape
func LoadMask32x16FromBits(y *uint64) Mask32x16 func LoadMask32x16FromBits(y *uint64) Mask32x16
// StoreToBits stores a Mask32x16 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask32x16) StoreToBits(y *uint64)
// Mask64x8 is a 512-bit SIMD vector of 8 int64 // Mask64x8 is a 512-bit SIMD vector of 8 int64
type Mask64x8 struct { type Mask64x8 struct {
int64x8 v512 int64x8 v512
vals [8]int64 vals [8]int64
} }
// Mask64x8FromBits constructs a Mask64x8 from an a bitmap, where 1 means set for the indexed element, 0 means unset. // LoadMask64x8FromBits constructs a Mask64x8 from a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used. // Only the lower 8 bits of y are used.
// //
// CPU Features: AVX512
//
//go:noescape //go:noescape
func LoadMask64x8FromBits(y *uint64) Mask64x8 func LoadMask64x8FromBits(y *uint64) Mask64x8
// StoreToBits stores a Mask64x8 as a bitmap, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used.
//
// CPU Features: AVX512
//
//go:noescape
func (x Mask64x8) StoreToBits(y *uint64)