[dev.simd] cmd/compile: generated code for K-mask-register slice load/stores

plus slice-part load, store and test for a single type.

Generated by arch/internal/simdgen CL 690315

Change-Id: I58052728b544c4a772a2870ac68f3c832813e1ea
Reviewed-on: https://go-review.googlesource.com/c/go/+/690336
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Junyang Shao <shaojunyang@google.com>
This commit is contained in:
David Chase 2025-07-24 10:31:46 -04:00
parent 1ac5f3533f
commit c25e5c86b2
4 changed files with 352 additions and 0 deletions

View file

@ -2148,26 +2148,54 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Float32x4.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
addF(simdPackage, "LoadMaskedFloat32x8", simdMaskedLoad(ssa.OpLoadMasked32), sys.AMD64)
addF(simdPackage, "Float32x8.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
addF(simdPackage, "LoadMaskedFloat32x16", simdMaskedLoad(ssa.OpLoadMasked32), sys.AMD64)
addF(simdPackage, "Float32x16.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
addF(simdPackage, "LoadMaskedFloat64x2", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
addF(simdPackage, "Float64x2.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
addF(simdPackage, "LoadMaskedFloat64x4", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
addF(simdPackage, "Float64x4.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
addF(simdPackage, "LoadMaskedFloat64x8", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
addF(simdPackage, "Float64x8.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
addF(simdPackage, "LoadMaskedInt8x64", simdMaskedLoad(ssa.OpLoadMasked8), sys.AMD64)
addF(simdPackage, "Int8x64.StoreMasked", simdMaskedStore(ssa.OpStoreMasked8), sys.AMD64)
addF(simdPackage, "LoadMaskedInt16x32", simdMaskedLoad(ssa.OpLoadMasked16), sys.AMD64)
addF(simdPackage, "Int16x32.StoreMasked", simdMaskedStore(ssa.OpStoreMasked16), sys.AMD64)
addF(simdPackage, "LoadMaskedInt32x4", simdMaskedLoad(ssa.OpLoadMasked32), sys.AMD64)
addF(simdPackage, "Int32x4.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
addF(simdPackage, "LoadMaskedInt32x8", simdMaskedLoad(ssa.OpLoadMasked32), sys.AMD64)
addF(simdPackage, "Int32x8.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
addF(simdPackage, "LoadMaskedInt32x16", simdMaskedLoad(ssa.OpLoadMasked32), sys.AMD64)
addF(simdPackage, "Int32x16.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
addF(simdPackage, "LoadMaskedInt64x2", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
addF(simdPackage, "Int64x2.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
addF(simdPackage, "LoadMaskedInt64x4", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
addF(simdPackage, "Int64x4.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
addF(simdPackage, "LoadMaskedInt64x8", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
addF(simdPackage, "Int64x8.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
addF(simdPackage, "LoadMaskedUint8x64", simdMaskedLoad(ssa.OpLoadMasked8), sys.AMD64)
addF(simdPackage, "Uint8x64.StoreMasked", simdMaskedStore(ssa.OpStoreMasked8), sys.AMD64)
addF(simdPackage, "LoadMaskedUint16x32", simdMaskedLoad(ssa.OpLoadMasked16), sys.AMD64)
addF(simdPackage, "Uint16x32.StoreMasked", simdMaskedStore(ssa.OpStoreMasked16), sys.AMD64)
addF(simdPackage, "LoadMaskedUint32x4", simdMaskedLoad(ssa.OpLoadMasked32), sys.AMD64)
addF(simdPackage, "Uint32x4.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
addF(simdPackage, "LoadMaskedUint32x8", simdMaskedLoad(ssa.OpLoadMasked32), sys.AMD64)
addF(simdPackage, "Uint32x8.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
addF(simdPackage, "LoadMaskedUint32x16", simdMaskedLoad(ssa.OpLoadMasked32), sys.AMD64)
addF(simdPackage, "Uint32x16.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
addF(simdPackage, "LoadMaskedUint64x2", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
addF(simdPackage, "Uint64x2.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
addF(simdPackage, "LoadMaskedUint64x4", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
addF(simdPackage, "Uint64x4.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
addF(simdPackage, "LoadMaskedUint64x8", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
addF(simdPackage, "Uint64x8.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
addF(simdPackage, "LoadMaskedMask8x64", simdMaskedLoad(ssa.OpLoadMasked8), sys.AMD64)
addF(simdPackage, "Mask8x64.StoreMasked", simdMaskedStore(ssa.OpStoreMasked8), sys.AMD64)
addF(simdPackage, "LoadMaskedMask16x32", simdMaskedLoad(ssa.OpLoadMasked16), sys.AMD64)
addF(simdPackage, "Mask16x32.StoreMasked", simdMaskedStore(ssa.OpStoreMasked16), sys.AMD64)
addF(simdPackage, "LoadMaskedMask32x16", simdMaskedLoad(ssa.OpLoadMasked32), sys.AMD64)
addF(simdPackage, "Mask32x16.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
addF(simdPackage, "LoadMaskedMask64x8", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
addF(simdPackage, "Mask64x8.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
addF(simdPackage, "Mask8x16.AsInt8x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int8x16.AsMask8x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask8x16.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64)

View file

@ -419,6 +419,24 @@ func paInt64x4(s []int64) *[4]int64 {
return (*[4]int64)(unsafe.Pointer(&s[0]))
}
// For 512-bit masked loads/stores
// paInt64x8 reinterprets the backing storage of s as a *[8]int64.
// It deliberately goes through unsafe.Pointer rather than the direct
// slice-to-array-pointer conversion, because callers pass slices
// shorter than 8 (masked loads/stores touch only len(s) elements)
// and the direct conversion would panic on a short slice.
// s must be non-empty; &s[0] panics otherwise.
func paInt64x8(s []int64) *[8]int64 {
	base := unsafe.Pointer(&s[0])
	return (*[8]int64)(base)
}
// paInt32x16 reinterprets the backing storage of s as a *[16]int32.
// unsafe.Pointer is used instead of a direct slice-to-array-pointer
// conversion so that slices shorter than 16 elements (the masked
// slice-part case) do not panic. s must be non-empty.
func paInt32x16(s []int32) *[16]int32 {
	base := unsafe.Pointer(&s[0])
	return (*[16]int32)(base)
}
// paInt16x32 reinterprets the backing storage of s as a *[32]int16.
// unsafe.Pointer is used instead of a direct slice-to-array-pointer
// conversion so that slices shorter than 32 elements (the masked
// slice-part case) do not panic. s must be non-empty.
func paInt16x32(s []int16) *[32]int16 {
	base := unsafe.Pointer(&s[0])
	return (*[32]int16)(base)
}
// paInt8x64 reinterprets the backing storage of s as a *[64]int8.
// unsafe.Pointer is used instead of a direct slice-to-array-pointer
// conversion so that slices shorter than 64 elements (the masked
// slice-part case) do not panic. s must be non-empty.
func paInt8x64(s []int8) *[64]int8 {
	base := unsafe.Pointer(&s[0])
	return (*[64]int8)(base)
}
/* 32 and 64-bit slice-part loads for AVX2 (128 and 256 bit) */
// LoadInt32x4SlicePart loads a Int32x4 from the slice s.
@ -742,3 +760,30 @@ func (x Float64x4) StoreSlicePart(s []float64) {
t := unsafe.Slice((*int64)(unsafe.Pointer(&s[0])), len(s))
x.AsInt64x4().StoreSlicePart(t)
}
// LoadInt64x8SlicePart loads an Int64x8 from the slice s.
// If s has fewer than 8 elements, the remaining vector lanes
// are zeroed; if it has 8 or more, the first 8 are loaded.
func LoadInt64x8SlicePart(s []int64) Int64x8 {
	switch l := len(s); {
	case l >= 8:
		// Full vector available: plain (unmasked) load.
		return LoadInt64x8Slice(s)
	case l == 0:
		// Nothing to read; return the zero vector.
		var zero Int64x8
		return zero
	default:
		// Partial load: enable exactly the low l lanes.
		m := Mask64x8FromBits(0xff >> (8 - l))
		return LoadMaskedInt64x8(paInt64x8(s), m)
	}
}
// StoreSlicePart stores the first len(s) elements of x into s.
// If s has 8 or more elements, all 8 lanes are stored; a zero-length
// slice is a no-op.
func (x Int64x8) StoreSlicePart(s []int64) {
	switch l := len(s); {
	case l >= 8:
		// Destination holds the whole vector: unmasked store.
		x.StoreSlice(s)
	case l == 0:
		// Nothing to write.
	default:
		// Partial store: enable exactly the low l lanes.
		x.StoreMasked(paInt64x8(s), Mask64x8FromBits(0xff>>(8-l)))
	}
}

View file

@ -341,3 +341,50 @@ func TestSlicePartFloat32(t *testing.T) {
}
}
}
// 512-bit load
// TestSlicePartInt64 exercises the 512-bit slice-part load and store
// (LoadInt64x8SlicePart / Int64x8.StoreSlicePart) for every partial
// length from 0 through a full vector, verifying that untouched lanes
// load as zero and that a partial store does not write past len(s).
func TestSlicePartInt64(t *testing.T) {
	if !simd.HasAVX512() {
		// t.Skip calls SkipNow, which never returns, so no
		// explicit return is needed after it.
		t.Skip("Test requires HasAVX512, not available on this hardware")
	}
	L := 8
	c := []int64{1, 2, 3, 4, 5, 6, 7, 8, 86, 86, 86, 86}
	a := c[:L+1]
	for i := range a {
		// Test the load first.
		// e is a partial slice.
		e := a[i:]
		v := simd.LoadInt64x8SlicePart(e)
		// d contains what v ought to contain: the elements of e,
		// zero-padded out to a full vector of L lanes.
		d := make([]int64, L)
		copy(d, e)
		b := make([]int64, L)
		v.StoreSlice(b)
		// Check the load.
		checkSlicesLogInput(t, b, d, func() { t.Helper(); t.Logf("Len(e)=%d", len(e)) })
		// Test the store: fill f with a sentinel so any write past
		// len(e) is detected below. (Use j, not i, to avoid shadowing
		// the outer loop variable.)
		f := make([]int64, L+1)
		for j := range f {
			f[j] = 99
		}
		v.StoreSlicePart(f[:len(e)])
		if len(e) < len(b) {
			checkSlices(t, f, b[:len(e)])
		} else {
			checkSlices(t, f, b)
		}
		// Everything beyond the stored part must keep the sentinel.
		for j := len(e); j < len(f); j++ {
			if f[j] != 99 {
				t.Errorf("StoreSlicePart altered f[%d], expected 99, saw %v", j, f[j])
			}
		}
	}
}

View file

@ -31,12 +31,16 @@ func (x Float32x4) Store(y *[4]float32)
// LoadMaskedFloat32x4 loads a Float32x4 from an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVD, CPU Feature: AVX2
//
//go:noescape
func LoadMaskedFloat32x4(y *[4]float32, mask Mask32x4) Float32x4
// StoreMasked stores a Float32x4 to an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVD, CPU Feature: AVX2
//
//go:noescape
func (x Float32x4) StoreMasked(y *[4]float32, mask Mask32x4)
@ -62,12 +66,16 @@ func (x Float64x2) Store(y *[2]float64)
// LoadMaskedFloat64x2 loads a Float64x2 from an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVQ, CPU Feature: AVX2
//
//go:noescape
func LoadMaskedFloat64x2(y *[2]float64, mask Mask64x2) Float64x2
// StoreMasked stores a Float64x2 to an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVQ, CPU Feature: AVX2
//
//go:noescape
func (x Float64x2) StoreMasked(y *[2]float64, mask Mask64x2)
@ -131,12 +139,16 @@ func (x Int32x4) Store(y *[4]int32)
// LoadMaskedInt32x4 loads a Int32x4 from an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVD, CPU Feature: AVX2
//
//go:noescape
func LoadMaskedInt32x4(y *[4]int32, mask Mask32x4) Int32x4
// StoreMasked stores a Int32x4 to an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVD, CPU Feature: AVX2
//
//go:noescape
func (x Int32x4) StoreMasked(y *[4]int32, mask Mask32x4)
@ -162,12 +174,16 @@ func (x Int64x2) Store(y *[2]int64)
// LoadMaskedInt64x2 loads a Int64x2 from an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVQ, CPU Feature: AVX2
//
//go:noescape
func LoadMaskedInt64x2(y *[2]int64, mask Mask64x2) Int64x2
// StoreMasked stores a Int64x2 to an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVQ, CPU Feature: AVX2
//
//go:noescape
func (x Int64x2) StoreMasked(y *[2]int64, mask Mask64x2)
@ -231,12 +247,16 @@ func (x Uint32x4) Store(y *[4]uint32)
// LoadMaskedUint32x4 loads a Uint32x4 from an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVD, CPU Feature: AVX2
//
//go:noescape
func LoadMaskedUint32x4(y *[4]uint32, mask Mask32x4) Uint32x4
// StoreMasked stores a Uint32x4 to an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVD, CPU Feature: AVX2
//
//go:noescape
func (x Uint32x4) StoreMasked(y *[4]uint32, mask Mask32x4)
@ -262,12 +282,16 @@ func (x Uint64x2) Store(y *[2]uint64)
// LoadMaskedUint64x2 loads a Uint64x2 from an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVQ, CPU Feature: AVX2
//
//go:noescape
func LoadMaskedUint64x2(y *[2]uint64, mask Mask64x2) Uint64x2
// StoreMasked stores a Uint64x2 to an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVQ, CPU Feature: AVX2
//
//go:noescape
func (x Uint64x2) StoreMasked(y *[2]uint64, mask Mask64x2)
@ -295,6 +319,8 @@ func (x Mask8x16) StoreToBits(y *uint64)
// Mask8x16FromBits constructs a Mask8x16 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used.
//
// Asm: KMOVB, CPU Feature: AVX512
func Mask8x16FromBits(y uint16) Mask8x16
// Mask16x8 is a 128-bit SIMD vector of 8 int16
@ -321,6 +347,8 @@ func (x Mask16x8) StoreToBits(y *uint64)
// Mask16x8FromBits constructs a Mask16x8 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used.
//
// Asm: KMOVW, CPU Feature: AVX512
func Mask16x8FromBits(y uint8) Mask16x8
// Mask32x4 is a 128-bit SIMD vector of 4 int32
@ -347,6 +375,8 @@ func (x Mask32x4) StoreToBits(y *uint64)
// Mask32x4FromBits constructs a Mask32x4 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 4 bits of y are used.
//
// Asm: KMOVD, CPU Feature: AVX512
func Mask32x4FromBits(y uint8) Mask32x4
// Mask64x2 is a 128-bit SIMD vector of 2 int64
@ -373,6 +403,8 @@ func (x Mask64x2) StoreToBits(y *uint64)
// Mask64x2FromBits constructs a Mask64x2 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 2 bits of y are used.
//
// Asm: KMOVQ, CPU Feature: AVX512
func Mask64x2FromBits(y uint8) Mask64x2
// v256 is a tag type that tells the compiler that this is really 256-bit SIMD
@ -402,12 +434,16 @@ func (x Float32x8) Store(y *[8]float32)
// LoadMaskedFloat32x8 loads a Float32x8 from an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVD, CPU Feature: AVX2
//
//go:noescape
func LoadMaskedFloat32x8(y *[8]float32, mask Mask32x8) Float32x8
// StoreMasked stores a Float32x8 to an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVD, CPU Feature: AVX2
//
//go:noescape
func (x Float32x8) StoreMasked(y *[8]float32, mask Mask32x8)
@ -433,12 +469,16 @@ func (x Float64x4) Store(y *[4]float64)
// LoadMaskedFloat64x4 loads a Float64x4 from an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVQ, CPU Feature: AVX2
//
//go:noescape
func LoadMaskedFloat64x4(y *[4]float64, mask Mask64x4) Float64x4
// StoreMasked stores a Float64x4 to an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVQ, CPU Feature: AVX2
//
//go:noescape
func (x Float64x4) StoreMasked(y *[4]float64, mask Mask64x4)
@ -502,12 +542,16 @@ func (x Int32x8) Store(y *[8]int32)
// LoadMaskedInt32x8 loads a Int32x8 from an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVD, CPU Feature: AVX2
//
//go:noescape
func LoadMaskedInt32x8(y *[8]int32, mask Mask32x8) Int32x8
// StoreMasked stores a Int32x8 to an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVD, CPU Feature: AVX2
//
//go:noescape
func (x Int32x8) StoreMasked(y *[8]int32, mask Mask32x8)
@ -533,12 +577,16 @@ func (x Int64x4) Store(y *[4]int64)
// LoadMaskedInt64x4 loads a Int64x4 from an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVQ, CPU Feature: AVX2
//
//go:noescape
func LoadMaskedInt64x4(y *[4]int64, mask Mask64x4) Int64x4
// StoreMasked stores a Int64x4 to an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVQ, CPU Feature: AVX2
//
//go:noescape
func (x Int64x4) StoreMasked(y *[4]int64, mask Mask64x4)
@ -602,12 +650,16 @@ func (x Uint32x8) Store(y *[8]uint32)
// LoadMaskedUint32x8 loads a Uint32x8 from an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVD, CPU Feature: AVX2
//
//go:noescape
func LoadMaskedUint32x8(y *[8]uint32, mask Mask32x8) Uint32x8
// StoreMasked stores a Uint32x8 to an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVD, CPU Feature: AVX2
//
//go:noescape
func (x Uint32x8) StoreMasked(y *[8]uint32, mask Mask32x8)
@ -633,12 +685,16 @@ func (x Uint64x4) Store(y *[4]uint64)
// LoadMaskedUint64x4 loads a Uint64x4 from an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVQ, CPU Feature: AVX2
//
//go:noescape
func LoadMaskedUint64x4(y *[4]uint64, mask Mask64x4) Uint64x4
// StoreMasked stores a Uint64x4 to an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVQ, CPU Feature: AVX2
//
//go:noescape
func (x Uint64x4) StoreMasked(y *[4]uint64, mask Mask64x4)
@ -666,6 +722,8 @@ func (x Mask8x32) StoreToBits(y *uint64)
// Mask8x32FromBits constructs a Mask8x32 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 32 bits of y are used.
//
// Asm: KMOVB, CPU Feature: AVX512
func Mask8x32FromBits(y uint32) Mask8x32
// Mask16x16 is a 256-bit SIMD vector of 16 int16
@ -692,6 +750,8 @@ func (x Mask16x16) StoreToBits(y *uint64)
// Mask16x16FromBits constructs a Mask16x16 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used.
//
// Asm: KMOVW, CPU Feature: AVX512
func Mask16x16FromBits(y uint16) Mask16x16
// Mask32x8 is a 256-bit SIMD vector of 8 int32
@ -718,6 +778,8 @@ func (x Mask32x8) StoreToBits(y *uint64)
// Mask32x8FromBits constructs a Mask32x8 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used.
//
// Asm: KMOVD, CPU Feature: AVX512
func Mask32x8FromBits(y uint8) Mask32x8
// Mask64x4 is a 256-bit SIMD vector of 4 int64
@ -744,6 +806,8 @@ func (x Mask64x4) StoreToBits(y *uint64)
// Mask64x4FromBits constructs a Mask64x4 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 4 bits of y are used.
//
// Asm: KMOVQ, CPU Feature: AVX512
func Mask64x4FromBits(y uint8) Mask64x4
// v512 is a tag type that tells the compiler that this is really 512-bit SIMD
@ -770,6 +834,22 @@ func LoadFloat32x16(y *[16]float32) Float32x16
//go:noescape
func (x Float32x16) Store(y *[16]float32)
// LoadMaskedFloat32x16 loads a Float32x16 from an array,
// at those elements enabled by mask
//
// Asm: VMOVDQU32.Z, CPU Feature: AVX512
//
//go:noescape
func LoadMaskedFloat32x16(y *[16]float32, mask Mask32x16) Float32x16
// StoreMasked stores a Float32x16 to an array,
// at those elements enabled by mask
//
// Asm: VMOVDQU32, CPU Feature: AVX512
//
//go:noescape
func (x Float32x16) StoreMasked(y *[16]float32, mask Mask32x16)
// Float64x8 is a 512-bit SIMD vector of 8 float64
type Float64x8 struct {
float64x8 v512
@ -789,6 +869,22 @@ func LoadFloat64x8(y *[8]float64) Float64x8
//go:noescape
func (x Float64x8) Store(y *[8]float64)
// LoadMaskedFloat64x8 loads a Float64x8 from an array,
// at those elements enabled by mask
//
// Asm: VMOVDQU64.Z, CPU Feature: AVX512
//
//go:noescape
func LoadMaskedFloat64x8(y *[8]float64, mask Mask64x8) Float64x8
// StoreMasked stores a Float64x8 to an array,
// at those elements enabled by mask
//
// Asm: VMOVDQU64, CPU Feature: AVX512
//
//go:noescape
func (x Float64x8) StoreMasked(y *[8]float64, mask Mask64x8)
// Int8x64 is a 512-bit SIMD vector of 64 int8
type Int8x64 struct {
int8x64 v512
@ -808,6 +904,22 @@ func LoadInt8x64(y *[64]int8) Int8x64
//go:noescape
func (x Int8x64) Store(y *[64]int8)
// LoadMaskedInt8x64 loads a Int8x64 from an array,
// at those elements enabled by mask
//
// Asm: VMOVDQU8.Z, CPU Feature: AVX512
//
//go:noescape
func LoadMaskedInt8x64(y *[64]int8, mask Mask8x64) Int8x64
// StoreMasked stores a Int8x64 to an array,
// at those elements enabled by mask
//
// Asm: VMOVDQU8, CPU Feature: AVX512
//
//go:noescape
func (x Int8x64) StoreMasked(y *[64]int8, mask Mask8x64)
// Int16x32 is a 512-bit SIMD vector of 32 int16
type Int16x32 struct {
int16x32 v512
@ -827,6 +939,22 @@ func LoadInt16x32(y *[32]int16) Int16x32
//go:noescape
func (x Int16x32) Store(y *[32]int16)
// LoadMaskedInt16x32 loads a Int16x32 from an array,
// at those elements enabled by mask
//
// Asm: VMOVDQU16.Z, CPU Feature: AVX512
//
//go:noescape
func LoadMaskedInt16x32(y *[32]int16, mask Mask16x32) Int16x32
// StoreMasked stores a Int16x32 to an array,
// at those elements enabled by mask
//
// Asm: VMOVDQU16, CPU Feature: AVX512
//
//go:noescape
func (x Int16x32) StoreMasked(y *[32]int16, mask Mask16x32)
// Int32x16 is a 512-bit SIMD vector of 16 int32
type Int32x16 struct {
int32x16 v512
@ -846,6 +974,22 @@ func LoadInt32x16(y *[16]int32) Int32x16
//go:noescape
func (x Int32x16) Store(y *[16]int32)
// LoadMaskedInt32x16 loads a Int32x16 from an array,
// at those elements enabled by mask
//
// Asm: VMOVDQU32.Z, CPU Feature: AVX512
//
//go:noescape
func LoadMaskedInt32x16(y *[16]int32, mask Mask32x16) Int32x16
// StoreMasked stores a Int32x16 to an array,
// at those elements enabled by mask
//
// Asm: VMOVDQU32, CPU Feature: AVX512
//
//go:noescape
func (x Int32x16) StoreMasked(y *[16]int32, mask Mask32x16)
// Int64x8 is a 512-bit SIMD vector of 8 int64
type Int64x8 struct {
int64x8 v512
@ -865,6 +1009,22 @@ func LoadInt64x8(y *[8]int64) Int64x8
//go:noescape
func (x Int64x8) Store(y *[8]int64)
// LoadMaskedInt64x8 loads a Int64x8 from an array,
// at those elements enabled by mask
//
// Asm: VMOVDQU64.Z, CPU Feature: AVX512
//
//go:noescape
func LoadMaskedInt64x8(y *[8]int64, mask Mask64x8) Int64x8
// StoreMasked stores a Int64x8 to an array,
// at those elements enabled by mask
//
// Asm: VMOVDQU64, CPU Feature: AVX512
//
//go:noescape
func (x Int64x8) StoreMasked(y *[8]int64, mask Mask64x8)
// Uint8x64 is a 512-bit SIMD vector of 64 uint8
type Uint8x64 struct {
uint8x64 v512
@ -884,6 +1044,22 @@ func LoadUint8x64(y *[64]uint8) Uint8x64
//go:noescape
func (x Uint8x64) Store(y *[64]uint8)
// LoadMaskedUint8x64 loads a Uint8x64 from an array,
// at those elements enabled by mask
//
// Asm: VMOVDQU8.Z, CPU Feature: AVX512
//
//go:noescape
func LoadMaskedUint8x64(y *[64]uint8, mask Mask8x64) Uint8x64
// StoreMasked stores a Uint8x64 to an array,
// at those elements enabled by mask
//
// Asm: VMOVDQU8, CPU Feature: AVX512
//
//go:noescape
func (x Uint8x64) StoreMasked(y *[64]uint8, mask Mask8x64)
// Uint16x32 is a 512-bit SIMD vector of 32 uint16
type Uint16x32 struct {
uint16x32 v512
@ -903,6 +1079,22 @@ func LoadUint16x32(y *[32]uint16) Uint16x32
//go:noescape
func (x Uint16x32) Store(y *[32]uint16)
// LoadMaskedUint16x32 loads a Uint16x32 from an array,
// at those elements enabled by mask
//
// Asm: VMOVDQU16.Z, CPU Feature: AVX512
//
//go:noescape
func LoadMaskedUint16x32(y *[32]uint16, mask Mask16x32) Uint16x32
// StoreMasked stores a Uint16x32 to an array,
// at those elements enabled by mask
//
// Asm: VMOVDQU16, CPU Feature: AVX512
//
//go:noescape
func (x Uint16x32) StoreMasked(y *[32]uint16, mask Mask16x32)
// Uint32x16 is a 512-bit SIMD vector of 16 uint32
type Uint32x16 struct {
uint32x16 v512
@ -922,6 +1114,22 @@ func LoadUint32x16(y *[16]uint32) Uint32x16
//go:noescape
func (x Uint32x16) Store(y *[16]uint32)
// LoadMaskedUint32x16 loads a Uint32x16 from an array,
// at those elements enabled by mask
//
// Asm: VMOVDQU32.Z, CPU Feature: AVX512
//
//go:noescape
func LoadMaskedUint32x16(y *[16]uint32, mask Mask32x16) Uint32x16
// StoreMasked stores a Uint32x16 to an array,
// at those elements enabled by mask
//
// Asm: VMOVDQU32, CPU Feature: AVX512
//
//go:noescape
func (x Uint32x16) StoreMasked(y *[16]uint32, mask Mask32x16)
// Uint64x8 is a 512-bit SIMD vector of 8 uint64
type Uint64x8 struct {
uint64x8 v512
@ -941,6 +1149,22 @@ func LoadUint64x8(y *[8]uint64) Uint64x8
//go:noescape
func (x Uint64x8) Store(y *[8]uint64)
// LoadMaskedUint64x8 loads a Uint64x8 from an array,
// at those elements enabled by mask
//
// Asm: VMOVDQU64.Z, CPU Feature: AVX512
//
//go:noescape
func LoadMaskedUint64x8(y *[8]uint64, mask Mask64x8) Uint64x8
// StoreMasked stores a Uint64x8 to an array,
// at those elements enabled by mask
//
// Asm: VMOVDQU64, CPU Feature: AVX512
//
//go:noescape
func (x Uint64x8) StoreMasked(y *[8]uint64, mask Mask64x8)
// Mask8x64 is a 512-bit SIMD vector of 64 int8
type Mask8x64 struct {
int8x64 v512
@ -965,6 +1189,8 @@ func (x Mask8x64) StoreToBits(y *uint64)
// Mask8x64FromBits constructs a Mask8x64 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 64 bits of y are used.
//
// Asm: KMOVB, CPU Feature: AVX512
func Mask8x64FromBits(y uint64) Mask8x64
// Mask16x32 is a 512-bit SIMD vector of 32 int16
@ -991,6 +1217,8 @@ func (x Mask16x32) StoreToBits(y *uint64)
// Mask16x32FromBits constructs a Mask16x32 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 32 bits of y are used.
//
// Asm: KMOVW, CPU Feature: AVX512
func Mask16x32FromBits(y uint32) Mask16x32
// Mask32x16 is a 512-bit SIMD vector of 16 int32
@ -1017,6 +1245,8 @@ func (x Mask32x16) StoreToBits(y *uint64)
// Mask32x16FromBits constructs a Mask32x16 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used.
//
// Asm: KMOVD, CPU Feature: AVX512
func Mask32x16FromBits(y uint16) Mask32x16
// Mask64x8 is a 512-bit SIMD vector of 8 int64
@ -1043,4 +1273,6 @@ func (x Mask64x8) StoreToBits(y *uint64)
// Mask64x8FromBits constructs a Mask64x8 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used.
//
// Asm: KMOVQ, CPU Feature: AVX512
func Mask64x8FromBits(y uint8) Mask64x8