[dev.simd] cmd/compile: generated code for K-mask-register slice load/stores

plus slice-part load, store and test for a single type.

Generated by arch/internal/simdgen CL 690315

Change-Id: I58052728b544c4a772a2870ac68f3c832813e1ea
Reviewed-on: https://go-review.googlesource.com/c/go/+/690336
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Junyang Shao <shaojunyang@google.com>
This commit is contained in:
David Chase 2025-07-24 10:31:46 -04:00
parent 1ac5f3533f
commit c25e5c86b2
4 changed files with 352 additions and 0 deletions

View file

@ -2148,26 +2148,54 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Float32x4.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
addF(simdPackage, "LoadMaskedFloat32x8", simdMaskedLoad(ssa.OpLoadMasked32), sys.AMD64)
addF(simdPackage, "Float32x8.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
addF(simdPackage, "LoadMaskedFloat32x16", simdMaskedLoad(ssa.OpLoadMasked32), sys.AMD64)
addF(simdPackage, "Float32x16.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
addF(simdPackage, "LoadMaskedFloat64x2", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
addF(simdPackage, "Float64x2.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
addF(simdPackage, "LoadMaskedFloat64x4", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
addF(simdPackage, "Float64x4.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
addF(simdPackage, "LoadMaskedFloat64x8", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
addF(simdPackage, "Float64x8.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
addF(simdPackage, "LoadMaskedInt8x64", simdMaskedLoad(ssa.OpLoadMasked8), sys.AMD64)
addF(simdPackage, "Int8x64.StoreMasked", simdMaskedStore(ssa.OpStoreMasked8), sys.AMD64)
addF(simdPackage, "LoadMaskedInt16x32", simdMaskedLoad(ssa.OpLoadMasked16), sys.AMD64)
addF(simdPackage, "Int16x32.StoreMasked", simdMaskedStore(ssa.OpStoreMasked16), sys.AMD64)
addF(simdPackage, "LoadMaskedInt32x4", simdMaskedLoad(ssa.OpLoadMasked32), sys.AMD64)
addF(simdPackage, "Int32x4.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
addF(simdPackage, "LoadMaskedInt32x8", simdMaskedLoad(ssa.OpLoadMasked32), sys.AMD64)
addF(simdPackage, "Int32x8.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
addF(simdPackage, "LoadMaskedInt32x16", simdMaskedLoad(ssa.OpLoadMasked32), sys.AMD64)
addF(simdPackage, "Int32x16.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
addF(simdPackage, "LoadMaskedInt64x2", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
addF(simdPackage, "Int64x2.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
addF(simdPackage, "LoadMaskedInt64x4", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
addF(simdPackage, "Int64x4.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
addF(simdPackage, "LoadMaskedInt64x8", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
addF(simdPackage, "Int64x8.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
addF(simdPackage, "LoadMaskedUint8x64", simdMaskedLoad(ssa.OpLoadMasked8), sys.AMD64)
addF(simdPackage, "Uint8x64.StoreMasked", simdMaskedStore(ssa.OpStoreMasked8), sys.AMD64)
addF(simdPackage, "LoadMaskedUint16x32", simdMaskedLoad(ssa.OpLoadMasked16), sys.AMD64)
addF(simdPackage, "Uint16x32.StoreMasked", simdMaskedStore(ssa.OpStoreMasked16), sys.AMD64)
addF(simdPackage, "LoadMaskedUint32x4", simdMaskedLoad(ssa.OpLoadMasked32), sys.AMD64)
addF(simdPackage, "Uint32x4.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
addF(simdPackage, "LoadMaskedUint32x8", simdMaskedLoad(ssa.OpLoadMasked32), sys.AMD64)
addF(simdPackage, "Uint32x8.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
addF(simdPackage, "LoadMaskedUint32x16", simdMaskedLoad(ssa.OpLoadMasked32), sys.AMD64)
addF(simdPackage, "Uint32x16.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
addF(simdPackage, "LoadMaskedUint64x2", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
addF(simdPackage, "Uint64x2.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
addF(simdPackage, "LoadMaskedUint64x4", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
addF(simdPackage, "Uint64x4.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
addF(simdPackage, "LoadMaskedUint64x8", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
addF(simdPackage, "Uint64x8.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
addF(simdPackage, "LoadMaskedMask8x64", simdMaskedLoad(ssa.OpLoadMasked8), sys.AMD64)
addF(simdPackage, "Mask8x64.StoreMasked", simdMaskedStore(ssa.OpStoreMasked8), sys.AMD64)
addF(simdPackage, "LoadMaskedMask16x32", simdMaskedLoad(ssa.OpLoadMasked16), sys.AMD64)
addF(simdPackage, "Mask16x32.StoreMasked", simdMaskedStore(ssa.OpStoreMasked16), sys.AMD64)
addF(simdPackage, "LoadMaskedMask32x16", simdMaskedLoad(ssa.OpLoadMasked32), sys.AMD64)
addF(simdPackage, "Mask32x16.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
addF(simdPackage, "LoadMaskedMask64x8", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
addF(simdPackage, "Mask64x8.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
addF(simdPackage, "Mask8x16.AsInt8x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Int8x16.AsMask8x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
addF(simdPackage, "Mask8x16.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64)

View file

@ -419,6 +419,24 @@ func paInt64x4(s []int64) *[4]int64 {
return (*[4]int64)(unsafe.Pointer(&s[0]))
}
// For 512-bit masked loads/stores
// paInt64x8 reinterprets the backing storage of s as a *[8]int64.
// It deliberately goes through unsafe.Pointer rather than the direct
// slice-to-array-pointer conversion, because callers pass slices
// shorter than 8 (masked loads/stores touch only len(s) elements)
// and the direct conversion would panic on a short slice.
// s must be non-empty; &s[0] panics otherwise.
func paInt64x8(s []int64) *[8]int64 {
	base := unsafe.Pointer(&s[0])
	return (*[8]int64)(base)
}
// paInt32x16 reinterprets the backing storage of s as a *[16]int32.
// unsafe.Pointer is used instead of a direct slice-to-array-pointer
// conversion so that slices shorter than 16 elements (the masked
// slice-part case) do not panic. s must be non-empty.
func paInt32x16(s []int32) *[16]int32 {
	base := unsafe.Pointer(&s[0])
	return (*[16]int32)(base)
}
// paInt16x32 reinterprets the backing storage of s as a *[32]int16.
// unsafe.Pointer is used instead of a direct slice-to-array-pointer
// conversion so that slices shorter than 32 elements (the masked
// slice-part case) do not panic. s must be non-empty.
func paInt16x32(s []int16) *[32]int16 {
	base := unsafe.Pointer(&s[0])
	return (*[32]int16)(base)
}
// paInt8x64 reinterprets the backing storage of s as a *[64]int8.
// unsafe.Pointer is used instead of a direct slice-to-array-pointer
// conversion so that slices shorter than 64 elements (the masked
// slice-part case) do not panic. s must be non-empty.
func paInt8x64(s []int8) *[64]int8 {
	base := unsafe.Pointer(&s[0])
	return (*[64]int8)(base)
}
/* 32 and 64-bit slice-part loads for AVX2 (128 and 256 bit) */
// LoadInt32x4SlicePart loads a Int32x4 from the slice s.
@ -742,3 +760,30 @@ func (x Float64x4) StoreSlicePart(s []float64) {
t := unsafe.Slice((*int64)(unsafe.Pointer(&s[0])), len(s))
x.AsInt64x4().StoreSlicePart(t)
}
// LoadInt64x8SlicePart loads an Int64x8 from the slice s.
// If s has fewer than 8 elements, the remaining vector lanes
// are zeroed; if it has 8 or more, the first 8 are loaded.
func LoadInt64x8SlicePart(s []int64) Int64x8 {
	switch l := len(s); {
	case l >= 8:
		// Full vector available: plain (unmasked) load.
		return LoadInt64x8Slice(s)
	case l == 0:
		// Nothing to read; return the zero vector.
		var zero Int64x8
		return zero
	default:
		// Partial load: enable exactly the low l lanes.
		m := Mask64x8FromBits(0xff >> (8 - l))
		return LoadMaskedInt64x8(paInt64x8(s), m)
	}
}
// StoreSlicePart stores the first len(s) elements of x into s.
// If s has 8 or more elements, all 8 lanes are stored; a zero-length
// slice is a no-op.
func (x Int64x8) StoreSlicePart(s []int64) {
	switch l := len(s); {
	case l >= 8:
		// Destination holds the whole vector: unmasked store.
		x.StoreSlice(s)
	case l == 0:
		// Nothing to write.
	default:
		// Partial store: enable exactly the low l lanes.
		x.StoreMasked(paInt64x8(s), Mask64x8FromBits(0xff>>(8-l)))
	}
}

View file

@ -341,3 +341,50 @@ func TestSlicePartFloat32(t *testing.T) {
}
}
}
// 512-bit load
// TestSlicePartInt64 exercises the 512-bit slice-part load and store
// (LoadInt64x8SlicePart / Int64x8.StoreSlicePart) for every partial
// length from 0 through a full vector, verifying that untouched lanes
// load as zero and that a partial store does not write past len(s).
func TestSlicePartInt64(t *testing.T) {
	if !simd.HasAVX512() {
		// t.Skip calls SkipNow, which never returns, so no
		// explicit return is needed after it.
		t.Skip("Test requires HasAVX512, not available on this hardware")
	}
	L := 8
	c := []int64{1, 2, 3, 4, 5, 6, 7, 8, 86, 86, 86, 86}
	a := c[:L+1]
	for i := range a {
		// Test the load first.
		// e is a partial slice.
		e := a[i:]
		v := simd.LoadInt64x8SlicePart(e)
		// d contains what v ought to contain: the elements of e,
		// zero-padded out to a full vector of L lanes.
		d := make([]int64, L)
		copy(d, e)
		b := make([]int64, L)
		v.StoreSlice(b)
		// Check the load.
		checkSlicesLogInput(t, b, d, func() { t.Helper(); t.Logf("Len(e)=%d", len(e)) })
		// Test the store: fill f with a sentinel so any write past
		// len(e) is detected below. (Use j, not i, to avoid shadowing
		// the outer loop variable.)
		f := make([]int64, L+1)
		for j := range f {
			f[j] = 99
		}
		v.StoreSlicePart(f[:len(e)])
		if len(e) < len(b) {
			checkSlices(t, f, b[:len(e)])
		} else {
			checkSlices(t, f, b)
		}
		// Everything beyond the stored part must keep the sentinel.
		for j := len(e); j < len(f); j++ {
			if f[j] != 99 {
				t.Errorf("StoreSlicePart altered f[%d], expected 99, saw %v", j, f[j])
			}
		}
	}
}

View file

@ -31,12 +31,16 @@ func (x Float32x4) Store(y *[4]float32)
// LoadMaskedFloat32x4 loads a Float32x4 from an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVD, CPU Feature: AVX2
//
//go:noescape
func LoadMaskedFloat32x4(y *[4]float32, mask Mask32x4) Float32x4
// StoreMasked stores a Float32x4 to an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVD, CPU Feature: AVX2
//
//go:noescape
func (x Float32x4) StoreMasked(y *[4]float32, mask Mask32x4)
@ -62,12 +66,16 @@ func (x Float64x2) Store(y *[2]float64)
// LoadMaskedFloat64x2 loads a Float64x2 from an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVQ, CPU Feature: AVX2
//
//go:noescape
func LoadMaskedFloat64x2(y *[2]float64, mask Mask64x2) Float64x2
// StoreMasked stores a Float64x2 to an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVQ, CPU Feature: AVX2
//
//go:noescape
func (x Float64x2) StoreMasked(y *[2]float64, mask Mask64x2)
@ -131,12 +139,16 @@ func (x Int32x4) Store(y *[4]int32)
// LoadMaskedInt32x4 loads a Int32x4 from an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVD, CPU Feature: AVX2
//
//go:noescape
func LoadMaskedInt32x4(y *[4]int32, mask Mask32x4) Int32x4
// StoreMasked stores a Int32x4 to an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVD, CPU Feature: AVX2
//
//go:noescape
func (x Int32x4) StoreMasked(y *[4]int32, mask Mask32x4)
@ -162,12 +174,16 @@ func (x Int64x2) Store(y *[2]int64)
// LoadMaskedInt64x2 loads a Int64x2 from an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVQ, CPU Feature: AVX2
//
//go:noescape
func LoadMaskedInt64x2(y *[2]int64, mask Mask64x2) Int64x2
// StoreMasked stores a Int64x2 to an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVQ, CPU Feature: AVX2
//
//go:noescape
func (x Int64x2) StoreMasked(y *[2]int64, mask Mask64x2)
@ -231,12 +247,16 @@ func (x Uint32x4) Store(y *[4]uint32)
// LoadMaskedUint32x4 loads a Uint32x4 from an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVD, CPU Feature: AVX2
//
//go:noescape
func LoadMaskedUint32x4(y *[4]uint32, mask Mask32x4) Uint32x4
// StoreMasked stores a Uint32x4 to an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVD, CPU Feature: AVX2
//
//go:noescape
func (x Uint32x4) StoreMasked(y *[4]uint32, mask Mask32x4)
@ -262,12 +282,16 @@ func (x Uint64x2) Store(y *[2]uint64)
// LoadMaskedUint64x2 loads a Uint64x2 from an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVQ, CPU Feature: AVX2
//
//go:noescape
func LoadMaskedUint64x2(y *[2]uint64, mask Mask64x2) Uint64x2
// StoreMasked stores a Uint64x2 to an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVQ, CPU Feature: AVX2
//
//go:noescape
func (x Uint64x2) StoreMasked(y *[2]uint64, mask Mask64x2)
@ -295,6 +319,8 @@ func (x Mask8x16) StoreToBits(y *uint64)
// Mask8x16FromBits constructs a Mask8x16 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used.
//
// Asm: KMOVB, CPU Feature: AVX512
func Mask8x16FromBits(y uint16) Mask8x16
// Mask16x8 is a 128-bit SIMD vector of 8 int16
@ -321,6 +347,8 @@ func (x Mask16x8) StoreToBits(y *uint64)
// Mask16x8FromBits constructs a Mask16x8 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used.
//
// Asm: KMOVW, CPU Feature: AVX512
func Mask16x8FromBits(y uint8) Mask16x8
// Mask32x4 is a 128-bit SIMD vector of 4 int32
@ -347,6 +375,8 @@ func (x Mask32x4) StoreToBits(y *uint64)
// Mask32x4FromBits constructs a Mask32x4 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 4 bits of y are used.
//
// Asm: KMOVD, CPU Feature: AVX512
func Mask32x4FromBits(y uint8) Mask32x4
// Mask64x2 is a 128-bit SIMD vector of 2 int64
@ -373,6 +403,8 @@ func (x Mask64x2) StoreToBits(y *uint64)
// Mask64x2FromBits constructs a Mask64x2 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 2 bits of y are used.
//
// Asm: KMOVQ, CPU Feature: AVX512
func Mask64x2FromBits(y uint8) Mask64x2
// v256 is a tag type that tells the compiler that this is really 256-bit SIMD
@ -402,12 +434,16 @@ func (x Float32x8) Store(y *[8]float32)
// LoadMaskedFloat32x8 loads a Float32x8 from an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVD, CPU Feature: AVX2
//
//go:noescape
func LoadMaskedFloat32x8(y *[8]float32, mask Mask32x8) Float32x8
// StoreMasked stores a Float32x8 to an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVD, CPU Feature: AVX2
//
//go:noescape
func (x Float32x8) StoreMasked(y *[8]float32, mask Mask32x8)
@ -433,12 +469,16 @@ func (x Float64x4) Store(y *[4]float64)
// LoadMaskedFloat64x4 loads a Float64x4 from an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVQ, CPU Feature: AVX2
//
//go:noescape
func LoadMaskedFloat64x4(y *[4]float64, mask Mask64x4) Float64x4
// StoreMasked stores a Float64x4 to an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVQ, CPU Feature: AVX2
//
//go:noescape
func (x Float64x4) StoreMasked(y *[4]float64, mask Mask64x4)
@ -502,12 +542,16 @@ func (x Int32x8) Store(y *[8]int32)
// LoadMaskedInt32x8 loads a Int32x8 from an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVD, CPU Feature: AVX2
//
//go:noescape
func LoadMaskedInt32x8(y *[8]int32, mask Mask32x8) Int32x8
// StoreMasked stores a Int32x8 to an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVD, CPU Feature: AVX2
//
//go:noescape
func (x Int32x8) StoreMasked(y *[8]int32, mask Mask32x8)
@ -533,12 +577,16 @@ func (x Int64x4) Store(y *[4]int64)
// LoadMaskedInt64x4 loads a Int64x4 from an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVQ, CPU Feature: AVX2
//
//go:noescape
func LoadMaskedInt64x4(y *[4]int64, mask Mask64x4) Int64x4
// StoreMasked stores a Int64x4 to an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVQ, CPU Feature: AVX2
//
//go:noescape
func (x Int64x4) StoreMasked(y *[4]int64, mask Mask64x4)
@ -602,12 +650,16 @@ func (x Uint32x8) Store(y *[8]uint32)
// LoadMaskedUint32x8 loads a Uint32x8 from an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVD, CPU Feature: AVX2
//
//go:noescape
func LoadMaskedUint32x8(y *[8]uint32, mask Mask32x8) Uint32x8
// StoreMasked stores a Uint32x8 to an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVD, CPU Feature: AVX2
//
//go:noescape
func (x Uint32x8) StoreMasked(y *[8]uint32, mask Mask32x8)
@ -633,12 +685,16 @@ func (x Uint64x4) Store(y *[4]uint64)
// LoadMaskedUint64x4 loads a Uint64x4 from an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVQ, CPU Feature: AVX2
//
//go:noescape
func LoadMaskedUint64x4(y *[4]uint64, mask Mask64x4) Uint64x4
// StoreMasked stores a Uint64x4 to an array,
// at those elements enabled by mask
//
// Asm: VMASKMOVQ, CPU Feature: AVX2
//
//go:noescape
func (x Uint64x4) StoreMasked(y *[4]uint64, mask Mask64x4)
@ -666,6 +722,8 @@ func (x Mask8x32) StoreToBits(y *uint64)
// Mask8x32FromBits constructs a Mask8x32 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 32 bits of y are used.
//
// Asm: KMOVB, CPU Feature: AVX512
func Mask8x32FromBits(y uint32) Mask8x32
// Mask16x16 is a 256-bit SIMD vector of 16 int16
@ -692,6 +750,8 @@ func (x Mask16x16) StoreToBits(y *uint64)
// Mask16x16FromBits constructs a Mask16x16 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used.
//
// Asm: KMOVW, CPU Feature: AVX512
func Mask16x16FromBits(y uint16) Mask16x16
// Mask32x8 is a 256-bit SIMD vector of 8 int32
@ -718,6 +778,8 @@ func (x Mask32x8) StoreToBits(y *uint64)
// Mask32x8FromBits constructs a Mask32x8 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used.
//
// Asm: KMOVD, CPU Feature: AVX512
func Mask32x8FromBits(y uint8) Mask32x8
// Mask64x4 is a 256-bit SIMD vector of 4 int64
@ -744,6 +806,8 @@ func (x Mask64x4) StoreToBits(y *uint64)
// Mask64x4FromBits constructs a Mask64x4 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 4 bits of y are used.
//
// Asm: KMOVQ, CPU Feature: AVX512
func Mask64x4FromBits(y uint8) Mask64x4
// v512 is a tag type that tells the compiler that this is really 512-bit SIMD
@ -770,6 +834,22 @@ func LoadFloat32x16(y *[16]float32) Float32x16
//go:noescape
func (x Float32x16) Store(y *[16]float32)
// LoadMaskedFloat32x16 loads a Float32x16 from an array,
// at those elements enabled by mask
//
// Asm: VMOVDQU32.Z, CPU Feature: AVX512
//
//go:noescape
func LoadMaskedFloat32x16(y *[16]float32, mask Mask32x16) Float32x16
// StoreMasked stores a Float32x16 to an array,
// at those elements enabled by mask
//
// Asm: VMOVDQU32, CPU Feature: AVX512
//
//go:noescape
func (x Float32x16) StoreMasked(y *[16]float32, mask Mask32x16)
// Float64x8 is a 512-bit SIMD vector of 8 float64
type Float64x8 struct {
float64x8 v512
@ -789,6 +869,22 @@ func LoadFloat64x8(y *[8]float64) Float64x8
//go:noescape
func (x Float64x8) Store(y *[8]float64)
// LoadMaskedFloat64x8 loads a Float64x8 from an array,
// at those elements enabled by mask
//
// Asm: VMOVDQU64.Z, CPU Feature: AVX512
//
//go:noescape
func LoadMaskedFloat64x8(y *[8]float64, mask Mask64x8) Float64x8
// StoreMasked stores a Float64x8 to an array,
// at those elements enabled by mask
//
// Asm: VMOVDQU64, CPU Feature: AVX512
//
//go:noescape
func (x Float64x8) StoreMasked(y *[8]float64, mask Mask64x8)
// Int8x64 is a 512-bit SIMD vector of 64 int8
type Int8x64 struct {
int8x64 v512
@ -808,6 +904,22 @@ func LoadInt8x64(y *[64]int8) Int8x64
//go:noescape
func (x Int8x64) Store(y *[64]int8)
// LoadMaskedInt8x64 loads a Int8x64 from an array,
// at those elements enabled by mask
//
// Asm: VMOVDQU8.Z, CPU Feature: AVX512
//
//go:noescape
func LoadMaskedInt8x64(y *[64]int8, mask Mask8x64) Int8x64
// StoreMasked stores a Int8x64 to an array,
// at those elements enabled by mask
//
// Asm: VMOVDQU8, CPU Feature: AVX512
//
//go:noescape
func (x Int8x64) StoreMasked(y *[64]int8, mask Mask8x64)
// Int16x32 is a 512-bit SIMD vector of 32 int16
type Int16x32 struct {
int16x32 v512
@ -827,6 +939,22 @@ func LoadInt16x32(y *[32]int16) Int16x32
//go:noescape
func (x Int16x32) Store(y *[32]int16)
// LoadMaskedInt16x32 loads a Int16x32 from an array,
// at those elements enabled by mask
//
// Asm: VMOVDQU16.Z, CPU Feature: AVX512
//
//go:noescape
func LoadMaskedInt16x32(y *[32]int16, mask Mask16x32) Int16x32
// StoreMasked stores a Int16x32 to an array,
// at those elements enabled by mask
//
// Asm: VMOVDQU16, CPU Feature: AVX512
//
//go:noescape
func (x Int16x32) StoreMasked(y *[32]int16, mask Mask16x32)
// Int32x16 is a 512-bit SIMD vector of 16 int32
type Int32x16 struct {
int32x16 v512
@ -846,6 +974,22 @@ func LoadInt32x16(y *[16]int32) Int32x16
//go:noescape
func (x Int32x16) Store(y *[16]int32)
// LoadMaskedInt32x16 loads a Int32x16 from an array,
// at those elements enabled by mask
//
// Asm: VMOVDQU32.Z, CPU Feature: AVX512
//
//go:noescape
func LoadMaskedInt32x16(y *[16]int32, mask Mask32x16) Int32x16
// StoreMasked stores a Int32x16 to an array,
// at those elements enabled by mask
//
// Asm: VMOVDQU32, CPU Feature: AVX512
//
//go:noescape
func (x Int32x16) StoreMasked(y *[16]int32, mask Mask32x16)
// Int64x8 is a 512-bit SIMD vector of 8 int64
type Int64x8 struct {
int64x8 v512
@ -865,6 +1009,22 @@ func LoadInt64x8(y *[8]int64) Int64x8
//go:noescape
func (x Int64x8) Store(y *[8]int64)
// LoadMaskedInt64x8 loads a Int64x8 from an array,
// at those elements enabled by mask
//
// Asm: VMOVDQU64.Z, CPU Feature: AVX512
//
//go:noescape
func LoadMaskedInt64x8(y *[8]int64, mask Mask64x8) Int64x8
// StoreMasked stores a Int64x8 to an array,
// at those elements enabled by mask
//
// Asm: VMOVDQU64, CPU Feature: AVX512
//
//go:noescape
func (x Int64x8) StoreMasked(y *[8]int64, mask Mask64x8)
// Uint8x64 is a 512-bit SIMD vector of 64 uint8
type Uint8x64 struct {
uint8x64 v512
@ -884,6 +1044,22 @@ func LoadUint8x64(y *[64]uint8) Uint8x64
//go:noescape
func (x Uint8x64) Store(y *[64]uint8)
// LoadMaskedUint8x64 loads a Uint8x64 from an array,
// at those elements enabled by mask
//
// Asm: VMOVDQU8.Z, CPU Feature: AVX512
//
//go:noescape
func LoadMaskedUint8x64(y *[64]uint8, mask Mask8x64) Uint8x64
// StoreMasked stores a Uint8x64 to an array,
// at those elements enabled by mask
//
// Asm: VMOVDQU8, CPU Feature: AVX512
//
//go:noescape
func (x Uint8x64) StoreMasked(y *[64]uint8, mask Mask8x64)
// Uint16x32 is a 512-bit SIMD vector of 32 uint16
type Uint16x32 struct {
uint16x32 v512
@ -903,6 +1079,22 @@ func LoadUint16x32(y *[32]uint16) Uint16x32
//go:noescape
func (x Uint16x32) Store(y *[32]uint16)
// LoadMaskedUint16x32 loads a Uint16x32 from an array,
// at those elements enabled by mask
//
// Asm: VMOVDQU16.Z, CPU Feature: AVX512
//
//go:noescape
func LoadMaskedUint16x32(y *[32]uint16, mask Mask16x32) Uint16x32
// StoreMasked stores a Uint16x32 to an array,
// at those elements enabled by mask
//
// Asm: VMOVDQU16, CPU Feature: AVX512
//
//go:noescape
func (x Uint16x32) StoreMasked(y *[32]uint16, mask Mask16x32)
// Uint32x16 is a 512-bit SIMD vector of 16 uint32
type Uint32x16 struct {
uint32x16 v512
@ -922,6 +1114,22 @@ func LoadUint32x16(y *[16]uint32) Uint32x16
//go:noescape
func (x Uint32x16) Store(y *[16]uint32)
// LoadMaskedUint32x16 loads a Uint32x16 from an array,
// at those elements enabled by mask
//
// Asm: VMOVDQU32.Z, CPU Feature: AVX512
//
//go:noescape
func LoadMaskedUint32x16(y *[16]uint32, mask Mask32x16) Uint32x16
// StoreMasked stores a Uint32x16 to an array,
// at those elements enabled by mask
//
// Asm: VMOVDQU32, CPU Feature: AVX512
//
//go:noescape
func (x Uint32x16) StoreMasked(y *[16]uint32, mask Mask32x16)
// Uint64x8 is a 512-bit SIMD vector of 8 uint64
type Uint64x8 struct {
uint64x8 v512
@ -941,6 +1149,22 @@ func LoadUint64x8(y *[8]uint64) Uint64x8
//go:noescape
func (x Uint64x8) Store(y *[8]uint64)
// LoadMaskedUint64x8 loads a Uint64x8 from an array,
// at those elements enabled by mask
//
// Asm: VMOVDQU64.Z, CPU Feature: AVX512
//
//go:noescape
func LoadMaskedUint64x8(y *[8]uint64, mask Mask64x8) Uint64x8
// StoreMasked stores a Uint64x8 to an array,
// at those elements enabled by mask
//
// Asm: VMOVDQU64, CPU Feature: AVX512
//
//go:noescape
func (x Uint64x8) StoreMasked(y *[8]uint64, mask Mask64x8)
// Mask8x64 is a 512-bit SIMD vector of 64 int8
type Mask8x64 struct {
int8x64 v512
@ -965,6 +1189,8 @@ func (x Mask8x64) StoreToBits(y *uint64)
// Mask8x64FromBits constructs a Mask8x64 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 64 bits of y are used.
//
// Asm: KMOVB, CPU Feature: AVX512
func Mask8x64FromBits(y uint64) Mask8x64
// Mask16x32 is a 512-bit SIMD vector of 32 int16
@ -991,6 +1217,8 @@ func (x Mask16x32) StoreToBits(y *uint64)
// Mask16x32FromBits constructs a Mask16x32 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 32 bits of y are used.
//
// Asm: KMOVW, CPU Feature: AVX512
func Mask16x32FromBits(y uint32) Mask16x32
// Mask32x16 is a 512-bit SIMD vector of 16 int32
@ -1017,6 +1245,8 @@ func (x Mask32x16) StoreToBits(y *uint64)
// Mask32x16FromBits constructs a Mask32x16 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 16 bits of y are used.
//
// Asm: KMOVD, CPU Feature: AVX512
func Mask32x16FromBits(y uint16) Mask32x16
// Mask64x8 is a 512-bit SIMD vector of 8 int64
@ -1043,4 +1273,6 @@ func (x Mask64x8) StoreToBits(y *uint64)
// Mask64x8FromBits constructs a Mask64x8 from a bitmap value, where 1 means set for the indexed element, 0 means unset.
// Only the lower 8 bits of y are used.
//
// Asm: KMOVQ, CPU Feature: AVX512
func Mask64x8FromBits(y uint8) Mask64x8