diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go
index 8b3b08f886f..cf2e7fc6764 100644
--- a/src/cmd/compile/internal/ssagen/simdintrinsics.go
+++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go
@@ -2132,6 +2132,30 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
 	addF(simdPackage, "Uint64x4.Store", simdStore(), sys.AMD64)
 	addF(simdPackage, "LoadUint64x8", simdLoad(), sys.AMD64)
 	addF(simdPackage, "Uint64x8.Store", simdStore(), sys.AMD64)
+	addF(simdPackage, "LoadMaskedFloat32x4", simdMaskedLoad(ssa.OpLoadMasked32), sys.AMD64)
+	addF(simdPackage, "Float32x4.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
+	addF(simdPackage, "LoadMaskedFloat32x8", simdMaskedLoad(ssa.OpLoadMasked32), sys.AMD64)
+	addF(simdPackage, "Float32x8.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
+	addF(simdPackage, "LoadMaskedFloat64x2", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
+	addF(simdPackage, "Float64x2.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
+	addF(simdPackage, "LoadMaskedFloat64x4", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
+	addF(simdPackage, "Float64x4.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
+	addF(simdPackage, "LoadMaskedInt32x4", simdMaskedLoad(ssa.OpLoadMasked32), sys.AMD64)
+	addF(simdPackage, "Int32x4.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
+	addF(simdPackage, "LoadMaskedInt32x8", simdMaskedLoad(ssa.OpLoadMasked32), sys.AMD64)
+	addF(simdPackage, "Int32x8.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
+	addF(simdPackage, "LoadMaskedInt64x2", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
+	addF(simdPackage, "Int64x2.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
+	addF(simdPackage, "LoadMaskedInt64x4", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
+	addF(simdPackage, "Int64x4.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
+	addF(simdPackage, "LoadMaskedUint32x4", simdMaskedLoad(ssa.OpLoadMasked32), sys.AMD64)
+	addF(simdPackage, "Uint32x4.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
+	addF(simdPackage, "LoadMaskedUint32x8", simdMaskedLoad(ssa.OpLoadMasked32), sys.AMD64)
+	addF(simdPackage, "Uint32x8.StoreMasked", simdMaskedStore(ssa.OpStoreMasked32), sys.AMD64)
+	addF(simdPackage, "LoadMaskedUint64x2", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
+	addF(simdPackage, "Uint64x2.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
+	addF(simdPackage, "LoadMaskedUint64x4", simdMaskedLoad(ssa.OpLoadMasked64), sys.AMD64)
+	addF(simdPackage, "Uint64x4.StoreMasked", simdMaskedStore(ssa.OpStoreMasked64), sys.AMD64)
 	addF(simdPackage, "Mask8x16.AsInt8x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
 	addF(simdPackage, "Int8x16.AsMask8x16", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return args[0] }, sys.AMD64)
 	addF(simdPackage, "Mask8x16.And", opLen2(ssa.OpAndInt32x4, types.TypeVec128), sys.AMD64)
diff --git a/src/simd/simd_test.go b/src/simd/simd_test.go
index 06af3458b52..541a33d34ad 100644
--- a/src/simd/simd_test.go
+++ b/src/simd/simd_test.go
@@ -206,16 +206,6 @@ func TestPairDotProdAccumulate(t *testing.T) {
 	}
 }
 
-// checkInt8Slices ensures that b and a are equal, to the end of b.
-// also serves to use the slices, to prevent accidental optimization.
-func checkInt8Slices(t *testing.T, a, b []int8) {
-	for i := range b {
-		if a[i] != b[i] {
-			t.Errorf("a and b differ at index %d, a=%d, b=%d", i, a[i], b[i])
-		}
-	}
-}
-
 func TestSlicesInt8(t *testing.T) {
 	a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
 		17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
diff --git a/src/simd/slicepart_amd64.go b/src/simd/slicepart_amd64.go
index 7f5247cd8c2..920cdb8ccd9 100644
--- a/src/simd/slicepart_amd64.go
+++ b/src/simd/slicepart_amd64.go
@@ -37,6 +37,10 @@ func int64atP32(p *int32) *int64 {
 	return (*int64)(unsafe.Pointer(p))
 }
 
+func int32atP64(p *int64) *int32 {
+	return (*int32)(unsafe.Pointer(p))
+}
+
 /* unsigned versions of integer slice part loads */
 
 // LoadUint8x16SlicePart loads a Uint8x16 from the slice s.
@@ -385,3 +389,70 @@ func (x Int16x8) StoreSlicePart(s []int16) {
 	}
 	return
 }
+
+var vecMask64 = [16]int64{
+	-1, -1, -1, -1,
+	-1, -1, -1, -1,
+	0, 0, 0, 0,
+	0, 0, 0, 0,
+}
+
+// paInt32x4 is an unchecked cast from a slice to a
+// pointer-to-array type, for use in a masked
+// load/store. In practice, the slice will be too
+// short, so this has to be unsafe, and its only
+// use must be with an instruction whose masked
+// load/store effect also masks faults.
+func paInt32x4(s []int32) *[4]int32 {
+	return (*[4]int32)(unsafe.Pointer(&s[0]))
+}
+
+/* 32 and 64-bit slice-part loads for AVX2 (128 and 256 bit) */
+
+func LoadInt32x4SlicePart(s []int32) Int32x4 {
+	l := len(s)
+	if l >= 4 {
+		return LoadInt32x4Slice(s)
+	}
+	if l == 0 {
+		var x Int32x4
+		return x
+	}
+	p := int32atP64(&vecMask64[0])
+	mask := unsafe.Slice(p, 32)[16-l:]
+	return LoadMaskedInt32x4(paInt32x4(s), LoadInt32x4Slice(mask).AsMask32x4())
+}
+
+func (x Int32x4) StoreSlicePart(s []int32) {
+	l := len(s)
+	if l >= 4 {
+		x.StoreSlice(s)
+		return
+	}
+	if l == 0 {
+		return
+	}
+	p := int32atP64(&vecMask64[0])
+	mask := unsafe.Slice(p, 32)[16-l:]
+	x.StoreMasked(paInt32x4(s), LoadInt32x4Slice(mask).AsMask32x4())
+}
+
+// func LoadInt32x8SlicePart(s []int32) Int32x8 {
+// }
+
+// func LoadInt64x2SlicePart(s []int64) Int64x2 {
+// }
+
+// func LoadInt64x4SlicePart(s []int64) Int64x4 {
+// }
+
+// func (x Int32x8) StoreSlicePart(s []int32) {
+// }
+
+// func (x Int64x4) StoreSlicePart(s []int64) {
+// }
+
+// func (x Int64x8) StoreSlicePart(s []int64) {
+// }
+
+// Handle float32, float64, uint32, and uint64 with ugly casts.
diff --git a/src/simd/slicepart_test.go b/src/simd/slicepart_test.go
index 6e047248790..cd282be7b1c 100644
--- a/src/simd/slicepart_test.go
+++ b/src/simd/slicepart_test.go
@@ -177,3 +177,43 @@ func TestSlicesPartStoreUint8x32(t *testing.T) {
 		}
 	}
 }
+
+func TestSlicePartInt32(t *testing.T) {
+	L := 4
+	c := []int32{1, 2, 3, 4, 5, -1, -1, -1, -1}
+	a := c[:L+1]
+	for i := range a {
+		// Test the load first
+		// e is a partial slice.
+		e := a[i:]
+		v := simd.LoadInt32x4SlicePart(e)
+		// d contains what v ought to contain
+		d := make([]int32, L)
+		for j := 0; j < len(e) && j < len(d); j++ {
+			d[j] = e[j]
+		}
+
+		b := make([]int32, L)
+		v.StoreSlice(b)
+		// test the load
+		checkSlices(t, d, b)
+
+		// Test the store
+		f := make([]int32, L+1)
+		for i := range f {
+			f[i] = 99
+		}
+
+		v.StoreSlicePart(f[:len(e)])
+		if len(e) < len(b) {
+			checkSlices(t, f, b[:len(e)])
+		} else {
+			checkSlices(t, f, b)
+		}
+		for i := len(e); i < len(f); i++ {
+			if f[i] != 99 {
+				t.Errorf("StoreSlicePart altered f[%d], expected 99, saw %d", i, f[i])
+			}
+		}
+	}
+}
diff --git a/src/simd/types_amd64.go b/src/simd/types_amd64.go
index 998a8f9fe1d..c1676ff34e2 100644
--- a/src/simd/types_amd64.go
+++ b/src/simd/types_amd64.go
@@ -28,6 +28,18 @@ func LoadFloat32x4(y *[4]float32) Float32x4
 //go:noescape
 func (x Float32x4) Store(y *[4]float32)
 
+// LoadMaskedFloat32x4 loads a Float32x4 from an array,
+// at those elements enabled by mask
+//
+//go:noescape
+func LoadMaskedFloat32x4(y *[4]float32, mask Mask32x4) Float32x4
+
+// StoreMasked stores a Float32x4 to an array,
+// at those elements enabled by mask
+//
+//go:noescape
+func (x Float32x4) StoreMasked(y *[4]float32, mask Mask32x4)
+
 // Float64x2 is a 128-bit SIMD vector of 2 float64
 type Float64x2 struct {
 	float64x2 v128
@@ -47,6 +59,18 @@ func LoadFloat64x2(y *[2]float64) Float64x2
 //go:noescape
 func (x Float64x2) Store(y *[2]float64)
 
+// LoadMaskedFloat64x2 loads a Float64x2 from an array,
+// at those elements enabled by mask
+//
+//go:noescape
+func LoadMaskedFloat64x2(y *[2]float64, mask Mask64x2) Float64x2
+
+// StoreMasked stores a Float64x2 to an array,
+// at those elements enabled by mask
+//
+//go:noescape
+func (x Float64x2) StoreMasked(y *[2]float64, mask Mask64x2)
+
 // Int8x16 is a 128-bit SIMD vector of 16 int8
 type Int8x16 struct {
 	int8x16 v128
@@ -104,6 +128,18 @@ func LoadInt32x4(y *[4]int32) Int32x4
 //go:noescape
 func (x Int32x4) Store(y *[4]int32)
 
+// LoadMaskedInt32x4 loads an Int32x4 from an array,
+// at those elements enabled by mask
+//
+//go:noescape
+func LoadMaskedInt32x4(y *[4]int32, mask Mask32x4) Int32x4
+
+// StoreMasked stores an Int32x4 to an array,
+// at those elements enabled by mask
+//
+//go:noescape
+func (x Int32x4) StoreMasked(y *[4]int32, mask Mask32x4)
+
 // Int64x2 is a 128-bit SIMD vector of 2 int64
 type Int64x2 struct {
 	int64x2 v128
@@ -123,6 +159,18 @@ func LoadInt64x2(y *[2]int64) Int64x2
 //go:noescape
 func (x Int64x2) Store(y *[2]int64)
 
+// LoadMaskedInt64x2 loads an Int64x2 from an array,
+// at those elements enabled by mask
+//
+//go:noescape
+func LoadMaskedInt64x2(y *[2]int64, mask Mask64x2) Int64x2
+
+// StoreMasked stores an Int64x2 to an array,
+// at those elements enabled by mask
+//
+//go:noescape
+func (x Int64x2) StoreMasked(y *[2]int64, mask Mask64x2)
+
 // Uint8x16 is a 128-bit SIMD vector of 16 uint8
 type Uint8x16 struct {
 	uint8x16 v128
@@ -180,6 +228,18 @@ func LoadUint32x4(y *[4]uint32) Uint32x4
 //go:noescape
 func (x Uint32x4) Store(y *[4]uint32)
 
+// LoadMaskedUint32x4 loads a Uint32x4 from an array,
+// at those elements enabled by mask
+//
+//go:noescape
+func LoadMaskedUint32x4(y *[4]uint32, mask Mask32x4) Uint32x4
+
+// StoreMasked stores a Uint32x4 to an array,
+// at those elements enabled by mask
+//
+//go:noescape
+func (x Uint32x4) StoreMasked(y *[4]uint32, mask Mask32x4)
+
 // Uint64x2 is a 128-bit SIMD vector of 2 uint64
 type Uint64x2 struct {
 	uint64x2 v128
@@ -199,6 +259,18 @@ func LoadUint64x2(y *[2]uint64) Uint64x2
 //go:noescape
 func (x Uint64x2) Store(y *[2]uint64)
 
+// LoadMaskedUint64x2 loads a Uint64x2 from an array,
+// at those elements enabled by mask
+//
+//go:noescape
+func LoadMaskedUint64x2(y *[2]uint64, mask Mask64x2) Uint64x2
+
+// StoreMasked stores a Uint64x2 to an array,
+// at those elements enabled by mask
+//
+//go:noescape
+func (x Uint64x2) StoreMasked(y *[2]uint64, mask Mask64x2)
+
 // Mask8x16 is a 128-bit SIMD vector of 16 int8
 type Mask8x16 struct {
 	int8x16 v128
@@ -311,6 +383,18 @@ func LoadFloat32x8(y *[8]float32) Float32x8
 //go:noescape
 func (x Float32x8) Store(y *[8]float32)
 
+// LoadMaskedFloat32x8 loads a Float32x8 from an array,
+// at those elements enabled by mask
+//
+//go:noescape
+func LoadMaskedFloat32x8(y *[8]float32, mask Mask32x8) Float32x8
+
+// StoreMasked stores a Float32x8 to an array,
+// at those elements enabled by mask
+//
+//go:noescape
+func (x Float32x8) StoreMasked(y *[8]float32, mask Mask32x8)
+
 // Float64x4 is a 256-bit SIMD vector of 4 float64
 type Float64x4 struct {
 	float64x4 v256
@@ -330,6 +414,18 @@ func LoadFloat64x4(y *[4]float64) Float64x4
 //go:noescape
 func (x Float64x4) Store(y *[4]float64)
 
+// LoadMaskedFloat64x4 loads a Float64x4 from an array,
+// at those elements enabled by mask
+//
+//go:noescape
+func LoadMaskedFloat64x4(y *[4]float64, mask Mask64x4) Float64x4
+
+// StoreMasked stores a Float64x4 to an array,
+// at those elements enabled by mask
+//
+//go:noescape
+func (x Float64x4) StoreMasked(y *[4]float64, mask Mask64x4)
+
 // Int8x32 is a 256-bit SIMD vector of 32 int8
 type Int8x32 struct {
 	int8x32 v256
@@ -387,6 +483,18 @@ func LoadInt32x8(y *[8]int32) Int32x8
 //go:noescape
 func (x Int32x8) Store(y *[8]int32)
 
+// LoadMaskedInt32x8 loads an Int32x8 from an array,
+// at those elements enabled by mask
+//
+//go:noescape
+func LoadMaskedInt32x8(y *[8]int32, mask Mask32x8) Int32x8
+
+// StoreMasked stores an Int32x8 to an array,
+// at those elements enabled by mask
+//
+//go:noescape
+func (x Int32x8) StoreMasked(y *[8]int32, mask Mask32x8)
+
 // Int64x4 is a 256-bit SIMD vector of 4 int64
 type Int64x4 struct {
 	int64x4 v256
@@ -406,6 +514,18 @@ func LoadInt64x4(y *[4]int64) Int64x4
 //go:noescape
 func (x Int64x4) Store(y *[4]int64)
 
+// LoadMaskedInt64x4 loads an Int64x4 from an array,
+// at those elements enabled by mask
+//
+//go:noescape
+func LoadMaskedInt64x4(y *[4]int64, mask Mask64x4) Int64x4
+
+// StoreMasked stores an Int64x4 to an array,
+// at those elements enabled by mask
+//
+//go:noescape
+func (x Int64x4) StoreMasked(y *[4]int64, mask Mask64x4)
+
 // Uint8x32 is a 256-bit SIMD vector of 32 uint8
 type Uint8x32 struct {
 	uint8x32 v256
@@ -463,6 +583,18 @@ func LoadUint32x8(y *[8]uint32) Uint32x8
 //go:noescape
 func (x Uint32x8) Store(y *[8]uint32)
 
+// LoadMaskedUint32x8 loads a Uint32x8 from an array,
+// at those elements enabled by mask
+//
+//go:noescape
+func LoadMaskedUint32x8(y *[8]uint32, mask Mask32x8) Uint32x8
+
+// StoreMasked stores a Uint32x8 to an array,
+// at those elements enabled by mask
+//
+//go:noescape
+func (x Uint32x8) StoreMasked(y *[8]uint32, mask Mask32x8)
+
 // Uint64x4 is a 256-bit SIMD vector of 4 uint64
 type Uint64x4 struct {
 	uint64x4 v256
@@ -482,6 +614,18 @@ func LoadUint64x4(y *[4]uint64) Uint64x4
 //go:noescape
 func (x Uint64x4) Store(y *[4]uint64)
 
+// LoadMaskedUint64x4 loads a Uint64x4 from an array,
+// at those elements enabled by mask
+//
+//go:noescape
+func LoadMaskedUint64x4(y *[4]uint64, mask Mask64x4) Uint64x4
+
+// StoreMasked stores a Uint64x4 to an array,
+// at those elements enabled by mask
+//
+//go:noescape
+func (x Uint64x4) StoreMasked(y *[4]uint64, mask Mask64x4)
+
 // Mask8x32 is a 256-bit SIMD vector of 32 int8
 type Mask8x32 struct {
 	int8x32 v256
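
Below is a minimal usage sketch, not part of the patch, showing how the new Int32x4 slice-part helpers (which use LoadMaskedInt32x4 and StoreMasked internally when the slice is shorter than the vector) are meant to be called. It assumes the experimental simd package is importable as "simd", as in the tests above; the slice contents and lengths are illustrative only.

	package main

	import (
		"fmt"
		"simd"
	)

	func main() {
		// Only 3 of the 4 lanes have data; a plain LoadInt32x4Slice
		// would read past the end of the slice.
		s := []int32{10, 20, 30}

		// LoadInt32x4SlicePart falls back to a masked load when
		// len(s) < 4, so the missing lane is zero and no memory past
		// the end of the slice is touched.
		v := simd.LoadInt32x4SlicePart(s)

		// StoreSlicePart likewise writes only len(dst) lanes,
		// via a masked store, leaving everything else untouched.
		dst := make([]int32, 3)
		v.StoreSlicePart(dst)
		fmt.Println(dst) // expected output: [10 20 30]
	}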