[dev.simd] cmd/compile, simd: add 64-bit select-from-pair methods

These are in the same style as the 32-bit select-from-pair methods,
including the grouped variant.  This does not quite capture
the full awesome power of VSHUFPD, which can select
differently in each group; that will be some other,
more complex method.

Change-Id: I807ddd7c1256103b5b0d7c5d60bd70b185e3aaf0
Reviewed-on: https://go-review.googlesource.com/c/go/+/705695
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Junyang Shao <shaojunyang@google.com>
This commit is contained in:
David Chase 2025-09-20 16:52:07 -04:00
parent 25c36b95d1
commit ea3b2ecd28
4 changed files with 820 additions and 352 deletions

View file

@ -1632,12 +1632,12 @@ func initIntrinsics(cfg *intrinsicBuildConfig) {
addF(simdPackage, "Uint32x8.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64) addF(simdPackage, "Uint32x8.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
addF(simdPackage, "Uint64x4.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64) addF(simdPackage, "Uint64x4.IsZero", opLen1(ssa.OpIsZeroVec, types.Types[types.TBOOL]), sys.AMD64)
sfp := func(method string, hwop ssa.Op, vectype *types.Type) { sfp4 := func(method string, hwop ssa.Op, vectype *types.Type) {
addF("simd", method, addF("simd", method,
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
x, a, b, c, d, y := args[0], args[1], args[2], args[3], args[4], args[5] x, a, b, c, d, y := args[0], args[1], args[2], args[3], args[4], args[5]
if a.Op == ssa.OpConst8 && b.Op == ssa.OpConst8 && c.Op == ssa.OpConst8 && d.Op == ssa.OpConst8 { if a.Op == ssa.OpConst8 && b.Op == ssa.OpConst8 && c.Op == ssa.OpConst8 && d.Op == ssa.OpConst8 {
return selectFromPair(x, a, b, c, d, y, s, hwop, vectype) return select4FromPair(x, a, b, c, d, y, s, hwop, vectype)
} else { } else {
return s.callResult(n, callNormal) return s.callResult(n, callNormal)
} }
@ -1645,25 +1645,64 @@ func initIntrinsics(cfg *intrinsicBuildConfig) {
sys.AMD64) sys.AMD64)
} }
sfp("Int32x4.SelectFromPair", ssa.OpconcatSelectedConstantInt32x4, types.TypeVec128) sfp4("Int32x4.SelectFromPair", ssa.OpconcatSelectedConstantInt32x4, types.TypeVec128)
sfp("Uint32x4.SelectFromPair", ssa.OpconcatSelectedConstantUint32x4, types.TypeVec128) sfp4("Uint32x4.SelectFromPair", ssa.OpconcatSelectedConstantUint32x4, types.TypeVec128)
sfp("Float32x4.SelectFromPair", ssa.OpconcatSelectedConstantFloat32x4, types.TypeVec128) sfp4("Float32x4.SelectFromPair", ssa.OpconcatSelectedConstantFloat32x4, types.TypeVec128)
sfp("Int32x8.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedInt32x8, types.TypeVec256) sfp4("Int32x8.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedInt32x8, types.TypeVec256)
sfp("Uint32x8.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedUint32x8, types.TypeVec256) sfp4("Uint32x8.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedUint32x8, types.TypeVec256)
sfp("Float32x8.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedFloat32x8, types.TypeVec256) sfp4("Float32x8.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedFloat32x8, types.TypeVec256)
sfp("Int32x16.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedInt32x16, types.TypeVec512) sfp4("Int32x16.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedInt32x16, types.TypeVec512)
sfp("Uint32x16.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedUint32x16, types.TypeVec512) sfp4("Uint32x16.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedUint32x16, types.TypeVec512)
sfp("Float32x16.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedFloat32x16, types.TypeVec512) sfp4("Float32x16.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedFloat32x16, types.TypeVec512)
// sfp2 registers, for AMD64, the handler for a two-selector select-from-pair
// method named method in package "simd". The handler receives the receiver
// vector (args[0]), the two selectors (args[1], args[2]) and the second
// vector (args[3]). When both selectors are 8-bit constants the call is
// expanded by select2FromPair using hwop, vectype and the immediate encoder
// cscimm; otherwise it is compiled as an ordinary call.
sfp2 := func(method string, hwop ssa.Op, vectype *types.Type, cscimm func(i, j uint8) int64) {
	expand := func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		x, a, b, y := args[0], args[1], args[2], args[3]
		if a.Op != ssa.OpConst8 || b.Op != ssa.OpConst8 {
			// At least one selector is not a constant: no expansion here.
			return s.callResult(n, callNormal)
		}
		return select2FromPair(x, a, b, y, s, hwop, vectype, cscimm)
	}
	addF("simd", method, expand, sys.AMD64)
}
// 128-bit vectors (two 64-bit elements): the immediate comes from cscimm2.
sfp2("Uint64x2.SelectFromPair", ssa.OpconcatSelectedConstantUint64x2, types.TypeVec128, cscimm2)
sfp2("Int64x2.SelectFromPair", ssa.OpconcatSelectedConstantInt64x2, types.TypeVec128, cscimm2)
sfp2("Float64x2.SelectFromPair", ssa.OpconcatSelectedConstantFloat64x2, types.TypeVec128, cscimm2)
// 256-bit grouped variants: the immediate comes from cscimm2g2.
sfp2("Uint64x4.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedUint64x4, types.TypeVec256, cscimm2g2)
sfp2("Int64x4.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedInt64x4, types.TypeVec256, cscimm2g2)
sfp2("Float64x4.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedFloat64x4, types.TypeVec256, cscimm2g2)
// 512-bit grouped variants: the immediate comes from cscimm2g4.
sfp2("Uint64x8.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedUint64x8, types.TypeVec512, cscimm2g4)
sfp2("Int64x8.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedInt64x8, types.TypeVec512, cscimm2g4)
sfp2("Float64x8.SelectFromPairGrouped", ssa.OpconcatSelectedConstantGroupedFloat64x8, types.TypeVec512, cscimm2g4)
} }
} }
func cscimm(a, b, c, d uint8) int64 { func cscimm4(a, b, c, d uint8) int64 {
return se(a + b<<2 + c<<4 + d<<6) return se(a + b<<2 + c<<4 + d<<6)
} }
// cscimm2 packs two element selectors into one immediate: a contributes at
// bit 0 and b is shifted up by one bit. select2FromPair calls it with both
// values already masked to 0 or 1. The packed byte is passed through se,
// which is defined elsewhere (presumably it sign-extends the 8-bit value to
// int64 -- confirm against its definition).
func cscimm2(a, b uint8) int64 {
	packed := a + b<<1
	return se(packed)
}
// cscimm2g2 produces the immediate for the two-group case: the value from
// cscimm2 is added to a copy of itself shifted left by two bits, so (when
// that value fits in two bits) the same selector pair appears in bits 0-1
// and bits 2-3. The sum is truncated to int8 and widened again, so the
// result is the sign-extended form of the low 8 bits.
func cscimm2g2(a, b uint8) int64 {
	pair := cscimm2(a, b)
	return int64(int8(pair + pair<<2))
}
// cscimm2g4 produces the immediate for the four-group case: the value from
// cscimm2g2 is added to a copy of itself shifted left by four bits, so (when
// that value fits in four bits) the two-group pattern fills both nibbles.
// The int8 conversion matters here: a pattern with bit 7 set becomes a
// negative int64 rather than a value above 127.
func cscimm2g4(a, b uint8) int64 {
	half := cscimm2g2(a, b)
	return int64(int8(half + half<<4))
}
const ( const (
_LLLL = iota _LLLL = iota
_HLLL _HLLL
@ -1683,7 +1722,32 @@ const (
_HHHH _HHHH
) )
func selectFromPair(x, _a, _b, _c, _d, y *ssa.Value, s *state, op ssa.Op, t *types.Type) *ssa.Value { const (
_LL = iota
_HL
_LH
_HH
)
// select2FromPair builds the value for a two-selector select-from-pair with
// constant selectors.
//
// x and y are the two source vectors. _a and _b are the selector values,
// read through AuxInt8; the caller in this file only passes OpConst8 values.
// Bit 1 of each selector chooses the source vector (clear: x, set: y) and
// bit 0 is forwarded to csc, which encodes the immediate for op. The two
// operands of the resulting op are ordered to match: the first operand is
// the vector chosen by _a and the second the vector chosen by _b.
//
// s is the builder state used to create the value, op the operation to emit
// and t its result type.
func select2FromPair(x, _a, _b, y *ssa.Value, s *state, op ssa.Op, t *types.Type, csc func(a, b uint8) int64) *ssa.Value {
	a, b := uint8(_a.AuxInt8()), uint8(_b.AuxInt8())

	// Source pattern: bit 0 is set when a selects from y, bit 1 when b does.
	pattern := (a&2)>>1 + (b & 2)

	// Only the low bit of each selector goes into the immediate.
	imm := csc(a&1, b&1)

	var first, second *ssa.Value
	switch pattern {
	case _LL:
		first, second = x, x
	case _HH:
		first, second = y, y
	case _LH:
		first, second = x, y
	case _HL:
		first, second = y, x
	default:
		panic("The preceding switch should have been exhaustive")
	}
	return s.newValue2I(op, t, imm, first, second)
}
func select4FromPair(x, _a, _b, _c, _d, y *ssa.Value, s *state, op ssa.Op, t *types.Type) *ssa.Value {
a, b, c, d := uint8(_a.AuxInt8()), uint8(_b.AuxInt8()), uint8(_c.AuxInt8()), uint8(_d.AuxInt8()) a, b, c, d := uint8(_a.AuxInt8()), uint8(_b.AuxInt8()), uint8(_c.AuxInt8()), uint8(_d.AuxInt8())
pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1 pattern := a>>2 + (b&4)>>1 + (c & 4) + (d&4)<<1
@ -1692,54 +1756,54 @@ func selectFromPair(x, _a, _b, _c, _d, y *ssa.Value, s *state, op ssa.Op, t *typ
switch pattern { switch pattern {
case _LLLL: case _LLLL:
// TODO DETECT 0,1,2,3, 0,0,0,0 // TODO DETECT 0,1,2,3, 0,0,0,0
return s.newValue2I(op, t, cscimm(a, b, c, d), x, x) return s.newValue2I(op, t, cscimm4(a, b, c, d), x, x)
case _HHHH: case _HHHH:
// TODO DETECT 0,1,2,3, 0,0,0,0 // TODO DETECT 0,1,2,3, 0,0,0,0
return s.newValue2I(op, t, cscimm(a, b, c, d), y, y) return s.newValue2I(op, t, cscimm4(a, b, c, d), y, y)
case _LLHH: case _LLHH:
return s.newValue2I(op, t, cscimm(a, b, c, d), x, y) return s.newValue2I(op, t, cscimm4(a, b, c, d), x, y)
case _HHLL: case _HHLL:
return s.newValue2I(op, t, cscimm(a, b, c, d), y, x) return s.newValue2I(op, t, cscimm4(a, b, c, d), y, x)
case _HLLL: case _HLLL:
z := s.newValue2I(op, t, cscimm(a, a, b, b), y, x) z := s.newValue2I(op, t, cscimm4(a, a, b, b), y, x)
return s.newValue2I(op, t, cscimm(0, 2, c, d), z, x) return s.newValue2I(op, t, cscimm4(0, 2, c, d), z, x)
case _LHLL: case _LHLL:
z := s.newValue2I(op, t, cscimm(a, a, b, b), x, y) z := s.newValue2I(op, t, cscimm4(a, a, b, b), x, y)
return s.newValue2I(op, t, cscimm(0, 2, c, d), z, x) return s.newValue2I(op, t, cscimm4(0, 2, c, d), z, x)
case _HLHH: case _HLHH:
z := s.newValue2I(op, t, cscimm(a, a, b, b), y, x) z := s.newValue2I(op, t, cscimm4(a, a, b, b), y, x)
return s.newValue2I(op, t, cscimm(0, 2, c, d), z, y) return s.newValue2I(op, t, cscimm4(0, 2, c, d), z, y)
case _LHHH: case _LHHH:
z := s.newValue2I(op, t, cscimm(a, a, b, b), x, y) z := s.newValue2I(op, t, cscimm4(a, a, b, b), x, y)
return s.newValue2I(op, t, cscimm(0, 2, c, d), z, y) return s.newValue2I(op, t, cscimm4(0, 2, c, d), z, y)
case _LLLH: case _LLLH:
z := s.newValue2I(op, t, cscimm(c, c, d, d), x, y) z := s.newValue2I(op, t, cscimm4(c, c, d, d), x, y)
return s.newValue2I(op, t, cscimm(a, b, 0, 2), x, z) return s.newValue2I(op, t, cscimm4(a, b, 0, 2), x, z)
case _LLHL: case _LLHL:
z := s.newValue2I(op, t, cscimm(c, c, d, d), y, x) z := s.newValue2I(op, t, cscimm4(c, c, d, d), y, x)
return s.newValue2I(op, t, cscimm(a, b, 0, 2), x, z) return s.newValue2I(op, t, cscimm4(a, b, 0, 2), x, z)
case _HHLH: case _HHLH:
z := s.newValue2I(op, t, cscimm(c, c, d, d), x, y) z := s.newValue2I(op, t, cscimm4(c, c, d, d), x, y)
return s.newValue2I(op, t, cscimm(a, b, 0, 2), y, z) return s.newValue2I(op, t, cscimm4(a, b, 0, 2), y, z)
case _HHHL: case _HHHL:
z := s.newValue2I(op, t, cscimm(c, c, d, d), y, x) z := s.newValue2I(op, t, cscimm4(c, c, d, d), y, x)
return s.newValue2I(op, t, cscimm(a, b, 0, 2), y, z) return s.newValue2I(op, t, cscimm4(a, b, 0, 2), y, z)
case _LHLH: case _LHLH:
z := s.newValue2I(op, t, cscimm(a, c, b, d), x, y) z := s.newValue2I(op, t, cscimm4(a, c, b, d), x, y)
return s.newValue2I(op, t, se(0b11_01_10_00), z, z) return s.newValue2I(op, t, se(0b11_01_10_00), z, z)
case _HLHL: case _HLHL:
z := s.newValue2I(op, t, cscimm(b, d, a, c), x, y) z := s.newValue2I(op, t, cscimm4(b, d, a, c), x, y)
return s.newValue2I(op, t, se(0b01_11_00_10), z, z) return s.newValue2I(op, t, se(0b01_11_00_10), z, z)
case _HLLH: case _HLLH:
z := s.newValue2I(op, t, cscimm(b, c, a, d), x, y) z := s.newValue2I(op, t, cscimm4(b, c, a, d), x, y)
return s.newValue2I(op, t, se(0b11_01_00_10), z, z) return s.newValue2I(op, t, se(0b11_01_00_10), z, z)
case _LHHL: case _LHHL:
z := s.newValue2I(op, t, cscimm(a, d, b, c), x, y) z := s.newValue2I(op, t, cscimm4(a, d, b, c), x, y)
return s.newValue2I(op, t, se(0b01_11_10_00), z, z) return s.newValue2I(op, t, se(0b01_11_10_00), z, z)
} }
panic("The preceding switch should have been exhaustive") panic("The preceding switch should have been exhaustive")
@ -1906,7 +1970,7 @@ func opLen2Imm8_II(op ssa.Op, t *types.Type, _ int) func(s *state, n *ir.CallExp
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
if args[1].Op == ssa.OpConst8 && args[2].Op == ssa.OpConst8 && args[1].AuxInt & ^3 == 0 && args[2].AuxInt & ^3 == 0 { if args[1].Op == ssa.OpConst8 && args[2].Op == ssa.OpConst8 && args[1].AuxInt & ^3 == 0 && args[2].AuxInt & ^3 == 0 {
i1, i2 := args[1].AuxInt, args[2].AuxInt i1, i2 := args[1].AuxInt, args[2].AuxInt
return s.newValue2I(op, t, i1+i2<<4, args[0], args[3]) return s.newValue2I(op, t, int64(int8(i1+i2<<4)), args[0], args[3])
} }
four := s.constInt64(types.Types[types.TUINT8], 4) four := s.constInt64(types.Types[types.TUINT8], 4)
shifted := s.newValue2(ssa.OpLsh8x8, types.Types[types.TUINT8], args[2], four) shifted := s.newValue2(ssa.OpLsh8x8, types.Types[types.TUINT8], args[2], four)

View file

@ -595,7 +595,7 @@ func TestIsZero(t *testing.T) {
} }
} }
func TestSelectFromPairConst(t *testing.T) { func TestSelect4FromPairConst(t *testing.T) {
x := simd.LoadInt32x4Slice([]int32{0, 1, 2, 3}) x := simd.LoadInt32x4Slice([]int32{0, 1, 2, 3})
y := simd.LoadInt32x4Slice([]int32{4, 5, 6, 7}) y := simd.LoadInt32x4Slice([]int32{4, 5, 6, 7})
@ -652,7 +652,7 @@ func selectFromPairInt32x4(x simd.Int32x4, a, b, c, d uint8, y simd.Int32x4) sim
return x.SelectFromPair(a, b, c, d, y) return x.SelectFromPair(a, b, c, d, y)
} }
func TestSelectFromPairVar(t *testing.T) { func TestSelect4FromPairVar(t *testing.T) {
x := simd.LoadInt32x4Slice([]int32{0, 1, 2, 3}) x := simd.LoadInt32x4Slice([]int32{0, 1, 2, 3})
y := simd.LoadInt32x4Slice([]int32{4, 5, 6, 7}) y := simd.LoadInt32x4Slice([]int32{4, 5, 6, 7})
@ -704,7 +704,7 @@ func TestSelectFromPairVar(t *testing.T) {
foo(hllh, 4, 0, 1, 5) foo(hllh, 4, 0, 1, 5)
} }
func TestSelectFromPairConstGroupedFloat32x8(t *testing.T) { func TestSelect4FromPairConstGrouped(t *testing.T) {
x := simd.LoadFloat32x8Slice([]float32{0, 1, 2, 3, 10, 11, 12, 13}) x := simd.LoadFloat32x8Slice([]float32{0, 1, 2, 3, 10, 11, 12, 13})
y := simd.LoadFloat32x8Slice([]float32{4, 5, 6, 7, 14, 15, 16, 17}) y := simd.LoadFloat32x8Slice([]float32{4, 5, 6, 7, 14, 15, 16, 17})
@ -887,5 +887,119 @@ func TestSelect128FromPairVar(t *testing.T) {
foo(cd, 2, 3) foo(cd, 2, 3)
foo(da, 3, 0) foo(da, 3, 0)
foo(dc, 3, 2) foo(dc, 3, 2)
}
// TestSelect2FromPairConst checks Uint64x2.SelectFromPair with literal
// selectors. With x = {0, 1} and y = {2, 3} each expected element equals its
// selector, so selectors 0-1 pick from x and 2-3 pick from y. The four calls
// cover both elements from x, both from y, and one from each in either order.
func TestSelect2FromPairConst(t *testing.T) {
	x := simd.LoadUint64x2Slice([]uint64{0, 1})
	y := simd.LoadUint64x2Slice([]uint64{2, 3})

	fromXX := x.SelectFromPair(0, 1, y)
	fromYY := x.SelectFromPair(3, 2, y)
	fromXY := x.SelectFromPair(0, 3, y)
	fromYX := x.SelectFromPair(2, 1, y)

	got := make([]uint64, 2)
	expect := func(v simd.Uint64x2, e0, e1 uint64) {
		v.StoreSlice(got)
		checkSlices[uint64](t, got, []uint64{e0, e1})
	}

	expect(fromXX, 0, 1)
	expect(fromYY, 3, 2)
	expect(fromXY, 0, 3)
	expect(fromYX, 2, 1)
}
// TestSelect2FromPairConstGroupedUint checks Uint64x4.SelectFromPairGrouped
// with literal selectors. The second two-element group holds the first
// group's values plus 10, and the expected result applies the same selection
// to both groups.
func TestSelect2FromPairConstGroupedUint(t *testing.T) {
	x := simd.LoadUint64x4Slice([]uint64{0, 1, 10, 11})
	y := simd.LoadUint64x4Slice([]uint64{2, 3, 12, 13})

	fromXX := x.SelectFromPairGrouped(0, 1, y)
	fromYY := x.SelectFromPairGrouped(3, 2, y)
	fromXY := x.SelectFromPairGrouped(0, 3, y)
	fromYX := x.SelectFromPairGrouped(2, 1, y)

	got := make([]uint64, 4)
	expect := func(v simd.Uint64x4, e0, e1 uint64) {
		v.StoreSlice(got)
		// Second group mirrors the first, offset by 10.
		checkSlices[uint64](t, got, []uint64{e0, e1, e0 + 10, e1 + 10})
	}

	expect(fromXX, 0, 1)
	expect(fromYY, 3, 2)
	expect(fromXY, 0, 3)
	expect(fromYX, 2, 1)
}
// TestSelect2FromPairConstGroupedFloat checks Float64x4.SelectFromPairGrouped
// with literal selectors. The second two-element group holds the first
// group's values plus 10, and the expected result applies the same selection
// to both groups.
func TestSelect2FromPairConstGroupedFloat(t *testing.T) {
	x := simd.LoadFloat64x4Slice([]float64{0, 1, 10, 11})
	y := simd.LoadFloat64x4Slice([]float64{2, 3, 12, 13})

	fromXX := x.SelectFromPairGrouped(0, 1, y)
	fromYY := x.SelectFromPairGrouped(3, 2, y)
	fromXY := x.SelectFromPairGrouped(0, 3, y)
	fromYX := x.SelectFromPairGrouped(2, 1, y)

	got := make([]float64, 4)
	expect := func(v simd.Float64x4, e0, e1 float64) {
		v.StoreSlice(got)
		// Second group mirrors the first, offset by 10.
		checkSlices[float64](t, got, []float64{e0, e1, e0 + 10, e1 + 10})
	}

	expect(fromXX, 0, 1)
	expect(fromYY, 3, 2)
	expect(fromXY, 0, 3)
	expect(fromYX, 2, 1)
}
// TestSelect2FromPairConstGroupedInt checks Int64x4.SelectFromPairGrouped
// with literal selectors. The second two-element group holds the first
// group's values plus 10, and the expected result applies the same selection
// to both groups.
func TestSelect2FromPairConstGroupedInt(t *testing.T) {
	x := simd.LoadInt64x4Slice([]int64{0, 1, 10, 11})
	y := simd.LoadInt64x4Slice([]int64{2, 3, 12, 13})

	fromXX := x.SelectFromPairGrouped(0, 1, y)
	fromYY := x.SelectFromPairGrouped(3, 2, y)
	fromXY := x.SelectFromPairGrouped(0, 3, y)
	fromYX := x.SelectFromPairGrouped(2, 1, y)

	got := make([]int64, 4)
	expect := func(v simd.Int64x4, e0, e1 int64) {
		v.StoreSlice(got)
		// Second group mirrors the first, offset by 10.
		checkSlices[int64](t, got, []int64{e0, e1, e0 + 10, e1 + 10})
	}

	expect(fromXX, 0, 1)
	expect(fromYY, 3, 2)
	expect(fromXY, 0, 3)
	expect(fromYX, 2, 1)
}
func TestSelect2FromPairConstGroupedInt512(t *testing.T) {
if !simd.HasAVX512() {
t.Skip("Test requires HasAVX512, not available on this hardware")
return
}
x := simd.LoadInt64x8Slice([]int64{0, 1, 10, 11, 20, 21, 30, 31})
y := simd.LoadInt64x8Slice([]int64{2, 3, 12, 13, 22, 23, 32, 33})
ll := x.SelectFromPairGrouped(0, 1, y)
hh := x.SelectFromPairGrouped(3, 2, y)
lh := x.SelectFromPairGrouped(0, 3, y)
hl := x.SelectFromPairGrouped(2, 1, y)
r := make([]int64, 8, 8)
foo := func(v simd.Int64x8, a, b int64) {
v.StoreSlice(r)
checkSlices[int64](t, r, []int64{a, b, a + 10, b + 10, a + 20, b + 20, a + 30, b + 30})
}
foo(ll, 0, 1)
foo(hh, 3, 2)
foo(lh, 0, 3)
foo(hl, 2, 1)
} }

View file

@ -99,53 +99,53 @@ func select2x4x32(x Int32x4, a, b, c, d uint8, y Int32x4) Int32x4 {
switch pattern { switch pattern {
case _LLLL: case _LLLL:
return x.concatSelectedConstant(cscimm(a, b, c, d), x) return x.concatSelectedConstant(cscimm4(a, b, c, d), x)
case _HHHH: case _HHHH:
return y.concatSelectedConstant(cscimm(a, b, c, d), y) return y.concatSelectedConstant(cscimm4(a, b, c, d), y)
case _LLHH: case _LLHH:
return x.concatSelectedConstant(cscimm(a, b, c, d), y) return x.concatSelectedConstant(cscimm4(a, b, c, d), y)
case _HHLL: case _HHLL:
return y.concatSelectedConstant(cscimm(a, b, c, d), x) return y.concatSelectedConstant(cscimm4(a, b, c, d), x)
case _HLLL: case _HLLL:
z := y.concatSelectedConstant(cscimm(a, a, b, b), x) z := y.concatSelectedConstant(cscimm4(a, a, b, b), x)
return z.concatSelectedConstant(cscimm(0, 2, c, d), x) return z.concatSelectedConstant(cscimm4(0, 2, c, d), x)
case _LHLL: case _LHLL:
z := x.concatSelectedConstant(cscimm(a, a, b, b), y) z := x.concatSelectedConstant(cscimm4(a, a, b, b), y)
return z.concatSelectedConstant(cscimm(0, 2, c, d), x) return z.concatSelectedConstant(cscimm4(0, 2, c, d), x)
case _HLHH: case _HLHH:
z := y.concatSelectedConstant(cscimm(a, a, b, b), x) z := y.concatSelectedConstant(cscimm4(a, a, b, b), x)
return z.concatSelectedConstant(cscimm(0, 2, c, d), y) return z.concatSelectedConstant(cscimm4(0, 2, c, d), y)
case _LHHH: case _LHHH:
z := x.concatSelectedConstant(cscimm(a, a, b, b), y) z := x.concatSelectedConstant(cscimm4(a, a, b, b), y)
return z.concatSelectedConstant(cscimm(0, 2, c, d), y) return z.concatSelectedConstant(cscimm4(0, 2, c, d), y)
case _LLLH: case _LLLH:
z := x.concatSelectedConstant(cscimm(c, c, d, d), y) z := x.concatSelectedConstant(cscimm4(c, c, d, d), y)
return x.concatSelectedConstant(cscimm(a, b, 0, 2), z) return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
case _LLHL: case _LLHL:
z := y.concatSelectedConstant(cscimm(c, c, d, d), x) z := y.concatSelectedConstant(cscimm4(c, c, d, d), x)
return x.concatSelectedConstant(cscimm(a, b, 0, 2), z) return x.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
case _HHLH: case _HHLH:
z := x.concatSelectedConstant(cscimm(c, c, d, d), y) z := x.concatSelectedConstant(cscimm4(c, c, d, d), y)
return y.concatSelectedConstant(cscimm(a, b, 0, 2), z) return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
case _HHHL: case _HHHL:
z := y.concatSelectedConstant(cscimm(c, c, d, d), x) z := y.concatSelectedConstant(cscimm4(c, c, d, d), x)
return y.concatSelectedConstant(cscimm(a, b, 0, 2), z) return y.concatSelectedConstant(cscimm4(a, b, 0, 2), z)
case _LHLH: case _LHLH:
z := x.concatSelectedConstant(cscimm(a, c, b, d), y) z := x.concatSelectedConstant(cscimm4(a, c, b, d), y)
return z.concatSelectedConstant(0b11_01_10_00 /* =cscimm(0, 2, 1, 3) */, z) return z.concatSelectedConstant(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z)
case _HLHL: case _HLHL:
z := x.concatSelectedConstant(cscimm(b, d, a, c), y) z := x.concatSelectedConstant(cscimm4(b, d, a, c), y)
return z.concatSelectedConstant(0b01_11_00_10 /* =cscimm(2, 0, 3, 1) */, z) return z.concatSelectedConstant(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z)
case _HLLH: case _HLLH:
z := x.concatSelectedConstant(cscimm(b, c, a, d), y) z := x.concatSelectedConstant(cscimm4(b, c, a, d), y)
return z.concatSelectedConstant(0b11_01_00_10 /* =cscimm(2, 0, 1, 3) */, z) return z.concatSelectedConstant(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z)
case _LHHL: case _LHHL:
z := x.concatSelectedConstant(cscimm(a, d, b, c), y) z := x.concatSelectedConstant(cscimm4(a, d, b, c), y)
return z.concatSelectedConstant(0b01_11_10_00 /* =cscimm(0, 2, 3, 1) */, z) return z.concatSelectedConstant(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z)
} }
panic("missing case, switch should be exhaustive") panic("missing case, switch should be exhaustive")
} }
@ -180,53 +180,53 @@ func select2x8x32Grouped(x Int32x8, a, b, c, d uint8, y Int32x8) Int32x8 {
switch pattern { switch pattern {
case _LLLL: case _LLLL:
return x.concatSelectedConstantGrouped(cscimm(a, b, c, d), x) return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
case _HHHH: case _HHHH:
return y.concatSelectedConstantGrouped(cscimm(a, b, c, d), y) return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
case _LLHH: case _LLHH:
return x.concatSelectedConstantGrouped(cscimm(a, b, c, d), y) return x.concatSelectedConstantGrouped(cscimm4(a, b, c, d), y)
case _HHLL: case _HHLL:
return y.concatSelectedConstantGrouped(cscimm(a, b, c, d), x) return y.concatSelectedConstantGrouped(cscimm4(a, b, c, d), x)
case _HLLL: case _HLLL:
z := y.concatSelectedConstantGrouped(cscimm(a, a, b, b), x) z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), x) return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
case _LHLL: case _LHLL:
z := x.concatSelectedConstantGrouped(cscimm(a, a, b, b), y) z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), x) return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), x)
case _HLHH: case _HLHH:
z := y.concatSelectedConstantGrouped(cscimm(a, a, b, b), x) z := y.concatSelectedConstantGrouped(cscimm4(a, a, b, b), x)
return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), y) return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
case _LHHH: case _LHHH:
z := x.concatSelectedConstantGrouped(cscimm(a, a, b, b), y) z := x.concatSelectedConstantGrouped(cscimm4(a, a, b, b), y)
return z.concatSelectedConstantGrouped(cscimm(0, 2, c, d), y) return z.concatSelectedConstantGrouped(cscimm4(0, 2, c, d), y)
case _LLLH: case _LLLH:
z := x.concatSelectedConstantGrouped(cscimm(c, c, d, d), y) z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
return x.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
case _LLHL: case _LLHL:
z := y.concatSelectedConstantGrouped(cscimm(c, c, d, d), x) z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
return x.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) return x.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
case _HHLH: case _HHLH:
z := x.concatSelectedConstantGrouped(cscimm(c, c, d, d), y) z := x.concatSelectedConstantGrouped(cscimm4(c, c, d, d), y)
return y.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
case _HHHL: case _HHHL:
z := y.concatSelectedConstantGrouped(cscimm(c, c, d, d), x) z := y.concatSelectedConstantGrouped(cscimm4(c, c, d, d), x)
return y.concatSelectedConstantGrouped(cscimm(a, b, 0, 2), z) return y.concatSelectedConstantGrouped(cscimm4(a, b, 0, 2), z)
case _LHLH: case _LHLH:
z := x.concatSelectedConstantGrouped(cscimm(a, c, b, d), y) z := x.concatSelectedConstantGrouped(cscimm4(a, c, b, d), y)
return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm(0, 2, 1, 3) */, z) return z.concatSelectedConstantGrouped(0b11_01_10_00 /* =cscimm4(0, 2, 1, 3) */, z)
case _HLHL: case _HLHL:
z := x.concatSelectedConstantGrouped(cscimm(b, d, a, c), y) z := x.concatSelectedConstantGrouped(cscimm4(b, d, a, c), y)
return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm(2, 0, 3, 1) */, z) return z.concatSelectedConstantGrouped(0b01_11_00_10 /* =cscimm4(2, 0, 3, 1) */, z)
case _HLLH: case _HLLH:
z := x.concatSelectedConstantGrouped(cscimm(b, c, a, d), y) z := x.concatSelectedConstantGrouped(cscimm4(b, c, a, d), y)
return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm(2, 0, 1, 3) */, z) return z.concatSelectedConstantGrouped(0b11_01_00_10 /* =cscimm4(2, 0, 1, 3) */, z)
case _LHHL: case _LHHL:
z := x.concatSelectedConstantGrouped(cscimm(a, d, b, c), y) z := x.concatSelectedConstantGrouped(cscimm4(a, d, b, c), y)
return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm(0, 2, 3, 1) */, z) return z.concatSelectedConstantGrouped(0b01_11_10_00 /* =cscimm4(0, 2, 3, 1) */, z)
} }
panic("missing case, switch should be exhaustive") panic("missing case, switch should be exhaustive")
} }

File diff suppressed because it is too large Load diff