[dev.simd] simd, cmd/compile: add Interleave{Hi,Lo} (VPUNPCK*)

these are building blocks for transpose, not sure of their
best names yet.

Change-Id: I3800a55de9fa7fde2590ca822894c8a75387dec3
Reviewed-on: https://go-review.googlesource.com/c/go/+/698576
Reviewed-by: Junyang Shao <shaojunyang@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
This commit is contained in:
David Chase 2025-08-20 16:58:55 -04:00
parent 6890aa2e20
commit b509516b2e
12 changed files with 1021 additions and 2 deletions

View file

@ -243,6 +243,24 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
ssa.OpAMD64VPCMPGTD256,
ssa.OpAMD64VPCMPGTQ128,
ssa.OpAMD64VPCMPGTQ256,
ssa.OpAMD64VPUNPCKHWD128,
ssa.OpAMD64VPUNPCKHDQ128,
ssa.OpAMD64VPUNPCKHQDQ128,
ssa.OpAMD64VPUNPCKHWD256,
ssa.OpAMD64VPUNPCKHWD512,
ssa.OpAMD64VPUNPCKHDQ256,
ssa.OpAMD64VPUNPCKHDQ512,
ssa.OpAMD64VPUNPCKHQDQ256,
ssa.OpAMD64VPUNPCKHQDQ512,
ssa.OpAMD64VPUNPCKLWD128,
ssa.OpAMD64VPUNPCKLDQ128,
ssa.OpAMD64VPUNPCKLQDQ128,
ssa.OpAMD64VPUNPCKLWD256,
ssa.OpAMD64VPUNPCKLWD512,
ssa.OpAMD64VPUNPCKLDQ256,
ssa.OpAMD64VPUNPCKLDQ512,
ssa.OpAMD64VPUNPCKLQDQ256,
ssa.OpAMD64VPUNPCKLQDQ512,
ssa.OpAMD64VMAXPS128,
ssa.OpAMD64VMAXPS256,
ssa.OpAMD64VMAXPS512,

View file

@ -520,6 +520,42 @@
(GreaterEqualUint16x32 x y) => (VPMOVMToVec16x32 (VPCMPUW512 [13] x y))
(GreaterEqualUint32x16 x y) => (VPMOVMToVec32x16 (VPCMPUD512 [13] x y))
(GreaterEqualUint64x8 x y) => (VPMOVMToVec64x8 (VPCMPUQ512 [13] x y))
(InterleaveHiInt16x8 ...) => (VPUNPCKHWD128 ...)
(InterleaveHiInt32x4 ...) => (VPUNPCKHDQ128 ...)
(InterleaveHiInt64x2 ...) => (VPUNPCKHQDQ128 ...)
(InterleaveHiUint16x8 ...) => (VPUNPCKHWD128 ...)
(InterleaveHiUint32x4 ...) => (VPUNPCKHDQ128 ...)
(InterleaveHiUint64x2 ...) => (VPUNPCKHQDQ128 ...)
(InterleaveHiGroupedInt16x16 ...) => (VPUNPCKHWD256 ...)
(InterleaveHiGroupedInt16x32 ...) => (VPUNPCKHWD512 ...)
(InterleaveHiGroupedInt32x8 ...) => (VPUNPCKHDQ256 ...)
(InterleaveHiGroupedInt32x16 ...) => (VPUNPCKHDQ512 ...)
(InterleaveHiGroupedInt64x4 ...) => (VPUNPCKHQDQ256 ...)
(InterleaveHiGroupedInt64x8 ...) => (VPUNPCKHQDQ512 ...)
(InterleaveHiGroupedUint16x16 ...) => (VPUNPCKHWD256 ...)
(InterleaveHiGroupedUint16x32 ...) => (VPUNPCKHWD512 ...)
(InterleaveHiGroupedUint32x8 ...) => (VPUNPCKHDQ256 ...)
(InterleaveHiGroupedUint32x16 ...) => (VPUNPCKHDQ512 ...)
(InterleaveHiGroupedUint64x4 ...) => (VPUNPCKHQDQ256 ...)
(InterleaveHiGroupedUint64x8 ...) => (VPUNPCKHQDQ512 ...)
(InterleaveLoInt16x8 ...) => (VPUNPCKLWD128 ...)
(InterleaveLoInt32x4 ...) => (VPUNPCKLDQ128 ...)
(InterleaveLoInt64x2 ...) => (VPUNPCKLQDQ128 ...)
(InterleaveLoUint16x8 ...) => (VPUNPCKLWD128 ...)
(InterleaveLoUint32x4 ...) => (VPUNPCKLDQ128 ...)
(InterleaveLoUint64x2 ...) => (VPUNPCKLQDQ128 ...)
(InterleaveLoGroupedInt16x16 ...) => (VPUNPCKLWD256 ...)
(InterleaveLoGroupedInt16x32 ...) => (VPUNPCKLWD512 ...)
(InterleaveLoGroupedInt32x8 ...) => (VPUNPCKLDQ256 ...)
(InterleaveLoGroupedInt32x16 ...) => (VPUNPCKLDQ512 ...)
(InterleaveLoGroupedInt64x4 ...) => (VPUNPCKLQDQ256 ...)
(InterleaveLoGroupedInt64x8 ...) => (VPUNPCKLQDQ512 ...)
(InterleaveLoGroupedUint16x16 ...) => (VPUNPCKLWD256 ...)
(InterleaveLoGroupedUint16x32 ...) => (VPUNPCKLWD512 ...)
(InterleaveLoGroupedUint32x8 ...) => (VPUNPCKLDQ256 ...)
(InterleaveLoGroupedUint32x16 ...) => (VPUNPCKLDQ512 ...)
(InterleaveLoGroupedUint64x4 ...) => (VPUNPCKLQDQ256 ...)
(InterleaveLoGroupedUint64x8 ...) => (VPUNPCKLQDQ512 ...)
(IsNanFloat32x4 x y) => (VCMPPS128 [3] x y)
(IsNanFloat32x8 x y) => (VCMPPS256 [3] x y)
(IsNanFloat32x16 x y) => (VPMOVMToVec32x16 (VCMPPS512 [3] x y))

View file

@ -983,6 +983,24 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf
{name: "VPSUBWMasked128", argLength: 3, reg: w2kw, asm: "VPSUBW", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPSUBWMasked256", argLength: 3, reg: w2kw, asm: "VPSUBW", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPSUBWMasked512", argLength: 3, reg: w2kw, asm: "VPSUBW", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPUNPCKHDQ128", argLength: 2, reg: v21, asm: "VPUNPCKHDQ", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPUNPCKHDQ256", argLength: 2, reg: v21, asm: "VPUNPCKHDQ", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPUNPCKHDQ512", argLength: 2, reg: w21, asm: "VPUNPCKHDQ", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPUNPCKHQDQ128", argLength: 2, reg: v21, asm: "VPUNPCKHQDQ", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPUNPCKHQDQ256", argLength: 2, reg: v21, asm: "VPUNPCKHQDQ", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPUNPCKHQDQ512", argLength: 2, reg: w21, asm: "VPUNPCKHQDQ", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPUNPCKHWD128", argLength: 2, reg: v21, asm: "VPUNPCKHWD", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPUNPCKHWD256", argLength: 2, reg: v21, asm: "VPUNPCKHWD", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPUNPCKHWD512", argLength: 2, reg: w21, asm: "VPUNPCKHWD", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPUNPCKLDQ128", argLength: 2, reg: v21, asm: "VPUNPCKLDQ", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPUNPCKLDQ256", argLength: 2, reg: v21, asm: "VPUNPCKLDQ", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPUNPCKLDQ512", argLength: 2, reg: w21, asm: "VPUNPCKLDQ", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPUNPCKLQDQ128", argLength: 2, reg: v21, asm: "VPUNPCKLQDQ", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPUNPCKLQDQ256", argLength: 2, reg: v21, asm: "VPUNPCKLQDQ", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPUNPCKLQDQ512", argLength: 2, reg: w21, asm: "VPUNPCKLQDQ", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPUNPCKLWD128", argLength: 2, reg: v21, asm: "VPUNPCKLWD", commutative: false, typ: "Vec128", resultInArg0: false},
{name: "VPUNPCKLWD256", argLength: 2, reg: v21, asm: "VPUNPCKLWD", commutative: false, typ: "Vec256", resultInArg0: false},
{name: "VPUNPCKLWD512", argLength: 2, reg: w21, asm: "VPUNPCKLWD", commutative: false, typ: "Vec512", resultInArg0: false},
{name: "VPXOR128", argLength: 2, reg: v21, asm: "VPXOR", commutative: true, typ: "Vec128", resultInArg0: false},
{name: "VPXOR256", argLength: 2, reg: v21, asm: "VPXOR", commutative: true, typ: "Vec256", resultInArg0: false},
{name: "VPXORD512", argLength: 2, reg: w21, asm: "VPXORD", commutative: true, typ: "Vec512", resultInArg0: false},

View file

@ -484,6 +484,42 @@ func simdGenericOps() []opData {
{name: "GreaterUint16x32", argLength: 2, commutative: false},
{name: "GreaterUint32x16", argLength: 2, commutative: false},
{name: "GreaterUint64x8", argLength: 2, commutative: false},
{name: "InterleaveHiGroupedInt16x16", argLength: 2, commutative: false},
{name: "InterleaveHiGroupedInt16x32", argLength: 2, commutative: false},
{name: "InterleaveHiGroupedInt32x8", argLength: 2, commutative: false},
{name: "InterleaveHiGroupedInt32x16", argLength: 2, commutative: false},
{name: "InterleaveHiGroupedInt64x4", argLength: 2, commutative: false},
{name: "InterleaveHiGroupedInt64x8", argLength: 2, commutative: false},
{name: "InterleaveHiGroupedUint16x16", argLength: 2, commutative: false},
{name: "InterleaveHiGroupedUint16x32", argLength: 2, commutative: false},
{name: "InterleaveHiGroupedUint32x8", argLength: 2, commutative: false},
{name: "InterleaveHiGroupedUint32x16", argLength: 2, commutative: false},
{name: "InterleaveHiGroupedUint64x4", argLength: 2, commutative: false},
{name: "InterleaveHiGroupedUint64x8", argLength: 2, commutative: false},
{name: "InterleaveHiInt16x8", argLength: 2, commutative: false},
{name: "InterleaveHiInt32x4", argLength: 2, commutative: false},
{name: "InterleaveHiInt64x2", argLength: 2, commutative: false},
{name: "InterleaveHiUint16x8", argLength: 2, commutative: false},
{name: "InterleaveHiUint32x4", argLength: 2, commutative: false},
{name: "InterleaveHiUint64x2", argLength: 2, commutative: false},
{name: "InterleaveLoGroupedInt16x16", argLength: 2, commutative: false},
{name: "InterleaveLoGroupedInt16x32", argLength: 2, commutative: false},
{name: "InterleaveLoGroupedInt32x8", argLength: 2, commutative: false},
{name: "InterleaveLoGroupedInt32x16", argLength: 2, commutative: false},
{name: "InterleaveLoGroupedInt64x4", argLength: 2, commutative: false},
{name: "InterleaveLoGroupedInt64x8", argLength: 2, commutative: false},
{name: "InterleaveLoGroupedUint16x16", argLength: 2, commutative: false},
{name: "InterleaveLoGroupedUint16x32", argLength: 2, commutative: false},
{name: "InterleaveLoGroupedUint32x8", argLength: 2, commutative: false},
{name: "InterleaveLoGroupedUint32x16", argLength: 2, commutative: false},
{name: "InterleaveLoGroupedUint64x4", argLength: 2, commutative: false},
{name: "InterleaveLoGroupedUint64x8", argLength: 2, commutative: false},
{name: "InterleaveLoInt16x8", argLength: 2, commutative: false},
{name: "InterleaveLoInt32x4", argLength: 2, commutative: false},
{name: "InterleaveLoInt64x2", argLength: 2, commutative: false},
{name: "InterleaveLoUint16x8", argLength: 2, commutative: false},
{name: "InterleaveLoUint32x4", argLength: 2, commutative: false},
{name: "InterleaveLoUint64x2", argLength: 2, commutative: false},
{name: "IsNanFloat32x4", argLength: 2, commutative: true},
{name: "IsNanFloat32x8", argLength: 2, commutative: true},
{name: "IsNanFloat32x16", argLength: 2, commutative: true},

View file

@ -2215,6 +2215,24 @@ const (
OpAMD64VPSUBWMasked128
OpAMD64VPSUBWMasked256
OpAMD64VPSUBWMasked512
OpAMD64VPUNPCKHDQ128
OpAMD64VPUNPCKHDQ256
OpAMD64VPUNPCKHDQ512
OpAMD64VPUNPCKHQDQ128
OpAMD64VPUNPCKHQDQ256
OpAMD64VPUNPCKHQDQ512
OpAMD64VPUNPCKHWD128
OpAMD64VPUNPCKHWD256
OpAMD64VPUNPCKHWD512
OpAMD64VPUNPCKLDQ128
OpAMD64VPUNPCKLDQ256
OpAMD64VPUNPCKLDQ512
OpAMD64VPUNPCKLQDQ128
OpAMD64VPUNPCKLQDQ256
OpAMD64VPUNPCKLQDQ512
OpAMD64VPUNPCKLWD128
OpAMD64VPUNPCKLWD256
OpAMD64VPUNPCKLWD512
OpAMD64VPXOR128
OpAMD64VPXOR256
OpAMD64VPXORD512
@ -5288,6 +5306,42 @@ const (
OpGreaterUint16x32
OpGreaterUint32x16
OpGreaterUint64x8
OpInterleaveHiGroupedInt16x16
OpInterleaveHiGroupedInt16x32
OpInterleaveHiGroupedInt32x8
OpInterleaveHiGroupedInt32x16
OpInterleaveHiGroupedInt64x4
OpInterleaveHiGroupedInt64x8
OpInterleaveHiGroupedUint16x16
OpInterleaveHiGroupedUint16x32
OpInterleaveHiGroupedUint32x8
OpInterleaveHiGroupedUint32x16
OpInterleaveHiGroupedUint64x4
OpInterleaveHiGroupedUint64x8
OpInterleaveHiInt16x8
OpInterleaveHiInt32x4
OpInterleaveHiInt64x2
OpInterleaveHiUint16x8
OpInterleaveHiUint32x4
OpInterleaveHiUint64x2
OpInterleaveLoGroupedInt16x16
OpInterleaveLoGroupedInt16x32
OpInterleaveLoGroupedInt32x8
OpInterleaveLoGroupedInt32x16
OpInterleaveLoGroupedInt64x4
OpInterleaveLoGroupedInt64x8
OpInterleaveLoGroupedUint16x16
OpInterleaveLoGroupedUint16x32
OpInterleaveLoGroupedUint32x8
OpInterleaveLoGroupedUint32x16
OpInterleaveLoGroupedUint64x4
OpInterleaveLoGroupedUint64x8
OpInterleaveLoInt16x8
OpInterleaveLoInt32x4
OpInterleaveLoInt64x2
OpInterleaveLoUint16x8
OpInterleaveLoUint32x4
OpInterleaveLoUint64x2
OpIsNanFloat32x4
OpIsNanFloat32x8
OpIsNanFloat32x16
@ -33629,6 +33683,258 @@ var opcodeTable = [...]opInfo{
},
},
},
{
name: "VPUNPCKHDQ128",
argLen: 2,
asm: x86.AVPUNPCKHDQ,
reg: regInfo{
inputs: []inputInfo{
{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
{1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
},
outputs: []outputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
},
},
{
name: "VPUNPCKHDQ256",
argLen: 2,
asm: x86.AVPUNPCKHDQ,
reg: regInfo{
inputs: []inputInfo{
{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
{1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
},
outputs: []outputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
},
},
{
name: "VPUNPCKHDQ512",
argLen: 2,
asm: x86.AVPUNPCKHDQ,
reg: regInfo{
inputs: []inputInfo{
{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPUNPCKHQDQ128",
argLen: 2,
asm: x86.AVPUNPCKHQDQ,
reg: regInfo{
inputs: []inputInfo{
{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
{1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
},
outputs: []outputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
},
},
{
name: "VPUNPCKHQDQ256",
argLen: 2,
asm: x86.AVPUNPCKHQDQ,
reg: regInfo{
inputs: []inputInfo{
{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
{1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
},
outputs: []outputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
},
},
{
name: "VPUNPCKHQDQ512",
argLen: 2,
asm: x86.AVPUNPCKHQDQ,
reg: regInfo{
inputs: []inputInfo{
{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPUNPCKHWD128",
argLen: 2,
asm: x86.AVPUNPCKHWD,
reg: regInfo{
inputs: []inputInfo{
{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
{1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
},
outputs: []outputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
},
},
{
name: "VPUNPCKHWD256",
argLen: 2,
asm: x86.AVPUNPCKHWD,
reg: regInfo{
inputs: []inputInfo{
{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
{1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
},
outputs: []outputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
},
},
{
name: "VPUNPCKHWD512",
argLen: 2,
asm: x86.AVPUNPCKHWD,
reg: regInfo{
inputs: []inputInfo{
{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPUNPCKLDQ128",
argLen: 2,
asm: x86.AVPUNPCKLDQ,
reg: regInfo{
inputs: []inputInfo{
{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
{1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
},
outputs: []outputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
},
},
{
name: "VPUNPCKLDQ256",
argLen: 2,
asm: x86.AVPUNPCKLDQ,
reg: regInfo{
inputs: []inputInfo{
{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
{1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
},
outputs: []outputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
},
},
{
name: "VPUNPCKLDQ512",
argLen: 2,
asm: x86.AVPUNPCKLDQ,
reg: regInfo{
inputs: []inputInfo{
{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPUNPCKLQDQ128",
argLen: 2,
asm: x86.AVPUNPCKLQDQ,
reg: regInfo{
inputs: []inputInfo{
{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
{1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
},
outputs: []outputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
},
},
{
name: "VPUNPCKLQDQ256",
argLen: 2,
asm: x86.AVPUNPCKLQDQ,
reg: regInfo{
inputs: []inputInfo{
{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
{1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
},
outputs: []outputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
},
},
{
name: "VPUNPCKLQDQ512",
argLen: 2,
asm: x86.AVPUNPCKLQDQ,
reg: regInfo{
inputs: []inputInfo{
{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPUNPCKLWD128",
argLen: 2,
asm: x86.AVPUNPCKLWD,
reg: regInfo{
inputs: []inputInfo{
{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
{1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
},
outputs: []outputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
},
},
{
name: "VPUNPCKLWD256",
argLen: 2,
asm: x86.AVPUNPCKLWD,
reg: regInfo{
inputs: []inputInfo{
{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
{1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
},
outputs: []outputInfo{
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
},
},
},
{
name: "VPUNPCKLWD512",
argLen: 2,
asm: x86.AVPUNPCKLWD,
reg: regInfo{
inputs: []inputInfo{
{0, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
{1, 281474976645120}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
outputs: []outputInfo{
{0, 281472829161472}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X16 X17 X18 X19 X20 X21 X22 X23 X24 X25 X26 X27 X28 X29 X30 X31
},
},
},
{
name: "VPXOR128",
argLen: 2,
@ -68116,6 +68422,186 @@ var opcodeTable = [...]opInfo{
argLen: 2,
generic: true,
},
{
name: "InterleaveHiGroupedInt16x16",
argLen: 2,
generic: true,
},
{
name: "InterleaveHiGroupedInt16x32",
argLen: 2,
generic: true,
},
{
name: "InterleaveHiGroupedInt32x8",
argLen: 2,
generic: true,
},
{
name: "InterleaveHiGroupedInt32x16",
argLen: 2,
generic: true,
},
{
name: "InterleaveHiGroupedInt64x4",
argLen: 2,
generic: true,
},
{
name: "InterleaveHiGroupedInt64x8",
argLen: 2,
generic: true,
},
{
name: "InterleaveHiGroupedUint16x16",
argLen: 2,
generic: true,
},
{
name: "InterleaveHiGroupedUint16x32",
argLen: 2,
generic: true,
},
{
name: "InterleaveHiGroupedUint32x8",
argLen: 2,
generic: true,
},
{
name: "InterleaveHiGroupedUint32x16",
argLen: 2,
generic: true,
},
{
name: "InterleaveHiGroupedUint64x4",
argLen: 2,
generic: true,
},
{
name: "InterleaveHiGroupedUint64x8",
argLen: 2,
generic: true,
},
{
name: "InterleaveHiInt16x8",
argLen: 2,
generic: true,
},
{
name: "InterleaveHiInt32x4",
argLen: 2,
generic: true,
},
{
name: "InterleaveHiInt64x2",
argLen: 2,
generic: true,
},
{
name: "InterleaveHiUint16x8",
argLen: 2,
generic: true,
},
{
name: "InterleaveHiUint32x4",
argLen: 2,
generic: true,
},
{
name: "InterleaveHiUint64x2",
argLen: 2,
generic: true,
},
{
name: "InterleaveLoGroupedInt16x16",
argLen: 2,
generic: true,
},
{
name: "InterleaveLoGroupedInt16x32",
argLen: 2,
generic: true,
},
{
name: "InterleaveLoGroupedInt32x8",
argLen: 2,
generic: true,
},
{
name: "InterleaveLoGroupedInt32x16",
argLen: 2,
generic: true,
},
{
name: "InterleaveLoGroupedInt64x4",
argLen: 2,
generic: true,
},
{
name: "InterleaveLoGroupedInt64x8",
argLen: 2,
generic: true,
},
{
name: "InterleaveLoGroupedUint16x16",
argLen: 2,
generic: true,
},
{
name: "InterleaveLoGroupedUint16x32",
argLen: 2,
generic: true,
},
{
name: "InterleaveLoGroupedUint32x8",
argLen: 2,
generic: true,
},
{
name: "InterleaveLoGroupedUint32x16",
argLen: 2,
generic: true,
},
{
name: "InterleaveLoGroupedUint64x4",
argLen: 2,
generic: true,
},
{
name: "InterleaveLoGroupedUint64x8",
argLen: 2,
generic: true,
},
{
name: "InterleaveLoInt16x8",
argLen: 2,
generic: true,
},
{
name: "InterleaveLoInt32x4",
argLen: 2,
generic: true,
},
{
name: "InterleaveLoInt64x2",
argLen: 2,
generic: true,
},
{
name: "InterleaveLoUint16x8",
argLen: 2,
generic: true,
},
{
name: "InterleaveLoUint32x4",
argLen: 2,
generic: true,
},
{
name: "InterleaveLoUint64x2",
argLen: 2,
generic: true,
},
{
name: "IsNanFloat32x4",
argLen: 2,

View file

@ -2363,6 +2363,114 @@ func rewriteValueAMD64(v *Value) bool {
case OpInterCall:
v.Op = OpAMD64CALLinter
return true
case OpInterleaveHiGroupedInt16x16:
v.Op = OpAMD64VPUNPCKHWD256
return true
case OpInterleaveHiGroupedInt16x32:
v.Op = OpAMD64VPUNPCKHWD512
return true
case OpInterleaveHiGroupedInt32x16:
v.Op = OpAMD64VPUNPCKHDQ512
return true
case OpInterleaveHiGroupedInt32x8:
v.Op = OpAMD64VPUNPCKHDQ256
return true
case OpInterleaveHiGroupedInt64x4:
v.Op = OpAMD64VPUNPCKHQDQ256
return true
case OpInterleaveHiGroupedInt64x8:
v.Op = OpAMD64VPUNPCKHQDQ512
return true
case OpInterleaveHiGroupedUint16x16:
v.Op = OpAMD64VPUNPCKHWD256
return true
case OpInterleaveHiGroupedUint16x32:
v.Op = OpAMD64VPUNPCKHWD512
return true
case OpInterleaveHiGroupedUint32x16:
v.Op = OpAMD64VPUNPCKHDQ512
return true
case OpInterleaveHiGroupedUint32x8:
v.Op = OpAMD64VPUNPCKHDQ256
return true
case OpInterleaveHiGroupedUint64x4:
v.Op = OpAMD64VPUNPCKHQDQ256
return true
case OpInterleaveHiGroupedUint64x8:
v.Op = OpAMD64VPUNPCKHQDQ512
return true
case OpInterleaveHiInt16x8:
v.Op = OpAMD64VPUNPCKHWD128
return true
case OpInterleaveHiInt32x4:
v.Op = OpAMD64VPUNPCKHDQ128
return true
case OpInterleaveHiInt64x2:
v.Op = OpAMD64VPUNPCKHQDQ128
return true
case OpInterleaveHiUint16x8:
v.Op = OpAMD64VPUNPCKHWD128
return true
case OpInterleaveHiUint32x4:
v.Op = OpAMD64VPUNPCKHDQ128
return true
case OpInterleaveHiUint64x2:
v.Op = OpAMD64VPUNPCKHQDQ128
return true
case OpInterleaveLoGroupedInt16x16:
v.Op = OpAMD64VPUNPCKLWD256
return true
case OpInterleaveLoGroupedInt16x32:
v.Op = OpAMD64VPUNPCKLWD512
return true
case OpInterleaveLoGroupedInt32x16:
v.Op = OpAMD64VPUNPCKLDQ512
return true
case OpInterleaveLoGroupedInt32x8:
v.Op = OpAMD64VPUNPCKLDQ256
return true
case OpInterleaveLoGroupedInt64x4:
v.Op = OpAMD64VPUNPCKLQDQ256
return true
case OpInterleaveLoGroupedInt64x8:
v.Op = OpAMD64VPUNPCKLQDQ512
return true
case OpInterleaveLoGroupedUint16x16:
v.Op = OpAMD64VPUNPCKLWD256
return true
case OpInterleaveLoGroupedUint16x32:
v.Op = OpAMD64VPUNPCKLWD512
return true
case OpInterleaveLoGroupedUint32x16:
v.Op = OpAMD64VPUNPCKLDQ512
return true
case OpInterleaveLoGroupedUint32x8:
v.Op = OpAMD64VPUNPCKLDQ256
return true
case OpInterleaveLoGroupedUint64x4:
v.Op = OpAMD64VPUNPCKLQDQ256
return true
case OpInterleaveLoGroupedUint64x8:
v.Op = OpAMD64VPUNPCKLQDQ512
return true
case OpInterleaveLoInt16x8:
v.Op = OpAMD64VPUNPCKLWD128
return true
case OpInterleaveLoInt32x4:
v.Op = OpAMD64VPUNPCKLDQ128
return true
case OpInterleaveLoInt64x2:
v.Op = OpAMD64VPUNPCKLQDQ128
return true
case OpInterleaveLoUint16x8:
v.Op = OpAMD64VPUNPCKLWD128
return true
case OpInterleaveLoUint32x4:
v.Op = OpAMD64VPUNPCKLDQ128
return true
case OpInterleaveLoUint64x2:
v.Op = OpAMD64VPUNPCKLQDQ128
return true
case OpIsInBounds:
return rewriteValueAMD64_OpIsInBounds(v)
case OpIsNanFloat32x16:

View file

@ -532,6 +532,42 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
addF(simdPackage, "Uint16x32.GreaterEqual", opLen2(ssa.OpGreaterEqualUint16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint32x16.GreaterEqual", opLen2(ssa.OpGreaterEqualUint32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint64x8.GreaterEqual", opLen2(ssa.OpGreaterEqualUint64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int16x8.InterleaveHi", opLen2(ssa.OpInterleaveHiInt16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int32x4.InterleaveHi", opLen2(ssa.OpInterleaveHiInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int64x2.InterleaveHi", opLen2(ssa.OpInterleaveHiInt64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint16x8.InterleaveHi", opLen2(ssa.OpInterleaveHiUint16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint32x4.InterleaveHi", opLen2(ssa.OpInterleaveHiUint32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint64x2.InterleaveHi", opLen2(ssa.OpInterleaveHiUint64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int16x16.InterleaveHiGrouped", opLen2(ssa.OpInterleaveHiGroupedInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x32.InterleaveHiGrouped", opLen2(ssa.OpInterleaveHiGroupedInt16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int32x8.InterleaveHiGrouped", opLen2(ssa.OpInterleaveHiGroupedInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int32x16.InterleaveHiGrouped", opLen2(ssa.OpInterleaveHiGroupedInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int64x4.InterleaveHiGrouped", opLen2(ssa.OpInterleaveHiGroupedInt64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int64x8.InterleaveHiGrouped", opLen2(ssa.OpInterleaveHiGroupedInt64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint16x16.InterleaveHiGrouped", opLen2(ssa.OpInterleaveHiGroupedUint16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint16x32.InterleaveHiGrouped", opLen2(ssa.OpInterleaveHiGroupedUint16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint32x8.InterleaveHiGrouped", opLen2(ssa.OpInterleaveHiGroupedUint32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint32x16.InterleaveHiGrouped", opLen2(ssa.OpInterleaveHiGroupedUint32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint64x4.InterleaveHiGrouped", opLen2(ssa.OpInterleaveHiGroupedUint64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x8.InterleaveHiGrouped", opLen2(ssa.OpInterleaveHiGroupedUint64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int16x8.InterleaveLo", opLen2(ssa.OpInterleaveLoInt16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int32x4.InterleaveLo", opLen2(ssa.OpInterleaveLoInt32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int64x2.InterleaveLo", opLen2(ssa.OpInterleaveLoInt64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint16x8.InterleaveLo", opLen2(ssa.OpInterleaveLoUint16x8, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint32x4.InterleaveLo", opLen2(ssa.OpInterleaveLoUint32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Uint64x2.InterleaveLo", opLen2(ssa.OpInterleaveLoUint64x2, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Int16x16.InterleaveLoGrouped", opLen2(ssa.OpInterleaveLoGroupedInt16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int16x32.InterleaveLoGrouped", opLen2(ssa.OpInterleaveLoGroupedInt16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int32x8.InterleaveLoGrouped", opLen2(ssa.OpInterleaveLoGroupedInt32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int32x16.InterleaveLoGrouped", opLen2(ssa.OpInterleaveLoGroupedInt32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Int64x4.InterleaveLoGrouped", opLen2(ssa.OpInterleaveLoGroupedInt64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Int64x8.InterleaveLoGrouped", opLen2(ssa.OpInterleaveLoGroupedInt64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint16x16.InterleaveLoGrouped", opLen2(ssa.OpInterleaveLoGroupedUint16x16, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint16x32.InterleaveLoGrouped", opLen2(ssa.OpInterleaveLoGroupedUint16x32, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint32x8.InterleaveLoGrouped", opLen2(ssa.OpInterleaveLoGroupedUint32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint32x16.InterleaveLoGrouped", opLen2(ssa.OpInterleaveLoGroupedUint32x16, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Uint64x4.InterleaveLoGrouped", opLen2(ssa.OpInterleaveLoGroupedUint64x4, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Uint64x8.InterleaveLoGrouped", opLen2(ssa.OpInterleaveLoGroupedUint64x8, types.TypeVec512), sys.AMD64)
addF(simdPackage, "Float32x4.IsNan", opLen2(ssa.OpIsNanFloat32x4, types.TypeVec128), sys.AMD64)
addF(simdPackage, "Float32x8.IsNan", opLen2(ssa.OpIsNanFloat32x8, types.TypeVec256), sys.AMD64)
addF(simdPackage, "Float32x16.IsNan", opLen2(ssa.OpIsNanFloat32x16, types.TypeVec512), sys.AMD64)

View file

@ -103,3 +103,20 @@
commutative: false
documentation: !string |- # Detailed documentation will rely on the specific ops.
// NAME performs a grouped permutation of vector x using constant indices:
- go: InterleaveHi
commutative: false
documentation: !string |-
// NAME interleaves the elements of the high halves of x and y.
- go: InterleaveLo
commutative: false
documentation: !string |-
// NAME interleaves the elements of the low halves of x and y.
- go: InterleaveHiGrouped
commutative: false
documentation: !string |-
// NAME interleaves the elements of the high half of each 128-bit subvector of x and y.
- go: InterleaveLoGrouped
commutative: false
documentation: !string |-
// NAME interleaves the elements of the low half of each 128-bit subvector of x and y.

View file

@ -527,3 +527,40 @@
name: indices
out:
- *256Or512any
# InterleaveHi: two 128-bit inputs, one 128-bit output, matched against the
# VPUNPCKH* mnemonics below.
# NOTE(review): the `WB` alternative presumably never matches an instruction;
# the byte-interleave mnemonic is VPUNPCKHBW, and no 8-bit element ops were
# generated alongside this entry. Confirm whether `BW` was intended.
- go: InterleaveHi
asm: VPUNPCKH(QDQ|DQ|WD|WB)
in:
- *128any
- *128any
inVariant: []
out:
- *128any
# InterleaveLo: low-half counterpart of InterleaveHi (VPUNPCKL* mnemonics).
# NOTE(review): same `WB` vs `BW` question as above (VPUNPCKLBW) — confirm.
- go: InterleaveLo
asm: VPUNPCKL(QDQ|DQ|WD|WB)
in:
- *128any
- *128any
inVariant: []
out:
- *128any
# InterleaveHiGrouped: same mnemonics as InterleaveHi, but with 256- or
# 512-bit inputs and output.
# NOTE(review): same `WB` vs `BW` question as above — confirm.
- go: InterleaveHiGrouped
asm: VPUNPCKH(QDQ|DQ|WD|WB)
in:
- *256Or512any
- *256Or512any
inVariant: []
out:
- *256Or512any
# InterleaveLoGrouped: same mnemonics as InterleaveLo, but with 256- or
# 512-bit inputs and output.
# NOTE(review): same `WB` vs `BW` question as above — confirm.
- go: InterleaveLoGrouped
asm: VPUNPCKL(QDQ|DQ|WD|WB)
in:
- *256Or512any
- *256Or512any
inVariant: []
out:
- *256Or512any

View file

@ -494,3 +494,27 @@ func TestMaskOpt512(t *testing.T) {
checkSlices[int64](t, k, []int64{-1, 0, -1, 0, -1, 0, -1, 0})
checkSlices[float64](t, s, []float64{3, 0, 9, 0, 15, 0, 21, 0})
}
// flattenedTranspose transposes x and y, regarded as a pair of 2x2
// matrices, and then flattens the resulting rows in order, i.e.
//
//	x: ABCD ==> a: A1B2
//	y: 1234     b: C3D4
func flattenedTranspose(x, y simd.Int32x4) (a, b simd.Int32x4) {
	a = x.InterleaveLo(y) // low halves of x and y:  A1B2
	b = x.InterleaveHi(y) // high halves of x and y: C3D4
	return a, b
}
// TestFlattenedTranspose feeds two known 4-element vectors through
// flattenedTranspose and checks that the low and high interleavings
// pair up the elements of x and y in the expected order.
func TestFlattenedTranspose(t *testing.T) {
	x := simd.LoadInt32x4Slice([]int32{0xA, 0xB, 0xC, 0xD})
	y := simd.LoadInt32x4Slice([]int32{1, 2, 3, 4})

	lo, hi := flattenedTranspose(x, y)

	// Spill both results to memory so they can be compared element-wise.
	gotLo := make([]int32, 4)
	gotHi := make([]int32, 4)
	lo.StoreSlice(gotLo)
	hi.StoreSlice(gotHi)

	checkSlices[int32](t, gotLo, []int32{0xA, 1, 0xB, 2})
	checkSlices[int32](t, gotHi, []int32{0xC, 3, 0xD, 4})
}

View file

@ -3078,6 +3078,194 @@ func (x Uint32x16) GreaterEqual(y Uint32x16) Mask32x16
// Asm: VPCMPUQ, CPU Feature: AVX512
func (x Uint64x8) GreaterEqual(y Uint64x8) Mask64x8
/* InterleaveHi */
// InterleaveHi interleaves the elements of the high halves of x and y.
//
// Asm: VPUNPCKHWD, CPU Feature: AVX
func (x Int16x8) InterleaveHi(y Int16x8) Int16x8
// InterleaveHi interleaves the elements of the high halves of x and y.
//
// Asm: VPUNPCKHDQ, CPU Feature: AVX
func (x Int32x4) InterleaveHi(y Int32x4) Int32x4
// InterleaveHi interleaves the elements of the high halves of x and y.
// The result alternates elements of x and y, starting with x: for
// x = {A, B, C, D} and y = {1, 2, 3, 4} the result is {C, 3, D, 4}.
//
// Asm: VPUNPCKHDQ, CPU Feature: AVX
func (x Int32x4) InterleaveHi(y Int32x4) Int32x4
// InterleaveHi interleaves the elements of the high halves of x and y.
//
// Asm: VPUNPCKHWD, CPU Feature: AVX
func (x Uint16x8) InterleaveHi(y Uint16x8) Uint16x8
// InterleaveHi interleaves the elements of the high halves of x and y.
//
// Asm: VPUNPCKHDQ, CPU Feature: AVX
func (x Uint32x4) InterleaveHi(y Uint32x4) Uint32x4
// InterleaveHi interleaves the elements of the high halves of x and y.
//
// Asm: VPUNPCKHQDQ, CPU Feature: AVX
func (x Uint64x2) InterleaveHi(y Uint64x2) Uint64x2
/* InterleaveHiGrouped */
// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y.
// Each 128-bit subvector of the result is built only from the matching
// subvectors of x and y, alternating their elements starting with x.
//
// Asm: VPUNPCKHWD, CPU Feature: AVX2
func (x Int16x16) InterleaveHiGrouped(y Int16x16) Int16x16
// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y.
// Each 128-bit subvector of the result is built only from the matching
// subvectors of x and y, alternating their elements starting with x.
//
// Asm: VPUNPCKHWD, CPU Feature: AVX512
func (x Int16x32) InterleaveHiGrouped(y Int16x32) Int16x32
// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y.
// Each 128-bit subvector of the result is built only from the matching
// subvectors of x and y, alternating their elements starting with x.
//
// Asm: VPUNPCKHDQ, CPU Feature: AVX2
func (x Int32x8) InterleaveHiGrouped(y Int32x8) Int32x8
// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y.
// Each 128-bit subvector of the result is built only from the matching
// subvectors of x and y, alternating their elements starting with x.
//
// Asm: VPUNPCKHDQ, CPU Feature: AVX512
func (x Int32x16) InterleaveHiGrouped(y Int32x16) Int32x16
// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y.
// Each 128-bit subvector of the result is built only from the matching
// subvectors of x and y, alternating their elements starting with x.
//
// Asm: VPUNPCKHQDQ, CPU Feature: AVX2
func (x Int64x4) InterleaveHiGrouped(y Int64x4) Int64x4
// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y.
// Each 128-bit subvector of the result is built only from the matching
// subvectors of x and y, alternating their elements starting with x.
//
// Asm: VPUNPCKHQDQ, CPU Feature: AVX512
func (x Int64x8) InterleaveHiGrouped(y Int64x8) Int64x8
// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y.
// Each 128-bit subvector of the result is built only from the matching
// subvectors of x and y, alternating their elements starting with x.
//
// Asm: VPUNPCKHWD, CPU Feature: AVX2
func (x Uint16x16) InterleaveHiGrouped(y Uint16x16) Uint16x16
// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y.
// Each 128-bit subvector of the result is built only from the matching
// subvectors of x and y, alternating their elements starting with x.
//
// Asm: VPUNPCKHWD, CPU Feature: AVX512
func (x Uint16x32) InterleaveHiGrouped(y Uint16x32) Uint16x32
// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y.
// Each 128-bit subvector of the result is built only from the matching
// subvectors of x and y, alternating their elements starting with x.
//
// Asm: VPUNPCKHDQ, CPU Feature: AVX2
func (x Uint32x8) InterleaveHiGrouped(y Uint32x8) Uint32x8
// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y.
// Each 128-bit subvector of the result is built only from the matching
// subvectors of x and y, alternating their elements starting with x.
//
// Asm: VPUNPCKHDQ, CPU Feature: AVX512
func (x Uint32x16) InterleaveHiGrouped(y Uint32x16) Uint32x16
// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y.
// Each 128-bit subvector of the result is built only from the matching
// subvectors of x and y, alternating their elements starting with x.
//
// Asm: VPUNPCKHQDQ, CPU Feature: AVX2
func (x Uint64x4) InterleaveHiGrouped(y Uint64x4) Uint64x4
// InterleaveHiGrouped interleaves the elements of the high half of each 128-bit subvector of x and y.
// Each 128-bit subvector of the result is built only from the matching
// subvectors of x and y, alternating their elements starting with x.
//
// Asm: VPUNPCKHQDQ, CPU Feature: AVX512
func (x Uint64x8) InterleaveHiGrouped(y Uint64x8) Uint64x8
/* InterleaveLo */
// InterleaveLo interleaves the elements of the low halves of x and y.
// Elements of x and y alternate, starting with x: numbering elements
// from 0, the result is {x0, y0, x1, y1, x2, y2, x3, y3}.
//
// Asm: VPUNPCKLWD, CPU Feature: AVX
func (x Int16x8) InterleaveLo(y Int16x8) Int16x8
// InterleaveLo interleaves the elements of the low halves of x and y.
// Elements of x and y alternate, starting with x: numbering elements
// from 0, the result is {x0, y0, x1, y1}.
//
// Asm: VPUNPCKLDQ, CPU Feature: AVX
func (x Int32x4) InterleaveLo(y Int32x4) Int32x4
// InterleaveLo interleaves the elements of the low halves of x and y.
// Numbering elements from 0, the result is {x0, y0}.
//
// Asm: VPUNPCKLQDQ, CPU Feature: AVX
func (x Int64x2) InterleaveLo(y Int64x2) Int64x2
// InterleaveLo interleaves the elements of the low halves of x and y.
// Elements of x and y alternate, starting with x: numbering elements
// from 0, the result is {x0, y0, x1, y1, x2, y2, x3, y3}.
//
// Asm: VPUNPCKLWD, CPU Feature: AVX
func (x Uint16x8) InterleaveLo(y Uint16x8) Uint16x8
// InterleaveLo interleaves the elements of the low halves of x and y.
// Elements of x and y alternate, starting with x: numbering elements
// from 0, the result is {x0, y0, x1, y1}.
//
// Asm: VPUNPCKLDQ, CPU Feature: AVX
func (x Uint32x4) InterleaveLo(y Uint32x4) Uint32x4
// InterleaveLo interleaves the elements of the low halves of x and y.
// Numbering elements from 0, the result is {x0, y0}.
//
// Asm: VPUNPCKLQDQ, CPU Feature: AVX
func (x Uint64x2) InterleaveLo(y Uint64x2) Uint64x2
/* InterleaveLoGrouped */
// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
// Each 128-bit subvector of the result is built only from the matching
// subvectors of x and y, alternating their elements starting with x.
//
// Asm: VPUNPCKLWD, CPU Feature: AVX2
func (x Int16x16) InterleaveLoGrouped(y Int16x16) Int16x16
// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
// Each 128-bit subvector of the result is built only from the matching
// subvectors of x and y, alternating their elements starting with x.
//
// Asm: VPUNPCKLWD, CPU Feature: AVX512
func (x Int16x32) InterleaveLoGrouped(y Int16x32) Int16x32
// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
// Each 128-bit subvector of the result is built only from the matching
// subvectors of x and y, alternating their elements starting with x.
//
// Asm: VPUNPCKLDQ, CPU Feature: AVX2
func (x Int32x8) InterleaveLoGrouped(y Int32x8) Int32x8
// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
// Each 128-bit subvector of the result is built only from the matching
// subvectors of x and y, alternating their elements starting with x.
//
// Asm: VPUNPCKLDQ, CPU Feature: AVX512
func (x Int32x16) InterleaveLoGrouped(y Int32x16) Int32x16
// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
// Each 128-bit subvector of the result is built only from the matching
// subvectors of x and y, alternating their elements starting with x.
//
// Asm: VPUNPCKLQDQ, CPU Feature: AVX2
func (x Int64x4) InterleaveLoGrouped(y Int64x4) Int64x4
// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
// Each 128-bit subvector of the result is built only from the matching
// subvectors of x and y, alternating their elements starting with x.
//
// Asm: VPUNPCKLQDQ, CPU Feature: AVX512
func (x Int64x8) InterleaveLoGrouped(y Int64x8) Int64x8
// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
// Each 128-bit subvector of the result is built only from the matching
// subvectors of x and y, alternating their elements starting with x.
//
// Asm: VPUNPCKLWD, CPU Feature: AVX2
func (x Uint16x16) InterleaveLoGrouped(y Uint16x16) Uint16x16
// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
// Each 128-bit subvector of the result is built only from the matching
// subvectors of x and y, alternating their elements starting with x.
//
// Asm: VPUNPCKLWD, CPU Feature: AVX512
func (x Uint16x32) InterleaveLoGrouped(y Uint16x32) Uint16x32
// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
// Each 128-bit subvector of the result is built only from the matching
// subvectors of x and y, alternating their elements starting with x.
//
// Asm: VPUNPCKLDQ, CPU Feature: AVX2
func (x Uint32x8) InterleaveLoGrouped(y Uint32x8) Uint32x8
// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
// Each 128-bit subvector of the result is built only from the matching
// subvectors of x and y, alternating their elements starting with x.
//
// Asm: VPUNPCKLDQ, CPU Feature: AVX512
func (x Uint32x16) InterleaveLoGrouped(y Uint32x16) Uint32x16
// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
// Each 128-bit subvector of the result is built only from the matching
// subvectors of x and y, alternating their elements starting with x.
//
// Asm: VPUNPCKLQDQ, CPU Feature: AVX2
func (x Uint64x4) InterleaveLoGrouped(y Uint64x4) Uint64x4
// InterleaveLoGrouped interleaves the elements of the low half of each 128-bit subvector of x and y.
// Each 128-bit subvector of the result is built only from the matching
// subvectors of x and y, alternating their elements starting with x.
//
// Asm: VPUNPCKLQDQ, CPU Feature: AVX512
func (x Uint64x8) InterleaveLoGrouped(y Uint64x8) Uint64x8
/* IsNan */
// IsNan checks if elements are NaN. Use as x.IsNan(x).

View file

@ -0,0 +1,15 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build goexperiment.simd && amd64
package simd
// FlattenedTranspose transposes x and y, regarded as a pair of 2x2
// matrices, and returns each transposed matrix with its rows flattened
// in order. The low halves of x and y are the rows of the first matrix
// and the high halves are the rows of the second, i.e.
//
//	x: ABCD ==> a: A1B2
//	y: 1234     b: C3D4
func (x Int32x4) FlattenedTranspose(y Int32x4) (a, b Int32x4) {
	a = x.InterleaveLo(y) // low halves:  A1B2
	b = x.InterleaveHi(y) // high halves: C3D4
	return a, b
}