From 25c36b95d1523f22d4c46ec237acc03e00540e0a Mon Sep 17 00:00:00 2001 From: David Chase Date: Fri, 19 Sep 2025 13:07:59 -0400 Subject: [PATCH] [dev.simd] simd, cmd/compile: add 128 bit select-from-pair Using this name until a better one appears: x.Select128FromPair(3, 2, y) Includes test for constant and variable case. Checks for unexpected immediates (using the zeroing flag, which is not supported for this intrinsic) and panics. Change-Id: I9249475d6572968c127b4ee9e00328d717c07578 Reviewed-on: https://go-review.googlesource.com/c/go/+/705496 Reviewed-by: Junyang Shao LUCI-TryBot-Result: Go LUCI --- src/cmd/compile/internal/amd64/simdssa.go | 2 + src/cmd/compile/internal/ir/symtab.go | 1 + .../compile/internal/ssa/_gen/simdAMD64.rules | 6 ++ .../compile/internal/ssa/_gen/simdAMD64ops.go | 2 + .../internal/ssa/_gen/simdgenericOps.go | 6 ++ src/cmd/compile/internal/ssa/opGen.go | 74 +++++++++++++++++++ src/cmd/compile/internal/ssa/rewriteAMD64.go | 18 +++++ src/cmd/compile/internal/ssagen/intrinsics.go | 26 ++++++- .../compile/internal/ssagen/simdintrinsics.go | 6 ++ src/cmd/compile/internal/ssagen/ssa.go | 1 + src/runtime/panic.go | 7 ++ src/simd/_gen/simdgen/gen_simdIntrinsics.go | 2 + src/simd/_gen/simdgen/gen_simdTypes.go | 9 +++ .../_gen/simdgen/ops/Moves/categories.yaml | 8 +- src/simd/_gen/simdgen/ops/Moves/go.yaml | 72 +++++++++++++++++- src/simd/_gen/unify/domain.go | 4 +- src/simd/internal/simd_test/simd_test.go | 74 +++++++++++++++++++ src/simd/ops_amd64.go | 56 ++++++++++++++ 18 files changed, 369 insertions(+), 5 deletions(-) diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go index d69740cd96d..a4d24524357 100644 --- a/src/cmd/compile/internal/amd64/simdssa.go +++ b/src/cmd/compile/internal/amd64/simdssa.go @@ -1053,6 +1053,8 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool { ssa.OpAMD64VGF2P8AFFINEINVQB128, ssa.OpAMD64VGF2P8AFFINEINVQB256, ssa.OpAMD64VGF2P8AFFINEINVQB512, + 
ssa.OpAMD64VPERM2F128256, + ssa.OpAMD64VPERM2I128256, ssa.OpAMD64VINSERTF128256, ssa.OpAMD64VINSERTF64X4512, ssa.OpAMD64VINSERTI128256, diff --git a/src/cmd/compile/internal/ir/symtab.go b/src/cmd/compile/internal/ir/symtab.go index 2222a5444aa..0cfa2a2262f 100644 --- a/src/cmd/compile/internal/ir/symtab.go +++ b/src/cmd/compile/internal/ir/symtab.go @@ -45,6 +45,7 @@ type symsStruct struct { PanicdottypeI *obj.LSym Panicnildottype *obj.LSym Panicoverflow *obj.LSym + PanicSimdImm *obj.LSym Racefuncenter *obj.LSym Racefuncexit *obj.LSym Raceread *obj.LSym diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules index 9db223c04f4..1eab8b5e6d6 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules @@ -938,6 +938,12 @@ (ScaleFloat64x2 ...) => (VSCALEFPD128 ...) (ScaleFloat64x4 ...) => (VSCALEFPD256 ...) (ScaleFloat64x8 ...) => (VSCALEFPD512 ...) +(Select128FromPairFloat32x8 ...) => (VPERM2F128256 ...) +(Select128FromPairFloat64x4 ...) => (VPERM2F128256 ...) +(Select128FromPairInt32x8 ...) => (VPERM2I128256 ...) +(Select128FromPairInt64x4 ...) => (VPERM2I128256 ...) +(Select128FromPairUint32x8 ...) => (VPERM2I128256 ...) +(Select128FromPairUint64x4 ...) => (VPERM2I128256 ...) (SetElemFloat32x4 ...) => (VPINSRD128 ...) (SetElemFloat64x2 ...) => (VPINSRQ128 ...) (SetElemInt8x16 ...) => (VPINSRB128 ...) 
diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go index ba91fb3fc95..5e1da3249fe 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go @@ -1212,6 +1212,8 @@ func simdAMD64Ops(v11, v21, v2k, vkv, v2kv, v2kk, v31, v3kv, vgpv, vgp, vfpv, vf {name: "VPRORQMasked128", argLength: 2, reg: wkw, asm: "VPRORQ", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPRORQMasked256", argLength: 2, reg: wkw, asm: "VPRORQ", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPRORQMasked512", argLength: 2, reg: wkw, asm: "VPRORQ", aux: "UInt8", commutative: false, typ: "Vec512", resultInArg0: false}, + {name: "VPERM2F128256", argLength: 2, reg: v21, asm: "VPERM2F128", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false}, + {name: "VPERM2I128256", argLength: 2, reg: v21, asm: "VPERM2I128", aux: "UInt8", commutative: false, typ: "Vec256", resultInArg0: false}, {name: "VPINSRD128", argLength: 2, reg: vgpv, asm: "VPINSRD", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPINSRQ128", argLength: 2, reg: vgpv, asm: "VPINSRQ", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, {name: "VPINSRB128", argLength: 2, reg: vgpv, asm: "VPINSRB", aux: "UInt8", commutative: false, typ: "Vec128", resultInArg0: false}, diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go index 81a1dff1378..aa088dbf0bf 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go +++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go @@ -1199,6 +1199,12 @@ func simdGenericOps() []opData { {name: "RoundToEvenScaledResidueFloat64x2", argLength: 1, commutative: false, aux: "UInt8"}, {name: "RoundToEvenScaledResidueFloat64x4", argLength: 1, commutative: false, aux: "UInt8"}, {name: 
"RoundToEvenScaledResidueFloat64x8", argLength: 1, commutative: false, aux: "UInt8"}, + {name: "Select128FromPairFloat32x8", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "Select128FromPairFloat64x4", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "Select128FromPairInt32x8", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "Select128FromPairInt64x4", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "Select128FromPairUint32x8", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "Select128FromPairUint64x4", argLength: 2, commutative: false, aux: "UInt8"}, {name: "SetElemFloat32x4", argLength: 2, commutative: false, aux: "UInt8"}, {name: "SetElemFloat64x2", argLength: 2, commutative: false, aux: "UInt8"}, {name: "SetElemInt8x16", argLength: 2, commutative: false, aux: "UInt8"}, diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index 792a1ca08f1..105d1a803c6 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -2444,6 +2444,8 @@ const ( OpAMD64VPRORQMasked128 OpAMD64VPRORQMasked256 OpAMD64VPRORQMasked512 + OpAMD64VPERM2F128256 + OpAMD64VPERM2I128256 OpAMD64VPINSRD128 OpAMD64VPINSRQ128 OpAMD64VPINSRB128 @@ -6594,6 +6596,12 @@ const ( OpRoundToEvenScaledResidueFloat64x2 OpRoundToEvenScaledResidueFloat64x4 OpRoundToEvenScaledResidueFloat64x8 + OpSelect128FromPairFloat32x8 + OpSelect128FromPairFloat64x4 + OpSelect128FromPairInt32x8 + OpSelect128FromPairInt64x4 + OpSelect128FromPairUint32x8 + OpSelect128FromPairUint64x4 OpSetElemFloat32x4 OpSetElemFloat64x2 OpSetElemInt8x16 @@ -37656,6 +37664,36 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VPERM2F128256", + auxType: auxUInt8, + argLen: 2, + asm: x86.AVPERM2F128, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + {1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + }, + outputs: []outputInfo{ + 
{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, + { + name: "VPERM2I128256", + auxType: auxUInt8, + argLen: 2, + asm: x86.AVPERM2I128, + reg: regInfo{ + inputs: []inputInfo{ + {0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + {1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + }, + }, { name: "VPINSRD128", auxType: auxUInt8, @@ -82360,6 +82398,42 @@ var opcodeTable = [...]opInfo{ argLen: 1, generic: true, }, + { + name: "Select128FromPairFloat32x8", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, + { + name: "Select128FromPairFloat64x4", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, + { + name: "Select128FromPairInt32x8", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, + { + name: "Select128FromPairInt64x4", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, + { + name: "Select128FromPairUint32x8", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, + { + name: "Select128FromPairUint64x4", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, { name: "SetElemFloat32x4", auxType: auxUInt8, diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index ca9f9ae17be..bc611fc44c4 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -4991,6 +4991,24 @@ func rewriteValueAMD64(v *Value) bool { return rewriteValueAMD64_OpSelect0(v) case OpSelect1: return rewriteValueAMD64_OpSelect1(v) + case OpSelect128FromPairFloat32x8: + v.Op = OpAMD64VPERM2F128256 + return true + case OpSelect128FromPairFloat64x4: + v.Op = OpAMD64VPERM2F128256 + return true + case OpSelect128FromPairInt32x8: + v.Op = OpAMD64VPERM2I128256 + return true + case OpSelect128FromPairInt64x4: + v.Op = OpAMD64VPERM2I128256 + return true + case OpSelect128FromPairUint32x8: + v.Op = 
OpAMD64VPERM2I128256 + return true + case OpSelect128FromPairUint64x4: + v.Op = OpAMD64VPERM2I128256 + return true case OpSelectN: return rewriteValueAMD64_OpSelectN(v) case OpSetElemFloat32x4: diff --git a/src/cmd/compile/internal/ssagen/intrinsics.go b/src/cmd/compile/internal/ssagen/intrinsics.go index 985d899a71e..4c5cd9ef2cf 100644 --- a/src/cmd/compile/internal/ssagen/intrinsics.go +++ b/src/cmd/compile/internal/ssagen/intrinsics.go @@ -1842,7 +1842,9 @@ func immJumpTable(s *state, idx *ssa.Value, intrinsicCall *ir.CallExpr, genOp fu for i, t := range targets { s.startBlock(t) genOp(s, i) - t.AddEdgeTo(bEnd) + if t.Kind != ssa.BlockExit { + t.AddEdgeTo(bEnd) + } s.endBlock() } @@ -1899,6 +1901,41 @@ func opLen2Imm8_2I(op ssa.Op, t *types.Type, offset int) func(s *state, n *ir.Ca } } +// opLen2Imm8_II handles operations taking two 2-bit immediates (args[1] is lo, +// args[2] is hi) instead of just 1, packed into one immediate as lo + hi<<4. +// The vector operands are args[0] and args[3]. +// Offset is ignored, so it is a _ parameter instead. +func opLen2Imm8_II(op ssa.Op, t *types.Type, _ int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + if args[1].Op == ssa.OpConst8 && args[2].Op == ssa.OpConst8 && args[1].AuxInt & ^3 == 0 && args[2].AuxInt & ^3 == 0 { + i1, i2 := args[1].AuxInt, args[2].AuxInt + return s.newValue2I(op, t, i1+i2<<4, args[0], args[3]) + } + u8 := types.Types[types.TUINT8] + four := s.constInt64(u8, 4) + shifted := s.newValue2(ssa.OpLsh8x8, u8, args[2], four) + combined := s.newValue2(ssa.OpAdd8, u8, args[1], shifted) + // lo + hi<<4 wraps at 8 bits, so an out-of-range operand could alias a + // valid index: lo=16, hi=0 packs to the same byte as lo=0, hi=1, and + // hi=16 shifts to zero. If either operand is 4 or more, OR in 0xff so + // that the index always selects a panicking jump table entry. + either := s.newValue2(ssa.OpOr8, u8, args[1], args[2]) + tooBig := s.newValue2(ssa.OpLeq8U, types.Types[types.TBOOL], s.constInt8(u8, 4), either) + allOnes := s.newValue1(ssa.OpNeg8, u8, s.newValue1(ssa.OpCvtBoolToUint8, u8, tooBig)) + combined = s.newValue2(ssa.OpOr8, u8, combined, allOnes) + return immJumpTable(s, combined, n, func(sNew *state, idx int) { + // Encode as int8 due to requirement of AuxInt, check its comment for details. + // Indices with bits outside lo (0-1) and hi (4-5), including the + // "zeroing" bits, panic instead. 
+ if idx & ^(3+3<<4) == 0 { + s.vars[n] = sNew.newValue2I(op, t, int64(int8(idx)), args[0], args[3]) + } else { + sNew.rtcall(ir.Syms.PanicSimdImm, false, nil) + } + }) + } +} + func opLen3Imm8_2I(op ssa.Op, t *types.Type, offset int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { if args[2].Op == ssa.OpConst8 { diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go index 41858a77454..a62b3882c38 100644 --- a/src/cmd/compile/internal/ssagen/simdintrinsics.go +++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go @@ -950,6 +950,12 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Float64x2.Scale", opLen2(ssa.OpScaleFloat64x2, types.TypeVec128), sys.AMD64) addF(simdPackage, "Float64x4.Scale", opLen2(ssa.OpScaleFloat64x4, types.TypeVec256), sys.AMD64) addF(simdPackage, "Float64x8.Scale", opLen2(ssa.OpScaleFloat64x8, types.TypeVec512), sys.AMD64) + addF(simdPackage, "Float32x8.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairFloat32x8, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Float64x4.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairFloat64x4, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Int32x8.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairInt32x8, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Int64x4.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairInt64x4, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Uint32x8.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairUint32x8, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Uint64x4.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairUint64x4, types.TypeVec256, 0), sys.AMD64) addF(simdPackage, "Float32x4.SetElem", opLen2Imm8(ssa.OpSetElemFloat32x4, types.TypeVec128, 0), sys.AMD64) addF(simdPackage, "Float64x2.SetElem", 
opLen2Imm8(ssa.OpSetElemFloat64x2, types.TypeVec128, 0), sys.AMD64) addF(simdPackage, "Int8x16.SetElem", opLen2Imm8(ssa.OpSetElemInt8x16, types.TypeVec128, 0), sys.AMD64) diff --git a/src/cmd/compile/internal/ssagen/ssa.go b/src/cmd/compile/internal/ssagen/ssa.go index 57129817f6c..37aad360f2a 100644 --- a/src/cmd/compile/internal/ssagen/ssa.go +++ b/src/cmd/compile/internal/ssagen/ssa.go @@ -141,6 +141,7 @@ func InitConfig() { ir.Syms.Panicnildottype = typecheck.LookupRuntimeFunc("panicnildottype") ir.Syms.Panicoverflow = typecheck.LookupRuntimeFunc("panicoverflow") ir.Syms.Panicshift = typecheck.LookupRuntimeFunc("panicshift") + ir.Syms.PanicSimdImm = typecheck.LookupRuntimeFunc("panicSimdImm") ir.Syms.Racefuncenter = typecheck.LookupRuntimeFunc("racefuncenter") ir.Syms.Racefuncexit = typecheck.LookupRuntimeFunc("racefuncexit") ir.Syms.Raceread = typecheck.LookupRuntimeFunc("raceread") diff --git a/src/runtime/panic.go b/src/runtime/panic.go index 8c91c9435ab..d7bce70fe5a 100644 --- a/src/runtime/panic.go +++ b/src/runtime/panic.go @@ -341,6 +341,13 @@ func panicmemAddr(addr uintptr) { panic(errorAddressString{msg: "invalid memory address or nil pointer dereference", addr: addr}) } +var simdImmError = error(errorString("out-of-range immediate for simd intrinsic")) + +func panicSimdImm() { + panicCheck2("simd immediate error") + panic(simdImmError) +} + // Create a new deferred function fn, which has no arguments and results. // The compiler turns a defer statement into a call to this. func deferproc(fn func()) { diff --git a/src/simd/_gen/simdgen/gen_simdIntrinsics.go b/src/simd/_gen/simdgen/gen_simdIntrinsics.go index 353bc46b317..4b27f7ce5f7 100644 --- a/src/simd/_gen/simdgen/gen_simdIntrinsics.go +++ b/src/simd/_gen/simdgen/gen_simdIntrinsics.go @@ -56,6 +56,8 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . 
{{end}} {{define "op2Imm8_2I"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen2Imm8_2I(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64) {{end}} +{{define "op2Imm8_II"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen2Imm8_II(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64) +{{end}} {{define "op3Imm8"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen3Imm8(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64) {{end}} {{define "op3Imm8_2I"}} addF(simdPackage, "{{(index .In 1).Go}}.{{.Go}}", opLen3Imm8_2I(ssa.Op{{.GenericName}}, {{.SSAType}}, {{(index .In 0).ImmOffset}}), sys.AMD64) diff --git a/src/simd/_gen/simdgen/gen_simdTypes.go b/src/simd/_gen/simdgen/gen_simdTypes.go index 0d5d08b7edd..8944c35cad7 100644 --- a/src/simd/_gen/simdgen/gen_simdTypes.go +++ b/src/simd/_gen/simdgen/gen_simdTypes.go @@ -354,6 +354,15 @@ func ({{.Op1NameAndType "x"}}) {{.Go}}({{.ImmName}} uint8, {{.Op2NameAndType "y" func ({{.Op1NameAndType "x"}}) {{.Go}}({{.Op2NameAndType "y"}}, {{.ImmName}} uint8) {{.GoType}} {{end}} +{{define "op2Imm8_II"}} +{{if .Documentation}}{{.Documentation}} +//{{end}} +// {{.ImmName}} result in better performance when they are constants, non-constant values will be translated into a jump table. +// {{.ImmName}} should be between 0 and 3, inclusive; other values will result in a runtime panic. 
+// +// Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}} +func ({{.Op1NameAndType "x"}}) {{.Go}}({{.ImmName}} uint8, {{.Op2NameAndType "y"}}) {{.GoType}} +{{end}} {{define "op3Imm8"}} {{if .Documentation}}{{.Documentation}} diff --git a/src/simd/_gen/simdgen/ops/Moves/categories.yaml b/src/simd/_gen/simdgen/ops/Moves/categories.yaml index e9a7fef2023..0c733e12ee1 100644 --- a/src/simd/_gen/simdgen/ops/Moves/categories.yaml +++ b/src/simd/_gen/simdgen/ops/Moves/categories.yaml @@ -174,4 +174,10 @@ // then 1, selecting element 1 from x's upper 128 bits (9), then 1, // selecting element 1 from y's upper 128 bits (11). // This differs from the same method applied to a 32x8 vector, where - // the 8-bit constant performs the same selection on both subvectors. \ No newline at end of file + // the 8-bit constant performs the same selection on both subvectors. + +- go: Select128FromPair + commutative: false + documentation: !string |- + // NAME selects the low and high 128-bit halves from the 128-bit halves + // of its two 256-bit inputs, numbering those halves 0, 1, 2, 3. 
diff --git a/src/simd/_gen/simdgen/ops/Moves/go.yaml b/src/simd/_gen/simdgen/ops/Moves/go.yaml index 46599b7bd7e..495b9ed6fa1 100644 --- a/src/simd/_gen/simdgen/ops/Moves/go.yaml +++ b/src/simd/_gen/simdgen/ops/Moves/go.yaml @@ -721,7 +721,6 @@ out: - *v - - go: concatSelectedConstantGrouped asm: VSHUFPD in: @@ -771,3 +770,74 @@ inVariant: [] out: - *v + +- go: Select128FromPair + asm: VPERM2F128 + operandOrder: II + in: + - &v + go: $t + class: vreg + base: float + bits: 256 + - *v + - class: immediate + immOffset: 0 + name: "lo, hi" + inVariant: [] + out: + - *v + +- go: Select128FromPair + asm: VPERM2F128 + operandOrder: II + in: + - &v + go: $t + class: vreg + base: float + bits: 256 + OverwriteElementBits: 32 + - *v + - class: immediate + immOffset: 0 + name: "lo, hi" + inVariant: [] + out: + - *v + +- go: Select128FromPair + asm: VPERM2I128 + operandOrder: II + in: + - &v + go: $t + class: vreg + base: int|uint + bits: 256 + OverwriteElementBits: 64 + - *v + - class: immediate + immOffset: 0 + name: "lo, hi" + inVariant: [] + out: + - *v + +- go: Select128FromPair + asm: VPERM2I128 + operandOrder: II + in: + - &v + go: $t + class: vreg + base: int|uint + bits: 256 + OverwriteElementBits: 32 + - *v + - class: immediate + immOffset: 0 + name: "lo, hi" + inVariant: [] + out: + - *v diff --git a/src/simd/_gen/unify/domain.go b/src/simd/_gen/unify/domain.go index 1e0f2be63d7..8eb5deab2ba 100644 --- a/src/simd/_gen/unify/domain.go +++ b/src/simd/_gen/unify/domain.go @@ -106,8 +106,8 @@ func (b *DefBuilder) Add(name string, v *Value) { if b.fields == nil { b.fields = make(map[string]*Value) } - if _, ok := b.fields[name]; ok { - panic(fmt.Sprintf("duplicate field %q", name)) + if old, ok := b.fields[name]; ok { + panic(fmt.Sprintf("duplicate field %q, added value is %v, old value is %v", name, v, old)) } b.fields[name] = v } diff --git a/src/simd/internal/simd_test/simd_test.go b/src/simd/internal/simd_test/simd_test.go index 6deadde45e6..e38f7eea01c 100644 --- 
a/src/simd/internal/simd_test/simd_test.go +++ b/src/simd/internal/simd_test/simd_test.go @@ -815,3 +815,77 @@ func TestSelectFromPairConstGroupedUint32x16(t *testing.T) { foo(lhhl, 0, 4, 5, 1) foo(hllh, 4, 0, 1, 5) } + +func TestSelect128FromPair(t *testing.T) { + x := simd.LoadUint64x4Slice([]uint64{0, 1, 2, 3}) + y := simd.LoadUint64x4Slice([]uint64{4, 5, 6, 7}) + + aa := x.Select128FromPair(0, 0, y) + ab := x.Select128FromPair(0, 1, y) + bc := x.Select128FromPair(1, 2, y) + cd := x.Select128FromPair(2, 3, y) + da := x.Select128FromPair(3, 0, y) + dc := x.Select128FromPair(3, 2, y) + + r := make([]uint64, 4, 4) + + foo := func(v simd.Uint64x4, a, b uint64) { + a, b = 2*a, 2*b + v.StoreSlice(r) + checkSlices[uint64](t, r, []uint64{a, a + 1, b, b + 1}) + } + + foo(aa, 0, 0) + foo(ab, 0, 1) + foo(bc, 1, 2) + foo(cd, 2, 3) + foo(da, 3, 0) + foo(dc, 3, 2) +} + +func TestSelect128FromPairError(t *testing.T) { + x := simd.LoadUint64x4Slice([]uint64{0, 1, 2, 3}) + y := simd.LoadUint64x4Slice([]uint64{4, 5, 6, 7}) + + defer func() { + if r := recover(); r != nil { + t.Logf("Saw expected panic %v", r) + } + }() + _ = x.Select128FromPair(0, 4, y) + + t.Errorf("Should have panicked") +} + +//go:noinline +func select128FromPair(x simd.Uint64x4, lo, hi uint8, y simd.Uint64x4) simd.Uint64x4 { + return x.Select128FromPair(lo, hi, y) +} + +func TestSelect128FromPairVar(t *testing.T) { + x := simd.LoadUint64x4Slice([]uint64{0, 1, 2, 3}) + y := simd.LoadUint64x4Slice([]uint64{4, 5, 6, 7}) + + aa := select128FromPair(x, 0, 0, y) + ab := select128FromPair(x, 0, 1, y) + bc := select128FromPair(x, 1, 2, y) + cd := select128FromPair(x, 2, 3, y) + da := select128FromPair(x, 3, 0, y) + dc := select128FromPair(x, 3, 2, y) + + r := make([]uint64, 4, 4) + + foo := func(v simd.Uint64x4, a, b uint64) { + a, b = 2*a, 2*b + v.StoreSlice(r) + checkSlices[uint64](t, r, []uint64{a, a + 1, b, b + 1}) + } + + foo(aa, 0, 0) + foo(ab, 0, 1) + foo(bc, 1, 2) + foo(cd, 2, 3) + foo(da, 3, 0) + foo(dc, 3, 
2) + +} diff --git a/src/simd/ops_amd64.go b/src/simd/ops_amd64.go index a104601ed75..91e7d91842a 100644 --- a/src/simd/ops_amd64.go +++ b/src/simd/ops_amd64.go @@ -5576,6 +5576,62 @@ func (x Float64x4) Scale(y Float64x4) Float64x4 // Asm: VSCALEFPD, CPU Feature: AVX512 func (x Float64x8) Scale(y Float64x8) Float64x8 +/* Select128FromPair */ + +// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves +// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3. +// +// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table. +// lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic. +// +// Asm: VPERM2F128, CPU Feature: AVX +func (x Float32x8) Select128FromPair(lo, hi uint8, y Float32x8) Float32x8 + +// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves +// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3. +// +// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table. +// lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic. +// +// Asm: VPERM2F128, CPU Feature: AVX +func (x Float64x4) Select128FromPair(lo, hi uint8, y Float64x4) Float64x4 + +// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves +// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3. +// +// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table. +// lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic. +// +// Asm: VPERM2I128, CPU Feature: AVX2 +func (x Int32x8) Select128FromPair(lo, hi uint8, y Int32x8) Int32x8 + +// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves +// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3. 
+// +// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table. +// lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic. +// +// Asm: VPERM2I128, CPU Feature: AVX2 +func (x Int64x4) Select128FromPair(lo, hi uint8, y Int64x4) Int64x4 + +// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves +// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3. +// +// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table. +// lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic. +// +// Asm: VPERM2I128, CPU Feature: AVX2 +func (x Uint32x8) Select128FromPair(lo, hi uint8, y Uint32x8) Uint32x8 + +// Select128FromPair selects the low and high 128-bit halves from the 128-bit halves +// of its two 256-bit inputs, numbering those halves 0, 1, 2, 3. +// +// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table. +// lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic. +// +// Asm: VPERM2I128, CPU Feature: AVX2 +func (x Uint64x4) Select128FromPair(lo, hi uint8, y Uint64x4) Uint64x4 + /* SetElem */ // SetElem sets a single constant-indexed element's value.