diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules index 283a2e53cd9..db426f6615f 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules +++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules @@ -941,8 +941,12 @@ (ScaleFloat64x8 ...) => (VSCALEFPD512 ...) (Select128FromPairFloat32x8 ...) => (VPERM2F128256 ...) (Select128FromPairFloat64x4 ...) => (VPERM2F128256 ...) +(Select128FromPairInt8x32 ...) => (VPERM2I128256 ...) +(Select128FromPairInt16x16 ...) => (VPERM2I128256 ...) (Select128FromPairInt32x8 ...) => (VPERM2I128256 ...) (Select128FromPairInt64x4 ...) => (VPERM2I128256 ...) +(Select128FromPairUint8x32 ...) => (VPERM2I128256 ...) +(Select128FromPairUint16x16 ...) => (VPERM2I128256 ...) (Select128FromPairUint32x8 ...) => (VPERM2I128256 ...) (Select128FromPairUint64x4 ...) => (VPERM2I128256 ...) (SetElemFloat32x4 ...) => (VPINSRD128 ...) diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go index 3fae158c0ae..5683fcef0df 100644 --- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go +++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go @@ -1192,8 +1192,12 @@ func simdGenericOps() []opData { {name: "SHA1FourRoundsUint32x4", argLength: 2, commutative: false, aux: "UInt8"}, {name: "Select128FromPairFloat32x8", argLength: 2, commutative: false, aux: "UInt8"}, {name: "Select128FromPairFloat64x4", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "Select128FromPairInt8x32", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "Select128FromPairInt16x16", argLength: 2, commutative: false, aux: "UInt8"}, {name: "Select128FromPairInt32x8", argLength: 2, commutative: false, aux: "UInt8"}, {name: "Select128FromPairInt64x4", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "Select128FromPairUint8x32", argLength: 2, commutative: false, aux: "UInt8"}, + {name: "Select128FromPairUint16x16", argLength: 2, commutative: false, aux: "UInt8"}, {name: "Select128FromPairUint32x8", argLength: 2, commutative: false, aux: "UInt8"}, {name: "Select128FromPairUint64x4", argLength: 2, commutative: false, aux: "UInt8"}, {name: "SetElemFloat32x4", argLength: 2, commutative: false, aux: "UInt8"}, diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index fa94dfbbd59..bb40ff41178 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -7151,8 +7151,12 @@ const ( OpSHA1FourRoundsUint32x4 OpSelect128FromPairFloat32x8 OpSelect128FromPairFloat64x4 + OpSelect128FromPairInt8x32 + OpSelect128FromPairInt16x16 OpSelect128FromPairInt32x8 OpSelect128FromPairInt64x4 + OpSelect128FromPairUint8x32 + OpSelect128FromPairUint16x16 OpSelect128FromPairUint32x8 OpSelect128FromPairUint64x4 OpSetElemFloat32x4 @@ -92250,6 +92254,18 @@ var opcodeTable = [...]opInfo{ argLen: 2, generic: true, }, + { + name: "Select128FromPairInt8x32", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, + { + name: "Select128FromPairInt16x16", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, { name: "Select128FromPairInt32x8", auxType: auxUInt8, @@ -92262,6 +92278,18 @@ var opcodeTable = [...]opInfo{ argLen: 2, generic: true, }, + { + name: "Select128FromPairUint8x32", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, + { + name: "Select128FromPairUint16x16", + auxType: auxUInt8, + argLen: 2, + generic: true, + }, { name: "Select128FromPairUint32x8", auxType: auxUInt8, diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go index 5ad2ed3f96b..c7995c5c9e5 100644 --- a/src/cmd/compile/internal/ssa/rewriteAMD64.go +++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go @@ -5017,18 +5017,30 @@ func rewriteValueAMD64(v *Value) bool { case OpSelect128FromPairFloat64x4: v.Op = OpAMD64VPERM2F128256 return true + case OpSelect128FromPairInt16x16: + v.Op = OpAMD64VPERM2I128256 + return true case OpSelect128FromPairInt32x8: v.Op = OpAMD64VPERM2I128256 return true case OpSelect128FromPairInt64x4: v.Op = OpAMD64VPERM2I128256 return true + case OpSelect128FromPairInt8x32: + v.Op = OpAMD64VPERM2I128256 + return true + case OpSelect128FromPairUint16x16: + v.Op = OpAMD64VPERM2I128256 + return true case OpSelect128FromPairUint32x8: v.Op = OpAMD64VPERM2I128256 return true case OpSelect128FromPairUint64x4: v.Op = OpAMD64VPERM2I128256 return true + case OpSelect128FromPairUint8x32: + v.Op = OpAMD64VPERM2I128256 + return true case OpSelectN: return rewriteValueAMD64_OpSelectN(v) case OpSetElemFloat32x4: diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go index 34e491371ea..413cf92c88c 100644 --- a/src/cmd/compile/internal/ssagen/simdintrinsics.go +++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go @@ -953,8 +953,12 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies . addF(simdPackage, "Float64x8.Scale", opLen2(ssa.OpScaleFloat64x8, types.TypeVec512), sys.AMD64) addF(simdPackage, "Float32x8.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairFloat32x8, types.TypeVec256, 0), sys.AMD64) addF(simdPackage, "Float64x4.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairFloat64x4, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Int8x32.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairInt8x32, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Int16x16.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairInt16x16, types.TypeVec256, 0), sys.AMD64) addF(simdPackage, "Int32x8.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairInt32x8, types.TypeVec256, 0), sys.AMD64) addF(simdPackage, "Int64x4.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairInt64x4, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Uint8x32.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairUint8x32, types.TypeVec256, 0), sys.AMD64) + addF(simdPackage, "Uint16x16.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairUint16x16, types.TypeVec256, 0), sys.AMD64) addF(simdPackage, "Uint32x8.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairUint32x8, types.TypeVec256, 0), sys.AMD64) addF(simdPackage, "Uint64x4.Select128FromPair", opLen2Imm8_II(ssa.OpSelect128FromPairUint64x4, types.TypeVec256, 0), sys.AMD64) addF(simdPackage, "Float32x4.SetElem", opLen2Imm8(ssa.OpSetElemFloat32x4, types.TypeVec128, 0), sys.AMD64) diff --git a/src/simd/_gen/simdgen/gen_simdTypes.go b/src/simd/_gen/simdgen/gen_simdTypes.go index dc5f77adaab..f98795e1b0b 100644 --- a/src/simd/_gen/simdgen/gen_simdTypes.go +++ b/src/simd/_gen/simdgen/gen_simdTypes.go @@ -351,7 +351,7 @@ func ({{.Op1NameAndType "x"}}) {{.Go}}({{.Op2NameAndType "y"}}, {{.ImmName}} uin {{if .Documentation}}{{.Documentation}} //{{end}} // {{.ImmName}} result in better performance when they are constants, non-constant values will be translated into a jump table. -// {{.ImmName}} should be between 0 and 3, inclusive; other values will result in a runtime panic. +// {{.ImmName}} should be between 0 and 3, inclusive; other values may result in a runtime panic. // // Asm: {{.Asm}}, CPU Feature: {{.CPUFeature}} func ({{.Op1NameAndType "x"}}) {{.Go}}({{.ImmName}} uint8, {{.Op2NameAndType "y"}}) {{.GoType}} diff --git a/src/simd/_gen/simdgen/godefs.go b/src/simd/_gen/simdgen/godefs.go index 0b8fbd7e3de..c127eb1b6de 100644 --- a/src/simd/_gen/simdgen/godefs.go +++ b/src/simd/_gen/simdgen/godefs.go @@ -98,6 +98,8 @@ func (o *Operation) SkipMaskedMethod() bool { return false } +var reForName = regexp.MustCompile(`\bNAME\b`) + func (o *Operation) DecodeUnified(v *unify.Value) error { if err := v.Decode(&o.rawOperation); err != nil { return err @@ -117,7 +119,7 @@ func (o *Operation) DecodeUnified(v *unify.Value) error { } else { o.Documentation = "// UNDOCUMENTED" } - o.Documentation = regexp.MustCompile(`\bNAME\b`).ReplaceAllString(o.Documentation, o.Go) + o.Documentation = reForName.ReplaceAllString(o.Documentation, o.Go) if isMasked { o.Documentation += "\n//\n// This operation is applied selectively under a write mask." // Suppress generic op and method declaration for exported methods, if a mask is present. @@ -128,7 +130,7 @@ func (o *Operation) DecodeUnified(v *unify.Value) error { } } if o.rawOperation.AddDoc != nil { - o.Documentation += "\n" + *o.rawOperation.AddDoc + o.Documentation += "\n" + reForName.ReplaceAllString(*o.rawOperation.AddDoc, o.Go) } o.In = append(o.rawOperation.In, o.rawOperation.InVariant...) diff --git a/src/simd/_gen/simdgen/ops/Moves/categories.yaml b/src/simd/_gen/simdgen/ops/Moves/categories.yaml index 44bd8efb7fd..3c86974e8a2 100644 --- a/src/simd/_gen/simdgen/ops/Moves/categories.yaml +++ b/src/simd/_gen/simdgen/ops/Moves/categories.yaml @@ -135,7 +135,7 @@ // NAME concatenates selected elements from x and y into the lower and upper // halves of the output. The selection is chosen by the constant parameter h1h0l1l0 // where each {h,l}{1,0} is two bits specify which element from y or x to select. - // For example, {0,1,2,3}.concatSelectedConstant(0b_11_01_00_10, {4,5,6,7}) returns + // For example, {0,1,2,3}.NAME(0b_11_01_00_10, {4,5,6,7}) returns // {2, 0, 5, 7} (don't forget that the binary constant is written big-endian). - go: concatSelectedConstant @@ -196,9 +196,12 @@ // The selection is chosen by the constant parameter h1h0l1l0 // where each {h,l}{1,0} is two bits specifying which element from y or x to select. // For example, - // {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.NAME( - // 0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215}) + // + // {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.NAME( + // 0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215}) + // // returns {2,0,5,7,10,8,13,15, 22,20,25,27,210,28,213,215} + // // (don't forget that the binary constant is written big-endian). - go: concatSelectedConstantGrouped @@ -214,7 +217,7 @@ // subvectors of x and y. // // For example {4,5,8,9,12,13,16,17}.NAME(0b11_00_11_10, {6,7,10,11,14,15,18,19}) - // returns {4,7,9,11,12,14,17,19}; bit 0 is zero, selecting element 0 from x's + // returns {4,7,9,11,12,14,17,19}; bit 0 is zero, selecting element 0 from x's // least 128-bits (4), then 1, selects the element 1 from y's least 128-bits (7), // then 1, selecting element 1 from x's next 128 bits (9), then 1, // selecting element 1 from y's upper 128 bits (11). The next two 0 bits select @@ -227,9 +230,8 @@ commutative: false documentation: !string |- // NAME treats the 256-bit vectors x and y as a single vector of four - // 128-bit elements, and returns a 256-bit result formed by + // 128-bit elements, and returns a 256-bit result formed by // concatenating the two elements specified by lo and hi. - // For example, {4,5}.NAME(3,0,{6,7}) returns {7,4}. - go: ConcatShiftBytesRight commutative: false diff --git a/src/simd/_gen/simdgen/ops/Moves/go.yaml b/src/simd/_gen/simdgen/ops/Moves/go.yaml index 697d6a8bced..bbea29bcb0a 100644 --- a/src/simd/_gen/simdgen/ops/Moves/go.yaml +++ b/src/simd/_gen/simdgen/ops/Moves/go.yaml @@ -837,6 +837,12 @@ - go: Select128FromPair asm: VPERM2F128 operandOrder: II + addDoc: !string |- + // For example, + // + // {40, 41, 50, 51}.NAME(3, 0, {60, 61, 70, 71}) + // + // returns {70, 71, 40, 41}. in: - &v go: $t @@ -854,6 +860,12 @@ - go: Select128FromPair asm: VPERM2F128 operandOrder: II + addDoc: !string |- + // For example, + // + // {40, 41, 42, 43, 50, 51, 52, 53}.NAME(3, 0, {60, 61, 62, 63, 70, 71, 72, 73}) + // + // returns {70, 71, 72, 73, 40, 41, 42, 43}. in: - &v go: $t @@ -872,6 +884,12 @@ - go: Select128FromPair asm: VPERM2I128 operandOrder: II + addDoc: !string |- + // For example, + // + // {40, 41, 50, 51}.NAME(3, 0, {60, 61, 70, 71}) + // + // returns {70, 71, 40, 41}. in: - &v go: $t @@ -890,6 +908,12 @@ - go: Select128FromPair asm: VPERM2I128 operandOrder: II + addDoc: !string |- + // For example, + // + // {40, 41, 42, 43, 50, 51, 52, 53}.NAME(3, 0, {60, 61, 62, 63, 70, 71, 72, 73}) + // + // returns {70, 71, 72, 73, 40, 41, 42, 43}. in: - &v go: $t @@ -905,6 +929,56 @@ out: - *v +- go: Select128FromPair + asm: VPERM2I128 + operandOrder: II + addDoc: !string |- + // For example, + // + // {40, 41, 42, 43, 44, 45, 46, 47, 50, 51, 52, 53, 54, 55, 56, 57}.NAME(3, 0, + // {60, 61, 62, 63, 64, 65, 66, 67, 70, 71, 72, 73, 74, 75, 76, 77}) + // + // returns {70, 71, 72, 73, 74, 75, 76, 77, 40, 41, 42, 43, 44, 45, 46, 47}. + in: + - &v + go: $t + class: vreg + base: int|uint + bits: 256 + OverwriteElementBits: 16 + - *v + - class: immediate + immOffset: 0 + name: "lo, hi" + inVariant: [] + out: + - *v + +- go: Select128FromPair + asm: VPERM2I128 + operandOrder: II + addDoc: !string |- + // For example, + // + // {0x40, 0x41, ..., 0x4f, 0x50, 0x51, ..., 0x5f}.NAME(3, 0, + // {0x60, 0x61, ..., 0x6f, 0x70, 0x71, ..., 0x7f}) + // + // returns {0x70, 0x71, ..., 0x7f, 0x40, 0x41, ..., 0x4f}. + in: + - &v + go: $t + class: vreg + base: int|uint + bits: 256 + OverwriteElementBits: 8 + - *v + - class: immediate + immOffset: 0 + name: "lo, hi" + inVariant: [] + out: + - *v + - go: ConcatShiftBytesRight asm: VPALIGNR in: @@ -930,4 +1004,3 @@ immOffset: 0 out: - *uint256512 - \ No newline at end of file diff --git a/src/simd/ops_amd64.go b/src/simd/ops_amd64.go index e9ddb463be7..8acf3e897c2 100644 --- a/src/simd/ops_amd64.go +++ b/src/simd/ops_amd64.go @@ -5604,10 +5604,14 @@ func (x Float64x8) Scale(y Float64x8) Float64x8 // Select128FromPair treats the 256-bit vectors x and y as a single vector of four // 128-bit elements, and returns a 256-bit result formed by // concatenating the two elements specified by lo and hi. -// For example, {4,5}.Select128FromPair(3,0,{6,7}) returns {7,4}. +// For example, +// +// {40, 41, 42, 43, 50, 51, 52, 53}.Select128FromPair(3, 0, {60, 61, 62, 63, 70, 71, 72, 73}) +// +// returns {70, 71, 72, 73, 40, 41, 42, 43}. // // lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table. -// lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic. +// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic. // // Asm: VPERM2F128, CPU Feature: AVX func (x Float32x8) Select128FromPair(lo, hi uint8, y Float32x8) Float32x8 @@ -5615,10 +5619,14 @@ func (x Float32x8) Select128FromPair(lo, hi uint8, y Float32x8) Float32x8 // Select128FromPair treats the 256-bit vectors x and y as a single vector of four // 128-bit elements, and returns a 256-bit result formed by // concatenating the two elements specified by lo and hi. -// For example, {4,5}.Select128FromPair(3,0,{6,7}) returns {7,4}. +// For example, +// +// {40, 41, 50, 51}.Select128FromPair(3, 0, {60, 61, 70, 71}) +// +// returns {70, 71, 40, 41}. // // lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table. -// lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic. +// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic. // // Asm: VPERM2F128, CPU Feature: AVX func (x Float64x4) Select128FromPair(lo, hi uint8, y Float64x4) Float64x4 @@ -5626,10 +5634,46 @@ func (x Float64x4) Select128FromPair(lo, hi uint8, y Float64x4) Float64x4 // Select128FromPair treats the 256-bit vectors x and y as a single vector of four // 128-bit elements, and returns a 256-bit result formed by // concatenating the two elements specified by lo and hi. -// For example, {4,5}.Select128FromPair(3,0,{6,7}) returns {7,4}. +// For example, +// +// {0x40, 0x41, ..., 0x4f, 0x50, 0x51, ..., 0x5f}.Select128FromPair(3, 0, +// {0x60, 0x61, ..., 0x6f, 0x70, 0x71, ..., 0x7f}) +// +// returns {0x70, 0x71, ..., 0x7f, 0x40, 0x41, ..., 0x4f}. // // lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table. -// lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic. +// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic. +// +// Asm: VPERM2I128, CPU Feature: AVX2 +func (x Int8x32) Select128FromPair(lo, hi uint8, y Int8x32) Int8x32 + +// Select128FromPair treats the 256-bit vectors x and y as a single vector of four +// 128-bit elements, and returns a 256-bit result formed by +// concatenating the two elements specified by lo and hi. +// For example, +// +// {40, 41, 42, 43, 44, 45, 46, 47, 50, 51, 52, 53, 54, 55, 56, 57}.Select128FromPair(3, 0, +// {60, 61, 62, 63, 64, 65, 66, 67, 70, 71, 72, 73, 74, 75, 76, 77}) +// +// returns {70, 71, 72, 73, 74, 75, 76, 77, 40, 41, 42, 43, 44, 45, 46, 47}. +// +// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table. +// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic. +// +// Asm: VPERM2I128, CPU Feature: AVX2 +func (x Int16x16) Select128FromPair(lo, hi uint8, y Int16x16) Int16x16 + +// Select128FromPair treats the 256-bit vectors x and y as a single vector of four +// 128-bit elements, and returns a 256-bit result formed by +// concatenating the two elements specified by lo and hi. +// For example, +// +// {40, 41, 42, 43, 50, 51, 52, 53}.Select128FromPair(3, 0, {60, 61, 62, 63, 70, 71, 72, 73}) +// +// returns {70, 71, 72, 73, 40, 41, 42, 43}. +// +// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table. +// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic. // // Asm: VPERM2I128, CPU Feature: AVX2 func (x Int32x8) Select128FromPair(lo, hi uint8, y Int32x8) Int32x8 @@ -5637,10 +5681,14 @@ func (x Int32x8) Select128FromPair(lo, hi uint8, y Int32x8) Int32x8 // Select128FromPair treats the 256-bit vectors x and y as a single vector of four // 128-bit elements, and returns a 256-bit result formed by // concatenating the two elements specified by lo and hi. -// For example, {4,5}.Select128FromPair(3,0,{6,7}) returns {7,4}. +// For example, +// +// {40, 41, 50, 51}.Select128FromPair(3, 0, {60, 61, 70, 71}) +// +// returns {70, 71, 40, 41}. // // lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table. -// lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic. +// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic. // // Asm: VPERM2I128, CPU Feature: AVX2 func (x Int64x4) Select128FromPair(lo, hi uint8, y Int64x4) Int64x4 @@ -5648,10 +5696,46 @@ func (x Int64x4) Select128FromPair(lo, hi uint8, y Int64x4) Int64x4 // Select128FromPair treats the 256-bit vectors x and y as a single vector of four // 128-bit elements, and returns a 256-bit result formed by // concatenating the two elements specified by lo and hi. -// For example, {4,5}.Select128FromPair(3,0,{6,7}) returns {7,4}. +// For example, +// +// {0x40, 0x41, ..., 0x4f, 0x50, 0x51, ..., 0x5f}.Select128FromPair(3, 0, +// {0x60, 0x61, ..., 0x6f, 0x70, 0x71, ..., 0x7f}) +// +// returns {0x70, 0x71, ..., 0x7f, 0x40, 0x41, ..., 0x4f}. // // lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table. -// lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic. +// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic. +// +// Asm: VPERM2I128, CPU Feature: AVX2 +func (x Uint8x32) Select128FromPair(lo, hi uint8, y Uint8x32) Uint8x32 + +// Select128FromPair treats the 256-bit vectors x and y as a single vector of four +// 128-bit elements, and returns a 256-bit result formed by +// concatenating the two elements specified by lo and hi. +// For example, +// +// {40, 41, 42, 43, 44, 45, 46, 47, 50, 51, 52, 53, 54, 55, 56, 57}.Select128FromPair(3, 0, +// {60, 61, 62, 63, 64, 65, 66, 67, 70, 71, 72, 73, 74, 75, 76, 77}) +// +// returns {70, 71, 72, 73, 74, 75, 76, 77, 40, 41, 42, 43, 44, 45, 46, 47}. +// +// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table. +// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic. +// +// Asm: VPERM2I128, CPU Feature: AVX2 +func (x Uint16x16) Select128FromPair(lo, hi uint8, y Uint16x16) Uint16x16 + +// Select128FromPair treats the 256-bit vectors x and y as a single vector of four +// 128-bit elements, and returns a 256-bit result formed by +// concatenating the two elements specified by lo and hi. +// For example, +// +// {40, 41, 42, 43, 50, 51, 52, 53}.Select128FromPair(3, 0, {60, 61, 62, 63, 70, 71, 72, 73}) +// +// returns {70, 71, 72, 73, 40, 41, 42, 43}. +// +// lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table. +// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic. // // Asm: VPERM2I128, CPU Feature: AVX2 func (x Uint32x8) Select128FromPair(lo, hi uint8, y Uint32x8) Uint32x8 @@ -5659,10 +5743,14 @@ func (x Uint32x8) Select128FromPair(lo, hi uint8, y Uint32x8) Uint32x8 // Select128FromPair treats the 256-bit vectors x and y as a single vector of four // 128-bit elements, and returns a 256-bit result formed by // concatenating the two elements specified by lo and hi. -// For example, {4,5}.Select128FromPair(3,0,{6,7}) returns {7,4}. +// For example, +// +// {40, 41, 50, 51}.Select128FromPair(3, 0, {60, 61, 70, 71}) +// +// returns {70, 71, 40, 41}. // // lo, hi result in better performance when they are constants, non-constant values will be translated into a jump table. -// lo, hi should be between 0 and 3, inclusive; other values will result in a runtime panic. +// lo, hi should be between 0 and 3, inclusive; other values may result in a runtime panic. // // Asm: VPERM2I128, CPU Feature: AVX2 func (x Uint64x4) Select128FromPair(lo, hi uint8, y Uint64x4) Uint64x4 diff --git a/src/simd/ops_internal_amd64.go b/src/simd/ops_internal_amd64.go index 63ee6416a66..e54c3b20063 100644 --- a/src/simd/ops_internal_amd64.go +++ b/src/simd/ops_internal_amd64.go @@ -144,11 +144,12 @@ func (x Float32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Float32x8) Fl // The selection is chosen by the constant parameter h1h0l1l0 // where each {h,l}{1,0} is two bits specifying which element from y or x to select. // For example, -// {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped( // -// 0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215}) +// {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped( +// 0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215}) // // returns {2,0,5,7,10,8,13,15, 22,20,25,27,210,28,213,215} +// // (don't forget that the binary constant is written big-endian). // // h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. @@ -215,11 +216,12 @@ func (x Int32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Int32x8) Int32x // The selection is chosen by the constant parameter h1h0l1l0 // where each {h,l}{1,0} is two bits specifying which element from y or x to select. // For example, -// {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped( // -// 0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215}) +// {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped( +// 0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215}) // // returns {2,0,5,7,10,8,13,15, 22,20,25,27,210,28,213,215} +// // (don't forget that the binary constant is written big-endian). // // h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table. @@ -286,11 +288,12 @@ func (x Uint32x8) concatSelectedConstantGrouped(h1h0l1l0 uint8, y Uint32x8) Uint // The selection is chosen by the constant parameter h1h0l1l0 // where each {h,l}{1,0} is two bits specifying which element from y or x to select. // For example, -// {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped( // -// 0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215}) +// {0,1,2,3,8,9,10,11, 20,21,22,23,28,29,210,211}.concatSelectedConstantGrouped( +// 0b_11_01_00_10, {4,5,6,7,12,13,14,15, 24,25,26,27,212,213,214,215}) // // returns {2,0,5,7,10,8,13,15, 22,20,25,27,210,28,213,215} +// // (don't forget that the binary constant is written big-endian). // // h1h0l1l0 results in better performance when it's a constant, a non-constant value will be translated into a jump table.