diff --git a/src/cmd/compile/internal/amd64/simdssa.go b/src/cmd/compile/internal/amd64/simdssa.go
index 999f3c200ce..ac2848d1baf 100644
--- a/src/cmd/compile/internal/amd64/simdssa.go
+++ b/src/cmd/compile/internal/amd64/simdssa.go
@@ -706,6 +706,8 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VGF2P8AFFINEINVQB128,
 		ssa.OpAMD64VGF2P8AFFINEINVQB256,
 		ssa.OpAMD64VGF2P8AFFINEINVQB512,
+		ssa.OpAMD64VINSERTF128256,
+		ssa.OpAMD64VINSERTI128256,
 		ssa.OpAMD64VPSHLDW128,
 		ssa.OpAMD64VPSHLDW256,
 		ssa.OpAMD64VPSHLDW512,
diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
index 3768c5aaadc..6b1078e7412 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
@@ -1452,6 +1452,16 @@
 (SaturatedUnsignedSignedQuadDotProdAccumulateUint32x4 ...) => (VPDPBUSDS128 ...)
 (SaturatedUnsignedSignedQuadDotProdAccumulateUint32x8 ...) => (VPDPBUSDS256 ...)
 (SaturatedUnsignedSignedQuadDotProdAccumulateUint32x16 ...) => (VPDPBUSDS512 ...)
+(Set128Float32x8 [a] x y) => (VINSERTF128256 [a] x y)
+(Set128Float64x4 [a] x y) => (VINSERTF128256 [a] x y)
+(Set128Int8x32 [a] x y) => (VINSERTI128256 [a] x y)
+(Set128Int16x16 [a] x y) => (VINSERTI128256 [a] x y)
+(Set128Int32x8 [a] x y) => (VINSERTI128256 [a] x y)
+(Set128Int64x4 [a] x y) => (VINSERTI128256 [a] x y)
+(Set128Uint8x32 [a] x y) => (VINSERTI128256 [a] x y)
+(Set128Uint16x16 [a] x y) => (VINSERTI128256 [a] x y)
+(Set128Uint32x8 [a] x y) => (VINSERTI128256 [a] x y)
+(Set128Uint64x4 [a] x y) => (VINSERTI128256 [a] x y)
 (SetElemInt8x16 [a] x y) => (VPINSRB128 [a] x y)
 (SetElemInt16x8 [a] x y) => (VPINSRW128 [a] x y)
 (SetElemInt32x4 [a] x y) => (VPINSRD128 [a] x y)
diff --git a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
index 5e627e696e9..787d3c5fcbf 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
@@ -768,6 +768,7 @@ func simdAMD64Ops(fp11, fp21, fp2k, fpkfp, fp2kfp, fp2kk, fp31, fp3kfp, fpgpfp,
 		{name: "VRNDSCALEPSMasked256", argLength: 2, reg: fpkfp, asm: "VRNDSCALEPS", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VREDUCEPSMasked256", argLength: 2, reg: fpkfp, asm: "VREDUCEPS", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VCMPPSMasked256", argLength: 3, reg: fp2kk, asm: "VCMPPS", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
+		{name: "VINSERTF128256", argLength: 2, reg: fp21, asm: "VINSERTF128", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VROUNDPD128", argLength: 1, reg: fp11, asm: "VROUNDPD", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VRNDSCALEPD128", argLength: 1, reg: fp11, asm: "VRNDSCALEPD", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VREDUCEPD128", argLength: 1, reg: fp11, asm: "VREDUCEPD", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false},
@@ -879,6 +880,7 @@ func simdAMD64Ops(fp11, fp21, fp2k, fpkfp, fp2kfp, fp2kk, fp31, fp3kfp, fpgpfp,
 		{name: "VPINSRB128", argLength: 2, reg: fpgpfp, asm: "VPINSRB", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPCMPB256", argLength: 2, reg: fp2k, asm: "VPCMPB", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
 		{name: "VPCMPBMasked256", argLength: 3, reg: fp2kk, asm: "VPCMPB", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
+		{name: "VINSERTI128256", argLength: 2, reg: fp21, asm: "VINSERTI128", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VPCMPB512", argLength: 2, reg: fp2k, asm: "VPCMPB", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
 		{name: "VPCMPBMasked512", argLength: 3, reg: fp2kk, asm: "VPCMPB", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
 		{name: "VPCMPUW256", argLength: 2, reg: fp2k, asm: "VPCMPUW", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
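Note: all ten lowering rules above funnel into just two machine ops; the element type only selects the floating-point (VINSERTF128) or integer (VINSERTI128) encoding, and the [a] immediate passes through unchanged. As a rough scalar model of what the lowered instruction computes (an illustrative sketch on byte arrays, not compiler code):

    // set128 models VINSERTF128/VINSERTI128: the result is x (32 bytes,
    // one 256-bit vector) with its low (imm=0) or high (imm=1) 16-byte
    // lane replaced by y. The hardware uses only bit 0 of the immediate.
    func set128(x [32]byte, imm uint8, y [16]byte) [32]byte {
        dst := x
        off := int(imm&1) * 16
        copy(dst[off:off+16], y[:])
        return dst
    }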
diff --git a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
index b68b237c312..076a16ebda6 100644
--- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
@@ -1511,6 +1511,7 @@ func simdGenericOps() []opData {
 		{name: "MaskedRoundWithPrecisionFloat32x8", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "MaskedTruncWithPrecisionFloat32x8", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "RoundWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "Set128Float32x8", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "TruncWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
 		{name: "CeilWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"},
 		{name: "DiffWithCeilWithPrecisionFloat64x2", argLength: 1, commutative: false, aux: "Int8"},
@@ -1543,6 +1544,7 @@ func simdGenericOps() []opData {
 		{name: "MaskedRoundWithPrecisionFloat64x4", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "MaskedTruncWithPrecisionFloat64x4", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "RoundWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "Set128Float64x4", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "TruncWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
 		{name: "CeilWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
 		{name: "DiffWithCeilWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
@@ -1562,6 +1564,7 @@ func simdGenericOps() []opData {
 		{name: "TruncWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
 		{name: "MaskedShiftAllLeftAndFillUpperFromInt16x16", argLength: 3, commutative: false, aux: "Int8"},
 		{name: "MaskedShiftAllRightAndFillUpperFromInt16x16", argLength: 3, commutative: false, aux: "Int8"},
+		{name: "Set128Int16x16", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "ShiftAllLeftAndFillUpperFromInt16x16", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "ShiftAllRightAndFillUpperFromInt16x16", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "MaskedShiftAllLeftAndFillUpperFromInt16x32", argLength: 3, commutative: false, aux: "Int8"},
@@ -1598,6 +1601,7 @@ func simdGenericOps() []opData {
 		{name: "MaskedShiftAllRightAndFillUpperFromInt32x8", argLength: 3, commutative: false, aux: "Int8"},
 		{name: "RotateAllLeftInt32x8", argLength: 1, commutative: false, aux: "Int8"},
 		{name: "RotateAllRightInt32x8", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "Set128Int32x8", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "ShiftAllLeftAndFillUpperFromInt32x8", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "ShiftAllRightAndFillUpperFromInt32x8", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "GetElemInt64x2", argLength: 1, commutative: false, aux: "Int8"},
@@ -1616,6 +1620,7 @@ func simdGenericOps() []opData {
 		{name: "MaskedShiftAllRightAndFillUpperFromInt64x4", argLength: 3, commutative: false, aux: "Int8"},
 		{name: "RotateAllLeftInt64x4", argLength: 1, commutative: false, aux: "Int8"},
 		{name: "RotateAllRightInt64x4", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "Set128Int64x4", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "ShiftAllLeftAndFillUpperFromInt64x4", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "ShiftAllRightAndFillUpperFromInt64x4", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "MaskedRotateAllLeftInt64x8", argLength: 2, commutative: false, aux: "Int8"},
@@ -1628,8 +1633,10 @@ func simdGenericOps() []opData {
 		{name: "ShiftAllRightAndFillUpperFromInt64x8", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "GetElemInt8x16", argLength: 1, commutative: false, aux: "Int8"},
 		{name: "SetElemInt8x16", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "Set128Int8x32", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "MaskedShiftAllLeftAndFillUpperFromUint16x16", argLength: 3, commutative: false, aux: "Int8"},
 		{name: "MaskedShiftAllRightAndFillUpperFromUint16x16", argLength: 3, commutative: false, aux: "Int8"},
+		{name: "Set128Uint16x16", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "ShiftAllLeftAndFillUpperFromUint16x16", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "ShiftAllRightAndFillUpperFromUint16x16", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "MaskedShiftAllLeftAndFillUpperFromUint16x32", argLength: 3, commutative: false, aux: "Int8"},
@@ -1666,6 +1673,7 @@ func simdGenericOps() []opData {
 		{name: "MaskedShiftAllRightAndFillUpperFromUint32x8", argLength: 3, commutative: false, aux: "Int8"},
 		{name: "RotateAllLeftUint32x8", argLength: 1, commutative: false, aux: "Int8"},
 		{name: "RotateAllRightUint32x8", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "Set128Uint32x8", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "ShiftAllLeftAndFillUpperFromUint32x8", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "ShiftAllRightAndFillUpperFromUint32x8", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "GetElemUint64x2", argLength: 1, commutative: false, aux: "Int8"},
@@ -1684,6 +1692,7 @@ func simdGenericOps() []opData {
 		{name: "MaskedShiftAllRightAndFillUpperFromUint64x4", argLength: 3, commutative: false, aux: "Int8"},
 		{name: "RotateAllLeftUint64x4", argLength: 1, commutative: false, aux: "Int8"},
 		{name: "RotateAllRightUint64x4", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "Set128Uint64x4", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "ShiftAllLeftAndFillUpperFromUint64x4", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "ShiftAllRightAndFillUpperFromUint64x4", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "MaskedRotateAllLeftUint64x8", argLength: 2, commutative: false, aux: "Int8"},
@@ -1704,6 +1713,7 @@ func simdGenericOps() []opData {
 		{name: "GaloisFieldAffineTransformInversedUint8x32", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "MaskedGaloisFieldAffineTransformUint8x32", argLength: 3, commutative: false, aux: "Int8"},
 		{name: "MaskedGaloisFieldAffineTransformInversedUint8x32", argLength: 3, commutative: false, aux: "Int8"},
+		{name: "Set128Uint8x32", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "GaloisFieldAffineTransformUint8x64", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "GaloisFieldAffineTransformInversedUint8x64", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "MaskedGaloisFieldAffineTransformUint8x64", argLength: 3, commutative: false, aux: "Int8"},
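Note: SSA generic ops are monomorphic, so each of the ten vector shapes needs its own Set128 opcode even though they all share one layout: argLength: 2 (the 256-bit receiver and the 128-bit replacement), with the lane selector carried in the Int8 aux field rather than as an SSA argument, which is what lets it reach the instruction as an immediate. A hedged reference semantics for one instantiation, on plain arrays (names are illustrative):

    // set128Int8x32 is what Set128Int8x32 computes: x with its 16-lane
    // group `lane` (0 = low, 1 = high) replaced by y.
    func set128Int8x32(x [32]int8, lane int, y [16]int8) [32]int8 {
        out := x
        copy(out[lane*16:(lane+1)*16], y[:])
        return out
    }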
"Int8"}, {name: "MaskedGaloisFieldAffineTransformUint8x64", argLength: 3, commutative: false, aux: "Int8"}, diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index fec727ea12e..ece791ca6ce 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -1961,6 +1961,7 @@ const ( OpAMD64VRNDSCALEPSMasked256 OpAMD64VREDUCEPSMasked256 OpAMD64VCMPPSMasked256 + OpAMD64VINSERTF128256 OpAMD64VROUNDPD128 OpAMD64VRNDSCALEPD128 OpAMD64VREDUCEPD128 @@ -2072,6 +2073,7 @@ const ( OpAMD64VPINSRB128 OpAMD64VPCMPB256 OpAMD64VPCMPBMasked256 + OpAMD64VINSERTI128256 OpAMD64VPCMPB512 OpAMD64VPCMPBMasked512 OpAMD64VPCMPUW256 @@ -5844,6 +5846,7 @@ const ( OpMaskedRoundWithPrecisionFloat32x8 OpMaskedTruncWithPrecisionFloat32x8 OpRoundWithPrecisionFloat32x8 + OpSet128Float32x8 OpTruncWithPrecisionFloat32x8 OpCeilWithPrecisionFloat64x2 OpDiffWithCeilWithPrecisionFloat64x2 @@ -5876,6 +5879,7 @@ const ( OpMaskedRoundWithPrecisionFloat64x4 OpMaskedTruncWithPrecisionFloat64x4 OpRoundWithPrecisionFloat64x4 + OpSet128Float64x4 OpTruncWithPrecisionFloat64x4 OpCeilWithPrecisionFloat64x8 OpDiffWithCeilWithPrecisionFloat64x8 @@ -5895,6 +5899,7 @@ const ( OpTruncWithPrecisionFloat64x8 OpMaskedShiftAllLeftAndFillUpperFromInt16x16 OpMaskedShiftAllRightAndFillUpperFromInt16x16 + OpSet128Int16x16 OpShiftAllLeftAndFillUpperFromInt16x16 OpShiftAllRightAndFillUpperFromInt16x16 OpMaskedShiftAllLeftAndFillUpperFromInt16x32 @@ -5931,6 +5936,7 @@ const ( OpMaskedShiftAllRightAndFillUpperFromInt32x8 OpRotateAllLeftInt32x8 OpRotateAllRightInt32x8 + OpSet128Int32x8 OpShiftAllLeftAndFillUpperFromInt32x8 OpShiftAllRightAndFillUpperFromInt32x8 OpGetElemInt64x2 @@ -5949,6 +5955,7 @@ const ( OpMaskedShiftAllRightAndFillUpperFromInt64x4 OpRotateAllLeftInt64x4 OpRotateAllRightInt64x4 + OpSet128Int64x4 OpShiftAllLeftAndFillUpperFromInt64x4 OpShiftAllRightAndFillUpperFromInt64x4 OpMaskedRotateAllLeftInt64x8 @@ -5961,8 +5968,10 @@ const ( OpShiftAllRightAndFillUpperFromInt64x8 OpGetElemInt8x16 OpSetElemInt8x16 + OpSet128Int8x32 OpMaskedShiftAllLeftAndFillUpperFromUint16x16 OpMaskedShiftAllRightAndFillUpperFromUint16x16 + OpSet128Uint16x16 OpShiftAllLeftAndFillUpperFromUint16x16 OpShiftAllRightAndFillUpperFromUint16x16 OpMaskedShiftAllLeftAndFillUpperFromUint16x32 @@ -5999,6 +6008,7 @@ const ( OpMaskedShiftAllRightAndFillUpperFromUint32x8 OpRotateAllLeftUint32x8 OpRotateAllRightUint32x8 + OpSet128Uint32x8 OpShiftAllLeftAndFillUpperFromUint32x8 OpShiftAllRightAndFillUpperFromUint32x8 OpGetElemUint64x2 @@ -6017,6 +6027,7 @@ const ( OpMaskedShiftAllRightAndFillUpperFromUint64x4 OpRotateAllLeftUint64x4 OpRotateAllRightUint64x4 + OpSet128Uint64x4 OpShiftAllLeftAndFillUpperFromUint64x4 OpShiftAllRightAndFillUpperFromUint64x4 OpMaskedRotateAllLeftUint64x8 @@ -6037,6 +6048,7 @@ const ( OpGaloisFieldAffineTransformInversedUint8x32 OpMaskedGaloisFieldAffineTransformUint8x32 OpMaskedGaloisFieldAffineTransformInversedUint8x32 + OpSet128Uint8x32 OpGaloisFieldAffineTransformUint8x64 OpGaloisFieldAffineTransformInversedUint8x64 OpMaskedGaloisFieldAffineTransformUint8x64 @@ -30131,6 +30143,21 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VINSERTF128256", + auxType: auxInt8, + argLen: 2, + asm: x86.AVINSERTF128, + reg: regInfo{ + inputs: []inputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + {1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 + }, + outputs: []outputInfo{ + {0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 
@@ -30131,6 +30143,21 @@ var opcodeTable = [...]opInfo{
 			},
 		},
 	},
+	{
+		name:    "VINSERTF128256",
+		auxType: auxInt8,
+		argLen:  2,
+		asm:     x86.AVINSERTF128,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
 	{
 		name:    "VROUNDPD128",
 		auxType: auxInt8,
@@ -31825,6 +31852,21 @@ var opcodeTable = [...]opInfo{
 			},
 		},
 	},
+	{
+		name:    "VINSERTI128256",
+		auxType: auxInt8,
+		argLen:  2,
+		asm:     x86.AVINSERTI128,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+				{1, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
 	{
 		name:    "VPCMPB512",
 		auxType: auxInt8,
@@ -67718,6 +67760,12 @@ var opcodeTable = [...]opInfo{
 		argLen:  1,
 		generic: true,
 	},
+	{
+		name:    "Set128Float32x8",
+		auxType: auxInt8,
+		argLen:  2,
+		generic: true,
+	},
 	{
 		name:    "TruncWithPrecisionFloat32x8",
 		auxType: auxInt8,
@@ -67910,6 +67958,12 @@ var opcodeTable = [...]opInfo{
 		argLen:  1,
 		generic: true,
 	},
+	{
+		name:    "Set128Float64x4",
+		auxType: auxInt8,
+		argLen:  2,
+		generic: true,
+	},
 	{
 		name:    "TruncWithPrecisionFloat64x4",
 		auxType: auxInt8,
@@ -68024,6 +68078,12 @@ var opcodeTable = [...]opInfo{
 		argLen:  3,
 		generic: true,
 	},
+	{
+		name:    "Set128Int16x16",
+		auxType: auxInt8,
+		argLen:  2,
+		generic: true,
+	},
 	{
 		name:    "ShiftAllLeftAndFillUpperFromInt16x16",
 		auxType: auxInt8,
@@ -68240,6 +68300,12 @@ var opcodeTable = [...]opInfo{
 		argLen:  1,
 		generic: true,
 	},
+	{
+		name:    "Set128Int32x8",
+		auxType: auxInt8,
+		argLen:  2,
+		generic: true,
+	},
 	{
 		name:    "ShiftAllLeftAndFillUpperFromInt32x8",
 		auxType: auxInt8,
@@ -68348,6 +68414,12 @@ var opcodeTable = [...]opInfo{
 		argLen:  1,
 		generic: true,
 	},
+	{
+		name:    "Set128Int64x4",
+		auxType: auxInt8,
+		argLen:  2,
+		generic: true,
+	},
 	{
 		name:    "ShiftAllLeftAndFillUpperFromInt64x4",
 		auxType: auxInt8,
@@ -68420,6 +68492,12 @@ var opcodeTable = [...]opInfo{
 		argLen:  2,
 		generic: true,
 	},
+	{
+		name:    "Set128Int8x32",
+		auxType: auxInt8,
+		argLen:  2,
+		generic: true,
+	},
 	{
 		name:    "MaskedShiftAllLeftAndFillUpperFromUint16x16",
 		auxType: auxInt8,
@@ -68432,6 +68510,12 @@ var opcodeTable = [...]opInfo{
 		argLen:  3,
 		generic: true,
 	},
+	{
+		name:    "Set128Uint16x16",
+		auxType: auxInt8,
+		argLen:  2,
+		generic: true,
+	},
 	{
 		name:    "ShiftAllLeftAndFillUpperFromUint16x16",
 		auxType: auxInt8,
@@ -68648,6 +68732,12 @@ var opcodeTable = [...]opInfo{
 		argLen:  1,
 		generic: true,
 	},
+	{
+		name:    "Set128Uint32x8",
+		auxType: auxInt8,
+		argLen:  2,
+		generic: true,
+	},
 	{
 		name:    "ShiftAllLeftAndFillUpperFromUint32x8",
 		auxType: auxInt8,
@@ -68756,6 +68846,12 @@ var opcodeTable = [...]opInfo{
 		argLen:  1,
 		generic: true,
 	},
+	{
+		name:    "Set128Uint64x4",
+		auxType: auxInt8,
+		argLen:  2,
+		generic: true,
+	},
 	{
 		name:    "ShiftAllLeftAndFillUpperFromUint64x4",
 		auxType: auxInt8,
@@ -68876,6 +68972,12 @@ var opcodeTable = [...]opInfo{
 		argLen:  3,
 		generic: true,
 	},
+	{
+		name:    "Set128Uint8x32",
+		auxType: auxInt8,
+		argLen:  2,
+		generic: true,
+	},
 	{
 		name:    "GaloisFieldAffineTransformUint8x64",
 		auxType: auxInt8,
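Note: in the two machine-op entries above, the input/output mask 2147418112 is 0x7fff0000, i.e. bits 16-30 of the compiler's AMD64 register numbering, which are X0 through X14 per the generated comments (X15 is excluded because Go's register ABI reserves it as a fixed zero register). A quick standalone check of that decoding (assumes only the numbering shown in the comments):

    package main

    import "fmt"

    func main() {
        const mask = 2147418112
        fmt.Printf("%#x:", mask) // 0x7fff0000
        for i := uint(0); i < 32; i++ {
            if mask&(1<<i) != 0 {
                fmt.Printf(" X%d", i-16) // prints X0 through X14
            }
        }
        fmt.Println()
    }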
diff --git a/src/cmd/compile/internal/ssa/rewriteAMD64.go b/src/cmd/compile/internal/ssa/rewriteAMD64.go
index 15ca2fcc5b4..5c1872dcdfd 100644
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@@ -4411,6 +4411,26 @@ func rewriteValueAMD64(v *Value) bool {
 		return rewriteValueAMD64_OpSelect1(v)
 	case OpSelectN:
 		return rewriteValueAMD64_OpSelectN(v)
+	case OpSet128Float32x8:
+		return rewriteValueAMD64_OpSet128Float32x8(v)
+	case OpSet128Float64x4:
+		return rewriteValueAMD64_OpSet128Float64x4(v)
+	case OpSet128Int16x16:
+		return rewriteValueAMD64_OpSet128Int16x16(v)
+	case OpSet128Int32x8:
+		return rewriteValueAMD64_OpSet128Int32x8(v)
+	case OpSet128Int64x4:
+		return rewriteValueAMD64_OpSet128Int64x4(v)
+	case OpSet128Int8x32:
+		return rewriteValueAMD64_OpSet128Int8x32(v)
+	case OpSet128Uint16x16:
+		return rewriteValueAMD64_OpSet128Uint16x16(v)
+	case OpSet128Uint32x8:
+		return rewriteValueAMD64_OpSet128Uint32x8(v)
+	case OpSet128Uint64x4:
+		return rewriteValueAMD64_OpSet128Uint64x4(v)
+	case OpSet128Uint8x32:
+		return rewriteValueAMD64_OpSet128Uint8x32(v)
 	case OpSetElemInt16x8:
 		return rewriteValueAMD64_OpSetElemInt16x8(v)
 	case OpSetElemInt32x4:
@@ -53102,6 +53122,156 @@ func rewriteValueAMD64_OpSelectN(v *Value) bool {
 	}
 	return false
 }
+func rewriteValueAMD64_OpSet128Float32x8(v *Value) bool {
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	// match: (Set128Float32x8 [a] x y)
+	// result: (VINSERTF128256 [a] x y)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		y := v_1
+		v.reset(OpAMD64VINSERTF128256)
+		v.AuxInt = int8ToAuxInt(a)
+		v.AddArg2(x, y)
+		return true
+	}
+}
+func rewriteValueAMD64_OpSet128Float64x4(v *Value) bool {
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	// match: (Set128Float64x4 [a] x y)
+	// result: (VINSERTF128256 [a] x y)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		y := v_1
+		v.reset(OpAMD64VINSERTF128256)
+		v.AuxInt = int8ToAuxInt(a)
+		v.AddArg2(x, y)
+		return true
+	}
+}
+func rewriteValueAMD64_OpSet128Int16x16(v *Value) bool {
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	// match: (Set128Int16x16 [a] x y)
+	// result: (VINSERTI128256 [a] x y)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		y := v_1
+		v.reset(OpAMD64VINSERTI128256)
+		v.AuxInt = int8ToAuxInt(a)
+		v.AddArg2(x, y)
+		return true
+	}
+}
+func rewriteValueAMD64_OpSet128Int32x8(v *Value) bool {
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	// match: (Set128Int32x8 [a] x y)
+	// result: (VINSERTI128256 [a] x y)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		y := v_1
+		v.reset(OpAMD64VINSERTI128256)
+		v.AuxInt = int8ToAuxInt(a)
+		v.AddArg2(x, y)
+		return true
+	}
+}
+func rewriteValueAMD64_OpSet128Int64x4(v *Value) bool {
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	// match: (Set128Int64x4 [a] x y)
+	// result: (VINSERTI128256 [a] x y)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		y := v_1
+		v.reset(OpAMD64VINSERTI128256)
+		v.AuxInt = int8ToAuxInt(a)
+		v.AddArg2(x, y)
+		return true
+	}
+}
+func rewriteValueAMD64_OpSet128Int8x32(v *Value) bool {
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	// match: (Set128Int8x32 [a] x y)
+	// result: (VINSERTI128256 [a] x y)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		y := v_1
+		v.reset(OpAMD64VINSERTI128256)
+		v.AuxInt = int8ToAuxInt(a)
+		v.AddArg2(x, y)
+		return true
+	}
+}
+func rewriteValueAMD64_OpSet128Uint16x16(v *Value) bool {
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	// match: (Set128Uint16x16 [a] x y)
+	// result: (VINSERTI128256 [a] x y)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		y := v_1
+		v.reset(OpAMD64VINSERTI128256)
+		v.AuxInt = int8ToAuxInt(a)
+		v.AddArg2(x, y)
+		return true
+	}
+}
+func rewriteValueAMD64_OpSet128Uint32x8(v *Value) bool {
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	// match: (Set128Uint32x8 [a] x y)
+	// result: (VINSERTI128256 [a] x y)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		y := v_1
+		v.reset(OpAMD64VINSERTI128256)
+		v.AuxInt = int8ToAuxInt(a)
+		v.AddArg2(x, y)
+		return true
+	}
+}
+func rewriteValueAMD64_OpSet128Uint64x4(v *Value) bool {
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	// match: (Set128Uint64x4 [a] x y)
+	// result: (VINSERTI128256 [a] x y)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		y := v_1
+		v.reset(OpAMD64VINSERTI128256)
+		v.AuxInt = int8ToAuxInt(a)
+		v.AddArg2(x, y)
+		return true
+	}
+}
+func rewriteValueAMD64_OpSet128Uint8x32(v *Value) bool {
+	v_1 := v.Args[1]
+	v_0 := v.Args[0]
+	// match: (Set128Uint8x32 [a] x y)
+	// result: (VINSERTI128256 [a] x y)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		y := v_1
+		v.reset(OpAMD64VINSERTI128256)
+		v.AuxInt = int8ToAuxInt(a)
+		v.AddArg2(x, y)
+		return true
+	}
+}
 func rewriteValueAMD64_OpSetElemInt16x8(v *Value) bool {
 	v_1 := v.Args[1]
 	v_0 := v.Args[0]
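Note: the ten rewrite helpers above are generated from the rules file and differ only in the op constants; each one re-tags the generic value as the machine op in place, copying the immediate and both arguments. Hand-written, the shared shape would collapse to something like this (a sketch, not generated code; the helpers it calls are the ones already used above):

    // rewriteSet128 re-tags a generic Set128 value as its machine op.
    // op would be OpAMD64VINSERTF128256 for the float shapes and
    // OpAMD64VINSERTI128256 for the integer ones.
    func rewriteSet128(v *Value, op Op) bool {
        a := auxIntToInt8(v.AuxInt)
        x, y := v.Args[0], v.Args[1]
        v.reset(op)                // becomes the machine op in place
        v.AuxInt = int8ToAuxInt(a) // immediate lane selector survives
        v.AddArg2(x, y)
        return true
    }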
diff --git a/src/cmd/compile/internal/ssagen/simdintrinsics.go b/src/cmd/compile/internal/ssagen/simdintrinsics.go
index 9837f07fc47..3d0e6fbd4aa 100644
--- a/src/cmd/compile/internal/ssagen/simdintrinsics.go
+++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go
@@ -1463,6 +1463,16 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
 	addF(simdPackage, "Uint32x4.SaturatedUnsignedSignedQuadDotProdAccumulate", opLen3(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateUint32x4, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint32x8.SaturatedUnsignedSignedQuadDotProdAccumulate", opLen3(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateUint32x8, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint32x16.SaturatedUnsignedSignedQuadDotProdAccumulate", opLen3(ssa.OpSaturatedUnsignedSignedQuadDotProdAccumulateUint32x16, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Float32x8.Set128", opLen2Imm8(ssa.OpSet128Float32x8, types.TypeVec256, 0), sys.AMD64)
+	addF(simdPackage, "Float64x4.Set128", opLen2Imm8(ssa.OpSet128Float64x4, types.TypeVec256, 0), sys.AMD64)
+	addF(simdPackage, "Int8x32.Set128", opLen2Imm8(ssa.OpSet128Int8x32, types.TypeVec256, 0), sys.AMD64)
+	addF(simdPackage, "Int16x16.Set128", opLen2Imm8(ssa.OpSet128Int16x16, types.TypeVec256, 0), sys.AMD64)
+	addF(simdPackage, "Int32x8.Set128", opLen2Imm8(ssa.OpSet128Int32x8, types.TypeVec256, 0), sys.AMD64)
+	addF(simdPackage, "Int64x4.Set128", opLen2Imm8(ssa.OpSet128Int64x4, types.TypeVec256, 0), sys.AMD64)
+	addF(simdPackage, "Uint8x32.Set128", opLen2Imm8(ssa.OpSet128Uint8x32, types.TypeVec256, 0), sys.AMD64)
+	addF(simdPackage, "Uint16x16.Set128", opLen2Imm8(ssa.OpSet128Uint16x16, types.TypeVec256, 0), sys.AMD64)
+	addF(simdPackage, "Uint32x8.Set128", opLen2Imm8(ssa.OpSet128Uint32x8, types.TypeVec256, 0), sys.AMD64)
+	addF(simdPackage, "Uint64x4.Set128", opLen2Imm8(ssa.OpSet128Uint64x4, types.TypeVec256, 0), sys.AMD64)
 	addF(simdPackage, "Int8x16.SetElem", opLen2Imm8(ssa.OpSetElemInt8x16, types.TypeVec128, 0), sys.AMD64)
 	addF(simdPackage, "Int16x8.SetElem", opLen2Imm8(ssa.OpSetElemInt16x8, types.TypeVec128, 0), sys.AMD64)
 	addF(simdPackage, "Int32x4.SetElem", opLen2Imm8(ssa.OpSetElemInt32x4, types.TypeVec128, 0), sys.AMD64)
diff --git a/src/simd/simd_test.go b/src/simd/simd_test.go
index 59908d60c52..f99938bb9d2 100644
--- a/src/simd/simd_test.go
+++ b/src/simd/simd_test.go
@@ -193,6 +193,22 @@ func TestSlicesInt8GetElem(t *testing.T) {
 	}
 }
 
+
+func TestSlicesInt8Set128(t *testing.T) {
+	a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+		17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
+	v := simd.LoadInt8x16Slice(a) // 1-16
+	u := simd.LoadInt8x32Slice(a) // 1-32
+
+	w := u.Set128(1, v) // 1-16:1-16
+
+	b := make([]int8, 32, 32)
+	w.StoreSlice(b)
+
+	checkInt8Slices(t, a, b[:16])
+	checkInt8Slices(t, a, b[16:])
+}
+
 func TestSlicesInt8TooShortLoad(t *testing.T) {
 	defer func() {
 		if r := recover(); r != nil {
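Note: the test covers only imm=1, placing v in the high half so both 16-byte halves of w hold 1-16. The low-half case is symmetric: with imm=0 the result reproduces a exactly, since v already equals a's first sixteen lanes. A companion sketch under the same assumed helpers (hypothetical test, not part of this change):

    func TestSlicesInt8Set128Low(t *testing.T) {
        a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
        v := simd.LoadInt8x16Slice(a) // 1-16
        u := simd.LoadInt8x32Slice(a) // 1-32

        w := u.Set128(0, v) // low half replaced: result equals a again

        b := make([]int8, 32)
        w.StoreSlice(b)

        checkInt8Slices(t, a, b[:16])
        checkInt8Slices(t, a[16:], b[16:])
    }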
diff --git a/src/simd/simd_wrapped_test.go b/src/simd/simd_wrapped_test.go
index 321d3bb80a4..4a8c0957e5b 100644
--- a/src/simd/simd_wrapped_test.go
+++ b/src/simd/simd_wrapped_test.go
@@ -7975,6 +7975,7 @@ func testUint64x8UnaryMasked(t *testing.T, v0 []uint64, v1 []int64, want []uint6
 // RotateAllLeft
 // RotateAllRight
 // RoundWithPrecision
+// Set128
 // SetElem
 // ShiftAllLeft
 // ShiftAllLeftAndFillUpperFrom
diff --git a/src/simd/stubs_amd64.go b/src/simd/stubs_amd64.go
index f53242cd738..de54a9ada48 100644
--- a/src/simd/stubs_amd64.go
+++ b/src/simd/stubs_amd64.go
@@ -7682,6 +7682,58 @@ func (x Uint32x8) SaturatedUnsignedSignedQuadDotProdAccumulate(y Uint8x32, z Int8x32) Uint32x8
 // Asm: VPDPBUSDS, CPU Feature: AVX512EVEX
 func (x Uint32x16) SaturatedUnsignedSignedQuadDotProdAccumulate(y Uint8x64, z Int8x64) Uint32x16
 
+/* Set128 */
+
+// Set128 combines a 128-bit vector with a 256-bit vector, where the constant operand specifies whether the low (0) or high (1) half receives the smaller vector.
+//
+// Asm: VINSERTF128, CPU Feature: AVX
+func (x Float32x8) Set128(imm uint8, y Float32x4) Float32x8
+
+// Set128 combines a 128-bit vector with a 256-bit vector, where the constant operand specifies whether the low (0) or high (1) half receives the smaller vector.
+//
+// Asm: VINSERTF128, CPU Feature: AVX
+func (x Float64x4) Set128(imm uint8, y Float64x2) Float64x4
+
+// Set128 combines a 128-bit vector with a 256-bit vector, where the constant operand specifies whether the low (0) or high (1) half receives the smaller vector.
+//
+// Asm: VINSERTI128, CPU Feature: AVX2
+func (x Int8x32) Set128(imm uint8, y Int8x16) Int8x32
+
+// Set128 combines a 128-bit vector with a 256-bit vector, where the constant operand specifies whether the low (0) or high (1) half receives the smaller vector.
+//
+// Asm: VINSERTI128, CPU Feature: AVX2
+func (x Int16x16) Set128(imm uint8, y Int16x8) Int16x16
+
+// Set128 combines a 128-bit vector with a 256-bit vector, where the constant operand specifies whether the low (0) or high (1) half receives the smaller vector.
+//
+// Asm: VINSERTI128, CPU Feature: AVX2
+func (x Int32x8) Set128(imm uint8, y Int32x4) Int32x8
+
+// Set128 combines a 128-bit vector with a 256-bit vector, where the constant operand specifies whether the low (0) or high (1) half receives the smaller vector.
+//
+// Asm: VINSERTI128, CPU Feature: AVX2
+func (x Int64x4) Set128(imm uint8, y Int64x2) Int64x4
+
+// Set128 combines a 128-bit vector with a 256-bit vector, where the constant operand specifies whether the low (0) or high (1) half receives the smaller vector.
+//
+// Asm: VINSERTI128, CPU Feature: AVX2
+func (x Uint8x32) Set128(imm uint8, y Uint8x16) Uint8x32
+
+// Set128 combines a 128-bit vector with a 256-bit vector, where the constant operand specifies whether the low (0) or high (1) half receives the smaller vector.
+//
+// Asm: VINSERTI128, CPU Feature: AVX2
+func (x Uint16x16) Set128(imm uint8, y Uint16x8) Uint16x16
+
+// Set128 combines a 128-bit vector with a 256-bit vector, where the constant operand specifies whether the low (0) or high (1) half receives the smaller vector.
+//
+// Asm: VINSERTI128, CPU Feature: AVX2
+func (x Uint32x8) Set128(imm uint8, y Uint32x4) Uint32x8
+
+// Set128 combines a 128-bit vector with a 256-bit vector, where the constant operand specifies whether the low (0) or high (1) half receives the smaller vector.
+//
+// Asm: VINSERTI128, CPU Feature: AVX2
+func (x Uint64x4) Set128(imm uint8, y Uint64x2) Uint64x4
+
 /* SetElem */
 
 // SetElem sets a single constant-indexed element's value.
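Note: the user-visible result of this change is one new method per 256-bit vector type, Set128(imm, half); the matching extraction direction is not part of this diff. Because the immediate is consumed as an SSA aux value, it must be a compile-time constant. A minimal caller-side sketch (hypothetical helper; assumes the package API shown above and that the vector types' zero value is all-zero lanes):

    // concat builds an Int32x8 from two Int32x4 halves.
    func concat(lo, hi simd.Int32x4) simd.Int32x8 {
        var z simd.Int32x8     // zero 256-bit vector
        z = z.Set128(0, lo)    // lanes 0-3
        return z.Set128(1, hi) // lanes 4-7
    }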