[dev.simd] cmd/compile: add EXTRACT[IF]128 instructions

This is generated by simdgen CL 684080 and should be submitted after it.
Also includes tests.

Change-Id: I1d680911134d8fb92f4deccae4ec373f3ed9f752
Reviewed-on: https://go-review.googlesource.com/c/go/+/684115
Reviewed-by: Junyang Shao <shaojunyang@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
2025-12-08 06:10:04 +00:00 · 2025-06-25 18:20:50 -04:00 · 2025-06-25 18:20:50 -04:00 · 43a61aef56
commit 43a61aef56
parent 292db9b676
10 changed files with 425 additions and 0 deletions
--- a/src/cmd/compile/internal/amd64/simdssa.go
+++ b/src/cmd/compile/internal/amd64/simdssa.go
@ -655,6 +655,8 @@ func ssaGenSIMDValue(s *ssagen.State, v *ssa.Value) bool {
 		ssa.OpAMD64VREDUCEPD128,
 		ssa.OpAMD64VREDUCEPD256,
 		ssa.OpAMD64VREDUCEPD512,
+		ssa.OpAMD64VEXTRACTF128128,
+		ssa.OpAMD64VEXTRACTI128128,
 		ssa.OpAMD64VPROLD128,
 		ssa.OpAMD64VPROLD256,
 		ssa.OpAMD64VPROLD512,
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64.rules
@ -224,6 +224,16 @@
 (GaloisFieldMulUint8x16 ...) => (VGF2P8MULB128 ...)
 (GaloisFieldMulUint8x32 ...) => (VGF2P8MULB256 ...)
 (GaloisFieldMulUint8x64 ...) => (VGF2P8MULB512 ...)
+(Get128Float32x8 [a] x) => (VEXTRACTF128128 [a] x)
+(Get128Float64x4 [a] x) => (VEXTRACTF128128 [a] x)
+(Get128Int8x32 [a] x) => (VEXTRACTI128128 [a] x)
+(Get128Int16x16 [a] x) => (VEXTRACTI128128 [a] x)
+(Get128Int32x8 [a] x) => (VEXTRACTI128128 [a] x)
+(Get128Int64x4 [a] x) => (VEXTRACTI128128 [a] x)
+(Get128Uint8x32 [a] x) => (VEXTRACTI128128 [a] x)
+(Get128Uint16x16 [a] x) => (VEXTRACTI128128 [a] x)
+(Get128Uint32x8 [a] x) => (VEXTRACTI128128 [a] x)
+(Get128Uint64x4 [a] x) => (VEXTRACTI128128 [a] x)
 (GetElemInt8x16 [a] x) => (VPEXTRB128 [a] x)
 (GetElemInt16x8 [a] x) => (VPEXTRW128 [a] x)
 (GetElemInt32x4 [a] x) => (VPEXTRD128 [a] x)
--- a/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdAMD64ops.go
@ -765,6 +765,7 @@ func simdAMD64Ops(fp11, fp21, fp2k, fpkfp, fp2kfp, fp2kk, fp31, fp3kfp, fpgpfp,
 		{name: "VRNDSCALEPS256", argLength: 1, reg: fp11, asm: "VRNDSCALEPS", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VREDUCEPS256", argLength: 1, reg: fp11, asm: "VREDUCEPS", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VCMPPS256", argLength: 2, reg: fp21, asm: "VCMPPS", aux: "Int8", commutative: true, typ: "Vec256", resultInArg0: false},
+		{name: "VEXTRACTF128128", argLength: 1, reg: fp11, asm: "VEXTRACTF128", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VRNDSCALEPSMasked256", argLength: 2, reg: fpkfp, asm: "VRNDSCALEPS", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VREDUCEPSMasked256", argLength: 2, reg: fpkfp, asm: "VREDUCEPS", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false},
 		{name: "VCMPPSMasked256", argLength: 3, reg: fp2kk, asm: "VCMPPS", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
@ -878,6 +879,7 @@ func simdAMD64Ops(fp11, fp21, fp2k, fpkfp, fp2kfp, fp2kk, fp31, fp3kfp, fpgpfp,
 		{name: "VPCMPB128", argLength: 2, reg: fp2k, asm: "VPCMPB", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
 		{name: "VPCMPBMasked128", argLength: 3, reg: fp2kk, asm: "VPCMPB", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
 		{name: "VPINSRB128", argLength: 2, reg: fpgpfp, asm: "VPINSRB", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false},
+		{name: "VEXTRACTI128128", argLength: 1, reg: fp11, asm: "VEXTRACTI128", aux: "Int8", commutative: false, typ: "Vec128", resultInArg0: false},
 		{name: "VPCMPB256", argLength: 2, reg: fp2k, asm: "VPCMPB", aux: "Int8", commutative: false, typ: "Mask", resultInArg0: false},
 		{name: "VPCMPBMasked256", argLength: 3, reg: fp2kk, asm: "VPCMPB", aux: "Int8", commutative: true, typ: "Mask", resultInArg0: false},
 		{name: "VINSERTI128256", argLength: 2, reg: fp21, asm: "VINSERTI128", aux: "Int8", commutative: false, typ: "Vec256", resultInArg0: false},
--- a/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
+++ b/src/cmd/compile/internal/ssa/_gen/simdgenericOps.go
@ -1502,6 +1502,7 @@ func simdGenericOps() []opData {
 		{name: "DiffWithRoundWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
 		{name: "DiffWithTruncWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
 		{name: "FloorWithPrecisionFloat32x8", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "Get128Float32x8", argLength: 1, commutative: false, aux: "Int8"},
 		{name: "MaskedCeilWithPrecisionFloat32x8", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "MaskedDiffWithCeilWithPrecisionFloat32x8", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "MaskedDiffWithFloorWithPrecisionFloat32x8", argLength: 2, commutative: false, aux: "Int8"},
@ -1535,6 +1536,7 @@ func simdGenericOps() []opData {
 		{name: "DiffWithRoundWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
 		{name: "DiffWithTruncWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
 		{name: "FloorWithPrecisionFloat64x4", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "Get128Float64x4", argLength: 1, commutative: false, aux: "Int8"},
 		{name: "MaskedCeilWithPrecisionFloat64x4", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "MaskedDiffWithCeilWithPrecisionFloat64x4", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "MaskedDiffWithFloorWithPrecisionFloat64x4", argLength: 2, commutative: false, aux: "Int8"},
@ -1562,6 +1564,7 @@ func simdGenericOps() []opData {
 		{name: "MaskedTruncWithPrecisionFloat64x8", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "RoundWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
 		{name: "TruncWithPrecisionFloat64x8", argLength: 1, commutative: false, aux: "Int8"},
+		{name: "Get128Int16x16", argLength: 1, commutative: false, aux: "Int8"},
 		{name: "MaskedShiftAllLeftAndFillUpperFromInt16x16", argLength: 3, commutative: false, aux: "Int8"},
 		{name: "MaskedShiftAllRightAndFillUpperFromInt16x16", argLength: 3, commutative: false, aux: "Int8"},
 		{name: "Set128Int16x16", argLength: 2, commutative: false, aux: "Int8"},
@ -1595,6 +1598,7 @@ func simdGenericOps() []opData {
 		{name: "SetElemInt32x4", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "ShiftAllLeftAndFillUpperFromInt32x4", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "ShiftAllRightAndFillUpperFromInt32x4", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "Get128Int32x8", argLength: 1, commutative: false, aux: "Int8"},
 		{name: "MaskedRotateAllLeftInt32x8", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "MaskedRotateAllRightInt32x8", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "MaskedShiftAllLeftAndFillUpperFromInt32x8", argLength: 3, commutative: false, aux: "Int8"},
@ -1614,6 +1618,7 @@ func simdGenericOps() []opData {
 		{name: "SetElemInt64x2", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "ShiftAllLeftAndFillUpperFromInt64x2", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "ShiftAllRightAndFillUpperFromInt64x2", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "Get128Int64x4", argLength: 1, commutative: false, aux: "Int8"},
 		{name: "MaskedRotateAllLeftInt64x4", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "MaskedRotateAllRightInt64x4", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "MaskedShiftAllLeftAndFillUpperFromInt64x4", argLength: 3, commutative: false, aux: "Int8"},
@ -1633,7 +1638,9 @@ func simdGenericOps() []opData {
 		{name: "ShiftAllRightAndFillUpperFromInt64x8", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "GetElemInt8x16", argLength: 1, commutative: false, aux: "Int8"},
 		{name: "SetElemInt8x16", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "Get128Int8x32", argLength: 1, commutative: false, aux: "Int8"},
 		{name: "Set128Int8x32", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "Get128Uint16x16", argLength: 1, commutative: false, aux: "Int8"},
 		{name: "MaskedShiftAllLeftAndFillUpperFromUint16x16", argLength: 3, commutative: false, aux: "Int8"},
 		{name: "MaskedShiftAllRightAndFillUpperFromUint16x16", argLength: 3, commutative: false, aux: "Int8"},
 		{name: "Set128Uint16x16", argLength: 2, commutative: false, aux: "Int8"},
@ -1667,6 +1674,7 @@ func simdGenericOps() []opData {
 		{name: "SetElemUint32x4", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "ShiftAllLeftAndFillUpperFromUint32x4", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "ShiftAllRightAndFillUpperFromUint32x4", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "Get128Uint32x8", argLength: 1, commutative: false, aux: "Int8"},
 		{name: "MaskedRotateAllLeftUint32x8", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "MaskedRotateAllRightUint32x8", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "MaskedShiftAllLeftAndFillUpperFromUint32x8", argLength: 3, commutative: false, aux: "Int8"},
@ -1686,6 +1694,7 @@ func simdGenericOps() []opData {
 		{name: "SetElemUint64x2", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "ShiftAllLeftAndFillUpperFromUint64x2", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "ShiftAllRightAndFillUpperFromUint64x2", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "Get128Uint64x4", argLength: 1, commutative: false, aux: "Int8"},
 		{name: "MaskedRotateAllLeftUint64x4", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "MaskedRotateAllRightUint64x4", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "MaskedShiftAllLeftAndFillUpperFromUint64x4", argLength: 3, commutative: false, aux: "Int8"},
@ -1711,6 +1720,7 @@ func simdGenericOps() []opData {
 		{name: "SetElemUint8x16", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "GaloisFieldAffineTransformUint8x32", argLength: 2, commutative: false, aux: "Int8"},
 		{name: "GaloisFieldAffineTransformInversedUint8x32", argLength: 2, commutative: false, aux: "Int8"},
+		{name: "Get128Uint8x32", argLength: 1, commutative: false, aux: "Int8"},
 		{name: "MaskedGaloisFieldAffineTransformUint8x32", argLength: 3, commutative: false, aux: "Int8"},
 		{name: "MaskedGaloisFieldAffineTransformInversedUint8x32", argLength: 3, commutative: false, aux: "Int8"},
 		{name: "Set128Uint8x32", argLength: 2, commutative: false, aux: "Int8"},
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@ -1958,6 +1958,7 @@ const (
 	OpAMD64VRNDSCALEPS256
 	OpAMD64VREDUCEPS256
 	OpAMD64VCMPPS256
+	OpAMD64VEXTRACTF128128
 	OpAMD64VRNDSCALEPSMasked256
 	OpAMD64VREDUCEPSMasked256
 	OpAMD64VCMPPSMasked256
@ -2071,6 +2072,7 @@ const (
 	OpAMD64VPCMPB128
 	OpAMD64VPCMPBMasked128
 	OpAMD64VPINSRB128
+	OpAMD64VEXTRACTI128128
 	OpAMD64VPCMPB256
 	OpAMD64VPCMPBMasked256
 	OpAMD64VINSERTI128256
@ -5837,6 +5839,7 @@ const (
 	OpDiffWithRoundWithPrecisionFloat32x8
 	OpDiffWithTruncWithPrecisionFloat32x8
 	OpFloorWithPrecisionFloat32x8
+	OpGet128Float32x8
 	OpMaskedCeilWithPrecisionFloat32x8
 	OpMaskedDiffWithCeilWithPrecisionFloat32x8
 	OpMaskedDiffWithFloorWithPrecisionFloat32x8
@ -5870,6 +5873,7 @@ const (
 	OpDiffWithRoundWithPrecisionFloat64x4
 	OpDiffWithTruncWithPrecisionFloat64x4
 	OpFloorWithPrecisionFloat64x4
+	OpGet128Float64x4
 	OpMaskedCeilWithPrecisionFloat64x4
 	OpMaskedDiffWithCeilWithPrecisionFloat64x4
 	OpMaskedDiffWithFloorWithPrecisionFloat64x4
@ -5897,6 +5901,7 @@ const (
 	OpMaskedTruncWithPrecisionFloat64x8
 	OpRoundWithPrecisionFloat64x8
 	OpTruncWithPrecisionFloat64x8
+	OpGet128Int16x16
 	OpMaskedShiftAllLeftAndFillUpperFromInt16x16
 	OpMaskedShiftAllRightAndFillUpperFromInt16x16
 	OpSet128Int16x16
@ -5930,6 +5935,7 @@ const (
 	OpSetElemInt32x4
 	OpShiftAllLeftAndFillUpperFromInt32x4
 	OpShiftAllRightAndFillUpperFromInt32x4
+	OpGet128Int32x8
 	OpMaskedRotateAllLeftInt32x8
 	OpMaskedRotateAllRightInt32x8
 	OpMaskedShiftAllLeftAndFillUpperFromInt32x8
@ -5949,6 +5955,7 @@ const (
 	OpSetElemInt64x2
 	OpShiftAllLeftAndFillUpperFromInt64x2
 	OpShiftAllRightAndFillUpperFromInt64x2
+	OpGet128Int64x4
 	OpMaskedRotateAllLeftInt64x4
 	OpMaskedRotateAllRightInt64x4
 	OpMaskedShiftAllLeftAndFillUpperFromInt64x4
@ -5968,7 +5975,9 @@ const (
 	OpShiftAllRightAndFillUpperFromInt64x8
 	OpGetElemInt8x16
 	OpSetElemInt8x16
+	OpGet128Int8x32
 	OpSet128Int8x32
+	OpGet128Uint16x16
 	OpMaskedShiftAllLeftAndFillUpperFromUint16x16
 	OpMaskedShiftAllRightAndFillUpperFromUint16x16
 	OpSet128Uint16x16
@ -6002,6 +6011,7 @@ const (
 	OpSetElemUint32x4
 	OpShiftAllLeftAndFillUpperFromUint32x4
 	OpShiftAllRightAndFillUpperFromUint32x4
+	OpGet128Uint32x8
 	OpMaskedRotateAllLeftUint32x8
 	OpMaskedRotateAllRightUint32x8
 	OpMaskedShiftAllLeftAndFillUpperFromUint32x8
@ -6021,6 +6031,7 @@ const (
 	OpSetElemUint64x2
 	OpShiftAllLeftAndFillUpperFromUint64x2
 	OpShiftAllRightAndFillUpperFromUint64x2
+	OpGet128Uint64x4
 	OpMaskedRotateAllLeftUint64x4
 	OpMaskedRotateAllRightUint64x4
 	OpMaskedShiftAllLeftAndFillUpperFromUint64x4
@ -6046,6 +6057,7 @@ const (
 	OpSetElemUint8x16
 	OpGaloisFieldAffineTransformUint8x32
 	OpGaloisFieldAffineTransformInversedUint8x32
+	OpGet128Uint8x32
 	OpMaskedGaloisFieldAffineTransformUint8x32
 	OpMaskedGaloisFieldAffineTransformInversedUint8x32
 	OpSet128Uint8x32
@ -30096,6 +30108,20 @@ var opcodeTable = [...]opInfo{
 			},
 		},
 	},
+	{
+		name:    "VEXTRACTF128128",
+		auxType: auxInt8,
+		argLen:  1,
+		asm:     x86.AVEXTRACTF128,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
 	{
 		name:    "VRNDSCALEPSMasked256",
 		auxType: auxInt8,
@ -31820,6 +31846,20 @@ var opcodeTable = [...]opInfo{
 			},
 		},
 	},
+	{
+		name:    "VEXTRACTI128128",
+		auxType: auxInt8,
+		argLen:  1,
+		asm:     x86.AVEXTRACTI128,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+			outputs: []outputInfo{
+				{0, 2147418112}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
+			},
+		},
+	},
 	{
 		name:    "VPCMPB256",
 		auxType: auxInt8,
@ -67706,6 +67746,12 @@ var opcodeTable = [...]opInfo{
 		argLen:  1,
 		generic: true,
 	},
+	{
+		name:    "Get128Float32x8",
+		auxType: auxInt8,
+		argLen:  1,
+		generic: true,
+	},
 	{
 		name:    "MaskedCeilWithPrecisionFloat32x8",
 		auxType: auxInt8,
@ -67904,6 +67950,12 @@ var opcodeTable = [...]opInfo{
 		argLen:  1,
 		generic: true,
 	},
+	{
+		name:    "Get128Float64x4",
+		auxType: auxInt8,
+		argLen:  1,
+		generic: true,
+	},
 	{
 		name:    "MaskedCeilWithPrecisionFloat64x4",
 		auxType: auxInt8,
@ -68066,6 +68118,12 @@ var opcodeTable = [...]opInfo{
 		argLen:  1,
 		generic: true,
 	},
+	{
+		name:    "Get128Int16x16",
+		auxType: auxInt8,
+		argLen:  1,
+		generic: true,
+	},
 	{
 		name:    "MaskedShiftAllLeftAndFillUpperFromInt16x16",
 		auxType: auxInt8,
@ -68264,6 +68322,12 @@ var opcodeTable = [...]opInfo{
 		argLen:  2,
 		generic: true,
 	},
+	{
+		name:    "Get128Int32x8",
+		auxType: auxInt8,
+		argLen:  1,
+		generic: true,
+	},
 	{
 		name:    "MaskedRotateAllLeftInt32x8",
 		auxType: auxInt8,
@ -68378,6 +68442,12 @@ var opcodeTable = [...]opInfo{
 		argLen:  2,
 		generic: true,
 	},
+	{
+		name:    "Get128Int64x4",
+		auxType: auxInt8,
+		argLen:  1,
+		generic: true,
+	},
 	{
 		name:    "MaskedRotateAllLeftInt64x4",
 		auxType: auxInt8,
@ -68492,12 +68562,24 @@ var opcodeTable = [...]opInfo{
 		argLen:  2,
 		generic: true,
 	},
+	{
+		name:    "Get128Int8x32",
+		auxType: auxInt8,
+		argLen:  1,
+		generic: true,
+	},
 	{
 		name:    "Set128Int8x32",
 		auxType: auxInt8,
 		argLen:  2,
 		generic: true,
 	},
+	{
+		name:    "Get128Uint16x16",
+		auxType: auxInt8,
+		argLen:  1,
+		generic: true,
+	},
 	{
 		name:    "MaskedShiftAllLeftAndFillUpperFromUint16x16",
 		auxType: auxInt8,
@ -68696,6 +68778,12 @@ var opcodeTable = [...]opInfo{
 		argLen:  2,
 		generic: true,
 	},
+	{
+		name:    "Get128Uint32x8",
+		auxType: auxInt8,
+		argLen:  1,
+		generic: true,
+	},
 	{
 		name:    "MaskedRotateAllLeftUint32x8",
 		auxType: auxInt8,
@ -68810,6 +68898,12 @@ var opcodeTable = [...]opInfo{
 		argLen:  2,
 		generic: true,
 	},
+	{
+		name:    "Get128Uint64x4",
+		auxType: auxInt8,
+		argLen:  1,
+		generic: true,
+	},
 	{
 		name:    "MaskedRotateAllLeftUint64x4",
 		auxType: auxInt8,
@ -68960,6 +69054,12 @@ var opcodeTable = [...]opInfo{
 		argLen:  2,
 		generic: true,
 	},
+	{
+		name:    "Get128Uint8x32",
+		auxType: auxInt8,
+		argLen:  1,
+		generic: true,
+	},
 	{
 		name:    "MaskedGaloisFieldAffineTransformUint8x32",
 		auxType: auxInt8,
--- a/src/cmd/compile/internal/ssa/rewriteAMD64.go
+++ b/src/cmd/compile/internal/ssa/rewriteAMD64.go
@ -1388,6 +1388,26 @@ func rewriteValueAMD64(v *Value) bool {
 	case OpGaloisFieldMulUint8x64:
 		v.Op = OpAMD64VGF2P8MULB512
 		return true
+	case OpGet128Float32x8:
+		return rewriteValueAMD64_OpGet128Float32x8(v)
+	case OpGet128Float64x4:
+		return rewriteValueAMD64_OpGet128Float64x4(v)
+	case OpGet128Int16x16:
+		return rewriteValueAMD64_OpGet128Int16x16(v)
+	case OpGet128Int32x8:
+		return rewriteValueAMD64_OpGet128Int32x8(v)
+	case OpGet128Int64x4:
+		return rewriteValueAMD64_OpGet128Int64x4(v)
+	case OpGet128Int8x32:
+		return rewriteValueAMD64_OpGet128Int8x32(v)
+	case OpGet128Uint16x16:
+		return rewriteValueAMD64_OpGet128Uint16x16(v)
+	case OpGet128Uint32x8:
+		return rewriteValueAMD64_OpGet128Uint32x8(v)
+	case OpGet128Uint64x4:
+		return rewriteValueAMD64_OpGet128Uint64x4(v)
+	case OpGet128Uint8x32:
+		return rewriteValueAMD64_OpGet128Uint8x32(v)
 	case OpGetCallerPC:
 		v.Op = OpAMD64LoweredGetCallerPC
 		return true
@ -30999,6 +31019,136 @@ func rewriteValueAMD64_OpGaloisFieldAffineTransformUint8x64(v *Value) bool {
 		return true
 	}
 }
+func rewriteValueAMD64_OpGet128Float32x8(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (Get128Float32x8 [a] x)
+	// result: (VEXTRACTF128128 [a] x)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		v.reset(OpAMD64VEXTRACTF128128)
+		v.AuxInt = int8ToAuxInt(a)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueAMD64_OpGet128Float64x4(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (Get128Float64x4 [a] x)
+	// result: (VEXTRACTF128128 [a] x)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		v.reset(OpAMD64VEXTRACTF128128)
+		v.AuxInt = int8ToAuxInt(a)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueAMD64_OpGet128Int16x16(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (Get128Int16x16 [a] x)
+	// result: (VEXTRACTI128128 [a] x)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		v.reset(OpAMD64VEXTRACTI128128)
+		v.AuxInt = int8ToAuxInt(a)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueAMD64_OpGet128Int32x8(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (Get128Int32x8 [a] x)
+	// result: (VEXTRACTI128128 [a] x)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		v.reset(OpAMD64VEXTRACTI128128)
+		v.AuxInt = int8ToAuxInt(a)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueAMD64_OpGet128Int64x4(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (Get128Int64x4 [a] x)
+	// result: (VEXTRACTI128128 [a] x)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		v.reset(OpAMD64VEXTRACTI128128)
+		v.AuxInt = int8ToAuxInt(a)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueAMD64_OpGet128Int8x32(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (Get128Int8x32 [a] x)
+	// result: (VEXTRACTI128128 [a] x)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		v.reset(OpAMD64VEXTRACTI128128)
+		v.AuxInt = int8ToAuxInt(a)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueAMD64_OpGet128Uint16x16(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (Get128Uint16x16 [a] x)
+	// result: (VEXTRACTI128128 [a] x)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		v.reset(OpAMD64VEXTRACTI128128)
+		v.AuxInt = int8ToAuxInt(a)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueAMD64_OpGet128Uint32x8(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (Get128Uint32x8 [a] x)
+	// result: (VEXTRACTI128128 [a] x)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		v.reset(OpAMD64VEXTRACTI128128)
+		v.AuxInt = int8ToAuxInt(a)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueAMD64_OpGet128Uint64x4(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (Get128Uint64x4 [a] x)
+	// result: (VEXTRACTI128128 [a] x)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		v.reset(OpAMD64VEXTRACTI128128)
+		v.AuxInt = int8ToAuxInt(a)
+		v.AddArg(x)
+		return true
+	}
+}
+func rewriteValueAMD64_OpGet128Uint8x32(v *Value) bool {
+	v_0 := v.Args[0]
+	// match: (Get128Uint8x32 [a] x)
+	// result: (VEXTRACTI128128 [a] x)
+	for {
+		a := auxIntToInt8(v.AuxInt)
+		x := v_0
+		v.reset(OpAMD64VEXTRACTI128128)
+		v.AuxInt = int8ToAuxInt(a)
+		v.AddArg(x)
+		return true
+	}
+}
 func rewriteValueAMD64_OpGetElemInt16x8(v *Value) bool {
 	v_0 := v.Args[0]
 	// match: (GetElemInt16x8 [a] x)
--- a/src/cmd/compile/internal/ssagen/simdintrinsics.go
+++ b/src/cmd/compile/internal/ssagen/simdintrinsics.go
@ -235,6 +235,16 @@ func simdIntrinsics(addF func(pkg, fn string, b intrinsicBuilder, archFamilies .
 	addF(simdPackage, "Uint8x16.GaloisFieldMul", opLen2(ssa.OpGaloisFieldMulUint8x16, types.TypeVec128), sys.AMD64)
 	addF(simdPackage, "Uint8x32.GaloisFieldMul", opLen2(ssa.OpGaloisFieldMulUint8x32, types.TypeVec256), sys.AMD64)
 	addF(simdPackage, "Uint8x64.GaloisFieldMul", opLen2(ssa.OpGaloisFieldMulUint8x64, types.TypeVec512), sys.AMD64)
+	addF(simdPackage, "Float32x8.Get128", opLen1Imm8(ssa.OpGet128Float32x8, types.TypeVec128, 0), sys.AMD64)
+	addF(simdPackage, "Float64x4.Get128", opLen1Imm8(ssa.OpGet128Float64x4, types.TypeVec128, 0), sys.AMD64)
+	addF(simdPackage, "Int8x32.Get128", opLen1Imm8(ssa.OpGet128Int8x32, types.TypeVec128, 0), sys.AMD64)
+	addF(simdPackage, "Int16x16.Get128", opLen1Imm8(ssa.OpGet128Int16x16, types.TypeVec128, 0), sys.AMD64)
+	addF(simdPackage, "Int32x8.Get128", opLen1Imm8(ssa.OpGet128Int32x8, types.TypeVec128, 0), sys.AMD64)
+	addF(simdPackage, "Int64x4.Get128", opLen1Imm8(ssa.OpGet128Int64x4, types.TypeVec128, 0), sys.AMD64)
+	addF(simdPackage, "Uint8x32.Get128", opLen1Imm8(ssa.OpGet128Uint8x32, types.TypeVec128, 0), sys.AMD64)
+	addF(simdPackage, "Uint16x16.Get128", opLen1Imm8(ssa.OpGet128Uint16x16, types.TypeVec128, 0), sys.AMD64)
+	addF(simdPackage, "Uint32x8.Get128", opLen1Imm8(ssa.OpGet128Uint32x8, types.TypeVec128, 0), sys.AMD64)
+	addF(simdPackage, "Uint64x4.Get128", opLen1Imm8(ssa.OpGet128Uint64x4, types.TypeVec128, 0), sys.AMD64)
 	addF(simdPackage, "Int8x16.GetElem", opLen1Imm8(ssa.OpGetElemInt8x16, types.Types[types.TINT8], 0), sys.AMD64)
 	addF(simdPackage, "Int16x8.GetElem", opLen1Imm8(ssa.OpGetElemInt16x8, types.Types[types.TINT16], 0), sys.AMD64)
 	addF(simdPackage, "Int32x4.GetElem", opLen1Imm8(ssa.OpGetElemInt32x4, types.Types[types.TINT32], 0), sys.AMD64)
--- a/src/simd/simd_test.go
+++ b/src/simd/simd_test.go
@ -161,6 +161,22 @@ func checkInt8Slices(t *testing.T, a, b []int8) {
 	}
 }

+func checkFloat32Slices(t *testing.T, a, b []float32) {
+	for i := range b { // only len(b) elements are compared; a may be longer but must not be shorter
+		if a[i] != b[i] {
+			t.Errorf("a and b differ at index %d, a=%3.0f, b=%3.0f", i, a[i], b[i])
+		}
+	}
+}
+
+func checkFloat64Slices(t *testing.T, a, b []float64) {
+	for i := range b { // only len(b) elements are compared; a may be longer but must not be shorter
+		if a[i] != b[i] {
+			t.Errorf("a and b differ at index %d, a=%3.0f, b=%3.0f", i, a[i], b[i])
+		}
+	}
+}
+
 func TestSlicesInt8(t *testing.T) {
 	a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
 		17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
@ -209,6 +225,78 @@ func TestSlicesInt8Set128(t *testing.T) {
 	checkInt8Slices(t, a, b[16:])
 }

+func TestSlicesInt8Get128(t *testing.T) {
+	a := []int8{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+		17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
+	u := simd.LoadInt8x32Slice(a) // 1-32
+	v := u.Get128(0)              // 1-16
+	w := u.Get128(1)              // 17-32
+
+	b := make([]int8, 32, 32)
+	v.StoreSlice(b[:16])
+	w.StoreSlice(b[16:])
+
+	checkInt8Slices(t, a, b) // the two halves, stored back to back, are expected to reproduce a
+}
+
+func TestSlicesFloat32Set128(t *testing.T) {
+	a := []float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+		17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
+	v := simd.LoadFloat32x4Slice(a) // 1-4
+	u := simd.LoadFloat32x8Slice(a) // 1-8
+
+	w := u.Set128(1, v) // 1-4:1-4
+
+	b := make([]float32, 8, 8)
+	w.StoreSlice(b)
+
+	checkFloat32Slices(t, a, b[:4]) // lower half: expected to still be 1-4
+	checkFloat32Slices(t, a, b[4:]) // upper half: expected to be v's contents, i.e. 1-4 again
+}
+
+func TestSlicesFloat32Get128(t *testing.T) {
+	a := []float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+		17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
+	u := simd.LoadFloat32x8Slice(a) // 1-8
+	v := u.Get128(0)                // 1-4
+	w := u.Get128(1)                // 5-8
+
+	b := make([]float32, 8, 8)
+	v.StoreSlice(b[:4])
+	w.StoreSlice(b[4:])
+
+	checkFloat32Slices(t, a, b) // compares b against the first len(b) = 8 elements of a
+}
+
+func TestSlicesFloat64Set128(t *testing.T) {
+	a := []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+		17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
+	v := simd.LoadFloat64x2Slice(a) // 1-2
+	u := simd.LoadFloat64x4Slice(a) // 1-4
+
+	w := u.Set128(1, v) // 1-2:1-2
+
+	b := make([]float64, 4, 4)
+	w.StoreSlice(b)
+
+	checkFloat64Slices(t, a, b[:2]) // lower half: expected to still be 1-2
+	checkFloat64Slices(t, a, b[2:]) // upper half: expected to be v's contents, i.e. 1-2 again
+}
+
+func TestSlicesFloat64Get128(t *testing.T) {
+	a := []float64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+		17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}
+	u := simd.LoadFloat64x4Slice(a) // 1-4
+	v := u.Get128(0)                // 1-2
+	w := u.Get128(1)                // 3-4
+
+	b := make([]float64, 4, 4)
+	v.StoreSlice(b[:2])
+	w.StoreSlice(b[2:])
+
+	checkFloat64Slices(t, a, b) // compares b against the first len(b) = 4 elements of a
+}
+
 func TestSlicesInt8TooShortLoad(t *testing.T) {
 	defer func() {
 		if r := recover(); r != nil {
--- a/src/simd/simd_wrapped_test.go
+++ b/src/simd/simd_wrapped_test.go
@ -7954,6 +7954,7 @@ func testUint64x8UnaryMasked(t *testing.T, v0 []uint64, v1 []int64, want []uint6
 // FloorWithPrecision
 // GaloisFieldAffineTransform
 // GaloisFieldAffineTransformInversed
+// Get128
 // GetElem
 // MaskedCeilWithPrecision
 // MaskedDiffWithCeilWithPrecision
--- a/src/simd/stubs_amd64.go
+++ b/src/simd/stubs_amd64.go
@ -1198,6 +1198,58 @@ func (x Uint8x32) GaloisFieldMul(y Uint8x32) Uint8x32
 // Asm: VGF2P8MULB, CPU Feature: AVX512EVEX
 func (x Uint8x64) GaloisFieldMul(y Uint8x64) Uint8x64

+/* Get128 */
+
+// Get128 retrieves the upper (1) or lower (0) half of a 256-bit vector, depending on the constant operand.
+//
+// Asm: VEXTRACTF128, CPU Feature: AVX
+func (x Float32x8) Get128(imm uint8) Float32x4
+
+// Get128 retrieves the upper (1) or lower (0) half of a 256-bit vector, depending on the constant operand.
+//
+// Asm: VEXTRACTF128, CPU Feature: AVX
+func (x Float64x4) Get128(imm uint8) Float64x2
+
+// Get128 retrieves the upper (1) or lower (0) half of a 256-bit vector, depending on the constant operand.
+//
+// Asm: VEXTRACTI128, CPU Feature: AVX2
+func (x Int8x32) Get128(imm uint8) Int8x16
+
+// Get128 retrieves the upper (1) or lower (0) half of a 256-bit vector, depending on the constant operand.
+//
+// Asm: VEXTRACTI128, CPU Feature: AVX2
+func (x Int16x16) Get128(imm uint8) Int16x8
+
+// Get128 retrieves the upper (1) or lower (0) half of a 256-bit vector, depending on the constant operand.
+//
+// Asm: VEXTRACTI128, CPU Feature: AVX2
+func (x Int32x8) Get128(imm uint8) Int32x4
+
+// Get128 retrieves the upper (1) or lower (0) half of a 256-bit vector, depending on the constant operand.
+//
+// Asm: VEXTRACTI128, CPU Feature: AVX2
+func (x Int64x4) Get128(imm uint8) Int64x2
+
+// Get128 retrieves the upper (1) or lower (0) half of a 256-bit vector, depending on the constant operand.
+//
+// Asm: VEXTRACTI128, CPU Feature: AVX2
+func (x Uint8x32) Get128(imm uint8) Uint8x16
+
+// Get128 retrieves the upper (1) or lower (0) half of a 256-bit vector, depending on the constant operand.
+//
+// Asm: VEXTRACTI128, CPU Feature: AVX2
+func (x Uint16x16) Get128(imm uint8) Uint16x8
+
+// Get128 retrieves the upper (1) or lower (0) half of a 256-bit vector, depending on the constant operand.
+//
+// Asm: VEXTRACTI128, CPU Feature: AVX2
+func (x Uint32x8) Get128(imm uint8) Uint32x4
+
+// Get128 retrieves the upper (1) or lower (0) half of a 256-bit vector, depending on the constant operand.
+//
+// Asm: VEXTRACTI128, CPU Feature: AVX2
+func (x Uint64x4) Get128(imm uint8) Uint64x2
+
 /* GetElem */

 // GetElem retrieves a single constant-indexed element's value.