cmd/compile: add floating point min/max intrinsics on s390x

Add the VECTOR FP (MINIMUM|MAXIMUM) instructions to the assembler and
use them in the compiler to implement min and max.

Note: I've allowed floating point registers to be used with the single
element instructions (those with a W prefix instead of a V prefix) to
allow easier integration into the compiler.

Change-Id: I5f80a510bd248cf483cce95f1979bf63fbae7de6
Reviewed-on: https://go-review.googlesource.com/c/go/+/684715
Reviewed-by: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Mark Freeman <mark@golang.org>
Reviewed-by: Keith Randall <khr@google.com>
2025-12-08 06:10:04 +00:00 · 2025-06-27 21:05:38 +01:00 · 2025-06-27 21:05:38 +01:00 · cedf63616a
commit cedf63616a
parent 82a1921c3b
12 changed files with 160 additions and 2 deletions
--- a/src/cmd/asm/internal/asm/testdata/s390x.s
+++ b/src/cmd/asm/internal/asm/testdata/s390x.s
@ -540,6 +540,18 @@ TEXT main·foo(SB),DUPOK|NOSPLIT,$16-0 // TEXT main.foo(SB), DUPOK|NOSPLIT, $16-
 	VSTRCZBS V18, V20, V22, V24	// e78240306f8a
 	VSTRCZHS V18, V20, V22, V24	// e78241306f8a
 	VSTRCZFS V18, V20, V22, V24	// e78242306f8a
+	VFMAXSB	$1, V2, V3, V4          // e742301020ef
+	WFMAXSB	$2, V5, V6, V7          // e775602820ef
+	WFMAXSB	$2, F5, F6, F7          // e775602820ef
+	VFMAXDB	$3, V8, V9, V10		// e7a8903030ef
+	WFMAXDB	$4, V11, V12, V13	// e7dbc04830ef
+	WFMAXDB	$4, F11, F12, F13	// e7dbc04830ef
+	VFMINSB	$7, V14, V15, V16	// e70ef07028ee
+	WFMINSB	$8, V17, V18, V19	// e73120882eee
+	WFMINSB	$8, F1, F2, F3		// e731208820ee
+	VFMINDB	$9, V20, V21, V22	// e76450903eee
+	WFMINDB	$10, V23, V24, V25	// e79780a83eee
+	WFMINDB	$10, F7, F8, F9		// e79780a830ee

 	RET
 	RET	foo(SB)
--- a/src/cmd/compile/internal/s390x/ssa.go
+++ b/src/cmd/compile/internal/s390x/ssa.go
@ -281,6 +281,10 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 	case ssa.OpS390XCPSDR:
 		p := opregreg(s, v.Op.Asm(), v.Reg(), v.Args[1].Reg())
 		p.Reg = v.Args[0].Reg()
+	case ssa.OpS390XWFMAXDB, ssa.OpS390XWFMAXSB,
+		ssa.OpS390XWFMINDB, ssa.OpS390XWFMINSB:
+		p := opregregimm(s, v.Op.Asm(), v.Reg(), v.Args[0].Reg(), 1 /* M6 function code: Java Math.Max() / Math.Min() semantics */)
+		p.AddRestSource(obj.Addr{Type: obj.TYPE_REG, Reg: v.Args[1].Reg()})
 	case ssa.OpS390XDIVD, ssa.OpS390XDIVW,
 		ssa.OpS390XDIVDU, ssa.OpS390XDIVWU,
 		ssa.OpS390XMODD, ssa.OpS390XMODW,
--- a/src/cmd/compile/internal/ssa/_gen/S390X.rules
+++ b/src/cmd/compile/internal/ssa/_gen/S390X.rules
@ -145,6 +145,9 @@

 (Sqrt32    ...) => (FSQRTS ...)

+(Max(64|32)F ...) => (WFMAX(D|S)B ...)
+(Min(64|32)F ...) => (WFMIN(D|S)B ...)
+
 // Atomic loads and stores.
 // The SYNC instruction (fast-BCR-serialization) prevents store-load
 // reordering. Other sequences of memory operations (load-load,
--- a/src/cmd/compile/internal/ssa/_gen/S390XOps.go
+++ b/src/cmd/compile/internal/ssa/_gen/S390XOps.go
@ -222,6 +222,12 @@ func init() {
 		{name: "LNDFR", argLength: 1, reg: fp11, asm: "LNDFR"},                                                                       // fp64/fp32 clear sign bit
 		{name: "CPSDR", argLength: 2, reg: fp21, asm: "CPSDR"},                                                                       // fp64/fp32 copy arg1 sign bit to arg0

+		// Single element vector floating point min / max instructions
+		{name: "WFMAXDB", argLength: 2, reg: fp21, asm: "WFMAXDB", typ: "Float64"}, // max[float64](arg0, arg1)
+		{name: "WFMAXSB", argLength: 2, reg: fp21, asm: "WFMAXSB", typ: "Float32"}, // max[float32](arg0, arg1)
+		{name: "WFMINDB", argLength: 2, reg: fp21, asm: "WFMINDB", typ: "Float64"}, // min[float64](arg0, arg1)
+		{name: "WFMINSB", argLength: 2, reg: fp21, asm: "WFMINSB", typ: "Float32"}, // min[float32](arg0, arg1)
+
 		// Round to integer, float64 only.
 		//
 		// aux | rounding mode
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@ -2655,6 +2655,10 @@ const (
 	OpS390XLPDFR
 	OpS390XLNDFR
 	OpS390XCPSDR
+	OpS390XWFMAXDB
+	OpS390XWFMAXSB
+	OpS390XWFMINDB
+	OpS390XWFMINSB
 	OpS390XFIDBR
 	OpS390XFMOVSload
 	OpS390XFMOVDload
@ -35775,6 +35779,62 @@ var opcodeTable = [...]opInfo{
 			},
 		},
 	},
+	{
+		name:   "WFMAXDB",
+		argLen: 2,
+		asm:    s390x.AWFMAXDB,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15
+				{1, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15
+			},
+			outputs: []outputInfo{
+				{0, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15
+			},
+		},
+	},
+	{
+		name:   "WFMAXSB",
+		argLen: 2,
+		asm:    s390x.AWFMAXSB,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15
+				{1, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15
+			},
+			outputs: []outputInfo{
+				{0, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15
+			},
+		},
+	},
+	{
+		name:   "WFMINDB",
+		argLen: 2,
+		asm:    s390x.AWFMINDB,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15
+				{1, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15
+			},
+			outputs: []outputInfo{
+				{0, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15
+			},
+		},
+	},
+	{
+		name:   "WFMINSB",
+		argLen: 2,
+		asm:    s390x.AWFMINSB,
+		reg: regInfo{
+			inputs: []inputInfo{
+				{0, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15
+				{1, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15
+			},
+			outputs: []outputInfo{
+				{0, 4294901760}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15
+			},
+		},
+	},
 	{
 		name:    "FIDBR",
 		auxType: auxInt8,
--- a/src/cmd/compile/internal/ssa/rewriteS390X.go
+++ b/src/cmd/compile/internal/ssa/rewriteS390X.go
@ -368,6 +368,18 @@ func rewriteValueS390X(v *Value) bool {
 		return rewriteValueS390X_OpLsh8x64(v)
 	case OpLsh8x8:
 		return rewriteValueS390X_OpLsh8x8(v)
+	case OpMax32F:
+		v.Op = OpS390XWFMAXSB
+		return true
+	case OpMax64F:
+		v.Op = OpS390XWFMAXDB
+		return true
+	case OpMin32F:
+		v.Op = OpS390XWFMINSB
+		return true
+	case OpMin64F:
+		v.Op = OpS390XWFMINDB
+		return true
 	case OpMod16:
 		return rewriteValueS390X_OpMod16(v)
 	case OpMod16u:
--- a/src/cmd/compile/internal/ssagen/ssa.go
+++ b/src/cmd/compile/internal/ssagen/ssa.go
@ -3986,7 +3986,7 @@ func (s *state) minMax(n *ir.CallExpr) *ssa.Value {
 		if typ.IsFloat() {
 			hasIntrinsic := false
 			switch Arch.LinkArch.Family {
-			case sys.AMD64, sys.ARM64, sys.Loong64, sys.RISCV64:
+			case sys.AMD64, sys.ARM64, sys.Loong64, sys.RISCV64, sys.S390X:
 				hasIntrinsic = true
 			case sys.PPC64:
 				hasIntrinsic = buildcfg.GOPPC64 >= 9
--- a/src/cmd/internal/obj/s390x/a.out.go
+++ b/src/cmd/internal/obj/s390x/a.out.go
@ -715,6 +715,14 @@ const (
 	AWFLNDB
 	AVFLPDB
 	AWFLPDB
+	AVFMAXDB
+	AWFMAXDB
+	AVFMAXSB
+	AWFMAXSB
+	AVFMINDB
+	AWFMINDB
+	AVFMINSB
+	AWFMINSB
 	AVFSQ
 	AVFSQDB
 	AWFSQDB
--- a/src/cmd/internal/obj/s390x/anames.go
+++ b/src/cmd/internal/obj/s390x/anames.go
@ -438,6 +438,14 @@ var Anames = []string{
 	"WFLNDB",
 	"VFLPDB",
 	"WFLPDB",
+	"VFMAXDB",
+	"WFMAXDB",
+	"VFMAXSB",
+	"WFMAXSB",
+	"VFMINDB",
+	"WFMINDB",
+	"VFMINSB",
+	"WFMINSB",
 	"VFSQ",
 	"VFSQDB",
 	"WFSQDB",
--- a/src/cmd/internal/obj/s390x/asmz.go
+++ b/src/cmd/internal/obj/s390x/asmz.go
@ -441,6 +441,11 @@ var optab = []Optab{
 	{i: 119, as: AVERLLVG, a1: C_VREG, a2: C_VREG, a6: C_VREG},
 	{i: 119, as: AVERLLVG, a1: C_VREG, a6: C_VREG},

+	// VRR-c floating point min/max
+	{i: 128, as: AVFMAXDB, a1: C_SCON, a2: C_VREG, a3: C_VREG, a6: C_VREG},
+	{i: 128, as: AWFMAXDB, a1: C_SCON, a2: C_VREG, a3: C_VREG, a6: C_VREG},
+	{i: 128, as: AWFMAXDB, a1: C_SCON, a2: C_FREG, a3: C_FREG, a6: C_FREG},
+
 	// VRR-d
 	{i: 120, as: AVACQ, a1: C_VREG, a2: C_VREG, a3: C_VREG, a6: C_VREG},

@ -1480,6 +1485,14 @@ func buildop(ctxt *obj.Link) {
 			opset(AVFMSDB, r)
 			opset(AWFMSDB, r)
 			opset(AVPERM, r)
+		case AVFMAXDB:
+			opset(AVFMAXSB, r)
+			opset(AVFMINDB, r)
+			opset(AVFMINSB, r)
+		case AWFMAXDB:
+			opset(AWFMAXSB, r)
+			opset(AWFMINDB, r)
+			opset(AWFMINSB, r)
 		case AKM:
 			opset(AKMC, r)
 			opset(AKLMD, r)
@ -2636,6 +2649,8 @@ const (
 	op_VUPLL  uint32 = 0xE7D4 // 	VRR-a	VECTOR UNPACK LOGICAL LOW
 	op_VUPL   uint32 = 0xE7D6 // 	VRR-a	VECTOR UNPACK LOW
 	op_VMSL   uint32 = 0xE7B8 // 	VRR-d	VECTOR MULTIPLY SUM LOGICAL
+	op_VFMAX  uint32 = 0xE7EF // 	VRR-c	VECTOR FP MAXIMUM
+	op_VFMIN  uint32 = 0xE7EE // 	VRR-c	VECTOR FP MINIMUM

 	// added in z15
 	op_KDSA uint32 = 0xB93A // FORMAT_RRE        COMPUTE DIGITAL SIGNATURE AUTHENTICATION (KDSA)
@ -4475,6 +4490,12 @@ func (c *ctxtz) asmout(p *obj.Prog, asm *[]byte) {
 			c.ctxt.Diag("padding byte register cannot be same as input or output register %v", p)
 		}
 		zRS(op_MVCLE, uint32(p.To.Reg), uint32(p.Reg), uint32(p.From.Reg), uint32(d2), asm)
+
+	case 128: // VRR-c floating point max/min
+		op, m4, _ := vop(p.As)
+		m5 := singleElementMask(p.As)
+		m6 := uint32(c.vregoff(&p.From))
+		zVRRc(op, uint32(p.To.Reg), uint32(p.Reg), uint32(p.GetFrom3().Reg), m6, m5, m4, asm)
 	}
 }

--- a/src/cmd/internal/obj/s390x/vector.go
+++ b/src/cmd/internal/obj/s390x/vector.go
@ -1027,6 +1027,22 @@ func vop(as obj.As) (opcode, es, cs uint32) {
 		return op_VUPL, 1, 0
 	case AVUPLF:
 		return op_VUPL, 2, 0
+	case AVFMAXDB:
+		return op_VFMAX, 3, 0
+	case AWFMAXDB:
+		return op_VFMAX, 3, 0
+	case AVFMAXSB:
+		return op_VFMAX, 2, 0
+	case AWFMAXSB:
+		return op_VFMAX, 2, 0
+	case AVFMINDB:
+		return op_VFMIN, 3, 0
+	case AWFMINDB:
+		return op_VFMIN, 3, 0
+	case AVFMINSB:
+		return op_VFMIN, 2, 0
+	case AWFMINSB:
+		return op_VFMIN, 2, 0
 	}
 }

@ -1062,7 +1078,11 @@ func singleElementMask(as obj.As) uint32 {
 		AWFSQDB,
 		AWFSDB,
 		AWFTCIDB,
-		AWFIDB:
+		AWFIDB,
+		AWFMAXDB,
+		AWFMAXSB,
+		AWFMINDB,
+		AWFMINSB:
 		return 8
 	}
 	return 0
--- a/test/codegen/floats.go
+++ b/test/codegen/floats.go
@ -172,6 +172,7 @@ func Float64Min(a, b float64) float64 {
 	// riscv64:"FMIN"
 	// ppc64/power9:"XSMINJDP"
 	// ppc64/power10:"XSMINJDP"
+	// s390x: "WFMINDB"
 	return min(a, b)
 }

@ -182,6 +183,7 @@ func Float64Max(a, b float64) float64 {
 	// riscv64:"FMAX"
 	// ppc64/power9:"XSMAXJDP"
 	// ppc64/power10:"XSMAXJDP"
+	// s390x: "WFMAXDB"
 	return max(a, b)
 }

@ -192,6 +194,7 @@ func Float32Min(a, b float32) float32 {
 	// riscv64:"FMINS"
 	// ppc64/power9:"XSMINJDP"
 	// ppc64/power10:"XSMINJDP"
+	// s390x: "WFMINSB"
 	return min(a, b)
 }

@ -202,6 +205,7 @@ func Float32Max(a, b float32) float32 {
 	// riscv64:"FMAXS"
 	// ppc64/power9:"XSMAXJDP"
 	// ppc64/power10:"XSMAXJDP"
+	// s390x: "WFMAXSB"
 	return max(a, b)
 }