[dev.simd] cmd/compile: fix isIntrinsic for methods; fix fp <-> gp moves

also includes a handy debugging hook for the inliner.

Change-Id: I23d0619506219d21db78c6c801612ff058562142
Reviewed-on: https://go-review.googlesource.com/c/go/+/694118
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Junyang Shao <shaojunyang@google.com>
David Chase 2025-08-07 16:44:50 -04:00
parent 08ab8e24a3
commit d5dea86993
3 changed files with 97 additions and 30 deletions
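Context for the first fix in the title: ir.IsIntrinsicCall used to accept only a callee that is a plain *ir.Name, so a call whose callee is a method expression (an OMETHEXPR node, the shape that calls to methods such as the SIMD operations can take) was never recognized as intrinsic when costing the caller. A minimal sketch of that shape, with an invented type and method rather than anything from this CL:

// Hypothetical illustration; Vec and Add stand in for an intrinsified SIMD type and method.
package p

type Vec struct{ lanes [4]int64 }

func (v Vec) Add(w Vec) Vec { return v } // imagine the compiler intrinsifies this

func sum(a, b Vec) Vec {
	// The callee below is a method expression; in the compiler's IR it is an
	// OMETHEXPR, not an *ir.Name, so the old IsIntrinsicCall answered false.
	return Vec.Add(a, b)
}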

View file

@@ -43,6 +43,10 @@ func ssaMarkMoves(s *ssagen.State, b *ssa.Block) {
 	}
 }
 
+func isFPReg(r int16) bool {
+	return x86.REG_X0 <= r && r <= x86.REG_Z31
+}
+
 // loadByType returns the load instruction of the given type.
 func loadByType(t *types.Type) obj.As {
 	// Avoid partial register write
@@ -88,31 +92,33 @@ func storeByType(t *types.Type) obj.As {
 }
 
 // moveByType returns the reg->reg move instruction of the given type.
-func moveByType(t *types.Type) obj.As {
-	if t.IsFloat() {
+func moveByType(from, to *ssa.Value) obj.As {
+	toT := to.Type
+	fromR, toR := from.Reg(), to.Reg()
+	if isFPReg(fromR) && isFPReg(toR) && toT.IsFloat() {
 		// Moving the whole sse2 register is faster
 		// than moving just the correct low portion of it.
 		// There is no xmm->xmm move with 1 byte opcode,
 		// so use movups, which has 2 byte opcode.
 		return x86.AMOVUPS
-	} else if t.IsSIMD() {
-		return simdMov(t.Size())
-	} else {
-		switch t.Size() {
-		case 1:
-			// Avoids partial register write
-			return x86.AMOVL
-		case 2:
-			return x86.AMOVL
-		case 4:
-			return x86.AMOVL
-		case 8:
-			return x86.AMOVQ
-		case 16:
-			return x86.AMOVUPS // int128s are in SSE registers
-		default:
-			panic(fmt.Sprintf("bad int register width %d:%v", t.Size(), t))
-		}
-	}
+	}
+	if toT.IsSIMD() {
+		return simdMov(toT.Size())
+	}
+	switch toT.Size() {
+	case 1:
+		// Avoids partial register write
+		return x86.AMOVL
+	case 2:
+		return x86.AMOVL
+	case 4:
+		return x86.AMOVL
+	case 8:
+		return x86.AMOVQ
+	case 16:
+		return x86.AMOVUPS // int128s are in SSE registers
+	default:
+		panic(fmt.Sprintf("bad int register width %d:%v", toT.Size(), toT))
+	}
 }
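
The key change above: the move opcode is now chosen from the actual source and destination register classes, not only from the value's type, since a float-typed value can be assigned to a general-purpose register, where an xmm move like MOVUPS would be illegal. A plain-Go model of the new rule, with bools standing in for the register-range and type checks (not compiler code):

// moveOp is a miniature stand-in for the new moveByType above.
func moveOp(fromIsFP, toIsFP, isFloat, isSIMD bool, size int64) string {
	if fromIsFP && toIsFP && isFloat {
		return "MOVUPS" // whole xmm->xmm copy is the cheapest fp move
	}
	if isSIMD {
		return "simdMov(size)" // width-dependent vector move
	}
	switch size {
	case 1, 2, 4:
		return "MOVL" // avoid partial register writes
	case 8:
		return "MOVQ"
	case 16:
		return "MOVUPS" // int128s are kept in SSE registers
	}
	return "bad width"
}
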
@@ -648,7 +654,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 		// But this requires a way for regalloc to know that SRC might be
 		// clobbered by this instruction.
 		t := v.RegTmp()
-		opregreg(s, moveByType(v.Type), t, v.Args[1].Reg())
+		opregreg(s, moveByType(v.Args[1], v), t, v.Args[1].Reg())
 
 		p := s.Prog(v.Op.Asm())
 		p.From.Type = obj.TYPE_REG
@@ -820,13 +826,37 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 		p.From.Offset = v.AuxInt
 		p.To.Type = obj.TYPE_REG
 		p.To.Reg = x
 	case ssa.OpAMD64MOVSSconst, ssa.OpAMD64MOVSDconst:
 		x := v.Reg()
-		p := s.Prog(v.Op.Asm())
-		p.From.Type = obj.TYPE_FCONST
-		p.From.Val = math.Float64frombits(uint64(v.AuxInt))
-		p.To.Type = obj.TYPE_REG
-		p.To.Reg = x
+		a := v.Op.Asm()
+		if x < x86.REG_X0 { // not an FP register
+			if v.AuxInt == 0 && v.Aux == nil {
+				opregreg(s, x86.AXORL, x, x)
+				break
+			}
+			c := v.AuxInt
+			switch v.Type.Size() {
+			case 4:
+				a = x86.AMOVL
+				c = int64(math.Float32bits(float32(math.Float64frombits(uint64(v.AuxInt)))))
+			case 8:
+				a = x86.AMOVQ
+			default:
+				panic(fmt.Sprintf("unexpected type width for float const into non-float register, %v", v))
+			}
+			p := s.Prog(a)
+			p.From.Type = obj.TYPE_CONST
+			p.From.Offset = c
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = x
+		} else {
+			p := s.Prog(a)
+			p.From.Type = obj.TYPE_FCONST
+			p.From.Val = math.Float64frombits(uint64(v.AuxInt))
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = x
+		}
 	case ssa.OpAMD64MOVQload, ssa.OpAMD64MOVLload, ssa.OpAMD64MOVWload, ssa.OpAMD64MOVBload, ssa.OpAMD64MOVOload,
 		ssa.OpAMD64MOVSSload, ssa.OpAMD64MOVSDload, ssa.OpAMD64MOVBQSXload, ssa.OpAMD64MOVWQSXload, ssa.OpAMD64MOVLQSXload,
 		ssa.OpAMD64MOVBEQload, ssa.OpAMD64MOVBELload:
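
In the 4-byte case above, the constant stored in AuxInt (kept as a float64 bit pattern) is narrowed back to its float32 encoding before being emitted as a MOVL immediate. A standalone round-trip of that conversion, using only the standard library and an arbitrary example value:

package main

import (
	"fmt"
	"math"
)

func main() {
	const f = float32(1.5)
	// The SSA constant carries the value as float64 bits in AuxInt.
	aux := int64(math.Float64bits(float64(f)))
	// Recover the 4-byte IEEE-754 encoding, as the compiler does for MOVL.
	imm := int64(math.Float32bits(float32(math.Float64frombits(uint64(aux)))))
	fmt.Printf("aux=%#x imm=%#x want=%#x\n", aux, imm, math.Float32bits(f))
	// Output: aux=0x3ff8000000000000 imm=0x3fc00000 want=0x3fc00000
}
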
@@ -1134,7 +1164,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 			y = simdOrMaskReg(v)
 		}
 		if x != y {
-			opregreg(s, moveByType(v.Type), y, x)
+			opregreg(s, moveByType(v.Args[0], v), y, x)
 		}
 	case ssa.OpLoadReg:
 		if v.Type.IsFlags() {

View file

@@ -202,6 +202,7 @@ func inlineBudget(fn *ir.Func, profile *pgoir.Profile, relaxed bool, verbose boo
 		// be very liberal here, if the closure is only called once, the budget is large
 		budget = max(budget, inlineClosureCalledOnceCost)
 	}
+
 	return budget
 }
@@ -263,6 +264,7 @@ func CanInline(fn *ir.Func, profile *pgoir.Profile) {
 	visitor := hairyVisitor{
 		curFunc:   fn,
+		debug:     isDebugFn(fn),
 		isBigFunc: IsBigFunc(fn),
 		budget:    budget,
 		maxBudget: budget,
@@ -407,6 +409,7 @@ type hairyVisitor struct {
 	// This is needed to access the current caller in the doNode function.
 	curFunc   *ir.Func
 	isBigFunc bool
+	debug     bool
 	budget    int32
 	maxBudget int32
 	reason    string
@@ -416,6 +419,16 @@ type hairyVisitor struct {
 	profile *pgoir.Profile
 }
 
+func isDebugFn(fn *ir.Func) bool {
+	// if n := fn.Nname; n != nil && n.Sym().Pkg.Path == "0" {
+	// 	if n.Sym().Name == "BroadcastInt64x4" {
+	// 		fmt.Printf("isDebugFn '%s' DOT '%s'\n", n.Sym().Pkg.Path, n.Sym().Name)
+	// 		return true
+	// 	}
+	// }
+	return false
+}
+
 func (v *hairyVisitor) tooHairy(fn *ir.Func) bool {
 	v.do = v.doNode // cache closure
 	if ir.DoChildren(fn, v.do) {
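
The commented-out body above is the debugging hook from the commit message: edit it locally to match the package and function being investigated, rebuild the compiler, and doNode then prints a per-node budget trace while v.debug also suppresses the early "too expensive" bailout (see the last hunk in this file). A sketch of a locally enabled version; the package path and function name here are placeholders:

func isDebugFn(fn *ir.Func) bool {
	// Trace inlining cost only for the one function under investigation.
	if n := fn.Nname; n != nil && n.Sym().Pkg.Path == "example.com/somepkg" {
		return n.Sym().Name == "FunctionOfInterest"
	}
	return false
}
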
@@ -434,6 +447,9 @@ func (v *hairyVisitor) doNode(n ir.Node) bool {
 	if n == nil {
 		return false
 	}
+	if v.debug {
+		fmt.Printf("%v: doNode %v budget is %d\n", ir.Line(n), n.Op(), v.budget)
+	}
 opSwitch:
 	switch n.Op() {
 	// Call is okay if inlinable and we have the budget for the body.
@@ -551,12 +567,19 @@ opSwitch:
 		}
 
 		if cheap {
+			if v.debug {
+				if ir.IsIntrinsicCall(n) {
+					fmt.Printf("%v: cheap call is also intrinsic, %v\n", ir.Line(n), n)
+				}
+			}
 			break // treat like any other node, that is, cost of 1
 		}
 
 		if ir.IsIntrinsicCall(n) {
-			// Treat like any other node.
-			break
+			if v.debug {
+				fmt.Printf("%v: intrinsic call, %v\n", ir.Line(n), n)
+			}
+			break // Treat like any other node.
 		}
 
 		if callee := inlCallee(v.curFunc, n.Fun, v.profile, false); callee != nil && typecheck.HaveInlineBody(callee) {
@@ -583,6 +606,10 @@ opSwitch:
 			}
 		}
 
+		if v.debug {
+			fmt.Printf("%v: costly OCALLFUNC %v\n", ir.Line(n), n)
+		}
+
 		// Call cost for non-leaf inlining.
 		v.budget -= extraCost
@@ -592,6 +619,9 @@ opSwitch:
 	// Things that are too hairy, irrespective of the budget
 	case ir.OCALL, ir.OCALLINTER:
 		// Call cost for non-leaf inlining.
+		if v.debug {
+			fmt.Printf("%v: costly OCALL %v\n", ir.Line(n), n)
+		}
 		v.budget -= v.extraCallCost
 
 	case ir.OPANIC:
@@ -743,7 +773,7 @@ opSwitch:
 	v.budget--
 
 	// When debugging, don't stop early, to get full cost of inlining this function
-	if v.budget < 0 && base.Flag.LowerM < 2 && !logopt.Enabled() {
+	if v.budget < 0 && base.Flag.LowerM < 2 && !logopt.Enabled() && !v.debug {
 		v.reason = "too expensive"
 		return true
 	}

View file

@@ -1913,6 +1913,13 @@ func IsIntrinsicCall(n *ir.CallExpr) bool {
 	}
 	name, ok := n.Fun.(*ir.Name)
 	if !ok {
+		if n.Fun.Op() == ir.OMETHEXPR {
+			if meth := ir.MethodExprName(n.Fun); meth != nil {
+				if fn := meth.Func; fn != nil {
+					return IsIntrinsicSym(fn.Sym())
+				}
+			}
+		}
 		return false
 	}
 	return IsIntrinsicSym(name.Sym())