[dev.simd] cmd/compile: fix isIntrinsic for methods; fix fp <-> gp moves

also includes a handy debugging hook for the inliner.

Change-Id: I23d0619506219d21db78c6c801612ff058562142
Reviewed-on: https://go-review.googlesource.com/c/go/+/694118
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Junyang Shao <shaojunyang@google.com>
David Chase 2025-08-07 16:44:50 -04:00
parent 08ab8e24a3
commit d5dea86993
3 changed files with 97 additions and 30 deletions
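Context for the first fix in the title: ir.IsIntrinsicCall used to accept only a callee that is a plain *ir.Name, so a call whose callee is a method expression (an OMETHEXPR node, the shape that calls to methods such as the SIMD operations can take) was never recognized as intrinsic when costing the caller. A minimal sketch of that shape, with an invented type and method rather than anything from this CL:

// Hypothetical illustration; Vec and Add stand in for an intrinsified SIMD type and method.
package p

type Vec struct{ lanes [4]int64 }

func (v Vec) Add(w Vec) Vec { return v } // imagine the compiler intrinsifies this

func sum(a, b Vec) Vec {
	// The callee below is a method expression; in the compiler's IR it is an
	// OMETHEXPR, not an *ir.Name, so the old IsIntrinsicCall answered false.
	return Vec.Add(a, b)
}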

View file

@@ -43,6 +43,10 @@ func ssaMarkMoves(s *ssagen.State, b *ssa.Block) {
 	}
 }
 
+func isFPReg(r int16) bool {
+	return x86.REG_X0 <= r && r <= x86.REG_Z31
+}
+
 // loadByType returns the load instruction of the given type.
 func loadByType(t *types.Type) obj.As {
 	// Avoid partial register write
@@ -88,31 +92,33 @@ func storeByType(t *types.Type) obj.As {
 }
 
 // moveByType returns the reg->reg move instruction of the given type.
-func moveByType(t *types.Type) obj.As {
-	if t.IsFloat() {
+func moveByType(from, to *ssa.Value) obj.As {
+	toT := to.Type
+	fromR, toR := from.Reg(), to.Reg()
+	if isFPReg(fromR) && isFPReg(toR) && toT.IsFloat() {
 		// Moving the whole sse2 register is faster
 		// than moving just the correct low portion of it.
 		// There is no xmm->xmm move with 1 byte opcode,
 		// so use movups, which has 2 byte opcode.
 		return x86.AMOVUPS
-	} else if t.IsSIMD() {
-		return simdMov(t.Size())
-	} else {
-		switch t.Size() {
-		case 1:
-			// Avoids partial register write
-			return x86.AMOVL
-		case 2:
-			return x86.AMOVL
-		case 4:
-			return x86.AMOVL
-		case 8:
-			return x86.AMOVQ
-		case 16:
-			return x86.AMOVUPS // int128s are in SSE registers
-		default:
-			panic(fmt.Sprintf("bad int register width %d:%v", t.Size(), t))
-		}
-	}
+	}
+	if toT.IsSIMD() {
+		return simdMov(toT.Size())
+	}
+	switch toT.Size() {
+	case 1:
+		// Avoids partial register write
+		return x86.AMOVL
+	case 2:
+		return x86.AMOVL
+	case 4:
+		return x86.AMOVL
+	case 8:
+		return x86.AMOVQ
+	case 16:
+		return x86.AMOVUPS // int128s are in SSE registers
+	default:
+		panic(fmt.Sprintf("bad int register width %d:%v", toT.Size(), toT))
+	}
 }
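
The key change above: the move opcode is now chosen from the actual source and destination register classes, not only from the value's type, since a float-typed value can be assigned to a general-purpose register, where an xmm move like MOVUPS would be illegal. A plain-Go model of the new rule, with bools standing in for the register-range and type checks (not compiler code):

// moveOp is a miniature stand-in for the new moveByType above.
func moveOp(fromIsFP, toIsFP, isFloat, isSIMD bool, size int64) string {
	if fromIsFP && toIsFP && isFloat {
		return "MOVUPS" // whole xmm->xmm copy is the cheapest fp move
	}
	if isSIMD {
		return "simdMov(size)" // width-dependent vector move
	}
	switch size {
	case 1, 2, 4:
		return "MOVL" // avoid partial register writes
	case 8:
		return "MOVQ"
	case 16:
		return "MOVUPS" // int128s are kept in SSE registers
	}
	return "bad width"
}
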
@@ -648,7 +654,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 		// But this requires a way for regalloc to know that SRC might be
 		// clobbered by this instruction.
 		t := v.RegTmp()
-		opregreg(s, moveByType(v.Type), t, v.Args[1].Reg())
+		opregreg(s, moveByType(v.Args[1], v), t, v.Args[1].Reg())
 
 		p := s.Prog(v.Op.Asm())
 		p.From.Type = obj.TYPE_REG
@@ -820,13 +826,37 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 		p.From.Offset = v.AuxInt
 		p.To.Type = obj.TYPE_REG
 		p.To.Reg = x
 	case ssa.OpAMD64MOVSSconst, ssa.OpAMD64MOVSDconst:
 		x := v.Reg()
-		p := s.Prog(v.Op.Asm())
-		p.From.Type = obj.TYPE_FCONST
-		p.From.Val = math.Float64frombits(uint64(v.AuxInt))
-		p.To.Type = obj.TYPE_REG
-		p.To.Reg = x
+		a := v.Op.Asm()
+		if x < x86.REG_X0 { // not an FP register
+			if v.AuxInt == 0 && v.Aux == nil {
+				opregreg(s, x86.AXORL, x, x)
+				break
+			}
+			c := v.AuxInt
+			switch v.Type.Size() {
+			case 4:
+				a = x86.AMOVL
+				c = int64(math.Float32bits(float32(math.Float64frombits(uint64(v.AuxInt)))))
+			case 8:
+				a = x86.AMOVQ
+			default:
+				panic(fmt.Sprintf("unexpected type width for float const into non-float register, %v", v))
+			}
+			p := s.Prog(a)
+			p.From.Type = obj.TYPE_CONST
+			p.From.Offset = c
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = x
+		} else {
+			p := s.Prog(a)
+			p.From.Type = obj.TYPE_FCONST
+			p.From.Val = math.Float64frombits(uint64(v.AuxInt))
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = x
+		}
 	case ssa.OpAMD64MOVQload, ssa.OpAMD64MOVLload, ssa.OpAMD64MOVWload, ssa.OpAMD64MOVBload, ssa.OpAMD64MOVOload,
 		ssa.OpAMD64MOVSSload, ssa.OpAMD64MOVSDload, ssa.OpAMD64MOVBQSXload, ssa.OpAMD64MOVWQSXload, ssa.OpAMD64MOVLQSXload,
 		ssa.OpAMD64MOVBEQload, ssa.OpAMD64MOVBELload:
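
In the 4-byte case above, the constant stored in AuxInt (kept as a float64 bit pattern) is narrowed back to its float32 encoding before being emitted as a MOVL immediate. A standalone round-trip of that conversion, using only the standard library and an arbitrary example value:

package main

import (
	"fmt"
	"math"
)

func main() {
	const f = float32(1.5)
	// The SSA constant carries the value as float64 bits in AuxInt.
	aux := int64(math.Float64bits(float64(f)))
	// Recover the 4-byte IEEE-754 encoding, as the compiler does for MOVL.
	imm := int64(math.Float32bits(float32(math.Float64frombits(uint64(aux)))))
	fmt.Printf("aux=%#x imm=%#x want=%#x\n", aux, imm, math.Float32bits(f))
	// Output: aux=0x3ff8000000000000 imm=0x3fc00000 want=0x3fc00000
}
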
@@ -1134,7 +1164,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 			y = simdOrMaskReg(v)
 		}
 		if x != y {
-			opregreg(s, moveByType(v.Type), y, x)
+			opregreg(s, moveByType(v.Args[0], v), y, x)
 		}
 	case ssa.OpLoadReg:
 		if v.Type.IsFlags() {

View file

@@ -202,6 +202,7 @@ func inlineBudget(fn *ir.Func, profile *pgoir.Profile, relaxed bool, verbose boo
 		// be very liberal here, if the closure is only called once, the budget is large
 		budget = max(budget, inlineClosureCalledOnceCost)
 	}
+
 	return budget
 }
@@ -263,6 +264,7 @@ func CanInline(fn *ir.Func, profile *pgoir.Profile) {
 	visitor := hairyVisitor{
 		curFunc:   fn,
+		debug:     isDebugFn(fn),
 		isBigFunc: IsBigFunc(fn),
 		budget:    budget,
 		maxBudget: budget,
@@ -407,6 +409,7 @@ type hairyVisitor struct {
 	// This is needed to access the current caller in the doNode function.
 	curFunc   *ir.Func
 	isBigFunc bool
+	debug     bool
 	budget    int32
 	maxBudget int32
 	reason    string
@@ -416,6 +419,16 @@ type hairyVisitor struct {
 	profile *pgoir.Profile
 }
 
+func isDebugFn(fn *ir.Func) bool {
+	// if n := fn.Nname; n != nil && n.Sym().Pkg.Path == "0" {
+	// 	if n.Sym().Name == "BroadcastInt64x4" {
+	// 		fmt.Printf("isDebugFn '%s' DOT '%s'\n", n.Sym().Pkg.Path, n.Sym().Name)
+	// 		return true
+	// 	}
+	// }
+	return false
+}
+
 func (v *hairyVisitor) tooHairy(fn *ir.Func) bool {
 	v.do = v.doNode // cache closure
 	if ir.DoChildren(fn, v.do) {
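
The commented-out body above is the debugging hook from the commit message: edit it locally to match the package and function being investigated, rebuild the compiler, and doNode then prints a per-node budget trace while v.debug also suppresses the early "too expensive" bailout (see the last hunk in this file). A sketch of a locally enabled version; the package path and function name here are placeholders:

func isDebugFn(fn *ir.Func) bool {
	// Trace inlining cost only for the one function under investigation.
	if n := fn.Nname; n != nil && n.Sym().Pkg.Path == "example.com/somepkg" {
		return n.Sym().Name == "FunctionOfInterest"
	}
	return false
}
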
@@ -434,6 +447,9 @@ func (v *hairyVisitor) doNode(n ir.Node) bool {
 	if n == nil {
 		return false
 	}
+	if v.debug {
+		fmt.Printf("%v: doNode %v budget is %d\n", ir.Line(n), n.Op(), v.budget)
+	}
 opSwitch:
 	switch n.Op() {
 	// Call is okay if inlinable and we have the budget for the body.
@@ -551,12 +567,19 @@ opSwitch:
 		}
 
 		if cheap {
+			if v.debug {
+				if ir.IsIntrinsicCall(n) {
+					fmt.Printf("%v: cheap call is also intrinsic, %v\n", ir.Line(n), n)
+				}
+			}
 			break // treat like any other node, that is, cost of 1
 		}
 
 		if ir.IsIntrinsicCall(n) {
-			// Treat like any other node.
-			break
+			if v.debug {
+				fmt.Printf("%v: intrinsic call, %v\n", ir.Line(n), n)
+			}
+			break // Treat like any other node.
 		}
 
 		if callee := inlCallee(v.curFunc, n.Fun, v.profile, false); callee != nil && typecheck.HaveInlineBody(callee) {
@@ -583,6 +606,10 @@ opSwitch:
 			}
 		}
 
+		if v.debug {
+			fmt.Printf("%v: costly OCALLFUNC %v\n", ir.Line(n), n)
+		}
+
 		// Call cost for non-leaf inlining.
 		v.budget -= extraCost
@@ -592,6 +619,9 @@ opSwitch:
 	// Things that are too hairy, irrespective of the budget
 	case ir.OCALL, ir.OCALLINTER:
 		// Call cost for non-leaf inlining.
+		if v.debug {
+			fmt.Printf("%v: costly OCALL %v\n", ir.Line(n), n)
+		}
 		v.budget -= v.extraCallCost
 
 	case ir.OPANIC:
@@ -743,7 +773,7 @@ opSwitch:
 	v.budget--
 
 	// When debugging, don't stop early, to get full cost of inlining this function
-	if v.budget < 0 && base.Flag.LowerM < 2 && !logopt.Enabled() {
+	if v.budget < 0 && base.Flag.LowerM < 2 && !logopt.Enabled() && !v.debug {
 		v.reason = "too expensive"
 		return true
 	}

View file

@@ -1913,6 +1913,13 @@ func IsIntrinsicCall(n *ir.CallExpr) bool {
 	}
 	name, ok := n.Fun.(*ir.Name)
 	if !ok {
+		if n.Fun.Op() == ir.OMETHEXPR {
+			if meth := ir.MethodExprName(n.Fun); meth != nil {
+				if fn := meth.Func; fn != nil {
+					return IsIntrinsicSym(fn.Sym())
+				}
+			}
+		}
 		return false
 	}
 	return IsIntrinsicSym(name.Sym())