cmd/compile, cmd/internal: fine-grained fiddling with loop alignment

This appears to be useful only on amd64; it was specifically
benchmarked on Apple Silicon and produced no benefit there.
This CL adds the assembly instruction `PCALIGNMAX align,amount`,
which aligns to `align` if that can be achieved with `amount`
or fewer bytes of padding. (An `amount` of 0 means padding is
never emitted, but the enclosing function is still aligned.)

Specifically, padding is emitted if low-order-address-bits + `amount`
is greater than or equal to `align`; thus `PCALIGNMAX 64,63` is
the same as `PCALIGN 64`, while `PCALIGNMAX 64,0` will never
emit any alignment padding, but will still cause the function
itself to be aligned to (at least) 64 bytes.
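
For a concrete sense of the rule, here is a sketch of the padding
decision (not part of the CL; pcalignmaxPad is a hypothetical helper),
assuming align is a power of two:

	// pcalignmaxPad returns the number of padding bytes PCALIGNMAX
	// would emit at offset pc. The function-alignment side effect is
	// separate and happens regardless of the returned value.
	func pcalignmaxPad(pc, align, amount int64) int64 {
		lob := pc & (align - 1)            // low-order address bits; 0 means already aligned
		pad := (align - lob) & (align - 1) // bytes to the next align boundary
		if pad > 0 && pad <= amount {      // i.e., lob+amount >= align
			return pad
		}
		return 0 // over budget (or already aligned): emit nothing
	}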

Change-Id: Id51a056f1672f8095e8f755e01f72836c9686aa3
Reviewed-on: https://go-review.googlesource.com/c/go/+/577935
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
Author: David Chase
Date: 2024-04-02 11:12:44 -04:00
parent 31c8150082
commit 18d0e6a14f
14 changed files with 221 additions and 79 deletions


@@ -16,6 +16,7 @@ var Debug DebugFlags
 // The -d option takes a comma-separated list of settings.
 // Each setting is name=value; for ints, name is short for name=1.
 type DebugFlags struct {
+	AlignHot int `help:"enable hot block alignment (currently requires -pgo)" concurrent:"ok"`
 	Append   int `help:"print information about append compilation"`
 	Checkptr int `help:"instrument unsafe pointer conversions\n0: instrumentation disabled\n1: conversions involving unsafe.Pointer are instrumented\n2: conversions to unsafe.Pointer force heap allocation" concurrent:"ok"`
 	Closure  int `help:"print information about closure compilation"`


@@ -178,6 +178,7 @@ func ParseFlags() {
 	Debug.ConcurrentOk = true
 	Debug.MaxShapeLen = 500
+	Debug.AlignHot = 1
 	Debug.InlFuncsWithClosures = 1
 	Debug.InlStaticInit = 1
 	Debug.PGOInline = 1
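
Since the default here is on, the effect can presumably be isolated in
benchmarks by disabling the setting per build; assuming the usual
lowercased-field-name spelling of -d settings, that would look like:

	go build -pgo=auto -gcflags=-d=alignhot=0 .   # assumed flag spelling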


@@ -14,6 +14,7 @@ import (
 	"cmd/compile/internal/ir"
 	"cmd/compile/internal/liveness"
 	"cmd/compile/internal/objw"
+	"cmd/compile/internal/pgoir"
 	"cmd/compile/internal/ssagen"
 	"cmd/compile/internal/staticinit"
 	"cmd/compile/internal/types"
@@ -112,7 +113,7 @@ func prepareFunc(fn *ir.Func) {
 // compileFunctions compiles all functions in compilequeue.
 // It fans out nBackendWorkers to do the work
 // and waits for them to complete.
-func compileFunctions() {
+func compileFunctions(profile *pgoir.Profile) {
 	if race.Enabled {
 		// Randomize compilation order to try to shake out races.
 		tmp := make([]*ir.Func, len(compilequeue))
@@ -179,7 +180,7 @@ func compileFunctions() {
 	for _, fn := range fns {
 		fn := fn
 		queue(func(worker int) {
-			ssagen.Compile(fn, worker)
+			ssagen.Compile(fn, worker, profile)
 			compile(fn.Closures)
 			wg.Done()
 		})


@@ -303,7 +303,7 @@ func Main(archInit func(*ssagen.ArchInfo)) {
 		// as late as possible to maximize how much work we can batch and
 		// process concurrently.
 		if len(compilequeue) != 0 {
-			compileFunctions()
+			compileFunctions(profile)
 			continue
 		}


@@ -61,6 +61,9 @@ var (
 	// TODO(prattmic): Make this non-global.
 	candHotCalleeMap = make(map[*pgoir.IRNode]struct{})

+	// Set of functions that contain hot call sites.
+	hasHotCall = make(map[*ir.Func]struct{})
+
 	// List of all hot call sites. CallSiteInfo.Callee is always nil.
 	// TODO(prattmic): Make this non-global.
 	candHotEdgeMap = make(map[pgoir.CallSiteInfo]struct{})
@@ -78,6 +81,22 @@ var (
 	inlineHotMaxBudget int32 = 2000
 )

+func IsPgoHotFunc(fn *ir.Func, profile *pgoir.Profile) bool {
+	if profile == nil {
+		return false
+	}
+	if n, ok := profile.WeightedCG.IRNodes[ir.LinkFuncName(fn)]; ok {
+		_, ok := candHotCalleeMap[n]
+		return ok
+	}
+	return false
+}
+
+func HasPgoHotInline(fn *ir.Func) bool {
+	_, has := hasHotCall[fn]
+	return has
+}
+
 // PGOInlinePrologue records the hot callsites from ir-graph.
 func PGOInlinePrologue(p *pgoir.Profile) {
 	if base.Debug.PGOInlineCDFThreshold != "" {
@@ -228,16 +247,12 @@ func GarbageCollectUnreferencedHiddenClosures() {
 func inlineBudget(fn *ir.Func, profile *pgoir.Profile, relaxed bool, verbose bool) int32 {
 	// Update the budget for profile-guided inlining.
 	budget := int32(inlineMaxBudget)
-	if profile != nil {
-		if n, ok := profile.WeightedCG.IRNodes[ir.LinkFuncName(fn)]; ok {
-			if _, ok := candHotCalleeMap[n]; ok {
-				budget = inlineHotMaxBudget
-				if verbose {
-					fmt.Printf("hot-node enabled increased budget=%v for func=%v\n", budget, ir.PkgFuncName(fn))
-				}
-			}
-		}
-	}
+	if IsPgoHotFunc(fn, profile) {
+		budget = inlineHotMaxBudget
+		if verbose {
+			fmt.Printf("hot-node enabled increased budget=%v for func=%v\n", budget, ir.PkgFuncName(fn))
+		}
+	}
 	if relaxed {
 		budget += inlheur.BudgetExpansion(inlineMaxBudget)
 	}
@@ -580,7 +595,7 @@ opSwitch:
 			// Check whether we'd actually inline this call. Set
 			// log == false since we aren't actually doing inlining
 			// yet.
-			if ok, _ := canInlineCallExpr(v.curFunc, n, callee, v.isBigFunc, false); ok {
+			if ok, _, _ := canInlineCallExpr(v.curFunc, n, callee, v.isBigFunc, false); ok {
 				// mkinlcall would inline this call [1], so use
 				// the cost of the inline body as the cost of
 				// the call, as that is what will actually
@@ -873,10 +888,11 @@ var InlineCall = func(callerfn *ir.Func, call *ir.CallExpr, fn *ir.Func, inlInde
 // inlineCostOK returns true if call n from caller to callee is cheap enough to
 // inline. bigCaller indicates that caller is a big function.
 //
-// In addition to the "cost OK" boolean, it also returns the "max
-// cost" limit used to make the decision (which may differ depending
-// on func size), and the score assigned to this specific callsite.
-func inlineCostOK(n *ir.CallExpr, caller, callee *ir.Func, bigCaller bool) (bool, int32, int32) {
+// In addition to the "cost OK" boolean, it also returns
+//   - the "max cost" limit used to make the decision (which may differ depending on func size)
+//   - the score assigned to this specific callsite
+//   - whether the inlined function is "hot" according to PGO.
+func inlineCostOK(n *ir.CallExpr, caller, callee *ir.Func, bigCaller bool) (bool, int32, int32, bool) {
 	maxCost := int32(inlineMaxBudget)
 	if bigCaller {
 		// We use this to restrict inlining into very big functions.
@@ -892,19 +908,21 @@ func inlineCostOK(n *ir.CallExpr, caller, callee *ir.Func, bigCaller bool) (bool
 		}
 	}

+	lineOffset := pgoir.NodeLineOffset(n, caller)
+	csi := pgoir.CallSiteInfo{LineOffset: lineOffset, Caller: caller}
+	_, hot := candHotEdgeMap[csi]
+
 	if metric <= maxCost {
 		// Simple case. Function is already cheap enough.
-		return true, 0, metric
+		return true, 0, metric, hot
 	}

 	// We'll also allow inlining of hot functions below inlineHotMaxBudget,
 	// but only in small functions.

-	lineOffset := pgoir.NodeLineOffset(n, caller)
-	csi := pgoir.CallSiteInfo{LineOffset: lineOffset, Caller: caller}
-	if _, ok := candHotEdgeMap[csi]; !ok {
+	if !hot {
 		// Cold
-		return false, maxCost, metric
+		return false, maxCost, metric, false
 	}

 	// Hot
@@ -913,49 +931,50 @@ func inlineCostOK(n *ir.CallExpr, caller, callee *ir.Func, bigCaller bool) (bool
 		if base.Debug.PGODebug > 0 {
 			fmt.Printf("hot-big check disallows inlining for call %s (cost %d) at %v in big function %s\n", ir.PkgFuncName(callee), callee.Inl.Cost, ir.Line(n), ir.PkgFuncName(caller))
 		}
-		return false, maxCost, metric
+		return false, maxCost, metric, false
 	}

 	if metric > inlineHotMaxBudget {
-		return false, inlineHotMaxBudget, metric
+		return false, inlineHotMaxBudget, metric, false
 	}

 	if !base.PGOHash.MatchPosWithInfo(n.Pos(), "inline", nil) {
 		// De-selected by PGO Hash.
-		return false, maxCost, metric
+		return false, maxCost, metric, false
 	}

 	if base.Debug.PGODebug > 0 {
 		fmt.Printf("hot-budget check allows inlining for call %s (cost %d) at %v in function %s\n", ir.PkgFuncName(callee), callee.Inl.Cost, ir.Line(n), ir.PkgFuncName(caller))
 	}

-	return true, 0, metric
+	return true, 0, metric, hot
 }

 // canInlineCallExpr returns true if the call n from caller to callee
-// can be inlined, plus the score computed for the call expr in
-// question. bigCaller indicates that caller is a big function. log
+// can be inlined, plus the score computed for the call expr in question,
+// and whether the callee is hot according to PGO.
+// bigCaller indicates that caller is a big function. log
 // indicates that the 'cannot inline' reason should be logged.
 //
 // Preconditions: CanInline(callee) has already been called.
-func canInlineCallExpr(callerfn *ir.Func, n *ir.CallExpr, callee *ir.Func, bigCaller bool, log bool) (bool, int32) {
+func canInlineCallExpr(callerfn *ir.Func, n *ir.CallExpr, callee *ir.Func, bigCaller bool, log bool) (bool, int32, bool) {
 	if callee.Inl == nil {
 		// callee is never inlinable.
 		if log && logopt.Enabled() {
 			logopt.LogOpt(n.Pos(), "cannotInlineCall", "inline", ir.FuncName(callerfn),
 				fmt.Sprintf("%s cannot be inlined", ir.PkgFuncName(callee)))
 		}
-		return false, 0
+		return false, 0, false
 	}

-	ok, maxCost, callSiteScore := inlineCostOK(n, callerfn, callee, bigCaller)
+	ok, maxCost, callSiteScore, hot := inlineCostOK(n, callerfn, callee, bigCaller)
 	if !ok {
 		// callee cost too high for this call site.
 		if log && logopt.Enabled() {
 			logopt.LogOpt(n.Pos(), "cannotInlineCall", "inline", ir.FuncName(callerfn),
 				fmt.Sprintf("cost %d of %s exceeds max caller cost %d", callee.Inl.Cost, ir.PkgFuncName(callee), maxCost))
 		}
-		return false, 0
+		return false, 0, false
 	}

 	if callee == callerfn {
@@ -963,7 +982,7 @@ func canInlineCallExpr(callerfn *ir.Func, n *ir.CallExpr, callee *ir.Func, bigCa
 		if log && logopt.Enabled() {
 			logopt.LogOpt(n.Pos(), "cannotInlineCall", "inline", fmt.Sprintf("recursive call to %s", ir.FuncName(callerfn)))
 		}
-		return false, 0
+		return false, 0, false
 	}

 	if base.Flag.Cfg.Instrumenting && types.IsNoInstrumentPkg(callee.Sym().Pkg) {
@@ -977,7 +996,7 @@ func canInlineCallExpr(callerfn *ir.Func, n *ir.CallExpr, callee *ir.Func, bigCa
 			logopt.LogOpt(n.Pos(), "cannotInlineCall", "inline", ir.FuncName(callerfn),
 				fmt.Sprintf("call to runtime function %s in instrumented build", ir.PkgFuncName(callee)))
 		}
-		return false, 0
+		return false, 0, false
 	}

 	if base.Flag.Race && types.IsNoRacePkg(callee.Sym().Pkg) {
@@ -985,7 +1004,7 @@ func canInlineCallExpr(callerfn *ir.Func, n *ir.CallExpr, callee *ir.Func, bigCa
 			logopt.LogOpt(n.Pos(), "cannotInlineCall", "inline", ir.FuncName(callerfn),
 				fmt.Sprintf(`call to into "no-race" package function %s in race build`, ir.PkgFuncName(callee)))
 		}
-		return false, 0
+		return false, 0, false
 	}

 	// Check if we've already inlined this function at this particular
@@ -1008,11 +1027,11 @@ func canInlineCallExpr(callerfn *ir.Func, n *ir.CallExpr, callee *ir.Func, bigCa
 					fmt.Sprintf("repeated recursive cycle to %s", ir.PkgFuncName(callee)))
 				}
 			}
-			return false, 0
+			return false, 0, false
 		}
 	}

-	return true, callSiteScore
+	return true, callSiteScore, hot
 }

 // mkinlcall returns an OINLCALL node that can replace OCALLFUNC n, or
@@ -1023,10 +1042,13 @@ func canInlineCallExpr(callerfn *ir.Func, n *ir.CallExpr, callee *ir.Func, bigCa
 //
 //	n.Left = mkinlcall(n.Left, fn, isddd)
 func mkinlcall(callerfn *ir.Func, n *ir.CallExpr, fn *ir.Func, bigCaller bool) *ir.InlinedCallExpr {
-	ok, score := canInlineCallExpr(callerfn, n, fn, bigCaller, true)
+	ok, score, hot := canInlineCallExpr(callerfn, n, fn, bigCaller, true)
 	if !ok {
 		return nil
 	}
+	if hot {
+		hasHotCall[callerfn] = struct{}{}
+	}
 	typecheck.AssertFixedCall(n)

 	parent := base.Ctxt.PosTable.Pos(n.Pos()).Base().InliningIndex()


@@ -31,6 +31,9 @@ type Block struct {
 	// After flagalloc, records whether flags are live at the end of the block.
 	FlagsLiveAtEnd bool

+	// A block that would be good to align (according to the optimizer's guesses).
+	Hotness Hotness
+
 	// Subsequent blocks, if any. The number and order depend on the block kind.
 	Succs []Edge
@@ -112,7 +115,7 @@ func (e Edge) String() string {
 }

 // BlockKind is the kind of SSA block.
-type BlockKind int16
+type BlockKind uint8

 // short form print
 func (b *Block) String() string {
@@ -426,3 +429,17 @@ const (
 	BranchUnknown = BranchPrediction(0)
 	BranchLikely  = BranchPrediction(+1)
 )

+type Hotness int8 // Could use negative numbers for specifically non-hot blocks, but don't, yet.
+
+const (
+	// These values are arranged in what seems to be order of increasing alignment importance.
+	// Currently only a few are relevant. Implicitly, they are all in a loop.
+	HotNotFlowIn Hotness = 1 << iota // This block is only reached by branches.
+	HotInitial                       // In the block order, the first one for a given loop. Not necessarily the topological header.
+	HotPgo                           // By PGO-based heuristics, this block occurs in a hot loop.
+
+	HotNot                 = 0
+	HotInitialNotFlowIn    = HotInitial | HotNotFlowIn          // Typically the first block of a rotated loop; the loop is entered with a branch (not to this block). No PGO.
+	HotPgoInitial          = HotPgo | HotInitial                // Special case: a single-block loop whose initial block is the header and has a flow-in entry, but PGO says it is hot.
+	HotPgoInitialNotFLowIn = HotPgo | HotInitial | HotNotFlowIn // PGO says it is hot, and the loop is rotated so flow enters the loop with a branch.
+)
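
A minimal sketch of how a consumer tests these bits (this mirrors the
check genssa performs later in this CL): mask with the combination and
compare against it, so all of its bits must be set.

	// Illustrative only: require both the HotPgo and HotInitial bits.
	if b.Hotness&HotPgoInitial == HotPgoInitial {
		// b begins a loop that PGO considers hot; a candidate for padding.
	}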


@@ -45,6 +45,7 @@ type Func struct {
 	laidout     bool  // Blocks are ordered
 	NoSplit     bool  // true if function is marked as nosplit. Used by schedule check pass.
 	dumpFileSeq uint8 // the sequence numbers of dump file. (%s_%02d__%s.dump", funcname, dumpFileSeq, phaseName)
+	IsPgoHot    bool

 	// when register allocation is done, maps value ids to locations
 	RegAlloc []Location


@@ -56,9 +56,20 @@ func loopRotate(f *Func) {
 			}
 			p = e.b
 		}
-		if p == nil || p == b {
+		if p == nil {
 			continue
 		}
+		p.Hotness |= HotInitial
+		if f.IsPgoHot {
+			p.Hotness |= HotPgo
+		}
+		// Blocks will be arranged so that p is ordered first, if it isn't already.
+		if p == b { // p is the header, already first (and also the only block in the loop)
+			continue
+		}
+		p.Hotness |= HotNotFlowIn
+
+		// The loop header b follows p.
 		after[p.ID] = []*Block{b}
 		for {
 			nextIdx := idToIdx[b.ID] + 1


@@ -12,9 +12,11 @@ import (
 	"sync"

 	"cmd/compile/internal/base"
+	"cmd/compile/internal/inline"
 	"cmd/compile/internal/ir"
 	"cmd/compile/internal/liveness"
 	"cmd/compile/internal/objw"
+	"cmd/compile/internal/pgoir"
 	"cmd/compile/internal/ssa"
 	"cmd/compile/internal/types"
 	"cmd/internal/obj"
@@ -296,8 +298,8 @@ const maxStackSize = 1 << 30
 // uses it to generate a plist,
 // and flushes that plist to machine code.
 // worker indicates which of the backend workers is doing the processing.
-func Compile(fn *ir.Func, worker int) {
-	f := buildssa(fn, worker)
+func Compile(fn *ir.Func, worker int, profile *pgoir.Profile) {
+	f := buildssa(fn, worker, inline.IsPgoHotFunc(fn, profile) || inline.HasPgoHotInline(fn))
 	// Note: check arg size to fix issue 25507.
 	if f.Frontend().(*ssafn).stksize >= maxStackSize || f.OwnAux.ArgWidth() >= maxStackSize {
 		largeStackFramesMu.Lock()


@@ -291,7 +291,7 @@ func (s *state) emitOpenDeferInfo() {

 // buildssa builds an SSA function for fn.
 // worker indicates which of the backend workers is doing the processing.
-func buildssa(fn *ir.Func, worker int) *ssa.Func {
+func buildssa(fn *ir.Func, worker int, isPgoHot bool) *ssa.Func {
 	name := ir.FuncName(fn)

 	abiSelf := abiForFunc(fn, ssaConfig.ABI0, ssaConfig.ABI1)
@@ -373,6 +373,7 @@ func buildssa(fn *ir.Func, worker int) *ssa.Func {
 	// Allocate starting block
 	s.f.Entry = s.f.NewBlock(ssa.BlockPlain)
 	s.f.Entry.Pos = fn.Pos()
+	s.f.IsPgoHot = isPgoHot

 	if printssa {
 		ssaDF := ssaDumpFile
@@ -7302,12 +7303,47 @@ func genssa(f *ssa.Func, pp *objw.Progs) {
 	var argLiveIdx int = -1 // argument liveness info index

+	// These control cache line alignment; if the required portion of
+	// a cache line is not available, then pad to obtain cache line
+	// alignment. Not implemented on all architectures, may not be
+	// useful on all architectures.
+	var hotAlign, hotRequire int64
+
+	if base.Debug.AlignHot > 0 {
+		switch base.Ctxt.Arch.Name {
+		// Enable this on a case-by-case basis, with benchmarking.
+		// Currently shown:
+		//   good for amd64
+		//   not helpful for Apple Silicon
+		case "amd64", "386":
+			// Align to 64 if 31 or fewer bytes remain in a cache line;
+			// this benchmarks a little better than always aligning, and also
+			// adds slightly less to the (PGO-compiled) binary size.
+			hotAlign = 64
+			hotRequire = 31
+		}
+	}
+
 	// Emit basic blocks
 	for i, b := range f.Blocks {
-		s.bstart[b.ID] = s.pp.Next
 		s.lineRunStart = nil
 		s.SetPos(s.pp.Pos.WithNotStmt()) // It needs a non-empty Pos, but cannot be a statement boundary (yet).

+		if hotAlign > 0 && b.Hotness&ssa.HotPgoInitial == ssa.HotPgoInitial {
+			// So far this has only been shown profitable for PGO-hot loop headers.
+			// The Hotness values allow distinctions between initial blocks that are "hot" or not, and "flow-in" or not.
+			// Currently only the initial blocks of loops are tagged in this way;
+			// there are no blocks tagged "pgo-hot" that are not also tagged "initial".
+			// TODO: more heuristics, more architectures.
+			p := s.pp.Prog(obj.APCALIGNMAX)
+			p.From.SetConst(hotAlign)
+			p.To.SetConst(hotRequire)
+		}
+
+		s.bstart[b.ID] = s.pp.Next
+
 		if idx, ok := argLiveBlockMap[b.ID]; ok && idx != argLiveIdx {
 			argLiveIdx = idx
 			p := s.pp.Prog(obj.APCDATA)
@@ -7466,7 +7502,8 @@ func genssa(f *ssa.Func, pp *objw.Progs) {
 	// going to emit anyway, and use those instructions instead of the
 	// inline marks.
 	for p := s.pp.Text; p != nil; p = p.Link {
-		if p.As == obj.ANOP || p.As == obj.AFUNCDATA || p.As == obj.APCDATA || p.As == obj.ATEXT || p.As == obj.APCALIGN || Arch.LinkArch.Family == sys.Wasm {
+		if p.As == obj.ANOP || p.As == obj.AFUNCDATA || p.As == obj.APCDATA || p.As == obj.ATEXT ||
+			p.As == obj.APCALIGN || p.As == obj.APCALIGNMAX || Arch.LinkArch.Family == sys.Wasm {
 			// Don't use 0-sized instructions as inline marks, because we need
 			// to identify inline mark instructions by pc offset.
 			// (Some of these instructions are sometimes zero-sized, sometimes not.
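
In a listing, the emitted prog pair reads as a conditional alignment
directive ahead of the hot loop header. With the amd64 values above,
and using the `PCALIGNMAX align,amount` form from the commit message
(rendering here is illustrative):

	PCALIGNMAX $64, $31 // pad to a 64-byte boundary only if 31 or fewer bytes are needed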


@@ -892,6 +892,7 @@ var optab = []Optab{
 	{obj.ADUFFZERO, C_NONE, C_NONE, C_NONE, C_SBRA, C_NONE, 5, 4, 0, 0, 0},   // same as AB/ABL
 	{obj.ADUFFCOPY, C_NONE, C_NONE, C_NONE, C_SBRA, C_NONE, 5, 4, 0, 0, 0},   // same as AB/ABL
 	{obj.APCALIGN, C_LCON, C_NONE, C_NONE, C_NONE, C_NONE, 0, 0, 0, 0, 0},    // align code
+	{obj.APCALIGNMAX, C_LCON, C_NONE, C_NONE, C_LCON, C_NONE, 0, 0, 0, 0, 0}, // align code, conditional
 }

 // Valid pstate field values, and value to use in instruction.
@@ -1109,13 +1110,8 @@ func span7(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) {
 			m = o.size(c.ctxt, p)
 			if m == 0 {
 				switch p.As {
-				case obj.APCALIGN:
-					alignedValue := p.From.Offset
-					m = pcAlignPadLength(ctxt, pc, alignedValue)
-					// Update the current text symbol alignment value.
-					if int32(alignedValue) > cursym.Func().Align {
-						cursym.Func().Align = int32(alignedValue)
-					}
+				case obj.APCALIGN, obj.APCALIGNMAX:
+					m = obj.AlignmentPadding(int32(pc), p, ctxt, cursym)
 					break
 				case obj.ANOP, obj.AFUNCDATA, obj.APCDATA:
 					continue
@@ -1181,9 +1177,8 @@ func span7(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) {
 			if m == 0 {
 				switch p.As {
-				case obj.APCALIGN:
-					alignedValue := p.From.Offset
-					m = pcAlignPadLength(ctxt, pc, alignedValue)
+				case obj.APCALIGN, obj.APCALIGNMAX:
+					m = obj.AlignmentPaddingLength(int32(pc), p, ctxt)
 					break
 				case obj.ANOP, obj.AFUNCDATA, obj.APCDATA:
 					continue
@@ -1214,9 +1209,8 @@ func span7(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) {
 		if sz > 4*len(out) {
 			log.Fatalf("out array in span7 is too small, need at least %d for %v", sz/4, p)
 		}
-		if p.As == obj.APCALIGN {
-			alignedValue := p.From.Offset
-			v := pcAlignPadLength(c.ctxt, p.Pc, alignedValue)
+		if p.As == obj.APCALIGN || p.As == obj.APCALIGNMAX {
+			v := obj.AlignmentPaddingLength(int32(p.Pc), p, c.ctxt)
 			for i = 0; i < int(v/4); i++ {
 				// emit ANOOP instruction by the padding size
 				c.ctxt.Arch.ByteOrder.PutUint32(bp, OP_NOOP)
@@ -3316,6 +3310,7 @@ func buildop(ctxt *obj.Link) {
 			obj.AUNDEF,
 			obj.AFUNCDATA,
 			obj.APCALIGN,
+			obj.APCALIGNMAX,
 			obj.APCDATA,
 			obj.ADUFFZERO,
 			obj.ADUFFCOPY:


@@ -416,6 +416,7 @@ const (
 	AJMP
 	ANOP
 	APCALIGN
+	APCALIGNMAX // currently x86, amd64 and arm64
 	APCDATA
 	ARET
 	AGETCALLERPC


@@ -6,6 +6,7 @@ package obj

 import (
 	"bytes"
+	"cmd/internal/objabi"
 	"fmt"
 	"internal/abi"
 	"internal/buildcfg"
@@ -642,6 +643,7 @@ var Anames = []string{
 	"JMP",
 	"NOP",
 	"PCALIGN",
+	"PCALIGNMAX",
 	"PCDATA",
 	"RET",
 	"GETCALLERPC",
@@ -667,3 +669,62 @@ func abiDecorate(a *Addr, abiDetail bool) string {
 	}
 	return fmt.Sprintf("<%s>", a.Sym.ABI())
 }

+// AlignmentPadding returns the number of bytes to add to align code as requested.
+// Alignment is restricted to powers of 2 between 8 and 2048 inclusive.
+//
+//	pc:     current offset in function, in bytes
+//	p:      a PCALIGN or PCALIGNMAX prog
+//	ctxt:   the context, for the current function
+//	cursym: the current function being assembled
+//
+// It returns the number of bytes of padding needed and
+// updates the minimum alignment for the function.
+func AlignmentPadding(pc int32, p *Prog, ctxt *Link, cursym *LSym) int {
+	v := AlignmentPaddingLength(pc, p, ctxt)
+	requireAlignment(p.From.Offset, ctxt, cursym)
+	return v
+}
+
+// AlignmentPaddingLength is the number of bytes to add to align code as requested.
+// Alignment is restricted to powers of 2 between 8 and 2048 inclusive.
+// This only computes the length; lacking a cursym parameter, it does not
+// update the current function's own required alignment.
+//
+//	pc:   current offset in function, in bytes
+//	p:    a PCALIGN or PCALIGNMAX prog
+//	ctxt: the context, for the current function
+//
+// It returns the number of bytes of padding needed.
+func AlignmentPaddingLength(pc int32, p *Prog, ctxt *Link) int {
+	a := p.From.Offset
+	if !((a&(a-1) == 0) && 8 <= a && a <= 2048) {
+		ctxt.Diag("alignment value of an instruction must be a power of two and in the range [8, 2048], got %d\n", a)
+		return 0
+	}
+	pc64 := int64(pc)
+	lob := pc64 & (a - 1) // Low Order Bits -- if not zero, then not aligned
+	if p.As == APCALIGN {
+		if lob != 0 {
+			return int(a - lob)
+		}
+		return 0
+	}
+	// APCALIGNMAX: emit at most s bytes of padding to obtain alignment.
+	s := p.To.Offset
+	if s < 0 || s >= a {
+		ctxt.Diag("PCALIGNMAX 'amount' %d must be non-negative and smaller than the alignment %d\n", s, a)
+		return 0
+	}
+	if s >= a-lob {
+		return int(a - lob)
+	}
+	return 0
+}
+
+// requireAlignment ensures that the function is aligned enough to support
+// the required code alignment.
+func requireAlignment(a int64, ctxt *Link, cursym *LSym) {
+	// TODO: remove explicit knowledge about AIX.
+	if ctxt.Headtype != objabi.Haix && cursym.Func().Align < int32(a) {
+		cursym.Func().Align = int32(a)
+	}
+}
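
Working the arithmetic for the amd64 setting (PCALIGNMAX 64,31, so
a=64 and s=31) makes the cutoff concrete; the offsets are hypothetical:

	pc & 63 (lob)   a-lob   padding returned
	16              48      0   (48 > 31: too far from the boundary to be worth it)
	40              24      24  (24 <= 31: pad to the 64-byte boundary)
	0               64      0   (already aligned; s >= a-lob fails since 31 < 64)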


@@ -2036,29 +2036,21 @@ type nopPad struct {
 	n int32 // Size of the pad
 }

-// Padding bytes to add to align code as requested.
-// Alignment is restricted to powers of 2 between 8 and 2048 inclusive.
-//
-// pc: current offset in function, in bytes
-// a: requested alignment, in bytes
-// cursym: current function being assembled
-// returns number of bytes of padding needed
-func addpad(pc, a int64, ctxt *obj.Link, cursym *obj.LSym) int {
+// requireAlignment ensures that the function alignment is at
+// least as high as a, which should be a power of two
+// and between 8 and 2048, inclusive.
+//
+// The boolean result indicates whether a met those constraints.
+func requireAlignment(a int64, ctxt *obj.Link, cursym *obj.LSym) bool {
 	if !((a&(a-1) == 0) && 8 <= a && a <= 2048) {
 		ctxt.Diag("alignment value of an instruction must be a power of two and in the range [8, 2048], got %d\n", a)
-		return 0
+		return false
 	}

 	// By default function alignment is 32 bytes for amd64
 	if cursym.Func().Align < int32(a) {
 		cursym.Func().Align = int32(a)
 	}
-
-	if pc&(a-1) != 0 {
-		return int(a - (pc & (a - 1)))
-	}
-	return 0
+	return true
 }
func span6(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) { func span6(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) {
@ -2144,17 +2136,17 @@ func span6(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) {
c0 := c c0 := c
c = pjc.padJump(ctxt, s, p, c) c = pjc.padJump(ctxt, s, p, c)
if p.As == obj.APCALIGN { if p.As == obj.APCALIGN || p.As == obj.APCALIGNMAX {
aln := p.From.Offset v := obj.AlignmentPadding(c, p, ctxt, s)
v := addpad(int64(c), aln, ctxt, s)
if v > 0 { if v > 0 {
s.Grow(int64(c) + int64(v)) s.Grow(int64(c) + int64(v))
fillnop(s.P[c:], int(v)) fillnop(s.P[c:], int(v))
} }
p.Pc = int64(c)
c += int32(v) c += int32(v)
pPrev = p pPrev = p
continue continue
} }
if maxLoopPad > 0 && p.Back&branchLoopHead != 0 && c&(loopAlign-1) != 0 { if maxLoopPad > 0 && p.Back&branchLoopHead != 0 && c&(loopAlign-1) != 0 {