cmd/compile: call generated size-specialized malloc functions directly

This change creates calls to size-specialized malloc functions instead
of calls to newObject when we know the size of the allocation at
compile time. Most of it is a matter of calling the newObject function
(which will create calls to the size-specialized functions) rather than
the newObjectNonSpecialized function (which won't). In newHeapaddr's
small, non-pointer case, we create a non-specialized newObject call and
transform it into the appropriate size-specialized function when we
produce the mallocgc call in flushPendingHeapAllocations.
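
To make the lowering concrete, here is a rough sketch (illustrative
only, not code from this CL; the specific helper name, the size class,
and the (size, type, needZero) signature are inferred from the call
sites added below) of what the compiler emits for a small, pointer-free
allocation when the SizeSpecializedMalloc experiment is enabled:

    package p

    type S struct{ a, b, c int64 } // 24 bytes, no pointers

    func f() *S {
        // Before: new(S) lowers to the generic allocator,
        //     runtime.newobject(typeOf(S))
        // which dispatches on size inside runtime.mallocgc.
        //
        // After: the compiler picks the helper for S's size class at
        // compile time and calls it directly, roughly
        //     runtime.mallocgcSmallNoScanSC3(24, typeOf(S), true)
        // (typeOf(S) stands for the *_type descriptor; true is needZero).
        return new(S)
    }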

We also have to update some of the rewrites in generic.rules so that
rules that currently apply to newObject apply to the size-specialized
functions as well.

The messiest part is adjusting the number of frames we skip when saving
the memory profiler stack, because the call to profilealloc is two
frames shallower in the size-specialized malloc functions than it is
when newObject calls mallocgc. A number of tests have been adjusted to
account for that.

Change-Id: I6a6a6964c9037fb6719e392c4a498ed700b617d7
Reviewed-on: https://go-review.googlesource.com/c/go/+/707856
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Michael Matloob <matloob@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Keith Randall <khr@golang.org>
Michael Matloob 2025-09-29 17:26:49 -04:00
parent 80f3bb5516
commit 19a30ea3f2
16 changed files with 228 additions and 116 deletions


@@ -13,47 +13,50 @@ import (
var Syms symsStruct
type symsStruct struct {
AssertE2I *obj.LSym
AssertE2I2 *obj.LSym
Asanread *obj.LSym
Asanwrite *obj.LSym
CgoCheckMemmove *obj.LSym
CgoCheckPtrWrite *obj.LSym
CheckPtrAlignment *obj.LSym
Deferproc *obj.LSym
Deferprocat *obj.LSym
DeferprocStack *obj.LSym
Deferreturn *obj.LSym
Duffcopy *obj.LSym
Duffzero *obj.LSym
GCWriteBarrier [8]*obj.LSym
Goschedguarded *obj.LSym
Growslice *obj.LSym
InterfaceSwitch *obj.LSym
MallocGC *obj.LSym
Memmove *obj.LSym
Msanread *obj.LSym
Msanwrite *obj.LSym
Msanmove *obj.LSym
Newobject *obj.LSym
Newproc *obj.LSym
PanicBounds *obj.LSym
PanicExtend *obj.LSym
Panicdivide *obj.LSym
Panicshift *obj.LSym
PanicdottypeE *obj.LSym
PanicdottypeI *obj.LSym
Panicnildottype *obj.LSym
Panicoverflow *obj.LSym
Racefuncenter *obj.LSym
Racefuncexit *obj.LSym
Raceread *obj.LSym
Racereadrange *obj.LSym
Racewrite *obj.LSym
Racewriterange *obj.LSym
TypeAssert *obj.LSym
WBZero *obj.LSym
WBMove *obj.LSym
AssertE2I *obj.LSym
AssertE2I2 *obj.LSym
Asanread *obj.LSym
Asanwrite *obj.LSym
CgoCheckMemmove *obj.LSym
CgoCheckPtrWrite *obj.LSym
CheckPtrAlignment *obj.LSym
Deferproc *obj.LSym
Deferprocat *obj.LSym
DeferprocStack *obj.LSym
Deferreturn *obj.LSym
Duffcopy *obj.LSym
Duffzero *obj.LSym
GCWriteBarrier [8]*obj.LSym
Goschedguarded *obj.LSym
Growslice *obj.LSym
InterfaceSwitch *obj.LSym
MallocGC *obj.LSym
MallocGCSmallNoScan [27]*obj.LSym
MallocGCSmallScanNoHeader [27]*obj.LSym
MallocGCTiny [16]*obj.LSym
Memmove *obj.LSym
Msanread *obj.LSym
Msanwrite *obj.LSym
Msanmove *obj.LSym
Newobject *obj.LSym
Newproc *obj.LSym
PanicBounds *obj.LSym
PanicExtend *obj.LSym
Panicdivide *obj.LSym
Panicshift *obj.LSym
PanicdottypeE *obj.LSym
PanicdottypeI *obj.LSym
Panicnildottype *obj.LSym
Panicoverflow *obj.LSym
Racefuncenter *obj.LSym
Racefuncexit *obj.LSym
Raceread *obj.LSym
Racereadrange *obj.LSym
Racewrite *obj.LSym
Racewriterange *obj.LSym
TypeAssert *obj.LSym
WBZero *obj.LSym
WBMove *obj.LSym
// Wasm
SigPanic *obj.LSym
Staticuint64s *obj.LSym


@@ -2065,28 +2065,32 @@
// for rewriting results of some late-expanded rewrites (below)
(SelectN [n] m:(MakeResult ___)) => m.Args[n]
// TODO(matloob): Try out having non-zeroing mallocs for pointerless
// memory, and leaving the zeroing here. Then the compiler can remove
// the zeroing if the user has explicit writes to the whole object.
// for late-expanded calls, recognize newobject and remove zeroing and nilchecks
(Zero (SelectN [0] call:(StaticLECall _ _)) mem:(SelectN [1] call))
&& isSameCall(call.Aux, "runtime.newobject")
(Zero (SelectN [0] call:(StaticLECall ___)) mem:(SelectN [1] call))
&& isMalloc(call.Aux)
=> mem
(Store (SelectN [0] call:(StaticLECall _ _)) x mem:(SelectN [1] call))
(Store (SelectN [0] call:(StaticLECall ___)) x mem:(SelectN [1] call))
&& isConstZero(x)
&& isSameCall(call.Aux, "runtime.newobject")
&& isMalloc(call.Aux)
=> mem
(Store (OffPtr (SelectN [0] call:(StaticLECall _ _))) x mem:(SelectN [1] call))
(Store (OffPtr (SelectN [0] call:(StaticLECall ___))) x mem:(SelectN [1] call))
&& isConstZero(x)
&& isSameCall(call.Aux, "runtime.newobject")
&& isMalloc(call.Aux)
=> mem
(NilCheck ptr:(SelectN [0] call:(StaticLECall _ _)) _)
&& isSameCall(call.Aux, "runtime.newobject")
(NilCheck ptr:(SelectN [0] call:(StaticLECall ___)) _)
&& isMalloc(call.Aux)
&& warnRule(fe.Debug_checknil(), v, "removed nil check")
=> ptr
(NilCheck ptr:(OffPtr (SelectN [0] call:(StaticLECall _ _))) _)
&& isSameCall(call.Aux, "runtime.newobject")
(NilCheck ptr:(OffPtr (SelectN [0] call:(StaticLECall ___))) _)
&& isMalloc(call.Aux)
&& warnRule(fe.Debug_checknil(), v, "removed nil check")
=> ptr


@@ -456,6 +456,26 @@ func isSameCall(aux Aux, name string) bool {
return fn != nil && fn.String() == name
}
func isMalloc(aux Aux) bool {
return isNewObject(aux) || isSpecializedMalloc(aux)
}
func isNewObject(aux Aux) bool {
fn := aux.(*AuxCall).Fn
return fn != nil && fn.String() == "runtime.newobject"
}
func isSpecializedMalloc(aux Aux) bool {
fn := aux.(*AuxCall).Fn
if fn == nil {
return false
}
name := fn.String()
return strings.HasPrefix(name, "runtime.mallocgcSmallNoScanSC") ||
strings.HasPrefix(name, "runtime.mallocgcSmallScanNoHeaderSC") ||
strings.HasPrefix(name, "runtime.mallocTiny")
}
// canLoadUnaligned reports if the architecture supports unaligned load operations.
func canLoadUnaligned(c *Config) bool {
return c.ctxt.Arch.Alignment == 1

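Aside (illustration, not part of the CL): generalizing the rules above
from isSameCall(call.Aux, "runtime.newobject") to isMalloc means the
dead-store and nil-check eliminations keep firing after an allocation
has been lowered to one of the size-specialized calls. A minimal
sketch, assuming the experiment is enabled:

    package p

    type buf struct{ b [32]byte } // small, pointer-free: lowered to a specialized malloc

    func newBuf() *buf {
        p := new(buf)    // freshly allocated memory is already zeroed
        p.b = [32]byte{} // the Zero/Store rules delete this redundant store,
                         // and the NilCheck rule drops the implied nil check on p
        return p
    }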

@@ -21318,8 +21318,8 @@ func rewriteValuegeneric_OpNilCheck(v *Value) bool {
v.copyOf(ptr)
return true
}
// match: (NilCheck ptr:(SelectN [0] call:(StaticLECall _ _)) _)
// cond: isSameCall(call.Aux, "runtime.newobject") && warnRule(fe.Debug_checknil(), v, "removed nil check")
// match: (NilCheck ptr:(SelectN [0] call:(StaticLECall ___)) _)
// cond: isMalloc(call.Aux) && warnRule(fe.Debug_checknil(), v, "removed nil check")
// result: ptr
for {
ptr := v_0
@@ -21327,14 +21327,17 @@ func rewriteValuegeneric_OpNilCheck(v *Value) bool {
break
}
call := ptr.Args[0]
if call.Op != OpStaticLECall || len(call.Args) != 2 || !(isSameCall(call.Aux, "runtime.newobject") && warnRule(fe.Debug_checknil(), v, "removed nil check")) {
if call.Op != OpStaticLECall {
break
}
if !(isMalloc(call.Aux) && warnRule(fe.Debug_checknil(), v, "removed nil check")) {
break
}
v.copyOf(ptr)
return true
}
// match: (NilCheck ptr:(OffPtr (SelectN [0] call:(StaticLECall _ _))) _)
// cond: isSameCall(call.Aux, "runtime.newobject") && warnRule(fe.Debug_checknil(), v, "removed nil check")
// match: (NilCheck ptr:(OffPtr (SelectN [0] call:(StaticLECall ___))) _)
// cond: isMalloc(call.Aux) && warnRule(fe.Debug_checknil(), v, "removed nil check")
// result: ptr
for {
ptr := v_0
@@ -21346,7 +21349,10 @@ func rewriteValuegeneric_OpNilCheck(v *Value) bool {
break
}
call := ptr_0.Args[0]
if call.Op != OpStaticLECall || len(call.Args) != 2 || !(isSameCall(call.Aux, "runtime.newobject") && warnRule(fe.Debug_checknil(), v, "removed nil check")) {
if call.Op != OpStaticLECall {
break
}
if !(isMalloc(call.Aux) && warnRule(fe.Debug_checknil(), v, "removed nil check")) {
break
}
v.copyOf(ptr)
@@ -32463,27 +32469,27 @@ func rewriteValuegeneric_OpStore(v *Value) bool {
v.AddArg3(dst, e, mem)
return true
}
// match: (Store (SelectN [0] call:(StaticLECall _ _)) x mem:(SelectN [1] call))
// cond: isConstZero(x) && isSameCall(call.Aux, "runtime.newobject")
// match: (Store (SelectN [0] call:(StaticLECall ___)) x mem:(SelectN [1] call))
// cond: isConstZero(x) && isMalloc(call.Aux)
// result: mem
for {
if v_0.Op != OpSelectN || auxIntToInt64(v_0.AuxInt) != 0 {
break
}
call := v_0.Args[0]
if call.Op != OpStaticLECall || len(call.Args) != 2 {
if call.Op != OpStaticLECall {
break
}
x := v_1
mem := v_2
if mem.Op != OpSelectN || auxIntToInt64(mem.AuxInt) != 1 || call != mem.Args[0] || !(isConstZero(x) && isSameCall(call.Aux, "runtime.newobject")) {
if mem.Op != OpSelectN || auxIntToInt64(mem.AuxInt) != 1 || call != mem.Args[0] || !(isConstZero(x) && isMalloc(call.Aux)) {
break
}
v.copyOf(mem)
return true
}
// match: (Store (OffPtr (SelectN [0] call:(StaticLECall _ _))) x mem:(SelectN [1] call))
// cond: isConstZero(x) && isSameCall(call.Aux, "runtime.newobject")
// match: (Store (OffPtr (SelectN [0] call:(StaticLECall ___))) x mem:(SelectN [1] call))
// cond: isConstZero(x) && isMalloc(call.Aux)
// result: mem
for {
if v_0.Op != OpOffPtr {
@@ -32494,12 +32500,12 @@ func rewriteValuegeneric_OpStore(v *Value) bool {
break
}
call := v_0_0.Args[0]
if call.Op != OpStaticLECall || len(call.Args) != 2 {
if call.Op != OpStaticLECall {
break
}
x := v_1
mem := v_2
if mem.Op != OpSelectN || auxIntToInt64(mem.AuxInt) != 1 || call != mem.Args[0] || !(isConstZero(x) && isSameCall(call.Aux, "runtime.newobject")) {
if mem.Op != OpSelectN || auxIntToInt64(mem.AuxInt) != 1 || call != mem.Args[0] || !(isConstZero(x) && isMalloc(call.Aux)) {
break
}
v.copyOf(mem)
@@ -36842,19 +36848,19 @@ func rewriteValuegeneric_OpZero(v *Value) bool {
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
// match: (Zero (SelectN [0] call:(StaticLECall _ _)) mem:(SelectN [1] call))
// cond: isSameCall(call.Aux, "runtime.newobject")
// match: (Zero (SelectN [0] call:(StaticLECall ___)) mem:(SelectN [1] call))
// cond: isMalloc(call.Aux)
// result: mem
for {
if v_0.Op != OpSelectN || auxIntToInt64(v_0.AuxInt) != 0 {
break
}
call := v_0.Args[0]
if call.Op != OpStaticLECall || len(call.Args) != 2 {
if call.Op != OpStaticLECall {
break
}
mem := v_1
if mem.Op != OpSelectN || auxIntToInt64(mem.AuxInt) != 1 || call != mem.Args[0] || !(isSameCall(call.Aux, "runtime.newobject")) {
if mem.Op != OpSelectN || auxIntToInt64(mem.AuxInt) != 1 || call != mem.Args[0] || !(isMalloc(call.Aux)) {
break
}
v.copyOf(mem)


@@ -798,7 +798,16 @@ func IsNewObject(v *Value, select1 []*Value) (mem *Value, ok bool) {
if call.Op != OpStaticCall {
return nil, false
}
if !isSameCall(call.Aux, "runtime.newobject") {
// Check for newobject calls, or for newobject calls that have been transformed into size-specialized malloc calls.
// Calls whose return type is unsafe.Pointer may have been produced by flushPendingHeapAllocations
// in the SSA generator, so they may not originally have been newobject calls.
var numParameters int64
switch {
case isNewObject(call.Aux):
numParameters = 1
case isSpecializedMalloc(call.Aux) && !v.Type.IsUnsafePtr():
numParameters = 3
default:
return nil, false
}
if f.ABIDefault == f.ABI1 && len(c.intParamRegs) >= 1 {
@@ -813,7 +822,7 @@ func IsNewObject(v *Value, select1 []*Value) (mem *Value, ok bool) {
if v.Args[0].Args[0].Op != OpSP {
return nil, false
}
if v.Args[0].AuxInt != c.ctxt.Arch.FixedFrameSize+c.RegSize { // offset of return value
if v.Args[0].AuxInt != c.ctxt.Arch.FixedFrameSize+numParameters*c.RegSize { // offset of return value
return nil, false
}
return mem, true


@@ -12,6 +12,7 @@ import (
"go/constant"
"html"
"internal/buildcfg"
"internal/runtime/gc"
"os"
"path/filepath"
"slices"
@@ -124,6 +125,15 @@ func InitConfig() {
ir.Syms.Goschedguarded = typecheck.LookupRuntimeFunc("goschedguarded")
ir.Syms.Growslice = typecheck.LookupRuntimeFunc("growslice")
ir.Syms.InterfaceSwitch = typecheck.LookupRuntimeFunc("interfaceSwitch")
for i := 1; i < len(ir.Syms.MallocGCSmallNoScan); i++ {
ir.Syms.MallocGCSmallNoScan[i] = typecheck.LookupRuntimeFunc(fmt.Sprintf("mallocgcSmallNoScanSC%d", i))
}
for i := 1; i < len(ir.Syms.MallocGCSmallScanNoHeader); i++ {
ir.Syms.MallocGCSmallScanNoHeader[i] = typecheck.LookupRuntimeFunc(fmt.Sprintf("mallocgcSmallScanNoHeaderSC%d", i))
}
for i := 1; i < len(ir.Syms.MallocGCTiny); i++ {
ir.Syms.MallocGCTiny[i] = typecheck.LookupRuntimeFunc(fmt.Sprintf("mallocTiny%d", i))
}
ir.Syms.MallocGC = typecheck.LookupRuntimeFunc("mallocgc")
ir.Syms.Memmove = typecheck.LookupRuntimeFunc("memmove")
ir.Syms.Msanread = typecheck.LookupRuntimeFunc("msanread")
@@ -690,7 +700,7 @@ func allocAlign(t *types.Type) int64 {
func (s *state) newHeapaddr(n *ir.Name) {
size := allocSize(n.Type())
if n.Type().HasPointers() || size >= maxAggregatedHeapAllocation || size == 0 {
s.setHeapaddr(n.Pos(), n, s.newObject(n.Type(), nil))
s.setHeapaddr(n.Pos(), n, s.newObject(n.Type()))
return
}
@@ -709,7 +719,7 @@ func (s *state) newHeapaddr(n *ir.Name) {
// Make an allocation, but the type being allocated is just
// the first pending object. We will come back and update it
// later if needed.
allocCall = s.newObject(n.Type(), nil)
allocCall = s.newObjectNonSpecialized(n.Type(), nil)
} else {
allocCall = s.pendingHeapAllocations[0].Args[0]
}
@@ -762,7 +772,11 @@ func (s *state) flushPendingHeapAllocations() {
s.constBool(true), // needZero TODO: false is ok?
call.Args[1], // memory
}
call.Aux = ssa.StaticAuxCall(ir.Syms.MallocGC, s.f.ABIDefault.ABIAnalyzeTypes(
mallocSym := ir.Syms.MallocGC
if specialMallocSym := s.specializedMallocSym(size, false); specialMallocSym != nil {
mallocSym = specialMallocSym
}
call.Aux = ssa.StaticAuxCall(mallocSym, s.f.ABIDefault.ABIAnalyzeTypes(
[]*types.Type{args[0].Type, args[1].Type, args[2].Type},
[]*types.Type{types.Types[types.TUNSAFEPTR]},
))
@@ -774,6 +788,43 @@ func (s *state) flushPendingHeapAllocations() {
ptr.Type = types.Types[types.TUNSAFEPTR]
}
func (s *state) specializedMallocSym(size int64, hasPointers bool) *obj.LSym {
if !s.sizeSpecializedMallocEnabled() {
return nil
}
ptrSize := s.config.PtrSize
ptrBits := ptrSize * 8
minSizeForMallocHeader := ptrSize * ptrBits
heapBitsInSpan := size <= minSizeForMallocHeader
if !heapBitsInSpan {
return nil
}
divRoundUp := func(n, a uintptr) uintptr { return (n + a - 1) / a }
sizeClass := gc.SizeToSizeClass8[divRoundUp(uintptr(size), gc.SmallSizeDiv)]
if hasPointers {
return ir.Syms.MallocGCSmallScanNoHeader[sizeClass]
}
if size < gc.TinySize {
return ir.Syms.MallocGCTiny[size]
}
return ir.Syms.MallocGCSmallNoScan[sizeClass]
}
func (s *state) sizeSpecializedMallocEnabled() bool {
if base.Flag.CompilingRuntime {
// The compiler forces the values of the asan, msan, and race flags to false if
// we're compiling the runtime, so we lose the information about whether we're
// building in asan, msan, or race mode. Because the specialized functions don't
// work in that mode, just turn it off in that case.
// TODO(matloob): Save the information about whether the flags were passed in
// originally so we can turn off size-specialized malloc in that case instead
// of using Instrumenting below. Then we can remove this condition.
return false
}
return buildcfg.Experiment.SizeSpecializedMalloc && !base.Flag.Cfg.Instrumenting
}
// setHeapaddr allocates a new PAUTO variable to store ptr (which must be non-nil)
// and then sets it as n's heap address.
func (s *state) setHeapaddr(pos src.XPos, n *ir.Name, ptr *ssa.Value) {
@@ -796,7 +847,24 @@ func (s *state) setHeapaddr(pos src.XPos, n *ir.Name, ptr *ssa.Value) {
}
// newObject returns an SSA value denoting new(typ).
func (s *state) newObject(typ *types.Type, rtype *ssa.Value) *ssa.Value {
func (s *state) newObject(typ *types.Type) *ssa.Value {
if typ.Size() == 0 {
return s.newValue1A(ssa.OpAddr, types.NewPtr(typ), ir.Syms.Zerobase, s.sb)
}
rtype := s.reflectType(typ)
if specialMallocSym := s.specializedMallocSym(typ.Size(), typ.HasPointers()); specialMallocSym != nil {
return s.rtcall(specialMallocSym, true, []*types.Type{types.NewPtr(typ)},
s.constInt(types.Types[types.TUINTPTR], typ.Size()),
rtype,
s.constBool(true),
)[0]
}
return s.rtcall(ir.Syms.Newobject, true, []*types.Type{types.NewPtr(typ)}, rtype)[0]
}
// newObjectNonSpecialized returns an SSA value denoting new(typ). It does
// not produce size-specialized malloc functions.
func (s *state) newObjectNonSpecialized(typ *types.Type, rtype *ssa.Value) *ssa.Value {
if typ.Size() == 0 {
return s.newValue1A(ssa.OpAddr, types.NewPtr(typ), ir.Syms.Zerobase, s.sb)
}
@@ -3594,11 +3662,10 @@ func (s *state) exprCheckPtr(n ir.Node, checkPtrOK bool) *ssa.Value {
case ir.ONEW:
n := n.(*ir.UnaryExpr)
var rtype *ssa.Value
if x, ok := n.X.(*ir.DynamicType); ok && x.Op() == ir.ODYNAMICTYPE {
rtype = s.expr(x.RType)
return s.newObjectNonSpecialized(n.Type().Elem(), s.expr(x.RType))
}
return s.newObject(n.Type().Elem(), rtype)
return s.newObject(n.Type().Elem())
case ir.OUNSAFEADD:
n := n.(*ir.BinaryExpr)


@@ -90,6 +90,7 @@ var bootstrapDirs = []string{
"internal/platform",
"internal/profile",
"internal/race",
"internal/runtime/gc",
"internal/saferio",
"internal/syscall/unix",
"internal/types/errors",


@@ -49,7 +49,7 @@ const (
// desired maximum number of frames after expansion.
// This should be at least as large as the largest skip value
// used for profiling; otherwise stacks may be truncated inconsistently
maxSkip = 6
maxSkip = 8
// maxProfStackDepth is the highest valid value for debug.profstackdepth.
// It's used for the bucket.stk func.
@@ -444,7 +444,7 @@ func mProf_Malloc(mp *m, p unsafe.Pointer, size uintptr) {
}
// Only use the part of mp.profStack we need and ignore the extra space
// reserved for delayed inline expansion with frame pointer unwinding.
nstk := callers(5, mp.profStack[:debug.profstackdepth])
nstk := callers(3, mp.profStack[:debug.profstackdepth+2])
index := (mProfCycle.read() + 2) % uint32(len(memRecord{}.future))
b := stkbucket(memProfile, size, mp.profStack[:nstk], true)


@@ -97,25 +97,25 @@ func TestMemoryProfiler(t *testing.T) {
legacy string
}{{
stk: []string{"runtime/pprof.allocatePersistent1K", "runtime/pprof.TestMemoryProfiler"},
legacy: fmt.Sprintf(`%v: %v \[%v: %v\] @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
legacy: fmt.Sprintf(`%v: %v \[%v: %v\] @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+( 0x[0-9,a-f]+ 0x[0-9,a-f]+)?
# 0x[0-9,a-f]+ runtime/pprof\.allocatePersistent1K\+0x[0-9,a-f]+ .*runtime/pprof/mprof_test\.go:48
# 0x[0-9,a-f]+ runtime/pprof\.TestMemoryProfiler\+0x[0-9,a-f]+ .*runtime/pprof/mprof_test\.go:87
`, 32*memoryProfilerRun, 1024*memoryProfilerRun, 32*memoryProfilerRun, 1024*memoryProfilerRun),
}, {
stk: []string{"runtime/pprof.allocateTransient1M", "runtime/pprof.TestMemoryProfiler"},
legacy: fmt.Sprintf(`0: 0 \[%v: %v\] @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
legacy: fmt.Sprintf(`0: 0 \[%v: %v\] @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
# 0x[0-9,a-f]+ runtime/pprof\.allocateTransient1M\+0x[0-9,a-f]+ .*runtime/pprof/mprof_test.go:25
# 0x[0-9,a-f]+ runtime/pprof\.TestMemoryProfiler\+0x[0-9,a-f]+ .*runtime/pprof/mprof_test.go:84
`, (1<<10)*memoryProfilerRun, (1<<20)*memoryProfilerRun),
}, {
stk: []string{"runtime/pprof.allocateTransient2M", "runtime/pprof.TestMemoryProfiler"},
legacy: fmt.Sprintf(`0: 0 \[%v: %v\] @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
legacy: fmt.Sprintf(`0: 0 \[%v: %v\] @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
# 0x[0-9,a-f]+ runtime/pprof\.allocateTransient2M\+0x[0-9,a-f]+ .*runtime/pprof/mprof_test.go:31
# 0x[0-9,a-f]+ runtime/pprof\.TestMemoryProfiler\+0x[0-9,a-f]+ .*runtime/pprof/mprof_test.go:85
`, memoryProfilerRun, (2<<20)*memoryProfilerRun),
}, {
stk: []string{"runtime/pprof.allocateTransient2MInline", "runtime/pprof.TestMemoryProfiler"},
legacy: fmt.Sprintf(`0: 0 \[%v: %v\] @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
legacy: fmt.Sprintf(`0: 0 \[%v: %v\] @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
# 0x[0-9,a-f]+ runtime/pprof\.allocateTransient2MInline\+0x[0-9,a-f]+ .*runtime/pprof/mprof_test.go:35
# 0x[0-9,a-f]+ runtime/pprof\.TestMemoryProfiler\+0x[0-9,a-f]+ .*runtime/pprof/mprof_test.go:86
`, memoryProfilerRun, (2<<20)*memoryProfilerRun),


@@ -2585,7 +2585,7 @@ func TestProfilerStackDepth(t *testing.T) {
t.Errorf("want stack depth = %d, got %d", depth, len(stk))
}
if rootFn, wantFn := stk[depth-1], "runtime/pprof.produceProfileEvents"; rootFn != wantFn {
if rootFn, wantFn := stk[depth-1], "runtime/pprof.allocDeep"; rootFn != wantFn {
t.Errorf("want stack stack root %s, got %v", wantFn, rootFn)
}
}
@@ -2660,7 +2660,7 @@ func goroutineDeep(t *testing.T, n int) {
// guaranteed to have exactly the desired depth with produceProfileEvents as
// their root frame which is expected by TestProfilerStackDepth.
func produceProfileEvents(t *testing.T, depth int) {
allocDeep(depth - 1) // -1 for produceProfileEvents, **
allocDeep(depth + 1) // +1 for produceProfileEvents, **
blockChanDeep(t, depth-2) // -2 for produceProfileEvents, **, chanrecv1
blockMutexDeep(t, depth-2) // -2 for produceProfileEvents, **, Unlock
memSink = nil


@@ -23,7 +23,7 @@ func CountBytes(s []byte) int {
func ToByteSlice() []byte { // Issue #24698
// amd64:`LEAQ\ttype:\[3\]uint8`
// amd64:`CALL\truntime\.newobject`
// amd64:`CALL\truntime\.mallocTiny3`
// amd64:-`.*runtime.stringtoslicebyte`
return []byte("foo")
}


@@ -19,7 +19,7 @@ type T struct{ M string }
var b bool
func f1(q *Q, xx []byte) interface{} { // ERROR "live at call to newobject: xx$" "live at entry to f1: xx$"
func f1(q *Q, xx []byte) interface{} { // ERROR "live at call to mallocgcSmallScanNoHeaderSC[0-9]+: xx$" "live at entry to f1: xx$"
// xx was copied from the stack to the heap on the previous line:
// xx was live for the first two prints but then it switched to &xx
// being live. We should not see plain xx again.
@@ -36,7 +36,7 @@ func f1(q *Q, xx []byte) interface{} { // ERROR "live at call to newobject: xx$"
//go:noinline
func f2(d []byte, n int) (odata, res []byte, e interface{}) { // ERROR "live at entry to f2: d$"
if n > len(d) {
return d, nil, &T{M: "hello"} // ERROR "live at call to newobject: d"
return d, nil, &T{M: "hello"} // ERROR "live at call to mallocgcSmallScanNoHeaderSC[0-9]+: d"
}
res = d[:n]
odata = d[n:]


@@ -48,22 +48,23 @@ func testInterleavedAllocations() error {
const iters = 50000
// Sizes of the allocations performed by each experiment.
frames := []string{"main.allocInterleaved1", "main.allocInterleaved2", "main.allocInterleaved3"}
leafFrame := "main.allocInterleaved"
// Pass if at least one of three experiments has no errors. Use a separate
// function for each experiment to identify each experiment in the profile.
allocInterleaved1(iters)
if checkAllocations(getMemProfileRecords(), frames[0:1], iters, allocInterleavedSizes) == nil {
if checkAllocations(getMemProfileRecords(), leafFrame, frames[0:1], iters, allocInterleavedSizes) == nil {
// Passed on first try, report no error.
return nil
}
allocInterleaved2(iters)
if checkAllocations(getMemProfileRecords(), frames[0:2], iters, allocInterleavedSizes) == nil {
if checkAllocations(getMemProfileRecords(), leafFrame, frames[0:2], iters, allocInterleavedSizes) == nil {
// Passed on second try, report no error.
return nil
}
allocInterleaved3(iters)
// If it fails a third time, we may be onto something.
return checkAllocations(getMemProfileRecords(), frames[0:3], iters, allocInterleavedSizes)
return checkAllocations(getMemProfileRecords(), leafFrame, frames[0:3], iters, allocInterleavedSizes)
}
var allocInterleavedSizes = []int64{17 * 1024, 1024, 18 * 1024, 512, 16 * 1024, 256}
@@ -108,22 +109,23 @@ func testSmallAllocations() error {
// Sizes of the allocations performed by each experiment.
sizes := []int64{1024, 512, 256}
frames := []string{"main.allocSmall1", "main.allocSmall2", "main.allocSmall3"}
leafFrame := "main.allocSmall"
// Pass if at least one of three experiments has no errors. Use a separate
// function for each experiment to identify each experiment in the profile.
allocSmall1(iters)
if checkAllocations(getMemProfileRecords(), frames[0:1], iters, sizes) == nil {
if checkAllocations(getMemProfileRecords(), leafFrame, frames[0:1], iters, sizes) == nil {
// Passed on first try, report no error.
return nil
}
allocSmall2(iters)
if checkAllocations(getMemProfileRecords(), frames[0:2], iters, sizes) == nil {
if checkAllocations(getMemProfileRecords(), leafFrame, frames[0:2], iters, sizes) == nil {
// Passed on second try, report no error.
return nil
}
allocSmall3(iters)
// If it fails a third time, we may be onto something.
return checkAllocations(getMemProfileRecords(), frames[0:3], iters, sizes)
return checkAllocations(getMemProfileRecords(), leafFrame, frames[0:3], iters, sizes)
}
// allocSmall performs only small allocations for sanity testing.
@@ -161,21 +163,21 @@ func allocSmall3(n int) {
// Look only at samples that include the named frames, and group the
// allocations by their line number. All these allocations are done from
// the same leaf function, so their line numbers are the same.
func checkAllocations(records []runtime.MemProfileRecord, frames []string, count int64, size []int64) error {
func checkAllocations(records []runtime.MemProfileRecord, leafFrame string, frames []string, count int64, size []int64) error {
objectsPerLine := map[int][]int64{}
bytesPerLine := map[int][]int64{}
totalCount := []int64{}
// Compute the line number of the first allocation. All the
// allocations are from the same leaf, so pick the first one.
var firstLine int
for ln := range allocObjects(records, frames[0]) {
for ln := range allocObjects(records, leafFrame, frames[0]) {
if firstLine == 0 || firstLine > ln {
firstLine = ln
}
}
for _, frame := range frames {
var objectCount int64
a := allocObjects(records, frame)
a := allocObjects(records, leafFrame, frame)
for s := range size {
// Allocations of size size[s] should be on line firstLine + s.
ln := firstLine + s
@@ -258,7 +260,7 @@ type allocStat struct {
// allocObjects examines the profile records for samples including the
// named function and returns the allocation stats aggregated by
// source line number of the allocation (at the leaf frame).
func allocObjects(records []runtime.MemProfileRecord, function string) map[int]allocStat {
func allocObjects(records []runtime.MemProfileRecord, leafFrame, function string) map[int]allocStat {
a := make(map[int]allocStat)
for _, r := range records {
var pcs []uintptr
@@ -273,7 +275,7 @@ func allocObjects(records []runtime.MemProfileRecord, function string) map[int]a
for {
frame, more := frames.Next()
name := frame.Function
if line == 0 {
if name == leafFrame && line == 0 {
line = frame.Line
}
if name == function {


@@ -467,9 +467,9 @@ func f27defer(b bool) {
func f27go(b bool) {
x := 0
if b {
go call27(func() { x++ }) // ERROR "live at call to newobject: &x$" "live at call to newobject: &x .autotmp_[0-9]+$" "live at call to newproc: &x$" // allocate two closures, the func literal, and the wrapper for go
go call27(func() { x++ }) // ERROR "live at call to mallocgcSmallScanNoHeaderSC[0-9]+: &x$" "live at call to mallocgcSmallScanNoHeaderSC[0-9]+: &x .autotmp_[0-9]+$" "live at call to newproc: &x$" // allocate two closures, the func literal, and the wrapper for go
}
go call27(func() { x++ }) // ERROR "live at call to newobject: &x$" "live at call to newobject: .autotmp_[0-9]+$" // allocate two closures, the func literal, and the wrapper for go
go call27(func() { x++ }) // ERROR "live at call to mallocgcSmallScanNoHeaderSC[0-9]+: &x$" "live at call to mallocgcSmallScanNoHeaderSC[0-9]+: .autotmp_[0-9]+$" // allocate two closures, the func literal, and the wrapper for go
printnl()
}
@@ -538,7 +538,7 @@ func f31(b1, b2, b3 bool) {
g31(g18()) // ERROR "stack object .autotmp_[0-9]+ \[2\]string$"
}
if b2 {
h31(g18()) // ERROR "live at call to convT: .autotmp_[0-9]+$" "live at call to newobject: .autotmp_[0-9]+$"
h31(g18()) // ERROR "live at call to convT: .autotmp_[0-9]+$" "live at call to mallocgcSmallScanNoHeaderSC[0-9]+: .autotmp_[0-9]+$"
}
if b3 {
panic(g18())
@@ -665,14 +665,14 @@ func f39a() (x []int) {
func f39b() (x [10]*int) {
x = [10]*int{}
x[0] = new(int) // ERROR "live at call to newobject: x$"
x[0] = new(int) // ERROR "live at call to mallocTiny[48]: x$"
printnl() // ERROR "live at call to printnl: x$"
return x
}
func f39c() (x [10]*int) {
x = [10]*int{}
x[0] = new(int) // ERROR "live at call to newobject: x$"
x[0] = new(int) // ERROR "live at call to mallocTiny[48]: x$"
printnl() // ERROR "live at call to printnl: x$"
return
}


@@ -465,9 +465,9 @@ func f27defer(b bool) {
func f27go(b bool) {
x := 0
if b {
go call27(func() { x++ }) // ERROR "live at call to newobject: &x$" "live at call to newobject: &x .autotmp_[0-9]+$" "live at call to newproc: &x$" // allocate two closures, the func literal, and the wrapper for go
go call27(func() { x++ }) // ERROR "live at call to mallocgcSmallScanNoHeaderSC[0-9]+: &x$" "live at call to mallocgcSmallScanNoHeaderSC[0-9]+: &x .autotmp_[0-9]+$" "live at call to newproc: &x$" // allocate two closures, the func literal, and the wrapper for go
}
go call27(func() { x++ }) // ERROR "live at call to newobject: &x$" "live at call to newobject: .autotmp_[0-9]+$" // allocate two closures, the func literal, and the wrapper for go
go call27(func() { x++ }) // ERROR "live at call to mallocgcSmallScanNoHeaderSC[0-9]+: &x$" "live at call to mallocgcSmallScanNoHeaderSC[0-9]+: .autotmp_[0-9]+$" // allocate two closures, the func literal, and the wrapper for go
printnl()
}
@@ -536,7 +536,7 @@ func f31(b1, b2, b3 bool) {
g31(g18()) // ERROR "stack object .autotmp_[0-9]+ \[2\]string$"
}
if b2 {
h31(g18()) // ERROR "live at call to convT: .autotmp_[0-9]+$" "live at call to newobject: .autotmp_[0-9]+$"
h31(g18()) // ERROR "live at call to convT: .autotmp_[0-9]+$" "live at call to mallocgcSmallScanNoHeaderSC[0-9]+: .autotmp_[0-9]+$"
}
if b3 {
panic(g18())
@@ -663,14 +663,14 @@ func f39a() (x []int) {
func f39b() (x [10]*int) {
x = [10]*int{}
x[0] = new(int) // ERROR "live at call to newobject: x$"
x[0] = new(int) // ERROR "live at call to mallocTiny[48]: x$"
printnl() // ERROR "live at call to printnl: x$"
return x
}
func f39c() (x [10]*int) {
x = [10]*int{}
x[0] = new(int) // ERROR "live at call to newobject: x$"
x[0] = new(int) // ERROR "live at call to mallocTiny[48]: x$"
printnl() // ERROR "live at call to printnl: x$"
return
}


@@ -33,8 +33,8 @@ func (T) M1(a uintptr) {} // ERROR "escaping uintptr"
func (T) M2(a ...uintptr) {} // ERROR "escaping ...uintptr"
func TestF1() {
var t int // ERROR "moved to heap"
F1(uintptr(unsafe.Pointer(&t))) // ERROR "live at call to F1: .?autotmp" "stack object .autotmp_[0-9]+ unsafe.Pointer$"
var t int // ERROR "moved to heap"
F1(uintptr(unsafe.Pointer(&t))) // ERROR "live at call to F1: .?autotmp" "stack object .autotmp_[0-9]+ unsafe.Pointer$"
}
func TestF3() {
@@ -49,17 +49,17 @@ func TestM1() {
}
func TestF2() {
var v int // ERROR "moved to heap"
F2(0, 1, uintptr(unsafe.Pointer(&v)), 2) // ERROR "live at call to newobject: .?autotmp" "live at call to F2: .?autotmp" "escapes to heap" "stack object .autotmp_[0-9]+ unsafe.Pointer$"
var v int // ERROR "moved to heap"
F2(0, 1, uintptr(unsafe.Pointer(&v)), 2) // ERROR "live at call to mallocgcSmallNoScanSC[0-9]+: .?autotmp" "live at call to F2: .?autotmp" "escapes to heap" "stack object .autotmp_[0-9]+ unsafe.Pointer$"
}
func TestF4() {
var v2 int // ERROR "moved to heap"
F4(0, 1, uintptr(unsafe.Pointer(&v2)), 2) // ERROR "live at call to newobject: .?autotmp" "live at call to F4: .?autotmp" "escapes to heap" "stack object .autotmp_[0-9]+ unsafe.Pointer$"
F4(0, 1, uintptr(unsafe.Pointer(&v2)), 2) // ERROR "live at call to mallocgcSmallNoScanSC[0-9]+: .?autotmp" "live at call to F4: .?autotmp" "escapes to heap" "stack object .autotmp_[0-9]+ unsafe.Pointer$"
}
func TestM2() {
var t T
var v int // ERROR "moved to heap"
t.M2(0, 1, uintptr(unsafe.Pointer(&v)), 2) // ERROR "live at call to newobject: .?autotmp" "live at call to T.M2: .?autotmp" "escapes to heap" "stack object .autotmp_[0-9]+ unsafe.Pointer$"
t.M2(0, 1, uintptr(unsafe.Pointer(&v)), 2) // ERROR "live at call to mallocgcSmallNoScanSC[0-9]+: .?autotmp" "live at call to T.M2: .?autotmp" "escapes to heap" "stack object .autotmp_[0-9]+ unsafe.Pointer$"
}