[dev.simd] all: merge master (9de69f6) into dev.simd

Merge List:

+ 2025-08-20 9de69f6913 errors: mention Is/As in Join docs
+ 2025-08-20 4afd482812 cmd/go/internal/doc: pass URL fragments separately with -http
+ 2025-08-20 509d5f647f internal/poll: don't call Seek for overlapped Windows handles
+ 2025-08-20 853fc12739 internal/poll: set the correct file offset in FD.Seek for Windows overlapped handles
+ 2025-08-19 bd885401d5 runtime: save and restore all fcc registers in async preempt on loong64
+ 2025-08-19 119546ea4f cmd/go: document install outputs to $GOOS_$GOARCH when cross compiling
+ 2025-08-19 ffa882059c unique: deflake TestCanonMap/LoadOrStore/ConcurrentUnsharedKeys
+ 2025-08-19 1f2e8e03e4 os: fix path in MkdirTemp error message
+ 2025-08-19 5024d0d884 cmd/compile: tweak example command in README
+ 2025-08-19 b80ffb64d8 internal/trace: remove redundant info from Event.String
+ 2025-08-19 c7d8bda459 cmd/compile/internal: make function comments match function names
+ 2025-08-19 de2d741667 internal/trace: use RFC3339Nano for wall clock snapshots in Event.String
+ 2025-08-19 c61db5ebd5 syscall: forkAndExecInChild1: don't reuse pid variable
+ 2025-08-19 07ee3bfc63 cmd/go: use modern pprof flags in documentation
+ 2025-08-18 5a56d8848b cmd/compile: ensure we use allowed registers for input-clobbering instructions
+ 2025-08-18 c3927a47f0 runtime: fix comments in tracetype.go
+ 2025-08-15 77f911e31c internal/trace: emit final sync event for generation in Go 1.26+
+ 2025-08-15 786be1d2bf runtime: don't overwrite global stop channel in tests
+ 2025-08-15 4a7fde922f internal/trace: add end-of-generation signal to trace
+ 2025-08-15 cb814bd5bc net: skip TestIPv4WriteMsgUDPAddrPort on plan9
+ 2025-08-15 78a3968c2c runtime/metrics: add metric for current Go-owned thread count
+ 2025-08-15 ab8121a407 runtime/metrics: add metric for total goroutines created
+ 2025-08-15 13df972f68 runtime/metrics: add metrics for goroutine sched states
+ 2025-08-15 bd07fafb0a runtime: disable stack shrinking for all waiting-for-suspendG cases
+ 2025-08-15 a651e2ea47 runtime: remove duff support for amd64
+ 2025-08-15 e4291e484c runtime: remove duff support for arm64
+ 2025-08-15 15d6dbc05c cmd/compile: use generated loops instead of DUFFCOPY on arm64
+ 2025-08-15 bca3e98b8a cmd/go: test barrier actions
+ 2025-08-15 052fcde9fd internal/runtime: cleaner overflow checker
+ 2025-08-15 3871c0d84d syscall: permit nil destination address in sendmsgN{Inet4,Inet6}
+ 2025-08-14 a8564bd412 runtime: make all synctest bubble violations fatal panics

Change-Id: Ibc94566bc69bcb59b1d79b6fa868610ca2d1d223
Cherry Mui 2025-08-20 16:06:42 -04:00
commit 103b6e39ca
68 changed files with 1449 additions and 1363 deletions


@ -281,11 +281,11 @@ dependencies, so is not suitable for distributed build systems.)
```
$ go install golang.org/x/tools/cmd/toolstash@latest
$ git clone https://go.googlesource.com/go
$ cd go
$ export PATH=$PWD/go/bin:$PATH
$ cd go/src
$ git checkout -b mybranch
$ ./src/all.bash # build and confirm good starting point
$ ./all.bash # build and confirm good starting point
$ export PATH=$PWD/bin:$PATH
$ toolstash save # save current tools
```
After that, your edit/compile/test cycle can be similar to:
```


@ -1162,41 +1162,119 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
// BNE loop
// There's a past-the-end pointer here, any problem with that?
case ssa.OpARM64DUFFCOPY:
p := s.Prog(obj.ADUFFCOPY)
p.To.Type = obj.TYPE_MEM
p.To.Name = obj.NAME_EXTERN
p.To.Sym = ir.Syms.Duffcopy
p.To.Offset = v.AuxInt
case ssa.OpARM64LoweredMove:
// LDP.P 16(R16), (R25, Rtmp)
// STP.P (R25, Rtmp), 16(R17)
// CMP Rarg2, R16
// BLE -3(PC)
// arg2 is the address of the last element of src
p := s.Prog(arm64.ALDP)
p.Scond = arm64.C_XPOST
p.From.Type = obj.TYPE_MEM
p.From.Reg = arm64.REG_R16
p.From.Offset = 16
p.To.Type = obj.TYPE_REGREG
p.To.Reg = arm64.REG_R25
p.To.Offset = int64(arm64.REGTMP)
p2 := s.Prog(arm64.ASTP)
p2.Scond = arm64.C_XPOST
p2.From.Type = obj.TYPE_REGREG
p2.From.Reg = arm64.REG_R25
p2.From.Offset = int64(arm64.REGTMP)
p2.To.Type = obj.TYPE_MEM
p2.To.Reg = arm64.REG_R17
p2.To.Offset = 16
p3 := s.Prog(arm64.ACMP)
p3.From.Type = obj.TYPE_REG
p3.From.Reg = v.Args[2].Reg()
p3.Reg = arm64.REG_R16
p4 := s.Prog(arm64.ABLE)
p4.To.Type = obj.TYPE_BRANCH
p4.To.SetTarget(p)
dstReg := v.Args[0].Reg()
srcReg := v.Args[1].Reg()
if dstReg == srcReg {
break
}
tmpReg1 := int16(arm64.REG_R24)
tmpReg2 := int16(arm64.REG_R25)
n := v.AuxInt
if n < 16 {
v.Fatalf("Move too small %d", n)
}
// Generate copying instructions.
var off int64
for n >= 16 {
// LDP off(srcReg), (tmpReg1, tmpReg2)
// STP (tmpReg1, tmpReg2), off(dstReg)
move16(s, srcReg, dstReg, tmpReg1, tmpReg2, off, false)
off += 16
n -= 16
}
if n > 8 {
// MOVD off(srcReg), tmpReg1
// MOVD tmpReg1, off(dstReg)
move8(s, srcReg, dstReg, tmpReg1, off)
off += 8
n -= 8
}
if n != 0 {
// MOVD off+n-8(srcReg), tmpReg1
// MOVD tmpReg1, off+n-8(dstReg)
move8(s, srcReg, dstReg, tmpReg1, off+n-8)
}
case ssa.OpARM64LoweredMoveLoop:
dstReg := v.Args[0].Reg()
srcReg := v.Args[1].Reg()
if dstReg == srcReg {
break
}
countReg := int16(arm64.REG_R23)
tmpReg1 := int16(arm64.REG_R24)
tmpReg2 := int16(arm64.REG_R25)
n := v.AuxInt
loopSize := int64(64)
if n < 3*loopSize {
// - a loop count of 0 won't work.
// - a loop count of 1 is useless.
// - a loop count of 2 is a code size ~tie
// 3 instructions to implement the loop
// 4 instructions in the loop body
// vs
// 8 instructions in the straightline code
// Might as well use straightline code.
v.Fatalf("ZeroLoop size too small %d", n)
}
// Put iteration count in a register.
// MOVD $n, countReg
p := s.Prog(arm64.AMOVD)
p.From.Type = obj.TYPE_CONST
p.From.Offset = n / loopSize
p.To.Type = obj.TYPE_REG
p.To.Reg = countReg
cntInit := p
// Move loopSize bytes starting at srcReg to dstReg.
// Increment srcReg and destReg by loopSize as a side effect.
for range loopSize / 16 {
// LDP.P 16(srcReg), (tmpReg1, tmpReg2)
// STP.P (tmpReg1, tmpReg2), 16(dstReg)
move16(s, srcReg, dstReg, tmpReg1, tmpReg2, 0, true)
}
// Decrement loop count.
// SUB $1, countReg
p = s.Prog(arm64.ASUB)
p.From.Type = obj.TYPE_CONST
p.From.Offset = 1
p.To.Type = obj.TYPE_REG
p.To.Reg = countReg
// Jump to loop header if we're not done yet.
// CBNZ head
p = s.Prog(arm64.ACBNZ)
p.From.Type = obj.TYPE_REG
p.From.Reg = countReg
p.To.Type = obj.TYPE_BRANCH
p.To.SetTarget(cntInit.Link)
// Multiples of the loop size are now done.
n %= loopSize
// Copy any fractional portion.
var off int64
for n >= 16 {
// LDP off(srcReg), (tmpReg1, tmpReg2)
// STP (tmpReg1, tmpReg2), off(dstReg)
move16(s, srcReg, dstReg, tmpReg1, tmpReg2, off, false)
off += 16
n -= 16
}
if n > 8 {
// MOVD off(srcReg), tmpReg1
// MOVD tmpReg1, off(dstReg)
move8(s, srcReg, dstReg, tmpReg1, off)
off += 8
n -= 8
}
if n != 0 {
// MOVD off+n-8(srcReg), tmpReg1
// MOVD tmpReg1, off+n-8(dstReg)
move8(s, srcReg, dstReg, tmpReg1, off+n-8)
}
case ssa.OpARM64CALLstatic, ssa.OpARM64CALLclosure, ssa.OpARM64CALLinter:
s.Call(v)
case ssa.OpARM64CALLtail:
@ -1599,3 +1677,53 @@ func zero8(s *ssagen.State, reg int16, off int64) {
p.To.Reg = reg
p.To.Offset = off
}
// move16 copies 16 bytes at src+off to dst+off.
// Uses registers tmp1 and tmp2.
// If postInc is true, increment src and dst by 16.
func move16(s *ssagen.State, src, dst, tmp1, tmp2 int16, off int64, postInc bool) {
// LDP off(src), (tmp1, tmp2)
ld := s.Prog(arm64.ALDP)
ld.From.Type = obj.TYPE_MEM
ld.From.Reg = src
ld.From.Offset = off
ld.To.Type = obj.TYPE_REGREG
ld.To.Reg = tmp1
ld.To.Offset = int64(tmp2)
// STP (tmp1, tmp2), off(dst)
st := s.Prog(arm64.ASTP)
st.From.Type = obj.TYPE_REGREG
st.From.Reg = tmp1
st.From.Offset = int64(tmp2)
st.To.Type = obj.TYPE_MEM
st.To.Reg = dst
st.To.Offset = off
if postInc {
if off != 0 {
panic("can't postinc with non-zero offset")
}
ld.Scond = arm64.C_XPOST
st.Scond = arm64.C_XPOST
ld.From.Offset = 16
st.To.Offset = 16
}
}
// move8 copies 8 bytes at src+off to dst+off.
// Uses register tmp.
func move8(s *ssagen.State, src, dst, tmp int16, off int64) {
// MOVD off(src), tmp
ld := s.Prog(arm64.AMOVD)
ld.From.Type = obj.TYPE_MEM
ld.From.Reg = src
ld.From.Offset = off
ld.To.Type = obj.TYPE_REG
ld.To.Reg = tmp
// MOVD tmp, off(dst)
st := s.Prog(arm64.AMOVD)
st.From.Type = obj.TYPE_REG
st.From.Reg = tmp
st.To.Type = obj.TYPE_MEM
st.To.Reg = dst
st.To.Offset = off
}
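
The new lowering replaces the Duff's-device copy with explicit LDP/STP pairs emitted by move16 and move8. As a rough standalone illustration of the offset schedule this produces (a model of the logic above with a hypothetical moveSchedule helper, not compiler code), the sketch below computes which offsets a straight-line LoweredMove of n bytes touches, including the overlapping 8-byte tail move at off+n-8:

```go
package main

import "fmt"

// moveSchedule mirrors the offset arithmetic of the straight-line
// LoweredMove lowering above: 16-byte LDP/STP pairs, an optional
// 8-byte move, and a final 8-byte move that overlaps the tail at
// off+n-8 when n is not a multiple of 8. Illustrative model only.
func moveSchedule(n int64) (pair16, mov8 []int64) {
	var off int64
	for n >= 16 {
		pair16 = append(pair16, off) // LDP/STP at this offset
		off += 16
		n -= 16
	}
	if n > 8 {
		mov8 = append(mov8, off) // MOVD at this offset
		off += 8
		n -= 8
	}
	if n != 0 {
		mov8 = append(mov8, off+n-8) // overlapping MOVD covering the last n bytes
	}
	return pair16, mov8
}

func main() {
	pair16, mov8 := moveSchedule(100)
	fmt.Println(pair16, mov8) // [0 16 32 48 64 80] [92]
}
```

For n = 100 this yields six 16-byte pairs at offsets 0 through 80 plus one 8-byte move at offset 92, which covers the final four bytes by re-copying part of the preceding pair.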


@ -50,7 +50,7 @@ var cases = []testcase{
{"1", "", 0, []string{"for_nested.go"}}, {"1", "", 0, []string{"for_nested.go"}},
} }
// TestLoopVar checks that the GOEXPERIMENT and debug flags behave as expected. // TestLoopVarGo1_21 checks that the GOEXPERIMENT and debug flags behave as expected.
func TestLoopVarGo1_21(t *testing.T) { func TestLoopVarGo1_21(t *testing.T) {
switch runtime.GOOS { switch runtime.GOOS {
case "linux", "darwin": case "linux", "darwin":


@ -158,7 +158,7 @@ func New(profileFile string) (*Profile, error) {
}, nil
}
// initializeIRGraph builds the IRGraph by visiting all the ir.Func in decl list
// createIRGraph builds the IRGraph by visiting all the ir.Func in decl list
// of a package.
func createIRGraph(namedEdgeMap pgo.NamedEdgeMap) *IRGraph {
g := &IRGraph{


@ -462,39 +462,8 @@
(STP [16] dst (Select0 <typ.UInt64> (LDP [16] src mem)) (Select1 <typ.UInt64> (LDP [16] src mem))
(STP dst (Select0 <typ.UInt64> (LDP src mem)) (Select1 <typ.UInt64> (LDP src mem)) mem))))
// strip off fractional word move
(Move [s] dst src mem) && s%16 != 0 && s%16 <= 8 && s > 64 =>
(Move [8]
(OffPtr <dst.Type> dst [s-8])
(OffPtr <src.Type> src [s-8])
(Move [s-s%16] dst src mem))
(Move [s] dst src mem) && s%16 != 0 && s%16 > 8 && s > 64 =>
(Move [16]
(OffPtr <dst.Type> dst [s-16])
(OffPtr <src.Type> src [s-16])
(Move [s-s%16] dst src mem))
// medium move uses a duff device
(Move [s] dst src mem)
&& s > 64 && s <= 16*64 && s%16 == 0
&& logLargeCopy(v, s) =>
(DUFFCOPY [8 * (64 - s/16)] dst src mem)
// 8 is the number of bytes to encode:
//
// LDP.P 16(R16), (R26, R27)
// STP.P (R26, R27), 16(R17)
//
// 64 is number of these blocks. See runtime/duff_arm64.s:duffcopy
// large move uses a loop
(Move [s] dst src mem)
&& s%16 == 0 && s > 16*64
&& logLargeCopy(v, s) =>
(LoweredMove
dst
src
(ADDconst <src.Type> src [s-16])
mem)
(Move [s] dst src mem) && s > 64 && s < 192 && logLargeCopy(v, s) => (LoweredMove [s] dst src mem)
(Move [s] dst src mem) && s >= 192 && logLargeCopy(v, s) => (LoweredMoveLoop [s] dst src mem)
// calls
(StaticCall ...) => (CALLstatic ...)


@ -144,6 +144,8 @@ func init() {
gpspsbg = gpspg | buildReg("SB") gpspsbg = gpspg | buildReg("SB")
fp = buildReg("F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31") fp = buildReg("F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31")
callerSave = gp | fp | buildReg("g") // runtime.setg (and anything calling it) may clobber g callerSave = gp | fp | buildReg("g") // runtime.setg (and anything calling it) may clobber g
r24to25 = buildReg("R24 R25")
r23to25 = buildReg("R23 R24 R25")
rz = buildReg("ZERO") rz = buildReg("ZERO")
first16 = buildReg("R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15") first16 = buildReg("R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15")
) )
@ -568,47 +570,40 @@ func init() {
needIntTemp: true, needIntTemp: true,
}, },
// duffcopy // medium copying
// arg0 = address of dst memory (in R21, changed as side effect) // arg0 = address of dst memory
// arg1 = address of src memory (in R20, changed as side effect) // arg1 = address of src memory
// arg2 = mem // arg2 = mem
// auxint = offset into duffcopy code to start executing // auxint = # of bytes to copy
// returns mem // returns mem
// R20, R21 changed as side effect
// R16 and R17 may be clobbered by linker trampoline.
{ {
name: "DUFFCOPY", name: "LoweredMove",
aux: "Int64", aux: "Int64",
argLength: 3, argLength: 3,
reg: regInfo{ reg: regInfo{
inputs: []regMask{buildReg("R21"), buildReg("R20")}, inputs: []regMask{gp &^ r24to25, gp &^ r24to25},
clobbers: buildReg("R16 R17 R20 R21 R26 R30"), clobbers: r24to25, // TODO: figure out needIntTemp x2
}, },
//faultOnNilArg0: true, // Note: removed for 73748. TODO: reenable at some point faultOnNilArg0: true,
//faultOnNilArg1: true, faultOnNilArg1: true,
unsafePoint: true, // FP maintenance around DUFFCOPY can be clobbered by interrupts
}, },
// large move // large copying
// arg0 = address of dst memory (in R17 aka arm64.REGRT2, changed as side effect) // arg0 = address of dst memory
// arg1 = address of src memory (in R16 aka arm64.REGRT1, changed as side effect) // arg1 = address of src memory
// arg2 = address of the last element of src // arg2 = mem
// arg3 = mem // auxint = # of bytes to copy
// returns mem // returns mem
// LDP.P 16(R16), (R25, Rtmp)
// STP.P (R25, Rtmp), 16(R17)
// CMP Rarg2, R16
// BLE -3(PC)
// Note: the-end-of-src may be not a valid pointer. it's a problem if it is spilled.
// the-end-of-src - 16 is within the area to copy, ok to spill.
{ {
name: "LoweredMove", name: "LoweredMoveLoop",
argLength: 4, aux: "Int64",
argLength: 3,
reg: regInfo{ reg: regInfo{
inputs: []regMask{buildReg("R17"), buildReg("R16"), gp &^ buildReg("R25")}, inputs: []regMask{gp &^ r23to25, gp &^ r23to25},
clobbers: buildReg("R16 R17 R25"), clobbers: r23to25, // TODO: figure out needIntTemp x3
clobbersArg0: true,
clobbersArg1: true,
}, },
clobberFlags: true,
faultOnNilArg0: true, faultOnNilArg0: true,
faultOnNilArg1: true, faultOnNilArg1: true,
}, },


@ -475,7 +475,7 @@ func opcodeMap(f *Func) map[Op]int {
return m
}
// opcodeCounts checks that the number of opcodes listed in m agree with the
// checkOpcodeCounts checks that the number of opcodes listed in m agree with the
// number of opcodes that appear in the function.
func checkOpcodeCounts(t *testing.T, f *Func, m map[Op]int) {
n := opcodeMap(f)


@ -2906,8 +2906,8 @@ const (
OpARM64GreaterEqualNoov OpARM64GreaterEqualNoov
OpARM64LoweredZero OpARM64LoweredZero
OpARM64LoweredZeroLoop OpARM64LoweredZeroLoop
OpARM64DUFFCOPY
OpARM64LoweredMove OpARM64LoweredMove
OpARM64LoweredMoveLoop
OpARM64LoweredGetClosurePtr OpARM64LoweredGetClosurePtr
OpARM64LoweredGetCallerSP OpARM64LoweredGetCallerSP
OpARM64LoweredGetCallerPC OpARM64LoweredGetCallerPC
@ -43040,32 +43040,34 @@ var opcodeTable = [...]opInfo{
clobbersArg0: true, clobbersArg0: true,
}, },
}, },
{
name: "DUFFCOPY",
auxType: auxInt64,
argLen: 3,
unsafePoint: true,
reg: regInfo{
inputs: []inputInfo{
{0, 1048576}, // R21
{1, 524288}, // R20
},
clobbers: 303759360, // R16 R17 R20 R21 R26 R30
},
},
{ {
name: "LoweredMove", name: "LoweredMove",
argLen: 4, auxType: auxInt64,
clobberFlags: true, argLen: 3,
faultOnNilArg0: true, faultOnNilArg0: true,
faultOnNilArg1: true, faultOnNilArg1: true,
reg: regInfo{ reg: regInfo{
inputs: []inputInfo{ inputs: []inputInfo{
{0, 131072}, // R17 {0, 310378495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R26 R30
{1, 65536}, // R16 {1, 310378495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R26 R30
{2, 318767103}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R26 R30
}, },
clobbers: 16973824, // R16 R17 R25 clobbers: 25165824, // R24 R25
},
},
{
name: "LoweredMoveLoop",
auxType: auxInt64,
argLen: 3,
faultOnNilArg0: true,
faultOnNilArg1: true,
reg: regInfo{
inputs: []inputInfo{
{0, 306184191}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R26 R30
{1, 306184191}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R26 R30
},
clobbers: 29360128, // R23 R24 R25
clobbersArg0: true,
clobbersArg1: true,
}, },
}, },
{ {


@ -1756,10 +1756,9 @@ func (s *regAllocState) regalloc(f *Func) {
// spilling the value with the most distant next use.
continue
}
// Copy input to a new clobberable register.
// Copy input to a different register that won't be clobbered.
c := s.allocValToReg(v.Args[i], m, true, v.Pos)
s.copies[c] = false
args[i] = c
}
// Pick a temporary register if needed.


@ -19688,87 +19688,35 @@ func rewriteValueARM64_OpMove(v *Value) bool {
return true return true
} }
// match: (Move [s] dst src mem) // match: (Move [s] dst src mem)
// cond: s%16 != 0 && s%16 <= 8 && s > 64 // cond: s > 64 && s < 192 && logLargeCopy(v, s)
// result: (Move [8] (OffPtr <dst.Type> dst [s-8]) (OffPtr <src.Type> src [s-8]) (Move [s-s%16] dst src mem)) // result: (LoweredMove [s] dst src mem)
for { for {
s := auxIntToInt64(v.AuxInt) s := auxIntToInt64(v.AuxInt)
dst := v_0 dst := v_0
src := v_1 src := v_1
mem := v_2 mem := v_2
if !(s%16 != 0 && s%16 <= 8 && s > 64) { if !(s > 64 && s < 192 && logLargeCopy(v, s)) {
break break
} }
v.reset(OpMove) v.reset(OpARM64LoweredMove)
v.AuxInt = int64ToAuxInt(8) v.AuxInt = int64ToAuxInt(s)
v0 := b.NewValue0(v.Pos, OpOffPtr, dst.Type)
v0.AuxInt = int64ToAuxInt(s - 8)
v0.AddArg(dst)
v1 := b.NewValue0(v.Pos, OpOffPtr, src.Type)
v1.AuxInt = int64ToAuxInt(s - 8)
v1.AddArg(src)
v2 := b.NewValue0(v.Pos, OpMove, types.TypeMem)
v2.AuxInt = int64ToAuxInt(s - s%16)
v2.AddArg3(dst, src, mem)
v.AddArg3(v0, v1, v2)
return true
}
// match: (Move [s] dst src mem)
// cond: s%16 != 0 && s%16 > 8 && s > 64
// result: (Move [16] (OffPtr <dst.Type> dst [s-16]) (OffPtr <src.Type> src [s-16]) (Move [s-s%16] dst src mem))
for {
s := auxIntToInt64(v.AuxInt)
dst := v_0
src := v_1
mem := v_2
if !(s%16 != 0 && s%16 > 8 && s > 64) {
break
}
v.reset(OpMove)
v.AuxInt = int64ToAuxInt(16)
v0 := b.NewValue0(v.Pos, OpOffPtr, dst.Type)
v0.AuxInt = int64ToAuxInt(s - 16)
v0.AddArg(dst)
v1 := b.NewValue0(v.Pos, OpOffPtr, src.Type)
v1.AuxInt = int64ToAuxInt(s - 16)
v1.AddArg(src)
v2 := b.NewValue0(v.Pos, OpMove, types.TypeMem)
v2.AuxInt = int64ToAuxInt(s - s%16)
v2.AddArg3(dst, src, mem)
v.AddArg3(v0, v1, v2)
return true
}
// match: (Move [s] dst src mem)
// cond: s > 64 && s <= 16*64 && s%16 == 0 && logLargeCopy(v, s)
// result: (DUFFCOPY [8 * (64 - s/16)] dst src mem)
for {
s := auxIntToInt64(v.AuxInt)
dst := v_0
src := v_1
mem := v_2
if !(s > 64 && s <= 16*64 && s%16 == 0 && logLargeCopy(v, s)) {
break
}
v.reset(OpARM64DUFFCOPY)
v.AuxInt = int64ToAuxInt(8 * (64 - s/16))
v.AddArg3(dst, src, mem) v.AddArg3(dst, src, mem)
return true return true
} }
// match: (Move [s] dst src mem) // match: (Move [s] dst src mem)
// cond: s%16 == 0 && s > 16*64 && logLargeCopy(v, s) // cond: s >= 192 && logLargeCopy(v, s)
// result: (LoweredMove dst src (ADDconst <src.Type> src [s-16]) mem) // result: (LoweredMoveLoop [s] dst src mem)
for { for {
s := auxIntToInt64(v.AuxInt) s := auxIntToInt64(v.AuxInt)
dst := v_0 dst := v_0
src := v_1 src := v_1
mem := v_2 mem := v_2
if !(s%16 == 0 && s > 16*64 && logLargeCopy(v, s)) { if !(s >= 192 && logLargeCopy(v, s)) {
break break
} }
v.reset(OpARM64LoweredMove) v.reset(OpARM64LoweredMoveLoop)
v0 := b.NewValue0(v.Pos, OpARM64ADDconst, src.Type) v.AuxInt = int64ToAuxInt(s)
v0.AuxInt = int64ToAuxInt(s - 16) v.AddArg3(dst, src, mem)
v0.AddArg(src)
v.AddArg4(dst, src, v0, mem)
return true return true
} }
return false return false


@ -758,6 +758,8 @@
// variable, which defaults to $GOPATH/bin or $HOME/go/bin if the GOPATH
// environment variable is not set. Executables in $GOROOT
// are installed in $GOROOT/bin or $GOTOOLDIR instead of $GOBIN.
// Cross compiled binaries are installed in $GOOS_$GOARCH subdirectories
// of the above.
//
// If the arguments have version suffixes (like @latest or @v1.0.0), "go install"
// builds packages in module-aware mode, ignoring the go.mod file in the current
@ -3231,8 +3233,8 @@
//
// Several of the flags control profiling and write an execution profile
// suitable for "go tool pprof"; run "go tool pprof -h" for more
// information. The --alloc_space, --alloc_objects, and --show_bytes
// options of pprof control how the information is presented.
// information. The -sample_index=alloc_space, -sample_index=alloc_objects,
// and -show_bytes options of pprof control how the information is presented.
//
// The following flags are recognized by the 'go test' command and
// control the execution of any test:


@ -212,16 +212,16 @@ func do(writer io.Writer, flagSet *flag.FlagSet, args []string) (err error) {
mod, err := runCmd(append(os.Environ(), "GOWORK=off"), "go", "list", "-m")
if err == nil && mod != "" && mod != "command-line-arguments" {
// If there's a module, go to the module's doc page.
return doPkgsite(mod)
return doPkgsite(mod, "")
}
gowork, err := runCmd(nil, "go", "env", "GOWORK")
if err == nil && gowork != "" {
// Outside a module, but in a workspace, go to the home page
// with links to each of the modules' pages.
return doPkgsite("")
return doPkgsite("", "")
}
// Outside a module or workspace, go to the documentation for the standard library.
return doPkgsite("std")
return doPkgsite("std", "")
}
// If args are provided, we need to figure out which page to open on the pkgsite
@ -282,11 +282,11 @@ func do(writer io.Writer, flagSet *flag.FlagSet, args []string) (err error) {
}
if found {
if serveHTTP {
path, err := objectPath(userPath, pkg, symbol, method)
path, fragment, err := objectPath(userPath, pkg, symbol, method)
if err != nil {
return err
}
return doPkgsite(path)
return doPkgsite(path, fragment)
}
return nil
}
@ -305,7 +305,8 @@ func runCmd(env []string, cmdline ...string) (string, error) {
return strings.TrimSpace(stdout.String()), nil
}
func objectPath(userPath string, pkg *Package, symbol, method string) (string, error) {
// returns a path followed by a fragment (or an error)
func objectPath(userPath string, pkg *Package, symbol, method string) (string, string, error) {
var err error
path := pkg.build.ImportPath
if path == "." {
@ -314,7 +315,7 @@ func objectPath(userPath string, pkg *Package, symbol, method string) (string, e
// go list to get the import path.
path, err = runCmd(nil, "go", "list", userPath)
if err != nil {
return "", err
return "", "", err
}
}
@ -322,10 +323,7 @@ func objectPath(userPath string, pkg *Package, symbol, method string) (string, e
if symbol != "" && method != "" {
object = symbol + "." + method
}
if object != "" {
path = path + "#" + object
}
return path, nil
return path, object, nil
}
// failMessage creates a nicely formatted error message when there is no result to show.


@ -34,7 +34,7 @@ func pickUnusedPort() (int, error) {
return port, nil
}
func doPkgsite(urlPath string) error {
func doPkgsite(urlPath, fragment string) error {
port, err := pickUnusedPort()
if err != nil {
return fmt.Errorf("failed to find port for documentation server: %v", err)
@ -44,6 +44,9 @@ func doPkgsite(urlPath string) error {
if err != nil {
return fmt.Errorf("internal error: failed to construct url: %v", err)
}
if fragment != "" {
path += "#" + fragment
}
// Turn off the default signal handler for SIGINT (and SIGQUIT on Unix)
// and instead wait for the child process to handle the signal and
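
With this change, objectPath returns the package path and the symbol fragment separately, and only doPkgsite attaches the fragment to the final URL. A minimal sketch of that path/fragment assembly (pkgsiteURL is a hypothetical helper, not the cmd/go/internal/doc code):

```go
package main

import (
	"fmt"
	"net/url"
)

// pkgsiteURL joins a local pkgsite base URL, an import path, and an
// optional symbol fragment, mirroring the path/fragment split above.
func pkgsiteURL(port int, importPath, fragment string) (string, error) {
	u, err := url.JoinPath(fmt.Sprintf("http://localhost:%d", port), importPath)
	if err != nil {
		return "", err
	}
	if fragment != "" {
		u += "#" + fragment
	}
	return u, nil
}

func main() {
	u, _ := pkgsiteURL(8080, "net/http", "Client.Do")
	fmt.Println(u) // http://localhost:8080/net/http#Client.Do
}
```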


@ -8,4 +8,4 @@
package doc
func doPkgsite(string) error { return nil }
func doPkgsite(string, string) error { return nil }


@ -186,8 +186,8 @@ and flags that apply to the resulting test binary.
Several of the flags control profiling and write an execution profile
suitable for "go tool pprof"; run "go tool pprof -h" for more
information. The --alloc_space, --alloc_objects, and --show_bytes
options of pprof control how the information is presented.
information. The -sample_index=alloc_space, -sample_index=alloc_objects,
and -show_bytes options of pprof control how the information is presented.
The following flags are recognized by the 'go test' command and
control the execution of any test:
@ -1044,11 +1044,36 @@ func runTest(ctx context.Context, cmd *base.Command, args []string) {
prints = append(prints, printTest)
}
// Order runs for coordinating start JSON prints.
// Order runs for coordinating start JSON prints via two mechanisms:
// 1. Channel locking forces runTest actions to start in-order.
// 2. Barrier tasks force runTest actions to be scheduled in-order.
// We need both for performant behavior, as channel locking without the barrier tasks starves the worker pool,
// and barrier tasks without channel locking doesn't guarantee start in-order behavior alone.
var prevBarrier *work.Action
ch := make(chan struct{})
close(ch)
for _, a := range runs {
if r, ok := a.Actor.(*runTestActor); ok {
// Inject a barrier task between the run action and its dependencies.
// This barrier task wil also depend on the previous barrier task.
// This prevents the run task from being scheduled until all previous run dependencies have finished.
// The build graph will be augmented to look roughly like this:
// build("a") build("b") build("c")
// | | |
// barrier("a.test") -> barrier("b.test") -> barrier("c.test")
// | | |
// run("a.test") run("b.test") run("c.test")
barrier := &work.Action{
Mode: "test barrier",
Deps: slices.Clip(a.Deps),
}
if prevBarrier != nil {
barrier.Deps = append(barrier.Deps, prevBarrier)
}
a.Deps = []*work.Action{barrier}
prevBarrier = barrier
r.prev = ch
ch = make(chan struct{})
r.next = ch
@ -1400,6 +1425,8 @@ func (lockedStdout) Write(b []byte) (int, error) {
func (r *runTestActor) Act(b *work.Builder, ctx context.Context, a *work.Action) error {
sh := b.Shell(a)
barrierAction := a.Deps[0]
buildAction := barrierAction.Deps[0]
// Wait for previous test to get started and print its first json line.
select {
@ -1530,7 +1557,7 @@ func (r *runTestActor) Act(b *work.Builder, ctx context.Context, a *work.Action)
// we have different link inputs but the same final binary,
// we still reuse the cached test result.
// c.saveOutput will store the result under both IDs.
r.c.tryCacheWithID(b, a, a.Deps[0].BuildContentID())
r.c.tryCacheWithID(b, a, buildAction.BuildContentID())
}
if r.c.buf != nil {
if stdout != &buf {
@ -1581,7 +1608,7 @@ func (r *runTestActor) Act(b *work.Builder, ctx context.Context, a *work.Action)
// fresh copies of tools to test as part of the testing.
addToEnv = "GOCOVERDIR=" + gcd
}
args := str.StringList(execCmd, a.Deps[0].BuiltTarget(), testlogArg, panicArg, fuzzArg, coverdirArg, testArgs)
args := str.StringList(execCmd, buildAction.BuiltTarget(), testlogArg, panicArg, fuzzArg, coverdirArg, testArgs)
if testCoverProfile != "" {
// Write coverage to temporary profile, for merging later.
@ -1741,8 +1768,8 @@ func (r *runTestActor) Act(b *work.Builder, ctx context.Context, a *work.Action)
// tryCache is called just before the link attempt,
// to see if the test result is cached and therefore the link is unneeded.
// It reports whether the result can be satisfied from cache.
func (c *runCache) tryCache(b *work.Builder, a *work.Action) bool {
return c.tryCacheWithID(b, a, a.Deps[0].BuildActionID())
func (c *runCache) tryCache(b *work.Builder, a *work.Action, linkAction *work.Action) bool {
return c.tryCacheWithID(b, a, linkAction.BuildActionID())
}
func (c *runCache) tryCacheWithID(b *work.Builder, a *work.Action, id string) bool {


@ -92,7 +92,7 @@ type Action struct {
buggyInstall bool // is this a buggy install (see -linkshared)?
TryCache func(*Builder, *Action) bool // callback for cache bypass
TryCache func(*Builder, *Action, *Action) bool // callback for cache bypass
CacheExecutable bool // Whether to cache executables produced by link steps


@ -568,6 +568,8 @@ Executables are installed in the directory named by the GOBIN environment
variable, which defaults to $GOPATH/bin or $HOME/go/bin if the GOPATH
environment variable is not set. Executables in $GOROOT
are installed in $GOROOT/bin or $GOTOOLDIR instead of $GOBIN.
Cross compiled binaries are installed in $GOOS_$GOARCH subdirectories
of the above.
If the arguments have version suffixes (like @latest or @v1.0.0), "go install"
builds packages in module-aware mode, ignoring the go.mod file in the current


@ -401,6 +401,25 @@ var (
stdlibRecompiledIncOnce = sync.OnceFunc(stdlibRecompiled.Inc) stdlibRecompiledIncOnce = sync.OnceFunc(stdlibRecompiled.Inc)
) )
// testRunAction returns the run action for a test given the link action
// for the test binary, if the only (non-test-barrier) action that depend
// on the link action is the run action.
func testRunAction(a *Action) *Action {
if len(a.triggers) != 1 || a.triggers[0].Mode != "test barrier" {
return nil
}
var runAction *Action
for _, t := range a.triggers[0].triggers {
if t.Mode == "test run" {
if runAction != nil {
return nil
}
runAction = t
}
}
return runAction
}
// useCache tries to satisfy the action a, which has action ID actionHash, // useCache tries to satisfy the action a, which has action ID actionHash,
// by using a cached result from an earlier build. // by using a cached result from an earlier build.
// If useCache decides that the cache can be used, it sets a.buildID // If useCache decides that the cache can be used, it sets a.buildID
@ -526,7 +545,7 @@ func (b *Builder) useCache(a *Action, actionHash cache.ActionID, target string,
// then to avoid the link step, report the link as up-to-date.
// We avoid the nested build ID problem in the previous special case
// by recording the test results in the cache under the action ID half.
if len(a.triggers) == 1 && a.triggers[0].TryCache != nil && a.triggers[0].TryCache(b, a.triggers[0]) {
if ra := testRunAction(a); ra != nil && ra.TryCache != nil && ra.TryCache(b, ra, a) {
// Best effort attempt to display output from the compile and link steps.
// If it doesn't work, it doesn't work: reusing the test result is more
// important than reprinting diagnostic information.


@ -36,8 +36,9 @@ func (b *Builder) CovData(a *Action, cmdargs ...any) ([]byte, error) {
// but will be empty; in this case the return is an empty string.
func BuildActionCoverMetaFile(runAct *Action) (string, error) {
p := runAct.Package
for i := range runAct.Deps {
pred := runAct.Deps[i]
barrierAct := runAct.Deps[0]
for i := range barrierAct.Deps {
pred := barrierAct.Deps[i]
if pred.Mode != "build" || pred.Package == nil {
continue
}


@ -183,7 +183,21 @@ func (b *Builder) Do(ctx context.Context, root *Action) {
for _, a0 := range a.triggers {
if a.Failed != nil {
a0.Failed = a.Failed
if a0.Mode == "test barrier" {
// If this action was triggered by a test, there
// will be a test barrier action in between the test
// and the true trigger. But there will be other
// triggers that are other barriers that are waiting
// for this one. Propagate the failure to the true
// trigger, but not to the other barriers.
for _, bt := range a0.triggers {
if bt.Mode != "test barrier" {
bt.Failed = a.Failed
}
}
} else {
a0.Failed = a.Failed
}
}
if a0.pending--; a0.pending == 0 {
b.ready.push(a0)
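
For illustration, here is a self-contained toy model of the barrier chaining used by the go test changes above (simplified types, not cmd/go's real work.Action graph): each run action gets a barrier that depends on the run's build dependencies and on the previous barrier, so run actions can only become ready in source order while their builds still proceed in parallel.

```go
package main

import "fmt"

// action is a toy stand-in for cmd/go's work.Action.
type action struct {
	name string
	deps []*action
}

// addRunBarriers inserts a "barrier" action between each run action and
// its dependencies, chaining barriers together so that run actions can
// only become schedulable in the order they appear.
func addRunBarriers(runs []*action) {
	var prev *action
	for _, r := range runs {
		barrier := &action{
			name: "barrier(" + r.name + ")",
			deps: append([]*action(nil), r.deps...),
		}
		if prev != nil {
			barrier.deps = append(barrier.deps, prev)
		}
		r.deps = []*action{barrier}
		prev = barrier
	}
}

func main() {
	buildA := &action{name: "build(a)"}
	buildB := &action{name: "build(b)"}
	runA := &action{name: "run(a)", deps: []*action{buildA}}
	runB := &action{name: "run(b)", deps: []*action{buildB}}
	addRunBarriers([]*action{runA, runB})
	// run(b) depends on barrier(b), which depends on build(b) and barrier(a),
	// so run(b) cannot become ready before run(a)'s dependencies finish.
	for _, d := range runB.deps[0].deps {
		fmt.Println(runB.name, "waits for", d.name)
	}
}
```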


@ -892,8 +892,6 @@ var optab = []Optab{
{obj.ANOP, C_LCON, C_NONE, C_NONE, C_NONE, C_NONE, 0, 0, 0, 0, 0}, // nop variants, see #40689 {obj.ANOP, C_LCON, C_NONE, C_NONE, C_NONE, C_NONE, 0, 0, 0, 0, 0}, // nop variants, see #40689
{obj.ANOP, C_ZREG, C_NONE, C_NONE, C_NONE, C_NONE, 0, 0, 0, 0, 0}, {obj.ANOP, C_ZREG, C_NONE, C_NONE, C_NONE, C_NONE, 0, 0, 0, 0, 0},
{obj.ANOP, C_VREG, C_NONE, C_NONE, C_NONE, C_NONE, 0, 0, 0, 0, 0}, {obj.ANOP, C_VREG, C_NONE, C_NONE, C_NONE, C_NONE, 0, 0, 0, 0, 0},
{obj.ADUFFZERO, C_NONE, C_NONE, C_NONE, C_SBRA, C_NONE, 5, 4, 0, 0, 0}, // same as AB/ABL
{obj.ADUFFCOPY, C_NONE, C_NONE, C_NONE, C_SBRA, C_NONE, 5, 4, 0, 0, 0}, // same as AB/ABL
{obj.APCALIGN, C_LCON, C_NONE, C_NONE, C_NONE, C_NONE, 0, 0, 0, 0, 0}, // align code {obj.APCALIGN, C_LCON, C_NONE, C_NONE, C_NONE, C_NONE, 0, 0, 0, 0, 0}, // align code
{obj.APCALIGNMAX, C_LCON, C_NONE, C_NONE, C_LCON, C_NONE, 0, 0, 0, 0, 0}, // align code, conditional {obj.APCALIGNMAX, C_LCON, C_NONE, C_NONE, C_LCON, C_NONE, 0, 0, 0, 0, 0}, // align code, conditional
} }
@ -3297,9 +3295,7 @@ func buildop(ctxt *obj.Link) {
obj.AFUNCDATA,
obj.APCALIGN,
obj.APCALIGNMAX,
obj.APCDATA,
obj.ADUFFZERO,
obj.ADUFFCOPY:
obj.APCDATA:
break
}
}
@ -6971,7 +6967,7 @@ func (c *ctxt7) opbra(p *obj.Prog, a obj.As) uint32 {
case AB:
return 0<<31 | 5<<26 /* imm26 */
case obj.ADUFFZERO, obj.ADUFFCOPY, ABL:
case ABL:
return 1<<31 | 5<<26
}


@ -319,11 +319,7 @@ func progedit(ctxt *obj.Link, p *obj.Prog, newprog obj.ProgAlloc) {
// Rewrite BR/BL to symbol as TYPE_BRANCH.
switch p.As {
case AB,
ABL,
obj.ARET,
obj.ADUFFZERO,
obj.ADUFFCOPY:
case AB, ABL, obj.ARET:
if p.To.Sym != nil {
p.To.Type = obj.TYPE_BRANCH
}
@ -400,39 +396,6 @@ func progedit(ctxt *obj.Link, p *obj.Prog, newprog obj.ProgAlloc) {
// Rewrite p, if necessary, to access global data via the global offset table. // Rewrite p, if necessary, to access global data via the global offset table.
func (c *ctxt7) rewriteToUseGot(p *obj.Prog) { func (c *ctxt7) rewriteToUseGot(p *obj.Prog) {
if p.As == obj.ADUFFCOPY || p.As == obj.ADUFFZERO {
// ADUFFxxx $offset
// becomes
// MOVD runtime.duffxxx@GOT, REGTMP
// ADD $offset, REGTMP
// CALL REGTMP
var sym *obj.LSym
if p.As == obj.ADUFFZERO {
sym = c.ctxt.LookupABI("runtime.duffzero", obj.ABIInternal)
} else {
sym = c.ctxt.LookupABI("runtime.duffcopy", obj.ABIInternal)
}
offset := p.To.Offset
p.As = AMOVD
p.From.Type = obj.TYPE_MEM
p.From.Name = obj.NAME_GOTREF
p.From.Sym = sym
p.To.Type = obj.TYPE_REG
p.To.Reg = REGTMP
p.To.Name = obj.NAME_NONE
p.To.Offset = 0
p.To.Sym = nil
p1 := obj.Appendp(p, c.newprog)
p1.As = AADD
p1.From.Type = obj.TYPE_CONST
p1.From.Offset = offset
p1.To.Type = obj.TYPE_REG
p1.To.Reg = REGTMP
p2 := obj.Appendp(p1, c.newprog)
p2.As = obj.ACALL
p2.To.Type = obj.TYPE_REG
p2.To.Reg = REGTMP
}
// We only care about global data: NAME_EXTERN means a global // We only care about global data: NAME_EXTERN means a global
// symbol in the Go sense, and p.Sym.Local is true for a few // symbol in the Go sense, and p.Sym.Local is true for a few
@ -543,9 +506,7 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) {
case obj.ATEXT:
p.Mark |= LEAF
case ABL,
obj.ADUFFZERO,
obj.ADUFFCOPY:
case ABL:
c.cursym.Func().Text.Mark &^= LEAF
}
}
@ -912,110 +873,7 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) {
p.From.Type = obj.TYPE_MEM p.From.Type = obj.TYPE_MEM
p.From.Reg = REGSP p.From.Reg = REGSP
} }
case obj.ADUFFCOPY:
// ADR ret_addr, R27
// STP (FP, R27), -24(SP)
// SUB 24, SP, FP
// DUFFCOPY
// ret_addr:
// SUB 8, SP, FP
q1 := p
// copy DUFFCOPY from q1 to q4
q4 := obj.Appendp(p, c.newprog)
q4.Pos = p.Pos
q4.As = obj.ADUFFCOPY
q4.To = p.To
q1.As = AADR
q1.From.Type = obj.TYPE_BRANCH
q1.To.Type = obj.TYPE_REG
q1.To.Reg = REG_R27
q2 := obj.Appendp(q1, c.newprog)
q2.Pos = p.Pos
q2.As = ASTP
q2.From.Type = obj.TYPE_REGREG
q2.From.Reg = REGFP
q2.From.Offset = int64(REG_R27)
q2.To.Type = obj.TYPE_MEM
q2.To.Reg = REGSP
q2.To.Offset = -24
// maintain FP for DUFFCOPY
q3 := obj.Appendp(q2, c.newprog)
q3.Pos = p.Pos
q3.As = ASUB
q3.From.Type = obj.TYPE_CONST
q3.From.Offset = 24
q3.Reg = REGSP
q3.To.Type = obj.TYPE_REG
q3.To.Reg = REGFP
q5 := obj.Appendp(q4, c.newprog)
q5.Pos = p.Pos
q5.As = ASUB
q5.From.Type = obj.TYPE_CONST
q5.From.Offset = 8
q5.Reg = REGSP
q5.To.Type = obj.TYPE_REG
q5.To.Reg = REGFP
q1.From.SetTarget(q5)
p = q5
case obj.ADUFFZERO:
// ADR ret_addr, R27
// STP (FP, R27), -24(SP)
// SUB 24, SP, FP
// DUFFZERO
// ret_addr:
// SUB 8, SP, FP
q1 := p
// copy DUFFZERO from q1 to q4
q4 := obj.Appendp(p, c.newprog)
q4.Pos = p.Pos
q4.As = obj.ADUFFZERO
q4.To = p.To
q1.As = AADR
q1.From.Type = obj.TYPE_BRANCH
q1.To.Type = obj.TYPE_REG
q1.To.Reg = REG_R27
q2 := obj.Appendp(q1, c.newprog)
q2.Pos = p.Pos
q2.As = ASTP
q2.From.Type = obj.TYPE_REGREG
q2.From.Reg = REGFP
q2.From.Offset = int64(REG_R27)
q2.To.Type = obj.TYPE_MEM
q2.To.Reg = REGSP
q2.To.Offset = -24
// maintain FP for DUFFZERO
q3 := obj.Appendp(q2, c.newprog)
q3.Pos = p.Pos
q3.As = ASUB
q3.From.Type = obj.TYPE_CONST
q3.From.Offset = 24
q3.Reg = REGSP
q3.To.Type = obj.TYPE_REG
q3.To.Reg = REGFP
q5 := obj.Appendp(q4, c.newprog)
q5.Pos = p.Pos
q5.As = ASUB
q5.From.Type = obj.TYPE_CONST
q5.From.Offset = 8
q5.Reg = REGSP
q5.To.Type = obj.TYPE_REG
q5.To.Reg = REGFP
q1.From.SetTarget(q5)
p = q5
} }
if p.To.Type == obj.TYPE_REG && p.To.Reg == REGSP && p.Spadj == 0 { if p.To.Type == obj.TYPE_REG && p.To.Reg == REGSP && p.Spadj == 0 {
f := c.cursym.Func() f := c.cursym.Func()
if f.FuncFlag&abi.FuncFlagSPWrite == 0 { if f.FuncFlag&abi.FuncFlagSPWrite == 0 {


@ -4013,15 +4013,6 @@ func (ab *AsmBuf) mediaop(ctxt *obj.Link, o *Optab, op int, osize int, z int) in
return z return z
} }
var bpduff1 = []byte{
0x48, 0x89, 0x6c, 0x24, 0xf0, // MOVQ BP, -16(SP)
0x48, 0x8d, 0x6c, 0x24, 0xf0, // LEAQ -16(SP), BP
}
var bpduff2 = []byte{
0x48, 0x8b, 0x6d, 0x00, // MOVQ 0(BP), BP
}
// asmevex emits EVEX pregis and opcode byte. // asmevex emits EVEX pregis and opcode byte.
// In addition to asmvex r/m, vvvv and reg fields also requires optional // In addition to asmvex r/m, vvvv and reg fields also requires optional
// K-masking register. // K-masking register.
@ -4859,16 +4850,6 @@ func (ab *AsmBuf) doasm(ctxt *obj.Link, cursym *obj.LSym, p *obj.Prog) {
ctxt.Diag("directly calling duff when dynamically linking Go") ctxt.Diag("directly calling duff when dynamically linking Go")
} }
if yt.zcase == Zcallduff && ctxt.Arch.Family == sys.AMD64 {
// Maintain BP around call, since duffcopy/duffzero can't do it
// (the call jumps into the middle of the function).
// This makes it possible to see call sites for duffcopy/duffzero in
// BP-based profiling tools like Linux perf (which is the
// whole point of maintaining frame pointers in Go).
// MOVQ BP, -16(SP)
// LEAQ -16(SP), BP
ab.Put(bpduff1)
}
ab.Put1(byte(op)) ab.Put1(byte(op))
cursym.AddRel(ctxt, obj.Reloc{ cursym.AddRel(ctxt, obj.Reloc{
Type: objabi.R_CALL, Type: objabi.R_CALL,
@ -4879,12 +4860,6 @@ func (ab *AsmBuf) doasm(ctxt *obj.Link, cursym *obj.LSym, p *obj.Prog) {
}) })
ab.PutInt32(0) ab.PutInt32(0)
if yt.zcase == Zcallduff && ctxt.Arch.Family == sys.AMD64 {
// Pop BP pushed above.
// MOVQ 0(BP), BP
ab.Put(bpduff2)
}
// TODO: jump across functions needs reloc // TODO: jump across functions needs reloc
case Zbr, Zjmp, Zloop: case Zbr, Zjmp, Zloop:
if p.As == AXBEGIN { if p.As == AXBEGIN {


@ -16,6 +16,7 @@ import (
// between each string.
//
// A non-nil error returned by Join implements the Unwrap() []error method.
// The errors may be inspected with [Is] and [As].
func Join(errs ...error) error {
n := 0
for _, err := range errs {
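
A short usage example of the behavior this comment documents: errors.Is and errors.As search all operands of a joined error.

```go
package main

import (
	"errors"
	"fmt"
	"io/fs"
	"os"
)

func main() {
	err := errors.Join(os.ErrNotExist, errors.New("cache miss"))

	// errors.Is reports true because one of the joined errors matches.
	fmt.Println(errors.Is(err, os.ErrNotExist)) // true

	// errors.As searches the joined errors too; no *fs.PathError was
	// joined here, so it reports false.
	var pathErr *fs.PathError
	fmt.Println(errors.As(err, &pathErr)) // false
}
```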


@ -622,12 +622,22 @@ func (fd *FD) Pread(b []byte, off int64) (int, error) {
fd.l.Lock() fd.l.Lock()
defer fd.l.Unlock() defer fd.l.Unlock()
curoffset, err := syscall.Seek(fd.Sysfd, 0, io.SeekCurrent) if fd.isBlocking {
if err != nil { curoffset, err := syscall.Seek(fd.Sysfd, 0, io.SeekCurrent)
return 0, err if err != nil {
return 0, err
}
defer syscall.Seek(fd.Sysfd, curoffset, io.SeekStart)
defer fd.setOffset(curoffset)
} else {
// Overlapped handles don't have the file pointer updated
// when performing I/O operations, so there is no need to
// call Seek to reset the file pointer.
// Also, some overlapped file handles don't support seeking.
// See https://go.dev/issues/74951.
curoffset := fd.offset
defer fd.setOffset(curoffset)
} }
defer syscall.Seek(fd.Sysfd, curoffset, io.SeekStart)
defer fd.setOffset(curoffset)
o := &fd.rop o := &fd.rop
o.InitBuf(b) o.InitBuf(b)
fd.setOffset(off) fd.setOffset(off)
@ -847,12 +857,22 @@ func (fd *FD) Pwrite(buf []byte, off int64) (int, error) {
fd.l.Lock() fd.l.Lock()
defer fd.l.Unlock() defer fd.l.Unlock()
curoffset, err := syscall.Seek(fd.Sysfd, 0, io.SeekCurrent) if fd.isBlocking {
if err != nil { curoffset, err := syscall.Seek(fd.Sysfd, 0, io.SeekCurrent)
return 0, err if err != nil {
return 0, err
}
defer syscall.Seek(fd.Sysfd, curoffset, io.SeekStart)
defer fd.setOffset(curoffset)
} else {
// Overlapped handles don't have the file pointer updated
// when performing I/O operations, so there is no need to
// call Seek to reset the file pointer.
// Also, some overlapped file handles don't support seeking.
// See https://go.dev/issues/74951.
curoffset := fd.offset
defer fd.setOffset(curoffset)
} }
defer syscall.Seek(fd.Sysfd, curoffset, io.SeekStart)
defer fd.setOffset(curoffset)
var ntotal int var ntotal int
for { for {
@ -1107,6 +1127,12 @@ func (fd *FD) Seek(offset int64, whence int) (int64, error) {
fd.l.Lock()
defer fd.l.Unlock()
if !fd.isBlocking && whence == io.SeekCurrent {
// Windows doesn't keep the file pointer for overlapped file handles.
// We do it ourselves in case to account for any read or write
// operations that may have occurred.
offset += fd.offset
}
n, err := syscall.Seek(fd.Sysfd, offset, whence)
fd.setOffset(n)
return n, err
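
The Seek change above folds the Go-side offset into io.SeekCurrent requests because the OS does not move a file pointer for overlapped handles. A rough model of that bookkeeping (overlappedFile is an illustrative type, not the internal/poll implementation):

```go
package main

import (
	"fmt"
	"io"
)

// overlappedFile models a handle whose OS-level file pointer is not
// updated by reads and writes, so the logical offset is tracked in Go.
type overlappedFile struct {
	offset int64 // logical position maintained by our own code
	size   int64
}

// read advances the tracked offset instead of an OS file pointer.
func (f *overlappedFile) read(n int64) {
	f.offset += n
}

// seek mirrors the idea in FD.Seek: for io.SeekCurrent the tracked
// offset is folded in before computing the new absolute position.
func (f *overlappedFile) seek(offset int64, whence int) int64 {
	switch whence {
	case io.SeekCurrent:
		offset += f.offset
	case io.SeekEnd:
		offset += f.size
	}
	f.offset = offset
	return offset
}

func main() {
	f := &overlappedFile{size: 100}
	f.read(10)
	fmt.Println(f.seek(5, io.SeekCurrent)) // 15: 10 already read plus 5
}
```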


@ -8,6 +8,7 @@ package maps
import (
"internal/abi"
"internal/goarch"
"internal/runtime/math"
"unsafe"
)
@ -127,8 +128,7 @@ func (t *table) maxGrowthLeft() uint16 {
// single-group tables, we could fill all slots.
return t.capacity - 1
} else {
if t.capacity*maxAvgGroupLoad < t.capacity {
// TODO(prattmic): Do something cleaner.
if t.capacity > math.MaxUint16/maxAvgGroupLoad {
panic("overflow")
}
return (t.capacity * maxAvgGroupLoad) / abi.MapGroupSlots
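
The new guard compares the capacity against MaxUint16 divided by the multiplier, so a potential overflow is detected before any multiplication happens. A small demonstration with illustrative values (the multiplier 7 here is only an example, not necessarily the runtime's constant):

```go
package main

import "fmt"

const MaxUint16 = ^uint16(0) // 65535, matching the new internal/runtime/math constant

func main() {
	const maxAvgGroupLoad = 7 // illustrative multiplier
	capacity := uint16(15000)

	// New, division-based guard: detects the overflow before multiplying.
	fmt.Println(capacity > MaxUint16/maxAvgGroupLoad) // true (15000 > 9362)

	// Old, multiplication-based guard: 15000*7 = 105000 wraps to 39464 in
	// uint16, which is still >= capacity, so a "product < capacity" test
	// would not flag this particular overflow.
	product := capacity * maxAvgGroupLoad
	fmt.Println(product, product < capacity) // 39464 false
}
```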


@ -7,6 +7,7 @@ package math
import "internal/goarch" import "internal/goarch"
const ( const (
MaxUint16 = ^uint16(0)
MaxUint32 = ^uint32(0) MaxUint32 = ^uint32(0)
MaxUint64 = ^uint64(0) MaxUint64 = ^uint64(0)
MaxUintptr = ^uintptr(0) MaxUintptr = ^uintptr(0)


@ -383,57 +383,59 @@ func TestChannelMovedOutOfBubble(t *testing.T) {
for _, test := range []struct { for _, test := range []struct {
desc string desc string
f func(chan struct{}) f func(chan struct{})
wantPanic string wantFatal string
}{{ }{{
desc: "receive", desc: "receive",
f: func(ch chan struct{}) { f: func(ch chan struct{}) {
<-ch <-ch
}, },
wantPanic: "receive on synctest channel from outside bubble", wantFatal: "receive on synctest channel from outside bubble",
}, { }, {
desc: "send", desc: "send",
f: func(ch chan struct{}) { f: func(ch chan struct{}) {
ch <- struct{}{} ch <- struct{}{}
}, },
wantPanic: "send on synctest channel from outside bubble", wantFatal: "send on synctest channel from outside bubble",
}, { }, {
desc: "close", desc: "close",
f: func(ch chan struct{}) { f: func(ch chan struct{}) {
close(ch) close(ch)
}, },
wantPanic: "close of synctest channel from outside bubble", wantFatal: "close of synctest channel from outside bubble",
}} { }} {
t.Run(test.desc, func(t *testing.T) { t.Run(test.desc, func(t *testing.T) {
// Bubbled channel accessed from outside any bubble. // Bubbled channel accessed from outside any bubble.
t.Run("outside_bubble", func(t *testing.T) { t.Run("outside_bubble", func(t *testing.T) {
donec := make(chan struct{}) wantFatal(t, test.wantFatal, func() {
ch := make(chan chan struct{}) donec := make(chan struct{})
go func() { ch := make(chan chan struct{})
defer close(donec) go func() {
defer wantPanic(t, test.wantPanic) defer close(donec)
test.f(<-ch) test.f(<-ch)
}() }()
synctest.Run(func() { synctest.Run(func() {
ch <- make(chan struct{}) ch <- make(chan struct{})
})
<-donec
}) })
<-donec
}) })
// Bubbled channel accessed from a different bubble. // Bubbled channel accessed from a different bubble.
t.Run("different_bubble", func(t *testing.T) { t.Run("different_bubble", func(t *testing.T) {
donec := make(chan struct{}) wantFatal(t, test.wantFatal, func() {
ch := make(chan chan struct{}) donec := make(chan struct{})
go func() { ch := make(chan chan struct{})
defer close(donec) go func() {
c := <-ch defer close(donec)
c := <-ch
synctest.Run(func() {
test.f(c)
})
}()
synctest.Run(func() { synctest.Run(func() {
defer wantPanic(t, test.wantPanic) ch <- make(chan struct{})
test.f(c)
}) })
}() <-donec
synctest.Run(func() {
ch <- make(chan struct{})
}) })
<-donec
}) })
}) })
} }
@ -443,39 +445,40 @@ func TestTimerFromInsideBubble(t *testing.T) {
for _, test := range []struct { for _, test := range []struct {
desc string desc string
f func(tm *time.Timer) f func(tm *time.Timer)
wantPanic string wantFatal string
}{{ }{{
desc: "read channel", desc: "read channel",
f: func(tm *time.Timer) { f: func(tm *time.Timer) {
<-tm.C <-tm.C
}, },
wantPanic: "receive on synctest channel from outside bubble", wantFatal: "receive on synctest channel from outside bubble",
}, { }, {
desc: "Reset", desc: "Reset",
f: func(tm *time.Timer) { f: func(tm *time.Timer) {
tm.Reset(1 * time.Second) tm.Reset(1 * time.Second)
}, },
wantPanic: "reset of synctest timer from outside bubble", wantFatal: "reset of synctest timer from outside bubble",
}, { }, {
desc: "Stop", desc: "Stop",
f: func(tm *time.Timer) { f: func(tm *time.Timer) {
tm.Stop() tm.Stop()
}, },
wantPanic: "stop of synctest timer from outside bubble", wantFatal: "stop of synctest timer from outside bubble",
}} { }} {
t.Run(test.desc, func(t *testing.T) { t.Run(test.desc, func(t *testing.T) {
donec := make(chan struct{}) wantFatal(t, test.wantFatal, func() {
ch := make(chan *time.Timer) donec := make(chan struct{})
go func() { ch := make(chan *time.Timer)
defer close(donec) go func() {
defer wantPanic(t, test.wantPanic) defer close(donec)
test.f(<-ch) test.f(<-ch)
}() }()
synctest.Run(func() { synctest.Run(func() {
tm := time.NewTimer(1 * time.Second) tm := time.NewTimer(1 * time.Second)
ch <- tm ch <- tm
})
<-donec
}) })
<-donec
}) })
}
}
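
These tests now expect fatal runtime errors rather than recoverable panics, so the failing operation has to run in a separate process. The helper the tests rely on is not shown in this hunk; below is a generic sketch of the subprocess pattern such a wantFatal helper could follow (wantFatalSketch is a hypothetical implementation, for illustration only):

```go
package example_test

import (
	"os"
	"os/exec"
	"strings"
	"testing"
)

// wantFatalSketch re-runs the current test in a child process (selected
// via -test.run and a marker environment variable) and checks that the
// child dies with output containing the expected fatal message.
func wantFatalSketch(t *testing.T, want string, f func()) {
	t.Helper()
	if os.Getenv("WANT_FATAL_CHILD") == "1" {
		f() // expected to die with a fatal error; nothing below should run
		os.Exit(0)
	}
	cmd := exec.Command(os.Args[0], "-test.run=^"+t.Name()+"$")
	cmd.Env = append(os.Environ(), "WANT_FATAL_CHILD=1")
	out, err := cmd.CombinedOutput()
	if err == nil {
		t.Fatalf("child process succeeded; want fatal error %q", want)
	}
	if !strings.Contains(string(out), want) {
		t.Fatalf("child output %q does not contain %q", out, want)
	}
}
```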


@ -44,6 +44,10 @@ func (b *batch) isSyncBatch(ver version.Version) bool {
(tracev2.EventType(b.data[0]) == tracev2.EvSync && ver >= version.Go125))
}
func (b *batch) isEndOfGeneration() bool {
return b.exp == tracev2.NoExperiment && len(b.data) > 0 && tracev2.EventType(b.data[0]) == tracev2.EvEndOfGeneration
}
// readBatch reads the next full batch from r.
func readBatch(r interface {
io.Reader
@ -54,6 +58,9 @@ func readBatch(r interface {
if err != nil {
return batch{}, 0, err
}
if typ := tracev2.EventType(b); typ == tracev2.EvEndOfGeneration {
return batch{m: NoThread, exp: tracev2.NoExperiment, data: []byte{b}}, 0, nil
}
if typ := tracev2.EventType(b); typ != tracev2.EvEventBatch && typ != tracev2.EvExperimentalBatch {
return batch{}, 0, fmt.Errorf("expected batch event, got event %d", typ)
}


@ -843,7 +843,6 @@ func (e Event) String() string {
fmt.Fprintf(&sb, " Task=%d Category=%q Message=%q", l.Task, l.Category, l.Message) fmt.Fprintf(&sb, " Task=%d Category=%q Message=%q", l.Task, l.Category, l.Message)
case EventStateTransition: case EventStateTransition:
s := e.StateTransition() s := e.StateTransition()
fmt.Fprintf(&sb, " Resource=%s Reason=%q", s.Resource, s.Reason)
switch s.Resource.Kind { switch s.Resource.Kind {
case ResourceGoroutine: case ResourceGoroutine:
id := s.Resource.Goroutine() id := s.Resource.Goroutine()
@ -854,6 +853,7 @@ func (e Event) String() string {
old, new := s.Proc() old, new := s.Proc()
fmt.Fprintf(&sb, " ProcID=%d %s->%s", id, old, new) fmt.Fprintf(&sb, " ProcID=%d %s->%s", id, old, new)
} }
fmt.Fprintf(&sb, " Reason=%q", s.Reason)
if s.Stack != NoStack { if s.Stack != NoStack {
fmt.Fprintln(&sb) fmt.Fprintln(&sb)
fmt.Fprintln(&sb, "TransitionStack=") fmt.Fprintln(&sb, "TransitionStack=")
@ -879,7 +879,7 @@ func (e Event) String() string {
fmt.Fprintf(&sb, " Trace=%d Mono=%d Wall=%s", fmt.Fprintf(&sb, " Trace=%d Mono=%d Wall=%s",
s.ClockSnapshot.Trace, s.ClockSnapshot.Trace,
s.ClockSnapshot.Mono, s.ClockSnapshot.Mono,
s.ClockSnapshot.Wall.Format(time.RFC3339), s.ClockSnapshot.Wall.Format(time.RFC3339Nano),
) )
} }
} }


@ -9,6 +9,7 @@ import (
"bytes" "bytes"
"cmp" "cmp"
"encoding/binary" "encoding/binary"
"errors"
"fmt" "fmt"
"io" "io"
"slices" "slices"
@ -32,22 +33,102 @@ type generation struct {
*evTable
}
// readGeneration buffers and decodes the structural elements of a trace generation
// out of r.
func readGeneration(r *bufio.Reader, ver version.Version) (*generation, error) {
if ver < version.Go126 {
return nil, errors.New("internal error: readGeneration called for <1.26 trace")
}
g := &generation{
evTable: &evTable{
pcs: make(map[uint64]frame),
},
batches: make(map[ThreadID][]batch),
}
// Read batches one at a time until we either hit the next generation.
for {
b, gen, err := readBatch(r)
if err == io.EOF {
if len(g.batches) != 0 {
return nil, errors.New("incomplete generation found; trace likely truncated")
}
return nil, nil // All done.
}
if err != nil {
return nil, err
}
if g.gen == 0 {
// Initialize gen.
g.gen = gen
}
if b.isEndOfGeneration() {
break
}
if gen == 0 {
// 0 is a sentinel used by the runtime, so we'll never see it.
return nil, fmt.Errorf("invalid generation number %d", gen)
}
if gen != g.gen {
return nil, fmt.Errorf("broken trace: missing end-of-generation event, or generations are interleaved")
}
if g.minTs == 0 || b.time < g.minTs {
g.minTs = b.time
}
if err := processBatch(g, b, ver); err != nil {
return nil, err
}
}
// Check some invariants.
if g.freq == 0 {
return nil, fmt.Errorf("no frequency event found")
}
if !g.hasClockSnapshot {
return nil, fmt.Errorf("no clock snapshot event found")
}
// N.B. Trust that the batch order is correct. We can't validate the batch order
// by timestamp because the timestamps could just be plain wrong. The source of
// truth is the order things appear in the trace and the partial order sequence
// numbers on certain events. If it turns out the batch order is actually incorrect
// we'll very likely fail to advance a partial order from the frontier.
// Compactify stacks and strings for better lookup performance later.
g.stacks.compactify()
g.strings.compactify()
// Validate stacks.
if err := validateStackStrings(&g.stacks, &g.strings, g.pcs); err != nil {
return nil, err
}
// Now that we have the frequency, fix up CPU samples.
fixUpCPUSamples(g.cpuSamples, g.freq)
return g, nil
}
// spilledBatch represents a batch that was read out for the next generation, // spilledBatch represents a batch that was read out for the next generation,
// while reading the previous one. It's passed on when parsing the next // while reading the previous one. It's passed on when parsing the next
// generation. // generation.
//
// Used only for trace versions < Go126.
type spilledBatch struct { type spilledBatch struct {
gen uint64 gen uint64
*batch *batch
} }
// readGeneration buffers and decodes the structural elements of a trace generation // readGenerationWithSpill buffers and decodes the structural elements of a trace generation
// out of r. spill is the first batch of the new generation (already buffered and // out of r. spill is the first batch of the new generation (already buffered and
// parsed from reading the last generation). Returns the generation and the first // parsed from reading the last generation). Returns the generation and the first
// batch read of the next generation, if any. // batch read of the next generation, if any.
// //
// If gen is non-nil, it is valid and must be processed before handling the returned // If gen is non-nil, it is valid and must be processed before handling the returned
// error. // error.
func readGeneration(r *bufio.Reader, spill *spilledBatch, ver version.Version) (*generation, *spilledBatch, error) { func readGenerationWithSpill(r *bufio.Reader, spill *spilledBatch, ver version.Version) (*generation, *spilledBatch, error) {
if ver >= version.Go126 {
return nil, nil, errors.New("internal error: readGenerationWithSpill called for Go 1.26+ trace")
}
g := &generation{ g := &generation{
evTable: &evTable{ evTable: &evTable{
pcs: make(map[uint64]frame), pcs: make(map[uint64]frame),
@ -56,6 +137,7 @@ func readGeneration(r *bufio.Reader, spill *spilledBatch, ver version.Version) (
} }
// Process the spilled batch. // Process the spilled batch.
if spill != nil { if spill != nil {
// Process the spilled batch, which contains real data.
g.gen = spill.gen g.gen = spill.gen
g.minTs = spill.batch.time g.minTs = spill.batch.time
if err := processBatch(g, *spill.batch, ver); err != nil { if err := processBatch(g, *spill.batch, ver); err != nil {
@ -63,8 +145,7 @@ func readGeneration(r *bufio.Reader, spill *spilledBatch, ver version.Version) (
} }
spill = nil spill = nil
} }
// Read batches one at a time until we either hit EOF or // Read batches one at a time until we either hit the next generation.
// the next generation.
var spillErr error var spillErr error
for { for {
b, gen, err := readBatch(r) b, gen, err := readBatch(r)
@ -73,7 +154,7 @@ func readGeneration(r *bufio.Reader, spill *spilledBatch, ver version.Version) (
} }
if err != nil { if err != nil {
if g.gen != 0 { if g.gen != 0 {
// This is an error reading the first batch of the next generation. // This may be an error reading the first batch of the next generation.
// This is fine. Let's forge ahead assuming that what we've got so // This is fine. Let's forge ahead assuming that what we've got so
// far is fine. // far is fine.
spillErr = err spillErr = err
@ -89,7 +170,8 @@ func readGeneration(r *bufio.Reader, spill *spilledBatch, ver version.Version) (
// Initialize gen. // Initialize gen.
g.gen = gen g.gen = gen
} }
if gen == g.gen+1 { // TODO: advance this the same way the runtime does. if gen == g.gen+1 {
// TODO: Increment the generation with wraparound the same way the runtime does.
spill = &spilledBatch{gen: gen, batch: &b} spill = &spilledBatch{gen: gen, batch: &b}
break break
} }
@ -134,15 +216,8 @@ func readGeneration(r *bufio.Reader, spill *spilledBatch, ver version.Version) (
return nil, nil, err return nil, nil, err
} }
// Fix up the CPU sample timestamps, now that we have freq. // Now that we have the frequency, fix up CPU samples.
for i := range g.cpuSamples { fixUpCPUSamples(g.cpuSamples, g.freq)
s := &g.cpuSamples[i]
s.time = g.freq.mul(timestamp(s.time))
}
// Sort the CPU samples.
slices.SortFunc(g.cpuSamples, func(a, b cpuSample) int {
return cmp.Compare(a.time, b.time)
})
return g, spill, spillErr return g, spill, spillErr
} }
@ -174,6 +249,8 @@ func processBatch(g *generation, b batch, ver version.Version) error {
if err := addExperimentalBatch(g.expBatches, b); err != nil { if err := addExperimentalBatch(g.expBatches, b); err != nil {
return err return err
} }
case b.isEndOfGeneration():
return errors.New("internal error: unexpectedly processing EndOfGeneration; broken trace?")
default: default:
if _, ok := g.batches[b.m]; !ok { if _, ok := g.batches[b.m]; !ok {
g.batchMs = append(g.batchMs, b.m) g.batchMs = append(g.batchMs, b.m)
@ -512,3 +589,15 @@ func addExperimentalBatch(expBatches map[tracev2.Experiment][]ExperimentalBatch,
}) })
return nil return nil
} }
func fixUpCPUSamples(samples []cpuSample, freq frequency) {
// Fix up the CPU sample timestamps.
for i := range samples {
s := &samples[i]
s.time = freq.mul(timestamp(s.time))
}
// Sort the CPU samples.
slices.SortFunc(samples, func(a, b cpuSample) int {
return cmp.Compare(a.time, b.time)
})
}

View file

@ -322,6 +322,14 @@ func (g *Generation) writeEventsTo(tw *raw.TextWriter) {
} }
} }
b.writeEventsTo(tw) b.writeEventsTo(tw)
// Write end-of-generation event if necessary.
if g.trace.ver >= version.Go126 {
tw.WriteEvent(raw.Event{
Version: g.trace.ver,
Ev: tracev2.EvEndOfGeneration,
})
}
} }
func (g *Generation) newStructuralBatch() *Batch { func (g *Generation) newStructuralBatch() *Batch {

View file

@ -6,6 +6,7 @@ package trace
import ( import (
"bufio" "bufio"
"errors"
"fmt" "fmt"
"io" "io"
"slices" "slices"
@ -22,18 +23,29 @@ import (
// event as the first event, and a Sync event as the last event. // event as the first event, and a Sync event as the last event.
// (There may also be any number of Sync events in the middle, too.) // (There may also be any number of Sync events in the middle, too.)
type Reader struct { type Reader struct {
version version.Version version version.Version
r *bufio.Reader r *bufio.Reader
lastTs Time lastTs Time
gen *generation gen *generation
frontier []*batchCursor
cpuSamples []cpuSample
order ordering
syncs int
readGenErr error
done bool
// Spill state.
//
// Traces before Go 1.26 had no explicit end-of-generation signal, and
// so the first batch of the next generation needed to be parsed to identify
// a new generation. This batch is the "spilled" so we don't lose track
// of it when parsing the next generation.
//
// This is unnecessary after Go 1.26 because of an explicit end-of-generation
// signal.
spill *spilledBatch spill *spilledBatch
spillErr error // error from reading spill spillErr error // error from reading spill
spillErrSync bool // whether we emitted a Sync before reporting spillErr spillErrSync bool // whether we emitted a Sync before reporting spillErr
frontier []*batchCursor
cpuSamples []cpuSample
order ordering
syncs int
done bool
v1Events *traceV1Converter v1Events *traceV1Converter
} }
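From the consumer's side, the reworked Reader is used the same way as before; a minimal sketch, assuming a trace produced by a recent runtime (the package is internal/trace, mirrored as golang.org/x/exp/trace):
```go
// Read every event from a trace file and stop at the final Sync.
f, err := os.Open("trace.out") // illustrative path
if err != nil {
	log.Fatal(err)
}
defer f.Close()
r, err := trace.NewReader(f)
if err != nil {
	log.Fatal(err)
}
for {
	ev, err := r.ReadEvent()
	if err == io.EOF {
		break // the final Sync event has already been delivered
	}
	if err != nil {
		log.Fatal(err)
	}
	_ = ev // inspect ev.Kind(), ev.String(), etc.
}
```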
@ -54,7 +66,7 @@ func NewReader(r io.Reader) (*Reader, error) {
return &Reader{ return &Reader{
v1Events: convertV1Trace(tr), v1Events: convertV1Trace(tr),
}, nil }, nil
case version.Go122, version.Go123, version.Go125: case version.Go122, version.Go123, version.Go125, version.Go126:
return &Reader{ return &Reader{
version: v, version: v,
r: br, r: br,
@ -139,52 +151,23 @@ func (r *Reader) ReadEvent() (e Event, err error) {
// Check if we need to refresh the generation. // Check if we need to refresh the generation.
if len(r.frontier) == 0 && len(r.cpuSamples) == 0 { if len(r.frontier) == 0 && len(r.cpuSamples) == 0 {
if r.spillErr != nil { if r.version < version.Go126 {
if r.spillErrSync { return r.nextGenWithSpill()
return Event{}, r.spillErr }
} if r.readGenErr != nil {
r.spillErrSync = true return Event{}, r.readGenErr
}
gen, err := readGeneration(r.r, r.version)
if err != nil {
// Before returning an error, emit the sync event
// for the current generation and queue up the error
// for the next call.
r.readGenErr = err
r.gen = nil
r.syncs++ r.syncs++
return syncEvent(nil, r.lastTs, r.syncs), nil return syncEvent(nil, r.lastTs, r.syncs), nil
} }
if r.gen != nil && r.spill == nil { return r.installGen(gen)
// If we have a generation from the last read,
// and there's nothing left in the frontier, and
// there's no spilled batch, indicating that there's
// no further generation, it means we're done.
// Emit the final sync event.
r.done = true
r.syncs++
return syncEvent(nil, r.lastTs, r.syncs), nil
}
// Read the next generation.
r.gen, r.spill, r.spillErr = readGeneration(r.r, r.spill, r.version)
if r.gen == nil {
r.spillErrSync = true
r.syncs++
return syncEvent(nil, r.lastTs, r.syncs), nil
}
// Reset CPU samples cursor.
r.cpuSamples = r.gen.cpuSamples
// Reset frontier.
for _, m := range r.gen.batchMs {
batches := r.gen.batches[m]
bc := &batchCursor{m: m}
ok, err := bc.nextEvent(batches, r.gen.freq)
if err != nil {
return Event{}, err
}
if !ok {
// Turns out there aren't actually any events in these batches.
continue
}
r.frontier = heapInsert(r.frontier, bc)
}
r.syncs++
// Always emit a sync event at the beginning of the generation.
return syncEvent(r.gen.evTable, r.gen.freq.mul(r.gen.minTs), r.syncs), nil
} }
tryAdvance := func(i int) (bool, error) { tryAdvance := func(i int) (bool, error) {
bc := r.frontier[i] bc := r.frontier[i]
@ -251,6 +234,78 @@ func (r *Reader) ReadEvent() (e Event, err error) {
return ev, nil return ev, nil
} }
// nextGenWithSpill reads the next generation while handling any spilled
// batches, then installs it via installGen.
func (r *Reader) nextGenWithSpill() (Event, error) {
if r.version >= version.Go126 {
return Event{}, errors.New("internal error: nextGenWithSpill called for Go 1.26+ trace")
}
if r.spillErr != nil {
if r.spillErrSync {
return Event{}, r.spillErr
}
r.spillErrSync = true
r.syncs++
return syncEvent(nil, r.lastTs, r.syncs), nil
}
if r.gen != nil && r.spill == nil {
// If we have a generation from the last read,
// and there's nothing left in the frontier, and
// there's no spilled batch, indicating that there's
// no further generation, it means we're done.
// Emit the final sync event.
r.done = true
r.syncs++
return syncEvent(nil, r.lastTs, r.syncs), nil
}
// Read the next generation.
var gen *generation
gen, r.spill, r.spillErr = readGenerationWithSpill(r.r, r.spill, r.version)
if gen == nil {
r.gen = nil
r.spillErrSync = true
r.syncs++
return syncEvent(nil, r.lastTs, r.syncs), nil
}
return r.installGen(gen)
}
// installGen installs the new generation into the Reader and returns
// a Sync event for the new generation.
func (r *Reader) installGen(gen *generation) (Event, error) {
if gen == nil {
// Emit the final sync event.
r.gen = nil
r.done = true
r.syncs++
return syncEvent(nil, r.lastTs, r.syncs), nil
}
r.gen = gen
// Reset CPU samples cursor.
r.cpuSamples = r.gen.cpuSamples
// Reset frontier.
for _, m := range r.gen.batchMs {
batches := r.gen.batches[m]
bc := &batchCursor{m: m}
ok, err := bc.nextEvent(batches, r.gen.freq)
if err != nil {
return Event{}, err
}
if !ok {
// Turns out there aren't actually any events in these batches.
continue
}
r.frontier = heapInsert(r.frontier, bc)
}
r.syncs++
// Always emit a sync event at the beginning of the generation.
return syncEvent(r.gen.evTable, r.gen.freq.mul(r.gen.minTs), r.syncs), nil
}
func dumpFrontier(frontier []*batchCursor) string { func dumpFrontier(frontier []*batchCursor) string {
var sb strings.Builder var sb strings.Builder
for _, bc := range frontier { for _, bc := range frontier {

View file

@ -87,8 +87,8 @@ const (
EvSync // start of a sync batch [...EvFrequency|EvClockSnapshot] EvSync // start of a sync batch [...EvFrequency|EvClockSnapshot]
EvClockSnapshot // snapshot of trace, mono and wall clocks [timestamp, mono, sec, nsec] EvClockSnapshot // snapshot of trace, mono and wall clocks [timestamp, mono, sec, nsec]
// Reserved internal in-band end-of-generation signal. Must never appear in the trace. Added in Go 1.25. // In-band end-of-generation signal. Added in Go 1.26.
// This could be used as an explicit in-band end-of-generation signal in the future. // Used in Go 1.25 only internally.
EvEndOfGeneration EvEndOfGeneration
NumEvents NumEvents

View file

@ -21,7 +21,8 @@ const (
Go122 Version = 22 // v2 Go122 Version = 22 // v2
Go123 Version = 23 // v2 Go123 Version = 23 // v2
Go125 Version = 25 // v2 Go125 Version = 25 // v2
Current = Go125 Go126 Version = 26 // v2
Current = Go126
) )
var versions = map[Version][]tracev2.EventSpec{ var versions = map[Version][]tracev2.EventSpec{
@ -33,7 +34,8 @@ var versions = map[Version][]tracev2.EventSpec{
Go122: tracev2.Specs()[:tracev2.EvUserLog+1], // All events after are Go 1.23+. Go122: tracev2.Specs()[:tracev2.EvUserLog+1], // All events after are Go 1.23+.
Go123: tracev2.Specs()[:tracev2.EvExperimentalBatch+1], // All events after are Go 1.25+. Go123: tracev2.Specs()[:tracev2.EvExperimentalBatch+1], // All events after are Go 1.25+.
Go125: tracev2.Specs(), Go125: tracev2.Specs()[:tracev2.EvClockSnapshot+1], // All events after are Go 1.26+.
Go126: tracev2.Specs(),
} }
// Specs returns the set of event.Specs for this version. // Specs returns the set of event.Specs for this version.
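A brief illustration (not from the diff) of how this version gating plays out: an event is valid for a trace version only if its spec falls within that version's slice, so EvEndOfGeneration is rejected by Go 1.25 parsers and accepted from Go 1.26 on.
```go
// Illustrative check using the Specs method documented above.
specs125 := version.Go125.Specs()
specs126 := version.Go126.Specs()
okIn125 := int(tracev2.EvEndOfGeneration) < len(specs125) // false: table sliced off at EvClockSnapshot+1
okIn126 := int(tracev2.EvEndOfGeneration) < len(specs126) // true: full spec table
_, _ = okIn125, okIn126
```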

View file

@ -710,6 +710,11 @@ func TestIPv6WriteMsgUDPAddrPortTargetAddrIPVersion(t *testing.T) {
// WriteMsgUDPAddrPort accepts IPv4 and IPv4-mapped IPv6 destination addresses, // WriteMsgUDPAddrPort accepts IPv4 and IPv4-mapped IPv6 destination addresses,
// and rejects IPv6 destination addresses on a "udp4" connection. // and rejects IPv6 destination addresses on a "udp4" connection.
func TestIPv4WriteMsgUDPAddrPortTargetAddrIPVersion(t *testing.T) { func TestIPv4WriteMsgUDPAddrPortTargetAddrIPVersion(t *testing.T) {
switch runtime.GOOS {
case "plan9":
t.Skipf("not supported on %s", runtime.GOOS)
}
if !testableNetwork("udp4") { if !testableNetwork("udp4") {
t.Skipf("skipping: udp4 not available") t.Skipf("skipping: udp4 not available")
} }

View file

@ -1845,6 +1845,72 @@ func TestFile(t *testing.T) {
} }
} }
func TestFileOverlappedSeek(t *testing.T) {
t.Parallel()
name := filepath.Join(t.TempDir(), "foo")
f := newFileOverlapped(t, name, true)
content := []byte("foo")
if _, err := f.Write(content); err != nil {
t.Fatal(err)
}
// Check that the file pointer is at the expected offset.
n, err := f.Seek(0, io.SeekCurrent)
if err != nil {
t.Fatal(err)
}
if n != int64(len(content)) {
t.Errorf("expected file pointer to be at offset %d, got %d", len(content), n)
}
// Set the file pointer to the start of the file.
if _, err := f.Seek(0, io.SeekStart); err != nil {
t.Fatal(err)
}
// Read the first byte.
var buf [1]byte
if _, err := f.Read(buf[:]); err != nil {
t.Fatal(err)
}
if !bytes.Equal(buf[:], content[:len(buf)]) {
t.Errorf("expected %q, got %q", content[:len(buf)], buf[:])
}
// Check that the file pointer is at the expected offset.
n, err = f.Seek(0, io.SeekCurrent)
if err != nil {
t.Fatal(err)
}
if n != int64(len(buf)) {
t.Errorf("expected file pointer to be at offset %d, got %d", len(buf), n)
}
}
func TestFileOverlappedReadAtVolume(t *testing.T) {
// Test that we can use File.ReadAt with an overlapped volume handle.
// See https://go.dev/issues/74951.
t.Parallel()
name := `\\.\` + filepath.VolumeName(t.TempDir())
namep, err := syscall.UTF16PtrFromString(name)
if err != nil {
t.Fatal(err)
}
h, err := syscall.CreateFile(namep,
syscall.GENERIC_READ|syscall.GENERIC_WRITE,
syscall.FILE_SHARE_WRITE|syscall.FILE_SHARE_READ,
nil, syscall.OPEN_ALWAYS, syscall.FILE_FLAG_OVERLAPPED, 0)
if err != nil {
if errors.Is(err, syscall.ERROR_ACCESS_DENIED) {
t.Skip("skipping test: access denied")
}
t.Fatal(err)
}
f := os.NewFile(uintptr(h), name)
defer f.Close()
var buf [0]byte
if _, err := f.ReadAt(buf[:], 0); err != nil {
t.Fatal(err)
}
}
func TestPipe(t *testing.T) { func TestPipe(t *testing.T) {
t.Parallel() t.Parallel()
r, w, err := os.Pipe() r, w, err := os.Pipe()

View file

@ -105,7 +105,7 @@ func MkdirTemp(dir, pattern string) (string, error) {
if try++; try < 10000 { if try++; try < 10000 {
continue continue
} }
return "", &PathError{Op: "mkdirtemp", Path: dir + string(PathSeparator) + prefix + "*" + suffix, Err: ErrExist} return "", &PathError{Op: "mkdirtemp", Path: prefix + "*" + suffix, Err: ErrExist}
} }
if IsNotExist(err) { if IsNotExist(err) {
if _, err := Stat(dir); IsNotExist(err) { if _, err := Stat(dir); IsNotExist(err) {

View file

@ -191,7 +191,7 @@ func chansend(c *hchan, ep unsafe.Pointer, block bool, callerpc uintptr) bool {
} }
if c.bubble != nil && getg().bubble != c.bubble { if c.bubble != nil && getg().bubble != c.bubble {
panic(plainError("send on synctest channel from outside bubble")) fatal("send on synctest channel from outside bubble")
} }
// Fast path: check for failed non-blocking operation without acquiring the lock. // Fast path: check for failed non-blocking operation without acquiring the lock.
@ -318,7 +318,7 @@ func chansend(c *hchan, ep unsafe.Pointer, block bool, callerpc uintptr) bool {
func send(c *hchan, sg *sudog, ep unsafe.Pointer, unlockf func(), skip int) { func send(c *hchan, sg *sudog, ep unsafe.Pointer, unlockf func(), skip int) {
if c.bubble != nil && getg().bubble != c.bubble { if c.bubble != nil && getg().bubble != c.bubble {
unlockf() unlockf()
panic(plainError("send on synctest channel from outside bubble")) fatal("send on synctest channel from outside bubble")
} }
if raceenabled { if raceenabled {
if c.dataqsiz == 0 { if c.dataqsiz == 0 {
@ -416,7 +416,7 @@ func closechan(c *hchan) {
panic(plainError("close of nil channel")) panic(plainError("close of nil channel"))
} }
if c.bubble != nil && getg().bubble != c.bubble { if c.bubble != nil && getg().bubble != c.bubble {
panic(plainError("close of synctest channel from outside bubble")) fatal("close of synctest channel from outside bubble")
} }
lock(&c.lock) lock(&c.lock)
@ -538,7 +538,7 @@ func chanrecv(c *hchan, ep unsafe.Pointer, block bool) (selected, received bool)
} }
if c.bubble != nil && getg().bubble != c.bubble { if c.bubble != nil && getg().bubble != c.bubble {
panic(plainError("receive on synctest channel from outside bubble")) fatal("receive on synctest channel from outside bubble")
} }
if c.timer != nil { if c.timer != nil {
@ -702,7 +702,7 @@ func chanrecv(c *hchan, ep unsafe.Pointer, block bool) (selected, received bool)
func recv(c *hchan, sg *sudog, ep unsafe.Pointer, unlockf func(), skip int) { func recv(c *hchan, sg *sudog, ep unsafe.Pointer, unlockf func(), skip int) {
if c.bubble != nil && getg().bubble != c.bubble { if c.bubble != nil && getg().bubble != c.bubble {
unlockf() unlockf()
panic(plainError("receive on synctest channel from outside bubble")) fatal("receive on synctest channel from outside bubble")
} }
if c.dataqsiz == 0 { if c.dataqsiz == 0 {
if raceenabled { if raceenabled {
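To make the behavioral change above concrete: a bubble violation can no longer be swallowed by recover. This is a minimal sketch assuming the testing/synctest API; the exact messages are the ones in the diff.
```go
// A channel created inside a synctest bubble keeps its bubble association.
// Touching it from outside the bubble is now a fatal runtime error
// ("send on synctest channel from outside bubble"), not a recoverable panic.
func TestBubbleViolationIsFatal(t *testing.T) {
	var ch chan int
	synctest.Test(t, func(t *testing.T) {
		ch = make(chan int, 1) // created inside the bubble
	})
	defer func() {
		recover() // has no effect: fatal errors terminate the process
	}()
	ch <- 1 // send from outside the bubble
}
```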

View file

@ -177,7 +177,7 @@ func totalMutexWaitTimeNanos() int64 {
// NumGoroutine returns the number of goroutines that currently exist. // NumGoroutine returns the number of goroutines that currently exist.
func NumGoroutine() int { func NumGoroutine() int {
return int(gcount()) return int(gcount(false))
} }
//go:linkname debug_modinfo runtime/debug.modinfo //go:linkname debug_modinfo runtime/debug.modinfo

View file

@ -1,427 +0,0 @@
// Code generated by mkduff.go; DO NOT EDIT.
// Run go generate from src/runtime to update.
// See mkduff.go for comments.
#include "textflag.h"
TEXT runtime·duffzero<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-0
MOVUPS X15,(DI)
MOVUPS X15,16(DI)
MOVUPS X15,32(DI)
MOVUPS X15,48(DI)
LEAQ 64(DI),DI
MOVUPS X15,(DI)
MOVUPS X15,16(DI)
MOVUPS X15,32(DI)
MOVUPS X15,48(DI)
LEAQ 64(DI),DI
MOVUPS X15,(DI)
MOVUPS X15,16(DI)
MOVUPS X15,32(DI)
MOVUPS X15,48(DI)
LEAQ 64(DI),DI
MOVUPS X15,(DI)
MOVUPS X15,16(DI)
MOVUPS X15,32(DI)
MOVUPS X15,48(DI)
LEAQ 64(DI),DI
MOVUPS X15,(DI)
MOVUPS X15,16(DI)
MOVUPS X15,32(DI)
MOVUPS X15,48(DI)
LEAQ 64(DI),DI
MOVUPS X15,(DI)
MOVUPS X15,16(DI)
MOVUPS X15,32(DI)
MOVUPS X15,48(DI)
LEAQ 64(DI),DI
MOVUPS X15,(DI)
MOVUPS X15,16(DI)
MOVUPS X15,32(DI)
MOVUPS X15,48(DI)
LEAQ 64(DI),DI
MOVUPS X15,(DI)
MOVUPS X15,16(DI)
MOVUPS X15,32(DI)
MOVUPS X15,48(DI)
LEAQ 64(DI),DI
MOVUPS X15,(DI)
MOVUPS X15,16(DI)
MOVUPS X15,32(DI)
MOVUPS X15,48(DI)
LEAQ 64(DI),DI
MOVUPS X15,(DI)
MOVUPS X15,16(DI)
MOVUPS X15,32(DI)
MOVUPS X15,48(DI)
LEAQ 64(DI),DI
MOVUPS X15,(DI)
MOVUPS X15,16(DI)
MOVUPS X15,32(DI)
MOVUPS X15,48(DI)
LEAQ 64(DI),DI
MOVUPS X15,(DI)
MOVUPS X15,16(DI)
MOVUPS X15,32(DI)
MOVUPS X15,48(DI)
LEAQ 64(DI),DI
MOVUPS X15,(DI)
MOVUPS X15,16(DI)
MOVUPS X15,32(DI)
MOVUPS X15,48(DI)
LEAQ 64(DI),DI
MOVUPS X15,(DI)
MOVUPS X15,16(DI)
MOVUPS X15,32(DI)
MOVUPS X15,48(DI)
LEAQ 64(DI),DI
MOVUPS X15,(DI)
MOVUPS X15,16(DI)
MOVUPS X15,32(DI)
MOVUPS X15,48(DI)
LEAQ 64(DI),DI
MOVUPS X15,(DI)
MOVUPS X15,16(DI)
MOVUPS X15,32(DI)
MOVUPS X15,48(DI)
LEAQ 64(DI),DI
RET
TEXT runtime·duffcopy<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-0
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
RET

View file

@ -1,267 +0,0 @@
// Code generated by mkduff.go; DO NOT EDIT.
// Run go generate from src/runtime to update.
// See mkduff.go for comments.
#include "textflag.h"
TEXT runtime·duffzero<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-0
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP (ZR, ZR), (R20)
RET
TEXT runtime·duffcopy<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-0
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
RET

View file

@ -8,6 +8,7 @@ package runtime
import ( import (
"internal/godebugs" "internal/godebugs"
"internal/runtime/atomic"
"internal/runtime/gc" "internal/runtime/gc"
"unsafe" "unsafe"
) )
@ -465,9 +466,45 @@ func initMetrics() {
}, },
}, },
"/sched/goroutines:goroutines": { "/sched/goroutines:goroutines": {
compute: func(_ *statAggregate, out *metricValue) { deps: makeStatDepSet(schedStatsDep),
compute: func(in *statAggregate, out *metricValue) {
out.kind = metricKindUint64 out.kind = metricKindUint64
out.scalar = uint64(gcount()) out.scalar = uint64(in.schedStats.gTotal)
},
},
"/sched/goroutines/not-in-go:goroutines": {
deps: makeStatDepSet(schedStatsDep),
compute: func(in *statAggregate, out *metricValue) {
out.kind = metricKindUint64
out.scalar = uint64(in.schedStats.gNonGo)
},
},
"/sched/goroutines/running:goroutines": {
deps: makeStatDepSet(schedStatsDep),
compute: func(in *statAggregate, out *metricValue) {
out.kind = metricKindUint64
out.scalar = uint64(in.schedStats.gRunning)
},
},
"/sched/goroutines/runnable:goroutines": {
deps: makeStatDepSet(schedStatsDep),
compute: func(in *statAggregate, out *metricValue) {
out.kind = metricKindUint64
out.scalar = uint64(in.schedStats.gRunnable)
},
},
"/sched/goroutines/waiting:goroutines": {
deps: makeStatDepSet(schedStatsDep),
compute: func(in *statAggregate, out *metricValue) {
out.kind = metricKindUint64
out.scalar = uint64(in.schedStats.gWaiting)
},
},
"/sched/goroutines-created:goroutines": {
deps: makeStatDepSet(schedStatsDep),
compute: func(in *statAggregate, out *metricValue) {
out.kind = metricKindUint64
out.scalar = uint64(in.schedStats.gCreated)
}, },
}, },
"/sched/latencies:seconds": { "/sched/latencies:seconds": {
@ -495,6 +532,13 @@ func initMetrics() {
sched.stwTotalTimeOther.write(out) sched.stwTotalTimeOther.write(out)
}, },
}, },
"/sched/threads/total:threads": {
deps: makeStatDepSet(schedStatsDep),
compute: func(in *statAggregate, out *metricValue) {
out.kind = metricKindUint64
out.scalar = uint64(in.schedStats.threads)
},
},
"/sync/mutex/wait/total:seconds": { "/sync/mutex/wait/total:seconds": {
compute: func(_ *statAggregate, out *metricValue) { compute: func(_ *statAggregate, out *metricValue) {
out.kind = metricKindFloat64 out.kind = metricKindFloat64
@ -547,6 +591,7 @@ const (
cpuStatsDep // corresponds to cpuStatsAggregate cpuStatsDep // corresponds to cpuStatsAggregate
gcStatsDep // corresponds to gcStatsAggregate gcStatsDep // corresponds to gcStatsAggregate
finalStatsDep // corresponds to finalStatsAggregate finalStatsDep // corresponds to finalStatsAggregate
schedStatsDep // corresponds to schedStatsAggregate
numStatsDeps numStatsDeps
) )
@ -740,6 +785,88 @@ func (a *finalStatsAggregate) compute() {
a.cleanupsQueued, a.cleanupsExecuted = gcCleanups.readQueueStats() a.cleanupsQueued, a.cleanupsExecuted = gcCleanups.readQueueStats()
} }
// schedStatsAggregate contains stats about the scheduler, including
// an approximate count of goroutines in each state.
type schedStatsAggregate struct {
gTotal uint64
gRunning uint64
gRunnable uint64
gNonGo uint64
gWaiting uint64
gCreated uint64
threads uint64
}
// compute populates the schedStatsAggregate with values from the runtime.
func (a *schedStatsAggregate) compute() {
// Lock the scheduler so the global run queue can't change and
// the number of Ps can't change. This doesn't prevent the
// local run queues from changing, so the results are still
// approximate.
lock(&sched.lock)
// The total count of threads owned by Go is the number of Ms
// minus extra Ms on the list or in use.
a.threads = uint64(mcount()) - uint64(extraMInUse.Load()) - uint64(extraMLength.Load())
// Collect running/runnable from per-P run queues.
a.gCreated += sched.goroutinesCreated.Load()
for _, p := range allp {
if p == nil || p.status == _Pdead {
break
}
a.gCreated += p.goroutinesCreated
switch p.status {
case _Prunning:
a.gRunning++
case _Psyscall:
a.gNonGo++
case _Pgcstop:
// The world is stopping or stopped.
// This is fine. The results will be
// slightly odd since nothing else
// is running, but it will be accurate.
}
for {
h := atomic.Load(&p.runqhead)
t := atomic.Load(&p.runqtail)
next := atomic.Loaduintptr((*uintptr)(&p.runnext))
runnable := int32(t - h)
if atomic.Load(&p.runqhead) != h || runnable < 0 {
continue
}
if next != 0 {
runnable++
}
a.gRunnable += uint64(runnable)
break
}
}
// Global run queue.
a.gRunnable += uint64(sched.runq.size)
// Account for Gs that are in _Gsyscall without a P in _Psyscall.
nGsyscallNoP := sched.nGsyscallNoP.Load()
// nGsyscallNoP can go negative during temporary races.
if nGsyscallNoP >= 0 {
a.gNonGo += uint64(nGsyscallNoP)
}
// Compute the number of blocked goroutines. We have to
// include system goroutines in this count because we included
// them above.
a.gTotal = uint64(gcount(true))
// Guard against underflow: the per-state counts are sampled approximately
// and may momentarily exceed the total.
if other := a.gRunning + a.gRunnable + a.gNonGo; a.gTotal > other {
a.gWaiting = a.gTotal - other
}
unlock(&sched.lock)
}
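A small worked example of how the waiting count is derived from the other sampled states (illustrative numbers, not from the diff):
```go
// With 12 total goroutines, 2 running, 3 runnable, and 1 in a syscall/cgo call:
gTotal, gRunning, gRunnable, gNonGo := uint64(12), uint64(2), uint64(3), uint64(1)
gWaiting := gTotal - (gRunning + gRunnable + gNonGo) // 12 - 6 = 6 waiting
_ = gWaiting
```
Because each input is sampled approximately and without stopping the world, the metric descriptions below warn that the per-state counts need not add up exactly to /sched/goroutines:goroutines.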
// nsToSec takes a duration in nanoseconds and converts it to seconds as // nsToSec takes a duration in nanoseconds and converts it to seconds as
// a float64. // a float64.
func nsToSec(ns int64) float64 { func nsToSec(ns int64) float64 {
@ -758,6 +885,7 @@ type statAggregate struct {
cpuStats cpuStatsAggregate cpuStats cpuStatsAggregate
gcStats gcStatsAggregate gcStats gcStatsAggregate
finalStats finalStatsAggregate finalStats finalStatsAggregate
schedStats schedStatsAggregate
} }
// ensure populates statistics aggregates determined by deps if they // ensure populates statistics aggregates determined by deps if they
@ -782,6 +910,8 @@ func (a *statAggregate) ensure(deps *statDepSet) {
a.gcStats.compute() a.gcStats.compute()
case finalStatsDep: case finalStatsDep:
a.finalStats.compute() a.finalStats.compute()
case schedStatsDep:
a.schedStats.compute()
} }
} }
a.ensured = a.ensured.union(missing) a.ensured = a.ensured.union(missing)

View file

@ -437,6 +437,32 @@ var allDesc = []Description{
Description: "The current runtime.GOMAXPROCS setting, or the number of operating system threads that can execute user-level Go code simultaneously.", Description: "The current runtime.GOMAXPROCS setting, or the number of operating system threads that can execute user-level Go code simultaneously.",
Kind: KindUint64, Kind: KindUint64,
}, },
{
Name: "/sched/goroutines-created:goroutines",
Description: "Count of goroutines created since program start.",
Cumulative: true,
Kind: KindUint64,
},
{
Name: "/sched/goroutines/not-in-go:goroutines",
Description: "Approximate count of goroutines running or blocked in a system call or cgo call. Not guaranteed to add up to /sched/goroutines:goroutines with other goroutine metrics.",
Kind: KindUint64,
},
{
Name: "/sched/goroutines/runnable:goroutines",
Description: "Approximate count of goroutines ready to execute, but not executing. Not guaranteed to add up to /sched/goroutines:goroutines with other goroutine metrics.",
Kind: KindUint64,
},
{
Name: "/sched/goroutines/running:goroutines",
Description: "Approximate count of goroutines executing. Always less than or equal to /sched/gomaxprocs:threads. Not guaranteed to add up to /sched/goroutines:goroutines with other goroutine metrics.",
Kind: KindUint64,
},
{
Name: "/sched/goroutines/waiting:goroutines",
Description: "Approximate count of goroutines waiting on a resource (I/O or sync primitives). Not guaranteed to add up to /sched/goroutines:goroutines with other goroutine metrics.",
Kind: KindUint64,
},
{ {
Name: "/sched/goroutines:goroutines", Name: "/sched/goroutines:goroutines",
Description: "Count of live goroutines.", Description: "Count of live goroutines.",
@ -472,6 +498,11 @@ var allDesc = []Description{
Kind: KindFloat64Histogram, Kind: KindFloat64Histogram,
Cumulative: true, Cumulative: true,
}, },
{
Name: "/sched/threads/total:threads",
Description: "The current count of live threads that are owned by the Go runtime.",
Kind: KindUint64,
},
{ {
Name: "/sync/mutex/wait/total:seconds", Name: "/sync/mutex/wait/total:seconds",
Description: "Approximate cumulative time goroutines have spent blocked on a sync.Mutex, sync.RWMutex, or runtime-internal lock. This metric is useful for identifying global changes in lock contention. Collect a mutex or block profile using the runtime/pprof package for more detailed contention data.", Description: "Approximate cumulative time goroutines have spent blocked on a sync.Mutex, sync.RWMutex, or runtime-internal lock. This metric is useful for identifying global changes in lock contention. Collect a mutex or block profile using the runtime/pprof package for more detailed contention data.",

View file

@ -509,6 +509,29 @@ Below is the full list of supported metrics, ordered lexicographically.
operating system threads that can execute user-level Go code operating system threads that can execute user-level Go code
simultaneously. simultaneously.
/sched/goroutines-created:goroutines
Count of goroutines created since program start.
/sched/goroutines/not-in-go:goroutines
Approximate count of goroutines running or blocked in
a system call or cgo call. Not guaranteed to add up to
/sched/goroutines:goroutines with other goroutine metrics.
/sched/goroutines/runnable:goroutines
Approximate count of goroutines ready to execute,
but not executing. Not guaranteed to add up to
/sched/goroutines:goroutines with other goroutine metrics.
/sched/goroutines/running:goroutines
Approximate count of goroutines executing. Always less than or
equal to /sched/gomaxprocs:threads. Not guaranteed to add up to
/sched/goroutines:goroutines with other goroutine metrics.
/sched/goroutines/waiting:goroutines
Approximate count of goroutines waiting on a resource
(I/O or sync primitives). Not guaranteed to add up to
/sched/goroutines:goroutines with other goroutine metrics.
/sched/goroutines:goroutines /sched/goroutines:goroutines
Count of live goroutines. Count of live goroutines.
@ -549,6 +572,10 @@ Below is the full list of supported metrics, ordered lexicographically.
/sched/pauses/stopping/other:seconds). Bucket counts increase /sched/pauses/stopping/other:seconds). Bucket counts increase
monotonically. monotonically.
/sched/threads/total:threads
The current count of live threads that are owned by the Go
runtime.
/sync/mutex/wait/total:seconds /sync/mutex/wait/total:seconds
Approximate cumulative time goroutines have spent blocked on a Approximate cumulative time goroutines have spent blocked on a
sync.Mutex, sync.RWMutex, or runtime-internal lock. This metric sync.Mutex, sync.RWMutex, or runtime-internal lock. This metric

View file

@ -22,6 +22,7 @@ import (
"strings" "strings"
"sync" "sync"
"sync/atomic" "sync/atomic"
"syscall"
"testing" "testing"
"time" "time"
"unsafe" "unsafe"
@ -1575,3 +1576,219 @@ func TestReadMetricsFinalizers(t *testing.T) {
t.Errorf("expected %s difference to be exactly %d, got %d -> %d", before[1].Name, N, v0, v1) t.Errorf("expected %s difference to be exactly %d, got %d -> %d", before[1].Name, N, v0, v1)
} }
} }
func TestReadMetricsSched(t *testing.T) {
const (
notInGo = iota
runnable
running
waiting
created
threads
numSamples
)
var s [numSamples]metrics.Sample
s[notInGo].Name = "/sched/goroutines/not-in-go:goroutines"
s[runnable].Name = "/sched/goroutines/runnable:goroutines"
s[running].Name = "/sched/goroutines/running:goroutines"
s[waiting].Name = "/sched/goroutines/waiting:goroutines"
s[created].Name = "/sched/goroutines-created:goroutines"
s[threads].Name = "/sched/threads/total:threads"
logMetrics := func(t *testing.T, s []metrics.Sample) {
for i := range s {
t.Logf("%s: %d", s[i].Name, s[i].Value.Uint64())
}
}
// generalSlack is the amount of goroutines we allow ourselves to be
// off by in any given category, either due to background system
// goroutines or testing package goroutines.
const generalSlack = 4
// waitingSlack is the max number of blocked goroutines left
// from other tests, the testing package, or system
// goroutines.
const waitingSlack = 100
// threadsSlack is the maximum number of threads left over
// from other tests and the runtime (sysmon, the template thread, etc.)
const threadsSlack = 20
// Make sure GC isn't running, since GC workers interfere with
// expected counts.
defer debug.SetGCPercent(debug.SetGCPercent(-1))
runtime.GC()
check := func(t *testing.T, s *metrics.Sample, min, max uint64) {
val := s.Value.Uint64()
if val < min {
t.Errorf("%s too low; %d < %d", s.Name, val, min)
}
if val > max {
t.Errorf("%s too high; %d > %d", s.Name, val, max)
}
}
checkEq := func(t *testing.T, s *metrics.Sample, value uint64) {
check(t, s, value, value)
}
spinUntil := func(f func() bool, timeout time.Duration) bool {
start := time.Now()
for time.Since(start) < timeout {
if f() {
return true
}
time.Sleep(time.Millisecond)
}
return false
}
// Check base values.
t.Run("base", func(t *testing.T) {
defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(1))
metrics.Read(s[:])
logMetrics(t, s[:])
check(t, &s[notInGo], 0, generalSlack)
check(t, &s[runnable], 0, generalSlack)
checkEq(t, &s[running], 1)
check(t, &s[waiting], 0, waitingSlack)
})
metrics.Read(s[:])
createdAfterBase := s[created].Value.Uint64()
// Force Running count to be high. We'll use these goroutines
// for Runnable, too.
const count = 10
var ready, exit atomic.Uint32
for i := 0; i < count-1; i++ {
go func() {
ready.Add(1)
for exit.Load() == 0 {
// Spin to get us and keep us running, but check
// the exit condition so we exit out early if we're
// done.
start := time.Now()
for time.Since(start) < 10*time.Millisecond && exit.Load() == 0 {
}
runtime.Gosched()
}
}()
}
for ready.Load() < count-1 {
runtime.Gosched()
}
// Be careful. We've entered a dangerous state for platforms
// that do not return back to the underlying system unless all
// goroutines are blocked, like js/wasm, since we have a bunch
// of runnable goroutines all spinning. We cannot write anything
// out.
if testenv.HasParallelism() {
t.Run("created", func(t *testing.T) {
metrics.Read(s[:])
logMetrics(t, s[:])
checkEq(t, &s[created], createdAfterBase+count)
})
t.Run("running", func(t *testing.T) {
defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(count + 4))
// It can take a little bit for the scheduler to
// distribute the goroutines to Ps, so retry for a
// while.
spinUntil(func() bool {
metrics.Read(s[:])
return s[running].Value.Uint64() >= count
}, time.Second)
logMetrics(t, s[:])
check(t, &s[running], count, count+4)
check(t, &s[threads], count, count+4+threadsSlack)
})
// Force runnable count to be high.
t.Run("runnable", func(t *testing.T) {
defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(1))
metrics.Read(s[:])
logMetrics(t, s[:])
checkEq(t, &s[running], 1)
check(t, &s[runnable], count-1, count+generalSlack)
})
// Done with the running/runnable goroutines.
exit.Store(1)
} else {
// Read metrics and then exit all the other goroutines,
// so that system calls may proceed.
metrics.Read(s[:])
// Done with the running/runnable goroutines.
exit.Store(1)
// Now we can check our invariants.
t.Run("created", func(t *testing.T) {
// Look for count-1 goroutines because we read metrics
// *before* the t.Run goroutine was created for this sub-test.
checkEq(t, &s[created], createdAfterBase+count-1)
})
t.Run("running", func(t *testing.T) {
logMetrics(t, s[:])
checkEq(t, &s[running], 1)
checkEq(t, &s[threads], 1)
})
t.Run("runnable", func(t *testing.T) {
logMetrics(t, s[:])
check(t, &s[runnable], count-1, count+generalSlack)
})
}
// Force not-in-go count to be high. This is a little tricky since
// we try really hard not to let things block in system calls.
// We have to drop to the syscall package to do this reliably.
t.Run("not-in-go", func(t *testing.T) {
// Block a bunch of goroutines on an OS pipe.
pr, pw, err := pipe()
if err != nil {
switch runtime.GOOS {
case "js", "wasip1":
t.Skip("creating pipe:", err)
}
t.Fatal("creating pipe:", err)
}
for i := 0; i < count; i++ {
go syscall.Read(pr, make([]byte, 1))
}
// Let the goroutines block.
spinUntil(func() bool {
metrics.Read(s[:])
return s[notInGo].Value.Uint64() >= count
}, time.Second)
metrics.Read(s[:])
logMetrics(t, s[:])
check(t, &s[notInGo], count, count+generalSlack)
syscall.Close(pw)
syscall.Close(pr)
})
t.Run("waiting", func(t *testing.T) {
// Force waiting count to be high.
const waitingCount = 1000
stop := make(chan bool)
for i := 0; i < waitingCount; i++ {
go func() { <-stop }()
}
// Let the goroutines block.
spinUntil(func() bool {
metrics.Read(s[:])
return s[waiting].Value.Uint64() >= waitingCount
}, time.Second)
metrics.Read(s[:])
logMetrics(t, s[:])
check(t, &s[waiting], waitingCount, waitingCount+waitingSlack)
close(stop)
})
}
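Outside the runtime tests, the new metrics are sampled like any other runtime/metrics entry; a minimal sketch using the names added in this change:
```go
package main

import (
	"fmt"
	"runtime/metrics"
)

func main() {
	samples := []metrics.Sample{
		{Name: "/sched/goroutines/running:goroutines"},
		{Name: "/sched/goroutines/runnable:goroutines"},
		{Name: "/sched/goroutines/waiting:goroutines"},
		{Name: "/sched/goroutines/not-in-go:goroutines"},
		{Name: "/sched/goroutines-created:goroutines"},
		{Name: "/sched/threads/total:threads"},
	}
	metrics.Read(samples)
	for _, s := range samples {
		fmt.Printf("%-45s %d\n", s.Name, s.Value.Uint64())
	}
}
```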

View file

@ -1521,18 +1521,15 @@ func gcBgMarkWorker(ready chan struct{}) {
} }
systemstack(func() { systemstack(func() {
// Mark our goroutine preemptible so its stack // Mark our goroutine preemptible so its stack can be scanned or observed
// can be scanned or observed by the execution // by the execution tracer. This, for example, lets two mark workers scan
// tracer. This, for example, lets two mark workers // each other (otherwise, they would deadlock).
// scan each other (otherwise, they would
// deadlock). We must not modify anything on
// the G stack. However, stack shrinking is
// disabled for mark workers, so it is safe to
// read from the G stack.
// //
// N.B. The execution tracer is not aware of this status // casGToWaitingForSuspendG marks the goroutine as ineligible for a
// transition and handles it specially based on the // stack shrink, effectively pinning the stack in memory for the duration.
// wait reason. //
// N.B. The execution tracer is not aware of this status transition and
// handles it specially based on the wait reason.
casGToWaitingForSuspendG(gp, _Grunning, waitReasonGCWorkerActive) casGToWaitingForSuspendG(gp, _Grunning, waitReasonGCWorkerActive)
switch pp.gcMarkWorkerMode { switch pp.gcMarkWorkerMode {
default: default:

View file

@ -32,10 +32,8 @@ import (
) )
func main() { func main() {
gen("amd64", notags, zeroAMD64, copyAMD64)
gen("386", notags, zero386, copy386) gen("386", notags, zero386, copy386)
gen("arm", notags, zeroARM, copyARM) gen("arm", notags, zeroARM, copyARM)
gen("arm64", notags, zeroARM64, copyARM64)
gen("loong64", notags, zeroLOONG64, copyLOONG64) gen("loong64", notags, zeroLOONG64, copyLOONG64)
gen("ppc64x", tagsPPC64x, zeroPPC64x, copyPPC64x) gen("ppc64x", tagsPPC64x, zeroPPC64x, copyPPC64x)
gen("mips64x", tagsMIPS64x, zeroMIPS64x, copyMIPS64x) gen("mips64x", tagsMIPS64x, zeroMIPS64x, copyMIPS64x)

View file

@ -610,10 +610,30 @@ func genLoong64(g *gen) {
l.add(movf, reg, regsize) l.add(movf, reg, regsize)
} }
// save/restore FCC0 // Add condition flag register fcc0-fcc7
sv := ""
rs := ""
last := 7
for i := 0; i <= last; i++ {
msb := 7 + (i * 8)
lsb := 0 + (i * 8)
// MOVV FCCx, R4,
// BSTRINSV $msb, R4, $lsb, R5
sv += fmt.Sprintf("%s FCC%d, R4\n", mov, i)
sv += fmt.Sprintf("BSTRINSV $%d, R4, $%d, R5\n", msb, lsb)
// BSTRPICKV $msb, R5, $lsb, R4
// MOVV R4, FCCx
rs += fmt.Sprintf("BSTRPICKV $%d, R5, $%d, R4\n", msb, lsb)
rs += fmt.Sprintf("%s R4, FCC%d", mov, i)
if i != last {
rs += fmt.Sprintf("\n")
}
}
l.addSpecial( l.addSpecial(
mov+" FCC0, R4\n"+mov+" R4, %d(R3)", sv+mov+" R5, %d(R3)",
mov+" %d(R3), R4\n"+mov+" R4, FCC0", mov+" %d(R3), R5\n"+rs,
regsize) regsize)
// allocate frame, save PC of interrupted instruction (in LR) // allocate frame, save PC of interrupted instruction (in LR)
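The generated assembly packs the eight FCC condition-flag registers into a single 64-bit word on save and unpacks it on restore; a hedged Go sketch of the equivalent bit manipulation (the real work is done by the BSTRINSV/BSTRPICKV instructions emitted above):
```go
// Bits [8i+7:8i] of the saved word hold FCCi.
func packFCC(fcc [8]uint8) (word uint64) {
	for i, v := range fcc {
		word |= uint64(v) << (8 * uint(i)) // BSTRINSV $8i+7, Rv, $8i, Rword
	}
	return word
}

func unpackFCC(word uint64) (fcc [8]uint8) {
	for i := range fcc {
		fcc[i] = uint8(word >> (8 * uint(i))) // BSTRPICKV $8i+7, Rword, $8i, Rv
	}
	return fcc
}
```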

View file

@ -1308,7 +1308,7 @@ func goroutineProfileWithLabelsConcurrent(p []profilerecord.StackRecord, labels
// allocation estimate without bothering to STW. As long as // allocation estimate without bothering to STW. As long as
// this is close, then we'll only need to STW once (on the next // this is close, then we'll only need to STW once (on the next
// call). // call).
return int(gcount()), false return int(gcount(false)), false
} }
semacquire(&goroutineProfile.sema) semacquire(&goroutineProfile.sema)
@ -1324,7 +1324,7 @@ func goroutineProfileWithLabelsConcurrent(p []profilerecord.StackRecord, labels
// goroutines that can vary between user and system to ensure that the count // goroutines that can vary between user and system to ensure that the count
// doesn't change during the collection. So, check the finalizer goroutine // doesn't change during the collection. So, check the finalizer goroutine
// and cleanup goroutines in particular. // and cleanup goroutines in particular.
n = int(gcount()) n = int(gcount(false))
if fingStatus.Load()&fingRunningFinalizer != 0 { if fingStatus.Load()&fingRunningFinalizer != 0 {
n++ n++
} }

View file

@ -0,0 +1,15 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !windows
package runtime_test
import "syscall"
func pipe() (r, w int, err error) {
var p [2]int
err = syscall.Pipe(p[:])
return p[0], p[1], err
}

View file

@ -0,0 +1,13 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package runtime_test
import "syscall"
func pipe() (r, w syscall.Handle, err error) {
var p [2]syscall.Handle
err = syscall.Pipe(p[:])
return p[0], p[1], err
}

View file

@ -65,10 +65,40 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
MOVD F30, 456(R3) MOVD F30, 456(R3)
MOVD F31, 464(R3) MOVD F31, 464(R3)
MOVV FCC0, R4 MOVV FCC0, R4
MOVV R4, 472(R3) BSTRINSV $7, R4, $0, R5
MOVV FCC1, R4
BSTRINSV $15, R4, $8, R5
MOVV FCC2, R4
BSTRINSV $23, R4, $16, R5
MOVV FCC3, R4
BSTRINSV $31, R4, $24, R5
MOVV FCC4, R4
BSTRINSV $39, R4, $32, R5
MOVV FCC5, R4
BSTRINSV $47, R4, $40, R5
MOVV FCC6, R4
BSTRINSV $55, R4, $48, R5
MOVV FCC7, R4
BSTRINSV $63, R4, $56, R5
MOVV R5, 472(R3)
CALL ·asyncPreempt2(SB) CALL ·asyncPreempt2(SB)
MOVV 472(R3), R4 MOVV 472(R3), R5
BSTRPICKV $7, R5, $0, R4
MOVV R4, FCC0 MOVV R4, FCC0
BSTRPICKV $15, R5, $8, R4
MOVV R4, FCC1
BSTRPICKV $23, R5, $16, R4
MOVV R4, FCC2
BSTRPICKV $31, R5, $24, R4
MOVV R4, FCC3
BSTRPICKV $39, R5, $32, R4
MOVV R4, FCC4
BSTRPICKV $47, R5, $40, R4
MOVV R4, FCC5
BSTRPICKV $55, R5, $48, R4
MOVV R4, FCC6
BSTRPICKV $63, R5, $56, R4
MOVV R4, FCC7
MOVD 464(R3), F31 MOVD 464(R3), F31
MOVD 456(R3), F30 MOVD 456(R3), F30
MOVD 448(R3), F29 MOVD 448(R3), F29

View file

@ -1007,7 +1007,7 @@ func mcommoninit(mp *m, id int64) {
// when it is just in a register or thread-local storage. // when it is just in a register or thread-local storage.
mp.alllink = allm mp.alllink = allm
// NumCgoCall() and others iterate over allm w/o schedlock, // NumCgoCall and others iterate over allm w/o schedlock,
// so we need to publish it safely. // so we need to publish it safely.
atomicstorep(unsafe.Pointer(&allm), unsafe.Pointer(mp)) atomicstorep(unsafe.Pointer(&allm), unsafe.Pointer(mp))
unlock(&sched.lock) unlock(&sched.lock)
@ -1372,6 +1372,9 @@ func casGToWaiting(gp *g, old uint32, reason waitReason) {
// casGToWaitingForSuspendG transitions gp from old to _Gwaiting, and sets the wait reason. // casGToWaitingForSuspendG transitions gp from old to _Gwaiting, and sets the wait reason.
// The wait reason must be a valid isWaitingForSuspendG wait reason. // The wait reason must be a valid isWaitingForSuspendG wait reason.
// //
// While a goroutine is in this state, its stack is effectively pinned.
// The garbage collector must not shrink or otherwise mutate the goroutine's stack.
//
// Use this over casgstatus when possible to ensure that a waitreason is set. // Use this over casgstatus when possible to ensure that a waitreason is set.
func casGToWaitingForSuspendG(gp *g, old uint32, reason waitReason) { func casGToWaitingForSuspendG(gp *g, old uint32, reason waitReason) {
if !reason.isWaitingForSuspendG() { if !reason.isWaitingForSuspendG() {
@ -1608,18 +1611,11 @@ func stopTheWorldWithSema(reason stwReason) worldStop {
// stack while we try to stop the world since otherwise we could get // stack while we try to stop the world since otherwise we could get
// in a mutual preemption deadlock. // in a mutual preemption deadlock.
// //
// We must not modify anything on the G stack because a stack shrink // casGToWaitingForSuspendG marks the goroutine as ineligible for a
// may occur, now that we switched to _Gwaiting, specifically if we're // stack shrink, effectively pinning the stack in memory for the duration.
// doing this during the mark phase (mark termination excepted, since
// we know that stack scanning is done by that point). A stack shrink
// is otherwise OK though because in order to return from this function
// (and to leave the system stack) we must have preempted all
// goroutines, including any attempting to scan our stack, in which
// case, any stack shrinking will have already completed by the time we
// exit.
// //
// N.B. The execution tracer is not aware of this status transition and // N.B. The execution tracer is not aware of this status transition and
// andles it specially based on the wait reason. // handles it specially based on the wait reason.
casGToWaitingForSuspendG(getg().m.curg, _Grunning, waitReasonStoppingTheWorld) casGToWaitingForSuspendG(getg().m.curg, _Grunning, waitReasonStoppingTheWorld)
trace := traceAcquire() trace := traceAcquire()
@ -1652,6 +1648,7 @@ func stopTheWorldWithSema(reason stwReason) worldStop {
if trace.ok() { if trace.ok() {
trace.ProcSteal(pp, false) trace.ProcSteal(pp, false)
} }
sched.nGsyscallNoP.Add(1)
pp.syscalltick++ pp.syscalltick++
pp.gcStopTime = nanotime() pp.gcStopTime = nanotime()
sched.stopwait-- sched.stopwait--
@ -2106,16 +2103,11 @@ func forEachP(reason waitReason, fn func(*p)) {
// deadlock as we attempt to preempt a goroutine that's trying // deadlock as we attempt to preempt a goroutine that's trying
// to preempt us (e.g. for a stack scan). // to preempt us (e.g. for a stack scan).
// //
// We must not modify anything on the G stack because a stack shrink // casGToWaitingForSuspendG marks the goroutine as ineligible for a
// may occur. A stack shrink is otherwise OK though because in order // stack shrink, effectively pinning the stack in memory for the duration.
// to return from this function (and to leave the system stack) we
// must have preempted all goroutines, including any attempting
// to scan our stack, in which case, any stack shrinking will
// have already completed by the time we exit.
// //
// N.B. The execution tracer is not aware of this status // N.B. The execution tracer is not aware of this status transition and
// transition and handles it specially based on the // handles it specially based on the wait reason.
// wait reason.
casGToWaitingForSuspendG(gp, _Grunning, reason) casGToWaitingForSuspendG(gp, _Grunning, reason)
forEachPInternal(fn) forEachPInternal(fn)
casgstatus(gp, _Gwaiting, _Grunning) casgstatus(gp, _Gwaiting, _Grunning)
@ -2183,6 +2175,7 @@ func forEachPInternal(fn func(*p)) {
trace.ProcSteal(p2, false) trace.ProcSteal(p2, false)
traceRelease(trace) traceRelease(trace)
} }
sched.nGsyscallNoP.Add(1)
p2.syscalltick++ p2.syscalltick++
handoffp(p2) handoffp(p2)
} else if trace.ok() { } else if trace.ok() {
@ -2456,6 +2449,7 @@ func needm(signal bool) {
// mp.curg is now a real goroutine. // mp.curg is now a real goroutine.
casgstatus(mp.curg, _Gdead, _Gsyscall) casgstatus(mp.curg, _Gdead, _Gsyscall)
sched.ngsys.Add(-1) sched.ngsys.Add(-1)
sched.nGsyscallNoP.Add(1)
if !signal { if !signal {
if trace.ok() { if trace.ok() {
@ -2591,6 +2585,7 @@ func dropm() {
casgstatus(mp.curg, _Gsyscall, _Gdead) casgstatus(mp.curg, _Gsyscall, _Gdead)
mp.curg.preemptStop = false mp.curg.preemptStop = false
sched.ngsys.Add(1) sched.ngsys.Add(1)
sched.nGsyscallNoP.Add(-1)
if !mp.isExtraInSig { if !mp.isExtraInSig {
if trace.ok() { if trace.ok() {
@ -4684,6 +4679,7 @@ func entersyscall_gcwait() {
trace.ProcSteal(pp, true) trace.ProcSteal(pp, true)
traceRelease(trace) traceRelease(trace)
} }
sched.nGsyscallNoP.Add(1)
pp.gcStopTime = nanotime() pp.gcStopTime = nanotime()
pp.syscalltick++ pp.syscalltick++
if sched.stopwait--; sched.stopwait == 0 { if sched.stopwait--; sched.stopwait == 0 {
@ -4716,6 +4712,8 @@ func entersyscallblock() {
gp.m.syscalltick = gp.m.p.ptr().syscalltick gp.m.syscalltick = gp.m.p.ptr().syscalltick
gp.m.p.ptr().syscalltick++ gp.m.p.ptr().syscalltick++
sched.nGsyscallNoP.Add(1)
// Leave SP around for GC and traceback. // Leave SP around for GC and traceback.
pc := sys.GetCallerPC() pc := sys.GetCallerPC()
sp := sys.GetCallerSP() sp := sys.GetCallerSP()
@ -4936,6 +4934,7 @@ func exitsyscallfast_pidle() bool {
} }
unlock(&sched.lock) unlock(&sched.lock)
if pp != nil { if pp != nil {
sched.nGsyscallNoP.Add(-1)
acquirep(pp) acquirep(pp)
return true return true
} }
@ -4962,6 +4961,7 @@ func exitsyscall0(gp *g) {
trace.GoSysExit(true) trace.GoSysExit(true)
traceRelease(trace) traceRelease(trace)
} }
sched.nGsyscallNoP.Add(-1)
dropg() dropg()
lock(&sched.lock) lock(&sched.lock)
var pp *p var pp *p
@ -5262,6 +5262,7 @@ func newproc1(fn *funcval, callergp *g, callerpc uintptr, parked bool, waitreaso
racereleasemergeg(newg, unsafe.Pointer(&labelSync)) racereleasemergeg(newg, unsafe.Pointer(&labelSync))
} }
} }
pp.goroutinesCreated++
releasem(mp) releasem(mp)
return newg return newg
@ -5537,8 +5538,11 @@ func badunlockosthread() {
throw("runtime: internal error: misuse of lockOSThread/unlockOSThread") throw("runtime: internal error: misuse of lockOSThread/unlockOSThread")
} }
func gcount() int32 { func gcount(includeSys bool) int32 {
n := int32(atomic.Loaduintptr(&allglen)) - sched.gFree.stack.size - sched.gFree.noStack.size - sched.ngsys.Load() n := int32(atomic.Loaduintptr(&allglen)) - sched.gFree.stack.size - sched.gFree.noStack.size
if !includeSys {
n -= sched.ngsys.Load()
}
for _, pp := range allp { for _, pp := range allp {
n -= pp.gFree.size n -= pp.gFree.size
} }
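
Editor's note, not part of the diff: gcount now takes an includeSys flag so callers can choose whether system goroutines are counted. A small runnable sketch of the user-visible side, assuming (as in the current implementation) that the exported runtime.NumGoroutine reports the count without system goroutines:
```
package main

import (
	"fmt"
	"runtime"
	"sync"
)

func main() {
	before := runtime.NumGoroutine()

	var wg sync.WaitGroup
	stop := make(chan struct{})
	for i := 0; i < 10; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			<-stop // park until released
		}()
	}

	fmt.Printf("goroutines: %d -> %d\n", before, runtime.NumGoroutine())
	close(stop)
	wg.Wait()
}
```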
@ -5838,6 +5842,8 @@ func (pp *p) destroy() {
pp.gcAssistTime = 0 pp.gcAssistTime = 0
gcCleanups.queued += pp.cleanupsQueued gcCleanups.queued += pp.cleanupsQueued
pp.cleanupsQueued = 0 pp.cleanupsQueued = 0
sched.goroutinesCreated.Add(int64(pp.goroutinesCreated))
pp.goroutinesCreated = 0
pp.xRegs.free() pp.xRegs.free()
pp.status = _Pdead pp.status = _Pdead
} }
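
Editor's note, not part of the diff: the destroy path above flushes the per-P goroutinesCreated count into a global atomic, the usual sharded-counter pattern. A self-contained toy version of that pattern (the type and function names are invented):
```
// Toy sketch (not runtime code) of the sharded-counter pattern used for
// goroutinesCreated: each shard counts locally without atomics, and the
// local value is flushed into a global atomic when the shard is retired.
package main

import (
	"fmt"
	"sync/atomic"
)

type shard struct {
	created uint64 // owner-only counter, like p.goroutinesCreated
}

var totalCreated atomic.Uint64 // like sched.goroutinesCreated

// destroy flushes the shard's local count into the global total,
// mirroring what (*p).destroy does above.
func (s *shard) destroy() {
	totalCreated.Add(s.created)
	s.created = 0
}

// grandTotal is the flushed total plus whatever live shards still hold,
// mirroring the comment on sched.goroutinesCreated.
func grandTotal(live []*shard) uint64 {
	n := totalCreated.Load()
	for _, s := range live {
		n += s.created
	}
	return n
}

func main() {
	a, b := &shard{created: 3}, &shard{created: 5}
	fmt.Println(grandTotal([]*shard{a, b})) // 8
	a.destroy()
	fmt.Println(grandTotal([]*shard{b})) // still 8
}
```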
@ -6413,6 +6419,7 @@ func retake(now int64) uint32 {
trace.ProcSteal(pp, false) trace.ProcSteal(pp, false)
traceRelease(trace) traceRelease(trace)
} }
sched.nGsyscallNoP.Add(1)
n++ n++
pp.syscalltick++ pp.syscalltick++
handoffp(pp) handoffp(pp)

View file

@ -764,6 +764,9 @@ type p struct {
// gcStopTime is the nanotime timestamp that this P last entered _Pgcstop. // gcStopTime is the nanotime timestamp that this P last entered _Pgcstop.
gcStopTime int64 gcStopTime int64
// goroutinesCreated is the total count of goroutines created by this P.
goroutinesCreated uint64
// xRegs is the per-P extended register state used by asynchronous // xRegs is the per-P extended register state used by asynchronous
// preemption. This is an empty struct on platforms that don't use extended // preemption. This is an empty struct on platforms that don't use extended
// register state. // register state.
@ -792,7 +795,8 @@ type schedt struct {
nmsys int32 // number of system m's not counted for deadlock nmsys int32 // number of system m's not counted for deadlock
nmfreed int64 // cumulative number of freed m's nmfreed int64 // cumulative number of freed m's
ngsys atomic.Int32 // number of system goroutines ngsys atomic.Int32 // number of system goroutines
nGsyscallNoP atomic.Int32 // number of goroutines in syscalls without a P
pidle puintptr // idle p's pidle puintptr // idle p's
npidle atomic.Int32 npidle atomic.Int32
@ -891,6 +895,10 @@ type schedt struct {
// M, but waiting for locks within the runtime. This field stores the value // M, but waiting for locks within the runtime. This field stores the value
// for Ms that have exited. // for Ms that have exited.
totalRuntimeLockWaitTime atomic.Int64 totalRuntimeLockWaitTime atomic.Int64
// goroutinesCreated (plus the value of goroutinesCreated on each P in allp)
// is the sum of all goroutines created by the program.
goroutinesCreated atomic.Uint64
} }
// Values for the flags field of a sigTabT. // Values for the flags field of a sigTabT.
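
Editor's note, not part of the diff: counters like sched.goroutinesCreated typically surface to users through runtime/metrics. A sketch that reads whatever scheduler metrics the running toolchain actually exports, discovering names via metrics.All() rather than assuming the exact strings added here:
```
package main

import (
	"fmt"
	"runtime/metrics"
	"strings"
)

func main() {
	// Discover metric names at runtime rather than hardcoding them, since
	// the exact names exported for these counters may differ by release.
	var samples []metrics.Sample
	for _, d := range metrics.All() {
		if strings.HasPrefix(d.Name, "/sched/") {
			samples = append(samples, metrics.Sample{Name: d.Name})
		}
	}
	metrics.Read(samples)
	for _, s := range samples {
		switch s.Value.Kind() {
		case metrics.KindUint64:
			fmt.Printf("%s = %d\n", s.Name, s.Value.Uint64())
		case metrics.KindFloat64:
			fmt.Printf("%s = %f\n", s.Name, s.Value.Float64())
		default:
			fmt.Printf("%s = (kind %v)\n", s.Name, s.Value.Kind())
		}
	}
}
```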
@ -1217,7 +1225,9 @@ var isIdleInSynctest = [len(waitReasonStrings)]bool{
} }
var ( var (
allm *m // Linked-list of all Ms. Written under sched.lock, read atomically.
allm *m
gomaxprocs int32 gomaxprocs int32
numCPUStartup int32 numCPUStartup int32
forcegc forcegcstate forcegc forcegcstate

View file

@ -178,7 +178,7 @@ func selectgo(cas0 *scase, order0 *uint16, pc0 *uintptr, nsends, nrecvs int, blo
if cas.c.bubble != nil { if cas.c.bubble != nil {
if getg().bubble != cas.c.bubble { if getg().bubble != cas.c.bubble {
panic(plainError("select on synctest channel from outside bubble")) fatal("select on synctest channel from outside bubble")
} }
} else { } else {
allSynctest = false allSynctest = false
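
Editor's note, not part of the diff: this change upgrades a cross-bubble select from a recoverable panic to a fatal error. A sketch of the boundary being enforced, assuming the testing/synctest API (Test and Wait) of recent Go releases:
```
// Sketch of the rule: channels created inside a testing/synctest bubble
// must only be used from inside that bubble.
package example

import (
	"testing"
	"testing/synctest"
)

func TestBubbleChannel(t *testing.T) {
	synctest.Test(t, func(t *testing.T) {
		ch := make(chan int, 1) // channel belongs to this bubble

		// Fine: send and receive from goroutines inside the same bubble.
		go func() { ch <- 42 }()
		synctest.Wait() // wait for the bubble's goroutines to block or finish
		if got := <-ch; got != 42 {
			t.Fatalf("got %d, want 42", got)
		}

		// Selecting on ch from a goroutine outside the bubble used to be a
		// recoverable panic; with this change it is a fatal runtime error
		// ("select on synctest channel from outside bubble").
	})
}
```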

View file

@ -1214,15 +1214,18 @@ func isShrinkStackSafe(gp *g) bool {
if gp.parkingOnChan.Load() { if gp.parkingOnChan.Load() {
return false return false
} }
// We also can't copy the stack while tracing is enabled, and // We also can't copy the stack while a gp is in _Gwaiting solely
// gp is in _Gwaiting solely to make itself available to suspendG. // to make itself available to suspendG.
//
// In these cases, the G is actually executing on the system // In these cases, the G is actually executing on the system
// stack, and the execution tracer may want to take a stack trace // stack, and the execution tracer, mutex profiler, etc. may want
// of the G's stack. Note: it's safe to access gp.waitreason here. // to take a stack trace of the G's stack.
// We're only checking if this is true if we took ownership of the //
// Note: it's safe to access gp.waitreason here.
// We're only calling isShrinkStackSafe if we took ownership of the
// G with the _Gscan bit. This prevents the goroutine from transitioning, // G with the _Gscan bit. This prevents the goroutine from transitioning,
// which prevents gp.waitreason from changing. // which prevents gp.waitreason from changing.
if traceEnabled() && readgstatus(gp)&^_Gscan == _Gwaiting && gp.waitreason.isWaitingForSuspendG() { if readgstatus(gp)&^_Gscan == _Gwaiting && gp.waitreason.isWaitingForSuspendG() {
return false return false
} }
return true return true
@ -1258,12 +1261,6 @@ func shrinkstack(gp *g) {
if debug.gcshrinkstackoff > 0 { if debug.gcshrinkstackoff > 0 {
return return
} }
f := findfunc(gp.startpc)
if f.valid() && f.funcID == abi.FuncID_gcBgMarkWorker {
// We're not allowed to shrink the gcBgMarkWorker
// stack (see gcBgMarkWorker for explanation).
return
}
oldsize := gp.stack.hi - gp.stack.lo oldsize := gp.stack.hi - gp.stack.lo
newsize := oldsize / 2 newsize := oldsize / 2

View file

@ -415,7 +415,7 @@ func newTimer(when, period int64, f func(arg any, seq uintptr, delay int64), arg
//go:linkname stopTimer time.stopTimer //go:linkname stopTimer time.stopTimer
func stopTimer(t *timeTimer) bool { func stopTimer(t *timeTimer) bool {
if t.isFake && getg().bubble == nil { if t.isFake && getg().bubble == nil {
panic("stop of synctest timer from outside bubble") fatal("stop of synctest timer from outside bubble")
} }
return t.stop() return t.stop()
} }
@ -430,7 +430,7 @@ func resetTimer(t *timeTimer, when, period int64) bool {
racerelease(unsafe.Pointer(&t.timer)) racerelease(unsafe.Pointer(&t.timer))
} }
if t.isFake && getg().bubble == nil { if t.isFake && getg().bubble == nil {
panic("reset of synctest timer from outside bubble") fatal("reset of synctest timer from outside bubble")
} }
return t.reset(when, period) return t.reset(when, period)
} }

View file

@ -754,24 +754,7 @@ func traceRegisterLabelsAndReasons(gen uintptr) {
// was on has been returned, ReadTrace returns nil. The caller must copy the // was on has been returned, ReadTrace returns nil. The caller must copy the
// returned data before calling ReadTrace again. // returned data before calling ReadTrace again.
// ReadTrace must be called from one goroutine at a time. // ReadTrace must be called from one goroutine at a time.
func ReadTrace() []byte { func ReadTrace() (buf []byte) {
for {
buf := readTrace()
// Skip over the end-of-generation signal which must not appear
// in the final trace.
if len(buf) == 1 && tracev2.EventType(buf[0]) == tracev2.EvEndOfGeneration {
continue
}
return buf
}
}
// readTrace is the implementation of ReadTrace, except with an additional
// in-band signal as to when the buffer is for a new generation.
//
//go:linkname readTrace runtime/trace.runtime_readTrace
func readTrace() (buf []byte) {
top: top:
var park bool var park bool
systemstack(func() { systemstack(func() {
@ -842,7 +825,7 @@ func readTrace0() (buf []byte, park bool) {
if !trace.headerWritten { if !trace.headerWritten {
trace.headerWritten = true trace.headerWritten = true
unlock(&trace.lock) unlock(&trace.lock)
return []byte("go 1.25 trace\x00\x00\x00"), false return []byte("go 1.26 trace\x00\x00\x00"), false
} }
// Read the next buffer. // Read the next buffer.
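
Editor's note, not part of the diff: ReadTrace is the low-level producer behind the public tracing entry points; runtime/trace.Start streams these bytes, beginning with the version header emitted above, to the caller's writer. A minimal end-to-end usage sketch:
```
package main

import (
	"fmt"
	"os"
	"runtime/trace"
)

func main() {
	f, err := os.Create("prog.trace")
	if err != nil {
		panic(err)
	}
	defer f.Close()

	if err := trace.Start(f); err != nil {
		panic(err)
	}
	// ... the workload being traced ...
	for i := 0; i < 1000; i++ {
		_ = fmt.Sprintf("work %d", i)
	}
	trace.Stop()

	fi, _ := f.Stat()
	fmt.Println("trace bytes written:", fi.Size())
}
```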

View file

@ -12,72 +12,77 @@ import (
// timestamp is an unprocessed timestamp. // timestamp is an unprocessed timestamp.
type timestamp uint64 type timestamp uint64
// batch represents a batch of trace events.
// It is unparsed except for its header.
type batch struct { type batch struct {
m threadID
time timestamp time timestamp
gen uint64
data []byte data []byte
} }
// threadID is the runtime-internal M structure's ID. This is unique
// for each OS thread.
type threadID int64
// readBatch copies b and parses the trace batch header inside. // readBatch copies b and parses the trace batch header inside.
// Returns the batch, the generation, bytes read, and an error. // Returns the batch, bytes read, and an error.
func readBatch(b []byte) (batch, uint64, uint64, error) { func readBatch(b []byte) (batch, uint64, error) {
if len(b) == 0 { if len(b) == 0 {
return batch{}, 0, 0, fmt.Errorf("batch is empty") return batch{}, 0, fmt.Errorf("batch is empty")
} }
data := make([]byte, len(b)) data := make([]byte, len(b))
if nw := copy(data, b); nw != len(b) { copy(data, b)
return batch{}, 0, 0, fmt.Errorf("unexpected error copying batch")
}
// Read batch header byte.
if typ := tracev2.EventType(b[0]); typ != tracev2.EvEventBatch && typ != tracev2.EvExperimentalBatch {
return batch{}, 0, 1, fmt.Errorf("expected batch event, got event %d", typ)
}
// Read the batch header: gen (generation), thread (M) ID, base timestamp // Read batch header byte.
// for the batch. if typ := tracev2.EventType(b[0]); typ == tracev2.EvEndOfGeneration {
if len(b) != 1 {
return batch{}, 1, fmt.Errorf("unexpected end of generation in batch of size >1")
}
return batch{data: data}, 1, nil
}
if typ := tracev2.EventType(b[0]); typ != tracev2.EvEventBatch && typ != tracev2.EvExperimentalBatch {
return batch{}, 1, fmt.Errorf("expected batch event, got event %d", typ)
}
total := 1 total := 1
b = b[1:] b = b[1:]
// Read the generation
gen, n, err := readUvarint(b) gen, n, err := readUvarint(b)
if err != nil { if err != nil {
return batch{}, gen, uint64(total + n), fmt.Errorf("error reading batch gen: %w", err) return batch{}, uint64(total + n), fmt.Errorf("error reading batch gen: %w", err)
}
total += n
b = b[n:]
m, n, err := readUvarint(b)
if err != nil {
return batch{}, gen, uint64(total + n), fmt.Errorf("error reading batch M ID: %w", err)
}
total += n
b = b[n:]
ts, n, err := readUvarint(b)
if err != nil {
return batch{}, gen, uint64(total + n), fmt.Errorf("error reading batch timestamp: %w", err)
} }
total += n total += n
b = b[n:] b = b[n:]
// Read in the size of the batch to follow. // Read the M (discard it).
_, n, err = readUvarint(b)
if err != nil {
return batch{}, uint64(total + n), fmt.Errorf("error reading batch M ID: %w", err)
}
total += n
b = b[n:]
// Read the timestamp.
ts, n, err := readUvarint(b)
if err != nil {
return batch{}, uint64(total + n), fmt.Errorf("error reading batch timestamp: %w", err)
}
total += n
b = b[n:]
// Read the size of the batch to follow.
size, n, err := readUvarint(b) size, n, err := readUvarint(b)
if err != nil { if err != nil {
return batch{}, gen, uint64(total + n), fmt.Errorf("error reading batch size: %w", err) return batch{}, uint64(total + n), fmt.Errorf("error reading batch size: %w", err)
} }
if size > tracev2.MaxBatchSize { if size > tracev2.MaxBatchSize {
return batch{}, gen, uint64(total + n), fmt.Errorf("invalid batch size %d, maximum is %d", size, tracev2.MaxBatchSize) return batch{}, uint64(total + n), fmt.Errorf("invalid batch size %d, maximum is %d", size, tracev2.MaxBatchSize)
} }
total += n total += n
total += int(size) total += int(size)
if total != len(data) {
return batch{}, uint64(total), fmt.Errorf("expected complete batch")
}
data = data[:total] data = data[:total]
// Return the batch. // Return the batch.
return batch{ return batch{
m: threadID(m), gen: gen,
time: timestamp(ts), time: timestamp(ts),
data: data, data: data,
}, gen, uint64(total), nil }, uint64(total), nil
} }
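
Editor's note, not part of the diff: for readers following the parsing above, a batch begins with an event-type byte, then uvarints for the generation, the M ID, and the base timestamp, then a uvarint payload length and the payload itself. A stand-alone sketch of that layout using encoding/binary varints (illustrative only, not the test parser):
```
package main

import (
	"encoding/binary"
	"errors"
	"fmt"
)

type header struct {
	typ     byte
	gen, m  uint64
	ts      uint64
	payload []byte
}

// parseBatchHeader walks the layout described above: type byte, three
// uvarints (gen, M, timestamp), then a uvarint size and that many bytes.
func parseBatchHeader(b []byte) (header, error) {
	if len(b) == 0 {
		return header{}, errors.New("empty batch")
	}
	h := header{typ: b[0]}
	b = b[1:]
	for _, dst := range []*uint64{&h.gen, &h.m, &h.ts} {
		v, n := binary.Uvarint(b)
		if n <= 0 {
			return header{}, errors.New("bad uvarint")
		}
		*dst, b = v, b[n:]
	}
	size, n := binary.Uvarint(b)
	if n <= 0 || uint64(len(b[n:])) < size {
		return header{}, errors.New("short payload")
	}
	h.payload = b[n : n+int(size)]
	return h, nil
}

func main() {
	// Build a fake batch: type 0x01, gen 7, M 3, ts 123456, 4-byte payload.
	buf := []byte{0x01}
	for _, v := range []uint64{7, 3, 123456, 4} {
		buf = binary.AppendUvarint(buf, v)
	}
	buf = append(buf, 0xde, 0xad, 0xbe, 0xef)

	h, err := parseBatchHeader(buf)
	fmt.Println(h.gen, h.m, h.ts, len(h.payload), err) // 7 3 123456 4 <nil>
}
```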

View file

@ -141,9 +141,9 @@ func (fr *FlightRecorder) WriteTo(w io.Writer) (n int64, err error) {
// Write all the data. // Write all the data.
for _, gen := range gens { for _, gen := range gens {
for _, batch := range gen.batches { for _, data := range gen.batches {
// Write batch data. // Write batch data.
nw, err = w.Write(batch.data) nw, err = w.Write(data)
n += int64(nw) n += int64(nw)
if err != nil { if err != nil {
return n, err return n, err
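
Editor's note, not part of the diff: WriteTo above copies the retained batches of each generation to the caller. For context, a usage sketch of the public flight-recorder API this serves, assuming runtime/trace.NewFlightRecorder, Start, Stop, and WriteTo as shipped in recent releases:
```
package main

import (
	"fmt"
	"os"
	"runtime/trace"
)

func main() {
	fr := trace.NewFlightRecorder(trace.FlightRecorderConfig{})
	if err := fr.Start(); err != nil {
		panic(err)
	}
	defer fr.Stop()

	// ... on some interesting event, snapshot the most recent trace data ...
	f, err := os.Create("snapshot.trace")
	if err != nil {
		panic(err)
	}
	defer f.Close()

	n, err := fr.WriteTo(f) // writes the retained generations' batches
	fmt.Println("snapshot bytes:", n, "err:", err)
}
```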

View file

@ -41,21 +41,21 @@ func (w *recorder) Write(b []byte) (n int, err error) {
if len(b) == n { if len(b) == n {
return 0, nil return 0, nil
} }
ba, gen, nb, err := readBatch(b[n:]) // Every write from the runtime is guaranteed to be a complete batch. ba, nb, err := readBatch(b[n:]) // Every write from the runtime is guaranteed to be a complete batch.
if err != nil { if err != nil {
return len(b) - int(nb) - n, err return len(b) - int(nb) - n, err
} }
n += int(nb) n += int(nb)
// Append the batch to the current generation. // Append the batch to the current generation.
if r.active.gen == 0 { if ba.gen != 0 && r.active.gen == 0 {
r.active.gen = gen r.active.gen = ba.gen
} }
if r.active.minTime == 0 || r.active.minTime > r.freq.mul(ba.time) { if ba.time != 0 && (r.active.minTime == 0 || r.active.minTime > r.freq.mul(ba.time)) {
r.active.minTime = r.freq.mul(ba.time) r.active.minTime = r.freq.mul(ba.time)
} }
r.active.size += len(ba.data) r.active.size += len(ba.data)
r.active.batches = append(r.active.batches, ba) r.active.batches = append(r.active.batches, ba.data)
return len(b), nil return len(b), nil
} }
@ -99,7 +99,7 @@ type rawGeneration struct {
gen uint64 gen uint64
size int size int
minTime eventTime minTime eventTime
batches []batch batches [][]byte
} }
func traceTimeNow(freq frequency) eventTime { func traceTimeNow(freq frequency) eventTime {

View file

@ -155,7 +155,7 @@ func (t *traceMultiplexer) startLocked() error {
t.subscribersMu.Unlock() t.subscribersMu.Unlock()
go func() { go func() {
header := runtime_readTrace() header := runtime.ReadTrace()
if traceStartWriter != nil { if traceStartWriter != nil {
traceStartWriter.Write(header) traceStartWriter.Write(header)
} }
@ -164,10 +164,16 @@ func (t *traceMultiplexer) startLocked() error {
} }
for { for {
data := runtime_readTrace() data := runtime.ReadTrace()
if data == nil { if data == nil {
break break
} }
if traceStartWriter != nil {
traceStartWriter.Write(data)
}
if flightRecorder != nil {
flightRecorder.Write(data)
}
if len(data) == 1 && tracev2.EventType(data[0]) == tracev2.EvEndOfGeneration { if len(data) == 1 && tracev2.EventType(data[0]) == tracev2.EvEndOfGeneration {
if flightRecorder != nil { if flightRecorder != nil {
flightRecorder.endGeneration() flightRecorder.endGeneration()
@ -187,13 +193,6 @@ func (t *traceMultiplexer) startLocked() error {
if frIsNew { if frIsNew {
flightRecorder.Write(header) flightRecorder.Write(header)
} }
} else {
if traceStartWriter != nil {
traceStartWriter.Write(data)
}
if flightRecorder != nil {
flightRecorder.Write(data)
}
} }
} }
}() }()

View file

@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style // Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file. // license that can be found in the LICENSE file.
// Trace stack table and acquisition. // Trace type table.
package runtime package runtime
@ -13,7 +13,7 @@ import (
"unsafe" "unsafe"
) )
// traceTypeTable maps stack traces (arrays of PC's) to unique uint32 ids. // traceTypeTable maps types to unique uint32 ids.
// It is lock-free for reading. // It is lock-free for reading.
type traceTypeTable struct { type traceTypeTable struct {
tab traceMap tab traceMap

View file

@ -365,11 +365,11 @@ func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, att
if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(mapPipe[1]), 0, 0); err1 != 0 { if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(mapPipe[1]), 0, 0); err1 != 0 {
goto childerror goto childerror
} }
pid, _, err1 = RawSyscall(SYS_READ, uintptr(mapPipe[0]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2)) c, _, err1 = RawSyscall(SYS_READ, uintptr(mapPipe[0]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
if err1 != 0 { if err1 != 0 {
goto childerror goto childerror
} }
if pid != unsafe.Sizeof(err2) { if c != unsafe.Sizeof(err2) {
err1 = EINVAL err1 = EINVAL
goto childerror goto childerror
} }
@ -427,7 +427,7 @@ func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, att
if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&psetgroups[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 { if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&psetgroups[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
goto childerror goto childerror
} }
pid, _, err1 = RawSyscall(SYS_WRITE, fd1, uintptr(unsafe.Pointer(&setgroups[0])), uintptr(len(setgroups))) _, _, err1 = RawSyscall(SYS_WRITE, fd1, uintptr(unsafe.Pointer(&setgroups[0])), uintptr(len(setgroups)))
if err1 != 0 { if err1 != 0 {
goto childerror goto childerror
} }
@ -438,7 +438,7 @@ func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, att
if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&pgid[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 { if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&pgid[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
goto childerror goto childerror
} }
pid, _, err1 = RawSyscall(SYS_WRITE, fd1, uintptr(unsafe.Pointer(&gidmap[0])), uintptr(len(gidmap))) _, _, err1 = RawSyscall(SYS_WRITE, fd1, uintptr(unsafe.Pointer(&gidmap[0])), uintptr(len(gidmap)))
if err1 != 0 { if err1 != 0 {
goto childerror goto childerror
} }
@ -452,7 +452,7 @@ func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, att
if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&puid[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 { if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&puid[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
goto childerror goto childerror
} }
pid, _, err1 = RawSyscall(SYS_WRITE, fd1, uintptr(unsafe.Pointer(&uidmap[0])), uintptr(len(uidmap))) _, _, err1 = RawSyscall(SYS_WRITE, fd1, uintptr(unsafe.Pointer(&uidmap[0])), uintptr(len(uidmap)))
if err1 != 0 { if err1 != 0 {
goto childerror goto childerror
} }

View file

@ -410,17 +410,25 @@ func SendmsgN(fd int, p, oob []byte, to Sockaddr, flags int) (n int, err error)
} }
func sendmsgNInet4(fd int, p, oob []byte, to *SockaddrInet4, flags int) (n int, err error) { func sendmsgNInet4(fd int, p, oob []byte, to *SockaddrInet4, flags int) (n int, err error) {
ptr, salen, err := to.sockaddr() var ptr unsafe.Pointer
if err != nil { var salen _Socklen
return 0, err if to != nil {
ptr, salen, err = to.sockaddr()
if err != nil {
return 0, err
}
} }
return sendmsgN(fd, p, oob, ptr, salen, flags) return sendmsgN(fd, p, oob, ptr, salen, flags)
} }
func sendmsgNInet6(fd int, p, oob []byte, to *SockaddrInet6, flags int) (n int, err error) { func sendmsgNInet6(fd int, p, oob []byte, to *SockaddrInet6, flags int) (n int, err error) {
ptr, salen, err := to.sockaddr() var ptr unsafe.Pointer
if err != nil { var salen _Socklen
return 0, err if to != nil {
ptr, salen, err = to.sockaddr()
if err != nil {
return 0, err
}
} }
return sendmsgN(fd, p, oob, ptr, salen, flags) return sendmsgN(fd, p, oob, ptr, salen, flags)
} }
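
Editor's note, not part of the diff: sendmsgNInet4 and sendmsgNInet6 now tolerate a nil destination, which matters for connected sockets where the peer address is already fixed. A Linux-only sketch of the equivalent call through the exported SendmsgN, which already accepted a nil Sockaddr on its generic path:
```
package main

import (
	"fmt"
	"syscall"
)

func main() {
	// A UDP socket connected over loopback.
	fd, err := syscall.Socket(syscall.AF_INET, syscall.SOCK_DGRAM, 0)
	if err != nil {
		panic(err)
	}
	defer syscall.Close(fd)

	dst := &syscall.SockaddrInet4{Port: 4242, Addr: [4]byte{127, 0, 0, 1}}
	if err := syscall.Connect(fd, dst); err != nil {
		panic(err)
	}

	// With the destination fixed by Connect, the per-send address may be nil;
	// the typed fast paths changed above now allow the same for package net.
	n, err := syscall.SendmsgN(fd, []byte("hello"), nil, nil, 0)
	fmt.Println("sent", n, "bytes, err:", err)
}
```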

View file

@ -108,6 +108,25 @@ func testCanonMap(t *testing.T, newMap func() *canonMap[string]) {
wg.Wait() wg.Wait()
} }
// Run an extra GC cycle to de-flake. Sometimes the cleanups
// fail to run in time, despite drainCleanupQueue.
//
// TODO(mknyszek): Figure out why the extra GC is necessary,
// and what is transiently keeping the cleanups live.
// * I have confirmed that they are not completely stuck, and
// they always eventually run.
// * I have also confirmed it's not asynchronous preemption
// keeping them around (though that is a possibility).
// * I have confirmed that they are not simply sitting on
// the queue, and that drainCleanupQueue is just failing
// to actually empty the queue.
// * I have confirmed that it's not a write barrier that's
// keeping it alive, nor is it a weak pointer dereference
// (which shades the object during the GC).
// The corresponding objects do seem to be transiently truly
// reachable, but I have no idea by what path.
runtime.GC()
// Drain cleanups so everything is deleted. // Drain cleanups so everything is deleted.
drainCleanupQueue(t) drainCleanupQueue(t)
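
Editor's note, not part of the diff: the new comment documents why one GC cycle is sometimes not enough for cleanups to run. A small self-contained demonstration of the same effect using the public runtime.AddCleanup, retrying GC instead of assuming a single cycle suffices:
```
package main

import (
	"fmt"
	"runtime"
	"time"
)

func main() {
	ran := make(chan struct{})

	obj := new([64]byte)
	// Register a cleanup that signals when it has run.
	runtime.AddCleanup(obj, func(struct{}) { close(ran) }, struct{}{})
	obj = nil // drop the only reference

	// Retry GC a few times rather than assuming one cycle is enough.
	for i := 0; i < 10; i++ {
		runtime.GC()
		select {
		case <-ran:
			fmt.Println("cleanup ran after", i+1, "GC cycle(s)")
			return
		case <-time.After(10 * time.Millisecond):
		}
	}
	fmt.Println("cleanup did not run yet")
}
```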

View file

@ -0,0 +1,75 @@
// compile
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package reorder
type Element struct {
A string
B string
C string
D string
E string
Text []string
List []string
Child Elements
F string
G bool
H bool
I string
}
type Elements []Element
func DoesNotCompile(ve Elements) Elements {
aa := Elements{}
bb := Elements{}
cc := Elements{}
dd := Elements{}
ee := Elements{}
ff := Elements{}
gg := Elements{}
hh := Elements{}
ii := Elements{}
if len(ve) != 1 {
return ve
}
for _, e := range ve[0].Child {
if len(e.Text) == 1 && (e.Text[0] == "xx") {
ee = append(ee, e)
} else if len(e.Text) == 1 && e.Text[0] == "yy" {
for _, c := range e.Child {
if len(c.Text) == 1 && c.Text[0] == "zz" {
ii = append(ii, c)
} else {
hh = append(hh, c)
}
}
ii = append(ii, hh...)
e.Child = ii
gg = append(gg, e)
} else if len(e.Text) == 1 && e.Text[0] == "tt" {
for _, entry := range e.Child {
for _, c := range entry.Child {
if len(c.Text) == 1 && c.Text[0] == "ee" {
cc = append(cc, c)
} else {
dd = append(dd, c)
}
}
cc = append(cc, dd...)
entry.Child = cc
bb = append(bb, entry)
cc, dd = Elements{}, Elements{}
}
e.Child = bb
aa = append(aa, e)
} else {
ff = append(ff, e)
}
}
return ve
}