[dev.simd] all: merge master (9de69f6) into dev.simd

Merge List:

+ 2025-08-20 9de69f6913 errors: mention Is/As in Join docs
+ 2025-08-20 4afd482812 cmd/go/internal/doc: pass URL fragments separately with -http
+ 2025-08-20 509d5f647f internal/poll: don't call Seek for overlapped Windows handles
+ 2025-08-20 853fc12739 internal/poll: set the correct file offset in FD.Seek for Windows overlapped handles
+ 2025-08-19 bd885401d5 runtime: save and restore all fcc registers in async preempt on loong64
+ 2025-08-19 119546ea4f cmd/go: document install outputs to $GOOS_$GOARCH when cross compiling
+ 2025-08-19 ffa882059c unique: deflake TestCanonMap/LoadOrStore/ConcurrentUnsharedKeys
+ 2025-08-19 1f2e8e03e4 os: fix path in MkdirTemp error message
+ 2025-08-19 5024d0d884 cmd/compile: tweak example command in README
+ 2025-08-19 b80ffb64d8 internal/trace: remove redundant info from Event.String
+ 2025-08-19 c7d8bda459 cmd/compile/internal: make function comments match function names
+ 2025-08-19 de2d741667 internal/trace: use RFC3339Nano for wall clock snapshots in Event.String
+ 2025-08-19 c61db5ebd5 syscall: forkAndExecInChild1: don't reuse pid variable
+ 2025-08-19 07ee3bfc63 cmd/go: use modern pprof flags in documentation
+ 2025-08-18 5a56d8848b cmd/compile: ensure we use allowed registers for input-clobbering instructions
+ 2025-08-18 c3927a47f0 runtime: fix comments in tracetype.go
+ 2025-08-15 77f911e31c internal/trace: emit final sync event for generation in Go 1.26+
+ 2025-08-15 786be1d2bf runtime: don't overwrite global stop channel in tests
+ 2025-08-15 4a7fde922f internal/trace: add end-of-generation signal to trace
+ 2025-08-15 cb814bd5bc net: skip TestIPv4WriteMsgUDPAddrPort on plan9
+ 2025-08-15 78a3968c2c runtime/metrics: add metric for current Go-owned thread count
+ 2025-08-15 ab8121a407 runtime/metrics: add metric for total goroutines created
+ 2025-08-15 13df972f68 runtime/metrics: add metrics for goroutine sched states
+ 2025-08-15 bd07fafb0a runtime: disable stack shrinking for all waiting-for-suspendG cases
+ 2025-08-15 a651e2ea47 runtime: remove duff support for amd64
+ 2025-08-15 e4291e484c runtime: remove duff support for arm64
+ 2025-08-15 15d6dbc05c cmd/compile: use generated loops instead of DUFFCOPY on arm64
+ 2025-08-15 bca3e98b8a cmd/go: test barrier actions
+ 2025-08-15 052fcde9fd internal/runtime: cleaner overflow checker
+ 2025-08-15 3871c0d84d syscall: permit nil destination address in sendmsgN{Inet4,Inet6}
+ 2025-08-14 a8564bd412 runtime: make all synctest bubble violations fatal panics

Change-Id: Ibc94566bc69bcb59b1d79b6fa868610ca2d1d223
Cherry Mui 2025-08-20 16:06:42 -04:00
commit 103b6e39ca
68 changed files with 1449 additions and 1363 deletions

View file

@ -281,10 +281,10 @@ dependencies, so is not suitable for distributed build systems.)
```
$ go install golang.org/x/tools/cmd/toolstash@latest
$ git clone https://go.googlesource.com/go
$ cd go
$ export PATH=$PWD/go/bin:$PATH
$ cd go/src
$ git checkout -b mybranch
$ ./src/all.bash # build and confirm good starting point
$ export PATH=$PWD/bin:$PATH
$ ./all.bash # build and confirm good starting point
$ toolstash save # save current tools
```
After that, your edit/compile/test cycle can be similar to:

View file

@ -1162,41 +1162,119 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
// BNE loop
// There's a past-the-end pointer here, any problem with that?
case ssa.OpARM64DUFFCOPY:
p := s.Prog(obj.ADUFFCOPY)
p.To.Type = obj.TYPE_MEM
p.To.Name = obj.NAME_EXTERN
p.To.Sym = ir.Syms.Duffcopy
p.To.Offset = v.AuxInt
case ssa.OpARM64LoweredMove:
// LDP.P 16(R16), (R25, Rtmp)
// STP.P (R25, Rtmp), 16(R17)
// CMP Rarg2, R16
// BLE -3(PC)
// arg2 is the address of the last element of src
p := s.Prog(arm64.ALDP)
p.Scond = arm64.C_XPOST
p.From.Type = obj.TYPE_MEM
p.From.Reg = arm64.REG_R16
p.From.Offset = 16
p.To.Type = obj.TYPE_REGREG
p.To.Reg = arm64.REG_R25
p.To.Offset = int64(arm64.REGTMP)
p2 := s.Prog(arm64.ASTP)
p2.Scond = arm64.C_XPOST
p2.From.Type = obj.TYPE_REGREG
p2.From.Reg = arm64.REG_R25
p2.From.Offset = int64(arm64.REGTMP)
p2.To.Type = obj.TYPE_MEM
p2.To.Reg = arm64.REG_R17
p2.To.Offset = 16
p3 := s.Prog(arm64.ACMP)
p3.From.Type = obj.TYPE_REG
p3.From.Reg = v.Args[2].Reg()
p3.Reg = arm64.REG_R16
p4 := s.Prog(arm64.ABLE)
p4.To.Type = obj.TYPE_BRANCH
p4.To.SetTarget(p)
dstReg := v.Args[0].Reg()
srcReg := v.Args[1].Reg()
if dstReg == srcReg {
break
}
tmpReg1 := int16(arm64.REG_R24)
tmpReg2 := int16(arm64.REG_R25)
n := v.AuxInt
if n < 16 {
v.Fatalf("Move too small %d", n)
}
// Generate copying instructions.
var off int64
for n >= 16 {
// LDP off(srcReg), (tmpReg1, tmpReg2)
// STP (tmpReg1, tmpReg2), off(dstReg)
move16(s, srcReg, dstReg, tmpReg1, tmpReg2, off, false)
off += 16
n -= 16
}
if n > 8 {
// MOVD off(srcReg), tmpReg1
// MOVD tmpReg1, off(dstReg)
move8(s, srcReg, dstReg, tmpReg1, off)
off += 8
n -= 8
}
if n != 0 {
// MOVD off+n-8(srcReg), tmpReg1
// MOVD tmpReg1, off+n-8(dstReg)
move8(s, srcReg, dstReg, tmpReg1, off+n-8)
}
case ssa.OpARM64LoweredMoveLoop:
dstReg := v.Args[0].Reg()
srcReg := v.Args[1].Reg()
if dstReg == srcReg {
break
}
countReg := int16(arm64.REG_R23)
tmpReg1 := int16(arm64.REG_R24)
tmpReg2 := int16(arm64.REG_R25)
n := v.AuxInt
loopSize := int64(64)
if n < 3*loopSize {
// - a loop count of 0 won't work.
// - a loop count of 1 is useless.
// - a loop count of 2 is a code size ~tie
// 3 instructions to implement the loop
// 4 instructions in the loop body
// vs
// 8 instructions in the straightline code
// Might as well use straightline code.
v.Fatalf("ZeroLoop size too small %d", n)
}
// Put iteration count in a register.
// MOVD $n, countReg
p := s.Prog(arm64.AMOVD)
p.From.Type = obj.TYPE_CONST
p.From.Offset = n / loopSize
p.To.Type = obj.TYPE_REG
p.To.Reg = countReg
cntInit := p
// Move loopSize bytes starting at srcReg to dstReg.
// Increment srcReg and destReg by loopSize as a side effect.
for range loopSize / 16 {
// LDP.P 16(srcReg), (tmpReg1, tmpReg2)
// STP.P (tmpReg1, tmpReg2), 16(dstReg)
move16(s, srcReg, dstReg, tmpReg1, tmpReg2, 0, true)
}
// Decrement loop count.
// SUB $1, countReg
p = s.Prog(arm64.ASUB)
p.From.Type = obj.TYPE_CONST
p.From.Offset = 1
p.To.Type = obj.TYPE_REG
p.To.Reg = countReg
// Jump to loop header if we're not done yet.
// CBNZ head
p = s.Prog(arm64.ACBNZ)
p.From.Type = obj.TYPE_REG
p.From.Reg = countReg
p.To.Type = obj.TYPE_BRANCH
p.To.SetTarget(cntInit.Link)
// Multiples of the loop size are now done.
n %= loopSize
// Copy any fractional portion.
var off int64
for n >= 16 {
// LDP off(srcReg), (tmpReg1, tmpReg2)
// STP (tmpReg1, tmpReg2), off(dstReg)
move16(s, srcReg, dstReg, tmpReg1, tmpReg2, off, false)
off += 16
n -= 16
}
if n > 8 {
// MOVD off(srcReg), tmpReg1
// MOVD tmpReg1, off(dstReg)
move8(s, srcReg, dstReg, tmpReg1, off)
off += 8
n -= 8
}
if n != 0 {
// MOVD off+n-8(srcReg), tmpReg1
// MOVD tmpReg1, off+n-8(dstReg)
move8(s, srcReg, dstReg, tmpReg1, off+n-8)
}
case ssa.OpARM64CALLstatic, ssa.OpARM64CALLclosure, ssa.OpARM64CALLinter:
s.Call(v)
case ssa.OpARM64CALLtail:
@ -1599,3 +1677,53 @@ func zero8(s *ssagen.State, reg int16, off int64) {
p.To.Reg = reg
p.To.Offset = off
}
// move16 copies 16 bytes at src+off to dst+off.
// Uses registers tmp1 and tmp2.
// If postInc is true, increment src and dst by 16.
func move16(s *ssagen.State, src, dst, tmp1, tmp2 int16, off int64, postInc bool) {
// LDP off(src), (tmp1, tmp2)
ld := s.Prog(arm64.ALDP)
ld.From.Type = obj.TYPE_MEM
ld.From.Reg = src
ld.From.Offset = off
ld.To.Type = obj.TYPE_REGREG
ld.To.Reg = tmp1
ld.To.Offset = int64(tmp2)
// STP (tmp1, tmp2), off(dst)
st := s.Prog(arm64.ASTP)
st.From.Type = obj.TYPE_REGREG
st.From.Reg = tmp1
st.From.Offset = int64(tmp2)
st.To.Type = obj.TYPE_MEM
st.To.Reg = dst
st.To.Offset = off
if postInc {
if off != 0 {
panic("can't postinc with non-zero offset")
}
ld.Scond = arm64.C_XPOST
st.Scond = arm64.C_XPOST
ld.From.Offset = 16
st.To.Offset = 16
}
}
// move8 copies 8 bytes at src+off to dst+off.
// Uses register tmp.
func move8(s *ssagen.State, src, dst, tmp int16, off int64) {
// MOVD off(src), tmp
ld := s.Prog(arm64.AMOVD)
ld.From.Type = obj.TYPE_MEM
ld.From.Reg = src
ld.From.Offset = off
ld.To.Type = obj.TYPE_REG
ld.To.Reg = tmp
// MOVD tmp, off(dst)
st := s.Prog(arm64.AMOVD)
st.From.Type = obj.TYPE_REG
st.From.Reg = tmp
st.To.Type = obj.TYPE_MEM
st.To.Reg = dst
st.To.Offset = off
}
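
To make the chunking above concrete, here is a small standalone Go sketch (an illustration only, not compiler code) of the schedule that move16/move8 produce for a copy of n >= 16 bytes: 16-byte LDP/STP pairs, an optional 8-byte MOVD, and a final overlapping 8-byte copy at off+n-8 for any 1..8 byte tail, modeled here with copy on byte slices.

```
package main

import (
	"bytes"
	"fmt"
)

// moveSchedule mirrors the lowering above using copy() in place of
// LDP/STP (16 bytes) and MOVD (8 bytes). It assumes n >= 16, matching
// the Fatalf check in the OpARM64LoweredMove case.
func moveSchedule(dst, src []byte) {
	n := int64(len(src))
	var off int64
	for n >= 16 {
		copy(dst[off:off+16], src[off:off+16]) // LDP/STP pair
		off += 16
		n -= 16
	}
	if n > 8 {
		copy(dst[off:off+8], src[off:off+8]) // single MOVD
		off += 8
		n -= 8
	}
	if n != 0 {
		// Overlapping 8-byte copy that covers the 1..8 byte tail.
		copy(dst[off+n-8:off+n], src[off+n-8:off+n])
	}
}

func main() {
	src := make([]byte, 45) // 2 full pairs + 8 bytes + 5-byte tail
	for i := range src {
		src[i] = byte(i)
	}
	dst := make([]byte, len(src))
	moveSchedule(dst, src)
	fmt.Println(bytes.Equal(dst, src)) // true
}
```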

View file

@ -50,7 +50,7 @@ var cases = []testcase{
{"1", "", 0, []string{"for_nested.go"}},
}
// TestLoopVar checks that the GOEXPERIMENT and debug flags behave as expected.
// TestLoopVarGo1_21 checks that the GOEXPERIMENT and debug flags behave as expected.
func TestLoopVarGo1_21(t *testing.T) {
switch runtime.GOOS {
case "linux", "darwin":

View file

@ -158,7 +158,7 @@ func New(profileFile string) (*Profile, error) {
}, nil
}
// initializeIRGraph builds the IRGraph by visiting all the ir.Func in decl list
// createIRGraph builds the IRGraph by visiting all the ir.Func in decl list
// of a package.
func createIRGraph(namedEdgeMap pgo.NamedEdgeMap) *IRGraph {
g := &IRGraph{

View file

@ -462,39 +462,8 @@
(STP [16] dst (Select0 <typ.UInt64> (LDP [16] src mem)) (Select1 <typ.UInt64> (LDP [16] src mem))
(STP dst (Select0 <typ.UInt64> (LDP src mem)) (Select1 <typ.UInt64> (LDP src mem)) mem))))
// strip off fractional word move
(Move [s] dst src mem) && s%16 != 0 && s%16 <= 8 && s > 64 =>
(Move [8]
(OffPtr <dst.Type> dst [s-8])
(OffPtr <src.Type> src [s-8])
(Move [s-s%16] dst src mem))
(Move [s] dst src mem) && s%16 != 0 && s%16 > 8 && s > 64 =>
(Move [16]
(OffPtr <dst.Type> dst [s-16])
(OffPtr <src.Type> src [s-16])
(Move [s-s%16] dst src mem))
// medium move uses a duff device
(Move [s] dst src mem)
&& s > 64 && s <= 16*64 && s%16 == 0
&& logLargeCopy(v, s) =>
(DUFFCOPY [8 * (64 - s/16)] dst src mem)
// 8 is the number of bytes to encode:
//
// LDP.P 16(R16), (R26, R27)
// STP.P (R26, R27), 16(R17)
//
// 64 is number of these blocks. See runtime/duff_arm64.s:duffcopy
// large move uses a loop
(Move [s] dst src mem)
&& s%16 == 0 && s > 16*64
&& logLargeCopy(v, s) =>
(LoweredMove
dst
src
(ADDconst <src.Type> src [s-16])
mem)
(Move [s] dst src mem) && s > 64 && s < 192 && logLargeCopy(v, s) => (LoweredMove [s] dst src mem)
(Move [s] dst src mem) && s >= 192 && logLargeCopy(v, s) => (LoweredMoveLoop [s] dst src mem)
// calls
(StaticCall ...) => (CALLstatic ...)
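
As a quick illustration of the new size thresholds in these rules (an informal sketch, not generated rule code): copies of at most 64 bytes keep using the inline LDP/STP rules above, sizes strictly between 64 and 192 bytes become a straightline LoweredMove, and 192 bytes or more become a LoweredMoveLoop.

```
package main

import "fmt"

// lowering classifies a Move size the way the rules above do.
func lowering(s int64) string {
	switch {
	case s <= 64:
		return "inline LDP/STP rules"
	case s < 192:
		return "LoweredMove (straightline copy)"
	default:
		return "LoweredMoveLoop (loop plus straightline tail)"
	}
}

func main() {
	for _, s := range []int64{48, 100, 192, 1024} {
		fmt.Printf("%4d bytes -> %s\n", s, lowering(s))
	}
}
```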

View file

@ -144,6 +144,8 @@ func init() {
gpspsbg = gpspg | buildReg("SB")
fp = buildReg("F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31")
callerSave = gp | fp | buildReg("g") // runtime.setg (and anything calling it) may clobber g
r24to25 = buildReg("R24 R25")
r23to25 = buildReg("R23 R24 R25")
rz = buildReg("ZERO")
first16 = buildReg("R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15")
)
@ -568,47 +570,40 @@ func init() {
needIntTemp: true,
},
// duffcopy
// arg0 = address of dst memory (in R21, changed as side effect)
// arg1 = address of src memory (in R20, changed as side effect)
// medium copying
// arg0 = address of dst memory
// arg1 = address of src memory
// arg2 = mem
// auxint = offset into duffcopy code to start executing
// auxint = # of bytes to copy
// returns mem
// R20, R21 changed as side effect
// R16 and R17 may be clobbered by linker trampoline.
{
name: "DUFFCOPY",
name: "LoweredMove",
aux: "Int64",
argLength: 3,
reg: regInfo{
inputs: []regMask{buildReg("R21"), buildReg("R20")},
clobbers: buildReg("R16 R17 R20 R21 R26 R30"),
inputs: []regMask{gp &^ r24to25, gp &^ r24to25},
clobbers: r24to25, // TODO: figure out needIntTemp x2
},
//faultOnNilArg0: true, // Note: removed for 73748. TODO: reenable at some point
//faultOnNilArg1: true,
unsafePoint: true, // FP maintenance around DUFFCOPY can be clobbered by interrupts
faultOnNilArg0: true,
faultOnNilArg1: true,
},
// large move
// arg0 = address of dst memory (in R17 aka arm64.REGRT2, changed as side effect)
// arg1 = address of src memory (in R16 aka arm64.REGRT1, changed as side effect)
// arg2 = address of the last element of src
// arg3 = mem
// large copying
// arg0 = address of dst memory
// arg1 = address of src memory
// arg2 = mem
// auxint = # of bytes to copy
// returns mem
// LDP.P 16(R16), (R25, Rtmp)
// STP.P (R25, Rtmp), 16(R17)
// CMP Rarg2, R16
// BLE -3(PC)
// Note: the-end-of-src may be not a valid pointer. it's a problem if it is spilled.
// the-end-of-src - 16 is within the area to copy, ok to spill.
{
name: "LoweredMove",
argLength: 4,
name: "LoweredMoveLoop",
aux: "Int64",
argLength: 3,
reg: regInfo{
inputs: []regMask{buildReg("R17"), buildReg("R16"), gp &^ buildReg("R25")},
clobbers: buildReg("R16 R17 R25"),
inputs: []regMask{gp &^ r23to25, gp &^ r23to25},
clobbers: r23to25, // TODO: figure out needIntTemp x3
clobbersArg0: true,
clobbersArg1: true,
},
clobberFlags: true,
faultOnNilArg0: true,
faultOnNilArg1: true,
},

View file

@ -475,7 +475,7 @@ func opcodeMap(f *Func) map[Op]int {
return m
}
// opcodeCounts checks that the number of opcodes listed in m agree with the
// checkOpcodeCounts checks that the number of opcodes listed in m agree with the
// number of opcodes that appear in the function.
func checkOpcodeCounts(t *testing.T, f *Func, m map[Op]int) {
n := opcodeMap(f)

View file

@ -2906,8 +2906,8 @@ const (
OpARM64GreaterEqualNoov
OpARM64LoweredZero
OpARM64LoweredZeroLoop
OpARM64DUFFCOPY
OpARM64LoweredMove
OpARM64LoweredMoveLoop
OpARM64LoweredGetClosurePtr
OpARM64LoweredGetCallerSP
OpARM64LoweredGetCallerPC
@ -43041,31 +43041,33 @@ var opcodeTable = [...]opInfo{
},
},
{
name: "DUFFCOPY",
name: "LoweredMove",
auxType: auxInt64,
argLen: 3,
unsafePoint: true,
reg: regInfo{
inputs: []inputInfo{
{0, 1048576}, // R21
{1, 524288}, // R20
},
clobbers: 303759360, // R16 R17 R20 R21 R26 R30
},
},
{
name: "LoweredMove",
argLen: 4,
clobberFlags: true,
faultOnNilArg0: true,
faultOnNilArg1: true,
reg: regInfo{
inputs: []inputInfo{
{0, 131072}, // R17
{1, 65536}, // R16
{2, 318767103}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R26 R30
{0, 310378495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R26 R30
{1, 310378495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R26 R30
},
clobbers: 16973824, // R16 R17 R25
clobbers: 25165824, // R24 R25
},
},
{
name: "LoweredMoveLoop",
auxType: auxInt64,
argLen: 3,
faultOnNilArg0: true,
faultOnNilArg1: true,
reg: regInfo{
inputs: []inputInfo{
{0, 306184191}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R26 R30
{1, 306184191}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R26 R30
},
clobbers: 29360128, // R23 R24 R25
clobbersArg0: true,
clobbersArg1: true,
},
},
{

View file

@ -1756,10 +1756,9 @@ func (s *regAllocState) regalloc(f *Func) {
// spilling the value with the most distant next use.
continue
}
// Copy input to a new clobberable register.
// Copy input to a different register that won't be clobbered.
c := s.allocValToReg(v.Args[i], m, true, v.Pos)
s.copies[c] = false
args[i] = c
}
// Pick a temporary register if needed.

View file

@ -19688,87 +19688,35 @@ func rewriteValueARM64_OpMove(v *Value) bool {
return true
}
// match: (Move [s] dst src mem)
// cond: s%16 != 0 && s%16 <= 8 && s > 64
// result: (Move [8] (OffPtr <dst.Type> dst [s-8]) (OffPtr <src.Type> src [s-8]) (Move [s-s%16] dst src mem))
// cond: s > 64 && s < 192 && logLargeCopy(v, s)
// result: (LoweredMove [s] dst src mem)
for {
s := auxIntToInt64(v.AuxInt)
dst := v_0
src := v_1
mem := v_2
if !(s%16 != 0 && s%16 <= 8 && s > 64) {
if !(s > 64 && s < 192 && logLargeCopy(v, s)) {
break
}
v.reset(OpMove)
v.AuxInt = int64ToAuxInt(8)
v0 := b.NewValue0(v.Pos, OpOffPtr, dst.Type)
v0.AuxInt = int64ToAuxInt(s - 8)
v0.AddArg(dst)
v1 := b.NewValue0(v.Pos, OpOffPtr, src.Type)
v1.AuxInt = int64ToAuxInt(s - 8)
v1.AddArg(src)
v2 := b.NewValue0(v.Pos, OpMove, types.TypeMem)
v2.AuxInt = int64ToAuxInt(s - s%16)
v2.AddArg3(dst, src, mem)
v.AddArg3(v0, v1, v2)
return true
}
// match: (Move [s] dst src mem)
// cond: s%16 != 0 && s%16 > 8 && s > 64
// result: (Move [16] (OffPtr <dst.Type> dst [s-16]) (OffPtr <src.Type> src [s-16]) (Move [s-s%16] dst src mem))
for {
s := auxIntToInt64(v.AuxInt)
dst := v_0
src := v_1
mem := v_2
if !(s%16 != 0 && s%16 > 8 && s > 64) {
break
}
v.reset(OpMove)
v.AuxInt = int64ToAuxInt(16)
v0 := b.NewValue0(v.Pos, OpOffPtr, dst.Type)
v0.AuxInt = int64ToAuxInt(s - 16)
v0.AddArg(dst)
v1 := b.NewValue0(v.Pos, OpOffPtr, src.Type)
v1.AuxInt = int64ToAuxInt(s - 16)
v1.AddArg(src)
v2 := b.NewValue0(v.Pos, OpMove, types.TypeMem)
v2.AuxInt = int64ToAuxInt(s - s%16)
v2.AddArg3(dst, src, mem)
v.AddArg3(v0, v1, v2)
return true
}
// match: (Move [s] dst src mem)
// cond: s > 64 && s <= 16*64 && s%16 == 0 && logLargeCopy(v, s)
// result: (DUFFCOPY [8 * (64 - s/16)] dst src mem)
for {
s := auxIntToInt64(v.AuxInt)
dst := v_0
src := v_1
mem := v_2
if !(s > 64 && s <= 16*64 && s%16 == 0 && logLargeCopy(v, s)) {
break
}
v.reset(OpARM64DUFFCOPY)
v.AuxInt = int64ToAuxInt(8 * (64 - s/16))
v.reset(OpARM64LoweredMove)
v.AuxInt = int64ToAuxInt(s)
v.AddArg3(dst, src, mem)
return true
}
// match: (Move [s] dst src mem)
// cond: s%16 == 0 && s > 16*64 && logLargeCopy(v, s)
// result: (LoweredMove dst src (ADDconst <src.Type> src [s-16]) mem)
// cond: s >= 192 && logLargeCopy(v, s)
// result: (LoweredMoveLoop [s] dst src mem)
for {
s := auxIntToInt64(v.AuxInt)
dst := v_0
src := v_1
mem := v_2
if !(s%16 == 0 && s > 16*64 && logLargeCopy(v, s)) {
if !(s >= 192 && logLargeCopy(v, s)) {
break
}
v.reset(OpARM64LoweredMove)
v0 := b.NewValue0(v.Pos, OpARM64ADDconst, src.Type)
v0.AuxInt = int64ToAuxInt(s - 16)
v0.AddArg(src)
v.AddArg4(dst, src, v0, mem)
v.reset(OpARM64LoweredMoveLoop)
v.AuxInt = int64ToAuxInt(s)
v.AddArg3(dst, src, mem)
return true
}
return false

View file

@ -758,6 +758,8 @@
// variable, which defaults to $GOPATH/bin or $HOME/go/bin if the GOPATH
// environment variable is not set. Executables in $GOROOT
// are installed in $GOROOT/bin or $GOTOOLDIR instead of $GOBIN.
// Cross compiled binaries are installed in $GOOS_$GOARCH subdirectories
// of the above.
//
// If the arguments have version suffixes (like @latest or @v1.0.0), "go install"
// builds packages in module-aware mode, ignoring the go.mod file in the current
@ -3231,8 +3233,8 @@
//
// Several of the flags control profiling and write an execution profile
// suitable for "go tool pprof"; run "go tool pprof -h" for more
// information. The --alloc_space, --alloc_objects, and --show_bytes
// options of pprof control how the information is presented.
// information. The -sample_index=alloc_space, -sample_index=alloc_objects,
// and -show_bytes options of pprof control how the information is presented.
//
// The following flags are recognized by the 'go test' command and
// control the execution of any test:

View file

@ -212,16 +212,16 @@ func do(writer io.Writer, flagSet *flag.FlagSet, args []string) (err error) {
mod, err := runCmd(append(os.Environ(), "GOWORK=off"), "go", "list", "-m")
if err == nil && mod != "" && mod != "command-line-arguments" {
// If there's a module, go to the module's doc page.
return doPkgsite(mod)
return doPkgsite(mod, "")
}
gowork, err := runCmd(nil, "go", "env", "GOWORK")
if err == nil && gowork != "" {
// Outside a module, but in a workspace, go to the home page
// with links to each of the modules' pages.
return doPkgsite("")
return doPkgsite("", "")
}
// Outside a module or workspace, go to the documentation for the standard library.
return doPkgsite("std")
return doPkgsite("std", "")
}
// If args are provided, we need to figure out which page to open on the pkgsite
@ -282,11 +282,11 @@ func do(writer io.Writer, flagSet *flag.FlagSet, args []string) (err error) {
}
if found {
if serveHTTP {
path, err := objectPath(userPath, pkg, symbol, method)
path, fragment, err := objectPath(userPath, pkg, symbol, method)
if err != nil {
return err
}
return doPkgsite(path)
return doPkgsite(path, fragment)
}
return nil
}
@ -305,7 +305,8 @@ func runCmd(env []string, cmdline ...string) (string, error) {
return strings.TrimSpace(stdout.String()), nil
}
func objectPath(userPath string, pkg *Package, symbol, method string) (string, error) {
// returns a path followed by a fragment (or an error)
func objectPath(userPath string, pkg *Package, symbol, method string) (string, string, error) {
var err error
path := pkg.build.ImportPath
if path == "." {
@ -314,7 +315,7 @@ func objectPath(userPath string, pkg *Package, symbol, method string) (string, e
// go list to get the import path.
path, err = runCmd(nil, "go", "list", userPath)
if err != nil {
return "", err
return "", "", err
}
}
@ -322,10 +323,7 @@ func objectPath(userPath string, pkg *Package, symbol, method string) (string, e
if symbol != "" && method != "" {
object = symbol + "." + method
}
if object != "" {
path = path + "#" + object
}
return path, nil
return path, object, nil
}
// failMessage creates a nicely formatted error message when there is no result to show.

View file

@ -34,7 +34,7 @@ func pickUnusedPort() (int, error) {
return port, nil
}
func doPkgsite(urlPath string) error {
func doPkgsite(urlPath, fragment string) error {
port, err := pickUnusedPort()
if err != nil {
return fmt.Errorf("failed to find port for documentation server: %v", err)
@ -44,6 +44,9 @@ func doPkgsite(urlPath string) error {
if err != nil {
return fmt.Errorf("internal error: failed to construct url: %v", err)
}
if fragment != "" {
path += "#" + fragment
}
// Turn off the default signal handler for SIGINT (and SIGQUIT on Unix)
// and instead wait for the child process to handle the signal and

View file

@ -8,4 +8,4 @@
package doc
func doPkgsite(string) error { return nil }
func doPkgsite(string, string) error { return nil }

View file

@ -186,8 +186,8 @@ and flags that apply to the resulting test binary.
Several of the flags control profiling and write an execution profile
suitable for "go tool pprof"; run "go tool pprof -h" for more
information. The --alloc_space, --alloc_objects, and --show_bytes
options of pprof control how the information is presented.
information. The -sample_index=alloc_space, -sample_index=alloc_objects,
and -show_bytes options of pprof control how the information is presented.
The following flags are recognized by the 'go test' command and
control the execution of any test:
@ -1044,11 +1044,36 @@ func runTest(ctx context.Context, cmd *base.Command, args []string) {
prints = append(prints, printTest)
}
// Order runs for coordinating start JSON prints.
// Order runs for coordinating start JSON prints via two mechanisms:
// 1. Channel locking forces runTest actions to start in-order.
// 2. Barrier tasks force runTest actions to be scheduled in-order.
// We need both for performant behavior, as channel locking without the barrier tasks starves the worker pool,
// and barrier tasks without channel locking don't guarantee in-order starts on their own.
var prevBarrier *work.Action
ch := make(chan struct{})
close(ch)
for _, a := range runs {
if r, ok := a.Actor.(*runTestActor); ok {
// Inject a barrier task between the run action and its dependencies.
// This barrier task will also depend on the previous barrier task.
// This prevents the run task from being scheduled until all previous run dependencies have finished.
// The build graph will be augmented to look roughly like this:
// build("a") build("b") build("c")
// | | |
// barrier("a.test") -> barrier("b.test") -> barrier("c.test")
// | | |
// run("a.test") run("b.test") run("c.test")
barrier := &work.Action{
Mode: "test barrier",
Deps: slices.Clip(a.Deps),
}
if prevBarrier != nil {
barrier.Deps = append(barrier.Deps, prevBarrier)
}
a.Deps = []*work.Action{barrier}
prevBarrier = barrier
r.prev = ch
ch = make(chan struct{})
r.next = ch
@ -1400,6 +1425,8 @@ func (lockedStdout) Write(b []byte) (int, error) {
func (r *runTestActor) Act(b *work.Builder, ctx context.Context, a *work.Action) error {
sh := b.Shell(a)
barrierAction := a.Deps[0]
buildAction := barrierAction.Deps[0]
// Wait for previous test to get started and print its first json line.
select {
@ -1530,7 +1557,7 @@ func (r *runTestActor) Act(b *work.Builder, ctx context.Context, a *work.Action)
// we have different link inputs but the same final binary,
// we still reuse the cached test result.
// c.saveOutput will store the result under both IDs.
r.c.tryCacheWithID(b, a, a.Deps[0].BuildContentID())
r.c.tryCacheWithID(b, a, buildAction.BuildContentID())
}
if r.c.buf != nil {
if stdout != &buf {
@ -1581,7 +1608,7 @@ func (r *runTestActor) Act(b *work.Builder, ctx context.Context, a *work.Action)
// fresh copies of tools to test as part of the testing.
addToEnv = "GOCOVERDIR=" + gcd
}
args := str.StringList(execCmd, a.Deps[0].BuiltTarget(), testlogArg, panicArg, fuzzArg, coverdirArg, testArgs)
args := str.StringList(execCmd, buildAction.BuiltTarget(), testlogArg, panicArg, fuzzArg, coverdirArg, testArgs)
if testCoverProfile != "" {
// Write coverage to temporary profile, for merging later.
@ -1741,8 +1768,8 @@ func (r *runTestActor) Act(b *work.Builder, ctx context.Context, a *work.Action)
// tryCache is called just before the link attempt,
// to see if the test result is cached and therefore the link is unneeded.
// It reports whether the result can be satisfied from cache.
func (c *runCache) tryCache(b *work.Builder, a *work.Action) bool {
return c.tryCacheWithID(b, a, a.Deps[0].BuildActionID())
func (c *runCache) tryCache(b *work.Builder, a *work.Action, linkAction *work.Action) bool {
return c.tryCacheWithID(b, a, linkAction.BuildActionID())
}
func (c *runCache) tryCacheWithID(b *work.Builder, a *work.Action, id string) bool {
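
The barrier mechanism described in the comment above can be sketched on its own. The following is a minimal, self-contained Go illustration (not the cmd/go action graph) of the injection step: each run action gets a synthetic barrier between it and its build dependencies, and each barrier also depends on the previous barrier, so runs start in order while their builds can still be scheduled in parallel.

```
package main

import "fmt"

// action is a stand-in for work.Action: just a name and dependencies.
type action struct {
	name string
	deps []*action
}

// injectBarriers inserts a barrier action between every run action and its
// existing dependencies, chaining each barrier to the previous one.
func injectBarriers(runs []*action) {
	var prevBarrier *action
	for _, r := range runs {
		barrier := &action{
			name: "barrier(" + r.name + ")",
			deps: append([]*action(nil), r.deps...), // copy the old deps
		}
		if prevBarrier != nil {
			barrier.deps = append(barrier.deps, prevBarrier)
		}
		r.deps = []*action{barrier}
		prevBarrier = barrier
	}
}

func main() {
	runs := []*action{
		{name: "run(a.test)", deps: []*action{{name: "build(a)"}}},
		{name: "run(b.test)", deps: []*action{{name: "build(b)"}}},
		{name: "run(c.test)", deps: []*action{{name: "build(c)"}}},
	}
	injectBarriers(runs)
	for _, r := range runs {
		b := r.deps[0]
		var names []string
		for _, d := range b.deps {
			names = append(names, d.name)
		}
		fmt.Println(r.name, "->", b.name, "->", names)
	}
}
```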

View file

@ -92,7 +92,7 @@ type Action struct {
buggyInstall bool // is this a buggy install (see -linkshared)?
TryCache func(*Builder, *Action) bool // callback for cache bypass
TryCache func(*Builder, *Action, *Action) bool // callback for cache bypass
CacheExecutable bool // Whether to cache executables produced by link steps

View file

@ -568,6 +568,8 @@ Executables are installed in the directory named by the GOBIN environment
variable, which defaults to $GOPATH/bin or $HOME/go/bin if the GOPATH
environment variable is not set. Executables in $GOROOT
are installed in $GOROOT/bin or $GOTOOLDIR instead of $GOBIN.
Cross compiled binaries are installed in $GOOS_$GOARCH subdirectories
of the above.
If the arguments have version suffixes (like @latest or @v1.0.0), "go install"
builds packages in module-aware mode, ignoring the go.mod file in the current

View file

@ -401,6 +401,25 @@ var (
stdlibRecompiledIncOnce = sync.OnceFunc(stdlibRecompiled.Inc)
)
// testRunAction returns the run action for a test given the link action
// for the test binary, if the only (non-test-barrier) action that depends
// on the link action is the run action.
func testRunAction(a *Action) *Action {
if len(a.triggers) != 1 || a.triggers[0].Mode != "test barrier" {
return nil
}
var runAction *Action
for _, t := range a.triggers[0].triggers {
if t.Mode == "test run" {
if runAction != nil {
return nil
}
runAction = t
}
}
return runAction
}
// useCache tries to satisfy the action a, which has action ID actionHash,
// by using a cached result from an earlier build.
// If useCache decides that the cache can be used, it sets a.buildID
@ -526,7 +545,7 @@ func (b *Builder) useCache(a *Action, actionHash cache.ActionID, target string,
// then to avoid the link step, report the link as up-to-date.
// We avoid the nested build ID problem in the previous special case
// by recording the test results in the cache under the action ID half.
if len(a.triggers) == 1 && a.triggers[0].TryCache != nil && a.triggers[0].TryCache(b, a.triggers[0]) {
if ra := testRunAction(a); ra != nil && ra.TryCache != nil && ra.TryCache(b, ra, a) {
// Best effort attempt to display output from the compile and link steps.
// If it doesn't work, it doesn't work: reusing the test result is more
// important than reprinting diagnostic information.

View file

@ -36,8 +36,9 @@ func (b *Builder) CovData(a *Action, cmdargs ...any) ([]byte, error) {
// but will be empty; in this case the return is an empty string.
func BuildActionCoverMetaFile(runAct *Action) (string, error) {
p := runAct.Package
for i := range runAct.Deps {
pred := runAct.Deps[i]
barrierAct := runAct.Deps[0]
for i := range barrierAct.Deps {
pred := barrierAct.Deps[i]
if pred.Mode != "build" || pred.Package == nil {
continue
}

View file

@ -183,8 +183,22 @@ func (b *Builder) Do(ctx context.Context, root *Action) {
for _, a0 := range a.triggers {
if a.Failed != nil {
if a0.Mode == "test barrier" {
// If this action was triggered by a test, there
// will be a test barrier action in between the test
// and the true trigger. But there will be other
// triggers that are other barriers that are waiting
// for this one. Propagate the failure to the true
// trigger, but not to the other barriers.
for _, bt := range a0.triggers {
if bt.Mode != "test barrier" {
bt.Failed = a.Failed
}
}
} else {
a0.Failed = a.Failed
}
}
if a0.pending--; a0.pending == 0 {
b.ready.push(a0)
b.readySema <- true

View file

@ -892,8 +892,6 @@ var optab = []Optab{
{obj.ANOP, C_LCON, C_NONE, C_NONE, C_NONE, C_NONE, 0, 0, 0, 0, 0}, // nop variants, see #40689
{obj.ANOP, C_ZREG, C_NONE, C_NONE, C_NONE, C_NONE, 0, 0, 0, 0, 0},
{obj.ANOP, C_VREG, C_NONE, C_NONE, C_NONE, C_NONE, 0, 0, 0, 0, 0},
{obj.ADUFFZERO, C_NONE, C_NONE, C_NONE, C_SBRA, C_NONE, 5, 4, 0, 0, 0}, // same as AB/ABL
{obj.ADUFFCOPY, C_NONE, C_NONE, C_NONE, C_SBRA, C_NONE, 5, 4, 0, 0, 0}, // same as AB/ABL
{obj.APCALIGN, C_LCON, C_NONE, C_NONE, C_NONE, C_NONE, 0, 0, 0, 0, 0}, // align code
{obj.APCALIGNMAX, C_LCON, C_NONE, C_NONE, C_LCON, C_NONE, 0, 0, 0, 0, 0}, // align code, conditional
}
@ -3297,9 +3295,7 @@ func buildop(ctxt *obj.Link) {
obj.AFUNCDATA,
obj.APCALIGN,
obj.APCALIGNMAX,
obj.APCDATA,
obj.ADUFFZERO,
obj.ADUFFCOPY:
obj.APCDATA:
break
}
}
@ -6971,7 +6967,7 @@ func (c *ctxt7) opbra(p *obj.Prog, a obj.As) uint32 {
case AB:
return 0<<31 | 5<<26 /* imm26 */
case obj.ADUFFZERO, obj.ADUFFCOPY, ABL:
case ABL:
return 1<<31 | 5<<26
}

View file

@ -319,11 +319,7 @@ func progedit(ctxt *obj.Link, p *obj.Prog, newprog obj.ProgAlloc) {
// Rewrite BR/BL to symbol as TYPE_BRANCH.
switch p.As {
case AB,
ABL,
obj.ARET,
obj.ADUFFZERO,
obj.ADUFFCOPY:
case AB, ABL, obj.ARET:
if p.To.Sym != nil {
p.To.Type = obj.TYPE_BRANCH
}
@ -400,39 +396,6 @@ func progedit(ctxt *obj.Link, p *obj.Prog, newprog obj.ProgAlloc) {
// Rewrite p, if necessary, to access global data via the global offset table.
func (c *ctxt7) rewriteToUseGot(p *obj.Prog) {
if p.As == obj.ADUFFCOPY || p.As == obj.ADUFFZERO {
// ADUFFxxx $offset
// becomes
// MOVD runtime.duffxxx@GOT, REGTMP
// ADD $offset, REGTMP
// CALL REGTMP
var sym *obj.LSym
if p.As == obj.ADUFFZERO {
sym = c.ctxt.LookupABI("runtime.duffzero", obj.ABIInternal)
} else {
sym = c.ctxt.LookupABI("runtime.duffcopy", obj.ABIInternal)
}
offset := p.To.Offset
p.As = AMOVD
p.From.Type = obj.TYPE_MEM
p.From.Name = obj.NAME_GOTREF
p.From.Sym = sym
p.To.Type = obj.TYPE_REG
p.To.Reg = REGTMP
p.To.Name = obj.NAME_NONE
p.To.Offset = 0
p.To.Sym = nil
p1 := obj.Appendp(p, c.newprog)
p1.As = AADD
p1.From.Type = obj.TYPE_CONST
p1.From.Offset = offset
p1.To.Type = obj.TYPE_REG
p1.To.Reg = REGTMP
p2 := obj.Appendp(p1, c.newprog)
p2.As = obj.ACALL
p2.To.Type = obj.TYPE_REG
p2.To.Reg = REGTMP
}
// We only care about global data: NAME_EXTERN means a global
// symbol in the Go sense, and p.Sym.Local is true for a few
@ -543,9 +506,7 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) {
case obj.ATEXT:
p.Mark |= LEAF
case ABL,
obj.ADUFFZERO,
obj.ADUFFCOPY:
case ABL:
c.cursym.Func().Text.Mark &^= LEAF
}
}
@ -912,110 +873,7 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym, newprog obj.ProgAlloc) {
p.From.Type = obj.TYPE_MEM
p.From.Reg = REGSP
}
case obj.ADUFFCOPY:
// ADR ret_addr, R27
// STP (FP, R27), -24(SP)
// SUB 24, SP, FP
// DUFFCOPY
// ret_addr:
// SUB 8, SP, FP
q1 := p
// copy DUFFCOPY from q1 to q4
q4 := obj.Appendp(p, c.newprog)
q4.Pos = p.Pos
q4.As = obj.ADUFFCOPY
q4.To = p.To
q1.As = AADR
q1.From.Type = obj.TYPE_BRANCH
q1.To.Type = obj.TYPE_REG
q1.To.Reg = REG_R27
q2 := obj.Appendp(q1, c.newprog)
q2.Pos = p.Pos
q2.As = ASTP
q2.From.Type = obj.TYPE_REGREG
q2.From.Reg = REGFP
q2.From.Offset = int64(REG_R27)
q2.To.Type = obj.TYPE_MEM
q2.To.Reg = REGSP
q2.To.Offset = -24
// maintain FP for DUFFCOPY
q3 := obj.Appendp(q2, c.newprog)
q3.Pos = p.Pos
q3.As = ASUB
q3.From.Type = obj.TYPE_CONST
q3.From.Offset = 24
q3.Reg = REGSP
q3.To.Type = obj.TYPE_REG
q3.To.Reg = REGFP
q5 := obj.Appendp(q4, c.newprog)
q5.Pos = p.Pos
q5.As = ASUB
q5.From.Type = obj.TYPE_CONST
q5.From.Offset = 8
q5.Reg = REGSP
q5.To.Type = obj.TYPE_REG
q5.To.Reg = REGFP
q1.From.SetTarget(q5)
p = q5
case obj.ADUFFZERO:
// ADR ret_addr, R27
// STP (FP, R27), -24(SP)
// SUB 24, SP, FP
// DUFFZERO
// ret_addr:
// SUB 8, SP, FP
q1 := p
// copy DUFFZERO from q1 to q4
q4 := obj.Appendp(p, c.newprog)
q4.Pos = p.Pos
q4.As = obj.ADUFFZERO
q4.To = p.To
q1.As = AADR
q1.From.Type = obj.TYPE_BRANCH
q1.To.Type = obj.TYPE_REG
q1.To.Reg = REG_R27
q2 := obj.Appendp(q1, c.newprog)
q2.Pos = p.Pos
q2.As = ASTP
q2.From.Type = obj.TYPE_REGREG
q2.From.Reg = REGFP
q2.From.Offset = int64(REG_R27)
q2.To.Type = obj.TYPE_MEM
q2.To.Reg = REGSP
q2.To.Offset = -24
// maintain FP for DUFFZERO
q3 := obj.Appendp(q2, c.newprog)
q3.Pos = p.Pos
q3.As = ASUB
q3.From.Type = obj.TYPE_CONST
q3.From.Offset = 24
q3.Reg = REGSP
q3.To.Type = obj.TYPE_REG
q3.To.Reg = REGFP
q5 := obj.Appendp(q4, c.newprog)
q5.Pos = p.Pos
q5.As = ASUB
q5.From.Type = obj.TYPE_CONST
q5.From.Offset = 8
q5.Reg = REGSP
q5.To.Type = obj.TYPE_REG
q5.To.Reg = REGFP
q1.From.SetTarget(q5)
p = q5
}
if p.To.Type == obj.TYPE_REG && p.To.Reg == REGSP && p.Spadj == 0 {
f := c.cursym.Func()
if f.FuncFlag&abi.FuncFlagSPWrite == 0 {

View file

@ -4013,15 +4013,6 @@ func (ab *AsmBuf) mediaop(ctxt *obj.Link, o *Optab, op int, osize int, z int) in
return z
}
var bpduff1 = []byte{
0x48, 0x89, 0x6c, 0x24, 0xf0, // MOVQ BP, -16(SP)
0x48, 0x8d, 0x6c, 0x24, 0xf0, // LEAQ -16(SP), BP
}
var bpduff2 = []byte{
0x48, 0x8b, 0x6d, 0x00, // MOVQ 0(BP), BP
}
// asmevex emits EVEX pregis and opcode byte.
// In addition to asmvex r/m, vvvv and reg fields also requires optional
// K-masking register.
@ -4859,16 +4850,6 @@ func (ab *AsmBuf) doasm(ctxt *obj.Link, cursym *obj.LSym, p *obj.Prog) {
ctxt.Diag("directly calling duff when dynamically linking Go")
}
if yt.zcase == Zcallduff && ctxt.Arch.Family == sys.AMD64 {
// Maintain BP around call, since duffcopy/duffzero can't do it
// (the call jumps into the middle of the function).
// This makes it possible to see call sites for duffcopy/duffzero in
// BP-based profiling tools like Linux perf (which is the
// whole point of maintaining frame pointers in Go).
// MOVQ BP, -16(SP)
// LEAQ -16(SP), BP
ab.Put(bpduff1)
}
ab.Put1(byte(op))
cursym.AddRel(ctxt, obj.Reloc{
Type: objabi.R_CALL,
@ -4879,12 +4860,6 @@ func (ab *AsmBuf) doasm(ctxt *obj.Link, cursym *obj.LSym, p *obj.Prog) {
})
ab.PutInt32(0)
if yt.zcase == Zcallduff && ctxt.Arch.Family == sys.AMD64 {
// Pop BP pushed above.
// MOVQ 0(BP), BP
ab.Put(bpduff2)
}
// TODO: jump across functions needs reloc
case Zbr, Zjmp, Zloop:
if p.As == AXBEGIN {

View file

@ -16,6 +16,7 @@ import (
// between each string.
//
// A non-nil error returned by Join implements the Unwrap() []error method.
// The errors may be inspected with [Is] and [As].
func Join(errs ...error) error {
n := 0
for _, err := range errs {

View file

@ -622,12 +622,22 @@ func (fd *FD) Pread(b []byte, off int64) (int, error) {
fd.l.Lock()
defer fd.l.Unlock()
if fd.isBlocking {
curoffset, err := syscall.Seek(fd.Sysfd, 0, io.SeekCurrent)
if err != nil {
return 0, err
}
defer syscall.Seek(fd.Sysfd, curoffset, io.SeekStart)
defer fd.setOffset(curoffset)
} else {
// Overlapped handles don't have the file pointer updated
// when performing I/O operations, so there is no need to
// call Seek to reset the file pointer.
// Also, some overlapped file handles don't support seeking.
// See https://go.dev/issues/74951.
curoffset := fd.offset
defer fd.setOffset(curoffset)
}
o := &fd.rop
o.InitBuf(b)
fd.setOffset(off)
@ -847,12 +857,22 @@ func (fd *FD) Pwrite(buf []byte, off int64) (int, error) {
fd.l.Lock()
defer fd.l.Unlock()
if fd.isBlocking {
curoffset, err := syscall.Seek(fd.Sysfd, 0, io.SeekCurrent)
if err != nil {
return 0, err
}
defer syscall.Seek(fd.Sysfd, curoffset, io.SeekStart)
defer fd.setOffset(curoffset)
} else {
// Overlapped handles don't have the file pointer updated
// when performing I/O operations, so there is no need to
// call Seek to reset the file pointer.
// Also, some overlapped file handles don't support seeking.
// See https://go.dev/issues/74951.
curoffset := fd.offset
defer fd.setOffset(curoffset)
}
var ntotal int
for {
@ -1107,6 +1127,12 @@ func (fd *FD) Seek(offset int64, whence int) (int64, error) {
fd.l.Lock()
defer fd.l.Unlock()
if !fd.isBlocking && whence == io.SeekCurrent {
// Windows doesn't keep the file pointer for overlapped file handles.
// We track it ourselves to account for any read or write
// operations that may have occurred.
offset += fd.offset
}
n, err := syscall.Seek(fd.Sysfd, offset, whence)
fd.setOffset(n)
return n, err
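
The offset bookkeeping above can be illustrated in isolation. Below is a minimal standalone Go sketch (not internal/poll, with an in-memory stand-in for the OS file) of the idea: the operating system never advances the file pointer of an overlapped handle, so the wrapper records its own offset after each positioned read and folds it into io.SeekCurrent requests.

```
package main

import (
	"fmt"
	"io"
)

// overlappedFile mimics an overlapped handle: positioned I/O does not move
// any OS-side file pointer, so we keep our own offset.
type overlappedFile struct {
	data   []byte
	offset int64 // our view of the current position
}

// readAt reads at an explicit offset and advances only our own offset.
func (f *overlappedFile) readAt(p []byte, off int64) int {
	n := copy(p, f.data[off:])
	f.offset = off + int64(n)
	return n
}

// seek resolves io.SeekCurrent against our tracked offset, since the OS
// pointer would not reflect earlier reads or writes.
func (f *overlappedFile) seek(offset int64, whence int) int64 {
	if whence == io.SeekCurrent {
		offset += f.offset
	}
	f.offset = offset
	return offset
}

func main() {
	f := &overlappedFile{data: []byte("hello world")}
	buf := make([]byte, 5)
	f.readAt(buf, 0)
	fmt.Println(f.seek(0, io.SeekCurrent)) // 5
}
```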

View file

@ -8,6 +8,7 @@ package maps
import (
"internal/abi"
"internal/goarch"
"internal/runtime/math"
"unsafe"
)
@ -127,8 +128,7 @@ func (t *table) maxGrowthLeft() uint16 {
// single-group tables, we could fill all slots.
return t.capacity - 1
} else {
if t.capacity*maxAvgGroupLoad < t.capacity {
// TODO(prattmic): Do something cleaner.
if t.capacity > math.MaxUint16/maxAvgGroupLoad {
panic("overflow")
}
return (t.capacity * maxAvgGroupLoad) / abi.MapGroupSlots
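
A quick worked example of why the division-based check is the cleaner one (a standalone sketch; maxAvgGroupLoad = 7 is assumed here purely for illustration): the old "a*b < a" wraparound test can miss a uint16 overflow, while comparing against MaxUint16/b cannot.

```
package main

import "fmt"

const maxAvgGroupLoad = 7 // assumed value, for illustration only

func main() {
	const maxUint16 = ^uint16(0)
	var capacity uint16 = 12000
	// 12000*7 = 84000, which wraps to 18464 in uint16 arithmetic.
	wrapCheck := capacity*maxAvgGroupLoad < capacity // false: overflow missed
	divCheck := capacity > maxUint16/maxAvgGroupLoad // true: overflow caught
	fmt.Println(wrapCheck, divCheck)
}
```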

View file

@ -7,6 +7,7 @@ package math
import "internal/goarch"
const (
MaxUint16 = ^uint16(0)
MaxUint32 = ^uint32(0)
MaxUint64 = ^uint64(0)
MaxUintptr = ^uintptr(0)

View file

@ -383,34 +383,34 @@ func TestChannelMovedOutOfBubble(t *testing.T) {
for _, test := range []struct {
desc string
f func(chan struct{})
wantPanic string
wantFatal string
}{{
desc: "receive",
f: func(ch chan struct{}) {
<-ch
},
wantPanic: "receive on synctest channel from outside bubble",
wantFatal: "receive on synctest channel from outside bubble",
}, {
desc: "send",
f: func(ch chan struct{}) {
ch <- struct{}{}
},
wantPanic: "send on synctest channel from outside bubble",
wantFatal: "send on synctest channel from outside bubble",
}, {
desc: "close",
f: func(ch chan struct{}) {
close(ch)
},
wantPanic: "close of synctest channel from outside bubble",
wantFatal: "close of synctest channel from outside bubble",
}} {
t.Run(test.desc, func(t *testing.T) {
// Bubbled channel accessed from outside any bubble.
t.Run("outside_bubble", func(t *testing.T) {
wantFatal(t, test.wantFatal, func() {
donec := make(chan struct{})
ch := make(chan chan struct{})
go func() {
defer close(donec)
defer wantPanic(t, test.wantPanic)
test.f(<-ch)
}()
synctest.Run(func() {
@ -418,15 +418,16 @@ func TestChannelMovedOutOfBubble(t *testing.T) {
})
<-donec
})
})
// Bubbled channel accessed from a different bubble.
t.Run("different_bubble", func(t *testing.T) {
wantFatal(t, test.wantFatal, func() {
donec := make(chan struct{})
ch := make(chan chan struct{})
go func() {
defer close(donec)
c := <-ch
synctest.Run(func() {
defer wantPanic(t, test.wantPanic)
test.f(c)
})
}()
@ -436,6 +437,7 @@ func TestChannelMovedOutOfBubble(t *testing.T) {
<-donec
})
})
})
}
}
@ -443,32 +445,32 @@ func TestTimerFromInsideBubble(t *testing.T) {
for _, test := range []struct {
desc string
f func(tm *time.Timer)
wantPanic string
wantFatal string
}{{
desc: "read channel",
f: func(tm *time.Timer) {
<-tm.C
},
wantPanic: "receive on synctest channel from outside bubble",
wantFatal: "receive on synctest channel from outside bubble",
}, {
desc: "Reset",
f: func(tm *time.Timer) {
tm.Reset(1 * time.Second)
},
wantPanic: "reset of synctest timer from outside bubble",
wantFatal: "reset of synctest timer from outside bubble",
}, {
desc: "Stop",
f: func(tm *time.Timer) {
tm.Stop()
},
wantPanic: "stop of synctest timer from outside bubble",
wantFatal: "stop of synctest timer from outside bubble",
}} {
t.Run(test.desc, func(t *testing.T) {
wantFatal(t, test.wantFatal, func() {
donec := make(chan struct{})
ch := make(chan *time.Timer)
go func() {
defer close(donec)
defer wantPanic(t, test.wantPanic)
test.f(<-ch)
}()
synctest.Run(func() {
@ -477,6 +479,7 @@ func TestTimerFromInsideBubble(t *testing.T) {
})
<-donec
})
})
}
}

View file

@ -44,6 +44,10 @@ func (b *batch) isSyncBatch(ver version.Version) bool {
(tracev2.EventType(b.data[0]) == tracev2.EvSync && ver >= version.Go125))
}
func (b *batch) isEndOfGeneration() bool {
return b.exp == tracev2.NoExperiment && len(b.data) > 0 && tracev2.EventType(b.data[0]) == tracev2.EvEndOfGeneration
}
// readBatch reads the next full batch from r.
func readBatch(r interface {
io.Reader
@ -54,6 +58,9 @@ func readBatch(r interface {
if err != nil {
return batch{}, 0, err
}
if typ := tracev2.EventType(b); typ == tracev2.EvEndOfGeneration {
return batch{m: NoThread, exp: tracev2.NoExperiment, data: []byte{b}}, 0, nil
}
if typ := tracev2.EventType(b); typ != tracev2.EvEventBatch && typ != tracev2.EvExperimentalBatch {
return batch{}, 0, fmt.Errorf("expected batch event, got event %d", typ)
}

View file

@ -843,7 +843,6 @@ func (e Event) String() string {
fmt.Fprintf(&sb, " Task=%d Category=%q Message=%q", l.Task, l.Category, l.Message)
case EventStateTransition:
s := e.StateTransition()
fmt.Fprintf(&sb, " Resource=%s Reason=%q", s.Resource, s.Reason)
switch s.Resource.Kind {
case ResourceGoroutine:
id := s.Resource.Goroutine()
@ -854,6 +853,7 @@ func (e Event) String() string {
old, new := s.Proc()
fmt.Fprintf(&sb, " ProcID=%d %s->%s", id, old, new)
}
fmt.Fprintf(&sb, " Reason=%q", s.Reason)
if s.Stack != NoStack {
fmt.Fprintln(&sb)
fmt.Fprintln(&sb, "TransitionStack=")
@ -879,7 +879,7 @@ func (e Event) String() string {
fmt.Fprintf(&sb, " Trace=%d Mono=%d Wall=%s",
s.ClockSnapshot.Trace,
s.ClockSnapshot.Mono,
s.ClockSnapshot.Wall.Format(time.RFC3339),
s.ClockSnapshot.Wall.Format(time.RFC3339Nano),
)
}
}

View file

@ -9,6 +9,7 @@ import (
"bytes"
"cmp"
"encoding/binary"
"errors"
"fmt"
"io"
"slices"
@ -32,22 +33,102 @@ type generation struct {
*evTable
}
// readGeneration buffers and decodes the structural elements of a trace generation
// out of r.
func readGeneration(r *bufio.Reader, ver version.Version) (*generation, error) {
if ver < version.Go126 {
return nil, errors.New("internal error: readGeneration called for <1.26 trace")
}
g := &generation{
evTable: &evTable{
pcs: make(map[uint64]frame),
},
batches: make(map[ThreadID][]batch),
}
// Read batches one at a time until we hit the end-of-generation signal or EOF.
for {
b, gen, err := readBatch(r)
if err == io.EOF {
if len(g.batches) != 0 {
return nil, errors.New("incomplete generation found; trace likely truncated")
}
return nil, nil // All done.
}
if err != nil {
return nil, err
}
if g.gen == 0 {
// Initialize gen.
g.gen = gen
}
if b.isEndOfGeneration() {
break
}
if gen == 0 {
// 0 is a sentinel used by the runtime, so we'll never see it.
return nil, fmt.Errorf("invalid generation number %d", gen)
}
if gen != g.gen {
return nil, fmt.Errorf("broken trace: missing end-of-generation event, or generations are interleaved")
}
if g.minTs == 0 || b.time < g.minTs {
g.minTs = b.time
}
if err := processBatch(g, b, ver); err != nil {
return nil, err
}
}
// Check some invariants.
if g.freq == 0 {
return nil, fmt.Errorf("no frequency event found")
}
if !g.hasClockSnapshot {
return nil, fmt.Errorf("no clock snapshot event found")
}
// N.B. Trust that the batch order is correct. We can't validate the batch order
// by timestamp because the timestamps could just be plain wrong. The source of
// truth is the order things appear in the trace and the partial order sequence
// numbers on certain events. If it turns out the batch order is actually incorrect
// we'll very likely fail to advance a partial order from the frontier.
// Compactify stacks and strings for better lookup performance later.
g.stacks.compactify()
g.strings.compactify()
// Validate stacks.
if err := validateStackStrings(&g.stacks, &g.strings, g.pcs); err != nil {
return nil, err
}
// Now that we have the frequency, fix up CPU samples.
fixUpCPUSamples(g.cpuSamples, g.freq)
return g, nil
}
// spilledBatch represents a batch that was read out for the next generation,
// while reading the previous one. It's passed on when parsing the next
// generation.
//
// Used only for trace versions < Go126.
type spilledBatch struct {
gen uint64
*batch
}
// readGeneration buffers and decodes the structural elements of a trace generation
// readGenerationWithSpill buffers and decodes the structural elements of a trace generation
// out of r. spill is the first batch of the new generation (already buffered and
// parsed from reading the last generation). Returns the generation and the first
// batch read of the next generation, if any.
//
// If gen is non-nil, it is valid and must be processed before handling the returned
// error.
func readGeneration(r *bufio.Reader, spill *spilledBatch, ver version.Version) (*generation, *spilledBatch, error) {
func readGenerationWithSpill(r *bufio.Reader, spill *spilledBatch, ver version.Version) (*generation, *spilledBatch, error) {
if ver >= version.Go126 {
return nil, nil, errors.New("internal error: readGenerationWithSpill called for Go 1.26+ trace")
}
g := &generation{
evTable: &evTable{
pcs: make(map[uint64]frame),
@ -56,6 +137,7 @@ func readGeneration(r *bufio.Reader, spill *spilledBatch, ver version.Version) (
}
// Process the spilled batch.
if spill != nil {
// Process the spilled batch, which contains real data.
g.gen = spill.gen
g.minTs = spill.batch.time
if err := processBatch(g, *spill.batch, ver); err != nil {
@ -63,8 +145,7 @@ func readGeneration(r *bufio.Reader, spill *spilledBatch, ver version.Version) (
}
spill = nil
}
// Read batches one at a time until we either hit EOF or
// the next generation.
// Read batches one at a time until we hit EOF or the next generation.
var spillErr error
for {
b, gen, err := readBatch(r)
@ -73,7 +154,7 @@ func readGeneration(r *bufio.Reader, spill *spilledBatch, ver version.Version) (
}
if err != nil {
if g.gen != 0 {
// This is an error reading the first batch of the next generation.
// This may be an error reading the first batch of the next generation.
// This is fine. Let's forge ahead assuming that what we've got so
// far is fine.
spillErr = err
@ -89,7 +170,8 @@ func readGeneration(r *bufio.Reader, spill *spilledBatch, ver version.Version) (
// Initialize gen.
g.gen = gen
}
if gen == g.gen+1 { // TODO: advance this the same way the runtime does.
if gen == g.gen+1 {
// TODO: Increment the generation with wraparound the same way the runtime does.
spill = &spilledBatch{gen: gen, batch: &b}
break
}
@ -134,15 +216,8 @@ func readGeneration(r *bufio.Reader, spill *spilledBatch, ver version.Version) (
return nil, nil, err
}
// Fix up the CPU sample timestamps, now that we have freq.
for i := range g.cpuSamples {
s := &g.cpuSamples[i]
s.time = g.freq.mul(timestamp(s.time))
}
// Sort the CPU samples.
slices.SortFunc(g.cpuSamples, func(a, b cpuSample) int {
return cmp.Compare(a.time, b.time)
})
// Now that we have the frequency, fix up CPU samples.
fixUpCPUSamples(g.cpuSamples, g.freq)
return g, spill, spillErr
}
@ -174,6 +249,8 @@ func processBatch(g *generation, b batch, ver version.Version) error {
if err := addExperimentalBatch(g.expBatches, b); err != nil {
return err
}
case b.isEndOfGeneration():
return errors.New("internal error: unexpectedly processing EndOfGeneration; broken trace?")
default:
if _, ok := g.batches[b.m]; !ok {
g.batchMs = append(g.batchMs, b.m)
@ -512,3 +589,15 @@ func addExperimentalBatch(expBatches map[tracev2.Experiment][]ExperimentalBatch,
})
return nil
}
func fixUpCPUSamples(samples []cpuSample, freq frequency) {
// Fix up the CPU sample timestamps.
for i := range samples {
s := &samples[i]
s.time = freq.mul(timestamp(s.time))
}
// Sort the CPU samples.
slices.SortFunc(samples, func(a, b cpuSample) int {
return cmp.Compare(a.time, b.time)
})
}

View file

@ -322,6 +322,14 @@ func (g *Generation) writeEventsTo(tw *raw.TextWriter) {
}
}
b.writeEventsTo(tw)
// Write end-of-generation event if necessary.
if g.trace.ver >= version.Go126 {
tw.WriteEvent(raw.Event{
Version: g.trace.ver,
Ev: tracev2.EvEndOfGeneration,
})
}
}
func (g *Generation) newStructuralBatch() *Batch {

View file

@ -6,6 +6,7 @@ package trace
import (
"bufio"
"errors"
"fmt"
"io"
"slices"
@ -26,15 +27,26 @@ type Reader struct {
r *bufio.Reader
lastTs Time
gen *generation
spill *spilledBatch
spillErr error // error from reading spill
spillErrSync bool // whether we emitted a Sync before reporting spillErr
frontier []*batchCursor
cpuSamples []cpuSample
order ordering
syncs int
readGenErr error
done bool
// Spill state.
//
// Traces before Go 1.26 had no explicit end-of-generation signal, and
// so the first batch of the next generation needed to be parsed to identify
// a new generation. This batch is the "spill", kept so we don't lose track
// of it when parsing the next generation.
//
// This is unnecessary after Go 1.26 because of an explicit end-of-generation
// signal.
spill *spilledBatch
spillErr error // error from reading spill
spillErrSync bool // whether we emitted a Sync before reporting spillErr
v1Events *traceV1Converter
}
@ -54,7 +66,7 @@ func NewReader(r io.Reader) (*Reader, error) {
return &Reader{
v1Events: convertV1Trace(tr),
}, nil
case version.Go122, version.Go123, version.Go125:
case version.Go122, version.Go123, version.Go125, version.Go126:
return &Reader{
version: v,
r: br,
@ -139,52 +151,23 @@ func (r *Reader) ReadEvent() (e Event, err error) {
// Check if we need to refresh the generation.
if len(r.frontier) == 0 && len(r.cpuSamples) == 0 {
if r.spillErr != nil {
if r.spillErrSync {
return Event{}, r.spillErr
if r.version < version.Go126 {
return r.nextGenWithSpill()
}
r.spillErrSync = true
r.syncs++
return syncEvent(nil, r.lastTs, r.syncs), nil
if r.readGenErr != nil {
return Event{}, r.readGenErr
}
if r.gen != nil && r.spill == nil {
// If we have a generation from the last read,
// and there's nothing left in the frontier, and
// there's no spilled batch, indicating that there's
// no further generation, it means we're done.
// Emit the final sync event.
r.done = true
r.syncs++
return syncEvent(nil, r.lastTs, r.syncs), nil
}
// Read the next generation.
r.gen, r.spill, r.spillErr = readGeneration(r.r, r.spill, r.version)
if r.gen == nil {
r.spillErrSync = true
r.syncs++
return syncEvent(nil, r.lastTs, r.syncs), nil
}
// Reset CPU samples cursor.
r.cpuSamples = r.gen.cpuSamples
// Reset frontier.
for _, m := range r.gen.batchMs {
batches := r.gen.batches[m]
bc := &batchCursor{m: m}
ok, err := bc.nextEvent(batches, r.gen.freq)
gen, err := readGeneration(r.r, r.version)
if err != nil {
return Event{}, err
}
if !ok {
// Turns out there aren't actually any events in these batches.
continue
}
r.frontier = heapInsert(r.frontier, bc)
}
// Before returning an error, emit the sync event
// for the current generation and queue up the error
// for the next call.
r.readGenErr = err
r.gen = nil
r.syncs++
// Always emit a sync event at the beginning of the generation.
return syncEvent(r.gen.evTable, r.gen.freq.mul(r.gen.minTs), r.syncs), nil
return syncEvent(nil, r.lastTs, r.syncs), nil
}
return r.installGen(gen)
}
tryAdvance := func(i int) (bool, error) {
bc := r.frontier[i]
@ -251,6 +234,78 @@ func (r *Reader) ReadEvent() (e Event, err error) {
return ev, nil
}
// nextGenWithSpill reads the generation and calls nextGen while
// also handling any spilled batches.
func (r *Reader) nextGenWithSpill() (Event, error) {
if r.version >= version.Go126 {
return Event{}, errors.New("internal error: nextGenWithSpill called for Go 1.26+ trace")
}
if r.spillErr != nil {
if r.spillErrSync {
return Event{}, r.spillErr
}
r.spillErrSync = true
r.syncs++
return syncEvent(nil, r.lastTs, r.syncs), nil
}
if r.gen != nil && r.spill == nil {
// If we have a generation from the last read,
// and there's nothing left in the frontier, and
// there's no spilled batch, indicating that there's
// no further generation, it means we're done.
// Emit the final sync event.
r.done = true
r.syncs++
return syncEvent(nil, r.lastTs, r.syncs), nil
}
// Read the next generation.
var gen *generation
gen, r.spill, r.spillErr = readGenerationWithSpill(r.r, r.spill, r.version)
if gen == nil {
r.gen = nil
r.spillErrSync = true
r.syncs++
return syncEvent(nil, r.lastTs, r.syncs), nil
}
return r.installGen(gen)
}
// installGen installs the new generation into the Reader and returns
// a Sync event for the new generation.
func (r *Reader) installGen(gen *generation) (Event, error) {
if gen == nil {
// Emit the final sync event.
r.gen = nil
r.done = true
r.syncs++
return syncEvent(nil, r.lastTs, r.syncs), nil
}
r.gen = gen
// Reset CPU samples cursor.
r.cpuSamples = r.gen.cpuSamples
// Reset frontier.
for _, m := range r.gen.batchMs {
batches := r.gen.batches[m]
bc := &batchCursor{m: m}
ok, err := bc.nextEvent(batches, r.gen.freq)
if err != nil {
return Event{}, err
}
if !ok {
// Turns out there aren't actually any events in these batches.
continue
}
r.frontier = heapInsert(r.frontier, bc)
}
r.syncs++
// Always emit a sync event at the beginning of the generation.
return syncEvent(r.gen.evTable, r.gen.freq.mul(r.gen.minTs), r.syncs), nil
}
func dumpFrontier(frontier []*batchCursor) string {
var sb strings.Builder
for _, bc := range frontier {

View file

@ -87,8 +87,8 @@ const (
EvSync // start of a sync batch [...EvFrequency|EvClockSnapshot]
EvClockSnapshot // snapshot of trace, mono and wall clocks [timestamp, mono, sec, nsec]
// Reserved internal in-band end-of-generation signal. Must never appear in the trace. Added in Go 1.25.
// This could be used as an explicit in-band end-of-generation signal in the future.
// In-band end-of-generation signal. Added in Go 1.26.
// In Go 1.25 it was used only internally.
EvEndOfGeneration
NumEvents

View file

@ -21,7 +21,8 @@ const (
Go122 Version = 22 // v2
Go123 Version = 23 // v2
Go125 Version = 25 // v2
Current = Go125
Go126 Version = 26 // v2
Current = Go126
)
var versions = map[Version][]tracev2.EventSpec{
@ -33,7 +34,8 @@ var versions = map[Version][]tracev2.EventSpec{
Go122: tracev2.Specs()[:tracev2.EvUserLog+1], // All events after are Go 1.23+.
Go123: tracev2.Specs()[:tracev2.EvExperimentalBatch+1], // All events after are Go 1.25+.
Go125: tracev2.Specs(),
Go125: tracev2.Specs()[:tracev2.EvClockSnapshot+1], // All events after are Go 1.26+.
Go126: tracev2.Specs(),
}
// Specs returns the set of event.Specs for this version.

View file

@ -710,6 +710,11 @@ func TestIPv6WriteMsgUDPAddrPortTargetAddrIPVersion(t *testing.T) {
// WriteMsgUDPAddrPort accepts IPv4 and IPv4-mapped IPv6 destination addresses,
// and rejects IPv6 destination addresses on a "udp4" connection.
func TestIPv4WriteMsgUDPAddrPortTargetAddrIPVersion(t *testing.T) {
switch runtime.GOOS {
case "plan9":
t.Skipf("not supported on %s", runtime.GOOS)
}
if !testableNetwork("udp4") {
t.Skipf("skipping: udp4 not available")
}

View file

@ -1845,6 +1845,72 @@ func TestFile(t *testing.T) {
}
}
func TestFileOverlappedSeek(t *testing.T) {
t.Parallel()
name := filepath.Join(t.TempDir(), "foo")
f := newFileOverlapped(t, name, true)
content := []byte("foo")
if _, err := f.Write(content); err != nil {
t.Fatal(err)
}
// Check that the file pointer is at the expected offset.
n, err := f.Seek(0, io.SeekCurrent)
if err != nil {
t.Fatal(err)
}
if n != int64(len(content)) {
t.Errorf("expected file pointer to be at offset %d, got %d", len(content), n)
}
// Set the file pointer to the start of the file.
if _, err := f.Seek(0, io.SeekStart); err != nil {
t.Fatal(err)
}
// Read the first byte.
var buf [1]byte
if _, err := f.Read(buf[:]); err != nil {
t.Fatal(err)
}
if !bytes.Equal(buf[:], content[:len(buf)]) {
t.Errorf("expected %q, got %q", content[:len(buf)], buf[:])
}
// Check that the file pointer is at the expected offset.
n, err = f.Seek(0, io.SeekCurrent)
if err != nil {
t.Fatal(err)
}
if n != int64(len(buf)) {
t.Errorf("expected file pointer to be at offset %d, got %d", len(buf), n)
}
}
func TestFileOverlappedReadAtVolume(t *testing.T) {
// Test that we can use File.ReadAt with an overlapped volume handle.
// See https://go.dev/issues/74951.
t.Parallel()
name := `\\.\` + filepath.VolumeName(t.TempDir())
namep, err := syscall.UTF16PtrFromString(name)
if err != nil {
t.Fatal(err)
}
h, err := syscall.CreateFile(namep,
syscall.GENERIC_READ|syscall.GENERIC_WRITE,
syscall.FILE_SHARE_WRITE|syscall.FILE_SHARE_READ,
nil, syscall.OPEN_ALWAYS, syscall.FILE_FLAG_OVERLAPPED, 0)
if err != nil {
if errors.Is(err, syscall.ERROR_ACCESS_DENIED) {
t.Skip("skipping test: access denied")
}
t.Fatal(err)
}
f := os.NewFile(uintptr(h), name)
defer f.Close()
var buf [0]byte
if _, err := f.ReadAt(buf[:], 0); err != nil {
t.Fatal(err)
}
}
func TestPipe(t *testing.T) {
t.Parallel()
r, w, err := os.Pipe()

View file

@ -105,7 +105,7 @@ func MkdirTemp(dir, pattern string) (string, error) {
if try++; try < 10000 {
continue
}
return "", &PathError{Op: "mkdirtemp", Path: dir + string(PathSeparator) + prefix + "*" + suffix, Err: ErrExist}
return "", &PathError{Op: "mkdirtemp", Path: prefix + "*" + suffix, Err: ErrExist}
}
if IsNotExist(err) {
if _, err := Stat(dir); IsNotExist(err) {

View file

@ -191,7 +191,7 @@ func chansend(c *hchan, ep unsafe.Pointer, block bool, callerpc uintptr) bool {
}
if c.bubble != nil && getg().bubble != c.bubble {
panic(plainError("send on synctest channel from outside bubble"))
fatal("send on synctest channel from outside bubble")
}
// Fast path: check for failed non-blocking operation without acquiring the lock.
@ -318,7 +318,7 @@ func chansend(c *hchan, ep unsafe.Pointer, block bool, callerpc uintptr) bool {
func send(c *hchan, sg *sudog, ep unsafe.Pointer, unlockf func(), skip int) {
if c.bubble != nil && getg().bubble != c.bubble {
unlockf()
panic(plainError("send on synctest channel from outside bubble"))
fatal("send on synctest channel from outside bubble")
}
if raceenabled {
if c.dataqsiz == 0 {
@ -416,7 +416,7 @@ func closechan(c *hchan) {
panic(plainError("close of nil channel"))
}
if c.bubble != nil && getg().bubble != c.bubble {
panic(plainError("close of synctest channel from outside bubble"))
fatal("close of synctest channel from outside bubble")
}
lock(&c.lock)
@ -538,7 +538,7 @@ func chanrecv(c *hchan, ep unsafe.Pointer, block bool) (selected, received bool)
}
if c.bubble != nil && getg().bubble != c.bubble {
panic(plainError("receive on synctest channel from outside bubble"))
fatal("receive on synctest channel from outside bubble")
}
if c.timer != nil {
@ -702,7 +702,7 @@ func chanrecv(c *hchan, ep unsafe.Pointer, block bool) (selected, received bool)
func recv(c *hchan, sg *sudog, ep unsafe.Pointer, unlockf func(), skip int) {
if c.bubble != nil && getg().bubble != c.bubble {
unlockf()
panic(plainError("receive on synctest channel from outside bubble"))
fatal("receive on synctest channel from outside bubble")
}
if c.dataqsiz == 0 {
if raceenabled {

View file

@ -177,7 +177,7 @@ func totalMutexWaitTimeNanos() int64 {
// NumGoroutine returns the number of goroutines that currently exist.
func NumGoroutine() int {
return int(gcount())
return int(gcount(false))
}
//go:linkname debug_modinfo runtime/debug.modinfo

View file

@ -1,427 +0,0 @@
// Code generated by mkduff.go; DO NOT EDIT.
// Run go generate from src/runtime to update.
// See mkduff.go for comments.
#include "textflag.h"
TEXT runtime·duffzero<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-0
MOVUPS X15,(DI)
MOVUPS X15,16(DI)
MOVUPS X15,32(DI)
MOVUPS X15,48(DI)
LEAQ 64(DI),DI
MOVUPS X15,(DI)
MOVUPS X15,16(DI)
MOVUPS X15,32(DI)
MOVUPS X15,48(DI)
LEAQ 64(DI),DI
MOVUPS X15,(DI)
MOVUPS X15,16(DI)
MOVUPS X15,32(DI)
MOVUPS X15,48(DI)
LEAQ 64(DI),DI
MOVUPS X15,(DI)
MOVUPS X15,16(DI)
MOVUPS X15,32(DI)
MOVUPS X15,48(DI)
LEAQ 64(DI),DI
MOVUPS X15,(DI)
MOVUPS X15,16(DI)
MOVUPS X15,32(DI)
MOVUPS X15,48(DI)
LEAQ 64(DI),DI
MOVUPS X15,(DI)
MOVUPS X15,16(DI)
MOVUPS X15,32(DI)
MOVUPS X15,48(DI)
LEAQ 64(DI),DI
MOVUPS X15,(DI)
MOVUPS X15,16(DI)
MOVUPS X15,32(DI)
MOVUPS X15,48(DI)
LEAQ 64(DI),DI
MOVUPS X15,(DI)
MOVUPS X15,16(DI)
MOVUPS X15,32(DI)
MOVUPS X15,48(DI)
LEAQ 64(DI),DI
MOVUPS X15,(DI)
MOVUPS X15,16(DI)
MOVUPS X15,32(DI)
MOVUPS X15,48(DI)
LEAQ 64(DI),DI
MOVUPS X15,(DI)
MOVUPS X15,16(DI)
MOVUPS X15,32(DI)
MOVUPS X15,48(DI)
LEAQ 64(DI),DI
MOVUPS X15,(DI)
MOVUPS X15,16(DI)
MOVUPS X15,32(DI)
MOVUPS X15,48(DI)
LEAQ 64(DI),DI
MOVUPS X15,(DI)
MOVUPS X15,16(DI)
MOVUPS X15,32(DI)
MOVUPS X15,48(DI)
LEAQ 64(DI),DI
MOVUPS X15,(DI)
MOVUPS X15,16(DI)
MOVUPS X15,32(DI)
MOVUPS X15,48(DI)
LEAQ 64(DI),DI
MOVUPS X15,(DI)
MOVUPS X15,16(DI)
MOVUPS X15,32(DI)
MOVUPS X15,48(DI)
LEAQ 64(DI),DI
MOVUPS X15,(DI)
MOVUPS X15,16(DI)
MOVUPS X15,32(DI)
MOVUPS X15,48(DI)
LEAQ 64(DI),DI
MOVUPS X15,(DI)
MOVUPS X15,16(DI)
MOVUPS X15,32(DI)
MOVUPS X15,48(DI)
LEAQ 64(DI),DI
RET
TEXT runtime·duffcopy<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-0
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
MOVUPS (SI), X0
ADDQ $16, SI
MOVUPS X0, (DI)
ADDQ $16, DI
RET

View file

@ -1,267 +0,0 @@
// Code generated by mkduff.go; DO NOT EDIT.
// Run go generate from src/runtime to update.
// See mkduff.go for comments.
#include "textflag.h"
TEXT runtime·duffzero<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-0
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP.P (ZR, ZR), 16(R20)
STP (ZR, ZR), (R20)
RET
TEXT runtime·duffcopy<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-0
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
LDP.P 16(R20), (R26, R27)
STP.P (R26, R27), 16(R21)
RET

View file

@ -8,6 +8,7 @@ package runtime
import (
"internal/godebugs"
"internal/runtime/atomic"
"internal/runtime/gc"
"unsafe"
)
@ -465,9 +466,45 @@ func initMetrics() {
},
},
"/sched/goroutines:goroutines": {
compute: func(_ *statAggregate, out *metricValue) {
deps: makeStatDepSet(schedStatsDep),
compute: func(in *statAggregate, out *metricValue) {
out.kind = metricKindUint64
out.scalar = uint64(gcount())
out.scalar = uint64(in.schedStats.gTotal)
},
},
"/sched/goroutines/not-in-go:goroutines": {
deps: makeStatDepSet(schedStatsDep),
compute: func(in *statAggregate, out *metricValue) {
out.kind = metricKindUint64
out.scalar = uint64(in.schedStats.gNonGo)
},
},
"/sched/goroutines/running:goroutines": {
deps: makeStatDepSet(schedStatsDep),
compute: func(in *statAggregate, out *metricValue) {
out.kind = metricKindUint64
out.scalar = uint64(in.schedStats.gRunning)
},
},
"/sched/goroutines/runnable:goroutines": {
deps: makeStatDepSet(schedStatsDep),
compute: func(in *statAggregate, out *metricValue) {
out.kind = metricKindUint64
out.scalar = uint64(in.schedStats.gRunnable)
},
},
"/sched/goroutines/waiting:goroutines": {
deps: makeStatDepSet(schedStatsDep),
compute: func(in *statAggregate, out *metricValue) {
out.kind = metricKindUint64
out.scalar = uint64(in.schedStats.gWaiting)
},
},
"/sched/goroutines-created:goroutines": {
deps: makeStatDepSet(schedStatsDep),
compute: func(in *statAggregate, out *metricValue) {
out.kind = metricKindUint64
out.scalar = uint64(in.schedStats.gCreated)
},
},
"/sched/latencies:seconds": {
@ -495,6 +532,13 @@ func initMetrics() {
sched.stwTotalTimeOther.write(out)
},
},
"/sched/threads/total:threads": {
deps: makeStatDepSet(schedStatsDep),
compute: func(in *statAggregate, out *metricValue) {
out.kind = metricKindUint64
out.scalar = uint64(in.schedStats.threads)
},
},
"/sync/mutex/wait/total:seconds": {
compute: func(_ *statAggregate, out *metricValue) {
out.kind = metricKindFloat64
@ -547,6 +591,7 @@ const (
cpuStatsDep // corresponds to cpuStatsAggregate
gcStatsDep // corresponds to gcStatsAggregate
finalStatsDep // corresponds to finalStatsAggregate
schedStatsDep // corresponds to schedStatsAggregate
numStatsDeps
)
@ -740,6 +785,88 @@ func (a *finalStatsAggregate) compute() {
a.cleanupsQueued, a.cleanupsExecuted = gcCleanups.readQueueStats()
}
// schedStatsAggregate contains stats about the scheduler, including
// an approximate count of goroutines in each state.
type schedStatsAggregate struct {
gTotal uint64
gRunning uint64
gRunnable uint64
gNonGo uint64
gWaiting uint64
gCreated uint64
threads uint64
}
// compute populates the schedStatsAggregate with values from the runtime.
func (a *schedStatsAggregate) compute() {
// Lock the scheduler so the global run queue can't change and
// the number of Ps can't change. This doesn't prevent the
// local run queues from changing, so the results are still
// approximate.
lock(&sched.lock)
// The total count of threads owned by Go is the number of Ms
// minus extra Ms on the list or in use.
a.threads = uint64(mcount()) - uint64(extraMInUse.Load()) - uint64(extraMLength.Load())
// Collect running/runnable from per-P run queues.
a.gCreated += sched.goroutinesCreated.Load()
for _, p := range allp {
if p == nil || p.status == _Pdead {
break
}
a.gCreated += p.goroutinesCreated
switch p.status {
case _Prunning:
a.gRunning++
case _Psyscall:
a.gNonGo++
case _Pgcstop:
// The world is stopping or stopped.
// This is fine. The results will be
// slightly odd since nothing else
// is running, but it will be accurate.
}
for {
h := atomic.Load(&p.runqhead)
t := atomic.Load(&p.runqtail)
next := atomic.Loaduintptr((*uintptr)(&p.runnext))
runnable := int32(t - h)
if atomic.Load(&p.runqhead) != h || runnable < 0 {
continue
}
if next != 0 {
runnable++
}
a.gRunnable += uint64(runnable)
break
}
}
// Global run queue.
a.gRunnable += uint64(sched.runq.size)
// Account for Gs that are in _Gsyscall without a P in _Psyscall.
nGsyscallNoP := sched.nGsyscallNoP.Load()
// nGsyscallNoP can go negative during temporary races.
if nGsyscallNoP >= 0 {
a.gNonGo += uint64(nGsyscallNoP)
}
// Compute the number of blocked goroutines. We have to
// include system goroutines in this count because we included
// them above.
a.gTotal = uint64(gcount(true))
// The per-state counts are approximate and may transiently exceed the
// total, so guard against unsigned underflow rather than testing a
// uint64 for a negative value.
if busy := a.gRunning + a.gRunnable + a.gNonGo; busy < a.gTotal {
a.gWaiting = a.gTotal - busy
} else {
a.gWaiting = 0
}
unlock(&sched.lock)
}
// nsToSec takes a duration in nanoseconds and converts it to seconds as
// a float64.
func nsToSec(ns int64) float64 {
@ -758,6 +885,7 @@ type statAggregate struct {
cpuStats cpuStatsAggregate
gcStats gcStatsAggregate
finalStats finalStatsAggregate
schedStats schedStatsAggregate
}
// ensure populates statistics aggregates determined by deps if they
@ -782,6 +910,8 @@ func (a *statAggregate) ensure(deps *statDepSet) {
a.gcStats.compute()
case finalStatsDep:
a.finalStats.compute()
case schedStatsDep:
a.schedStats.compute()
}
}
a.ensured = a.ensured.union(missing)

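An aside on the run-queue length loop in schedStatsAggregate.compute above: it takes an optimistic snapshot of runqhead and runqtail and retries if the head moved or the computed length came out negative. Below is a minimal standalone sketch of that read pattern using sync/atomic instead of the runtime-internal atomics, and ignoring runnext; it is an illustration only, not runtime code.
```
package main

import (
	"fmt"
	"sync/atomic"
)

// queueLen approximates the length of a ring buffer whose producer
// advances tail and whose consumers advance head, without locking.
func queueLen(head, tail *atomic.Uint32) uint32 {
	for {
		h := head.Load()
		t := tail.Load()
		n := int32(t - h)
		// Retry if the head moved under us or the snapshot was torn.
		if head.Load() != h || n < 0 {
			continue
		}
		return uint32(n)
	}
}

func main() {
	var head, tail atomic.Uint32
	tail.Store(5)
	head.Store(2)
	fmt.Println(queueLen(&head, &tail)) // 3
}
```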
View file

@ -437,6 +437,32 @@ var allDesc = []Description{
Description: "The current runtime.GOMAXPROCS setting, or the number of operating system threads that can execute user-level Go code simultaneously.",
Kind: KindUint64,
},
{
Name: "/sched/goroutines-created:goroutines",
Description: "Count of goroutines created since program start.",
Cumulative: true,
Kind: KindUint64,
},
{
Name: "/sched/goroutines/not-in-go:goroutines",
Description: "Approximate count of goroutines running or blocked in a system call or cgo call. Not guaranteed to add up to /sched/goroutines:goroutines with other goroutine metrics.",
Kind: KindUint64,
},
{
Name: "/sched/goroutines/runnable:goroutines",
Description: "Approximate count of goroutines ready to execute, but not executing. Not guaranteed to add up to /sched/goroutines:goroutines with other goroutine metrics.",
Kind: KindUint64,
},
{
Name: "/sched/goroutines/running:goroutines",
Description: "Approximate count of goroutines executing. Always less than or equal to /sched/gomaxprocs:threads. Not guaranteed to add up to /sched/goroutines:goroutines with other goroutine metrics.",
Kind: KindUint64,
},
{
Name: "/sched/goroutines/waiting:goroutines",
Description: "Approximate count of goroutines waiting on a resource (I/O or sync primitives). Not guaranteed to add up to /sched/goroutines:goroutines with other goroutine metrics.",
Kind: KindUint64,
},
{
Name: "/sched/goroutines:goroutines",
Description: "Count of live goroutines.",
@ -472,6 +498,11 @@ var allDesc = []Description{
Kind: KindFloat64Histogram,
Cumulative: true,
},
{
Name: "/sched/threads/total:threads",
Description: "The current count of live threads that are owned by the Go runtime.",
Kind: KindUint64,
},
{
Name: "/sync/mutex/wait/total:seconds",
Description: "Approximate cumulative time goroutines have spent blocked on a sync.Mutex, sync.RWMutex, or runtime-internal lock. This metric is useful for identifying global changes in lock contention. Collect a mutex or block profile using the runtime/pprof package for more detailed contention data.",

View file

@ -509,6 +509,29 @@ Below is the full list of supported metrics, ordered lexicographically.
operating system threads that can execute user-level Go code
simultaneously.
/sched/goroutines-created:goroutines
Count of goroutines created since program start.
/sched/goroutines/not-in-go:goroutines
Approximate count of goroutines running or blocked in
a system call or cgo call. Not guaranteed to add up to
/sched/goroutines:goroutines with other goroutine metrics.
/sched/goroutines/runnable:goroutines
Approximate count of goroutines ready to execute,
but not executing. Not guaranteed to add up to
/sched/goroutines:goroutines with other goroutine metrics.
/sched/goroutines/running:goroutines
Approximate count of goroutines executing. Always less than or
equal to /sched/gomaxprocs:threads. Not guaranteed to add up to
/sched/goroutines:goroutines with other goroutine metrics.
/sched/goroutines/waiting:goroutines
Approximate count of goroutines waiting on a resource
(I/O or sync primitives). Not guaranteed to add up to
/sched/goroutines:goroutines with other goroutine metrics.
/sched/goroutines:goroutines
Count of live goroutines.
@ -549,6 +572,10 @@ Below is the full list of supported metrics, ordered lexicographically.
/sched/pauses/stopping/other:seconds). Bucket counts increase
monotonically.
/sched/threads/total:threads
The current count of live threads that are owned by the Go
runtime.
/sync/mutex/wait/total:seconds
Approximate cumulative time goroutines have spent blocked on a
sync.Mutex, sync.RWMutex, or runtime-internal lock. This metric

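To make the new metric names concrete, here is a small, hedged usage sketch with the runtime/metrics package. The names are the ones documented above; the KindBad check merely skips metrics that an older runtime does not export.
```
package main

import (
	"fmt"
	"runtime/metrics"
)

func main() {
	samples := []metrics.Sample{
		{Name: "/sched/goroutines-created:goroutines"},
		{Name: "/sched/goroutines/running:goroutines"},
		{Name: "/sched/goroutines/runnable:goroutines"},
		{Name: "/sched/goroutines/waiting:goroutines"},
		{Name: "/sched/goroutines/not-in-go:goroutines"},
		{Name: "/sched/threads/total:threads"},
	}
	metrics.Read(samples)
	for _, s := range samples {
		if s.Value.Kind() == metrics.KindBad {
			// Metric not supported by this runtime.
			continue
		}
		fmt.Printf("%-45s %d\n", s.Name, s.Value.Uint64())
	}
}
```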
View file

@ -22,6 +22,7 @@ import (
"strings"
"sync"
"sync/atomic"
"syscall"
"testing"
"time"
"unsafe"
@ -1575,3 +1576,219 @@ func TestReadMetricsFinalizers(t *testing.T) {
t.Errorf("expected %s difference to be exactly %d, got %d -> %d", before[1].Name, N, v0, v1)
}
}
func TestReadMetricsSched(t *testing.T) {
const (
notInGo = iota
runnable
running
waiting
created
threads
numSamples
)
var s [numSamples]metrics.Sample
s[notInGo].Name = "/sched/goroutines/not-in-go:goroutines"
s[runnable].Name = "/sched/goroutines/runnable:goroutines"
s[running].Name = "/sched/goroutines/running:goroutines"
s[waiting].Name = "/sched/goroutines/waiting:goroutines"
s[created].Name = "/sched/goroutines-created:goroutines"
s[threads].Name = "/sched/threads/total:threads"
logMetrics := func(t *testing.T, s []metrics.Sample) {
for i := range s {
t.Logf("%s: %d", s[i].Name, s[i].Value.Uint64())
}
}
// generalSlack is the number of goroutines we allow ourselves to be
// off by in any given category, either due to background system
// goroutines or testing package goroutines.
const generalSlack = 4
// waitingSlack is the max number of blocked goroutines left
// from other tests, the testing package, or system
// goroutines.
const waitingSlack = 100
// threadsSlack is the maximum number of threads left over
// from other tests and the runtime (sysmon, the template thread, etc.)
const threadsSlack = 20
// Make sure GC isn't running, since GC workers interfere with
// expected counts.
defer debug.SetGCPercent(debug.SetGCPercent(-1))
runtime.GC()
check := func(t *testing.T, s *metrics.Sample, min, max uint64) {
val := s.Value.Uint64()
if val < min {
t.Errorf("%s too low; %d < %d", s.Name, val, min)
}
if val > max {
t.Errorf("%s too high; %d > %d", s.Name, val, max)
}
}
checkEq := func(t *testing.T, s *metrics.Sample, value uint64) {
check(t, s, value, value)
}
spinUntil := func(f func() bool, timeout time.Duration) bool {
start := time.Now()
for time.Since(start) < timeout {
if f() {
return true
}
time.Sleep(time.Millisecond)
}
return false
}
// Check base values.
t.Run("base", func(t *testing.T) {
defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(1))
metrics.Read(s[:])
logMetrics(t, s[:])
check(t, &s[notInGo], 0, generalSlack)
check(t, &s[runnable], 0, generalSlack)
checkEq(t, &s[running], 1)
check(t, &s[waiting], 0, waitingSlack)
})
metrics.Read(s[:])
createdAfterBase := s[created].Value.Uint64()
// Force Running count to be high. We'll use these goroutines
// for Runnable, too.
const count = 10
var ready, exit atomic.Uint32
for i := 0; i < count-1; i++ {
go func() {
ready.Add(1)
for exit.Load() == 0 {
// Spin to get us and keep us running, but check
// the exit condition so we exit out early if we're
// done.
start := time.Now()
for time.Since(start) < 10*time.Millisecond && exit.Load() == 0 {
}
runtime.Gosched()
}
}()
}
for ready.Load() < count-1 {
runtime.Gosched()
}
// Be careful. We've entered a dangerous state for platforms
// that do not return to the underlying system unless all
// goroutines are blocked, like js/wasm, since we have a bunch
// of runnable goroutines all spinning. We cannot write anything
// out.
if testenv.HasParallelism() {
t.Run("created", func(t *testing.T) {
metrics.Read(s[:])
logMetrics(t, s[:])
checkEq(t, &s[created], createdAfterBase+count)
})
t.Run("running", func(t *testing.T) {
defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(count + 4))
// It can take a little bit for the scheduler to
// distribute the goroutines to Ps, so retry for a
// while.
spinUntil(func() bool {
metrics.Read(s[:])
return s[running].Value.Uint64() >= count
}, time.Second)
logMetrics(t, s[:])
check(t, &s[running], count, count+4)
check(t, &s[threads], count, count+4+threadsSlack)
})
// Force runnable count to be high.
t.Run("runnable", func(t *testing.T) {
defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(1))
metrics.Read(s[:])
logMetrics(t, s[:])
checkEq(t, &s[running], 1)
check(t, &s[runnable], count-1, count+generalSlack)
})
// Done with the running/runnable goroutines.
exit.Store(1)
} else {
// Read metrics and then exit all the other goroutines,
// so that system calls may proceed.
metrics.Read(s[:])
// Done with the running/runnable goroutines.
exit.Store(1)
// Now we can check our invariants.
t.Run("created", func(t *testing.T) {
// Look for count-1 goroutines because we read metrics
// *before* the t.Run goroutine was created for this sub-test.
checkEq(t, &s[created], createdAfterBase+count-1)
})
t.Run("running", func(t *testing.T) {
logMetrics(t, s[:])
checkEq(t, &s[running], 1)
checkEq(t, &s[threads], 1)
})
t.Run("runnable", func(t *testing.T) {
logMetrics(t, s[:])
check(t, &s[runnable], count-1, count+generalSlack)
})
}
// Force not-in-go count to be high. This is a little tricky since
// we try really hard not to let things block in system calls.
// We have to drop to the syscall package to do this reliably.
t.Run("not-in-go", func(t *testing.T) {
// Block a bunch of goroutines on an OS pipe.
pr, pw, err := pipe()
if err != nil {
switch runtime.GOOS {
case "js", "wasip1":
t.Skip("creating pipe:", err)
}
t.Fatal("creating pipe:", err)
}
for i := 0; i < count; i++ {
go syscall.Read(pr, make([]byte, 1))
}
// Let the goroutines block.
spinUntil(func() bool {
metrics.Read(s[:])
return s[notInGo].Value.Uint64() >= count
}, time.Second)
metrics.Read(s[:])
logMetrics(t, s[:])
check(t, &s[notInGo], count, count+generalSlack)
syscall.Close(pw)
syscall.Close(pr)
})
t.Run("waiting", func(t *testing.T) {
// Force waiting count to be high.
const waitingCount = 1000
stop := make(chan bool)
for i := 0; i < waitingCount; i++ {
go func() { <-stop }()
}
// Let the goroutines block.
spinUntil(func() bool {
metrics.Read(s[:])
return s[waiting].Value.Uint64() >= waitingCount
}, time.Second)
metrics.Read(s[:])
logMetrics(t, s[:])
check(t, &s[waiting], waitingCount, waitingCount+waitingSlack)
close(stop)
})
}

View file

@ -1521,18 +1521,15 @@ func gcBgMarkWorker(ready chan struct{}) {
}
systemstack(func() {
// Mark our goroutine preemptible so its stack
// can be scanned or observed by the execution
// tracer. This, for example, lets two mark workers
// scan each other (otherwise, they would
// deadlock). We must not modify anything on
// the G stack. However, stack shrinking is
// disabled for mark workers, so it is safe to
// read from the G stack.
// Mark our goroutine preemptible so its stack can be scanned or observed
// by the execution tracer. This, for example, lets two mark workers scan
// each other (otherwise, they would deadlock).
//
// N.B. The execution tracer is not aware of this status
// transition and handles it specially based on the
// wait reason.
// casGToWaitingForSuspendG marks the goroutine as ineligible for a
// stack shrink, effectively pinning the stack in memory for the duration.
//
// N.B. The execution tracer is not aware of this status transition and
// handles it specially based on the wait reason.
casGToWaitingForSuspendG(gp, _Grunning, waitReasonGCWorkerActive)
switch pp.gcMarkWorkerMode {
default:

View file

@ -32,10 +32,8 @@ import (
)
func main() {
gen("amd64", notags, zeroAMD64, copyAMD64)
gen("386", notags, zero386, copy386)
gen("arm", notags, zeroARM, copyARM)
gen("arm64", notags, zeroARM64, copyARM64)
gen("loong64", notags, zeroLOONG64, copyLOONG64)
gen("ppc64x", tagsPPC64x, zeroPPC64x, copyPPC64x)
gen("mips64x", tagsMIPS64x, zeroMIPS64x, copyMIPS64x)

View file

@ -610,10 +610,30 @@ func genLoong64(g *gen) {
l.add(movf, reg, regsize)
}
// save/restore FCC0
// Add condition flag register fcc0-fcc7
sv := ""
rs := ""
last := 7
for i := 0; i <= last; i++ {
msb := 7 + (i * 8)
lsb := 0 + (i * 8)
// MOVV FCCx, R4,
// BSTRINSV $msb, R4, $lsb, R5
sv += fmt.Sprintf("%s FCC%d, R4\n", mov, i)
sv += fmt.Sprintf("BSTRINSV $%d, R4, $%d, R5\n", msb, lsb)
// BSTRPICKV $msb, R5, $lsb, R4
// MOVV R4, FCCx
rs += fmt.Sprintf("BSTRPICKV $%d, R5, $%d, R4\n", msb, lsb)
rs += fmt.Sprintf("%s R4, FCC%d", mov, i)
if i != last {
rs += fmt.Sprintf("\n")
}
}
l.addSpecial(
mov+" FCC0, R4\n"+mov+" R4, %d(R3)",
mov+" %d(R3), R4\n"+mov+" R4, FCC0",
sv+mov+" R5, %d(R3)",
mov+" %d(R3), R5\n"+rs,
regsize)
// allocate frame, save PC of interrupted instruction (in LR)

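For readers unfamiliar with the LoongArch bit-field instructions used above: the generator packs FCC0–FCC7 into a single 64-bit register, with FCCi stored in bits [8i+7:8i] (BSTRINSV inserts, BSTRPICKV extracts). The following plain-Go sketch shows the same packing arithmetic; it only illustrates the bit layout and is not runtime code.
```
package main

import "fmt"

func main() {
	// Pretend values of FCC0..FCC7; each condition flag is 0 or 1.
	fcc := [8]uint64{1, 0, 1, 1, 0, 0, 1, 0}

	// Pack: BSTRINSV $msb, R4, $lsb, R5 inserts the low byte of R4
	// into bits [8i+7:8i] of R5.
	var packed uint64
	for i, v := range fcc {
		packed |= (v & 0xff) << (8 * i)
	}

	// Unpack: BSTRPICKV $msb, R5, $lsb, R4 extracts the same byte back.
	var restored [8]uint64
	for i := range restored {
		restored[i] = (packed >> (8 * i)) & 0xff
	}
	fmt.Printf("packed=%#x restored=%v\n", packed, restored)
}
```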
View file

@ -1308,7 +1308,7 @@ func goroutineProfileWithLabelsConcurrent(p []profilerecord.StackRecord, labels
// allocation estimate without bothering to STW. As long as
// this is close, then we'll only need to STW once (on the next
// call).
return int(gcount()), false
return int(gcount(false)), false
}
semacquire(&goroutineProfile.sema)
@ -1324,7 +1324,7 @@ func goroutineProfileWithLabelsConcurrent(p []profilerecord.StackRecord, labels
// goroutines that can vary between user and system to ensure that the count
// doesn't change during the collection. So, check the finalizer goroutine
// and cleanup goroutines in particular.
n = int(gcount())
n = int(gcount(false))
if fingStatus.Load()&fingRunningFinalizer != 0 {
n++
}

View file

@ -0,0 +1,15 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !windows
package runtime_test
import "syscall"
func pipe() (r, w int, err error) {
var p [2]int
err = syscall.Pipe(p[:])
return p[0], p[1], err
}

View file

@ -0,0 +1,13 @@
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package runtime_test
import "syscall"
func pipe() (r, w syscall.Handle, err error) {
var p [2]syscall.Handle
err = syscall.Pipe(p[:])
return p[0], p[1], err
}

View file

@ -65,10 +65,40 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
MOVD F30, 456(R3)
MOVD F31, 464(R3)
MOVV FCC0, R4
MOVV R4, 472(R3)
BSTRINSV $7, R4, $0, R5
MOVV FCC1, R4
BSTRINSV $15, R4, $8, R5
MOVV FCC2, R4
BSTRINSV $23, R4, $16, R5
MOVV FCC3, R4
BSTRINSV $31, R4, $24, R5
MOVV FCC4, R4
BSTRINSV $39, R4, $32, R5
MOVV FCC5, R4
BSTRINSV $47, R4, $40, R5
MOVV FCC6, R4
BSTRINSV $55, R4, $48, R5
MOVV FCC7, R4
BSTRINSV $63, R4, $56, R5
MOVV R5, 472(R3)
CALL ·asyncPreempt2(SB)
MOVV 472(R3), R4
MOVV 472(R3), R5
BSTRPICKV $7, R5, $0, R4
MOVV R4, FCC0
BSTRPICKV $15, R5, $8, R4
MOVV R4, FCC1
BSTRPICKV $23, R5, $16, R4
MOVV R4, FCC2
BSTRPICKV $31, R5, $24, R4
MOVV R4, FCC3
BSTRPICKV $39, R5, $32, R4
MOVV R4, FCC4
BSTRPICKV $47, R5, $40, R4
MOVV R4, FCC5
BSTRPICKV $55, R5, $48, R4
MOVV R4, FCC6
BSTRPICKV $63, R5, $56, R4
MOVV R4, FCC7
MOVD 464(R3), F31
MOVD 456(R3), F30
MOVD 448(R3), F29

View file

@ -1007,7 +1007,7 @@ func mcommoninit(mp *m, id int64) {
// when it is just in a register or thread-local storage.
mp.alllink = allm
// NumCgoCall() and others iterate over allm w/o schedlock,
// NumCgoCall and others iterate over allm w/o schedlock,
// so we need to publish it safely.
atomicstorep(unsafe.Pointer(&allm), unsafe.Pointer(mp))
unlock(&sched.lock)
@ -1372,6 +1372,9 @@ func casGToWaiting(gp *g, old uint32, reason waitReason) {
// casGToWaitingForSuspendG transitions gp from old to _Gwaiting, and sets the wait reason.
// The wait reason must be a valid isWaitingForSuspendG wait reason.
//
// While a goroutine is in this state, its stack is effectively pinned.
// The garbage collector must not shrink or otherwise mutate the goroutine's stack.
//
// Use this over casgstatus when possible to ensure that a waitreason is set.
func casGToWaitingForSuspendG(gp *g, old uint32, reason waitReason) {
if !reason.isWaitingForSuspendG() {
@ -1608,18 +1611,11 @@ func stopTheWorldWithSema(reason stwReason) worldStop {
// stack while we try to stop the world since otherwise we could get
// in a mutual preemption deadlock.
//
// We must not modify anything on the G stack because a stack shrink
// may occur, now that we switched to _Gwaiting, specifically if we're
// doing this during the mark phase (mark termination excepted, since
// we know that stack scanning is done by that point). A stack shrink
// is otherwise OK though because in order to return from this function
// (and to leave the system stack) we must have preempted all
// goroutines, including any attempting to scan our stack, in which
// case, any stack shrinking will have already completed by the time we
// exit.
// casGToWaitingForSuspendG marks the goroutine as ineligible for a
// stack shrink, effectively pinning the stack in memory for the duration.
//
// N.B. The execution tracer is not aware of this status transition and
// andles it specially based on the wait reason.
// handles it specially based on the wait reason.
casGToWaitingForSuspendG(getg().m.curg, _Grunning, waitReasonStoppingTheWorld)
trace := traceAcquire()
@ -1652,6 +1648,7 @@ func stopTheWorldWithSema(reason stwReason) worldStop {
if trace.ok() {
trace.ProcSteal(pp, false)
}
sched.nGsyscallNoP.Add(1)
pp.syscalltick++
pp.gcStopTime = nanotime()
sched.stopwait--
@ -2106,16 +2103,11 @@ func forEachP(reason waitReason, fn func(*p)) {
// deadlock as we attempt to preempt a goroutine that's trying
// to preempt us (e.g. for a stack scan).
//
// We must not modify anything on the G stack because a stack shrink
// may occur. A stack shrink is otherwise OK though because in order
// to return from this function (and to leave the system stack) we
// must have preempted all goroutines, including any attempting
// to scan our stack, in which case, any stack shrinking will
// have already completed by the time we exit.
// casGToWaitingForSuspendG marks the goroutine as ineligible for a
// stack shrink, effectively pinning the stack in memory for the duration.
//
// N.B. The execution tracer is not aware of this status
// transition and handles it specially based on the
// wait reason.
// N.B. The execution tracer is not aware of this status transition and
// handles it specially based on the wait reason.
casGToWaitingForSuspendG(gp, _Grunning, reason)
forEachPInternal(fn)
casgstatus(gp, _Gwaiting, _Grunning)
@ -2183,6 +2175,7 @@ func forEachPInternal(fn func(*p)) {
trace.ProcSteal(p2, false)
traceRelease(trace)
}
sched.nGsyscallNoP.Add(1)
p2.syscalltick++
handoffp(p2)
} else if trace.ok() {
@ -2456,6 +2449,7 @@ func needm(signal bool) {
// mp.curg is now a real goroutine.
casgstatus(mp.curg, _Gdead, _Gsyscall)
sched.ngsys.Add(-1)
sched.nGsyscallNoP.Add(1)
if !signal {
if trace.ok() {
@ -2591,6 +2585,7 @@ func dropm() {
casgstatus(mp.curg, _Gsyscall, _Gdead)
mp.curg.preemptStop = false
sched.ngsys.Add(1)
sched.nGsyscallNoP.Add(-1)
if !mp.isExtraInSig {
if trace.ok() {
@ -4684,6 +4679,7 @@ func entersyscall_gcwait() {
trace.ProcSteal(pp, true)
traceRelease(trace)
}
sched.nGsyscallNoP.Add(1)
pp.gcStopTime = nanotime()
pp.syscalltick++
if sched.stopwait--; sched.stopwait == 0 {
@ -4716,6 +4712,8 @@ func entersyscallblock() {
gp.m.syscalltick = gp.m.p.ptr().syscalltick
gp.m.p.ptr().syscalltick++
sched.nGsyscallNoP.Add(1)
// Leave SP around for GC and traceback.
pc := sys.GetCallerPC()
sp := sys.GetCallerSP()
@ -4936,6 +4934,7 @@ func exitsyscallfast_pidle() bool {
}
unlock(&sched.lock)
if pp != nil {
sched.nGsyscallNoP.Add(-1)
acquirep(pp)
return true
}
@ -4962,6 +4961,7 @@ func exitsyscall0(gp *g) {
trace.GoSysExit(true)
traceRelease(trace)
}
sched.nGsyscallNoP.Add(-1)
dropg()
lock(&sched.lock)
var pp *p
@ -5262,6 +5262,7 @@ func newproc1(fn *funcval, callergp *g, callerpc uintptr, parked bool, waitreaso
racereleasemergeg(newg, unsafe.Pointer(&labelSync))
}
}
pp.goroutinesCreated++
releasem(mp)
return newg
@ -5537,8 +5538,11 @@ func badunlockosthread() {
throw("runtime: internal error: misuse of lockOSThread/unlockOSThread")
}
func gcount() int32 {
n := int32(atomic.Loaduintptr(&allglen)) - sched.gFree.stack.size - sched.gFree.noStack.size - sched.ngsys.Load()
func gcount(includeSys bool) int32 {
n := int32(atomic.Loaduintptr(&allglen)) - sched.gFree.stack.size - sched.gFree.noStack.size
if !includeSys {
n -= sched.ngsys.Load()
}
for _, pp := range allp {
n -= pp.gFree.size
}
@ -5838,6 +5842,8 @@ func (pp *p) destroy() {
pp.gcAssistTime = 0
gcCleanups.queued += pp.cleanupsQueued
pp.cleanupsQueued = 0
sched.goroutinesCreated.Add(int64(pp.goroutinesCreated))
pp.goroutinesCreated = 0
pp.xRegs.free()
pp.status = _Pdead
}
@ -6413,6 +6419,7 @@ func retake(now int64) uint32 {
trace.ProcSteal(pp, false)
traceRelease(trace)
}
sched.nGsyscallNoP.Add(1)
n++
pp.syscalltick++
handoffp(pp)

View file

@ -764,6 +764,9 @@ type p struct {
// gcStopTime is the nanotime timestamp that this P last entered _Pgcstop.
gcStopTime int64
// goroutinesCreated is the total count of goroutines created by this P.
goroutinesCreated uint64
// xRegs is the per-P extended register state used by asynchronous
// preemption. This is an empty struct on platforms that don't use extended
// register state.
@ -793,6 +796,7 @@ type schedt struct {
nmfreed int64 // cumulative number of freed m's
ngsys atomic.Int32 // number of system goroutines
nGsyscallNoP atomic.Int32 // number of goroutines in syscalls without a P
pidle puintptr // idle p's
npidle atomic.Int32
@ -891,6 +895,10 @@ type schedt struct {
// M, but waiting for locks within the runtime. This field stores the value
// for Ms that have exited.
totalRuntimeLockWaitTime atomic.Int64
// goroutinesCreated (plus the value of goroutinesCreated on each P in allp)
// is the total number of goroutines created by the program.
goroutinesCreated atomic.Uint64
}
// Values for the flags field of a sigTabT.
@ -1217,7 +1225,9 @@ var isIdleInSynctest = [len(waitReasonStrings)]bool{
}
var (
// Linked-list of all Ms. Written under sched.lock, read atomically.
allm *m
gomaxprocs int32
numCPUStartup int32
forcegc forcegcstate

View file

@ -178,7 +178,7 @@ func selectgo(cas0 *scase, order0 *uint16, pc0 *uintptr, nsends, nrecvs int, blo
if cas.c.bubble != nil {
if getg().bubble != cas.c.bubble {
panic(plainError("select on synctest channel from outside bubble"))
fatal("select on synctest channel from outside bubble")
}
} else {
allSynctest = false

View file

@ -1214,15 +1214,18 @@ func isShrinkStackSafe(gp *g) bool {
if gp.parkingOnChan.Load() {
return false
}
// We also can't copy the stack while tracing is enabled, and
// gp is in _Gwaiting solely to make itself available to suspendG.
// We also can't copy the stack while a gp is in _Gwaiting solely
// to make itself available to suspendG.
//
// In these cases, the G is actually executing on the system
// stack, and the execution tracer may want to take a stack trace
// of the G's stack. Note: it's safe to access gp.waitreason here.
// We're only checking if this is true if we took ownership of the
// stack, and the execution tracer, mutex profiler, etc. may want
// to take a stack trace of the G's stack.
//
// Note: it's safe to access gp.waitreason here.
// We're only calling isShrinkStackSafe if we took ownership of the
// G with the _Gscan bit. This prevents the goroutine from transitioning,
// which prevents gp.waitreason from changing.
if traceEnabled() && readgstatus(gp)&^_Gscan == _Gwaiting && gp.waitreason.isWaitingForSuspendG() {
if readgstatus(gp)&^_Gscan == _Gwaiting && gp.waitreason.isWaitingForSuspendG() {
return false
}
return true
@ -1258,12 +1261,6 @@ func shrinkstack(gp *g) {
if debug.gcshrinkstackoff > 0 {
return
}
f := findfunc(gp.startpc)
if f.valid() && f.funcID == abi.FuncID_gcBgMarkWorker {
// We're not allowed to shrink the gcBgMarkWorker
// stack (see gcBgMarkWorker for explanation).
return
}
oldsize := gp.stack.hi - gp.stack.lo
newsize := oldsize / 2

View file

@ -415,7 +415,7 @@ func newTimer(when, period int64, f func(arg any, seq uintptr, delay int64), arg
//go:linkname stopTimer time.stopTimer
func stopTimer(t *timeTimer) bool {
if t.isFake && getg().bubble == nil {
panic("stop of synctest timer from outside bubble")
fatal("stop of synctest timer from outside bubble")
}
return t.stop()
}
@ -430,7 +430,7 @@ func resetTimer(t *timeTimer, when, period int64) bool {
racerelease(unsafe.Pointer(&t.timer))
}
if t.isFake && getg().bubble == nil {
panic("reset of synctest timer from outside bubble")
fatal("reset of synctest timer from outside bubble")
}
return t.reset(when, period)
}

View file

@ -754,24 +754,7 @@ func traceRegisterLabelsAndReasons(gen uintptr) {
// was on has been returned, ReadTrace returns nil. The caller must copy the
// returned data before calling ReadTrace again.
// ReadTrace must be called from one goroutine at a time.
func ReadTrace() []byte {
for {
buf := readTrace()
// Skip over the end-of-generation signal which must not appear
// in the final trace.
if len(buf) == 1 && tracev2.EventType(buf[0]) == tracev2.EvEndOfGeneration {
continue
}
return buf
}
}
// readTrace is the implementation of ReadTrace, except with an additional
// in-band signal as to when the buffer is for a new generation.
//
//go:linkname readTrace runtime/trace.runtime_readTrace
func readTrace() (buf []byte) {
func ReadTrace() (buf []byte) {
top:
var park bool
systemstack(func() {
@ -842,7 +825,7 @@ func readTrace0() (buf []byte, park bool) {
if !trace.headerWritten {
trace.headerWritten = true
unlock(&trace.lock)
return []byte("go 1.25 trace\x00\x00\x00"), false
return []byte("go 1.26 trace\x00\x00\x00"), false
}
// Read the next buffer.

View file

@ -12,72 +12,77 @@ import (
// timestamp is an unprocessed timestamp.
type timestamp uint64
// batch represents a batch of trace events.
// It is unparsed except for its header.
type batch struct {
m threadID
time timestamp
gen uint64
data []byte
}
// threadID is the runtime-internal M structure's ID. This is unique
// for each OS thread.
type threadID int64
// readBatch copies b and parses the trace batch header inside.
// Returns the batch, the generation, bytes read, and an error.
func readBatch(b []byte) (batch, uint64, uint64, error) {
// Returns the batch, bytes read, and an error.
func readBatch(b []byte) (batch, uint64, error) {
if len(b) == 0 {
return batch{}, 0, 0, fmt.Errorf("batch is empty")
return batch{}, 0, fmt.Errorf("batch is empty")
}
data := make([]byte, len(b))
if nw := copy(data, b); nw != len(b) {
return batch{}, 0, 0, fmt.Errorf("unexpected error copying batch")
}
// Read batch header byte.
if typ := tracev2.EventType(b[0]); typ != tracev2.EvEventBatch && typ != tracev2.EvExperimentalBatch {
return batch{}, 0, 1, fmt.Errorf("expected batch event, got event %d", typ)
}
copy(data, b)
// Read the batch header: gen (generation), thread (M) ID, base timestamp
// for the batch.
// Read batch header byte.
if typ := tracev2.EventType(b[0]); typ == tracev2.EvEndOfGeneration {
if len(b) != 1 {
return batch{}, 1, fmt.Errorf("unexpected end of generation in batch of size >1")
}
return batch{data: data}, 1, nil
}
if typ := tracev2.EventType(b[0]); typ != tracev2.EvEventBatch && typ != tracev2.EvExperimentalBatch {
return batch{}, 1, fmt.Errorf("expected batch event, got event %d", typ)
}
total := 1
b = b[1:]
// Read the generation
gen, n, err := readUvarint(b)
if err != nil {
return batch{}, gen, uint64(total + n), fmt.Errorf("error reading batch gen: %w", err)
}
total += n
b = b[n:]
m, n, err := readUvarint(b)
if err != nil {
return batch{}, gen, uint64(total + n), fmt.Errorf("error reading batch M ID: %w", err)
}
total += n
b = b[n:]
ts, n, err := readUvarint(b)
if err != nil {
return batch{}, gen, uint64(total + n), fmt.Errorf("error reading batch timestamp: %w", err)
return batch{}, uint64(total + n), fmt.Errorf("error reading batch gen: %w", err)
}
total += n
b = b[n:]
// Read in the size of the batch to follow.
// Read the M (discard it).
_, n, err = readUvarint(b)
if err != nil {
return batch{}, uint64(total + n), fmt.Errorf("error reading batch M ID: %w", err)
}
total += n
b = b[n:]
// Read the timestamp.
ts, n, err := readUvarint(b)
if err != nil {
return batch{}, uint64(total + n), fmt.Errorf("error reading batch timestamp: %w", err)
}
total += n
b = b[n:]
// Read the size of the batch to follow.
size, n, err := readUvarint(b)
if err != nil {
return batch{}, gen, uint64(total + n), fmt.Errorf("error reading batch size: %w", err)
return batch{}, uint64(total + n), fmt.Errorf("error reading batch size: %w", err)
}
if size > tracev2.MaxBatchSize {
return batch{}, gen, uint64(total + n), fmt.Errorf("invalid batch size %d, maximum is %d", size, tracev2.MaxBatchSize)
return batch{}, uint64(total + n), fmt.Errorf("invalid batch size %d, maximum is %d", size, tracev2.MaxBatchSize)
}
total += n
total += int(size)
if total != len(data) {
return batch{}, uint64(total), fmt.Errorf("expected complete batch")
}
data = data[:total]
// Return the batch.
return batch{
m: threadID(m),
gen: gen,
time: timestamp(ts),
data: data,
}, gen, uint64(total), nil
}, uint64(total), nil
}

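As a standalone illustration of the layout readBatch parses above — one type byte followed by uvarint-encoded generation, M ID, timestamp, and payload size — here is a hedged sketch using encoding/binary in place of the internal readUvarint helper. The header bytes in main are fabricated.
```
package main

import (
	"encoding/binary"
	"fmt"
)

// parseHeader decodes [type byte][gen][M][timestamp][size], where the
// last four fields are uvarints, and reports how many bytes it consumed.
func parseHeader(b []byte) (gen, m, ts, size uint64, n int, err error) {
	if len(b) == 0 {
		return 0, 0, 0, 0, 0, fmt.Errorf("empty batch")
	}
	n = 1 // skip the event type byte
	for i, dst := range []*uint64{&gen, &m, &ts, &size} {
		v, w := binary.Uvarint(b[n:])
		if w <= 0 {
			return 0, 0, 0, 0, n, fmt.Errorf("bad uvarint in header field %d", i)
		}
		*dst = v
		n += w
	}
	return gen, m, ts, size, n, nil
}

func main() {
	hdr := []byte{0x01, 3, 7, 100, 0} // type=1, gen=3, M=7, ts=100, size=0
	gen, m, ts, size, n, err := parseHeader(hdr)
	if err != nil {
		panic(err)
	}
	fmt.Println(gen, m, ts, size, n)
}
```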
View file

@ -141,9 +141,9 @@ func (fr *FlightRecorder) WriteTo(w io.Writer) (n int64, err error) {
// Write all the data.
for _, gen := range gens {
for _, batch := range gen.batches {
for _, data := range gen.batches {
// Write batch data.
nw, err = w.Write(batch.data)
nw, err = w.Write(data)
n += int64(nw)
if err != nil {
return n, err

View file

@ -41,21 +41,21 @@ func (w *recorder) Write(b []byte) (n int, err error) {
if len(b) == n {
return 0, nil
}
ba, gen, nb, err := readBatch(b[n:]) // Every write from the runtime is guaranteed to be a complete batch.
ba, nb, err := readBatch(b[n:]) // Every write from the runtime is guaranteed to be a complete batch.
if err != nil {
return len(b) - int(nb) - n, err
}
n += int(nb)
// Append the batch to the current generation.
if r.active.gen == 0 {
r.active.gen = gen
if ba.gen != 0 && r.active.gen == 0 {
r.active.gen = ba.gen
}
if r.active.minTime == 0 || r.active.minTime > r.freq.mul(ba.time) {
if ba.time != 0 && (r.active.minTime == 0 || r.active.minTime > r.freq.mul(ba.time)) {
r.active.minTime = r.freq.mul(ba.time)
}
r.active.size += len(ba.data)
r.active.batches = append(r.active.batches, ba)
r.active.batches = append(r.active.batches, ba.data)
return len(b), nil
}
@ -99,7 +99,7 @@ type rawGeneration struct {
gen uint64
size int
minTime eventTime
batches []batch
batches [][]byte
}
func traceTimeNow(freq frequency) eventTime {

View file

@ -155,7 +155,7 @@ func (t *traceMultiplexer) startLocked() error {
t.subscribersMu.Unlock()
go func() {
header := runtime_readTrace()
header := runtime.ReadTrace()
if traceStartWriter != nil {
traceStartWriter.Write(header)
}
@ -164,10 +164,16 @@ func (t *traceMultiplexer) startLocked() error {
}
for {
data := runtime_readTrace()
data := runtime.ReadTrace()
if data == nil {
break
}
if traceStartWriter != nil {
traceStartWriter.Write(data)
}
if flightRecorder != nil {
flightRecorder.Write(data)
}
if len(data) == 1 && tracev2.EventType(data[0]) == tracev2.EvEndOfGeneration {
if flightRecorder != nil {
flightRecorder.endGeneration()
@ -187,13 +193,6 @@ func (t *traceMultiplexer) startLocked() error {
if frIsNew {
flightRecorder.Write(header)
}
} else {
if traceStartWriter != nil {
traceStartWriter.Write(data)
}
if flightRecorder != nil {
flightRecorder.Write(data)
}
}
}
}()

View file

@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Trace stack table and acquisition.
// Trace type table.
package runtime
@ -13,7 +13,7 @@ import (
"unsafe"
)
// traceTypeTable maps stack traces (arrays of PC's) to unique uint32 ids.
// traceTypeTable maps types to unique uint32 ids.
// It is lock-free for reading.
type traceTypeTable struct {
tab traceMap

View file

@ -365,11 +365,11 @@ func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, att
if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(mapPipe[1]), 0, 0); err1 != 0 {
goto childerror
}
pid, _, err1 = RawSyscall(SYS_READ, uintptr(mapPipe[0]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
c, _, err1 = RawSyscall(SYS_READ, uintptr(mapPipe[0]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
if err1 != 0 {
goto childerror
}
if pid != unsafe.Sizeof(err2) {
if c != unsafe.Sizeof(err2) {
err1 = EINVAL
goto childerror
}
@ -427,7 +427,7 @@ func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, att
if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&psetgroups[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
goto childerror
}
pid, _, err1 = RawSyscall(SYS_WRITE, fd1, uintptr(unsafe.Pointer(&setgroups[0])), uintptr(len(setgroups)))
_, _, err1 = RawSyscall(SYS_WRITE, fd1, uintptr(unsafe.Pointer(&setgroups[0])), uintptr(len(setgroups)))
if err1 != 0 {
goto childerror
}
@ -438,7 +438,7 @@ func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, att
if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&pgid[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
goto childerror
}
pid, _, err1 = RawSyscall(SYS_WRITE, fd1, uintptr(unsafe.Pointer(&gidmap[0])), uintptr(len(gidmap)))
_, _, err1 = RawSyscall(SYS_WRITE, fd1, uintptr(unsafe.Pointer(&gidmap[0])), uintptr(len(gidmap)))
if err1 != 0 {
goto childerror
}
@ -452,7 +452,7 @@ func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, att
if fd1, _, err1 = RawSyscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(&puid[0])), uintptr(O_WRONLY), 0, 0, 0); err1 != 0 {
goto childerror
}
pid, _, err1 = RawSyscall(SYS_WRITE, fd1, uintptr(unsafe.Pointer(&uidmap[0])), uintptr(len(uidmap)))
_, _, err1 = RawSyscall(SYS_WRITE, fd1, uintptr(unsafe.Pointer(&uidmap[0])), uintptr(len(uidmap)))
if err1 != 0 {
goto childerror
}

View file

@ -410,18 +410,26 @@ func SendmsgN(fd int, p, oob []byte, to Sockaddr, flags int) (n int, err error)
}
func sendmsgNInet4(fd int, p, oob []byte, to *SockaddrInet4, flags int) (n int, err error) {
ptr, salen, err := to.sockaddr()
var ptr unsafe.Pointer
var salen _Socklen
if to != nil {
ptr, salen, err = to.sockaddr()
if err != nil {
return 0, err
}
}
return sendmsgN(fd, p, oob, ptr, salen, flags)
}
func sendmsgNInet6(fd int, p, oob []byte, to *SockaddrInet6, flags int) (n int, err error) {
ptr, salen, err := to.sockaddr()
var ptr unsafe.Pointer
var salen _Socklen
if to != nil {
ptr, salen, err = to.sockaddr()
if err != nil {
return 0, err
}
}
return sendmsgN(fd, p, oob, ptr, salen, flags)
}

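The change above lets the typed sendmsgNInet4 and sendmsgNInet6 helpers accept a nil destination, leaving the sockaddr pointer and length zero so sendmsg is issued without msg_name and the kernel falls back to the socket's connected peer — the same treatment the exported SendmsgN gives a nil Sockaddr. Below is a Linux-flavoured sketch of that usage at the public syscall level; the 127.0.0.1:9 peer is only a placeholder.
```
package main

import (
	"log"
	"syscall"
)

func main() {
	// Create a UDP socket and connect it so the kernel records the peer.
	fd, err := syscall.Socket(syscall.AF_INET, syscall.SOCK_DGRAM, 0)
	if err != nil {
		log.Fatal(err)
	}
	defer syscall.Close(fd)
	peer := &syscall.SockaddrInet4{Port: 9, Addr: [4]byte{127, 0, 0, 1}}
	if err := syscall.Connect(fd, peer); err != nil {
		log.Fatal(err)
	}
	// With a connected socket, the destination may be nil: sendmsg runs
	// with no msg_name and the data goes to the connected peer.
	if _, err := syscall.SendmsgN(fd, []byte("ping"), nil, nil, 0); err != nil {
		log.Fatal(err)
	}
}
```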
View file

@ -108,6 +108,25 @@ func testCanonMap(t *testing.T, newMap func() *canonMap[string]) {
wg.Wait()
}
// Run an extra GC cycle to de-flake. Sometimes the cleanups
// fail to run in time, despite drainCleanupQueue.
//
// TODO(mknyszek): Figure out why the extra GC is necessary,
// and what is transiently keeping the cleanups live.
// * I have confirmed that they are not completely stuck, and
// they always eventually run.
// * I have also confirmed it's not asynchronous preemption
// keeping them around (though that is a possibility).
// * I have confirmed that they are not simply sitting on
// the queue, and that drainCleanupQueue is just failing
// to actually empty the queue.
// * I have confirmed that it's not a write barrier that's
// keeping it alive, nor is it a weak pointer dereference
// (which shades the object during the GC).
// The corresponding objects do seem to be transiently truly
// reachable, but I have no idea by what path.
runtime.GC()
// Drain cleanups so everything is deleted.
drainCleanupQueue(t)

View file

@ -0,0 +1,75 @@
// compile
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package reorder
type Element struct {
A string
B string
C string
D string
E string
Text []string
List []string
Child Elements
F string
G bool
H bool
I string
}
type Elements []Element
func DoesNotCompile(ve Elements) Elements {
aa := Elements{}
bb := Elements{}
cc := Elements{}
dd := Elements{}
ee := Elements{}
ff := Elements{}
gg := Elements{}
hh := Elements{}
ii := Elements{}
if len(ve) != 1 {
return ve
}
for _, e := range ve[0].Child {
if len(e.Text) == 1 && (e.Text[0] == "xx") {
ee = append(ee, e)
} else if len(e.Text) == 1 && e.Text[0] == "yy" {
for _, c := range e.Child {
if len(c.Text) == 1 && c.Text[0] == "zz" {
ii = append(ii, c)
} else {
hh = append(hh, c)
}
}
ii = append(ii, hh...)
e.Child = ii
gg = append(gg, e)
} else if len(e.Text) == 1 && e.Text[0] == "tt" {
for _, entry := range e.Child {
for _, c := range entry.Child {
if len(c.Text) == 1 && c.Text[0] == "ee" {
cc = append(cc, c)
} else {
dd = append(dd, c)
}
}
cc = append(cc, dd...)
entry.Child = cc
bb = append(bb, entry)
cc, dd = Elements{}, Elements{}
}
e.Child = bb
aa = append(aa, e)
} else {
ff = append(ff, e)
}
}
return ve
}