internal/runtime/cgroup: fix path on non-root mount point

We should trim the mount root (4th field in /proc/self/mountinfo) from the cgroup path read from /proc/self/cgroup before appending it to the mount point. Non-root mount points are very common in containers with cgroup v1. parseCPURelativePath is renamed to parseCPUCgroup, as it is unclear what it is relative to. cgroups(7) says "This pathname is relative to the mount point of the hierarchy." It should mean the root of the hierarchy, and we cannot concatenate it to an arbitrary cgroup mount point. So just use the word cgroup, since it parses /proc/self/cgroup. It now returns errMalformedFile if the cgroup pathname does not start with "/", and errPathTooLong if the pathname can't fit into the buffer. We already rely on this when composing the path; this just makes it explicit to avoid incorrect paths. We now parse the cgroup first, then parse the mount point accordingly. We consider the previously read cgroup pathname and version to ensure we get the desired mount point. The out buffer is reused to pass in the cgroup, to avoid extra memory allocation. This should also resolve the race mentioned in the comments, so those comments are removed. If our cgroup changes between the two read syscalls, we will stick with the cgroup read from /proc/self/cgroup. This is the same behavior as a cgroup change after FindCPU() returns, so there is nothing special to comment about now. parseCPUMount now returns an error when the combined path is too long, to avoid a panic or truncation if we get a really long path from mountinfo. cgrouptest is changed to use the dev returned from stat() to detect the filesystem boundary, since we don't return the mount point and sub-path separately now. This also avoids using os.Root since we don't handle untrusted input here. os.Root is too complex, and the performance is bad.
Fixes #76390 Change-Id: Ia9cbd7be3e58a2d51caf27a973fbd201dac06afc Reviewed-on: https://go-review.googlesource.com/c/go/+/723241 Reviewed-by: Michael Pratt <mpratt@google.com> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> Auto-Submit: Michael Knyszek <mknyszek@google.com> Auto-Submit: Michael Pratt <mpratt@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Michael Knyszek <mknyszek@google.com>
2025-12-08 06:10:04 +00:00 · 2025-11-22 01:44:14 +08:00 · 2025-11-22 01:44:14 +08:00 · c2af9f14b4
commit c2af9f14b4
parent 6be5de4bc4
5 changed files with 424 additions and 156 deletions
--- a/src/internal/cgrouptest/cgrouptest_linux.go
+++ b/src/internal/cgrouptest/cgrouptest_linux.go
@ -50,9 +50,8 @@ func (c *CgroupV2) SetCPUMax(quota, period int64) error {
 //
 // This must not be used in parallel tests, as it affects the entire process.
 func InCgroupV2(t *testing.T, fn func(*CgroupV2)) {
-	mount, rel := findCurrent(t)
-	parent := findOwnedParent(t, mount, rel)
-	orig := filepath.Join(mount, rel)
+	orig := findCurrent(t)
+	parent := findOwnedParent(t, orig)

 	// Make sure the parent allows children to control cpu.
 	b, err := os.ReadFile(filepath.Join(parent, "cgroup.subtree_control"))
@ -93,34 +92,25 @@ func InCgroupV2(t *testing.T, fn func(*CgroupV2)) {
 	fn(c)
 }

-// Returns the mount and relative directory of the current cgroup the process
-// is in.
-func findCurrent(t *testing.T) (string, string) {
+// Returns the filesystem path to the current cgroup the process is in.
+func findCurrent(t *testing.T) string {
 	// Find the path to our current CPU cgroup. Currently this package is
 	// only used for CPU cgroup testing, so the distinction of different
 	// controllers doesn't matter.
 	var scratch [cgroup.ParseSize]byte
 	buf := make([]byte, cgroup.PathSize)
-	n, err := cgroup.FindCPUMountPoint(buf, scratch[:])
+	n, ver, err := cgroup.FindCPU(buf, scratch[:])
 	if err != nil {
 		t.Skipf("cgroup: unable to find current cgroup mount: %v", err)
 	}
-	mount := string(buf[:n])
-
-	n, ver, err := cgroup.FindCPURelativePath(buf, scratch[:])
-	if err != nil {
-		t.Skipf("cgroup: unable to find current cgroup path: %v", err)
-	}
 	if ver != cgroup.V2 {
 		t.Skipf("cgroup: running on cgroup v%d want v2", ver)
 	}
-	rel := string(buf[1:n])       // The returned path always starts with /, skip it.
-	rel = filepath.Join(".", rel) // Make sure this isn't empty string at root.
-	return mount, rel
+	return string(buf[:n])
 }

 // Returns a parent directory in which we can create our own cgroup subdirectory.
-func findOwnedParent(t *testing.T, mount, rel string) string {
+func findOwnedParent(t *testing.T, orig string) string {
 	// There are many ways cgroups may be set up on a system. We don't try
 	// to cover all of them, just common ones.
 	//
@ -142,7 +132,7 @@ func findOwnedParent(t *testing.T, mount, rel string) string {

 	// We want to create our own subdirectory that we can migrate into and
 	// then manipulate at will. It is tempting to create a new subdirectory
-	// inside the current cgroup we are already in, however that will likey
+	// inside the current cgroup we are already in, however that will likely
 	// not work. cgroup v2 only allows processes to be in leaf cgroups. Our
 	// current cgroup likely contains multiple processes (at least this one
 	// and the cmd/go test runner). If we make a subdirectory and try to
@ -166,27 +156,29 @@ func findOwnedParent(t *testing.T, mount, rel string) string {
 	// is empty. As far as I tell, the only purpose of this is to allow
 	// reorganizing processes into a new set of subdirectories and then
 	// adding controllers once done.
-	root, err := os.OpenRoot(mount)
+	var stat syscall.Stat_t
+	err := syscall.Stat(orig, &stat)
 	if err != nil {
-		t.Fatalf("error opening cgroup mount root: %v", err)
+		t.Fatalf("error stating orig cgroup: %v", err)
 	}

 	uid := os.Getuid()
 	var prev string
-	for rel != "." {
-		fi, err := root.Stat(rel)
+	cur := filepath.Dir(orig)
+	for cur != "/" {
+		var curStat syscall.Stat_t
+		err = syscall.Stat(cur, &curStat)
 		if err != nil {
 			t.Fatalf("error stating cgroup path: %v", err)
 		}

-		st := fi.Sys().(*syscall.Stat_t)
-		if int(st.Uid) != uid {
-			// Stop at first directory we don't own.
+		if int(curStat.Uid) != uid || curStat.Dev != stat.Dev {
+			// Stop at first directory we don't own or filesystem boundary.
 			break
 		}

-		prev = rel
-		rel = filepath.Join(rel, "..")
+		prev = cur
+		cur = filepath.Dir(cur)
 	}

 	if prev == "" {
@ -194,7 +186,7 @@ func findOwnedParent(t *testing.T, mount, rel string) string {
 	}

 	// We actually want the last directory where we were the owner.
-	return filepath.Join(mount, prev)
+	return prev
 }

 // Migrate the current process to the cgroup directory dst.
--- a/src/internal/runtime/cgroup/cgroup.go
+++ b/src/internal/runtime/cgroup/cgroup.go
@ -102,21 +102,23 @@ func parseV2Limit(buf []byte) (float64, bool, error) {
 	return float64(quota) / float64(period), true, nil
 }

-// Finds the path of the current process's CPU cgroup relative to the cgroup
-// mount and writes it to out.
+// Finds the path of the current process's CPU cgroup and writes it to out.
 //
+// fd is a file descriptor for /proc/self/cgroup.
 // Returns the number of bytes written and the cgroup version (1 or 2).
-func parseCPURelativePath(fd int, read func(fd int, b []byte) (int, uintptr), out []byte, scratch []byte) (int, Version, error) {
+func parseCPUCgroup(fd int, read func(fd int, b []byte) (int, uintptr), out []byte, scratch []byte) (int, Version, error) {
 	// The format of each line is
 	//
 	//   hierarchy-ID:controller-list:cgroup-path
 	//
 	// controller-list is comma-separated.
-	// See man 5 cgroup for more details.
 	//
 	// cgroup v2 has hierarchy-ID 0. If a v1 hierarchy contains "cpu", that
 	// is the CPU controller. Otherwise the v2 hierarchy (if any) is the
-	// CPU controller.
+	// CPU controller. It is not possible to mount the same controller
+	// simultaneously under both the v1 and the v2 hierarchies.
+	//
+	// See man 7 cgroups for more details.
 	//
 	// hierarchy-ID and controller-list have relatively small maximum
 	// sizes, and the path can be up to _PATH_MAX, so we need a bit more
@ -149,7 +151,7 @@ func parseCPURelativePath(fd int, read func(fd int, b []byte) (int, uintptr), ou
 		//   hierarchy-ID:controller-list:cgroup-path
 		//
 		// controller-list is comma-separated.
-		// See man 5 cgroup for more details.
+		// See man 7 cgroups for more details.
 		i := bytealg.IndexByte(line, ':')
 		if i < 0 {
 			return 0, 0, errMalformedFile
@ -167,6 +169,15 @@ func parseCPURelativePath(fd int, read func(fd int, b []byte) (int, uintptr), ou
 		line = line[i+1:]

 		path := line
+		if len(path) == 0 || path[0] != '/' {
+			// We rely on this when composing the full path.
+			return 0, 0, errMalformedFile
+		}
+		if len(path) > len(out) {
+			// Should not be possible. If we really get a very long cgroup path,
+			// read /proc/self/cgroup will fail with ENAMETOOLONG.
+			return 0, 0, errPathTooLong
+		}

 		if string(hierarchy) == "0" {
 			// v2 hierarchy.
@ -214,9 +225,11 @@ func containsCPU(b []byte) bool {
 	return false
 }

-// Returns the mount point for the cpu cgroup controller (v1 or v2) from
-// /proc/self/mountinfo.
-func parseCPUMount(fd int, read func(fd int, b []byte) (int, uintptr), out []byte, scratch []byte) (int, error) {
+// Returns the path to the specified cgroup and version with cpu controller
+//
+// fd is a file descriptor for /proc/self/mountinfo.
+// Returns the number of bytes written.
+func parseCPUMount(fd int, read func(fd int, b []byte) (int, uintptr), out, cgroup []byte, version Version, scratch []byte) (int, error) {
 	// The format of each line is:
 	//
 	// 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue
@ -240,8 +253,13 @@ func parseCPUMount(fd int, read func(fd int, b []byte) (int, uintptr), out []byt
 	// carriage return. Those are escaped. See Linux show_mountinfo ->
 	// show_path. We must unescape before returning.
 	//
-	// We return the mount point (5) if the filesystem type (9) is cgroup2,
-	// or cgroup with "cpu" in the super options (11).
+	// A mount point matches if the filesystem type (9) is cgroup2,
+	// or cgroup with "cpu" in the super options (11),
+	// and the cgroup is in the root (4). If there are multiple matches,
+	// the first one is selected.
+	//
+	// We return full cgroup path, which is the mount point (5) +
+	// cgroup parameter without the root (4) prefix.
 	//
 	// (4), (5), and (10) are up to _PATH_MAX. The remaining fields have a
 	// small fixed maximum size, so 4*_PATH_MAX is plenty of scratch space.
@ -250,11 +268,7 @@ func parseCPUMount(fd int, read func(fd int, b []byte) (int, uintptr), out []byt

 	l := newLineReader(fd, scratch, read)

-	// Bytes written to out.
-	n := 0
-
 	for {
-		//incomplete := false
 		err := l.next()
 		if err == errIncompleteLine {
 			// An incomplete line is fine as long as it doesn't
@ -271,8 +285,8 @@ func parseCPUMount(fd int, read func(fd int, b []byte) (int, uintptr), out []byt

 		line := l.line()

-		// Skip first four fields.
-		for range 4 {
+		// Skip first three fields.
+		for range 3 {
 			i := bytealg.IndexByte(line, ' ')
 			if i < 0 {
 				return 0, errMalformedFile
@ -280,11 +294,23 @@ func parseCPUMount(fd int, read func(fd int, b []byte) (int, uintptr), out []byt
 			line = line[i+1:]
 		}

-		// (5) mount point:  mount point relative to the process's root
+		// (4) root:  root of the mount within the filesystem
 		i := bytealg.IndexByte(line, ' ')
 		if i < 0 {
 			return 0, errMalformedFile
 		}
+		root := line[:i]
+		if len(root) == 0 || root[0] != '/' {
+			// We rely on this in hasPathPrefix.
+			return 0, errMalformedFile
+		}
+		line = line[i+1:]
+
+		// (5) mount point:  mount point relative to the process's root
+		i = bytealg.IndexByte(line, ' ')
+		if i < 0 {
+			return 0, errMalformedFile
+		}
 		mnt := line[:i]
 		line = line[i+1:]

@ -313,25 +339,11 @@ func parseCPUMount(fd int, read func(fd int, b []byte) (int, uintptr), out []byt
 		ftype := line[:i]
 		line = line[i+1:]

-		if string(ftype) != "cgroup" && string(ftype) != "cgroup2" {
+		switch version {
+		case V1:
+			if string(ftype) != "cgroup" {
 				continue
 			}
-
-		// As in findCPUPath, cgroup v1 with a CPU controller takes
-		// precendence over cgroup v2.
-		if string(ftype) == "cgroup2" {
-			// v2 hierarchy.
-			n, err = unescapePath(out, mnt)
-			if err != nil {
-				// Don't keep searching on error. The kernel
-				// should never produce broken escaping.
-				return n, err
-			}
-			// Keep searching, we might find a v1 hierarchy with a
-			// CPU controller, which takes precedence.
-			continue
-		}
-
 			// (10) mount source:  filesystem specific information or "none"
 			i = bytealg.IndexByte(line, ' ')
 			if i < 0 {
@ -341,25 +353,89 @@ func parseCPUMount(fd int, read func(fd int, b []byte) (int, uintptr), out []byt
 			line = line[i+1:]

 			// (11) super options:  per super block options
-		superOpt := line
-
-		// v1 hierarchy
-		if containsCPU(superOpt) {
-			// Found a v1 CPU controller. This must be the
-			// only one, so we're done.
-			return unescapePath(out, mnt)
+			if !containsCPU(line) {
+				continue
 			}
+		case V2:
+			if string(ftype) != "cgroup2" {
+				continue
+			}
+		default:
+			throw("impossible cgroup version")
+			panic("unreachable")
+		}
+
+		// Check cgroup is in the root.
+		// If the cgroup is /sandbox/container, the matching mount point root could be
+		// /sandbox/container, /sandbox, or /
+		rootLen, err := unescapePath(root, root)
+		if err != nil {
+			return 0, err
+		}
+		root = root[:rootLen]
+		if !hasPathPrefix(cgroup, root) {
+			continue // not matched, this is not the mount point we're looking for
+		}
+
+		// Cutoff the root from cgroup, ensure rel starts with '/' or is empty.
+		rel := cgroup[rootLen:]
+		if rootLen == 1 && len(cgroup) > 1 {
+			// root is "/", but cgroup is not. Keep full cgroup path.
+			rel = cgroup
+		}
+		if hasPathPrefix(rel, []byte("/..")) {
+			// the cgroup is out of current cgroup namespace, and this mount point
+			// cannot reach that cgroup.
+			//
+			// e.g. If the process is in cgroup /init, but in a cgroup namespace
+			// rooted at /sandbox/container, /proc/self/cgroup will show /../../init.
+			// we can reach it if the mount point root is
+			// /../.. or /../../init, but not if it is /.. or /
+			// While a mount point with root /../../.. should be able to reach the cgroup,
+			// we don't know the path to the cgroup within that mount point.
+			continue
+		}
+
+		// All conditions met, compose the full path.
+		// Copy rel to the correct place first, it may overlap with out.
+		n := unescapedLen(mnt)
+		if n+len(rel) > len(out) {
+			return 0, errPathTooLong
+		}
+		copy(out[n:], rel)
+		n2, err := unescapePath(out[:n], mnt)
+		if err != nil {
+			return 0, err
+		}
+		if n2 != n {
+			throw("wrong unescaped len")
+		}
+		return n + len(rel), nil
 	}

-	if n == 0 {
 	// Found nothing.
 	return 0, ErrNoCgroup
-	}
-
-	return n, nil
 }

-var errInvalidEscape error = stringError("invalid path escape sequence")
+func hasPathPrefix(p, prefix []byte) bool {
+	i := len(prefix)
+	if i == 1 {
+		return true // root contains everything
+	}
+	if len(p) < i || !bytealg.Equal(prefix, p[:i]) {
+		return false
+	}
+	return len(p) == i || p[i] == '/' // must match at path boundary
+}
+
+var (
+	errInvalidEscape error = stringError("invalid path escape sequence")
+	errPathTooLong   error = stringError("path too long")
+)
+
+func unescapedLen(in []byte) int {
+	return len(in) - bytealg.Count(in, byte('\\'))*3
+}

 // unescapePath copies in to out, unescaping escape sequences generated by
 // Linux's show_path.
@ -367,20 +443,21 @@ var errInvalidEscape error = stringError("invalid path escape sequence")
 // That is, '\', ' ', '\t', and '\n' are converted to octal escape sequences,
 // like '\040' for space.
 //
-// out must be at least as large as in.
+// Caller must ensure that out at least has unescapedLen(in) bytes.
+// in and out may alias; in-place unescaping is supported.
 //
 // Returns the number of bytes written to out.
 //
 // Also see escapePath in cgroup_linux_test.go.
 func unescapePath(out []byte, in []byte) (int, error) {
-	// Not strictly necessary, but simplifies the implementation and will
-	// always hold in users.
-	if len(out) < len(in) {
-		throw("output too small")
-	}
-
 	var outi, ini int
 	for ini < len(in) {
+		if outi >= len(out) {
+			// given that caller already ensured out is long enough, this
+			// is only possible if there are malformed escape sequences
+			// we have not parsed yet.
+			return outi, errInvalidEscape
+		}
 		c := in[ini]
 		if c != '\\' {
 			out[outi] = c
--- a/src/internal/runtime/cgroup/cgroup_linux.go
+++ b/src/internal/runtime/cgroup/cgroup_linux.go
@ -211,44 +211,26 @@ func FindCPU(out []byte, scratch []byte) (int, Version, error) {
 	checkBufferSize(scratch, ParseSize)

 	// The cgroup path is <cgroup mount point> + <relative path>.
-	//
-	// This is racy if our cgroup is changed while this runs. For example,
-	// initially there is only a cgroup v2 mount and we are not in a
-	// cgroup. After, there a cgroup v1 mount with a CPU controller and we
-	// are placed in a cgroup in this hierarchy. In that case, findCPUMount
-	// could pick the v2 mount, and findCPURelativePath could find the v2
-	// relative path.
-	//
-	// In this case we'll later fail to read the cgroup files and fall back
-	// to assuming no cgroup.
+	// relative path is the cgroup relative to the mount root.

-	n, err := FindCPUMountPoint(out, scratch)
+	n, version, err := FindCPUCgroup(out, scratch)
 	if err != nil {
 		return 0, 0, err
 	}

-	// The relative path always starts with /, so we can directly append it
-	// to the mount point.
-	n2, version, err := FindCPURelativePath(out[n:], scratch)
-	if err != nil {
-		return 0, 0, err
-	}
-	n += n2
-
-	return n, version, nil
+	n, err = FindCPUMountPoint(out, out[:n], version, scratch)
+	return n, version, err
 }

-// FindCPURelativePath finds the path to the CPU cgroup that this process is a member of
-// relative to the root of the cgroup mount and places it in out. scratch is a
-// scratch buffer for internal use.
+// FindCPUCgroup finds the path to the CPU cgroup that this process is a member of
+// and places it in out. scratch is a scratch buffer for internal use.
 //
-// out must have length PathSize minus the size of the cgroup mount root (if
-// known). scratch must have length ParseSize.
+// out must have length PathSize. scratch must have length ParseSize.
 //
 // Returns the number of bytes written to out and the cgroup version (1 or 2).
 //
 // Returns ErrNoCgroup if the process is not in a CPU cgroup.
-func FindCPURelativePath(out []byte, scratch []byte) (int, Version, error) {
+func FindCPUCgroup(out []byte, scratch []byte) (int, Version, error) {
 	path := []byte("/proc/self/cgroup\x00")
 	fd, errno := linux.Open(&path[0], linux.O_RDONLY|linux.O_CLOEXEC, 0)
 	if errno == linux.ENOENT {
@ -259,7 +241,7 @@ func FindCPURelativePath(out []byte, scratch []byte) (int, Version, error) {

 	// The relative path always starts with /, so we can directly append it
 	// to the mount point.
-	n, version, err := parseCPURelativePath(fd, linux.Read, out[:], scratch)
+	n, version, err := parseCPUCgroup(fd, linux.Read, out[:], scratch)
 	if err != nil {
 		linux.Close(fd)
 		return 0, 0, err
@ -269,15 +251,17 @@ func FindCPURelativePath(out []byte, scratch []byte) (int, Version, error) {
 	return n, version, nil
 }

-// FindCPUMountPoint finds the root of the CPU cgroup mount places it in out.
+// FindCPUMountPoint finds the mount point containing the specified cgroup and
+// version with cpu controller, and composes the full path to the cgroup in out.
 // scratch is a scratch buffer for internal use.
 //
-// out must have length PathSize. scratch must have length ParseSize.
+// out must have length PathSize, may overlap with cgroup.
+// scratch must have length ParseSize.
 //
 // Returns the number of bytes written to out.
 //
-// Returns ErrNoCgroup if the process is not in a CPU cgroup.
-func FindCPUMountPoint(out []byte, scratch []byte) (int, error) {
+// Returns ErrNoCgroup if no matching mount point is found.
+func FindCPUMountPoint(out, cgroup []byte, version Version, scratch []byte) (int, error) {
 	checkBufferSize(out, PathSize)
 	checkBufferSize(scratch, ParseSize)

@ -289,7 +273,7 @@ func FindCPUMountPoint(out []byte, scratch []byte) (int, error) {
 		return 0, errSyscallFailed
 	}

-	n, err := parseCPUMount(fd, linux.Read, out, scratch)
+	n, err := parseCPUMount(fd, linux.Read, out, cgroup, version, scratch)
 	if err != nil {
 		linux.Close(fd)
 		return 0, err
--- a/src/internal/runtime/cgroup/cgroup_test.go
+++ b/src/internal/runtime/cgroup/cgroup_test.go
@ -12,8 +12,6 @@ import (
 	"testing"
 )

-const _PATH_MAX = 4096
-
 func TestParseV1Number(t *testing.T) {
 	tests := []struct {
 		name     string
@ -156,7 +154,22 @@ func TestParseV2Limit(t *testing.T) {
 	}
 }

-func TestParseCPURelativePath(t *testing.T) {
+func readString(contents string) func(fd int, b []byte) (int, uintptr) {
+	r := strings.NewReader(contents)
+	return func(fd int, b []byte) (int, uintptr) {
+		n, err := r.Read(b)
+		if err != nil && err != io.EOF {
+			const dummyErrno = 42
+			return n, dummyErrno
+		}
+		return n, 0
+	}
+}
+
+func TestParseCPUCgroup(t *testing.T) {
+	veryLongPathName := strings.Repeat("a", cgroup.PathSize+10)
+	evenLongerPathName := strings.Repeat("a", cgroup.ParseSize+10)
+
 	tests := []struct {
 		name     string
 		contents string
@ -169,6 +182,16 @@ func TestParseCPURelativePath(t *testing.T) {
 			contents: "",
 			wantErr:  true,
 		},
+		{
+			name:     "too-long",
+			contents: "0::/" + veryLongPathName + "\n",
+			wantErr:  true,
+		},
+		{
+			name:     "too-long-line",
+			contents: "0::/" + evenLongerPathName + "\n",
+			wantErr:  true,
+		},
 		{
 			name: "v1",
 			contents: `2:cpu,cpuacct:/a/b/cpu
@ -196,19 +219,9 @@ func TestParseCPURelativePath(t *testing.T) {

 	for _, tc := range tests {
 		t.Run(tc.name, func(t *testing.T) {
-			r := strings.NewReader(tc.contents)
-			read := func(fd int, b []byte) (int, uintptr) {
-				n, err := r.Read(b)
-				if err != nil && err != io.EOF {
-					const dummyErrno = 42
-					return n, dummyErrno
-				}
-				return n, 0
-			}
-
 			var got [cgroup.PathSize]byte
 			var scratch [cgroup.ParseSize]byte
-			n, gotVer, err := cgroup.ParseCPURelativePath(0, read, got[:], scratch[:])
+			n, gotVer, err := cgroup.ParseCPUCgroup(0, readString(tc.contents), got[:], scratch[:])
 			if (err != nil) != tc.wantErr {
 				t.Fatalf("parseCPURelativePath got err %v want %v", err, tc.wantErr)
 			}
@ -224,6 +237,25 @@ func TestParseCPURelativePath(t *testing.T) {
 	}
 }

+func TestParseCPUCgroupMalformed(t *testing.T) {
+	for _, contents := range []string{
+		"\n",
+		"0\n",
+		"0:\n",
+		"0::\n",
+		"0::a\n",
+	} {
+		t.Run("", func(t *testing.T) {
+			var got [cgroup.PathSize]byte
+			var scratch [cgroup.ParseSize]byte
+			n, v, err := cgroup.ParseCPUCgroup(0, readString(contents), got[:], scratch[:])
+			if err != cgroup.ErrMalformedFile {
+				t.Errorf("ParseCPUCgroup got %q (v%d), %v, want ErrMalformedFile", string(got[:n]), v, err)
+			}
+		})
+	}
+}
+
 func TestContainsCPU(t *testing.T) {
 	tests := []struct {
 		in   string
@ -279,9 +311,21 @@ func TestParseCPUMount(t *testing.T) {
 		overlayLongLowerDir += fmt.Sprintf(":%s%d", lowerPath, i)
 	}

+	var longPath [4090]byte
+	for i := range longPath {
+		longPath[i] = byte(i)
+	}
+	escapedLongPath := escapePath(string(longPath[:]))
+	if len(escapedLongPath) <= cgroup.PathSize {
+		// ensure we actually support over PathSize long escaped path
+		t.Fatalf("escapedLongPath is too short to test")
+	}
+
 	tests := []struct {
 		name     string
 		contents string
+		cgroup   string
+		version  cgroup.Version
 		want     string
 		wantErr  bool
 	}{
@ -290,6 +334,20 @@ func TestParseCPUMount(t *testing.T) {
 			contents: "",
 			wantErr:  true,
 		},
+		{
+			name:     "invalid-root",
+			contents: "56 22 0:40 /\\1 /sys/fs/cgroup/cpu rw - cgroup cgroup rw,cpu,cpuacct\n",
+			cgroup:   "/",
+			version:  cgroup.V1,
+			wantErr:  true,
+		},
+		{
+			name:     "invalid-mount",
+			contents: "56 22 0:40 / /sys/fs/cgroup/\\1 rw - cgroup cgroup rw,cpu,cpuacct\n",
+			cgroup:   "/",
+			version:  cgroup.V1,
+			wantErr:  true,
+		},
 		{
 			name: "v1",
 			contents: `22 1 8:1 / / rw,relatime - ext4 /dev/root rw
@ -301,6 +359,8 @@ func TestParseCPUMount(t *testing.T) {
 58 22 0:42 / /sys/fs/cgroup/net rw - cgroup cgroup rw,net
 59 22 0:43 / /sys/fs/cgroup/cpuset rw - cgroup cgroup rw,cpuset
 `,
+			cgroup:  "/",
+			version: cgroup.V1,
 			want:    "/sys/fs/cgroup/cpu",
 		},
 		{
@ -310,6 +370,8 @@ func TestParseCPUMount(t *testing.T) {
 21 22 0:20 / /sys rw,nosuid,nodev,noexec - sysfs sysfs rw
 25 21 0:22 / /sys/fs/cgroup rw,nosuid,nodev,noexec - cgroup2 cgroup2 rw
 `,
+			cgroup:  "/",
+			version: cgroup.V2,
 			want:    "/sys/fs/cgroup",
 		},
 		{
@ -324,8 +386,26 @@ func TestParseCPUMount(t *testing.T) {
 58 22 0:42 / /sys/fs/cgroup/net rw - cgroup cgroup rw,net
 59 22 0:43 / /sys/fs/cgroup/cpuset rw - cgroup cgroup rw,cpuset
 `,
+			cgroup:  "/",
+			version: cgroup.V1,
 			want:    "/sys/fs/cgroup/cpu",
 		},
+		{
+			name: "mixed-choose-v2",
+			contents: `22 1 8:1 / / rw,relatime - ext4 /dev/root rw
+20 22 0:19 / /proc rw,nosuid,nodev,noexec - proc proc rw
+21 22 0:20 / /sys rw,nosuid,nodev,noexec - sysfs sysfs rw
+25 21 0:22 / /sys/fs/cgroup rw,nosuid,nodev,noexec - cgroup2 cgroup2 rw
+49 22 0:37 / /sys/fs/cgroup/memory rw - cgroup cgroup rw,memory
+54 22 0:38 / /sys/fs/cgroup/io rw - cgroup cgroup rw,io
+56 22 0:40 / /sys/fs/cgroup/cpu rw - cgroup cgroup rw,cpu,cpuacct
+58 22 0:42 / /sys/fs/cgroup/net rw - cgroup cgroup rw,net
+59 22 0:43 / /sys/fs/cgroup/cpuset rw - cgroup cgroup rw,cpuset
+`,
+			cgroup:  "/",
+			version: cgroup.V2,
+			want:    "/sys/fs/cgroup",
+		},
 		{
 			name: "v2-escaped",
 			contents: `22 1 8:1 / / rw,relatime - ext4 /dev/root rw
@ -333,6 +413,8 @@ func TestParseCPUMount(t *testing.T) {
 21 22 0:20 / /sys rw,nosuid,nodev,noexec - sysfs sysfs rw
 25 21 0:22 / /sys/fs/cgroup/tab\011tab rw,nosuid,nodev,noexec - cgroup2 cgroup2 rw
 `,
+			cgroup:  "/",
+			version: cgroup.V2,
 			want:    `/sys/fs/cgroup/tab	tab`,
 		},
 		{
@ -344,25 +426,125 @@ func TestParseCPUMount(t *testing.T) {
 262 31 0:72 / /tmp/overlay2/0143e063b02f4801de9c847ad1c5ddc21fd2ead00653064d0c72ea967b248870/merged rw,relatime shared:729 - overlay overlay rw,lowerdir=` + overlayLongLowerDir + `,upperdir=/tmp/diff,workdir=/tmp/work
 25 21 0:22 / /sys/fs/cgroup rw,nosuid,nodev,noexec - cgroup2 cgroup2 rw
 `,
+			cgroup:  "/",
+			version: cgroup.V2,
+			want:    "/sys/fs/cgroup",
+		},
+		{
+			name: "long-escaped-path",
+			contents: `22 1 8:1 / / rw,relatime - ext4 /dev/root rw
+20 22 0:19 / /proc rw,nosuid,nodev,noexec - proc proc rw
+21 22 0:20 / /sys rw,nosuid,nodev,noexec - sysfs sysfs rw
+25 21 0:22 / /sys/` + escapedLongPath + ` rw,nosuid,nodev,noexec - cgroup2 cgroup2 rw
+`,
+			cgroup:  "/",
+			version: cgroup.V2,
+			want:    "/sys/" + string(longPath[:]),
+		},
+		{
+			name: "too-long-escaped-path",
+			contents: `22 1 8:1 / / rw,relatime - ext4 /dev/root rw
+20 22 0:19 / /proc rw,nosuid,nodev,noexec - proc proc rw
+21 22 0:20 / /sys rw,nosuid,nodev,noexec - sysfs sysfs rw
+25 21 0:22 / /sys/` + escapedLongPath + ` rw,nosuid,nodev,noexec - cgroup2 cgroup2 rw
+`,
+			cgroup:  "/container", // compared to above, this makes the path too long
+			version: cgroup.V2,
+			wantErr: true,
+		},
+		{
+			name: "non-root_mount",
+			contents: `22 1 8:1 / / rw,relatime - ext4 /dev/root rw
+20 22 0:19 / /proc rw,nosuid,nodev,noexec - proc proc rw
+21 22 0:20 / /sys rw,nosuid,nodev,noexec - sysfs sysfs rw
+25 21 0:22 /sand /unrelated/cgroup1 rw,nosuid,nodev,noexec - cgroup2 cgroup2 rw
+25 21 0:22 /stone /unrelated/cgroup2 rw,nosuid,nodev,noexec - cgroup2 cgroup2 rw
+25 21 0:22 /sandbox/container/group /sys/fs/cgroup/mygroup rw,nosuid,nodev,noexec - cgroup2 cgroup2 rw
+25 21 0:22 /sandbox /sys/fs/cgroup rw,nosuid,nodev,noexec - cgroup2 cgroup2 rw
+25 21 0:22 / /ignored/second/match rw,nosuid,nodev,noexec - cgroup2 cgroup2 rw
+`,
+			cgroup:  "/sandbox/container",
+			version: cgroup.V2,
+			want:    "/sys/fs/cgroup/container",
+		},
+		{
+			name: "v2-escaped-root",
+			contents: `22 1 8:1 / / rw,relatime - ext4 /dev/root rw
+20 22 0:19 / /proc rw,nosuid,nodev,noexec - proc proc rw
+21 22 0:20 / /sys rw,nosuid,nodev,noexec - sysfs sysfs rw
+25 21 0:22 /tab\011tab /sys/fs/cgroup rw,nosuid,nodev,noexec - cgroup2 cgroup2 rw
+`,
+			cgroup:  "/tab	tab/container",
+			version: cgroup.V2,
+			want:    `/sys/fs/cgroup/container`,
+		},
+		{
+			name: "non-root_cgroup",
+			contents: `22 1 8:1 / / rw,relatime - ext4 /dev/root rw
+20 22 0:19 / /proc rw,nosuid,nodev,noexec - proc proc rw
+21 22 0:20 / /sys rw,nosuid,nodev,noexec - sysfs sysfs rw
+25 21 0:22 / /sys/fs/cgroup rw,nosuid,nodev,noexec - cgroup2 cgroup2 rw
+`,
+			cgroup:  "/sandbox/container",
+			version: cgroup.V2,
+			want:    "/sys/fs/cgroup/sandbox/container",
+		},
+		{
+			name: "mixed_non-root",
+			contents: `22 1 8:1 / / rw,relatime - ext4 /dev/root rw
+20 22 0:19 / /proc rw,nosuid,nodev,noexec - proc proc rw
+21 22 0:20 / /sys rw,nosuid,nodev,noexec - sysfs sysfs rw
+25 21 0:22 /sandbox /sys/fs/cgroup rw,nosuid,nodev,noexec - cgroup2 cgroup2 rw
+49 22 0:37 /sandbox /sys/fs/cgroup/memory rw - cgroup cgroup rw,memory
+54 22 0:38 /sandbox /sys/fs/cgroup/io rw - cgroup cgroup rw,io
+56 22 0:40 /sand /unrelated/cgroup1 rw - cgroup cgroup rw,cpu,cpuacct
+56 22 0:40 /stone /unrelated/cgroup2 rw - cgroup cgroup rw,cpu,cpuacct
+56 22 0:40 /sandbox /sys/fs/cgroup/cpu rw - cgroup cgroup rw,cpu,cpuacct
+56 22 0:40 /sandbox/container/group /sys/fs/cgroup/cpu/mygroup rw - cgroup cgroup rw,cpu,cpuacct
+56 22 0:40 / /ignored/second/match rw - cgroup cgroup rw,cpu,cpuacct
+58 22 0:42 /sandbox /sys/fs/cgroup/net rw - cgroup cgroup rw,net
+59 22 0:43 /sandbox /sys/fs/cgroup/cpuset rw - cgroup cgroup rw,cpuset
+`,
+			cgroup:  "/sandbox/container",
+			version: cgroup.V1,
+			want:    "/sys/fs/cgroup/cpu/container",
+		},
+		{
+			// to see an example of this, for a PID in a cgroup namespace, run:
+			// nsenter -t <PID> -C -- cat /proc/self/cgroup
+			// nsenter -t <PID> -C -- grep cgroup /proc/self/mountinfo
+			// /mnt can be generated with `mount --bind /sys/fs/cgroup/kubepods.slice /mnt`,
+			// assuming PID is in cgroup /kubepods.slice
+			name: "out_of_namespace",
+			contents: `22 1 8:1 / / rw,relatime - ext4 /dev/root rw
+20 22 0:19 / /proc rw,nosuid,nodev,noexec - proc proc rw
+21 22 0:20 / /sys rw,nosuid,nodev,noexec - sysfs sysfs rw
+1243 61 0:26 /../../.. /mnt rw,nosuid,nodev,noexec,relatime shared:4 - cgroup2 cgroup2 rw
+29 22 0:26 /../../../.. /sys/fs/cgroup rw,nosuid,nodev,noexec,relatime shared:4 - cgroup2 cgroup2 rw`,
+			cgroup:  "/../../../../init.scope",
+			version: cgroup.V2,
+			want:    "/sys/fs/cgroup/init.scope",
+		},
+		{
+			name: "out_of_namespace-root", // the process is directly in the root cgroup
+			contents: `22 1 8:1 / / rw,relatime - ext4 /dev/root rw
+20 22 0:19 / /proc rw,nosuid,nodev,noexec - proc proc rw
+21 22 0:20 / /sys rw,nosuid,nodev,noexec - sysfs sysfs rw
+1243 61 0:26 /../../.. /mnt rw,nosuid,nodev,noexec,relatime shared:4 - cgroup2 cgroup2 rw
+29 22 0:26 /../../../.. /sys/fs/cgroup rw,nosuid,nodev,noexec,relatime shared:4 - cgroup2 cgroup2 rw`,
+			cgroup:  "/../../../..",
+			version: cgroup.V2,
 			want:    "/sys/fs/cgroup",
 		},
 	}

 	for _, tc := range tests {
 		t.Run(tc.name, func(t *testing.T) {
-			r := strings.NewReader(tc.contents)
-			read := func(fd int, b []byte) (int, uintptr) {
-				n, err := r.Read(b)
-				if err != nil && err != io.EOF {
-					const dummyErrno = 42
-					return n, dummyErrno
-				}
-				return n, 0
-			}
-
 			var got [cgroup.PathSize]byte
 			var scratch [cgroup.ParseSize]byte
-			n, err := cgroup.ParseCPUMount(0, read, got[:], scratch[:])
+			n := copy(got[:], tc.cgroup)
+			n, err := cgroup.ParseCPUMount(0, readString(tc.contents), got[:],
+				got[:n], tc.version, scratch[:])
 			if (err != nil) != tc.wantErr {
 				t.Fatalf("parseCPUMount got err %v want %v", err, tc.wantErr)
 			}
@ -374,6 +556,31 @@ func TestParseCPUMount(t *testing.T) {
 	}
 }

+func TestParseCPUMountMalformed(t *testing.T) {
+	for _, contents := range []string{
+		"\n",
+		"22\n",
+		"22 1 8:1\n",
+		"22 1 8:1 /\n",
+		"22 1 8:1 / /cgroup\n",
+		"22 1 8:1 / /cgroup rw\n",
+		"22 1 8:1 / /cgroup rw -\n",
+		"22 1 8:1 / /cgroup rw - \n",
+		"22 1 8:1 / /cgroup rw - cgroup\n",
+		"22 1 8:1 / /cgroup rw - cgroup cgroup\n",
+		"22 1 8:1 a /cgroup rw - cgroup cgroup cpu\n",
+	} {
+		t.Run("", func(t *testing.T) {
+			var got [cgroup.PathSize]byte
+			var scratch [cgroup.ParseSize]byte
+			n, err := cgroup.ParseCPUMount(0, readString(contents), got[:], []byte("/"), cgroup.V1, scratch[:])
+			if err != cgroup.ErrMalformedFile {
+				t.Errorf("parseCPUMount got %q, %v, want ErrMalformedFile", string(got[:n]), err)
+			}
+		})
+	}
+}
+
 // escapePath performs escaping equivalent to Linux's show_path.
 //
 // That is, '\', ' ', '\t', and '\n' are converted to octal escape sequences,
@ -453,9 +660,7 @@ b/c`,

 	t.Run("unescapePath", func(t *testing.T) {
 		for _, tc := range tests {
-			t.Run(tc.name, func(t *testing.T) {
-				in := []byte(tc.escaped)
-				out := make([]byte, len(in))
+			runTest := func(in, out []byte) {
 				n, err := cgroup.UnescapePath(out, in)
 				if err != nil {
 					t.Errorf("unescapePath got err %v want nil", err)
@ -464,6 +669,15 @@ b/c`,
 				if got != tc.unescaped {
 					t.Errorf("unescapePath got %q want %q", got, tc.escaped)
 				}
+			}
+			t.Run(tc.name, func(t *testing.T) {
+				in := []byte(tc.escaped)
+				out := make([]byte, len(in))
+				runTest(in, out)
+			})
+			t.Run("inplace/"+tc.name, func(t *testing.T) {
+				in := []byte(tc.escaped)
+				runTest(in, in)
 			})
 		}
 	})
--- a/src/internal/runtime/cgroup/export_test.go
+++ b/src/internal/runtime/cgroup/export_test.go
@ -21,6 +21,7 @@ func NewLineReader(fd int, scratch []byte, read func(fd int, b []byte) (int, uin
 var (
 	ErrEOF            = errEOF
 	ErrIncompleteLine = errIncompleteLine
+	ErrMalformedFile  = errMalformedFile
 )

 var ContainsCPU = containsCPU
@ -28,7 +29,7 @@ var ContainsCPU = containsCPU
 var ParseV1Number = parseV1Number
 var ParseV2Limit = parseV2Limit

-var ParseCPURelativePath = parseCPURelativePath
+var ParseCPUCgroup = parseCPUCgroup
 var ParseCPUMount = parseCPUMount

 var UnescapePath = unescapePath