syscall: add CgroupFD support for ForkExec on Linux

Implement the CLONE_INTO_CGROUP feature, which allows a child to be placed
in a specified cgroup in a clean and simple way. Note that the feature only
works for cgroup v2, and requires Linux kernel 5.7 or newer.

Using the feature requires a new syscall, clone3. Currently this is the
only reason to use clone3, but the code is structured in a way so that
other cases may be easily added in the future.

Add a test case.

While at it, try to simplify the syscall calling code in
forkAndExecInChild1, which became complicated over time because:

1. It was using either rawVforkSyscall or RawSyscall6 depending on
   whether CLONE_NEWUSER was set.

2. On Linux/s390, the first two arguments to clone(2) system call are
   swapped (which deserved a mention in Linux ABI hall of shame). It
   was worked around in rawVforkSyscall on s390, but had to be
   implemented via a switch/case when using RawSyscall6, making the code
   less clear.

Let's

 - modify rawVforkSyscall to have two arguments (which is also required
   for clone3);

 - remove the arguments workaround from s390 asm, instead implementing
   arguments swap in the caller (which still looks ugly but at least
   it's done once and is clearly documented now);

 - use rawVforkSyscall for all cases (since it is essentially similar to
   RawSyscall6, except for having fewer parameters, not returning r2, and
   saving/restoring the return address before/after syscall on 386 and
   amd64).

Updates #51246.

Change-Id: Ifcd418ebead9257177338ffbcccd0bdecb94474e
Reviewed-on: https://go-review.googlesource.com/c/go/+/417695
Auto-Submit: Ian Lance Taylor <iant@google.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Ian Lance Taylor <iant@google.com>
Run-TryBot: Ian Lance Taylor <iant@google.com>
Run-TryBot: Kirill Kolyshkin <kolyshkin@gmail.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
This commit is contained in:
Kir Kolyshkin 2022-07-14 21:18:15 -07:00 committed by Gopher Robot
parent f53b2111e4
commit bca17d16ca
24 changed files with 228 additions and 99 deletions

View file

@ -7,6 +7,7 @@
package syscall_test
import (
"bytes"
"flag"
"fmt"
"internal/testenv"
@ -14,6 +15,7 @@ import (
"os"
"os/exec"
"os/user"
"path"
"path/filepath"
"runtime"
"strconv"
@ -461,6 +463,96 @@ func TestUnshareUidGidMapping(t *testing.T) {
}
}
// prepareCgroupFD creates a temporary sub-cgroup below the cgroup that the
// test process itself runs in and opens it with O_PATH.
//
// It returns the opened file descriptor and "/" followed by the base name of
// the new sub-cgroup directory, which callers can match against the tail of a
// child's /proc/self/cgroup contents.
//
// The calling test is skipped (not failed) when the environment cannot
// support the scenario: /proc/self/cgroup is missing or unreadable, the
// process is not in a pure cgroup v2 hierarchy, clone3 is unavailable or
// blocked, or the sub-cgroup cannot be created because of missing
// permissions, an unmounted cgroupfs, or a read-only cgroupfs.
//
// Both the descriptor and the directory are released via t.Cleanup.
func prepareCgroupFD(t *testing.T) (int, string) {
	t.Helper()

	const O_PATH = 0x200000 // Same for all architectures, but for some reason not defined in syscall for 386||amd64.

	// Requires cgroup v2.
	const prefix = "/sys/fs/cgroup"
	selfCg, err := os.ReadFile("/proc/self/cgroup")
	if err != nil {
		if os.IsNotExist(err) || os.IsPermission(err) {
			t.Skip(err)
		}
		t.Fatal(err)
	}

	// Expect a single line like this:
	//	0::/user.slice/user-1000.slice/user@1000.service/app.slice/vte-spawn-891992a2-efbb-4f28-aedb-b24f9e706770.scope
	// Otherwise it's either cgroup v1 or a hybrid hierarchy.
	if bytes.Count(selfCg, []byte("\n")) > 1 {
		t.Skip("cgroup v2 not available")
	}
	cg := bytes.TrimPrefix(selfCg, []byte("0::"))
	if len(cg) == len(selfCg) { // No prefix found.
		t.Skipf("cgroup v2 not available (/proc/self/cgroup contents: %q)", selfCg)
	}

	// Need clone3 with CLONE_INTO_CGROUP support. This is only a probe: the
	// binary name and the cgroup FD are deliberately bogus, so the call is
	// not expected to succeed; only the "not available" errors matter here.
	_, err = syscall.ForkExec("non-existent binary", nil, &syscall.ProcAttr{
		Sys: &syscall.SysProcAttr{
			UseCgroupFD: true,
			CgroupFD:    -1,
		},
	})
	// EPERM can be returned if clone3 is not enabled by seccomp.
	if err == syscall.ENOSYS || err == syscall.EPERM {
		t.Skipf("clone3 with CLONE_INTO_CGROUP not available: %v", err)
	}

	// Need an ability to create a sub-cgroup.
	subCgroup, err := os.MkdirTemp(prefix+string(bytes.TrimSpace(cg)), "subcg-")
	if err != nil {
		// Besides a plain permission error, the directory cannot be created
		// when cgroupfs is not mounted at prefix (ENOENT, e.g. in a chroot)
		// or is mounted read-only (EROFS, e.g. in an unprivileged
		// container). None of these indicate a bug in the code under test.
		pathErr, _ := err.(*os.PathError)
		readOnlyFS := pathErr != nil && pathErr.Err == syscall.EROFS
		if os.IsPermission(err) || os.IsNotExist(err) || readOnlyFS {
			t.Skip(err)
		}
		t.Fatal(err)
	}
	t.Cleanup(func() { syscall.Rmdir(subCgroup) })

	cgroupFD, err := syscall.Open(subCgroup, O_PATH, 0)
	if err != nil {
		t.Fatal(&os.PathError{Op: "open", Path: subCgroup, Err: err})
	}
	// Registered last, so it runs first: the descriptor is closed before the
	// directory is removed.
	t.Cleanup(func() { syscall.Close(cgroupFD) })

	return cgroupFD, "/" + path.Base(subCgroup)
}
// TestUseCgroupFD re-executes the test binary as a helper process that is
// started directly inside a freshly created sub-cgroup (via
// SysProcAttr.UseCgroupFD/CgroupFD) and verifies that the cgroup path the
// helper reports for itself ends with the name of that sub-cgroup.
func TestUseCgroupFD(t *testing.T) {
	cgFD, wantSuffix := prepareCgroupFD(t)

	helper := exec.Command(os.Args[0], "-test.run=TestUseCgroupFDHelper")
	helper.Env = append(os.Environ(), "GO_WANT_HELPER_PROCESS=1")
	helper.SysProcAttr = &syscall.SysProcAttr{
		UseCgroupFD: true,
		CgroupFD:    cgFD,
	}

	out, runErr := helper.CombinedOutput()
	if runErr != nil {
		t.Fatalf("Cmd failed with err %v, output: %s", runErr, out)
	}

	// NB: this wouldn't work with cgroupns.
	trimmed := bytes.TrimSpace(out)
	if bytes.HasSuffix(trimmed, []byte(wantSuffix)) {
		return
	}
	t.Fatalf("got: %q, want: a line that ends with %q", out, wantSuffix)
}
// TestUseCgroupFDHelper is not a real test. When the environment variable
// GO_WANT_HELPER_PROCESS is set to "1" it acts as a child process: it prints
// the contents of its own /proc/self/cgroup to standard output and exits with
// status 0, or reports the read error on standard error and exits with
// status 2. In any other case it returns immediately.
func TestUseCgroupFDHelper(*testing.T) {
	if mode := os.Getenv("GO_WANT_HELPER_PROCESS"); mode != "1" {
		return
	}
	// Terminate the helper process here so the rest of the test binary's
	// machinery does not run in the child.
	defer os.Exit(0)

	// Read and print own cgroup path.
	contents, readErr := os.ReadFile("/proc/self/cgroup")
	if readErr != nil {
		fmt.Fprintln(os.Stderr, readErr)
		os.Exit(2)
	}
	fmt.Print(string(contents))
}
type capHeader struct {
version uint32
pid int32