go/src/syscall/syscall_linux_386.go
Richard Musiol 9e6b79a5df syscall: use CLONE_VFORK and CLONE_VM
This greatly improves the latency of starting a child process when
the Go process is using a lot of memory. Even though the kernel uses
copy-on-write, preparation for that can take up to several 100ms under
certain conditions. All other goroutines are suspended while starting
a subprocess so this latency directly affects total throughput.

With CLONE_VM the child process shares the same memory with the parent
process. On its own this would lead to conflicting use of the same
memory, so CLONE_VFORK is used to suspend the parent process until the
child releases the memory when switching to to the new program binary
via the exec syscall. When the parent process continues to run, one
has to consider the changes to memory that the child process did,
namely the return address of the syscall function needs to be restored
from a register.

A simple benchmark has shown a difference in latency of 16ms vs. 0.5ms
at 10GB memory usage. However, much higher latencies of several 100ms
have been observed in real world scenarios. For more information see
comments on #5838.

Fixes #5838

Change-Id: I6377d7bd8dcd00c85ca0c52b6683e70ce2174ba6
Reviewed-on: https://go-review.googlesource.com/37439
Reviewed-by: Ian Lance Taylor <iant@golang.org>
Run-TryBot: Ian Lance Taylor <iant@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
2017-03-22 23:53:01 +00:00

381 lines
10 KiB
Go

// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// TODO(rsc): Rewrite all nn(SP) references into name+(nn-8)(FP)
// so that go vet can check that they are correct.
package syscall
import "unsafe"
const (
_SYS_dup = SYS_DUP2
_SYS_getdents = SYS_GETDENTS64
_SYS_setgroups = SYS_SETGROUPS32
)
func setTimespec(sec, nsec int64) Timespec {
return Timespec{Sec: int32(sec), Nsec: int32(nsec)}
}
func setTimeval(sec, usec int64) Timeval {
return Timeval{Sec: int32(sec), Usec: int32(usec)}
}
//sysnb pipe(p *[2]_C_int) (err error)
func Pipe(p []int) (err error) {
if len(p) != 2 {
return EINVAL
}
var pp [2]_C_int
err = pipe(&pp)
p[0] = int(pp[0])
p[1] = int(pp[1])
return
}
//sysnb pipe2(p *[2]_C_int, flags int) (err error)
func Pipe2(p []int, flags int) (err error) {
if len(p) != 2 {
return EINVAL
}
var pp [2]_C_int
err = pipe2(&pp, flags)
p[0] = int(pp[0])
p[1] = int(pp[1])
return
}
// 64-bit file system and 32-bit uid calls
// (386 default is 32-bit file system and 16-bit uid).
//sys Dup2(oldfd int, newfd int) (err error)
//sys Fchown(fd int, uid int, gid int) (err error) = SYS_FCHOWN32
//sys Fstat(fd int, stat *Stat_t) (err error) = SYS_FSTAT64
//sys Ftruncate(fd int, length int64) (err error) = SYS_FTRUNCATE64
//sysnb Getegid() (egid int) = SYS_GETEGID32
//sysnb Geteuid() (euid int) = SYS_GETEUID32
//sysnb Getgid() (gid int) = SYS_GETGID32
//sysnb Getuid() (uid int) = SYS_GETUID32
//sysnb InotifyInit() (fd int, err error)
//sys Ioperm(from int, num int, on int) (err error)
//sys Iopl(level int) (err error)
//sys Lchown(path string, uid int, gid int) (err error) = SYS_LCHOWN32
//sys Lstat(path string, stat *Stat_t) (err error) = SYS_LSTAT64
//sys Pread(fd int, p []byte, offset int64) (n int, err error) = SYS_PREAD64
//sys Pwrite(fd int, p []byte, offset int64) (n int, err error) = SYS_PWRITE64
//sys sendfile(outfd int, infd int, offset *int64, count int) (written int, err error) = SYS_SENDFILE64
//sys Setfsgid(gid int) (err error) = SYS_SETFSGID32
//sys Setfsuid(uid int) (err error) = SYS_SETFSUID32
//sysnb Setregid(rgid int, egid int) (err error) = SYS_SETREGID32
//sysnb Setresgid(rgid int, egid int, sgid int) (err error) = SYS_SETRESGID32
//sysnb Setresuid(ruid int, euid int, suid int) (err error) = SYS_SETRESUID32
//sysnb Setreuid(ruid int, euid int) (err error) = SYS_SETREUID32
//sys Splice(rfd int, roff *int64, wfd int, woff *int64, len int, flags int) (n int, err error)
//sys Stat(path string, stat *Stat_t) (err error) = SYS_STAT64
//sys SyncFileRange(fd int, off int64, n int64, flags int) (err error)
//sys Truncate(path string, length int64) (err error) = SYS_TRUNCATE64
//sysnb getgroups(n int, list *_Gid_t) (nn int, err error) = SYS_GETGROUPS32
//sysnb setgroups(n int, list *_Gid_t) (err error) = SYS_SETGROUPS32
//sys Select(nfd int, r *FdSet, w *FdSet, e *FdSet, timeout *Timeval) (n int, err error) = SYS__NEWSELECT
//sys mmap2(addr uintptr, length uintptr, prot int, flags int, fd int, pageOffset uintptr) (xaddr uintptr, err error)
func mmap(addr uintptr, length uintptr, prot int, flags int, fd int, offset int64) (xaddr uintptr, err error) {
page := uintptr(offset / 4096)
if offset != int64(page)*4096 {
return 0, EINVAL
}
return mmap2(addr, length, prot, flags, fd, page)
}
type rlimit32 struct {
Cur uint32
Max uint32
}
//sysnb getrlimit(resource int, rlim *rlimit32) (err error) = SYS_GETRLIMIT
const rlimInf32 = ^uint32(0)
const rlimInf64 = ^uint64(0)
func Getrlimit(resource int, rlim *Rlimit) (err error) {
err = prlimit(0, resource, nil, rlim)
if err != ENOSYS {
return err
}
rl := rlimit32{}
err = getrlimit(resource, &rl)
if err != nil {
return
}
if rl.Cur == rlimInf32 {
rlim.Cur = rlimInf64
} else {
rlim.Cur = uint64(rl.Cur)
}
if rl.Max == rlimInf32 {
rlim.Max = rlimInf64
} else {
rlim.Max = uint64(rl.Max)
}
return
}
//sysnb setrlimit(resource int, rlim *rlimit32) (err error) = SYS_SETRLIMIT
func Setrlimit(resource int, rlim *Rlimit) (err error) {
err = prlimit(0, resource, rlim, nil)
if err != ENOSYS {
return err
}
rl := rlimit32{}
if rlim.Cur == rlimInf64 {
rl.Cur = rlimInf32
} else if rlim.Cur < uint64(rlimInf32) {
rl.Cur = uint32(rlim.Cur)
} else {
return EINVAL
}
if rlim.Max == rlimInf64 {
rl.Max = rlimInf32
} else if rlim.Max < uint64(rlimInf32) {
rl.Max = uint32(rlim.Max)
} else {
return EINVAL
}
return setrlimit(resource, &rl)
}
// Underlying system call writes to newoffset via pointer.
// Implemented in assembly to avoid allocation.
func seek(fd int, offset int64, whence int) (newoffset int64, err Errno)
func Seek(fd int, offset int64, whence int) (newoffset int64, err error) {
newoffset, errno := seek(fd, offset, whence)
if errno != 0 {
return 0, errno
}
return newoffset, nil
}
// Vsyscalls on amd64.
//sysnb Gettimeofday(tv *Timeval) (err error)
//sysnb Time(t *Time_t) (tt Time_t, err error)
// On x86 Linux, all the socket calls go through an extra indirection,
// I think because the 5-register system call interface can't handle
// the 6-argument calls like sendto and recvfrom. Instead the
// arguments to the underlying system call are the number below
// and a pointer to an array of uintptr. We hide the pointer in the
// socketcall assembly to avoid allocation on every system call.
const (
// see linux/net.h
_SOCKET = 1
_BIND = 2
_CONNECT = 3
_LISTEN = 4
_ACCEPT = 5
_GETSOCKNAME = 6
_GETPEERNAME = 7
_SOCKETPAIR = 8
_SEND = 9
_RECV = 10
_SENDTO = 11
_RECVFROM = 12
_SHUTDOWN = 13
_SETSOCKOPT = 14
_GETSOCKOPT = 15
_SENDMSG = 16
_RECVMSG = 17
_ACCEPT4 = 18
_RECVMMSG = 19
_SENDMMSG = 20
)
func socketcall(call int, a0, a1, a2, a3, a4, a5 uintptr) (n int, err Errno)
func rawsocketcall(call int, a0, a1, a2, a3, a4, a5 uintptr) (n int, err Errno)
func accept(s int, rsa *RawSockaddrAny, addrlen *_Socklen) (fd int, err error) {
fd, e := socketcall(_ACCEPT, uintptr(s), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen)), 0, 0, 0)
if e != 0 {
err = e
}
return
}
func accept4(s int, rsa *RawSockaddrAny, addrlen *_Socklen, flags int) (fd int, err error) {
fd, e := socketcall(_ACCEPT4, uintptr(s), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen)), uintptr(flags), 0, 0)
if e != 0 {
err = e
}
return
}
func getsockname(s int, rsa *RawSockaddrAny, addrlen *_Socklen) (err error) {
_, e := rawsocketcall(_GETSOCKNAME, uintptr(s), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen)), 0, 0, 0)
if e != 0 {
err = e
}
return
}
func getpeername(s int, rsa *RawSockaddrAny, addrlen *_Socklen) (err error) {
_, e := rawsocketcall(_GETPEERNAME, uintptr(s), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen)), 0, 0, 0)
if e != 0 {
err = e
}
return
}
func socketpair(domain int, typ int, flags int, fd *[2]int32) (err error) {
_, e := rawsocketcall(_SOCKETPAIR, uintptr(domain), uintptr(typ), uintptr(flags), uintptr(unsafe.Pointer(fd)), 0, 0)
if e != 0 {
err = e
}
return
}
func bind(s int, addr unsafe.Pointer, addrlen _Socklen) (err error) {
_, e := socketcall(_BIND, uintptr(s), uintptr(addr), uintptr(addrlen), 0, 0, 0)
if e != 0 {
err = e
}
return
}
func connect(s int, addr unsafe.Pointer, addrlen _Socklen) (err error) {
_, e := socketcall(_CONNECT, uintptr(s), uintptr(addr), uintptr(addrlen), 0, 0, 0)
if e != 0 {
err = e
}
return
}
func socket(domain int, typ int, proto int) (fd int, err error) {
fd, e := rawsocketcall(_SOCKET, uintptr(domain), uintptr(typ), uintptr(proto), 0, 0, 0)
if e != 0 {
err = e
}
return
}
func getsockopt(s int, level int, name int, val unsafe.Pointer, vallen *_Socklen) (err error) {
_, e := socketcall(_GETSOCKOPT, uintptr(s), uintptr(level), uintptr(name), uintptr(val), uintptr(unsafe.Pointer(vallen)), 0)
if e != 0 {
err = e
}
return
}
func setsockopt(s int, level int, name int, val unsafe.Pointer, vallen uintptr) (err error) {
_, e := socketcall(_SETSOCKOPT, uintptr(s), uintptr(level), uintptr(name), uintptr(val), vallen, 0)
if e != 0 {
err = e
}
return
}
func recvfrom(s int, p []byte, flags int, from *RawSockaddrAny, fromlen *_Socklen) (n int, err error) {
var base uintptr
if len(p) > 0 {
base = uintptr(unsafe.Pointer(&p[0]))
}
n, e := socketcall(_RECVFROM, uintptr(s), base, uintptr(len(p)), uintptr(flags), uintptr(unsafe.Pointer(from)), uintptr(unsafe.Pointer(fromlen)))
if e != 0 {
err = e
}
return
}
func sendto(s int, p []byte, flags int, to unsafe.Pointer, addrlen _Socklen) (err error) {
var base uintptr
if len(p) > 0 {
base = uintptr(unsafe.Pointer(&p[0]))
}
_, e := socketcall(_SENDTO, uintptr(s), base, uintptr(len(p)), uintptr(flags), uintptr(to), uintptr(addrlen))
if e != 0 {
err = e
}
return
}
func recvmsg(s int, msg *Msghdr, flags int) (n int, err error) {
n, e := socketcall(_RECVMSG, uintptr(s), uintptr(unsafe.Pointer(msg)), uintptr(flags), 0, 0, 0)
if e != 0 {
err = e
}
return
}
func sendmsg(s int, msg *Msghdr, flags int) (n int, err error) {
n, e := socketcall(_SENDMSG, uintptr(s), uintptr(unsafe.Pointer(msg)), uintptr(flags), 0, 0, 0)
if e != 0 {
err = e
}
return
}
func Listen(s int, n int) (err error) {
_, e := socketcall(_LISTEN, uintptr(s), uintptr(n), 0, 0, 0, 0)
if e != 0 {
err = e
}
return
}
func Shutdown(s, how int) (err error) {
_, e := socketcall(_SHUTDOWN, uintptr(s), uintptr(how), 0, 0, 0, 0)
if e != 0 {
err = e
}
return
}
func Fstatfs(fd int, buf *Statfs_t) (err error) {
_, _, e := Syscall(SYS_FSTATFS64, uintptr(fd), unsafe.Sizeof(*buf), uintptr(unsafe.Pointer(buf)))
if e != 0 {
err = e
}
return
}
func Statfs(path string, buf *Statfs_t) (err error) {
pathp, err := BytePtrFromString(path)
if err != nil {
return err
}
_, _, e := Syscall(SYS_STATFS64, uintptr(unsafe.Pointer(pathp)), unsafe.Sizeof(*buf), uintptr(unsafe.Pointer(buf)))
if e != 0 {
err = e
}
return
}
func (r *PtraceRegs) PC() uint64 { return uint64(uint32(r.Eip)) }
func (r *PtraceRegs) SetPC(pc uint64) { r.Eip = int32(pc) }
func (iov *Iovec) SetLen(length int) {
iov.Len = uint32(length)
}
func (msghdr *Msghdr) SetControllen(length int) {
msghdr.Controllen = uint32(length)
}
func (cmsg *Cmsghdr) SetLen(length int) {
cmsg.Len = uint32(length)
}
func rawVforkSyscall(trap, a1 uintptr) (r1 uintptr, err Errno) {
panic("not implemented")
}