go/src/runtime/os_linux.go
Russ Cox 0d94f989d1 runtime: clean up system calls during cgo callback init
During a cgocallback, the runtime calls needm to get an m.
The calls made during needm cannot themselves assume that
there is an m or a g (which is attached to the m).

In the old days of making direct system calls, the only thing
you had to do for such functions was mark them //go:nosplit,
to avoid the use of g in the stack split prologue.

But now, on operating systems that make system calls through
shared libraries and use code that saves state in the g or m
before doing so, it's not safe to assume g exists. In fact, it is
not even safe to call getg(), because it might fault deferencing
the TLS storage to find the g pointer (that storage may not be
initialized yet, at least on Windows, and perhaps on other systems
in the future).

The specific routines that are problematic are usleep and osyield,
which are called during lock contention in lockextra, called
from needm.

All this is rather subtle and hidden, so in addition to fixing the
problem on Windows, this CL makes the fact of not running on
a g much clearer by introducing variants usleep_no_g and
osyield_no_g whose names should make clear that there is no g.
And then we can remove the various sketchy getg() == nil checks
in the existing routines.

As part of this cleanup, this CL also deletes onosstack on Windows.
onosstack is from back when the runtime was implemented in C.
It predates systemstack but does essentially the same thing.
Instead of having two different copies of this code, we can use
systemstack consistently. This way we need not port onosstack
to each architecture.

This CL is part of a stack adding windows/arm64
support (#36439), intended to land in the Go 1.17 cycle.
This CL is, however, not windows/arm64-specific.
It is cleanup meant to make the port (and future ports) easier.

Change-Id: I3352de1fd0a3c26267c6e209063e6e86abd26187
Reviewed-on: https://go-review.googlesource.com/c/go/+/288793
Trust: Russ Cox <rsc@golang.org>
Trust: Jason A. Donenfeld <Jason@zx2c4.com>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Reviewed-by: Jason A. Donenfeld <Jason@zx2c4.com>
2021-02-19 00:01:25 +00:00

509 lines
14 KiB
Go

// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package runtime
import (
"runtime/internal/sys"
"unsafe"
)
type mOS struct{}
//go:noescape
func futex(addr unsafe.Pointer, op int32, val uint32, ts, addr2 unsafe.Pointer, val3 uint32) int32
// Linux futex.
//
// futexsleep(uint32 *addr, uint32 val)
// futexwakeup(uint32 *addr)
//
// Futexsleep atomically checks if *addr == val and if so, sleeps on addr.
// Futexwakeup wakes up threads sleeping on addr.
// Futexsleep is allowed to wake up spuriously.
const (
_FUTEX_PRIVATE_FLAG = 128
_FUTEX_WAIT_PRIVATE = 0 | _FUTEX_PRIVATE_FLAG
_FUTEX_WAKE_PRIVATE = 1 | _FUTEX_PRIVATE_FLAG
)
// Atomically,
// if(*addr == val) sleep
// Might be woken up spuriously; that's allowed.
// Don't sleep longer than ns; ns < 0 means forever.
//go:nosplit
func futexsleep(addr *uint32, val uint32, ns int64) {
// Some Linux kernels have a bug where futex of
// FUTEX_WAIT returns an internal error code
// as an errno. Libpthread ignores the return value
// here, and so can we: as it says a few lines up,
// spurious wakeups are allowed.
if ns < 0 {
futex(unsafe.Pointer(addr), _FUTEX_WAIT_PRIVATE, val, nil, nil, 0)
return
}
var ts timespec
ts.setNsec(ns)
futex(unsafe.Pointer(addr), _FUTEX_WAIT_PRIVATE, val, unsafe.Pointer(&ts), nil, 0)
}
// If any procs are sleeping on addr, wake up at most cnt.
//go:nosplit
func futexwakeup(addr *uint32, cnt uint32) {
ret := futex(unsafe.Pointer(addr), _FUTEX_WAKE_PRIVATE, cnt, nil, nil, 0)
if ret >= 0 {
return
}
// I don't know that futex wakeup can return
// EAGAIN or EINTR, but if it does, it would be
// safe to loop and call futex again.
systemstack(func() {
print("futexwakeup addr=", addr, " returned ", ret, "\n")
})
*(*int32)(unsafe.Pointer(uintptr(0x1006))) = 0x1006
}
func getproccount() int32 {
// This buffer is huge (8 kB) but we are on the system stack
// and there should be plenty of space (64 kB).
// Also this is a leaf, so we're not holding up the memory for long.
// See golang.org/issue/11823.
// The suggested behavior here is to keep trying with ever-larger
// buffers, but we don't have a dynamic memory allocator at the
// moment, so that's a bit tricky and seems like overkill.
const maxCPUs = 64 * 1024
var buf [maxCPUs / 8]byte
r := sched_getaffinity(0, unsafe.Sizeof(buf), &buf[0])
if r < 0 {
return 1
}
n := int32(0)
for _, v := range buf[:r] {
for v != 0 {
n += int32(v & 1)
v >>= 1
}
}
if n == 0 {
n = 1
}
return n
}
// Clone, the Linux rfork.
const (
_CLONE_VM = 0x100
_CLONE_FS = 0x200
_CLONE_FILES = 0x400
_CLONE_SIGHAND = 0x800
_CLONE_PTRACE = 0x2000
_CLONE_VFORK = 0x4000
_CLONE_PARENT = 0x8000
_CLONE_THREAD = 0x10000
_CLONE_NEWNS = 0x20000
_CLONE_SYSVSEM = 0x40000
_CLONE_SETTLS = 0x80000
_CLONE_PARENT_SETTID = 0x100000
_CLONE_CHILD_CLEARTID = 0x200000
_CLONE_UNTRACED = 0x800000
_CLONE_CHILD_SETTID = 0x1000000
_CLONE_STOPPED = 0x2000000
_CLONE_NEWUTS = 0x4000000
_CLONE_NEWIPC = 0x8000000
// As of QEMU 2.8.0 (5ea2fc84d), user emulation requires all six of these
// flags to be set when creating a thread; attempts to share the other
// five but leave SYSVSEM unshared will fail with -EINVAL.
//
// In non-QEMU environments CLONE_SYSVSEM is inconsequential as we do not
// use System V semaphores.
cloneFlags = _CLONE_VM | /* share memory */
_CLONE_FS | /* share cwd, etc */
_CLONE_FILES | /* share fd table */
_CLONE_SIGHAND | /* share sig handler table */
_CLONE_SYSVSEM | /* share SysV semaphore undo lists (see issue #20763) */
_CLONE_THREAD /* revisit - okay for now */
)
//go:noescape
func clone(flags int32, stk, mp, gp, fn unsafe.Pointer) int32
// May run with m.p==nil, so write barriers are not allowed.
//go:nowritebarrier
func newosproc(mp *m) {
stk := unsafe.Pointer(mp.g0.stack.hi)
/*
* note: strace gets confused if we use CLONE_PTRACE here.
*/
if false {
print("newosproc stk=", stk, " m=", mp, " g=", mp.g0, " clone=", funcPC(clone), " id=", mp.id, " ostk=", &mp, "\n")
}
// Disable signals during clone, so that the new thread starts
// with signals disabled. It will enable them in minit.
var oset sigset
sigprocmask(_SIG_SETMASK, &sigset_all, &oset)
ret := clone(cloneFlags, stk, unsafe.Pointer(mp), unsafe.Pointer(mp.g0), unsafe.Pointer(funcPC(mstart)))
sigprocmask(_SIG_SETMASK, &oset, nil)
if ret < 0 {
print("runtime: failed to create new OS thread (have ", mcount(), " already; errno=", -ret, ")\n")
if ret == -_EAGAIN {
println("runtime: may need to increase max user processes (ulimit -u)")
}
throw("newosproc")
}
}
// Version of newosproc that doesn't require a valid G.
//go:nosplit
func newosproc0(stacksize uintptr, fn unsafe.Pointer) {
stack := sysAlloc(stacksize, &memstats.stacks_sys)
if stack == nil {
write(2, unsafe.Pointer(&failallocatestack[0]), int32(len(failallocatestack)))
exit(1)
}
ret := clone(cloneFlags, unsafe.Pointer(uintptr(stack)+stacksize), nil, nil, fn)
if ret < 0 {
write(2, unsafe.Pointer(&failthreadcreate[0]), int32(len(failthreadcreate)))
exit(1)
}
}
var failallocatestack = []byte("runtime: failed to allocate stack for the new OS thread\n")
var failthreadcreate = []byte("runtime: failed to create new OS thread\n")
const (
_AT_NULL = 0 // End of vector
_AT_PAGESZ = 6 // System physical page size
_AT_HWCAP = 16 // hardware capability bit vector
_AT_RANDOM = 25 // introduced in 2.6.29
_AT_HWCAP2 = 26 // hardware capability bit vector 2
)
var procAuxv = []byte("/proc/self/auxv\x00")
var addrspace_vec [1]byte
func mincore(addr unsafe.Pointer, n uintptr, dst *byte) int32
func sysargs(argc int32, argv **byte) {
n := argc + 1
// skip over argv, envp to get to auxv
for argv_index(argv, n) != nil {
n++
}
// skip NULL separator
n++
// now argv+n is auxv
auxv := (*[1 << 28]uintptr)(add(unsafe.Pointer(argv), uintptr(n)*sys.PtrSize))
if sysauxv(auxv[:]) != 0 {
return
}
// In some situations we don't get a loader-provided
// auxv, such as when loaded as a library on Android.
// Fall back to /proc/self/auxv.
fd := open(&procAuxv[0], 0 /* O_RDONLY */, 0)
if fd < 0 {
// On Android, /proc/self/auxv might be unreadable (issue 9229), so we fallback to
// try using mincore to detect the physical page size.
// mincore should return EINVAL when address is not a multiple of system page size.
const size = 256 << 10 // size of memory region to allocate
p, err := mmap(nil, size, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0)
if err != 0 {
return
}
var n uintptr
for n = 4 << 10; n < size; n <<= 1 {
err := mincore(unsafe.Pointer(uintptr(p)+n), 1, &addrspace_vec[0])
if err == 0 {
physPageSize = n
break
}
}
if physPageSize == 0 {
physPageSize = size
}
munmap(p, size)
return
}
var buf [128]uintptr
n = read(fd, noescape(unsafe.Pointer(&buf[0])), int32(unsafe.Sizeof(buf)))
closefd(fd)
if n < 0 {
return
}
// Make sure buf is terminated, even if we didn't read
// the whole file.
buf[len(buf)-2] = _AT_NULL
sysauxv(buf[:])
}
// startupRandomData holds random bytes initialized at startup. These come from
// the ELF AT_RANDOM auxiliary vector.
var startupRandomData []byte
func sysauxv(auxv []uintptr) int {
var i int
for ; auxv[i] != _AT_NULL; i += 2 {
tag, val := auxv[i], auxv[i+1]
switch tag {
case _AT_RANDOM:
// The kernel provides a pointer to 16-bytes
// worth of random data.
startupRandomData = (*[16]byte)(unsafe.Pointer(val))[:]
case _AT_PAGESZ:
physPageSize = val
}
archauxv(tag, val)
vdsoauxv(tag, val)
}
return i / 2
}
var sysTHPSizePath = []byte("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size\x00")
func getHugePageSize() uintptr {
var numbuf [20]byte
fd := open(&sysTHPSizePath[0], 0 /* O_RDONLY */, 0)
if fd < 0 {
return 0
}
ptr := noescape(unsafe.Pointer(&numbuf[0]))
n := read(fd, ptr, int32(len(numbuf)))
closefd(fd)
if n <= 0 {
return 0
}
n-- // remove trailing newline
v, ok := atoi(slicebytetostringtmp((*byte)(ptr), int(n)))
if !ok || v < 0 {
v = 0
}
if v&(v-1) != 0 {
// v is not a power of 2
return 0
}
return uintptr(v)
}
func osinit() {
ncpu = getproccount()
physHugePageSize = getHugePageSize()
if iscgo {
// #42494 glibc and musl reserve some signals for
// internal use and require they not be blocked by
// the rest of a normal C runtime. When the go runtime
// blocks...unblocks signals, temporarily, the blocked
// interval of time is generally very short. As such,
// these expectations of *libc code are mostly met by
// the combined go+cgo system of threads. However,
// when go causes a thread to exit, via a return from
// mstart(), the combined runtime can deadlock if
// these signals are blocked. Thus, don't block these
// signals when exiting threads.
// - glibc: SIGCANCEL (32), SIGSETXID (33)
// - musl: SIGTIMER (32), SIGCANCEL (33), SIGSYNCCALL (34)
sigdelset(&sigsetAllExiting, 32)
sigdelset(&sigsetAllExiting, 33)
sigdelset(&sigsetAllExiting, 34)
}
osArchInit()
}
var urandom_dev = []byte("/dev/urandom\x00")
func getRandomData(r []byte) {
if startupRandomData != nil {
n := copy(r, startupRandomData)
extendRandom(r, n)
return
}
fd := open(&urandom_dev[0], 0 /* O_RDONLY */, 0)
n := read(fd, unsafe.Pointer(&r[0]), int32(len(r)))
closefd(fd)
extendRandom(r, int(n))
}
func goenvs() {
goenvs_unix()
}
// Called to do synchronous initialization of Go code built with
// -buildmode=c-archive or -buildmode=c-shared.
// None of the Go runtime is initialized.
//go:nosplit
//go:nowritebarrierrec
func libpreinit() {
initsig(true)
}
// Called to initialize a new m (including the bootstrap m).
// Called on the parent thread (main thread in case of bootstrap), can allocate memory.
func mpreinit(mp *m) {
mp.gsignal = malg(32 * 1024) // Linux wants >= 2K
mp.gsignal.m = mp
}
func gettid() uint32
// Called to initialize a new m (including the bootstrap m).
// Called on the new thread, cannot allocate memory.
func minit() {
minitSignals()
// Cgo-created threads and the bootstrap m are missing a
// procid. We need this for asynchronous preemption and it's
// useful in debuggers.
getg().m.procid = uint64(gettid())
}
// Called from dropm to undo the effect of an minit.
//go:nosplit
func unminit() {
unminitSignals()
}
// Called from exitm, but not from drop, to undo the effect of thread-owned
// resources in minit, semacreate, or elsewhere. Do not take locks after calling this.
func mdestroy(mp *m) {
}
//#ifdef GOARCH_386
//#define sa_handler k_sa_handler
//#endif
func sigreturn()
func sigtramp(sig uint32, info *siginfo, ctx unsafe.Pointer)
func cgoSigtramp()
//go:noescape
func sigaltstack(new, old *stackt)
//go:noescape
func setitimer(mode int32, new, old *itimerval)
//go:noescape
func rtsigprocmask(how int32, new, old *sigset, size int32)
//go:nosplit
//go:nowritebarrierrec
func sigprocmask(how int32, new, old *sigset) {
rtsigprocmask(how, new, old, int32(unsafe.Sizeof(*new)))
}
func raise(sig uint32)
func raiseproc(sig uint32)
//go:noescape
func sched_getaffinity(pid, len uintptr, buf *byte) int32
func osyield()
//go:nosplit
func osyield_no_g() {
osyield()
}
func pipe() (r, w int32, errno int32)
func pipe2(flags int32) (r, w int32, errno int32)
func setNonblock(fd int32)
//go:nosplit
//go:nowritebarrierrec
func setsig(i uint32, fn uintptr) {
var sa sigactiont
sa.sa_flags = _SA_SIGINFO | _SA_ONSTACK | _SA_RESTORER | _SA_RESTART
sigfillset(&sa.sa_mask)
// Although Linux manpage says "sa_restorer element is obsolete and
// should not be used". x86_64 kernel requires it. Only use it on
// x86.
if GOARCH == "386" || GOARCH == "amd64" {
sa.sa_restorer = funcPC(sigreturn)
}
if fn == funcPC(sighandler) {
if iscgo {
fn = funcPC(cgoSigtramp)
} else {
fn = funcPC(sigtramp)
}
}
sa.sa_handler = fn
sigaction(i, &sa, nil)
}
//go:nosplit
//go:nowritebarrierrec
func setsigstack(i uint32) {
var sa sigactiont
sigaction(i, nil, &sa)
if sa.sa_flags&_SA_ONSTACK != 0 {
return
}
sa.sa_flags |= _SA_ONSTACK
sigaction(i, &sa, nil)
}
//go:nosplit
//go:nowritebarrierrec
func getsig(i uint32) uintptr {
var sa sigactiont
sigaction(i, nil, &sa)
return sa.sa_handler
}
// setSignaltstackSP sets the ss_sp field of a stackt.
//go:nosplit
func setSignalstackSP(s *stackt, sp uintptr) {
*(*uintptr)(unsafe.Pointer(&s.ss_sp)) = sp
}
//go:nosplit
func (c *sigctxt) fixsigcode(sig uint32) {
}
// sysSigaction calls the rt_sigaction system call.
//go:nosplit
func sysSigaction(sig uint32, new, old *sigactiont) {
if rt_sigaction(uintptr(sig), new, old, unsafe.Sizeof(sigactiont{}.sa_mask)) != 0 {
// Workaround for bugs in QEMU user mode emulation.
//
// QEMU turns calls to the sigaction system call into
// calls to the C library sigaction call; the C
// library call rejects attempts to call sigaction for
// SIGCANCEL (32) or SIGSETXID (33).
//
// QEMU rejects calling sigaction on SIGRTMAX (64).
//
// Just ignore the error in these case. There isn't
// anything we can do about it anyhow.
if sig != 32 && sig != 33 && sig != 64 {
// Use system stack to avoid split stack overflow on ppc64/ppc64le.
systemstack(func() {
throw("sigaction failed")
})
}
}
}
// rt_sigaction is implemented in assembly.
//go:noescape
func rt_sigaction(sig uintptr, new, old *sigactiont, size uintptr) int32
func getpid() int
func tgkill(tgid, tid, sig int)
// signalM sends a signal to mp.
func signalM(mp *m, sig int) {
tgkill(getpid(), int(mp.procid), sig)
}