2020-08-03 15:33:47 +08:00
|
|
|
// Copyright 2014 The Go Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style
|
|
|
|
// license that can be found in the LICENSE file.
|
|
|
|
|
2014-11-11 17:07:37 -05:00
|
|
|
package runtime
|
|
|
|
|
2021-09-29 17:35:27 -07:00
|
|
|
import "unsafe"
|
|
|
|
|
2014-11-11 17:07:37 -05:00
|
|
|
// Constants
|
|
|
|
// The group headings below are inferred from the identifier prefixes only;
// every value is carried over verbatim.
const (
	// E* constants.
	_EINTR  = 0x4
	_ENOMEM = 0xc
	_EAGAIN = 0xb
	_ENOSYS = 0x26

	// _PROT_* constants.
	_PROT_NONE  = 0
	_PROT_READ  = 0x1
	_PROT_WRITE = 0x2
	_PROT_EXEC  = 0x4

	// _MAP_* constants.
	_MAP_ANON    = 0x20
	_MAP_PRIVATE = 0x2
	_MAP_FIXED   = 0x10

	// _MADV_* constants.
	_MADV_DONTNEED   = 0x4
	_MADV_FREE       = 0x8
	_MADV_HUGEPAGE   = 0xe
	_MADV_NOHUGEPAGE = 0xf
	_MADV_COLLAPSE   = 0x19

	// _SA_* constants.
	_SA_RESTART  = 0x10000000
	_SA_ONSTACK  = 0x8000000
	_SA_RESTORER = 0 // unused on ARM
	_SA_SIGINFO  = 0x4

	// _SI_* constants.
	_SI_KERNEL = 0x80
	_SI_TIMER  = -0x2

	// _SIG* constants.
	_SIGHUP    = 0x1
	_SIGINT    = 0x2
	_SIGQUIT   = 0x3
	_SIGILL    = 0x4
	_SIGTRAP   = 0x5
	_SIGABRT   = 0x6
	_SIGBUS    = 0x7
	_SIGFPE    = 0x8
	_SIGKILL   = 0x9
	_SIGUSR1   = 0xa
	_SIGSEGV   = 0xb
	_SIGUSR2   = 0xc
	_SIGPIPE   = 0xd
	_SIGALRM   = 0xe
	_SIGSTKFLT = 0x10
	_SIGCHLD   = 0x11
	_SIGCONT   = 0x12
	_SIGSTOP   = 0x13
	_SIGTSTP   = 0x14
	_SIGTTIN   = 0x15
	_SIGTTOU   = 0x16
	_SIGURG    = 0x17
	_SIGXCPU   = 0x18
	_SIGXFSZ   = 0x19
	_SIGVTALRM = 0x1a
	_SIGPROF   = 0x1b
	_SIGWINCH  = 0x1c
	_SIGIO     = 0x1d
	_SIGPWR    = 0x1e
	_SIGSYS    = 0x1f

	_SIGRTMIN = 0x20

	// _FPE_* constants.
	_FPE_INTDIV = 0x1
	_FPE_INTOVF = 0x2
	_FPE_FLTDIV = 0x3
	_FPE_FLTOVF = 0x4
	_FPE_FLTUND = 0x5
	_FPE_FLTRES = 0x6
	_FPE_FLTINV = 0x7
	_FPE_FLTSUB = 0x8

	// _BUS_* constants.
	_BUS_ADRALN = 0x1
	_BUS_ADRERR = 0x2
	_BUS_OBJERR = 0x3

	// _SEGV_* constants.
	_SEGV_MAPERR = 0x1
	_SEGV_ACCERR = 0x2

	// _ITIMER_* constants.
	_ITIMER_REAL    = 0
	_ITIMER_PROF    = 0x2
	_ITIMER_VIRTUAL = 0x1

	// _O_* constants.
	_O_RDONLY   = 0
	_O_WRONLY   = 0x1
	_O_CREAT    = 0x40
	_O_TRUNC    = 0x200
	_O_NONBLOCK = 0x800
	_O_CLOEXEC  = 0x80000

	_CLOCK_THREAD_CPUTIME_ID = 0x3

	_SIGEV_THREAD_ID = 0x4

	// _AF_* and _SOCK_* constants.
	_AF_UNIX    = 0x1
	_SOCK_DGRAM = 0x2
)
|
|
|
|
|
2025-09-07 00:12:28 +02:00
|
|
|
// timespec32 is a timespec whose seconds and nanoseconds fields are both
// 32 bits wide. Field order defines the memory layout and must not change.
//
// The timespec structs and types are defined in Linux in
// include/uapi/linux/time_types.h and include/uapi/asm-generic/posix_types.h.
type timespec32 struct {
	tv_sec, tv_nsec int32
}
|
|
|
|
|
2019-03-13 18:56:37 -07:00
|
|
|
//go:nosplit
|
2025-09-07 00:12:28 +02:00
|
|
|
func (ts *timespec32) setNsec(ns int64) {
|
2019-03-13 18:56:37 -07:00
|
|
|
ts.tv_sec = timediv(ns, 1e9, &ts.tv_nsec)
|
2014-11-14 12:55:10 -05:00
|
|
|
}
|
|
|
|
|
2025-09-07 00:12:28 +02:00
|
|
|
// timespec is a timespec whose seconds and nanoseconds fields are both
// 64 bits wide. Field order defines the memory layout and must not change.
type timespec struct {
	tv_sec, tv_nsec int64
}
|
|
|
|
|
|
|
|
//go:nosplit
|
|
|
|
func (ts *timespec) setNsec(ns int64) {
|
|
|
|
var newNS int32
|
|
|
|
ts.tv_sec = int64(timediv(ns, 1e9, &newNS))
|
|
|
|
ts.tv_nsec = int64(newNS)
|
|
|
|
}
|
|
|
|
|
2016-09-25 13:38:54 -07:00
|
|
|
// stackt groups a byte pointer, a flags word and a size.
// NOTE(review): presumably mirrors the kernel's stack_t — confirm.
// Field order defines the memory layout and must not change.
type stackt struct {
	ss_sp    *byte   // pointer member
	ss_flags int32   // flags word
	ss_size  uintptr // size member
}
|
|
|
|
|
|
|
|
// sigcontext is a run of 21 consecutive uint32 fields. The declaration
// order below is the memory layout, so fields must not be reordered.
// NOTE(review): presumably mirrors the ARM kernel's struct sigcontext —
// confirm.
type sigcontext struct {
	trap_no, error_code, oldmask uint32

	r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10 uint32

	fp, ip, sp, lr, pc, cpsr uint32

	fault_address uint32
}
|
|
|
|
|
|
|
|
type ucontext struct {
|
|
|
|
uc_flags uint32
|
|
|
|
uc_link *ucontext
|
2016-09-25 13:38:54 -07:00
|
|
|
uc_stack stackt
|
2014-11-11 17:07:37 -05:00
|
|
|
uc_mcontext sigcontext
|
|
|
|
uc_sigmask uint32
|
|
|
|
__unused [31]int32
|
|
|
|
uc_regspace [128]uint32
|
|
|
|
}
|
|
|
|
|
|
|
|
// timeval holds a 32-bit seconds field and a 32-bit microseconds field.
// Field order defines the memory layout and must not change.
type timeval struct {
	tv_sec, tv_usec int32
}

// set_usec stores x in tv.tv_usec; tv_sec is left untouched.
func (tv *timeval) set_usec(x int32) {
	tv.tv_usec = x
}
|
|
|
|
|
2021-08-13 08:34:25 -07:00
|
|
|
type itimerspec struct {
|
2025-09-07 00:12:28 +02:00
|
|
|
it_interval timespec32
|
|
|
|
it_value timespec32
|
2021-08-13 08:34:25 -07:00
|
|
|
}
|
|
|
|
|
2014-11-11 17:07:37 -05:00
|
|
|
type itimerval struct {
|
|
|
|
it_interval timeval
|
|
|
|
it_value timeval
|
|
|
|
}
|
|
|
|
|
2021-09-29 17:35:27 -07:00
|
|
|
// sigeventFields holds the named members of a sigevent, without padding.
// Field order defines the memory layout and must not change.
type sigeventFields struct {
	value         uintptr
	signo, notify int32
	// below here is a union; sigev_notify_thread_id is the only field we use
	sigev_notify_thread_id int32
}
|
|
|
|
|
2021-09-29 17:35:27 -07:00
|
|
|
type sigevent struct {
|
|
|
|
sigeventFields
|
|
|
|
|
|
|
|
// Pad struct to the max size in the kernel.
|
|
|
|
_ [_sigev_max_size - unsafe.Sizeof(sigeventFields{})]byte
|
|
|
|
}
|
|
|
|
|
|
|
|
// siginfoFields holds the named members of a siginfo, without padding.
// Field order defines the memory layout and must not change.
type siginfoFields struct {
	si_signo, si_errno, si_code int32
	// below here is a union; si_addr is the only field we use
	si_addr uint32
}
|
|
|
|
|
2021-09-29 17:35:27 -07:00
|
|
|
type siginfo struct {
|
|
|
|
siginfoFields
|
|
|
|
|
|
|
|
// Pad struct to the max size in the kernel.
|
|
|
|
_ [_si_max_size - unsafe.Sizeof(siginfoFields{})]byte
|
|
|
|
}
|
|
|
|
|
2014-11-11 17:07:37 -05:00
|
|
|
// sigactiont holds a handler address, a flags word, a restorer address and
// a 64-bit mask, in that order.
// NOTE(review): presumably the layout passed to the kernel's sigaction
// system call — confirm. Field order defines the memory layout and must
// not change.
type sigactiont struct {
	sa_handler  uintptr
	sa_flags    uint32
	sa_restorer uintptr
	sa_mask     uint64
}
|
|
|
|
|
2015-01-24 17:51:42 -05:00
|
|
|
// sockaddr_un is a 16-bit family value followed by a fixed 108-byte path
// buffer. Field order defines the memory layout and must not change.
type sockaddr_un struct {
	family uint16    // 16-bit family value
	path   [108]byte // fixed-size path buffer
}
|