mirror of
https://github.com/golang/go.git
synced 2025-12-08 06:10:04 +00:00
runtime: use backoff and ISB instruction to reduce contention in (*lfstack).pop and (*spanSet).pop on arm64
When profiling CPU usage of LiveKit on AArch64/x86 (AWS), the graphs show
CPU spikes that were repeating in a semi-periodic manner, and the spikes occur
when the GC (garbage collector) is active.
Our analysis found that the getempty function accounted for 10.54% of the
overhead, which was mainly caused by the work.empty.pop() function. And
listing pop shows that the majority of the time, with a 10.29% overhead,
is spent on atomic.Cas64((*uint64)(head), old, next).
This patch adds a backoff approach to reduce the high overhead of the
atomic operation, which primarily occurs when contention over a specific
memory address increases, typically as the number of threads rises.
Note that on platforms other than arm64, the initial value of backoff is zero.
This patch rewrites the implementation of procyield() on arm64, which is an
Armv8.0-A compatible delay function using the counter-timer.
The garbage collector benchmark:
│ master │ opt │
│ sec/op │ sec/op vs base │
Garbage/benchmem-MB=64-160 3.782m ± 4% 2.264m ± 2% -40.12% (p=0.000 n=10)
│ user+sys-sec/op │ user+sys-sec/op vs base │
Garbage/benchmem-MB=64-160 433.5m ± 4% 255.4m ± 2% -41.08% (p=0.000 n=10)
Reference for backoff mechanism:
https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/multi-threaded-applications-arm
Change-Id: Ie8128a2243ceacbb82ab2a88941acbb8428bad94
Reviewed-on: https://go-review.googlesource.com/c/go/+/654895
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
This commit is contained in:
parent
1ff59f3dd3
commit
50586182ab
3 changed files with 80 additions and 5 deletions
|
|
@ -1036,13 +1036,60 @@ aesloop:
|
|||
VMOV V0.D[0], R0
|
||||
RET
|
||||
|
||||
// The Arm architecture provides a user space accessible counter-timer which
|
||||
// is incremented at a fixed but machine-specific rate. Software can (spin)
|
||||
// wait until the counter-timer reaches some desired value.
|
||||
//
|
||||
// Armv8.7-A introduced the WFET (FEAT_WFxT) instruction, which allows the
|
||||
// processor to enter a low power state for a set time, or until an event is
|
||||
// received.
|
||||
//
|
||||
// However, WFET is not used here because it is only available on newer hardware,
|
||||
// and we aim to maintain compatibility with older Armv8-A platforms that do not
|
||||
// support this feature.
|
||||
//
|
||||
// As a fallback, we can instead use the ISB instruction to decrease processor
|
||||
// activity and thus power consumption between checks of the counter-timer.
|
||||
// Note that we do not depend on the latency of the ISB instruction which is
|
||||
// implementation specific. Actual delay comes from comparing against a fresh
|
||||
// read of the counter-timer value.
|
||||
//
|
||||
// Read more in this Arm blog post:
|
||||
// https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/multi-threaded-applications-arm
|
||||
|
||||
TEXT runtime·procyieldAsm(SB),NOSPLIT,$0-0
|
||||
MOVWU cycles+0(FP), R0
|
||||
CBZ R0, done
|
||||
again:
|
||||
YIELD
|
||||
SUBW $1, R0
|
||||
CBNZ R0, again
|
||||
CBZ R0, done
|
||||
// Prevent speculation of subsequent counter/timer reads and memory accesses.
|
||||
ISB $15
|
||||
// If the delay is very short, just return.
|
||||
// Hardcode 18ns as the first ISB delay.
|
||||
CMP $18, R0
|
||||
BLS done
|
||||
// Adjust for overhead of initial ISB.
|
||||
SUB $18, R0, R0
|
||||
// Convert the delay from nanoseconds to counter/timer ticks.
|
||||
// Read the counter/timer frequency.
|
||||
// delay_ticks = (delay * CNTFRQ_EL0) / 1e9
|
||||
// With the below simplifications and adjustments,
|
||||
// we are usually within 2% of the correct value:
|
||||
// delay_ticks = (delay + delay / 16) * CNTFRQ_EL0 >> 30
|
||||
MRS CNTFRQ_EL0, R1
|
||||
ADD R0>>4, R0, R0
|
||||
MUL R1, R0, R0
|
||||
LSR $30, R0, R0
|
||||
CBZ R0, done
|
||||
// start = current counter/timer value
|
||||
MRS CNTVCT_EL0, R2
|
||||
delay:
|
||||
// Delay using ISB for all ticks.
|
||||
ISB $15
|
||||
// Subtract and compare to handle counter roll-over.
|
||||
// counter_read() - start < delay_ticks
|
||||
MRS CNTVCT_EL0, R1
|
||||
SUB R2, R1, R1
|
||||
CMP R0, R1
|
||||
BCC delay
|
||||
done:
|
||||
RET
|
||||
|
||||
|
|
|
|||
|
|
@ -34,6 +34,11 @@ func (head *lfstack) push(node *lfnode) {
|
|||
}
|
||||
|
||||
func (head *lfstack) pop() unsafe.Pointer {
|
||||
var backoff uint32
|
||||
// TODO: tweak backoff parameters on other architectures.
|
||||
if GOARCH == "arm64" {
|
||||
backoff = 128
|
||||
}
|
||||
for {
|
||||
old := atomic.Load64((*uint64)(head))
|
||||
if old == 0 {
|
||||
|
|
@ -44,6 +49,16 @@ func (head *lfstack) pop() unsafe.Pointer {
|
|||
if atomic.Cas64((*uint64)(head), old, next) {
|
||||
return unsafe.Pointer(node)
|
||||
}
|
||||
|
||||
// Use a backoff approach: reducing demand on the shared memory location
|
||||
// decreases memory contention and allows other threads to make quicker
|
||||
// progress.
|
||||
// Read more in this Arm blog post:
|
||||
// https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/multi-threaded-applications-arm
|
||||
procyield(backoff)
|
||||
// Increase backoff time.
|
||||
backoff += backoff / 2
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -149,6 +149,11 @@ retry:
|
|||
// pop is safe to call concurrently with other pop and push operations.
|
||||
func (b *spanSet) pop() *mspan {
|
||||
var head, tail uint32
|
||||
var backoff uint32
|
||||
// TODO: tweak backoff parameters on other architectures.
|
||||
if GOARCH == "arm64" {
|
||||
backoff = 128
|
||||
}
|
||||
claimLoop:
|
||||
for {
|
||||
headtail := b.index.load()
|
||||
|
|
@ -177,6 +182,14 @@ claimLoop:
|
|||
if b.index.cas(headtail, makeHeadTailIndex(want+1, tail)) {
|
||||
break claimLoop
|
||||
}
|
||||
// Use a backoff approach: reducing demand on the shared memory location
|
||||
// decreases memory contention and allows other threads to make quicker
|
||||
// progress.
|
||||
// Read more in this Arm blog post:
|
||||
// https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/multi-threaded-applications-arm
|
||||
procyield(backoff)
|
||||
// Increase backoff time.
|
||||
backoff += backoff / 2
|
||||
headtail = b.index.load()
|
||||
head, tail = headtail.split()
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue