runtime: less aggressive per-thread stack segment caching
Introduce global stack segment cache and limit per-thread cache size.
This greatly reduces StackSys memory on workloads that create lots of threads.

benchmark                      old ns/op    new ns/op    delta
BenchmarkStackGrowth                 665          656    -1.35%
BenchmarkStackGrowth-2               333          328    -1.50%
BenchmarkStackGrowth-4               224          172   -23.21%
BenchmarkStackGrowth-8               124           91   -26.13%
BenchmarkStackGrowth-16               82           47   -41.94%
BenchmarkStackGrowth-32               73           40   -44.79%
BenchmarkStackGrowthDeep           97231        94391    -2.92%
BenchmarkStackGrowthDeep-2         47230        58562   +23.99%
BenchmarkStackGrowthDeep-4         24993        49356   +97.48%
BenchmarkStackGrowthDeep-8         15105        30072   +99.09%
BenchmarkStackGrowthDeep-16        10005        15623   +56.15%
BenchmarkStackGrowthDeep-32        12517        13069    +4.41%

TestStackMem#1,MB     310     12   -96.13%
TestStackMem#2,MB     296     14   -95.27%
TestStackMem#3,MB     479     14   -97.08%
TestStackMem#1,sec   3.22   2.26   -29.81%
TestStackMem#2,sec   2.43   2.15   -11.52%
TestStackMem#3,sec   2.50   2.38    -4.80%

R=sougou, no.smile.face, rsc
CC=golang-dev, msolomon
https://golang.org/cl/7029044
parent 7d403871cb
commit f82db7d9e4
6 changed files with 154 additions and 12 deletions
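The change replaces the effectively unbounded per-M FixAlloc stack allocator with a two-level cache: each M keeps a small fixed ring of free stack segments and, only when that ring runs empty or completely full, exchanges a whole batch with a global, lock-protected free list. The commit's C code is in the hunks below; the following is a minimal Go sketch of the same shape, useful for following the index arithmetic. The names (perThreadCache, globalCache, segment) and the slice-based global pool are invented for illustration; the real constants are StackCacheSize = 32 and StackCacheBatch = 16, added to runtime.h in this commit.

```go
package main

import (
	"fmt"
	"sync"
)

const (
	cacheSize = 32 // per-thread ring capacity (cf. StackCacheSize)
	batchSize = 16 // segments per global<->local transfer (cf. StackCacheBatch)
)

// segment stands in for one FixedStack-sized stack segment.
type segment struct{ _ [4096]byte }

// globalCache is the shared pool. The runtime keeps an intrusive linked list
// of batch nodes protected by a Lock; a slice of batches is the same idea.
var globalCache struct {
	mu      sync.Mutex
	batches [][]*segment
}

// perThreadCache mimics the per-M ring buffer of free segments.
type perThreadCache struct {
	ring [cacheSize]*segment
	pos  uint32 // next free slot in the ring
	cnt  uint32 // number of cached segments
}

func (c *perThreadCache) alloc() *segment {
	if c.cnt == 0 {
		c.refill()
	}
	c.pos = (c.pos - 1) % cacheSize // pop the most recently freed segment (LIFO)
	c.cnt--
	return c.ring[c.pos]
}

func (c *perThreadCache) free(s *segment) {
	if c.cnt == cacheSize {
		c.release() // cache full: hand the oldest batch back to the global pool
	}
	c.ring[c.pos] = s
	c.pos = (c.pos + 1) % cacheSize
	c.cnt++
}

// refill moves one batch from the global pool (or a fresh allocation) into the ring.
func (c *perThreadCache) refill() {
	globalCache.mu.Lock()
	var b []*segment
	if n := len(globalCache.batches); n > 0 {
		b = globalCache.batches[n-1]
		globalCache.batches = globalCache.batches[:n-1]
	}
	globalCache.mu.Unlock()
	if b == nil { // global pool empty: allocate a fresh batch
		b = make([]*segment, batchSize)
		for i := range b {
			b[i] = new(segment)
		}
	}
	for _, s := range b {
		c.ring[c.pos] = s
		c.pos = (c.pos + 1) % cacheSize
	}
	c.cnt += batchSize
}

// release moves the oldest batch from the ring back to the global pool.
func (c *perThreadCache) release() {
	b := make([]*segment, batchSize)
	start := (c.pos - c.cnt) % cacheSize // oldest entries leave first
	for i := range b {
		b[i] = c.ring[(start+uint32(i))%cacheSize]
	}
	c.cnt -= batchSize
	globalCache.mu.Lock()
	globalCache.batches = append(globalCache.batches, b)
	globalCache.mu.Unlock()
}

func main() {
	var c perThreadCache
	s := c.alloc() // first alloc refills from the (empty) global pool
	c.free(s)
	fmt.Println("cached segments:", c.cnt)
}
```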
@@ -748,9 +748,74 @@ runtime·cnew(Type *typ)
 	return ret;
 }
 
+typedef struct StackCacheNode StackCacheNode;
+struct StackCacheNode
+{
+	StackCacheNode *next;
+	void*	batch[StackCacheBatch-1];
+};
+
+static StackCacheNode *stackcache;
+static Lock stackcachemu;
+
+// stackcacherefill/stackcacherelease implement a global cache of stack segments.
+// The cache is required to prevent unlimited growth of per-thread caches.
+static void
+stackcacherefill(void)
+{
+	StackCacheNode *n;
+	int32 i, pos;
+
+	runtime·lock(&stackcachemu);
+	n = stackcache;
+	if(n)
+		stackcache = n->next;
+	runtime·unlock(&stackcachemu);
+	if(n == nil) {
+		n = (StackCacheNode*)runtime·SysAlloc(FixedStack*StackCacheBatch);
+		if(n == nil)
+			runtime·throw("out of memory (stackcacherefill)");
+		runtime·xadd64(&mstats.stacks_sys, FixedStack*StackCacheBatch);
+		for(i = 0; i < StackCacheBatch-1; i++)
+			n->batch[i] = (byte*)n + (i+1)*FixedStack;
+	}
+	pos = m->stackcachepos;
+	for(i = 0; i < StackCacheBatch-1; i++) {
+		m->stackcache[pos] = n->batch[i];
+		pos = (pos + 1) % StackCacheSize;
+	}
+	m->stackcache[pos] = n;
+	pos = (pos + 1) % StackCacheSize;
+	m->stackcachepos = pos;
+	m->stackcachecnt += StackCacheBatch;
+}
+
+static void
+stackcacherelease(void)
+{
+	StackCacheNode *n;
+	uint32 i, pos;
+
+	pos = (m->stackcachepos - m->stackcachecnt) % StackCacheSize;
+	n = (StackCacheNode*)m->stackcache[pos];
+	pos = (pos + 1) % StackCacheSize;
+	for(i = 0; i < StackCacheBatch-1; i++) {
+		n->batch[i] = m->stackcache[pos];
+		pos = (pos + 1) % StackCacheSize;
+	}
+	m->stackcachecnt -= StackCacheBatch;
+	runtime·lock(&stackcachemu);
+	n->next = stackcache;
+	stackcache = n;
+	runtime·unlock(&stackcachemu);
+}
+
 void*
 runtime·stackalloc(uint32 n)
 {
+	uint32 pos;
+	void *v;
+
 	// Stackalloc must be called on scheduler stack, so that we
 	// never try to grow the stack during the code that stackalloc runs.
 	// Doing so would cause a deadlock (issue 1547).

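A detail worth noting in stackcacherefill: when the global list is empty, a single SysAlloc block of FixedStack*StackCacheBatch bytes serves double duty. The start of the block holds the StackCacheNode header, whose batch array records pointers to segments 1 through StackCacheBatch-1; the block's own base address is then pushed into the ring as the final segment, so no memory is spent on bookkeeping. A small sketch of that address arithmetic, with an assumed segment size (FixedStack is platform-dependent):

```go
package main

import "fmt"

// Sketch of how one SysAlloc'd block is carved into stack segments.
// fixedStack and batch mirror FixedStack and StackCacheBatch; the real
// runtime works with raw pointers, here we only compute the offsets.
const (
	fixedStack = 4096 // assumed segment size for illustration
	batch      = 16
)

func main() {
	var base uintptr = 0x100000 // pretend this is what SysAlloc returned

	// The node header lives at the start of the block and records the
	// addresses of segments 1..batch-1.
	var segs [batch - 1]uintptr
	for i := 0; i < batch-1; i++ {
		segs[i] = base + uintptr(i+1)*fixedStack // (byte*)n + (i+1)*FixedStack
	}

	// The block's own base doubles as the last segment handed out: once its
	// pointers have been copied into the per-M ring, the header is no longer
	// needed and that memory is reused as a stack.
	fmt.Printf("header/segment 0 at %#x, last segment at %#x\n", base, segs[batch-2])
}
```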
@@ -769,7 +834,15 @@ runtime·stackalloc(uint32 n)
 			runtime·printf("stackalloc: in malloc, size=%d want %d", FixedStack, n);
 			runtime·throw("stackalloc");
 		}
-		return runtime·FixAlloc_Alloc(m->stackalloc);
+		if(m->stackcachecnt == 0)
+			stackcacherefill();
+		pos = m->stackcachepos;
+		pos = (pos - 1) % StackCacheSize;
+		v = m->stackcache[pos];
+		m->stackcachepos = pos;
+		m->stackcachecnt--;
+		m->stackinuse++;
+		return v;
 	}
 	return runtime·mallocgc(n, FlagNoProfiling|FlagNoGC, 0, 0);
 }

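The fast path pops with pos = (pos - 1) % StackCacheSize on a uint32. When pos is 0 this relies on unsigned wraparound together with StackCacheSize being a power of two that divides 2^32, so the result is 31 rather than garbage; stackcacherelease's (stackcachepos - stackcachecnt) % StackCacheSize leans on the same property. A quick check of that arithmetic in Go:

```go
package main

import "fmt"

const stackCacheSize = 32 // must be a power of two for the wraparound trick to hold

func main() {
	var pos uint32 = 0
	pos = (pos - 1) % stackCacheSize // 0 - 1 wraps to 0xFFFFFFFF
	fmt.Println(pos)                 // prints 31, because 2^32 is a multiple of 32

	// The same reasoning covers (pos - cnt) % stackCacheSize in the release path.
	var cnt uint32 = 32
	fmt.Println((uint32(5) - cnt) % stackCacheSize) // prints 5
}
```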
@@ -777,8 +850,16 @@ runtime·stackalloc(uint32 n)
 void
 runtime·stackfree(void *v, uintptr n)
 {
+	uint32 pos;
+
 	if(m->mallocing || m->gcing || n == FixedStack) {
-		runtime·FixAlloc_Free(m->stackalloc, v);
+		if(m->stackcachecnt == StackCacheSize)
+			stackcacherelease();
+		pos = m->stackcachepos;
+		m->stackcache[pos] = v;
+		m->stackcachepos = (pos + 1) % StackCacheSize;
+		m->stackcachecnt++;
+		m->stackinuse--;
 		return;
 	}
 	runtime·free(v);

@@ -1176,18 +1176,15 @@ cachestats(GCStats *stats)
 	MCache *c;
 	int32 i;
 	uint64 stacks_inuse;
-	uint64 stacks_sys;
 	uint64 *src, *dst;
 
 	if(stats)
 		runtime·memclr((byte*)stats, sizeof(*stats));
 	stacks_inuse = 0;
-	stacks_sys = 0;
 	for(mp=runtime·allm; mp; mp=mp->alllink) {
 		c = mp->mcache;
 		runtime·purgecachedstats(c);
-		stacks_inuse += mp->stackalloc->inuse;
-		stacks_sys += mp->stackalloc->sys;
+		stacks_inuse += mp->stackinuse*FixedStack;
 		if(stats) {
 			src = (uint64*)&mp->gcstats;
 			dst = (uint64*)stats;

@@ -1203,7 +1200,6 @@ cachestats(GCStats *stats)
 		}
 	}
 	mstats.stacks_inuse = stacks_inuse;
-	mstats.stacks_sys = stacks_sys;
 }
 
 // Structure of arguments passed to function gc().

@@ -345,8 +345,6 @@ mcommoninit(M *mp)
 {
 	mp->id = runtime·sched.mcount++;
 	mp->fastrand = 0x49f6428aUL + mp->id + runtime·cputicks();
-	mp->stackalloc = runtime·malloc(sizeof(*mp->stackalloc));
-	runtime·FixAlloc_Init(mp->stackalloc, FixedStack, runtime·SysAlloc, nil, nil);
 
 	if(mp->mcache == nil)
 		mp->mcache = runtime·allocmcache();

@@ -53,7 +53,7 @@ func stackGrowthRecursive(i int) {
 	}
 }
 
-func BenchmarkStackGrowth(b *testing.B) {
+func benchmarkStackGrowth(b *testing.B, rec int) {
 	const CallsPerSched = 1000
 	procs := runtime.GOMAXPROCS(-1)
 	N := int32(b.N / CallsPerSched)

@@ -63,7 +63,7 @@ func BenchmarkStackGrowth(b *testing.B) {
 			for atomic.AddInt32(&N, -1) >= 0 {
 				runtime.Gosched()
 				for g := 0; g < CallsPerSched; g++ {
-					stackGrowthRecursive(10)
+					stackGrowthRecursive(rec)
 				}
 			}
 			c <- true

@@ -74,6 +74,14 @@ func BenchmarkStackGrowth(b *testing.B) {
 	}
 }
 
+func BenchmarkStackGrowth(b *testing.B) {
+	benchmarkStackGrowth(b, 10)
+}
+
+func BenchmarkStackGrowthDeep(b *testing.B) {
+	benchmarkStackGrowth(b, 1024)
+}
+
 func BenchmarkSyscall(b *testing.B) {
 	const CallsPerSched = 1000
 	procs := runtime.GOMAXPROCS(-1)

@@ -128,6 +128,13 @@ enum
 {
 	PtrSize = sizeof(void*),
 };
+enum
+{
+	// Per-M stack segment cache size.
+	StackCacheSize = 32,
+	// Global <-> per-M stack segment cache transfer batch size.
+	StackCacheBatch = 16,
+};
 
 /*
  * structures

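The constants are sized so that one batch is half the per-M capacity. After a refill (or a release) a thread must drift by roughly StackCacheBatch net operations in the opposite direction before it touches stackcachemu again, so a strictly alternating alloc/free pattern stays entirely thread-local after the first refill. A throwaway Go simulation of just the counter, under that reading of the constants:

```go
package main

import "fmt"

const (
	stackCacheSize  = 32 // per-M capacity (StackCacheSize)
	stackCacheBatch = 16 // segments per global transfer (StackCacheBatch)
)

func main() {
	cnt, refills, releases := 0, 0, 0

	alloc := func() {
		if cnt == 0 {
			cnt += stackCacheBatch // stackcacherefill
			refills++
		}
		cnt--
	}
	free := func() {
		if cnt == stackCacheSize {
			cnt -= stackCacheBatch // stackcacherelease
			releases++
		}
		cnt++
	}

	// Worst case for a naive cache: strictly alternating alloc/free.
	for i := 0; i < 1000; i++ {
		alloc()
		free()
	}
	fmt.Println(cnt, refills, releases) // 16 1 0: one refill, no releases

	// Bursty pattern: allocate 40 segments, then free them all.
	for i := 0; i < 40; i++ {
		alloc()
	}
	for i := 0; i < 40; i++ {
		free()
	}
	fmt.Println(cnt, refills, releases)
}
```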
@@ -262,7 +269,10 @@ struct M
 	M*	schedlink;
 	uint32	machport;	// Return address for Mach IPC (OS X)
 	MCache	*mcache;
-	FixAlloc	*stackalloc;
+	int32	stackinuse;
+	uint32	stackcachepos;
+	uint32	stackcachecnt;
+	void*	stackcache[StackCacheSize];
 	G*	lockedg;
 	G*	idleg;
 	uintptr	createstack[32];	// Stack that created this thread.

@@ -34,6 +34,7 @@ package runtime_test
 import (
 	. "runtime"
 	"testing"
+	"time"
 	"unsafe"
 )
 

@@ -1526,3 +1527,51 @@ func stack4988() (uintptr, uintptr) { var buf [4988]byte; use(buf[:]); return Stackguard() }
 func stack4992() (uintptr, uintptr) { var buf [4992]byte; use(buf[:]); return Stackguard() }
 func stack4996() (uintptr, uintptr) { var buf [4996]byte; use(buf[:]); return Stackguard() }
 func stack5000() (uintptr, uintptr) { var buf [5000]byte; use(buf[:]); return Stackguard() }
+
+// TestStackMem measures per-thread stack segment cache behavior.
+// The test consumed up to 500MB in the past.
+func TestStackMem(t *testing.T) {
+	const (
+		BatchSize      = 32
+		BatchCount     = 512
+		ArraySize      = 1024
+		RecursionDepth = 128
+	)
+	if testing.Short() {
+		return
+	}
+	defer GOMAXPROCS(GOMAXPROCS(BatchSize))
+	s0 := new(MemStats)
+	ReadMemStats(s0)
+	for b := 0; b < BatchCount; b++ {
+		c := make(chan bool, BatchSize)
+		for i := 0; i < BatchSize; i++ {
+			go func() {
+				var f func(k int, a [ArraySize]byte)
+				f = func(k int, a [ArraySize]byte) {
+					if k == 0 {
+						time.Sleep(time.Millisecond)
+						return
+					}
+					f(k-1, a)
+				}
+				f(RecursionDepth, [ArraySize]byte{})
+				c <- true
+			}()
+		}
+		for i := 0; i < BatchSize; i++ {
+			<-c
+		}
+	}
+	s1 := new(MemStats)
+	ReadMemStats(s1)
+	consumed := s1.StackSys - s0.StackSys
+	t.Logf("Consumed %vMB for stack mem", consumed>>20)
+	estimate := uint64(8 * BatchSize * ArraySize * RecursionDepth) // 8 is to reduce flakiness.
+	if consumed > estimate {
+		t.Fatalf("Stack mem: want %v, got %v", estimate, consumed)
+	}
+	if s1.StackInuse > 1<<20 {
+		t.Fatalf("Stack inuse: want %v, got %v", 1<<20, s1.StackInuse)
+	}
+}

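TestStackMem drives its check off MemStats.StackSys (stack memory obtained from the OS) and MemStats.StackInuse (stack memory currently in use); the calls appear unqualified only because of the dot-import of "runtime" in this test file. For reference, the same measurement from ordinary user code might look like the sketch below; the goroutine workload is a placeholder, and on modern Go (contiguous rather than segmented stacks) the numbers behave differently than in the workload this commit targets.

```go
package main

import (
	"fmt"
	"runtime"
	"sync"
)

func main() {
	var before, after runtime.MemStats
	runtime.ReadMemStats(&before)

	// Placeholder workload: spawn goroutines that grow their stacks a bit.
	var wg sync.WaitGroup
	for i := 0; i < 64; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			var buf [64 << 10]byte // force some stack growth
			_ = buf
		}()
	}
	wg.Wait()

	runtime.ReadMemStats(&after)
	fmt.Printf("StackSys grew by %d bytes, StackInuse is now %d bytes\n",
		after.StackSys-before.StackSys, after.StackInuse)
}
```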