go/src/runtime/slice.go

281 lines
8 KiB
Go
Raw Normal View History

// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package runtime
import (
"runtime/internal/math"
"runtime/internal/sys"
"unsafe"
)
type slice struct {
array unsafe.Pointer
len int
cap int
}
// A notInHeapSlice is a slice backed by go:notinheap memory.
type notInHeapSlice struct {
array *notInHeap
len int
cap int
}
func panicmakeslicelen() {
panic(errorString("makeslice: len out of range"))
}
func panicmakeslicecap() {
panic(errorString("makeslice: cap out of range"))
}
cmd/compile: optimize make+copy pattern to avoid memclr match: m = make([]T, x); copy(m, s) for pointer free T and x==len(s) rewrite to: m = mallocgc(x*elemsize(T), nil, false); memmove(&m, &s, x*elemsize(T)) otherwise rewrite to: m = makeslicecopy([]T, x, s) This avoids memclear and shading of pointers in the newly created slice before the copy. With this CL "s" is only be allowed to bev a variable and not a more complex expression. This restriction could be lifted in future versions of this optimization when it can be proven that "s" is not referencing "m". Triggers 450 times during make.bash.. Reduces go binary size by ~8 kbyte. name old time/op new time/op delta MakeSliceCopy/mallocmove/Byte 71.1ns ± 1% 65.8ns ± 0% -7.49% (p=0.000 n=10+9) MakeSliceCopy/mallocmove/Int 71.2ns ± 1% 66.0ns ± 0% -7.27% (p=0.000 n=10+8) MakeSliceCopy/mallocmove/Ptr 104ns ± 4% 99ns ± 1% -5.13% (p=0.000 n=10+10) MakeSliceCopy/makecopy/Byte 70.3ns ± 0% 68.0ns ± 0% -3.22% (p=0.000 n=10+9) MakeSliceCopy/makecopy/Int 70.3ns ± 0% 68.5ns ± 1% -2.59% (p=0.000 n=9+10) MakeSliceCopy/makecopy/Ptr 102ns ± 0% 99ns ± 1% -2.97% (p=0.000 n=9+9) MakeSliceCopy/nilappend/Byte 75.4ns ± 0% 74.9ns ± 2% -0.63% (p=0.015 n=9+9) MakeSliceCopy/nilappend/Int 75.6ns ± 0% 76.4ns ± 3% ~ (p=0.245 n=9+10) MakeSliceCopy/nilappend/Ptr 107ns ± 0% 108ns ± 1% +0.93% (p=0.005 n=9+10) Fixes #26252 Change-Id: Iec553dd1fef6ded16197216a472351c8799a8e71 Reviewed-on: https://go-review.googlesource.com/c/go/+/146719 Reviewed-by: Keith Randall <khr@golang.org> Run-TryBot: Martin Möhrmann <moehrmann@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2018-10-23 13:50:07 +02:00
// makeslicecopy allocates a slice of "tolen" elements of type "et",
// then copies "fromlen" elements of type "et" into that new allocation from "from".
func makeslicecopy(et *_type, tolen int, fromlen int, from unsafe.Pointer) unsafe.Pointer {
var tomem, copymem uintptr
if uintptr(tolen) > uintptr(fromlen) {
var overflow bool
tomem, overflow = math.MulUintptr(et.size, uintptr(tolen))
if overflow || tomem > maxAlloc || tolen < 0 {
panicmakeslicelen()
}
copymem = et.size * uintptr(fromlen)
} else {
// fromlen is a known good length providing and equal or greater than tolen,
// thereby making tolen a good slice length too as from and to slices have the
// same element width.
tomem = et.size * uintptr(tolen)
copymem = tomem
}
var to unsafe.Pointer
if et.ptrdata == 0 {
to = mallocgc(tomem, nil, false)
if copymem < tomem {
memclrNoHeapPointers(add(to, copymem), tomem-copymem)
}
} else {
// Note: can't use rawmem (which avoids zeroing of memory), because then GC can scan uninitialized memory.
to = mallocgc(tomem, et, true)
if copymem > 0 && writeBarrier.enabled {
cmd/compile: optimize make+copy pattern to avoid memclr match: m = make([]T, x); copy(m, s) for pointer free T and x==len(s) rewrite to: m = mallocgc(x*elemsize(T), nil, false); memmove(&m, &s, x*elemsize(T)) otherwise rewrite to: m = makeslicecopy([]T, x, s) This avoids memclear and shading of pointers in the newly created slice before the copy. With this CL "s" is only be allowed to bev a variable and not a more complex expression. This restriction could be lifted in future versions of this optimization when it can be proven that "s" is not referencing "m". Triggers 450 times during make.bash.. Reduces go binary size by ~8 kbyte. name old time/op new time/op delta MakeSliceCopy/mallocmove/Byte 71.1ns ± 1% 65.8ns ± 0% -7.49% (p=0.000 n=10+9) MakeSliceCopy/mallocmove/Int 71.2ns ± 1% 66.0ns ± 0% -7.27% (p=0.000 n=10+8) MakeSliceCopy/mallocmove/Ptr 104ns ± 4% 99ns ± 1% -5.13% (p=0.000 n=10+10) MakeSliceCopy/makecopy/Byte 70.3ns ± 0% 68.0ns ± 0% -3.22% (p=0.000 n=10+9) MakeSliceCopy/makecopy/Int 70.3ns ± 0% 68.5ns ± 1% -2.59% (p=0.000 n=9+10) MakeSliceCopy/makecopy/Ptr 102ns ± 0% 99ns ± 1% -2.97% (p=0.000 n=9+9) MakeSliceCopy/nilappend/Byte 75.4ns ± 0% 74.9ns ± 2% -0.63% (p=0.015 n=9+9) MakeSliceCopy/nilappend/Int 75.6ns ± 0% 76.4ns ± 3% ~ (p=0.245 n=9+10) MakeSliceCopy/nilappend/Ptr 107ns ± 0% 108ns ± 1% +0.93% (p=0.005 n=9+10) Fixes #26252 Change-Id: Iec553dd1fef6ded16197216a472351c8799a8e71 Reviewed-on: https://go-review.googlesource.com/c/go/+/146719 Reviewed-by: Keith Randall <khr@golang.org> Run-TryBot: Martin Möhrmann <moehrmann@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2018-10-23 13:50:07 +02:00
// Only shade the pointers in old.array since we know the destination slice to
// only contains nil pointers because it has been cleared during alloc.
bulkBarrierPreWriteSrcOnly(uintptr(to), uintptr(from), copymem)
}
}
if raceenabled {
callerpc := getcallerpc()
pc := funcPC(makeslicecopy)
racereadrangepc(from, copymem, callerpc, pc)
}
if msanenabled {
msanread(from, copymem)
}
memmove(to, from, copymem)
return to
}
func makeslice(et *_type, len, cap int) unsafe.Pointer {
mem, overflow := math.MulUintptr(et.size, uintptr(cap))
if overflow || mem > maxAlloc || len < 0 || len > cap {
// NOTE: Produce a 'len out of range' error instead of a
// 'cap out of range' error when someone does make([]T, bignumber).
// 'cap out of range' is true too, but since the cap is only being
// supplied implicitly, saying len is clearer.
// See golang.org/issue/4085.
mem, overflow := math.MulUintptr(et.size, uintptr(len))
if overflow || mem > maxAlloc || len < 0 {
panicmakeslicelen()
}
panicmakeslicecap()
}
return mallocgc(mem, et, true)
}
func makeslice64(et *_type, len64, cap64 int64) unsafe.Pointer {
len := int(len64)
if int64(len) != len64 {
panicmakeslicelen()
}
cap := int(cap64)
if int64(cap) != cap64 {
panicmakeslicecap()
}
return makeslice(et, len, cap)
}
// growslice handles slice growth during append.
// It is passed the slice element type, the old slice, and the desired new minimum capacity,
// and it returns a new slice with at least that capacity, with the old data
// copied into it.
cmd/compile: avoid a spill in append fast path Instead of spilling newlen, recalculate it. This removes a spill from the fast path, at the cost of a cheap recalculation on the (rare) growth path. This uses 8 bytes less of stack space. It generates two more bytes of code, but that is due to suboptimal register allocation; see far below. Runtime append microbenchmarks are all over the map, presumably due to incidental code movement. Sample code: func s(b []byte) []byte { b = append(b, 1, 2, 3) return b } Before: "".s t=1 size=160 args=0x30 locals=0x48 0x0000 00000 (append.go:8) TEXT "".s(SB), $72-48 0x0000 00000 (append.go:8) MOVQ (TLS), CX 0x0009 00009 (append.go:8) CMPQ SP, 16(CX) 0x000d 00013 (append.go:8) JLS 149 0x0013 00019 (append.go:8) SUBQ $72, SP 0x0017 00023 (append.go:8) FUNCDATA $0, gclocals·6432f8c6a0d23fa7bee6c5d96f21a92a(SB) 0x0017 00023 (append.go:8) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB) 0x0017 00023 (append.go:9) MOVQ "".b+88(FP), CX 0x001c 00028 (append.go:9) LEAQ 3(CX), DX 0x0020 00032 (append.go:9) MOVQ DX, "".autotmp_0+64(SP) 0x0025 00037 (append.go:9) MOVQ "".b+96(FP), BX 0x002a 00042 (append.go:9) CMPQ DX, BX 0x002d 00045 (append.go:9) JGT $0, 86 0x002f 00047 (append.go:8) MOVQ "".b+80(FP), AX 0x0034 00052 (append.go:9) MOVB $1, (AX)(CX*1) 0x0038 00056 (append.go:9) MOVB $2, 1(AX)(CX*1) 0x003d 00061 (append.go:9) MOVB $3, 2(AX)(CX*1) 0x0042 00066 (append.go:10) MOVQ AX, "".~r1+104(FP) 0x0047 00071 (append.go:10) MOVQ DX, "".~r1+112(FP) 0x004c 00076 (append.go:10) MOVQ BX, "".~r1+120(FP) 0x0051 00081 (append.go:10) ADDQ $72, SP 0x0055 00085 (append.go:10) RET 0x0056 00086 (append.go:9) LEAQ type.[]uint8(SB), AX 0x005d 00093 (append.go:9) MOVQ AX, (SP) 0x0061 00097 (append.go:9) MOVQ "".b+80(FP), BP 0x0066 00102 (append.go:9) MOVQ BP, 8(SP) 0x006b 00107 (append.go:9) MOVQ CX, 16(SP) 0x0070 00112 (append.go:9) MOVQ BX, 24(SP) 0x0075 00117 (append.go:9) MOVQ DX, 32(SP) 0x007a 00122 (append.go:9) PCDATA $0, $0 0x007a 00122 (append.go:9) CALL runtime.growslice(SB) 0x007f 00127 (append.go:9) MOVQ 40(SP), AX 0x0084 00132 (append.go:9) MOVQ 56(SP), BX 0x0089 00137 (append.go:8) MOVQ "".b+88(FP), CX 0x008e 00142 (append.go:9) MOVQ "".autotmp_0+64(SP), DX 0x0093 00147 (append.go:9) JMP 52 0x0095 00149 (append.go:9) NOP 0x0095 00149 (append.go:8) CALL runtime.morestack_noctxt(SB) 0x009a 00154 (append.go:8) JMP 0 After: "".s t=1 size=176 args=0x30 locals=0x40 0x0000 00000 (append.go:8) TEXT "".s(SB), $64-48 0x0000 00000 (append.go:8) MOVQ (TLS), CX 0x0009 00009 (append.go:8) CMPQ SP, 16(CX) 0x000d 00013 (append.go:8) JLS 151 0x0013 00019 (append.go:8) SUBQ $64, SP 0x0017 00023 (append.go:8) FUNCDATA $0, gclocals·6432f8c6a0d23fa7bee6c5d96f21a92a(SB) 0x0017 00023 (append.go:8) FUNCDATA $1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB) 0x0017 00023 (append.go:9) MOVQ "".b+80(FP), CX 0x001c 00028 (append.go:9) LEAQ 3(CX), DX 0x0020 00032 (append.go:9) MOVQ "".b+88(FP), BX 0x0025 00037 (append.go:9) CMPQ DX, BX 0x0028 00040 (append.go:9) JGT $0, 81 0x002a 00042 (append.go:8) MOVQ "".b+72(FP), AX 0x002f 00047 (append.go:9) MOVB $1, (AX)(CX*1) 0x0033 00051 (append.go:9) MOVB $2, 1(AX)(CX*1) 0x0038 00056 (append.go:9) MOVB $3, 2(AX)(CX*1) 0x003d 00061 (append.go:10) MOVQ AX, "".~r1+96(FP) 0x0042 00066 (append.go:10) MOVQ DX, "".~r1+104(FP) 0x0047 00071 (append.go:10) MOVQ BX, "".~r1+112(FP) 0x004c 00076 (append.go:10) ADDQ $64, SP 0x0050 00080 (append.go:10) RET 0x0051 00081 (append.go:9) LEAQ type.[]uint8(SB), AX 0x0058 00088 (append.go:9) MOVQ AX, (SP) 0x005c 00092 (append.go:9) MOVQ "".b+72(FP), BP 0x0061 00097 (append.go:9) MOVQ BP, 8(SP) 0x0066 00102 (append.go:9) MOVQ CX, 16(SP) 0x006b 00107 (append.go:9) MOVQ BX, 24(SP) 0x0070 00112 (append.go:9) MOVQ DX, 32(SP) 0x0075 00117 (append.go:9) PCDATA $0, $0 0x0075 00117 (append.go:9) CALL runtime.growslice(SB) 0x007a 00122 (append.go:9) MOVQ 40(SP), AX 0x007f 00127 (append.go:9) MOVQ 48(SP), CX 0x0084 00132 (append.go:9) MOVQ 56(SP), BX 0x0089 00137 (append.go:9) ADDQ $3, CX 0x008d 00141 (append.go:9) MOVQ CX, DX 0x0090 00144 (append.go:8) MOVQ "".b+80(FP), CX 0x0095 00149 (append.go:9) JMP 47 0x0097 00151 (append.go:9) NOP 0x0097 00151 (append.go:8) CALL runtime.morestack_noctxt(SB) 0x009c 00156 (append.go:8) JMP 0 Observe that in the following sequence, we should use DX directly instead of using CX as a temporary register, which would make the new code a strict improvement on the old: 0x007f 00127 (append.go:9) MOVQ 48(SP), CX 0x0084 00132 (append.go:9) MOVQ 56(SP), BX 0x0089 00137 (append.go:9) ADDQ $3, CX 0x008d 00141 (append.go:9) MOVQ CX, DX 0x0090 00144 (append.go:8) MOVQ "".b+80(FP), CX Change-Id: I4ee50b18fa53865901d2d7f86c2cbb54c6fa6924 Reviewed-on: https://go-review.googlesource.com/21812 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2016-04-10 09:08:00 -07:00
// The new slice's length is set to the old slice's length,
// NOT to the new requested capacity.
// This is for codegen convenience. The old slice's length is used immediately
// to calculate where to write new values during an append.
// TODO: When the old backend is gone, reconsider this decision.
// The SSA backend might prefer the new length or to return only ptr/cap and save stack space.
func growslice(et *_type, old slice, cap int) slice {
if raceenabled {
callerpc := getcallerpc()
racereadrangepc(old.array, uintptr(old.len*int(et.size)), callerpc, funcPC(growslice))
}
if msanenabled {
msanread(old.array, uintptr(old.len*int(et.size)))
}
if cap < old.cap {
panic(errorString("growslice: cap out of range"))
}
if et.size == 0 {
// append should not create a slice with nil pointer but non-zero len.
// We assume that append doesn't need to preserve old.array in this case.
return slice{unsafe.Pointer(&zerobase), old.len, cap}
}
newcap := old.cap
doublecap := newcap + newcap
if cap > doublecap {
newcap = cap
} else {
if old.len < 1024 {
newcap = doublecap
} else {
// Check 0 < newcap to detect overflow
// and prevent an infinite loop.
for 0 < newcap && newcap < cap {
newcap += newcap / 4
}
// Set newcap to the requested cap when
// the newcap calculation overflowed.
if newcap <= 0 {
newcap = cap
}
}
}
var overflow bool
runtime: make append only clear uncopied memory Also add a benchmark that shows off the new behavior. The existing benchmarks reuse the same slice, and thus don't ever have to clear memory. Running the Append|Grow benchmarks in runtime: name old time/op new time/op delta AppendSliceLarge/1024Bytes-12 265ns ± 1% 265ns ± 3% ~ (p=0.524 n=17+20) AppendSliceLarge/4096Bytes-12 807ns ± 3% 772ns ± 1% -4.38% (p=0.000 n=20+20) AppendSliceLarge/16384Bytes-12 3.20µs ± 4% 2.82µs ± 4% -11.93% (p=0.000 n=19+20) AppendSliceLarge/65536Bytes-12 13.0µs ± 4% 11.0µs ± 3% -15.22% (p=0.000 n=20+20) AppendSliceLarge/262144Bytes-12 62.7µs ± 1% 51.6µs ± 1% -17.67% (p=0.000 n=19+20) AppendSliceLarge/1048576Bytes-12 337µs ± 3% 289µs ± 3% -14.36% (p=0.000 n=20+20) GrowSliceBytes-12 31.2ns ± 4% 31.4ns ±11% ~ (p=0.308 n=19+18) GrowSliceInts-12 53.4ns ±14% 45.0ns ± 6% -15.74% (p=0.000 n=20+19) GrowSlicePtr-12 87.0ns ± 3% 83.3ns ± 3% -4.26% (p=0.000 n=18+17) GrowSliceStruct24Bytes-12 88.9ns ± 5% 77.8ns ± 2% -12.45% (p=0.000 n=20+19) Append-12 17.2ns ± 1% 17.3ns ± 2% ~ (p=0.464 n=18+17) AppendGrowByte-12 2.28ms ± 1% 1.92ms ± 2% -15.65% (p=0.000 n=20+18) AppendGrowString-12 255ms ± 3% 253ms ± 4% ~ (p=0.065 n=19+19) AppendSlice/1Bytes-12 3.13ns ± 0% 3.11ns ± 1% -0.65% (p=0.000 n=17+18) AppendSlice/4Bytes-12 3.02ns ± 2% 3.11ns ± 1% +3.27% (p=0.000 n=18+17) AppendSlice/7Bytes-12 4.14ns ± 3% 4.13ns ± 2% ~ (p=0.380 n=19+18) AppendSlice/8Bytes-12 3.74ns ± 3% 3.68ns ± 1% -1.76% (p=0.000 n=19+18) AppendSlice/15Bytes-12 4.03ns ± 2% 4.04ns ± 2% ~ (p=0.261 n=19+20) AppendSlice/16Bytes-12 4.03ns ± 2% 4.03ns ± 0% ~ (p=0.062 n=18+17) AppendSlice/32Bytes-12 3.23ns ± 4% 3.43ns ± 1% +6.10% (p=0.000 n=17+18) AppendStr/1Bytes-12 3.51ns ± 1% 3.52ns ± 1% ~ (p=0.321 n=18+19) AppendStr/4Bytes-12 3.46ns ± 1% 3.46ns ± 1% ~ (p=0.977 n=18+20) AppendStr/8Bytes-12 3.18ns ± 1% 3.19ns ± 1% ~ (p=0.650 n=16+17) AppendStr/16Bytes-12 6.08ns ±27% 5.52ns ± 3% -9.16% (p=0.002 n=18+19) AppendStr/32Bytes-12 3.71ns ± 1% 3.53ns ± 1% -4.73% (p=0.000 n=20+19) AppendSpecialCase-12 17.7ns ± 1% 17.8ns ± 3% +0.86% (p=0.045 n=17+18) AppendInPlace/NoGrow/Byte-12 375ns ± 1% 376ns ± 1% +0.35% (p=0.021 n=20+18) AppendInPlace/NoGrow/1Ptr-12 1.01µs ± 1% 1.10µs ± 1% +9.28% (p=0.000 n=18+20) AppendInPlace/NoGrow/2Ptr-12 1.85µs ± 2% 1.71µs ± 1% -7.51% (p=0.000 n=19+18) AppendInPlace/NoGrow/3Ptr-12 2.57µs ± 2% 2.44µs ± 1% -5.08% (p=0.000 n=19+19) AppendInPlace/NoGrow/4Ptr-12 3.52µs ± 2% 3.35µs ± 2% -4.70% (p=0.000 n=20+19) AppendInPlace/Grow/Byte-12 212ns ± 1% 217ns ± 8% +2.57% (p=0.000 n=20+20) AppendInPlace/Grow/1Ptr-12 214ns ± 2% 217ns ± 3% +1.23% (p=0.001 n=18+19) AppendInPlace/Grow/2Ptr-12 298ns ± 2% 300ns ± 2% +0.55% (p=0.038 n=19+20) AppendInPlace/Grow/3Ptr-12 367ns ± 2% 366ns ± 2% ~ (p=0.452 n=20+18) AppendInPlace/Grow/4Ptr-12 416ns ± 2% 411ns ± 2% -1.18% (p=0.000 n=20+19) StackGrowth-12 43.4ns ± 1% 43.4ns ± 0% ~ (p=1.000 n=16+16) StackGrowthDeep-12 11.4µs ± 4% 10.3µs ± 4% -9.65% (p=0.000 n=20+19) Change-Id: I69a8afbd942c787c591d95b9d9439bd6db4d1e49 Reviewed-on: https://go-review.googlesource.com/30192 Reviewed-by: Ian Lance Taylor <iant@golang.org> Run-TryBot: Ian Lance Taylor <iant@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org>
2016-10-03 12:45:12 -07:00
var lenmem, newlenmem, capmem uintptr
// Specialize for common values of et.size.
// For 1 we don't need any division/multiplication.
// For sys.PtrSize, compiler will optimize division/multiplication into a shift by a constant.
// For powers of 2, use a variable shift.
switch {
case et.size == 1:
lenmem = uintptr(old.len)
runtime: make append only clear uncopied memory Also add a benchmark that shows off the new behavior. The existing benchmarks reuse the same slice, and thus don't ever have to clear memory. Running the Append|Grow benchmarks in runtime: name old time/op new time/op delta AppendSliceLarge/1024Bytes-12 265ns ± 1% 265ns ± 3% ~ (p=0.524 n=17+20) AppendSliceLarge/4096Bytes-12 807ns ± 3% 772ns ± 1% -4.38% (p=0.000 n=20+20) AppendSliceLarge/16384Bytes-12 3.20µs ± 4% 2.82µs ± 4% -11.93% (p=0.000 n=19+20) AppendSliceLarge/65536Bytes-12 13.0µs ± 4% 11.0µs ± 3% -15.22% (p=0.000 n=20+20) AppendSliceLarge/262144Bytes-12 62.7µs ± 1% 51.6µs ± 1% -17.67% (p=0.000 n=19+20) AppendSliceLarge/1048576Bytes-12 337µs ± 3% 289µs ± 3% -14.36% (p=0.000 n=20+20) GrowSliceBytes-12 31.2ns ± 4% 31.4ns ±11% ~ (p=0.308 n=19+18) GrowSliceInts-12 53.4ns ±14% 45.0ns ± 6% -15.74% (p=0.000 n=20+19) GrowSlicePtr-12 87.0ns ± 3% 83.3ns ± 3% -4.26% (p=0.000 n=18+17) GrowSliceStruct24Bytes-12 88.9ns ± 5% 77.8ns ± 2% -12.45% (p=0.000 n=20+19) Append-12 17.2ns ± 1% 17.3ns ± 2% ~ (p=0.464 n=18+17) AppendGrowByte-12 2.28ms ± 1% 1.92ms ± 2% -15.65% (p=0.000 n=20+18) AppendGrowString-12 255ms ± 3% 253ms ± 4% ~ (p=0.065 n=19+19) AppendSlice/1Bytes-12 3.13ns ± 0% 3.11ns ± 1% -0.65% (p=0.000 n=17+18) AppendSlice/4Bytes-12 3.02ns ± 2% 3.11ns ± 1% +3.27% (p=0.000 n=18+17) AppendSlice/7Bytes-12 4.14ns ± 3% 4.13ns ± 2% ~ (p=0.380 n=19+18) AppendSlice/8Bytes-12 3.74ns ± 3% 3.68ns ± 1% -1.76% (p=0.000 n=19+18) AppendSlice/15Bytes-12 4.03ns ± 2% 4.04ns ± 2% ~ (p=0.261 n=19+20) AppendSlice/16Bytes-12 4.03ns ± 2% 4.03ns ± 0% ~ (p=0.062 n=18+17) AppendSlice/32Bytes-12 3.23ns ± 4% 3.43ns ± 1% +6.10% (p=0.000 n=17+18) AppendStr/1Bytes-12 3.51ns ± 1% 3.52ns ± 1% ~ (p=0.321 n=18+19) AppendStr/4Bytes-12 3.46ns ± 1% 3.46ns ± 1% ~ (p=0.977 n=18+20) AppendStr/8Bytes-12 3.18ns ± 1% 3.19ns ± 1% ~ (p=0.650 n=16+17) AppendStr/16Bytes-12 6.08ns ±27% 5.52ns ± 3% -9.16% (p=0.002 n=18+19) AppendStr/32Bytes-12 3.71ns ± 1% 3.53ns ± 1% -4.73% (p=0.000 n=20+19) AppendSpecialCase-12 17.7ns ± 1% 17.8ns ± 3% +0.86% (p=0.045 n=17+18) AppendInPlace/NoGrow/Byte-12 375ns ± 1% 376ns ± 1% +0.35% (p=0.021 n=20+18) AppendInPlace/NoGrow/1Ptr-12 1.01µs ± 1% 1.10µs ± 1% +9.28% (p=0.000 n=18+20) AppendInPlace/NoGrow/2Ptr-12 1.85µs ± 2% 1.71µs ± 1% -7.51% (p=0.000 n=19+18) AppendInPlace/NoGrow/3Ptr-12 2.57µs ± 2% 2.44µs ± 1% -5.08% (p=0.000 n=19+19) AppendInPlace/NoGrow/4Ptr-12 3.52µs ± 2% 3.35µs ± 2% -4.70% (p=0.000 n=20+19) AppendInPlace/Grow/Byte-12 212ns ± 1% 217ns ± 8% +2.57% (p=0.000 n=20+20) AppendInPlace/Grow/1Ptr-12 214ns ± 2% 217ns ± 3% +1.23% (p=0.001 n=18+19) AppendInPlace/Grow/2Ptr-12 298ns ± 2% 300ns ± 2% +0.55% (p=0.038 n=19+20) AppendInPlace/Grow/3Ptr-12 367ns ± 2% 366ns ± 2% ~ (p=0.452 n=20+18) AppendInPlace/Grow/4Ptr-12 416ns ± 2% 411ns ± 2% -1.18% (p=0.000 n=20+19) StackGrowth-12 43.4ns ± 1% 43.4ns ± 0% ~ (p=1.000 n=16+16) StackGrowthDeep-12 11.4µs ± 4% 10.3µs ± 4% -9.65% (p=0.000 n=20+19) Change-Id: I69a8afbd942c787c591d95b9d9439bd6db4d1e49 Reviewed-on: https://go-review.googlesource.com/30192 Reviewed-by: Ian Lance Taylor <iant@golang.org> Run-TryBot: Ian Lance Taylor <iant@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org>
2016-10-03 12:45:12 -07:00
newlenmem = uintptr(cap)
capmem = roundupsize(uintptr(newcap))
overflow = uintptr(newcap) > maxAlloc
newcap = int(capmem)
case et.size == sys.PtrSize:
lenmem = uintptr(old.len) * sys.PtrSize
newlenmem = uintptr(cap) * sys.PtrSize
capmem = roundupsize(uintptr(newcap) * sys.PtrSize)
overflow = uintptr(newcap) > maxAlloc/sys.PtrSize
newcap = int(capmem / sys.PtrSize)
case isPowerOfTwo(et.size):
var shift uintptr
if sys.PtrSize == 8 {
// Mask shift for better code generation.
shift = uintptr(sys.Ctz64(uint64(et.size))) & 63
} else {
shift = uintptr(sys.Ctz32(uint32(et.size))) & 31
}
lenmem = uintptr(old.len) << shift
newlenmem = uintptr(cap) << shift
capmem = roundupsize(uintptr(newcap) << shift)
overflow = uintptr(newcap) > (maxAlloc >> shift)
newcap = int(capmem >> shift)
default:
lenmem = uintptr(old.len) * et.size
runtime: make append only clear uncopied memory Also add a benchmark that shows off the new behavior. The existing benchmarks reuse the same slice, and thus don't ever have to clear memory. Running the Append|Grow benchmarks in runtime: name old time/op new time/op delta AppendSliceLarge/1024Bytes-12 265ns ± 1% 265ns ± 3% ~ (p=0.524 n=17+20) AppendSliceLarge/4096Bytes-12 807ns ± 3% 772ns ± 1% -4.38% (p=0.000 n=20+20) AppendSliceLarge/16384Bytes-12 3.20µs ± 4% 2.82µs ± 4% -11.93% (p=0.000 n=19+20) AppendSliceLarge/65536Bytes-12 13.0µs ± 4% 11.0µs ± 3% -15.22% (p=0.000 n=20+20) AppendSliceLarge/262144Bytes-12 62.7µs ± 1% 51.6µs ± 1% -17.67% (p=0.000 n=19+20) AppendSliceLarge/1048576Bytes-12 337µs ± 3% 289µs ± 3% -14.36% (p=0.000 n=20+20) GrowSliceBytes-12 31.2ns ± 4% 31.4ns ±11% ~ (p=0.308 n=19+18) GrowSliceInts-12 53.4ns ±14% 45.0ns ± 6% -15.74% (p=0.000 n=20+19) GrowSlicePtr-12 87.0ns ± 3% 83.3ns ± 3% -4.26% (p=0.000 n=18+17) GrowSliceStruct24Bytes-12 88.9ns ± 5% 77.8ns ± 2% -12.45% (p=0.000 n=20+19) Append-12 17.2ns ± 1% 17.3ns ± 2% ~ (p=0.464 n=18+17) AppendGrowByte-12 2.28ms ± 1% 1.92ms ± 2% -15.65% (p=0.000 n=20+18) AppendGrowString-12 255ms ± 3% 253ms ± 4% ~ (p=0.065 n=19+19) AppendSlice/1Bytes-12 3.13ns ± 0% 3.11ns ± 1% -0.65% (p=0.000 n=17+18) AppendSlice/4Bytes-12 3.02ns ± 2% 3.11ns ± 1% +3.27% (p=0.000 n=18+17) AppendSlice/7Bytes-12 4.14ns ± 3% 4.13ns ± 2% ~ (p=0.380 n=19+18) AppendSlice/8Bytes-12 3.74ns ± 3% 3.68ns ± 1% -1.76% (p=0.000 n=19+18) AppendSlice/15Bytes-12 4.03ns ± 2% 4.04ns ± 2% ~ (p=0.261 n=19+20) AppendSlice/16Bytes-12 4.03ns ± 2% 4.03ns ± 0% ~ (p=0.062 n=18+17) AppendSlice/32Bytes-12 3.23ns ± 4% 3.43ns ± 1% +6.10% (p=0.000 n=17+18) AppendStr/1Bytes-12 3.51ns ± 1% 3.52ns ± 1% ~ (p=0.321 n=18+19) AppendStr/4Bytes-12 3.46ns ± 1% 3.46ns ± 1% ~ (p=0.977 n=18+20) AppendStr/8Bytes-12 3.18ns ± 1% 3.19ns ± 1% ~ (p=0.650 n=16+17) AppendStr/16Bytes-12 6.08ns ±27% 5.52ns ± 3% -9.16% (p=0.002 n=18+19) AppendStr/32Bytes-12 3.71ns ± 1% 3.53ns ± 1% -4.73% (p=0.000 n=20+19) AppendSpecialCase-12 17.7ns ± 1% 17.8ns ± 3% +0.86% (p=0.045 n=17+18) AppendInPlace/NoGrow/Byte-12 375ns ± 1% 376ns ± 1% +0.35% (p=0.021 n=20+18) AppendInPlace/NoGrow/1Ptr-12 1.01µs ± 1% 1.10µs ± 1% +9.28% (p=0.000 n=18+20) AppendInPlace/NoGrow/2Ptr-12 1.85µs ± 2% 1.71µs ± 1% -7.51% (p=0.000 n=19+18) AppendInPlace/NoGrow/3Ptr-12 2.57µs ± 2% 2.44µs ± 1% -5.08% (p=0.000 n=19+19) AppendInPlace/NoGrow/4Ptr-12 3.52µs ± 2% 3.35µs ± 2% -4.70% (p=0.000 n=20+19) AppendInPlace/Grow/Byte-12 212ns ± 1% 217ns ± 8% +2.57% (p=0.000 n=20+20) AppendInPlace/Grow/1Ptr-12 214ns ± 2% 217ns ± 3% +1.23% (p=0.001 n=18+19) AppendInPlace/Grow/2Ptr-12 298ns ± 2% 300ns ± 2% +0.55% (p=0.038 n=19+20) AppendInPlace/Grow/3Ptr-12 367ns ± 2% 366ns ± 2% ~ (p=0.452 n=20+18) AppendInPlace/Grow/4Ptr-12 416ns ± 2% 411ns ± 2% -1.18% (p=0.000 n=20+19) StackGrowth-12 43.4ns ± 1% 43.4ns ± 0% ~ (p=1.000 n=16+16) StackGrowthDeep-12 11.4µs ± 4% 10.3µs ± 4% -9.65% (p=0.000 n=20+19) Change-Id: I69a8afbd942c787c591d95b9d9439bd6db4d1e49 Reviewed-on: https://go-review.googlesource.com/30192 Reviewed-by: Ian Lance Taylor <iant@golang.org> Run-TryBot: Ian Lance Taylor <iant@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org>
2016-10-03 12:45:12 -07:00
newlenmem = uintptr(cap) * et.size
capmem, overflow = math.MulUintptr(et.size, uintptr(newcap))
capmem = roundupsize(capmem)
newcap = int(capmem / et.size)
}
// The check of overflow in addition to capmem > maxAlloc is needed
// to prevent an overflow which can be used to trigger a segfault
// on 32bit architectures with this example program:
//
// type T [1<<27 + 1]int64
//
// var d T
// var s []T
//
// func main() {
// s = append(s, d, d, d, d)
// print(len(s), "\n")
// }
if overflow || capmem > maxAlloc {
panic(errorString("growslice: cap out of range"))
}
var p unsafe.Pointer
if et.ptrdata == 0 {
p = mallocgc(capmem, nil, false)
runtime: make append only clear uncopied memory Also add a benchmark that shows off the new behavior. The existing benchmarks reuse the same slice, and thus don't ever have to clear memory. Running the Append|Grow benchmarks in runtime: name old time/op new time/op delta AppendSliceLarge/1024Bytes-12 265ns ± 1% 265ns ± 3% ~ (p=0.524 n=17+20) AppendSliceLarge/4096Bytes-12 807ns ± 3% 772ns ± 1% -4.38% (p=0.000 n=20+20) AppendSliceLarge/16384Bytes-12 3.20µs ± 4% 2.82µs ± 4% -11.93% (p=0.000 n=19+20) AppendSliceLarge/65536Bytes-12 13.0µs ± 4% 11.0µs ± 3% -15.22% (p=0.000 n=20+20) AppendSliceLarge/262144Bytes-12 62.7µs ± 1% 51.6µs ± 1% -17.67% (p=0.000 n=19+20) AppendSliceLarge/1048576Bytes-12 337µs ± 3% 289µs ± 3% -14.36% (p=0.000 n=20+20) GrowSliceBytes-12 31.2ns ± 4% 31.4ns ±11% ~ (p=0.308 n=19+18) GrowSliceInts-12 53.4ns ±14% 45.0ns ± 6% -15.74% (p=0.000 n=20+19) GrowSlicePtr-12 87.0ns ± 3% 83.3ns ± 3% -4.26% (p=0.000 n=18+17) GrowSliceStruct24Bytes-12 88.9ns ± 5% 77.8ns ± 2% -12.45% (p=0.000 n=20+19) Append-12 17.2ns ± 1% 17.3ns ± 2% ~ (p=0.464 n=18+17) AppendGrowByte-12 2.28ms ± 1% 1.92ms ± 2% -15.65% (p=0.000 n=20+18) AppendGrowString-12 255ms ± 3% 253ms ± 4% ~ (p=0.065 n=19+19) AppendSlice/1Bytes-12 3.13ns ± 0% 3.11ns ± 1% -0.65% (p=0.000 n=17+18) AppendSlice/4Bytes-12 3.02ns ± 2% 3.11ns ± 1% +3.27% (p=0.000 n=18+17) AppendSlice/7Bytes-12 4.14ns ± 3% 4.13ns ± 2% ~ (p=0.380 n=19+18) AppendSlice/8Bytes-12 3.74ns ± 3% 3.68ns ± 1% -1.76% (p=0.000 n=19+18) AppendSlice/15Bytes-12 4.03ns ± 2% 4.04ns ± 2% ~ (p=0.261 n=19+20) AppendSlice/16Bytes-12 4.03ns ± 2% 4.03ns ± 0% ~ (p=0.062 n=18+17) AppendSlice/32Bytes-12 3.23ns ± 4% 3.43ns ± 1% +6.10% (p=0.000 n=17+18) AppendStr/1Bytes-12 3.51ns ± 1% 3.52ns ± 1% ~ (p=0.321 n=18+19) AppendStr/4Bytes-12 3.46ns ± 1% 3.46ns ± 1% ~ (p=0.977 n=18+20) AppendStr/8Bytes-12 3.18ns ± 1% 3.19ns ± 1% ~ (p=0.650 n=16+17) AppendStr/16Bytes-12 6.08ns ±27% 5.52ns ± 3% -9.16% (p=0.002 n=18+19) AppendStr/32Bytes-12 3.71ns ± 1% 3.53ns ± 1% -4.73% (p=0.000 n=20+19) AppendSpecialCase-12 17.7ns ± 1% 17.8ns ± 3% +0.86% (p=0.045 n=17+18) AppendInPlace/NoGrow/Byte-12 375ns ± 1% 376ns ± 1% +0.35% (p=0.021 n=20+18) AppendInPlace/NoGrow/1Ptr-12 1.01µs ± 1% 1.10µs ± 1% +9.28% (p=0.000 n=18+20) AppendInPlace/NoGrow/2Ptr-12 1.85µs ± 2% 1.71µs ± 1% -7.51% (p=0.000 n=19+18) AppendInPlace/NoGrow/3Ptr-12 2.57µs ± 2% 2.44µs ± 1% -5.08% (p=0.000 n=19+19) AppendInPlace/NoGrow/4Ptr-12 3.52µs ± 2% 3.35µs ± 2% -4.70% (p=0.000 n=20+19) AppendInPlace/Grow/Byte-12 212ns ± 1% 217ns ± 8% +2.57% (p=0.000 n=20+20) AppendInPlace/Grow/1Ptr-12 214ns ± 2% 217ns ± 3% +1.23% (p=0.001 n=18+19) AppendInPlace/Grow/2Ptr-12 298ns ± 2% 300ns ± 2% +0.55% (p=0.038 n=19+20) AppendInPlace/Grow/3Ptr-12 367ns ± 2% 366ns ± 2% ~ (p=0.452 n=20+18) AppendInPlace/Grow/4Ptr-12 416ns ± 2% 411ns ± 2% -1.18% (p=0.000 n=20+19) StackGrowth-12 43.4ns ± 1% 43.4ns ± 0% ~ (p=1.000 n=16+16) StackGrowthDeep-12 11.4µs ± 4% 10.3µs ± 4% -9.65% (p=0.000 n=20+19) Change-Id: I69a8afbd942c787c591d95b9d9439bd6db4d1e49 Reviewed-on: https://go-review.googlesource.com/30192 Reviewed-by: Ian Lance Taylor <iant@golang.org> Run-TryBot: Ian Lance Taylor <iant@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org>
2016-10-03 12:45:12 -07:00
// The append() that calls growslice is going to overwrite from old.len to cap (which will be the new length).
// Only clear the part that will not be overwritten.
memclrNoHeapPointers(add(p, newlenmem), capmem-newlenmem)
} else {
// Note: can't use rawmem (which avoids zeroing of memory), because then GC can scan uninitialized memory.
p = mallocgc(capmem, et, true)
if lenmem > 0 && writeBarrier.enabled {
// Only shade the pointers in old.array since we know the destination slice p
// only contains nil pointers because it has been cleared during alloc.
bulkBarrierPreWriteSrcOnly(uintptr(p), uintptr(old.array), lenmem-et.size+et.ptrdata)
}
}
runtime: replace typedmemmmove with bulkBarrierPreWrite and memmove in growslice A bulkBarrierPreWrite together with a memmove as used in typedslicecopy is faster than a typedmemmove for each element of the old slice that needs to be copied to the new slice. typedslicecopy is not used here as runtime functions should not call other instrumented runtime functions and some conditions like dst == src or the destination slice not being large enought that are checked for in typedslicecopy can not happen in growslice. Append 13.5ns ± 6% 13.3ns ± 3% ~ (p=0.304 n=10+10) AppendGrowByte 1.18ms ± 2% 1.19ms ± 1% ~ (p=0.113 n=10+9) AppendGrowString 123ms ± 1% 73ms ± 1% -40.39% (p=0.000 n=9+8) AppendSlice/1Bytes 3.81ns ± 1% 3.78ns ± 1% ~ (p=0.116 n=10+10) AppendSlice/4Bytes 3.71ns ± 1% 3.70ns ± 0% ~ (p=0.095 n=10+9) AppendSlice/7Bytes 3.73ns ± 0% 3.75ns ± 1% ~ (p=0.442 n=10+10) AppendSlice/8Bytes 4.00ns ± 1% 4.01ns ± 1% ~ (p=0.330 n=10+10) AppendSlice/15Bytes 4.29ns ± 1% 4.28ns ± 1% ~ (p=0.536 n=10+10) AppendSlice/16Bytes 4.28ns ± 1% 4.31ns ± 1% +0.75% (p=0.019 n=10+10) AppendSlice/32Bytes 4.57ns ± 2% 4.58ns ± 2% ~ (p=0.236 n=10+10) AppendSliceLarge/1024Bytes 305ns ± 2% 306ns ± 1% ~ (p=0.236 n=10+10) AppendSliceLarge/4096Bytes 1.06µs ± 1% 1.06µs ± 0% ~ (p=1.000 n=9+10) AppendSliceLarge/16384Bytes 3.12µs ± 2% 3.11µs ± 1% ~ (p=0.493 n=10+10) AppendSliceLarge/65536Bytes 5.61µs ± 5% 5.36µs ± 2% -4.58% (p=0.003 n=10+8) AppendSliceLarge/262144Bytes 21.0µs ± 1% 19.5µs ± 1% -7.09% (p=0.000 n=8+10) AppendSliceLarge/1048576Bytes 78.4µs ± 1% 78.7µs ± 2% ~ (p=0.315 n=8+10) AppendStr/1Bytes 3.96ns ± 6% 3.99ns ± 9% ~ (p=0.591 n=10+10) AppendStr/4Bytes 3.98ns ± 1% 3.99ns ± 1% ~ (p=0.515 n=9+9) AppendStr/8Bytes 4.27ns ± 1% 4.27ns ± 1% ~ (p=0.633 n=10+10) AppendStr/16Bytes 4.56ns ± 2% 4.55ns ± 1% ~ (p=0.869 n=10+10) AppendStr/32Bytes 4.85ns ± 1% 4.89ns ± 1% +0.71% (p=0.003 n=10+8) AppendSpecialCase 18.7ns ± 1% 18.7ns ± 1% ~ (p=0.144 n=10+10) AppendInPlace/NoGrow/Byte 438ns ± 1% 439ns ± 1% ~ (p=0.135 n=10+8) AppendInPlace/NoGrow/1Ptr 1.05µs ± 2% 1.05µs ± 1% ~ (p=0.469 n=10+10) AppendInPlace/NoGrow/2Ptr 1.77µs ± 1% 1.78µs ± 2% ~ (p=0.469 n=10+10) AppendInPlace/NoGrow/3Ptr 1.94µs ± 1% 1.93µs ± 2% ~ (p=0.517 n=10+10) AppendInPlace/NoGrow/4Ptr 3.18µs ± 1% 3.17µs ± 0% ~ (p=0.483 n=10+9) AppendInPlace/Grow/Byte 382ns ± 2% 383ns ± 2% ~ (p=0.705 n=9+10) AppendInPlace/Grow/1Ptr 383ns ± 1% 384ns ± 1% ~ (p=0.844 n=10+10) AppendInPlace/Grow/2Ptr 459ns ± 2% 467ns ± 2% +1.74% (p=0.001 n=10+10) AppendInPlace/Grow/3Ptr 593ns ± 1% 597ns ± 2% ~ (p=0.195 n=10+10) AppendInPlace/Grow/4Ptr 583ns ± 2% 589ns ± 2% ~ (p=0.084 n=10+10) Change-Id: I629872f065a22b29267c1adbfc578aaedd36d365 Reviewed-on: https://go-review.googlesource.com/115755 Run-TryBot: Martin Möhrmann <moehrmann@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2018-06-01 14:30:49 +02:00
memmove(p, old.array, lenmem)
return slice{p, old.len, newcap}
}
func isPowerOfTwo(x uintptr) bool {
return x&(x-1) == 0
}
// slicecopy is used to copy from a string or slice of pointerless elements into a slice.
func slicecopy(toPtr unsafe.Pointer, toLen int, fromPtr unsafe.Pointer, fromLen int, width uintptr) int {
if fromLen == 0 || toLen == 0 {
return 0
}
n := fromLen
if toLen < n {
n = toLen
}
if width == 0 {
return n
}
size := uintptr(n) * width
if raceenabled {
callerpc := getcallerpc()
pc := funcPC(slicecopy)
racereadrangepc(fromPtr, size, callerpc, pc)
racewriterangepc(toPtr, size, callerpc, pc)
}
if msanenabled {
msanread(fromPtr, size)
msanwrite(toPtr, size)
}
if size == 1 { // common case worth about 2x to do here
// TODO: is this still worth it with new memmove impl?
*(*byte)(toPtr) = *(*byte)(fromPtr) // known to be a byte pointer
} else {
memmove(toPtr, fromPtr, size)
}
return n
}