mirror of
https://github.com/golang/go.git
synced 2025-12-08 06:10:04 +00:00
strconv: remove hand-written divide on 32-bit systems
The compiler now generates code that is just as good.
host: s7:GOARCH=386
goos: linux
goarch: 386
pkg: strconv
cpu: AMD Ryzen 9 7950X 16-Core Processor
│ 8d2b4ce71b3 │ d5524a1f38c │
│ sec/op │ sec/op vs base │
AppendFloat/Decimal-4 43.91n ± 2% 44.23n ± 1% ~ (p=0.654 n=20)
AppendFloat/Float-4 73.73n ± 0% 74.17n ± 1% ~ (p=0.062 n=20)
AppendFloat/Exp-4 77.33n ± 1% 77.11n ± 1% ~ (p=0.234 n=20)
AppendFloat/NegExp-4 77.56n ± 1% 77.00n ± 1% ~ (p=0.136 n=20)
AppendFloat/LongExp-4 79.01n ± 1% 79.62n ± 1% ~ (p=0.213 n=20)
AppendFloat/Big-4 88.02n ± 2% 88.88n ± 1% ~ (p=0.159 n=20)
AppendFloat/BinaryExp-4 34.43n ± 1% 34.58n ± 1% ~ (p=0.683 n=20)
AppendFloat/32Integer-4 44.05n ± 2% 43.74n ± 1% ~ (p=0.055 n=20)
AppendFloat/32ExactFraction-4 66.55n ± 1% 66.62n ± 1% ~ (p=0.753 n=20)
AppendFloat/32Point-4 64.01n ± 1% 63.39n ± 1% ~ (p=0.032 n=20)
AppendFloat/32Exp-4 77.05n ± 1% 77.84n ± 1% ~ (p=0.055 n=20)
AppendFloat/32NegExp-4 66.96n ± 1% 67.41n ± 1% ~ (p=0.569 n=20)
AppendFloat/32Shortest-4 61.73n ± 1% 62.00n ± 1% ~ (p=0.457 n=20)
AppendFloat/32Fixed8Hard-4 39.09n ± 1% 39.06n ± 1% ~ (p=0.588 n=20)
AppendFloat/32Fixed9Hard-4 57.66n ± 0% 57.39n ± 1% ~ (p=0.167 n=20)
AppendFloat/64Fixed1-4 52.52n ± 1% 52.45n ± 1% ~ (p=0.867 n=20)
AppendFloat/64Fixed2-4 50.64n ± 1% 50.12n ± 4% ~ (p=0.208 n=20)
AppendFloat/64Fixed3-4 49.54n ± 1% 50.84n ± 1% +2.62% (p=0.000 n=20)
AppendFloat/64Fixed4-4 45.60n ± 1% 45.25n ± 1% ~ (p=0.034 n=20)
AppendFloat/64Fixed12-4 57.70n ± 1% 57.70n ± 1% ~ (p=0.394 n=20)
AppendFloat/64Fixed16-4 56.49n ± 1% 56.15n ± 1% ~ (p=0.044 n=20)
AppendFloat/64Fixed12Hard-4 53.99n ± 1% 53.79n ± 1% ~ (p=0.358 n=20)
AppendFloat/64Fixed17Hard-4 64.51n ± 1% 63.18n ± 1% -2.06% (p=0.000 n=20)
AppendFloat/64Fixed18Hard-4 4.281µ ± 1% 4.294µ ± 1% ~ (p=0.995 n=20)
AppendFloat/Slowpath64-4 77.94n ± 1% 78.66n ± 2% ~ (p=0.136 n=20)
AppendFloat/SlowpathDenormal64-4 77.64n ± 1% 77.96n ± 1% ~ (p=0.229 n=20)
AppendInt-4 1.122µ ± 1% 1.116µ ± 1% ~ (p=0.115 n=20)
AppendUint-4 287.9n ± 1% 286.9n ± 1% ~ (p=0.185 n=20)
AppendIntSmall-4 5.845n ± 1% 5.819n ± 1% ~ (p=0.516 n=20)
AppendUintVarlen/digits=1-4 3.924n ± 1% 3.905n ± 1% ~ (p=0.317 n=20)
AppendUintVarlen/digits=2-4 3.909n ± 1% 3.940n ± 1% ~ (p=0.995 n=20)
AppendUintVarlen/digits=3-4 9.543n ± 1% 9.567n ± 2% ~ (p=0.606 n=20)
AppendUintVarlen/digits=4-4 9.710n ± 1% 9.748n ± 1% ~ (p=0.602 n=20)
AppendUintVarlen/digits=5-4 10.84n ± 1% 10.88n ± 2% ~ (p=0.425 n=20)
AppendUintVarlen/digits=6-4 11.06n ± 1% 11.06n ± 1% ~ (p=0.506 n=20)
AppendUintVarlen/digits=7-4 11.97n ± 1% 12.05n ± 1% ~ (p=0.218 n=20)
AppendUintVarlen/digits=8-4 12.27n ± 1% 12.32n ± 2% ~ (p=0.358 n=20)
AppendUintVarlen/digits=9-4 13.57n ± 1% 13.57n ± 1% ~ (p=0.952 n=20)
AppendUintVarlen/digits=10-4 16.88n ± 1% 16.52n ± 1% -2.13% (p=0.000 n=20)
AppendUintVarlen/digits=11-4 16.83n ± 1% 16.72n ± 1% ~ (p=0.012 n=20)
AppendUintVarlen/digits=12-4 17.93n ± 1% 17.63n ± 1% -1.65% (p=0.000 n=20)
AppendUintVarlen/digits=13-4 18.38n ± 2% 17.80n ± 1% -3.16% (p=0.000 n=20)
AppendUintVarlen/digits=14-4 19.20n ± 1% 18.65n ± 1% -2.89% (p=0.000 n=20)
AppendUintVarlen/digits=15-4 19.41n ± 1% 18.85n ± 1% -2.86% (p=0.000 n=20)
AppendUintVarlen/digits=16-4 20.33n ± 1% 19.79n ± 1% -2.63% (p=0.000 n=20)
AppendUintVarlen/digits=17-4 20.32n ± 2% 19.79n ± 0% -2.61% (p=0.000 n=20)
AppendUintVarlen/digits=18-4 21.09n ± 1% 20.84n ± 1% -1.16% (p=0.000 n=20)
AppendUintVarlen/digits=19-4 25.68n ± 1% 25.24n ± 0% -1.69% (p=0.000 n=20)
AppendUintVarlen/digits=20-4 25.42n ± 1% 25.15n ± 1% -1.06% (p=0.000 n=20)
geomean 37.54n 37.39n -0.40%
%
Change-Id: I0dba26d1f6fbadc2a951dc0bbc8cf30d1391e10f
Reviewed-on: https://go-review.googlesource.com/c/go/+/716062
Auto-Submit: Russ Cox <rsc@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: David Chase <drchase@google.com>
This commit is contained in:
parent
1e5bb416d8
commit
cdc6b559ca
1 changed file with 2 additions and 40 deletions
|
|
@@ -191,10 +191,6 @@ func formatBase10(a []byte, u uint64) int {
|
|||
// On most systems, the uint32 math is faster, but not all.
|
||||
// The decision here is based on benchmarking.
|
||||
itoaPure64 = host64bit && goarch.GOARCH != "amd64" && goarch.GOARCH != "arm64" && goarch.GOARCH != "s390x"
|
||||
|
||||
// 64-bit systems can all use 64-bit div and mod by a constant,
|
||||
// which the compiler rewrites to use 64x64→128-bit multiplies.
|
||||
itoaDivMod64 = host64bit // can use 64-bit div/mod by constant
|
||||
)
|
||||
|
||||
if itoaPure64 {
|
||||
|
|
@@ -218,47 +214,13 @@ func formatBase10(a []byte, u uint64) int {
|
|||
return i
|
||||
}
|
||||
|
||||
// Convert 9-digit chunks using 32-bit math.
|
||||
// Split into 9-digit chunks that fit in uint32s and convert each chunk using 32-bit math.
|
||||
// Most numbers are small, so the comparison u >= 1e9 is usually pure overhead,
|
||||
// so we approximate it by u>>29 != 0, which is usually faster and good enough.
|
||||
i := len(a)
|
||||
for (host64bit && u>>29 != 0) || (!host64bit && (u>>32 != 0 || uint32(u)>>29 != 0)) {
|
||||
var lo uint32
|
||||
if itoaDivMod64 {
|
||||
u, lo = u/1e9, uint32(u%1e9)
|
||||
} else {
|
||||
// On 64-bit systems the compiler rewrites the div and mod above
|
||||
// into a 64x64→128-bit multiply (https://godbolt.org/z/EPnK8zvMK):
|
||||
// hi, _ := bits.Mul64(u>>1, 0x89705f4136b4a598)
|
||||
// q := hi >> 28
|
||||
// lo = uint32(u - q*1e9)
|
||||
// u = q
|
||||
// On 32-bit systems, the compiler invokes a uint64 software divide,
|
||||
// which is quite slow. We could write the bits.Mul64 code above
|
||||
// but even that is slower than we'd like, since it calls a software mul64
|
||||
// instead of having a hardware instruction to use.
|
||||
// Instead we inline bits.Mul64 here and change y0/y1 to constants.
|
||||
// The compiler does use direct 32x32→64-bit multiplies for this code.
|
||||
//
|
||||
// For lots more about division by multiplication see Warren, _Hacker's Delight_.
|
||||
// For a concise overview, see the first two sections of
|
||||
// https://ridiculousfish.com/blog/posts/labor-of-division-episode-iii.html.
|
||||
const mask32 = 1<<32 - 1
|
||||
x0 := ((u >> 1) & mask32)
|
||||
x1 := (u >> 1) >> 32
|
||||
const y0 = 0x36b4a598
|
||||
const y1 = 0x89705f41
|
||||
w0 := x0 * y0
|
||||
t := x1*y0 + w0>>32
|
||||
w1 := t & mask32
|
||||
w2 := t >> 32
|
||||
w1 += x0 * y1
|
||||
hi := x1*y1 + w2 + w1>>32
|
||||
q := hi >> 28
|
||||
|
||||
lo = uint32(u) - uint32(q)*1e9 // uint32(u - q*1e9) but faster
|
||||
u = q
|
||||
}
|
||||
u, lo = u/1e9, uint32(u%1e9)
|
||||
|
||||
// Convert 9 digits.
|
||||
for range 4 {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue