go/src/cmd/compile/internal/ssa/_gen/386Ops.go

589 lines
45 KiB
Go
Raw Normal View History

// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package main
import "strings"
// Notes:
// - Integer types live in the low portion of registers. Upper portions are junk.
// - Boolean types use the low-order byte of a register. 0=false, 1=true.
// Upper bytes are junk.
// - Floating-point types live in the low natural slot of an sse2 register.
// Unused portions are junk.
// - We do not use AH,BH,CH,DH registers.
// - When doing sub-register operations, we try to write the whole
// destination register to avoid a partial-register write.
// - Unused portions of AuxInt (or the Val portion of ValAndOff) are
// filled by sign-extending the used portion. Users of AuxInt which interpret
// AuxInt as unsigned (e.g. shifts) must be careful.
// Suffixes encode the bit width of various instructions.
// L (long word) = 32 bit
// W (word) = 16 bit
// B (byte) = 8 bit
// regNames386 names every register known to the 386 backend.
// The order matters: init uses a name's index in this slice as the bit
// position of that register inside a regMask.
// Copied from ../../x86/reg.go.
var regNames386 = []string{
	// integer registers
	"AX", "CX", "DX", "BX", "SP", "BP", "SI", "DI",

	// SSE registers
	"X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7",

	// If you add registers, update asyncPreempt in runtime

	// pseudo-registers
	"SB",
}
func init() {
// Build the name -> register number table. A register's number is its
// index in regNames386 and is later used as a bit position in a regMask,
// so the list is capped at 64 entries (presumably the width of regMask;
// its definition is not in this file).
if len(regNames386) > 64 {
	panic("too many registers")
}
num := make(map[string]int)
for idx := range regNames386 {
	num[regNames386[idx]] = idx
}
buildReg := func(s string) regMask {
m := regMask(0)
for _, r := range strings.Split(s, " ") {
if n, ok := num[r]; ok {
m |= regMask(1) << uint(n)
continue
}
panic("register " + r + " not found")
}
return m
}
// Common individual register masks.
// Every mask is produced by buildReg and has one bit set per named register.
var (
// Masks naming exactly one register each.
ax = buildReg("AX")
cx = buildReg("CX")
dx = buildReg("DX")
bx = buildReg("BX")
si = buildReg("SI")
// gp is every integer register in regNames386 except SP.
gp = buildReg("AX CX DX BX BP SI DI")
// fp is all eight SSE registers X0-X7.
fp = buildReg("X0 X1 X2 X3 X4 X5 X6 X7")
// gpsp is gp plus SP.
gpsp = gp | buildReg("SP")
// gpspsb is gpsp plus the SB pseudo-register.
gpspsb = gpsp | buildReg("SB")
// callerSave is the union of gp and fp.
// NOTE(review): presumably the registers not preserved across calls;
// its users are outside this view, so confirm there.
callerSave = gp | fp
)
// Common slices of register masks.
// Single-element mask lists, shared by the regInfo values below whose
// inputs or outputs consist of one operand that may be any gp (or any fp)
// register.
var (
gponly = []regMask{gp}
fponly = []regMask{fp}
)
// Common regInfo
var (
gp01 = regInfo{inputs: nil, outputs: gponly}
gp11 = regInfo{inputs: []regMask{gp}, outputs: gponly}
gp11sp = regInfo{inputs: []regMask{gpsp}, outputs: gponly}
gp11sb = regInfo{inputs: []regMask{gpspsb}, outputs: gponly}
gp21 = regInfo{inputs: []regMask{gp, gp}, outputs: gponly}
gp11carry = regInfo{inputs: []regMask{gp}, outputs: []regMask{gp, 0}}
gp21carry = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{gp, 0}}
gp1carry1 = regInfo{inputs: []regMask{gp}, outputs: gponly}
gp2carry1 = regInfo{inputs: []regMask{gp, gp}, outputs: gponly}
gp21sp = regInfo{inputs: []regMask{gpsp, gp}, outputs: gponly}
gp21sb = regInfo{inputs: []regMask{gpspsb, gpsp}, outputs: gponly}
gp21shift = regInfo{inputs: []regMask{gp, cx}, outputs: []regMask{gp}}
gp11div = regInfo{inputs: []regMask{ax, gpsp &^ dx}, outputs: []regMask{ax}, clobbers: dx}
gp21hmul = regInfo{inputs: []regMask{ax, gpsp}, outputs: []regMask{dx}, clobbers: ax}
gp11mod = regInfo{inputs: []regMask{ax, gpsp &^ dx}, outputs: []regMask{dx}, clobbers: ax}
gp21mul = regInfo{inputs: []regMask{ax, gpsp}, outputs: []regMask{dx, ax}}
cmd/compile: optimize 386's comparison CMPL/CMPW/CMPB can take a memory operand on 386, and this CL implements that optimization. 1. The total size of pkg/linux_386 decreases about 45KB, excluding cmd/compile. 2. The go1 benchmark shows a little improvement. name old time/op new time/op delta BinaryTree17-4 3.36s ± 2% 3.37s ± 3% ~ (p=0.537 n=40+40) Fannkuch11-4 3.59s ± 1% 3.53s ± 2% -1.58% (p=0.000 n=40+40) FmtFprintfEmpty-4 46.0ns ± 3% 45.8ns ± 3% ~ (p=0.249 n=40+40) FmtFprintfString-4 80.0ns ± 4% 78.8ns ± 3% -1.49% (p=0.001 n=40+40) FmtFprintfInt-4 89.7ns ± 2% 90.3ns ± 2% +0.74% (p=0.003 n=40+40) FmtFprintfIntInt-4 144ns ± 3% 143ns ± 3% -0.95% (p=0.003 n=40+40) FmtFprintfPrefixedInt-4 181ns ± 4% 180ns ± 2% ~ (p=0.103 n=40+40) FmtFprintfFloat-4 412ns ± 3% 408ns ± 4% -0.97% (p=0.018 n=40+40) FmtManyArgs-4 607ns ± 4% 605ns ± 4% ~ (p=0.148 n=40+40) GobDecode-4 7.19ms ± 4% 7.24ms ± 5% ~ (p=0.340 n=40+40) GobEncode-4 7.04ms ± 9% 6.99ms ± 9% ~ (p=0.289 n=40+40) Gzip-4 400ms ± 6% 398ms ± 5% ~ (p=0.168 n=40+40) Gunzip-4 41.2ms ± 3% 41.7ms ± 3% +1.40% (p=0.001 n=40+40) HTTPClientServer-4 62.5µs ± 1% 62.1µs ± 2% -0.61% (p=0.000 n=37+37) JSONEncode-4 20.7ms ± 4% 20.4ms ± 3% -1.60% (p=0.000 n=40+40) JSONDecode-4 69.4ms ± 4% 69.2ms ± 6% ~ (p=0.177 n=40+40) Mandelbrot200-4 5.22ms ± 6% 5.21ms ± 3% ~ (p=0.531 n=40+40) GoParse-4 3.29ms ± 3% 3.28ms ± 3% ~ (p=0.321 n=40+39) RegexpMatchEasy0_32-4 104ns ± 4% 103ns ± 7% -0.89% (p=0.040 n=40+40) RegexpMatchEasy0_1K-4 852ns ± 3% 853ns ± 2% ~ (p=0.357 n=40+40) RegexpMatchEasy1_32-4 113ns ± 8% 113ns ± 3% ~ (p=0.906 n=40+40) RegexpMatchEasy1_1K-4 1.03µs ± 4% 1.03µs ± 5% ~ (p=0.326 n=40+40) RegexpMatchMedium_32-4 136ns ± 3% 133ns ± 3% -2.31% (p=0.000 n=40+40) RegexpMatchMedium_1K-4 44.0µs ± 3% 43.7µs ± 3% ~ (p=0.053 n=40+40) RegexpMatchHard_32-4 2.27µs ± 3% 2.26µs ± 4% ~ (p=0.391 n=40+40) RegexpMatchHard_1K-4 68.0µs ± 3% 68.9µs ± 3% +1.28% (p=0.000 n=40+40) Revcomp-4 1.86s ± 5% 1.86s ± 2% ~ (p=0.950 n=40+40) Template-4 73.4ms ± 4% 69.9ms ± 
7% -4.78% (p=0.000 n=40+40) TimeParse-4 449ns ± 4% 441ns ± 5% -1.76% (p=0.000 n=40+40) TimeFormat-4 416ns ± 3% 417ns ± 4% ~ (p=0.304 n=40+40) [Geo mean] 67.7µs 67.3µs -0.55% name old speed new speed delta GobDecode-4 107MB/s ± 4% 106MB/s ± 5% ~ (p=0.336 n=40+40) GobEncode-4 109MB/s ± 5% 110MB/s ± 9% ~ (p=0.142 n=38+40) Gzip-4 48.5MB/s ± 5% 48.8MB/s ± 5% ~ (p=0.172 n=40+40) Gunzip-4 472MB/s ± 3% 465MB/s ± 3% -1.39% (p=0.001 n=40+40) JSONEncode-4 93.6MB/s ± 4% 95.1MB/s ± 3% +1.61% (p=0.000 n=40+40) JSONDecode-4 28.0MB/s ± 3% 28.1MB/s ± 6% ~ (p=0.181 n=40+40) GoParse-4 17.6MB/s ± 3% 17.7MB/s ± 3% ~ (p=0.350 n=40+39) RegexpMatchEasy0_32-4 308MB/s ± 4% 311MB/s ± 6% +0.96% (p=0.025 n=40+40) RegexpMatchEasy0_1K-4 1.20GB/s ± 3% 1.20GB/s ± 2% ~ (p=0.317 n=40+40) RegexpMatchEasy1_32-4 282MB/s ± 7% 282MB/s ± 3% ~ (p=0.516 n=40+40) RegexpMatchEasy1_1K-4 994MB/s ± 4% 991MB/s ± 5% ~ (p=0.319 n=40+40) RegexpMatchMedium_32-4 7.31MB/s ± 3% 7.49MB/s ± 3% +2.46% (p=0.000 n=40+40) RegexpMatchMedium_1K-4 23.3MB/s ± 3% 23.4MB/s ± 3% ~ (p=0.052 n=40+40) RegexpMatchHard_32-4 14.1MB/s ± 3% 14.1MB/s ± 4% ~ (p=0.391 n=40+40) RegexpMatchHard_1K-4 15.1MB/s ± 3% 14.9MB/s ± 3% -1.27% (p=0.000 n=40+40) Revcomp-4 137MB/s ± 5% 137MB/s ± 2% ~ (p=0.942 n=40+40) Template-4 26.5MB/s ± 4% 27.8MB/s ± 7% +5.03% (p=0.000 n=40+40) [Geo mean] 78.6MB/s 79.0MB/s +0.57% Change-Id: Idcacc6881ef57cd7dc33aa87b711282842b72a53 Reviewed-on: https://go-review.googlesource.com/126618 Run-TryBot: Ben Shi <powerman1st@163.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2018-07-29 12:50:50 +00:00
gp2flags = regInfo{inputs: []regMask{gpsp, gpsp}}
gp1flags = regInfo{inputs: []regMask{gpsp}}
gp0flagsLoad = regInfo{inputs: []regMask{gpspsb, 0}}
gp1flagsLoad = regInfo{inputs: []regMask{gpspsb, gpsp, 0}}
flagsgp = regInfo{inputs: nil, outputs: gponly}
readflags = regInfo{inputs: nil, outputs: gponly}
flagsgpax = regInfo{inputs: nil, clobbers: ax, outputs: []regMask{gp &^ ax}}
cmd/compile: add indexed form for several 386 instructions This CL implements indexed memory operands for the following instructions. (ADD|SUB|MUL|AND|OR|XOR)Lload -> (ADD|SUB|MUL|AND|OR|XOR)Lloadidx4 (ADD|SUB|AND|OR|XOR)Lmodify -> (ADD|SUB|AND|OR|XOR)Lmodifyidx4 (ADD|AND|OR|XOR)Lconstmodify -> (ADD|AND|OR|XOR)Lconstmodifyidx4 1. The total size of pkg/linux_386/ decreases about 2.5KB, excluding cmd/compile/ . 2. There is little regression in the go1 benchmark test, excluding noise. name old time/op new time/op delta BinaryTree17-4 3.25s ± 3% 3.25s ± 3% ~ (p=0.218 n=40+40) Fannkuch11-4 3.53s ± 1% 3.53s ± 1% ~ (p=0.303 n=40+40) FmtFprintfEmpty-4 44.9ns ± 3% 45.6ns ± 3% +1.48% (p=0.030 n=40+36) FmtFprintfString-4 78.7ns ± 5% 80.1ns ± 7% ~ (p=0.217 n=36+40) FmtFprintfInt-4 90.2ns ± 6% 89.8ns ± 5% ~ (p=0.659 n=40+38) FmtFprintfIntInt-4 140ns ± 5% 141ns ± 5% +1.00% (p=0.027 n=40+40) FmtFprintfPrefixedInt-4 185ns ± 3% 183ns ± 3% ~ (p=0.104 n=40+40) FmtFprintfFloat-4 411ns ± 4% 406ns ± 3% -1.37% (p=0.005 n=40+40) FmtManyArgs-4 590ns ± 4% 598ns ± 4% +1.35% (p=0.008 n=40+40) GobDecode-4 7.16ms ± 5% 7.10ms ± 5% ~ (p=0.335 n=40+40) GobEncode-4 6.85ms ± 7% 6.74ms ± 9% ~ (p=0.058 n=38+40) Gzip-4 400ms ± 4% 399ms ± 2% -0.34% (p=0.003 n=40+33) Gunzip-4 41.4ms ± 3% 41.4ms ± 4% -0.12% (p=0.020 n=40+40) HTTPClientServer-4 64.1µs ± 4% 63.5µs ± 2% -1.07% (p=0.000 n=39+37) JSONEncode-4 15.9ms ± 2% 15.9ms ± 3% ~ (p=0.103 n=40+40) JSONDecode-4 62.2ms ± 4% 61.6ms ± 3% -0.98% (p=0.006 n=39+40) Mandelbrot200-4 5.18ms ± 3% 5.14ms ± 4% ~ (p=0.125 n=40+40) GoParse-4 3.29ms ± 2% 3.27ms ± 2% -0.66% (p=0.006 n=40+40) RegexpMatchEasy0_32-4 103ns ± 4% 103ns ± 4% ~ (p=0.632 n=40+40) RegexpMatchEasy0_1K-4 830ns ± 3% 828ns ± 3% ~ (p=0.563 n=40+40) RegexpMatchEasy1_32-4 113ns ± 4% 113ns ± 4% ~ (p=0.494 n=40+40) RegexpMatchEasy1_1K-4 1.03µs ± 4% 1.03µs ± 4% ~ (p=0.665 n=40+40) RegexpMatchMedium_32-4 130ns ± 4% 129ns ± 3% ~ (p=0.458 n=40+40) RegexpMatchMedium_1K-4 39.4µs ± 3% 39.7µs ± 3% ~ (p=0.825 
n=40+40) RegexpMatchHard_32-4 2.16µs ± 4% 2.15µs ± 4% ~ (p=0.137 n=40+40) RegexpMatchHard_1K-4 65.2µs ± 3% 65.4µs ± 4% ~ (p=0.160 n=40+40) Revcomp-4 1.87s ± 2% 1.87s ± 1% +0.17% (p=0.019 n=33+33) Template-4 69.4ms ± 3% 69.8ms ± 3% +0.60% (p=0.009 n=40+40) TimeParse-4 437ns ± 4% 438ns ± 4% ~ (p=0.234 n=40+40) TimeFormat-4 408ns ± 3% 408ns ± 3% ~ (p=0.904 n=40+40) [Geo mean] 65.7µs 65.6µs -0.08% name old speed new speed delta GobDecode-4 107MB/s ± 5% 108MB/s ± 5% ~ (p=0.336 n=40+40) GobEncode-4 112MB/s ± 6% 114MB/s ± 9% +1.95% (p=0.036 n=37+40) Gzip-4 48.5MB/s ± 4% 48.6MB/s ± 2% +0.28% (p=0.003 n=40+33) Gunzip-4 469MB/s ± 4% 469MB/s ± 4% +0.11% (p=0.021 n=40+40) JSONEncode-4 122MB/s ± 2% 122MB/s ± 3% ~ (p=0.105 n=40+40) JSONDecode-4 31.2MB/s ± 4% 31.5MB/s ± 4% +0.99% (p=0.007 n=39+40) GoParse-4 17.6MB/s ± 2% 17.7MB/s ± 2% +0.66% (p=0.007 n=40+40) RegexpMatchEasy0_32-4 310MB/s ± 4% 310MB/s ± 4% ~ (p=0.384 n=40+40) RegexpMatchEasy0_1K-4 1.23GB/s ± 3% 1.24GB/s ± 3% ~ (p=0.186 n=40+40) RegexpMatchEasy1_32-4 283MB/s ± 3% 281MB/s ± 4% ~ (p=0.855 n=40+40) RegexpMatchEasy1_1K-4 1.00GB/s ± 4% 1.00GB/s ± 4% ~ (p=0.665 n=40+40) RegexpMatchMedium_32-4 7.68MB/s ± 4% 7.73MB/s ± 3% ~ (p=0.359 n=40+40) RegexpMatchMedium_1K-4 26.0MB/s ± 3% 25.8MB/s ± 3% ~ (p=0.825 n=40+40) RegexpMatchHard_32-4 14.8MB/s ± 3% 14.9MB/s ± 4% ~ (p=0.136 n=40+40) RegexpMatchHard_1K-4 15.7MB/s ± 3% 15.7MB/s ± 4% ~ (p=0.150 n=40+40) Revcomp-4 136MB/s ± 1% 136MB/s ± 1% -0.09% (p=0.028 n=32+33) Template-4 28.0MB/s ± 3% 27.8MB/s ± 3% -0.59% (p=0.010 n=40+40) [Geo mean] 82.1MB/s 82.3MB/s +0.25% Change-Id: Ifa387a251056678326d3508aa02753b70bf7e5d0 Reviewed-on: https://go-review.googlesource.com/c/140303 Run-TryBot: Ben Shi <powerman1st@163.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2018-10-06 13:13:48 +00:00
gpload = regInfo{inputs: []regMask{gpspsb, 0}, outputs: gponly}
gp21load = regInfo{inputs: []regMask{gp, gpspsb, 0}, outputs: gponly}
gploadidx = regInfo{inputs: []regMask{gpspsb, gpsp, 0}, outputs: gponly}
gp21loadidx = regInfo{inputs: []regMask{gp, gpspsb, gpsp, 0}, outputs: gponly}
gpstore = regInfo{inputs: []regMask{gpspsb, gpsp, 0}}
gpstoreconst = regInfo{inputs: []regMask{gpspsb, 0}}
gpstoreidx = regInfo{inputs: []regMask{gpspsb, gpsp, gpsp, 0}}
gpstoreconstidx = regInfo{inputs: []regMask{gpspsb, gpsp, 0}}
fp01 = regInfo{inputs: nil, outputs: fponly}
fp21 = regInfo{inputs: []regMask{fp, fp}, outputs: fponly}
fp21load = regInfo{inputs: []regMask{fp, gpspsb, 0}, outputs: fponly}
fpgp = regInfo{inputs: fponly, outputs: gponly}
gpfp = regInfo{inputs: gponly, outputs: fponly}
fp11 = regInfo{inputs: fponly, outputs: fponly}
fp2flags = regInfo{inputs: []regMask{fp, fp}}
fpload = regInfo{inputs: []regMask{gpspsb, 0}, outputs: fponly}
fploadidx = regInfo{inputs: []regMask{gpspsb, gpsp, 0}, outputs: fponly}
fpstore = regInfo{inputs: []regMask{gpspsb, fp, 0}}
fpstoreidx = regInfo{inputs: []regMask{gpspsb, gpsp, fp, 0}}
)
var _386ops = []opData{
// fp ops
{name: "ADDSS", argLength: 2, reg: fp21, asm: "ADDSS", commutative: true, resultInArg0: true}, // fp32 add
{name: "ADDSD", argLength: 2, reg: fp21, asm: "ADDSD", commutative: true, resultInArg0: true}, // fp64 add
{name: "SUBSS", argLength: 2, reg: fp21, asm: "SUBSS", resultInArg0: true}, // fp32 sub
{name: "SUBSD", argLength: 2, reg: fp21, asm: "SUBSD", resultInArg0: true}, // fp64 sub
{name: "MULSS", argLength: 2, reg: fp21, asm: "MULSS", commutative: true, resultInArg0: true}, // fp32 mul
{name: "MULSD", argLength: 2, reg: fp21, asm: "MULSD", commutative: true, resultInArg0: true}, // fp64 mul
{name: "DIVSS", argLength: 2, reg: fp21, asm: "DIVSS", resultInArg0: true}, // fp32 div
{name: "DIVSD", argLength: 2, reg: fp21, asm: "DIVSD", resultInArg0: true}, // fp64 div
{name: "MOVSSload", argLength: 2, reg: fpload, asm: "MOVSS", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // fp32 load
{name: "MOVSDload", argLength: 2, reg: fpload, asm: "MOVSD", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // fp64 load
{name: "MOVSSconst", reg: fp01, asm: "MOVSS", aux: "Float32", rematerializeable: true}, // fp32 constant
{name: "MOVSDconst", reg: fp01, asm: "MOVSD", aux: "Float64", rematerializeable: true}, // fp64 constant
{name: "MOVSSloadidx1", argLength: 3, reg: fploadidx, asm: "MOVSS", aux: "SymOff", symEffect: "Read"}, // fp32 load indexed by i
{name: "MOVSSloadidx4", argLength: 3, reg: fploadidx, asm: "MOVSS", aux: "SymOff", symEffect: "Read"}, // fp32 load indexed by 4*i
{name: "MOVSDloadidx1", argLength: 3, reg: fploadidx, asm: "MOVSD", aux: "SymOff", symEffect: "Read"}, // fp64 load indexed by i
{name: "MOVSDloadidx8", argLength: 3, reg: fploadidx, asm: "MOVSD", aux: "SymOff", symEffect: "Read"}, // fp64 load indexed by 8*i
{name: "MOVSSstore", argLength: 3, reg: fpstore, asm: "MOVSS", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"}, // fp32 store
{name: "MOVSDstore", argLength: 3, reg: fpstore, asm: "MOVSD", aux: "SymOff", faultOnNilArg0: true, symEffect: "Write"}, // fp64 store
{name: "MOVSSstoreidx1", argLength: 4, reg: fpstoreidx, asm: "MOVSS", aux: "SymOff", symEffect: "Write"}, // fp32 indexed by i store
{name: "MOVSSstoreidx4", argLength: 4, reg: fpstoreidx, asm: "MOVSS", aux: "SymOff", symEffect: "Write"}, // fp32 indexed by 4i store
{name: "MOVSDstoreidx1", argLength: 4, reg: fpstoreidx, asm: "MOVSD", aux: "SymOff", symEffect: "Write"}, // fp64 indexed by i store
{name: "MOVSDstoreidx8", argLength: 4, reg: fpstoreidx, asm: "MOVSD", aux: "SymOff", symEffect: "Write"}, // fp64 indexed by 8i store
{name: "ADDSSload", argLength: 3, reg: fp21load, asm: "ADDSS", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp32 arg0 + tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
{name: "ADDSDload", argLength: 3, reg: fp21load, asm: "ADDSD", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp64 arg0 + tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
{name: "SUBSSload", argLength: 3, reg: fp21load, asm: "SUBSS", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp32 arg0 - tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
{name: "SUBSDload", argLength: 3, reg: fp21load, asm: "SUBSD", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp64 arg0 - tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
{name: "MULSSload", argLength: 3, reg: fp21load, asm: "MULSS", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp32 arg0 * tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
{name: "MULSDload", argLength: 3, reg: fp21load, asm: "MULSD", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp64 arg0 * tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
cmd/compile: optimize 386 code with MULLload/DIVSSload/DIVSDload IMULL/DIVSS/DIVSD all can take the source operand from memory directly. And this CL implement that optimization. 1. The total size of pkg/linux_386 decreases about 84KB (excluding cmd/compile). 2. The go1 benchmark shows little regression in total (excluding noise). name old time/op new time/op delta BinaryTree17-4 3.29s ± 2% 3.27s ± 4% ~ (p=0.192 n=30+30) Fannkuch11-4 3.49s ± 2% 3.54s ± 1% +1.48% (p=0.000 n=30+30) FmtFprintfEmpty-4 45.9ns ± 3% 46.3ns ± 4% +0.89% (p=0.037 n=30+30) FmtFprintfString-4 78.8ns ± 3% 78.7ns ± 4% ~ (p=0.209 n=30+27) FmtFprintfInt-4 91.0ns ± 2% 90.3ns ± 2% -0.82% (p=0.031 n=30+27) FmtFprintfIntInt-4 142ns ± 4% 143ns ± 4% ~ (p=0.136 n=30+30) FmtFprintfPrefixedInt-4 181ns ± 3% 183ns ± 4% +1.40% (p=0.005 n=30+30) FmtFprintfFloat-4 404ns ± 4% 408ns ± 3% ~ (p=0.397 n=30+30) FmtManyArgs-4 601ns ± 3% 609ns ± 5% ~ (p=0.059 n=30+30) GobDecode-4 7.21ms ± 5% 7.24ms ± 5% ~ (p=0.612 n=30+30) GobEncode-4 6.91ms ± 6% 6.91ms ± 6% ~ (p=0.797 n=30+30) Gzip-4 398ms ± 6% 399ms ± 4% ~ (p=0.173 n=30+30) Gunzip-4 41.7ms ± 3% 41.8ms ± 3% ~ (p=0.423 n=30+30) HTTPClientServer-4 62.3µs ± 2% 62.7µs ± 3% ~ (p=0.085 n=29+30) JSONEncode-4 21.0ms ± 4% 20.7ms ± 5% -1.39% (p=0.014 n=30+30) JSONDecode-4 66.3ms ± 3% 67.4ms ± 1% +1.71% (p=0.003 n=30+24) Mandelbrot200-4 5.15ms ± 3% 5.16ms ± 3% ~ (p=0.697 n=30+30) GoParse-4 3.24ms ± 3% 3.27ms ± 4% +0.91% (p=0.032 n=30+30) RegexpMatchEasy0_32-4 101ns ± 5% 99ns ± 4% -1.82% (p=0.008 n=29+30) RegexpMatchEasy0_1K-4 848ns ± 4% 841ns ± 2% -0.77% (p=0.043 n=30+30) RegexpMatchEasy1_32-4 106ns ± 6% 106ns ± 3% ~ (p=0.939 n=29+30) RegexpMatchEasy1_1K-4 1.02µs ± 3% 1.03µs ± 4% ~ (p=0.297 n=28+30) RegexpMatchMedium_32-4 129ns ± 4% 127ns ± 4% ~ (p=0.073 n=30+30) RegexpMatchMedium_1K-4 43.9µs ± 3% 43.8µs ± 3% ~ (p=0.186 n=30+30) RegexpMatchHard_32-4 2.24µs ± 4% 2.22µs ± 4% ~ (p=0.332 n=30+29) RegexpMatchHard_1K-4 68.0µs ± 4% 67.5µs ± 3% ~ (p=0.290 n=30+30) Revcomp-4 1.85s ± 3% 
1.85s ± 3% ~ (p=0.358 n=30+30) Template-4 69.6ms ± 3% 70.0ms ± 4% ~ (p=0.273 n=30+30) TimeParse-4 445ns ± 3% 441ns ± 3% ~ (p=0.494 n=30+30) TimeFormat-4 412ns ± 3% 412ns ± 6% ~ (p=0.841 n=30+30) [Geo mean] 66.7µs 66.8µs +0.13% name old speed new speed delta GobDecode-4 107MB/s ± 5% 106MB/s ± 5% ~ (p=0.615 n=30+30) GobEncode-4 111MB/s ± 6% 111MB/s ± 6% ~ (p=0.790 n=30+30) Gzip-4 48.8MB/s ± 6% 48.7MB/s ± 4% ~ (p=0.167 n=30+30) Gunzip-4 465MB/s ± 3% 465MB/s ± 3% ~ (p=0.420 n=30+30) JSONEncode-4 92.4MB/s ± 4% 93.7MB/s ± 5% +1.42% (p=0.015 n=30+30) JSONDecode-4 29.3MB/s ± 3% 28.8MB/s ± 1% -1.72% (p=0.003 n=30+24) GoParse-4 17.9MB/s ± 3% 17.7MB/s ± 4% -0.89% (p=0.037 n=30+30) RegexpMatchEasy0_32-4 317MB/s ± 8% 324MB/s ± 4% +2.14% (p=0.006 n=30+30) RegexpMatchEasy0_1K-4 1.21GB/s ± 4% 1.22GB/s ± 2% +0.77% (p=0.036 n=30+30) RegexpMatchEasy1_32-4 298MB/s ± 7% 299MB/s ± 4% ~ (p=0.511 n=30+30) RegexpMatchEasy1_1K-4 1.00GB/s ± 3% 1.00GB/s ± 4% ~ (p=0.304 n=28+30) RegexpMatchMedium_32-4 7.75MB/s ± 4% 7.82MB/s ± 4% ~ (p=0.089 n=30+30) RegexpMatchMedium_1K-4 23.3MB/s ± 3% 23.4MB/s ± 3% ~ (p=0.181 n=30+30) RegexpMatchHard_32-4 14.3MB/s ± 4% 14.4MB/s ± 4% ~ (p=0.320 n=30+29) RegexpMatchHard_1K-4 15.1MB/s ± 4% 15.2MB/s ± 3% ~ (p=0.273 n=30+30) Revcomp-4 137MB/s ± 3% 137MB/s ± 3% ~ (p=0.352 n=30+30) Template-4 27.9MB/s ± 3% 27.7MB/s ± 4% ~ (p=0.277 n=30+30) [Geo mean] 79.9MB/s 80.1MB/s +0.15% Change-Id: I97333cd8ddabb3c7c88ca5aa9e14a005b74d306d Reviewed-on: https://go-review.googlesource.com/120695 Run-TryBot: Ben Shi <powerman1st@163.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2018-06-24 07:04:21 +00:00
{name: "DIVSSload", argLength: 3, reg: fp21load, asm: "DIVSS", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp32 arg0 / tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
{name: "DIVSDload", argLength: 3, reg: fp21load, asm: "DIVSD", aux: "SymOff", resultInArg0: true, faultOnNilArg1: true, symEffect: "Read"}, // fp64 arg0 / tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
cmd/compile: optimize 386 binary operations with a memory operand Some integer/float binary operations of 386 can take a direct memory operand, which is more efficient than loading to a register. These CL does this optimization by copying the similar solution of amd64. And the go1 benchmark shows some inprovements, especially the test case Template. (excluding noise) name old time/op new time/op delta BinaryTree17-4 3.42s ± 2% 3.40s ± 2% ~ (p=0.069 n=38+39) Fannkuch11-4 3.48s ± 1% 3.53s ± 1% +1.59% (p=0.000 n=40+40) FmtFprintfEmpty-4 46.7ns ± 4% 46.3ns ± 3% -1.03% (p=0.001 n=40+40) FmtFprintfString-4 80.1ns ± 3% 80.6ns ± 3% +0.56% (p=0.029 n=40+40) FmtFprintfInt-4 92.4ns ± 2% 92.3ns ± 3% ~ (p=0.847 n=40+40) FmtFprintfIntInt-4 147ns ± 3% 144ns ± 3% -1.87% (p=0.000 n=40+40) FmtFprintfPrefixedInt-4 182ns ± 2% 184ns ± 3% +0.99% (p=0.002 n=40+40) FmtFprintfFloat-4 387ns ± 3% 384ns ± 3% ~ (p=0.069 n=40+40) FmtManyArgs-4 619ns ± 3% 616ns ± 3% ~ (p=0.320 n=40+40) GobDecode-4 7.28ms ± 6% 7.27ms ± 5% ~ (p=0.897 n=40+40) GobEncode-4 7.33ms ± 6% 7.21ms ± 6% -1.56% (p=0.022 n=38+40) Gzip-4 357ms ± 4% 357ms ± 4% ~ (p=0.071 n=40+40) Gunzip-4 45.3ms ± 3% 45.4ms ± 3% ~ (p=0.452 n=40+40) HTTPClientServer-4 63.0µs ± 2% 62.9µs ± 3% ~ (p=0.760 n=38+39) JSONEncode-4 22.0ms ± 4% 21.7ms ± 4% -1.49% (p=0.000 n=40+40) JSONDecode-4 67.7ms ± 4% 68.3ms ± 3% +0.86% (p=0.039 n=40+40) Mandelbrot200-4 5.16ms ± 3% 5.17ms ± 3% ~ (p=0.418 n=40+40) GoParse-4 3.30ms ± 2% 3.32ms ± 3% +0.55% (p=0.017 n=40+40) RegexpMatchEasy0_32-4 104ns ± 3% 104ns ± 4% ~ (p=0.992 n=40+40) RegexpMatchEasy0_1K-4 852ns ± 3% 851ns ± 2% ~ (p=0.344 n=40+40) RegexpMatchEasy1_32-4 113ns ± 4% 113ns ± 5% ~ (p=0.937 n=40+40) RegexpMatchEasy1_1K-4 1.03µs ± 5% 1.04µs ± 4% ~ (p=0.430 n=40+40) RegexpMatchMedium_32-4 132ns ± 4% 131ns ± 3% -1.06% (p=0.027 n=40+40) RegexpMatchMedium_1K-4 43.0µs ± 3% 43.2µs ± 3% ~ (p=0.122 n=40+40) RegexpMatchHard_32-4 2.21µs ± 4% 2.20µs ± 4% ~ (p=0.146 n=40+40) RegexpMatchHard_1K-4 67.1µs ± 4% 67.2µs ± 3% 
~ (p=0.859 n=40+40) Revcomp-4 1.85s ± 2% 1.85s ± 3% ~ (p=0.184 n=40+40) Template-4 70.1ms ± 4% 67.5ms ± 3% -3.65% (p=0.000 n=40+40) TimeParse-4 457ns ±16% 439ns ± 4% ~ (p=0.683 n=40+34) TimeFormat-4 413ns ± 3% 414ns ± 3% ~ (p=0.850 n=40+40) [Geo mean] 67.5µs 67.3µs -0.38% name old speed new speed delta GobDecode-4 105MB/s ± 6% 106MB/s ± 5% ~ (p=0.893 n=40+40) GobEncode-4 105MB/s ± 6% 107MB/s ± 7% +1.60% (p=0.023 n=38+40) Gzip-4 54.4MB/s ± 4% 54.5MB/s ± 4% ~ (p=0.073 n=40+40) Gunzip-4 429MB/s ± 3% 428MB/s ± 3% ~ (p=0.453 n=40+40) JSONEncode-4 88.3MB/s ± 5% 89.6MB/s ± 4% +1.51% (p=0.000 n=40+40) JSONDecode-4 28.7MB/s ± 4% 28.4MB/s ± 3% -0.87% (p=0.039 n=40+40) GoParse-4 17.6MB/s ± 3% 17.5MB/s ± 3% -0.55% (p=0.020 n=40+40) RegexpMatchEasy0_32-4 308MB/s ± 4% 308MB/s ± 5% ~ (p=0.988 n=40+40) RegexpMatchEasy0_1K-4 1.20GB/s ± 3% 1.20GB/s ± 2% ~ (p=0.329 n=40+40) RegexpMatchEasy1_32-4 283MB/s ± 4% 283MB/s ± 4% ~ (p=0.507 n=40+40) RegexpMatchEasy1_1K-4 991MB/s ± 5% 987MB/s ± 4% ~ (p=0.446 n=40+40) RegexpMatchMedium_32-4 7.54MB/s ± 4% 7.63MB/s ± 3% +1.26% (p=0.004 n=40+40) RegexpMatchMedium_1K-4 23.8MB/s ± 3% 23.7MB/s ± 4% ~ (p=0.121 n=40+40) RegexpMatchHard_32-4 14.5MB/s ± 4% 14.6MB/s ± 4% ~ (p=0.145 n=40+40) RegexpMatchHard_1K-4 15.3MB/s ± 4% 15.2MB/s ± 3% ~ (p=0.874 n=40+40) Revcomp-4 137MB/s ± 2% 137MB/s ± 3% ~ (p=0.179 n=40+40) Template-4 27.7MB/s ± 4% 28.7MB/s ± 3% +3.78% (p=0.000 n=40+40) [Geo mean] 78.9MB/s 79.2MB/s +0.38% Change-Id: I3ba688c253b665485c1ebdf5a75f4ce82cc3def3 Reviewed-on: https://go-review.googlesource.com/102036 Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Ilya Tocar <ilya.tocar@intel.com> Reviewed-by: Keith Randall <khr@golang.org>
2018-03-22 02:18:50 +00:00
// binary ops
{name: "ADDL", argLength: 2, reg: gp21sp, asm: "ADDL", commutative: true, clobberFlags: true}, // arg0 + arg1
{name: "ADDLconst", argLength: 1, reg: gp11sp, asm: "ADDL", aux: "Int32", typ: "UInt32", clobberFlags: true}, // arg0 + auxint
{name: "ADDLcarry", argLength: 2, reg: gp21carry, asm: "ADDL", commutative: true, resultInArg0: true}, // arg0 + arg1, generates <carry,result> pair
{name: "ADDLconstcarry", argLength: 1, reg: gp11carry, asm: "ADDL", aux: "Int32", resultInArg0: true}, // arg0 + auxint, generates <carry,result> pair
{name: "ADCL", argLength: 3, reg: gp2carry1, asm: "ADCL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0+arg1+carry(arg2), where arg2 is flags
{name: "ADCLconst", argLength: 2, reg: gp1carry1, asm: "ADCL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0+auxint+carry(arg1), where arg1 is flags
{name: "SUBL", argLength: 2, reg: gp21, asm: "SUBL", resultInArg0: true, clobberFlags: true}, // arg0 - arg1
{name: "SUBLconst", argLength: 1, reg: gp11, asm: "SUBL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 - auxint
{name: "SUBLcarry", argLength: 2, reg: gp21carry, asm: "SUBL", resultInArg0: true}, // arg0-arg1, generates <borrow,result> pair
{name: "SUBLconstcarry", argLength: 1, reg: gp11carry, asm: "SUBL", aux: "Int32", resultInArg0: true}, // arg0-auxint, generates <borrow,result> pair
{name: "SBBL", argLength: 3, reg: gp2carry1, asm: "SBBL", resultInArg0: true, clobberFlags: true}, // arg0-arg1-borrow(arg2), where arg2 is flags
{name: "SBBLconst", argLength: 2, reg: gp1carry1, asm: "SBBL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0-auxint-borrow(arg1), where arg1 is flags
{name: "MULL", argLength: 2, reg: gp21, asm: "IMULL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 * arg1
{name: "MULLconst", argLength: 1, reg: gp11, asm: "IMUL3L", aux: "Int32", clobberFlags: true}, // arg0 * auxint
{name: "MULLU", argLength: 2, reg: regInfo{inputs: []regMask{ax, gpsp}, outputs: []regMask{ax, 0}, clobbers: dx}, typ: "(UInt32,Flags)", asm: "MULL", commutative: true, clobberFlags: true}, // Let x = arg0*arg1 (full 32x32->64 unsigned multiply). Returns uint32(x), and flags set to overflow if uint32(x) != x.
cmd/compile: automatically handle commuting ops in rewrite rules Note that this is a redo of an undo of the original buggy CL 38666. We have lots of rewrite rules that vary only in the fact that we have 2 versions for the 2 different orderings of various commuting ops. For example: (ADDL x (MOVLconst [c])) -> (ADDLconst [c] x) (ADDL (MOVLconst [c]) x) -> (ADDLconst [c] x) It can get unwieldly quickly, especially when there is more than one commuting op in a rule. Our existing "fix" for this problem is to have rules that canonicalize the operations first. For example: (Eq64 x (Const64 <t> [c])) && x.Op != OpConst64 -> (Eq64 (Const64 <t> [c]) x) Subsequent rules can then assume if there is a constant arg to Eq64, it will be the first one. This fix kinda works, but it is fragile and only works when we remember to include the required extra rules. The fundamental problem is that the rule matcher doesn't know anything about commuting ops. This CL fixes that fact. We already have information about which ops commute. (The register allocator takes advantage of commutivity.) The rule generator now automatically generates multiple rules for a single source rule when there are commutative ops in the rule. We can now drop all of our almost-duplicate source-level rules and the canonicalization rules. I have some CLs in progress that will be a lot less verbose when the rule generator handles commutivity for me. I had to reorganize the load-combining rules a bit. The 8-way OR rules generated 128 different reorderings, which was causing the generator to put too much code in the rewrite*.go files (the big ones were going from 25K lines to 132K lines). Instead I reorganized the rules to combine pairs of loads at a time. The generated rule files are now actually a bit (5%) smaller. Make.bash times are ~unchanged. Compiler benchmarks are not observably different. Probably because we don't spend much compiler time in rule matching anyway. 
I've also done a pass over all of our ops adding commutative markings for ops which hadn't had them previously. Fixes #18292 Change-Id: Ic1c0e43fbf579539f459971625f69690c9ab8805 Reviewed-on: https://go-review.googlesource.com/38801 Run-TryBot: Keith Randall <khr@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: David Chase <drchase@google.com>
2017-03-30 03:30:22 +00:00
{name: "HMULL", argLength: 2, reg: gp21hmul, commutative: true, asm: "IMULL", clobberFlags: true}, // (arg0 * arg1) >> width
{name: "HMULLU", argLength: 2, reg: gp21hmul, commutative: true, asm: "MULL", clobberFlags: true}, // (arg0 * arg1) >> width
cmd/compile: automatically handle commuting ops in rewrite rules Note that this is a redo of an undo of the original buggy CL 38666. We have lots of rewrite rules that vary only in the fact that we have 2 versions for the 2 different orderings of various commuting ops. For example: (ADDL x (MOVLconst [c])) -> (ADDLconst [c] x) (ADDL (MOVLconst [c]) x) -> (ADDLconst [c] x) It can get unwieldly quickly, especially when there is more than one commuting op in a rule. Our existing "fix" for this problem is to have rules that canonicalize the operations first. For example: (Eq64 x (Const64 <t> [c])) && x.Op != OpConst64 -> (Eq64 (Const64 <t> [c]) x) Subsequent rules can then assume if there is a constant arg to Eq64, it will be the first one. This fix kinda works, but it is fragile and only works when we remember to include the required extra rules. The fundamental problem is that the rule matcher doesn't know anything about commuting ops. This CL fixes that fact. We already have information about which ops commute. (The register allocator takes advantage of commutivity.) The rule generator now automatically generates multiple rules for a single source rule when there are commutative ops in the rule. We can now drop all of our almost-duplicate source-level rules and the canonicalization rules. I have some CLs in progress that will be a lot less verbose when the rule generator handles commutivity for me. I had to reorganize the load-combining rules a bit. The 8-way OR rules generated 128 different reorderings, which was causing the generator to put too much code in the rewrite*.go files (the big ones were going from 25K lines to 132K lines). Instead I reorganized the rules to combine pairs of loads at a time. The generated rule files are now actually a bit (5%) smaller. Make.bash times are ~unchanged. Compiler benchmarks are not observably different. Probably because we don't spend much compiler time in rule matching anyway. 
I've also done a pass over all of our ops adding commutative markings for ops which hadn't had them previously. Fixes #18292 Change-Id: Ic1c0e43fbf579539f459971625f69690c9ab8805 Reviewed-on: https://go-review.googlesource.com/38801 Run-TryBot: Keith Randall <khr@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: David Chase <drchase@google.com>
2017-03-30 03:30:22 +00:00
{name: "MULLQU", argLength: 2, reg: gp21mul, commutative: true, asm: "MULL", clobberFlags: true}, // arg0 * arg1, high 32 in result[0], low 32 in result[1]
{name: "AVGLU", argLength: 2, reg: gp21, commutative: true, resultInArg0: true, clobberFlags: true}, // (arg0 + arg1) / 2 as unsigned, all 32 result bits
// For DIVL, DIVW, MODL and MODW, AuxInt non-zero means that the divisor has been proved to be not -1.
{name: "DIVL", argLength: 2, reg: gp11div, asm: "IDIVL", aux: "Bool", clobberFlags: true}, // arg0 / arg1
{name: "DIVW", argLength: 2, reg: gp11div, asm: "IDIVW", aux: "Bool", clobberFlags: true}, // arg0 / arg1
{name: "DIVLU", argLength: 2, reg: gp11div, asm: "DIVL", clobberFlags: true}, // arg0 / arg1
{name: "DIVWU", argLength: 2, reg: gp11div, asm: "DIVW", clobberFlags: true}, // arg0 / arg1
{name: "MODL", argLength: 2, reg: gp11mod, asm: "IDIVL", aux: "Bool", clobberFlags: true}, // arg0 % arg1
{name: "MODW", argLength: 2, reg: gp11mod, asm: "IDIVW", aux: "Bool", clobberFlags: true}, // arg0 % arg1
{name: "MODLU", argLength: 2, reg: gp11mod, asm: "DIVL", clobberFlags: true}, // arg0 % arg1
{name: "MODWU", argLength: 2, reg: gp11mod, asm: "DIVW", clobberFlags: true}, // arg0 % arg1
{name: "ANDL", argLength: 2, reg: gp21, asm: "ANDL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 & arg1
{name: "ANDLconst", argLength: 1, reg: gp11, asm: "ANDL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 & auxint
{name: "ORL", argLength: 2, reg: gp21, asm: "ORL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 | arg1
{name: "ORLconst", argLength: 1, reg: gp11, asm: "ORL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 | auxint
{name: "XORL", argLength: 2, reg: gp21, asm: "XORL", commutative: true, resultInArg0: true, clobberFlags: true}, // arg0 ^ arg1
{name: "XORLconst", argLength: 1, reg: gp11, asm: "XORL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 ^ auxint
{name: "CMPL", argLength: 2, reg: gp2flags, asm: "CMPL", typ: "Flags"}, // arg0 compare to arg1
{name: "CMPW", argLength: 2, reg: gp2flags, asm: "CMPW", typ: "Flags"}, // arg0 compare to arg1
{name: "CMPB", argLength: 2, reg: gp2flags, asm: "CMPB", typ: "Flags"}, // arg0 compare to arg1
{name: "CMPLconst", argLength: 1, reg: gp1flags, asm: "CMPL", typ: "Flags", aux: "Int32"}, // arg0 compare to auxint
{name: "CMPWconst", argLength: 1, reg: gp1flags, asm: "CMPW", typ: "Flags", aux: "Int16"}, // arg0 compare to auxint
{name: "CMPBconst", argLength: 1, reg: gp1flags, asm: "CMPB", typ: "Flags", aux: "Int8"}, // arg0 compare to auxint
cmd/compile: optimize 386's comparison CMPL/CMPW/CMPB can take a memory operand on 386, and this CL implements that optimization. 1. The total size of pkg/linux_386 decreases about 45KB, excluding cmd/compile. 2. The go1 benchmark shows a little improvement. name old time/op new time/op delta BinaryTree17-4 3.36s ± 2% 3.37s ± 3% ~ (p=0.537 n=40+40) Fannkuch11-4 3.59s ± 1% 3.53s ± 2% -1.58% (p=0.000 n=40+40) FmtFprintfEmpty-4 46.0ns ± 3% 45.8ns ± 3% ~ (p=0.249 n=40+40) FmtFprintfString-4 80.0ns ± 4% 78.8ns ± 3% -1.49% (p=0.001 n=40+40) FmtFprintfInt-4 89.7ns ± 2% 90.3ns ± 2% +0.74% (p=0.003 n=40+40) FmtFprintfIntInt-4 144ns ± 3% 143ns ± 3% -0.95% (p=0.003 n=40+40) FmtFprintfPrefixedInt-4 181ns ± 4% 180ns ± 2% ~ (p=0.103 n=40+40) FmtFprintfFloat-4 412ns ± 3% 408ns ± 4% -0.97% (p=0.018 n=40+40) FmtManyArgs-4 607ns ± 4% 605ns ± 4% ~ (p=0.148 n=40+40) GobDecode-4 7.19ms ± 4% 7.24ms ± 5% ~ (p=0.340 n=40+40) GobEncode-4 7.04ms ± 9% 6.99ms ± 9% ~ (p=0.289 n=40+40) Gzip-4 400ms ± 6% 398ms ± 5% ~ (p=0.168 n=40+40) Gunzip-4 41.2ms ± 3% 41.7ms ± 3% +1.40% (p=0.001 n=40+40) HTTPClientServer-4 62.5µs ± 1% 62.1µs ± 2% -0.61% (p=0.000 n=37+37) JSONEncode-4 20.7ms ± 4% 20.4ms ± 3% -1.60% (p=0.000 n=40+40) JSONDecode-4 69.4ms ± 4% 69.2ms ± 6% ~ (p=0.177 n=40+40) Mandelbrot200-4 5.22ms ± 6% 5.21ms ± 3% ~ (p=0.531 n=40+40) GoParse-4 3.29ms ± 3% 3.28ms ± 3% ~ (p=0.321 n=40+39) RegexpMatchEasy0_32-4 104ns ± 4% 103ns ± 7% -0.89% (p=0.040 n=40+40) RegexpMatchEasy0_1K-4 852ns ± 3% 853ns ± 2% ~ (p=0.357 n=40+40) RegexpMatchEasy1_32-4 113ns ± 8% 113ns ± 3% ~ (p=0.906 n=40+40) RegexpMatchEasy1_1K-4 1.03µs ± 4% 1.03µs ± 5% ~ (p=0.326 n=40+40) RegexpMatchMedium_32-4 136ns ± 3% 133ns ± 3% -2.31% (p=0.000 n=40+40) RegexpMatchMedium_1K-4 44.0µs ± 3% 43.7µs ± 3% ~ (p=0.053 n=40+40) RegexpMatchHard_32-4 2.27µs ± 3% 2.26µs ± 4% ~ (p=0.391 n=40+40) RegexpMatchHard_1K-4 68.0µs ± 3% 68.9µs ± 3% +1.28% (p=0.000 n=40+40) Revcomp-4 1.86s ± 5% 1.86s ± 2% ~ (p=0.950 n=40+40) Template-4 73.4ms ± 4% 69.9ms ± 
7% -4.78% (p=0.000 n=40+40) TimeParse-4 449ns ± 4% 441ns ± 5% -1.76% (p=0.000 n=40+40) TimeFormat-4 416ns ± 3% 417ns ± 4% ~ (p=0.304 n=40+40) [Geo mean] 67.7µs 67.3µs -0.55% name old speed new speed delta GobDecode-4 107MB/s ± 4% 106MB/s ± 5% ~ (p=0.336 n=40+40) GobEncode-4 109MB/s ± 5% 110MB/s ± 9% ~ (p=0.142 n=38+40) Gzip-4 48.5MB/s ± 5% 48.8MB/s ± 5% ~ (p=0.172 n=40+40) Gunzip-4 472MB/s ± 3% 465MB/s ± 3% -1.39% (p=0.001 n=40+40) JSONEncode-4 93.6MB/s ± 4% 95.1MB/s ± 3% +1.61% (p=0.000 n=40+40) JSONDecode-4 28.0MB/s ± 3% 28.1MB/s ± 6% ~ (p=0.181 n=40+40) GoParse-4 17.6MB/s ± 3% 17.7MB/s ± 3% ~ (p=0.350 n=40+39) RegexpMatchEasy0_32-4 308MB/s ± 4% 311MB/s ± 6% +0.96% (p=0.025 n=40+40) RegexpMatchEasy0_1K-4 1.20GB/s ± 3% 1.20GB/s ± 2% ~ (p=0.317 n=40+40) RegexpMatchEasy1_32-4 282MB/s ± 7% 282MB/s ± 3% ~ (p=0.516 n=40+40) RegexpMatchEasy1_1K-4 994MB/s ± 4% 991MB/s ± 5% ~ (p=0.319 n=40+40) RegexpMatchMedium_32-4 7.31MB/s ± 3% 7.49MB/s ± 3% +2.46% (p=0.000 n=40+40) RegexpMatchMedium_1K-4 23.3MB/s ± 3% 23.4MB/s ± 3% ~ (p=0.052 n=40+40) RegexpMatchHard_32-4 14.1MB/s ± 3% 14.1MB/s ± 4% ~ (p=0.391 n=40+40) RegexpMatchHard_1K-4 15.1MB/s ± 3% 14.9MB/s ± 3% -1.27% (p=0.000 n=40+40) Revcomp-4 137MB/s ± 5% 137MB/s ± 2% ~ (p=0.942 n=40+40) Template-4 26.5MB/s ± 4% 27.8MB/s ± 7% +5.03% (p=0.000 n=40+40) [Geo mean] 78.6MB/s 79.0MB/s +0.57% Change-Id: Idcacc6881ef57cd7dc33aa87b711282842b72a53 Reviewed-on: https://go-review.googlesource.com/126618 Run-TryBot: Ben Shi <powerman1st@163.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2018-07-29 12:50:50 +00:00
// compare *(arg0+auxint+aux) to arg1 (in that order). arg2=mem.
{name: "CMPLload", argLength: 3, reg: gp1flagsLoad, asm: "CMPL", aux: "SymOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true},
{name: "CMPWload", argLength: 3, reg: gp1flagsLoad, asm: "CMPW", aux: "SymOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true},
{name: "CMPBload", argLength: 3, reg: gp1flagsLoad, asm: "CMPB", aux: "SymOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true},
// compare *(arg0+ValAndOff(AuxInt).Off()+aux) to ValAndOff(AuxInt).Val() (in that order). arg1=mem.
{name: "CMPLconstload", argLength: 2, reg: gp0flagsLoad, asm: "CMPL", aux: "SymValAndOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true},
{name: "CMPWconstload", argLength: 2, reg: gp0flagsLoad, asm: "CMPW", aux: "SymValAndOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true},
{name: "CMPBconstload", argLength: 2, reg: gp0flagsLoad, asm: "CMPB", aux: "SymValAndOff", typ: "Flags", symEffect: "Read", faultOnNilArg0: true},
{name: "UCOMISS", argLength: 2, reg: fp2flags, asm: "UCOMISS", typ: "Flags"}, // arg0 compare to arg1, f32
{name: "UCOMISD", argLength: 2, reg: fp2flags, asm: "UCOMISD", typ: "Flags"}, // arg0 compare to arg1, f64
cmd/compile: automatically handle commuting ops in rewrite rules Note that this is a redo of an undo of the original buggy CL 38666. We have lots of rewrite rules that vary only in the fact that we have 2 versions for the 2 different orderings of various commuting ops. For example: (ADDL x (MOVLconst [c])) -> (ADDLconst [c] x) (ADDL (MOVLconst [c]) x) -> (ADDLconst [c] x) It can get unwieldly quickly, especially when there is more than one commuting op in a rule. Our existing "fix" for this problem is to have rules that canonicalize the operations first. For example: (Eq64 x (Const64 <t> [c])) && x.Op != OpConst64 -> (Eq64 (Const64 <t> [c]) x) Subsequent rules can then assume if there is a constant arg to Eq64, it will be the first one. This fix kinda works, but it is fragile and only works when we remember to include the required extra rules. The fundamental problem is that the rule matcher doesn't know anything about commuting ops. This CL fixes that fact. We already have information about which ops commute. (The register allocator takes advantage of commutivity.) The rule generator now automatically generates multiple rules for a single source rule when there are commutative ops in the rule. We can now drop all of our almost-duplicate source-level rules and the canonicalization rules. I have some CLs in progress that will be a lot less verbose when the rule generator handles commutivity for me. I had to reorganize the load-combining rules a bit. The 8-way OR rules generated 128 different reorderings, which was causing the generator to put too much code in the rewrite*.go files (the big ones were going from 25K lines to 132K lines). Instead I reorganized the rules to combine pairs of loads at a time. The generated rule files are now actually a bit (5%) smaller. Make.bash times are ~unchanged. Compiler benchmarks are not observably different. Probably because we don't spend much compiler time in rule matching anyway. 
I've also done a pass over all of our ops adding commutative markings for ops which hadn't had them previously. Fixes #18292 Change-Id: Ic1c0e43fbf579539f459971625f69690c9ab8805 Reviewed-on: https://go-review.googlesource.com/38801 Run-TryBot: Keith Randall <khr@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: David Chase <drchase@google.com>
2017-03-30 03:30:22 +00:00
{name: "TESTL", argLength: 2, reg: gp2flags, commutative: true, asm: "TESTL", typ: "Flags"}, // (arg0 & arg1) compare to 0
{name: "TESTW", argLength: 2, reg: gp2flags, commutative: true, asm: "TESTW", typ: "Flags"}, // (arg0 & arg1) compare to 0
{name: "TESTB", argLength: 2, reg: gp2flags, commutative: true, asm: "TESTB", typ: "Flags"}, // (arg0 & arg1) compare to 0
{name: "TESTLconst", argLength: 1, reg: gp1flags, asm: "TESTL", typ: "Flags", aux: "Int32"}, // (arg0 & auxint) compare to 0
{name: "TESTWconst", argLength: 1, reg: gp1flags, asm: "TESTW", typ: "Flags", aux: "Int16"}, // (arg0 & auxint) compare to 0
{name: "TESTBconst", argLength: 1, reg: gp1flags, asm: "TESTB", typ: "Flags", aux: "Int8"}, // (arg0 & auxint) compare to 0
{name: "SHLL", argLength: 2, reg: gp21shift, asm: "SHLL", resultInArg0: true, clobberFlags: true}, // arg0 << arg1, shift amount is mod 32
{name: "SHLLconst", argLength: 1, reg: gp11, asm: "SHLL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 << auxint, shift amount 0-31
// Note: x86 is weird, the 16 and 8 bit shifts still use all 5 bits of shift amount!
{name: "SHRL", argLength: 2, reg: gp21shift, asm: "SHRL", resultInArg0: true, clobberFlags: true}, // unsigned arg0 >> arg1, shift amount is mod 32
{name: "SHRW", argLength: 2, reg: gp21shift, asm: "SHRW", resultInArg0: true, clobberFlags: true}, // unsigned arg0 >> arg1, shift amount is mod 32
{name: "SHRB", argLength: 2, reg: gp21shift, asm: "SHRB", resultInArg0: true, clobberFlags: true}, // unsigned arg0 >> arg1, shift amount is mod 32
{name: "SHRLconst", argLength: 1, reg: gp11, asm: "SHRL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // unsigned arg0 >> auxint, shift amount 0-31
{name: "SHRWconst", argLength: 1, reg: gp11, asm: "SHRW", aux: "Int16", resultInArg0: true, clobberFlags: true}, // unsigned arg0 >> auxint, shift amount 0-15
{name: "SHRBconst", argLength: 1, reg: gp11, asm: "SHRB", aux: "Int8", resultInArg0: true, clobberFlags: true}, // unsigned arg0 >> auxint, shift amount 0-7
{name: "SARL", argLength: 2, reg: gp21shift, asm: "SARL", resultInArg0: true, clobberFlags: true}, // signed arg0 >> arg1, shift amount is mod 32
{name: "SARW", argLength: 2, reg: gp21shift, asm: "SARW", resultInArg0: true, clobberFlags: true}, // signed arg0 >> arg1, shift amount is mod 32
{name: "SARB", argLength: 2, reg: gp21shift, asm: "SARB", resultInArg0: true, clobberFlags: true}, // signed arg0 >> arg1, shift amount is mod 32
{name: "SARLconst", argLength: 1, reg: gp11, asm: "SARL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // signed arg0 >> auxint, shift amount 0-31
{name: "SARWconst", argLength: 1, reg: gp11, asm: "SARW", aux: "Int16", resultInArg0: true, clobberFlags: true}, // signed arg0 >> auxint, shift amount 0-15
{name: "SARBconst", argLength: 1, reg: gp11, asm: "SARB", aux: "Int8", resultInArg0: true, clobberFlags: true}, // signed arg0 >> auxint, shift amount 0-7
{name: "ROLL", argLength: 2, reg: gp21shift, asm: "ROLL", resultInArg0: true, clobberFlags: true}, // 32 bits of arg0 rotate left by arg1
{name: "ROLW", argLength: 2, reg: gp21shift, asm: "ROLW", resultInArg0: true, clobberFlags: true}, // low 16 bits of arg0 rotate left by arg1
{name: "ROLB", argLength: 2, reg: gp21shift, asm: "ROLB", resultInArg0: true, clobberFlags: true}, // low 8 bits of arg0 rotate left by arg1
{name: "ROLLconst", argLength: 1, reg: gp11, asm: "ROLL", aux: "Int32", resultInArg0: true, clobberFlags: true}, // arg0 rotate left auxint, rotate amount 0-31
{name: "ROLWconst", argLength: 1, reg: gp11, asm: "ROLW", aux: "Int16", resultInArg0: true, clobberFlags: true}, // arg0 rotate left auxint, rotate amount 0-15
{name: "ROLBconst", argLength: 1, reg: gp11, asm: "ROLB", aux: "Int8", resultInArg0: true, clobberFlags: true}, // arg0 rotate left auxint, rotate amount 0-7
cmd/compile: add indexed form for several 386 instructions This CL implements indexed memory operands for the following instructions. (ADD|SUB|MUL|AND|OR|XOR)Lload -> (ADD|SUB|MUL|AND|OR|XOR)Lloadidx4 (ADD|SUB|AND|OR|XOR)Lmodify -> (ADD|SUB|AND|OR|XOR)Lmodifyidx4 (ADD|AND|OR|XOR)Lconstmodify -> (ADD|AND|OR|XOR)Lconstmodifyidx4 1. The total size of pkg/linux_386/ decreases about 2.5KB, excluding cmd/compile/ . 2. There is little regression in the go1 benchmark test, excluding noise. name old time/op new time/op delta BinaryTree17-4 3.25s ± 3% 3.25s ± 3% ~ (p=0.218 n=40+40) Fannkuch11-4 3.53s ± 1% 3.53s ± 1% ~ (p=0.303 n=40+40) FmtFprintfEmpty-4 44.9ns ± 3% 45.6ns ± 3% +1.48% (p=0.030 n=40+36) FmtFprintfString-4 78.7ns ± 5% 80.1ns ± 7% ~ (p=0.217 n=36+40) FmtFprintfInt-4 90.2ns ± 6% 89.8ns ± 5% ~ (p=0.659 n=40+38) FmtFprintfIntInt-4 140ns ± 5% 141ns ± 5% +1.00% (p=0.027 n=40+40) FmtFprintfPrefixedInt-4 185ns ± 3% 183ns ± 3% ~ (p=0.104 n=40+40) FmtFprintfFloat-4 411ns ± 4% 406ns ± 3% -1.37% (p=0.005 n=40+40) FmtManyArgs-4 590ns ± 4% 598ns ± 4% +1.35% (p=0.008 n=40+40) GobDecode-4 7.16ms ± 5% 7.10ms ± 5% ~ (p=0.335 n=40+40) GobEncode-4 6.85ms ± 7% 6.74ms ± 9% ~ (p=0.058 n=38+40) Gzip-4 400ms ± 4% 399ms ± 2% -0.34% (p=0.003 n=40+33) Gunzip-4 41.4ms ± 3% 41.4ms ± 4% -0.12% (p=0.020 n=40+40) HTTPClientServer-4 64.1µs ± 4% 63.5µs ± 2% -1.07% (p=0.000 n=39+37) JSONEncode-4 15.9ms ± 2% 15.9ms ± 3% ~ (p=0.103 n=40+40) JSONDecode-4 62.2ms ± 4% 61.6ms ± 3% -0.98% (p=0.006 n=39+40) Mandelbrot200-4 5.18ms ± 3% 5.14ms ± 4% ~ (p=0.125 n=40+40) GoParse-4 3.29ms ± 2% 3.27ms ± 2% -0.66% (p=0.006 n=40+40) RegexpMatchEasy0_32-4 103ns ± 4% 103ns ± 4% ~ (p=0.632 n=40+40) RegexpMatchEasy0_1K-4 830ns ± 3% 828ns ± 3% ~ (p=0.563 n=40+40) RegexpMatchEasy1_32-4 113ns ± 4% 113ns ± 4% ~ (p=0.494 n=40+40) RegexpMatchEasy1_1K-4 1.03µs ± 4% 1.03µs ± 4% ~ (p=0.665 n=40+40) RegexpMatchMedium_32-4 130ns ± 4% 129ns ± 3% ~ (p=0.458 n=40+40) RegexpMatchMedium_1K-4 39.4µs ± 3% 39.7µs ± 3% ~ (p=0.825 
n=40+40) RegexpMatchHard_32-4 2.16µs ± 4% 2.15µs ± 4% ~ (p=0.137 n=40+40) RegexpMatchHard_1K-4 65.2µs ± 3% 65.4µs ± 4% ~ (p=0.160 n=40+40) Revcomp-4 1.87s ± 2% 1.87s ± 1% +0.17% (p=0.019 n=33+33) Template-4 69.4ms ± 3% 69.8ms ± 3% +0.60% (p=0.009 n=40+40) TimeParse-4 437ns ± 4% 438ns ± 4% ~ (p=0.234 n=40+40) TimeFormat-4 408ns ± 3% 408ns ± 3% ~ (p=0.904 n=40+40) [Geo mean] 65.7µs 65.6µs -0.08% name old speed new speed delta GobDecode-4 107MB/s ± 5% 108MB/s ± 5% ~ (p=0.336 n=40+40) GobEncode-4 112MB/s ± 6% 114MB/s ± 9% +1.95% (p=0.036 n=37+40) Gzip-4 48.5MB/s ± 4% 48.6MB/s ± 2% +0.28% (p=0.003 n=40+33) Gunzip-4 469MB/s ± 4% 469MB/s ± 4% +0.11% (p=0.021 n=40+40) JSONEncode-4 122MB/s ± 2% 122MB/s ± 3% ~ (p=0.105 n=40+40) JSONDecode-4 31.2MB/s ± 4% 31.5MB/s ± 4% +0.99% (p=0.007 n=39+40) GoParse-4 17.6MB/s ± 2% 17.7MB/s ± 2% +0.66% (p=0.007 n=40+40) RegexpMatchEasy0_32-4 310MB/s ± 4% 310MB/s ± 4% ~ (p=0.384 n=40+40) RegexpMatchEasy0_1K-4 1.23GB/s ± 3% 1.24GB/s ± 3% ~ (p=0.186 n=40+40) RegexpMatchEasy1_32-4 283MB/s ± 3% 281MB/s ± 4% ~ (p=0.855 n=40+40) RegexpMatchEasy1_1K-4 1.00GB/s ± 4% 1.00GB/s ± 4% ~ (p=0.665 n=40+40) RegexpMatchMedium_32-4 7.68MB/s ± 4% 7.73MB/s ± 3% ~ (p=0.359 n=40+40) RegexpMatchMedium_1K-4 26.0MB/s ± 3% 25.8MB/s ± 3% ~ (p=0.825 n=40+40) RegexpMatchHard_32-4 14.8MB/s ± 3% 14.9MB/s ± 4% ~ (p=0.136 n=40+40) RegexpMatchHard_1K-4 15.7MB/s ± 3% 15.7MB/s ± 4% ~ (p=0.150 n=40+40) Revcomp-4 136MB/s ± 1% 136MB/s ± 1% -0.09% (p=0.028 n=32+33) Template-4 28.0MB/s ± 3% 27.8MB/s ± 3% -0.59% (p=0.010 n=40+40) [Geo mean] 82.1MB/s 82.3MB/s +0.25% Change-Id: Ifa387a251056678326d3508aa02753b70bf7e5d0 Reviewed-on: https://go-review.googlesource.com/c/140303 Run-TryBot: Ben Shi <powerman1st@163.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2018-10-06 13:13:48 +00:00
// binary-op with a memory source operand
cmd/compile: optimize 386 code with MULLload/DIVSSload/DIVSDload IMULL/DIVSS/DIVSD all can take the source operand from memory directly. And this CL implement that optimization. 1. The total size of pkg/linux_386 decreases about 84KB (excluding cmd/compile). 2. The go1 benchmark shows little regression in total (excluding noise). name old time/op new time/op delta BinaryTree17-4 3.29s ± 2% 3.27s ± 4% ~ (p=0.192 n=30+30) Fannkuch11-4 3.49s ± 2% 3.54s ± 1% +1.48% (p=0.000 n=30+30) FmtFprintfEmpty-4 45.9ns ± 3% 46.3ns ± 4% +0.89% (p=0.037 n=30+30) FmtFprintfString-4 78.8ns ± 3% 78.7ns ± 4% ~ (p=0.209 n=30+27) FmtFprintfInt-4 91.0ns ± 2% 90.3ns ± 2% -0.82% (p=0.031 n=30+27) FmtFprintfIntInt-4 142ns ± 4% 143ns ± 4% ~ (p=0.136 n=30+30) FmtFprintfPrefixedInt-4 181ns ± 3% 183ns ± 4% +1.40% (p=0.005 n=30+30) FmtFprintfFloat-4 404ns ± 4% 408ns ± 3% ~ (p=0.397 n=30+30) FmtManyArgs-4 601ns ± 3% 609ns ± 5% ~ (p=0.059 n=30+30) GobDecode-4 7.21ms ± 5% 7.24ms ± 5% ~ (p=0.612 n=30+30) GobEncode-4 6.91ms ± 6% 6.91ms ± 6% ~ (p=0.797 n=30+30) Gzip-4 398ms ± 6% 399ms ± 4% ~ (p=0.173 n=30+30) Gunzip-4 41.7ms ± 3% 41.8ms ± 3% ~ (p=0.423 n=30+30) HTTPClientServer-4 62.3µs ± 2% 62.7µs ± 3% ~ (p=0.085 n=29+30) JSONEncode-4 21.0ms ± 4% 20.7ms ± 5% -1.39% (p=0.014 n=30+30) JSONDecode-4 66.3ms ± 3% 67.4ms ± 1% +1.71% (p=0.003 n=30+24) Mandelbrot200-4 5.15ms ± 3% 5.16ms ± 3% ~ (p=0.697 n=30+30) GoParse-4 3.24ms ± 3% 3.27ms ± 4% +0.91% (p=0.032 n=30+30) RegexpMatchEasy0_32-4 101ns ± 5% 99ns ± 4% -1.82% (p=0.008 n=29+30) RegexpMatchEasy0_1K-4 848ns ± 4% 841ns ± 2% -0.77% (p=0.043 n=30+30) RegexpMatchEasy1_32-4 106ns ± 6% 106ns ± 3% ~ (p=0.939 n=29+30) RegexpMatchEasy1_1K-4 1.02µs ± 3% 1.03µs ± 4% ~ (p=0.297 n=28+30) RegexpMatchMedium_32-4 129ns ± 4% 127ns ± 4% ~ (p=0.073 n=30+30) RegexpMatchMedium_1K-4 43.9µs ± 3% 43.8µs ± 3% ~ (p=0.186 n=30+30) RegexpMatchHard_32-4 2.24µs ± 4% 2.22µs ± 4% ~ (p=0.332 n=30+29) RegexpMatchHard_1K-4 68.0µs ± 4% 67.5µs ± 3% ~ (p=0.290 n=30+30) Revcomp-4 1.85s ± 3% 
1.85s ± 3% ~ (p=0.358 n=30+30) Template-4 69.6ms ± 3% 70.0ms ± 4% ~ (p=0.273 n=30+30) TimeParse-4 445ns ± 3% 441ns ± 3% ~ (p=0.494 n=30+30) TimeFormat-4 412ns ± 3% 412ns ± 6% ~ (p=0.841 n=30+30) [Geo mean] 66.7µs 66.8µs +0.13% name old speed new speed delta GobDecode-4 107MB/s ± 5% 106MB/s ± 5% ~ (p=0.615 n=30+30) GobEncode-4 111MB/s ± 6% 111MB/s ± 6% ~ (p=0.790 n=30+30) Gzip-4 48.8MB/s ± 6% 48.7MB/s ± 4% ~ (p=0.167 n=30+30) Gunzip-4 465MB/s ± 3% 465MB/s ± 3% ~ (p=0.420 n=30+30) JSONEncode-4 92.4MB/s ± 4% 93.7MB/s ± 5% +1.42% (p=0.015 n=30+30) JSONDecode-4 29.3MB/s ± 3% 28.8MB/s ± 1% -1.72% (p=0.003 n=30+24) GoParse-4 17.9MB/s ± 3% 17.7MB/s ± 4% -0.89% (p=0.037 n=30+30) RegexpMatchEasy0_32-4 317MB/s ± 8% 324MB/s ± 4% +2.14% (p=0.006 n=30+30) RegexpMatchEasy0_1K-4 1.21GB/s ± 4% 1.22GB/s ± 2% +0.77% (p=0.036 n=30+30) RegexpMatchEasy1_32-4 298MB/s ± 7% 299MB/s ± 4% ~ (p=0.511 n=30+30) RegexpMatchEasy1_1K-4 1.00GB/s ± 3% 1.00GB/s ± 4% ~ (p=0.304 n=28+30) RegexpMatchMedium_32-4 7.75MB/s ± 4% 7.82MB/s ± 4% ~ (p=0.089 n=30+30) RegexpMatchMedium_1K-4 23.3MB/s ± 3% 23.4MB/s ± 3% ~ (p=0.181 n=30+30) RegexpMatchHard_32-4 14.3MB/s ± 4% 14.4MB/s ± 4% ~ (p=0.320 n=30+29) RegexpMatchHard_1K-4 15.1MB/s ± 4% 15.2MB/s ± 3% ~ (p=0.273 n=30+30) Revcomp-4 137MB/s ± 3% 137MB/s ± 3% ~ (p=0.352 n=30+30) Template-4 27.9MB/s ± 3% 27.7MB/s ± 4% ~ (p=0.277 n=30+30) [Geo mean] 79.9MB/s 80.1MB/s +0.15% Change-Id: I97333cd8ddabb3c7c88ca5aa9e14a005b74d306d Reviewed-on: https://go-review.googlesource.com/120695 Run-TryBot: Ben Shi <powerman1st@163.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2018-06-24 07:04:21 +00:00
{name: "ADDLload", argLength: 3, reg: gp21load, asm: "ADDL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 + tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
{name: "SUBLload", argLength: 3, reg: gp21load, asm: "SUBL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 - tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
{name: "MULLload", argLength: 3, reg: gp21load, asm: "IMULL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 * tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
{name: "ANDLload", argLength: 3, reg: gp21load, asm: "ANDL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 & tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
{name: "ORLload", argLength: 3, reg: gp21load, asm: "ORL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 | tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
{name: "XORLload", argLength: 3, reg: gp21load, asm: "XORL", aux: "SymOff", resultInArg0: true, clobberFlags: true, faultOnNilArg1: true, symEffect: "Read"}, // arg0 ^ tmp, tmp loaded from arg1+auxint+aux, arg2 = mem
cmd/compile: optimize 386 binary operations with a memory operand Some integer/float binary operations of 386 can take a direct memory operand, which is more efficient than loading to a register. These CL does this optimization by copying the similar solution of amd64. And the go1 benchmark shows some inprovements, especially the test case Template. (excluding noise) name old time/op new time/op delta BinaryTree17-4 3.42s ± 2% 3.40s ± 2% ~ (p=0.069 n=38+39) Fannkuch11-4 3.48s ± 1% 3.53s ± 1% +1.59% (p=0.000 n=40+40) FmtFprintfEmpty-4 46.7ns ± 4% 46.3ns ± 3% -1.03% (p=0.001 n=40+40) FmtFprintfString-4 80.1ns ± 3% 80.6ns ± 3% +0.56% (p=0.029 n=40+40) FmtFprintfInt-4 92.4ns ± 2% 92.3ns ± 3% ~ (p=0.847 n=40+40) FmtFprintfIntInt-4 147ns ± 3% 144ns ± 3% -1.87% (p=0.000 n=40+40) FmtFprintfPrefixedInt-4 182ns ± 2% 184ns ± 3% +0.99% (p=0.002 n=40+40) FmtFprintfFloat-4 387ns ± 3% 384ns ± 3% ~ (p=0.069 n=40+40) FmtManyArgs-4 619ns ± 3% 616ns ± 3% ~ (p=0.320 n=40+40) GobDecode-4 7.28ms ± 6% 7.27ms ± 5% ~ (p=0.897 n=40+40) GobEncode-4 7.33ms ± 6% 7.21ms ± 6% -1.56% (p=0.022 n=38+40) Gzip-4 357ms ± 4% 357ms ± 4% ~ (p=0.071 n=40+40) Gunzip-4 45.3ms ± 3% 45.4ms ± 3% ~ (p=0.452 n=40+40) HTTPClientServer-4 63.0µs ± 2% 62.9µs ± 3% ~ (p=0.760 n=38+39) JSONEncode-4 22.0ms ± 4% 21.7ms ± 4% -1.49% (p=0.000 n=40+40) JSONDecode-4 67.7ms ± 4% 68.3ms ± 3% +0.86% (p=0.039 n=40+40) Mandelbrot200-4 5.16ms ± 3% 5.17ms ± 3% ~ (p=0.418 n=40+40) GoParse-4 3.30ms ± 2% 3.32ms ± 3% +0.55% (p=0.017 n=40+40) RegexpMatchEasy0_32-4 104ns ± 3% 104ns ± 4% ~ (p=0.992 n=40+40) RegexpMatchEasy0_1K-4 852ns ± 3% 851ns ± 2% ~ (p=0.344 n=40+40) RegexpMatchEasy1_32-4 113ns ± 4% 113ns ± 5% ~ (p=0.937 n=40+40) RegexpMatchEasy1_1K-4 1.03µs ± 5% 1.04µs ± 4% ~ (p=0.430 n=40+40) RegexpMatchMedium_32-4 132ns ± 4% 131ns ± 3% -1.06% (p=0.027 n=40+40) RegexpMatchMedium_1K-4 43.0µs ± 3% 43.2µs ± 3% ~ (p=0.122 n=40+40) RegexpMatchHard_32-4 2.21µs ± 4% 2.20µs ± 4% ~ (p=0.146 n=40+40) RegexpMatchHard_1K-4 67.1µs ± 4% 67.2µs ± 3% 
~ (p=0.859 n=40+40) Revcomp-4 1.85s ± 2% 1.85s ± 3% ~ (p=0.184 n=40+40) Template-4 70.1ms ± 4% 67.5ms ± 3% -3.65% (p=0.000 n=40+40) TimeParse-4 457ns ±16% 439ns ± 4% ~ (p=0.683 n=40+34) TimeFormat-4 413ns ± 3% 414ns ± 3% ~ (p=0.850 n=40+40) [Geo mean] 67.5µs 67.3µs -0.38% name old speed new speed delta GobDecode-4 105MB/s ± 6% 106MB/s ± 5% ~ (p=0.893 n=40+40) GobEncode-4 105MB/s ± 6% 107MB/s ± 7% +1.60% (p=0.023 n=38+40) Gzip-4 54.4MB/s ± 4% 54.5MB/s ± 4% ~ (p=0.073 n=40+40) Gunzip-4 429MB/s ± 3% 428MB/s ± 3% ~ (p=0.453 n=40+40) JSONEncode-4 88.3MB/s ± 5% 89.6MB/s ± 4% +1.51% (p=0.000 n=40+40) JSONDecode-4 28.7MB/s ± 4% 28.4MB/s ± 3% -0.87% (p=0.039 n=40+40) GoParse-4 17.6MB/s ± 3% 17.5MB/s ± 3% -0.55% (p=0.020 n=40+40) RegexpMatchEasy0_32-4 308MB/s ± 4% 308MB/s ± 5% ~ (p=0.988 n=40+40) RegexpMatchEasy0_1K-4 1.20GB/s ± 3% 1.20GB/s ± 2% ~ (p=0.329 n=40+40) RegexpMatchEasy1_32-4 283MB/s ± 4% 283MB/s ± 4% ~ (p=0.507 n=40+40) RegexpMatchEasy1_1K-4 991MB/s ± 5% 987MB/s ± 4% ~ (p=0.446 n=40+40) RegexpMatchMedium_32-4 7.54MB/s ± 4% 7.63MB/s ± 3% +1.26% (p=0.004 n=40+40) RegexpMatchMedium_1K-4 23.8MB/s ± 3% 23.7MB/s ± 4% ~ (p=0.121 n=40+40) RegexpMatchHard_32-4 14.5MB/s ± 4% 14.6MB/s ± 4% ~ (p=0.145 n=40+40) RegexpMatchHard_1K-4 15.3MB/s ± 4% 15.2MB/s ± 3% ~ (p=0.874 n=40+40) Revcomp-4 137MB/s ± 2% 137MB/s ± 3% ~ (p=0.179 n=40+40) Template-4 27.7MB/s ± 4% 28.7MB/s ± 3% +3.78% (p=0.000 n=40+40) [Geo mean] 78.9MB/s 79.2MB/s +0.38% Change-Id: I3ba688c253b665485c1ebdf5a75f4ce82cc3def3 Reviewed-on: https://go-review.googlesource.com/102036 Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Ilya Tocar <ilya.tocar@intel.com> Reviewed-by: Keith Randall <khr@golang.org>
2018-03-22 02:18:50 +00:00
cmd/compile: add indexed form for several 386 instructions This CL implements indexed memory operands for the following instructions. (ADD|SUB|MUL|AND|OR|XOR)Lload -> (ADD|SUB|MUL|AND|OR|XOR)Lloadidx4 (ADD|SUB|AND|OR|XOR)Lmodify -> (ADD|SUB|AND|OR|XOR)Lmodifyidx4 (ADD|AND|OR|XOR)Lconstmodify -> (ADD|AND|OR|XOR)Lconstmodifyidx4 1. The total size of pkg/linux_386/ decreases about 2.5KB, excluding cmd/compile/ . 2. There is little regression in the go1 benchmark test, excluding noise. name old time/op new time/op delta BinaryTree17-4 3.25s ± 3% 3.25s ± 3% ~ (p=0.218 n=40+40) Fannkuch11-4 3.53s ± 1% 3.53s ± 1% ~ (p=0.303 n=40+40) FmtFprintfEmpty-4 44.9ns ± 3% 45.6ns ± 3% +1.48% (p=0.030 n=40+36) FmtFprintfString-4 78.7ns ± 5% 80.1ns ± 7% ~ (p=0.217 n=36+40) FmtFprintfInt-4 90.2ns ± 6% 89.8ns ± 5% ~ (p=0.659 n=40+38) FmtFprintfIntInt-4 140ns ± 5% 141ns ± 5% +1.00% (p=0.027 n=40+40) FmtFprintfPrefixedInt-4 185ns ± 3% 183ns ± 3% ~ (p=0.104 n=40+40) FmtFprintfFloat-4 411ns ± 4% 406ns ± 3% -1.37% (p=0.005 n=40+40) FmtManyArgs-4 590ns ± 4% 598ns ± 4% +1.35% (p=0.008 n=40+40) GobDecode-4 7.16ms ± 5% 7.10ms ± 5% ~ (p=0.335 n=40+40) GobEncode-4 6.85ms ± 7% 6.74ms ± 9% ~ (p=0.058 n=38+40) Gzip-4 400ms ± 4% 399ms ± 2% -0.34% (p=0.003 n=40+33) Gunzip-4 41.4ms ± 3% 41.4ms ± 4% -0.12% (p=0.020 n=40+40) HTTPClientServer-4 64.1µs ± 4% 63.5µs ± 2% -1.07% (p=0.000 n=39+37) JSONEncode-4 15.9ms ± 2% 15.9ms ± 3% ~ (p=0.103 n=40+40) JSONDecode-4 62.2ms ± 4% 61.6ms ± 3% -0.98% (p=0.006 n=39+40) Mandelbrot200-4 5.18ms ± 3% 5.14ms ± 4% ~ (p=0.125 n=40+40) GoParse-4 3.29ms ± 2% 3.27ms ± 2% -0.66% (p=0.006 n=40+40) RegexpMatchEasy0_32-4 103ns ± 4% 103ns ± 4% ~ (p=0.632 n=40+40) RegexpMatchEasy0_1K-4 830ns ± 3% 828ns ± 3% ~ (p=0.563 n=40+40) RegexpMatchEasy1_32-4 113ns ± 4% 113ns ± 4% ~ (p=0.494 n=40+40) RegexpMatchEasy1_1K-4 1.03µs ± 4% 1.03µs ± 4% ~ (p=0.665 n=40+40) RegexpMatchMedium_32-4 130ns ± 4% 129ns ± 3% ~ (p=0.458 n=40+40) RegexpMatchMedium_1K-4 39.4µs ± 3% 39.7µs ± 3% ~ (p=0.825 
n=40+40) RegexpMatchHard_32-4 2.16µs ± 4% 2.15µs ± 4% ~ (p=0.137 n=40+40) RegexpMatchHard_1K-4 65.2µs ± 3% 65.4µs ± 4% ~ (p=0.160 n=40+40) Revcomp-4 1.87s ± 2% 1.87s ± 1% +0.17% (p=0.019 n=33+33) Template-4 69.4ms ± 3% 69.8ms ± 3% +0.60% (p=0.009 n=40+40) TimeParse-4 437ns ± 4% 438ns ± 4% ~ (p=0.234 n=40+40) TimeFormat-4 408ns ± 3% 408ns ± 3% ~ (p=0.904 n=40+40) [Geo mean] 65.7µs 65.6µs -0.08% name old speed new speed delta GobDecode-4 107MB/s ± 5% 108MB/s ± 5% ~ (p=0.336 n=40+40) GobEncode-4 112MB/s ± 6% 114MB/s ± 9% +1.95% (p=0.036 n=37+40) Gzip-4 48.5MB/s ± 4% 48.6MB/s ± 2% +0.28% (p=0.003 n=40+33) Gunzip-4 469MB/s ± 4% 469MB/s ± 4% +0.11% (p=0.021 n=40+40) JSONEncode-4 122MB/s ± 2% 122MB/s ± 3% ~ (p=0.105 n=40+40) JSONDecode-4 31.2MB/s ± 4% 31.5MB/s ± 4% +0.99% (p=0.007 n=39+40) GoParse-4 17.6MB/s ± 2% 17.7MB/s ± 2% +0.66% (p=0.007 n=40+40) RegexpMatchEasy0_32-4 310MB/s ± 4% 310MB/s ± 4% ~ (p=0.384 n=40+40) RegexpMatchEasy0_1K-4 1.23GB/s ± 3% 1.24GB/s ± 3% ~ (p=0.186 n=40+40) RegexpMatchEasy1_32-4 283MB/s ± 3% 281MB/s ± 4% ~ (p=0.855 n=40+40) RegexpMatchEasy1_1K-4 1.00GB/s ± 4% 1.00GB/s ± 4% ~ (p=0.665 n=40+40) RegexpMatchMedium_32-4 7.68MB/s ± 4% 7.73MB/s ± 3% ~ (p=0.359 n=40+40) RegexpMatchMedium_1K-4 26.0MB/s ± 3% 25.8MB/s ± 3% ~ (p=0.825 n=40+40) RegexpMatchHard_32-4 14.8MB/s ± 3% 14.9MB/s ± 4% ~ (p=0.136 n=40+40) RegexpMatchHard_1K-4 15.7MB/s ± 3% 15.7MB/s ± 4% ~ (p=0.150 n=40+40) Revcomp-4 136MB/s ± 1% 136MB/s ± 1% -0.09% (p=0.028 n=32+33) Template-4 28.0MB/s ± 3% 27.8MB/s ± 3% -0.59% (p=0.010 n=40+40) [Geo mean] 82.1MB/s 82.3MB/s +0.25% Change-Id: Ifa387a251056678326d3508aa02753b70bf7e5d0 Reviewed-on: https://go-review.googlesource.com/c/140303 Run-TryBot: Ben Shi <powerman1st@163.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2018-10-06 13:13:48 +00:00
// binary-op with an indexed memory source operand
{name: "ADDLloadidx4", argLength: 4, reg: gp21loadidx, asm: "ADDL", aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 + tmp, tmp loaded from arg1+arg2*4+auxint+aux, arg3 = mem
{name: "SUBLloadidx4", argLength: 4, reg: gp21loadidx, asm: "SUBL", aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 - tmp, tmp loaded from arg1+arg2*4+auxint+aux, arg3 = mem
{name: "MULLloadidx4", argLength: 4, reg: gp21loadidx, asm: "IMULL", aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 * tmp, tmp loaded from arg1+arg2*4+auxint+aux, arg3 = mem
{name: "ANDLloadidx4", argLength: 4, reg: gp21loadidx, asm: "ANDL", aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 & tmp, tmp loaded from arg1+arg2*4+auxint+aux, arg3 = mem
{name: "ORLloadidx4", argLength: 4, reg: gp21loadidx, asm: "ORL", aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 | tmp, tmp loaded from arg1+arg2*4+auxint+aux, arg3 = mem
{name: "XORLloadidx4", argLength: 4, reg: gp21loadidx, asm: "XORL", aux: "SymOff", resultInArg0: true, clobberFlags: true, symEffect: "Read"}, // arg0 ^ tmp, tmp loaded from arg1+arg2*4+auxint+aux, arg3 = mem
cmd/compile: add indexed form for several 386 instructions This CL implements indexed memory operands for the following instructions. (ADD|SUB|MUL|AND|OR|XOR)Lload -> (ADD|SUB|MUL|AND|OR|XOR)Lloadidx4 (ADD|SUB|AND|OR|XOR)Lmodify -> (ADD|SUB|AND|OR|XOR)Lmodifyidx4 (ADD|AND|OR|XOR)Lconstmodify -> (ADD|AND|OR|XOR)Lconstmodifyidx4 1. The total size of pkg/linux_386/ decreases about 2.5KB, excluding cmd/compile/ . 2. There is little regression in the go1 benchmark test, excluding noise. name old time/op new time/op delta BinaryTree17-4 3.25s ± 3% 3.25s ± 3% ~ (p=0.218 n=40+40) Fannkuch11-4 3.53s ± 1% 3.53s ± 1% ~ (p=0.303 n=40+40) FmtFprintfEmpty-4 44.9ns ± 3% 45.6ns ± 3% +1.48% (p=0.030 n=40+36) FmtFprintfString-4 78.7ns ± 5% 80.1ns ± 7% ~ (p=0.217 n=36+40) FmtFprintfInt-4 90.2ns ± 6% 89.8ns ± 5% ~ (p=0.659 n=40+38) FmtFprintfIntInt-4 140ns ± 5% 141ns ± 5% +1.00% (p=0.027 n=40+40) FmtFprintfPrefixedInt-4 185ns ± 3% 183ns ± 3% ~ (p=0.104 n=40+40) FmtFprintfFloat-4 411ns ± 4% 406ns ± 3% -1.37% (p=0.005 n=40+40) FmtManyArgs-4 590ns ± 4% 598ns ± 4% +1.35% (p=0.008 n=40+40) GobDecode-4 7.16ms ± 5% 7.10ms ± 5% ~ (p=0.335 n=40+40) GobEncode-4 6.85ms ± 7% 6.74ms ± 9% ~ (p=0.058 n=38+40) Gzip-4 400ms ± 4% 399ms ± 2% -0.34% (p=0.003 n=40+33) Gunzip-4 41.4ms ± 3% 41.4ms ± 4% -0.12% (p=0.020 n=40+40) HTTPClientServer-4 64.1µs ± 4% 63.5µs ± 2% -1.07% (p=0.000 n=39+37) JSONEncode-4 15.9ms ± 2% 15.9ms ± 3% ~ (p=0.103 n=40+40) JSONDecode-4 62.2ms ± 4% 61.6ms ± 3% -0.98% (p=0.006 n=39+40) Mandelbrot200-4 5.18ms ± 3% 5.14ms ± 4% ~ (p=0.125 n=40+40) GoParse-4 3.29ms ± 2% 3.27ms ± 2% -0.66% (p=0.006 n=40+40) RegexpMatchEasy0_32-4 103ns ± 4% 103ns ± 4% ~ (p=0.632 n=40+40) RegexpMatchEasy0_1K-4 830ns ± 3% 828ns ± 3% ~ (p=0.563 n=40+40) RegexpMatchEasy1_32-4 113ns ± 4% 113ns ± 4% ~ (p=0.494 n=40+40) RegexpMatchEasy1_1K-4 1.03µs ± 4% 1.03µs ± 4% ~ (p=0.665 n=40+40) RegexpMatchMedium_32-4 130ns ± 4% 129ns ± 3% ~ (p=0.458 n=40+40) RegexpMatchMedium_1K-4 39.4µs ± 3% 39.7µs ± 3% ~ (p=0.825 
n=40+40) RegexpMatchHard_32-4 2.16µs ± 4% 2.15µs ± 4% ~ (p=0.137 n=40+40) RegexpMatchHard_1K-4 65.2µs ± 3% 65.4µs ± 4% ~ (p=0.160 n=40+40) Revcomp-4 1.87s ± 2% 1.87s ± 1% +0.17% (p=0.019 n=33+33) Template-4 69.4ms ± 3% 69.8ms ± 3% +0.60% (p=0.009 n=40+40) TimeParse-4 437ns ± 4% 438ns ± 4% ~ (p=0.234 n=40+40) TimeFormat-4 408ns ± 3% 408ns ± 3% ~ (p=0.904 n=40+40) [Geo mean] 65.7µs 65.6µs -0.08% name old speed new speed delta GobDecode-4 107MB/s ± 5% 108MB/s ± 5% ~ (p=0.336 n=40+40) GobEncode-4 112MB/s ± 6% 114MB/s ± 9% +1.95% (p=0.036 n=37+40) Gzip-4 48.5MB/s ± 4% 48.6MB/s ± 2% +0.28% (p=0.003 n=40+33) Gunzip-4 469MB/s ± 4% 469MB/s ± 4% +0.11% (p=0.021 n=40+40) JSONEncode-4 122MB/s ± 2% 122MB/s ± 3% ~ (p=0.105 n=40+40) JSONDecode-4 31.2MB/s ± 4% 31.5MB/s ± 4% +0.99% (p=0.007 n=39+40) GoParse-4 17.6MB/s ± 2% 17.7MB/s ± 2% +0.66% (p=0.007 n=40+40) RegexpMatchEasy0_32-4 310MB/s ± 4% 310MB/s ± 4% ~ (p=0.384 n=40+40) RegexpMatchEasy0_1K-4 1.23GB/s ± 3% 1.24GB/s ± 3% ~ (p=0.186 n=40+40) RegexpMatchEasy1_32-4 283MB/s ± 3% 281MB/s ± 4% ~ (p=0.855 n=40+40) RegexpMatchEasy1_1K-4 1.00GB/s ± 4% 1.00GB/s ± 4% ~ (p=0.665 n=40+40) RegexpMatchMedium_32-4 7.68MB/s ± 4% 7.73MB/s ± 3% ~ (p=0.359 n=40+40) RegexpMatchMedium_1K-4 26.0MB/s ± 3% 25.8MB/s ± 3% ~ (p=0.825 n=40+40) RegexpMatchHard_32-4 14.8MB/s ± 3% 14.9MB/s ± 4% ~ (p=0.136 n=40+40) RegexpMatchHard_1K-4 15.7MB/s ± 3% 15.7MB/s ± 4% ~ (p=0.150 n=40+40) Revcomp-4 136MB/s ± 1% 136MB/s ± 1% -0.09% (p=0.028 n=32+33) Template-4 28.0MB/s ± 3% 27.8MB/s ± 3% -0.59% (p=0.010 n=40+40) [Geo mean] 82.1MB/s 82.3MB/s +0.25% Change-Id: Ifa387a251056678326d3508aa02753b70bf7e5d0 Reviewed-on: https://go-review.googlesource.com/c/140303 Run-TryBot: Ben Shi <powerman1st@163.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2018-10-06 13:13:48 +00:00
// unary ops
{name: "NEGL", argLength: 1, reg: gp11, asm: "NEGL", resultInArg0: true, clobberFlags: true}, // -arg0
{name: "NOTL", argLength: 1, reg: gp11, asm: "NOTL", resultInArg0: true}, // ^arg0
{name: "BSFL", argLength: 1, reg: gp11, asm: "BSFL", clobberFlags: true}, // arg0 # of low-order zeroes ; undef if zero
{name: "BSFW", argLength: 1, reg: gp11, asm: "BSFW", clobberFlags: true}, // arg0 # of low-order zeroes ; undef if zero
{name: "BSRL", argLength: 1, reg: gp11, asm: "BSRL", clobberFlags: true}, // index of the highest set bit in arg0 ; undef if zero
{name: "BSRW", argLength: 1, reg: gp11, asm: "BSRW", clobberFlags: true}, // index of the highest set bit in the low 16 bits of arg0 ; undef if zero
{name: "BSWAPL", argLength: 1, reg: gp11, asm: "BSWAPL", resultInArg0: true}, // arg0 swap bytes
{name: "SQRTSD", argLength: 1, reg: fp11, asm: "SQRTSD"}, // sqrt(arg0)
{name: "SQRTSS", argLength: 1, reg: fp11, asm: "SQRTSS"}, // sqrt(arg0), float32
{name: "SBBLcarrymask", argLength: 1, reg: flagsgp, asm: "SBBL"}, // (int32)(-1) if carry is set, 0 if carry is clear.
// Note: SBBW and SBBB are subsumed by SBBL
{name: "SETEQ", argLength: 1, reg: readflags, asm: "SETEQ"}, // extract == condition from arg0
{name: "SETNE", argLength: 1, reg: readflags, asm: "SETNE"}, // extract != condition from arg0
{name: "SETL", argLength: 1, reg: readflags, asm: "SETLT"}, // extract signed < condition from arg0
{name: "SETLE", argLength: 1, reg: readflags, asm: "SETLE"}, // extract signed <= condition from arg0
{name: "SETG", argLength: 1, reg: readflags, asm: "SETGT"}, // extract signed > condition from arg0
{name: "SETGE", argLength: 1, reg: readflags, asm: "SETGE"}, // extract signed >= condition from arg0
{name: "SETB", argLength: 1, reg: readflags, asm: "SETCS"}, // extract unsigned < condition from arg0
{name: "SETBE", argLength: 1, reg: readflags, asm: "SETLS"}, // extract unsigned <= condition from arg0
{name: "SETA", argLength: 1, reg: readflags, asm: "SETHI"}, // extract unsigned > condition from arg0
{name: "SETAE", argLength: 1, reg: readflags, asm: "SETCC"}, // extract unsigned >= condition from arg0
{name: "SETO", argLength: 1, reg: readflags, asm: "SETOS"}, // extract if overflow flag is set from arg0
// Need different opcodes for floating point conditions because
// any comparison involving a NaN is always FALSE (except !=, which
// is always TRUE) and thus the patterns for inverting conditions
// cannot be used.
{name: "SETEQF", argLength: 1, reg: flagsgpax, asm: "SETEQ", clobberFlags: true}, // extract == condition from arg0
{name: "SETNEF", argLength: 1, reg: flagsgpax, asm: "SETNE", clobberFlags: true}, // extract != condition from arg0
{name: "SETORD", argLength: 1, reg: flagsgp, asm: "SETPC"}, // extract "ordered" (no NaN present) condition from arg0
{name: "SETNAN", argLength: 1, reg: flagsgp, asm: "SETPS"}, // extract "unordered" (NaN present) condition from arg0
{name: "SETGF", argLength: 1, reg: flagsgp, asm: "SETHI"}, // extract floating > condition from arg0
{name: "SETGEF", argLength: 1, reg: flagsgp, asm: "SETCC"}, // extract floating >= condition from arg0
{name: "MOVBLSX", argLength: 1, reg: gp11, asm: "MOVBLSX"}, // sign extend arg0 from int8 to int32
{name: "MOVBLZX", argLength: 1, reg: gp11, asm: "MOVBLZX"}, // zero extend arg0 from int8 to int32
{name: "MOVWLSX", argLength: 1, reg: gp11, asm: "MOVWLSX"}, // sign extend arg0 from int16 to int32
{name: "MOVWLZX", argLength: 1, reg: gp11, asm: "MOVWLZX"}, // zero extend arg0 from int16 to int32
{name: "MOVLconst", reg: gp01, asm: "MOVL", typ: "UInt32", aux: "Int32", rematerializeable: true}, // 32 low bits of auxint
{name: "CVTTSD2SL", argLength: 1, reg: fpgp, asm: "CVTTSD2SL"}, // convert float64 to int32
{name: "CVTTSS2SL", argLength: 1, reg: fpgp, asm: "CVTTSS2SL"}, // convert float32 to int32
{name: "CVTSL2SS", argLength: 1, reg: gpfp, asm: "CVTSL2SS"}, // convert int32 to float32
{name: "CVTSL2SD", argLength: 1, reg: gpfp, asm: "CVTSL2SD"}, // convert int32 to float64
{name: "CVTSD2SS", argLength: 1, reg: fp11, asm: "CVTSD2SS"}, // convert float64 to float32
{name: "CVTSS2SD", argLength: 1, reg: fp11, asm: "CVTSS2SD"}, // convert float32 to float64
{name: "PXOR", argLength: 2, reg: fp21, asm: "PXOR", commutative: true, resultInArg0: true}, // exclusive or, applied to X regs for float negation.
{name: "LEAL", argLength: 1, reg: gp11sb, aux: "SymOff", rematerializeable: true, symEffect: "Addr"}, // arg0 + auxint + offset encoded in aux
cmd/compile: automatically handle commuting ops in rewrite rules Note that this is a redo of an undo of the original buggy CL 38666. We have lots of rewrite rules that vary only in the fact that we have 2 versions for the 2 different orderings of various commuting ops. For example: (ADDL x (MOVLconst [c])) -> (ADDLconst [c] x) (ADDL (MOVLconst [c]) x) -> (ADDLconst [c] x) It can get unwieldy quickly, especially when there is more than one commuting op in a rule. Our existing "fix" for this problem is to have rules that canonicalize the operations first. For example: (Eq64 x (Const64 <t> [c])) && x.Op != OpConst64 -> (Eq64 (Const64 <t> [c]) x) Subsequent rules can then assume if there is a constant arg to Eq64, it will be the first one. This fix kinda works, but it is fragile and only works when we remember to include the required extra rules. The fundamental problem is that the rule matcher doesn't know anything about commuting ops. This CL fixes that fact. We already have information about which ops commute. (The register allocator takes advantage of commutativity.) The rule generator now automatically generates multiple rules for a single source rule when there are commutative ops in the rule. We can now drop all of our almost-duplicate source-level rules and the canonicalization rules. I have some CLs in progress that will be a lot less verbose when the rule generator handles commutativity for me. I had to reorganize the load-combining rules a bit. The 8-way OR rules generated 128 different reorderings, which was causing the generator to put too much code in the rewrite*.go files (the big ones were going from 25K lines to 132K lines). Instead I reorganized the rules to combine pairs of loads at a time. The generated rule files are now actually a bit (5%) smaller. Make.bash times are ~unchanged. Compiler benchmarks are not observably different. Probably because we don't spend much compiler time in rule matching anyway. 
I've also done a pass over all of our ops adding commutative markings for ops which hadn't had them previously. Fixes #18292 Change-Id: Ic1c0e43fbf579539f459971625f69690c9ab8805 Reviewed-on: https://go-review.googlesource.com/38801 Run-TryBot: Keith Randall <khr@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: David Chase <drchase@google.com>
2017-03-30 03:30:22 +00:00
{name: "LEAL1", argLength: 2, reg: gp21sb, commutative: true, aux: "SymOff", symEffect: "Addr"}, // arg0 + arg1 + auxint + aux
{name: "LEAL2", argLength: 2, reg: gp21sb, aux: "SymOff", symEffect: "Addr"}, // arg0 + 2*arg1 + auxint + aux
{name: "LEAL4", argLength: 2, reg: gp21sb, aux: "SymOff", symEffect: "Addr"}, // arg0 + 4*arg1 + auxint + aux
{name: "LEAL8", argLength: 2, reg: gp21sb, aux: "SymOff", symEffect: "Addr"}, // arg0 + 8*arg1 + auxint + aux
// Note: LEAL{1,2,4,8} must not have OpSB as either argument.
// auxint+aux == add auxint and the offset of the symbol in aux (if any) to the effective address
{name: "MOVBload", argLength: 2, reg: gpload, asm: "MOVBLZX", aux: "SymOff", typ: "UInt8", faultOnNilArg0: true, symEffect: "Read"}, // load byte from arg0+auxint+aux. arg1=mem. Zero extend.
{name: "MOVBLSXload", argLength: 2, reg: gpload, asm: "MOVBLSX", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // ditto, sign extend to int32
{name: "MOVWload", argLength: 2, reg: gpload, asm: "MOVWLZX", aux: "SymOff", typ: "UInt16", faultOnNilArg0: true, symEffect: "Read"}, // load 2 bytes from arg0+auxint+aux. arg1=mem. Zero extend.
{name: "MOVWLSXload", argLength: 2, reg: gpload, asm: "MOVWLSX", aux: "SymOff", faultOnNilArg0: true, symEffect: "Read"}, // ditto, sign extend to int32
{name: "MOVLload", argLength: 2, reg: gpload, asm: "MOVL", aux: "SymOff", typ: "UInt32", faultOnNilArg0: true, symEffect: "Read"}, // load 4 bytes from arg0+auxint+aux. arg1=mem. Zero extend.
{name: "MOVBstore", argLength: 3, reg: gpstore, asm: "MOVB", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store byte in arg1 to arg0+auxint+aux. arg2=mem
{name: "MOVWstore", argLength: 3, reg: gpstore, asm: "MOVW", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes in arg1 to arg0+auxint+aux. arg2=mem
{name: "MOVLstore", argLength: 3, reg: gpstore, asm: "MOVL", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes in arg1 to arg0+auxint+aux. arg2=mem
cmd/compile: emit more compact 386 instructions ADDL/SUBL/ANDL/ORL/XORL can have a memory operand as destination, and this CL optimize the compiler to emit such instructions on 386 for more compact binary. Here is test report: 1. The total size of pkg/linux_386/ and pkg/tool/linux_386/ decreases about 14KB. (pkg/linux_386/cmd/compile/ and pkg/tool/linux_386/compile are excluded) 2. The go1 benchmark shows little change, excluding ±2% noise. name old time/op new time/op delta BinaryTree17-4 3.34s ± 2% 3.38s ± 2% +1.27% (p=0.000 n=40+39) Fannkuch11-4 3.55s ± 1% 3.51s ± 1% -1.33% (p=0.000 n=40+40) FmtFprintfEmpty-4 46.3ns ± 3% 46.9ns ± 4% +1.41% (p=0.002 n=40+40) FmtFprintfString-4 80.8ns ± 3% 80.4ns ± 6% -0.54% (p=0.044 n=40+40) FmtFprintfInt-4 93.0ns ± 3% 92.2ns ± 4% -0.88% (p=0.007 n=39+40) FmtFprintfIntInt-4 144ns ± 5% 145ns ± 2% +0.78% (p=0.015 n=40+40) FmtFprintfPrefixedInt-4 184ns ± 2% 182ns ± 2% -1.06% (p=0.004 n=40+40) FmtFprintfFloat-4 415ns ± 4% 419ns ± 4% ~ (p=0.434 n=40+40) FmtManyArgs-4 615ns ± 3% 619ns ± 3% ~ (p=0.100 n=40+40) GobDecode-4 7.30ms ± 6% 7.36ms ± 6% ~ (p=0.074 n=40+40) GobEncode-4 7.10ms ± 6% 7.21ms ± 5% ~ (p=0.082 n=40+39) Gzip-4 364ms ± 3% 362ms ± 6% -0.71% (p=0.020 n=40+40) Gunzip-4 42.4ms ± 3% 42.2ms ± 3% ~ (p=0.303 n=40+40) HTTPClientServer-4 62.9µs ± 1% 62.9µs ± 1% ~ (p=0.768 n=38+39) JSONEncode-4 21.4ms ± 4% 21.5ms ± 5% ~ (p=0.210 n=40+40) JSONDecode-4 67.7ms ± 3% 67.9ms ± 4% ~ (p=0.713 n=40+40) Mandelbrot200-4 5.18ms ± 3% 5.21ms ± 3% +0.59% (p=0.021 n=40+40) GoParse-4 3.35ms ± 3% 3.34ms ± 2% ~ (p=0.996 n=40+40) RegexpMatchEasy0_32-4 98.5ns ± 5% 96.3ns ± 4% -2.15% (p=0.001 n=40+40) RegexpMatchEasy0_1K-4 851ns ± 4% 850ns ± 5% ~ (p=0.700 n=40+40) RegexpMatchEasy1_32-4 105ns ± 7% 107ns ± 4% +1.50% (p=0.017 n=40+40) RegexpMatchEasy1_1K-4 1.03µs ± 5% 1.03µs ± 4% ~ (p=0.992 n=40+40) RegexpMatchMedium_32-4 130ns ± 6% 128ns ± 4% -1.66% (p=0.012 n=40+40) RegexpMatchMedium_1K-4 44.0µs ± 5% 43.6µs ± 3% ~ (p=0.704 n=40+40) RegexpMatchHard_32-4 
2.29µs ± 3% 2.23µs ± 4% -2.38% (p=0.000 n=40+40) RegexpMatchHard_1K-4 69.0µs ± 3% 68.1µs ± 3% -1.28% (p=0.003 n=40+40) Revcomp-4 1.85s ± 2% 1.87s ± 3% +1.11% (p=0.000 n=40+40) Template-4 69.8ms ± 3% 69.6ms ± 3% ~ (p=0.125 n=40+40) TimeParse-4 442ns ± 5% 440ns ± 3% ~ (p=0.585 n=40+40) TimeFormat-4 419ns ± 3% 420ns ± 3% ~ (p=0.824 n=40+40) [Geo mean] 67.3µs 67.2µs -0.11% name old speed new speed delta GobDecode-4 105MB/s ± 6% 104MB/s ± 6% ~ (p=0.074 n=40+40) GobEncode-4 108MB/s ± 7% 107MB/s ± 5% ~ (p=0.080 n=40+39) Gzip-4 53.3MB/s ± 3% 53.7MB/s ± 6% +0.73% (p=0.021 n=40+40) Gunzip-4 458MB/s ± 3% 460MB/s ± 3% ~ (p=0.301 n=40+40) JSONEncode-4 90.8MB/s ± 4% 90.3MB/s ± 4% ~ (p=0.213 n=40+40) JSONDecode-4 28.7MB/s ± 3% 28.6MB/s ± 4% ~ (p=0.679 n=40+40) GoParse-4 17.3MB/s ± 3% 17.3MB/s ± 2% ~ (p=1.000 n=40+40) RegexpMatchEasy0_32-4 325MB/s ± 5% 333MB/s ± 4% +2.44% (p=0.000 n=40+38) RegexpMatchEasy0_1K-4 1.20GB/s ± 4% 1.21GB/s ± 5% ~ (p=0.684 n=40+40) RegexpMatchEasy1_32-4 303MB/s ± 7% 298MB/s ± 4% -1.52% (p=0.022 n=40+40) RegexpMatchEasy1_1K-4 995MB/s ± 5% 996MB/s ± 4% ~ (p=0.996 n=40+40) RegexpMatchMedium_32-4 7.67MB/s ± 6% 7.80MB/s ± 4% +1.68% (p=0.011 n=40+40) RegexpMatchMedium_1K-4 23.3MB/s ± 5% 23.5MB/s ± 3% ~ (p=0.697 n=40+40) RegexpMatchHard_32-4 14.0MB/s ± 3% 14.3MB/s ± 4% +2.43% (p=0.000 n=40+40) RegexpMatchHard_1K-4 14.8MB/s ± 3% 15.0MB/s ± 3% +1.30% (p=0.003 n=40+40) Revcomp-4 137MB/s ± 2% 136MB/s ± 3% -1.10% (p=0.000 n=40+40) Template-4 27.8MB/s ± 3% 27.9MB/s ± 3% ~ (p=0.128 n=40+40) [Geo mean] 79.6MB/s 79.9MB/s +0.28% Change-Id: I02a3efc125dc81e18fc8495eb2bf1bba59ab8733 Reviewed-on: https://go-review.googlesource.com/110157 Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org> Reviewed-by: Ilya Tocar <ilya.tocar@intel.com>
2018-04-29 10:42:14 +00:00
// direct binary-op on memory (read-modify-write)
{name: "ADDLmodify", argLength: 3, reg: gpstore, asm: "ADDL", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) += arg1, arg2=mem
{name: "SUBLmodify", argLength: 3, reg: gpstore, asm: "SUBL", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) -= arg1, arg2=mem
{name: "ANDLmodify", argLength: 3, reg: gpstore, asm: "ANDL", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) &= arg1, arg2=mem
{name: "ORLmodify", argLength: 3, reg: gpstore, asm: "ORL", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) |= arg1, arg2=mem
{name: "XORLmodify", argLength: 3, reg: gpstore, asm: "XORL", aux: "SymOff", typ: "Mem", faultOnNilArg0: true, clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+auxint+aux) ^= arg1, arg2=mem
cmd/compile: emit more compact 386 instructions ADDL/SUBL/ANDL/ORL/XORL can have a memory operand as destination, and this CL optimize the compiler to emit such instructions on 386 for more compact binary. Here is test report: 1. The total size of pkg/linux_386/ and pkg/tool/linux_386/ decreases about 14KB. (pkg/linux_386/cmd/compile/ and pkg/tool/linux_386/compile are excluded) 2. The go1 benchmark shows little change, excluding ±2% noise. name old time/op new time/op delta BinaryTree17-4 3.34s ± 2% 3.38s ± 2% +1.27% (p=0.000 n=40+39) Fannkuch11-4 3.55s ± 1% 3.51s ± 1% -1.33% (p=0.000 n=40+40) FmtFprintfEmpty-4 46.3ns ± 3% 46.9ns ± 4% +1.41% (p=0.002 n=40+40) FmtFprintfString-4 80.8ns ± 3% 80.4ns ± 6% -0.54% (p=0.044 n=40+40) FmtFprintfInt-4 93.0ns ± 3% 92.2ns ± 4% -0.88% (p=0.007 n=39+40) FmtFprintfIntInt-4 144ns ± 5% 145ns ± 2% +0.78% (p=0.015 n=40+40) FmtFprintfPrefixedInt-4 184ns ± 2% 182ns ± 2% -1.06% (p=0.004 n=40+40) FmtFprintfFloat-4 415ns ± 4% 419ns ± 4% ~ (p=0.434 n=40+40) FmtManyArgs-4 615ns ± 3% 619ns ± 3% ~ (p=0.100 n=40+40) GobDecode-4 7.30ms ± 6% 7.36ms ± 6% ~ (p=0.074 n=40+40) GobEncode-4 7.10ms ± 6% 7.21ms ± 5% ~ (p=0.082 n=40+39) Gzip-4 364ms ± 3% 362ms ± 6% -0.71% (p=0.020 n=40+40) Gunzip-4 42.4ms ± 3% 42.2ms ± 3% ~ (p=0.303 n=40+40) HTTPClientServer-4 62.9µs ± 1% 62.9µs ± 1% ~ (p=0.768 n=38+39) JSONEncode-4 21.4ms ± 4% 21.5ms ± 5% ~ (p=0.210 n=40+40) JSONDecode-4 67.7ms ± 3% 67.9ms ± 4% ~ (p=0.713 n=40+40) Mandelbrot200-4 5.18ms ± 3% 5.21ms ± 3% +0.59% (p=0.021 n=40+40) GoParse-4 3.35ms ± 3% 3.34ms ± 2% ~ (p=0.996 n=40+40) RegexpMatchEasy0_32-4 98.5ns ± 5% 96.3ns ± 4% -2.15% (p=0.001 n=40+40) RegexpMatchEasy0_1K-4 851ns ± 4% 850ns ± 5% ~ (p=0.700 n=40+40) RegexpMatchEasy1_32-4 105ns ± 7% 107ns ± 4% +1.50% (p=0.017 n=40+40) RegexpMatchEasy1_1K-4 1.03µs ± 5% 1.03µs ± 4% ~ (p=0.992 n=40+40) RegexpMatchMedium_32-4 130ns ± 6% 128ns ± 4% -1.66% (p=0.012 n=40+40) RegexpMatchMedium_1K-4 44.0µs ± 5% 43.6µs ± 3% ~ (p=0.704 n=40+40) RegexpMatchHard_32-4 
2.29µs ± 3% 2.23µs ± 4% -2.38% (p=0.000 n=40+40) RegexpMatchHard_1K-4 69.0µs ± 3% 68.1µs ± 3% -1.28% (p=0.003 n=40+40) Revcomp-4 1.85s ± 2% 1.87s ± 3% +1.11% (p=0.000 n=40+40) Template-4 69.8ms ± 3% 69.6ms ± 3% ~ (p=0.125 n=40+40) TimeParse-4 442ns ± 5% 440ns ± 3% ~ (p=0.585 n=40+40) TimeFormat-4 419ns ± 3% 420ns ± 3% ~ (p=0.824 n=40+40) [Geo mean] 67.3µs 67.2µs -0.11% name old speed new speed delta GobDecode-4 105MB/s ± 6% 104MB/s ± 6% ~ (p=0.074 n=40+40) GobEncode-4 108MB/s ± 7% 107MB/s ± 5% ~ (p=0.080 n=40+39) Gzip-4 53.3MB/s ± 3% 53.7MB/s ± 6% +0.73% (p=0.021 n=40+40) Gunzip-4 458MB/s ± 3% 460MB/s ± 3% ~ (p=0.301 n=40+40) JSONEncode-4 90.8MB/s ± 4% 90.3MB/s ± 4% ~ (p=0.213 n=40+40) JSONDecode-4 28.7MB/s ± 3% 28.6MB/s ± 4% ~ (p=0.679 n=40+40) GoParse-4 17.3MB/s ± 3% 17.3MB/s ± 2% ~ (p=1.000 n=40+40) RegexpMatchEasy0_32-4 325MB/s ± 5% 333MB/s ± 4% +2.44% (p=0.000 n=40+38) RegexpMatchEasy0_1K-4 1.20GB/s ± 4% 1.21GB/s ± 5% ~ (p=0.684 n=40+40) RegexpMatchEasy1_32-4 303MB/s ± 7% 298MB/s ± 4% -1.52% (p=0.022 n=40+40) RegexpMatchEasy1_1K-4 995MB/s ± 5% 996MB/s ± 4% ~ (p=0.996 n=40+40) RegexpMatchMedium_32-4 7.67MB/s ± 6% 7.80MB/s ± 4% +1.68% (p=0.011 n=40+40) RegexpMatchMedium_1K-4 23.3MB/s ± 5% 23.5MB/s ± 3% ~ (p=0.697 n=40+40) RegexpMatchHard_32-4 14.0MB/s ± 3% 14.3MB/s ± 4% +2.43% (p=0.000 n=40+40) RegexpMatchHard_1K-4 14.8MB/s ± 3% 15.0MB/s ± 3% +1.30% (p=0.003 n=40+40) Revcomp-4 137MB/s ± 2% 136MB/s ± 3% -1.10% (p=0.000 n=40+40) Template-4 27.8MB/s ± 3% 27.9MB/s ± 3% ~ (p=0.128 n=40+40) [Geo mean] 79.6MB/s 79.9MB/s +0.28% Change-Id: I02a3efc125dc81e18fc8495eb2bf1bba59ab8733 Reviewed-on: https://go-review.googlesource.com/110157 Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org> Reviewed-by: Ilya Tocar <ilya.tocar@intel.com>
2018-04-29 10:42:14 +00:00
cmd/compile: add indexed form for several 386 instructions This CL implements indexed memory operands for the following instructions. (ADD|SUB|MUL|AND|OR|XOR)Lload -> (ADD|SUB|MUL|AND|OR|XOR)Lloadidx4 (ADD|SUB|AND|OR|XOR)Lmodify -> (ADD|SUB|AND|OR|XOR)Lmodifyidx4 (ADD|AND|OR|XOR)Lconstmodify -> (ADD|AND|OR|XOR)Lconstmodifyidx4 1. The total size of pkg/linux_386/ decreases about 2.5KB, excluding cmd/compile/ . 2. There is little regression in the go1 benchmark test, excluding noise. name old time/op new time/op delta BinaryTree17-4 3.25s ± 3% 3.25s ± 3% ~ (p=0.218 n=40+40) Fannkuch11-4 3.53s ± 1% 3.53s ± 1% ~ (p=0.303 n=40+40) FmtFprintfEmpty-4 44.9ns ± 3% 45.6ns ± 3% +1.48% (p=0.030 n=40+36) FmtFprintfString-4 78.7ns ± 5% 80.1ns ± 7% ~ (p=0.217 n=36+40) FmtFprintfInt-4 90.2ns ± 6% 89.8ns ± 5% ~ (p=0.659 n=40+38) FmtFprintfIntInt-4 140ns ± 5% 141ns ± 5% +1.00% (p=0.027 n=40+40) FmtFprintfPrefixedInt-4 185ns ± 3% 183ns ± 3% ~ (p=0.104 n=40+40) FmtFprintfFloat-4 411ns ± 4% 406ns ± 3% -1.37% (p=0.005 n=40+40) FmtManyArgs-4 590ns ± 4% 598ns ± 4% +1.35% (p=0.008 n=40+40) GobDecode-4 7.16ms ± 5% 7.10ms ± 5% ~ (p=0.335 n=40+40) GobEncode-4 6.85ms ± 7% 6.74ms ± 9% ~ (p=0.058 n=38+40) Gzip-4 400ms ± 4% 399ms ± 2% -0.34% (p=0.003 n=40+33) Gunzip-4 41.4ms ± 3% 41.4ms ± 4% -0.12% (p=0.020 n=40+40) HTTPClientServer-4 64.1µs ± 4% 63.5µs ± 2% -1.07% (p=0.000 n=39+37) JSONEncode-4 15.9ms ± 2% 15.9ms ± 3% ~ (p=0.103 n=40+40) JSONDecode-4 62.2ms ± 4% 61.6ms ± 3% -0.98% (p=0.006 n=39+40) Mandelbrot200-4 5.18ms ± 3% 5.14ms ± 4% ~ (p=0.125 n=40+40) GoParse-4 3.29ms ± 2% 3.27ms ± 2% -0.66% (p=0.006 n=40+40) RegexpMatchEasy0_32-4 103ns ± 4% 103ns ± 4% ~ (p=0.632 n=40+40) RegexpMatchEasy0_1K-4 830ns ± 3% 828ns ± 3% ~ (p=0.563 n=40+40) RegexpMatchEasy1_32-4 113ns ± 4% 113ns ± 4% ~ (p=0.494 n=40+40) RegexpMatchEasy1_1K-4 1.03µs ± 4% 1.03µs ± 4% ~ (p=0.665 n=40+40) RegexpMatchMedium_32-4 130ns ± 4% 129ns ± 3% ~ (p=0.458 n=40+40) RegexpMatchMedium_1K-4 39.4µs ± 3% 39.7µs ± 3% ~ (p=0.825 
n=40+40) RegexpMatchHard_32-4 2.16µs ± 4% 2.15µs ± 4% ~ (p=0.137 n=40+40) RegexpMatchHard_1K-4 65.2µs ± 3% 65.4µs ± 4% ~ (p=0.160 n=40+40) Revcomp-4 1.87s ± 2% 1.87s ± 1% +0.17% (p=0.019 n=33+33) Template-4 69.4ms ± 3% 69.8ms ± 3% +0.60% (p=0.009 n=40+40) TimeParse-4 437ns ± 4% 438ns ± 4% ~ (p=0.234 n=40+40) TimeFormat-4 408ns ± 3% 408ns ± 3% ~ (p=0.904 n=40+40) [Geo mean] 65.7µs 65.6µs -0.08% name old speed new speed delta GobDecode-4 107MB/s ± 5% 108MB/s ± 5% ~ (p=0.336 n=40+40) GobEncode-4 112MB/s ± 6% 114MB/s ± 9% +1.95% (p=0.036 n=37+40) Gzip-4 48.5MB/s ± 4% 48.6MB/s ± 2% +0.28% (p=0.003 n=40+33) Gunzip-4 469MB/s ± 4% 469MB/s ± 4% +0.11% (p=0.021 n=40+40) JSONEncode-4 122MB/s ± 2% 122MB/s ± 3% ~ (p=0.105 n=40+40) JSONDecode-4 31.2MB/s ± 4% 31.5MB/s ± 4% +0.99% (p=0.007 n=39+40) GoParse-4 17.6MB/s ± 2% 17.7MB/s ± 2% +0.66% (p=0.007 n=40+40) RegexpMatchEasy0_32-4 310MB/s ± 4% 310MB/s ± 4% ~ (p=0.384 n=40+40) RegexpMatchEasy0_1K-4 1.23GB/s ± 3% 1.24GB/s ± 3% ~ (p=0.186 n=40+40) RegexpMatchEasy1_32-4 283MB/s ± 3% 281MB/s ± 4% ~ (p=0.855 n=40+40) RegexpMatchEasy1_1K-4 1.00GB/s ± 4% 1.00GB/s ± 4% ~ (p=0.665 n=40+40) RegexpMatchMedium_32-4 7.68MB/s ± 4% 7.73MB/s ± 3% ~ (p=0.359 n=40+40) RegexpMatchMedium_1K-4 26.0MB/s ± 3% 25.8MB/s ± 3% ~ (p=0.825 n=40+40) RegexpMatchHard_32-4 14.8MB/s ± 3% 14.9MB/s ± 4% ~ (p=0.136 n=40+40) RegexpMatchHard_1K-4 15.7MB/s ± 3% 15.7MB/s ± 4% ~ (p=0.150 n=40+40) Revcomp-4 136MB/s ± 1% 136MB/s ± 1% -0.09% (p=0.028 n=32+33) Template-4 28.0MB/s ± 3% 27.8MB/s ± 3% -0.59% (p=0.010 n=40+40) [Geo mean] 82.1MB/s 82.3MB/s +0.25% Change-Id: Ifa387a251056678326d3508aa02753b70bf7e5d0 Reviewed-on: https://go-review.googlesource.com/c/140303 Run-TryBot: Ben Shi <powerman1st@163.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2018-10-06 13:13:48 +00:00
// direct binary-op on indexed memory (read-modify-write)
{name: "ADDLmodifyidx4", argLength: 4, reg: gpstoreidx, asm: "ADDL", aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+arg1*4+auxint+aux) += arg2, arg3=mem
{name: "SUBLmodifyidx4", argLength: 4, reg: gpstoreidx, asm: "SUBL", aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+arg1*4+auxint+aux) -= arg2, arg3=mem
{name: "ANDLmodifyidx4", argLength: 4, reg: gpstoreidx, asm: "ANDL", aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+arg1*4+auxint+aux) &= arg2, arg3=mem
{name: "ORLmodifyidx4", argLength: 4, reg: gpstoreidx, asm: "ORL", aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+arg1*4+auxint+aux) |= arg2, arg3=mem
{name: "XORLmodifyidx4", argLength: 4, reg: gpstoreidx, asm: "XORL", aux: "SymOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // *(arg0+arg1*4+auxint+aux) ^= arg2, arg3=mem
cmd/compile: add indexed form for several 386 instructions This CL implements indexed memory operands for the following instructions. (ADD|SUB|MUL|AND|OR|XOR)Lload -> (ADD|SUB|MUL|AND|OR|XOR)Lloadidx4 (ADD|SUB|AND|OR|XOR)Lmodify -> (ADD|SUB|AND|OR|XOR)Lmodifyidx4 (ADD|AND|OR|XOR)Lconstmodify -> (ADD|AND|OR|XOR)Lconstmodifyidx4 1. The total size of pkg/linux_386/ decreases about 2.5KB, excluding cmd/compile/ . 2. There is little regression in the go1 benchmark test, excluding noise. name old time/op new time/op delta BinaryTree17-4 3.25s ± 3% 3.25s ± 3% ~ (p=0.218 n=40+40) Fannkuch11-4 3.53s ± 1% 3.53s ± 1% ~ (p=0.303 n=40+40) FmtFprintfEmpty-4 44.9ns ± 3% 45.6ns ± 3% +1.48% (p=0.030 n=40+36) FmtFprintfString-4 78.7ns ± 5% 80.1ns ± 7% ~ (p=0.217 n=36+40) FmtFprintfInt-4 90.2ns ± 6% 89.8ns ± 5% ~ (p=0.659 n=40+38) FmtFprintfIntInt-4 140ns ± 5% 141ns ± 5% +1.00% (p=0.027 n=40+40) FmtFprintfPrefixedInt-4 185ns ± 3% 183ns ± 3% ~ (p=0.104 n=40+40) FmtFprintfFloat-4 411ns ± 4% 406ns ± 3% -1.37% (p=0.005 n=40+40) FmtManyArgs-4 590ns ± 4% 598ns ± 4% +1.35% (p=0.008 n=40+40) GobDecode-4 7.16ms ± 5% 7.10ms ± 5% ~ (p=0.335 n=40+40) GobEncode-4 6.85ms ± 7% 6.74ms ± 9% ~ (p=0.058 n=38+40) Gzip-4 400ms ± 4% 399ms ± 2% -0.34% (p=0.003 n=40+33) Gunzip-4 41.4ms ± 3% 41.4ms ± 4% -0.12% (p=0.020 n=40+40) HTTPClientServer-4 64.1µs ± 4% 63.5µs ± 2% -1.07% (p=0.000 n=39+37) JSONEncode-4 15.9ms ± 2% 15.9ms ± 3% ~ (p=0.103 n=40+40) JSONDecode-4 62.2ms ± 4% 61.6ms ± 3% -0.98% (p=0.006 n=39+40) Mandelbrot200-4 5.18ms ± 3% 5.14ms ± 4% ~ (p=0.125 n=40+40) GoParse-4 3.29ms ± 2% 3.27ms ± 2% -0.66% (p=0.006 n=40+40) RegexpMatchEasy0_32-4 103ns ± 4% 103ns ± 4% ~ (p=0.632 n=40+40) RegexpMatchEasy0_1K-4 830ns ± 3% 828ns ± 3% ~ (p=0.563 n=40+40) RegexpMatchEasy1_32-4 113ns ± 4% 113ns ± 4% ~ (p=0.494 n=40+40) RegexpMatchEasy1_1K-4 1.03µs ± 4% 1.03µs ± 4% ~ (p=0.665 n=40+40) RegexpMatchMedium_32-4 130ns ± 4% 129ns ± 3% ~ (p=0.458 n=40+40) RegexpMatchMedium_1K-4 39.4µs ± 3% 39.7µs ± 3% ~ (p=0.825 
n=40+40) RegexpMatchHard_32-4 2.16µs ± 4% 2.15µs ± 4% ~ (p=0.137 n=40+40) RegexpMatchHard_1K-4 65.2µs ± 3% 65.4µs ± 4% ~ (p=0.160 n=40+40) Revcomp-4 1.87s ± 2% 1.87s ± 1% +0.17% (p=0.019 n=33+33) Template-4 69.4ms ± 3% 69.8ms ± 3% +0.60% (p=0.009 n=40+40) TimeParse-4 437ns ± 4% 438ns ± 4% ~ (p=0.234 n=40+40) TimeFormat-4 408ns ± 3% 408ns ± 3% ~ (p=0.904 n=40+40) [Geo mean] 65.7µs 65.6µs -0.08% name old speed new speed delta GobDecode-4 107MB/s ± 5% 108MB/s ± 5% ~ (p=0.336 n=40+40) GobEncode-4 112MB/s ± 6% 114MB/s ± 9% +1.95% (p=0.036 n=37+40) Gzip-4 48.5MB/s ± 4% 48.6MB/s ± 2% +0.28% (p=0.003 n=40+33) Gunzip-4 469MB/s ± 4% 469MB/s ± 4% +0.11% (p=0.021 n=40+40) JSONEncode-4 122MB/s ± 2% 122MB/s ± 3% ~ (p=0.105 n=40+40) JSONDecode-4 31.2MB/s ± 4% 31.5MB/s ± 4% +0.99% (p=0.007 n=39+40) GoParse-4 17.6MB/s ± 2% 17.7MB/s ± 2% +0.66% (p=0.007 n=40+40) RegexpMatchEasy0_32-4 310MB/s ± 4% 310MB/s ± 4% ~ (p=0.384 n=40+40) RegexpMatchEasy0_1K-4 1.23GB/s ± 3% 1.24GB/s ± 3% ~ (p=0.186 n=40+40) RegexpMatchEasy1_32-4 283MB/s ± 3% 281MB/s ± 4% ~ (p=0.855 n=40+40) RegexpMatchEasy1_1K-4 1.00GB/s ± 4% 1.00GB/s ± 4% ~ (p=0.665 n=40+40) RegexpMatchMedium_32-4 7.68MB/s ± 4% 7.73MB/s ± 3% ~ (p=0.359 n=40+40) RegexpMatchMedium_1K-4 26.0MB/s ± 3% 25.8MB/s ± 3% ~ (p=0.825 n=40+40) RegexpMatchHard_32-4 14.8MB/s ± 3% 14.9MB/s ± 4% ~ (p=0.136 n=40+40) RegexpMatchHard_1K-4 15.7MB/s ± 3% 15.7MB/s ± 4% ~ (p=0.150 n=40+40) Revcomp-4 136MB/s ± 1% 136MB/s ± 1% -0.09% (p=0.028 n=32+33) Template-4 28.0MB/s ± 3% 27.8MB/s ± 3% -0.59% (p=0.010 n=40+40) [Geo mean] 82.1MB/s 82.3MB/s +0.25% Change-Id: Ifa387a251056678326d3508aa02753b70bf7e5d0 Reviewed-on: https://go-review.googlesource.com/c/140303 Run-TryBot: Ben Shi <powerman1st@163.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2018-10-06 13:13:48 +00:00
cmd/compile: implement "OPC $imm, (mem)" for 386 New read-modify-write operations are introduced in this CL for 386. 1. The total size of pkg/linux_386 decreases about 10KB (excluding cmd/compile). 2. The go1 benchmark shows little regression. name old time/op new time/op delta BinaryTree17-4 3.32s ± 4% 3.29s ± 2% ~ (p=0.059 n=30+30) Fannkuch11-4 3.49s ± 1% 3.46s ± 1% -0.92% (p=0.001 n=30+30) FmtFprintfEmpty-4 47.7ns ± 2% 46.8ns ± 5% -1.93% (p=0.011 n=25+30) FmtFprintfString-4 79.5ns ± 7% 80.2ns ± 3% +0.89% (p=0.001 n=28+29) FmtFprintfInt-4 90.5ns ± 2% 92.1ns ± 2% +1.82% (p=0.014 n=22+30) FmtFprintfIntInt-4 141ns ± 1% 144ns ± 3% +2.23% (p=0.013 n=22+30) FmtFprintfPrefixedInt-4 183ns ± 2% 184ns ± 3% ~ (p=0.080 n=21+30) FmtFprintfFloat-4 409ns ± 3% 412ns ± 3% +0.83% (p=0.040 n=30+30) FmtManyArgs-4 597ns ± 6% 607ns ± 4% +1.71% (p=0.006 n=30+30) GobDecode-4 7.21ms ± 5% 7.18ms ± 6% ~ (p=0.665 n=30+30) GobEncode-4 7.17ms ± 6% 7.09ms ± 7% ~ (p=0.117 n=29+30) Gzip-4 413ms ± 4% 399ms ± 4% -3.48% (p=0.000 n=30+30) Gunzip-4 41.3ms ± 4% 41.7ms ± 3% +1.05% (p=0.011 n=30+30) HTTPClientServer-4 63.5µs ± 3% 62.9µs ± 2% -0.97% (p=0.017 n=30+27) JSONEncode-4 20.3ms ± 5% 20.1ms ± 5% -1.16% (p=0.004 n=30+30) JSONDecode-4 66.2ms ± 4% 67.7ms ± 4% +2.21% (p=0.000 n=30+30) Mandelbrot200-4 5.16ms ± 3% 5.18ms ± 3% ~ (p=0.123 n=30+30) GoParse-4 3.23ms ± 2% 3.27ms ± 2% +1.08% (p=0.006 n=30+30) RegexpMatchEasy0_32-4 98.9ns ± 5% 97.1ns ± 4% -1.83% (p=0.006 n=30+30) RegexpMatchEasy0_1K-4 842ns ± 3% 842ns ± 3% ~ (p=0.550 n=30+30) RegexpMatchEasy1_32-4 107ns ± 4% 105ns ± 4% -1.93% (p=0.012 n=30+30) RegexpMatchEasy1_1K-4 1.03µs ± 4% 1.04µs ± 4% ~ (p=0.304 n=30+30) RegexpMatchMedium_32-4 132ns ± 2% 129ns ± 4% -2.02% (p=0.000 n=21+30) RegexpMatchMedium_1K-4 44.1µs ± 4% 43.8µs ± 3% ~ (p=0.641 n=30+30) RegexpMatchHard_32-4 2.26µs ± 4% 2.23µs ± 4% -1.28% (p=0.023 n=30+30) RegexpMatchHard_1K-4 68.1µs ± 3% 68.6µs ± 4% ~ (p=0.089 n=30+30) Revcomp-4 1.85s ± 2% 1.84s ± 2% ~ (p=0.072 n=30+30) Template-4 
69.2ms ± 3% 68.5ms ± 3% -1.04% (p=0.012 n=30+30) TimeParse-4 441ns ± 3% 446ns ± 4% +1.21% (p=0.001 n=30+30) TimeFormat-4 415ns ± 3% 415ns ± 3% ~ (p=0.436 n=30+30) [Geo mean] 67.0µs 66.9µs -0.17% name old speed new speed delta GobDecode-4 107MB/s ± 5% 107MB/s ± 6% ~ (p=0.663 n=30+30) GobEncode-4 107MB/s ± 6% 108MB/s ± 7% ~ (p=0.117 n=29+30) Gzip-4 47.0MB/s ± 4% 48.7MB/s ± 4% +3.61% (p=0.000 n=30+30) Gunzip-4 470MB/s ± 4% 466MB/s ± 4% -1.05% (p=0.011 n=30+30) JSONEncode-4 95.6MB/s ± 5% 96.7MB/s ± 5% +1.16% (p=0.005 n=30+30) JSONDecode-4 29.3MB/s ± 4% 28.7MB/s ± 4% -2.17% (p=0.000 n=30+30) GoParse-4 17.9MB/s ± 2% 17.7MB/s ± 2% -1.06% (p=0.007 n=30+30) RegexpMatchEasy0_32-4 323MB/s ± 5% 329MB/s ± 4% +1.93% (p=0.006 n=30+30) RegexpMatchEasy0_1K-4 1.22GB/s ± 3% 1.22GB/s ± 3% ~ (p=0.496 n=30+30) RegexpMatchEasy1_32-4 298MB/s ± 4% 303MB/s ± 4% +1.84% (p=0.017 n=30+30) RegexpMatchEasy1_1K-4 995MB/s ± 4% 989MB/s ± 4% ~ (p=0.307 n=30+30) RegexpMatchMedium_32-4 7.56MB/s ± 4% 7.74MB/s ± 4% +2.46% (p=0.000 n=22+30) RegexpMatchMedium_1K-4 23.2MB/s ± 4% 23.4MB/s ± 3% ~ (p=0.651 n=30+30) RegexpMatchHard_32-4 14.2MB/s ± 4% 14.3MB/s ± 4% +1.29% (p=0.021 n=30+30) RegexpMatchHard_1K-4 15.0MB/s ± 3% 14.9MB/s ± 4% ~ (p=0.069 n=30+29) Revcomp-4 138MB/s ± 2% 138MB/s ± 2% ~ (p=0.072 n=30+30) Template-4 28.1MB/s ± 3% 28.4MB/s ± 3% +1.05% (p=0.012 n=30+30) [Geo mean] 79.7MB/s 80.2MB/s +0.60% Change-Id: I44a1dfc942c9a385904553c4fe1fa8e509c8aa31 Reviewed-on: https://go-review.googlesource.com/120916 Run-TryBot: Ben Shi <powerman1st@163.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2018-06-26 02:58:54 +00:00
// direct binary-op on memory with a constant (read-modify-write)
{name: "ADDLconstmodify", argLength: 2, reg: gpstoreconst, asm: "ADDL", aux: "SymValAndOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // add ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem
{name: "ANDLconstmodify", argLength: 2, reg: gpstoreconst, asm: "ANDL", aux: "SymValAndOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // and ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem
{name: "ORLconstmodify", argLength: 2, reg: gpstoreconst, asm: "ORL", aux: "SymValAndOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // or ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem
{name: "XORLconstmodify", argLength: 2, reg: gpstoreconst, asm: "XORL", aux: "SymValAndOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // xor ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem
cmd/compile: add indexed form for several 386 instructions This CL implements indexed memory operands for the following instructions. (ADD|SUB|MUL|AND|OR|XOR)Lload -> (ADD|SUB|MUL|AND|OR|XOR)Lloadidx4 (ADD|SUB|AND|OR|XOR)Lmodify -> (ADD|SUB|AND|OR|XOR)Lmodifyidx4 (ADD|AND|OR|XOR)Lconstmodify -> (ADD|AND|OR|XOR)Lconstmodifyidx4 1. The total size of pkg/linux_386/ decreases about 2.5KB, excluding cmd/compile/ . 2. There is little regression in the go1 benchmark test, excluding noise. name old time/op new time/op delta BinaryTree17-4 3.25s ± 3% 3.25s ± 3% ~ (p=0.218 n=40+40) Fannkuch11-4 3.53s ± 1% 3.53s ± 1% ~ (p=0.303 n=40+40) FmtFprintfEmpty-4 44.9ns ± 3% 45.6ns ± 3% +1.48% (p=0.030 n=40+36) FmtFprintfString-4 78.7ns ± 5% 80.1ns ± 7% ~ (p=0.217 n=36+40) FmtFprintfInt-4 90.2ns ± 6% 89.8ns ± 5% ~ (p=0.659 n=40+38) FmtFprintfIntInt-4 140ns ± 5% 141ns ± 5% +1.00% (p=0.027 n=40+40) FmtFprintfPrefixedInt-4 185ns ± 3% 183ns ± 3% ~ (p=0.104 n=40+40) FmtFprintfFloat-4 411ns ± 4% 406ns ± 3% -1.37% (p=0.005 n=40+40) FmtManyArgs-4 590ns ± 4% 598ns ± 4% +1.35% (p=0.008 n=40+40) GobDecode-4 7.16ms ± 5% 7.10ms ± 5% ~ (p=0.335 n=40+40) GobEncode-4 6.85ms ± 7% 6.74ms ± 9% ~ (p=0.058 n=38+40) Gzip-4 400ms ± 4% 399ms ± 2% -0.34% (p=0.003 n=40+33) Gunzip-4 41.4ms ± 3% 41.4ms ± 4% -0.12% (p=0.020 n=40+40) HTTPClientServer-4 64.1µs ± 4% 63.5µs ± 2% -1.07% (p=0.000 n=39+37) JSONEncode-4 15.9ms ± 2% 15.9ms ± 3% ~ (p=0.103 n=40+40) JSONDecode-4 62.2ms ± 4% 61.6ms ± 3% -0.98% (p=0.006 n=39+40) Mandelbrot200-4 5.18ms ± 3% 5.14ms ± 4% ~ (p=0.125 n=40+40) GoParse-4 3.29ms ± 2% 3.27ms ± 2% -0.66% (p=0.006 n=40+40) RegexpMatchEasy0_32-4 103ns ± 4% 103ns ± 4% ~ (p=0.632 n=40+40) RegexpMatchEasy0_1K-4 830ns ± 3% 828ns ± 3% ~ (p=0.563 n=40+40) RegexpMatchEasy1_32-4 113ns ± 4% 113ns ± 4% ~ (p=0.494 n=40+40) RegexpMatchEasy1_1K-4 1.03µs ± 4% 1.03µs ± 4% ~ (p=0.665 n=40+40) RegexpMatchMedium_32-4 130ns ± 4% 129ns ± 3% ~ (p=0.458 n=40+40) RegexpMatchMedium_1K-4 39.4µs ± 3% 39.7µs ± 3% ~ (p=0.825 
n=40+40) RegexpMatchHard_32-4 2.16µs ± 4% 2.15µs ± 4% ~ (p=0.137 n=40+40) RegexpMatchHard_1K-4 65.2µs ± 3% 65.4µs ± 4% ~ (p=0.160 n=40+40) Revcomp-4 1.87s ± 2% 1.87s ± 1% +0.17% (p=0.019 n=33+33) Template-4 69.4ms ± 3% 69.8ms ± 3% +0.60% (p=0.009 n=40+40) TimeParse-4 437ns ± 4% 438ns ± 4% ~ (p=0.234 n=40+40) TimeFormat-4 408ns ± 3% 408ns ± 3% ~ (p=0.904 n=40+40) [Geo mean] 65.7µs 65.6µs -0.08% name old speed new speed delta GobDecode-4 107MB/s ± 5% 108MB/s ± 5% ~ (p=0.336 n=40+40) GobEncode-4 112MB/s ± 6% 114MB/s ± 9% +1.95% (p=0.036 n=37+40) Gzip-4 48.5MB/s ± 4% 48.6MB/s ± 2% +0.28% (p=0.003 n=40+33) Gunzip-4 469MB/s ± 4% 469MB/s ± 4% +0.11% (p=0.021 n=40+40) JSONEncode-4 122MB/s ± 2% 122MB/s ± 3% ~ (p=0.105 n=40+40) JSONDecode-4 31.2MB/s ± 4% 31.5MB/s ± 4% +0.99% (p=0.007 n=39+40) GoParse-4 17.6MB/s ± 2% 17.7MB/s ± 2% +0.66% (p=0.007 n=40+40) RegexpMatchEasy0_32-4 310MB/s ± 4% 310MB/s ± 4% ~ (p=0.384 n=40+40) RegexpMatchEasy0_1K-4 1.23GB/s ± 3% 1.24GB/s ± 3% ~ (p=0.186 n=40+40) RegexpMatchEasy1_32-4 283MB/s ± 3% 281MB/s ± 4% ~ (p=0.855 n=40+40) RegexpMatchEasy1_1K-4 1.00GB/s ± 4% 1.00GB/s ± 4% ~ (p=0.665 n=40+40) RegexpMatchMedium_32-4 7.68MB/s ± 4% 7.73MB/s ± 3% ~ (p=0.359 n=40+40) RegexpMatchMedium_1K-4 26.0MB/s ± 3% 25.8MB/s ± 3% ~ (p=0.825 n=40+40) RegexpMatchHard_32-4 14.8MB/s ± 3% 14.9MB/s ± 4% ~ (p=0.136 n=40+40) RegexpMatchHard_1K-4 15.7MB/s ± 3% 15.7MB/s ± 4% ~ (p=0.150 n=40+40) Revcomp-4 136MB/s ± 1% 136MB/s ± 1% -0.09% (p=0.028 n=32+33) Template-4 28.0MB/s ± 3% 27.8MB/s ± 3% -0.59% (p=0.010 n=40+40) [Geo mean] 82.1MB/s 82.3MB/s +0.25% Change-Id: Ifa387a251056678326d3508aa02753b70bf7e5d0 Reviewed-on: https://go-review.googlesource.com/c/140303 Run-TryBot: Ben Shi <powerman1st@163.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2018-10-06 13:13:48 +00:00
// direct binary-op on indexed memory with a constant (read-modify-write)
{name: "ADDLconstmodifyidx4", argLength: 3, reg: gpstoreconstidx, asm: "ADDL", aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // add ValAndOff(AuxInt).Val() to arg0+arg1*4+ValAndOff(AuxInt).Off()+aux, arg2=mem
{name: "ANDLconstmodifyidx4", argLength: 3, reg: gpstoreconstidx, asm: "ANDL", aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // and ValAndOff(AuxInt).Val() to arg0+arg1*4+ValAndOff(AuxInt).Off()+aux, arg2=mem
{name: "ORLconstmodifyidx4", argLength: 3, reg: gpstoreconstidx, asm: "ORL", aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // or ValAndOff(AuxInt).Val() to arg0+arg1*4+ValAndOff(AuxInt).Off()+aux, arg2=mem
{name: "XORLconstmodifyidx4", argLength: 3, reg: gpstoreconstidx, asm: "XORL", aux: "SymValAndOff", typ: "Mem", clobberFlags: true, symEffect: "Read,Write"}, // xor ValAndOff(AuxInt).Val() to arg0+arg1*4+ValAndOff(AuxInt).Off()+aux, arg2=mem
cmd/compile: add indexed form for several 386 instructions This CL implements indexed memory operands for the following instructions. (ADD|SUB|MUL|AND|OR|XOR)Lload -> (ADD|SUB|MUL|AND|OR|XOR)Lloadidx4 (ADD|SUB|AND|OR|XOR)Lmodify -> (ADD|SUB|AND|OR|XOR)Lmodifyidx4 (ADD|AND|OR|XOR)Lconstmodify -> (ADD|AND|OR|XOR)Lconstmodifyidx4 1. The total size of pkg/linux_386/ decreases about 2.5KB, excluding cmd/compile/ . 2. There is little regression in the go1 benchmark test, excluding noise. name old time/op new time/op delta BinaryTree17-4 3.25s ± 3% 3.25s ± 3% ~ (p=0.218 n=40+40) Fannkuch11-4 3.53s ± 1% 3.53s ± 1% ~ (p=0.303 n=40+40) FmtFprintfEmpty-4 44.9ns ± 3% 45.6ns ± 3% +1.48% (p=0.030 n=40+36) FmtFprintfString-4 78.7ns ± 5% 80.1ns ± 7% ~ (p=0.217 n=36+40) FmtFprintfInt-4 90.2ns ± 6% 89.8ns ± 5% ~ (p=0.659 n=40+38) FmtFprintfIntInt-4 140ns ± 5% 141ns ± 5% +1.00% (p=0.027 n=40+40) FmtFprintfPrefixedInt-4 185ns ± 3% 183ns ± 3% ~ (p=0.104 n=40+40) FmtFprintfFloat-4 411ns ± 4% 406ns ± 3% -1.37% (p=0.005 n=40+40) FmtManyArgs-4 590ns ± 4% 598ns ± 4% +1.35% (p=0.008 n=40+40) GobDecode-4 7.16ms ± 5% 7.10ms ± 5% ~ (p=0.335 n=40+40) GobEncode-4 6.85ms ± 7% 6.74ms ± 9% ~ (p=0.058 n=38+40) Gzip-4 400ms ± 4% 399ms ± 2% -0.34% (p=0.003 n=40+33) Gunzip-4 41.4ms ± 3% 41.4ms ± 4% -0.12% (p=0.020 n=40+40) HTTPClientServer-4 64.1µs ± 4% 63.5µs ± 2% -1.07% (p=0.000 n=39+37) JSONEncode-4 15.9ms ± 2% 15.9ms ± 3% ~ (p=0.103 n=40+40) JSONDecode-4 62.2ms ± 4% 61.6ms ± 3% -0.98% (p=0.006 n=39+40) Mandelbrot200-4 5.18ms ± 3% 5.14ms ± 4% ~ (p=0.125 n=40+40) GoParse-4 3.29ms ± 2% 3.27ms ± 2% -0.66% (p=0.006 n=40+40) RegexpMatchEasy0_32-4 103ns ± 4% 103ns ± 4% ~ (p=0.632 n=40+40) RegexpMatchEasy0_1K-4 830ns ± 3% 828ns ± 3% ~ (p=0.563 n=40+40) RegexpMatchEasy1_32-4 113ns ± 4% 113ns ± 4% ~ (p=0.494 n=40+40) RegexpMatchEasy1_1K-4 1.03µs ± 4% 1.03µs ± 4% ~ (p=0.665 n=40+40) RegexpMatchMedium_32-4 130ns ± 4% 129ns ± 3% ~ (p=0.458 n=40+40) RegexpMatchMedium_1K-4 39.4µs ± 3% 39.7µs ± 3% ~ (p=0.825 
n=40+40) RegexpMatchHard_32-4 2.16µs ± 4% 2.15µs ± 4% ~ (p=0.137 n=40+40) RegexpMatchHard_1K-4 65.2µs ± 3% 65.4µs ± 4% ~ (p=0.160 n=40+40) Revcomp-4 1.87s ± 2% 1.87s ± 1% +0.17% (p=0.019 n=33+33) Template-4 69.4ms ± 3% 69.8ms ± 3% +0.60% (p=0.009 n=40+40) TimeParse-4 437ns ± 4% 438ns ± 4% ~ (p=0.234 n=40+40) TimeFormat-4 408ns ± 3% 408ns ± 3% ~ (p=0.904 n=40+40) [Geo mean] 65.7µs 65.6µs -0.08% name old speed new speed delta GobDecode-4 107MB/s ± 5% 108MB/s ± 5% ~ (p=0.336 n=40+40) GobEncode-4 112MB/s ± 6% 114MB/s ± 9% +1.95% (p=0.036 n=37+40) Gzip-4 48.5MB/s ± 4% 48.6MB/s ± 2% +0.28% (p=0.003 n=40+33) Gunzip-4 469MB/s ± 4% 469MB/s ± 4% +0.11% (p=0.021 n=40+40) JSONEncode-4 122MB/s ± 2% 122MB/s ± 3% ~ (p=0.105 n=40+40) JSONDecode-4 31.2MB/s ± 4% 31.5MB/s ± 4% +0.99% (p=0.007 n=39+40) GoParse-4 17.6MB/s ± 2% 17.7MB/s ± 2% +0.66% (p=0.007 n=40+40) RegexpMatchEasy0_32-4 310MB/s ± 4% 310MB/s ± 4% ~ (p=0.384 n=40+40) RegexpMatchEasy0_1K-4 1.23GB/s ± 3% 1.24GB/s ± 3% ~ (p=0.186 n=40+40) RegexpMatchEasy1_32-4 283MB/s ± 3% 281MB/s ± 4% ~ (p=0.855 n=40+40) RegexpMatchEasy1_1K-4 1.00GB/s ± 4% 1.00GB/s ± 4% ~ (p=0.665 n=40+40) RegexpMatchMedium_32-4 7.68MB/s ± 4% 7.73MB/s ± 3% ~ (p=0.359 n=40+40) RegexpMatchMedium_1K-4 26.0MB/s ± 3% 25.8MB/s ± 3% ~ (p=0.825 n=40+40) RegexpMatchHard_32-4 14.8MB/s ± 3% 14.9MB/s ± 4% ~ (p=0.136 n=40+40) RegexpMatchHard_1K-4 15.7MB/s ± 3% 15.7MB/s ± 4% ~ (p=0.150 n=40+40) Revcomp-4 136MB/s ± 1% 136MB/s ± 1% -0.09% (p=0.028 n=32+33) Template-4 28.0MB/s ± 3% 27.8MB/s ± 3% -0.59% (p=0.010 n=40+40) [Geo mean] 82.1MB/s 82.3MB/s +0.25% Change-Id: Ifa387a251056678326d3508aa02753b70bf7e5d0 Reviewed-on: https://go-review.googlesource.com/c/140303 Run-TryBot: Ben Shi <powerman1st@163.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2018-10-06 13:13:48 +00:00
// indexed loads/stores
cmd/compile: automatically handle commuting ops in rewrite rules Note that this is a redo of an undo of the original buggy CL 38666. We have lots of rewrite rules that vary only in the fact that we have 2 versions for the 2 different orderings of various commuting ops. For example: (ADDL x (MOVLconst [c])) -> (ADDLconst [c] x) (ADDL (MOVLconst [c]) x) -> (ADDLconst [c] x) It can get unwieldly quickly, especially when there is more than one commuting op in a rule. Our existing "fix" for this problem is to have rules that canonicalize the operations first. For example: (Eq64 x (Const64 <t> [c])) && x.Op != OpConst64 -> (Eq64 (Const64 <t> [c]) x) Subsequent rules can then assume if there is a constant arg to Eq64, it will be the first one. This fix kinda works, but it is fragile and only works when we remember to include the required extra rules. The fundamental problem is that the rule matcher doesn't know anything about commuting ops. This CL fixes that fact. We already have information about which ops commute. (The register allocator takes advantage of commutivity.) The rule generator now automatically generates multiple rules for a single source rule when there are commutative ops in the rule. We can now drop all of our almost-duplicate source-level rules and the canonicalization rules. I have some CLs in progress that will be a lot less verbose when the rule generator handles commutivity for me. I had to reorganize the load-combining rules a bit. The 8-way OR rules generated 128 different reorderings, which was causing the generator to put too much code in the rewrite*.go files (the big ones were going from 25K lines to 132K lines). Instead I reorganized the rules to combine pairs of loads at a time. The generated rule files are now actually a bit (5%) smaller. Make.bash times are ~unchanged. Compiler benchmarks are not observably different. Probably because we don't spend much compiler time in rule matching anyway. 
I've also done a pass over all of our ops adding commutative markings for ops which hadn't had them previously. Fixes #18292 Change-Id: Ic1c0e43fbf579539f459971625f69690c9ab8805 Reviewed-on: https://go-review.googlesource.com/38801 Run-TryBot: Keith Randall <khr@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: David Chase <drchase@google.com>
2017-03-30 03:30:22 +00:00
{name: "MOVBloadidx1", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVBLZX", aux: "SymOff", symEffect: "Read"}, // load a byte from arg0+arg1+auxint+aux. arg2=mem
{name: "MOVWloadidx1", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVWLZX", aux: "SymOff", symEffect: "Read"}, // load 2 bytes from arg0+arg1+auxint+aux. arg2=mem
{name: "MOVWloadidx2", argLength: 3, reg: gploadidx, asm: "MOVWLZX", aux: "SymOff", symEffect: "Read"}, // load 2 bytes from arg0+2*arg1+auxint+aux. arg2=mem
{name: "MOVLloadidx1", argLength: 3, reg: gploadidx, commutative: true, asm: "MOVL", aux: "SymOff", symEffect: "Read"}, // load 4 bytes from arg0+arg1+auxint+aux. arg2=mem
{name: "MOVLloadidx4", argLength: 3, reg: gploadidx, asm: "MOVL", aux: "SymOff", symEffect: "Read"}, // load 4 bytes from arg0+4*arg1+auxint+aux. arg2=mem
// TODO: sign-extending indexed loads
cmd/compile: automatically handle commuting ops in rewrite rules Note that this is a redo of an undo of the original buggy CL 38666. We have lots of rewrite rules that vary only in the fact that we have 2 versions for the 2 different orderings of various commuting ops. For example: (ADDL x (MOVLconst [c])) -> (ADDLconst [c] x) (ADDL (MOVLconst [c]) x) -> (ADDLconst [c] x) It can get unwieldly quickly, especially when there is more than one commuting op in a rule. Our existing "fix" for this problem is to have rules that canonicalize the operations first. For example: (Eq64 x (Const64 <t> [c])) && x.Op != OpConst64 -> (Eq64 (Const64 <t> [c]) x) Subsequent rules can then assume if there is a constant arg to Eq64, it will be the first one. This fix kinda works, but it is fragile and only works when we remember to include the required extra rules. The fundamental problem is that the rule matcher doesn't know anything about commuting ops. This CL fixes that fact. We already have information about which ops commute. (The register allocator takes advantage of commutivity.) The rule generator now automatically generates multiple rules for a single source rule when there are commutative ops in the rule. We can now drop all of our almost-duplicate source-level rules and the canonicalization rules. I have some CLs in progress that will be a lot less verbose when the rule generator handles commutivity for me. I had to reorganize the load-combining rules a bit. The 8-way OR rules generated 128 different reorderings, which was causing the generator to put too much code in the rewrite*.go files (the big ones were going from 25K lines to 132K lines). Instead I reorganized the rules to combine pairs of loads at a time. The generated rule files are now actually a bit (5%) smaller. Make.bash times are ~unchanged. Compiler benchmarks are not observably different. Probably because we don't spend much compiler time in rule matching anyway. 
I've also done a pass over all of our ops adding commutative markings for ops which hadn't had them previously. Fixes #18292 Change-Id: Ic1c0e43fbf579539f459971625f69690c9ab8805 Reviewed-on: https://go-review.googlesource.com/38801 Run-TryBot: Keith Randall <khr@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: David Chase <drchase@google.com>
2017-03-30 03:30:22 +00:00
{name: "MOVBstoreidx1", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVB", aux: "SymOff", symEffect: "Write"}, // store byte in arg2 to arg0+arg1+auxint+aux. arg3=mem
{name: "MOVWstoreidx1", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVW", aux: "SymOff", symEffect: "Write"}, // store 2 bytes in arg2 to arg0+arg1+auxint+aux. arg3=mem
{name: "MOVWstoreidx2", argLength: 4, reg: gpstoreidx, asm: "MOVW", aux: "SymOff", symEffect: "Write"}, // store 2 bytes in arg2 to arg0+2*arg1+auxint+aux. arg3=mem
{name: "MOVLstoreidx1", argLength: 4, reg: gpstoreidx, commutative: true, asm: "MOVL", aux: "SymOff", symEffect: "Write"}, // store 4 bytes in arg2 to arg0+arg1+auxint+aux. arg3=mem
{name: "MOVLstoreidx4", argLength: 4, reg: gpstoreidx, asm: "MOVL", aux: "SymOff", symEffect: "Write"}, // store 4 bytes in arg2 to arg0+4*arg1+auxint+aux. arg3=mem
// TODO: add size-mismatched indexed stores, like MOVBstoreidx4.
// For storeconst ops, the AuxInt field encodes both
// the value to store and an address offset of the store.
// Cast AuxInt to a ValAndOff to extract Val and Off fields.
{name: "MOVBstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVB", aux: "SymValAndOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store low byte of ValAndOff(AuxInt).Val() to arg0+ValAndOff(AuxInt).Off()+aux. arg1=mem
{name: "MOVWstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVW", aux: "SymValAndOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store low 2 bytes of ...
{name: "MOVLstoreconst", argLength: 2, reg: gpstoreconst, asm: "MOVL", aux: "SymValAndOff", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store low 4 bytes of ...
{name: "MOVBstoreconstidx1", argLength: 3, reg: gpstoreconstidx, asm: "MOVB", aux: "SymValAndOff", typ: "Mem", symEffect: "Write"}, // store low byte of ValAndOff(AuxInt).Val() to arg0+1*arg1+ValAndOff(AuxInt).Off()+aux. arg2=mem
{name: "MOVWstoreconstidx1", argLength: 3, reg: gpstoreconstidx, asm: "MOVW", aux: "SymValAndOff", typ: "Mem", symEffect: "Write"}, // store low 2 bytes of ... arg1 ...
{name: "MOVWstoreconstidx2", argLength: 3, reg: gpstoreconstidx, asm: "MOVW", aux: "SymValAndOff", typ: "Mem", symEffect: "Write"}, // store low 2 bytes of ... 2*arg1 ...
{name: "MOVLstoreconstidx1", argLength: 3, reg: gpstoreconstidx, asm: "MOVL", aux: "SymValAndOff", typ: "Mem", symEffect: "Write"}, // store low 4 bytes of ... arg1 ...
{name: "MOVLstoreconstidx4", argLength: 3, reg: gpstoreconstidx, asm: "MOVL", aux: "SymValAndOff", typ: "Mem", symEffect: "Write"}, // store low 4 bytes of ... 4*arg1 ...
// arg0 = pointer to start of memory to zero
// arg1 = value to store (will always be zero)
// arg2 = mem
// auxint = offset into duffzero code to start executing
// returns mem
{
name: "DUFFZERO",
aux: "Int64",
argLength: 3,
reg: regInfo{
inputs: []regMask{buildReg("DI"), buildReg("AX")},
clobbers: buildReg("DI CX"),
// Note: CX is only clobbered when dynamic linking.
},
faultOnNilArg0: true,
},
// arg0 = address of memory to zero
// arg1 = # of 4-byte words to zero
// arg2 = value to store (will always be zero)
// arg3 = mem
// returns mem
{
name: "REPSTOSL",
argLength: 4,
reg: regInfo{
inputs: []regMask{buildReg("DI"), buildReg("CX"), buildReg("AX")},
clobbers: buildReg("DI CX"),
},
faultOnNilArg0: true,
},
{name: "CALLstatic", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call static function aux.(*obj.LSym). arg0=mem, auxint=argsize, returns mem
{name: "CALLtail", argLength: 1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true, tailCall: true}, // tail call static function aux.(*obj.LSym). arg0=mem, auxint=argsize, returns mem
{name: "CALLclosure", argLength: 3, reg: regInfo{inputs: []regMask{gpsp, buildReg("DX"), 0}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call function via closure. arg0=codeptr, arg1=closure, arg2=mem, auxint=argsize, returns mem
{name: "CALLinter", argLength: 2, reg: regInfo{inputs: []regMask{gp}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call fn by pointer. arg0=codeptr, arg1=mem, auxint=argsize, returns mem
// arg0 = destination pointer
// arg1 = source pointer
// arg2 = mem
// auxint = offset from duffcopy symbol to call
// returns memory
{
name: "DUFFCOPY",
aux: "Int64",
argLength: 3,
reg: regInfo{
inputs: []regMask{buildReg("DI"), buildReg("SI")},
clobbers: buildReg("DI SI CX"), // uses CX as a temporary
},
clobberFlags: true,
faultOnNilArg0: true,
faultOnNilArg1: true,
},
// arg0 = destination pointer
// arg1 = source pointer
// arg2 = # of 4-byte words to copy
// arg3 = mem
// returns memory
{
name: "REPMOVSL",
argLength: 4,
reg: regInfo{
inputs: []regMask{buildReg("DI"), buildReg("SI"), buildReg("CX")},
clobbers: buildReg("DI SI CX"),
},
faultOnNilArg0: true,
faultOnNilArg1: true,
},
// (InvertFlags (CMPL a b)) == (CMPL b a)
// So if we want (SETL (CMPL a b)) but we can't do that because a is a constant,
// then we do (SETL (InvertFlags (CMPL b a))) instead.
// Rewrites will convert this to (SETG (CMPL b a)).
// InvertFlags is a pseudo-op which can't appear in assembly output.
{name: "InvertFlags", argLength: 1}, // reverse direction of arg0
// Pseudo-ops
{name: "LoweredGetG", argLength: 1, reg: gp01}, // arg0=mem
// Scheduler ensures LoweredGetClosurePtr occurs only in entry block,
// and sorts it to the very beginning of the block to prevent other
// use of DX (the closure pointer)
{name: "LoweredGetClosurePtr", reg: regInfo{outputs: []regMask{buildReg("DX")}}, zeroWidth: true},
// LoweredGetCallerPC evaluates to the PC to which its "caller" will return.
// I.e., if f calls g, and g in turn "calls" getcallerpc,
// the result should be the PC within f that g will return to.
// See runtime/stubs.go for a more detailed discussion.
{name: "LoweredGetCallerPC", reg: gp01, rematerializeable: true},
// LoweredGetCallerSP returns the SP of the caller of the current function. arg0=mem
{name: "LoweredGetCallerSP", argLength: 1, reg: gp01, rematerializeable: true},
// arg0=ptr, arg1=mem, returns void. Faults if ptr is nil.
{name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gpsp}}, clobberFlags: true, nilCheck: true, faultOnNilArg0: true},
// LoweredWB invokes runtime.gcWriteBarrier. arg0=destptr, arg1=srcptr, arg2=mem, aux=runtime.gcWriteBarrier
// It saves all GP registers if necessary, but may clobber others.
{name: "LoweredWB", argLength: 3, reg: regInfo{inputs: []regMask{buildReg("DI"), ax}, clobbers: callerSave &^ gp}, clobberFlags: true, aux: "Sym", symEffect: "None"},
// There are three of these functions so that they can have three different register inputs.
// When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the
// default registers to match so we don't need to copy registers around unnecessarily.
{name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{dx, bx}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
{name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{cx, dx}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
{name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{ax, cx}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
// Extend ops are the same as Bounds ops except the indexes are 64-bit.
{name: "LoweredPanicExtendA", argLength: 4, aux: "Int64", reg: regInfo{inputs: []regMask{si, dx, bx}}, typ: "Mem", call: true}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory. AuxInt contains report code (see PanicExtend in genericOps.go).
{name: "LoweredPanicExtendB", argLength: 4, aux: "Int64", reg: regInfo{inputs: []regMask{si, cx, dx}}, typ: "Mem", call: true}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory. AuxInt contains report code (see PanicExtend in genericOps.go).
{name: "LoweredPanicExtendC", argLength: 4, aux: "Int64", reg: regInfo{inputs: []regMask{si, ax, cx}}, typ: "Mem", call: true}, // arg0=idxHi, arg1=idxLo, arg2=len, arg3=mem, returns memory. AuxInt contains report code (see PanicExtend in genericOps.go).
// Constant flag values. For any comparison, there are 5 possible
// outcomes: the three from the signed total order (<,==,>) and the
// three from the unsigned total order. The == cases overlap.
// Note: there's a sixth "unordered" outcome for floating-point
// comparisons, but we don't use such a beast yet.
// These ops are for temporary use by rewrite rules. They
// cannot appear in the generated assembly.
{name: "FlagEQ"}, // equal
{name: "FlagLT_ULT"}, // signed < and unsigned <
{name: "FlagLT_UGT"}, // signed < and unsigned >
{name: "FlagGT_UGT"}, // signed > and unsigned >
{name: "FlagGT_ULT"}, // signed > and unsigned <
// Special ops for PIC floating-point constants.
// MOVSXconst1 loads the address of the constant-pool entry into a register.
// MOVSXconst2 loads the constant from that address.
// MOVSXconst1 returns a pointer, but we type it as uint32 because it can never point to the Go heap.
{name: "MOVSSconst1", reg: gp01, typ: "UInt32", aux: "Float32"},
{name: "MOVSDconst1", reg: gp01, typ: "UInt32", aux: "Float64"},
{name: "MOVSSconst2", argLength: 1, reg: gpfp, asm: "MOVSS"},
{name: "MOVSDconst2", argLength: 1, reg: gpfp, asm: "MOVSD"},
}
var _386blocks = []blockData{
cmd/compile: allow multiple SSA block control values Control values are used to choose which successor of a block is jumped to. Typically a control value takes the form of a 'flags' value that represents the result of a comparison. Some architectures however use a variable in a register as a control value. Up until now we have managed with a single control value per block. However some architectures (e.g. s390x and riscv64) have combined compare-and-branch instructions that take two variables in registers as parameters. To generate these instructions we need to support 2 control values per block. This CL allows up to 2 control values to be used in a block in order to support the addition of compare-and-branch instructions. I have implemented s390x compare-and-branch instructions in a different CL. Passes toolstash-check -all. Results of compilebench: name old time/op new time/op delta Template 208ms ± 1% 209ms ± 1% ~ (p=0.289 n=20+20) Unicode 83.7ms ± 1% 83.3ms ± 3% -0.49% (p=0.017 n=18+18) GoTypes 748ms ± 1% 748ms ± 0% ~ (p=0.460 n=20+18) Compiler 3.47s ± 1% 3.48s ± 1% ~ (p=0.070 n=19+18) SSA 11.5s ± 1% 11.7s ± 1% +1.64% (p=0.000 n=19+18) Flate 130ms ± 1% 130ms ± 1% ~ (p=0.588 n=19+20) GoParser 160ms ± 1% 161ms ± 1% ~ (p=0.211 n=20+20) Reflect 465ms ± 1% 467ms ± 1% +0.42% (p=0.007 n=20+20) Tar 184ms ± 1% 185ms ± 2% ~ (p=0.087 n=18+20) XML 253ms ± 1% 253ms ± 1% ~ (p=0.377 n=20+18) LinkCompiler 769ms ± 2% 774ms ± 2% ~ (p=0.070 n=19+19) ExternalLinkCompiler 3.59s ±11% 3.68s ± 6% ~ (p=0.072 n=20+20) LinkWithoutDebugCompiler 446ms ± 5% 454ms ± 3% +1.79% (p=0.002 n=19+20) StdCmd 26.0s ± 2% 26.0s ± 2% ~ (p=0.799 n=20+20) name old user-time/op new user-time/op delta Template 238ms ± 5% 240ms ± 5% ~ (p=0.142 n=20+20) Unicode 105ms ±11% 106ms ±10% ~ (p=0.512 n=20+20) GoTypes 876ms ± 2% 873ms ± 4% ~ (p=0.647 n=20+19) Compiler 4.17s ± 2% 4.19s ± 1% ~ (p=0.093 n=20+18) SSA 13.9s ± 1% 14.1s ± 1% +1.45% (p=0.000 n=18+18) Flate 145ms ±13% 146ms ± 5% ~ (p=0.851 n=20+18) GoParser 
185ms ± 5% 188ms ± 7% ~ (p=0.174 n=20+20) Reflect 534ms ± 3% 538ms ± 2% ~ (p=0.105 n=20+18) Tar 215ms ± 4% 211ms ± 9% ~ (p=0.079 n=19+20) XML 295ms ± 6% 295ms ± 5% ~ (p=0.968 n=20+20) LinkCompiler 832ms ± 4% 837ms ± 7% ~ (p=0.707 n=17+20) ExternalLinkCompiler 1.58s ± 8% 1.60s ± 4% ~ (p=0.296 n=20+19) LinkWithoutDebugCompiler 478ms ±12% 489ms ±10% ~ (p=0.429 n=20+20) name old object-bytes new object-bytes delta Template 559kB ± 0% 559kB ± 0% ~ (all equal) Unicode 216kB ± 0% 216kB ± 0% ~ (all equal) GoTypes 2.03MB ± 0% 2.03MB ± 0% ~ (all equal) Compiler 8.07MB ± 0% 8.07MB ± 0% -0.06% (p=0.000 n=20+20) SSA 27.1MB ± 0% 27.3MB ± 0% +0.89% (p=0.000 n=20+20) Flate 343kB ± 0% 343kB ± 0% ~ (all equal) GoParser 441kB ± 0% 441kB ± 0% ~ (all equal) Reflect 1.36MB ± 0% 1.36MB ± 0% ~ (all equal) Tar 487kB ± 0% 487kB ± 0% ~ (all equal) XML 632kB ± 0% 632kB ± 0% ~ (all equal) name old export-bytes new export-bytes delta Template 18.5kB ± 0% 18.5kB ± 0% ~ (all equal) Unicode 7.92kB ± 0% 7.92kB ± 0% ~ (all equal) GoTypes 35.0kB ± 0% 35.0kB ± 0% ~ (all equal) Compiler 109kB ± 0% 110kB ± 0% +0.72% (p=0.000 n=20+20) SSA 137kB ± 0% 138kB ± 0% +0.58% (p=0.000 n=20+20) Flate 4.89kB ± 0% 4.89kB ± 0% ~ (all equal) GoParser 8.49kB ± 0% 8.49kB ± 0% ~ (all equal) Reflect 11.4kB ± 0% 11.4kB ± 0% ~ (all equal) Tar 10.5kB ± 0% 10.5kB ± 0% ~ (all equal) XML 16.7kB ± 0% 16.7kB ± 0% ~ (all equal) name old text-bytes new text-bytes delta HelloSize 761kB ± 0% 761kB ± 0% ~ (all equal) CmdGoSize 10.8MB ± 0% 10.8MB ± 0% ~ (all equal) name old data-bytes new data-bytes delta HelloSize 10.7kB ± 0% 10.7kB ± 0% ~ (all equal) CmdGoSize 312kB ± 0% 312kB ± 0% ~ (all equal) name old bss-bytes new bss-bytes delta HelloSize 122kB ± 0% 122kB ± 0% ~ (all equal) CmdGoSize 146kB ± 0% 146kB ± 0% ~ (all equal) name old exe-bytes new exe-bytes delta HelloSize 1.13MB ± 0% 1.13MB ± 0% ~ (all equal) CmdGoSize 15.1MB ± 0% 15.1MB ± 0% ~ (all equal) Change-Id: I3cc2f9829a109543d9a68be4a21775d2d3e9801f Reviewed-on: 
https://go-review.googlesource.com/c/go/+/196557 Run-TryBot: Michael Munday <mike.munday@ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Daniel Martí <mvdan@mvdan.cc> Reviewed-by: Keith Randall <khr@golang.org>
2019-08-12 20:19:58 +01:00
{name: "EQ", controls: 1},
{name: "NE", controls: 1},
{name: "LT", controls: 1},
{name: "LE", controls: 1},
{name: "GT", controls: 1},
{name: "GE", controls: 1},
{name: "OS", controls: 1},
{name: "OC", controls: 1},
{name: "ULT", controls: 1},
{name: "ULE", controls: 1},
{name: "UGT", controls: 1},
{name: "UGE", controls: 1},
{name: "EQF", controls: 1},
{name: "NEF", controls: 1},
{name: "ORD", controls: 1}, // FP, ordered comparison (parity zero)
{name: "NAN", controls: 1}, // FP, unordered comparison (parity one)
}
archs = append(archs, arch{
name: "386",
pkg: "cmd/internal/obj/x86",
genfile: "../../x86/ssa.go",
ops: _386ops,
blocks: _386blocks,
regnames: regNames386,
gpregmask: gp,
fpregmask: fp,
framepointerreg: int8(num["BP"]),
linkreg: -1, // not used
})
}