go/src/cmd/internal/obj/wasm/wasmobj.go

1180 lines
29 KiB
Go
Raw Normal View History

// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package wasm
import (
"bytes"
"cmd/internal/obj"
"cmd/internal/objabi"
"cmd/internal/sys"
"encoding/binary"
"fmt"
"io"
"math"
)
var Register = map[string]int16{
"SP": REG_SP,
"CTXT": REG_CTXT,
"g": REG_g,
"RET0": REG_RET0,
"RET1": REG_RET1,
"RET2": REG_RET2,
"RET3": REG_RET3,
"PAUSE": REG_PAUSE,
"R0": REG_R0,
"R1": REG_R1,
"R2": REG_R2,
"R3": REG_R3,
"R4": REG_R4,
"R5": REG_R5,
"R6": REG_R6,
"R7": REG_R7,
"R8": REG_R8,
"R9": REG_R9,
"R10": REG_R10,
"R11": REG_R11,
"R12": REG_R12,
"R13": REG_R13,
"R14": REG_R14,
"R15": REG_R15,
"F0": REG_F0,
"F1": REG_F1,
"F2": REG_F2,
"F3": REG_F3,
"F4": REG_F4,
"F5": REG_F5,
"F6": REG_F6,
"F7": REG_F7,
"F8": REG_F8,
"F9": REG_F9,
"F10": REG_F10,
"F11": REG_F11,
"F12": REG_F12,
"F13": REG_F13,
"F14": REG_F14,
"F15": REG_F15,
cmd, runtime: remove PC_F & PC_B globals on Wasm Following the previous CL, this removes more global variables on Wasm. PC_B is used mostly for intra-function jumps, and for a function telling its callee where to start or resume. This usage can be served by a parameter. The top level loop (wasm_pc_f_loop) uses PC_B for resuming a function. This value is either set by gogo, or loaded from the Go stack at function return. Instead of loading PC_B at each function return, we could make gogo stores PC_B at the same stack location, and let the top level loop do the load. This way, we don't need to use global PC_B to communicate with the top level loop, and we can replace global PC_B with a parameter. PC_F is similar. It is even more so in that the only reader is the top level loop. Let the top level loop read it from the stack, and we can get rid of PC_F entirely. PC_F and PC_B are used less entensively as SP, so this CL has smaller performance gain. Running on Chrome 74.0.3729.108 on Linux/AMD64, name old time/op new time/op delta BinaryTree17 16.6s ± 0% 16.2s ± 1% -2.59% (p=0.016 n=4+5) Fannkuch11 11.1s ± 1% 10.8s ± 0% -2.65% (p=0.008 n=5+5) FmtFprintfEmpty 231ns ± 1% 217ns ± 0% -6.06% (p=0.008 n=5+5) FmtFprintfString 407ns ± 3% 375ns ± 2% -7.81% (p=0.008 n=5+5) FmtFprintfInt 466ns ± 2% 430ns ± 0% -7.79% (p=0.016 n=5+4) FmtFprintfIntInt 719ns ± 2% 673ns ± 2% -6.37% (p=0.008 n=5+5) FmtFprintfPrefixedInt 706ns ± 1% 676ns ± 3% -4.31% (p=0.008 n=5+5) FmtFprintfFloat 1.01µs ± 1% 0.97µs ± 1% -4.30% (p=0.008 n=5+5) FmtManyArgs 2.67µs ± 1% 2.51µs ± 1% -5.95% (p=0.008 n=5+5) GobDecode 30.7ms ± 9% 31.3ms ±34% ~ (p=0.222 n=5+5) GobEncode 24.2ms ±23% 20.2ms ± 0% -16.36% (p=0.016 n=5+4) Gzip 852ms ± 0% 823ms ± 0% -3.38% (p=0.016 n=4+5) Gunzip 160ms ± 1% 151ms ± 1% -5.37% (p=0.008 n=5+5) JSONEncode 35.7ms ± 1% 34.3ms ± 1% -3.81% (p=0.008 n=5+5) JSONDecode 247ms ± 8% 254ms ± 7% ~ (p=0.548 n=5+5) Mandelbrot200 5.39ms ± 0% 5.41ms ± 0% +0.42% (p=0.008 n=5+5) GoParse 18.5ms ± 1% 18.3ms ± 2% ~ (p=0.343 n=4+4) RegexpMatchEasy0_32 424ns ± 2% 397ns ± 0% -6.23% (p=0.008 n=5+5) RegexpMatchEasy0_1K 2.88µs ± 0% 2.86µs ± 1% ~ (p=0.079 n=5+5) RegexpMatchEasy1_32 395ns ± 2% 370ns ± 1% -6.23% (p=0.008 n=5+5) RegexpMatchEasy1_1K 3.26µs ± 0% 3.19µs ± 1% -2.06% (p=0.008 n=5+5) RegexpMatchMedium_32 564ns ± 1% 532ns ± 0% -5.71% (p=0.008 n=5+5) RegexpMatchMedium_1K 146µs ± 2% 140µs ± 1% -4.62% (p=0.008 n=5+5) RegexpMatchHard_32 8.47µs ± 1% 7.91µs ± 1% -6.65% (p=0.008 n=5+5) RegexpMatchHard_1K 253µs ± 1% 236µs ± 2% -6.66% (p=0.008 n=5+5) Revcomp 1.78s ± 4% 1.76s ± 5% ~ (p=1.000 n=5+5) Template 292ms ±29% 269ms ± 5% ~ (p=0.690 n=5+5) TimeParse 1.61µs ± 4% 1.54µs ± 1% -4.42% (p=0.008 n=5+5) TimeFormat 1.66µs ± 3% 1.58µs ± 1% -5.22% (p=0.008 n=5+5) [Geo mean] 232µs 221µs -4.54% name old speed new speed delta GobDecode 25.0MB/s ± 8% 25.1MB/s ±27% ~ (p=0.222 n=5+5) GobEncode 32.8MB/s ±21% 38.0MB/s ± 0% +15.84% (p=0.016 n=5+4) Gzip 22.8MB/s ± 0% 23.6MB/s ± 0% +3.49% (p=0.016 n=4+5) Gunzip 121MB/s ± 1% 128MB/s ± 1% +5.68% (p=0.008 n=5+5) JSONEncode 54.4MB/s ± 1% 56.5MB/s ± 1% +3.97% (p=0.008 n=5+5) JSONDecode 7.88MB/s ± 8% 7.65MB/s ± 8% ~ (p=0.548 n=5+5) GoParse 3.07MB/s ± 8% 3.00MB/s ±22% ~ (p=0.579 n=5+5) RegexpMatchEasy0_32 75.6MB/s ± 2% 80.5MB/s ± 0% +6.58% (p=0.008 n=5+5) RegexpMatchEasy0_1K 356MB/s ± 0% 358MB/s ± 1% ~ (p=0.095 n=5+5) RegexpMatchEasy1_32 81.1MB/s ± 2% 86.5MB/s ± 1% +6.69% (p=0.008 n=5+5) RegexpMatchEasy1_1K 314MB/s ± 0% 320MB/s ± 0% +2.10% (p=0.008 n=5+5) RegexpMatchMedium_32 1.77MB/s ± 1% 1.88MB/s ± 0% +6.09% (p=0.016 n=5+4) RegexpMatchMedium_1K 6.99MB/s ± 2% 7.33MB/s ± 1% +4.83% (p=0.008 n=5+5) RegexpMatchHard_32 3.78MB/s ± 1% 4.04MB/s ± 1% +7.04% (p=0.008 n=5+5) RegexpMatchHard_1K 4.04MB/s ± 1% 4.33MB/s ± 2% +7.17% (p=0.008 n=5+5) Revcomp 143MB/s ± 4% 145MB/s ± 5% ~ (p=1.000 n=5+5) Template 6.77MB/s ±24% 7.22MB/s ± 5% ~ (p=0.690 n=5+5) [Geo mean] 24.4MB/s 25.4MB/s +4.18% Change-Id: Ib80716e62992aec28b2c4a96af280c278f83aa49 Reviewed-on: https://go-review.googlesource.com/c/go/+/173980 Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Richard Musiol <neelance@gmail.com>
2019-04-25 17:25:54 -04:00
"F16": REG_F16,
"F17": REG_F17,
"F18": REG_F18,
"F19": REG_F19,
"F20": REG_F20,
"F21": REG_F21,
"F22": REG_F22,
"F23": REG_F23,
"F24": REG_F24,
"F25": REG_F25,
"F26": REG_F26,
"F27": REG_F27,
"F28": REG_F28,
"F29": REG_F29,
"F30": REG_F30,
"F31": REG_F31,
cmd, runtime: remove PC_F & PC_B globals on Wasm Following the previous CL, this removes more global variables on Wasm. PC_B is used mostly for intra-function jumps, and for a function telling its callee where to start or resume. This usage can be served by a parameter. The top level loop (wasm_pc_f_loop) uses PC_B for resuming a function. This value is either set by gogo, or loaded from the Go stack at function return. Instead of loading PC_B at each function return, we could make gogo stores PC_B at the same stack location, and let the top level loop do the load. This way, we don't need to use global PC_B to communicate with the top level loop, and we can replace global PC_B with a parameter. PC_F is similar. It is even more so in that the only reader is the top level loop. Let the top level loop read it from the stack, and we can get rid of PC_F entirely. PC_F and PC_B are used less entensively as SP, so this CL has smaller performance gain. Running on Chrome 74.0.3729.108 on Linux/AMD64, name old time/op new time/op delta BinaryTree17 16.6s ± 0% 16.2s ± 1% -2.59% (p=0.016 n=4+5) Fannkuch11 11.1s ± 1% 10.8s ± 0% -2.65% (p=0.008 n=5+5) FmtFprintfEmpty 231ns ± 1% 217ns ± 0% -6.06% (p=0.008 n=5+5) FmtFprintfString 407ns ± 3% 375ns ± 2% -7.81% (p=0.008 n=5+5) FmtFprintfInt 466ns ± 2% 430ns ± 0% -7.79% (p=0.016 n=5+4) FmtFprintfIntInt 719ns ± 2% 673ns ± 2% -6.37% (p=0.008 n=5+5) FmtFprintfPrefixedInt 706ns ± 1% 676ns ± 3% -4.31% (p=0.008 n=5+5) FmtFprintfFloat 1.01µs ± 1% 0.97µs ± 1% -4.30% (p=0.008 n=5+5) FmtManyArgs 2.67µs ± 1% 2.51µs ± 1% -5.95% (p=0.008 n=5+5) GobDecode 30.7ms ± 9% 31.3ms ±34% ~ (p=0.222 n=5+5) GobEncode 24.2ms ±23% 20.2ms ± 0% -16.36% (p=0.016 n=5+4) Gzip 852ms ± 0% 823ms ± 0% -3.38% (p=0.016 n=4+5) Gunzip 160ms ± 1% 151ms ± 1% -5.37% (p=0.008 n=5+5) JSONEncode 35.7ms ± 1% 34.3ms ± 1% -3.81% (p=0.008 n=5+5) JSONDecode 247ms ± 8% 254ms ± 7% ~ (p=0.548 n=5+5) Mandelbrot200 5.39ms ± 0% 5.41ms ± 0% +0.42% (p=0.008 n=5+5) GoParse 18.5ms ± 1% 18.3ms ± 2% ~ (p=0.343 n=4+4) RegexpMatchEasy0_32 424ns ± 2% 397ns ± 0% -6.23% (p=0.008 n=5+5) RegexpMatchEasy0_1K 2.88µs ± 0% 2.86µs ± 1% ~ (p=0.079 n=5+5) RegexpMatchEasy1_32 395ns ± 2% 370ns ± 1% -6.23% (p=0.008 n=5+5) RegexpMatchEasy1_1K 3.26µs ± 0% 3.19µs ± 1% -2.06% (p=0.008 n=5+5) RegexpMatchMedium_32 564ns ± 1% 532ns ± 0% -5.71% (p=0.008 n=5+5) RegexpMatchMedium_1K 146µs ± 2% 140µs ± 1% -4.62% (p=0.008 n=5+5) RegexpMatchHard_32 8.47µs ± 1% 7.91µs ± 1% -6.65% (p=0.008 n=5+5) RegexpMatchHard_1K 253µs ± 1% 236µs ± 2% -6.66% (p=0.008 n=5+5) Revcomp 1.78s ± 4% 1.76s ± 5% ~ (p=1.000 n=5+5) Template 292ms ±29% 269ms ± 5% ~ (p=0.690 n=5+5) TimeParse 1.61µs ± 4% 1.54µs ± 1% -4.42% (p=0.008 n=5+5) TimeFormat 1.66µs ± 3% 1.58µs ± 1% -5.22% (p=0.008 n=5+5) [Geo mean] 232µs 221µs -4.54% name old speed new speed delta GobDecode 25.0MB/s ± 8% 25.1MB/s ±27% ~ (p=0.222 n=5+5) GobEncode 32.8MB/s ±21% 38.0MB/s ± 0% +15.84% (p=0.016 n=5+4) Gzip 22.8MB/s ± 0% 23.6MB/s ± 0% +3.49% (p=0.016 n=4+5) Gunzip 121MB/s ± 1% 128MB/s ± 1% +5.68% (p=0.008 n=5+5) JSONEncode 54.4MB/s ± 1% 56.5MB/s ± 1% +3.97% (p=0.008 n=5+5) JSONDecode 7.88MB/s ± 8% 7.65MB/s ± 8% ~ (p=0.548 n=5+5) GoParse 3.07MB/s ± 8% 3.00MB/s ±22% ~ (p=0.579 n=5+5) RegexpMatchEasy0_32 75.6MB/s ± 2% 80.5MB/s ± 0% +6.58% (p=0.008 n=5+5) RegexpMatchEasy0_1K 356MB/s ± 0% 358MB/s ± 1% ~ (p=0.095 n=5+5) RegexpMatchEasy1_32 81.1MB/s ± 2% 86.5MB/s ± 1% +6.69% (p=0.008 n=5+5) RegexpMatchEasy1_1K 314MB/s ± 0% 320MB/s ± 0% +2.10% (p=0.008 n=5+5) RegexpMatchMedium_32 1.77MB/s ± 1% 1.88MB/s ± 0% +6.09% (p=0.016 n=5+4) RegexpMatchMedium_1K 6.99MB/s ± 2% 7.33MB/s ± 1% +4.83% (p=0.008 n=5+5) RegexpMatchHard_32 3.78MB/s ± 1% 4.04MB/s ± 1% +7.04% (p=0.008 n=5+5) RegexpMatchHard_1K 4.04MB/s ± 1% 4.33MB/s ± 2% +7.17% (p=0.008 n=5+5) Revcomp 143MB/s ± 4% 145MB/s ± 5% ~ (p=1.000 n=5+5) Template 6.77MB/s ±24% 7.22MB/s ± 5% ~ (p=0.690 n=5+5) [Geo mean] 24.4MB/s 25.4MB/s +4.18% Change-Id: Ib80716e62992aec28b2c4a96af280c278f83aa49 Reviewed-on: https://go-review.googlesource.com/c/go/+/173980 Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Richard Musiol <neelance@gmail.com>
2019-04-25 17:25:54 -04:00
"PC_B": REG_PC_B,
}
var registerNames []string
func init() {
obj.RegisterRegister(MINREG, MAXREG, rconv)
obj.RegisterOpcode(obj.ABaseWasm, Anames)
registerNames = make([]string, MAXREG-MINREG)
for name, reg := range Register {
registerNames[reg-MINREG] = name
}
}
func rconv(r int) string {
return registerNames[r-MINREG]
}
var unaryDst = map[obj.As]bool{
ASet: true,
ATee: true,
ACall: true,
ACallIndirect: true,
ACallImport: true,
ABr: true,
ABrIf: true,
ABrTable: true,
AI32Store: true,
AI64Store: true,
AF32Store: true,
AF64Store: true,
AI32Store8: true,
AI32Store16: true,
AI64Store8: true,
AI64Store16: true,
AI64Store32: true,
ACALLNORESUME: true,
}
var Linkwasm = obj.LinkArch{
Arch: sys.ArchWasm,
Init: instinit,
Preprocess: preprocess,
Assemble: assemble,
UnaryDst: unaryDst,
}
var (
morestack *obj.LSym
morestackNoCtxt *obj.LSym
gcWriteBarrier *obj.LSym
sigpanic *obj.LSym
deferreturn *obj.LSym
jmpdefer *obj.LSym
)
const (
/* mark flags */
WasmImport = 1 << 0
)
func instinit(ctxt *obj.Link) {
morestack = ctxt.Lookup("runtime.morestack")
morestackNoCtxt = ctxt.Lookup("runtime.morestack_noctxt")
gcWriteBarrier = ctxt.Lookup("runtime.gcWriteBarrier")
cmd/compile: separate data and function LSyms Currently, obj.Ctxt's symbol table does not distinguish between ABI0 and ABIInternal symbols. This is *almost* okay, since a given symbol name in the final object file is only going to belong to one ABI or the other, but it requires that the compiler mark a Sym as being a function symbol before it retrieves its LSym. If it retrieves the LSym first, that LSym will be created as ABI0, and later marking the Sym as a function symbol won't change the LSym's ABI. Marking a Sym as a function symbol before looking up its LSym sounds easy, except Syms have a dual purpose: they are used just as interned strings (every function, variable, parameter, etc with the same textual name shares a Sym), and *also* to store state for whatever package global has that name. As a result, it's easy to slip up and look up an LSym when a Sym is serving as the name of a local variable, and then later mark it as a function when it's serving as the global with the name. In general, we were careful to avoid this, but #29610 demonstrates one case where we messed up. Because of on-demand importing from indexed export data, it's possible to compile a method wrapper for a type imported from another package before importing an init function from that package. If the argument of the method is named "init", the "init" LSym will be created as a data symbol when compiling the wrapper, before it gets marked as a function symbol. To fix this, we separate obj.Ctxt's symbol tables for ABI0 and ABIInternal symbols. This way, the compiler will simply get a different LSym once the Sym takes on its package-global meaning as a function. This fixes the above ordering issue, and means we no longer need to go out of our way to create the "init" function early and mark it as a function symbol. Fixes #29610. Updates #27539. Change-Id: Id9458b40017893d46ef9e4a3f9b47fc49e1ce8df Reviewed-on: https://go-review.googlesource.com/c/157017 Run-TryBot: Austin Clements <austin@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Robert Griesemer <gri@golang.org>
2019-01-08 22:23:52 -05:00
sigpanic = ctxt.LookupABI("runtime.sigpanic", obj.ABIInternal)
deferreturn = ctxt.LookupABI("runtime.deferreturn", obj.ABIInternal)
// jmpdefer is defined in assembly as ABI0, but what we're
// looking for is the *call* to jmpdefer from the Go function
// deferreturn, so we're looking for the ABIInternal version
// of jmpdefer that's called by Go.
jmpdefer = ctxt.LookupABI(`"".jmpdefer`, obj.ABIInternal)
}
func preprocess(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) {
appendp := func(p *obj.Prog, as obj.As, args ...obj.Addr) *obj.Prog {
if p.As != obj.ANOP {
p2 := obj.Appendp(p, newprog)
p2.Pc = p.Pc
p = p2
}
p.As = as
switch len(args) {
case 0:
p.From = obj.Addr{}
p.To = obj.Addr{}
case 1:
if unaryDst[as] {
p.From = obj.Addr{}
p.To = args[0]
} else {
p.From = args[0]
p.To = obj.Addr{}
}
case 2:
p.From = args[0]
p.To = args[1]
default:
panic("bad args")
}
return p
}
framesize := s.Func.Text.To.Offset
if framesize < 0 {
panic("bad framesize")
}
s.Func.Args = s.Func.Text.To.Val.(int32)
s.Func.Locals = int32(framesize)
if s.Func.Text.From.Sym.Wrapper() {
// if g._panic != nil && g._panic.argp == FP {
// g._panic.argp = bottom-of-frame
// }
//
// MOVD g_panic(g), R0
// Get R0
// I64Eqz
// Not
// If
// Get SP
// I64ExtendI32U
// I64Const $framesize+8
// I64Add
// I64Load panic_argp(R0)
// I64Eq
// If
// MOVD SP, panic_argp(R0)
// End
// End
gpanic := obj.Addr{
Type: obj.TYPE_MEM,
Reg: REGG,
Offset: 4 * 8, // g_panic
}
panicargp := obj.Addr{
Type: obj.TYPE_MEM,
Reg: REG_R0,
Offset: 0, // panic.argp
}
p := s.Func.Text
p = appendp(p, AMOVD, gpanic, regAddr(REG_R0))
p = appendp(p, AGet, regAddr(REG_R0))
p = appendp(p, AI64Eqz)
p = appendp(p, ANot)
p = appendp(p, AIf)
p = appendp(p, AGet, regAddr(REG_SP))
p = appendp(p, AI64ExtendI32U)
p = appendp(p, AI64Const, constAddr(framesize+8))
p = appendp(p, AI64Add)
p = appendp(p, AI64Load, panicargp)
p = appendp(p, AI64Eq)
p = appendp(p, AIf)
p = appendp(p, AMOVD, regAddr(REG_SP), panicargp)
p = appendp(p, AEnd)
p = appendp(p, AEnd)
}
if framesize > 0 {
p := s.Func.Text
p = appendp(p, AGet, regAddr(REG_SP))
p = appendp(p, AI32Const, constAddr(framesize))
p = appendp(p, AI32Sub)
p = appendp(p, ASet, regAddr(REG_SP))
p.Spadj = int32(framesize)
}
// Introduce resume points for CALL instructions
// and collect other explicit resume points.
numResumePoints := 0
explicitBlockDepth := 0
pc := int64(0) // pc is only incremented when necessary, this avoids bloat of the BrTable instruction
var tableIdxs []uint64
tablePC := int64(0)
base := ctxt.PosTable.Pos(s.Func.Text.Pos).Base()
for p := s.Func.Text; p != nil; p = p.Link {
prevBase := base
base = ctxt.PosTable.Pos(p.Pos).Base()
switch p.As {
case ABlock, ALoop, AIf:
explicitBlockDepth++
case AEnd:
if explicitBlockDepth == 0 {
panic("End without block")
}
explicitBlockDepth--
case ARESUMEPOINT:
if explicitBlockDepth != 0 {
panic("RESUME can only be used on toplevel")
}
p.As = AEnd
for tablePC <= pc {
tableIdxs = append(tableIdxs, uint64(numResumePoints))
tablePC++
}
numResumePoints++
pc++
case obj.ACALL:
if explicitBlockDepth != 0 {
panic("CALL can only be used on toplevel, try CALLNORESUME instead")
}
appendp(p, ARESUMEPOINT)
}
p.Pc = pc
// Increase pc whenever some pc-value table needs a new entry. Don't increase it
// more often to avoid bloat of the BrTable instruction.
// The "base != prevBase" condition detects inlined instructions. They are an
// implicit call, so entering and leaving this section affects the stack trace.
if p.As == ACALLNORESUME || p.As == obj.ANOP || p.As == ANop || p.Spadj != 0 || base != prevBase {
pc++
if p.To.Sym == sigpanic {
// The panic stack trace expects the PC at the call of sigpanic,
// not the next one. However, runtime.Caller subtracts 1 from the
// PC. To make both PC and PC-1 work (have the same line number),
// we advance the PC by 2 at sigpanic.
pc++
}
}
}
tableIdxs = append(tableIdxs, uint64(numResumePoints))
s.Size = pc + 1
if !s.Func.Text.From.Sym.NoSplit() {
p := s.Func.Text
if framesize <= objabi.StackSmall {
// small stack: SP <= stackguard
// Get SP
// Get g
// I32WrapI64
// I32Load $stackguard0
// I32GtU
p = appendp(p, AGet, regAddr(REG_SP))
p = appendp(p, AGet, regAddr(REGG))
p = appendp(p, AI32WrapI64)
p = appendp(p, AI32Load, constAddr(2*int64(ctxt.Arch.PtrSize))) // G.stackguard0
p = appendp(p, AI32LeU)
} else {
// large stack: SP-framesize <= stackguard-StackSmall
// SP <= stackguard+(framesize-StackSmall)
// Get SP
// Get g
// I32WrapI64
// I32Load $stackguard0
// I32Const $(framesize-StackSmall)
// I32Add
// I32GtU
p = appendp(p, AGet, regAddr(REG_SP))
p = appendp(p, AGet, regAddr(REGG))
p = appendp(p, AI32WrapI64)
p = appendp(p, AI32Load, constAddr(2*int64(ctxt.Arch.PtrSize))) // G.stackguard0
p = appendp(p, AI32Const, constAddr(int64(framesize)-objabi.StackSmall))
p = appendp(p, AI32Add)
p = appendp(p, AI32LeU)
}
// TODO(neelance): handle wraparound case
p = appendp(p, AIf)
p = appendp(p, obj.ACALL, constAddr(0))
if s.Func.Text.From.Sym.NeedCtxt() {
p.To = obj.Addr{Type: obj.TYPE_MEM, Name: obj.NAME_EXTERN, Sym: morestack}
} else {
p.To = obj.Addr{Type: obj.TYPE_MEM, Name: obj.NAME_EXTERN, Sym: morestackNoCtxt}
}
p = appendp(p, AEnd)
}
// record the branches targeting the entry loop and the unwind exit,
// their targets with be filled in later
var entryPointLoopBranches []*obj.Prog
var unwindExitBranches []*obj.Prog
currentDepth := 0
for p := s.Func.Text; p != nil; p = p.Link {
switch p.As {
case ABlock, ALoop, AIf:
currentDepth++
case AEnd:
currentDepth--
}
switch p.As {
case obj.AJMP:
jmp := *p
p.As = obj.ANOP
if jmp.To.Type == obj.TYPE_BRANCH {
// jump to basic block
p = appendp(p, AI32Const, constAddr(jmp.To.Val.(*obj.Prog).Pc))
p = appendp(p, ASet, regAddr(REG_PC_B)) // write next basic block to PC_B
p = appendp(p, ABr) // jump to beginning of entryPointLoop
entryPointLoopBranches = append(entryPointLoopBranches, p)
break
}
// low-level WebAssembly call to function
switch jmp.To.Type {
case obj.TYPE_MEM:
cmd, runtime: remove PC_F & PC_B globals on Wasm Following the previous CL, this removes more global variables on Wasm. PC_B is used mostly for intra-function jumps, and for a function telling its callee where to start or resume. This usage can be served by a parameter. The top level loop (wasm_pc_f_loop) uses PC_B for resuming a function. This value is either set by gogo, or loaded from the Go stack at function return. Instead of loading PC_B at each function return, we could make gogo stores PC_B at the same stack location, and let the top level loop do the load. This way, we don't need to use global PC_B to communicate with the top level loop, and we can replace global PC_B with a parameter. PC_F is similar. It is even more so in that the only reader is the top level loop. Let the top level loop read it from the stack, and we can get rid of PC_F entirely. PC_F and PC_B are used less entensively as SP, so this CL has smaller performance gain. Running on Chrome 74.0.3729.108 on Linux/AMD64, name old time/op new time/op delta BinaryTree17 16.6s ± 0% 16.2s ± 1% -2.59% (p=0.016 n=4+5) Fannkuch11 11.1s ± 1% 10.8s ± 0% -2.65% (p=0.008 n=5+5) FmtFprintfEmpty 231ns ± 1% 217ns ± 0% -6.06% (p=0.008 n=5+5) FmtFprintfString 407ns ± 3% 375ns ± 2% -7.81% (p=0.008 n=5+5) FmtFprintfInt 466ns ± 2% 430ns ± 0% -7.79% (p=0.016 n=5+4) FmtFprintfIntInt 719ns ± 2% 673ns ± 2% -6.37% (p=0.008 n=5+5) FmtFprintfPrefixedInt 706ns ± 1% 676ns ± 3% -4.31% (p=0.008 n=5+5) FmtFprintfFloat 1.01µs ± 1% 0.97µs ± 1% -4.30% (p=0.008 n=5+5) FmtManyArgs 2.67µs ± 1% 2.51µs ± 1% -5.95% (p=0.008 n=5+5) GobDecode 30.7ms ± 9% 31.3ms ±34% ~ (p=0.222 n=5+5) GobEncode 24.2ms ±23% 20.2ms ± 0% -16.36% (p=0.016 n=5+4) Gzip 852ms ± 0% 823ms ± 0% -3.38% (p=0.016 n=4+5) Gunzip 160ms ± 1% 151ms ± 1% -5.37% (p=0.008 n=5+5) JSONEncode 35.7ms ± 1% 34.3ms ± 1% -3.81% (p=0.008 n=5+5) JSONDecode 247ms ± 8% 254ms ± 7% ~ (p=0.548 n=5+5) Mandelbrot200 5.39ms ± 0% 5.41ms ± 0% +0.42% (p=0.008 n=5+5) GoParse 18.5ms ± 1% 18.3ms ± 2% ~ (p=0.343 n=4+4) RegexpMatchEasy0_32 424ns ± 2% 397ns ± 0% -6.23% (p=0.008 n=5+5) RegexpMatchEasy0_1K 2.88µs ± 0% 2.86µs ± 1% ~ (p=0.079 n=5+5) RegexpMatchEasy1_32 395ns ± 2% 370ns ± 1% -6.23% (p=0.008 n=5+5) RegexpMatchEasy1_1K 3.26µs ± 0% 3.19µs ± 1% -2.06% (p=0.008 n=5+5) RegexpMatchMedium_32 564ns ± 1% 532ns ± 0% -5.71% (p=0.008 n=5+5) RegexpMatchMedium_1K 146µs ± 2% 140µs ± 1% -4.62% (p=0.008 n=5+5) RegexpMatchHard_32 8.47µs ± 1% 7.91µs ± 1% -6.65% (p=0.008 n=5+5) RegexpMatchHard_1K 253µs ± 1% 236µs ± 2% -6.66% (p=0.008 n=5+5) Revcomp 1.78s ± 4% 1.76s ± 5% ~ (p=1.000 n=5+5) Template 292ms ±29% 269ms ± 5% ~ (p=0.690 n=5+5) TimeParse 1.61µs ± 4% 1.54µs ± 1% -4.42% (p=0.008 n=5+5) TimeFormat 1.66µs ± 3% 1.58µs ± 1% -5.22% (p=0.008 n=5+5) [Geo mean] 232µs 221µs -4.54% name old speed new speed delta GobDecode 25.0MB/s ± 8% 25.1MB/s ±27% ~ (p=0.222 n=5+5) GobEncode 32.8MB/s ±21% 38.0MB/s ± 0% +15.84% (p=0.016 n=5+4) Gzip 22.8MB/s ± 0% 23.6MB/s ± 0% +3.49% (p=0.016 n=4+5) Gunzip 121MB/s ± 1% 128MB/s ± 1% +5.68% (p=0.008 n=5+5) JSONEncode 54.4MB/s ± 1% 56.5MB/s ± 1% +3.97% (p=0.008 n=5+5) JSONDecode 7.88MB/s ± 8% 7.65MB/s ± 8% ~ (p=0.548 n=5+5) GoParse 3.07MB/s ± 8% 3.00MB/s ±22% ~ (p=0.579 n=5+5) RegexpMatchEasy0_32 75.6MB/s ± 2% 80.5MB/s ± 0% +6.58% (p=0.008 n=5+5) RegexpMatchEasy0_1K 356MB/s ± 0% 358MB/s ± 1% ~ (p=0.095 n=5+5) RegexpMatchEasy1_32 81.1MB/s ± 2% 86.5MB/s ± 1% +6.69% (p=0.008 n=5+5) RegexpMatchEasy1_1K 314MB/s ± 0% 320MB/s ± 0% +2.10% (p=0.008 n=5+5) RegexpMatchMedium_32 1.77MB/s ± 1% 1.88MB/s ± 0% +6.09% (p=0.016 n=5+4) RegexpMatchMedium_1K 6.99MB/s ± 2% 7.33MB/s ± 1% +4.83% (p=0.008 n=5+5) RegexpMatchHard_32 3.78MB/s ± 1% 4.04MB/s ± 1% +7.04% (p=0.008 n=5+5) RegexpMatchHard_1K 4.04MB/s ± 1% 4.33MB/s ± 2% +7.17% (p=0.008 n=5+5) Revcomp 143MB/s ± 4% 145MB/s ± 5% ~ (p=1.000 n=5+5) Template 6.77MB/s ±24% 7.22MB/s ± 5% ~ (p=0.690 n=5+5) [Geo mean] 24.4MB/s 25.4MB/s +4.18% Change-Id: Ib80716e62992aec28b2c4a96af280c278f83aa49 Reviewed-on: https://go-review.googlesource.com/c/go/+/173980 Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Richard Musiol <neelance@gmail.com>
2019-04-25 17:25:54 -04:00
if !notUsePC_B[jmp.To.Sym.Name] {
// Set PC_B parameter to function entry.
p = appendp(p, AI32Const, constAddr(0))
}
p = appendp(p, ACall, jmp.To)
cmd, runtime: remove PC_F & PC_B globals on Wasm Following the previous CL, this removes more global variables on Wasm. PC_B is used mostly for intra-function jumps, and for a function telling its callee where to start or resume. This usage can be served by a parameter. The top level loop (wasm_pc_f_loop) uses PC_B for resuming a function. This value is either set by gogo, or loaded from the Go stack at function return. Instead of loading PC_B at each function return, we could make gogo stores PC_B at the same stack location, and let the top level loop do the load. This way, we don't need to use global PC_B to communicate with the top level loop, and we can replace global PC_B with a parameter. PC_F is similar. It is even more so in that the only reader is the top level loop. Let the top level loop read it from the stack, and we can get rid of PC_F entirely. PC_F and PC_B are used less entensively as SP, so this CL has smaller performance gain. Running on Chrome 74.0.3729.108 on Linux/AMD64, name old time/op new time/op delta BinaryTree17 16.6s ± 0% 16.2s ± 1% -2.59% (p=0.016 n=4+5) Fannkuch11 11.1s ± 1% 10.8s ± 0% -2.65% (p=0.008 n=5+5) FmtFprintfEmpty 231ns ± 1% 217ns ± 0% -6.06% (p=0.008 n=5+5) FmtFprintfString 407ns ± 3% 375ns ± 2% -7.81% (p=0.008 n=5+5) FmtFprintfInt 466ns ± 2% 430ns ± 0% -7.79% (p=0.016 n=5+4) FmtFprintfIntInt 719ns ± 2% 673ns ± 2% -6.37% (p=0.008 n=5+5) FmtFprintfPrefixedInt 706ns ± 1% 676ns ± 3% -4.31% (p=0.008 n=5+5) FmtFprintfFloat 1.01µs ± 1% 0.97µs ± 1% -4.30% (p=0.008 n=5+5) FmtManyArgs 2.67µs ± 1% 2.51µs ± 1% -5.95% (p=0.008 n=5+5) GobDecode 30.7ms ± 9% 31.3ms ±34% ~ (p=0.222 n=5+5) GobEncode 24.2ms ±23% 20.2ms ± 0% -16.36% (p=0.016 n=5+4) Gzip 852ms ± 0% 823ms ± 0% -3.38% (p=0.016 n=4+5) Gunzip 160ms ± 1% 151ms ± 1% -5.37% (p=0.008 n=5+5) JSONEncode 35.7ms ± 1% 34.3ms ± 1% -3.81% (p=0.008 n=5+5) JSONDecode 247ms ± 8% 254ms ± 7% ~ (p=0.548 n=5+5) Mandelbrot200 5.39ms ± 0% 5.41ms ± 0% +0.42% (p=0.008 n=5+5) GoParse 18.5ms ± 1% 18.3ms ± 2% ~ (p=0.343 n=4+4) RegexpMatchEasy0_32 424ns ± 2% 397ns ± 0% -6.23% (p=0.008 n=5+5) RegexpMatchEasy0_1K 2.88µs ± 0% 2.86µs ± 1% ~ (p=0.079 n=5+5) RegexpMatchEasy1_32 395ns ± 2% 370ns ± 1% -6.23% (p=0.008 n=5+5) RegexpMatchEasy1_1K 3.26µs ± 0% 3.19µs ± 1% -2.06% (p=0.008 n=5+5) RegexpMatchMedium_32 564ns ± 1% 532ns ± 0% -5.71% (p=0.008 n=5+5) RegexpMatchMedium_1K 146µs ± 2% 140µs ± 1% -4.62% (p=0.008 n=5+5) RegexpMatchHard_32 8.47µs ± 1% 7.91µs ± 1% -6.65% (p=0.008 n=5+5) RegexpMatchHard_1K 253µs ± 1% 236µs ± 2% -6.66% (p=0.008 n=5+5) Revcomp 1.78s ± 4% 1.76s ± 5% ~ (p=1.000 n=5+5) Template 292ms ±29% 269ms ± 5% ~ (p=0.690 n=5+5) TimeParse 1.61µs ± 4% 1.54µs ± 1% -4.42% (p=0.008 n=5+5) TimeFormat 1.66µs ± 3% 1.58µs ± 1% -5.22% (p=0.008 n=5+5) [Geo mean] 232µs 221µs -4.54% name old speed new speed delta GobDecode 25.0MB/s ± 8% 25.1MB/s ±27% ~ (p=0.222 n=5+5) GobEncode 32.8MB/s ±21% 38.0MB/s ± 0% +15.84% (p=0.016 n=5+4) Gzip 22.8MB/s ± 0% 23.6MB/s ± 0% +3.49% (p=0.016 n=4+5) Gunzip 121MB/s ± 1% 128MB/s ± 1% +5.68% (p=0.008 n=5+5) JSONEncode 54.4MB/s ± 1% 56.5MB/s ± 1% +3.97% (p=0.008 n=5+5) JSONDecode 7.88MB/s ± 8% 7.65MB/s ± 8% ~ (p=0.548 n=5+5) GoParse 3.07MB/s ± 8% 3.00MB/s ±22% ~ (p=0.579 n=5+5) RegexpMatchEasy0_32 75.6MB/s ± 2% 80.5MB/s ± 0% +6.58% (p=0.008 n=5+5) RegexpMatchEasy0_1K 356MB/s ± 0% 358MB/s ± 1% ~ (p=0.095 n=5+5) RegexpMatchEasy1_32 81.1MB/s ± 2% 86.5MB/s ± 1% +6.69% (p=0.008 n=5+5) RegexpMatchEasy1_1K 314MB/s ± 0% 320MB/s ± 0% +2.10% (p=0.008 n=5+5) RegexpMatchMedium_32 1.77MB/s ± 1% 1.88MB/s ± 0% +6.09% (p=0.016 n=5+4) RegexpMatchMedium_1K 6.99MB/s ± 2% 7.33MB/s ± 1% +4.83% (p=0.008 n=5+5) RegexpMatchHard_32 3.78MB/s ± 1% 4.04MB/s ± 1% +7.04% (p=0.008 n=5+5) RegexpMatchHard_1K 4.04MB/s ± 1% 4.33MB/s ± 2% +7.17% (p=0.008 n=5+5) Revcomp 143MB/s ± 4% 145MB/s ± 5% ~ (p=1.000 n=5+5) Template 6.77MB/s ±24% 7.22MB/s ± 5% ~ (p=0.690 n=5+5) [Geo mean] 24.4MB/s 25.4MB/s +4.18% Change-Id: Ib80716e62992aec28b2c4a96af280c278f83aa49 Reviewed-on: https://go-review.googlesource.com/c/go/+/173980 Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Richard Musiol <neelance@gmail.com>
2019-04-25 17:25:54 -04:00
case obj.TYPE_NONE:
// (target PC is on stack)
p = appendp(p, AI32WrapI64)
p = appendp(p, AI32Const, constAddr(16)) // only needs PC_F bits (16-31), PC_B bits (0-15) are zero
p = appendp(p, AI32ShrU)
cmd, runtime: remove PC_F & PC_B globals on Wasm Following the previous CL, this removes more global variables on Wasm. PC_B is used mostly for intra-function jumps, and for a function telling its callee where to start or resume. This usage can be served by a parameter. The top level loop (wasm_pc_f_loop) uses PC_B for resuming a function. This value is either set by gogo, or loaded from the Go stack at function return. Instead of loading PC_B at each function return, we could make gogo stores PC_B at the same stack location, and let the top level loop do the load. This way, we don't need to use global PC_B to communicate with the top level loop, and we can replace global PC_B with a parameter. PC_F is similar. It is even more so in that the only reader is the top level loop. Let the top level loop read it from the stack, and we can get rid of PC_F entirely. PC_F and PC_B are used less entensively as SP, so this CL has smaller performance gain. Running on Chrome 74.0.3729.108 on Linux/AMD64, name old time/op new time/op delta BinaryTree17 16.6s ± 0% 16.2s ± 1% -2.59% (p=0.016 n=4+5) Fannkuch11 11.1s ± 1% 10.8s ± 0% -2.65% (p=0.008 n=5+5) FmtFprintfEmpty 231ns ± 1% 217ns ± 0% -6.06% (p=0.008 n=5+5) FmtFprintfString 407ns ± 3% 375ns ± 2% -7.81% (p=0.008 n=5+5) FmtFprintfInt 466ns ± 2% 430ns ± 0% -7.79% (p=0.016 n=5+4) FmtFprintfIntInt 719ns ± 2% 673ns ± 2% -6.37% (p=0.008 n=5+5) FmtFprintfPrefixedInt 706ns ± 1% 676ns ± 3% -4.31% (p=0.008 n=5+5) FmtFprintfFloat 1.01µs ± 1% 0.97µs ± 1% -4.30% (p=0.008 n=5+5) FmtManyArgs 2.67µs ± 1% 2.51µs ± 1% -5.95% (p=0.008 n=5+5) GobDecode 30.7ms ± 9% 31.3ms ±34% ~ (p=0.222 n=5+5) GobEncode 24.2ms ±23% 20.2ms ± 0% -16.36% (p=0.016 n=5+4) Gzip 852ms ± 0% 823ms ± 0% -3.38% (p=0.016 n=4+5) Gunzip 160ms ± 1% 151ms ± 1% -5.37% (p=0.008 n=5+5) JSONEncode 35.7ms ± 1% 34.3ms ± 1% -3.81% (p=0.008 n=5+5) JSONDecode 247ms ± 8% 254ms ± 7% ~ (p=0.548 n=5+5) Mandelbrot200 5.39ms ± 0% 5.41ms ± 0% +0.42% (p=0.008 n=5+5) GoParse 18.5ms ± 1% 18.3ms ± 2% ~ (p=0.343 n=4+4) RegexpMatchEasy0_32 424ns ± 2% 397ns ± 0% -6.23% (p=0.008 n=5+5) RegexpMatchEasy0_1K 2.88µs ± 0% 2.86µs ± 1% ~ (p=0.079 n=5+5) RegexpMatchEasy1_32 395ns ± 2% 370ns ± 1% -6.23% (p=0.008 n=5+5) RegexpMatchEasy1_1K 3.26µs ± 0% 3.19µs ± 1% -2.06% (p=0.008 n=5+5) RegexpMatchMedium_32 564ns ± 1% 532ns ± 0% -5.71% (p=0.008 n=5+5) RegexpMatchMedium_1K 146µs ± 2% 140µs ± 1% -4.62% (p=0.008 n=5+5) RegexpMatchHard_32 8.47µs ± 1% 7.91µs ± 1% -6.65% (p=0.008 n=5+5) RegexpMatchHard_1K 253µs ± 1% 236µs ± 2% -6.66% (p=0.008 n=5+5) Revcomp 1.78s ± 4% 1.76s ± 5% ~ (p=1.000 n=5+5) Template 292ms ±29% 269ms ± 5% ~ (p=0.690 n=5+5) TimeParse 1.61µs ± 4% 1.54µs ± 1% -4.42% (p=0.008 n=5+5) TimeFormat 1.66µs ± 3% 1.58µs ± 1% -5.22% (p=0.008 n=5+5) [Geo mean] 232µs 221µs -4.54% name old speed new speed delta GobDecode 25.0MB/s ± 8% 25.1MB/s ±27% ~ (p=0.222 n=5+5) GobEncode 32.8MB/s ±21% 38.0MB/s ± 0% +15.84% (p=0.016 n=5+4) Gzip 22.8MB/s ± 0% 23.6MB/s ± 0% +3.49% (p=0.016 n=4+5) Gunzip 121MB/s ± 1% 128MB/s ± 1% +5.68% (p=0.008 n=5+5) JSONEncode 54.4MB/s ± 1% 56.5MB/s ± 1% +3.97% (p=0.008 n=5+5) JSONDecode 7.88MB/s ± 8% 7.65MB/s ± 8% ~ (p=0.548 n=5+5) GoParse 3.07MB/s ± 8% 3.00MB/s ±22% ~ (p=0.579 n=5+5) RegexpMatchEasy0_32 75.6MB/s ± 2% 80.5MB/s ± 0% +6.58% (p=0.008 n=5+5) RegexpMatchEasy0_1K 356MB/s ± 0% 358MB/s ± 1% ~ (p=0.095 n=5+5) RegexpMatchEasy1_32 81.1MB/s ± 2% 86.5MB/s ± 1% +6.69% (p=0.008 n=5+5) RegexpMatchEasy1_1K 314MB/s ± 0% 320MB/s ± 0% +2.10% (p=0.008 n=5+5) RegexpMatchMedium_32 1.77MB/s ± 1% 1.88MB/s ± 0% +6.09% (p=0.016 n=5+4) RegexpMatchMedium_1K 6.99MB/s ± 2% 7.33MB/s ± 1% +4.83% (p=0.008 n=5+5) RegexpMatchHard_32 3.78MB/s ± 1% 4.04MB/s ± 1% +7.04% (p=0.008 n=5+5) RegexpMatchHard_1K 4.04MB/s ± 1% 4.33MB/s ± 2% +7.17% (p=0.008 n=5+5) Revcomp 143MB/s ± 4% 145MB/s ± 5% ~ (p=1.000 n=5+5) Template 6.77MB/s ±24% 7.22MB/s ± 5% ~ (p=0.690 n=5+5) [Geo mean] 24.4MB/s 25.4MB/s +4.18% Change-Id: Ib80716e62992aec28b2c4a96af280c278f83aa49 Reviewed-on: https://go-review.googlesource.com/c/go/+/173980 Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Richard Musiol <neelance@gmail.com>
2019-04-25 17:25:54 -04:00
// Set PC_B parameter to function entry.
// We need to push this before pushing the target PC_F,
// so temporarily pop PC_F, using our REG_PC_B as a
// scratch register, and push it back after pushing 0.
p = appendp(p, ASet, regAddr(REG_PC_B))
p = appendp(p, AI32Const, constAddr(0))
p = appendp(p, AGet, regAddr(REG_PC_B))
p = appendp(p, ACallIndirect)
cmd, runtime: remove PC_F & PC_B globals on Wasm Following the previous CL, this removes more global variables on Wasm. PC_B is used mostly for intra-function jumps, and for a function telling its callee where to start or resume. This usage can be served by a parameter. The top level loop (wasm_pc_f_loop) uses PC_B for resuming a function. This value is either set by gogo, or loaded from the Go stack at function return. Instead of loading PC_B at each function return, we could make gogo stores PC_B at the same stack location, and let the top level loop do the load. This way, we don't need to use global PC_B to communicate with the top level loop, and we can replace global PC_B with a parameter. PC_F is similar. It is even more so in that the only reader is the top level loop. Let the top level loop read it from the stack, and we can get rid of PC_F entirely. PC_F and PC_B are used less entensively as SP, so this CL has smaller performance gain. Running on Chrome 74.0.3729.108 on Linux/AMD64, name old time/op new time/op delta BinaryTree17 16.6s ± 0% 16.2s ± 1% -2.59% (p=0.016 n=4+5) Fannkuch11 11.1s ± 1% 10.8s ± 0% -2.65% (p=0.008 n=5+5) FmtFprintfEmpty 231ns ± 1% 217ns ± 0% -6.06% (p=0.008 n=5+5) FmtFprintfString 407ns ± 3% 375ns ± 2% -7.81% (p=0.008 n=5+5) FmtFprintfInt 466ns ± 2% 430ns ± 0% -7.79% (p=0.016 n=5+4) FmtFprintfIntInt 719ns ± 2% 673ns ± 2% -6.37% (p=0.008 n=5+5) FmtFprintfPrefixedInt 706ns ± 1% 676ns ± 3% -4.31% (p=0.008 n=5+5) FmtFprintfFloat 1.01µs ± 1% 0.97µs ± 1% -4.30% (p=0.008 n=5+5) FmtManyArgs 2.67µs ± 1% 2.51µs ± 1% -5.95% (p=0.008 n=5+5) GobDecode 30.7ms ± 9% 31.3ms ±34% ~ (p=0.222 n=5+5) GobEncode 24.2ms ±23% 20.2ms ± 0% -16.36% (p=0.016 n=5+4) Gzip 852ms ± 0% 823ms ± 0% -3.38% (p=0.016 n=4+5) Gunzip 160ms ± 1% 151ms ± 1% -5.37% (p=0.008 n=5+5) JSONEncode 35.7ms ± 1% 34.3ms ± 1% -3.81% (p=0.008 n=5+5) JSONDecode 247ms ± 8% 254ms ± 7% ~ (p=0.548 n=5+5) Mandelbrot200 5.39ms ± 0% 5.41ms ± 0% +0.42% (p=0.008 n=5+5) GoParse 18.5ms ± 1% 18.3ms ± 2% ~ (p=0.343 n=4+4) RegexpMatchEasy0_32 424ns ± 2% 397ns ± 0% -6.23% (p=0.008 n=5+5) RegexpMatchEasy0_1K 2.88µs ± 0% 2.86µs ± 1% ~ (p=0.079 n=5+5) RegexpMatchEasy1_32 395ns ± 2% 370ns ± 1% -6.23% (p=0.008 n=5+5) RegexpMatchEasy1_1K 3.26µs ± 0% 3.19µs ± 1% -2.06% (p=0.008 n=5+5) RegexpMatchMedium_32 564ns ± 1% 532ns ± 0% -5.71% (p=0.008 n=5+5) RegexpMatchMedium_1K 146µs ± 2% 140µs ± 1% -4.62% (p=0.008 n=5+5) RegexpMatchHard_32 8.47µs ± 1% 7.91µs ± 1% -6.65% (p=0.008 n=5+5) RegexpMatchHard_1K 253µs ± 1% 236µs ± 2% -6.66% (p=0.008 n=5+5) Revcomp 1.78s ± 4% 1.76s ± 5% ~ (p=1.000 n=5+5) Template 292ms ±29% 269ms ± 5% ~ (p=0.690 n=5+5) TimeParse 1.61µs ± 4% 1.54µs ± 1% -4.42% (p=0.008 n=5+5) TimeFormat 1.66µs ± 3% 1.58µs ± 1% -5.22% (p=0.008 n=5+5) [Geo mean] 232µs 221µs -4.54% name old speed new speed delta GobDecode 25.0MB/s ± 8% 25.1MB/s ±27% ~ (p=0.222 n=5+5) GobEncode 32.8MB/s ±21% 38.0MB/s ± 0% +15.84% (p=0.016 n=5+4) Gzip 22.8MB/s ± 0% 23.6MB/s ± 0% +3.49% (p=0.016 n=4+5) Gunzip 121MB/s ± 1% 128MB/s ± 1% +5.68% (p=0.008 n=5+5) JSONEncode 54.4MB/s ± 1% 56.5MB/s ± 1% +3.97% (p=0.008 n=5+5) JSONDecode 7.88MB/s ± 8% 7.65MB/s ± 8% ~ (p=0.548 n=5+5) GoParse 3.07MB/s ± 8% 3.00MB/s ±22% ~ (p=0.579 n=5+5) RegexpMatchEasy0_32 75.6MB/s ± 2% 80.5MB/s ± 0% +6.58% (p=0.008 n=5+5) RegexpMatchEasy0_1K 356MB/s ± 0% 358MB/s ± 1% ~ (p=0.095 n=5+5) RegexpMatchEasy1_32 81.1MB/s ± 2% 86.5MB/s ± 1% +6.69% (p=0.008 n=5+5) RegexpMatchEasy1_1K 314MB/s ± 0% 320MB/s ± 0% +2.10% (p=0.008 n=5+5) RegexpMatchMedium_32 1.77MB/s ± 1% 1.88MB/s ± 0% +6.09% (p=0.016 n=5+4) RegexpMatchMedium_1K 6.99MB/s ± 2% 7.33MB/s ± 1% +4.83% (p=0.008 n=5+5) RegexpMatchHard_32 3.78MB/s ± 1% 4.04MB/s ± 1% +7.04% (p=0.008 n=5+5) RegexpMatchHard_1K 4.04MB/s ± 1% 4.33MB/s ± 2% +7.17% (p=0.008 n=5+5) Revcomp 143MB/s ± 4% 145MB/s ± 5% ~ (p=1.000 n=5+5) Template 6.77MB/s ±24% 7.22MB/s ± 5% ~ (p=0.690 n=5+5) [Geo mean] 24.4MB/s 25.4MB/s +4.18% Change-Id: Ib80716e62992aec28b2c4a96af280c278f83aa49 Reviewed-on: https://go-review.googlesource.com/c/go/+/173980 Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Richard Musiol <neelance@gmail.com>
2019-04-25 17:25:54 -04:00
default:
panic("bad target for JMP")
}
p = appendp(p, AReturn)
case obj.ACALL, ACALLNORESUME:
call := *p
p.As = obj.ANOP
pcAfterCall := call.Link.Pc
if call.To.Sym == sigpanic {
pcAfterCall-- // sigpanic expects to be called without advancing the pc
}
// jmpdefer manipulates the return address on the stack so deferreturn gets called repeatedly.
// Model this in WebAssembly with a loop.
if call.To.Sym == deferreturn {
p = appendp(p, ALoop)
}
// SP -= 8
p = appendp(p, AGet, regAddr(REG_SP))
p = appendp(p, AI32Const, constAddr(8))
p = appendp(p, AI32Sub)
p = appendp(p, ASet, regAddr(REG_SP))
// write return address to Go stack
p = appendp(p, AGet, regAddr(REG_SP))
p = appendp(p, AI64Const, obj.Addr{
Type: obj.TYPE_ADDR,
Name: obj.NAME_EXTERN,
Sym: s, // PC_F
Offset: pcAfterCall, // PC_B
})
p = appendp(p, AI64Store, constAddr(0))
// low-level WebAssembly call to function
switch call.To.Type {
case obj.TYPE_MEM:
cmd, runtime: remove PC_F & PC_B globals on Wasm Following the previous CL, this removes more global variables on Wasm. PC_B is used mostly for intra-function jumps, and for a function telling its callee where to start or resume. This usage can be served by a parameter. The top level loop (wasm_pc_f_loop) uses PC_B for resuming a function. This value is either set by gogo, or loaded from the Go stack at function return. Instead of loading PC_B at each function return, we could make gogo stores PC_B at the same stack location, and let the top level loop do the load. This way, we don't need to use global PC_B to communicate with the top level loop, and we can replace global PC_B with a parameter. PC_F is similar. It is even more so in that the only reader is the top level loop. Let the top level loop read it from the stack, and we can get rid of PC_F entirely. PC_F and PC_B are used less entensively as SP, so this CL has smaller performance gain. Running on Chrome 74.0.3729.108 on Linux/AMD64, name old time/op new time/op delta BinaryTree17 16.6s ± 0% 16.2s ± 1% -2.59% (p=0.016 n=4+5) Fannkuch11 11.1s ± 1% 10.8s ± 0% -2.65% (p=0.008 n=5+5) FmtFprintfEmpty 231ns ± 1% 217ns ± 0% -6.06% (p=0.008 n=5+5) FmtFprintfString 407ns ± 3% 375ns ± 2% -7.81% (p=0.008 n=5+5) FmtFprintfInt 466ns ± 2% 430ns ± 0% -7.79% (p=0.016 n=5+4) FmtFprintfIntInt 719ns ± 2% 673ns ± 2% -6.37% (p=0.008 n=5+5) FmtFprintfPrefixedInt 706ns ± 1% 676ns ± 3% -4.31% (p=0.008 n=5+5) FmtFprintfFloat 1.01µs ± 1% 0.97µs ± 1% -4.30% (p=0.008 n=5+5) FmtManyArgs 2.67µs ± 1% 2.51µs ± 1% -5.95% (p=0.008 n=5+5) GobDecode 30.7ms ± 9% 31.3ms ±34% ~ (p=0.222 n=5+5) GobEncode 24.2ms ±23% 20.2ms ± 0% -16.36% (p=0.016 n=5+4) Gzip 852ms ± 0% 823ms ± 0% -3.38% (p=0.016 n=4+5) Gunzip 160ms ± 1% 151ms ± 1% -5.37% (p=0.008 n=5+5) JSONEncode 35.7ms ± 1% 34.3ms ± 1% -3.81% (p=0.008 n=5+5) JSONDecode 247ms ± 8% 254ms ± 7% ~ (p=0.548 n=5+5) Mandelbrot200 5.39ms ± 0% 5.41ms ± 0% +0.42% (p=0.008 n=5+5) GoParse 18.5ms ± 1% 18.3ms ± 2% ~ (p=0.343 n=4+4) RegexpMatchEasy0_32 424ns ± 2% 397ns ± 0% -6.23% (p=0.008 n=5+5) RegexpMatchEasy0_1K 2.88µs ± 0% 2.86µs ± 1% ~ (p=0.079 n=5+5) RegexpMatchEasy1_32 395ns ± 2% 370ns ± 1% -6.23% (p=0.008 n=5+5) RegexpMatchEasy1_1K 3.26µs ± 0% 3.19µs ± 1% -2.06% (p=0.008 n=5+5) RegexpMatchMedium_32 564ns ± 1% 532ns ± 0% -5.71% (p=0.008 n=5+5) RegexpMatchMedium_1K 146µs ± 2% 140µs ± 1% -4.62% (p=0.008 n=5+5) RegexpMatchHard_32 8.47µs ± 1% 7.91µs ± 1% -6.65% (p=0.008 n=5+5) RegexpMatchHard_1K 253µs ± 1% 236µs ± 2% -6.66% (p=0.008 n=5+5) Revcomp 1.78s ± 4% 1.76s ± 5% ~ (p=1.000 n=5+5) Template 292ms ±29% 269ms ± 5% ~ (p=0.690 n=5+5) TimeParse 1.61µs ± 4% 1.54µs ± 1% -4.42% (p=0.008 n=5+5) TimeFormat 1.66µs ± 3% 1.58µs ± 1% -5.22% (p=0.008 n=5+5) [Geo mean] 232µs 221µs -4.54% name old speed new speed delta GobDecode 25.0MB/s ± 8% 25.1MB/s ±27% ~ (p=0.222 n=5+5) GobEncode 32.8MB/s ±21% 38.0MB/s ± 0% +15.84% (p=0.016 n=5+4) Gzip 22.8MB/s ± 0% 23.6MB/s ± 0% +3.49% (p=0.016 n=4+5) Gunzip 121MB/s ± 1% 128MB/s ± 1% +5.68% (p=0.008 n=5+5) JSONEncode 54.4MB/s ± 1% 56.5MB/s ± 1% +3.97% (p=0.008 n=5+5) JSONDecode 7.88MB/s ± 8% 7.65MB/s ± 8% ~ (p=0.548 n=5+5) GoParse 3.07MB/s ± 8% 3.00MB/s ±22% ~ (p=0.579 n=5+5) RegexpMatchEasy0_32 75.6MB/s ± 2% 80.5MB/s ± 0% +6.58% (p=0.008 n=5+5) RegexpMatchEasy0_1K 356MB/s ± 0% 358MB/s ± 1% ~ (p=0.095 n=5+5) RegexpMatchEasy1_32 81.1MB/s ± 2% 86.5MB/s ± 1% +6.69% (p=0.008 n=5+5) RegexpMatchEasy1_1K 314MB/s ± 0% 320MB/s ± 0% +2.10% (p=0.008 n=5+5) RegexpMatchMedium_32 1.77MB/s ± 1% 1.88MB/s ± 0% +6.09% (p=0.016 n=5+4) RegexpMatchMedium_1K 6.99MB/s ± 2% 7.33MB/s ± 1% +4.83% (p=0.008 n=5+5) RegexpMatchHard_32 3.78MB/s ± 1% 4.04MB/s ± 1% +7.04% (p=0.008 n=5+5) RegexpMatchHard_1K 4.04MB/s ± 1% 4.33MB/s ± 2% +7.17% (p=0.008 n=5+5) Revcomp 143MB/s ± 4% 145MB/s ± 5% ~ (p=1.000 n=5+5) Template 6.77MB/s ±24% 7.22MB/s ± 5% ~ (p=0.690 n=5+5) [Geo mean] 24.4MB/s 25.4MB/s +4.18% Change-Id: Ib80716e62992aec28b2c4a96af280c278f83aa49 Reviewed-on: https://go-review.googlesource.com/c/go/+/173980 Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Richard Musiol <neelance@gmail.com>
2019-04-25 17:25:54 -04:00
if !notUsePC_B[call.To.Sym.Name] {
// Set PC_B parameter to function entry.
p = appendp(p, AI32Const, constAddr(0))
}
p = appendp(p, ACall, call.To)
cmd, runtime: remove PC_F & PC_B globals on Wasm Following the previous CL, this removes more global variables on Wasm. PC_B is used mostly for intra-function jumps, and for a function telling its callee where to start or resume. This usage can be served by a parameter. The top level loop (wasm_pc_f_loop) uses PC_B for resuming a function. This value is either set by gogo, or loaded from the Go stack at function return. Instead of loading PC_B at each function return, we could make gogo stores PC_B at the same stack location, and let the top level loop do the load. This way, we don't need to use global PC_B to communicate with the top level loop, and we can replace global PC_B with a parameter. PC_F is similar. It is even more so in that the only reader is the top level loop. Let the top level loop read it from the stack, and we can get rid of PC_F entirely. PC_F and PC_B are used less entensively as SP, so this CL has smaller performance gain. Running on Chrome 74.0.3729.108 on Linux/AMD64, name old time/op new time/op delta BinaryTree17 16.6s ± 0% 16.2s ± 1% -2.59% (p=0.016 n=4+5) Fannkuch11 11.1s ± 1% 10.8s ± 0% -2.65% (p=0.008 n=5+5) FmtFprintfEmpty 231ns ± 1% 217ns ± 0% -6.06% (p=0.008 n=5+5) FmtFprintfString 407ns ± 3% 375ns ± 2% -7.81% (p=0.008 n=5+5) FmtFprintfInt 466ns ± 2% 430ns ± 0% -7.79% (p=0.016 n=5+4) FmtFprintfIntInt 719ns ± 2% 673ns ± 2% -6.37% (p=0.008 n=5+5) FmtFprintfPrefixedInt 706ns ± 1% 676ns ± 3% -4.31% (p=0.008 n=5+5) FmtFprintfFloat 1.01µs ± 1% 0.97µs ± 1% -4.30% (p=0.008 n=5+5) FmtManyArgs 2.67µs ± 1% 2.51µs ± 1% -5.95% (p=0.008 n=5+5) GobDecode 30.7ms ± 9% 31.3ms ±34% ~ (p=0.222 n=5+5) GobEncode 24.2ms ±23% 20.2ms ± 0% -16.36% (p=0.016 n=5+4) Gzip 852ms ± 0% 823ms ± 0% -3.38% (p=0.016 n=4+5) Gunzip 160ms ± 1% 151ms ± 1% -5.37% (p=0.008 n=5+5) JSONEncode 35.7ms ± 1% 34.3ms ± 1% -3.81% (p=0.008 n=5+5) JSONDecode 247ms ± 8% 254ms ± 7% ~ (p=0.548 n=5+5) Mandelbrot200 5.39ms ± 0% 5.41ms ± 0% +0.42% (p=0.008 n=5+5) GoParse 18.5ms ± 1% 18.3ms ± 2% ~ (p=0.343 n=4+4) RegexpMatchEasy0_32 424ns ± 2% 397ns ± 0% -6.23% (p=0.008 n=5+5) RegexpMatchEasy0_1K 2.88µs ± 0% 2.86µs ± 1% ~ (p=0.079 n=5+5) RegexpMatchEasy1_32 395ns ± 2% 370ns ± 1% -6.23% (p=0.008 n=5+5) RegexpMatchEasy1_1K 3.26µs ± 0% 3.19µs ± 1% -2.06% (p=0.008 n=5+5) RegexpMatchMedium_32 564ns ± 1% 532ns ± 0% -5.71% (p=0.008 n=5+5) RegexpMatchMedium_1K 146µs ± 2% 140µs ± 1% -4.62% (p=0.008 n=5+5) RegexpMatchHard_32 8.47µs ± 1% 7.91µs ± 1% -6.65% (p=0.008 n=5+5) RegexpMatchHard_1K 253µs ± 1% 236µs ± 2% -6.66% (p=0.008 n=5+5) Revcomp 1.78s ± 4% 1.76s ± 5% ~ (p=1.000 n=5+5) Template 292ms ±29% 269ms ± 5% ~ (p=0.690 n=5+5) TimeParse 1.61µs ± 4% 1.54µs ± 1% -4.42% (p=0.008 n=5+5) TimeFormat 1.66µs ± 3% 1.58µs ± 1% -5.22% (p=0.008 n=5+5) [Geo mean] 232µs 221µs -4.54% name old speed new speed delta GobDecode 25.0MB/s ± 8% 25.1MB/s ±27% ~ (p=0.222 n=5+5) GobEncode 32.8MB/s ±21% 38.0MB/s ± 0% +15.84% (p=0.016 n=5+4) Gzip 22.8MB/s ± 0% 23.6MB/s ± 0% +3.49% (p=0.016 n=4+5) Gunzip 121MB/s ± 1% 128MB/s ± 1% +5.68% (p=0.008 n=5+5) JSONEncode 54.4MB/s ± 1% 56.5MB/s ± 1% +3.97% (p=0.008 n=5+5) JSONDecode 7.88MB/s ± 8% 7.65MB/s ± 8% ~ (p=0.548 n=5+5) GoParse 3.07MB/s ± 8% 3.00MB/s ±22% ~ (p=0.579 n=5+5) RegexpMatchEasy0_32 75.6MB/s ± 2% 80.5MB/s ± 0% +6.58% (p=0.008 n=5+5) RegexpMatchEasy0_1K 356MB/s ± 0% 358MB/s ± 1% ~ (p=0.095 n=5+5) RegexpMatchEasy1_32 81.1MB/s ± 2% 86.5MB/s ± 1% +6.69% (p=0.008 n=5+5) RegexpMatchEasy1_1K 314MB/s ± 0% 320MB/s ± 0% +2.10% (p=0.008 n=5+5) RegexpMatchMedium_32 1.77MB/s ± 1% 1.88MB/s ± 0% +6.09% (p=0.016 n=5+4) RegexpMatchMedium_1K 6.99MB/s ± 2% 7.33MB/s ± 1% +4.83% (p=0.008 n=5+5) RegexpMatchHard_32 3.78MB/s ± 1% 4.04MB/s ± 1% +7.04% (p=0.008 n=5+5) RegexpMatchHard_1K 4.04MB/s ± 1% 4.33MB/s ± 2% +7.17% (p=0.008 n=5+5) Revcomp 143MB/s ± 4% 145MB/s ± 5% ~ (p=1.000 n=5+5) Template 6.77MB/s ±24% 7.22MB/s ± 5% ~ (p=0.690 n=5+5) [Geo mean] 24.4MB/s 25.4MB/s +4.18% Change-Id: Ib80716e62992aec28b2c4a96af280c278f83aa49 Reviewed-on: https://go-review.googlesource.com/c/go/+/173980 Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Richard Musiol <neelance@gmail.com>
2019-04-25 17:25:54 -04:00
case obj.TYPE_NONE:
// (target PC is on stack)
p = appendp(p, AI32WrapI64)
p = appendp(p, AI32Const, constAddr(16)) // only needs PC_F bits (16-31), PC_B bits (0-15) are zero
p = appendp(p, AI32ShrU)
cmd, runtime: remove PC_F & PC_B globals on Wasm Following the previous CL, this removes more global variables on Wasm. PC_B is used mostly for intra-function jumps, and for a function telling its callee where to start or resume. This usage can be served by a parameter. The top level loop (wasm_pc_f_loop) uses PC_B for resuming a function. This value is either set by gogo, or loaded from the Go stack at function return. Instead of loading PC_B at each function return, we could make gogo stores PC_B at the same stack location, and let the top level loop do the load. This way, we don't need to use global PC_B to communicate with the top level loop, and we can replace global PC_B with a parameter. PC_F is similar. It is even more so in that the only reader is the top level loop. Let the top level loop read it from the stack, and we can get rid of PC_F entirely. PC_F and PC_B are used less entensively as SP, so this CL has smaller performance gain. Running on Chrome 74.0.3729.108 on Linux/AMD64, name old time/op new time/op delta BinaryTree17 16.6s ± 0% 16.2s ± 1% -2.59% (p=0.016 n=4+5) Fannkuch11 11.1s ± 1% 10.8s ± 0% -2.65% (p=0.008 n=5+5) FmtFprintfEmpty 231ns ± 1% 217ns ± 0% -6.06% (p=0.008 n=5+5) FmtFprintfString 407ns ± 3% 375ns ± 2% -7.81% (p=0.008 n=5+5) FmtFprintfInt 466ns ± 2% 430ns ± 0% -7.79% (p=0.016 n=5+4) FmtFprintfIntInt 719ns ± 2% 673ns ± 2% -6.37% (p=0.008 n=5+5) FmtFprintfPrefixedInt 706ns ± 1% 676ns ± 3% -4.31% (p=0.008 n=5+5) FmtFprintfFloat 1.01µs ± 1% 0.97µs ± 1% -4.30% (p=0.008 n=5+5) FmtManyArgs 2.67µs ± 1% 2.51µs ± 1% -5.95% (p=0.008 n=5+5) GobDecode 30.7ms ± 9% 31.3ms ±34% ~ (p=0.222 n=5+5) GobEncode 24.2ms ±23% 20.2ms ± 0% -16.36% (p=0.016 n=5+4) Gzip 852ms ± 0% 823ms ± 0% -3.38% (p=0.016 n=4+5) Gunzip 160ms ± 1% 151ms ± 1% -5.37% (p=0.008 n=5+5) JSONEncode 35.7ms ± 1% 34.3ms ± 1% -3.81% (p=0.008 n=5+5) JSONDecode 247ms ± 8% 254ms ± 7% ~ (p=0.548 n=5+5) Mandelbrot200 5.39ms ± 0% 5.41ms ± 0% +0.42% (p=0.008 n=5+5) GoParse 18.5ms ± 1% 18.3ms ± 2% ~ (p=0.343 n=4+4) RegexpMatchEasy0_32 424ns ± 2% 397ns ± 0% -6.23% (p=0.008 n=5+5) RegexpMatchEasy0_1K 2.88µs ± 0% 2.86µs ± 1% ~ (p=0.079 n=5+5) RegexpMatchEasy1_32 395ns ± 2% 370ns ± 1% -6.23% (p=0.008 n=5+5) RegexpMatchEasy1_1K 3.26µs ± 0% 3.19µs ± 1% -2.06% (p=0.008 n=5+5) RegexpMatchMedium_32 564ns ± 1% 532ns ± 0% -5.71% (p=0.008 n=5+5) RegexpMatchMedium_1K 146µs ± 2% 140µs ± 1% -4.62% (p=0.008 n=5+5) RegexpMatchHard_32 8.47µs ± 1% 7.91µs ± 1% -6.65% (p=0.008 n=5+5) RegexpMatchHard_1K 253µs ± 1% 236µs ± 2% -6.66% (p=0.008 n=5+5) Revcomp 1.78s ± 4% 1.76s ± 5% ~ (p=1.000 n=5+5) Template 292ms ±29% 269ms ± 5% ~ (p=0.690 n=5+5) TimeParse 1.61µs ± 4% 1.54µs ± 1% -4.42% (p=0.008 n=5+5) TimeFormat 1.66µs ± 3% 1.58µs ± 1% -5.22% (p=0.008 n=5+5) [Geo mean] 232µs 221µs -4.54% name old speed new speed delta GobDecode 25.0MB/s ± 8% 25.1MB/s ±27% ~ (p=0.222 n=5+5) GobEncode 32.8MB/s ±21% 38.0MB/s ± 0% +15.84% (p=0.016 n=5+4) Gzip 22.8MB/s ± 0% 23.6MB/s ± 0% +3.49% (p=0.016 n=4+5) Gunzip 121MB/s ± 1% 128MB/s ± 1% +5.68% (p=0.008 n=5+5) JSONEncode 54.4MB/s ± 1% 56.5MB/s ± 1% +3.97% (p=0.008 n=5+5) JSONDecode 7.88MB/s ± 8% 7.65MB/s ± 8% ~ (p=0.548 n=5+5) GoParse 3.07MB/s ± 8% 3.00MB/s ±22% ~ (p=0.579 n=5+5) RegexpMatchEasy0_32 75.6MB/s ± 2% 80.5MB/s ± 0% +6.58% (p=0.008 n=5+5) RegexpMatchEasy0_1K 356MB/s ± 0% 358MB/s ± 1% ~ (p=0.095 n=5+5) RegexpMatchEasy1_32 81.1MB/s ± 2% 86.5MB/s ± 1% +6.69% (p=0.008 n=5+5) RegexpMatchEasy1_1K 314MB/s ± 0% 320MB/s ± 0% +2.10% (p=0.008 n=5+5) RegexpMatchMedium_32 1.77MB/s ± 1% 1.88MB/s ± 0% +6.09% (p=0.016 n=5+4) RegexpMatchMedium_1K 6.99MB/s ± 2% 7.33MB/s ± 1% +4.83% (p=0.008 n=5+5) RegexpMatchHard_32 3.78MB/s ± 1% 4.04MB/s ± 1% +7.04% (p=0.008 n=5+5) RegexpMatchHard_1K 4.04MB/s ± 1% 4.33MB/s ± 2% +7.17% (p=0.008 n=5+5) Revcomp 143MB/s ± 4% 145MB/s ± 5% ~ (p=1.000 n=5+5) Template 6.77MB/s ±24% 7.22MB/s ± 5% ~ (p=0.690 n=5+5) [Geo mean] 24.4MB/s 25.4MB/s +4.18% Change-Id: Ib80716e62992aec28b2c4a96af280c278f83aa49 Reviewed-on: https://go-review.googlesource.com/c/go/+/173980 Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Richard Musiol <neelance@gmail.com>
2019-04-25 17:25:54 -04:00
// Set PC_B parameter to function entry.
// We need to push this before pushing the target PC_F,
// so temporarily pop PC_F, using our PC_B as a
// scratch register, and push it back after pushing 0.
p = appendp(p, ASet, regAddr(REG_PC_B))
p = appendp(p, AI32Const, constAddr(0))
p = appendp(p, AGet, regAddr(REG_PC_B))
p = appendp(p, ACallIndirect)
cmd, runtime: remove PC_F & PC_B globals on Wasm Following the previous CL, this removes more global variables on Wasm. PC_B is used mostly for intra-function jumps, and for a function telling its callee where to start or resume. This usage can be served by a parameter. The top level loop (wasm_pc_f_loop) uses PC_B for resuming a function. This value is either set by gogo, or loaded from the Go stack at function return. Instead of loading PC_B at each function return, we could make gogo stores PC_B at the same stack location, and let the top level loop do the load. This way, we don't need to use global PC_B to communicate with the top level loop, and we can replace global PC_B with a parameter. PC_F is similar. It is even more so in that the only reader is the top level loop. Let the top level loop read it from the stack, and we can get rid of PC_F entirely. PC_F and PC_B are used less entensively as SP, so this CL has smaller performance gain. Running on Chrome 74.0.3729.108 on Linux/AMD64, name old time/op new time/op delta BinaryTree17 16.6s ± 0% 16.2s ± 1% -2.59% (p=0.016 n=4+5) Fannkuch11 11.1s ± 1% 10.8s ± 0% -2.65% (p=0.008 n=5+5) FmtFprintfEmpty 231ns ± 1% 217ns ± 0% -6.06% (p=0.008 n=5+5) FmtFprintfString 407ns ± 3% 375ns ± 2% -7.81% (p=0.008 n=5+5) FmtFprintfInt 466ns ± 2% 430ns ± 0% -7.79% (p=0.016 n=5+4) FmtFprintfIntInt 719ns ± 2% 673ns ± 2% -6.37% (p=0.008 n=5+5) FmtFprintfPrefixedInt 706ns ± 1% 676ns ± 3% -4.31% (p=0.008 n=5+5) FmtFprintfFloat 1.01µs ± 1% 0.97µs ± 1% -4.30% (p=0.008 n=5+5) FmtManyArgs 2.67µs ± 1% 2.51µs ± 1% -5.95% (p=0.008 n=5+5) GobDecode 30.7ms ± 9% 31.3ms ±34% ~ (p=0.222 n=5+5) GobEncode 24.2ms ±23% 20.2ms ± 0% -16.36% (p=0.016 n=5+4) Gzip 852ms ± 0% 823ms ± 0% -3.38% (p=0.016 n=4+5) Gunzip 160ms ± 1% 151ms ± 1% -5.37% (p=0.008 n=5+5) JSONEncode 35.7ms ± 1% 34.3ms ± 1% -3.81% (p=0.008 n=5+5) JSONDecode 247ms ± 8% 254ms ± 7% ~ (p=0.548 n=5+5) Mandelbrot200 5.39ms ± 0% 5.41ms ± 0% +0.42% (p=0.008 n=5+5) GoParse 18.5ms ± 1% 18.3ms ± 2% ~ (p=0.343 n=4+4) RegexpMatchEasy0_32 424ns ± 2% 397ns ± 0% -6.23% (p=0.008 n=5+5) RegexpMatchEasy0_1K 2.88µs ± 0% 2.86µs ± 1% ~ (p=0.079 n=5+5) RegexpMatchEasy1_32 395ns ± 2% 370ns ± 1% -6.23% (p=0.008 n=5+5) RegexpMatchEasy1_1K 3.26µs ± 0% 3.19µs ± 1% -2.06% (p=0.008 n=5+5) RegexpMatchMedium_32 564ns ± 1% 532ns ± 0% -5.71% (p=0.008 n=5+5) RegexpMatchMedium_1K 146µs ± 2% 140µs ± 1% -4.62% (p=0.008 n=5+5) RegexpMatchHard_32 8.47µs ± 1% 7.91µs ± 1% -6.65% (p=0.008 n=5+5) RegexpMatchHard_1K 253µs ± 1% 236µs ± 2% -6.66% (p=0.008 n=5+5) Revcomp 1.78s ± 4% 1.76s ± 5% ~ (p=1.000 n=5+5) Template 292ms ±29% 269ms ± 5% ~ (p=0.690 n=5+5) TimeParse 1.61µs ± 4% 1.54µs ± 1% -4.42% (p=0.008 n=5+5) TimeFormat 1.66µs ± 3% 1.58µs ± 1% -5.22% (p=0.008 n=5+5) [Geo mean] 232µs 221µs -4.54% name old speed new speed delta GobDecode 25.0MB/s ± 8% 25.1MB/s ±27% ~ (p=0.222 n=5+5) GobEncode 32.8MB/s ±21% 38.0MB/s ± 0% +15.84% (p=0.016 n=5+4) Gzip 22.8MB/s ± 0% 23.6MB/s ± 0% +3.49% (p=0.016 n=4+5) Gunzip 121MB/s ± 1% 128MB/s ± 1% +5.68% (p=0.008 n=5+5) JSONEncode 54.4MB/s ± 1% 56.5MB/s ± 1% +3.97% (p=0.008 n=5+5) JSONDecode 7.88MB/s ± 8% 7.65MB/s ± 8% ~ (p=0.548 n=5+5) GoParse 3.07MB/s ± 8% 3.00MB/s ±22% ~ (p=0.579 n=5+5) RegexpMatchEasy0_32 75.6MB/s ± 2% 80.5MB/s ± 0% +6.58% (p=0.008 n=5+5) RegexpMatchEasy0_1K 356MB/s ± 0% 358MB/s ± 1% ~ (p=0.095 n=5+5) RegexpMatchEasy1_32 81.1MB/s ± 2% 86.5MB/s ± 1% +6.69% (p=0.008 n=5+5) RegexpMatchEasy1_1K 314MB/s ± 0% 320MB/s ± 0% +2.10% (p=0.008 n=5+5) RegexpMatchMedium_32 1.77MB/s ± 1% 1.88MB/s ± 0% +6.09% (p=0.016 n=5+4) RegexpMatchMedium_1K 6.99MB/s ± 2% 7.33MB/s ± 1% +4.83% (p=0.008 n=5+5) RegexpMatchHard_32 3.78MB/s ± 1% 4.04MB/s ± 1% +7.04% (p=0.008 n=5+5) RegexpMatchHard_1K 4.04MB/s ± 1% 4.33MB/s ± 2% +7.17% (p=0.008 n=5+5) Revcomp 143MB/s ± 4% 145MB/s ± 5% ~ (p=1.000 n=5+5) Template 6.77MB/s ±24% 7.22MB/s ± 5% ~ (p=0.690 n=5+5) [Geo mean] 24.4MB/s 25.4MB/s +4.18% Change-Id: Ib80716e62992aec28b2c4a96af280c278f83aa49 Reviewed-on: https://go-review.googlesource.com/c/go/+/173980 Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Richard Musiol <neelance@gmail.com>
2019-04-25 17:25:54 -04:00
default:
panic("bad target for CALL")
}
// gcWriteBarrier has no return value, it never unwinds the stack
if call.To.Sym == gcWriteBarrier {
break
}
// jmpdefer removes the frame of deferreturn from the Go stack.
// However, its WebAssembly function still returns normally,
// so we need to return from deferreturn without removing its
// stack frame (no RET), because the frame is already gone.
if call.To.Sym == jmpdefer {
p = appendp(p, AReturn)
break
}
// return value of call is on the top of the stack, indicating whether to unwind the WebAssembly stack
if call.As == ACALLNORESUME && call.To.Sym != sigpanic { // sigpanic unwinds the stack, but it never resumes
// trying to unwind WebAssembly stack but call has no resume point, terminate with error
p = appendp(p, AIf)
p = appendp(p, obj.AUNDEF)
p = appendp(p, AEnd)
} else {
// unwinding WebAssembly stack to switch goroutine, return 1
p = appendp(p, ABrIf)
unwindExitBranches = append(unwindExitBranches, p)
}
// jump to before the call if jmpdefer has reset the return address to the call's PC
if call.To.Sym == deferreturn {
cmd, runtime: remove PC_F & PC_B globals on Wasm Following the previous CL, this removes more global variables on Wasm. PC_B is used mostly for intra-function jumps, and for a function telling its callee where to start or resume. This usage can be served by a parameter. The top level loop (wasm_pc_f_loop) uses PC_B for resuming a function. This value is either set by gogo, or loaded from the Go stack at function return. Instead of loading PC_B at each function return, we could make gogo stores PC_B at the same stack location, and let the top level loop do the load. This way, we don't need to use global PC_B to communicate with the top level loop, and we can replace global PC_B with a parameter. PC_F is similar. It is even more so in that the only reader is the top level loop. Let the top level loop read it from the stack, and we can get rid of PC_F entirely. PC_F and PC_B are used less entensively as SP, so this CL has smaller performance gain. Running on Chrome 74.0.3729.108 on Linux/AMD64, name old time/op new time/op delta BinaryTree17 16.6s ± 0% 16.2s ± 1% -2.59% (p=0.016 n=4+5) Fannkuch11 11.1s ± 1% 10.8s ± 0% -2.65% (p=0.008 n=5+5) FmtFprintfEmpty 231ns ± 1% 217ns ± 0% -6.06% (p=0.008 n=5+5) FmtFprintfString 407ns ± 3% 375ns ± 2% -7.81% (p=0.008 n=5+5) FmtFprintfInt 466ns ± 2% 430ns ± 0% -7.79% (p=0.016 n=5+4) FmtFprintfIntInt 719ns ± 2% 673ns ± 2% -6.37% (p=0.008 n=5+5) FmtFprintfPrefixedInt 706ns ± 1% 676ns ± 3% -4.31% (p=0.008 n=5+5) FmtFprintfFloat 1.01µs ± 1% 0.97µs ± 1% -4.30% (p=0.008 n=5+5) FmtManyArgs 2.67µs ± 1% 2.51µs ± 1% -5.95% (p=0.008 n=5+5) GobDecode 30.7ms ± 9% 31.3ms ±34% ~ (p=0.222 n=5+5) GobEncode 24.2ms ±23% 20.2ms ± 0% -16.36% (p=0.016 n=5+4) Gzip 852ms ± 0% 823ms ± 0% -3.38% (p=0.016 n=4+5) Gunzip 160ms ± 1% 151ms ± 1% -5.37% (p=0.008 n=5+5) JSONEncode 35.7ms ± 1% 34.3ms ± 1% -3.81% (p=0.008 n=5+5) JSONDecode 247ms ± 8% 254ms ± 7% ~ (p=0.548 n=5+5) Mandelbrot200 5.39ms ± 0% 5.41ms ± 0% +0.42% (p=0.008 n=5+5) GoParse 18.5ms ± 1% 18.3ms ± 2% ~ (p=0.343 n=4+4) RegexpMatchEasy0_32 424ns ± 2% 397ns ± 0% -6.23% (p=0.008 n=5+5) RegexpMatchEasy0_1K 2.88µs ± 0% 2.86µs ± 1% ~ (p=0.079 n=5+5) RegexpMatchEasy1_32 395ns ± 2% 370ns ± 1% -6.23% (p=0.008 n=5+5) RegexpMatchEasy1_1K 3.26µs ± 0% 3.19µs ± 1% -2.06% (p=0.008 n=5+5) RegexpMatchMedium_32 564ns ± 1% 532ns ± 0% -5.71% (p=0.008 n=5+5) RegexpMatchMedium_1K 146µs ± 2% 140µs ± 1% -4.62% (p=0.008 n=5+5) RegexpMatchHard_32 8.47µs ± 1% 7.91µs ± 1% -6.65% (p=0.008 n=5+5) RegexpMatchHard_1K 253µs ± 1% 236µs ± 2% -6.66% (p=0.008 n=5+5) Revcomp 1.78s ± 4% 1.76s ± 5% ~ (p=1.000 n=5+5) Template 292ms ±29% 269ms ± 5% ~ (p=0.690 n=5+5) TimeParse 1.61µs ± 4% 1.54µs ± 1% -4.42% (p=0.008 n=5+5) TimeFormat 1.66µs ± 3% 1.58µs ± 1% -5.22% (p=0.008 n=5+5) [Geo mean] 232µs 221µs -4.54% name old speed new speed delta GobDecode 25.0MB/s ± 8% 25.1MB/s ±27% ~ (p=0.222 n=5+5) GobEncode 32.8MB/s ±21% 38.0MB/s ± 0% +15.84% (p=0.016 n=5+4) Gzip 22.8MB/s ± 0% 23.6MB/s ± 0% +3.49% (p=0.016 n=4+5) Gunzip 121MB/s ± 1% 128MB/s ± 1% +5.68% (p=0.008 n=5+5) JSONEncode 54.4MB/s ± 1% 56.5MB/s ± 1% +3.97% (p=0.008 n=5+5) JSONDecode 7.88MB/s ± 8% 7.65MB/s ± 8% ~ (p=0.548 n=5+5) GoParse 3.07MB/s ± 8% 3.00MB/s ±22% ~ (p=0.579 n=5+5) RegexpMatchEasy0_32 75.6MB/s ± 2% 80.5MB/s ± 0% +6.58% (p=0.008 n=5+5) RegexpMatchEasy0_1K 356MB/s ± 0% 358MB/s ± 1% ~ (p=0.095 n=5+5) RegexpMatchEasy1_32 81.1MB/s ± 2% 86.5MB/s ± 1% +6.69% (p=0.008 n=5+5) RegexpMatchEasy1_1K 314MB/s ± 0% 320MB/s ± 0% +2.10% (p=0.008 n=5+5) RegexpMatchMedium_32 1.77MB/s ± 1% 1.88MB/s ± 0% +6.09% (p=0.016 n=5+4) RegexpMatchMedium_1K 6.99MB/s ± 2% 7.33MB/s ± 1% +4.83% (p=0.008 n=5+5) RegexpMatchHard_32 3.78MB/s ± 1% 4.04MB/s ± 1% +7.04% (p=0.008 n=5+5) RegexpMatchHard_1K 4.04MB/s ± 1% 4.33MB/s ± 2% +7.17% (p=0.008 n=5+5) Revcomp 143MB/s ± 4% 145MB/s ± 5% ~ (p=1.000 n=5+5) Template 6.77MB/s ±24% 7.22MB/s ± 5% ~ (p=0.690 n=5+5) [Geo mean] 24.4MB/s 25.4MB/s +4.18% Change-Id: Ib80716e62992aec28b2c4a96af280c278f83aa49 Reviewed-on: https://go-review.googlesource.com/c/go/+/173980 Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Richard Musiol <neelance@gmail.com>
2019-04-25 17:25:54 -04:00
// get PC_B from -8(SP)
p = appendp(p, AGet, regAddr(REG_SP))
p = appendp(p, AI32Const, constAddr(8))
p = appendp(p, AI32Sub)
p = appendp(p, AI32Load16U, constAddr(0))
p = appendp(p, ATee, regAddr(REG_PC_B))
p = appendp(p, AI32Const, constAddr(call.Pc))
p = appendp(p, AI32Eq)
p = appendp(p, ABrIf, constAddr(0))
p = appendp(p, AEnd) // end of Loop
}
case obj.ARET, ARETUNWIND:
ret := *p
p.As = obj.ANOP
if framesize > 0 {
// SP += framesize
p = appendp(p, AGet, regAddr(REG_SP))
p = appendp(p, AI32Const, constAddr(framesize))
p = appendp(p, AI32Add)
p = appendp(p, ASet, regAddr(REG_SP))
// TODO(neelance): This should theoretically set Spadj, but it only works without.
// p.Spadj = int32(-framesize)
}
if ret.To.Type == obj.TYPE_MEM {
cmd, runtime: remove PC_F & PC_B globals on Wasm Following the previous CL, this removes more global variables on Wasm. PC_B is used mostly for intra-function jumps, and for a function telling its callee where to start or resume. This usage can be served by a parameter. The top level loop (wasm_pc_f_loop) uses PC_B for resuming a function. This value is either set by gogo, or loaded from the Go stack at function return. Instead of loading PC_B at each function return, we could make gogo stores PC_B at the same stack location, and let the top level loop do the load. This way, we don't need to use global PC_B to communicate with the top level loop, and we can replace global PC_B with a parameter. PC_F is similar. It is even more so in that the only reader is the top level loop. Let the top level loop read it from the stack, and we can get rid of PC_F entirely. PC_F and PC_B are used less entensively as SP, so this CL has smaller performance gain. Running on Chrome 74.0.3729.108 on Linux/AMD64, name old time/op new time/op delta BinaryTree17 16.6s ± 0% 16.2s ± 1% -2.59% (p=0.016 n=4+5) Fannkuch11 11.1s ± 1% 10.8s ± 0% -2.65% (p=0.008 n=5+5) FmtFprintfEmpty 231ns ± 1% 217ns ± 0% -6.06% (p=0.008 n=5+5) FmtFprintfString 407ns ± 3% 375ns ± 2% -7.81% (p=0.008 n=5+5) FmtFprintfInt 466ns ± 2% 430ns ± 0% -7.79% (p=0.016 n=5+4) FmtFprintfIntInt 719ns ± 2% 673ns ± 2% -6.37% (p=0.008 n=5+5) FmtFprintfPrefixedInt 706ns ± 1% 676ns ± 3% -4.31% (p=0.008 n=5+5) FmtFprintfFloat 1.01µs ± 1% 0.97µs ± 1% -4.30% (p=0.008 n=5+5) FmtManyArgs 2.67µs ± 1% 2.51µs ± 1% -5.95% (p=0.008 n=5+5) GobDecode 30.7ms ± 9% 31.3ms ±34% ~ (p=0.222 n=5+5) GobEncode 24.2ms ±23% 20.2ms ± 0% -16.36% (p=0.016 n=5+4) Gzip 852ms ± 0% 823ms ± 0% -3.38% (p=0.016 n=4+5) Gunzip 160ms ± 1% 151ms ± 1% -5.37% (p=0.008 n=5+5) JSONEncode 35.7ms ± 1% 34.3ms ± 1% -3.81% (p=0.008 n=5+5) JSONDecode 247ms ± 8% 254ms ± 7% ~ (p=0.548 n=5+5) Mandelbrot200 5.39ms ± 0% 5.41ms ± 0% +0.42% (p=0.008 n=5+5) GoParse 18.5ms ± 1% 18.3ms ± 2% ~ (p=0.343 n=4+4) RegexpMatchEasy0_32 424ns ± 2% 397ns ± 0% -6.23% (p=0.008 n=5+5) RegexpMatchEasy0_1K 2.88µs ± 0% 2.86µs ± 1% ~ (p=0.079 n=5+5) RegexpMatchEasy1_32 395ns ± 2% 370ns ± 1% -6.23% (p=0.008 n=5+5) RegexpMatchEasy1_1K 3.26µs ± 0% 3.19µs ± 1% -2.06% (p=0.008 n=5+5) RegexpMatchMedium_32 564ns ± 1% 532ns ± 0% -5.71% (p=0.008 n=5+5) RegexpMatchMedium_1K 146µs ± 2% 140µs ± 1% -4.62% (p=0.008 n=5+5) RegexpMatchHard_32 8.47µs ± 1% 7.91µs ± 1% -6.65% (p=0.008 n=5+5) RegexpMatchHard_1K 253µs ± 1% 236µs ± 2% -6.66% (p=0.008 n=5+5) Revcomp 1.78s ± 4% 1.76s ± 5% ~ (p=1.000 n=5+5) Template 292ms ±29% 269ms ± 5% ~ (p=0.690 n=5+5) TimeParse 1.61µs ± 4% 1.54µs ± 1% -4.42% (p=0.008 n=5+5) TimeFormat 1.66µs ± 3% 1.58µs ± 1% -5.22% (p=0.008 n=5+5) [Geo mean] 232µs 221µs -4.54% name old speed new speed delta GobDecode 25.0MB/s ± 8% 25.1MB/s ±27% ~ (p=0.222 n=5+5) GobEncode 32.8MB/s ±21% 38.0MB/s ± 0% +15.84% (p=0.016 n=5+4) Gzip 22.8MB/s ± 0% 23.6MB/s ± 0% +3.49% (p=0.016 n=4+5) Gunzip 121MB/s ± 1% 128MB/s ± 1% +5.68% (p=0.008 n=5+5) JSONEncode 54.4MB/s ± 1% 56.5MB/s ± 1% +3.97% (p=0.008 n=5+5) JSONDecode 7.88MB/s ± 8% 7.65MB/s ± 8% ~ (p=0.548 n=5+5) GoParse 3.07MB/s ± 8% 3.00MB/s ±22% ~ (p=0.579 n=5+5) RegexpMatchEasy0_32 75.6MB/s ± 2% 80.5MB/s ± 0% +6.58% (p=0.008 n=5+5) RegexpMatchEasy0_1K 356MB/s ± 0% 358MB/s ± 1% ~ (p=0.095 n=5+5) RegexpMatchEasy1_32 81.1MB/s ± 2% 86.5MB/s ± 1% +6.69% (p=0.008 n=5+5) RegexpMatchEasy1_1K 314MB/s ± 0% 320MB/s ± 0% +2.10% (p=0.008 n=5+5) RegexpMatchMedium_32 1.77MB/s ± 1% 1.88MB/s ± 0% +6.09% (p=0.016 n=5+4) RegexpMatchMedium_1K 6.99MB/s ± 2% 7.33MB/s ± 1% +4.83% (p=0.008 n=5+5) RegexpMatchHard_32 3.78MB/s ± 1% 4.04MB/s ± 1% +7.04% (p=0.008 n=5+5) RegexpMatchHard_1K 4.04MB/s ± 1% 4.33MB/s ± 2% +7.17% (p=0.008 n=5+5) Revcomp 143MB/s ± 4% 145MB/s ± 5% ~ (p=1.000 n=5+5) Template 6.77MB/s ±24% 7.22MB/s ± 5% ~ (p=0.690 n=5+5) [Geo mean] 24.4MB/s 25.4MB/s +4.18% Change-Id: Ib80716e62992aec28b2c4a96af280c278f83aa49 Reviewed-on: https://go-review.googlesource.com/c/go/+/173980 Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Richard Musiol <neelance@gmail.com>
2019-04-25 17:25:54 -04:00
// Set PC_B parameter to function entry.
p = appendp(p, AI32Const, constAddr(0))
// low-level WebAssembly call to function
p = appendp(p, ACall, ret.To)
p = appendp(p, AReturn)
break
}
// SP += 8
p = appendp(p, AGet, regAddr(REG_SP))
p = appendp(p, AI32Const, constAddr(8))
p = appendp(p, AI32Add)
p = appendp(p, ASet, regAddr(REG_SP))
if ret.As == ARETUNWIND {
// function needs to unwind the WebAssembly stack, return 1
p = appendp(p, AI32Const, constAddr(1))
p = appendp(p, AReturn)
break
}
// not unwinding the WebAssembly stack, return 0
p = appendp(p, AI32Const, constAddr(0))
p = appendp(p, AReturn)
}
}
for p := s.Func.Text; p != nil; p = p.Link {
switch p.From.Name {
case obj.NAME_AUTO:
p.From.Offset += int64(framesize)
case obj.NAME_PARAM:
p.From.Reg = REG_SP
p.From.Offset += int64(framesize) + 8 // parameters are after the frame and the 8-byte return address
}
switch p.To.Name {
case obj.NAME_AUTO:
p.To.Offset += int64(framesize)
case obj.NAME_PARAM:
p.To.Reg = REG_SP
p.To.Offset += int64(framesize) + 8 // parameters are after the frame and the 8-byte return address
}
switch p.As {
case AGet:
if p.From.Type == obj.TYPE_ADDR {
get := *p
p.As = obj.ANOP
switch get.From.Name {
case obj.NAME_EXTERN:
p = appendp(p, AI64Const, get.From)
case obj.NAME_AUTO, obj.NAME_PARAM:
p = appendp(p, AGet, regAddr(get.From.Reg))
if get.From.Reg == REG_SP {
p = appendp(p, AI64ExtendI32U)
}
if get.From.Offset != 0 {
p = appendp(p, AI64Const, constAddr(get.From.Offset))
p = appendp(p, AI64Add)
}
default:
panic("bad Get: invalid name")
}
}
case AI32Load, AI64Load, AF32Load, AF64Load, AI32Load8S, AI32Load8U, AI32Load16S, AI32Load16U, AI64Load8S, AI64Load8U, AI64Load16S, AI64Load16U, AI64Load32S, AI64Load32U:
if p.From.Type == obj.TYPE_MEM {
as := p.As
from := p.From
p.As = AGet
p.From = regAddr(from.Reg)
if from.Reg != REG_SP {
p = appendp(p, AI32WrapI64)
}
p = appendp(p, as, constAddr(from.Offset))
}
case AMOVB, AMOVH, AMOVW, AMOVD:
mov := *p
p.As = obj.ANOP
var loadAs obj.As
var storeAs obj.As
switch mov.As {
case AMOVB:
loadAs = AI64Load8U
storeAs = AI64Store8
case AMOVH:
loadAs = AI64Load16U
storeAs = AI64Store16
case AMOVW:
loadAs = AI64Load32U
storeAs = AI64Store32
case AMOVD:
loadAs = AI64Load
storeAs = AI64Store
}
appendValue := func() {
switch mov.From.Type {
case obj.TYPE_CONST:
p = appendp(p, AI64Const, constAddr(mov.From.Offset))
case obj.TYPE_ADDR:
switch mov.From.Name {
case obj.NAME_NONE, obj.NAME_PARAM, obj.NAME_AUTO:
p = appendp(p, AGet, regAddr(mov.From.Reg))
if mov.From.Reg == REG_SP {
p = appendp(p, AI64ExtendI32U)
}
p = appendp(p, AI64Const, constAddr(mov.From.Offset))
p = appendp(p, AI64Add)
case obj.NAME_EXTERN:
p = appendp(p, AI64Const, mov.From)
default:
panic("bad name for MOV")
}
case obj.TYPE_REG:
p = appendp(p, AGet, mov.From)
if mov.From.Reg == REG_SP {
p = appendp(p, AI64ExtendI32U)
}
case obj.TYPE_MEM:
p = appendp(p, AGet, regAddr(mov.From.Reg))
if mov.From.Reg != REG_SP {
p = appendp(p, AI32WrapI64)
}
p = appendp(p, loadAs, constAddr(mov.From.Offset))
default:
panic("bad MOV type")
}
}
switch mov.To.Type {
case obj.TYPE_REG:
appendValue()
if mov.To.Reg == REG_SP {
p = appendp(p, AI32WrapI64)
}
p = appendp(p, ASet, mov.To)
case obj.TYPE_MEM:
switch mov.To.Name {
case obj.NAME_NONE, obj.NAME_PARAM:
p = appendp(p, AGet, regAddr(mov.To.Reg))
if mov.To.Reg != REG_SP {
p = appendp(p, AI32WrapI64)
}
case obj.NAME_EXTERN:
p = appendp(p, AI32Const, obj.Addr{Type: obj.TYPE_ADDR, Name: obj.NAME_EXTERN, Sym: mov.To.Sym})
default:
panic("bad MOV name")
}
appendValue()
p = appendp(p, storeAs, constAddr(mov.To.Offset))
default:
panic("bad MOV type")
}
case ACallImport:
p.As = obj.ANOP
p = appendp(p, AGet, regAddr(REG_SP))
p = appendp(p, ACall, obj.Addr{Type: obj.TYPE_MEM, Name: obj.NAME_EXTERN, Sym: s})
p.Mark = WasmImport
}
}
{
p := s.Func.Text
if len(unwindExitBranches) > 0 {
p = appendp(p, ABlock) // unwindExit, used to return 1 when unwinding the stack
for _, b := range unwindExitBranches {
b.To = obj.Addr{Type: obj.TYPE_BRANCH, Val: p}
}
}
if len(entryPointLoopBranches) > 0 {
p = appendp(p, ALoop) // entryPointLoop, used to jump between basic blocks
for _, b := range entryPointLoopBranches {
b.To = obj.Addr{Type: obj.TYPE_BRANCH, Val: p}
}
}
if numResumePoints > 0 {
// Add Block instructions for resume points and BrTable to jump to selected resume point.
for i := 0; i < numResumePoints+1; i++ {
p = appendp(p, ABlock)
}
p = appendp(p, AGet, regAddr(REG_PC_B)) // read next basic block from PC_B
p = appendp(p, ABrTable, obj.Addr{Val: tableIdxs})
p = appendp(p, AEnd) // end of Block
}
for p.Link != nil {
p = p.Link // function instructions
}
if len(entryPointLoopBranches) > 0 {
p = appendp(p, AEnd) // end of entryPointLoop
}
p = appendp(p, obj.AUNDEF)
if len(unwindExitBranches) > 0 {
p = appendp(p, AEnd) // end of unwindExit
p = appendp(p, AI32Const, constAddr(1))
}
}
currentDepth = 0
blockDepths := make(map[*obj.Prog]int)
for p := s.Func.Text; p != nil; p = p.Link {
switch p.As {
case ABlock, ALoop, AIf:
currentDepth++
blockDepths[p] = currentDepth
case AEnd:
currentDepth--
}
switch p.As {
case ABr, ABrIf:
if p.To.Type == obj.TYPE_BRANCH {
blockDepth, ok := blockDepths[p.To.Val.(*obj.Prog)]
if !ok {
panic("label not at block")
}
p.To = constAddr(int64(currentDepth - blockDepth))
}
}
}
}
func constAddr(value int64) obj.Addr {
return obj.Addr{Type: obj.TYPE_CONST, Offset: value}
}
func regAddr(reg int16) obj.Addr {
return obj.Addr{Type: obj.TYPE_REG, Reg: reg}
}
cmd, runtime: remove PC_F & PC_B globals on Wasm Following the previous CL, this removes more global variables on Wasm. PC_B is used mostly for intra-function jumps, and for a function telling its callee where to start or resume. This usage can be served by a parameter. The top level loop (wasm_pc_f_loop) uses PC_B for resuming a function. This value is either set by gogo, or loaded from the Go stack at function return. Instead of loading PC_B at each function return, we could make gogo stores PC_B at the same stack location, and let the top level loop do the load. This way, we don't need to use global PC_B to communicate with the top level loop, and we can replace global PC_B with a parameter. PC_F is similar. It is even more so in that the only reader is the top level loop. Let the top level loop read it from the stack, and we can get rid of PC_F entirely. PC_F and PC_B are used less entensively as SP, so this CL has smaller performance gain. Running on Chrome 74.0.3729.108 on Linux/AMD64, name old time/op new time/op delta BinaryTree17 16.6s ± 0% 16.2s ± 1% -2.59% (p=0.016 n=4+5) Fannkuch11 11.1s ± 1% 10.8s ± 0% -2.65% (p=0.008 n=5+5) FmtFprintfEmpty 231ns ± 1% 217ns ± 0% -6.06% (p=0.008 n=5+5) FmtFprintfString 407ns ± 3% 375ns ± 2% -7.81% (p=0.008 n=5+5) FmtFprintfInt 466ns ± 2% 430ns ± 0% -7.79% (p=0.016 n=5+4) FmtFprintfIntInt 719ns ± 2% 673ns ± 2% -6.37% (p=0.008 n=5+5) FmtFprintfPrefixedInt 706ns ± 1% 676ns ± 3% -4.31% (p=0.008 n=5+5) FmtFprintfFloat 1.01µs ± 1% 0.97µs ± 1% -4.30% (p=0.008 n=5+5) FmtManyArgs 2.67µs ± 1% 2.51µs ± 1% -5.95% (p=0.008 n=5+5) GobDecode 30.7ms ± 9% 31.3ms ±34% ~ (p=0.222 n=5+5) GobEncode 24.2ms ±23% 20.2ms ± 0% -16.36% (p=0.016 n=5+4) Gzip 852ms ± 0% 823ms ± 0% -3.38% (p=0.016 n=4+5) Gunzip 160ms ± 1% 151ms ± 1% -5.37% (p=0.008 n=5+5) JSONEncode 35.7ms ± 1% 34.3ms ± 1% -3.81% (p=0.008 n=5+5) JSONDecode 247ms ± 8% 254ms ± 7% ~ (p=0.548 n=5+5) Mandelbrot200 5.39ms ± 0% 5.41ms ± 0% +0.42% (p=0.008 n=5+5) GoParse 18.5ms ± 1% 18.3ms ± 2% ~ (p=0.343 n=4+4) RegexpMatchEasy0_32 424ns ± 2% 397ns ± 0% -6.23% (p=0.008 n=5+5) RegexpMatchEasy0_1K 2.88µs ± 0% 2.86µs ± 1% ~ (p=0.079 n=5+5) RegexpMatchEasy1_32 395ns ± 2% 370ns ± 1% -6.23% (p=0.008 n=5+5) RegexpMatchEasy1_1K 3.26µs ± 0% 3.19µs ± 1% -2.06% (p=0.008 n=5+5) RegexpMatchMedium_32 564ns ± 1% 532ns ± 0% -5.71% (p=0.008 n=5+5) RegexpMatchMedium_1K 146µs ± 2% 140µs ± 1% -4.62% (p=0.008 n=5+5) RegexpMatchHard_32 8.47µs ± 1% 7.91µs ± 1% -6.65% (p=0.008 n=5+5) RegexpMatchHard_1K 253µs ± 1% 236µs ± 2% -6.66% (p=0.008 n=5+5) Revcomp 1.78s ± 4% 1.76s ± 5% ~ (p=1.000 n=5+5) Template 292ms ±29% 269ms ± 5% ~ (p=0.690 n=5+5) TimeParse 1.61µs ± 4% 1.54µs ± 1% -4.42% (p=0.008 n=5+5) TimeFormat 1.66µs ± 3% 1.58µs ± 1% -5.22% (p=0.008 n=5+5) [Geo mean] 232µs 221µs -4.54% name old speed new speed delta GobDecode 25.0MB/s ± 8% 25.1MB/s ±27% ~ (p=0.222 n=5+5) GobEncode 32.8MB/s ±21% 38.0MB/s ± 0% +15.84% (p=0.016 n=5+4) Gzip 22.8MB/s ± 0% 23.6MB/s ± 0% +3.49% (p=0.016 n=4+5) Gunzip 121MB/s ± 1% 128MB/s ± 1% +5.68% (p=0.008 n=5+5) JSONEncode 54.4MB/s ± 1% 56.5MB/s ± 1% +3.97% (p=0.008 n=5+5) JSONDecode 7.88MB/s ± 8% 7.65MB/s ± 8% ~ (p=0.548 n=5+5) GoParse 3.07MB/s ± 8% 3.00MB/s ±22% ~ (p=0.579 n=5+5) RegexpMatchEasy0_32 75.6MB/s ± 2% 80.5MB/s ± 0% +6.58% (p=0.008 n=5+5) RegexpMatchEasy0_1K 356MB/s ± 0% 358MB/s ± 1% ~ (p=0.095 n=5+5) RegexpMatchEasy1_32 81.1MB/s ± 2% 86.5MB/s ± 1% +6.69% (p=0.008 n=5+5) RegexpMatchEasy1_1K 314MB/s ± 0% 320MB/s ± 0% +2.10% (p=0.008 n=5+5) RegexpMatchMedium_32 1.77MB/s ± 1% 1.88MB/s ± 0% +6.09% (p=0.016 n=5+4) RegexpMatchMedium_1K 6.99MB/s ± 2% 7.33MB/s ± 1% +4.83% (p=0.008 n=5+5) RegexpMatchHard_32 3.78MB/s ± 1% 4.04MB/s ± 1% +7.04% (p=0.008 n=5+5) RegexpMatchHard_1K 4.04MB/s ± 1% 4.33MB/s ± 2% +7.17% (p=0.008 n=5+5) Revcomp 143MB/s ± 4% 145MB/s ± 5% ~ (p=1.000 n=5+5) Template 6.77MB/s ±24% 7.22MB/s ± 5% ~ (p=0.690 n=5+5) [Geo mean] 24.4MB/s 25.4MB/s +4.18% Change-Id: Ib80716e62992aec28b2c4a96af280c278f83aa49 Reviewed-on: https://go-review.googlesource.com/c/go/+/173980 Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Richard Musiol <neelance@gmail.com>
2019-04-25 17:25:54 -04:00
// Most of the Go functions has a single parameter (PC_B) in
// Wasm ABI. This is a list of exceptions.
var notUsePC_B = map[string]bool{
"_rt0_wasm_js": true,
"wasm_export_run": true,
"wasm_export_resume": true,
"wasm_export_getsp": true,
"wasm_pc_f_loop": true,
"runtime.wasmMove": true,
"runtime.wasmZero": true,
"runtime.wasmDiv": true,
"runtime.wasmTruncS": true,
"runtime.wasmTruncU": true,
"runtime.gcWriteBarrier": true,
"cmpbody": true,
"memeqbody": true,
"memcmp": true,
"memchr": true,
}
func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) {
type regVar struct {
global bool
index uint64
}
type varDecl struct {
count uint64
typ valueType
}
cmd/internal/obj/wasm: cache SP in a local We use Wasm global variables extensively for simulating registers, especially SP. V8 does not handle global variables efficiently. This CL reduces global variable accesses by caching the global SP in a local variable in each function. The local cache is set on function entry and updated after each call (where the stack could have moved). Within a function, the SP access will use the local variable. Supersedes https://golang.org/cl/173979. Running on Chrome Version 73.0.3683.103 on darwin/amd64: name old time/op new time/op delta BinaryTree17 15.3s ± 2% 14.5s ± 3% -5.20% (p=0.000 n=9+10) Fannkuch11 8.91s ± 2% 9.48s ± 2% +6.41% (p=0.000 n=9+10) FmtFprintfEmpty 197ns ± 5% 165ns ± 3% -16.09% (p=0.000 n=9+8) FmtFprintfString 354ns ± 8% 325ns ± 7% -8.33% (p=0.001 n=10+10) FmtFprintfInt 400ns ± 4% 368ns ± 6% -8.01% (p=0.000 n=10+10) FmtFprintfIntInt 618ns ± 3% 587ns ± 6% -4.97% (p=0.001 n=10+10) FmtFprintfPrefixedInt 637ns ± 4% 606ns ± 4% -4.88% (p=0.000 n=10+10) FmtFprintfFloat 965ns ± 7% 898ns ± 4% -6.97% (p=0.000 n=10+10) FmtManyArgs 2.34µs ± 1% 2.24µs ± 3% -4.40% (p=0.000 n=9+10) GobDecode 29.8ms ± 3% 28.8ms ± 6% -3.60% (p=0.006 n=9+10) GobEncode 20.5ms ± 8% 17.6ms ± 3% -14.32% (p=0.000 n=10+10) Gzip 714ms ± 3% 718ms ± 8% ~ (p=0.971 n=10+10) Gunzip 148ms ± 3% 136ms ± 3% -7.99% (p=0.000 n=10+9) HTTPClientServer 219µs ± 3% 215µs ± 4% ~ (p=0.190 n=10+10) JSONEncode 35.1ms ± 2% 31.8ms ±13% -9.52% (p=0.002 n=10+10) JSONDecode 220ms ± 3% 207ms ± 5% -5.87% (p=0.000 n=10+10) Mandelbrot200 5.22ms ± 1% 5.11ms ± 4% -2.11% (p=0.027 n=8+10) GoParse 17.2ms ± 6% 16.1ms ± 5% -6.63% (p=0.000 n=10+9) RegexpMatchEasy0_32 375ns ± 3% 340ns ± 3% -9.25% (p=0.000 n=10+10) RegexpMatchEasy0_1K 2.70µs ± 3% 2.65µs ± 4% ~ (p=0.118 n=10+10) RegexpMatchEasy1_32 341ns ± 2% 305ns ± 4% -10.62% (p=0.000 n=9+10) RegexpMatchEasy1_1K 3.20µs ± 3% 2.99µs ± 3% -6.35% (p=0.000 n=10+10) RegexpMatchMedium_32 520ns ± 3% 501ns ± 4% -3.64% (p=0.002 n=9+10) RegexpMatchMedium_1K 145µs ± 7% 128µs ± 3% -11.57% (p=0.000 n=9+10) RegexpMatchHard_32 7.88µs ± 3% 7.01µs ± 5% -10.97% (p=0.000 n=10+10) RegexpMatchHard_1K 237µs ± 5% 207µs ± 4% -12.71% (p=0.000 n=9+10) Revcomp 2.34s ± 1% 2.31s ± 5% ~ (p=0.230 n=7+10) Template 261ms ± 7% 246ms ± 5% -5.93% (p=0.007 n=10+10) TimeParse 1.47µs ± 3% 1.39µs ± 5% -5.75% (p=0.000 n=9+10) TimeFormat 1.52µs ± 3% 1.43µs ± 4% -6.42% (p=0.000 n=8+10) name old speed new speed delta GobDecode 25.7MB/s ± 3% 26.7MB/s ± 5% +3.77% (p=0.006 n=9+10) GobEncode 37.5MB/s ± 8% 43.7MB/s ± 3% +16.61% (p=0.000 n=10+10) Gzip 27.2MB/s ± 3% 27.0MB/s ± 7% ~ (p=0.971 n=10+10) Gunzip 131MB/s ± 3% 142MB/s ± 5% +8.07% (p=0.000 n=10+10) JSONEncode 55.2MB/s ± 2% 61.2MB/s ±12% +10.80% (p=0.002 n=10+10) JSONDecode 8.84MB/s ± 3% 9.39MB/s ± 5% +6.28% (p=0.000 n=10+10) GoParse 3.37MB/s ± 6% 3.61MB/s ± 5% +7.09% (p=0.000 n=10+9) RegexpMatchEasy0_32 85.3MB/s ± 3% 94.0MB/s ± 3% +10.20% (p=0.000 n=10+10) RegexpMatchEasy0_1K 379MB/s ± 3% 387MB/s ± 4% ~ (p=0.123 n=10+10) RegexpMatchEasy1_32 93.9MB/s ± 2% 105.1MB/s ± 4% +11.96% (p=0.000 n=9+10) RegexpMatchEasy1_1K 320MB/s ± 3% 342MB/s ± 3% +6.79% (p=0.000 n=10+10) RegexpMatchMedium_32 1.92MB/s ± 2% 2.00MB/s ± 3% +3.94% (p=0.001 n=9+10) RegexpMatchMedium_1K 7.09MB/s ± 6% 8.01MB/s ± 3% +13.00% (p=0.000 n=9+10) RegexpMatchHard_32 4.06MB/s ± 3% 4.56MB/s ± 5% +12.38% (p=0.000 n=10+10) RegexpMatchHard_1K 4.32MB/s ± 4% 4.96MB/s ± 4% +14.60% (p=0.000 n=9+10) Revcomp 109MB/s ± 1% 110MB/s ± 5% ~ (p=0.219 n=7+10) Template 7.44MB/s ± 8% 7.91MB/s ± 5% +6.30% (p=0.007 n=10+10) Change-Id: I5828cf6b23ce104c02addc2642aba48dd6c48aab Reviewed-on: https://go-review.googlesource.com/c/go/+/174062 Run-TryBot: Richard Musiol <neelance@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com>
2019-04-27 01:16:10 +02:00
hasLocalSP := false
regVars := [MAXREG - MINREG]*regVar{
REG_SP - MINREG: {true, 0},
REG_CTXT - MINREG: {true, 1},
REG_g - MINREG: {true, 2},
REG_RET0 - MINREG: {true, 3},
REG_RET1 - MINREG: {true, 4},
REG_RET2 - MINREG: {true, 5},
REG_RET3 - MINREG: {true, 6},
REG_PAUSE - MINREG: {true, 7},
}
var varDecls []*varDecl
useAssemblyRegMap := func() {
for i := int16(0); i < 16; i++ {
regVars[REG_R0+i-MINREG] = &regVar{false, uint64(i)}
}
}
// Function starts with declaration of locals: numbers and types.
cmd/internal/obj/wasm: cache SP in a local We use Wasm global variables extensively for simulating registers, especially SP. V8 does not handle global variables efficiently. This CL reduces global variable accesses by caching the global SP in a local variable in each function. The local cache is set on function entry and updated after each call (where the stack could have moved). Within a function, the SP access will use the local variable. Supersedes https://golang.org/cl/173979. Running on Chrome Version 73.0.3683.103 on darwin/amd64: name old time/op new time/op delta BinaryTree17 15.3s ± 2% 14.5s ± 3% -5.20% (p=0.000 n=9+10) Fannkuch11 8.91s ± 2% 9.48s ± 2% +6.41% (p=0.000 n=9+10) FmtFprintfEmpty 197ns ± 5% 165ns ± 3% -16.09% (p=0.000 n=9+8) FmtFprintfString 354ns ± 8% 325ns ± 7% -8.33% (p=0.001 n=10+10) FmtFprintfInt 400ns ± 4% 368ns ± 6% -8.01% (p=0.000 n=10+10) FmtFprintfIntInt 618ns ± 3% 587ns ± 6% -4.97% (p=0.001 n=10+10) FmtFprintfPrefixedInt 637ns ± 4% 606ns ± 4% -4.88% (p=0.000 n=10+10) FmtFprintfFloat 965ns ± 7% 898ns ± 4% -6.97% (p=0.000 n=10+10) FmtManyArgs 2.34µs ± 1% 2.24µs ± 3% -4.40% (p=0.000 n=9+10) GobDecode 29.8ms ± 3% 28.8ms ± 6% -3.60% (p=0.006 n=9+10) GobEncode 20.5ms ± 8% 17.6ms ± 3% -14.32% (p=0.000 n=10+10) Gzip 714ms ± 3% 718ms ± 8% ~ (p=0.971 n=10+10) Gunzip 148ms ± 3% 136ms ± 3% -7.99% (p=0.000 n=10+9) HTTPClientServer 219µs ± 3% 215µs ± 4% ~ (p=0.190 n=10+10) JSONEncode 35.1ms ± 2% 31.8ms ±13% -9.52% (p=0.002 n=10+10) JSONDecode 220ms ± 3% 207ms ± 5% -5.87% (p=0.000 n=10+10) Mandelbrot200 5.22ms ± 1% 5.11ms ± 4% -2.11% (p=0.027 n=8+10) GoParse 17.2ms ± 6% 16.1ms ± 5% -6.63% (p=0.000 n=10+9) RegexpMatchEasy0_32 375ns ± 3% 340ns ± 3% -9.25% (p=0.000 n=10+10) RegexpMatchEasy0_1K 2.70µs ± 3% 2.65µs ± 4% ~ (p=0.118 n=10+10) RegexpMatchEasy1_32 341ns ± 2% 305ns ± 4% -10.62% (p=0.000 n=9+10) RegexpMatchEasy1_1K 3.20µs ± 3% 2.99µs ± 3% -6.35% (p=0.000 n=10+10) RegexpMatchMedium_32 520ns ± 3% 501ns ± 4% -3.64% (p=0.002 n=9+10) RegexpMatchMedium_1K 145µs ± 7% 128µs ± 3% -11.57% (p=0.000 n=9+10) RegexpMatchHard_32 7.88µs ± 3% 7.01µs ± 5% -10.97% (p=0.000 n=10+10) RegexpMatchHard_1K 237µs ± 5% 207µs ± 4% -12.71% (p=0.000 n=9+10) Revcomp 2.34s ± 1% 2.31s ± 5% ~ (p=0.230 n=7+10) Template 261ms ± 7% 246ms ± 5% -5.93% (p=0.007 n=10+10) TimeParse 1.47µs ± 3% 1.39µs ± 5% -5.75% (p=0.000 n=9+10) TimeFormat 1.52µs ± 3% 1.43µs ± 4% -6.42% (p=0.000 n=8+10) name old speed new speed delta GobDecode 25.7MB/s ± 3% 26.7MB/s ± 5% +3.77% (p=0.006 n=9+10) GobEncode 37.5MB/s ± 8% 43.7MB/s ± 3% +16.61% (p=0.000 n=10+10) Gzip 27.2MB/s ± 3% 27.0MB/s ± 7% ~ (p=0.971 n=10+10) Gunzip 131MB/s ± 3% 142MB/s ± 5% +8.07% (p=0.000 n=10+10) JSONEncode 55.2MB/s ± 2% 61.2MB/s ±12% +10.80% (p=0.002 n=10+10) JSONDecode 8.84MB/s ± 3% 9.39MB/s ± 5% +6.28% (p=0.000 n=10+10) GoParse 3.37MB/s ± 6% 3.61MB/s ± 5% +7.09% (p=0.000 n=10+9) RegexpMatchEasy0_32 85.3MB/s ± 3% 94.0MB/s ± 3% +10.20% (p=0.000 n=10+10) RegexpMatchEasy0_1K 379MB/s ± 3% 387MB/s ± 4% ~ (p=0.123 n=10+10) RegexpMatchEasy1_32 93.9MB/s ± 2% 105.1MB/s ± 4% +11.96% (p=0.000 n=9+10) RegexpMatchEasy1_1K 320MB/s ± 3% 342MB/s ± 3% +6.79% (p=0.000 n=10+10) RegexpMatchMedium_32 1.92MB/s ± 2% 2.00MB/s ± 3% +3.94% (p=0.001 n=9+10) RegexpMatchMedium_1K 7.09MB/s ± 6% 8.01MB/s ± 3% +13.00% (p=0.000 n=9+10) RegexpMatchHard_32 4.06MB/s ± 3% 4.56MB/s ± 5% +12.38% (p=0.000 n=10+10) RegexpMatchHard_1K 4.32MB/s ± 4% 4.96MB/s ± 4% +14.60% (p=0.000 n=9+10) Revcomp 109MB/s ± 1% 110MB/s ± 5% ~ (p=0.219 n=7+10) Template 7.44MB/s ± 8% 7.91MB/s ± 5% +6.30% (p=0.007 n=10+10) Change-Id: I5828cf6b23ce104c02addc2642aba48dd6c48aab Reviewed-on: https://go-review.googlesource.com/c/go/+/174062 Run-TryBot: Richard Musiol <neelance@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com>
2019-04-27 01:16:10 +02:00
// Some functions use a special calling convention.
switch s.Name {
cmd, runtime: remove PC_F & PC_B globals on Wasm Following the previous CL, this removes more global variables on Wasm. PC_B is used mostly for intra-function jumps, and for a function telling its callee where to start or resume. This usage can be served by a parameter. The top level loop (wasm_pc_f_loop) uses PC_B for resuming a function. This value is either set by gogo, or loaded from the Go stack at function return. Instead of loading PC_B at each function return, we could make gogo stores PC_B at the same stack location, and let the top level loop do the load. This way, we don't need to use global PC_B to communicate with the top level loop, and we can replace global PC_B with a parameter. PC_F is similar. It is even more so in that the only reader is the top level loop. Let the top level loop read it from the stack, and we can get rid of PC_F entirely. PC_F and PC_B are used less entensively as SP, so this CL has smaller performance gain. Running on Chrome 74.0.3729.108 on Linux/AMD64, name old time/op new time/op delta BinaryTree17 16.6s ± 0% 16.2s ± 1% -2.59% (p=0.016 n=4+5) Fannkuch11 11.1s ± 1% 10.8s ± 0% -2.65% (p=0.008 n=5+5) FmtFprintfEmpty 231ns ± 1% 217ns ± 0% -6.06% (p=0.008 n=5+5) FmtFprintfString 407ns ± 3% 375ns ± 2% -7.81% (p=0.008 n=5+5) FmtFprintfInt 466ns ± 2% 430ns ± 0% -7.79% (p=0.016 n=5+4) FmtFprintfIntInt 719ns ± 2% 673ns ± 2% -6.37% (p=0.008 n=5+5) FmtFprintfPrefixedInt 706ns ± 1% 676ns ± 3% -4.31% (p=0.008 n=5+5) FmtFprintfFloat 1.01µs ± 1% 0.97µs ± 1% -4.30% (p=0.008 n=5+5) FmtManyArgs 2.67µs ± 1% 2.51µs ± 1% -5.95% (p=0.008 n=5+5) GobDecode 30.7ms ± 9% 31.3ms ±34% ~ (p=0.222 n=5+5) GobEncode 24.2ms ±23% 20.2ms ± 0% -16.36% (p=0.016 n=5+4) Gzip 852ms ± 0% 823ms ± 0% -3.38% (p=0.016 n=4+5) Gunzip 160ms ± 1% 151ms ± 1% -5.37% (p=0.008 n=5+5) JSONEncode 35.7ms ± 1% 34.3ms ± 1% -3.81% (p=0.008 n=5+5) JSONDecode 247ms ± 8% 254ms ± 7% ~ (p=0.548 n=5+5) Mandelbrot200 5.39ms ± 0% 5.41ms ± 0% +0.42% (p=0.008 n=5+5) GoParse 18.5ms ± 1% 18.3ms ± 2% ~ (p=0.343 n=4+4) RegexpMatchEasy0_32 424ns ± 2% 397ns ± 0% -6.23% (p=0.008 n=5+5) RegexpMatchEasy0_1K 2.88µs ± 0% 2.86µs ± 1% ~ (p=0.079 n=5+5) RegexpMatchEasy1_32 395ns ± 2% 370ns ± 1% -6.23% (p=0.008 n=5+5) RegexpMatchEasy1_1K 3.26µs ± 0% 3.19µs ± 1% -2.06% (p=0.008 n=5+5) RegexpMatchMedium_32 564ns ± 1% 532ns ± 0% -5.71% (p=0.008 n=5+5) RegexpMatchMedium_1K 146µs ± 2% 140µs ± 1% -4.62% (p=0.008 n=5+5) RegexpMatchHard_32 8.47µs ± 1% 7.91µs ± 1% -6.65% (p=0.008 n=5+5) RegexpMatchHard_1K 253µs ± 1% 236µs ± 2% -6.66% (p=0.008 n=5+5) Revcomp 1.78s ± 4% 1.76s ± 5% ~ (p=1.000 n=5+5) Template 292ms ±29% 269ms ± 5% ~ (p=0.690 n=5+5) TimeParse 1.61µs ± 4% 1.54µs ± 1% -4.42% (p=0.008 n=5+5) TimeFormat 1.66µs ± 3% 1.58µs ± 1% -5.22% (p=0.008 n=5+5) [Geo mean] 232µs 221µs -4.54% name old speed new speed delta GobDecode 25.0MB/s ± 8% 25.1MB/s ±27% ~ (p=0.222 n=5+5) GobEncode 32.8MB/s ±21% 38.0MB/s ± 0% +15.84% (p=0.016 n=5+4) Gzip 22.8MB/s ± 0% 23.6MB/s ± 0% +3.49% (p=0.016 n=4+5) Gunzip 121MB/s ± 1% 128MB/s ± 1% +5.68% (p=0.008 n=5+5) JSONEncode 54.4MB/s ± 1% 56.5MB/s ± 1% +3.97% (p=0.008 n=5+5) JSONDecode 7.88MB/s ± 8% 7.65MB/s ± 8% ~ (p=0.548 n=5+5) GoParse 3.07MB/s ± 8% 3.00MB/s ±22% ~ (p=0.579 n=5+5) RegexpMatchEasy0_32 75.6MB/s ± 2% 80.5MB/s ± 0% +6.58% (p=0.008 n=5+5) RegexpMatchEasy0_1K 356MB/s ± 0% 358MB/s ± 1% ~ (p=0.095 n=5+5) RegexpMatchEasy1_32 81.1MB/s ± 2% 86.5MB/s ± 1% +6.69% (p=0.008 n=5+5) RegexpMatchEasy1_1K 314MB/s ± 0% 320MB/s ± 0% +2.10% (p=0.008 n=5+5) RegexpMatchMedium_32 1.77MB/s ± 1% 1.88MB/s ± 0% +6.09% (p=0.016 n=5+4) RegexpMatchMedium_1K 6.99MB/s ± 2% 7.33MB/s ± 1% +4.83% (p=0.008 n=5+5) RegexpMatchHard_32 3.78MB/s ± 1% 4.04MB/s ± 1% +7.04% (p=0.008 n=5+5) RegexpMatchHard_1K 4.04MB/s ± 1% 4.33MB/s ± 2% +7.17% (p=0.008 n=5+5) Revcomp 143MB/s ± 4% 145MB/s ± 5% ~ (p=1.000 n=5+5) Template 6.77MB/s ±24% 7.22MB/s ± 5% ~ (p=0.690 n=5+5) [Geo mean] 24.4MB/s 25.4MB/s +4.18% Change-Id: Ib80716e62992aec28b2c4a96af280c278f83aa49 Reviewed-on: https://go-review.googlesource.com/c/go/+/173980 Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Richard Musiol <neelance@gmail.com>
2019-04-25 17:25:54 -04:00
case "_rt0_wasm_js", "wasm_export_run", "wasm_export_resume", "wasm_export_getsp", "wasm_pc_f_loop",
"runtime.wasmMove", "runtime.wasmZero", "runtime.wasmDiv", "runtime.wasmTruncS", "runtime.wasmTruncU", "memeqbody":
varDecls = []*varDecl{}
useAssemblyRegMap()
cmd/internal/obj/wasm: cache SP in a local We use Wasm global variables extensively for simulating registers, especially SP. V8 does not handle global variables efficiently. This CL reduces global variable accesses by caching the global SP in a local variable in each function. The local cache is set on function entry and updated after each call (where the stack could have moved). Within a function, the SP access will use the local variable. Supersedes https://golang.org/cl/173979. Running on Chrome Version 73.0.3683.103 on darwin/amd64: name old time/op new time/op delta BinaryTree17 15.3s ± 2% 14.5s ± 3% -5.20% (p=0.000 n=9+10) Fannkuch11 8.91s ± 2% 9.48s ± 2% +6.41% (p=0.000 n=9+10) FmtFprintfEmpty 197ns ± 5% 165ns ± 3% -16.09% (p=0.000 n=9+8) FmtFprintfString 354ns ± 8% 325ns ± 7% -8.33% (p=0.001 n=10+10) FmtFprintfInt 400ns ± 4% 368ns ± 6% -8.01% (p=0.000 n=10+10) FmtFprintfIntInt 618ns ± 3% 587ns ± 6% -4.97% (p=0.001 n=10+10) FmtFprintfPrefixedInt 637ns ± 4% 606ns ± 4% -4.88% (p=0.000 n=10+10) FmtFprintfFloat 965ns ± 7% 898ns ± 4% -6.97% (p=0.000 n=10+10) FmtManyArgs 2.34µs ± 1% 2.24µs ± 3% -4.40% (p=0.000 n=9+10) GobDecode 29.8ms ± 3% 28.8ms ± 6% -3.60% (p=0.006 n=9+10) GobEncode 20.5ms ± 8% 17.6ms ± 3% -14.32% (p=0.000 n=10+10) Gzip 714ms ± 3% 718ms ± 8% ~ (p=0.971 n=10+10) Gunzip 148ms ± 3% 136ms ± 3% -7.99% (p=0.000 n=10+9) HTTPClientServer 219µs ± 3% 215µs ± 4% ~ (p=0.190 n=10+10) JSONEncode 35.1ms ± 2% 31.8ms ±13% -9.52% (p=0.002 n=10+10) JSONDecode 220ms ± 3% 207ms ± 5% -5.87% (p=0.000 n=10+10) Mandelbrot200 5.22ms ± 1% 5.11ms ± 4% -2.11% (p=0.027 n=8+10) GoParse 17.2ms ± 6% 16.1ms ± 5% -6.63% (p=0.000 n=10+9) RegexpMatchEasy0_32 375ns ± 3% 340ns ± 3% -9.25% (p=0.000 n=10+10) RegexpMatchEasy0_1K 2.70µs ± 3% 2.65µs ± 4% ~ (p=0.118 n=10+10) RegexpMatchEasy1_32 341ns ± 2% 305ns ± 4% -10.62% (p=0.000 n=9+10) RegexpMatchEasy1_1K 3.20µs ± 3% 2.99µs ± 3% -6.35% (p=0.000 n=10+10) RegexpMatchMedium_32 520ns ± 3% 501ns ± 4% -3.64% (p=0.002 n=9+10) RegexpMatchMedium_1K 145µs ± 7% 128µs ± 3% -11.57% (p=0.000 n=9+10) RegexpMatchHard_32 7.88µs ± 3% 7.01µs ± 5% -10.97% (p=0.000 n=10+10) RegexpMatchHard_1K 237µs ± 5% 207µs ± 4% -12.71% (p=0.000 n=9+10) Revcomp 2.34s ± 1% 2.31s ± 5% ~ (p=0.230 n=7+10) Template 261ms ± 7% 246ms ± 5% -5.93% (p=0.007 n=10+10) TimeParse 1.47µs ± 3% 1.39µs ± 5% -5.75% (p=0.000 n=9+10) TimeFormat 1.52µs ± 3% 1.43µs ± 4% -6.42% (p=0.000 n=8+10) name old speed new speed delta GobDecode 25.7MB/s ± 3% 26.7MB/s ± 5% +3.77% (p=0.006 n=9+10) GobEncode 37.5MB/s ± 8% 43.7MB/s ± 3% +16.61% (p=0.000 n=10+10) Gzip 27.2MB/s ± 3% 27.0MB/s ± 7% ~ (p=0.971 n=10+10) Gunzip 131MB/s ± 3% 142MB/s ± 5% +8.07% (p=0.000 n=10+10) JSONEncode 55.2MB/s ± 2% 61.2MB/s ±12% +10.80% (p=0.002 n=10+10) JSONDecode 8.84MB/s ± 3% 9.39MB/s ± 5% +6.28% (p=0.000 n=10+10) GoParse 3.37MB/s ± 6% 3.61MB/s ± 5% +7.09% (p=0.000 n=10+9) RegexpMatchEasy0_32 85.3MB/s ± 3% 94.0MB/s ± 3% +10.20% (p=0.000 n=10+10) RegexpMatchEasy0_1K 379MB/s ± 3% 387MB/s ± 4% ~ (p=0.123 n=10+10) RegexpMatchEasy1_32 93.9MB/s ± 2% 105.1MB/s ± 4% +11.96% (p=0.000 n=9+10) RegexpMatchEasy1_1K 320MB/s ± 3% 342MB/s ± 3% +6.79% (p=0.000 n=10+10) RegexpMatchMedium_32 1.92MB/s ± 2% 2.00MB/s ± 3% +3.94% (p=0.001 n=9+10) RegexpMatchMedium_1K 7.09MB/s ± 6% 8.01MB/s ± 3% +13.00% (p=0.000 n=9+10) RegexpMatchHard_32 4.06MB/s ± 3% 4.56MB/s ± 5% +12.38% (p=0.000 n=10+10) RegexpMatchHard_1K 4.32MB/s ± 4% 4.96MB/s ± 4% +14.60% (p=0.000 n=9+10) Revcomp 109MB/s ± 1% 110MB/s ± 5% ~ (p=0.219 n=7+10) Template 7.44MB/s ± 8% 7.91MB/s ± 5% +6.30% (p=0.007 n=10+10) Change-Id: I5828cf6b23ce104c02addc2642aba48dd6c48aab Reviewed-on: https://go-review.googlesource.com/c/go/+/174062 Run-TryBot: Richard Musiol <neelance@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com>
2019-04-27 01:16:10 +02:00
case "memchr", "memcmp":
varDecls = []*varDecl{{count: 2, typ: i32}}
useAssemblyRegMap()
cmd/internal/obj/wasm: cache SP in a local We use Wasm global variables extensively for simulating registers, especially SP. V8 does not handle global variables efficiently. This CL reduces global variable accesses by caching the global SP in a local variable in each function. The local cache is set on function entry and updated after each call (where the stack could have moved). Within a function, the SP access will use the local variable. Supersedes https://golang.org/cl/173979. Running on Chrome Version 73.0.3683.103 on darwin/amd64: name old time/op new time/op delta BinaryTree17 15.3s ± 2% 14.5s ± 3% -5.20% (p=0.000 n=9+10) Fannkuch11 8.91s ± 2% 9.48s ± 2% +6.41% (p=0.000 n=9+10) FmtFprintfEmpty 197ns ± 5% 165ns ± 3% -16.09% (p=0.000 n=9+8) FmtFprintfString 354ns ± 8% 325ns ± 7% -8.33% (p=0.001 n=10+10) FmtFprintfInt 400ns ± 4% 368ns ± 6% -8.01% (p=0.000 n=10+10) FmtFprintfIntInt 618ns ± 3% 587ns ± 6% -4.97% (p=0.001 n=10+10) FmtFprintfPrefixedInt 637ns ± 4% 606ns ± 4% -4.88% (p=0.000 n=10+10) FmtFprintfFloat 965ns ± 7% 898ns ± 4% -6.97% (p=0.000 n=10+10) FmtManyArgs 2.34µs ± 1% 2.24µs ± 3% -4.40% (p=0.000 n=9+10) GobDecode 29.8ms ± 3% 28.8ms ± 6% -3.60% (p=0.006 n=9+10) GobEncode 20.5ms ± 8% 17.6ms ± 3% -14.32% (p=0.000 n=10+10) Gzip 714ms ± 3% 718ms ± 8% ~ (p=0.971 n=10+10) Gunzip 148ms ± 3% 136ms ± 3% -7.99% (p=0.000 n=10+9) HTTPClientServer 219µs ± 3% 215µs ± 4% ~ (p=0.190 n=10+10) JSONEncode 35.1ms ± 2% 31.8ms ±13% -9.52% (p=0.002 n=10+10) JSONDecode 220ms ± 3% 207ms ± 5% -5.87% (p=0.000 n=10+10) Mandelbrot200 5.22ms ± 1% 5.11ms ± 4% -2.11% (p=0.027 n=8+10) GoParse 17.2ms ± 6% 16.1ms ± 5% -6.63% (p=0.000 n=10+9) RegexpMatchEasy0_32 375ns ± 3% 340ns ± 3% -9.25% (p=0.000 n=10+10) RegexpMatchEasy0_1K 2.70µs ± 3% 2.65µs ± 4% ~ (p=0.118 n=10+10) RegexpMatchEasy1_32 341ns ± 2% 305ns ± 4% -10.62% (p=0.000 n=9+10) RegexpMatchEasy1_1K 3.20µs ± 3% 2.99µs ± 3% -6.35% (p=0.000 n=10+10) RegexpMatchMedium_32 520ns ± 3% 501ns ± 4% -3.64% (p=0.002 n=9+10) RegexpMatchMedium_1K 145µs ± 7% 128µs ± 3% -11.57% (p=0.000 n=9+10) RegexpMatchHard_32 7.88µs ± 3% 7.01µs ± 5% -10.97% (p=0.000 n=10+10) RegexpMatchHard_1K 237µs ± 5% 207µs ± 4% -12.71% (p=0.000 n=9+10) Revcomp 2.34s ± 1% 2.31s ± 5% ~ (p=0.230 n=7+10) Template 261ms ± 7% 246ms ± 5% -5.93% (p=0.007 n=10+10) TimeParse 1.47µs ± 3% 1.39µs ± 5% -5.75% (p=0.000 n=9+10) TimeFormat 1.52µs ± 3% 1.43µs ± 4% -6.42% (p=0.000 n=8+10) name old speed new speed delta GobDecode 25.7MB/s ± 3% 26.7MB/s ± 5% +3.77% (p=0.006 n=9+10) GobEncode 37.5MB/s ± 8% 43.7MB/s ± 3% +16.61% (p=0.000 n=10+10) Gzip 27.2MB/s ± 3% 27.0MB/s ± 7% ~ (p=0.971 n=10+10) Gunzip 131MB/s ± 3% 142MB/s ± 5% +8.07% (p=0.000 n=10+10) JSONEncode 55.2MB/s ± 2% 61.2MB/s ±12% +10.80% (p=0.002 n=10+10) JSONDecode 8.84MB/s ± 3% 9.39MB/s ± 5% +6.28% (p=0.000 n=10+10) GoParse 3.37MB/s ± 6% 3.61MB/s ± 5% +7.09% (p=0.000 n=10+9) RegexpMatchEasy0_32 85.3MB/s ± 3% 94.0MB/s ± 3% +10.20% (p=0.000 n=10+10) RegexpMatchEasy0_1K 379MB/s ± 3% 387MB/s ± 4% ~ (p=0.123 n=10+10) RegexpMatchEasy1_32 93.9MB/s ± 2% 105.1MB/s ± 4% +11.96% (p=0.000 n=9+10) RegexpMatchEasy1_1K 320MB/s ± 3% 342MB/s ± 3% +6.79% (p=0.000 n=10+10) RegexpMatchMedium_32 1.92MB/s ± 2% 2.00MB/s ± 3% +3.94% (p=0.001 n=9+10) RegexpMatchMedium_1K 7.09MB/s ± 6% 8.01MB/s ± 3% +13.00% (p=0.000 n=9+10) RegexpMatchHard_32 4.06MB/s ± 3% 4.56MB/s ± 5% +12.38% (p=0.000 n=10+10) RegexpMatchHard_1K 4.32MB/s ± 4% 4.96MB/s ± 4% +14.60% (p=0.000 n=9+10) Revcomp 109MB/s ± 1% 110MB/s ± 5% ~ (p=0.219 n=7+10) Template 7.44MB/s ± 8% 7.91MB/s ± 5% +6.30% (p=0.007 n=10+10) Change-Id: I5828cf6b23ce104c02addc2642aba48dd6c48aab Reviewed-on: https://go-review.googlesource.com/c/go/+/174062 Run-TryBot: Richard Musiol <neelance@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com>
2019-04-27 01:16:10 +02:00
case "cmpbody":
varDecls = []*varDecl{{count: 2, typ: i64}}
useAssemblyRegMap()
cmd/internal/obj/wasm: cache SP in a local We use Wasm global variables extensively for simulating registers, especially SP. V8 does not handle global variables efficiently. This CL reduces global variable accesses by caching the global SP in a local variable in each function. The local cache is set on function entry and updated after each call (where the stack could have moved). Within a function, the SP access will use the local variable. Supersedes https://golang.org/cl/173979. Running on Chrome Version 73.0.3683.103 on darwin/amd64: name old time/op new time/op delta BinaryTree17 15.3s ± 2% 14.5s ± 3% -5.20% (p=0.000 n=9+10) Fannkuch11 8.91s ± 2% 9.48s ± 2% +6.41% (p=0.000 n=9+10) FmtFprintfEmpty 197ns ± 5% 165ns ± 3% -16.09% (p=0.000 n=9+8) FmtFprintfString 354ns ± 8% 325ns ± 7% -8.33% (p=0.001 n=10+10) FmtFprintfInt 400ns ± 4% 368ns ± 6% -8.01% (p=0.000 n=10+10) FmtFprintfIntInt 618ns ± 3% 587ns ± 6% -4.97% (p=0.001 n=10+10) FmtFprintfPrefixedInt 637ns ± 4% 606ns ± 4% -4.88% (p=0.000 n=10+10) FmtFprintfFloat 965ns ± 7% 898ns ± 4% -6.97% (p=0.000 n=10+10) FmtManyArgs 2.34µs ± 1% 2.24µs ± 3% -4.40% (p=0.000 n=9+10) GobDecode 29.8ms ± 3% 28.8ms ± 6% -3.60% (p=0.006 n=9+10) GobEncode 20.5ms ± 8% 17.6ms ± 3% -14.32% (p=0.000 n=10+10) Gzip 714ms ± 3% 718ms ± 8% ~ (p=0.971 n=10+10) Gunzip 148ms ± 3% 136ms ± 3% -7.99% (p=0.000 n=10+9) HTTPClientServer 219µs ± 3% 215µs ± 4% ~ (p=0.190 n=10+10) JSONEncode 35.1ms ± 2% 31.8ms ±13% -9.52% (p=0.002 n=10+10) JSONDecode 220ms ± 3% 207ms ± 5% -5.87% (p=0.000 n=10+10) Mandelbrot200 5.22ms ± 1% 5.11ms ± 4% -2.11% (p=0.027 n=8+10) GoParse 17.2ms ± 6% 16.1ms ± 5% -6.63% (p=0.000 n=10+9) RegexpMatchEasy0_32 375ns ± 3% 340ns ± 3% -9.25% (p=0.000 n=10+10) RegexpMatchEasy0_1K 2.70µs ± 3% 2.65µs ± 4% ~ (p=0.118 n=10+10) RegexpMatchEasy1_32 341ns ± 2% 305ns ± 4% -10.62% (p=0.000 n=9+10) RegexpMatchEasy1_1K 3.20µs ± 3% 2.99µs ± 3% -6.35% (p=0.000 n=10+10) RegexpMatchMedium_32 520ns ± 3% 501ns ± 4% -3.64% (p=0.002 n=9+10) RegexpMatchMedium_1K 145µs ± 7% 128µs ± 3% -11.57% (p=0.000 n=9+10) RegexpMatchHard_32 7.88µs ± 3% 7.01µs ± 5% -10.97% (p=0.000 n=10+10) RegexpMatchHard_1K 237µs ± 5% 207µs ± 4% -12.71% (p=0.000 n=9+10) Revcomp 2.34s ± 1% 2.31s ± 5% ~ (p=0.230 n=7+10) Template 261ms ± 7% 246ms ± 5% -5.93% (p=0.007 n=10+10) TimeParse 1.47µs ± 3% 1.39µs ± 5% -5.75% (p=0.000 n=9+10) TimeFormat 1.52µs ± 3% 1.43µs ± 4% -6.42% (p=0.000 n=8+10) name old speed new speed delta GobDecode 25.7MB/s ± 3% 26.7MB/s ± 5% +3.77% (p=0.006 n=9+10) GobEncode 37.5MB/s ± 8% 43.7MB/s ± 3% +16.61% (p=0.000 n=10+10) Gzip 27.2MB/s ± 3% 27.0MB/s ± 7% ~ (p=0.971 n=10+10) Gunzip 131MB/s ± 3% 142MB/s ± 5% +8.07% (p=0.000 n=10+10) JSONEncode 55.2MB/s ± 2% 61.2MB/s ±12% +10.80% (p=0.002 n=10+10) JSONDecode 8.84MB/s ± 3% 9.39MB/s ± 5% +6.28% (p=0.000 n=10+10) GoParse 3.37MB/s ± 6% 3.61MB/s ± 5% +7.09% (p=0.000 n=10+9) RegexpMatchEasy0_32 85.3MB/s ± 3% 94.0MB/s ± 3% +10.20% (p=0.000 n=10+10) RegexpMatchEasy0_1K 379MB/s ± 3% 387MB/s ± 4% ~ (p=0.123 n=10+10) RegexpMatchEasy1_32 93.9MB/s ± 2% 105.1MB/s ± 4% +11.96% (p=0.000 n=9+10) RegexpMatchEasy1_1K 320MB/s ± 3% 342MB/s ± 3% +6.79% (p=0.000 n=10+10) RegexpMatchMedium_32 1.92MB/s ± 2% 2.00MB/s ± 3% +3.94% (p=0.001 n=9+10) RegexpMatchMedium_1K 7.09MB/s ± 6% 8.01MB/s ± 3% +13.00% (p=0.000 n=9+10) RegexpMatchHard_32 4.06MB/s ± 3% 4.56MB/s ± 5% +12.38% (p=0.000 n=10+10) RegexpMatchHard_1K 4.32MB/s ± 4% 4.96MB/s ± 4% +14.60% (p=0.000 n=9+10) Revcomp 109MB/s ± 1% 110MB/s ± 5% ~ (p=0.219 n=7+10) Template 7.44MB/s ± 8% 7.91MB/s ± 5% +6.30% (p=0.007 n=10+10) Change-Id: I5828cf6b23ce104c02addc2642aba48dd6c48aab Reviewed-on: https://go-review.googlesource.com/c/go/+/174062 Run-TryBot: Richard Musiol <neelance@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com>
2019-04-27 01:16:10 +02:00
case "runtime.gcWriteBarrier":
varDecls = []*varDecl{{count: 4, typ: i64}}
useAssemblyRegMap()
default:
// Normal calling convention: PC_B as WebAssembly parameter. First local variable is local SP cache.
regVars[REG_PC_B-MINREG] = &regVar{false, 0}
cmd/internal/obj/wasm: cache SP in a local We use Wasm global variables extensively for simulating registers, especially SP. V8 does not handle global variables efficiently. This CL reduces global variable accesses by caching the global SP in a local variable in each function. The local cache is set on function entry and updated after each call (where the stack could have moved). Within a function, the SP access will use the local variable. Supersedes https://golang.org/cl/173979. Running on Chrome Version 73.0.3683.103 on darwin/amd64: name old time/op new time/op delta BinaryTree17 15.3s ± 2% 14.5s ± 3% -5.20% (p=0.000 n=9+10) Fannkuch11 8.91s ± 2% 9.48s ± 2% +6.41% (p=0.000 n=9+10) FmtFprintfEmpty 197ns ± 5% 165ns ± 3% -16.09% (p=0.000 n=9+8) FmtFprintfString 354ns ± 8% 325ns ± 7% -8.33% (p=0.001 n=10+10) FmtFprintfInt 400ns ± 4% 368ns ± 6% -8.01% (p=0.000 n=10+10) FmtFprintfIntInt 618ns ± 3% 587ns ± 6% -4.97% (p=0.001 n=10+10) FmtFprintfPrefixedInt 637ns ± 4% 606ns ± 4% -4.88% (p=0.000 n=10+10) FmtFprintfFloat 965ns ± 7% 898ns ± 4% -6.97% (p=0.000 n=10+10) FmtManyArgs 2.34µs ± 1% 2.24µs ± 3% -4.40% (p=0.000 n=9+10) GobDecode 29.8ms ± 3% 28.8ms ± 6% -3.60% (p=0.006 n=9+10) GobEncode 20.5ms ± 8% 17.6ms ± 3% -14.32% (p=0.000 n=10+10) Gzip 714ms ± 3% 718ms ± 8% ~ (p=0.971 n=10+10) Gunzip 148ms ± 3% 136ms ± 3% -7.99% (p=0.000 n=10+9) HTTPClientServer 219µs ± 3% 215µs ± 4% ~ (p=0.190 n=10+10) JSONEncode 35.1ms ± 2% 31.8ms ±13% -9.52% (p=0.002 n=10+10) JSONDecode 220ms ± 3% 207ms ± 5% -5.87% (p=0.000 n=10+10) Mandelbrot200 5.22ms ± 1% 5.11ms ± 4% -2.11% (p=0.027 n=8+10) GoParse 17.2ms ± 6% 16.1ms ± 5% -6.63% (p=0.000 n=10+9) RegexpMatchEasy0_32 375ns ± 3% 340ns ± 3% -9.25% (p=0.000 n=10+10) RegexpMatchEasy0_1K 2.70µs ± 3% 2.65µs ± 4% ~ (p=0.118 n=10+10) RegexpMatchEasy1_32 341ns ± 2% 305ns ± 4% -10.62% (p=0.000 n=9+10) RegexpMatchEasy1_1K 3.20µs ± 3% 2.99µs ± 3% -6.35% (p=0.000 n=10+10) RegexpMatchMedium_32 520ns ± 3% 501ns ± 4% -3.64% (p=0.002 n=9+10) RegexpMatchMedium_1K 145µs ± 7% 128µs ± 3% -11.57% (p=0.000 n=9+10) RegexpMatchHard_32 7.88µs ± 3% 7.01µs ± 5% -10.97% (p=0.000 n=10+10) RegexpMatchHard_1K 237µs ± 5% 207µs ± 4% -12.71% (p=0.000 n=9+10) Revcomp 2.34s ± 1% 2.31s ± 5% ~ (p=0.230 n=7+10) Template 261ms ± 7% 246ms ± 5% -5.93% (p=0.007 n=10+10) TimeParse 1.47µs ± 3% 1.39µs ± 5% -5.75% (p=0.000 n=9+10) TimeFormat 1.52µs ± 3% 1.43µs ± 4% -6.42% (p=0.000 n=8+10) name old speed new speed delta GobDecode 25.7MB/s ± 3% 26.7MB/s ± 5% +3.77% (p=0.006 n=9+10) GobEncode 37.5MB/s ± 8% 43.7MB/s ± 3% +16.61% (p=0.000 n=10+10) Gzip 27.2MB/s ± 3% 27.0MB/s ± 7% ~ (p=0.971 n=10+10) Gunzip 131MB/s ± 3% 142MB/s ± 5% +8.07% (p=0.000 n=10+10) JSONEncode 55.2MB/s ± 2% 61.2MB/s ±12% +10.80% (p=0.002 n=10+10) JSONDecode 8.84MB/s ± 3% 9.39MB/s ± 5% +6.28% (p=0.000 n=10+10) GoParse 3.37MB/s ± 6% 3.61MB/s ± 5% +7.09% (p=0.000 n=10+9) RegexpMatchEasy0_32 85.3MB/s ± 3% 94.0MB/s ± 3% +10.20% (p=0.000 n=10+10) RegexpMatchEasy0_1K 379MB/s ± 3% 387MB/s ± 4% ~ (p=0.123 n=10+10) RegexpMatchEasy1_32 93.9MB/s ± 2% 105.1MB/s ± 4% +11.96% (p=0.000 n=9+10) RegexpMatchEasy1_1K 320MB/s ± 3% 342MB/s ± 3% +6.79% (p=0.000 n=10+10) RegexpMatchMedium_32 1.92MB/s ± 2% 2.00MB/s ± 3% +3.94% (p=0.001 n=9+10) RegexpMatchMedium_1K 7.09MB/s ± 6% 8.01MB/s ± 3% +13.00% (p=0.000 n=9+10) RegexpMatchHard_32 4.06MB/s ± 3% 4.56MB/s ± 5% +12.38% (p=0.000 n=10+10) RegexpMatchHard_1K 4.32MB/s ± 4% 4.96MB/s ± 4% +14.60% (p=0.000 n=9+10) Revcomp 109MB/s ± 1% 110MB/s ± 5% ~ (p=0.219 n=7+10) Template 7.44MB/s ± 8% 7.91MB/s ± 5% +6.30% (p=0.007 n=10+10) Change-Id: I5828cf6b23ce104c02addc2642aba48dd6c48aab Reviewed-on: https://go-review.googlesource.com/c/go/+/174062 Run-TryBot: Richard Musiol <neelance@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com>
2019-04-27 01:16:10 +02:00
hasLocalSP = true
var regUsed [MAXREG - MINREG]bool
for p := s.Func.Text; p != nil; p = p.Link {
if p.From.Reg != 0 {
regUsed[p.From.Reg-MINREG] = true
}
if p.To.Reg != 0 {
regUsed[p.To.Reg-MINREG] = true
}
}
regs := []int16{REG_SP}
for reg := int16(REG_R0); reg <= REG_F31; reg++ {
if regUsed[reg-MINREG] {
regs = append(regs, reg)
}
}
var lastDecl *varDecl
for i, reg := range regs {
t := regType(reg)
if lastDecl == nil || lastDecl.typ != t {
lastDecl = &varDecl{
count: 0,
typ: t,
}
varDecls = append(varDecls, lastDecl)
}
lastDecl.count++
if reg != REG_SP {
regVars[reg-MINREG] = &regVar{false, 1 + uint64(i)}
}
}
}
w := new(bytes.Buffer)
writeUleb128(w, uint64(len(varDecls)))
for _, decl := range varDecls {
writeUleb128(w, decl.count)
w.WriteByte(byte(decl.typ))
}
cmd/internal/obj/wasm: cache SP in a local We use Wasm global variables extensively for simulating registers, especially SP. V8 does not handle global variables efficiently. This CL reduces global variable accesses by caching the global SP in a local variable in each function. The local cache is set on function entry and updated after each call (where the stack could have moved). Within a function, the SP access will use the local variable. Supersedes https://golang.org/cl/173979. Running on Chrome Version 73.0.3683.103 on darwin/amd64: name old time/op new time/op delta BinaryTree17 15.3s ± 2% 14.5s ± 3% -5.20% (p=0.000 n=9+10) Fannkuch11 8.91s ± 2% 9.48s ± 2% +6.41% (p=0.000 n=9+10) FmtFprintfEmpty 197ns ± 5% 165ns ± 3% -16.09% (p=0.000 n=9+8) FmtFprintfString 354ns ± 8% 325ns ± 7% -8.33% (p=0.001 n=10+10) FmtFprintfInt 400ns ± 4% 368ns ± 6% -8.01% (p=0.000 n=10+10) FmtFprintfIntInt 618ns ± 3% 587ns ± 6% -4.97% (p=0.001 n=10+10) FmtFprintfPrefixedInt 637ns ± 4% 606ns ± 4% -4.88% (p=0.000 n=10+10) FmtFprintfFloat 965ns ± 7% 898ns ± 4% -6.97% (p=0.000 n=10+10) FmtManyArgs 2.34µs ± 1% 2.24µs ± 3% -4.40% (p=0.000 n=9+10) GobDecode 29.8ms ± 3% 28.8ms ± 6% -3.60% (p=0.006 n=9+10) GobEncode 20.5ms ± 8% 17.6ms ± 3% -14.32% (p=0.000 n=10+10) Gzip 714ms ± 3% 718ms ± 8% ~ (p=0.971 n=10+10) Gunzip 148ms ± 3% 136ms ± 3% -7.99% (p=0.000 n=10+9) HTTPClientServer 219µs ± 3% 215µs ± 4% ~ (p=0.190 n=10+10) JSONEncode 35.1ms ± 2% 31.8ms ±13% -9.52% (p=0.002 n=10+10) JSONDecode 220ms ± 3% 207ms ± 5% -5.87% (p=0.000 n=10+10) Mandelbrot200 5.22ms ± 1% 5.11ms ± 4% -2.11% (p=0.027 n=8+10) GoParse 17.2ms ± 6% 16.1ms ± 5% -6.63% (p=0.000 n=10+9) RegexpMatchEasy0_32 375ns ± 3% 340ns ± 3% -9.25% (p=0.000 n=10+10) RegexpMatchEasy0_1K 2.70µs ± 3% 2.65µs ± 4% ~ (p=0.118 n=10+10) RegexpMatchEasy1_32 341ns ± 2% 305ns ± 4% -10.62% (p=0.000 n=9+10) RegexpMatchEasy1_1K 3.20µs ± 3% 2.99µs ± 3% -6.35% (p=0.000 n=10+10) RegexpMatchMedium_32 520ns ± 3% 501ns ± 4% -3.64% (p=0.002 n=9+10) RegexpMatchMedium_1K 145µs ± 7% 128µs ± 3% -11.57% (p=0.000 n=9+10) RegexpMatchHard_32 7.88µs ± 3% 7.01µs ± 5% -10.97% (p=0.000 n=10+10) RegexpMatchHard_1K 237µs ± 5% 207µs ± 4% -12.71% (p=0.000 n=9+10) Revcomp 2.34s ± 1% 2.31s ± 5% ~ (p=0.230 n=7+10) Template 261ms ± 7% 246ms ± 5% -5.93% (p=0.007 n=10+10) TimeParse 1.47µs ± 3% 1.39µs ± 5% -5.75% (p=0.000 n=9+10) TimeFormat 1.52µs ± 3% 1.43µs ± 4% -6.42% (p=0.000 n=8+10) name old speed new speed delta GobDecode 25.7MB/s ± 3% 26.7MB/s ± 5% +3.77% (p=0.006 n=9+10) GobEncode 37.5MB/s ± 8% 43.7MB/s ± 3% +16.61% (p=0.000 n=10+10) Gzip 27.2MB/s ± 3% 27.0MB/s ± 7% ~ (p=0.971 n=10+10) Gunzip 131MB/s ± 3% 142MB/s ± 5% +8.07% (p=0.000 n=10+10) JSONEncode 55.2MB/s ± 2% 61.2MB/s ±12% +10.80% (p=0.002 n=10+10) JSONDecode 8.84MB/s ± 3% 9.39MB/s ± 5% +6.28% (p=0.000 n=10+10) GoParse 3.37MB/s ± 6% 3.61MB/s ± 5% +7.09% (p=0.000 n=10+9) RegexpMatchEasy0_32 85.3MB/s ± 3% 94.0MB/s ± 3% +10.20% (p=0.000 n=10+10) RegexpMatchEasy0_1K 379MB/s ± 3% 387MB/s ± 4% ~ (p=0.123 n=10+10) RegexpMatchEasy1_32 93.9MB/s ± 2% 105.1MB/s ± 4% +11.96% (p=0.000 n=9+10) RegexpMatchEasy1_1K 320MB/s ± 3% 342MB/s ± 3% +6.79% (p=0.000 n=10+10) RegexpMatchMedium_32 1.92MB/s ± 2% 2.00MB/s ± 3% +3.94% (p=0.001 n=9+10) RegexpMatchMedium_1K 7.09MB/s ± 6% 8.01MB/s ± 3% +13.00% (p=0.000 n=9+10) RegexpMatchHard_32 4.06MB/s ± 3% 4.56MB/s ± 5% +12.38% (p=0.000 n=10+10) RegexpMatchHard_1K 4.32MB/s ± 4% 4.96MB/s ± 4% +14.60% (p=0.000 n=9+10) Revcomp 109MB/s ± 1% 110MB/s ± 5% ~ (p=0.219 n=7+10) Template 7.44MB/s ± 8% 7.91MB/s ± 5% +6.30% (p=0.007 n=10+10) Change-Id: I5828cf6b23ce104c02addc2642aba48dd6c48aab Reviewed-on: https://go-review.googlesource.com/c/go/+/174062 Run-TryBot: Richard Musiol <neelance@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com>
2019-04-27 01:16:10 +02:00
if hasLocalSP {
// Copy SP from its global variable into a local variable. Accessing a local variable is more efficient.
updateLocalSP(w)
}
cmd, runtime: remove PC_F & PC_B globals on Wasm Following the previous CL, this removes more global variables on Wasm. PC_B is used mostly for intra-function jumps, and for a function telling its callee where to start or resume. This usage can be served by a parameter. The top level loop (wasm_pc_f_loop) uses PC_B for resuming a function. This value is either set by gogo, or loaded from the Go stack at function return. Instead of loading PC_B at each function return, we could make gogo stores PC_B at the same stack location, and let the top level loop do the load. This way, we don't need to use global PC_B to communicate with the top level loop, and we can replace global PC_B with a parameter. PC_F is similar. It is even more so in that the only reader is the top level loop. Let the top level loop read it from the stack, and we can get rid of PC_F entirely. PC_F and PC_B are used less entensively as SP, so this CL has smaller performance gain. Running on Chrome 74.0.3729.108 on Linux/AMD64, name old time/op new time/op delta BinaryTree17 16.6s ± 0% 16.2s ± 1% -2.59% (p=0.016 n=4+5) Fannkuch11 11.1s ± 1% 10.8s ± 0% -2.65% (p=0.008 n=5+5) FmtFprintfEmpty 231ns ± 1% 217ns ± 0% -6.06% (p=0.008 n=5+5) FmtFprintfString 407ns ± 3% 375ns ± 2% -7.81% (p=0.008 n=5+5) FmtFprintfInt 466ns ± 2% 430ns ± 0% -7.79% (p=0.016 n=5+4) FmtFprintfIntInt 719ns ± 2% 673ns ± 2% -6.37% (p=0.008 n=5+5) FmtFprintfPrefixedInt 706ns ± 1% 676ns ± 3% -4.31% (p=0.008 n=5+5) FmtFprintfFloat 1.01µs ± 1% 0.97µs ± 1% -4.30% (p=0.008 n=5+5) FmtManyArgs 2.67µs ± 1% 2.51µs ± 1% -5.95% (p=0.008 n=5+5) GobDecode 30.7ms ± 9% 31.3ms ±34% ~ (p=0.222 n=5+5) GobEncode 24.2ms ±23% 20.2ms ± 0% -16.36% (p=0.016 n=5+4) Gzip 852ms ± 0% 823ms ± 0% -3.38% (p=0.016 n=4+5) Gunzip 160ms ± 1% 151ms ± 1% -5.37% (p=0.008 n=5+5) JSONEncode 35.7ms ± 1% 34.3ms ± 1% -3.81% (p=0.008 n=5+5) JSONDecode 247ms ± 8% 254ms ± 7% ~ (p=0.548 n=5+5) Mandelbrot200 5.39ms ± 0% 5.41ms ± 0% +0.42% (p=0.008 n=5+5) GoParse 18.5ms ± 1% 18.3ms ± 2% ~ (p=0.343 n=4+4) RegexpMatchEasy0_32 424ns ± 2% 397ns ± 0% -6.23% (p=0.008 n=5+5) RegexpMatchEasy0_1K 2.88µs ± 0% 2.86µs ± 1% ~ (p=0.079 n=5+5) RegexpMatchEasy1_32 395ns ± 2% 370ns ± 1% -6.23% (p=0.008 n=5+5) RegexpMatchEasy1_1K 3.26µs ± 0% 3.19µs ± 1% -2.06% (p=0.008 n=5+5) RegexpMatchMedium_32 564ns ± 1% 532ns ± 0% -5.71% (p=0.008 n=5+5) RegexpMatchMedium_1K 146µs ± 2% 140µs ± 1% -4.62% (p=0.008 n=5+5) RegexpMatchHard_32 8.47µs ± 1% 7.91µs ± 1% -6.65% (p=0.008 n=5+5) RegexpMatchHard_1K 253µs ± 1% 236µs ± 2% -6.66% (p=0.008 n=5+5) Revcomp 1.78s ± 4% 1.76s ± 5% ~ (p=1.000 n=5+5) Template 292ms ±29% 269ms ± 5% ~ (p=0.690 n=5+5) TimeParse 1.61µs ± 4% 1.54µs ± 1% -4.42% (p=0.008 n=5+5) TimeFormat 1.66µs ± 3% 1.58µs ± 1% -5.22% (p=0.008 n=5+5) [Geo mean] 232µs 221µs -4.54% name old speed new speed delta GobDecode 25.0MB/s ± 8% 25.1MB/s ±27% ~ (p=0.222 n=5+5) GobEncode 32.8MB/s ±21% 38.0MB/s ± 0% +15.84% (p=0.016 n=5+4) Gzip 22.8MB/s ± 0% 23.6MB/s ± 0% +3.49% (p=0.016 n=4+5) Gunzip 121MB/s ± 1% 128MB/s ± 1% +5.68% (p=0.008 n=5+5) JSONEncode 54.4MB/s ± 1% 56.5MB/s ± 1% +3.97% (p=0.008 n=5+5) JSONDecode 7.88MB/s ± 8% 7.65MB/s ± 8% ~ (p=0.548 n=5+5) GoParse 3.07MB/s ± 8% 3.00MB/s ±22% ~ (p=0.579 n=5+5) RegexpMatchEasy0_32 75.6MB/s ± 2% 80.5MB/s ± 0% +6.58% (p=0.008 n=5+5) RegexpMatchEasy0_1K 356MB/s ± 0% 358MB/s ± 1% ~ (p=0.095 n=5+5) RegexpMatchEasy1_32 81.1MB/s ± 2% 86.5MB/s ± 1% +6.69% (p=0.008 n=5+5) RegexpMatchEasy1_1K 314MB/s ± 0% 320MB/s ± 0% +2.10% (p=0.008 n=5+5) RegexpMatchMedium_32 1.77MB/s ± 1% 1.88MB/s ± 0% +6.09% (p=0.016 n=5+4) RegexpMatchMedium_1K 6.99MB/s ± 2% 7.33MB/s ± 1% +4.83% (p=0.008 n=5+5) RegexpMatchHard_32 3.78MB/s ± 1% 4.04MB/s ± 1% +7.04% (p=0.008 n=5+5) RegexpMatchHard_1K 4.04MB/s ± 1% 4.33MB/s ± 2% +7.17% (p=0.008 n=5+5) Revcomp 143MB/s ± 4% 145MB/s ± 5% ~ (p=1.000 n=5+5) Template 6.77MB/s ±24% 7.22MB/s ± 5% ~ (p=0.690 n=5+5) [Geo mean] 24.4MB/s 25.4MB/s +4.18% Change-Id: Ib80716e62992aec28b2c4a96af280c278f83aa49 Reviewed-on: https://go-review.googlesource.com/c/go/+/173980 Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Richard Musiol <neelance@gmail.com>
2019-04-25 17:25:54 -04:00
for p := s.Func.Text; p != nil; p = p.Link {
switch p.As {
case AGet:
if p.From.Type != obj.TYPE_REG {
panic("bad Get: argument is not a register")
}
reg := p.From.Reg
v := regVars[reg-MINREG]
if v == nil {
panic("bad Get: invalid register")
}
if reg == REG_SP && hasLocalSP {
writeOpcode(w, ALocalGet)
writeUleb128(w, 1) // local SP
continue
}
if v.global {
writeOpcode(w, AGlobalGet)
} else {
writeOpcode(w, ALocalGet)
}
writeUleb128(w, v.index)
continue
case ASet:
if p.To.Type != obj.TYPE_REG {
panic("bad Set: argument is not a register")
}
reg := p.To.Reg
v := regVars[reg-MINREG]
if v == nil {
panic("bad Set: invalid register")
}
if reg == REG_SP && hasLocalSP {
writeOpcode(w, ALocalTee)
writeUleb128(w, 1) // local SP
}
if v.global {
writeOpcode(w, AGlobalSet)
} else {
if p.Link.As == AGet && p.Link.From.Reg == reg {
writeOpcode(w, ALocalTee)
p = p.Link
} else {
writeOpcode(w, ALocalSet)
}
}
writeUleb128(w, v.index)
continue
case ATee:
if p.To.Type != obj.TYPE_REG {
panic("bad Tee: argument is not a register")
}
reg := p.To.Reg
v := regVars[reg-MINREG]
if v == nil {
panic("bad Tee: invalid register")
}
writeOpcode(w, ALocalTee)
writeUleb128(w, v.index)
continue
case ANot:
writeOpcode(w, AI32Eqz)
continue
case obj.AUNDEF:
writeOpcode(w, AUnreachable)
continue
case obj.ANOP, obj.ATEXT, obj.AFUNCDATA, obj.APCDATA:
// ignore
continue
}
writeOpcode(w, p.As)
switch p.As {
case ABlock, ALoop, AIf:
if p.From.Offset != 0 {
// block type, rarely used, e.g. for code compiled with emscripten
w.WriteByte(0x80 - byte(p.From.Offset))
continue
}
w.WriteByte(0x40)
case ABr, ABrIf:
if p.To.Type != obj.TYPE_CONST {
panic("bad Br/BrIf")
}
writeUleb128(w, uint64(p.To.Offset))
case ABrTable:
idxs := p.To.Val.([]uint64)
writeUleb128(w, uint64(len(idxs)-1))
for _, idx := range idxs {
writeUleb128(w, idx)
}
case ACall:
switch p.To.Type {
case obj.TYPE_CONST:
writeUleb128(w, uint64(p.To.Offset))
case obj.TYPE_MEM:
if p.To.Name != obj.NAME_EXTERN && p.To.Name != obj.NAME_STATIC {
fmt.Println(p.To)
panic("bad name for Call")
}
r := obj.Addrel(s)
r.Off = int32(w.Len())
r.Type = objabi.R_CALL
if p.Mark&WasmImport != 0 {
r.Type = objabi.R_WASMIMPORT
}
r.Sym = p.To.Sym
cmd/internal/obj/wasm: cache SP in a local We use Wasm global variables extensively for simulating registers, especially SP. V8 does not handle global variables efficiently. This CL reduces global variable accesses by caching the global SP in a local variable in each function. The local cache is set on function entry and updated after each call (where the stack could have moved). Within a function, the SP access will use the local variable. Supersedes https://golang.org/cl/173979. Running on Chrome Version 73.0.3683.103 on darwin/amd64: name old time/op new time/op delta BinaryTree17 15.3s ± 2% 14.5s ± 3% -5.20% (p=0.000 n=9+10) Fannkuch11 8.91s ± 2% 9.48s ± 2% +6.41% (p=0.000 n=9+10) FmtFprintfEmpty 197ns ± 5% 165ns ± 3% -16.09% (p=0.000 n=9+8) FmtFprintfString 354ns ± 8% 325ns ± 7% -8.33% (p=0.001 n=10+10) FmtFprintfInt 400ns ± 4% 368ns ± 6% -8.01% (p=0.000 n=10+10) FmtFprintfIntInt 618ns ± 3% 587ns ± 6% -4.97% (p=0.001 n=10+10) FmtFprintfPrefixedInt 637ns ± 4% 606ns ± 4% -4.88% (p=0.000 n=10+10) FmtFprintfFloat 965ns ± 7% 898ns ± 4% -6.97% (p=0.000 n=10+10) FmtManyArgs 2.34µs ± 1% 2.24µs ± 3% -4.40% (p=0.000 n=9+10) GobDecode 29.8ms ± 3% 28.8ms ± 6% -3.60% (p=0.006 n=9+10) GobEncode 20.5ms ± 8% 17.6ms ± 3% -14.32% (p=0.000 n=10+10) Gzip 714ms ± 3% 718ms ± 8% ~ (p=0.971 n=10+10) Gunzip 148ms ± 3% 136ms ± 3% -7.99% (p=0.000 n=10+9) HTTPClientServer 219µs ± 3% 215µs ± 4% ~ (p=0.190 n=10+10) JSONEncode 35.1ms ± 2% 31.8ms ±13% -9.52% (p=0.002 n=10+10) JSONDecode 220ms ± 3% 207ms ± 5% -5.87% (p=0.000 n=10+10) Mandelbrot200 5.22ms ± 1% 5.11ms ± 4% -2.11% (p=0.027 n=8+10) GoParse 17.2ms ± 6% 16.1ms ± 5% -6.63% (p=0.000 n=10+9) RegexpMatchEasy0_32 375ns ± 3% 340ns ± 3% -9.25% (p=0.000 n=10+10) RegexpMatchEasy0_1K 2.70µs ± 3% 2.65µs ± 4% ~ (p=0.118 n=10+10) RegexpMatchEasy1_32 341ns ± 2% 305ns ± 4% -10.62% (p=0.000 n=9+10) RegexpMatchEasy1_1K 3.20µs ± 3% 2.99µs ± 3% -6.35% (p=0.000 n=10+10) RegexpMatchMedium_32 520ns ± 3% 501ns ± 4% -3.64% (p=0.002 n=9+10) RegexpMatchMedium_1K 145µs ± 7% 128µs ± 3% -11.57% (p=0.000 n=9+10) RegexpMatchHard_32 7.88µs ± 3% 7.01µs ± 5% -10.97% (p=0.000 n=10+10) RegexpMatchHard_1K 237µs ± 5% 207µs ± 4% -12.71% (p=0.000 n=9+10) Revcomp 2.34s ± 1% 2.31s ± 5% ~ (p=0.230 n=7+10) Template 261ms ± 7% 246ms ± 5% -5.93% (p=0.007 n=10+10) TimeParse 1.47µs ± 3% 1.39µs ± 5% -5.75% (p=0.000 n=9+10) TimeFormat 1.52µs ± 3% 1.43µs ± 4% -6.42% (p=0.000 n=8+10) name old speed new speed delta GobDecode 25.7MB/s ± 3% 26.7MB/s ± 5% +3.77% (p=0.006 n=9+10) GobEncode 37.5MB/s ± 8% 43.7MB/s ± 3% +16.61% (p=0.000 n=10+10) Gzip 27.2MB/s ± 3% 27.0MB/s ± 7% ~ (p=0.971 n=10+10) Gunzip 131MB/s ± 3% 142MB/s ± 5% +8.07% (p=0.000 n=10+10) JSONEncode 55.2MB/s ± 2% 61.2MB/s ±12% +10.80% (p=0.002 n=10+10) JSONDecode 8.84MB/s ± 3% 9.39MB/s ± 5% +6.28% (p=0.000 n=10+10) GoParse 3.37MB/s ± 6% 3.61MB/s ± 5% +7.09% (p=0.000 n=10+9) RegexpMatchEasy0_32 85.3MB/s ± 3% 94.0MB/s ± 3% +10.20% (p=0.000 n=10+10) RegexpMatchEasy0_1K 379MB/s ± 3% 387MB/s ± 4% ~ (p=0.123 n=10+10) RegexpMatchEasy1_32 93.9MB/s ± 2% 105.1MB/s ± 4% +11.96% (p=0.000 n=9+10) RegexpMatchEasy1_1K 320MB/s ± 3% 342MB/s ± 3% +6.79% (p=0.000 n=10+10) RegexpMatchMedium_32 1.92MB/s ± 2% 2.00MB/s ± 3% +3.94% (p=0.001 n=9+10) RegexpMatchMedium_1K 7.09MB/s ± 6% 8.01MB/s ± 3% +13.00% (p=0.000 n=9+10) RegexpMatchHard_32 4.06MB/s ± 3% 4.56MB/s ± 5% +12.38% (p=0.000 n=10+10) RegexpMatchHard_1K 4.32MB/s ± 4% 4.96MB/s ± 4% +14.60% (p=0.000 n=9+10) Revcomp 109MB/s ± 1% 110MB/s ± 5% ~ (p=0.219 n=7+10) Template 7.44MB/s ± 8% 7.91MB/s ± 5% +6.30% (p=0.007 n=10+10) Change-Id: I5828cf6b23ce104c02addc2642aba48dd6c48aab Reviewed-on: https://go-review.googlesource.com/c/go/+/174062 Run-TryBot: Richard Musiol <neelance@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com>
2019-04-27 01:16:10 +02:00
if hasLocalSP {
// The stack may have moved, which changes SP. Update the local SP variable.
updateLocalSP(w)
}
default:
panic("bad type for Call")
}
case ACallIndirect:
writeUleb128(w, uint64(p.To.Offset))
w.WriteByte(0x00) // reserved value
cmd/internal/obj/wasm: cache SP in a local We use Wasm global variables extensively for simulating registers, especially SP. V8 does not handle global variables efficiently. This CL reduces global variable accesses by caching the global SP in a local variable in each function. The local cache is set on function entry and updated after each call (where the stack could have moved). Within a function, the SP access will use the local variable. Supersedes https://golang.org/cl/173979. Running on Chrome Version 73.0.3683.103 on darwin/amd64: name old time/op new time/op delta BinaryTree17 15.3s ± 2% 14.5s ± 3% -5.20% (p=0.000 n=9+10) Fannkuch11 8.91s ± 2% 9.48s ± 2% +6.41% (p=0.000 n=9+10) FmtFprintfEmpty 197ns ± 5% 165ns ± 3% -16.09% (p=0.000 n=9+8) FmtFprintfString 354ns ± 8% 325ns ± 7% -8.33% (p=0.001 n=10+10) FmtFprintfInt 400ns ± 4% 368ns ± 6% -8.01% (p=0.000 n=10+10) FmtFprintfIntInt 618ns ± 3% 587ns ± 6% -4.97% (p=0.001 n=10+10) FmtFprintfPrefixedInt 637ns ± 4% 606ns ± 4% -4.88% (p=0.000 n=10+10) FmtFprintfFloat 965ns ± 7% 898ns ± 4% -6.97% (p=0.000 n=10+10) FmtManyArgs 2.34µs ± 1% 2.24µs ± 3% -4.40% (p=0.000 n=9+10) GobDecode 29.8ms ± 3% 28.8ms ± 6% -3.60% (p=0.006 n=9+10) GobEncode 20.5ms ± 8% 17.6ms ± 3% -14.32% (p=0.000 n=10+10) Gzip 714ms ± 3% 718ms ± 8% ~ (p=0.971 n=10+10) Gunzip 148ms ± 3% 136ms ± 3% -7.99% (p=0.000 n=10+9) HTTPClientServer 219µs ± 3% 215µs ± 4% ~ (p=0.190 n=10+10) JSONEncode 35.1ms ± 2% 31.8ms ±13% -9.52% (p=0.002 n=10+10) JSONDecode 220ms ± 3% 207ms ± 5% -5.87% (p=0.000 n=10+10) Mandelbrot200 5.22ms ± 1% 5.11ms ± 4% -2.11% (p=0.027 n=8+10) GoParse 17.2ms ± 6% 16.1ms ± 5% -6.63% (p=0.000 n=10+9) RegexpMatchEasy0_32 375ns ± 3% 340ns ± 3% -9.25% (p=0.000 n=10+10) RegexpMatchEasy0_1K 2.70µs ± 3% 2.65µs ± 4% ~ (p=0.118 n=10+10) RegexpMatchEasy1_32 341ns ± 2% 305ns ± 4% -10.62% (p=0.000 n=9+10) RegexpMatchEasy1_1K 3.20µs ± 3% 2.99µs ± 3% -6.35% (p=0.000 n=10+10) RegexpMatchMedium_32 520ns ± 3% 501ns ± 4% -3.64% (p=0.002 n=9+10) RegexpMatchMedium_1K 145µs ± 7% 128µs ± 3% -11.57% (p=0.000 n=9+10) RegexpMatchHard_32 7.88µs ± 3% 7.01µs ± 5% -10.97% (p=0.000 n=10+10) RegexpMatchHard_1K 237µs ± 5% 207µs ± 4% -12.71% (p=0.000 n=9+10) Revcomp 2.34s ± 1% 2.31s ± 5% ~ (p=0.230 n=7+10) Template 261ms ± 7% 246ms ± 5% -5.93% (p=0.007 n=10+10) TimeParse 1.47µs ± 3% 1.39µs ± 5% -5.75% (p=0.000 n=9+10) TimeFormat 1.52µs ± 3% 1.43µs ± 4% -6.42% (p=0.000 n=8+10) name old speed new speed delta GobDecode 25.7MB/s ± 3% 26.7MB/s ± 5% +3.77% (p=0.006 n=9+10) GobEncode 37.5MB/s ± 8% 43.7MB/s ± 3% +16.61% (p=0.000 n=10+10) Gzip 27.2MB/s ± 3% 27.0MB/s ± 7% ~ (p=0.971 n=10+10) Gunzip 131MB/s ± 3% 142MB/s ± 5% +8.07% (p=0.000 n=10+10) JSONEncode 55.2MB/s ± 2% 61.2MB/s ±12% +10.80% (p=0.002 n=10+10) JSONDecode 8.84MB/s ± 3% 9.39MB/s ± 5% +6.28% (p=0.000 n=10+10) GoParse 3.37MB/s ± 6% 3.61MB/s ± 5% +7.09% (p=0.000 n=10+9) RegexpMatchEasy0_32 85.3MB/s ± 3% 94.0MB/s ± 3% +10.20% (p=0.000 n=10+10) RegexpMatchEasy0_1K 379MB/s ± 3% 387MB/s ± 4% ~ (p=0.123 n=10+10) RegexpMatchEasy1_32 93.9MB/s ± 2% 105.1MB/s ± 4% +11.96% (p=0.000 n=9+10) RegexpMatchEasy1_1K 320MB/s ± 3% 342MB/s ± 3% +6.79% (p=0.000 n=10+10) RegexpMatchMedium_32 1.92MB/s ± 2% 2.00MB/s ± 3% +3.94% (p=0.001 n=9+10) RegexpMatchMedium_1K 7.09MB/s ± 6% 8.01MB/s ± 3% +13.00% (p=0.000 n=9+10) RegexpMatchHard_32 4.06MB/s ± 3% 4.56MB/s ± 5% +12.38% (p=0.000 n=10+10) RegexpMatchHard_1K 4.32MB/s ± 4% 4.96MB/s ± 4% +14.60% (p=0.000 n=9+10) Revcomp 109MB/s ± 1% 110MB/s ± 5% ~ (p=0.219 n=7+10) Template 7.44MB/s ± 8% 7.91MB/s ± 5% +6.30% (p=0.007 n=10+10) Change-Id: I5828cf6b23ce104c02addc2642aba48dd6c48aab Reviewed-on: https://go-review.googlesource.com/c/go/+/174062 Run-TryBot: Richard Musiol <neelance@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com>
2019-04-27 01:16:10 +02:00
if hasLocalSP {
// The stack may have moved, which changes SP. Update the local SP variable.
updateLocalSP(w)
}
case AI32Const, AI64Const:
if p.From.Name == obj.NAME_EXTERN {
r := obj.Addrel(s)
r.Off = int32(w.Len())
r.Type = objabi.R_ADDR
r.Sym = p.From.Sym
r.Add = p.From.Offset
break
}
writeSleb128(w, p.From.Offset)
case AF32Const:
b := make([]byte, 4)
binary.LittleEndian.PutUint32(b, math.Float32bits(float32(p.From.Val.(float64))))
w.Write(b)
case AF64Const:
b := make([]byte, 8)
binary.LittleEndian.PutUint64(b, math.Float64bits(p.From.Val.(float64)))
w.Write(b)
case AI32Load, AI64Load, AF32Load, AF64Load, AI32Load8S, AI32Load8U, AI32Load16S, AI32Load16U, AI64Load8S, AI64Load8U, AI64Load16S, AI64Load16U, AI64Load32S, AI64Load32U:
if p.From.Offset < 0 {
panic("negative offset for *Load")
}
if p.From.Type != obj.TYPE_CONST {
panic("bad type for *Load")
}
if p.From.Offset > math.MaxUint32 {
ctxt.Diag("bad offset in %v", p)
}
writeUleb128(w, align(p.As))
writeUleb128(w, uint64(p.From.Offset))
case AI32Store, AI64Store, AF32Store, AF64Store, AI32Store8, AI32Store16, AI64Store8, AI64Store16, AI64Store32:
if p.To.Offset < 0 {
panic("negative offset")
}
if p.From.Offset > math.MaxUint32 {
ctxt.Diag("bad offset in %v", p)
}
writeUleb128(w, align(p.As))
writeUleb128(w, uint64(p.To.Offset))
case ACurrentMemory, AGrowMemory:
w.WriteByte(0x00)
}
}
w.WriteByte(0x0b) // end
s.P = w.Bytes()
}
cmd/internal/obj/wasm: cache SP in a local We use Wasm global variables extensively for simulating registers, especially SP. V8 does not handle global variables efficiently. This CL reduces global variable accesses by caching the global SP in a local variable in each function. The local cache is set on function entry and updated after each call (where the stack could have moved). Within a function, the SP access will use the local variable. Supersedes https://golang.org/cl/173979. Running on Chrome Version 73.0.3683.103 on darwin/amd64: name old time/op new time/op delta BinaryTree17 15.3s ± 2% 14.5s ± 3% -5.20% (p=0.000 n=9+10) Fannkuch11 8.91s ± 2% 9.48s ± 2% +6.41% (p=0.000 n=9+10) FmtFprintfEmpty 197ns ± 5% 165ns ± 3% -16.09% (p=0.000 n=9+8) FmtFprintfString 354ns ± 8% 325ns ± 7% -8.33% (p=0.001 n=10+10) FmtFprintfInt 400ns ± 4% 368ns ± 6% -8.01% (p=0.000 n=10+10) FmtFprintfIntInt 618ns ± 3% 587ns ± 6% -4.97% (p=0.001 n=10+10) FmtFprintfPrefixedInt 637ns ± 4% 606ns ± 4% -4.88% (p=0.000 n=10+10) FmtFprintfFloat 965ns ± 7% 898ns ± 4% -6.97% (p=0.000 n=10+10) FmtManyArgs 2.34µs ± 1% 2.24µs ± 3% -4.40% (p=0.000 n=9+10) GobDecode 29.8ms ± 3% 28.8ms ± 6% -3.60% (p=0.006 n=9+10) GobEncode 20.5ms ± 8% 17.6ms ± 3% -14.32% (p=0.000 n=10+10) Gzip 714ms ± 3% 718ms ± 8% ~ (p=0.971 n=10+10) Gunzip 148ms ± 3% 136ms ± 3% -7.99% (p=0.000 n=10+9) HTTPClientServer 219µs ± 3% 215µs ± 4% ~ (p=0.190 n=10+10) JSONEncode 35.1ms ± 2% 31.8ms ±13% -9.52% (p=0.002 n=10+10) JSONDecode 220ms ± 3% 207ms ± 5% -5.87% (p=0.000 n=10+10) Mandelbrot200 5.22ms ± 1% 5.11ms ± 4% -2.11% (p=0.027 n=8+10) GoParse 17.2ms ± 6% 16.1ms ± 5% -6.63% (p=0.000 n=10+9) RegexpMatchEasy0_32 375ns ± 3% 340ns ± 3% -9.25% (p=0.000 n=10+10) RegexpMatchEasy0_1K 2.70µs ± 3% 2.65µs ± 4% ~ (p=0.118 n=10+10) RegexpMatchEasy1_32 341ns ± 2% 305ns ± 4% -10.62% (p=0.000 n=9+10) RegexpMatchEasy1_1K 3.20µs ± 3% 2.99µs ± 3% -6.35% (p=0.000 n=10+10) RegexpMatchMedium_32 520ns ± 3% 501ns ± 4% -3.64% (p=0.002 n=9+10) RegexpMatchMedium_1K 145µs ± 7% 128µs ± 3% -11.57% (p=0.000 n=9+10) RegexpMatchHard_32 7.88µs ± 3% 7.01µs ± 5% -10.97% (p=0.000 n=10+10) RegexpMatchHard_1K 237µs ± 5% 207µs ± 4% -12.71% (p=0.000 n=9+10) Revcomp 2.34s ± 1% 2.31s ± 5% ~ (p=0.230 n=7+10) Template 261ms ± 7% 246ms ± 5% -5.93% (p=0.007 n=10+10) TimeParse 1.47µs ± 3% 1.39µs ± 5% -5.75% (p=0.000 n=9+10) TimeFormat 1.52µs ± 3% 1.43µs ± 4% -6.42% (p=0.000 n=8+10) name old speed new speed delta GobDecode 25.7MB/s ± 3% 26.7MB/s ± 5% +3.77% (p=0.006 n=9+10) GobEncode 37.5MB/s ± 8% 43.7MB/s ± 3% +16.61% (p=0.000 n=10+10) Gzip 27.2MB/s ± 3% 27.0MB/s ± 7% ~ (p=0.971 n=10+10) Gunzip 131MB/s ± 3% 142MB/s ± 5% +8.07% (p=0.000 n=10+10) JSONEncode 55.2MB/s ± 2% 61.2MB/s ±12% +10.80% (p=0.002 n=10+10) JSONDecode 8.84MB/s ± 3% 9.39MB/s ± 5% +6.28% (p=0.000 n=10+10) GoParse 3.37MB/s ± 6% 3.61MB/s ± 5% +7.09% (p=0.000 n=10+9) RegexpMatchEasy0_32 85.3MB/s ± 3% 94.0MB/s ± 3% +10.20% (p=0.000 n=10+10) RegexpMatchEasy0_1K 379MB/s ± 3% 387MB/s ± 4% ~ (p=0.123 n=10+10) RegexpMatchEasy1_32 93.9MB/s ± 2% 105.1MB/s ± 4% +11.96% (p=0.000 n=9+10) RegexpMatchEasy1_1K 320MB/s ± 3% 342MB/s ± 3% +6.79% (p=0.000 n=10+10) RegexpMatchMedium_32 1.92MB/s ± 2% 2.00MB/s ± 3% +3.94% (p=0.001 n=9+10) RegexpMatchMedium_1K 7.09MB/s ± 6% 8.01MB/s ± 3% +13.00% (p=0.000 n=9+10) RegexpMatchHard_32 4.06MB/s ± 3% 4.56MB/s ± 5% +12.38% (p=0.000 n=10+10) RegexpMatchHard_1K 4.32MB/s ± 4% 4.96MB/s ± 4% +14.60% (p=0.000 n=9+10) Revcomp 109MB/s ± 1% 110MB/s ± 5% ~ (p=0.219 n=7+10) Template 7.44MB/s ± 8% 7.91MB/s ± 5% +6.30% (p=0.007 n=10+10) Change-Id: I5828cf6b23ce104c02addc2642aba48dd6c48aab Reviewed-on: https://go-review.googlesource.com/c/go/+/174062 Run-TryBot: Richard Musiol <neelance@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com>
2019-04-27 01:16:10 +02:00
func updateLocalSP(w *bytes.Buffer) {
writeOpcode(w, AGlobalGet)
cmd, runtime: remove PC_F & PC_B globals on Wasm Following the previous CL, this removes more global variables on Wasm. PC_B is used mostly for intra-function jumps, and for a function telling its callee where to start or resume. This usage can be served by a parameter. The top level loop (wasm_pc_f_loop) uses PC_B for resuming a function. This value is either set by gogo, or loaded from the Go stack at function return. Instead of loading PC_B at each function return, we could make gogo stores PC_B at the same stack location, and let the top level loop do the load. This way, we don't need to use global PC_B to communicate with the top level loop, and we can replace global PC_B with a parameter. PC_F is similar. It is even more so in that the only reader is the top level loop. Let the top level loop read it from the stack, and we can get rid of PC_F entirely. PC_F and PC_B are used less entensively as SP, so this CL has smaller performance gain. Running on Chrome 74.0.3729.108 on Linux/AMD64, name old time/op new time/op delta BinaryTree17 16.6s ± 0% 16.2s ± 1% -2.59% (p=0.016 n=4+5) Fannkuch11 11.1s ± 1% 10.8s ± 0% -2.65% (p=0.008 n=5+5) FmtFprintfEmpty 231ns ± 1% 217ns ± 0% -6.06% (p=0.008 n=5+5) FmtFprintfString 407ns ± 3% 375ns ± 2% -7.81% (p=0.008 n=5+5) FmtFprintfInt 466ns ± 2% 430ns ± 0% -7.79% (p=0.016 n=5+4) FmtFprintfIntInt 719ns ± 2% 673ns ± 2% -6.37% (p=0.008 n=5+5) FmtFprintfPrefixedInt 706ns ± 1% 676ns ± 3% -4.31% (p=0.008 n=5+5) FmtFprintfFloat 1.01µs ± 1% 0.97µs ± 1% -4.30% (p=0.008 n=5+5) FmtManyArgs 2.67µs ± 1% 2.51µs ± 1% -5.95% (p=0.008 n=5+5) GobDecode 30.7ms ± 9% 31.3ms ±34% ~ (p=0.222 n=5+5) GobEncode 24.2ms ±23% 20.2ms ± 0% -16.36% (p=0.016 n=5+4) Gzip 852ms ± 0% 823ms ± 0% -3.38% (p=0.016 n=4+5) Gunzip 160ms ± 1% 151ms ± 1% -5.37% (p=0.008 n=5+5) JSONEncode 35.7ms ± 1% 34.3ms ± 1% -3.81% (p=0.008 n=5+5) JSONDecode 247ms ± 8% 254ms ± 7% ~ (p=0.548 n=5+5) Mandelbrot200 5.39ms ± 0% 5.41ms ± 0% +0.42% (p=0.008 n=5+5) GoParse 18.5ms ± 1% 18.3ms ± 2% ~ (p=0.343 n=4+4) RegexpMatchEasy0_32 424ns ± 2% 397ns ± 0% -6.23% (p=0.008 n=5+5) RegexpMatchEasy0_1K 2.88µs ± 0% 2.86µs ± 1% ~ (p=0.079 n=5+5) RegexpMatchEasy1_32 395ns ± 2% 370ns ± 1% -6.23% (p=0.008 n=5+5) RegexpMatchEasy1_1K 3.26µs ± 0% 3.19µs ± 1% -2.06% (p=0.008 n=5+5) RegexpMatchMedium_32 564ns ± 1% 532ns ± 0% -5.71% (p=0.008 n=5+5) RegexpMatchMedium_1K 146µs ± 2% 140µs ± 1% -4.62% (p=0.008 n=5+5) RegexpMatchHard_32 8.47µs ± 1% 7.91µs ± 1% -6.65% (p=0.008 n=5+5) RegexpMatchHard_1K 253µs ± 1% 236µs ± 2% -6.66% (p=0.008 n=5+5) Revcomp 1.78s ± 4% 1.76s ± 5% ~ (p=1.000 n=5+5) Template 292ms ±29% 269ms ± 5% ~ (p=0.690 n=5+5) TimeParse 1.61µs ± 4% 1.54µs ± 1% -4.42% (p=0.008 n=5+5) TimeFormat 1.66µs ± 3% 1.58µs ± 1% -5.22% (p=0.008 n=5+5) [Geo mean] 232µs 221µs -4.54% name old speed new speed delta GobDecode 25.0MB/s ± 8% 25.1MB/s ±27% ~ (p=0.222 n=5+5) GobEncode 32.8MB/s ±21% 38.0MB/s ± 0% +15.84% (p=0.016 n=5+4) Gzip 22.8MB/s ± 0% 23.6MB/s ± 0% +3.49% (p=0.016 n=4+5) Gunzip 121MB/s ± 1% 128MB/s ± 1% +5.68% (p=0.008 n=5+5) JSONEncode 54.4MB/s ± 1% 56.5MB/s ± 1% +3.97% (p=0.008 n=5+5) JSONDecode 7.88MB/s ± 8% 7.65MB/s ± 8% ~ (p=0.548 n=5+5) GoParse 3.07MB/s ± 8% 3.00MB/s ±22% ~ (p=0.579 n=5+5) RegexpMatchEasy0_32 75.6MB/s ± 2% 80.5MB/s ± 0% +6.58% (p=0.008 n=5+5) RegexpMatchEasy0_1K 356MB/s ± 0% 358MB/s ± 1% ~ (p=0.095 n=5+5) RegexpMatchEasy1_32 81.1MB/s ± 2% 86.5MB/s ± 1% +6.69% (p=0.008 n=5+5) RegexpMatchEasy1_1K 314MB/s ± 0% 320MB/s ± 0% +2.10% (p=0.008 n=5+5) RegexpMatchMedium_32 1.77MB/s ± 1% 1.88MB/s ± 0% +6.09% (p=0.016 n=5+4) RegexpMatchMedium_1K 6.99MB/s ± 2% 7.33MB/s ± 1% +4.83% (p=0.008 n=5+5) RegexpMatchHard_32 3.78MB/s ± 1% 4.04MB/s ± 1% +7.04% (p=0.008 n=5+5) RegexpMatchHard_1K 4.04MB/s ± 1% 4.33MB/s ± 2% +7.17% (p=0.008 n=5+5) Revcomp 143MB/s ± 4% 145MB/s ± 5% ~ (p=1.000 n=5+5) Template 6.77MB/s ±24% 7.22MB/s ± 5% ~ (p=0.690 n=5+5) [Geo mean] 24.4MB/s 25.4MB/s +4.18% Change-Id: Ib80716e62992aec28b2c4a96af280c278f83aa49 Reviewed-on: https://go-review.googlesource.com/c/go/+/173980 Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Richard Musiol <neelance@gmail.com>
2019-04-25 17:25:54 -04:00
writeUleb128(w, 0) // global SP
writeOpcode(w, ALocalSet)
cmd, runtime: remove PC_F & PC_B globals on Wasm Following the previous CL, this removes more global variables on Wasm. PC_B is used mostly for intra-function jumps, and for a function telling its callee where to start or resume. This usage can be served by a parameter. The top level loop (wasm_pc_f_loop) uses PC_B for resuming a function. This value is either set by gogo, or loaded from the Go stack at function return. Instead of loading PC_B at each function return, we could make gogo stores PC_B at the same stack location, and let the top level loop do the load. This way, we don't need to use global PC_B to communicate with the top level loop, and we can replace global PC_B with a parameter. PC_F is similar. It is even more so in that the only reader is the top level loop. Let the top level loop read it from the stack, and we can get rid of PC_F entirely. PC_F and PC_B are used less entensively as SP, so this CL has smaller performance gain. Running on Chrome 74.0.3729.108 on Linux/AMD64, name old time/op new time/op delta BinaryTree17 16.6s ± 0% 16.2s ± 1% -2.59% (p=0.016 n=4+5) Fannkuch11 11.1s ± 1% 10.8s ± 0% -2.65% (p=0.008 n=5+5) FmtFprintfEmpty 231ns ± 1% 217ns ± 0% -6.06% (p=0.008 n=5+5) FmtFprintfString 407ns ± 3% 375ns ± 2% -7.81% (p=0.008 n=5+5) FmtFprintfInt 466ns ± 2% 430ns ± 0% -7.79% (p=0.016 n=5+4) FmtFprintfIntInt 719ns ± 2% 673ns ± 2% -6.37% (p=0.008 n=5+5) FmtFprintfPrefixedInt 706ns ± 1% 676ns ± 3% -4.31% (p=0.008 n=5+5) FmtFprintfFloat 1.01µs ± 1% 0.97µs ± 1% -4.30% (p=0.008 n=5+5) FmtManyArgs 2.67µs ± 1% 2.51µs ± 1% -5.95% (p=0.008 n=5+5) GobDecode 30.7ms ± 9% 31.3ms ±34% ~ (p=0.222 n=5+5) GobEncode 24.2ms ±23% 20.2ms ± 0% -16.36% (p=0.016 n=5+4) Gzip 852ms ± 0% 823ms ± 0% -3.38% (p=0.016 n=4+5) Gunzip 160ms ± 1% 151ms ± 1% -5.37% (p=0.008 n=5+5) JSONEncode 35.7ms ± 1% 34.3ms ± 1% -3.81% (p=0.008 n=5+5) JSONDecode 247ms ± 8% 254ms ± 7% ~ (p=0.548 n=5+5) Mandelbrot200 5.39ms ± 0% 5.41ms ± 0% +0.42% (p=0.008 n=5+5) GoParse 18.5ms ± 1% 18.3ms ± 2% ~ (p=0.343 n=4+4) RegexpMatchEasy0_32 424ns ± 2% 397ns ± 0% -6.23% (p=0.008 n=5+5) RegexpMatchEasy0_1K 2.88µs ± 0% 2.86µs ± 1% ~ (p=0.079 n=5+5) RegexpMatchEasy1_32 395ns ± 2% 370ns ± 1% -6.23% (p=0.008 n=5+5) RegexpMatchEasy1_1K 3.26µs ± 0% 3.19µs ± 1% -2.06% (p=0.008 n=5+5) RegexpMatchMedium_32 564ns ± 1% 532ns ± 0% -5.71% (p=0.008 n=5+5) RegexpMatchMedium_1K 146µs ± 2% 140µs ± 1% -4.62% (p=0.008 n=5+5) RegexpMatchHard_32 8.47µs ± 1% 7.91µs ± 1% -6.65% (p=0.008 n=5+5) RegexpMatchHard_1K 253µs ± 1% 236µs ± 2% -6.66% (p=0.008 n=5+5) Revcomp 1.78s ± 4% 1.76s ± 5% ~ (p=1.000 n=5+5) Template 292ms ±29% 269ms ± 5% ~ (p=0.690 n=5+5) TimeParse 1.61µs ± 4% 1.54µs ± 1% -4.42% (p=0.008 n=5+5) TimeFormat 1.66µs ± 3% 1.58µs ± 1% -5.22% (p=0.008 n=5+5) [Geo mean] 232µs 221µs -4.54% name old speed new speed delta GobDecode 25.0MB/s ± 8% 25.1MB/s ±27% ~ (p=0.222 n=5+5) GobEncode 32.8MB/s ±21% 38.0MB/s ± 0% +15.84% (p=0.016 n=5+4) Gzip 22.8MB/s ± 0% 23.6MB/s ± 0% +3.49% (p=0.016 n=4+5) Gunzip 121MB/s ± 1% 128MB/s ± 1% +5.68% (p=0.008 n=5+5) JSONEncode 54.4MB/s ± 1% 56.5MB/s ± 1% +3.97% (p=0.008 n=5+5) JSONDecode 7.88MB/s ± 8% 7.65MB/s ± 8% ~ (p=0.548 n=5+5) GoParse 3.07MB/s ± 8% 3.00MB/s ±22% ~ (p=0.579 n=5+5) RegexpMatchEasy0_32 75.6MB/s ± 2% 80.5MB/s ± 0% +6.58% (p=0.008 n=5+5) RegexpMatchEasy0_1K 356MB/s ± 0% 358MB/s ± 1% ~ (p=0.095 n=5+5) RegexpMatchEasy1_32 81.1MB/s ± 2% 86.5MB/s ± 1% +6.69% (p=0.008 n=5+5) RegexpMatchEasy1_1K 314MB/s ± 0% 320MB/s ± 0% +2.10% (p=0.008 n=5+5) RegexpMatchMedium_32 1.77MB/s ± 1% 1.88MB/s ± 0% +6.09% (p=0.016 n=5+4) RegexpMatchMedium_1K 6.99MB/s ± 2% 7.33MB/s ± 1% +4.83% (p=0.008 n=5+5) RegexpMatchHard_32 3.78MB/s ± 1% 4.04MB/s ± 1% +7.04% (p=0.008 n=5+5) RegexpMatchHard_1K 4.04MB/s ± 1% 4.33MB/s ± 2% +7.17% (p=0.008 n=5+5) Revcomp 143MB/s ± 4% 145MB/s ± 5% ~ (p=1.000 n=5+5) Template 6.77MB/s ±24% 7.22MB/s ± 5% ~ (p=0.690 n=5+5) [Geo mean] 24.4MB/s 25.4MB/s +4.18% Change-Id: Ib80716e62992aec28b2c4a96af280c278f83aa49 Reviewed-on: https://go-review.googlesource.com/c/go/+/173980 Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Richard Musiol <neelance@gmail.com>
2019-04-25 17:25:54 -04:00
writeUleb128(w, 1) // local SP
cmd/internal/obj/wasm: cache SP in a local We use Wasm global variables extensively for simulating registers, especially SP. V8 does not handle global variables efficiently. This CL reduces global variable accesses by caching the global SP in a local variable in each function. The local cache is set on function entry and updated after each call (where the stack could have moved). Within a function, the SP access will use the local variable. Supersedes https://golang.org/cl/173979. Running on Chrome Version 73.0.3683.103 on darwin/amd64: name old time/op new time/op delta BinaryTree17 15.3s ± 2% 14.5s ± 3% -5.20% (p=0.000 n=9+10) Fannkuch11 8.91s ± 2% 9.48s ± 2% +6.41% (p=0.000 n=9+10) FmtFprintfEmpty 197ns ± 5% 165ns ± 3% -16.09% (p=0.000 n=9+8) FmtFprintfString 354ns ± 8% 325ns ± 7% -8.33% (p=0.001 n=10+10) FmtFprintfInt 400ns ± 4% 368ns ± 6% -8.01% (p=0.000 n=10+10) FmtFprintfIntInt 618ns ± 3% 587ns ± 6% -4.97% (p=0.001 n=10+10) FmtFprintfPrefixedInt 637ns ± 4% 606ns ± 4% -4.88% (p=0.000 n=10+10) FmtFprintfFloat 965ns ± 7% 898ns ± 4% -6.97% (p=0.000 n=10+10) FmtManyArgs 2.34µs ± 1% 2.24µs ± 3% -4.40% (p=0.000 n=9+10) GobDecode 29.8ms ± 3% 28.8ms ± 6% -3.60% (p=0.006 n=9+10) GobEncode 20.5ms ± 8% 17.6ms ± 3% -14.32% (p=0.000 n=10+10) Gzip 714ms ± 3% 718ms ± 8% ~ (p=0.971 n=10+10) Gunzip 148ms ± 3% 136ms ± 3% -7.99% (p=0.000 n=10+9) HTTPClientServer 219µs ± 3% 215µs ± 4% ~ (p=0.190 n=10+10) JSONEncode 35.1ms ± 2% 31.8ms ±13% -9.52% (p=0.002 n=10+10) JSONDecode 220ms ± 3% 207ms ± 5% -5.87% (p=0.000 n=10+10) Mandelbrot200 5.22ms ± 1% 5.11ms ± 4% -2.11% (p=0.027 n=8+10) GoParse 17.2ms ± 6% 16.1ms ± 5% -6.63% (p=0.000 n=10+9) RegexpMatchEasy0_32 375ns ± 3% 340ns ± 3% -9.25% (p=0.000 n=10+10) RegexpMatchEasy0_1K 2.70µs ± 3% 2.65µs ± 4% ~ (p=0.118 n=10+10) RegexpMatchEasy1_32 341ns ± 2% 305ns ± 4% -10.62% (p=0.000 n=9+10) RegexpMatchEasy1_1K 3.20µs ± 3% 2.99µs ± 3% -6.35% (p=0.000 n=10+10) RegexpMatchMedium_32 520ns ± 3% 501ns ± 4% -3.64% (p=0.002 n=9+10) RegexpMatchMedium_1K 145µs ± 7% 128µs ± 3% -11.57% (p=0.000 n=9+10) RegexpMatchHard_32 7.88µs ± 3% 7.01µs ± 5% -10.97% (p=0.000 n=10+10) RegexpMatchHard_1K 237µs ± 5% 207µs ± 4% -12.71% (p=0.000 n=9+10) Revcomp 2.34s ± 1% 2.31s ± 5% ~ (p=0.230 n=7+10) Template 261ms ± 7% 246ms ± 5% -5.93% (p=0.007 n=10+10) TimeParse 1.47µs ± 3% 1.39µs ± 5% -5.75% (p=0.000 n=9+10) TimeFormat 1.52µs ± 3% 1.43µs ± 4% -6.42% (p=0.000 n=8+10) name old speed new speed delta GobDecode 25.7MB/s ± 3% 26.7MB/s ± 5% +3.77% (p=0.006 n=9+10) GobEncode 37.5MB/s ± 8% 43.7MB/s ± 3% +16.61% (p=0.000 n=10+10) Gzip 27.2MB/s ± 3% 27.0MB/s ± 7% ~ (p=0.971 n=10+10) Gunzip 131MB/s ± 3% 142MB/s ± 5% +8.07% (p=0.000 n=10+10) JSONEncode 55.2MB/s ± 2% 61.2MB/s ±12% +10.80% (p=0.002 n=10+10) JSONDecode 8.84MB/s ± 3% 9.39MB/s ± 5% +6.28% (p=0.000 n=10+10) GoParse 3.37MB/s ± 6% 3.61MB/s ± 5% +7.09% (p=0.000 n=10+9) RegexpMatchEasy0_32 85.3MB/s ± 3% 94.0MB/s ± 3% +10.20% (p=0.000 n=10+10) RegexpMatchEasy0_1K 379MB/s ± 3% 387MB/s ± 4% ~ (p=0.123 n=10+10) RegexpMatchEasy1_32 93.9MB/s ± 2% 105.1MB/s ± 4% +11.96% (p=0.000 n=9+10) RegexpMatchEasy1_1K 320MB/s ± 3% 342MB/s ± 3% +6.79% (p=0.000 n=10+10) RegexpMatchMedium_32 1.92MB/s ± 2% 2.00MB/s ± 3% +3.94% (p=0.001 n=9+10) RegexpMatchMedium_1K 7.09MB/s ± 6% 8.01MB/s ± 3% +13.00% (p=0.000 n=9+10) RegexpMatchHard_32 4.06MB/s ± 3% 4.56MB/s ± 5% +12.38% (p=0.000 n=10+10) RegexpMatchHard_1K 4.32MB/s ± 4% 4.96MB/s ± 4% +14.60% (p=0.000 n=9+10) Revcomp 109MB/s ± 1% 110MB/s ± 5% ~ (p=0.219 n=7+10) Template 7.44MB/s ± 8% 7.91MB/s ± 5% +6.30% (p=0.007 n=10+10) Change-Id: I5828cf6b23ce104c02addc2642aba48dd6c48aab Reviewed-on: https://go-review.googlesource.com/c/go/+/174062 Run-TryBot: Richard Musiol <neelance@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com>
2019-04-27 01:16:10 +02:00
}
func writeOpcode(w *bytes.Buffer, as obj.As) {
switch {
case as < AUnreachable:
panic(fmt.Sprintf("unexpected assembler op: %s", as))
case as < AEnd:
w.WriteByte(byte(as - AUnreachable + 0x00))
case as < ADrop:
w.WriteByte(byte(as - AEnd + 0x0B))
case as < ALocalGet:
w.WriteByte(byte(as - ADrop + 0x1A))
case as < AI32Load:
w.WriteByte(byte(as - ALocalGet + 0x20))
case as < AI32TruncSatF32S:
w.WriteByte(byte(as - AI32Load + 0x28))
case as < ALast:
w.WriteByte(0xFC)
w.WriteByte(byte(as - AI32TruncSatF32S + 0x00))
default:
panic(fmt.Sprintf("unexpected assembler op: %s", as))
}
}
type valueType byte
const (
i32 valueType = 0x7F
i64 valueType = 0x7E
f32 valueType = 0x7D
f64 valueType = 0x7C
)
func regType(reg int16) valueType {
switch {
case reg == REG_SP:
return i32
case reg >= REG_R0 && reg <= REG_R15:
return i64
case reg >= REG_F0 && reg <= REG_F15:
return f32
case reg >= REG_F16 && reg <= REG_F31:
return f64
default:
panic("invalid register")
}
}
func align(as obj.As) uint64 {
switch as {
case AI32Load8S, AI32Load8U, AI64Load8S, AI64Load8U, AI32Store8, AI64Store8:
return 0
case AI32Load16S, AI32Load16U, AI64Load16S, AI64Load16U, AI32Store16, AI64Store16:
return 1
case AI32Load, AF32Load, AI64Load32S, AI64Load32U, AI32Store, AF32Store, AI64Store32:
return 2
case AI64Load, AF64Load, AI64Store, AF64Store:
return 3
default:
panic("align: bad op")
}
}
func writeUleb128(w io.ByteWriter, v uint64) {
more := true
for more {
c := uint8(v & 0x7f)
v >>= 7
more = v != 0
if more {
c |= 0x80
}
w.WriteByte(c)
}
}
func writeSleb128(w io.ByteWriter, v int64) {
more := true
for more {
c := uint8(v & 0x7f)
s := uint8(v & 0x40)
v >>= 7
more = !((v == 0 && s == 0) || (v == -1 && s != 0))
if more {
c |= 0x80
}
w.WriteByte(c)
}
}