go/src/cmd/compile/internal/amd64/ssa.go

2579 lines
77 KiB
Go
Raw Normal View History

// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package amd64
import (
"fmt"
"math"
"cmd/compile/internal/base"
[dev.regabi] cmd/compile: group known symbols, packages, names [generated] There are a handful of pre-computed magic symbols known by package gc, and we need a place to store them. If we keep them together, the need for type *ir.Name means that package ir is the lowest package in the import hierarchy that they can go in. And package ir needs gopkg for methodSymSuffix (in a later CL), so they can't go any higher either, at least not all together. So package ir it is. Rather than dump them all into the top-level package ir namespace, however, we introduce global structs, Syms, Pkgs, and Names, and make the known symbols, packages, and names fields of those. [git-generate] cd src/cmd/compile/internal/gc rf ' add go.go:$ \ // Names holds known names. \ var Names struct{} \ \ // Syms holds known symbols. \ var Syms struct {} \ \ // Pkgs holds known packages. \ var Pkgs struct {} \ mv staticuint64s Names.Staticuint64s mv zerobase Names.Zerobase mv assertE2I Syms.AssertE2I mv assertE2I2 Syms.AssertE2I2 mv assertI2I Syms.AssertI2I mv assertI2I2 Syms.AssertI2I2 mv deferproc Syms.Deferproc mv deferprocStack Syms.DeferprocStack mv Deferreturn Syms.Deferreturn mv Duffcopy Syms.Duffcopy mv Duffzero Syms.Duffzero mv gcWriteBarrier Syms.GCWriteBarrier mv goschedguarded Syms.Goschedguarded mv growslice Syms.Growslice mv msanread Syms.Msanread mv msanwrite Syms.Msanwrite mv msanmove Syms.Msanmove mv newobject Syms.Newobject mv newproc Syms.Newproc mv panicdivide Syms.Panicdivide mv panicshift Syms.Panicshift mv panicdottypeE Syms.PanicdottypeE mv panicdottypeI Syms.PanicdottypeI mv panicnildottype Syms.Panicnildottype mv panicoverflow Syms.Panicoverflow mv raceread Syms.Raceread mv racereadrange Syms.Racereadrange mv racewrite Syms.Racewrite mv racewriterange Syms.Racewriterange mv SigPanic Syms.SigPanic mv typedmemclr Syms.Typedmemclr mv typedmemmove Syms.Typedmemmove mv Udiv Syms.Udiv mv writeBarrier Syms.WriteBarrier mv zerobaseSym Syms.Zerobase mv arm64HasATOMICS Syms.ARM64HasATOMICS mv armHasVFPv4 Syms.ARMHasVFPv4 mv x86HasFMA Syms.X86HasFMA mv x86HasPOPCNT Syms.X86HasPOPCNT mv x86HasSSE41 Syms.X86HasSSE41 mv WasmDiv Syms.WasmDiv mv WasmMove Syms.WasmMove mv WasmZero Syms.WasmZero mv WasmTruncS Syms.WasmTruncS mv WasmTruncU Syms.WasmTruncU mv gopkg Pkgs.Go mv itabpkg Pkgs.Itab mv itablinkpkg Pkgs.Itablink mv mappkg Pkgs.Map mv msanpkg Pkgs.Msan mv racepkg Pkgs.Race mv Runtimepkg Pkgs.Runtime mv trackpkg Pkgs.Track mv unsafepkg Pkgs.Unsafe mv Names Syms Pkgs symtab.go mv symtab.go cmd/compile/internal/ir ' Change-Id: Ic143862148569a3bcde8e70b26d75421aa2d00f3 Reviewed-on: https://go-review.googlesource.com/c/go/+/279235 Trust: Russ Cox <rsc@golang.org> Run-TryBot: Russ Cox <rsc@golang.org> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com>
2020-12-23 00:10:25 -05:00
"cmd/compile/internal/ir"
"cmd/compile/internal/logopt"
"cmd/compile/internal/objw"
"cmd/compile/internal/ssa"
[dev.regabi] cmd/compile: split out package ssagen [generated] [git-generate] cd src/cmd/compile/internal/gc rf ' # maxOpenDefers is declared in ssa.go but used only by walk. mv maxOpenDefers walk.go # gc.Arch -> ssagen.Arch # It is not as nice but will do for now. mv Arch ArchInfo mv thearch Arch mv Arch ArchInfo arch.go # Pull dwarf out of pgen.go. mv debuginfo declPos createDwarfVars preInliningDcls \ createSimpleVars createSimpleVar \ createComplexVars createComplexVar \ dwarf.go # Pull high-level compilation out of pgen.go, # leaving only the SSA code. mv compilequeue funccompile compile compilenow \ compileFunctions isInlinableButNotInlined \ initLSym \ compile.go mv BoundsCheckFunc GCWriteBarrierReg ssa.go mv largeStack largeStackFrames CheckLargeStacks pgen.go # All that is left in dcl.go is the nowritebarrierrecCheck mv dcl.go nowb.go # Export API and unexport non-API. mv initssaconfig InitConfig mv isIntrinsicCall IsIntrinsicCall mv ssaDumpInline DumpInline mv initSSATables InitTables mv initSSAEnv InitEnv mv compileSSA Compile mv stackOffset StackOffset mv canSSAType TypeOK mv SSAGenState State mv FwdRefAux fwdRefAux mv cgoSymABIs CgoSymABIs mv readSymABIs ReadSymABIs mv initLSym InitLSym mv useABIWrapGen symabiDefs CgoSymABIs ReadSymABIs InitLSym selectLSym makeABIWrapper setupTextLSym abi.go mv arch.go abi.go nowb.go phi.go pgen.go pgen_test.go ssa.go cmd/compile/internal/ssagen ' rm go.go gsubr.go Change-Id: I47fad6cbf1d1e583fd9139003a08401d7cd048a1 Reviewed-on: https://go-review.googlesource.com/c/go/+/279476 Trust: Russ Cox <rsc@golang.org> Run-TryBot: Russ Cox <rsc@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com>
2020-12-23 00:57:10 -05:00
"cmd/compile/internal/ssagen"
cmd/compile: change ssa.Type into *types.Type When package ssa was created, Type was in package gc. To avoid circular dependencies, we used an interface (ssa.Type) to represent type information in SSA. In the Go 1.9 cycle, gri extricated the Type type from package gc. As a result, we can now use it in package ssa. Now, instead of package types depending on package ssa, it is the other way. This is a more sensible dependency tree, and helps compiler performance a bit. Though this is a big CL, most of the changes are mechanical and uninteresting. Interesting bits: * Add new singleton globals to package types for the special SSA types Memory, Void, Invalid, Flags, and Int128. * Add two new Types, TSSA for the special types, and TTUPLE, for SSA tuple types. ssa.MakeTuple is now types.NewTuple. * Move type comparison result constants CMPlt, CMPeq, and CMPgt to package types. * We had picked the name "types" in our rules for the handy list of types provided by ssa.Config. That conflicted with the types package name, so change it to "typ". * Update the type comparison routine to handle tuples and special types inline. * Teach gc/fmt.go how to print special types. * We can now eliminate ElemTypes in favor of just Elem, and probably also some other duplicated Type methods designed to return ssa.Type instead of *types.Type. * The ssa tests were using their own dummy types, and they were not particularly careful about types in general. Of necessity, this CL switches them to use *types.Type; it does not make them more type-accurate. Unfortunately, using types.Type means initializing a bit of the types universe. This is prime for refactoring and improvement. This shrinks ssa.Value; it now fits in a smaller size class on 64 bit systems. This doesn't have a giant impact, though, since most Values are preallocated in a chunk. name old alloc/op new alloc/op delta Template 37.9MB ± 0% 37.7MB ± 0% -0.57% (p=0.000 n=10+8) Unicode 28.9MB ± 0% 28.7MB ± 0% -0.52% (p=0.000 n=10+10) GoTypes 110MB ± 0% 109MB ± 0% -0.88% (p=0.000 n=10+10) Flate 24.7MB ± 0% 24.6MB ± 0% -0.66% (p=0.000 n=10+10) GoParser 31.1MB ± 0% 30.9MB ± 0% -0.61% (p=0.000 n=10+9) Reflect 73.9MB ± 0% 73.4MB ± 0% -0.62% (p=0.000 n=10+8) Tar 25.8MB ± 0% 25.6MB ± 0% -0.77% (p=0.000 n=9+10) XML 41.2MB ± 0% 40.9MB ± 0% -0.80% (p=0.000 n=10+10) [Geo mean] 40.5MB 40.3MB -0.68% name old allocs/op new allocs/op delta Template 385k ± 0% 386k ± 0% ~ (p=0.356 n=10+9) Unicode 343k ± 1% 344k ± 0% ~ (p=0.481 n=10+10) GoTypes 1.16M ± 0% 1.16M ± 0% -0.16% (p=0.004 n=10+10) Flate 238k ± 1% 238k ± 1% ~ (p=0.853 n=10+10) GoParser 320k ± 0% 320k ± 0% ~ (p=0.720 n=10+9) Reflect 957k ± 0% 957k ± 0% ~ (p=0.460 n=10+8) Tar 252k ± 0% 252k ± 0% ~ (p=0.133 n=9+10) XML 400k ± 0% 400k ± 0% ~ (p=0.796 n=10+10) [Geo mean] 428k 428k -0.01% Removing all the interface calls helps non-trivially with CPU, though. name old time/op new time/op delta Template 178ms ± 4% 173ms ± 3% -2.90% (p=0.000 n=94+96) Unicode 85.0ms ± 4% 83.9ms ± 4% -1.23% (p=0.000 n=96+96) GoTypes 543ms ± 3% 528ms ± 3% -2.73% (p=0.000 n=98+96) Flate 116ms ± 3% 113ms ± 4% -2.34% (p=0.000 n=96+99) GoParser 144ms ± 3% 140ms ± 4% -2.80% (p=0.000 n=99+97) Reflect 344ms ± 3% 334ms ± 4% -3.02% (p=0.000 n=100+99) Tar 106ms ± 5% 103ms ± 4% -3.30% (p=0.000 n=98+94) XML 198ms ± 5% 192ms ± 4% -2.88% (p=0.000 n=92+95) [Geo mean] 178ms 173ms -2.65% name old user-time/op new user-time/op delta Template 229ms ± 5% 224ms ± 5% -2.36% (p=0.000 n=95+99) Unicode 107ms ± 6% 106ms ± 5% -1.13% (p=0.001 n=93+95) GoTypes 696ms ± 4% 679ms ± 4% -2.45% (p=0.000 n=97+99) Flate 137ms ± 4% 134ms ± 5% -2.66% (p=0.000 n=99+96) GoParser 176ms ± 5% 172ms ± 8% -2.27% (p=0.000 n=98+100) Reflect 430ms ± 6% 411ms ± 5% -4.46% (p=0.000 n=100+92) Tar 128ms ±13% 123ms ±13% -4.21% (p=0.000 n=100+100) XML 239ms ± 6% 233ms ± 6% -2.50% (p=0.000 n=95+97) [Geo mean] 220ms 213ms -2.76% Change-Id: I15c7d6268347f8358e75066dfdbd77db24e8d0c1 Reviewed-on: https://go-review.googlesource.com/42145 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2017-04-28 14:12:28 -07:00
"cmd/compile/internal/types"
"cmd/internal/obj"
"cmd/internal/obj/x86"
"internal/abi"
"internal/buildcfg"
)
// ssaMarkMoves marks any MOVXconst ops that need to avoid clobbering flags.
[dev.regabi] cmd/compile: split out package ssagen [generated] [git-generate] cd src/cmd/compile/internal/gc rf ' # maxOpenDefers is declared in ssa.go but used only by walk. mv maxOpenDefers walk.go # gc.Arch -> ssagen.Arch # It is not as nice but will do for now. mv Arch ArchInfo mv thearch Arch mv Arch ArchInfo arch.go # Pull dwarf out of pgen.go. mv debuginfo declPos createDwarfVars preInliningDcls \ createSimpleVars createSimpleVar \ createComplexVars createComplexVar \ dwarf.go # Pull high-level compilation out of pgen.go, # leaving only the SSA code. mv compilequeue funccompile compile compilenow \ compileFunctions isInlinableButNotInlined \ initLSym \ compile.go mv BoundsCheckFunc GCWriteBarrierReg ssa.go mv largeStack largeStackFrames CheckLargeStacks pgen.go # All that is left in dcl.go is the nowritebarrierrecCheck mv dcl.go nowb.go # Export API and unexport non-API. mv initssaconfig InitConfig mv isIntrinsicCall IsIntrinsicCall mv ssaDumpInline DumpInline mv initSSATables InitTables mv initSSAEnv InitEnv mv compileSSA Compile mv stackOffset StackOffset mv canSSAType TypeOK mv SSAGenState State mv FwdRefAux fwdRefAux mv cgoSymABIs CgoSymABIs mv readSymABIs ReadSymABIs mv initLSym InitLSym mv useABIWrapGen symabiDefs CgoSymABIs ReadSymABIs InitLSym selectLSym makeABIWrapper setupTextLSym abi.go mv arch.go abi.go nowb.go phi.go pgen.go pgen_test.go ssa.go cmd/compile/internal/ssagen ' rm go.go gsubr.go Change-Id: I47fad6cbf1d1e583fd9139003a08401d7cd048a1 Reviewed-on: https://go-review.googlesource.com/c/go/+/279476 Trust: Russ Cox <rsc@golang.org> Run-TryBot: Russ Cox <rsc@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com>
2020-12-23 00:57:10 -05:00
func ssaMarkMoves(s *ssagen.State, b *ssa.Block) {
flive := b.FlagsLiveAtEnd
cmd/compile: allow multiple SSA block control values Control values are used to choose which successor of a block is jumped to. Typically a control value takes the form of a 'flags' value that represents the result of a comparison. Some architectures however use a variable in a register as a control value. Up until now we have managed with a single control value per block. However some architectures (e.g. s390x and riscv64) have combined compare-and-branch instructions that take two variables in registers as parameters. To generate these instructions we need to support 2 control values per block. This CL allows up to 2 control values to be used in a block in order to support the addition of compare-and-branch instructions. I have implemented s390x compare-and-branch instructions in a different CL. Passes toolstash-check -all. Results of compilebench: name old time/op new time/op delta Template 208ms ± 1% 209ms ± 1% ~ (p=0.289 n=20+20) Unicode 83.7ms ± 1% 83.3ms ± 3% -0.49% (p=0.017 n=18+18) GoTypes 748ms ± 1% 748ms ± 0% ~ (p=0.460 n=20+18) Compiler 3.47s ± 1% 3.48s ± 1% ~ (p=0.070 n=19+18) SSA 11.5s ± 1% 11.7s ± 1% +1.64% (p=0.000 n=19+18) Flate 130ms ± 1% 130ms ± 1% ~ (p=0.588 n=19+20) GoParser 160ms ± 1% 161ms ± 1% ~ (p=0.211 n=20+20) Reflect 465ms ± 1% 467ms ± 1% +0.42% (p=0.007 n=20+20) Tar 184ms ± 1% 185ms ± 2% ~ (p=0.087 n=18+20) XML 253ms ± 1% 253ms ± 1% ~ (p=0.377 n=20+18) LinkCompiler 769ms ± 2% 774ms ± 2% ~ (p=0.070 n=19+19) ExternalLinkCompiler 3.59s ±11% 3.68s ± 6% ~ (p=0.072 n=20+20) LinkWithoutDebugCompiler 446ms ± 5% 454ms ± 3% +1.79% (p=0.002 n=19+20) StdCmd 26.0s ± 2% 26.0s ± 2% ~ (p=0.799 n=20+20) name old user-time/op new user-time/op delta Template 238ms ± 5% 240ms ± 5% ~ (p=0.142 n=20+20) Unicode 105ms ±11% 106ms ±10% ~ (p=0.512 n=20+20) GoTypes 876ms ± 2% 873ms ± 4% ~ (p=0.647 n=20+19) Compiler 4.17s ± 2% 4.19s ± 1% ~ (p=0.093 n=20+18) SSA 13.9s ± 1% 14.1s ± 1% +1.45% (p=0.000 n=18+18) Flate 145ms ±13% 146ms ± 5% ~ (p=0.851 n=20+18) GoParser 185ms ± 5% 188ms ± 7% ~ (p=0.174 n=20+20) Reflect 534ms ± 3% 538ms ± 2% ~ (p=0.105 n=20+18) Tar 215ms ± 4% 211ms ± 9% ~ (p=0.079 n=19+20) XML 295ms ± 6% 295ms ± 5% ~ (p=0.968 n=20+20) LinkCompiler 832ms ± 4% 837ms ± 7% ~ (p=0.707 n=17+20) ExternalLinkCompiler 1.58s ± 8% 1.60s ± 4% ~ (p=0.296 n=20+19) LinkWithoutDebugCompiler 478ms ±12% 489ms ±10% ~ (p=0.429 n=20+20) name old object-bytes new object-bytes delta Template 559kB ± 0% 559kB ± 0% ~ (all equal) Unicode 216kB ± 0% 216kB ± 0% ~ (all equal) GoTypes 2.03MB ± 0% 2.03MB ± 0% ~ (all equal) Compiler 8.07MB ± 0% 8.07MB ± 0% -0.06% (p=0.000 n=20+20) SSA 27.1MB ± 0% 27.3MB ± 0% +0.89% (p=0.000 n=20+20) Flate 343kB ± 0% 343kB ± 0% ~ (all equal) GoParser 441kB ± 0% 441kB ± 0% ~ (all equal) Reflect 1.36MB ± 0% 1.36MB ± 0% ~ (all equal) Tar 487kB ± 0% 487kB ± 0% ~ (all equal) XML 632kB ± 0% 632kB ± 0% ~ (all equal) name old export-bytes new export-bytes delta Template 18.5kB ± 0% 18.5kB ± 0% ~ (all equal) Unicode 7.92kB ± 0% 7.92kB ± 0% ~ (all equal) GoTypes 35.0kB ± 0% 35.0kB ± 0% ~ (all equal) Compiler 109kB ± 0% 110kB ± 0% +0.72% (p=0.000 n=20+20) SSA 137kB ± 0% 138kB ± 0% +0.58% (p=0.000 n=20+20) Flate 4.89kB ± 0% 4.89kB ± 0% ~ (all equal) GoParser 8.49kB ± 0% 8.49kB ± 0% ~ (all equal) Reflect 11.4kB ± 0% 11.4kB ± 0% ~ (all equal) Tar 10.5kB ± 0% 10.5kB ± 0% ~ (all equal) XML 16.7kB ± 0% 16.7kB ± 0% ~ (all equal) name old text-bytes new text-bytes delta HelloSize 761kB ± 0% 761kB ± 0% ~ (all equal) CmdGoSize 10.8MB ± 0% 10.8MB ± 0% ~ (all equal) name old data-bytes new data-bytes delta HelloSize 10.7kB ± 0% 10.7kB ± 0% ~ (all equal) CmdGoSize 312kB ± 0% 312kB ± 0% ~ (all equal) name old bss-bytes new bss-bytes delta HelloSize 122kB ± 0% 122kB ± 0% ~ (all equal) CmdGoSize 146kB ± 0% 146kB ± 0% ~ (all equal) name old exe-bytes new exe-bytes delta HelloSize 1.13MB ± 0% 1.13MB ± 0% ~ (all equal) CmdGoSize 15.1MB ± 0% 15.1MB ± 0% ~ (all equal) Change-Id: I3cc2f9829a109543d9a68be4a21775d2d3e9801f Reviewed-on: https://go-review.googlesource.com/c/go/+/196557 Run-TryBot: Michael Munday <mike.munday@ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Daniel Martí <mvdan@mvdan.cc> Reviewed-by: Keith Randall <khr@golang.org>
2019-08-12 20:19:58 +01:00
for _, c := range b.ControlValues() {
flive = c.Type.IsFlags() || flive
}
for i := len(b.Values) - 1; i >= 0; i-- {
v := b.Values[i]
if flive && (v.Op == ssa.OpAMD64MOVLconst || v.Op == ssa.OpAMD64MOVQconst) {
// The "mark" is any non-nil Aux value.
v.Aux = ssa.AuxMark
}
if v.Type.IsFlags() {
flive = false
}
for _, a := range v.Args {
if a.Type.IsFlags() {
flive = true
}
}
}
}
func isFPReg(r int16) bool {
return x86.REG_X0 <= r && r <= x86.REG_Z31
}
func isKReg(r int16) bool {
return x86.REG_K0 <= r && r <= x86.REG_K7
}
func isLowFPReg(r int16) bool {
return x86.REG_X0 <= r && r <= x86.REG_X15
}
// loadByRegWidth returns the load instruction of the given register of a given width.
func loadByRegWidth(r int16, width int64) obj.As {
// Avoid partial register write for GPR
if !isFPReg(r) && !isKReg(r) {
switch width {
case 1:
return x86.AMOVBLZX
case 2:
return x86.AMOVWLZX
}
}
// Otherwise, there's no difference between load and store opcodes.
return storeByRegWidth(r, width)
}
// storeByRegWidth returns the store instruction of the given register of a given width.
// It's also used for loading const to a reg.
func storeByRegWidth(r int16, width int64) obj.As {
if isFPReg(r) {
switch width {
case 4:
return x86.AMOVSS
case 8:
return x86.AMOVSD
case 16:
// int128s are in SSE registers
if isLowFPReg(r) {
return x86.AMOVUPS
} else {
return x86.AVMOVDQU
}
case 32:
return x86.AVMOVDQU
case 64:
return x86.AVMOVDQU64
}
}
if isKReg(r) {
return x86.AKMOVQ
}
// gp
switch width {
case 1:
return x86.AMOVB
case 2:
return x86.AMOVW
case 4:
return x86.AMOVL
case 8:
return x86.AMOVQ
}
panic(fmt.Sprintf("bad store reg=%v, width=%d", r, width))
}
// moveByRegsWidth returns the reg->reg move instruction of the given dest/src registers of a given width.
func moveByRegsWidth(dest, src int16, width int64) obj.As {
// fp -> fp
if isFPReg(dest) && isFPReg(src) {
// Moving the whole sse2 register is faster
// than moving just the correct low portion of it.
// There is no xmm->xmm move with 1 byte opcode,
// so use movups, which has 2 byte opcode.
if isLowFPReg(dest) && isLowFPReg(src) && width <= 16 {
return x86.AMOVUPS
}
if width <= 32 {
return x86.AVMOVDQU
}
return x86.AVMOVDQU64
}
// k -> gp, gp -> k, k -> k
if isKReg(dest) || isKReg(src) {
if isFPReg(dest) || isFPReg(src) {
panic(fmt.Sprintf("bad move, src=%v, dest=%v, width=%d", src, dest, width))
}
return x86.AKMOVQ
}
// gp -> fp, fp -> gp, gp -> gp
switch width {
case 1:
// Avoids partial register write
return x86.AMOVL
case 2:
return x86.AMOVL
case 4:
return x86.AMOVL
case 8:
return x86.AMOVQ
case 16:
if isLowFPReg(dest) && isLowFPReg(src) {
// int128s are in SSE registers
return x86.AMOVUPS
} else {
return x86.AVMOVDQU
}
case 32:
return x86.AVMOVDQU
case 64:
return x86.AVMOVDQU64
}
panic(fmt.Sprintf("bad move, src=%v, dest=%v, width=%d", src, dest, width))
}
// opregreg emits instructions for
//
// dest := dest(To) op src(From)
//
// and also returns the created obj.Prog so it
// may be further adjusted (offset, scale, etc).
[dev.regabi] cmd/compile: split out package ssagen [generated] [git-generate] cd src/cmd/compile/internal/gc rf ' # maxOpenDefers is declared in ssa.go but used only by walk. mv maxOpenDefers walk.go # gc.Arch -> ssagen.Arch # It is not as nice but will do for now. mv Arch ArchInfo mv thearch Arch mv Arch ArchInfo arch.go # Pull dwarf out of pgen.go. mv debuginfo declPos createDwarfVars preInliningDcls \ createSimpleVars createSimpleVar \ createComplexVars createComplexVar \ dwarf.go # Pull high-level compilation out of pgen.go, # leaving only the SSA code. mv compilequeue funccompile compile compilenow \ compileFunctions isInlinableButNotInlined \ initLSym \ compile.go mv BoundsCheckFunc GCWriteBarrierReg ssa.go mv largeStack largeStackFrames CheckLargeStacks pgen.go # All that is left in dcl.go is the nowritebarrierrecCheck mv dcl.go nowb.go # Export API and unexport non-API. mv initssaconfig InitConfig mv isIntrinsicCall IsIntrinsicCall mv ssaDumpInline DumpInline mv initSSATables InitTables mv initSSAEnv InitEnv mv compileSSA Compile mv stackOffset StackOffset mv canSSAType TypeOK mv SSAGenState State mv FwdRefAux fwdRefAux mv cgoSymABIs CgoSymABIs mv readSymABIs ReadSymABIs mv initLSym InitLSym mv useABIWrapGen symabiDefs CgoSymABIs ReadSymABIs InitLSym selectLSym makeABIWrapper setupTextLSym abi.go mv arch.go abi.go nowb.go phi.go pgen.go pgen_test.go ssa.go cmd/compile/internal/ssagen ' rm go.go gsubr.go Change-Id: I47fad6cbf1d1e583fd9139003a08401d7cd048a1 Reviewed-on: https://go-review.googlesource.com/c/go/+/279476 Trust: Russ Cox <rsc@golang.org> Run-TryBot: Russ Cox <rsc@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com>
2020-12-23 00:57:10 -05:00
func opregreg(s *ssagen.State, op obj.As, dest, src int16) *obj.Prog {
p := s.Prog(op)
p.From.Type = obj.TYPE_REG
p.To.Type = obj.TYPE_REG
p.To.Reg = dest
p.From.Reg = src
return p
}
// memIdx fills out a as an indexed memory reference for v.
// It assumes that the base register and the index register
// are v.Args[0].Reg() and v.Args[1].Reg(), respectively.
// The caller must still use gc.AddAux/gc.AddAux2 to handle v.Aux as necessary.
func memIdx(a *obj.Addr, v *ssa.Value) {
r, i := v.Args[0].Reg(), v.Args[1].Reg()
a.Type = obj.TYPE_MEM
a.Scale = v.Op.Scale()
if a.Scale == 1 && i == x86.REG_SP {
r, i = i, r
}
a.Reg = r
a.Index = i
}
func getgFromTLS(s *ssagen.State, r int16) {
// See the comments in cmd/internal/obj/x86/obj6.go
// near CanUse1InsnTLS for a detailed explanation of these instructions.
if x86.CanUse1InsnTLS(base.Ctxt) {
// MOVQ (TLS), r
p := s.Prog(x86.AMOVQ)
p.From.Type = obj.TYPE_MEM
p.From.Reg = x86.REG_TLS
p.To.Type = obj.TYPE_REG
p.To.Reg = r
} else {
// MOVQ TLS, r
// MOVQ (r)(TLS*1), r
p := s.Prog(x86.AMOVQ)
p.From.Type = obj.TYPE_REG
p.From.Reg = x86.REG_TLS
p.To.Type = obj.TYPE_REG
p.To.Reg = r
q := s.Prog(x86.AMOVQ)
q.From.Type = obj.TYPE_MEM
q.From.Reg = r
q.From.Index = x86.REG_TLS
q.From.Scale = 1
q.To.Type = obj.TYPE_REG
q.To.Reg = r
}
}
[dev.regabi] cmd/compile: split out package ssagen [generated] [git-generate] cd src/cmd/compile/internal/gc rf ' # maxOpenDefers is declared in ssa.go but used only by walk. mv maxOpenDefers walk.go # gc.Arch -> ssagen.Arch # It is not as nice but will do for now. mv Arch ArchInfo mv thearch Arch mv Arch ArchInfo arch.go # Pull dwarf out of pgen.go. mv debuginfo declPos createDwarfVars preInliningDcls \ createSimpleVars createSimpleVar \ createComplexVars createComplexVar \ dwarf.go # Pull high-level compilation out of pgen.go, # leaving only the SSA code. mv compilequeue funccompile compile compilenow \ compileFunctions isInlinableButNotInlined \ initLSym \ compile.go mv BoundsCheckFunc GCWriteBarrierReg ssa.go mv largeStack largeStackFrames CheckLargeStacks pgen.go # All that is left in dcl.go is the nowritebarrierrecCheck mv dcl.go nowb.go # Export API and unexport non-API. mv initssaconfig InitConfig mv isIntrinsicCall IsIntrinsicCall mv ssaDumpInline DumpInline mv initSSATables InitTables mv initSSAEnv InitEnv mv compileSSA Compile mv stackOffset StackOffset mv canSSAType TypeOK mv SSAGenState State mv FwdRefAux fwdRefAux mv cgoSymABIs CgoSymABIs mv readSymABIs ReadSymABIs mv initLSym InitLSym mv useABIWrapGen symabiDefs CgoSymABIs ReadSymABIs InitLSym selectLSym makeABIWrapper setupTextLSym abi.go mv arch.go abi.go nowb.go phi.go pgen.go pgen_test.go ssa.go cmd/compile/internal/ssagen ' rm go.go gsubr.go Change-Id: I47fad6cbf1d1e583fd9139003a08401d7cd048a1 Reviewed-on: https://go-review.googlesource.com/c/go/+/279476 Trust: Russ Cox <rsc@golang.org> Run-TryBot: Russ Cox <rsc@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com>
2020-12-23 00:57:10 -05:00
func ssaGenValue(s *ssagen.State, v *ssa.Value) {
switch v.Op {
cmd/compile: lower x*z + y to FMA if FMA enabled There is a generic opcode for FMA, but we don't use it in rewrite rules. This is maybe because some archs, like WASM and MIPS don't have a late lowering rule for it. Fixes #71204 Intel Alder Lake 12600k (GOAMD64=v3): math: name old time/op new time/op delta Acos-16 4.58ns ± 0% 3.36ns ± 0% -26.68% (p=0.008 n=5+5) Acosh-16 8.04ns ± 1% 6.44ns ± 0% -19.95% (p=0.008 n=5+5) Asin-16 4.28ns ± 0% 3.32ns ± 0% -22.24% (p=0.008 n=5+5) Asinh-16 9.92ns ± 0% 8.62ns ± 0% -13.13% (p=0.008 n=5+5) Atan-16 2.31ns ± 0% 1.84ns ± 0% -20.02% (p=0.008 n=5+5) Atanh-16 7.79ns ± 0% 7.03ns ± 0% -9.67% (p=0.008 n=5+5) Atan2-16 3.93ns ± 0% 3.52ns ± 0% -10.35% (p=0.000 n=5+4) Cbrt-16 4.62ns ± 0% 4.41ns ± 0% -4.57% (p=0.016 n=4+5) Ceil-16 0.14ns ± 1% 0.14ns ± 2% ~ (p=0.103 n=5+5) Copysign-16 0.33ns ± 0% 0.33ns ± 0% +0.03% (p=0.029 n=4+4) Cos-16 4.87ns ± 0% 4.75ns ± 0% -2.44% (p=0.016 n=5+4) Cosh-16 4.86ns ± 0% 4.86ns ± 0% ~ (p=0.317 n=5+5) Erf-16 2.71ns ± 0% 2.25ns ± 0% -16.69% (p=0.008 n=5+5) Erfc-16 3.06ns ± 0% 2.67ns ± 0% -13.00% (p=0.016 n=5+4) Erfinv-16 3.88ns ± 0% 2.84ns ± 3% -26.83% (p=0.008 n=5+5) Erfcinv-16 4.08ns ± 0% 3.01ns ± 1% -26.27% (p=0.008 n=5+5) Exp-16 3.29ns ± 0% 3.37ns ± 2% +2.64% (p=0.016 n=4+5) ExpGo-16 8.44ns ± 0% 7.48ns ± 1% -11.37% (p=0.008 n=5+5) Expm1-16 4.46ns ± 0% 3.69ns ± 2% -17.26% (p=0.016 n=4+5) Exp2-16 8.20ns ± 0% 7.39ns ± 2% -9.94% (p=0.008 n=5+5) Exp2Go-16 8.26ns ± 0% 7.23ns ± 0% -12.49% (p=0.016 n=4+5) Abs-16 0.26ns ± 3% 0.22ns ± 1% -16.34% (p=0.008 n=5+5) Dim-16 0.38ns ± 1% 0.40ns ± 2% +5.02% (p=0.008 n=5+5) Floor-16 0.11ns ± 1% 0.17ns ± 4% +54.99% (p=0.008 n=5+5) Max-16 1.24ns ± 0% 1.24ns ± 0% ~ (p=0.619 n=5+5) Min-16 1.24ns ± 0% 1.24ns ± 0% ~ (p=0.484 n=5+5) Mod-16 13.4ns ± 1% 12.8ns ± 0% -4.21% (p=0.016 n=5+4) Frexp-16 1.70ns ± 0% 1.71ns ± 0% +0.46% (p=0.008 n=5+5) Gamma-16 3.97ns ± 0% 3.97ns ± 0% ~ (p=0.643 n=5+5) Hypot-16 2.11ns ± 0% 2.11ns ± 0% ~ (p=0.762 n=5+5) HypotGo-16 2.48ns ± 4% 2.26ns ± 0% -8.94% (p=0.008 n=5+5) Ilogb-16 1.67ns ± 0% 1.67ns ± 0% -0.07% (p=0.048 n=5+5) J0-16 19.8ns ± 0% 19.3ns ± 0% ~ (p=0.079 n=4+5) J1-16 19.4ns ± 0% 18.9ns ± 0% -2.63% (p=0.000 n=5+4) Jn-16 41.5ns ± 0% 40.6ns ± 0% -2.32% (p=0.016 n=4+5) Ldexp-16 2.26ns ± 0% 2.26ns ± 0% ~ (p=0.683 n=5+5) Lgamma-16 4.40ns ± 0% 4.21ns ± 0% -4.21% (p=0.008 n=5+5) Log-16 4.05ns ± 0% 4.05ns ± 0% ~ (all equal) Logb-16 1.69ns ± 0% 1.69ns ± 0% ~ (p=0.429 n=5+5) Log1p-16 5.00ns ± 0% 3.99ns ± 0% -20.14% (p=0.008 n=5+5) Log10-16 4.22ns ± 0% 4.21ns ± 0% -0.15% (p=0.008 n=5+5) Log2-16 2.27ns ± 0% 2.25ns ± 0% -0.94% (p=0.008 n=5+5) Modf-16 1.44ns ± 0% 1.44ns ± 0% ~ (p=0.492 n=5+5) Nextafter32-16 2.09ns ± 0% 2.09ns ± 0% ~ (p=0.079 n=4+5) Nextafter64-16 2.09ns ± 0% 2.09ns ± 0% ~ (p=0.095 n=4+5) PowInt-16 10.8ns ± 0% 10.8ns ± 0% ~ (all equal) PowFrac-16 25.3ns ± 0% 25.3ns ± 0% -0.09% (p=0.000 n=5+4) Pow10Pos-16 0.52ns ± 1% 0.52ns ± 0% ~ (p=0.810 n=5+5) Pow10Neg-16 0.82ns ± 0% 0.82ns ± 0% ~ (p=0.381 n=5+5) Round-16 0.93ns ± 0% 0.93ns ± 0% ~ (p=0.056 n=5+5) RoundToEven-16 1.64ns ± 0% 1.64ns ± 0% ~ (all equal) Remainder-16 12.4ns ± 2% 12.0ns ± 0% -3.27% (p=0.008 n=5+5) Signbit-16 0.37ns ± 0% 0.37ns ± 0% -0.19% (p=0.008 n=5+5) Sin-16 4.04ns ± 0% 3.92ns ± 0% -3.13% (p=0.000 n=4+5) Sincos-16 5.99ns ± 0% 5.80ns ± 0% -3.03% (p=0.008 n=5+5) Sinh-16 5.22ns ± 0% 5.22ns ± 0% ~ (p=0.651 n=5+4) SqrtIndirect-16 0.41ns ± 0% 0.41ns ± 0% ~ (p=0.333 n=4+5) SqrtLatency-16 2.66ns ± 0% 2.66ns ± 0% ~ (p=0.079 n=4+5) SqrtIndirectLatency-16 2.66ns ± 0% 2.66ns ± 0% ~ (p=1.000 n=5+5) SqrtGoLatency-16 30.1ns ± 0% 28.6ns ± 1% -4.84% (p=0.008 n=5+5) SqrtPrime-16 645ns ± 0% 645ns ± 0% ~ (p=0.095 n=5+4) Tan-16 4.21ns ± 0% 4.09ns ± 0% -2.76% (p=0.029 n=4+4) Tanh-16 5.36ns ± 0% 5.36ns ± 0% ~ (p=0.444 n=5+5) Trunc-16 0.12ns ± 6% 0.11ns ± 1% -6.79% (p=0.008 n=5+5) Y0-16 19.2ns ± 0% 18.7ns ± 0% -2.52% (p=0.000 n=5+4) Y1-16 19.1ns ± 0% 18.4ns ± 0% ~ (p=0.079 n=4+5) Yn-16 40.7ns ± 0% 39.5ns ± 0% -2.82% (p=0.008 n=5+5) Float64bits-16 0.21ns ± 0% 0.21ns ± 0% ~ (p=0.603 n=5+5) Float64frombits-16 0.21ns ± 0% 0.21ns ± 0% ~ (p=0.984 n=4+5) Float32bits-16 0.21ns ± 0% 0.21ns ± 0% ~ (p=0.778 n=4+5) Float32frombits-16 0.21ns ± 0% 0.20ns ± 0% ~ (p=0.397 n=5+5) FMA-16 0.82ns ± 0% 0.82ns ± 0% +0.02% (p=0.029 n=4+4) [Geo mean] 2.87ns 2.74ns -4.61% math/cmplx: name old time/op new time/op delta Abs-16 2.07ns ± 0% 2.05ns ± 0% -0.70% (p=0.016 n=5+4) Acos-16 36.5ns ± 0% 35.7ns ± 0% -2.33% (p=0.029 n=4+4) Acosh-16 37.0ns ± 0% 36.2ns ± 0% -2.20% (p=0.008 n=5+5) Asin-16 36.5ns ± 0% 35.7ns ± 0% -2.29% (p=0.008 n=5+5) Asinh-16 33.5ns ± 0% 31.6ns ± 0% -5.51% (p=0.008 n=5+5) Atan-16 15.5ns ± 0% 13.9ns ± 0% -10.61% (p=0.008 n=5+5) Atanh-16 15.0ns ± 0% 13.6ns ± 0% -9.73% (p=0.008 n=5+5) Conj-16 0.11ns ± 5% 0.11ns ± 1% ~ (p=0.421 n=5+5) Cos-16 12.3ns ± 0% 12.2ns ± 0% -0.60% (p=0.000 n=4+5) Cosh-16 12.1ns ± 0% 12.0ns ± 0% ~ (p=0.079 n=4+5) Exp-16 10.0ns ± 0% 9.8ns ± 0% -1.77% (p=0.008 n=5+5) Log-16 14.5ns ± 0% 13.7ns ± 0% -5.67% (p=0.008 n=5+5) Log10-16 14.5ns ± 0% 13.7ns ± 0% -5.55% (p=0.000 n=5+4) Phase-16 5.11ns ± 0% 4.25ns ± 0% -16.90% (p=0.008 n=5+5) Polar-16 7.12ns ± 0% 6.35ns ± 0% -10.90% (p=0.008 n=5+5) Pow-16 64.3ns ± 0% 63.7ns ± 0% -0.97% (p=0.008 n=5+5) Rect-16 5.74ns ± 0% 5.58ns ± 0% -2.73% (p=0.016 n=4+5) Sin-16 12.2ns ± 0% 12.2ns ± 0% -0.54% (p=0.000 n=4+5) Sinh-16 12.1ns ± 0% 12.0ns ± 0% -0.58% (p=0.000 n=5+4) Sqrt-16 5.30ns ± 0% 5.18ns ± 0% -2.36% (p=0.008 n=5+5) Tan-16 22.7ns ± 0% 22.6ns ± 0% -0.33% (p=0.008 n=5+5) Tanh-16 21.2ns ± 0% 20.9ns ± 0% -1.32% (p=0.008 n=5+5) [Geo mean] 11.3ns 10.8ns -3.97% Change-Id: Idcc4b357ba68477929c126289e5095b27a827b1b Reviewed-on: https://go-review.googlesource.com/c/go/+/646335 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Auto-Submit: Keith Randall <khr@golang.org> Reviewed-by: Keith Randall <khr@google.com> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> Reviewed-by: Keith Randall <khr@golang.org>
2025-02-02 23:42:43 +01:00
case ssa.OpAMD64VFMADD231SD, ssa.OpAMD64VFMADD231SS:
p := s.Prog(v.Op.Asm())
p.From = obj.Addr{Type: obj.TYPE_REG, Reg: v.Args[2].Reg()}
p.To = obj.Addr{Type: obj.TYPE_REG, Reg: v.Reg()}
p.AddRestSourceReg(v.Args[1].Reg())
case ssa.OpAMD64ADDQ, ssa.OpAMD64ADDL:
r := v.Reg()
r1 := v.Args[0].Reg()
r2 := v.Args[1].Reg()
switch {
case r == r1:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = r2
p.To.Type = obj.TYPE_REG
p.To.Reg = r
case r == r2:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = r1
p.To.Type = obj.TYPE_REG
p.To.Reg = r
default:
var asm obj.As
if v.Op == ssa.OpAMD64ADDQ {
asm = x86.ALEAQ
} else {
asm = x86.ALEAL
}
p := s.Prog(asm)
p.From.Type = obj.TYPE_MEM
p.From.Reg = r1
p.From.Scale = 1
p.From.Index = r2
p.To.Type = obj.TYPE_REG
p.To.Reg = r
}
// 2-address opcode arithmetic
case ssa.OpAMD64SUBQ, ssa.OpAMD64SUBL,
ssa.OpAMD64MULQ, ssa.OpAMD64MULL,
ssa.OpAMD64ANDQ, ssa.OpAMD64ANDL,
ssa.OpAMD64ORQ, ssa.OpAMD64ORL,
ssa.OpAMD64XORQ, ssa.OpAMD64XORL,
ssa.OpAMD64SHLQ, ssa.OpAMD64SHLL,
ssa.OpAMD64SHRQ, ssa.OpAMD64SHRL, ssa.OpAMD64SHRW, ssa.OpAMD64SHRB,
ssa.OpAMD64SARQ, ssa.OpAMD64SARL, ssa.OpAMD64SARW, ssa.OpAMD64SARB,
ssa.OpAMD64ROLQ, ssa.OpAMD64ROLL, ssa.OpAMD64ROLW, ssa.OpAMD64ROLB,
ssa.OpAMD64RORQ, ssa.OpAMD64RORL, ssa.OpAMD64RORW, ssa.OpAMD64RORB,
ssa.OpAMD64ADDSS, ssa.OpAMD64ADDSD, ssa.OpAMD64SUBSS, ssa.OpAMD64SUBSD,
ssa.OpAMD64MULSS, ssa.OpAMD64MULSD, ssa.OpAMD64DIVSS, ssa.OpAMD64DIVSD,
ssa.OpAMD64MINSS, ssa.OpAMD64MINSD,
ssa.OpAMD64POR, ssa.OpAMD64PXOR,
cmd/compile: add patterns for bit set/clear/complement on amd64 This patch completes implementation of BT(Q|L), and adds support for BT(S|R|C)(Q|L). Example of code changes from time.(*Time).addSec: if t.wall&hasMonotonic != 0 { 0x1073465 488b08 MOVQ 0(AX), CX 0x1073468 4889ca MOVQ CX, DX 0x107346b 48c1e93f SHRQ $0x3f, CX 0x107346f 48c1e13f SHLQ $0x3f, CX 0x1073473 48f7c1ffffffff TESTQ $-0x1, CX 0x107347a 746b JE 0x10734e7 if t.wall&hasMonotonic != 0 { 0x1073435 488b08 MOVQ 0(AX), CX 0x1073438 480fbae13f BTQ $0x3f, CX 0x107343d 7363 JAE 0x10734a2 Another example: t.wall = t.wall&nsecMask | uint64(dsec)<<nsecShift | hasMonotonic 0x10734c8 4881e1ffffff3f ANDQ $0x3fffffff, CX 0x10734cf 48c1e61e SHLQ $0x1e, SI 0x10734d3 4809ce ORQ CX, SI 0x10734d6 48b90000000000000080 MOVQ $0x8000000000000000, CX 0x10734e0 4809f1 ORQ SI, CX 0x10734e3 488908 MOVQ CX, 0(AX) t.wall = t.wall&nsecMask | uint64(dsec)<<nsecShift | hasMonotonic 0x107348b 4881e2ffffff3f ANDQ $0x3fffffff, DX 0x1073492 48c1e61e SHLQ $0x1e, SI 0x1073496 4809f2 ORQ SI, DX 0x1073499 480fbaea3f BTSQ $0x3f, DX 0x107349e 488910 MOVQ DX, 0(AX) Go1 benchmarks seem unaffected, and I would be surprised otherwise: name old time/op new time/op delta BinaryTree17-4 2.64s ± 4% 2.56s ± 9% -2.92% (p=0.008 n=9+9) Fannkuch11-4 2.90s ± 1% 2.95s ± 3% +1.76% (p=0.010 n=10+9) FmtFprintfEmpty-4 35.3ns ± 1% 34.5ns ± 2% -2.34% (p=0.004 n=9+8) FmtFprintfString-4 57.0ns ± 1% 58.4ns ± 5% +2.52% (p=0.029 n=9+10) FmtFprintfInt-4 59.8ns ± 3% 59.8ns ± 6% ~ (p=0.565 n=10+10) FmtFprintfIntInt-4 93.9ns ± 3% 91.2ns ± 5% -2.94% (p=0.014 n=10+9) FmtFprintfPrefixedInt-4 107ns ± 6% 104ns ± 6% ~ (p=0.099 n=10+10) FmtFprintfFloat-4 187ns ± 3% 188ns ± 3% ~ (p=0.505 n=10+9) FmtManyArgs-4 410ns ± 1% 415ns ± 6% ~ (p=0.649 n=8+10) GobDecode-4 5.30ms ± 3% 5.27ms ± 3% ~ (p=0.436 n=10+10) GobEncode-4 4.62ms ± 5% 4.47ms ± 2% -3.24% (p=0.001 n=9+10) Gzip-4 197ms ± 4% 193ms ± 3% ~ (p=0.123 n=10+10) Gunzip-4 30.4ms ± 3% 30.1ms ± 3% ~ (p=0.481 n=10+10) HTTPClientServer-4 76.3µs ± 1% 76.0µs ± 1% ~ (p=0.236 n=8+9) JSONEncode-4 10.5ms ± 9% 10.3ms ± 3% ~ (p=0.280 n=10+10) JSONDecode-4 42.3ms ±10% 41.3ms ± 2% ~ (p=0.053 n=9+10) Mandelbrot200-4 3.80ms ± 2% 3.72ms ± 2% -2.15% (p=0.001 n=9+10) GoParse-4 2.88ms ±10% 2.81ms ± 2% ~ (p=0.247 n=10+10) RegexpMatchEasy0_32-4 69.5ns ± 4% 68.6ns ± 2% ~ (p=0.171 n=10+10) RegexpMatchEasy0_1K-4 165ns ± 3% 162ns ± 3% ~ (p=0.137 n=10+10) RegexpMatchEasy1_32-4 65.7ns ± 6% 64.4ns ± 2% -2.02% (p=0.037 n=10+10) RegexpMatchEasy1_1K-4 278ns ± 2% 279ns ± 3% ~ (p=0.991 n=8+9) RegexpMatchMedium_32-4 99.3ns ± 3% 98.5ns ± 4% ~ (p=0.457 n=10+9) RegexpMatchMedium_1K-4 30.1µs ± 1% 30.4µs ± 2% ~ (p=0.173 n=8+10) RegexpMatchHard_32-4 1.40µs ± 2% 1.41µs ± 4% ~ (p=0.565 n=10+10) RegexpMatchHard_1K-4 42.5µs ± 1% 41.5µs ± 3% -2.13% (p=0.002 n=8+9) Revcomp-4 332ms ± 4% 328ms ± 5% ~ (p=0.720 n=9+10) Template-4 48.3ms ± 2% 49.6ms ± 3% +2.56% (p=0.002 n=8+10) TimeParse-4 252ns ± 2% 249ns ± 3% ~ (p=0.116 n=9+10) TimeFormat-4 262ns ± 4% 252ns ± 3% -4.01% (p=0.000 n=9+10) name old speed new speed delta GobDecode-4 145MB/s ± 3% 146MB/s ± 3% ~ (p=0.436 n=10+10) GobEncode-4 166MB/s ± 5% 172MB/s ± 2% +3.28% (p=0.001 n=9+10) Gzip-4 98.6MB/s ± 4% 100.4MB/s ± 3% ~ (p=0.123 n=10+10) Gunzip-4 639MB/s ± 3% 645MB/s ± 3% ~ (p=0.481 n=10+10) JSONEncode-4 185MB/s ± 8% 189MB/s ± 3% ~ (p=0.280 n=10+10) JSONDecode-4 46.0MB/s ± 9% 47.0MB/s ± 2% +2.21% (p=0.046 n=9+10) GoParse-4 20.1MB/s ± 9% 20.6MB/s ± 2% ~ (p=0.239 n=10+10) RegexpMatchEasy0_32-4 460MB/s ± 4% 467MB/s ± 2% ~ (p=0.165 n=10+10) RegexpMatchEasy0_1K-4 6.19GB/s ± 3% 6.28GB/s ± 3% ~ (p=0.165 n=10+10) RegexpMatchEasy1_32-4 487MB/s ± 5% 497MB/s ± 2% +2.00% (p=0.043 n=10+10) RegexpMatchEasy1_1K-4 3.67GB/s ± 2% 3.67GB/s ± 3% ~ (p=0.963 n=8+9) RegexpMatchMedium_32-4 10.1MB/s ± 3% 10.1MB/s ± 4% ~ (p=0.435 n=10+9) RegexpMatchMedium_1K-4 34.0MB/s ± 1% 33.7MB/s ± 2% ~ (p=0.173 n=8+10) RegexpMatchHard_32-4 22.9MB/s ± 2% 22.7MB/s ± 4% ~ (p=0.565 n=10+10) RegexpMatchHard_1K-4 24.0MB/s ± 3% 24.7MB/s ± 3% +2.64% (p=0.001 n=9+9) Revcomp-4 766MB/s ± 4% 775MB/s ± 5% ~ (p=0.720 n=9+10) Template-4 40.2MB/s ± 2% 39.2MB/s ± 3% -2.47% (p=0.002 n=8+10) The rules match ~1800 times during all.bash. Fixes #18943 Change-Id: I64be1ada34e89c486dfd935bf429b35652117ed4 Reviewed-on: https://go-review.googlesource.com/94766 Run-TryBot: Giovanni Bajo <rasky@develer.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2018-02-17 13:54:03 +01:00
ssa.OpAMD64BTSL, ssa.OpAMD64BTSQ,
ssa.OpAMD64BTCL, ssa.OpAMD64BTCQ,
cmd/compile: intrinsify swissmap match calls with SIMD on amd64 Use similar SIMD operations to the ones used in Abseil. We still using 8-slot groups (even though the XMM registers could handle 16-slot groups) to keep the implementation simpler (no changes to the memory layout of maps). Still, the implementations of matchH2 and matchEmpty are shorter than the portable version using standard arithmetic operations. They also return a packed bitset, which avoids the need to shift in bitset.first. That said, the packed bitset is a downside in cognitive complexity, as we have to think about two different possible representations. This doesn't leak out of the API, but we do need to intrinsify bitset to switch to a compatible implementation. The compiler's intrinsics don't support intrinsifying methods, so the implementations move to free functions. This makes operations between 0-3% faster on my machine. e.g., MapGetHit/impl=runtimeMap/t=Int64/len=6-12 12.34n ± 1% 11.42n ± 1% -7.46% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=12-12 15.14n ± 2% 14.88n ± 1% -1.72% (p=0.009 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=18-12 15.04n ± 6% 14.66n ± 2% -2.53% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=24-12 15.80n ± 1% 15.48n ± 3% ~ (p=0.444 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=30-12 15.55n ± 4% 14.77n ± 3% -5.02% (p=0.004 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=64-12 15.26n ± 1% 15.05n ± 1% ~ (p=0.055 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=128-12 15.34n ± 1% 15.02n ± 2% -2.09% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=256-12 15.42n ± 1% 15.15n ± 1% -1.75% (p=0.001 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=512-12 15.48n ± 1% 15.18n ± 1% -1.94% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=1024-12 17.38n ± 1% 17.05n ± 1% -1.90% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=2048-12 17.96n ± 0% 17.59n ± 1% -2.06% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=4096-12 18.36n ± 1% 18.18n ± 1% -0.98% (p=0.013 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=8192-12 18.75n ± 0% 18.31n ± 1% -2.35% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=65536-12 26.25n ± 0% 25.95n ± 1% -1.14% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=262144-12 44.24n ± 1% 44.06n ± 1% ~ (p=0.181 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=1048576-12 85.02n ± 0% 85.35n ± 0% +0.39% (p=0.032 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=4194304-12 98.87n ± 1% 98.85n ± 1% ~ (p=0.799 n=25) For #54766. Cq-Include-Trybots: luci.golang.try:gotip-linux-ppc64_power10,gotip-linux-amd64-goamd64v3 Change-Id: Ic1b852f02744404122cb3672900fd95f4625905e Reviewed-on: https://go-review.googlesource.com/c/go/+/626277 Reviewed-by: Keith Randall <khr@golang.org> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Auto-Submit: Michael Pratt <mpratt@google.com> Reviewed-by: Keith Randall <khr@google.com>
2024-11-04 12:41:33 -05:00
ssa.OpAMD64BTRL, ssa.OpAMD64BTRQ,
ssa.OpAMD64PCMPEQB, ssa.OpAMD64PSIGNB,
ssa.OpAMD64PUNPCKLBW:
opregreg(s, v.Op.Asm(), v.Reg(), v.Args[1].Reg())
cmd/compile: intrinsify swissmap match calls with SIMD on amd64 Use similar SIMD operations to the ones used in Abseil. We still using 8-slot groups (even though the XMM registers could handle 16-slot groups) to keep the implementation simpler (no changes to the memory layout of maps). Still, the implementations of matchH2 and matchEmpty are shorter than the portable version using standard arithmetic operations. They also return a packed bitset, which avoids the need to shift in bitset.first. That said, the packed bitset is a downside in cognitive complexity, as we have to think about two different possible representations. This doesn't leak out of the API, but we do need to intrinsify bitset to switch to a compatible implementation. The compiler's intrinsics don't support intrinsifying methods, so the implementations move to free functions. This makes operations between 0-3% faster on my machine. e.g., MapGetHit/impl=runtimeMap/t=Int64/len=6-12 12.34n ± 1% 11.42n ± 1% -7.46% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=12-12 15.14n ± 2% 14.88n ± 1% -1.72% (p=0.009 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=18-12 15.04n ± 6% 14.66n ± 2% -2.53% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=24-12 15.80n ± 1% 15.48n ± 3% ~ (p=0.444 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=30-12 15.55n ± 4% 14.77n ± 3% -5.02% (p=0.004 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=64-12 15.26n ± 1% 15.05n ± 1% ~ (p=0.055 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=128-12 15.34n ± 1% 15.02n ± 2% -2.09% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=256-12 15.42n ± 1% 15.15n ± 1% -1.75% (p=0.001 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=512-12 15.48n ± 1% 15.18n ± 1% -1.94% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=1024-12 17.38n ± 1% 17.05n ± 1% -1.90% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=2048-12 17.96n ± 0% 17.59n ± 1% -2.06% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=4096-12 18.36n ± 1% 18.18n ± 1% -0.98% (p=0.013 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=8192-12 18.75n ± 0% 18.31n ± 1% -2.35% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=65536-12 26.25n ± 0% 25.95n ± 1% -1.14% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=262144-12 44.24n ± 1% 44.06n ± 1% ~ (p=0.181 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=1048576-12 85.02n ± 0% 85.35n ± 0% +0.39% (p=0.032 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=4194304-12 98.87n ± 1% 98.85n ± 1% ~ (p=0.799 n=25) For #54766. Cq-Include-Trybots: luci.golang.try:gotip-linux-ppc64_power10,gotip-linux-amd64-goamd64v3 Change-Id: Ic1b852f02744404122cb3672900fd95f4625905e Reviewed-on: https://go-review.googlesource.com/c/go/+/626277 Reviewed-by: Keith Randall <khr@golang.org> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Auto-Submit: Michael Pratt <mpratt@google.com> Reviewed-by: Keith Randall <khr@google.com>
2024-11-04 12:41:33 -05:00
case ssa.OpAMD64PSHUFLW:
p := s.Prog(v.Op.Asm())
imm := v.AuxInt
if imm < 0 || imm > 255 {
v.Fatalf("Invalid source selection immediate")
}
p.From.Offset = imm
p.From.Type = obj.TYPE_CONST
p.AddRestSourceReg(v.Args[0].Reg())
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpAMD64PSHUFBbroadcast:
// PSHUFB with a control mask of zero copies byte 0 to all
// bytes in the register.
//
// X15 is always zero with ABIInternal.
if s.ABI != obj.ABIInternal {
// zero X15 manually
opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
}
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
p.From.Reg = x86.REG_X15
cmd/compile: optimize multi-register shifts on amd64 amd64 can shift in bits from another register instead of filling with 0/1. This pattern is helpful when implementing 128 bit shifts or arbitrary length shifts. In the standard library, it shows up in pure Go math/big. Benchmarks results on amd64 with -tags=math_big_pure_go. name old time/op new time/op delta NonZeroShifts/1/shrVU-8 4.45ns ± 3% 4.39ns ± 1% -1.28% (p=0.000 n=30+27) NonZeroShifts/1/shlVU-8 4.13ns ± 4% 4.10ns ± 2% ~ (p=0.254 n=29+28) NonZeroShifts/2/shrVU-8 5.55ns ± 1% 5.63ns ± 2% +1.42% (p=0.000 n=28+29) NonZeroShifts/2/shlVU-8 5.70ns ± 2% 5.14ns ± 1% -9.82% (p=0.000 n=29+28) NonZeroShifts/3/shrVU-8 6.79ns ± 2% 6.35ns ± 2% -6.46% (p=0.000 n=28+29) NonZeroShifts/3/shlVU-8 6.69ns ± 1% 6.25ns ± 1% -6.60% (p=0.000 n=28+27) NonZeroShifts/4/shrVU-8 7.79ns ± 2% 7.06ns ± 2% -9.48% (p=0.000 n=30+30) NonZeroShifts/4/shlVU-8 7.82ns ± 1% 7.24ns ± 1% -7.37% (p=0.000 n=28+29) NonZeroShifts/5/shrVU-8 8.90ns ± 3% 7.93ns ± 1% -10.84% (p=0.000 n=29+26) NonZeroShifts/5/shlVU-8 8.68ns ± 1% 7.92ns ± 1% -8.76% (p=0.000 n=29+29) NonZeroShifts/10/shrVU-8 14.4ns ± 1% 12.3ns ± 2% -14.79% (p=0.000 n=28+29) NonZeroShifts/10/shlVU-8 14.1ns ± 1% 11.9ns ± 2% -15.55% (p=0.000 n=28+27) NonZeroShifts/100/shrVU-8 118ns ± 1% 96ns ± 3% -18.82% (p=0.000 n=30+29) NonZeroShifts/100/shlVU-8 120ns ± 2% 98ns ± 2% -18.46% (p=0.000 n=29+28) NonZeroShifts/1000/shrVU-8 1.10µs ± 1% 0.88µs ± 2% -19.63% (p=0.000 n=29+30) NonZeroShifts/1000/shlVU-8 1.10µs ± 2% 0.88µs ± 2% -20.28% (p=0.000 n=29+28) NonZeroShifts/10000/shrVU-8 10.9µs ± 1% 8.7µs ± 1% -19.78% (p=0.000 n=28+27) NonZeroShifts/10000/shlVU-8 10.9µs ± 2% 8.7µs ± 1% -19.64% (p=0.000 n=29+27) NonZeroShifts/100000/shrVU-8 111µs ± 2% 90µs ± 2% -19.39% (p=0.000 n=28+29) NonZeroShifts/100000/shlVU-8 113µs ± 2% 90µs ± 2% -20.43% (p=0.000 n=30+27) The assembly version is still faster, unfortunately, but the gap is narrowing. Speedup from pure Go to assembly: name old time/op new time/op delta NonZeroShifts/1/shrVU-8 4.39ns ± 1% 3.45ns ± 2% -21.36% (p=0.000 n=27+29) NonZeroShifts/1/shlVU-8 4.10ns ± 2% 3.47ns ± 3% -15.42% (p=0.000 n=28+30) NonZeroShifts/2/shrVU-8 5.63ns ± 2% 3.97ns ± 0% -29.40% (p=0.000 n=29+25) NonZeroShifts/2/shlVU-8 5.14ns ± 1% 3.77ns ± 2% -26.65% (p=0.000 n=28+26) NonZeroShifts/3/shrVU-8 6.35ns ± 2% 4.79ns ± 2% -24.52% (p=0.000 n=29+29) NonZeroShifts/3/shlVU-8 6.25ns ± 1% 4.42ns ± 1% -29.29% (p=0.000 n=27+26) NonZeroShifts/4/shrVU-8 7.06ns ± 2% 5.64ns ± 1% -20.05% (p=0.000 n=30+29) NonZeroShifts/4/shlVU-8 7.24ns ± 1% 5.34ns ± 2% -26.23% (p=0.000 n=29+29) NonZeroShifts/5/shrVU-8 7.93ns ± 1% 6.56ns ± 2% -17.26% (p=0.000 n=26+30) NonZeroShifts/5/shlVU-8 7.92ns ± 1% 6.27ns ± 1% -20.79% (p=0.000 n=29+25) NonZeroShifts/10/shrVU-8 12.3ns ± 2% 10.2ns ± 2% -17.21% (p=0.000 n=29+29) NonZeroShifts/10/shlVU-8 11.9ns ± 2% 10.5ns ± 2% -12.45% (p=0.000 n=27+29) NonZeroShifts/100/shrVU-8 95.9ns ± 3% 77.7ns ± 1% -19.00% (p=0.000 n=29+30) NonZeroShifts/100/shlVU-8 97.5ns ± 2% 66.8ns ± 2% -31.47% (p=0.000 n=28+30) NonZeroShifts/1000/shrVU-8 884ns ± 2% 705ns ± 1% -20.17% (p=0.000 n=30+28) NonZeroShifts/1000/shlVU-8 880ns ± 2% 590ns ± 1% -32.96% (p=0.000 n=28+25) NonZeroShifts/10000/shrVU-8 8.74µs ± 1% 7.34µs ± 3% -15.94% (p=0.000 n=27+30) NonZeroShifts/10000/shlVU-8 8.73µs ± 1% 6.00µs ± 1% -31.25% (p=0.000 n=27+28) NonZeroShifts/100000/shrVU-8 89.6µs ± 2% 75.5µs ± 2% -15.80% (p=0.000 n=29+29) NonZeroShifts/100000/shlVU-8 89.6µs ± 2% 68.0µs ± 3% -24.09% (p=0.000 n=27+30) Change-Id: I18f58d8f5513d737d9cdf09b8f9d14011ffe3958 Reviewed-on: https://go-review.googlesource.com/c/go/+/297050 Trust: Josh Bleecher Snyder <josharian@gmail.com> Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2021-01-07 19:25:05 -08:00
case ssa.OpAMD64SHRDQ, ssa.OpAMD64SHLDQ:
p := s.Prog(v.Op.Asm())
lo, hi, bits := v.Args[0].Reg(), v.Args[1].Reg(), v.Args[2].Reg()
p.From.Type = obj.TYPE_REG
p.From.Reg = bits
p.To.Type = obj.TYPE_REG
p.To.Reg = lo
p.AddRestSourceReg(hi)
cmd/compile: optimize multi-register shifts on amd64 amd64 can shift in bits from another register instead of filling with 0/1. This pattern is helpful when implementing 128 bit shifts or arbitrary length shifts. In the standard library, it shows up in pure Go math/big. Benchmarks results on amd64 with -tags=math_big_pure_go. name old time/op new time/op delta NonZeroShifts/1/shrVU-8 4.45ns ± 3% 4.39ns ± 1% -1.28% (p=0.000 n=30+27) NonZeroShifts/1/shlVU-8 4.13ns ± 4% 4.10ns ± 2% ~ (p=0.254 n=29+28) NonZeroShifts/2/shrVU-8 5.55ns ± 1% 5.63ns ± 2% +1.42% (p=0.000 n=28+29) NonZeroShifts/2/shlVU-8 5.70ns ± 2% 5.14ns ± 1% -9.82% (p=0.000 n=29+28) NonZeroShifts/3/shrVU-8 6.79ns ± 2% 6.35ns ± 2% -6.46% (p=0.000 n=28+29) NonZeroShifts/3/shlVU-8 6.69ns ± 1% 6.25ns ± 1% -6.60% (p=0.000 n=28+27) NonZeroShifts/4/shrVU-8 7.79ns ± 2% 7.06ns ± 2% -9.48% (p=0.000 n=30+30) NonZeroShifts/4/shlVU-8 7.82ns ± 1% 7.24ns ± 1% -7.37% (p=0.000 n=28+29) NonZeroShifts/5/shrVU-8 8.90ns ± 3% 7.93ns ± 1% -10.84% (p=0.000 n=29+26) NonZeroShifts/5/shlVU-8 8.68ns ± 1% 7.92ns ± 1% -8.76% (p=0.000 n=29+29) NonZeroShifts/10/shrVU-8 14.4ns ± 1% 12.3ns ± 2% -14.79% (p=0.000 n=28+29) NonZeroShifts/10/shlVU-8 14.1ns ± 1% 11.9ns ± 2% -15.55% (p=0.000 n=28+27) NonZeroShifts/100/shrVU-8 118ns ± 1% 96ns ± 3% -18.82% (p=0.000 n=30+29) NonZeroShifts/100/shlVU-8 120ns ± 2% 98ns ± 2% -18.46% (p=0.000 n=29+28) NonZeroShifts/1000/shrVU-8 1.10µs ± 1% 0.88µs ± 2% -19.63% (p=0.000 n=29+30) NonZeroShifts/1000/shlVU-8 1.10µs ± 2% 0.88µs ± 2% -20.28% (p=0.000 n=29+28) NonZeroShifts/10000/shrVU-8 10.9µs ± 1% 8.7µs ± 1% -19.78% (p=0.000 n=28+27) NonZeroShifts/10000/shlVU-8 10.9µs ± 2% 8.7µs ± 1% -19.64% (p=0.000 n=29+27) NonZeroShifts/100000/shrVU-8 111µs ± 2% 90µs ± 2% -19.39% (p=0.000 n=28+29) NonZeroShifts/100000/shlVU-8 113µs ± 2% 90µs ± 2% -20.43% (p=0.000 n=30+27) The assembly version is still faster, unfortunately, but the gap is narrowing. Speedup from pure Go to assembly: name old time/op new time/op delta NonZeroShifts/1/shrVU-8 4.39ns ± 1% 3.45ns ± 2% -21.36% (p=0.000 n=27+29) NonZeroShifts/1/shlVU-8 4.10ns ± 2% 3.47ns ± 3% -15.42% (p=0.000 n=28+30) NonZeroShifts/2/shrVU-8 5.63ns ± 2% 3.97ns ± 0% -29.40% (p=0.000 n=29+25) NonZeroShifts/2/shlVU-8 5.14ns ± 1% 3.77ns ± 2% -26.65% (p=0.000 n=28+26) NonZeroShifts/3/shrVU-8 6.35ns ± 2% 4.79ns ± 2% -24.52% (p=0.000 n=29+29) NonZeroShifts/3/shlVU-8 6.25ns ± 1% 4.42ns ± 1% -29.29% (p=0.000 n=27+26) NonZeroShifts/4/shrVU-8 7.06ns ± 2% 5.64ns ± 1% -20.05% (p=0.000 n=30+29) NonZeroShifts/4/shlVU-8 7.24ns ± 1% 5.34ns ± 2% -26.23% (p=0.000 n=29+29) NonZeroShifts/5/shrVU-8 7.93ns ± 1% 6.56ns ± 2% -17.26% (p=0.000 n=26+30) NonZeroShifts/5/shlVU-8 7.92ns ± 1% 6.27ns ± 1% -20.79% (p=0.000 n=29+25) NonZeroShifts/10/shrVU-8 12.3ns ± 2% 10.2ns ± 2% -17.21% (p=0.000 n=29+29) NonZeroShifts/10/shlVU-8 11.9ns ± 2% 10.5ns ± 2% -12.45% (p=0.000 n=27+29) NonZeroShifts/100/shrVU-8 95.9ns ± 3% 77.7ns ± 1% -19.00% (p=0.000 n=29+30) NonZeroShifts/100/shlVU-8 97.5ns ± 2% 66.8ns ± 2% -31.47% (p=0.000 n=28+30) NonZeroShifts/1000/shrVU-8 884ns ± 2% 705ns ± 1% -20.17% (p=0.000 n=30+28) NonZeroShifts/1000/shlVU-8 880ns ± 2% 590ns ± 1% -32.96% (p=0.000 n=28+25) NonZeroShifts/10000/shrVU-8 8.74µs ± 1% 7.34µs ± 3% -15.94% (p=0.000 n=27+30) NonZeroShifts/10000/shlVU-8 8.73µs ± 1% 6.00µs ± 1% -31.25% (p=0.000 n=27+28) NonZeroShifts/100000/shrVU-8 89.6µs ± 2% 75.5µs ± 2% -15.80% (p=0.000 n=29+29) NonZeroShifts/100000/shlVU-8 89.6µs ± 2% 68.0µs ± 3% -24.09% (p=0.000 n=27+30) Change-Id: I18f58d8f5513d737d9cdf09b8f9d14011ffe3958 Reviewed-on: https://go-review.googlesource.com/c/go/+/297050 Trust: Josh Bleecher Snyder <josharian@gmail.com> Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2021-01-07 19:25:05 -08:00
case ssa.OpAMD64BLSIQ, ssa.OpAMD64BLSIL,
ssa.OpAMD64BLSMSKQ, ssa.OpAMD64BLSMSKL,
ssa.OpAMD64BLSRQ, ssa.OpAMD64BLSRL:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[0].Reg()
p.To.Type = obj.TYPE_REG
switch v.Op {
case ssa.OpAMD64BLSRQ, ssa.OpAMD64BLSRL:
p.To.Reg = v.Reg0()
default:
p.To.Reg = v.Reg()
}
case ssa.OpAMD64ANDNQ, ssa.OpAMD64ANDNL:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[0].Reg()
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
p.AddRestSourceReg(v.Args[1].Reg())
case ssa.OpAMD64SARXL, ssa.OpAMD64SARXQ,
ssa.OpAMD64SHLXL, ssa.OpAMD64SHLXQ,
ssa.OpAMD64SHRXL, ssa.OpAMD64SHRXQ:
p := opregreg(s, v.Op.Asm(), v.Reg(), v.Args[1].Reg())
p.AddRestSourceReg(v.Args[0].Reg())
case ssa.OpAMD64SHLXLload, ssa.OpAMD64SHLXQload,
ssa.OpAMD64SHRXLload, ssa.OpAMD64SHRXQload,
ssa.OpAMD64SARXLload, ssa.OpAMD64SARXQload:
p := opregreg(s, v.Op.Asm(), v.Reg(), v.Args[1].Reg())
m := obj.Addr{Type: obj.TYPE_MEM, Reg: v.Args[0].Reg()}
ssagen.AddAux(&m, v)
p.AddRestSource(m)
case ssa.OpAMD64SHLXLloadidx1, ssa.OpAMD64SHLXLloadidx4, ssa.OpAMD64SHLXLloadidx8,
ssa.OpAMD64SHRXLloadidx1, ssa.OpAMD64SHRXLloadidx4, ssa.OpAMD64SHRXLloadidx8,
ssa.OpAMD64SARXLloadidx1, ssa.OpAMD64SARXLloadidx4, ssa.OpAMD64SARXLloadidx8,
ssa.OpAMD64SHLXQloadidx1, ssa.OpAMD64SHLXQloadidx8,
ssa.OpAMD64SHRXQloadidx1, ssa.OpAMD64SHRXQloadidx8,
ssa.OpAMD64SARXQloadidx1, ssa.OpAMD64SARXQloadidx8:
p := opregreg(s, v.Op.Asm(), v.Reg(), v.Args[2].Reg())
m := obj.Addr{Type: obj.TYPE_MEM}
memIdx(&m, v)
ssagen.AddAux(&m, v)
p.AddRestSource(m)
case ssa.OpAMD64DIVQU, ssa.OpAMD64DIVLU, ssa.OpAMD64DIVWU:
// Arg[0] (the dividend) is in AX.
// Arg[1] (the divisor) can be in any other register.
// Result[0] (the quotient) is in AX.
// Result[1] (the remainder) is in DX.
r := v.Args[1].Reg()
// Zero extend dividend.
opregreg(s, x86.AXORL, x86.REG_DX, x86.REG_DX)
// Issue divide.
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = r
case ssa.OpAMD64DIVQ, ssa.OpAMD64DIVL, ssa.OpAMD64DIVW:
// Arg[0] (the dividend) is in AX.
// Arg[1] (the divisor) can be in any other register.
// Result[0] (the quotient) is in AX.
// Result[1] (the remainder) is in DX.
r := v.Args[1].Reg()
var opCMP, opNEG, opSXD obj.As
switch v.Op {
case ssa.OpAMD64DIVQ:
opCMP, opNEG, opSXD = x86.ACMPQ, x86.ANEGQ, x86.ACQO
case ssa.OpAMD64DIVL:
opCMP, opNEG, opSXD = x86.ACMPL, x86.ANEGL, x86.ACDQ
case ssa.OpAMD64DIVW:
opCMP, opNEG, opSXD = x86.ACMPW, x86.ANEGW, x86.ACWD
}
// CPU faults upon signed overflow, which occurs when the most
// negative int is divided by -1. Handle divide by -1 as a special case.
var j1, j2 *obj.Prog
if ssa.DivisionNeedsFixUp(v) {
c := s.Prog(opCMP)
c.From.Type = obj.TYPE_REG
c.From.Reg = r
c.To.Type = obj.TYPE_CONST
c.To.Offset = -1
// Divisor is not -1, proceed with normal division.
j1 = s.Prog(x86.AJNE)
j1.To.Type = obj.TYPE_BRANCH
// Divisor is -1, manually compute quotient and remainder via fixup code.
// n / -1 = -n
n1 := s.Prog(opNEG)
n1.To.Type = obj.TYPE_REG
n1.To.Reg = x86.REG_AX
// n % -1 == 0
opregreg(s, x86.AXORL, x86.REG_DX, x86.REG_DX)
// TODO(khr): issue only the -1 fixup code we need.
// For instance, if only the quotient is used, no point in zeroing the remainder.
// Skip over normal division.
j2 = s.Prog(obj.AJMP)
j2.To.Type = obj.TYPE_BRANCH
}
// Sign extend dividend and perform division.
p := s.Prog(opSXD)
if j1 != nil {
j1.To.SetTarget(p)
}
p = s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = r
if j2 != nil {
j2.To.SetTarget(s.Pc())
}
case ssa.OpAMD64HMULQ, ssa.OpAMD64HMULL, ssa.OpAMD64HMULQU, ssa.OpAMD64HMULLU:
// the frontend rewrites constant division by 8/16/32 bit integers into
// HMUL by a constant
// SSA rewrites generate the 64 bit versions
// Arg[0] is already in AX as it's the only register we allow
// and DX is the only output we care about (the high bits)
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[1].Reg()
// IMULB puts the high portion in AH instead of DL,
// so move it to DL for consistency
if v.Type.Size() == 1 {
m := s.Prog(x86.AMOVB)
m.From.Type = obj.TYPE_REG
m.From.Reg = x86.REG_AH
m.To.Type = obj.TYPE_REG
m.To.Reg = x86.REG_DX
}
case ssa.OpAMD64MULQU, ssa.OpAMD64MULLU:
// Arg[0] is already in AX as it's the only register we allow
// results lo in AX
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[1].Reg()
case ssa.OpAMD64MULQU2:
// Arg[0] is already in AX as it's the only register we allow
// results hi in DX, lo in AX
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[1].Reg()
case ssa.OpAMD64DIVQU2:
// Arg[0], Arg[1] are already in Dx, AX, as they're the only registers we allow
// results q in AX, r in DX
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[2].Reg()
case ssa.OpAMD64AVGQU:
// compute (x+y)/2 unsigned.
// Do a 64-bit add, the overflow goes into the carry.
// Shift right once and pull the carry back into the 63rd bit.
p := s.Prog(x86.AADDQ)
p.From.Type = obj.TYPE_REG
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
p.From.Reg = v.Args[1].Reg()
p = s.Prog(x86.ARCRQ)
p.From.Type = obj.TYPE_CONST
p.From.Offset = 1
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpAMD64ADDQcarry, ssa.OpAMD64ADCQ:
r := v.Reg0()
r0 := v.Args[0].Reg()
r1 := v.Args[1].Reg()
switch r {
case r0:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = r1
p.To.Type = obj.TYPE_REG
p.To.Reg = r
case r1:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = r0
p.To.Type = obj.TYPE_REG
p.To.Reg = r
default:
v.Fatalf("output not in same register as an input %s", v.LongString())
}
case ssa.OpAMD64SUBQborrow, ssa.OpAMD64SBBQ:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[1].Reg()
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg0()
case ssa.OpAMD64ADDQconstcarry, ssa.OpAMD64ADCQconst, ssa.OpAMD64SUBQconstborrow, ssa.OpAMD64SBBQconst:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_CONST
p.From.Offset = v.AuxInt
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg0()
case ssa.OpAMD64ADDQconst, ssa.OpAMD64ADDLconst:
r := v.Reg()
a := v.Args[0].Reg()
if r == a {
switch v.AuxInt {
case 1:
var asm obj.As
// Software optimization manual recommends add $1,reg.
// But inc/dec is 1 byte smaller. ICC always uses inc
// Clang/GCC choose depending on flags, but prefer add.
// Experiments show that inc/dec is both a little faster
// and make a binary a little smaller.
if v.Op == ssa.OpAMD64ADDQconst {
asm = x86.AINCQ
} else {
asm = x86.AINCL
}
p := s.Prog(asm)
p.To.Type = obj.TYPE_REG
p.To.Reg = r
return
case -1:
var asm obj.As
if v.Op == ssa.OpAMD64ADDQconst {
asm = x86.ADECQ
} else {
asm = x86.ADECL
}
p := s.Prog(asm)
p.To.Type = obj.TYPE_REG
p.To.Reg = r
return
case 0x80:
// 'SUBQ $-0x80, r' is shorter to encode than
// and functionally equivalent to 'ADDQ $0x80, r'.
asm := x86.ASUBL
if v.Op == ssa.OpAMD64ADDQconst {
asm = x86.ASUBQ
}
p := s.Prog(asm)
p.From.Type = obj.TYPE_CONST
p.From.Offset = -0x80
p.To.Type = obj.TYPE_REG
p.To.Reg = r
return
}
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_CONST
p.From.Offset = v.AuxInt
p.To.Type = obj.TYPE_REG
p.To.Reg = r
return
}
var asm obj.As
if v.Op == ssa.OpAMD64ADDQconst {
asm = x86.ALEAQ
} else {
asm = x86.ALEAL
}
p := s.Prog(asm)
p.From.Type = obj.TYPE_MEM
p.From.Reg = a
p.From.Offset = v.AuxInt
p.To.Type = obj.TYPE_REG
p.To.Reg = r
cmd/compile: implement CMOV on amd64 This builds upon the branchelim pass, activating it for amd64 and lowering CondSelect. Special care is made to FPU instructions for NaN handling. Benchmark results on Xeon E5630 (Westmere EP): name old time/op new time/op delta BinaryTree17-16 4.99s ± 9% 4.66s ± 2% ~ (p=0.095 n=5+5) Fannkuch11-16 4.93s ± 3% 5.04s ± 2% ~ (p=0.548 n=5+5) FmtFprintfEmpty-16 58.8ns ± 7% 61.4ns ±14% ~ (p=0.579 n=5+5) FmtFprintfString-16 114ns ± 2% 114ns ± 4% ~ (p=0.603 n=5+5) FmtFprintfInt-16 181ns ± 4% 125ns ± 3% -30.90% (p=0.008 n=5+5) FmtFprintfIntInt-16 263ns ± 2% 217ns ± 2% -17.34% (p=0.008 n=5+5) FmtFprintfPrefixedInt-16 230ns ± 1% 212ns ± 1% -7.99% (p=0.008 n=5+5) FmtFprintfFloat-16 411ns ± 3% 344ns ± 5% -16.43% (p=0.008 n=5+5) FmtManyArgs-16 828ns ± 4% 790ns ± 2% -4.59% (p=0.032 n=5+5) GobDecode-16 10.9ms ± 4% 10.8ms ± 5% ~ (p=0.548 n=5+5) GobEncode-16 9.52ms ± 5% 9.46ms ± 2% ~ (p=1.000 n=5+5) Gzip-16 334ms ± 2% 337ms ± 2% ~ (p=0.548 n=5+5) Gunzip-16 64.4ms ± 1% 65.0ms ± 1% +1.00% (p=0.008 n=5+5) HTTPClientServer-16 156µs ± 3% 155µs ± 3% ~ (p=0.690 n=5+5) JSONEncode-16 21.0ms ± 1% 21.8ms ± 0% +3.76% (p=0.016 n=5+4) JSONDecode-16 95.1ms ± 0% 95.7ms ± 1% ~ (p=0.151 n=5+5) Mandelbrot200-16 6.38ms ± 1% 6.42ms ± 1% ~ (p=0.095 n=5+5) GoParse-16 5.47ms ± 2% 5.36ms ± 1% -1.95% (p=0.016 n=5+5) RegexpMatchEasy0_32-16 111ns ± 1% 111ns ± 1% ~ (p=0.635 n=5+4) RegexpMatchEasy0_1K-16 408ns ± 1% 411ns ± 2% ~ (p=0.087 n=5+5) RegexpMatchEasy1_32-16 103ns ± 1% 104ns ± 1% ~ (p=0.484 n=5+5) RegexpMatchEasy1_1K-16 659ns ± 2% 652ns ± 1% ~ (p=0.571 n=5+5) RegexpMatchMedium_32-16 176ns ± 2% 174ns ± 1% ~ (p=0.476 n=5+5) RegexpMatchMedium_1K-16 58.6µs ± 4% 57.7µs ± 4% ~ (p=0.548 n=5+5) RegexpMatchHard_32-16 3.07µs ± 3% 3.04µs ± 4% ~ (p=0.421 n=5+5) RegexpMatchHard_1K-16 89.2µs ± 1% 87.9µs ± 2% -1.52% (p=0.032 n=5+5) Revcomp-16 575ms ± 0% 587ms ± 2% +2.12% (p=0.032 n=4+5) Template-16 110ms ± 1% 107ms ± 3% -3.00% (p=0.032 n=5+5) TimeParse-16 463ns ± 0% 462ns ± 0% ~ (p=0.810 n=5+4) TimeFormat-16 538ns ± 0% 535ns ± 0% -0.63% (p=0.024 n=5+5) name old speed new speed delta GobDecode-16 70.7MB/s ± 4% 71.4MB/s ± 5% ~ (p=0.452 n=5+5) GobEncode-16 80.7MB/s ± 5% 81.2MB/s ± 2% ~ (p=1.000 n=5+5) Gzip-16 58.2MB/s ± 2% 57.7MB/s ± 2% ~ (p=0.452 n=5+5) Gunzip-16 302MB/s ± 1% 299MB/s ± 1% -0.99% (p=0.008 n=5+5) JSONEncode-16 92.4MB/s ± 1% 89.1MB/s ± 0% -3.63% (p=0.016 n=5+4) JSONDecode-16 20.4MB/s ± 0% 20.3MB/s ± 1% ~ (p=0.135 n=5+5) GoParse-16 10.6MB/s ± 2% 10.8MB/s ± 1% +2.00% (p=0.016 n=5+5) RegexpMatchEasy0_32-16 286MB/s ± 1% 285MB/s ± 3% ~ (p=1.000 n=5+5) RegexpMatchEasy0_1K-16 2.51GB/s ± 1% 2.49GB/s ± 2% ~ (p=0.095 n=5+5) RegexpMatchEasy1_32-16 309MB/s ± 1% 307MB/s ± 1% ~ (p=0.548 n=5+5) RegexpMatchEasy1_1K-16 1.55GB/s ± 2% 1.57GB/s ± 1% ~ (p=0.690 n=5+5) RegexpMatchMedium_32-16 5.68MB/s ± 2% 5.73MB/s ± 1% ~ (p=0.579 n=5+5) RegexpMatchMedium_1K-16 17.5MB/s ± 4% 17.8MB/s ± 4% ~ (p=0.500 n=5+5) RegexpMatchHard_32-16 10.4MB/s ± 3% 10.5MB/s ± 4% ~ (p=0.460 n=5+5) RegexpMatchHard_1K-16 11.5MB/s ± 1% 11.7MB/s ± 2% +1.57% (p=0.032 n=5+5) Revcomp-16 442MB/s ± 0% 433MB/s ± 2% -2.05% (p=0.032 n=4+5) Template-16 17.7MB/s ± 1% 18.2MB/s ± 3% +3.12% (p=0.032 n=5+5) Change-Id: I6972e8f35f2b31f9a42ac473a6bf419a18022558 Reviewed-on: https://go-review.googlesource.com/100935 Run-TryBot: Giovanni Bajo <rasky@develer.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2018-03-05 20:59:40 +01:00
case ssa.OpAMD64CMOVQEQ, ssa.OpAMD64CMOVLEQ, ssa.OpAMD64CMOVWEQ,
ssa.OpAMD64CMOVQLT, ssa.OpAMD64CMOVLLT, ssa.OpAMD64CMOVWLT,
ssa.OpAMD64CMOVQNE, ssa.OpAMD64CMOVLNE, ssa.OpAMD64CMOVWNE,
ssa.OpAMD64CMOVQGT, ssa.OpAMD64CMOVLGT, ssa.OpAMD64CMOVWGT,
ssa.OpAMD64CMOVQLE, ssa.OpAMD64CMOVLLE, ssa.OpAMD64CMOVWLE,
ssa.OpAMD64CMOVQGE, ssa.OpAMD64CMOVLGE, ssa.OpAMD64CMOVWGE,
ssa.OpAMD64CMOVQHI, ssa.OpAMD64CMOVLHI, ssa.OpAMD64CMOVWHI,
ssa.OpAMD64CMOVQLS, ssa.OpAMD64CMOVLLS, ssa.OpAMD64CMOVWLS,
ssa.OpAMD64CMOVQCC, ssa.OpAMD64CMOVLCC, ssa.OpAMD64CMOVWCC,
ssa.OpAMD64CMOVQCS, ssa.OpAMD64CMOVLCS, ssa.OpAMD64CMOVWCS,
ssa.OpAMD64CMOVQGTF, ssa.OpAMD64CMOVLGTF, ssa.OpAMD64CMOVWGTF,
ssa.OpAMD64CMOVQGEF, ssa.OpAMD64CMOVLGEF, ssa.OpAMD64CMOVWGEF:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[1].Reg()
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
cmd/compile: implement CMOV on amd64 This builds upon the branchelim pass, activating it for amd64 and lowering CondSelect. Special care is made to FPU instructions for NaN handling. Benchmark results on Xeon E5630 (Westmere EP): name old time/op new time/op delta BinaryTree17-16 4.99s ± 9% 4.66s ± 2% ~ (p=0.095 n=5+5) Fannkuch11-16 4.93s ± 3% 5.04s ± 2% ~ (p=0.548 n=5+5) FmtFprintfEmpty-16 58.8ns ± 7% 61.4ns ±14% ~ (p=0.579 n=5+5) FmtFprintfString-16 114ns ± 2% 114ns ± 4% ~ (p=0.603 n=5+5) FmtFprintfInt-16 181ns ± 4% 125ns ± 3% -30.90% (p=0.008 n=5+5) FmtFprintfIntInt-16 263ns ± 2% 217ns ± 2% -17.34% (p=0.008 n=5+5) FmtFprintfPrefixedInt-16 230ns ± 1% 212ns ± 1% -7.99% (p=0.008 n=5+5) FmtFprintfFloat-16 411ns ± 3% 344ns ± 5% -16.43% (p=0.008 n=5+5) FmtManyArgs-16 828ns ± 4% 790ns ± 2% -4.59% (p=0.032 n=5+5) GobDecode-16 10.9ms ± 4% 10.8ms ± 5% ~ (p=0.548 n=5+5) GobEncode-16 9.52ms ± 5% 9.46ms ± 2% ~ (p=1.000 n=5+5) Gzip-16 334ms ± 2% 337ms ± 2% ~ (p=0.548 n=5+5) Gunzip-16 64.4ms ± 1% 65.0ms ± 1% +1.00% (p=0.008 n=5+5) HTTPClientServer-16 156µs ± 3% 155µs ± 3% ~ (p=0.690 n=5+5) JSONEncode-16 21.0ms ± 1% 21.8ms ± 0% +3.76% (p=0.016 n=5+4) JSONDecode-16 95.1ms ± 0% 95.7ms ± 1% ~ (p=0.151 n=5+5) Mandelbrot200-16 6.38ms ± 1% 6.42ms ± 1% ~ (p=0.095 n=5+5) GoParse-16 5.47ms ± 2% 5.36ms ± 1% -1.95% (p=0.016 n=5+5) RegexpMatchEasy0_32-16 111ns ± 1% 111ns ± 1% ~ (p=0.635 n=5+4) RegexpMatchEasy0_1K-16 408ns ± 1% 411ns ± 2% ~ (p=0.087 n=5+5) RegexpMatchEasy1_32-16 103ns ± 1% 104ns ± 1% ~ (p=0.484 n=5+5) RegexpMatchEasy1_1K-16 659ns ± 2% 652ns ± 1% ~ (p=0.571 n=5+5) RegexpMatchMedium_32-16 176ns ± 2% 174ns ± 1% ~ (p=0.476 n=5+5) RegexpMatchMedium_1K-16 58.6µs ± 4% 57.7µs ± 4% ~ (p=0.548 n=5+5) RegexpMatchHard_32-16 3.07µs ± 3% 3.04µs ± 4% ~ (p=0.421 n=5+5) RegexpMatchHard_1K-16 89.2µs ± 1% 87.9µs ± 2% -1.52% (p=0.032 n=5+5) Revcomp-16 575ms ± 0% 587ms ± 2% +2.12% (p=0.032 n=4+5) Template-16 110ms ± 1% 107ms ± 3% -3.00% (p=0.032 n=5+5) TimeParse-16 463ns ± 0% 462ns ± 0% ~ (p=0.810 n=5+4) TimeFormat-16 538ns ± 0% 535ns ± 0% -0.63% (p=0.024 n=5+5) name old speed new speed delta GobDecode-16 70.7MB/s ± 4% 71.4MB/s ± 5% ~ (p=0.452 n=5+5) GobEncode-16 80.7MB/s ± 5% 81.2MB/s ± 2% ~ (p=1.000 n=5+5) Gzip-16 58.2MB/s ± 2% 57.7MB/s ± 2% ~ (p=0.452 n=5+5) Gunzip-16 302MB/s ± 1% 299MB/s ± 1% -0.99% (p=0.008 n=5+5) JSONEncode-16 92.4MB/s ± 1% 89.1MB/s ± 0% -3.63% (p=0.016 n=5+4) JSONDecode-16 20.4MB/s ± 0% 20.3MB/s ± 1% ~ (p=0.135 n=5+5) GoParse-16 10.6MB/s ± 2% 10.8MB/s ± 1% +2.00% (p=0.016 n=5+5) RegexpMatchEasy0_32-16 286MB/s ± 1% 285MB/s ± 3% ~ (p=1.000 n=5+5) RegexpMatchEasy0_1K-16 2.51GB/s ± 1% 2.49GB/s ± 2% ~ (p=0.095 n=5+5) RegexpMatchEasy1_32-16 309MB/s ± 1% 307MB/s ± 1% ~ (p=0.548 n=5+5) RegexpMatchEasy1_1K-16 1.55GB/s ± 2% 1.57GB/s ± 1% ~ (p=0.690 n=5+5) RegexpMatchMedium_32-16 5.68MB/s ± 2% 5.73MB/s ± 1% ~ (p=0.579 n=5+5) RegexpMatchMedium_1K-16 17.5MB/s ± 4% 17.8MB/s ± 4% ~ (p=0.500 n=5+5) RegexpMatchHard_32-16 10.4MB/s ± 3% 10.5MB/s ± 4% ~ (p=0.460 n=5+5) RegexpMatchHard_1K-16 11.5MB/s ± 1% 11.7MB/s ± 2% +1.57% (p=0.032 n=5+5) Revcomp-16 442MB/s ± 0% 433MB/s ± 2% -2.05% (p=0.032 n=4+5) Template-16 17.7MB/s ± 1% 18.2MB/s ± 3% +3.12% (p=0.032 n=5+5) Change-Id: I6972e8f35f2b31f9a42ac473a6bf419a18022558 Reviewed-on: https://go-review.googlesource.com/100935 Run-TryBot: Giovanni Bajo <rasky@develer.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2018-03-05 20:59:40 +01:00
case ssa.OpAMD64CMOVQNEF, ssa.OpAMD64CMOVLNEF, ssa.OpAMD64CMOVWNEF:
// Flag condition: ^ZERO || PARITY
// Generate:
// CMOV*NE SRC,DST
// CMOV*PS SRC,DST
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[1].Reg()
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
cmd/compile: implement CMOV on amd64 This builds upon the branchelim pass, activating it for amd64 and lowering CondSelect. Special care is made to FPU instructions for NaN handling. Benchmark results on Xeon E5630 (Westmere EP): name old time/op new time/op delta BinaryTree17-16 4.99s ± 9% 4.66s ± 2% ~ (p=0.095 n=5+5) Fannkuch11-16 4.93s ± 3% 5.04s ± 2% ~ (p=0.548 n=5+5) FmtFprintfEmpty-16 58.8ns ± 7% 61.4ns ±14% ~ (p=0.579 n=5+5) FmtFprintfString-16 114ns ± 2% 114ns ± 4% ~ (p=0.603 n=5+5) FmtFprintfInt-16 181ns ± 4% 125ns ± 3% -30.90% (p=0.008 n=5+5) FmtFprintfIntInt-16 263ns ± 2% 217ns ± 2% -17.34% (p=0.008 n=5+5) FmtFprintfPrefixedInt-16 230ns ± 1% 212ns ± 1% -7.99% (p=0.008 n=5+5) FmtFprintfFloat-16 411ns ± 3% 344ns ± 5% -16.43% (p=0.008 n=5+5) FmtManyArgs-16 828ns ± 4% 790ns ± 2% -4.59% (p=0.032 n=5+5) GobDecode-16 10.9ms ± 4% 10.8ms ± 5% ~ (p=0.548 n=5+5) GobEncode-16 9.52ms ± 5% 9.46ms ± 2% ~ (p=1.000 n=5+5) Gzip-16 334ms ± 2% 337ms ± 2% ~ (p=0.548 n=5+5) Gunzip-16 64.4ms ± 1% 65.0ms ± 1% +1.00% (p=0.008 n=5+5) HTTPClientServer-16 156µs ± 3% 155µs ± 3% ~ (p=0.690 n=5+5) JSONEncode-16 21.0ms ± 1% 21.8ms ± 0% +3.76% (p=0.016 n=5+4) JSONDecode-16 95.1ms ± 0% 95.7ms ± 1% ~ (p=0.151 n=5+5) Mandelbrot200-16 6.38ms ± 1% 6.42ms ± 1% ~ (p=0.095 n=5+5) GoParse-16 5.47ms ± 2% 5.36ms ± 1% -1.95% (p=0.016 n=5+5) RegexpMatchEasy0_32-16 111ns ± 1% 111ns ± 1% ~ (p=0.635 n=5+4) RegexpMatchEasy0_1K-16 408ns ± 1% 411ns ± 2% ~ (p=0.087 n=5+5) RegexpMatchEasy1_32-16 103ns ± 1% 104ns ± 1% ~ (p=0.484 n=5+5) RegexpMatchEasy1_1K-16 659ns ± 2% 652ns ± 1% ~ (p=0.571 n=5+5) RegexpMatchMedium_32-16 176ns ± 2% 174ns ± 1% ~ (p=0.476 n=5+5) RegexpMatchMedium_1K-16 58.6µs ± 4% 57.7µs ± 4% ~ (p=0.548 n=5+5) RegexpMatchHard_32-16 3.07µs ± 3% 3.04µs ± 4% ~ (p=0.421 n=5+5) RegexpMatchHard_1K-16 89.2µs ± 1% 87.9µs ± 2% -1.52% (p=0.032 n=5+5) Revcomp-16 575ms ± 0% 587ms ± 2% +2.12% (p=0.032 n=4+5) Template-16 110ms ± 1% 107ms ± 3% -3.00% (p=0.032 n=5+5) TimeParse-16 463ns ± 0% 462ns ± 0% ~ (p=0.810 n=5+4) TimeFormat-16 538ns ± 0% 535ns ± 0% -0.63% (p=0.024 n=5+5) name old speed new speed delta GobDecode-16 70.7MB/s ± 4% 71.4MB/s ± 5% ~ (p=0.452 n=5+5) GobEncode-16 80.7MB/s ± 5% 81.2MB/s ± 2% ~ (p=1.000 n=5+5) Gzip-16 58.2MB/s ± 2% 57.7MB/s ± 2% ~ (p=0.452 n=5+5) Gunzip-16 302MB/s ± 1% 299MB/s ± 1% -0.99% (p=0.008 n=5+5) JSONEncode-16 92.4MB/s ± 1% 89.1MB/s ± 0% -3.63% (p=0.016 n=5+4) JSONDecode-16 20.4MB/s ± 0% 20.3MB/s ± 1% ~ (p=0.135 n=5+5) GoParse-16 10.6MB/s ± 2% 10.8MB/s ± 1% +2.00% (p=0.016 n=5+5) RegexpMatchEasy0_32-16 286MB/s ± 1% 285MB/s ± 3% ~ (p=1.000 n=5+5) RegexpMatchEasy0_1K-16 2.51GB/s ± 1% 2.49GB/s ± 2% ~ (p=0.095 n=5+5) RegexpMatchEasy1_32-16 309MB/s ± 1% 307MB/s ± 1% ~ (p=0.548 n=5+5) RegexpMatchEasy1_1K-16 1.55GB/s ± 2% 1.57GB/s ± 1% ~ (p=0.690 n=5+5) RegexpMatchMedium_32-16 5.68MB/s ± 2% 5.73MB/s ± 1% ~ (p=0.579 n=5+5) RegexpMatchMedium_1K-16 17.5MB/s ± 4% 17.8MB/s ± 4% ~ (p=0.500 n=5+5) RegexpMatchHard_32-16 10.4MB/s ± 3% 10.5MB/s ± 4% ~ (p=0.460 n=5+5) RegexpMatchHard_1K-16 11.5MB/s ± 1% 11.7MB/s ± 2% +1.57% (p=0.032 n=5+5) Revcomp-16 442MB/s ± 0% 433MB/s ± 2% -2.05% (p=0.032 n=4+5) Template-16 17.7MB/s ± 1% 18.2MB/s ± 3% +3.12% (p=0.032 n=5+5) Change-Id: I6972e8f35f2b31f9a42ac473a6bf419a18022558 Reviewed-on: https://go-review.googlesource.com/100935 Run-TryBot: Giovanni Bajo <rasky@develer.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2018-03-05 20:59:40 +01:00
var q *obj.Prog
if v.Op == ssa.OpAMD64CMOVQNEF {
q = s.Prog(x86.ACMOVQPS)
} else if v.Op == ssa.OpAMD64CMOVLNEF {
q = s.Prog(x86.ACMOVLPS)
} else {
q = s.Prog(x86.ACMOVWPS)
}
q.From.Type = obj.TYPE_REG
q.From.Reg = v.Args[1].Reg()
q.To.Type = obj.TYPE_REG
q.To.Reg = v.Reg()
cmd/compile: implement CMOV on amd64 This builds upon the branchelim pass, activating it for amd64 and lowering CondSelect. Special care is made to FPU instructions for NaN handling. Benchmark results on Xeon E5630 (Westmere EP): name old time/op new time/op delta BinaryTree17-16 4.99s ± 9% 4.66s ± 2% ~ (p=0.095 n=5+5) Fannkuch11-16 4.93s ± 3% 5.04s ± 2% ~ (p=0.548 n=5+5) FmtFprintfEmpty-16 58.8ns ± 7% 61.4ns ±14% ~ (p=0.579 n=5+5) FmtFprintfString-16 114ns ± 2% 114ns ± 4% ~ (p=0.603 n=5+5) FmtFprintfInt-16 181ns ± 4% 125ns ± 3% -30.90% (p=0.008 n=5+5) FmtFprintfIntInt-16 263ns ± 2% 217ns ± 2% -17.34% (p=0.008 n=5+5) FmtFprintfPrefixedInt-16 230ns ± 1% 212ns ± 1% -7.99% (p=0.008 n=5+5) FmtFprintfFloat-16 411ns ± 3% 344ns ± 5% -16.43% (p=0.008 n=5+5) FmtManyArgs-16 828ns ± 4% 790ns ± 2% -4.59% (p=0.032 n=5+5) GobDecode-16 10.9ms ± 4% 10.8ms ± 5% ~ (p=0.548 n=5+5) GobEncode-16 9.52ms ± 5% 9.46ms ± 2% ~ (p=1.000 n=5+5) Gzip-16 334ms ± 2% 337ms ± 2% ~ (p=0.548 n=5+5) Gunzip-16 64.4ms ± 1% 65.0ms ± 1% +1.00% (p=0.008 n=5+5) HTTPClientServer-16 156µs ± 3% 155µs ± 3% ~ (p=0.690 n=5+5) JSONEncode-16 21.0ms ± 1% 21.8ms ± 0% +3.76% (p=0.016 n=5+4) JSONDecode-16 95.1ms ± 0% 95.7ms ± 1% ~ (p=0.151 n=5+5) Mandelbrot200-16 6.38ms ± 1% 6.42ms ± 1% ~ (p=0.095 n=5+5) GoParse-16 5.47ms ± 2% 5.36ms ± 1% -1.95% (p=0.016 n=5+5) RegexpMatchEasy0_32-16 111ns ± 1% 111ns ± 1% ~ (p=0.635 n=5+4) RegexpMatchEasy0_1K-16 408ns ± 1% 411ns ± 2% ~ (p=0.087 n=5+5) RegexpMatchEasy1_32-16 103ns ± 1% 104ns ± 1% ~ (p=0.484 n=5+5) RegexpMatchEasy1_1K-16 659ns ± 2% 652ns ± 1% ~ (p=0.571 n=5+5) RegexpMatchMedium_32-16 176ns ± 2% 174ns ± 1% ~ (p=0.476 n=5+5) RegexpMatchMedium_1K-16 58.6µs ± 4% 57.7µs ± 4% ~ (p=0.548 n=5+5) RegexpMatchHard_32-16 3.07µs ± 3% 3.04µs ± 4% ~ (p=0.421 n=5+5) RegexpMatchHard_1K-16 89.2µs ± 1% 87.9µs ± 2% -1.52% (p=0.032 n=5+5) Revcomp-16 575ms ± 0% 587ms ± 2% +2.12% (p=0.032 n=4+5) Template-16 110ms ± 1% 107ms ± 3% -3.00% (p=0.032 n=5+5) TimeParse-16 463ns ± 0% 462ns ± 0% ~ (p=0.810 n=5+4) TimeFormat-16 538ns ± 0% 535ns ± 0% -0.63% (p=0.024 n=5+5) name old speed new speed delta GobDecode-16 70.7MB/s ± 4% 71.4MB/s ± 5% ~ (p=0.452 n=5+5) GobEncode-16 80.7MB/s ± 5% 81.2MB/s ± 2% ~ (p=1.000 n=5+5) Gzip-16 58.2MB/s ± 2% 57.7MB/s ± 2% ~ (p=0.452 n=5+5) Gunzip-16 302MB/s ± 1% 299MB/s ± 1% -0.99% (p=0.008 n=5+5) JSONEncode-16 92.4MB/s ± 1% 89.1MB/s ± 0% -3.63% (p=0.016 n=5+4) JSONDecode-16 20.4MB/s ± 0% 20.3MB/s ± 1% ~ (p=0.135 n=5+5) GoParse-16 10.6MB/s ± 2% 10.8MB/s ± 1% +2.00% (p=0.016 n=5+5) RegexpMatchEasy0_32-16 286MB/s ± 1% 285MB/s ± 3% ~ (p=1.000 n=5+5) RegexpMatchEasy0_1K-16 2.51GB/s ± 1% 2.49GB/s ± 2% ~ (p=0.095 n=5+5) RegexpMatchEasy1_32-16 309MB/s ± 1% 307MB/s ± 1% ~ (p=0.548 n=5+5) RegexpMatchEasy1_1K-16 1.55GB/s ± 2% 1.57GB/s ± 1% ~ (p=0.690 n=5+5) RegexpMatchMedium_32-16 5.68MB/s ± 2% 5.73MB/s ± 1% ~ (p=0.579 n=5+5) RegexpMatchMedium_1K-16 17.5MB/s ± 4% 17.8MB/s ± 4% ~ (p=0.500 n=5+5) RegexpMatchHard_32-16 10.4MB/s ± 3% 10.5MB/s ± 4% ~ (p=0.460 n=5+5) RegexpMatchHard_1K-16 11.5MB/s ± 1% 11.7MB/s ± 2% +1.57% (p=0.032 n=5+5) Revcomp-16 442MB/s ± 0% 433MB/s ± 2% -2.05% (p=0.032 n=4+5) Template-16 17.7MB/s ± 1% 18.2MB/s ± 3% +3.12% (p=0.032 n=5+5) Change-Id: I6972e8f35f2b31f9a42ac473a6bf419a18022558 Reviewed-on: https://go-review.googlesource.com/100935 Run-TryBot: Giovanni Bajo <rasky@develer.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2018-03-05 20:59:40 +01:00
case ssa.OpAMD64CMOVQEQF, ssa.OpAMD64CMOVLEQF, ssa.OpAMD64CMOVWEQF:
// Flag condition: ZERO && !PARITY
// Generate:
// MOV SRC,TMP
// CMOV*NE DST,TMP
// CMOV*PC TMP,DST
cmd/compile: implement CMOV on amd64 This builds upon the branchelim pass, activating it for amd64 and lowering CondSelect. Special care is made to FPU instructions for NaN handling. Benchmark results on Xeon E5630 (Westmere EP): name old time/op new time/op delta BinaryTree17-16 4.99s ± 9% 4.66s ± 2% ~ (p=0.095 n=5+5) Fannkuch11-16 4.93s ± 3% 5.04s ± 2% ~ (p=0.548 n=5+5) FmtFprintfEmpty-16 58.8ns ± 7% 61.4ns ±14% ~ (p=0.579 n=5+5) FmtFprintfString-16 114ns ± 2% 114ns ± 4% ~ (p=0.603 n=5+5) FmtFprintfInt-16 181ns ± 4% 125ns ± 3% -30.90% (p=0.008 n=5+5) FmtFprintfIntInt-16 263ns ± 2% 217ns ± 2% -17.34% (p=0.008 n=5+5) FmtFprintfPrefixedInt-16 230ns ± 1% 212ns ± 1% -7.99% (p=0.008 n=5+5) FmtFprintfFloat-16 411ns ± 3% 344ns ± 5% -16.43% (p=0.008 n=5+5) FmtManyArgs-16 828ns ± 4% 790ns ± 2% -4.59% (p=0.032 n=5+5) GobDecode-16 10.9ms ± 4% 10.8ms ± 5% ~ (p=0.548 n=5+5) GobEncode-16 9.52ms ± 5% 9.46ms ± 2% ~ (p=1.000 n=5+5) Gzip-16 334ms ± 2% 337ms ± 2% ~ (p=0.548 n=5+5) Gunzip-16 64.4ms ± 1% 65.0ms ± 1% +1.00% (p=0.008 n=5+5) HTTPClientServer-16 156µs ± 3% 155µs ± 3% ~ (p=0.690 n=5+5) JSONEncode-16 21.0ms ± 1% 21.8ms ± 0% +3.76% (p=0.016 n=5+4) JSONDecode-16 95.1ms ± 0% 95.7ms ± 1% ~ (p=0.151 n=5+5) Mandelbrot200-16 6.38ms ± 1% 6.42ms ± 1% ~ (p=0.095 n=5+5) GoParse-16 5.47ms ± 2% 5.36ms ± 1% -1.95% (p=0.016 n=5+5) RegexpMatchEasy0_32-16 111ns ± 1% 111ns ± 1% ~ (p=0.635 n=5+4) RegexpMatchEasy0_1K-16 408ns ± 1% 411ns ± 2% ~ (p=0.087 n=5+5) RegexpMatchEasy1_32-16 103ns ± 1% 104ns ± 1% ~ (p=0.484 n=5+5) RegexpMatchEasy1_1K-16 659ns ± 2% 652ns ± 1% ~ (p=0.571 n=5+5) RegexpMatchMedium_32-16 176ns ± 2% 174ns ± 1% ~ (p=0.476 n=5+5) RegexpMatchMedium_1K-16 58.6µs ± 4% 57.7µs ± 4% ~ (p=0.548 n=5+5) RegexpMatchHard_32-16 3.07µs ± 3% 3.04µs ± 4% ~ (p=0.421 n=5+5) RegexpMatchHard_1K-16 89.2µs ± 1% 87.9µs ± 2% -1.52% (p=0.032 n=5+5) Revcomp-16 575ms ± 0% 587ms ± 2% +2.12% (p=0.032 n=4+5) Template-16 110ms ± 1% 107ms ± 3% -3.00% (p=0.032 n=5+5) TimeParse-16 463ns ± 0% 462ns ± 0% ~ (p=0.810 n=5+4) TimeFormat-16 538ns ± 0% 535ns ± 0% -0.63% (p=0.024 n=5+5) name old speed new speed delta GobDecode-16 70.7MB/s ± 4% 71.4MB/s ± 5% ~ (p=0.452 n=5+5) GobEncode-16 80.7MB/s ± 5% 81.2MB/s ± 2% ~ (p=1.000 n=5+5) Gzip-16 58.2MB/s ± 2% 57.7MB/s ± 2% ~ (p=0.452 n=5+5) Gunzip-16 302MB/s ± 1% 299MB/s ± 1% -0.99% (p=0.008 n=5+5) JSONEncode-16 92.4MB/s ± 1% 89.1MB/s ± 0% -3.63% (p=0.016 n=5+4) JSONDecode-16 20.4MB/s ± 0% 20.3MB/s ± 1% ~ (p=0.135 n=5+5) GoParse-16 10.6MB/s ± 2% 10.8MB/s ± 1% +2.00% (p=0.016 n=5+5) RegexpMatchEasy0_32-16 286MB/s ± 1% 285MB/s ± 3% ~ (p=1.000 n=5+5) RegexpMatchEasy0_1K-16 2.51GB/s ± 1% 2.49GB/s ± 2% ~ (p=0.095 n=5+5) RegexpMatchEasy1_32-16 309MB/s ± 1% 307MB/s ± 1% ~ (p=0.548 n=5+5) RegexpMatchEasy1_1K-16 1.55GB/s ± 2% 1.57GB/s ± 1% ~ (p=0.690 n=5+5) RegexpMatchMedium_32-16 5.68MB/s ± 2% 5.73MB/s ± 1% ~ (p=0.579 n=5+5) RegexpMatchMedium_1K-16 17.5MB/s ± 4% 17.8MB/s ± 4% ~ (p=0.500 n=5+5) RegexpMatchHard_32-16 10.4MB/s ± 3% 10.5MB/s ± 4% ~ (p=0.460 n=5+5) RegexpMatchHard_1K-16 11.5MB/s ± 1% 11.7MB/s ± 2% +1.57% (p=0.032 n=5+5) Revcomp-16 442MB/s ± 0% 433MB/s ± 2% -2.05% (p=0.032 n=4+5) Template-16 17.7MB/s ± 1% 18.2MB/s ± 3% +3.12% (p=0.032 n=5+5) Change-Id: I6972e8f35f2b31f9a42ac473a6bf419a18022558 Reviewed-on: https://go-review.googlesource.com/100935 Run-TryBot: Giovanni Bajo <rasky@develer.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2018-03-05 20:59:40 +01:00
//
// TODO(rasky): we could generate:
// CMOV*NE DST,SRC
// CMOV*PC SRC,DST
// But this requires a way for regalloc to know that SRC might be
// clobbered by this instruction.
t := v.RegTmp()
opregreg(s, moveByRegsWidth(t, v.Args[1].Reg(), v.Type.Size()), t, v.Args[1].Reg())
cmd/compile: implement CMOV on amd64 This builds upon the branchelim pass, activating it for amd64 and lowering CondSelect. Special care is made to FPU instructions for NaN handling. Benchmark results on Xeon E5630 (Westmere EP): name old time/op new time/op delta BinaryTree17-16 4.99s ± 9% 4.66s ± 2% ~ (p=0.095 n=5+5) Fannkuch11-16 4.93s ± 3% 5.04s ± 2% ~ (p=0.548 n=5+5) FmtFprintfEmpty-16 58.8ns ± 7% 61.4ns ±14% ~ (p=0.579 n=5+5) FmtFprintfString-16 114ns ± 2% 114ns ± 4% ~ (p=0.603 n=5+5) FmtFprintfInt-16 181ns ± 4% 125ns ± 3% -30.90% (p=0.008 n=5+5) FmtFprintfIntInt-16 263ns ± 2% 217ns ± 2% -17.34% (p=0.008 n=5+5) FmtFprintfPrefixedInt-16 230ns ± 1% 212ns ± 1% -7.99% (p=0.008 n=5+5) FmtFprintfFloat-16 411ns ± 3% 344ns ± 5% -16.43% (p=0.008 n=5+5) FmtManyArgs-16 828ns ± 4% 790ns ± 2% -4.59% (p=0.032 n=5+5) GobDecode-16 10.9ms ± 4% 10.8ms ± 5% ~ (p=0.548 n=5+5) GobEncode-16 9.52ms ± 5% 9.46ms ± 2% ~ (p=1.000 n=5+5) Gzip-16 334ms ± 2% 337ms ± 2% ~ (p=0.548 n=5+5) Gunzip-16 64.4ms ± 1% 65.0ms ± 1% +1.00% (p=0.008 n=5+5) HTTPClientServer-16 156µs ± 3% 155µs ± 3% ~ (p=0.690 n=5+5) JSONEncode-16 21.0ms ± 1% 21.8ms ± 0% +3.76% (p=0.016 n=5+4) JSONDecode-16 95.1ms ± 0% 95.7ms ± 1% ~ (p=0.151 n=5+5) Mandelbrot200-16 6.38ms ± 1% 6.42ms ± 1% ~ (p=0.095 n=5+5) GoParse-16 5.47ms ± 2% 5.36ms ± 1% -1.95% (p=0.016 n=5+5) RegexpMatchEasy0_32-16 111ns ± 1% 111ns ± 1% ~ (p=0.635 n=5+4) RegexpMatchEasy0_1K-16 408ns ± 1% 411ns ± 2% ~ (p=0.087 n=5+5) RegexpMatchEasy1_32-16 103ns ± 1% 104ns ± 1% ~ (p=0.484 n=5+5) RegexpMatchEasy1_1K-16 659ns ± 2% 652ns ± 1% ~ (p=0.571 n=5+5) RegexpMatchMedium_32-16 176ns ± 2% 174ns ± 1% ~ (p=0.476 n=5+5) RegexpMatchMedium_1K-16 58.6µs ± 4% 57.7µs ± 4% ~ (p=0.548 n=5+5) RegexpMatchHard_32-16 3.07µs ± 3% 3.04µs ± 4% ~ (p=0.421 n=5+5) RegexpMatchHard_1K-16 89.2µs ± 1% 87.9µs ± 2% -1.52% (p=0.032 n=5+5) Revcomp-16 575ms ± 0% 587ms ± 2% +2.12% (p=0.032 n=4+5) Template-16 110ms ± 1% 107ms ± 3% -3.00% (p=0.032 n=5+5) TimeParse-16 463ns ± 0% 462ns ± 0% ~ (p=0.810 n=5+4) TimeFormat-16 538ns ± 0% 535ns ± 0% -0.63% (p=0.024 n=5+5) name old speed new speed delta GobDecode-16 70.7MB/s ± 4% 71.4MB/s ± 5% ~ (p=0.452 n=5+5) GobEncode-16 80.7MB/s ± 5% 81.2MB/s ± 2% ~ (p=1.000 n=5+5) Gzip-16 58.2MB/s ± 2% 57.7MB/s ± 2% ~ (p=0.452 n=5+5) Gunzip-16 302MB/s ± 1% 299MB/s ± 1% -0.99% (p=0.008 n=5+5) JSONEncode-16 92.4MB/s ± 1% 89.1MB/s ± 0% -3.63% (p=0.016 n=5+4) JSONDecode-16 20.4MB/s ± 0% 20.3MB/s ± 1% ~ (p=0.135 n=5+5) GoParse-16 10.6MB/s ± 2% 10.8MB/s ± 1% +2.00% (p=0.016 n=5+5) RegexpMatchEasy0_32-16 286MB/s ± 1% 285MB/s ± 3% ~ (p=1.000 n=5+5) RegexpMatchEasy0_1K-16 2.51GB/s ± 1% 2.49GB/s ± 2% ~ (p=0.095 n=5+5) RegexpMatchEasy1_32-16 309MB/s ± 1% 307MB/s ± 1% ~ (p=0.548 n=5+5) RegexpMatchEasy1_1K-16 1.55GB/s ± 2% 1.57GB/s ± 1% ~ (p=0.690 n=5+5) RegexpMatchMedium_32-16 5.68MB/s ± 2% 5.73MB/s ± 1% ~ (p=0.579 n=5+5) RegexpMatchMedium_1K-16 17.5MB/s ± 4% 17.8MB/s ± 4% ~ (p=0.500 n=5+5) RegexpMatchHard_32-16 10.4MB/s ± 3% 10.5MB/s ± 4% ~ (p=0.460 n=5+5) RegexpMatchHard_1K-16 11.5MB/s ± 1% 11.7MB/s ± 2% +1.57% (p=0.032 n=5+5) Revcomp-16 442MB/s ± 0% 433MB/s ± 2% -2.05% (p=0.032 n=4+5) Template-16 17.7MB/s ± 1% 18.2MB/s ± 3% +3.12% (p=0.032 n=5+5) Change-Id: I6972e8f35f2b31f9a42ac473a6bf419a18022558 Reviewed-on: https://go-review.googlesource.com/100935 Run-TryBot: Giovanni Bajo <rasky@develer.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2018-03-05 20:59:40 +01:00
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Reg()
cmd/compile: implement CMOV on amd64 This builds upon the branchelim pass, activating it for amd64 and lowering CondSelect. Special care is made to FPU instructions for NaN handling. Benchmark results on Xeon E5630 (Westmere EP): name old time/op new time/op delta BinaryTree17-16 4.99s ± 9% 4.66s ± 2% ~ (p=0.095 n=5+5) Fannkuch11-16 4.93s ± 3% 5.04s ± 2% ~ (p=0.548 n=5+5) FmtFprintfEmpty-16 58.8ns ± 7% 61.4ns ±14% ~ (p=0.579 n=5+5) FmtFprintfString-16 114ns ± 2% 114ns ± 4% ~ (p=0.603 n=5+5) FmtFprintfInt-16 181ns ± 4% 125ns ± 3% -30.90% (p=0.008 n=5+5) FmtFprintfIntInt-16 263ns ± 2% 217ns ± 2% -17.34% (p=0.008 n=5+5) FmtFprintfPrefixedInt-16 230ns ± 1% 212ns ± 1% -7.99% (p=0.008 n=5+5) FmtFprintfFloat-16 411ns ± 3% 344ns ± 5% -16.43% (p=0.008 n=5+5) FmtManyArgs-16 828ns ± 4% 790ns ± 2% -4.59% (p=0.032 n=5+5) GobDecode-16 10.9ms ± 4% 10.8ms ± 5% ~ (p=0.548 n=5+5) GobEncode-16 9.52ms ± 5% 9.46ms ± 2% ~ (p=1.000 n=5+5) Gzip-16 334ms ± 2% 337ms ± 2% ~ (p=0.548 n=5+5) Gunzip-16 64.4ms ± 1% 65.0ms ± 1% +1.00% (p=0.008 n=5+5) HTTPClientServer-16 156µs ± 3% 155µs ± 3% ~ (p=0.690 n=5+5) JSONEncode-16 21.0ms ± 1% 21.8ms ± 0% +3.76% (p=0.016 n=5+4) JSONDecode-16 95.1ms ± 0% 95.7ms ± 1% ~ (p=0.151 n=5+5) Mandelbrot200-16 6.38ms ± 1% 6.42ms ± 1% ~ (p=0.095 n=5+5) GoParse-16 5.47ms ± 2% 5.36ms ± 1% -1.95% (p=0.016 n=5+5) RegexpMatchEasy0_32-16 111ns ± 1% 111ns ± 1% ~ (p=0.635 n=5+4) RegexpMatchEasy0_1K-16 408ns ± 1% 411ns ± 2% ~ (p=0.087 n=5+5) RegexpMatchEasy1_32-16 103ns ± 1% 104ns ± 1% ~ (p=0.484 n=5+5) RegexpMatchEasy1_1K-16 659ns ± 2% 652ns ± 1% ~ (p=0.571 n=5+5) RegexpMatchMedium_32-16 176ns ± 2% 174ns ± 1% ~ (p=0.476 n=5+5) RegexpMatchMedium_1K-16 58.6µs ± 4% 57.7µs ± 4% ~ (p=0.548 n=5+5) RegexpMatchHard_32-16 3.07µs ± 3% 3.04µs ± 4% ~ (p=0.421 n=5+5) RegexpMatchHard_1K-16 89.2µs ± 1% 87.9µs ± 2% -1.52% (p=0.032 n=5+5) Revcomp-16 575ms ± 0% 587ms ± 2% +2.12% (p=0.032 n=4+5) Template-16 110ms ± 1% 107ms ± 3% -3.00% (p=0.032 n=5+5) TimeParse-16 463ns ± 0% 462ns ± 0% ~ (p=0.810 n=5+4) TimeFormat-16 538ns ± 0% 535ns ± 0% -0.63% (p=0.024 n=5+5) name old speed new speed delta GobDecode-16 70.7MB/s ± 4% 71.4MB/s ± 5% ~ (p=0.452 n=5+5) GobEncode-16 80.7MB/s ± 5% 81.2MB/s ± 2% ~ (p=1.000 n=5+5) Gzip-16 58.2MB/s ± 2% 57.7MB/s ± 2% ~ (p=0.452 n=5+5) Gunzip-16 302MB/s ± 1% 299MB/s ± 1% -0.99% (p=0.008 n=5+5) JSONEncode-16 92.4MB/s ± 1% 89.1MB/s ± 0% -3.63% (p=0.016 n=5+4) JSONDecode-16 20.4MB/s ± 0% 20.3MB/s ± 1% ~ (p=0.135 n=5+5) GoParse-16 10.6MB/s ± 2% 10.8MB/s ± 1% +2.00% (p=0.016 n=5+5) RegexpMatchEasy0_32-16 286MB/s ± 1% 285MB/s ± 3% ~ (p=1.000 n=5+5) RegexpMatchEasy0_1K-16 2.51GB/s ± 1% 2.49GB/s ± 2% ~ (p=0.095 n=5+5) RegexpMatchEasy1_32-16 309MB/s ± 1% 307MB/s ± 1% ~ (p=0.548 n=5+5) RegexpMatchEasy1_1K-16 1.55GB/s ± 2% 1.57GB/s ± 1% ~ (p=0.690 n=5+5) RegexpMatchMedium_32-16 5.68MB/s ± 2% 5.73MB/s ± 1% ~ (p=0.579 n=5+5) RegexpMatchMedium_1K-16 17.5MB/s ± 4% 17.8MB/s ± 4% ~ (p=0.500 n=5+5) RegexpMatchHard_32-16 10.4MB/s ± 3% 10.5MB/s ± 4% ~ (p=0.460 n=5+5) RegexpMatchHard_1K-16 11.5MB/s ± 1% 11.7MB/s ± 2% +1.57% (p=0.032 n=5+5) Revcomp-16 442MB/s ± 0% 433MB/s ± 2% -2.05% (p=0.032 n=4+5) Template-16 17.7MB/s ± 1% 18.2MB/s ± 3% +3.12% (p=0.032 n=5+5) Change-Id: I6972e8f35f2b31f9a42ac473a6bf419a18022558 Reviewed-on: https://go-review.googlesource.com/100935 Run-TryBot: Giovanni Bajo <rasky@develer.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2018-03-05 20:59:40 +01:00
p.To.Type = obj.TYPE_REG
p.To.Reg = t
cmd/compile: implement CMOV on amd64 This builds upon the branchelim pass, activating it for amd64 and lowering CondSelect. Special care is made to FPU instructions for NaN handling. Benchmark results on Xeon E5630 (Westmere EP): name old time/op new time/op delta BinaryTree17-16 4.99s ± 9% 4.66s ± 2% ~ (p=0.095 n=5+5) Fannkuch11-16 4.93s ± 3% 5.04s ± 2% ~ (p=0.548 n=5+5) FmtFprintfEmpty-16 58.8ns ± 7% 61.4ns ±14% ~ (p=0.579 n=5+5) FmtFprintfString-16 114ns ± 2% 114ns ± 4% ~ (p=0.603 n=5+5) FmtFprintfInt-16 181ns ± 4% 125ns ± 3% -30.90% (p=0.008 n=5+5) FmtFprintfIntInt-16 263ns ± 2% 217ns ± 2% -17.34% (p=0.008 n=5+5) FmtFprintfPrefixedInt-16 230ns ± 1% 212ns ± 1% -7.99% (p=0.008 n=5+5) FmtFprintfFloat-16 411ns ± 3% 344ns ± 5% -16.43% (p=0.008 n=5+5) FmtManyArgs-16 828ns ± 4% 790ns ± 2% -4.59% (p=0.032 n=5+5) GobDecode-16 10.9ms ± 4% 10.8ms ± 5% ~ (p=0.548 n=5+5) GobEncode-16 9.52ms ± 5% 9.46ms ± 2% ~ (p=1.000 n=5+5) Gzip-16 334ms ± 2% 337ms ± 2% ~ (p=0.548 n=5+5) Gunzip-16 64.4ms ± 1% 65.0ms ± 1% +1.00% (p=0.008 n=5+5) HTTPClientServer-16 156µs ± 3% 155µs ± 3% ~ (p=0.690 n=5+5) JSONEncode-16 21.0ms ± 1% 21.8ms ± 0% +3.76% (p=0.016 n=5+4) JSONDecode-16 95.1ms ± 0% 95.7ms ± 1% ~ (p=0.151 n=5+5) Mandelbrot200-16 6.38ms ± 1% 6.42ms ± 1% ~ (p=0.095 n=5+5) GoParse-16 5.47ms ± 2% 5.36ms ± 1% -1.95% (p=0.016 n=5+5) RegexpMatchEasy0_32-16 111ns ± 1% 111ns ± 1% ~ (p=0.635 n=5+4) RegexpMatchEasy0_1K-16 408ns ± 1% 411ns ± 2% ~ (p=0.087 n=5+5) RegexpMatchEasy1_32-16 103ns ± 1% 104ns ± 1% ~ (p=0.484 n=5+5) RegexpMatchEasy1_1K-16 659ns ± 2% 652ns ± 1% ~ (p=0.571 n=5+5) RegexpMatchMedium_32-16 176ns ± 2% 174ns ± 1% ~ (p=0.476 n=5+5) RegexpMatchMedium_1K-16 58.6µs ± 4% 57.7µs ± 4% ~ (p=0.548 n=5+5) RegexpMatchHard_32-16 3.07µs ± 3% 3.04µs ± 4% ~ (p=0.421 n=5+5) RegexpMatchHard_1K-16 89.2µs ± 1% 87.9µs ± 2% -1.52% (p=0.032 n=5+5) Revcomp-16 575ms ± 0% 587ms ± 2% +2.12% (p=0.032 n=4+5) Template-16 110ms ± 1% 107ms ± 3% -3.00% (p=0.032 n=5+5) TimeParse-16 463ns ± 0% 462ns ± 0% ~ (p=0.810 n=5+4) TimeFormat-16 538ns ± 0% 535ns ± 0% -0.63% (p=0.024 n=5+5) name old speed new speed delta GobDecode-16 70.7MB/s ± 4% 71.4MB/s ± 5% ~ (p=0.452 n=5+5) GobEncode-16 80.7MB/s ± 5% 81.2MB/s ± 2% ~ (p=1.000 n=5+5) Gzip-16 58.2MB/s ± 2% 57.7MB/s ± 2% ~ (p=0.452 n=5+5) Gunzip-16 302MB/s ± 1% 299MB/s ± 1% -0.99% (p=0.008 n=5+5) JSONEncode-16 92.4MB/s ± 1% 89.1MB/s ± 0% -3.63% (p=0.016 n=5+4) JSONDecode-16 20.4MB/s ± 0% 20.3MB/s ± 1% ~ (p=0.135 n=5+5) GoParse-16 10.6MB/s ± 2% 10.8MB/s ± 1% +2.00% (p=0.016 n=5+5) RegexpMatchEasy0_32-16 286MB/s ± 1% 285MB/s ± 3% ~ (p=1.000 n=5+5) RegexpMatchEasy0_1K-16 2.51GB/s ± 1% 2.49GB/s ± 2% ~ (p=0.095 n=5+5) RegexpMatchEasy1_32-16 309MB/s ± 1% 307MB/s ± 1% ~ (p=0.548 n=5+5) RegexpMatchEasy1_1K-16 1.55GB/s ± 2% 1.57GB/s ± 1% ~ (p=0.690 n=5+5) RegexpMatchMedium_32-16 5.68MB/s ± 2% 5.73MB/s ± 1% ~ (p=0.579 n=5+5) RegexpMatchMedium_1K-16 17.5MB/s ± 4% 17.8MB/s ± 4% ~ (p=0.500 n=5+5) RegexpMatchHard_32-16 10.4MB/s ± 3% 10.5MB/s ± 4% ~ (p=0.460 n=5+5) RegexpMatchHard_1K-16 11.5MB/s ± 1% 11.7MB/s ± 2% +1.57% (p=0.032 n=5+5) Revcomp-16 442MB/s ± 0% 433MB/s ± 2% -2.05% (p=0.032 n=4+5) Template-16 17.7MB/s ± 1% 18.2MB/s ± 3% +3.12% (p=0.032 n=5+5) Change-Id: I6972e8f35f2b31f9a42ac473a6bf419a18022558 Reviewed-on: https://go-review.googlesource.com/100935 Run-TryBot: Giovanni Bajo <rasky@develer.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2018-03-05 20:59:40 +01:00
var q *obj.Prog
if v.Op == ssa.OpAMD64CMOVQEQF {
q = s.Prog(x86.ACMOVQPC)
} else if v.Op == ssa.OpAMD64CMOVLEQF {
q = s.Prog(x86.ACMOVLPC)
} else {
q = s.Prog(x86.ACMOVWPC)
}
q.From.Type = obj.TYPE_REG
q.From.Reg = t
cmd/compile: implement CMOV on amd64 This builds upon the branchelim pass, activating it for amd64 and lowering CondSelect. Special care is made to FPU instructions for NaN handling. Benchmark results on Xeon E5630 (Westmere EP): name old time/op new time/op delta BinaryTree17-16 4.99s ± 9% 4.66s ± 2% ~ (p=0.095 n=5+5) Fannkuch11-16 4.93s ± 3% 5.04s ± 2% ~ (p=0.548 n=5+5) FmtFprintfEmpty-16 58.8ns ± 7% 61.4ns ±14% ~ (p=0.579 n=5+5) FmtFprintfString-16 114ns ± 2% 114ns ± 4% ~ (p=0.603 n=5+5) FmtFprintfInt-16 181ns ± 4% 125ns ± 3% -30.90% (p=0.008 n=5+5) FmtFprintfIntInt-16 263ns ± 2% 217ns ± 2% -17.34% (p=0.008 n=5+5) FmtFprintfPrefixedInt-16 230ns ± 1% 212ns ± 1% -7.99% (p=0.008 n=5+5) FmtFprintfFloat-16 411ns ± 3% 344ns ± 5% -16.43% (p=0.008 n=5+5) FmtManyArgs-16 828ns ± 4% 790ns ± 2% -4.59% (p=0.032 n=5+5) GobDecode-16 10.9ms ± 4% 10.8ms ± 5% ~ (p=0.548 n=5+5) GobEncode-16 9.52ms ± 5% 9.46ms ± 2% ~ (p=1.000 n=5+5) Gzip-16 334ms ± 2% 337ms ± 2% ~ (p=0.548 n=5+5) Gunzip-16 64.4ms ± 1% 65.0ms ± 1% +1.00% (p=0.008 n=5+5) HTTPClientServer-16 156µs ± 3% 155µs ± 3% ~ (p=0.690 n=5+5) JSONEncode-16 21.0ms ± 1% 21.8ms ± 0% +3.76% (p=0.016 n=5+4) JSONDecode-16 95.1ms ± 0% 95.7ms ± 1% ~ (p=0.151 n=5+5) Mandelbrot200-16 6.38ms ± 1% 6.42ms ± 1% ~ (p=0.095 n=5+5) GoParse-16 5.47ms ± 2% 5.36ms ± 1% -1.95% (p=0.016 n=5+5) RegexpMatchEasy0_32-16 111ns ± 1% 111ns ± 1% ~ (p=0.635 n=5+4) RegexpMatchEasy0_1K-16 408ns ± 1% 411ns ± 2% ~ (p=0.087 n=5+5) RegexpMatchEasy1_32-16 103ns ± 1% 104ns ± 1% ~ (p=0.484 n=5+5) RegexpMatchEasy1_1K-16 659ns ± 2% 652ns ± 1% ~ (p=0.571 n=5+5) RegexpMatchMedium_32-16 176ns ± 2% 174ns ± 1% ~ (p=0.476 n=5+5) RegexpMatchMedium_1K-16 58.6µs ± 4% 57.7µs ± 4% ~ (p=0.548 n=5+5) RegexpMatchHard_32-16 3.07µs ± 3% 3.04µs ± 4% ~ (p=0.421 n=5+5) RegexpMatchHard_1K-16 89.2µs ± 1% 87.9µs ± 2% -1.52% (p=0.032 n=5+5) Revcomp-16 575ms ± 0% 587ms ± 2% +2.12% (p=0.032 n=4+5) Template-16 110ms ± 1% 107ms ± 3% -3.00% (p=0.032 n=5+5) TimeParse-16 463ns ± 0% 462ns ± 0% ~ (p=0.810 n=5+4) TimeFormat-16 538ns ± 0% 535ns ± 0% -0.63% (p=0.024 n=5+5) name old speed new speed delta GobDecode-16 70.7MB/s ± 4% 71.4MB/s ± 5% ~ (p=0.452 n=5+5) GobEncode-16 80.7MB/s ± 5% 81.2MB/s ± 2% ~ (p=1.000 n=5+5) Gzip-16 58.2MB/s ± 2% 57.7MB/s ± 2% ~ (p=0.452 n=5+5) Gunzip-16 302MB/s ± 1% 299MB/s ± 1% -0.99% (p=0.008 n=5+5) JSONEncode-16 92.4MB/s ± 1% 89.1MB/s ± 0% -3.63% (p=0.016 n=5+4) JSONDecode-16 20.4MB/s ± 0% 20.3MB/s ± 1% ~ (p=0.135 n=5+5) GoParse-16 10.6MB/s ± 2% 10.8MB/s ± 1% +2.00% (p=0.016 n=5+5) RegexpMatchEasy0_32-16 286MB/s ± 1% 285MB/s ± 3% ~ (p=1.000 n=5+5) RegexpMatchEasy0_1K-16 2.51GB/s ± 1% 2.49GB/s ± 2% ~ (p=0.095 n=5+5) RegexpMatchEasy1_32-16 309MB/s ± 1% 307MB/s ± 1% ~ (p=0.548 n=5+5) RegexpMatchEasy1_1K-16 1.55GB/s ± 2% 1.57GB/s ± 1% ~ (p=0.690 n=5+5) RegexpMatchMedium_32-16 5.68MB/s ± 2% 5.73MB/s ± 1% ~ (p=0.579 n=5+5) RegexpMatchMedium_1K-16 17.5MB/s ± 4% 17.8MB/s ± 4% ~ (p=0.500 n=5+5) RegexpMatchHard_32-16 10.4MB/s ± 3% 10.5MB/s ± 4% ~ (p=0.460 n=5+5) RegexpMatchHard_1K-16 11.5MB/s ± 1% 11.7MB/s ± 2% +1.57% (p=0.032 n=5+5) Revcomp-16 442MB/s ± 0% 433MB/s ± 2% -2.05% (p=0.032 n=4+5) Template-16 17.7MB/s ± 1% 18.2MB/s ± 3% +3.12% (p=0.032 n=5+5) Change-Id: I6972e8f35f2b31f9a42ac473a6bf419a18022558 Reviewed-on: https://go-review.googlesource.com/100935 Run-TryBot: Giovanni Bajo <rasky@develer.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2018-03-05 20:59:40 +01:00
q.To.Type = obj.TYPE_REG
q.To.Reg = v.Reg()
cmd/compile: implement CMOV on amd64 This builds upon the branchelim pass, activating it for amd64 and lowering CondSelect. Special care is made to FPU instructions for NaN handling. Benchmark results on Xeon E5630 (Westmere EP): name old time/op new time/op delta BinaryTree17-16 4.99s ± 9% 4.66s ± 2% ~ (p=0.095 n=5+5) Fannkuch11-16 4.93s ± 3% 5.04s ± 2% ~ (p=0.548 n=5+5) FmtFprintfEmpty-16 58.8ns ± 7% 61.4ns ±14% ~ (p=0.579 n=5+5) FmtFprintfString-16 114ns ± 2% 114ns ± 4% ~ (p=0.603 n=5+5) FmtFprintfInt-16 181ns ± 4% 125ns ± 3% -30.90% (p=0.008 n=5+5) FmtFprintfIntInt-16 263ns ± 2% 217ns ± 2% -17.34% (p=0.008 n=5+5) FmtFprintfPrefixedInt-16 230ns ± 1% 212ns ± 1% -7.99% (p=0.008 n=5+5) FmtFprintfFloat-16 411ns ± 3% 344ns ± 5% -16.43% (p=0.008 n=5+5) FmtManyArgs-16 828ns ± 4% 790ns ± 2% -4.59% (p=0.032 n=5+5) GobDecode-16 10.9ms ± 4% 10.8ms ± 5% ~ (p=0.548 n=5+5) GobEncode-16 9.52ms ± 5% 9.46ms ± 2% ~ (p=1.000 n=5+5) Gzip-16 334ms ± 2% 337ms ± 2% ~ (p=0.548 n=5+5) Gunzip-16 64.4ms ± 1% 65.0ms ± 1% +1.00% (p=0.008 n=5+5) HTTPClientServer-16 156µs ± 3% 155µs ± 3% ~ (p=0.690 n=5+5) JSONEncode-16 21.0ms ± 1% 21.8ms ± 0% +3.76% (p=0.016 n=5+4) JSONDecode-16 95.1ms ± 0% 95.7ms ± 1% ~ (p=0.151 n=5+5) Mandelbrot200-16 6.38ms ± 1% 6.42ms ± 1% ~ (p=0.095 n=5+5) GoParse-16 5.47ms ± 2% 5.36ms ± 1% -1.95% (p=0.016 n=5+5) RegexpMatchEasy0_32-16 111ns ± 1% 111ns ± 1% ~ (p=0.635 n=5+4) RegexpMatchEasy0_1K-16 408ns ± 1% 411ns ± 2% ~ (p=0.087 n=5+5) RegexpMatchEasy1_32-16 103ns ± 1% 104ns ± 1% ~ (p=0.484 n=5+5) RegexpMatchEasy1_1K-16 659ns ± 2% 652ns ± 1% ~ (p=0.571 n=5+5) RegexpMatchMedium_32-16 176ns ± 2% 174ns ± 1% ~ (p=0.476 n=5+5) RegexpMatchMedium_1K-16 58.6µs ± 4% 57.7µs ± 4% ~ (p=0.548 n=5+5) RegexpMatchHard_32-16 3.07µs ± 3% 3.04µs ± 4% ~ (p=0.421 n=5+5) RegexpMatchHard_1K-16 89.2µs ± 1% 87.9µs ± 2% -1.52% (p=0.032 n=5+5) Revcomp-16 575ms ± 0% 587ms ± 2% +2.12% (p=0.032 n=4+5) Template-16 110ms ± 1% 107ms ± 3% -3.00% (p=0.032 n=5+5) TimeParse-16 463ns ± 0% 462ns ± 0% ~ (p=0.810 n=5+4) TimeFormat-16 538ns ± 0% 535ns ± 0% -0.63% (p=0.024 n=5+5) name old speed new speed delta GobDecode-16 70.7MB/s ± 4% 71.4MB/s ± 5% ~ (p=0.452 n=5+5) GobEncode-16 80.7MB/s ± 5% 81.2MB/s ± 2% ~ (p=1.000 n=5+5) Gzip-16 58.2MB/s ± 2% 57.7MB/s ± 2% ~ (p=0.452 n=5+5) Gunzip-16 302MB/s ± 1% 299MB/s ± 1% -0.99% (p=0.008 n=5+5) JSONEncode-16 92.4MB/s ± 1% 89.1MB/s ± 0% -3.63% (p=0.016 n=5+4) JSONDecode-16 20.4MB/s ± 0% 20.3MB/s ± 1% ~ (p=0.135 n=5+5) GoParse-16 10.6MB/s ± 2% 10.8MB/s ± 1% +2.00% (p=0.016 n=5+5) RegexpMatchEasy0_32-16 286MB/s ± 1% 285MB/s ± 3% ~ (p=1.000 n=5+5) RegexpMatchEasy0_1K-16 2.51GB/s ± 1% 2.49GB/s ± 2% ~ (p=0.095 n=5+5) RegexpMatchEasy1_32-16 309MB/s ± 1% 307MB/s ± 1% ~ (p=0.548 n=5+5) RegexpMatchEasy1_1K-16 1.55GB/s ± 2% 1.57GB/s ± 1% ~ (p=0.690 n=5+5) RegexpMatchMedium_32-16 5.68MB/s ± 2% 5.73MB/s ± 1% ~ (p=0.579 n=5+5) RegexpMatchMedium_1K-16 17.5MB/s ± 4% 17.8MB/s ± 4% ~ (p=0.500 n=5+5) RegexpMatchHard_32-16 10.4MB/s ± 3% 10.5MB/s ± 4% ~ (p=0.460 n=5+5) RegexpMatchHard_1K-16 11.5MB/s ± 1% 11.7MB/s ± 2% +1.57% (p=0.032 n=5+5) Revcomp-16 442MB/s ± 0% 433MB/s ± 2% -2.05% (p=0.032 n=4+5) Template-16 17.7MB/s ± 1% 18.2MB/s ± 3% +3.12% (p=0.032 n=5+5) Change-Id: I6972e8f35f2b31f9a42ac473a6bf419a18022558 Reviewed-on: https://go-review.googlesource.com/100935 Run-TryBot: Giovanni Bajo <rasky@develer.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2018-03-05 20:59:40 +01:00
case ssa.OpAMD64MULQconst, ssa.OpAMD64MULLconst:
r := v.Reg()
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_CONST
p.From.Offset = v.AuxInt
p.To.Type = obj.TYPE_REG
p.To.Reg = r
p.AddRestSourceReg(v.Args[0].Reg())
cmd/compile: use ANDL for small immediates We can rewrite ANDQ with an immediate fitting in 32bit with an ANDL, which is shorter to encode. Looking at Go binary itself, before the change there was: ANDL: 2337 ANDQ: 4476 After the change: ANDL: 3790 ANDQ: 3024 So we got rid of 1452 ANDQs This makes the Linux x86_64 binary 0.03% smaller. There seems to be an impact on performance. Intel Cascade Lake benchmarks (with perflock): name old time/op new time/op delta BinaryTree17-8 1.91s ± 1% 1.89s ± 1% -1.22% (p=0.000 n=21+18) Fannkuch11-8 2.34s ± 0% 2.34s ± 0% ~ (p=0.052 n=20+20) FmtFprintfEmpty-8 27.7ns ± 1% 27.4ns ± 3% ~ (p=0.497 n=21+21) FmtFprintfString-8 53.2ns ± 0% 51.5ns ± 0% -3.21% (p=0.000 n=20+19) FmtFprintfInt-8 57.3ns ± 0% 55.7ns ± 0% -2.89% (p=0.000 n=19+19) FmtFprintfIntInt-8 92.3ns ± 0% 88.4ns ± 1% -4.23% (p=0.000 n=20+21) FmtFprintfPrefixedInt-8 103ns ± 0% 103ns ± 0% +0.23% (p=0.000 n=20+21) FmtFprintfFloat-8 147ns ± 0% 148ns ± 0% +0.75% (p=0.000 n=20+21) FmtManyArgs-8 384ns ± 0% 381ns ± 0% -0.63% (p=0.000 n=21+21) GobDecode-8 3.86ms ± 1% 3.88ms ± 1% +0.52% (p=0.000 n=20+21) GobEncode-8 2.77ms ± 1% 2.77ms ± 0% ~ (p=0.078 n=21+21) Gzip-8 168ms ± 1% 168ms ± 0% +0.24% (p=0.000 n=20+20) Gunzip-8 25.1ms ± 0% 24.3ms ± 0% -3.03% (p=0.000 n=21+21) HTTPClientServer-8 61.4µs ± 8% 59.1µs ±10% ~ (p=0.088 n=20+21) JSONEncode-8 6.86ms ± 0% 6.70ms ± 0% -2.29% (p=0.000 n=20+19) JSONDecode-8 30.8ms ± 1% 30.6ms ± 1% -0.82% (p=0.000 n=20+20) Mandelbrot200-8 3.85ms ± 0% 3.85ms ± 0% ~ (p=0.191 n=16+17) GoParse-8 2.61ms ± 2% 2.60ms ± 1% ~ (p=0.561 n=21+20) RegexpMatchEasy0_32-8 48.5ns ± 2% 45.9ns ± 3% -5.26% (p=0.000 n=20+21) RegexpMatchEasy0_1K-8 139ns ± 0% 139ns ± 0% +0.27% (p=0.000 n=18+20) RegexpMatchEasy1_32-8 41.3ns ± 0% 42.1ns ± 4% +1.95% (p=0.000 n=17+21) RegexpMatchEasy1_1K-8 216ns ± 2% 216ns ± 0% +0.17% (p=0.020 n=21+19) RegexpMatchMedium_32-8 790ns ± 7% 803ns ± 8% ~ (p=0.178 n=21+21) RegexpMatchMedium_1K-8 23.5µs ± 5% 23.7µs ± 5% ~ (p=0.421 n=21+21) RegexpMatchHard_32-8 1.09µs ± 1% 1.09µs ± 1% -0.53% (p=0.000 n=19+18) RegexpMatchHard_1K-8 33.0µs ± 0% 33.0µs ± 0% ~ (p=0.610 n=21+20) Revcomp-8 348ms ± 0% 353ms ± 0% +1.38% (p=0.000 n=17+18) Template-8 42.0ms ± 1% 41.9ms ± 1% -0.30% (p=0.049 n=20+20) TimeParse-8 185ns ± 0% 185ns ± 0% ~ (p=0.387 n=20+18) TimeFormat-8 237ns ± 1% 241ns ± 1% +1.57% (p=0.000 n=21+21) [Geo mean] 35.4µs 35.2µs -0.66% name old speed new speed delta GobDecode-8 199MB/s ± 1% 198MB/s ± 1% -0.52% (p=0.000 n=20+21) GobEncode-8 277MB/s ± 1% 277MB/s ± 0% ~ (p=0.075 n=21+21) Gzip-8 116MB/s ± 1% 115MB/s ± 0% -0.25% (p=0.000 n=20+20) Gunzip-8 773MB/s ± 0% 797MB/s ± 0% +3.12% (p=0.000 n=21+21) JSONEncode-8 283MB/s ± 0% 290MB/s ± 0% +2.35% (p=0.000 n=20+19) JSONDecode-8 63.0MB/s ± 1% 63.5MB/s ± 1% +0.82% (p=0.000 n=20+20) GoParse-8 22.2MB/s ± 2% 22.3MB/s ± 1% ~ (p=0.539 n=21+20) RegexpMatchEasy0_32-8 660MB/s ± 2% 697MB/s ± 3% +5.57% (p=0.000 n=20+21) RegexpMatchEasy0_1K-8 7.36GB/s ± 0% 7.34GB/s ± 0% -0.26% (p=0.000 n=18+20) RegexpMatchEasy1_32-8 775MB/s ± 0% 761MB/s ± 4% -1.88% (p=0.000 n=17+21) RegexpMatchEasy1_1K-8 4.74GB/s ± 2% 4.74GB/s ± 0% -0.18% (p=0.020 n=21+19) RegexpMatchMedium_32-8 40.6MB/s ± 7% 39.9MB/s ± 9% ~ (p=0.191 n=21+21) RegexpMatchMedium_1K-8 43.7MB/s ± 5% 43.2MB/s ± 5% ~ (p=0.435 n=21+21) RegexpMatchHard_32-8 29.3MB/s ± 1% 29.4MB/s ± 1% +0.53% (p=0.000 n=19+18) RegexpMatchHard_1K-8 31.0MB/s ± 0% 31.0MB/s ± 0% ~ (p=0.572 n=21+20) Revcomp-8 730MB/s ± 0% 720MB/s ± 0% -1.36% (p=0.000 n=17+18) Template-8 46.2MB/s ± 1% 46.3MB/s ± 1% +0.30% (p=0.041 n=20+20) [Geo mean] 204MB/s 205MB/s +0.30% Change-Id: Iac75d0ec184a515ce0e65e19559d5fe2e9840514 Reviewed-on: https://go-review.googlesource.com/c/go/+/354970 Reviewed-by: Josh Bleecher Snyder <josharian@gmail.com> Trust: Josh Bleecher Snyder <josharian@gmail.com> Trust: Keith Randall <khr@golang.org> Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> TryBot-Result: Go Bot <gobot@golang.org>
2021-10-10 17:56:16 +02:00
case ssa.OpAMD64ANDQconst:
asm := v.Op.Asm()
// If the constant is positive and fits into 32 bits, use ANDL.
// This saves a few bytes of encoding.
if 0 <= v.AuxInt && v.AuxInt <= (1<<32-1) {
asm = x86.AANDL
}
p := s.Prog(asm)
p.From.Type = obj.TYPE_CONST
p.From.Offset = v.AuxInt
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpAMD64SUBQconst, ssa.OpAMD64SUBLconst,
cmd/compile: use ANDL for small immediates We can rewrite ANDQ with an immediate fitting in 32bit with an ANDL, which is shorter to encode. Looking at Go binary itself, before the change there was: ANDL: 2337 ANDQ: 4476 After the change: ANDL: 3790 ANDQ: 3024 So we got rid of 1452 ANDQs This makes the Linux x86_64 binary 0.03% smaller. There seems to be an impact on performance. Intel Cascade Lake benchmarks (with perflock): name old time/op new time/op delta BinaryTree17-8 1.91s ± 1% 1.89s ± 1% -1.22% (p=0.000 n=21+18) Fannkuch11-8 2.34s ± 0% 2.34s ± 0% ~ (p=0.052 n=20+20) FmtFprintfEmpty-8 27.7ns ± 1% 27.4ns ± 3% ~ (p=0.497 n=21+21) FmtFprintfString-8 53.2ns ± 0% 51.5ns ± 0% -3.21% (p=0.000 n=20+19) FmtFprintfInt-8 57.3ns ± 0% 55.7ns ± 0% -2.89% (p=0.000 n=19+19) FmtFprintfIntInt-8 92.3ns ± 0% 88.4ns ± 1% -4.23% (p=0.000 n=20+21) FmtFprintfPrefixedInt-8 103ns ± 0% 103ns ± 0% +0.23% (p=0.000 n=20+21) FmtFprintfFloat-8 147ns ± 0% 148ns ± 0% +0.75% (p=0.000 n=20+21) FmtManyArgs-8 384ns ± 0% 381ns ± 0% -0.63% (p=0.000 n=21+21) GobDecode-8 3.86ms ± 1% 3.88ms ± 1% +0.52% (p=0.000 n=20+21) GobEncode-8 2.77ms ± 1% 2.77ms ± 0% ~ (p=0.078 n=21+21) Gzip-8 168ms ± 1% 168ms ± 0% +0.24% (p=0.000 n=20+20) Gunzip-8 25.1ms ± 0% 24.3ms ± 0% -3.03% (p=0.000 n=21+21) HTTPClientServer-8 61.4µs ± 8% 59.1µs ±10% ~ (p=0.088 n=20+21) JSONEncode-8 6.86ms ± 0% 6.70ms ± 0% -2.29% (p=0.000 n=20+19) JSONDecode-8 30.8ms ± 1% 30.6ms ± 1% -0.82% (p=0.000 n=20+20) Mandelbrot200-8 3.85ms ± 0% 3.85ms ± 0% ~ (p=0.191 n=16+17) GoParse-8 2.61ms ± 2% 2.60ms ± 1% ~ (p=0.561 n=21+20) RegexpMatchEasy0_32-8 48.5ns ± 2% 45.9ns ± 3% -5.26% (p=0.000 n=20+21) RegexpMatchEasy0_1K-8 139ns ± 0% 139ns ± 0% +0.27% (p=0.000 n=18+20) RegexpMatchEasy1_32-8 41.3ns ± 0% 42.1ns ± 4% +1.95% (p=0.000 n=17+21) RegexpMatchEasy1_1K-8 216ns ± 2% 216ns ± 0% +0.17% (p=0.020 n=21+19) RegexpMatchMedium_32-8 790ns ± 7% 803ns ± 8% ~ (p=0.178 n=21+21) RegexpMatchMedium_1K-8 23.5µs ± 5% 23.7µs ± 5% ~ (p=0.421 n=21+21) RegexpMatchHard_32-8 1.09µs ± 1% 1.09µs ± 1% -0.53% (p=0.000 n=19+18) RegexpMatchHard_1K-8 33.0µs ± 0% 33.0µs ± 0% ~ (p=0.610 n=21+20) Revcomp-8 348ms ± 0% 353ms ± 0% +1.38% (p=0.000 n=17+18) Template-8 42.0ms ± 1% 41.9ms ± 1% -0.30% (p=0.049 n=20+20) TimeParse-8 185ns ± 0% 185ns ± 0% ~ (p=0.387 n=20+18) TimeFormat-8 237ns ± 1% 241ns ± 1% +1.57% (p=0.000 n=21+21) [Geo mean] 35.4µs 35.2µs -0.66% name old speed new speed delta GobDecode-8 199MB/s ± 1% 198MB/s ± 1% -0.52% (p=0.000 n=20+21) GobEncode-8 277MB/s ± 1% 277MB/s ± 0% ~ (p=0.075 n=21+21) Gzip-8 116MB/s ± 1% 115MB/s ± 0% -0.25% (p=0.000 n=20+20) Gunzip-8 773MB/s ± 0% 797MB/s ± 0% +3.12% (p=0.000 n=21+21) JSONEncode-8 283MB/s ± 0% 290MB/s ± 0% +2.35% (p=0.000 n=20+19) JSONDecode-8 63.0MB/s ± 1% 63.5MB/s ± 1% +0.82% (p=0.000 n=20+20) GoParse-8 22.2MB/s ± 2% 22.3MB/s ± 1% ~ (p=0.539 n=21+20) RegexpMatchEasy0_32-8 660MB/s ± 2% 697MB/s ± 3% +5.57% (p=0.000 n=20+21) RegexpMatchEasy0_1K-8 7.36GB/s ± 0% 7.34GB/s ± 0% -0.26% (p=0.000 n=18+20) RegexpMatchEasy1_32-8 775MB/s ± 0% 761MB/s ± 4% -1.88% (p=0.000 n=17+21) RegexpMatchEasy1_1K-8 4.74GB/s ± 2% 4.74GB/s ± 0% -0.18% (p=0.020 n=21+19) RegexpMatchMedium_32-8 40.6MB/s ± 7% 39.9MB/s ± 9% ~ (p=0.191 n=21+21) RegexpMatchMedium_1K-8 43.7MB/s ± 5% 43.2MB/s ± 5% ~ (p=0.435 n=21+21) RegexpMatchHard_32-8 29.3MB/s ± 1% 29.4MB/s ± 1% +0.53% (p=0.000 n=19+18) RegexpMatchHard_1K-8 31.0MB/s ± 0% 31.0MB/s ± 0% ~ (p=0.572 n=21+20) Revcomp-8 730MB/s ± 0% 720MB/s ± 0% -1.36% (p=0.000 n=17+18) Template-8 46.2MB/s ± 1% 46.3MB/s ± 1% +0.30% (p=0.041 n=20+20) [Geo mean] 204MB/s 205MB/s +0.30% Change-Id: Iac75d0ec184a515ce0e65e19559d5fe2e9840514 Reviewed-on: https://go-review.googlesource.com/c/go/+/354970 Reviewed-by: Josh Bleecher Snyder <josharian@gmail.com> Trust: Josh Bleecher Snyder <josharian@gmail.com> Trust: Keith Randall <khr@golang.org> Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> TryBot-Result: Go Bot <gobot@golang.org>
2021-10-10 17:56:16 +02:00
ssa.OpAMD64ANDLconst,
ssa.OpAMD64ORQconst, ssa.OpAMD64ORLconst,
ssa.OpAMD64XORQconst, ssa.OpAMD64XORLconst,
ssa.OpAMD64SHLQconst, ssa.OpAMD64SHLLconst,
ssa.OpAMD64SHRQconst, ssa.OpAMD64SHRLconst, ssa.OpAMD64SHRWconst, ssa.OpAMD64SHRBconst,
ssa.OpAMD64SARQconst, ssa.OpAMD64SARLconst, ssa.OpAMD64SARWconst, ssa.OpAMD64SARBconst,
ssa.OpAMD64ROLQconst, ssa.OpAMD64ROLLconst, ssa.OpAMD64ROLWconst, ssa.OpAMD64ROLBconst:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_CONST
p.From.Offset = v.AuxInt
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpAMD64SBBQcarrymask, ssa.OpAMD64SBBLcarrymask:
r := v.Reg()
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = r
p.To.Type = obj.TYPE_REG
p.To.Reg = r
case ssa.OpAMD64LEAQ1, ssa.OpAMD64LEAQ2, ssa.OpAMD64LEAQ4, ssa.OpAMD64LEAQ8,
ssa.OpAMD64LEAL1, ssa.OpAMD64LEAL2, ssa.OpAMD64LEAL4, ssa.OpAMD64LEAL8,
ssa.OpAMD64LEAW1, ssa.OpAMD64LEAW2, ssa.OpAMD64LEAW4, ssa.OpAMD64LEAW8:
p := s.Prog(v.Op.Asm())
memIdx(&p.From, v)
o := v.Reg()
p.To.Type = obj.TYPE_REG
cmd/compile: split slow 3 operand LEA instructions into two LEAs go tool objdump ../bin/go | grep "\.go\:" | grep -c "LEA.*0x.*[(].*[(].*" Before: 1012 After: 20 Updates #21735 Benchmarks thanks to drchase@google.com Intel(R) Xeon(R) CPU E5-1650 v3 @ 3.50GHz benchstat -geomean *.stdout | grep -v pkg: name old time/op new time/op delta FastTest2KB-12 131.0ns ± 0% 131.0ns ± 0% ~ (all equal) BaseTest2KB-12 601.3ns ± 0% 601.3ns ± 0% ~ (p=0.942 n=45+41) Encoding4KBVerySparse-12 15.38µs ± 1% 15.41µs ± 0% +0.24% (p=0.000 n=44+43) Join_8-12 2.117s ± 2% 2.128s ± 2% +0.51% (p=0.007 n=48+48) HashimotoLight-12 1.663ms ± 1% 1.668ms ± 1% +0.35% (p=0.006 n=49+49) Sha3_224_MTU-12 4.843µs ± 0% 4.836µs ± 0% -0.14% (p=0.000 n=45+42) GenSharedKeyP256-12 73.74µs ± 0% 70.92µs ± 0% -3.82% (p=0.000 n=48+47) Run/10k/1-12 24.81s ± 0% 24.88s ± 0% +0.30% (p=0.000 n=48+47) Run/10k/16-12 4.621s ± 2% 4.625s ± 3% ~ (p=0.776 n=50+50) Dnrm2MediumPosInc-12 4.018µs ± 0% 4.019µs ± 0% ~ (p=0.060 n=50+48) DasumMediumUnitaryInc-12 855.3ns ± 0% 855.0ns ± 0% -0.03% (p=0.000 n=47+42) Dgeev/Circulant10-12 40.45µs ± 1% 41.11µs ± 1% +1.62% (p=0.000 n=45+49) Dgeev/Circulant100-12 10.42ms ± 2% 10.61ms ± 1% +1.83% (p=0.000 n=49+48) MulWorkspaceDense1000Hundredth-12 64.69ms ± 1% 64.63ms ± 1% ~ (p=0.718 n=48+48) ScaleVec10000Inc20-12 22.31µs ± 1% 22.29µs ± 1% ~ (p=0.424 n=50+50) ValidateVersionTildeFail-12 737.6ns ± 1% 736.0ns ± 1% -0.22% (p=0.000 n=49+49) StripHTML-12 2.846µs ± 0% 2.806µs ± 1% -1.40% (p=0.000 n=43+50) ReaderContains-12 6.073µs ± 0% 5.999µs ± 0% -1.22% (p=0.000 n=48+48) EncodeCodecFromInternalProtobuf-12 5.817µs ± 2% 5.555µs ± 2% -4.51% (p=0.000 n=47+47) TarjanSCCGnp_10_tenth-12 7.091µs ± 5% 7.132µs ± 7% ~ (p=0.361 n=50+50) TarjanSCCGnp_1000_half-12 82.25ms ± 3% 81.29ms ± 2% -1.16% (p=0.000 n=50+43) AStarUndirectedmallWorld_10_2_2_2_Heur-12 15.18µs ± 8% 15.11µs ± 7% ~ (p=0.511 n=50+49) LouvainDirectedMultiplex-12 20.92ms ± 1% 21.00ms ± 1% +0.36% (p=0.000 n=48+49) WalkAllBreadthFirstGnp_10_tenth-12 2.974µs ± 4% 2.964µs ± 5% ~ (p=0.504 n=50+50) WalkAllBreadthFirstGnp_1000_tenth-12 9.733ms ± 4% 9.741ms ± 4% ~ (p=0.774 n=48+50) TextMovementBetweenSegments-12 432.8µs ± 0% 433.2µs ± 1% ~ (p=0.128 n=50+50) Growth_MultiSegment-12 13.11ms ± 0% 13.19ms ± 1% +0.58% (p=0.000 n=44+46) AddingFields/Zap.Sugar-12 1.296µs ± 1% 1.310µs ± 2% +1.09% (p=0.000 n=43+43) AddingFields/apex/log-12 34.19µs ± 1% 34.31µs ± 1% +0.35% (p=0.000 n=45+45) AddingFields/inconshreveable/log15-12 30.08µs ± 2% 30.07µs ± 2% ~ (p=0.803 n=48+47) AddingFields/sirupsen/logrus-12 6.683µs ± 3% 6.735µs ± 3% +0.78% (p=0.000 n=43+42) [Geo mean] 143.5µs 143.3µs -0.16% Change-Id: I637203c75c837737f1febced75d5985703e51044 Reviewed-on: https://go-review.googlesource.com/114655 Run-TryBot: Martin Möhrmann <moehrmann@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: Ilya Tocar <ilya.tocar@intel.com>
2018-05-11 08:01:31 +02:00
p.To.Reg = o
if v.AuxInt != 0 && v.Aux == nil {
// Emit an additional LEA to add the displacement instead of creating a slow 3 operand LEA.
switch v.Op {
case ssa.OpAMD64LEAQ1, ssa.OpAMD64LEAQ2, ssa.OpAMD64LEAQ4, ssa.OpAMD64LEAQ8:
p = s.Prog(x86.ALEAQ)
case ssa.OpAMD64LEAL1, ssa.OpAMD64LEAL2, ssa.OpAMD64LEAL4, ssa.OpAMD64LEAL8:
p = s.Prog(x86.ALEAL)
case ssa.OpAMD64LEAW1, ssa.OpAMD64LEAW2, ssa.OpAMD64LEAW4, ssa.OpAMD64LEAW8:
p = s.Prog(x86.ALEAW)
}
p.From.Type = obj.TYPE_MEM
p.From.Reg = o
p.To.Type = obj.TYPE_REG
p.To.Reg = o
}
[dev.regabi] cmd/compile: split out package ssagen [generated] [git-generate] cd src/cmd/compile/internal/gc rf ' # maxOpenDefers is declared in ssa.go but used only by walk. mv maxOpenDefers walk.go # gc.Arch -> ssagen.Arch # It is not as nice but will do for now. mv Arch ArchInfo mv thearch Arch mv Arch ArchInfo arch.go # Pull dwarf out of pgen.go. mv debuginfo declPos createDwarfVars preInliningDcls \ createSimpleVars createSimpleVar \ createComplexVars createComplexVar \ dwarf.go # Pull high-level compilation out of pgen.go, # leaving only the SSA code. mv compilequeue funccompile compile compilenow \ compileFunctions isInlinableButNotInlined \ initLSym \ compile.go mv BoundsCheckFunc GCWriteBarrierReg ssa.go mv largeStack largeStackFrames CheckLargeStacks pgen.go # All that is left in dcl.go is the nowritebarrierrecCheck mv dcl.go nowb.go # Export API and unexport non-API. mv initssaconfig InitConfig mv isIntrinsicCall IsIntrinsicCall mv ssaDumpInline DumpInline mv initSSATables InitTables mv initSSAEnv InitEnv mv compileSSA Compile mv stackOffset StackOffset mv canSSAType TypeOK mv SSAGenState State mv FwdRefAux fwdRefAux mv cgoSymABIs CgoSymABIs mv readSymABIs ReadSymABIs mv initLSym InitLSym mv useABIWrapGen symabiDefs CgoSymABIs ReadSymABIs InitLSym selectLSym makeABIWrapper setupTextLSym abi.go mv arch.go abi.go nowb.go phi.go pgen.go pgen_test.go ssa.go cmd/compile/internal/ssagen ' rm go.go gsubr.go Change-Id: I47fad6cbf1d1e583fd9139003a08401d7cd048a1 Reviewed-on: https://go-review.googlesource.com/c/go/+/279476 Trust: Russ Cox <rsc@golang.org> Run-TryBot: Russ Cox <rsc@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com>
2020-12-23 00:57:10 -05:00
ssagen.AddAux(&p.From, v)
case ssa.OpAMD64LEAQ, ssa.OpAMD64LEAL, ssa.OpAMD64LEAW:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_MEM
p.From.Reg = v.Args[0].Reg()
[dev.regabi] cmd/compile: split out package ssagen [generated] [git-generate] cd src/cmd/compile/internal/gc rf ' # maxOpenDefers is declared in ssa.go but used only by walk. mv maxOpenDefers walk.go # gc.Arch -> ssagen.Arch # It is not as nice but will do for now. mv Arch ArchInfo mv thearch Arch mv Arch ArchInfo arch.go # Pull dwarf out of pgen.go. mv debuginfo declPos createDwarfVars preInliningDcls \ createSimpleVars createSimpleVar \ createComplexVars createComplexVar \ dwarf.go # Pull high-level compilation out of pgen.go, # leaving only the SSA code. mv compilequeue funccompile compile compilenow \ compileFunctions isInlinableButNotInlined \ initLSym \ compile.go mv BoundsCheckFunc GCWriteBarrierReg ssa.go mv largeStack largeStackFrames CheckLargeStacks pgen.go # All that is left in dcl.go is the nowritebarrierrecCheck mv dcl.go nowb.go # Export API and unexport non-API. mv initssaconfig InitConfig mv isIntrinsicCall IsIntrinsicCall mv ssaDumpInline DumpInline mv initSSATables InitTables mv initSSAEnv InitEnv mv compileSSA Compile mv stackOffset StackOffset mv canSSAType TypeOK mv SSAGenState State mv FwdRefAux fwdRefAux mv cgoSymABIs CgoSymABIs mv readSymABIs ReadSymABIs mv initLSym InitLSym mv useABIWrapGen symabiDefs CgoSymABIs ReadSymABIs InitLSym selectLSym makeABIWrapper setupTextLSym abi.go mv arch.go abi.go nowb.go phi.go pgen.go pgen_test.go ssa.go cmd/compile/internal/ssagen ' rm go.go gsubr.go Change-Id: I47fad6cbf1d1e583fd9139003a08401d7cd048a1 Reviewed-on: https://go-review.googlesource.com/c/go/+/279476 Trust: Russ Cox <rsc@golang.org> Run-TryBot: Russ Cox <rsc@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com>
2020-12-23 00:57:10 -05:00
ssagen.AddAux(&p.From, v)
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpAMD64CMPQ, ssa.OpAMD64CMPL, ssa.OpAMD64CMPW, ssa.OpAMD64CMPB,
ssa.OpAMD64TESTQ, ssa.OpAMD64TESTL, ssa.OpAMD64TESTW, ssa.OpAMD64TESTB,
ssa.OpAMD64BTL, ssa.OpAMD64BTQ:
opregreg(s, v.Op.Asm(), v.Args[1].Reg(), v.Args[0].Reg())
case ssa.OpAMD64UCOMISS, ssa.OpAMD64UCOMISD:
// Go assembler has swapped operands for UCOMISx relative to CMP,
// must account for that right here.
opregreg(s, v.Op.Asm(), v.Args[0].Reg(), v.Args[1].Reg())
case ssa.OpAMD64CMPQconst, ssa.OpAMD64CMPLconst, ssa.OpAMD64CMPWconst, ssa.OpAMD64CMPBconst:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[0].Reg()
p.To.Type = obj.TYPE_CONST
p.To.Offset = v.AuxInt
case ssa.OpAMD64BTLconst, ssa.OpAMD64BTQconst,
ssa.OpAMD64TESTQconst, ssa.OpAMD64TESTLconst, ssa.OpAMD64TESTWconst, ssa.OpAMD64TESTBconst,
ssa.OpAMD64BTSQconst,
ssa.OpAMD64BTCQconst,
ssa.OpAMD64BTRQconst:
op := v.Op
if op == ssa.OpAMD64BTQconst && v.AuxInt < 32 {
// Emit 32-bit version because it's shorter
op = ssa.OpAMD64BTLconst
}
p := s.Prog(op.Asm())
p.From.Type = obj.TYPE_CONST
p.From.Offset = v.AuxInt
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Args[0].Reg()
case ssa.OpAMD64CMPQload, ssa.OpAMD64CMPLload, ssa.OpAMD64CMPWload, ssa.OpAMD64CMPBload:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_MEM
p.From.Reg = v.Args[0].Reg()
[dev.regabi] cmd/compile: split out package ssagen [generated] [git-generate] cd src/cmd/compile/internal/gc rf ' # maxOpenDefers is declared in ssa.go but used only by walk. mv maxOpenDefers walk.go # gc.Arch -> ssagen.Arch # It is not as nice but will do for now. mv Arch ArchInfo mv thearch Arch mv Arch ArchInfo arch.go # Pull dwarf out of pgen.go. mv debuginfo declPos createDwarfVars preInliningDcls \ createSimpleVars createSimpleVar \ createComplexVars createComplexVar \ dwarf.go # Pull high-level compilation out of pgen.go, # leaving only the SSA code. mv compilequeue funccompile compile compilenow \ compileFunctions isInlinableButNotInlined \ initLSym \ compile.go mv BoundsCheckFunc GCWriteBarrierReg ssa.go mv largeStack largeStackFrames CheckLargeStacks pgen.go # All that is left in dcl.go is the nowritebarrierrecCheck mv dcl.go nowb.go # Export API and unexport non-API. mv initssaconfig InitConfig mv isIntrinsicCall IsIntrinsicCall mv ssaDumpInline DumpInline mv initSSATables InitTables mv initSSAEnv InitEnv mv compileSSA Compile mv stackOffset StackOffset mv canSSAType TypeOK mv SSAGenState State mv FwdRefAux fwdRefAux mv cgoSymABIs CgoSymABIs mv readSymABIs ReadSymABIs mv initLSym InitLSym mv useABIWrapGen symabiDefs CgoSymABIs ReadSymABIs InitLSym selectLSym makeABIWrapper setupTextLSym abi.go mv arch.go abi.go nowb.go phi.go pgen.go pgen_test.go ssa.go cmd/compile/internal/ssagen ' rm go.go gsubr.go Change-Id: I47fad6cbf1d1e583fd9139003a08401d7cd048a1 Reviewed-on: https://go-review.googlesource.com/c/go/+/279476 Trust: Russ Cox <rsc@golang.org> Run-TryBot: Russ Cox <rsc@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com>
2020-12-23 00:57:10 -05:00
ssagen.AddAux(&p.From, v)
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Args[1].Reg()
case ssa.OpAMD64CMPQconstload, ssa.OpAMD64CMPLconstload, ssa.OpAMD64CMPWconstload, ssa.OpAMD64CMPBconstload:
sc := v.AuxValAndOff()
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_MEM
p.From.Reg = v.Args[0].Reg()
ssagen.AddAux2(&p.From, v, sc.Off64())
p.To.Type = obj.TYPE_CONST
p.To.Offset = sc.Val64()
case ssa.OpAMD64CMPQloadidx8, ssa.OpAMD64CMPQloadidx1, ssa.OpAMD64CMPLloadidx4, ssa.OpAMD64CMPLloadidx1, ssa.OpAMD64CMPWloadidx2, ssa.OpAMD64CMPWloadidx1, ssa.OpAMD64CMPBloadidx1:
p := s.Prog(v.Op.Asm())
memIdx(&p.From, v)
[dev.regabi] cmd/compile: split out package ssagen [generated] [git-generate] cd src/cmd/compile/internal/gc rf ' # maxOpenDefers is declared in ssa.go but used only by walk. mv maxOpenDefers walk.go # gc.Arch -> ssagen.Arch # It is not as nice but will do for now. mv Arch ArchInfo mv thearch Arch mv Arch ArchInfo arch.go # Pull dwarf out of pgen.go. mv debuginfo declPos createDwarfVars preInliningDcls \ createSimpleVars createSimpleVar \ createComplexVars createComplexVar \ dwarf.go # Pull high-level compilation out of pgen.go, # leaving only the SSA code. mv compilequeue funccompile compile compilenow \ compileFunctions isInlinableButNotInlined \ initLSym \ compile.go mv BoundsCheckFunc GCWriteBarrierReg ssa.go mv largeStack largeStackFrames CheckLargeStacks pgen.go # All that is left in dcl.go is the nowritebarrierrecCheck mv dcl.go nowb.go # Export API and unexport non-API. mv initssaconfig InitConfig mv isIntrinsicCall IsIntrinsicCall mv ssaDumpInline DumpInline mv initSSATables InitTables mv initSSAEnv InitEnv mv compileSSA Compile mv stackOffset StackOffset mv canSSAType TypeOK mv SSAGenState State mv FwdRefAux fwdRefAux mv cgoSymABIs CgoSymABIs mv readSymABIs ReadSymABIs mv initLSym InitLSym mv useABIWrapGen symabiDefs CgoSymABIs ReadSymABIs InitLSym selectLSym makeABIWrapper setupTextLSym abi.go mv arch.go abi.go nowb.go phi.go pgen.go pgen_test.go ssa.go cmd/compile/internal/ssagen ' rm go.go gsubr.go Change-Id: I47fad6cbf1d1e583fd9139003a08401d7cd048a1 Reviewed-on: https://go-review.googlesource.com/c/go/+/279476 Trust: Russ Cox <rsc@golang.org> Run-TryBot: Russ Cox <rsc@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com>
2020-12-23 00:57:10 -05:00
ssagen.AddAux(&p.From, v)
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Args[2].Reg()
case ssa.OpAMD64CMPQconstloadidx8, ssa.OpAMD64CMPQconstloadidx1, ssa.OpAMD64CMPLconstloadidx4, ssa.OpAMD64CMPLconstloadidx1, ssa.OpAMD64CMPWconstloadidx2, ssa.OpAMD64CMPWconstloadidx1, ssa.OpAMD64CMPBconstloadidx1:
sc := v.AuxValAndOff()
p := s.Prog(v.Op.Asm())
memIdx(&p.From, v)
ssagen.AddAux2(&p.From, v, sc.Off64())
p.To.Type = obj.TYPE_CONST
p.To.Offset = sc.Val64()
case ssa.OpAMD64MOVLconst, ssa.OpAMD64MOVQconst:
x := v.Reg()
// If flags aren't live (indicated by v.Aux == nil),
// then we can rewrite MOV $0, AX into XOR AX, AX.
if v.AuxInt == 0 && v.Aux == nil {
opregreg(s, x86.AXORL, x, x)
break
}
cmd/compile: use MOVL instead of MOVQ for small constants on amd64 The encoding of MOVL to a register is 2 bytes shorter than for MOVQ. The upper 32bit are automatically zeroed when MOVL to a register is used. Replaces 1657 MOVQ by MOVL in the go binary. Reduces go binary size by 4 kilobyte. name old time/op new time/op delta BinaryTree17 1.93s ± 0% 1.93s ± 0% -0.32% (p=0.000 n=9+9) Fannkuch11 2.66s ± 0% 2.48s ± 0% -6.60% (p=0.000 n=9+9) FmtFprintfEmpty 31.8ns ± 0% 31.6ns ± 0% -0.63% (p=0.000 n=10+10) FmtFprintfString 52.0ns ± 0% 51.9ns ± 0% -0.19% (p=0.000 n=10+10) FmtFprintfInt 55.6ns ± 0% 54.6ns ± 0% -1.80% (p=0.002 n=8+10) FmtFprintfIntInt 87.7ns ± 0% 84.8ns ± 0% -3.31% (p=0.000 n=9+9) FmtFprintfPrefixedInt 98.9ns ± 0% 102.0ns ± 0% +3.10% (p=0.000 n=10+10) FmtFprintfFloat 165ns ± 0% 164ns ± 0% -0.61% (p=0.000 n=10+10) FmtManyArgs 368ns ± 0% 361ns ± 0% -1.98% (p=0.000 n=8+10) GobDecode 4.53ms ± 0% 4.58ms ± 0% +1.08% (p=0.000 n=9+10) GobEncode 3.74ms ± 0% 3.73ms ± 0% -0.27% (p=0.000 n=10+10) Gzip 164ms ± 0% 163ms ± 0% -0.48% (p=0.000 n=10+10) Gunzip 26.7ms ± 0% 26.6ms ± 0% -0.13% (p=0.000 n=9+10) HTTPClientServer 30.4µs ± 1% 30.3µs ± 1% -0.41% (p=0.016 n=10+10) JSONEncode 10.9ms ± 0% 11.0ms ± 0% +0.70% (p=0.000 n=10+10) JSONDecode 36.8ms ± 0% 37.0ms ± 0% +0.59% (p=0.000 n=9+10) Mandelbrot200 3.20ms ± 0% 3.21ms ± 0% +0.44% (p=0.000 n=9+10) GoParse 2.35ms ± 0% 2.35ms ± 0% +0.26% (p=0.000 n=10+9) RegexpMatchEasy0_32 58.3ns ± 0% 58.4ns ± 0% +0.17% (p=0.000 n=10+10) RegexpMatchEasy0_1K 138ns ± 0% 142ns ± 0% +2.68% (p=0.000 n=10+10) RegexpMatchEasy1_32 55.1ns ± 0% 55.6ns ± 1% ~ (p=0.104 n=10+10) RegexpMatchEasy1_1K 242ns ± 0% 243ns ± 0% +0.41% (p=0.000 n=10+10) RegexpMatchMedium_32 87.4ns ± 0% 89.9ns ± 0% +2.86% (p=0.000 n=10+10) RegexpMatchMedium_1K 27.4µs ± 0% 27.4µs ± 0% +0.15% (p=0.000 n=10+10) RegexpMatchHard_32 1.30µs ± 0% 1.32µs ± 1% +1.91% (p=0.000 n=10+10) RegexpMatchHard_1K 39.0µs ± 0% 39.5µs ± 0% +1.38% (p=0.000 n=10+10) Revcomp 316ms ± 0% 319ms ± 0% +1.13% (p=0.000 n=9+8) Template 40.6ms ± 0% 40.6ms ± 0% ~ (p=0.123 n=10+10) TimeParse 224ns ± 0% 224ns ± 0% ~ (all equal) TimeFormat 230ns ± 0% 225ns ± 0% -2.17% (p=0.000 n=10+10) Change-Id: I32a099b65f9e6d4ad7288ed48546655c534757d8 Reviewed-on: https://go-review.googlesource.com/38630 Run-TryBot: Martin Möhrmann <moehrmann@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2017-03-24 08:13:17 +01:00
asm := v.Op.Asm()
// Use MOVL to move a small constant into a register
// when the constant is positive and fits into 32 bits.
if 0 <= v.AuxInt && v.AuxInt <= (1<<32-1) {
// The upper 32bit are zeroed automatically when using MOVL.
asm = x86.AMOVL
}
p := s.Prog(asm)
p.From.Type = obj.TYPE_CONST
p.From.Offset = v.AuxInt
p.To.Type = obj.TYPE_REG
p.To.Reg = x
case ssa.OpAMD64MOVSSconst, ssa.OpAMD64MOVSDconst:
x := v.Reg()
if !isFPReg(x) && v.AuxInt == 0 && v.Aux == nil {
opregreg(s, x86.AXORL, x, x)
break
}
p := s.Prog(storeByRegWidth(x, v.Type.Size()))
p.From.Type = obj.TYPE_FCONST
p.From.Val = math.Float64frombits(uint64(v.AuxInt))
p.To.Type = obj.TYPE_REG
p.To.Reg = x
case ssa.OpAMD64MOVQload, ssa.OpAMD64MOVLload, ssa.OpAMD64MOVWload, ssa.OpAMD64MOVBload, ssa.OpAMD64MOVOload,
ssa.OpAMD64MOVSSload, ssa.OpAMD64MOVSDload, ssa.OpAMD64MOVBQSXload, ssa.OpAMD64MOVWQSXload, ssa.OpAMD64MOVLQSXload,
ssa.OpAMD64MOVBEQload, ssa.OpAMD64MOVBELload:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_MEM
p.From.Reg = v.Args[0].Reg()
[dev.regabi] cmd/compile: split out package ssagen [generated] [git-generate] cd src/cmd/compile/internal/gc rf ' # maxOpenDefers is declared in ssa.go but used only by walk. mv maxOpenDefers walk.go # gc.Arch -> ssagen.Arch # It is not as nice but will do for now. mv Arch ArchInfo mv thearch Arch mv Arch ArchInfo arch.go # Pull dwarf out of pgen.go. mv debuginfo declPos createDwarfVars preInliningDcls \ createSimpleVars createSimpleVar \ createComplexVars createComplexVar \ dwarf.go # Pull high-level compilation out of pgen.go, # leaving only the SSA code. mv compilequeue funccompile compile compilenow \ compileFunctions isInlinableButNotInlined \ initLSym \ compile.go mv BoundsCheckFunc GCWriteBarrierReg ssa.go mv largeStack largeStackFrames CheckLargeStacks pgen.go # All that is left in dcl.go is the nowritebarrierrecCheck mv dcl.go nowb.go # Export API and unexport non-API. mv initssaconfig InitConfig mv isIntrinsicCall IsIntrinsicCall mv ssaDumpInline DumpInline mv initSSATables InitTables mv initSSAEnv InitEnv mv compileSSA Compile mv stackOffset StackOffset mv canSSAType TypeOK mv SSAGenState State mv FwdRefAux fwdRefAux mv cgoSymABIs CgoSymABIs mv readSymABIs ReadSymABIs mv initLSym InitLSym mv useABIWrapGen symabiDefs CgoSymABIs ReadSymABIs InitLSym selectLSym makeABIWrapper setupTextLSym abi.go mv arch.go abi.go nowb.go phi.go pgen.go pgen_test.go ssa.go cmd/compile/internal/ssagen ' rm go.go gsubr.go Change-Id: I47fad6cbf1d1e583fd9139003a08401d7cd048a1 Reviewed-on: https://go-review.googlesource.com/c/go/+/279476 Trust: Russ Cox <rsc@golang.org> Run-TryBot: Russ Cox <rsc@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com>
2020-12-23 00:57:10 -05:00
ssagen.AddAux(&p.From, v)
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpAMD64MOVBloadidx1, ssa.OpAMD64MOVWloadidx1, ssa.OpAMD64MOVLloadidx1, ssa.OpAMD64MOVQloadidx1, ssa.OpAMD64MOVSSloadidx1, ssa.OpAMD64MOVSDloadidx1,
ssa.OpAMD64MOVQloadidx8, ssa.OpAMD64MOVSDloadidx8, ssa.OpAMD64MOVLloadidx8, ssa.OpAMD64MOVLloadidx4, ssa.OpAMD64MOVSSloadidx4, ssa.OpAMD64MOVWloadidx2,
ssa.OpAMD64MOVBELloadidx1, ssa.OpAMD64MOVBELloadidx4, ssa.OpAMD64MOVBELloadidx8, ssa.OpAMD64MOVBEQloadidx1, ssa.OpAMD64MOVBEQloadidx8:
p := s.Prog(v.Op.Asm())
memIdx(&p.From, v)
[dev.regabi] cmd/compile: split out package ssagen [generated] [git-generate] cd src/cmd/compile/internal/gc rf ' # maxOpenDefers is declared in ssa.go but used only by walk. mv maxOpenDefers walk.go # gc.Arch -> ssagen.Arch # It is not as nice but will do for now. mv Arch ArchInfo mv thearch Arch mv Arch ArchInfo arch.go # Pull dwarf out of pgen.go. mv debuginfo declPos createDwarfVars preInliningDcls \ createSimpleVars createSimpleVar \ createComplexVars createComplexVar \ dwarf.go # Pull high-level compilation out of pgen.go, # leaving only the SSA code. mv compilequeue funccompile compile compilenow \ compileFunctions isInlinableButNotInlined \ initLSym \ compile.go mv BoundsCheckFunc GCWriteBarrierReg ssa.go mv largeStack largeStackFrames CheckLargeStacks pgen.go # All that is left in dcl.go is the nowritebarrierrecCheck mv dcl.go nowb.go # Export API and unexport non-API. mv initssaconfig InitConfig mv isIntrinsicCall IsIntrinsicCall mv ssaDumpInline DumpInline mv initSSATables InitTables mv initSSAEnv InitEnv mv compileSSA Compile mv stackOffset StackOffset mv canSSAType TypeOK mv SSAGenState State mv FwdRefAux fwdRefAux mv cgoSymABIs CgoSymABIs mv readSymABIs ReadSymABIs mv initLSym InitLSym mv useABIWrapGen symabiDefs CgoSymABIs ReadSymABIs InitLSym selectLSym makeABIWrapper setupTextLSym abi.go mv arch.go abi.go nowb.go phi.go pgen.go pgen_test.go ssa.go cmd/compile/internal/ssagen ' rm go.go gsubr.go Change-Id: I47fad6cbf1d1e583fd9139003a08401d7cd048a1 Reviewed-on: https://go-review.googlesource.com/c/go/+/279476 Trust: Russ Cox <rsc@golang.org> Run-TryBot: Russ Cox <rsc@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com>
2020-12-23 00:57:10 -05:00
ssagen.AddAux(&p.From, v)
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
cmd/compile: introduce more read-modify-write operations for amd64 Add suport of read-modify-write for AND/SUB/AND/OR/XOR on amd64. 1. The total size of pkg/linux_amd64 decreases about 4KB, excluding cmd/compile. 2. The go1 benchmark shows a little improvement, excluding noise. name old time/op new time/op delta BinaryTree17-4 2.63s ± 3% 2.65s ± 4% +1.01% (p=0.037 n=35+35) Fannkuch11-4 2.33s ± 2% 2.39s ± 2% +2.49% (p=0.000 n=35+35) FmtFprintfEmpty-4 45.4ns ± 5% 40.8ns ± 6% -10.09% (p=0.000 n=35+35) FmtFprintfString-4 73.3ns ± 4% 70.9ns ± 3% -3.23% (p=0.000 n=30+35) FmtFprintfInt-4 79.9ns ± 4% 79.5ns ± 3% ~ (p=0.736 n=34+35) FmtFprintfIntInt-4 126ns ± 4% 125ns ± 4% ~ (p=0.083 n=35+35) FmtFprintfPrefixedInt-4 152ns ± 6% 152ns ± 3% ~ (p=0.855 n=34+35) FmtFprintfFloat-4 215ns ± 4% 213ns ± 4% ~ (p=0.066 n=35+35) FmtManyArgs-4 522ns ± 3% 506ns ± 3% -3.15% (p=0.000 n=35+35) GobDecode-4 6.45ms ± 8% 6.51ms ± 7% +0.96% (p=0.026 n=35+35) GobEncode-4 6.10ms ± 6% 6.02ms ± 8% ~ (p=0.160 n=35+35) Gzip-4 228ms ± 3% 221ms ± 3% -2.92% (p=0.000 n=35+35) Gunzip-4 37.5ms ± 4% 37.2ms ± 3% -0.78% (p=0.036 n=35+35) HTTPClientServer-4 58.7µs ± 2% 59.2µs ± 1% +0.80% (p=0.000 n=33+33) JSONEncode-4 12.0ms ± 3% 12.2ms ± 3% +1.84% (p=0.008 n=35+35) JSONDecode-4 57.0ms ± 4% 56.6ms ± 3% ~ (p=0.320 n=35+35) Mandelbrot200-4 3.82ms ± 3% 3.79ms ± 3% ~ (p=0.074 n=35+35) GoParse-4 3.21ms ± 5% 3.24ms ± 4% ~ (p=0.119 n=35+35) RegexpMatchEasy0_32-4 76.3ns ± 4% 75.4ns ± 4% -1.14% (p=0.014 n=34+33) RegexpMatchEasy0_1K-4 251ns ± 4% 254ns ± 3% +1.28% (p=0.016 n=35+35) RegexpMatchEasy1_32-4 69.6ns ± 3% 70.1ns ± 3% +0.82% (p=0.005 n=35+35) RegexpMatchEasy1_1K-4 367ns ± 4% 376ns ± 4% +2.47% (p=0.000 n=35+35) RegexpMatchMedium_32-4 108ns ± 5% 104ns ± 4% -3.18% (p=0.000 n=35+35) RegexpMatchMedium_1K-4 33.8µs ± 3% 32.7µs ± 3% -3.27% (p=0.000 n=35+35) RegexpMatchHard_32-4 1.55µs ± 3% 1.52µs ± 3% -1.64% (p=0.000 n=35+35) RegexpMatchHard_1K-4 46.6µs ± 3% 46.6µs ± 4% ~ (p=0.149 n=35+35) Revcomp-4 416ms ± 7% 412ms ± 6% -0.95% (p=0.033 n=33+35) Template-4 64.3ms ± 3% 62.4ms ± 7% -2.94% (p=0.000 n=35+35) TimeParse-4 320ns ± 2% 322ns ± 3% ~ (p=0.589 n=35+35) TimeFormat-4 300ns ± 3% 300ns ± 3% ~ (p=0.597 n=35+35) [Geo mean] 47.4µs 47.0µs -0.86% name old speed new speed delta GobDecode-4 119MB/s ± 7% 118MB/s ± 7% -0.96% (p=0.027 n=35+35) GobEncode-4 126MB/s ± 7% 127MB/s ± 6% ~ (p=0.157 n=34+34) Gzip-4 85.3MB/s ± 3% 87.9MB/s ± 3% +3.02% (p=0.000 n=35+35) Gunzip-4 518MB/s ± 4% 522MB/s ± 3% +0.79% (p=0.037 n=35+35) JSONEncode-4 162MB/s ± 3% 159MB/s ± 3% -1.81% (p=0.009 n=35+35) JSONDecode-4 34.1MB/s ± 4% 34.3MB/s ± 3% ~ (p=0.318 n=35+35) GoParse-4 18.0MB/s ± 5% 17.9MB/s ± 4% ~ (p=0.117 n=35+35) RegexpMatchEasy0_32-4 419MB/s ± 3% 425MB/s ± 4% +1.46% (p=0.003 n=32+33) RegexpMatchEasy0_1K-4 4.07GB/s ± 4% 4.02GB/s ± 3% -1.28% (p=0.014 n=35+35) RegexpMatchEasy1_32-4 460MB/s ± 3% 456MB/s ± 4% -0.82% (p=0.004 n=35+35) RegexpMatchEasy1_1K-4 2.79GB/s ± 4% 2.72GB/s ± 4% -2.39% (p=0.000 n=35+35) RegexpMatchMedium_32-4 9.23MB/s ± 4% 9.53MB/s ± 4% +3.16% (p=0.000 n=35+35) RegexpMatchMedium_1K-4 30.3MB/s ± 3% 31.3MB/s ± 3% +3.38% (p=0.000 n=35+35) RegexpMatchHard_32-4 20.7MB/s ± 3% 21.0MB/s ± 3% +1.67% (p=0.000 n=35+35) RegexpMatchHard_1K-4 22.0MB/s ± 3% 21.9MB/s ± 4% ~ (p=0.277 n=35+33) Revcomp-4 612MB/s ± 7% 618MB/s ± 6% +0.96% (p=0.034 n=33+35) Template-4 30.2MB/s ± 3% 31.1MB/s ± 6% +3.05% (p=0.000 n=35+35) [Geo mean] 123MB/s 124MB/s +0.64% Change-Id: Ia025da272e07d0069413824bfff3471b106d6280 Reviewed-on: https://go-review.googlesource.com/121535 Run-TryBot: Ben Shi <powerman1st@163.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Ilya Tocar <ilya.tocar@intel.com> Reviewed-by: Keith Randall <khr@golang.org>
2018-06-29 02:11:53 +00:00
case ssa.OpAMD64MOVQstore, ssa.OpAMD64MOVSSstore, ssa.OpAMD64MOVSDstore, ssa.OpAMD64MOVLstore, ssa.OpAMD64MOVWstore, ssa.OpAMD64MOVBstore, ssa.OpAMD64MOVOstore,
ssa.OpAMD64ADDQmodify, ssa.OpAMD64SUBQmodify, ssa.OpAMD64ANDQmodify, ssa.OpAMD64ORQmodify, ssa.OpAMD64XORQmodify,
ssa.OpAMD64ADDLmodify, ssa.OpAMD64SUBLmodify, ssa.OpAMD64ANDLmodify, ssa.OpAMD64ORLmodify, ssa.OpAMD64XORLmodify,
ssa.OpAMD64MOVBEQstore, ssa.OpAMD64MOVBELstore, ssa.OpAMD64MOVBEWstore:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[1].Reg()
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
[dev.regabi] cmd/compile: split out package ssagen [generated] [git-generate] cd src/cmd/compile/internal/gc rf ' # maxOpenDefers is declared in ssa.go but used only by walk. mv maxOpenDefers walk.go # gc.Arch -> ssagen.Arch # It is not as nice but will do for now. mv Arch ArchInfo mv thearch Arch mv Arch ArchInfo arch.go # Pull dwarf out of pgen.go. mv debuginfo declPos createDwarfVars preInliningDcls \ createSimpleVars createSimpleVar \ createComplexVars createComplexVar \ dwarf.go # Pull high-level compilation out of pgen.go, # leaving only the SSA code. mv compilequeue funccompile compile compilenow \ compileFunctions isInlinableButNotInlined \ initLSym \ compile.go mv BoundsCheckFunc GCWriteBarrierReg ssa.go mv largeStack largeStackFrames CheckLargeStacks pgen.go # All that is left in dcl.go is the nowritebarrierrecCheck mv dcl.go nowb.go # Export API and unexport non-API. mv initssaconfig InitConfig mv isIntrinsicCall IsIntrinsicCall mv ssaDumpInline DumpInline mv initSSATables InitTables mv initSSAEnv InitEnv mv compileSSA Compile mv stackOffset StackOffset mv canSSAType TypeOK mv SSAGenState State mv FwdRefAux fwdRefAux mv cgoSymABIs CgoSymABIs mv readSymABIs ReadSymABIs mv initLSym InitLSym mv useABIWrapGen symabiDefs CgoSymABIs ReadSymABIs InitLSym selectLSym makeABIWrapper setupTextLSym abi.go mv arch.go abi.go nowb.go phi.go pgen.go pgen_test.go ssa.go cmd/compile/internal/ssagen ' rm go.go gsubr.go Change-Id: I47fad6cbf1d1e583fd9139003a08401d7cd048a1 Reviewed-on: https://go-review.googlesource.com/c/go/+/279476 Trust: Russ Cox <rsc@golang.org> Run-TryBot: Russ Cox <rsc@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com>
2020-12-23 00:57:10 -05:00
ssagen.AddAux(&p.To, v)
case ssa.OpAMD64MOVBstoreidx1, ssa.OpAMD64MOVWstoreidx1, ssa.OpAMD64MOVLstoreidx1, ssa.OpAMD64MOVQstoreidx1, ssa.OpAMD64MOVSSstoreidx1, ssa.OpAMD64MOVSDstoreidx1,
ssa.OpAMD64MOVQstoreidx8, ssa.OpAMD64MOVSDstoreidx8, ssa.OpAMD64MOVLstoreidx8, ssa.OpAMD64MOVSSstoreidx4, ssa.OpAMD64MOVLstoreidx4, ssa.OpAMD64MOVWstoreidx2,
ssa.OpAMD64ADDLmodifyidx1, ssa.OpAMD64ADDLmodifyidx4, ssa.OpAMD64ADDLmodifyidx8, ssa.OpAMD64ADDQmodifyidx1, ssa.OpAMD64ADDQmodifyidx8,
ssa.OpAMD64SUBLmodifyidx1, ssa.OpAMD64SUBLmodifyidx4, ssa.OpAMD64SUBLmodifyidx8, ssa.OpAMD64SUBQmodifyidx1, ssa.OpAMD64SUBQmodifyidx8,
ssa.OpAMD64ANDLmodifyidx1, ssa.OpAMD64ANDLmodifyidx4, ssa.OpAMD64ANDLmodifyidx8, ssa.OpAMD64ANDQmodifyidx1, ssa.OpAMD64ANDQmodifyidx8,
ssa.OpAMD64ORLmodifyidx1, ssa.OpAMD64ORLmodifyidx4, ssa.OpAMD64ORLmodifyidx8, ssa.OpAMD64ORQmodifyidx1, ssa.OpAMD64ORQmodifyidx8,
ssa.OpAMD64XORLmodifyidx1, ssa.OpAMD64XORLmodifyidx4, ssa.OpAMD64XORLmodifyidx8, ssa.OpAMD64XORQmodifyidx1, ssa.OpAMD64XORQmodifyidx8,
ssa.OpAMD64MOVBEWstoreidx1, ssa.OpAMD64MOVBEWstoreidx2, ssa.OpAMD64MOVBELstoreidx1, ssa.OpAMD64MOVBELstoreidx4, ssa.OpAMD64MOVBELstoreidx8, ssa.OpAMD64MOVBEQstoreidx1, ssa.OpAMD64MOVBEQstoreidx8:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[2].Reg()
memIdx(&p.To, v)
[dev.regabi] cmd/compile: split out package ssagen [generated] [git-generate] cd src/cmd/compile/internal/gc rf ' # maxOpenDefers is declared in ssa.go but used only by walk. mv maxOpenDefers walk.go # gc.Arch -> ssagen.Arch # It is not as nice but will do for now. mv Arch ArchInfo mv thearch Arch mv Arch ArchInfo arch.go # Pull dwarf out of pgen.go. mv debuginfo declPos createDwarfVars preInliningDcls \ createSimpleVars createSimpleVar \ createComplexVars createComplexVar \ dwarf.go # Pull high-level compilation out of pgen.go, # leaving only the SSA code. mv compilequeue funccompile compile compilenow \ compileFunctions isInlinableButNotInlined \ initLSym \ compile.go mv BoundsCheckFunc GCWriteBarrierReg ssa.go mv largeStack largeStackFrames CheckLargeStacks pgen.go # All that is left in dcl.go is the nowritebarrierrecCheck mv dcl.go nowb.go # Export API and unexport non-API. mv initssaconfig InitConfig mv isIntrinsicCall IsIntrinsicCall mv ssaDumpInline DumpInline mv initSSATables InitTables mv initSSAEnv InitEnv mv compileSSA Compile mv stackOffset StackOffset mv canSSAType TypeOK mv SSAGenState State mv FwdRefAux fwdRefAux mv cgoSymABIs CgoSymABIs mv readSymABIs ReadSymABIs mv initLSym InitLSym mv useABIWrapGen symabiDefs CgoSymABIs ReadSymABIs InitLSym selectLSym makeABIWrapper setupTextLSym abi.go mv arch.go abi.go nowb.go phi.go pgen.go pgen_test.go ssa.go cmd/compile/internal/ssagen ' rm go.go gsubr.go Change-Id: I47fad6cbf1d1e583fd9139003a08401d7cd048a1 Reviewed-on: https://go-review.googlesource.com/c/go/+/279476 Trust: Russ Cox <rsc@golang.org> Run-TryBot: Russ Cox <rsc@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com>
2020-12-23 00:57:10 -05:00
ssagen.AddAux(&p.To, v)
case ssa.OpAMD64ADDQconstmodify, ssa.OpAMD64ADDLconstmodify:
sc := v.AuxValAndOff()
off := sc.Off64()
val := sc.Val()
cmd/compile: optimize amd64's ADDQconstmodify/ADDLconstmodify This CL optimize amd64's code: "ADDQ $-1, MEM_OP" -> "DECQ MEM_OP" "ADDL $-1, MEM_OP" -> "DECL MEM_OP" 1. The total size of pkg/linux_amd64 (excluding cmd/compile) decreases about 0.1KB. 2. The go1 benchmark shows little regression, excluding noise. name old time/op new time/op delta BinaryTree17-4 2.60s ± 5% 2.64s ± 3% +1.53% (p=0.000 n=38+39) Fannkuch11-4 2.37s ± 2% 2.38s ± 2% ~ (p=0.950 n=40+40) FmtFprintfEmpty-4 40.4ns ± 5% 40.5ns ± 5% ~ (p=0.711 n=40+40) FmtFprintfString-4 72.4ns ± 5% 72.3ns ± 3% ~ (p=0.485 n=40+40) FmtFprintfInt-4 79.7ns ± 3% 80.1ns ± 3% ~ (p=0.124 n=40+40) FmtFprintfIntInt-4 126ns ± 3% 127ns ± 3% +0.71% (p=0.027 n=40+40) FmtFprintfPrefixedInt-4 153ns ± 4% 153ns ± 2% ~ (p=0.604 n=40+40) FmtFprintfFloat-4 206ns ± 5% 210ns ± 5% +1.79% (p=0.002 n=40+40) FmtManyArgs-4 498ns ± 3% 496ns ± 3% ~ (p=0.099 n=40+40) GobDecode-4 6.48ms ± 6% 6.47ms ± 7% ~ (p=0.686 n=39+40) GobEncode-4 5.95ms ± 7% 5.96ms ± 6% ~ (p=0.670 n=40+34) Gzip-4 224ms ± 6% 223ms ± 5% ~ (p=0.143 n=40+40) Gunzip-4 36.5ms ± 4% 36.5ms ± 4% ~ (p=0.556 n=40+40) HTTPClientServer-4 60.7µs ± 2% 59.9µs ± 3% -1.20% (p=0.000 n=39+39) JSONEncode-4 9.03ms ± 4% 9.04ms ± 4% ~ (p=0.589 n=40+40) JSONDecode-4 49.4ms ± 4% 49.2ms ± 4% ~ (p=0.276 n=40+40) Mandelbrot200-4 3.80ms ± 4% 3.79ms ± 4% ~ (p=0.837 n=40+40) GoParse-4 3.15ms ± 5% 3.13ms ± 5% ~ (p=0.240 n=40+40) RegexpMatchEasy0_32-4 72.9ns ± 3% 72.0ns ± 8% -1.25% (p=0.003 n=40+40) RegexpMatchEasy0_1K-4 229ns ± 5% 230ns ± 4% ~ (p=0.318 n=40+40) RegexpMatchEasy1_32-4 66.9ns ± 3% 67.3ns ± 7% ~ (p=0.817 n=40+40) RegexpMatchEasy1_1K-4 371ns ± 5% 370ns ± 4% ~ (p=0.275 n=40+40) RegexpMatchMedium_32-4 106ns ± 4% 104ns ± 7% -2.28% (p=0.000 n=40+40) RegexpMatchMedium_1K-4 32.0µs ± 2% 31.4µs ± 3% -2.08% (p=0.000 n=40+40) RegexpMatchHard_32-4 1.54µs ± 7% 1.52µs ± 3% -1.80% (p=0.007 n=39+40) RegexpMatchHard_1K-4 45.8µs ± 4% 45.5µs ± 3% ~ (p=0.707 n=40+40) Revcomp-4 401ms ± 5% 401ms ± 6% ~ (p=0.935 n=40+40) Template-4 62.4ms ± 4% 61.2ms ± 3% -1.85% (p=0.000 n=40+40) TimeParse-4 315ns ± 2% 318ns ± 3% +1.10% (p=0.002 n=40+40) TimeFormat-4 297ns ± 3% 298ns ± 3% ~ (p=0.238 n=40+40) [Geo mean] 45.8µs 45.7µs -0.22% name old speed new speed delta GobDecode-4 119MB/s ± 6% 119MB/s ± 7% ~ (p=0.684 n=39+40) GobEncode-4 129MB/s ± 7% 128MB/s ± 6% ~ (p=0.413 n=40+34) Gzip-4 86.6MB/s ± 6% 87.0MB/s ± 6% ~ (p=0.145 n=40+40) Gunzip-4 532MB/s ± 4% 532MB/s ± 4% ~ (p=0.556 n=40+40) JSONEncode-4 215MB/s ± 4% 215MB/s ± 4% ~ (p=0.583 n=40+40) JSONDecode-4 39.3MB/s ± 4% 39.5MB/s ± 4% ~ (p=0.277 n=40+40) GoParse-4 18.4MB/s ± 5% 18.5MB/s ± 5% ~ (p=0.229 n=40+40) RegexpMatchEasy0_32-4 439MB/s ± 3% 445MB/s ± 8% +1.28% (p=0.003 n=40+40) RegexpMatchEasy0_1K-4 4.46GB/s ± 4% 4.45GB/s ± 4% ~ (p=0.343 n=40+40) RegexpMatchEasy1_32-4 479MB/s ± 3% 476MB/s ± 7% ~ (p=0.855 n=40+40) RegexpMatchEasy1_1K-4 2.76GB/s ± 5% 2.77GB/s ± 4% ~ (p=0.250 n=40+40) RegexpMatchMedium_32-4 9.36MB/s ± 4% 9.58MB/s ± 6% +2.31% (p=0.001 n=40+40) RegexpMatchMedium_1K-4 32.0MB/s ± 2% 32.7MB/s ± 3% +2.12% (p=0.000 n=40+40) RegexpMatchHard_32-4 20.7MB/s ± 7% 21.1MB/s ± 3% +1.95% (p=0.005 n=40+40) RegexpMatchHard_1K-4 22.4MB/s ± 4% 22.5MB/s ± 3% ~ (p=0.689 n=40+40) Revcomp-4 634MB/s ± 5% 634MB/s ± 6% ~ (p=0.935 n=40+40) Template-4 31.1MB/s ± 3% 31.7MB/s ± 3% +1.88% (p=0.000 n=40+40) [Geo mean] 129MB/s 130MB/s +0.62% Change-Id: I9d61ee810d900920c572cbe89e2f1626bfed12b7 Reviewed-on: https://go-review.googlesource.com/c/145209 Run-TryBot: Ben Shi <powerman1st@163.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2018-10-29 08:34:42 +00:00
if val == 1 || val == -1 {
var asm obj.As
if v.Op == ssa.OpAMD64ADDQconstmodify {
cmd/compile: optimize amd64's ADDQconstmodify/ADDLconstmodify This CL optimize amd64's code: "ADDQ $-1, MEM_OP" -> "DECQ MEM_OP" "ADDL $-1, MEM_OP" -> "DECL MEM_OP" 1. The total size of pkg/linux_amd64 (excluding cmd/compile) decreases about 0.1KB. 2. The go1 benchmark shows little regression, excluding noise. name old time/op new time/op delta BinaryTree17-4 2.60s ± 5% 2.64s ± 3% +1.53% (p=0.000 n=38+39) Fannkuch11-4 2.37s ± 2% 2.38s ± 2% ~ (p=0.950 n=40+40) FmtFprintfEmpty-4 40.4ns ± 5% 40.5ns ± 5% ~ (p=0.711 n=40+40) FmtFprintfString-4 72.4ns ± 5% 72.3ns ± 3% ~ (p=0.485 n=40+40) FmtFprintfInt-4 79.7ns ± 3% 80.1ns ± 3% ~ (p=0.124 n=40+40) FmtFprintfIntInt-4 126ns ± 3% 127ns ± 3% +0.71% (p=0.027 n=40+40) FmtFprintfPrefixedInt-4 153ns ± 4% 153ns ± 2% ~ (p=0.604 n=40+40) FmtFprintfFloat-4 206ns ± 5% 210ns ± 5% +1.79% (p=0.002 n=40+40) FmtManyArgs-4 498ns ± 3% 496ns ± 3% ~ (p=0.099 n=40+40) GobDecode-4 6.48ms ± 6% 6.47ms ± 7% ~ (p=0.686 n=39+40) GobEncode-4 5.95ms ± 7% 5.96ms ± 6% ~ (p=0.670 n=40+34) Gzip-4 224ms ± 6% 223ms ± 5% ~ (p=0.143 n=40+40) Gunzip-4 36.5ms ± 4% 36.5ms ± 4% ~ (p=0.556 n=40+40) HTTPClientServer-4 60.7µs ± 2% 59.9µs ± 3% -1.20% (p=0.000 n=39+39) JSONEncode-4 9.03ms ± 4% 9.04ms ± 4% ~ (p=0.589 n=40+40) JSONDecode-4 49.4ms ± 4% 49.2ms ± 4% ~ (p=0.276 n=40+40) Mandelbrot200-4 3.80ms ± 4% 3.79ms ± 4% ~ (p=0.837 n=40+40) GoParse-4 3.15ms ± 5% 3.13ms ± 5% ~ (p=0.240 n=40+40) RegexpMatchEasy0_32-4 72.9ns ± 3% 72.0ns ± 8% -1.25% (p=0.003 n=40+40) RegexpMatchEasy0_1K-4 229ns ± 5% 230ns ± 4% ~ (p=0.318 n=40+40) RegexpMatchEasy1_32-4 66.9ns ± 3% 67.3ns ± 7% ~ (p=0.817 n=40+40) RegexpMatchEasy1_1K-4 371ns ± 5% 370ns ± 4% ~ (p=0.275 n=40+40) RegexpMatchMedium_32-4 106ns ± 4% 104ns ± 7% -2.28% (p=0.000 n=40+40) RegexpMatchMedium_1K-4 32.0µs ± 2% 31.4µs ± 3% -2.08% (p=0.000 n=40+40) RegexpMatchHard_32-4 1.54µs ± 7% 1.52µs ± 3% -1.80% (p=0.007 n=39+40) RegexpMatchHard_1K-4 45.8µs ± 4% 45.5µs ± 3% ~ (p=0.707 n=40+40) Revcomp-4 401ms ± 5% 401ms ± 6% ~ (p=0.935 n=40+40) Template-4 62.4ms ± 4% 61.2ms ± 3% -1.85% (p=0.000 n=40+40) TimeParse-4 315ns ± 2% 318ns ± 3% +1.10% (p=0.002 n=40+40) TimeFormat-4 297ns ± 3% 298ns ± 3% ~ (p=0.238 n=40+40) [Geo mean] 45.8µs 45.7µs -0.22% name old speed new speed delta GobDecode-4 119MB/s ± 6% 119MB/s ± 7% ~ (p=0.684 n=39+40) GobEncode-4 129MB/s ± 7% 128MB/s ± 6% ~ (p=0.413 n=40+34) Gzip-4 86.6MB/s ± 6% 87.0MB/s ± 6% ~ (p=0.145 n=40+40) Gunzip-4 532MB/s ± 4% 532MB/s ± 4% ~ (p=0.556 n=40+40) JSONEncode-4 215MB/s ± 4% 215MB/s ± 4% ~ (p=0.583 n=40+40) JSONDecode-4 39.3MB/s ± 4% 39.5MB/s ± 4% ~ (p=0.277 n=40+40) GoParse-4 18.4MB/s ± 5% 18.5MB/s ± 5% ~ (p=0.229 n=40+40) RegexpMatchEasy0_32-4 439MB/s ± 3% 445MB/s ± 8% +1.28% (p=0.003 n=40+40) RegexpMatchEasy0_1K-4 4.46GB/s ± 4% 4.45GB/s ± 4% ~ (p=0.343 n=40+40) RegexpMatchEasy1_32-4 479MB/s ± 3% 476MB/s ± 7% ~ (p=0.855 n=40+40) RegexpMatchEasy1_1K-4 2.76GB/s ± 5% 2.77GB/s ± 4% ~ (p=0.250 n=40+40) RegexpMatchMedium_32-4 9.36MB/s ± 4% 9.58MB/s ± 6% +2.31% (p=0.001 n=40+40) RegexpMatchMedium_1K-4 32.0MB/s ± 2% 32.7MB/s ± 3% +2.12% (p=0.000 n=40+40) RegexpMatchHard_32-4 20.7MB/s ± 7% 21.1MB/s ± 3% +1.95% (p=0.005 n=40+40) RegexpMatchHard_1K-4 22.4MB/s ± 4% 22.5MB/s ± 3% ~ (p=0.689 n=40+40) Revcomp-4 634MB/s ± 5% 634MB/s ± 6% ~ (p=0.935 n=40+40) Template-4 31.1MB/s ± 3% 31.7MB/s ± 3% +1.88% (p=0.000 n=40+40) [Geo mean] 129MB/s 130MB/s +0.62% Change-Id: I9d61ee810d900920c572cbe89e2f1626bfed12b7 Reviewed-on: https://go-review.googlesource.com/c/145209 Run-TryBot: Ben Shi <powerman1st@163.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2018-10-29 08:34:42 +00:00
if val == 1 {
asm = x86.AINCQ
} else {
asm = x86.ADECQ
}
} else {
cmd/compile: optimize amd64's ADDQconstmodify/ADDLconstmodify This CL optimize amd64's code: "ADDQ $-1, MEM_OP" -> "DECQ MEM_OP" "ADDL $-1, MEM_OP" -> "DECL MEM_OP" 1. The total size of pkg/linux_amd64 (excluding cmd/compile) decreases about 0.1KB. 2. The go1 benchmark shows little regression, excluding noise. name old time/op new time/op delta BinaryTree17-4 2.60s ± 5% 2.64s ± 3% +1.53% (p=0.000 n=38+39) Fannkuch11-4 2.37s ± 2% 2.38s ± 2% ~ (p=0.950 n=40+40) FmtFprintfEmpty-4 40.4ns ± 5% 40.5ns ± 5% ~ (p=0.711 n=40+40) FmtFprintfString-4 72.4ns ± 5% 72.3ns ± 3% ~ (p=0.485 n=40+40) FmtFprintfInt-4 79.7ns ± 3% 80.1ns ± 3% ~ (p=0.124 n=40+40) FmtFprintfIntInt-4 126ns ± 3% 127ns ± 3% +0.71% (p=0.027 n=40+40) FmtFprintfPrefixedInt-4 153ns ± 4% 153ns ± 2% ~ (p=0.604 n=40+40) FmtFprintfFloat-4 206ns ± 5% 210ns ± 5% +1.79% (p=0.002 n=40+40) FmtManyArgs-4 498ns ± 3% 496ns ± 3% ~ (p=0.099 n=40+40) GobDecode-4 6.48ms ± 6% 6.47ms ± 7% ~ (p=0.686 n=39+40) GobEncode-4 5.95ms ± 7% 5.96ms ± 6% ~ (p=0.670 n=40+34) Gzip-4 224ms ± 6% 223ms ± 5% ~ (p=0.143 n=40+40) Gunzip-4 36.5ms ± 4% 36.5ms ± 4% ~ (p=0.556 n=40+40) HTTPClientServer-4 60.7µs ± 2% 59.9µs ± 3% -1.20% (p=0.000 n=39+39) JSONEncode-4 9.03ms ± 4% 9.04ms ± 4% ~ (p=0.589 n=40+40) JSONDecode-4 49.4ms ± 4% 49.2ms ± 4% ~ (p=0.276 n=40+40) Mandelbrot200-4 3.80ms ± 4% 3.79ms ± 4% ~ (p=0.837 n=40+40) GoParse-4 3.15ms ± 5% 3.13ms ± 5% ~ (p=0.240 n=40+40) RegexpMatchEasy0_32-4 72.9ns ± 3% 72.0ns ± 8% -1.25% (p=0.003 n=40+40) RegexpMatchEasy0_1K-4 229ns ± 5% 230ns ± 4% ~ (p=0.318 n=40+40) RegexpMatchEasy1_32-4 66.9ns ± 3% 67.3ns ± 7% ~ (p=0.817 n=40+40) RegexpMatchEasy1_1K-4 371ns ± 5% 370ns ± 4% ~ (p=0.275 n=40+40) RegexpMatchMedium_32-4 106ns ± 4% 104ns ± 7% -2.28% (p=0.000 n=40+40) RegexpMatchMedium_1K-4 32.0µs ± 2% 31.4µs ± 3% -2.08% (p=0.000 n=40+40) RegexpMatchHard_32-4 1.54µs ± 7% 1.52µs ± 3% -1.80% (p=0.007 n=39+40) RegexpMatchHard_1K-4 45.8µs ± 4% 45.5µs ± 3% ~ (p=0.707 n=40+40) Revcomp-4 401ms ± 5% 401ms ± 6% ~ (p=0.935 n=40+40) Template-4 62.4ms ± 4% 61.2ms ± 3% -1.85% (p=0.000 n=40+40) TimeParse-4 315ns ± 2% 318ns ± 3% +1.10% (p=0.002 n=40+40) TimeFormat-4 297ns ± 3% 298ns ± 3% ~ (p=0.238 n=40+40) [Geo mean] 45.8µs 45.7µs -0.22% name old speed new speed delta GobDecode-4 119MB/s ± 6% 119MB/s ± 7% ~ (p=0.684 n=39+40) GobEncode-4 129MB/s ± 7% 128MB/s ± 6% ~ (p=0.413 n=40+34) Gzip-4 86.6MB/s ± 6% 87.0MB/s ± 6% ~ (p=0.145 n=40+40) Gunzip-4 532MB/s ± 4% 532MB/s ± 4% ~ (p=0.556 n=40+40) JSONEncode-4 215MB/s ± 4% 215MB/s ± 4% ~ (p=0.583 n=40+40) JSONDecode-4 39.3MB/s ± 4% 39.5MB/s ± 4% ~ (p=0.277 n=40+40) GoParse-4 18.4MB/s ± 5% 18.5MB/s ± 5% ~ (p=0.229 n=40+40) RegexpMatchEasy0_32-4 439MB/s ± 3% 445MB/s ± 8% +1.28% (p=0.003 n=40+40) RegexpMatchEasy0_1K-4 4.46GB/s ± 4% 4.45GB/s ± 4% ~ (p=0.343 n=40+40) RegexpMatchEasy1_32-4 479MB/s ± 3% 476MB/s ± 7% ~ (p=0.855 n=40+40) RegexpMatchEasy1_1K-4 2.76GB/s ± 5% 2.77GB/s ± 4% ~ (p=0.250 n=40+40) RegexpMatchMedium_32-4 9.36MB/s ± 4% 9.58MB/s ± 6% +2.31% (p=0.001 n=40+40) RegexpMatchMedium_1K-4 32.0MB/s ± 2% 32.7MB/s ± 3% +2.12% (p=0.000 n=40+40) RegexpMatchHard_32-4 20.7MB/s ± 7% 21.1MB/s ± 3% +1.95% (p=0.005 n=40+40) RegexpMatchHard_1K-4 22.4MB/s ± 4% 22.5MB/s ± 3% ~ (p=0.689 n=40+40) Revcomp-4 634MB/s ± 5% 634MB/s ± 6% ~ (p=0.935 n=40+40) Template-4 31.1MB/s ± 3% 31.7MB/s ± 3% +1.88% (p=0.000 n=40+40) [Geo mean] 129MB/s 130MB/s +0.62% Change-Id: I9d61ee810d900920c572cbe89e2f1626bfed12b7 Reviewed-on: https://go-review.googlesource.com/c/145209 Run-TryBot: Ben Shi <powerman1st@163.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2018-10-29 08:34:42 +00:00
if val == 1 {
asm = x86.AINCL
} else {
asm = x86.ADECL
}
}
p := s.Prog(asm)
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
[dev.regabi] cmd/compile: split out package ssagen [generated] [git-generate] cd src/cmd/compile/internal/gc rf ' # maxOpenDefers is declared in ssa.go but used only by walk. mv maxOpenDefers walk.go # gc.Arch -> ssagen.Arch # It is not as nice but will do for now. mv Arch ArchInfo mv thearch Arch mv Arch ArchInfo arch.go # Pull dwarf out of pgen.go. mv debuginfo declPos createDwarfVars preInliningDcls \ createSimpleVars createSimpleVar \ createComplexVars createComplexVar \ dwarf.go # Pull high-level compilation out of pgen.go, # leaving only the SSA code. mv compilequeue funccompile compile compilenow \ compileFunctions isInlinableButNotInlined \ initLSym \ compile.go mv BoundsCheckFunc GCWriteBarrierReg ssa.go mv largeStack largeStackFrames CheckLargeStacks pgen.go # All that is left in dcl.go is the nowritebarrierrecCheck mv dcl.go nowb.go # Export API and unexport non-API. mv initssaconfig InitConfig mv isIntrinsicCall IsIntrinsicCall mv ssaDumpInline DumpInline mv initSSATables InitTables mv initSSAEnv InitEnv mv compileSSA Compile mv stackOffset StackOffset mv canSSAType TypeOK mv SSAGenState State mv FwdRefAux fwdRefAux mv cgoSymABIs CgoSymABIs mv readSymABIs ReadSymABIs mv initLSym InitLSym mv useABIWrapGen symabiDefs CgoSymABIs ReadSymABIs InitLSym selectLSym makeABIWrapper setupTextLSym abi.go mv arch.go abi.go nowb.go phi.go pgen.go pgen_test.go ssa.go cmd/compile/internal/ssagen ' rm go.go gsubr.go Change-Id: I47fad6cbf1d1e583fd9139003a08401d7cd048a1 Reviewed-on: https://go-review.googlesource.com/c/go/+/279476 Trust: Russ Cox <rsc@golang.org> Run-TryBot: Russ Cox <rsc@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com>
2020-12-23 00:57:10 -05:00
ssagen.AddAux2(&p.To, v, off)
break
}
fallthrough
cmd/compile: optimize AMD64 with more read-modify-write operations 6 more operations which do read-modify-write with a constant source operand are added. 1. The total size of pkg/linux_amd64 decreases about 3KB, excluding cmd/compile. 2. The go1 benckmark shows a slight improvement. name old time/op new time/op delta BinaryTree17-4 2.61s ± 4% 2.67s ± 2% +2.26% (p=0.000 n=30+29) Fannkuch11-4 2.39s ± 2% 2.32s ± 2% -2.67% (p=0.000 n=30+30) FmtFprintfEmpty-4 44.0ns ± 4% 41.7ns ± 4% -5.15% (p=0.000 n=30+30) FmtFprintfString-4 74.2ns ± 4% 72.3ns ± 4% -2.59% (p=0.000 n=30+30) FmtFprintfInt-4 81.7ns ± 3% 78.8ns ± 4% -3.54% (p=0.000 n=27+30) FmtFprintfIntInt-4 130ns ± 4% 124ns ± 5% -4.60% (p=0.000 n=30+30) FmtFprintfPrefixedInt-4 154ns ± 3% 152ns ± 3% -1.13% (p=0.012 n=30+30) FmtFprintfFloat-4 215ns ± 4% 212ns ± 5% -1.56% (p=0.002 n=30+30) FmtManyArgs-4 522ns ± 3% 512ns ± 3% -1.84% (p=0.001 n=30+30) GobDecode-4 6.42ms ± 5% 6.49ms ± 7% ~ (p=0.070 n=30+30) GobEncode-4 6.07ms ± 8% 5.98ms ± 8% ~ (p=0.150 n=30+30) Gzip-4 236ms ± 4% 223ms ± 4% -5.57% (p=0.000 n=30+30) Gunzip-4 37.4ms ± 3% 36.7ms ± 4% -2.03% (p=0.000 n=30+30) HTTPClientServer-4 58.7µs ± 1% 58.5µs ± 2% -0.37% (p=0.018 n=30+29) JSONEncode-4 12.0ms ± 4% 12.1ms ± 3% ~ (p=0.112 n=30+30) JSONDecode-4 54.5ms ± 3% 55.5ms ± 4% +1.80% (p=0.006 n=30+30) Mandelbrot200-4 3.78ms ± 4% 3.78ms ± 4% ~ (p=0.173 n=30+30) GoParse-4 3.16ms ± 5% 3.22ms ± 5% +1.75% (p=0.010 n=30+30) RegexpMatchEasy0_32-4 76.6ns ± 1% 75.9ns ± 3% ~ (p=0.672 n=25+30) RegexpMatchEasy0_1K-4 252ns ± 3% 253ns ± 3% +0.57% (p=0.027 n=30+30) RegexpMatchEasy1_32-4 69.8ns ± 4% 70.2ns ± 6% ~ (p=0.539 n=30+30) RegexpMatchEasy1_1K-4 374ns ± 3% 373ns ± 5% ~ (p=0.263 n=30+30) RegexpMatchMedium_32-4 107ns ± 4% 109ns ± 3% ~ (p=0.067 n=30+30) RegexpMatchMedium_1K-4 33.9µs ± 5% 34.1µs ± 4% ~ (p=0.297 n=30+30) RegexpMatchHard_32-4 1.54µs ± 3% 1.56µs ± 4% +1.43% (p=0.002 n=30+30) RegexpMatchHard_1K-4 46.6µs ± 3% 47.0µs ± 3% ~ (p=0.055 n=30+30) Revcomp-4 411ms ± 6% 407ms ± 6% ~ (p=0.219 n=30+30) Template-4 66.8ms ± 3% 64.8ms ± 5% -3.01% (p=0.000 n=30+30) TimeParse-4 312ns ± 2% 319ns ± 3% +2.50% (p=0.000 n=30+30) TimeFormat-4 296ns ± 5% 299ns ± 3% +0.93% (p=0.005 n=30+30) [Geo mean] 47.5µs 47.1µs -0.75% name old speed new speed delta GobDecode-4 120MB/s ± 5% 118MB/s ± 6% ~ (p=0.072 n=30+30) GobEncode-4 127MB/s ± 8% 129MB/s ± 8% ~ (p=0.150 n=30+30) Gzip-4 82.1MB/s ± 4% 87.0MB/s ± 4% +5.90% (p=0.000 n=30+30) Gunzip-4 519MB/s ± 4% 529MB/s ± 4% +2.07% (p=0.001 n=30+30) JSONEncode-4 162MB/s ± 4% 161MB/s ± 3% ~ (p=0.110 n=30+30) JSONDecode-4 35.6MB/s ± 3% 35.0MB/s ± 4% -1.77% (p=0.007 n=30+30) GoParse-4 18.3MB/s ± 4% 18.0MB/s ± 4% -1.72% (p=0.009 n=30+30) RegexpMatchEasy0_32-4 418MB/s ± 1% 422MB/s ± 3% ~ (p=0.645 n=25+30) RegexpMatchEasy0_1K-4 4.06GB/s ± 3% 4.04GB/s ± 3% -0.57% (p=0.033 n=30+30) RegexpMatchEasy1_32-4 459MB/s ± 4% 456MB/s ± 6% ~ (p=0.530 n=30+30) RegexpMatchEasy1_1K-4 2.73GB/s ± 3% 2.75GB/s ± 5% ~ (p=0.279 n=30+30) RegexpMatchMedium_32-4 9.28MB/s ± 5% 9.18MB/s ± 4% ~ (p=0.086 n=30+30) RegexpMatchMedium_1K-4 30.2MB/s ± 4% 30.0MB/s ± 4% ~ (p=0.300 n=30+30) RegexpMatchHard_32-4 20.8MB/s ± 3% 20.5MB/s ± 4% -1.41% (p=0.002 n=30+30) RegexpMatchHard_1K-4 22.0MB/s ± 3% 21.8MB/s ± 3% ~ (p=0.051 n=30+30) Revcomp-4 619MB/s ± 7% 625MB/s ± 7% ~ (p=0.219 n=30+30) Template-4 29.0MB/s ± 3% 29.9MB/s ± 4% +3.11% (p=0.000 n=30+30) [Geo mean] 123MB/s 123MB/s +0.28% Change-Id: I850652cfd53329c1af804b7f57f4393d8097bb0d Reviewed-on: https://go-review.googlesource.com/121135 Run-TryBot: Ben Shi <powerman1st@163.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Ilya Tocar <ilya.tocar@intel.com>
2018-06-27 02:46:17 +00:00
case ssa.OpAMD64ANDQconstmodify, ssa.OpAMD64ANDLconstmodify, ssa.OpAMD64ORQconstmodify, ssa.OpAMD64ORLconstmodify,
ssa.OpAMD64XORQconstmodify, ssa.OpAMD64XORLconstmodify,
ssa.OpAMD64BTSQconstmodify, ssa.OpAMD64BTRQconstmodify, ssa.OpAMD64BTCQconstmodify:
cmd/compile: optimize AMD64 with more read-modify-write operations 6 more operations which do read-modify-write with a constant source operand are added. 1. The total size of pkg/linux_amd64 decreases about 3KB, excluding cmd/compile. 2. The go1 benckmark shows a slight improvement. name old time/op new time/op delta BinaryTree17-4 2.61s ± 4% 2.67s ± 2% +2.26% (p=0.000 n=30+29) Fannkuch11-4 2.39s ± 2% 2.32s ± 2% -2.67% (p=0.000 n=30+30) FmtFprintfEmpty-4 44.0ns ± 4% 41.7ns ± 4% -5.15% (p=0.000 n=30+30) FmtFprintfString-4 74.2ns ± 4% 72.3ns ± 4% -2.59% (p=0.000 n=30+30) FmtFprintfInt-4 81.7ns ± 3% 78.8ns ± 4% -3.54% (p=0.000 n=27+30) FmtFprintfIntInt-4 130ns ± 4% 124ns ± 5% -4.60% (p=0.000 n=30+30) FmtFprintfPrefixedInt-4 154ns ± 3% 152ns ± 3% -1.13% (p=0.012 n=30+30) FmtFprintfFloat-4 215ns ± 4% 212ns ± 5% -1.56% (p=0.002 n=30+30) FmtManyArgs-4 522ns ± 3% 512ns ± 3% -1.84% (p=0.001 n=30+30) GobDecode-4 6.42ms ± 5% 6.49ms ± 7% ~ (p=0.070 n=30+30) GobEncode-4 6.07ms ± 8% 5.98ms ± 8% ~ (p=0.150 n=30+30) Gzip-4 236ms ± 4% 223ms ± 4% -5.57% (p=0.000 n=30+30) Gunzip-4 37.4ms ± 3% 36.7ms ± 4% -2.03% (p=0.000 n=30+30) HTTPClientServer-4 58.7µs ± 1% 58.5µs ± 2% -0.37% (p=0.018 n=30+29) JSONEncode-4 12.0ms ± 4% 12.1ms ± 3% ~ (p=0.112 n=30+30) JSONDecode-4 54.5ms ± 3% 55.5ms ± 4% +1.80% (p=0.006 n=30+30) Mandelbrot200-4 3.78ms ± 4% 3.78ms ± 4% ~ (p=0.173 n=30+30) GoParse-4 3.16ms ± 5% 3.22ms ± 5% +1.75% (p=0.010 n=30+30) RegexpMatchEasy0_32-4 76.6ns ± 1% 75.9ns ± 3% ~ (p=0.672 n=25+30) RegexpMatchEasy0_1K-4 252ns ± 3% 253ns ± 3% +0.57% (p=0.027 n=30+30) RegexpMatchEasy1_32-4 69.8ns ± 4% 70.2ns ± 6% ~ (p=0.539 n=30+30) RegexpMatchEasy1_1K-4 374ns ± 3% 373ns ± 5% ~ (p=0.263 n=30+30) RegexpMatchMedium_32-4 107ns ± 4% 109ns ± 3% ~ (p=0.067 n=30+30) RegexpMatchMedium_1K-4 33.9µs ± 5% 34.1µs ± 4% ~ (p=0.297 n=30+30) RegexpMatchHard_32-4 1.54µs ± 3% 1.56µs ± 4% +1.43% (p=0.002 n=30+30) RegexpMatchHard_1K-4 46.6µs ± 3% 47.0µs ± 3% ~ (p=0.055 n=30+30) Revcomp-4 411ms ± 6% 407ms ± 6% ~ (p=0.219 n=30+30) Template-4 66.8ms ± 3% 64.8ms ± 5% -3.01% (p=0.000 n=30+30) TimeParse-4 312ns ± 2% 319ns ± 3% +2.50% (p=0.000 n=30+30) TimeFormat-4 296ns ± 5% 299ns ± 3% +0.93% (p=0.005 n=30+30) [Geo mean] 47.5µs 47.1µs -0.75% name old speed new speed delta GobDecode-4 120MB/s ± 5% 118MB/s ± 6% ~ (p=0.072 n=30+30) GobEncode-4 127MB/s ± 8% 129MB/s ± 8% ~ (p=0.150 n=30+30) Gzip-4 82.1MB/s ± 4% 87.0MB/s ± 4% +5.90% (p=0.000 n=30+30) Gunzip-4 519MB/s ± 4% 529MB/s ± 4% +2.07% (p=0.001 n=30+30) JSONEncode-4 162MB/s ± 4% 161MB/s ± 3% ~ (p=0.110 n=30+30) JSONDecode-4 35.6MB/s ± 3% 35.0MB/s ± 4% -1.77% (p=0.007 n=30+30) GoParse-4 18.3MB/s ± 4% 18.0MB/s ± 4% -1.72% (p=0.009 n=30+30) RegexpMatchEasy0_32-4 418MB/s ± 1% 422MB/s ± 3% ~ (p=0.645 n=25+30) RegexpMatchEasy0_1K-4 4.06GB/s ± 3% 4.04GB/s ± 3% -0.57% (p=0.033 n=30+30) RegexpMatchEasy1_32-4 459MB/s ± 4% 456MB/s ± 6% ~ (p=0.530 n=30+30) RegexpMatchEasy1_1K-4 2.73GB/s ± 3% 2.75GB/s ± 5% ~ (p=0.279 n=30+30) RegexpMatchMedium_32-4 9.28MB/s ± 5% 9.18MB/s ± 4% ~ (p=0.086 n=30+30) RegexpMatchMedium_1K-4 30.2MB/s ± 4% 30.0MB/s ± 4% ~ (p=0.300 n=30+30) RegexpMatchHard_32-4 20.8MB/s ± 3% 20.5MB/s ± 4% -1.41% (p=0.002 n=30+30) RegexpMatchHard_1K-4 22.0MB/s ± 3% 21.8MB/s ± 3% ~ (p=0.051 n=30+30) Revcomp-4 619MB/s ± 7% 625MB/s ± 7% ~ (p=0.219 n=30+30) Template-4 29.0MB/s ± 3% 29.9MB/s ± 4% +3.11% (p=0.000 n=30+30) [Geo mean] 123MB/s 123MB/s +0.28% Change-Id: I850652cfd53329c1af804b7f57f4393d8097bb0d Reviewed-on: https://go-review.googlesource.com/121135 Run-TryBot: Ben Shi <powerman1st@163.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Ilya Tocar <ilya.tocar@intel.com>
2018-06-27 02:46:17 +00:00
sc := v.AuxValAndOff()
off := sc.Off64()
val := sc.Val64()
cmd/compile: optimize AMD64 with more read-modify-write operations 6 more operations which do read-modify-write with a constant source operand are added. 1. The total size of pkg/linux_amd64 decreases about 3KB, excluding cmd/compile. 2. The go1 benckmark shows a slight improvement. name old time/op new time/op delta BinaryTree17-4 2.61s ± 4% 2.67s ± 2% +2.26% (p=0.000 n=30+29) Fannkuch11-4 2.39s ± 2% 2.32s ± 2% -2.67% (p=0.000 n=30+30) FmtFprintfEmpty-4 44.0ns ± 4% 41.7ns ± 4% -5.15% (p=0.000 n=30+30) FmtFprintfString-4 74.2ns ± 4% 72.3ns ± 4% -2.59% (p=0.000 n=30+30) FmtFprintfInt-4 81.7ns ± 3% 78.8ns ± 4% -3.54% (p=0.000 n=27+30) FmtFprintfIntInt-4 130ns ± 4% 124ns ± 5% -4.60% (p=0.000 n=30+30) FmtFprintfPrefixedInt-4 154ns ± 3% 152ns ± 3% -1.13% (p=0.012 n=30+30) FmtFprintfFloat-4 215ns ± 4% 212ns ± 5% -1.56% (p=0.002 n=30+30) FmtManyArgs-4 522ns ± 3% 512ns ± 3% -1.84% (p=0.001 n=30+30) GobDecode-4 6.42ms ± 5% 6.49ms ± 7% ~ (p=0.070 n=30+30) GobEncode-4 6.07ms ± 8% 5.98ms ± 8% ~ (p=0.150 n=30+30) Gzip-4 236ms ± 4% 223ms ± 4% -5.57% (p=0.000 n=30+30) Gunzip-4 37.4ms ± 3% 36.7ms ± 4% -2.03% (p=0.000 n=30+30) HTTPClientServer-4 58.7µs ± 1% 58.5µs ± 2% -0.37% (p=0.018 n=30+29) JSONEncode-4 12.0ms ± 4% 12.1ms ± 3% ~ (p=0.112 n=30+30) JSONDecode-4 54.5ms ± 3% 55.5ms ± 4% +1.80% (p=0.006 n=30+30) Mandelbrot200-4 3.78ms ± 4% 3.78ms ± 4% ~ (p=0.173 n=30+30) GoParse-4 3.16ms ± 5% 3.22ms ± 5% +1.75% (p=0.010 n=30+30) RegexpMatchEasy0_32-4 76.6ns ± 1% 75.9ns ± 3% ~ (p=0.672 n=25+30) RegexpMatchEasy0_1K-4 252ns ± 3% 253ns ± 3% +0.57% (p=0.027 n=30+30) RegexpMatchEasy1_32-4 69.8ns ± 4% 70.2ns ± 6% ~ (p=0.539 n=30+30) RegexpMatchEasy1_1K-4 374ns ± 3% 373ns ± 5% ~ (p=0.263 n=30+30) RegexpMatchMedium_32-4 107ns ± 4% 109ns ± 3% ~ (p=0.067 n=30+30) RegexpMatchMedium_1K-4 33.9µs ± 5% 34.1µs ± 4% ~ (p=0.297 n=30+30) RegexpMatchHard_32-4 1.54µs ± 3% 1.56µs ± 4% +1.43% (p=0.002 n=30+30) RegexpMatchHard_1K-4 46.6µs ± 3% 47.0µs ± 3% ~ (p=0.055 n=30+30) Revcomp-4 411ms ± 6% 407ms ± 6% ~ (p=0.219 n=30+30) Template-4 66.8ms ± 3% 64.8ms ± 5% -3.01% (p=0.000 n=30+30) TimeParse-4 312ns ± 2% 319ns ± 3% +2.50% (p=0.000 n=30+30) TimeFormat-4 296ns ± 5% 299ns ± 3% +0.93% (p=0.005 n=30+30) [Geo mean] 47.5µs 47.1µs -0.75% name old speed new speed delta GobDecode-4 120MB/s ± 5% 118MB/s ± 6% ~ (p=0.072 n=30+30) GobEncode-4 127MB/s ± 8% 129MB/s ± 8% ~ (p=0.150 n=30+30) Gzip-4 82.1MB/s ± 4% 87.0MB/s ± 4% +5.90% (p=0.000 n=30+30) Gunzip-4 519MB/s ± 4% 529MB/s ± 4% +2.07% (p=0.001 n=30+30) JSONEncode-4 162MB/s ± 4% 161MB/s ± 3% ~ (p=0.110 n=30+30) JSONDecode-4 35.6MB/s ± 3% 35.0MB/s ± 4% -1.77% (p=0.007 n=30+30) GoParse-4 18.3MB/s ± 4% 18.0MB/s ± 4% -1.72% (p=0.009 n=30+30) RegexpMatchEasy0_32-4 418MB/s ± 1% 422MB/s ± 3% ~ (p=0.645 n=25+30) RegexpMatchEasy0_1K-4 4.06GB/s ± 3% 4.04GB/s ± 3% -0.57% (p=0.033 n=30+30) RegexpMatchEasy1_32-4 459MB/s ± 4% 456MB/s ± 6% ~ (p=0.530 n=30+30) RegexpMatchEasy1_1K-4 2.73GB/s ± 3% 2.75GB/s ± 5% ~ (p=0.279 n=30+30) RegexpMatchMedium_32-4 9.28MB/s ± 5% 9.18MB/s ± 4% ~ (p=0.086 n=30+30) RegexpMatchMedium_1K-4 30.2MB/s ± 4% 30.0MB/s ± 4% ~ (p=0.300 n=30+30) RegexpMatchHard_32-4 20.8MB/s ± 3% 20.5MB/s ± 4% -1.41% (p=0.002 n=30+30) RegexpMatchHard_1K-4 22.0MB/s ± 3% 21.8MB/s ± 3% ~ (p=0.051 n=30+30) Revcomp-4 619MB/s ± 7% 625MB/s ± 7% ~ (p=0.219 n=30+30) Template-4 29.0MB/s ± 3% 29.9MB/s ± 4% +3.11% (p=0.000 n=30+30) [Geo mean] 123MB/s 123MB/s +0.28% Change-Id: I850652cfd53329c1af804b7f57f4393d8097bb0d Reviewed-on: https://go-review.googlesource.com/121135 Run-TryBot: Ben Shi <powerman1st@163.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Ilya Tocar <ilya.tocar@intel.com>
2018-06-27 02:46:17 +00:00
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_CONST
p.From.Offset = val
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
[dev.regabi] cmd/compile: split out package ssagen [generated] [git-generate] cd src/cmd/compile/internal/gc rf ' # maxOpenDefers is declared in ssa.go but used only by walk. mv maxOpenDefers walk.go # gc.Arch -> ssagen.Arch # It is not as nice but will do for now. mv Arch ArchInfo mv thearch Arch mv Arch ArchInfo arch.go # Pull dwarf out of pgen.go. mv debuginfo declPos createDwarfVars preInliningDcls \ createSimpleVars createSimpleVar \ createComplexVars createComplexVar \ dwarf.go # Pull high-level compilation out of pgen.go, # leaving only the SSA code. mv compilequeue funccompile compile compilenow \ compileFunctions isInlinableButNotInlined \ initLSym \ compile.go mv BoundsCheckFunc GCWriteBarrierReg ssa.go mv largeStack largeStackFrames CheckLargeStacks pgen.go # All that is left in dcl.go is the nowritebarrierrecCheck mv dcl.go nowb.go # Export API and unexport non-API. mv initssaconfig InitConfig mv isIntrinsicCall IsIntrinsicCall mv ssaDumpInline DumpInline mv initSSATables InitTables mv initSSAEnv InitEnv mv compileSSA Compile mv stackOffset StackOffset mv canSSAType TypeOK mv SSAGenState State mv FwdRefAux fwdRefAux mv cgoSymABIs CgoSymABIs mv readSymABIs ReadSymABIs mv initLSym InitLSym mv useABIWrapGen symabiDefs CgoSymABIs ReadSymABIs InitLSym selectLSym makeABIWrapper setupTextLSym abi.go mv arch.go abi.go nowb.go phi.go pgen.go pgen_test.go ssa.go cmd/compile/internal/ssagen ' rm go.go gsubr.go Change-Id: I47fad6cbf1d1e583fd9139003a08401d7cd048a1 Reviewed-on: https://go-review.googlesource.com/c/go/+/279476 Trust: Russ Cox <rsc@golang.org> Run-TryBot: Russ Cox <rsc@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com>
2020-12-23 00:57:10 -05:00
ssagen.AddAux2(&p.To, v, off)
case ssa.OpAMD64MOVQstoreconst, ssa.OpAMD64MOVLstoreconst, ssa.OpAMD64MOVWstoreconst, ssa.OpAMD64MOVBstoreconst:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_CONST
sc := v.AuxValAndOff()
p.From.Offset = sc.Val64()
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
ssagen.AddAux2(&p.To, v, sc.Off64())
case ssa.OpAMD64MOVOstoreconst:
sc := v.AuxValAndOff()
if sc.Val() != 0 {
v.Fatalf("MOVO for non zero constants not implemented: %s", v.LongString())
}
if s.ABI != obj.ABIInternal {
// zero X15 manually
opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
}
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = x86.REG_X15
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
ssagen.AddAux2(&p.To, v, sc.Off64())
case ssa.OpAMD64MOVQstoreconstidx1, ssa.OpAMD64MOVQstoreconstidx8, ssa.OpAMD64MOVLstoreconstidx1, ssa.OpAMD64MOVLstoreconstidx4, ssa.OpAMD64MOVWstoreconstidx1, ssa.OpAMD64MOVWstoreconstidx2, ssa.OpAMD64MOVBstoreconstidx1,
ssa.OpAMD64ADDLconstmodifyidx1, ssa.OpAMD64ADDLconstmodifyidx4, ssa.OpAMD64ADDLconstmodifyidx8, ssa.OpAMD64ADDQconstmodifyidx1, ssa.OpAMD64ADDQconstmodifyidx8,
ssa.OpAMD64ANDLconstmodifyidx1, ssa.OpAMD64ANDLconstmodifyidx4, ssa.OpAMD64ANDLconstmodifyidx8, ssa.OpAMD64ANDQconstmodifyidx1, ssa.OpAMD64ANDQconstmodifyidx8,
ssa.OpAMD64ORLconstmodifyidx1, ssa.OpAMD64ORLconstmodifyidx4, ssa.OpAMD64ORLconstmodifyidx8, ssa.OpAMD64ORQconstmodifyidx1, ssa.OpAMD64ORQconstmodifyidx8,
ssa.OpAMD64XORLconstmodifyidx1, ssa.OpAMD64XORLconstmodifyidx4, ssa.OpAMD64XORLconstmodifyidx8, ssa.OpAMD64XORQconstmodifyidx1, ssa.OpAMD64XORQconstmodifyidx8:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_CONST
sc := v.AuxValAndOff()
p.From.Offset = sc.Val64()
switch {
case p.As == x86.AADDQ && p.From.Offset == 1:
p.As = x86.AINCQ
p.From.Type = obj.TYPE_NONE
case p.As == x86.AADDQ && p.From.Offset == -1:
p.As = x86.ADECQ
p.From.Type = obj.TYPE_NONE
case p.As == x86.AADDL && p.From.Offset == 1:
p.As = x86.AINCL
p.From.Type = obj.TYPE_NONE
case p.As == x86.AADDL && p.From.Offset == -1:
p.As = x86.ADECL
p.From.Type = obj.TYPE_NONE
}
memIdx(&p.To, v)
ssagen.AddAux2(&p.To, v, sc.Off64())
case ssa.OpAMD64MOVLQSX, ssa.OpAMD64MOVWQSX, ssa.OpAMD64MOVBQSX, ssa.OpAMD64MOVLQZX, ssa.OpAMD64MOVWQZX, ssa.OpAMD64MOVBQZX,
ssa.OpAMD64CVTTSS2SL, ssa.OpAMD64CVTTSD2SL, ssa.OpAMD64CVTTSS2SQ, ssa.OpAMD64CVTTSD2SQ,
cmd/compile: intrinsify swissmap match calls with SIMD on amd64 Use similar SIMD operations to the ones used in Abseil. We still using 8-slot groups (even though the XMM registers could handle 16-slot groups) to keep the implementation simpler (no changes to the memory layout of maps). Still, the implementations of matchH2 and matchEmpty are shorter than the portable version using standard arithmetic operations. They also return a packed bitset, which avoids the need to shift in bitset.first. That said, the packed bitset is a downside in cognitive complexity, as we have to think about two different possible representations. This doesn't leak out of the API, but we do need to intrinsify bitset to switch to a compatible implementation. The compiler's intrinsics don't support intrinsifying methods, so the implementations move to free functions. This makes operations between 0-3% faster on my machine. e.g., MapGetHit/impl=runtimeMap/t=Int64/len=6-12 12.34n ± 1% 11.42n ± 1% -7.46% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=12-12 15.14n ± 2% 14.88n ± 1% -1.72% (p=0.009 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=18-12 15.04n ± 6% 14.66n ± 2% -2.53% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=24-12 15.80n ± 1% 15.48n ± 3% ~ (p=0.444 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=30-12 15.55n ± 4% 14.77n ± 3% -5.02% (p=0.004 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=64-12 15.26n ± 1% 15.05n ± 1% ~ (p=0.055 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=128-12 15.34n ± 1% 15.02n ± 2% -2.09% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=256-12 15.42n ± 1% 15.15n ± 1% -1.75% (p=0.001 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=512-12 15.48n ± 1% 15.18n ± 1% -1.94% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=1024-12 17.38n ± 1% 17.05n ± 1% -1.90% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=2048-12 17.96n ± 0% 17.59n ± 1% -2.06% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=4096-12 18.36n ± 1% 18.18n ± 1% -0.98% (p=0.013 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=8192-12 18.75n ± 0% 18.31n ± 1% -2.35% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=65536-12 26.25n ± 0% 25.95n ± 1% -1.14% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=262144-12 44.24n ± 1% 44.06n ± 1% ~ (p=0.181 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=1048576-12 85.02n ± 0% 85.35n ± 0% +0.39% (p=0.032 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=4194304-12 98.87n ± 1% 98.85n ± 1% ~ (p=0.799 n=25) For #54766. Cq-Include-Trybots: luci.golang.try:gotip-linux-ppc64_power10,gotip-linux-amd64-goamd64v3 Change-Id: Ic1b852f02744404122cb3672900fd95f4625905e Reviewed-on: https://go-review.googlesource.com/c/go/+/626277 Reviewed-by: Keith Randall <khr@golang.org> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Auto-Submit: Michael Pratt <mpratt@google.com> Reviewed-by: Keith Randall <khr@google.com>
2024-11-04 12:41:33 -05:00
ssa.OpAMD64CVTSS2SD, ssa.OpAMD64CVTSD2SS, ssa.OpAMD64VPBROADCASTB, ssa.OpAMD64PMOVMSKB:
opregreg(s, v.Op.Asm(), v.Reg(), v.Args[0].Reg())
cmd/compile/internal/amd64: break dependency for CVTS[LQ]2S[DS] CVTSL2SS, CVTSQ2SS, CVTSL2SD, CVTSQ2SD preserve upper part of xmm register, introducing false dependency on a previous value. Break it by xoring destination with itself. Increases size of go executable by 320 bytes, but shows nice improvement on go1. Also fixes performance degradation introduced by 1.7. name old time/op new time/op delta BinaryTree17-4 2.20s ± 1% 2.19s ± 0% -0.36% (p=0.000 n=18+16) Fannkuch11-4 2.44s ± 1% 2.45s ± 2% +0.47% (p=0.030 n=20+20) FmtFprintfEmpty-4 40.9ns ± 7% 40.5ns ± 1% ~ (p=0.531 n=20+16) FmtFprintfString-4 111ns ± 2% 111ns ± 1% ~ (p=0.510 n=18+19) FmtFprintfInt-4 98.3ns ± 3% 99.3ns ± 1% +1.01% (p=0.003 n=20+18) FmtFprintfIntInt-4 148ns ± 3% 147ns ± 1% ~ (p=0.919 n=20+17) FmtFprintfPrefixedInt-4 149ns ± 1% 152ns ± 0% +1.73% (p=0.000 n=19+17) FmtFprintfFloat-4 231ns ± 0% 231ns ± 1% ~ (p=0.678 n=18+19) FmtManyArgs-4 667ns ± 1% 672ns ± 1% +0.73% (p=0.005 n=20+20) GobDecode-4 5.60ms ± 0% 5.61ms ± 0% +0.24% (p=0.000 n=20+20) GobEncode-4 4.74ms ± 0% 4.73ms ± 1% -0.20% (p=0.002 n=20+20) Gzip-4 199ms ± 0% 199ms ± 1% +0.35% (p=0.000 n=19+20) Gunzip-4 31.8ms ± 1% 31.5ms ± 1% -0.89% (p=0.000 n=20+20) HTTPClientServer-4 38.1µs ± 1% 38.0µs ± 1% ~ (p=0.117 n=19+18) JSONEncode-4 14.2ms ± 1% 13.4ms ± 0% -5.73% (p=0.000 n=20+20) JSONDecode-4 42.7ms ± 0% 42.7ms ± 1% +0.18% (p=0.019 n=18+19) Mandelbrot200-4 3.26ms ± 0% 2.99ms ± 0% -8.38% (p=0.000 n=19+19) GoParse-4 2.76ms ± 1% 2.76ms ± 1% ~ (p=0.583 n=20+20) RegexpMatchEasy0_32-4 69.5ns ± 0% 69.6ns ± 0% +0.10% (p=0.017 n=16+17) RegexpMatchEasy0_1K-4 703ns ± 0% 708ns ± 3% +0.65% (p=0.000 n=17+18) RegexpMatchEasy1_32-4 68.2ns ± 1% 68.2ns ± 2% ~ (p=0.094 n=18+20) RegexpMatchEasy1_1K-4 288ns ± 1% 288ns ± 0% ~ (p=0.403 n=17+18) RegexpMatchMedium_32-4 104ns ± 2% 103ns ± 1% ~ (p=0.110 n=20+16) RegexpMatchMedium_1K-4 31.7µs ± 3% 31.7µs ± 3% ~ (p=0.091 n=19+20) RegexpMatchHard_32-4 1.59µs ± 2% 1.58µs ± 2% ~ (p=0.083 n=20+20) RegexpMatchHard_1K-4 48.1µs ± 3% 47.9µs ± 2% ~ (p=0.461 n=20+19) Revcomp-4 344ms ± 0% 345ms ± 0% +0.08% (p=0.009 n=18+17) Template-4 44.8ms ± 1% 44.7ms ± 1% ~ (p=0.277 n=20+20) TimeParse-4 258ns ± 0% 258ns ± 0% ~ (all samples are equal) TimeFormat-4 275ns ± 0% 273ns ± 0% -0.64% (p=0.000 n=20+18) name old speed new speed delta GobDecode-4 137MB/s ± 0% 137MB/s ± 0% -0.24% (p=0.000 n=20+20) GobEncode-4 162MB/s ± 0% 162MB/s ± 0% +0.20% (p=0.002 n=20+20) Gzip-4 97.6MB/s ± 0% 97.3MB/s ± 1% -0.35% (p=0.000 n=19+20) Gunzip-4 610MB/s ± 1% 615MB/s ± 1% +0.89% (p=0.000 n=20+20) JSONEncode-4 136MB/s ± 1% 145MB/s ± 0% +6.08% (p=0.000 n=20+20) JSONDecode-4 45.5MB/s ± 0% 45.4MB/s ± 1% -0.17% (p=0.017 n=18+19) GoParse-4 21.0MB/s ± 1% 21.0MB/s ± 1% ~ (p=0.578 n=20+20) RegexpMatchEasy0_32-4 460MB/s ± 0% 460MB/s ± 0% -0.09% (p=0.031 n=16+17) RegexpMatchEasy0_1K-4 1.46GB/s ± 0% 1.45GB/s ± 3% -0.64% (p=0.000 n=17+18) RegexpMatchEasy1_32-4 469MB/s ± 0% 469MB/s ± 2% +0.06% (p=0.043 n=18+20) RegexpMatchEasy1_1K-4 3.55GB/s ± 1% 3.55GB/s ± 0% ~ (p=0.057 n=17+18) RegexpMatchMedium_32-4 9.61MB/s ± 2% 9.64MB/s ± 2% ~ (p=0.856 n=20+20) RegexpMatchMedium_1K-4 32.3MB/s ± 3% 32.3MB/s ± 3% ~ (p=0.085 n=19+20) RegexpMatchHard_32-4 20.1MB/s ± 2% 20.2MB/s ± 2% ~ (p=0.086 n=20+20) RegexpMatchHard_1K-4 21.3MB/s ± 3% 21.4MB/s ± 2% ~ (p=0.578 n=20+20) Revcomp-4 738MB/s ± 0% 737MB/s ± 0% -0.08% (p=0.009 n=18+17) Template-4 43.3MB/s ± 1% 43.4MB/s ± 1% ~ (p=0.274 n=20+20) Fixes #16982 Change-Id: If574d66f39f4183a9b1d5ffff0339909cc73f59d Reviewed-on: https://go-review.googlesource.com/31490 Run-TryBot: Ilya Tocar <ilya.tocar@intel.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2016-10-19 20:21:42 +03:00
case ssa.OpAMD64CVTSL2SD, ssa.OpAMD64CVTSQ2SD, ssa.OpAMD64CVTSQ2SS, ssa.OpAMD64CVTSL2SS:
r := v.Reg()
// Break false dependency on destination register.
opregreg(s, x86.AXORPS, r, r)
opregreg(s, v.Op.Asm(), r, v.Args[0].Reg())
case ssa.OpAMD64MOVQi2f, ssa.OpAMD64MOVQf2i, ssa.OpAMD64MOVLi2f, ssa.OpAMD64MOVLf2i:
var p *obj.Prog
switch v.Op {
case ssa.OpAMD64MOVQi2f, ssa.OpAMD64MOVQf2i:
p = s.Prog(x86.AMOVQ)
case ssa.OpAMD64MOVLi2f, ssa.OpAMD64MOVLf2i:
p = s.Prog(x86.AMOVL)
}
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[0].Reg()
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpAMD64ADDQload, ssa.OpAMD64ADDLload, ssa.OpAMD64SUBQload, ssa.OpAMD64SUBLload,
ssa.OpAMD64ANDQload, ssa.OpAMD64ANDLload, ssa.OpAMD64ORQload, ssa.OpAMD64ORLload,
ssa.OpAMD64XORQload, ssa.OpAMD64XORLload, ssa.OpAMD64ADDSDload, ssa.OpAMD64ADDSSload,
cmd/compile: optimize AMD64 with DIVSSload and DIVSDload DIVSSload & DIVSDload directly operate on a memory operand. And binary size can be reduced by them, while the performance is not affected. The total size of pkg/linux_amd64 (excluding cmd/compile) decreases about 6KB. There is little regression in the go1 benchmark test (excluding noise). name old time/op new time/op delta BinaryTree17-4 2.63s ± 4% 2.62s ± 4% ~ (p=0.809 n=30+30) Fannkuch11-4 2.40s ± 2% 2.40s ± 2% ~ (p=0.109 n=30+30) FmtFprintfEmpty-4 43.1ns ± 4% 43.2ns ± 9% ~ (p=0.168 n=30+30) FmtFprintfString-4 73.6ns ± 4% 74.1ns ± 4% ~ (p=0.069 n=30+30) FmtFprintfInt-4 81.0ns ± 3% 81.4ns ± 5% ~ (p=0.350 n=30+30) FmtFprintfIntInt-4 127ns ± 4% 129ns ± 4% +0.99% (p=0.021 n=30+30) FmtFprintfPrefixedInt-4 156ns ± 4% 155ns ± 4% ~ (p=0.415 n=30+30) FmtFprintfFloat-4 219ns ± 4% 218ns ± 4% ~ (p=0.071 n=30+30) FmtManyArgs-4 522ns ± 3% 518ns ± 3% -0.68% (p=0.034 n=30+30) GobDecode-4 6.49ms ± 6% 6.52ms ± 6% ~ (p=0.832 n=30+30) GobEncode-4 6.10ms ± 9% 6.14ms ± 7% ~ (p=0.485 n=30+30) Gzip-4 227ms ± 1% 224ms ± 4% ~ (p=0.484 n=24+30) Gunzip-4 37.2ms ± 3% 36.8ms ± 4% ~ (p=0.889 n=30+30) HTTPClientServer-4 58.9µs ± 1% 58.7µs ± 2% -0.42% (p=0.003 n=28+28) JSONEncode-4 12.0ms ± 3% 12.0ms ± 4% ~ (p=0.523 n=30+30) JSONDecode-4 54.6ms ± 4% 54.5ms ± 4% ~ (p=0.708 n=30+30) Mandelbrot200-4 3.78ms ± 4% 3.81ms ± 3% +0.99% (p=0.016 n=30+30) GoParse-4 3.20ms ± 4% 3.20ms ± 5% ~ (p=0.994 n=30+30) RegexpMatchEasy0_32-4 77.0ns ± 4% 75.9ns ± 3% -1.39% (p=0.006 n=29+30) RegexpMatchEasy0_1K-4 255ns ± 4% 253ns ± 4% ~ (p=0.091 n=30+30) RegexpMatchEasy1_32-4 69.7ns ± 3% 70.3ns ± 4% ~ (p=0.120 n=30+30) RegexpMatchEasy1_1K-4 373ns ± 2% 378ns ± 3% +1.43% (p=0.000 n=21+26) RegexpMatchMedium_32-4 107ns ± 2% 108ns ± 4% +1.50% (p=0.012 n=22+30) RegexpMatchMedium_1K-4 34.0µs ± 1% 34.3µs ± 3% +1.08% (p=0.008 n=24+30) RegexpMatchHard_32-4 1.53µs ± 3% 1.54µs ± 3% ~ (p=0.234 n=30+30) RegexpMatchHard_1K-4 46.7µs ± 4% 47.0µs ± 4% ~ (p=0.420 n=30+30) Revcomp-4 411ms ± 7% 415ms ± 6% ~ (p=0.059 n=30+30) Template-4 65.5ms ± 5% 66.9ms ± 4% +2.21% (p=0.001 n=30+30) TimeParse-4 317ns ± 3% 311ns ± 3% -1.97% (p=0.000 n=30+30) TimeFormat-4 293ns ± 3% 294ns ± 3% ~ (p=0.243 n=30+30) [Geo mean] 47.4µs 47.5µs +0.17% name old speed new speed delta GobDecode-4 118MB/s ± 5% 118MB/s ± 6% ~ (p=0.832 n=30+30) GobEncode-4 125MB/s ± 7% 125MB/s ± 7% ~ (p=0.625 n=29+30) Gzip-4 85.3MB/s ± 1% 86.6MB/s ± 4% ~ (p=0.486 n=24+30) Gunzip-4 522MB/s ± 3% 527MB/s ± 4% ~ (p=0.889 n=30+30) JSONEncode-4 162MB/s ± 3% 162MB/s ± 4% ~ (p=0.520 n=30+30) JSONDecode-4 35.5MB/s ± 4% 35.6MB/s ± 4% ~ (p=0.701 n=30+30) GoParse-4 18.1MB/s ± 4% 18.1MB/s ± 4% ~ (p=0.891 n=29+30) RegexpMatchEasy0_32-4 416MB/s ± 4% 422MB/s ± 3% +1.43% (p=0.005 n=29+30) RegexpMatchEasy0_1K-4 4.01GB/s ± 4% 4.04GB/s ± 4% ~ (p=0.091 n=30+30) RegexpMatchEasy1_32-4 460MB/s ± 3% 456MB/s ± 5% ~ (p=0.123 n=30+30) RegexpMatchEasy1_1K-4 2.74GB/s ± 2% 2.70GB/s ± 3% -1.33% (p=0.000 n=22+26) RegexpMatchMedium_32-4 9.39MB/s ± 3% 9.19MB/s ± 4% -2.06% (p=0.001 n=28+30) RegexpMatchMedium_1K-4 30.1MB/s ± 1% 29.8MB/s ± 3% -1.04% (p=0.008 n=24+30) RegexpMatchHard_32-4 20.9MB/s ± 3% 20.8MB/s ± 3% ~ (p=0.234 n=30+30) RegexpMatchHard_1K-4 21.9MB/s ± 4% 21.8MB/s ± 4% ~ (p=0.420 n=30+30) Revcomp-4 619MB/s ± 7% 612MB/s ± 7% ~ (p=0.059 n=30+30) Template-4 29.6MB/s ± 4% 29.0MB/s ± 4% -2.16% (p=0.002 n=30+30) [Geo mean] 123MB/s 123MB/s -0.33% Change-Id: Ia59e077feae4f2824df79059daea4d0f678e3e4c Reviewed-on: https://go-review.googlesource.com/120275 Run-TryBot: Ben Shi <powerman1st@163.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Ilya Tocar <ilya.tocar@intel.com>
2018-06-21 10:14:18 +00:00
ssa.OpAMD64SUBSDload, ssa.OpAMD64SUBSSload, ssa.OpAMD64MULSDload, ssa.OpAMD64MULSSload,
ssa.OpAMD64DIVSDload, ssa.OpAMD64DIVSSload:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_MEM
p.From.Reg = v.Args[1].Reg()
[dev.regabi] cmd/compile: split out package ssagen [generated] [git-generate] cd src/cmd/compile/internal/gc rf ' # maxOpenDefers is declared in ssa.go but used only by walk. mv maxOpenDefers walk.go # gc.Arch -> ssagen.Arch # It is not as nice but will do for now. mv Arch ArchInfo mv thearch Arch mv Arch ArchInfo arch.go # Pull dwarf out of pgen.go. mv debuginfo declPos createDwarfVars preInliningDcls \ createSimpleVars createSimpleVar \ createComplexVars createComplexVar \ dwarf.go # Pull high-level compilation out of pgen.go, # leaving only the SSA code. mv compilequeue funccompile compile compilenow \ compileFunctions isInlinableButNotInlined \ initLSym \ compile.go mv BoundsCheckFunc GCWriteBarrierReg ssa.go mv largeStack largeStackFrames CheckLargeStacks pgen.go # All that is left in dcl.go is the nowritebarrierrecCheck mv dcl.go nowb.go # Export API and unexport non-API. mv initssaconfig InitConfig mv isIntrinsicCall IsIntrinsicCall mv ssaDumpInline DumpInline mv initSSATables InitTables mv initSSAEnv InitEnv mv compileSSA Compile mv stackOffset StackOffset mv canSSAType TypeOK mv SSAGenState State mv FwdRefAux fwdRefAux mv cgoSymABIs CgoSymABIs mv readSymABIs ReadSymABIs mv initLSym InitLSym mv useABIWrapGen symabiDefs CgoSymABIs ReadSymABIs InitLSym selectLSym makeABIWrapper setupTextLSym abi.go mv arch.go abi.go nowb.go phi.go pgen.go pgen_test.go ssa.go cmd/compile/internal/ssagen ' rm go.go gsubr.go Change-Id: I47fad6cbf1d1e583fd9139003a08401d7cd048a1 Reviewed-on: https://go-review.googlesource.com/c/go/+/279476 Trust: Russ Cox <rsc@golang.org> Run-TryBot: Russ Cox <rsc@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com>
2020-12-23 00:57:10 -05:00
ssagen.AddAux(&p.From, v)
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpAMD64ADDLloadidx1, ssa.OpAMD64ADDLloadidx4, ssa.OpAMD64ADDLloadidx8, ssa.OpAMD64ADDQloadidx1, ssa.OpAMD64ADDQloadidx8,
ssa.OpAMD64SUBLloadidx1, ssa.OpAMD64SUBLloadidx4, ssa.OpAMD64SUBLloadidx8, ssa.OpAMD64SUBQloadidx1, ssa.OpAMD64SUBQloadidx8,
ssa.OpAMD64ANDLloadidx1, ssa.OpAMD64ANDLloadidx4, ssa.OpAMD64ANDLloadidx8, ssa.OpAMD64ANDQloadidx1, ssa.OpAMD64ANDQloadidx8,
ssa.OpAMD64ORLloadidx1, ssa.OpAMD64ORLloadidx4, ssa.OpAMD64ORLloadidx8, ssa.OpAMD64ORQloadidx1, ssa.OpAMD64ORQloadidx8,
ssa.OpAMD64XORLloadidx1, ssa.OpAMD64XORLloadidx4, ssa.OpAMD64XORLloadidx8, ssa.OpAMD64XORQloadidx1, ssa.OpAMD64XORQloadidx8,
ssa.OpAMD64ADDSSloadidx1, ssa.OpAMD64ADDSSloadidx4, ssa.OpAMD64ADDSDloadidx1, ssa.OpAMD64ADDSDloadidx8,
ssa.OpAMD64SUBSSloadidx1, ssa.OpAMD64SUBSSloadidx4, ssa.OpAMD64SUBSDloadidx1, ssa.OpAMD64SUBSDloadidx8,
ssa.OpAMD64MULSSloadidx1, ssa.OpAMD64MULSSloadidx4, ssa.OpAMD64MULSDloadidx1, ssa.OpAMD64MULSDloadidx8,
ssa.OpAMD64DIVSSloadidx1, ssa.OpAMD64DIVSSloadidx4, ssa.OpAMD64DIVSDloadidx1, ssa.OpAMD64DIVSDloadidx8:
p := s.Prog(v.Op.Asm())
r, i := v.Args[1].Reg(), v.Args[2].Reg()
p.From.Type = obj.TYPE_MEM
p.From.Scale = v.Op.Scale()
if p.From.Scale == 1 && i == x86.REG_SP {
r, i = i, r
}
p.From.Reg = r
p.From.Index = i
[dev.regabi] cmd/compile: split out package ssagen [generated] [git-generate] cd src/cmd/compile/internal/gc rf ' # maxOpenDefers is declared in ssa.go but used only by walk. mv maxOpenDefers walk.go # gc.Arch -> ssagen.Arch # It is not as nice but will do for now. mv Arch ArchInfo mv thearch Arch mv Arch ArchInfo arch.go # Pull dwarf out of pgen.go. mv debuginfo declPos createDwarfVars preInliningDcls \ createSimpleVars createSimpleVar \ createComplexVars createComplexVar \ dwarf.go # Pull high-level compilation out of pgen.go, # leaving only the SSA code. mv compilequeue funccompile compile compilenow \ compileFunctions isInlinableButNotInlined \ initLSym \ compile.go mv BoundsCheckFunc GCWriteBarrierReg ssa.go mv largeStack largeStackFrames CheckLargeStacks pgen.go # All that is left in dcl.go is the nowritebarrierrecCheck mv dcl.go nowb.go # Export API and unexport non-API. mv initssaconfig InitConfig mv isIntrinsicCall IsIntrinsicCall mv ssaDumpInline DumpInline mv initSSATables InitTables mv initSSAEnv InitEnv mv compileSSA Compile mv stackOffset StackOffset mv canSSAType TypeOK mv SSAGenState State mv FwdRefAux fwdRefAux mv cgoSymABIs CgoSymABIs mv readSymABIs ReadSymABIs mv initLSym InitLSym mv useABIWrapGen symabiDefs CgoSymABIs ReadSymABIs InitLSym selectLSym makeABIWrapper setupTextLSym abi.go mv arch.go abi.go nowb.go phi.go pgen.go pgen_test.go ssa.go cmd/compile/internal/ssagen ' rm go.go gsubr.go Change-Id: I47fad6cbf1d1e583fd9139003a08401d7cd048a1 Reviewed-on: https://go-review.googlesource.com/c/go/+/279476 Trust: Russ Cox <rsc@golang.org> Run-TryBot: Russ Cox <rsc@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com>
2020-12-23 00:57:10 -05:00
ssagen.AddAux(&p.From, v)
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpAMD64LoweredZero:
if s.ABI != obj.ABIInternal {
// zero X15 manually
opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
}
ptrReg := v.Args[0].Reg()
n := v.AuxInt
if n < 16 {
v.Fatalf("Zero too small %d", n)
}
zero16 := func(off int64) {
zero16(s, ptrReg, off)
}
// Generate zeroing instructions.
var off int64
for n >= 16 {
zero16(off)
off += 16
n -= 16
}
if n != 0 {
// use partially overlapped write.
// TODO: n <= 8, use smaller write?
zero16(off + n - 16)
}
case ssa.OpAMD64LoweredZeroLoop:
if s.ABI != obj.ABIInternal {
// zero X15 manually
opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
}
ptrReg := v.Args[0].Reg()
countReg := v.RegTmp()
n := v.AuxInt
loopSize := int64(64)
if n < 3*loopSize {
// - a loop count of 0 won't work.
// - a loop count of 1 is useless.
// - a loop count of 2 is a code size ~tie
// 4 instructions to implement the loop
// 4 instructions in the loop body
// vs
// 8 instructions in the straightline code
// Might as well use straightline code.
v.Fatalf("ZeroLoop size too small %d", n)
}
zero16 := func(off int64) {
zero16(s, ptrReg, off)
}
// Put iteration count in a register.
// MOVL $n, countReg
p := s.Prog(x86.AMOVL)
p.From.Type = obj.TYPE_CONST
p.From.Offset = n / loopSize
p.To.Type = obj.TYPE_REG
p.To.Reg = countReg
cntInit := p
// Zero loopSize bytes starting at ptrReg.
for i := range loopSize / 16 {
zero16(i * 16)
}
// ADDQ $loopSize, ptrReg
p = s.Prog(x86.AADDQ)
p.From.Type = obj.TYPE_CONST
p.From.Offset = loopSize
p.To.Type = obj.TYPE_REG
p.To.Reg = ptrReg
// DECL countReg
p = s.Prog(x86.ADECL)
p.To.Type = obj.TYPE_REG
p.To.Reg = countReg
// Jump to first instruction in loop if we're not done yet.
// JNE head
p = s.Prog(x86.AJNE)
p.To.Type = obj.TYPE_BRANCH
p.To.SetTarget(cntInit.Link)
// Multiples of the loop size are now done.
n %= loopSize
// Write any fractional portion.
var off int64
for n >= 16 {
zero16(off)
off += 16
n -= 16
}
if n != 0 {
// Use partially-overlapping write.
// TODO: n <= 8, use smaller write?
zero16(off + n - 16)
}
case ssa.OpAMD64LoweredMove:
dstReg := v.Args[0].Reg()
srcReg := v.Args[1].Reg()
if dstReg == srcReg {
break
}
tmpReg := int16(x86.REG_X14)
n := v.AuxInt
if n < 16 {
v.Fatalf("Move too small %d", n)
}
// move 16 bytes from srcReg+off to dstReg+off.
move16 := func(off int64) {
move16(s, srcReg, dstReg, tmpReg, off)
}
// Generate copying instructions.
var off int64
for n >= 16 {
move16(off)
off += 16
n -= 16
}
if n != 0 {
// use partially overlapped read/write.
// TODO: use smaller operations when we can?
move16(off + n - 16)
}
case ssa.OpAMD64LoweredMoveLoop:
dstReg := v.Args[0].Reg()
srcReg := v.Args[1].Reg()
if dstReg == srcReg {
break
}
countReg := v.RegTmp()
tmpReg := int16(x86.REG_X14)
n := v.AuxInt
loopSize := int64(64)
if n < 3*loopSize {
// - a loop count of 0 won't work.
// - a loop count of 1 is useless.
// - a loop count of 2 is a code size ~tie
// 4 instructions to implement the loop
// 4 instructions in the loop body
// vs
// 8 instructions in the straightline code
// Might as well use straightline code.
v.Fatalf("ZeroLoop size too small %d", n)
}
// move 16 bytes from srcReg+off to dstReg+off.
move16 := func(off int64) {
move16(s, srcReg, dstReg, tmpReg, off)
}
// Put iteration count in a register.
// MOVL $n, countReg
p := s.Prog(x86.AMOVL)
p.From.Type = obj.TYPE_CONST
p.From.Offset = n / loopSize
p.To.Type = obj.TYPE_REG
p.To.Reg = countReg
cntInit := p
// Copy loopSize bytes starting at srcReg to dstReg.
for i := range loopSize / 16 {
move16(i * 16)
}
// ADDQ $loopSize, srcReg
p = s.Prog(x86.AADDQ)
p.From.Type = obj.TYPE_CONST
p.From.Offset = loopSize
p.To.Type = obj.TYPE_REG
p.To.Reg = srcReg
// ADDQ $loopSize, dstReg
p = s.Prog(x86.AADDQ)
p.From.Type = obj.TYPE_CONST
p.From.Offset = loopSize
p.To.Type = obj.TYPE_REG
p.To.Reg = dstReg
// DECL countReg
p = s.Prog(x86.ADECL)
p.To.Type = obj.TYPE_REG
p.To.Reg = countReg
// Jump to loop header if we're not done yet.
// JNE head
p = s.Prog(x86.AJNE)
p.To.Type = obj.TYPE_BRANCH
p.To.SetTarget(cntInit.Link)
// Multiples of the loop size are now done.
n %= loopSize
// Copy any fractional portion.
var off int64
for n >= 16 {
move16(off)
off += 16
n -= 16
}
if n != 0 {
// Use partially-overlapping copy.
move16(off + n - 16)
}
case ssa.OpCopy: // TODO: use MOVQreg for reg->reg copies instead of OpCopy?
if v.Type.IsMemory() {
return
}
x := v.Args[0].Reg()
y := v.Reg()
if v.Type.IsSIMD() {
x = simdOrMaskReg(v.Args[0])
y = simdOrMaskReg(v)
}
if x != y {
opregreg(s, moveByRegsWidth(y, x, v.Type.Size()), y, x)
}
case ssa.OpLoadReg:
if v.Type.IsFlags() {
v.Fatalf("load flags not implemented: %v", v.LongString())
return
}
r := v.Reg()
p := s.Prog(loadByRegWidth(r, v.Type.Size()))
[dev.regabi] cmd/compile: split out package ssagen [generated] [git-generate] cd src/cmd/compile/internal/gc rf ' # maxOpenDefers is declared in ssa.go but used only by walk. mv maxOpenDefers walk.go # gc.Arch -> ssagen.Arch # It is not as nice but will do for now. mv Arch ArchInfo mv thearch Arch mv Arch ArchInfo arch.go # Pull dwarf out of pgen.go. mv debuginfo declPos createDwarfVars preInliningDcls \ createSimpleVars createSimpleVar \ createComplexVars createComplexVar \ dwarf.go # Pull high-level compilation out of pgen.go, # leaving only the SSA code. mv compilequeue funccompile compile compilenow \ compileFunctions isInlinableButNotInlined \ initLSym \ compile.go mv BoundsCheckFunc GCWriteBarrierReg ssa.go mv largeStack largeStackFrames CheckLargeStacks pgen.go # All that is left in dcl.go is the nowritebarrierrecCheck mv dcl.go nowb.go # Export API and unexport non-API. mv initssaconfig InitConfig mv isIntrinsicCall IsIntrinsicCall mv ssaDumpInline DumpInline mv initSSATables InitTables mv initSSAEnv InitEnv mv compileSSA Compile mv stackOffset StackOffset mv canSSAType TypeOK mv SSAGenState State mv FwdRefAux fwdRefAux mv cgoSymABIs CgoSymABIs mv readSymABIs ReadSymABIs mv initLSym InitLSym mv useABIWrapGen symabiDefs CgoSymABIs ReadSymABIs InitLSym selectLSym makeABIWrapper setupTextLSym abi.go mv arch.go abi.go nowb.go phi.go pgen.go pgen_test.go ssa.go cmd/compile/internal/ssagen ' rm go.go gsubr.go Change-Id: I47fad6cbf1d1e583fd9139003a08401d7cd048a1 Reviewed-on: https://go-review.googlesource.com/c/go/+/279476 Trust: Russ Cox <rsc@golang.org> Run-TryBot: Russ Cox <rsc@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com>
2020-12-23 00:57:10 -05:00
ssagen.AddrAuto(&p.From, v.Args[0])
p.To.Type = obj.TYPE_REG
if v.Type.IsSIMD() {
r = simdOrMaskReg(v)
}
p.To.Reg = r
case ssa.OpStoreReg:
if v.Type.IsFlags() {
v.Fatalf("store flags not implemented: %v", v.LongString())
return
}
r := v.Args[0].Reg()
if v.Type.IsSIMD() {
r = simdOrMaskReg(v.Args[0])
}
p := s.Prog(storeByRegWidth(r, v.Type.Size()))
p.From.Type = obj.TYPE_REG
p.From.Reg = r
[dev.regabi] cmd/compile: split out package ssagen [generated] [git-generate] cd src/cmd/compile/internal/gc rf ' # maxOpenDefers is declared in ssa.go but used only by walk. mv maxOpenDefers walk.go # gc.Arch -> ssagen.Arch # It is not as nice but will do for now. mv Arch ArchInfo mv thearch Arch mv Arch ArchInfo arch.go # Pull dwarf out of pgen.go. mv debuginfo declPos createDwarfVars preInliningDcls \ createSimpleVars createSimpleVar \ createComplexVars createComplexVar \ dwarf.go # Pull high-level compilation out of pgen.go, # leaving only the SSA code. mv compilequeue funccompile compile compilenow \ compileFunctions isInlinableButNotInlined \ initLSym \ compile.go mv BoundsCheckFunc GCWriteBarrierReg ssa.go mv largeStack largeStackFrames CheckLargeStacks pgen.go # All that is left in dcl.go is the nowritebarrierrecCheck mv dcl.go nowb.go # Export API and unexport non-API. mv initssaconfig InitConfig mv isIntrinsicCall IsIntrinsicCall mv ssaDumpInline DumpInline mv initSSATables InitTables mv initSSAEnv InitEnv mv compileSSA Compile mv stackOffset StackOffset mv canSSAType TypeOK mv SSAGenState State mv FwdRefAux fwdRefAux mv cgoSymABIs CgoSymABIs mv readSymABIs ReadSymABIs mv initLSym InitLSym mv useABIWrapGen symabiDefs CgoSymABIs ReadSymABIs InitLSym selectLSym makeABIWrapper setupTextLSym abi.go mv arch.go abi.go nowb.go phi.go pgen.go pgen_test.go ssa.go cmd/compile/internal/ssagen ' rm go.go gsubr.go Change-Id: I47fad6cbf1d1e583fd9139003a08401d7cd048a1 Reviewed-on: https://go-review.googlesource.com/c/go/+/279476 Trust: Russ Cox <rsc@golang.org> Run-TryBot: Russ Cox <rsc@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com>
2020-12-23 00:57:10 -05:00
ssagen.AddrAuto(&p.To, v)
case ssa.OpAMD64LoweredHasCPUFeature:
cmd/compile: use shorter version of movzx for LoweredHASCPUFeature The values loaded from memory are booleans. Upper 32 bits are not needed. Use the 32-bit version of movzx. compilecmp linux/amd64: math/bits math/bits.OnesCount 83 -> 78 (-6.02%) math math.ceil 109 -> 103 (-5.50%) math.Gamma changed math.sinPi changed runtime runtime.(*sweepLocked).sweep changed runtime.(*mspan).countAlloc 182 -> 178 (-2.20%) runtime.(*pageBits).popcntRange 552 -> 542 (-1.81%) go:(**mspan).runtime.countAlloc 243 -> 232 (-4.53%) runtime.sweepLocked.countAlloc 221 -> 215 (-2.71%) runtime.(*pageCache).allocN changed math/rand math/rand.(*Zipf).Uint64 506 -> 499 (-1.38%) cmd/vendor/golang.org/x/sys/unix cmd/vendor/golang.org/x/sys/unix.(*CPUSet).Count 147 -> 145 (-1.36%) cmd/internal/obj/ppc64 cmd/internal/obj/ppc64.(*ctxt9).asmout changed cmd/cover main.htmlGen 1170 -> 1164 (-0.51%) cmd/compile/internal/bitvec cmd/compile/internal/bitvec.(*BitVec).Count 221 -> 213 (-3.62%) cmd/compile/internal/bitvec.BitVec.Count changed cmd/link/internal/loader cmd/link/internal/loader.(*Bitmap).Count 214 -> 212 (-0.93%) cmd/link/internal/loader.Bitmap.Count 171 -> 169 (-1.17%) cmd/link/internal/loader.(*Loader).NReachableSym changed cmd/link/internal/loader.(*Loader).Stat changed cmd/vendor/github.com/ianlancetaylor/demangle cmd/vendor/github.com/ianlancetaylor/demangle.oldRustToString changed cmd/vendor/github.com/google/pprof/internal/graph cmd/vendor/github.com/google/pprof/internal/graph.(*builder).addNode changed cmd/compile/internal/ssa cmd/compile/internal/ssa.rewriteValuePPC64_OpPPC64FTRUNC changed cmd/compile/internal/ssa.(*regAllocState).computeLive 10441 -> 10409 (-0.31%) cmd/compile/internal/ssa.(*regAllocState).regalloc changed cmd/compile/internal/ssa.rewriteValuePPC64_OpPPC64FCEIL changed cmd/compile/internal/ssa.(*regAllocState).allocReg changed cmd/compile/internal/ssa.rewriteValuePPC64_OpPPC64FFLOOR changed cmd/compile/internal/ssa.countRegs 83 -> 78 (-6.02%) cmd/compile/internal/liveness cmd/compile/internal/liveness.ArgLiveness.func2 changed cmd/compile/internal/amd64 cmd/compile/internal/amd64.ssaGenValue changed file before after Δ % math/bits.s 2618 2613 -5 -0.191% math.s 37246 37240 -6 -0.016% runtime.s 486910 486879 -31 -0.006% math/rand.s 9980 9973 -7 -0.070% cmd/vendor/golang.org/x/sys/unix.s 119232 119230 -2 -0.002% cmd/cover.s 31341 31335 -6 -0.019% cmd/compile/internal/bitvec.s 5542 5534 -8 -0.144% cmd/link/internal/loader.s 75315 75311 -4 -0.005% cmd/compile/internal/ssa.s 3570581 3570544 -37 -0.001% total 20041552 20041446 -106 -0.001% Change-Id: I29845744c512a1f833cb1fa3bb43b6b0e0eaac68 Reviewed-on: https://go-review.googlesource.com/c/go/+/430175 Reviewed-by: Cherry Mui <cherryyz@google.com> TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Martin Möhrmann <moehrmann@google.com> Run-TryBot: Dmitri Shuralyov <dmitshur@golang.org> Auto-Submit: Dmitri Shuralyov <dmitshur@golang.org>
2022-09-11 14:47:36 +02:00
p := s.Prog(x86.AMOVBLZX)
p.From.Type = obj.TYPE_MEM
[dev.regabi] cmd/compile: split out package ssagen [generated] [git-generate] cd src/cmd/compile/internal/gc rf ' # maxOpenDefers is declared in ssa.go but used only by walk. mv maxOpenDefers walk.go # gc.Arch -> ssagen.Arch # It is not as nice but will do for now. mv Arch ArchInfo mv thearch Arch mv Arch ArchInfo arch.go # Pull dwarf out of pgen.go. mv debuginfo declPos createDwarfVars preInliningDcls \ createSimpleVars createSimpleVar \ createComplexVars createComplexVar \ dwarf.go # Pull high-level compilation out of pgen.go, # leaving only the SSA code. mv compilequeue funccompile compile compilenow \ compileFunctions isInlinableButNotInlined \ initLSym \ compile.go mv BoundsCheckFunc GCWriteBarrierReg ssa.go mv largeStack largeStackFrames CheckLargeStacks pgen.go # All that is left in dcl.go is the nowritebarrierrecCheck mv dcl.go nowb.go # Export API and unexport non-API. mv initssaconfig InitConfig mv isIntrinsicCall IsIntrinsicCall mv ssaDumpInline DumpInline mv initSSATables InitTables mv initSSAEnv InitEnv mv compileSSA Compile mv stackOffset StackOffset mv canSSAType TypeOK mv SSAGenState State mv FwdRefAux fwdRefAux mv cgoSymABIs CgoSymABIs mv readSymABIs ReadSymABIs mv initLSym InitLSym mv useABIWrapGen symabiDefs CgoSymABIs ReadSymABIs InitLSym selectLSym makeABIWrapper setupTextLSym abi.go mv arch.go abi.go nowb.go phi.go pgen.go pgen_test.go ssa.go cmd/compile/internal/ssagen ' rm go.go gsubr.go Change-Id: I47fad6cbf1d1e583fd9139003a08401d7cd048a1 Reviewed-on: https://go-review.googlesource.com/c/go/+/279476 Trust: Russ Cox <rsc@golang.org> Run-TryBot: Russ Cox <rsc@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com>
2020-12-23 00:57:10 -05:00
ssagen.AddAux(&p.From, v)
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpArgIntReg, ssa.OpArgFloatReg:
// The assembler needs to wrap the entry safepoint/stack growth code with spill/unspill
// The loop only runs once.
for _, ap := range v.Block.Func.RegArgs {
// Pass the spill/unspill information along to the assembler, offset by size of return PC pushed on stack.
addr := ssagen.SpillSlotAddr(ap, x86.REG_SP, v.Block.Func.Config.PtrSize)
reg := ap.Reg
t := ap.Type
sz := t.Size()
if t.IsSIMD() {
reg = simdRegBySize(reg, sz)
}
s.FuncInfo().AddSpill(
obj.RegSpill{Reg: reg, Addr: addr, Unspill: loadByRegWidth(reg, sz), Spill: storeByRegWidth(reg, sz)})
}
v.Block.Func.RegArgs = nil
ssagen.CheckArgReg(v)
case ssa.OpAMD64LoweredGetClosurePtr:
// Closure pointer is DX.
[dev.regabi] cmd/compile: split out package ssagen [generated] [git-generate] cd src/cmd/compile/internal/gc rf ' # maxOpenDefers is declared in ssa.go but used only by walk. mv maxOpenDefers walk.go # gc.Arch -> ssagen.Arch # It is not as nice but will do for now. mv Arch ArchInfo mv thearch Arch mv Arch ArchInfo arch.go # Pull dwarf out of pgen.go. mv debuginfo declPos createDwarfVars preInliningDcls \ createSimpleVars createSimpleVar \ createComplexVars createComplexVar \ dwarf.go # Pull high-level compilation out of pgen.go, # leaving only the SSA code. mv compilequeue funccompile compile compilenow \ compileFunctions isInlinableButNotInlined \ initLSym \ compile.go mv BoundsCheckFunc GCWriteBarrierReg ssa.go mv largeStack largeStackFrames CheckLargeStacks pgen.go # All that is left in dcl.go is the nowritebarrierrecCheck mv dcl.go nowb.go # Export API and unexport non-API. mv initssaconfig InitConfig mv isIntrinsicCall IsIntrinsicCall mv ssaDumpInline DumpInline mv initSSATables InitTables mv initSSAEnv InitEnv mv compileSSA Compile mv stackOffset StackOffset mv canSSAType TypeOK mv SSAGenState State mv FwdRefAux fwdRefAux mv cgoSymABIs CgoSymABIs mv readSymABIs ReadSymABIs mv initLSym InitLSym mv useABIWrapGen symabiDefs CgoSymABIs ReadSymABIs InitLSym selectLSym makeABIWrapper setupTextLSym abi.go mv arch.go abi.go nowb.go phi.go pgen.go pgen_test.go ssa.go cmd/compile/internal/ssagen ' rm go.go gsubr.go Change-Id: I47fad6cbf1d1e583fd9139003a08401d7cd048a1 Reviewed-on: https://go-review.googlesource.com/c/go/+/279476 Trust: Russ Cox <rsc@golang.org> Run-TryBot: Russ Cox <rsc@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com>
2020-12-23 00:57:10 -05:00
ssagen.CheckLoweredGetClosurePtr(v)
case ssa.OpAMD64LoweredGetG:
if s.ABI == obj.ABIInternal {
v.Fatalf("LoweredGetG should not appear in ABIInternal")
}
r := v.Reg()
getgFromTLS(s, r)
cmd/compile: restore tail call for method wrappers For certain type of method wrappers we used to generate a tail call. That was disabled in CL 307234 when register ABI is used, because with the current IR it was difficult to generate a tail call with the arguments in the right places. The problem was that the IR does not contain a CALL-like node with arguments; instead, it contains an OAS node that adjusts the receiver, than an OTAILCALL node that just contains the target, but no argument (with the assumption that the OAS node will put the adjusted receiver in the right place). With register ABI, putting arguments in registers are done in SSA. The assignment (OAS) doesn't put the receiver in register. This CL changes the IR of a tail call to take an actual OCALL node. Specifically, a tail call is represented as OTAILCALL (OCALL target args...) This way, the call target and args are connected through the OCALL node. So the call can be analyzed in SSA and the args can be passed in the right places. (Alternatively, we could have OTAILCALL node directly take the target and the args, without the OCALL node. Using an OCALL node is convenient as there are existing code that processes OCALL nodes which do not need to be changed. Also, a tail call is similar to ORETURN (OCALL target args...), except it doesn't preserve the frame. I did the former but I'm open to change.) The SSA representation is similar. Previously, the IR lowers to a Store the receiver then a BlockRetJmp which jumps to the target (without putting the arg in register). Now we use a TailCall op, which takes the target and the args. The call expansion pass and the register allocator handles TailCall pretty much like a StaticCall, and it will do the right ABI analysis and put the args in the right places. (Args other than the receiver are already in the right places. For register args it generates no code for them. For stack args currently it generates a self copy. I'll work on optimize that out.) BlockRetJmp is still used, signaling it is a tail call. The actual call is made in the TailCall op so BlockRetJmp generates no code (we could use BlockExit if we like). This slightly reduces binary size: old new cmd/go 14003088 13953936 cmd/link 6275552 6271456 Change-Id: I2d16d8d419fe1f17554916d317427383e17e27f0 Reviewed-on: https://go-review.googlesource.com/c/go/+/350145 Trust: Cherry Mui <cherryyz@google.com> Run-TryBot: Cherry Mui <cherryyz@google.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com> Reviewed-by: David Chase <drchase@google.com>
2021-09-10 22:05:55 -04:00
case ssa.OpAMD64CALLstatic, ssa.OpAMD64CALLtail:
if s.ABI == obj.ABI0 && v.Aux.(*ssa.AuxCall).Fn.ABI() == obj.ABIInternal {
// zeroing X15 when entering ABIInternal from ABI0
zeroX15(s)
// set G register from TLS
getgFromTLS(s, x86.REG_R14)
}
cmd/compile: restore tail call for method wrappers For certain type of method wrappers we used to generate a tail call. That was disabled in CL 307234 when register ABI is used, because with the current IR it was difficult to generate a tail call with the arguments in the right places. The problem was that the IR does not contain a CALL-like node with arguments; instead, it contains an OAS node that adjusts the receiver, than an OTAILCALL node that just contains the target, but no argument (with the assumption that the OAS node will put the adjusted receiver in the right place). With register ABI, putting arguments in registers are done in SSA. The assignment (OAS) doesn't put the receiver in register. This CL changes the IR of a tail call to take an actual OCALL node. Specifically, a tail call is represented as OTAILCALL (OCALL target args...) This way, the call target and args are connected through the OCALL node. So the call can be analyzed in SSA and the args can be passed in the right places. (Alternatively, we could have OTAILCALL node directly take the target and the args, without the OCALL node. Using an OCALL node is convenient as there are existing code that processes OCALL nodes which do not need to be changed. Also, a tail call is similar to ORETURN (OCALL target args...), except it doesn't preserve the frame. I did the former but I'm open to change.) The SSA representation is similar. Previously, the IR lowers to a Store the receiver then a BlockRetJmp which jumps to the target (without putting the arg in register). Now we use a TailCall op, which takes the target and the args. The call expansion pass and the register allocator handles TailCall pretty much like a StaticCall, and it will do the right ABI analysis and put the args in the right places. (Args other than the receiver are already in the right places. For register args it generates no code for them. For stack args currently it generates a self copy. I'll work on optimize that out.) BlockRetJmp is still used, signaling it is a tail call. The actual call is made in the TailCall op so BlockRetJmp generates no code (we could use BlockExit if we like). This slightly reduces binary size: old new cmd/go 14003088 13953936 cmd/link 6275552 6271456 Change-Id: I2d16d8d419fe1f17554916d317427383e17e27f0 Reviewed-on: https://go-review.googlesource.com/c/go/+/350145 Trust: Cherry Mui <cherryyz@google.com> Run-TryBot: Cherry Mui <cherryyz@google.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com> Reviewed-by: David Chase <drchase@google.com>
2021-09-10 22:05:55 -04:00
if v.Op == ssa.OpAMD64CALLtail {
s.TailCall(v)
break
}
s.Call(v)
if s.ABI == obj.ABIInternal && v.Aux.(*ssa.AuxCall).Fn.ABI() == obj.ABI0 {
// zeroing X15 when entering ABIInternal from ABI0
zeroX15(s)
// set G register from TLS
getgFromTLS(s, x86.REG_R14)
}
case ssa.OpAMD64CALLclosure, ssa.OpAMD64CALLinter:
s.Call(v)
case ssa.OpAMD64LoweredGetCallerPC:
p := s.Prog(x86.AMOVQ)
p.From.Type = obj.TYPE_MEM
p.From.Offset = -8 // PC is stored 8 bytes below first parameter.
p.From.Name = obj.NAME_PARAM
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpAMD64LoweredGetCallerSP:
// caller's SP is the address of the first arg
mov := x86.AMOVQ
[dev.regabi] cmd/compile: move type size calculations into package types [generated] To break up package gc, we need to put these calculations somewhere lower in the import graph, either an existing or new package. Package types already needs this code and is using hacks to get it without an import cycle. We can remove the hacks and set up for the new package gc by moving the code into package types itself. [git-generate] cd src/cmd/compile/internal/gc rf ' # Remove old import cycle hacks in gc. rm TypecheckInit:/types.Widthptr =/-0,/types.Dowidth =/+0 \ ../ssa/export_test.go:/types.Dowidth =/-+ ex { import "cmd/compile/internal/types" types.Widthptr -> Widthptr types.Dowidth -> dowidth } # Disable CalcSize in tests instead of base.Fatalf sub dowidth:/base.Fatalf\("dowidth without betypeinit"\)/ \ // Assume this is a test. \ return # Move size calculation into cmd/compile/internal/types mv Widthptr PtrSize mv Widthreg RegSize mv slicePtrOffset SlicePtrOffset mv sliceLenOffset SliceLenOffset mv sliceCapOffset SliceCapOffset mv sizeofSlice SliceSize mv sizeofString StringSize mv skipDowidthForTracing SkipSizeForTracing mv dowidth CalcSize mv checkwidth CheckSize mv widstruct calcStructOffset mv sizeCalculationDisabled CalcSizeDisabled mv defercheckwidth DeferCheckSize mv resumecheckwidth ResumeCheckSize mv typeptrdata PtrDataSize mv \ PtrSize RegSize SlicePtrOffset SkipSizeForTracing typePos align.go PtrDataSize \ size.go mv size.go cmd/compile/internal/types ' : # Remove old import cycle hacks in types. cd ../types rf ' ex { Widthptr -> PtrSize Dowidth -> CalcSize } rm Widthptr Dowidth ' Change-Id: Ib96cdc6bda2617235480c29392ea5cfb20f60cd8 Reviewed-on: https://go-review.googlesource.com/c/go/+/279234 Trust: Russ Cox <rsc@golang.org> Run-TryBot: Russ Cox <rsc@golang.org> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com>
2020-12-23 00:39:45 -05:00
if types.PtrSize == 4 {
mov = x86.AMOVL
}
p := s.Prog(mov)
p.From.Type = obj.TYPE_ADDR
p.From.Offset = -base.Ctxt.Arch.FixedFrameSize // 0 on amd64, just to be consistent with other architectures
p.From.Name = obj.NAME_PARAM
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
cmd/compile: compiler support for buffered write barrier This CL implements the compiler support for calling the buffered write barrier added by the previous CL. Since the buffered write barrier is only implemented on amd64 right now, this still supports the old, eager write barrier as well. There's little overhead to supporting both and this way a few tests in test/fixedbugs that expect to have liveness maps at write barrier calls can easily opt-in to the old, eager barrier. This significantly improves the performance of the write barrier: name old time/op new time/op delta WriteBarrier-12 73.5ns ±20% 19.2ns ±27% -73.90% (p=0.000 n=19+18) It also reduces the size of binaries because the write barrier call is more compact: name old object-bytes new object-bytes delta Template 398k ± 0% 393k ± 0% -1.14% (p=0.008 n=5+5) Unicode 208k ± 0% 206k ± 0% -1.00% (p=0.008 n=5+5) GoTypes 1.18M ± 0% 1.15M ± 0% -2.00% (p=0.008 n=5+5) Compiler 4.05M ± 0% 3.88M ± 0% -4.26% (p=0.008 n=5+5) SSA 8.25M ± 0% 8.11M ± 0% -1.59% (p=0.008 n=5+5) Flate 228k ± 0% 224k ± 0% -1.83% (p=0.008 n=5+5) GoParser 295k ± 0% 284k ± 0% -3.62% (p=0.008 n=5+5) Reflect 1.00M ± 0% 0.99M ± 0% -0.70% (p=0.008 n=5+5) Tar 339k ± 0% 333k ± 0% -1.67% (p=0.008 n=5+5) XML 404k ± 0% 395k ± 0% -2.10% (p=0.008 n=5+5) [Geo mean] 704k 690k -2.00% name old exe-bytes new exe-bytes delta HelloSize 1.05M ± 0% 1.04M ± 0% -1.55% (p=0.008 n=5+5) https://perf.golang.org/search?q=upload:20171027.1 (Amusingly, this also reduces compiler allocations by 0.75%, which, combined with the better write barrier, speeds up the compiler overall by 2.10%. See the perf link.) It slightly improves the performance of most of the go1 benchmarks and improves the performance of the x/benchmarks: name old time/op new time/op delta BinaryTree17-12 2.40s ± 1% 2.47s ± 1% +2.69% (p=0.000 n=19+19) Fannkuch11-12 2.95s ± 0% 2.95s ± 0% +0.21% (p=0.000 n=20+19) FmtFprintfEmpty-12 41.8ns ± 4% 41.4ns ± 2% -1.03% (p=0.014 n=20+20) FmtFprintfString-12 68.7ns ± 2% 67.5ns ± 1% -1.75% (p=0.000 n=20+17) FmtFprintfInt-12 79.0ns ± 3% 77.1ns ± 1% -2.40% (p=0.000 n=19+17) FmtFprintfIntInt-12 127ns ± 1% 123ns ± 3% -3.42% (p=0.000 n=20+20) FmtFprintfPrefixedInt-12 152ns ± 1% 150ns ± 1% -1.02% (p=0.000 n=18+17) FmtFprintfFloat-12 211ns ± 1% 209ns ± 0% -0.99% (p=0.000 n=20+16) FmtManyArgs-12 500ns ± 0% 496ns ± 0% -0.73% (p=0.000 n=17+20) GobDecode-12 6.44ms ± 1% 6.53ms ± 0% +1.28% (p=0.000 n=20+19) GobEncode-12 5.46ms ± 0% 5.46ms ± 1% ~ (p=0.550 n=19+20) Gzip-12 220ms ± 1% 216ms ± 0% -1.75% (p=0.000 n=19+19) Gunzip-12 38.8ms ± 0% 38.6ms ± 0% -0.30% (p=0.000 n=18+19) HTTPClientServer-12 79.0µs ± 1% 78.2µs ± 1% -1.01% (p=0.000 n=20+20) JSONEncode-12 11.9ms ± 0% 11.9ms ± 0% -0.29% (p=0.000 n=20+19) JSONDecode-12 52.6ms ± 0% 52.2ms ± 0% -0.68% (p=0.000 n=19+20) Mandelbrot200-12 3.69ms ± 0% 3.68ms ± 0% -0.36% (p=0.000 n=20+20) GoParse-12 3.13ms ± 1% 3.18ms ± 1% +1.67% (p=0.000 n=19+20) RegexpMatchEasy0_32-12 73.2ns ± 1% 72.3ns ± 1% -1.19% (p=0.000 n=19+18) RegexpMatchEasy0_1K-12 241ns ± 0% 239ns ± 0% -0.83% (p=0.000 n=17+16) RegexpMatchEasy1_32-12 68.6ns ± 1% 69.0ns ± 1% +0.47% (p=0.015 n=18+16) RegexpMatchEasy1_1K-12 364ns ± 0% 361ns ± 0% -0.67% (p=0.000 n=16+17) RegexpMatchMedium_32-12 104ns ± 1% 103ns ± 1% -0.79% (p=0.001 n=20+15) RegexpMatchMedium_1K-12 33.8µs ± 3% 34.0µs ± 2% ~ (p=0.267 n=20+19) RegexpMatchHard_32-12 1.64µs ± 1% 1.62µs ± 2% -1.25% (p=0.000 n=19+18) RegexpMatchHard_1K-12 49.2µs ± 0% 48.7µs ± 1% -0.93% (p=0.000 n=19+18) Revcomp-12 391ms ± 5% 396ms ± 7% ~ (p=0.154 n=19+19) Template-12 63.1ms ± 0% 59.5ms ± 0% -5.76% (p=0.000 n=18+19) TimeParse-12 307ns ± 0% 306ns ± 0% -0.39% (p=0.000 n=19+17) TimeFormat-12 325ns ± 0% 323ns ± 0% -0.50% (p=0.000 n=19+19) [Geo mean] 47.3µs 46.9µs -0.67% https://perf.golang.org/search?q=upload:20171026.1 name old time/op new time/op delta Garbage/benchmem-MB=64-12 2.25ms ± 1% 2.20ms ± 1% -2.31% (p=0.000 n=18+18) HTTP-12 12.6µs ± 0% 12.6µs ± 0% -0.72% (p=0.000 n=18+17) JSON-12 11.0ms ± 0% 11.0ms ± 1% -0.68% (p=0.000 n=17+19) https://perf.golang.org/search?q=upload:20171026.2 Updates #14951. Updates #22460. Change-Id: Id4c0932890a1d41020071bec73b8522b1367d3e7 Reviewed-on: https://go-review.googlesource.com/73712 Run-TryBot: Austin Clements <austin@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com>
2017-10-26 12:33:04 -04:00
case ssa.OpAMD64LoweredWB:
p := s.Prog(obj.ACALL)
p.To.Type = obj.TYPE_MEM
p.To.Name = obj.NAME_EXTERN
// AuxInt encodes how many buffer entries we need.
p.To.Sym = ir.Syms.GCWriteBarrier[v.AuxInt-1]
cmd/compile: compiler support for buffered write barrier This CL implements the compiler support for calling the buffered write barrier added by the previous CL. Since the buffered write barrier is only implemented on amd64 right now, this still supports the old, eager write barrier as well. There's little overhead to supporting both and this way a few tests in test/fixedbugs that expect to have liveness maps at write barrier calls can easily opt-in to the old, eager barrier. This significantly improves the performance of the write barrier: name old time/op new time/op delta WriteBarrier-12 73.5ns ±20% 19.2ns ±27% -73.90% (p=0.000 n=19+18) It also reduces the size of binaries because the write barrier call is more compact: name old object-bytes new object-bytes delta Template 398k ± 0% 393k ± 0% -1.14% (p=0.008 n=5+5) Unicode 208k ± 0% 206k ± 0% -1.00% (p=0.008 n=5+5) GoTypes 1.18M ± 0% 1.15M ± 0% -2.00% (p=0.008 n=5+5) Compiler 4.05M ± 0% 3.88M ± 0% -4.26% (p=0.008 n=5+5) SSA 8.25M ± 0% 8.11M ± 0% -1.59% (p=0.008 n=5+5) Flate 228k ± 0% 224k ± 0% -1.83% (p=0.008 n=5+5) GoParser 295k ± 0% 284k ± 0% -3.62% (p=0.008 n=5+5) Reflect 1.00M ± 0% 0.99M ± 0% -0.70% (p=0.008 n=5+5) Tar 339k ± 0% 333k ± 0% -1.67% (p=0.008 n=5+5) XML 404k ± 0% 395k ± 0% -2.10% (p=0.008 n=5+5) [Geo mean] 704k 690k -2.00% name old exe-bytes new exe-bytes delta HelloSize 1.05M ± 0% 1.04M ± 0% -1.55% (p=0.008 n=5+5) https://perf.golang.org/search?q=upload:20171027.1 (Amusingly, this also reduces compiler allocations by 0.75%, which, combined with the better write barrier, speeds up the compiler overall by 2.10%. See the perf link.) It slightly improves the performance of most of the go1 benchmarks and improves the performance of the x/benchmarks: name old time/op new time/op delta BinaryTree17-12 2.40s ± 1% 2.47s ± 1% +2.69% (p=0.000 n=19+19) Fannkuch11-12 2.95s ± 0% 2.95s ± 0% +0.21% (p=0.000 n=20+19) FmtFprintfEmpty-12 41.8ns ± 4% 41.4ns ± 2% -1.03% (p=0.014 n=20+20) FmtFprintfString-12 68.7ns ± 2% 67.5ns ± 1% -1.75% (p=0.000 n=20+17) FmtFprintfInt-12 79.0ns ± 3% 77.1ns ± 1% -2.40% (p=0.000 n=19+17) FmtFprintfIntInt-12 127ns ± 1% 123ns ± 3% -3.42% (p=0.000 n=20+20) FmtFprintfPrefixedInt-12 152ns ± 1% 150ns ± 1% -1.02% (p=0.000 n=18+17) FmtFprintfFloat-12 211ns ± 1% 209ns ± 0% -0.99% (p=0.000 n=20+16) FmtManyArgs-12 500ns ± 0% 496ns ± 0% -0.73% (p=0.000 n=17+20) GobDecode-12 6.44ms ± 1% 6.53ms ± 0% +1.28% (p=0.000 n=20+19) GobEncode-12 5.46ms ± 0% 5.46ms ± 1% ~ (p=0.550 n=19+20) Gzip-12 220ms ± 1% 216ms ± 0% -1.75% (p=0.000 n=19+19) Gunzip-12 38.8ms ± 0% 38.6ms ± 0% -0.30% (p=0.000 n=18+19) HTTPClientServer-12 79.0µs ± 1% 78.2µs ± 1% -1.01% (p=0.000 n=20+20) JSONEncode-12 11.9ms ± 0% 11.9ms ± 0% -0.29% (p=0.000 n=20+19) JSONDecode-12 52.6ms ± 0% 52.2ms ± 0% -0.68% (p=0.000 n=19+20) Mandelbrot200-12 3.69ms ± 0% 3.68ms ± 0% -0.36% (p=0.000 n=20+20) GoParse-12 3.13ms ± 1% 3.18ms ± 1% +1.67% (p=0.000 n=19+20) RegexpMatchEasy0_32-12 73.2ns ± 1% 72.3ns ± 1% -1.19% (p=0.000 n=19+18) RegexpMatchEasy0_1K-12 241ns ± 0% 239ns ± 0% -0.83% (p=0.000 n=17+16) RegexpMatchEasy1_32-12 68.6ns ± 1% 69.0ns ± 1% +0.47% (p=0.015 n=18+16) RegexpMatchEasy1_1K-12 364ns ± 0% 361ns ± 0% -0.67% (p=0.000 n=16+17) RegexpMatchMedium_32-12 104ns ± 1% 103ns ± 1% -0.79% (p=0.001 n=20+15) RegexpMatchMedium_1K-12 33.8µs ± 3% 34.0µs ± 2% ~ (p=0.267 n=20+19) RegexpMatchHard_32-12 1.64µs ± 1% 1.62µs ± 2% -1.25% (p=0.000 n=19+18) RegexpMatchHard_1K-12 49.2µs ± 0% 48.7µs ± 1% -0.93% (p=0.000 n=19+18) Revcomp-12 391ms ± 5% 396ms ± 7% ~ (p=0.154 n=19+19) Template-12 63.1ms ± 0% 59.5ms ± 0% -5.76% (p=0.000 n=18+19) TimeParse-12 307ns ± 0% 306ns ± 0% -0.39% (p=0.000 n=19+17) TimeFormat-12 325ns ± 0% 323ns ± 0% -0.50% (p=0.000 n=19+19) [Geo mean] 47.3µs 46.9µs -0.67% https://perf.golang.org/search?q=upload:20171026.1 name old time/op new time/op delta Garbage/benchmem-MB=64-12 2.25ms ± 1% 2.20ms ± 1% -2.31% (p=0.000 n=18+18) HTTP-12 12.6µs ± 0% 12.6µs ± 0% -0.72% (p=0.000 n=18+17) JSON-12 11.0ms ± 0% 11.0ms ± 1% -0.68% (p=0.000 n=17+19) https://perf.golang.org/search?q=upload:20171026.2 Updates #14951. Updates #22460. Change-Id: Id4c0932890a1d41020071bec73b8522b1367d3e7 Reviewed-on: https://go-review.googlesource.com/73712 Run-TryBot: Austin Clements <austin@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com>
2017-10-26 12:33:04 -04:00
case ssa.OpAMD64LoweredPanicBoundsRR, ssa.OpAMD64LoweredPanicBoundsRC, ssa.OpAMD64LoweredPanicBoundsCR, ssa.OpAMD64LoweredPanicBoundsCC:
// Compute the constant we put in the PCData entry for this call.
code, signed := ssa.BoundsKind(v.AuxInt).Code()
xIsReg := false
yIsReg := false
xVal := 0
yVal := 0
switch v.Op {
case ssa.OpAMD64LoweredPanicBoundsRR:
xIsReg = true
xVal = int(v.Args[0].Reg() - x86.REG_AX)
yIsReg = true
yVal = int(v.Args[1].Reg() - x86.REG_AX)
case ssa.OpAMD64LoweredPanicBoundsRC:
xIsReg = true
xVal = int(v.Args[0].Reg() - x86.REG_AX)
c := v.Aux.(ssa.PanicBoundsC).C
if c >= 0 && c <= abi.BoundsMaxConst {
yVal = int(c)
} else {
// Move constant to a register
yIsReg = true
if yVal == xVal {
yVal = 1
}
p := s.Prog(x86.AMOVQ)
p.From.Type = obj.TYPE_CONST
p.From.Offset = c
p.To.Type = obj.TYPE_REG
p.To.Reg = x86.REG_AX + int16(yVal)
}
case ssa.OpAMD64LoweredPanicBoundsCR:
yIsReg = true
yVal = int(v.Args[0].Reg() - x86.REG_AX)
c := v.Aux.(ssa.PanicBoundsC).C
if c >= 0 && c <= abi.BoundsMaxConst {
xVal = int(c)
} else {
// Move constant to a register
xIsReg = true
if xVal == yVal {
xVal = 1
}
p := s.Prog(x86.AMOVQ)
p.From.Type = obj.TYPE_CONST
p.From.Offset = c
p.To.Type = obj.TYPE_REG
p.To.Reg = x86.REG_AX + int16(xVal)
}
case ssa.OpAMD64LoweredPanicBoundsCC:
c := v.Aux.(ssa.PanicBoundsCC).Cx
if c >= 0 && c <= abi.BoundsMaxConst {
xVal = int(c)
} else {
// Move constant to a register
xIsReg = true
p := s.Prog(x86.AMOVQ)
p.From.Type = obj.TYPE_CONST
p.From.Offset = c
p.To.Type = obj.TYPE_REG
p.To.Reg = x86.REG_AX + int16(xVal)
}
c = v.Aux.(ssa.PanicBoundsCC).Cy
if c >= 0 && c <= abi.BoundsMaxConst {
yVal = int(c)
} else {
// Move constant to a register
yIsReg = true
yVal = 1
p := s.Prog(x86.AMOVQ)
p.From.Type = obj.TYPE_CONST
p.From.Offset = c
p.To.Type = obj.TYPE_REG
p.To.Reg = x86.REG_AX + int16(yVal)
}
}
c := abi.BoundsEncode(code, signed, xIsReg, yIsReg, xVal, yVal)
p := s.Prog(obj.APCDATA)
p.From.SetConst(abi.PCDATA_PanicBounds)
p.To.SetConst(int64(c))
p = s.Prog(obj.ACALL)
p.To.Type = obj.TYPE_MEM
p.To.Name = obj.NAME_EXTERN
p.To.Sym = ir.Syms.PanicBounds
case ssa.OpAMD64NEGQ, ssa.OpAMD64NEGL,
ssa.OpAMD64BSWAPQ, ssa.OpAMD64BSWAPL,
ssa.OpAMD64NOTQ, ssa.OpAMD64NOTL:
p := s.Prog(v.Op.Asm())
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpAMD64NEGLflags:
p := s.Prog(v.Op.Asm())
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg0()
case ssa.OpAMD64ADDQconstflags, ssa.OpAMD64ADDLconstflags:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_CONST
p.From.Offset = v.AuxInt
// Note: the inc/dec instructions do not modify
// the carry flag like add$1 / sub$1 do.
// We currently never use the CF/OF flags from
// these instructions, so that is ok.
switch {
case p.As == x86.AADDQ && p.From.Offset == 1:
p.As = x86.AINCQ
p.From.Type = obj.TYPE_NONE
case p.As == x86.AADDQ && p.From.Offset == -1:
p.As = x86.ADECQ
p.From.Type = obj.TYPE_NONE
case p.As == x86.AADDL && p.From.Offset == 1:
p.As = x86.AINCL
p.From.Type = obj.TYPE_NONE
case p.As == x86.AADDL && p.From.Offset == -1:
p.As = x86.ADECL
p.From.Type = obj.TYPE_NONE
}
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg0()
case ssa.OpAMD64BSFQ, ssa.OpAMD64BSRQ, ssa.OpAMD64BSFL, ssa.OpAMD64BSRL, ssa.OpAMD64SQRTSD, ssa.OpAMD64SQRTSS:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[0].Reg()
p.To.Type = obj.TYPE_REG
switch v.Op {
case ssa.OpAMD64BSFQ, ssa.OpAMD64BSRQ:
p.To.Reg = v.Reg0()
case ssa.OpAMD64BSFL, ssa.OpAMD64BSRL, ssa.OpAMD64SQRTSD, ssa.OpAMD64SQRTSS:
p.To.Reg = v.Reg()
}
cmd/compile: lower x*z + y to FMA if FMA enabled There is a generic opcode for FMA, but we don't use it in rewrite rules. This is maybe because some archs, like WASM and MIPS don't have a late lowering rule for it. Fixes #71204 Intel Alder Lake 12600k (GOAMD64=v3): math: name old time/op new time/op delta Acos-16 4.58ns ± 0% 3.36ns ± 0% -26.68% (p=0.008 n=5+5) Acosh-16 8.04ns ± 1% 6.44ns ± 0% -19.95% (p=0.008 n=5+5) Asin-16 4.28ns ± 0% 3.32ns ± 0% -22.24% (p=0.008 n=5+5) Asinh-16 9.92ns ± 0% 8.62ns ± 0% -13.13% (p=0.008 n=5+5) Atan-16 2.31ns ± 0% 1.84ns ± 0% -20.02% (p=0.008 n=5+5) Atanh-16 7.79ns ± 0% 7.03ns ± 0% -9.67% (p=0.008 n=5+5) Atan2-16 3.93ns ± 0% 3.52ns ± 0% -10.35% (p=0.000 n=5+4) Cbrt-16 4.62ns ± 0% 4.41ns ± 0% -4.57% (p=0.016 n=4+5) Ceil-16 0.14ns ± 1% 0.14ns ± 2% ~ (p=0.103 n=5+5) Copysign-16 0.33ns ± 0% 0.33ns ± 0% +0.03% (p=0.029 n=4+4) Cos-16 4.87ns ± 0% 4.75ns ± 0% -2.44% (p=0.016 n=5+4) Cosh-16 4.86ns ± 0% 4.86ns ± 0% ~ (p=0.317 n=5+5) Erf-16 2.71ns ± 0% 2.25ns ± 0% -16.69% (p=0.008 n=5+5) Erfc-16 3.06ns ± 0% 2.67ns ± 0% -13.00% (p=0.016 n=5+4) Erfinv-16 3.88ns ± 0% 2.84ns ± 3% -26.83% (p=0.008 n=5+5) Erfcinv-16 4.08ns ± 0% 3.01ns ± 1% -26.27% (p=0.008 n=5+5) Exp-16 3.29ns ± 0% 3.37ns ± 2% +2.64% (p=0.016 n=4+5) ExpGo-16 8.44ns ± 0% 7.48ns ± 1% -11.37% (p=0.008 n=5+5) Expm1-16 4.46ns ± 0% 3.69ns ± 2% -17.26% (p=0.016 n=4+5) Exp2-16 8.20ns ± 0% 7.39ns ± 2% -9.94% (p=0.008 n=5+5) Exp2Go-16 8.26ns ± 0% 7.23ns ± 0% -12.49% (p=0.016 n=4+5) Abs-16 0.26ns ± 3% 0.22ns ± 1% -16.34% (p=0.008 n=5+5) Dim-16 0.38ns ± 1% 0.40ns ± 2% +5.02% (p=0.008 n=5+5) Floor-16 0.11ns ± 1% 0.17ns ± 4% +54.99% (p=0.008 n=5+5) Max-16 1.24ns ± 0% 1.24ns ± 0% ~ (p=0.619 n=5+5) Min-16 1.24ns ± 0% 1.24ns ± 0% ~ (p=0.484 n=5+5) Mod-16 13.4ns ± 1% 12.8ns ± 0% -4.21% (p=0.016 n=5+4) Frexp-16 1.70ns ± 0% 1.71ns ± 0% +0.46% (p=0.008 n=5+5) Gamma-16 3.97ns ± 0% 3.97ns ± 0% ~ (p=0.643 n=5+5) Hypot-16 2.11ns ± 0% 2.11ns ± 0% ~ (p=0.762 n=5+5) HypotGo-16 2.48ns ± 4% 2.26ns ± 0% -8.94% (p=0.008 n=5+5) Ilogb-16 1.67ns ± 0% 1.67ns ± 0% -0.07% (p=0.048 n=5+5) J0-16 19.8ns ± 0% 19.3ns ± 0% ~ (p=0.079 n=4+5) J1-16 19.4ns ± 0% 18.9ns ± 0% -2.63% (p=0.000 n=5+4) Jn-16 41.5ns ± 0% 40.6ns ± 0% -2.32% (p=0.016 n=4+5) Ldexp-16 2.26ns ± 0% 2.26ns ± 0% ~ (p=0.683 n=5+5) Lgamma-16 4.40ns ± 0% 4.21ns ± 0% -4.21% (p=0.008 n=5+5) Log-16 4.05ns ± 0% 4.05ns ± 0% ~ (all equal) Logb-16 1.69ns ± 0% 1.69ns ± 0% ~ (p=0.429 n=5+5) Log1p-16 5.00ns ± 0% 3.99ns ± 0% -20.14% (p=0.008 n=5+5) Log10-16 4.22ns ± 0% 4.21ns ± 0% -0.15% (p=0.008 n=5+5) Log2-16 2.27ns ± 0% 2.25ns ± 0% -0.94% (p=0.008 n=5+5) Modf-16 1.44ns ± 0% 1.44ns ± 0% ~ (p=0.492 n=5+5) Nextafter32-16 2.09ns ± 0% 2.09ns ± 0% ~ (p=0.079 n=4+5) Nextafter64-16 2.09ns ± 0% 2.09ns ± 0% ~ (p=0.095 n=4+5) PowInt-16 10.8ns ± 0% 10.8ns ± 0% ~ (all equal) PowFrac-16 25.3ns ± 0% 25.3ns ± 0% -0.09% (p=0.000 n=5+4) Pow10Pos-16 0.52ns ± 1% 0.52ns ± 0% ~ (p=0.810 n=5+5) Pow10Neg-16 0.82ns ± 0% 0.82ns ± 0% ~ (p=0.381 n=5+5) Round-16 0.93ns ± 0% 0.93ns ± 0% ~ (p=0.056 n=5+5) RoundToEven-16 1.64ns ± 0% 1.64ns ± 0% ~ (all equal) Remainder-16 12.4ns ± 2% 12.0ns ± 0% -3.27% (p=0.008 n=5+5) Signbit-16 0.37ns ± 0% 0.37ns ± 0% -0.19% (p=0.008 n=5+5) Sin-16 4.04ns ± 0% 3.92ns ± 0% -3.13% (p=0.000 n=4+5) Sincos-16 5.99ns ± 0% 5.80ns ± 0% -3.03% (p=0.008 n=5+5) Sinh-16 5.22ns ± 0% 5.22ns ± 0% ~ (p=0.651 n=5+4) SqrtIndirect-16 0.41ns ± 0% 0.41ns ± 0% ~ (p=0.333 n=4+5) SqrtLatency-16 2.66ns ± 0% 2.66ns ± 0% ~ (p=0.079 n=4+5) SqrtIndirectLatency-16 2.66ns ± 0% 2.66ns ± 0% ~ (p=1.000 n=5+5) SqrtGoLatency-16 30.1ns ± 0% 28.6ns ± 1% -4.84% (p=0.008 n=5+5) SqrtPrime-16 645ns ± 0% 645ns ± 0% ~ (p=0.095 n=5+4) Tan-16 4.21ns ± 0% 4.09ns ± 0% -2.76% (p=0.029 n=4+4) Tanh-16 5.36ns ± 0% 5.36ns ± 0% ~ (p=0.444 n=5+5) Trunc-16 0.12ns ± 6% 0.11ns ± 1% -6.79% (p=0.008 n=5+5) Y0-16 19.2ns ± 0% 18.7ns ± 0% -2.52% (p=0.000 n=5+4) Y1-16 19.1ns ± 0% 18.4ns ± 0% ~ (p=0.079 n=4+5) Yn-16 40.7ns ± 0% 39.5ns ± 0% -2.82% (p=0.008 n=5+5) Float64bits-16 0.21ns ± 0% 0.21ns ± 0% ~ (p=0.603 n=5+5) Float64frombits-16 0.21ns ± 0% 0.21ns ± 0% ~ (p=0.984 n=4+5) Float32bits-16 0.21ns ± 0% 0.21ns ± 0% ~ (p=0.778 n=4+5) Float32frombits-16 0.21ns ± 0% 0.20ns ± 0% ~ (p=0.397 n=5+5) FMA-16 0.82ns ± 0% 0.82ns ± 0% +0.02% (p=0.029 n=4+4) [Geo mean] 2.87ns 2.74ns -4.61% math/cmplx: name old time/op new time/op delta Abs-16 2.07ns ± 0% 2.05ns ± 0% -0.70% (p=0.016 n=5+4) Acos-16 36.5ns ± 0% 35.7ns ± 0% -2.33% (p=0.029 n=4+4) Acosh-16 37.0ns ± 0% 36.2ns ± 0% -2.20% (p=0.008 n=5+5) Asin-16 36.5ns ± 0% 35.7ns ± 0% -2.29% (p=0.008 n=5+5) Asinh-16 33.5ns ± 0% 31.6ns ± 0% -5.51% (p=0.008 n=5+5) Atan-16 15.5ns ± 0% 13.9ns ± 0% -10.61% (p=0.008 n=5+5) Atanh-16 15.0ns ± 0% 13.6ns ± 0% -9.73% (p=0.008 n=5+5) Conj-16 0.11ns ± 5% 0.11ns ± 1% ~ (p=0.421 n=5+5) Cos-16 12.3ns ± 0% 12.2ns ± 0% -0.60% (p=0.000 n=4+5) Cosh-16 12.1ns ± 0% 12.0ns ± 0% ~ (p=0.079 n=4+5) Exp-16 10.0ns ± 0% 9.8ns ± 0% -1.77% (p=0.008 n=5+5) Log-16 14.5ns ± 0% 13.7ns ± 0% -5.67% (p=0.008 n=5+5) Log10-16 14.5ns ± 0% 13.7ns ± 0% -5.55% (p=0.000 n=5+4) Phase-16 5.11ns ± 0% 4.25ns ± 0% -16.90% (p=0.008 n=5+5) Polar-16 7.12ns ± 0% 6.35ns ± 0% -10.90% (p=0.008 n=5+5) Pow-16 64.3ns ± 0% 63.7ns ± 0% -0.97% (p=0.008 n=5+5) Rect-16 5.74ns ± 0% 5.58ns ± 0% -2.73% (p=0.016 n=4+5) Sin-16 12.2ns ± 0% 12.2ns ± 0% -0.54% (p=0.000 n=4+5) Sinh-16 12.1ns ± 0% 12.0ns ± 0% -0.58% (p=0.000 n=5+4) Sqrt-16 5.30ns ± 0% 5.18ns ± 0% -2.36% (p=0.008 n=5+5) Tan-16 22.7ns ± 0% 22.6ns ± 0% -0.33% (p=0.008 n=5+5) Tanh-16 21.2ns ± 0% 20.9ns ± 0% -1.32% (p=0.008 n=5+5) [Geo mean] 11.3ns 10.8ns -3.97% Change-Id: Idcc4b357ba68477929c126289e5095b27a827b1b Reviewed-on: https://go-review.googlesource.com/c/go/+/646335 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Auto-Submit: Keith Randall <khr@golang.org> Reviewed-by: Keith Randall <khr@google.com> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> Reviewed-by: Keith Randall <khr@golang.org>
2025-02-02 23:42:43 +01:00
case ssa.OpAMD64LoweredRound32F, ssa.OpAMD64LoweredRound64F:
// input is already rounded
case ssa.OpAMD64ROUNDSD:
p := s.Prog(v.Op.Asm())
val := v.AuxInt
// 0 means math.RoundToEven, 1 Floor, 2 Ceil, 3 Trunc
if val < 0 || val > 3 {
v.Fatalf("Invalid rounding mode")
}
p.From.Offset = val
p.From.Type = obj.TYPE_CONST
p.AddRestSourceReg(v.Args[0].Reg())
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpAMD64POPCNTQ, ssa.OpAMD64POPCNTL,
ssa.OpAMD64TZCNTQ, ssa.OpAMD64TZCNTL,
ssa.OpAMD64LZCNTQ, ssa.OpAMD64LZCNTL:
if v.Args[0].Reg() != v.Reg() {
// POPCNT/TZCNT/LZCNT have a false dependency on the destination register on Intel cpus.
// TZCNT/LZCNT problem affects pre-Skylake models. See discussion at https://gcc.gnu.org/bugzilla/show_bug.cgi?id=62011#c7.
// Xor register with itself to break the dependency.
opregreg(s, x86.AXORL, v.Reg(), v.Reg())
}
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[0].Reg()
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpAMD64SETEQ, ssa.OpAMD64SETNE,
ssa.OpAMD64SETL, ssa.OpAMD64SETLE,
ssa.OpAMD64SETG, ssa.OpAMD64SETGE,
ssa.OpAMD64SETGF, ssa.OpAMD64SETGEF,
ssa.OpAMD64SETB, ssa.OpAMD64SETBE,
ssa.OpAMD64SETORD, ssa.OpAMD64SETNAN,
ssa.OpAMD64SETA, ssa.OpAMD64SETAE,
ssa.OpAMD64SETO:
p := s.Prog(v.Op.Asm())
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpAMD64SETEQstore, ssa.OpAMD64SETNEstore,
ssa.OpAMD64SETLstore, ssa.OpAMD64SETLEstore,
ssa.OpAMD64SETGstore, ssa.OpAMD64SETGEstore,
ssa.OpAMD64SETBstore, ssa.OpAMD64SETBEstore,
ssa.OpAMD64SETAstore, ssa.OpAMD64SETAEstore:
p := s.Prog(v.Op.Asm())
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
[dev.regabi] cmd/compile: split out package ssagen [generated] [git-generate] cd src/cmd/compile/internal/gc rf ' # maxOpenDefers is declared in ssa.go but used only by walk. mv maxOpenDefers walk.go # gc.Arch -> ssagen.Arch # It is not as nice but will do for now. mv Arch ArchInfo mv thearch Arch mv Arch ArchInfo arch.go # Pull dwarf out of pgen.go. mv debuginfo declPos createDwarfVars preInliningDcls \ createSimpleVars createSimpleVar \ createComplexVars createComplexVar \ dwarf.go # Pull high-level compilation out of pgen.go, # leaving only the SSA code. mv compilequeue funccompile compile compilenow \ compileFunctions isInlinableButNotInlined \ initLSym \ compile.go mv BoundsCheckFunc GCWriteBarrierReg ssa.go mv largeStack largeStackFrames CheckLargeStacks pgen.go # All that is left in dcl.go is the nowritebarrierrecCheck mv dcl.go nowb.go # Export API and unexport non-API. mv initssaconfig InitConfig mv isIntrinsicCall IsIntrinsicCall mv ssaDumpInline DumpInline mv initSSATables InitTables mv initSSAEnv InitEnv mv compileSSA Compile mv stackOffset StackOffset mv canSSAType TypeOK mv SSAGenState State mv FwdRefAux fwdRefAux mv cgoSymABIs CgoSymABIs mv readSymABIs ReadSymABIs mv initLSym InitLSym mv useABIWrapGen symabiDefs CgoSymABIs ReadSymABIs InitLSym selectLSym makeABIWrapper setupTextLSym abi.go mv arch.go abi.go nowb.go phi.go pgen.go pgen_test.go ssa.go cmd/compile/internal/ssagen ' rm go.go gsubr.go Change-Id: I47fad6cbf1d1e583fd9139003a08401d7cd048a1 Reviewed-on: https://go-review.googlesource.com/c/go/+/279476 Trust: Russ Cox <rsc@golang.org> Run-TryBot: Russ Cox <rsc@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com>
2020-12-23 00:57:10 -05:00
ssagen.AddAux(&p.To, v)
case ssa.OpAMD64SETEQstoreidx1, ssa.OpAMD64SETNEstoreidx1,
ssa.OpAMD64SETLstoreidx1, ssa.OpAMD64SETLEstoreidx1,
ssa.OpAMD64SETGstoreidx1, ssa.OpAMD64SETGEstoreidx1,
ssa.OpAMD64SETBstoreidx1, ssa.OpAMD64SETBEstoreidx1,
ssa.OpAMD64SETAstoreidx1, ssa.OpAMD64SETAEstoreidx1:
p := s.Prog(v.Op.Asm())
memIdx(&p.To, v)
ssagen.AddAux(&p.To, v)
case ssa.OpAMD64SETNEF:
t := v.RegTmp()
p := s.Prog(v.Op.Asm())
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
q := s.Prog(x86.ASETPS)
q.To.Type = obj.TYPE_REG
q.To.Reg = t
// ORL avoids partial register write and is smaller than ORQ, used by old compiler
opregreg(s, x86.AORL, v.Reg(), t)
case ssa.OpAMD64SETEQF:
t := v.RegTmp()
p := s.Prog(v.Op.Asm())
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
q := s.Prog(x86.ASETPC)
q.To.Type = obj.TYPE_REG
q.To.Reg = t
// ANDL avoids partial register write and is smaller than ANDQ, used by old compiler
opregreg(s, x86.AANDL, v.Reg(), t)
case ssa.OpAMD64InvertFlags:
v.Fatalf("InvertFlags should never make it to codegen %v", v.LongString())
case ssa.OpAMD64FlagEQ, ssa.OpAMD64FlagLT_ULT, ssa.OpAMD64FlagLT_UGT, ssa.OpAMD64FlagGT_ULT, ssa.OpAMD64FlagGT_UGT:
v.Fatalf("Flag* ops should never make it to codegen %v", v.LongString())
case ssa.OpAMD64AddTupleFirst32, ssa.OpAMD64AddTupleFirst64:
v.Fatalf("AddTupleFirst* should never make it to codegen %v", v.LongString())
case ssa.OpAMD64REPSTOSQ:
s.Prog(x86.AREP)
s.Prog(x86.ASTOSQ)
case ssa.OpAMD64REPMOVSQ:
s.Prog(x86.AREP)
s.Prog(x86.AMOVSQ)
case ssa.OpAMD64LoweredNilCheck:
// Issue a load which will fault if the input is nil.
// TODO: We currently use the 2-byte instruction TESTB AX, (reg).
// Should we use the 3-byte TESTB $0, (reg) instead? It is larger
// but it doesn't have false dependency on AX.
// Or maybe allocate an output register and use MOVL (reg),reg2 ?
// That trades clobbering flags for clobbering a register.
p := s.Prog(x86.ATESTB)
p.From.Type = obj.TYPE_REG
p.From.Reg = x86.REG_AX
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
cmd/compile: add framework for logging optimizer (non)actions to LSP This is intended to allow IDEs to note where the optimizer was not able to improve users' code. There may be other applications for this, for example in studying effectiveness of optimizer changes more quickly than running benchmarks, or in verifying that code changes did not accidentally disable optimizations in performance-critical code. Logging of nilcheck (bad) for amd64 is implemented as proof-of-concept. In general, the intent is that optimizations that didn't happen are what will be logged, because that is believed to be what IDE users want. Added flag -json=version,dest Check that version=0. (Future compilers will support a few recent versions, I hope that version is always <=3.) Dest is expected to be one of: /path (or \path in Windows) will create directory /path and fill it w/ json files file://path will create directory path, intended either for I:\dont\know\enough\about\windows\paths trustme_I_know_what_I_am_doing_probably_testing Not passing an absolute path name usually leads to json splattered all over source directories, or failure when those directories are not writeable. If you want a foot-gun, you have to ask for it. The JSON output is directed to subdirectories of dest, where each subdirectory is net/url.PathEscape of the package name, and each for each foo.go in the package, net/url.PathEscape(foo).json is created. The first line of foo.json contains version and context information, and subsequent lines contains LSP-conforming JSON describing the missing optimizations. Change-Id: Ib83176a53a8c177ee9081aefc5ae05604ccad8a0 Reviewed-on: https://go-review.googlesource.com/c/go/+/204338 Run-TryBot: David Chase <drchase@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com>
2019-10-24 13:48:17 -04:00
if logopt.Enabled() {
logopt.LogOpt(v.Pos, "nilcheck", "genssa", v.Block.Func.Name)
}
if base.Debug.Nil != 0 && v.Pos.Line() > 1 { // v.Pos.Line()==1 in generated wrappers
base.WarnfAt(v.Pos, "generated nil check")
}
case ssa.OpAMD64MOVBatomicload, ssa.OpAMD64MOVLatomicload, ssa.OpAMD64MOVQatomicload:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_MEM
p.From.Reg = v.Args[0].Reg()
[dev.regabi] cmd/compile: split out package ssagen [generated] [git-generate] cd src/cmd/compile/internal/gc rf ' # maxOpenDefers is declared in ssa.go but used only by walk. mv maxOpenDefers walk.go # gc.Arch -> ssagen.Arch # It is not as nice but will do for now. mv Arch ArchInfo mv thearch Arch mv Arch ArchInfo arch.go # Pull dwarf out of pgen.go. mv debuginfo declPos createDwarfVars preInliningDcls \ createSimpleVars createSimpleVar \ createComplexVars createComplexVar \ dwarf.go # Pull high-level compilation out of pgen.go, # leaving only the SSA code. mv compilequeue funccompile compile compilenow \ compileFunctions isInlinableButNotInlined \ initLSym \ compile.go mv BoundsCheckFunc GCWriteBarrierReg ssa.go mv largeStack largeStackFrames CheckLargeStacks pgen.go # All that is left in dcl.go is the nowritebarrierrecCheck mv dcl.go nowb.go # Export API and unexport non-API. mv initssaconfig InitConfig mv isIntrinsicCall IsIntrinsicCall mv ssaDumpInline DumpInline mv initSSATables InitTables mv initSSAEnv InitEnv mv compileSSA Compile mv stackOffset StackOffset mv canSSAType TypeOK mv SSAGenState State mv FwdRefAux fwdRefAux mv cgoSymABIs CgoSymABIs mv readSymABIs ReadSymABIs mv initLSym InitLSym mv useABIWrapGen symabiDefs CgoSymABIs ReadSymABIs InitLSym selectLSym makeABIWrapper setupTextLSym abi.go mv arch.go abi.go nowb.go phi.go pgen.go pgen_test.go ssa.go cmd/compile/internal/ssagen ' rm go.go gsubr.go Change-Id: I47fad6cbf1d1e583fd9139003a08401d7cd048a1 Reviewed-on: https://go-review.googlesource.com/c/go/+/279476 Trust: Russ Cox <rsc@golang.org> Run-TryBot: Russ Cox <rsc@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com>
2020-12-23 00:57:10 -05:00
ssagen.AddAux(&p.From, v)
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg0()
case ssa.OpAMD64XCHGB, ssa.OpAMD64XCHGL, ssa.OpAMD64XCHGQ:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Reg0()
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[1].Reg()
[dev.regabi] cmd/compile: split out package ssagen [generated] [git-generate] cd src/cmd/compile/internal/gc rf ' # maxOpenDefers is declared in ssa.go but used only by walk. mv maxOpenDefers walk.go # gc.Arch -> ssagen.Arch # It is not as nice but will do for now. mv Arch ArchInfo mv thearch Arch mv Arch ArchInfo arch.go # Pull dwarf out of pgen.go. mv debuginfo declPos createDwarfVars preInliningDcls \ createSimpleVars createSimpleVar \ createComplexVars createComplexVar \ dwarf.go # Pull high-level compilation out of pgen.go, # leaving only the SSA code. mv compilequeue funccompile compile compilenow \ compileFunctions isInlinableButNotInlined \ initLSym \ compile.go mv BoundsCheckFunc GCWriteBarrierReg ssa.go mv largeStack largeStackFrames CheckLargeStacks pgen.go # All that is left in dcl.go is the nowritebarrierrecCheck mv dcl.go nowb.go # Export API and unexport non-API. mv initssaconfig InitConfig mv isIntrinsicCall IsIntrinsicCall mv ssaDumpInline DumpInline mv initSSATables InitTables mv initSSAEnv InitEnv mv compileSSA Compile mv stackOffset StackOffset mv canSSAType TypeOK mv SSAGenState State mv FwdRefAux fwdRefAux mv cgoSymABIs CgoSymABIs mv readSymABIs ReadSymABIs mv initLSym InitLSym mv useABIWrapGen symabiDefs CgoSymABIs ReadSymABIs InitLSym selectLSym makeABIWrapper setupTextLSym abi.go mv arch.go abi.go nowb.go phi.go pgen.go pgen_test.go ssa.go cmd/compile/internal/ssagen ' rm go.go gsubr.go Change-Id: I47fad6cbf1d1e583fd9139003a08401d7cd048a1 Reviewed-on: https://go-review.googlesource.com/c/go/+/279476 Trust: Russ Cox <rsc@golang.org> Run-TryBot: Russ Cox <rsc@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com>
2020-12-23 00:57:10 -05:00
ssagen.AddAux(&p.To, v)
case ssa.OpAMD64XADDLlock, ssa.OpAMD64XADDQlock:
s.Prog(x86.ALOCK)
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Reg0()
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[1].Reg()
[dev.regabi] cmd/compile: split out package ssagen [generated] [git-generate] cd src/cmd/compile/internal/gc rf ' # maxOpenDefers is declared in ssa.go but used only by walk. mv maxOpenDefers walk.go # gc.Arch -> ssagen.Arch # It is not as nice but will do for now. mv Arch ArchInfo mv thearch Arch mv Arch ArchInfo arch.go # Pull dwarf out of pgen.go. mv debuginfo declPos createDwarfVars preInliningDcls \ createSimpleVars createSimpleVar \ createComplexVars createComplexVar \ dwarf.go # Pull high-level compilation out of pgen.go, # leaving only the SSA code. mv compilequeue funccompile compile compilenow \ compileFunctions isInlinableButNotInlined \ initLSym \ compile.go mv BoundsCheckFunc GCWriteBarrierReg ssa.go mv largeStack largeStackFrames CheckLargeStacks pgen.go # All that is left in dcl.go is the nowritebarrierrecCheck mv dcl.go nowb.go # Export API and unexport non-API. mv initssaconfig InitConfig mv isIntrinsicCall IsIntrinsicCall mv ssaDumpInline DumpInline mv initSSATables InitTables mv initSSAEnv InitEnv mv compileSSA Compile mv stackOffset StackOffset mv canSSAType TypeOK mv SSAGenState State mv FwdRefAux fwdRefAux mv cgoSymABIs CgoSymABIs mv readSymABIs ReadSymABIs mv initLSym InitLSym mv useABIWrapGen symabiDefs CgoSymABIs ReadSymABIs InitLSym selectLSym makeABIWrapper setupTextLSym abi.go mv arch.go abi.go nowb.go phi.go pgen.go pgen_test.go ssa.go cmd/compile/internal/ssagen ' rm go.go gsubr.go Change-Id: I47fad6cbf1d1e583fd9139003a08401d7cd048a1 Reviewed-on: https://go-review.googlesource.com/c/go/+/279476 Trust: Russ Cox <rsc@golang.org> Run-TryBot: Russ Cox <rsc@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com>
2020-12-23 00:57:10 -05:00
ssagen.AddAux(&p.To, v)
case ssa.OpAMD64CMPXCHGLlock, ssa.OpAMD64CMPXCHGQlock:
if v.Args[1].Reg() != x86.REG_AX {
v.Fatalf("input[1] not in AX %s", v.LongString())
}
s.Prog(x86.ALOCK)
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[2].Reg()
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
[dev.regabi] cmd/compile: split out package ssagen [generated] [git-generate] cd src/cmd/compile/internal/gc rf ' # maxOpenDefers is declared in ssa.go but used only by walk. mv maxOpenDefers walk.go # gc.Arch -> ssagen.Arch # It is not as nice but will do for now. mv Arch ArchInfo mv thearch Arch mv Arch ArchInfo arch.go # Pull dwarf out of pgen.go. mv debuginfo declPos createDwarfVars preInliningDcls \ createSimpleVars createSimpleVar \ createComplexVars createComplexVar \ dwarf.go # Pull high-level compilation out of pgen.go, # leaving only the SSA code. mv compilequeue funccompile compile compilenow \ compileFunctions isInlinableButNotInlined \ initLSym \ compile.go mv BoundsCheckFunc GCWriteBarrierReg ssa.go mv largeStack largeStackFrames CheckLargeStacks pgen.go # All that is left in dcl.go is the nowritebarrierrecCheck mv dcl.go nowb.go # Export API and unexport non-API. mv initssaconfig InitConfig mv isIntrinsicCall IsIntrinsicCall mv ssaDumpInline DumpInline mv initSSATables InitTables mv initSSAEnv InitEnv mv compileSSA Compile mv stackOffset StackOffset mv canSSAType TypeOK mv SSAGenState State mv FwdRefAux fwdRefAux mv cgoSymABIs CgoSymABIs mv readSymABIs ReadSymABIs mv initLSym InitLSym mv useABIWrapGen symabiDefs CgoSymABIs ReadSymABIs InitLSym selectLSym makeABIWrapper setupTextLSym abi.go mv arch.go abi.go nowb.go phi.go pgen.go pgen_test.go ssa.go cmd/compile/internal/ssagen ' rm go.go gsubr.go Change-Id: I47fad6cbf1d1e583fd9139003a08401d7cd048a1 Reviewed-on: https://go-review.googlesource.com/c/go/+/279476 Trust: Russ Cox <rsc@golang.org> Run-TryBot: Russ Cox <rsc@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com>
2020-12-23 00:57:10 -05:00
ssagen.AddAux(&p.To, v)
p = s.Prog(x86.ASETEQ)
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg0()
case ssa.OpAMD64ANDBlock, ssa.OpAMD64ANDLlock, ssa.OpAMD64ANDQlock, ssa.OpAMD64ORBlock, ssa.OpAMD64ORLlock, ssa.OpAMD64ORQlock:
// Atomic memory operations that don't need to return the old value.
s.Prog(x86.ALOCK)
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[1].Reg()
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
[dev.regabi] cmd/compile: split out package ssagen [generated] [git-generate] cd src/cmd/compile/internal/gc rf ' # maxOpenDefers is declared in ssa.go but used only by walk. mv maxOpenDefers walk.go # gc.Arch -> ssagen.Arch # It is not as nice but will do for now. mv Arch ArchInfo mv thearch Arch mv Arch ArchInfo arch.go # Pull dwarf out of pgen.go. mv debuginfo declPos createDwarfVars preInliningDcls \ createSimpleVars createSimpleVar \ createComplexVars createComplexVar \ dwarf.go # Pull high-level compilation out of pgen.go, # leaving only the SSA code. mv compilequeue funccompile compile compilenow \ compileFunctions isInlinableButNotInlined \ initLSym \ compile.go mv BoundsCheckFunc GCWriteBarrierReg ssa.go mv largeStack largeStackFrames CheckLargeStacks pgen.go # All that is left in dcl.go is the nowritebarrierrecCheck mv dcl.go nowb.go # Export API and unexport non-API. mv initssaconfig InitConfig mv isIntrinsicCall IsIntrinsicCall mv ssaDumpInline DumpInline mv initSSATables InitTables mv initSSAEnv InitEnv mv compileSSA Compile mv stackOffset StackOffset mv canSSAType TypeOK mv SSAGenState State mv FwdRefAux fwdRefAux mv cgoSymABIs CgoSymABIs mv readSymABIs ReadSymABIs mv initLSym InitLSym mv useABIWrapGen symabiDefs CgoSymABIs ReadSymABIs InitLSym selectLSym makeABIWrapper setupTextLSym abi.go mv arch.go abi.go nowb.go phi.go pgen.go pgen_test.go ssa.go cmd/compile/internal/ssagen ' rm go.go gsubr.go Change-Id: I47fad6cbf1d1e583fd9139003a08401d7cd048a1 Reviewed-on: https://go-review.googlesource.com/c/go/+/279476 Trust: Russ Cox <rsc@golang.org> Run-TryBot: Russ Cox <rsc@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com>
2020-12-23 00:57:10 -05:00
ssagen.AddAux(&p.To, v)
case ssa.OpAMD64LoweredAtomicAnd64, ssa.OpAMD64LoweredAtomicOr64, ssa.OpAMD64LoweredAtomicAnd32, ssa.OpAMD64LoweredAtomicOr32:
// Atomic memory operations that need to return the old value.
// We need to do these with compare-and-exchange to get access to the old value.
// loop:
// MOVQ mask, tmp
// MOVQ (addr), AX
// ANDQ AX, tmp
// LOCK CMPXCHGQ tmp, (addr) : note that AX is implicit old value to compare against
// JNE loop
// : result in AX
mov := x86.AMOVQ
op := x86.AANDQ
cmpxchg := x86.ACMPXCHGQ
switch v.Op {
case ssa.OpAMD64LoweredAtomicOr64:
op = x86.AORQ
case ssa.OpAMD64LoweredAtomicAnd32:
mov = x86.AMOVL
op = x86.AANDL
cmpxchg = x86.ACMPXCHGL
case ssa.OpAMD64LoweredAtomicOr32:
mov = x86.AMOVL
op = x86.AORL
cmpxchg = x86.ACMPXCHGL
}
addr := v.Args[0].Reg()
mask := v.Args[1].Reg()
tmp := v.RegTmp()
p1 := s.Prog(mov)
p1.From.Type = obj.TYPE_REG
p1.From.Reg = mask
p1.To.Type = obj.TYPE_REG
p1.To.Reg = tmp
p2 := s.Prog(mov)
p2.From.Type = obj.TYPE_MEM
p2.From.Reg = addr
ssagen.AddAux(&p2.From, v)
p2.To.Type = obj.TYPE_REG
p2.To.Reg = x86.REG_AX
p3 := s.Prog(op)
p3.From.Type = obj.TYPE_REG
p3.From.Reg = x86.REG_AX
p3.To.Type = obj.TYPE_REG
p3.To.Reg = tmp
s.Prog(x86.ALOCK)
p5 := s.Prog(cmpxchg)
p5.From.Type = obj.TYPE_REG
p5.From.Reg = tmp
p5.To.Type = obj.TYPE_MEM
p5.To.Reg = addr
ssagen.AddAux(&p5.To, v)
p6 := s.Prog(x86.AJNE)
p6.To.Type = obj.TYPE_BRANCH
p6.To.SetTarget(p1)
cmd/compile: add prefetch intrinsic support This CL provide new intrinsics to emit prefetch instructions for AMD64 and ARM64 platforms: Prefetch - prefetches data from memory address to cache; PrefetchStreamed - prefetches data from memory address, with a hint that this data is being streamed. This patch also provides prefetch calls pointed by RSC inside scanobject and greyobject of GC mark logic. Performance results provided by Michael: https://perf.golang.org/search?q=upload:20210901.9 Benchmark parameters: tree2 -heapsize=1000000000 -cpus=8 tree -n=18 parser peano Benchmarks AMD64 (Xeon - Cascade Lake): name old time/op new time/op delta Tree2-8 36.1ms ± 6% 33.4ms ± 5% -7.65% (p=0.000 n=9+9) Tree-8 326ms ± 1% 324ms ± 1% -0.44% (p=0.006 n=9+10) Parser-8 2.75s ± 1% 2.71s ± 1% -1.47% (p=0.008 n=5+5) Peano-8 63.1ms ± 1% 63.0ms ± 1% ~ (p=0.730 n=9+9) [Geo mean] 213ms 207ms -2.45% Benchmarks ARM64 (Kunpeng 920): name old time/op new time/op delta Tree2-8 50.3ms ± 8% 44.1ms ± 5% -12.24% (p=0.000 n=10+9) Tree-8 494ms ± 1% 493ms ± 1% ~ (p=0.684 n=10+10) Parser-8 3.99s ± 1% 3.93s ± 1% -1.37% (p=0.016 n=5+5) Peano-8 84.4ms ± 0% 84.1ms ± 1% ~ (p=0.068 n=8+10) [Geo mean] 302ms 291ms -3.67% Change-Id: I43e10bc2f9512dc49d7631dd8843a79036fa43d0 Reviewed-on: https://go-review.googlesource.com/c/go/+/328289 Reviewed-by: Austin Clements <austin@google.com> Reviewed-by: Cherry Mui <cherryyz@google.com> Run-TryBot: Austin Clements <austin@google.com> TryBot-Result: Go Bot <gobot@golang.org>
2021-06-15 14:04:30 +00:00
case ssa.OpAMD64PrefetchT0, ssa.OpAMD64PrefetchNTA:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_MEM
p.From.Reg = v.Args[0].Reg()
case ssa.OpClobber:
p := s.Prog(x86.AMOVL)
p.From.Type = obj.TYPE_CONST
p.From.Offset = 0xdeaddead
p.To.Type = obj.TYPE_MEM
p.To.Reg = x86.REG_SP
[dev.regabi] cmd/compile: split out package ssagen [generated] [git-generate] cd src/cmd/compile/internal/gc rf ' # maxOpenDefers is declared in ssa.go but used only by walk. mv maxOpenDefers walk.go # gc.Arch -> ssagen.Arch # It is not as nice but will do for now. mv Arch ArchInfo mv thearch Arch mv Arch ArchInfo arch.go # Pull dwarf out of pgen.go. mv debuginfo declPos createDwarfVars preInliningDcls \ createSimpleVars createSimpleVar \ createComplexVars createComplexVar \ dwarf.go # Pull high-level compilation out of pgen.go, # leaving only the SSA code. mv compilequeue funccompile compile compilenow \ compileFunctions isInlinableButNotInlined \ initLSym \ compile.go mv BoundsCheckFunc GCWriteBarrierReg ssa.go mv largeStack largeStackFrames CheckLargeStacks pgen.go # All that is left in dcl.go is the nowritebarrierrecCheck mv dcl.go nowb.go # Export API and unexport non-API. mv initssaconfig InitConfig mv isIntrinsicCall IsIntrinsicCall mv ssaDumpInline DumpInline mv initSSATables InitTables mv initSSAEnv InitEnv mv compileSSA Compile mv stackOffset StackOffset mv canSSAType TypeOK mv SSAGenState State mv FwdRefAux fwdRefAux mv cgoSymABIs CgoSymABIs mv readSymABIs ReadSymABIs mv initLSym InitLSym mv useABIWrapGen symabiDefs CgoSymABIs ReadSymABIs InitLSym selectLSym makeABIWrapper setupTextLSym abi.go mv arch.go abi.go nowb.go phi.go pgen.go pgen_test.go ssa.go cmd/compile/internal/ssagen ' rm go.go gsubr.go Change-Id: I47fad6cbf1d1e583fd9139003a08401d7cd048a1 Reviewed-on: https://go-review.googlesource.com/c/go/+/279476 Trust: Russ Cox <rsc@golang.org> Run-TryBot: Russ Cox <rsc@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com>
2020-12-23 00:57:10 -05:00
ssagen.AddAux(&p.To, v)
p = s.Prog(x86.AMOVL)
p.From.Type = obj.TYPE_CONST
p.From.Offset = 0xdeaddead
p.To.Type = obj.TYPE_MEM
p.To.Reg = x86.REG_SP
[dev.regabi] cmd/compile: split out package ssagen [generated] [git-generate] cd src/cmd/compile/internal/gc rf ' # maxOpenDefers is declared in ssa.go but used only by walk. mv maxOpenDefers walk.go # gc.Arch -> ssagen.Arch # It is not as nice but will do for now. mv Arch ArchInfo mv thearch Arch mv Arch ArchInfo arch.go # Pull dwarf out of pgen.go. mv debuginfo declPos createDwarfVars preInliningDcls \ createSimpleVars createSimpleVar \ createComplexVars createComplexVar \ dwarf.go # Pull high-level compilation out of pgen.go, # leaving only the SSA code. mv compilequeue funccompile compile compilenow \ compileFunctions isInlinableButNotInlined \ initLSym \ compile.go mv BoundsCheckFunc GCWriteBarrierReg ssa.go mv largeStack largeStackFrames CheckLargeStacks pgen.go # All that is left in dcl.go is the nowritebarrierrecCheck mv dcl.go nowb.go # Export API and unexport non-API. mv initssaconfig InitConfig mv isIntrinsicCall IsIntrinsicCall mv ssaDumpInline DumpInline mv initSSATables InitTables mv initSSAEnv InitEnv mv compileSSA Compile mv stackOffset StackOffset mv canSSAType TypeOK mv SSAGenState State mv FwdRefAux fwdRefAux mv cgoSymABIs CgoSymABIs mv readSymABIs ReadSymABIs mv initLSym InitLSym mv useABIWrapGen symabiDefs CgoSymABIs ReadSymABIs InitLSym selectLSym makeABIWrapper setupTextLSym abi.go mv arch.go abi.go nowb.go phi.go pgen.go pgen_test.go ssa.go cmd/compile/internal/ssagen ' rm go.go gsubr.go Change-Id: I47fad6cbf1d1e583fd9139003a08401d7cd048a1 Reviewed-on: https://go-review.googlesource.com/c/go/+/279476 Trust: Russ Cox <rsc@golang.org> Run-TryBot: Russ Cox <rsc@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com>
2020-12-23 00:57:10 -05:00
ssagen.AddAux(&p.To, v)
p.To.Offset += 4
case ssa.OpClobberReg:
x := uint64(0xdeaddeaddeaddead)
p := s.Prog(x86.AMOVQ)
p.From.Type = obj.TYPE_CONST
p.From.Offset = int64(x)
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
// SIMD ops
case ssa.OpAMD64VZEROUPPER, ssa.OpAMD64VZEROALL:
s.Prog(v.Op.Asm())
case ssa.OpAMD64Zero128, ssa.OpAMD64Zero256, ssa.OpAMD64Zero512: // no code emitted
case ssa.OpAMD64VMOVSSf2v, ssa.OpAMD64VMOVSDf2v:
// These are for initializing the least 32/64 bits of a SIMD register from a "float".
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[0].Reg()
p.AddRestSourceReg(x86.REG_X15)
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
case ssa.OpAMD64VMOVQload, ssa.OpAMD64VMOVDload,
ssa.OpAMD64VMOVSSload, ssa.OpAMD64VMOVSDload:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_MEM
p.From.Reg = v.Args[0].Reg()
ssagen.AddAux(&p.From, v)
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
case ssa.OpAMD64VMOVSSconst, ssa.OpAMD64VMOVSDconst:
// for loading constants directly into SIMD registers
x := simdReg(v)
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_FCONST
p.From.Val = math.Float64frombits(uint64(v.AuxInt))
p.To.Type = obj.TYPE_REG
p.To.Reg = x
case ssa.OpAMD64VMOVD, ssa.OpAMD64VMOVQ:
// These are for initializing the least 32/64 bits of a SIMD register from an "int".
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[0].Reg()
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
case ssa.OpAMD64VMOVDQUload128, ssa.OpAMD64VMOVDQUload256, ssa.OpAMD64VMOVDQUload512,
ssa.OpAMD64KMOVBload, ssa.OpAMD64KMOVWload, ssa.OpAMD64KMOVDload, ssa.OpAMD64KMOVQload:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_MEM
p.From.Reg = v.Args[0].Reg()
ssagen.AddAux(&p.From, v)
p.To.Type = obj.TYPE_REG
p.To.Reg = simdOrMaskReg(v)
case ssa.OpAMD64VMOVDQUstore128, ssa.OpAMD64VMOVDQUstore256, ssa.OpAMD64VMOVDQUstore512,
ssa.OpAMD64KMOVBstore, ssa.OpAMD64KMOVWstore, ssa.OpAMD64KMOVDstore, ssa.OpAMD64KMOVQstore:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = simdOrMaskReg(v.Args[1])
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
ssagen.AddAux(&p.To, v)
case ssa.OpAMD64VPMASK32load128, ssa.OpAMD64VPMASK64load128, ssa.OpAMD64VPMASK32load256, ssa.OpAMD64VPMASK64load256:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_MEM
p.From.Reg = v.Args[0].Reg()
ssagen.AddAux(&p.From, v)
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
p.AddRestSourceReg(simdReg(v.Args[1])) // masking simd reg
case ssa.OpAMD64VPMASK32store128, ssa.OpAMD64VPMASK64store128, ssa.OpAMD64VPMASK32store256, ssa.OpAMD64VPMASK64store256:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = simdReg(v.Args[2])
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
ssagen.AddAux(&p.To, v)
p.AddRestSourceReg(simdReg(v.Args[1])) // masking simd reg
case ssa.OpAMD64VPMASK64load512, ssa.OpAMD64VPMASK32load512, ssa.OpAMD64VPMASK16load512, ssa.OpAMD64VPMASK8load512:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_MEM
p.From.Reg = v.Args[0].Reg()
ssagen.AddAux(&p.From, v)
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
p.AddRestSourceReg(v.Args[1].Reg()) // simd mask reg
x86.ParseSuffix(p, "Z") // must be zero if not in mask
case ssa.OpAMD64VPMASK64store512, ssa.OpAMD64VPMASK32store512, ssa.OpAMD64VPMASK16store512, ssa.OpAMD64VPMASK8store512:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = simdReg(v.Args[2])
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
ssagen.AddAux(&p.To, v)
p.AddRestSourceReg(v.Args[1].Reg()) // simd mask reg
case ssa.OpAMD64VPMOVMToVec8x16,
ssa.OpAMD64VPMOVMToVec8x32,
ssa.OpAMD64VPMOVMToVec8x64,
ssa.OpAMD64VPMOVMToVec16x8,
ssa.OpAMD64VPMOVMToVec16x16,
ssa.OpAMD64VPMOVMToVec16x32,
ssa.OpAMD64VPMOVMToVec32x4,
ssa.OpAMD64VPMOVMToVec32x8,
ssa.OpAMD64VPMOVMToVec32x16,
ssa.OpAMD64VPMOVMToVec64x2,
ssa.OpAMD64VPMOVMToVec64x4,
ssa.OpAMD64VPMOVMToVec64x8:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[0].Reg()
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
case ssa.OpAMD64VPMOVVec8x16ToM,
ssa.OpAMD64VPMOVVec8x32ToM,
ssa.OpAMD64VPMOVVec8x64ToM,
ssa.OpAMD64VPMOVVec16x8ToM,
ssa.OpAMD64VPMOVVec16x16ToM,
ssa.OpAMD64VPMOVVec16x32ToM,
ssa.OpAMD64VPMOVVec32x4ToM,
ssa.OpAMD64VPMOVVec32x8ToM,
ssa.OpAMD64VPMOVVec32x16ToM,
ssa.OpAMD64VPMOVVec64x2ToM,
ssa.OpAMD64VPMOVVec64x4ToM,
ssa.OpAMD64VPMOVVec64x8ToM:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = simdReg(v.Args[0])
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpAMD64KMOVQk, ssa.OpAMD64KMOVDk, ssa.OpAMD64KMOVWk, ssa.OpAMD64KMOVBk,
ssa.OpAMD64KMOVQi, ssa.OpAMD64KMOVDi, ssa.OpAMD64KMOVWi, ssa.OpAMD64KMOVBi:
// See also ssa.OpAMD64KMOVQload
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[0].Reg()
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpAMD64VPTEST:
// Some instructions setting flags put their second operand into the destination reg.
// See also CMP[BWDQ].
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = simdReg(v.Args[0])
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v.Args[1])
default:
if !ssaGenSIMDValue(s, v) {
v.Fatalf("genValue not implemented: %s", v.LongString())
}
}
}
// zeroX15 zeroes the X15 register.
func zeroX15(s *ssagen.State) {
vxorps := func(s *ssagen.State) {
p := s.Prog(x86.AVXORPS)
p.From.Type = obj.TYPE_REG
p.From.Reg = x86.REG_X15
p.AddRestSourceReg(x86.REG_X15)
p.To.Type = obj.TYPE_REG
p.To.Reg = x86.REG_X15
}
if buildcfg.GOAMD64 >= 3 {
vxorps(s)
return
}
// AVX may not be available, check before zeroing the high bits.
p := s.Prog(x86.ACMPB)
p.From.Type = obj.TYPE_MEM
p.From.Name = obj.NAME_EXTERN
p.From.Sym = ir.Syms.X86HasAVX
p.To.Type = obj.TYPE_CONST
p.To.Offset = 1
jmp := s.Prog(x86.AJNE)
jmp.To.Type = obj.TYPE_BRANCH
vxorps(s)
sse := opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
jmp.To.SetTarget(sse)
}
// Example instruction: VRSQRTPS X1, X1
func simdV11(s *ssagen.State, v *ssa.Value) *obj.Prog {
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = simdReg(v.Args[0])
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
return p
}
// Example instruction: VPSUBD X1, X2, X3
func simdV21(s *ssagen.State, v *ssa.Value) *obj.Prog {
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
// Vector registers operands follows a right-to-left order.
// e.g. VPSUBD X1, X2, X3 means X3 = X2 - X1.
p.From.Reg = simdReg(v.Args[1])
p.AddRestSourceReg(simdReg(v.Args[0]))
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
return p
}
// This function is to accustomize the shifts.
// The 2nd arg is an XMM, and this function merely checks that.
// Example instruction: VPSLLQ Z1, X1, Z2
func simdVfpv(s *ssagen.State, v *ssa.Value) *obj.Prog {
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
// Vector registers operands follows a right-to-left order.
// e.g. VPSUBD X1, X2, X3 means X3 = X2 - X1.
p.From.Reg = v.Args[1].Reg()
p.AddRestSourceReg(simdReg(v.Args[0]))
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
return p
}
// Example instruction: VPCMPEQW Z26, Z30, K4
func simdV2k(s *ssagen.State, v *ssa.Value) *obj.Prog {
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = simdReg(v.Args[1])
p.AddRestSourceReg(simdReg(v.Args[0]))
p.To.Type = obj.TYPE_REG
p.To.Reg = maskReg(v)
return p
}
// Example instruction: VPMINUQ X21, X3, K3, X31
func simdV2kv(s *ssagen.State, v *ssa.Value) *obj.Prog {
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = simdReg(v.Args[1])
p.AddRestSourceReg(simdReg(v.Args[0]))
// These "simd*" series of functions assumes:
// Any "K" register that serves as the write-mask
// or "predicate" for "predicated AVX512 instructions"
// sits right at the end of the operand list.
// TODO: verify this assumption.
p.AddRestSourceReg(maskReg(v.Args[2]))
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
return p
}
// Example instruction: VPABSB X1, X2, K3 (masking merging)
func simdV2kvResultInArg0(s *ssagen.State, v *ssa.Value) *obj.Prog {
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = simdReg(v.Args[1])
// These "simd*" series of functions assumes:
// Any "K" register that serves as the write-mask
// or "predicate" for "predicated AVX512 instructions"
// sits right at the end of the operand list.
// TODO: verify this assumption.
p.AddRestSourceReg(maskReg(v.Args[2]))
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
return p
}
// This function is to accustomize the shifts.
// The 2nd arg is an XMM, and this function merely checks that.
// Example instruction: VPSLLQ Z1, X1, K1, Z2
func simdVfpkv(s *ssagen.State, v *ssa.Value) *obj.Prog {
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[1].Reg()
p.AddRestSourceReg(simdReg(v.Args[0]))
p.AddRestSourceReg(maskReg(v.Args[2]))
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
return p
}
// Example instruction: VPCMPEQW Z26, Z30, K1, K4
func simdV2kk(s *ssagen.State, v *ssa.Value) *obj.Prog {
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = simdReg(v.Args[1])
p.AddRestSourceReg(simdReg(v.Args[0]))
p.AddRestSourceReg(maskReg(v.Args[2]))
p.To.Type = obj.TYPE_REG
p.To.Reg = maskReg(v)
return p
}
// Example instruction: VPOPCNTB X14, K4, X16
func simdVkv(s *ssagen.State, v *ssa.Value) *obj.Prog {
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = simdReg(v.Args[0])
p.AddRestSourceReg(maskReg(v.Args[1]))
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
return p
}
// Example instruction: VROUNDPD $7, X2, X2
func simdV11Imm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
p := s.Prog(v.Op.Asm())
p.From.Offset = int64(v.AuxUInt8())
p.From.Type = obj.TYPE_CONST
p.AddRestSourceReg(simdReg(v.Args[0]))
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
return p
}
// Example instruction: VREDUCEPD $126, X1, K3, X31
func simdVkvImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
p := s.Prog(v.Op.Asm())
p.From.Offset = int64(v.AuxUInt8())
p.From.Type = obj.TYPE_CONST
p.AddRestSourceReg(simdReg(v.Args[0]))
p.AddRestSourceReg(maskReg(v.Args[1]))
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
return p
}
// Example instruction: VCMPPS $7, X2, X9, X2
func simdV21Imm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
p := s.Prog(v.Op.Asm())
p.From.Offset = int64(v.AuxUInt8())
p.From.Type = obj.TYPE_CONST
p.AddRestSourceReg(simdReg(v.Args[1]))
p.AddRestSourceReg(simdReg(v.Args[0]))
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
return p
}
// Example instruction: VPINSRB $3, DX, X0, X0
func simdVgpvImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
p := s.Prog(v.Op.Asm())
p.From.Offset = int64(v.AuxUInt8())
p.From.Type = obj.TYPE_CONST
p.AddRestSourceReg(v.Args[1].Reg())
p.AddRestSourceReg(simdReg(v.Args[0]))
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
return p
}
// Example instruction: VPCMPD $1, Z1, Z2, K1
func simdV2kImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
p := s.Prog(v.Op.Asm())
p.From.Offset = int64(v.AuxUInt8())
p.From.Type = obj.TYPE_CONST
p.AddRestSourceReg(simdReg(v.Args[1]))
p.AddRestSourceReg(simdReg(v.Args[0]))
p.To.Type = obj.TYPE_REG
p.To.Reg = maskReg(v)
return p
}
// Example instruction: VPCMPD $1, Z1, Z2, K2, K1
func simdV2kkImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
p := s.Prog(v.Op.Asm())
p.From.Offset = int64(v.AuxUInt8())
p.From.Type = obj.TYPE_CONST
p.AddRestSourceReg(simdReg(v.Args[1]))
p.AddRestSourceReg(simdReg(v.Args[0]))
p.AddRestSourceReg(maskReg(v.Args[2]))
p.To.Type = obj.TYPE_REG
p.To.Reg = maskReg(v)
return p
}
func simdV2kvImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
p := s.Prog(v.Op.Asm())
p.From.Offset = int64(v.AuxUInt8())
p.From.Type = obj.TYPE_CONST
p.AddRestSourceReg(simdReg(v.Args[1]))
p.AddRestSourceReg(simdReg(v.Args[0]))
p.AddRestSourceReg(maskReg(v.Args[2]))
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
return p
}
// Example instruction: VFMADD213PD Z2, Z1, Z0
func simdV31ResultInArg0(s *ssagen.State, v *ssa.Value) *obj.Prog {
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = simdReg(v.Args[2])
p.AddRestSourceReg(simdReg(v.Args[1]))
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
return p
}
func simdV31ResultInArg0Imm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
p := s.Prog(v.Op.Asm())
p.From.Offset = int64(v.AuxUInt8())
p.From.Type = obj.TYPE_CONST
p.AddRestSourceReg(simdReg(v.Args[2]))
p.AddRestSourceReg(simdReg(v.Args[1]))
// p.AddRestSourceReg(x86.REG_K0)
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
return p
}
// v31loadResultInArg0Imm8
// Example instruction:
// for (VPTERNLOGD128load {sym} [makeValAndOff(int32(int8(c)),off)] x y ptr mem)
func simdV31loadResultInArg0Imm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
sc := v.AuxValAndOff()
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_CONST
p.From.Offset = sc.Val64()
m := obj.Addr{Type: obj.TYPE_MEM, Reg: v.Args[2].Reg()}
ssagen.AddAux2(&m, v, sc.Off64())
p.AddRestSource(m)
p.AddRestSourceReg(simdReg(v.Args[1]))
return p
}
// Example instruction: VFMADD213PD Z2, Z1, K1, Z0
func simdV3kvResultInArg0(s *ssagen.State, v *ssa.Value) *obj.Prog {
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = simdReg(v.Args[2])
p.AddRestSourceReg(simdReg(v.Args[1]))
p.AddRestSourceReg(maskReg(v.Args[3]))
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
return p
}
func simdVgpImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
p := s.Prog(v.Op.Asm())
p.From.Offset = int64(v.AuxUInt8())
p.From.Type = obj.TYPE_CONST
p.AddRestSourceReg(simdReg(v.Args[0]))
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
return p
}
// Currently unused
func simdV31(s *ssagen.State, v *ssa.Value) *obj.Prog {
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = simdReg(v.Args[2])
p.AddRestSourceReg(simdReg(v.Args[1]))
p.AddRestSourceReg(simdReg(v.Args[0]))
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
return p
}
// Currently unused
func simdV3kv(s *ssagen.State, v *ssa.Value) *obj.Prog {
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = simdReg(v.Args[2])
p.AddRestSourceReg(simdReg(v.Args[1]))
p.AddRestSourceReg(simdReg(v.Args[0]))
p.AddRestSourceReg(maskReg(v.Args[3]))
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
return p
}
// Example instruction: VRCP14PS (DI), K6, X22
func simdVkvload(s *ssagen.State, v *ssa.Value) *obj.Prog {
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_MEM
p.From.Reg = v.Args[0].Reg()
ssagen.AddAux(&p.From, v)
p.AddRestSourceReg(maskReg(v.Args[1]))
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
return p
}
// Example instruction: VPSLLVD (DX), X7, X18
func simdV21load(s *ssagen.State, v *ssa.Value) *obj.Prog {
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_MEM
p.From.Reg = v.Args[1].Reg()
ssagen.AddAux(&p.From, v)
p.AddRestSourceReg(simdReg(v.Args[0]))
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
return p
}
// Example instruction: VPDPWSSD (SI), X24, X18
func simdV31loadResultInArg0(s *ssagen.State, v *ssa.Value) *obj.Prog {
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_MEM
p.From.Reg = v.Args[2].Reg()
ssagen.AddAux(&p.From, v)
p.AddRestSourceReg(simdReg(v.Args[1]))
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
return p
}
// Example instruction: VPDPWSSD (SI), X24, K1, X18
func simdV3kvloadResultInArg0(s *ssagen.State, v *ssa.Value) *obj.Prog {
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_MEM
p.From.Reg = v.Args[2].Reg()
ssagen.AddAux(&p.From, v)
p.AddRestSourceReg(simdReg(v.Args[1]))
p.AddRestSourceReg(maskReg(v.Args[3]))
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
return p
}
// Example instruction: VPSLLVD (SI), X1, K1, X2
func simdV2kvload(s *ssagen.State, v *ssa.Value) *obj.Prog {
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_MEM
p.From.Reg = v.Args[1].Reg()
ssagen.AddAux(&p.From, v)
p.AddRestSourceReg(simdReg(v.Args[0]))
p.AddRestSourceReg(maskReg(v.Args[2]))
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
return p
}
// Example instruction: VPCMPEQD (SI), X1, K1
func simdV2kload(s *ssagen.State, v *ssa.Value) *obj.Prog {
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_MEM
p.From.Reg = v.Args[1].Reg()
ssagen.AddAux(&p.From, v)
p.AddRestSourceReg(simdReg(v.Args[0]))
p.To.Type = obj.TYPE_REG
p.To.Reg = maskReg(v)
return p
}
// Example instruction: VCVTTPS2DQ (BX), X2
func simdV11load(s *ssagen.State, v *ssa.Value) *obj.Prog {
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_MEM
p.From.Reg = v.Args[0].Reg()
ssagen.AddAux(&p.From, v)
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
return p
}
// Example instruction: VPSHUFD $7, (BX), X11
func simdV11loadImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
sc := v.AuxValAndOff()
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_CONST
p.From.Offset = sc.Val64()
m := obj.Addr{Type: obj.TYPE_MEM, Reg: v.Args[0].Reg()}
ssagen.AddAux2(&m, v, sc.Off64())
p.AddRestSource(m)
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
return p
}
// Example instruction: VPRORD $81, -15(R14), K7, Y1
func simdVkvloadImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
sc := v.AuxValAndOff()
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_CONST
p.From.Offset = sc.Val64()
m := obj.Addr{Type: obj.TYPE_MEM, Reg: v.Args[0].Reg()}
ssagen.AddAux2(&m, v, sc.Off64())
p.AddRestSource(m)
p.AddRestSourceReg(maskReg(v.Args[1]))
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
return p
}
// Example instruction: VPSHLDD $82, 7(SI), Y21, Y3
func simdV21loadImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
sc := v.AuxValAndOff()
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_CONST
p.From.Offset = sc.Val64()
m := obj.Addr{Type: obj.TYPE_MEM, Reg: v.Args[1].Reg()}
ssagen.AddAux2(&m, v, sc.Off64())
p.AddRestSource(m)
p.AddRestSourceReg(simdReg(v.Args[0]))
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
return p
}
// Example instruction: VCMPPS $81, -7(DI), Y16, K3
func simdV2kloadImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
sc := v.AuxValAndOff()
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_CONST
p.From.Offset = sc.Val64()
m := obj.Addr{Type: obj.TYPE_MEM, Reg: v.Args[1].Reg()}
ssagen.AddAux2(&m, v, sc.Off64())
p.AddRestSource(m)
p.AddRestSourceReg(simdReg(v.Args[0]))
p.To.Type = obj.TYPE_REG
p.To.Reg = maskReg(v)
return p
}
// Example instruction: VCMPPS $81, -7(DI), Y16, K1, K3
func simdV2kkloadImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
sc := v.AuxValAndOff()
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_CONST
p.From.Offset = sc.Val64()
m := obj.Addr{Type: obj.TYPE_MEM, Reg: v.Args[1].Reg()}
ssagen.AddAux2(&m, v, sc.Off64())
p.AddRestSource(m)
p.AddRestSourceReg(simdReg(v.Args[0]))
p.AddRestSourceReg(maskReg(v.Args[2]))
p.To.Type = obj.TYPE_REG
p.To.Reg = maskReg(v)
return p
}
// Example instruction: VGF2P8AFFINEINVQB $64, -17(BP), X31, K3, X26
func simdV2kvloadImm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
sc := v.AuxValAndOff()
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_CONST
p.From.Offset = sc.Val64()
m := obj.Addr{Type: obj.TYPE_MEM, Reg: v.Args[1].Reg()}
ssagen.AddAux2(&m, v, sc.Off64())
p.AddRestSource(m)
p.AddRestSourceReg(simdReg(v.Args[0]))
p.AddRestSourceReg(maskReg(v.Args[2]))
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
return p
}
// Example instruction: SHA1NEXTE X2, X2
func simdV21ResultInArg0(s *ssagen.State, v *ssa.Value) *obj.Prog {
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = simdReg(v.Args[1])
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
return p
}
// Example instruction: SHA1RNDS4 $1, X2, X2
func simdV21ResultInArg0Imm8(s *ssagen.State, v *ssa.Value) *obj.Prog {
p := s.Prog(v.Op.Asm())
p.From.Offset = int64(v.AuxUInt8())
p.From.Type = obj.TYPE_CONST
p.AddRestSourceReg(simdReg(v.Args[1]))
p.To.Type = obj.TYPE_REG
p.To.Reg = simdReg(v)
return p
}
// Example instruction: SHA256RNDS2 X0, X11, X2
func simdV31x0AtIn2ResultInArg0(s *ssagen.State, v *ssa.Value) *obj.Prog {
return simdV31ResultInArg0(s, v)
}
var blockJump = [...]struct {
asm, invasm obj.As
}{
ssa.BlockAMD64EQ: {x86.AJEQ, x86.AJNE},
ssa.BlockAMD64NE: {x86.AJNE, x86.AJEQ},
ssa.BlockAMD64LT: {x86.AJLT, x86.AJGE},
ssa.BlockAMD64GE: {x86.AJGE, x86.AJLT},
ssa.BlockAMD64LE: {x86.AJLE, x86.AJGT},
ssa.BlockAMD64GT: {x86.AJGT, x86.AJLE},
ssa.BlockAMD64OS: {x86.AJOS, x86.AJOC},
ssa.BlockAMD64OC: {x86.AJOC, x86.AJOS},
ssa.BlockAMD64ULT: {x86.AJCS, x86.AJCC},
ssa.BlockAMD64UGE: {x86.AJCC, x86.AJCS},
ssa.BlockAMD64UGT: {x86.AJHI, x86.AJLS},
ssa.BlockAMD64ULE: {x86.AJLS, x86.AJHI},
ssa.BlockAMD64ORD: {x86.AJPC, x86.AJPS},
ssa.BlockAMD64NAN: {x86.AJPS, x86.AJPC},
}
[dev.regabi] cmd/compile: split out package ssagen [generated] [git-generate] cd src/cmd/compile/internal/gc rf ' # maxOpenDefers is declared in ssa.go but used only by walk. mv maxOpenDefers walk.go # gc.Arch -> ssagen.Arch # It is not as nice but will do for now. mv Arch ArchInfo mv thearch Arch mv Arch ArchInfo arch.go # Pull dwarf out of pgen.go. mv debuginfo declPos createDwarfVars preInliningDcls \ createSimpleVars createSimpleVar \ createComplexVars createComplexVar \ dwarf.go # Pull high-level compilation out of pgen.go, # leaving only the SSA code. mv compilequeue funccompile compile compilenow \ compileFunctions isInlinableButNotInlined \ initLSym \ compile.go mv BoundsCheckFunc GCWriteBarrierReg ssa.go mv largeStack largeStackFrames CheckLargeStacks pgen.go # All that is left in dcl.go is the nowritebarrierrecCheck mv dcl.go nowb.go # Export API and unexport non-API. mv initssaconfig InitConfig mv isIntrinsicCall IsIntrinsicCall mv ssaDumpInline DumpInline mv initSSATables InitTables mv initSSAEnv InitEnv mv compileSSA Compile mv stackOffset StackOffset mv canSSAType TypeOK mv SSAGenState State mv FwdRefAux fwdRefAux mv cgoSymABIs CgoSymABIs mv readSymABIs ReadSymABIs mv initLSym InitLSym mv useABIWrapGen symabiDefs CgoSymABIs ReadSymABIs InitLSym selectLSym makeABIWrapper setupTextLSym abi.go mv arch.go abi.go nowb.go phi.go pgen.go pgen_test.go ssa.go cmd/compile/internal/ssagen ' rm go.go gsubr.go Change-Id: I47fad6cbf1d1e583fd9139003a08401d7cd048a1 Reviewed-on: https://go-review.googlesource.com/c/go/+/279476 Trust: Russ Cox <rsc@golang.org> Run-TryBot: Russ Cox <rsc@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com>
2020-12-23 00:57:10 -05:00
var eqfJumps = [2][2]ssagen.IndexJump{
{{Jump: x86.AJNE, Index: 1}, {Jump: x86.AJPS, Index: 1}}, // next == b.Succs[0]
{{Jump: x86.AJNE, Index: 1}, {Jump: x86.AJPC, Index: 0}}, // next == b.Succs[1]
}
[dev.regabi] cmd/compile: split out package ssagen [generated] [git-generate] cd src/cmd/compile/internal/gc rf ' # maxOpenDefers is declared in ssa.go but used only by walk. mv maxOpenDefers walk.go # gc.Arch -> ssagen.Arch # It is not as nice but will do for now. mv Arch ArchInfo mv thearch Arch mv Arch ArchInfo arch.go # Pull dwarf out of pgen.go. mv debuginfo declPos createDwarfVars preInliningDcls \ createSimpleVars createSimpleVar \ createComplexVars createComplexVar \ dwarf.go # Pull high-level compilation out of pgen.go, # leaving only the SSA code. mv compilequeue funccompile compile compilenow \ compileFunctions isInlinableButNotInlined \ initLSym \ compile.go mv BoundsCheckFunc GCWriteBarrierReg ssa.go mv largeStack largeStackFrames CheckLargeStacks pgen.go # All that is left in dcl.go is the nowritebarrierrecCheck mv dcl.go nowb.go # Export API and unexport non-API. mv initssaconfig InitConfig mv isIntrinsicCall IsIntrinsicCall mv ssaDumpInline DumpInline mv initSSATables InitTables mv initSSAEnv InitEnv mv compileSSA Compile mv stackOffset StackOffset mv canSSAType TypeOK mv SSAGenState State mv FwdRefAux fwdRefAux mv cgoSymABIs CgoSymABIs mv readSymABIs ReadSymABIs mv initLSym InitLSym mv useABIWrapGen symabiDefs CgoSymABIs ReadSymABIs InitLSym selectLSym makeABIWrapper setupTextLSym abi.go mv arch.go abi.go nowb.go phi.go pgen.go pgen_test.go ssa.go cmd/compile/internal/ssagen ' rm go.go gsubr.go Change-Id: I47fad6cbf1d1e583fd9139003a08401d7cd048a1 Reviewed-on: https://go-review.googlesource.com/c/go/+/279476 Trust: Russ Cox <rsc@golang.org> Run-TryBot: Russ Cox <rsc@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com>
2020-12-23 00:57:10 -05:00
var nefJumps = [2][2]ssagen.IndexJump{
{{Jump: x86.AJNE, Index: 0}, {Jump: x86.AJPC, Index: 1}}, // next == b.Succs[0]
{{Jump: x86.AJNE, Index: 0}, {Jump: x86.AJPS, Index: 0}}, // next == b.Succs[1]
}
[dev.regabi] cmd/compile: split out package ssagen [generated] [git-generate] cd src/cmd/compile/internal/gc rf ' # maxOpenDefers is declared in ssa.go but used only by walk. mv maxOpenDefers walk.go # gc.Arch -> ssagen.Arch # It is not as nice but will do for now. mv Arch ArchInfo mv thearch Arch mv Arch ArchInfo arch.go # Pull dwarf out of pgen.go. mv debuginfo declPos createDwarfVars preInliningDcls \ createSimpleVars createSimpleVar \ createComplexVars createComplexVar \ dwarf.go # Pull high-level compilation out of pgen.go, # leaving only the SSA code. mv compilequeue funccompile compile compilenow \ compileFunctions isInlinableButNotInlined \ initLSym \ compile.go mv BoundsCheckFunc GCWriteBarrierReg ssa.go mv largeStack largeStackFrames CheckLargeStacks pgen.go # All that is left in dcl.go is the nowritebarrierrecCheck mv dcl.go nowb.go # Export API and unexport non-API. mv initssaconfig InitConfig mv isIntrinsicCall IsIntrinsicCall mv ssaDumpInline DumpInline mv initSSATables InitTables mv initSSAEnv InitEnv mv compileSSA Compile mv stackOffset StackOffset mv canSSAType TypeOK mv SSAGenState State mv FwdRefAux fwdRefAux mv cgoSymABIs CgoSymABIs mv readSymABIs ReadSymABIs mv initLSym InitLSym mv useABIWrapGen symabiDefs CgoSymABIs ReadSymABIs InitLSym selectLSym makeABIWrapper setupTextLSym abi.go mv arch.go abi.go nowb.go phi.go pgen.go pgen_test.go ssa.go cmd/compile/internal/ssagen ' rm go.go gsubr.go Change-Id: I47fad6cbf1d1e583fd9139003a08401d7cd048a1 Reviewed-on: https://go-review.googlesource.com/c/go/+/279476 Trust: Russ Cox <rsc@golang.org> Run-TryBot: Russ Cox <rsc@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com>
2020-12-23 00:57:10 -05:00
func ssaGenBlock(s *ssagen.State, b, next *ssa.Block) {
switch b.Kind {
cmd/compile, runtime: use PC of deferreturn for panic transfer this removes the old conditional-on-register-value handshake from the deferproc/deferprocstack logic. The "line" for the recovery-exit frame itself (not the defers that it runs) is the closing brace of the function. Reduces code size slightly (e.g. go command is 0.2% smaller) Sample output showing effect of this change, also what sort of code it requires to observe the effect: ``` package main import "os" func main() { g(len(os.Args) - 1) // stack[0] } var gi int var pi *int = &gi //go:noinline func g(i int) { switch i { case 0: defer func() { println("g0", i) q() // stack[2] if i == 0 }() for j := *pi; j < 1; j++ { defer func() { println("recover0", recover().(string)) }() } default: for j := *pi; j < 1; j++ { defer func() { println("g1", i) q() // stack[2] if i == 1 }() } defer func() { println("recover1", recover().(string)) }() } p() } // stack[1] (deferreturn) //go:noinline func p() { panic("p()") } //go:noinline func q() { panic("q()") // stack[3] } /* Sample output for "./foo foo": recover1 p() g1 1 panic: q() goroutine 1 [running]: main.q() .../main.go:46 +0x2c main.g.func3() .../main.go:29 +0x48 main.g(0x1?) .../main.go:37 +0x68 main.main() .../main.go:6 +0x28 */ ``` Change-Id: Ie39ea62ecc244213500380ea06d44024cadc2317 Reviewed-on: https://go-review.googlesource.com/c/go/+/650795 Reviewed-by: Cherry Mui <cherryyz@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
2025-02-19 16:47:31 -05:00
case ssa.BlockPlain, ssa.BlockDefer:
if b.Succs[0].Block() != next {
p := s.Prog(obj.AJMP)
p.To.Type = obj.TYPE_BRANCH
[dev.regabi] cmd/compile: split out package ssagen [generated] [git-generate] cd src/cmd/compile/internal/gc rf ' # maxOpenDefers is declared in ssa.go but used only by walk. mv maxOpenDefers walk.go # gc.Arch -> ssagen.Arch # It is not as nice but will do for now. mv Arch ArchInfo mv thearch Arch mv Arch ArchInfo arch.go # Pull dwarf out of pgen.go. mv debuginfo declPos createDwarfVars preInliningDcls \ createSimpleVars createSimpleVar \ createComplexVars createComplexVar \ dwarf.go # Pull high-level compilation out of pgen.go, # leaving only the SSA code. mv compilequeue funccompile compile compilenow \ compileFunctions isInlinableButNotInlined \ initLSym \ compile.go mv BoundsCheckFunc GCWriteBarrierReg ssa.go mv largeStack largeStackFrames CheckLargeStacks pgen.go # All that is left in dcl.go is the nowritebarrierrecCheck mv dcl.go nowb.go # Export API and unexport non-API. mv initssaconfig InitConfig mv isIntrinsicCall IsIntrinsicCall mv ssaDumpInline DumpInline mv initSSATables InitTables mv initSSAEnv InitEnv mv compileSSA Compile mv stackOffset StackOffset mv canSSAType TypeOK mv SSAGenState State mv FwdRefAux fwdRefAux mv cgoSymABIs CgoSymABIs mv readSymABIs ReadSymABIs mv initLSym InitLSym mv useABIWrapGen symabiDefs CgoSymABIs ReadSymABIs InitLSym selectLSym makeABIWrapper setupTextLSym abi.go mv arch.go abi.go nowb.go phi.go pgen.go pgen_test.go ssa.go cmd/compile/internal/ssagen ' rm go.go gsubr.go Change-Id: I47fad6cbf1d1e583fd9139003a08401d7cd048a1 Reviewed-on: https://go-review.googlesource.com/c/go/+/279476 Trust: Russ Cox <rsc@golang.org> Run-TryBot: Russ Cox <rsc@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com>
2020-12-23 00:57:10 -05:00
s.Branches = append(s.Branches, ssagen.Branch{P: p, B: b.Succs[0].Block()})
}
cmd/compile: restore tail call for method wrappers For certain type of method wrappers we used to generate a tail call. That was disabled in CL 307234 when register ABI is used, because with the current IR it was difficult to generate a tail call with the arguments in the right places. The problem was that the IR does not contain a CALL-like node with arguments; instead, it contains an OAS node that adjusts the receiver, than an OTAILCALL node that just contains the target, but no argument (with the assumption that the OAS node will put the adjusted receiver in the right place). With register ABI, putting arguments in registers are done in SSA. The assignment (OAS) doesn't put the receiver in register. This CL changes the IR of a tail call to take an actual OCALL node. Specifically, a tail call is represented as OTAILCALL (OCALL target args...) This way, the call target and args are connected through the OCALL node. So the call can be analyzed in SSA and the args can be passed in the right places. (Alternatively, we could have OTAILCALL node directly take the target and the args, without the OCALL node. Using an OCALL node is convenient as there are existing code that processes OCALL nodes which do not need to be changed. Also, a tail call is similar to ORETURN (OCALL target args...), except it doesn't preserve the frame. I did the former but I'm open to change.) The SSA representation is similar. Previously, the IR lowers to a Store the receiver then a BlockRetJmp which jumps to the target (without putting the arg in register). Now we use a TailCall op, which takes the target and the args. The call expansion pass and the register allocator handles TailCall pretty much like a StaticCall, and it will do the right ABI analysis and put the args in the right places. (Args other than the receiver are already in the right places. For register args it generates no code for them. For stack args currently it generates a self copy. I'll work on optimize that out.) BlockRetJmp is still used, signaling it is a tail call. The actual call is made in the TailCall op so BlockRetJmp generates no code (we could use BlockExit if we like). This slightly reduces binary size: old new cmd/go 14003088 13953936 cmd/link 6275552 6271456 Change-Id: I2d16d8d419fe1f17554916d317427383e17e27f0 Reviewed-on: https://go-review.googlesource.com/c/go/+/350145 Trust: Cherry Mui <cherryyz@google.com> Run-TryBot: Cherry Mui <cherryyz@google.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Matthew Dempsky <mdempsky@google.com> Reviewed-by: David Chase <drchase@google.com>
2021-09-10 22:05:55 -04:00
case ssa.BlockExit, ssa.BlockRetJmp:
case ssa.BlockRet:
s.Prog(obj.ARET)
case ssa.BlockAMD64EQF:
cmd/compile: fix incorrect rewriting to if condition Some ARM64 rewriting rules convert 'comparing to zero' conditions of if statements to a simplified version utilizing CMN and CMP instructions to branch over condition flags, in order to save one Add or Sub caculation. Such optimizations lead to wrong branching in case an overflow/underflow occurs when executing CMN or CMP. Fix the issue by introducing new block opcodes that don't honor the overflow/underflow flag, in the following categories: Block-Op Meaning ARM condition codes 1. LTnoov less than MI 2. GEnoov greater than or equal PL 3. LEnoov less than or equal MI || EQ 4. GTnoov greater than NEQ & PL The backend generates two consecutive branch instructions for 'LEnoov' and 'GTnoov' to model their expected behavior. A slight change to 'gc' and amd64/386 backends is made to unify the code generation. Add a test 'TestCondRewrite' as justification, it covers 32 incorrect rules identified on arm64, more might be needed on other arches, like 32-bit arm. Add two benchmarks profiling the aforementioned category 1&2 and category 3&4 separetely, we expect the first two categories will show performance improvement and the second will not result in visible regression compared with the non-optimized version. This change also updates TestFormats to support using %#x. Examples exhibiting where does the issue come from: 1: 'if x + 3 < 0' might be converted to: before: CMN $3, R0 BGE <else branch> // wrong branch is taken if 'x+3' overflows after: CMN $3, R0 BPL <else branch> 2: 'if y - 3 > 0' might be converted to: before: CMP $3, R0 BLE <else branch> // wrong branch is taken if 'y-3' underflows after: CMP $3, R0 BMI <else branch> BEQ <else branch> Benchmark data from different kinds of arm64 servers, 'old' is the non-optimized version (not the parent commit), generally the optimization version outperforms. S1: name old time/op new time/op delta CondRewrite/SoloJump 13.6ns ± 0% 12.9ns ± 0% -5.15% (p=0.000 n=10+10) CondRewrite/CombJump 13.8ns ± 1% 12.9ns ± 0% -6.32% (p=0.000 n=10+10) S2: name old time/op new time/op delta CondRewrite/SoloJump 11.6ns ± 0% 10.9ns ± 0% -6.03% (p=0.000 n=10+10) CondRewrite/CombJump 11.4ns ± 0% 10.8ns ± 1% -5.53% (p=0.000 n=10+10) S3: name old time/op new time/op delta CondRewrite/SoloJump 7.36ns ± 0% 7.50ns ± 0% +1.79% (p=0.000 n=9+10) CondRewrite/CombJump 7.35ns ± 0% 7.75ns ± 0% +5.51% (p=0.000 n=8+9) S4: name old time/op new time/op delta CondRewrite/SoloJump-224 11.5ns ± 1% 10.9ns ± 0% -4.97% (p=0.000 n=10+10) CondRewrite/CombJump-224 11.9ns ± 0% 11.5ns ± 0% -2.95% (p=0.000 n=10+10) S5: name old time/op new time/op delta CondRewrite/SoloJump 10.0ns ± 0% 10.0ns ± 0% -0.45% (p=0.000 n=9+10) CondRewrite/CombJump 9.93ns ± 0% 9.77ns ± 0% -1.53% (p=0.000 n=10+9) Go1 perf. data: name old time/op new time/op delta BinaryTree17 6.29s ± 1% 6.30s ± 1% ~ (p=1.000 n=5+5) Fannkuch11 5.40s ± 0% 5.40s ± 0% ~ (p=0.841 n=5+5) FmtFprintfEmpty 97.9ns ± 0% 98.9ns ± 3% ~ (p=0.937 n=4+5) FmtFprintfString 171ns ± 3% 171ns ± 2% ~ (p=0.754 n=5+5) FmtFprintfInt 212ns ± 0% 217ns ± 6% +2.55% (p=0.008 n=5+5) FmtFprintfIntInt 296ns ± 1% 297ns ± 2% ~ (p=0.516 n=5+5) FmtFprintfPrefixedInt 371ns ± 2% 374ns ± 7% ~ (p=1.000 n=5+5) FmtFprintfFloat 435ns ± 1% 439ns ± 2% ~ (p=0.056 n=5+5) FmtManyArgs 1.37µs ± 1% 1.36µs ± 1% ~ (p=0.730 n=5+5) GobDecode 14.6ms ± 4% 14.4ms ± 4% ~ (p=0.690 n=5+5) GobEncode 11.8ms ±20% 11.6ms ±15% ~ (p=1.000 n=5+5) Gzip 507ms ± 0% 491ms ± 0% -3.22% (p=0.008 n=5+5) Gunzip 73.8ms ± 0% 73.9ms ± 0% ~ (p=0.690 n=5+5) HTTPClientServer 116µs ± 0% 116µs ± 0% ~ (p=0.686 n=4+4) JSONEncode 21.8ms ± 1% 21.6ms ± 2% ~ (p=0.151 n=5+5) JSONDecode 104ms ± 1% 103ms ± 1% -1.08% (p=0.016 n=5+5) Mandelbrot200 9.53ms ± 0% 9.53ms ± 0% ~ (p=0.421 n=5+5) GoParse 7.55ms ± 1% 7.51ms ± 1% ~ (p=0.151 n=5+5) RegexpMatchEasy0_32 158ns ± 0% 158ns ± 0% ~ (all equal) RegexpMatchEasy0_1K 606ns ± 1% 608ns ± 3% ~ (p=0.937 n=5+5) RegexpMatchEasy1_32 143ns ± 0% 144ns ± 1% ~ (p=0.095 n=5+4) RegexpMatchEasy1_1K 927ns ± 2% 944ns ± 2% ~ (p=0.056 n=5+5) RegexpMatchMedium_32 16.0ns ± 0% 16.0ns ± 0% ~ (all equal) RegexpMatchMedium_1K 69.3µs ± 2% 69.7µs ± 0% ~ (p=0.690 n=5+5) RegexpMatchHard_32 3.73µs ± 0% 3.73µs ± 1% ~ (p=0.984 n=5+5) RegexpMatchHard_1K 111µs ± 1% 110µs ± 0% ~ (p=0.151 n=5+5) Revcomp 1.91s ±47% 1.77s ±68% ~ (p=1.000 n=5+5) Template 138ms ± 1% 138ms ± 1% ~ (p=1.000 n=5+5) TimeParse 787ns ± 2% 785ns ± 1% ~ (p=0.540 n=5+5) TimeFormat 729ns ± 1% 726ns ± 1% ~ (p=0.151 n=5+5) Updates #38740 Change-Id: I06c604874acdc1e63e66452dadee5df053045222 Reviewed-on: https://go-review.googlesource.com/c/go/+/233097 Reviewed-by: Keith Randall <khr@golang.org> Run-TryBot: Keith Randall <khr@golang.org>
2020-05-06 09:54:40 +00:00
s.CombJump(b, next, &eqfJumps)
case ssa.BlockAMD64NEF:
cmd/compile: fix incorrect rewriting to if condition Some ARM64 rewriting rules convert 'comparing to zero' conditions of if statements to a simplified version utilizing CMN and CMP instructions to branch over condition flags, in order to save one Add or Sub caculation. Such optimizations lead to wrong branching in case an overflow/underflow occurs when executing CMN or CMP. Fix the issue by introducing new block opcodes that don't honor the overflow/underflow flag, in the following categories: Block-Op Meaning ARM condition codes 1. LTnoov less than MI 2. GEnoov greater than or equal PL 3. LEnoov less than or equal MI || EQ 4. GTnoov greater than NEQ & PL The backend generates two consecutive branch instructions for 'LEnoov' and 'GTnoov' to model their expected behavior. A slight change to 'gc' and amd64/386 backends is made to unify the code generation. Add a test 'TestCondRewrite' as justification, it covers 32 incorrect rules identified on arm64, more might be needed on other arches, like 32-bit arm. Add two benchmarks profiling the aforementioned category 1&2 and category 3&4 separetely, we expect the first two categories will show performance improvement and the second will not result in visible regression compared with the non-optimized version. This change also updates TestFormats to support using %#x. Examples exhibiting where does the issue come from: 1: 'if x + 3 < 0' might be converted to: before: CMN $3, R0 BGE <else branch> // wrong branch is taken if 'x+3' overflows after: CMN $3, R0 BPL <else branch> 2: 'if y - 3 > 0' might be converted to: before: CMP $3, R0 BLE <else branch> // wrong branch is taken if 'y-3' underflows after: CMP $3, R0 BMI <else branch> BEQ <else branch> Benchmark data from different kinds of arm64 servers, 'old' is the non-optimized version (not the parent commit), generally the optimization version outperforms. S1: name old time/op new time/op delta CondRewrite/SoloJump 13.6ns ± 0% 12.9ns ± 0% -5.15% (p=0.000 n=10+10) CondRewrite/CombJump 13.8ns ± 1% 12.9ns ± 0% -6.32% (p=0.000 n=10+10) S2: name old time/op new time/op delta CondRewrite/SoloJump 11.6ns ± 0% 10.9ns ± 0% -6.03% (p=0.000 n=10+10) CondRewrite/CombJump 11.4ns ± 0% 10.8ns ± 1% -5.53% (p=0.000 n=10+10) S3: name old time/op new time/op delta CondRewrite/SoloJump 7.36ns ± 0% 7.50ns ± 0% +1.79% (p=0.000 n=9+10) CondRewrite/CombJump 7.35ns ± 0% 7.75ns ± 0% +5.51% (p=0.000 n=8+9) S4: name old time/op new time/op delta CondRewrite/SoloJump-224 11.5ns ± 1% 10.9ns ± 0% -4.97% (p=0.000 n=10+10) CondRewrite/CombJump-224 11.9ns ± 0% 11.5ns ± 0% -2.95% (p=0.000 n=10+10) S5: name old time/op new time/op delta CondRewrite/SoloJump 10.0ns ± 0% 10.0ns ± 0% -0.45% (p=0.000 n=9+10) CondRewrite/CombJump 9.93ns ± 0% 9.77ns ± 0% -1.53% (p=0.000 n=10+9) Go1 perf. data: name old time/op new time/op delta BinaryTree17 6.29s ± 1% 6.30s ± 1% ~ (p=1.000 n=5+5) Fannkuch11 5.40s ± 0% 5.40s ± 0% ~ (p=0.841 n=5+5) FmtFprintfEmpty 97.9ns ± 0% 98.9ns ± 3% ~ (p=0.937 n=4+5) FmtFprintfString 171ns ± 3% 171ns ± 2% ~ (p=0.754 n=5+5) FmtFprintfInt 212ns ± 0% 217ns ± 6% +2.55% (p=0.008 n=5+5) FmtFprintfIntInt 296ns ± 1% 297ns ± 2% ~ (p=0.516 n=5+5) FmtFprintfPrefixedInt 371ns ± 2% 374ns ± 7% ~ (p=1.000 n=5+5) FmtFprintfFloat 435ns ± 1% 439ns ± 2% ~ (p=0.056 n=5+5) FmtManyArgs 1.37µs ± 1% 1.36µs ± 1% ~ (p=0.730 n=5+5) GobDecode 14.6ms ± 4% 14.4ms ± 4% ~ (p=0.690 n=5+5) GobEncode 11.8ms ±20% 11.6ms ±15% ~ (p=1.000 n=5+5) Gzip 507ms ± 0% 491ms ± 0% -3.22% (p=0.008 n=5+5) Gunzip 73.8ms ± 0% 73.9ms ± 0% ~ (p=0.690 n=5+5) HTTPClientServer 116µs ± 0% 116µs ± 0% ~ (p=0.686 n=4+4) JSONEncode 21.8ms ± 1% 21.6ms ± 2% ~ (p=0.151 n=5+5) JSONDecode 104ms ± 1% 103ms ± 1% -1.08% (p=0.016 n=5+5) Mandelbrot200 9.53ms ± 0% 9.53ms ± 0% ~ (p=0.421 n=5+5) GoParse 7.55ms ± 1% 7.51ms ± 1% ~ (p=0.151 n=5+5) RegexpMatchEasy0_32 158ns ± 0% 158ns ± 0% ~ (all equal) RegexpMatchEasy0_1K 606ns ± 1% 608ns ± 3% ~ (p=0.937 n=5+5) RegexpMatchEasy1_32 143ns ± 0% 144ns ± 1% ~ (p=0.095 n=5+4) RegexpMatchEasy1_1K 927ns ± 2% 944ns ± 2% ~ (p=0.056 n=5+5) RegexpMatchMedium_32 16.0ns ± 0% 16.0ns ± 0% ~ (all equal) RegexpMatchMedium_1K 69.3µs ± 2% 69.7µs ± 0% ~ (p=0.690 n=5+5) RegexpMatchHard_32 3.73µs ± 0% 3.73µs ± 1% ~ (p=0.984 n=5+5) RegexpMatchHard_1K 111µs ± 1% 110µs ± 0% ~ (p=0.151 n=5+5) Revcomp 1.91s ±47% 1.77s ±68% ~ (p=1.000 n=5+5) Template 138ms ± 1% 138ms ± 1% ~ (p=1.000 n=5+5) TimeParse 787ns ± 2% 785ns ± 1% ~ (p=0.540 n=5+5) TimeFormat 729ns ± 1% 726ns ± 1% ~ (p=0.151 n=5+5) Updates #38740 Change-Id: I06c604874acdc1e63e66452dadee5df053045222 Reviewed-on: https://go-review.googlesource.com/c/go/+/233097 Reviewed-by: Keith Randall <khr@golang.org> Run-TryBot: Keith Randall <khr@golang.org>
2020-05-06 09:54:40 +00:00
s.CombJump(b, next, &nefJumps)
case ssa.BlockAMD64EQ, ssa.BlockAMD64NE,
ssa.BlockAMD64LT, ssa.BlockAMD64GE,
ssa.BlockAMD64LE, ssa.BlockAMD64GT,
ssa.BlockAMD64OS, ssa.BlockAMD64OC,
ssa.BlockAMD64ULT, ssa.BlockAMD64UGT,
ssa.BlockAMD64ULE, ssa.BlockAMD64UGE:
jmp := blockJump[b.Kind]
switch next {
case b.Succs[0].Block():
s.Br(jmp.invasm, b.Succs[1].Block())
case b.Succs[1].Block():
s.Br(jmp.asm, b.Succs[0].Block())
default:
if b.Likely != ssa.BranchUnlikely {
s.Br(jmp.asm, b.Succs[0].Block())
s.Br(obj.AJMP, b.Succs[1].Block())
} else {
s.Br(jmp.invasm, b.Succs[1].Block())
s.Br(obj.AJMP, b.Succs[0].Block())
}
}
cmd/compile: implement jump tables Performance is kind of hard to exactly quantify. One big difference between jump tables and the old binary search scheme is that there's only 1 branch statement instead of O(n) of them. That can be both a blessing and a curse, and can make evaluating jump tables very hard to do. The single branch can become a choke point for the hardware branch predictor. A branch table jump must fit all of its state in a single branch predictor entry (technically, a branch target predictor entry). With binary search that predictor state can be spread among lots of entries. In cases where the case selection is repetitive and thus predictable, binary search can perform better. The big win for a jump table is that it doesn't consume so much of the branch predictor's resources. But that benefit is essentially never observed in microbenchmarks, because the branch predictor can easily keep state for all the binary search branches in a microbenchmark. So that benefit is really hard to measure. So predictable switch microbenchmarks are ~useless - they will almost always favor the binary search scheme. Fully unpredictable switch microbenchmarks are better, as they aren't lying to us quite so much. In a perfectly unpredictable situation, a jump table will expect to incur 1-1/N branch mispredicts, where a binary search would incur lg(N)/2 of them. That makes the crossover point at about N=4. But of course switches in real programs are seldom fully unpredictable, so we'll use a higher crossover point. Beyond the branch predictor, jump tables tend to execute more instructions per switch but have no additional instructions per case, which also argues for a larger crossover. As far as code size goes, with this CL cmd/go has a slightly smaller code segment and a slightly larger overall size (from the jump tables themselves which live in the data segment). This is a case where some FDO (feedback-directed optimization) would be really nice to have. #28262 Some large-program benchmarks might help make the case for this CL. Especially if we can turn on branch mispredict counters so we can see how much using jump tables can free up branch prediction resources that can be gainfully used elsewhere in the program. name old time/op new time/op delta Switch8Predictable 1.89ns ± 2% 1.27ns ± 3% -32.58% (p=0.000 n=9+10) Switch8Unpredictable 9.33ns ± 1% 7.50ns ± 1% -19.60% (p=0.000 n=10+9) Switch32Predictable 2.20ns ± 2% 1.64ns ± 1% -25.39% (p=0.000 n=10+9) Switch32Unpredictable 10.0ns ± 2% 7.6ns ± 2% -24.04% (p=0.000 n=10+10) Fixes #5496 Update #34381 Change-Id: I3ff56011d02be53f605ca5fd3fb96b905517c34f Reviewed-on: https://go-review.googlesource.com/c/go/+/357330 Run-TryBot: Keith Randall <khr@golang.org> TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: Keith Randall <khr@google.com>
2021-10-04 12:17:46 -07:00
case ssa.BlockAMD64JUMPTABLE:
// JMP *(TABLE)(INDEX*8)
p := s.Prog(obj.AJMP)
p.To.Type = obj.TYPE_MEM
p.To.Reg = b.Controls[1].Reg()
p.To.Index = b.Controls[0].Reg()
p.To.Scale = 8
// Save jump tables for later resolution of the target blocks.
s.JumpTables = append(s.JumpTables, b)
default:
cmd/compile: allow multiple SSA block control values Control values are used to choose which successor of a block is jumped to. Typically a control value takes the form of a 'flags' value that represents the result of a comparison. Some architectures however use a variable in a register as a control value. Up until now we have managed with a single control value per block. However some architectures (e.g. s390x and riscv64) have combined compare-and-branch instructions that take two variables in registers as parameters. To generate these instructions we need to support 2 control values per block. This CL allows up to 2 control values to be used in a block in order to support the addition of compare-and-branch instructions. I have implemented s390x compare-and-branch instructions in a different CL. Passes toolstash-check -all. Results of compilebench: name old time/op new time/op delta Template 208ms ± 1% 209ms ± 1% ~ (p=0.289 n=20+20) Unicode 83.7ms ± 1% 83.3ms ± 3% -0.49% (p=0.017 n=18+18) GoTypes 748ms ± 1% 748ms ± 0% ~ (p=0.460 n=20+18) Compiler 3.47s ± 1% 3.48s ± 1% ~ (p=0.070 n=19+18) SSA 11.5s ± 1% 11.7s ± 1% +1.64% (p=0.000 n=19+18) Flate 130ms ± 1% 130ms ± 1% ~ (p=0.588 n=19+20) GoParser 160ms ± 1% 161ms ± 1% ~ (p=0.211 n=20+20) Reflect 465ms ± 1% 467ms ± 1% +0.42% (p=0.007 n=20+20) Tar 184ms ± 1% 185ms ± 2% ~ (p=0.087 n=18+20) XML 253ms ± 1% 253ms ± 1% ~ (p=0.377 n=20+18) LinkCompiler 769ms ± 2% 774ms ± 2% ~ (p=0.070 n=19+19) ExternalLinkCompiler 3.59s ±11% 3.68s ± 6% ~ (p=0.072 n=20+20) LinkWithoutDebugCompiler 446ms ± 5% 454ms ± 3% +1.79% (p=0.002 n=19+20) StdCmd 26.0s ± 2% 26.0s ± 2% ~ (p=0.799 n=20+20) name old user-time/op new user-time/op delta Template 238ms ± 5% 240ms ± 5% ~ (p=0.142 n=20+20) Unicode 105ms ±11% 106ms ±10% ~ (p=0.512 n=20+20) GoTypes 876ms ± 2% 873ms ± 4% ~ (p=0.647 n=20+19) Compiler 4.17s ± 2% 4.19s ± 1% ~ (p=0.093 n=20+18) SSA 13.9s ± 1% 14.1s ± 1% +1.45% (p=0.000 n=18+18) Flate 145ms ±13% 146ms ± 5% ~ (p=0.851 n=20+18) GoParser 185ms ± 5% 188ms ± 7% ~ (p=0.174 n=20+20) Reflect 534ms ± 3% 538ms ± 2% ~ (p=0.105 n=20+18) Tar 215ms ± 4% 211ms ± 9% ~ (p=0.079 n=19+20) XML 295ms ± 6% 295ms ± 5% ~ (p=0.968 n=20+20) LinkCompiler 832ms ± 4% 837ms ± 7% ~ (p=0.707 n=17+20) ExternalLinkCompiler 1.58s ± 8% 1.60s ± 4% ~ (p=0.296 n=20+19) LinkWithoutDebugCompiler 478ms ±12% 489ms ±10% ~ (p=0.429 n=20+20) name old object-bytes new object-bytes delta Template 559kB ± 0% 559kB ± 0% ~ (all equal) Unicode 216kB ± 0% 216kB ± 0% ~ (all equal) GoTypes 2.03MB ± 0% 2.03MB ± 0% ~ (all equal) Compiler 8.07MB ± 0% 8.07MB ± 0% -0.06% (p=0.000 n=20+20) SSA 27.1MB ± 0% 27.3MB ± 0% +0.89% (p=0.000 n=20+20) Flate 343kB ± 0% 343kB ± 0% ~ (all equal) GoParser 441kB ± 0% 441kB ± 0% ~ (all equal) Reflect 1.36MB ± 0% 1.36MB ± 0% ~ (all equal) Tar 487kB ± 0% 487kB ± 0% ~ (all equal) XML 632kB ± 0% 632kB ± 0% ~ (all equal) name old export-bytes new export-bytes delta Template 18.5kB ± 0% 18.5kB ± 0% ~ (all equal) Unicode 7.92kB ± 0% 7.92kB ± 0% ~ (all equal) GoTypes 35.0kB ± 0% 35.0kB ± 0% ~ (all equal) Compiler 109kB ± 0% 110kB ± 0% +0.72% (p=0.000 n=20+20) SSA 137kB ± 0% 138kB ± 0% +0.58% (p=0.000 n=20+20) Flate 4.89kB ± 0% 4.89kB ± 0% ~ (all equal) GoParser 8.49kB ± 0% 8.49kB ± 0% ~ (all equal) Reflect 11.4kB ± 0% 11.4kB ± 0% ~ (all equal) Tar 10.5kB ± 0% 10.5kB ± 0% ~ (all equal) XML 16.7kB ± 0% 16.7kB ± 0% ~ (all equal) name old text-bytes new text-bytes delta HelloSize 761kB ± 0% 761kB ± 0% ~ (all equal) CmdGoSize 10.8MB ± 0% 10.8MB ± 0% ~ (all equal) name old data-bytes new data-bytes delta HelloSize 10.7kB ± 0% 10.7kB ± 0% ~ (all equal) CmdGoSize 312kB ± 0% 312kB ± 0% ~ (all equal) name old bss-bytes new bss-bytes delta HelloSize 122kB ± 0% 122kB ± 0% ~ (all equal) CmdGoSize 146kB ± 0% 146kB ± 0% ~ (all equal) name old exe-bytes new exe-bytes delta HelloSize 1.13MB ± 0% 1.13MB ± 0% ~ (all equal) CmdGoSize 15.1MB ± 0% 15.1MB ± 0% ~ (all equal) Change-Id: I3cc2f9829a109543d9a68be4a21775d2d3e9801f Reviewed-on: https://go-review.googlesource.com/c/go/+/196557 Run-TryBot: Michael Munday <mike.munday@ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Daniel Martí <mvdan@mvdan.cc> Reviewed-by: Keith Randall <khr@golang.org>
2019-08-12 20:19:58 +01:00
b.Fatalf("branch not implemented: %s", b.LongString())
}
}
func loadRegResult(s *ssagen.State, f *ssa.Func, t *types.Type, reg int16, n *ir.Name, off int64) *obj.Prog {
p := s.Prog(loadByRegWidth(reg, t.Size()))
p.From.Type = obj.TYPE_MEM
p.From.Name = obj.NAME_AUTO
p.From.Sym = n.Linksym()
p.From.Offset = n.FrameOffset() + off
p.To.Type = obj.TYPE_REG
p.To.Reg = reg
return p
}
func spillArgReg(pp *objw.Progs, p *obj.Prog, f *ssa.Func, t *types.Type, reg int16, n *ir.Name, off int64) *obj.Prog {
p = pp.Append(p, storeByRegWidth(reg, t.Size()), obj.TYPE_REG, reg, 0, obj.TYPE_MEM, 0, n.FrameOffset()+off)
p.To.Name = obj.NAME_PARAM
p.To.Sym = n.Linksym()
cmd/compile: spos handling fixes to improve prolog debuggability With the new register ABI, the compiler sometimes introduces spills of argument registers in function prologs; depending on the positions assigned to these spills and whether they have the IsStmt flag set, this can degrade the debugging experience. For example, in this function from one of the Delve regression tests: L13: func foo((eface interface{}) { L14: if eface != nil { L15: n++ L16: } L17 } we wind up with a prolog containing two spill instructions, the first with line 14, the second with line 13. The end result for the user is that if you set a breakpoint in foo and run to it, then do "step", execution will initially stop at L14, then jump "backwards" to L13. The root of the problem in this case is that an ArgIntReg pseudo-op is introduced during expand calls, then promoted (due to lowering) to a first-class statement (IsStmt flag set), which in turn causes downstream handling to propagate its position to the first of the register spills in the prolog. To help improve things, this patch changes the rewriter to avoid moving an "IsStmt" flag from a deleted/replaced instruction to an Arg{Int,Float}Reg value, and adds Arg{Int,Float}Reg to the list of opcodes not suitable for selection as statement boundaries, and suppresses generation of additional register spills in defframe() when optimization is disabled (since in that case things will get spilled in any case). This is not a comprehensive/complete fix; there are still cases where we get less-than-ideal source position markers (ex: issue 45680). Updates #40724. Change-Id: Ica8bba4940b2291bef6b5d95ff0cfd84412a2d40 Reviewed-on: https://go-review.googlesource.com/c/go/+/312989 Trust: Than McIntosh <thanm@google.com> Run-TryBot: Than McIntosh <thanm@google.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: Cherry Zhang <cherryyz@google.com>
2021-04-21 16:21:30 -04:00
p.Pos = p.Pos.WithNotStmt()
return p
}
// zero 16 bytes at reg+off.
func zero16(s *ssagen.State, reg int16, off int64) {
// MOVUPS X15, off(ptrReg)
p := s.Prog(x86.AMOVUPS)
p.From.Type = obj.TYPE_REG
p.From.Reg = x86.REG_X15
p.To.Type = obj.TYPE_MEM
p.To.Reg = reg
p.To.Offset = off
}
[dev.simd] all: merge master (7a1679d) into dev.simd Conflicts: - src/cmd/compile/internal/amd64/ssa.go - src/cmd/compile/internal/ssa/rewriteAMD64.go - src/internal/buildcfg/exp.go - src/internal/cpu/cpu.go - src/internal/cpu/cpu_x86.go - src/internal/goexperiment/flags.go Merge List: + 2025-08-04 7a1679d7ae cmd/compile: move s390x over to new bounds check strategy + 2025-08-04 95693816a5 cmd/compile: move riscv64 over to new bounds check strategy + 2025-08-04 d7bd7773eb go/parser: remove safePos + 2025-08-04 4b6cbc377f cmd/cgo/internal/test: use (syntactic) constant for C array bound + 2025-08-03 b2960e3580 cmd/internal/obj/loong64: add {V,XV}{BITCLR/BITSET/BITREV}[I].{B/H/W/D} instructions support + 2025-08-03 abeeef1c08 cmd/compile/internal/test: fix typo in comments + 2025-08-03 d44749b65b cmd/internal/obj/loong64: add [X]VLDREPL.{B/H/W/D} instructions support + 2025-08-03 d6beda863e runtime: add reference to debugPinnerV1 + 2025-08-01 4ab1aec007 cmd/go: modload should use a read-write lock to improve concurrency + 2025-08-01 e666972a67 runtime: deduplicate Windows stdcall + 2025-08-01 ef40549786 runtime,syscall: move loadlibrary and getprocaddress to syscall + 2025-08-01 336931a4ca cmd/go: use os.Rename to move files on Windows + 2025-08-01 eef5f8d930 cmd/compile: enforce that locals are always accessed with SP base register + 2025-08-01 e071617222 cmd/compile: optimize multiplication rules on loong64 + 2025-07-31 eb7f515c4d cmd/compile: use generated loops instead of DUFFZERO on amd64 + 2025-07-31 c0ee2fd4e3 cmd/go: explicitly reject module paths "go" and "toolchain" + 2025-07-30 a4d99770c0 runtime/metrics: add cleanup and finalizer queue metrics + 2025-07-30 70a2ff7648 runtime: add cgo call benchmark + 2025-07-30 69338a335a cmd/go/internal/gover: fix ModIsPrerelease for toolchain versions + 2025-07-30 cedf63616a cmd/compile: add floating point min/max intrinsics on s390x + 2025-07-30 82a1921c3b all: remove redundant Swiss prefixes + 2025-07-30 2ae059ccaf all: remove GOEXPERIMENT=swissmap + 2025-07-30 cc571dab91 cmd/compile: deduplicate instructions when rewrite func results + 2025-07-30 2174a7936c crypto/tls: use standard chacha20-poly1305 cipher suite names + 2025-07-30 8330fb48a6 cmd/compile: move mips32 over to new bounds check strategy + 2025-07-30 9f9d7b50e8 cmd/compile: move mips64 over to new bounds check strategy + 2025-07-30 5216fd570e cmd/compile: move loong64 over to new bounds check strategy + 2025-07-30 89a0af86b8 cmd/compile: allow ops to specify clobbering input registers + 2025-07-30 5e94d72158 cmd/compile: simplify zerorange on arm64 + 2025-07-30 8cd85e602a cmd/compile: check domination of loop return in both controls + 2025-07-30 cefaed0de0 reflect: fix noswiss builder + 2025-07-30 3aa1b00081 regexp: fix compiling alternate patterns of different fold case literals + 2025-07-30 b1e933d955 cmd/compile: avoid extending when already sufficiently masked on loong64 + 2025-07-29 880ca333d7 cmd/compile: removing log2uint32 function + 2025-07-29 1513661dc3 cmd/compile: simplify logX implementations + 2025-07-29 bd94ae8903 cmd/compile: use unsigned power-of-two detector for unsigned mod + 2025-07-29 f3582fc80e cmd/compile: add unsigned power-of-two detector + 2025-07-29 f7d167fe71 internal/abi: move direct/indirect flag from Kind to TFlag + 2025-07-29 e0b07dc22e os/exec: fix incorrect expansion of "", "." and ".." in LookPath + 2025-07-29 25816d401c internal/goexperiment: delete RangeFunc goexperiment + 2025-07-29 7961bf71f8 internal/goexperiment: delete CacheProg goexperiment + 2025-07-29 e15a14c4dd sync: remove synchashtriemap GOEXPERIMENT + 2025-07-29 7dccd6395c cmd/compile: move arm32 over to new bounds check strategy + 2025-07-29 d79405a344 runtime: only deduct assist credit for arenas during GC + 2025-07-29 19a086f716 cmd/go/internal/telemetrystats: count goexperiments + 2025-07-29 aa95ab8215 image: fix formatting of godoc link + 2025-07-29 4c854b7a3e crypto/elliptic: change a variable name that have the same name as keywords + 2025-07-28 b10eb1d042 cmd/compile: simplify zerorange on amd64 + 2025-07-28 f8eae7a3c3 os/user: fix tests to pass on non-english Windows + 2025-07-28 0984264471 internal/poll: remove msg field from Windows' poll.operation + 2025-07-28 d7b4114346 internal/poll: remove rsan field from Windows' poll.operation + 2025-07-28 361b1ab41f internal/poll: remove sa field from Windows' poll.operation + 2025-07-28 9b6bd64e46 internal/poll: remove qty and flags fields from Windows' poll.operation + 2025-07-28 cd3655a824 internal/runtime/maps: fix spelling errors in comments + 2025-07-28 d5dc36af45 runtime: remove openbsd/mips64 related code + 2025-07-28 64ba72474d errors: omit redundant nil check in type assertion for Join + 2025-07-28 e151db3e06 all: omit unnecessary type conversions + 2025-07-28 4569255f8c cmd/compile: cleanup SelectN rules by indexing into args + 2025-07-28 94645d2413 cmd/compile: rewrite cmov(x, x, cond) into x + 2025-07-28 10c5cf68d4 net/http: add proper panic message + 2025-07-28 46b5839231 test/codegen: fix failing condmove wasm tests + 2025-07-28 98f301cf68 runtime,syscall: move SyscallX implementations from runtime to syscall + 2025-07-28 c7ed3a1c5a internal/runtime/syscall/windows: factor out code from runtime + 2025-07-28 e81eac19d3 hash/crc32: fix incorrect checksums with avx512+race + 2025-07-25 6fbad4be75 cmd/compile: remove no-longer-necessary call to calculateDepths + 2025-07-25 5045fdd8ff cmd/compile: fix containsUnavoidableCall computation + 2025-07-25 d28b27cd8e go/types, types2: use nil to represent incomplete explicit aliases + 2025-07-25 7b53d8d06e cmd/compile/internal/types2: add loaded state between loader calls and constraint expansion + 2025-07-25 374e3be2eb os/user: user random name for the test user account + 2025-07-25 1aa154621d runtime: rename scanobject to scanObject + 2025-07-25 41b429881a runtime: duplicate scanobject in greentea and non-greentea files + 2025-07-25 aeb256e98a cmd/compile: remove unused arg from gorecover + 2025-07-25 08376e1a9c runtime: iterate through inlinings when processing recover() + 2025-07-25 c76c3abc54 encoding/json: fix truncated Token error regression in goexperiment.jsonv2 + 2025-07-25 ebdbfccd98 encoding/json/jsontext: preserve buffer capacity in Encoder.Reset + 2025-07-25 91c4f0ccd5 reflect: avoid a bounds check in stack-constrained code + 2025-07-24 3636ced112 encoding/json: fix extra data regression under goexperiment.jsonv2 + 2025-07-24 a6eec8bdc7 encoding/json: reduce error text regressions under goexperiment.jsonv2 + 2025-07-24 0fa88dec1e time: remove redundant uint32 conversion in split + 2025-07-24 ada30b8248 internal/buildcfg: add ability to get GORISCV64 variable in GOGOARCH + 2025-07-24 6f6c6c5782 cmd/internal/obj: rip out argp adjustment for wrapper frames + 2025-07-24 7b50024330 runtime: detect successful recovers differently + 2025-07-24 7b9de668bd unicode/utf8: skip ahead during ascii runs in Valid/ValidString + 2025-07-24 076eae436e cmd/compile: move amd64 and 386 over to new bounds check strategy + 2025-07-24 f703dc5bef cmd/compile: add missing StringLen rule in prove + 2025-07-24 394d0bee8d cmd/compile: move arm64 over to new bounds check strategy + 2025-07-24 3024785b92 cmd/compile,runtime: remember idx+len for bounds check failure with less code + 2025-07-24 741a19ab41 runtime: move bounds check constants to internal/abi + 2025-07-24 ce05ad448f cmd/compile: rewrite condselects into doublings and halvings + 2025-07-24 fcd28070fe cmd/compile: add opt branchelim to rewrite some CondSelect into math + 2025-07-24 f32cf8e4b0 cmd/compile: learn transitive proofs for safe unsigned subs + 2025-07-24 d574856482 cmd/compile: learn transitive proofs for safe negative signed adds + 2025-07-24 1a72920f09 cmd/compile: learn transitive proofs for safe positive signed adds + 2025-07-24 e5f202bb60 cmd/compile: learn transitive proofs for safe unsigned adds + 2025-07-24 bd80f74bc1 cmd/compile: fold shift through AND for slice operations + 2025-07-24 5c45fe1385 internal/runtime/syscall: rename to internal/runtime/syscall/linux + 2025-07-24 592c2db868 cmd/compile: improve loopRotate to handle nested loops + 2025-07-24 dcb479c2f9 cmd/compile: optimize slice bounds checking with SUB/SUBconst comparisons + 2025-07-24 f11599b0b9 internal/poll: remove handle field from Windows' poll.operation + 2025-07-24 f7432e0230 internal/poll: remove fd field from Windows' poll.operation + 2025-07-24 e84ed38641 runtime: add benchmark for small-size memmory operation + 2025-07-24 18dbe5b941 hash/crc32: add AVX512 IEEE CRC32 calculation + 2025-07-24 c641900f72 cmd/compile: prefer base.Fatalf to panic in dwarfgen + 2025-07-24 d71d8aeafd cmd/internal/obj/s390x: add MVCLE instruction + 2025-07-24 b6cf1d94dc runtime: optimize memclr on mips64x + 2025-07-24 a8edd99479 runtime: improvement in memclr for s390x + 2025-07-24 bd04f65511 internal/runtime/exithook: fix a typo + 2025-07-24 5c8624a396 cmd/internal/goobj: make error output clear + 2025-07-24 44d73dfb4e cmd/go/internal/doc: clean up after merge with cmd/internal/doc + 2025-07-24 bd446662dd cmd/internal/doc: merge with cmd/go/internal/doc + 2025-07-24 da8b50c830 cmd/doc: delete + 2025-07-24 6669aa3b14 runtime: randomize heap base address + 2025-07-24 26338a7f69 cmd/compile: use better fatal message for staticValue1 + 2025-07-24 8587ba272e cmd/cgo: compare malloc return value to NULL instead of literal 0 + 2025-07-24 cae45167b7 go/types, types2: better error messages for certain type mismatches + 2025-07-24 2ddf542e4c cmd/compile: use ,ok return idiom for sparsemap.get + 2025-07-24 6505fcbd0a cmd/compile: use generics for sparse map + 2025-07-24 14f5eb7812 cmd/api: rerun updategolden + 2025-07-24 52b6d7f67a runtime: drop NetBSD kernel bug sysmon workaround fixed in NetBSD 9.2 + 2025-07-24 1ebebf1cc1 cmd/go: clean should respect workspaces + 2025-07-24 6536a93547 encoding/json/jsontext: preserve buffer capacity in Decoder.Reset + 2025-07-24 efc37e97c0 cmd/go: always return the cached path from go tool -n + 2025-07-23 98a031193b runtime: check TestUsingVDSO ExitError type assertion + 2025-07-23 6bb42997c8 doc/next: initialize + 2025-07-23 2696a11a97 internal/goversion: update Version to 1.26 + 2025-07-23 489868f776 cmd/link: scope test to linux & net.sendFile + 2025-07-22 71c2bf5513 cmd/compile: fix loclist for heap return vars without optimizations + 2025-07-22 c74399e7f5 net: correct comment for ListenConfig.ListenPacket + 2025-07-22 4ed9943b26 all: go fmt + 2025-07-22 1aaf7422f1 cmd/internal/objabi: remove redundant word in comment + 2025-07-21 d5ec0815e6 runtime: relax TestMemoryLimitNoGCPercent a bit + 2025-07-21 f7cc61e7d7 cmd/compile: for arm64 epilog, do SP increment with a single instruction + 2025-07-21 5dac42363b runtime: fix asan wrapper for riscv64 + 2025-07-21 e5502e0959 cmd/go: check subcommand properties + 2025-07-19 2363897932 cmd/internal/obj: enable got pcrel itype in fips140 for riscv64 + 2025-07-19 e32255fcc0 cmd/compile/internal/ssa: restrict architectures for TestDebugLines_74576 + 2025-07-18 0451816430 os: revert the use of AddCleanup to close files and roots + 2025-07-18 34b70684ba go/types: infer correct type for y in append(bytes, y...) + 2025-07-17 66536242fc cmd/compile/internal/escape: improve DWARF .debug_line numbering for literal rewriting optimizations + 2025-07-16 385000b004 runtime: fix idle time double-counting bug + 2025-07-16 f506ad2644 cmd/compile/internal/escape: speed up analyzing some functions with many closures + 2025-07-16 9c507e7942 cmd/link, runtime: on Wasm, put only function index in method table and func table + 2025-07-16 9782dcfd16 runtime: use 32-bit function index on Wasm + 2025-07-16 c876bf9346 cmd/internal/obj/wasm: use 64-bit instructions for indirect calls + 2025-07-15 b4309ece66 cmd/internal/doc: upgrade godoc pkgsite to 01b046e + 2025-07-15 75a19dbcd7 runtime: use memclrNoHeapPointers to clear inline mark bits + 2025-07-15 6d4a91c7a5 runtime: only clear inline mark bits on span alloc if necessary + 2025-07-15 0c6296ab12 runtime: have mergeInlineMarkBits also clear the inline mark bits + 2025-07-15 397d2117ec runtime: merge inline mark bits with gcmarkBits 8 bytes at a time + 2025-07-15 7dceabd3be runtime/maps: fix typo in group.go comment (instrinsified -> intrinsified) + 2025-07-15 d826bf4d74 os: remove useless error check + 2025-07-14 bb07e55aff runtime: expand GOMAXPROCS documentation + 2025-07-14 9159cd4ec6 encoding/json: decompose legacy options + 2025-07-14 c6556b8eb3 encoding/json/v2: add security section to doc + 2025-07-11 6ebb5f56d9 runtime: gofmt after CL 643897 and CL 662455 + 2025-07-11 1e48ca7020 encoding/json: remove legacy option to EscapeInvalidUTF8 + 2025-07-11 a0a99cb22b encoding/json/v2: report wrapped io.ErrUnexpectedEOF + 2025-07-11 9d04122d24 crypto/rsa: drop contradictory promise to keep PublicKey modulus secret + 2025-07-11 1ca23682dd crypto/rsa: fix documentation formatting + 2025-07-11 4bc3373c8e runtime: turn off large memmove tests under asan/msan Change-Id: I1e32d964eba770b85421efb86b305a2242f24466
2025-08-04 15:07:05 -04:00
// move 16 bytes from src+off to dst+off using temporary register tmp.
func move16(s *ssagen.State, src, dst, tmp int16, off int64) {
// MOVUPS off(srcReg), tmpReg
// MOVUPS tmpReg, off(dstReg)
p := s.Prog(x86.AMOVUPS)
p.From.Type = obj.TYPE_MEM
p.From.Reg = src
p.From.Offset = off
p.To.Type = obj.TYPE_REG
p.To.Reg = tmp
p = s.Prog(x86.AMOVUPS)
p.From.Type = obj.TYPE_REG
p.From.Reg = tmp
p.To.Type = obj.TYPE_MEM
p.To.Reg = dst
p.To.Offset = off
}
[dev.simd] all: merge master (924fe98) into dev.simd Conflicts: - src/cmd/compile/internal/amd64/ssa.go - src/cmd/compile/internal/ssa/expand_calls.go - src/cmd/compile/internal/ssagen/ssa.go - src/internal/buildcfg/exp.go - src/internal/cpu/cpu.go - src/internal/cpu/cpu_x86.go - src/runtime/mkpreempt.go - src/runtime/preempt_amd64.go - src/runtime/preempt_amd64.s Merge List: + 2025-08-14 924fe98902 cmd/internal/obj/riscv: add encoding for compressed riscv64 instructions + 2025-08-13 320df537cc cmd/compile: emit classify instructions for infinity tests on riscv64 + 2025-08-13 ca66f907dd cmd/compile: use generated loops instead of DUFFCOPY on amd64 + 2025-08-13 4b1800e476 encoding/json/v2: cleanup error constructors + 2025-08-13 af8870708b encoding/json/v2: fix incorrect marshaling of NaN in float64 any + 2025-08-13 0a75e5a07b encoding/json/v2: fix wrong type with cyclic marshal error in map[string]any + 2025-08-13 de9b6f9875 cmd/pprof: update vendored github.com/google/pprof + 2025-08-13 674c5f0edd os/exec: fix incorrect expansion of ".." in LookPath on plan9 + 2025-08-13 9bbea0f21a cmd/compile: during regalloc, fixedreg values are always available + 2025-08-13 08eef97500 runtime/trace: fix documentation typo + 2025-08-13 2fe5d51d04 internal/trace: fix wrong scope for Event.Range or EvGCSweepActive + 2025-08-13 9fcb87c352 cmd/compile: teach prove about len's & cap's max based on the element size + 2025-08-13 9763ece873 cmd/compile: absorb NEGV into branch on loong64 + 2025-08-13 f10a82b76f all: update vendored dependencies [generated] + 2025-08-13 3bea95b277 cmd/link/internal/ld: remove OpenBSD buildid workaround + 2025-08-12 90b7d7aaa2 cmd/compile/internal: optimize multiplication use new operation 'ADDshiftLLV' on loong64 + 2025-08-12 1b263fc604 runtime/race: restore previous version of LLVM TSAN on macOS + 2025-08-12 b266318cf7 cmd/compile/internal/ssa: use BEQ/BNE to optimize the combination of XOR and EQ/NE on loong64 + 2025-08-12 adbf59525c internal/runtime/gc/scan: avoid -1 index when cache sizes unavailable + 2025-08-12 4e182db5fc Revert "cmd/compile: use generated loops instead of DUFFCOPY on amd64" + 2025-08-12 d2b3c1a504 internal/trace: clarify which StateTransition events have stacks + 2025-08-12 f63e12d0e0 internal/trace: fix Sync.ClockSnapshot comment + 2025-08-12 8e317da77d internal/trace: remove unused StateTransition.id field + 2025-08-12 f67d8ff34a internal/trace/tracev2: adjust comment for consistency + 2025-08-12 fe4d445c36 internal/trace/tracev2: fix EvSTWBegin comment to include stack ID + 2025-08-12 750789fab7 internal/trace/internal/testgen: fix missing stacks nframes arg + 2025-08-12 889ab74169 internal/runtime/gc/scan: import scan kernel from gclab [green tea] + 2025-08-12 182336bf05 net/http: fix data race in client + 2025-08-12 f04421ea9a cmd/compile: soften test for 74788 + 2025-08-12 28aa529c99 cmd/compile: use generated loops instead of DUFFZERO on arm64 + 2025-08-12 ec9e1176c3 cmd/compile: use generated loops instead of DUFFCOPY on amd64 + 2025-08-12 d0a64f7969 Revert "cmd/compile/internal/ssa: Use transitive properties for len/cap" + 2025-08-12 00a7bdcb55 all: delete aliastypeparams GOEXPERIMENT + 2025-08-11 74421a305b Revert "cmd/compile: allow multi-field structs to be stored directly in interfaces" + 2025-08-11 c31359138c Revert "cmd/compile: allow StructSelect [x] of interface data fields for x>0" + 2025-08-11 7248995b60 Revert "cmd/compile: allow more args in StructMake folding rule" + 2025-08-11 caf9fc3ccd Revert "reflect: handle zero-sized fields of directly-stored structures correctly" + 2025-08-11 ce3f3e2ae7 cmd/link/internal/ld, internal/syscall/unix: use posix_fallocate on netbsd + 2025-08-11 3dbef65bf3 database/sql: allow drivers to override Scan behavior + 2025-08-11 2b804abf07 net: context aware Dialer.Dial functions + 2025-08-11 6abfe7b0de cmd/dist: require Go 1.24.6 as minimum bootstrap toolchain + 2025-08-11 691af6ca28 encoding/json: fix Indent trailing whitespace regression in goexperiment.jsonv2 + 2025-08-11 925149da20 net/http: add example for CrossOriginProtection + 2025-08-11 cf4af0b2f3 encoding/json/v2: fix UnmarshalDecode regression with EOF + 2025-08-11 b096ddb9ea internal/runtime/maps: loop invariant code motion with h2(hash) by hand + 2025-08-11 a2431776eb net, os, file/filepath, syscall: use slices.Equal in tests + 2025-08-11 a7f05b38f7 cmd/compile: convert branch with zero to more optimal branch zero on loong64 + 2025-08-11 1718828c81 internal/sync: warn about incorrect unsafe usage in HashTrieMap + 2025-08-11 084c0f8494 cmd/compile: allow InlMark operations to be speculatively executed + 2025-08-10 a62f72f7a7 cmd/compile/internal/ssa: optimise more branches with SGTconst/SGTUconst on loong64 + 2025-08-08 fbac94a799 internal/sync: rename Store parameter from old to new + 2025-08-08 317be4cfeb cmd/compile/internal/staticinit: remove deadcode + 2025-08-08 bce5601cbb cmd/go: fix fips doc link + 2025-08-08 777d76c4f2 text/template: use sync.OnceValue for builtinFuncs + 2025-08-08 0201524c52 math: remove redundant infinity tests + 2025-08-08 dcc77f9e3c cmd/go: fix get -tool when multiple packages are provided + 2025-08-08 c7b85e9ddc all: update blog link + 2025-08-08 a8dd771e13 crypto/tls: check if quic conn can send session ticket + 2025-08-08 bdb2d50fdf net: fix WriteMsgUDPAddrPort addr handling on IPv4 sockets + 2025-08-08 768c51e368 internal/runtime/maps: remove unused var bitsetDeleted + 2025-08-08 b3388569a1 reflect: handle zero-sized fields of directly-stored structures correctly + 2025-08-08 d83b16fcb8 internal/bytealg: vector implementation of compare for riscv64 + 2025-08-07 dd3abf6bc5 internal/bytealg: optimize Index/IndexString on loong64 + 2025-08-07 73ff6d1480 cmd/internal/obj/loong64: change the immediate range of ALSL{W/WU/V} + 2025-08-07 f3606b0825 cmd/compile/internal/ssa: fix typo in LOONG64Ops.go comment + 2025-08-07 ee7bb8969a cmd/internal/obj/loong64: add support for FSEL instruction + 2025-08-07 1f7ffca171 time: skip TestLongAdjustTimers on plan9 (too slow) + 2025-08-06 8282b72d62 runtime/race: update darwin race syso + 2025-08-06 dc54d7b607 all: remove support for windows/arm + 2025-08-06 e0a1ea431c cmd/compile: make panicBounds stack frame smaller on ppc64 + 2025-08-06 2747f925dd debug/macho: support reading imported symbols without LC_DYSYMTAB + 2025-08-06 025d36917c cmd/internal/testdir: pass -buildid to link command + 2025-08-06 f53dcb6280 cmd/internal/testdir: unify link command + 2025-08-06 a3895fe9f1 database/sql: avoid closing Rows while scan is in progress + 2025-08-06 608e9fac90 go/types, types2: flip on position tracing + 2025-08-06 72e8237cc1 cmd/compile: allow more args in StructMake folding rule + 2025-08-06 3406a617d9 internal/bytealg: vector implementation of indexbyte for riscv64 + 2025-08-06 75ea2d05c0 internal/bytealg: vector implementation of equal for riscv64 + 2025-08-05 17a8be7117 crypto/sha512: use const table for key loading on loong64 + 2025-08-05 dda9d780e2 crypto/sha256: use const table for key loading on loong64 + 2025-08-05 5defe8ebb3 internal/chacha8rand: replace WORD with instruction VMOVQ + 2025-08-05 4c7362e41c cmd/internal/obj/loong64: add new instructions ALSL{W/WU/V} for loong64 + 2025-08-05 a552737418 cmd/compile: fold negation into multiplication on loong64 + 2025-08-05 e1fd4faf91 runtime: fix godoc comment for inVDSOPage + 2025-08-05 bcd25c79aa cmd/compile: allow StructSelect [x] of interface data fields for x>0 + 2025-08-05 b0945a54b5 cmd/dist, internal/platform: mark freebsd/riscv64 broken + 2025-08-05 55d961b202 runtime: save AVX2 and AVX-512 state on asynchronous preemption + 2025-08-05 af0c4fe2ca runtime: save scalar registers off stack in amd64 async preemption + 2025-08-05 e73afaae69 internal/cpu: add AVX-512-CD and DQ, and derived "basic AVX-512" + 2025-08-05 cef381ba60 runtime: eliminate global state in mkpreempt.go + 2025-08-05 c0025d5e0b go/parser: correct comment in expectedErrors + 2025-08-05 4ee0df8c46 cmd: remove dead code + 2025-08-05 a2c45f0eb1 runtime: test VDSO symbol hash values + 2025-08-05 cd55f86b8d cmd/compile: allow multi-field structs to be stored directly in interfaces + 2025-08-05 21ab0128b6 cmd/compile: remove support for old-style bounds check calls + 2025-08-05 802d056c78 cmd/compile: move ppc64 over to new bounds check strategy + 2025-08-05 a3295df873 cmd/compile/internal/ssa: Use transitive properties for len/cap + 2025-08-05 bd082857a5 doc: fix typo in go memory model doc + 2025-08-05 2b622b05a9 cmd/compile: remove isUintXPowerOfTwo functions + 2025-08-05 72147ffa75 cmd/compile: simplify isUintXPowerOfTwo implementation + 2025-08-05 26da1199eb cmd/compile: make isUint{32,64}PowerOfTwo implementations clearer + 2025-08-05 5ab9f23977 cmd/compile, runtime: add checkptr instrumentation for unsafe.Add + 2025-08-05 fcc036f03b cmd/compile: optimise float <-> int register moves on riscv64 Change-Id: Ie94f29d9b0cc14a52a536866f5abaef27b5c52d7
2025-08-14 11:43:15 -04:00
// XXX maybe make this part of v.Reg?
// On the other hand, it is architecture-specific.
func simdReg(v *ssa.Value) int16 {
t := v.Type
if !t.IsSIMD() {
base.Fatalf("simdReg: not a simd type; v=%s, b=b%d, f=%s", v.LongString(), v.Block.ID, v.Block.Func.Name)
}
return simdRegBySize(v.Reg(), t.Size())
}
func simdRegBySize(reg int16, size int64) int16 {
switch size {
case 16:
return reg
case 32:
return reg + (x86.REG_Y0 - x86.REG_X0)
case 64:
return reg + (x86.REG_Z0 - x86.REG_X0)
}
panic("simdRegBySize: bad size")
}
// XXX k mask
func maskReg(v *ssa.Value) int16 {
t := v.Type
if !t.IsSIMD() {
base.Fatalf("maskReg: not a simd type; v=%s, b=b%d, f=%s", v.LongString(), v.Block.ID, v.Block.Func.Name)
}
switch t.Size() {
case 8:
return v.Reg()
}
panic("unreachable")
}
// XXX k mask + vec
func simdOrMaskReg(v *ssa.Value) int16 {
t := v.Type
if t.Size() <= 8 {
return maskReg(v)
}
return simdReg(v)
}
// XXX this is used for shift operations only.
// regalloc will issue OpCopy with incorrect type, but the assigned
// register should be correct, and this function is merely checking
// the sanity of this part.
func simdCheckRegOnly(v *ssa.Value, regStart, regEnd int16) int16 {
if v.Reg() > regEnd || v.Reg() < regStart {
panic("simdCheckRegOnly: not the desired register")
}
return v.Reg()
}