go/src/cmd/compile/internal/amd64/ssa.go

1081 lines
31 KiB
Go
Raw Normal View History

// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package amd64
import (
"fmt"
"math"
"cmd/compile/internal/gc"
"cmd/compile/internal/ssa"
cmd/compile: change ssa.Type into *types.Type When package ssa was created, Type was in package gc. To avoid circular dependencies, we used an interface (ssa.Type) to represent type information in SSA. In the Go 1.9 cycle, gri extricated the Type type from package gc. As a result, we can now use it in package ssa. Now, instead of package types depending on package ssa, it is the other way. This is a more sensible dependency tree, and helps compiler performance a bit. Though this is a big CL, most of the changes are mechanical and uninteresting. Interesting bits: * Add new singleton globals to package types for the special SSA types Memory, Void, Invalid, Flags, and Int128. * Add two new Types, TSSA for the special types, and TTUPLE, for SSA tuple types. ssa.MakeTuple is now types.NewTuple. * Move type comparison result constants CMPlt, CMPeq, and CMPgt to package types. * We had picked the name "types" in our rules for the handy list of types provided by ssa.Config. That conflicted with the types package name, so change it to "typ". * Update the type comparison routine to handle tuples and special types inline. * Teach gc/fmt.go how to print special types. * We can now eliminate ElemTypes in favor of just Elem, and probably also some other duplicated Type methods designed to return ssa.Type instead of *types.Type. * The ssa tests were using their own dummy types, and they were not particularly careful about types in general. Of necessity, this CL switches them to use *types.Type; it does not make them more type-accurate. Unfortunately, using types.Type means initializing a bit of the types universe. This is prime for refactoring and improvement. This shrinks ssa.Value; it now fits in a smaller size class on 64 bit systems. This doesn't have a giant impact, though, since most Values are preallocated in a chunk. name old alloc/op new alloc/op delta Template 37.9MB ± 0% 37.7MB ± 0% -0.57% (p=0.000 n=10+8) Unicode 28.9MB ± 0% 28.7MB ± 0% -0.52% (p=0.000 n=10+10) GoTypes 110MB ± 0% 109MB ± 0% -0.88% (p=0.000 n=10+10) Flate 24.7MB ± 0% 24.6MB ± 0% -0.66% (p=0.000 n=10+10) GoParser 31.1MB ± 0% 30.9MB ± 0% -0.61% (p=0.000 n=10+9) Reflect 73.9MB ± 0% 73.4MB ± 0% -0.62% (p=0.000 n=10+8) Tar 25.8MB ± 0% 25.6MB ± 0% -0.77% (p=0.000 n=9+10) XML 41.2MB ± 0% 40.9MB ± 0% -0.80% (p=0.000 n=10+10) [Geo mean] 40.5MB 40.3MB -0.68% name old allocs/op new allocs/op delta Template 385k ± 0% 386k ± 0% ~ (p=0.356 n=10+9) Unicode 343k ± 1% 344k ± 0% ~ (p=0.481 n=10+10) GoTypes 1.16M ± 0% 1.16M ± 0% -0.16% (p=0.004 n=10+10) Flate 238k ± 1% 238k ± 1% ~ (p=0.853 n=10+10) GoParser 320k ± 0% 320k ± 0% ~ (p=0.720 n=10+9) Reflect 957k ± 0% 957k ± 0% ~ (p=0.460 n=10+8) Tar 252k ± 0% 252k ± 0% ~ (p=0.133 n=9+10) XML 400k ± 0% 400k ± 0% ~ (p=0.796 n=10+10) [Geo mean] 428k 428k -0.01% Removing all the interface calls helps non-trivially with CPU, though. name old time/op new time/op delta Template 178ms ± 4% 173ms ± 3% -2.90% (p=0.000 n=94+96) Unicode 85.0ms ± 4% 83.9ms ± 4% -1.23% (p=0.000 n=96+96) GoTypes 543ms ± 3% 528ms ± 3% -2.73% (p=0.000 n=98+96) Flate 116ms ± 3% 113ms ± 4% -2.34% (p=0.000 n=96+99) GoParser 144ms ± 3% 140ms ± 4% -2.80% (p=0.000 n=99+97) Reflect 344ms ± 3% 334ms ± 4% -3.02% (p=0.000 n=100+99) Tar 106ms ± 5% 103ms ± 4% -3.30% (p=0.000 n=98+94) XML 198ms ± 5% 192ms ± 4% -2.88% (p=0.000 n=92+95) [Geo mean] 178ms 173ms -2.65% name old user-time/op new user-time/op delta Template 229ms ± 5% 224ms ± 5% -2.36% (p=0.000 n=95+99) Unicode 107ms ± 6% 106ms ± 5% -1.13% (p=0.001 n=93+95) GoTypes 696ms ± 4% 679ms ± 4% -2.45% (p=0.000 n=97+99) Flate 137ms ± 4% 134ms ± 5% -2.66% (p=0.000 n=99+96) GoParser 176ms ± 5% 172ms ± 8% -2.27% (p=0.000 n=98+100) Reflect 430ms ± 6% 411ms ± 5% -4.46% (p=0.000 n=100+92) Tar 128ms ±13% 123ms ±13% -4.21% (p=0.000 n=100+100) XML 239ms ± 6% 233ms ± 6% -2.50% (p=0.000 n=95+97) [Geo mean] 220ms 213ms -2.76% Change-Id: I15c7d6268347f8358e75066dfdbd77db24e8d0c1 Reviewed-on: https://go-review.googlesource.com/42145 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2017-04-28 14:12:28 -07:00
"cmd/compile/internal/types"
"cmd/internal/obj"
"cmd/internal/obj/x86"
)
// markMoves marks any MOVXconst ops that need to avoid clobbering flags.
func ssaMarkMoves(s *gc.SSAGenState, b *ssa.Block) {
flive := b.FlagsLiveAtEnd
if b.Control != nil && b.Control.Type.IsFlags() {
flive = true
}
for i := len(b.Values) - 1; i >= 0; i-- {
v := b.Values[i]
if flive && (v.Op == ssa.OpAMD64MOVLconst || v.Op == ssa.OpAMD64MOVQconst) {
// The "mark" is any non-nil Aux value.
v.Aux = v
}
if v.Type.IsFlags() {
flive = false
}
for _, a := range v.Args {
if a.Type.IsFlags() {
flive = true
}
}
}
}
// loadByType returns the load instruction of the given type.
cmd/compile: change ssa.Type into *types.Type When package ssa was created, Type was in package gc. To avoid circular dependencies, we used an interface (ssa.Type) to represent type information in SSA. In the Go 1.9 cycle, gri extricated the Type type from package gc. As a result, we can now use it in package ssa. Now, instead of package types depending on package ssa, it is the other way. This is a more sensible dependency tree, and helps compiler performance a bit. Though this is a big CL, most of the changes are mechanical and uninteresting. Interesting bits: * Add new singleton globals to package types for the special SSA types Memory, Void, Invalid, Flags, and Int128. * Add two new Types, TSSA for the special types, and TTUPLE, for SSA tuple types. ssa.MakeTuple is now types.NewTuple. * Move type comparison result constants CMPlt, CMPeq, and CMPgt to package types. * We had picked the name "types" in our rules for the handy list of types provided by ssa.Config. That conflicted with the types package name, so change it to "typ". * Update the type comparison routine to handle tuples and special types inline. * Teach gc/fmt.go how to print special types. * We can now eliminate ElemTypes in favor of just Elem, and probably also some other duplicated Type methods designed to return ssa.Type instead of *types.Type. * The ssa tests were using their own dummy types, and they were not particularly careful about types in general. Of necessity, this CL switches them to use *types.Type; it does not make them more type-accurate. Unfortunately, using types.Type means initializing a bit of the types universe. This is prime for refactoring and improvement. This shrinks ssa.Value; it now fits in a smaller size class on 64 bit systems. This doesn't have a giant impact, though, since most Values are preallocated in a chunk. name old alloc/op new alloc/op delta Template 37.9MB ± 0% 37.7MB ± 0% -0.57% (p=0.000 n=10+8) Unicode 28.9MB ± 0% 28.7MB ± 0% -0.52% (p=0.000 n=10+10) GoTypes 110MB ± 0% 109MB ± 0% -0.88% (p=0.000 n=10+10) Flate 24.7MB ± 0% 24.6MB ± 0% -0.66% (p=0.000 n=10+10) GoParser 31.1MB ± 0% 30.9MB ± 0% -0.61% (p=0.000 n=10+9) Reflect 73.9MB ± 0% 73.4MB ± 0% -0.62% (p=0.000 n=10+8) Tar 25.8MB ± 0% 25.6MB ± 0% -0.77% (p=0.000 n=9+10) XML 41.2MB ± 0% 40.9MB ± 0% -0.80% (p=0.000 n=10+10) [Geo mean] 40.5MB 40.3MB -0.68% name old allocs/op new allocs/op delta Template 385k ± 0% 386k ± 0% ~ (p=0.356 n=10+9) Unicode 343k ± 1% 344k ± 0% ~ (p=0.481 n=10+10) GoTypes 1.16M ± 0% 1.16M ± 0% -0.16% (p=0.004 n=10+10) Flate 238k ± 1% 238k ± 1% ~ (p=0.853 n=10+10) GoParser 320k ± 0% 320k ± 0% ~ (p=0.720 n=10+9) Reflect 957k ± 0% 957k ± 0% ~ (p=0.460 n=10+8) Tar 252k ± 0% 252k ± 0% ~ (p=0.133 n=9+10) XML 400k ± 0% 400k ± 0% ~ (p=0.796 n=10+10) [Geo mean] 428k 428k -0.01% Removing all the interface calls helps non-trivially with CPU, though. name old time/op new time/op delta Template 178ms ± 4% 173ms ± 3% -2.90% (p=0.000 n=94+96) Unicode 85.0ms ± 4% 83.9ms ± 4% -1.23% (p=0.000 n=96+96) GoTypes 543ms ± 3% 528ms ± 3% -2.73% (p=0.000 n=98+96) Flate 116ms ± 3% 113ms ± 4% -2.34% (p=0.000 n=96+99) GoParser 144ms ± 3% 140ms ± 4% -2.80% (p=0.000 n=99+97) Reflect 344ms ± 3% 334ms ± 4% -3.02% (p=0.000 n=100+99) Tar 106ms ± 5% 103ms ± 4% -3.30% (p=0.000 n=98+94) XML 198ms ± 5% 192ms ± 4% -2.88% (p=0.000 n=92+95) [Geo mean] 178ms 173ms -2.65% name old user-time/op new user-time/op delta Template 229ms ± 5% 224ms ± 5% -2.36% (p=0.000 n=95+99) Unicode 107ms ± 6% 106ms ± 5% -1.13% (p=0.001 n=93+95) GoTypes 696ms ± 4% 679ms ± 4% -2.45% (p=0.000 n=97+99) Flate 137ms ± 4% 134ms ± 5% -2.66% (p=0.000 n=99+96) GoParser 176ms ± 5% 172ms ± 8% -2.27% (p=0.000 n=98+100) Reflect 430ms ± 6% 411ms ± 5% -4.46% (p=0.000 n=100+92) Tar 128ms ±13% 123ms ±13% -4.21% (p=0.000 n=100+100) XML 239ms ± 6% 233ms ± 6% -2.50% (p=0.000 n=95+97) [Geo mean] 220ms 213ms -2.76% Change-Id: I15c7d6268347f8358e75066dfdbd77db24e8d0c1 Reviewed-on: https://go-review.googlesource.com/42145 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2017-04-28 14:12:28 -07:00
func loadByType(t *types.Type) obj.As {
// Avoid partial register write
if !t.IsFloat() && t.Size() <= 2 {
if t.Size() == 1 {
return x86.AMOVBLZX
} else {
return x86.AMOVWLZX
}
}
// Otherwise, there's no difference between load and store opcodes.
return storeByType(t)
}
// storeByType returns the store instruction of the given type.
cmd/compile: change ssa.Type into *types.Type When package ssa was created, Type was in package gc. To avoid circular dependencies, we used an interface (ssa.Type) to represent type information in SSA. In the Go 1.9 cycle, gri extricated the Type type from package gc. As a result, we can now use it in package ssa. Now, instead of package types depending on package ssa, it is the other way. This is a more sensible dependency tree, and helps compiler performance a bit. Though this is a big CL, most of the changes are mechanical and uninteresting. Interesting bits: * Add new singleton globals to package types for the special SSA types Memory, Void, Invalid, Flags, and Int128. * Add two new Types, TSSA for the special types, and TTUPLE, for SSA tuple types. ssa.MakeTuple is now types.NewTuple. * Move type comparison result constants CMPlt, CMPeq, and CMPgt to package types. * We had picked the name "types" in our rules for the handy list of types provided by ssa.Config. That conflicted with the types package name, so change it to "typ". * Update the type comparison routine to handle tuples and special types inline. * Teach gc/fmt.go how to print special types. * We can now eliminate ElemTypes in favor of just Elem, and probably also some other duplicated Type methods designed to return ssa.Type instead of *types.Type. * The ssa tests were using their own dummy types, and they were not particularly careful about types in general. Of necessity, this CL switches them to use *types.Type; it does not make them more type-accurate. Unfortunately, using types.Type means initializing a bit of the types universe. This is prime for refactoring and improvement. This shrinks ssa.Value; it now fits in a smaller size class on 64 bit systems. This doesn't have a giant impact, though, since most Values are preallocated in a chunk. name old alloc/op new alloc/op delta Template 37.9MB ± 0% 37.7MB ± 0% -0.57% (p=0.000 n=10+8) Unicode 28.9MB ± 0% 28.7MB ± 0% -0.52% (p=0.000 n=10+10) GoTypes 110MB ± 0% 109MB ± 0% -0.88% (p=0.000 n=10+10) Flate 24.7MB ± 0% 24.6MB ± 0% -0.66% (p=0.000 n=10+10) GoParser 31.1MB ± 0% 30.9MB ± 0% -0.61% (p=0.000 n=10+9) Reflect 73.9MB ± 0% 73.4MB ± 0% -0.62% (p=0.000 n=10+8) Tar 25.8MB ± 0% 25.6MB ± 0% -0.77% (p=0.000 n=9+10) XML 41.2MB ± 0% 40.9MB ± 0% -0.80% (p=0.000 n=10+10) [Geo mean] 40.5MB 40.3MB -0.68% name old allocs/op new allocs/op delta Template 385k ± 0% 386k ± 0% ~ (p=0.356 n=10+9) Unicode 343k ± 1% 344k ± 0% ~ (p=0.481 n=10+10) GoTypes 1.16M ± 0% 1.16M ± 0% -0.16% (p=0.004 n=10+10) Flate 238k ± 1% 238k ± 1% ~ (p=0.853 n=10+10) GoParser 320k ± 0% 320k ± 0% ~ (p=0.720 n=10+9) Reflect 957k ± 0% 957k ± 0% ~ (p=0.460 n=10+8) Tar 252k ± 0% 252k ± 0% ~ (p=0.133 n=9+10) XML 400k ± 0% 400k ± 0% ~ (p=0.796 n=10+10) [Geo mean] 428k 428k -0.01% Removing all the interface calls helps non-trivially with CPU, though. name old time/op new time/op delta Template 178ms ± 4% 173ms ± 3% -2.90% (p=0.000 n=94+96) Unicode 85.0ms ± 4% 83.9ms ± 4% -1.23% (p=0.000 n=96+96) GoTypes 543ms ± 3% 528ms ± 3% -2.73% (p=0.000 n=98+96) Flate 116ms ± 3% 113ms ± 4% -2.34% (p=0.000 n=96+99) GoParser 144ms ± 3% 140ms ± 4% -2.80% (p=0.000 n=99+97) Reflect 344ms ± 3% 334ms ± 4% -3.02% (p=0.000 n=100+99) Tar 106ms ± 5% 103ms ± 4% -3.30% (p=0.000 n=98+94) XML 198ms ± 5% 192ms ± 4% -2.88% (p=0.000 n=92+95) [Geo mean] 178ms 173ms -2.65% name old user-time/op new user-time/op delta Template 229ms ± 5% 224ms ± 5% -2.36% (p=0.000 n=95+99) Unicode 107ms ± 6% 106ms ± 5% -1.13% (p=0.001 n=93+95) GoTypes 696ms ± 4% 679ms ± 4% -2.45% (p=0.000 n=97+99) Flate 137ms ± 4% 134ms ± 5% -2.66% (p=0.000 n=99+96) GoParser 176ms ± 5% 172ms ± 8% -2.27% (p=0.000 n=98+100) Reflect 430ms ± 6% 411ms ± 5% -4.46% (p=0.000 n=100+92) Tar 128ms ±13% 123ms ±13% -4.21% (p=0.000 n=100+100) XML 239ms ± 6% 233ms ± 6% -2.50% (p=0.000 n=95+97) [Geo mean] 220ms 213ms -2.76% Change-Id: I15c7d6268347f8358e75066dfdbd77db24e8d0c1 Reviewed-on: https://go-review.googlesource.com/42145 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2017-04-28 14:12:28 -07:00
func storeByType(t *types.Type) obj.As {
width := t.Size()
if t.IsFloat() {
switch width {
case 4:
return x86.AMOVSS
case 8:
return x86.AMOVSD
}
} else {
switch width {
case 1:
return x86.AMOVB
case 2:
return x86.AMOVW
case 4:
return x86.AMOVL
case 8:
return x86.AMOVQ
}
}
panic("bad store type")
}
// moveByType returns the reg->reg move instruction of the given type.
cmd/compile: change ssa.Type into *types.Type When package ssa was created, Type was in package gc. To avoid circular dependencies, we used an interface (ssa.Type) to represent type information in SSA. In the Go 1.9 cycle, gri extricated the Type type from package gc. As a result, we can now use it in package ssa. Now, instead of package types depending on package ssa, it is the other way. This is a more sensible dependency tree, and helps compiler performance a bit. Though this is a big CL, most of the changes are mechanical and uninteresting. Interesting bits: * Add new singleton globals to package types for the special SSA types Memory, Void, Invalid, Flags, and Int128. * Add two new Types, TSSA for the special types, and TTUPLE, for SSA tuple types. ssa.MakeTuple is now types.NewTuple. * Move type comparison result constants CMPlt, CMPeq, and CMPgt to package types. * We had picked the name "types" in our rules for the handy list of types provided by ssa.Config. That conflicted with the types package name, so change it to "typ". * Update the type comparison routine to handle tuples and special types inline. * Teach gc/fmt.go how to print special types. * We can now eliminate ElemTypes in favor of just Elem, and probably also some other duplicated Type methods designed to return ssa.Type instead of *types.Type. * The ssa tests were using their own dummy types, and they were not particularly careful about types in general. Of necessity, this CL switches them to use *types.Type; it does not make them more type-accurate. Unfortunately, using types.Type means initializing a bit of the types universe. This is prime for refactoring and improvement. This shrinks ssa.Value; it now fits in a smaller size class on 64 bit systems. This doesn't have a giant impact, though, since most Values are preallocated in a chunk. name old alloc/op new alloc/op delta Template 37.9MB ± 0% 37.7MB ± 0% -0.57% (p=0.000 n=10+8) Unicode 28.9MB ± 0% 28.7MB ± 0% -0.52% (p=0.000 n=10+10) GoTypes 110MB ± 0% 109MB ± 0% -0.88% (p=0.000 n=10+10) Flate 24.7MB ± 0% 24.6MB ± 0% -0.66% (p=0.000 n=10+10) GoParser 31.1MB ± 0% 30.9MB ± 0% -0.61% (p=0.000 n=10+9) Reflect 73.9MB ± 0% 73.4MB ± 0% -0.62% (p=0.000 n=10+8) Tar 25.8MB ± 0% 25.6MB ± 0% -0.77% (p=0.000 n=9+10) XML 41.2MB ± 0% 40.9MB ± 0% -0.80% (p=0.000 n=10+10) [Geo mean] 40.5MB 40.3MB -0.68% name old allocs/op new allocs/op delta Template 385k ± 0% 386k ± 0% ~ (p=0.356 n=10+9) Unicode 343k ± 1% 344k ± 0% ~ (p=0.481 n=10+10) GoTypes 1.16M ± 0% 1.16M ± 0% -0.16% (p=0.004 n=10+10) Flate 238k ± 1% 238k ± 1% ~ (p=0.853 n=10+10) GoParser 320k ± 0% 320k ± 0% ~ (p=0.720 n=10+9) Reflect 957k ± 0% 957k ± 0% ~ (p=0.460 n=10+8) Tar 252k ± 0% 252k ± 0% ~ (p=0.133 n=9+10) XML 400k ± 0% 400k ± 0% ~ (p=0.796 n=10+10) [Geo mean] 428k 428k -0.01% Removing all the interface calls helps non-trivially with CPU, though. name old time/op new time/op delta Template 178ms ± 4% 173ms ± 3% -2.90% (p=0.000 n=94+96) Unicode 85.0ms ± 4% 83.9ms ± 4% -1.23% (p=0.000 n=96+96) GoTypes 543ms ± 3% 528ms ± 3% -2.73% (p=0.000 n=98+96) Flate 116ms ± 3% 113ms ± 4% -2.34% (p=0.000 n=96+99) GoParser 144ms ± 3% 140ms ± 4% -2.80% (p=0.000 n=99+97) Reflect 344ms ± 3% 334ms ± 4% -3.02% (p=0.000 n=100+99) Tar 106ms ± 5% 103ms ± 4% -3.30% (p=0.000 n=98+94) XML 198ms ± 5% 192ms ± 4% -2.88% (p=0.000 n=92+95) [Geo mean] 178ms 173ms -2.65% name old user-time/op new user-time/op delta Template 229ms ± 5% 224ms ± 5% -2.36% (p=0.000 n=95+99) Unicode 107ms ± 6% 106ms ± 5% -1.13% (p=0.001 n=93+95) GoTypes 696ms ± 4% 679ms ± 4% -2.45% (p=0.000 n=97+99) Flate 137ms ± 4% 134ms ± 5% -2.66% (p=0.000 n=99+96) GoParser 176ms ± 5% 172ms ± 8% -2.27% (p=0.000 n=98+100) Reflect 430ms ± 6% 411ms ± 5% -4.46% (p=0.000 n=100+92) Tar 128ms ±13% 123ms ±13% -4.21% (p=0.000 n=100+100) XML 239ms ± 6% 233ms ± 6% -2.50% (p=0.000 n=95+97) [Geo mean] 220ms 213ms -2.76% Change-Id: I15c7d6268347f8358e75066dfdbd77db24e8d0c1 Reviewed-on: https://go-review.googlesource.com/42145 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2017-04-28 14:12:28 -07:00
func moveByType(t *types.Type) obj.As {
if t.IsFloat() {
// Moving the whole sse2 register is faster
// than moving just the correct low portion of it.
// There is no xmm->xmm move with 1 byte opcode,
// so use movups, which has 2 byte opcode.
return x86.AMOVUPS
} else {
switch t.Size() {
case 1:
// Avoids partial register write
return x86.AMOVL
case 2:
return x86.AMOVL
case 4:
return x86.AMOVL
case 8:
return x86.AMOVQ
case 16:
return x86.AMOVUPS // int128s are in SSE registers
default:
panic(fmt.Sprintf("bad int register width %d:%s", t.Size(), t))
}
}
}
// opregreg emits instructions for
// dest := dest(To) op src(From)
// and also returns the created obj.Prog so it
// may be further adjusted (offset, scale, etc).
func opregreg(s *gc.SSAGenState, op obj.As, dest, src int16) *obj.Prog {
p := s.Prog(op)
p.From.Type = obj.TYPE_REG
p.To.Type = obj.TYPE_REG
p.To.Reg = dest
p.From.Reg = src
return p
}
cmd/compile/internal/ssa: use sse to zero on amd64 Use 16-byte stores instead of 8-byte stores to zero small blocks. Also switch to duffzero for 65+ bytes only, because for each duffzero call we also save/restore BP, so call requires 4 instructions and replacing it with 4 sse stores doesn't cause code-bloat. Also switch duffzero to use leaq, instead of addq to avoid clobbering flags. ClearFat8-6 0.54ns ± 0% 0.54ns ± 0% ~ (all equal) ClearFat12-6 1.07ns ± 0% 1.07ns ± 0% ~ (all equal) ClearFat16-6 1.07ns ± 0% 0.69ns ± 0% -35.51% (p=0.001 n=8+9) ClearFat24-6 1.61ns ± 1% 1.07ns ± 0% -33.33% (p=0.000 n=10+10) ClearFat32-6 2.14ns ± 0% 1.07ns ± 0% -50.00% (p=0.001 n=8+9) ClearFat40-6 2.67ns ± 1% 1.61ns ± 0% -39.72% (p=0.000 n=10+8) ClearFat48-6 3.75ns ± 0% 2.68ns ± 0% -28.59% (p=0.000 n=9+9) ClearFat56-6 4.29ns ± 0% 3.22ns ± 0% -25.10% (p=0.000 n=9+9) ClearFat64-6 4.30ns ± 0% 3.22ns ± 0% -25.15% (p=0.000 n=8+8) ClearFat128-6 7.50ns ± 1% 7.51ns ± 0% ~ (p=0.767 n=10+9) ClearFat256-6 13.9ns ± 1% 13.9ns ± 1% ~ (p=0.257 n=10+10) ClearFat512-6 26.8ns ± 0% 26.8ns ± 0% ~ (p=0.467 n=8+8) ClearFat1024-6 52.5ns ± 0% 52.5ns ± 0% ~ (p=1.000 n=8+8) Also shaves ~20kb from go tool: go_old 10384994 go_new 10364514 [-20480 bytes] section differences global text (code) = -20585 bytes (-0.532047%) read-only data = -302 bytes (-0.018101%) Total difference -20887 bytes (-0.348731%) Change-Id: I15854e87544545c1af24775df895e38e16e12694 Reviewed-on: https://go-review.googlesource.com/54410 Run-TryBot: Ilya Tocar <ilya.tocar@intel.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2017-08-09 14:50:58 -05:00
// DUFFZERO consists of repeated blocks of 4 MOVUPSs + LEAQ,
// See runtime/mkduff.go.
func duffStart(size int64) int64 {
x, _ := duff(size)
return x
}
func duffAdj(size int64) int64 {
_, x := duff(size)
return x
}
// duff returns the offset (from duffzero, in bytes) and pointer adjust (in bytes)
// required to use the duffzero mechanism for a block of the given size.
func duff(size int64) (int64, int64) {
if size < 32 || size > 1024 || size%dzClearStep != 0 {
panic("bad duffzero size")
}
steps := size / dzClearStep
blocks := steps / dzBlockLen
steps %= dzBlockLen
off := dzBlockSize * (dzBlocks - blocks)
var adj int64
if steps != 0 {
cmd/compile/internal/ssa: use sse to zero on amd64 Use 16-byte stores instead of 8-byte stores to zero small blocks. Also switch to duffzero for 65+ bytes only, because for each duffzero call we also save/restore BP, so call requires 4 instructions and replacing it with 4 sse stores doesn't cause code-bloat. Also switch duffzero to use leaq, instead of addq to avoid clobbering flags. ClearFat8-6 0.54ns ± 0% 0.54ns ± 0% ~ (all equal) ClearFat12-6 1.07ns ± 0% 1.07ns ± 0% ~ (all equal) ClearFat16-6 1.07ns ± 0% 0.69ns ± 0% -35.51% (p=0.001 n=8+9) ClearFat24-6 1.61ns ± 1% 1.07ns ± 0% -33.33% (p=0.000 n=10+10) ClearFat32-6 2.14ns ± 0% 1.07ns ± 0% -50.00% (p=0.001 n=8+9) ClearFat40-6 2.67ns ± 1% 1.61ns ± 0% -39.72% (p=0.000 n=10+8) ClearFat48-6 3.75ns ± 0% 2.68ns ± 0% -28.59% (p=0.000 n=9+9) ClearFat56-6 4.29ns ± 0% 3.22ns ± 0% -25.10% (p=0.000 n=9+9) ClearFat64-6 4.30ns ± 0% 3.22ns ± 0% -25.15% (p=0.000 n=8+8) ClearFat128-6 7.50ns ± 1% 7.51ns ± 0% ~ (p=0.767 n=10+9) ClearFat256-6 13.9ns ± 1% 13.9ns ± 1% ~ (p=0.257 n=10+10) ClearFat512-6 26.8ns ± 0% 26.8ns ± 0% ~ (p=0.467 n=8+8) ClearFat1024-6 52.5ns ± 0% 52.5ns ± 0% ~ (p=1.000 n=8+8) Also shaves ~20kb from go tool: go_old 10384994 go_new 10364514 [-20480 bytes] section differences global text (code) = -20585 bytes (-0.532047%) read-only data = -302 bytes (-0.018101%) Total difference -20887 bytes (-0.348731%) Change-Id: I15854e87544545c1af24775df895e38e16e12694 Reviewed-on: https://go-review.googlesource.com/54410 Run-TryBot: Ilya Tocar <ilya.tocar@intel.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2017-08-09 14:50:58 -05:00
off -= dzLeaqSize
off -= dzMovSize * steps
adj -= dzClearStep * (dzBlockLen - steps)
}
return off, adj
}
func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
switch v.Op {
case ssa.OpAMD64ADDQ, ssa.OpAMD64ADDL:
r := v.Reg()
r1 := v.Args[0].Reg()
r2 := v.Args[1].Reg()
switch {
case r == r1:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = r2
p.To.Type = obj.TYPE_REG
p.To.Reg = r
case r == r2:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = r1
p.To.Type = obj.TYPE_REG
p.To.Reg = r
default:
var asm obj.As
if v.Op == ssa.OpAMD64ADDQ {
asm = x86.ALEAQ
} else {
asm = x86.ALEAL
}
p := s.Prog(asm)
p.From.Type = obj.TYPE_MEM
p.From.Reg = r1
p.From.Scale = 1
p.From.Index = r2
p.To.Type = obj.TYPE_REG
p.To.Reg = r
}
// 2-address opcode arithmetic
case ssa.OpAMD64SUBQ, ssa.OpAMD64SUBL,
ssa.OpAMD64MULQ, ssa.OpAMD64MULL,
ssa.OpAMD64ANDQ, ssa.OpAMD64ANDL,
ssa.OpAMD64ORQ, ssa.OpAMD64ORL,
ssa.OpAMD64XORQ, ssa.OpAMD64XORL,
ssa.OpAMD64SHLQ, ssa.OpAMD64SHLL,
ssa.OpAMD64SHRQ, ssa.OpAMD64SHRL, ssa.OpAMD64SHRW, ssa.OpAMD64SHRB,
ssa.OpAMD64SARQ, ssa.OpAMD64SARL, ssa.OpAMD64SARW, ssa.OpAMD64SARB,
ssa.OpAMD64ROLQ, ssa.OpAMD64ROLL, ssa.OpAMD64ROLW, ssa.OpAMD64ROLB,
ssa.OpAMD64RORQ, ssa.OpAMD64RORL, ssa.OpAMD64RORW, ssa.OpAMD64RORB,
ssa.OpAMD64ADDSS, ssa.OpAMD64ADDSD, ssa.OpAMD64SUBSS, ssa.OpAMD64SUBSD,
ssa.OpAMD64MULSS, ssa.OpAMD64MULSD, ssa.OpAMD64DIVSS, ssa.OpAMD64DIVSD,
ssa.OpAMD64PXOR:
r := v.Reg()
if r != v.Args[0].Reg() {
v.Fatalf("input[0] and output not in same register %s", v.LongString())
}
opregreg(s, v.Op.Asm(), r, v.Args[1].Reg())
case ssa.OpAMD64DIVQU, ssa.OpAMD64DIVLU, ssa.OpAMD64DIVWU:
// Arg[0] (the dividend) is in AX.
// Arg[1] (the divisor) can be in any other register.
// Result[0] (the quotient) is in AX.
// Result[1] (the remainder) is in DX.
r := v.Args[1].Reg()
// Zero extend dividend.
c := s.Prog(x86.AXORL)
c.From.Type = obj.TYPE_REG
c.From.Reg = x86.REG_DX
c.To.Type = obj.TYPE_REG
c.To.Reg = x86.REG_DX
// Issue divide.
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = r
case ssa.OpAMD64DIVQ, ssa.OpAMD64DIVL, ssa.OpAMD64DIVW:
// Arg[0] (the dividend) is in AX.
// Arg[1] (the divisor) can be in any other register.
// Result[0] (the quotient) is in AX.
// Result[1] (the remainder) is in DX.
r := v.Args[1].Reg()
// CPU faults upon signed overflow, which occurs when the most
// negative int is divided by -1. Handle divide by -1 as a special case.
var c *obj.Prog
switch v.Op {
case ssa.OpAMD64DIVQ:
c = s.Prog(x86.ACMPQ)
case ssa.OpAMD64DIVL:
c = s.Prog(x86.ACMPL)
case ssa.OpAMD64DIVW:
c = s.Prog(x86.ACMPW)
}
c.From.Type = obj.TYPE_REG
c.From.Reg = r
c.To.Type = obj.TYPE_CONST
c.To.Offset = -1
j1 := s.Prog(x86.AJEQ)
j1.To.Type = obj.TYPE_BRANCH
// Sign extend dividend.
switch v.Op {
case ssa.OpAMD64DIVQ:
s.Prog(x86.ACQO)
case ssa.OpAMD64DIVL:
s.Prog(x86.ACDQ)
case ssa.OpAMD64DIVW:
s.Prog(x86.ACWD)
}
// Issue divide.
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = r
// Skip over -1 fixup code.
j2 := s.Prog(obj.AJMP)
j2.To.Type = obj.TYPE_BRANCH
// Issue -1 fixup code.
// n / -1 = -n
n1 := s.Prog(x86.ANEGQ)
n1.To.Type = obj.TYPE_REG
n1.To.Reg = x86.REG_AX
// n % -1 == 0
n2 := s.Prog(x86.AXORL)
n2.From.Type = obj.TYPE_REG
n2.From.Reg = x86.REG_DX
n2.To.Type = obj.TYPE_REG
n2.To.Reg = x86.REG_DX
// TODO(khr): issue only the -1 fixup code we need.
// For instance, if only the quotient is used, no point in zeroing the remainder.
j1.To.Val = n1
j2.To.Val = s.Pc()
case ssa.OpAMD64HMULQ, ssa.OpAMD64HMULL, ssa.OpAMD64HMULQU, ssa.OpAMD64HMULLU:
// the frontend rewrites constant division by 8/16/32 bit integers into
// HMUL by a constant
// SSA rewrites generate the 64 bit versions
// Arg[0] is already in AX as it's the only register we allow
// and DX is the only output we care about (the high bits)
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[1].Reg()
// IMULB puts the high portion in AH instead of DL,
// so move it to DL for consistency
if v.Type.Size() == 1 {
m := s.Prog(x86.AMOVB)
m.From.Type = obj.TYPE_REG
m.From.Reg = x86.REG_AH
m.To.Type = obj.TYPE_REG
m.To.Reg = x86.REG_DX
}
case ssa.OpAMD64MULQU2:
// Arg[0] is already in AX as it's the only register we allow
// results hi in DX, lo in AX
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[1].Reg()
case ssa.OpAMD64DIVQU2:
// Arg[0], Arg[1] are already in Dx, AX, as they're the only registers we allow
// results q in AX, r in DX
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[2].Reg()
case ssa.OpAMD64AVGQU:
// compute (x+y)/2 unsigned.
// Do a 64-bit add, the overflow goes into the carry.
// Shift right once and pull the carry back into the 63rd bit.
r := v.Reg()
if r != v.Args[0].Reg() {
v.Fatalf("input[0] and output not in same register %s", v.LongString())
}
p := s.Prog(x86.AADDQ)
p.From.Type = obj.TYPE_REG
p.To.Type = obj.TYPE_REG
p.To.Reg = r
p.From.Reg = v.Args[1].Reg()
p = s.Prog(x86.ARCRQ)
p.From.Type = obj.TYPE_CONST
p.From.Offset = 1
p.To.Type = obj.TYPE_REG
p.To.Reg = r
case ssa.OpAMD64ADDQconst, ssa.OpAMD64ADDLconst:
r := v.Reg()
a := v.Args[0].Reg()
if r == a {
if v.AuxInt == 1 {
var asm obj.As
// Software optimization manual recommends add $1,reg.
// But inc/dec is 1 byte smaller. ICC always uses inc
// Clang/GCC choose depending on flags, but prefer add.
// Experiments show that inc/dec is both a little faster
// and make a binary a little smaller.
if v.Op == ssa.OpAMD64ADDQconst {
asm = x86.AINCQ
} else {
asm = x86.AINCL
}
p := s.Prog(asm)
p.To.Type = obj.TYPE_REG
p.To.Reg = r
return
}
if v.AuxInt == -1 {
var asm obj.As
if v.Op == ssa.OpAMD64ADDQconst {
asm = x86.ADECQ
} else {
asm = x86.ADECL
}
p := s.Prog(asm)
p.To.Type = obj.TYPE_REG
p.To.Reg = r
return
}
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_CONST
p.From.Offset = v.AuxInt
p.To.Type = obj.TYPE_REG
p.To.Reg = r
return
}
var asm obj.As
if v.Op == ssa.OpAMD64ADDQconst {
asm = x86.ALEAQ
} else {
asm = x86.ALEAL
}
p := s.Prog(asm)
p.From.Type = obj.TYPE_MEM
p.From.Reg = a
p.From.Offset = v.AuxInt
p.To.Type = obj.TYPE_REG
p.To.Reg = r
case ssa.OpAMD64CMOVQEQ, ssa.OpAMD64CMOVLEQ:
r := v.Reg()
if r != v.Args[0].Reg() {
v.Fatalf("input[0] and output not in same register %s", v.LongString())
}
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[1].Reg()
p.To.Type = obj.TYPE_REG
p.To.Reg = r
case ssa.OpAMD64MULQconst, ssa.OpAMD64MULLconst:
r := v.Reg()
if r != v.Args[0].Reg() {
v.Fatalf("input[0] and output not in same register %s", v.LongString())
}
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_CONST
p.From.Offset = v.AuxInt
p.To.Type = obj.TYPE_REG
p.To.Reg = r
// TODO: Teach doasm to compile the three-address multiply imul $c, r1, r2
// then we don't need to use resultInArg0 for these ops.
//p.From3 = new(obj.Addr)
//p.From3.Type = obj.TYPE_REG
//p.From3.Reg = v.Args[0].Reg()
case ssa.OpAMD64SUBQconst, ssa.OpAMD64SUBLconst,
ssa.OpAMD64ANDQconst, ssa.OpAMD64ANDLconst,
ssa.OpAMD64ORQconst, ssa.OpAMD64ORLconst,
ssa.OpAMD64XORQconst, ssa.OpAMD64XORLconst,
ssa.OpAMD64SHLQconst, ssa.OpAMD64SHLLconst,
ssa.OpAMD64SHRQconst, ssa.OpAMD64SHRLconst, ssa.OpAMD64SHRWconst, ssa.OpAMD64SHRBconst,
ssa.OpAMD64SARQconst, ssa.OpAMD64SARLconst, ssa.OpAMD64SARWconst, ssa.OpAMD64SARBconst,
ssa.OpAMD64ROLQconst, ssa.OpAMD64ROLLconst, ssa.OpAMD64ROLWconst, ssa.OpAMD64ROLBconst:
r := v.Reg()
if r != v.Args[0].Reg() {
v.Fatalf("input[0] and output not in same register %s", v.LongString())
}
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_CONST
p.From.Offset = v.AuxInt
p.To.Type = obj.TYPE_REG
p.To.Reg = r
case ssa.OpAMD64SBBQcarrymask, ssa.OpAMD64SBBLcarrymask:
r := v.Reg()
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = r
p.To.Type = obj.TYPE_REG
p.To.Reg = r
case ssa.OpAMD64LEAQ1, ssa.OpAMD64LEAQ2, ssa.OpAMD64LEAQ4, ssa.OpAMD64LEAQ8:
r := v.Args[0].Reg()
i := v.Args[1].Reg()
p := s.Prog(x86.ALEAQ)
switch v.Op {
case ssa.OpAMD64LEAQ1:
p.From.Scale = 1
if i == x86.REG_SP {
r, i = i, r
}
case ssa.OpAMD64LEAQ2:
p.From.Scale = 2
case ssa.OpAMD64LEAQ4:
p.From.Scale = 4
case ssa.OpAMD64LEAQ8:
p.From.Scale = 8
}
p.From.Type = obj.TYPE_MEM
p.From.Reg = r
p.From.Index = i
gc.AddAux(&p.From, v)
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpAMD64LEAQ, ssa.OpAMD64LEAL:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_MEM
p.From.Reg = v.Args[0].Reg()
gc.AddAux(&p.From, v)
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpAMD64CMPQ, ssa.OpAMD64CMPL, ssa.OpAMD64CMPW, ssa.OpAMD64CMPB,
ssa.OpAMD64TESTQ, ssa.OpAMD64TESTL, ssa.OpAMD64TESTW, ssa.OpAMD64TESTB,
ssa.OpAMD64BTL, ssa.OpAMD64BTQ:
opregreg(s, v.Op.Asm(), v.Args[1].Reg(), v.Args[0].Reg())
case ssa.OpAMD64UCOMISS, ssa.OpAMD64UCOMISD:
// Go assembler has swapped operands for UCOMISx relative to CMP,
// must account for that right here.
opregreg(s, v.Op.Asm(), v.Args[0].Reg(), v.Args[1].Reg())
case ssa.OpAMD64CMPQconst, ssa.OpAMD64CMPLconst, ssa.OpAMD64CMPWconst, ssa.OpAMD64CMPBconst:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[0].Reg()
p.To.Type = obj.TYPE_CONST
p.To.Offset = v.AuxInt
case ssa.OpAMD64TESTQconst, ssa.OpAMD64TESTLconst, ssa.OpAMD64TESTWconst, ssa.OpAMD64TESTBconst,
ssa.OpAMD64BTLconst, ssa.OpAMD64BTQconst:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_CONST
p.From.Offset = v.AuxInt
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Args[0].Reg()
case ssa.OpAMD64MOVLconst, ssa.OpAMD64MOVQconst:
x := v.Reg()
cmd/compile: use MOVL instead of MOVQ for small constants on amd64 The encoding of MOVL to a register is 2 bytes shorter than for MOVQ. The upper 32bit are automatically zeroed when MOVL to a register is used. Replaces 1657 MOVQ by MOVL in the go binary. Reduces go binary size by 4 kilobyte. name old time/op new time/op delta BinaryTree17 1.93s ± 0% 1.93s ± 0% -0.32% (p=0.000 n=9+9) Fannkuch11 2.66s ± 0% 2.48s ± 0% -6.60% (p=0.000 n=9+9) FmtFprintfEmpty 31.8ns ± 0% 31.6ns ± 0% -0.63% (p=0.000 n=10+10) FmtFprintfString 52.0ns ± 0% 51.9ns ± 0% -0.19% (p=0.000 n=10+10) FmtFprintfInt 55.6ns ± 0% 54.6ns ± 0% -1.80% (p=0.002 n=8+10) FmtFprintfIntInt 87.7ns ± 0% 84.8ns ± 0% -3.31% (p=0.000 n=9+9) FmtFprintfPrefixedInt 98.9ns ± 0% 102.0ns ± 0% +3.10% (p=0.000 n=10+10) FmtFprintfFloat 165ns ± 0% 164ns ± 0% -0.61% (p=0.000 n=10+10) FmtManyArgs 368ns ± 0% 361ns ± 0% -1.98% (p=0.000 n=8+10) GobDecode 4.53ms ± 0% 4.58ms ± 0% +1.08% (p=0.000 n=9+10) GobEncode 3.74ms ± 0% 3.73ms ± 0% -0.27% (p=0.000 n=10+10) Gzip 164ms ± 0% 163ms ± 0% -0.48% (p=0.000 n=10+10) Gunzip 26.7ms ± 0% 26.6ms ± 0% -0.13% (p=0.000 n=9+10) HTTPClientServer 30.4µs ± 1% 30.3µs ± 1% -0.41% (p=0.016 n=10+10) JSONEncode 10.9ms ± 0% 11.0ms ± 0% +0.70% (p=0.000 n=10+10) JSONDecode 36.8ms ± 0% 37.0ms ± 0% +0.59% (p=0.000 n=9+10) Mandelbrot200 3.20ms ± 0% 3.21ms ± 0% +0.44% (p=0.000 n=9+10) GoParse 2.35ms ± 0% 2.35ms ± 0% +0.26% (p=0.000 n=10+9) RegexpMatchEasy0_32 58.3ns ± 0% 58.4ns ± 0% +0.17% (p=0.000 n=10+10) RegexpMatchEasy0_1K 138ns ± 0% 142ns ± 0% +2.68% (p=0.000 n=10+10) RegexpMatchEasy1_32 55.1ns ± 0% 55.6ns ± 1% ~ (p=0.104 n=10+10) RegexpMatchEasy1_1K 242ns ± 0% 243ns ± 0% +0.41% (p=0.000 n=10+10) RegexpMatchMedium_32 87.4ns ± 0% 89.9ns ± 0% +2.86% (p=0.000 n=10+10) RegexpMatchMedium_1K 27.4µs ± 0% 27.4µs ± 0% +0.15% (p=0.000 n=10+10) RegexpMatchHard_32 1.30µs ± 0% 1.32µs ± 1% +1.91% (p=0.000 n=10+10) RegexpMatchHard_1K 39.0µs ± 0% 39.5µs ± 0% +1.38% (p=0.000 n=10+10) Revcomp 316ms ± 0% 319ms ± 0% +1.13% (p=0.000 n=9+8) Template 40.6ms ± 0% 40.6ms ± 0% ~ (p=0.123 n=10+10) TimeParse 224ns ± 0% 224ns ± 0% ~ (all equal) TimeFormat 230ns ± 0% 225ns ± 0% -2.17% (p=0.000 n=10+10) Change-Id: I32a099b65f9e6d4ad7288ed48546655c534757d8 Reviewed-on: https://go-review.googlesource.com/38630 Run-TryBot: Martin Möhrmann <moehrmann@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2017-03-24 08:13:17 +01:00
asm := v.Op.Asm()
// Use MOVL to move a small constant into a register
// when the constant is positive and fits into 32 bits.
if 0 <= v.AuxInt && v.AuxInt <= (1<<32-1) {
// The upper 32bit are zeroed automatically when using MOVL.
asm = x86.AMOVL
}
p := s.Prog(asm)
p.From.Type = obj.TYPE_CONST
p.From.Offset = v.AuxInt
p.To.Type = obj.TYPE_REG
p.To.Reg = x
// If flags are live at this instruction, suppress the
// MOV $0,AX -> XOR AX,AX optimization.
if v.Aux != nil {
p.Mark |= x86.PRESERVEFLAGS
}
case ssa.OpAMD64MOVSSconst, ssa.OpAMD64MOVSDconst:
x := v.Reg()
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_FCONST
p.From.Val = math.Float64frombits(uint64(v.AuxInt))
p.To.Type = obj.TYPE_REG
p.To.Reg = x
case ssa.OpAMD64MOVQload, ssa.OpAMD64MOVSSload, ssa.OpAMD64MOVSDload, ssa.OpAMD64MOVLload, ssa.OpAMD64MOVWload, ssa.OpAMD64MOVBload, ssa.OpAMD64MOVBQSXload, ssa.OpAMD64MOVWQSXload, ssa.OpAMD64MOVLQSXload, ssa.OpAMD64MOVOload:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_MEM
p.From.Reg = v.Args[0].Reg()
gc.AddAux(&p.From, v)
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpAMD64MOVQloadidx8, ssa.OpAMD64MOVSDloadidx8, ssa.OpAMD64MOVLloadidx8:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_MEM
p.From.Reg = v.Args[0].Reg()
gc.AddAux(&p.From, v)
p.From.Scale = 8
p.From.Index = v.Args[1].Reg()
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpAMD64MOVLloadidx4, ssa.OpAMD64MOVSSloadidx4:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_MEM
p.From.Reg = v.Args[0].Reg()
gc.AddAux(&p.From, v)
p.From.Scale = 4
p.From.Index = v.Args[1].Reg()
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpAMD64MOVWloadidx2:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_MEM
p.From.Reg = v.Args[0].Reg()
gc.AddAux(&p.From, v)
p.From.Scale = 2
p.From.Index = v.Args[1].Reg()
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpAMD64MOVBloadidx1, ssa.OpAMD64MOVWloadidx1, ssa.OpAMD64MOVLloadidx1, ssa.OpAMD64MOVQloadidx1, ssa.OpAMD64MOVSSloadidx1, ssa.OpAMD64MOVSDloadidx1:
r := v.Args[0].Reg()
i := v.Args[1].Reg()
if i == x86.REG_SP {
r, i = i, r
}
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_MEM
p.From.Reg = r
p.From.Scale = 1
p.From.Index = i
gc.AddAux(&p.From, v)
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpAMD64MOVQstore, ssa.OpAMD64MOVSSstore, ssa.OpAMD64MOVSDstore, ssa.OpAMD64MOVLstore, ssa.OpAMD64MOVWstore, ssa.OpAMD64MOVBstore, ssa.OpAMD64MOVOstore:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[1].Reg()
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
gc.AddAux(&p.To, v)
case ssa.OpAMD64MOVQstoreidx8, ssa.OpAMD64MOVSDstoreidx8, ssa.OpAMD64MOVLstoreidx8:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[2].Reg()
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
p.To.Scale = 8
p.To.Index = v.Args[1].Reg()
gc.AddAux(&p.To, v)
case ssa.OpAMD64MOVSSstoreidx4, ssa.OpAMD64MOVLstoreidx4:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[2].Reg()
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
p.To.Scale = 4
p.To.Index = v.Args[1].Reg()
gc.AddAux(&p.To, v)
case ssa.OpAMD64MOVWstoreidx2:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[2].Reg()
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
p.To.Scale = 2
p.To.Index = v.Args[1].Reg()
gc.AddAux(&p.To, v)
case ssa.OpAMD64MOVBstoreidx1, ssa.OpAMD64MOVWstoreidx1, ssa.OpAMD64MOVLstoreidx1, ssa.OpAMD64MOVQstoreidx1, ssa.OpAMD64MOVSSstoreidx1, ssa.OpAMD64MOVSDstoreidx1:
r := v.Args[0].Reg()
i := v.Args[1].Reg()
if i == x86.REG_SP {
r, i = i, r
}
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[2].Reg()
p.To.Type = obj.TYPE_MEM
p.To.Reg = r
p.To.Scale = 1
p.To.Index = i
gc.AddAux(&p.To, v)
case ssa.OpAMD64ADDQconstmem, ssa.OpAMD64ADDLconstmem:
sc := v.AuxValAndOff()
off := sc.Off()
val := sc.Val()
if val == 1 {
var asm obj.As
if v.Op == ssa.OpAMD64ADDQconstmem {
asm = x86.AINCQ
} else {
asm = x86.AINCL
}
p := s.Prog(asm)
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
gc.AddAux2(&p.To, v, off)
} else {
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_CONST
p.From.Offset = val
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
gc.AddAux2(&p.To, v, off)
}
case ssa.OpAMD64MOVQstoreconst, ssa.OpAMD64MOVLstoreconst, ssa.OpAMD64MOVWstoreconst, ssa.OpAMD64MOVBstoreconst:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_CONST
sc := v.AuxValAndOff()
p.From.Offset = sc.Val()
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
gc.AddAux2(&p.To, v, sc.Off())
case ssa.OpAMD64MOVQstoreconstidx1, ssa.OpAMD64MOVQstoreconstidx8, ssa.OpAMD64MOVLstoreconstidx1, ssa.OpAMD64MOVLstoreconstidx4, ssa.OpAMD64MOVWstoreconstidx1, ssa.OpAMD64MOVWstoreconstidx2, ssa.OpAMD64MOVBstoreconstidx1:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_CONST
sc := v.AuxValAndOff()
p.From.Offset = sc.Val()
r := v.Args[0].Reg()
i := v.Args[1].Reg()
switch v.Op {
case ssa.OpAMD64MOVBstoreconstidx1, ssa.OpAMD64MOVWstoreconstidx1, ssa.OpAMD64MOVLstoreconstidx1, ssa.OpAMD64MOVQstoreconstidx1:
p.To.Scale = 1
if i == x86.REG_SP {
r, i = i, r
}
case ssa.OpAMD64MOVWstoreconstidx2:
p.To.Scale = 2
case ssa.OpAMD64MOVLstoreconstidx4:
p.To.Scale = 4
case ssa.OpAMD64MOVQstoreconstidx8:
p.To.Scale = 8
}
p.To.Type = obj.TYPE_MEM
p.To.Reg = r
p.To.Index = i
gc.AddAux2(&p.To, v, sc.Off())
case ssa.OpAMD64MOVLQSX, ssa.OpAMD64MOVWQSX, ssa.OpAMD64MOVBQSX, ssa.OpAMD64MOVLQZX, ssa.OpAMD64MOVWQZX, ssa.OpAMD64MOVBQZX,
ssa.OpAMD64CVTTSS2SL, ssa.OpAMD64CVTTSD2SL, ssa.OpAMD64CVTTSS2SQ, ssa.OpAMD64CVTTSD2SQ,
ssa.OpAMD64CVTSS2SD, ssa.OpAMD64CVTSD2SS:
opregreg(s, v.Op.Asm(), v.Reg(), v.Args[0].Reg())
cmd/compile/internal/amd64: break dependency for CVTS[LQ]2S[DS] CVTSL2SS, CVTSQ2SS, CVTSL2SD, CVTSQ2SD preserve upper part of xmm register, introducing false dependency on a previous value. Break it by xoring destination with itself. Increases size of go executable by 320 bytes, but shows nice improvement on go1. Also fixes performance degradation introduced by 1.7. name old time/op new time/op delta BinaryTree17-4 2.20s ± 1% 2.19s ± 0% -0.36% (p=0.000 n=18+16) Fannkuch11-4 2.44s ± 1% 2.45s ± 2% +0.47% (p=0.030 n=20+20) FmtFprintfEmpty-4 40.9ns ± 7% 40.5ns ± 1% ~ (p=0.531 n=20+16) FmtFprintfString-4 111ns ± 2% 111ns ± 1% ~ (p=0.510 n=18+19) FmtFprintfInt-4 98.3ns ± 3% 99.3ns ± 1% +1.01% (p=0.003 n=20+18) FmtFprintfIntInt-4 148ns ± 3% 147ns ± 1% ~ (p=0.919 n=20+17) FmtFprintfPrefixedInt-4 149ns ± 1% 152ns ± 0% +1.73% (p=0.000 n=19+17) FmtFprintfFloat-4 231ns ± 0% 231ns ± 1% ~ (p=0.678 n=18+19) FmtManyArgs-4 667ns ± 1% 672ns ± 1% +0.73% (p=0.005 n=20+20) GobDecode-4 5.60ms ± 0% 5.61ms ± 0% +0.24% (p=0.000 n=20+20) GobEncode-4 4.74ms ± 0% 4.73ms ± 1% -0.20% (p=0.002 n=20+20) Gzip-4 199ms ± 0% 199ms ± 1% +0.35% (p=0.000 n=19+20) Gunzip-4 31.8ms ± 1% 31.5ms ± 1% -0.89% (p=0.000 n=20+20) HTTPClientServer-4 38.1µs ± 1% 38.0µs ± 1% ~ (p=0.117 n=19+18) JSONEncode-4 14.2ms ± 1% 13.4ms ± 0% -5.73% (p=0.000 n=20+20) JSONDecode-4 42.7ms ± 0% 42.7ms ± 1% +0.18% (p=0.019 n=18+19) Mandelbrot200-4 3.26ms ± 0% 2.99ms ± 0% -8.38% (p=0.000 n=19+19) GoParse-4 2.76ms ± 1% 2.76ms ± 1% ~ (p=0.583 n=20+20) RegexpMatchEasy0_32-4 69.5ns ± 0% 69.6ns ± 0% +0.10% (p=0.017 n=16+17) RegexpMatchEasy0_1K-4 703ns ± 0% 708ns ± 3% +0.65% (p=0.000 n=17+18) RegexpMatchEasy1_32-4 68.2ns ± 1% 68.2ns ± 2% ~ (p=0.094 n=18+20) RegexpMatchEasy1_1K-4 288ns ± 1% 288ns ± 0% ~ (p=0.403 n=17+18) RegexpMatchMedium_32-4 104ns ± 2% 103ns ± 1% ~ (p=0.110 n=20+16) RegexpMatchMedium_1K-4 31.7µs ± 3% 31.7µs ± 3% ~ (p=0.091 n=19+20) RegexpMatchHard_32-4 1.59µs ± 2% 1.58µs ± 2% ~ (p=0.083 n=20+20) RegexpMatchHard_1K-4 48.1µs ± 3% 47.9µs ± 2% ~ (p=0.461 n=20+19) Revcomp-4 344ms ± 0% 345ms ± 0% +0.08% (p=0.009 n=18+17) Template-4 44.8ms ± 1% 44.7ms ± 1% ~ (p=0.277 n=20+20) TimeParse-4 258ns ± 0% 258ns ± 0% ~ (all samples are equal) TimeFormat-4 275ns ± 0% 273ns ± 0% -0.64% (p=0.000 n=20+18) name old speed new speed delta GobDecode-4 137MB/s ± 0% 137MB/s ± 0% -0.24% (p=0.000 n=20+20) GobEncode-4 162MB/s ± 0% 162MB/s ± 0% +0.20% (p=0.002 n=20+20) Gzip-4 97.6MB/s ± 0% 97.3MB/s ± 1% -0.35% (p=0.000 n=19+20) Gunzip-4 610MB/s ± 1% 615MB/s ± 1% +0.89% (p=0.000 n=20+20) JSONEncode-4 136MB/s ± 1% 145MB/s ± 0% +6.08% (p=0.000 n=20+20) JSONDecode-4 45.5MB/s ± 0% 45.4MB/s ± 1% -0.17% (p=0.017 n=18+19) GoParse-4 21.0MB/s ± 1% 21.0MB/s ± 1% ~ (p=0.578 n=20+20) RegexpMatchEasy0_32-4 460MB/s ± 0% 460MB/s ± 0% -0.09% (p=0.031 n=16+17) RegexpMatchEasy0_1K-4 1.46GB/s ± 0% 1.45GB/s ± 3% -0.64% (p=0.000 n=17+18) RegexpMatchEasy1_32-4 469MB/s ± 0% 469MB/s ± 2% +0.06% (p=0.043 n=18+20) RegexpMatchEasy1_1K-4 3.55GB/s ± 1% 3.55GB/s ± 0% ~ (p=0.057 n=17+18) RegexpMatchMedium_32-4 9.61MB/s ± 2% 9.64MB/s ± 2% ~ (p=0.856 n=20+20) RegexpMatchMedium_1K-4 32.3MB/s ± 3% 32.3MB/s ± 3% ~ (p=0.085 n=19+20) RegexpMatchHard_32-4 20.1MB/s ± 2% 20.2MB/s ± 2% ~ (p=0.086 n=20+20) RegexpMatchHard_1K-4 21.3MB/s ± 3% 21.4MB/s ± 2% ~ (p=0.578 n=20+20) Revcomp-4 738MB/s ± 0% 737MB/s ± 0% -0.08% (p=0.009 n=18+17) Template-4 43.3MB/s ± 1% 43.4MB/s ± 1% ~ (p=0.274 n=20+20) Fixes #16982 Change-Id: If574d66f39f4183a9b1d5ffff0339909cc73f59d Reviewed-on: https://go-review.googlesource.com/31490 Run-TryBot: Ilya Tocar <ilya.tocar@intel.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2016-10-19 20:21:42 +03:00
case ssa.OpAMD64CVTSL2SD, ssa.OpAMD64CVTSQ2SD, ssa.OpAMD64CVTSQ2SS, ssa.OpAMD64CVTSL2SS:
r := v.Reg()
// Break false dependency on destination register.
opregreg(s, x86.AXORPS, r, r)
opregreg(s, v.Op.Asm(), r, v.Args[0].Reg())
case ssa.OpAMD64MOVQi2f, ssa.OpAMD64MOVQf2i:
p := s.Prog(x86.AMOVQ)
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[0].Reg()
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpAMD64MOVLi2f, ssa.OpAMD64MOVLf2i:
p := s.Prog(x86.AMOVL)
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[0].Reg()
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpAMD64ADDQmem, ssa.OpAMD64ADDLmem, ssa.OpAMD64SUBQmem, ssa.OpAMD64SUBLmem,
ssa.OpAMD64ANDQmem, ssa.OpAMD64ANDLmem, ssa.OpAMD64ORQmem, ssa.OpAMD64ORLmem,
ssa.OpAMD64XORQmem, ssa.OpAMD64XORLmem, ssa.OpAMD64ADDSDmem, ssa.OpAMD64ADDSSmem,
ssa.OpAMD64SUBSDmem, ssa.OpAMD64SUBSSmem, ssa.OpAMD64MULSDmem, ssa.OpAMD64MULSSmem:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_MEM
p.From.Reg = v.Args[1].Reg()
gc.AddAux(&p.From, v)
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
if v.Reg() != v.Args[0].Reg() {
v.Fatalf("input[0] and output not in same register %s", v.LongString())
}
case ssa.OpAMD64DUFFZERO:
off := duffStart(v.AuxInt)
adj := duffAdj(v.AuxInt)
var p *obj.Prog
if adj != 0 {
cmd/compile/internal/ssa: use sse to zero on amd64 Use 16-byte stores instead of 8-byte stores to zero small blocks. Also switch to duffzero for 65+ bytes only, because for each duffzero call we also save/restore BP, so call requires 4 instructions and replacing it with 4 sse stores doesn't cause code-bloat. Also switch duffzero to use leaq, instead of addq to avoid clobbering flags. ClearFat8-6 0.54ns ± 0% 0.54ns ± 0% ~ (all equal) ClearFat12-6 1.07ns ± 0% 1.07ns ± 0% ~ (all equal) ClearFat16-6 1.07ns ± 0% 0.69ns ± 0% -35.51% (p=0.001 n=8+9) ClearFat24-6 1.61ns ± 1% 1.07ns ± 0% -33.33% (p=0.000 n=10+10) ClearFat32-6 2.14ns ± 0% 1.07ns ± 0% -50.00% (p=0.001 n=8+9) ClearFat40-6 2.67ns ± 1% 1.61ns ± 0% -39.72% (p=0.000 n=10+8) ClearFat48-6 3.75ns ± 0% 2.68ns ± 0% -28.59% (p=0.000 n=9+9) ClearFat56-6 4.29ns ± 0% 3.22ns ± 0% -25.10% (p=0.000 n=9+9) ClearFat64-6 4.30ns ± 0% 3.22ns ± 0% -25.15% (p=0.000 n=8+8) ClearFat128-6 7.50ns ± 1% 7.51ns ± 0% ~ (p=0.767 n=10+9) ClearFat256-6 13.9ns ± 1% 13.9ns ± 1% ~ (p=0.257 n=10+10) ClearFat512-6 26.8ns ± 0% 26.8ns ± 0% ~ (p=0.467 n=8+8) ClearFat1024-6 52.5ns ± 0% 52.5ns ± 0% ~ (p=1.000 n=8+8) Also shaves ~20kb from go tool: go_old 10384994 go_new 10364514 [-20480 bytes] section differences global text (code) = -20585 bytes (-0.532047%) read-only data = -302 bytes (-0.018101%) Total difference -20887 bytes (-0.348731%) Change-Id: I15854e87544545c1af24775df895e38e16e12694 Reviewed-on: https://go-review.googlesource.com/54410 Run-TryBot: Ilya Tocar <ilya.tocar@intel.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2017-08-09 14:50:58 -05:00
p = s.Prog(x86.ALEAQ)
p.From.Type = obj.TYPE_MEM
p.From.Offset = adj
cmd/compile/internal/ssa: use sse to zero on amd64 Use 16-byte stores instead of 8-byte stores to zero small blocks. Also switch to duffzero for 65+ bytes only, because for each duffzero call we also save/restore BP, so call requires 4 instructions and replacing it with 4 sse stores doesn't cause code-bloat. Also switch duffzero to use leaq, instead of addq to avoid clobbering flags. ClearFat8-6 0.54ns ± 0% 0.54ns ± 0% ~ (all equal) ClearFat12-6 1.07ns ± 0% 1.07ns ± 0% ~ (all equal) ClearFat16-6 1.07ns ± 0% 0.69ns ± 0% -35.51% (p=0.001 n=8+9) ClearFat24-6 1.61ns ± 1% 1.07ns ± 0% -33.33% (p=0.000 n=10+10) ClearFat32-6 2.14ns ± 0% 1.07ns ± 0% -50.00% (p=0.001 n=8+9) ClearFat40-6 2.67ns ± 1% 1.61ns ± 0% -39.72% (p=0.000 n=10+8) ClearFat48-6 3.75ns ± 0% 2.68ns ± 0% -28.59% (p=0.000 n=9+9) ClearFat56-6 4.29ns ± 0% 3.22ns ± 0% -25.10% (p=0.000 n=9+9) ClearFat64-6 4.30ns ± 0% 3.22ns ± 0% -25.15% (p=0.000 n=8+8) ClearFat128-6 7.50ns ± 1% 7.51ns ± 0% ~ (p=0.767 n=10+9) ClearFat256-6 13.9ns ± 1% 13.9ns ± 1% ~ (p=0.257 n=10+10) ClearFat512-6 26.8ns ± 0% 26.8ns ± 0% ~ (p=0.467 n=8+8) ClearFat1024-6 52.5ns ± 0% 52.5ns ± 0% ~ (p=1.000 n=8+8) Also shaves ~20kb from go tool: go_old 10384994 go_new 10364514 [-20480 bytes] section differences global text (code) = -20585 bytes (-0.532047%) read-only data = -302 bytes (-0.018101%) Total difference -20887 bytes (-0.348731%) Change-Id: I15854e87544545c1af24775df895e38e16e12694 Reviewed-on: https://go-review.googlesource.com/54410 Run-TryBot: Ilya Tocar <ilya.tocar@intel.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2017-08-09 14:50:58 -05:00
p.From.Reg = x86.REG_DI
p.To.Type = obj.TYPE_REG
p.To.Reg = x86.REG_DI
}
p = s.Prog(obj.ADUFFZERO)
p.To.Type = obj.TYPE_ADDR
p.To.Sym = gc.Duffzero
p.To.Offset = off
case ssa.OpAMD64MOVOconst:
if v.AuxInt != 0 {
v.Fatalf("MOVOconst can only do constant=0")
}
r := v.Reg()
opregreg(s, x86.AXORPS, r, r)
case ssa.OpAMD64DUFFCOPY:
p := s.Prog(obj.ADUFFCOPY)
p.To.Type = obj.TYPE_ADDR
p.To.Sym = gc.Duffcopy
p.To.Offset = v.AuxInt
case ssa.OpAMD64MOVQconvert, ssa.OpAMD64MOVLconvert:
if v.Args[0].Reg() != v.Reg() {
v.Fatalf("MOVXconvert should be a no-op")
}
case ssa.OpCopy: // TODO: use MOVQreg for reg->reg copies instead of OpCopy?
if v.Type.IsMemory() {
return
}
x := v.Args[0].Reg()
y := v.Reg()
if x != y {
opregreg(s, moveByType(v.Type), y, x)
}
case ssa.OpLoadReg:
if v.Type.IsFlags() {
v.Fatalf("load flags not implemented: %v", v.LongString())
return
}
p := s.Prog(loadByType(v.Type))
gc.AddrAuto(&p.From, v.Args[0])
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpStoreReg:
if v.Type.IsFlags() {
v.Fatalf("store flags not implemented: %v", v.LongString())
return
}
p := s.Prog(storeByType(v.Type))
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[0].Reg()
gc.AddrAuto(&p.To, v)
case ssa.OpAMD64LoweredGetClosurePtr:
// Closure pointer is DX.
gc.CheckLoweredGetClosurePtr(v)
case ssa.OpAMD64LoweredGetG:
r := v.Reg()
// See the comments in cmd/internal/obj/x86/obj6.go
// near CanUse1InsnTLS for a detailed explanation of these instructions.
if x86.CanUse1InsnTLS(gc.Ctxt) {
// MOVQ (TLS), r
p := s.Prog(x86.AMOVQ)
p.From.Type = obj.TYPE_MEM
p.From.Reg = x86.REG_TLS
p.To.Type = obj.TYPE_REG
p.To.Reg = r
} else {
// MOVQ TLS, r
// MOVQ (r)(TLS*1), r
p := s.Prog(x86.AMOVQ)
p.From.Type = obj.TYPE_REG
p.From.Reg = x86.REG_TLS
p.To.Type = obj.TYPE_REG
p.To.Reg = r
q := s.Prog(x86.AMOVQ)
q.From.Type = obj.TYPE_MEM
q.From.Reg = r
q.From.Index = x86.REG_TLS
q.From.Scale = 1
q.To.Type = obj.TYPE_REG
q.To.Reg = r
}
case ssa.OpAMD64CALLstatic, ssa.OpAMD64CALLclosure, ssa.OpAMD64CALLinter:
s.Call(v)
case ssa.OpAMD64LoweredGetCallerPC:
p := s.Prog(x86.AMOVQ)
p.From.Type = obj.TYPE_MEM
p.From.Offset = -8 // PC is stored 8 bytes below first parameter.
p.From.Name = obj.NAME_PARAM
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpAMD64NEGQ, ssa.OpAMD64NEGL,
ssa.OpAMD64BSWAPQ, ssa.OpAMD64BSWAPL,
ssa.OpAMD64NOTQ, ssa.OpAMD64NOTL:
r := v.Reg()
if r != v.Args[0].Reg() {
v.Fatalf("input[0] and output not in same register %s", v.LongString())
}
p := s.Prog(v.Op.Asm())
p.To.Type = obj.TYPE_REG
p.To.Reg = r
case ssa.OpAMD64BSFQ, ssa.OpAMD64BSFL, ssa.OpAMD64BSRQ, ssa.OpAMD64BSRL:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[0].Reg()
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg0()
case ssa.OpAMD64SQRTSD:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[0].Reg()
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpAMD64POPCNTQ, ssa.OpAMD64POPCNTL:
if v.Args[0].Reg() != v.Reg() {
// POPCNT on Intel has a false dependency on the destination register.
// Zero the destination to break the dependency.
p := s.Prog(x86.AMOVQ)
p.From.Type = obj.TYPE_CONST
p.From.Offset = 0
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
}
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[0].Reg()
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpAMD64SETEQ, ssa.OpAMD64SETNE,
ssa.OpAMD64SETL, ssa.OpAMD64SETLE,
ssa.OpAMD64SETG, ssa.OpAMD64SETGE,
ssa.OpAMD64SETGF, ssa.OpAMD64SETGEF,
ssa.OpAMD64SETB, ssa.OpAMD64SETBE,
ssa.OpAMD64SETORD, ssa.OpAMD64SETNAN,
ssa.OpAMD64SETA, ssa.OpAMD64SETAE:
p := s.Prog(v.Op.Asm())
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpAMD64SETEQmem, ssa.OpAMD64SETNEmem,
ssa.OpAMD64SETLmem, ssa.OpAMD64SETLEmem,
ssa.OpAMD64SETGmem, ssa.OpAMD64SETGEmem,
ssa.OpAMD64SETBmem, ssa.OpAMD64SETBEmem,
ssa.OpAMD64SETAmem, ssa.OpAMD64SETAEmem:
p := s.Prog(v.Op.Asm())
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
gc.AddAux(&p.To, v)
case ssa.OpAMD64SETNEF:
p := s.Prog(v.Op.Asm())
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
q := s.Prog(x86.ASETPS)
q.To.Type = obj.TYPE_REG
q.To.Reg = x86.REG_AX
// ORL avoids partial register write and is smaller than ORQ, used by old compiler
opregreg(s, x86.AORL, v.Reg(), x86.REG_AX)
case ssa.OpAMD64SETEQF:
p := s.Prog(v.Op.Asm())
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
q := s.Prog(x86.ASETPC)
q.To.Type = obj.TYPE_REG
q.To.Reg = x86.REG_AX
// ANDL avoids partial register write and is smaller than ANDQ, used by old compiler
opregreg(s, x86.AANDL, v.Reg(), x86.REG_AX)
case ssa.OpAMD64InvertFlags:
v.Fatalf("InvertFlags should never make it to codegen %v", v.LongString())
case ssa.OpAMD64FlagEQ, ssa.OpAMD64FlagLT_ULT, ssa.OpAMD64FlagLT_UGT, ssa.OpAMD64FlagGT_ULT, ssa.OpAMD64FlagGT_UGT:
v.Fatalf("Flag* ops should never make it to codegen %v", v.LongString())
case ssa.OpAMD64AddTupleFirst32, ssa.OpAMD64AddTupleFirst64:
v.Fatalf("AddTupleFirst* should never make it to codegen %v", v.LongString())
case ssa.OpAMD64REPSTOSQ:
s.Prog(x86.AREP)
s.Prog(x86.ASTOSQ)
case ssa.OpAMD64REPMOVSQ:
s.Prog(x86.AREP)
s.Prog(x86.AMOVSQ)
case ssa.OpAMD64LoweredNilCheck:
// Issue a load which will fault if the input is nil.
// TODO: We currently use the 2-byte instruction TESTB AX, (reg).
// Should we use the 3-byte TESTB $0, (reg) instead? It is larger
// but it doesn't have false dependency on AX.
// Or maybe allocate an output register and use MOVL (reg),reg2 ?
// That trades clobbering flags for clobbering a register.
p := s.Prog(x86.ATESTB)
p.From.Type = obj.TYPE_REG
p.From.Reg = x86.REG_AX
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
gc.AddAux(&p.To, v)
if gc.Debug_checknil != 0 && v.Pos.Line() > 1 { // v.Pos.Line()==1 in generated wrappers
gc.Warnl(v.Pos, "generated nil check")
}
case ssa.OpAMD64MOVLatomicload, ssa.OpAMD64MOVQatomicload:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_MEM
p.From.Reg = v.Args[0].Reg()
gc.AddAux(&p.From, v)
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg0()
case ssa.OpAMD64XCHGL, ssa.OpAMD64XCHGQ:
r := v.Reg0()
if r != v.Args[0].Reg() {
v.Fatalf("input[0] and output[0] not in same register %s", v.LongString())
}
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = r
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[1].Reg()
gc.AddAux(&p.To, v)
case ssa.OpAMD64XADDLlock, ssa.OpAMD64XADDQlock:
r := v.Reg0()
if r != v.Args[0].Reg() {
v.Fatalf("input[0] and output[0] not in same register %s", v.LongString())
}
s.Prog(x86.ALOCK)
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = r
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[1].Reg()
gc.AddAux(&p.To, v)
case ssa.OpAMD64CMPXCHGLlock, ssa.OpAMD64CMPXCHGQlock:
if v.Args[1].Reg() != x86.REG_AX {
v.Fatalf("input[1] not in AX %s", v.LongString())
}
s.Prog(x86.ALOCK)
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[2].Reg()
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
gc.AddAux(&p.To, v)
p = s.Prog(x86.ASETEQ)
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg0()
case ssa.OpAMD64ANDBlock, ssa.OpAMD64ORBlock:
s.Prog(x86.ALOCK)
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[1].Reg()
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
gc.AddAux(&p.To, v)
case ssa.OpClobber:
p := s.Prog(x86.AMOVL)
p.From.Type = obj.TYPE_CONST
p.From.Offset = 0xdeaddead
p.To.Type = obj.TYPE_MEM
p.To.Reg = x86.REG_SP
gc.AddAux(&p.To, v)
p = s.Prog(x86.AMOVL)
p.From.Type = obj.TYPE_CONST
p.From.Offset = 0xdeaddead
p.To.Type = obj.TYPE_MEM
p.To.Reg = x86.REG_SP
gc.AddAux(&p.To, v)
p.To.Offset += 4
default:
v.Fatalf("genValue not implemented: %s", v.LongString())
}
}
var blockJump = [...]struct {
asm, invasm obj.As
}{
ssa.BlockAMD64EQ: {x86.AJEQ, x86.AJNE},
ssa.BlockAMD64NE: {x86.AJNE, x86.AJEQ},
ssa.BlockAMD64LT: {x86.AJLT, x86.AJGE},
ssa.BlockAMD64GE: {x86.AJGE, x86.AJLT},
ssa.BlockAMD64LE: {x86.AJLE, x86.AJGT},
ssa.BlockAMD64GT: {x86.AJGT, x86.AJLE},
ssa.BlockAMD64ULT: {x86.AJCS, x86.AJCC},
ssa.BlockAMD64UGE: {x86.AJCC, x86.AJCS},
ssa.BlockAMD64UGT: {x86.AJHI, x86.AJLS},
ssa.BlockAMD64ULE: {x86.AJLS, x86.AJHI},
ssa.BlockAMD64ORD: {x86.AJPC, x86.AJPS},
ssa.BlockAMD64NAN: {x86.AJPS, x86.AJPC},
}
var eqfJumps = [2][2]gc.FloatingEQNEJump{
{{Jump: x86.AJNE, Index: 1}, {Jump: x86.AJPS, Index: 1}}, // next == b.Succs[0]
{{Jump: x86.AJNE, Index: 1}, {Jump: x86.AJPC, Index: 0}}, // next == b.Succs[1]
}
var nefJumps = [2][2]gc.FloatingEQNEJump{
{{Jump: x86.AJNE, Index: 0}, {Jump: x86.AJPC, Index: 1}}, // next == b.Succs[0]
{{Jump: x86.AJNE, Index: 0}, {Jump: x86.AJPS, Index: 0}}, // next == b.Succs[1]
}
func ssaGenBlock(s *gc.SSAGenState, b, next *ssa.Block) {
switch b.Kind {
case ssa.BlockPlain:
if b.Succs[0].Block() != next {
p := s.Prog(obj.AJMP)
p.To.Type = obj.TYPE_BRANCH
s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[0].Block()})
}
case ssa.BlockDefer:
// defer returns in rax:
// 0 if we should continue executing
// 1 if we should jump to deferreturn call
p := s.Prog(x86.ATESTL)
p.From.Type = obj.TYPE_REG
p.From.Reg = x86.REG_AX
p.To.Type = obj.TYPE_REG
p.To.Reg = x86.REG_AX
p = s.Prog(x86.AJNE)
p.To.Type = obj.TYPE_BRANCH
s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[1].Block()})
if b.Succs[0].Block() != next {
p := s.Prog(obj.AJMP)
p.To.Type = obj.TYPE_BRANCH
s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[0].Block()})
}
case ssa.BlockExit:
s.Prog(obj.AUNDEF) // tell plive.go that we never reach here
case ssa.BlockRet:
s.Prog(obj.ARET)
case ssa.BlockRetJmp:
p := s.Prog(obj.AJMP)
p.To.Type = obj.TYPE_MEM
p.To.Name = obj.NAME_EXTERN
p.To.Sym = b.Aux.(*obj.LSym)
case ssa.BlockAMD64EQF:
s.FPJump(b, next, &eqfJumps)
case ssa.BlockAMD64NEF:
s.FPJump(b, next, &nefJumps)
case ssa.BlockAMD64EQ, ssa.BlockAMD64NE,
ssa.BlockAMD64LT, ssa.BlockAMD64GE,
ssa.BlockAMD64LE, ssa.BlockAMD64GT,
ssa.BlockAMD64ULT, ssa.BlockAMD64UGT,
ssa.BlockAMD64ULE, ssa.BlockAMD64UGE:
jmp := blockJump[b.Kind]
var p *obj.Prog
switch next {
case b.Succs[0].Block():
p = s.Prog(jmp.invasm)
p.To.Type = obj.TYPE_BRANCH
s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[1].Block()})
case b.Succs[1].Block():
p = s.Prog(jmp.asm)
p.To.Type = obj.TYPE_BRANCH
s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[0].Block()})
default:
p = s.Prog(jmp.asm)
p.To.Type = obj.TYPE_BRANCH
s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[0].Block()})
q := s.Prog(obj.AJMP)
q.To.Type = obj.TYPE_BRANCH
s.Branches = append(s.Branches, gc.Branch{P: q, B: b.Succs[1].Block()})
}
default:
b.Fatalf("branch not implemented: %s. Control: %s", b.LongString(), b.Control.LongString())
}
}