go/src/cmd/compile/internal/arm/ssa.go

867 lines
24 KiB
Go
Raw Normal View History

// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package arm
import (
"fmt"
"math"
"cmd/compile/internal/gc"
"cmd/compile/internal/ssa"
cmd/compile: change ssa.Type into *types.Type When package ssa was created, Type was in package gc. To avoid circular dependencies, we used an interface (ssa.Type) to represent type information in SSA. In the Go 1.9 cycle, gri extricated the Type type from package gc. As a result, we can now use it in package ssa. Now, instead of package types depending on package ssa, it is the other way. This is a more sensible dependency tree, and helps compiler performance a bit. Though this is a big CL, most of the changes are mechanical and uninteresting. Interesting bits: * Add new singleton globals to package types for the special SSA types Memory, Void, Invalid, Flags, and Int128. * Add two new Types, TSSA for the special types, and TTUPLE, for SSA tuple types. ssa.MakeTuple is now types.NewTuple. * Move type comparison result constants CMPlt, CMPeq, and CMPgt to package types. * We had picked the name "types" in our rules for the handy list of types provided by ssa.Config. That conflicted with the types package name, so change it to "typ". * Update the type comparison routine to handle tuples and special types inline. * Teach gc/fmt.go how to print special types. * We can now eliminate ElemTypes in favor of just Elem, and probably also some other duplicated Type methods designed to return ssa.Type instead of *types.Type. * The ssa tests were using their own dummy types, and they were not particularly careful about types in general. Of necessity, this CL switches them to use *types.Type; it does not make them more type-accurate. Unfortunately, using types.Type means initializing a bit of the types universe. This is prime for refactoring and improvement. This shrinks ssa.Value; it now fits in a smaller size class on 64 bit systems. This doesn't have a giant impact, though, since most Values are preallocated in a chunk. 
name old alloc/op new alloc/op delta Template 37.9MB ± 0% 37.7MB ± 0% -0.57% (p=0.000 n=10+8) Unicode 28.9MB ± 0% 28.7MB ± 0% -0.52% (p=0.000 n=10+10) GoTypes 110MB ± 0% 109MB ± 0% -0.88% (p=0.000 n=10+10) Flate 24.7MB ± 0% 24.6MB ± 0% -0.66% (p=0.000 n=10+10) GoParser 31.1MB ± 0% 30.9MB ± 0% -0.61% (p=0.000 n=10+9) Reflect 73.9MB ± 0% 73.4MB ± 0% -0.62% (p=0.000 n=10+8) Tar 25.8MB ± 0% 25.6MB ± 0% -0.77% (p=0.000 n=9+10) XML 41.2MB ± 0% 40.9MB ± 0% -0.80% (p=0.000 n=10+10) [Geo mean] 40.5MB 40.3MB -0.68% name old allocs/op new allocs/op delta Template 385k ± 0% 386k ± 0% ~ (p=0.356 n=10+9) Unicode 343k ± 1% 344k ± 0% ~ (p=0.481 n=10+10) GoTypes 1.16M ± 0% 1.16M ± 0% -0.16% (p=0.004 n=10+10) Flate 238k ± 1% 238k ± 1% ~ (p=0.853 n=10+10) GoParser 320k ± 0% 320k ± 0% ~ (p=0.720 n=10+9) Reflect 957k ± 0% 957k ± 0% ~ (p=0.460 n=10+8) Tar 252k ± 0% 252k ± 0% ~ (p=0.133 n=9+10) XML 400k ± 0% 400k ± 0% ~ (p=0.796 n=10+10) [Geo mean] 428k 428k -0.01% Removing all the interface calls helps non-trivially with CPU, though. 
name old time/op new time/op delta Template 178ms ± 4% 173ms ± 3% -2.90% (p=0.000 n=94+96) Unicode 85.0ms ± 4% 83.9ms ± 4% -1.23% (p=0.000 n=96+96) GoTypes 543ms ± 3% 528ms ± 3% -2.73% (p=0.000 n=98+96) Flate 116ms ± 3% 113ms ± 4% -2.34% (p=0.000 n=96+99) GoParser 144ms ± 3% 140ms ± 4% -2.80% (p=0.000 n=99+97) Reflect 344ms ± 3% 334ms ± 4% -3.02% (p=0.000 n=100+99) Tar 106ms ± 5% 103ms ± 4% -3.30% (p=0.000 n=98+94) XML 198ms ± 5% 192ms ± 4% -2.88% (p=0.000 n=92+95) [Geo mean] 178ms 173ms -2.65% name old user-time/op new user-time/op delta Template 229ms ± 5% 224ms ± 5% -2.36% (p=0.000 n=95+99) Unicode 107ms ± 6% 106ms ± 5% -1.13% (p=0.001 n=93+95) GoTypes 696ms ± 4% 679ms ± 4% -2.45% (p=0.000 n=97+99) Flate 137ms ± 4% 134ms ± 5% -2.66% (p=0.000 n=99+96) GoParser 176ms ± 5% 172ms ± 8% -2.27% (p=0.000 n=98+100) Reflect 430ms ± 6% 411ms ± 5% -4.46% (p=0.000 n=100+92) Tar 128ms ±13% 123ms ±13% -4.21% (p=0.000 n=100+100) XML 239ms ± 6% 233ms ± 6% -2.50% (p=0.000 n=95+97) [Geo mean] 220ms 213ms -2.76% Change-Id: I15c7d6268347f8358e75066dfdbd77db24e8d0c1 Reviewed-on: https://go-review.googlesource.com/42145 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2017-04-28 14:12:28 -07:00
"cmd/compile/internal/types"
"cmd/internal/obj"
"cmd/internal/obj/arm"
)
// loadByType returns the load instruction of the given type.
cmd/compile: change ssa.Type into *types.Type When package ssa was created, Type was in package gc. To avoid circular dependencies, we used an interface (ssa.Type) to represent type information in SSA. In the Go 1.9 cycle, gri extricated the Type type from package gc. As a result, we can now use it in package ssa. Now, instead of package types depending on package ssa, it is the other way. This is a more sensible dependency tree, and helps compiler performance a bit. Though this is a big CL, most of the changes are mechanical and uninteresting. Interesting bits: * Add new singleton globals to package types for the special SSA types Memory, Void, Invalid, Flags, and Int128. * Add two new Types, TSSA for the special types, and TTUPLE, for SSA tuple types. ssa.MakeTuple is now types.NewTuple. * Move type comparison result constants CMPlt, CMPeq, and CMPgt to package types. * We had picked the name "types" in our rules for the handy list of types provided by ssa.Config. That conflicted with the types package name, so change it to "typ". * Update the type comparison routine to handle tuples and special types inline. * Teach gc/fmt.go how to print special types. * We can now eliminate ElemTypes in favor of just Elem, and probably also some other duplicated Type methods designed to return ssa.Type instead of *types.Type. * The ssa tests were using their own dummy types, and they were not particularly careful about types in general. Of necessity, this CL switches them to use *types.Type; it does not make them more type-accurate. Unfortunately, using types.Type means initializing a bit of the types universe. This is prime for refactoring and improvement. This shrinks ssa.Value; it now fits in a smaller size class on 64 bit systems. This doesn't have a giant impact, though, since most Values are preallocated in a chunk. 
name old alloc/op new alloc/op delta Template 37.9MB ± 0% 37.7MB ± 0% -0.57% (p=0.000 n=10+8) Unicode 28.9MB ± 0% 28.7MB ± 0% -0.52% (p=0.000 n=10+10) GoTypes 110MB ± 0% 109MB ± 0% -0.88% (p=0.000 n=10+10) Flate 24.7MB ± 0% 24.6MB ± 0% -0.66% (p=0.000 n=10+10) GoParser 31.1MB ± 0% 30.9MB ± 0% -0.61% (p=0.000 n=10+9) Reflect 73.9MB ± 0% 73.4MB ± 0% -0.62% (p=0.000 n=10+8) Tar 25.8MB ± 0% 25.6MB ± 0% -0.77% (p=0.000 n=9+10) XML 41.2MB ± 0% 40.9MB ± 0% -0.80% (p=0.000 n=10+10) [Geo mean] 40.5MB 40.3MB -0.68% name old allocs/op new allocs/op delta Template 385k ± 0% 386k ± 0% ~ (p=0.356 n=10+9) Unicode 343k ± 1% 344k ± 0% ~ (p=0.481 n=10+10) GoTypes 1.16M ± 0% 1.16M ± 0% -0.16% (p=0.004 n=10+10) Flate 238k ± 1% 238k ± 1% ~ (p=0.853 n=10+10) GoParser 320k ± 0% 320k ± 0% ~ (p=0.720 n=10+9) Reflect 957k ± 0% 957k ± 0% ~ (p=0.460 n=10+8) Tar 252k ± 0% 252k ± 0% ~ (p=0.133 n=9+10) XML 400k ± 0% 400k ± 0% ~ (p=0.796 n=10+10) [Geo mean] 428k 428k -0.01% Removing all the interface calls helps non-trivially with CPU, though. 
name old time/op new time/op delta Template 178ms ± 4% 173ms ± 3% -2.90% (p=0.000 n=94+96) Unicode 85.0ms ± 4% 83.9ms ± 4% -1.23% (p=0.000 n=96+96) GoTypes 543ms ± 3% 528ms ± 3% -2.73% (p=0.000 n=98+96) Flate 116ms ± 3% 113ms ± 4% -2.34% (p=0.000 n=96+99) GoParser 144ms ± 3% 140ms ± 4% -2.80% (p=0.000 n=99+97) Reflect 344ms ± 3% 334ms ± 4% -3.02% (p=0.000 n=100+99) Tar 106ms ± 5% 103ms ± 4% -3.30% (p=0.000 n=98+94) XML 198ms ± 5% 192ms ± 4% -2.88% (p=0.000 n=92+95) [Geo mean] 178ms 173ms -2.65% name old user-time/op new user-time/op delta Template 229ms ± 5% 224ms ± 5% -2.36% (p=0.000 n=95+99) Unicode 107ms ± 6% 106ms ± 5% -1.13% (p=0.001 n=93+95) GoTypes 696ms ± 4% 679ms ± 4% -2.45% (p=0.000 n=97+99) Flate 137ms ± 4% 134ms ± 5% -2.66% (p=0.000 n=99+96) GoParser 176ms ± 5% 172ms ± 8% -2.27% (p=0.000 n=98+100) Reflect 430ms ± 6% 411ms ± 5% -4.46% (p=0.000 n=100+92) Tar 128ms ±13% 123ms ±13% -4.21% (p=0.000 n=100+100) XML 239ms ± 6% 233ms ± 6% -2.50% (p=0.000 n=95+97) [Geo mean] 220ms 213ms -2.76% Change-Id: I15c7d6268347f8358e75066dfdbd77db24e8d0c1 Reviewed-on: https://go-review.googlesource.com/42145 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2017-04-28 14:12:28 -07:00
// loadByType selects the ARM move opcode used to load a value of type t.
//
// Floating-point types map to MOVF (4 bytes) or MOVD (8 bytes). All other
// types map to a byte, halfword or word move; for the 1- and 2-byte widths
// the unsigned variant is chosen when t is not signed. Any other
// combination of kind and size panics with "bad load type".
func loadByType(t *types.Type) obj.As {
	// Floating-point values are handled first so that the integer
	// selection below never has to consider them.
	if t.IsFloat() {
		switch width := t.Size(); width {
		case 4:
			return arm.AMOVF
		case 8:
			return arm.AMOVD
		}
		panic("bad load type")
	}

	// Signedness only matters for the sub-word widths, so it is queried
	// only in those cases.
	switch width := t.Size(); {
	case width == 1 && t.IsSigned():
		return arm.AMOVB
	case width == 1:
		return arm.AMOVBU
	case width == 2 && t.IsSigned():
		return arm.AMOVH
	case width == 2:
		return arm.AMOVHU
	case width == 4:
		return arm.AMOVW
	}
	panic("bad load type")
}
// storeByType returns the store instruction of the given type.
cmd/compile: change ssa.Type into *types.Type When package ssa was created, Type was in package gc. To avoid circular dependencies, we used an interface (ssa.Type) to represent type information in SSA. In the Go 1.9 cycle, gri extricated the Type type from package gc. As a result, we can now use it in package ssa. Now, instead of package types depending on package ssa, it is the other way. This is a more sensible dependency tree, and helps compiler performance a bit. Though this is a big CL, most of the changes are mechanical and uninteresting. Interesting bits: * Add new singleton globals to package types for the special SSA types Memory, Void, Invalid, Flags, and Int128. * Add two new Types, TSSA for the special types, and TTUPLE, for SSA tuple types. ssa.MakeTuple is now types.NewTuple. * Move type comparison result constants CMPlt, CMPeq, and CMPgt to package types. * We had picked the name "types" in our rules for the handy list of types provided by ssa.Config. That conflicted with the types package name, so change it to "typ". * Update the type comparison routine to handle tuples and special types inline. * Teach gc/fmt.go how to print special types. * We can now eliminate ElemTypes in favor of just Elem, and probably also some other duplicated Type methods designed to return ssa.Type instead of *types.Type. * The ssa tests were using their own dummy types, and they were not particularly careful about types in general. Of necessity, this CL switches them to use *types.Type; it does not make them more type-accurate. Unfortunately, using types.Type means initializing a bit of the types universe. This is prime for refactoring and improvement. This shrinks ssa.Value; it now fits in a smaller size class on 64 bit systems. This doesn't have a giant impact, though, since most Values are preallocated in a chunk. 
name old alloc/op new alloc/op delta Template 37.9MB ± 0% 37.7MB ± 0% -0.57% (p=0.000 n=10+8) Unicode 28.9MB ± 0% 28.7MB ± 0% -0.52% (p=0.000 n=10+10) GoTypes 110MB ± 0% 109MB ± 0% -0.88% (p=0.000 n=10+10) Flate 24.7MB ± 0% 24.6MB ± 0% -0.66% (p=0.000 n=10+10) GoParser 31.1MB ± 0% 30.9MB ± 0% -0.61% (p=0.000 n=10+9) Reflect 73.9MB ± 0% 73.4MB ± 0% -0.62% (p=0.000 n=10+8) Tar 25.8MB ± 0% 25.6MB ± 0% -0.77% (p=0.000 n=9+10) XML 41.2MB ± 0% 40.9MB ± 0% -0.80% (p=0.000 n=10+10) [Geo mean] 40.5MB 40.3MB -0.68% name old allocs/op new allocs/op delta Template 385k ± 0% 386k ± 0% ~ (p=0.356 n=10+9) Unicode 343k ± 1% 344k ± 0% ~ (p=0.481 n=10+10) GoTypes 1.16M ± 0% 1.16M ± 0% -0.16% (p=0.004 n=10+10) Flate 238k ± 1% 238k ± 1% ~ (p=0.853 n=10+10) GoParser 320k ± 0% 320k ± 0% ~ (p=0.720 n=10+9) Reflect 957k ± 0% 957k ± 0% ~ (p=0.460 n=10+8) Tar 252k ± 0% 252k ± 0% ~ (p=0.133 n=9+10) XML 400k ± 0% 400k ± 0% ~ (p=0.796 n=10+10) [Geo mean] 428k 428k -0.01% Removing all the interface calls helps non-trivially with CPU, though. 
name old time/op new time/op delta Template 178ms ± 4% 173ms ± 3% -2.90% (p=0.000 n=94+96) Unicode 85.0ms ± 4% 83.9ms ± 4% -1.23% (p=0.000 n=96+96) GoTypes 543ms ± 3% 528ms ± 3% -2.73% (p=0.000 n=98+96) Flate 116ms ± 3% 113ms ± 4% -2.34% (p=0.000 n=96+99) GoParser 144ms ± 3% 140ms ± 4% -2.80% (p=0.000 n=99+97) Reflect 344ms ± 3% 334ms ± 4% -3.02% (p=0.000 n=100+99) Tar 106ms ± 5% 103ms ± 4% -3.30% (p=0.000 n=98+94) XML 198ms ± 5% 192ms ± 4% -2.88% (p=0.000 n=92+95) [Geo mean] 178ms 173ms -2.65% name old user-time/op new user-time/op delta Template 229ms ± 5% 224ms ± 5% -2.36% (p=0.000 n=95+99) Unicode 107ms ± 6% 106ms ± 5% -1.13% (p=0.001 n=93+95) GoTypes 696ms ± 4% 679ms ± 4% -2.45% (p=0.000 n=97+99) Flate 137ms ± 4% 134ms ± 5% -2.66% (p=0.000 n=99+96) GoParser 176ms ± 5% 172ms ± 8% -2.27% (p=0.000 n=98+100) Reflect 430ms ± 6% 411ms ± 5% -4.46% (p=0.000 n=100+92) Tar 128ms ±13% 123ms ±13% -4.21% (p=0.000 n=100+100) XML 239ms ± 6% 233ms ± 6% -2.50% (p=0.000 n=95+97) [Geo mean] 220ms 213ms -2.76% Change-Id: I15c7d6268347f8358e75066dfdbd77db24e8d0c1 Reviewed-on: https://go-review.googlesource.com/42145 Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2017-04-28 14:12:28 -07:00
// storeByType selects the ARM move opcode used to store a value of type t.
//
// Floating-point types map to MOVF (4 bytes) or MOVD (8 bytes). All other
// types map to MOVB, MOVH or MOVW according to their size; signedness is
// not consulted. Any other combination of kind and size panics with
// "bad store type".
func storeByType(t *types.Type) obj.As {
	// Floating-point values are handled first; an unsupported float width
	// must not fall through to the integer opcodes.
	if t.IsFloat() {
		switch width := t.Size(); width {
		case 4:
			return arm.AMOVF
		case 8:
			return arm.AMOVD
		}
		panic("bad store type")
	}

	switch width := t.Size(); width {
	case 1:
		return arm.AMOVB
	case 2:
		return arm.AMOVH
	case 4:
		return arm.AMOVW
	}
	panic("bad store type")
}
// shift is the value placed in the Offset field of an obj.TYPE_SHIFT
// operand; it encodes a shifted register operand.
//
// Layout as decoded by String:
//   - bits 0-3:  the register being shifted
//   - bit 4:     set when the shift amount is held in a register
//   - bits 5-6:  the shift operator, printed as <<, >>, -> or @>
//   - bits 7-11: the shift amount, when it is a constant
//   - bits 8-11: the register holding the shift amount, when bit 4 is set
type shift int64

// String renders the encoded operand in assembler-like syntax, for example
// "R2->7" for a constant shift or "R3>>R5" for a register shift.
//
// copied from ../../../internal/obj/util.go:/TYPE_SHIFT
func (v shift) String() string {
	// Each operator occupies two characters of this table.
	const ops = "<<>>->@>"
	idx := int((v>>5)&3) * 2
	sym := ops[idx : idx+2]
	reg := int64(v) & 15

	if v&(1<<4) == 0 {
		// constant shift
		return fmt.Sprintf("R%d%s%d", reg, sym, int64(v>>7)&31)
	}
	// register shift
	return fmt.Sprintf("R%d%sR%d", reg, sym, int64(v>>8)&15)
}
// makeshift builds the shift encoding for register reg shifted by the
// constant s. Only the low 4 bits of reg and the low 5 bits of s are kept;
// typ is OR-ed in unchanged and selects the shift operator.
func makeshift(reg int16, typ int64, s int64) shift {
	base := int64(reg & 0xf) // bits 0-3: register being shifted
	amount := (s & 31) << 7  // bits 7-11: constant shift amount
	return shift(base | typ | amount)
}
// genshift emits the instruction as computing r = r0 op (r1 shifted by n)
// and returns the new Prog so the caller can adjust it further.
//
// The shifted operand is encoded into From as an obj.TYPE_SHIFT using
// makeshift with shift operator typ. r0 is stored in Reg as given. When r is
// 0 the destination operand is left untouched, which is how callers in this
// file emit instructions that have no result register.
func genshift(s *gc.SSAGenState, as obj.As, r0, r1, r int16, typ int64, n int64) *obj.Prog {
	prog := s.Prog(as)
	prog.Reg = r0
	prog.From.Type = obj.TYPE_SHIFT
	prog.From.Offset = int64(makeshift(r1, typ, n))
	if r == 0 {
		// No destination register requested.
		return prog
	}
	prog.To.Type = obj.TYPE_REG
	prog.To.Reg = r
	return prog
}
// makeregshift builds the shift encoding for register r1 shifted by the
// amount held in register r2. Only the low 4 bits of each register number
// are kept; typ is OR-ed in unchanged and selects the shift operator.
func makeregshift(r1 int16, typ int64, r2 int16) shift {
	// Bit 4 marks the encoding as a register shift rather than a
	// constant shift.
	const regShiftFlag = 1 << 4

	base := int64(r1 & 0xf)        // bits 0-3: register being shifted
	amount := int64(r2&0xf) << 8 // bits 8-11: register holding the amount
	return shift(base | typ | amount | regShiftFlag)
}
// genregshift emits the instruction as computing r = r0 op (r1 shifted by r2)
// and returns the new Prog so the caller can adjust it further.
//
// The shifted operand is encoded into From as an obj.TYPE_SHIFT using
// makeregshift with shift operator typ. r0 is stored in Reg as given. When r
// is 0 the destination operand is left untouched, which is how callers in
// this file emit instructions that have no result register.
func genregshift(s *gc.SSAGenState, as obj.As, r0, r1, r2, r int16, typ int64) *obj.Prog {
	prog := s.Prog(as)
	prog.Reg = r0
	prog.From.Type = obj.TYPE_SHIFT
	prog.From.Offset = int64(makeregshift(r1, typ, r2))
	if r == 0 {
		// No destination register requested.
		return prog
	}
	prog.To.Type = obj.TYPE_REG
	prog.To.Reg = r
	return prog
}
func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
switch v.Op {
case ssa.OpCopy, ssa.OpARMMOVWconvert, ssa.OpARMMOVWreg:
if v.Type.IsMemory() {
return
}
x := v.Args[0].Reg()
y := v.Reg()
if x == y {
return
}
as := arm.AMOVW
if v.Type.IsFloat() {
switch v.Type.Size() {
case 4:
as = arm.AMOVF
case 8:
as = arm.AMOVD
default:
panic("bad float size")
}
}
p := s.Prog(as)
p.From.Type = obj.TYPE_REG
p.From.Reg = x
p.To.Type = obj.TYPE_REG
p.To.Reg = y
case ssa.OpARMMOVWnop:
if v.Reg() != v.Args[0].Reg() {
v.Fatalf("input[0] and output not in same register %s", v.LongString())
}
// nothing to do
case ssa.OpLoadReg:
if v.Type.IsFlags() {
v.Fatalf("load flags not implemented: %v", v.LongString())
return
}
p := s.Prog(loadByType(v.Type))
gc.AddrAuto(&p.From, v.Args[0])
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpStoreReg:
if v.Type.IsFlags() {
v.Fatalf("store flags not implemented: %v", v.LongString())
return
}
p := s.Prog(storeByType(v.Type))
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[0].Reg()
gc.AddrAuto(&p.To, v)
case ssa.OpARMADD,
ssa.OpARMADC,
ssa.OpARMSUB,
ssa.OpARMSBC,
ssa.OpARMRSB,
ssa.OpARMAND,
ssa.OpARMOR,
ssa.OpARMXOR,
ssa.OpARMBIC,
ssa.OpARMMUL,
ssa.OpARMADDF,
ssa.OpARMADDD,
ssa.OpARMSUBF,
ssa.OpARMSUBD,
ssa.OpARMMULF,
ssa.OpARMMULD,
ssa.OpARMDIVF,
ssa.OpARMDIVD:
r := v.Reg()
r1 := v.Args[0].Reg()
r2 := v.Args[1].Reg()
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = r2
p.Reg = r1
p.To.Type = obj.TYPE_REG
p.To.Reg = r
case ssa.OpARMADDS,
ssa.OpARMSUBS:
r := v.Reg0()
r1 := v.Args[0].Reg()
r2 := v.Args[1].Reg()
p := s.Prog(v.Op.Asm())
p.Scond = arm.C_SBIT
p.From.Type = obj.TYPE_REG
p.From.Reg = r2
p.Reg = r1
p.To.Type = obj.TYPE_REG
p.To.Reg = r
case ssa.OpARMSLL,
ssa.OpARMSRL,
ssa.OpARMSRA:
r := v.Reg()
r1 := v.Args[0].Reg()
r2 := v.Args[1].Reg()
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = r2
p.Reg = r1
p.To.Type = obj.TYPE_REG
p.To.Reg = r
case ssa.OpARMSRAcond:
// ARM shift instructions use only the low-order byte of the shift amount
// generate conditional instructions to deal with large shifts
// flag is already set
// SRA.HS $31, Rarg0, Rdst // shift 31 bits to get the sign bit
// SRA.LO Rarg1, Rarg0, Rdst
r := v.Reg()
r1 := v.Args[0].Reg()
r2 := v.Args[1].Reg()
p := s.Prog(arm.ASRA)
p.Scond = arm.C_SCOND_HS
p.From.Type = obj.TYPE_CONST
p.From.Offset = 31
p.Reg = r1
p.To.Type = obj.TYPE_REG
p.To.Reg = r
p = s.Prog(arm.ASRA)
p.Scond = arm.C_SCOND_LO
p.From.Type = obj.TYPE_REG
p.From.Reg = r2
p.Reg = r1
p.To.Type = obj.TYPE_REG
p.To.Reg = r
case ssa.OpARMADDconst,
ssa.OpARMADCconst,
ssa.OpARMSUBconst,
ssa.OpARMSBCconst,
ssa.OpARMRSBconst,
ssa.OpARMRSCconst,
ssa.OpARMANDconst,
ssa.OpARMORconst,
ssa.OpARMXORconst,
ssa.OpARMBICconst,
ssa.OpARMSLLconst,
ssa.OpARMSRLconst,
ssa.OpARMSRAconst:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_CONST
p.From.Offset = v.AuxInt
p.Reg = v.Args[0].Reg()
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpARMADDSconst,
ssa.OpARMSUBSconst,
ssa.OpARMRSBSconst:
p := s.Prog(v.Op.Asm())
p.Scond = arm.C_SBIT
p.From.Type = obj.TYPE_CONST
p.From.Offset = v.AuxInt
p.Reg = v.Args[0].Reg()
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg0()
case ssa.OpARMSRRconst:
genshift(s, arm.AMOVW, 0, v.Args[0].Reg(), v.Reg(), arm.SHIFT_RR, v.AuxInt)
case ssa.OpARMADDshiftLL,
ssa.OpARMADCshiftLL,
ssa.OpARMSUBshiftLL,
ssa.OpARMSBCshiftLL,
ssa.OpARMRSBshiftLL,
ssa.OpARMRSCshiftLL,
ssa.OpARMANDshiftLL,
ssa.OpARMORshiftLL,
ssa.OpARMXORshiftLL,
ssa.OpARMBICshiftLL:
genshift(s, v.Op.Asm(), v.Args[0].Reg(), v.Args[1].Reg(), v.Reg(), arm.SHIFT_LL, v.AuxInt)
case ssa.OpARMADDSshiftLL,
ssa.OpARMSUBSshiftLL,
ssa.OpARMRSBSshiftLL:
p := genshift(s, v.Op.Asm(), v.Args[0].Reg(), v.Args[1].Reg(), v.Reg0(), arm.SHIFT_LL, v.AuxInt)
p.Scond = arm.C_SBIT
case ssa.OpARMADDshiftRL,
ssa.OpARMADCshiftRL,
ssa.OpARMSUBshiftRL,
ssa.OpARMSBCshiftRL,
ssa.OpARMRSBshiftRL,
ssa.OpARMRSCshiftRL,
ssa.OpARMANDshiftRL,
ssa.OpARMORshiftRL,
ssa.OpARMXORshiftRL,
ssa.OpARMBICshiftRL:
genshift(s, v.Op.Asm(), v.Args[0].Reg(), v.Args[1].Reg(), v.Reg(), arm.SHIFT_LR, v.AuxInt)
case ssa.OpARMADDSshiftRL,
ssa.OpARMSUBSshiftRL,
ssa.OpARMRSBSshiftRL:
p := genshift(s, v.Op.Asm(), v.Args[0].Reg(), v.Args[1].Reg(), v.Reg0(), arm.SHIFT_LR, v.AuxInt)
p.Scond = arm.C_SBIT
case ssa.OpARMADDshiftRA,
ssa.OpARMADCshiftRA,
ssa.OpARMSUBshiftRA,
ssa.OpARMSBCshiftRA,
ssa.OpARMRSBshiftRA,
ssa.OpARMRSCshiftRA,
ssa.OpARMANDshiftRA,
ssa.OpARMORshiftRA,
ssa.OpARMXORshiftRA,
ssa.OpARMBICshiftRA:
genshift(s, v.Op.Asm(), v.Args[0].Reg(), v.Args[1].Reg(), v.Reg(), arm.SHIFT_AR, v.AuxInt)
case ssa.OpARMADDSshiftRA,
ssa.OpARMSUBSshiftRA,
ssa.OpARMRSBSshiftRA:
p := genshift(s, v.Op.Asm(), v.Args[0].Reg(), v.Args[1].Reg(), v.Reg0(), arm.SHIFT_AR, v.AuxInt)
p.Scond = arm.C_SBIT
case ssa.OpARMXORshiftRR:
genshift(s, v.Op.Asm(), v.Args[0].Reg(), v.Args[1].Reg(), v.Reg(), arm.SHIFT_RR, v.AuxInt)
case ssa.OpARMMVNshiftLL:
genshift(s, v.Op.Asm(), 0, v.Args[0].Reg(), v.Reg(), arm.SHIFT_LL, v.AuxInt)
case ssa.OpARMMVNshiftRL:
genshift(s, v.Op.Asm(), 0, v.Args[0].Reg(), v.Reg(), arm.SHIFT_LR, v.AuxInt)
case ssa.OpARMMVNshiftRA:
genshift(s, v.Op.Asm(), 0, v.Args[0].Reg(), v.Reg(), arm.SHIFT_AR, v.AuxInt)
case ssa.OpARMMVNshiftLLreg:
genregshift(s, v.Op.Asm(), 0, v.Args[0].Reg(), v.Args[1].Reg(), v.Reg(), arm.SHIFT_LL)
case ssa.OpARMMVNshiftRLreg:
genregshift(s, v.Op.Asm(), 0, v.Args[0].Reg(), v.Args[1].Reg(), v.Reg(), arm.SHIFT_LR)
case ssa.OpARMMVNshiftRAreg:
genregshift(s, v.Op.Asm(), 0, v.Args[0].Reg(), v.Args[1].Reg(), v.Reg(), arm.SHIFT_AR)
case ssa.OpARMADDshiftLLreg,
ssa.OpARMADCshiftLLreg,
ssa.OpARMSUBshiftLLreg,
ssa.OpARMSBCshiftLLreg,
ssa.OpARMRSBshiftLLreg,
ssa.OpARMRSCshiftLLreg,
ssa.OpARMANDshiftLLreg,
ssa.OpARMORshiftLLreg,
ssa.OpARMXORshiftLLreg,
ssa.OpARMBICshiftLLreg:
genregshift(s, v.Op.Asm(), v.Args[0].Reg(), v.Args[1].Reg(), v.Args[2].Reg(), v.Reg(), arm.SHIFT_LL)
case ssa.OpARMADDSshiftLLreg,
ssa.OpARMSUBSshiftLLreg,
ssa.OpARMRSBSshiftLLreg:
p := genregshift(s, v.Op.Asm(), v.Args[0].Reg(), v.Args[1].Reg(), v.Args[2].Reg(), v.Reg0(), arm.SHIFT_LL)
p.Scond = arm.C_SBIT
case ssa.OpARMADDshiftRLreg,
ssa.OpARMADCshiftRLreg,
ssa.OpARMSUBshiftRLreg,
ssa.OpARMSBCshiftRLreg,
ssa.OpARMRSBshiftRLreg,
ssa.OpARMRSCshiftRLreg,
ssa.OpARMANDshiftRLreg,
ssa.OpARMORshiftRLreg,
ssa.OpARMXORshiftRLreg,
ssa.OpARMBICshiftRLreg:
genregshift(s, v.Op.Asm(), v.Args[0].Reg(), v.Args[1].Reg(), v.Args[2].Reg(), v.Reg(), arm.SHIFT_LR)
case ssa.OpARMADDSshiftRLreg,
ssa.OpARMSUBSshiftRLreg,
ssa.OpARMRSBSshiftRLreg:
p := genregshift(s, v.Op.Asm(), v.Args[0].Reg(), v.Args[1].Reg(), v.Args[2].Reg(), v.Reg0(), arm.SHIFT_LR)
p.Scond = arm.C_SBIT
case ssa.OpARMADDshiftRAreg,
ssa.OpARMADCshiftRAreg,
ssa.OpARMSUBshiftRAreg,
ssa.OpARMSBCshiftRAreg,
ssa.OpARMRSBshiftRAreg,
ssa.OpARMRSCshiftRAreg,
ssa.OpARMANDshiftRAreg,
ssa.OpARMORshiftRAreg,
ssa.OpARMXORshiftRAreg,
ssa.OpARMBICshiftRAreg:
genregshift(s, v.Op.Asm(), v.Args[0].Reg(), v.Args[1].Reg(), v.Args[2].Reg(), v.Reg(), arm.SHIFT_AR)
case ssa.OpARMADDSshiftRAreg,
ssa.OpARMSUBSshiftRAreg,
ssa.OpARMRSBSshiftRAreg:
p := genregshift(s, v.Op.Asm(), v.Args[0].Reg(), v.Args[1].Reg(), v.Args[2].Reg(), v.Reg0(), arm.SHIFT_AR)
p.Scond = arm.C_SBIT
case ssa.OpARMHMUL,
ssa.OpARMHMULU:
// 32-bit high multiplication
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[0].Reg()
p.Reg = v.Args[1].Reg()
p.To.Type = obj.TYPE_REGREG
p.To.Reg = v.Reg()
p.To.Offset = arm.REGTMP // throw away low 32-bit into tmp register
case ssa.OpARMMULLU:
// 32-bit multiplication, results 64-bit, high 32-bit in out0, low 32-bit in out1
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[0].Reg()
p.Reg = v.Args[1].Reg()
p.To.Type = obj.TYPE_REGREG
p.To.Reg = v.Reg0() // high 32-bit
p.To.Offset = int64(v.Reg1()) // low 32-bit
case ssa.OpARMMULA:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[0].Reg()
p.Reg = v.Args[1].Reg()
p.To.Type = obj.TYPE_REGREG2
p.To.Reg = v.Reg() // result
p.To.Offset = int64(v.Args[2].Reg()) // addend
case ssa.OpARMMOVWconst:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_CONST
p.From.Offset = v.AuxInt
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpARMMOVFconst,
ssa.OpARMMOVDconst:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_FCONST
p.From.Val = math.Float64frombits(uint64(v.AuxInt))
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpARMCMP,
ssa.OpARMCMN,
ssa.OpARMTST,
ssa.OpARMTEQ,
ssa.OpARMCMPF,
ssa.OpARMCMPD:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
// Special layout in ARM assembly
// Compared to x86, the operands of ARM's CMP are reversed.
p.From.Reg = v.Args[1].Reg()
p.Reg = v.Args[0].Reg()
case ssa.OpARMCMPconst,
ssa.OpARMCMNconst,
ssa.OpARMTSTconst,
ssa.OpARMTEQconst:
// Special layout in ARM assembly
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_CONST
p.From.Offset = v.AuxInt
p.Reg = v.Args[0].Reg()
case ssa.OpARMCMPF0,
ssa.OpARMCMPD0:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[0].Reg()
case ssa.OpARMCMPshiftLL:
genshift(s, v.Op.Asm(), v.Args[0].Reg(), v.Args[1].Reg(), 0, arm.SHIFT_LL, v.AuxInt)
case ssa.OpARMCMPshiftRL:
genshift(s, v.Op.Asm(), v.Args[0].Reg(), v.Args[1].Reg(), 0, arm.SHIFT_LR, v.AuxInt)
case ssa.OpARMCMPshiftRA:
genshift(s, v.Op.Asm(), v.Args[0].Reg(), v.Args[1].Reg(), 0, arm.SHIFT_AR, v.AuxInt)
case ssa.OpARMCMPshiftLLreg:
genregshift(s, v.Op.Asm(), v.Args[0].Reg(), v.Args[1].Reg(), v.Args[2].Reg(), 0, arm.SHIFT_LL)
case ssa.OpARMCMPshiftRLreg:
genregshift(s, v.Op.Asm(), v.Args[0].Reg(), v.Args[1].Reg(), v.Args[2].Reg(), 0, arm.SHIFT_LR)
case ssa.OpARMCMPshiftRAreg:
genregshift(s, v.Op.Asm(), v.Args[0].Reg(), v.Args[1].Reg(), v.Args[2].Reg(), 0, arm.SHIFT_AR)
case ssa.OpARMMOVWaddr:
p := s.Prog(arm.AMOVW)
p.From.Type = obj.TYPE_ADDR
p.From.Reg = v.Args[0].Reg()
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
var wantreg string
// MOVW $sym+off(base), R
// the assembler expands it as the following:
// - base is SP: add constant offset to SP (R13)
// when constant is large, tmp register (R11) may be used
// - base is SB: load external address from constant pool (use relocation)
switch v.Aux.(type) {
default:
v.Fatalf("aux is of unknown type %T", v.Aux)
case *ssa.ExternSymbol:
wantreg = "SB"
gc.AddAux(&p.From, v)
case *ssa.ArgSymbol, *ssa.AutoSymbol:
wantreg = "SP"
gc.AddAux(&p.From, v)
case nil:
// No sym, just MOVW $off(SP), R
wantreg = "SP"
p.From.Offset = v.AuxInt
}
if reg := v.Args[0].RegName(); reg != wantreg {
v.Fatalf("bad reg %s for symbol type %T, want %s", reg, v.Aux, wantreg)
}
case ssa.OpARMMOVBload,
ssa.OpARMMOVBUload,
ssa.OpARMMOVHload,
ssa.OpARMMOVHUload,
ssa.OpARMMOVWload,
ssa.OpARMMOVFload,
ssa.OpARMMOVDload:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_MEM
p.From.Reg = v.Args[0].Reg()
gc.AddAux(&p.From, v)
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpARMMOVBstore,
ssa.OpARMMOVHstore,
ssa.OpARMMOVWstore,
ssa.OpARMMOVFstore,
ssa.OpARMMOVDstore:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[1].Reg()
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
gc.AddAux(&p.To, v)
cmd/compile: optimize ARM with more efficient MOVB/MOVBU/MOVH/MOVHU Like the indexed MOVW (MOVWloadidx/MOVWstoreidx) used in current ARM backend, the indexed MOVB/MOVBU/MOVH/MOVHU can also be used to generate further optimized ARM code. My patch implements this optimization. Here are some contrast test results against the original go compiler. 1. The total size of all .a files in pkg/ shrinks by 0.03%. 2. The compilecmp benchmark shows a little decline. name old time/op new time/op delta Template 2.35s ± 1% 2.37s ± 3% +0.94% (p=0.006 n=19+19) Unicode 1.33s ± 3% 1.33s ± 2% ~ (p=0.158 n=20+18) GoTypes 7.86s ± 2% 7.84s ± 1% ~ (p=0.284 n=19+18) Compiler 37.5s ± 1% 37.7s ± 2% ~ (p=0.101 n=20+19) SSA 83.4s ± 2% 83.6s ± 2% ~ (p=0.231 n=20+20) Flate 1.46s ± 2% 1.45s ± 1% ~ (p=0.097 n=20+17) GoParser 1.86s ± 2% 1.86s ± 4% ~ (p=0.738 n=20+20) Reflect 5.10s ± 1% 5.11s ± 1% ~ (p=0.290 n=20+18) Tar 1.78s ± 2% 1.77s ± 2% ~ (p=0.166 n=19+20) XML 2.61s ± 2% 2.61s ± 2% ~ (p=0.665 n=19+19) [Geo mean] 4.67s 4.68s +0.16% name old user-time/op new user-time/op delta Template 2.79s ± 3% 2.80s ± 2% ~ (p=0.662 n=20+20) Unicode 1.62s ± 3% 1.64s ± 4% ~ (p=0.252 n=20+20) GoTypes 9.58s ± 2% 9.62s ± 2% ~ (p=0.250 n=20+20) Compiler 46.2s ± 1% 46.2s ± 1% ~ (p=0.602 n=20+19) SSA 108s ± 1% 108s ± 2% ~ (p=0.242 n=18+20) Flate 1.69s ± 3% 1.69s ± 4% ~ (p=0.470 n=20+20) GoParser 2.16s ± 3% 2.20s ± 4% +1.70% (p=0.005 n=19+20) Reflect 6.02s ± 2% 6.02s ± 2% ~ (p=0.700 n=20+17) Tar 2.11s ± 2% 2.11s ± 3% ~ (p=0.480 n=18+20) XML 3.07s ± 2% 3.11s ± 4% +1.50% (p=0.043 n=20+20) [Geo mean] 5.61s 5.64s +0.55% name old text-bytes new text-bytes delta HelloSize 586kB ± 0% 586kB ± 0% ~ (all equal) name old data-bytes new data-bytes delta HelloSize 5.46kB ± 0% 5.46kB ± 0% ~ (all equal) name old bss-bytes new bss-bytes delta HelloSize 72.9kB ± 0% 72.9kB ± 0% ~ (all equal) name old exe-bytes new exe-bytes delta HelloSize 1.03MB ± 0% 1.03MB ± 0% ~ (all equal) 3. 
The go1 benchmark shows improvement totally, and even more than 10% improvement in the test case Revcomp. name old time/op new time/op delta BinaryTree17-4 42.0s ± 1% 41.5s ± 1% -1.32% (p=0.000 n=39+40) Fannkuch11-4 24.1s ± 1% 23.6s ± 0% -2.38% (p=0.000 n=40+40) FmtFprintfEmpty-4 843ns ± 0% 839ns ± 1% -0.46% (p=0.000 n=33+40) FmtFprintfString-4 1.44µs ± 1% 1.37µs ± 1% -5.48% (p=0.000 n=40+35) FmtFprintfInt-4 1.44µs ± 1% 1.41µs ± 2% -1.50% (p=0.000 n=40+40) FmtFprintfIntInt-4 2.07µs ± 1% 2.06µs ± 0% -0.78% (p=0.000 n=40+40) FmtFprintfPrefixedInt-4 2.50µs ± 1% 2.33µs ± 1% -6.85% (p=0.000 n=40+40) FmtFprintfFloat-4 4.36µs ± 1% 4.34µs ± 0% -0.39% (p=0.017 n=40+40) FmtManyArgs-4 8.11µs ± 0% 8.00µs ± 0% -1.37% (p=0.000 n=40+40) GobDecode-4 105ms ± 2% 103ms ± 2% -2.17% (p=0.000 n=39+39) GobEncode-4 90.1ms ± 2% 88.6ms ± 1% -1.67% (p=0.000 n=40+39) Gzip-4 4.18s ± 1% 4.09s ± 1% -2.03% (p=0.000 n=40+40) Gunzip-4 608ms ± 1% 603ms ± 1% -0.86% (p=0.000 n=40+34) HTTPClientServer-4 674µs ± 3% 661µs ± 2% -1.82% (p=0.000 n=40+39) JSONEncode-4 256ms ± 1% 243ms ± 0% -5.11% (p=0.000 n=39+31) JSONDecode-4 915ms ± 1% 904ms ± 1% -1.18% (p=0.000 n=40+36) Mandelbrot200-4 49.2ms ± 0% 49.3ms ± 0% ~ (p=0.254 n=34+40) GoParse-4 46.9ms ± 2% 46.9ms ± 1% ~ (p=0.737 n=40+39) RegexpMatchEasy0_32-4 1.28µs ± 1% 1.27µs ± 1% -0.71% (p=0.000 n=40+40) RegexpMatchEasy0_1K-4 7.86µs ± 4% 7.67µs ± 4% -2.46% (p=0.000 n=38+40) RegexpMatchEasy1_32-4 1.28µs ± 1% 1.28µs ± 1% -0.54% (p=0.000 n=40+40) RegexpMatchEasy1_1K-4 10.4µs ± 2% 10.3µs ± 2% -0.88% (p=0.003 n=40+39) RegexpMatchMedium_32-4 2.05µs ± 0% 2.04µs ± 0% -0.34% (p=0.000 n=40+33) RegexpMatchMedium_1K-4 541µs ± 1% 535µs ± 1% -1.02% (p=0.000 n=40+38) RegexpMatchHard_32-4 29.3µs ± 1% 29.1µs ± 1% -0.51% (p=0.000 n=40+40) RegexpMatchHard_1K-4 881µs ± 1% 871µs ± 1% -1.15% (p=0.000 n=40+40) Revcomp-4 81.7ms ± 2% 67.5ms ± 2% -17.37% (p=0.000 n=39+39) Template-4 1.05s ± 1% 1.08s ± 2% +3.67% (p=0.000 n=40+40) TimeParse-4 7.24µs ± 1% 7.09µs ± 1% -2.13% (p=0.000 
n=40+40) TimeFormat-4 13.2µs ± 1% 13.1µs ± 0% -0.31% (p=0.007 n=40+31) [Geo mean] 733µs 718µs -2.03% name old speed new speed delta GobDecode-4 7.28MB/s ± 2% 7.44MB/s ± 2% +2.23% (p=0.000 n=39+39) GobEncode-4 8.52MB/s ± 2% 8.67MB/s ± 1% +1.70% (p=0.000 n=40+39) Gzip-4 4.65MB/s ± 1% 4.74MB/s ± 1% +1.94% (p=0.000 n=37+40) Gunzip-4 31.9MB/s ± 1% 32.2MB/s ± 1% +0.90% (p=0.000 n=40+36) JSONEncode-4 7.57MB/s ± 1% 7.98MB/s ± 0% +5.41% (p=0.000 n=40+31) JSONDecode-4 2.12MB/s ± 1% 2.15MB/s ± 1% +1.23% (p=0.000 n=40+40) GoParse-4 1.23MB/s ± 1% 1.23MB/s ± 1% ~ (p=0.769 n=39+40) RegexpMatchEasy0_32-4 25.0MB/s ± 1% 25.2MB/s ± 1% +0.71% (p=0.000 n=40+40) RegexpMatchEasy0_1K-4 130MB/s ± 5% 134MB/s ± 4% +2.53% (p=0.000 n=38+40) RegexpMatchEasy1_32-4 24.9MB/s ± 1% 25.1MB/s ± 1% +0.55% (p=0.000 n=40+40) RegexpMatchEasy1_1K-4 98.5MB/s ± 2% 99.4MB/s ± 2% +0.88% (p=0.003 n=40+39) RegexpMatchMedium_32-4 490kB/s ± 0% 490kB/s ± 0% ~ (all equal) RegexpMatchMedium_1K-4 1.89MB/s ± 1% 1.91MB/s ± 1% +1.02% (p=0.000 n=40+38) RegexpMatchHard_32-4 1.10MB/s ± 1% 1.10MB/s ± 0% +0.41% (p=0.000 n=40+33) RegexpMatchHard_1K-4 1.16MB/s ± 1% 1.17MB/s ± 1% +1.21% (p=0.000 n=40+40) Revcomp-4 31.1MB/s ± 2% 37.6MB/s ± 2% +21.03% (p=0.000 n=39+39) Template-4 1.86MB/s ± 1% 1.79MB/s ± 1% -3.51% (p=0.000 n=40+38) [Geo mean] 6.66MB/s 6.80MB/s +2.13% fixes #21492 Change-Id: Ia26e7ca393f0a5f31de240e8ff9a220453ca7e0d Reviewed-on: https://go-review.googlesource.com/58450 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2017-08-24 10:51:34 +00:00
case ssa.OpARMMOVWloadidx, ssa.OpARMMOVBUloadidx, ssa.OpARMMOVBloadidx, ssa.OpARMMOVHUloadidx, ssa.OpARMMOVHloadidx:
// this is just shift 0 bits
fallthrough
case ssa.OpARMMOVWloadshiftLL:
p := genshift(s, v.Op.Asm(), 0, v.Args[1].Reg(), v.Reg(), arm.SHIFT_LL, v.AuxInt)
p.From.Reg = v.Args[0].Reg()
case ssa.OpARMMOVWloadshiftRL:
p := genshift(s, v.Op.Asm(), 0, v.Args[1].Reg(), v.Reg(), arm.SHIFT_LR, v.AuxInt)
p.From.Reg = v.Args[0].Reg()
case ssa.OpARMMOVWloadshiftRA:
p := genshift(s, v.Op.Asm(), 0, v.Args[1].Reg(), v.Reg(), arm.SHIFT_AR, v.AuxInt)
p.From.Reg = v.Args[0].Reg()
cmd/compile: optimize ARM with more efficient MOVB/MOVBU/MOVH/MOVHU Like the indexed MOVW (MOVWloadidx/MOVWstoreidx) used in current ARM backend, the indexed MOVB/MOVBU/MOVH/MOVHU can also be used to generate further optimized ARM code. My patch implements this optimization. Here are some contrast test results against the original go compiler. 1. The total size of all .a files in pkg/ shrinks by 0.03%. 2. The compilecmp benchmark shows a little decline. name old time/op new time/op delta Template 2.35s ± 1% 2.37s ± 3% +0.94% (p=0.006 n=19+19) Unicode 1.33s ± 3% 1.33s ± 2% ~ (p=0.158 n=20+18) GoTypes 7.86s ± 2% 7.84s ± 1% ~ (p=0.284 n=19+18) Compiler 37.5s ± 1% 37.7s ± 2% ~ (p=0.101 n=20+19) SSA 83.4s ± 2% 83.6s ± 2% ~ (p=0.231 n=20+20) Flate 1.46s ± 2% 1.45s ± 1% ~ (p=0.097 n=20+17) GoParser 1.86s ± 2% 1.86s ± 4% ~ (p=0.738 n=20+20) Reflect 5.10s ± 1% 5.11s ± 1% ~ (p=0.290 n=20+18) Tar 1.78s ± 2% 1.77s ± 2% ~ (p=0.166 n=19+20) XML 2.61s ± 2% 2.61s ± 2% ~ (p=0.665 n=19+19) [Geo mean] 4.67s 4.68s +0.16% name old user-time/op new user-time/op delta Template 2.79s ± 3% 2.80s ± 2% ~ (p=0.662 n=20+20) Unicode 1.62s ± 3% 1.64s ± 4% ~ (p=0.252 n=20+20) GoTypes 9.58s ± 2% 9.62s ± 2% ~ (p=0.250 n=20+20) Compiler 46.2s ± 1% 46.2s ± 1% ~ (p=0.602 n=20+19) SSA 108s ± 1% 108s ± 2% ~ (p=0.242 n=18+20) Flate 1.69s ± 3% 1.69s ± 4% ~ (p=0.470 n=20+20) GoParser 2.16s ± 3% 2.20s ± 4% +1.70% (p=0.005 n=19+20) Reflect 6.02s ± 2% 6.02s ± 2% ~ (p=0.700 n=20+17) Tar 2.11s ± 2% 2.11s ± 3% ~ (p=0.480 n=18+20) XML 3.07s ± 2% 3.11s ± 4% +1.50% (p=0.043 n=20+20) [Geo mean] 5.61s 5.64s +0.55% name old text-bytes new text-bytes delta HelloSize 586kB ± 0% 586kB ± 0% ~ (all equal) name old data-bytes new data-bytes delta HelloSize 5.46kB ± 0% 5.46kB ± 0% ~ (all equal) name old bss-bytes new bss-bytes delta HelloSize 72.9kB ± 0% 72.9kB ± 0% ~ (all equal) name old exe-bytes new exe-bytes delta HelloSize 1.03MB ± 0% 1.03MB ± 0% ~ (all equal) 3. 
The go1 benchmark shows improvement totally, and even more than 10% improvement in the test case Revcomp. name old time/op new time/op delta BinaryTree17-4 42.0s ± 1% 41.5s ± 1% -1.32% (p=0.000 n=39+40) Fannkuch11-4 24.1s ± 1% 23.6s ± 0% -2.38% (p=0.000 n=40+40) FmtFprintfEmpty-4 843ns ± 0% 839ns ± 1% -0.46% (p=0.000 n=33+40) FmtFprintfString-4 1.44µs ± 1% 1.37µs ± 1% -5.48% (p=0.000 n=40+35) FmtFprintfInt-4 1.44µs ± 1% 1.41µs ± 2% -1.50% (p=0.000 n=40+40) FmtFprintfIntInt-4 2.07µs ± 1% 2.06µs ± 0% -0.78% (p=0.000 n=40+40) FmtFprintfPrefixedInt-4 2.50µs ± 1% 2.33µs ± 1% -6.85% (p=0.000 n=40+40) FmtFprintfFloat-4 4.36µs ± 1% 4.34µs ± 0% -0.39% (p=0.017 n=40+40) FmtManyArgs-4 8.11µs ± 0% 8.00µs ± 0% -1.37% (p=0.000 n=40+40) GobDecode-4 105ms ± 2% 103ms ± 2% -2.17% (p=0.000 n=39+39) GobEncode-4 90.1ms ± 2% 88.6ms ± 1% -1.67% (p=0.000 n=40+39) Gzip-4 4.18s ± 1% 4.09s ± 1% -2.03% (p=0.000 n=40+40) Gunzip-4 608ms ± 1% 603ms ± 1% -0.86% (p=0.000 n=40+34) HTTPClientServer-4 674µs ± 3% 661µs ± 2% -1.82% (p=0.000 n=40+39) JSONEncode-4 256ms ± 1% 243ms ± 0% -5.11% (p=0.000 n=39+31) JSONDecode-4 915ms ± 1% 904ms ± 1% -1.18% (p=0.000 n=40+36) Mandelbrot200-4 49.2ms ± 0% 49.3ms ± 0% ~ (p=0.254 n=34+40) GoParse-4 46.9ms ± 2% 46.9ms ± 1% ~ (p=0.737 n=40+39) RegexpMatchEasy0_32-4 1.28µs ± 1% 1.27µs ± 1% -0.71% (p=0.000 n=40+40) RegexpMatchEasy0_1K-4 7.86µs ± 4% 7.67µs ± 4% -2.46% (p=0.000 n=38+40) RegexpMatchEasy1_32-4 1.28µs ± 1% 1.28µs ± 1% -0.54% (p=0.000 n=40+40) RegexpMatchEasy1_1K-4 10.4µs ± 2% 10.3µs ± 2% -0.88% (p=0.003 n=40+39) RegexpMatchMedium_32-4 2.05µs ± 0% 2.04µs ± 0% -0.34% (p=0.000 n=40+33) RegexpMatchMedium_1K-4 541µs ± 1% 535µs ± 1% -1.02% (p=0.000 n=40+38) RegexpMatchHard_32-4 29.3µs ± 1% 29.1µs ± 1% -0.51% (p=0.000 n=40+40) RegexpMatchHard_1K-4 881µs ± 1% 871µs ± 1% -1.15% (p=0.000 n=40+40) Revcomp-4 81.7ms ± 2% 67.5ms ± 2% -17.37% (p=0.000 n=39+39) Template-4 1.05s ± 1% 1.08s ± 2% +3.67% (p=0.000 n=40+40) TimeParse-4 7.24µs ± 1% 7.09µs ± 1% -2.13% (p=0.000 
n=40+40) TimeFormat-4 13.2µs ± 1% 13.1µs ± 0% -0.31% (p=0.007 n=40+31) [Geo mean] 733µs 718µs -2.03% name old speed new speed delta GobDecode-4 7.28MB/s ± 2% 7.44MB/s ± 2% +2.23% (p=0.000 n=39+39) GobEncode-4 8.52MB/s ± 2% 8.67MB/s ± 1% +1.70% (p=0.000 n=40+39) Gzip-4 4.65MB/s ± 1% 4.74MB/s ± 1% +1.94% (p=0.000 n=37+40) Gunzip-4 31.9MB/s ± 1% 32.2MB/s ± 1% +0.90% (p=0.000 n=40+36) JSONEncode-4 7.57MB/s ± 1% 7.98MB/s ± 0% +5.41% (p=0.000 n=40+31) JSONDecode-4 2.12MB/s ± 1% 2.15MB/s ± 1% +1.23% (p=0.000 n=40+40) GoParse-4 1.23MB/s ± 1% 1.23MB/s ± 1% ~ (p=0.769 n=39+40) RegexpMatchEasy0_32-4 25.0MB/s ± 1% 25.2MB/s ± 1% +0.71% (p=0.000 n=40+40) RegexpMatchEasy0_1K-4 130MB/s ± 5% 134MB/s ± 4% +2.53% (p=0.000 n=38+40) RegexpMatchEasy1_32-4 24.9MB/s ± 1% 25.1MB/s ± 1% +0.55% (p=0.000 n=40+40) RegexpMatchEasy1_1K-4 98.5MB/s ± 2% 99.4MB/s ± 2% +0.88% (p=0.003 n=40+39) RegexpMatchMedium_32-4 490kB/s ± 0% 490kB/s ± 0% ~ (all equal) RegexpMatchMedium_1K-4 1.89MB/s ± 1% 1.91MB/s ± 1% +1.02% (p=0.000 n=40+38) RegexpMatchHard_32-4 1.10MB/s ± 1% 1.10MB/s ± 0% +0.41% (p=0.000 n=40+33) RegexpMatchHard_1K-4 1.16MB/s ± 1% 1.17MB/s ± 1% +1.21% (p=0.000 n=40+40) Revcomp-4 31.1MB/s ± 2% 37.6MB/s ± 2% +21.03% (p=0.000 n=39+39) Template-4 1.86MB/s ± 1% 1.79MB/s ± 1% -3.51% (p=0.000 n=40+38) [Geo mean] 6.66MB/s 6.80MB/s +2.13% fixes #21492 Change-Id: Ia26e7ca393f0a5f31de240e8ff9a220453ca7e0d Reviewed-on: https://go-review.googlesource.com/58450 Reviewed-by: Cherry Zhang <cherryyz@google.com> Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2017-08-24 10:51:34 +00:00
case ssa.OpARMMOVWstoreidx, ssa.OpARMMOVBstoreidx, ssa.OpARMMOVHstoreidx:
// this is just shift 0 bits
fallthrough
case ssa.OpARMMOVWstoreshiftLL:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[2].Reg()
p.To.Type = obj.TYPE_SHIFT
p.To.Reg = v.Args[0].Reg()
p.To.Offset = int64(makeshift(v.Args[1].Reg(), arm.SHIFT_LL, v.AuxInt))
case ssa.OpARMMOVWstoreshiftRL:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[2].Reg()
p.To.Type = obj.TYPE_SHIFT
p.To.Reg = v.Args[0].Reg()
p.To.Offset = int64(makeshift(v.Args[1].Reg(), arm.SHIFT_LR, v.AuxInt))
case ssa.OpARMMOVWstoreshiftRA:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[2].Reg()
p.To.Type = obj.TYPE_SHIFT
p.To.Reg = v.Args[0].Reg()
p.To.Offset = int64(makeshift(v.Args[1].Reg(), arm.SHIFT_AR, v.AuxInt))
case ssa.OpARMMOVBreg,
ssa.OpARMMOVBUreg,
ssa.OpARMMOVHreg,
ssa.OpARMMOVHUreg:
a := v.Args[0]
for a.Op == ssa.OpCopy || a.Op == ssa.OpARMMOVWreg || a.Op == ssa.OpARMMOVWnop {
a = a.Args[0]
}
if a.Op == ssa.OpLoadReg {
t := a.Type
switch {
case v.Op == ssa.OpARMMOVBreg && t.Size() == 1 && t.IsSigned(),
v.Op == ssa.OpARMMOVBUreg && t.Size() == 1 && !t.IsSigned(),
v.Op == ssa.OpARMMOVHreg && t.Size() == 2 && t.IsSigned(),
v.Op == ssa.OpARMMOVHUreg && t.Size() == 2 && !t.IsSigned():
// arg is a proper-typed load, already zero/sign-extended, don't extend again
if v.Reg() == v.Args[0].Reg() {
return
}
p := s.Prog(arm.AMOVW)
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[0].Reg()
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
return
default:
}
}
fallthrough
case ssa.OpARMMVN,
ssa.OpARMCLZ,
ssa.OpARMREV,
ssa.OpARMRBIT,
ssa.OpARMSQRTD,
ssa.OpARMNEGF,
ssa.OpARMNEGD,
ssa.OpARMMOVWF,
ssa.OpARMMOVWD,
ssa.OpARMMOVFW,
ssa.OpARMMOVDW,
ssa.OpARMMOVFD,
ssa.OpARMMOVDF:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[0].Reg()
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpARMMOVWUF,
ssa.OpARMMOVWUD,
ssa.OpARMMOVFWU,
ssa.OpARMMOVDWU:
p := s.Prog(v.Op.Asm())
p.Scond = arm.C_UBIT
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[0].Reg()
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpARMCMOVWHSconst:
p := s.Prog(arm.AMOVW)
p.Scond = arm.C_SCOND_HS
p.From.Type = obj.TYPE_CONST
p.From.Offset = v.AuxInt
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpARMCMOVWLSconst:
p := s.Prog(arm.AMOVW)
p.Scond = arm.C_SCOND_LS
p.From.Type = obj.TYPE_CONST
p.From.Offset = v.AuxInt
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpARMCALLstatic, ssa.OpARMCALLclosure, ssa.OpARMCALLinter:
s.Call(v)
case ssa.OpARMCALLudiv:
p := s.Prog(obj.ACALL)
p.To.Type = obj.TYPE_MEM
p.To.Name = obj.NAME_EXTERN
p.To.Sym = gc.Udiv
case ssa.OpARMDUFFZERO:
p := s.Prog(obj.ADUFFZERO)
p.To.Type = obj.TYPE_MEM
p.To.Name = obj.NAME_EXTERN
p.To.Sym = gc.Duffzero
p.To.Offset = v.AuxInt
case ssa.OpARMDUFFCOPY:
p := s.Prog(obj.ADUFFCOPY)
p.To.Type = obj.TYPE_MEM
p.To.Name = obj.NAME_EXTERN
p.To.Sym = gc.Duffcopy
p.To.Offset = v.AuxInt
case ssa.OpARMLoweredNilCheck:
// Issue a load which will fault if arg is nil.
p := s.Prog(arm.AMOVB)
p.From.Type = obj.TYPE_MEM
p.From.Reg = v.Args[0].Reg()
gc.AddAux(&p.From, v)
p.To.Type = obj.TYPE_REG
p.To.Reg = arm.REGTMP
if gc.Debug_checknil != 0 && v.Pos.Line() > 1 { // v.Pos.Line()==1 in generated wrappers
gc.Warnl(v.Pos, "generated nil check")
}
case ssa.OpARMLoweredZero:
// MOVW.P Rarg2, 4(R1)
// CMP Rarg1, R1
// BLE -2(PC)
// arg1 is the address of the last element to zero
// arg2 is known to be zero
// auxint is alignment
var sz int64
var mov obj.As
switch {
case v.AuxInt%4 == 0:
sz = 4
mov = arm.AMOVW
case v.AuxInt%2 == 0:
sz = 2
mov = arm.AMOVH
default:
sz = 1
mov = arm.AMOVB
}
p := s.Prog(mov)
p.Scond = arm.C_PBIT
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[2].Reg()
p.To.Type = obj.TYPE_MEM
p.To.Reg = arm.REG_R1
p.To.Offset = sz
p2 := s.Prog(arm.ACMP)
p2.From.Type = obj.TYPE_REG
p2.From.Reg = v.Args[1].Reg()
p2.Reg = arm.REG_R1
p3 := s.Prog(arm.ABLE)
p3.To.Type = obj.TYPE_BRANCH
gc.Patch(p3, p)
case ssa.OpARMLoweredMove:
// MOVW.P 4(R1), Rtmp
// MOVW.P Rtmp, 4(R2)
// CMP Rarg2, R1
// BLE -3(PC)
// arg2 is the address of the last element of src
// auxint is alignment
var sz int64
var mov obj.As
switch {
case v.AuxInt%4 == 0:
sz = 4
mov = arm.AMOVW
case v.AuxInt%2 == 0:
sz = 2
mov = arm.AMOVH
default:
sz = 1
mov = arm.AMOVB
}
p := s.Prog(mov)
p.Scond = arm.C_PBIT
p.From.Type = obj.TYPE_MEM
p.From.Reg = arm.REG_R1
p.From.Offset = sz
p.To.Type = obj.TYPE_REG
p.To.Reg = arm.REGTMP
p2 := s.Prog(mov)
p2.Scond = arm.C_PBIT
p2.From.Type = obj.TYPE_REG
p2.From.Reg = arm.REGTMP
p2.To.Type = obj.TYPE_MEM
p2.To.Reg = arm.REG_R2
p2.To.Offset = sz
p3 := s.Prog(arm.ACMP)
p3.From.Type = obj.TYPE_REG
p3.From.Reg = v.Args[2].Reg()
p3.Reg = arm.REG_R1
p4 := s.Prog(arm.ABLE)
p4.To.Type = obj.TYPE_BRANCH
gc.Patch(p4, p)
case ssa.OpARMEqual,
ssa.OpARMNotEqual,
ssa.OpARMLessThan,
ssa.OpARMLessEqual,
ssa.OpARMGreaterThan,
ssa.OpARMGreaterEqual,
ssa.OpARMLessThanU,
ssa.OpARMLessEqualU,
ssa.OpARMGreaterThanU,
ssa.OpARMGreaterEqualU:
// generate boolean values
// use conditional move
p := s.Prog(arm.AMOVW)
p.From.Type = obj.TYPE_CONST
p.From.Offset = 0
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
p = s.Prog(arm.AMOVW)
p.Scond = condBits[v.Op]
p.From.Type = obj.TYPE_CONST
p.From.Offset = 1
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpARMLoweredGetClosurePtr:
// Closure pointer is R7 (arm.REGCTXT).
gc.CheckLoweredGetClosurePtr(v)
case ssa.OpARMFlagEQ,
ssa.OpARMFlagLT_ULT,
ssa.OpARMFlagLT_UGT,
ssa.OpARMFlagGT_ULT,
ssa.OpARMFlagGT_UGT:
v.Fatalf("Flag* ops should never make it to codegen %v", v.LongString())
case ssa.OpARMInvertFlags:
v.Fatalf("InvertFlags should never make it to codegen %v", v.LongString())
case ssa.OpClobber:
// TODO: implement for clobberdead experiment. Nop is ok for now.
default:
v.Fatalf("genValue not implemented: %s", v.LongString())
}
}
// condBits gives, for each ARM SSA op that produces a boolean from the flags,
// the condition code placed in Prog.Scond of the conditional MOVW $1 that
// ssaGenValue emits to materialize that boolean.
var condBits = map[ssa.Op]uint8{
	// equality
	ssa.OpARMEqual:    arm.C_SCOND_EQ,
	ssa.OpARMNotEqual: arm.C_SCOND_NE,
	// ordered comparisons
	ssa.OpARMLessThan:     arm.C_SCOND_LT,
	ssa.OpARMLessEqual:    arm.C_SCOND_LE,
	ssa.OpARMGreaterThan:  arm.C_SCOND_GT,
	ssa.OpARMGreaterEqual: arm.C_SCOND_GE,
	// ordered comparisons, U-suffixed (unsigned) variants
	ssa.OpARMLessThanU:     arm.C_SCOND_LO,
	ssa.OpARMLessEqualU:    arm.C_SCOND_LS,
	ssa.OpARMGreaterThanU:  arm.C_SCOND_HI,
	ssa.OpARMGreaterEqualU: arm.C_SCOND_HS,
}
// blockJump lists, for every conditional ARM block kind, the pair of branch
// instructions ssaGenBlock chooses between:
//
//   - asm is the instruction used to branch to Succs[0];
//   - invasm is the opposite condition, used to branch to Succs[1] when
//     Succs[0] is the block laid out next.
var blockJump = map[ssa.BlockKind]struct {
	asm    obj.As
	invasm obj.As
}{
	// equality and ordered conditions
	ssa.BlockARMEQ: {asm: arm.ABEQ, invasm: arm.ABNE},
	ssa.BlockARMNE: {asm: arm.ABNE, invasm: arm.ABEQ},
	ssa.BlockARMLT: {asm: arm.ABLT, invasm: arm.ABGE},
	ssa.BlockARMGE: {asm: arm.ABGE, invasm: arm.ABLT},
	ssa.BlockARMLE: {asm: arm.ABLE, invasm: arm.ABGT},
	ssa.BlockARMGT: {asm: arm.ABGT, invasm: arm.ABLE},
	// U-prefixed (unsigned) conditions
	ssa.BlockARMULT: {asm: arm.ABLO, invasm: arm.ABHS},
	ssa.BlockARMUGE: {asm: arm.ABHS, invasm: arm.ABLO},
	ssa.BlockARMUGT: {asm: arm.ABHI, invasm: arm.ABLS},
	ssa.BlockARMULE: {asm: arm.ABLS, invasm: arm.ABHI},
}
// ssaGenBlock emits the instructions that end block b.
//
// next is compared against b's successors: when the successor that would be
// reached by an unconditional jump is next, that jump is not emitted.
// Every branch instruction generated here is appended to s.Branches together
// with its destination block.
func ssaGenBlock(s *gc.SSAGenState, b, next *ssa.Block) {
	// branchTo emits the branch instruction as, marks its operand as a branch
	// target, and records the (instruction, destination) pair in s.Branches.
	branchTo := func(as obj.As, dest *ssa.Block) {
		br := s.Prog(as)
		br.To.Type = obj.TYPE_BRANCH
		s.Branches = append(s.Branches, gc.Branch{P: br, B: dest})
	}

	switch b.Kind {
	case ssa.BlockPlain:
		// Single successor: jump only if it is not the next block.
		if succ := b.Succs[0].Block(); succ != next {
			branchTo(obj.AJMP, succ)
		}

	case ssa.BlockDefer:
		// defer returns in R0:
		// 0 if we should continue executing
		// 1 if we should jump to deferreturn call
		// So compare R0 against 0 and branch to Succs[1] when it differs.
		chk := s.Prog(arm.ACMP)
		chk.From.Type = obj.TYPE_CONST
		chk.From.Offset = 0
		chk.Reg = arm.REG_R0
		branchTo(arm.ABNE, b.Succs[1].Block())
		if succ := b.Succs[0].Block(); succ != next {
			branchTo(obj.AJMP, succ)
		}

	case ssa.BlockExit:
		s.Prog(obj.AUNDEF) // tell plive.go that we never reach here

	case ssa.BlockRet:
		s.Prog(obj.ARET)

	case ssa.BlockRetJmp:
		// RET whose target is the external symbol stored in b.Aux.
		ret := s.Prog(obj.ARET)
		ret.To.Type = obj.TYPE_MEM
		ret.To.Name = obj.NAME_EXTERN
		ret.To.Sym = b.Aux.(*obj.LSym)

	case ssa.BlockARMEQ, ssa.BlockARMNE,
		ssa.BlockARMLT, ssa.BlockARMGE,
		ssa.BlockARMLE, ssa.BlockARMGT,
		ssa.BlockARMULT, ssa.BlockARMUGT,
		ssa.BlockARMULE, ssa.BlockARMUGE:
		jmp := blockJump[b.Kind]
		first, second := b.Succs[0].Block(), b.Succs[1].Block()
		switch {
		case next == first:
			// Succs[0] follows directly: branch to Succs[1] on the
			// inverted condition.
			branchTo(jmp.invasm, second)
		case next == second:
			// Succs[1] follows directly: branch to Succs[0] on the
			// condition itself.
			branchTo(jmp.asm, first)
		default:
			// Neither successor follows: conditional branch to Succs[0],
			// then an unconditional jump to Succs[1].
			branchTo(jmp.asm, first)
			branchTo(obj.AJMP, second)
		}

	default:
		b.Fatalf("branch not implemented: %s. Control: %s", b.LongString(), b.Control.LongString())
	}
}