cmd/compile: implement CMOV on amd64

This builds upon the branchelim pass, activating it for amd64 and
lowering CondSelect. Special care is made to FPU instructions for
NaN handling.

Benchmark results on Xeon E5630 (Westmere EP):

name                      old time/op    new time/op    delta
BinaryTree17-16              4.99s ± 9%     4.66s ± 2%     ~     (p=0.095 n=5+5)
Fannkuch11-16                4.93s ± 3%     5.04s ± 2%     ~     (p=0.548 n=5+5)
FmtFprintfEmpty-16          58.8ns ± 7%    61.4ns ±14%     ~     (p=0.579 n=5+5)
FmtFprintfString-16          114ns ± 2%     114ns ± 4%     ~     (p=0.603 n=5+5)
FmtFprintfInt-16             181ns ± 4%     125ns ± 3%  -30.90%  (p=0.008 n=5+5)
FmtFprintfIntInt-16          263ns ± 2%     217ns ± 2%  -17.34%  (p=0.008 n=5+5)
FmtFprintfPrefixedInt-16     230ns ± 1%     212ns ± 1%   -7.99%  (p=0.008 n=5+5)
FmtFprintfFloat-16           411ns ± 3%     344ns ± 5%  -16.43%  (p=0.008 n=5+5)
FmtManyArgs-16               828ns ± 4%     790ns ± 2%   -4.59%  (p=0.032 n=5+5)
GobDecode-16                10.9ms ± 4%    10.8ms ± 5%     ~     (p=0.548 n=5+5)
GobEncode-16                9.52ms ± 5%    9.46ms ± 2%     ~     (p=1.000 n=5+5)
Gzip-16                      334ms ± 2%     337ms ± 2%     ~     (p=0.548 n=5+5)
Gunzip-16                   64.4ms ± 1%    65.0ms ± 1%   +1.00%  (p=0.008 n=5+5)
HTTPClientServer-16          156µs ± 3%     155µs ± 3%     ~     (p=0.690 n=5+5)
JSONEncode-16               21.0ms ± 1%    21.8ms ± 0%   +3.76%  (p=0.016 n=5+4)
JSONDecode-16               95.1ms ± 0%    95.7ms ± 1%     ~     (p=0.151 n=5+5)
Mandelbrot200-16            6.38ms ± 1%    6.42ms ± 1%     ~     (p=0.095 n=5+5)
GoParse-16                  5.47ms ± 2%    5.36ms ± 1%   -1.95%  (p=0.016 n=5+5)
RegexpMatchEasy0_32-16       111ns ± 1%     111ns ± 1%     ~     (p=0.635 n=5+4)
RegexpMatchEasy0_1K-16       408ns ± 1%     411ns ± 2%     ~     (p=0.087 n=5+5)
RegexpMatchEasy1_32-16       103ns ± 1%     104ns ± 1%     ~     (p=0.484 n=5+5)
RegexpMatchEasy1_1K-16       659ns ± 2%     652ns ± 1%     ~     (p=0.571 n=5+5)
RegexpMatchMedium_32-16      176ns ± 2%     174ns ± 1%     ~     (p=0.476 n=5+5)
RegexpMatchMedium_1K-16     58.6µs ± 4%    57.7µs ± 4%     ~     (p=0.548 n=5+5)
RegexpMatchHard_32-16       3.07µs ± 3%    3.04µs ± 4%     ~     (p=0.421 n=5+5)
RegexpMatchHard_1K-16       89.2µs ± 1%    87.9µs ± 2%   -1.52%  (p=0.032 n=5+5)
Revcomp-16                   575ms ± 0%     587ms ± 2%   +2.12%  (p=0.032 n=4+5)
Template-16                  110ms ± 1%     107ms ± 3%   -3.00%  (p=0.032 n=5+5)
TimeParse-16                 463ns ± 0%     462ns ± 0%     ~     (p=0.810 n=5+4)
TimeFormat-16                538ns ± 0%     535ns ± 0%   -0.63%  (p=0.024 n=5+5)

name                      old speed      new speed      delta
GobDecode-16              70.7MB/s ± 4%  71.4MB/s ± 5%     ~     (p=0.452 n=5+5)
GobEncode-16              80.7MB/s ± 5%  81.2MB/s ± 2%     ~     (p=1.000 n=5+5)
Gzip-16                   58.2MB/s ± 2%  57.7MB/s ± 2%     ~     (p=0.452 n=5+5)
Gunzip-16                  302MB/s ± 1%   299MB/s ± 1%   -0.99%  (p=0.008 n=5+5)
JSONEncode-16             92.4MB/s ± 1%  89.1MB/s ± 0%   -3.63%  (p=0.016 n=5+4)
JSONDecode-16             20.4MB/s ± 0%  20.3MB/s ± 1%     ~     (p=0.135 n=5+5)
GoParse-16                10.6MB/s ± 2%  10.8MB/s ± 1%   +2.00%  (p=0.016 n=5+5)
RegexpMatchEasy0_32-16     286MB/s ± 1%   285MB/s ± 3%     ~     (p=1.000 n=5+5)
RegexpMatchEasy0_1K-16    2.51GB/s ± 1%  2.49GB/s ± 2%     ~     (p=0.095 n=5+5)
RegexpMatchEasy1_32-16     309MB/s ± 1%   307MB/s ± 1%     ~     (p=0.548 n=5+5)
RegexpMatchEasy1_1K-16    1.55GB/s ± 2%  1.57GB/s ± 1%     ~     (p=0.690 n=5+5)
RegexpMatchMedium_32-16   5.68MB/s ± 2%  5.73MB/s ± 1%     ~     (p=0.579 n=5+5)
RegexpMatchMedium_1K-16   17.5MB/s ± 4%  17.8MB/s ± 4%     ~     (p=0.500 n=5+5)
RegexpMatchHard_32-16     10.4MB/s ± 3%  10.5MB/s ± 4%     ~     (p=0.460 n=5+5)
RegexpMatchHard_1K-16     11.5MB/s ± 1%  11.7MB/s ± 2%   +1.57%  (p=0.032 n=5+5)
Revcomp-16                 442MB/s ± 0%   433MB/s ± 2%   -2.05%  (p=0.032 n=4+5)
Template-16               17.7MB/s ± 1%  18.2MB/s ± 3%   +3.12%  (p=0.032 n=5+5)

Change-Id: I6972e8f35f2b31f9a42ac473a6bf419a18022558
Reviewed-on: https://go-review.googlesource.com/100935
Run-TryBot: Giovanni Bajo <rasky@develer.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
This commit is contained in:
Giovanni Bajo 2018-03-05 20:59:40 +01:00
parent 423111081b
commit a35ec9a59e
9 changed files with 5238 additions and 117 deletions

View file

@ -398,7 +398,18 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.To.Type = obj.TYPE_REG
p.To.Reg = r
case ssa.OpAMD64CMOVQEQ, ssa.OpAMD64CMOVLEQ:
case ssa.OpAMD64CMOVQEQ, ssa.OpAMD64CMOVLEQ, ssa.OpAMD64CMOVWEQ,
ssa.OpAMD64CMOVQLT, ssa.OpAMD64CMOVLLT, ssa.OpAMD64CMOVWLT,
ssa.OpAMD64CMOVQNE, ssa.OpAMD64CMOVLNE, ssa.OpAMD64CMOVWNE,
ssa.OpAMD64CMOVQGT, ssa.OpAMD64CMOVLGT, ssa.OpAMD64CMOVWGT,
ssa.OpAMD64CMOVQLE, ssa.OpAMD64CMOVLLE, ssa.OpAMD64CMOVWLE,
ssa.OpAMD64CMOVQGE, ssa.OpAMD64CMOVLGE, ssa.OpAMD64CMOVWGE,
ssa.OpAMD64CMOVQHI, ssa.OpAMD64CMOVLHI, ssa.OpAMD64CMOVWHI,
ssa.OpAMD64CMOVQLS, ssa.OpAMD64CMOVLLS, ssa.OpAMD64CMOVWLS,
ssa.OpAMD64CMOVQCC, ssa.OpAMD64CMOVLCC, ssa.OpAMD64CMOVWCC,
ssa.OpAMD64CMOVQCS, ssa.OpAMD64CMOVLCS, ssa.OpAMD64CMOVWCS,
ssa.OpAMD64CMOVQGTF, ssa.OpAMD64CMOVLGTF, ssa.OpAMD64CMOVWGTF,
ssa.OpAMD64CMOVQGEF, ssa.OpAMD64CMOVLGEF, ssa.OpAMD64CMOVWGEF:
r := v.Reg()
if r != v.Args[0].Reg() {
v.Fatalf("input[0] and output not in same register %s", v.LongString())
@ -409,6 +420,71 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.To.Type = obj.TYPE_REG
p.To.Reg = r
case ssa.OpAMD64CMOVQNEF, ssa.OpAMD64CMOVLNEF, ssa.OpAMD64CMOVWNEF:
r := v.Reg()
if r != v.Args[0].Reg() {
v.Fatalf("input[0] and output not in same register %s", v.LongString())
}
// Flag condition: ^ZERO || PARITY
// Generate:
// CMOV*NE SRC,DST
// CMOV*PS SRC,DST
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = v.Args[1].Reg()
p.To.Type = obj.TYPE_REG
p.To.Reg = r
var q *obj.Prog
if v.Op == ssa.OpAMD64CMOVQNEF {
q = s.Prog(x86.ACMOVQPS)
} else if v.Op == ssa.OpAMD64CMOVLNEF {
q = s.Prog(x86.ACMOVLPS)
} else {
q = s.Prog(x86.ACMOVWPS)
}
q.From.Type = obj.TYPE_REG
q.From.Reg = v.Args[1].Reg()
q.To.Type = obj.TYPE_REG
q.To.Reg = r
case ssa.OpAMD64CMOVQEQF, ssa.OpAMD64CMOVLEQF, ssa.OpAMD64CMOVWEQF:
r := v.Reg()
if r != v.Args[0].Reg() {
v.Fatalf("input[0] and output not in same register %s", v.LongString())
}
// Flag condition: ZERO && !PARITY
// Generate:
// MOV SRC,AX
// CMOV*NE DST,AX
// CMOV*PC AX,DST
//
// TODO(rasky): we could generate:
// CMOV*NE DST,SRC
// CMOV*PC SRC,DST
// But this requires a way for regalloc to know that SRC might be
// clobbered by this instruction.
if v.Args[1].Reg() != x86.REG_AX {
opregreg(s, moveByType(v.Type), x86.REG_AX, v.Args[1].Reg())
}
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = r
p.To.Type = obj.TYPE_REG
p.To.Reg = x86.REG_AX
var q *obj.Prog
if v.Op == ssa.OpAMD64CMOVQEQF {
q = s.Prog(x86.ACMOVQPC)
} else if v.Op == ssa.OpAMD64CMOVLEQF {
q = s.Prog(x86.ACMOVLPC)
} else {
q = s.Prog(x86.ACMOVWPC)
}
q.From.Type = obj.TYPE_REG
q.From.Reg = x86.REG_AX
q.To.Type = obj.TYPE_REG
q.To.Reg = r
case ssa.OpAMD64MULQconst, ssa.OpAMD64MULLconst:
r := v.Reg()
p := s.Prog(v.Op.Asm())