mirror of
https://github.com/golang/go.git
synced 2025-12-08 06:10:04 +00:00
cmd/compile: add fma intrinsic for amd64
To permit ssa-level optimization, this change introduces an amd64 intrinsic
that generates the VFMADD231SD instruction for the fused-multiply-add
operation on systems that support it. System support is detected via
cpu.X86.HasFMA. A rewrite rule can then translate the generic ssa intrinsic
("Fma") to VFMADD231SD.
The benchmark compares the software implementation (old) with the intrinsic
(new).
name old time/op new time/op delta
Fma-4 27.2ns ± 1% 1.0ns ± 9% -96.48% (p=0.008 n=5+5)
Updates #25819.
Change-Id: I966655e5f96817a5d06dff5942418a3915b09584
Reviewed-on: https://go-review.googlesource.com/c/go/+/137156
Run-TryBot: Keith Randall <khr@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
This commit is contained in:
parent
50f4896b72
commit
7a6da218b1
12 changed files with 85 additions and 0 deletions
|
|
@ -164,6 +164,14 @@ func duff(size int64) (int64, int64) {
|
||||||
|
|
||||||
func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
|
func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
|
||||||
switch v.Op {
|
switch v.Op {
|
||||||
|
case ssa.OpAMD64VFMADD231SD:
|
||||||
|
p := s.Prog(v.Op.Asm())
|
||||||
|
p.From = obj.Addr{Type: obj.TYPE_REG, Reg: v.Args[2].Reg()}
|
||||||
|
p.To = obj.Addr{Type: obj.TYPE_REG, Reg: v.Reg()}
|
||||||
|
p.SetFrom3(obj.Addr{Type: obj.TYPE_REG, Reg: v.Args[1].Reg()})
|
||||||
|
if v.Reg() != v.Args[0].Reg() {
|
||||||
|
v.Fatalf("input[0] and output not in same register %s", v.LongString())
|
||||||
|
}
|
||||||
case ssa.OpAMD64ADDQ, ssa.OpAMD64ADDL:
|
case ssa.OpAMD64ADDQ, ssa.OpAMD64ADDL:
|
||||||
r := v.Reg()
|
r := v.Reg()
|
||||||
r1 := v.Args[0].Reg()
|
r1 := v.Args[0].Reg()
|
||||||
|
|
|
||||||
|
|
@ -185,6 +185,7 @@ var runtimeDecls = [...]struct {
|
||||||
{"checkptrArithmetic", funcTag, 122},
|
{"checkptrArithmetic", funcTag, 122},
|
||||||
{"x86HasPOPCNT", varTag, 15},
|
{"x86HasPOPCNT", varTag, 15},
|
||||||
{"x86HasSSE41", varTag, 15},
|
{"x86HasSSE41", varTag, 15},
|
||||||
|
{"x86HasFMA", varTag, 15},
|
||||||
{"arm64HasATOMICS", varTag, 15},
|
{"arm64HasATOMICS", varTag, 15},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -241,4 +241,5 @@ func checkptrArithmetic(unsafe.Pointer, []unsafe.Pointer)
|
||||||
// architecture variants
|
// architecture variants
|
||||||
var x86HasPOPCNT bool
|
var x86HasPOPCNT bool
|
||||||
var x86HasSSE41 bool
|
var x86HasSSE41 bool
|
||||||
|
var x86HasFMA bool
|
||||||
var arm64HasATOMICS bool
|
var arm64HasATOMICS bool
|
||||||
|
|
|
||||||
|
|
@ -311,6 +311,7 @@ var (
|
||||||
racewriterange,
|
racewriterange,
|
||||||
x86HasPOPCNT,
|
x86HasPOPCNT,
|
||||||
x86HasSSE41,
|
x86HasSSE41,
|
||||||
|
x86HasFMA,
|
||||||
arm64HasATOMICS,
|
arm64HasATOMICS,
|
||||||
typedmemclr,
|
typedmemclr,
|
||||||
typedmemmove,
|
typedmemmove,
|
||||||
|
|
|
||||||
|
|
@ -91,6 +91,7 @@ func initssaconfig() {
|
||||||
racewriterange = sysfunc("racewriterange")
|
racewriterange = sysfunc("racewriterange")
|
||||||
x86HasPOPCNT = sysvar("x86HasPOPCNT") // bool
|
x86HasPOPCNT = sysvar("x86HasPOPCNT") // bool
|
||||||
x86HasSSE41 = sysvar("x86HasSSE41") // bool
|
x86HasSSE41 = sysvar("x86HasSSE41") // bool
|
||||||
|
x86HasFMA = sysvar("x86HasFMA") // bool
|
||||||
arm64HasATOMICS = sysvar("arm64HasATOMICS") // bool
|
arm64HasATOMICS = sysvar("arm64HasATOMICS") // bool
|
||||||
typedmemclr = sysfunc("typedmemclr")
|
typedmemclr = sysfunc("typedmemclr")
|
||||||
typedmemmove = sysfunc("typedmemmove")
|
typedmemmove = sysfunc("typedmemmove")
|
||||||
|
|
@ -3326,6 +3327,36 @@ func init() {
|
||||||
return s.newValue3(ssa.OpFma, types.Types[TFLOAT64], args[0], args[1], args[2])
|
return s.newValue3(ssa.OpFma, types.Types[TFLOAT64], args[0], args[1], args[2])
|
||||||
},
|
},
|
||||||
sys.ARM64, sys.PPC64, sys.S390X)
|
sys.ARM64, sys.PPC64, sys.S390X)
|
||||||
|
addF("math", "Fma",
|
||||||
|
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
|
||||||
|
addr := s.entryNewValue1A(ssa.OpAddr, types.Types[TBOOL].PtrTo(), x86HasFMA, s.sb)
|
||||||
|
v := s.load(types.Types[TBOOL], addr)
|
||||||
|
b := s.endBlock()
|
||||||
|
b.Kind = ssa.BlockIf
|
||||||
|
b.SetControl(v)
|
||||||
|
bTrue := s.f.NewBlock(ssa.BlockPlain)
|
||||||
|
bFalse := s.f.NewBlock(ssa.BlockPlain)
|
||||||
|
bEnd := s.f.NewBlock(ssa.BlockPlain)
|
||||||
|
b.AddEdgeTo(bTrue)
|
||||||
|
b.AddEdgeTo(bFalse)
|
||||||
|
b.Likely = ssa.BranchLikely // >= haswell cpus are common
|
||||||
|
|
||||||
|
// We have the intrinsic - use it directly.
|
||||||
|
s.startBlock(bTrue)
|
||||||
|
s.vars[n] = s.newValue3(ssa.OpFma, types.Types[TFLOAT64], args[0], args[1], args[2])
|
||||||
|
s.endBlock().AddEdgeTo(bEnd)
|
||||||
|
|
||||||
|
// Call the pure Go version.
|
||||||
|
s.startBlock(bFalse)
|
||||||
|
a := s.call(n, callNormal)
|
||||||
|
s.vars[n] = s.load(types.Types[TFLOAT64], a)
|
||||||
|
s.endBlock().AddEdgeTo(bEnd)
|
||||||
|
|
||||||
|
// Merge results.
|
||||||
|
s.startBlock(bEnd)
|
||||||
|
return s.variable(n, types.Types[TFLOAT64])
|
||||||
|
},
|
||||||
|
sys.AMD64)
|
||||||
|
|
||||||
makeRoundAMD64 := func(op ssa.Op) func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
|
makeRoundAMD64 := func(op ssa.Op) func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
|
||||||
return func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
|
return func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
|
||||||
|
|
|
||||||
|
|
@ -113,6 +113,7 @@
|
||||||
(Floor x) -> (ROUNDSD [1] x)
|
(Floor x) -> (ROUNDSD [1] x)
|
||||||
(Ceil x) -> (ROUNDSD [2] x)
|
(Ceil x) -> (ROUNDSD [2] x)
|
||||||
(Trunc x) -> (ROUNDSD [3] x)
|
(Trunc x) -> (ROUNDSD [3] x)
|
||||||
|
(Fma x y z) -> (VFMADD231SD z x y)
|
||||||
|
|
||||||
// Lowering extension
|
// Lowering extension
|
||||||
// Note: we always extend to 64 bits even though some ops don't need that many result bits.
|
// Note: we always extend to 64 bits even though some ops don't need that many result bits.
|
||||||
|
|
|
||||||
|
|
@ -147,6 +147,7 @@ func init() {
|
||||||
|
|
||||||
fp01 = regInfo{inputs: nil, outputs: fponly}
|
fp01 = regInfo{inputs: nil, outputs: fponly}
|
||||||
fp21 = regInfo{inputs: []regMask{fp, fp}, outputs: fponly}
|
fp21 = regInfo{inputs: []regMask{fp, fp}, outputs: fponly}
|
||||||
|
fp31 = regInfo{inputs: []regMask{fp, fp, fp}, outputs: fponly}
|
||||||
fp21load = regInfo{inputs: []regMask{fp, gpspsb, 0}, outputs: fponly}
|
fp21load = regInfo{inputs: []regMask{fp, gpspsb, 0}, outputs: fponly}
|
||||||
fpgp = regInfo{inputs: fponly, outputs: gponly}
|
fpgp = regInfo{inputs: fponly, outputs: gponly}
|
||||||
gpfp = regInfo{inputs: gponly, outputs: fponly}
|
gpfp = regInfo{inputs: gponly, outputs: fponly}
|
||||||
|
|
@ -478,6 +479,10 @@ func init() {
|
||||||
// Any use must be preceded by a successful check of runtime.x86HasSSE41.
|
// Any use must be preceded by a successful check of runtime.x86HasSSE41.
|
||||||
{name: "ROUNDSD", argLength: 1, reg: fp11, aux: "Int8", asm: "ROUNDSD"}, // rounds arg0 depending on auxint, 1 means math.Floor, 2 Ceil, 3 Trunc
|
{name: "ROUNDSD", argLength: 1, reg: fp11, aux: "Int8", asm: "ROUNDSD"}, // rounds arg0 depending on auxint, 1 means math.Floor, 2 Ceil, 3 Trunc
|
||||||
|
|
||||||
|
// VFMADD231SD only exists on platforms with the FMA3 instruction set.
|
||||||
|
// Any use must be preceded by a successful check of runtime.x86HasFMA.
|
||||||
|
{name: "VFMADD231SD", argLength: 3, reg: fp31, resultInArg0: true, asm: "VFMADD231SD"},
|
||||||
|
|
||||||
{name: "SBBQcarrymask", argLength: 1, reg: flagsgp, asm: "SBBQ"}, // (int64)(-1) if carry is set, 0 if carry is clear.
|
{name: "SBBQcarrymask", argLength: 1, reg: flagsgp, asm: "SBBQ"}, // (int64)(-1) if carry is set, 0 if carry is clear.
|
||||||
{name: "SBBLcarrymask", argLength: 1, reg: flagsgp, asm: "SBBL"}, // (int32)(-1) if carry is set, 0 if carry is clear.
|
{name: "SBBLcarrymask", argLength: 1, reg: flagsgp, asm: "SBBL"}, // (int32)(-1) if carry is set, 0 if carry is clear.
|
||||||
// Note: SBBW and SBBB are subsumed by SBBL
|
// Note: SBBW and SBBB are subsumed by SBBL
|
||||||
|
|
|
||||||
|
|
@ -743,6 +743,7 @@ const (
|
||||||
OpAMD64POPCNTL
|
OpAMD64POPCNTL
|
||||||
OpAMD64SQRTSD
|
OpAMD64SQRTSD
|
||||||
OpAMD64ROUNDSD
|
OpAMD64ROUNDSD
|
||||||
|
OpAMD64VFMADD231SD
|
||||||
OpAMD64SBBQcarrymask
|
OpAMD64SBBQcarrymask
|
||||||
OpAMD64SBBLcarrymask
|
OpAMD64SBBLcarrymask
|
||||||
OpAMD64SETEQ
|
OpAMD64SETEQ
|
||||||
|
|
@ -9625,6 +9626,22 @@ var opcodeTable = [...]opInfo{
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
name: "VFMADD231SD",
|
||||||
|
argLen: 3,
|
||||||
|
resultInArg0: true,
|
||||||
|
asm: x86.AVFMADD231SD,
|
||||||
|
reg: regInfo{
|
||||||
|
inputs: []inputInfo{
|
||||||
|
{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
|
||||||
|
{1, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
|
||||||
|
{2, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
|
||||||
|
},
|
||||||
|
outputs: []outputInfo{
|
||||||
|
{0, 4294901760}, // X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
{
|
{
|
||||||
name: "SBBQcarrymask",
|
name: "SBBQcarrymask",
|
||||||
argLen: 1,
|
argLen: 1,
|
||||||
|
|
|
||||||
|
|
@ -768,6 +768,8 @@ func rewriteValueAMD64(v *Value) bool {
|
||||||
return rewriteValueAMD64_OpEqPtr_0(v)
|
return rewriteValueAMD64_OpEqPtr_0(v)
|
||||||
case OpFloor:
|
case OpFloor:
|
||||||
return rewriteValueAMD64_OpFloor_0(v)
|
return rewriteValueAMD64_OpFloor_0(v)
|
||||||
|
case OpFma:
|
||||||
|
return rewriteValueAMD64_OpFma_0(v)
|
||||||
case OpGeq16:
|
case OpGeq16:
|
||||||
return rewriteValueAMD64_OpGeq16_0(v)
|
return rewriteValueAMD64_OpGeq16_0(v)
|
||||||
case OpGeq16U:
|
case OpGeq16U:
|
||||||
|
|
@ -52331,6 +52333,21 @@ func rewriteValueAMD64_OpFloor_0(v *Value) bool {
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
func rewriteValueAMD64_OpFma_0(v *Value) bool {
|
||||||
|
// match: (Fma x y z)
|
||||||
|
// cond:
|
||||||
|
// result: (VFMADD231SD z x y)
|
||||||
|
for {
|
||||||
|
z := v.Args[2]
|
||||||
|
x := v.Args[0]
|
||||||
|
y := v.Args[1]
|
||||||
|
v.reset(OpAMD64VFMADD231SD)
|
||||||
|
v.AddArg(z)
|
||||||
|
v.AddArg(x)
|
||||||
|
v.AddArg(y)
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
func rewriteValueAMD64_OpGeq16_0(v *Value) bool {
|
func rewriteValueAMD64_OpGeq16_0(v *Value) bool {
|
||||||
b := v.Block
|
b := v.Block
|
||||||
// match: (Geq16 x y)
|
// match: (Geq16 x y)
|
||||||
|
|
|
||||||
|
|
@ -23,6 +23,7 @@ var (
|
||||||
// TODO: deprecate these; use internal/cpu directly.
|
// TODO: deprecate these; use internal/cpu directly.
|
||||||
x86HasPOPCNT bool
|
x86HasPOPCNT bool
|
||||||
x86HasSSE41 bool
|
x86HasSSE41 bool
|
||||||
|
x86HasFMA bool
|
||||||
|
|
||||||
arm64HasATOMICS bool
|
arm64HasATOMICS bool
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -514,6 +514,7 @@ func cpuinit() {
|
||||||
// to guard execution of instructions that can not be assumed to be always supported.
|
// to guard execution of instructions that can not be assumed to be always supported.
|
||||||
x86HasPOPCNT = cpu.X86.HasPOPCNT
|
x86HasPOPCNT = cpu.X86.HasPOPCNT
|
||||||
x86HasSSE41 = cpu.X86.HasSSE41
|
x86HasSSE41 = cpu.X86.HasSSE41
|
||||||
|
x86HasFMA = cpu.X86.HasFMA
|
||||||
|
|
||||||
arm64HasATOMICS = cpu.ARM64.HasATOMICS
|
arm64HasATOMICS = cpu.ARM64.HasATOMICS
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -108,6 +108,7 @@ func copysign(a, b, c float64) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func fma(x, y, z float64) float64 {
|
func fma(x, y, z float64) float64 {
|
||||||
|
// amd64:"VFMADD231SD"
|
||||||
// arm64:"FMADDD"
|
// arm64:"FMADDD"
|
||||||
// s390x:"FMADD"
|
// s390x:"FMADD"
|
||||||
// ppc64:"FMADD"
|
// ppc64:"FMADD"
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue