cmd/compile: optimise float <-> int register moves on riscv64

Use the FMV* instructions to move values between the floating point and
integer register files.

Note: I'm unsure why there is a slowdown in the Float32bits benchmark,
I've checked and an FMVXS instruction is being used as expected. There
are multiple loads and other instructions in the main loop.

goos: linux
goarch: riscv64
pkg: math
cpu: Spacemit(R) X60
                    │ fmv-before.txt │            fmv-after.txt            │
                    │     sec/op     │   sec/op     vs base                │
Acos                     122.7n ± 0%   122.7n ± 0%        ~ (p=1.000 n=10)
Acosh                    197.2n ± 0%   191.5n ± 0%   -2.89% (p=0.000 n=10)
Asin                     122.7n ± 0%   122.7n ± 0%        ~ (p=0.474 n=10)
Asinh                    231.0n ± 0%   224.1n ± 0%   -2.99% (p=0.000 n=10)
Atan                     91.39n ± 0%   91.41n ± 0%        ~ (p=0.465 n=10)
Atanh                    210.3n ± 0%   203.4n ± 0%   -3.26% (p=0.000 n=10)
Atan2                    149.6n ± 0%   149.6n ± 0%        ~ (p=0.721 n=10)
Cbrt                     176.5n ± 0%   165.9n ± 0%   -6.01% (p=0.000 n=10)
Ceil                     25.67n ± 0%   24.42n ± 0%   -4.87% (p=0.000 n=10)
Copysign                 3.756n ± 0%   3.756n ± 0%        ~ (p=0.149 n=10)
Cos                      95.15n ± 0%   95.15n ± 0%        ~ (p=0.374 n=10)
Cosh                     228.6n ± 0%   224.7n ± 0%   -1.71% (p=0.000 n=10)
Erf                      115.2n ± 0%   115.2n ± 0%        ~ (p=0.474 n=10)
Erfc                     116.4n ± 0%   116.4n ± 0%        ~ (p=0.628 n=10)
Erfinv                   133.3n ± 0%   133.3n ± 0%        ~ (p=1.000 n=10)
Erfcinv                  133.3n ± 0%   133.3n ± 0%        ~ (p=1.000 n=10)
Exp                      194.1n ± 0%   190.3n ± 0%   -1.93% (p=0.000 n=10)
ExpGo                    204.7n ± 0%   200.3n ± 0%   -2.15% (p=0.000 n=10)
Expm1                    137.7n ± 0%   135.2n ± 0%   -1.82% (p=0.000 n=10)
Exp2                     173.4n ± 0%   169.0n ± 0%   -2.54% (p=0.000 n=10)
Exp2Go                   182.8n ± 0%   178.4n ± 0%   -2.41% (p=0.000 n=10)
Abs                      3.756n ± 0%   3.756n ± 0%        ~ (p=0.157 n=10)
Dim                      12.52n ± 0%   12.52n ± 0%        ~ (p=0.737 n=10)
Floor                    25.67n ± 0%   24.42n ± 0%   -4.87% (p=0.000 n=10)
Max                      21.29n ± 0%   20.03n ± 0%   -5.92% (p=0.000 n=10)
Min                      21.28n ± 0%   20.04n ± 0%   -5.85% (p=0.000 n=10)
Mod                      344.9n ± 0%   319.2n ± 0%   -7.45% (p=0.000 n=10)
Frexp                    55.71n ± 0%   48.85n ± 0%  -12.30% (p=0.000 n=10)
Gamma                    165.9n ± 0%   167.8n ± 0%   +1.15% (p=0.000 n=10)
Hypot                    73.24n ± 0%   70.74n ± 0%   -3.41% (p=0.000 n=10)
HypotGo                  84.50n ± 0%   82.63n ± 0%   -2.21% (p=0.000 n=10)
Ilogb                    49.45n ± 0%   45.70n ± 0%   -7.59% (p=0.000 n=10)
J0                       556.5n ± 0%   544.0n ± 0%   -2.25% (p=0.000 n=10)
J1                       555.3n ± 0%   542.8n ± 0%   -2.24% (p=0.000 n=10)
Jn                       1.181µ ± 0%   1.156µ ± 0%   -2.12% (p=0.000 n=10)
Ldexp                    59.47n ± 0%   53.84n ± 0%   -9.47% (p=0.000 n=10)
Lgamma                   167.2n ± 0%   154.6n ± 0%   -7.51% (p=0.000 n=10)
Log                      160.9n ± 0%   154.6n ± 0%   -3.92% (p=0.000 n=10)
Logb                     49.45n ± 0%   45.70n ± 0%   -7.58% (p=0.000 n=10)
Log1p                    147.1n ± 0%   137.1n ± 0%   -6.80% (p=0.000 n=10)
Log10                    162.1n ± 1%   154.6n ± 0%   -4.63% (p=0.000 n=10)
Log2                     66.99n ± 0%   60.72n ± 0%   -9.36% (p=0.000 n=10)
Modf                     29.42n ± 0%   26.29n ± 0%  -10.64% (p=0.000 n=10)
Nextafter32              41.95n ± 0%   37.88n ± 0%   -9.70% (p=0.000 n=10)
Nextafter64              38.82n ± 0%   33.49n ± 0%  -13.73% (p=0.000 n=10)
PowInt                   252.3n ± 0%   237.3n ± 0%   -5.95% (p=0.000 n=10)
PowFrac                  615.5n ± 0%   589.7n ± 0%   -4.19% (p=0.000 n=10)
Pow10Pos                 10.64n ± 0%   10.64n ± 0%        ~ (p=1.000 n=10)
Pow10Neg                 24.42n ± 0%   15.02n ± 0%  -38.49% (p=0.000 n=10)
Round                    21.91n ± 0%   18.16n ± 0%  -17.12% (p=0.000 n=10)
RoundToEven              24.42n ± 0%   21.29n ± 0%  -12.84% (p=0.000 n=10)
Remainder                308.0n ± 0%   291.2n ± 0%   -5.44% (p=0.000 n=10)
Signbit                  10.02n ± 0%   10.02n ± 0%        ~ (p=1.000 n=10)
Sin                      102.7n ± 0%   102.7n ± 0%        ~ (p=0.211 n=10)
Sincos                   124.0n ± 1%   123.3n ± 0%   -0.56% (p=0.002 n=10)
Sinh                     239.1n ± 0%   234.7n ± 0%   -1.84% (p=0.000 n=10)
SqrtIndirect             2.504n ± 0%   2.504n ± 0%        ~ (p=0.303 n=10)
SqrtLatency              15.03n ± 0%   15.02n ± 0%        ~ (p=0.598 n=10)
SqrtIndirectLatency      15.02n ± 0%   15.02n ± 0%        ~ (p=0.907 n=10)
SqrtGoLatency            165.3n ± 0%   157.2n ± 0%   -4.90% (p=0.000 n=10)
SqrtPrime                3.801µ ± 0%   3.802µ ± 0%        ~ (p=1.000 n=10)
Tan                      125.2n ± 0%   125.2n ± 0%        ~ (p=0.458 n=10)
Tanh                     244.2n ± 0%   239.9n ± 0%   -1.76% (p=0.000 n=10)
Trunc                    25.67n ± 0%   24.42n ± 0%   -4.87% (p=0.000 n=10)
Y0                       550.2n ± 0%   538.1n ± 0%   -2.21% (p=0.000 n=10)
Y1                       552.8n ± 0%   540.6n ± 0%   -2.21% (p=0.000 n=10)
Yn                       1.168µ ± 0%   1.143µ ± 0%   -2.14% (p=0.000 n=10)
Float64bits              8.139n ± 0%   4.385n ± 0%  -46.13% (p=0.000 n=10)
Float64frombits          7.512n ± 0%   3.759n ± 0%  -49.96% (p=0.000 n=10)
Float32bits              8.138n ± 0%   9.393n ± 0%  +15.42% (p=0.000 n=10)
Float32frombits          7.513n ± 0%   3.757n ± 0%  -49.98% (p=0.000 n=10)
FMA                      3.756n ± 0%   3.756n ± 0%        ~ (p=0.246 n=10)
geomean                  77.43n        72.42n        -6.47%

Change-Id: I8dac69b1d17cb3d2af78d1c844d2b5d80000d667
Reviewed-on: https://go-review.googlesource.com/c/go/+/599235
Reviewed-by: Keith Randall <khr@google.com>
Auto-Submit: Michael Munday <mikemndy@gmail.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Keith Randall <khr@golang.org>
This commit is contained in:
Michael Munday 2024-07-17 23:54:43 +01:00 committed by Gopher Robot
parent 7a1679d7ae
commit fcc036f03b
6 changed files with 372 additions and 3 deletions

View file

@ -453,7 +453,8 @@ func init() {
{name: "FNMSUBS", argLength: 3, reg: fp31, asm: "FNMSUBS", commutative: true, typ: "Float32"}, // -(arg0 * arg1) - arg2
{name: "FSQRTS", argLength: 1, reg: fp11, asm: "FSQRTS", typ: "Float32"}, // sqrt(arg0)
{name: "FNEGS", argLength: 1, reg: fp11, asm: "FNEGS", typ: "Float32"}, // -arg0
{name: "FMVSX", argLength: 1, reg: gpfp, asm: "FMVSX", typ: "Float32"}, // reinterpret arg0 as float
{name: "FMVSX", argLength: 1, reg: gpfp, asm: "FMVSX", typ: "Float32"}, // reinterpret arg0 as float32
{name: "FMVXS", argLength: 1, reg: fpgp, asm: "FMVXS", typ: "Int32"}, // reinterpret arg0 as int32, sign extended to 64 bits
{name: "FCVTSW", argLength: 1, reg: gpfp, asm: "FCVTSW", typ: "Float32"}, // float32(low 32 bits of arg0)
{name: "FCVTSL", argLength: 1, reg: gpfp, asm: "FCVTSL", typ: "Float32"}, // float32(arg0)
{name: "FCVTWS", argLength: 1, reg: fpgp, asm: "FCVTWS", typ: "Int32"}, // int32(arg0)
@ -480,7 +481,8 @@ func init() {
{name: "FNEGD", argLength: 1, reg: fp11, asm: "FNEGD", typ: "Float64"}, // -arg0
{name: "FABSD", argLength: 1, reg: fp11, asm: "FABSD", typ: "Float64"}, // abs(arg0)
{name: "FSGNJD", argLength: 2, reg: fp21, asm: "FSGNJD", typ: "Float64"}, // copy sign of arg1 to arg0
{name: "FMVDX", argLength: 1, reg: gpfp, asm: "FMVDX", typ: "Float64"}, // reinterpret arg0 as float
{name: "FMVDX", argLength: 1, reg: gpfp, asm: "FMVDX", typ: "Float64"}, // reinterpret arg0 as float64
{name: "FMVXD", argLength: 1, reg: fpgp, asm: "FMVXD", typ: "Int64"}, // reinterpret arg0 as int64
{name: "FCVTDW", argLength: 1, reg: gpfp, asm: "FCVTDW", typ: "Float64"}, // float64(low 32 bits of arg0)
{name: "FCVTDL", argLength: 1, reg: gpfp, asm: "FCVTDL", typ: "Float64"}, // float64(arg0)
{name: "FCVTWD", argLength: 1, reg: fpgp, asm: "FCVTWD", typ: "Int32"}, // int32(arg0)