mirror of
https://github.com/golang/go.git
synced 2025-10-22 12:33:19 +00:00

Use the FMV* instructions to move values between the floating point and integer register files.

Note: I'm unsure why there is a slowdown in the Float32bits benchmark. I've checked, and an FMVXS instruction is being used as expected. There are multiple loads and other instructions in the main loop.

goos: linux
goarch: riscv64
pkg: math
cpu: Spacemit(R) X60
                     │ fmv-before.txt │           fmv-after.txt            │
                     │     sec/op     │    sec/op     vs base              │
Acos                   122.7n ± 0%      122.7n ± 0%        ~ (p=1.000 n=10)
Acosh                  197.2n ± 0%      191.5n ± 0%   -2.89% (p=0.000 n=10)
Asin                   122.7n ± 0%      122.7n ± 0%        ~ (p=0.474 n=10)
Asinh                  231.0n ± 0%      224.1n ± 0%   -2.99% (p=0.000 n=10)
Atan                   91.39n ± 0%      91.41n ± 0%        ~ (p=0.465 n=10)
Atanh                  210.3n ± 0%      203.4n ± 0%   -3.26% (p=0.000 n=10)
Atan2                  149.6n ± 0%      149.6n ± 0%        ~ (p=0.721 n=10)
Cbrt                   176.5n ± 0%      165.9n ± 0%   -6.01% (p=0.000 n=10)
Ceil                   25.67n ± 0%      24.42n ± 0%   -4.87% (p=0.000 n=10)
Copysign               3.756n ± 0%      3.756n ± 0%        ~ (p=0.149 n=10)
Cos                    95.15n ± 0%      95.15n ± 0%        ~ (p=0.374 n=10)
Cosh                   228.6n ± 0%      224.7n ± 0%   -1.71% (p=0.000 n=10)
Erf                    115.2n ± 0%      115.2n ± 0%        ~ (p=0.474 n=10)
Erfc                   116.4n ± 0%      116.4n ± 0%        ~ (p=0.628 n=10)
Erfinv                 133.3n ± 0%      133.3n ± 0%        ~ (p=1.000 n=10)
Erfcinv                133.3n ± 0%      133.3n ± 0%        ~ (p=1.000 n=10)
Exp                    194.1n ± 0%      190.3n ± 0%   -1.93% (p=0.000 n=10)
ExpGo                  204.7n ± 0%      200.3n ± 0%   -2.15% (p=0.000 n=10)
Expm1                  137.7n ± 0%      135.2n ± 0%   -1.82% (p=0.000 n=10)
Exp2                   173.4n ± 0%      169.0n ± 0%   -2.54% (p=0.000 n=10)
Exp2Go                 182.8n ± 0%      178.4n ± 0%   -2.41% (p=0.000 n=10)
Abs                    3.756n ± 0%      3.756n ± 0%        ~ (p=0.157 n=10)
Dim                    12.52n ± 0%      12.52n ± 0%        ~ (p=0.737 n=10)
Floor                  25.67n ± 0%      24.42n ± 0%   -4.87% (p=0.000 n=10)
Max                    21.29n ± 0%      20.03n ± 0%   -5.92% (p=0.000 n=10)
Min                    21.28n ± 0%      20.04n ± 0%   -5.85% (p=0.000 n=10)
Mod                    344.9n ± 0%      319.2n ± 0%   -7.45% (p=0.000 n=10)
Frexp                  55.71n ± 0%      48.85n ± 0%  -12.30% (p=0.000 n=10)
Gamma                  165.9n ± 0%      167.8n ± 0%   +1.15% (p=0.000 n=10)
Hypot                  73.24n ± 0%      70.74n ± 0%   -3.41% (p=0.000 n=10)
HypotGo                84.50n ± 0%      82.63n ± 0%   -2.21% (p=0.000 n=10)
Ilogb                  49.45n ± 0%      45.70n ± 0%   -7.59% (p=0.000 n=10)
J0                     556.5n ± 0%      544.0n ± 0%   -2.25% (p=0.000 n=10)
J1                     555.3n ± 0%      542.8n ± 0%   -2.24% (p=0.000 n=10)
Jn                     1.181µ ± 0%      1.156µ ± 0%   -2.12% (p=0.000 n=10)
Ldexp                  59.47n ± 0%      53.84n ± 0%   -9.47% (p=0.000 n=10)
Lgamma                 167.2n ± 0%      154.6n ± 0%   -7.51% (p=0.000 n=10)
Log                    160.9n ± 0%      154.6n ± 0%   -3.92% (p=0.000 n=10)
Logb                   49.45n ± 0%      45.70n ± 0%   -7.58% (p=0.000 n=10)
Log1p                  147.1n ± 0%      137.1n ± 0%   -6.80% (p=0.000 n=10)
Log10                  162.1n ± 1%      154.6n ± 0%   -4.63% (p=0.000 n=10)
Log2                   66.99n ± 0%      60.72n ± 0%   -9.36% (p=0.000 n=10)
Modf                   29.42n ± 0%      26.29n ± 0%  -10.64% (p=0.000 n=10)
Nextafter32            41.95n ± 0%      37.88n ± 0%   -9.70% (p=0.000 n=10)
Nextafter64            38.82n ± 0%      33.49n ± 0%  -13.73% (p=0.000 n=10)
PowInt                 252.3n ± 0%      237.3n ± 0%   -5.95% (p=0.000 n=10)
PowFrac                615.5n ± 0%      589.7n ± 0%   -4.19% (p=0.000 n=10)
Pow10Pos               10.64n ± 0%      10.64n ± 0%        ~ (p=1.000 n=10)
Pow10Neg               24.42n ± 0%      15.02n ± 0%  -38.49% (p=0.000 n=10)
Round                  21.91n ± 0%      18.16n ± 0%  -17.12% (p=0.000 n=10)
RoundToEven            24.42n ± 0%      21.29n ± 0%  -12.84% (p=0.000 n=10)
Remainder              308.0n ± 0%      291.2n ± 0%   -5.44% (p=0.000 n=10)
Signbit                10.02n ± 0%      10.02n ± 0%        ~ (p=1.000 n=10)
Sin                    102.7n ± 0%      102.7n ± 0%        ~ (p=0.211 n=10)
Sincos                 124.0n ± 1%      123.3n ± 0%   -0.56% (p=0.002 n=10)
Sinh                   239.1n ± 0%      234.7n ± 0%   -1.84% (p=0.000 n=10)
SqrtIndirect           2.504n ± 0%      2.504n ± 0%        ~ (p=0.303 n=10)
SqrtLatency            15.03n ± 0%      15.02n ± 0%        ~ (p=0.598 n=10)
SqrtIndirectLatency    15.02n ± 0%      15.02n ± 0%        ~ (p=0.907 n=10)
SqrtGoLatency          165.3n ± 0%      157.2n ± 0%   -4.90% (p=0.000 n=10)
SqrtPrime              3.801µ ± 0%      3.802µ ± 0%        ~ (p=1.000 n=10)
Tan                    125.2n ± 0%      125.2n ± 0%        ~ (p=0.458 n=10)
Tanh                   244.2n ± 0%      239.9n ± 0%   -1.76% (p=0.000 n=10)
Trunc                  25.67n ± 0%      24.42n ± 0%   -4.87% (p=0.000 n=10)
Y0                     550.2n ± 0%      538.1n ± 0%   -2.21% (p=0.000 n=10)
Y1                     552.8n ± 0%      540.6n ± 0%   -2.21% (p=0.000 n=10)
Yn                     1.168µ ± 0%      1.143µ ± 0%   -2.14% (p=0.000 n=10)
Float64bits            8.139n ± 0%      4.385n ± 0%  -46.13% (p=0.000 n=10)
Float64frombits        7.512n ± 0%      3.759n ± 0%  -49.96% (p=0.000 n=10)
Float32bits            8.138n ± 0%      9.393n ± 0%  +15.42% (p=0.000 n=10)
Float32frombits        7.513n ± 0%      3.757n ± 0%  -49.98% (p=0.000 n=10)
FMA                    3.756n ± 0%      3.756n ± 0%        ~ (p=0.246 n=10)
geomean                77.43n           72.42n        -6.47%

Change-Id: I8dac69b1d17cb3d2af78d1c844d2b5d80000d667
Reviewed-on: https://go-review.googlesource.com/c/go/+/599235
Reviewed-by: Keith Randall <khr@google.com>
Auto-Submit: Michael Munday <mikemndy@gmail.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Keith Randall <khr@golang.org>
266 lines
6.5 KiB
Go
266 lines
6.5 KiB
Go
// asmcheck
|
|
|
|
// Copyright 2018 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
package codegen
|
|
|
|
import "math"
|
|
|
|
// sink64 is a package-level array that functions in this file (approx, abs
// and copysign) store their float64 results into.
// NOTE(review): presumably the stores exist so the computed values count as
// used and the operations under test stay in the generated code - confirm.
var sink64 [8]float64
|
|
|
|
// approx stores math.Ceil, math.Floor, math.Round, math.Trunc and
// math.RoundToEven of x into sink64[0] through sink64[4], in that order.
//
// The directive comments placed directly above each assignment list, per
// architecture, the rounding instruction pattern to look for in the code
// generated for that assignment. Patterns written with a leading '-' are
// presumably ones that must NOT appear (here, the x86HasSSE41 feature check
// on amd64/v2 and amd64/v3) - TODO confirm against the asmcheck harness.
// Keep each directive group adjacent to its statement when editing.
func approx(x float64) {
|
|
// amd64/v2:-".*x86HasSSE41" amd64/v3:-".*x86HasSSE41"
|
|
// amd64:"ROUNDSD\t[$]2"
|
|
// s390x:"FIDBR\t[$]6"
|
|
// arm64:"FRINTPD"
|
|
// ppc64x:"FRIP"
|
|
// wasm:"F64Ceil"
|
|
sink64[0] = math.Ceil(x)
|
|
|
|
// amd64/v2:-".*x86HasSSE41" amd64/v3:-".*x86HasSSE41"
|
|
// amd64:"ROUNDSD\t[$]1"
|
|
// s390x:"FIDBR\t[$]7"
|
|
// arm64:"FRINTMD"
|
|
// ppc64x:"FRIM"
|
|
// wasm:"F64Floor"
|
|
sink64[1] = math.Floor(x)
|
|
|
|
// s390x:"FIDBR\t[$]1"
|
|
// arm64:"FRINTAD"
|
|
// ppc64x:"FRIN"
|
|
sink64[2] = math.Round(x)
|
|
|
|
// amd64/v2:-".*x86HasSSE41" amd64/v3:-".*x86HasSSE41"
|
|
// amd64:"ROUNDSD\t[$]3"
|
|
// s390x:"FIDBR\t[$]5"
|
|
// arm64:"FRINTZD"
|
|
// ppc64x:"FRIZ"
|
|
// wasm:"F64Trunc"
|
|
sink64[3] = math.Trunc(x)
|
|
|
|
// amd64/v2:-".*x86HasSSE41" amd64/v3:-".*x86HasSSE41"
|
|
// amd64:"ROUNDSD\t[$]0"
|
|
// s390x:"FIDBR\t[$]4"
|
|
// arm64:"FRINTND"
|
|
// wasm:"F64Nearest"
|
|
sink64[4] = math.RoundToEven(x)
|
|
}
|
|
|
|
// sqrt returns math.Sqrt(x).
//
// The directives above the return statement list, per architecture, the
// double-precision square-root instruction pattern to look for. The
// softfloat variants (386, mips, mips64) carry a leading '-', presumably
// meaning the hardware instruction must be absent there - TODO confirm.
func sqrt(x float64) float64 {
|
|
// amd64:"SQRTSD"
|
|
// 386/sse2:"SQRTSD" 386/softfloat:-"SQRTD"
|
|
// arm64:"FSQRTD"
|
|
// arm/7:"SQRTD"
|
|
// mips/hardfloat:"SQRTD" mips/softfloat:-"SQRTD"
|
|
// mips64/hardfloat:"SQRTD" mips64/softfloat:-"SQRTD"
|
|
// wasm:"F64Sqrt"
|
|
// ppc64x:"FSQRT"
|
|
// riscv64: "FSQRTD"
|
|
return math.Sqrt(x)
|
|
}
|
|
|
|
// sqrt32 returns the square root of x, written as a float32 -> float64 ->
// float32 round trip through math.Sqrt.
//
// The directives name single-precision patterns (SQRTSS, FSQRTS, SQRTF,
// F32Sqrt), so the check appears to be that the round trip is compiled to a
// float32 square root - confirm against the compiler rules.
func sqrt32(x float32) float32 {
|
|
// amd64:"SQRTSS"
|
|
// 386/sse2:"SQRTSS" 386/softfloat:-"SQRTS"
|
|
// arm64:"FSQRTS"
|
|
// arm/7:"SQRTF"
|
|
// mips/hardfloat:"SQRTF" mips/softfloat:-"SQRTF"
|
|
// mips64/hardfloat:"SQRTF" mips64/softfloat:-"SQRTF"
|
|
// wasm:"F32Sqrt"
|
|
// ppc64x:"FSQRTS"
|
|
// riscv64: "FSQRTS"
|
|
return float32(math.Sqrt(float64(x)))
|
|
}
|
|
|
|
// Check that it's using integer registers
// NOTE(review): only the amd64 directives (BTRQ on bit 63) look like integer
// bit operations; the other targets name what appear to be floating-point
// absolute-value instructions, and s390x explicitly excludes MOVD (no integer
// load/store). The sentence above may describe only some targets - confirm.
//
// abs stores math.Abs(x) into sink64[0] and -math.Abs(y) into sink64[1].
// Each directive group applies to the assignment directly below it.
|
|
func abs(x, y float64) {
|
|
// amd64:"BTRQ\t[$]63"
|
|
// arm64:"FABSD\t"
|
|
// s390x:"LPDFR\t",-"MOVD\t" (no integer load/store)
|
|
// ppc64x:"FABS\t"
|
|
// riscv64:"FABSD\t"
|
|
// wasm:"F64Abs"
|
|
// arm/6:"ABSD\t"
|
|
// mips64/hardfloat:"ABSD\t"
|
|
// mips/hardfloat:"ABSD\t"
|
|
sink64[0] = math.Abs(x)
|
|
|
|
// amd64:"BTRQ\t[$]63","PXOR" (TODO: this should be BTSQ)
|
|
// s390x:"LNDFR\t",-"MOVD\t" (no integer load/store)
|
|
// ppc64x:"FNABS\t"
|
|
sink64[1] = -math.Abs(y)
|
|
}
|
|
|
|
// Check that it's using integer registers
//
// abs32 returns the absolute value of x, written as a float32 -> float64 ->
// float32 round trip through math.Abs. The only directive is for s390x: it
// looks for LPDFR and excludes LDEBR and LEDBR (no float64 conversion).
|
|
func abs32(x float32) float32 {
|
|
// s390x:"LPDFR",-"LDEBR",-"LEDBR" (no float64 conversion)
|
|
return float32(math.Abs(float64(x)))
|
|
}
|
|
|
|
// Check that it's using integer registers
//
// copysign stores four sign-manipulation results into sink64[0..3]:
//   - sink64[0]: math.Copysign(a, b), both operands variable
//   - sink64[1]: math.Copysign(c, -1), constant negative sign source
//   - sink64[2]: a with bit 63 set through Float64bits/Float64frombits
//   - sink64[3]: math.Copysign(-1, c), constant magnitude
//
// Each directive group applies to the assignment directly below it.
|
|
func copysign(a, b, c float64) {
|
|
// amd64:"BTRQ\t[$]63","ANDQ","ORQ"
|
|
// s390x:"CPSDR",-"MOVD" (no integer load/store)
|
|
// ppc64x:"FCPSGN"
|
|
// riscv64:"FSGNJD"
|
|
// wasm:"F64Copysign"
|
|
sink64[0] = math.Copysign(a, b)
|
|
|
|
// amd64:"BTSQ\t[$]63"
|
|
// s390x:"LNDFR\t",-"MOVD\t" (no integer load/store)
|
|
// ppc64x:"FCPSGN"
|
|
// riscv64:"FSGNJD"
|
|
// arm64:"ORR", -"AND"
|
|
sink64[1] = math.Copysign(c, -1)
|
|
|
|
// Same operation as the math.Copysign(c, -1) case above, but applied to a
|
|
// and written with integer operations. Useful for platforms that have a
// copysign opcode to see if it's detected.
|
|
// s390x:"LNDFR\t",-"MOVD\t" (no integer load/store)
|
|
sink64[2] = math.Float64frombits(math.Float64bits(a) | 1<<63)
|
|
|
|
// amd64:"ANDQ","ORQ"
|
|
// s390x:"CPSDR\t",-"MOVD\t" (no integer load/store)
|
|
// ppc64x:"FCPSGN"
|
|
// riscv64:"FSGNJD"
|
|
sink64[3] = math.Copysign(-1, c)
|
|
}
|
|
|
|
// fma returns math.FMA(x, y, z).
//
// The directives list the fused multiply-add instruction pattern to look for
// on each architecture. The amd64/v3 entry has a leading '-' on the
// x86HasFMA pattern, presumably meaning no runtime feature check should be
// emitted at that level - TODO confirm.
func fma(x, y, z float64) float64 {
|
|
// amd64/v3:-".*x86HasFMA"
|
|
// amd64:"VFMADD231SD"
|
|
// arm/6:"FMULAD"
|
|
// arm64:"FMADDD"
|
|
// loong64:"FMADDD"
|
|
// s390x:"FMADD"
|
|
// ppc64x:"FMADD"
|
|
// riscv64:"FMADDD"
|
|
return math.FMA(x, y, z)
|
|
}
|
|
|
|
// fms returns math.FMA(x, y, -z), i.e. a fused multiply-add with the addend
// negated. The riscv64 directive looks for FMSUBD on the return statement.
func fms(x, y, z float64) float64 {
|
|
// riscv64:"FMSUBD"
|
|
return math.FMA(x, y, -z)
|
|
}
|
|
|
|
// fnms returns math.FMA(-x, y, z), i.e. a fused multiply-add with the first
// multiplicand negated. The riscv64 directive looks for FNMSUBD and lists
// FNMADDD with a leading '-' (presumably: must not appear - TODO confirm).
func fnms(x, y, z float64) float64 {
|
|
// riscv64:"FNMSUBD",-"FNMADDD"
|
|
return math.FMA(-x, y, z)
|
|
}
|
|
|
|
// fnma returns math.FMA(x, -y, -z), i.e. a fused multiply-add with both the
// second multiplicand and the addend negated. The riscv64 directive looks for
// FNMADDD and lists FNMSUBD with a leading '-' (presumably: must not appear -
// TODO confirm).
func fnma(x, y, z float64) float64 {
|
|
// riscv64:"FNMADDD",-"FNMSUBD"
|
|
return math.FMA(x, -y, -z)
|
|
}
|
|
|
|
// fromFloat64 returns math.Float64bits(f64+1) + 1.
//
// The directives look for a direct move from a floating-point register to an
// integer register (for example the amd64 pattern requires an X source and a
// non-X destination).
// NOTE(review): the float add before the conversion and the integer add
// after it presumably force the value to live in a float register first and
// an integer register afterwards - confirm.
func fromFloat64(f64 float64) uint64 {
|
|
// amd64:"MOVQ\tX.*, [^X].*"
|
|
// arm64:"FMOVD\tF.*, R.*"
|
|
// loong64:"MOVV\tF.*, R.*"
|
|
// ppc64x:"MFVSRD"
|
|
// mips64/hardfloat:"MOVV\tF.*, R.*"
|
|
// riscv64:"FMVXD"
|
|
return math.Float64bits(f64+1) + 1
|
|
}
|
|
|
|
// fromFloat32 returns math.Float32bits(f32+1) + 1.
//
// It is the 32-bit counterpart of fromFloat64: the directives look for a
// direct move from a floating-point register to an integer register.
// NOTE(review): the float add before and integer add after the conversion
// presumably pin the value to a float register and then an integer register
// - confirm.
func fromFloat32(f32 float32) uint32 {
|
|
// amd64:"MOVL\tX.*, [^X].*"
|
|
// arm64:"FMOVS\tF.*, R.*"
|
|
// loong64:"MOVW\tF.*, R.*"
|
|
// mips64/hardfloat:"MOVW\tF.*, R.*"
|
|
// riscv64:"FMVXW"
|
|
return math.Float32bits(f32+1) + 1
|
|
}
|
|
|
|
// toFloat64 returns math.Float64frombits(u64+1) + 1.
//
// The directives look for a direct move from an integer register to a
// floating-point register (for example the amd64 pattern requires a non-X
// source and an X destination).
// NOTE(review): the integer add before the conversion and the float add
// after it presumably force the value to live in an integer register first
// and a float register afterwards - confirm.
func toFloat64(u64 uint64) float64 {
|
|
// amd64:"MOVQ\t[^X].*, X.*"
|
|
// arm64:"FMOVD\tR.*, F.*"
|
|
// loong64:"MOVV\tR.*, F.*"
|
|
// ppc64x:"MTVSRD"
|
|
// mips64/hardfloat:"MOVV\tR.*, F.*"
|
|
// riscv64:"FMVDX"
|
|
return math.Float64frombits(u64+1) + 1
|
|
}
|
|
|
|
// toFloat32 returns math.Float32frombits(u32+1) + 1.
//
// It is the 32-bit counterpart of toFloat64: the directives look for a direct
// move from an integer register to a floating-point register.
// NOTE(review): the integer add before and float add after the conversion
// presumably pin the value to an integer register and then a float register
// - confirm.
func toFloat32(u32 uint32) float32 {
|
|
// amd64:"MOVL\t[^X].*, X.*"
|
|
// arm64:"FMOVS\tR.*, F.*"
|
|
// loong64:"MOVW\tR.*, F.*"
|
|
// mips64/hardfloat:"MOVW\tR.*, F.*"
|
|
// riscv64:"FMVWX"
|
|
return math.Float32frombits(u32+1) + 1
|
|
}
|
|
|
|
// Test that comparisons with constants converted to float
|
|
// are evaluated at compile-time
|
|
|
|
// constantCheck64 returns the result of two float64 comparisons whose
// operands are all constants: 0.5 == 1 and 1.5 > 2^63. Both are false, so the
// function always returns false.
//
// The directives look for a constant zero being produced (a move of $0, or on
// amd64 alternatively an XORL of two registers) and list the floating-point
// compare and the move of $1 with a leading '-' (presumably: must not appear
// - TODO confirm).
func constantCheck64() bool {
|
|
// amd64:"(MOVB\t[$]0)|(XORL\t[A-Z][A-Z0-9]+, [A-Z][A-Z0-9]+)",-"FCMP",-"MOVB\t[$]1"
|
|
// s390x:"MOV(B|BZ|D)\t[$]0,",-"FCMPU",-"MOV(B|BZ|D)\t[$]1,"
|
|
return 0.5 == float64(uint32(1)) || 1.5 > float64(uint64(1<<63))
|
|
}
|
|
|
|
// constantCheck32 returns the result of two float32 comparisons whose
// operands are all constants: 0.5 <= 1 and 1.5 >= -2^31. Both are true, so
// the function always returns true.
//
// The directives look for a constant one being produced (a move of $1) and
// list the floating-point compare and the move of $0 with a leading '-'
// (presumably: must not appear - TODO confirm).
func constantCheck32() bool {
|
|
// amd64:"MOV(B|L)\t[$]1",-"FCMP",-"MOV(B|L)\t[$]0"
|
|
// s390x:"MOV(B|BZ|D)\t[$]1,",-"FCMPU",-"MOV(B|BZ|D)\t[$]0,"
|
|
return float32(0.5) <= float32(int64(1)) && float32(1.5) >= float32(int32(-1<<31))
|
|
}
|
|
|
|
// Test that integer constants are converted to floating point constants
|
|
// at compile-time
|
|
|
|
// constantConvert32 returns -x when x is greater than
// math.Float32frombits(0x3f800000), and x otherwise.
//
// The directives, attached to the if statement, look for the comparison
// operand being loaded as a float32 constant: most targets reference a
// constant symbol named f32.3f800000, ppc64x/power10 an XXSPLTIDP with the
// immediate 1065353216, and arm64 an FMOVS of the immediate 1.0.
func constantConvert32(x float32) float32 {
|
|
// amd64:"MOVSS\t[$]f32.3f800000\\(SB\\)"
|
|
// s390x:"FMOVS\t[$]f32.3f800000\\(SB\\)"
|
|
// ppc64x/power8:"FMOVS\t[$]f32.3f800000\\(SB\\)"
|
|
// ppc64x/power9:"FMOVS\t[$]f32.3f800000\\(SB\\)"
|
|
// ppc64x/power10:"XXSPLTIDP\t[$]1065353216, VS0"
|
|
// arm64:"FMOVS\t[$]\\(1.0\\)"
|
|
if x > math.Float32frombits(0x3f800000) {
|
|
return -x
|
|
}
|
|
return x
|
|
}
|
|
|
|
// constantConvertInt32 returns -x (unsigned negation) when x is greater than
// math.Float32bits(1), and x otherwise.
//
// It mirrors constantConvert32 in the integer direction. Every directive on
// the if statement has a leading '-' on a float move (MOVSS / FMOVS),
// presumably meaning the constant must not be loaded through a
// floating-point register - TODO confirm.
func constantConvertInt32(x uint32) uint32 {
|
|
// amd64:-"MOVSS"
|
|
// s390x:-"FMOVS"
|
|
// ppc64x:-"FMOVS"
|
|
// arm64:-"FMOVS"
|
|
if x > math.Float32bits(1) {
|
|
return -x
|
|
}
|
|
return x
|
|
}
|
|
|
|
// nanGenerate64 returns the sum of zero/zero, zero*inf and math.Sqrt(-1),
// where zero, inf (computed as 1/zero) and negone are local constants-by-value.
//
// The amd64 directives look for DIVSD, MULSD and SQRTSD on the three
// NaN-producing lines, i.e. those operations are expected to remain in the
// generated code, while 1/zero lists DIVSD with a leading '-' because, per
// the comment on that line, it can be constant propagated. On amd64/v3 the
// multiply is checked on the return statement as VFMADD231SD instead.
func nanGenerate64() float64 {
|
|
// Test to make sure we don't generate a NaN while constant propagating.
|
|
// See issue 36400.
|
|
zero := 0.0
|
|
// amd64:-"DIVSD"
|
|
inf := 1 / zero // +inf. We can constant propagate this one.
|
|
negone := -1.0
|
|
|
|
// amd64:"DIVSD"
|
|
z0 := zero / zero
|
|
// amd64/v1,amd64/v2:"MULSD"
|
|
z1 := zero * inf
|
|
// amd64:"SQRTSD"
|
|
z2 := math.Sqrt(negone)
|
|
// amd64/v3:"VFMADD231SD"
|
|
return z0 + z1 + z2
|
|
}
|
|
|
|
// nanGenerate32 is the float32 counterpart of nanGenerate64: it returns the
// sum of zero/zero and zero*inf, where zero is float32(0.0) and inf is
// computed as 1/zero.
//
// The amd64 directives look for DIVSS and MULSS on the two NaN-producing
// lines, while 1/zero lists DIVSS with a leading '-' because, per the comment
// on that line, it can be constant propagated. On amd64/v3 the multiply is
// checked on the return statement as VFMADD231SS instead.
func nanGenerate32() float32 {
|
|
zero := float32(0.0)
|
|
// amd64:-"DIVSS"
|
|
inf := 1 / zero // +inf. We can constant propagate this one.
|
|
|
|
// amd64:"DIVSS"
|
|
z0 := zero / zero
|
|
// amd64/v1,amd64/v2:"MULSS"
|
|
z1 := zero * inf
|
|
// amd64/v3:"VFMADD231SS"
|
|
return z0 + z1
|
|
}
|