go/test/codegen/floats.go

316 lines
6.6 KiB
Go
Raw Normal View History

// asmcheck
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package codegen
cmd/compile: use FCLASSD for subnormal checks on riscv64 Only implemented for 64 bit floating point operations for now. goos: linux goarch: riscv64 pkg: math cpu: Spacemit(R) X60 │ sec/op │ sec/op vs base │ Acos 154.1n ± 0% 154.1n ± 0% ~ (p=0.303 n=10) Acosh 215.8n ± 6% 226.7n ± 0% ~ (p=0.439 n=10) Asin 149.2n ± 1% 149.2n ± 0% ~ (p=0.700 n=10) Asinh 262.1n ± 0% 258.5n ± 0% -1.37% (p=0.000 n=10) Atan 99.48n ± 0% 99.49n ± 0% ~ (p=0.836 n=10) Atanh 244.9n ± 0% 243.8n ± 0% -0.43% (p=0.002 n=10) Atan2 158.2n ± 1% 153.3n ± 0% -3.10% (p=0.000 n=10) Cbrt 186.8n ± 0% 181.1n ± 0% -3.03% (p=0.000 n=10) Ceil 36.71n ± 1% 36.71n ± 0% ~ (p=0.434 n=10) Copysign 6.531n ± 1% 6.526n ± 0% ~ (p=0.268 n=10) Cos 98.19n ± 0% 95.40n ± 0% -2.84% (p=0.000 n=10) Cosh 233.1n ± 0% 222.6n ± 0% -4.50% (p=0.000 n=10) Erf 122.5n ± 0% 114.2n ± 0% -6.78% (p=0.000 n=10) Erfc 126.0n ± 1% 116.6n ± 0% -7.46% (p=0.000 n=10) Erfinv 138.8n ± 0% 138.6n ± 0% ~ (p=0.082 n=10) Erfcinv 140.0n ± 0% 139.7n ± 0% ~ (p=0.359 n=10) Exp 193.3n ± 0% 184.2n ± 0% -4.68% (p=0.000 n=10) ExpGo 204.8n ± 0% 194.5n ± 0% -5.03% (p=0.000 n=10) Expm1 152.5n ± 1% 145.0n ± 0% -4.92% (p=0.000 n=10) Exp2 174.5n ± 0% 164.2n ± 0% -5.85% (p=0.000 n=10) Exp2Go 184.4n ± 1% 175.4n ± 0% -4.88% (p=0.000 n=10) Abs 4.912n ± 0% 4.914n ± 0% ~ (p=0.283 n=10) Dim 15.50n ± 1% 15.52n ± 1% ~ (p=0.331 n=10) Floor 36.89n ± 1% 36.76n ± 1% ~ (p=0.325 n=10) Max 31.05n ± 1% 31.17n ± 1% ~ (p=0.628 n=10) Min 31.01n ± 0% 31.06n ± 0% ~ (p=0.767 n=10) Mod 294.1n ± 0% 245.6n ± 0% -16.52% (p=0.000 n=10) Frexp 44.86n ± 1% 35.20n ± 0% -21.53% (p=0.000 n=10) Gamma 195.8n ± 0% 185.4n ± 1% -5.29% (p=0.000 n=10) Hypot 84.91n ± 0% 84.54n ± 1% -0.43% (p=0.006 n=10) HypotGo 96.70n ± 0% 95.42n ± 1% -1.32% (p=0.000 n=10) Ilogb 45.03n ± 0% 35.07n ± 1% -22.10% (p=0.000 n=10) J0 634.5n ± 0% 627.2n ± 0% -1.16% (p=0.000 n=10) J1 644.5n ± 0% 636.9n ± 0% -1.18% (p=0.000 n=10) Jn 1.357µ ± 0% 1.344µ ± 0% -0.92% (p=0.000 n=10) Ldexp 49.89n ± 0% 39.96n ± 0% -19.90% (p=0.000 n=10) 
Lgamma 186.6n ± 0% 184.3n ± 0% -1.21% (p=0.000 n=10) Log 150.4n ± 0% 141.1n ± 0% -6.15% (p=0.000 n=10) Logb 46.70n ± 0% 35.89n ± 0% -23.15% (p=0.000 n=10) Log1p 164.1n ± 0% 163.9n ± 0% ~ (p=0.122 n=10) Log10 153.1n ± 0% 143.5n ± 0% -6.24% (p=0.000 n=10) Log2 58.83n ± 0% 49.75n ± 0% -15.43% (p=0.000 n=10) Modf 40.82n ± 1% 40.78n ± 0% ~ (p=0.239 n=10) Nextafter32 49.15n ± 0% 48.93n ± 0% -0.44% (p=0.011 n=10) Nextafter64 43.33n ± 0% 43.23n ± 0% ~ (p=0.228 n=10) PowInt 269.4n ± 0% 243.8n ± 0% -9.49% (p=0.000 n=10) PowFrac 618.0n ± 0% 571.7n ± 0% -7.48% (p=0.000 n=10) Pow10Pos 13.09n ± 0% 13.05n ± 0% -0.31% (p=0.003 n=10) Pow10Neg 30.99n ± 1% 30.99n ± 0% ~ (p=0.173 n=10) Round 23.73n ± 0% 23.65n ± 0% -0.36% (p=0.011 n=10) RoundToEven 27.87n ± 0% 27.73n ± 0% -0.48% (p=0.003 n=10) Remainder 282.1n ± 0% 249.6n ± 0% -11.52% (p=0.000 n=10) Signbit 11.46n ± 0% 11.42n ± 0% -0.39% (p=0.003 n=10) Sin 115.2n ± 0% 113.2n ± 0% -1.74% (p=0.000 n=10) Sincos 140.6n ± 0% 138.6n ± 0% -1.39% (p=0.000 n=10) Sinh 252.0n ± 0% 241.4n ± 0% -4.21% (p=0.000 n=10) SqrtIndirect 4.909n ± 0% 4.893n ± 0% -0.34% (p=0.021 n=10) SqrtLatency 19.57n ± 1% 19.57n ± 0% ~ (p=0.087 n=10) SqrtIndirectLatency 19.64n ± 0% 19.57n ± 0% -0.36% (p=0.025 n=10) SqrtGoLatency 198.1n ± 0% 197.4n ± 0% -0.35% (p=0.014 n=10) SqrtPrime 5.733µ ± 0% 5.725µ ± 0% ~ (p=0.116 n=10) Tan 149.1n ± 0% 146.8n ± 0% -1.54% (p=0.000 n=10) Tanh 248.2n ± 1% 238.1n ± 0% -4.05% (p=0.000 n=10) Trunc 36.86n ± 0% 36.70n ± 0% -0.43% (p=0.029 n=10) Y0 638.2n ± 0% 633.6n ± 0% -0.71% (p=0.000 n=10) Y1 641.8n ± 0% 636.1n ± 0% -0.87% (p=0.000 n=10) Yn 1.358µ ± 0% 1.345µ ± 0% -0.92% (p=0.000 n=10) Float64bits 5.721n ± 0% 5.709n ± 0% -0.22% (p=0.044 n=10) Float64frombits 4.905n ± 0% 4.893n ± 0% ~ (p=0.266 n=10) Float32bits 12.27n ± 0% 12.23n ± 0% ~ (p=0.122 n=10) Float32frombits 4.909n ± 0% 4.893n ± 0% -0.32% (p=0.024 n=10) FMA 6.556n ± 0% 6.526n ± 0% ~ (p=0.283 n=10) geomean 86.82n 83.75n -3.54% Change-Id: I522297a79646d76543d516accce291f5a3cea337 
Reviewed-on: https://go-review.googlesource.com/c/go/+/717560 Reviewed-by: Keith Randall <khr@golang.org> Reviewed-by: Keith Randall <khr@google.com> Auto-Submit: Keith Randall <khr@golang.org> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Junyang Shao <shaojunyang@google.com>
2025-08-24 00:15:29 +01:00
import "math"
// This file contains codegen tests related to arithmetic
// simplifications and optimizations on float types.
// For codegen tests on integer types, see arithmetic.go.
// --------------------- //
// Strength-reduce //
// --------------------- //
// Mul2 returns f multiplied by the constant 2.0.
// The directives in the body require the multiplication to be emitted as a
// floating-point add, with no multiply instruction, on every listed
// architecture.
func Mul2(f float64) float64 {
// 386/sse2:"ADDSD" -"MULSD"
// amd64:"ADDSD" -"MULSD"
// arm/7:"ADDD" -"MULD"
// arm64:"FADDD" -"FMULD"
// loong64:"ADDD" -"MULD"
// ppc64x:"FADD" -"FMUL"
// riscv64:"FADDD" -"FMULD"
return f * 2.0
}
// DivPow2 divides its three arguments by constant powers of two and returns
// the three quotients (f1/16, f2/0.125, f3/0.5).
//
// Each group of directives applies to the statement that follows it:
//   - division by 16.0 and by 0.125 must become a multiply, with no divide;
//   - division by 0.5 must become an add, with neither a multiply nor a divide.
func DivPow2(f1, f2, f3 float64) (float64, float64, float64) {
	// 386/sse2:"MULSD" -"DIVSD"
	// amd64:"MULSD" -"DIVSD"
	// arm/7:"MULD" -"DIVD"
	// arm64:"FMULD" -"FDIVD"
	// loong64:"MULD" -"DIVD"
	// ppc64x:"FMUL" -"FDIV"
	// riscv64:"FMULD" -"FDIVD"
	x := f1 / 16.0

	// The ppc64x negative check uses FDIV, matching the other two groups in
	// this function, so that an emitted divide is actually caught.
	// 386/sse2:"MULSD" -"DIVSD"
	// amd64:"MULSD" -"DIVSD"
	// arm/7:"MULD" -"DIVD"
	// arm64:"FMULD" -"FDIVD"
	// loong64:"MULD" -"DIVD"
	// ppc64x:"FMUL" -"FDIV"
	// riscv64:"FMULD" -"FDIVD"
	y := f2 / 0.125

	// 386/sse2:"ADDSD" -"DIVSD" -"MULSD"
	// amd64:"ADDSD" -"DIVSD" -"MULSD"
	// arm/7:"ADDD" -"MULD" -"DIVD"
	// arm64:"FADDD" -"FMULD" -"FDIVD"
	// loong64:"ADDD" -"MULD" -"DIVD"
	// ppc64x:"FADD" -"FMUL" -"FDIV"
	// riscv64:"FADDD" -"FMULD" -"FDIVD"
	z := f3 / 0.5
	return x, y, z
}
cmd/compile: optimize arm64 with indexed FP load/store The FP load/store on arm64 have register indexed forms. And this CL implements this optimization. 1. The total size of pkg/android_arm64 (excluding cmd/compile) decreases about 400 bytes. 2. There is no regression in the go1 benchmark, the test case GobEncode even gets slight improvement, excluding noise. name old time/op new time/op delta BinaryTree17-4 19.0s ± 0% 19.0s ± 1% ~ (p=0.817 n=29+29) Fannkuch11-4 9.94s ± 0% 9.95s ± 0% +0.03% (p=0.010 n=24+30) FmtFprintfEmpty-4 233ns ± 0% 233ns ± 0% ~ (all equal) FmtFprintfString-4 427ns ± 0% 427ns ± 0% ~ (p=0.649 n=30+30) FmtFprintfInt-4 471ns ± 0% 471ns ± 0% ~ (all equal) FmtFprintfIntInt-4 730ns ± 0% 730ns ± 0% ~ (all equal) FmtFprintfPrefixedInt-4 889ns ± 0% 889ns ± 0% ~ (all equal) FmtFprintfFloat-4 1.21µs ± 0% 1.21µs ± 0% +0.04% (p=0.012 n=20+30) FmtManyArgs-4 2.99µs ± 0% 2.99µs ± 0% ~ (p=0.651 n=29+29) GobDecode-4 42.4ms ± 1% 42.3ms ± 1% -0.27% (p=0.001 n=29+28) GobEncode-4 37.8ms ±11% 36.0ms ± 0% -4.67% (p=0.000 n=30+26) Gzip-4 1.98s ± 1% 1.96s ± 1% -1.26% (p=0.000 n=30+30) Gunzip-4 175ms ± 0% 175ms ± 0% ~ (p=0.988 n=29+29) HTTPClientServer-4 854µs ± 5% 860µs ± 5% ~ (p=0.236 n=28+29) JSONEncode-4 88.8ms ± 0% 87.9ms ± 0% -1.00% (p=0.000 n=24+26) JSONDecode-4 390ms ± 1% 392ms ± 2% +0.48% (p=0.025 n=30+30) Mandelbrot200-4 19.5ms ± 0% 19.5ms ± 0% ~ (p=0.894 n=24+29) GoParse-4 20.3ms ± 0% 20.1ms ± 1% -0.94% (p=0.000 n=27+26) RegexpMatchEasy0_32-4 451ns ± 0% 451ns ± 0% ~ (p=0.578 n=30+30) RegexpMatchEasy0_1K-4 1.63µs ± 0% 1.63µs ± 0% ~ (p=0.298 n=30+28) RegexpMatchEasy1_32-4 431ns ± 0% 434ns ± 0% +0.67% (p=0.000 n=30+29) RegexpMatchEasy1_1K-4 2.60µs ± 0% 2.64µs ± 0% +1.36% (p=0.000 n=28+26) RegexpMatchMedium_32-4 744ns ± 0% 744ns ± 0% ~ (p=0.474 n=29+29) RegexpMatchMedium_1K-4 223µs ± 0% 223µs ± 0% -0.08% (p=0.038 n=26+30) RegexpMatchHard_32-4 12.2µs ± 0% 12.3µs ± 0% +0.27% (p=0.000 n=29+30) RegexpMatchHard_1K-4 373µs ± 0% 373µs ± 0% ~ (p=0.219 n=29+28) Revcomp-4 
2.84s ± 0% 2.84s ± 0% ~ (p=0.130 n=28+28) Template-4 394ms ± 1% 392ms ± 1% -0.52% (p=0.001 n=30+30) TimeParse-4 1.93µs ± 0% 1.93µs ± 0% ~ (p=0.587 n=29+30) TimeFormat-4 2.00µs ± 0% 2.00µs ± 0% +0.07% (p=0.001 n=28+27) [Geo mean] 306µs 305µs -0.17% name old speed new speed delta GobDecode-4 18.1MB/s ± 1% 18.2MB/s ± 1% +0.27% (p=0.001 n=29+28) GobEncode-4 20.3MB/s ±10% 21.3MB/s ± 0% +4.64% (p=0.000 n=30+26) Gzip-4 9.79MB/s ± 1% 9.91MB/s ± 1% +1.28% (p=0.000 n=30+30) Gunzip-4 111MB/s ± 0% 111MB/s ± 0% ~ (p=0.988 n=29+29) JSONEncode-4 21.8MB/s ± 0% 22.1MB/s ± 0% +1.02% (p=0.000 n=24+26) JSONDecode-4 4.97MB/s ± 1% 4.95MB/s ± 2% -0.45% (p=0.031 n=30+30) GoParse-4 2.85MB/s ± 1% 2.88MB/s ± 1% +1.03% (p=0.000 n=30+26) RegexpMatchEasy0_32-4 70.9MB/s ± 0% 70.9MB/s ± 0% ~ (p=0.904 n=29+28) RegexpMatchEasy0_1K-4 627MB/s ± 0% 627MB/s ± 0% ~ (p=0.156 n=30+30) RegexpMatchEasy1_32-4 74.2MB/s ± 0% 73.7MB/s ± 0% -0.67% (p=0.000 n=30+29) RegexpMatchEasy1_1K-4 393MB/s ± 0% 388MB/s ± 0% -1.34% (p=0.000 n=28+26) RegexpMatchMedium_32-4 1.34MB/s ± 0% 1.34MB/s ± 0% ~ (all equal) RegexpMatchMedium_1K-4 4.59MB/s ± 0% 4.59MB/s ± 0% +0.07% (p=0.035 n=25+30) RegexpMatchHard_32-4 2.61MB/s ± 0% 2.61MB/s ± 0% -0.11% (p=0.002 n=28+30) RegexpMatchHard_1K-4 2.75MB/s ± 0% 2.75MB/s ± 0% +0.15% (p=0.001 n=30+24) Revcomp-4 89.4MB/s ± 0% 89.4MB/s ± 0% ~ (p=0.140 n=28+28) Template-4 4.93MB/s ± 1% 4.95MB/s ± 1% +0.51% (p=0.001 n=30+30) [Geo mean] 18.4MB/s 18.4MB/s +0.37% Change-Id: I9a6b521a971b21cfb51064e8e9b853cef8a1d071 Reviewed-on: https://go-review.googlesource.com/124636 Run-TryBot: Ben Shi <powerman1st@163.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com>
2018-07-18 09:31:35 +00:00
// indexLoad returns the element of b0 at position idx multiplied by b1.
// The directives require the float32 load to use a register-indexed
// addressing form on arm64 (index shifted left by 2) and on loong64.
func indexLoad(b0 []float32, b1 float32, idx int) float32 {
	// arm64:`FMOVS\s\(R[0-9]+\)\(R[0-9]+<<2\),\sF[0-9]+`
	// loong64:`MOVF\s\(R[0-9]+\)\(R[0-9]+\),\sF[0-9]+`
	return b0[idx] * b1
}
// indexStore writes b1 into b0 at position idx.
// The directives require the float64 store to use a register-indexed
// addressing form on arm64 (index shifted left by 3) and on loong64.
func indexStore(b0 []float64, b1 float64, idx int) {
	// arm64:`FMOVD\sF[0-9]+,\s\(R[0-9]+\)\(R[0-9]+<<3\)`
	// loong64:`MOVD\sF[0-9]+,\s\(R[0-9]+\)\(R[0-9]+\)`
	b0[idx] = b1
}
// ----------- //
// Fused //
// ----------- //
// FusedAdd32 returns x*y + z for float32 operands.
// The directives require a single fused multiply-add instruction on s390x,
// ppc64x, arm64, loong64, riscv64 and amd64/v3.
func FusedAdd32(x, y, z float32) float32 {
// s390x:"FMADDS "
// ppc64x:"FMADDS "
// arm64:"FMADDS"
// loong64:"FMADDF "
// riscv64:"FMADDS "
// amd64/v3:"VFMADD231SS "
return x*y + z
}
// FusedSub32_a returns x*y - z for float32 operands.
// The directives require a fused multiply-subtract instruction on s390x,
// ppc64x, riscv64 and loong64.
func FusedSub32_a(x, y, z float32) float32 {
// s390x:"FMSUBS "
// ppc64x:"FMSUBS "
// riscv64:"FMSUBS "
// loong64:"FMSUBF "
return x*y - z
}
// FusedSub32_b returns z - x*y for float32 operands, i.e. the product is the
// subtrahend. The directives require FMSUBS on arm64, FNMSUBF on loong64 and
// FNMSUBS on riscv64.
func FusedSub32_b(x, y, z float32) float32 {
// arm64:"FMSUBS"
// loong64:"FNMSUBF "
// riscv64:"FNMSUBS "
return z - x*y
}
// FusedAdd64 returns x*y + z for float64 operands.
// The directives require a single fused multiply-add instruction on s390x,
// ppc64x, arm64, loong64, riscv64 and amd64/v3.
func FusedAdd64(x, y, z float64) float64 {
// s390x:"FMADD "
// ppc64x:"FMADD "
// arm64:"FMADDD"
// loong64:"FMADDD "
// riscv64:"FMADDD "
// amd64/v3:"VFMADD231SD "
return x*y + z
}
// FusedSub64_a returns x*y - z for float64 operands.
// The directives require a fused multiply-subtract instruction on s390x,
// ppc64x, riscv64 and loong64.
func FusedSub64_a(x, y, z float64) float64 {
// s390x:"FMSUB "
// ppc64x:"FMSUB "
// riscv64:"FMSUBD "
// loong64:"FMSUBD "
return x*y - z
}
// FusedSub64_b returns z - x*y for float64 operands, i.e. the product is the
// subtrahend. The directives require FMSUBD on arm64, FNMSUBD on loong64 and
// FNMSUBD on riscv64.
func FusedSub64_b(x, y, z float64) float64 {
// arm64:"FMSUBD"
// loong64:"FNMSUBD "
// riscv64:"FNMSUBD "
return z - x*y
}
// Cmp reports whether f is greater than 4 or less than -4.
// On arm64 the directive requires an FCMPD and one of the listed conditional
// branches, and forbids materializing the comparison result (CSET GT) or
// branching on it with CBZ.
func Cmp(f float64) bool {
// arm64:"FCMPD" "(BGT|BLE|BMI|BPL)" -"CSET GT" -"CBZ"
return f > 4 || f < -4
}
// CmpZero64 reports whether the float64 f is less than or equal to zero.
// On s390x the directive requires LTDBR and forbids FCMPU for this
// comparison against zero.
func CmpZero64(f float64) bool {
// s390x:"LTDBR" -"FCMPU"
return f <= 0
}
// CmpZero32 reports whether the float32 f is less than or equal to zero.
// On s390x the directive requires LTEBR and forbids CEBR for this
// comparison against zero.
func CmpZero32(f float32) bool {
// s390x:"LTEBR" -"CEBR"
return f <= 0
}
cmd/compile: allow floating point Ops to produce flags on s390x On s390x, some floating point arithmetic instructions (FSUB, FADD) generate flag. This patch allows those related SSA ops to return a tuple, where the second argument of the tuple is the generated flag. We can use the flag and remove the subsequent comparison instruction (e.g: LTDBR). This CL also reduces the .text section for math.test binary by 0.4KB. Benchmarks: name old time/op new time/op delta Acos-18 12.1ns ± 0% 12.1ns ± 0% ~ (all equal) Acosh-18 18.5ns ± 0% 18.5ns ± 0% ~ (all equal) Asin-18 13.1ns ± 0% 13.1ns ± 0% ~ (all equal) Asinh-18 19.4ns ± 0% 19.5ns ± 1% ~ (p=0.444 n=5+5) Atan-18 10.0ns ± 0% 10.0ns ± 0% ~ (all equal) Atanh-18 19.1ns ± 1% 19.2ns ± 2% ~ (p=0.841 n=5+5) Atan2-18 16.4ns ± 0% 16.4ns ± 0% ~ (all equal) Cbrt-18 14.8ns ± 0% 14.8ns ± 0% ~ (all equal) Ceil-18 0.78ns ± 0% 0.78ns ± 0% ~ (all equal) Copysign-18 0.80ns ± 0% 0.80ns ± 0% ~ (all equal) Cos-18 7.19ns ± 0% 7.19ns ± 0% ~ (p=0.556 n=4+5) Cosh-18 12.4ns ± 0% 12.4ns ± 0% ~ (all equal) Erf-18 10.8ns ± 0% 10.8ns ± 0% ~ (all equal) Erfc-18 11.0ns ± 0% 11.0ns ± 0% ~ (all equal) Erfinv-18 23.0ns ±16% 26.8ns ± 1% +16.90% (p=0.008 n=5+5) Erfcinv-18 23.3ns ±15% 26.1ns ± 7% ~ (p=0.087 n=5+5) Exp-18 8.67ns ± 0% 8.67ns ± 0% ~ (p=1.000 n=4+4) ExpGo-18 50.8ns ± 3% 52.4ns ± 2% ~ (p=0.063 n=5+5) Expm1-18 9.49ns ± 1% 9.47ns ± 0% ~ (p=1.000 n=5+5) Exp2-18 52.7ns ± 1% 50.5ns ± 3% -4.10% (p=0.024 n=5+5) Exp2Go-18 50.6ns ± 1% 48.4ns ± 3% -4.39% (p=0.008 n=5+5) Abs-18 0.67ns ± 0% 0.67ns ± 0% ~ (p=0.444 n=5+5) Dim-18 1.02ns ± 0% 1.03ns ± 0% +0.98% (p=0.008 n=5+5) Floor-18 0.78ns ± 0% 0.78ns ± 0% ~ (all equal) Max-18 3.09ns ± 1% 3.05ns ± 0% -1.42% (p=0.008 n=5+5) Min-18 3.32ns ± 1% 3.30ns ± 0% -0.72% (p=0.016 n=5+4) Mod-18 62.3ns ± 1% 65.8ns ± 3% +5.55% (p=0.008 n=5+5) Frexp-18 5.05ns ± 2% 4.98ns ± 0% ~ (p=0.683 n=5+5) Gamma-18 24.4ns ± 0% 24.1ns ± 0% -1.23% (p=0.008 n=5+5) Hypot-18 10.3ns ± 0% 10.3ns ± 0% ~ (all equal) HypotGo-18 10.2ns ± 0% 10.2ns 
± 0% ~ (all equal) Ilogb-18 3.56ns ± 1% 3.54ns ± 0% ~ (p=0.595 n=5+5) J0-18 113ns ± 0% 108ns ± 1% -4.42% (p=0.016 n=4+5) J1-18 115ns ± 0% 109ns ± 1% -4.87% (p=0.016 n=4+5) Jn-18 240ns ± 0% 230ns ± 2% -4.41% (p=0.008 n=5+5) Ldexp-18 6.19ns ± 0% 6.19ns ± 0% ~ (p=0.444 n=5+5) Lgamma-18 32.2ns ± 0% 32.2ns ± 0% ~ (all equal) Log-18 13.1ns ± 0% 13.1ns ± 0% ~ (all equal) Logb-18 4.23ns ± 0% 4.22ns ± 0% ~ (p=0.444 n=5+5) Log1p-18 12.7ns ± 0% 12.7ns ± 0% ~ (all equal) Log10-18 18.1ns ± 0% 18.2ns ± 0% ~ (p=0.167 n=5+5) Log2-18 14.0ns ± 0% 14.0ns ± 0% ~ (all equal) Modf-18 10.4ns ± 0% 10.5ns ± 0% +0.96% (p=0.016 n=4+5) Nextafter32-18 11.3ns ± 0% 11.3ns ± 0% ~ (all equal) Nextafter64-18 4.01ns ± 1% 3.97ns ± 0% ~ (p=0.333 n=5+4) PowInt-18 32.7ns ± 0% 32.7ns ± 0% ~ (all equal) PowFrac-18 33.2ns ± 0% 33.1ns ± 0% ~ (p=0.095 n=4+5) Pow10Pos-18 1.58ns ± 0% 1.58ns ± 0% ~ (all equal) Pow10Neg-18 5.81ns ± 0% 5.81ns ± 0% ~ (all equal) Round-18 0.78ns ± 0% 0.78ns ± 0% ~ (all equal) RoundToEven-18 0.78ns ± 0% 0.78ns ± 0% ~ (all equal) Remainder-18 40.6ns ± 0% 40.7ns ± 0% ~ (p=0.238 n=5+4) Signbit-18 1.57ns ± 0% 1.57ns ± 0% ~ (all equal) Sin-18 6.75ns ± 0% 6.74ns ± 0% ~ (p=0.333 n=5+4) Sincos-18 29.5ns ± 0% 29.5ns ± 0% ~ (all equal) Sinh-18 14.4ns ± 0% 14.4ns ± 0% ~ (all equal) SqrtIndirect-18 3.97ns ± 0% 4.15ns ± 0% +4.59% (p=0.008 n=5+5) SqrtLatency-18 8.01ns ± 0% 8.01ns ± 0% ~ (all equal) SqrtIndirectLatency-18 11.6ns ± 0% 11.6ns ± 0% ~ (all equal) SqrtGoLatency-18 44.7ns ± 0% 45.0ns ± 0% +0.67% (p=0.008 n=5+5) SqrtPrime-18 1.26µs ± 0% 1.27µs ± 0% +0.63% (p=0.029 n=4+4) Tan-18 11.1ns ± 0% 11.1ns ± 0% ~ (all equal) Tanh-18 15.8ns ± 0% 15.8ns ± 0% ~ (all equal) Trunc-18 0.78ns ± 0% 0.78ns ± 0% ~ (all equal) Y0-18 113ns ± 2% 108ns ± 3% -5.11% (p=0.008 n=5+5) Y1-18 112ns ± 3% 107ns ± 0% -4.29% (p=0.000 n=5+4) Yn-18 229ns ± 0% 220ns ± 1% -3.76% (p=0.016 n=4+5) Float64bits-18 1.09ns ± 0% 1.09ns ± 0% ~ (all equal) Float64frombits-18 0.55ns ± 0% 0.55ns ± 0% ~ (all equal) Float32bits-18 0.96ns 
±16% 0.86ns ± 0% ~ (p=0.563 n=5+5) Float32frombits-18 1.03ns ±28% 0.84ns ± 0% ~ (p=0.167 n=5+5) FMA-18 1.60ns ± 0% 1.60ns ± 0% ~ (all equal) [Geo mean] 10.0ns 9.9ns -0.41% Change-Id: Ief7e63ea5a8ba404b0a4696e12b9b7e0b05a9a03 Reviewed-on: https://go-review.googlesource.com/c/go/+/209160 Reviewed-by: Michael Munday <mike.munday@ibm.com> Run-TryBot: Michael Munday <mike.munday@ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
2019-11-21 10:44:23 -05:00
// CmpWithSub reports whether a - b is less than or equal to zero.
// On s390x the directive forbids LTDBR on the comparison line; the test
// expects the comparison to reuse the result of the preceding subtraction
// instead of issuing a separate load-and-test.
func CmpWithSub(a float64, b float64) bool {
f := a - b
// s390x:-"LTDBR"
return f <= 0
}
// CmpWithAdd reports whether a + b is less than or equal to zero.
// On s390x the directive forbids LTDBR on the comparison line; the test
// expects the comparison to reuse the result of the preceding addition
// instead of issuing a separate load-and-test.
func CmpWithAdd(a float64, b float64) bool {
f := a + b
// s390x:-"LTDBR"
return f <= 0
}
// ---------------- //
// Non-floats //
// ---------------- //
// ArrayZero returns a zero-valued 16-byte array.
// On amd64 the directive requires MOVUPS for the declaration line.
func ArrayZero() [16]byte {
// amd64:"MOVUPS"
var a [16]byte
return a
}
// ArrayCopy returns a copy of the 16-byte array a through the named result b.
// On amd64 the directive requires MOVUPS for the assignment.
func ArrayCopy(a [16]byte) (b [16]byte) {
// amd64:"MOVUPS"
b = a
return
}
// ---------------- //
// Float Min/Max //
// ---------------- //
// Float64Min returns the result of the built-in min applied to a and b.
// The directives require a dedicated float64 minimum instruction on amd64,
// arm64, loong64, riscv64, ppc64 (power9 and power10) and s390x.
func Float64Min(a, b float64) float64 {
// amd64:"MINSD"
// arm64:"FMIND"
// loong64:"FMIND"
// riscv64:"FMIN"
// ppc64/power9:"XSMINJDP"
// ppc64/power10:"XSMINJDP"
// s390x: "WFMINDB"
return min(a, b)
}
// Float64Max returns the result of the built-in max applied to a and b.
// The directives require a dedicated float64 maximum instruction on arm64,
// loong64, riscv64, ppc64 (power9 and power10) and s390x.
// NOTE(review): the amd64 directive names MINSD rather than a max
// instruction; this looks intentional (max may be lowered through a min on
// amd64) but confirm against the amd64 lowering rules before changing it.
func Float64Max(a, b float64) float64 {
// amd64:"MINSD"
// arm64:"FMAXD"
// loong64:"FMAXD"
// riscv64:"FMAX"
// ppc64/power9:"XSMAXJDP"
// ppc64/power10:"XSMAXJDP"
// s390x: "WFMAXDB"
return max(a, b)
}
// Float32Min returns the result of the built-in min applied to a and b.
// The directives require a dedicated float32 minimum instruction on amd64,
// arm64, loong64, riscv64 and s390x.
// NOTE(review): the ppc64 directives name XSMINJDP, the same instruction the
// float64 variant expects; presumably float32 values are handled in double
// format there. Confirm before changing.
func Float32Min(a, b float32) float32 {
// amd64:"MINSS"
// arm64:"FMINS"
// loong64:"FMINF"
// riscv64:"FMINS"
// ppc64/power9:"XSMINJDP"
// ppc64/power10:"XSMINJDP"
// s390x: "WFMINSB"
return min(a, b)
}
// Float32Max returns the result of the built-in max applied to a and b.
// The directives require a dedicated float32 maximum instruction on arm64,
// loong64, riscv64 and s390x.
// NOTE(review): the amd64 directive names MINSS rather than a max
// instruction, and the ppc64 directives name XSMAXJDP, the same instruction
// the float64 variant expects. Both look intentional; confirm against the
// respective lowering rules before changing them.
func Float32Max(a, b float32) float32 {
// amd64:"MINSS"
// arm64:"FMAXS"
// loong64:"FMAXF"
// riscv64:"FMAXS"
// ppc64/power9:"XSMAXJDP"
// ppc64/power10:"XSMAXJDP"
// s390x: "WFMAXSB"
return max(a, b)
}
// ------------------------ //
// Constant Optimizations //
// ------------------------ //
// Float32ConstantZero returns the float32 constant 0.0.
// On arm64 the directive requires the value to be produced by an FMOVS from
// the zero register ZR.
func Float32ConstantZero() float32 {
// arm64:"FMOVS ZR,"
return 0.0
}
// Float32ConstantChipFloat returns the float32 constant 2.25.
// On arm64 the directive requires an FMOVS whose source is the immediate
// operand (2.25) rather than a memory load.
func Float32ConstantChipFloat() float32 {
// arm64:"FMOVS [$]\\(2\\.25\\),"
return 2.25
}
// Float32Constant returns the float32 constant 49.0.
// On arm64 and on ppc64x power8/power9 the directives require an FMOVS from
// the constant symbol f32.42440000. On ppc64x power10 they require XXSPLTIDP
// with the immediate 1111752704, which is the decimal form of 0x42440000.
func Float32Constant() float32 {
// arm64:"FMOVS [$]f32\\.42440000\\(SB\\)"
// ppc64x/power8:"FMOVS [$]f32\\.42440000\\(SB\\)"
// ppc64x/power9:"FMOVS [$]f32\\.42440000\\(SB\\)"
// ppc64x/power10:"XXSPLTIDP [$]1111752704,"
return 49.0
}
// Float64ConstantZero returns the float64 constant 0.0.
// On arm64 the directive requires the value to be produced by an FMOVD from
// the zero register ZR.
func Float64ConstantZero() float64 {
// arm64:"FMOVD ZR,"
return 0.0
}
// Float64ConstantChipFloat returns the float64 constant 2.25.
// On arm64 the directive requires an FMOVD whose source is the immediate
// operand (2.25) rather than a memory load.
func Float64ConstantChipFloat() float64 {
// arm64:"FMOVD [$]\\(2\\.25\\),"
return 2.25
}
// Float64Constant returns the float64 constant 49.0.
// On arm64 and on ppc64x power8/power9 the directives require an FMOVD from
// the constant symbol f64.4048800000000000. On ppc64x power10 they require
// XXSPLTIDP with the immediate 1111752704, the same immediate the float32
// variant of this test expects.
func Float64Constant() float64 {
// arm64:"FMOVD [$]f64\\.4048800000000000\\(SB\\)"
// ppc64x/power8:"FMOVD [$]f64\\.4048800000000000\\(SB\\)"
// ppc64x/power9:"FMOVD [$]f64\\.4048800000000000\\(SB\\)"
// ppc64x/power10:"XXSPLTIDP [$]1111752704,"
return 49.0
}
// Float32DenormalConstant returns the float32 constant 0x1p-127, which is
// below the smallest normal float32 and therefore a denormal value.
// On ppc64x the directive requires an FMOVS from the constant symbol
// f32.00400000.
func Float32DenormalConstant() float32 {
// ppc64x:"FMOVS [$]f32\\.00400000\\(SB\\)"
return 0x1p-127
}
// Float64DenormalFloat32Constant returns 0x1p-127, a float64 constant
// which can be exactly represented as a denormal float32 value.
// On ppc64x, denormal values cannot be used with XXSPLTIDP, so the
// directive requires an FMOVD from the constant symbol
// f64.3800000000000000 instead.
func Float64DenormalFloat32Constant() float64 {
// ppc64x:"FMOVD [$]f64\\.3800000000000000\\(SB\\)"
return 0x1p-127
}
// Float32ConstantStore stores the float32 constant 5.432 through p.
// On amd64 the directive requires a MOVL of the integer immediate
// 1085133554 (0x40add2f2, matching the riscv64 symbol name). On riscv64 it
// requires a MOVF from the constant symbol f32.40add2f2.
func Float32ConstantStore(p *float32) {
// amd64:"MOVL [$]1085133554"
// riscv64: "MOVF [$]f32.40add2f2"
*p = 5.432
}
// Float64ConstantStore stores the float64 constant 5.432 through p.
// On amd64 the directive requires a MOVQ of the integer immediate
// 4617801906721357038. On riscv64 it requires a MOVD from the constant
// symbol f64.4015ba5e353f7cee.
func Float64ConstantStore(p *float64) {
// amd64: "MOVQ [$]4617801906721357038"
// riscv64: "MOVD [$]f64.4015ba5e353f7cee"
*p = 5.432
}
cmd/compile: use FCLASSD for subnormal checks on riscv64 Only implemented for 64 bit floating point operations for now. goos: linux goarch: riscv64 pkg: math cpu: Spacemit(R) X60 │ sec/op │ sec/op vs base │ Acos 154.1n ± 0% 154.1n ± 0% ~ (p=0.303 n=10) Acosh 215.8n ± 6% 226.7n ± 0% ~ (p=0.439 n=10) Asin 149.2n ± 1% 149.2n ± 0% ~ (p=0.700 n=10) Asinh 262.1n ± 0% 258.5n ± 0% -1.37% (p=0.000 n=10) Atan 99.48n ± 0% 99.49n ± 0% ~ (p=0.836 n=10) Atanh 244.9n ± 0% 243.8n ± 0% -0.43% (p=0.002 n=10) Atan2 158.2n ± 1% 153.3n ± 0% -3.10% (p=0.000 n=10) Cbrt 186.8n ± 0% 181.1n ± 0% -3.03% (p=0.000 n=10) Ceil 36.71n ± 1% 36.71n ± 0% ~ (p=0.434 n=10) Copysign 6.531n ± 1% 6.526n ± 0% ~ (p=0.268 n=10) Cos 98.19n ± 0% 95.40n ± 0% -2.84% (p=0.000 n=10) Cosh 233.1n ± 0% 222.6n ± 0% -4.50% (p=0.000 n=10) Erf 122.5n ± 0% 114.2n ± 0% -6.78% (p=0.000 n=10) Erfc 126.0n ± 1% 116.6n ± 0% -7.46% (p=0.000 n=10) Erfinv 138.8n ± 0% 138.6n ± 0% ~ (p=0.082 n=10) Erfcinv 140.0n ± 0% 139.7n ± 0% ~ (p=0.359 n=10) Exp 193.3n ± 0% 184.2n ± 0% -4.68% (p=0.000 n=10) ExpGo 204.8n ± 0% 194.5n ± 0% -5.03% (p=0.000 n=10) Expm1 152.5n ± 1% 145.0n ± 0% -4.92% (p=0.000 n=10) Exp2 174.5n ± 0% 164.2n ± 0% -5.85% (p=0.000 n=10) Exp2Go 184.4n ± 1% 175.4n ± 0% -4.88% (p=0.000 n=10) Abs 4.912n ± 0% 4.914n ± 0% ~ (p=0.283 n=10) Dim 15.50n ± 1% 15.52n ± 1% ~ (p=0.331 n=10) Floor 36.89n ± 1% 36.76n ± 1% ~ (p=0.325 n=10) Max 31.05n ± 1% 31.17n ± 1% ~ (p=0.628 n=10) Min 31.01n ± 0% 31.06n ± 0% ~ (p=0.767 n=10) Mod 294.1n ± 0% 245.6n ± 0% -16.52% (p=0.000 n=10) Frexp 44.86n ± 1% 35.20n ± 0% -21.53% (p=0.000 n=10) Gamma 195.8n ± 0% 185.4n ± 1% -5.29% (p=0.000 n=10) Hypot 84.91n ± 0% 84.54n ± 1% -0.43% (p=0.006 n=10) HypotGo 96.70n ± 0% 95.42n ± 1% -1.32% (p=0.000 n=10) Ilogb 45.03n ± 0% 35.07n ± 1% -22.10% (p=0.000 n=10) J0 634.5n ± 0% 627.2n ± 0% -1.16% (p=0.000 n=10) J1 644.5n ± 0% 636.9n ± 0% -1.18% (p=0.000 n=10) Jn 1.357µ ± 0% 1.344µ ± 0% -0.92% (p=0.000 n=10) Ldexp 49.89n ± 0% 39.96n ± 0% -19.90% (p=0.000 n=10) 
Lgamma 186.6n ± 0% 184.3n ± 0% -1.21% (p=0.000 n=10) Log 150.4n ± 0% 141.1n ± 0% -6.15% (p=0.000 n=10) Logb 46.70n ± 0% 35.89n ± 0% -23.15% (p=0.000 n=10) Log1p 164.1n ± 0% 163.9n ± 0% ~ (p=0.122 n=10) Log10 153.1n ± 0% 143.5n ± 0% -6.24% (p=0.000 n=10) Log2 58.83n ± 0% 49.75n ± 0% -15.43% (p=0.000 n=10) Modf 40.82n ± 1% 40.78n ± 0% ~ (p=0.239 n=10) Nextafter32 49.15n ± 0% 48.93n ± 0% -0.44% (p=0.011 n=10) Nextafter64 43.33n ± 0% 43.23n ± 0% ~ (p=0.228 n=10) PowInt 269.4n ± 0% 243.8n ± 0% -9.49% (p=0.000 n=10) PowFrac 618.0n ± 0% 571.7n ± 0% -7.48% (p=0.000 n=10) Pow10Pos 13.09n ± 0% 13.05n ± 0% -0.31% (p=0.003 n=10) Pow10Neg 30.99n ± 1% 30.99n ± 0% ~ (p=0.173 n=10) Round 23.73n ± 0% 23.65n ± 0% -0.36% (p=0.011 n=10) RoundToEven 27.87n ± 0% 27.73n ± 0% -0.48% (p=0.003 n=10) Remainder 282.1n ± 0% 249.6n ± 0% -11.52% (p=0.000 n=10) Signbit 11.46n ± 0% 11.42n ± 0% -0.39% (p=0.003 n=10) Sin 115.2n ± 0% 113.2n ± 0% -1.74% (p=0.000 n=10) Sincos 140.6n ± 0% 138.6n ± 0% -1.39% (p=0.000 n=10) Sinh 252.0n ± 0% 241.4n ± 0% -4.21% (p=0.000 n=10) SqrtIndirect 4.909n ± 0% 4.893n ± 0% -0.34% (p=0.021 n=10) SqrtLatency 19.57n ± 1% 19.57n ± 0% ~ (p=0.087 n=10) SqrtIndirectLatency 19.64n ± 0% 19.57n ± 0% -0.36% (p=0.025 n=10) SqrtGoLatency 198.1n ± 0% 197.4n ± 0% -0.35% (p=0.014 n=10) SqrtPrime 5.733µ ± 0% 5.725µ ± 0% ~ (p=0.116 n=10) Tan 149.1n ± 0% 146.8n ± 0% -1.54% (p=0.000 n=10) Tanh 248.2n ± 1% 238.1n ± 0% -4.05% (p=0.000 n=10) Trunc 36.86n ± 0% 36.70n ± 0% -0.43% (p=0.029 n=10) Y0 638.2n ± 0% 633.6n ± 0% -0.71% (p=0.000 n=10) Y1 641.8n ± 0% 636.1n ± 0% -0.87% (p=0.000 n=10) Yn 1.358µ ± 0% 1.345µ ± 0% -0.92% (p=0.000 n=10) Float64bits 5.721n ± 0% 5.709n ± 0% -0.22% (p=0.044 n=10) Float64frombits 4.905n ± 0% 4.893n ± 0% ~ (p=0.266 n=10) Float32bits 12.27n ± 0% 12.23n ± 0% ~ (p=0.122 n=10) Float32frombits 4.909n ± 0% 4.893n ± 0% -0.32% (p=0.024 n=10) FMA 6.556n ± 0% 6.526n ± 0% ~ (p=0.283 n=10) geomean 86.82n 83.75n -3.54% Change-Id: I522297a79646d76543d516accce291f5a3cea337 
Reviewed-on: https://go-review.googlesource.com/c/go/+/717560 Reviewed-by: Keith Randall <khr@golang.org> Reviewed-by: Keith Randall <khr@google.com> Auto-Submit: Keith Randall <khr@golang.org> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Junyang Shao <shaojunyang@google.com>
2025-08-24 00:15:29 +01:00
// ------------------------ //
// Subnormal tests //
// ------------------------ //
// isSubnormal reports whether the absolute value of x is strictly below
// 2.2250738585072014e-308, the smallest normal float64. As written, the
// comparison is also true for zero.
// On riscv64 the directive requires FCLASSD and forbids FABSD.
func isSubnormal(x float64) bool {
// riscv64:"FCLASSD" -"FABSD"
return math.Abs(x) < 2.2250738585072014e-308
}
// isNormal reports whether the absolute value of x is at least 0x1p-1022,
// the smallest normal float64. As written, the comparison is also true for
// infinities.
// On riscv64 the directive requires FCLASSD and forbids FABSD.
func isNormal(x float64) bool {
// riscv64:"FCLASSD" -"FABSD"
return math.Abs(x) >= 0x1p-1022
}
// isPosSubnormal reports whether x lies strictly between zero and
// 2.2250738585072014e-308, the smallest positive normal float64.
// On riscv64 the directive requires FCLASSD.
func isPosSubnormal(x float64) bool {
// riscv64:"FCLASSD"
return x > 0 && x < 2.2250738585072014e-308
}
// isNegSubnormal reports whether x lies strictly between -0x1p-1022 and
// zero, i.e. x is negative with magnitude below the smallest normal float64.
// On riscv64 the directive requires FCLASSD.
func isNegSubnormal(x float64) bool {
// riscv64:"FCLASSD"
return x < 0 && x > -0x1p-1022
}
// isPosNormal reports whether x is at least 2.2250738585072014e-308, the
// smallest positive normal float64. As written, the comparison is also true
// for positive infinity.
// On riscv64 the directive requires FCLASSD.
func isPosNormal(x float64) bool {
// riscv64:"FCLASSD"
return x >= 2.2250738585072014e-308
}
// isNegNormal reports whether x is at most -2.2250738585072014e-308, i.e. x
// is negative with magnitude at least the smallest normal float64. As
// written, the comparison is also true for negative infinity.
// On riscv64 the directive requires FCLASSD.
func isNegNormal(x float64) bool {
// riscv64:"FCLASSD"
return x <= -2.2250738585072014e-308
}