go/src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go

551 lines
31 KiB
Go
Raw Normal View History

// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package main
import "strings"
// Notes:
// - Integer types live in the low portion of registers. Upper portions are junk.
// - Boolean types use the low-order byte of a register. 0=false, 1=true.
// Upper bytes are junk.
// - *const instructions may use a constant larger than the instruction can encode.
// In this case the assembler expands to multiple instructions and uses tmp
// register (R23).
// Suffixes encode the bit width of various instructions.
// V (vlong) = 64 bit
// WU (word) = 32 bit unsigned
// W (word) = 32 bit
// H (half word) = 16 bit
// HU = 16 bit unsigned
// B (byte) = 8 bit
// BU = 8 bit unsigned
// F (float) = 32 bit float
// D (double) = 64 bit float
// Note: registers not used in regalloc are not included in this list,
// so that regmask stays within int64
// Be careful when hand coding regmasks.
var regNamesLOONG64 = []string{
"R0", // constant 0
"R1",
"SP", // aka R3
"R4",
"R5",
"R6",
"R7",
"R8",
"R9",
"R10",
"R11",
"R12",
"R13",
"R14",
"R15",
"R16",
"R17",
"R18",
"R19",
"R20",
"R21",
"g", // aka R22
"R23",
"R24",
"R25",
"R26",
"R27",
"R28",
"R29",
// R30 is REGTMP not used in regalloc
"R31",
"F0",
"F1",
"F2",
"F3",
"F4",
"F5",
"F6",
"F7",
"F8",
"F9",
"F10",
"F11",
"F12",
"F13",
"F14",
"F15",
"F16",
"F17",
"F18",
"F19",
"F20",
"F21",
"F22",
"F23",
"F24",
"F25",
"F26",
"F27",
"F28",
"F29",
"F30",
"F31",
// If you add registers, update asyncPreempt in runtime.
// pseudo-registers
"SB",
}
func init() {
// Make map from reg names to reg integers.
if len(regNamesLOONG64) > 64 {
panic("too many registers")
}
num := map[string]int{}
for i, name := range regNamesLOONG64 {
num[name] = i
}
buildReg := func(s string) regMask {
m := regMask(0)
for _, r := range strings.Split(s, " ") {
if n, ok := num[r]; ok {
m |= regMask(1) << uint(n)
continue
}
panic("register " + r + " not found")
}
return m
}
// Common individual register masks
var (
gp = buildReg("R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 R23 R24 R25 R26 R27 R28 R29 R31") // R1 is LR, R2 is thread pointer, R3 is stack pointer, R22 is g, R30 is REGTMP
gpg = gp | buildReg("g")
gpsp = gp | buildReg("SP")
gpspg = gpg | buildReg("SP")
gpspsbg = gpspg | buildReg("SB")
fp = buildReg("F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31")
callerSave = gp | fp | buildReg("g") // runtime.setg (and anything calling it) may clobber g
r1 = buildReg("R20")
r2 = buildReg("R21")
r3 = buildReg("R23")
r4 = buildReg("R24")
)
// Common regInfo
var (
gp01 = regInfo{inputs: nil, outputs: []regMask{gp}}
gp11 = regInfo{inputs: []regMask{gpg}, outputs: []regMask{gp}}
gp11sp = regInfo{inputs: []regMask{gpspg}, outputs: []regMask{gp}}
gp21 = regInfo{inputs: []regMask{gpg, gpg}, outputs: []regMask{gp}}
gpload = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{gp}}
cmd/compile: optimize loong64 with register indexed load/store goos: linux goarch: loong64 pkg: test/bench/go1 cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | BinaryTree17 7.766 ± 1% 7.640 ± 2% -1.62% (p=0.000 n=20) Fannkuch11 2.649 ± 0% 2.358 ± 0% -10.96% (p=0.000 n=20) FmtFprintfEmpty 35.89n ± 0% 35.87n ± 0% -0.06% (p=0.000 n=20) FmtFprintfString 59.44n ± 0% 57.25n ± 2% -3.68% (p=0.000 n=20) FmtFprintfInt 62.07n ± 0% 60.04n ± 0% -3.27% (p=0.000 n=20) FmtFprintfIntInt 97.90n ± 0% 97.26n ± 0% -0.65% (p=0.000 n=20) FmtFprintfPrefixedInt 116.7n ± 0% 119.2n ± 0% +2.14% (p=0.000 n=20) FmtFprintfFloat 204.5n ± 0% 201.9n ± 0% -1.30% (p=0.000 n=20) FmtManyArgs 455.9n ± 0% 466.8n ± 0% +2.39% (p=0.000 n=20) GobDecode 7.458m ± 1% 7.138m ± 1% -4.28% (p=0.000 n=20) GobEncode 8.573m ± 1% 8.473m ± 1% ~ (p=0.091 n=20) Gzip 280.2m ± 0% 284.9m ± 0% +1.67% (p=0.000 n=20) Gunzip 32.68m ± 0% 32.67m ± 0% ~ (p=0.211 n=20) HTTPClientServer 54.22µ ± 0% 53.24µ ± 0% -1.80% (p=0.000 n=20) JSONEncode 9.427m ± 1% 9.152m ± 0% -2.92% (p=0.000 n=20) JSONDecode 47.08m ± 1% 46.85m ± 1% -0.49% (p=0.007 n=20) Mandelbrot200 4.601m ± 0% 4.605m ± 0% +0.08% (p=0.000 n=20) GoParse 4.776m ± 0% 4.655m ± 1% -2.52% (p=0.000 n=20) RegexpMatchEasy0_32 59.77n ± 0% 57.59n ± 0% -3.66% (p=0.000 n=20) RegexpMatchEasy0_1K 458.1n ± 0% 458.8n ± 0% +0.15% (p=0.000 n=20) RegexpMatchEasy1_32 59.36n ± 0% 59.24n ± 0% -0.20% (p=0.000 n=20) RegexpMatchEasy1_1K 557.7n ± 0% 560.2n ± 0% +0.46% (p=0.000 n=20) RegexpMatchMedium_32 803.1n ± 0% 772.8n ± 0% -3.77% (p=0.000 n=20) RegexpMatchMedium_1K 27.29µ ± 0% 25.88µ ± 0% -5.18% (p=0.000 n=20) RegexpMatchHard_32 1.385µ ± 0% 1.304µ ± 0% -5.85% (p=0.000 n=20) RegexpMatchHard_1K 40.92µ ± 0% 39.58µ ± 0% -3.27% (p=0.000 n=20) Revcomp 474.3m ± 0% 410.0m ± 0% -13.56% (p=0.000 n=20) Template 78.16m ± 0% 76.32m ± 1% -2.36% (p=0.000 n=20) TimeParse 271.8n ± 0% 272.1n ± 0% +0.11% (p=0.000 n=20) TimeFormat 292.3n ± 0% 294.8n ± 0% +0.86% (p=0.000 n=20) geomean 51.98µ 50.82µ -2.22% Change-Id: Ia78f1ddee8f1d9ec7192a4b8d2a4ec6058679956 Reviewed-on: https://go-review.googlesource.com/c/go/+/615918 Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> Reviewed-by: Michael Knyszek <mknyszek@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: abner chenc <chenguoqi@loongson.cn>
2024-09-26 14:17:17 +08:00
gp2load = regInfo{inputs: []regMask{gpspsbg, gpg}, outputs: []regMask{gp}}
gpstore = regInfo{inputs: []regMask{gpspsbg, gpg}}
gpstore0 = regInfo{inputs: []regMask{gpspsbg}}
cmd/compile: optimize loong64 with register indexed load/store goos: linux goarch: loong64 pkg: test/bench/go1 cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | BinaryTree17 7.766 ± 1% 7.640 ± 2% -1.62% (p=0.000 n=20) Fannkuch11 2.649 ± 0% 2.358 ± 0% -10.96% (p=0.000 n=20) FmtFprintfEmpty 35.89n ± 0% 35.87n ± 0% -0.06% (p=0.000 n=20) FmtFprintfString 59.44n ± 0% 57.25n ± 2% -3.68% (p=0.000 n=20) FmtFprintfInt 62.07n ± 0% 60.04n ± 0% -3.27% (p=0.000 n=20) FmtFprintfIntInt 97.90n ± 0% 97.26n ± 0% -0.65% (p=0.000 n=20) FmtFprintfPrefixedInt 116.7n ± 0% 119.2n ± 0% +2.14% (p=0.000 n=20) FmtFprintfFloat 204.5n ± 0% 201.9n ± 0% -1.30% (p=0.000 n=20) FmtManyArgs 455.9n ± 0% 466.8n ± 0% +2.39% (p=0.000 n=20) GobDecode 7.458m ± 1% 7.138m ± 1% -4.28% (p=0.000 n=20) GobEncode 8.573m ± 1% 8.473m ± 1% ~ (p=0.091 n=20) Gzip 280.2m ± 0% 284.9m ± 0% +1.67% (p=0.000 n=20) Gunzip 32.68m ± 0% 32.67m ± 0% ~ (p=0.211 n=20) HTTPClientServer 54.22µ ± 0% 53.24µ ± 0% -1.80% (p=0.000 n=20) JSONEncode 9.427m ± 1% 9.152m ± 0% -2.92% (p=0.000 n=20) JSONDecode 47.08m ± 1% 46.85m ± 1% -0.49% (p=0.007 n=20) Mandelbrot200 4.601m ± 0% 4.605m ± 0% +0.08% (p=0.000 n=20) GoParse 4.776m ± 0% 4.655m ± 1% -2.52% (p=0.000 n=20) RegexpMatchEasy0_32 59.77n ± 0% 57.59n ± 0% -3.66% (p=0.000 n=20) RegexpMatchEasy0_1K 458.1n ± 0% 458.8n ± 0% +0.15% (p=0.000 n=20) RegexpMatchEasy1_32 59.36n ± 0% 59.24n ± 0% -0.20% (p=0.000 n=20) RegexpMatchEasy1_1K 557.7n ± 0% 560.2n ± 0% +0.46% (p=0.000 n=20) RegexpMatchMedium_32 803.1n ± 0% 772.8n ± 0% -3.77% (p=0.000 n=20) RegexpMatchMedium_1K 27.29µ ± 0% 25.88µ ± 0% -5.18% (p=0.000 n=20) RegexpMatchHard_32 1.385µ ± 0% 1.304µ ± 0% -5.85% (p=0.000 n=20) RegexpMatchHard_1K 40.92µ ± 0% 39.58µ ± 0% -3.27% (p=0.000 n=20) Revcomp 474.3m ± 0% 410.0m ± 0% -13.56% (p=0.000 n=20) Template 78.16m ± 0% 76.32m ± 1% -2.36% (p=0.000 n=20) TimeParse 271.8n ± 0% 272.1n ± 0% +0.11% (p=0.000 n=20) TimeFormat 292.3n ± 0% 294.8n ± 0% +0.86% (p=0.000 n=20) geomean 51.98µ 50.82µ -2.22% Change-Id: Ia78f1ddee8f1d9ec7192a4b8d2a4ec6058679956 Reviewed-on: https://go-review.googlesource.com/c/go/+/615918 Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> Reviewed-by: Michael Knyszek <mknyszek@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: abner chenc <chenguoqi@loongson.cn>
2024-09-26 14:17:17 +08:00
gpstore2 = regInfo{inputs: []regMask{gpspsbg, gpg, gpg}}
gpxchg = regInfo{inputs: []regMask{gpspsbg, gpg}, outputs: []regMask{gp}}
gpcas = regInfo{inputs: []regMask{gpspsbg, gpg, gpg}, outputs: []regMask{gp}}
fp01 = regInfo{inputs: nil, outputs: []regMask{fp}}
fp11 = regInfo{inputs: []regMask{fp}, outputs: []regMask{fp}}
fp21 = regInfo{inputs: []regMask{fp, fp}, outputs: []regMask{fp}}
fp2flags = regInfo{inputs: []regMask{fp, fp}}
fpload = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{fp}}
cmd/compile: optimize loong64 with register indexed load/store goos: linux goarch: loong64 pkg: test/bench/go1 cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | BinaryTree17 7.766 ± 1% 7.640 ± 2% -1.62% (p=0.000 n=20) Fannkuch11 2.649 ± 0% 2.358 ± 0% -10.96% (p=0.000 n=20) FmtFprintfEmpty 35.89n ± 0% 35.87n ± 0% -0.06% (p=0.000 n=20) FmtFprintfString 59.44n ± 0% 57.25n ± 2% -3.68% (p=0.000 n=20) FmtFprintfInt 62.07n ± 0% 60.04n ± 0% -3.27% (p=0.000 n=20) FmtFprintfIntInt 97.90n ± 0% 97.26n ± 0% -0.65% (p=0.000 n=20) FmtFprintfPrefixedInt 116.7n ± 0% 119.2n ± 0% +2.14% (p=0.000 n=20) FmtFprintfFloat 204.5n ± 0% 201.9n ± 0% -1.30% (p=0.000 n=20) FmtManyArgs 455.9n ± 0% 466.8n ± 0% +2.39% (p=0.000 n=20) GobDecode 7.458m ± 1% 7.138m ± 1% -4.28% (p=0.000 n=20) GobEncode 8.573m ± 1% 8.473m ± 1% ~ (p=0.091 n=20) Gzip 280.2m ± 0% 284.9m ± 0% +1.67% (p=0.000 n=20) Gunzip 32.68m ± 0% 32.67m ± 0% ~ (p=0.211 n=20) HTTPClientServer 54.22µ ± 0% 53.24µ ± 0% -1.80% (p=0.000 n=20) JSONEncode 9.427m ± 1% 9.152m ± 0% -2.92% (p=0.000 n=20) JSONDecode 47.08m ± 1% 46.85m ± 1% -0.49% (p=0.007 n=20) Mandelbrot200 4.601m ± 0% 4.605m ± 0% +0.08% (p=0.000 n=20) GoParse 4.776m ± 0% 4.655m ± 1% -2.52% (p=0.000 n=20) RegexpMatchEasy0_32 59.77n ± 0% 57.59n ± 0% -3.66% (p=0.000 n=20) RegexpMatchEasy0_1K 458.1n ± 0% 458.8n ± 0% +0.15% (p=0.000 n=20) RegexpMatchEasy1_32 59.36n ± 0% 59.24n ± 0% -0.20% (p=0.000 n=20) RegexpMatchEasy1_1K 557.7n ± 0% 560.2n ± 0% +0.46% (p=0.000 n=20) RegexpMatchMedium_32 803.1n ± 0% 772.8n ± 0% -3.77% (p=0.000 n=20) RegexpMatchMedium_1K 27.29µ ± 0% 25.88µ ± 0% -5.18% (p=0.000 n=20) RegexpMatchHard_32 1.385µ ± 0% 1.304µ ± 0% -5.85% (p=0.000 n=20) RegexpMatchHard_1K 40.92µ ± 0% 39.58µ ± 0% -3.27% (p=0.000 n=20) Revcomp 474.3m ± 0% 410.0m ± 0% -13.56% (p=0.000 n=20) Template 78.16m ± 0% 76.32m ± 1% -2.36% (p=0.000 n=20) TimeParse 271.8n ± 0% 272.1n ± 0% +0.11% (p=0.000 n=20) TimeFormat 292.3n ± 0% 294.8n ± 0% +0.86% (p=0.000 n=20) geomean 51.98µ 50.82µ -2.22% Change-Id: Ia78f1ddee8f1d9ec7192a4b8d2a4ec6058679956 Reviewed-on: https://go-review.googlesource.com/c/go/+/615918 Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> Reviewed-by: Michael Knyszek <mknyszek@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: abner chenc <chenguoqi@loongson.cn>
2024-09-26 14:17:17 +08:00
fp2load = regInfo{inputs: []regMask{gpspsbg, gpg}, outputs: []regMask{fp}}
fpstore = regInfo{inputs: []regMask{gpspsbg, fp}}
cmd/compile: optimize loong64 with register indexed load/store goos: linux goarch: loong64 pkg: test/bench/go1 cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | BinaryTree17 7.766 ± 1% 7.640 ± 2% -1.62% (p=0.000 n=20) Fannkuch11 2.649 ± 0% 2.358 ± 0% -10.96% (p=0.000 n=20) FmtFprintfEmpty 35.89n ± 0% 35.87n ± 0% -0.06% (p=0.000 n=20) FmtFprintfString 59.44n ± 0% 57.25n ± 2% -3.68% (p=0.000 n=20) FmtFprintfInt 62.07n ± 0% 60.04n ± 0% -3.27% (p=0.000 n=20) FmtFprintfIntInt 97.90n ± 0% 97.26n ± 0% -0.65% (p=0.000 n=20) FmtFprintfPrefixedInt 116.7n ± 0% 119.2n ± 0% +2.14% (p=0.000 n=20) FmtFprintfFloat 204.5n ± 0% 201.9n ± 0% -1.30% (p=0.000 n=20) FmtManyArgs 455.9n ± 0% 466.8n ± 0% +2.39% (p=0.000 n=20) GobDecode 7.458m ± 1% 7.138m ± 1% -4.28% (p=0.000 n=20) GobEncode 8.573m ± 1% 8.473m ± 1% ~ (p=0.091 n=20) Gzip 280.2m ± 0% 284.9m ± 0% +1.67% (p=0.000 n=20) Gunzip 32.68m ± 0% 32.67m ± 0% ~ (p=0.211 n=20) HTTPClientServer 54.22µ ± 0% 53.24µ ± 0% -1.80% (p=0.000 n=20) JSONEncode 9.427m ± 1% 9.152m ± 0% -2.92% (p=0.000 n=20) JSONDecode 47.08m ± 1% 46.85m ± 1% -0.49% (p=0.007 n=20) Mandelbrot200 4.601m ± 0% 4.605m ± 0% +0.08% (p=0.000 n=20) GoParse 4.776m ± 0% 4.655m ± 1% -2.52% (p=0.000 n=20) RegexpMatchEasy0_32 59.77n ± 0% 57.59n ± 0% -3.66% (p=0.000 n=20) RegexpMatchEasy0_1K 458.1n ± 0% 458.8n ± 0% +0.15% (p=0.000 n=20) RegexpMatchEasy1_32 59.36n ± 0% 59.24n ± 0% -0.20% (p=0.000 n=20) RegexpMatchEasy1_1K 557.7n ± 0% 560.2n ± 0% +0.46% (p=0.000 n=20) RegexpMatchMedium_32 803.1n ± 0% 772.8n ± 0% -3.77% (p=0.000 n=20) RegexpMatchMedium_1K 27.29µ ± 0% 25.88µ ± 0% -5.18% (p=0.000 n=20) RegexpMatchHard_32 1.385µ ± 0% 1.304µ ± 0% -5.85% (p=0.000 n=20) RegexpMatchHard_1K 40.92µ ± 0% 39.58µ ± 0% -3.27% (p=0.000 n=20) Revcomp 474.3m ± 0% 410.0m ± 0% -13.56% (p=0.000 n=20) Template 78.16m ± 0% 76.32m ± 1% -2.36% (p=0.000 n=20) TimeParse 271.8n ± 0% 272.1n ± 0% +0.11% (p=0.000 n=20) TimeFormat 292.3n ± 0% 294.8n ± 0% +0.86% (p=0.000 n=20) geomean 51.98µ 50.82µ -2.22% Change-Id: Ia78f1ddee8f1d9ec7192a4b8d2a4ec6058679956 Reviewed-on: https://go-review.googlesource.com/c/go/+/615918 Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> Reviewed-by: Michael Knyszek <mknyszek@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: abner chenc <chenguoqi@loongson.cn>
2024-09-26 14:17:17 +08:00
fpstore2 = regInfo{inputs: []regMask{gpspsbg, gpg, fp}}
cmd/compile: optimize math.Float64(32)bits and math.Float64(32)frombits on loong64 Use float <-> int register moves without conversion instead of stores and loads to move float <-> int values like arm64 and mips64. goos: linux goarch: loong64 pkg: math cpu: Loongson-3A6000 @ 2500.00MHz │ bench.old │ bench.new │ │ sec/op │ sec/op vs base │ Acos 15.98n ± 0% 15.94n ± 0% -0.25% (p=0.000 n=20) Acosh 27.75n ± 0% 25.56n ± 0% -7.89% (p=0.000 n=20) Asin 15.85n ± 0% 15.76n ± 0% -0.57% (p=0.000 n=20) Asinh 39.79n ± 0% 37.69n ± 0% -5.28% (p=0.000 n=20) Atan 7.261n ± 0% 7.242n ± 0% -0.27% (p=0.000 n=20) Atanh 28.30n ± 0% 27.62n ± 0% -2.40% (p=0.000 n=20) Atan2 15.85n ± 0% 15.75n ± 0% -0.63% (p=0.000 n=20) Cbrt 27.02n ± 0% 21.08n ± 0% -21.98% (p=0.000 n=20) Ceil 2.830n ± 1% 2.896n ± 1% +2.31% (p=0.000 n=20) Copysign 0.8022n ± 0% 0.8004n ± 0% -0.22% (p=0.000 n=20) Cos 11.64n ± 0% 11.61n ± 0% -0.26% (p=0.000 n=20) Cosh 35.98n ± 0% 33.44n ± 0% -7.05% (p=0.000 n=20) Erf 10.09n ± 0% 10.08n ± 0% -0.10% (p=0.000 n=20) Erfc 11.40n ± 0% 11.35n ± 0% -0.44% (p=0.000 n=20) Erfinv 12.31n ± 0% 12.29n ± 0% -0.16% (p=0.000 n=20) Erfcinv 12.16n ± 0% 12.17n ± 0% +0.08% (p=0.000 n=20) Exp 28.41n ± 0% 26.44n ± 0% -6.95% (p=0.000 n=20) ExpGo 28.68n ± 0% 27.07n ± 0% -5.60% (p=0.000 n=20) Expm1 17.21n ± 0% 16.75n ± 0% -2.67% (p=0.000 n=20) Exp2 24.71n ± 0% 23.01n ± 0% -6.88% (p=0.000 n=20) Exp2Go 25.17n ± 0% 23.91n ± 0% -4.99% (p=0.000 n=20) Abs 0.8004n ± 0% 0.8004n ± 0% ~ (p=0.224 n=20) Dim 1.201n ± 0% 1.201n ± 0% ~ (p=1.000 n=20) ¹ Floor 2.848n ± 0% 2.859n ± 0% +0.39% (p=0.000 n=20) Max 3.074n ± 0% 3.071n ± 0% ~ (p=0.481 n=20) Min 3.179n ± 0% 3.176n ± 0% -0.09% (p=0.003 n=20) Mod 49.62n ± 0% 44.82n ± 0% -9.67% (p=0.000 n=20) Frexp 7.604n ± 0% 6.803n ± 0% -10.53% (p=0.000 n=20) Gamma 18.01n ± 0% 17.61n ± 0% -2.22% (p=0.000 n=20) Hypot 7.204n ± 0% 7.604n ± 0% +5.55% (p=0.000 n=20) HypotGo 7.204n ± 0% 7.604n ± 0% +5.56% (p=0.000 n=20) Ilogb 6.003n ± 0% 6.003n ± 0% ~ (p=0.407 n=20) J0 76.43n ± 0% 76.24n ± 0% -0.25% (p=0.000 n=20) J1 76.44n ± 0% 76.44n ± 0% ~ (p=1.000 n=20) Jn 168.2n ± 0% 168.5n ± 0% +0.18% (p=0.000 n=20) Ldexp 8.804n ± 0% 7.604n ± 0% -13.63% (p=0.000 n=20) Lgamma 19.01n ± 0% 19.01n ± 0% ~ (p=0.695 n=20) Log 19.38n ± 0% 19.12n ± 0% -1.34% (p=0.000 n=20) Logb 6.003n ± 0% 6.003n ± 0% ~ (p=1.000 n=20) Log1p 18.57n ± 0% 16.72n ± 0% -9.96% (p=0.000 n=20) Log10 20.67n ± 0% 20.45n ± 0% -1.06% (p=0.000 n=20) Log2 9.605n ± 0% 8.804n ± 0% -8.34% (p=0.000 n=20) Modf 4.402n ± 0% 4.402n ± 0% ~ (p=1.000 n=20) Nextafter32 7.204n ± 0% 5.603n ± 0% -22.22% (p=0.000 n=20) Nextafter64 6.803n ± 0% 6.003n ± 0% -11.76% (p=0.000 n=20) PowInt 39.62n ± 0% 37.22n ± 0% -6.06% (p=0.000 n=20) PowFrac 120.9n ± 0% 108.9n ± 0% -9.93% (p=0.000 n=20) Pow10Pos 1.601n ± 0% 1.601n ± 0% ~ (p=0.487 n=20) Pow10Neg 2.675n ± 0% 2.675n ± 0% ~ (p=1.000 n=20) Round 3.018n ± 0% 2.401n ± 0% -20.46% (p=0.000 n=20) RoundToEven 3.822n ± 0% 3.001n ± 0% -21.48% (p=0.000 n=20) Remainder 45.62n ± 0% 42.42n ± 0% -7.01% (p=0.000 n=20) Signbit 0.9075n ± 0% 0.8004n ± 0% -11.81% (p=0.000 n=20) Sin 12.65n ± 0% 12.65n ± 0% ~ (p=0.503 n=20) Sincos 14.81n ± 0% 14.60n ± 0% -1.42% (p=0.000 n=20) Sinh 36.75n ± 0% 35.11n ± 0% -4.46% (p=0.000 n=20) SqrtIndirect 1.201n ± 0% 1.201n ± 0% ~ (p=1.000 n=20) ¹ SqrtLatency 4.002n ± 0% 4.002n ± 0% ~ (p=1.000 n=20) SqrtIndirectLatency 4.002n ± 0% 4.002n ± 0% ~ (p=1.000 n=20) SqrtGoLatency 52.85n ± 0% 40.82n ± 0% -22.76% (p=0.000 n=20) SqrtPrime 887.4n ± 0% 887.4n ± 0% ~ (p=0.751 n=20) Tan 13.95n ± 0% 13.97n ± 0% +0.18% (p=0.000 n=20) Tanh 36.79n ± 0% 34.89n ± 0% -5.16% (p=0.000 n=20) Trunc 2.849n ± 0% 2.861n ± 0% +0.42% (p=0.000 n=20) Y0 77.44n ± 0% 77.64n ± 0% +0.26% (p=0.000 n=20) Y1 74.41n ± 0% 74.33n ± 0% -0.11% (p=0.000 n=20) Yn 158.7n ± 0% 159.0n ± 0% +0.19% (p=0.000 n=20) Float64bits 0.8774n ± 0% 0.4002n ± 0% -54.39% (p=0.000 n=20) Float64frombits 0.8042n ± 0% 0.4002n ± 0% -50.24% (p=0.000 n=20) Float32bits 1.1230n ± 0% 0.5336n ± 0% -52.48% (p=0.000 n=20) Float32frombits 1.0670n ± 0% 0.8004n ± 0% -24.99% (p=0.000 n=20) FMA 2.001n ± 0% 2.001n ± 0% ~ (p=0.605 n=20) geomean 10.87n 10.10n -7.15% ¹ all samples are equal goos: linux goarch: loong64 pkg: math cpu: Loongson-3A5000 @ 2500.00MHz │ bench.old │ bench.new │ │ sec/op │ sec/op vs base │ Acos 33.10n ± 0% 31.95n ± 2% -3.46% (p=0.000 n=20) Acosh 58.38n ± 0% 50.44n ± 0% -13.60% (p=0.000 n=20) Asin 32.70n ± 0% 31.94n ± 0% -2.32% (p=0.000 n=20) Asinh 57.65n ± 0% 50.83n ± 0% -11.82% (p=0.000 n=20) Atan 14.21n ± 0% 14.21n ± 0% ~ (p=0.501 n=20) Atanh 60.86n ± 0% 54.44n ± 0% -10.56% (p=0.000 n=20) Atan2 32.02n ± 0% 34.02n ± 0% +6.25% (p=0.000 n=20) Cbrt 55.58n ± 0% 40.64n ± 0% -26.88% (p=0.000 n=20) Ceil 9.566n ± 0% 9.566n ± 0% ~ (p=0.463 n=20) Copysign 0.8005n ± 0% 0.8005n ± 0% ~ (p=0.806 n=20) Cos 18.02n ± 0% 18.02n ± 0% ~ (p=0.191 n=20) Cosh 64.44n ± 0% 65.64n ± 0% +1.86% (p=0.000 n=20) Erf 16.15n ± 0% 16.16n ± 0% ~ (p=0.770 n=20) Erfc 18.71n ± 0% 18.83n ± 0% +0.61% (p=0.000 n=20) Erfinv 19.33n ± 0% 19.34n ± 0% ~ (p=0.513 n=20) Erfcinv 18.90n ± 0% 19.78n ± 0% +4.63% (p=0.000 n=20) Exp 50.04n ± 0% 49.66n ± 0% -0.75% (p=0.000 n=20) ExpGo 50.03n ± 0% 50.03n ± 0% ~ (p=0.723 n=20) Expm1 28.41n ± 0% 28.27n ± 0% -0.49% (p=0.000 n=20) Exp2 50.08n ± 0% 51.23n ± 0% +2.31% (p=0.000 n=20) Exp2Go 49.77n ± 0% 49.89n ± 0% +0.24% (p=0.000 n=20) Abs 0.8009n ± 0% 0.8006n ± 0% ~ (p=0.317 n=20) Dim 1.987n ± 0% 1.993n ± 0% +0.28% (p=0.001 n=20) Floor 8.543n ± 0% 8.548n ± 0% ~ (p=0.509 n=20) Max 6.670n ± 0% 6.672n ± 0% ~ (p=0.335 n=20) Min 6.694n ± 0% 6.694n ± 0% ~ (p=0.459 n=20) Mod 56.44n ± 0% 53.23n ± 0% -5.70% (p=0.000 n=20) Frexp 8.409n ± 0% 7.606n ± 0% -9.55% (p=0.000 n=20) Gamma 35.64n ± 0% 35.23n ± 0% -1.15% (p=0.000 n=20) Hypot 11.21n ± 0% 10.61n ± 0% -5.31% (p=0.000 n=20) HypotGo 11.50n ± 0% 11.01n ± 0% -4.30% (p=0.000 n=20) Ilogb 7.606n ± 0% 6.804n ± 0% -10.54% (p=0.000 n=20) J0 125.3n ± 0% 126.5n ± 0% +0.96% (p=0.000 n=20) J1 124.9n ± 0% 125.3n ± 0% +0.32% (p=0.000 n=20) Jn 264.3n ± 0% 265.9n ± 0% +0.61% (p=0.000 n=20) Ldexp 9.606n ± 0% 9.204n ± 0% -4.19% (p=0.000 n=20) Lgamma 38.82n ± 0% 38.85n ± 0% +0.06% (p=0.019 n=20) Log 38.44n ± 0% 28.04n ± 0% -27.06% (p=0.000 n=20) Logb 8.405n ± 0% 7.605n ± 0% -9.52% (p=0.000 n=20) Log1p 31.62n ± 0% 27.11n ± 0% -14.26% (p=0.000 n=20) Log10 38.83n ± 0% 28.42n ± 0% -26.81% (p=0.000 n=20) Log2 11.21n ± 0% 10.41n ± 0% -7.14% (p=0.000 n=20) Modf 5.204n ± 0% 5.205n ± 0% ~ (p=0.983 n=20) Nextafter32 8.809n ± 0% 7.208n ± 0% -18.18% (p=0.000 n=20) Nextafter64 8.405n ± 0% 8.406n ± 0% +0.01% (p=0.007 n=20) PowInt 48.83n ± 0% 44.78n ± 0% -8.28% (p=0.000 n=20) PowFrac 146.9n ± 0% 142.1n ± 0% -3.23% (p=0.000 n=20) Pow10Pos 2.334n ± 0% 2.333n ± 0% ~ (p=0.110 n=20) Pow10Neg 4.803n ± 0% 4.803n ± 0% ~ (p=0.130 n=20) Round 4.816n ± 0% 3.819n ± 0% -20.70% (p=0.000 n=20) RoundToEven 5.735n ± 0% 5.204n ± 0% -9.26% (p=0.000 n=20) Remainder 52.05n ± 0% 49.64n ± 0% -4.63% (p=0.000 n=20) Signbit 1.201n ± 0% 1.001n ± 0% -16.65% (p=0.000 n=20) Sin 20.63n ± 0% 20.64n ± 0% +0.05% (p=0.040 n=20) Sincos 23.82n ± 0% 24.62n ± 0% +3.36% (p=0.000 n=20) Sinh 71.25n ± 0% 68.44n ± 0% -3.94% (p=0.000 n=20) SqrtIndirect 2.001n ± 0% 2.001n ± 0% ~ (p=0.182 n=20) SqrtLatency 4.003n ± 0% 4.003n ± 0% ~ (p=0.754 n=20) SqrtIndirectLatency 4.003n ± 0% 4.003n ± 0% ~ (p=0.773 n=20) SqrtGoLatency 60.84n ± 0% 81.26n ± 0% +33.56% (p=0.000 n=20) SqrtPrime 1.791µ ± 0% 1.791µ ± 0% ~ (p=0.784 n=20) Tan 27.22n ± 0% 27.22n ± 0% ~ (p=0.819 n=20) Tanh 70.88n ± 0% 69.04n ± 0% -2.60% (p=0.000 n=20) Trunc 8.543n ± 0% 8.543n ± 0% ~ (p=0.784 n=20) Y0 122.9n ± 0% 122.9n ± 0% ~ (p=0.559 n=20) Y1 123.3n ± 0% 121.7n ± 0% -1.30% (p=0.000 n=20) Yn 263.0n ± 0% 262.6n ± 0% -0.15% (p=0.000 n=20) Float64bits 1.2010n ± 0% 0.6004n ± 0% -50.01% (p=0.000 n=20) Float64frombits 1.2010n ± 0% 0.6004n ± 0% -50.01% (p=0.000 n=20) Float32bits 1.7010n ± 0% 0.8005n ± 0% -52.94% (p=0.000 n=20) Float32frombits 1.5010n ± 0% 0.8005n ± 0% -46.67% (p=0.000 n=20) FMA 2.001n ± 0% 2.001n ± 0% ~ (p=0.238 n=20) geomean 17.41n 16.15n -7.19% Change-Id: I0a0c263af2f07203eab1782e69c706f20c689d8d Reviewed-on: https://go-review.googlesource.com/c/go/+/604737 Auto-Submit: Tim King <taking@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> Reviewed-by: Meidan Li <limeidan@loongson.cn> Reviewed-by: Tim King <taking@google.com> Reviewed-by: abner chenc <chenguoqi@loongson.cn>
2024-08-12 16:41:11 +08:00
fpgp = regInfo{inputs: []regMask{fp}, outputs: []regMask{gp}}
gpfp = regInfo{inputs: []regMask{gp}, outputs: []regMask{fp}}
readflags = regInfo{inputs: nil, outputs: []regMask{gp}}
)
ops := []opData{
// binary ops
{name: "ADDV", argLength: 2, reg: gp21, asm: "ADDVU", commutative: true}, // arg0 + arg1
{name: "ADDVconst", argLength: 1, reg: gp11sp, asm: "ADDVU", aux: "Int64"}, // arg0 + auxInt. auxInt is 32-bit, also in other *const ops.
{name: "SUBV", argLength: 2, reg: gp21, asm: "SUBVU"}, // arg0 - arg1
{name: "SUBVconst", argLength: 1, reg: gp11, asm: "SUBVU", aux: "Int64"}, // arg0 - auxInt
{name: "MULV", argLength: 2, reg: gp21, asm: "MULV", commutative: true, typ: "Int64"}, // arg0 * arg1
{name: "MULHV", argLength: 2, reg: gp21, asm: "MULHV", commutative: true, typ: "Int64"}, // (arg0 * arg1) >> 64, signed
{name: "MULHVU", argLength: 2, reg: gp21, asm: "MULHVU", commutative: true, typ: "UInt64"}, // (arg0 * arg1) >> 64, unsigned
{name: "DIVV", argLength: 2, reg: gp21, asm: "DIVV", typ: "Int64"}, // arg0 / arg1, signed
{name: "DIVVU", argLength: 2, reg: gp21, asm: "DIVVU", typ: "UInt64"}, // arg0 / arg1, unsigned
{name: "REMV", argLength: 2, reg: gp21, asm: "REMV", typ: "Int64"}, // arg0 / arg1, signed
{name: "REMVU", argLength: 2, reg: gp21, asm: "REMVU", typ: "UInt64"}, // arg0 / arg1, unsigned
{name: "ADDF", argLength: 2, reg: fp21, asm: "ADDF", commutative: true}, // arg0 + arg1
{name: "ADDD", argLength: 2, reg: fp21, asm: "ADDD", commutative: true}, // arg0 + arg1
{name: "SUBF", argLength: 2, reg: fp21, asm: "SUBF"}, // arg0 - arg1
{name: "SUBD", argLength: 2, reg: fp21, asm: "SUBD"}, // arg0 - arg1
{name: "MULF", argLength: 2, reg: fp21, asm: "MULF", commutative: true}, // arg0 * arg1
{name: "MULD", argLength: 2, reg: fp21, asm: "MULD", commutative: true}, // arg0 * arg1
{name: "DIVF", argLength: 2, reg: fp21, asm: "DIVF"}, // arg0 / arg1
{name: "DIVD", argLength: 2, reg: fp21, asm: "DIVD"}, // arg0 / arg1
{name: "AND", argLength: 2, reg: gp21, asm: "AND", commutative: true}, // arg0 & arg1
{name: "ANDconst", argLength: 1, reg: gp11, asm: "AND", aux: "Int64"}, // arg0 & auxInt
{name: "OR", argLength: 2, reg: gp21, asm: "OR", commutative: true}, // arg0 | arg1
{name: "ORconst", argLength: 1, reg: gp11, asm: "OR", aux: "Int64"}, // arg0 | auxInt
{name: "XOR", argLength: 2, reg: gp21, asm: "XOR", commutative: true, typ: "UInt64"}, // arg0 ^ arg1
{name: "XORconst", argLength: 1, reg: gp11, asm: "XOR", aux: "Int64", typ: "UInt64"}, // arg0 ^ auxInt
{name: "NOR", argLength: 2, reg: gp21, asm: "NOR", commutative: true}, // ^(arg0 | arg1)
{name: "NORconst", argLength: 1, reg: gp11, asm: "NOR", aux: "Int64"}, // ^(arg0 | auxInt)
{name: "NEGV", argLength: 1, reg: gp11}, // -arg0
{name: "NEGF", argLength: 1, reg: fp11, asm: "NEGF"}, // -arg0, float32
{name: "NEGD", argLength: 1, reg: fp11, asm: "NEGD"}, // -arg0, float64
{name: "SQRTD", argLength: 1, reg: fp11, asm: "SQRTD"}, // sqrt(arg0), float64
{name: "SQRTF", argLength: 1, reg: fp11, asm: "SQRTF"}, // sqrt(arg0), float32
cmd/compile: wire up math/bits.Len intrinsics for loong64 For the SubFromLen64 codegen test case to work as intended, we need to fold c-(-(x-d)) into x+(c-d). Still, some instances of LeadingZeros are not optimized into single CLZ instructions right now (actually, the LeadingZeros micro-benchmarks are currently still compiled with redundant adds/subs of 64, due to interference of loop optimizations before lowering), but perf numbers indicate it's not that bad after all. Micro-benchmark results on Loongson 3A5000 and 3A6000: goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | LeadingZeros 3.660n ± 0% 1.348n ± 0% -63.17% (p=0.000 n=20) LeadingZeros8 1.777n ± 0% 1.767n ± 0% -0.56% (p=0.000 n=20) LeadingZeros16 2.816n ± 0% 1.770n ± 0% -37.14% (p=0.000 n=20) LeadingZeros32 5.293n ± 1% 1.683n ± 0% -68.21% (p=0.000 n=20) LeadingZeros64 3.622n ± 0% 1.349n ± 0% -62.76% (p=0.000 n=20) geomean 3.229n 1.571n -51.35% goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | LeadingZeros 2.410n ± 0% 1.103n ± 1% -54.23% (p=0.000 n=20) LeadingZeros8 1.236n ± 0% 1.501n ± 0% +21.44% (p=0.000 n=20) LeadingZeros16 2.106n ± 0% 1.501n ± 0% -28.73% (p=0.000 n=20) LeadingZeros32 2.860n ± 0% 1.324n ± 0% -53.72% (p=0.000 n=20) LeadingZeros64 2.6135n ± 0% 0.9509n ± 0% -63.62% (p=0.000 n=20) geomean 2.159n 1.256n -41.81% Updates #59120 This patch is a copy of CL 483356. Co-authored-by: WANG Xuerui <git@xen0n.name> Change-Id: Iee81a17f7da06d77a427e73dfcc016f2b15ae556 Reviewed-on: https://go-review.googlesource.com/c/go/+/624575 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: Carlos Amedee <carlos@golang.org> Reviewed-by: abner chenc <chenguoqi@loongson.cn>
2024-11-02 10:59:20 +08:00
{name: "CLZW", argLength: 1, reg: gp11, asm: "CLZW"}, // Count leading (high order) zeroes (returns 0-32)
{name: "CLZV", argLength: 1, reg: gp11, asm: "CLZV"}, // Count leading (high order) zeroes (returns 0-64)
{name: "REVB2H", argLength: 1, reg: gp11, asm: "REVB2H"}, // Swap bytes: 0x11223344 -> 0x22114433 (sign extends to 64 bits)
{name: "REVB2W", argLength: 1, reg: gp11, asm: "REVB2W"}, // Swap bytes: 0x1122334455667788 -> 0x4433221188776655
{name: "REVBV", argLength: 1, reg: gp11, asm: "REVBV"}, // Swap bytes: 0x1122334455667788 -> 0x8877665544332211
{name: "FMINF", argLength: 2, reg: fp21, resultNotInArgs: true, asm: "FMINF", commutative: true, typ: "Float32"}, // min(arg0, arg1), float32
{name: "FMIND", argLength: 2, reg: fp21, resultNotInArgs: true, asm: "FMIND", commutative: true, typ: "Float64"}, // min(arg0, arg1), float64
{name: "FMAXF", argLength: 2, reg: fp21, resultNotInArgs: true, asm: "FMAXF", commutative: true, typ: "Float32"}, // max(arg0, arg1), float32
{name: "FMAXD", argLength: 2, reg: fp21, resultNotInArgs: true, asm: "FMAXD", commutative: true, typ: "Float64"}, // max(arg0, arg1), float64
{name: "MASKEQZ", argLength: 2, reg: gp21, asm: "MASKEQZ"}, // returns 0 if arg1 == 0, otherwise returns arg0
{name: "MASKNEZ", argLength: 2, reg: gp21, asm: "MASKNEZ"}, // returns 0 if arg1 != 0, otherwise returns arg0
{name: "ABSD", argLength: 1, reg: fp11, asm: "ABSD"}, // abs(arg0), float64
{name: "FCOPYSGD", argLength: 2, reg: fp21, asm: "FCOPYSGD"}, // float64
// shifts
cmd/compile: intrinsify RotateLeft{32,64} on loong64 Benchmark on crypto/sha256 (provided by Xiaodong Liu): name old time/op new time/op delta Hash8Bytes/New 1.19µs ± 0% 0.97µs ± 0% -18.75% (p=0.000 n=9+9) Hash8Bytes/Sum224 1.21µs ± 0% 0.97µs ± 0% -20.04% (p=0.000 n=9+10) Hash8Bytes/Sum256 1.21µs ± 0% 0.98µs ± 0% -19.16% (p=0.000 n=10+7) Hash1K/New 15.9µs ± 0% 12.4µs ± 0% -22.10% (p=0.000 n=10+10) Hash1K/Sum224 15.9µs ± 0% 12.4µs ± 0% -22.18% (p=0.000 n=8+10) Hash1K/Sum256 15.9µs ± 0% 12.4µs ± 0% -22.15% (p=0.000 n=10+9) Hash8K/New 119µs ± 0% 92µs ± 0% -22.40% (p=0.000 n=10+9) Hash8K/Sum224 119µs ± 0% 92µs ± 0% -22.41% (p=0.000 n=9+10) Hash8K/Sum256 119µs ± 0% 92µs ± 0% -22.40% (p=0.000 n=9+9) name old speed new speed delta Hash8Bytes/New 6.70MB/s ± 0% 8.25MB/s ± 0% +23.13% (p=0.000 n=10+10) Hash8Bytes/Sum224 6.60MB/s ± 0% 8.26MB/s ± 0% +25.06% (p=0.000 n=10+10) Hash8Bytes/Sum256 6.59MB/s ± 0% 8.15MB/s ± 0% +23.67% (p=0.000 n=10+7) Hash1K/New 64.3MB/s ± 0% 82.5MB/s ± 0% +28.36% (p=0.000 n=10+10) Hash1K/Sum224 64.3MB/s ± 0% 82.6MB/s ± 0% +28.51% (p=0.000 n=10+10) Hash1K/Sum256 64.3MB/s ± 0% 82.6MB/s ± 0% +28.46% (p=0.000 n=9+9) Hash8K/New 69.0MB/s ± 0% 89.0MB/s ± 0% +28.87% (p=0.000 n=10+8) Hash8K/Sum224 69.0MB/s ± 0% 89.0MB/s ± 0% +28.88% (p=0.000 n=9+10) Hash8K/Sum256 69.0MB/s ± 0% 88.9MB/s ± 0% +28.87% (p=0.000 n=8+9) Benchmark on crypto/sha512 (provided by Xiaodong Liu): name old time/op new time/op delta Hash8Bytes/New 1.55µs ± 0% 1.31µs ± 0% -15.67% (p=0.000 n=10+10) Hash8Bytes/Sum384 1.59µs ± 0% 1.35µs ± 0% -14.97% (p=0.000 n=10+10) Hash8Bytes/Sum512 1.62µs ± 0% 1.39µs ± 0% -14.02% (p=0.000 n=10+10) Hash1K/New 10.7µs ± 0% 8.6µs ± 0% -19.60% (p=0.000 n=8+8) Hash1K/Sum384 10.8µs ± 0% 8.7µs ± 0% -19.40% (p=0.000 n=9+9) Hash1K/Sum512 10.8µs ± 0% 8.7µs ± 0% -19.35% (p=0.000 n=9+10) Hash8K/New 74.6µs ± 0% 59.6µs ± 0% -20.08% (p=0.000 n=10+9) Hash8K/Sum384 74.7µs ± 0% 59.7µs ± 0% -20.04% (p=0.000 n=9+8) Hash8K/Sum512 74.7µs ± 0% 59.7µs ± 0% -20.01% (p=0.000 n=10+10) name old speed new speed delta Hash8Bytes/New 5.16MB/s ± 0% 6.12MB/s ± 0% +18.60% (p=0.000 n=10+8) Hash8Bytes/Sum384 5.02MB/s ± 0% 5.90MB/s ± 0% +17.56% (p=0.000 n=10+10) Hash8Bytes/Sum512 4.94MB/s ± 0% 5.74MB/s ± 0% +16.29% (p=0.000 n=10+9) Hash1K/New 95.4MB/s ± 0% 118.6MB/s ± 0% +24.38% (p=0.000 n=10+10) Hash1K/Sum384 95.0MB/s ± 0% 117.9MB/s ± 0% +24.06% (p=0.000 n=8+9) Hash1K/Sum512 94.8MB/s ± 0% 117.5MB/s ± 0% +23.99% (p=0.000 n=8+9) Hash8K/New 110MB/s ± 0% 137MB/s ± 0% +25.11% (p=0.000 n=9+6) Hash8K/Sum384 110MB/s ± 0% 137MB/s ± 0% +25.07% (p=0.000 n=9+8) Hash8K/Sum512 110MB/s ± 0% 137MB/s ± 0% +25.01% (p=0.000 n=10+10) Change-Id: I28ccfce634659305a336c8e0a3f8589f7361d661 Reviewed-on: https://go-review.googlesource.com/c/go/+/422317 Reviewed-by: Keith Randall <khr@golang.org> Run-TryBot: Wayne Zuo <wdvxdr@golangcn.org> Reviewed-by: Ian Lance Taylor <iant@google.com> Reviewed-by: David Chase <drchase@google.com>
2022-08-09 23:53:37 +08:00
{name: "SLLV", argLength: 2, reg: gp21, asm: "SLLV"}, // arg0 << arg1, shift amount is mod 64
{name: "SLLVconst", argLength: 1, reg: gp11, asm: "SLLV", aux: "Int64"}, // arg0 << auxInt
{name: "SRLV", argLength: 2, reg: gp21, asm: "SRLV"}, // arg0 >> arg1, unsigned, shift amount is mod 64
{name: "SRLVconst", argLength: 1, reg: gp11, asm: "SRLV", aux: "Int64"}, // arg0 >> auxInt, unsigned
{name: "SRAV", argLength: 2, reg: gp21, asm: "SRAV"}, // arg0 >> arg1, signed, shift amount is mod 64
{name: "SRAVconst", argLength: 1, reg: gp11, asm: "SRAV", aux: "Int64"}, // arg0 >> auxInt, signed
{name: "ROTR", argLength: 2, reg: gp21, asm: "ROTR"}, // arg0 right rotate by (arg1 mod 32) bits
{name: "ROTRV", argLength: 2, reg: gp21, asm: "ROTRV"}, // arg0 right rotate by (arg1 mod 64) bits
{name: "ROTRconst", argLength: 1, reg: gp11, asm: "ROTR", aux: "Int64"}, // uint32(arg0) right rotate by auxInt bits, auxInt should be in the range 0 to 31.
{name: "ROTRVconst", argLength: 1, reg: gp11, asm: "ROTRV", aux: "Int64"}, // arg0 right rotate by auxInt bits, auxInt should be in the range 0 to 63.
// comparisons
{name: "SGT", argLength: 2, reg: gp21, asm: "SGT", typ: "Bool"}, // 1 if arg0 > arg1 (signed), 0 otherwise
{name: "SGTconst", argLength: 1, reg: gp11, asm: "SGT", aux: "Int64", typ: "Bool"}, // 1 if auxInt > arg0 (signed), 0 otherwise
{name: "SGTU", argLength: 2, reg: gp21, asm: "SGTU", typ: "Bool"}, // 1 if arg0 > arg1 (unsigned), 0 otherwise
{name: "SGTUconst", argLength: 1, reg: gp11, asm: "SGTU", aux: "Int64", typ: "Bool"}, // 1 if auxInt > arg0 (unsigned), 0 otherwise
{name: "CMPEQF", argLength: 2, reg: fp2flags, asm: "CMPEQF", typ: "Flags"}, // flags=true if arg0 = arg1, float32
{name: "CMPEQD", argLength: 2, reg: fp2flags, asm: "CMPEQD", typ: "Flags"}, // flags=true if arg0 = arg1, float64
{name: "CMPGEF", argLength: 2, reg: fp2flags, asm: "CMPGEF", typ: "Flags"}, // flags=true if arg0 >= arg1, float32
{name: "CMPGED", argLength: 2, reg: fp2flags, asm: "CMPGED", typ: "Flags"}, // flags=true if arg0 >= arg1, float64
{name: "CMPGTF", argLength: 2, reg: fp2flags, asm: "CMPGTF", typ: "Flags"}, // flags=true if arg0 > arg1, float32
{name: "CMPGTD", argLength: 2, reg: fp2flags, asm: "CMPGTD", typ: "Flags"}, // flags=true if arg0 > arg1, float64
cmd/compile: add patterns for bitfield opcodes on loong64 goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | LeadingZeros 1.0095n ± 0% 0.8011n ± 0% -20.64% (p=0.000 n=10) LeadingZeros8 1.201n ± 0% 1.167n ± 0% -2.83% (p=0.000 n=10) LeadingZeros16 1.201n ± 0% 1.167n ± 0% -2.83% (p=0.000 n=10) LeadingZeros32 1.201n ± 0% 1.134n ± 0% -5.58% (p=0.000 n=10) LeadingZeros64 0.8007n ± 0% 1.0115n ± 0% +26.32% (p=0.000 n=10) TrailingZeros 0.8054n ± 0% 0.8106n ± 1% +0.65% (p=0.000 n=10) TrailingZeros8 1.067n ± 0% 1.002n ± 1% -6.09% (p=0.000 n=10) TrailingZeros16 1.0540n ± 0% 0.8389n ± 0% -20.40% (p=0.000 n=10) TrailingZeros32 0.8014n ± 0% 0.8117n ± 0% +1.29% (p=0.000 n=10) TrailingZeros64 0.8015n ± 0% 0.8124n ± 1% +1.36% (p=0.000 n=10) OnesCount 3.418n ± 0% 3.417n ± 0% ~ (p=0.911 n=10) OnesCount8 0.8004n ± 0% 0.8004n ± 0% ~ (p=1.000 n=10) OnesCount16 1.440n ± 0% 1.299n ± 0% -9.79% (p=0.000 n=10) OnesCount32 2.969n ± 0% 2.940n ± 0% -0.94% (p=0.000 n=10) OnesCount64 3.563n ± 0% 3.558n ± 0% -0.14% (p=0.000 n=10) RotateLeft 0.6677n ± 0% 0.6670n ± 0% ~ (p=0.055 n=10) RotateLeft8 1.318n ± 1% 1.321n ± 0% ~ (p=0.117 n=10) RotateLeft16 0.8457n ± 1% 0.8442n ± 0% ~ (p=0.325 n=10) RotateLeft32 0.8004n ± 0% 0.8004n ± 0% ~ (p=0.837 n=10) RotateLeft64 0.6678n ± 0% 0.6670n ± 0% -0.13% (p=0.000 n=10) Reverse 0.8004n ± 0% 0.8004n ± 0% ~ (p=1.000 n=10) Reverse8 0.6989n ± 0% 0.6969n ± 1% ~ (p=0.138 n=10) Reverse16 0.6998n ± 1% 0.7004n ± 1% ~ (p=0.985 n=10) Reverse32 0.4158n ± 1% 0.4159n ± 1% ~ (p=0.870 n=10) Reverse64 0.4165n ± 1% 0.4194n ± 2% ~ (p=0.093 n=10) ReverseBytes 0.8004n ± 0% 0.8004n ± 0% ~ (p=1.000 n=10) ReverseBytes16 0.4183n ± 2% 0.4148n ± 1% ~ (p=0.055 n=10) ReverseBytes32 0.4143n ± 2% 0.4153n ± 1% ~ (p=0.869 n=10) ReverseBytes64 0.4168n ± 1% 0.4177n ± 1% ~ (p=0.184 n=10) Add 1.201n ± 0% 1.201n ± 0% ~ (p=0.087 n=10) Add32 1.603n ± 0% 1.601n ± 0% -0.12% (p=0.000 n=10) Add64 1.201n ± 0% 1.201n ± 0% ~ (p=0.211 n=10) Add64multiple 1.839n ± 0% 1.835n ± 0% -0.24% (p=0.001 n=10) Sub 1.202n ± 0% 1.201n ± 0% -0.04% (p=0.033 n=10) Sub32 2.401n ± 0% 1.601n ± 0% -33.32% (p=0.000 n=10) Sub64 1.201n ± 0% 1.201n ± 0% ~ (p=1.000 n=10) Sub64multiple 2.105n ± 0% 2.096n ± 0% -0.40% (p=0.000 n=10) Mul 0.8008n ± 0% 0.8004n ± 0% -0.05% (p=0.000 n=10) Mul32 0.8041n ± 0% 0.8014n ± 0% -0.34% (p=0.000 n=10) Mul64 0.8008n ± 0% 0.8004n ± 0% -0.05% (p=0.000 n=10) Div 8.977n ± 0% 8.945n ± 0% -0.36% (p=0.000 n=10) Div32 4.084n ± 0% 4.086n ± 0% ~ (p=0.445 n=10) Div64 9.316n ± 0% 9.301n ± 0% -0.17% (p=0.000 n=10) geomean 1.141n 1.117n -2.09% Change-Id: I4dc1eaab6728f771bc722ed331fe5c6429bd1037 Reviewed-on: https://go-review.googlesource.com/c/go/+/618475 Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> Reviewed-by: Michael Knyszek <mknyszek@google.com> Reviewed-by: Meidan Li <limeidan@loongson.cn> Reviewed-by: abner chenc <chenguoqi@loongson.cn> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
2024-08-14 17:11:08 +08:00
// bitfield ops
// for bstrpick.w msbw is auxInt>>5, lsbw is auxInt&0x1f
// for bstrpick.d msbd is auxInt>>6, lsbd is auxInt&0x3f
{name: "BSTRPICKW", argLength: 1, reg: gp11, asm: "BSTRPICKW", aux: "Int64"},
{name: "BSTRPICKV", argLength: 1, reg: gp11, asm: "BSTRPICKV", aux: "Int64"},
// moves
{name: "MOVVconst", argLength: 0, reg: gp01, aux: "Int64", asm: "MOVV", typ: "UInt64", rematerializeable: true}, // auxint
{name: "MOVFconst", argLength: 0, reg: fp01, aux: "Float64", asm: "MOVF", typ: "Float32", rematerializeable: true}, // auxint as 64-bit float, convert to 32-bit float
{name: "MOVDconst", argLength: 0, reg: fp01, aux: "Float64", asm: "MOVD", typ: "Float64", rematerializeable: true}, // auxint as 64-bit float
{name: "MOVVaddr", argLength: 1, reg: regInfo{inputs: []regMask{buildReg("SP") | buildReg("SB")}, outputs: []regMask{gp}}, aux: "SymOff", asm: "MOVV", rematerializeable: true, symEffect: "Addr"}, // arg0 + auxInt + aux.(*gc.Sym), arg0=SP/SB
{name: "MOVBload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVB", typ: "Int8", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
{name: "MOVBUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVBU", typ: "UInt8", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
{name: "MOVHload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVH", typ: "Int16", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
{name: "MOVHUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVHU", typ: "UInt16", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
{name: "MOVWload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVW", typ: "Int32", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
{name: "MOVWUload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVWU", typ: "UInt32", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
{name: "MOVVload", argLength: 2, reg: gpload, aux: "SymOff", asm: "MOVV", typ: "UInt64", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
{name: "MOVFload", argLength: 2, reg: fpload, aux: "SymOff", asm: "MOVF", typ: "Float32", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
{name: "MOVDload", argLength: 2, reg: fpload, aux: "SymOff", asm: "MOVD", typ: "Float64", faultOnNilArg0: true, symEffect: "Read"}, // load from arg0 + auxInt + aux. arg1=mem.
cmd/compile: optimize loong64 with register indexed load/store goos: linux goarch: loong64 pkg: test/bench/go1 cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | BinaryTree17 7.766 ± 1% 7.640 ± 2% -1.62% (p=0.000 n=20) Fannkuch11 2.649 ± 0% 2.358 ± 0% -10.96% (p=0.000 n=20) FmtFprintfEmpty 35.89n ± 0% 35.87n ± 0% -0.06% (p=0.000 n=20) FmtFprintfString 59.44n ± 0% 57.25n ± 2% -3.68% (p=0.000 n=20) FmtFprintfInt 62.07n ± 0% 60.04n ± 0% -3.27% (p=0.000 n=20) FmtFprintfIntInt 97.90n ± 0% 97.26n ± 0% -0.65% (p=0.000 n=20) FmtFprintfPrefixedInt 116.7n ± 0% 119.2n ± 0% +2.14% (p=0.000 n=20) FmtFprintfFloat 204.5n ± 0% 201.9n ± 0% -1.30% (p=0.000 n=20) FmtManyArgs 455.9n ± 0% 466.8n ± 0% +2.39% (p=0.000 n=20) GobDecode 7.458m ± 1% 7.138m ± 1% -4.28% (p=0.000 n=20) GobEncode 8.573m ± 1% 8.473m ± 1% ~ (p=0.091 n=20) Gzip 280.2m ± 0% 284.9m ± 0% +1.67% (p=0.000 n=20) Gunzip 32.68m ± 0% 32.67m ± 0% ~ (p=0.211 n=20) HTTPClientServer 54.22µ ± 0% 53.24µ ± 0% -1.80% (p=0.000 n=20) JSONEncode 9.427m ± 1% 9.152m ± 0% -2.92% (p=0.000 n=20) JSONDecode 47.08m ± 1% 46.85m ± 1% -0.49% (p=0.007 n=20) Mandelbrot200 4.601m ± 0% 4.605m ± 0% +0.08% (p=0.000 n=20) GoParse 4.776m ± 0% 4.655m ± 1% -2.52% (p=0.000 n=20) RegexpMatchEasy0_32 59.77n ± 0% 57.59n ± 0% -3.66% (p=0.000 n=20) RegexpMatchEasy0_1K 458.1n ± 0% 458.8n ± 0% +0.15% (p=0.000 n=20) RegexpMatchEasy1_32 59.36n ± 0% 59.24n ± 0% -0.20% (p=0.000 n=20) RegexpMatchEasy1_1K 557.7n ± 0% 560.2n ± 0% +0.46% (p=0.000 n=20) RegexpMatchMedium_32 803.1n ± 0% 772.8n ± 0% -3.77% (p=0.000 n=20) RegexpMatchMedium_1K 27.29µ ± 0% 25.88µ ± 0% -5.18% (p=0.000 n=20) RegexpMatchHard_32 1.385µ ± 0% 1.304µ ± 0% -5.85% (p=0.000 n=20) RegexpMatchHard_1K 40.92µ ± 0% 39.58µ ± 0% -3.27% (p=0.000 n=20) Revcomp 474.3m ± 0% 410.0m ± 0% -13.56% (p=0.000 n=20) Template 78.16m ± 0% 76.32m ± 1% -2.36% (p=0.000 n=20) TimeParse 271.8n ± 0% 272.1n ± 0% +0.11% (p=0.000 n=20) TimeFormat 292.3n ± 0% 294.8n ± 0% +0.86% (p=0.000 n=20) geomean 51.98µ 50.82µ -2.22% Change-Id: Ia78f1ddee8f1d9ec7192a4b8d2a4ec6058679956 Reviewed-on: https://go-review.googlesource.com/c/go/+/615918 Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> Reviewed-by: Michael Knyszek <mknyszek@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: abner chenc <chenguoqi@loongson.cn>
2024-09-26 14:17:17 +08:00
// register indexed load
{name: "MOVVloadidx", argLength: 3, reg: gp2load, asm: "MOVV", typ: "UInt64"}, // load 64-bit dword from arg0 + arg1, arg2 = mem.
{name: "MOVWloadidx", argLength: 3, reg: gp2load, asm: "MOVW", typ: "Int32"}, // load 32-bit word from arg0 + arg1, sign-extended to 64-bit, arg2=mem.
{name: "MOVWUloadidx", argLength: 3, reg: gp2load, asm: "MOVWU", typ: "UInt32"}, // load 32-bit word from arg0 + arg1, zero-extended to 64-bit, arg2=mem.
{name: "MOVHloadidx", argLength: 3, reg: gp2load, asm: "MOVH", typ: "Int16"}, // load 16-bit word from arg0 + arg1, sign-extended to 64-bit, arg2=mem.
{name: "MOVHUloadidx", argLength: 3, reg: gp2load, asm: "MOVHU", typ: "UInt16"}, // load 16-bit word from arg0 + arg1, zero-extended to 64-bit, arg2=mem.
{name: "MOVBloadidx", argLength: 3, reg: gp2load, asm: "MOVB", typ: "Int8"}, // load 8-bit word from arg0 + arg1, sign-extended to 64-bit, arg2=mem.
{name: "MOVBUloadidx", argLength: 3, reg: gp2load, asm: "MOVBU", typ: "UInt8"}, // load 8-bit word from arg0 + arg1, zero-extended to 64-bit, arg2=mem.
{name: "MOVFloadidx", argLength: 3, reg: fp2load, asm: "MOVF", typ: "Float32"}, // load 32-bit float from arg0 + arg1, arg2=mem.
{name: "MOVDloadidx", argLength: 3, reg: fp2load, asm: "MOVD", typ: "Float64"}, // load 64-bit float from arg0 + arg1, arg2=mem.
{name: "MOVBstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVB", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 1 byte of arg1 to arg0 + auxInt + aux. arg2=mem.
{name: "MOVHstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVH", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
{name: "MOVWstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVW", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
{name: "MOVVstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVV", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
{name: "MOVFstore", argLength: 3, reg: fpstore, aux: "SymOff", asm: "MOVF", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
{name: "MOVDstore", argLength: 3, reg: fpstore, aux: "SymOff", asm: "MOVD", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
cmd/compile: optimize loong64 with register indexed load/store goos: linux goarch: loong64 pkg: test/bench/go1 cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | BinaryTree17 7.766 ± 1% 7.640 ± 2% -1.62% (p=0.000 n=20) Fannkuch11 2.649 ± 0% 2.358 ± 0% -10.96% (p=0.000 n=20) FmtFprintfEmpty 35.89n ± 0% 35.87n ± 0% -0.06% (p=0.000 n=20) FmtFprintfString 59.44n ± 0% 57.25n ± 2% -3.68% (p=0.000 n=20) FmtFprintfInt 62.07n ± 0% 60.04n ± 0% -3.27% (p=0.000 n=20) FmtFprintfIntInt 97.90n ± 0% 97.26n ± 0% -0.65% (p=0.000 n=20) FmtFprintfPrefixedInt 116.7n ± 0% 119.2n ± 0% +2.14% (p=0.000 n=20) FmtFprintfFloat 204.5n ± 0% 201.9n ± 0% -1.30% (p=0.000 n=20) FmtManyArgs 455.9n ± 0% 466.8n ± 0% +2.39% (p=0.000 n=20) GobDecode 7.458m ± 1% 7.138m ± 1% -4.28% (p=0.000 n=20) GobEncode 8.573m ± 1% 8.473m ± 1% ~ (p=0.091 n=20) Gzip 280.2m ± 0% 284.9m ± 0% +1.67% (p=0.000 n=20) Gunzip 32.68m ± 0% 32.67m ± 0% ~ (p=0.211 n=20) HTTPClientServer 54.22µ ± 0% 53.24µ ± 0% -1.80% (p=0.000 n=20) JSONEncode 9.427m ± 1% 9.152m ± 0% -2.92% (p=0.000 n=20) JSONDecode 47.08m ± 1% 46.85m ± 1% -0.49% (p=0.007 n=20) Mandelbrot200 4.601m ± 0% 4.605m ± 0% +0.08% (p=0.000 n=20) GoParse 4.776m ± 0% 4.655m ± 1% -2.52% (p=0.000 n=20) RegexpMatchEasy0_32 59.77n ± 0% 57.59n ± 0% -3.66% (p=0.000 n=20) RegexpMatchEasy0_1K 458.1n ± 0% 458.8n ± 0% +0.15% (p=0.000 n=20) RegexpMatchEasy1_32 59.36n ± 0% 59.24n ± 0% -0.20% (p=0.000 n=20) RegexpMatchEasy1_1K 557.7n ± 0% 560.2n ± 0% +0.46% (p=0.000 n=20) RegexpMatchMedium_32 803.1n ± 0% 772.8n ± 0% -3.77% (p=0.000 n=20) RegexpMatchMedium_1K 27.29µ ± 0% 25.88µ ± 0% -5.18% (p=0.000 n=20) RegexpMatchHard_32 1.385µ ± 0% 1.304µ ± 0% -5.85% (p=0.000 n=20) RegexpMatchHard_1K 40.92µ ± 0% 39.58µ ± 0% -3.27% (p=0.000 n=20) Revcomp 474.3m ± 0% 410.0m ± 0% -13.56% (p=0.000 n=20) Template 78.16m ± 0% 76.32m ± 1% -2.36% (p=0.000 n=20) TimeParse 271.8n ± 0% 272.1n ± 0% +0.11% (p=0.000 n=20) TimeFormat 292.3n ± 0% 294.8n ± 0% +0.86% (p=0.000 n=20) geomean 51.98µ 50.82µ -2.22% Change-Id: Ia78f1ddee8f1d9ec7192a4b8d2a4ec6058679956 Reviewed-on: https://go-review.googlesource.com/c/go/+/615918 Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> Reviewed-by: Michael Knyszek <mknyszek@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: abner chenc <chenguoqi@loongson.cn>
2024-09-26 14:17:17 +08:00
// register indexed store
{name: "MOVBstoreidx", argLength: 4, reg: gpstore2, asm: "MOVB", typ: "Mem"}, // store 1 byte of arg2 to arg0 + arg1, arg3 = mem.
{name: "MOVHstoreidx", argLength: 4, reg: gpstore2, asm: "MOVH", typ: "Mem"}, // store 2 bytes of arg2 to arg0 + arg1, arg3 = mem.
{name: "MOVWstoreidx", argLength: 4, reg: gpstore2, asm: "MOVW", typ: "Mem"}, // store 4 bytes of arg2 to arg0 + arg1, arg3 = mem.
{name: "MOVVstoreidx", argLength: 4, reg: gpstore2, asm: "MOVV", typ: "Mem"}, // store 8 bytes of arg2 to arg0 + arg1, arg3 = mem.
{name: "MOVFstoreidx", argLength: 4, reg: fpstore2, asm: "MOVF", typ: "Mem"}, // store 32-bit float of arg2 to arg0 + arg1, arg3=mem.
{name: "MOVDstoreidx", argLength: 4, reg: fpstore2, asm: "MOVD", typ: "Mem"}, // store 64-bit float of arg2 to arg0 + arg1, arg3=mem.
{name: "MOVBstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVB", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 1 byte of zero to arg0 + auxInt + aux. arg1=mem.
{name: "MOVHstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVH", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes of zero to arg0 + auxInt + aux. arg1=mem.
{name: "MOVWstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVW", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of zero to arg0 + auxInt + aux. arg1=mem.
{name: "MOVVstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVV", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of zero to arg0 + auxInt + aux. ar12=mem.
cmd/compile: optimize loong64 with register indexed load/store goos: linux goarch: loong64 pkg: test/bench/go1 cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | BinaryTree17 7.766 ± 1% 7.640 ± 2% -1.62% (p=0.000 n=20) Fannkuch11 2.649 ± 0% 2.358 ± 0% -10.96% (p=0.000 n=20) FmtFprintfEmpty 35.89n ± 0% 35.87n ± 0% -0.06% (p=0.000 n=20) FmtFprintfString 59.44n ± 0% 57.25n ± 2% -3.68% (p=0.000 n=20) FmtFprintfInt 62.07n ± 0% 60.04n ± 0% -3.27% (p=0.000 n=20) FmtFprintfIntInt 97.90n ± 0% 97.26n ± 0% -0.65% (p=0.000 n=20) FmtFprintfPrefixedInt 116.7n ± 0% 119.2n ± 0% +2.14% (p=0.000 n=20) FmtFprintfFloat 204.5n ± 0% 201.9n ± 0% -1.30% (p=0.000 n=20) FmtManyArgs 455.9n ± 0% 466.8n ± 0% +2.39% (p=0.000 n=20) GobDecode 7.458m ± 1% 7.138m ± 1% -4.28% (p=0.000 n=20) GobEncode 8.573m ± 1% 8.473m ± 1% ~ (p=0.091 n=20) Gzip 280.2m ± 0% 284.9m ± 0% +1.67% (p=0.000 n=20) Gunzip 32.68m ± 0% 32.67m ± 0% ~ (p=0.211 n=20) HTTPClientServer 54.22µ ± 0% 53.24µ ± 0% -1.80% (p=0.000 n=20) JSONEncode 9.427m ± 1% 9.152m ± 0% -2.92% (p=0.000 n=20) JSONDecode 47.08m ± 1% 46.85m ± 1% -0.49% (p=0.007 n=20) Mandelbrot200 4.601m ± 0% 4.605m ± 0% +0.08% (p=0.000 n=20) GoParse 4.776m ± 0% 4.655m ± 1% -2.52% (p=0.000 n=20) RegexpMatchEasy0_32 59.77n ± 0% 57.59n ± 0% -3.66% (p=0.000 n=20) RegexpMatchEasy0_1K 458.1n ± 0% 458.8n ± 0% +0.15% (p=0.000 n=20) RegexpMatchEasy1_32 59.36n ± 0% 59.24n ± 0% -0.20% (p=0.000 n=20) RegexpMatchEasy1_1K 557.7n ± 0% 560.2n ± 0% +0.46% (p=0.000 n=20) RegexpMatchMedium_32 803.1n ± 0% 772.8n ± 0% -3.77% (p=0.000 n=20) RegexpMatchMedium_1K 27.29µ ± 0% 25.88µ ± 0% -5.18% (p=0.000 n=20) RegexpMatchHard_32 1.385µ ± 0% 1.304µ ± 0% -5.85% (p=0.000 n=20) RegexpMatchHard_1K 40.92µ ± 0% 39.58µ ± 0% -3.27% (p=0.000 n=20) Revcomp 474.3m ± 0% 410.0m ± 0% -13.56% (p=0.000 n=20) Template 78.16m ± 0% 76.32m ± 1% -2.36% (p=0.000 n=20) TimeParse 271.8n ± 0% 272.1n ± 0% +0.11% (p=0.000 n=20) TimeFormat 292.3n ± 0% 294.8n ± 0% +0.86% (p=0.000 n=20) geomean 51.98µ 50.82µ -2.22% Change-Id: Ia78f1ddee8f1d9ec7192a4b8d2a4ec6058679956 Reviewed-on: https://go-review.googlesource.com/c/go/+/615918 Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> Reviewed-by: Michael Knyszek <mknyszek@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: abner chenc <chenguoqi@loongson.cn>
2024-09-26 14:17:17 +08:00
// register indexed store zero
{name: "MOVBstorezeroidx", argLength: 3, reg: gpstore, asm: "MOVB", typ: "Mem"}, // store 1 byte of zero to arg0 + arg1, arg2 = mem.
{name: "MOVHstorezeroidx", argLength: 3, reg: gpstore, asm: "MOVH", typ: "Mem"}, // store 2 bytes of zero to arg0 + arg1, arg2 = mem.
{name: "MOVWstorezeroidx", argLength: 3, reg: gpstore, asm: "MOVW", typ: "Mem"}, // store 4 bytes of zero to arg0 + arg1, arg2 = mem.
{name: "MOVVstorezeroidx", argLength: 3, reg: gpstore, asm: "MOVV", typ: "Mem"}, // store 8 bytes of zero to arg0 + arg1, arg2 = mem.
cmd/compile: optimize math.Float64(32)bits and math.Float64(32)frombits on loong64 Use float <-> int register moves without conversion instead of stores and loads to move float <-> int values like arm64 and mips64. goos: linux goarch: loong64 pkg: math cpu: Loongson-3A6000 @ 2500.00MHz │ bench.old │ bench.new │ │ sec/op │ sec/op vs base │ Acos 15.98n ± 0% 15.94n ± 0% -0.25% (p=0.000 n=20) Acosh 27.75n ± 0% 25.56n ± 0% -7.89% (p=0.000 n=20) Asin 15.85n ± 0% 15.76n ± 0% -0.57% (p=0.000 n=20) Asinh 39.79n ± 0% 37.69n ± 0% -5.28% (p=0.000 n=20) Atan 7.261n ± 0% 7.242n ± 0% -0.27% (p=0.000 n=20) Atanh 28.30n ± 0% 27.62n ± 0% -2.40% (p=0.000 n=20) Atan2 15.85n ± 0% 15.75n ± 0% -0.63% (p=0.000 n=20) Cbrt 27.02n ± 0% 21.08n ± 0% -21.98% (p=0.000 n=20) Ceil 2.830n ± 1% 2.896n ± 1% +2.31% (p=0.000 n=20) Copysign 0.8022n ± 0% 0.8004n ± 0% -0.22% (p=0.000 n=20) Cos 11.64n ± 0% 11.61n ± 0% -0.26% (p=0.000 n=20) Cosh 35.98n ± 0% 33.44n ± 0% -7.05% (p=0.000 n=20) Erf 10.09n ± 0% 10.08n ± 0% -0.10% (p=0.000 n=20) Erfc 11.40n ± 0% 11.35n ± 0% -0.44% (p=0.000 n=20) Erfinv 12.31n ± 0% 12.29n ± 0% -0.16% (p=0.000 n=20) Erfcinv 12.16n ± 0% 12.17n ± 0% +0.08% (p=0.000 n=20) Exp 28.41n ± 0% 26.44n ± 0% -6.95% (p=0.000 n=20) ExpGo 28.68n ± 0% 27.07n ± 0% -5.60% (p=0.000 n=20) Expm1 17.21n ± 0% 16.75n ± 0% -2.67% (p=0.000 n=20) Exp2 24.71n ± 0% 23.01n ± 0% -6.88% (p=0.000 n=20) Exp2Go 25.17n ± 0% 23.91n ± 0% -4.99% (p=0.000 n=20) Abs 0.8004n ± 0% 0.8004n ± 0% ~ (p=0.224 n=20) Dim 1.201n ± 0% 1.201n ± 0% ~ (p=1.000 n=20) ¹ Floor 2.848n ± 0% 2.859n ± 0% +0.39% (p=0.000 n=20) Max 3.074n ± 0% 3.071n ± 0% ~ (p=0.481 n=20) Min 3.179n ± 0% 3.176n ± 0% -0.09% (p=0.003 n=20) Mod 49.62n ± 0% 44.82n ± 0% -9.67% (p=0.000 n=20) Frexp 7.604n ± 0% 6.803n ± 0% -10.53% (p=0.000 n=20) Gamma 18.01n ± 0% 17.61n ± 0% -2.22% (p=0.000 n=20) Hypot 7.204n ± 0% 7.604n ± 0% +5.55% (p=0.000 n=20) HypotGo 7.204n ± 0% 7.604n ± 0% +5.56% (p=0.000 n=20) Ilogb 6.003n ± 0% 6.003n ± 0% ~ (p=0.407 n=20) J0 76.43n ± 0% 76.24n ± 0% -0.25% (p=0.000 n=20) J1 76.44n ± 0% 76.44n ± 0% ~ (p=1.000 n=20) Jn 168.2n ± 0% 168.5n ± 0% +0.18% (p=0.000 n=20) Ldexp 8.804n ± 0% 7.604n ± 0% -13.63% (p=0.000 n=20) Lgamma 19.01n ± 0% 19.01n ± 0% ~ (p=0.695 n=20) Log 19.38n ± 0% 19.12n ± 0% -1.34% (p=0.000 n=20) Logb 6.003n ± 0% 6.003n ± 0% ~ (p=1.000 n=20) Log1p 18.57n ± 0% 16.72n ± 0% -9.96% (p=0.000 n=20) Log10 20.67n ± 0% 20.45n ± 0% -1.06% (p=0.000 n=20) Log2 9.605n ± 0% 8.804n ± 0% -8.34% (p=0.000 n=20) Modf 4.402n ± 0% 4.402n ± 0% ~ (p=1.000 n=20) Nextafter32 7.204n ± 0% 5.603n ± 0% -22.22% (p=0.000 n=20) Nextafter64 6.803n ± 0% 6.003n ± 0% -11.76% (p=0.000 n=20) PowInt 39.62n ± 0% 37.22n ± 0% -6.06% (p=0.000 n=20) PowFrac 120.9n ± 0% 108.9n ± 0% -9.93% (p=0.000 n=20) Pow10Pos 1.601n ± 0% 1.601n ± 0% ~ (p=0.487 n=20) Pow10Neg 2.675n ± 0% 2.675n ± 0% ~ (p=1.000 n=20) Round 3.018n ± 0% 2.401n ± 0% -20.46% (p=0.000 n=20) RoundToEven 3.822n ± 0% 3.001n ± 0% -21.48% (p=0.000 n=20) Remainder 45.62n ± 0% 42.42n ± 0% -7.01% (p=0.000 n=20) Signbit 0.9075n ± 0% 0.8004n ± 0% -11.81% (p=0.000 n=20) Sin 12.65n ± 0% 12.65n ± 0% ~ (p=0.503 n=20) Sincos 14.81n ± 0% 14.60n ± 0% -1.42% (p=0.000 n=20) Sinh 36.75n ± 0% 35.11n ± 0% -4.46% (p=0.000 n=20) SqrtIndirect 1.201n ± 0% 1.201n ± 0% ~ (p=1.000 n=20) ¹ SqrtLatency 4.002n ± 0% 4.002n ± 0% ~ (p=1.000 n=20) SqrtIndirectLatency 4.002n ± 0% 4.002n ± 0% ~ (p=1.000 n=20) SqrtGoLatency 52.85n ± 0% 40.82n ± 0% -22.76% (p=0.000 n=20) SqrtPrime 887.4n ± 0% 887.4n ± 0% ~ (p=0.751 n=20) Tan 13.95n ± 0% 13.97n ± 0% +0.18% (p=0.000 n=20) Tanh 36.79n ± 0% 34.89n ± 0% -5.16% (p=0.000 n=20) Trunc 2.849n ± 0% 2.861n ± 0% +0.42% (p=0.000 n=20) Y0 77.44n ± 0% 77.64n ± 0% +0.26% (p=0.000 n=20) Y1 74.41n ± 0% 74.33n ± 0% -0.11% (p=0.000 n=20) Yn 158.7n ± 0% 159.0n ± 0% +0.19% (p=0.000 n=20) Float64bits 0.8774n ± 0% 0.4002n ± 0% -54.39% (p=0.000 n=20) Float64frombits 0.8042n ± 0% 0.4002n ± 0% -50.24% (p=0.000 n=20) Float32bits 1.1230n ± 0% 0.5336n ± 0% -52.48% (p=0.000 n=20) Float32frombits 1.0670n ± 0% 0.8004n ± 0% -24.99% (p=0.000 n=20) FMA 2.001n ± 0% 2.001n ± 0% ~ (p=0.605 n=20) geomean 10.87n 10.10n -7.15% ¹ all samples are equal goos: linux goarch: loong64 pkg: math cpu: Loongson-3A5000 @ 2500.00MHz │ bench.old │ bench.new │ │ sec/op │ sec/op vs base │ Acos 33.10n ± 0% 31.95n ± 2% -3.46% (p=0.000 n=20) Acosh 58.38n ± 0% 50.44n ± 0% -13.60% (p=0.000 n=20) Asin 32.70n ± 0% 31.94n ± 0% -2.32% (p=0.000 n=20) Asinh 57.65n ± 0% 50.83n ± 0% -11.82% (p=0.000 n=20) Atan 14.21n ± 0% 14.21n ± 0% ~ (p=0.501 n=20) Atanh 60.86n ± 0% 54.44n ± 0% -10.56% (p=0.000 n=20) Atan2 32.02n ± 0% 34.02n ± 0% +6.25% (p=0.000 n=20) Cbrt 55.58n ± 0% 40.64n ± 0% -26.88% (p=0.000 n=20) Ceil 9.566n ± 0% 9.566n ± 0% ~ (p=0.463 n=20) Copysign 0.8005n ± 0% 0.8005n ± 0% ~ (p=0.806 n=20) Cos 18.02n ± 0% 18.02n ± 0% ~ (p=0.191 n=20) Cosh 64.44n ± 0% 65.64n ± 0% +1.86% (p=0.000 n=20) Erf 16.15n ± 0% 16.16n ± 0% ~ (p=0.770 n=20) Erfc 18.71n ± 0% 18.83n ± 0% +0.61% (p=0.000 n=20) Erfinv 19.33n ± 0% 19.34n ± 0% ~ (p=0.513 n=20) Erfcinv 18.90n ± 0% 19.78n ± 0% +4.63% (p=0.000 n=20) Exp 50.04n ± 0% 49.66n ± 0% -0.75% (p=0.000 n=20) ExpGo 50.03n ± 0% 50.03n ± 0% ~ (p=0.723 n=20) Expm1 28.41n ± 0% 28.27n ± 0% -0.49% (p=0.000 n=20) Exp2 50.08n ± 0% 51.23n ± 0% +2.31% (p=0.000 n=20) Exp2Go 49.77n ± 0% 49.89n ± 0% +0.24% (p=0.000 n=20) Abs 0.8009n ± 0% 0.8006n ± 0% ~ (p=0.317 n=20) Dim 1.987n ± 0% 1.993n ± 0% +0.28% (p=0.001 n=20) Floor 8.543n ± 0% 8.548n ± 0% ~ (p=0.509 n=20) Max 6.670n ± 0% 6.672n ± 0% ~ (p=0.335 n=20) Min 6.694n ± 0% 6.694n ± 0% ~ (p=0.459 n=20) Mod 56.44n ± 0% 53.23n ± 0% -5.70% (p=0.000 n=20) Frexp 8.409n ± 0% 7.606n ± 0% -9.55% (p=0.000 n=20) Gamma 35.64n ± 0% 35.23n ± 0% -1.15% (p=0.000 n=20) Hypot 11.21n ± 0% 10.61n ± 0% -5.31% (p=0.000 n=20) HypotGo 11.50n ± 0% 11.01n ± 0% -4.30% (p=0.000 n=20) Ilogb 7.606n ± 0% 6.804n ± 0% -10.54% (p=0.000 n=20) J0 125.3n ± 0% 126.5n ± 0% +0.96% (p=0.000 n=20) J1 124.9n ± 0% 125.3n ± 0% +0.32% (p=0.000 n=20) Jn 264.3n ± 0% 265.9n ± 0% +0.61% (p=0.000 n=20) Ldexp 9.606n ± 0% 9.204n ± 0% -4.19% (p=0.000 n=20) Lgamma 38.82n ± 0% 38.85n ± 0% +0.06% (p=0.019 n=20) Log 38.44n ± 0% 28.04n ± 0% -27.06% (p=0.000 n=20) Logb 8.405n ± 0% 7.605n ± 0% -9.52% (p=0.000 n=20) Log1p 31.62n ± 0% 27.11n ± 0% -14.26% (p=0.000 n=20) Log10 38.83n ± 0% 28.42n ± 0% -26.81% (p=0.000 n=20) Log2 11.21n ± 0% 10.41n ± 0% -7.14% (p=0.000 n=20) Modf 5.204n ± 0% 5.205n ± 0% ~ (p=0.983 n=20) Nextafter32 8.809n ± 0% 7.208n ± 0% -18.18% (p=0.000 n=20) Nextafter64 8.405n ± 0% 8.406n ± 0% +0.01% (p=0.007 n=20) PowInt 48.83n ± 0% 44.78n ± 0% -8.28% (p=0.000 n=20) PowFrac 146.9n ± 0% 142.1n ± 0% -3.23% (p=0.000 n=20) Pow10Pos 2.334n ± 0% 2.333n ± 0% ~ (p=0.110 n=20) Pow10Neg 4.803n ± 0% 4.803n ± 0% ~ (p=0.130 n=20) Round 4.816n ± 0% 3.819n ± 0% -20.70% (p=0.000 n=20) RoundToEven 5.735n ± 0% 5.204n ± 0% -9.26% (p=0.000 n=20) Remainder 52.05n ± 0% 49.64n ± 0% -4.63% (p=0.000 n=20) Signbit 1.201n ± 0% 1.001n ± 0% -16.65% (p=0.000 n=20) Sin 20.63n ± 0% 20.64n ± 0% +0.05% (p=0.040 n=20) Sincos 23.82n ± 0% 24.62n ± 0% +3.36% (p=0.000 n=20) Sinh 71.25n ± 0% 68.44n ± 0% -3.94% (p=0.000 n=20) SqrtIndirect 2.001n ± 0% 2.001n ± 0% ~ (p=0.182 n=20) SqrtLatency 4.003n ± 0% 4.003n ± 0% ~ (p=0.754 n=20) SqrtIndirectLatency 4.003n ± 0% 4.003n ± 0% ~ (p=0.773 n=20) SqrtGoLatency 60.84n ± 0% 81.26n ± 0% +33.56% (p=0.000 n=20) SqrtPrime 1.791µ ± 0% 1.791µ ± 0% ~ (p=0.784 n=20) Tan 27.22n ± 0% 27.22n ± 0% ~ (p=0.819 n=20) Tanh 70.88n ± 0% 69.04n ± 0% -2.60% (p=0.000 n=20) Trunc 8.543n ± 0% 8.543n ± 0% ~ (p=0.784 n=20) Y0 122.9n ± 0% 122.9n ± 0% ~ (p=0.559 n=20) Y1 123.3n ± 0% 121.7n ± 0% -1.30% (p=0.000 n=20) Yn 263.0n ± 0% 262.6n ± 0% -0.15% (p=0.000 n=20) Float64bits 1.2010n ± 0% 0.6004n ± 0% -50.01% (p=0.000 n=20) Float64frombits 1.2010n ± 0% 0.6004n ± 0% -50.01% (p=0.000 n=20) Float32bits 1.7010n ± 0% 0.8005n ± 0% -52.94% (p=0.000 n=20) Float32frombits 1.5010n ± 0% 0.8005n ± 0% -46.67% (p=0.000 n=20) FMA 2.001n ± 0% 2.001n ± 0% ~ (p=0.238 n=20) geomean 17.41n 16.15n -7.19% Change-Id: I0a0c263af2f07203eab1782e69c706f20c689d8d Reviewed-on: https://go-review.googlesource.com/c/go/+/604737 Auto-Submit: Tim King <taking@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> Reviewed-by: Meidan Li <limeidan@loongson.cn> Reviewed-by: Tim King <taking@google.com> Reviewed-by: abner chenc <chenguoqi@loongson.cn>
2024-08-12 16:41:11 +08:00
// moves (no conversion)
{name: "MOVWfpgp", argLength: 1, reg: fpgp, asm: "MOVW"}, // move float32 to int32 (no conversion).
{name: "MOVWgpfp", argLength: 1, reg: gpfp, asm: "MOVW"}, // move int32 to float32 (no conversion).
{name: "MOVVfpgp", argLength: 1, reg: fpgp, asm: "MOVV"}, // move float64 to int64 (no conversion).
{name: "MOVVgpfp", argLength: 1, reg: gpfp, asm: "MOVV"}, // move int64 to float64 (no conversion).
// conversions
{name: "MOVBreg", argLength: 1, reg: gp11, asm: "MOVB"}, // move from arg0, sign-extended from byte
{name: "MOVBUreg", argLength: 1, reg: gp11, asm: "MOVBU"}, // move from arg0, unsign-extended from byte
{name: "MOVHreg", argLength: 1, reg: gp11, asm: "MOVH"}, // move from arg0, sign-extended from half
{name: "MOVHUreg", argLength: 1, reg: gp11, asm: "MOVHU"}, // move from arg0, unsign-extended from half
{name: "MOVWreg", argLength: 1, reg: gp11, asm: "MOVW"}, // move from arg0, sign-extended from word
{name: "MOVWUreg", argLength: 1, reg: gp11, asm: "MOVWU"}, // move from arg0, unsign-extended from word
{name: "MOVVreg", argLength: 1, reg: gp11, asm: "MOVV"}, // move from arg0
{name: "MOVVnop", argLength: 1, reg: regInfo{inputs: []regMask{gp}, outputs: []regMask{gp}}, resultInArg0: true}, // nop, return arg0 in same register
{name: "MOVWF", argLength: 1, reg: fp11, asm: "MOVWF"}, // int32 -> float32
{name: "MOVWD", argLength: 1, reg: fp11, asm: "MOVWD"}, // int32 -> float64
{name: "MOVVF", argLength: 1, reg: fp11, asm: "MOVVF"}, // int64 -> float32
{name: "MOVVD", argLength: 1, reg: fp11, asm: "MOVVD"}, // int64 -> float64
{name: "TRUNCFW", argLength: 1, reg: fp11, asm: "TRUNCFW"}, // float32 -> int32
{name: "TRUNCDW", argLength: 1, reg: fp11, asm: "TRUNCDW"}, // float64 -> int32
{name: "TRUNCFV", argLength: 1, reg: fp11, asm: "TRUNCFV"}, // float32 -> int64
{name: "TRUNCDV", argLength: 1, reg: fp11, asm: "TRUNCDV"}, // float64 -> int64
{name: "MOVFD", argLength: 1, reg: fp11, asm: "MOVFD"}, // float32 -> float64
{name: "MOVDF", argLength: 1, reg: fp11, asm: "MOVDF"}, // float64 -> float32
// function calls
{name: "CALLstatic", argLength: -1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call static function aux.(*obj.LSym). last arg=mem, auxint=argsize, returns mem
{name: "CALLtail", argLength: -1, reg: regInfo{clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true, tailCall: true}, // tail call static function aux.(*obj.LSym). last arg=mem, auxint=argsize, returns mem
{name: "CALLclosure", argLength: -1, reg: regInfo{inputs: []regMask{gpsp, buildReg("R29"), 0}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call function via closure. arg0=codeptr, arg1=closure, last arg=mem, auxint=argsize, returns mem
{name: "CALLinter", argLength: -1, reg: regInfo{inputs: []regMask{gp}, clobbers: callerSave}, aux: "CallOff", clobberFlags: true, call: true}, // call fn by pointer. arg0=codeptr, last arg=mem, auxint=argsize, returns mem
// duffzero
// arg0 = address of memory to zero
// arg1 = mem
// auxint = offset into duffzero code to start executing
// returns mem
// R20 aka loong64.REGRT1 changed as side effect
{
name: "DUFFZERO",
aux: "Int64",
argLength: 2,
reg: regInfo{
inputs: []regMask{buildReg("R20")},
clobbers: buildReg("R20 R1"),
},
typ: "Mem",
faultOnNilArg0: true,
},
// duffcopy
// arg0 = address of dst memory (in R21, changed as side effect)
// arg1 = address of src memory (in R20, changed as side effect)
// arg2 = mem
// auxint = offset into duffcopy code to start executing
// returns mem
{
name: "DUFFCOPY",
aux: "Int64",
argLength: 3,
reg: regInfo{
inputs: []regMask{buildReg("R21"), buildReg("R20")},
clobbers: buildReg("R20 R21 R1"),
},
typ: "Mem",
faultOnNilArg0: true,
faultOnNilArg1: true,
},
// large or unaligned zeroing
// arg0 = address of memory to zero (in R20, changed as side effect)
// arg1 = address of the last element to zero
// arg2 = mem
// auxint = alignment
// returns mem
// MOVx R0, (R20)
// ADDV $sz, R20
// BGEU Rarg1, R20, -2(PC)
{
name: "LoweredZero",
aux: "Int64",
argLength: 3,
reg: regInfo{
inputs: []regMask{buildReg("R20"), gp},
clobbers: buildReg("R20"),
},
cmd/compile: improve the implementation of Lowered{Move,Zero} on linux/loong64 Like the CL 487295, when implementing Lowered{Move,Zero}, 8 is first subtracted from Rarg0 (parameter Ptr), and then the offset of 8 is added during subsequent operations on Rarg0. This operation is meaningless, so delete it. Change LoweredMove's Rarg0 register to R20, consistent with duffcopy. goos: linux goarch: loong64 pkg: runtime cpu: Loongson-3C5000 @ 2200.00MHz │ old.bench │ new.bench │ │ sec/op │ sec/op vs base │ Memmove/15 19.10n ± 0% 19.10n ± 0% ~ (p=0.483 n=15) MemmoveUnalignedDst/15 25.02n ± 0% 25.02n ± 0% ~ (p=0.741 n=15) MemmoveUnalignedDst/32 48.22n ± 0% 48.22n ± 0% ~ (p=1.000 n=15) ¹ MemmoveUnalignedDst/64 90.57n ± 0% 90.52n ± 0% ~ (p=0.212 n=15) MemmoveUnalignedDstOverlap/32 44.12n ± 0% 44.13n ± 0% +0.02% (p=0.000 n=15) MemmoveUnalignedDstOverlap/64 87.79n ± 0% 87.80n ± 0% +0.01% (p=0.002 n=15) MemmoveUnalignedSrc/0 3.639n ± 0% 3.639n ± 0% ~ (p=1.000 n=15) ¹ MemmoveUnalignedSrc/1 7.733n ± 0% 7.733n ± 0% ~ (p=1.000 n=15) MemmoveUnalignedSrc/2 9.097n ± 0% 9.097n ± 0% ~ (p=1.000 n=15) MemmoveUnalignedSrc/3 10.46n ± 0% 10.46n ± 0% ~ (p=1.000 n=15) ¹ MemmoveUnalignedSrc/4 11.83n ± 0% 11.83n ± 0% ~ (p=1.000 n=15) ¹ MemmoveUnalignedSrc/64 93.71n ± 0% 93.70n ± 0% ~ (p=0.128 n=15) Memclr/4096 699.1n ± 0% 699.1n ± 0% ~ (p=0.682 n=15) Memclr/65536 11.18µ ± 0% 11.18µ ± 0% -0.01% (p=0.000 n=15) Memclr/1M 175.2µ ± 0% 175.2µ ± 0% ~ (p=0.191 n=15) Memclr/4M 661.8µ ± 0% 662.0µ ± 0% ~ (p=0.486 n=15) MemclrUnaligned/4_5 19.39n ± 0% 20.47n ± 0% +5.57% (p=0.000 n=15) MemclrUnaligned/4_16 22.29n ± 0% 21.38n ± 0% -4.08% (p=0.000 n=15) MemclrUnaligned/4_64 30.58n ± 0% 29.81n ± 0% -2.52% (p=0.000 n=15) MemclrUnaligned/4_65536 11.19µ ± 0% 11.20µ ± 0% +0.02% (p=0.000 n=15) GoMemclr/5 12.73n ± 0% 12.73n ± 0% ~ (p=0.261 n=15) GoMemclr/16 10.01n ± 0% 10.00n ± 0% ~ (p=0.264 n=15) GoMemclr/256 50.94n ± 0% 50.94n ± 0% ~ (p=0.372 n=15) ClearFat15 14.95n ± 0% 15.01n ± 4% ~ (p=0.925 n=15) ClearFat1032 125.5n ± 0% 125.6n ± 0% +0.08% (p=0.000 n=15) CopyFat64 10.58n ± 0% 10.01n ± 0% -5.39% (p=0.000 n=15) CopyFat1040 244.3n ± 0% 155.6n ± 0% -36.31% (p=0.000 n=15) Issue18740/2byte 29.82µ ± 0% 29.82µ ± 0% ~ (p=0.648 n=30) Issue18740/4byte 18.18µ ± 0% 18.18µ ± 0% -0.02% (p=0.001 n=30) Issue18740/8byte 8.395µ ± 0% 8.395µ ± 0% ~ (p=0.401 n=30) geomean 154.5n 151.8n -1.70% ¹ all samples are equal Change-Id: Ia3f3c8b25e1e93c97ab72328651de78ca9dec016 Reviewed-on: https://go-review.googlesource.com/c/go/+/488515 Reviewed-by: Keith Randall <khr@golang.org> Reviewed-by: Bryan Mills <bcmills@google.com> Auto-Submit: Ian Lance Taylor <iant@golang.org> Reviewed-by: WANG Xuerui <git@xen0n.name> Reviewed-by: xiaodong liu <teaofmoli@gmail.com> Reviewed-by: Keith Randall <khr@google.com> Reviewed-by: Meidan Li <limeidan@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
2023-04-25 03:27:23 +08:00
typ: "Mem",
faultOnNilArg0: true,
},
// large or unaligned move
// arg0 = address of dst memory (in R21, changed as side effect)
// arg1 = address of src memory (in R20, changed as side effect)
// arg2 = address of the last element of src
// arg3 = mem
// auxint = alignment
// returns mem
// MOVx (R20), Rtmp
// MOVx Rtmp, (R21)
cmd/compile: improve the implementation of Lowered{Move,Zero} on linux/loong64 Like the CL 487295, when implementing Lowered{Move,Zero}, 8 is first subtracted from Rarg0 (parameter Ptr), and then the offset of 8 is added during subsequent operations on Rarg0. This operation is meaningless, so delete it. Change LoweredMove's Rarg0 register to R20, consistent with duffcopy. goos: linux goarch: loong64 pkg: runtime cpu: Loongson-3C5000 @ 2200.00MHz │ old.bench │ new.bench │ │ sec/op │ sec/op vs base │ Memmove/15 19.10n ± 0% 19.10n ± 0% ~ (p=0.483 n=15) MemmoveUnalignedDst/15 25.02n ± 0% 25.02n ± 0% ~ (p=0.741 n=15) MemmoveUnalignedDst/32 48.22n ± 0% 48.22n ± 0% ~ (p=1.000 n=15) ¹ MemmoveUnalignedDst/64 90.57n ± 0% 90.52n ± 0% ~ (p=0.212 n=15) MemmoveUnalignedDstOverlap/32 44.12n ± 0% 44.13n ± 0% +0.02% (p=0.000 n=15) MemmoveUnalignedDstOverlap/64 87.79n ± 0% 87.80n ± 0% +0.01% (p=0.002 n=15) MemmoveUnalignedSrc/0 3.639n ± 0% 3.639n ± 0% ~ (p=1.000 n=15) ¹ MemmoveUnalignedSrc/1 7.733n ± 0% 7.733n ± 0% ~ (p=1.000 n=15) MemmoveUnalignedSrc/2 9.097n ± 0% 9.097n ± 0% ~ (p=1.000 n=15) MemmoveUnalignedSrc/3 10.46n ± 0% 10.46n ± 0% ~ (p=1.000 n=15) ¹ MemmoveUnalignedSrc/4 11.83n ± 0% 11.83n ± 0% ~ (p=1.000 n=15) ¹ MemmoveUnalignedSrc/64 93.71n ± 0% 93.70n ± 0% ~ (p=0.128 n=15) Memclr/4096 699.1n ± 0% 699.1n ± 0% ~ (p=0.682 n=15) Memclr/65536 11.18µ ± 0% 11.18µ ± 0% -0.01% (p=0.000 n=15) Memclr/1M 175.2µ ± 0% 175.2µ ± 0% ~ (p=0.191 n=15) Memclr/4M 661.8µ ± 0% 662.0µ ± 0% ~ (p=0.486 n=15) MemclrUnaligned/4_5 19.39n ± 0% 20.47n ± 0% +5.57% (p=0.000 n=15) MemclrUnaligned/4_16 22.29n ± 0% 21.38n ± 0% -4.08% (p=0.000 n=15) MemclrUnaligned/4_64 30.58n ± 0% 29.81n ± 0% -2.52% (p=0.000 n=15) MemclrUnaligned/4_65536 11.19µ ± 0% 11.20µ ± 0% +0.02% (p=0.000 n=15) GoMemclr/5 12.73n ± 0% 12.73n ± 0% ~ (p=0.261 n=15) GoMemclr/16 10.01n ± 0% 10.00n ± 0% ~ (p=0.264 n=15) GoMemclr/256 50.94n ± 0% 50.94n ± 0% ~ (p=0.372 n=15) ClearFat15 14.95n ± 0% 15.01n ± 4% ~ (p=0.925 n=15) ClearFat1032 125.5n ± 0% 125.6n ± 0% +0.08% (p=0.000 n=15) CopyFat64 10.58n ± 0% 10.01n ± 0% -5.39% (p=0.000 n=15) CopyFat1040 244.3n ± 0% 155.6n ± 0% -36.31% (p=0.000 n=15) Issue18740/2byte 29.82µ ± 0% 29.82µ ± 0% ~ (p=0.648 n=30) Issue18740/4byte 18.18µ ± 0% 18.18µ ± 0% -0.02% (p=0.001 n=30) Issue18740/8byte 8.395µ ± 0% 8.395µ ± 0% ~ (p=0.401 n=30) geomean 154.5n 151.8n -1.70% ¹ all samples are equal Change-Id: Ia3f3c8b25e1e93c97ab72328651de78ca9dec016 Reviewed-on: https://go-review.googlesource.com/c/go/+/488515 Reviewed-by: Keith Randall <khr@golang.org> Reviewed-by: Bryan Mills <bcmills@google.com> Auto-Submit: Ian Lance Taylor <iant@golang.org> Reviewed-by: WANG Xuerui <git@xen0n.name> Reviewed-by: xiaodong liu <teaofmoli@gmail.com> Reviewed-by: Keith Randall <khr@google.com> Reviewed-by: Meidan Li <limeidan@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
2023-04-25 03:27:23 +08:00
// ADDV $sz, R20
// ADDV $sz, R21
// BGEU Rarg2, R20, -4(PC)
{
name: "LoweredMove",
aux: "Int64",
argLength: 4,
reg: regInfo{
inputs: []regMask{buildReg("R21"), buildReg("R20"), gp},
clobbers: buildReg("R20 R21"),
},
cmd/compile: improve the implementation of Lowered{Move,Zero} on linux/loong64 Like the CL 487295, when implementing Lowered{Move,Zero}, 8 is first subtracted from Rarg0 (parameter Ptr), and then the offset of 8 is added during subsequent operations on Rarg0. This operation is meaningless, so delete it. Change LoweredMove's Rarg0 register to R20, consistent with duffcopy. goos: linux goarch: loong64 pkg: runtime cpu: Loongson-3C5000 @ 2200.00MHz │ old.bench │ new.bench │ │ sec/op │ sec/op vs base │ Memmove/15 19.10n ± 0% 19.10n ± 0% ~ (p=0.483 n=15) MemmoveUnalignedDst/15 25.02n ± 0% 25.02n ± 0% ~ (p=0.741 n=15) MemmoveUnalignedDst/32 48.22n ± 0% 48.22n ± 0% ~ (p=1.000 n=15) ¹ MemmoveUnalignedDst/64 90.57n ± 0% 90.52n ± 0% ~ (p=0.212 n=15) MemmoveUnalignedDstOverlap/32 44.12n ± 0% 44.13n ± 0% +0.02% (p=0.000 n=15) MemmoveUnalignedDstOverlap/64 87.79n ± 0% 87.80n ± 0% +0.01% (p=0.002 n=15) MemmoveUnalignedSrc/0 3.639n ± 0% 3.639n ± 0% ~ (p=1.000 n=15) ¹ MemmoveUnalignedSrc/1 7.733n ± 0% 7.733n ± 0% ~ (p=1.000 n=15) MemmoveUnalignedSrc/2 9.097n ± 0% 9.097n ± 0% ~ (p=1.000 n=15) MemmoveUnalignedSrc/3 10.46n ± 0% 10.46n ± 0% ~ (p=1.000 n=15) ¹ MemmoveUnalignedSrc/4 11.83n ± 0% 11.83n ± 0% ~ (p=1.000 n=15) ¹ MemmoveUnalignedSrc/64 93.71n ± 0% 93.70n ± 0% ~ (p=0.128 n=15) Memclr/4096 699.1n ± 0% 699.1n ± 0% ~ (p=0.682 n=15) Memclr/65536 11.18µ ± 0% 11.18µ ± 0% -0.01% (p=0.000 n=15) Memclr/1M 175.2µ ± 0% 175.2µ ± 0% ~ (p=0.191 n=15) Memclr/4M 661.8µ ± 0% 662.0µ ± 0% ~ (p=0.486 n=15) MemclrUnaligned/4_5 19.39n ± 0% 20.47n ± 0% +5.57% (p=0.000 n=15) MemclrUnaligned/4_16 22.29n ± 0% 21.38n ± 0% -4.08% (p=0.000 n=15) MemclrUnaligned/4_64 30.58n ± 0% 29.81n ± 0% -2.52% (p=0.000 n=15) MemclrUnaligned/4_65536 11.19µ ± 0% 11.20µ ± 0% +0.02% (p=0.000 n=15) GoMemclr/5 12.73n ± 0% 12.73n ± 0% ~ (p=0.261 n=15) GoMemclr/16 10.01n ± 0% 10.00n ± 0% ~ (p=0.264 n=15) GoMemclr/256 50.94n ± 0% 50.94n ± 0% ~ (p=0.372 n=15) ClearFat15 14.95n ± 0% 15.01n ± 4% ~ (p=0.925 n=15) ClearFat1032 125.5n ± 0% 125.6n ± 0% +0.08% (p=0.000 n=15) CopyFat64 10.58n ± 0% 10.01n ± 0% -5.39% (p=0.000 n=15) CopyFat1040 244.3n ± 0% 155.6n ± 0% -36.31% (p=0.000 n=15) Issue18740/2byte 29.82µ ± 0% 29.82µ ± 0% ~ (p=0.648 n=30) Issue18740/4byte 18.18µ ± 0% 18.18µ ± 0% -0.02% (p=0.001 n=30) Issue18740/8byte 8.395µ ± 0% 8.395µ ± 0% ~ (p=0.401 n=30) geomean 154.5n 151.8n -1.70% ¹ all samples are equal Change-Id: Ia3f3c8b25e1e93c97ab72328651de78ca9dec016 Reviewed-on: https://go-review.googlesource.com/c/go/+/488515 Reviewed-by: Keith Randall <khr@golang.org> Reviewed-by: Bryan Mills <bcmills@google.com> Auto-Submit: Ian Lance Taylor <iant@golang.org> Reviewed-by: WANG Xuerui <git@xen0n.name> Reviewed-by: xiaodong liu <teaofmoli@gmail.com> Reviewed-by: Keith Randall <khr@google.com> Reviewed-by: Meidan Li <limeidan@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
2023-04-25 03:27:23 +08:00
typ: "Mem",
faultOnNilArg0: true,
faultOnNilArg1: true,
},
// atomic loads.
// load from arg0. arg1=mem.
// returns <value,memory> so they can be properly ordered with other loads.
{name: "LoweredAtomicLoad8", argLength: 2, reg: gpload, faultOnNilArg0: true},
{name: "LoweredAtomicLoad32", argLength: 2, reg: gpload, faultOnNilArg0: true},
{name: "LoweredAtomicLoad64", argLength: 2, reg: gpload, faultOnNilArg0: true},
// atomic stores.
// store arg1 to arg0. arg2=mem. returns memory.
{name: "LoweredAtomicStore8", argLength: 3, reg: gpstore, faultOnNilArg0: true, hasSideEffects: true},
{name: "LoweredAtomicStore32", argLength: 3, reg: gpstore, faultOnNilArg0: true, hasSideEffects: true},
{name: "LoweredAtomicStore64", argLength: 3, reg: gpstore, faultOnNilArg0: true, hasSideEffects: true},
// store zero to arg0. arg1=mem. returns memory.
{name: "LoweredAtomicStorezero32", argLength: 2, reg: gpstore0, faultOnNilArg0: true, hasSideEffects: true},
{name: "LoweredAtomicStorezero64", argLength: 2, reg: gpstore0, faultOnNilArg0: true, hasSideEffects: true},
// atomic exchange.
// store arg1 to arg0. arg2=mem. returns <old content of *arg0, memory>.
// DBAR
// LL (Rarg0), Rout
// MOVV Rarg1, Rtmp
// SC Rtmp, (Rarg0)
// BEQ Rtmp, -3(PC)
// DBAR
{name: "LoweredAtomicExchange32", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
{name: "LoweredAtomicExchange64", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
// atomic add.
// *arg0 += arg1. arg2=mem. returns <new content of *arg0, memory>.
// DBAR
// LL (Rarg0), Rout
// ADDV Rarg1, Rout, Rtmp
// SC Rtmp, (Rarg0)
// BEQ Rtmp, -3(PC)
// DBAR
// ADDV Rarg1, Rout
{name: "LoweredAtomicAdd32", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
{name: "LoweredAtomicAdd64", argLength: 3, reg: gpxchg, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
// *arg0 += auxint. arg1=mem. returns <new content of *arg0, memory>. auxint is 32-bit.
{name: "LoweredAtomicAddconst32", argLength: 2, reg: regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{gp}}, aux: "Int32", resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
{name: "LoweredAtomicAddconst64", argLength: 2, reg: regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{gp}}, aux: "Int64", resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
// atomic compare and swap.
// arg0 = pointer, arg1 = old value, arg2 = new value, arg3 = memory.
// if *arg0 == arg1 {
// *arg0 = arg2
// return (true, memory)
// } else {
// return (false, memory)
// }
// DBAR
// MOVV $0, Rout
// LL (Rarg0), Rtmp
// BNE Rtmp, Rarg1, 4(PC)
// MOVV Rarg2, Rout
// SC Rout, (Rarg0)
// BEQ Rout, -4(PC)
// DBAR
{name: "LoweredAtomicCas32", argLength: 4, reg: gpcas, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
{name: "LoweredAtomicCas64", argLength: 4, reg: gpcas, resultNotInArgs: true, faultOnNilArg0: true, hasSideEffects: true, unsafePoint: true},
// pseudo-ops
{name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gpg}}, nilCheck: true, faultOnNilArg0: true}, // panic if arg0 is nil. arg1=mem.
{name: "FPFlagTrue", argLength: 1, reg: readflags}, // bool, true if FP flag is true
{name: "FPFlagFalse", argLength: 1, reg: readflags}, // bool, true if FP flag is false
// Scheduler ensures LoweredGetClosurePtr occurs only in entry block,
// and sorts it to the very beginning of the block to prevent other
// use of R22 (loong64.REGCTXT, the closure pointer)
{name: "LoweredGetClosurePtr", reg: regInfo{outputs: []regMask{buildReg("R29")}}, zeroWidth: true},
// LoweredGetCallerSP returns the SP of the caller of the current function. arg0=mem.
{name: "LoweredGetCallerSP", argLength: 1, reg: gp01, rematerializeable: true},
// LoweredGetCallerPC evaluates to the PC to which its "caller" will return.
// I.e., if f calls g "calls" sys.GetCallerPC,
// the result should be the PC within f that g will return to.
// See runtime/stubs.go for a more detailed discussion.
{name: "LoweredGetCallerPC", reg: gp01, rematerializeable: true},
// LoweredWB invokes runtime.gcWriteBarrier. arg0=mem, auxint=# of buffer entries needed
// It saves all GP registers if necessary,
// but clobbers R1 (LR) because it's a call
// and R30 (REGTMP).
// Returns a pointer to a write barrier buffer in R29.
{name: "LoweredWB", argLength: 1, reg: regInfo{clobbers: (callerSave &^ gpg) | buildReg("R1"), outputs: []regMask{buildReg("R29")}}, clobberFlags: true, aux: "Int64"},
// There are three of these functions so that they can have three different register inputs.
// When we check 0 <= c <= cap (A), then 0 <= b <= c (B), then 0 <= a <= b (C), we want the
// default registers to match so we don't need to copy registers around unnecessarily.
{name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r3, r4}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
{name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r2, r3}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
{name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r1, r2}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
}
blocks := []blockData{
{name: "EQ", controls: 1},
{name: "NE", controls: 1},
cmd/compile/internal: optimize condition branch implementation os: linux goarch: loong64 pkg: test/bench/go1 cpu: Loongson-3A6000 @ 2500.00MHz │ old │ new │ │ sec/op │ sec/op vs base │ BinaryTree17 7.521 ± 1% 7.551 ± 2% ~ (p=0.190 n=10) Fannkuch11 2.736 ± 0% 2.667 ± 0% -2.51% (p=0.000 n=10) FmtFprintfEmpty 34.42n ± 0% 35.22n ± 0% +2.32% (p=0.000 n=10) FmtFprintfString 61.24n ± 0% 56.84n ± 0% -7.18% (p=0.000 n=10) FmtFprintfInt 68.04n ± 0% 65.65n ± 0% -3.51% (p=0.000 n=10) FmtFprintfIntInt 111.9n ± 0% 106.0n ± 0% -5.32% (p=0.000 n=10) FmtFprintfPrefixedInt 131.4n ± 0% 122.5n ± 0% -6.77% (p=0.000 n=10) FmtFprintfFloat 241.1n ± 0% 235.1n ± 0% -2.51% (p=0.000 n=10) FmtManyArgs 553.7n ± 0% 518.9n ± 0% -6.28% (p=0.000 n=10) GobDecode 7.223m ± 1% 7.291m ± 1% +0.94% (p=0.004 n=10) GobEncode 6.741m ± 1% 6.622m ± 2% -1.77% (p=0.011 n=10) Gzip 288.9m ± 0% 280.3m ± 0% -3.00% (p=0.000 n=10) Gunzip 34.07m ± 0% 33.33m ± 0% -2.18% (p=0.000 n=10) HTTPClientServer 60.15µ ± 0% 60.63µ ± 0% +0.80% (p=0.000 n=10) JSONEncode 10.052m ± 1% 9.840m ± 0% -2.12% (p=0.000 n=10) JSONDecode 50.96m ± 0% 51.32m ± 0% +0.70% (p=0.002 n=10) Mandelbrot200 4.525m ± 0% 4.602m ± 0% +1.69% (p=0.000 n=10) GoParse 5.018m ± 0% 4.996m ± 0% -0.44% (p=0.000 n=10) RegexpMatchEasy0_32 58.74n ± 0% 59.95n ± 0% +2.06% (p=0.000 n=10) RegexpMatchEasy0_1K 464.9n ± 0% 466.1n ± 0% +0.26% (p=0.000 n=10) RegexpMatchEasy1_32 64.88n ± 0% 59.64n ± 0% -8.08% (p=0.000 n=10) RegexpMatchEasy1_1K 557.2n ± 0% 564.4n ± 0% +1.29% (p=0.000 n=10) RegexpMatchMedium_32 879.3n ± 0% 912.8n ± 1% +3.81% (p=0.000 n=10) RegexpMatchMedium_1K 28.08µ ± 0% 28.70µ ± 0% +2.20% (p=0.000 n=10) RegexpMatchHard_32 1.456µ ± 0% 1.414µ ± 0% -2.88% (p=0.000 n=10) RegexpMatchHard_1K 43.81µ ± 0% 42.23µ ± 0% -3.61% (p=0.000 n=10) Revcomp 472.4m ± 0% 474.5m ± 1% +0.45% (p=0.000 n=10) Template 83.45m ± 0% 83.39m ± 0% ~ (p=0.481 n=10) TimeParse 291.3n ± 0% 283.8n ± 0% -2.57% (p=0.000 n=10) TimeFormat 322.8n ± 0% 313.1n ± 0% -3.02% (p=0.000 n=10) geomean 54.32µ 53.45µ -1.61% Change-Id: If68fdd952ec6137c77e25ce8932358cac28da324 Reviewed-on: https://go-review.googlesource.com/c/go/+/620977 Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> Reviewed-by: abner chenc <chenguoqi@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Michael Pratt <mpratt@google.com> Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn>
2024-07-18 10:17:42 +08:00
{name: "LTZ", controls: 1}, // < 0
{name: "LEZ", controls: 1}, // <= 0
{name: "GTZ", controls: 1}, // > 0
{name: "GEZ", controls: 1}, // >= 0
{name: "FPT", controls: 1}, // FP flag is true
{name: "FPF", controls: 1}, // FP flag is false
{name: "BEQ", controls: 2}, // controls[0] == controls[1]
{name: "BNE", controls: 2}, // controls[0] == controls[1]
{name: "BGE", controls: 2}, // controls[0] >= controls[1]
{name: "BLT", controls: 2}, // controls[0] < controls[1]
{name: "BGEU", controls: 2}, // controls[0] >= controls[1], unsigned
{name: "BLTU", controls: 2}, // controls[0] < controls[1], unsigned
}
archs = append(archs, arch{
name: "LOONG64",
pkg: "cmd/internal/obj/loong64",
genfile: "../../loong64/ssa.go",
ops: ops,
blocks: blocks,
regnames: regNamesLOONG64,
// TODO: support register ABI on loong64
ParamIntRegNames: "R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19",
ParamFloatRegNames: "F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15",
gpregmask: gp,
fpregmask: fp,
framepointerreg: -1, // not used
linkreg: int8(num["R1"]),
})
}