go/test/codegen/shift.go

678 lines
16 KiB
Go
Raw Normal View History

// asmcheck
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package codegen
// ------------------ //
// constant shifts //
// ------------------ //
// lshConst64x64 shifts the signed 64-bit value v left by the constant 33.
//
// The per-architecture comments inside the body are assembly-check
// directives for this asmcheck test file; they must stay directly above the
// statement they describe and must not be reworded.
func lshConst64x64(v int64) int64 {
	// loong64:"SLLV"
	// ppc64x:"SLD"
	// riscv64:"SLLI",-"AND",-"SLTIU"
	return v << uint64(33)
}
// rshConst64Ux64 shifts the unsigned 64-bit value v right by the constant 33.
//
// The per-architecture comments inside the body are assembly-check
// directives for this asmcheck test file; they must stay directly above the
// statement they describe and must not be reworded.
func rshConst64Ux64(v uint64) uint64 {
	// loong64:"SRLV"
	// ppc64x:"SRD"
	// riscv64:"SRLI\t",-"AND",-"SLTIU"
	return v >> uint64(33)
}
// rshConst64Ux64Overflow32 widens the 32-bit value v to 64 bits and shifts it
// right by 32. Every bit that v can set is shifted out, so the result is
// always zero.
//
// The per-architecture comments inside the body are assembly-check
// directives for this asmcheck test file; they must stay directly above the
// statement they describe and must not be reworded.
func rshConst64Ux64Overflow32(v uint32) uint64 {
	// loong64:"MOVV\t\\$0,",-"SRL\t"
	// riscv64:"MOV\t\\$0,",-"SRL"
	return uint64(v) >> 32
}
// rshConst64Ux64Overflow16 widens the 16-bit value v to 64 bits and shifts it
// right by 16. Every bit that v can set is shifted out, so the result is
// always zero.
//
// The per-architecture comments inside the body are assembly-check
// directives for this asmcheck test file; they must stay directly above the
// statement they describe and must not be reworded.
func rshConst64Ux64Overflow16(v uint16) uint64 {
	// loong64:"MOVV\t\\$0,",-"SRLV"
	// riscv64:"MOV\t\\$0,",-"SRL"
	return uint64(v) >> 16
}
// rshConst64Ux64Overflow8 widens the 8-bit value v to 64 bits and shifts it
// right by 8. Every bit that v can set is shifted out, so the result is
// always zero.
//
// The per-architecture comments inside the body are assembly-check
// directives for this asmcheck test file; they must stay directly above the
// statement they describe and must not be reworded.
func rshConst64Ux64Overflow8(v uint8) uint64 {
	// loong64:"MOVV\t\\$0,",-"SRLV"
	// riscv64:"MOV\t\\$0,",-"SRL"
	return uint64(v) >> 8
}
// rshConst64x64 performs an arithmetic (sign-propagating) right shift of a
// signed 64-bit value by the constant 33.
//
// This is an asmcheck test. The directive comments inside the body are matched
// against the assembly generated for the return statement, so both the
// directives and the exact form of the shift expression are kept as they are.
// A pattern prefixed with a minus sign must not appear in the output.
func rshConst64x64(v int64) int64 {
	// loong64:"SRAV"
	// ppc64x:"SRAD"
	// riscv64:"SRAI\t",-"OR",-"SLTIU"
	return v >> uint64(33)
}
// rshConst64x64Overflow32 sign-extends an int32 to int64 and then shifts it
// right arithmetically by 32.
//
// The shift count equals the width of the source type, so every value bit is
// shifted out and only copies of the sign bit remain. The result is 0 for
// non-negative inputs and -1 for negative inputs.
//
// This is an asmcheck test. The directive comments inside the body are matched
// against the assembly generated for the return statement, so both the
// directives and the exact form of the expression are kept as they are.
// A pattern prefixed with a minus sign must not appear in the output.
func rshConst64x64Overflow32(v int32) int64 {
	// loong64:"SRA\t\\$31"
	// riscv64:"SRAIW",-"SLLI",-"SRAI\t"
	return int64(v) >> 32
}
// rshConst64x64Overflow16 sign-extends an int16 to int64 and then shifts it
// right arithmetically by 16.
//
// The shift count equals the width of the source type, so every value bit is
// shifted out and only copies of the sign bit remain. The result is 0 for
// non-negative inputs and -1 for negative inputs.
//
// This is an asmcheck test. The directive comments inside the body are matched
// against the assembly generated for the return statement, so both the
// directives and the exact form of the expression are kept as they are.
// A pattern prefixed with a minus sign must not appear in the output.
func rshConst64x64Overflow16(v int16) int64 {
	// loong64:"SLLV\t\\$48","SRAV\t\\$63"
	// riscv64:"SLLI","SRAI",-"SRAIW"
	return int64(v) >> 16
}
// rshConst64x64Overflow8 sign-extends an int8 to int64 and then shifts it
// right arithmetically by 8.
//
// The shift count equals the width of the source type, so every value bit is
// shifted out and only copies of the sign bit remain. The result is 0 for
// non-negative inputs and -1 for negative inputs.
//
// This is an asmcheck test. The directive comments inside the body are matched
// against the assembly generated for the return statement, so both the
// directives and the exact form of the expression are kept as they are.
// A pattern prefixed with a minus sign must not appear in the output.
func rshConst64x64Overflow8(v int8) int64 {
	// loong64:"SLLV\t\\$56","SRAV\t\\$63"
	// riscv64:"SLLI","SRAI",-"SRAIW"
	return int64(v) >> 8
}
cmd/compile: prefer an add when shifting left by 1 ADD(Q|L) has generally twice the throughput. Came up in CL 626998. Throughput by arch: Zen 4: SHLL (R64, 1): 0.5 ADD (R64, R64): 0.25 Intel Alder Lake: SHLL (R64, 1): 0.5 ADD (R64, R64): 0.2 Intel Haswell: SHLL (R64, 1): 0.5 ADD (R64, R64): 0.25 Also include a minor opt for: (x + x) << c -> x << (c + 1) Before this, the code: func addShift(x int64) int64 { return (x + x) << 1 } emitted two instructions: ADDQ AX, AX SHLQ $1, AX but we can do it in a single shift: SHLQ $2, AX Add a codegen test for clearing the last bit. compilecmp linux/amd64: math math.sqrt 243 -> 242 (-0.41%) math [cmd/compile] math.sqrt 243 -> 242 (-0.41%) runtime runtime.selectgo 5455 -> 5445 (-0.18%) runtime.sysargs 665 -> 662 (-0.45%) runtime.isPinned 145 -> 141 (-2.76%) runtime.atoi64 198 -> 194 (-2.02%) runtime.setPinned 714 -> 709 (-0.70%) runtime [cmd/compile] runtime.sysargs 665 -> 662 (-0.45%) runtime.setPinned 714 -> 709 (-0.70%) runtime.atoi64 198 -> 194 (-2.02%) runtime.isPinned 145 -> 141 (-2.76%) strconv strconv.computeBounds 109 -> 107 (-1.83%) strconv.FormatInt 201 -> 197 (-1.99%) strconv.ryuFtoaShortest 1298 -> 1266 (-2.47%) strconv.small 144 -> 134 (-6.94%) strconv.AppendInt 357 -> 344 (-3.64%) strconv.ryuDigits32 490 -> 488 (-0.41%) strconv.AppendUint 342 -> 340 (-0.58%) strconv [cmd/compile] strconv.FormatInt 201 -> 197 (-1.99%) strconv.ryuFtoaShortest 1298 -> 1266 (-2.47%) strconv.ryuDigits32 490 -> 488 (-0.41%) strconv.AppendUint 342 -> 340 (-0.58%) strconv.computeBounds 109 -> 107 (-1.83%) strconv.small 144 -> 134 (-6.94%) strconv.AppendInt 357 -> 344 (-3.64%) image image.Rectangle.Inset 101 -> 97 (-3.96%) regexp/syntax regexp/syntax.inCharClass.func1 111 -> 110 (-0.90%) regexp/syntax.(*compiler).quest 586 -> 573 (-2.22%) regexp/syntax.ranges.Less 153 -> 150 (-1.96%) regexp/syntax.(*compiler).loop 583 -> 568 (-2.57%) time time.Time.Before 179 -> 161 (-10.06%) time.Time.Compare 189 -> 166 (-12.17%) time.Time.Sub 444 -> 425 
(-4.28%) time.Time.UnixMicro 106 -> 95 (-10.38%) time.div 592 -> 587 (-0.84%) time.Time.UnixNano 85 -> 78 (-8.24%) time.(*Time).UnixMilli 141 -> 140 (-0.71%) time.Time.UnixMilli 106 -> 95 (-10.38%) time.(*Time).UnixMicro 141 -> 140 (-0.71%) time.Time.After 179 -> 161 (-10.06%) time.Time.Equal 170 -> 150 (-11.76%) time.Time.AppendBinary 766 -> 757 (-1.17%) time.Time.IsZero 74 -> 66 (-10.81%) time.(*Time).UnixNano 124 -> 113 (-8.87%) time.(*Time).IsZero 113 -> 108 (-4.42%) regexp regexp.(*Regexp).FindAllStringSubmatch.func1 590 -> 569 (-3.56%) regexp.QuoteMeta 485 -> 469 (-3.30%) regexp/syntax [cmd/compile] regexp/syntax.inCharClass.func1 111 -> 110 (-0.90%) regexp/syntax.(*compiler).loop 583 -> 568 (-2.57%) regexp/syntax.(*compiler).quest 586 -> 573 (-2.22%) regexp/syntax.ranges.Less 153 -> 150 (-1.96%) encoding/base64 encoding/base64.decodedLen 92 -> 90 (-2.17%) encoding/base64.(*Encoding).DecodedLen 99 -> 97 (-2.02%) time [cmd/compile] time.(*Time).IsZero 113 -> 108 (-4.42%) time.Time.IsZero 74 -> 66 (-10.81%) time.(*Time).UnixNano 124 -> 113 (-8.87%) time.Time.UnixMilli 106 -> 95 (-10.38%) time.Time.Equal 170 -> 150 (-11.76%) time.Time.UnixMicro 106 -> 95 (-10.38%) time.(*Time).UnixMicro 141 -> 140 (-0.71%) time.Time.Before 179 -> 161 (-10.06%) time.Time.UnixNano 85 -> 78 (-8.24%) time.Time.AppendBinary 766 -> 757 (-1.17%) time.div 592 -> 587 (-0.84%) time.Time.After 179 -> 161 (-10.06%) time.Time.Compare 189 -> 166 (-12.17%) time.(*Time).UnixMilli 141 -> 140 (-0.71%) time.Time.Sub 444 -> 425 (-4.28%) index/suffixarray index/suffixarray.sais_8_32 1677 -> 1645 (-1.91%) index/suffixarray.sais_32 1677 -> 1645 (-1.91%) index/suffixarray.sais_64 1677 -> 1654 (-1.37%) index/suffixarray.sais_8_64 1677 -> 1654 (-1.37%) index/suffixarray.writeInt 249 -> 247 (-0.80%) os os.Expand 1070 -> 1051 (-1.78%) os.Chtimes 787 -> 774 (-1.65%) regexp [cmd/compile] regexp.(*Regexp).FindAllStringSubmatch.func1 590 -> 569 (-3.56%) regexp.QuoteMeta 485 -> 469 (-3.30%) encoding/base64 
[cmd/compile] encoding/base64.decodedLen 92 -> 90 (-2.17%) encoding/base64.(*Encoding).DecodedLen 99 -> 97 (-2.02%) encoding/hex encoding/hex.Encode 138 -> 136 (-1.45%) encoding/hex.(*decoder).Read 830 -> 824 (-0.72%) crypto/des crypto/des.initFeistelBox 235 -> 229 (-2.55%) crypto/des.cryptBlock 549 -> 538 (-2.00%) os [cmd/compile] os.Chtimes 787 -> 774 (-1.65%) os.Expand 1070 -> 1051 (-1.78%) math/big math/big.newFloat 238 -> 223 (-6.30%) math/big.nat.mul 2138 -> 2122 (-0.75%) math/big.karatsubaSqr 1372 -> 1369 (-0.22%) math/big.(*Float).sqrtInverse 895 -> 878 (-1.90%) math/big.basicSqr 1032 -> 1017 (-1.45%) cmd/vendor/golang.org/x/sys/unix cmd/vendor/golang.org/x/sys/unix.TimeToTimespec 72 -> 66 (-8.33%) encoding/json encoding/json.Indent 404 -> 403 (-0.25%) encoding/json.MarshalIndent 303 -> 297 (-1.98%) testing testing.(*T).Deadline 84 -> 82 (-2.38%) testing.(*M).Run 3545 -> 3525 (-0.56%) archive/zip archive/zip.headerFileInfo.ModTime 229 -> 223 (-2.62%) encoding/gob encoding/gob.(*encoderState).encodeInt 474 -> 469 (-1.05%) crypto/elliptic crypto/elliptic.Marshal 728 -> 714 (-1.92%) debug/buildinfo debug/buildinfo.readString 325 -> 315 (-3.08%) image/png image/png.(*decoder).readImagePass 10866 -> 10834 (-0.29%) archive/tar archive/tar.Header.allowedFormats.func3 1768 -> 1736 (-1.81%) archive/tar.formatPAXTime 389 -> 358 (-7.97%) archive/tar.(*Writer).writeGNUHeader 741 -> 727 (-1.89%) archive/tar.readGNUSparseMap0x1 709 -> 695 (-1.97%) archive/tar.(*Writer).templateV7Plus 915 -> 909 (-0.66%) crypto/internal/cryptotest crypto/internal/cryptotest.TestHash.func4 890 -> 879 (-1.24%) crypto/internal/cryptotest.TestStream.func6.1 646 -> 645 (-0.15%) crypto/internal/cryptotest.testCipher.func3 1300 -> 1289 (-0.85%) internal/pkgbits internal/pkgbits.(*Encoder).Int64 113 -> 103 (-8.85%) internal/pkgbits.(*Encoder).rawVarint 74 -> 72 (-2.70%) testing/quick testing/quick.(*Config).getRand 316 -> 315 (-0.32%) log/slog log/slog.TimeValue 489 -> 479 (-2.04%) runtime/pprof 
runtime/pprof.(*profileBuilder).build 2341 -> 2322 (-0.81%) internal/coverage/cfile internal/coverage/cfile.(*emitState).openMetaFile 824 -> 822 (-0.24%) internal/coverage/cfile.(*emitState).openCounterFile 904 -> 892 (-1.33%) cmd/internal/objabi cmd/internal/objabi.expandArgs 1177 -> 1169 (-0.68%) crypto/ecdsa crypto/ecdsa.pointFromAffine 1162 -> 1144 (-1.55%) net net.minNonzeroTime 313 -> 308 (-1.60%) net.cgoLookupAddrPTR 812 -> 797 (-1.85%) net.(*IPNet).String 851 -> 827 (-2.82%) net.IP.AppendText 488 -> 471 (-3.48%) net.IPMask.String 281 -> 270 (-3.91%) net.partialDeadline 374 -> 366 (-2.14%) net.hexString 249 -> 240 (-3.61%) net.IP.String 454 -> 453 (-0.22%) internal/fuzz internal/fuzz.newPcgRand 240 -> 234 (-2.50%) crypto/x509 crypto/x509.(*Certificate).isValid 2642 -> 2611 (-1.17%) cmd/internal/obj/s390x cmd/internal/obj/s390x.buildop 33676 -> 33644 (-0.10%) encoding/hex [cmd/compile] encoding/hex.(*decoder).Read 830 -> 824 (-0.72%) encoding/hex.Encode 138 -> 136 (-1.45%) cmd/internal/objabi [cmd/compile] cmd/internal/objabi.expandArgs 1177 -> 1169 (-0.68%) math/big [cmd/compile] math/big.(*Float).sqrtInverse 895 -> 878 (-1.90%) math/big.nat.mul 2138 -> 2122 (-0.75%) math/big.karatsubaSqr 1372 -> 1369 (-0.22%) math/big.basicSqr 1032 -> 1017 (-1.45%) math/big.newFloat 238 -> 223 (-6.30%) encoding/json [cmd/compile] encoding/json.MarshalIndent 303 -> 297 (-1.98%) encoding/json.Indent 404 -> 403 (-0.25%) cmd/covdata main.(*metaMerge).emitCounters 985 -> 973 (-1.22%) runtime/pprof [cmd/compile] runtime/pprof.(*profileBuilder).build 2341 -> 2322 (-0.81%) cmd/compile/internal/syntax cmd/compile/internal/syntax.(*source).fill 722 -> 703 (-2.63%) cmd/dist main.runInstall 19081 -> 19049 (-0.17%) crypto/tls crypto/tls.extractPadding 176 -> 175 (-0.57%) slices.Clone[[]crypto/tls.SignatureScheme,crypto/tls.SignatureScheme] 253 -> 247 (-2.37%) slices.Clone[[]uint16,uint16] 253 -> 247 (-2.37%) slices.Clone[[]crypto/tls.CurveID,crypto/tls.CurveID] 253 -> 247 (-2.37%) 
crypto/tls.(*Config).cipherSuites 335 -> 326 (-2.69%) slices.DeleteFunc[go.shape.[]crypto/tls.CurveID,go.shape.uint16] 437 -> 434 (-0.69%) crypto/tls.dial 1349 -> 1339 (-0.74%) slices.DeleteFunc[go.shape.[]uint16,go.shape.uint16] 437 -> 434 (-0.69%) internal/pkgbits [cmd/compile] internal/pkgbits.(*Encoder).Int64 113 -> 103 (-8.85%) internal/pkgbits.(*Encoder).rawVarint 74 -> 72 (-2.70%) cmd/compile/internal/syntax [cmd/compile] cmd/compile/internal/syntax.(*source).fill 722 -> 703 (-2.63%) cmd/internal/obj/s390x [cmd/compile] cmd/internal/obj/s390x.buildop 33676 -> 33644 (-0.10%) cmd/go/internal/trace cmd/go/internal/trace.Flow 910 -> 886 (-2.64%) cmd/go/internal/trace.(*Span).Done 311 -> 304 (-2.25%) cmd/go/internal/trace.StartSpan 620 -> 615 (-0.81%) cmd/internal/script cmd/internal/script.(*Engine).Execute.func2 534 -> 528 (-1.12%) cmd/link/internal/loader cmd/link/internal/loader.(*Loader).SetSymSect 344 -> 338 (-1.74%) net/http net/http.(*Transport).queueForIdleConn 1797 -> 1766 (-1.73%) net/http.(*Transport).getConn 2149 -> 2131 (-0.84%) net/http.(*http2ClientConn).tooIdleLocked 207 -> 197 (-4.83%) net/http.(*http2responseWriter).SetWriteDeadline.func1 520 -> 508 (-2.31%) net/http.(*Cookie).Valid 837 -> 818 (-2.27%) net/http.(*http2responseWriter).SetReadDeadline 373 -> 357 (-4.29%) net/http.checkIfRange 701 -> 690 (-1.57%) net/http.(*http2SettingsFrame).Value 325 -> 298 (-8.31%) net/http.(*http2SettingsFrame).HasDuplicates 777 -> 767 (-1.29%) net/http.(*Server).Serve 1746 -> 1739 (-0.40%) net/http.http2traceGotConn 569 -> 556 (-2.28%) net/http/pprof net/http/pprof.collectProfile 242 -> 239 (-1.24%) cmd/compile/internal/coverage cmd/compile/internal/coverage.metaHashAndLen 439 -> 438 (-0.23%) cmd/vendor/golang.org/x/telemetry/internal/upload cmd/vendor/golang.org/x/telemetry/internal/upload.(*uploader).findWork 4570 -> 4540 (-0.66%) cmd/vendor/golang.org/x/telemetry/internal/upload.(*uploader).reports 3604 -> 3572 (-0.89%) cmd/compile/internal/coverage 
[cmd/compile] cmd/compile/internal/coverage.metaHashAndLen 439 -> 438 (-0.23%) cmd/vendor/golang.org/x/text/language cmd/vendor/golang.org/x/text/language.regionGroupDist 287 -> 284 (-1.05%) cmd/go/internal/vcweb cmd/go/internal/vcweb.(*Server).overview.func1 1045 -> 1041 (-0.38%) cmd/go/internal/vcs cmd/go/internal/vcs.expand 761 -> 741 (-2.63%) cmd/compile/internal/inline/inlheur slices.stableCmpFunc[go.shape.struct 2300 -> 2284 (-0.70%) cmd/compile/internal/inline/inlheur [cmd/compile] slices.stableCmpFunc[go.shape.struct 2300 -> 2284 (-0.70%) cmd/go/internal/modfetch/codehost cmd/go/internal/modfetch/codehost.bzrParseStat 2217 -> 2213 (-0.18%) cmd/link/internal/ld cmd/link/internal/ld.decodetypeStructFieldCount 157 -> 152 (-3.18%) cmd/link/internal/ld.(*Link).address 12559 -> 12495 (-0.51%) cmd/link/internal/ld.(*dodataState).allocateDataSections 18345 -> 18205 (-0.76%) cmd/link/internal/ld.elfshreloc 618 -> 616 (-0.32%) cmd/link/internal/ld.(*deadcodePass).decodetypeMethods 794 -> 779 (-1.89%) cmd/link/internal/ld.(*dodataState).assignDsymsToSection 668 -> 663 (-0.75%) cmd/link/internal/ld.relocSectFn 285 -> 284 (-0.35%) cmd/link/internal/ld.decodetypeIfaceMethodCount 146 -> 144 (-1.37%) cmd/link/internal/ld.decodetypeArrayLen 157 -> 152 (-3.18%) cmd/link/internal/arm64 cmd/link/internal/arm64.gensymlate.func1 895 -> 888 (-0.78%) cmd/go/internal/modload cmd/go/internal/modload.queryProxy.func3 1029 -> 1012 (-1.65%) cmd/go/internal/load cmd/go/internal/load.(*Package).setBuildInfo 8453 -> 8447 (-0.07%) cmd/go/internal/clean cmd/go/internal/clean.runClean 2120 -> 2104 (-0.75%) cmd/compile/internal/ssa cmd/compile/internal/ssa.(*poset).aliasnodes 2010 -> 1978 (-1.59%) cmd/compile/internal/ssa.rewriteValueARM64_OpARM64MOVHstoreidx2 730 -> 719 (-1.51%) cmd/compile/internal/ssa.(*debugState).buildLocationLists 3326 -> 3294 (-0.96%) cmd/compile/internal/ssa.rewriteValueAMD64_OpAMD64ADDLconst 3069 -> 2941 (-4.17%) cmd/compile/internal/ssa.(*debugState).processValue 
9756 -> 9724 (-0.33%) cmd/compile/internal/ssa.rewriteValueAMD64_OpAMD64ADDQconst 3069 -> 2941 (-4.17%) cmd/compile/internal/ssa.(*poset).mergeroot 1079 -> 1054 (-2.32%) cmd/compile/internal/ssa [cmd/compile] cmd/compile/internal/ssa.rewriteValueARM64_OpARM64MOVHstoreidx2 730 -> 719 (-1.51%) cmd/compile/internal/ssa.(*poset).aliasnodes 2010 -> 1978 (-1.59%) cmd/compile/internal/ssa.(*poset).mergeroot 1079 -> 1054 (-2.32%) cmd/compile/internal/ssa.rewriteValueAMD64_OpAMD64ADDQconst 3069 -> 2941 (-4.17%) cmd/compile/internal/ssa.rewriteValueAMD64_OpAMD64ADDLconst 3069 -> 2941 (-4.17%) file before after Δ % math/bits.s 2352 2354 +2 +0.085% math/bits [cmd/compile].s 2352 2354 +2 +0.085% math.s 35675 35674 -1 -0.003% math [cmd/compile].s 35675 35674 -1 -0.003% runtime.s 577251 577245 -6 -0.001% runtime [cmd/compile].s 642419 642438 +19 +0.003% sort.s 37434 37435 +1 +0.003% strconv.s 48391 48343 -48 -0.099% sort [cmd/compile].s 37434 37435 +1 +0.003% bufio.s 21386 21418 +32 +0.150% strconv [cmd/compile].s 48391 48343 -48 -0.099% image.s 34978 35022 +44 +0.126% regexp/syntax.s 81719 81781 +62 +0.076% time.s 94341 94184 -157 -0.166% regexp.s 60411 60399 -12 -0.020% bufio [cmd/compile].s 21512 21544 +32 +0.149% encoding/binary.s 34062 34087 +25 +0.073% regexp/syntax [cmd/compile].s 81719 81781 +62 +0.076% encoding/base64.s 11907 11903 -4 -0.034% time [cmd/compile].s 94341 94184 -157 -0.166% index/suffixarray.s 41633 41527 -106 -0.255% os.s 101770 101738 -32 -0.031% regexp [cmd/compile].s 60411 60399 -12 -0.020% encoding/binary [cmd/compile].s 37173 37198 +25 +0.067% encoding/base64 [cmd/compile].s 11907 11903 -4 -0.034% os/exec.s 23900 23907 +7 +0.029% encoding/hex.s 6038 6030 -8 -0.132% crypto/des.s 5073 5056 -17 -0.335% os [cmd/compile].s 102030 101998 -32 -0.031% vendor/golang.org/x/net/http2/hpack.s 22027 22033 +6 +0.027% math/big.s 164808 164753 -55 -0.033% cmd/vendor/golang.org/x/sys/unix.s 121450 121444 -6 -0.005% encoding/json.s 110294 110287 -7 -0.006% testing.s 
115303 115281 -22 -0.019% archive/zip.s 65329 65325 -4 -0.006% os/user.s 10078 10080 +2 +0.020% encoding/gob.s 143788 143783 -5 -0.003% crypto/elliptic.s 30686 30704 +18 +0.059% go/doc/comment.s 49401 49433 +32 +0.065% debug/buildinfo.s 9095 9085 -10 -0.110% image/png.s 36113 36081 -32 -0.089% archive/tar.s 71994 71897 -97 -0.135% crypto/internal/cryptotest.s 60872 60849 -23 -0.038% internal/pkgbits.s 20441 20429 -12 -0.059% testing/quick.s 8236 8235 -1 -0.012% log/slog.s 77568 77558 -10 -0.013% internal/trace/internal/oldtrace.s 52885 52896 +11 +0.021% runtime/pprof.s 123978 123969 -9 -0.007% internal/coverage/cfile.s 25198 25184 -14 -0.056% cmd/internal/objabi.s 19954 19946 -8 -0.040% crypto/ecdsa.s 29159 29141 -18 -0.062% log/slog/internal/benchmarks.s 6694 6695 +1 +0.015% net.s 299569 299503 -66 -0.022% os/exec [cmd/compile].s 23888 23895 +7 +0.029% internal/trace.s 179226 179240 +14 +0.008% internal/fuzz.s 86190 86191 +1 +0.001% crypto/x509.s 177195 177164 -31 -0.017% cmd/internal/obj/s390x.s 121642 121610 -32 -0.026% cmd/internal/obj/ppc64.s 140118 140122 +4 +0.003% encoding/hex [cmd/compile].s 6149 6141 -8 -0.130% cmd/internal/objabi [cmd/compile].s 19954 19946 -8 -0.040% cmd/internal/obj/arm64.s 158523 158555 +32 +0.020% go/doc/comment [cmd/compile].s 49512 49544 +32 +0.065% math/big [cmd/compile].s 166394 166339 -55 -0.033% encoding/json [cmd/compile].s 110712 110705 -7 -0.006% cmd/covdata.s 39699 39687 -12 -0.030% runtime/pprof [cmd/compile].s 125209 125200 -9 -0.007% cmd/compile/internal/syntax.s 181755 181736 -19 -0.010% cmd/dist.s 177893 177861 -32 -0.018% crypto/tls.s 389157 389113 -44 -0.011% internal/pkgbits [cmd/compile].s 41644 41632 -12 -0.029% cmd/compile/internal/syntax [cmd/compile].s 196105 196086 -19 -0.010% cmd/compile/internal/types.s 71315 71345 +30 +0.042% cmd/internal/obj/s390x [cmd/compile].s 121733 121701 -32 -0.026% cmd/go/internal/trace.s 4796 4760 -36 -0.751% cmd/internal/obj/arm64 [cmd/compile].s 168120 168147 +27 +0.016% 
cmd/internal/obj/ppc64 [cmd/compile].s 140219 140223 +4 +0.003% cmd/internal/script.s 83442 83436 -6 -0.007% cmd/link/internal/loader.s 93299 93294 -5 -0.005% net/http.s 620639 620472 -167 -0.027% net/http/pprof.s 35016 35013 -3 -0.009% cmd/compile/internal/coverage.s 6668 6667 -1 -0.015% cmd/vendor/golang.org/x/telemetry/internal/upload.s 34210 34148 -62 -0.181% cmd/compile/internal/coverage [cmd/compile].s 6664 6663 -1 -0.015% cmd/vendor/golang.org/x/text/language.s 48077 48074 -3 -0.006% cmd/go/internal/vcweb.s 45193 45189 -4 -0.009% cmd/go/internal/vcs.s 44749 44729 -20 -0.045% cmd/compile/internal/inline/inlheur.s 83758 83742 -16 -0.019% cmd/compile/internal/inline/inlheur [cmd/compile].s 84773 84757 -16 -0.019% cmd/go/internal/modfetch/codehost.s 89098 89094 -4 -0.004% cmd/trace.s 257550 257564 +14 +0.005% cmd/link/internal/ld.s 641945 641706 -239 -0.037% cmd/link/internal/arm64.s 34805 34798 -7 -0.020% cmd/go/internal/modload.s 328971 328954 -17 -0.005% cmd/go/internal/load.s 178877 178871 -6 -0.003% cmd/go/internal/clean.s 11006 10990 -16 -0.145% cmd/compile/internal/ssa.s 3552843 3553347 +504 +0.014% cmd/compile/internal/ssa [cmd/compile].s 3752511 3753123 +612 +0.016% total 36179015 36178687 -328 -0.001% Change-Id: I251c2898ccf3c9931d162d87dabbd49cf4ec73a5 Reviewed-on: https://go-review.googlesource.com/c/go/+/641757 Reviewed-by: Keith Randall <khr@google.com> Auto-Submit: Keith Randall <khr@golang.org> Reviewed-by: Keith Randall <khr@golang.org> Reviewed-by: Cherry Mui <cherryyz@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
2025-01-11 19:26:57 +01:00
// lshConst32x1 returns v shifted left by the constant 1.
//
// The comment inside the body is an asmcheck directive, not documentation:
// on amd64 it expects an ADDL in the generated code and no SHLL. Keep it
// directly above the shift statement it checks.
func lshConst32x1(v int32) int32 {
// amd64:"ADDL", -"SHLL"
return v << 1
}
// lshConst64x1 returns v shifted left by the constant 1.
//
// The comment inside the body is an asmcheck directive, not documentation:
// on amd64 it expects an ADDQ in the generated code and no SHLQ. Keep it
// directly above the shift statement it checks.
func lshConst64x1(v int64) int64 {
// amd64:"ADDQ", -"SHLQ"
return v << 1
}
// lshConst32x64 returns v shifted left by the constant 29, with the shift
// count written as a uint64 value.
//
// The comments inside the body are asmcheck directives, one per
// architecture. A quoted pattern must appear in the code generated for the
// shift statement; a pattern prefixed with '-' must not appear. Keep them
// directly above the return statement they check.
func lshConst32x64(v int32) int32 {
	// loong64:"SLL\t"
	// ppc64x:"SLW"
	// riscv64:"SLLI",-"AND",-"SLTIU", -"MOVW"
	return v << uint64(29)
}
// rshConst32Ux64 returns the unsigned value v shifted right by the constant
// 29, with the shift count written as a uint64 value.
//
// The comments inside the body are asmcheck directives, one per
// architecture. A quoted pattern must appear in the code generated for the
// shift statement; a pattern prefixed with '-' must not appear. Keep them
// directly above the return statement they check.
func rshConst32Ux64(v uint32) uint32 {
	// loong64:"SRL\t"
	// ppc64x:"SRW"
	// riscv64:"SRLIW",-"AND",-"SLTIU", -"MOVW"
	return v >> uint64(29)
}
func rshConst32x64(v int32) int32 {
cmd/compile: optimize shifts of int32 and uint32 on loong64 goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A6000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | LeadingZeros 1.100n ± 1% 1.101n ± 0% ~ (p=0.566 n=10) LeadingZeros8 1.501n ± 0% 1.502n ± 0% +0.07% (p=0.000 n=10) LeadingZeros16 1.501n ± 0% 1.502n ± 0% +0.07% (p=0.000 n=10) LeadingZeros32 1.2010n ± 0% 0.9511n ± 0% -20.81% (p=0.000 n=10) LeadingZeros64 1.104n ± 1% 1.119n ± 0% +1.40% (p=0.000 n=10) TrailingZeros 0.8137n ± 0% 0.8086n ± 0% -0.63% (p=0.001 n=10) TrailingZeros8 1.031n ± 1% 1.031n ± 1% ~ (p=0.956 n=10) TrailingZeros16 0.8204n ± 1% 0.8114n ± 0% -1.11% (p=0.000 n=10) TrailingZeros32 0.8145n ± 0% 0.8090n ± 0% -0.68% (p=0.000 n=10) TrailingZeros64 0.8159n ± 0% 0.8089n ± 1% -0.86% (p=0.000 n=10) OnesCount 0.8672n ± 0% 0.8677n ± 0% +0.06% (p=0.000 n=10) OnesCount8 0.8005n ± 0% 0.8009n ± 0% +0.06% (p=0.000 n=10) OnesCount16 0.9339n ± 0% 0.9344n ± 0% +0.05% (p=0.000 n=10) OnesCount32 0.8672n ± 0% 0.8677n ± 0% +0.06% (p=0.000 n=10) OnesCount64 1.201n ± 0% 1.201n ± 0% ~ (p=0.474 n=10) RotateLeft 0.8005n ± 0% 0.8009n ± 0% +0.05% (p=0.000 n=10) RotateLeft8 1.202n ± 0% 1.202n ± 0% ~ (p=0.210 n=10) RotateLeft16 0.8050n ± 0% 0.8036n ± 0% -0.17% (p=0.002 n=10) RotateLeft32 0.6674n ± 0% 0.6674n ± 0% ~ (p=1.000 n=10) RotateLeft64 0.6673n ± 0% 0.6674n ± 0% ~ (p=0.072 n=10) Reverse 0.4123n ± 0% 0.4067n ± 1% -1.37% (p=0.000 n=10) Reverse8 0.8005n ± 0% 0.8009n ± 0% +0.05% (p=0.000 n=10) Reverse16 0.8004n ± 0% 0.8009n ± 0% +0.06% (p=0.000 n=10) Reverse32 0.8004n ± 0% 0.8009n ± 0% +0.06% (p=0.000 n=10) Reverse64 0.8004n ± 0% 0.8009n ± 0% +0.06% (p=0.001 n=10) ReverseBytes 0.4100n ± 1% 0.4057n ± 1% -1.06% (p=0.002 n=10) ReverseBytes16 0.8004n ± 0% 0.8009n ± 0% +0.07% (p=0.000 n=10) ReverseBytes32 0.8005n ± 0% 0.8009n ± 0% +0.05% (p=0.000 n=10) ReverseBytes64 0.8005n ± 0% 0.8009n ± 0% +0.05% (p=0.000 n=10) Add 1.201n ± 0% 1.201n ± 0% ~ (p=1.000 n=10) Add32 1.201n ± 0% 1.201n ± 0% ~ 
(p=0.474 n=10) Add64 1.201n ± 0% 1.201n ± 0% ~ (p=1.000 n=10) Add64multiple 1.831n ± 0% 1.832n ± 0% ~ (p=1.000 n=10) Sub 1.201n ± 0% 1.201n ± 0% ~ (p=1.000 n=10) Sub32 1.601n ± 0% 1.602n ± 0% +0.06% (p=0.000 n=10) Sub64 1.201n ± 0% 1.201n ± 0% ~ (p=0.474 n=10) Sub64multiple 2.400n ± 0% 2.402n ± 0% +0.10% (p=0.000 n=10) Mul 0.8005n ± 0% 0.8009n ± 0% +0.05% (p=0.000 n=10) Mul32 0.8005n ± 0% 0.8009n ± 0% +0.05% (p=0.000 n=10) Mul64 0.8004n ± 0% 0.8008n ± 0% +0.05% (p=0.000 n=10) Div 9.107n ± 0% 9.083n ± 0% ~ (p=0.255 n=10) Div32 4.009n ± 0% 4.011n ± 0% +0.05% (p=0.000 n=10) Div64 9.705n ± 0% 9.711n ± 0% +0.06% (p=0.000 n=10) geomean 1.089n 1.083n -0.62% goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | LeadingZeros 1.352n ± 0% 1.341n ± 4% -0.81% (p=0.024 n=10) LeadingZeros8 1.766n ± 0% 1.781n ± 0% +0.88% (p=0.000 n=10) LeadingZeros16 1.766n ± 0% 1.782n ± 0% +0.88% (p=0.000 n=10) LeadingZeros32 1.536n ± 0% 1.341n ± 1% -12.73% (p=0.000 n=10) LeadingZeros64 1.351n ± 1% 1.338n ± 0% -0.96% (p=0.000 n=10) TrailingZeros 0.9037n ± 0% 0.9025n ± 0% -0.12% (p=0.020 n=10) TrailingZeros8 1.087n ± 3% 1.056n ± 0% ~ (p=0.060 n=10) TrailingZeros16 1.101n ± 0% 1.101n ± 0% ~ (p=0.211 n=10) TrailingZeros32 0.9040n ± 0% 0.9024n ± 1% -0.18% (p=0.017 n=10) TrailingZeros64 0.9043n ± 0% 0.9028n ± 1% ~ (p=0.118 n=10) OnesCount 1.503n ± 2% 1.482n ± 1% -1.43% (p=0.001 n=10) OnesCount8 1.207n ± 0% 1.206n ± 0% -0.12% (p=0.000 n=10) OnesCount16 1.501n ± 0% 1.534n ± 0% +2.13% (p=0.000 n=10) OnesCount32 1.483n ± 1% 1.531n ± 1% +3.27% (p=0.000 n=10) OnesCount64 1.301n ± 0% 1.302n ± 0% +0.08% (p=0.000 n=10) RotateLeft 0.8136n ± 4% 0.8083n ± 0% -0.66% (p=0.002 n=10) RotateLeft8 1.311n ± 0% 1.310n ± 0% ~ (p=0.786 n=10) RotateLeft16 1.165n ± 0% 1.149n ± 0% -1.33% (p=0.001 n=10) RotateLeft32 0.8138n ± 1% 0.8093n ± 0% -0.57% (p=0.017 n=10) RotateLeft64 0.8149n ± 1% 0.8088n ± 0% -0.74% (p=0.000 n=10) Reverse 0.5195n ± 1% 0.5109n 
± 0% -1.67% (p=0.000 n=10) Reverse8 0.8007n ± 0% 0.8010n ± 0% +0.04% (p=0.000 n=10) Reverse16 0.8007n ± 0% 0.8010n ± 0% +0.04% (p=0.000 n=10) Reverse32 0.8007n ± 0% 0.8010n ± 0% +0.04% (p=0.012 n=10) Reverse64 0.8007n ± 0% 0.8010n ± 0% +0.04% (p=0.010 n=10) ReverseBytes 0.5120n ± 1% 0.5122n ± 2% ~ (p=0.306 n=10) ReverseBytes16 0.8007n ± 0% 0.8010n ± 0% +0.04% (p=0.000 n=10) ReverseBytes32 0.8007n ± 0% 0.8010n ± 0% +0.04% (p=0.000 n=10) ReverseBytes64 0.8007n ± 0% 0.8010n ± 0% +0.04% (p=0.000 n=10) Add 1.201n ± 0% 1.201n ± 4% ~ (p=0.334 n=10) Add32 1.201n ± 0% 1.201n ± 0% ~ (p=0.563 n=10) Add64 1.201n ± 0% 1.201n ± 1% ~ (p=0.652 n=10) Add64multiple 1.909n ± 0% 1.902n ± 0% ~ (p=0.126 n=10) Sub 1.201n ± 0% 1.201n ± 0% ~ (p=1.000 n=10) Sub32 1.655n ± 0% 1.654n ± 0% ~ (p=0.589 n=10) Sub64 1.201n ± 0% 1.201n ± 0% ~ (p=1.000 n=10) Sub64multiple 2.150n ± 0% 2.180n ± 4% +1.37% (p=0.000 n=10) Mul 0.9341n ± 0% 0.9345n ± 0% +0.04% (p=0.011 n=10) Mul32 1.053n ± 0% 1.030n ± 0% -2.23% (p=0.000 n=10) Mul64 0.9341n ± 0% 0.9345n ± 0% +0.04% (p=0.018 n=10) Div 11.59n ± 0% 11.57n ± 1% ~ (p=0.091 n=10) Div32 4.337n ± 0% 4.337n ± 1% ~ (p=0.783 n=10) Div64 12.81n ± 0% 12.76n ± 0% -0.39% (p=0.001 n=10) geomean 1.257n 1.252n -0.46% Change-Id: I9e93ea49736760c19dc6b6463d2aa95878121b7b Reviewed-on: https://go-review.googlesource.com/c/go/+/627855 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: abner chenc <chenguoqi@loongson.cn> Reviewed-by: Meidan Li <limeidan@loongson.cn> Reviewed-by: Junyang Shao <shaojunyang@google.com>
2024-11-14 11:35:39 +08:00
// loong64:"SRA\t"
// ppc64x:"SRAW"
cmd/compile: optimize right shifts of int32 on riscv64 The compiler is currently sign extending 32 bit signed integers to 64 bits before right shifting them using a 64 bit shift instruction. There's no need to do this as RISC-V has instructions for right shifting 32 bit signed values (sraw and sraiw) which sign extend the result of the shift to 64 bits. Change the compiler so that it uses sraw and sraiw for shifts of signed 32 bit integers reducing in most cases the number of instructions needed to perform the shift. Here are some examples of code sequences that are changed by this patch: int32(a) >> 2 before: sll x5,x10,0x20 sra x10,x5,0x22 after: sraw x10,x10,0x2 int32(v) >> int(s) before: sext.w x5,x10 sltiu x6,x11,64 add x6,x6,-1 or x6,x11,x6 sra x10,x5,x6 after: sltiu x5,x11,32 add x5,x5,-1 or x5,x11,x5 sraw x10,x10,x5 int32(v) >> (int(s) & 31) before: sext.w x5,x10 and x6,x11,63 sra x10,x5,x6 after: and x5,x11,31 sraw x10,x10,x5 int32(100) >> int(a) before: bltz x10,<target address calls runtime.panicshift> sltiu x5,x10,64 add x5,x5,-1 or x5,x10,x5 li x6,100 sra x10,x6,x5 after: bltz x10,<target address calls runtime.panicshift> sltiu x5,x10,32 add x5,x5,-1 or x5,x10,x5 li x6,100 sraw x10,x6,x5 int32(v) >> (int(s) & 63) before: sext.w x5,x10 and x6,x11,63 sra x10,x5,x6 after: and x5,x11,63 sltiu x6,x5,32 add x6,x6,-1 or x5,x5,x6 sraw x10,x10,x5 In most cases we eliminate one instruction. In the case where we shift a int32 constant by a variable the number of instructions generated is identical. A sra is simply replaced by a sraw. In the unusual case where we shift right by a variable anded with a constant > 31 but < 64, we generate two additional instructions. As this is an unusual case we do not try to optimize for it. Some improvements can be seen in some of the existing benchmarks, notably in the utf8 package which performs right shifts of runes which are signed 32 bit integers. 
| utf8-old | utf8-new | | sec/op | sec/op vs base | EncodeASCIIRune-4 17.68n ± 0% 17.67n ± 0% ~ (p=0.312 n=10) EncodeJapaneseRune-4 35.34n ± 0% 34.53n ± 1% -2.31% (p=0.000 n=10) AppendASCIIRune-4 3.213n ± 0% 3.213n ± 0% ~ (p=0.318 n=10) AppendJapaneseRune-4 36.14n ± 0% 35.35n ± 0% -2.19% (p=0.000 n=10) DecodeASCIIRune-4 28.11n ± 0% 27.36n ± 0% -2.69% (p=0.000 n=10) DecodeJapaneseRune-4 38.55n ± 0% 38.58n ± 0% ~ (p=0.612 n=10) Change-Id: I60a91cbede9ce65597571c7b7dd9943eeb8d3cc2 Reviewed-on: https://go-review.googlesource.com/c/go/+/535115 Run-TryBot: Joel Sing <joel@sing.id.au> TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Joel Sing <joel@sing.id.au> Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: M Zhuo <mzh@golangcn.org> Reviewed-by: David Chase <drchase@google.com>
2023-09-22 13:14:25 +00:00
// riscv64:"SRAIW",-"OR",-"SLTIU", -"MOVW"
return v >> uint64(29)
}
// lshConst64x32 returns v shifted left by the constant 33, where the shift
// count is written as a uint32 constant. Bits shifted past bit 63 are dropped.
//
// The arch-tagged comments inside the body are asmcheck directives (this file
// is an asmcheck test): each names an instruction pattern expected in the
// code generated for the statement that follows, and a pattern prefixed with
// "-" is one that must not appear. Keep them directly above the return.
func lshConst64x32(v int64) int64 {
	// loong64:"SLLV"
	// ppc64x:"SLD"
	// riscv64:"SLLI",-"AND",-"SLTIU"
	return v << uint32(33)
}
// rshConst64Ux32 returns v shifted right by the constant 33, where the shift
// count is written as a uint32 constant. v is unsigned, so the shift is
// logical: vacated high bits are filled with zeros.
//
// The arch-tagged comments inside the body are asmcheck directives (this file
// is an asmcheck test): each names an instruction pattern expected in the
// code generated for the statement that follows, and a pattern prefixed with
// "-" is one that must not appear. Keep them directly above the return.
func rshConst64Ux32(v uint64) uint64 {
	// loong64:"SRLV"
	// ppc64x:"SRD"
	// riscv64:"SRLI\t",-"AND",-"SLTIU"
	return v >> uint32(33)
}
// rshConst64x32 returns v shifted right by the constant 33, where the shift
// count is written as a uint32 constant. v is signed, so the shift is
// arithmetic: the sign bit is replicated into the vacated high bits.
//
// The arch-tagged comments inside the body are asmcheck directives (this file
// is an asmcheck test): each names an instruction pattern expected in the
// code generated for the statement that follows, and a pattern prefixed with
// "-" is one that must not appear. Keep them directly above the return.
func rshConst64x32(v int64) int64 {
	// loong64:"SRAV"
	// ppc64x:"SRAD"
	// riscv64:"SRAI\t",-"OR",-"SLTIU"
	return v >> uint32(33)
}
cmd/compile: prefer an add when shifting left by 1 ADD(Q|L) has generally twice the throughput. Came up in CL 626998. Throughput by arch: Zen 4: SHLL (R64, 1): 0.5 ADD (R64, R64): 0.25 Intel Alder Lake: SHLL (R64, 1): 0.5 ADD (R64, R64): 0.2 Intel Haswell: SHLL (R64, 1): 0.5 ADD (R64, R64): 0.25 Also include a minor opt for: (x + x) << c -> x << (c + 1) Before this, the code: func addShift(x int64) int64 { return (x + x) << 1 } emitted two instructions: ADDQ AX, AX SHLQ $1, AX but we can do it in a single shift: SHLQ $2, AX Add a codegen test for clearing the last bit. compilecmp linux/amd64: math math.sqrt 243 -> 242 (-0.41%) math [cmd/compile] math.sqrt 243 -> 242 (-0.41%) runtime runtime.selectgo 5455 -> 5445 (-0.18%) runtime.sysargs 665 -> 662 (-0.45%) runtime.isPinned 145 -> 141 (-2.76%) runtime.atoi64 198 -> 194 (-2.02%) runtime.setPinned 714 -> 709 (-0.70%) runtime [cmd/compile] runtime.sysargs 665 -> 662 (-0.45%) runtime.setPinned 714 -> 709 (-0.70%) runtime.atoi64 198 -> 194 (-2.02%) runtime.isPinned 145 -> 141 (-2.76%) strconv strconv.computeBounds 109 -> 107 (-1.83%) strconv.FormatInt 201 -> 197 (-1.99%) strconv.ryuFtoaShortest 1298 -> 1266 (-2.47%) strconv.small 144 -> 134 (-6.94%) strconv.AppendInt 357 -> 344 (-3.64%) strconv.ryuDigits32 490 -> 488 (-0.41%) strconv.AppendUint 342 -> 340 (-0.58%) strconv [cmd/compile] strconv.FormatInt 201 -> 197 (-1.99%) strconv.ryuFtoaShortest 1298 -> 1266 (-2.47%) strconv.ryuDigits32 490 -> 488 (-0.41%) strconv.AppendUint 342 -> 340 (-0.58%) strconv.computeBounds 109 -> 107 (-1.83%) strconv.small 144 -> 134 (-6.94%) strconv.AppendInt 357 -> 344 (-3.64%) image image.Rectangle.Inset 101 -> 97 (-3.96%) regexp/syntax regexp/syntax.inCharClass.func1 111 -> 110 (-0.90%) regexp/syntax.(*compiler).quest 586 -> 573 (-2.22%) regexp/syntax.ranges.Less 153 -> 150 (-1.96%) regexp/syntax.(*compiler).loop 583 -> 568 (-2.57%) time time.Time.Before 179 -> 161 (-10.06%) time.Time.Compare 189 -> 166 (-12.17%) time.Time.Sub 444 -> 425 
(-4.28%) time.Time.UnixMicro 106 -> 95 (-10.38%) time.div 592 -> 587 (-0.84%) time.Time.UnixNano 85 -> 78 (-8.24%) time.(*Time).UnixMilli 141 -> 140 (-0.71%) time.Time.UnixMilli 106 -> 95 (-10.38%) time.(*Time).UnixMicro 141 -> 140 (-0.71%) time.Time.After 179 -> 161 (-10.06%) time.Time.Equal 170 -> 150 (-11.76%) time.Time.AppendBinary 766 -> 757 (-1.17%) time.Time.IsZero 74 -> 66 (-10.81%) time.(*Time).UnixNano 124 -> 113 (-8.87%) time.(*Time).IsZero 113 -> 108 (-4.42%) regexp regexp.(*Regexp).FindAllStringSubmatch.func1 590 -> 569 (-3.56%) regexp.QuoteMeta 485 -> 469 (-3.30%) regexp/syntax [cmd/compile] regexp/syntax.inCharClass.func1 111 -> 110 (-0.90%) regexp/syntax.(*compiler).loop 583 -> 568 (-2.57%) regexp/syntax.(*compiler).quest 586 -> 573 (-2.22%) regexp/syntax.ranges.Less 153 -> 150 (-1.96%) encoding/base64 encoding/base64.decodedLen 92 -> 90 (-2.17%) encoding/base64.(*Encoding).DecodedLen 99 -> 97 (-2.02%) time [cmd/compile] time.(*Time).IsZero 113 -> 108 (-4.42%) time.Time.IsZero 74 -> 66 (-10.81%) time.(*Time).UnixNano 124 -> 113 (-8.87%) time.Time.UnixMilli 106 -> 95 (-10.38%) time.Time.Equal 170 -> 150 (-11.76%) time.Time.UnixMicro 106 -> 95 (-10.38%) time.(*Time).UnixMicro 141 -> 140 (-0.71%) time.Time.Before 179 -> 161 (-10.06%) time.Time.UnixNano 85 -> 78 (-8.24%) time.Time.AppendBinary 766 -> 757 (-1.17%) time.div 592 -> 587 (-0.84%) time.Time.After 179 -> 161 (-10.06%) time.Time.Compare 189 -> 166 (-12.17%) time.(*Time).UnixMilli 141 -> 140 (-0.71%) time.Time.Sub 444 -> 425 (-4.28%) index/suffixarray index/suffixarray.sais_8_32 1677 -> 1645 (-1.91%) index/suffixarray.sais_32 1677 -> 1645 (-1.91%) index/suffixarray.sais_64 1677 -> 1654 (-1.37%) index/suffixarray.sais_8_64 1677 -> 1654 (-1.37%) index/suffixarray.writeInt 249 -> 247 (-0.80%) os os.Expand 1070 -> 1051 (-1.78%) os.Chtimes 787 -> 774 (-1.65%) regexp [cmd/compile] regexp.(*Regexp).FindAllStringSubmatch.func1 590 -> 569 (-3.56%) regexp.QuoteMeta 485 -> 469 (-3.30%) encoding/base64 
[cmd/compile] encoding/base64.decodedLen 92 -> 90 (-2.17%) encoding/base64.(*Encoding).DecodedLen 99 -> 97 (-2.02%) encoding/hex encoding/hex.Encode 138 -> 136 (-1.45%) encoding/hex.(*decoder).Read 830 -> 824 (-0.72%) crypto/des crypto/des.initFeistelBox 235 -> 229 (-2.55%) crypto/des.cryptBlock 549 -> 538 (-2.00%) os [cmd/compile] os.Chtimes 787 -> 774 (-1.65%) os.Expand 1070 -> 1051 (-1.78%) math/big math/big.newFloat 238 -> 223 (-6.30%) math/big.nat.mul 2138 -> 2122 (-0.75%) math/big.karatsubaSqr 1372 -> 1369 (-0.22%) math/big.(*Float).sqrtInverse 895 -> 878 (-1.90%) math/big.basicSqr 1032 -> 1017 (-1.45%) cmd/vendor/golang.org/x/sys/unix cmd/vendor/golang.org/x/sys/unix.TimeToTimespec 72 -> 66 (-8.33%) encoding/json encoding/json.Indent 404 -> 403 (-0.25%) encoding/json.MarshalIndent 303 -> 297 (-1.98%) testing testing.(*T).Deadline 84 -> 82 (-2.38%) testing.(*M).Run 3545 -> 3525 (-0.56%) archive/zip archive/zip.headerFileInfo.ModTime 229 -> 223 (-2.62%) encoding/gob encoding/gob.(*encoderState).encodeInt 474 -> 469 (-1.05%) crypto/elliptic crypto/elliptic.Marshal 728 -> 714 (-1.92%) debug/buildinfo debug/buildinfo.readString 325 -> 315 (-3.08%) image/png image/png.(*decoder).readImagePass 10866 -> 10834 (-0.29%) archive/tar archive/tar.Header.allowedFormats.func3 1768 -> 1736 (-1.81%) archive/tar.formatPAXTime 389 -> 358 (-7.97%) archive/tar.(*Writer).writeGNUHeader 741 -> 727 (-1.89%) archive/tar.readGNUSparseMap0x1 709 -> 695 (-1.97%) archive/tar.(*Writer).templateV7Plus 915 -> 909 (-0.66%) crypto/internal/cryptotest crypto/internal/cryptotest.TestHash.func4 890 -> 879 (-1.24%) crypto/internal/cryptotest.TestStream.func6.1 646 -> 645 (-0.15%) crypto/internal/cryptotest.testCipher.func3 1300 -> 1289 (-0.85%) internal/pkgbits internal/pkgbits.(*Encoder).Int64 113 -> 103 (-8.85%) internal/pkgbits.(*Encoder).rawVarint 74 -> 72 (-2.70%) testing/quick testing/quick.(*Config).getRand 316 -> 315 (-0.32%) log/slog log/slog.TimeValue 489 -> 479 (-2.04%) runtime/pprof 
runtime/pprof.(*profileBuilder).build 2341 -> 2322 (-0.81%) internal/coverage/cfile internal/coverage/cfile.(*emitState).openMetaFile 824 -> 822 (-0.24%) internal/coverage/cfile.(*emitState).openCounterFile 904 -> 892 (-1.33%) cmd/internal/objabi cmd/internal/objabi.expandArgs 1177 -> 1169 (-0.68%) crypto/ecdsa crypto/ecdsa.pointFromAffine 1162 -> 1144 (-1.55%) net net.minNonzeroTime 313 -> 308 (-1.60%) net.cgoLookupAddrPTR 812 -> 797 (-1.85%) net.(*IPNet).String 851 -> 827 (-2.82%) net.IP.AppendText 488 -> 471 (-3.48%) net.IPMask.String 281 -> 270 (-3.91%) net.partialDeadline 374 -> 366 (-2.14%) net.hexString 249 -> 240 (-3.61%) net.IP.String 454 -> 453 (-0.22%) internal/fuzz internal/fuzz.newPcgRand 240 -> 234 (-2.50%) crypto/x509 crypto/x509.(*Certificate).isValid 2642 -> 2611 (-1.17%) cmd/internal/obj/s390x cmd/internal/obj/s390x.buildop 33676 -> 33644 (-0.10%) encoding/hex [cmd/compile] encoding/hex.(*decoder).Read 830 -> 824 (-0.72%) encoding/hex.Encode 138 -> 136 (-1.45%) cmd/internal/objabi [cmd/compile] cmd/internal/objabi.expandArgs 1177 -> 1169 (-0.68%) math/big [cmd/compile] math/big.(*Float).sqrtInverse 895 -> 878 (-1.90%) math/big.nat.mul 2138 -> 2122 (-0.75%) math/big.karatsubaSqr 1372 -> 1369 (-0.22%) math/big.basicSqr 1032 -> 1017 (-1.45%) math/big.newFloat 238 -> 223 (-6.30%) encoding/json [cmd/compile] encoding/json.MarshalIndent 303 -> 297 (-1.98%) encoding/json.Indent 404 -> 403 (-0.25%) cmd/covdata main.(*metaMerge).emitCounters 985 -> 973 (-1.22%) runtime/pprof [cmd/compile] runtime/pprof.(*profileBuilder).build 2341 -> 2322 (-0.81%) cmd/compile/internal/syntax cmd/compile/internal/syntax.(*source).fill 722 -> 703 (-2.63%) cmd/dist main.runInstall 19081 -> 19049 (-0.17%) crypto/tls crypto/tls.extractPadding 176 -> 175 (-0.57%) slices.Clone[[]crypto/tls.SignatureScheme,crypto/tls.SignatureScheme] 253 -> 247 (-2.37%) slices.Clone[[]uint16,uint16] 253 -> 247 (-2.37%) slices.Clone[[]crypto/tls.CurveID,crypto/tls.CurveID] 253 -> 247 (-2.37%) 
crypto/tls.(*Config).cipherSuites 335 -> 326 (-2.69%) slices.DeleteFunc[go.shape.[]crypto/tls.CurveID,go.shape.uint16] 437 -> 434 (-0.69%) crypto/tls.dial 1349 -> 1339 (-0.74%) slices.DeleteFunc[go.shape.[]uint16,go.shape.uint16] 437 -> 434 (-0.69%) internal/pkgbits [cmd/compile] internal/pkgbits.(*Encoder).Int64 113 -> 103 (-8.85%) internal/pkgbits.(*Encoder).rawVarint 74 -> 72 (-2.70%) cmd/compile/internal/syntax [cmd/compile] cmd/compile/internal/syntax.(*source).fill 722 -> 703 (-2.63%) cmd/internal/obj/s390x [cmd/compile] cmd/internal/obj/s390x.buildop 33676 -> 33644 (-0.10%) cmd/go/internal/trace cmd/go/internal/trace.Flow 910 -> 886 (-2.64%) cmd/go/internal/trace.(*Span).Done 311 -> 304 (-2.25%) cmd/go/internal/trace.StartSpan 620 -> 615 (-0.81%) cmd/internal/script cmd/internal/script.(*Engine).Execute.func2 534 -> 528 (-1.12%) cmd/link/internal/loader cmd/link/internal/loader.(*Loader).SetSymSect 344 -> 338 (-1.74%) net/http net/http.(*Transport).queueForIdleConn 1797 -> 1766 (-1.73%) net/http.(*Transport).getConn 2149 -> 2131 (-0.84%) net/http.(*http2ClientConn).tooIdleLocked 207 -> 197 (-4.83%) net/http.(*http2responseWriter).SetWriteDeadline.func1 520 -> 508 (-2.31%) net/http.(*Cookie).Valid 837 -> 818 (-2.27%) net/http.(*http2responseWriter).SetReadDeadline 373 -> 357 (-4.29%) net/http.checkIfRange 701 -> 690 (-1.57%) net/http.(*http2SettingsFrame).Value 325 -> 298 (-8.31%) net/http.(*http2SettingsFrame).HasDuplicates 777 -> 767 (-1.29%) net/http.(*Server).Serve 1746 -> 1739 (-0.40%) net/http.http2traceGotConn 569 -> 556 (-2.28%) net/http/pprof net/http/pprof.collectProfile 242 -> 239 (-1.24%) cmd/compile/internal/coverage cmd/compile/internal/coverage.metaHashAndLen 439 -> 438 (-0.23%) cmd/vendor/golang.org/x/telemetry/internal/upload cmd/vendor/golang.org/x/telemetry/internal/upload.(*uploader).findWork 4570 -> 4540 (-0.66%) cmd/vendor/golang.org/x/telemetry/internal/upload.(*uploader).reports 3604 -> 3572 (-0.89%) cmd/compile/internal/coverage 
[cmd/compile] cmd/compile/internal/coverage.metaHashAndLen 439 -> 438 (-0.23%) cmd/vendor/golang.org/x/text/language cmd/vendor/golang.org/x/text/language.regionGroupDist 287 -> 284 (-1.05%) cmd/go/internal/vcweb cmd/go/internal/vcweb.(*Server).overview.func1 1045 -> 1041 (-0.38%) cmd/go/internal/vcs cmd/go/internal/vcs.expand 761 -> 741 (-2.63%) cmd/compile/internal/inline/inlheur slices.stableCmpFunc[go.shape.struct 2300 -> 2284 (-0.70%) cmd/compile/internal/inline/inlheur [cmd/compile] slices.stableCmpFunc[go.shape.struct 2300 -> 2284 (-0.70%) cmd/go/internal/modfetch/codehost cmd/go/internal/modfetch/codehost.bzrParseStat 2217 -> 2213 (-0.18%) cmd/link/internal/ld cmd/link/internal/ld.decodetypeStructFieldCount 157 -> 152 (-3.18%) cmd/link/internal/ld.(*Link).address 12559 -> 12495 (-0.51%) cmd/link/internal/ld.(*dodataState).allocateDataSections 18345 -> 18205 (-0.76%) cmd/link/internal/ld.elfshreloc 618 -> 616 (-0.32%) cmd/link/internal/ld.(*deadcodePass).decodetypeMethods 794 -> 779 (-1.89%) cmd/link/internal/ld.(*dodataState).assignDsymsToSection 668 -> 663 (-0.75%) cmd/link/internal/ld.relocSectFn 285 -> 284 (-0.35%) cmd/link/internal/ld.decodetypeIfaceMethodCount 146 -> 144 (-1.37%) cmd/link/internal/ld.decodetypeArrayLen 157 -> 152 (-3.18%) cmd/link/internal/arm64 cmd/link/internal/arm64.gensymlate.func1 895 -> 888 (-0.78%) cmd/go/internal/modload cmd/go/internal/modload.queryProxy.func3 1029 -> 1012 (-1.65%) cmd/go/internal/load cmd/go/internal/load.(*Package).setBuildInfo 8453 -> 8447 (-0.07%) cmd/go/internal/clean cmd/go/internal/clean.runClean 2120 -> 2104 (-0.75%) cmd/compile/internal/ssa cmd/compile/internal/ssa.(*poset).aliasnodes 2010 -> 1978 (-1.59%) cmd/compile/internal/ssa.rewriteValueARM64_OpARM64MOVHstoreidx2 730 -> 719 (-1.51%) cmd/compile/internal/ssa.(*debugState).buildLocationLists 3326 -> 3294 (-0.96%) cmd/compile/internal/ssa.rewriteValueAMD64_OpAMD64ADDLconst 3069 -> 2941 (-4.17%) cmd/compile/internal/ssa.(*debugState).processValue 
9756 -> 9724 (-0.33%) cmd/compile/internal/ssa.rewriteValueAMD64_OpAMD64ADDQconst 3069 -> 2941 (-4.17%) cmd/compile/internal/ssa.(*poset).mergeroot 1079 -> 1054 (-2.32%) cmd/compile/internal/ssa [cmd/compile] cmd/compile/internal/ssa.rewriteValueARM64_OpARM64MOVHstoreidx2 730 -> 719 (-1.51%) cmd/compile/internal/ssa.(*poset).aliasnodes 2010 -> 1978 (-1.59%) cmd/compile/internal/ssa.(*poset).mergeroot 1079 -> 1054 (-2.32%) cmd/compile/internal/ssa.rewriteValueAMD64_OpAMD64ADDQconst 3069 -> 2941 (-4.17%) cmd/compile/internal/ssa.rewriteValueAMD64_OpAMD64ADDLconst 3069 -> 2941 (-4.17%) file before after Δ % math/bits.s 2352 2354 +2 +0.085% math/bits [cmd/compile].s 2352 2354 +2 +0.085% math.s 35675 35674 -1 -0.003% math [cmd/compile].s 35675 35674 -1 -0.003% runtime.s 577251 577245 -6 -0.001% runtime [cmd/compile].s 642419 642438 +19 +0.003% sort.s 37434 37435 +1 +0.003% strconv.s 48391 48343 -48 -0.099% sort [cmd/compile].s 37434 37435 +1 +0.003% bufio.s 21386 21418 +32 +0.150% strconv [cmd/compile].s 48391 48343 -48 -0.099% image.s 34978 35022 +44 +0.126% regexp/syntax.s 81719 81781 +62 +0.076% time.s 94341 94184 -157 -0.166% regexp.s 60411 60399 -12 -0.020% bufio [cmd/compile].s 21512 21544 +32 +0.149% encoding/binary.s 34062 34087 +25 +0.073% regexp/syntax [cmd/compile].s 81719 81781 +62 +0.076% encoding/base64.s 11907 11903 -4 -0.034% time [cmd/compile].s 94341 94184 -157 -0.166% index/suffixarray.s 41633 41527 -106 -0.255% os.s 101770 101738 -32 -0.031% regexp [cmd/compile].s 60411 60399 -12 -0.020% encoding/binary [cmd/compile].s 37173 37198 +25 +0.067% encoding/base64 [cmd/compile].s 11907 11903 -4 -0.034% os/exec.s 23900 23907 +7 +0.029% encoding/hex.s 6038 6030 -8 -0.132% crypto/des.s 5073 5056 -17 -0.335% os [cmd/compile].s 102030 101998 -32 -0.031% vendor/golang.org/x/net/http2/hpack.s 22027 22033 +6 +0.027% math/big.s 164808 164753 -55 -0.033% cmd/vendor/golang.org/x/sys/unix.s 121450 121444 -6 -0.005% encoding/json.s 110294 110287 -7 -0.006% testing.s 
115303 115281 -22 -0.019% archive/zip.s 65329 65325 -4 -0.006% os/user.s 10078 10080 +2 +0.020% encoding/gob.s 143788 143783 -5 -0.003% crypto/elliptic.s 30686 30704 +18 +0.059% go/doc/comment.s 49401 49433 +32 +0.065% debug/buildinfo.s 9095 9085 -10 -0.110% image/png.s 36113 36081 -32 -0.089% archive/tar.s 71994 71897 -97 -0.135% crypto/internal/cryptotest.s 60872 60849 -23 -0.038% internal/pkgbits.s 20441 20429 -12 -0.059% testing/quick.s 8236 8235 -1 -0.012% log/slog.s 77568 77558 -10 -0.013% internal/trace/internal/oldtrace.s 52885 52896 +11 +0.021% runtime/pprof.s 123978 123969 -9 -0.007% internal/coverage/cfile.s 25198 25184 -14 -0.056% cmd/internal/objabi.s 19954 19946 -8 -0.040% crypto/ecdsa.s 29159 29141 -18 -0.062% log/slog/internal/benchmarks.s 6694 6695 +1 +0.015% net.s 299569 299503 -66 -0.022% os/exec [cmd/compile].s 23888 23895 +7 +0.029% internal/trace.s 179226 179240 +14 +0.008% internal/fuzz.s 86190 86191 +1 +0.001% crypto/x509.s 177195 177164 -31 -0.017% cmd/internal/obj/s390x.s 121642 121610 -32 -0.026% cmd/internal/obj/ppc64.s 140118 140122 +4 +0.003% encoding/hex [cmd/compile].s 6149 6141 -8 -0.130% cmd/internal/objabi [cmd/compile].s 19954 19946 -8 -0.040% cmd/internal/obj/arm64.s 158523 158555 +32 +0.020% go/doc/comment [cmd/compile].s 49512 49544 +32 +0.065% math/big [cmd/compile].s 166394 166339 -55 -0.033% encoding/json [cmd/compile].s 110712 110705 -7 -0.006% cmd/covdata.s 39699 39687 -12 -0.030% runtime/pprof [cmd/compile].s 125209 125200 -9 -0.007% cmd/compile/internal/syntax.s 181755 181736 -19 -0.010% cmd/dist.s 177893 177861 -32 -0.018% crypto/tls.s 389157 389113 -44 -0.011% internal/pkgbits [cmd/compile].s 41644 41632 -12 -0.029% cmd/compile/internal/syntax [cmd/compile].s 196105 196086 -19 -0.010% cmd/compile/internal/types.s 71315 71345 +30 +0.042% cmd/internal/obj/s390x [cmd/compile].s 121733 121701 -32 -0.026% cmd/go/internal/trace.s 4796 4760 -36 -0.751% cmd/internal/obj/arm64 [cmd/compile].s 168120 168147 +27 +0.016% 
cmd/internal/obj/ppc64 [cmd/compile].s 140219 140223 +4 +0.003% cmd/internal/script.s 83442 83436 -6 -0.007% cmd/link/internal/loader.s 93299 93294 -5 -0.005% net/http.s 620639 620472 -167 -0.027% net/http/pprof.s 35016 35013 -3 -0.009% cmd/compile/internal/coverage.s 6668 6667 -1 -0.015% cmd/vendor/golang.org/x/telemetry/internal/upload.s 34210 34148 -62 -0.181% cmd/compile/internal/coverage [cmd/compile].s 6664 6663 -1 -0.015% cmd/vendor/golang.org/x/text/language.s 48077 48074 -3 -0.006% cmd/go/internal/vcweb.s 45193 45189 -4 -0.009% cmd/go/internal/vcs.s 44749 44729 -20 -0.045% cmd/compile/internal/inline/inlheur.s 83758 83742 -16 -0.019% cmd/compile/internal/inline/inlheur [cmd/compile].s 84773 84757 -16 -0.019% cmd/go/internal/modfetch/codehost.s 89098 89094 -4 -0.004% cmd/trace.s 257550 257564 +14 +0.005% cmd/link/internal/ld.s 641945 641706 -239 -0.037% cmd/link/internal/arm64.s 34805 34798 -7 -0.020% cmd/go/internal/modload.s 328971 328954 -17 -0.005% cmd/go/internal/load.s 178877 178871 -6 -0.003% cmd/go/internal/clean.s 11006 10990 -16 -0.145% cmd/compile/internal/ssa.s 3552843 3553347 +504 +0.014% cmd/compile/internal/ssa [cmd/compile].s 3752511 3753123 +612 +0.016% total 36179015 36178687 -328 -0.001% Change-Id: I251c2898ccf3c9931d162d87dabbd49cf4ec73a5 Reviewed-on: https://go-review.googlesource.com/c/go/+/641757 Reviewed-by: Keith Randall <khr@google.com> Auto-Submit: Keith Randall <khr@golang.org> Reviewed-by: Keith Randall <khr@golang.org> Reviewed-by: Cherry Mui <cherryyz@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
2025-01-11 19:26:57 +01:00
// lshConst32x1Add returns (x + x) << 1 for a 32-bit signed operand.
//
// It is a codegen fixture: the asmcheck directive above the return statement
// requires the amd64 output for that line to contain an SHLL with the
// immediate 2, reflecting the rewrite (x + x) << c -> x << (c + 1).
func lshConst32x1Add(x int32) int32 {
// amd64:"SHLL\t[$]2"
return (x + x) << 1
}
// lshConst64x1Add returns (x + x) << 1 for a 64-bit signed operand.
//
// It is a codegen fixture: the asmcheck directive above the return statement
// requires the amd64 output for that line to contain an SHLQ with the
// immediate 2, reflecting the rewrite (x + x) << c -> x << (c + 1).
func lshConst64x1Add(x int64) int64 {
// amd64:"SHLQ\t[$]2"
return (x + x) << 1
}
// lshConst32x2Add returns (x + x) << 2 for a 32-bit signed operand.
//
// It is a codegen fixture: the asmcheck directive above the return statement
// requires the amd64 output for that line to contain an SHLL with the
// immediate 3, reflecting the rewrite (x + x) << c -> x << (c + 1).
func lshConst32x2Add(x int32) int32 {
// amd64:"SHLL\t[$]3"
return (x + x) << 2
}
// lshConst64x2Add returns (x + x) << 2 for a 64-bit signed operand.
//
// It is a codegen fixture: the asmcheck directive above the return statement
// requires the amd64 output for that line to contain an SHLQ with the
// immediate 3, reflecting the rewrite (x + x) << c -> x << (c + 1).
func lshConst64x2Add(x int64) int64 {
// amd64:"SHLQ\t[$]3"
return (x + x) << 2
}
// ------------------ //
// masked shifts //
// ------------------ //
// lshMask64x64 returns v shifted left by s & 63, so the effective shift
// count is always in the range 0-63.
//
// It is a codegen fixture. The asmcheck directives above the return statement
// pin the per-architecture instruction selection for that line: a quoted
// pattern must match the generated assembly, and a pattern prefixed with '-'
// must not match. The directives must stay directly above the statement they
// describe.
func lshMask64x64(v int64, s uint64) int64 {
	// arm64:"LSL",-"AND"
	// loong64:"SLLV",-"AND"
	// ppc64x:"RLDICL",-"ORN",-"ISEL"
	// riscv64:"SLL",-"AND\t",-"SLTIU"
	// s390x:-"RISBGZ",-"AND",-"LOCGR"
	return v << (s & 63)
}
// rshMask64Ux64 returns the unsigned value v shifted right (logically) by
// s & 63, so the effective shift count is always in the range 0-63.
//
// It is a codegen fixture. The asmcheck directives above the return statement
// pin the per-architecture instruction selection for that line: a quoted
// pattern must match the generated assembly, and a pattern prefixed with '-'
// must not match. The directives must stay directly above the statement they
// describe.
func rshMask64Ux64(v uint64, s uint64) uint64 {
	// arm64:"LSR",-"AND",-"CSEL"
	// loong64:"SRLV",-"AND"
	// ppc64x:"RLDICL",-"ORN",-"ISEL"
	// riscv64:"SRL\t",-"AND\t",-"SLTIU"
	// s390x:-"RISBGZ",-"AND",-"LOCGR"
	return v >> (s & 63)
}
// rshMask64x64 shifts the signed 64-bit value v right by s&63, so the shift
// count is always below the 64-bit operand width.
//
// The directives inside the body list, per architecture, instruction patterns
// that must appear and (with a leading '-') patterns that must not appear in
// the code generated for the return statement.
func rshMask64x64(v int64, s uint64) int64 {
	// arm64:"ASR",-"AND",-"CSEL"
	// loong64:"SRAV",-"AND"
	// ppc64x:"RLDICL",-"ORN",-"ISEL"
	// riscv64:"SRA\t",-"OR",-"SLTIU"
	// s390x:-"RISBGZ",-"AND",-"LOCGR"
	return v >> (s & 63)
}
// lshMask32x64 shifts the 32-bit value v left by s&63. The mask is wider than
// the 32-bit operand, so counts from 32 through 63 remain possible; for those
// the result is 0.
//
// The directives inside the body list, per architecture, instruction patterns
// that must appear and (with a leading '-') patterns that must not appear in
// the code generated for the return statement. The loong64 directive expects
// additional instructions alongside the shift, while the arm64, riscv64 and
// s390x directives forbid them.
func lshMask32x64(v int32, s uint64) int32 {
	// arm64:"LSL",-"AND"
	// loong64:"SLL\t","AND","SGTU","MASKEQZ"
	// ppc64x:"ISEL",-"ORN"
	// riscv64:"SLL",-"AND\t",-"SLTIU"
	// s390x:-"RISBGZ",-"AND",-"LOCGR"
	return v << (s & 63)
}
cmd/compile: simplify bounded shift on loong64 Use the shiftIsBounded function to generate more efficient shift instructions. This change also optimize shift ops when the shift value is v&63 and v&31. goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A6000-HV @ 2500.00MHz | CL 627855 | this CL | | sec/op | sec/op vs base | LeadingZeros 1.1005n ± 0% 0.8425n ± 1% -23.44% (p=0.000 n=10) LeadingZeros8 1.502n ± 0% 1.501n ± 0% -0.07% (p=0.001 n=10) LeadingZeros16 1.502n ± 0% 1.501n ± 0% -0.07% (p=0.000 n=10) LeadingZeros32 0.9511n ± 0% 0.8050n ± 0% -15.36% (p=0.000 n=10) LeadingZeros64 1.1195n ± 0% 0.8423n ± 0% -24.76% (p=0.000 n=10) TrailingZeros 0.8086n ± 0% 0.8005n ± 0% -1.00% (p=0.000 n=10) TrailingZeros8 1.031n ± 1% 1.035n ± 1% ~ (p=0.136 n=10) TrailingZeros16 0.8114n ± 0% 0.8254n ± 1% +1.73% (p=0.000 n=10) TrailingZeros32 0.8090n ± 0% 0.8005n ± 0% -1.05% (p=0.000 n=10) TrailingZeros64 0.8089n ± 1% 0.8005n ± 0% -1.04% (p=0.000 n=10) OnesCount 0.8677n ± 0% 1.2010n ± 0% +38.41% (p=0.000 n=10) OnesCount8 0.8009n ± 0% 0.8004n ± 0% -0.06% (p=0.000 n=10) OnesCount16 0.9344n ± 0% 1.2010n ± 0% +28.53% (p=0.000 n=10) OnesCount32 0.8677n ± 0% 1.2010n ± 0% +38.41% (p=0.000 n=10) OnesCount64 1.2010n ± 0% 0.8671n ± 0% -27.80% (p=0.000 n=10) RotateLeft 0.8009n ± 0% 0.6671n ± 0% -16.71% (p=0.000 n=10) RotateLeft8 1.202n ± 0% 1.327n ± 0% +10.40% (p=0.000 n=10) RotateLeft16 0.8036n ± 0% 0.8218n ± 0% +2.26% (p=0.000 n=10) RotateLeft32 0.6674n ± 0% 0.8004n ± 0% +19.94% (p=0.000 n=10) RotateLeft64 0.6674n ± 0% 0.8004n ± 0% +19.94% (p=0.000 n=10) Reverse 0.4067n ± 1% 0.4122n ± 1% +1.38% (p=0.001 n=10) Reverse8 0.8009n ± 0% 0.8004n ± 0% -0.06% (p=0.000 n=10) Reverse16 0.8009n ± 0% 0.8005n ± 0% -0.05% (p=0.000 n=10) Reverse32 0.8009n ± 0% 0.8004n ± 0% -0.06% (p=0.001 n=10) Reverse64 0.8009n ± 0% 0.8004n ± 0% -0.06% (p=0.008 n=10) ReverseBytes 0.4057n ± 1% 0.4133n ± 1% +1.90% (p=0.000 n=10) ReverseBytes16 0.8009n ± 0% 0.8004n ± 0% -0.07% (p=0.000 n=10) ReverseBytes32 0.8009n ± 0% 
0.8005n ± 0% -0.05% (p=0.000 n=10) ReverseBytes64 0.8009n ± 0% 0.8004n ± 0% -0.06% (p=0.000 n=10) Add 1.201n ± 0% 1.201n ± 0% ~ (p=1.000 n=10) Add32 1.201n ± 0% 1.201n ± 0% ~ (p=0.474 n=10) Add64 1.201n ± 0% 1.201n ± 0% ~ (p=1.000 n=10) Add64multiple 1.832n ± 0% 1.828n ± 0% -0.22% (p=0.001 n=10) Sub 1.201n ± 0% 1.201n ± 0% ~ (p=1.000 n=10) Sub32 1.602n ± 0% 1.601n ± 0% -0.06% (p=0.000 n=10) Sub64 1.201n ± 0% 1.201n ± 0% ~ (p=0.474 n=10) Sub64multiple 2.402n ± 0% 2.400n ± 0% -0.10% (p=0.000 n=10) Mul 0.8009n ± 0% 0.8004n ± 0% -0.06% (p=0.000 n=10) Mul32 0.8009n ± 0% 0.8004n ± 0% -0.06% (p=0.000 n=10) Mul64 0.8008n ± 0% 0.8004n ± 0% -0.05% (p=0.000 n=10) Div 9.083n ± 0% 7.638n ± 0% -15.91% (p=0.000 n=10) Div32 4.011n ± 0% 4.009n ± 0% -0.05% (p=0.000 n=10) Div64 9.711n ± 0% 8.204n ± 0% -15.51% (p=0.000 n=10) geomean 1.083n 1.078n -0.40% goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A5000 @ 2500.00MHz | CL 627855 | this CL | | sec/op | sec/op vs base | LeadingZeros 1.341n ± 4% 1.331n ± 2% -0.71% (p=0.008 n=10) LeadingZeros8 1.781n ± 0% 1.766n ± 1% -0.84% (p=0.011 n=10) LeadingZeros16 1.782n ± 0% 1.767n ± 0% -0.79% (p=0.001 n=10) LeadingZeros32 1.341n ± 1% 1.333n ± 0% -0.52% (p=0.001 n=10) LeadingZeros64 1.338n ± 0% 1.333n ± 0% -0.37% (p=0.008 n=10) TrailingZeros 0.9025n ± 0% 0.8077n ± 0% -10.50% (p=0.000 n=10) TrailingZeros8 1.056n ± 0% 1.089n ± 1% +3.17% (p=0.001 n=10) TrailingZeros16 1.101n ± 0% 1.102n ± 0% +0.09% (p=0.011 n=10) TrailingZeros32 0.9024n ± 1% 0.8083n ± 0% -10.43% (p=0.000 n=10) TrailingZeros64 0.9028n ± 1% 0.8087n ± 0% -10.43% (p=0.000 n=10) OnesCount 1.482n ± 1% 1.302n ± 0% -12.15% (p=0.000 n=10) OnesCount8 1.206n ± 0% 1.207n ± 2% +0.12% (p=0.000 n=10) OnesCount16 1.534n ± 0% 1.402n ± 0% -8.58% (p=0.000 n=10) OnesCount32 1.531n ± 1% 1.302n ± 0% -14.99% (p=0.000 n=10) OnesCount64 1.302n ± 0% 1.538n ± 1% +18.16% (p=0.000 n=10) RotateLeft 0.8083n ± 0% 0.8087n ± 1% ~ (p=0.579 n=10) RotateLeft8 1.310n ± 0% 1.323n ± 0% +0.95% (p=0.001 n=10) 
RotateLeft16 1.149n ± 0% 1.165n ± 1% +1.35% (p=0.001 n=10) RotateLeft32 0.8093n ± 0% 0.8105n ± 0% ~ (p=0.393 n=10) RotateLeft64 0.8088n ± 0% 0.8090n ± 0% ~ (p=0.739 n=10) Reverse 0.5109n ± 0% 0.5172n ± 1% +1.25% (p=0.000 n=10) Reverse8 0.8010n ± 0% 0.8011n ± 0% +0.01% (p=0.000 n=10) Reverse16 0.8010n ± 0% 0.8011n ± 0% +0.01% (p=0.002 n=10) Reverse32 0.8010n ± 0% 0.8011n ± 0% +0.01% (p=0.000 n=10) Reverse64 0.8010n ± 0% 0.8011n ± 0% +0.01% (p=0.005 n=10) ReverseBytes 0.5122n ± 2% 0.5182n ± 1% ~ (p=0.060 n=10) ReverseBytes16 0.8010n ± 0% 0.8011n ± 0% +0.01% (p=0.005 n=10) ReverseBytes32 0.8010n ± 0% 0.8011n ± 0% +0.01% (p=0.005 n=10) ReverseBytes64 0.8010n ± 0% 0.8011n ± 0% +0.01% (p=0.001 n=10) Add 1.201n ± 4% 1.202n ± 0% +0.08% (p=0.028 n=10) Add32 1.201n ± 0% 1.202n ± 2% +0.08% (p=0.014 n=10) Add64 1.201n ± 1% 1.202n ± 0% +0.08% (p=0.025 n=10) Add64multiple 1.902n ± 0% 1.913n ± 0% +0.55% (p=0.004 n=10) Sub 1.201n ± 0% 1.202n ± 3% +0.08% (p=0.001 n=10) Sub32 1.654n ± 0% 1.656n ± 1% ~ (p=0.117 n=10) Sub64 1.201n ± 0% 1.202n ± 0% +0.08% (p=0.001 n=10) Sub64multiple 2.180n ± 4% 2.159n ± 1% -0.96% (p=0.006 n=10) Mul 0.9345n ± 0% 0.9346n ± 0% +0.01% (p=0.000 n=10) Mul32 1.030n ± 0% 1.050n ± 1% +1.94% (p=0.000 n=10) Mul64 0.9345n ± 0% 0.9346n ± 1% +0.01% (p=0.000 n=10) Div 11.57n ± 1% 11.12n ± 0% -3.85% (p=0.000 n=10) Div32 4.337n ± 1% 4.341n ± 1% ~ (p=0.286 n=10) Div64 12.76n ± 0% 12.02n ± 3% -5.80% (p=0.000 n=10) geomean 1.252n 1.235n -1.32% Change-Id: Iec4cfd2b83bb0f946068c1d657369ff081d95b04 Reviewed-on: https://go-review.googlesource.com/c/go/+/628575 Reviewed-by: abner chenc <chenguoqi@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Junyang Shao <shaojunyang@google.com> Reviewed-by: David Chase <drchase@google.com>
2024-11-16 16:27:20 +08:00
// lsh5Mask32x64 shifts the 32-bit value v left by s&31. The five-bit mask
// keeps the shift count below the 32-bit operand width.
//
// The directive inside the body requires the loong64 code generated for the
// return statement to match the SLL pattern and not to match AND.
func lsh5Mask32x64(v int32, s uint64) int32 {
// loong64:"SLL\t",-"AND"
return v << (s & 31)
}
// rshMask32Ux64 shifts the unsigned 32-bit value v right by s&63. The mask is
// wider than the 32-bit operand, so counts from 32 through 63 remain possible;
// for those the result is 0.
//
// The directives inside the body list, per architecture, instruction patterns
// that must appear and (with a leading '-') patterns that must not appear in
// the code generated for the return statement. The loong64 and riscv64
// directives expect additional instructions alongside the 32-bit shift, and
// the riscv64 directive also forbids the SRL pattern.
func rshMask32Ux64(v uint32, s uint64) uint32 {
	// arm64:"LSR",-"AND"
	// loong64:"SRL\t","AND","SGTU","MASKEQZ"
	// ppc64x:"ISEL",-"ORN"
	// riscv64:"SRLW","SLTIU","NEG","AND\t",-"SRL\t"
	// s390x:-"RISBGZ",-"AND",-"LOCGR"
	return v >> (s & 63)
}
cmd/compile: optimize right shifts of uint32 on riscv The compiler is currently zero extending 32 bit unsigned integers to 64 bits before right shifting them using a 64 bit shift instruction. There's no need to do this as RISC-V has instructions for right shifting 32 bit unsigned values (srlw and srliw) which zero extend the result of the shift to 64 bits. Change the compiler so that it uses srlw and srliw for 32 bit unsigned shifts reducing in most cases the number of instructions needed to perform the shift. Here are some examples of code sequences that are changed by this patch: uint32(a) >> 2 before: sll x5,x10,0x20 srl x10,x5,0x22 after: srlw x10,x10,0x2 uint32(a) >> int(b) before: sll x5,x10,0x20 srl x5,x5,0x20 srl x5,x5,x11 sltiu x6,x11,64 neg x6,x6 and x10,x5,x6 after: srlw x5,x10,x11 sltiu x6,x11,32 neg x6,x6 and x10,x5,x6 bits.RotateLeft32(uint32(a), 1) before: sll x5,x10,0x1 sll x6,x10,0x20 srl x7,x6,0x3f or x5,x5,x7 after: sll x5,x10,0x1 srlw x6,x10,0x1f or x10,x5,x6 bits.RotateLeft32(uint32(a), int(b)) before: and x6,x11,31 sll x7,x10,x6 sll x8,x10,0x20 srl x8,x8,0x20 add x6,x6,-32 neg x6,x6 srl x9,x8,x6 sltiu x6,x6,64 neg x6,x6 and x6,x9,x6 or x6,x6,x7 after: and x5,x11,31 sll x6,x10,x5 add x5,x5,-32 neg x5,x5 srlw x7,x10,x5 sltiu x5,x5,32 neg x5,x5 and x5,x7,x5 or x10,x6,x5 The one regression observed is the following case, an unbounded right shift of a uint32 where the value we're shifting by is known to be < 64 but > 31. As this is an unusual case this commit does not optimize for it, although the existing code does. uint32(a) >> (b & 63) before: sll x5,x10,0x20 srl x5,x5,0x20 and x6,x11,63 srl x10,x5,x6 after and x5,x11,63 srlw x6,x10,x5 sltiu x5,x5,32 neg x5,x5 and x10,x6,x5 Here we have one extra instruction. Some benchmark highlights, generated on a VisionFive2 8GB running Ubuntu 23.04. 
pkg: math/bits LeadingZeros32-4 18.64n ± 0% 17.32n ± 0% -7.11% (p=0.000 n=10) LeadingZeros64-4 15.47n ± 0% 15.51n ± 0% +0.26% (p=0.027 n=10) TrailingZeros16-4 18.48n ± 0% 17.68n ± 0% -4.33% (p=0.000 n=10) TrailingZeros32-4 16.87n ± 0% 16.07n ± 0% -4.74% (p=0.000 n=10) TrailingZeros64-4 15.26n ± 0% 15.27n ± 0% +0.07% (p=0.043 n=10) OnesCount32-4 20.08n ± 0% 19.29n ± 0% -3.96% (p=0.000 n=10) RotateLeft-4 8.864n ± 0% 8.838n ± 0% -0.30% (p=0.006 n=10) RotateLeft32-4 8.837n ± 0% 8.032n ± 0% -9.11% (p=0.000 n=10) Reverse32-4 29.77n ± 0% 26.52n ± 0% -10.93% (p=0.000 n=10) ReverseBytes32-4 9.640n ± 0% 8.838n ± 0% -8.32% (p=0.000 n=10) Sub32-4 8.835n ± 0% 8.035n ± 0% -9.06% (p=0.000 n=10) geomean 11.50n 11.33n -1.45% pkg: crypto/md5 Hash8Bytes-4 1.486µ ± 0% 1.426µ ± 0% -4.04% (p=0.000 n=10) Hash64-4 2.079µ ± 0% 1.968µ ± 0% -5.36% (p=0.000 n=10) Hash128-4 2.720µ ± 0% 2.557µ ± 0% -5.99% (p=0.000 n=10) Hash256-4 3.996µ ± 0% 3.733µ ± 0% -6.58% (p=0.000 n=10) Hash512-4 6.541µ ± 0% 6.072µ ± 0% -7.18% (p=0.000 n=10) Hash1K-4 11.64µ ± 0% 10.75µ ± 0% -7.58% (p=0.000 n=10) Hash8K-4 82.95µ ± 0% 76.32µ ± 0% -7.99% (p=0.000 n=10) Hash1M-4 10.436m ± 0% 9.591m ± 0% -8.10% (p=0.000 n=10) Hash8M-4 83.50m ± 0% 76.73m ± 0% -8.10% (p=0.000 n=10) Hash8BytesUnaligned-4 1.494µ ± 0% 1.434µ ± 0% -4.02% (p=0.000 n=10) Hash1KUnaligned-4 11.64µ ± 0% 10.76µ ± 0% -7.52% (p=0.000 n=10) Hash8KUnaligned-4 83.01µ ± 0% 76.32µ ± 0% -8.07% (p=0.000 n=10) geomean 28.32µ 26.42µ -6.72% Change-Id: I20483a6668cca1b53fe83944bee3706aadcf8693 Reviewed-on: https://go-review.googlesource.com/c/go/+/528975 Reviewed-by: Michael Pratt <mpratt@google.com> Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: Joel Sing <joel@sing.id.au> Run-TryBot: Joel Sing <joel@sing.id.au> TryBot-Result: Gopher Robot <gobot@golang.org>
2023-09-17 13:08:55 +02:00
// rsh5Mask32Ux64 performs a logical (unsigned) right shift of v by the low
// five bits of s. Masking with 31 keeps the shift count in [0, 31], which is
// always below the 32-bit width of v.
//
// The asmcheck directives attached to the return statement require:
//   - loong64: an SRL instruction and no AND.
//   - riscv64: an SRLW instruction and no AND, SLTIU, or SRL. The "\t" suffix
//     in a pattern demands a tab right after the mnemonic, so "SRL\t" does
//     not match SRLW.
func rsh5Mask32Ux64(v uint32, s uint64) uint32 {
	// loong64:"SRL\t",-"AND"
	// riscv64:"SRLW",-"AND\t",-"SLTIU",-"SRL\t"
	return v >> (s & 31)
}
// rshMask32x64 performs an arithmetic (signed) right shift of v by the low six
// bits of s. Masking with 63 bounds the count to [0, 63], so counts of 32..63,
// which exceed the 32-bit width of v, remain possible; for those counts the
// result is 0 for non-negative v and -1 for negative v.
//
// The asmcheck directives attached to the return statement require:
//   - arm64: ASR and no AND.
//   - loong64: SRA together with AND, SGTU, SUBVU and OR.
//   - ppc64x: ISEL and no ORN.
//   - riscv64: SRAW together with OR and SLTIU.
//   - s390x: none of RISBGZ, AND, or LOCGR.
func rshMask32x64(v int32, s uint64) int32 {
	// arm64:"ASR",-"AND"
	// loong64:"SRA\t","AND","SGTU","SUBVU","OR"
	// ppc64x:"ISEL",-"ORN"
	// riscv64:"SRAW","OR","SLTIU"
	// s390x:-"RISBGZ",-"AND",-"LOCGR"
	return v >> (s & 63)
}
cmd/compile: optimize right shifts of int32 on riscv64 The compiler is currently sign extending 32 bit signed integers to 64 bits before right shifting them using a 64 bit shift instruction. There's no need to do this as RISC-V has instructions for right shifting 32 bit signed values (sraw and sraiw) which sign extend the result of the shift to 64 bits. Change the compiler so that it uses sraw and sraiw for shifts of signed 32 bit integers reducing in most cases the number of instructions needed to perform the shift. Here are some examples of code sequences that are changed by this patch: int32(a) >> 2 before: sll x5,x10,0x20 sra x10,x5,0x22 after: sraw x10,x10,0x2 int32(v) >> int(s) before: sext.w x5,x10 sltiu x6,x11,64 add x6,x6,-1 or x6,x11,x6 sra x10,x5,x6 after: sltiu x5,x11,32 add x5,x5,-1 or x5,x11,x5 sraw x10,x10,x5 int32(v) >> (int(s) & 31) before: sext.w x5,x10 and x6,x11,63 sra x10,x5,x6 after: and x5,x11,31 sraw x10,x10,x5 int32(100) >> int(a) before: bltz x10,<target address calls runtime.panicshift> sltiu x5,x10,64 add x5,x5,-1 or x5,x10,x5 li x6,100 sra x10,x6,x5 after: bltz x10,<target address calls runtime.panicshift> sltiu x5,x10,32 add x5,x5,-1 or x5,x10,x5 li x6,100 sraw x10,x6,x5 int32(v) >> (int(s) & 63) before: sext.w x5,x10 and x6,x11,63 sra x10,x5,x6 after: and x5,x11,63 sltiu x6,x5,32 add x6,x6,-1 or x5,x5,x6 sraw x10,x10,x5 In most cases we eliminate one instruction. In the case where we shift a int32 constant by a variable the number of instructions generated is identical. A sra is simply replaced by a sraw. In the unusual case where we shift right by a variable anded with a constant > 31 but < 64, we generate two additional instructions. As this is an unusual case we do not try to optimize for it. Some improvements can be seen in some of the existing benchmarks, notably in the utf8 package which performs right shifts of runes which are signed 32 bit integers. 
| utf8-old | utf8-new | | sec/op | sec/op vs base | EncodeASCIIRune-4 17.68n ± 0% 17.67n ± 0% ~ (p=0.312 n=10) EncodeJapaneseRune-4 35.34n ± 0% 34.53n ± 1% -2.31% (p=0.000 n=10) AppendASCIIRune-4 3.213n ± 0% 3.213n ± 0% ~ (p=0.318 n=10) AppendJapaneseRune-4 36.14n ± 0% 35.35n ± 0% -2.19% (p=0.000 n=10) DecodeASCIIRune-4 28.11n ± 0% 27.36n ± 0% -2.69% (p=0.000 n=10) DecodeJapaneseRune-4 38.55n ± 0% 38.58n ± 0% ~ (p=0.612 n=10) Change-Id: I60a91cbede9ce65597571c7b7dd9943eeb8d3cc2 Reviewed-on: https://go-review.googlesource.com/c/go/+/535115 Run-TryBot: Joel Sing <joel@sing.id.au> TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Joel Sing <joel@sing.id.au> Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: M Zhuo <mzh@golangcn.org> Reviewed-by: David Chase <drchase@google.com>
2023-09-22 13:14:25 +00:00
// rsh5Mask32x64 performs an arithmetic (signed) right shift of v by the low
// five bits of s. Masking with 31 keeps the shift count in [0, 31], which is
// always below the 32-bit width of v.
//
// The asmcheck directives attached to the return statement require:
//   - loong64: an SRA instruction and no AND.
//   - riscv64: an SRAW instruction and neither OR nor SLTIU.
func rsh5Mask32x64(v int32, s uint64) int32 {
	// loong64:"SRA\t",-"AND"
	// riscv64:"SRAW",-"OR",-"SLTIU"
	return v >> (s & 31)
}
// lshMask64x32 shifts v left by the low six bits of s. Masking with 63 keeps
// the shift count in [0, 63], which is always below the 64-bit width of v.
//
// The asmcheck directives attached to the return statement require:
//   - arm64: LSL and no AND.
//   - loong64: SLLV and no AND.
//   - ppc64x: RLDICL and no ORN.
//   - riscv64: SLL and neither AND nor SLTIU.
//   - s390x: none of RISBGZ, AND, or LOCGR.
func lshMask64x32(v int64, s uint32) int64 {
	// arm64:"LSL",-"AND"
	// loong64:"SLLV",-"AND"
	// ppc64x:"RLDICL",-"ORN"
	// riscv64:"SLL",-"AND\t",-"SLTIU"
	// s390x:-"RISBGZ",-"AND",-"LOCGR"
	return v << (s & 63)
}
// rshMask64Ux32 returns v logically shifted right by the low six bits of s.
// Because the count is masked with 63 it is always in the range 0 through 63,
// so the shift can never exceed the operand width.
//
// The asmcheck directives inside the body list, per architecture, the
// instructions that must appear and (with a leading minus) the instructions
// that must not appear in the code generated for the return statement.
func rshMask64Ux32(v uint64, s uint32) uint64 {
	// arm64:"LSR",-"AND",-"CSEL"
	// loong64:"SRLV",-"AND"
	// ppc64x:"RLDICL",-"ORN"
	// riscv64:"SRL\t",-"AND\t",-"SLTIU"
	// s390x:-"RISBGZ",-"AND",-"LOCGR"
	return v >> (s & 63)
}
// rshMask64x32 returns v arithmetically shifted right by the low six bits of
// s. Because the count is masked with 63 it is always in the range 0 through
// 63, so the shift can never exceed the operand width.
//
// The asmcheck directives inside the body list, per architecture, the
// instructions that must appear and (with a leading minus) the instructions
// that must not appear in the code generated for the return statement.
func rshMask64x32(v int64, s uint32) int64 {
	// arm64:"ASR",-"AND",-"CSEL"
	// loong64:"SRAV",-"AND"
	// ppc64x:"RLDICL",-"ORN",-"ISEL"
	// riscv64:"SRA\t",-"OR",-"SLTIU"
	// s390x:-"RISBGZ",-"AND",-"LOCGR"
	return v >> (s & 63)
}
// lshMask64x32Ext returns v shifted left by the low six bits of the signed
// 32-bit count s. The mask is applied before the conversion to uint, so the
// shift amount is always in the range 0 through 63.
//
// The asmcheck directives inside the body list, per architecture, the
// instructions that must appear and (with a leading minus) the instructions
// that must not appear in the code generated for the return statement.
func lshMask64x32Ext(v int64, s int32) int64 {
// ppc64x:"RLDICL",-"ORN",-"ISEL"
// riscv64:"SLL",-"AND\t",-"SLTIU"
// s390x:-"RISBGZ",-"AND",-"LOCGR"
return v << uint(s&63)
}
// rshMask64Ux32Ext returns v logically shifted right by the low six bits of
// the signed 32-bit count s. The mask is applied before the conversion to
// uint, so the shift amount is always in the range 0 through 63.
//
// The asmcheck directives inside the body list, per architecture, the
// instructions that must appear and (with a leading minus) the instructions
// that must not appear in the code generated for the return statement.
func rshMask64Ux32Ext(v uint64, s int32) uint64 {
	// ppc64x:"RLDICL",-"ORN",-"ISEL"
	// riscv64:"SRL\t",-"AND\t",-"SLTIU"
	// s390x:-"RISBGZ",-"AND",-"LOCGR"
	return v >> uint(s&63)
}
// rshMask64x32Ext returns v arithmetically shifted right by the low six bits
// of the signed 32-bit count s. The mask is applied before the conversion to
// uint, so the shift amount is always in the range 0 through 63.
//
// The asmcheck directives inside the body list, per architecture, the
// instructions that must appear and (with a leading minus) the instructions
// that must not appear in the code generated for the return statement.
func rshMask64x32Ext(v int64, s int32) int64 {
	// ppc64x:"RLDICL",-"ORN",-"ISEL"
	// riscv64:"SRA\t",-"OR",-"SLTIU"
	// s390x:-"RISBGZ",-"AND",-"LOCGR"
	return v >> uint(s&63)
}
// --------------- //
// signed shifts //
// --------------- //
// We do want to generate a test + panicshift for these cases.
// lshSigned shifts x left by an unmasked signed count of each integer width
// (8, 16, 32 and 64 bits) and discards the results. A signed count may be
// negative, so each shift needs a sign test; the amd64 directives require the
// width-matching TEST instruction (TESTB, TESTW, TESTL, TESTQ) on the shift
// that follows them.
func lshSigned(v8 int8, v16 int16, v32 int32, v64 int64, x int) {
// amd64:"TESTB"
_ = x << v8
// amd64:"TESTW"
_ = x << v16
// amd64:"TESTL"
_ = x << v32
// amd64:"TESTQ"
_ = x << v64
}
// We want to avoid generating a test + panicshift for these cases.
// lshSignedMasked shifts x left by signed counts of each integer width after
// masking them with 7, 15, 31 and 63 respectively, and discards the results.
// Masking with a non-negative constant makes every count non-negative, so no
// sign test is needed; the amd64 directives (leading minus) require that the
// width-matching TEST instruction is absent for the shift that follows them.
func lshSignedMasked(v8 int8, v16 int16, v32 int32, v64 int64, x int) {
// amd64:-"TESTB"
_ = x << (v8 & 7)
// amd64:-"TESTW"
_ = x << (v16 & 15)
// amd64:-"TESTL"
_ = x << (v32 & 31)
// amd64:-"TESTQ"
_ = x << (v64 & 63)
}
// ------------------ //
// bounded shifts //
// ------------------ //
// lshGuarded64 returns v shifted left by s when s is less than 64 and panics
// with the message shown below otherwise. The explicit bound check lets the
// compiler know the count is in range at the shift.
//
// The asmcheck directives above the return list, per architecture, the
// instructions that must appear and (with a leading minus) the instructions
// that must not appear in the code generated for that shift.
func lshGuarded64(v int64, s uint) int64 {
	if s < 64 {
		// riscv64:"SLL",-"AND",-"SLTIU"
		// s390x:-"RISBGZ",-"AND",-"LOCGR"
		// wasm:-"Select",-".*LtU"
		// arm64:"LSL",-"CSEL"
		return v << s
	}
	panic("shift too large")
}
func rshGuarded64U(v uint64, s uint) uint64 {
if s < 64 {
cmd/compile: optimize right shifts of uint32 on riscv The compiler is currently zero extending 32 bit unsigned integers to 64 bits before right shifting them using a 64 bit shift instruction. There's no need to do this as RISC-V has instructions for right shifting 32 bit unsigned values (srlw and srliw) which zero extend the result of the shift to 64 bits. Change the compiler so that it uses srlw and srliw for 32 bit unsigned shifts reducing in most cases the number of instructions needed to perform the shift. Here are some examples of code sequences that are changed by this patch: uint32(a) >> 2 before: sll x5,x10,0x20 srl x10,x5,0x22 after: srlw x10,x10,0x2 uint32(a) >> int(b) before: sll x5,x10,0x20 srl x5,x5,0x20 srl x5,x5,x11 sltiu x6,x11,64 neg x6,x6 and x10,x5,x6 after: srlw x5,x10,x11 sltiu x6,x11,32 neg x6,x6 and x10,x5,x6 bits.RotateLeft32(uint32(a), 1) before: sll x5,x10,0x1 sll x6,x10,0x20 srl x7,x6,0x3f or x5,x5,x7 after: sll x5,x10,0x1 srlw x6,x10,0x1f or x10,x5,x6 bits.RotateLeft32(uint32(a), int(b)) before: and x6,x11,31 sll x7,x10,x6 sll x8,x10,0x20 srl x8,x8,0x20 add x6,x6,-32 neg x6,x6 srl x9,x8,x6 sltiu x6,x6,64 neg x6,x6 and x6,x9,x6 or x6,x6,x7 after: and x5,x11,31 sll x6,x10,x5 add x5,x5,-32 neg x5,x5 srlw x7,x10,x5 sltiu x5,x5,32 neg x5,x5 and x5,x7,x5 or x10,x6,x5 The one regression observed is the following case, an unbounded right shift of a uint32 where the value we're shifting by is known to be < 64 but > 31. As this is an unusual case this commit does not optimize for it, although the existing code does. uint32(a) >> (b & 63) before: sll x5,x10,0x20 srl x5,x5,0x20 and x6,x11,63 srl x10,x5,x6 after and x5,x11,63 srlw x6,x10,x5 sltiu x5,x5,32 neg x5,x5 and x10,x6,x5 Here we have one extra instruction. Some benchmark highlights, generated on a VisionFive2 8GB running Ubuntu 23.04. 
pkg: math/bits LeadingZeros32-4 18.64n ± 0% 17.32n ± 0% -7.11% (p=0.000 n=10) LeadingZeros64-4 15.47n ± 0% 15.51n ± 0% +0.26% (p=0.027 n=10) TrailingZeros16-4 18.48n ± 0% 17.68n ± 0% -4.33% (p=0.000 n=10) TrailingZeros32-4 16.87n ± 0% 16.07n ± 0% -4.74% (p=0.000 n=10) TrailingZeros64-4 15.26n ± 0% 15.27n ± 0% +0.07% (p=0.043 n=10) OnesCount32-4 20.08n ± 0% 19.29n ± 0% -3.96% (p=0.000 n=10) RotateLeft-4 8.864n ± 0% 8.838n ± 0% -0.30% (p=0.006 n=10) RotateLeft32-4 8.837n ± 0% 8.032n ± 0% -9.11% (p=0.000 n=10) Reverse32-4 29.77n ± 0% 26.52n ± 0% -10.93% (p=0.000 n=10) ReverseBytes32-4 9.640n ± 0% 8.838n ± 0% -8.32% (p=0.000 n=10) Sub32-4 8.835n ± 0% 8.035n ± 0% -9.06% (p=0.000 n=10) geomean 11.50n 11.33n -1.45% pkg: crypto/md5 Hash8Bytes-4 1.486µ ± 0% 1.426µ ± 0% -4.04% (p=0.000 n=10) Hash64-4 2.079µ ± 0% 1.968µ ± 0% -5.36% (p=0.000 n=10) Hash128-4 2.720µ ± 0% 2.557µ ± 0% -5.99% (p=0.000 n=10) Hash256-4 3.996µ ± 0% 3.733µ ± 0% -6.58% (p=0.000 n=10) Hash512-4 6.541µ ± 0% 6.072µ ± 0% -7.18% (p=0.000 n=10) Hash1K-4 11.64µ ± 0% 10.75µ ± 0% -7.58% (p=0.000 n=10) Hash8K-4 82.95µ ± 0% 76.32µ ± 0% -7.99% (p=0.000 n=10) Hash1M-4 10.436m ± 0% 9.591m ± 0% -8.10% (p=0.000 n=10) Hash8M-4 83.50m ± 0% 76.73m ± 0% -8.10% (p=0.000 n=10) Hash8BytesUnaligned-4 1.494µ ± 0% 1.434µ ± 0% -4.02% (p=0.000 n=10) Hash1KUnaligned-4 11.64µ ± 0% 10.76µ ± 0% -7.52% (p=0.000 n=10) Hash8KUnaligned-4 83.01µ ± 0% 76.32µ ± 0% -8.07% (p=0.000 n=10) geomean 28.32µ 26.42µ -6.72% Change-Id: I20483a6668cca1b53fe83944bee3706aadcf8693 Reviewed-on: https://go-review.googlesource.com/c/go/+/528975 Reviewed-by: Michael Pratt <mpratt@google.com> Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: Joel Sing <joel@sing.id.au> Run-TryBot: Joel Sing <joel@sing.id.au> TryBot-Result: Gopher Robot <gobot@golang.org>
2023-09-17 13:08:55 +02:00
// riscv64:"SRL\t",-"AND",-"SLTIU"
cmd/compile: optimize shift pairs and masks on s390x Optimize combinations of left and right shifts by a constant value into a 'rotate then insert selected bits [into zero]' instruction. Use the same instruction for contiguous masks since it has some benefits over 'and immediate' (not restricted to 32-bits, does not overwrite source register). To keep the complexity of this change under control I've only implemented 64 bit operations for now. There are a lot more optimizations that can be done with this instruction family. However, since their function overlaps with other instructions we need to be somewhat careful not to break existing optimization rules by creating optimization dead ends. This is particularly true of the load/store merging rules which contain lots of zero extensions and shifts. This CL does interfere with the store merging rules when an operand is shifted left before it is stored: binary.BigEndian.PutUint64(b, x << 1) This is unfortunate but it's not critical and somewhat complex so I plan to fix that in a follow up CL. 
file before after Δ % addr2line 4117446 4117282 -164 -0.004% api 4945184 4942752 -2432 -0.049% asm 4998079 4991891 -6188 -0.124% buildid 2685158 2684074 -1084 -0.040% cgo 4553732 4553394 -338 -0.007% compile 19294446 19245070 -49376 -0.256% cover 4897105 4891319 -5786 -0.118% dist 3544389 3542785 -1604 -0.045% doc 3926795 3927617 +822 +0.021% fix 3302958 3293868 -9090 -0.275% link 6546274 6543456 -2818 -0.043% nm 4102021 4100825 -1196 -0.029% objdump 4542431 4548483 +6052 +0.133% pack 2482465 2416389 -66076 -2.662% pprof 13366541 13363915 -2626 -0.020% test2json 2829007 2761515 -67492 -2.386% trace 10216164 10219684 +3520 +0.034% vet 6773956 6773572 -384 -0.006% total 107124151 106917891 -206260 -0.193% Change-Id: I7591cce41e06867ba10a745daae9333513062746 Reviewed-on: https://go-review.googlesource.com/c/go/+/233317 Run-TryBot: Michael Munday <mike.munday@ibm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org> Trust: Michael Munday <mike.munday@ibm.com>
2020-05-11 09:44:48 -07:00
// s390x:-"RISBGZ",-"AND",-"LOCGR"
// wasm:-"Select",-".*LtU"
// arm64:"LSR",-"CSEL"
return v >> s
}
panic("shift too large")
}
func rshGuarded64(v int64, s uint) int64 {
if s < 64 {
cmd/compile: optimize right shifts of int32 on riscv64 The compiler is currently sign extending 32 bit signed integers to 64 bits before right shifting them using a 64 bit shift instruction. There's no need to do this as RISC-V has instructions for right shifting 32 bit signed values (sraw and sraiw) which sign extend the result of the shift to 64 bits. Change the compiler so that it uses sraw and sraiw for shifts of signed 32 bit integers reducing in most cases the number of instructions needed to perform the shift. Here are some examples of code sequences that are changed by this patch: int32(a) >> 2 before: sll x5,x10,0x20 sra x10,x5,0x22 after: sraw x10,x10,0x2 int32(v) >> int(s) before: sext.w x5,x10 sltiu x6,x11,64 add x6,x6,-1 or x6,x11,x6 sra x10,x5,x6 after: sltiu x5,x11,32 add x5,x5,-1 or x5,x11,x5 sraw x10,x10,x5 int32(v) >> (int(s) & 31) before: sext.w x5,x10 and x6,x11,63 sra x10,x5,x6 after: and x5,x11,31 sraw x10,x10,x5 int32(100) >> int(a) before: bltz x10,<target address calls runtime.panicshift> sltiu x5,x10,64 add x5,x5,-1 or x5,x10,x5 li x6,100 sra x10,x6,x5 after: bltz x10,<target address calls runtime.panicshift> sltiu x5,x10,32 add x5,x5,-1 or x5,x10,x5 li x6,100 sraw x10,x6,x5 int32(v) >> (int(s) & 63) before: sext.w x5,x10 and x6,x11,63 sra x10,x5,x6 after: and x5,x11,63 sltiu x6,x5,32 add x6,x6,-1 or x5,x5,x6 sraw x10,x10,x5 In most cases we eliminate one instruction. In the case where we shift a int32 constant by a variable the number of instructions generated is identical. A sra is simply replaced by a sraw. In the unusual case where we shift right by a variable anded with a constant > 31 but < 64, we generate two additional instructions. As this is an unusual case we do not try to optimize for it. Some improvements can be seen in some of the existing benchmarks, notably in the utf8 package which performs right shifts of runes which are signed 32 bit integers. 
| utf8-old | utf8-new | | sec/op | sec/op vs base | EncodeASCIIRune-4 17.68n ± 0% 17.67n ± 0% ~ (p=0.312 n=10) EncodeJapaneseRune-4 35.34n ± 0% 34.53n ± 1% -2.31% (p=0.000 n=10) AppendASCIIRune-4 3.213n ± 0% 3.213n ± 0% ~ (p=0.318 n=10) AppendJapaneseRune-4 36.14n ± 0% 35.35n ± 0% -2.19% (p=0.000 n=10) DecodeASCIIRune-4 28.11n ± 0% 27.36n ± 0% -2.69% (p=0.000 n=10) DecodeJapaneseRune-4 38.55n ± 0% 38.58n ± 0% ~ (p=0.612 n=10) Change-Id: I60a91cbede9ce65597571c7b7dd9943eeb8d3cc2 Reviewed-on: https://go-review.googlesource.com/c/go/+/535115 Run-TryBot: Joel Sing <joel@sing.id.au> TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Joel Sing <joel@sing.id.au> Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: M Zhuo <mzh@golangcn.org> Reviewed-by: David Chase <drchase@google.com>
2023-09-22 13:14:25 +00:00
// riscv64:"SRA\t",-"OR",-"SLTIU"
cmd/compile: optimize shift pairs and masks on s390x Optimize combinations of left and right shifts by a constant value into a 'rotate then insert selected bits [into zero]' instruction. Use the same instruction for contiguous masks since it has some benefits over 'and immediate' (not restricted to 32-bits, does not overwrite source register). To keep the complexity of this change under control I've only implemented 64 bit operations for now. There are a lot more optimizations that can be done with this instruction family. However, since their function overlaps with other instructions we need to be somewhat careful not to break existing optimization rules by creating optimization dead ends. This is particularly true of the load/store merging rules which contain lots of zero extensions and shifts. This CL does interfere with the store merging rules when an operand is shifted left before it is stored: binary.BigEndian.PutUint64(b, x << 1) This is unfortunate but it's not critical and somewhat complex so I plan to fix that in a follow up CL. 
file before after Δ % addr2line 4117446 4117282 -164 -0.004% api 4945184 4942752 -2432 -0.049% asm 4998079 4991891 -6188 -0.124% buildid 2685158 2684074 -1084 -0.040% cgo 4553732 4553394 -338 -0.007% compile 19294446 19245070 -49376 -0.256% cover 4897105 4891319 -5786 -0.118% dist 3544389 3542785 -1604 -0.045% doc 3926795 3927617 +822 +0.021% fix 3302958 3293868 -9090 -0.275% link 6546274 6543456 -2818 -0.043% nm 4102021 4100825 -1196 -0.029% objdump 4542431 4548483 +6052 +0.133% pack 2482465 2416389 -66076 -2.662% pprof 13366541 13363915 -2626 -0.020% test2json 2829007 2761515 -67492 -2.386% trace 10216164 10219684 +3520 +0.034% vet 6773956 6773572 -384 -0.006% total 107124151 106917891 -206260 -0.193% Change-Id: I7591cce41e06867ba10a745daae9333513062746 Reviewed-on: https://go-review.googlesource.com/c/go/+/233317 Run-TryBot: Michael Munday <mike.munday@ibm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org> Trust: Michael Munday <mike.munday@ibm.com>
2020-05-11 09:44:48 -07:00
// s390x:-"RISBGZ",-"AND",-"LOCGR"
// wasm:-"Select",-".*LtU"
// arm64:"ASR",-"CSEL"
return v >> s
}
panic("shift too large")
}
// provedUnsignedShiftLeft left-shifts an unsigned value of each width
// (64, 32, 16 and 8 bits) by shift. Every shift sits inside a guard that
// bounds shift to [0, width); a result whose guard fails keeps its zero
// value.
//
// The asmcheck directive above each shift expects the arm64 output to
// contain an LSL instruction and no CSEL. Presumably the guard lets the
// compiler prove the shift amount is in range so the oversized-shift
// fixup can be dropped - confirm against the compiler's prove pass.
func provedUnsignedShiftLeft(val64 uint64, val32 uint32, val16 uint16, val8 uint8, shift int) (r1 uint64, r2 uint32, r3 uint16, r4 uint8) {
if shift >= 0 && shift < 64 {
// arm64:"LSL",-"CSEL"
r1 = val64 << shift
}
if shift >= 0 && shift < 32 {
// arm64:"LSL",-"CSEL"
r2 = val32 << shift
}
if shift >= 0 && shift < 16 {
// arm64:"LSL",-"CSEL"
r3 = val16 << shift
}
if shift >= 0 && shift < 8 {
// arm64:"LSL",-"CSEL"
r4 = val8 << shift
}
return r1, r2, r3, r4
}
// provedSignedShiftLeft left-shifts a signed value of each width
// (64, 32, 16 and 8 bits) by shift. Every shift sits inside a guard that
// bounds shift to [0, width); a result whose guard fails keeps its zero
// value.
//
// The asmcheck directive above each shift expects the arm64 output to
// contain an LSL instruction and no CSEL. Presumably the guard lets the
// compiler prove the shift amount is in range so the oversized-shift
// fixup can be dropped - confirm against the compiler's prove pass.
func provedSignedShiftLeft(val64 int64, val32 int32, val16 int16, val8 int8, shift int) (r1 int64, r2 int32, r3 int16, r4 int8) {
if shift >= 0 && shift < 64 {
// arm64:"LSL",-"CSEL"
r1 = val64 << shift
}
if shift >= 0 && shift < 32 {
// arm64:"LSL",-"CSEL"
r2 = val32 << shift
}
if shift >= 0 && shift < 16 {
// arm64:"LSL",-"CSEL"
r3 = val16 << shift
}
if shift >= 0 && shift < 8 {
// arm64:"LSL",-"CSEL"
r4 = val8 << shift
}
return r1, r2, r3, r4
}
// provedUnsignedShiftRight right-shifts an unsigned value of each width
// (64, 32, 16 and 8 bits) by shift. Every shift sits inside a guard that
// bounds shift to [0, width); a result whose guard fails keeps its zero
// value.
//
// The asmcheck directive above each shift expects the arm64 output to
// contain an LSR (logical shift right) instruction and no CSEL.
// Presumably the guard lets the compiler prove the shift amount is in
// range so the oversized-shift fixup can be dropped - confirm against
// the compiler's prove pass.
func provedUnsignedShiftRight(val64 uint64, val32 uint32, val16 uint16, val8 uint8, shift int) (r1 uint64, r2 uint32, r3 uint16, r4 uint8) {
if shift >= 0 && shift < 64 {
// arm64:"LSR",-"CSEL"
r1 = val64 >> shift
}
if shift >= 0 && shift < 32 {
// arm64:"LSR",-"CSEL"
r2 = val32 >> shift
}
if shift >= 0 && shift < 16 {
// arm64:"LSR",-"CSEL"
r3 = val16 >> shift
}
if shift >= 0 && shift < 8 {
// arm64:"LSR",-"CSEL"
r4 = val8 >> shift
}
return r1, r2, r3, r4
}
// provedSignedShiftRight right-shifts a signed value of each width
// (64, 32, 16 and 8 bits) by shift. Every shift sits inside a guard that
// bounds shift to [0, width); a result whose guard fails keeps its zero
// value.
//
// The asmcheck directive above each shift expects the arm64 output to
// contain an ASR (arithmetic shift right) instruction and no CSEL.
// Presumably the guard lets the compiler prove the shift amount is in
// range so the oversized-shift fixup can be dropped - confirm against
// the compiler's prove pass.
func provedSignedShiftRight(val64 int64, val32 int32, val16 int16, val8 int8, shift int) (r1 int64, r2 int32, r3 int16, r4 int8) {
if shift >= 0 && shift < 64 {
// arm64:"ASR",-"CSEL"
r1 = val64 >> shift
}
if shift >= 0 && shift < 32 {
// arm64:"ASR",-"CSEL"
r2 = val32 >> shift
}
if shift >= 0 && shift < 16 {
// arm64:"ASR",-"CSEL"
r3 = val16 >> shift
}
if shift >= 0 && shift < 8 {
// arm64:"ASR",-"CSEL"
r4 = val8 >> shift
}
return r1, r2, r3, r4
}
cmd/compile: use combined shifts to improve array addressing on ppc64x This change adds rules to find pairs of instructions that can be combined into a single shifts. These instruction sequences are common in array addressing within loops. Improvements can be seen in many crypto packages and the hash packages. These are based on the extended mnemonics found in the ISA sections C.8.1 and C.8.2. Some rules in PPC64.rules were moved because the ordering prevented some matching. The following results were generated on power9. hash/crc32: CRC32/poly=Koopman/size=40/align=0 195ns ± 0% 163ns ± 0% -16.41% CRC32/poly=Koopman/size=40/align=1 200ns ± 0% 163ns ± 0% -18.50% CRC32/poly=Koopman/size=512/align=0 1.98µs ± 0% 1.67µs ± 0% -15.46% CRC32/poly=Koopman/size=512/align=1 1.98µs ± 0% 1.69µs ± 0% -14.80% CRC32/poly=Koopman/size=1kB/align=0 3.90µs ± 0% 3.31µs ± 0% -15.27% CRC32/poly=Koopman/size=1kB/align=1 3.85µs ± 0% 3.31µs ± 0% -14.15% CRC32/poly=Koopman/size=4kB/align=0 15.3µs ± 0% 13.1µs ± 0% -14.22% CRC32/poly=Koopman/size=4kB/align=1 15.4µs ± 0% 13.1µs ± 0% -14.79% CRC32/poly=Koopman/size=32kB/align=0 137µs ± 0% 105µs ± 0% -23.56% CRC32/poly=Koopman/size=32kB/align=1 137µs ± 0% 105µs ± 0% -23.53% crypto/rc4: RC4_128 733ns ± 0% 650ns ± 0% -11.32% (p=1.000 n=1+1) RC4_1K 5.80µs ± 0% 5.17µs ± 0% -10.89% (p=1.000 n=1+1) RC4_8K 45.7µs ± 0% 40.8µs ± 0% -10.73% (p=1.000 n=1+1) crypto/sha1: Hash8Bytes 635ns ± 0% 613ns ± 0% -3.46% (p=1.000 n=1+1) Hash320Bytes 2.30µs ± 0% 2.18µs ± 0% -5.38% (p=1.000 n=1+1) Hash1K 5.88µs ± 0% 5.38µs ± 0% -8.62% (p=1.000 n=1+1) Hash8K 42.0µs ± 0% 37.9µs ± 0% -9.75% (p=1.000 n=1+1) There are other improvements found in golang.org/x/crypto which are all in the range of 5-15%. 
Change-Id: I193471fbcf674151ffe2edab212799d9b08dfb8c Reviewed-on: https://go-review.googlesource.com/c/go/+/252097 Trust: Lynn Boger <laboger@linux.vnet.ibm.com> Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>
2020-08-31 09:43:40 -04:00
func checkUnneededTrunc(tab *[100000]uint32, d uint64, v uint32, h uint16, b byte) (uint32, uint64) {
// ppc64x:-".*RLWINM",-".*RLDICR",".*CLRLSLDI"
cmd/compile: use combined shifts to improve array addressing on ppc64x This change adds rules to find pairs of instructions that can be combined into a single shifts. These instruction sequences are common in array addressing within loops. Improvements can be seen in many crypto packages and the hash packages. These are based on the extended mnemonics found in the ISA sections C.8.1 and C.8.2. Some rules in PPC64.rules were moved because the ordering prevented some matching. The following results were generated on power9. hash/crc32: CRC32/poly=Koopman/size=40/align=0 195ns ± 0% 163ns ± 0% -16.41% CRC32/poly=Koopman/size=40/align=1 200ns ± 0% 163ns ± 0% -18.50% CRC32/poly=Koopman/size=512/align=0 1.98µs ± 0% 1.67µs ± 0% -15.46% CRC32/poly=Koopman/size=512/align=1 1.98µs ± 0% 1.69µs ± 0% -14.80% CRC32/poly=Koopman/size=1kB/align=0 3.90µs ± 0% 3.31µs ± 0% -15.27% CRC32/poly=Koopman/size=1kB/align=1 3.85µs ± 0% 3.31µs ± 0% -14.15% CRC32/poly=Koopman/size=4kB/align=0 15.3µs ± 0% 13.1µs ± 0% -14.22% CRC32/poly=Koopman/size=4kB/align=1 15.4µs ± 0% 13.1µs ± 0% -14.79% CRC32/poly=Koopman/size=32kB/align=0 137µs ± 0% 105µs ± 0% -23.56% CRC32/poly=Koopman/size=32kB/align=1 137µs ± 0% 105µs ± 0% -23.53% crypto/rc4: RC4_128 733ns ± 0% 650ns ± 0% -11.32% (p=1.000 n=1+1) RC4_1K 5.80µs ± 0% 5.17µs ± 0% -10.89% (p=1.000 n=1+1) RC4_8K 45.7µs ± 0% 40.8µs ± 0% -10.73% (p=1.000 n=1+1) crypto/sha1: Hash8Bytes 635ns ± 0% 613ns ± 0% -3.46% (p=1.000 n=1+1) Hash320Bytes 2.30µs ± 0% 2.18µs ± 0% -5.38% (p=1.000 n=1+1) Hash1K 5.88µs ± 0% 5.38µs ± 0% -8.62% (p=1.000 n=1+1) Hash8K 42.0µs ± 0% 37.9µs ± 0% -9.75% (p=1.000 n=1+1) There are other improvements found in golang.org/x/crypto which are all in the range of 5-15%. 
Change-Id: I193471fbcf674151ffe2edab212799d9b08dfb8c Reviewed-on: https://go-review.googlesource.com/c/go/+/252097 Trust: Lynn Boger <laboger@linux.vnet.ibm.com> Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>
2020-08-31 09:43:40 -04:00
f := tab[byte(v)^b]
// ppc64x:-".*RLWINM",-".*RLDICR",".*CLRLSLDI"
cmd/compile: use combined shifts to improve array addressing on ppc64x This change adds rules to find pairs of instructions that can be combined into a single shifts. These instruction sequences are common in array addressing within loops. Improvements can be seen in many crypto packages and the hash packages. These are based on the extended mnemonics found in the ISA sections C.8.1 and C.8.2. Some rules in PPC64.rules were moved because the ordering prevented some matching. The following results were generated on power9. hash/crc32: CRC32/poly=Koopman/size=40/align=0 195ns ± 0% 163ns ± 0% -16.41% CRC32/poly=Koopman/size=40/align=1 200ns ± 0% 163ns ± 0% -18.50% CRC32/poly=Koopman/size=512/align=0 1.98µs ± 0% 1.67µs ± 0% -15.46% CRC32/poly=Koopman/size=512/align=1 1.98µs ± 0% 1.69µs ± 0% -14.80% CRC32/poly=Koopman/size=1kB/align=0 3.90µs ± 0% 3.31µs ± 0% -15.27% CRC32/poly=Koopman/size=1kB/align=1 3.85µs ± 0% 3.31µs ± 0% -14.15% CRC32/poly=Koopman/size=4kB/align=0 15.3µs ± 0% 13.1µs ± 0% -14.22% CRC32/poly=Koopman/size=4kB/align=1 15.4µs ± 0% 13.1µs ± 0% -14.79% CRC32/poly=Koopman/size=32kB/align=0 137µs ± 0% 105µs ± 0% -23.56% CRC32/poly=Koopman/size=32kB/align=1 137µs ± 0% 105µs ± 0% -23.53% crypto/rc4: RC4_128 733ns ± 0% 650ns ± 0% -11.32% (p=1.000 n=1+1) RC4_1K 5.80µs ± 0% 5.17µs ± 0% -10.89% (p=1.000 n=1+1) RC4_8K 45.7µs ± 0% 40.8µs ± 0% -10.73% (p=1.000 n=1+1) crypto/sha1: Hash8Bytes 635ns ± 0% 613ns ± 0% -3.46% (p=1.000 n=1+1) Hash320Bytes 2.30µs ± 0% 2.18µs ± 0% -5.38% (p=1.000 n=1+1) Hash1K 5.88µs ± 0% 5.38µs ± 0% -8.62% (p=1.000 n=1+1) Hash8K 42.0µs ± 0% 37.9µs ± 0% -9.75% (p=1.000 n=1+1) There are other improvements found in golang.org/x/crypto which are all in the range of 5-15%. 
Change-Id: I193471fbcf674151ffe2edab212799d9b08dfb8c Reviewed-on: https://go-review.googlesource.com/c/go/+/252097 Trust: Lynn Boger <laboger@linux.vnet.ibm.com> Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>
2020-08-31 09:43:40 -04:00
f += tab[byte(v)&b]
// ppc64x:-".*RLWINM",-".*RLDICR",".*CLRLSLDI"
cmd/compile: use combined shifts to improve array addressing on ppc64x This change adds rules to find pairs of instructions that can be combined into a single shifts. These instruction sequences are common in array addressing within loops. Improvements can be seen in many crypto packages and the hash packages. These are based on the extended mnemonics found in the ISA sections C.8.1 and C.8.2. Some rules in PPC64.rules were moved because the ordering prevented some matching. The following results were generated on power9. hash/crc32: CRC32/poly=Koopman/size=40/align=0 195ns ± 0% 163ns ± 0% -16.41% CRC32/poly=Koopman/size=40/align=1 200ns ± 0% 163ns ± 0% -18.50% CRC32/poly=Koopman/size=512/align=0 1.98µs ± 0% 1.67µs ± 0% -15.46% CRC32/poly=Koopman/size=512/align=1 1.98µs ± 0% 1.69µs ± 0% -14.80% CRC32/poly=Koopman/size=1kB/align=0 3.90µs ± 0% 3.31µs ± 0% -15.27% CRC32/poly=Koopman/size=1kB/align=1 3.85µs ± 0% 3.31µs ± 0% -14.15% CRC32/poly=Koopman/size=4kB/align=0 15.3µs ± 0% 13.1µs ± 0% -14.22% CRC32/poly=Koopman/size=4kB/align=1 15.4µs ± 0% 13.1µs ± 0% -14.79% CRC32/poly=Koopman/size=32kB/align=0 137µs ± 0% 105µs ± 0% -23.56% CRC32/poly=Koopman/size=32kB/align=1 137µs ± 0% 105µs ± 0% -23.53% crypto/rc4: RC4_128 733ns ± 0% 650ns ± 0% -11.32% (p=1.000 n=1+1) RC4_1K 5.80µs ± 0% 5.17µs ± 0% -10.89% (p=1.000 n=1+1) RC4_8K 45.7µs ± 0% 40.8µs ± 0% -10.73% (p=1.000 n=1+1) crypto/sha1: Hash8Bytes 635ns ± 0% 613ns ± 0% -3.46% (p=1.000 n=1+1) Hash320Bytes 2.30µs ± 0% 2.18µs ± 0% -5.38% (p=1.000 n=1+1) Hash1K 5.88µs ± 0% 5.38µs ± 0% -8.62% (p=1.000 n=1+1) Hash8K 42.0µs ± 0% 37.9µs ± 0% -9.75% (p=1.000 n=1+1) There are other improvements found in golang.org/x/crypto which are all in the range of 5-15%. 
Change-Id: I193471fbcf674151ffe2edab212799d9b08dfb8c Reviewed-on: https://go-review.googlesource.com/c/go/+/252097 Trust: Lynn Boger <laboger@linux.vnet.ibm.com> Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>
2020-08-31 09:43:40 -04:00
f += tab[byte(v)|b]
// ppc64x:-".*RLWINM",-".*RLDICR",".*CLRLSLDI"
cmd/compile: use combined shifts to improve array addressing on ppc64x This change adds rules to find pairs of instructions that can be combined into a single shifts. These instruction sequences are common in array addressing within loops. Improvements can be seen in many crypto packages and the hash packages. These are based on the extended mnemonics found in the ISA sections C.8.1 and C.8.2. Some rules in PPC64.rules were moved because the ordering prevented some matching. The following results were generated on power9. hash/crc32: CRC32/poly=Koopman/size=40/align=0 195ns ± 0% 163ns ± 0% -16.41% CRC32/poly=Koopman/size=40/align=1 200ns ± 0% 163ns ± 0% -18.50% CRC32/poly=Koopman/size=512/align=0 1.98µs ± 0% 1.67µs ± 0% -15.46% CRC32/poly=Koopman/size=512/align=1 1.98µs ± 0% 1.69µs ± 0% -14.80% CRC32/poly=Koopman/size=1kB/align=0 3.90µs ± 0% 3.31µs ± 0% -15.27% CRC32/poly=Koopman/size=1kB/align=1 3.85µs ± 0% 3.31µs ± 0% -14.15% CRC32/poly=Koopman/size=4kB/align=0 15.3µs ± 0% 13.1µs ± 0% -14.22% CRC32/poly=Koopman/size=4kB/align=1 15.4µs ± 0% 13.1µs ± 0% -14.79% CRC32/poly=Koopman/size=32kB/align=0 137µs ± 0% 105µs ± 0% -23.56% CRC32/poly=Koopman/size=32kB/align=1 137µs ± 0% 105µs ± 0% -23.53% crypto/rc4: RC4_128 733ns ± 0% 650ns ± 0% -11.32% (p=1.000 n=1+1) RC4_1K 5.80µs ± 0% 5.17µs ± 0% -10.89% (p=1.000 n=1+1) RC4_8K 45.7µs ± 0% 40.8µs ± 0% -10.73% (p=1.000 n=1+1) crypto/sha1: Hash8Bytes 635ns ± 0% 613ns ± 0% -3.46% (p=1.000 n=1+1) Hash320Bytes 2.30µs ± 0% 2.18µs ± 0% -5.38% (p=1.000 n=1+1) Hash1K 5.88µs ± 0% 5.38µs ± 0% -8.62% (p=1.000 n=1+1) Hash8K 42.0µs ± 0% 37.9µs ± 0% -9.75% (p=1.000 n=1+1) There are other improvements found in golang.org/x/crypto which are all in the range of 5-15%. 
Change-Id: I193471fbcf674151ffe2edab212799d9b08dfb8c Reviewed-on: https://go-review.googlesource.com/c/go/+/252097 Trust: Lynn Boger <laboger@linux.vnet.ibm.com> Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>
2020-08-31 09:43:40 -04:00
f += tab[uint16(v)&h]
// ppc64x:-".*RLWINM",-".*RLDICR",".*CLRLSLDI"
cmd/compile: use combined shifts to improve array addressing on ppc64x This change adds rules to find pairs of instructions that can be combined into a single shifts. These instruction sequences are common in array addressing within loops. Improvements can be seen in many crypto packages and the hash packages. These are based on the extended mnemonics found in the ISA sections C.8.1 and C.8.2. Some rules in PPC64.rules were moved because the ordering prevented some matching. The following results were generated on power9. hash/crc32: CRC32/poly=Koopman/size=40/align=0 195ns ± 0% 163ns ± 0% -16.41% CRC32/poly=Koopman/size=40/align=1 200ns ± 0% 163ns ± 0% -18.50% CRC32/poly=Koopman/size=512/align=0 1.98µs ± 0% 1.67µs ± 0% -15.46% CRC32/poly=Koopman/size=512/align=1 1.98µs ± 0% 1.69µs ± 0% -14.80% CRC32/poly=Koopman/size=1kB/align=0 3.90µs ± 0% 3.31µs ± 0% -15.27% CRC32/poly=Koopman/size=1kB/align=1 3.85µs ± 0% 3.31µs ± 0% -14.15% CRC32/poly=Koopman/size=4kB/align=0 15.3µs ± 0% 13.1µs ± 0% -14.22% CRC32/poly=Koopman/size=4kB/align=1 15.4µs ± 0% 13.1µs ± 0% -14.79% CRC32/poly=Koopman/size=32kB/align=0 137µs ± 0% 105µs ± 0% -23.56% CRC32/poly=Koopman/size=32kB/align=1 137µs ± 0% 105µs ± 0% -23.53% crypto/rc4: RC4_128 733ns ± 0% 650ns ± 0% -11.32% (p=1.000 n=1+1) RC4_1K 5.80µs ± 0% 5.17µs ± 0% -10.89% (p=1.000 n=1+1) RC4_8K 45.7µs ± 0% 40.8µs ± 0% -10.73% (p=1.000 n=1+1) crypto/sha1: Hash8Bytes 635ns ± 0% 613ns ± 0% -3.46% (p=1.000 n=1+1) Hash320Bytes 2.30µs ± 0% 2.18µs ± 0% -5.38% (p=1.000 n=1+1) Hash1K 5.88µs ± 0% 5.38µs ± 0% -8.62% (p=1.000 n=1+1) Hash8K 42.0µs ± 0% 37.9µs ± 0% -9.75% (p=1.000 n=1+1) There are other improvements found in golang.org/x/crypto which are all in the range of 5-15%. 
Change-Id: I193471fbcf674151ffe2edab212799d9b08dfb8c Reviewed-on: https://go-review.googlesource.com/c/go/+/252097 Trust: Lynn Boger <laboger@linux.vnet.ibm.com> Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>
2020-08-31 09:43:40 -04:00
f += tab[uint16(v)^h]
// ppc64x:-".*RLWINM",-".*RLDICR",".*CLRLSLDI"
cmd/compile: use combined shifts to improve array addressing on ppc64x This change adds rules to find pairs of instructions that can be combined into a single shifts. These instruction sequences are common in array addressing within loops. Improvements can be seen in many crypto packages and the hash packages. These are based on the extended mnemonics found in the ISA sections C.8.1 and C.8.2. Some rules in PPC64.rules were moved because the ordering prevented some matching. The following results were generated on power9. hash/crc32: CRC32/poly=Koopman/size=40/align=0 195ns ± 0% 163ns ± 0% -16.41% CRC32/poly=Koopman/size=40/align=1 200ns ± 0% 163ns ± 0% -18.50% CRC32/poly=Koopman/size=512/align=0 1.98µs ± 0% 1.67µs ± 0% -15.46% CRC32/poly=Koopman/size=512/align=1 1.98µs ± 0% 1.69µs ± 0% -14.80% CRC32/poly=Koopman/size=1kB/align=0 3.90µs ± 0% 3.31µs ± 0% -15.27% CRC32/poly=Koopman/size=1kB/align=1 3.85µs ± 0% 3.31µs ± 0% -14.15% CRC32/poly=Koopman/size=4kB/align=0 15.3µs ± 0% 13.1µs ± 0% -14.22% CRC32/poly=Koopman/size=4kB/align=1 15.4µs ± 0% 13.1µs ± 0% -14.79% CRC32/poly=Koopman/size=32kB/align=0 137µs ± 0% 105µs ± 0% -23.56% CRC32/poly=Koopman/size=32kB/align=1 137µs ± 0% 105µs ± 0% -23.53% crypto/rc4: RC4_128 733ns ± 0% 650ns ± 0% -11.32% (p=1.000 n=1+1) RC4_1K 5.80µs ± 0% 5.17µs ± 0% -10.89% (p=1.000 n=1+1) RC4_8K 45.7µs ± 0% 40.8µs ± 0% -10.73% (p=1.000 n=1+1) crypto/sha1: Hash8Bytes 635ns ± 0% 613ns ± 0% -3.46% (p=1.000 n=1+1) Hash320Bytes 2.30µs ± 0% 2.18µs ± 0% -5.38% (p=1.000 n=1+1) Hash1K 5.88µs ± 0% 5.38µs ± 0% -8.62% (p=1.000 n=1+1) Hash8K 42.0µs ± 0% 37.9µs ± 0% -9.75% (p=1.000 n=1+1) There are other improvements found in golang.org/x/crypto which are all in the range of 5-15%. 
Change-Id: I193471fbcf674151ffe2edab212799d9b08dfb8c Reviewed-on: https://go-review.googlesource.com/c/go/+/252097 Trust: Lynn Boger <laboger@linux.vnet.ibm.com> Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>
2020-08-31 09:43:40 -04:00
f += tab[uint16(v)|h]
// ppc64x:-".*AND",-"RLDICR",".*CLRLSLDI"
cmd/compile: use combined shifts to improve array addressing on ppc64x This change adds rules to find pairs of instructions that can be combined into a single shifts. These instruction sequences are common in array addressing within loops. Improvements can be seen in many crypto packages and the hash packages. These are based on the extended mnemonics found in the ISA sections C.8.1 and C.8.2. Some rules in PPC64.rules were moved because the ordering prevented some matching. The following results were generated on power9. hash/crc32: CRC32/poly=Koopman/size=40/align=0 195ns ± 0% 163ns ± 0% -16.41% CRC32/poly=Koopman/size=40/align=1 200ns ± 0% 163ns ± 0% -18.50% CRC32/poly=Koopman/size=512/align=0 1.98µs ± 0% 1.67µs ± 0% -15.46% CRC32/poly=Koopman/size=512/align=1 1.98µs ± 0% 1.69µs ± 0% -14.80% CRC32/poly=Koopman/size=1kB/align=0 3.90µs ± 0% 3.31µs ± 0% -15.27% CRC32/poly=Koopman/size=1kB/align=1 3.85µs ± 0% 3.31µs ± 0% -14.15% CRC32/poly=Koopman/size=4kB/align=0 15.3µs ± 0% 13.1µs ± 0% -14.22% CRC32/poly=Koopman/size=4kB/align=1 15.4µs ± 0% 13.1µs ± 0% -14.79% CRC32/poly=Koopman/size=32kB/align=0 137µs ± 0% 105µs ± 0% -23.56% CRC32/poly=Koopman/size=32kB/align=1 137µs ± 0% 105µs ± 0% -23.53% crypto/rc4: RC4_128 733ns ± 0% 650ns ± 0% -11.32% (p=1.000 n=1+1) RC4_1K 5.80µs ± 0% 5.17µs ± 0% -10.89% (p=1.000 n=1+1) RC4_8K 45.7µs ± 0% 40.8µs ± 0% -10.73% (p=1.000 n=1+1) crypto/sha1: Hash8Bytes 635ns ± 0% 613ns ± 0% -3.46% (p=1.000 n=1+1) Hash320Bytes 2.30µs ± 0% 2.18µs ± 0% -5.38% (p=1.000 n=1+1) Hash1K 5.88µs ± 0% 5.38µs ± 0% -8.62% (p=1.000 n=1+1) Hash8K 42.0µs ± 0% 37.9µs ± 0% -9.75% (p=1.000 n=1+1) There are other improvements found in golang.org/x/crypto which are all in the range of 5-15%. 
Change-Id: I193471fbcf674151ffe2edab212799d9b08dfb8c Reviewed-on: https://go-review.googlesource.com/c/go/+/252097 Trust: Lynn Boger <laboger@linux.vnet.ibm.com> Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>
2020-08-31 09:43:40 -04:00
f += tab[v&0xff]
// ppc64x:-".*AND",".*CLRLSLWI"
f += 2 * uint32(uint16(d))
// ppc64x:-".*AND",-"RLDICR",".*CLRLSLDI"
g := 2 * uint64(uint32(d))
cmd/compile: use combined shifts to improve array addressing on ppc64x This change adds rules to find pairs of instructions that can be combined into a single shifts. These instruction sequences are common in array addressing within loops. Improvements can be seen in many crypto packages and the hash packages. These are based on the extended mnemonics found in the ISA sections C.8.1 and C.8.2. Some rules in PPC64.rules were moved because the ordering prevented some matching. The following results were generated on power9. hash/crc32: CRC32/poly=Koopman/size=40/align=0 195ns ± 0% 163ns ± 0% -16.41% CRC32/poly=Koopman/size=40/align=1 200ns ± 0% 163ns ± 0% -18.50% CRC32/poly=Koopman/size=512/align=0 1.98µs ± 0% 1.67µs ± 0% -15.46% CRC32/poly=Koopman/size=512/align=1 1.98µs ± 0% 1.69µs ± 0% -14.80% CRC32/poly=Koopman/size=1kB/align=0 3.90µs ± 0% 3.31µs ± 0% -15.27% CRC32/poly=Koopman/size=1kB/align=1 3.85µs ± 0% 3.31µs ± 0% -14.15% CRC32/poly=Koopman/size=4kB/align=0 15.3µs ± 0% 13.1µs ± 0% -14.22% CRC32/poly=Koopman/size=4kB/align=1 15.4µs ± 0% 13.1µs ± 0% -14.79% CRC32/poly=Koopman/size=32kB/align=0 137µs ± 0% 105µs ± 0% -23.56% CRC32/poly=Koopman/size=32kB/align=1 137µs ± 0% 105µs ± 0% -23.53% crypto/rc4: RC4_128 733ns ± 0% 650ns ± 0% -11.32% (p=1.000 n=1+1) RC4_1K 5.80µs ± 0% 5.17µs ± 0% -10.89% (p=1.000 n=1+1) RC4_8K 45.7µs ± 0% 40.8µs ± 0% -10.73% (p=1.000 n=1+1) crypto/sha1: Hash8Bytes 635ns ± 0% 613ns ± 0% -3.46% (p=1.000 n=1+1) Hash320Bytes 2.30µs ± 0% 2.18µs ± 0% -5.38% (p=1.000 n=1+1) Hash1K 5.88µs ± 0% 5.38µs ± 0% -8.62% (p=1.000 n=1+1) Hash8K 42.0µs ± 0% 37.9µs ± 0% -9.75% (p=1.000 n=1+1) There are other improvements found in golang.org/x/crypto which are all in the range of 5-15%. 
Change-Id: I193471fbcf674151ffe2edab212799d9b08dfb8c Reviewed-on: https://go-review.googlesource.com/c/go/+/252097 Trust: Lynn Boger <laboger@linux.vnet.ibm.com> Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>
2020-08-31 09:43:40 -04:00
return f, g
}
func checkCombinedShifts(v8 uint8, v16 uint16, v32 uint32, x32 int32, v64 uint64) (uint8, uint16, uint32, uint64, int64) {
cmd/compile: use combined shifts to improve array addressing on ppc64x This change adds rules to find pairs of instructions that can be combined into a single shifts. These instruction sequences are common in array addressing within loops. Improvements can be seen in many crypto packages and the hash packages. These are based on the extended mnemonics found in the ISA sections C.8.1 and C.8.2. Some rules in PPC64.rules were moved because the ordering prevented some matching. The following results were generated on power9. hash/crc32: CRC32/poly=Koopman/size=40/align=0 195ns ± 0% 163ns ± 0% -16.41% CRC32/poly=Koopman/size=40/align=1 200ns ± 0% 163ns ± 0% -18.50% CRC32/poly=Koopman/size=512/align=0 1.98µs ± 0% 1.67µs ± 0% -15.46% CRC32/poly=Koopman/size=512/align=1 1.98µs ± 0% 1.69µs ± 0% -14.80% CRC32/poly=Koopman/size=1kB/align=0 3.90µs ± 0% 3.31µs ± 0% -15.27% CRC32/poly=Koopman/size=1kB/align=1 3.85µs ± 0% 3.31µs ± 0% -14.15% CRC32/poly=Koopman/size=4kB/align=0 15.3µs ± 0% 13.1µs ± 0% -14.22% CRC32/poly=Koopman/size=4kB/align=1 15.4µs ± 0% 13.1µs ± 0% -14.79% CRC32/poly=Koopman/size=32kB/align=0 137µs ± 0% 105µs ± 0% -23.56% CRC32/poly=Koopman/size=32kB/align=1 137µs ± 0% 105µs ± 0% -23.53% crypto/rc4: RC4_128 733ns ± 0% 650ns ± 0% -11.32% (p=1.000 n=1+1) RC4_1K 5.80µs ± 0% 5.17µs ± 0% -10.89% (p=1.000 n=1+1) RC4_8K 45.7µs ± 0% 40.8µs ± 0% -10.73% (p=1.000 n=1+1) crypto/sha1: Hash8Bytes 635ns ± 0% 613ns ± 0% -3.46% (p=1.000 n=1+1) Hash320Bytes 2.30µs ± 0% 2.18µs ± 0% -5.38% (p=1.000 n=1+1) Hash1K 5.88µs ± 0% 5.38µs ± 0% -8.62% (p=1.000 n=1+1) Hash8K 42.0µs ± 0% 37.9µs ± 0% -9.75% (p=1.000 n=1+1) There are other improvements found in golang.org/x/crypto which are all in the range of 5-15%. 
Change-Id: I193471fbcf674151ffe2edab212799d9b08dfb8c Reviewed-on: https://go-review.googlesource.com/c/go/+/252097 Trust: Lynn Boger <laboger@linux.vnet.ibm.com> Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>
2020-08-31 09:43:40 -04:00
// ppc64x:-"AND","CLRLSLWI"
f := (v8 & 0xF) << 2
// ppc64x:"CLRLSLWI"
f += byte(v16) << 3
// ppc64x:-"AND","CLRLSLWI"
cmd/compile: use combined shifts to improve array addressing on ppc64x This change adds rules to find pairs of instructions that can be combined into a single shifts. These instruction sequences are common in array addressing within loops. Improvements can be seen in many crypto packages and the hash packages. These are based on the extended mnemonics found in the ISA sections C.8.1 and C.8.2. Some rules in PPC64.rules were moved because the ordering prevented some matching. The following results were generated on power9. hash/crc32: CRC32/poly=Koopman/size=40/align=0 195ns ± 0% 163ns ± 0% -16.41% CRC32/poly=Koopman/size=40/align=1 200ns ± 0% 163ns ± 0% -18.50% CRC32/poly=Koopman/size=512/align=0 1.98µs ± 0% 1.67µs ± 0% -15.46% CRC32/poly=Koopman/size=512/align=1 1.98µs ± 0% 1.69µs ± 0% -14.80% CRC32/poly=Koopman/size=1kB/align=0 3.90µs ± 0% 3.31µs ± 0% -15.27% CRC32/poly=Koopman/size=1kB/align=1 3.85µs ± 0% 3.31µs ± 0% -14.15% CRC32/poly=Koopman/size=4kB/align=0 15.3µs ± 0% 13.1µs ± 0% -14.22% CRC32/poly=Koopman/size=4kB/align=1 15.4µs ± 0% 13.1µs ± 0% -14.79% CRC32/poly=Koopman/size=32kB/align=0 137µs ± 0% 105µs ± 0% -23.56% CRC32/poly=Koopman/size=32kB/align=1 137µs ± 0% 105µs ± 0% -23.53% crypto/rc4: RC4_128 733ns ± 0% 650ns ± 0% -11.32% (p=1.000 n=1+1) RC4_1K 5.80µs ± 0% 5.17µs ± 0% -10.89% (p=1.000 n=1+1) RC4_8K 45.7µs ± 0% 40.8µs ± 0% -10.73% (p=1.000 n=1+1) crypto/sha1: Hash8Bytes 635ns ± 0% 613ns ± 0% -3.46% (p=1.000 n=1+1) Hash320Bytes 2.30µs ± 0% 2.18µs ± 0% -5.38% (p=1.000 n=1+1) Hash1K 5.88µs ± 0% 5.38µs ± 0% -8.62% (p=1.000 n=1+1) Hash8K 42.0µs ± 0% 37.9µs ± 0% -9.75% (p=1.000 n=1+1) There are other improvements found in golang.org/x/crypto which are all in the range of 5-15%. 
Change-Id: I193471fbcf674151ffe2edab212799d9b08dfb8c Reviewed-on: https://go-review.googlesource.com/c/go/+/252097 Trust: Lynn Boger <laboger@linux.vnet.ibm.com> Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>
2020-08-31 09:43:40 -04:00
g := (v16 & 0xFF) << 3
// ppc64x:-"AND","CLRLSLWI"
cmd/compile: use combined shifts to improve array addressing on ppc64x This change adds rules to find pairs of instructions that can be combined into a single shifts. These instruction sequences are common in array addressing within loops. Improvements can be seen in many crypto packages and the hash packages. These are based on the extended mnemonics found in the ISA sections C.8.1 and C.8.2. Some rules in PPC64.rules were moved because the ordering prevented some matching. The following results were generated on power9. hash/crc32: CRC32/poly=Koopman/size=40/align=0 195ns ± 0% 163ns ± 0% -16.41% CRC32/poly=Koopman/size=40/align=1 200ns ± 0% 163ns ± 0% -18.50% CRC32/poly=Koopman/size=512/align=0 1.98µs ± 0% 1.67µs ± 0% -15.46% CRC32/poly=Koopman/size=512/align=1 1.98µs ± 0% 1.69µs ± 0% -14.80% CRC32/poly=Koopman/size=1kB/align=0 3.90µs ± 0% 3.31µs ± 0% -15.27% CRC32/poly=Koopman/size=1kB/align=1 3.85µs ± 0% 3.31µs ± 0% -14.15% CRC32/poly=Koopman/size=4kB/align=0 15.3µs ± 0% 13.1µs ± 0% -14.22% CRC32/poly=Koopman/size=4kB/align=1 15.4µs ± 0% 13.1µs ± 0% -14.79% CRC32/poly=Koopman/size=32kB/align=0 137µs ± 0% 105µs ± 0% -23.56% CRC32/poly=Koopman/size=32kB/align=1 137µs ± 0% 105µs ± 0% -23.53% crypto/rc4: RC4_128 733ns ± 0% 650ns ± 0% -11.32% (p=1.000 n=1+1) RC4_1K 5.80µs ± 0% 5.17µs ± 0% -10.89% (p=1.000 n=1+1) RC4_8K 45.7µs ± 0% 40.8µs ± 0% -10.73% (p=1.000 n=1+1) crypto/sha1: Hash8Bytes 635ns ± 0% 613ns ± 0% -3.46% (p=1.000 n=1+1) Hash320Bytes 2.30µs ± 0% 2.18µs ± 0% -5.38% (p=1.000 n=1+1) Hash1K 5.88µs ± 0% 5.38µs ± 0% -8.62% (p=1.000 n=1+1) Hash8K 42.0µs ± 0% 37.9µs ± 0% -9.75% (p=1.000 n=1+1) There are other improvements found in golang.org/x/crypto which are all in the range of 5-15%. 
Change-Id: I193471fbcf674151ffe2edab212799d9b08dfb8c Reviewed-on: https://go-review.googlesource.com/c/go/+/252097 Trust: Lynn Boger <laboger@linux.vnet.ibm.com> Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>
2020-08-31 09:43:40 -04:00
h := (v32 & 0xFFFFF) << 2
// ppc64x:"CLRLSLDI"
cmd/compile: use combined shifts to improve array addressing on ppc64x This change adds rules to find pairs of instructions that can be combined into a single shifts. These instruction sequences are common in array addressing within loops. Improvements can be seen in many crypto packages and the hash packages. These are based on the extended mnemonics found in the ISA sections C.8.1 and C.8.2. Some rules in PPC64.rules were moved because the ordering prevented some matching. The following results were generated on power9. hash/crc32: CRC32/poly=Koopman/size=40/align=0 195ns ± 0% 163ns ± 0% -16.41% CRC32/poly=Koopman/size=40/align=1 200ns ± 0% 163ns ± 0% -18.50% CRC32/poly=Koopman/size=512/align=0 1.98µs ± 0% 1.67µs ± 0% -15.46% CRC32/poly=Koopman/size=512/align=1 1.98µs ± 0% 1.69µs ± 0% -14.80% CRC32/poly=Koopman/size=1kB/align=0 3.90µs ± 0% 3.31µs ± 0% -15.27% CRC32/poly=Koopman/size=1kB/align=1 3.85µs ± 0% 3.31µs ± 0% -14.15% CRC32/poly=Koopman/size=4kB/align=0 15.3µs ± 0% 13.1µs ± 0% -14.22% CRC32/poly=Koopman/size=4kB/align=1 15.4µs ± 0% 13.1µs ± 0% -14.79% CRC32/poly=Koopman/size=32kB/align=0 137µs ± 0% 105µs ± 0% -23.56% CRC32/poly=Koopman/size=32kB/align=1 137µs ± 0% 105µs ± 0% -23.53% crypto/rc4: RC4_128 733ns ± 0% 650ns ± 0% -11.32% (p=1.000 n=1+1) RC4_1K 5.80µs ± 0% 5.17µs ± 0% -10.89% (p=1.000 n=1+1) RC4_8K 45.7µs ± 0% 40.8µs ± 0% -10.73% (p=1.000 n=1+1) crypto/sha1: Hash8Bytes 635ns ± 0% 613ns ± 0% -3.46% (p=1.000 n=1+1) Hash320Bytes 2.30µs ± 0% 2.18µs ± 0% -5.38% (p=1.000 n=1+1) Hash1K 5.88µs ± 0% 5.38µs ± 0% -8.62% (p=1.000 n=1+1) Hash8K 42.0µs ± 0% 37.9µs ± 0% -9.75% (p=1.000 n=1+1) There are other improvements found in golang.org/x/crypto which are all in the range of 5-15%. 
Change-Id: I193471fbcf674151ffe2edab212799d9b08dfb8c Reviewed-on: https://go-review.googlesource.com/c/go/+/252097 Trust: Lynn Boger <laboger@linux.vnet.ibm.com> Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>
2020-08-31 09:43:40 -04:00
i := (v64 & 0xFFFFFFFF) << 5
// ppc64x:-"CLRLSLDI"
i += (v64 & 0xFFFFFFF) << 38
// ppc64x/power9:-"CLRLSLDI"
i += (v64 & 0xFFFF00) << 10
// ppc64x/power9:-"SLD","EXTSWSLI"
j := int64(x32+32) * 8
return f, g, h, i, j
cmd/compile: use combined shifts to improve array addressing on ppc64x This change adds rules to find pairs of instructions that can be combined into a single shifts. These instruction sequences are common in array addressing within loops. Improvements can be seen in many crypto packages and the hash packages. These are based on the extended mnemonics found in the ISA sections C.8.1 and C.8.2. Some rules in PPC64.rules were moved because the ordering prevented some matching. The following results were generated on power9. hash/crc32: CRC32/poly=Koopman/size=40/align=0 195ns ± 0% 163ns ± 0% -16.41% CRC32/poly=Koopman/size=40/align=1 200ns ± 0% 163ns ± 0% -18.50% CRC32/poly=Koopman/size=512/align=0 1.98µs ± 0% 1.67µs ± 0% -15.46% CRC32/poly=Koopman/size=512/align=1 1.98µs ± 0% 1.69µs ± 0% -14.80% CRC32/poly=Koopman/size=1kB/align=0 3.90µs ± 0% 3.31µs ± 0% -15.27% CRC32/poly=Koopman/size=1kB/align=1 3.85µs ± 0% 3.31µs ± 0% -14.15% CRC32/poly=Koopman/size=4kB/align=0 15.3µs ± 0% 13.1µs ± 0% -14.22% CRC32/poly=Koopman/size=4kB/align=1 15.4µs ± 0% 13.1µs ± 0% -14.79% CRC32/poly=Koopman/size=32kB/align=0 137µs ± 0% 105µs ± 0% -23.56% CRC32/poly=Koopman/size=32kB/align=1 137µs ± 0% 105µs ± 0% -23.53% crypto/rc4: RC4_128 733ns ± 0% 650ns ± 0% -11.32% (p=1.000 n=1+1) RC4_1K 5.80µs ± 0% 5.17µs ± 0% -10.89% (p=1.000 n=1+1) RC4_8K 45.7µs ± 0% 40.8µs ± 0% -10.73% (p=1.000 n=1+1) crypto/sha1: Hash8Bytes 635ns ± 0% 613ns ± 0% -3.46% (p=1.000 n=1+1) Hash320Bytes 2.30µs ± 0% 2.18µs ± 0% -5.38% (p=1.000 n=1+1) Hash1K 5.88µs ± 0% 5.38µs ± 0% -8.62% (p=1.000 n=1+1) Hash8K 42.0µs ± 0% 37.9µs ± 0% -9.75% (p=1.000 n=1+1) There are other improvements found in golang.org/x/crypto which are all in the range of 5-15%. 
Change-Id: I193471fbcf674151ffe2edab212799d9b08dfb8c Reviewed-on: https://go-review.googlesource.com/c/go/+/252097 Trust: Lynn Boger <laboger@linux.vnet.ibm.com> Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>
2020-08-31 09:43:40 -04:00
}
// checkWidenAfterShift narrows right-shifted values to int32, int16 and int8
// and checks, on ppc64x, whether a MOVW, MOVH or MOVB style instruction is
// emitted for the narrowing. By the asmcheck convention, a pattern prefixed
// with a minus sign must be absent from the generated code and an unprefixed
// pattern must be present.
//
// The directives expect no such move when the shift already leaves no more
// bits than the destination type holds (for example v >> 32 converted to
// int32, or the int32 value f >> 24 converted to int8), and expect one when
// extra bits remain (for example v >> 31 converted to int32).
//
// It returns the accumulated int8 value widened to int64 and the accumulated
// int16 value converted to uint64. The parameter u is not used by the body.
func checkWidenAfterShift(v int64, u uint64) (int64, uint64) {
// ppc64x:-".*MOVW"
f := int32(v >> 32)
// ppc64x:".*MOVW"
f += int32(v >> 31)
// ppc64x:-".*MOVH"
g := int16(v >> 48)
// ppc64x:".*MOVH"
g += int16(v >> 30)
// ppc64x:-".*MOVH"
g += int16(f >> 16)
// ppc64x:-".*MOVB"
h := int8(v >> 56)
// ppc64x:".*MOVB"
h += int8(v >> 28)
// ppc64x:-".*MOVB"
h += int8(f >> 24)
// ppc64x:".*MOVB"
h += int8(f >> 16)
return int64(h), uint64(g)
}
// checkShiftAndMask32 rewrites successive elements of v in place, using one
// mask-and-shift combination per element. The first four cases mask and then
// shift right, the last four shift right and then mask.
//
// On ppc64x the directives expect each non-trivial combination to appear as a
// single RLWNM with the listed rotate and mask-bound operands. In the four
// cases where the mask and the shift leave no overlapping bits, the result is
// always zero and the directives expect a MOVW whose source is R0
// (presumably a plain store of the zero register; confirm against the ppc64
// backend).
//
// The body indexes v[0] through v[7], so v must hold at least eight elements
// or the indexing panics.
func checkShiftAndMask32(v []uint32) {
i := 0
// ppc64x: "RLWNM\t[$]24, R[0-9]+, [$]12, [$]19, R[0-9]+"
v[i] = (v[i] & 0xFF00000) >> 8
i++
// ppc64x: "RLWNM\t[$]26, R[0-9]+, [$]22, [$]29, R[0-9]+"
v[i] = (v[i] & 0xFF00) >> 6
i++
// ppc64x: "MOVW\tR0"
v[i] = (v[i] & 0xFF) >> 8
i++
// ppc64x: "MOVW\tR0"
v[i] = (v[i] & 0xF000000) >> 28
i++
// ppc64x: "RLWNM\t[$]26, R[0-9]+, [$]24, [$]31, R[0-9]+"
v[i] = (v[i] >> 6) & 0xFF
i++
// ppc64x: "RLWNM\t[$]26, R[0-9]+, [$]12, [$]19, R[0-9]+"
v[i] = (v[i] >> 6) & 0xFF000
i++
// ppc64x: "MOVW\tR0"
v[i] = (v[i] >> 20) & 0xFF000
i++
// ppc64x: "MOVW\tR0"
v[i] = (v[i] >> 24) & 0xFF00
i++
}
// checkMergedShifts32 indexes the arrays a (uint32 elements) and b (uint64
// elements) with values extracted from the 32-bit input v by a shift followed
// by a truncation or mask.
//
// On ppc64x the directives expect the extraction and the scaling of the index
// by the element size to be merged into one RLWNM with the listed operands,
// and expect the separate CLRLSLDI (or SLD, in the last case) to be absent.
//
// a and b are array values, so the assignments only modify the local copies.
// The parameter u is not used by the body.
func checkMergedShifts32(a [256]uint32, b [256]uint64, u uint32, v uint32) {
// ppc64x: -"CLRLSLDI", "RLWNM\t[$]10, R[0-9]+, [$]22, [$]29, R[0-9]+"
a[0] = a[uint8(v>>24)]
// ppc64x: -"CLRLSLDI", "RLWNM\t[$]11, R[0-9]+, [$]21, [$]28, R[0-9]+"
b[0] = b[uint8(v>>24)]
// ppc64x: -"CLRLSLDI", "RLWNM\t[$]15, R[0-9]+, [$]21, [$]28, R[0-9]+"
b[1] = b[(v>>20)&0xFF]
// ppc64x: -"SLD", "RLWNM\t[$]10, R[0-9]+, [$]22, [$]28, R[0-9]+"
b[2] = b[v>>25]
}
cmd/compile: optimize multi-register shifts on amd64 amd64 can shift in bits from another register instead of filling with 0/1. This pattern is helpful when implementing 128 bit shifts or arbitrary length shifts. In the standard library, it shows up in pure Go math/big. Benchmarks results on amd64 with -tags=math_big_pure_go. name old time/op new time/op delta NonZeroShifts/1/shrVU-8 4.45ns ± 3% 4.39ns ± 1% -1.28% (p=0.000 n=30+27) NonZeroShifts/1/shlVU-8 4.13ns ± 4% 4.10ns ± 2% ~ (p=0.254 n=29+28) NonZeroShifts/2/shrVU-8 5.55ns ± 1% 5.63ns ± 2% +1.42% (p=0.000 n=28+29) NonZeroShifts/2/shlVU-8 5.70ns ± 2% 5.14ns ± 1% -9.82% (p=0.000 n=29+28) NonZeroShifts/3/shrVU-8 6.79ns ± 2% 6.35ns ± 2% -6.46% (p=0.000 n=28+29) NonZeroShifts/3/shlVU-8 6.69ns ± 1% 6.25ns ± 1% -6.60% (p=0.000 n=28+27) NonZeroShifts/4/shrVU-8 7.79ns ± 2% 7.06ns ± 2% -9.48% (p=0.000 n=30+30) NonZeroShifts/4/shlVU-8 7.82ns ± 1% 7.24ns ± 1% -7.37% (p=0.000 n=28+29) NonZeroShifts/5/shrVU-8 8.90ns ± 3% 7.93ns ± 1% -10.84% (p=0.000 n=29+26) NonZeroShifts/5/shlVU-8 8.68ns ± 1% 7.92ns ± 1% -8.76% (p=0.000 n=29+29) NonZeroShifts/10/shrVU-8 14.4ns ± 1% 12.3ns ± 2% -14.79% (p=0.000 n=28+29) NonZeroShifts/10/shlVU-8 14.1ns ± 1% 11.9ns ± 2% -15.55% (p=0.000 n=28+27) NonZeroShifts/100/shrVU-8 118ns ± 1% 96ns ± 3% -18.82% (p=0.000 n=30+29) NonZeroShifts/100/shlVU-8 120ns ± 2% 98ns ± 2% -18.46% (p=0.000 n=29+28) NonZeroShifts/1000/shrVU-8 1.10µs ± 1% 0.88µs ± 2% -19.63% (p=0.000 n=29+30) NonZeroShifts/1000/shlVU-8 1.10µs ± 2% 0.88µs ± 2% -20.28% (p=0.000 n=29+28) NonZeroShifts/10000/shrVU-8 10.9µs ± 1% 8.7µs ± 1% -19.78% (p=0.000 n=28+27) NonZeroShifts/10000/shlVU-8 10.9µs ± 2% 8.7µs ± 1% -19.64% (p=0.000 n=29+27) NonZeroShifts/100000/shrVU-8 111µs ± 2% 90µs ± 2% -19.39% (p=0.000 n=28+29) NonZeroShifts/100000/shlVU-8 113µs ± 2% 90µs ± 2% -20.43% (p=0.000 n=30+27) The assembly version is still faster, unfortunately, but the gap is narrowing. 
Speedup from pure Go to assembly: name old time/op new time/op delta NonZeroShifts/1/shrVU-8 4.39ns ± 1% 3.45ns ± 2% -21.36% (p=0.000 n=27+29) NonZeroShifts/1/shlVU-8 4.10ns ± 2% 3.47ns ± 3% -15.42% (p=0.000 n=28+30) NonZeroShifts/2/shrVU-8 5.63ns ± 2% 3.97ns ± 0% -29.40% (p=0.000 n=29+25) NonZeroShifts/2/shlVU-8 5.14ns ± 1% 3.77ns ± 2% -26.65% (p=0.000 n=28+26) NonZeroShifts/3/shrVU-8 6.35ns ± 2% 4.79ns ± 2% -24.52% (p=0.000 n=29+29) NonZeroShifts/3/shlVU-8 6.25ns ± 1% 4.42ns ± 1% -29.29% (p=0.000 n=27+26) NonZeroShifts/4/shrVU-8 7.06ns ± 2% 5.64ns ± 1% -20.05% (p=0.000 n=30+29) NonZeroShifts/4/shlVU-8 7.24ns ± 1% 5.34ns ± 2% -26.23% (p=0.000 n=29+29) NonZeroShifts/5/shrVU-8 7.93ns ± 1% 6.56ns ± 2% -17.26% (p=0.000 n=26+30) NonZeroShifts/5/shlVU-8 7.92ns ± 1% 6.27ns ± 1% -20.79% (p=0.000 n=29+25) NonZeroShifts/10/shrVU-8 12.3ns ± 2% 10.2ns ± 2% -17.21% (p=0.000 n=29+29) NonZeroShifts/10/shlVU-8 11.9ns ± 2% 10.5ns ± 2% -12.45% (p=0.000 n=27+29) NonZeroShifts/100/shrVU-8 95.9ns ± 3% 77.7ns ± 1% -19.00% (p=0.000 n=29+30) NonZeroShifts/100/shlVU-8 97.5ns ± 2% 66.8ns ± 2% -31.47% (p=0.000 n=28+30) NonZeroShifts/1000/shrVU-8 884ns ± 2% 705ns ± 1% -20.17% (p=0.000 n=30+28) NonZeroShifts/1000/shlVU-8 880ns ± 2% 590ns ± 1% -32.96% (p=0.000 n=28+25) NonZeroShifts/10000/shrVU-8 8.74µs ± 1% 7.34µs ± 3% -15.94% (p=0.000 n=27+30) NonZeroShifts/10000/shlVU-8 8.73µs ± 1% 6.00µs ± 1% -31.25% (p=0.000 n=27+28) NonZeroShifts/100000/shrVU-8 89.6µs ± 2% 75.5µs ± 2% -15.80% (p=0.000 n=29+29) NonZeroShifts/100000/shlVU-8 89.6µs ± 2% 68.0µs ± 3% -24.09% (p=0.000 n=27+30) Change-Id: I18f58d8f5513d737d9cdf09b8f9d14011ffe3958 Reviewed-on: https://go-review.googlesource.com/c/go/+/297050 Trust: Josh Bleecher Snyder <josharian@gmail.com> Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2021-01-07 19:25:05 -08:00
// checkMergedShifts64 is the 64-bit input counterpart of the merged-shift
// checks. It indexes a (uint32 elements), b (uint64 elements) and c (byte
// elements) with fields extracted from v, and also applies shift-and-mask
// expressions directly to elements of b.
//
// On ppc64x the directives state, per line, whether the operation is expected
// to become a single RLWNM (with the listed operands where given), to keep a
// separate SRD plus CLRLSLDI pair, to avoid RLWNM entirely (for example when
// the mask 0xFFFFFF000000 is wider than 32 bits or the shift amount is zero),
// or to use ANDCC with the constant 8064 (0x3F << 7).
//
// a, b and c are array values, so the assignments only modify local copies.
// Several index expressions can exceed 255 (for example a 6-bit field shifted
// left by 16), so executing this function with arbitrary v could panic with
// an out-of-range index. Presumably the function is only ever compiled, as
// the asmcheck header of the file suggests.
func checkMergedShifts64(a [256]uint32, b [256]uint64, c [256]byte, v uint64) {
// ppc64x: -"CLRLSLDI", "RLWNM\t[$]10, R[0-9]+, [$]22, [$]29, R[0-9]+"
a[0] = a[uint8(v>>24)]
// ppc64x: "SRD", "CLRLSLDI", -"RLWNM"
a[1] = a[uint8(v>>25)]
// ppc64x: -"CLRLSLDI", "RLWNM\t[$]9, R[0-9]+, [$]23, [$]29, R[0-9]+"
a[2] = a[v>>25&0x7F]
// ppc64x: -"CLRLSLDI", "RLWNM\t[$]3, R[0-9]+, [$]29, [$]29, R[0-9]+"
a[3] = a[(v>>31)&0x01]
// ppc64x: -"CLRLSLDI", "RLWNM\t[$]12, R[0-9]+, [$]21, [$]28, R[0-9]+"
b[0] = b[uint8(v>>23)]
// ppc64x: -"CLRLSLDI", "RLWNM\t[$]15, R[0-9]+, [$]21, [$]28, R[0-9]+"
b[1] = b[(v>>20)&0xFF]
// ppc64x: "RLWNM", -"SLD"
b[2] = b[((uint64((uint32(v) >> 21)) & 0x3f) << 4)]
// ppc64x: -"RLWNM"
b[3] = (b[3] << 24) & 0xFFFFFF000000
// ppc64x: "RLWNM\t[$]24, R[0-9]+, [$]0, [$]7,"
b[4] = (b[4] << 24) & 0xFF000000
// ppc64x: "RLWNM\t[$]24, R[0-9]+, [$]0, [$]7,"
b[5] = (b[5] << 24) & 0xFF00000F
// ppc64x: -"RLWNM"
b[6] = (b[6] << 0) & 0xFF00000F
// ppc64x: "RLWNM\t[$]4, R[0-9]+, [$]28, [$]31,"
b[7] = (b[7] >> 28) & 0xF
// ppc64x: "RLWNM\t[$]11, R[0-9]+, [$]10, [$]15"
c[0] = c[((v>>5)&0x3F)<<16]
// ppc64x: "ANDCC\t[$]8064,"
c[1] = c[((v>>7)&0x3F)<<7]
}
// checkShiftMask stores shifted, truncated and masked forms of a and b into
// the slices z (uint32) and y (uint64).
//
// On ppc64x the directives expect the zero-extending moves (MOVBZ, MOVHZ,
// MOVWZ), the separate SRW and, in the last case, ANDCC to be absent where
// listed, and expect RLWNM where listed, i.e. the truncation or mask is
// expected to be folded into the shift.
//
// The two blank assignments read index 128 of each slice before any store.
// Both slices must therefore hold at least 129 elements or the function
// panics. Presumably these reads also let the compiler drop the bounds checks
// on the smaller constant indices that follow; confirm if that matters.
func checkShiftMask(a uint32, b uint64, z []uint32, y []uint64) {
_ = y[128]
_ = z[128]
// ppc64x: -"MOVBZ", -"SRW", "RLWNM"
z[0] = uint32(uint8(a >> 5))
// ppc64x: -"MOVBZ", -"SRW", "RLWNM"
z[1] = uint32(uint8((a >> 4) & 0x7e))
// ppc64x: "RLWNM\t[$]25, R[0-9]+, [$]27, [$]29, R[0-9]+"
z[2] = uint32(uint8(a>>7)) & 0x1c
// ppc64x: -"MOVWZ"
y[0] = uint64((a >> 6) & 0x1c)
// ppc64x: -"MOVWZ"
y[1] = uint64(uint32(b)<<6) + 1
// ppc64x: -"MOVHZ", -"MOVWZ"
y[2] = uint64((uint16(a) >> 9) & 0x1F)
// ppc64x: -"MOVHZ", -"MOVWZ", -"ANDCC"
y[3] = uint64(((uint16(a) & 0xFF0) >> 9) & 0x1F)
}
cmd/compile: optimize multi-register shifts on amd64 amd64 can shift in bits from another register instead of filling with 0/1. This pattern is helpful when implementing 128 bit shifts or arbitrary length shifts. In the standard library, it shows up in pure Go math/big. Benchmarks results on amd64 with -tags=math_big_pure_go. name old time/op new time/op delta NonZeroShifts/1/shrVU-8 4.45ns ± 3% 4.39ns ± 1% -1.28% (p=0.000 n=30+27) NonZeroShifts/1/shlVU-8 4.13ns ± 4% 4.10ns ± 2% ~ (p=0.254 n=29+28) NonZeroShifts/2/shrVU-8 5.55ns ± 1% 5.63ns ± 2% +1.42% (p=0.000 n=28+29) NonZeroShifts/2/shlVU-8 5.70ns ± 2% 5.14ns ± 1% -9.82% (p=0.000 n=29+28) NonZeroShifts/3/shrVU-8 6.79ns ± 2% 6.35ns ± 2% -6.46% (p=0.000 n=28+29) NonZeroShifts/3/shlVU-8 6.69ns ± 1% 6.25ns ± 1% -6.60% (p=0.000 n=28+27) NonZeroShifts/4/shrVU-8 7.79ns ± 2% 7.06ns ± 2% -9.48% (p=0.000 n=30+30) NonZeroShifts/4/shlVU-8 7.82ns ± 1% 7.24ns ± 1% -7.37% (p=0.000 n=28+29) NonZeroShifts/5/shrVU-8 8.90ns ± 3% 7.93ns ± 1% -10.84% (p=0.000 n=29+26) NonZeroShifts/5/shlVU-8 8.68ns ± 1% 7.92ns ± 1% -8.76% (p=0.000 n=29+29) NonZeroShifts/10/shrVU-8 14.4ns ± 1% 12.3ns ± 2% -14.79% (p=0.000 n=28+29) NonZeroShifts/10/shlVU-8 14.1ns ± 1% 11.9ns ± 2% -15.55% (p=0.000 n=28+27) NonZeroShifts/100/shrVU-8 118ns ± 1% 96ns ± 3% -18.82% (p=0.000 n=30+29) NonZeroShifts/100/shlVU-8 120ns ± 2% 98ns ± 2% -18.46% (p=0.000 n=29+28) NonZeroShifts/1000/shrVU-8 1.10µs ± 1% 0.88µs ± 2% -19.63% (p=0.000 n=29+30) NonZeroShifts/1000/shlVU-8 1.10µs ± 2% 0.88µs ± 2% -20.28% (p=0.000 n=29+28) NonZeroShifts/10000/shrVU-8 10.9µs ± 1% 8.7µs ± 1% -19.78% (p=0.000 n=28+27) NonZeroShifts/10000/shlVU-8 10.9µs ± 2% 8.7µs ± 1% -19.64% (p=0.000 n=29+27) NonZeroShifts/100000/shrVU-8 111µs ± 2% 90µs ± 2% -19.39% (p=0.000 n=28+29) NonZeroShifts/100000/shlVU-8 113µs ± 2% 90µs ± 2% -20.43% (p=0.000 n=30+27) The assembly version is still faster, unfortunately, but the gap is narrowing. 
Speedup from pure Go to assembly: name old time/op new time/op delta NonZeroShifts/1/shrVU-8 4.39ns ± 1% 3.45ns ± 2% -21.36% (p=0.000 n=27+29) NonZeroShifts/1/shlVU-8 4.10ns ± 2% 3.47ns ± 3% -15.42% (p=0.000 n=28+30) NonZeroShifts/2/shrVU-8 5.63ns ± 2% 3.97ns ± 0% -29.40% (p=0.000 n=29+25) NonZeroShifts/2/shlVU-8 5.14ns ± 1% 3.77ns ± 2% -26.65% (p=0.000 n=28+26) NonZeroShifts/3/shrVU-8 6.35ns ± 2% 4.79ns ± 2% -24.52% (p=0.000 n=29+29) NonZeroShifts/3/shlVU-8 6.25ns ± 1% 4.42ns ± 1% -29.29% (p=0.000 n=27+26) NonZeroShifts/4/shrVU-8 7.06ns ± 2% 5.64ns ± 1% -20.05% (p=0.000 n=30+29) NonZeroShifts/4/shlVU-8 7.24ns ± 1% 5.34ns ± 2% -26.23% (p=0.000 n=29+29) NonZeroShifts/5/shrVU-8 7.93ns ± 1% 6.56ns ± 2% -17.26% (p=0.000 n=26+30) NonZeroShifts/5/shlVU-8 7.92ns ± 1% 6.27ns ± 1% -20.79% (p=0.000 n=29+25) NonZeroShifts/10/shrVU-8 12.3ns ± 2% 10.2ns ± 2% -17.21% (p=0.000 n=29+29) NonZeroShifts/10/shlVU-8 11.9ns ± 2% 10.5ns ± 2% -12.45% (p=0.000 n=27+29) NonZeroShifts/100/shrVU-8 95.9ns ± 3% 77.7ns ± 1% -19.00% (p=0.000 n=29+30) NonZeroShifts/100/shlVU-8 97.5ns ± 2% 66.8ns ± 2% -31.47% (p=0.000 n=28+30) NonZeroShifts/1000/shrVU-8 884ns ± 2% 705ns ± 1% -20.17% (p=0.000 n=30+28) NonZeroShifts/1000/shlVU-8 880ns ± 2% 590ns ± 1% -32.96% (p=0.000 n=28+25) NonZeroShifts/10000/shrVU-8 8.74µs ± 1% 7.34µs ± 3% -15.94% (p=0.000 n=27+30) NonZeroShifts/10000/shlVU-8 8.73µs ± 1% 6.00µs ± 1% -31.25% (p=0.000 n=27+28) NonZeroShifts/100000/shrVU-8 89.6µs ± 2% 75.5µs ± 2% -15.80% (p=0.000 n=29+29) NonZeroShifts/100000/shlVU-8 89.6µs ± 2% 68.0µs ± 3% -24.09% (p=0.000 n=27+30) Change-Id: I18f58d8f5513d737d9cdf09b8f9d14011ffe3958 Reviewed-on: https://go-review.googlesource.com/c/go/+/297050 Trust: Josh Bleecher Snyder <josharian@gmail.com> Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
2021-01-07 19:25:05 -08:00
// 128 bit shifts

// check128bitShifts builds the two halves of a double-width shift from the
// 64-bit words x and y. The shift count s is bits reduced to the range 0..63,
// and the complementary count is (64 - bits) reduced to the same range.
//
// It returns x shifted right by s combined with y shifted left by the
// complementary count, followed by x shifted left by s combined with y
// shifted right by the complementary count.
//
// On amd64 the directives expect each combination to use a SHRQ or SHLQ that
// has three operands, which is what the two commas in each pattern match.
func check128bitShifts(x, y uint64, bits uint) (uint64, uint64) {
s := bits & 63
ŝ := (64 - bits) & 63
// check that the shift operation has two commas (three operands)
// amd64:"SHRQ.*,.*,"
shr := x>>s | y<<ŝ
// amd64:"SHLQ.*,.*,"
shl := x<<s | y>>ŝ
return shr, shl
}
// checkShiftToMask applies a pair of opposite shifts by the same constant,
// which only clears bits at one end of the value. It clears the low five bits
// of u[0] and s[0], and the high five bits of u[1].
//
// On amd64 the directives expect each pair to be emitted as an ANDQ with no
// SHR, SAR or SHL instruction.
//
// u must hold at least two elements and s at least one, or the indexing
// panics.
func checkShiftToMask(u []uint64, s []int64) {
// amd64:-"SHR",-"SHL","ANDQ"
u[0] = u[0] >> 5 << 5
// amd64:-"SAR",-"SHL","ANDQ"
s[0] = s[0] >> 5 << 5
// amd64:-"SHR",-"SHL","ANDQ"
u[1] = u[1] << 5 >> 5
}
//
// Left shift with addition.
//
// checkLeftShiftWithAddition adds b shifted left by 1, then by 2, then by 3
// to a, and returns the accumulated value.
//
// On riscv64 the directives depend on the target profile. For rva20u64 they
// expect a separate SLLI and ADD. For rva22u64 and rva23u64 they expect the
// fused SH1ADD, SH2ADD and SH3ADD instructions respectively.
func checkLeftShiftWithAddition(a int64, b int64) int64 {
// riscv64/rva20u64: "SLLI","ADD"
// riscv64/rva22u64,riscv64/rva23u64: "SH1ADD"
a = a + b<<1
// riscv64/rva20u64: "SLLI","ADD"
// riscv64/rva22u64,riscv64/rva23u64: "SH2ADD"
a = a + b<<2
// riscv64/rva20u64: "SLLI","ADD"
// riscv64/rva22u64,riscv64/rva23u64: "SH3ADD"
a = a + b<<3
return a
}
//
// Convert and shift.
//
// rsh64Uto32U truncates v to uint32 and, when the truncated value is greater
// than 8, shifts it right by 2 before returning it.
//
// On riscv64 the directives expect a MOVWU for the comparison line, and
// expect the shift itself to be a SRLIW with no additional MOVWU or SLLI.
func rsh64Uto32U(v uint64) uint32 {
x := uint32(v)
// riscv64:"MOVWU"
if x > 8 {
// riscv64:"SRLIW",-"MOVWU",-"SLLI"
x >>= 2
}
return x
}
// rsh64Uto16U truncates v to uint16 and, when the truncated value is greater
// than 8, shifts it right by 2 before returning it.
//
// On riscv64 the directives expect a MOVHU for the comparison line, and
// expect the shift to be emitted as an SLLI and SRLI pair.
func rsh64Uto16U(v uint64) uint16 {
x := uint16(v)
// riscv64:"MOVHU"
if x > 8 {
// riscv64:"SLLI","SRLI"
x >>= 2
}
return x
}
// rsh64Uto8U truncates v to uint8 and, when the truncated value is greater
// than 8, shifts it right by 2 before returning it.
//
// On riscv64 the directives expect a MOVBU for the comparison line, and
// expect the shift to be emitted as an SLLI and SRLI pair.
func rsh64Uto8U(v uint64) uint8 {
x := uint8(v)
// riscv64:"MOVBU"
if x > 8 {
// riscv64:"SLLI","SRLI"
x >>= 2
}
return x
}
// rsh64to32 truncates v to int32 and, when the truncated value is greater
// than 8, shifts it right by 2 (a signed shift) before returning it.
//
// On riscv64 the directives expect a MOVW for the comparison line, and expect
// the shift itself to be a SRAIW with no additional MOVW or SLLI.
func rsh64to32(v int64) int32 {
x := int32(v)
// riscv64:"MOVW"
if x > 8 {
// riscv64:"SRAIW",-"MOVW",-"SLLI"
x >>= 2
}
return x
}
// rsh64to16 truncates v to int16 and, when the truncated value is greater
// than 8, shifts it right by 2 (a signed shift) before returning it.
//
// On riscv64 the directives expect a MOVH for the comparison line, and expect
// the shift to be emitted as an SLLI and SRAI pair.
func rsh64to16(v int64) int16 {
x := int16(v)
// riscv64:"MOVH"
if x > 8 {
// riscv64:"SLLI","SRAI"
x >>= 2
}
return x
}
// rsh64to8 truncates v to int8 and, when the truncated value is greater than
// 8, shifts it right by 2 (a signed shift) before returning it.
//
// On riscv64 the directives expect a MOVB for the comparison line, and expect
// the shift to be emitted as an SLLI and SRAI pair.
func rsh64to8(v int64) int8 {
x := int8(v)
// riscv64:"MOVB"
if x > 8 {
// riscv64:"SLLI","SRAI"
x >>= 2
}
return x
}
// signedModShift returns 1 shifted left by i % 64.
//
// Because the shift count is the remainder of a division by 64, its magnitude
// is always below the 64-bit width of the result, so we don't need to worry
// about shifting more than the type size.
// (There is still a negative shift test, but no shift-too-big test: the
// remainder is negative when i is negative.)
//
// On arm64 the directive expects neither CMP nor CSEL on the shift line.
func signedModShift(i int) int64 {
// arm64:-"CMP",-"CSEL"
return 1 << (i % 64)
}