cmd/compile: wire up math/bits.Len intrinsics for loong64

For the SubFromLen64 codegen test case to work as intended, we need
to fold c-(-(x-d)) into x+(c-d).

Still, some instances of LeadingZeros are not optimized into single
CLZ instructions right now (actually, the LeadingZeros micro-benchmarks
are currently still compiled with redundant adds/subs of 64, due to
interference of loop optimizations before lowering), but perf numbers
indicate it's not that bad after all.

Micro-benchmark results on Loongson 3A5000 and 3A6000:

goos: linux
goarch: loong64
pkg: math/bits
cpu: Loongson-3A5000 @ 2500.00MHz
               |  bench.old  |              bench.new              |
               |   sec/op    |   sec/op     vs base                |
LeadingZeros     3.660n ± 0%   1.348n ± 0%  -63.17% (p=0.000 n=20)
LeadingZeros8    1.777n ± 0%   1.767n ± 0%   -0.56% (p=0.000 n=20)
LeadingZeros16   2.816n ± 0%   1.770n ± 0%  -37.14% (p=0.000 n=20)
LeadingZeros32   5.293n ± 1%   1.683n ± 0%  -68.21% (p=0.000 n=20)
LeadingZeros64   3.622n ± 0%   1.349n ± 0%  -62.76% (p=0.000 n=20)
geomean          3.229n        1.571n       -51.35%

goos: linux
goarch: loong64
pkg: math/bits
cpu: Loongson-3A6000 @ 2500.00MHz
               |  bench.old   |              bench.new               |
               |    sec/op    |    sec/op     vs base                |
LeadingZeros      2.410n ± 0%    1.103n ± 1%  -54.23% (p=0.000 n=20)
LeadingZeros8     1.236n ± 0%    1.501n ± 0%  +21.44% (p=0.000 n=20)
LeadingZeros16    2.106n ± 0%    1.501n ± 0%  -28.73% (p=0.000 n=20)
LeadingZeros32    2.860n ± 0%    1.324n ± 0%  -53.72% (p=0.000 n=20)
LeadingZeros64   2.6135n ± 0%   0.9509n ± 0%  -63.62% (p=0.000 n=20)
geomean           2.159n         1.256n       -41.81%

Updates #59120

This patch is a copy of CL 483356.
Co-authored-by: WANG Xuerui <git@xen0n.name>

Change-Id: Iee81a17f7da06d77a427e73dfcc016f2b15ae556
Reviewed-on: https://go-review.googlesource.com/c/go/+/624575
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: David Chase <drchase@google.com>
Reviewed-by: Carlos Amedee <carlos@golang.org>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
This commit is contained in:
Xiaolin Zhao 2024-11-02 10:59:20 +08:00 committed by abner chenc
parent 671f2841cb
commit d98c51809d
8 changed files with 159 additions and 5 deletions

View file

@ -17,6 +17,7 @@ func LeadingZeros(n uint) int {
// amd64/v3:"LZCNTQ", -"BSRQ"
// s390x:"FLOGR"
// arm:"CLZ" arm64:"CLZ"
// loong64:"CLZV",-"SUB"
// mips:"CLZ"
// wasm:"I64Clz"
// ppc64x:"CNTLZD"
@ -28,6 +29,7 @@ func LeadingZeros64(n uint64) int {
// amd64/v3:"LZCNTQ", -"BSRQ"
// s390x:"FLOGR"
// arm:"CLZ" arm64:"CLZ"
// loong64:"CLZV",-"SUB"
// mips:"CLZ"
// wasm:"I64Clz"
// ppc64x:"CNTLZD"
@ -39,6 +41,7 @@ func LeadingZeros32(n uint32) int {
// amd64/v3: "LZCNTL",- "BSRL"
// s390x:"FLOGR"
// arm:"CLZ" arm64:"CLZW"
// loong64:"CLZW",-"SUB"
// mips:"CLZ"
// wasm:"I64Clz"
// ppc64x:"CNTLZW"
@ -50,6 +53,7 @@ func LeadingZeros16(n uint16) int {
// amd64/v3: "LZCNTL",- "BSRL"
// s390x:"FLOGR"
// arm:"CLZ" arm64:"CLZ"
// loong64:"CLZV"
// mips:"CLZ"
// wasm:"I64Clz"
// ppc64x:"CNTLZD"
@ -61,6 +65,7 @@ func LeadingZeros8(n uint8) int {
// amd64/v3: "LZCNTL",- "BSRL"
// s390x:"FLOGR"
// arm:"CLZ" arm64:"CLZ"
// loong64:"CLZV"
// mips:"CLZ"
// wasm:"I64Clz"
// ppc64x:"CNTLZD"
@ -76,6 +81,7 @@ func Len(n uint) int {
// amd64/v3: "LZCNTQ"
// s390x:"FLOGR"
// arm:"CLZ" arm64:"CLZ"
// loong64:"CLZV"
// mips:"CLZ"
// wasm:"I64Clz"
// ppc64x:"SUBC","CNTLZD"
@ -87,6 +93,7 @@ func Len64(n uint64) int {
// amd64/v3: "LZCNTQ"
// s390x:"FLOGR"
// arm:"CLZ" arm64:"CLZ"
// loong64:"CLZV"
// mips:"CLZ"
// wasm:"I64Clz"
// ppc64x:"SUBC","CNTLZD"
@ -94,15 +101,22 @@ func Len64(n uint64) int {
}
func SubFromLen64(n uint64) int {
// loong64:"CLZV",-"ADD"
// ppc64x:"CNTLZD",-"SUBC"
return 64 - bits.Len64(n)
}
func CompareWithLen64(n uint64) bool {
// loong64:"CLZV",-"ADD",-"[$]64",-"[$]9"
return bits.Len64(n) < 9
}
func Len32(n uint32) int {
// amd64/v1,amd64/v2:"BSRQ","LEAQ",-"CMOVQEQ"
// amd64/v3: "LZCNTL"
// s390x:"FLOGR"
// arm:"CLZ" arm64:"CLZ"
// loong64:"CLZW"
// mips:"CLZ"
// wasm:"I64Clz"
// ppc64x: "CNTLZW"
@ -114,6 +128,7 @@ func Len16(n uint16) int {
// amd64/v3: "LZCNTL"
// s390x:"FLOGR"
// arm:"CLZ" arm64:"CLZ"
// loong64:"CLZV"
// mips:"CLZ"
// wasm:"I64Clz"
// ppc64x:"SUBC","CNTLZD"
@ -125,6 +140,7 @@ func Len8(n uint8) int {
// amd64/v3: "LZCNTL"
// s390x:"FLOGR"
// arm:"CLZ" arm64:"CLZ"
// loong64:"CLZV"
// mips:"CLZ"
// wasm:"I64Clz"
// ppc64x:"SUBC","CNTLZD"