cmd/compile: intrinsify math/bits.Len on riscv64

For riscv64/rva22u64 and above, we can intrinsify math/bits.Len using the
CLZ/CLZW machine instructions.

On a StarFive VisionFive 2 with GORISCV64=rva22u64:

                 │   clz.b.1   │               clz.b.2               │
                 │   sec/op    │   sec/op     vs base                │
LeadingZeros-4     28.89n ± 0%   12.08n ± 0%  -58.19% (p=0.000 n=10)
LeadingZeros8-4    18.79n ± 0%   14.76n ± 0%  -21.45% (p=0.000 n=10)
LeadingZeros16-4   25.27n ± 0%   14.76n ± 0%  -41.59% (p=0.000 n=10)
LeadingZeros32-4   25.12n ± 0%   12.08n ± 0%  -51.92% (p=0.000 n=10)
LeadingZeros64-4   25.89n ± 0%   12.08n ± 0%  -53.35% (p=0.000 n=10)
geomean            24.55n        13.09n       -46.70%

Change-Id: I0dda684713dbdf5336af393f5ccbdae861c4f694
Reviewed-on: https://go-review.googlesource.com/c/go/+/652321
Reviewed-by: David Chase <drchase@google.com>
Reviewed-by: Meng Zhuo <mengzhuo1203@gmail.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Mark Ryan <markdryan@rivosinc.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
This commit is contained in:
Joel Sing 2025-02-24 00:27:34 +11:00
parent af133d86e4
commit b70244ff7a
8 changed files with 195 additions and 31 deletions

View file

@ -15,60 +15,70 @@ import "math/bits"
// LeadingZeros returns bits.LeadingZeros(n) for a platform-sized uint.
//
// NOTE(review): the arch-tagged comments in the body appear to be per-target
// assembly expectations attached to the return statement (a leading minus
// presumably marks an instruction that must be absent). Treat them as test
// data and keep them byte-exact. The s390x, wasm and arm/arm64 entries occur
// more than once below; presumably redundant, confirm before deduplicating.
func LeadingZeros(n uint) int {
// amd64/v1,amd64/v2:"BSRQ"
// amd64/v3:"LZCNTQ", -"BSRQ"
// s390x:"FLOGR"
// arm:"CLZ" arm64:"CLZ"
// arm64:"CLZ"
// arm:"CLZ"
// loong64:"CLZV",-"SUB"
// mips:"CLZ"
// wasm:"I64Clz"
// ppc64x:"CNTLZD"
// riscv64/rva22u64,riscv64/rva23u64:"CLZ\t",-"SUB"
// s390x:"FLOGR"
// wasm:"I64Clz"
return bits.LeadingZeros(n)
}
// LeadingZeros64 returns bits.LeadingZeros64(n).
//
// NOTE(review): the arch-tagged comments in the body look like per-target
// assembly expectations for the return statement; on riscv64 they ask for a
// CLZ and the absence of ADDI, presumably meaning the count needs no
// follow-up adjustment. Keep them byte-exact. Several entries (s390x, wasm,
// arm, arm64) appear twice; confirm whether the repeats are intentional.
func LeadingZeros64(n uint64) int {
// amd64/v1,amd64/v2:"BSRQ"
// amd64/v3:"LZCNTQ", -"BSRQ"
// s390x:"FLOGR"
// arm:"CLZ" arm64:"CLZ"
// arm:"CLZ"
// arm64:"CLZ"
// loong64:"CLZV",-"SUB"
// mips:"CLZ"
// wasm:"I64Clz"
// ppc64x:"CNTLZD"
// riscv64/rva22u64,riscv64/rva23u64:"CLZ\t",-"ADDI"
// s390x:"FLOGR"
// wasm:"I64Clz"
return bits.LeadingZeros64(n)
}
// LeadingZeros32 returns bits.LeadingZeros32(n).
//
// NOTE(review): the arch-tagged comments in the body look like per-target
// assembly expectations for the return statement. The riscv64 entry names
// CLZW rather than CLZ and excludes ADDI, presumably because the word-sized
// instruction already yields the 32-bit count. Keep the entries byte-exact;
// s390x, wasm and arm/arm64 are listed more than once, so confirm the
// repeats before removing any.
func LeadingZeros32(n uint32) int {
// amd64/v1,amd64/v2:"BSRQ","LEAQ",-"CMOVQEQ"
// amd64/v3: "LZCNTL",- "BSRL"
// s390x:"FLOGR"
// arm:"CLZ" arm64:"CLZW"
// arm:"CLZ"
// arm64:"CLZW"
// loong64:"CLZW",-"SUB"
// mips:"CLZ"
// wasm:"I64Clz"
// ppc64x:"CNTLZW"
// riscv64/rva22u64,riscv64/rva23u64:"CLZW",-"ADDI"
// s390x:"FLOGR"
// wasm:"I64Clz"
return bits.LeadingZeros32(n)
}
// LeadingZeros16 returns bits.LeadingZeros16(n).
//
// NOTE(review): the arch-tagged comments in the body look like per-target
// assembly expectations for the return statement. The riscv64 entry asks for
// CLZ plus an ADDI of -48 and no NEG; presumably the 64-bit count of a
// zero-extended 16-bit value over-counts by 64-16 = 48 (confirm against the
// lowering rules). Keep the entries byte-exact; some targets are listed twice.
func LeadingZeros16(n uint16) int {
// amd64/v1,amd64/v2:"BSRL","LEAL",-"CMOVQEQ"
// amd64/v3: "LZCNTL",- "BSRL"
// s390x:"FLOGR"
// arm:"CLZ" arm64:"CLZ"
// arm64:"CLZ"
// arm:"CLZ"
// loong64:"CLZV"
// mips:"CLZ"
// wasm:"I64Clz"
// ppc64x:"CNTLZD"
// riscv64/rva22u64,riscv64/rva23u64:"CLZ\t","ADDI\t\\$-48",-"NEG"
// s390x:"FLOGR"
// wasm:"I64Clz"
return bits.LeadingZeros16(n)
}
// LeadingZeros8 returns bits.LeadingZeros8(n).
//
// NOTE(review): the arch-tagged comments in the body look like per-target
// assembly expectations for the return statement. The riscv64 entry asks for
// CLZ plus an ADDI of -56 and no NEG; presumably the 64-bit count of a
// zero-extended 8-bit value over-counts by 64-8 = 56 (confirm against the
// lowering rules). Keep the entries byte-exact; some targets are listed twice.
func LeadingZeros8(n uint8) int {
// amd64/v1,amd64/v2:"BSRL","LEAL",-"CMOVQEQ"
// amd64/v3: "LZCNTL",- "BSRL"
// s390x:"FLOGR"
// arm:"CLZ" arm64:"CLZ"
// arm64:"CLZ"
// arm:"CLZ"
// loong64:"CLZV"
// mips:"CLZ"
// wasm:"I64Clz"
// ppc64x:"CNTLZD"
// riscv64/rva22u64,riscv64/rva23u64:"CLZ\t","ADDI\t\\$-56",-"NEG"
// s390x:"FLOGR"
// wasm:"I64Clz"
return bits.LeadingZeros8(n)
}
@ -79,30 +89,35 @@ func LeadingZeros8(n uint8) int {
// Len returns bits.Len(n) for a platform-sized uint.
//
// NOTE(review): the arch-tagged comments in the body look like per-target
// assembly expectations for the return statement. The riscv64 entry asks for
// CLZ together with an ADDI of -64, presumably the length being derived from
// the leading-zero count and the 64-bit width. Keep the entries byte-exact;
// s390x, wasm and arm/arm64 appear more than once.
func Len(n uint) int {
// amd64/v1,amd64/v2:"BSRQ"
// amd64/v3: "LZCNTQ"
// s390x:"FLOGR"
// arm:"CLZ" arm64:"CLZ"
// arm64:"CLZ"
// arm:"CLZ"
// loong64:"CLZV"
// mips:"CLZ"
// wasm:"I64Clz"
// ppc64x:"SUBC","CNTLZD"
// riscv64/rva22u64,riscv64/rva23u64:"CLZ\t","ADDI\t\\$-64"
// s390x:"FLOGR"
// wasm:"I64Clz"
return bits.Len(n)
}
// Len64 returns bits.Len64(n).
//
// NOTE(review): the arch-tagged comments in the body look like per-target
// assembly expectations for the return statement; the riscv64 entry expects
// CLZ and an ADDI of -64. Keep every entry byte-exact. Some targets (s390x,
// wasm, arm, arm64) are listed twice; confirm before deduplicating.
func Len64(n uint64) int {
// amd64/v1,amd64/v2:"BSRQ"
// amd64/v3: "LZCNTQ"
// s390x:"FLOGR"
// arm:"CLZ" arm64:"CLZ"
// arm64:"CLZ"
// arm:"CLZ"
// loong64:"CLZV"
// mips:"CLZ"
// wasm:"I64Clz"
// ppc64x:"SUBC","CNTLZD"
// riscv64/rva22u64,riscv64/rva23u64:"CLZ\t","ADDI\t\\$-64"
// s390x:"FLOGR"
// wasm:"I64Clz"
return bits.Len64(n)
}
// SubFromLen64 returns 64 - bits.Len64(n), which for a uint64 is the number
// of leading zero bits.
//
// NOTE(review): the arch-tagged comments in the body look like assembly
// expectations for the return statement. Each listed target requires its
// count-leading-zeros instruction and excludes the add/subtract/negate forms,
// presumably asserting that the subtraction is folded away entirely.
func SubFromLen64(n uint64) int {
// loong64:"CLZV",-"ADD"
// ppc64x:"CNTLZD",-"SUBC"
// riscv64/rva22u64,riscv64/rva23u64:"CLZ\t",-"ADDI",-"NEG"
return 64 - bits.Len64(n)
}
@ -114,36 +129,42 @@ func CompareWithLen64(n uint64) bool {
// Len32 returns bits.Len32(n).
//
// NOTE(review): the arch-tagged comments in the body look like per-target
// assembly expectations for the return statement. The riscv64 entry names
// the word-sized CLZW with an ADDI of -32, presumably the length being
// derived from the 32-bit leading-zero count. Keep the entries byte-exact;
// s390x, wasm and arm/arm64 appear more than once.
func Len32(n uint32) int {
// amd64/v1,amd64/v2:"BSRQ","LEAQ",-"CMOVQEQ"
// amd64/v3: "LZCNTL"
// s390x:"FLOGR"
// arm:"CLZ" arm64:"CLZ"
// arm64:"CLZ"
// arm:"CLZ"
// loong64:"CLZW"
// mips:"CLZ"
// wasm:"I64Clz"
// ppc64x: "CNTLZW"
// riscv64/rva22u64,riscv64/rva23u64:"CLZW","ADDI\t\\$-32"
// s390x:"FLOGR"
// wasm:"I64Clz"
return bits.Len32(n)
}
// Len16 returns bits.Len16(n).
//
// NOTE(review): the arch-tagged comments in the body look like per-target
// assembly expectations for the return statement. The riscv64 entry expects
// the 64-bit CLZ with an ADDI of -64 (not a 16-bit-specific constant),
// presumably because the operand is zero-extended first; confirm against the
// lowering rules. Keep the entries byte-exact; some targets are listed twice.
func Len16(n uint16) int {
// amd64/v1,amd64/v2:"BSRL","LEAL",-"CMOVQEQ"
// amd64/v3: "LZCNTL"
// s390x:"FLOGR"
// arm:"CLZ" arm64:"CLZ"
// arm64:"CLZ"
// arm:"CLZ"
// loong64:"CLZV"
// mips:"CLZ"
// wasm:"I64Clz"
// ppc64x:"SUBC","CNTLZD"
// riscv64/rva22u64,riscv64/rva23u64:"CLZ\t","ADDI\t\\$-64"
// s390x:"FLOGR"
// wasm:"I64Clz"
return bits.Len16(n)
}
// Len8 returns bits.Len8(n).
//
// NOTE(review): the arch-tagged comments in the body look like per-target
// assembly expectations for the return statement. The riscv64 entry expects
// the 64-bit CLZ with an ADDI of -64 (not an 8-bit-specific constant),
// presumably because the operand is zero-extended first; confirm against the
// lowering rules. Keep the entries byte-exact; some targets are listed twice.
func Len8(n uint8) int {
// amd64/v1,amd64/v2:"BSRL","LEAL",-"CMOVQEQ"
// amd64/v3: "LZCNTL"
// s390x:"FLOGR"
// arm:"CLZ" arm64:"CLZ"
// arm64:"CLZ"
// arm:"CLZ"
// loong64:"CLZV"
// mips:"CLZ"
// wasm:"I64Clz"
// ppc64x:"SUBC","CNTLZD"
// riscv64/rva22u64,riscv64/rva23u64:"CLZ\t","ADDI\t\\$-64"
// s390x:"FLOGR"
// wasm:"I64Clz"
return bits.Len8(n)
}
@ -451,6 +472,7 @@ func IterateBits64(n uint64) int {
for n != 0 {
// amd64/v1,amd64/v2:"BSFQ",-"CMOVEQ"
// amd64/v3:"TZCNTQ"
// riscv64/rva22u64,riscv64/rva23u64: "CTZ\t"
i += bits.TrailingZeros64(n)
n &= n - 1
}
@ -462,6 +484,7 @@ func IterateBits32(n uint32) int {
for n != 0 {
// amd64/v1,amd64/v2:"BSFL",-"BTSQ"
// amd64/v3:"TZCNTL"
// riscv64/rva22u64,riscv64/rva23u64: "CTZ\t"
i += bits.TrailingZeros32(n)
n &= n - 1
}