go/src/cmd/compile/internal/ssagen/intrinsics.go

1994 lines
80 KiB
Go
Raw Normal View History

// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package ssagen
import (
"fmt"
cmd/compile: intrinsify swissmap match calls with SIMD on amd64 Use similar SIMD operations to the ones used in Abseil. We still using 8-slot groups (even though the XMM registers could handle 16-slot groups) to keep the implementation simpler (no changes to the memory layout of maps). Still, the implementations of matchH2 and matchEmpty are shorter than the portable version using standard arithmetic operations. They also return a packed bitset, which avoids the need to shift in bitset.first. That said, the packed bitset is a downside in cognitive complexity, as we have to think about two different possible representations. This doesn't leak out of the API, but we do need to intrinsify bitset to switch to a compatible implementation. The compiler's intrinsics don't support intrinsifying methods, so the implementations move to free functions. This makes operations between 0-3% faster on my machine. e.g., MapGetHit/impl=runtimeMap/t=Int64/len=6-12 12.34n ± 1% 11.42n ± 1% -7.46% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=12-12 15.14n ± 2% 14.88n ± 1% -1.72% (p=0.009 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=18-12 15.04n ± 6% 14.66n ± 2% -2.53% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=24-12 15.80n ± 1% 15.48n ± 3% ~ (p=0.444 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=30-12 15.55n ± 4% 14.77n ± 3% -5.02% (p=0.004 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=64-12 15.26n ± 1% 15.05n ± 1% ~ (p=0.055 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=128-12 15.34n ± 1% 15.02n ± 2% -2.09% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=256-12 15.42n ± 1% 15.15n ± 1% -1.75% (p=0.001 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=512-12 15.48n ± 1% 15.18n ± 1% -1.94% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=1024-12 17.38n ± 1% 17.05n ± 1% -1.90% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=2048-12 17.96n ± 0% 17.59n ± 1% -2.06% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=4096-12 18.36n ± 1% 18.18n ± 1% -0.98% (p=0.013 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=8192-12 18.75n ± 0% 18.31n ± 1% -2.35% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=65536-12 26.25n ± 0% 25.95n ± 1% -1.14% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=262144-12 44.24n ± 1% 44.06n ± 1% ~ (p=0.181 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=1048576-12 85.02n ± 0% 85.35n ± 0% +0.39% (p=0.032 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=4194304-12 98.87n ± 1% 98.85n ± 1% ~ (p=0.799 n=25) For #54766. Cq-Include-Trybots: luci.golang.try:gotip-linux-ppc64_power10,gotip-linux-amd64-goamd64v3 Change-Id: Ic1b852f02744404122cb3672900fd95f4625905e Reviewed-on: https://go-review.googlesource.com/c/go/+/626277 Reviewed-by: Keith Randall <khr@golang.org> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Auto-Submit: Michael Pratt <mpratt@google.com> Reviewed-by: Keith Randall <khr@google.com>
2024-11-04 12:41:33 -05:00
"internal/abi"
"internal/buildcfg"
"cmd/compile/internal/base"
"cmd/compile/internal/ir"
"cmd/compile/internal/ssa"
"cmd/compile/internal/typecheck"
"cmd/compile/internal/types"
"cmd/internal/sys"
)
var intrinsics intrinsicBuilders
// An intrinsicBuilder converts a call node n into an ssa value that
// implements that call as an intrinsic. args is a list of arguments to the func.
type intrinsicBuilder func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value
type intrinsicKey struct {
arch *sys.Arch
pkg string
fn string
}
// intrinsicBuildConfig specifies the config to use for intrinsic building.
type intrinsicBuildConfig struct {
instrumenting bool
go386 string
goamd64 int
goarm buildcfg.GoarmFeatures
goarm64 buildcfg.Goarm64Features
gomips string
gomips64 string
goppc64 int
goriscv64 int
}
type intrinsicBuilders map[intrinsicKey]intrinsicBuilder
// add adds the intrinsic builder b for pkg.fn for the given architecture.
func (ib intrinsicBuilders) add(arch *sys.Arch, pkg, fn string, b intrinsicBuilder) {
if _, found := ib[intrinsicKey{arch, pkg, fn}]; found {
panic(fmt.Sprintf("intrinsic already exists for %v.%v on %v", pkg, fn, arch.Name))
}
ib[intrinsicKey{arch, pkg, fn}] = b
}
// addForArchs adds the intrinsic builder b for pkg.fn for the given architectures.
func (ib intrinsicBuilders) addForArchs(pkg, fn string, b intrinsicBuilder, archs ...*sys.Arch) {
for _, arch := range archs {
ib.add(arch, pkg, fn, b)
}
}
// addForFamilies does the same as addForArchs but operates on architecture families.
func (ib intrinsicBuilders) addForFamilies(pkg, fn string, b intrinsicBuilder, archFamilies ...sys.ArchFamily) {
for _, arch := range sys.Archs {
if arch.InFamily(archFamilies...) {
intrinsics.add(arch, pkg, fn, b)
}
}
}
// alias aliases pkg.fn to targetPkg.targetFn for all architectures in archs
// for which targetPkg.targetFn already exists.
func (ib intrinsicBuilders) alias(pkg, fn, targetPkg, targetFn string, archs ...*sys.Arch) {
// TODO(jsing): Consider making this work even if the alias is added
// before the intrinsic.
aliased := false
for _, arch := range archs {
if b := intrinsics.lookup(arch, targetPkg, targetFn); b != nil {
intrinsics.add(arch, pkg, fn, b)
aliased = true
}
}
if !aliased {
panic(fmt.Sprintf("attempted to alias undefined intrinsic: %s.%s", pkg, fn))
}
}
// lookup looks up the intrinsic for a pkg.fn on the specified architecture.
func (ib intrinsicBuilders) lookup(arch *sys.Arch, pkg, fn string) intrinsicBuilder {
return intrinsics[intrinsicKey{arch, pkg, fn}]
}
func initIntrinsics(cfg *intrinsicBuildConfig) {
if cfg == nil {
cfg = &intrinsicBuildConfig{
instrumenting: base.Flag.Cfg.Instrumenting,
go386: buildcfg.GO386,
goamd64: buildcfg.GOAMD64,
goarm: buildcfg.GOARM,
goarm64: buildcfg.GOARM64,
gomips: buildcfg.GOMIPS,
gomips64: buildcfg.GOMIPS64,
goppc64: buildcfg.GOPPC64,
goriscv64: buildcfg.GORISCV64,
}
}
intrinsics = intrinsicBuilders{}
var p4 []*sys.Arch
var p8 []*sys.Arch
var lwatomics []*sys.Arch
for _, a := range sys.Archs {
if a.PtrSize == 4 {
p4 = append(p4, a)
} else {
p8 = append(p8, a)
}
if a.Family != sys.PPC64 {
lwatomics = append(lwatomics, a)
}
}
all := sys.Archs[:]
add := func(pkg, fn string, b intrinsicBuilder, archs ...*sys.Arch) {
intrinsics.addForArchs(pkg, fn, b, archs...)
}
addF := func(pkg, fn string, b intrinsicBuilder, archFamilies ...sys.ArchFamily) {
intrinsics.addForFamilies(pkg, fn, b, archFamilies...)
}
alias := func(pkg, fn, pkg2, fn2 string, archs ...*sys.Arch) {
intrinsics.alias(pkg, fn, pkg2, fn2, archs...)
}
/******** runtime ********/
if !cfg.instrumenting {
add("runtime", "slicebytetostringtmp",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
// Compiler frontend optimizations emit OBYTES2STRTMP nodes
// for the backend instead of slicebytetostringtmp calls
// when not instrumenting.
return s.newValue2(ssa.OpStringMake, n.Type(), args[0], args[1])
},
all...)
}
addF("internal/runtime/math", "MulUintptr",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
if s.config.PtrSize == 4 {
return s.newValue2(ssa.OpMul32uover, types.NewTuple(types.Types[types.TUINT], types.Types[types.TUINT]), args[0], args[1])
}
return s.newValue2(ssa.OpMul64uover, types.NewTuple(types.Types[types.TUINT], types.Types[types.TUINT]), args[0], args[1])
},
sys.AMD64, sys.I386, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.ARM64)
add("runtime", "KeepAlive",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
data := s.newValue1(ssa.OpIData, s.f.Config.Types.BytePtr, args[0])
s.vars[memVar] = s.newValue2(ssa.OpKeepAlive, types.TypeMem, data, s.mem())
return nil
},
all...)
addF("runtime", "publicationBarrier",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
s.vars[memVar] = s.newValue1(ssa.OpPubBarrier, types.TypeMem, s.mem())
return nil
},
sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64)
/******** internal/runtime/sys ********/
add("internal/runtime/sys", "GetCallerPC",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue0(ssa.OpGetCallerPC, s.f.Config.Types.Uintptr)
},
all...)
add("internal/runtime/sys", "GetCallerSP",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpGetCallerSP, s.f.Config.Types.Uintptr, s.mem())
},
all...)
add("internal/runtime/sys", "GetClosurePtr",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue0(ssa.OpGetClosurePtr, s.f.Config.Types.Uintptr)
},
all...)
addF("internal/runtime/sys", "Bswap32",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpBswap32, types.Types[types.TUINT32], args[0])
},
sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.Loong64, sys.S390X)
addF("internal/runtime/sys", "Bswap64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0])
},
sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.Loong64, sys.S390X)
if cfg.goppc64 >= 10 {
// Use only on Power10 as the new byte reverse instructions that Power10 provide
// make it worthwhile as an intrinsic
addF("internal/runtime/sys", "Bswap32",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpBswap32, types.Types[types.TUINT32], args[0])
},
sys.PPC64)
addF("internal/runtime/sys", "Bswap64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0])
},
sys.PPC64)
}
if cfg.goriscv64 >= 22 {
addF("internal/runtime/sys", "Bswap32",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpBswap32, types.Types[types.TUINT32], args[0])
},
sys.RISCV64)
addF("internal/runtime/sys", "Bswap64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpBswap64, types.Types[types.TUINT64], args[0])
},
sys.RISCV64)
}
/****** Prefetch ******/
makePrefetchFunc := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
s.vars[memVar] = s.newValue2(op, types.TypeMem, args[0], s.mem())
return nil
}
}
// Make Prefetch intrinsics for supported platforms
// On the unsupported platforms stub function will be eliminated
addF("internal/runtime/sys", "Prefetch", makePrefetchFunc(ssa.OpPrefetchCache),
sys.AMD64, sys.ARM64, sys.Loong64, sys.PPC64)
addF("internal/runtime/sys", "PrefetchStreamed", makePrefetchFunc(ssa.OpPrefetchCacheStreamed),
sys.AMD64, sys.ARM64, sys.Loong64, sys.PPC64)
/******** internal/runtime/atomic ********/
cmd/compiler,internal/runtime/atomic: optimize Store{64,32,8} on loong64 On Loong64, AMSWAPDB{W,V} instructions are supported by default, and AMSWAPDB{B,H} [1] is a new instruction added by LA664(Loongson 3A6000) and later microarchitectures. Therefore, AMSWAPDB{W,V} (full barrier) is used to implement AtomicStore{32,64}, and the traditional MOVB or the new AMSWAPDBB is used to implement AtomicStore8 according to the CPU feature. The StoreRelease barrier on Loong64 is "dbar 0x12", but it is still necessary to ensure consistency in the order of Store/Load [2]. LoweredAtomicStorezero{32,64} was removed because on loong64 the constant "0" uses the R0 register, and there is no performance difference between the implementations of LoweredAtomicStorezero{32,64} and LoweredAtomicStore{32,64}. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | AtomicStore64 19.61n ± 0% 13.61n ± 0% -30.60% (p=0.000 n=20) AtomicStore64-2 19.61n ± 0% 13.61n ± 0% -30.57% (p=0.000 n=20) AtomicStore64-4 19.62n ± 0% 13.61n ± 0% -30.63% (p=0.000 n=20) AtomicStore 19.61n ± 0% 13.61n ± 0% -30.60% (p=0.000 n=20) AtomicStore-2 19.62n ± 0% 13.61n ± 0% -30.63% (p=0.000 n=20) AtomicStore-4 19.62n ± 0% 13.62n ± 0% -30.58% (p=0.000 n=20) AtomicStore8 19.61n ± 0% 20.01n ± 0% +2.04% (p=0.000 n=20) AtomicStore8-2 19.62n ± 0% 20.02n ± 0% +2.01% (p=0.000 n=20) AtomicStore8-4 19.61n ± 0% 20.02n ± 0% +2.09% (p=0.000 n=20) geomean 19.61n 15.48n -21.08% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | AtomicStore64 18.03n ± 0% 12.81n ± 0% -28.93% (p=0.000 n=20) AtomicStore64-2 18.02n ± 0% 12.81n ± 0% -28.91% (p=0.000 n=20) AtomicStore64-4 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore 18.02n ± 0% 12.81n ± 0% -28.91% (p=0.000 n=20) AtomicStore-2 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore-4 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore8 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore8-2 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore8-4 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) geomean 18.01n 12.81n -28.89% [1]: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html [2]: https://gcc.gnu.org/git/?p=gcc.git;a=blob_plain;f=gcc/config/loongarch/sync.md Change-Id: I4ae5e8dd0e6f026129b6e503990a763ed40c6097 Reviewed-on: https://go-review.googlesource.com/c/go/+/581356 Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn> Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Meidan Li <limeidan@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: David Chase <drchase@google.com>
2024-09-13 18:47:56 +08:00
type atomicOpEmitter func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool)
addF("internal/runtime/atomic", "Load",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
v := s.newValue2(ssa.OpAtomicLoad32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], s.mem())
s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v)
},
sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
addF("internal/runtime/atomic", "Load8",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
v := s.newValue2(ssa.OpAtomicLoad8, types.NewTuple(types.Types[types.TUINT8], types.TypeMem), args[0], s.mem())
s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT8], v)
},
sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
addF("internal/runtime/atomic", "Load64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
v := s.newValue2(ssa.OpAtomicLoad64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], s.mem())
s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v)
},
sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
addF("internal/runtime/atomic", "LoadAcq",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
v := s.newValue2(ssa.OpAtomicLoadAcq32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], s.mem())
s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v)
},
sys.PPC64)
addF("internal/runtime/atomic", "LoadAcq64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
v := s.newValue2(ssa.OpAtomicLoadAcq64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], s.mem())
s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v)
},
sys.PPC64)
addF("internal/runtime/atomic", "Loadp",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
v := s.newValue2(ssa.OpAtomicLoadPtr, types.NewTuple(s.f.Config.Types.BytePtr, types.TypeMem), args[0], s.mem())
s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
return s.newValue1(ssa.OpSelect0, s.f.Config.Types.BytePtr, v)
},
sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
addF("internal/runtime/atomic", "Store",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
s.vars[memVar] = s.newValue3(ssa.OpAtomicStore32, types.TypeMem, args[0], args[1], s.mem())
return nil
},
cmd/compiler,internal/runtime/atomic: optimize Store{64,32,8} on loong64 On Loong64, AMSWAPDB{W,V} instructions are supported by default, and AMSWAPDB{B,H} [1] is a new instruction added by LA664(Loongson 3A6000) and later microarchitectures. Therefore, AMSWAPDB{W,V} (full barrier) is used to implement AtomicStore{32,64}, and the traditional MOVB or the new AMSWAPDBB is used to implement AtomicStore8 according to the CPU feature. The StoreRelease barrier on Loong64 is "dbar 0x12", but it is still necessary to ensure consistency in the order of Store/Load [2]. LoweredAtomicStorezero{32,64} was removed because on loong64 the constant "0" uses the R0 register, and there is no performance difference between the implementations of LoweredAtomicStorezero{32,64} and LoweredAtomicStore{32,64}. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | AtomicStore64 19.61n ± 0% 13.61n ± 0% -30.60% (p=0.000 n=20) AtomicStore64-2 19.61n ± 0% 13.61n ± 0% -30.57% (p=0.000 n=20) AtomicStore64-4 19.62n ± 0% 13.61n ± 0% -30.63% (p=0.000 n=20) AtomicStore 19.61n ± 0% 13.61n ± 0% -30.60% (p=0.000 n=20) AtomicStore-2 19.62n ± 0% 13.61n ± 0% -30.63% (p=0.000 n=20) AtomicStore-4 19.62n ± 0% 13.62n ± 0% -30.58% (p=0.000 n=20) AtomicStore8 19.61n ± 0% 20.01n ± 0% +2.04% (p=0.000 n=20) AtomicStore8-2 19.62n ± 0% 20.02n ± 0% +2.01% (p=0.000 n=20) AtomicStore8-4 19.61n ± 0% 20.02n ± 0% +2.09% (p=0.000 n=20) geomean 19.61n 15.48n -21.08% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | AtomicStore64 18.03n ± 0% 12.81n ± 0% -28.93% (p=0.000 n=20) AtomicStore64-2 18.02n ± 0% 12.81n ± 0% -28.91% (p=0.000 n=20) AtomicStore64-4 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore 18.02n ± 0% 12.81n ± 0% -28.91% (p=0.000 n=20) AtomicStore-2 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore-4 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore8 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore8-2 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore8-4 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) geomean 18.01n 12.81n -28.89% [1]: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html [2]: https://gcc.gnu.org/git/?p=gcc.git;a=blob_plain;f=gcc/config/loongarch/sync.md Change-Id: I4ae5e8dd0e6f026129b6e503990a763ed40c6097 Reviewed-on: https://go-review.googlesource.com/c/go/+/581356 Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn> Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Meidan Li <limeidan@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: David Chase <drchase@google.com>
2024-09-13 18:47:56 +08:00
sys.AMD64, sys.ARM64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
addF("internal/runtime/atomic", "Store8",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
s.vars[memVar] = s.newValue3(ssa.OpAtomicStore8, types.TypeMem, args[0], args[1], s.mem())
return nil
},
cmd/compiler,internal/runtime/atomic: optimize Store{64,32,8} on loong64 On Loong64, AMSWAPDB{W,V} instructions are supported by default, and AMSWAPDB{B,H} [1] is a new instruction added by LA664(Loongson 3A6000) and later microarchitectures. Therefore, AMSWAPDB{W,V} (full barrier) is used to implement AtomicStore{32,64}, and the traditional MOVB or the new AMSWAPDBB is used to implement AtomicStore8 according to the CPU feature. The StoreRelease barrier on Loong64 is "dbar 0x12", but it is still necessary to ensure consistency in the order of Store/Load [2]. LoweredAtomicStorezero{32,64} was removed because on loong64 the constant "0" uses the R0 register, and there is no performance difference between the implementations of LoweredAtomicStorezero{32,64} and LoweredAtomicStore{32,64}. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | AtomicStore64 19.61n ± 0% 13.61n ± 0% -30.60% (p=0.000 n=20) AtomicStore64-2 19.61n ± 0% 13.61n ± 0% -30.57% (p=0.000 n=20) AtomicStore64-4 19.62n ± 0% 13.61n ± 0% -30.63% (p=0.000 n=20) AtomicStore 19.61n ± 0% 13.61n ± 0% -30.60% (p=0.000 n=20) AtomicStore-2 19.62n ± 0% 13.61n ± 0% -30.63% (p=0.000 n=20) AtomicStore-4 19.62n ± 0% 13.62n ± 0% -30.58% (p=0.000 n=20) AtomicStore8 19.61n ± 0% 20.01n ± 0% +2.04% (p=0.000 n=20) AtomicStore8-2 19.62n ± 0% 20.02n ± 0% +2.01% (p=0.000 n=20) AtomicStore8-4 19.61n ± 0% 20.02n ± 0% +2.09% (p=0.000 n=20) geomean 19.61n 15.48n -21.08% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | AtomicStore64 18.03n ± 0% 12.81n ± 0% -28.93% (p=0.000 n=20) AtomicStore64-2 18.02n ± 0% 12.81n ± 0% -28.91% (p=0.000 n=20) AtomicStore64-4 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore 18.02n ± 0% 12.81n ± 0% -28.91% (p=0.000 n=20) AtomicStore-2 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore-4 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore8 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore8-2 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore8-4 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) geomean 18.01n 12.81n -28.89% [1]: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html [2]: https://gcc.gnu.org/git/?p=gcc.git;a=blob_plain;f=gcc/config/loongarch/sync.md Change-Id: I4ae5e8dd0e6f026129b6e503990a763ed40c6097 Reviewed-on: https://go-review.googlesource.com/c/go/+/581356 Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn> Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Meidan Li <limeidan@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: David Chase <drchase@google.com>
2024-09-13 18:47:56 +08:00
sys.AMD64, sys.ARM64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
addF("internal/runtime/atomic", "Store64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
s.vars[memVar] = s.newValue3(ssa.OpAtomicStore64, types.TypeMem, args[0], args[1], s.mem())
return nil
},
cmd/compiler,internal/runtime/atomic: optimize Store{64,32,8} on loong64 On Loong64, AMSWAPDB{W,V} instructions are supported by default, and AMSWAPDB{B,H} [1] is a new instruction added by LA664(Loongson 3A6000) and later microarchitectures. Therefore, AMSWAPDB{W,V} (full barrier) is used to implement AtomicStore{32,64}, and the traditional MOVB or the new AMSWAPDBB is used to implement AtomicStore8 according to the CPU feature. The StoreRelease barrier on Loong64 is "dbar 0x12", but it is still necessary to ensure consistency in the order of Store/Load [2]. LoweredAtomicStorezero{32,64} was removed because on loong64 the constant "0" uses the R0 register, and there is no performance difference between the implementations of LoweredAtomicStorezero{32,64} and LoweredAtomicStore{32,64}. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | AtomicStore64 19.61n ± 0% 13.61n ± 0% -30.60% (p=0.000 n=20) AtomicStore64-2 19.61n ± 0% 13.61n ± 0% -30.57% (p=0.000 n=20) AtomicStore64-4 19.62n ± 0% 13.61n ± 0% -30.63% (p=0.000 n=20) AtomicStore 19.61n ± 0% 13.61n ± 0% -30.60% (p=0.000 n=20) AtomicStore-2 19.62n ± 0% 13.61n ± 0% -30.63% (p=0.000 n=20) AtomicStore-4 19.62n ± 0% 13.62n ± 0% -30.58% (p=0.000 n=20) AtomicStore8 19.61n ± 0% 20.01n ± 0% +2.04% (p=0.000 n=20) AtomicStore8-2 19.62n ± 0% 20.02n ± 0% +2.01% (p=0.000 n=20) AtomicStore8-4 19.61n ± 0% 20.02n ± 0% +2.09% (p=0.000 n=20) geomean 19.61n 15.48n -21.08% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | AtomicStore64 18.03n ± 0% 12.81n ± 0% -28.93% (p=0.000 n=20) AtomicStore64-2 18.02n ± 0% 12.81n ± 0% -28.91% (p=0.000 n=20) AtomicStore64-4 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore 18.02n ± 0% 12.81n ± 0% -28.91% (p=0.000 n=20) AtomicStore-2 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore-4 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore8 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore8-2 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore8-4 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) geomean 18.01n 12.81n -28.89% [1]: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html [2]: https://gcc.gnu.org/git/?p=gcc.git;a=blob_plain;f=gcc/config/loongarch/sync.md Change-Id: I4ae5e8dd0e6f026129b6e503990a763ed40c6097 Reviewed-on: https://go-review.googlesource.com/c/go/+/581356 Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn> Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Meidan Li <limeidan@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: David Chase <drchase@google.com>
2024-09-13 18:47:56 +08:00
sys.AMD64, sys.ARM64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
addF("internal/runtime/atomic", "StorepNoWB",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
s.vars[memVar] = s.newValue3(ssa.OpAtomicStorePtrNoWB, types.TypeMem, args[0], args[1], s.mem())
return nil
},
sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.RISCV64, sys.S390X)
addF("internal/runtime/atomic", "StoreRel",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
s.vars[memVar] = s.newValue3(ssa.OpAtomicStoreRel32, types.TypeMem, args[0], args[1], s.mem())
return nil
},
sys.PPC64)
addF("internal/runtime/atomic", "StoreRel64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
s.vars[memVar] = s.newValue3(ssa.OpAtomicStoreRel64, types.TypeMem, args[0], args[1], s.mem())
return nil
},
sys.PPC64)
cmd/compiler,internal/runtime/atomic: optimize Cas{64,32} on loong64 In Loongson's new microstructure LA664 (Loongson-3A6000) and later, the atomic compare-and-exchange instruction AMCAS[DB]{B,W,H,V} [1] is supported. Therefore, the implementation of the atomic operation compare-and-swap can be selected according to the CPUCFG flag LAMCAS: AMCASDB(full barrier) instruction is used on new microstructures, and traditional LL-SC is used on LA464 (Loongson-3A5000) and older microstructures. This can significantly improve the performance of Go programs on new microstructures. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | Cas 46.84n ± 0% 22.82n ± 0% -51.28% (p=0.000 n=20) Cas-2 47.58n ± 0% 29.57n ± 0% -37.85% (p=0.000 n=20) Cas-4 43.27n ± 20% 25.31n ± 13% -41.50% (p=0.000 n=20) Cas64 46.85n ± 0% 22.82n ± 0% -51.29% (p=0.000 n=20) Cas64-2 47.43n ± 0% 29.53n ± 0% -37.74% (p=0.002 n=20) Cas64-4 43.18n ± 0% 25.28n ± 2% -41.46% (p=0.000 n=20) geomean 45.82n 25.74n -43.82% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | Cas 50.05n ± 0% 51.26n ± 0% +2.42% (p=0.000 n=20) Cas-2 52.80n ± 0% 53.11n ± 0% +0.59% (p=0.000 n=20) Cas-4 55.97n ± 0% 57.31n ± 0% +2.39% (p=0.000 n=20) Cas64 50.05n ± 0% 51.26n ± 0% +2.42% (p=0.000 n=20) Cas64-2 52.68n ± 0% 53.11n ± 0% +0.82% (p=0.000 n=20) Cas64-4 55.96n ± 0% 57.26n ± 0% +2.33% (p=0.000 n=20) geomean 52.86n 53.83n +1.82% [1]: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html Change-Id: I9b777c63c124fb492f61c903f77061fa2b4e5322 Reviewed-on: https://go-review.googlesource.com/c/go/+/613396 Reviewed-by: Meidan Li <limeidan@loongson.cn> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
2024-09-20 11:06:18 +08:00
makeAtomicStoreGuardedIntrinsicLoong64 := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder {
cmd/compiler,internal/runtime/atomic: optimize Store{64,32,8} on loong64 On Loong64, AMSWAPDB{W,V} instructions are supported by default, and AMSWAPDB{B,H} [1] is a new instruction added by LA664(Loongson 3A6000) and later microarchitectures. Therefore, AMSWAPDB{W,V} (full barrier) is used to implement AtomicStore{32,64}, and the traditional MOVB or the new AMSWAPDBB is used to implement AtomicStore8 according to the CPU feature. The StoreRelease barrier on Loong64 is "dbar 0x12", but it is still necessary to ensure consistency in the order of Store/Load [2]. LoweredAtomicStorezero{32,64} was removed because on loong64 the constant "0" uses the R0 register, and there is no performance difference between the implementations of LoweredAtomicStorezero{32,64} and LoweredAtomicStore{32,64}. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | AtomicStore64 19.61n ± 0% 13.61n ± 0% -30.60% (p=0.000 n=20) AtomicStore64-2 19.61n ± 0% 13.61n ± 0% -30.57% (p=0.000 n=20) AtomicStore64-4 19.62n ± 0% 13.61n ± 0% -30.63% (p=0.000 n=20) AtomicStore 19.61n ± 0% 13.61n ± 0% -30.60% (p=0.000 n=20) AtomicStore-2 19.62n ± 0% 13.61n ± 0% -30.63% (p=0.000 n=20) AtomicStore-4 19.62n ± 0% 13.62n ± 0% -30.58% (p=0.000 n=20) AtomicStore8 19.61n ± 0% 20.01n ± 0% +2.04% (p=0.000 n=20) AtomicStore8-2 19.62n ± 0% 20.02n ± 0% +2.01% (p=0.000 n=20) AtomicStore8-4 19.61n ± 0% 20.02n ± 0% +2.09% (p=0.000 n=20) geomean 19.61n 15.48n -21.08% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | AtomicStore64 18.03n ± 0% 12.81n ± 0% -28.93% (p=0.000 n=20) AtomicStore64-2 18.02n ± 0% 12.81n ± 0% -28.91% (p=0.000 n=20) AtomicStore64-4 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore 18.02n ± 0% 12.81n ± 0% -28.91% (p=0.000 n=20) AtomicStore-2 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore-4 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore8 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore8-2 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore8-4 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) geomean 18.01n 12.81n -28.89% [1]: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html [2]: https://gcc.gnu.org/git/?p=gcc.git;a=blob_plain;f=gcc/config/loongarch/sync.md Change-Id: I4ae5e8dd0e6f026129b6e503990a763ed40c6097 Reviewed-on: https://go-review.googlesource.com/c/go/+/581356 Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn> Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Meidan Li <limeidan@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: David Chase <drchase@google.com>
2024-09-13 18:47:56 +08:00
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
// Target Atomic feature is identified by dynamic detection
addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.Loong64HasLAM_BH, s.sb)
v := s.load(types.Types[types.TBOOL], addr)
b := s.endBlock()
b.Kind = ssa.BlockIf
b.SetControl(v)
bTrue := s.f.NewBlock(ssa.BlockPlain)
bFalse := s.f.NewBlock(ssa.BlockPlain)
bEnd := s.f.NewBlock(ssa.BlockPlain)
b.AddEdgeTo(bTrue)
b.AddEdgeTo(bFalse)
b.Likely = ssa.BranchLikely
// We have atomic instructions - use it directly.
s.startBlock(bTrue)
cmd/compiler,internal/runtime/atomic: optimize Cas{64,32} on loong64 In Loongson's new microstructure LA664 (Loongson-3A6000) and later, the atomic compare-and-exchange instruction AMCAS[DB]{B,W,H,V} [1] is supported. Therefore, the implementation of the atomic operation compare-and-swap can be selected according to the CPUCFG flag LAMCAS: AMCASDB(full barrier) instruction is used on new microstructures, and traditional LL-SC is used on LA464 (Loongson-3A5000) and older microstructures. This can significantly improve the performance of Go programs on new microstructures. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | Cas 46.84n ± 0% 22.82n ± 0% -51.28% (p=0.000 n=20) Cas-2 47.58n ± 0% 29.57n ± 0% -37.85% (p=0.000 n=20) Cas-4 43.27n ± 20% 25.31n ± 13% -41.50% (p=0.000 n=20) Cas64 46.85n ± 0% 22.82n ± 0% -51.29% (p=0.000 n=20) Cas64-2 47.43n ± 0% 29.53n ± 0% -37.74% (p=0.002 n=20) Cas64-4 43.18n ± 0% 25.28n ± 2% -41.46% (p=0.000 n=20) geomean 45.82n 25.74n -43.82% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | Cas 50.05n ± 0% 51.26n ± 0% +2.42% (p=0.000 n=20) Cas-2 52.80n ± 0% 53.11n ± 0% +0.59% (p=0.000 n=20) Cas-4 55.97n ± 0% 57.31n ± 0% +2.39% (p=0.000 n=20) Cas64 50.05n ± 0% 51.26n ± 0% +2.42% (p=0.000 n=20) Cas64-2 52.68n ± 0% 53.11n ± 0% +0.82% (p=0.000 n=20) Cas64-4 55.96n ± 0% 57.26n ± 0% +2.33% (p=0.000 n=20) geomean 52.86n 53.83n +1.82% [1]: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html Change-Id: I9b777c63c124fb492f61c903f77061fa2b4e5322 Reviewed-on: https://go-review.googlesource.com/c/go/+/613396 Reviewed-by: Meidan Li <limeidan@loongson.cn> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
2024-09-20 11:06:18 +08:00
emit(s, n, args, op1, typ, false)
cmd/compiler,internal/runtime/atomic: optimize Store{64,32,8} on loong64 On Loong64, AMSWAPDB{W,V} instructions are supported by default, and AMSWAPDB{B,H} [1] is a new instruction added by LA664(Loongson 3A6000) and later microarchitectures. Therefore, AMSWAPDB{W,V} (full barrier) is used to implement AtomicStore{32,64}, and the traditional MOVB or the new AMSWAPDBB is used to implement AtomicStore8 according to the CPU feature. The StoreRelease barrier on Loong64 is "dbar 0x12", but it is still necessary to ensure consistency in the order of Store/Load [2]. LoweredAtomicStorezero{32,64} was removed because on loong64 the constant "0" uses the R0 register, and there is no performance difference between the implementations of LoweredAtomicStorezero{32,64} and LoweredAtomicStore{32,64}. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | AtomicStore64 19.61n ± 0% 13.61n ± 0% -30.60% (p=0.000 n=20) AtomicStore64-2 19.61n ± 0% 13.61n ± 0% -30.57% (p=0.000 n=20) AtomicStore64-4 19.62n ± 0% 13.61n ± 0% -30.63% (p=0.000 n=20) AtomicStore 19.61n ± 0% 13.61n ± 0% -30.60% (p=0.000 n=20) AtomicStore-2 19.62n ± 0% 13.61n ± 0% -30.63% (p=0.000 n=20) AtomicStore-4 19.62n ± 0% 13.62n ± 0% -30.58% (p=0.000 n=20) AtomicStore8 19.61n ± 0% 20.01n ± 0% +2.04% (p=0.000 n=20) AtomicStore8-2 19.62n ± 0% 20.02n ± 0% +2.01% (p=0.000 n=20) AtomicStore8-4 19.61n ± 0% 20.02n ± 0% +2.09% (p=0.000 n=20) geomean 19.61n 15.48n -21.08% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | AtomicStore64 18.03n ± 0% 12.81n ± 0% -28.93% (p=0.000 n=20) AtomicStore64-2 18.02n ± 0% 12.81n ± 0% -28.91% (p=0.000 n=20) AtomicStore64-4 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore 18.02n ± 0% 12.81n ± 0% -28.91% (p=0.000 n=20) AtomicStore-2 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore-4 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore8 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore8-2 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore8-4 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) geomean 18.01n 12.81n -28.89% [1]: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html [2]: https://gcc.gnu.org/git/?p=gcc.git;a=blob_plain;f=gcc/config/loongarch/sync.md Change-Id: I4ae5e8dd0e6f026129b6e503990a763ed40c6097 Reviewed-on: https://go-review.googlesource.com/c/go/+/581356 Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn> Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Meidan Li <limeidan@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: David Chase <drchase@google.com>
2024-09-13 18:47:56 +08:00
s.endBlock().AddEdgeTo(bEnd)
// Use original instruction sequence.
s.startBlock(bFalse)
cmd/compiler,internal/runtime/atomic: optimize Cas{64,32} on loong64 In Loongson's new microstructure LA664 (Loongson-3A6000) and later, the atomic compare-and-exchange instruction AMCAS[DB]{B,W,H,V} [1] is supported. Therefore, the implementation of the atomic operation compare-and-swap can be selected according to the CPUCFG flag LAMCAS: AMCASDB(full barrier) instruction is used on new microstructures, and traditional LL-SC is used on LA464 (Loongson-3A5000) and older microstructures. This can significantly improve the performance of Go programs on new microstructures. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | Cas 46.84n ± 0% 22.82n ± 0% -51.28% (p=0.000 n=20) Cas-2 47.58n ± 0% 29.57n ± 0% -37.85% (p=0.000 n=20) Cas-4 43.27n ± 20% 25.31n ± 13% -41.50% (p=0.000 n=20) Cas64 46.85n ± 0% 22.82n ± 0% -51.29% (p=0.000 n=20) Cas64-2 47.43n ± 0% 29.53n ± 0% -37.74% (p=0.002 n=20) Cas64-4 43.18n ± 0% 25.28n ± 2% -41.46% (p=0.000 n=20) geomean 45.82n 25.74n -43.82% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | Cas 50.05n ± 0% 51.26n ± 0% +2.42% (p=0.000 n=20) Cas-2 52.80n ± 0% 53.11n ± 0% +0.59% (p=0.000 n=20) Cas-4 55.97n ± 0% 57.31n ± 0% +2.39% (p=0.000 n=20) Cas64 50.05n ± 0% 51.26n ± 0% +2.42% (p=0.000 n=20) Cas64-2 52.68n ± 0% 53.11n ± 0% +0.82% (p=0.000 n=20) Cas64-4 55.96n ± 0% 57.26n ± 0% +2.33% (p=0.000 n=20) geomean 52.86n 53.83n +1.82% [1]: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html Change-Id: I9b777c63c124fb492f61c903f77061fa2b4e5322 Reviewed-on: https://go-review.googlesource.com/c/go/+/613396 Reviewed-by: Meidan Li <limeidan@loongson.cn> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
2024-09-20 11:06:18 +08:00
emit(s, n, args, op0, typ, false)
cmd/compiler,internal/runtime/atomic: optimize Store{64,32,8} on loong64 On Loong64, AMSWAPDB{W,V} instructions are supported by default, and AMSWAPDB{B,H} [1] is a new instruction added by LA664(Loongson 3A6000) and later microarchitectures. Therefore, AMSWAPDB{W,V} (full barrier) is used to implement AtomicStore{32,64}, and the traditional MOVB or the new AMSWAPDBB is used to implement AtomicStore8 according to the CPU feature. The StoreRelease barrier on Loong64 is "dbar 0x12", but it is still necessary to ensure consistency in the order of Store/Load [2]. LoweredAtomicStorezero{32,64} was removed because on loong64 the constant "0" uses the R0 register, and there is no performance difference between the implementations of LoweredAtomicStorezero{32,64} and LoweredAtomicStore{32,64}. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | AtomicStore64 19.61n ± 0% 13.61n ± 0% -30.60% (p=0.000 n=20) AtomicStore64-2 19.61n ± 0% 13.61n ± 0% -30.57% (p=0.000 n=20) AtomicStore64-4 19.62n ± 0% 13.61n ± 0% -30.63% (p=0.000 n=20) AtomicStore 19.61n ± 0% 13.61n ± 0% -30.60% (p=0.000 n=20) AtomicStore-2 19.62n ± 0% 13.61n ± 0% -30.63% (p=0.000 n=20) AtomicStore-4 19.62n ± 0% 13.62n ± 0% -30.58% (p=0.000 n=20) AtomicStore8 19.61n ± 0% 20.01n ± 0% +2.04% (p=0.000 n=20) AtomicStore8-2 19.62n ± 0% 20.02n ± 0% +2.01% (p=0.000 n=20) AtomicStore8-4 19.61n ± 0% 20.02n ± 0% +2.09% (p=0.000 n=20) geomean 19.61n 15.48n -21.08% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | AtomicStore64 18.03n ± 0% 12.81n ± 0% -28.93% (p=0.000 n=20) AtomicStore64-2 18.02n ± 0% 12.81n ± 0% -28.91% (p=0.000 n=20) AtomicStore64-4 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore 18.02n ± 0% 12.81n ± 0% -28.91% (p=0.000 n=20) AtomicStore-2 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore-4 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore8 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore8-2 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore8-4 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) geomean 18.01n 12.81n -28.89% [1]: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html [2]: https://gcc.gnu.org/git/?p=gcc.git;a=blob_plain;f=gcc/config/loongarch/sync.md Change-Id: I4ae5e8dd0e6f026129b6e503990a763ed40c6097 Reviewed-on: https://go-review.googlesource.com/c/go/+/581356 Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn> Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Meidan Li <limeidan@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: David Chase <drchase@google.com>
2024-09-13 18:47:56 +08:00
s.endBlock().AddEdgeTo(bEnd)
// Merge results.
s.startBlock(bEnd)
cmd/compiler,internal/runtime/atomic: optimize Cas{64,32} on loong64 In Loongson's new microstructure LA664 (Loongson-3A6000) and later, the atomic compare-and-exchange instruction AMCAS[DB]{B,W,H,V} [1] is supported. Therefore, the implementation of the atomic operation compare-and-swap can be selected according to the CPUCFG flag LAMCAS: AMCASDB(full barrier) instruction is used on new microstructures, and traditional LL-SC is used on LA464 (Loongson-3A5000) and older microstructures. This can significantly improve the performance of Go programs on new microstructures. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | Cas 46.84n ± 0% 22.82n ± 0% -51.28% (p=0.000 n=20) Cas-2 47.58n ± 0% 29.57n ± 0% -37.85% (p=0.000 n=20) Cas-4 43.27n ± 20% 25.31n ± 13% -41.50% (p=0.000 n=20) Cas64 46.85n ± 0% 22.82n ± 0% -51.29% (p=0.000 n=20) Cas64-2 47.43n ± 0% 29.53n ± 0% -37.74% (p=0.002 n=20) Cas64-4 43.18n ± 0% 25.28n ± 2% -41.46% (p=0.000 n=20) geomean 45.82n 25.74n -43.82% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | Cas 50.05n ± 0% 51.26n ± 0% +2.42% (p=0.000 n=20) Cas-2 52.80n ± 0% 53.11n ± 0% +0.59% (p=0.000 n=20) Cas-4 55.97n ± 0% 57.31n ± 0% +2.39% (p=0.000 n=20) Cas64 50.05n ± 0% 51.26n ± 0% +2.42% (p=0.000 n=20) Cas64-2 52.68n ± 0% 53.11n ± 0% +0.82% (p=0.000 n=20) Cas64-4 55.96n ± 0% 57.26n ± 0% +2.33% (p=0.000 n=20) geomean 52.86n 53.83n +1.82% [1]: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html Change-Id: I9b777c63c124fb492f61c903f77061fa2b4e5322 Reviewed-on: https://go-review.googlesource.com/c/go/+/613396 Reviewed-by: Meidan Li <limeidan@loongson.cn> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
2024-09-20 11:06:18 +08:00
return nil
cmd/compiler,internal/runtime/atomic: optimize Store{64,32,8} on loong64 On Loong64, AMSWAPDB{W,V} instructions are supported by default, and AMSWAPDB{B,H} [1] is a new instruction added by LA664(Loongson 3A6000) and later microarchitectures. Therefore, AMSWAPDB{W,V} (full barrier) is used to implement AtomicStore{32,64}, and the traditional MOVB or the new AMSWAPDBB is used to implement AtomicStore8 according to the CPU feature. The StoreRelease barrier on Loong64 is "dbar 0x12", but it is still necessary to ensure consistency in the order of Store/Load [2]. LoweredAtomicStorezero{32,64} was removed because on loong64 the constant "0" uses the R0 register, and there is no performance difference between the implementations of LoweredAtomicStorezero{32,64} and LoweredAtomicStore{32,64}. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | AtomicStore64 19.61n ± 0% 13.61n ± 0% -30.60% (p=0.000 n=20) AtomicStore64-2 19.61n ± 0% 13.61n ± 0% -30.57% (p=0.000 n=20) AtomicStore64-4 19.62n ± 0% 13.61n ± 0% -30.63% (p=0.000 n=20) AtomicStore 19.61n ± 0% 13.61n ± 0% -30.60% (p=0.000 n=20) AtomicStore-2 19.62n ± 0% 13.61n ± 0% -30.63% (p=0.000 n=20) AtomicStore-4 19.62n ± 0% 13.62n ± 0% -30.58% (p=0.000 n=20) AtomicStore8 19.61n ± 0% 20.01n ± 0% +2.04% (p=0.000 n=20) AtomicStore8-2 19.62n ± 0% 20.02n ± 0% +2.01% (p=0.000 n=20) AtomicStore8-4 19.61n ± 0% 20.02n ± 0% +2.09% (p=0.000 n=20) geomean 19.61n 15.48n -21.08% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | AtomicStore64 18.03n ± 0% 12.81n ± 0% -28.93% (p=0.000 n=20) AtomicStore64-2 18.02n ± 0% 12.81n ± 0% -28.91% (p=0.000 n=20) AtomicStore64-4 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore 18.02n ± 0% 12.81n ± 0% -28.91% (p=0.000 n=20) AtomicStore-2 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore-4 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore8 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore8-2 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore8-4 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) geomean 18.01n 12.81n -28.89% [1]: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html [2]: https://gcc.gnu.org/git/?p=gcc.git;a=blob_plain;f=gcc/config/loongarch/sync.md Change-Id: I4ae5e8dd0e6f026129b6e503990a763ed40c6097 Reviewed-on: https://go-review.googlesource.com/c/go/+/581356 Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn> Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Meidan Li <limeidan@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: David Chase <drchase@google.com>
2024-09-13 18:47:56 +08:00
}
}
atomicStoreEmitterLoong64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) {
v := s.newValue3(op, types.NewTuple(types.Types[typ], types.TypeMem), args[0], args[1], s.mem())
s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
if needReturn {
s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v)
}
}
addF("internal/runtime/atomic", "Store8",
makeAtomicStoreGuardedIntrinsicLoong64(ssa.OpAtomicStore8, ssa.OpAtomicStore8Variant, types.TUINT8, atomicStoreEmitterLoong64),
sys.Loong64)
addF("internal/runtime/atomic", "Store",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
s.vars[memVar] = s.newValue3(ssa.OpAtomicStore32Variant, types.TypeMem, args[0], args[1], s.mem())
return nil
},
sys.Loong64)
addF("internal/runtime/atomic", "Store64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
s.vars[memVar] = s.newValue3(ssa.OpAtomicStore64Variant, types.TypeMem, args[0], args[1], s.mem())
return nil
},
sys.Loong64)
addF("internal/runtime/atomic", "Xchg8",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
v := s.newValue3(ssa.OpAtomicExchange8, types.NewTuple(types.Types[types.TUINT8], types.TypeMem), args[0], args[1], s.mem())
s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT8], v)
},
sys.AMD64, sys.PPC64)
addF("internal/runtime/atomic", "Xchg",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
v := s.newValue3(ssa.OpAtomicExchange32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem())
s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v)
},
sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
addF("internal/runtime/atomic", "Xchg64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
v := s.newValue3(ssa.OpAtomicExchange64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem())
s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v)
},
sys.AMD64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
makeAtomicGuardedIntrinsicARM64common := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter, needReturn bool) intrinsicBuilder {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
if cfg.goarm64.LSE {
emit(s, n, args, op1, typ, needReturn)
} else {
// Target Atomic feature is identified by dynamic detection
addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.ARM64HasATOMICS, s.sb)
v := s.load(types.Types[types.TBOOL], addr)
b := s.endBlock()
b.Kind = ssa.BlockIf
b.SetControl(v)
bTrue := s.f.NewBlock(ssa.BlockPlain)
bFalse := s.f.NewBlock(ssa.BlockPlain)
bEnd := s.f.NewBlock(ssa.BlockPlain)
b.AddEdgeTo(bTrue)
b.AddEdgeTo(bFalse)
b.Likely = ssa.BranchLikely
// We have atomic instructions - use it directly.
s.startBlock(bTrue)
emit(s, n, args, op1, typ, needReturn)
s.endBlock().AddEdgeTo(bEnd)
// Use original instruction sequence.
s.startBlock(bFalse)
emit(s, n, args, op0, typ, needReturn)
s.endBlock().AddEdgeTo(bEnd)
// Merge results.
s.startBlock(bEnd)
}
if needReturn {
return s.variable(n, types.Types[typ])
} else {
return nil
}
}
}
makeAtomicGuardedIntrinsicARM64 := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder {
return makeAtomicGuardedIntrinsicARM64common(op0, op1, typ, emit, true)
}
makeAtomicGuardedIntrinsicARM64old := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder {
return makeAtomicGuardedIntrinsicARM64common(op0, op1, typ, emit, false)
}
atomicEmitterARM64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) {
v := s.newValue3(op, types.NewTuple(types.Types[typ], types.TypeMem), args[0], args[1], s.mem())
s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
if needReturn {
s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v)
}
}
addF("internal/runtime/atomic", "Xchg8",
makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange8, ssa.OpAtomicExchange8Variant, types.TUINT8, atomicEmitterARM64),
sys.ARM64)
addF("internal/runtime/atomic", "Xchg",
makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange32, ssa.OpAtomicExchange32Variant, types.TUINT32, atomicEmitterARM64),
sys.ARM64)
addF("internal/runtime/atomic", "Xchg64",
makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange64, ssa.OpAtomicExchange64Variant, types.TUINT64, atomicEmitterARM64),
sys.ARM64)
cmd/compile, internal/runtime/atomic: add Xchg8 for loong64 In Loongson's new microstructure LA664 (Loongson-3A6000) and later, the atomic instruction AMSWAP[DB]{B,H} [1] is supported. Therefore, the implementation of the atomic operation exchange can be selected according to the CPUCFG flag LAM_BH: AMSWAPDBB(full barrier) instruction is used on new microstructures, and traditional LL-SC is used on LA464 (Loongson-3A5000) and older microstructures. This can significantly improve the performance of Go programs on new microstructures. Because Xchg8 implemented using traditional LL-SC uses too many temporary registers, it is not suitable for intrinsics. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000 @ 2500.00MHz BenchmarkXchg8 100000000 10.41 ns/op BenchmarkXchg8-2 100000000 10.41 ns/op BenchmarkXchg8-4 100000000 10.41 ns/op BenchmarkXchg8Parallel 96647592 12.41 ns/op BenchmarkXchg8Parallel-2 58376136 20.60 ns/op BenchmarkXchg8Parallel-4 78458899 17.97 ns/op goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000-HV @ 2500.00MHz BenchmarkXchg8 38323825 31.23 ns/op BenchmarkXchg8-2 38368219 31.23 ns/op BenchmarkXchg8-4 37154156 31.26 ns/op BenchmarkXchg8Parallel 37908301 31.63 ns/op BenchmarkXchg8Parallel-2 30413440 39.42 ns/op BenchmarkXchg8Parallel-4 30737626 39.03 ns/op For #69735 [1]: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html Change-Id: I02ba68f66a2210b6902344fdc9975eb62de728ab Reviewed-on: https://go-review.googlesource.com/c/go/+/623058 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn> Reviewed-by: Meidan Li <limeidan@loongson.cn> Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: Mauri de Souza Meneguzzo <mauri870@gmail.com> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
2024-10-30 19:11:49 +08:00
makeAtomicXchg8GuardedIntrinsicLoong64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.Loong64HasLAM_BH, s.sb)
v := s.load(types.Types[types.TBOOL], addr)
b := s.endBlock()
b.Kind = ssa.BlockIf
b.SetControl(v)
bTrue := s.f.NewBlock(ssa.BlockPlain)
bFalse := s.f.NewBlock(ssa.BlockPlain)
bEnd := s.f.NewBlock(ssa.BlockPlain)
b.AddEdgeTo(bTrue)
b.AddEdgeTo(bFalse)
b.Likely = ssa.BranchLikely // most loong64 machines support the amswapdb.b
// We have the intrinsic - use it directly.
s.startBlock(bTrue)
s.vars[n] = s.newValue3(op, types.NewTuple(types.Types[types.TUINT8], types.TypeMem), args[0], args[1], s.mem())
s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, s.vars[n])
s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[types.TUINT8], s.vars[n])
s.endBlock().AddEdgeTo(bEnd)
// Call the pure Go version.
s.startBlock(bFalse)
s.vars[n] = s.callResult(n, callNormal) // types.Types[TUINT8]
s.endBlock().AddEdgeTo(bEnd)
// Merge results.
s.startBlock(bEnd)
return s.variable(n, types.Types[types.TUINT8])
}
}
addF("internal/runtime/atomic", "Xchg8",
makeAtomicXchg8GuardedIntrinsicLoong64(ssa.OpAtomicExchange8Variant),
sys.Loong64)
addF("internal/runtime/atomic", "Xadd",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
v := s.newValue3(ssa.OpAtomicAdd32, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem())
s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], v)
},
sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
addF("internal/runtime/atomic", "Xadd64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
v := s.newValue3(ssa.OpAtomicAdd64, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem())
s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], v)
},
sys.AMD64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
addF("internal/runtime/atomic", "Xadd",
makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAdd32, ssa.OpAtomicAdd32Variant, types.TUINT32, atomicEmitterARM64),
sys.ARM64)
addF("internal/runtime/atomic", "Xadd64",
makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAdd64, ssa.OpAtomicAdd64Variant, types.TUINT64, atomicEmitterARM64),
sys.ARM64)
addF("internal/runtime/atomic", "Cas",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
v := s.newValue4(ssa.OpAtomicCompareAndSwap32, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v)
},
cmd/compiler,internal/runtime/atomic: optimize Cas{64,32} on loong64 In Loongson's new microstructure LA664 (Loongson-3A6000) and later, the atomic compare-and-exchange instruction AMCAS[DB]{B,W,H,V} [1] is supported. Therefore, the implementation of the atomic operation compare-and-swap can be selected according to the CPUCFG flag LAMCAS: AMCASDB(full barrier) instruction is used on new microstructures, and traditional LL-SC is used on LA464 (Loongson-3A5000) and older microstructures. This can significantly improve the performance of Go programs on new microstructures. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | Cas 46.84n ± 0% 22.82n ± 0% -51.28% (p=0.000 n=20) Cas-2 47.58n ± 0% 29.57n ± 0% -37.85% (p=0.000 n=20) Cas-4 43.27n ± 20% 25.31n ± 13% -41.50% (p=0.000 n=20) Cas64 46.85n ± 0% 22.82n ± 0% -51.29% (p=0.000 n=20) Cas64-2 47.43n ± 0% 29.53n ± 0% -37.74% (p=0.002 n=20) Cas64-4 43.18n ± 0% 25.28n ± 2% -41.46% (p=0.000 n=20) geomean 45.82n 25.74n -43.82% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | Cas 50.05n ± 0% 51.26n ± 0% +2.42% (p=0.000 n=20) Cas-2 52.80n ± 0% 53.11n ± 0% +0.59% (p=0.000 n=20) Cas-4 55.97n ± 0% 57.31n ± 0% +2.39% (p=0.000 n=20) Cas64 50.05n ± 0% 51.26n ± 0% +2.42% (p=0.000 n=20) Cas64-2 52.68n ± 0% 53.11n ± 0% +0.82% (p=0.000 n=20) Cas64-4 55.96n ± 0% 57.26n ± 0% +2.33% (p=0.000 n=20) geomean 52.86n 53.83n +1.82% [1]: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html Change-Id: I9b777c63c124fb492f61c903f77061fa2b4e5322 Reviewed-on: https://go-review.googlesource.com/c/go/+/613396 Reviewed-by: Meidan Li <limeidan@loongson.cn> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
2024-09-20 11:06:18 +08:00
sys.AMD64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
addF("internal/runtime/atomic", "Cas64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
v := s.newValue4(ssa.OpAtomicCompareAndSwap64, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v)
},
cmd/compiler,internal/runtime/atomic: optimize Cas{64,32} on loong64 In Loongson's new microstructure LA664 (Loongson-3A6000) and later, the atomic compare-and-exchange instruction AMCAS[DB]{B,W,H,V} [1] is supported. Therefore, the implementation of the atomic operation compare-and-swap can be selected according to the CPUCFG flag LAMCAS: AMCASDB(full barrier) instruction is used on new microstructures, and traditional LL-SC is used on LA464 (Loongson-3A5000) and older microstructures. This can significantly improve the performance of Go programs on new microstructures. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | Cas 46.84n ± 0% 22.82n ± 0% -51.28% (p=0.000 n=20) Cas-2 47.58n ± 0% 29.57n ± 0% -37.85% (p=0.000 n=20) Cas-4 43.27n ± 20% 25.31n ± 13% -41.50% (p=0.000 n=20) Cas64 46.85n ± 0% 22.82n ± 0% -51.29% (p=0.000 n=20) Cas64-2 47.43n ± 0% 29.53n ± 0% -37.74% (p=0.002 n=20) Cas64-4 43.18n ± 0% 25.28n ± 2% -41.46% (p=0.000 n=20) geomean 45.82n 25.74n -43.82% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | Cas 50.05n ± 0% 51.26n ± 0% +2.42% (p=0.000 n=20) Cas-2 52.80n ± 0% 53.11n ± 0% +0.59% (p=0.000 n=20) Cas-4 55.97n ± 0% 57.31n ± 0% +2.39% (p=0.000 n=20) Cas64 50.05n ± 0% 51.26n ± 0% +2.42% (p=0.000 n=20) Cas64-2 52.68n ± 0% 53.11n ± 0% +0.82% (p=0.000 n=20) Cas64-4 55.96n ± 0% 57.26n ± 0% +2.33% (p=0.000 n=20) geomean 52.86n 53.83n +1.82% [1]: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html Change-Id: I9b777c63c124fb492f61c903f77061fa2b4e5322 Reviewed-on: https://go-review.googlesource.com/c/go/+/613396 Reviewed-by: Meidan Li <limeidan@loongson.cn> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
2024-09-20 11:06:18 +08:00
sys.AMD64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
addF("internal/runtime/atomic", "CasRel",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
v := s.newValue4(ssa.OpAtomicCompareAndSwap32, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], v)
},
sys.PPC64)
atomicCasEmitterARM64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) {
v := s.newValue4(op, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
if needReturn {
s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v)
}
}
addF("internal/runtime/atomic", "Cas",
makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicCompareAndSwap32, ssa.OpAtomicCompareAndSwap32Variant, types.TBOOL, atomicCasEmitterARM64),
sys.ARM64)
addF("internal/runtime/atomic", "Cas64",
makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicCompareAndSwap64, ssa.OpAtomicCompareAndSwap64Variant, types.TBOOL, atomicCasEmitterARM64),
sys.ARM64)
cmd/compiler,internal/runtime/atomic: optimize Cas{64,32} on loong64 In Loongson's new microstructure LA664 (Loongson-3A6000) and later, the atomic compare-and-exchange instruction AMCAS[DB]{B,W,H,V} [1] is supported. Therefore, the implementation of the atomic operation compare-and-swap can be selected according to the CPUCFG flag LAMCAS: AMCASDB(full barrier) instruction is used on new microstructures, and traditional LL-SC is used on LA464 (Loongson-3A5000) and older microstructures. This can significantly improve the performance of Go programs on new microstructures. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | Cas 46.84n ± 0% 22.82n ± 0% -51.28% (p=0.000 n=20) Cas-2 47.58n ± 0% 29.57n ± 0% -37.85% (p=0.000 n=20) Cas-4 43.27n ± 20% 25.31n ± 13% -41.50% (p=0.000 n=20) Cas64 46.85n ± 0% 22.82n ± 0% -51.29% (p=0.000 n=20) Cas64-2 47.43n ± 0% 29.53n ± 0% -37.74% (p=0.002 n=20) Cas64-4 43.18n ± 0% 25.28n ± 2% -41.46% (p=0.000 n=20) geomean 45.82n 25.74n -43.82% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | Cas 50.05n ± 0% 51.26n ± 0% +2.42% (p=0.000 n=20) Cas-2 52.80n ± 0% 53.11n ± 0% +0.59% (p=0.000 n=20) Cas-4 55.97n ± 0% 57.31n ± 0% +2.39% (p=0.000 n=20) Cas64 50.05n ± 0% 51.26n ± 0% +2.42% (p=0.000 n=20) Cas64-2 52.68n ± 0% 53.11n ± 0% +0.82% (p=0.000 n=20) Cas64-4 55.96n ± 0% 57.26n ± 0% +2.33% (p=0.000 n=20) geomean 52.86n 53.83n +1.82% [1]: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html Change-Id: I9b777c63c124fb492f61c903f77061fa2b4e5322 Reviewed-on: https://go-review.googlesource.com/c/go/+/613396 Reviewed-by: Meidan Li <limeidan@loongson.cn> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
2024-09-20 11:06:18 +08:00
atomicCasEmitterLoong64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) {
v := s.newValue4(op, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, v)
if needReturn {
s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], v)
}
}
makeAtomicCasGuardedIntrinsicLoong64 := func(op0, op1 ssa.Op, emit atomicOpEmitter) intrinsicBuilder {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
// Target Atomic feature is identified by dynamic detection
addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.Loong64HasLAMCAS, s.sb)
v := s.load(types.Types[types.TBOOL], addr)
b := s.endBlock()
b.Kind = ssa.BlockIf
b.SetControl(v)
bTrue := s.f.NewBlock(ssa.BlockPlain)
bFalse := s.f.NewBlock(ssa.BlockPlain)
bEnd := s.f.NewBlock(ssa.BlockPlain)
b.AddEdgeTo(bTrue)
b.AddEdgeTo(bFalse)
b.Likely = ssa.BranchLikely
// We have atomic instructions - use it directly.
s.startBlock(bTrue)
emit(s, n, args, op1, types.TBOOL, true)
s.endBlock().AddEdgeTo(bEnd)
// Use original instruction sequence.
s.startBlock(bFalse)
emit(s, n, args, op0, types.TBOOL, true)
s.endBlock().AddEdgeTo(bEnd)
// Merge results.
s.startBlock(bEnd)
return s.variable(n, types.Types[types.TBOOL])
}
}
addF("internal/runtime/atomic", "Cas",
makeAtomicCasGuardedIntrinsicLoong64(ssa.OpAtomicCompareAndSwap32, ssa.OpAtomicCompareAndSwap32Variant, atomicCasEmitterLoong64),
sys.Loong64)
addF("internal/runtime/atomic", "Cas64",
makeAtomicCasGuardedIntrinsicLoong64(ssa.OpAtomicCompareAndSwap64, ssa.OpAtomicCompareAndSwap64Variant, atomicCasEmitterLoong64),
sys.Loong64)
// Old-style atomic logical operation API (all supported archs except arm64).
addF("internal/runtime/atomic", "And8",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
s.vars[memVar] = s.newValue3(ssa.OpAtomicAnd8, types.TypeMem, args[0], args[1], s.mem())
return nil
},
cmd/compiler,internal/runtime/atomic: optimize And{64,32,8} and Or{64,32,8} on loong64 Use loong64's atomic operation instruction AMANDDB{V,W,W} (full barrier) to implement And{64,32,8}, AMORDB{V,W,W} (full barrier) to implement Or{64,32,8}. Intrinsify And{64,32,8} and Or{64,32,8}, And this CL alias all of the And/Or operations into sync/atomic package. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | And32 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20) And32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) And64 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20) And64Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) Or32 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20) Or32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) Or64 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20) Or64Parallel 28.97n ± 0% 12.41n ± 0% -57.16% (p=0.000 n=20) And8 29.15n ± 0% 13.21n ± 0% -54.68% (p=0.000 n=20) And 27.71n ± 0% 12.82n ± 0% -53.74% (p=0.000 n=20) And8Parallel 28.99n ± 0% 14.46n ± 0% -50.12% (p=0.000 n=20) AndParallel 29.12n ± 0% 14.42n ± 0% -50.48% (p=0.000 n=20) Or8 28.31n ± 0% 12.81n ± 0% -54.75% (p=0.000 n=20) Or 27.72n ± 0% 12.81n ± 0% -53.79% (p=0.000 n=20) Or8Parallel 29.03n ± 0% 14.62n ± 0% -49.64% (p=0.000 n=20) OrParallel 29.12n ± 0% 14.42n ± 0% -50.49% (p=0.000 n=20) geomean 28.47n 12.58n -55.80% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | And32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) And32Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) And64 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) And64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) And8 30.42n ± 0% 14.41n ± 0% -52.63% (p=0.000 n=20) And 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20) And8Parallel 31.23n ± 0% 15.21n ± 0% -51.30% (p=0.000 n=20) AndParallel 30.83n ± 0% 14.41n ± 0% -53.26% (p=0.000 n=20) Or32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) Or32Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) Or64 30.02n ± 0% 14.82n ± 0% -50.63% (p=0.000 n=20) Or64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) Or8 30.02n ± 0% 14.01n ± 0% -53.33% (p=0.000 n=20) Or 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20) Or8Parallel 30.83n ± 0% 14.81n ± 0% -51.96% (p=0.000 n=20) OrParallel 30.83n ± 0% 14.41n ± 0% -53.26% (p=0.000 n=20) geomean 30.47n 14.75n -51.61% Change-Id: If008ff6a08b51905076f8ddb6e92f8e214d3f7b3 Reviewed-on: https://go-review.googlesource.com/c/go/+/482756 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Meidan Li <limeidan@loongson.cn> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn> Reviewed-by: Cherry Mui <cherryyz@google.com>
2024-09-23 11:38:36 +08:00
sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
addF("internal/runtime/atomic", "And",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
s.vars[memVar] = s.newValue3(ssa.OpAtomicAnd32, types.TypeMem, args[0], args[1], s.mem())
return nil
},
cmd/compiler,internal/runtime/atomic: optimize And{64,32,8} and Or{64,32,8} on loong64 Use loong64's atomic operation instruction AMANDDB{V,W,W} (full barrier) to implement And{64,32,8}, AMORDB{V,W,W} (full barrier) to implement Or{64,32,8}. Intrinsify And{64,32,8} and Or{64,32,8}, And this CL alias all of the And/Or operations into sync/atomic package. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | And32 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20) And32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) And64 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20) And64Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) Or32 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20) Or32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) Or64 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20) Or64Parallel 28.97n ± 0% 12.41n ± 0% -57.16% (p=0.000 n=20) And8 29.15n ± 0% 13.21n ± 0% -54.68% (p=0.000 n=20) And 27.71n ± 0% 12.82n ± 0% -53.74% (p=0.000 n=20) And8Parallel 28.99n ± 0% 14.46n ± 0% -50.12% (p=0.000 n=20) AndParallel 29.12n ± 0% 14.42n ± 0% -50.48% (p=0.000 n=20) Or8 28.31n ± 0% 12.81n ± 0% -54.75% (p=0.000 n=20) Or 27.72n ± 0% 12.81n ± 0% -53.79% (p=0.000 n=20) Or8Parallel 29.03n ± 0% 14.62n ± 0% -49.64% (p=0.000 n=20) OrParallel 29.12n ± 0% 14.42n ± 0% -50.49% (p=0.000 n=20) geomean 28.47n 12.58n -55.80% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | And32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) And32Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) And64 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) And64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) And8 30.42n ± 0% 14.41n ± 0% -52.63% (p=0.000 n=20) And 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20) And8Parallel 31.23n ± 0% 15.21n ± 0% -51.30% (p=0.000 n=20) AndParallel 30.83n ± 0% 14.41n ± 0% -53.26% (p=0.000 n=20) Or32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) Or32Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) Or64 30.02n ± 0% 14.82n ± 0% -50.63% (p=0.000 n=20) Or64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) Or8 30.02n ± 0% 14.01n ± 0% -53.33% (p=0.000 n=20) Or 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20) Or8Parallel 30.83n ± 0% 14.81n ± 0% -51.96% (p=0.000 n=20) OrParallel 30.83n ± 0% 14.41n ± 0% -53.26% (p=0.000 n=20) geomean 30.47n 14.75n -51.61% Change-Id: If008ff6a08b51905076f8ddb6e92f8e214d3f7b3 Reviewed-on: https://go-review.googlesource.com/c/go/+/482756 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Meidan Li <limeidan@loongson.cn> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn> Reviewed-by: Cherry Mui <cherryyz@google.com>
2024-09-23 11:38:36 +08:00
sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
addF("internal/runtime/atomic", "Or8",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
s.vars[memVar] = s.newValue3(ssa.OpAtomicOr8, types.TypeMem, args[0], args[1], s.mem())
return nil
},
cmd/compiler,internal/runtime/atomic: optimize And{64,32,8} and Or{64,32,8} on loong64 Use loong64's atomic operation instruction AMANDDB{V,W,W} (full barrier) to implement And{64,32,8}, AMORDB{V,W,W} (full barrier) to implement Or{64,32,8}. Intrinsify And{64,32,8} and Or{64,32,8}, And this CL alias all of the And/Or operations into sync/atomic package. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | And32 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20) And32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) And64 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20) And64Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) Or32 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20) Or32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) Or64 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20) Or64Parallel 28.97n ± 0% 12.41n ± 0% -57.16% (p=0.000 n=20) And8 29.15n ± 0% 13.21n ± 0% -54.68% (p=0.000 n=20) And 27.71n ± 0% 12.82n ± 0% -53.74% (p=0.000 n=20) And8Parallel 28.99n ± 0% 14.46n ± 0% -50.12% (p=0.000 n=20) AndParallel 29.12n ± 0% 14.42n ± 0% -50.48% (p=0.000 n=20) Or8 28.31n ± 0% 12.81n ± 0% -54.75% (p=0.000 n=20) Or 27.72n ± 0% 12.81n ± 0% -53.79% (p=0.000 n=20) Or8Parallel 29.03n ± 0% 14.62n ± 0% -49.64% (p=0.000 n=20) OrParallel 29.12n ± 0% 14.42n ± 0% -50.49% (p=0.000 n=20) geomean 28.47n 12.58n -55.80% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | And32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) And32Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) And64 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) And64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) And8 30.42n ± 0% 14.41n ± 0% -52.63% (p=0.000 n=20) And 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20) And8Parallel 31.23n ± 0% 15.21n ± 0% -51.30% (p=0.000 n=20) AndParallel 30.83n ± 0% 14.41n ± 0% -53.26% (p=0.000 n=20) Or32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) Or32Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) Or64 30.02n ± 0% 14.82n ± 0% -50.63% (p=0.000 n=20) Or64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) Or8 30.02n ± 0% 14.01n ± 0% -53.33% (p=0.000 n=20) Or 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20) Or8Parallel 30.83n ± 0% 14.81n ± 0% -51.96% (p=0.000 n=20) OrParallel 30.83n ± 0% 14.41n ± 0% -53.26% (p=0.000 n=20) geomean 30.47n 14.75n -51.61% Change-Id: If008ff6a08b51905076f8ddb6e92f8e214d3f7b3 Reviewed-on: https://go-review.googlesource.com/c/go/+/482756 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Meidan Li <limeidan@loongson.cn> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn> Reviewed-by: Cherry Mui <cherryyz@google.com>
2024-09-23 11:38:36 +08:00
sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
addF("internal/runtime/atomic", "Or",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
s.vars[memVar] = s.newValue3(ssa.OpAtomicOr32, types.TypeMem, args[0], args[1], s.mem())
return nil
},
cmd/compiler,internal/runtime/atomic: optimize And{64,32,8} and Or{64,32,8} on loong64 Use loong64's atomic operation instruction AMANDDB{V,W,W} (full barrier) to implement And{64,32,8}, AMORDB{V,W,W} (full barrier) to implement Or{64,32,8}. Intrinsify And{64,32,8} and Or{64,32,8}, And this CL alias all of the And/Or operations into sync/atomic package. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | And32 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20) And32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) And64 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20) And64Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) Or32 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20) Or32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) Or64 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20) Or64Parallel 28.97n ± 0% 12.41n ± 0% -57.16% (p=0.000 n=20) And8 29.15n ± 0% 13.21n ± 0% -54.68% (p=0.000 n=20) And 27.71n ± 0% 12.82n ± 0% -53.74% (p=0.000 n=20) And8Parallel 28.99n ± 0% 14.46n ± 0% -50.12% (p=0.000 n=20) AndParallel 29.12n ± 0% 14.42n ± 0% -50.48% (p=0.000 n=20) Or8 28.31n ± 0% 12.81n ± 0% -54.75% (p=0.000 n=20) Or 27.72n ± 0% 12.81n ± 0% -53.79% (p=0.000 n=20) Or8Parallel 29.03n ± 0% 14.62n ± 0% -49.64% (p=0.000 n=20) OrParallel 29.12n ± 0% 14.42n ± 0% -50.49% (p=0.000 n=20) geomean 28.47n 12.58n -55.80% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | And32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) And32Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) And64 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) And64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) And8 30.42n ± 0% 14.41n ± 0% -52.63% (p=0.000 n=20) And 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20) And8Parallel 31.23n ± 0% 15.21n ± 0% -51.30% (p=0.000 n=20) AndParallel 30.83n ± 0% 14.41n ± 0% -53.26% (p=0.000 n=20) Or32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) Or32Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) Or64 30.02n ± 0% 14.82n ± 0% -50.63% (p=0.000 n=20) Or64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) Or8 30.02n ± 0% 14.01n ± 0% -53.33% (p=0.000 n=20) Or 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20) Or8Parallel 30.83n ± 0% 14.81n ± 0% -51.96% (p=0.000 n=20) OrParallel 30.83n ± 0% 14.41n ± 0% -53.26% (p=0.000 n=20) geomean 30.47n 14.75n -51.61% Change-Id: If008ff6a08b51905076f8ddb6e92f8e214d3f7b3 Reviewed-on: https://go-review.googlesource.com/c/go/+/482756 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Meidan Li <limeidan@loongson.cn> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn> Reviewed-by: Cherry Mui <cherryyz@google.com>
2024-09-23 11:38:36 +08:00
sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
// arm64 always uses the new-style atomic logical operations, for both the
// old and new style API.
addF("internal/runtime/atomic", "And8",
makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicAnd8value, ssa.OpAtomicAnd8valueVariant, types.TUINT8, atomicEmitterARM64),
sys.ARM64)
addF("internal/runtime/atomic", "Or8",
makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicOr8value, ssa.OpAtomicOr8valueVariant, types.TUINT8, atomicEmitterARM64),
sys.ARM64)
addF("internal/runtime/atomic", "And64",
makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAnd64value, ssa.OpAtomicAnd64valueVariant, types.TUINT64, atomicEmitterARM64),
sys.ARM64)
addF("internal/runtime/atomic", "And32",
makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAnd32value, ssa.OpAtomicAnd32valueVariant, types.TUINT32, atomicEmitterARM64),
sys.ARM64)
addF("internal/runtime/atomic", "And",
makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicAnd32value, ssa.OpAtomicAnd32valueVariant, types.TUINT32, atomicEmitterARM64),
sys.ARM64)
addF("internal/runtime/atomic", "Or64",
makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicOr64value, ssa.OpAtomicOr64valueVariant, types.TUINT64, atomicEmitterARM64),
sys.ARM64)
addF("internal/runtime/atomic", "Or32",
makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicOr32value, ssa.OpAtomicOr32valueVariant, types.TUINT32, atomicEmitterARM64),
sys.ARM64)
addF("internal/runtime/atomic", "Or",
makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicOr32value, ssa.OpAtomicOr32valueVariant, types.TUINT32, atomicEmitterARM64),
sys.ARM64)
// New-style atomic logical operations, which return the old memory value.
addF("internal/runtime/atomic", "And64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
v := s.newValue3(ssa.OpAtomicAnd64value, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem())
p0, p1 := s.split(v)
s.vars[memVar] = p1
return p0
},
cmd/compiler,internal/runtime/atomic: optimize And{64,32,8} and Or{64,32,8} on loong64 Use loong64's atomic operation instruction AMANDDB{V,W,W} (full barrier) to implement And{64,32,8}, AMORDB{V,W,W} (full barrier) to implement Or{64,32,8}. Intrinsify And{64,32,8} and Or{64,32,8}, And this CL alias all of the And/Or operations into sync/atomic package. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | And32 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20) And32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) And64 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20) And64Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) Or32 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20) Or32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) Or64 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20) Or64Parallel 28.97n ± 0% 12.41n ± 0% -57.16% (p=0.000 n=20) And8 29.15n ± 0% 13.21n ± 0% -54.68% (p=0.000 n=20) And 27.71n ± 0% 12.82n ± 0% -53.74% (p=0.000 n=20) And8Parallel 28.99n ± 0% 14.46n ± 0% -50.12% (p=0.000 n=20) AndParallel 29.12n ± 0% 14.42n ± 0% -50.48% (p=0.000 n=20) Or8 28.31n ± 0% 12.81n ± 0% -54.75% (p=0.000 n=20) Or 27.72n ± 0% 12.81n ± 0% -53.79% (p=0.000 n=20) Or8Parallel 29.03n ± 0% 14.62n ± 0% -49.64% (p=0.000 n=20) OrParallel 29.12n ± 0% 14.42n ± 0% -50.49% (p=0.000 n=20) geomean 28.47n 12.58n -55.80% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | And32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) And32Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) And64 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) And64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) And8 30.42n ± 0% 14.41n ± 0% -52.63% (p=0.000 n=20) And 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20) And8Parallel 31.23n ± 0% 15.21n ± 0% -51.30% (p=0.000 n=20) AndParallel 30.83n ± 0% 14.41n ± 0% -53.26% (p=0.000 n=20) Or32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) Or32Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) Or64 30.02n ± 0% 14.82n ± 0% -50.63% (p=0.000 n=20) Or64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) Or8 30.02n ± 0% 14.01n ± 0% -53.33% (p=0.000 n=20) Or 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20) Or8Parallel 30.83n ± 0% 14.81n ± 0% -51.96% (p=0.000 n=20) OrParallel 30.83n ± 0% 14.41n ± 0% -53.26% (p=0.000 n=20) geomean 30.47n 14.75n -51.61% Change-Id: If008ff6a08b51905076f8ddb6e92f8e214d3f7b3 Reviewed-on: https://go-review.googlesource.com/c/go/+/482756 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Meidan Li <limeidan@loongson.cn> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn> Reviewed-by: Cherry Mui <cherryyz@google.com>
2024-09-23 11:38:36 +08:00
sys.AMD64, sys.Loong64)
addF("internal/runtime/atomic", "And32",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
v := s.newValue3(ssa.OpAtomicAnd32value, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem())
p0, p1 := s.split(v)
s.vars[memVar] = p1
return p0
},
cmd/compiler,internal/runtime/atomic: optimize And{64,32,8} and Or{64,32,8} on loong64 Use loong64's atomic operation instruction AMANDDB{V,W,W} (full barrier) to implement And{64,32,8}, AMORDB{V,W,W} (full barrier) to implement Or{64,32,8}. Intrinsify And{64,32,8} and Or{64,32,8}, And this CL alias all of the And/Or operations into sync/atomic package. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | And32 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20) And32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) And64 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20) And64Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) Or32 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20) Or32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) Or64 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20) Or64Parallel 28.97n ± 0% 12.41n ± 0% -57.16% (p=0.000 n=20) And8 29.15n ± 0% 13.21n ± 0% -54.68% (p=0.000 n=20) And 27.71n ± 0% 12.82n ± 0% -53.74% (p=0.000 n=20) And8Parallel 28.99n ± 0% 14.46n ± 0% -50.12% (p=0.000 n=20) AndParallel 29.12n ± 0% 14.42n ± 0% -50.48% (p=0.000 n=20) Or8 28.31n ± 0% 12.81n ± 0% -54.75% (p=0.000 n=20) Or 27.72n ± 0% 12.81n ± 0% -53.79% (p=0.000 n=20) Or8Parallel 29.03n ± 0% 14.62n ± 0% -49.64% (p=0.000 n=20) OrParallel 29.12n ± 0% 14.42n ± 0% -50.49% (p=0.000 n=20) geomean 28.47n 12.58n -55.80% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | And32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) And32Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) And64 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) And64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) And8 30.42n ± 0% 14.41n ± 0% -52.63% (p=0.000 n=20) And 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20) And8Parallel 31.23n ± 0% 15.21n ± 0% -51.30% (p=0.000 n=20) AndParallel 30.83n ± 0% 14.41n ± 0% -53.26% (p=0.000 n=20) Or32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) Or32Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) Or64 30.02n ± 0% 14.82n ± 0% -50.63% (p=0.000 n=20) Or64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) Or8 30.02n ± 0% 14.01n ± 0% -53.33% (p=0.000 n=20) Or 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20) Or8Parallel 30.83n ± 0% 14.81n ± 0% -51.96% (p=0.000 n=20) OrParallel 30.83n ± 0% 14.41n ± 0% -53.26% (p=0.000 n=20) geomean 30.47n 14.75n -51.61% Change-Id: If008ff6a08b51905076f8ddb6e92f8e214d3f7b3 Reviewed-on: https://go-review.googlesource.com/c/go/+/482756 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Meidan Li <limeidan@loongson.cn> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn> Reviewed-by: Cherry Mui <cherryyz@google.com>
2024-09-23 11:38:36 +08:00
sys.AMD64, sys.Loong64)
addF("internal/runtime/atomic", "Or64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
v := s.newValue3(ssa.OpAtomicOr64value, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem())
p0, p1 := s.split(v)
s.vars[memVar] = p1
return p0
},
cmd/compiler,internal/runtime/atomic: optimize And{64,32,8} and Or{64,32,8} on loong64 Use loong64's atomic operation instruction AMANDDB{V,W,W} (full barrier) to implement And{64,32,8}, AMORDB{V,W,W} (full barrier) to implement Or{64,32,8}. Intrinsify And{64,32,8} and Or{64,32,8}, And this CL alias all of the And/Or operations into sync/atomic package. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | And32 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20) And32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) And64 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20) And64Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) Or32 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20) Or32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) Or64 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20) Or64Parallel 28.97n ± 0% 12.41n ± 0% -57.16% (p=0.000 n=20) And8 29.15n ± 0% 13.21n ± 0% -54.68% (p=0.000 n=20) And 27.71n ± 0% 12.82n ± 0% -53.74% (p=0.000 n=20) And8Parallel 28.99n ± 0% 14.46n ± 0% -50.12% (p=0.000 n=20) AndParallel 29.12n ± 0% 14.42n ± 0% -50.48% (p=0.000 n=20) Or8 28.31n ± 0% 12.81n ± 0% -54.75% (p=0.000 n=20) Or 27.72n ± 0% 12.81n ± 0% -53.79% (p=0.000 n=20) Or8Parallel 29.03n ± 0% 14.62n ± 0% -49.64% (p=0.000 n=20) OrParallel 29.12n ± 0% 14.42n ± 0% -50.49% (p=0.000 n=20) geomean 28.47n 12.58n -55.80% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | And32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) And32Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) And64 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) And64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) And8 30.42n ± 0% 14.41n ± 0% -52.63% (p=0.000 n=20) And 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20) And8Parallel 31.23n ± 0% 15.21n ± 0% -51.30% (p=0.000 n=20) AndParallel 30.83n ± 0% 14.41n ± 0% -53.26% (p=0.000 n=20) Or32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) Or32Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) Or64 30.02n ± 0% 14.82n ± 0% -50.63% (p=0.000 n=20) Or64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) Or8 30.02n ± 0% 14.01n ± 0% -53.33% (p=0.000 n=20) Or 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20) Or8Parallel 30.83n ± 0% 14.81n ± 0% -51.96% (p=0.000 n=20) OrParallel 30.83n ± 0% 14.41n ± 0% -53.26% (p=0.000 n=20) geomean 30.47n 14.75n -51.61% Change-Id: If008ff6a08b51905076f8ddb6e92f8e214d3f7b3 Reviewed-on: https://go-review.googlesource.com/c/go/+/482756 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Meidan Li <limeidan@loongson.cn> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn> Reviewed-by: Cherry Mui <cherryyz@google.com>
2024-09-23 11:38:36 +08:00
sys.AMD64, sys.Loong64)
addF("internal/runtime/atomic", "Or32",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
v := s.newValue3(ssa.OpAtomicOr32value, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem())
p0, p1 := s.split(v)
s.vars[memVar] = p1
return p0
},
cmd/compiler,internal/runtime/atomic: optimize And{64,32,8} and Or{64,32,8} on loong64 Use loong64's atomic operation instruction AMANDDB{V,W,W} (full barrier) to implement And{64,32,8}, AMORDB{V,W,W} (full barrier) to implement Or{64,32,8}. Intrinsify And{64,32,8} and Or{64,32,8}, And this CL alias all of the And/Or operations into sync/atomic package. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | And32 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20) And32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) And64 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20) And64Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) Or32 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20) Or32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) Or64 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20) Or64Parallel 28.97n ± 0% 12.41n ± 0% -57.16% (p=0.000 n=20) And8 29.15n ± 0% 13.21n ± 0% -54.68% (p=0.000 n=20) And 27.71n ± 0% 12.82n ± 0% -53.74% (p=0.000 n=20) And8Parallel 28.99n ± 0% 14.46n ± 0% -50.12% (p=0.000 n=20) AndParallel 29.12n ± 0% 14.42n ± 0% -50.48% (p=0.000 n=20) Or8 28.31n ± 0% 12.81n ± 0% -54.75% (p=0.000 n=20) Or 27.72n ± 0% 12.81n ± 0% -53.79% (p=0.000 n=20) Or8Parallel 29.03n ± 0% 14.62n ± 0% -49.64% (p=0.000 n=20) OrParallel 29.12n ± 0% 14.42n ± 0% -50.49% (p=0.000 n=20) geomean 28.47n 12.58n -55.80% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | And32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) And32Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) And64 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) And64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) And8 30.42n ± 0% 14.41n ± 0% -52.63% (p=0.000 n=20) And 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20) And8Parallel 31.23n ± 0% 15.21n ± 0% -51.30% (p=0.000 n=20) AndParallel 30.83n ± 0% 14.41n ± 0% -53.26% (p=0.000 n=20) Or32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) Or32Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) Or64 30.02n ± 0% 14.82n ± 0% -50.63% (p=0.000 n=20) Or64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) Or8 30.02n ± 0% 14.01n ± 0% -53.33% (p=0.000 n=20) Or 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20) Or8Parallel 30.83n ± 0% 14.81n ± 0% -51.96% (p=0.000 n=20) OrParallel 30.83n ± 0% 14.41n ± 0% -53.26% (p=0.000 n=20) geomean 30.47n 14.75n -51.61% Change-Id: If008ff6a08b51905076f8ddb6e92f8e214d3f7b3 Reviewed-on: https://go-review.googlesource.com/c/go/+/482756 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Meidan Li <limeidan@loongson.cn> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn> Reviewed-by: Cherry Mui <cherryyz@google.com>
2024-09-23 11:38:36 +08:00
sys.AMD64, sys.Loong64)
// Aliases for atomic load operations
alias("internal/runtime/atomic", "Loadint32", "internal/runtime/atomic", "Load", all...)
alias("internal/runtime/atomic", "Loadint64", "internal/runtime/atomic", "Load64", all...)
alias("internal/runtime/atomic", "Loaduintptr", "internal/runtime/atomic", "Load", p4...)
alias("internal/runtime/atomic", "Loaduintptr", "internal/runtime/atomic", "Load64", p8...)
alias("internal/runtime/atomic", "Loaduint", "internal/runtime/atomic", "Load", p4...)
alias("internal/runtime/atomic", "Loaduint", "internal/runtime/atomic", "Load64", p8...)
alias("internal/runtime/atomic", "LoadAcq", "internal/runtime/atomic", "Load", lwatomics...)
alias("internal/runtime/atomic", "LoadAcq64", "internal/runtime/atomic", "Load64", lwatomics...)
alias("internal/runtime/atomic", "LoadAcquintptr", "internal/runtime/atomic", "LoadAcq", p4...)
alias("sync", "runtime_LoadAcquintptr", "internal/runtime/atomic", "LoadAcq", p4...) // linknamed
alias("internal/runtime/atomic", "LoadAcquintptr", "internal/runtime/atomic", "LoadAcq64", p8...)
alias("sync", "runtime_LoadAcquintptr", "internal/runtime/atomic", "LoadAcq64", p8...) // linknamed
// Aliases for atomic store operations
alias("internal/runtime/atomic", "Storeint32", "internal/runtime/atomic", "Store", all...)
alias("internal/runtime/atomic", "Storeint64", "internal/runtime/atomic", "Store64", all...)
alias("internal/runtime/atomic", "Storeuintptr", "internal/runtime/atomic", "Store", p4...)
alias("internal/runtime/atomic", "Storeuintptr", "internal/runtime/atomic", "Store64", p8...)
alias("internal/runtime/atomic", "StoreRel", "internal/runtime/atomic", "Store", lwatomics...)
alias("internal/runtime/atomic", "StoreRel64", "internal/runtime/atomic", "Store64", lwatomics...)
alias("internal/runtime/atomic", "StoreReluintptr", "internal/runtime/atomic", "StoreRel", p4...)
alias("sync", "runtime_StoreReluintptr", "internal/runtime/atomic", "StoreRel", p4...) // linknamed
alias("internal/runtime/atomic", "StoreReluintptr", "internal/runtime/atomic", "StoreRel64", p8...)
alias("sync", "runtime_StoreReluintptr", "internal/runtime/atomic", "StoreRel64", p8...) // linknamed
// Aliases for atomic swap operations
alias("internal/runtime/atomic", "Xchgint32", "internal/runtime/atomic", "Xchg", all...)
alias("internal/runtime/atomic", "Xchgint64", "internal/runtime/atomic", "Xchg64", all...)
alias("internal/runtime/atomic", "Xchguintptr", "internal/runtime/atomic", "Xchg", p4...)
alias("internal/runtime/atomic", "Xchguintptr", "internal/runtime/atomic", "Xchg64", p8...)
// Aliases for atomic add operations
alias("internal/runtime/atomic", "Xaddint32", "internal/runtime/atomic", "Xadd", all...)
alias("internal/runtime/atomic", "Xaddint64", "internal/runtime/atomic", "Xadd64", all...)
alias("internal/runtime/atomic", "Xadduintptr", "internal/runtime/atomic", "Xadd", p4...)
alias("internal/runtime/atomic", "Xadduintptr", "internal/runtime/atomic", "Xadd64", p8...)
// Aliases for atomic CAS operations
alias("internal/runtime/atomic", "Casint32", "internal/runtime/atomic", "Cas", all...)
alias("internal/runtime/atomic", "Casint64", "internal/runtime/atomic", "Cas64", all...)
alias("internal/runtime/atomic", "Casuintptr", "internal/runtime/atomic", "Cas", p4...)
alias("internal/runtime/atomic", "Casuintptr", "internal/runtime/atomic", "Cas64", p8...)
alias("internal/runtime/atomic", "Casp1", "internal/runtime/atomic", "Cas", p4...)
alias("internal/runtime/atomic", "Casp1", "internal/runtime/atomic", "Cas64", p8...)
alias("internal/runtime/atomic", "CasRel", "internal/runtime/atomic", "Cas", lwatomics...)
// Aliases for atomic And/Or operations
cmd/compiler,internal/runtime/atomic: optimize And{64,32,8} and Or{64,32,8} on loong64 Use loong64's atomic operation instruction AMANDDB{V,W,W} (full barrier) to implement And{64,32,8}, AMORDB{V,W,W} (full barrier) to implement Or{64,32,8}. Intrinsify And{64,32,8} and Or{64,32,8}, And this CL alias all of the And/Or operations into sync/atomic package. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | And32 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20) And32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) And64 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20) And64Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) Or32 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20) Or32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) Or64 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20) Or64Parallel 28.97n ± 0% 12.41n ± 0% -57.16% (p=0.000 n=20) And8 29.15n ± 0% 13.21n ± 0% -54.68% (p=0.000 n=20) And 27.71n ± 0% 12.82n ± 0% -53.74% (p=0.000 n=20) And8Parallel 28.99n ± 0% 14.46n ± 0% -50.12% (p=0.000 n=20) AndParallel 29.12n ± 0% 14.42n ± 0% -50.48% (p=0.000 n=20) Or8 28.31n ± 0% 12.81n ± 0% -54.75% (p=0.000 n=20) Or 27.72n ± 0% 12.81n ± 0% -53.79% (p=0.000 n=20) Or8Parallel 29.03n ± 0% 14.62n ± 0% -49.64% (p=0.000 n=20) OrParallel 29.12n ± 0% 14.42n ± 0% -50.49% (p=0.000 n=20) geomean 28.47n 12.58n -55.80% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | And32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) And32Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) And64 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) And64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) And8 30.42n ± 0% 14.41n ± 0% -52.63% (p=0.000 n=20) And 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20) And8Parallel 31.23n ± 0% 15.21n ± 0% -51.30% (p=0.000 n=20) AndParallel 30.83n ± 0% 14.41n ± 0% -53.26% (p=0.000 n=20) Or32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) Or32Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) Or64 30.02n ± 0% 14.82n ± 0% -50.63% (p=0.000 n=20) Or64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) Or8 30.02n ± 0% 14.01n ± 0% -53.33% (p=0.000 n=20) Or 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20) Or8Parallel 30.83n ± 0% 14.81n ± 0% -51.96% (p=0.000 n=20) OrParallel 30.83n ± 0% 14.41n ± 0% -53.26% (p=0.000 n=20) geomean 30.47n 14.75n -51.61% Change-Id: If008ff6a08b51905076f8ddb6e92f8e214d3f7b3 Reviewed-on: https://go-review.googlesource.com/c/go/+/482756 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Meidan Li <limeidan@loongson.cn> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn> Reviewed-by: Cherry Mui <cherryyz@google.com>
2024-09-23 11:38:36 +08:00
alias("internal/runtime/atomic", "Anduintptr", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchLoong64)
alias("internal/runtime/atomic", "Oruintptr", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchLoong64)
/******** math ********/
addF("math", "sqrt",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpSqrt, types.Types[types.TFLOAT64], args[0])
},
sys.I386, sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm)
addF("math", "Trunc",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpTrunc, types.Types[types.TFLOAT64], args[0])
},
sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm)
addF("math", "Ceil",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpCeil, types.Types[types.TFLOAT64], args[0])
},
sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm)
addF("math", "Floor",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpFloor, types.Types[types.TFLOAT64], args[0])
},
sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm)
addF("math", "Round",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpRound, types.Types[types.TFLOAT64], args[0])
},
sys.ARM64, sys.PPC64, sys.S390X)
addF("math", "RoundToEven",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpRoundToEven, types.Types[types.TFLOAT64], args[0])
},
sys.ARM64, sys.S390X, sys.Wasm)
addF("math", "Abs",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpAbs, types.Types[types.TFLOAT64], args[0])
},
sys.ARM64, sys.ARM, sys.Loong64, sys.PPC64, sys.RISCV64, sys.Wasm, sys.MIPS, sys.MIPS64)
addF("math", "Copysign",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue2(ssa.OpCopysign, types.Types[types.TFLOAT64], args[0], args[1])
},
sys.Loong64, sys.PPC64, sys.RISCV64, sys.Wasm)
addF("math", "FMA",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2])
},
sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X)
addF("math", "FMA",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
if cfg.goamd64 >= 3 {
return s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2])
}
v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasFMA)
b := s.endBlock()
b.Kind = ssa.BlockIf
b.SetControl(v)
bTrue := s.f.NewBlock(ssa.BlockPlain)
bFalse := s.f.NewBlock(ssa.BlockPlain)
bEnd := s.f.NewBlock(ssa.BlockPlain)
b.AddEdgeTo(bTrue)
b.AddEdgeTo(bFalse)
b.Likely = ssa.BranchLikely // >= haswell cpus are common
// We have the intrinsic - use it directly.
s.startBlock(bTrue)
s.vars[n] = s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2])
s.endBlock().AddEdgeTo(bEnd)
// Call the pure Go version.
s.startBlock(bFalse)
s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64]
s.endBlock().AddEdgeTo(bEnd)
// Merge results.
s.startBlock(bEnd)
return s.variable(n, types.Types[types.TFLOAT64])
},
sys.AMD64)
addF("math", "FMA",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.ARMHasVFPv4, s.sb)
v := s.load(types.Types[types.TBOOL], addr)
b := s.endBlock()
b.Kind = ssa.BlockIf
b.SetControl(v)
bTrue := s.f.NewBlock(ssa.BlockPlain)
bFalse := s.f.NewBlock(ssa.BlockPlain)
bEnd := s.f.NewBlock(ssa.BlockPlain)
b.AddEdgeTo(bTrue)
b.AddEdgeTo(bFalse)
b.Likely = ssa.BranchLikely
// We have the intrinsic - use it directly.
s.startBlock(bTrue)
s.vars[n] = s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2])
s.endBlock().AddEdgeTo(bEnd)
// Call the pure Go version.
s.startBlock(bFalse)
s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64]
s.endBlock().AddEdgeTo(bEnd)
// Merge results.
s.startBlock(bEnd)
return s.variable(n, types.Types[types.TFLOAT64])
},
sys.ARM)
makeRoundAMD64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
if cfg.goamd64 >= 2 {
return s.newValue1(op, types.Types[types.TFLOAT64], args[0])
}
v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasSSE41)
b := s.endBlock()
b.Kind = ssa.BlockIf
b.SetControl(v)
bTrue := s.f.NewBlock(ssa.BlockPlain)
bFalse := s.f.NewBlock(ssa.BlockPlain)
bEnd := s.f.NewBlock(ssa.BlockPlain)
b.AddEdgeTo(bTrue)
b.AddEdgeTo(bFalse)
b.Likely = ssa.BranchLikely // most machines have sse4.1 nowadays
// We have the intrinsic - use it directly.
s.startBlock(bTrue)
s.vars[n] = s.newValue1(op, types.Types[types.TFLOAT64], args[0])
s.endBlock().AddEdgeTo(bEnd)
// Call the pure Go version.
s.startBlock(bFalse)
s.vars[n] = s.callResult(n, callNormal) // types.Types[TFLOAT64]
s.endBlock().AddEdgeTo(bEnd)
// Merge results.
s.startBlock(bEnd)
return s.variable(n, types.Types[types.TFLOAT64])
}
}
addF("math", "RoundToEven",
makeRoundAMD64(ssa.OpRoundToEven),
sys.AMD64)
addF("math", "Floor",
makeRoundAMD64(ssa.OpFloor),
sys.AMD64)
addF("math", "Ceil",
makeRoundAMD64(ssa.OpCeil),
sys.AMD64)
addF("math", "Trunc",
makeRoundAMD64(ssa.OpTrunc),
sys.AMD64)
/******** math/bits ********/
addF("math/bits", "TrailingZeros64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], args[0])
},
cmd/compile: wire up math/bits.TrailingZeros intrinsics for loong64 Micro-benchmark results on Loongson 3A5000 and 3A6000: goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | TrailingZeros 1.7240n ± 0% 0.8120n ± 0% -52.90% (p=0.000 n=20) TrailingZeros8 1.0530n ± 0% 0.8015n ± 0% -23.88% (p=0.000 n=20) TrailingZeros16 2.072n ± 0% 1.015n ± 0% -51.01% (p=0.000 n=20) TrailingZeros32 1.7160n ± 0% 0.8122n ± 0% -52.67% (p=0.000 n=20) TrailingZeros64 2.0060n ± 0% 0.8125n ± 0% -59.50% (p=0.000 n=20) geomean 1.669n 0.8470n -49.25% goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | TrailingZeros 2.6275n ± 0% 0.9120n ± 0% -65.29% (p=0.000 n=20) TrailingZeros8 1.451n ± 0% 1.163n ± 0% -19.85% (p=0.000 n=20) TrailingZeros16 3.069n ± 0% 1.201n ± 0% -60.87% (p=0.000 n=20) TrailingZeros32 2.9060n ± 0% 0.9115n ± 0% -68.63% (p=0.000 n=20) TrailingZeros64 2.6305n ± 0% 0.9115n ± 0% -65.35% (p=0.000 n=20) geomean 2.456n 1.011n -58.83% This patch is a copy of CL 479498. Co-authored-by: WANG Xuerui <git@xen0n.name> Change-Id: I1a5b2114a844dc0d02c8e68f41ce2443ac3b5fda Reviewed-on: https://go-review.googlesource.com/c/go/+/624356 Reviewed-by: abner chenc <chenguoqi@loongson.cn> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: Cherry Mui <cherryyz@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Keith Randall <khr@google.com>
2024-11-01 16:09:32 +08:00
sys.AMD64, sys.ARM64, sys.ARM, sys.Loong64, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
addF("math/bits", "TrailingZeros64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
lo := s.newValue1(ssa.OpInt64Lo, types.Types[types.TUINT32], args[0])
hi := s.newValue1(ssa.OpInt64Hi, types.Types[types.TUINT32], args[0])
return s.newValue2(ssa.OpCtz64On32, types.Types[types.TINT], lo, hi)
},
sys.I386)
addF("math/bits", "TrailingZeros32",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], args[0])
},
cmd/compile: wire up math/bits.TrailingZeros intrinsics for loong64 Micro-benchmark results on Loongson 3A5000 and 3A6000: goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | TrailingZeros 1.7240n ± 0% 0.8120n ± 0% -52.90% (p=0.000 n=20) TrailingZeros8 1.0530n ± 0% 0.8015n ± 0% -23.88% (p=0.000 n=20) TrailingZeros16 2.072n ± 0% 1.015n ± 0% -51.01% (p=0.000 n=20) TrailingZeros32 1.7160n ± 0% 0.8122n ± 0% -52.67% (p=0.000 n=20) TrailingZeros64 2.0060n ± 0% 0.8125n ± 0% -59.50% (p=0.000 n=20) geomean 1.669n 0.8470n -49.25% goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | TrailingZeros 2.6275n ± 0% 0.9120n ± 0% -65.29% (p=0.000 n=20) TrailingZeros8 1.451n ± 0% 1.163n ± 0% -19.85% (p=0.000 n=20) TrailingZeros16 3.069n ± 0% 1.201n ± 0% -60.87% (p=0.000 n=20) TrailingZeros32 2.9060n ± 0% 0.9115n ± 0% -68.63% (p=0.000 n=20) TrailingZeros64 2.6305n ± 0% 0.9115n ± 0% -65.35% (p=0.000 n=20) geomean 2.456n 1.011n -58.83% This patch is a copy of CL 479498. Co-authored-by: WANG Xuerui <git@xen0n.name> Change-Id: I1a5b2114a844dc0d02c8e68f41ce2443ac3b5fda Reviewed-on: https://go-review.googlesource.com/c/go/+/624356 Reviewed-by: abner chenc <chenguoqi@loongson.cn> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: Cherry Mui <cherryyz@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Keith Randall <khr@google.com>
2024-11-01 16:09:32 +08:00
sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.Loong64, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
addF("math/bits", "TrailingZeros16",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpCtz16, types.Types[types.TINT], args[0])
},
sys.AMD64, sys.ARM, sys.ARM64, sys.I386, sys.MIPS, sys.Loong64, sys.PPC64, sys.S390X, sys.Wasm)
addF("math/bits", "TrailingZeros8",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpCtz8, types.Types[types.TINT], args[0])
},
sys.AMD64, sys.ARM, sys.ARM64, sys.I386, sys.MIPS, sys.Loong64, sys.PPC64, sys.S390X, sys.Wasm)
if cfg.goriscv64 >= 22 {
addF("math/bits", "TrailingZeros64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], args[0])
},
sys.RISCV64)
addF("math/bits", "TrailingZeros32",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], args[0])
},
sys.RISCV64)
addF("math/bits", "TrailingZeros16",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpCtz16, types.Types[types.TINT], args[0])
},
sys.RISCV64)
addF("math/bits", "TrailingZeros8",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpCtz8, types.Types[types.TINT], args[0])
},
sys.RISCV64)
}
// ReverseBytes inlines correctly, no need to intrinsify it.
alias("math/bits", "ReverseBytes64", "internal/runtime/sys", "Bswap64", all...)
alias("math/bits", "ReverseBytes32", "internal/runtime/sys", "Bswap32", all...)
// Nothing special is needed for targets where ReverseBytes16 lowers to a rotate
addF("math/bits", "ReverseBytes16",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT16], args[0])
},
sys.Loong64)
if cfg.goppc64 >= 10 {
// On Power10, 16-bit rotate is not available so use BRH instruction
addF("math/bits", "ReverseBytes16",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT], args[0])
},
sys.PPC64)
}
if cfg.goriscv64 >= 22 {
addF("math/bits", "ReverseBytes16",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT16], args[0])
},
sys.RISCV64)
}
addF("math/bits", "Len64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], args[0])
},
sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.MIPS, sys.PPC64, sys.S390X, sys.Wasm)
addF("math/bits", "Len32",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0])
},
sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.MIPS, sys.PPC64, sys.S390X, sys.Wasm)
addF("math/bits", "Len16",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpBitLen16, types.Types[types.TINT], args[0])
},
sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.MIPS, sys.PPC64, sys.S390X, sys.Wasm)
addF("math/bits", "Len8",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpBitLen8, types.Types[types.TINT], args[0])
},
sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.MIPS, sys.PPC64, sys.S390X, sys.Wasm)
if cfg.goriscv64 >= 22 {
addF("math/bits", "Len64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], args[0])
},
sys.RISCV64)
addF("math/bits", "Len32",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0])
},
sys.RISCV64)
addF("math/bits", "Len16",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpBitLen16, types.Types[types.TINT], args[0])
},
sys.RISCV64)
addF("math/bits", "Len8",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpBitLen8, types.Types[types.TINT], args[0])
},
sys.RISCV64)
}
alias("math/bits", "Len", "math/bits", "Len64", p8...)
alias("math/bits", "Len", "math/bits", "Len32", p4...)
// LeadingZeros is handled because it trivially calls Len.
addF("math/bits", "Reverse64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpBitRev64, types.Types[types.TINT], args[0])
},
2024-11-02 15:40:13 +08:00
sys.ARM64, sys.Loong64)
addF("math/bits", "Reverse32",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpBitRev32, types.Types[types.TINT], args[0])
},
2024-11-02 15:40:13 +08:00
sys.ARM64, sys.Loong64)
addF("math/bits", "Reverse16",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpBitRev16, types.Types[types.TINT], args[0])
},
2024-11-02 15:40:13 +08:00
sys.ARM64, sys.Loong64)
addF("math/bits", "Reverse8",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpBitRev8, types.Types[types.TINT], args[0])
},
2024-11-02 15:40:13 +08:00
sys.ARM64, sys.Loong64)
addF("math/bits", "Reverse",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpBitRev64, types.Types[types.TINT], args[0])
},
2024-11-02 15:40:13 +08:00
sys.ARM64, sys.Loong64)
addF("math/bits", "RotateLeft8",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue2(ssa.OpRotateLeft8, types.Types[types.TUINT8], args[0], args[1])
},
sys.AMD64, sys.RISCV64)
addF("math/bits", "RotateLeft16",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue2(ssa.OpRotateLeft16, types.Types[types.TUINT16], args[0], args[1])
},
sys.AMD64, sys.RISCV64)
addF("math/bits", "RotateLeft32",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue2(ssa.OpRotateLeft32, types.Types[types.TUINT32], args[0], args[1])
},
sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm)
addF("math/bits", "RotateLeft64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue2(ssa.OpRotateLeft64, types.Types[types.TUINT64], args[0], args[1])
},
sys.AMD64, sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm)
alias("math/bits", "RotateLeft", "math/bits", "RotateLeft64", p8...)
makeOnesCountAMD64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
if cfg.goamd64 >= 2 {
return s.newValue1(op, types.Types[types.TINT], args[0])
}
v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasPOPCNT)
b := s.endBlock()
b.Kind = ssa.BlockIf
b.SetControl(v)
bTrue := s.f.NewBlock(ssa.BlockPlain)
bFalse := s.f.NewBlock(ssa.BlockPlain)
bEnd := s.f.NewBlock(ssa.BlockPlain)
b.AddEdgeTo(bTrue)
b.AddEdgeTo(bFalse)
b.Likely = ssa.BranchLikely // most machines have popcnt nowadays
// We have the intrinsic - use it directly.
s.startBlock(bTrue)
s.vars[n] = s.newValue1(op, types.Types[types.TINT], args[0])
s.endBlock().AddEdgeTo(bEnd)
// Call the pure Go version.
s.startBlock(bFalse)
s.vars[n] = s.callResult(n, callNormal) // types.Types[TINT]
s.endBlock().AddEdgeTo(bEnd)
// Merge results.
s.startBlock(bEnd)
return s.variable(n, types.Types[types.TINT])
}
}
cmd/compile: optimize math/bits.OnesCount{16,32,64} implementation on loong64 Use Loong64's LSX instruction VPCNT to implement math/bits.OnesCount{16,32,64} and make it intrinsic. Benchmark results on loongson 3A5000 and 3A6000 machines: goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A5000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | OnesCount 4.413n ± 0% 1.401n ± 0% -68.25% (p=0.000 n=10) OnesCount8 1.364n ± 0% 1.363n ± 0% ~ (p=0.130 n=10) OnesCount16 2.112n ± 0% 1.534n ± 0% -27.37% (p=0.000 n=10) OnesCount32 4.533n ± 0% 1.529n ± 0% -66.27% (p=0.000 n=10) OnesCount64 4.565n ± 0% 1.531n ± 1% -66.46% (p=0.000 n=10) geomean 3.048n 1.470n -51.78% goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | OnesCount 3.553n ± 0% 1.201n ± 0% -66.20% (p=0.000 n=10) OnesCount8 0.8021n ± 0% 0.8004n ± 0% -0.21% (p=0.000 n=10) OnesCount16 1.216n ± 0% 1.000n ± 0% -17.76% (p=0.000 n=10) OnesCount32 3.006n ± 0% 1.035n ± 0% -65.57% (p=0.000 n=10) OnesCount64 3.503n ± 0% 1.035n ± 0% -70.45% (p=0.000 n=10) geomean 2.053n 1.006n -51.01% Change-Id: I07a5b8da2bb48711b896387ec7625145804affc8 Reviewed-on: https://go-review.googlesource.com/c/go/+/620978 Reviewed-by: David Chase <drchase@google.com> Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: Meidan Li <limeidan@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
2024-10-18 16:31:29 +08:00
makeOnesCountLoong64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.Loong64HasLSX, s.sb)
v := s.load(types.Types[types.TBOOL], addr)
b := s.endBlock()
b.Kind = ssa.BlockIf
b.SetControl(v)
bTrue := s.f.NewBlock(ssa.BlockPlain)
bFalse := s.f.NewBlock(ssa.BlockPlain)
bEnd := s.f.NewBlock(ssa.BlockPlain)
b.AddEdgeTo(bTrue)
b.AddEdgeTo(bFalse)
b.Likely = ssa.BranchLikely // most loong64 machines support the LSX
// We have the intrinsic - use it directly.
s.startBlock(bTrue)
s.vars[n] = s.newValue1(op, types.Types[types.TINT], args[0])
s.endBlock().AddEdgeTo(bEnd)
// Call the pure Go version.
s.startBlock(bFalse)
s.vars[n] = s.callResult(n, callNormal) // types.Types[TINT]
s.endBlock().AddEdgeTo(bEnd)
// Merge results.
s.startBlock(bEnd)
return s.variable(n, types.Types[types.TINT])
}
}
cmd/compile,internal/cpu,runtime: intrinsify math/bits.OnesCount on riscv64 For riscv64/rva22u64 and above, we can intrinsify math/bits.OnesCount using the CPOP/CPOPW machine instructions. Since the native Go implementation of OnesCount is relatively expensive, it is also worth emitting a check for Zbb support when compiled for rva20u64. On a Banana Pi F3, with GORISCV64=rva22u64: │ oc.1 │ oc.2 │ │ sec/op │ sec/op vs base │ OnesCount-8 16.930n ± 0% 4.389n ± 0% -74.08% (p=0.000 n=10) OnesCount8-8 5.642n ± 0% 5.016n ± 0% -11.10% (p=0.000 n=10) OnesCount16-8 9.404n ± 0% 5.015n ± 0% -46.67% (p=0.000 n=10) OnesCount32-8 13.165n ± 0% 4.388n ± 0% -66.67% (p=0.000 n=10) OnesCount64-8 16.300n ± 0% 4.388n ± 0% -73.08% (p=0.000 n=10) geomean 11.40n 4.629n -59.40% On a Banana Pi F3, compiled with GORISCV64=rva20u64 and with Zbb detection enabled: │ oc.3 │ oc.4 │ │ sec/op │ sec/op vs base │ OnesCount-8 16.930n ± 0% 5.643n ± 0% -66.67% (p=0.000 n=10) OnesCount8-8 5.642n ± 0% 5.642n ± 0% ~ (p=0.447 n=10) OnesCount16-8 10.030n ± 0% 6.896n ± 0% -31.25% (p=0.000 n=10) OnesCount32-8 13.170n ± 0% 5.642n ± 0% -57.16% (p=0.000 n=10) OnesCount64-8 16.300n ± 0% 5.642n ± 0% -65.39% (p=0.000 n=10) geomean 11.55n 5.873n -49.16% On a Banana Pi F3, compiled with GORISCV64=rva20u64 but with Zbb detection disabled: │ oc.3 │ oc.5 │ │ sec/op │ sec/op vs base │ OnesCount-8 16.93n ± 0% 29.47n ± 0% +74.07% (p=0.000 n=10) OnesCount8-8 5.642n ± 0% 5.643n ± 0% ~ (p=0.191 n=10) OnesCount16-8 10.03n ± 0% 15.05n ± 0% +50.05% (p=0.000 n=10) OnesCount32-8 13.17n ± 0% 18.18n ± 0% +38.04% (p=0.000 n=10) OnesCount64-8 16.30n ± 0% 21.94n ± 0% +34.60% (p=0.000 n=10) geomean 11.55n 15.84n +37.16% For hardware without Zbb, this adds ~5ns overhead, while for hardware with Zbb we achieve a performance gain up of up to 11ns. It is worth noting that OnesCount8 is cheap enough that it is preferable to stick with the generic version in this case. Change-Id: Id657e40e0dd1b1ab8cc0fe0f8a68df4c9f2d7da5 Reviewed-on: https://go-review.googlesource.com/c/go/+/660856 Reviewed-by: Carlos Amedee <carlos@golang.org> Reviewed-by: Meng Zhuo <mengzhuo1203@gmail.com> Reviewed-by: Mark Ryan <markdryan@rivosinc.com> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
2025-03-19 23:57:23 +11:00
makeOnesCountRISCV64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
if cfg.goriscv64 >= 22 {
return s.newValue1(op, types.Types[types.TINT], args[0])
}
addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.RISCV64HasZbb, s.sb)
v := s.load(types.Types[types.TBOOL], addr)
b := s.endBlock()
b.Kind = ssa.BlockIf
b.SetControl(v)
bTrue := s.f.NewBlock(ssa.BlockPlain)
bFalse := s.f.NewBlock(ssa.BlockPlain)
bEnd := s.f.NewBlock(ssa.BlockPlain)
b.AddEdgeTo(bTrue)
b.AddEdgeTo(bFalse)
b.Likely = ssa.BranchLikely // Majority of RISC-V support Zbb.
// We have the intrinsic - use it directly.
s.startBlock(bTrue)
s.vars[n] = s.newValue1(op, types.Types[types.TINT], args[0])
s.endBlock().AddEdgeTo(bEnd)
// Call the pure Go version.
s.startBlock(bFalse)
s.vars[n] = s.callResult(n, callNormal) // types.Types[TINT]
s.endBlock().AddEdgeTo(bEnd)
// Merge results.
s.startBlock(bEnd)
return s.variable(n, types.Types[types.TINT])
}
}
addF("math/bits", "OnesCount64",
makeOnesCountAMD64(ssa.OpPopCount64),
sys.AMD64)
cmd/compile: optimize math/bits.OnesCount{16,32,64} implementation on loong64 Use Loong64's LSX instruction VPCNT to implement math/bits.OnesCount{16,32,64} and make it intrinsic. Benchmark results on loongson 3A5000 and 3A6000 machines: goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A5000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | OnesCount 4.413n ± 0% 1.401n ± 0% -68.25% (p=0.000 n=10) OnesCount8 1.364n ± 0% 1.363n ± 0% ~ (p=0.130 n=10) OnesCount16 2.112n ± 0% 1.534n ± 0% -27.37% (p=0.000 n=10) OnesCount32 4.533n ± 0% 1.529n ± 0% -66.27% (p=0.000 n=10) OnesCount64 4.565n ± 0% 1.531n ± 1% -66.46% (p=0.000 n=10) geomean 3.048n 1.470n -51.78% goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | OnesCount 3.553n ± 0% 1.201n ± 0% -66.20% (p=0.000 n=10) OnesCount8 0.8021n ± 0% 0.8004n ± 0% -0.21% (p=0.000 n=10) OnesCount16 1.216n ± 0% 1.000n ± 0% -17.76% (p=0.000 n=10) OnesCount32 3.006n ± 0% 1.035n ± 0% -65.57% (p=0.000 n=10) OnesCount64 3.503n ± 0% 1.035n ± 0% -70.45% (p=0.000 n=10) geomean 2.053n 1.006n -51.01% Change-Id: I07a5b8da2bb48711b896387ec7625145804affc8 Reviewed-on: https://go-review.googlesource.com/c/go/+/620978 Reviewed-by: David Chase <drchase@google.com> Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: Meidan Li <limeidan@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
2024-10-18 16:31:29 +08:00
addF("math/bits", "OnesCount64",
makeOnesCountLoong64(ssa.OpPopCount64),
sys.Loong64)
cmd/compile,internal/cpu,runtime: intrinsify math/bits.OnesCount on riscv64 For riscv64/rva22u64 and above, we can intrinsify math/bits.OnesCount using the CPOP/CPOPW machine instructions. Since the native Go implementation of OnesCount is relatively expensive, it is also worth emitting a check for Zbb support when compiled for rva20u64. On a Banana Pi F3, with GORISCV64=rva22u64: │ oc.1 │ oc.2 │ │ sec/op │ sec/op vs base │ OnesCount-8 16.930n ± 0% 4.389n ± 0% -74.08% (p=0.000 n=10) OnesCount8-8 5.642n ± 0% 5.016n ± 0% -11.10% (p=0.000 n=10) OnesCount16-8 9.404n ± 0% 5.015n ± 0% -46.67% (p=0.000 n=10) OnesCount32-8 13.165n ± 0% 4.388n ± 0% -66.67% (p=0.000 n=10) OnesCount64-8 16.300n ± 0% 4.388n ± 0% -73.08% (p=0.000 n=10) geomean 11.40n 4.629n -59.40% On a Banana Pi F3, compiled with GORISCV64=rva20u64 and with Zbb detection enabled: │ oc.3 │ oc.4 │ │ sec/op │ sec/op vs base │ OnesCount-8 16.930n ± 0% 5.643n ± 0% -66.67% (p=0.000 n=10) OnesCount8-8 5.642n ± 0% 5.642n ± 0% ~ (p=0.447 n=10) OnesCount16-8 10.030n ± 0% 6.896n ± 0% -31.25% (p=0.000 n=10) OnesCount32-8 13.170n ± 0% 5.642n ± 0% -57.16% (p=0.000 n=10) OnesCount64-8 16.300n ± 0% 5.642n ± 0% -65.39% (p=0.000 n=10) geomean 11.55n 5.873n -49.16% On a Banana Pi F3, compiled with GORISCV64=rva20u64 but with Zbb detection disabled: │ oc.3 │ oc.5 │ │ sec/op │ sec/op vs base │ OnesCount-8 16.93n ± 0% 29.47n ± 0% +74.07% (p=0.000 n=10) OnesCount8-8 5.642n ± 0% 5.643n ± 0% ~ (p=0.191 n=10) OnesCount16-8 10.03n ± 0% 15.05n ± 0% +50.05% (p=0.000 n=10) OnesCount32-8 13.17n ± 0% 18.18n ± 0% +38.04% (p=0.000 n=10) OnesCount64-8 16.30n ± 0% 21.94n ± 0% +34.60% (p=0.000 n=10) geomean 11.55n 15.84n +37.16% For hardware without Zbb, this adds ~5ns overhead, while for hardware with Zbb we achieve a performance gain up of up to 11ns. It is worth noting that OnesCount8 is cheap enough that it is preferable to stick with the generic version in this case. Change-Id: Id657e40e0dd1b1ab8cc0fe0f8a68df4c9f2d7da5 Reviewed-on: https://go-review.googlesource.com/c/go/+/660856 Reviewed-by: Carlos Amedee <carlos@golang.org> Reviewed-by: Meng Zhuo <mengzhuo1203@gmail.com> Reviewed-by: Mark Ryan <markdryan@rivosinc.com> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
2025-03-19 23:57:23 +11:00
addF("math/bits", "OnesCount64",
makeOnesCountRISCV64(ssa.OpPopCount64),
sys.RISCV64)
addF("math/bits", "OnesCount64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpPopCount64, types.Types[types.TINT], args[0])
},
sys.PPC64, sys.ARM64, sys.S390X, sys.Wasm)
addF("math/bits", "OnesCount32",
makeOnesCountAMD64(ssa.OpPopCount32),
sys.AMD64)
cmd/compile: optimize math/bits.OnesCount{16,32,64} implementation on loong64 Use Loong64's LSX instruction VPCNT to implement math/bits.OnesCount{16,32,64} and make it intrinsic. Benchmark results on loongson 3A5000 and 3A6000 machines: goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A5000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | OnesCount 4.413n ± 0% 1.401n ± 0% -68.25% (p=0.000 n=10) OnesCount8 1.364n ± 0% 1.363n ± 0% ~ (p=0.130 n=10) OnesCount16 2.112n ± 0% 1.534n ± 0% -27.37% (p=0.000 n=10) OnesCount32 4.533n ± 0% 1.529n ± 0% -66.27% (p=0.000 n=10) OnesCount64 4.565n ± 0% 1.531n ± 1% -66.46% (p=0.000 n=10) geomean 3.048n 1.470n -51.78% goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | OnesCount 3.553n ± 0% 1.201n ± 0% -66.20% (p=0.000 n=10) OnesCount8 0.8021n ± 0% 0.8004n ± 0% -0.21% (p=0.000 n=10) OnesCount16 1.216n ± 0% 1.000n ± 0% -17.76% (p=0.000 n=10) OnesCount32 3.006n ± 0% 1.035n ± 0% -65.57% (p=0.000 n=10) OnesCount64 3.503n ± 0% 1.035n ± 0% -70.45% (p=0.000 n=10) geomean 2.053n 1.006n -51.01% Change-Id: I07a5b8da2bb48711b896387ec7625145804affc8 Reviewed-on: https://go-review.googlesource.com/c/go/+/620978 Reviewed-by: David Chase <drchase@google.com> Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: Meidan Li <limeidan@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
2024-10-18 16:31:29 +08:00
addF("math/bits", "OnesCount32",
makeOnesCountLoong64(ssa.OpPopCount32),
sys.Loong64)
cmd/compile,internal/cpu,runtime: intrinsify math/bits.OnesCount on riscv64 For riscv64/rva22u64 and above, we can intrinsify math/bits.OnesCount using the CPOP/CPOPW machine instructions. Since the native Go implementation of OnesCount is relatively expensive, it is also worth emitting a check for Zbb support when compiled for rva20u64. On a Banana Pi F3, with GORISCV64=rva22u64: │ oc.1 │ oc.2 │ │ sec/op │ sec/op vs base │ OnesCount-8 16.930n ± 0% 4.389n ± 0% -74.08% (p=0.000 n=10) OnesCount8-8 5.642n ± 0% 5.016n ± 0% -11.10% (p=0.000 n=10) OnesCount16-8 9.404n ± 0% 5.015n ± 0% -46.67% (p=0.000 n=10) OnesCount32-8 13.165n ± 0% 4.388n ± 0% -66.67% (p=0.000 n=10) OnesCount64-8 16.300n ± 0% 4.388n ± 0% -73.08% (p=0.000 n=10) geomean 11.40n 4.629n -59.40% On a Banana Pi F3, compiled with GORISCV64=rva20u64 and with Zbb detection enabled: │ oc.3 │ oc.4 │ │ sec/op │ sec/op vs base │ OnesCount-8 16.930n ± 0% 5.643n ± 0% -66.67% (p=0.000 n=10) OnesCount8-8 5.642n ± 0% 5.642n ± 0% ~ (p=0.447 n=10) OnesCount16-8 10.030n ± 0% 6.896n ± 0% -31.25% (p=0.000 n=10) OnesCount32-8 13.170n ± 0% 5.642n ± 0% -57.16% (p=0.000 n=10) OnesCount64-8 16.300n ± 0% 5.642n ± 0% -65.39% (p=0.000 n=10) geomean 11.55n 5.873n -49.16% On a Banana Pi F3, compiled with GORISCV64=rva20u64 but with Zbb detection disabled: │ oc.3 │ oc.5 │ │ sec/op │ sec/op vs base │ OnesCount-8 16.93n ± 0% 29.47n ± 0% +74.07% (p=0.000 n=10) OnesCount8-8 5.642n ± 0% 5.643n ± 0% ~ (p=0.191 n=10) OnesCount16-8 10.03n ± 0% 15.05n ± 0% +50.05% (p=0.000 n=10) OnesCount32-8 13.17n ± 0% 18.18n ± 0% +38.04% (p=0.000 n=10) OnesCount64-8 16.30n ± 0% 21.94n ± 0% +34.60% (p=0.000 n=10) geomean 11.55n 15.84n +37.16% For hardware without Zbb, this adds ~5ns overhead, while for hardware with Zbb we achieve a performance gain up of up to 11ns. It is worth noting that OnesCount8 is cheap enough that it is preferable to stick with the generic version in this case. Change-Id: Id657e40e0dd1b1ab8cc0fe0f8a68df4c9f2d7da5 Reviewed-on: https://go-review.googlesource.com/c/go/+/660856 Reviewed-by: Carlos Amedee <carlos@golang.org> Reviewed-by: Meng Zhuo <mengzhuo1203@gmail.com> Reviewed-by: Mark Ryan <markdryan@rivosinc.com> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
2025-03-19 23:57:23 +11:00
addF("math/bits", "OnesCount32",
makeOnesCountRISCV64(ssa.OpPopCount32),
sys.RISCV64)
addF("math/bits", "OnesCount32",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpPopCount32, types.Types[types.TINT], args[0])
},
sys.PPC64, sys.ARM64, sys.S390X, sys.Wasm)
addF("math/bits", "OnesCount16",
makeOnesCountAMD64(ssa.OpPopCount16),
sys.AMD64)
cmd/compile: optimize math/bits.OnesCount{16,32,64} implementation on loong64 Use Loong64's LSX instruction VPCNT to implement math/bits.OnesCount{16,32,64} and make it intrinsic. Benchmark results on loongson 3A5000 and 3A6000 machines: goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A5000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | OnesCount 4.413n ± 0% 1.401n ± 0% -68.25% (p=0.000 n=10) OnesCount8 1.364n ± 0% 1.363n ± 0% ~ (p=0.130 n=10) OnesCount16 2.112n ± 0% 1.534n ± 0% -27.37% (p=0.000 n=10) OnesCount32 4.533n ± 0% 1.529n ± 0% -66.27% (p=0.000 n=10) OnesCount64 4.565n ± 0% 1.531n ± 1% -66.46% (p=0.000 n=10) geomean 3.048n 1.470n -51.78% goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | OnesCount 3.553n ± 0% 1.201n ± 0% -66.20% (p=0.000 n=10) OnesCount8 0.8021n ± 0% 0.8004n ± 0% -0.21% (p=0.000 n=10) OnesCount16 1.216n ± 0% 1.000n ± 0% -17.76% (p=0.000 n=10) OnesCount32 3.006n ± 0% 1.035n ± 0% -65.57% (p=0.000 n=10) OnesCount64 3.503n ± 0% 1.035n ± 0% -70.45% (p=0.000 n=10) geomean 2.053n 1.006n -51.01% Change-Id: I07a5b8da2bb48711b896387ec7625145804affc8 Reviewed-on: https://go-review.googlesource.com/c/go/+/620978 Reviewed-by: David Chase <drchase@google.com> Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: Meidan Li <limeidan@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
2024-10-18 16:31:29 +08:00
addF("math/bits", "OnesCount16",
makeOnesCountLoong64(ssa.OpPopCount16),
sys.Loong64)
cmd/compile,internal/cpu,runtime: intrinsify math/bits.OnesCount on riscv64 For riscv64/rva22u64 and above, we can intrinsify math/bits.OnesCount using the CPOP/CPOPW machine instructions. Since the native Go implementation of OnesCount is relatively expensive, it is also worth emitting a check for Zbb support when compiled for rva20u64. On a Banana Pi F3, with GORISCV64=rva22u64: │ oc.1 │ oc.2 │ │ sec/op │ sec/op vs base │ OnesCount-8 16.930n ± 0% 4.389n ± 0% -74.08% (p=0.000 n=10) OnesCount8-8 5.642n ± 0% 5.016n ± 0% -11.10% (p=0.000 n=10) OnesCount16-8 9.404n ± 0% 5.015n ± 0% -46.67% (p=0.000 n=10) OnesCount32-8 13.165n ± 0% 4.388n ± 0% -66.67% (p=0.000 n=10) OnesCount64-8 16.300n ± 0% 4.388n ± 0% -73.08% (p=0.000 n=10) geomean 11.40n 4.629n -59.40% On a Banana Pi F3, compiled with GORISCV64=rva20u64 and with Zbb detection enabled: │ oc.3 │ oc.4 │ │ sec/op │ sec/op vs base │ OnesCount-8 16.930n ± 0% 5.643n ± 0% -66.67% (p=0.000 n=10) OnesCount8-8 5.642n ± 0% 5.642n ± 0% ~ (p=0.447 n=10) OnesCount16-8 10.030n ± 0% 6.896n ± 0% -31.25% (p=0.000 n=10) OnesCount32-8 13.170n ± 0% 5.642n ± 0% -57.16% (p=0.000 n=10) OnesCount64-8 16.300n ± 0% 5.642n ± 0% -65.39% (p=0.000 n=10) geomean 11.55n 5.873n -49.16% On a Banana Pi F3, compiled with GORISCV64=rva20u64 but with Zbb detection disabled: │ oc.3 │ oc.5 │ │ sec/op │ sec/op vs base │ OnesCount-8 16.93n ± 0% 29.47n ± 0% +74.07% (p=0.000 n=10) OnesCount8-8 5.642n ± 0% 5.643n ± 0% ~ (p=0.191 n=10) OnesCount16-8 10.03n ± 0% 15.05n ± 0% +50.05% (p=0.000 n=10) OnesCount32-8 13.17n ± 0% 18.18n ± 0% +38.04% (p=0.000 n=10) OnesCount64-8 16.30n ± 0% 21.94n ± 0% +34.60% (p=0.000 n=10) geomean 11.55n 15.84n +37.16% For hardware without Zbb, this adds ~5ns overhead, while for hardware with Zbb we achieve a performance gain up of up to 11ns. It is worth noting that OnesCount8 is cheap enough that it is preferable to stick with the generic version in this case. Change-Id: Id657e40e0dd1b1ab8cc0fe0f8a68df4c9f2d7da5 Reviewed-on: https://go-review.googlesource.com/c/go/+/660856 Reviewed-by: Carlos Amedee <carlos@golang.org> Reviewed-by: Meng Zhuo <mengzhuo1203@gmail.com> Reviewed-by: Mark Ryan <markdryan@rivosinc.com> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
2025-03-19 23:57:23 +11:00
addF("math/bits", "OnesCount16",
makeOnesCountRISCV64(ssa.OpPopCount16),
sys.RISCV64)
addF("math/bits", "OnesCount16",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpPopCount16, types.Types[types.TINT], args[0])
},
sys.ARM64, sys.S390X, sys.PPC64, sys.Wasm)
addF("math/bits", "OnesCount8",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpPopCount8, types.Types[types.TINT], args[0])
},
sys.S390X, sys.PPC64, sys.Wasm)
cmd/compile,internal/cpu,runtime: intrinsify math/bits.OnesCount on riscv64 For riscv64/rva22u64 and above, we can intrinsify math/bits.OnesCount using the CPOP/CPOPW machine instructions. Since the native Go implementation of OnesCount is relatively expensive, it is also worth emitting a check for Zbb support when compiled for rva20u64. On a Banana Pi F3, with GORISCV64=rva22u64: │ oc.1 │ oc.2 │ │ sec/op │ sec/op vs base │ OnesCount-8 16.930n ± 0% 4.389n ± 0% -74.08% (p=0.000 n=10) OnesCount8-8 5.642n ± 0% 5.016n ± 0% -11.10% (p=0.000 n=10) OnesCount16-8 9.404n ± 0% 5.015n ± 0% -46.67% (p=0.000 n=10) OnesCount32-8 13.165n ± 0% 4.388n ± 0% -66.67% (p=0.000 n=10) OnesCount64-8 16.300n ± 0% 4.388n ± 0% -73.08% (p=0.000 n=10) geomean 11.40n 4.629n -59.40% On a Banana Pi F3, compiled with GORISCV64=rva20u64 and with Zbb detection enabled: │ oc.3 │ oc.4 │ │ sec/op │ sec/op vs base │ OnesCount-8 16.930n ± 0% 5.643n ± 0% -66.67% (p=0.000 n=10) OnesCount8-8 5.642n ± 0% 5.642n ± 0% ~ (p=0.447 n=10) OnesCount16-8 10.030n ± 0% 6.896n ± 0% -31.25% (p=0.000 n=10) OnesCount32-8 13.170n ± 0% 5.642n ± 0% -57.16% (p=0.000 n=10) OnesCount64-8 16.300n ± 0% 5.642n ± 0% -65.39% (p=0.000 n=10) geomean 11.55n 5.873n -49.16% On a Banana Pi F3, compiled with GORISCV64=rva20u64 but with Zbb detection disabled: │ oc.3 │ oc.5 │ │ sec/op │ sec/op vs base │ OnesCount-8 16.93n ± 0% 29.47n ± 0% +74.07% (p=0.000 n=10) OnesCount8-8 5.642n ± 0% 5.643n ± 0% ~ (p=0.191 n=10) OnesCount16-8 10.03n ± 0% 15.05n ± 0% +50.05% (p=0.000 n=10) OnesCount32-8 13.17n ± 0% 18.18n ± 0% +38.04% (p=0.000 n=10) OnesCount64-8 16.30n ± 0% 21.94n ± 0% +34.60% (p=0.000 n=10) geomean 11.55n 15.84n +37.16% For hardware without Zbb, this adds ~5ns overhead, while for hardware with Zbb we achieve a performance gain up of up to 11ns. It is worth noting that OnesCount8 is cheap enough that it is preferable to stick with the generic version in this case. Change-Id: Id657e40e0dd1b1ab8cc0fe0f8a68df4c9f2d7da5 Reviewed-on: https://go-review.googlesource.com/c/go/+/660856 Reviewed-by: Carlos Amedee <carlos@golang.org> Reviewed-by: Meng Zhuo <mengzhuo1203@gmail.com> Reviewed-by: Mark Ryan <markdryan@rivosinc.com> Reviewed-by: Dmitri Shuralyov <dmitshur@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
2025-03-19 23:57:23 +11:00
if cfg.goriscv64 >= 22 {
addF("math/bits", "OnesCount8",
makeOnesCountRISCV64(ssa.OpPopCount8),
sys.RISCV64)
}
alias("math/bits", "OnesCount", "math/bits", "OnesCount64", p8...)
addF("math/bits", "Mul64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue2(ssa.OpMul64uhilo, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1])
},
sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.MIPS64, sys.RISCV64, sys.Loong64)
alias("math/bits", "Mul", "math/bits", "Mul64", p8...)
alias("internal/runtime/math", "Mul64", "math/bits", "Mul64", p8...)
addF("math/bits", "Add64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue3(ssa.OpAdd64carry, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2])
},
sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.RISCV64, sys.Loong64, sys.MIPS64)
alias("math/bits", "Add", "math/bits", "Add64", p8...)
alias("internal/runtime/math", "Add64", "math/bits", "Add64", all...)
addF("math/bits", "Sub64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue3(ssa.OpSub64borrow, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2])
},
sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.RISCV64, sys.Loong64, sys.MIPS64)
alias("math/bits", "Sub", "math/bits", "Sub64", p8...)
addF("math/bits", "Div64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
// check for divide-by-zero/overflow and panic with appropriate message
cmpZero := s.newValue2(s.ssaOp(ir.ONE, types.Types[types.TUINT64]), types.Types[types.TBOOL], args[2], s.zeroVal(types.Types[types.TUINT64]))
s.check(cmpZero, ir.Syms.Panicdivide)
cmpOverflow := s.newValue2(s.ssaOp(ir.OLT, types.Types[types.TUINT64]), types.Types[types.TBOOL], args[0], args[2])
s.check(cmpOverflow, ir.Syms.Panicoverflow)
return s.newValue3(ssa.OpDiv128u, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2])
},
sys.AMD64)
alias("math/bits", "Div", "math/bits", "Div64", sys.ArchAMD64)
alias("internal/runtime/sys", "TrailingZeros8", "math/bits", "TrailingZeros8", all...)
alias("internal/runtime/sys", "TrailingZeros32", "math/bits", "TrailingZeros32", all...)
alias("internal/runtime/sys", "TrailingZeros64", "math/bits", "TrailingZeros64", all...)
alias("internal/runtime/sys", "Len8", "math/bits", "Len8", all...)
alias("internal/runtime/sys", "Len64", "math/bits", "Len64", all...)
alias("internal/runtime/sys", "OnesCount64", "math/bits", "OnesCount64", all...)
/******** sync/atomic ********/
// Note: these are disabled by flag_race in findIntrinsic below.
alias("sync/atomic", "LoadInt32", "internal/runtime/atomic", "Load", all...)
alias("sync/atomic", "LoadInt64", "internal/runtime/atomic", "Load64", all...)
alias("sync/atomic", "LoadPointer", "internal/runtime/atomic", "Loadp", all...)
alias("sync/atomic", "LoadUint32", "internal/runtime/atomic", "Load", all...)
alias("sync/atomic", "LoadUint64", "internal/runtime/atomic", "Load64", all...)
alias("sync/atomic", "LoadUintptr", "internal/runtime/atomic", "Load", p4...)
alias("sync/atomic", "LoadUintptr", "internal/runtime/atomic", "Load64", p8...)
alias("sync/atomic", "StoreInt32", "internal/runtime/atomic", "Store", all...)
alias("sync/atomic", "StoreInt64", "internal/runtime/atomic", "Store64", all...)
// Note: not StorePointer, that needs a write barrier. Same below for {CompareAnd}Swap.
alias("sync/atomic", "StoreUint32", "internal/runtime/atomic", "Store", all...)
alias("sync/atomic", "StoreUint64", "internal/runtime/atomic", "Store64", all...)
alias("sync/atomic", "StoreUintptr", "internal/runtime/atomic", "Store", p4...)
alias("sync/atomic", "StoreUintptr", "internal/runtime/atomic", "Store64", p8...)
alias("sync/atomic", "SwapInt32", "internal/runtime/atomic", "Xchg", all...)
alias("sync/atomic", "SwapInt64", "internal/runtime/atomic", "Xchg64", all...)
alias("sync/atomic", "SwapUint32", "internal/runtime/atomic", "Xchg", all...)
alias("sync/atomic", "SwapUint64", "internal/runtime/atomic", "Xchg64", all...)
alias("sync/atomic", "SwapUintptr", "internal/runtime/atomic", "Xchg", p4...)
alias("sync/atomic", "SwapUintptr", "internal/runtime/atomic", "Xchg64", p8...)
alias("sync/atomic", "CompareAndSwapInt32", "internal/runtime/atomic", "Cas", all...)
alias("sync/atomic", "CompareAndSwapInt64", "internal/runtime/atomic", "Cas64", all...)
alias("sync/atomic", "CompareAndSwapUint32", "internal/runtime/atomic", "Cas", all...)
alias("sync/atomic", "CompareAndSwapUint64", "internal/runtime/atomic", "Cas64", all...)
alias("sync/atomic", "CompareAndSwapUintptr", "internal/runtime/atomic", "Cas", p4...)
alias("sync/atomic", "CompareAndSwapUintptr", "internal/runtime/atomic", "Cas64", p8...)
alias("sync/atomic", "AddInt32", "internal/runtime/atomic", "Xadd", all...)
alias("sync/atomic", "AddInt64", "internal/runtime/atomic", "Xadd64", all...)
alias("sync/atomic", "AddUint32", "internal/runtime/atomic", "Xadd", all...)
alias("sync/atomic", "AddUint64", "internal/runtime/atomic", "Xadd64", all...)
alias("sync/atomic", "AddUintptr", "internal/runtime/atomic", "Xadd", p4...)
alias("sync/atomic", "AddUintptr", "internal/runtime/atomic", "Xadd64", p8...)
cmd/compiler,internal/runtime/atomic: optimize And{64,32,8} and Or{64,32,8} on loong64 Use loong64's atomic operation instruction AMANDDB{V,W,W} (full barrier) to implement And{64,32,8}, AMORDB{V,W,W} (full barrier) to implement Or{64,32,8}. Intrinsify And{64,32,8} and Or{64,32,8}, And this CL alias all of the And/Or operations into sync/atomic package. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | And32 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20) And32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) And64 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20) And64Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) Or32 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20) Or32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) Or64 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20) Or64Parallel 28.97n ± 0% 12.41n ± 0% -57.16% (p=0.000 n=20) And8 29.15n ± 0% 13.21n ± 0% -54.68% (p=0.000 n=20) And 27.71n ± 0% 12.82n ± 0% -53.74% (p=0.000 n=20) And8Parallel 28.99n ± 0% 14.46n ± 0% -50.12% (p=0.000 n=20) AndParallel 29.12n ± 0% 14.42n ± 0% -50.48% (p=0.000 n=20) Or8 28.31n ± 0% 12.81n ± 0% -54.75% (p=0.000 n=20) Or 27.72n ± 0% 12.81n ± 0% -53.79% (p=0.000 n=20) Or8Parallel 29.03n ± 0% 14.62n ± 0% -49.64% (p=0.000 n=20) OrParallel 29.12n ± 0% 14.42n ± 0% -50.49% (p=0.000 n=20) geomean 28.47n 12.58n -55.80% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | And32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) And32Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) And64 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) And64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) And8 30.42n ± 0% 14.41n ± 0% -52.63% (p=0.000 n=20) And 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20) And8Parallel 31.23n ± 0% 15.21n ± 0% -51.30% (p=0.000 n=20) AndParallel 30.83n ± 0% 14.41n ± 0% -53.26% (p=0.000 n=20) Or32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) Or32Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) Or64 30.02n ± 0% 14.82n ± 0% -50.63% (p=0.000 n=20) Or64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) Or8 30.02n ± 0% 14.01n ± 0% -53.33% (p=0.000 n=20) Or 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20) Or8Parallel 30.83n ± 0% 14.81n ± 0% -51.96% (p=0.000 n=20) OrParallel 30.83n ± 0% 14.41n ± 0% -53.26% (p=0.000 n=20) geomean 30.47n 14.75n -51.61% Change-Id: If008ff6a08b51905076f8ddb6e92f8e214d3f7b3 Reviewed-on: https://go-review.googlesource.com/c/go/+/482756 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Meidan Li <limeidan@loongson.cn> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn> Reviewed-by: Cherry Mui <cherryyz@google.com>
2024-09-23 11:38:36 +08:00
alias("sync/atomic", "AndInt32", "internal/runtime/atomic", "And32", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
alias("sync/atomic", "AndUint32", "internal/runtime/atomic", "And32", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
alias("sync/atomic", "AndInt64", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
alias("sync/atomic", "AndUint64", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
alias("sync/atomic", "AndUintptr", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
alias("sync/atomic", "OrInt32", "internal/runtime/atomic", "Or32", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
alias("sync/atomic", "OrUint32", "internal/runtime/atomic", "Or32", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
alias("sync/atomic", "OrInt64", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
alias("sync/atomic", "OrUint64", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
alias("sync/atomic", "OrUintptr", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
/******** math/big ********/
alias("math/big", "mulWW", "math/bits", "Mul64", p8...)
cmd/compile: intrinsify swissmap match calls with SIMD on amd64 Use similar SIMD operations to the ones used in Abseil. We still using 8-slot groups (even though the XMM registers could handle 16-slot groups) to keep the implementation simpler (no changes to the memory layout of maps). Still, the implementations of matchH2 and matchEmpty are shorter than the portable version using standard arithmetic operations. They also return a packed bitset, which avoids the need to shift in bitset.first. That said, the packed bitset is a downside in cognitive complexity, as we have to think about two different possible representations. This doesn't leak out of the API, but we do need to intrinsify bitset to switch to a compatible implementation. The compiler's intrinsics don't support intrinsifying methods, so the implementations move to free functions. This makes operations between 0-3% faster on my machine. e.g., MapGetHit/impl=runtimeMap/t=Int64/len=6-12 12.34n ± 1% 11.42n ± 1% -7.46% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=12-12 15.14n ± 2% 14.88n ± 1% -1.72% (p=0.009 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=18-12 15.04n ± 6% 14.66n ± 2% -2.53% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=24-12 15.80n ± 1% 15.48n ± 3% ~ (p=0.444 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=30-12 15.55n ± 4% 14.77n ± 3% -5.02% (p=0.004 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=64-12 15.26n ± 1% 15.05n ± 1% ~ (p=0.055 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=128-12 15.34n ± 1% 15.02n ± 2% -2.09% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=256-12 15.42n ± 1% 15.15n ± 1% -1.75% (p=0.001 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=512-12 15.48n ± 1% 15.18n ± 1% -1.94% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=1024-12 17.38n ± 1% 17.05n ± 1% -1.90% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=2048-12 17.96n ± 0% 17.59n ± 1% -2.06% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=4096-12 18.36n ± 1% 18.18n ± 1% -0.98% (p=0.013 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=8192-12 18.75n ± 0% 18.31n ± 1% -2.35% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=65536-12 26.25n ± 0% 25.95n ± 1% -1.14% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=262144-12 44.24n ± 1% 44.06n ± 1% ~ (p=0.181 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=1048576-12 85.02n ± 0% 85.35n ± 0% +0.39% (p=0.032 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=4194304-12 98.87n ± 1% 98.85n ± 1% ~ (p=0.799 n=25) For #54766. Cq-Include-Trybots: luci.golang.try:gotip-linux-ppc64_power10,gotip-linux-amd64-goamd64v3 Change-Id: Ic1b852f02744404122cb3672900fd95f4625905e Reviewed-on: https://go-review.googlesource.com/c/go/+/626277 Reviewed-by: Keith Randall <khr@golang.org> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Auto-Submit: Michael Pratt <mpratt@google.com> Reviewed-by: Keith Randall <khr@google.com>
2024-11-04 12:41:33 -05:00
/******** internal/runtime/maps ********/
// Important: The intrinsic implementations below return a packed
// bitset, while the portable Go implementation uses an unpacked
// representation (one bit set in each byte).
//
// Thus we must replace most bitset methods with implementations that
// work with the packed representation.
//
// TODO(prattmic): The bitset implementations don't use SIMD, so they
// could be handled with build tags (though that would break
// -d=ssa/intrinsics/off=1).
// With a packed representation we no longer need to shift the result
// of TrailingZeros64.
alias("internal/runtime/maps", "bitsetFirst", "internal/runtime/sys", "TrailingZeros64", sys.ArchAMD64)
addF("internal/runtime/maps", "bitsetRemoveBelow",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
b := args[0]
i := args[1]
// Clear the lower i bits in b.
//
// out = b &^ ((1 << i) - 1)
one := s.constInt64(types.Types[types.TUINT64], 1)
mask := s.newValue2(ssa.OpLsh8x8, types.Types[types.TUINT64], one, i)
mask = s.newValue2(ssa.OpSub64, types.Types[types.TUINT64], mask, one)
mask = s.newValue1(ssa.OpCom64, types.Types[types.TUINT64], mask)
return s.newValue2(ssa.OpAnd64, types.Types[types.TUINT64], b, mask)
},
sys.AMD64)
addF("internal/runtime/maps", "bitsetLowestSet",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
b := args[0]
// Test the lowest bit in b.
//
// out = (b & 1) == 1
one := s.constInt64(types.Types[types.TUINT64], 1)
and := s.newValue2(ssa.OpAnd64, types.Types[types.TUINT64], b, one)
return s.newValue2(ssa.OpEq64, types.Types[types.TBOOL], and, one)
},
sys.AMD64)
addF("internal/runtime/maps", "bitsetShiftOutLowest",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
b := args[0]
// Right shift out the lowest bit in b.
//
// out = b >> 1
one := s.constInt64(types.Types[types.TUINT64], 1)
return s.newValue2(ssa.OpRsh64Ux64, types.Types[types.TUINT64], b, one)
},
sys.AMD64)
addF("internal/runtime/maps", "ctrlGroupMatchH2",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
g := args[0]
h := args[1]
// Explicit copies to fp registers. See
// https://go.dev/issue/70451.
gfp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, g)
hfp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, h)
// Broadcast h2 into each byte of a word.
var broadcast *ssa.Value
if buildcfg.GOAMD64 >= 4 {
// VPBROADCASTB saves 1 instruction vs PSHUFB
// because the input can come from a GP
// register, while PSHUFB requires moving into
// an FP register first.
//
// Nominally PSHUFB would require a second
// additional instruction to load the control
// mask into a FP register. But broadcast uses
// a control mask of 0, and the register ABI
// already defines X15 as a zero register.
broadcast = s.newValue1(ssa.OpAMD64VPBROADCASTB, types.TypeInt128, h) // use gp copy of h
} else if buildcfg.GOAMD64 >= 2 {
// PSHUFB performs a byte broadcast when given
// a control input of 0.
broadcast = s.newValue1(ssa.OpAMD64PSHUFBbroadcast, types.TypeInt128, hfp)
} else {
// No direct byte broadcast. First we must
// duplicate the lower byte and then do a
// 16-bit broadcast.
// "Unpack" h2 with itself. This duplicates the
// input, resulting in h2 in the lower two
// bytes.
unpack := s.newValue2(ssa.OpAMD64PUNPCKLBW, types.TypeInt128, hfp, hfp)
// Copy the lower 16-bits of unpack into every
// 16-bit slot in the lower 64-bits of the
// output register. Note that immediate 0
// selects the low word as the source for every
// destination slot.
broadcast = s.newValue1I(ssa.OpAMD64PSHUFLW, types.TypeInt128, 0, unpack)
// No need to broadcast into the upper 64-bits,
// as we don't use those.
}
// Compare each byte of the control word with h2. Each
// matching byte has every bit set.
eq := s.newValue2(ssa.OpAMD64PCMPEQB, types.TypeInt128, broadcast, gfp)
// Construct a "byte mask": each output bit is equal to
// the sign bit each input byte.
//
// This results in a packed output (bit N set means
// byte N matched).
//
// NOTE: See comment above on bitsetFirst.
out := s.newValue1(ssa.OpAMD64PMOVMSKB, types.Types[types.TUINT16], eq)
// g is only 64-bits so the upper 64-bits of the
// 128-bit register will be zero. If h2 is also zero,
// then we'll get matches on those bytes. Truncate the
// upper bits to ignore such matches.
ret := s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], out)
return ret
},
sys.AMD64)
addF("internal/runtime/maps", "ctrlGroupMatchEmpty",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
// An empty slot is 1000 0000
// A deleted slot is 1111 1110
// A full slot is 0??? ????
g := args[0]
// Explicit copy to fp register. See
// https://go.dev/issue/70451.
gfp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, g)
if buildcfg.GOAMD64 >= 2 {
// "PSIGNB negates each data element of the
// destination operand (the first operand) if
// the signed integer value of the
// corresponding data element in the source
// operand (the second operand) is less than
// zero. If the signed integer value of a data
// element in the source operand is positive,
// the corresponding data element in the
// destination operand is unchanged. If a data
// element in the source operand is zero, the
// corresponding data element in the
// destination operand is set to zero" - Intel SDM
//
// If we pass the group control word as both
// arguments:
// - Full slots are unchanged.
// - Deleted slots are negated, becoming
// 0000 0010.
// - Empty slots are negated, becoming
// 1000 0000 (unchanged!).
//
// The result is that only empty slots have the
// sign bit set. We then use PMOVMSKB to
// extract the sign bits.
sign := s.newValue2(ssa.OpAMD64PSIGNB, types.TypeInt128, gfp, gfp)
// Construct a "byte mask": each output bit is
// equal to the sign bit each input byte. The
// sign bit is only set for empty or deleted
// slots.
//
// This results in a packed output (bit N set
// means byte N matched).
//
// NOTE: See comment above on bitsetFirst.
ret := s.newValue1(ssa.OpAMD64PMOVMSKB, types.Types[types.TUINT16], sign)
// g is only 64-bits so the upper 64-bits of
// the 128-bit register will be zero. PSIGNB
// will keep all of these bytes zero, so no
// need to truncate.
return ret
}
// No PSIGNB, simply do byte equality with ctrlEmpty.
// Load ctrlEmpty into each byte of a control word.
var ctrlsEmpty uint64 = abi.MapCtrlEmpty
cmd/compile: intrinsify swissmap match calls with SIMD on amd64 Use similar SIMD operations to the ones used in Abseil. We still using 8-slot groups (even though the XMM registers could handle 16-slot groups) to keep the implementation simpler (no changes to the memory layout of maps). Still, the implementations of matchH2 and matchEmpty are shorter than the portable version using standard arithmetic operations. They also return a packed bitset, which avoids the need to shift in bitset.first. That said, the packed bitset is a downside in cognitive complexity, as we have to think about two different possible representations. This doesn't leak out of the API, but we do need to intrinsify bitset to switch to a compatible implementation. The compiler's intrinsics don't support intrinsifying methods, so the implementations move to free functions. This makes operations between 0-3% faster on my machine. e.g., MapGetHit/impl=runtimeMap/t=Int64/len=6-12 12.34n ± 1% 11.42n ± 1% -7.46% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=12-12 15.14n ± 2% 14.88n ± 1% -1.72% (p=0.009 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=18-12 15.04n ± 6% 14.66n ± 2% -2.53% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=24-12 15.80n ± 1% 15.48n ± 3% ~ (p=0.444 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=30-12 15.55n ± 4% 14.77n ± 3% -5.02% (p=0.004 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=64-12 15.26n ± 1% 15.05n ± 1% ~ (p=0.055 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=128-12 15.34n ± 1% 15.02n ± 2% -2.09% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=256-12 15.42n ± 1% 15.15n ± 1% -1.75% (p=0.001 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=512-12 15.48n ± 1% 15.18n ± 1% -1.94% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=1024-12 17.38n ± 1% 17.05n ± 1% -1.90% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=2048-12 17.96n ± 0% 17.59n ± 1% -2.06% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=4096-12 18.36n ± 1% 18.18n ± 1% -0.98% (p=0.013 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=8192-12 18.75n ± 0% 18.31n ± 1% -2.35% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=65536-12 26.25n ± 0% 25.95n ± 1% -1.14% (p=0.000 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=262144-12 44.24n ± 1% 44.06n ± 1% ~ (p=0.181 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=1048576-12 85.02n ± 0% 85.35n ± 0% +0.39% (p=0.032 n=25) MapGetHit/impl=runtimeMap/t=Int64/len=4194304-12 98.87n ± 1% 98.85n ± 1% ~ (p=0.799 n=25) For #54766. Cq-Include-Trybots: luci.golang.try:gotip-linux-ppc64_power10,gotip-linux-amd64-goamd64v3 Change-Id: Ic1b852f02744404122cb3672900fd95f4625905e Reviewed-on: https://go-review.googlesource.com/c/go/+/626277 Reviewed-by: Keith Randall <khr@golang.org> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Auto-Submit: Michael Pratt <mpratt@google.com> Reviewed-by: Keith Randall <khr@google.com>
2024-11-04 12:41:33 -05:00
e := s.constInt64(types.Types[types.TUINT64], int64(ctrlsEmpty))
// Explicit copy to fp register. See
// https://go.dev/issue/70451.
efp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, e)
// Compare each byte of the control word with ctrlEmpty. Each
// matching byte has every bit set.
eq := s.newValue2(ssa.OpAMD64PCMPEQB, types.TypeInt128, efp, gfp)
// Construct a "byte mask": each output bit is equal to
// the sign bit each input byte.
//
// This results in a packed output (bit N set means
// byte N matched).
//
// NOTE: See comment above on bitsetFirst.
out := s.newValue1(ssa.OpAMD64PMOVMSKB, types.Types[types.TUINT16], eq)
// g is only 64-bits so the upper 64-bits of the
// 128-bit register will be zero. The upper 64-bits of
// efp are also zero, so we'll get matches on those
// bytes. Truncate the upper bits to ignore such
// matches.
return s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], out)
},
sys.AMD64)
addF("internal/runtime/maps", "ctrlGroupMatchEmptyOrDeleted",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
// An empty slot is 1000 0000
// A deleted slot is 1111 1110
// A full slot is 0??? ????
//
// A slot is empty or deleted iff bit 7 (sign bit) is
// set.
g := args[0]
// Explicit copy to fp register. See
// https://go.dev/issue/70451.
gfp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, g)
// Construct a "byte mask": each output bit is equal to
// the sign bit each input byte. The sign bit is only
// set for empty or deleted slots.
//
// This results in a packed output (bit N set means
// byte N matched).
//
// NOTE: See comment above on bitsetFirst.
ret := s.newValue1(ssa.OpAMD64PMOVMSKB, types.Types[types.TUINT16], gfp)
// g is only 64-bits so the upper 64-bits of the
// 128-bit register will be zero. Zero will never match
// ctrlEmpty or ctrlDeleted, so no need to truncate.
return ret
},
sys.AMD64)
addF("internal/runtime/maps", "ctrlGroupMatchFull",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
// An empty slot is 1000 0000
// A deleted slot is 1111 1110
// A full slot is 0??? ????
//
// A slot is full iff bit 7 (sign bit) is unset.
g := args[0]
// Explicit copy to fp register. See
// https://go.dev/issue/70451.
gfp := s.newValue1(ssa.OpAMD64MOVQi2f, types.TypeInt128, g)
// Construct a "byte mask": each output bit is equal to
// the sign bit each input byte. The sign bit is only
// set for empty or deleted slots.
//
// This results in a packed output (bit N set means
// byte N matched).
//
// NOTE: See comment above on bitsetFirst.
mask := s.newValue1(ssa.OpAMD64PMOVMSKB, types.Types[types.TUINT16], gfp)
// Invert the mask to set the bits for the full slots.
out := s.newValue1(ssa.OpCom16, types.Types[types.TUINT16], mask)
// g is only 64-bits so the upper 64-bits of the
// 128-bit register will be zero, with bit 7 unset.
// Truncate the upper bits to ignore these.
return s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], out)
},
sys.AMD64)
if buildcfg.Experiment.SIMD {
// Only enable intrinsics, if SIMD experiment.
simdIntrinsics(addF)
addF("simd", "ClearAVXUpperBits",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
s.vars[memVar] = s.newValue1(ssa.OpAMD64VZEROUPPER, types.TypeMem, s.mem())
return nil
},
sys.AMD64)
}
}
func opLen1(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(op, t, args[0])
}
}
func opLen2(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue2(op, t, args[0], args[1])
}
}
func opLen2_21(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue2(op, t, args[1], args[0])
}
}
func opLen3(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue3(op, t, args[0], args[1], args[2])
}
}
func opLen3_31(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue3(op, t, args[2], args[1], args[0])
}
}
func opLen3_21(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue3(op, t, args[1], args[0], args[2])
}
}
func opLen3_231(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue3(op, t, args[2], args[0], args[1])
}
}
func opLen4(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue4(op, t, args[0], args[1], args[2], args[3])
}
}
func opLen4_231(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue4(op, t, args[2], args[0], args[1], args[3])
}
}
func opLen4_31(op ssa.Op, t *types.Type) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue4(op, t, args[2], args[1], args[0], args[3])
}
}
func immJumpTable(s *state, idx *ssa.Value, intrinsicCall *ir.CallExpr, genOp func(*state, int)) *ssa.Value {
// Make blocks we'll need.
bEnd := s.f.NewBlock(ssa.BlockPlain)
t := types.Types[types.TUINT8]
if !idx.Type.IsKind(types.TUINT8) {
panic("immJumpTable expects uint8 value")
}
// We will exhaust 0-255, so no need to check the bounds.
b := s.curBlock
b.Kind = ssa.BlockJumpTable
b.Pos = intrinsicCall.Pos()
if base.Flag.Cfg.SpectreIndex {
// Potential Spectre vulnerability hardening?
idx = s.newValue2(ssa.OpSpectreSliceIndex, t, idx, s.uintptrConstant(255))
}
b.SetControl(idx)
targets := [256]*ssa.Block{}
for i := range 256 {
t := s.f.NewBlock(ssa.BlockPlain)
targets[i] = t
b.AddEdgeTo(t)
}
s.endBlock()
for i, t := range targets {
s.startBlock(t)
genOp(s, i)
t.AddEdgeTo(bEnd)
s.endBlock()
}
s.startBlock(bEnd)
ret := s.variable(intrinsicCall, intrinsicCall.Type())
return ret
}
func opLen1Imm8(op ssa.Op, t *types.Type, offset int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
if args[1].Op == ssa.OpConst8 {
return s.newValue1I(op, t, args[1].AuxInt<<int64(offset), args[0])
}
return immJumpTable(s, args[1], n, func(sNew *state, idx int) {
// Encode as int8 due to requirement of AuxInt, check its comment for details.
s.vars[n] = sNew.newValue1I(op, t, int64(int8(idx<<offset)), args[0])
})
}
}
func opLen2Imm8(op ssa.Op, t *types.Type, offset int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
if args[1].Op == ssa.OpConst8 {
return s.newValue2I(op, t, args[1].AuxInt<<int64(offset), args[0], args[2])
}
return immJumpTable(s, args[1], n, func(sNew *state, idx int) {
// Encode as int8 due to requirement of AuxInt, check its comment for details.
s.vars[n] = sNew.newValue2I(op, t, int64(int8(idx<<offset)), args[0], args[2])
})
}
}
func opLen3Imm8(op ssa.Op, t *types.Type, offset int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
if args[1].Op == ssa.OpConst8 {
return s.newValue3I(op, t, args[1].AuxInt<<int64(offset), args[0], args[2], args[3])
}
return immJumpTable(s, args[1], n, func(sNew *state, idx int) {
// Encode as int8 due to requirement of AuxInt, check its comment for details.
s.vars[n] = sNew.newValue3I(op, t, int64(int8(idx<<offset)), args[0], args[2], args[3])
})
}
}
func opLen2Imm8_2I(op ssa.Op, t *types.Type, offset int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
if args[2].Op == ssa.OpConst8 {
return s.newValue2I(op, t, args[2].AuxInt<<int64(offset), args[0], args[1])
}
return immJumpTable(s, args[2], n, func(sNew *state, idx int) {
// Encode as int8 due to requirement of AuxInt, check its comment for details.
s.vars[n] = sNew.newValue2I(op, t, int64(int8(idx<<offset)), args[0], args[1])
})
}
}
func opLen3Imm8_2I(op ssa.Op, t *types.Type, offset int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
if args[2].Op == ssa.OpConst8 {
return s.newValue3I(op, t, args[2].AuxInt<<int64(offset), args[0], args[1], args[3])
}
return immJumpTable(s, args[2], n, func(sNew *state, idx int) {
// Encode as int8 due to requirement of AuxInt, check its comment for details.
s.vars[n] = sNew.newValue3I(op, t, int64(int8(idx<<offset)), args[0], args[1], args[3])
})
}
}
func opLen4Imm8(op ssa.Op, t *types.Type, offset int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
if args[1].Op == ssa.OpConst8 {
return s.newValue4I(op, t, args[1].AuxInt<<int64(offset), args[0], args[2], args[3], args[4])
}
return immJumpTable(s, args[1], n, func(sNew *state, idx int) {
// Encode as int8 due to requirement of AuxInt, check its comment for details.
s.vars[n] = sNew.newValue4I(op, t, int64(int8(idx<<offset)), args[0], args[2], args[3], args[4])
})
}
}
func simdLoad() func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue2(ssa.OpLoad, n.Type(), args[0], s.mem())
}
}
func simdStore() func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
s.store(args[0].Type, args[1], args[0])
return nil
}
}
var loadMaskOpcodes = map[int]map[int]ssa.Op{
8: {16: ssa.OpLoadMask8x16, 32: ssa.OpLoadMask8x32, 64: ssa.OpLoadMask8x64},
16: {8: ssa.OpLoadMask16x8, 16: ssa.OpLoadMask16x16, 32: ssa.OpLoadMask16x32},
32: {4: ssa.OpLoadMask32x4, 8: ssa.OpLoadMask32x8, 16: ssa.OpLoadMask32x16},
64: {2: ssa.OpLoadMask64x2, 4: ssa.OpLoadMask64x4, 8: ssa.OpLoadMask64x8},
}
var cvtVToMaskOpcodes = map[int]map[int]ssa.Op{
8: {16: ssa.OpCvt16toMask8x16, 32: ssa.OpCvt32toMask8x32, 64: ssa.OpCvt64toMask8x64},
16: {8: ssa.OpCvt8toMask16x8, 16: ssa.OpCvt16toMask16x16, 32: ssa.OpCvt32toMask16x32},
32: {4: ssa.OpCvt8toMask32x4, 8: ssa.OpCvt8toMask32x8, 16: ssa.OpCvt16toMask32x16},
64: {2: ssa.OpCvt8toMask64x2, 4: ssa.OpCvt8toMask64x4, 8: ssa.OpCvt8toMask64x8},
}
var cvtMaskToVOpcodes = map[int]map[int]ssa.Op{
8: {16: ssa.OpCvtMask8x16to16, 32: ssa.OpCvtMask8x32to32, 64: ssa.OpCvtMask8x64to64},
16: {8: ssa.OpCvtMask16x8to8, 16: ssa.OpCvtMask16x16to16, 32: ssa.OpCvtMask16x32to32},
32: {4: ssa.OpCvtMask32x4to8, 8: ssa.OpCvtMask32x8to8, 16: ssa.OpCvtMask32x16to16},
64: {2: ssa.OpCvtMask64x2to8, 4: ssa.OpCvtMask64x4to8, 8: ssa.OpCvtMask64x8to8},
}
func simdLoadMask(elemBits, lanes int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
op := loadMaskOpcodes[elemBits][lanes]
if op == 0 {
panic(fmt.Sprintf("Unknown mask shape: Mask%dx%d", elemBits, lanes))
}
return s.newValue2(op, types.TypeMask, args[0], s.mem())
}
}
func simdStoreMask(elemBits, lanes int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
opCodes := map[int]map[int]ssa.Op{
8: {16: ssa.OpStoreMask8x16, 32: ssa.OpStoreMask8x32, 64: ssa.OpStoreMask8x64},
16: {8: ssa.OpStoreMask16x8, 16: ssa.OpStoreMask16x16, 32: ssa.OpStoreMask16x32},
32: {4: ssa.OpStoreMask32x4, 8: ssa.OpStoreMask32x8, 16: ssa.OpStoreMask32x16},
64: {2: ssa.OpStoreMask64x2, 4: ssa.OpStoreMask64x4, 8: ssa.OpStoreMask64x8},
}
op := opCodes[elemBits][lanes]
if op == 0 {
panic(fmt.Sprintf("Unknown mask shape: Mask%dx%d", elemBits, lanes))
}
s.vars[memVar] = s.newValue3A(op, types.TypeMem, types.TypeMask, args[1], args[0], s.mem())
return nil
}
}
func simdCvtVToMask(elemBits, lanes int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
op := cvtVToMaskOpcodes[elemBits][lanes]
if op == 0 {
panic(fmt.Sprintf("Unknown mask shape: Mask%dx%d", elemBits, lanes))
}
return s.newValue1(op, types.TypeMask, args[0])
}
}
func simdCvtMaskToV(elemBits, lanes int) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
op := cvtMaskToVOpcodes[elemBits][lanes]
if op == 0 {
panic(fmt.Sprintf("Unknown mask shape: Mask%dx%d", elemBits, lanes))
}
return s.newValue1(op, n.Type(), args[0])
}
}
func simdMaskedLoad(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue3(op, n.Type(), args[0], args[1], s.mem())
}
}
func simdMaskedStore(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
s.vars[memVar] = s.newValue4A(op, types.TypeMem, args[0].Type, args[1], args[2], args[0], s.mem())
return nil
}
}
// findIntrinsic returns a function which builds the SSA equivalent of the
// function identified by the symbol sym. If sym is not an intrinsic call, returns nil.
func findIntrinsic(sym *types.Sym) intrinsicBuilder {
if sym == nil || sym.Pkg == nil {
return nil
}
pkg := sym.Pkg.Path
if sym.Pkg == ir.Pkgs.Runtime {
pkg = "runtime"
}
if base.Flag.Race && pkg == "sync/atomic" {
// The race detector needs to be able to intercept these calls.
// We can't intrinsify them.
return nil
}
// Skip intrinsifying math functions (which may contain hard-float
// instructions) when soft-float
if Arch.SoftFloat && pkg == "math" {
return nil
}
fn := sym.Name
if ssa.IntrinsicsDisable {
if pkg == "internal/runtime/sys" && (fn == "GetCallerPC" || fn == "GrtCallerSP" || fn == "GetClosurePtr") ||
pkg == "internal/simd" || pkg == "simd" { // TODO after simd has been moved to package simd, remove internal/simd
// These runtime functions don't have definitions, must be intrinsics.
} else {
return nil
}
}
return intrinsics.lookup(Arch.LinkArch.Arch, pkg, fn)
}
func IsIntrinsicCall(n *ir.CallExpr) bool {
if n == nil {
return false
}
name, ok := n.Fun.(*ir.Name)
if !ok {
if n.Fun.Op() == ir.OMETHEXPR {
if meth := ir.MethodExprName(n.Fun); meth != nil {
if fn := meth.Func; fn != nil {
return IsIntrinsicSym(fn.Sym())
}
}
}
return false
}
return IsIntrinsicSym(name.Sym())
}
func IsIntrinsicSym(sym *types.Sym) bool {
return findIntrinsic(sym) != nil
}
// GenIntrinsicBody generates the function body for a bodyless intrinsic.
// This is used when the intrinsic is used in a non-call context, e.g.
// as a function pointer, or (for a method) being referenced from the type
// descriptor.
//
// The compiler already recognizes a call to fn as an intrinsic and can
// directly generate code for it. So we just fill in the body with a call
// to fn.
func GenIntrinsicBody(fn *ir.Func) {
if ir.CurFunc != nil {
base.FatalfAt(fn.Pos(), "enqueueFunc %v inside %v", fn, ir.CurFunc)
}
if base.Flag.LowerR != 0 {
fmt.Println("generate intrinsic for", ir.FuncName(fn))
}
pos := fn.Pos()
ft := fn.Type()
var ret ir.Node
// For a method, it usually starts with an ODOTMETH (pre-typecheck) or
// OMETHEXPR (post-typecheck) referencing the method symbol without the
// receiver type, and Walk rewrites it to a call directly to the
// type-qualified method symbol, moving the receiver to an argument.
// Here fn has already the type-qualified method symbol, and it is hard
// to get the unqualified symbol. So we just generate the post-Walk form
// and mark it typechecked and Walked.
call := ir.NewCallExpr(pos, ir.OCALLFUNC, fn.Nname, nil)
call.Args = ir.RecvParamNames(ft)
call.IsDDD = ft.IsVariadic()
typecheck.Exprs(call.Args)
call.SetTypecheck(1)
call.SetWalked(true)
ret = call
if ft.NumResults() > 0 {
if ft.NumResults() == 1 {
call.SetType(ft.Result(0).Type)
} else {
call.SetType(ft.ResultsTuple())
}
n := ir.NewReturnStmt(base.Pos, nil)
n.Results = []ir.Node{call}
ret = n
}
fn.Body.Append(ret)
if base.Flag.LowerR != 0 {
ir.DumpList("generate intrinsic body", fn.Body)
}
ir.CurFunc = fn
typecheck.Stmts(fn.Body)
ir.CurFunc = nil // we know CurFunc is nil at entry
}