go/src/cmd/compile/internal/ssagen/intrinsics.go

1232 lines
54 KiB
Go
Raw Normal View History

// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package ssagen
import (
"fmt"
"internal/buildcfg"
"cmd/compile/internal/base"
"cmd/compile/internal/ir"
"cmd/compile/internal/ssa"
"cmd/compile/internal/types"
"cmd/internal/sys"
)
var intrinsics intrinsicBuilders
// An intrinsicBuilder converts a call node n into an ssa value that
// implements that call as an intrinsic. args is a list of arguments to the func.
// Builders in this file whose intrinsic only updates the memory state
// (for example the one for runtime.KeepAlive) return nil.
type intrinsicBuilder func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value
// intrinsicKey identifies a single intrinsic: the function fn of package
// pkg (a package path such as "internal/runtime/atomic") on architecture
// arch. It is the key type of intrinsicBuilders.
type intrinsicKey struct {
	arch    *sys.Arch
	pkg, fn string
}
// intrinsicBuildConfig specifies the config to use for intrinsic building.
//
// When initIntrinsics is given a nil config it builds one from
// base.Flag.Cfg.Instrumenting and the buildcfg settings of the same
// (upper-cased) names: GO386, GOAMD64, GOARM, GOARM64, GOMIPS, GOMIPS64,
// GOPPC64 and GORISCV64.
type intrinsicBuildConfig struct {
	instrumenting bool

	go386              string
	goamd64            int
	goarm              buildcfg.GoarmFeatures
	goarm64            buildcfg.Goarm64Features
	gomips, gomips64   string
	goppc64, goriscv64 int
}
// intrinsicBuilders maps an (architecture, package, function) key to the
// builder that implements that function as an intrinsic.
type intrinsicBuilders map[intrinsicKey]intrinsicBuilder
// add registers b as the intrinsic builder for pkg.fn on the architecture
// arch. Registering the same (arch, pkg, fn) twice is treated as a
// programming error and panics.
func (ib intrinsicBuilders) add(arch *sys.Arch, pkg, fn string, b intrinsicBuilder) {
	key := intrinsicKey{arch, pkg, fn}
	if _, dup := ib[key]; dup {
		panic(fmt.Sprintf("intrinsic already exists for %v.%v on %v", pkg, fn, arch.Name))
	}
	ib[key] = b
}
// addForArchs registers the intrinsic builder b for pkg.fn on each of the
// architectures in archs, in order, by calling add for every one of them.
func (ib intrinsicBuilders) addForArchs(pkg, fn string, b intrinsicBuilder, archs ...*sys.Arch) {
	for i := range archs {
		ib.add(archs[i], pkg, fn, b)
	}
}
// addForFamilies does the same as addForArchs but operates on architecture
// families: b is registered for pkg.fn on every architecture in sys.Archs
// that belongs to one of archFamilies.
//
// The builder is added to the receiver ib. Previously this method wrote to
// the package-level intrinsics table regardless of the receiver, which only
// worked because every caller happened to use that table as the receiver.
func (ib intrinsicBuilders) addForFamilies(pkg, fn string, b intrinsicBuilder, archFamilies ...sys.ArchFamily) {
	for _, arch := range sys.Archs {
		if !arch.InFamily(archFamilies...) {
			continue
		}
		ib.add(arch, pkg, fn, b)
	}
}
// alias aliases pkg.fn to targetPkg.targetFn for all architectures in archs
// for which targetPkg.targetFn already exists. It panics if the target is
// not registered on any of the given architectures, or (via add) if pkg.fn
// is already registered on one of the architectures being aliased.
//
// Both the lookup of the target and the registration of the alias go
// through the receiver ib rather than the package-level intrinsics table,
// so the method behaves correctly on any intrinsicBuilders value.
func (ib intrinsicBuilders) alias(pkg, fn, targetPkg, targetFn string, archs ...*sys.Arch) {
	// TODO(jsing): Consider making this work even if the alias is added
	// before the intrinsic.
	aliased := false
	for _, arch := range archs {
		if b := ib.lookup(arch, targetPkg, targetFn); b != nil {
			ib.add(arch, pkg, fn, b)
			aliased = true
		}
	}
	if !aliased {
		panic(fmt.Sprintf("attempted to alias undefined intrinsic: %s.%s", pkg, fn))
	}
}
// lookup looks up the intrinsic for a pkg.fn on the specified architecture.
// It returns nil when no builder is registered under that key.
//
// The lookup is performed on the receiver ib; it used to index the
// package-level intrinsics table and ignore the receiver.
func (ib intrinsicBuilders) lookup(arch *sys.Arch, pkg, fn string) intrinsicBuilder {
	return ib[intrinsicKey{arch, pkg, fn}]
}
func initIntrinsics(cfg *intrinsicBuildConfig) {
if cfg == nil {
cfg = &intrinsicBuildConfig{
instrumenting: base.Flag.Cfg.Instrumenting,
go386: buildcfg.GO386,
goamd64: buildcfg.GOAMD64,
goarm: buildcfg.GOARM,
goarm64: buildcfg.GOARM64,
gomips: buildcfg.GOMIPS,
gomips64: buildcfg.GOMIPS64,
goppc64: buildcfg.GOPPC64,
goriscv64: buildcfg.GORISCV64,
}
}
intrinsics = intrinsicBuilders{}
// Partition the known architectures: p4 holds those with 4-byte pointers,
// p8 all the others, and lwatomics every architecture outside the PPC64
// family. all is a slice over the complete sys.Archs list.
var p4, p8, lwatomics []*sys.Arch
for _, arch := range sys.Archs {
	switch arch.PtrSize {
	case 4:
		p4 = append(p4, arch)
	default:
		p8 = append(p8, arch)
	}
	if arch.Family != sys.PPC64 {
		lwatomics = append(lwatomics, arch)
	}
}
all := sys.Archs[:]
// Local shorthands that forward to the corresponding methods of the
// package-level intrinsics table.
add := func(pkg, fn string, build intrinsicBuilder, archs ...*sys.Arch) {
	intrinsics.addForArchs(pkg, fn, build, archs...)
}
addF := func(pkg, fn string, build intrinsicBuilder, families ...sys.ArchFamily) {
	intrinsics.addForFamilies(pkg, fn, build, families...)
}
alias := func(pkg, fn, targetPkg, targetFn string, archs ...*sys.Arch) {
	intrinsics.alias(pkg, fn, targetPkg, targetFn, archs...)
}
/******** runtime ********/
if !cfg.instrumenting {
	// Compiler frontend optimizations emit OBYTES2STRTMP nodes
	// for the backend instead of slicebytetostringtmp calls
	// when not instrumenting.
	stringMake := func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		return s.newValue2(ssa.OpStringMake, n.Type(), args[0], args[1])
	}
	add("runtime", "slicebytetostringtmp", stringMake, all...)
}
addF("internal/runtime/math", "MulUintptr",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		// Both pointer sizes produce the same (uint, uint) tuple type;
		// only the multiply-with-overflow op differs.
		op := ssa.OpMul64uover
		if s.config.PtrSize == 4 {
			op = ssa.OpMul32uover
		}
		uintType := types.Types[types.TUINT]
		return s.newValue2(op, types.NewTuple(uintType, uintType), args[0], args[1])
	},
	sys.AMD64, sys.I386, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.ARM64)
// KeepAlive extracts the data word of its interface argument (OpIData) and
// threads it through an OpKeepAlive on the memory state. It has no result.
add("runtime", "KeepAlive",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		dataPtr := s.newValue1(ssa.OpIData, s.f.Config.Types.BytePtr, args[0])
		keepAlive := s.newValue2(ssa.OpKeepAlive, types.TypeMem, dataPtr, s.mem())
		s.vars[memVar] = keepAlive
		return nil
	},
	all...)
addF("runtime", "publicationBarrier",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
s.vars[memVar] = s.newValue1(ssa.OpPubBarrier, types.TypeMem, s.mem())
return nil
},
cmd/compile/internal: intrinsify publicationBarrier on loong64 The publication barrier is a StoreStore barrier, which is implemented by "DBAR 0x1A" [1] on loong64. goos: linux goarch: loong64 pkg: runtime cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | Malloc8 31.76n ± 0% 22.79n ± 0% -28.24% (p=0.000 n=20) Malloc8-2 25.46n ± 0% 18.33n ± 0% -28.00% (p=0.000 n=20) Malloc8-4 25.75n ± 0% 18.43n ± 0% -28.41% (p=0.000 n=20) Malloc16 62.97n ± 0% 42.41n ± 0% -32.65% (p=0.000 n=20) Malloc16-2 49.11n ± 0% 31.68n ± 0% -35.50% (p=0.000 n=20) Malloc16-4 49.64n ± 1% 31.95n ± 0% -35.62% (p=0.000 n=20) MallocTypeInfo8 58.57n ± 0% 46.51n ± 0% -20.61% (p=0.000 n=20) MallocTypeInfo8-2 51.43n ± 0% 38.01n ± 0% -26.09% (p=0.000 n=20) MallocTypeInfo8-4 51.65n ± 0% 38.15n ± 0% -26.13% (p=0.000 n=20) MallocTypeInfo16 68.07n ± 0% 51.62n ± 0% -24.17% (p=0.000 n=20) MallocTypeInfo16-2 54.73n ± 0% 41.13n ± 0% -24.85% (p=0.000 n=20) MallocTypeInfo16-4 55.05n ± 0% 41.28n ± 0% -25.02% (p=0.000 n=20) MallocLargeStruct 491.5n ± 0% 454.8n ± 0% -7.47% (p=0.000 n=20) MallocLargeStruct-2 351.8n ± 1% 323.8n ± 0% -7.94% (p=0.000 n=20) MallocLargeStruct-4 333.6n ± 0% 316.7n ± 0% -5.10% (p=0.000 n=20) geomean 71.01n 53.78n -24.26% [1]: https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html Change-Id: Ica0c89db6f2bebd55d9b3207a1c462a9454e9268 Reviewed-on: https://go-review.googlesource.com/c/go/+/577515 Reviewed-by: David Chase <drchase@google.com> Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Meidan Li <limeidan@loongson.cn> Reviewed-by: Carlos Amedee <carlos@golang.org>
2024-09-19 19:50:23 +08:00
sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64)
/******** internal/runtime/sys ********/
// GetCallerPC and GetClosurePtr are built from argument-free ops;
// GetCallerSP additionally takes the current memory state. All three are
// typed as uintptr and registered for every architecture.
add("internal/runtime/sys", "GetCallerPC",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		uintptrType := s.f.Config.Types.Uintptr
		return s.newValue0(ssa.OpGetCallerPC, uintptrType)
	},
	all...)
add("internal/runtime/sys", "GetCallerSP",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		uintptrType := s.f.Config.Types.Uintptr
		return s.newValue1(ssa.OpGetCallerSP, uintptrType, s.mem())
	},
	all...)
add("internal/runtime/sys", "GetClosurePtr",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		uintptrType := s.f.Config.Types.Uintptr
		return s.newValue0(ssa.OpGetClosurePtr, uintptrType)
	},
	all...)
// brev_arch lists the architecture families on which Bswap32 and Bswap64
// are intrinsified.
brev_arch := []sys.ArchFamily{sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.Loong64, sys.S390X}
if power10 := cfg.goppc64 >= 10; power10 {
	// Use only on Power10 as the new byte reverse instructions that Power10 provide
	// make it worthwhile as an intrinsic
	brev_arch = append(brev_arch, sys.PPC64)
}
addF("internal/runtime/sys", "Bswap32",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		u32 := types.Types[types.TUINT32]
		return s.newValue1(ssa.OpBswap32, u32, args[0])
	},
	brev_arch...)
addF("internal/runtime/sys", "Bswap64",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		u64 := types.Types[types.TUINT64]
		return s.newValue1(ssa.OpBswap64, u64, args[0])
	},
	brev_arch...)
/****** Prefetch ******/
// makePrefetchFunc returns a builder that applies the prefetch op to the
// address in args[0] and the current memory state, records the result as
// the new memory state, and yields no value.
makePrefetchFunc := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
	prefetch := func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		s.vars[memVar] = s.newValue2(op, types.TypeMem, args[0], s.mem())
		return nil
	}
	return prefetch
}
// Make Prefetch intrinsics for supported platforms
// On the unsupported platforms stub function will be eliminated
addF("internal/runtime/sys", "Prefetch",
	makePrefetchFunc(ssa.OpPrefetchCache),
	sys.AMD64, sys.ARM64, sys.PPC64)
addF("internal/runtime/sys", "PrefetchStreamed",
	makePrefetchFunc(ssa.OpPrefetchCacheStreamed),
	sys.AMD64, sys.ARM64, sys.PPC64)
/******** internal/runtime/atomic ********/
cmd/compiler,internal/runtime/atomic: optimize Store{64,32,8} on loong64 On Loong64, AMSWAPDB{W,V} instructions are supported by default, and AMSWAPDB{B,H} [1] is a new instruction added by LA664(Loongson 3A6000) and later microarchitectures. Therefore, AMSWAPDB{W,V} (full barrier) is used to implement AtomicStore{32,64}, and the traditional MOVB or the new AMSWAPDBB is used to implement AtomicStore8 according to the CPU feature. The StoreRelease barrier on Loong64 is "dbar 0x12", but it is still necessary to ensure consistency in the order of Store/Load [2]. LoweredAtomicStorezero{32,64} was removed because on loong64 the constant "0" uses the R0 register, and there is no performance difference between the implementations of LoweredAtomicStorezero{32,64} and LoweredAtomicStore{32,64}. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | AtomicStore64 19.61n ± 0% 13.61n ± 0% -30.60% (p=0.000 n=20) AtomicStore64-2 19.61n ± 0% 13.61n ± 0% -30.57% (p=0.000 n=20) AtomicStore64-4 19.62n ± 0% 13.61n ± 0% -30.63% (p=0.000 n=20) AtomicStore 19.61n ± 0% 13.61n ± 0% -30.60% (p=0.000 n=20) AtomicStore-2 19.62n ± 0% 13.61n ± 0% -30.63% (p=0.000 n=20) AtomicStore-4 19.62n ± 0% 13.62n ± 0% -30.58% (p=0.000 n=20) AtomicStore8 19.61n ± 0% 20.01n ± 0% +2.04% (p=0.000 n=20) AtomicStore8-2 19.62n ± 0% 20.02n ± 0% +2.01% (p=0.000 n=20) AtomicStore8-4 19.61n ± 0% 20.02n ± 0% +2.09% (p=0.000 n=20) geomean 19.61n 15.48n -21.08% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | AtomicStore64 18.03n ± 0% 12.81n ± 0% -28.93% (p=0.000 n=20) AtomicStore64-2 18.02n ± 0% 12.81n ± 0% -28.91% (p=0.000 n=20) AtomicStore64-4 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore 18.02n ± 0% 12.81n ± 0% -28.91% (p=0.000 n=20) AtomicStore-2 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore-4 18.01n ± 
0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore8 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore8-2 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore8-4 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) geomean 18.01n 12.81n -28.89% [1]: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html [2]: https://gcc.gnu.org/git/?p=gcc.git;a=blob_plain;f=gcc/config/loongarch/sync.md Change-Id: I4ae5e8dd0e6f026129b6e503990a763ed40c6097 Reviewed-on: https://go-review.googlesource.com/c/go/+/581356 Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn> Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Meidan Li <limeidan@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: David Chase <drchase@google.com>
2024-09-13 18:47:56 +08:00
type atomicOpEmitter func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool)
// Atomic loads. Each builder creates a (value, memory) tuple op from the
// address in args[0] and the current memory, records the memory half
// (Select1) as the new memory state and returns the value half (Select0).
addF("internal/runtime/atomic", "Load",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		resType := types.Types[types.TUINT32]
		ld := s.newValue2(ssa.OpAtomicLoad32, types.NewTuple(resType, types.TypeMem), args[0], s.mem())
		s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, ld)
		return s.newValue1(ssa.OpSelect0, resType, ld)
	},
	sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
addF("internal/runtime/atomic", "Load8",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		resType := types.Types[types.TUINT8]
		ld := s.newValue2(ssa.OpAtomicLoad8, types.NewTuple(resType, types.TypeMem), args[0], s.mem())
		s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, ld)
		return s.newValue1(ssa.OpSelect0, resType, ld)
	},
	sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
addF("internal/runtime/atomic", "Load64",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		resType := types.Types[types.TUINT64]
		ld := s.newValue2(ssa.OpAtomicLoad64, types.NewTuple(resType, types.TypeMem), args[0], s.mem())
		s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, ld)
		return s.newValue1(ssa.OpSelect0, resType, ld)
	},
	sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
addF("internal/runtime/atomic", "LoadAcq",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		resType := types.Types[types.TUINT32]
		ld := s.newValue2(ssa.OpAtomicLoadAcq32, types.NewTuple(resType, types.TypeMem), args[0], s.mem())
		s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, ld)
		return s.newValue1(ssa.OpSelect0, resType, ld)
	},
	sys.PPC64)
addF("internal/runtime/atomic", "LoadAcq64",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		resType := types.Types[types.TUINT64]
		ld := s.newValue2(ssa.OpAtomicLoadAcq64, types.NewTuple(resType, types.TypeMem), args[0], s.mem())
		s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, ld)
		return s.newValue1(ssa.OpSelect0, resType, ld)
	},
	sys.PPC64)
addF("internal/runtime/atomic", "Loadp",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		ld := s.newValue2(ssa.OpAtomicLoadPtr, types.NewTuple(s.f.Config.Types.BytePtr, types.TypeMem), args[0], s.mem())
		s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, ld)
		return s.newValue1(ssa.OpSelect0, s.f.Config.Types.BytePtr, ld)
	},
	sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
addF("internal/runtime/atomic", "Store",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
s.vars[memVar] = s.newValue3(ssa.OpAtomicStore32, types.TypeMem, args[0], args[1], s.mem())
return nil
},
cmd/compiler,internal/runtime/atomic: optimize Store{64,32,8} on loong64 On Loong64, AMSWAPDB{W,V} instructions are supported by default, and AMSWAPDB{B,H} [1] is a new instruction added by LA664(Loongson 3A6000) and later microarchitectures. Therefore, AMSWAPDB{W,V} (full barrier) is used to implement AtomicStore{32,64}, and the traditional MOVB or the new AMSWAPDBB is used to implement AtomicStore8 according to the CPU feature. The StoreRelease barrier on Loong64 is "dbar 0x12", but it is still necessary to ensure consistency in the order of Store/Load [2]. LoweredAtomicStorezero{32,64} was removed because on loong64 the constant "0" uses the R0 register, and there is no performance difference between the implementations of LoweredAtomicStorezero{32,64} and LoweredAtomicStore{32,64}. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | AtomicStore64 19.61n ± 0% 13.61n ± 0% -30.60% (p=0.000 n=20) AtomicStore64-2 19.61n ± 0% 13.61n ± 0% -30.57% (p=0.000 n=20) AtomicStore64-4 19.62n ± 0% 13.61n ± 0% -30.63% (p=0.000 n=20) AtomicStore 19.61n ± 0% 13.61n ± 0% -30.60% (p=0.000 n=20) AtomicStore-2 19.62n ± 0% 13.61n ± 0% -30.63% (p=0.000 n=20) AtomicStore-4 19.62n ± 0% 13.62n ± 0% -30.58% (p=0.000 n=20) AtomicStore8 19.61n ± 0% 20.01n ± 0% +2.04% (p=0.000 n=20) AtomicStore8-2 19.62n ± 0% 20.02n ± 0% +2.01% (p=0.000 n=20) AtomicStore8-4 19.61n ± 0% 20.02n ± 0% +2.09% (p=0.000 n=20) geomean 19.61n 15.48n -21.08% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | AtomicStore64 18.03n ± 0% 12.81n ± 0% -28.93% (p=0.000 n=20) AtomicStore64-2 18.02n ± 0% 12.81n ± 0% -28.91% (p=0.000 n=20) AtomicStore64-4 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore 18.02n ± 0% 12.81n ± 0% -28.91% (p=0.000 n=20) AtomicStore-2 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore-4 18.01n ± 
0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore8 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore8-2 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore8-4 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) geomean 18.01n 12.81n -28.89% [1]: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html [2]: https://gcc.gnu.org/git/?p=gcc.git;a=blob_plain;f=gcc/config/loongarch/sync.md Change-Id: I4ae5e8dd0e6f026129b6e503990a763ed40c6097 Reviewed-on: https://go-review.googlesource.com/c/go/+/581356 Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn> Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Meidan Li <limeidan@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: David Chase <drchase@google.com>
2024-09-13 18:47:56 +08:00
sys.AMD64, sys.ARM64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
addF("internal/runtime/atomic", "Store8",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
s.vars[memVar] = s.newValue3(ssa.OpAtomicStore8, types.TypeMem, args[0], args[1], s.mem())
return nil
},
cmd/compiler,internal/runtime/atomic: optimize Store{64,32,8} on loong64 On Loong64, AMSWAPDB{W,V} instructions are supported by default, and AMSWAPDB{B,H} [1] is a new instruction added by LA664(Loongson 3A6000) and later microarchitectures. Therefore, AMSWAPDB{W,V} (full barrier) is used to implement AtomicStore{32,64}, and the traditional MOVB or the new AMSWAPDBB is used to implement AtomicStore8 according to the CPU feature. The StoreRelease barrier on Loong64 is "dbar 0x12", but it is still necessary to ensure consistency in the order of Store/Load [2]. LoweredAtomicStorezero{32,64} was removed because on loong64 the constant "0" uses the R0 register, and there is no performance difference between the implementations of LoweredAtomicStorezero{32,64} and LoweredAtomicStore{32,64}. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | AtomicStore64 19.61n ± 0% 13.61n ± 0% -30.60% (p=0.000 n=20) AtomicStore64-2 19.61n ± 0% 13.61n ± 0% -30.57% (p=0.000 n=20) AtomicStore64-4 19.62n ± 0% 13.61n ± 0% -30.63% (p=0.000 n=20) AtomicStore 19.61n ± 0% 13.61n ± 0% -30.60% (p=0.000 n=20) AtomicStore-2 19.62n ± 0% 13.61n ± 0% -30.63% (p=0.000 n=20) AtomicStore-4 19.62n ± 0% 13.62n ± 0% -30.58% (p=0.000 n=20) AtomicStore8 19.61n ± 0% 20.01n ± 0% +2.04% (p=0.000 n=20) AtomicStore8-2 19.62n ± 0% 20.02n ± 0% +2.01% (p=0.000 n=20) AtomicStore8-4 19.61n ± 0% 20.02n ± 0% +2.09% (p=0.000 n=20) geomean 19.61n 15.48n -21.08% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | AtomicStore64 18.03n ± 0% 12.81n ± 0% -28.93% (p=0.000 n=20) AtomicStore64-2 18.02n ± 0% 12.81n ± 0% -28.91% (p=0.000 n=20) AtomicStore64-4 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore 18.02n ± 0% 12.81n ± 0% -28.91% (p=0.000 n=20) AtomicStore-2 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore-4 18.01n ± 
0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore8 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore8-2 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore8-4 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) geomean 18.01n 12.81n -28.89% [1]: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html [2]: https://gcc.gnu.org/git/?p=gcc.git;a=blob_plain;f=gcc/config/loongarch/sync.md Change-Id: I4ae5e8dd0e6f026129b6e503990a763ed40c6097 Reviewed-on: https://go-review.googlesource.com/c/go/+/581356 Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn> Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Meidan Li <limeidan@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: David Chase <drchase@google.com>
2024-09-13 18:47:56 +08:00
sys.AMD64, sys.ARM64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
addF("internal/runtime/atomic", "Store64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
s.vars[memVar] = s.newValue3(ssa.OpAtomicStore64, types.TypeMem, args[0], args[1], s.mem())
return nil
},
cmd/compiler,internal/runtime/atomic: optimize Store{64,32,8} on loong64 On Loong64, AMSWAPDB{W,V} instructions are supported by default, and AMSWAPDB{B,H} [1] is a new instruction added by LA664(Loongson 3A6000) and later microarchitectures. Therefore, AMSWAPDB{W,V} (full barrier) is used to implement AtomicStore{32,64}, and the traditional MOVB or the new AMSWAPDBB is used to implement AtomicStore8 according to the CPU feature. The StoreRelease barrier on Loong64 is "dbar 0x12", but it is still necessary to ensure consistency in the order of Store/Load [2]. LoweredAtomicStorezero{32,64} was removed because on loong64 the constant "0" uses the R0 register, and there is no performance difference between the implementations of LoweredAtomicStorezero{32,64} and LoweredAtomicStore{32,64}. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | AtomicStore64 19.61n ± 0% 13.61n ± 0% -30.60% (p=0.000 n=20) AtomicStore64-2 19.61n ± 0% 13.61n ± 0% -30.57% (p=0.000 n=20) AtomicStore64-4 19.62n ± 0% 13.61n ± 0% -30.63% (p=0.000 n=20) AtomicStore 19.61n ± 0% 13.61n ± 0% -30.60% (p=0.000 n=20) AtomicStore-2 19.62n ± 0% 13.61n ± 0% -30.63% (p=0.000 n=20) AtomicStore-4 19.62n ± 0% 13.62n ± 0% -30.58% (p=0.000 n=20) AtomicStore8 19.61n ± 0% 20.01n ± 0% +2.04% (p=0.000 n=20) AtomicStore8-2 19.62n ± 0% 20.02n ± 0% +2.01% (p=0.000 n=20) AtomicStore8-4 19.61n ± 0% 20.02n ± 0% +2.09% (p=0.000 n=20) geomean 19.61n 15.48n -21.08% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | AtomicStore64 18.03n ± 0% 12.81n ± 0% -28.93% (p=0.000 n=20) AtomicStore64-2 18.02n ± 0% 12.81n ± 0% -28.91% (p=0.000 n=20) AtomicStore64-4 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore 18.02n ± 0% 12.81n ± 0% -28.91% (p=0.000 n=20) AtomicStore-2 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore-4 18.01n ± 
0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore8 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore8-2 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore8-4 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) geomean 18.01n 12.81n -28.89% [1]: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html [2]: https://gcc.gnu.org/git/?p=gcc.git;a=blob_plain;f=gcc/config/loongarch/sync.md Change-Id: I4ae5e8dd0e6f026129b6e503990a763ed40c6097 Reviewed-on: https://go-review.googlesource.com/c/go/+/581356 Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn> Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Meidan Li <limeidan@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: David Chase <drchase@google.com>
2024-09-13 18:47:56 +08:00
sys.AMD64, sys.ARM64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
// StorepNoWB, StoreRel and StoreRel64 follow the same shape as the plain
// stores: one store op producing the new memory state, and no result.
addF("internal/runtime/atomic", "StorepNoWB",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		newMem := s.newValue3(ssa.OpAtomicStorePtrNoWB, types.TypeMem, args[0], args[1], s.mem())
		s.vars[memVar] = newMem
		return nil
	},
	sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.RISCV64, sys.S390X)
addF("internal/runtime/atomic", "StoreRel",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		newMem := s.newValue3(ssa.OpAtomicStoreRel32, types.TypeMem, args[0], args[1], s.mem())
		s.vars[memVar] = newMem
		return nil
	},
	sys.PPC64)
addF("internal/runtime/atomic", "StoreRel64",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		newMem := s.newValue3(ssa.OpAtomicStoreRel64, types.TypeMem, args[0], args[1], s.mem())
		s.vars[memVar] = newMem
		return nil
	},
	sys.PPC64)
cmd/compiler,internal/runtime/atomic: optimize Store{64,32,8} on loong64 On Loong64, AMSWAPDB{W,V} instructions are supported by default, and AMSWAPDB{B,H} [1] is a new instruction added by LA664(Loongson 3A6000) and later microarchitectures. Therefore, AMSWAPDB{W,V} (full barrier) is used to implement AtomicStore{32,64}, and the traditional MOVB or the new AMSWAPDBB is used to implement AtomicStore8 according to the CPU feature. The StoreRelease barrier on Loong64 is "dbar 0x12", but it is still necessary to ensure consistency in the order of Store/Load [2]. LoweredAtomicStorezero{32,64} was removed because on loong64 the constant "0" uses the R0 register, and there is no performance difference between the implementations of LoweredAtomicStorezero{32,64} and LoweredAtomicStore{32,64}. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | AtomicStore64 19.61n ± 0% 13.61n ± 0% -30.60% (p=0.000 n=20) AtomicStore64-2 19.61n ± 0% 13.61n ± 0% -30.57% (p=0.000 n=20) AtomicStore64-4 19.62n ± 0% 13.61n ± 0% -30.63% (p=0.000 n=20) AtomicStore 19.61n ± 0% 13.61n ± 0% -30.60% (p=0.000 n=20) AtomicStore-2 19.62n ± 0% 13.61n ± 0% -30.63% (p=0.000 n=20) AtomicStore-4 19.62n ± 0% 13.62n ± 0% -30.58% (p=0.000 n=20) AtomicStore8 19.61n ± 0% 20.01n ± 0% +2.04% (p=0.000 n=20) AtomicStore8-2 19.62n ± 0% 20.02n ± 0% +2.01% (p=0.000 n=20) AtomicStore8-4 19.61n ± 0% 20.02n ± 0% +2.09% (p=0.000 n=20) geomean 19.61n 15.48n -21.08% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | AtomicStore64 18.03n ± 0% 12.81n ± 0% -28.93% (p=0.000 n=20) AtomicStore64-2 18.02n ± 0% 12.81n ± 0% -28.91% (p=0.000 n=20) AtomicStore64-4 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore 18.02n ± 0% 12.81n ± 0% -28.91% (p=0.000 n=20) AtomicStore-2 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore-4 18.01n ± 
0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore8 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore8-2 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) AtomicStore8-4 18.01n ± 0% 12.81n ± 0% -28.87% (p=0.000 n=20) geomean 18.01n 12.81n -28.89% [1]: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html [2]: https://gcc.gnu.org/git/?p=gcc.git;a=blob_plain;f=gcc/config/loongarch/sync.md Change-Id: I4ae5e8dd0e6f026129b6e503990a763ed40c6097 Reviewed-on: https://go-review.googlesource.com/c/go/+/581356 Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn> Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Meidan Li <limeidan@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: David Chase <drchase@google.com>
2024-09-13 18:47:56 +08:00
makeAtomicGuardedIntrinsicLoong64common := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter, needReturn bool) intrinsicBuilder {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
// Target Atomic feature is identified by dynamic detection
addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.Loong64HasLAM_BH, s.sb)
v := s.load(types.Types[types.TBOOL], addr)
b := s.endBlock()
b.Kind = ssa.BlockIf
b.SetControl(v)
bTrue := s.f.NewBlock(ssa.BlockPlain)
bFalse := s.f.NewBlock(ssa.BlockPlain)
bEnd := s.f.NewBlock(ssa.BlockPlain)
b.AddEdgeTo(bTrue)
b.AddEdgeTo(bFalse)
b.Likely = ssa.BranchLikely
// We have atomic instructions - use it directly.
s.startBlock(bTrue)
emit(s, n, args, op1, typ, needReturn)
s.endBlock().AddEdgeTo(bEnd)
// Use original instruction sequence.
s.startBlock(bFalse)
emit(s, n, args, op0, typ, needReturn)
s.endBlock().AddEdgeTo(bEnd)
// Merge results.
s.startBlock(bEnd)
if needReturn {
return s.variable(n, types.Types[typ])
} else {
return nil
}
}
}
makeAtomicStoreGuardedIntrinsicLoong64 := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder {
return makeAtomicGuardedIntrinsicLoong64common(op0, op1, typ, emit, false)
}
// atomicStoreEmitterLoong64 emits op on (args[0], args[1], memory) as a
// (typ, memory) tuple, records the memory half as the new memory state and,
// only when needReturn is set, stores the value half in s.vars[n].
//
// NOTE(review): the other Store intrinsics in this file build their store
// op with types.TypeMem directly, whereas here the op is given a tuple type
// and its memory is extracted with Select1 - confirm this is intended.
atomicStoreEmitterLoong64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) {
	resType := types.Types[typ]
	tuple := s.newValue3(op, types.NewTuple(resType, types.TypeMem), args[0], args[1], s.mem())
	s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, tuple)
	if !needReturn {
		return
	}
	s.vars[n] = s.newValue1(ssa.OpSelect0, resType, tuple)
}
// Loong64 stores. Store8 uses the run-time guarded builder to choose
// between OpAtomicStore8 and OpAtomicStore8Variant; Store and Store64
// always use the 32- and 64-bit Variant ops.
addF("internal/runtime/atomic", "Store8",
	makeAtomicStoreGuardedIntrinsicLoong64(ssa.OpAtomicStore8, ssa.OpAtomicStore8Variant, types.TUINT8, atomicStoreEmitterLoong64),
	sys.Loong64)
addF("internal/runtime/atomic", "Store",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		newMem := s.newValue3(ssa.OpAtomicStore32Variant, types.TypeMem, args[0], args[1], s.mem())
		s.vars[memVar] = newMem
		return nil
	},
	sys.Loong64)
addF("internal/runtime/atomic", "Store64",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		newMem := s.newValue3(ssa.OpAtomicStore64Variant, types.TypeMem, args[0], args[1], s.mem())
		s.vars[memVar] = newMem
		return nil
	},
	sys.Loong64)
// Atomic exchanges. Each builder creates a (value, memory) tuple op from
// (args[0], args[1], memory), records the memory half as the new memory
// state and returns the value half.
addF("internal/runtime/atomic", "Xchg8",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		resType := types.Types[types.TUINT8]
		xchg := s.newValue3(ssa.OpAtomicExchange8, types.NewTuple(resType, types.TypeMem), args[0], args[1], s.mem())
		s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, xchg)
		return s.newValue1(ssa.OpSelect0, resType, xchg)
	},
	sys.AMD64, sys.PPC64)
addF("internal/runtime/atomic", "Xchg",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		resType := types.Types[types.TUINT32]
		xchg := s.newValue3(ssa.OpAtomicExchange32, types.NewTuple(resType, types.TypeMem), args[0], args[1], s.mem())
		s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, xchg)
		return s.newValue1(ssa.OpSelect0, resType, xchg)
	},
	sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
addF("internal/runtime/atomic", "Xchg64",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		resType := types.Types[types.TUINT64]
		xchg := s.newValue3(ssa.OpAtomicExchange64, types.NewTuple(resType, types.TypeMem), args[0], args[1], s.mem())
		s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, xchg)
		return s.newValue1(ssa.OpSelect0, resType, xchg)
	},
	sys.AMD64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
// makeAtomicGuardedIntrinsicARM64common returns an intrinsic builder for an
// atomic operation that has two implementations, op0 and op1. If
// cfg.goarm64.LSE is set, op1 is emitted unconditionally. Otherwise the
// generated code loads the boolean ir.Syms.ARM64HasATOMICS and branches on
// it, emitting op1 on the true path (marked likely) and op0 on the false
// path. With needReturn set the builder returns the value of kind typ
// recorded for n; otherwise it returns nil.
makeAtomicGuardedIntrinsicARM64common := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter, needReturn bool) intrinsicBuilder {
	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		if cfg.goarm64.LSE {
			emit(s, n, args, op1, typ, needReturn)
		} else {
			// Target Atomic feature is identified by dynamic detection
			flagAddr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.ARM64HasATOMICS, s.sb)
			hasAtomics := s.load(types.Types[types.TBOOL], flagAddr)
			branch := s.endBlock()
			branch.Kind = ssa.BlockIf
			branch.SetControl(hasAtomics)
			withAtomics := s.f.NewBlock(ssa.BlockPlain)
			fallback := s.f.NewBlock(ssa.BlockPlain)
			join := s.f.NewBlock(ssa.BlockPlain)
			branch.AddEdgeTo(withAtomics)
			branch.AddEdgeTo(fallback)
			branch.Likely = ssa.BranchLikely

			// We have atomic instructions - use it directly.
			s.startBlock(withAtomics)
			emit(s, n, args, op1, typ, needReturn)
			s.endBlock().AddEdgeTo(join)

			// Use original instruction sequence.
			s.startBlock(fallback)
			emit(s, n, args, op0, typ, needReturn)
			s.endBlock().AddEdgeTo(join)

			// Merge results.
			s.startBlock(join)
		}
		if !needReturn {
			return nil
		}
		return s.variable(n, types.Types[typ])
	}
}
makeAtomicGuardedIntrinsicARM64 := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder {
return makeAtomicGuardedIntrinsicARM64common(op0, op1, typ, emit, true)
}
makeAtomicGuardedIntrinsicARM64old := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder {
return makeAtomicGuardedIntrinsicARM64common(op0, op1, typ, emit, false)
}
// atomicEmitterARM64 emits op over (args[0], args[1], memory), producing a
// (typ, memory) tuple. The memory half always becomes the current memory
// state; the value half is stored in s.vars[n] only when needReturn is set.
atomicEmitterARM64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) {
	tuple := s.newValue3(op, types.NewTuple(types.Types[typ], types.TypeMem), args[0], args[1], s.mem())
	s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, tuple)
	if !needReturn {
		return
	}
	s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], tuple)
}
// arm64 registrations for the exchange intrinsics. Each passes the base op
// and its ...Variant op to makeAtomicGuardedIntrinsicARM64, with
// atomicEmitterARM64 as the emitter.
addF("internal/runtime/atomic", "Xchg8",
makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange8, ssa.OpAtomicExchange8Variant, types.TUINT8, atomicEmitterARM64),
sys.ARM64)
addF("internal/runtime/atomic", "Xchg",
makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange32, ssa.OpAtomicExchange32Variant, types.TUINT32, atomicEmitterARM64),
sys.ARM64)
addF("internal/runtime/atomic", "Xchg64",
makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicExchange64, ssa.OpAtomicExchange64Variant, types.TUINT64, atomicEmitterARM64),
sys.ARM64)
// Xadd and Xadd64 each lower to a single atomic-add op over
// (args[0], args[1], memory). The op yields a (value, memory) tuple: the
// memory half becomes the current memory state and the value half is the
// intrinsic's result.
addF("internal/runtime/atomic", "Xadd",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		resultAndMem := types.NewTuple(types.Types[types.TUINT32], types.TypeMem)
		sum := s.newValue3(ssa.OpAtomicAdd32, resultAndMem, args[0], args[1], s.mem())
		// Record the new memory state before handing back the value half.
		s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, sum)
		return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT32], sum)
	},
	sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
addF("internal/runtime/atomic", "Xadd64",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		resultAndMem := types.NewTuple(types.Types[types.TUINT64], types.TypeMem)
		sum := s.newValue3(ssa.OpAtomicAdd64, resultAndMem, args[0], args[1], s.mem())
		// Record the new memory state before handing back the value half.
		s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, sum)
		return s.newValue1(ssa.OpSelect0, types.Types[types.TUINT64], sum)
	},
	sys.AMD64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
// arm64 registrations for the add intrinsics. Each passes the base op and its
// ...Variant op to makeAtomicGuardedIntrinsicARM64, with atomicEmitterARM64
// as the emitter.
addF("internal/runtime/atomic", "Xadd",
makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAdd32, ssa.OpAtomicAdd32Variant, types.TUINT32, atomicEmitterARM64),
sys.ARM64)
addF("internal/runtime/atomic", "Xadd64",
makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAdd64, ssa.OpAtomicAdd64Variant, types.TUINT64, atomicEmitterARM64),
sys.ARM64)
// Cas, Cas64 and CasRel each lower to a single compare-and-swap op over
// (args[0], args[1], args[2], memory). The op yields a (bool, memory) tuple:
// the memory half becomes the current memory state and the bool half is the
// intrinsic's result.
addF("internal/runtime/atomic", "Cas",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		okAndMem := types.NewTuple(types.Types[types.TBOOL], types.TypeMem)
		cas := s.newValue4(ssa.OpAtomicCompareAndSwap32, okAndMem, args[0], args[1], args[2], s.mem())
		s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, cas)
		return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], cas)
	},
	sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
addF("internal/runtime/atomic", "Cas64",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		okAndMem := types.NewTuple(types.Types[types.TBOOL], types.TypeMem)
		cas := s.newValue4(ssa.OpAtomicCompareAndSwap64, okAndMem, args[0], args[1], args[2], s.mem())
		s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, cas)
		return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], cas)
	},
	sys.AMD64, sys.Loong64, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
// CasRel is registered for PPC64 only and uses the same 32-bit op as Cas.
addF("internal/runtime/atomic", "CasRel",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		okAndMem := types.NewTuple(types.Types[types.TBOOL], types.TypeMem)
		cas := s.newValue4(ssa.OpAtomicCompareAndSwap32, okAndMem, args[0], args[1], args[2], s.mem())
		s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, cas)
		return s.newValue1(ssa.OpSelect0, types.Types[types.TBOOL], cas)
	},
	sys.PPC64)
// atomicCasEmitterARM64 emits the compare-and-swap op over
// (args[0], args[1], args[2], memory). The tuple's value half is always typed
// bool, while the selected result is typed with typ. The memory half always
// becomes the current memory state; the result is stored in s.vars[n] only
// when needReturn is set.
atomicCasEmitterARM64 := func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool) {
	tuple := s.newValue4(op, types.NewTuple(types.Types[types.TBOOL], types.TypeMem), args[0], args[1], args[2], s.mem())
	s.vars[memVar] = s.newValue1(ssa.OpSelect1, types.TypeMem, tuple)
	if !needReturn {
		return
	}
	s.vars[n] = s.newValue1(ssa.OpSelect0, types.Types[typ], tuple)
}
// arm64 registrations for compare-and-swap. Each passes the base op and its
// ...Variant op to makeAtomicGuardedIntrinsicARM64 with a bool result kind,
// and uses atomicCasEmitterARM64 as the emitter.
addF("internal/runtime/atomic", "Cas",
makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicCompareAndSwap32, ssa.OpAtomicCompareAndSwap32Variant, types.TBOOL, atomicCasEmitterARM64),
sys.ARM64)
addF("internal/runtime/atomic", "Cas64",
makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicCompareAndSwap64, ssa.OpAtomicCompareAndSwap64Variant, types.TBOOL, atomicCasEmitterARM64),
sys.ARM64)
// Old-style atomic logical operation API (all supported archs except arm64).
addF("internal/runtime/atomic", "And8",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
s.vars[memVar] = s.newValue3(ssa.OpAtomicAnd8, types.TypeMem, args[0], args[1], s.mem())
return nil
},
cmd/compiler,internal/runtime/atomic: optimize And{64,32,8} and Or{64,32,8} on loong64 Use loong64's atomic operation instruction AMANDDB{V,W,W} (full barrier) to implement And{64,32,8}, AMORDB{V,W,W} (full barrier) to implement Or{64,32,8}. Intrinsify And{64,32,8} and Or{64,32,8}, And this CL alias all of the And/Or operations into sync/atomic package. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | And32 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20) And32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) And64 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20) And64Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) Or32 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20) Or32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) Or64 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20) Or64Parallel 28.97n ± 0% 12.41n ± 0% -57.16% (p=0.000 n=20) And8 29.15n ± 0% 13.21n ± 0% -54.68% (p=0.000 n=20) And 27.71n ± 0% 12.82n ± 0% -53.74% (p=0.000 n=20) And8Parallel 28.99n ± 0% 14.46n ± 0% -50.12% (p=0.000 n=20) AndParallel 29.12n ± 0% 14.42n ± 0% -50.48% (p=0.000 n=20) Or8 28.31n ± 0% 12.81n ± 0% -54.75% (p=0.000 n=20) Or 27.72n ± 0% 12.81n ± 0% -53.79% (p=0.000 n=20) Or8Parallel 29.03n ± 0% 14.62n ± 0% -49.64% (p=0.000 n=20) OrParallel 29.12n ± 0% 14.42n ± 0% -50.49% (p=0.000 n=20) geomean 28.47n 12.58n -55.80% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | And32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) And32Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) And64 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) And64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) And8 30.42n ± 0% 14.41n ± 0% -52.63% (p=0.000 n=20) And 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20) And8Parallel 31.23n ± 0% 15.21n ± 0% -51.30% (p=0.000 n=20) AndParallel 30.83n ± 0% 14.41n ± 0% 
-53.26% (p=0.000 n=20) Or32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) Or32Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) Or64 30.02n ± 0% 14.82n ± 0% -50.63% (p=0.000 n=20) Or64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) Or8 30.02n ± 0% 14.01n ± 0% -53.33% (p=0.000 n=20) Or 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20) Or8Parallel 30.83n ± 0% 14.81n ± 0% -51.96% (p=0.000 n=20) OrParallel 30.83n ± 0% 14.41n ± 0% -53.26% (p=0.000 n=20) geomean 30.47n 14.75n -51.61% Change-Id: If008ff6a08b51905076f8ddb6e92f8e214d3f7b3 Reviewed-on: https://go-review.googlesource.com/c/go/+/482756 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Meidan Li <limeidan@loongson.cn> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn> Reviewed-by: Cherry Mui <cherryyz@google.com>
2024-09-23 11:38:36 +08:00
sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
addF("internal/runtime/atomic", "And",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
s.vars[memVar] = s.newValue3(ssa.OpAtomicAnd32, types.TypeMem, args[0], args[1], s.mem())
return nil
},
cmd/compiler,internal/runtime/atomic: optimize And{64,32,8} and Or{64,32,8} on loong64 Use loong64's atomic operation instruction AMANDDB{V,W,W} (full barrier) to implement And{64,32,8}, AMORDB{V,W,W} (full barrier) to implement Or{64,32,8}. Intrinsify And{64,32,8} and Or{64,32,8}, And this CL alias all of the And/Or operations into sync/atomic package. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | And32 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20) And32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) And64 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20) And64Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) Or32 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20) Or32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) Or64 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20) Or64Parallel 28.97n ± 0% 12.41n ± 0% -57.16% (p=0.000 n=20) And8 29.15n ± 0% 13.21n ± 0% -54.68% (p=0.000 n=20) And 27.71n ± 0% 12.82n ± 0% -53.74% (p=0.000 n=20) And8Parallel 28.99n ± 0% 14.46n ± 0% -50.12% (p=0.000 n=20) AndParallel 29.12n ± 0% 14.42n ± 0% -50.48% (p=0.000 n=20) Or8 28.31n ± 0% 12.81n ± 0% -54.75% (p=0.000 n=20) Or 27.72n ± 0% 12.81n ± 0% -53.79% (p=0.000 n=20) Or8Parallel 29.03n ± 0% 14.62n ± 0% -49.64% (p=0.000 n=20) OrParallel 29.12n ± 0% 14.42n ± 0% -50.49% (p=0.000 n=20) geomean 28.47n 12.58n -55.80% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | And32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) And32Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) And64 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) And64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) And8 30.42n ± 0% 14.41n ± 0% -52.63% (p=0.000 n=20) And 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20) And8Parallel 31.23n ± 0% 15.21n ± 0% -51.30% (p=0.000 n=20) AndParallel 30.83n ± 0% 14.41n ± 0% 
-53.26% (p=0.000 n=20) Or32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) Or32Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) Or64 30.02n ± 0% 14.82n ± 0% -50.63% (p=0.000 n=20) Or64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) Or8 30.02n ± 0% 14.01n ± 0% -53.33% (p=0.000 n=20) Or 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20) Or8Parallel 30.83n ± 0% 14.81n ± 0% -51.96% (p=0.000 n=20) OrParallel 30.83n ± 0% 14.41n ± 0% -53.26% (p=0.000 n=20) geomean 30.47n 14.75n -51.61% Change-Id: If008ff6a08b51905076f8ddb6e92f8e214d3f7b3 Reviewed-on: https://go-review.googlesource.com/c/go/+/482756 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Meidan Li <limeidan@loongson.cn> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn> Reviewed-by: Cherry Mui <cherryyz@google.com>
2024-09-23 11:38:36 +08:00
sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
addF("internal/runtime/atomic", "Or8",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
s.vars[memVar] = s.newValue3(ssa.OpAtomicOr8, types.TypeMem, args[0], args[1], s.mem())
return nil
},
cmd/compiler,internal/runtime/atomic: optimize And{64,32,8} and Or{64,32,8} on loong64 Use loong64's atomic operation instruction AMANDDB{V,W,W} (full barrier) to implement And{64,32,8}, AMORDB{V,W,W} (full barrier) to implement Or{64,32,8}. Intrinsify And{64,32,8} and Or{64,32,8}, And this CL alias all of the And/Or operations into sync/atomic package. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | And32 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20) And32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) And64 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20) And64Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) Or32 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20) Or32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) Or64 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20) Or64Parallel 28.97n ± 0% 12.41n ± 0% -57.16% (p=0.000 n=20) And8 29.15n ± 0% 13.21n ± 0% -54.68% (p=0.000 n=20) And 27.71n ± 0% 12.82n ± 0% -53.74% (p=0.000 n=20) And8Parallel 28.99n ± 0% 14.46n ± 0% -50.12% (p=0.000 n=20) AndParallel 29.12n ± 0% 14.42n ± 0% -50.48% (p=0.000 n=20) Or8 28.31n ± 0% 12.81n ± 0% -54.75% (p=0.000 n=20) Or 27.72n ± 0% 12.81n ± 0% -53.79% (p=0.000 n=20) Or8Parallel 29.03n ± 0% 14.62n ± 0% -49.64% (p=0.000 n=20) OrParallel 29.12n ± 0% 14.42n ± 0% -50.49% (p=0.000 n=20) geomean 28.47n 12.58n -55.80% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | And32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) And32Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) And64 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) And64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) And8 30.42n ± 0% 14.41n ± 0% -52.63% (p=0.000 n=20) And 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20) And8Parallel 31.23n ± 0% 15.21n ± 0% -51.30% (p=0.000 n=20) AndParallel 30.83n ± 0% 14.41n ± 0% 
-53.26% (p=0.000 n=20) Or32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) Or32Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) Or64 30.02n ± 0% 14.82n ± 0% -50.63% (p=0.000 n=20) Or64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) Or8 30.02n ± 0% 14.01n ± 0% -53.33% (p=0.000 n=20) Or 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20) Or8Parallel 30.83n ± 0% 14.81n ± 0% -51.96% (p=0.000 n=20) OrParallel 30.83n ± 0% 14.41n ± 0% -53.26% (p=0.000 n=20) geomean 30.47n 14.75n -51.61% Change-Id: If008ff6a08b51905076f8ddb6e92f8e214d3f7b3 Reviewed-on: https://go-review.googlesource.com/c/go/+/482756 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Meidan Li <limeidan@loongson.cn> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn> Reviewed-by: Cherry Mui <cherryyz@google.com>
2024-09-23 11:38:36 +08:00
sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
addF("internal/runtime/atomic", "Or",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
s.vars[memVar] = s.newValue3(ssa.OpAtomicOr32, types.TypeMem, args[0], args[1], s.mem())
return nil
},
cmd/compiler,internal/runtime/atomic: optimize And{64,32,8} and Or{64,32,8} on loong64 Use loong64's atomic operation instruction AMANDDB{V,W,W} (full barrier) to implement And{64,32,8}, AMORDB{V,W,W} (full barrier) to implement Or{64,32,8}. Intrinsify And{64,32,8} and Or{64,32,8}, And this CL alias all of the And/Or operations into sync/atomic package. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | And32 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20) And32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) And64 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20) And64Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) Or32 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20) Or32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) Or64 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20) Or64Parallel 28.97n ± 0% 12.41n ± 0% -57.16% (p=0.000 n=20) And8 29.15n ± 0% 13.21n ± 0% -54.68% (p=0.000 n=20) And 27.71n ± 0% 12.82n ± 0% -53.74% (p=0.000 n=20) And8Parallel 28.99n ± 0% 14.46n ± 0% -50.12% (p=0.000 n=20) AndParallel 29.12n ± 0% 14.42n ± 0% -50.48% (p=0.000 n=20) Or8 28.31n ± 0% 12.81n ± 0% -54.75% (p=0.000 n=20) Or 27.72n ± 0% 12.81n ± 0% -53.79% (p=0.000 n=20) Or8Parallel 29.03n ± 0% 14.62n ± 0% -49.64% (p=0.000 n=20) OrParallel 29.12n ± 0% 14.42n ± 0% -50.49% (p=0.000 n=20) geomean 28.47n 12.58n -55.80% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | And32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) And32Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) And64 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) And64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) And8 30.42n ± 0% 14.41n ± 0% -52.63% (p=0.000 n=20) And 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20) And8Parallel 31.23n ± 0% 15.21n ± 0% -51.30% (p=0.000 n=20) AndParallel 30.83n ± 0% 14.41n ± 0% 
-53.26% (p=0.000 n=20) Or32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) Or32Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) Or64 30.02n ± 0% 14.82n ± 0% -50.63% (p=0.000 n=20) Or64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) Or8 30.02n ± 0% 14.01n ± 0% -53.33% (p=0.000 n=20) Or 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20) Or8Parallel 30.83n ± 0% 14.81n ± 0% -51.96% (p=0.000 n=20) OrParallel 30.83n ± 0% 14.41n ± 0% -53.26% (p=0.000 n=20) geomean 30.47n 14.75n -51.61% Change-Id: If008ff6a08b51905076f8ddb6e92f8e214d3f7b3 Reviewed-on: https://go-review.googlesource.com/c/go/+/482756 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Meidan Li <limeidan@loongson.cn> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn> Reviewed-by: Cherry Mui <cherryyz@google.com>
2024-09-23 11:38:36 +08:00
sys.AMD64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
// arm64 always uses the new-style atomic logical operations, for both the
// old and new style API.
// The old-style names (And8, Or8, And, Or) are built with
// makeAtomicGuardedIntrinsicARM64old; the new-style names (And64, And32,
// Or64, Or32) are built with makeAtomicGuardedIntrinsicARM64. Both use the
// ...value ops and their ...valueVariant counterparts.
addF("internal/runtime/atomic", "And8",
makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicAnd8value, ssa.OpAtomicAnd8valueVariant, types.TUINT8, atomicEmitterARM64),
sys.ARM64)
addF("internal/runtime/atomic", "Or8",
makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicOr8value, ssa.OpAtomicOr8valueVariant, types.TUINT8, atomicEmitterARM64),
sys.ARM64)
addF("internal/runtime/atomic", "And64",
makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAnd64value, ssa.OpAtomicAnd64valueVariant, types.TUINT64, atomicEmitterARM64),
sys.ARM64)
addF("internal/runtime/atomic", "And32",
makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicAnd32value, ssa.OpAtomicAnd32valueVariant, types.TUINT32, atomicEmitterARM64),
sys.ARM64)
// And shares the 32-bit ops with And32.
addF("internal/runtime/atomic", "And",
makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicAnd32value, ssa.OpAtomicAnd32valueVariant, types.TUINT32, atomicEmitterARM64),
sys.ARM64)
addF("internal/runtime/atomic", "Or64",
makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicOr64value, ssa.OpAtomicOr64valueVariant, types.TUINT64, atomicEmitterARM64),
sys.ARM64)
addF("internal/runtime/atomic", "Or32",
makeAtomicGuardedIntrinsicARM64(ssa.OpAtomicOr32value, ssa.OpAtomicOr32valueVariant, types.TUINT32, atomicEmitterARM64),
sys.ARM64)
// Or shares the 32-bit ops with Or32.
addF("internal/runtime/atomic", "Or",
makeAtomicGuardedIntrinsicARM64old(ssa.OpAtomicOr32value, ssa.OpAtomicOr32valueVariant, types.TUINT32, atomicEmitterARM64),
sys.ARM64)
// New-style atomic logical operations, which return the old memory value.
addF("internal/runtime/atomic", "And64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
v := s.newValue3(ssa.OpAtomicAnd64value, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem())
p0, p1 := s.split(v)
s.vars[memVar] = p1
return p0
},
cmd/compiler,internal/runtime/atomic: optimize And{64,32,8} and Or{64,32,8} on loong64 Use loong64's atomic operation instruction AMANDDB{V,W,W} (full barrier) to implement And{64,32,8}, AMORDB{V,W,W} (full barrier) to implement Or{64,32,8}. Intrinsify And{64,32,8} and Or{64,32,8}, And this CL alias all of the And/Or operations into sync/atomic package. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | And32 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20) And32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) And64 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20) And64Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) Or32 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20) Or32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) Or64 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20) Or64Parallel 28.97n ± 0% 12.41n ± 0% -57.16% (p=0.000 n=20) And8 29.15n ± 0% 13.21n ± 0% -54.68% (p=0.000 n=20) And 27.71n ± 0% 12.82n ± 0% -53.74% (p=0.000 n=20) And8Parallel 28.99n ± 0% 14.46n ± 0% -50.12% (p=0.000 n=20) AndParallel 29.12n ± 0% 14.42n ± 0% -50.48% (p=0.000 n=20) Or8 28.31n ± 0% 12.81n ± 0% -54.75% (p=0.000 n=20) Or 27.72n ± 0% 12.81n ± 0% -53.79% (p=0.000 n=20) Or8Parallel 29.03n ± 0% 14.62n ± 0% -49.64% (p=0.000 n=20) OrParallel 29.12n ± 0% 14.42n ± 0% -50.49% (p=0.000 n=20) geomean 28.47n 12.58n -55.80% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | And32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) And32Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) And64 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) And64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) And8 30.42n ± 0% 14.41n ± 0% -52.63% (p=0.000 n=20) And 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20) And8Parallel 31.23n ± 0% 15.21n ± 0% -51.30% (p=0.000 n=20) AndParallel 30.83n ± 0% 14.41n ± 0% 
-53.26% (p=0.000 n=20) Or32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) Or32Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) Or64 30.02n ± 0% 14.82n ± 0% -50.63% (p=0.000 n=20) Or64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) Or8 30.02n ± 0% 14.01n ± 0% -53.33% (p=0.000 n=20) Or 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20) Or8Parallel 30.83n ± 0% 14.81n ± 0% -51.96% (p=0.000 n=20) OrParallel 30.83n ± 0% 14.41n ± 0% -53.26% (p=0.000 n=20) geomean 30.47n 14.75n -51.61% Change-Id: If008ff6a08b51905076f8ddb6e92f8e214d3f7b3 Reviewed-on: https://go-review.googlesource.com/c/go/+/482756 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Meidan Li <limeidan@loongson.cn> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn> Reviewed-by: Cherry Mui <cherryyz@google.com>
2024-09-23 11:38:36 +08:00
sys.AMD64, sys.Loong64)
addF("internal/runtime/atomic", "And32",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
v := s.newValue3(ssa.OpAtomicAnd32value, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem())
p0, p1 := s.split(v)
s.vars[memVar] = p1
return p0
},
cmd/compiler,internal/runtime/atomic: optimize And{64,32,8} and Or{64,32,8} on loong64 Use loong64's atomic operation instruction AMANDDB{V,W,W} (full barrier) to implement And{64,32,8}, AMORDB{V,W,W} (full barrier) to implement Or{64,32,8}. Intrinsify And{64,32,8} and Or{64,32,8}, And this CL alias all of the And/Or operations into sync/atomic package. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | And32 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20) And32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) And64 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20) And64Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) Or32 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20) Or32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) Or64 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20) Or64Parallel 28.97n ± 0% 12.41n ± 0% -57.16% (p=0.000 n=20) And8 29.15n ± 0% 13.21n ± 0% -54.68% (p=0.000 n=20) And 27.71n ± 0% 12.82n ± 0% -53.74% (p=0.000 n=20) And8Parallel 28.99n ± 0% 14.46n ± 0% -50.12% (p=0.000 n=20) AndParallel 29.12n ± 0% 14.42n ± 0% -50.48% (p=0.000 n=20) Or8 28.31n ± 0% 12.81n ± 0% -54.75% (p=0.000 n=20) Or 27.72n ± 0% 12.81n ± 0% -53.79% (p=0.000 n=20) Or8Parallel 29.03n ± 0% 14.62n ± 0% -49.64% (p=0.000 n=20) OrParallel 29.12n ± 0% 14.42n ± 0% -50.49% (p=0.000 n=20) geomean 28.47n 12.58n -55.80% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | And32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) And32Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) And64 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) And64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) And8 30.42n ± 0% 14.41n ± 0% -52.63% (p=0.000 n=20) And 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20) And8Parallel 31.23n ± 0% 15.21n ± 0% -51.30% (p=0.000 n=20) AndParallel 30.83n ± 0% 14.41n ± 0% 
-53.26% (p=0.000 n=20) Or32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) Or32Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) Or64 30.02n ± 0% 14.82n ± 0% -50.63% (p=0.000 n=20) Or64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) Or8 30.02n ± 0% 14.01n ± 0% -53.33% (p=0.000 n=20) Or 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20) Or8Parallel 30.83n ± 0% 14.81n ± 0% -51.96% (p=0.000 n=20) OrParallel 30.83n ± 0% 14.41n ± 0% -53.26% (p=0.000 n=20) geomean 30.47n 14.75n -51.61% Change-Id: If008ff6a08b51905076f8ddb6e92f8e214d3f7b3 Reviewed-on: https://go-review.googlesource.com/c/go/+/482756 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Meidan Li <limeidan@loongson.cn> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn> Reviewed-by: Cherry Mui <cherryyz@google.com>
2024-09-23 11:38:36 +08:00
sys.AMD64, sys.Loong64)
addF("internal/runtime/atomic", "Or64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
v := s.newValue3(ssa.OpAtomicOr64value, types.NewTuple(types.Types[types.TUINT64], types.TypeMem), args[0], args[1], s.mem())
p0, p1 := s.split(v)
s.vars[memVar] = p1
return p0
},
cmd/compiler,internal/runtime/atomic: optimize And{64,32,8} and Or{64,32,8} on loong64 Use loong64's atomic operation instruction AMANDDB{V,W,W} (full barrier) to implement And{64,32,8}, AMORDB{V,W,W} (full barrier) to implement Or{64,32,8}. Intrinsify And{64,32,8} and Or{64,32,8}, And this CL alias all of the And/Or operations into sync/atomic package. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | And32 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20) And32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) And64 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20) And64Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) Or32 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20) Or32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) Or64 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20) Or64Parallel 28.97n ± 0% 12.41n ± 0% -57.16% (p=0.000 n=20) And8 29.15n ± 0% 13.21n ± 0% -54.68% (p=0.000 n=20) And 27.71n ± 0% 12.82n ± 0% -53.74% (p=0.000 n=20) And8Parallel 28.99n ± 0% 14.46n ± 0% -50.12% (p=0.000 n=20) AndParallel 29.12n ± 0% 14.42n ± 0% -50.48% (p=0.000 n=20) Or8 28.31n ± 0% 12.81n ± 0% -54.75% (p=0.000 n=20) Or 27.72n ± 0% 12.81n ± 0% -53.79% (p=0.000 n=20) Or8Parallel 29.03n ± 0% 14.62n ± 0% -49.64% (p=0.000 n=20) OrParallel 29.12n ± 0% 14.42n ± 0% -50.49% (p=0.000 n=20) geomean 28.47n 12.58n -55.80% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | And32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) And32Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) And64 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) And64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) And8 30.42n ± 0% 14.41n ± 0% -52.63% (p=0.000 n=20) And 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20) And8Parallel 31.23n ± 0% 15.21n ± 0% -51.30% (p=0.000 n=20) AndParallel 30.83n ± 0% 14.41n ± 0% 
-53.26% (p=0.000 n=20) Or32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) Or32Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) Or64 30.02n ± 0% 14.82n ± 0% -50.63% (p=0.000 n=20) Or64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) Or8 30.02n ± 0% 14.01n ± 0% -53.33% (p=0.000 n=20) Or 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20) Or8Parallel 30.83n ± 0% 14.81n ± 0% -51.96% (p=0.000 n=20) OrParallel 30.83n ± 0% 14.41n ± 0% -53.26% (p=0.000 n=20) geomean 30.47n 14.75n -51.61% Change-Id: If008ff6a08b51905076f8ddb6e92f8e214d3f7b3 Reviewed-on: https://go-review.googlesource.com/c/go/+/482756 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Meidan Li <limeidan@loongson.cn> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn> Reviewed-by: Cherry Mui <cherryyz@google.com>
2024-09-23 11:38:36 +08:00
sys.AMD64, sys.Loong64)
addF("internal/runtime/atomic", "Or32",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
v := s.newValue3(ssa.OpAtomicOr32value, types.NewTuple(types.Types[types.TUINT32], types.TypeMem), args[0], args[1], s.mem())
p0, p1 := s.split(v)
s.vars[memVar] = p1
return p0
},
cmd/compiler,internal/runtime/atomic: optimize And{64,32,8} and Or{64,32,8} on loong64 Use loong64's atomic operation instruction AMANDDB{V,W,W} (full barrier) to implement And{64,32,8}, AMORDB{V,W,W} (full barrier) to implement Or{64,32,8}. Intrinsify And{64,32,8} and Or{64,32,8}, And this CL alias all of the And/Or operations into sync/atomic package. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | And32 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20) And32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) And64 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20) And64Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) Or32 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20) Or32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) Or64 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20) Or64Parallel 28.97n ± 0% 12.41n ± 0% -57.16% (p=0.000 n=20) And8 29.15n ± 0% 13.21n ± 0% -54.68% (p=0.000 n=20) And 27.71n ± 0% 12.82n ± 0% -53.74% (p=0.000 n=20) And8Parallel 28.99n ± 0% 14.46n ± 0% -50.12% (p=0.000 n=20) AndParallel 29.12n ± 0% 14.42n ± 0% -50.48% (p=0.000 n=20) Or8 28.31n ± 0% 12.81n ± 0% -54.75% (p=0.000 n=20) Or 27.72n ± 0% 12.81n ± 0% -53.79% (p=0.000 n=20) Or8Parallel 29.03n ± 0% 14.62n ± 0% -49.64% (p=0.000 n=20) OrParallel 29.12n ± 0% 14.42n ± 0% -50.49% (p=0.000 n=20) geomean 28.47n 12.58n -55.80% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | And32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) And32Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) And64 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) And64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) And8 30.42n ± 0% 14.41n ± 0% -52.63% (p=0.000 n=20) And 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20) And8Parallel 31.23n ± 0% 15.21n ± 0% -51.30% (p=0.000 n=20) AndParallel 30.83n ± 0% 14.41n ± 0% 
-53.26% (p=0.000 n=20) Or32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) Or32Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) Or64 30.02n ± 0% 14.82n ± 0% -50.63% (p=0.000 n=20) Or64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) Or8 30.02n ± 0% 14.01n ± 0% -53.33% (p=0.000 n=20) Or 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20) Or8Parallel 30.83n ± 0% 14.81n ± 0% -51.96% (p=0.000 n=20) OrParallel 30.83n ± 0% 14.41n ± 0% -53.26% (p=0.000 n=20) geomean 30.47n 14.75n -51.61% Change-Id: If008ff6a08b51905076f8ddb6e92f8e214d3f7b3 Reviewed-on: https://go-review.googlesource.com/c/go/+/482756 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Meidan Li <limeidan@loongson.cn> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn> Reviewed-by: Cherry Mui <cherryyz@google.com>
2024-09-23 11:38:36 +08:00
sys.AMD64, sys.Loong64)
// Aliases for atomic load operations
// Loadint32/Loadint64 resolve to Load/Load64 on every architecture in all.
alias("internal/runtime/atomic", "Loadint32", "internal/runtime/atomic", "Load", all...)
alias("internal/runtime/atomic", "Loadint64", "internal/runtime/atomic", "Load64", all...)
// Loaduintptr and Loaduint resolve to Load on the p4 set and to Load64 on the
// p8 set. NOTE(review): p4/p8 are defined outside this chunk; presumably the
// 4-byte and 8-byte pointer-size architectures — confirm.
alias("internal/runtime/atomic", "Loaduintptr", "internal/runtime/atomic", "Load", p4...)
alias("internal/runtime/atomic", "Loaduintptr", "internal/runtime/atomic", "Load64", p8...)
alias("internal/runtime/atomic", "Loaduint", "internal/runtime/atomic", "Load", p4...)
alias("internal/runtime/atomic", "Loaduint", "internal/runtime/atomic", "Load64", p8...)
// LoadAcq/LoadAcq64 resolve to the plain Load/Load64 only on the lwatomics set.
alias("internal/runtime/atomic", "LoadAcq", "internal/runtime/atomic", "Load", lwatomics...)
alias("internal/runtime/atomic", "LoadAcq64", "internal/runtime/atomic", "Load64", lwatomics...)
// LoadAcquintptr is registered twice per pointer size: once under its own
// package and once under the linknamed sync.runtime_LoadAcquintptr.
alias("internal/runtime/atomic", "LoadAcquintptr", "internal/runtime/atomic", "LoadAcq", p4...)
alias("sync", "runtime_LoadAcquintptr", "internal/runtime/atomic", "LoadAcq", p4...) // linknamed
alias("internal/runtime/atomic", "LoadAcquintptr", "internal/runtime/atomic", "LoadAcq64", p8...)
alias("sync", "runtime_LoadAcquintptr", "internal/runtime/atomic", "LoadAcq64", p8...) // linknamed
// Aliases for atomic store operations
// Storeint32/Storeint64 resolve to Store/Store64 on every architecture in all.
alias("internal/runtime/atomic", "Storeint32", "internal/runtime/atomic", "Store", all...)
alias("internal/runtime/atomic", "Storeint64", "internal/runtime/atomic", "Store64", all...)
// Storeuintptr resolves to Store on the p4 set and to Store64 on the p8 set.
alias("internal/runtime/atomic", "Storeuintptr", "internal/runtime/atomic", "Store", p4...)
alias("internal/runtime/atomic", "Storeuintptr", "internal/runtime/atomic", "Store64", p8...)
// StoreRel/StoreRel64 resolve to the plain Store/Store64 only on the lwatomics set.
alias("internal/runtime/atomic", "StoreRel", "internal/runtime/atomic", "Store", lwatomics...)
alias("internal/runtime/atomic", "StoreRel64", "internal/runtime/atomic", "Store64", lwatomics...)
// StoreReluintptr is registered twice per pointer size: once under its own
// package and once under the linknamed sync.runtime_StoreReluintptr.
alias("internal/runtime/atomic", "StoreReluintptr", "internal/runtime/atomic", "StoreRel", p4...)
alias("sync", "runtime_StoreReluintptr", "internal/runtime/atomic", "StoreRel", p4...) // linknamed
alias("internal/runtime/atomic", "StoreReluintptr", "internal/runtime/atomic", "StoreRel64", p8...)
alias("sync", "runtime_StoreReluintptr", "internal/runtime/atomic", "StoreRel64", p8...) // linknamed
// Aliases for atomic swap operations
// The int32/int64 names resolve to Xchg/Xchg64 on all; the uintptr name picks
// Xchg on the p4 set and Xchg64 on the p8 set.
alias("internal/runtime/atomic", "Xchgint32", "internal/runtime/atomic", "Xchg", all...)
alias("internal/runtime/atomic", "Xchgint64", "internal/runtime/atomic", "Xchg64", all...)
alias("internal/runtime/atomic", "Xchguintptr", "internal/runtime/atomic", "Xchg", p4...)
alias("internal/runtime/atomic", "Xchguintptr", "internal/runtime/atomic", "Xchg64", p8...)
// Aliases for atomic add operations
// Same layout as the swap aliases, targeting Xadd/Xadd64.
alias("internal/runtime/atomic", "Xaddint32", "internal/runtime/atomic", "Xadd", all...)
alias("internal/runtime/atomic", "Xaddint64", "internal/runtime/atomic", "Xadd64", all...)
alias("internal/runtime/atomic", "Xadduintptr", "internal/runtime/atomic", "Xadd", p4...)
alias("internal/runtime/atomic", "Xadduintptr", "internal/runtime/atomic", "Xadd64", p8...)
// Aliases for atomic CAS operations
// Casuintptr and Casp1 pick Cas on the p4 set and Cas64 on the p8 set;
// CasRel resolves to Cas only on the lwatomics set.
alias("internal/runtime/atomic", "Casint32", "internal/runtime/atomic", "Cas", all...)
alias("internal/runtime/atomic", "Casint64", "internal/runtime/atomic", "Cas64", all...)
alias("internal/runtime/atomic", "Casuintptr", "internal/runtime/atomic", "Cas", p4...)
alias("internal/runtime/atomic", "Casuintptr", "internal/runtime/atomic", "Cas64", p8...)
alias("internal/runtime/atomic", "Casp1", "internal/runtime/atomic", "Cas", p4...)
alias("internal/runtime/atomic", "Casp1", "internal/runtime/atomic", "Cas64", p8...)
alias("internal/runtime/atomic", "CasRel", "internal/runtime/atomic", "Cas", lwatomics...)
// Aliases for atomic And/Or operations
cmd/compiler,internal/runtime/atomic: optimize And{64,32,8} and Or{64,32,8} on loong64 Use loong64's atomic operation instruction AMANDDB{V,W,W} (full barrier) to implement And{64,32,8}, AMORDB{V,W,W} (full barrier) to implement Or{64,32,8}. Intrinsify And{64,32,8} and Or{64,32,8}, And this CL alias all of the And/Or operations into sync/atomic package. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | And32 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20) And32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) And64 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20) And64Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) Or32 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20) Or32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) Or64 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20) Or64Parallel 28.97n ± 0% 12.41n ± 0% -57.16% (p=0.000 n=20) And8 29.15n ± 0% 13.21n ± 0% -54.68% (p=0.000 n=20) And 27.71n ± 0% 12.82n ± 0% -53.74% (p=0.000 n=20) And8Parallel 28.99n ± 0% 14.46n ± 0% -50.12% (p=0.000 n=20) AndParallel 29.12n ± 0% 14.42n ± 0% -50.48% (p=0.000 n=20) Or8 28.31n ± 0% 12.81n ± 0% -54.75% (p=0.000 n=20) Or 27.72n ± 0% 12.81n ± 0% -53.79% (p=0.000 n=20) Or8Parallel 29.03n ± 0% 14.62n ± 0% -49.64% (p=0.000 n=20) OrParallel 29.12n ± 0% 14.42n ± 0% -50.49% (p=0.000 n=20) geomean 28.47n 12.58n -55.80% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | And32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) And32Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) And64 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) And64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) And8 30.42n ± 0% 14.41n ± 0% -52.63% (p=0.000 n=20) And 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20) And8Parallel 31.23n ± 0% 15.21n ± 0% -51.30% (p=0.000 n=20) AndParallel 30.83n ± 0% 14.41n ± 0% 
-53.26% (p=0.000 n=20) Or32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) Or32Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) Or64 30.02n ± 0% 14.82n ± 0% -50.63% (p=0.000 n=20) Or64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) Or8 30.02n ± 0% 14.01n ± 0% -53.33% (p=0.000 n=20) Or 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20) Or8Parallel 30.83n ± 0% 14.81n ± 0% -51.96% (p=0.000 n=20) OrParallel 30.83n ± 0% 14.41n ± 0% -53.26% (p=0.000 n=20) geomean 30.47n 14.75n -51.61% Change-Id: If008ff6a08b51905076f8ddb6e92f8e214d3f7b3 Reviewed-on: https://go-review.googlesource.com/c/go/+/482756 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Meidan Li <limeidan@loongson.cn> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn> Reviewed-by: Cherry Mui <cherryyz@google.com>
2024-09-23 11:38:36 +08:00
alias("internal/runtime/atomic", "Anduintptr", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchLoong64)
alias("internal/runtime/atomic", "Oruintptr", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchLoong64)
/******** math ********/
// The unexported math.sqrt becomes a single OpSqrt value producing a float64.
addF("math", "sqrt",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		return s.newValue1(ssa.OpSqrt, types.Types[types.TFLOAT64], args[0])
	},
	sys.I386, sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm)
// Rounding functions. On the families listed here each one is emitted as the
// SSA op of the same name, with no runtime feature check. AMD64 is registered
// separately further down through makeRoundAMD64.
addF("math", "Trunc",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		return s.newValue1(ssa.OpTrunc, types.Types[types.TFLOAT64], args[0])
	},
	sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm)
addF("math", "Ceil",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		return s.newValue1(ssa.OpCeil, types.Types[types.TFLOAT64], args[0])
	},
	sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm)
addF("math", "Floor",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		return s.newValue1(ssa.OpFloor, types.Types[types.TFLOAT64], args[0])
	},
	sys.ARM64, sys.PPC64, sys.S390X, sys.Wasm)
// Round is the only one of this group not registered for Wasm.
addF("math", "Round",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		return s.newValue1(ssa.OpRound, types.Types[types.TFLOAT64], args[0])
	},
	sys.ARM64, sys.PPC64, sys.S390X)
// RoundToEven is the only one of this group not registered for PPC64.
addF("math", "RoundToEven",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		return s.newValue1(ssa.OpRoundToEven, types.Types[types.TFLOAT64], args[0])
	},
	sys.ARM64, sys.S390X, sys.Wasm)
// math.Abs: one-operand OpAbs producing a float64.
addF("math", "Abs",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		return s.newValue1(ssa.OpAbs, types.Types[types.TFLOAT64], args[0])
	},
	sys.ARM64, sys.ARM, sys.Loong64, sys.PPC64, sys.RISCV64, sys.Wasm, sys.MIPS, sys.MIPS64)
// math.Copysign: two-operand OpCopysign; arguments are passed in call order.
addF("math", "Copysign",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		return s.newValue2(ssa.OpCopysign, types.Types[types.TFLOAT64], args[0], args[1])
	},
	sys.Loong64, sys.PPC64, sys.RISCV64, sys.Wasm)
// math.FMA on families where OpFMA is emitted unconditionally. AMD64 and ARM
// have their own registrations that add a config check and a runtime check.
addF("math", "FMA",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		return s.newValue3(ssa.OpFMA, types.Types[types.TFLOAT64], args[0], args[1], args[2])
	},
	sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X)
// math.FMA on AMD64. Three outcomes, in order:
//   - s.config.UseFMA is false: emit the ordinary call.
//   - cfg.goamd64 >= 3: emit OpFMA with no runtime check.
//   - otherwise: branch at run time on the X86HasFMA CPU feature and merge the
//     intrinsic result with the result of the ordinary call.
addF("math", "FMA",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		float64Type := types.Types[types.TFLOAT64]
		if !s.config.UseFMA {
			s.vars[n] = s.callResult(n, callNormal)
			return s.variable(n, float64Type)
		}
		if cfg.goamd64 >= 3 {
			return s.newValue3(ssa.OpFMA, float64Type, args[0], args[1], args[2])
		}

		// Close the current block with a two-way branch on the feature flag.
		hasFMA := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasFMA)
		check := s.endBlock()
		check.Kind = ssa.BlockIf
		check.SetControl(hasFMA)
		fused := s.f.NewBlock(ssa.BlockPlain)
		fallback := s.f.NewBlock(ssa.BlockPlain)
		join := s.f.NewBlock(ssa.BlockPlain)
		check.AddEdgeTo(fused)
		check.AddEdgeTo(fallback)
		check.Likely = ssa.BranchLikely // >= haswell cpus are common

		// Feature present: use the intrinsic directly.
		s.startBlock(fused)
		s.vars[n] = s.newValue3(ssa.OpFMA, float64Type, args[0], args[1], args[2])
		s.endBlock().AddEdgeTo(join)

		// Feature absent: call the pure Go version.
		s.startBlock(fallback)
		s.vars[n] = s.callResult(n, callNormal)
		s.endBlock().AddEdgeTo(join)

		// Both paths stored into s.vars[n]; read the merged value back.
		s.startBlock(join)
		return s.variable(n, float64Type)
	},
	sys.AMD64)
// math.FMA on ARM. When s.config.UseFMA is false the ordinary call is emitted.
// Otherwise the boolean at ir.Syms.ARMHasVFPv4 is loaded and branched on at
// run time: the true edge uses OpFMA, the false edge calls the Go version.
addF("math", "FMA",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		float64Type := types.Types[types.TFLOAT64]
		if !s.config.UseFMA {
			s.vars[n] = s.callResult(n, callNormal)
			return s.variable(n, float64Type)
		}

		// Load the feature flag and close the current block with a branch on it.
		flagAddr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.ARMHasVFPv4, s.sb)
		hasVFPv4 := s.load(types.Types[types.TBOOL], flagAddr)
		check := s.endBlock()
		check.Kind = ssa.BlockIf
		check.SetControl(hasVFPv4)
		fused := s.f.NewBlock(ssa.BlockPlain)
		fallback := s.f.NewBlock(ssa.BlockPlain)
		join := s.f.NewBlock(ssa.BlockPlain)
		check.AddEdgeTo(fused)
		check.AddEdgeTo(fallback)
		check.Likely = ssa.BranchLikely

		// Flag set: use the intrinsic directly.
		s.startBlock(fused)
		s.vars[n] = s.newValue3(ssa.OpFMA, float64Type, args[0], args[1], args[2])
		s.endBlock().AddEdgeTo(join)

		// Flag clear: call the pure Go version.
		s.startBlock(fallback)
		s.vars[n] = s.callResult(n, callNormal)
		s.endBlock().AddEdgeTo(join)

		// Both paths stored into s.vars[n]; read the merged value back.
		s.startBlock(join)
		return s.variable(n, float64Type)
	},
	sys.ARM)
// makeRoundAMD64 returns an intrinsic builder for a one-operand float64
// rounding op on AMD64. With cfg.goamd64 >= 2 the builder emits op directly;
// otherwise it branches at run time on the X86HasSSE41 CPU feature and falls
// back to the ordinary call when the feature is absent.
makeRoundAMD64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
	return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		float64Type := types.Types[types.TFLOAT64]
		if cfg.goamd64 >= 2 {
			return s.newValue1(op, float64Type, args[0])
		}

		// Close the current block with a two-way branch on the feature flag.
		hasSSE41 := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasSSE41)
		check := s.endBlock()
		check.Kind = ssa.BlockIf
		check.SetControl(hasSSE41)
		direct := s.f.NewBlock(ssa.BlockPlain)
		fallback := s.f.NewBlock(ssa.BlockPlain)
		join := s.f.NewBlock(ssa.BlockPlain)
		check.AddEdgeTo(direct)
		check.AddEdgeTo(fallback)
		check.Likely = ssa.BranchLikely // most machines have sse4.1 nowadays

		// Feature present: use the intrinsic directly.
		s.startBlock(direct)
		s.vars[n] = s.newValue1(op, float64Type, args[0])
		s.endBlock().AddEdgeTo(join)

		// Feature absent: call the pure Go version.
		s.startBlock(fallback)
		s.vars[n] = s.callResult(n, callNormal)
		s.endBlock().AddEdgeTo(join)

		// Both paths stored into s.vars[n]; read the merged value back.
		s.startBlock(join)
		return s.variable(n, float64Type)
	}
}
// AMD64 rounding intrinsics: each one is built by makeRoundAMD64 with the
// matching SSA op. math.Round is not registered for AMD64 here.
addF("math", "RoundToEven",
	makeRoundAMD64(ssa.OpRoundToEven),
	sys.AMD64)
addF("math", "Floor",
	makeRoundAMD64(ssa.OpFloor),
	sys.AMD64)
addF("math", "Ceil",
	makeRoundAMD64(ssa.OpCeil),
	sys.AMD64)
addF("math", "Trunc",
	makeRoundAMD64(ssa.OpTrunc),
	sys.AMD64)
/******** math/bits ********/
addF("math/bits", "TrailingZeros64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], args[0])
},
cmd/compile: wire up math/bits.TrailingZeros intrinsics for loong64 Micro-benchmark results on Loongson 3A5000 and 3A6000: goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | TrailingZeros 1.7240n ± 0% 0.8120n ± 0% -52.90% (p=0.000 n=20) TrailingZeros8 1.0530n ± 0% 0.8015n ± 0% -23.88% (p=0.000 n=20) TrailingZeros16 2.072n ± 0% 1.015n ± 0% -51.01% (p=0.000 n=20) TrailingZeros32 1.7160n ± 0% 0.8122n ± 0% -52.67% (p=0.000 n=20) TrailingZeros64 2.0060n ± 0% 0.8125n ± 0% -59.50% (p=0.000 n=20) geomean 1.669n 0.8470n -49.25% goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | TrailingZeros 2.6275n ± 0% 0.9120n ± 0% -65.29% (p=0.000 n=20) TrailingZeros8 1.451n ± 0% 1.163n ± 0% -19.85% (p=0.000 n=20) TrailingZeros16 3.069n ± 0% 1.201n ± 0% -60.87% (p=0.000 n=20) TrailingZeros32 2.9060n ± 0% 0.9115n ± 0% -68.63% (p=0.000 n=20) TrailingZeros64 2.6305n ± 0% 0.9115n ± 0% -65.35% (p=0.000 n=20) geomean 2.456n 1.011n -58.83% This patch is a copy of CL 479498. Co-authored-by: WANG Xuerui <git@xen0n.name> Change-Id: I1a5b2114a844dc0d02c8e68f41ce2443ac3b5fda Reviewed-on: https://go-review.googlesource.com/c/go/+/624356 Reviewed-by: abner chenc <chenguoqi@loongson.cn> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: Cherry Mui <cherryyz@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Keith Randall <khr@google.com>
2024-11-01 16:09:32 +08:00
sys.AMD64, sys.ARM64, sys.ARM, sys.Loong64, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
// TrailingZeros64 on I386: the operand is split into its low and high 32-bit
// halves (OpInt64Lo / OpInt64Hi) and both are handed to OpCtz64On32.
addF("math/bits", "TrailingZeros64",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		uint32Type := types.Types[types.TUINT32]
		lowHalf := s.newValue1(ssa.OpInt64Lo, uint32Type, args[0])
		highHalf := s.newValue1(ssa.OpInt64Hi, uint32Type, args[0])
		return s.newValue2(ssa.OpCtz64On32, types.Types[types.TINT], lowHalf, highHalf)
	},
	sys.I386)
addF("math/bits", "TrailingZeros32",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], args[0])
},
cmd/compile: wire up math/bits.TrailingZeros intrinsics for loong64 Micro-benchmark results on Loongson 3A5000 and 3A6000: goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | TrailingZeros 1.7240n ± 0% 0.8120n ± 0% -52.90% (p=0.000 n=20) TrailingZeros8 1.0530n ± 0% 0.8015n ± 0% -23.88% (p=0.000 n=20) TrailingZeros16 2.072n ± 0% 1.015n ± 0% -51.01% (p=0.000 n=20) TrailingZeros32 1.7160n ± 0% 0.8122n ± 0% -52.67% (p=0.000 n=20) TrailingZeros64 2.0060n ± 0% 0.8125n ± 0% -59.50% (p=0.000 n=20) geomean 1.669n 0.8470n -49.25% goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | TrailingZeros 2.6275n ± 0% 0.9120n ± 0% -65.29% (p=0.000 n=20) TrailingZeros8 1.451n ± 0% 1.163n ± 0% -19.85% (p=0.000 n=20) TrailingZeros16 3.069n ± 0% 1.201n ± 0% -60.87% (p=0.000 n=20) TrailingZeros32 2.9060n ± 0% 0.9115n ± 0% -68.63% (p=0.000 n=20) TrailingZeros64 2.6305n ± 0% 0.9115n ± 0% -65.35% (p=0.000 n=20) geomean 2.456n 1.011n -58.83% This patch is a copy of CL 479498. Co-authored-by: WANG Xuerui <git@xen0n.name> Change-Id: I1a5b2114a844dc0d02c8e68f41ce2443ac3b5fda Reviewed-on: https://go-review.googlesource.com/c/go/+/624356 Reviewed-by: abner chenc <chenguoqi@loongson.cn> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: Cherry Mui <cherryyz@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Keith Randall <khr@google.com>
2024-11-01 16:09:32 +08:00
sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.Loong64, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
// TrailingZeros16 on MIPS: zero-extend to 32 bits, OR in bit 16, then count
// with OpCtz32. With bit 16 always set the count cannot exceed 16.
addF("math/bits", "TrailingZeros16",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		uint32Type := types.Types[types.TUINT32]
		widened := s.newValue1(ssa.OpZeroExt16to32, uint32Type, args[0])
		guardBit := s.constInt32(uint32Type, 1<<16)
		guarded := s.newValue2(ssa.OpOr32, uint32Type, widened, guardBit)
		return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], guarded)
	},
	sys.MIPS)
// TrailingZeros16 on the families where it is emitted as a single OpCtz16.
addF("math/bits", "TrailingZeros16",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		return s.newValue1(ssa.OpCtz16, types.Types[types.TINT], args[0])
	},
	sys.AMD64, sys.I386, sys.ARM, sys.ARM64, sys.Wasm)
addF("math/bits", "TrailingZeros16",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
x := s.newValue1(ssa.OpZeroExt16to64, types.Types[types.TUINT64], args[0])
c := s.constInt64(types.Types[types.TUINT64], 1<<16)
y := s.newValue2(ssa.OpOr64, types.Types[types.TUINT64], x, c)
return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], y)
},
cmd/compile: wire up math/bits.TrailingZeros intrinsics for loong64 Micro-benchmark results on Loongson 3A5000 and 3A6000: goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | TrailingZeros 1.7240n ± 0% 0.8120n ± 0% -52.90% (p=0.000 n=20) TrailingZeros8 1.0530n ± 0% 0.8015n ± 0% -23.88% (p=0.000 n=20) TrailingZeros16 2.072n ± 0% 1.015n ± 0% -51.01% (p=0.000 n=20) TrailingZeros32 1.7160n ± 0% 0.8122n ± 0% -52.67% (p=0.000 n=20) TrailingZeros64 2.0060n ± 0% 0.8125n ± 0% -59.50% (p=0.000 n=20) geomean 1.669n 0.8470n -49.25% goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | TrailingZeros 2.6275n ± 0% 0.9120n ± 0% -65.29% (p=0.000 n=20) TrailingZeros8 1.451n ± 0% 1.163n ± 0% -19.85% (p=0.000 n=20) TrailingZeros16 3.069n ± 0% 1.201n ± 0% -60.87% (p=0.000 n=20) TrailingZeros32 2.9060n ± 0% 0.9115n ± 0% -68.63% (p=0.000 n=20) TrailingZeros64 2.6305n ± 0% 0.9115n ± 0% -65.35% (p=0.000 n=20) geomean 2.456n 1.011n -58.83% This patch is a copy of CL 479498. Co-authored-by: WANG Xuerui <git@xen0n.name> Change-Id: I1a5b2114a844dc0d02c8e68f41ce2443ac3b5fda Reviewed-on: https://go-review.googlesource.com/c/go/+/624356 Reviewed-by: abner chenc <chenguoqi@loongson.cn> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: Cherry Mui <cherryyz@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Keith Randall <khr@google.com>
2024-11-01 16:09:32 +08:00
sys.Loong64, sys.S390X, sys.PPC64)
// TrailingZeros8 on MIPS: zero-extend to 32 bits, OR in bit 8, then count
// with OpCtz32. With bit 8 always set the count cannot exceed 8.
addF("math/bits", "TrailingZeros8",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		uint32Type := types.Types[types.TUINT32]
		widened := s.newValue1(ssa.OpZeroExt8to32, uint32Type, args[0])
		guardBit := s.constInt32(uint32Type, 1<<8)
		guarded := s.newValue2(ssa.OpOr32, uint32Type, widened, guardBit)
		return s.newValue1(ssa.OpCtz32, types.Types[types.TINT], guarded)
	},
	sys.MIPS)
// TrailingZeros8 on the families where it is emitted as a single OpCtz8.
addF("math/bits", "TrailingZeros8",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		return s.newValue1(ssa.OpCtz8, types.Types[types.TINT], args[0])
	},
	sys.AMD64, sys.I386, sys.ARM, sys.ARM64, sys.Wasm)
addF("math/bits", "TrailingZeros8",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
x := s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], args[0])
c := s.constInt64(types.Types[types.TUINT64], 1<<8)
y := s.newValue2(ssa.OpOr64, types.Types[types.TUINT64], x, c)
return s.newValue1(ssa.OpCtz64, types.Types[types.TINT], y)
},
cmd/compile: wire up math/bits.TrailingZeros intrinsics for loong64 Micro-benchmark results on Loongson 3A5000 and 3A6000: goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | TrailingZeros 1.7240n ± 0% 0.8120n ± 0% -52.90% (p=0.000 n=20) TrailingZeros8 1.0530n ± 0% 0.8015n ± 0% -23.88% (p=0.000 n=20) TrailingZeros16 2.072n ± 0% 1.015n ± 0% -51.01% (p=0.000 n=20) TrailingZeros32 1.7160n ± 0% 0.8122n ± 0% -52.67% (p=0.000 n=20) TrailingZeros64 2.0060n ± 0% 0.8125n ± 0% -59.50% (p=0.000 n=20) geomean 1.669n 0.8470n -49.25% goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | TrailingZeros 2.6275n ± 0% 0.9120n ± 0% -65.29% (p=0.000 n=20) TrailingZeros8 1.451n ± 0% 1.163n ± 0% -19.85% (p=0.000 n=20) TrailingZeros16 3.069n ± 0% 1.201n ± 0% -60.87% (p=0.000 n=20) TrailingZeros32 2.9060n ± 0% 0.9115n ± 0% -68.63% (p=0.000 n=20) TrailingZeros64 2.6305n ± 0% 0.9115n ± 0% -65.35% (p=0.000 n=20) geomean 2.456n 1.011n -58.83% This patch is a copy of CL 479498. Co-authored-by: WANG Xuerui <git@xen0n.name> Change-Id: I1a5b2114a844dc0d02c8e68f41ce2443ac3b5fda Reviewed-on: https://go-review.googlesource.com/c/go/+/624356 Reviewed-by: abner chenc <chenguoqi@loongson.cn> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: Cherry Mui <cherryyz@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Keith Randall <khr@google.com>
2024-11-01 16:09:32 +08:00
sys.Loong64, sys.S390X)
// ReverseBytes64/32 reuse the internal/runtime/sys Bswap64/Bswap32 intrinsics
// on every architecture in all.
alias("math/bits", "ReverseBytes64", "internal/runtime/sys", "Bswap64", all...)
alias("math/bits", "ReverseBytes32", "internal/runtime/sys", "Bswap32", all...)
// ReverseBytes16 on Loong64 is emitted as OpBswap16 with a uint16 result.
addF("math/bits", "ReverseBytes16",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT16], args[0])
	},
	sys.Loong64)
// ReverseBytes inlines correctly, no need to intrinsify it.
// Nothing special is needed for targets where ReverseBytes16 lowers to a rotate.
// On Power10, 16-bit rotate is not available so use BRH instruction.
// The registration only happens when cfg.goppc64 >= 10.
// NOTE(review): the result type here is TUINT, whereas the Loong64
// ReverseBytes16 registration uses TUINT16 — confirm this is intentional.
if cfg.goppc64 >= 10 {
	addF("math/bits", "ReverseBytes16",
		func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
			return s.newValue1(ssa.OpBswap16, types.Types[types.TUINT], args[0])
		},
		sys.PPC64)
}
addF("math/bits", "Len64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], args[0])
},
cmd/compile: wire up math/bits.Len intrinsics for loong64 For the SubFromLen64 codegen test case to work as intended, we need to fold c-(-(x-d)) into x+(c-d). Still, some instances of LeadingZeros are not optimized into single CLZ instructions right now (actually, the LeadingZeros micro-benchmarks are currently still compiled with redundant adds/subs of 64, due to interference of loop optimizations before lowering), but perf numbers indicate it's not that bad after all. Micro-benchmark results on Loongson 3A5000 and 3A6000: goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | LeadingZeros 3.660n ± 0% 1.348n ± 0% -63.17% (p=0.000 n=20) LeadingZeros8 1.777n ± 0% 1.767n ± 0% -0.56% (p=0.000 n=20) LeadingZeros16 2.816n ± 0% 1.770n ± 0% -37.14% (p=0.000 n=20) LeadingZeros32 5.293n ± 1% 1.683n ± 0% -68.21% (p=0.000 n=20) LeadingZeros64 3.622n ± 0% 1.349n ± 0% -62.76% (p=0.000 n=20) geomean 3.229n 1.571n -51.35% goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | LeadingZeros 2.410n ± 0% 1.103n ± 1% -54.23% (p=0.000 n=20) LeadingZeros8 1.236n ± 0% 1.501n ± 0% +21.44% (p=0.000 n=20) LeadingZeros16 2.106n ± 0% 1.501n ± 0% -28.73% (p=0.000 n=20) LeadingZeros32 2.860n ± 0% 1.324n ± 0% -53.72% (p=0.000 n=20) LeadingZeros64 2.6135n ± 0% 0.9509n ± 0% -63.62% (p=0.000 n=20) geomean 2.159n 1.256n -41.81% Updates #59120 This patch is a copy of CL 483356. Co-authored-by: WANG Xuerui <git@xen0n.name> Change-Id: Iee81a17f7da06d77a427e73dfcc016f2b15ae556 Reviewed-on: https://go-review.googlesource.com/c/go/+/624575 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: Carlos Amedee <carlos@golang.org> Reviewed-by: abner chenc <chenguoqi@loongson.cn>
2024-11-02 10:59:20 +08:00
sys.AMD64, sys.ARM64, sys.ARM, sys.Loong64, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
addF("math/bits", "Len32",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0])
},
cmd/compile: wire up math/bits.Len intrinsics for loong64 For the SubFromLen64 codegen test case to work as intended, we need to fold c-(-(x-d)) into x+(c-d). Still, some instances of LeadingZeros are not optimized into single CLZ instructions right now (actually, the LeadingZeros micro-benchmarks are currently still compiled with redundant adds/subs of 64, due to interference of loop optimizations before lowering), but perf numbers indicate it's not that bad after all. Micro-benchmark results on Loongson 3A5000 and 3A6000: goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | LeadingZeros 3.660n ± 0% 1.348n ± 0% -63.17% (p=0.000 n=20) LeadingZeros8 1.777n ± 0% 1.767n ± 0% -0.56% (p=0.000 n=20) LeadingZeros16 2.816n ± 0% 1.770n ± 0% -37.14% (p=0.000 n=20) LeadingZeros32 5.293n ± 1% 1.683n ± 0% -68.21% (p=0.000 n=20) LeadingZeros64 3.622n ± 0% 1.349n ± 0% -62.76% (p=0.000 n=20) geomean 3.229n 1.571n -51.35% goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | LeadingZeros 2.410n ± 0% 1.103n ± 1% -54.23% (p=0.000 n=20) LeadingZeros8 1.236n ± 0% 1.501n ± 0% +21.44% (p=0.000 n=20) LeadingZeros16 2.106n ± 0% 1.501n ± 0% -28.73% (p=0.000 n=20) LeadingZeros32 2.860n ± 0% 1.324n ± 0% -53.72% (p=0.000 n=20) LeadingZeros64 2.6135n ± 0% 0.9509n ± 0% -63.62% (p=0.000 n=20) geomean 2.159n 1.256n -41.81% Updates #59120 This patch is a copy of CL 483356. Co-authored-by: WANG Xuerui <git@xen0n.name> Change-Id: Iee81a17f7da06d77a427e73dfcc016f2b15ae556 Reviewed-on: https://go-review.googlesource.com/c/go/+/624575 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: Carlos Amedee <carlos@golang.org> Reviewed-by: abner chenc <chenguoqi@loongson.cn>
2024-11-02 10:59:20 +08:00
sys.AMD64, sys.ARM64, sys.Loong64, sys.PPC64)
// Len32 on ARM, S390X, MIPS and Wasm. The op depends on s.config.PtrSize:
// OpBitLen32 when it is 4, otherwise the operand is zero-extended to 64 bits
// and measured with OpBitLen64.
addF("math/bits", "Len32",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		intType := types.Types[types.TINT]
		if s.config.PtrSize != 4 {
			widened := s.newValue1(ssa.OpZeroExt32to64, types.Types[types.TUINT64], args[0])
			return s.newValue1(ssa.OpBitLen64, intType, widened)
		}
		return s.newValue1(ssa.OpBitLen32, intType, args[0])
	},
	sys.ARM, sys.S390X, sys.MIPS, sys.Wasm)
addF("math/bits", "Len16",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
if s.config.PtrSize == 4 {
x := s.newValue1(ssa.OpZeroExt16to32, types.Types[types.TUINT32], args[0])
return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], x)
}
x := s.newValue1(ssa.OpZeroExt16to64, types.Types[types.TUINT64], args[0])
return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], x)
},
cmd/compile: wire up math/bits.Len intrinsics for loong64 For the SubFromLen64 codegen test case to work as intended, we need to fold c-(-(x-d)) into x+(c-d). Still, some instances of LeadingZeros are not optimized into single CLZ instructions right now (actually, the LeadingZeros micro-benchmarks are currently still compiled with redundant adds/subs of 64, due to interference of loop optimizations before lowering), but perf numbers indicate it's not that bad after all. Micro-benchmark results on Loongson 3A5000 and 3A6000: goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | LeadingZeros 3.660n ± 0% 1.348n ± 0% -63.17% (p=0.000 n=20) LeadingZeros8 1.777n ± 0% 1.767n ± 0% -0.56% (p=0.000 n=20) LeadingZeros16 2.816n ± 0% 1.770n ± 0% -37.14% (p=0.000 n=20) LeadingZeros32 5.293n ± 1% 1.683n ± 0% -68.21% (p=0.000 n=20) LeadingZeros64 3.622n ± 0% 1.349n ± 0% -62.76% (p=0.000 n=20) geomean 3.229n 1.571n -51.35% goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | LeadingZeros 2.410n ± 0% 1.103n ± 1% -54.23% (p=0.000 n=20) LeadingZeros8 1.236n ± 0% 1.501n ± 0% +21.44% (p=0.000 n=20) LeadingZeros16 2.106n ± 0% 1.501n ± 0% -28.73% (p=0.000 n=20) LeadingZeros32 2.860n ± 0% 1.324n ± 0% -53.72% (p=0.000 n=20) LeadingZeros64 2.6135n ± 0% 0.9509n ± 0% -63.62% (p=0.000 n=20) geomean 2.159n 1.256n -41.81% Updates #59120 This patch is a copy of CL 483356. Co-authored-by: WANG Xuerui <git@xen0n.name> Change-Id: Iee81a17f7da06d77a427e73dfcc016f2b15ae556 Reviewed-on: https://go-review.googlesource.com/c/go/+/624575 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: Carlos Amedee <carlos@golang.org> Reviewed-by: abner chenc <chenguoqi@loongson.cn>
2024-11-02 10:59:20 +08:00
sys.ARM64, sys.ARM, sys.Loong64, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
// Len16 on AMD64 is emitted as a single OpBitLen16, with no widening step.
addF("math/bits", "Len16",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		return s.newValue1(ssa.OpBitLen16, types.Types[types.TINT], args[0])
	},
	sys.AMD64)
addF("math/bits", "Len8",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
if s.config.PtrSize == 4 {
x := s.newValue1(ssa.OpZeroExt8to32, types.Types[types.TUINT32], args[0])
return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], x)
}
x := s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], args[0])
return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], x)
},
cmd/compile: wire up math/bits.Len intrinsics for loong64 For the SubFromLen64 codegen test case to work as intended, we need to fold c-(-(x-d)) into x+(c-d). Still, some instances of LeadingZeros are not optimized into single CLZ instructions right now (actually, the LeadingZeros micro-benchmarks are currently still compiled with redundant adds/subs of 64, due to interference of loop optimizations before lowering), but perf numbers indicate it's not that bad after all. Micro-benchmark results on Loongson 3A5000 and 3A6000: goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | LeadingZeros 3.660n ± 0% 1.348n ± 0% -63.17% (p=0.000 n=20) LeadingZeros8 1.777n ± 0% 1.767n ± 0% -0.56% (p=0.000 n=20) LeadingZeros16 2.816n ± 0% 1.770n ± 0% -37.14% (p=0.000 n=20) LeadingZeros32 5.293n ± 1% 1.683n ± 0% -68.21% (p=0.000 n=20) LeadingZeros64 3.622n ± 0% 1.349n ± 0% -62.76% (p=0.000 n=20) geomean 3.229n 1.571n -51.35% goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | LeadingZeros 2.410n ± 0% 1.103n ± 1% -54.23% (p=0.000 n=20) LeadingZeros8 1.236n ± 0% 1.501n ± 0% +21.44% (p=0.000 n=20) LeadingZeros16 2.106n ± 0% 1.501n ± 0% -28.73% (p=0.000 n=20) LeadingZeros32 2.860n ± 0% 1.324n ± 0% -53.72% (p=0.000 n=20) LeadingZeros64 2.6135n ± 0% 0.9509n ± 0% -63.62% (p=0.000 n=20) geomean 2.159n 1.256n -41.81% Updates #59120 This patch is a copy of CL 483356. Co-authored-by: WANG Xuerui <git@xen0n.name> Change-Id: Iee81a17f7da06d77a427e73dfcc016f2b15ae556 Reviewed-on: https://go-review.googlesource.com/c/go/+/624575 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: Carlos Amedee <carlos@golang.org> Reviewed-by: abner chenc <chenguoqi@loongson.cn>
2024-11-02 10:59:20 +08:00
sys.ARM64, sys.ARM, sys.Loong64, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
// Len8 on AMD64 is emitted as a single OpBitLen8, with no widening step.
addF("math/bits", "Len8",
	func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
		return s.newValue1(ssa.OpBitLen8, types.Types[types.TINT], args[0])
	},
	sys.AMD64)
addF("math/bits", "Len",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
if s.config.PtrSize == 4 {
return s.newValue1(ssa.OpBitLen32, types.Types[types.TINT], args[0])
}
return s.newValue1(ssa.OpBitLen64, types.Types[types.TINT], args[0])
},
cmd/compile: wire up math/bits.Len intrinsics for loong64 For the SubFromLen64 codegen test case to work as intended, we need to fold c-(-(x-d)) into x+(c-d). Still, some instances of LeadingZeros are not optimized into single CLZ instructions right now (actually, the LeadingZeros micro-benchmarks are currently still compiled with redundant adds/subs of 64, due to interference of loop optimizations before lowering), but perf numbers indicate it's not that bad after all. Micro-benchmark results on Loongson 3A5000 and 3A6000: goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | LeadingZeros 3.660n ± 0% 1.348n ± 0% -63.17% (p=0.000 n=20) LeadingZeros8 1.777n ± 0% 1.767n ± 0% -0.56% (p=0.000 n=20) LeadingZeros16 2.816n ± 0% 1.770n ± 0% -37.14% (p=0.000 n=20) LeadingZeros32 5.293n ± 1% 1.683n ± 0% -68.21% (p=0.000 n=20) LeadingZeros64 3.622n ± 0% 1.349n ± 0% -62.76% (p=0.000 n=20) geomean 3.229n 1.571n -51.35% goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | LeadingZeros 2.410n ± 0% 1.103n ± 1% -54.23% (p=0.000 n=20) LeadingZeros8 1.236n ± 0% 1.501n ± 0% +21.44% (p=0.000 n=20) LeadingZeros16 2.106n ± 0% 1.501n ± 0% -28.73% (p=0.000 n=20) LeadingZeros32 2.860n ± 0% 1.324n ± 0% -53.72% (p=0.000 n=20) LeadingZeros64 2.6135n ± 0% 0.9509n ± 0% -63.62% (p=0.000 n=20) geomean 2.159n 1.256n -41.81% Updates #59120 This patch is a copy of CL 483356. Co-authored-by: WANG Xuerui <git@xen0n.name> Change-Id: Iee81a17f7da06d77a427e73dfcc016f2b15ae556 Reviewed-on: https://go-review.googlesource.com/c/go/+/624575 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: Carlos Amedee <carlos@golang.org> Reviewed-by: abner chenc <chenguoqi@loongson.cn>
2024-11-02 10:59:20 +08:00
sys.AMD64, sys.ARM64, sys.ARM, sys.Loong64, sys.S390X, sys.MIPS, sys.PPC64, sys.Wasm)
// LeadingZeros is handled because it trivially calls Len.
addF("math/bits", "Reverse64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpBitRev64, types.Types[types.TINT], args[0])
},
cmd/compile: wire up bits.Reverse intrinsics for loong64 Micro-benchmark results on Loongson 3A5000 and 3A6000: goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A6000 @ 2500.00MHz | CL 624576 | this CL | | sec/op | sec/op vs base | Reverse 2.8130n ± 0% 0.8008n ± 0% -71.53% (p=0.000 n=20) Reverse8 0.7014n ± 0% 0.4040n ± 0% -42.40% (p=0.000 n=20) Reverse16 1.2975n ± 0% 0.6632n ± 1% -48.89% (p=0.000 n=20) Reverse32 2.7520n ± 0% 0.4042n ± 0% -85.31% (p=0.000 n=20) Reverse64 2.8970n ± 0% 0.4041n ± 0% -86.05% (p=0.000 n=20) geomean 1.828n 0.5116n -72.01% goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A5000 @ 2500.00MHz | CL 624576 | this CL | | sec/op | sec/op vs base | Reverse 4.0050n ± 0% 0.8011n ± 0% -80.00% (p=0.000 n=20) Reverse8 0.8010n ± 0% 0.5210n ± 1% -34.96% (p=0.000 n=20) Reverse16 1.6160n ± 0% 0.6008n ± 0% -62.82% (p=0.000 n=20) Reverse32 3.8550n ± 0% 0.5179n ± 0% -86.57% (p=0.000 n=20) Reverse64 3.8050n ± 0% 0.5177n ± 0% -86.40% (p=0.000 n=20) geomean 2.378n 0.5828n -75.49% Updates #59120 This patch is a copy of CL 483656. Co-authored-by: WANG Xuerui <git@xen0n.name> Change-Id: I98681091763279279c8404bd0295785f13ea1c8e Reviewed-on: https://go-review.googlesource.com/c/go/+/624276 Reviewed-by: abner chenc <chenguoqi@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: David Chase <drchase@google.com>
2024-11-02 15:40:13 +08:00
sys.ARM64, sys.Loong64)
addF("math/bits", "Reverse32",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpBitRev32, types.Types[types.TINT], args[0])
},
cmd/compile: wire up bits.Reverse intrinsics for loong64 Micro-benchmark results on Loongson 3A5000 and 3A6000: goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A6000 @ 2500.00MHz | CL 624576 | this CL | | sec/op | sec/op vs base | Reverse 2.8130n ± 0% 0.8008n ± 0% -71.53% (p=0.000 n=20) Reverse8 0.7014n ± 0% 0.4040n ± 0% -42.40% (p=0.000 n=20) Reverse16 1.2975n ± 0% 0.6632n ± 1% -48.89% (p=0.000 n=20) Reverse32 2.7520n ± 0% 0.4042n ± 0% -85.31% (p=0.000 n=20) Reverse64 2.8970n ± 0% 0.4041n ± 0% -86.05% (p=0.000 n=20) geomean 1.828n 0.5116n -72.01% goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A5000 @ 2500.00MHz | CL 624576 | this CL | | sec/op | sec/op vs base | Reverse 4.0050n ± 0% 0.8011n ± 0% -80.00% (p=0.000 n=20) Reverse8 0.8010n ± 0% 0.5210n ± 1% -34.96% (p=0.000 n=20) Reverse16 1.6160n ± 0% 0.6008n ± 0% -62.82% (p=0.000 n=20) Reverse32 3.8550n ± 0% 0.5179n ± 0% -86.57% (p=0.000 n=20) Reverse64 3.8050n ± 0% 0.5177n ± 0% -86.40% (p=0.000 n=20) geomean 2.378n 0.5828n -75.49% Updates #59120 This patch is a copy of CL 483656. Co-authored-by: WANG Xuerui <git@xen0n.name> Change-Id: I98681091763279279c8404bd0295785f13ea1c8e Reviewed-on: https://go-review.googlesource.com/c/go/+/624276 Reviewed-by: abner chenc <chenguoqi@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: David Chase <drchase@google.com>
2024-11-02 15:40:13 +08:00
sys.ARM64, sys.Loong64)
addF("math/bits", "Reverse16",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpBitRev16, types.Types[types.TINT], args[0])
},
cmd/compile: wire up bits.Reverse intrinsics for loong64 Micro-benchmark results on Loongson 3A5000 and 3A6000: goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A6000 @ 2500.00MHz | CL 624576 | this CL | | sec/op | sec/op vs base | Reverse 2.8130n ± 0% 0.8008n ± 0% -71.53% (p=0.000 n=20) Reverse8 0.7014n ± 0% 0.4040n ± 0% -42.40% (p=0.000 n=20) Reverse16 1.2975n ± 0% 0.6632n ± 1% -48.89% (p=0.000 n=20) Reverse32 2.7520n ± 0% 0.4042n ± 0% -85.31% (p=0.000 n=20) Reverse64 2.8970n ± 0% 0.4041n ± 0% -86.05% (p=0.000 n=20) geomean 1.828n 0.5116n -72.01% goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A5000 @ 2500.00MHz | CL 624576 | this CL | | sec/op | sec/op vs base | Reverse 4.0050n ± 0% 0.8011n ± 0% -80.00% (p=0.000 n=20) Reverse8 0.8010n ± 0% 0.5210n ± 1% -34.96% (p=0.000 n=20) Reverse16 1.6160n ± 0% 0.6008n ± 0% -62.82% (p=0.000 n=20) Reverse32 3.8550n ± 0% 0.5179n ± 0% -86.57% (p=0.000 n=20) Reverse64 3.8050n ± 0% 0.5177n ± 0% -86.40% (p=0.000 n=20) geomean 2.378n 0.5828n -75.49% Updates #59120 This patch is a copy of CL 483656. Co-authored-by: WANG Xuerui <git@xen0n.name> Change-Id: I98681091763279279c8404bd0295785f13ea1c8e Reviewed-on: https://go-review.googlesource.com/c/go/+/624276 Reviewed-by: abner chenc <chenguoqi@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: David Chase <drchase@google.com>
2024-11-02 15:40:13 +08:00
sys.ARM64, sys.Loong64)
addF("math/bits", "Reverse8",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpBitRev8, types.Types[types.TINT], args[0])
},
cmd/compile: wire up bits.Reverse intrinsics for loong64 Micro-benchmark results on Loongson 3A5000 and 3A6000: goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A6000 @ 2500.00MHz | CL 624576 | this CL | | sec/op | sec/op vs base | Reverse 2.8130n ± 0% 0.8008n ± 0% -71.53% (p=0.000 n=20) Reverse8 0.7014n ± 0% 0.4040n ± 0% -42.40% (p=0.000 n=20) Reverse16 1.2975n ± 0% 0.6632n ± 1% -48.89% (p=0.000 n=20) Reverse32 2.7520n ± 0% 0.4042n ± 0% -85.31% (p=0.000 n=20) Reverse64 2.8970n ± 0% 0.4041n ± 0% -86.05% (p=0.000 n=20) geomean 1.828n 0.5116n -72.01% goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A5000 @ 2500.00MHz | CL 624576 | this CL | | sec/op | sec/op vs base | Reverse 4.0050n ± 0% 0.8011n ± 0% -80.00% (p=0.000 n=20) Reverse8 0.8010n ± 0% 0.5210n ± 1% -34.96% (p=0.000 n=20) Reverse16 1.6160n ± 0% 0.6008n ± 0% -62.82% (p=0.000 n=20) Reverse32 3.8550n ± 0% 0.5179n ± 0% -86.57% (p=0.000 n=20) Reverse64 3.8050n ± 0% 0.5177n ± 0% -86.40% (p=0.000 n=20) geomean 2.378n 0.5828n -75.49% Updates #59120 This patch is a copy of CL 483656. Co-authored-by: WANG Xuerui <git@xen0n.name> Change-Id: I98681091763279279c8404bd0295785f13ea1c8e Reviewed-on: https://go-review.googlesource.com/c/go/+/624276 Reviewed-by: abner chenc <chenguoqi@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: David Chase <drchase@google.com>
2024-11-02 15:40:13 +08:00
sys.ARM64, sys.Loong64)
addF("math/bits", "Reverse",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpBitRev64, types.Types[types.TINT], args[0])
},
cmd/compile: wire up bits.Reverse intrinsics for loong64 Micro-benchmark results on Loongson 3A5000 and 3A6000: goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A6000 @ 2500.00MHz | CL 624576 | this CL | | sec/op | sec/op vs base | Reverse 2.8130n ± 0% 0.8008n ± 0% -71.53% (p=0.000 n=20) Reverse8 0.7014n ± 0% 0.4040n ± 0% -42.40% (p=0.000 n=20) Reverse16 1.2975n ± 0% 0.6632n ± 1% -48.89% (p=0.000 n=20) Reverse32 2.7520n ± 0% 0.4042n ± 0% -85.31% (p=0.000 n=20) Reverse64 2.8970n ± 0% 0.4041n ± 0% -86.05% (p=0.000 n=20) geomean 1.828n 0.5116n -72.01% goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A5000 @ 2500.00MHz | CL 624576 | this CL | | sec/op | sec/op vs base | Reverse 4.0050n ± 0% 0.8011n ± 0% -80.00% (p=0.000 n=20) Reverse8 0.8010n ± 0% 0.5210n ± 1% -34.96% (p=0.000 n=20) Reverse16 1.6160n ± 0% 0.6008n ± 0% -62.82% (p=0.000 n=20) Reverse32 3.8550n ± 0% 0.5179n ± 0% -86.57% (p=0.000 n=20) Reverse64 3.8050n ± 0% 0.5177n ± 0% -86.40% (p=0.000 n=20) geomean 2.378n 0.5828n -75.49% Updates #59120 This patch is a copy of CL 483656. Co-authored-by: WANG Xuerui <git@xen0n.name> Change-Id: I98681091763279279c8404bd0295785f13ea1c8e Reviewed-on: https://go-review.googlesource.com/c/go/+/624276 Reviewed-by: abner chenc <chenguoqi@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: David Chase <drchase@google.com>
2024-11-02 15:40:13 +08:00
sys.ARM64, sys.Loong64)
addF("math/bits", "RotateLeft8",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue2(ssa.OpRotateLeft8, types.Types[types.TUINT8], args[0], args[1])
},
sys.AMD64, sys.RISCV64)
addF("math/bits", "RotateLeft16",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue2(ssa.OpRotateLeft16, types.Types[types.TUINT16], args[0], args[1])
},
sys.AMD64, sys.RISCV64)
addF("math/bits", "RotateLeft32",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue2(ssa.OpRotateLeft32, types.Types[types.TUINT32], args[0], args[1])
},
sys.AMD64, sys.ARM, sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm)
addF("math/bits", "RotateLeft64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue2(ssa.OpRotateLeft64, types.Types[types.TUINT64], args[0], args[1])
},
sys.AMD64, sys.ARM64, sys.Loong64, sys.PPC64, sys.RISCV64, sys.S390X, sys.Wasm)
alias("math/bits", "RotateLeft", "math/bits", "RotateLeft64", p8...)
// makeOnesCountAMD64 returns an intrinsic builder that implements a
// population-count call with the SSA op passed in as op.
//
// When cfg.goamd64 >= 2 the builder emits op unconditionally. Otherwise it
// emits a runtime test of ir.Syms.X86HasPOPCNT and branches on it: one arm
// evaluates op, the other performs the ordinary call to the pure Go
// version, and the two results are merged through s.vars[n].
// NOTE(review): this presumably relies on GOAMD64=v2 and later guaranteeing
// POPCNT - confirm against the GOAMD64 documentation.
makeOnesCountAMD64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
if cfg.goamd64 >= 2 {
// No runtime feature check is emitted at this GOAMD64 level.
return s.newValue1(op, types.Types[types.TINT], args[0])
}
// Runtime feature test keyed on ir.Syms.X86HasPOPCNT.
v := s.entryNewValue0A(ssa.OpHasCPUFeature, types.Types[types.TBOOL], ir.Syms.X86HasPOPCNT)
// End the current block and turn it into a conditional branch on v.
b := s.endBlock()
b.Kind = ssa.BlockIf
b.SetControl(v)
bTrue := s.f.NewBlock(ssa.BlockPlain)
bFalse := s.f.NewBlock(ssa.BlockPlain)
bEnd := s.f.NewBlock(ssa.BlockPlain)
// bTrue is added before bFalse. NOTE(review): successor order is
// presumably significant for a BlockIf (first edge taken when v is
// true) - do not reorder these two calls.
b.AddEdgeTo(bTrue)
b.AddEdgeTo(bFalse)
b.Likely = ssa.BranchLikely // most machines have popcnt nowadays
// We have the intrinsic - use it directly.
s.startBlock(bTrue)
s.vars[n] = s.newValue1(op, types.Types[types.TINT], args[0])
s.endBlock().AddEdgeTo(bEnd)
// Call the pure Go version.
s.startBlock(bFalse)
s.vars[n] = s.callResult(n, callNormal) // types.Types[TINT]
s.endBlock().AddEdgeTo(bEnd)
// Merge results.
// Both arms stored their result under the key n; reading it back in
// bEnd yields the merged value.
s.startBlock(bEnd)
return s.variable(n, types.Types[types.TINT])
}
}
cmd/compile: optimize math/bits.OnesCount{16,32,64} implementation on loong64 Use Loong64's LSX instruction VPCNT to implement math/bits.OnesCount{16,32,64} and make it intrinsic. Benchmark results on loongson 3A5000 and 3A6000 machines: goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A5000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | OnesCount 4.413n ± 0% 1.401n ± 0% -68.25% (p=0.000 n=10) OnesCount8 1.364n ± 0% 1.363n ± 0% ~ (p=0.130 n=10) OnesCount16 2.112n ± 0% 1.534n ± 0% -27.37% (p=0.000 n=10) OnesCount32 4.533n ± 0% 1.529n ± 0% -66.27% (p=0.000 n=10) OnesCount64 4.565n ± 0% 1.531n ± 1% -66.46% (p=0.000 n=10) geomean 3.048n 1.470n -51.78% goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | OnesCount 3.553n ± 0% 1.201n ± 0% -66.20% (p=0.000 n=10) OnesCount8 0.8021n ± 0% 0.8004n ± 0% -0.21% (p=0.000 n=10) OnesCount16 1.216n ± 0% 1.000n ± 0% -17.76% (p=0.000 n=10) OnesCount32 3.006n ± 0% 1.035n ± 0% -65.57% (p=0.000 n=10) OnesCount64 3.503n ± 0% 1.035n ± 0% -70.45% (p=0.000 n=10) geomean 2.053n 1.006n -51.01% Change-Id: I07a5b8da2bb48711b896387ec7625145804affc8 Reviewed-on: https://go-review.googlesource.com/c/go/+/620978 Reviewed-by: David Chase <drchase@google.com> Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: Meidan Li <limeidan@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
2024-10-18 16:31:29 +08:00
// makeOnesCountLoong64 returns an intrinsic builder that implements a
// population-count call with the SSA op passed in as op.
//
// The builder always emits a runtime check: it loads the bool stored at
// ir.Syms.Loong64HasLSX and branches on it. One arm evaluates op, the other
// performs the ordinary call to the pure Go version, and the two results
// are merged through s.vars[n].
makeOnesCountLoong64 := func(op ssa.Op) func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
// Address of the ir.Syms.Loong64HasLSX flag (typed *bool, based on
// s.sb), followed by a load of its current value.
addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.Loong64HasLSX, s.sb)
v := s.load(types.Types[types.TBOOL], addr)
// End the current block and turn it into a conditional branch on v.
b := s.endBlock()
b.Kind = ssa.BlockIf
b.SetControl(v)
bTrue := s.f.NewBlock(ssa.BlockPlain)
bFalse := s.f.NewBlock(ssa.BlockPlain)
bEnd := s.f.NewBlock(ssa.BlockPlain)
// bTrue is added before bFalse. NOTE(review): successor order is
// presumably significant for a BlockIf (first edge taken when v is
// true) - do not reorder these two calls.
b.AddEdgeTo(bTrue)
b.AddEdgeTo(bFalse)
b.Likely = ssa.BranchLikely // most loong64 machines support the LSX
// We have the intrinsic - use it directly.
s.startBlock(bTrue)
s.vars[n] = s.newValue1(op, types.Types[types.TINT], args[0])
s.endBlock().AddEdgeTo(bEnd)
// Call the pure Go version.
s.startBlock(bFalse)
s.vars[n] = s.callResult(n, callNormal) // types.Types[TINT]
s.endBlock().AddEdgeTo(bEnd)
// Merge results.
// Both arms stored their result under the key n; reading it back in
// bEnd yields the merged value.
s.startBlock(bEnd)
return s.variable(n, types.Types[types.TINT])
}
}
addF("math/bits", "OnesCount64",
makeOnesCountAMD64(ssa.OpPopCount64),
sys.AMD64)
cmd/compile: optimize math/bits.OnesCount{16,32,64} implementation on loong64 Use Loong64's LSX instruction VPCNT to implement math/bits.OnesCount{16,32,64} and make it intrinsic. Benchmark results on loongson 3A5000 and 3A6000 machines: goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A5000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | OnesCount 4.413n ± 0% 1.401n ± 0% -68.25% (p=0.000 n=10) OnesCount8 1.364n ± 0% 1.363n ± 0% ~ (p=0.130 n=10) OnesCount16 2.112n ± 0% 1.534n ± 0% -27.37% (p=0.000 n=10) OnesCount32 4.533n ± 0% 1.529n ± 0% -66.27% (p=0.000 n=10) OnesCount64 4.565n ± 0% 1.531n ± 1% -66.46% (p=0.000 n=10) geomean 3.048n 1.470n -51.78% goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | OnesCount 3.553n ± 0% 1.201n ± 0% -66.20% (p=0.000 n=10) OnesCount8 0.8021n ± 0% 0.8004n ± 0% -0.21% (p=0.000 n=10) OnesCount16 1.216n ± 0% 1.000n ± 0% -17.76% (p=0.000 n=10) OnesCount32 3.006n ± 0% 1.035n ± 0% -65.57% (p=0.000 n=10) OnesCount64 3.503n ± 0% 1.035n ± 0% -70.45% (p=0.000 n=10) geomean 2.053n 1.006n -51.01% Change-Id: I07a5b8da2bb48711b896387ec7625145804affc8 Reviewed-on: https://go-review.googlesource.com/c/go/+/620978 Reviewed-by: David Chase <drchase@google.com> Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: Meidan Li <limeidan@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
2024-10-18 16:31:29 +08:00
addF("math/bits", "OnesCount64",
makeOnesCountLoong64(ssa.OpPopCount64),
sys.Loong64)
addF("math/bits", "OnesCount64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpPopCount64, types.Types[types.TINT], args[0])
},
sys.PPC64, sys.ARM64, sys.S390X, sys.Wasm)
addF("math/bits", "OnesCount32",
makeOnesCountAMD64(ssa.OpPopCount32),
sys.AMD64)
cmd/compile: optimize math/bits.OnesCount{16,32,64} implementation on loong64 Use Loong64's LSX instruction VPCNT to implement math/bits.OnesCount{16,32,64} and make it intrinsic. Benchmark results on loongson 3A5000 and 3A6000 machines: goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A5000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | OnesCount 4.413n ± 0% 1.401n ± 0% -68.25% (p=0.000 n=10) OnesCount8 1.364n ± 0% 1.363n ± 0% ~ (p=0.130 n=10) OnesCount16 2.112n ± 0% 1.534n ± 0% -27.37% (p=0.000 n=10) OnesCount32 4.533n ± 0% 1.529n ± 0% -66.27% (p=0.000 n=10) OnesCount64 4.565n ± 0% 1.531n ± 1% -66.46% (p=0.000 n=10) geomean 3.048n 1.470n -51.78% goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | OnesCount 3.553n ± 0% 1.201n ± 0% -66.20% (p=0.000 n=10) OnesCount8 0.8021n ± 0% 0.8004n ± 0% -0.21% (p=0.000 n=10) OnesCount16 1.216n ± 0% 1.000n ± 0% -17.76% (p=0.000 n=10) OnesCount32 3.006n ± 0% 1.035n ± 0% -65.57% (p=0.000 n=10) OnesCount64 3.503n ± 0% 1.035n ± 0% -70.45% (p=0.000 n=10) geomean 2.053n 1.006n -51.01% Change-Id: I07a5b8da2bb48711b896387ec7625145804affc8 Reviewed-on: https://go-review.googlesource.com/c/go/+/620978 Reviewed-by: David Chase <drchase@google.com> Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: Meidan Li <limeidan@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
2024-10-18 16:31:29 +08:00
addF("math/bits", "OnesCount32",
makeOnesCountLoong64(ssa.OpPopCount32),
sys.Loong64)
addF("math/bits", "OnesCount32",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpPopCount32, types.Types[types.TINT], args[0])
},
sys.PPC64, sys.ARM64, sys.S390X, sys.Wasm)
addF("math/bits", "OnesCount16",
makeOnesCountAMD64(ssa.OpPopCount16),
sys.AMD64)
cmd/compile: optimize math/bits.OnesCount{16,32,64} implementation on loong64 Use Loong64's LSX instruction VPCNT to implement math/bits.OnesCount{16,32,64} and make it intrinsic. Benchmark results on loongson 3A5000 and 3A6000 machines: goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A5000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | OnesCount 4.413n ± 0% 1.401n ± 0% -68.25% (p=0.000 n=10) OnesCount8 1.364n ± 0% 1.363n ± 0% ~ (p=0.130 n=10) OnesCount16 2.112n ± 0% 1.534n ± 0% -27.37% (p=0.000 n=10) OnesCount32 4.533n ± 0% 1.529n ± 0% -66.27% (p=0.000 n=10) OnesCount64 4.565n ± 0% 1.531n ± 1% -66.46% (p=0.000 n=10) geomean 3.048n 1.470n -51.78% goos: linux goarch: loong64 pkg: math/bits cpu: Loongson-3A6000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | OnesCount 3.553n ± 0% 1.201n ± 0% -66.20% (p=0.000 n=10) OnesCount8 0.8021n ± 0% 0.8004n ± 0% -0.21% (p=0.000 n=10) OnesCount16 1.216n ± 0% 1.000n ± 0% -17.76% (p=0.000 n=10) OnesCount32 3.006n ± 0% 1.035n ± 0% -65.57% (p=0.000 n=10) OnesCount64 3.503n ± 0% 1.035n ± 0% -70.45% (p=0.000 n=10) geomean 2.053n 1.006n -51.01% Change-Id: I07a5b8da2bb48711b896387ec7625145804affc8 Reviewed-on: https://go-review.googlesource.com/c/go/+/620978 Reviewed-by: David Chase <drchase@google.com> Reviewed-by: Cherry Mui <cherryyz@google.com> Reviewed-by: Meidan Li <limeidan@loongson.cn> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
2024-10-18 16:31:29 +08:00
addF("math/bits", "OnesCount16",
makeOnesCountLoong64(ssa.OpPopCount16),
sys.Loong64)
addF("math/bits", "OnesCount16",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpPopCount16, types.Types[types.TINT], args[0])
},
sys.ARM64, sys.S390X, sys.PPC64, sys.Wasm)
addF("math/bits", "OnesCount8",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpPopCount8, types.Types[types.TINT], args[0])
},
sys.S390X, sys.PPC64, sys.Wasm)
addF("math/bits", "OnesCount",
makeOnesCountAMD64(ssa.OpPopCount64),
sys.AMD64)
addF("math/bits", "Mul64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue2(ssa.OpMul64uhilo, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1])
},
sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.MIPS64, sys.RISCV64, sys.Loong64)
alias("math/bits", "Mul", "math/bits", "Mul64", p8...)
alias("internal/runtime/math", "Mul64", "math/bits", "Mul64", p8...)
addF("math/bits", "Add64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue3(ssa.OpAdd64carry, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2])
},
sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.RISCV64, sys.Loong64, sys.MIPS64)
alias("math/bits", "Add", "math/bits", "Add64", p8...)
alias("internal/runtime/math", "Add64", "math/bits", "Add64", all...)
addF("math/bits", "Sub64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue3(ssa.OpSub64borrow, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2])
},
sys.AMD64, sys.ARM64, sys.PPC64, sys.S390X, sys.RISCV64, sys.Loong64, sys.MIPS64)
alias("math/bits", "Sub", "math/bits", "Sub64", p8...)
addF("math/bits", "Div64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
// check for divide-by-zero/overflow and panic with appropriate message
cmpZero := s.newValue2(s.ssaOp(ir.ONE, types.Types[types.TUINT64]), types.Types[types.TBOOL], args[2], s.zeroVal(types.Types[types.TUINT64]))
s.check(cmpZero, ir.Syms.Panicdivide)
cmpOverflow := s.newValue2(s.ssaOp(ir.OLT, types.Types[types.TUINT64]), types.Types[types.TBOOL], args[0], args[2])
s.check(cmpOverflow, ir.Syms.Panicoverflow)
return s.newValue3(ssa.OpDiv128u, types.NewTuple(types.Types[types.TUINT64], types.Types[types.TUINT64]), args[0], args[1], args[2])
},
sys.AMD64)
alias("math/bits", "Div", "math/bits", "Div64", sys.ArchAMD64)
alias("internal/runtime/sys", "TrailingZeros8", "math/bits", "TrailingZeros8", all...)
alias("internal/runtime/sys", "TrailingZeros32", "math/bits", "TrailingZeros32", all...)
alias("internal/runtime/sys", "TrailingZeros64", "math/bits", "TrailingZeros64", all...)
alias("internal/runtime/sys", "Len8", "math/bits", "Len8", all...)
alias("internal/runtime/sys", "Len64", "math/bits", "Len64", all...)
alias("internal/runtime/sys", "OnesCount64", "math/bits", "OnesCount64", all...)
/******** sync/atomic ********/
// Note: these are disabled by flag_race in findIntrinsic below.
alias("sync/atomic", "LoadInt32", "internal/runtime/atomic", "Load", all...)
alias("sync/atomic", "LoadInt64", "internal/runtime/atomic", "Load64", all...)
alias("sync/atomic", "LoadPointer", "internal/runtime/atomic", "Loadp", all...)
alias("sync/atomic", "LoadUint32", "internal/runtime/atomic", "Load", all...)
alias("sync/atomic", "LoadUint64", "internal/runtime/atomic", "Load64", all...)
alias("sync/atomic", "LoadUintptr", "internal/runtime/atomic", "Load", p4...)
alias("sync/atomic", "LoadUintptr", "internal/runtime/atomic", "Load64", p8...)
alias("sync/atomic", "StoreInt32", "internal/runtime/atomic", "Store", all...)
alias("sync/atomic", "StoreInt64", "internal/runtime/atomic", "Store64", all...)
// Note: not StorePointer, that needs a write barrier. Same below for {CompareAnd}Swap.
alias("sync/atomic", "StoreUint32", "internal/runtime/atomic", "Store", all...)
alias("sync/atomic", "StoreUint64", "internal/runtime/atomic", "Store64", all...)
alias("sync/atomic", "StoreUintptr", "internal/runtime/atomic", "Store", p4...)
alias("sync/atomic", "StoreUintptr", "internal/runtime/atomic", "Store64", p8...)
alias("sync/atomic", "SwapInt32", "internal/runtime/atomic", "Xchg", all...)
alias("sync/atomic", "SwapInt64", "internal/runtime/atomic", "Xchg64", all...)
alias("sync/atomic", "SwapUint32", "internal/runtime/atomic", "Xchg", all...)
alias("sync/atomic", "SwapUint64", "internal/runtime/atomic", "Xchg64", all...)
alias("sync/atomic", "SwapUintptr", "internal/runtime/atomic", "Xchg", p4...)
alias("sync/atomic", "SwapUintptr", "internal/runtime/atomic", "Xchg64", p8...)
alias("sync/atomic", "CompareAndSwapInt32", "internal/runtime/atomic", "Cas", all...)
alias("sync/atomic", "CompareAndSwapInt64", "internal/runtime/atomic", "Cas64", all...)
alias("sync/atomic", "CompareAndSwapUint32", "internal/runtime/atomic", "Cas", all...)
alias("sync/atomic", "CompareAndSwapUint64", "internal/runtime/atomic", "Cas64", all...)
alias("sync/atomic", "CompareAndSwapUintptr", "internal/runtime/atomic", "Cas", p4...)
alias("sync/atomic", "CompareAndSwapUintptr", "internal/runtime/atomic", "Cas64", p8...)
alias("sync/atomic", "AddInt32", "internal/runtime/atomic", "Xadd", all...)
alias("sync/atomic", "AddInt64", "internal/runtime/atomic", "Xadd64", all...)
alias("sync/atomic", "AddUint32", "internal/runtime/atomic", "Xadd", all...)
alias("sync/atomic", "AddUint64", "internal/runtime/atomic", "Xadd64", all...)
alias("sync/atomic", "AddUintptr", "internal/runtime/atomic", "Xadd", p4...)
alias("sync/atomic", "AddUintptr", "internal/runtime/atomic", "Xadd64", p8...)
cmd/compiler,internal/runtime/atomic: optimize And{64,32,8} and Or{64,32,8} on loong64 Use loong64's atomic operation instruction AMANDDB{V,W,W} (full barrier) to implement And{64,32,8}, AMORDB{V,W,W} (full barrier) to implement Or{64,32,8}. Intrinsify And{64,32,8} and Or{64,32,8}, And this CL alias all of the And/Or operations into sync/atomic package. goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A6000-HV @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | And32 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20) And32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) And64 27.73n ± 0% 10.81n ± 0% -61.02% (p=0.000 n=20) And64Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) Or32 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20) Or32Parallel 28.96n ± 0% 12.41n ± 0% -57.15% (p=0.000 n=20) Or64 27.62n ± 0% 10.81n ± 0% -60.86% (p=0.000 n=20) Or64Parallel 28.97n ± 0% 12.41n ± 0% -57.16% (p=0.000 n=20) And8 29.15n ± 0% 13.21n ± 0% -54.68% (p=0.000 n=20) And 27.71n ± 0% 12.82n ± 0% -53.74% (p=0.000 n=20) And8Parallel 28.99n ± 0% 14.46n ± 0% -50.12% (p=0.000 n=20) AndParallel 29.12n ± 0% 14.42n ± 0% -50.48% (p=0.000 n=20) Or8 28.31n ± 0% 12.81n ± 0% -54.75% (p=0.000 n=20) Or 27.72n ± 0% 12.81n ± 0% -53.79% (p=0.000 n=20) Or8Parallel 29.03n ± 0% 14.62n ± 0% -49.64% (p=0.000 n=20) OrParallel 29.12n ± 0% 14.42n ± 0% -50.49% (p=0.000 n=20) geomean 28.47n 12.58n -55.80% goos: linux goarch: loong64 pkg: internal/runtime/atomic cpu: Loongson-3A5000 @ 2500.00MHz | bench.old | bench.new | | sec/op | sec/op vs base | And32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) And32Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) And64 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) And64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) And8 30.42n ± 0% 14.41n ± 0% -52.63% (p=0.000 n=20) And 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20) And8Parallel 31.23n ± 0% 15.21n ± 0% -51.30% (p=0.000 n=20) AndParallel 30.83n ± 0% 14.41n ± 0% 
-53.26% (p=0.000 n=20) Or32 30.02n ± 0% 14.81n ± 0% -50.67% (p=0.000 n=20) Or32Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) Or64 30.02n ± 0% 14.82n ± 0% -50.63% (p=0.000 n=20) Or64Parallel 30.83n ± 0% 15.61n ± 0% -49.37% (p=0.000 n=20) Or8 30.02n ± 0% 14.01n ± 0% -53.33% (p=0.000 n=20) Or 30.02n ± 0% 13.61n ± 0% -54.66% (p=0.000 n=20) Or8Parallel 30.83n ± 0% 14.81n ± 0% -51.96% (p=0.000 n=20) OrParallel 30.83n ± 0% 14.41n ± 0% -53.26% (p=0.000 n=20) geomean 30.47n 14.75n -51.61% Change-Id: If008ff6a08b51905076f8ddb6e92f8e214d3f7b3 Reviewed-on: https://go-review.googlesource.com/c/go/+/482756 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Qiqi Huang <huangqiqi@loongson.cn> Reviewed-by: Meidan Li <limeidan@loongson.cn> Reviewed-by: David Chase <drchase@google.com> Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn> Reviewed-by: Cherry Mui <cherryyz@google.com>
2024-09-23 11:38:36 +08:00
alias("sync/atomic", "AndInt32", "internal/runtime/atomic", "And32", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
alias("sync/atomic", "AndUint32", "internal/runtime/atomic", "And32", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
alias("sync/atomic", "AndInt64", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
alias("sync/atomic", "AndUint64", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
alias("sync/atomic", "AndUintptr", "internal/runtime/atomic", "And64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
alias("sync/atomic", "OrInt32", "internal/runtime/atomic", "Or32", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
alias("sync/atomic", "OrUint32", "internal/runtime/atomic", "Or32", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
alias("sync/atomic", "OrInt64", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
alias("sync/atomic", "OrUint64", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
alias("sync/atomic", "OrUintptr", "internal/runtime/atomic", "Or64", sys.ArchARM64, sys.ArchAMD64, sys.ArchLoong64)
/******** math/big ********/
alias("math/big", "mulWW", "math/bits", "Mul64", p8...)
}
// findIntrinsic returns a function which builds the SSA equivalent of the
// function identified by the symbol sym. If sym is not an intrinsic call, returns nil.
//
// nil is also returned, without consulting the intrinsics table, when:
//   - sym or sym.Pkg is nil;
//   - the race detector is enabled and the function is in sync/atomic;
//   - the target is soft-float and the function is in package math;
//   - ssa.IntrinsicsDisable is set and the function is not one of the
//     internal/runtime/sys functions that must be intrinsified.
func findIntrinsic(sym *types.Sym) intrinsicBuilder {
	if sym == nil || sym.Pkg == nil {
		return nil
	}
	pkg := sym.Pkg.Path
	if sym.Pkg == ir.Pkgs.Runtime {
		// Look the runtime package up under its plain import path.
		pkg = "runtime"
	}
	if base.Flag.Race && pkg == "sync/atomic" {
		// The race detector needs to be able to intercept these calls.
		// We can't intrinsify them.
		return nil
	}
	// Skip intrinsifying math functions (which may contain hard-float
	// instructions) when compiling for a soft-float target.
	if Arch.SoftFloat && pkg == "math" {
		return nil
	}

	fn := sym.Name
	if ssa.IntrinsicsDisable {
		// These runtime functions don't have definitions, must be intrinsics,
		// even when intrinsics are otherwise disabled.
		mustIntrinsify := pkg == "internal/runtime/sys" &&
			(fn == "GetCallerPC" || fn == "GetCallerSP" || fn == "GetClosurePtr")
		if !mustIntrinsify {
			return nil
		}
	}
	return intrinsics.lookup(Arch.LinkArch.Arch, pkg, fn)
}
// IsIntrinsicCall reports whether n is a call whose callee findIntrinsic
// recognizes. It returns false when n is nil or when n.Fun is not an
// *ir.Name.
func IsIntrinsicCall(n *ir.CallExpr) bool {
	if n == nil {
		return false
	}
	if callee, isName := n.Fun.(*ir.Name); isName {
		return findIntrinsic(callee.Sym()) != nil
	}
	return false
}