mirror of
https://github.com/golang/go.git
synced 2026-06-27 19:30:52 +00:00
cmd/compile,internal/runtime/atomic: optimize Store{64,32,8} on loong64
On loong64, the OpAtomicStore{8, 32, 64}Variant implementation uses the amswapdb.{b,w,d}
instruction (full dbar), while OpAtomicStore{8, 32, 64} uses a plain store instruction
bracketed by dbar hints (dbar 0x12 + st + dbar 0x18, i.e. store-release semantics).
Most current loong64 machines (such as the 3A6000, 2K3000, etc.) support finer-grained dbar
hints, so on CPUs that support DBAR_HINTS, the OpAtomicStore{8, 32, 64} implementation
should be used whenever possible for better performance.
goos: linux
goarch: loong64
pkg: internal/runtime/atomic
cpu: Loongson-3A6000 @ 2500.00MHz
| old.txt | new.txt |
| sec/op | sec/op vs base |
AtomicStore64-8 12.42n ± 0% 10.02n ± 0% -19.32% (p=0.000 n=10)
AtomicStore-8 12.42n ± 0% 10.02n ± 0% -19.32% (p=0.000 n=10)
AtomicStore8-8 12.42n ± 0% 10.02n ± 0% -19.32% (p=0.000 n=10)
geomean 12.42n 10.02n -19.32%
Change-Id: Ib8960b5eb3ff74f5fd0671192546255771b8bf5b
Reviewed-on: https://go-review.googlesource.com/c/go/+/768200
Reviewed-by: Carlos Amedee <carlos@golang.org>
LUCI-TryBot-Result: golang-scoped@luci-project-accounts.iam.gserviceaccount.com <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Mark Freeman <markfreeman@google.com>
Reviewed-by: Meidan Li <limeidan@loongson.cn>
This commit is contained in:
parent
ca10097f29
commit
879b659ae0
12 changed files with 82 additions and 66 deletions
|
|
@ -67,23 +67,24 @@ type symsStruct struct {
|
|||
WBZero *obj.LSym
|
||||
WBMove *obj.LSym
|
||||
// Wasm
|
||||
SigPanic *obj.LSym
|
||||
Staticuint64s *obj.LSym
|
||||
Typedmemmove *obj.LSym
|
||||
Udiv *obj.LSym
|
||||
WriteBarrier *obj.LSym
|
||||
Zerobase *obj.LSym
|
||||
ZeroVal *obj.LSym
|
||||
ARM64HasATOMICS *obj.LSym
|
||||
ARMHasVFPv4 *obj.LSym
|
||||
Loong64HasLAMCAS *obj.LSym
|
||||
Loong64HasLAM_BH *obj.LSym
|
||||
Loong64HasLSX *obj.LSym
|
||||
RISCV64HasZbb *obj.LSym
|
||||
X86HasAVX *obj.LSym
|
||||
X86HasFMA *obj.LSym
|
||||
X86HasPOPCNT *obj.LSym
|
||||
X86HasSSE41 *obj.LSym
|
||||
SigPanic *obj.LSym
|
||||
Staticuint64s *obj.LSym
|
||||
Typedmemmove *obj.LSym
|
||||
Udiv *obj.LSym
|
||||
WriteBarrier *obj.LSym
|
||||
Zerobase *obj.LSym
|
||||
ZeroVal *obj.LSym
|
||||
ARM64HasATOMICS *obj.LSym
|
||||
ARMHasVFPv4 *obj.LSym
|
||||
Loong64HasLAMCAS *obj.LSym
|
||||
Loong64HasLAM_BH *obj.LSym
|
||||
Loong64HasDBAR_HINTS *obj.LSym
|
||||
Loong64HasLSX *obj.LSym
|
||||
RISCV64HasZbb *obj.LSym
|
||||
X86HasAVX *obj.LSym
|
||||
X86HasFMA *obj.LSym
|
||||
X86HasPOPCNT *obj.LSym
|
||||
X86HasSSE41 *obj.LSym
|
||||
// Wasm
|
||||
WasmDiv *obj.LSym
|
||||
// Wasm
|
||||
|
|
|
|||
|
|
@ -302,7 +302,7 @@ func initIntrinsics(cfg *intrinsicBuildConfig) {
|
|||
s.vars[memVar] = s.newValue3(ssa.OpAtomicStore8, types.TypeMem, args[0], args[1], s.mem())
|
||||
return nil
|
||||
},
|
||||
sys.AMD64, sys.ARM64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
|
||||
sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
|
||||
addF("internal/runtime/atomic", "Store64",
|
||||
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
|
||||
s.vars[memVar] = s.newValue3(ssa.OpAtomicStore64, types.TypeMem, args[0], args[1], s.mem())
|
||||
|
|
@ -331,7 +331,7 @@ func initIntrinsics(cfg *intrinsicBuildConfig) {
|
|||
makeAtomicStoreGuardedIntrinsicLoong64 := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder {
|
||||
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
|
||||
// Target Atomic feature is identified by dynamic detection
|
||||
addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.Loong64HasLAM_BH, s.sb)
|
||||
addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.Loong64HasDBAR_HINTS, s.sb)
|
||||
v := s.load(types.Types[types.TBOOL], addr)
|
||||
b := s.endBlock()
|
||||
b.Kind = ssa.BlockIf
|
||||
|
|
@ -343,14 +343,14 @@ func initIntrinsics(cfg *intrinsicBuildConfig) {
|
|||
b.AddEdgeTo(bFalse)
|
||||
b.Likely = ssa.BranchLikely
|
||||
|
||||
// We have atomic instructions - use it directly.
|
||||
// most loong64 machines support the finer-grained DBAR hints
|
||||
s.startBlock(bTrue)
|
||||
emit(s, n, args, op1, typ, false)
|
||||
emit(s, n, args, op0, typ, false)
|
||||
s.endBlock().AddEdgeTo(bEnd)
|
||||
|
||||
// Use original instruction sequence.
|
||||
s.startBlock(bFalse)
|
||||
emit(s, n, args, op0, typ, false)
|
||||
emit(s, n, args, op1, typ, false)
|
||||
s.endBlock().AddEdgeTo(bEnd)
|
||||
|
||||
// Merge results.
|
||||
|
|
@ -368,20 +368,11 @@ func initIntrinsics(cfg *intrinsicBuildConfig) {
|
|||
}
|
||||
}
|
||||
|
||||
addF("internal/runtime/atomic", "Store8",
|
||||
makeAtomicStoreGuardedIntrinsicLoong64(ssa.OpAtomicStore8, ssa.OpAtomicStore8Variant, types.TUINT8, atomicStoreEmitterLoong64),
|
||||
sys.Loong64)
|
||||
addF("internal/runtime/atomic", "Store",
|
||||
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
|
||||
s.vars[memVar] = s.newValue3(ssa.OpAtomicStore32Variant, types.TypeMem, args[0], args[1], s.mem())
|
||||
return nil
|
||||
},
|
||||
makeAtomicStoreGuardedIntrinsicLoong64(ssa.OpAtomicStore32, ssa.OpAtomicStore32Variant, types.TUINT8, atomicStoreEmitterLoong64),
|
||||
sys.Loong64)
|
||||
addF("internal/runtime/atomic", "Store64",
|
||||
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
|
||||
s.vars[memVar] = s.newValue3(ssa.OpAtomicStore64Variant, types.TypeMem, args[0], args[1], s.mem())
|
||||
return nil
|
||||
},
|
||||
makeAtomicStoreGuardedIntrinsicLoong64(ssa.OpAtomicStore64, ssa.OpAtomicStore64Variant, types.TUINT8, atomicStoreEmitterLoong64),
|
||||
sys.Loong64)
|
||||
|
||||
addF("internal/runtime/atomic", "Xchg8",
|
||||
|
|
|
|||
|
|
@ -170,16 +170,17 @@ func InitConfig() {
|
|||
ir.Syms.TypeAssert = typecheck.LookupRuntimeFunc("typeAssert")
|
||||
ir.Syms.WBZero = typecheck.LookupRuntimeFunc("wbZero")
|
||||
ir.Syms.WBMove = typecheck.LookupRuntimeFunc("wbMove")
|
||||
ir.Syms.X86HasAVX = typecheck.LookupRuntimeVar("x86HasAVX") // bool
|
||||
ir.Syms.X86HasFMA = typecheck.LookupRuntimeVar("x86HasFMA") // bool
|
||||
ir.Syms.X86HasPOPCNT = typecheck.LookupRuntimeVar("x86HasPOPCNT") // bool
|
||||
ir.Syms.X86HasSSE41 = typecheck.LookupRuntimeVar("x86HasSSE41") // bool
|
||||
ir.Syms.ARMHasVFPv4 = typecheck.LookupRuntimeVar("armHasVFPv4") // bool
|
||||
ir.Syms.ARM64HasATOMICS = typecheck.LookupRuntimeVar("arm64HasATOMICS") // bool
|
||||
ir.Syms.Loong64HasLAMCAS = typecheck.LookupRuntimeVar("loong64HasLAMCAS") // bool
|
||||
ir.Syms.Loong64HasLAM_BH = typecheck.LookupRuntimeVar("loong64HasLAM_BH") // bool
|
||||
ir.Syms.Loong64HasLSX = typecheck.LookupRuntimeVar("loong64HasLSX") // bool
|
||||
ir.Syms.RISCV64HasZbb = typecheck.LookupRuntimeVar("riscv64HasZbb") // bool
|
||||
ir.Syms.X86HasAVX = typecheck.LookupRuntimeVar("x86HasAVX") // bool
|
||||
ir.Syms.X86HasFMA = typecheck.LookupRuntimeVar("x86HasFMA") // bool
|
||||
ir.Syms.X86HasPOPCNT = typecheck.LookupRuntimeVar("x86HasPOPCNT") // bool
|
||||
ir.Syms.X86HasSSE41 = typecheck.LookupRuntimeVar("x86HasSSE41") // bool
|
||||
ir.Syms.ARMHasVFPv4 = typecheck.LookupRuntimeVar("armHasVFPv4") // bool
|
||||
ir.Syms.ARM64HasATOMICS = typecheck.LookupRuntimeVar("arm64HasATOMICS") // bool
|
||||
ir.Syms.Loong64HasLAMCAS = typecheck.LookupRuntimeVar("loong64HasLAMCAS") // bool
|
||||
ir.Syms.Loong64HasLAM_BH = typecheck.LookupRuntimeVar("loong64HasLAM_BH") // bool
|
||||
ir.Syms.Loong64HasDBAR_HINTS = typecheck.LookupRuntimeVar("loong64HasDBAR_HINTS") // bool
|
||||
ir.Syms.Loong64HasLSX = typecheck.LookupRuntimeVar("loong64HasLSX") // bool
|
||||
ir.Syms.RISCV64HasZbb = typecheck.LookupRuntimeVar("riscv64HasZbb") // bool
|
||||
ir.Syms.Staticuint64s = typecheck.LookupRuntimeVar("staticuint64s")
|
||||
ir.Syms.Typedmemmove = typecheck.LookupRuntimeFunc("typedmemmove")
|
||||
ir.Syms.Udiv = typecheck.LookupRuntimeVar("udiv") // asm func with special ABI
|
||||
|
|
|
|||
|
|
@ -303,6 +303,7 @@ var armHasVFPv4 bool
|
|||
var arm64HasATOMICS bool
|
||||
var loong64HasLAMCAS bool
|
||||
var loong64HasLAM_BH bool
|
||||
var loong64HasDBAR_HINTS bool
|
||||
var loong64HasLSX bool
|
||||
var riscv64HasZbb bool
|
||||
|
||||
|
|
|
|||
|
|
@ -250,6 +250,7 @@ var runtimeDecls = [...]struct {
|
|||
{"arm64HasATOMICS", varTag, 6},
|
||||
{"loong64HasLAMCAS", varTag, 6},
|
||||
{"loong64HasLAM_BH", varTag, 6},
|
||||
{"loong64HasDBAR_HINTS", varTag, 6},
|
||||
{"loong64HasLSX", varTag, 6},
|
||||
{"riscv64HasZbb", varTag, 6},
|
||||
{"asanregisterglobals", funcTag, 136},
|
||||
|
|
|
|||
|
|
@ -229,6 +229,7 @@ var builtins = [...]struct {
|
|||
{"runtime.arm64HasATOMICS", 0},
|
||||
{"runtime.loong64HasLAMCAS", 0},
|
||||
{"runtime.loong64HasLAM_BH", 0},
|
||||
{"runtime.loong64HasDBAR_HINTS", 0},
|
||||
{"runtime.loong64HasLSX", 0},
|
||||
{"runtime.riscv64HasZbb", 0},
|
||||
{"runtime.asanregisterglobals", 1},
|
||||
|
|
|
|||
|
|
@ -92,13 +92,14 @@ var ARM64 struct {
|
|||
// The booleans in Loong64 contain the correspondingly named cpu feature bit.
|
||||
// The struct is padded to avoid false sharing.
|
||||
var Loong64 struct {
|
||||
_ CacheLinePad
|
||||
HasLSX bool // support 128-bit vector extension
|
||||
HasLASX bool // support 256-bit vector extension
|
||||
HasCRC32 bool // support CRC instruction
|
||||
HasLAMCAS bool // support AMCAS[_DB].{B/H/W/D}
|
||||
HasLAM_BH bool // support AM{SWAP/ADD}[_DB].{B/H} instruction
|
||||
_ CacheLinePad
|
||||
_ CacheLinePad
|
||||
HasLSX bool // support 128-bit vector extension
|
||||
HasLASX bool // support 256-bit vector extension
|
||||
HasCRC32 bool // support CRC instruction
|
||||
HasLAMCAS bool // support AMCAS[_DB].{B/H/W/D}
|
||||
HasLAM_BH bool // support AM{SWAP/ADD}[_DB].{B/H} instruction
|
||||
HasDBAR_HINTS bool // supports finer-grained DBAR hints
|
||||
_ CacheLinePad
|
||||
}
|
||||
|
||||
var MIPS64X struct {
|
||||
|
|
|
|||
|
|
@ -19,6 +19,9 @@ const (
|
|||
// CPUCFG2 bits
|
||||
cpucfg2_LAM_BH = 1 << 27
|
||||
cpucfg2_LAMCAS = 1 << 28
|
||||
|
||||
// CPUCFG3 bits
|
||||
cpucfg3_DBAR_HINTS = 1 << 17
|
||||
)
|
||||
|
||||
// get_cpucfg is implemented in cpu_loong64.s.
|
||||
|
|
@ -31,6 +34,7 @@ func doinit() {
|
|||
{Name: "crc32", Feature: &Loong64.HasCRC32},
|
||||
{Name: "lamcas", Feature: &Loong64.HasLAMCAS},
|
||||
{Name: "lam_bh", Feature: &Loong64.HasLAM_BH},
|
||||
{Name: "dbar_hints", Feature: &Loong64.HasDBAR_HINTS},
|
||||
}
|
||||
|
||||
// The CPUCFG data on Loong64 only reflects the hardware capabilities,
|
||||
|
|
@ -42,10 +46,12 @@ func doinit() {
|
|||
// through CPUCFG
|
||||
cfg1 := get_cpucfg(1)
|
||||
cfg2 := get_cpucfg(2)
|
||||
cfg3 := get_cpucfg(3)
|
||||
|
||||
Loong64.HasCRC32 = cfgIsSet(cfg1, cpucfg1_CRC32)
|
||||
Loong64.HasLAMCAS = cfgIsSet(cfg2, cpucfg2_LAMCAS)
|
||||
Loong64.HasLAM_BH = cfgIsSet(cfg2, cpucfg2_LAM_BH)
|
||||
Loong64.HasDBAR_HINTS = cfgIsSet(cfg3, cpucfg3_DBAR_HINTS)
|
||||
|
||||
osInit()
|
||||
}
|
||||
|
|
|
|||
|
|
@ -12,8 +12,8 @@ import (
|
|||
)
|
||||
|
||||
const (
|
||||
offsetLOONG64HasLAMCAS = unsafe.Offsetof(cpu.Loong64.HasLAMCAS)
|
||||
offsetLoong64HasLAM_BH = unsafe.Offsetof(cpu.Loong64.HasLAM_BH)
|
||||
offsetLOONG64HasLAMCAS = unsafe.Offsetof(cpu.Loong64.HasLAMCAS)
|
||||
offsetLoong64HasDBAR_HINTS = unsafe.Offsetof(cpu.Loong64.HasDBAR_HINTS)
|
||||
)
|
||||
|
||||
//go:noescape
|
||||
|
|
|
|||
|
|
@ -231,29 +231,40 @@ TEXT ·StoreRel64(SB), NOSPLIT, $0-16
|
|||
TEXT ·StoreReluintptr(SB), NOSPLIT, $0-16
|
||||
JMP ·Store64(SB)
|
||||
|
||||
TEXT ·Store(SB), NOSPLIT, $0-12
|
||||
MOVV ptr+0(FP), R4
|
||||
MOVW val+8(FP), R5
|
||||
AMSWAPDBW R5, (R4), R0
|
||||
RET
|
||||
|
||||
TEXT ·Store8(SB), NOSPLIT, $0-9
|
||||
MOVV ptr+0(FP), R4
|
||||
MOVB val+8(FP), R5
|
||||
MOVBU internal∕cpu·Loong64+const_offsetLoong64HasLAM_BH(SB), R6
|
||||
BEQ R6, _legacy_store8_
|
||||
AMSWAPDBB R5, (R4), R0
|
||||
RET
|
||||
_legacy_store8_:
|
||||
// StoreRelease barrier
|
||||
DBAR $0x12
|
||||
MOVB R5, 0(R4)
|
||||
DBAR $0x18
|
||||
RET
|
||||
|
||||
TEXT ·Store(SB), NOSPLIT, $0-12
|
||||
MOVV ptr+0(FP), R4
|
||||
MOVW val+8(FP), R5
|
||||
MOVBU internal∕cpu·Loong64+const_offsetLoong64HasDBAR_HINTS(SB), R6
|
||||
BEQ R6, _variant_
|
||||
// StoreRelease barrier
|
||||
DBAR $0x12
|
||||
MOVW R5, 0(R4)
|
||||
DBAR $0x18
|
||||
RET
|
||||
_variant_:
|
||||
AMSWAPDBW R5, (R4), R0
|
||||
RET
|
||||
|
||||
TEXT ·Store64(SB), NOSPLIT, $0-16
|
||||
MOVV ptr+0(FP), R4
|
||||
MOVV val+8(FP), R5
|
||||
MOVBU internal∕cpu·Loong64+const_offsetLoong64HasDBAR_HINTS(SB), R6
|
||||
BEQ R6, _variant_
|
||||
// StoreRelease barrier
|
||||
DBAR $0x12
|
||||
MOVV R5, 0(R4)
|
||||
DBAR $0x18
|
||||
RET
|
||||
_variant_:
|
||||
AMSWAPDBV R5, (R4), R0
|
||||
RET
|
||||
|
||||
|
|
|
|||
|
|
@ -37,9 +37,10 @@ var (
|
|||
|
||||
arm64HasATOMICS bool
|
||||
|
||||
loong64HasLAMCAS bool
|
||||
loong64HasLAM_BH bool
|
||||
loong64HasLSX bool
|
||||
loong64HasLAMCAS bool
|
||||
loong64HasLAM_BH bool
|
||||
loong64HasDBAR_HINTS bool
|
||||
loong64HasLSX bool
|
||||
|
||||
riscv64HasZbb bool
|
||||
)
|
||||
|
|
|
|||
|
|
@ -782,6 +782,7 @@ func cpuinit(env string) {
|
|||
case "loong64":
|
||||
loong64HasLAMCAS = cpu.Loong64.HasLAMCAS
|
||||
loong64HasLAM_BH = cpu.Loong64.HasLAM_BH
|
||||
loong64HasDBAR_HINTS = cpu.Loong64.HasDBAR_HINTS
|
||||
loong64HasLSX = cpu.Loong64.HasLSX
|
||||
|
||||
case "riscv64":
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue