cmd/compile,internal/runtime/atomic: optimize Store{64,32,8} on loong64

On loong64, the OpAtomicStore{8, 32, 64}Variant implementation uses the amswapdb.{b,w,d}
instruction (full dbar); OpAtomicStore{8, 32, 64} uses a plain store instruction bracketed
by finer-grained dbar hints (dbar 0x12 + st + dbar 0x18, i.e. store-release semantics).

Currently, most loong64 machines (such as 3A6000, 2K3000, etc.) support finer-grained dbar
hints, so on CPUs that support DBAR_HINTS, the OpAtomicStore{8, 32, 64} implementation
should be used as much as possible for better performance.

goos: linux
goarch: loong64
pkg: internal/runtime/atomic
cpu: Loongson-3A6000 @ 2500.00MHz
                |   old.txt   |               new.txt               |
                |   sec/op    |   sec/op     vs base                |
AtomicStore64-8   12.42n ± 0%   10.02n ± 0%  -19.32% (p=0.000 n=10)
AtomicStore-8     12.42n ± 0%   10.02n ± 0%  -19.32% (p=0.000 n=10)
AtomicStore8-8    12.42n ± 0%   10.02n ± 0%  -19.32% (p=0.000 n=10)
geomean           12.42n        10.02n       -19.32%

Change-Id: Ib8960b5eb3ff74f5fd0671192546255771b8bf5b
Reviewed-on: https://go-review.googlesource.com/c/go/+/768200
Reviewed-by: Carlos Amedee <carlos@golang.org>
LUCI-TryBot-Result: golang-scoped@luci-project-accounts.iam.gserviceaccount.com <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Mark Freeman <markfreeman@google.com>
Reviewed-by: Meidan Li <limeidan@loongson.cn>
This commit is contained in:
Guoqi Chen 2026-04-17 10:32:30 +08:00 committed by abner chenc
parent ca10097f29
commit 879b659ae0
12 changed files with 82 additions and 66 deletions

View file

@ -67,23 +67,24 @@ type symsStruct struct {
WBZero *obj.LSym
WBMove *obj.LSym
// Wasm
SigPanic *obj.LSym
Staticuint64s *obj.LSym
Typedmemmove *obj.LSym
Udiv *obj.LSym
WriteBarrier *obj.LSym
Zerobase *obj.LSym
ZeroVal *obj.LSym
ARM64HasATOMICS *obj.LSym
ARMHasVFPv4 *obj.LSym
Loong64HasLAMCAS *obj.LSym
Loong64HasLAM_BH *obj.LSym
Loong64HasLSX *obj.LSym
RISCV64HasZbb *obj.LSym
X86HasAVX *obj.LSym
X86HasFMA *obj.LSym
X86HasPOPCNT *obj.LSym
X86HasSSE41 *obj.LSym
SigPanic *obj.LSym
Staticuint64s *obj.LSym
Typedmemmove *obj.LSym
Udiv *obj.LSym
WriteBarrier *obj.LSym
Zerobase *obj.LSym
ZeroVal *obj.LSym
ARM64HasATOMICS *obj.LSym
ARMHasVFPv4 *obj.LSym
Loong64HasLAMCAS *obj.LSym
Loong64HasLAM_BH *obj.LSym
Loong64HasDBAR_HINTS *obj.LSym
Loong64HasLSX *obj.LSym
RISCV64HasZbb *obj.LSym
X86HasAVX *obj.LSym
X86HasFMA *obj.LSym
X86HasPOPCNT *obj.LSym
X86HasSSE41 *obj.LSym
// Wasm
WasmDiv *obj.LSym
// Wasm

View file

@ -302,7 +302,7 @@ func initIntrinsics(cfg *intrinsicBuildConfig) {
s.vars[memVar] = s.newValue3(ssa.OpAtomicStore8, types.TypeMem, args[0], args[1], s.mem())
return nil
},
sys.AMD64, sys.ARM64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
sys.AMD64, sys.ARM64, sys.Loong64, sys.MIPS, sys.MIPS64, sys.PPC64, sys.RISCV64, sys.S390X)
addF("internal/runtime/atomic", "Store64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
s.vars[memVar] = s.newValue3(ssa.OpAtomicStore64, types.TypeMem, args[0], args[1], s.mem())
@ -331,7 +331,7 @@ func initIntrinsics(cfg *intrinsicBuildConfig) {
makeAtomicStoreGuardedIntrinsicLoong64 := func(op0, op1 ssa.Op, typ types.Kind, emit atomicOpEmitter) intrinsicBuilder {
return func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
// Target Atomic feature is identified by dynamic detection
addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.Loong64HasLAM_BH, s.sb)
addr := s.entryNewValue1A(ssa.OpAddr, types.Types[types.TBOOL].PtrTo(), ir.Syms.Loong64HasDBAR_HINTS, s.sb)
v := s.load(types.Types[types.TBOOL], addr)
b := s.endBlock()
b.Kind = ssa.BlockIf
@ -343,14 +343,14 @@ func initIntrinsics(cfg *intrinsicBuildConfig) {
b.AddEdgeTo(bFalse)
b.Likely = ssa.BranchLikely
// We have atomic instructions - use it directly.
// most loong64 machines support the finer-grained DBAR hints
s.startBlock(bTrue)
emit(s, n, args, op1, typ, false)
emit(s, n, args, op0, typ, false)
s.endBlock().AddEdgeTo(bEnd)
// Use original instruction sequence.
s.startBlock(bFalse)
emit(s, n, args, op0, typ, false)
emit(s, n, args, op1, typ, false)
s.endBlock().AddEdgeTo(bEnd)
// Merge results.
@ -368,20 +368,11 @@ func initIntrinsics(cfg *intrinsicBuildConfig) {
}
}
addF("internal/runtime/atomic", "Store8",
makeAtomicStoreGuardedIntrinsicLoong64(ssa.OpAtomicStore8, ssa.OpAtomicStore8Variant, types.TUINT8, atomicStoreEmitterLoong64),
sys.Loong64)
addF("internal/runtime/atomic", "Store",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
s.vars[memVar] = s.newValue3(ssa.OpAtomicStore32Variant, types.TypeMem, args[0], args[1], s.mem())
return nil
},
makeAtomicStoreGuardedIntrinsicLoong64(ssa.OpAtomicStore32, ssa.OpAtomicStore32Variant, types.TUINT8, atomicStoreEmitterLoong64),
sys.Loong64)
addF("internal/runtime/atomic", "Store64",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
s.vars[memVar] = s.newValue3(ssa.OpAtomicStore64Variant, types.TypeMem, args[0], args[1], s.mem())
return nil
},
makeAtomicStoreGuardedIntrinsicLoong64(ssa.OpAtomicStore64, ssa.OpAtomicStore64Variant, types.TUINT8, atomicStoreEmitterLoong64),
sys.Loong64)
addF("internal/runtime/atomic", "Xchg8",

View file

@ -170,16 +170,17 @@ func InitConfig() {
ir.Syms.TypeAssert = typecheck.LookupRuntimeFunc("typeAssert")
ir.Syms.WBZero = typecheck.LookupRuntimeFunc("wbZero")
ir.Syms.WBMove = typecheck.LookupRuntimeFunc("wbMove")
ir.Syms.X86HasAVX = typecheck.LookupRuntimeVar("x86HasAVX") // bool
ir.Syms.X86HasFMA = typecheck.LookupRuntimeVar("x86HasFMA") // bool
ir.Syms.X86HasPOPCNT = typecheck.LookupRuntimeVar("x86HasPOPCNT") // bool
ir.Syms.X86HasSSE41 = typecheck.LookupRuntimeVar("x86HasSSE41") // bool
ir.Syms.ARMHasVFPv4 = typecheck.LookupRuntimeVar("armHasVFPv4") // bool
ir.Syms.ARM64HasATOMICS = typecheck.LookupRuntimeVar("arm64HasATOMICS") // bool
ir.Syms.Loong64HasLAMCAS = typecheck.LookupRuntimeVar("loong64HasLAMCAS") // bool
ir.Syms.Loong64HasLAM_BH = typecheck.LookupRuntimeVar("loong64HasLAM_BH") // bool
ir.Syms.Loong64HasLSX = typecheck.LookupRuntimeVar("loong64HasLSX") // bool
ir.Syms.RISCV64HasZbb = typecheck.LookupRuntimeVar("riscv64HasZbb") // bool
ir.Syms.X86HasAVX = typecheck.LookupRuntimeVar("x86HasAVX") // bool
ir.Syms.X86HasFMA = typecheck.LookupRuntimeVar("x86HasFMA") // bool
ir.Syms.X86HasPOPCNT = typecheck.LookupRuntimeVar("x86HasPOPCNT") // bool
ir.Syms.X86HasSSE41 = typecheck.LookupRuntimeVar("x86HasSSE41") // bool
ir.Syms.ARMHasVFPv4 = typecheck.LookupRuntimeVar("armHasVFPv4") // bool
ir.Syms.ARM64HasATOMICS = typecheck.LookupRuntimeVar("arm64HasATOMICS") // bool
ir.Syms.Loong64HasLAMCAS = typecheck.LookupRuntimeVar("loong64HasLAMCAS") // bool
ir.Syms.Loong64HasLAM_BH = typecheck.LookupRuntimeVar("loong64HasLAM_BH") // bool
ir.Syms.Loong64HasDBAR_HINTS = typecheck.LookupRuntimeVar("loong64HasDBAR_HINTS") // bool
ir.Syms.Loong64HasLSX = typecheck.LookupRuntimeVar("loong64HasLSX") // bool
ir.Syms.RISCV64HasZbb = typecheck.LookupRuntimeVar("riscv64HasZbb") // bool
ir.Syms.Staticuint64s = typecheck.LookupRuntimeVar("staticuint64s")
ir.Syms.Typedmemmove = typecheck.LookupRuntimeFunc("typedmemmove")
ir.Syms.Udiv = typecheck.LookupRuntimeVar("udiv") // asm func with special ABI

View file

@ -303,6 +303,7 @@ var armHasVFPv4 bool
var arm64HasATOMICS bool
var loong64HasLAMCAS bool
var loong64HasLAM_BH bool
var loong64HasDBAR_HINTS bool
var loong64HasLSX bool
var riscv64HasZbb bool

View file

@ -250,6 +250,7 @@ var runtimeDecls = [...]struct {
{"arm64HasATOMICS", varTag, 6},
{"loong64HasLAMCAS", varTag, 6},
{"loong64HasLAM_BH", varTag, 6},
{"loong64HasDBAR_HINTS", varTag, 6},
{"loong64HasLSX", varTag, 6},
{"riscv64HasZbb", varTag, 6},
{"asanregisterglobals", funcTag, 136},

View file

@ -229,6 +229,7 @@ var builtins = [...]struct {
{"runtime.arm64HasATOMICS", 0},
{"runtime.loong64HasLAMCAS", 0},
{"runtime.loong64HasLAM_BH", 0},
{"runtime.loong64HasDBAR_HINTS", 0},
{"runtime.loong64HasLSX", 0},
{"runtime.riscv64HasZbb", 0},
{"runtime.asanregisterglobals", 1},

View file

@ -92,13 +92,14 @@ var ARM64 struct {
// The booleans in Loong64 contain the correspondingly named cpu feature bit.
// The struct is padded to avoid false sharing.
var Loong64 struct {
_ CacheLinePad
HasLSX bool // support 128-bit vector extension
HasLASX bool // support 256-bit vector extension
HasCRC32 bool // support CRC instruction
HasLAMCAS bool // support AMCAS[_DB].{B/H/W/D}
HasLAM_BH bool // support AM{SWAP/ADD}[_DB].{B/H} instruction
_ CacheLinePad
_ CacheLinePad
HasLSX bool // support 128-bit vector extension
HasLASX bool // support 256-bit vector extension
HasCRC32 bool // support CRC instruction
HasLAMCAS bool // support AMCAS[_DB].{B/H/W/D}
HasLAM_BH bool // support AM{SWAP/ADD}[_DB].{B/H} instruction
HasDBAR_HINTS bool // supports finer-grained DBAR hints
_ CacheLinePad
}
var MIPS64X struct {

View file

@ -19,6 +19,9 @@ const (
// CPUCFG2 bits
cpucfg2_LAM_BH = 1 << 27
cpucfg2_LAMCAS = 1 << 28
// CPUCFG3 bits
cpucfg3_DBAR_HINTS = 1 << 17
)
// get_cpucfg is implemented in cpu_loong64.s.
@ -31,6 +34,7 @@ func doinit() {
{Name: "crc32", Feature: &Loong64.HasCRC32},
{Name: "lamcas", Feature: &Loong64.HasLAMCAS},
{Name: "lam_bh", Feature: &Loong64.HasLAM_BH},
{Name: "dbar_hints", Feature: &Loong64.HasDBAR_HINTS},
}
// The CPUCFG data on Loong64 only reflects the hardware capabilities,
@ -42,10 +46,12 @@ func doinit() {
// through CPUCFG
cfg1 := get_cpucfg(1)
cfg2 := get_cpucfg(2)
cfg3 := get_cpucfg(3)
Loong64.HasCRC32 = cfgIsSet(cfg1, cpucfg1_CRC32)
Loong64.HasLAMCAS = cfgIsSet(cfg2, cpucfg2_LAMCAS)
Loong64.HasLAM_BH = cfgIsSet(cfg2, cpucfg2_LAM_BH)
Loong64.HasDBAR_HINTS = cfgIsSet(cfg3, cpucfg3_DBAR_HINTS)
osInit()
}

View file

@ -12,8 +12,8 @@ import (
)
const (
offsetLOONG64HasLAMCAS = unsafe.Offsetof(cpu.Loong64.HasLAMCAS)
offsetLoong64HasLAM_BH = unsafe.Offsetof(cpu.Loong64.HasLAM_BH)
offsetLOONG64HasLAMCAS = unsafe.Offsetof(cpu.Loong64.HasLAMCAS)
offsetLoong64HasDBAR_HINTS = unsafe.Offsetof(cpu.Loong64.HasDBAR_HINTS)
)
//go:noescape

View file

@ -231,29 +231,40 @@ TEXT ·StoreRel64(SB), NOSPLIT, $0-16
TEXT ·StoreReluintptr(SB), NOSPLIT, $0-16
JMP ·Store64(SB)
TEXT ·Store(SB), NOSPLIT, $0-12
MOVV ptr+0(FP), R4
MOVW val+8(FP), R5
AMSWAPDBW R5, (R4), R0
RET
TEXT ·Store8(SB), NOSPLIT, $0-9
MOVV ptr+0(FP), R4
MOVB val+8(FP), R5
MOVBU internalcpu·Loong64+const_offsetLoong64HasLAM_BH(SB), R6
BEQ R6, _legacy_store8_
AMSWAPDBB R5, (R4), R0
RET
_legacy_store8_:
// StoreRelease barrier
DBAR $0x12
MOVB R5, 0(R4)
DBAR $0x18
RET
TEXT ·Store(SB), NOSPLIT, $0-12
MOVV ptr+0(FP), R4
MOVW val+8(FP), R5
MOVBU internalcpu·Loong64+const_offsetLoong64HasDBAR_HINTS(SB), R6
BEQ R6, _variant_
// StoreRelease barrier
DBAR $0x12
MOVW R5, 0(R4)
DBAR $0x18
RET
_variant_:
AMSWAPDBW R5, (R4), R0
RET
TEXT ·Store64(SB), NOSPLIT, $0-16
MOVV ptr+0(FP), R4
MOVV val+8(FP), R5
MOVBU internalcpu·Loong64+const_offsetLoong64HasDBAR_HINTS(SB), R6
BEQ R6, _variant_
// StoreRelease barrier
DBAR $0x12
MOVV R5, 0(R4)
DBAR $0x18
RET
_variant_:
AMSWAPDBV R5, (R4), R0
RET

View file

@ -37,9 +37,10 @@ var (
arm64HasATOMICS bool
loong64HasLAMCAS bool
loong64HasLAM_BH bool
loong64HasLSX bool
loong64HasLAMCAS bool
loong64HasLAM_BH bool
loong64HasDBAR_HINTS bool
loong64HasLSX bool
riscv64HasZbb bool
)

View file

@ -782,6 +782,7 @@ func cpuinit(env string) {
case "loong64":
loong64HasLAMCAS = cpu.Loong64.HasLAMCAS
loong64HasLAM_BH = cpu.Loong64.HasLAM_BH
loong64HasDBAR_HINTS = cpu.Loong64.HasDBAR_HINTS
loong64HasLSX = cpu.Loong64.HasLSX
case "riscv64":