cmd/compile,internal/bytealg: add MemEq intrinsic for runtime.memequal

Introduce a new MemEq SSA operation for runtime.memequal, initially
implemented for arm64. The change adds generic opt rules that operate
on MemEq (following the existing rules for calls to runtime.memequal)
and a lowered op, LoweredMemEq, which targets (arm64 as well as others)
may custom-lower for specific constant sizes in the future. The new
MemEq operation has no memory result, which allows CSE of load
operations around it.
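
For illustration, a minimal hypothetical sketch (not taken from the Go
tree) of the kind of code affected, mirroring the new codegen test
below: comparing 9-byte strings compiles to a call to runtime.memequal,
which the intrinsic now builds as a MemEq value.

	// Hypothetical example for illustration only.
	func equalPrefix(a, b string) bool {
		x := a[:9] == b[:9]
		// The operand-swapped comparison can be eliminated by CSE,
		// since MemEq is commutative and carries no memory result.
		y := b[:9] == a[:9]
		return x && y
	}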

Code size difference (linux/arm64):

Executable            Old .text  New .text     Change
-------------------------------------------------------
asm                     1970420    1969668     -0.04%
cgo                     1741220    1740212     -0.06%
compile                 8956756    8959428     +0.03%
cover                   1879332    1878772     -0.03%
link                    2574116    2572660     -0.06%
preprofile               867124     866820     -0.04%
vet                     2890404    2888596     -0.06%

Change-Id: I6ab507929b861884d17d5818cfbd152cf7879751
Reviewed-on: https://go-review.googlesource.com/c/go/+/686655
Reviewed-by: Keith Randall <khr@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Auto-Submit: Keith Randall <khr@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
Alexander Musman 2025-07-05 23:16:36 +03:00 committed by Gopher Robot
parent 4976606a2f
commit dda7c8253d
15 changed files with 345 additions and 4 deletions

View file

@ -1322,6 +1322,11 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
p.To.Name = obj.NAME_EXTERN
// AuxInt encodes how many buffer entries we need.
p.To.Sym = ir.Syms.GCWriteBarrier[v.AuxInt-1]
case ssa.OpARM64LoweredMemEq:
p := s.Prog(obj.ACALL)
p.To.Type = obj.TYPE_MEM
p.To.Name = obj.NAME_EXTERN
p.To.Sym = ir.Syms.Memequal
case ssa.OpARM64LoweredPanicBoundsRR, ssa.OpARM64LoweredPanicBoundsRC, ssa.OpARM64LoweredPanicBoundsCR, ssa.OpARM64LoweredPanicBoundsCC:
// Compute the constant we put in the PCData entry for this call.

View file

@ -40,6 +40,7 @@ type symsStruct struct {
MallocGCSmallScanNoHeader [27]*obj.LSym
MallocGCTiny [16]*obj.LSym
Memmove *obj.LSym
Memequal *obj.LSym
Msanread *obj.LSym
Msanwrite *obj.LSym
Msanmove *obj.LSym

View file

@ -481,6 +481,7 @@
(GetClosurePtr ...) => (LoweredGetClosurePtr ...)
(GetCallerSP ...) => (LoweredGetCallerSP ...)
(GetCallerPC ...) => (LoweredGetCallerPC ...)
(MemEq ...) => (LoweredMemEq ...)
// Absorb pseudo-ops into blocks.
(If (Equal cc) yes no) => (EQ cc yes no)

View file

@ -535,6 +535,7 @@ func init() {
// pseudo-ops
{name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gpg}}, nilCheck: true, faultOnNilArg0: true}, // panic if arg0 is nil. arg1=mem.
{name: "LoweredMemEq", argLength: 4, reg: regInfo{inputs: []regMask{buildReg("R0"), buildReg("R1"), buildReg("R2")}, outputs: []regMask{buildReg("R0")}, clobbers: callerSave}, typ: "Bool", faultOnNilArg0: true, faultOnNilArg1: true, clobberFlags: true, call: true}, // arg0, arg1 - pointers to memory, arg2=size, arg3=mem.
{name: "Equal", argLength: 1, reg: readflags}, // bool, true flags encode x==y false otherwise.
{name: "NotEqual", argLength: 1, reg: readflags}, // bool, true flags encode x!=y false otherwise.

View file

@ -1525,6 +1525,41 @@
&& isSamePtr(p, q)
=> (MakeResult (ConstBool <typ.Bool> [true]) mem)
(MemEq sptr tptr (Const64 [1]) mem)
=> (Eq8 (Load <typ.Int8> sptr mem) (Load <typ.Int8> tptr mem))
(Load <typ.Int8> sptr:(Addr {scon} (SB)) mem)
&& symIsRO(scon)
=> (Const8 <typ.Int8> [int8(read8(scon,0))])
(MemEq sptr tptr (Const64 [2]) mem)
&& canLoadUnaligned(config)
=> (Eq16 (Load <typ.Int16> sptr mem) (Load <typ.Int16> tptr mem))
(Load <typ.Int16> sptr:(Addr {scon} (SB)) mem)
&& symIsRO(scon)
=> (Const16 <typ.Int16> [int16(read16(scon,0,config.ctxt.Arch.ByteOrder))])
(MemEq sptr tptr (Const64 [4]) mem)
&& canLoadUnaligned(config)
=> (Eq32 (Load <typ.Int32> sptr mem) (Load <typ.Int32> tptr mem))
(Load <typ.Int32> sptr:(Addr {scon} (SB)) mem)
&& symIsRO(scon)
=> (Const32 <typ.Int32> [int32(read32(scon,0,config.ctxt.Arch.ByteOrder))])
(MemEq sptr tptr (Const64 [8]) mem)
&& canLoadUnaligned(config) && config.PtrSize == 8
=> (Eq64 (Load <typ.Int64> sptr mem) (Load <typ.Int64> tptr mem))
(Load <typ.Int64> sptr:(Addr {scon} (SB)) mem)
&& symIsRO(scon)
=> (Const64 <typ.Int64> [int64(read64(scon,0,config.ctxt.Arch.ByteOrder))])
(MemEq _ _ (Const64 [0]) _) => (ConstBool <typ.Bool> [true])
(MemEq p q _ _) && isSamePtr(p, q) => (ConstBool <typ.Bool> [true])
// Turn known-size calls to memclrNoHeapPointers into a Zero.
// Note that we are using types.Types[types.TUINT8] instead of sptr.Type.Elem() - see issue 55122 and CL 431496 for more details.
(SelectN [0] call:(StaticCall {sym} sptr (Const(64|32) [c]) mem))

View file

@ -679,6 +679,9 @@ var genericOps = []opData{
{name: "PrefetchCache", argLength: 2, hasSideEffects: true}, // Do prefetch arg0 to cache. arg0=addr, arg1=memory.
{name: "PrefetchCacheStreamed", argLength: 2, hasSideEffects: true}, // Do non-temporal or streamed prefetch arg0 to cache. arg0=addr, arg1=memory.
// Helper instruction which is semantically equivalent to calling runtime.memequal, but some targets may prefer to custom lower it later, e.g. for specific constant sizes.
{name: "MemEq", argLength: 4, commutative: true, typ: "Bool"}, // arg0=ptr0, arg1=ptr1, arg2=size, arg3=memory.
// SIMD
{name: "ZeroSIMD", argLength: 0}, // zero value of a vector

View file

@ -4202,6 +4202,7 @@ const (
OpARM64CALLclosure
OpARM64CALLinter
OpARM64LoweredNilCheck
OpARM64LoweredMemEq
OpARM64Equal
OpARM64NotEqual
OpARM64LessThan
@ -5916,6 +5917,7 @@ const (
OpClobberReg
OpPrefetchCache
OpPrefetchCacheStreamed
OpMemEq
OpZeroSIMD
OpCvt16toMask8x16
OpCvt32toMask8x32
@ -65520,6 +65522,25 @@ var opcodeTable = [...]opInfo{
},
},
},
{
name: "LoweredMemEq",
argLen: 4,
clobberFlags: true,
call: true,
faultOnNilArg0: true,
faultOnNilArg1: true,
reg: regInfo{
inputs: []inputInfo{
{0, 1}, // R0
{1, 2}, // R1
{2, 4}, // R2
},
clobbers: 9223372035109945343, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30 F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
outputs: []outputInfo{
{0, 1}, // R0
},
},
},
{
name: "Equal",
argLen: 1,
@ -85517,6 +85538,12 @@ var opcodeTable = [...]opInfo{
hasSideEffects: true,
generic: true,
},
{
name: "MemEq",
argLen: 4,
commutative: true,
generic: true,
},
{
name: "ZeroSIMD",
argLen: 0,

View file

@ -897,7 +897,15 @@ func (s *regAllocState) dropIfUnused(v *Value) {
}
vi := &s.values[v.ID]
r := vi.uses
-if r == nil || (!opcodeTable[v.Op].fixedReg && r.dist > s.nextCall[s.curIdx]) {
+nextCall := s.nextCall[s.curIdx]
+if opcodeTable[v.Op].call {
+if s.curIdx == len(s.nextCall)-1 {
+nextCall = math.MaxInt32
+} else {
+nextCall = s.nextCall[s.curIdx+1]
+}
+}
+if r == nil || (!opcodeTable[v.Op].fixedReg && r.dist > nextCall) {
s.freeRegs(vi.regs)
}
}
@ -1036,8 +1044,11 @@ func (s *regAllocState) regalloc(f *Func) {
regValLiveSet.add(v.ID)
}
}
-if len(s.nextCall) < len(b.Values) {
-s.nextCall = append(s.nextCall, make([]int32, len(b.Values)-len(s.nextCall))...)
+if cap(s.nextCall) < len(b.Values) {
+c := cap(s.nextCall)
+s.nextCall = append(s.nextCall[:c], make([]int32, len(b.Values)-c)...)
+} else {
+s.nextCall = s.nextCall[:len(b.Values)]
}
var nextCall int32 = math.MaxInt32
for i := len(b.Values) - 1; i >= 0; i-- {

View file

@ -840,6 +840,9 @@ func rewriteValueARM64(v *Value) bool {
case OpMax64F:
v.Op = OpARM64FMAXD
return true
case OpMemEq:
v.Op = OpARM64LoweredMemEq
return true
case OpMin32F:
v.Op = OpARM64FMINS
return true

View file

@ -224,6 +224,8 @@ func rewriteValuegeneric(v *Value) bool {
return rewriteValuegeneric_OpLsh8x64(v)
case OpLsh8x8:
return rewriteValuegeneric_OpLsh8x8(v)
case OpMemEq:
return rewriteValuegeneric_OpMemEq(v)
case OpMod16:
return rewriteValuegeneric_OpMod16(v)
case OpMod16u:
@ -11869,6 +11871,8 @@ func rewriteValuegeneric_OpLoad(v *Value) bool {
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
config := b.Func.Config
typ := &b.Func.Config.Types
// match: (Load <t1> p1 (Store {t2} p2 x _))
// cond: isSamePtr(p1, p2) && copyCompatibleType(t1, x.Type) && t1.Size() == t2.Size()
// result: x
@ -12453,6 +12457,102 @@ func rewriteValuegeneric_OpLoad(v *Value) bool {
v.AddArg(v0)
return true
}
// match: (Load <typ.Int8> sptr:(Addr {scon} (SB)) mem)
// cond: symIsRO(scon)
// result: (Const8 <typ.Int8> [int8(read8(scon,0))])
for {
if v.Type != typ.Int8 {
break
}
sptr := v_0
if sptr.Op != OpAddr {
break
}
scon := auxToSym(sptr.Aux)
sptr_0 := sptr.Args[0]
if sptr_0.Op != OpSB {
break
}
if !(symIsRO(scon)) {
break
}
v.reset(OpConst8)
v.Type = typ.Int8
v.AuxInt = int8ToAuxInt(int8(read8(scon, 0)))
return true
}
// match: (Load <typ.Int16> sptr:(Addr {scon} (SB)) mem)
// cond: symIsRO(scon)
// result: (Const16 <typ.Int16> [int16(read16(scon,0,config.ctxt.Arch.ByteOrder))])
for {
if v.Type != typ.Int16 {
break
}
sptr := v_0
if sptr.Op != OpAddr {
break
}
scon := auxToSym(sptr.Aux)
sptr_0 := sptr.Args[0]
if sptr_0.Op != OpSB {
break
}
if !(symIsRO(scon)) {
break
}
v.reset(OpConst16)
v.Type = typ.Int16
v.AuxInt = int16ToAuxInt(int16(read16(scon, 0, config.ctxt.Arch.ByteOrder)))
return true
}
// match: (Load <typ.Int32> sptr:(Addr {scon} (SB)) mem)
// cond: symIsRO(scon)
// result: (Const32 <typ.Int32> [int32(read32(scon,0,config.ctxt.Arch.ByteOrder))])
for {
if v.Type != typ.Int32 {
break
}
sptr := v_0
if sptr.Op != OpAddr {
break
}
scon := auxToSym(sptr.Aux)
sptr_0 := sptr.Args[0]
if sptr_0.Op != OpSB {
break
}
if !(symIsRO(scon)) {
break
}
v.reset(OpConst32)
v.Type = typ.Int32
v.AuxInt = int32ToAuxInt(int32(read32(scon, 0, config.ctxt.Arch.ByteOrder)))
return true
}
// match: (Load <typ.Int64> sptr:(Addr {scon} (SB)) mem)
// cond: symIsRO(scon)
// result: (Const64 <typ.Int64> [int64(read64(scon,0,config.ctxt.Arch.ByteOrder))])
for {
if v.Type != typ.Int64 {
break
}
sptr := v_0
if sptr.Op != OpAddr {
break
}
scon := auxToSym(sptr.Aux)
sptr_0 := sptr.Args[0]
if sptr_0.Op != OpSB {
break
}
if !(symIsRO(scon)) {
break
}
v.reset(OpConst64)
v.Type = typ.Int64
v.AuxInt = int64ToAuxInt(int64(read64(scon, 0, config.ctxt.Arch.ByteOrder)))
return true
}
// match: (Load (Addr {s} sb) _)
// cond: isFixedLoad(v, s, 0)
// result: rewriteFixedLoad(v, s, sb, 0)
@ -14767,6 +14867,124 @@ func rewriteValuegeneric_OpLsh8x8(v *Value) bool {
}
return false
}
func rewriteValuegeneric_OpMemEq(v *Value) bool {
v_3 := v.Args[3]
v_2 := v.Args[2]
v_1 := v.Args[1]
v_0 := v.Args[0]
b := v.Block
config := b.Func.Config
typ := &b.Func.Config.Types
// match: (MemEq sptr tptr (Const64 [1]) mem)
// result: (Eq8 (Load <typ.Int8> sptr mem) (Load <typ.Int8> tptr mem))
for {
sptr := v_0
tptr := v_1
if v_2.Op != OpConst64 || auxIntToInt64(v_2.AuxInt) != 1 {
break
}
mem := v_3
v.reset(OpEq8)
v0 := b.NewValue0(v.Pos, OpLoad, typ.Int8)
v0.AddArg2(sptr, mem)
v1 := b.NewValue0(v.Pos, OpLoad, typ.Int8)
v1.AddArg2(tptr, mem)
v.AddArg2(v0, v1)
return true
}
// match: (MemEq sptr tptr (Const64 [2]) mem)
// cond: canLoadUnaligned(config)
// result: (Eq16 (Load <typ.Int16> sptr mem) (Load <typ.Int16> tptr mem))
for {
sptr := v_0
tptr := v_1
if v_2.Op != OpConst64 || auxIntToInt64(v_2.AuxInt) != 2 {
break
}
mem := v_3
if !(canLoadUnaligned(config)) {
break
}
v.reset(OpEq16)
v0 := b.NewValue0(v.Pos, OpLoad, typ.Int16)
v0.AddArg2(sptr, mem)
v1 := b.NewValue0(v.Pos, OpLoad, typ.Int16)
v1.AddArg2(tptr, mem)
v.AddArg2(v0, v1)
return true
}
// match: (MemEq sptr tptr (Const64 [4]) mem)
// cond: canLoadUnaligned(config)
// result: (Eq32 (Load <typ.Int32> sptr mem) (Load <typ.Int32> tptr mem))
for {
sptr := v_0
tptr := v_1
if v_2.Op != OpConst64 || auxIntToInt64(v_2.AuxInt) != 4 {
break
}
mem := v_3
if !(canLoadUnaligned(config)) {
break
}
v.reset(OpEq32)
v0 := b.NewValue0(v.Pos, OpLoad, typ.Int32)
v0.AddArg2(sptr, mem)
v1 := b.NewValue0(v.Pos, OpLoad, typ.Int32)
v1.AddArg2(tptr, mem)
v.AddArg2(v0, v1)
return true
}
// match: (MemEq sptr tptr (Const64 [8]) mem)
// cond: canLoadUnaligned(config) && config.PtrSize == 8
// result: (Eq64 (Load <typ.Int64> sptr mem) (Load <typ.Int64> tptr mem))
for {
sptr := v_0
tptr := v_1
if v_2.Op != OpConst64 || auxIntToInt64(v_2.AuxInt) != 8 {
break
}
mem := v_3
if !(canLoadUnaligned(config) && config.PtrSize == 8) {
break
}
v.reset(OpEq64)
v0 := b.NewValue0(v.Pos, OpLoad, typ.Int64)
v0.AddArg2(sptr, mem)
v1 := b.NewValue0(v.Pos, OpLoad, typ.Int64)
v1.AddArg2(tptr, mem)
v.AddArg2(v0, v1)
return true
}
// match: (MemEq _ _ (Const64 [0]) _)
// result: (ConstBool <typ.Bool> [true])
for {
if v_2.Op != OpConst64 || auxIntToInt64(v_2.AuxInt) != 0 {
break
}
v.reset(OpConstBool)
v.Type = typ.Bool
v.AuxInt = boolToAuxInt(true)
return true
}
// match: (MemEq p q _ _)
// cond: isSamePtr(p, q)
// result: (ConstBool <typ.Bool> [true])
for {
for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
p := v_0
q := v_1
if !(isSamePtr(p, q)) {
continue
}
v.reset(OpConstBool)
v.Type = typ.Bool
v.AuxInt = boolToAuxInt(true)
return true
}
break
}
return false
}
func rewriteValuegeneric_OpMod16(v *Value) bool {
v_1 := v.Args[1]
v_0 := v.Args[0]

View file

@ -196,6 +196,12 @@ func initIntrinsics(cfg *intrinsicBuildConfig) {
},
sys.AMD64, sys.I386, sys.ARM64, sys.ARM, sys.Loong64, sys.S390X)
addF("runtime", "memequal",
func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value {
return s.newValue4(ssa.OpMemEq, s.f.Config.Types.Bool, args[0], args[1], args[2], s.mem())
},
sys.ARM64)
if cfg.goppc64 >= 10 {
// Use only on Power10 as the new byte reverse instructions that Power10 provide
// make it worthwhile as an intrinsic

View file

@ -327,6 +327,7 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{
{"arm64", "math/bits", "TrailingZeros64"}: struct{}{},
{"arm64", "math/bits", "TrailingZeros8"}: struct{}{},
{"arm64", "runtime", "KeepAlive"}: struct{}{},
{"arm64", "runtime", "memequal"}: struct{}{},
{"arm64", "runtime", "publicationBarrier"}: struct{}{},
{"arm64", "runtime", "slicebytetostringtmp"}: struct{}{},
{"arm64", "sync", "runtime_LoadAcquintptr"}: struct{}{},

View file

@ -141,6 +141,7 @@ func InitConfig() {
}
ir.Syms.MallocGC = typecheck.LookupRuntimeFunc("mallocgc")
ir.Syms.Memmove = typecheck.LookupRuntimeFunc("memmove")
ir.Syms.Memequal = typecheck.LookupRuntimeFunc("memequal")
ir.Syms.Msanread = typecheck.LookupRuntimeFunc("msanread")
ir.Syms.Msanwrite = typecheck.LookupRuntimeFunc("msanwrite")
ir.Syms.Msanmove = typecheck.LookupRuntimeFunc("msanmove")

View file

@ -660,6 +660,17 @@ func equalVarString8(a string) bool {
return a[:8] == b
}
func equalVarStringNoSpill(a, b string) bool {
s := string("ZZZZZZZZZ")
// arm64:".*memequal"
memeq1 := a[:9] == s
// arm64:-".*"
memeq2 := s == a[:9]
// arm64:-"MOVB\tR0,.*SP",".*memequal"
memeq3 := s == b[:9]
return memeq1 && memeq2 && memeq3
}
func cmpToCmn(a, b, c, d int) int {
var c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11 int
// arm64:`CMN`,-`CMP`

test/codegen/memcse.go (new file, 17 lines)
View file

@ -0,0 +1,17 @@
// asmcheck
// Copyright 2025 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Test common subexpression elimination of loads around other operations.
package codegen
func loadsAroundMemEqual(p *int, s1, s2 string) (int, bool) {
x := *p
eq := s1 == s2
y := *p
// arm64:"MOVD ZR, R0"
return x - y, eq
}