cmd/compile: improve stp merging for non-sequential cases

The original algorithm merges each store with the first
mergeable store in the chain, but it misses some
cases. Additionally reordering the stores in the chain in increasing
order of memory access allows merging in these cases.

Fixes #71987

Below are the results of the sweet benchmarks and
the difference in the sizes of the .text sections.

                        │ old.results │            new.results             │
                        │   sec/op    │   sec/op     vs base               │
BleveIndexBatch100-4      7.614 ± 2%    7.548 ± 1%       ~ (p=0.190 n=10)
ESBuildThreeJS-4         821.3m ± 0%   819.0m ± 1%       ~ (p=0.165 n=10)
ESBuildRomeTS-4          206.2m ± 1%   204.4m ± 1%  -0.90% (p=0.023 n=10)
EtcdPut-4                64.89m ± 1%   64.94m ± 2%       ~ (p=0.684 n=10)
EtcdSTM-4                318.4m ± 0%   319.2m ± 1%       ~ (p=0.631 n=10)
GoBuildKubelet-4          157.4 ± 0%    157.6 ± 0%       ~ (p=0.105 n=10)
GoBuildKubeletLink-4      12.42 ± 2%    12.41 ± 1%       ~ (p=0.529 n=10)
GoBuildIstioctl-4         124.4 ± 0%    124.4 ± 0%       ~ (p=0.579 n=10)
GoBuildIstioctlLink-4     8.700 ± 1%    8.693 ± 1%       ~ (p=0.912 n=10)
GoBuildFrontend-4         46.52 ± 0%    46.50 ± 0%       ~ (p=0.971 n=10)
GoBuildFrontendLink-4     2.282 ± 1%    2.272 ± 1%       ~ (p=0.529 n=10)
GoBuildTsgo-4             75.02 ± 1%    75.31 ± 1%       ~ (p=0.436 n=10)
GoBuildTsgoLink-4         1.229 ± 1%    1.219 ± 1%  -0.82% (p=0.035 n=10)
GopherLuaKNucleotide-4    34.77 ± 5%    34.31 ± 1%  -1.33% (p=0.015 n=10)
MarkdownRenderXHTML-4    286.6m ± 0%   285.7m ± 1%       ~ (p=0.315 n=10)
Tile38QueryLoad-4        657.2µ ± 1%   660.3µ ± 0%       ~ (p=0.436 n=10)
geomean                   2.570         2.563       -0.24%

Executable            Old .text  New .text     Change
-------------------------------------------------------
benchmark               6504820    6504020     -0.01%
bleve-index-bench       3903860    3903636     -0.01%
esbuild                 4801012    4801172     +0.00%
esbuild-bench           1256404    1256340     -0.01%
etcd                    9188148    9187076     -0.01%
etcd-bench              6462228    6461524     -0.01%
go                      5924468    5923892     -0.01%
go-build-bench          1282004    1281940     -0.00%
gopher-lua-bench        1639540    1639348     -0.01%
markdown-bench          1478452    1478356     -0.01%
tile38-bench            2753524    2753300     -0.01%
tile38-server          10241380   10240068     -0.01%

Change-Id: Ieb4fdfd656aca458f65fc45938de70550632bd13
Reviewed-on: https://go-review.googlesource.com/c/go/+/698097
Reviewed-by: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Auto-Submit: Keith Randall <khr@golang.org>
Reviewed-by: Mark Freeman <markfreeman@google.com>
Reviewed-by: Keith Randall <khr@google.com>
This commit is contained in:
Melnikov Denis 2025-08-21 18:00:57 +03:00 committed by Gopher Robot
parent bdd51e7855
commit 4c63d798cb
2 changed files with 133 additions and 6 deletions

View file

@ -212,6 +212,12 @@ func pairStores(f *Func) {
last := f.Cache.allocBoolSlice(f.NumValues())
defer f.Cache.freeBoolSlice(last)
// stChainElem is one entry of a block's store chain: a value
// together with the position it originally had in that chain.
type stChainElem struct {
v *Value // Value at this position of the chain
i int // Index in chain (0 == last store)
}
var order []stChainElem
// prevStore returns the previous store in the
// same block, or nil if there are none.
prevStore := func(v *Value) *Value {
@ -225,6 +231,27 @@ func pairStores(f *Func) {
return m
}
// storeWidth returns the width of store,
// or 0 if it is not a store
storeWidth := func(op Op) int64 {
var width int64
switch op {
case OpARM64MOVDstore, OpARM64FMOVDstore:
width = 8
case OpARM64MOVWstore, OpARM64FMOVSstore:
width = 4
case OpARM64MOVHstore:
width = 2
case OpARM64MOVBstore:
width = 1
default:
width = 0
}
return width
}
const limit = 10
for _, b := range f.Blocks {
// Find last store in block, so we can
// walk the stores last to first.
@ -250,9 +277,84 @@ func pairStores(f *Func) {
}
}
order = order[:0]
for i, v := 0, lastMem; v != nil; v = prevStore(v) {
order = append(order, stChainElem{v, i})
i++
}
reordering:
for i, v_elem := range order {
v := v_elem.v
if v.Uses != 1 {
// We can't reorder stores if the earlier
// store has any use besides the next one
// in the store chain.
// (Unless we could check the aliasing of
// all those other uses.)
continue
}
widthV := storeWidth(v.Op)
if widthV == 0 {
// Can't reorder with any other memory operations.
// (atomics, calls, ...)
continue
}
chain := order[i+1:]
count := limit
// Var 'count' keeps us in O(n) territory
for j, w_elem := range chain {
if count--; count == 0 {
// Only look back so far.
// This keeps us in O(n) territory, and it
// also prevents us from keeping values
// in registers for too long (and thus
// needing to spill them).
continue reordering
}
w := w_elem.v
if w.Uses != 1 {
// We can't reorder stores if the earlier
// store has any use besides the next one
// in the store chain.
// (Unless we could check the aliasing of
// all those other uses.)
continue reordering
}
widthW := storeWidth(w.Op)
if widthW == 0 {
// Can't reorder with any other memory operations.
// (atomics, calls, ...)
continue reordering
}
// We only allow reordering with respect to other
// writes to the same pointer and aux, so we can
// compute the exact aliasing relationship.
if w.Args[0] != v.Args[0] ||
w.Aux != v.Aux {
// Can't reorder with operation with incomparable destination memory pointer.
continue reordering
}
if overlap(w.AuxInt, widthW, v.AuxInt, widthV) {
// Aliases with the same slot with v's location.
continue reordering
}
// Reordering stores in increasing order of memory access
if v.AuxInt < w.AuxInt {
order[i], order[i+j+1] = order[i+j+1], order[i]
v = w
widthV = widthW
}
}
}
// Check all stores, from last to first.
memCheck:
for v := lastMem; v != nil; v = prevStore(v) {
for i, v_elem := range order {
v := v_elem.v
info := pairableStores[v.Op]
if info.width == 0 {
continue // Not pairable.
@ -269,8 +371,10 @@ func pairStores(f *Func) {
// Look for earlier store we can combine with.
lowerOk := true
higherOk := true
count := 10 // max lookback distance
for w := prevStore(v); w != nil; w = prevStore(w) {
count := limit // max lookback distance
chain := order[i+1:]
for _, w_elem := range chain {
w := w_elem.v
if w.Uses != 1 {
// We can't combine stores if the earlier
// store has any use besides the next one
@ -293,11 +397,17 @@ func pairStores(f *Func) {
args[1], args[2] = args[2], args[1]
off -= info.width
}
v.reset(info.pair)
v.AddArgs(args...)
v.Aux = aux
v.AuxInt = off
v.Pos = w.Pos // take position of earlier of the two stores (TODO: not really working?)
// Take position of earlier of the two stores
if v_elem.i < w_elem.i {
v.Pos = w.Pos
} else {
w.Pos = v.Pos
}
// Make w just a memory copy.
wmem := w.MemoryArg()

View file

@ -1053,17 +1053,34 @@ func dwstoreF32(p *struct{ a, b float32 }, x, y float32) {
}
// dwstoreBig assigns six int64 fields of p in the order c, f, a, e, d, b,
// i.e. not in address order. The arm64 comments are the STP patterns
// checked against the generated code.
// NOTE(review): the "not perfect" remark and the 24(R..) pattern below do
// not fit together with the 16(R..)/32(R..) patterns; this looks like old
// and new diff lines shown side by side — confirm against the real file.
func dwstoreBig(p *struct{ a, b, c, d, e, f int64 }, a, b, c, d, e, f int64) {
// This is not perfect. We merge b+a, then d+e, then c and f have no pair.
// arm64:`STP\s\(R[0-9]+, R[0-9]+\), 16\(R[0-9]+\)`
p.c = c
// arm64:`STP\s\(R[0-9]+, R[0-9]+\), 32\(R[0-9]+\)`
p.f = f
// arm64:`STP\s\(R[0-9]+, R[0-9]+\), \(R[0-9]+\)`
p.a = a
// arm64:`STP\s\(R[0-9]+, R[0-9]+\), 24\(R[0-9]+\)`
p.e = e
p.d = d
p.b = b
}
// dwstoreUnorderedArray writes the upper pair of fields (c, d) before the
// lower pair (a, b). The arm64 patterns expect one STP at offset 16 and
// one STP at offset 0.
func dwstoreUnorderedArray(p *struct{ a, b, c, d int }, a, b, c, d int) {
// arm64:`STP\s\(R[0-9]+, R[0-9]+\), 16\(R[0-9]+\)`
p.c = c
p.d = d
// arm64:`STP\s\(R[0-9]+, R[0-9]+\), \(R[0-9]+\)`
p.a = a
p.b = b
}
// dwstoreBigNil zeroes the two three-word sub-structs of p, j first and
// then i. The arm64 patterns expect STPs of the zero register at
// offsets 32, 16 and 0.
func dwstoreBigNil(p *struct{ i, j struct{ a, b, c int } }) {
// arm64:`STP\s\(ZR, ZR\), 32\(R[0-9]+\)`
// arm64:`STP\s\(ZR, ZR\), 16\(R[0-9]+\)`
p.j = struct{ a, b, c int }{}
// arm64:`STP\s\(ZR, ZR\), \(R[0-9]+\)`
p.i = struct{ a, b, c int }{}
}
func dwstoreRet() [2]int {
// arm64:"STP\t"
return [2]int{5, 6}