mirror of
https://github.com/golang/go.git
synced 2025-10-19 19:13:18 +00:00
cmd/compile: simplify zerorange on amd64
Get rid of duffzero and large zeroing cases. We only use this code for small things now. Change-Id: Idcf330d0ac6433448efa8e32be7eb7f988e10122 Reviewed-on: https://go-review.googlesource.com/c/go/+/678619 Reviewed-by: Jorropo <jorropo.pgm@gmail.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Michael Knyszek <mknyszek@google.com> Reviewed-by: Keith Randall <khr@google.com>
This commit is contained in:
parent
f8eae7a3c3
commit
b10eb1d042
4 changed files with 25 additions and 101 deletions
|
@ -5,113 +5,23 @@
|
||||||
package amd64
|
package amd64
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"cmd/compile/internal/ir"
|
|
||||||
"cmd/compile/internal/objw"
|
"cmd/compile/internal/objw"
|
||||||
"cmd/compile/internal/types"
|
|
||||||
"cmd/internal/obj"
|
"cmd/internal/obj"
|
||||||
"cmd/internal/obj/x86"
|
"cmd/internal/obj/x86"
|
||||||
)
|
)
|
||||||
|
|
||||||
// DUFFZERO consists of repeated blocks of 4 MOVUPSs + LEAQ,
|
|
||||||
// See runtime/mkduff.go.
|
|
||||||
const (
|
|
||||||
dzBlocks = 16 // number of MOV/ADD blocks
|
|
||||||
dzBlockLen = 4 // number of clears per block
|
|
||||||
dzBlockSize = 23 // size of instructions in a single block
|
|
||||||
dzMovSize = 5 // size of single MOV instruction w/ offset
|
|
||||||
dzLeaqSize = 4 // size of single LEAQ instruction
|
|
||||||
dzClearStep = 16 // number of bytes cleared by each MOV instruction
|
|
||||||
|
|
||||||
dzClearLen = dzClearStep * dzBlockLen // bytes cleared by one block
|
|
||||||
dzSize = dzBlocks * dzBlockSize
|
|
||||||
)
|
|
||||||
|
|
||||||
// dzOff returns the offset for a jump into DUFFZERO.
|
|
||||||
// b is the number of bytes to zero.
|
|
||||||
func dzOff(b int64) int64 {
|
|
||||||
off := int64(dzSize)
|
|
||||||
off -= b / dzClearLen * dzBlockSize
|
|
||||||
tailLen := b % dzClearLen
|
|
||||||
if tailLen >= dzClearStep {
|
|
||||||
off -= dzLeaqSize + dzMovSize*(tailLen/dzClearStep)
|
|
||||||
}
|
|
||||||
return off
|
|
||||||
}
|
|
||||||
|
|
||||||
// dzDI returns the pre-adjustment to DI for a call to DUFFZERO.
|
|
||||||
// b is the number of bytes to zero.
|
|
||||||
func dzDI(b int64) int64 {
|
|
||||||
tailLen := b % dzClearLen
|
|
||||||
if tailLen < dzClearStep {
|
|
||||||
return 0
|
|
||||||
}
|
|
||||||
tailSteps := tailLen / dzClearStep
|
|
||||||
return -dzClearStep * (dzBlockLen - tailSteps)
|
|
||||||
}
|
|
||||||
|
|
||||||
func zerorange(pp *objw.Progs, p *obj.Prog, off, cnt int64, state *uint32) *obj.Prog {
|
func zerorange(pp *objw.Progs, p *obj.Prog, off, cnt int64, state *uint32) *obj.Prog {
|
||||||
const (
|
if cnt%8 != 0 {
|
||||||
r13 = 1 << iota // if R13 is already zeroed.
|
panic("zeroed region not aligned")
|
||||||
)
|
|
||||||
|
|
||||||
if cnt == 0 {
|
|
||||||
return p
|
|
||||||
}
|
}
|
||||||
|
for cnt >= 16 {
|
||||||
if cnt == 8 {
|
p = pp.Append(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_SP, off)
|
||||||
|
off += 16
|
||||||
|
cnt -= 16
|
||||||
|
}
|
||||||
|
if cnt != 0 {
|
||||||
p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_SP, off)
|
p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_SP, off)
|
||||||
} else if cnt <= int64(8*types.RegSize) {
|
|
||||||
for i := int64(0); i < cnt/16; i++ {
|
|
||||||
p = pp.Append(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_SP, off+i*16)
|
|
||||||
}
|
|
||||||
|
|
||||||
if cnt%16 != 0 {
|
|
||||||
p = pp.Append(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_SP, off+cnt-int64(16))
|
|
||||||
}
|
|
||||||
} else if cnt <= int64(128*types.RegSize) {
|
|
||||||
// Save DI to r12. With the amd64 Go register abi, DI can contain
|
|
||||||
// an incoming parameter, whereas R12 is always scratch.
|
|
||||||
p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_DI, 0, obj.TYPE_REG, x86.REG_R12, 0)
|
|
||||||
// Emit duffzero call
|
|
||||||
p = pp.Append(p, leaptr, obj.TYPE_MEM, x86.REG_SP, off+dzDI(cnt), obj.TYPE_REG, x86.REG_DI, 0)
|
|
||||||
p = pp.Append(p, obj.ADUFFZERO, obj.TYPE_NONE, 0, 0, obj.TYPE_ADDR, 0, dzOff(cnt))
|
|
||||||
p.To.Sym = ir.Syms.Duffzero
|
|
||||||
if cnt%16 != 0 {
|
|
||||||
p = pp.Append(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X15, 0, obj.TYPE_MEM, x86.REG_DI, -int64(8))
|
|
||||||
}
|
|
||||||
// Restore DI from r12
|
|
||||||
p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_R12, 0, obj.TYPE_REG, x86.REG_DI, 0)
|
|
||||||
|
|
||||||
} else {
|
|
||||||
// When the register ABI is in effect, at this point in the
|
|
||||||
// prolog we may have live values in all of RAX,RDI,RCX. Save
|
|
||||||
// them off to registers before the REPSTOSQ below, then
|
|
||||||
// restore. Note that R12 and R13 are always available as
|
|
||||||
// scratch regs; here we also use R15 (this is safe to do
|
|
||||||
// since there won't be any globals accessed in the prolog).
|
|
||||||
// See rewriteToUseGot() in obj6.go for more on r15 use.
|
|
||||||
|
|
||||||
// Save rax/rdi/rcx
|
|
||||||
p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_DI, 0, obj.TYPE_REG, x86.REG_R12, 0)
|
|
||||||
p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_AX, 0, obj.TYPE_REG, x86.REG_R13, 0)
|
|
||||||
p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_CX, 0, obj.TYPE_REG, x86.REG_R15, 0)
|
|
||||||
|
|
||||||
// Set up the REPSTOSQ and kick it off.
|
|
||||||
p = pp.Append(p, x86.AXORL, obj.TYPE_REG, x86.REG_AX, 0, obj.TYPE_REG, x86.REG_AX, 0)
|
|
||||||
p = pp.Append(p, x86.AMOVQ, obj.TYPE_CONST, 0, cnt/int64(types.RegSize), obj.TYPE_REG, x86.REG_CX, 0)
|
|
||||||
p = pp.Append(p, leaptr, obj.TYPE_MEM, x86.REG_SP, off, obj.TYPE_REG, x86.REG_DI, 0)
|
|
||||||
p = pp.Append(p, x86.AREP, obj.TYPE_NONE, 0, 0, obj.TYPE_NONE, 0, 0)
|
|
||||||
p = pp.Append(p, x86.ASTOSQ, obj.TYPE_NONE, 0, 0, obj.TYPE_NONE, 0, 0)
|
|
||||||
|
|
||||||
// Restore rax/rdi/rcx
|
|
||||||
p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_R12, 0, obj.TYPE_REG, x86.REG_DI, 0)
|
|
||||||
p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_R13, 0, obj.TYPE_REG, x86.REG_AX, 0)
|
|
||||||
p = pp.Append(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_R15, 0, obj.TYPE_REG, x86.REG_CX, 0)
|
|
||||||
|
|
||||||
// Record the fact that r13 is no longer zero.
|
|
||||||
*state &= ^uint32(r13)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return p
|
return p
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -144,6 +144,15 @@ func memIdx(a *obj.Addr, v *ssa.Value) {
|
||||||
|
|
||||||
// DUFFZERO consists of repeated blocks of 4 MOVUPSs + LEAQ,
|
// DUFFZERO consists of repeated blocks of 4 MOVUPSs + LEAQ,
|
||||||
// See runtime/mkduff.go.
|
// See runtime/mkduff.go.
|
||||||
|
const (
|
||||||
|
dzBlocks = 16 // number of MOV/ADD blocks
|
||||||
|
dzBlockLen = 4 // number of clears per block
|
||||||
|
dzBlockSize = 23 // size of instructions in a single block
|
||||||
|
dzMovSize = 5 // size of single MOV instruction w/ offset
|
||||||
|
dzLeaqSize = 4 // size of single LEAQ instruction
|
||||||
|
dzClearStep = 16 // number of bytes cleared by each MOV instruction
|
||||||
|
)
|
||||||
|
|
||||||
func duffStart(size int64) int64 {
|
func duffStart(size int64) int64 {
|
||||||
x, _ := duff(size)
|
x, _ := duff(size)
|
||||||
return x
|
return x
|
||||||
|
|
|
@ -769,7 +769,7 @@ func (lv *Liveness) epilogue() {
|
||||||
// its stack copy is not live.
|
// its stack copy is not live.
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
// Note: zeroing is handled by zeroResults in walk.go.
|
// Note: zeroing is handled by zeroResults in ../ssagen/ssa.go.
|
||||||
livedefer.Set(int32(i))
|
livedefer.Set(int32(i))
|
||||||
}
|
}
|
||||||
if n.IsOutputParamHeapAddr() {
|
if n.IsOutputParamHeapAddr() {
|
||||||
|
|
|
@ -25,8 +25,13 @@ type ArchInfo struct {
|
||||||
|
|
||||||
PadFrame func(int64) int64
|
PadFrame func(int64) int64
|
||||||
|
|
||||||
// ZeroRange zeroes a range of memory on stack. It is only inserted
|
// ZeroRange zeroes a range of memory on the stack.
|
||||||
// at function entry, and it is ok to clobber registers.
|
// - it is only called at function entry
|
||||||
|
// - it is ok to clobber (non-arg) registers.
|
||||||
|
// - currently used only for small things, so it can be simple.
|
||||||
|
// - pointers to heap-allocated return values
|
||||||
|
// - open-coded deferred functions
|
||||||
|
// (Max size in make.bash is 40 bytes.)
|
||||||
ZeroRange func(*objw.Progs, *obj.Prog, int64, int64, *uint32) *obj.Prog
|
ZeroRange func(*objw.Progs, *obj.Prog, int64, int64, *uint32) *obj.Prog
|
||||||
|
|
||||||
Ginsnop func(*objw.Progs) *obj.Prog
|
Ginsnop func(*objw.Progs) *obj.Prog
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue