mirror of
https://github.com/golang/go.git
synced 2025-10-19 11:03:18 +00:00
runtime: support arm64 Neon in async preemption
This is a port of CL 669195 adjusted to save arm64 Neon registers off stack.

Change-Id: Ia014778a8c9f0c1d05977b04184f51e791ae8495
Reviewed-on: https://go-review.googlesource.com/c/go/+/695916
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Mark Freeman <markfreeman@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
This commit is contained in:
parent
5368e77429
commit
955a5a0dc5
5 changed files with 239 additions and 79 deletions
|
@ -163,19 +163,21 @@ package runtime
|
|||
type xRegs struct {
|
||||
`)
|
||||
pos := 0
|
||||
for _, reg := range l.regs {
|
||||
if reg.pos != pos {
|
||||
log.Fatalf("padding not implemented")
|
||||
for _, seq := range l.regs {
|
||||
for _, r := range seq.regs {
|
||||
if r.pos != pos && !seq.fixedOffset {
|
||||
log.Fatalf("padding not implemented")
|
||||
}
|
||||
typ := fmt.Sprintf("[%d]byte", r.size)
|
||||
switch {
|
||||
case r.size == 4 && r.pos%4 == 0:
|
||||
typ = "uint32"
|
||||
case r.size == 8 && r.pos%8 == 0:
|
||||
typ = "uint64"
|
||||
}
|
||||
fmt.Fprintf(g.w, "\t%s %s\n", r.name, typ)
|
||||
pos += r.size
|
||||
}
|
||||
typ := fmt.Sprintf("[%d]byte", reg.size)
|
||||
switch {
|
||||
case reg.size == 4 && reg.pos%4 == 0:
|
||||
typ = "uint32"
|
||||
case reg.size == 8 && reg.pos%8 == 0:
|
||||
typ = "uint64"
|
||||
}
|
||||
fmt.Fprintf(g.w, "\t%s %s\n", reg.reg, typ)
|
||||
pos += reg.size
|
||||
}
|
||||
fmt.Fprintf(g.w, "}\n")
|
||||
|
||||
|
@ -191,16 +193,61 @@ type xRegs struct {
|
|||
|
||||
type layout struct {
|
||||
stack int
|
||||
regs []regPos
|
||||
regs []regSeq
|
||||
sp string // stack pointer register
|
||||
}
|
||||
|
||||
type regPos struct {
|
||||
pos, size int
|
||||
type regInfo struct {
|
||||
size int // register size in bytes
|
||||
name string // register name
|
||||
|
||||
// Some register names may require a specific suffix.
|
||||
// In ARM64, a suffix called an "arrangement specifier" can be added to
|
||||
// a register name. For example:
|
||||
//
|
||||
// V0.B16
|
||||
//
|
||||
// In this case, "V0" is the register name, and ".B16" is the suffix.
|
||||
suffix string
|
||||
|
||||
pos int // position on stack
|
||||
}
|
||||
|
||||
// Some save/restore operations can involve multiple registers in a single
|
||||
// instruction. For example, the LDP/STP instructions in ARM64:
|
||||
//
|
||||
// LDP 8(RSP), (R0, R1)
|
||||
// STP (R0, R1), 8(RSP)
|
||||
//
|
||||
// In these cases, a pair of registers (R0, R1) is used as a single argument.
|
||||
type regSeq struct {
|
||||
saveOp string
|
||||
restoreOp string
|
||||
reg string
|
||||
regs []regInfo
|
||||
|
||||
// By default, all registers are saved on the stack, and the stack pointer offset
|
||||
// is calculated based on the size of each register. For example (ARM64):
|
||||
//
|
||||
// STP (R0, R1), 8(RSP)
|
||||
// STP (R2, R3), 24(RSP)
|
||||
//
|
||||
// However, automatic offset calculation may not always be desirable.
|
||||
// In some cases, the offset must remain fixed:
|
||||
//
|
||||
// VST1.P [V0.B16, V1.B16, V2.B16, V3.B16], 64(R0)
|
||||
// VST1.P [V4.B16, V5.B16, V6.B16, V7.B16], 64(R0)
|
||||
//
|
||||
// In this example, R0 is post-incremented after each instruction,
|
||||
// so the offset should not be recalculated. For such cases,
|
||||
// `fixedOffset` is set to true.
|
||||
fixedOffset bool
|
||||
|
||||
// After conversion to a string, register names are separated by commas
|
||||
// and may be wrapped in a custom pair of brackets. For example (ARM64):
|
||||
//
|
||||
// (R0, R1) // wrapped in parentheses
|
||||
// [V0.B16, V1.B16, V2.B16, V3.B16] // wrapped in square brackets
|
||||
brackets [2]string
|
||||
|
||||
// If this register requires special save and restore, these
|
||||
// give those operations with a %d placeholder for the stack
|
||||
|
@ -208,40 +255,95 @@ type regPos struct {
|
|||
save, restore string
|
||||
}
|
||||
|
||||
func (l *layout) add(op, reg string, size int) {
|
||||
l.regs = append(l.regs, regPos{saveOp: op, restoreOp: op, reg: reg, pos: l.stack, size: size})
|
||||
func (l *layout) add(op, regname string, size int) {
|
||||
l.regs = append(l.regs, regSeq{saveOp: op, restoreOp: op, regs: []regInfo{{size, regname, "", l.stack}}})
|
||||
l.stack += size
|
||||
}
|
||||
|
||||
func (l *layout) add2(sop, rop, reg string, size int) {
|
||||
l.regs = append(l.regs, regPos{saveOp: sop, restoreOp: rop, reg: reg, pos: l.stack, size: size})
|
||||
l.stack += size
|
||||
func (l *layout) add2(sop, rop string, regs []regInfo, brackets [2]string, fixedOffset bool) {
|
||||
l.regs = append(l.regs, regSeq{saveOp: sop, restoreOp: rop, regs: regs, brackets: brackets, fixedOffset: fixedOffset})
|
||||
if !fixedOffset {
|
||||
for i := range regs {
|
||||
regs[i].pos = l.stack
|
||||
l.stack += regs[i].size
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (l *layout) addSpecial(save, restore string, size int) {
|
||||
l.regs = append(l.regs, regPos{save: save, restore: restore, pos: l.stack, size: size})
|
||||
l.regs = append(l.regs, regSeq{save: save, restore: restore, regs: []regInfo{{size, "", "", l.stack}}})
|
||||
l.stack += size
|
||||
}
|
||||
|
||||
func (rs *regSeq) String() string {
|
||||
switch len(rs.regs) {
|
||||
case 0:
|
||||
log.Fatal("Register sequence must not be empty!")
|
||||
case 1:
|
||||
return rs.regs[0].name
|
||||
default:
|
||||
names := make([]string, 0)
|
||||
for _, r := range rs.regs {
|
||||
name := r.name + r.suffix
|
||||
names = append(names, name)
|
||||
}
|
||||
return rs.brackets[0] + strings.Join(names, ", ") + rs.brackets[1]
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func (l *layout) save(g *gen) {
|
||||
for _, reg := range l.regs {
|
||||
if reg.save != "" {
|
||||
g.p(reg.save, reg.pos)
|
||||
for _, seq := range l.regs {
|
||||
if len(seq.regs) < 1 {
|
||||
log.Fatal("Register sequence must not be empty!")
|
||||
}
|
||||
// When dealing with a sequence of registers, we assume that only the position
|
||||
// of the first register is relevant. For example:
|
||||
//
|
||||
// STP (R0, R1), 8(RSP)
|
||||
// STP (R2, R3), 24(RSP)
|
||||
//
|
||||
// Here, R0.pos is 8. While we can infer that R1.pos is 16, it doesn't need to
|
||||
// be explicitly specified, as the STP instruction calculates it automatically.
|
||||
pos := seq.regs[0].pos
|
||||
if seq.save != "" {
|
||||
g.p(seq.save, pos)
|
||||
} else {
|
||||
g.p("%s %s, %d(%s)", reg.saveOp, reg.reg, reg.pos, l.sp)
|
||||
name := seq.String()
|
||||
g.p("%s %s, %d(%s)", seq.saveOp, name, pos, l.sp)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (l *layout) restoreInOrder(g *gen, reverse bool) {
|
||||
var seq []regSeq
|
||||
if reverse {
|
||||
seq = make([]regSeq, 0)
|
||||
for i := len(l.regs) - 1; i >= 0; i-- {
|
||||
seq = append(seq, l.regs[i])
|
||||
}
|
||||
} else {
|
||||
seq = l.regs
|
||||
}
|
||||
for _, reg := range seq {
|
||||
if len(reg.regs) < 1 {
|
||||
log.Fatal("Register sequence must not be empty!")
|
||||
}
|
||||
pos := reg.regs[0].pos
|
||||
if reg.restore != "" {
|
||||
g.p(reg.restore, pos)
|
||||
} else {
|
||||
g.p("%s %d(%s), %s", reg.restoreOp, pos, l.sp, reg.String())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (l *layout) restore(g *gen) {
|
||||
for i := len(l.regs) - 1; i >= 0; i-- {
|
||||
reg := l.regs[i]
|
||||
if reg.restore != "" {
|
||||
g.p(reg.restore, reg.pos)
|
||||
} else {
|
||||
g.p("%s %d(%s), %s", reg.restoreOp, reg.pos, l.sp, reg.reg)
|
||||
}
|
||||
}
|
||||
l.restoreInOrder(g, true)
|
||||
}
|
||||
|
||||
// restoreDirect emits the restore instructions for l.regs in the same order
// in which the registers were added, by calling restoreInOrder with
// reverse set to false.
func (l *layout) restoreDirect(g *gen) {
|
||||
l.restoreInOrder(g, false)
|
||||
}
|
||||
|
||||
func gen386(g *gen) {
|
||||
|
@ -320,8 +422,11 @@ func genAMD64(g *gen) {
|
|||
// We don't have to do this, but it results in a nice Go type. If we split
|
||||
// this into multiple types, we probably should stop doing this.
|
||||
for i := range lXRegs.regs {
|
||||
lXRegs.regs[i].pos = lZRegs.regs[i].pos
|
||||
lYRegs.regs[i].pos = lZRegs.regs[i].pos
|
||||
for j := range lXRegs.regs[i].regs {
|
||||
lXRegs.regs[i].regs[j].pos = lZRegs.regs[i].regs[j].pos
|
||||
lYRegs.regs[i].regs[j].pos = lZRegs.regs[i].regs[j].pos
|
||||
}
|
||||
|
||||
}
|
||||
writeXRegs(g.goarch, &lZRegs)
|
||||
|
||||
|
@ -456,6 +561,7 @@ func genARM(g *gen) {
|
|||
}
|
||||
|
||||
func genARM64(g *gen) {
|
||||
const vReg = "R0" // *xRegState
|
||||
p := g.p
|
||||
// Add integer registers R0-R26
|
||||
// R27 (REGTMP), R28 (g), R29 (FP), R30 (LR), R31 (SP) are special
|
||||
|
@ -466,8 +572,11 @@ func genARM64(g *gen) {
|
|||
i--
|
||||
continue // R18 is not used, skip
|
||||
}
|
||||
reg := fmt.Sprintf("(R%d, R%d)", i, i+1)
|
||||
l.add2("STP", "LDP", reg, 16)
|
||||
regs := []regInfo{
|
||||
{name: fmt.Sprintf("R%d", i), size: 8},
|
||||
{name: fmt.Sprintf("R%d", i+1), size: 8},
|
||||
}
|
||||
l.add2("STP", "LDP", regs, [2]string{"(", ")"}, false)
|
||||
}
|
||||
// Add flag registers.
|
||||
l.addSpecial(
|
||||
|
@ -480,10 +589,17 @@ func genARM64(g *gen) {
|
|||
8)
|
||||
// TODO: FPCR? I don't think we'll change it, so no need to save.
|
||||
// Add floating point registers F0-F31.
|
||||
for i := 0; i < 31; i += 2 {
|
||||
reg := fmt.Sprintf("(F%d, F%d)", i, i+1)
|
||||
l.add2("FSTPD", "FLDPD", reg, 16)
|
||||
lVRegs := layout{sp: vReg} // Non-GP registers
|
||||
for i := 0; i < 31; i += 4 {
|
||||
regs := []regInfo{
|
||||
{name: fmt.Sprintf("V%d", i), suffix: ".B16", size: 16, pos: 64},
|
||||
{name: fmt.Sprintf("V%d", i+1), suffix: ".B16", size: 16, pos: 64},
|
||||
{name: fmt.Sprintf("V%d", i+2), suffix: ".B16", size: 16, pos: 64},
|
||||
{name: fmt.Sprintf("V%d", i+3), suffix: ".B16", size: 16, pos: 64},
|
||||
}
|
||||
lVRegs.add2("VST1.P", "VLD1.P", regs, [2]string{"[", "]"}, true)
|
||||
}
|
||||
writeXRegs(g.goarch, &lVRegs)
|
||||
if l.stack%16 != 0 {
|
||||
l.stack += 8 // SP needs 16-byte alignment
|
||||
}
|
||||
|
@ -500,8 +616,20 @@ func genARM64(g *gen) {
|
|||
p("MOVD R30, (RSP)")
|
||||
p("#endif")
|
||||
|
||||
p("// Save GPs")
|
||||
l.save(g)
|
||||
p("// Save extended register state to p.xRegs.scratch")
|
||||
p("MOVD g_m(g), %s", vReg)
|
||||
p("MOVD m_p(%s), %s", vReg, vReg)
|
||||
p("ADD $(p_xRegs+xRegPerP_scratch), %s, %s", vReg, vReg)
|
||||
lVRegs.save(g)
|
||||
p("CALL ·asyncPreempt2(SB)")
|
||||
p("// Restore non-GPs from *p.xRegs.cache")
|
||||
p("MOVD g_m(g), %s", vReg)
|
||||
p("MOVD m_p(%s), %s", vReg, vReg)
|
||||
p("MOVD (p_xRegs+xRegPerP_cache)(%s), %s", vReg, vReg)
|
||||
lVRegs.restoreDirect(g)
|
||||
p("// Restore GPs")
|
||||
l.restore(g)
|
||||
|
||||
p("MOVD %d(RSP), R30", l.stack) // sigctxt.pushCall has pushed LR (at interrupt) on stack, restore it
|
||||
|
|
38
src/runtime/preempt_arm64.go
Normal file
38
src/runtime/preempt_arm64.go
Normal file
|
@ -0,0 +1,38 @@
|
|||
// Code generated by mkpreempt.go; DO NOT EDIT.
|
||||
|
||||
package runtime
|
||||
|
||||
type xRegs struct {
|
||||
V0 [16]byte
|
||||
V1 [16]byte
|
||||
V2 [16]byte
|
||||
V3 [16]byte
|
||||
V4 [16]byte
|
||||
V5 [16]byte
|
||||
V6 [16]byte
|
||||
V7 [16]byte
|
||||
V8 [16]byte
|
||||
V9 [16]byte
|
||||
V10 [16]byte
|
||||
V11 [16]byte
|
||||
V12 [16]byte
|
||||
V13 [16]byte
|
||||
V14 [16]byte
|
||||
V15 [16]byte
|
||||
V16 [16]byte
|
||||
V17 [16]byte
|
||||
V18 [16]byte
|
||||
V19 [16]byte
|
||||
V20 [16]byte
|
||||
V21 [16]byte
|
||||
V22 [16]byte
|
||||
V23 [16]byte
|
||||
V24 [16]byte
|
||||
V25 [16]byte
|
||||
V26 [16]byte
|
||||
V27 [16]byte
|
||||
V28 [16]byte
|
||||
V29 [16]byte
|
||||
V30 [16]byte
|
||||
V31 [16]byte
|
||||
}
|
|
@ -4,13 +4,14 @@
|
|||
#include "textflag.h"
|
||||
|
||||
TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
|
||||
MOVD R30, -496(RSP)
|
||||
SUB $496, RSP
|
||||
MOVD R30, -240(RSP)
|
||||
SUB $240, RSP
|
||||
MOVD R29, -8(RSP)
|
||||
SUB $8, RSP, R29
|
||||
#ifdef GOOS_ios
|
||||
MOVD R30, (RSP)
|
||||
#endif
|
||||
// Save GPs
|
||||
STP (R0, R1), 8(RSP)
|
||||
STP (R2, R3), 24(RSP)
|
||||
STP (R4, R5), 40(RSP)
|
||||
|
@ -28,39 +29,32 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
|
|||
MOVD R0, 216(RSP)
|
||||
MOVD FPSR, R0
|
||||
MOVD R0, 224(RSP)
|
||||
FSTPD (F0, F1), 232(RSP)
|
||||
FSTPD (F2, F3), 248(RSP)
|
||||
FSTPD (F4, F5), 264(RSP)
|
||||
FSTPD (F6, F7), 280(RSP)
|
||||
FSTPD (F8, F9), 296(RSP)
|
||||
FSTPD (F10, F11), 312(RSP)
|
||||
FSTPD (F12, F13), 328(RSP)
|
||||
FSTPD (F14, F15), 344(RSP)
|
||||
FSTPD (F16, F17), 360(RSP)
|
||||
FSTPD (F18, F19), 376(RSP)
|
||||
FSTPD (F20, F21), 392(RSP)
|
||||
FSTPD (F22, F23), 408(RSP)
|
||||
FSTPD (F24, F25), 424(RSP)
|
||||
FSTPD (F26, F27), 440(RSP)
|
||||
FSTPD (F28, F29), 456(RSP)
|
||||
FSTPD (F30, F31), 472(RSP)
|
||||
// Save extended register state to p.xRegs.scratch
|
||||
MOVD g_m(g), R0
|
||||
MOVD m_p(R0), R0
|
||||
ADD $(p_xRegs+xRegPerP_scratch), R0, R0
|
||||
VST1.P [V0.B16, V1.B16, V2.B16, V3.B16], 64(R0)
|
||||
VST1.P [V4.B16, V5.B16, V6.B16, V7.B16], 64(R0)
|
||||
VST1.P [V8.B16, V9.B16, V10.B16, V11.B16], 64(R0)
|
||||
VST1.P [V12.B16, V13.B16, V14.B16, V15.B16], 64(R0)
|
||||
VST1.P [V16.B16, V17.B16, V18.B16, V19.B16], 64(R0)
|
||||
VST1.P [V20.B16, V21.B16, V22.B16, V23.B16], 64(R0)
|
||||
VST1.P [V24.B16, V25.B16, V26.B16, V27.B16], 64(R0)
|
||||
VST1.P [V28.B16, V29.B16, V30.B16, V31.B16], 64(R0)
|
||||
CALL ·asyncPreempt2(SB)
|
||||
FLDPD 472(RSP), (F30, F31)
|
||||
FLDPD 456(RSP), (F28, F29)
|
||||
FLDPD 440(RSP), (F26, F27)
|
||||
FLDPD 424(RSP), (F24, F25)
|
||||
FLDPD 408(RSP), (F22, F23)
|
||||
FLDPD 392(RSP), (F20, F21)
|
||||
FLDPD 376(RSP), (F18, F19)
|
||||
FLDPD 360(RSP), (F16, F17)
|
||||
FLDPD 344(RSP), (F14, F15)
|
||||
FLDPD 328(RSP), (F12, F13)
|
||||
FLDPD 312(RSP), (F10, F11)
|
||||
FLDPD 296(RSP), (F8, F9)
|
||||
FLDPD 280(RSP), (F6, F7)
|
||||
FLDPD 264(RSP), (F4, F5)
|
||||
FLDPD 248(RSP), (F2, F3)
|
||||
FLDPD 232(RSP), (F0, F1)
|
||||
// Restore non-GPs from *p.xRegs.cache
|
||||
MOVD g_m(g), R0
|
||||
MOVD m_p(R0), R0
|
||||
MOVD (p_xRegs+xRegPerP_cache)(R0), R0
|
||||
VLD1.P 64(R0), [V0.B16, V1.B16, V2.B16, V3.B16]
|
||||
VLD1.P 64(R0), [V4.B16, V5.B16, V6.B16, V7.B16]
|
||||
VLD1.P 64(R0), [V8.B16, V9.B16, V10.B16, V11.B16]
|
||||
VLD1.P 64(R0), [V12.B16, V13.B16, V14.B16, V15.B16]
|
||||
VLD1.P 64(R0), [V16.B16, V17.B16, V18.B16, V19.B16]
|
||||
VLD1.P 64(R0), [V20.B16, V21.B16, V22.B16, V23.B16]
|
||||
VLD1.P 64(R0), [V24.B16, V25.B16, V26.B16, V27.B16]
|
||||
VLD1.P 64(R0), [V28.B16, V29.B16, V30.B16, V31.B16]
|
||||
// Restore GPs
|
||||
MOVD 224(RSP), R0
|
||||
MOVD R0, FPSR
|
||||
MOVD 216(RSP), R0
|
||||
|
@ -78,8 +72,8 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
|
|||
LDP 40(RSP), (R4, R5)
|
||||
LDP 24(RSP), (R2, R3)
|
||||
LDP 8(RSP), (R0, R1)
|
||||
MOVD 496(RSP), R30
|
||||
MOVD 240(RSP), R30
|
||||
MOVD -8(RSP), R29
|
||||
MOVD (RSP), R27
|
||||
ADD $512, RSP
|
||||
ADD $256, RSP
|
||||
RET (R27)
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:build !amd64
|
||||
//go:build !amd64 && !arm64
|
||||
|
||||
// This provides common support for architectures that DO NOT use extended
|
||||
// register state in asynchronous preemption.
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:build amd64
|
||||
//go:build amd64 || arm64
|
||||
|
||||
// This provides common support for architectures that use extended register
|
||||
// state in asynchronous preemption.
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue