runtime: support arm64 Neon in async preemption

This is a port of CL 669195 adjusted to save arm64 Neon registers
off stack.

Change-Id: Ia014778a8c9f0c1d05977b04184f51e791ae8495
Reviewed-on: https://go-review.googlesource.com/c/go/+/695916
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Mark Freeman <markfreeman@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
This commit is contained in:
Julia Lapenko 2025-08-13 22:23:14 +03:00 committed by Cherry Mui
parent 5368e77429
commit 955a5a0dc5
5 changed files with 239 additions and 79 deletions

View file

@ -163,19 +163,21 @@ package runtime
type xRegs struct {
`)
pos := 0
for _, reg := range l.regs {
if reg.pos != pos {
log.Fatalf("padding not implemented")
for _, seq := range l.regs {
for _, r := range seq.regs {
if r.pos != pos && !seq.fixedOffset {
log.Fatalf("padding not implemented")
}
typ := fmt.Sprintf("[%d]byte", r.size)
switch {
case r.size == 4 && r.pos%4 == 0:
typ = "uint32"
case r.size == 8 && r.pos%8 == 0:
typ = "uint64"
}
fmt.Fprintf(g.w, "\t%s %s\n", r.name, typ)
pos += r.size
}
typ := fmt.Sprintf("[%d]byte", reg.size)
switch {
case reg.size == 4 && reg.pos%4 == 0:
typ = "uint32"
case reg.size == 8 && reg.pos%8 == 0:
typ = "uint64"
}
fmt.Fprintf(g.w, "\t%s %s\n", reg.reg, typ)
pos += reg.size
}
fmt.Fprintf(g.w, "}\n")
@ -191,16 +193,61 @@ type xRegs struct {
// layout accumulates the register entries to be saved/restored together with
// the running offset at which the next entry will be placed.
type layout struct {
// stack is the next free offset; add, add2 (unless fixedOffset) and
// addSpecial advance it by the size of what they append.
stack int
// regs holds the entries in the order they were added.
regs []regPos
regs []regSeq
sp string // stack pointer register
}
type regPos struct {
pos, size int
// regInfo describes one register within a regSeq.
//
// NOTE: add and addSpecial build regInfo values with unkeyed composite
// literals ({size, name, suffix, pos}), so the field order below must not be
// changed without updating them.
type regInfo struct {
size int // register size in bytes
name string // register name
// Some register names may require a specific suffix.
// In ARM64, a suffix called an "arrangement specifier" can be added to
// a register name. For example:
//
// V0.B16
//
// In this case, "V0" is the register name, and ".B16" is the suffix.
suffix string
pos int // position on stack
}
// Some save/restore operations can involve multiple registers in a single
// instruction. For example, the LDP/STP instructions in ARM64:
//
// LDP 8(RSP), (R0, R1)
// STP (R0, R1), 8(RSP)
//
// In these cases, a pair of registers (R0, R1) is used as a single argument.
type regSeq struct {
saveOp string
restoreOp string
reg string
regs []regInfo
// By default, all registers are saved on the stack, and the stack pointer offset
// is calculated based on the size of each register. For example (ARM64):
//
// STP (R0, R1), 8(RSP)
// STP (R2, R3), 24(RSP)
//
// However, automatic offset calculation may not always be desirable.
// In some cases, the offset must remain fixed:
//
// VST1.P [V0.B16, V1.B16, V2.B16, V3.B16], 64(R0)
// VST1.P [V4.B16, V5.B16, V6.B16, V7.B16], 64(R0)
//
// In this example, R0 is post-incremented after each instruction,
// so the offset should not be recalculated. For such cases,
// `fixedOffset` is set to true.
fixedOffset bool
// After conversion to a string, register names are separated by commas
// and may be wrapped in a custom pair of brackets. For example (ARM64):
//
// (R0, R1) // wrapped in parentheses
// [V0.B16, V1.B16, V2.B16, V3.B16] // wrapped in square brackets
brackets [2]string
// If this register requires special save and restore, these
// give those operations with a %d placeholder for the stack
@ -208,40 +255,95 @@ type regPos struct {
save, restore string
}
// add appends a single register of the given size at the current offset
// l.stack, using op as both the save and the restore operation, and then
// advances l.stack by size.
func (l *layout) add(op, reg string, size int) {
l.regs = append(l.regs, regPos{saveOp: op, restoreOp: op, reg: reg, pos: l.stack, size: size})
func (l *layout) add(op, regname string, size int) {
l.regs = append(l.regs, regSeq{saveOp: op, restoreOp: op, regs: []regInfo{{size, regname, "", l.stack}}})
l.stack += size
}
// add2 appends an entry whose save operation (sop) and restore operation
// (rop) differ.
func (l *layout) add2(sop, rop, reg string, size int) {
l.regs = append(l.regs, regPos{saveOp: sop, restoreOp: rop, reg: reg, pos: l.stack, size: size})
l.stack += size
func (l *layout) add2(sop, rop string, regs []regInfo, brackets [2]string, fixedOffset bool) {
l.regs = append(l.regs, regSeq{saveOp: sop, restoreOp: rop, regs: regs, brackets: brackets, fixedOffset: fixedOffset})
// With fixedOffset the caller-supplied pos values are kept as-is and
// l.stack is not advanced. Otherwise each register is assigned the next
// offset. The regSeq appended above shares its backing array with regs,
// so the positions written here are visible through l.regs as well.
if !fixedOffset {
for i := range regs {
regs[i].pos = l.stack
l.stack += regs[i].size
}
}
}
// addSpecial appends an entry that is saved and restored with the custom
// save/restore strings instead of a plain op, reserves size bytes for it at
// the current offset l.stack, and advances l.stack by size.
func (l *layout) addSpecial(save, restore string, size int) {
l.regs = append(l.regs, regPos{save: save, restore: restore, pos: l.stack, size: size})
l.regs = append(l.regs, regSeq{save: save, restore: restore, regs: []regInfo{{size, "", "", l.stack}}})
l.stack += size
}
// String renders the register sequence as an instruction operand.
//
// A sequence with a single register yields just that register's name (no
// suffix, no brackets). A longer sequence yields every name followed by its
// suffix, joined with ", " and wrapped in rs.brackets. An empty sequence is a
// programming error and terminates the program via log.Fatal.
func (rs *regSeq) String() string {
	count := len(rs.regs)
	if count == 0 {
		log.Fatal("Register sequence must not be empty!")
		return ""
	}
	if count == 1 {
		return rs.regs[0].name
	}

	parts := make([]string, count)
	for i := range rs.regs {
		parts[i] = rs.regs[i].name + rs.regs[i].suffix
	}
	opening, closing := rs.brackets[0], rs.brackets[1]
	return opening + strings.Join(parts, ", ") + closing
}
// save emits, through g.p, one save instruction for every entry in l.regs,
// in the order the entries were added. An entry with a custom save string
// uses that string as the format (it receives the offset); otherwise the
// instruction is built from the entry's saveOp, its register operand, the
// offset, and l.sp as the base register.
func (l *layout) save(g *gen) {
for _, reg := range l.regs {
if reg.save != "" {
g.p(reg.save, reg.pos)
for _, seq := range l.regs {
if len(seq.regs) < 1 {
log.Fatal("Register sequence must not be empty!")
}
// When dealing with a sequence of registers, we assume that only the position
// of the first register is relevant. For example:
//
// STP (R0, R1), 8(RSP)
// STP (R2, R3), 24(RSP)
//
// Here, R0.pos is 8. While we can infer that R1.pos is 16, it doesn't need to
// be explicitly specified, as the STP instruction calculates it automatically.
pos := seq.regs[0].pos
if seq.save != "" {
g.p(seq.save, pos)
} else {
g.p("%s %s, %d(%s)", reg.saveOp, reg.reg, reg.pos, l.sp)
name := seq.String()
g.p("%s %s, %d(%s)", seq.saveOp, name, pos, l.sp)
}
}
}
// restoreInOrder emits, through g.p, one restore instruction for every
// register sequence in l.regs. With reverse set the sequences are visited
// last-added first; otherwise they are visited in the order they were added.
//
// Only the position of the first register in a sequence is used as the
// offset. A sequence with a custom restore string uses that string as the
// format (it receives the offset); otherwise the instruction is built from
// restoreOp, the offset, l.sp as the base register, and the sequence's
// operand string. An empty sequence terminates the program via log.Fatal.
func (l *layout) restoreInOrder(g *gen, reverse bool) {
	total := len(l.regs)
	for step := 0; step < total; step++ {
		// Walk from the back when reverse is requested, without copying.
		idx := step
		if reverse {
			idx = total - 1 - step
		}
		rs := &l.regs[idx]
		if len(rs.regs) == 0 {
			log.Fatal("Register sequence must not be empty!")
		}
		offset := rs.regs[0].pos
		if rs.restore == "" {
			g.p("%s %d(%s), %s", rs.restoreOp, offset, l.sp, rs.String())
			continue
		}
		g.p(rs.restore, offset)
	}
}
// restore emits, through g, the restore instructions for the entries in
// l.regs, visiting them last-added first (the reverse of the order in which
// they were added).
func (l *layout) restore(g *gen) {
for i := len(l.regs) - 1; i >= 0; i-- {
reg := l.regs[i]
if reg.restore != "" {
g.p(reg.restore, reg.pos)
} else {
g.p("%s %d(%s), %s", reg.restoreOp, reg.pos, l.sp, reg.reg)
}
}
l.restoreInOrder(g, true)
}
// restoreDirect emits, through g, the restore instructions for the register
// sequences in l.regs in the same order in which they were added.
func (l *layout) restoreDirect(g *gen) {
	const reverse = false
	l.restoreInOrder(g, reverse)
}
func gen386(g *gen) {
@ -320,8 +422,11 @@ func genAMD64(g *gen) {
// We don't have to do this, but it results in a nice Go type. If we split
// this into multiple types, we probably should stop doing this.
for i := range lXRegs.regs {
lXRegs.regs[i].pos = lZRegs.regs[i].pos
lYRegs.regs[i].pos = lZRegs.regs[i].pos
for j := range lXRegs.regs[i].regs {
lXRegs.regs[i].regs[j].pos = lZRegs.regs[i].regs[j].pos
lYRegs.regs[i].regs[j].pos = lZRegs.regs[i].regs[j].pos
}
}
writeXRegs(g.goarch, &lZRegs)
@ -456,6 +561,7 @@ func genARM(g *gen) {
}
func genARM64(g *gen) {
const vReg = "R0" // *xRegState
p := g.p
// Add integer registers R0-R26
// R27 (REGTMP), R28 (g), R29 (FP), R30 (LR), R31 (SP) are special
@ -466,8 +572,11 @@ func genARM64(g *gen) {
i--
continue // R18 is not used, skip
}
reg := fmt.Sprintf("(R%d, R%d)", i, i+1)
l.add2("STP", "LDP", reg, 16)
regs := []regInfo{
{name: fmt.Sprintf("R%d", i), size: 8},
{name: fmt.Sprintf("R%d", i+1), size: 8},
}
l.add2("STP", "LDP", regs, [2]string{"(", ")"}, false)
}
// Add flag registers.
l.addSpecial(
@ -480,10 +589,17 @@ func genARM64(g *gen) {
8)
// TODO: FPCR? I don't think we'll change it, so no need to save.
// Add floating point registers F0-F31.
for i := 0; i < 31; i += 2 {
reg := fmt.Sprintf("(F%d, F%d)", i, i+1)
l.add2("FSTPD", "FLDPD", reg, 16)
lVRegs := layout{sp: vReg} // Non-GP registers
for i := 0; i < 31; i += 4 {
regs := []regInfo{
{name: fmt.Sprintf("V%d", i), suffix: ".B16", size: 16, pos: 64},
{name: fmt.Sprintf("V%d", i+1), suffix: ".B16", size: 16, pos: 64},
{name: fmt.Sprintf("V%d", i+2), suffix: ".B16", size: 16, pos: 64},
{name: fmt.Sprintf("V%d", i+3), suffix: ".B16", size: 16, pos: 64},
}
lVRegs.add2("VST1.P", "VLD1.P", regs, [2]string{"[", "]"}, true)
}
writeXRegs(g.goarch, &lVRegs)
if l.stack%16 != 0 {
l.stack += 8 // SP needs 16-byte alignment
}
@ -500,8 +616,20 @@ func genARM64(g *gen) {
p("MOVD R30, (RSP)")
p("#endif")
p("// Save GPs")
l.save(g)
p("// Save extended register state to p.xRegs.scratch")
p("MOVD g_m(g), %s", vReg)
p("MOVD m_p(%s), %s", vReg, vReg)
p("ADD $(p_xRegs+xRegPerP_scratch), %s, %s", vReg, vReg)
lVRegs.save(g)
p("CALL ·asyncPreempt2(SB)")
p("// Restore non-GPs from *p.xRegs.cache")
p("MOVD g_m(g), %s", vReg)
p("MOVD m_p(%s), %s", vReg, vReg)
p("MOVD (p_xRegs+xRegPerP_cache)(%s), %s", vReg, vReg)
lVRegs.restoreDirect(g)
p("// Restore GPs")
l.restore(g)
p("MOVD %d(RSP), R30", l.stack) // sigctxt.pushCall has pushed LR (at interrupt) on stack, restore it

View file

@ -0,0 +1,38 @@
// Code generated by mkpreempt.go; DO NOT EDIT.
package runtime
type xRegs struct {
V0 [16]byte
V1 [16]byte
V2 [16]byte
V3 [16]byte
V4 [16]byte
V5 [16]byte
V6 [16]byte
V7 [16]byte
V8 [16]byte
V9 [16]byte
V10 [16]byte
V11 [16]byte
V12 [16]byte
V13 [16]byte
V14 [16]byte
V15 [16]byte
V16 [16]byte
V17 [16]byte
V18 [16]byte
V19 [16]byte
V20 [16]byte
V21 [16]byte
V22 [16]byte
V23 [16]byte
V24 [16]byte
V25 [16]byte
V26 [16]byte
V27 [16]byte
V28 [16]byte
V29 [16]byte
V30 [16]byte
V31 [16]byte
}

View file

@ -4,13 +4,14 @@
#include "textflag.h"
TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
MOVD R30, -496(RSP)
SUB $496, RSP
MOVD R30, -240(RSP)
SUB $240, RSP
MOVD R29, -8(RSP)
SUB $8, RSP, R29
#ifdef GOOS_ios
MOVD R30, (RSP)
#endif
// Save GPs
STP (R0, R1), 8(RSP)
STP (R2, R3), 24(RSP)
STP (R4, R5), 40(RSP)
@ -28,39 +29,32 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
MOVD R0, 216(RSP)
MOVD FPSR, R0
MOVD R0, 224(RSP)
FSTPD (F0, F1), 232(RSP)
FSTPD (F2, F3), 248(RSP)
FSTPD (F4, F5), 264(RSP)
FSTPD (F6, F7), 280(RSP)
FSTPD (F8, F9), 296(RSP)
FSTPD (F10, F11), 312(RSP)
FSTPD (F12, F13), 328(RSP)
FSTPD (F14, F15), 344(RSP)
FSTPD (F16, F17), 360(RSP)
FSTPD (F18, F19), 376(RSP)
FSTPD (F20, F21), 392(RSP)
FSTPD (F22, F23), 408(RSP)
FSTPD (F24, F25), 424(RSP)
FSTPD (F26, F27), 440(RSP)
FSTPD (F28, F29), 456(RSP)
FSTPD (F30, F31), 472(RSP)
// Save extended register state to p.xRegs.scratch
MOVD g_m(g), R0
MOVD m_p(R0), R0
ADD $(p_xRegs+xRegPerP_scratch), R0, R0
VST1.P [V0.B16, V1.B16, V2.B16, V3.B16], 64(R0)
VST1.P [V4.B16, V5.B16, V6.B16, V7.B16], 64(R0)
VST1.P [V8.B16, V9.B16, V10.B16, V11.B16], 64(R0)
VST1.P [V12.B16, V13.B16, V14.B16, V15.B16], 64(R0)
VST1.P [V16.B16, V17.B16, V18.B16, V19.B16], 64(R0)
VST1.P [V20.B16, V21.B16, V22.B16, V23.B16], 64(R0)
VST1.P [V24.B16, V25.B16, V26.B16, V27.B16], 64(R0)
VST1.P [V28.B16, V29.B16, V30.B16, V31.B16], 64(R0)
CALL ·asyncPreempt2(SB)
FLDPD 472(RSP), (F30, F31)
FLDPD 456(RSP), (F28, F29)
FLDPD 440(RSP), (F26, F27)
FLDPD 424(RSP), (F24, F25)
FLDPD 408(RSP), (F22, F23)
FLDPD 392(RSP), (F20, F21)
FLDPD 376(RSP), (F18, F19)
FLDPD 360(RSP), (F16, F17)
FLDPD 344(RSP), (F14, F15)
FLDPD 328(RSP), (F12, F13)
FLDPD 312(RSP), (F10, F11)
FLDPD 296(RSP), (F8, F9)
FLDPD 280(RSP), (F6, F7)
FLDPD 264(RSP), (F4, F5)
FLDPD 248(RSP), (F2, F3)
FLDPD 232(RSP), (F0, F1)
// Restore non-GPs from *p.xRegs.cache
MOVD g_m(g), R0
MOVD m_p(R0), R0
MOVD (p_xRegs+xRegPerP_cache)(R0), R0
VLD1.P 64(R0), [V0.B16, V1.B16, V2.B16, V3.B16]
VLD1.P 64(R0), [V4.B16, V5.B16, V6.B16, V7.B16]
VLD1.P 64(R0), [V8.B16, V9.B16, V10.B16, V11.B16]
VLD1.P 64(R0), [V12.B16, V13.B16, V14.B16, V15.B16]
VLD1.P 64(R0), [V16.B16, V17.B16, V18.B16, V19.B16]
VLD1.P 64(R0), [V20.B16, V21.B16, V22.B16, V23.B16]
VLD1.P 64(R0), [V24.B16, V25.B16, V26.B16, V27.B16]
VLD1.P 64(R0), [V28.B16, V29.B16, V30.B16, V31.B16]
// Restore GPs
MOVD 224(RSP), R0
MOVD R0, FPSR
MOVD 216(RSP), R0
@ -78,8 +72,8 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
LDP 40(RSP), (R4, R5)
LDP 24(RSP), (R2, R3)
LDP 8(RSP), (R0, R1)
MOVD 496(RSP), R30
MOVD 240(RSP), R30
MOVD -8(RSP), R29
MOVD (RSP), R27
ADD $512, RSP
ADD $256, RSP
RET (R27)

View file

@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !amd64
//go:build !amd64 && !arm64
// This provides common support for architectures that DO NOT use extended
// register state in asynchronous preemption.

View file

@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build amd64
//go:build amd64 || arm64
// This provides common support for architectures that use extended register
// state in asynchronous preemption.