runtime: save lasx and lsx registers in loong64 async preemption

This is a port of CL 669195 and CL 695916, adjusted to save the loong64
lasx and lsx registers off the stack.

Change-Id: Ie56787c76259a9545f5a8adcb09f588c8451bbd6
Reviewed-on: https://go-review.googlesource.com/c/go/+/711180
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
Reviewed-by: Meidan Li <limeidan@loongson.cn>
Guoqi Chen, 2025-09-11 16:21:44 +08:00 (committed by abner chenc)
parent 79ae97fe9b
commit 99cf4d671c
5 changed files with 339 additions and 80 deletions

src/runtime/mkpreempt.go

@@ -713,10 +713,11 @@ func genMIPS(g *gen, _64bit bool) {
}
func genLoong64(g *gen) {
- p := g.p
+ const xReg = "R4" // *xRegState
+ p, label := g.p, g.label
mov := "MOVV"
movf := "MOVD"
add := "ADDV"
sub := "SUBV"
regsize := 8
@@ -732,12 +733,6 @@ func genLoong64(g *gen) {
l.add(mov, reg, regsize)
}
- // Add floating point registers F0-F31.
- for i := 0; i <= 31; i++ {
- reg := fmt.Sprintf("F%d", i)
- l.add(movf, reg, regsize)
- }
// Add condition flag register fcc0-fcc7
sv := ""
rs := ""
@@ -764,12 +759,80 @@ func genLoong64(g *gen) {
mov+" %d(R3), R5\n"+rs,
regsize)
// Create layouts for lasx, lsx and fp registers.
lasxRegs := layout{sp: xReg}
lsxRegs := lasxRegs
fpRegs := lasxRegs
for i := 0; i <= 31; i++ {
lasxRegs.add("XVMOVQ", fmt.Sprintf("X%d", i), 256/8)
lsxRegs.add("VMOVQ", fmt.Sprintf("V%d", i), 128/8)
fpRegs.add("MOVD", fmt.Sprintf("F%d", i), 64/8)
}
for i := range lsxRegs.regs {
lsxRegs.regs[i].pos = lasxRegs.regs[i].pos
fpRegs.regs[i].pos = lasxRegs.regs[i].pos
}
writeXRegs(g.goarch, &lasxRegs)
// allocate frame, save PC of interrupted instruction (in LR)
p(mov+" R1, -%d(R3)", l.stack)
p(sub+" $%d, R3", l.stack)
p("// Save GPs")
l.save(g)
p("// Save extended register state to p.xRegs.scratch")
p("MOVV g_m(g), %s", xReg)
p("MOVV m_p(%s), %s", xReg, xReg)
p("ADDV $(p_xRegs+xRegPerP_scratch), %s, %s", xReg, xReg)
p("MOVBU internalcpu·Loong64+const_offsetLOONG64HasLASX(SB), R5")
p("BNE R5, saveLASX")
p("MOVBU internalcpu·Loong64+const_offsetLOONG64HasLSX(SB), R5")
p("BNE R5, saveLSX")
label("saveFP:")
fpRegs.save(g)
p("JMP preempt")
label("saveLSX:")
lsxRegs.save(g)
p("JMP preempt")
label("saveLASX:")
lasxRegs.save(g)
label("preempt:")
p("CALL ·asyncPreempt2(SB)")
p("// Restore non-GPs from *p.xRegs.cache")
p("MOVV g_m(g), %s", xReg)
p("MOVV m_p(%s), %s", xReg, xReg)
p("MOVV (p_xRegs+xRegPerP_cache)(%s), %s", xReg, xReg)
p("MOVBU internalcpu·Loong64+const_offsetLOONG64HasLASX(SB), R5")
p("BNE R5, restoreLASX")
p("MOVBU internalcpu·Loong64+const_offsetLOONG64HasLSX(SB), R5")
p("BNE R5, restoreLSX")
label("restoreFP:")
fpRegs.restore(g)
p("JMP restoreGPs")
label("restoreLSX:")
lsxRegs.restore(g)
p("JMP restoreGPs")
label("restoreLASX:")
lasxRegs.restore(g)
p("// Restore GPs")
label("restoreGPs:")
l.restore(g)
p(mov+" %d(R3), R1", l.stack) // sigctxt.pushCall has pushed LR (at interrupt) on stack, restore it

src/runtime/preempt_loong64.go

@@ -0,0 +1,38 @@
// Code generated by mkpreempt.go; DO NOT EDIT.
package runtime
type xRegs struct {
X0 [32]byte
X1 [32]byte
X2 [32]byte
X3 [32]byte
X4 [32]byte
X5 [32]byte
X6 [32]byte
X7 [32]byte
X8 [32]byte
X9 [32]byte
X10 [32]byte
X11 [32]byte
X12 [32]byte
X13 [32]byte
X14 [32]byte
X15 [32]byte
X16 [32]byte
X17 [32]byte
X18 [32]byte
X19 [32]byte
X20 [32]byte
X21 [32]byte
X22 [32]byte
X23 [32]byte
X24 [32]byte
X25 [32]byte
X26 [32]byte
X27 [32]byte
X28 [32]byte
X29 [32]byte
X30 [32]byte
X31 [32]byte
}
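
Each field is sized for a 256-bit LASX register, so the whole area is
32 × 32 = 1024 bytes; LSX-only and FP-only machines write just the low 16 or
8 bytes of every slot, leaving the stride unchanged. A quick stand-in check
of the arithmetic (hypothetical type, not part of the commit):

	package main

	import (
		"fmt"
		"unsafe"
	)

	// Stand-in with the same size and stride as the generated runtime.xRegs.
	type xRegs struct{ X [32][32]byte }

	func main() {
		fmt.Println(unsafe.Sizeof(xRegs{})) // 1024: 32 slots * 32 bytes
	}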

src/runtime/preempt_loong64.s

@@ -4,8 +4,9 @@
#include "textflag.h"
TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
- MOVV R1, -480(R3)
- SUBV $480, R3
+ MOVV R1, -224(R3)
+ SUBV $224, R3
// Save GPs
MOVV R4, 8(R3)
MOVV R5, 16(R3)
MOVV R6, 24(R3)
@@ -32,38 +33,6 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
MOVV R28, 192(R3)
MOVV R29, 200(R3)
MOVV R31, 208(R3)
- MOVD F0, 216(R3)
- MOVD F1, 224(R3)
- MOVD F2, 232(R3)
- MOVD F3, 240(R3)
- MOVD F4, 248(R3)
- MOVD F5, 256(R3)
- MOVD F6, 264(R3)
- MOVD F7, 272(R3)
- MOVD F8, 280(R3)
- MOVD F9, 288(R3)
- MOVD F10, 296(R3)
- MOVD F11, 304(R3)
- MOVD F12, 312(R3)
- MOVD F13, 320(R3)
- MOVD F14, 328(R3)
- MOVD F15, 336(R3)
- MOVD F16, 344(R3)
- MOVD F17, 352(R3)
- MOVD F18, 360(R3)
- MOVD F19, 368(R3)
- MOVD F20, 376(R3)
- MOVD F21, 384(R3)
- MOVD F22, 392(R3)
- MOVD F23, 400(R3)
- MOVD F24, 408(R3)
- MOVD F25, 416(R3)
- MOVD F26, 424(R3)
- MOVD F27, 432(R3)
- MOVD F28, 440(R3)
- MOVD F29, 448(R3)
- MOVD F30, 456(R3)
- MOVD F31, 464(R3)
MOVV FCC0, R4
BSTRINSV $7, R4, $0, R5
MOVV FCC1, R4
@@ -80,9 +49,230 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
BSTRINSV $55, R4, $48, R5
MOVV FCC7, R4
BSTRINSV $63, R4, $56, R5
- MOVV R5, 472(R3)
+ MOVV R5, 216(R3)
// Save extended register state to p.xRegs.scratch
MOVV g_m(g), R4
MOVV m_p(R4), R4
ADDV $(p_xRegs+xRegPerP_scratch), R4, R4
MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R5
BNE R5, saveLASX
MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R5
BNE R5, saveLSX
saveFP:
MOVD F0, 0(R4)
MOVD F1, 32(R4)
MOVD F2, 64(R4)
MOVD F3, 96(R4)
MOVD F4, 128(R4)
MOVD F5, 160(R4)
MOVD F6, 192(R4)
MOVD F7, 224(R4)
MOVD F8, 256(R4)
MOVD F9, 288(R4)
MOVD F10, 320(R4)
MOVD F11, 352(R4)
MOVD F12, 384(R4)
MOVD F13, 416(R4)
MOVD F14, 448(R4)
MOVD F15, 480(R4)
MOVD F16, 512(R4)
MOVD F17, 544(R4)
MOVD F18, 576(R4)
MOVD F19, 608(R4)
MOVD F20, 640(R4)
MOVD F21, 672(R4)
MOVD F22, 704(R4)
MOVD F23, 736(R4)
MOVD F24, 768(R4)
MOVD F25, 800(R4)
MOVD F26, 832(R4)
MOVD F27, 864(R4)
MOVD F28, 896(R4)
MOVD F29, 928(R4)
MOVD F30, 960(R4)
MOVD F31, 992(R4)
JMP preempt
saveLSX:
VMOVQ V0, 0(R4)
VMOVQ V1, 32(R4)
VMOVQ V2, 64(R4)
VMOVQ V3, 96(R4)
VMOVQ V4, 128(R4)
VMOVQ V5, 160(R4)
VMOVQ V6, 192(R4)
VMOVQ V7, 224(R4)
VMOVQ V8, 256(R4)
VMOVQ V9, 288(R4)
VMOVQ V10, 320(R4)
VMOVQ V11, 352(R4)
VMOVQ V12, 384(R4)
VMOVQ V13, 416(R4)
VMOVQ V14, 448(R4)
VMOVQ V15, 480(R4)
VMOVQ V16, 512(R4)
VMOVQ V17, 544(R4)
VMOVQ V18, 576(R4)
VMOVQ V19, 608(R4)
VMOVQ V20, 640(R4)
VMOVQ V21, 672(R4)
VMOVQ V22, 704(R4)
VMOVQ V23, 736(R4)
VMOVQ V24, 768(R4)
VMOVQ V25, 800(R4)
VMOVQ V26, 832(R4)
VMOVQ V27, 864(R4)
VMOVQ V28, 896(R4)
VMOVQ V29, 928(R4)
VMOVQ V30, 960(R4)
VMOVQ V31, 992(R4)
JMP preempt
saveLASX:
XVMOVQ X0, 0(R4)
XVMOVQ X1, 32(R4)
XVMOVQ X2, 64(R4)
XVMOVQ X3, 96(R4)
XVMOVQ X4, 128(R4)
XVMOVQ X5, 160(R4)
XVMOVQ X6, 192(R4)
XVMOVQ X7, 224(R4)
XVMOVQ X8, 256(R4)
XVMOVQ X9, 288(R4)
XVMOVQ X10, 320(R4)
XVMOVQ X11, 352(R4)
XVMOVQ X12, 384(R4)
XVMOVQ X13, 416(R4)
XVMOVQ X14, 448(R4)
XVMOVQ X15, 480(R4)
XVMOVQ X16, 512(R4)
XVMOVQ X17, 544(R4)
XVMOVQ X18, 576(R4)
XVMOVQ X19, 608(R4)
XVMOVQ X20, 640(R4)
XVMOVQ X21, 672(R4)
XVMOVQ X22, 704(R4)
XVMOVQ X23, 736(R4)
XVMOVQ X24, 768(R4)
XVMOVQ X25, 800(R4)
XVMOVQ X26, 832(R4)
XVMOVQ X27, 864(R4)
XVMOVQ X28, 896(R4)
XVMOVQ X29, 928(R4)
XVMOVQ X30, 960(R4)
XVMOVQ X31, 992(R4)
preempt:
CALL ·asyncPreempt2(SB)
- MOVV 472(R3), R5
// Restore non-GPs from *p.xRegs.cache
MOVV g_m(g), R4
MOVV m_p(R4), R4
MOVV (p_xRegs+xRegPerP_cache)(R4), R4
MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R5
BNE R5, restoreLASX
MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R5
BNE R5, restoreLSX
restoreFP:
MOVD 992(R4), F31
MOVD 960(R4), F30
MOVD 928(R4), F29
MOVD 896(R4), F28
MOVD 864(R4), F27
MOVD 832(R4), F26
MOVD 800(R4), F25
MOVD 768(R4), F24
MOVD 736(R4), F23
MOVD 704(R4), F22
MOVD 672(R4), F21
MOVD 640(R4), F20
MOVD 608(R4), F19
MOVD 576(R4), F18
MOVD 544(R4), F17
MOVD 512(R4), F16
MOVD 480(R4), F15
MOVD 448(R4), F14
MOVD 416(R4), F13
MOVD 384(R4), F12
MOVD 352(R4), F11
MOVD 320(R4), F10
MOVD 288(R4), F9
MOVD 256(R4), F8
MOVD 224(R4), F7
MOVD 192(R4), F6
MOVD 160(R4), F5
MOVD 128(R4), F4
MOVD 96(R4), F3
MOVD 64(R4), F2
MOVD 32(R4), F1
MOVD 0(R4), F0
JMP restoreGPs
restoreLSX:
VMOVQ 992(R4), V31
VMOVQ 960(R4), V30
VMOVQ 928(R4), V29
VMOVQ 896(R4), V28
VMOVQ 864(R4), V27
VMOVQ 832(R4), V26
VMOVQ 800(R4), V25
VMOVQ 768(R4), V24
VMOVQ 736(R4), V23
VMOVQ 704(R4), V22
VMOVQ 672(R4), V21
VMOVQ 640(R4), V20
VMOVQ 608(R4), V19
VMOVQ 576(R4), V18
VMOVQ 544(R4), V17
VMOVQ 512(R4), V16
VMOVQ 480(R4), V15
VMOVQ 448(R4), V14
VMOVQ 416(R4), V13
VMOVQ 384(R4), V12
VMOVQ 352(R4), V11
VMOVQ 320(R4), V10
VMOVQ 288(R4), V9
VMOVQ 256(R4), V8
VMOVQ 224(R4), V7
VMOVQ 192(R4), V6
VMOVQ 160(R4), V5
VMOVQ 128(R4), V4
VMOVQ 96(R4), V3
VMOVQ 64(R4), V2
VMOVQ 32(R4), V1
VMOVQ 0(R4), V0
JMP restoreGPs
restoreLASX:
XVMOVQ 992(R4), X31
XVMOVQ 960(R4), X30
XVMOVQ 928(R4), X29
XVMOVQ 896(R4), X28
XVMOVQ 864(R4), X27
XVMOVQ 832(R4), X26
XVMOVQ 800(R4), X25
XVMOVQ 768(R4), X24
XVMOVQ 736(R4), X23
XVMOVQ 704(R4), X22
XVMOVQ 672(R4), X21
XVMOVQ 640(R4), X20
XVMOVQ 608(R4), X19
XVMOVQ 576(R4), X18
XVMOVQ 544(R4), X17
XVMOVQ 512(R4), X16
XVMOVQ 480(R4), X15
XVMOVQ 448(R4), X14
XVMOVQ 416(R4), X13
XVMOVQ 384(R4), X12
XVMOVQ 352(R4), X11
XVMOVQ 320(R4), X10
XVMOVQ 288(R4), X9
XVMOVQ 256(R4), X8
XVMOVQ 224(R4), X7
XVMOVQ 192(R4), X6
XVMOVQ 160(R4), X5
XVMOVQ 128(R4), X4
XVMOVQ 96(R4), X3
XVMOVQ 64(R4), X2
XVMOVQ 32(R4), X1
XVMOVQ 0(R4), X0
// Restore GPs
restoreGPs:
+ MOVV 216(R3), R5
BSTRPICKV $7, R5, $0, R4
MOVV R4, FCC0
BSTRPICKV $15, R5, $8, R4
@@ -99,38 +289,6 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
MOVV R4, FCC6
BSTRPICKV $63, R5, $56, R4
MOVV R4, FCC7
- MOVD 464(R3), F31
- MOVD 456(R3), F30
- MOVD 448(R3), F29
- MOVD 440(R3), F28
- MOVD 432(R3), F27
- MOVD 424(R3), F26
- MOVD 416(R3), F25
- MOVD 408(R3), F24
- MOVD 400(R3), F23
- MOVD 392(R3), F22
- MOVD 384(R3), F21
- MOVD 376(R3), F20
- MOVD 368(R3), F19
- MOVD 360(R3), F18
- MOVD 352(R3), F17
- MOVD 344(R3), F16
- MOVD 336(R3), F15
- MOVD 328(R3), F14
- MOVD 320(R3), F13
- MOVD 312(R3), F12
- MOVD 304(R3), F11
- MOVD 296(R3), F10
- MOVD 288(R3), F9
- MOVD 280(R3), F8
- MOVD 272(R3), F7
- MOVD 264(R3), F6
- MOVD 256(R3), F5
- MOVD 248(R3), F4
- MOVD 240(R3), F3
- MOVD 232(R3), F2
- MOVD 224(R3), F1
- MOVD 216(R3), F0
MOVV 208(R3), R31
MOVV 200(R3), R29
MOVV 192(R3), R28
@@ -157,7 +315,7 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
MOVV 24(R3), R6
MOVV 16(R3), R5
MOVV 8(R3), R4
- MOVV 480(R3), R1
+ MOVV 224(R3), R1
MOVV (R3), R30
- ADDV $488, R3
+ ADDV $232, R3
JMP (R30)
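
The frame arithmetic checks out: the only thing that left the stack is the
block of 32 floating-point slots, which now lives in p.xRegs instead.

	old frame: 8 (LR) + 26*8 (GPs at 8..208) + 32*8 (F0-F31) + 8 (FCC) = 480
	new frame: 8 (LR) + 26*8 (GPs at 8..208) + 8 (FCC at 216)          = 224
	epilogue:  frame + 8 for the PC slot sigctxt.pushCall left on the
	           stack, so ADDV $488 becomes ADDV $232.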

src/runtime/preempt_noxreg.go

@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
- //go:build !amd64 && !arm64
+ //go:build !amd64 && !arm64 && !loong64
// This provides common support for architectures that DO NOT use extended
// register state in asynchronous preemption.
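
With loong64 added to the exclusion list, this file now builds only for
architectures that keep everything on the stack, so its declarations are
empty placeholders. A hypothetical sketch of that shape (names and details
assumed, not taken from the commit):

	// Hypothetical no-op side; the real preempt_noxreg.go may differ.
	type xRegPerG struct{}
	type xRegPerP struct{}

	func xRegSave(gp *g)    {} // nothing lives off-stack on these arches
	func xRegRestore(gp *g) {}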

src/runtime/preempt_xreg.go

@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
- //go:build amd64 || arm64
+ //go:build amd64 || arm64 || loong64
// This provides common support for architectures that use extended register
// state in asynchronous preemption.
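
The assembly addresses the per-P state through the assembler offset symbols
p_xRegs, xRegPerP_scratch, and xRegPerP_cache: the signal-time save writes
into inline scratch space, and the restore path follows a pointer that
asyncPreempt2 has filled in. That implies roughly the following shape (a
hedged reconstruction from those offsets, not the literal runtime
definitions):

	// Field names match the offset symbols used in the assembly;
	// everything else here is assumed.
	type xRegState struct {
		regs xRegs // one 32-byte slot per register, as generated above
	}

	type xRegPerP struct {
		scratch xRegs      // save target: ADDV $(p_xRegs+xRegPerP_scratch)
		cache   *xRegState // restore source: MOVV (p_xRegs+xRegPerP_cache)(R4), R4
	}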