[dev.simd] cmd/compile: ensure the whole X15 register is zeroed

On AMD64, we reserve the X15 register as the zero register.
Currently we use an SSE instruction to zero it, and we only use
it in SSE contexts. When the machine supports AVX, the high bits
of the register are not necessarily zeroed.

Now that the compiler generates AVX code for SIMD, it would be
great to have a zero register in the AVX context. This CL zeroes
the whole X15 register if AVX is supported.

Change-Id: I4dc803362f2e007b1614b90de435fbb7814cebc7
Reviewed-on: https://go-review.googlesource.com/c/go/+/698237
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Junyang Shao <shaojunyang@google.com>
Reviewed-by: David Chase <drchase@google.com>
commit 4c311aa38f (parent baea0c700b)
Author: Cherry Mui
Date:   2025-08-21 14:37:18 -04:00

16 changed files with 78 additions and 8 deletions
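
Background for the diff below: a legacy-SSE encoded instruction such as XORPS writes only the low 128 bits of a vector register, and on an AVX-capable machine it leaves the upper bits untouched, whereas a VEX-encoded instruction zeroes the destination's upper bits. A minimal illustration (not part of the change itself):

	XORPS	X15, X15        // SSE encoding: bits 0-127 = 0; bits 128 and up unchanged
	VXORPS	X15, X15, X15   // VEX encoding: bits 0-127 = 0; upper bits zeroed as well

Hence every place that zeroed X15 with XORPS (or PXOR) now pairs it with a VXORPS guarded by a runtime AVX check.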

View file

@@ -18,6 +18,7 @@ import (
 	"cmd/internal/obj"
 	"cmd/internal/obj/x86"
 	"internal/abi"
+	"internal/buildcfg"
 )
 
 // ssaMarkMoves marks any MOVXconst ops that need to avoid clobbering flags.
@@ -1290,7 +1291,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 	case ssa.OpAMD64CALLstatic, ssa.OpAMD64CALLtail:
 		if s.ABI == obj.ABI0 && v.Aux.(*ssa.AuxCall).Fn.ABI() == obj.ABIInternal {
 			// zeroing X15 when entering ABIInternal from ABI0
-			opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
+			zeroX15(s)
 			// set G register from TLS
 			getgFromTLS(s, x86.REG_R14)
 		}
@@ -1301,7 +1302,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 		s.Call(v)
 		if s.ABI == obj.ABIInternal && v.Aux.(*ssa.AuxCall).Fn.ABI() == obj.ABI0 {
 			// zeroing X15 when entering ABIInternal from ABI0
-			opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
+			zeroX15(s)
 			// set G register from TLS
 			getgFromTLS(s, x86.REG_R14)
 		}
@@ -1829,6 +1830,34 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 	}
 }
 
+// zeroX15 zeroes the X15 register.
+func zeroX15(s *ssagen.State) {
+	vxorps := func(s *ssagen.State) {
+		p := s.Prog(x86.AVXORPS)
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = x86.REG_X15
+		p.AddRestSourceReg(x86.REG_X15)
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = x86.REG_X15
+	}
+	if buildcfg.GOAMD64 >= 3 {
+		vxorps(s)
+		return
+	}
+	// AVX may not be available, check before zeroing the high bits.
+	p := s.Prog(x86.ACMPB)
+	p.From.Type = obj.TYPE_MEM
+	p.From.Name = obj.NAME_EXTERN
+	p.From.Sym = ir.Syms.X86HasAVX
+	p.To.Type = obj.TYPE_CONST
+	p.To.Offset = 1
+	jmp := s.Prog(x86.AJNE)
+	jmp.To.Type = obj.TYPE_BRANCH
+	vxorps(s)
+	sse := opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
+	jmp.To.SetTarget(sse)
+}
+
 // Example instruction: VRSQRTPS X1, X1
 func simdV11(s *ssagen.State, v *ssa.Value) *obj.Prog {
 	p := s.Prog(v.Op.Asm())
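
Read off the diff above, the sequence zeroX15 emits when GOAMD64 < v3 is roughly the following (the label name is illustrative; in the code the branch target is the Prog returned by opregreg):

	CMPB	runtime·x86HasAVX(SB), $1   // the symbol behind ir.Syms.X86HasAVX
	JNE	sse                         // no AVX: skip the VEX instruction
	VXORPS	X15, X15, X15               // AVX: zeroes the whole register
sse:
	XORPS	X15, X15                    // both paths: zero the low 128 bits

When GOAMD64 >= v3, AVX is guaranteed by the microarchitecture level, so only the VXORPS is emitted.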

View file

@@ -68,6 +68,7 @@ type symsStruct struct {
 	Loong64HasLAM_BH *obj.LSym
 	Loong64HasLSX    *obj.LSym
 	RISCV64HasZbb    *obj.LSym
+	X86HasAVX        *obj.LSym
 	X86HasFMA        *obj.LSym
 	X86HasPOPCNT     *obj.LSym
 	X86HasSSE41      *obj.LSym

View file

@@ -150,9 +150,10 @@ func InitConfig() {
 	ir.Syms.TypeAssert = typecheck.LookupRuntimeFunc("typeAssert")
 	ir.Syms.WBZero = typecheck.LookupRuntimeFunc("wbZero")
 	ir.Syms.WBMove = typecheck.LookupRuntimeFunc("wbMove")
+	ir.Syms.X86HasAVX = typecheck.LookupRuntimeVar("x86HasAVX")       // bool
+	ir.Syms.X86HasFMA = typecheck.LookupRuntimeVar("x86HasFMA")       // bool
 	ir.Syms.X86HasPOPCNT = typecheck.LookupRuntimeVar("x86HasPOPCNT") // bool
 	ir.Syms.X86HasSSE41 = typecheck.LookupRuntimeVar("x86HasSSE41")   // bool
-	ir.Syms.X86HasFMA = typecheck.LookupRuntimeVar("x86HasFMA")       // bool
 	ir.Syms.ARMHasVFPv4 = typecheck.LookupRuntimeVar("armHasVFPv4")   // bool
 	ir.Syms.ARM64HasATOMICS = typecheck.LookupRuntimeVar("arm64HasATOMICS")   // bool
 	ir.Syms.Loong64HasLAMCAS = typecheck.LookupRuntimeVar("loong64HasLAMCAS") // bool
@@ -7714,4 +7715,3 @@ func isStructNotSIMD(t *types.Type) bool {
 }
 
 var BoundsCheckFunc [ssa.BoundsKindCount]*obj.LSym
-

View file

@@ -284,9 +284,10 @@ func libfuzzerHookEqualFold(string, string, uint)
 func addCovMeta(p unsafe.Pointer, len uint32, hash [16]byte, pkpath string, pkgId int, cmode uint8, cgran uint8) uint32
 
 // architecture variants
+var x86HasAVX bool
+var x86HasFMA bool
 var x86HasPOPCNT bool
 var x86HasSSE41 bool
-var x86HasFMA bool
 var armHasVFPv4 bool
 var arm64HasATOMICS bool
 var loong64HasLAMCAS bool

View file

@@ -232,9 +232,10 @@ var runtimeDecls = [...]struct {
 	{"libfuzzerHookStrCmp", funcTag, 155},
 	{"libfuzzerHookEqualFold", funcTag, 155},
 	{"addCovMeta", funcTag, 157},
+	{"x86HasAVX", varTag, 6},
+	{"x86HasFMA", varTag, 6},
 	{"x86HasPOPCNT", varTag, 6},
 	{"x86HasSSE41", varTag, 6},
-	{"x86HasFMA", varTag, 6},
 	{"armHasVFPv4", varTag, 6},
 	{"arm64HasATOMICS", varTag, 6},
 	{"loong64HasLAMCAS", varTag, 6},

View file

@@ -1015,6 +1015,9 @@ needm:
 	// there's no need to handle that. Clear R14 so that there's
 	// a bad value in there, in case needm tries to use it.
 	XORPS	X15, X15
+	CMPB	internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+	JNE	2(PC)
+	VXORPS	X15, X15, X15
 	XORQ	R14, R14
 	MOVQ	$runtime·needAndBindM<ABIInternal>(SB), AX
 	CALL	AX
@@ -1712,6 +1715,9 @@ TEXT ·sigpanic0(SB),NOSPLIT,$0-0
 	get_tls(R14)
 	MOVQ	g(R14), R14
 	XORPS	X15, X15
+	CMPB	internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+	JNE	2(PC)
+	VXORPS	X15, X15, X15
 	JMP	·sigpanic<ABIInternal>(SB)
 
 // gcWriteBarrier informs the GC about heap pointer writes.
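
The same guard recurs in the remaining assembly files below. 2(PC) is the Go assembler's PC-relative branch form, meaning "two instructions ahead of this one", so the JNE skips exactly the VXORPS on machines without AVX. An annotated restatement of the pattern:

	XORPS	X15, X15                                        // always: zero the low 128 bits
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX(SB), $1  // AVX available?
	JNE	2(PC)                                           // no: skip the next instruction
	VXORPS	X15, X15, X15                                   // yes: zero the full register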

View file

@@ -28,9 +28,10 @@ const (
 var (
 	// Set in runtime.cpuinit.
 	// TODO: deprecate these; use internal/cpu directly.
+	x86HasAVX    bool
+	x86HasFMA    bool
 	x86HasPOPCNT bool
 	x86HasSSE41  bool
-	x86HasFMA    bool
 
 	armHasVFPv4 bool

View file

@@ -766,9 +766,10 @@ func cpuinit(env string) {
 	// to guard execution of instructions that can not be assumed to be always supported.
 	switch GOARCH {
 	case "386", "amd64":
+		x86HasAVX = cpu.X86.HasAVX
+		x86HasFMA = cpu.X86.HasFMA
 		x86HasPOPCNT = cpu.X86.HasPOPCNT
 		x86HasSSE41 = cpu.X86.HasSSE41
-		x86HasFMA = cpu.X86.HasFMA
 
 	case "arm":
 		armHasVFPv4 = cpu.ARM.HasVFPv4

View file

@@ -456,6 +456,9 @@ call:
 	// Back to Go world, set special registers.
 	// The g register (R14) is preserved in C.
 	XORPS	X15, X15
+	CMPB	internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+	JNE	2(PC)
+	VXORPS	X15, X15, X15
 	RET
 
 // C->Go callback thunk that allows to call runtime·racesymbolize from C code.

View file

@@ -177,6 +177,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0
 	get_tls(R12)
 	MOVQ	g(R12), R14
 	PXOR	X15, X15
+	CMPB	internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+	JNE	2(PC)
+	VXORPS	X15, X15, X15
 
 	// Reserve space for spill slots.
 	NOP	SP	// disable vet stack checking

View file

@@ -228,6 +228,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0
 	get_tls(R12)
 	MOVQ	g(R12), R14
 	PXOR	X15, X15
+	CMPB	internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+	JNE	2(PC)
+	VXORPS	X15, X15, X15
 
 	// Reserve space for spill slots.
 	NOP	SP	// disable vet stack checking

View file

@@ -265,6 +265,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0
 	get_tls(R12)
 	MOVQ	g(R12), R14
 	PXOR	X15, X15
+	CMPB	internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+	JNE	2(PC)
+	VXORPS	X15, X15, X15
 
 	// Reserve space for spill slots.
 	NOP	SP	// disable vet stack checking
@@ -290,6 +293,9 @@ TEXT runtime·sigprofNonGoWrapper<>(SB),NOSPLIT|NOFRAME,$0
 	get_tls(R12)
 	MOVQ	g(R12), R14
 	PXOR	X15, X15
+	CMPB	internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+	JNE	2(PC)
+	VXORPS	X15, X15, X15
 
 	// Reserve space for spill slots.
 	NOP	SP	// disable vet stack checking

View file

@@ -340,6 +340,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0
 	get_tls(R12)
 	MOVQ	g(R12), R14
 	PXOR	X15, X15
+	CMPB	internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+	JNE	2(PC)
+	VXORPS	X15, X15, X15
 
 	// Reserve space for spill slots.
 	NOP	SP	// disable vet stack checking
@@ -365,6 +368,9 @@ TEXT runtime·sigprofNonGoWrapper<>(SB),NOSPLIT|NOFRAME,$0
 	get_tls(R12)
 	MOVQ	g(R12), R14
 	PXOR	X15, X15
+	CMPB	internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+	JNE	2(PC)
+	VXORPS	X15, X15, X15
 
 	// Reserve space for spill slots.
 	NOP	SP	// disable vet stack checking

View file

@@ -310,6 +310,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0
 	get_tls(R12)
 	MOVQ	g(R12), R14
 	PXOR	X15, X15
+	CMPB	internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+	JNE	2(PC)
+	VXORPS	X15, X15, X15
 
 	// Reserve space for spill slots.
 	NOP	SP	// disable vet stack checking

View file

@@ -64,6 +64,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0
 	get_tls(R12)
 	MOVQ	g(R12), R14
 	PXOR	X15, X15
+	CMPB	internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+	JNE	2(PC)
+	VXORPS	X15, X15, X15
 
 	// Reserve space for spill slots.
 	NOP	SP	// disable vet stack checking

View file

@@ -32,6 +32,9 @@ TEXT sigtramp<>(SB),NOSPLIT,$0-0
 	// R14 is cleared in case there's a non-zero value in there
 	// if called from a non-go thread.
 	XORPS	X15, X15
+	CMPB	internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
+	JNE	2(PC)
+	VXORPS	X15, X15, X15
 	XORQ	R14, R14
 	get_tls(AX)