[dev.simd] cmd/compile: ensure the whole X15 register is zeroed

On AMD64, we reserve the X15 register as the zero register.
Currently we use an SSE instruction to zero it, and we only use
it in SSE contexts. When the machine supports AVX, the high bits
of the register are not necessarily zeroed.

Now that the compiler generates AVX code for SIMD, it is useful
to have a zero register in AVX contexts as well. This CL zeroes
the whole X15 register if AVX is supported.

Change-Id: I4dc803362f2e007b1614b90de435fbb7814cebc7
Reviewed-on: https://go-review.googlesource.com/c/go/+/698237
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Junyang Shao <shaojunyang@google.com>
Reviewed-by: David Chase <drchase@google.com>
Cherry Mui 2025-08-21 14:37:18 -04:00
parent baea0c700b
commit 4c311aa38f
16 changed files with 78 additions and 8 deletions
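
Background on why SSE-only zeroing is insufficient: a legacy SSE instruction such as XORPS writes only the low 128 bits of a vector register, and on AVX-capable hardware it leaves the upper bits of the corresponding YMM/ZMM register untouched, whereas a VEX-encoded instruction such as VXORPS zeroes those upper bits as well. A minimal sketch of the difference, in Go assembler syntax (the stale-data scenario is illustrative):

	// Assume bits 128-255 of Y15 hold stale, nonzero data.
	XORPS	X15, X15	// legacy SSE: bits 0-127 become 0; bits 128-255 of Y15 unchanged
	VXORPS	X15, X15, X15	// VEX-encoded: bits 0-127 become 0, and all upper bits are zeroed

Once the compiler emits AVX instructions that read the full register, that upper garbage becomes observable, hence this change.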

View file

@@ -18,6 +18,7 @@ import (
"cmd/internal/obj"
"cmd/internal/obj/x86"
"internal/abi"
"internal/buildcfg"
)
// ssaMarkMoves marks any MOVXconst ops that need to avoid clobbering flags.
@@ -1290,7 +1291,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
case ssa.OpAMD64CALLstatic, ssa.OpAMD64CALLtail:
if s.ABI == obj.ABI0 && v.Aux.(*ssa.AuxCall).Fn.ABI() == obj.ABIInternal {
// zeroing X15 when entering ABIInternal from ABI0
opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
zeroX15(s)
// set G register from TLS
getgFromTLS(s, x86.REG_R14)
}
@@ -1301,7 +1302,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
s.Call(v)
if s.ABI == obj.ABIInternal && v.Aux.(*ssa.AuxCall).Fn.ABI() == obj.ABI0 {
// zeroing X15 when entering ABIInternal from ABI0
opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
zeroX15(s)
// set G register from TLS
getgFromTLS(s, x86.REG_R14)
}
@@ -1829,6 +1830,34 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
}
}
// zeroX15 zeroes the X15 register.
func zeroX15(s *ssagen.State) {
	vxorps := func(s *ssagen.State) {
		p := s.Prog(x86.AVXORPS)
		p.From.Type = obj.TYPE_REG
		p.From.Reg = x86.REG_X15
		p.AddRestSourceReg(x86.REG_X15)
		p.To.Type = obj.TYPE_REG
		p.To.Reg = x86.REG_X15
	}
	if buildcfg.GOAMD64 >= 3 {
		vxorps(s)
		return
	}
	// AVX may not be available; check before zeroing the high bits.
	p := s.Prog(x86.ACMPB)
	p.From.Type = obj.TYPE_MEM
	p.From.Name = obj.NAME_EXTERN
	p.From.Sym = ir.Syms.X86HasAVX
	p.To.Type = obj.TYPE_CONST
	p.To.Offset = 1
	jmp := s.Prog(x86.AJNE)
	jmp.To.Type = obj.TYPE_BRANCH
	vxorps(s)
	sse := opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
	jmp.To.SetTarget(sse)
}
// Example instruction: VRSQRTPS X1, X1
func simdV11(s *ssagen.State, v *ssa.Value) *obj.Prog {
p := s.Prog(v.Op.Asm())

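When GOAMD64 >= v3, AVX is part of the baseline and zeroX15 emits VXORPS unconditionally. Below v3 the AVX check happens at run time; a sketch of the emitted sequence (the sse label is illustrative, since the compiler patches the branch target directly rather than emitting a named label):

	CMPB	runtime·x86HasAVX(SB), $1
	JNE	sse
	VXORPS	X15, X15, X15	// AVX path: zeroes the whole register
sse:
	XORPS	X15, X15	// zeroes the low 128 bits

Note that JNE targets the XORPS itself, so the AVX path falls through and executes both instructions; re-zeroing the already-zero low 128 bits is harmless and avoids a second branch.
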
View file

@@ -68,6 +68,7 @@ type symsStruct struct {
Loong64HasLAM_BH *obj.LSym
Loong64HasLSX *obj.LSym
RISCV64HasZbb *obj.LSym
X86HasAVX *obj.LSym
X86HasFMA *obj.LSym
X86HasPOPCNT *obj.LSym
X86HasSSE41 *obj.LSym

View file

@@ -150,9 +150,10 @@ func InitConfig() {
ir.Syms.TypeAssert = typecheck.LookupRuntimeFunc("typeAssert")
ir.Syms.WBZero = typecheck.LookupRuntimeFunc("wbZero")
ir.Syms.WBMove = typecheck.LookupRuntimeFunc("wbMove")
ir.Syms.X86HasAVX = typecheck.LookupRuntimeVar("x86HasAVX") // bool
ir.Syms.X86HasFMA = typecheck.LookupRuntimeVar("x86HasFMA") // bool
ir.Syms.X86HasPOPCNT = typecheck.LookupRuntimeVar("x86HasPOPCNT") // bool
ir.Syms.X86HasSSE41 = typecheck.LookupRuntimeVar("x86HasSSE41") // bool
ir.Syms.X86HasFMA = typecheck.LookupRuntimeVar("x86HasFMA") // bool
ir.Syms.ARMHasVFPv4 = typecheck.LookupRuntimeVar("armHasVFPv4") // bool
ir.Syms.ARM64HasATOMICS = typecheck.LookupRuntimeVar("arm64HasATOMICS") // bool
ir.Syms.Loong64HasLAMCAS = typecheck.LookupRuntimeVar("loong64HasLAMCAS") // bool
@@ -7714,4 +7715,3 @@ func isStructNotSIMD(t *types.Type) bool {
}
var BoundsCheckFunc [ssa.BoundsKindCount]*obj.LSym

View file

@@ -284,9 +284,10 @@ func libfuzzerHookEqualFold(string, string, uint)
func addCovMeta(p unsafe.Pointer, len uint32, hash [16]byte, pkpath string, pkgId int, cmode uint8, cgran uint8) uint32
// architecture variants
var x86HasAVX bool
var x86HasFMA bool
var x86HasPOPCNT bool
var x86HasSSE41 bool
var x86HasFMA bool
var armHasVFPv4 bool
var arm64HasATOMICS bool
var loong64HasLAMCAS bool

View file

@@ -232,9 +232,10 @@ var runtimeDecls = [...]struct {
{"libfuzzerHookStrCmp", funcTag, 155},
{"libfuzzerHookEqualFold", funcTag, 155},
{"addCovMeta", funcTag, 157},
{"x86HasAVX", varTag, 6},
{"x86HasFMA", varTag, 6},
{"x86HasPOPCNT", varTag, 6},
{"x86HasSSE41", varTag, 6},
{"x86HasFMA", varTag, 6},
{"armHasVFPv4", varTag, 6},
{"arm64HasATOMICS", varTag, 6},
{"loong64HasLAMCAS", varTag, 6},

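(These two hunks appear to be cmd/compile's builtin declaration list and the table generated from it by mkbuiltin.go; the generated entries must stay in the same order as the declarations, which is why x86HasAVX is inserted, and x86HasFMA moved, identically in both.)
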
View file

@@ -1015,6 +1015,9 @@ needm:
// there's no need to handle that. Clear R14 so that there's
// a bad value in there, in case needm tries to use it.
XORPS X15, X15
CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
JNE 2(PC)
VXORPS X15, X15, X15
XORQ R14, R14
MOVQ $runtime·needAndBindM<ABIInternal>(SB), AX
CALL AX
@@ -1712,6 +1715,9 @@ TEXT ·sigpanic0(SB),NOSPLIT,$0-0
get_tls(R14)
MOVQ g(R14), R14
XORPS X15, X15
CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
JNE 2(PC)
VXORPS X15, X15, X15
JMP ·sigpanic<ABIInternal>(SB)
// gcWriteBarrier informs the GC about heap pointer writes.

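The hand-written runtime assembly expresses the same check with the Plan 9 assembler's PC-relative branch: JNE 2(PC) skips exactly one instruction, so VXORPS runs only when the AVX flag byte equals 1. An annotated version of the recurring three-instruction pattern (comments added here for clarity):

	XORPS	X15, X15	// always zero the low 128 bits
	CMPB	internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
	JNE	2(PC)	// no AVX: skip the next instruction
	VXORPS	X15, X15, X15	// AVX: zero the upper bits too

const_offsetX86HasAVX is the offset of the HasAVX field within internal/cpu's X86 struct, made visible to assembly through go_asm.h.
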
View file

@@ -28,9 +28,10 @@ const (
var (
// Set in runtime.cpuinit.
// TODO: deprecate these; use internal/cpu directly.
x86HasAVX bool
x86HasFMA bool
x86HasPOPCNT bool
x86HasSSE41 bool
x86HasFMA bool
armHasVFPv4 bool

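Note that the same feature bit is now reachable two ways: compiler-generated code loads the runtime variable x86HasAVX declared here and set in cpuinit below, while the hand-written assembly hunks in this CL read internal∕cpu·X86's HasAVX field directly via its offset constant. The TODO above suggests the runtime copies are eventually meant to give way to direct internal/cpu access.
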
View file

@@ -766,9 +766,10 @@ func cpuinit(env string) {
// to guard execution of instructions that can not be assumed to be always supported.
switch GOARCH {
case "386", "amd64":
x86HasAVX = cpu.X86.HasAVX
x86HasFMA = cpu.X86.HasFMA
x86HasPOPCNT = cpu.X86.HasPOPCNT
x86HasSSE41 = cpu.X86.HasSSE41
x86HasFMA = cpu.X86.HasFMA
case "arm":
armHasVFPv4 = cpu.ARM.HasVFPv4

View file

@@ -456,6 +456,9 @@ call:
// Back to Go world, set special registers.
// The g register (R14) is preserved in C.
XORPS X15, X15
CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
JNE 2(PC)
VXORPS X15, X15, X15
RET
// C->Go callback thunk that allows to call runtime·racesymbolize from C code.

View file

@@ -177,6 +177,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0
get_tls(R12)
MOVQ g(R12), R14
PXOR X15, X15
CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
JNE 2(PC)
VXORPS X15, X15, X15
// Reserve space for spill slots.
NOP SP // disable vet stack checking

View file

@@ -228,6 +228,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0
get_tls(R12)
MOVQ g(R12), R14
PXOR X15, X15
CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
JNE 2(PC)
VXORPS X15, X15, X15
// Reserve space for spill slots.
NOP SP // disable vet stack checking

View file

@@ -265,6 +265,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0
get_tls(R12)
MOVQ g(R12), R14
PXOR X15, X15
CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
JNE 2(PC)
VXORPS X15, X15, X15
// Reserve space for spill slots.
NOP SP // disable vet stack checking
@@ -290,6 +293,9 @@ TEXT runtime·sigprofNonGoWrapper<>(SB),NOSPLIT|NOFRAME,$0
get_tls(R12)
MOVQ g(R12), R14
PXOR X15, X15
CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
JNE 2(PC)
VXORPS X15, X15, X15
// Reserve space for spill slots.
NOP SP // disable vet stack checking

View file

@@ -340,6 +340,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0
get_tls(R12)
MOVQ g(R12), R14
PXOR X15, X15
CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
JNE 2(PC)
VXORPS X15, X15, X15
// Reserve space for spill slots.
NOP SP // disable vet stack checking
@@ -365,6 +368,9 @@ TEXT runtime·sigprofNonGoWrapper<>(SB),NOSPLIT|NOFRAME,$0
get_tls(R12)
MOVQ g(R12), R14
PXOR X15, X15
CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
JNE 2(PC)
VXORPS X15, X15, X15
// Reserve space for spill slots.
NOP SP // disable vet stack checking

View file

@@ -310,6 +310,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0
get_tls(R12)
MOVQ g(R12), R14
PXOR X15, X15
CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
JNE 2(PC)
VXORPS X15, X15, X15
// Reserve space for spill slots.
NOP SP // disable vet stack checking

View file

@@ -64,6 +64,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0
get_tls(R12)
MOVQ g(R12), R14
PXOR X15, X15
CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
JNE 2(PC)
VXORPS X15, X15, X15
// Reserve space for spill slots.
NOP SP // disable vet stack checking

View file

@@ -32,6 +32,9 @@ TEXT sigtramp<>(SB),NOSPLIT,$0-0
// R14 is cleared in case there's a non-zero value in there
// if called from a non-go thread.
XORPS X15, X15
CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
JNE 2(PC)
VXORPS X15, X15, X15
XORQ R14, R14
get_tls(AX)