From 4c311aa38f6e354ec4d9f5882a16c36a2e4b0f36 Mon Sep 17 00:00:00 2001
From: Cherry Mui
Date: Thu, 21 Aug 2025 14:37:18 -0400
Subject: [PATCH] [dev.simd] cmd/compile: ensure the whole X15 register is
 zeroed

On AMD64, we reserve the X15 register as the zero register. Currently
we use an SSE instruction to zero it, and we rely on the zero only in
SSE contexts. When the machine supports AVX, the high bits of the
register are not necessarily zeroed. Now that the compiler generates
AVX code for SIMD, it is useful to have a zero register in the AVX
context as well. This CL zeroes the whole X15 register if AVX is
supported.

Change-Id: I4dc803362f2e007b1614b90de435fbb7814cebc7
Reviewed-on: https://go-review.googlesource.com/c/go/+/698237
LUCI-TryBot-Result: Go LUCI
Reviewed-by: Junyang Shao
Reviewed-by: David Chase
---
 src/cmd/compile/internal/amd64/ssa.go         | 33 +++++++++++++++++--
 src/cmd/compile/internal/ir/symtab.go         |  1 +
 src/cmd/compile/internal/ssagen/ssa.go        |  4 +--
 .../internal/typecheck/_builtin/runtime.go    |  3 +-
 src/cmd/compile/internal/typecheck/builtin.go |  3 +-
 src/runtime/asm_amd64.s                       |  6 ++++
 src/runtime/cpuflags.go                       |  3 +-
 src/runtime/proc.go                           |  3 +-
 src/runtime/race_amd64.s                      |  3 ++
 src/runtime/sys_darwin_amd64.s                |  3 ++
 src/runtime/sys_dragonfly_amd64.s             |  3 ++
 src/runtime/sys_freebsd_amd64.s               |  6 ++++
 src/runtime/sys_linux_amd64.s                 |  6 ++++
 src/runtime/sys_netbsd_amd64.s                |  3 ++
 src/runtime/sys_openbsd_amd64.s               |  3 ++
 src/runtime/sys_windows_amd64.s               |  3 ++
 16 files changed, 78 insertions(+), 8 deletions(-)

diff --git a/src/cmd/compile/internal/amd64/ssa.go b/src/cmd/compile/internal/amd64/ssa.go
index 3ae3c617646..f511e75e972 100644
--- a/src/cmd/compile/internal/amd64/ssa.go
+++ b/src/cmd/compile/internal/amd64/ssa.go
@@ -18,6 +18,7 @@ import (
 	"cmd/internal/obj"
 	"cmd/internal/obj/x86"
 	"internal/abi"
+	"internal/buildcfg"
 )

 // ssaMarkMoves marks any MOVXconst ops that need to avoid clobbering flags.
@@ -1290,7 +1291,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 	case ssa.OpAMD64CALLstatic, ssa.OpAMD64CALLtail:
 		if s.ABI == obj.ABI0 && v.Aux.(*ssa.AuxCall).Fn.ABI() == obj.ABIInternal {
 			// zeroing X15 when entering ABIInternal from ABI0
-			opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
+			zeroX15(s)
 			// set G register from TLS
 			getgFromTLS(s, x86.REG_R14)
 		}
@@ -1301,7 +1302,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 		s.Call(v)
 		if s.ABI == obj.ABIInternal && v.Aux.(*ssa.AuxCall).Fn.ABI() == obj.ABI0 {
 			// zeroing X15 when entering ABIInternal from ABI0
-			opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
+			zeroX15(s)
 			// set G register from TLS
 			getgFromTLS(s, x86.REG_R14)
 		}
@@ -1829,6 +1830,34 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 	}
 }

+// zeroX15 zeroes the X15 register.
+func zeroX15(s *ssagen.State) {
+	vxorps := func(s *ssagen.State) {
+		p := s.Prog(x86.AVXORPS)
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = x86.REG_X15
+		p.AddRestSourceReg(x86.REG_X15)
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = x86.REG_X15
+	}
+	if buildcfg.GOAMD64 >= 3 {
+		vxorps(s)
+		return
+	}
+	// AVX may not be available; check at run time before zeroing the high bits.
+ p := s.Prog(x86.ACMPB) + p.From.Type = obj.TYPE_MEM + p.From.Name = obj.NAME_EXTERN + p.From.Sym = ir.Syms.X86HasAVX + p.To.Type = obj.TYPE_CONST + p.To.Offset = 1 + jmp := s.Prog(x86.AJNE) + jmp.To.Type = obj.TYPE_BRANCH + vxorps(s) + sse := opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15) + jmp.To.SetTarget(sse) +} + // Example instruction: VRSQRTPS X1, X1 func simdV11(s *ssagen.State, v *ssa.Value) *obj.Prog { p := s.Prog(v.Op.Asm()) diff --git a/src/cmd/compile/internal/ir/symtab.go b/src/cmd/compile/internal/ir/symtab.go index ee0f52fbf3f..2222a5444aa 100644 --- a/src/cmd/compile/internal/ir/symtab.go +++ b/src/cmd/compile/internal/ir/symtab.go @@ -68,6 +68,7 @@ type symsStruct struct { Loong64HasLAM_BH *obj.LSym Loong64HasLSX *obj.LSym RISCV64HasZbb *obj.LSym + X86HasAVX *obj.LSym X86HasFMA *obj.LSym X86HasPOPCNT *obj.LSym X86HasSSE41 *obj.LSym diff --git a/src/cmd/compile/internal/ssagen/ssa.go b/src/cmd/compile/internal/ssagen/ssa.go index abb6370a15f..57129817f6c 100644 --- a/src/cmd/compile/internal/ssagen/ssa.go +++ b/src/cmd/compile/internal/ssagen/ssa.go @@ -150,9 +150,10 @@ func InitConfig() { ir.Syms.TypeAssert = typecheck.LookupRuntimeFunc("typeAssert") ir.Syms.WBZero = typecheck.LookupRuntimeFunc("wbZero") ir.Syms.WBMove = typecheck.LookupRuntimeFunc("wbMove") + ir.Syms.X86HasAVX = typecheck.LookupRuntimeVar("x86HasAVX") // bool + ir.Syms.X86HasFMA = typecheck.LookupRuntimeVar("x86HasFMA") // bool ir.Syms.X86HasPOPCNT = typecheck.LookupRuntimeVar("x86HasPOPCNT") // bool ir.Syms.X86HasSSE41 = typecheck.LookupRuntimeVar("x86HasSSE41") // bool - ir.Syms.X86HasFMA = typecheck.LookupRuntimeVar("x86HasFMA") // bool ir.Syms.ARMHasVFPv4 = typecheck.LookupRuntimeVar("armHasVFPv4") // bool ir.Syms.ARM64HasATOMICS = typecheck.LookupRuntimeVar("arm64HasATOMICS") // bool ir.Syms.Loong64HasLAMCAS = typecheck.LookupRuntimeVar("loong64HasLAMCAS") // bool @@ -7714,4 +7715,3 @@ func isStructNotSIMD(t *types.Type) bool { } var BoundsCheckFunc [ssa.BoundsKindCount]*obj.LSym - diff --git a/src/cmd/compile/internal/typecheck/_builtin/runtime.go b/src/cmd/compile/internal/typecheck/_builtin/runtime.go index 296bfdc281d..1e4d0b7db6e 100644 --- a/src/cmd/compile/internal/typecheck/_builtin/runtime.go +++ b/src/cmd/compile/internal/typecheck/_builtin/runtime.go @@ -284,9 +284,10 @@ func libfuzzerHookEqualFold(string, string, uint) func addCovMeta(p unsafe.Pointer, len uint32, hash [16]byte, pkpath string, pkgId int, cmode uint8, cgran uint8) uint32 // architecture variants +var x86HasAVX bool +var x86HasFMA bool var x86HasPOPCNT bool var x86HasSSE41 bool -var x86HasFMA bool var armHasVFPv4 bool var arm64HasATOMICS bool var loong64HasLAMCAS bool diff --git a/src/cmd/compile/internal/typecheck/builtin.go b/src/cmd/compile/internal/typecheck/builtin.go index 535f0fb7e88..6b8c6d7bad5 100644 --- a/src/cmd/compile/internal/typecheck/builtin.go +++ b/src/cmd/compile/internal/typecheck/builtin.go @@ -232,9 +232,10 @@ var runtimeDecls = [...]struct { {"libfuzzerHookStrCmp", funcTag, 155}, {"libfuzzerHookEqualFold", funcTag, 155}, {"addCovMeta", funcTag, 157}, + {"x86HasAVX", varTag, 6}, + {"x86HasFMA", varTag, 6}, {"x86HasPOPCNT", varTag, 6}, {"x86HasSSE41", varTag, 6}, - {"x86HasFMA", varTag, 6}, {"armHasVFPv4", varTag, 6}, {"arm64HasATOMICS", varTag, 6}, {"loong64HasLAMCAS", varTag, 6}, diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s index cf1d49a4ad8..f8ebd030b61 100644 --- a/src/runtime/asm_amd64.s +++ b/src/runtime/asm_amd64.s @@ -1015,6 +1015,9 @@ needm: // there's no need to handle 
that. Clear R14 so that there's // a bad value in there, in case needm tries to use it. XORPS X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 XORQ R14, R14 MOVQ $runtime·needAndBindM(SB), AX CALL AX @@ -1712,6 +1715,9 @@ TEXT ·sigpanic0(SB),NOSPLIT,$0-0 get_tls(R14) MOVQ g(R14), R14 XORPS X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 JMP ·sigpanic(SB) // gcWriteBarrier informs the GC about heap pointer writes. diff --git a/src/runtime/cpuflags.go b/src/runtime/cpuflags.go index 6452364b68e..67ed081ef6d 100644 --- a/src/runtime/cpuflags.go +++ b/src/runtime/cpuflags.go @@ -28,9 +28,10 @@ const ( var ( // Set in runtime.cpuinit. // TODO: deprecate these; use internal/cpu directly. + x86HasAVX bool + x86HasFMA bool x86HasPOPCNT bool x86HasSSE41 bool - x86HasFMA bool armHasVFPv4 bool diff --git a/src/runtime/proc.go b/src/runtime/proc.go index 68647d771fe..1d597d59c2f 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -766,9 +766,10 @@ func cpuinit(env string) { // to guard execution of instructions that can not be assumed to be always supported. switch GOARCH { case "386", "amd64": + x86HasAVX = cpu.X86.HasAVX + x86HasFMA = cpu.X86.HasFMA x86HasPOPCNT = cpu.X86.HasPOPCNT x86HasSSE41 = cpu.X86.HasSSE41 - x86HasFMA = cpu.X86.HasFMA case "arm": armHasVFPv4 = cpu.ARM.HasVFPv4 diff --git a/src/runtime/race_amd64.s b/src/runtime/race_amd64.s index e19118bd54e..23f2e59e3d4 100644 --- a/src/runtime/race_amd64.s +++ b/src/runtime/race_amd64.s @@ -456,6 +456,9 @@ call: // Back to Go world, set special registers. // The g register (R14) is preserved in C. XORPS X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 RET // C->Go callback thunk that allows to call runtime·racesymbolize from C code. diff --git a/src/runtime/sys_darwin_amd64.s b/src/runtime/sys_darwin_amd64.s index cc4e52d305a..0091546f204 100644 --- a/src/runtime/sys_darwin_amd64.s +++ b/src/runtime/sys_darwin_amd64.s @@ -177,6 +177,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0 get_tls(R12) MOVQ g(R12), R14 PXOR X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 // Reserve space for spill slots. NOP SP // disable vet stack checking diff --git a/src/runtime/sys_dragonfly_amd64.s b/src/runtime/sys_dragonfly_amd64.s index a223c2cf76b..84bf326aad3 100644 --- a/src/runtime/sys_dragonfly_amd64.s +++ b/src/runtime/sys_dragonfly_amd64.s @@ -228,6 +228,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0 get_tls(R12) MOVQ g(R12), R14 PXOR X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 // Reserve space for spill slots. NOP SP // disable vet stack checking diff --git a/src/runtime/sys_freebsd_amd64.s b/src/runtime/sys_freebsd_amd64.s index 977ea093d24..a1fa3a6fa29 100644 --- a/src/runtime/sys_freebsd_amd64.s +++ b/src/runtime/sys_freebsd_amd64.s @@ -265,6 +265,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0 get_tls(R12) MOVQ g(R12), R14 PXOR X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 // Reserve space for spill slots. NOP SP // disable vet stack checking @@ -290,6 +293,9 @@ TEXT runtime·sigprofNonGoWrapper<>(SB),NOSPLIT|NOFRAME,$0 get_tls(R12) MOVQ g(R12), R14 PXOR X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 // Reserve space for spill slots. 
NOP SP // disable vet stack checking diff --git a/src/runtime/sys_linux_amd64.s b/src/runtime/sys_linux_amd64.s index 941f70b0e8e..02505c2fb0a 100644 --- a/src/runtime/sys_linux_amd64.s +++ b/src/runtime/sys_linux_amd64.s @@ -340,6 +340,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0 get_tls(R12) MOVQ g(R12), R14 PXOR X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 // Reserve space for spill slots. NOP SP // disable vet stack checking @@ -365,6 +368,9 @@ TEXT runtime·sigprofNonGoWrapper<>(SB),NOSPLIT|NOFRAME,$0 get_tls(R12) MOVQ g(R12), R14 PXOR X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 // Reserve space for spill slots. NOP SP // disable vet stack checking diff --git a/src/runtime/sys_netbsd_amd64.s b/src/runtime/sys_netbsd_amd64.s index 2f1ddcdc897..edc7f3d6ee0 100644 --- a/src/runtime/sys_netbsd_amd64.s +++ b/src/runtime/sys_netbsd_amd64.s @@ -310,6 +310,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0 get_tls(R12) MOVQ g(R12), R14 PXOR X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 // Reserve space for spill slots. NOP SP // disable vet stack checking diff --git a/src/runtime/sys_openbsd_amd64.s b/src/runtime/sys_openbsd_amd64.s index ff0bc2416aa..734dfe6478e 100644 --- a/src/runtime/sys_openbsd_amd64.s +++ b/src/runtime/sys_openbsd_amd64.s @@ -64,6 +64,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0 get_tls(R12) MOVQ g(R12), R14 PXOR X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 // Reserve space for spill slots. NOP SP // disable vet stack checking diff --git a/src/runtime/sys_windows_amd64.s b/src/runtime/sys_windows_amd64.s index e438599910f..b0b4d3cce65 100644 --- a/src/runtime/sys_windows_amd64.s +++ b/src/runtime/sys_windows_amd64.s @@ -32,6 +32,9 @@ TEXT sigtramp<>(SB),NOSPLIT,$0-0 // R14 is cleared in case there's a non-zero value in there // if called from a non-go thread. XORPS X15, X15 + CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1 + JNE 2(PC) + VXORPS X15, X15, X15 XORQ R14, R14 get_tls(AX)
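
For reference, the runtime hunks above all use the same three-instruction idiom: CMPB tests the internal/cpu AVX flag, JNE 2(PC) skips the next instruction when AVX is absent, and VXORPS X15, X15, X15, being VEX-encoded, zeroes not only the low 128 bits but every higher bit of the vector register (legacy SSE XORPS leaves the upper bits untouched). A minimal Go sketch of the same feature-gated dispatch is below; it assumes golang.org/x/sys/cpu as the public analogue of the runtime-internal internal/cpu flag and is an illustration, not part of this CL.

	package main

	import (
		"fmt"

		"golang.org/x/sys/cpu"
	)

	func main() {
		// Mirrors the patch's run-time check of x86HasAVX: take the wide
		// (VEX-encoded) zeroing path only when the CPU supports AVX.
		if cpu.X86.HasAVX {
			// VXORPS X15, X15, X15 zeroes the low 128 bits and, by the
			// VEX zero-extension rule, all higher bits of the register.
			fmt.Println("AVX: VXORPS zeroes the whole vector register")
		} else {
			// Legacy SSE XORPS X15, X15 writes only the low 128 bits,
			// leaving any upper YMM/ZMM bits unchanged.
			fmt.Println("no AVX: XORPS zeroes only the low 128 bits")
		}
	}

When GOAMD64 >= 3, the x86-64-v3 baseline guarantees AVX, so zeroX15 emits VXORPS unconditionally and the run-time check disappears from the generated code.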