mirror of
https://github.com/golang/go.git
synced 2025-12-08 06:10:04 +00:00
[dev.simd] cmd/compile: ensure the whole X15 register is zeroed
On AMD64, we reserve the X15 register as the zero register. Currently we use an SSE instruction to zero it, and we only use it in SSE contexts. When the machine supports AVX, the high bits of the register are not necessarily zeroed. Now that the compiler generates AVX code for SIMD, it would be great to have a zero register in the AVX context. This CL zeroes the whole X15 register if AVX is supported. Change-Id: I4dc803362f2e007b1614b90de435fbb7814cebc7 Reviewed-on: https://go-review.googlesource.com/c/go/+/698237 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Junyang Shao <shaojunyang@google.com> Reviewed-by: David Chase <drchase@google.com>
This commit is contained in:
parent
baea0c700b
commit
4c311aa38f
16 changed files with 78 additions and 8 deletions
|
|
@ -18,6 +18,7 @@ import (
|
|||
"cmd/internal/obj"
|
||||
"cmd/internal/obj/x86"
|
||||
"internal/abi"
|
||||
"internal/buildcfg"
|
||||
)
|
||||
|
||||
// ssaMarkMoves marks any MOVXconst ops that need to avoid clobbering flags.
|
||||
|
|
@ -1290,7 +1291,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
|
|||
case ssa.OpAMD64CALLstatic, ssa.OpAMD64CALLtail:
|
||||
if s.ABI == obj.ABI0 && v.Aux.(*ssa.AuxCall).Fn.ABI() == obj.ABIInternal {
|
||||
// zeroing X15 when entering ABIInternal from ABI0
|
||||
opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
|
||||
zeroX15(s)
|
||||
// set G register from TLS
|
||||
getgFromTLS(s, x86.REG_R14)
|
||||
}
|
||||
|
|
@ -1301,7 +1302,7 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
|
|||
s.Call(v)
|
||||
if s.ABI == obj.ABIInternal && v.Aux.(*ssa.AuxCall).Fn.ABI() == obj.ABI0 {
|
||||
// zeroing X15 when entering ABIInternal from ABI0
|
||||
opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
|
||||
zeroX15(s)
|
||||
// set G register from TLS
|
||||
getgFromTLS(s, x86.REG_R14)
|
||||
}
|
||||
|
|
@ -1829,6 +1830,34 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
|
|||
}
|
||||
}
|
||||
|
||||
// zeroX15 zeroes the X15 register.
|
||||
func zeroX15(s *ssagen.State) {
|
||||
vxorps := func(s *ssagen.State) {
|
||||
p := s.Prog(x86.AVXORPS)
|
||||
p.From.Type = obj.TYPE_REG
|
||||
p.From.Reg = x86.REG_X15
|
||||
p.AddRestSourceReg(x86.REG_X15)
|
||||
p.To.Type = obj.TYPE_REG
|
||||
p.To.Reg = x86.REG_X15
|
||||
}
|
||||
if buildcfg.GOAMD64 >= 3 {
|
||||
vxorps(s)
|
||||
return
|
||||
}
|
||||
// AVX may not be available, check before zeroing the high bits.
|
||||
p := s.Prog(x86.ACMPB)
|
||||
p.From.Type = obj.TYPE_MEM
|
||||
p.From.Name = obj.NAME_EXTERN
|
||||
p.From.Sym = ir.Syms.X86HasAVX
|
||||
p.To.Type = obj.TYPE_CONST
|
||||
p.To.Offset = 1
|
||||
jmp := s.Prog(x86.AJNE)
|
||||
jmp.To.Type = obj.TYPE_BRANCH
|
||||
vxorps(s)
|
||||
sse := opregreg(s, x86.AXORPS, x86.REG_X15, x86.REG_X15)
|
||||
jmp.To.SetTarget(sse)
|
||||
}
|
||||
|
||||
// Example instruction: VRSQRTPS X1, X1
|
||||
func simdV11(s *ssagen.State, v *ssa.Value) *obj.Prog {
|
||||
p := s.Prog(v.Op.Asm())
|
||||
|
|
|
|||
|
|
@ -68,6 +68,7 @@ type symsStruct struct {
|
|||
Loong64HasLAM_BH *obj.LSym
|
||||
Loong64HasLSX *obj.LSym
|
||||
RISCV64HasZbb *obj.LSym
|
||||
X86HasAVX *obj.LSym
|
||||
X86HasFMA *obj.LSym
|
||||
X86HasPOPCNT *obj.LSym
|
||||
X86HasSSE41 *obj.LSym
|
||||
|
|
|
|||
|
|
@ -150,9 +150,10 @@ func InitConfig() {
|
|||
ir.Syms.TypeAssert = typecheck.LookupRuntimeFunc("typeAssert")
|
||||
ir.Syms.WBZero = typecheck.LookupRuntimeFunc("wbZero")
|
||||
ir.Syms.WBMove = typecheck.LookupRuntimeFunc("wbMove")
|
||||
ir.Syms.X86HasAVX = typecheck.LookupRuntimeVar("x86HasAVX") // bool
|
||||
ir.Syms.X86HasFMA = typecheck.LookupRuntimeVar("x86HasFMA") // bool
|
||||
ir.Syms.X86HasPOPCNT = typecheck.LookupRuntimeVar("x86HasPOPCNT") // bool
|
||||
ir.Syms.X86HasSSE41 = typecheck.LookupRuntimeVar("x86HasSSE41") // bool
|
||||
ir.Syms.X86HasFMA = typecheck.LookupRuntimeVar("x86HasFMA") // bool
|
||||
ir.Syms.ARMHasVFPv4 = typecheck.LookupRuntimeVar("armHasVFPv4") // bool
|
||||
ir.Syms.ARM64HasATOMICS = typecheck.LookupRuntimeVar("arm64HasATOMICS") // bool
|
||||
ir.Syms.Loong64HasLAMCAS = typecheck.LookupRuntimeVar("loong64HasLAMCAS") // bool
|
||||
|
|
@ -7714,4 +7715,3 @@ func isStructNotSIMD(t *types.Type) bool {
|
|||
}
|
||||
|
||||
var BoundsCheckFunc [ssa.BoundsKindCount]*obj.LSym
|
||||
|
||||
|
|
|
|||
|
|
@ -284,9 +284,10 @@ func libfuzzerHookEqualFold(string, string, uint)
|
|||
func addCovMeta(p unsafe.Pointer, len uint32, hash [16]byte, pkpath string, pkgId int, cmode uint8, cgran uint8) uint32
|
||||
|
||||
// architecture variants
|
||||
var x86HasAVX bool
|
||||
var x86HasFMA bool
|
||||
var x86HasPOPCNT bool
|
||||
var x86HasSSE41 bool
|
||||
var x86HasFMA bool
|
||||
var armHasVFPv4 bool
|
||||
var arm64HasATOMICS bool
|
||||
var loong64HasLAMCAS bool
|
||||
|
|
|
|||
|
|
@ -232,9 +232,10 @@ var runtimeDecls = [...]struct {
|
|||
{"libfuzzerHookStrCmp", funcTag, 155},
|
||||
{"libfuzzerHookEqualFold", funcTag, 155},
|
||||
{"addCovMeta", funcTag, 157},
|
||||
{"x86HasAVX", varTag, 6},
|
||||
{"x86HasFMA", varTag, 6},
|
||||
{"x86HasPOPCNT", varTag, 6},
|
||||
{"x86HasSSE41", varTag, 6},
|
||||
{"x86HasFMA", varTag, 6},
|
||||
{"armHasVFPv4", varTag, 6},
|
||||
{"arm64HasATOMICS", varTag, 6},
|
||||
{"loong64HasLAMCAS", varTag, 6},
|
||||
|
|
|
|||
|
|
@ -1015,6 +1015,9 @@ needm:
|
|||
// there's no need to handle that. Clear R14 so that there's
|
||||
// a bad value in there, in case needm tries to use it.
|
||||
XORPS X15, X15
|
||||
CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
|
||||
JNE 2(PC)
|
||||
VXORPS X15, X15, X15
|
||||
XORQ R14, R14
|
||||
MOVQ $runtime·needAndBindM<ABIInternal>(SB), AX
|
||||
CALL AX
|
||||
|
|
@ -1712,6 +1715,9 @@ TEXT ·sigpanic0(SB),NOSPLIT,$0-0
|
|||
get_tls(R14)
|
||||
MOVQ g(R14), R14
|
||||
XORPS X15, X15
|
||||
CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
|
||||
JNE 2(PC)
|
||||
VXORPS X15, X15, X15
|
||||
JMP ·sigpanic<ABIInternal>(SB)
|
||||
|
||||
// gcWriteBarrier informs the GC about heap pointer writes.
|
||||
|
|
|
|||
|
|
@ -28,9 +28,10 @@ const (
|
|||
var (
|
||||
// Set in runtime.cpuinit.
|
||||
// TODO: deprecate these; use internal/cpu directly.
|
||||
x86HasAVX bool
|
||||
x86HasFMA bool
|
||||
x86HasPOPCNT bool
|
||||
x86HasSSE41 bool
|
||||
x86HasFMA bool
|
||||
|
||||
armHasVFPv4 bool
|
||||
|
||||
|
|
|
|||
|
|
@ -766,9 +766,10 @@ func cpuinit(env string) {
|
|||
// to guard execution of instructions that can not be assumed to be always supported.
|
||||
switch GOARCH {
|
||||
case "386", "amd64":
|
||||
x86HasAVX = cpu.X86.HasAVX
|
||||
x86HasFMA = cpu.X86.HasFMA
|
||||
x86HasPOPCNT = cpu.X86.HasPOPCNT
|
||||
x86HasSSE41 = cpu.X86.HasSSE41
|
||||
x86HasFMA = cpu.X86.HasFMA
|
||||
|
||||
case "arm":
|
||||
armHasVFPv4 = cpu.ARM.HasVFPv4
|
||||
|
|
|
|||
|
|
@ -456,6 +456,9 @@ call:
|
|||
// Back to Go world, set special registers.
|
||||
// The g register (R14) is preserved in C.
|
||||
XORPS X15, X15
|
||||
CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
|
||||
JNE 2(PC)
|
||||
VXORPS X15, X15, X15
|
||||
RET
|
||||
|
||||
// C->Go callback thunk that allows to call runtime·racesymbolize from C code.
|
||||
|
|
|
|||
|
|
@ -177,6 +177,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0
|
|||
get_tls(R12)
|
||||
MOVQ g(R12), R14
|
||||
PXOR X15, X15
|
||||
CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
|
||||
JNE 2(PC)
|
||||
VXORPS X15, X15, X15
|
||||
|
||||
// Reserve space for spill slots.
|
||||
NOP SP // disable vet stack checking
|
||||
|
|
|
|||
|
|
@ -228,6 +228,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0
|
|||
get_tls(R12)
|
||||
MOVQ g(R12), R14
|
||||
PXOR X15, X15
|
||||
CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
|
||||
JNE 2(PC)
|
||||
VXORPS X15, X15, X15
|
||||
|
||||
// Reserve space for spill slots.
|
||||
NOP SP // disable vet stack checking
|
||||
|
|
|
|||
|
|
@ -265,6 +265,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0
|
|||
get_tls(R12)
|
||||
MOVQ g(R12), R14
|
||||
PXOR X15, X15
|
||||
CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
|
||||
JNE 2(PC)
|
||||
VXORPS X15, X15, X15
|
||||
|
||||
// Reserve space for spill slots.
|
||||
NOP SP // disable vet stack checking
|
||||
|
|
@ -290,6 +293,9 @@ TEXT runtime·sigprofNonGoWrapper<>(SB),NOSPLIT|NOFRAME,$0
|
|||
get_tls(R12)
|
||||
MOVQ g(R12), R14
|
||||
PXOR X15, X15
|
||||
CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
|
||||
JNE 2(PC)
|
||||
VXORPS X15, X15, X15
|
||||
|
||||
// Reserve space for spill slots.
|
||||
NOP SP // disable vet stack checking
|
||||
|
|
|
|||
|
|
@ -340,6 +340,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0
|
|||
get_tls(R12)
|
||||
MOVQ g(R12), R14
|
||||
PXOR X15, X15
|
||||
CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
|
||||
JNE 2(PC)
|
||||
VXORPS X15, X15, X15
|
||||
|
||||
// Reserve space for spill slots.
|
||||
NOP SP // disable vet stack checking
|
||||
|
|
@ -365,6 +368,9 @@ TEXT runtime·sigprofNonGoWrapper<>(SB),NOSPLIT|NOFRAME,$0
|
|||
get_tls(R12)
|
||||
MOVQ g(R12), R14
|
||||
PXOR X15, X15
|
||||
CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
|
||||
JNE 2(PC)
|
||||
VXORPS X15, X15, X15
|
||||
|
||||
// Reserve space for spill slots.
|
||||
NOP SP // disable vet stack checking
|
||||
|
|
|
|||
|
|
@ -310,6 +310,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0
|
|||
get_tls(R12)
|
||||
MOVQ g(R12), R14
|
||||
PXOR X15, X15
|
||||
CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
|
||||
JNE 2(PC)
|
||||
VXORPS X15, X15, X15
|
||||
|
||||
// Reserve space for spill slots.
|
||||
NOP SP // disable vet stack checking
|
||||
|
|
|
|||
|
|
@ -64,6 +64,9 @@ TEXT runtime·sigtramp(SB),NOSPLIT|TOPFRAME|NOFRAME,$0
|
|||
get_tls(R12)
|
||||
MOVQ g(R12), R14
|
||||
PXOR X15, X15
|
||||
CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
|
||||
JNE 2(PC)
|
||||
VXORPS X15, X15, X15
|
||||
|
||||
// Reserve space for spill slots.
|
||||
NOP SP // disable vet stack checking
|
||||
|
|
|
|||
|
|
@ -32,6 +32,9 @@ TEXT sigtramp<>(SB),NOSPLIT,$0-0
|
|||
// R14 is cleared in case there's a non-zero value in there
|
||||
// if called from a non-go thread.
|
||||
XORPS X15, X15
|
||||
CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $1
|
||||
JNE 2(PC)
|
||||
VXORPS X15, X15, X15
|
||||
XORQ R14, R14
|
||||
|
||||
get_tls(AX)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue