mirror of
https://github.com/golang/go.git
synced 2025-12-08 06:10:04 +00:00
runtime: use RDTSCP for instruction stream serialized read of TSC
To measure all instructions having been completed before reading
the time stamp counter with RDTSC an instruction sequence that
has instruction stream serializing properties which guarantee
waiting until all previous instructions have been executed is
needed. This does not necessary mean to wait for all stores to
be globally visible.
This CL aims to remove vendor specific logic for determining the
instruction sequence with CPU feature flag checks that are
CPU vendor independent.
For intel LFENCE has the wanted properties at least
since it was introduced together with SSE2 support.
On AMD instruction stream serializing LFENCE is supported by setting
an MSR C001_1029[1]=1 on AMD family 10h/12h/14h/15h/16h/17h processors.
AMD family 0Fh/11h processors support LFENCE as serializing always.
AMD plans support for this MSR and access to this bit for all future processors.
Source: https://developer.amd.com/wp-content/resources/Managing-Speculation-on-AMD-Processors.pdf
Reading the MSR to determine LFENCE properties is not always possible
or reliable (hypervisors). The Linux kernel is relying on serializing
LFENCE on AMD CPUs since a commit in July 2019: https://lkml.org/lkml/2019/7/22/295
and the MSR C001_1029 to enable serialization has been set by default
with the Spectre v1 mitigations.
Using an MFENCE on AMD is waiting on previous instructions having been executed
but in addition also flushes store buffers.
To align the serialization properties without runtime detection
of CPU manufacturers we can use the newer RDTSCP instruction which
waits until all previous instructions have been executed.
RDTSCP is available on Intel since around 2008 and on AMD CPUs since
around 2006. Support for RDTSCP can be checked independently
of manufacturer by checking CPUID bits.
Using RDTSCP is the default in Linux to read TSC in program order
when the instruction is available.
e22ce8eb63/arch/x86/include/asm/msr.h (L231)
Change-Id: Ifa841843b9abb2816f8f0754a163ebf01385306d
Reviewed-on: https://go-review.googlesource.com/c/go/+/344429
Reviewed-by: Keith Randall <khr@golang.org>
Trust: Martin Möhrmann <martin@golang.org>
Run-TryBot: Martin Möhrmann <martin@golang.org>
TryBot-Result: Go Bot <gobot@golang.org>
This commit is contained in:
parent
fa34678c67
commit
22540abf76
6 changed files with 64 additions and 28 deletions
|
|
@ -36,6 +36,7 @@ var X86 struct {
|
||||||
HasOSXSAVE bool
|
HasOSXSAVE bool
|
||||||
HasPCLMULQDQ bool
|
HasPCLMULQDQ bool
|
||||||
HasPOPCNT bool
|
HasPOPCNT bool
|
||||||
|
HasRDTSCP bool
|
||||||
HasSSE2 bool
|
HasSSE2 bool
|
||||||
HasSSE3 bool
|
HasSSE3 bool
|
||||||
HasSSSE3 bool
|
HasSSSE3 bool
|
||||||
|
|
|
||||||
|
|
@ -37,6 +37,9 @@ const (
|
||||||
cpuid_BMI2 = 1 << 8
|
cpuid_BMI2 = 1 << 8
|
||||||
cpuid_ERMS = 1 << 9
|
cpuid_ERMS = 1 << 9
|
||||||
cpuid_ADX = 1 << 19
|
cpuid_ADX = 1 << 19
|
||||||
|
|
||||||
|
// edx bits for CPUID 0x80000001
|
||||||
|
cpuid_RDTSCP = 1 << 27
|
||||||
)
|
)
|
||||||
|
|
||||||
var maxExtendedFunctionInformation uint32
|
var maxExtendedFunctionInformation uint32
|
||||||
|
|
@ -53,6 +56,7 @@ func doinit() {
|
||||||
{Name: "fma", Feature: &X86.HasFMA},
|
{Name: "fma", Feature: &X86.HasFMA},
|
||||||
{Name: "pclmulqdq", Feature: &X86.HasPCLMULQDQ},
|
{Name: "pclmulqdq", Feature: &X86.HasPCLMULQDQ},
|
||||||
{Name: "popcnt", Feature: &X86.HasPOPCNT},
|
{Name: "popcnt", Feature: &X86.HasPOPCNT},
|
||||||
|
{Name: "rdtscp", Feature: &X86.HasRDTSCP},
|
||||||
{Name: "sse3", Feature: &X86.HasSSE3},
|
{Name: "sse3", Feature: &X86.HasSSE3},
|
||||||
{Name: "sse41", Feature: &X86.HasSSE41},
|
{Name: "sse41", Feature: &X86.HasSSE41},
|
||||||
{Name: "sse42", Feature: &X86.HasSSE42},
|
{Name: "sse42", Feature: &X86.HasSSE42},
|
||||||
|
|
@ -112,6 +116,16 @@ func doinit() {
|
||||||
X86.HasBMI2 = isSet(ebx7, cpuid_BMI2)
|
X86.HasBMI2 = isSet(ebx7, cpuid_BMI2)
|
||||||
X86.HasERMS = isSet(ebx7, cpuid_ERMS)
|
X86.HasERMS = isSet(ebx7, cpuid_ERMS)
|
||||||
X86.HasADX = isSet(ebx7, cpuid_ADX)
|
X86.HasADX = isSet(ebx7, cpuid_ADX)
|
||||||
|
|
||||||
|
var maxExtendedInformation uint32
|
||||||
|
maxExtendedInformation, _, _, _ = cpuid(0x80000000, 0)
|
||||||
|
|
||||||
|
if maxExtendedInformation < 0x80000001 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
_, _, _, edxExt1 := cpuid(0x80000001, 0)
|
||||||
|
X86.HasRDTSCP = isSet(edxExt1, cpuid_RDTSCP)
|
||||||
}
|
}
|
||||||
|
|
||||||
func isSet(hwc uint32, value uint32) bool {
|
func isSet(hwc uint32, value uint32) bool {
|
||||||
|
|
|
||||||
|
|
@ -137,9 +137,6 @@ has_cpuid:
|
||||||
CMPL AX, $0
|
CMPL AX, $0
|
||||||
JE nocpuinfo
|
JE nocpuinfo
|
||||||
|
|
||||||
// Figure out how to serialize RDTSC.
|
|
||||||
// On Intel processors LFENCE is enough. AMD requires MFENCE.
|
|
||||||
// Don't know about the rest, so let's do MFENCE.
|
|
||||||
CMPL BX, $0x756E6547 // "Genu"
|
CMPL BX, $0x756E6547 // "Genu"
|
||||||
JNE notintel
|
JNE notintel
|
||||||
CMPL DX, $0x49656E69 // "ineI"
|
CMPL DX, $0x49656E69 // "ineI"
|
||||||
|
|
@ -147,7 +144,6 @@ has_cpuid:
|
||||||
CMPL CX, $0x6C65746E // "ntel"
|
CMPL CX, $0x6C65746E // "ntel"
|
||||||
JNE notintel
|
JNE notintel
|
||||||
MOVB $1, runtime·isIntel(SB)
|
MOVB $1, runtime·isIntel(SB)
|
||||||
MOVB $1, runtime·lfenceBeforeRdtsc(SB)
|
|
||||||
notintel:
|
notintel:
|
||||||
|
|
||||||
// Load EAX=1 cpuid flags
|
// Load EAX=1 cpuid flags
|
||||||
|
|
@ -838,19 +834,36 @@ TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
|
||||||
|
|
||||||
// func cputicks() int64
|
// func cputicks() int64
|
||||||
TEXT runtime·cputicks(SB),NOSPLIT,$0-8
|
TEXT runtime·cputicks(SB),NOSPLIT,$0-8
|
||||||
|
// LFENCE/MFENCE instruction support is dependent on SSE2.
|
||||||
|
// When no SSE2 support is present do not enforce any serialization
|
||||||
|
// since using CPUID to serialize the instruction stream is
|
||||||
|
// very costly.
|
||||||
CMPB internal∕cpu·X86+const_offsetX86HasSSE2(SB), $1
|
CMPB internal∕cpu·X86+const_offsetX86HasSSE2(SB), $1
|
||||||
JNE done
|
JNE rdtsc
|
||||||
CMPB runtime·lfenceBeforeRdtsc(SB), $1
|
CMPB internal∕cpu·X86+const_offsetX86HasRDTSCP(SB), $1
|
||||||
JNE mfence
|
JNE fences
|
||||||
LFENCE
|
// Instruction stream serializing RDTSCP is supported.
|
||||||
JMP done
|
// RDTSCP is supported by Intel Nehalem (2008) and
|
||||||
mfence:
|
// AMD K8 Rev. F (2006) and newer.
|
||||||
MFENCE
|
RDTSCP
|
||||||
done:
|
done:
|
||||||
RDTSC
|
|
||||||
MOVL AX, ret_lo+0(FP)
|
MOVL AX, ret_lo+0(FP)
|
||||||
MOVL DX, ret_hi+4(FP)
|
MOVL DX, ret_hi+4(FP)
|
||||||
RET
|
RET
|
||||||
|
fences:
|
||||||
|
// MFENCE is instruction stream serializing and flushes the
|
||||||
|
// store buffers on AMD. The serialization semantics of LFENCE on AMD
|
||||||
|
// are dependent on MSR C001_1029 and CPU generation.
|
||||||
|
// LFENCE on Intel does wait for all previous instructions to have executed.
|
||||||
|
// Intel recommends MFENCE;LFENCE in its manuals before RDTSC to have all
|
||||||
|
// previous instructions executed and all previous loads and stores to globally visible.
|
||||||
|
// Using MFENCE;LFENCE here aligns the serializing properties without
|
||||||
|
// runtime detection of CPU manufacturer.
|
||||||
|
MFENCE
|
||||||
|
LFENCE
|
||||||
|
rdtsc:
|
||||||
|
RDTSC
|
||||||
|
JMP done
|
||||||
|
|
||||||
TEXT ldt0setup<>(SB),NOSPLIT,$16-0
|
TEXT ldt0setup<>(SB),NOSPLIT,$16-0
|
||||||
// set up ldt 7 to point at m0.tls
|
// set up ldt 7 to point at m0.tls
|
||||||
|
|
|
||||||
|
|
@ -103,9 +103,6 @@ TEXT runtime·rt0_go(SB),NOSPLIT|TOPFRAME,$0
|
||||||
CMPL AX, $0
|
CMPL AX, $0
|
||||||
JE nocpuinfo
|
JE nocpuinfo
|
||||||
|
|
||||||
// Figure out how to serialize RDTSC.
|
|
||||||
// On Intel processors LFENCE is enough. AMD requires MFENCE.
|
|
||||||
// Don't know about the rest, so let's do MFENCE.
|
|
||||||
CMPL BX, $0x756E6547 // "Genu"
|
CMPL BX, $0x756E6547 // "Genu"
|
||||||
JNE notintel
|
JNE notintel
|
||||||
CMPL DX, $0x49656E69 // "ineI"
|
CMPL DX, $0x49656E69 // "ineI"
|
||||||
|
|
@ -113,7 +110,6 @@ TEXT runtime·rt0_go(SB),NOSPLIT|TOPFRAME,$0
|
||||||
CMPL CX, $0x6C65746E // "ntel"
|
CMPL CX, $0x6C65746E // "ntel"
|
||||||
JNE notintel
|
JNE notintel
|
||||||
MOVB $1, runtime·isIntel(SB)
|
MOVB $1, runtime·isIntel(SB)
|
||||||
MOVB $1, runtime·lfenceBeforeRdtsc(SB)
|
|
||||||
notintel:
|
notintel:
|
||||||
|
|
||||||
// Load EAX=1 cpuid flags
|
// Load EAX=1 cpuid flags
|
||||||
|
|
@ -928,18 +924,30 @@ TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
|
||||||
|
|
||||||
// func cputicks() int64
|
// func cputicks() int64
|
||||||
TEXT runtime·cputicks(SB),NOSPLIT,$0-0
|
TEXT runtime·cputicks(SB),NOSPLIT,$0-0
|
||||||
CMPB runtime·lfenceBeforeRdtsc(SB), $1
|
CMPB internal∕cpu·X86+const_offsetX86HasRDTSCP(SB), $1
|
||||||
JNE mfence
|
JNE fences
|
||||||
LFENCE
|
// Instruction stream serializing RDTSCP is supported.
|
||||||
JMP done
|
// RDTSCP is supported by Intel Nehalem (2008) and
|
||||||
mfence:
|
// AMD K8 Rev. F (2006) and newer.
|
||||||
MFENCE
|
RDTSCP
|
||||||
done:
|
done:
|
||||||
RDTSC
|
|
||||||
SHLQ $32, DX
|
SHLQ $32, DX
|
||||||
ADDQ DX, AX
|
ADDQ DX, AX
|
||||||
MOVQ AX, ret+0(FP)
|
MOVQ AX, ret+0(FP)
|
||||||
RET
|
RET
|
||||||
|
fences:
|
||||||
|
// MFENCE is instruction stream serializing and flushes the
|
||||||
|
// store buffers on AMD. The serialization semantics of LFENCE on AMD
|
||||||
|
// are dependent on MSR C001_1029 and CPU generation.
|
||||||
|
// LFENCE on Intel does wait for all previous instructions to have executed.
|
||||||
|
// Intel recommends MFENCE;LFENCE in its manuals before RDTSC to have all
|
||||||
|
// previous instructions executed and all previous loads and stores to globally visible.
|
||||||
|
// Using MFENCE;LFENCE here aligns the serializing properties without
|
||||||
|
// runtime detection of CPU manufacturer.
|
||||||
|
MFENCE
|
||||||
|
LFENCE
|
||||||
|
RDTSC
|
||||||
|
JMP done
|
||||||
|
|
||||||
// func memhash(p unsafe.Pointer, h, s uintptr) uintptr
|
// func memhash(p unsafe.Pointer, h, s uintptr) uintptr
|
||||||
// hash function using AES hardware instructions
|
// hash function using AES hardware instructions
|
||||||
|
|
|
||||||
|
|
@ -11,10 +11,11 @@ import (
|
||||||
|
|
||||||
// Offsets into internal/cpu records for use in assembly.
|
// Offsets into internal/cpu records for use in assembly.
|
||||||
const (
|
const (
|
||||||
offsetX86HasAVX = unsafe.Offsetof(cpu.X86.HasAVX)
|
offsetX86HasAVX = unsafe.Offsetof(cpu.X86.HasAVX)
|
||||||
offsetX86HasAVX2 = unsafe.Offsetof(cpu.X86.HasAVX2)
|
offsetX86HasAVX2 = unsafe.Offsetof(cpu.X86.HasAVX2)
|
||||||
offsetX86HasERMS = unsafe.Offsetof(cpu.X86.HasERMS)
|
offsetX86HasERMS = unsafe.Offsetof(cpu.X86.HasERMS)
|
||||||
offsetX86HasSSE2 = unsafe.Offsetof(cpu.X86.HasSSE2)
|
offsetX86HasRDTSCP = unsafe.Offsetof(cpu.X86.HasRDTSCP)
|
||||||
|
offsetX86HasSSE2 = unsafe.Offsetof(cpu.X86.HasSSE2)
|
||||||
|
|
||||||
offsetARMHasIDIVA = unsafe.Offsetof(cpu.ARM.HasIDIVA)
|
offsetARMHasIDIVA = unsafe.Offsetof(cpu.ARM.HasIDIVA)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1128,7 +1128,6 @@ var (
|
||||||
// Set on startup in asm_{386,amd64}.s
|
// Set on startup in asm_{386,amd64}.s
|
||||||
processorVersionInfo uint32
|
processorVersionInfo uint32
|
||||||
isIntel bool
|
isIntel bool
|
||||||
lfenceBeforeRdtsc bool
|
|
||||||
|
|
||||||
goarm uint8 // set by cmd/link on arm systems
|
goarm uint8 // set by cmd/link on arm systems
|
||||||
)
|
)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue