go/src/internal/cpu/cpu_x86.go

237 lines
6.8 KiB
Go
Raw Normal View History

// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build 386 || amd64
package cpu
const CacheLinePadSize = 64
// cpuid is implemented in cpu_x86.s.
func cpuid(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32)
// xgetbv with ecx = 0 is implemented in cpu_x86.s.
func xgetbv() (eax, edx uint32)
// getGOAMD64level is implemented in cpu_x86.s. Returns number in [1,4].
func getGOAMD64level() int32
const (
// ecx bits
cpuid_SSE3 = 1 << 0
cpuid_PCLMULQDQ = 1 << 1
cpuid_SSSE3 = 1 << 9
cpuid_GFNI = 1 << 8
cpuid_FMA = 1 << 12
cpuid_SSE41 = 1 << 19
cpuid_SSE42 = 1 << 20
cpuid_POPCNT = 1 << 23
cpuid_AES = 1 << 25
cpuid_OSXSAVE = 1 << 27
cpuid_AVX = 1 << 28
// ebx bits
cpuid_BMI1 = 1 << 3
cpuid_AVX2 = 1 << 5
cpuid_BMI2 = 1 << 8
cpuid_ERMS = 1 << 9
cpuid_AVX512F = 1 << 16
cpuid_AVX512DQ = 1 << 17
cpuid_ADX = 1 << 19
cpuid_AVX512CD = 1 << 28
cpuid_SHA = 1 << 29
cpuid_AVX512BW = 1 << 30
cpuid_AVX512VL = 1 << 31
runtime: add ERMS-based memmove support for modern CPU platforms The current memmove implementation uses REP MOVSB to copy data larger than 2KB when the useAVXmemmove global variable is false and the CPU supports the ERMS feature. This feature is currently only enabled on CPUs in the Sandy Bridge (Client) , Sandy Bridge (Server), Ivy Bridge (Client), and Ivy Bridge (Server) microarchitectures. For modern Intel CPU microarchitectures that support the ERMS feature, such as Ice Lake (Server), Sapphire Rapids , REP MOVSB achieves better performance than the AVX-based copy currently implemented in memmove. Benchstat result: goos: linux goarch: amd64 pkg: runtime cpu: Intel(R) Xeon(R) Gold 6348 CPU @ 2.60GHz │ ./old.txt │ ./new.txt │ │ sec/op │ sec/op vs base │ Memmove/2048-2 25.24n ± 0% 24.27n ± 0% -3.84% (p=0.000 n=10) Memmove/4096-2 44.87n ± 0% 33.16n ± 1% -26.11% (p=0.000 n=10) geomean 33.65n 28.37n -15.71% │ ./old.txt │ ./new.txt │ │ B/s │ B/s vs base │ Memmove/2048-2 75.56Gi ± 0% 78.59Gi ± 0% +4.02% (p=0.000 n=10) Memmove/4096-2 85.01Gi ± 0% 115.05Gi ± 1% +35.34% (p=0.000 n=10) geomean 80.14Gi 95.09Gi +18.65% Fixes #66958 Change-Id: I1fafd1b51a16752f83ac15047cf3b29422a79d5d GitHub-Last-Rev: 89cf5af32b1b41e1499282058656a8a5c7aed359 GitHub-Pull-Request: golang/go#66959 Reviewed-on: https://go-review.googlesource.com/c/go/+/580735 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Keith Randall <khr@google.com> Reviewed-by: Keith Randall <khr@golang.org> Auto-Submit: Keith Randall <khr@golang.org> Reviewed-by: Cherry Mui <cherryyz@google.com>
2024-07-02 04:02:12 +00:00
// edx bits
cpuid_FSRM = 1 << 4
runtime: use RDTSCP for instruction stream serialized read of TSC To measure all instructions having been completed before reading the time stamp counter with RDTSC an instruction sequence that has instruction stream serializing properties which guarantee waiting until all previous instructions have been executed is needed. This does not necessary mean to wait for all stores to be globally visible. This CL aims to remove vendor specific logic for determining the instruction sequence with CPU feature flag checks that are CPU vendor independent. For intel LFENCE has the wanted properties at least since it was introduced together with SSE2 support. On AMD instruction stream serializing LFENCE is supported by setting an MSR C001_1029[1]=1 on AMD family 10h/12h/14h/15h/16h/17h processors. AMD family 0Fh/11h processors support LFENCE as serializing always. AMD plans support for this MSR and access to this bit for all future processors. Source: https://developer.amd.com/wp-content/resources/Managing-Speculation-on-AMD-Processors.pdf Reading the MSR to determine LFENCE properties is not always possible or reliable (hypervisors). The Linux kernel is relying on serializing LFENCE on AMD CPUs since a commit in July 2019: https://lkml.org/lkml/2019/7/22/295 and the MSR C001_1029 to enable serialization has been set by default with the Spectre v1 mitigations. Using an MFENCE on AMD is waiting on previous instructions having been executed but in addition also flushes store buffers. To align the serialization properties without runtime detection of CPU manufacturers we can use the newer RDTSCP instruction which waits until all previous instructions have been executed. RDTSCP is available on Intel since around 2008 and on AMD CPUs since around 2006. Support for RDTSCP can be checked independently of manufacturer by checking CPUID bits. Using RDTSCP is the default in Linux to read TSC in program order when the instruction is available. https://github.com/torvalds/linux/blob/e22ce8eb631bdc47a4a4ea7ecf4e4ba499db4f93/arch/x86/include/asm/msr.h#L231 Change-Id: Ifa841843b9abb2816f8f0754a163ebf01385306d Reviewed-on: https://go-review.googlesource.com/c/go/+/344429 Reviewed-by: Keith Randall <khr@golang.org> Trust: Martin Möhrmann <martin@golang.org> Run-TryBot: Martin Möhrmann <martin@golang.org> TryBot-Result: Go Bot <gobot@golang.org>
2021-08-23 13:53:22 +02:00
// edx bits for CPUID 0x80000001
cpuid_RDTSCP = 1 << 27
)
var maxExtendedFunctionInformation uint32
func doinit() {
options = []option{
{Name: "adx", Feature: &X86.HasADX},
{Name: "aes", Feature: &X86.HasAES},
{Name: "erms", Feature: &X86.HasERMS},
runtime: add ERMS-based memmove support for modern CPU platforms The current memmove implementation uses REP MOVSB to copy data larger than 2KB when the useAVXmemmove global variable is false and the CPU supports the ERMS feature. This feature is currently only enabled on CPUs in the Sandy Bridge (Client) , Sandy Bridge (Server), Ivy Bridge (Client), and Ivy Bridge (Server) microarchitectures. For modern Intel CPU microarchitectures that support the ERMS feature, such as Ice Lake (Server), Sapphire Rapids , REP MOVSB achieves better performance than the AVX-based copy currently implemented in memmove. Benchstat result: goos: linux goarch: amd64 pkg: runtime cpu: Intel(R) Xeon(R) Gold 6348 CPU @ 2.60GHz │ ./old.txt │ ./new.txt │ │ sec/op │ sec/op vs base │ Memmove/2048-2 25.24n ± 0% 24.27n ± 0% -3.84% (p=0.000 n=10) Memmove/4096-2 44.87n ± 0% 33.16n ± 1% -26.11% (p=0.000 n=10) geomean 33.65n 28.37n -15.71% │ ./old.txt │ ./new.txt │ │ B/s │ B/s vs base │ Memmove/2048-2 75.56Gi ± 0% 78.59Gi ± 0% +4.02% (p=0.000 n=10) Memmove/4096-2 85.01Gi ± 0% 115.05Gi ± 1% +35.34% (p=0.000 n=10) geomean 80.14Gi 95.09Gi +18.65% Fixes #66958 Change-Id: I1fafd1b51a16752f83ac15047cf3b29422a79d5d GitHub-Last-Rev: 89cf5af32b1b41e1499282058656a8a5c7aed359 GitHub-Pull-Request: golang/go#66959 Reviewed-on: https://go-review.googlesource.com/c/go/+/580735 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Keith Randall <khr@google.com> Reviewed-by: Keith Randall <khr@golang.org> Auto-Submit: Keith Randall <khr@golang.org> Reviewed-by: Cherry Mui <cherryyz@google.com>
2024-07-02 04:02:12 +00:00
{Name: "fsrm", Feature: &X86.HasFSRM},
{Name: "pclmulqdq", Feature: &X86.HasPCLMULQDQ},
runtime: use RDTSCP for instruction stream serialized read of TSC To measure all instructions having been completed before reading the time stamp counter with RDTSC an instruction sequence that has instruction stream serializing properties which guarantee waiting until all previous instructions have been executed is needed. This does not necessary mean to wait for all stores to be globally visible. This CL aims to remove vendor specific logic for determining the instruction sequence with CPU feature flag checks that are CPU vendor independent. For intel LFENCE has the wanted properties at least since it was introduced together with SSE2 support. On AMD instruction stream serializing LFENCE is supported by setting an MSR C001_1029[1]=1 on AMD family 10h/12h/14h/15h/16h/17h processors. AMD family 0Fh/11h processors support LFENCE as serializing always. AMD plans support for this MSR and access to this bit for all future processors. Source: https://developer.amd.com/wp-content/resources/Managing-Speculation-on-AMD-Processors.pdf Reading the MSR to determine LFENCE properties is not always possible or reliable (hypervisors). The Linux kernel is relying on serializing LFENCE on AMD CPUs since a commit in July 2019: https://lkml.org/lkml/2019/7/22/295 and the MSR C001_1029 to enable serialization has been set by default with the Spectre v1 mitigations. Using an MFENCE on AMD is waiting on previous instructions having been executed but in addition also flushes store buffers. To align the serialization properties without runtime detection of CPU manufacturers we can use the newer RDTSCP instruction which waits until all previous instructions have been executed. RDTSCP is available on Intel since around 2008 and on AMD CPUs since around 2006. Support for RDTSCP can be checked independently of manufacturer by checking CPUID bits. Using RDTSCP is the default in Linux to read TSC in program order when the instruction is available. https://github.com/torvalds/linux/blob/e22ce8eb631bdc47a4a4ea7ecf4e4ba499db4f93/arch/x86/include/asm/msr.h#L231 Change-Id: Ifa841843b9abb2816f8f0754a163ebf01385306d Reviewed-on: https://go-review.googlesource.com/c/go/+/344429 Reviewed-by: Keith Randall <khr@golang.org> Trust: Martin Möhrmann <martin@golang.org> Run-TryBot: Martin Möhrmann <martin@golang.org> TryBot-Result: Go Bot <gobot@golang.org>
2021-08-23 13:53:22 +02:00
{Name: "rdtscp", Feature: &X86.HasRDTSCP},
{Name: "sha", Feature: &X86.HasSHA},
}
level := getGOAMD64level()
if level < 2 {
// These options are required at level 2. At lower levels
// they can be turned off.
options = append(options,
option{Name: "popcnt", Feature: &X86.HasPOPCNT},
option{Name: "sse3", Feature: &X86.HasSSE3},
option{Name: "sse41", Feature: &X86.HasSSE41},
option{Name: "sse42", Feature: &X86.HasSSE42},
option{Name: "ssse3", Feature: &X86.HasSSSE3})
}
if level < 3 {
// These options are required at level 3. At lower levels
// they can be turned off.
options = append(options,
option{Name: "avx", Feature: &X86.HasAVX},
option{Name: "avx2", Feature: &X86.HasAVX2},
option{Name: "bmi1", Feature: &X86.HasBMI1},
option{Name: "bmi2", Feature: &X86.HasBMI2},
option{Name: "fma", Feature: &X86.HasFMA})
}
if level < 4 {
// These options are required at level 4. At lower levels
// they can be turned off.
options = append(options,
option{Name: "avx512f", Feature: &X86.HasAVX512F},
option{Name: "avx512cd", Feature: &X86.HasAVX512CD},
option{Name: "avx512bw", Feature: &X86.HasAVX512BW},
option{Name: "avx512dq", Feature: &X86.HasAVX512DQ},
option{Name: "avx512vl", Feature: &X86.HasAVX512VL},
)
}
maxID, _, _, _ := cpuid(0, 0)
if maxID < 1 {
return
}
maxExtendedFunctionInformation, _, _, _ = cpuid(0x80000000, 0)
_, _, ecx1, _ := cpuid(1, 0)
X86.HasSSE3 = isSet(ecx1, cpuid_SSE3)
X86.HasPCLMULQDQ = isSet(ecx1, cpuid_PCLMULQDQ)
X86.HasSSSE3 = isSet(ecx1, cpuid_SSSE3)
X86.HasSSE41 = isSet(ecx1, cpuid_SSE41)
X86.HasSSE42 = isSet(ecx1, cpuid_SSE42)
X86.HasPOPCNT = isSet(ecx1, cpuid_POPCNT)
X86.HasAES = isSet(ecx1, cpuid_AES)
// OSXSAVE can be false when using older Operating Systems
// or when explicitly disabled on newer Operating Systems by
// e.g. setting the xsavedisable boot option on Windows 10.
X86.HasOSXSAVE = isSet(ecx1, cpuid_OSXSAVE)
// The FMA instruction set extension only has VEX prefixed instructions.
// VEX prefixed instructions require OSXSAVE to be enabled.
// See Intel 64 and IA-32 Architecture Software Developers Manual Volume 2
// Section 2.4 "AVX and SSE Instruction Exception Specification"
X86.HasFMA = isSet(ecx1, cpuid_FMA) && X86.HasOSXSAVE
osSupportsAVX := false
osSupportsAVX512 := false
// For XGETBV, OSXSAVE bit is required and sufficient.
if X86.HasOSXSAVE {
eax, _ := xgetbv()
// Check if XMM and YMM registers have OS support.
osSupportsAVX = isSet(eax, 1<<1) && isSet(eax, 1<<2)
// AVX512 detection does not work on Darwin,
// see https://github.com/golang/go/issues/49233
//
// Check if opmask, ZMMhi256 and Hi16_ZMM have OS support.
osSupportsAVX512 = osSupportsAVX && isSet(eax, 1<<5) && isSet(eax, 1<<6) && isSet(eax, 1<<7)
}
X86.HasAVX = isSet(ecx1, cpuid_AVX) && osSupportsAVX
if maxID < 7 {
return
}
_, ebx7, ecx7, edx7 := cpuid(7, 0)
X86.HasBMI1 = isSet(ebx7, cpuid_BMI1)
X86.HasAVX2 = isSet(ebx7, cpuid_AVX2) && osSupportsAVX
X86.HasBMI2 = isSet(ebx7, cpuid_BMI2)
X86.HasERMS = isSet(ebx7, cpuid_ERMS)
X86.HasADX = isSet(ebx7, cpuid_ADX)
X86.HasSHA = isSet(ebx7, cpuid_SHA)
runtime: use RDTSCP for instruction stream serialized read of TSC To measure all instructions having been completed before reading the time stamp counter with RDTSC an instruction sequence that has instruction stream serializing properties which guarantee waiting until all previous instructions have been executed is needed. This does not necessary mean to wait for all stores to be globally visible. This CL aims to remove vendor specific logic for determining the instruction sequence with CPU feature flag checks that are CPU vendor independent. For intel LFENCE has the wanted properties at least since it was introduced together with SSE2 support. On AMD instruction stream serializing LFENCE is supported by setting an MSR C001_1029[1]=1 on AMD family 10h/12h/14h/15h/16h/17h processors. AMD family 0Fh/11h processors support LFENCE as serializing always. AMD plans support for this MSR and access to this bit for all future processors. Source: https://developer.amd.com/wp-content/resources/Managing-Speculation-on-AMD-Processors.pdf Reading the MSR to determine LFENCE properties is not always possible or reliable (hypervisors). The Linux kernel is relying on serializing LFENCE on AMD CPUs since a commit in July 2019: https://lkml.org/lkml/2019/7/22/295 and the MSR C001_1029 to enable serialization has been set by default with the Spectre v1 mitigations. Using an MFENCE on AMD is waiting on previous instructions having been executed but in addition also flushes store buffers. To align the serialization properties without runtime detection of CPU manufacturers we can use the newer RDTSCP instruction which waits until all previous instructions have been executed. RDTSCP is available on Intel since around 2008 and on AMD CPUs since around 2006. Support for RDTSCP can be checked independently of manufacturer by checking CPUID bits. Using RDTSCP is the default in Linux to read TSC in program order when the instruction is available. https://github.com/torvalds/linux/blob/e22ce8eb631bdc47a4a4ea7ecf4e4ba499db4f93/arch/x86/include/asm/msr.h#L231 Change-Id: Ifa841843b9abb2816f8f0754a163ebf01385306d Reviewed-on: https://go-review.googlesource.com/c/go/+/344429 Reviewed-by: Keith Randall <khr@golang.org> Trust: Martin Möhrmann <martin@golang.org> Run-TryBot: Martin Möhrmann <martin@golang.org> TryBot-Result: Go Bot <gobot@golang.org>
2021-08-23 13:53:22 +02:00
X86.HasAVX512F = isSet(ebx7, cpuid_AVX512F) && osSupportsAVX512
if X86.HasAVX512F {
X86.HasAVX512CD = isSet(ebx7, cpuid_AVX512CD)
X86.HasAVX512BW = isSet(ebx7, cpuid_AVX512BW)
X86.HasAVX512DQ = isSet(ebx7, cpuid_AVX512DQ)
X86.HasAVX512VL = isSet(ebx7, cpuid_AVX512VL)
}
runtime: add ERMS-based memmove support for modern CPU platforms The current memmove implementation uses REP MOVSB to copy data larger than 2KB when the useAVXmemmove global variable is false and the CPU supports the ERMS feature. This feature is currently only enabled on CPUs in the Sandy Bridge (Client) , Sandy Bridge (Server), Ivy Bridge (Client), and Ivy Bridge (Server) microarchitectures. For modern Intel CPU microarchitectures that support the ERMS feature, such as Ice Lake (Server), Sapphire Rapids , REP MOVSB achieves better performance than the AVX-based copy currently implemented in memmove. Benchstat result: goos: linux goarch: amd64 pkg: runtime cpu: Intel(R) Xeon(R) Gold 6348 CPU @ 2.60GHz │ ./old.txt │ ./new.txt │ │ sec/op │ sec/op vs base │ Memmove/2048-2 25.24n ± 0% 24.27n ± 0% -3.84% (p=0.000 n=10) Memmove/4096-2 44.87n ± 0% 33.16n ± 1% -26.11% (p=0.000 n=10) geomean 33.65n 28.37n -15.71% │ ./old.txt │ ./new.txt │ │ B/s │ B/s vs base │ Memmove/2048-2 75.56Gi ± 0% 78.59Gi ± 0% +4.02% (p=0.000 n=10) Memmove/4096-2 85.01Gi ± 0% 115.05Gi ± 1% +35.34% (p=0.000 n=10) geomean 80.14Gi 95.09Gi +18.65% Fixes #66958 Change-Id: I1fafd1b51a16752f83ac15047cf3b29422a79d5d GitHub-Last-Rev: 89cf5af32b1b41e1499282058656a8a5c7aed359 GitHub-Pull-Request: golang/go#66959 Reviewed-on: https://go-review.googlesource.com/c/go/+/580735 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Keith Randall <khr@google.com> Reviewed-by: Keith Randall <khr@golang.org> Auto-Submit: Keith Randall <khr@golang.org> Reviewed-by: Cherry Mui <cherryyz@google.com>
2024-07-02 04:02:12 +00:00
X86.HasFSRM = isSet(edx7, cpuid_FSRM)
X86.HasGFNI = isSet(ecx7, cpuid_GFNI)
runtime: add ERMS-based memmove support for modern CPU platforms The current memmove implementation uses REP MOVSB to copy data larger than 2KB when the useAVXmemmove global variable is false and the CPU supports the ERMS feature. This feature is currently only enabled on CPUs in the Sandy Bridge (Client) , Sandy Bridge (Server), Ivy Bridge (Client), and Ivy Bridge (Server) microarchitectures. For modern Intel CPU microarchitectures that support the ERMS feature, such as Ice Lake (Server), Sapphire Rapids , REP MOVSB achieves better performance than the AVX-based copy currently implemented in memmove. Benchstat result: goos: linux goarch: amd64 pkg: runtime cpu: Intel(R) Xeon(R) Gold 6348 CPU @ 2.60GHz │ ./old.txt │ ./new.txt │ │ sec/op │ sec/op vs base │ Memmove/2048-2 25.24n ± 0% 24.27n ± 0% -3.84% (p=0.000 n=10) Memmove/4096-2 44.87n ± 0% 33.16n ± 1% -26.11% (p=0.000 n=10) geomean 33.65n 28.37n -15.71% │ ./old.txt │ ./new.txt │ │ B/s │ B/s vs base │ Memmove/2048-2 75.56Gi ± 0% 78.59Gi ± 0% +4.02% (p=0.000 n=10) Memmove/4096-2 85.01Gi ± 0% 115.05Gi ± 1% +35.34% (p=0.000 n=10) geomean 80.14Gi 95.09Gi +18.65% Fixes #66958 Change-Id: I1fafd1b51a16752f83ac15047cf3b29422a79d5d GitHub-Last-Rev: 89cf5af32b1b41e1499282058656a8a5c7aed359 GitHub-Pull-Request: golang/go#66959 Reviewed-on: https://go-review.googlesource.com/c/go/+/580735 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Reviewed-by: Keith Randall <khr@google.com> Reviewed-by: Keith Randall <khr@golang.org> Auto-Submit: Keith Randall <khr@golang.org> Reviewed-by: Cherry Mui <cherryyz@google.com>
2024-07-02 04:02:12 +00:00
runtime: use RDTSCP for instruction stream serialized read of TSC To measure all instructions having been completed before reading the time stamp counter with RDTSC an instruction sequence that has instruction stream serializing properties which guarantee waiting until all previous instructions have been executed is needed. This does not necessary mean to wait for all stores to be globally visible. This CL aims to remove vendor specific logic for determining the instruction sequence with CPU feature flag checks that are CPU vendor independent. For intel LFENCE has the wanted properties at least since it was introduced together with SSE2 support. On AMD instruction stream serializing LFENCE is supported by setting an MSR C001_1029[1]=1 on AMD family 10h/12h/14h/15h/16h/17h processors. AMD family 0Fh/11h processors support LFENCE as serializing always. AMD plans support for this MSR and access to this bit for all future processors. Source: https://developer.amd.com/wp-content/resources/Managing-Speculation-on-AMD-Processors.pdf Reading the MSR to determine LFENCE properties is not always possible or reliable (hypervisors). The Linux kernel is relying on serializing LFENCE on AMD CPUs since a commit in July 2019: https://lkml.org/lkml/2019/7/22/295 and the MSR C001_1029 to enable serialization has been set by default with the Spectre v1 mitigations. Using an MFENCE on AMD is waiting on previous instructions having been executed but in addition also flushes store buffers. To align the serialization properties without runtime detection of CPU manufacturers we can use the newer RDTSCP instruction which waits until all previous instructions have been executed. RDTSCP is available on Intel since around 2008 and on AMD CPUs since around 2006. Support for RDTSCP can be checked independently of manufacturer by checking CPUID bits. Using RDTSCP is the default in Linux to read TSC in program order when the instruction is available. https://github.com/torvalds/linux/blob/e22ce8eb631bdc47a4a4ea7ecf4e4ba499db4f93/arch/x86/include/asm/msr.h#L231 Change-Id: Ifa841843b9abb2816f8f0754a163ebf01385306d Reviewed-on: https://go-review.googlesource.com/c/go/+/344429 Reviewed-by: Keith Randall <khr@golang.org> Trust: Martin Möhrmann <martin@golang.org> Run-TryBot: Martin Möhrmann <martin@golang.org> TryBot-Result: Go Bot <gobot@golang.org>
2021-08-23 13:53:22 +02:00
var maxExtendedInformation uint32
maxExtendedInformation, _, _, _ = cpuid(0x80000000, 0)
if maxExtendedInformation < 0x80000001 {
return
}
_, _, _, edxExt1 := cpuid(0x80000001, 0)
X86.HasRDTSCP = isSet(edxExt1, cpuid_RDTSCP)
doDerived = func() {
// Rather than carefully gating on fundamental AVX-512 features, we have
// a virtual "AVX512" feature that captures F+CD+BW+DQ+VL. BW, DQ, and
// VL have a huge effect on which AVX-512 instructions are available,
// and these have all been supported on everything except the earliest
// Phi chips with AVX-512. No CPU has had CD without F, so we include
// it. GOAMD64=v4 also implies exactly this set, and these are all
// included in AVX10.1.
X86.HasAVX512 = X86.HasAVX512F && X86.HasAVX512CD && X86.HasAVX512BW && X86.HasAVX512DQ && X86.HasAVX512VL
X86.HasAVX512GFNI = X86.HasAVX512 && X86.HasGFNI
}
}
func isSet(hwc uint32, value uint32) bool {
return hwc&value != 0
}
// Name returns the CPU name given by the vendor.
// If the CPU name can not be determined an
// empty string is returned.
func Name() string {
if maxExtendedFunctionInformation < 0x80000004 {
return ""
}
data := make([]byte, 0, 3*4*4)
var eax, ebx, ecx, edx uint32
eax, ebx, ecx, edx = cpuid(0x80000002, 0)
data = appendBytes(data, eax, ebx, ecx, edx)
eax, ebx, ecx, edx = cpuid(0x80000003, 0)
data = appendBytes(data, eax, ebx, ecx, edx)
eax, ebx, ecx, edx = cpuid(0x80000004, 0)
data = appendBytes(data, eax, ebx, ecx, edx)
// Trim leading spaces.
for len(data) > 0 && data[0] == ' ' {
data = data[1:]
}
// Trim tail after and including the first null byte.
for i, c := range data {
if c == '\x00' {
data = data[:i]
break
}
}
return string(data)
}
func appendBytes(b []byte, args ...uint32) []byte {
for _, arg := range args {
b = append(b,
byte((arg >> 0)),
byte((arg >> 8)),
byte((arg >> 16)),
byte((arg >> 24)))
}
return b
}