mirror of
https://github.com/golang/go.git
synced 2025-10-19 11:03:18 +00:00
hash/crc32: add AVX512 IEEE CRC32 calculation
Benchmark:
goos: windows
goarch: amd64
pkg: hash/crc32
cpu: AMD Ryzen 9 9950X 16-Core Processor
benchmark old MB/s new MB/s speedup
BenchmarkCRC32/poly=IEEE/size=15/align=0-32 1081.48 1089.42 1.01x
BenchmarkCRC32/poly=IEEE/size=15/align=1-32 1085.87 1082.61 1.00x
BenchmarkCRC32/poly=IEEE/size=40/align=0-32 2756.33 2752.37 1.00x
BenchmarkCRC32/poly=IEEE/size=40/align=1-32 2758.27 2756.99 1.00x
BenchmarkCRC32/poly=IEEE/size=512/align=0-32 18133.44 18076.52 1.00x
BenchmarkCRC32/poly=IEEE/size=512/align=1-32 18151.05 18055.41 0.99x
BenchmarkCRC32/poly=IEEE/size=1kB/align=0-32 19902.93 48581.07 2.44x
BenchmarkCRC32/poly=IEEE/size=1kB/align=1-32 19966.99 48393.25 2.42x
BenchmarkCRC32/poly=IEEE/size=4kB/align=0-32 21690.33 51679.25 2.38x
BenchmarkCRC32/poly=IEEE/size=4kB/align=1-32 21655.30 51731.22 2.39x
BenchmarkCRC32/poly=IEEE/size=32kB/align=0-32 22046.57 46406.90 2.10x
BenchmarkCRC32/poly=IEEE/size=32kB/align=1-32 21986.22 46250.66 2.10x
AVX512 is enabled for input sizes of 1KB (1024 bytes) and above.
This rather high limit is chosen because AVX512 may be slower to ramp up
than the regular SSE4 implementation for smaller inputs.
This is not reflected in the benchmarks,
since consecutive calls mean the CPU is already "hot".
The 'HasAVX512VPCLMULQDQ' name mirrors the one in golang.org/x/sys/cpu
Change-Id: Id23685d8e3cc412b6d397a7d70056844bdb79271
Change-Id: Id23685d8e3cc412b6d397a7d70056844bdb79271
GitHub-Last-Rev: 6639f07b9f
GitHub-Pull-Request: golang/go#74701
Reviewed-on: https://go-review.googlesource.com/c/go/+/689435
Reviewed-by: Keith Randall <khr@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Auto-Submit: Keith Randall <khr@golang.org>
Auto-Submit: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
This commit is contained in:
parent
c641900f72
commit
18dbe5b941
4 changed files with 84 additions and 25 deletions
|
@ -13,6 +13,11 @@ import (
|
|||
"unsafe"
|
||||
)
|
||||
|
||||
// Offset into internal/cpu records for use in assembly.
|
||||
const (
|
||||
offsetX86HasAVX512VPCLMULQDQL = unsafe.Offsetof(cpu.X86.HasAVX512VPCLMULQDQ)
|
||||
)
|
||||
|
||||
// This file contains the code to call the SSE 4.2 version of the Castagnoli
|
||||
// and IEEE CRC.
|
||||
|
||||
|
|
|
@ -3,6 +3,7 @@
|
|||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "textflag.h"
|
||||
#include "go_asm.h"
|
||||
|
||||
// castagnoliSSE42 updates the (non-inverted) crc with the given buffer.
|
||||
//
|
||||
|
@ -136,15 +137,23 @@ loop:
|
|||
// Linux kernel, since they avoid the costly
|
||||
// PSHUFB 16 byte reversal proposed in the
|
||||
// original Intel paper.
|
||||
// Splatted so it can be loaded with a single VMOVDQU64
|
||||
DATA r2r1<>+0(SB)/8, $0x154442bd4
|
||||
DATA r2r1<>+8(SB)/8, $0x1c6e41596
|
||||
DATA r2r1<>+16(SB)/8, $0x154442bd4
|
||||
DATA r2r1<>+24(SB)/8, $0x1c6e41596
|
||||
DATA r2r1<>+32(SB)/8, $0x154442bd4
|
||||
DATA r2r1<>+40(SB)/8, $0x1c6e41596
|
||||
DATA r2r1<>+48(SB)/8, $0x154442bd4
|
||||
DATA r2r1<>+56(SB)/8, $0x1c6e41596
|
||||
|
||||
DATA r4r3<>+0(SB)/8, $0x1751997d0
|
||||
DATA r4r3<>+8(SB)/8, $0x0ccaa009e
|
||||
DATA rupoly<>+0(SB)/8, $0x1db710641
|
||||
DATA rupoly<>+8(SB)/8, $0x1f7011641
|
||||
DATA r5<>+0(SB)/8, $0x163cd6124
|
||||
|
||||
GLOBL r2r1<>(SB),RODATA,$16
|
||||
GLOBL r2r1<>(SB), RODATA, $64
|
||||
GLOBL r4r3<>(SB),RODATA,$16
|
||||
GLOBL rupoly<>(SB),RODATA,$16
|
||||
GLOBL r5<>(SB),RODATA,$8
|
||||
|
@ -158,6 +167,43 @@ TEXT ·ieeeCLMUL(SB),NOSPLIT,$0
|
|||
MOVQ p+8(FP), SI // data pointer
|
||||
MOVQ p_len+16(FP), CX // len(p)
|
||||
|
||||
// Check feature support and length to be >= 1024 bytes.
|
||||
CMPB internal∕cpu·X86+const_offsetX86HasAVX512VPCLMULQDQL(SB), $1
|
||||
JNE useSSE42
|
||||
CMPQ CX, $1024
|
||||
JL useSSE42
|
||||
|
||||
// Use AVX512
|
||||
VPXORQ Z0, Z0, Z0
|
||||
VMOVQ AX, X0
|
||||
VMOVDQU64 (SI), Z1
|
||||
VPXORQ Z0, Z1, Z1 // Merge initial CRC value into Z1
|
||||
ADDQ $64, SI // buf+=64
|
||||
SUBQ $64, CX // len-=64
|
||||
|
||||
VMOVDQU64 r2r1<>+0(SB), Z0
|
||||
|
||||
loopback64Avx512:
|
||||
VMOVDQU64 (SI), Z11 // Load next
|
||||
VPCLMULQDQ $0x11, Z0, Z1, Z5
|
||||
VPCLMULQDQ $0, Z0, Z1, Z1
|
||||
VPTERNLOGD $0x96, Z11, Z5, Z1 // Combine results with xor into Z1
|
||||
|
||||
ADDQ $0x40, DI
|
||||
ADDQ $64, SI // buf+=64
|
||||
SUBQ $64, CX // len-=64
|
||||
CMPQ CX, $64 // Less than 64 bytes left?
|
||||
JGE loopback64Avx512
|
||||
|
||||
// Unfold result into XMM1-XMM4 to match SSE4 code.
|
||||
VEXTRACTF32X4 $1, Z1, X2 // X2: Second 128-bit lane
|
||||
VEXTRACTF32X4 $2, Z1, X3 // X3: Third 128-bit lane
|
||||
VEXTRACTF32X4 $3, Z1, X4 // X4: Fourth 128-bit lane
|
||||
VZEROUPPER
|
||||
JMP remain64
|
||||
|
||||
PCALIGN $16
|
||||
useSSE42:
|
||||
MOVOU (SI), X1
|
||||
MOVOU 16(SI), X2
|
||||
MOVOU 32(SI), X3
|
||||
|
@ -207,6 +253,7 @@ loopback64:
|
|||
CMPQ CX, $64 // Less than 64 bytes left?
|
||||
JGE loopback64
|
||||
|
||||
PCALIGN $16
|
||||
/* Fold result into a single register (X1) */
|
||||
remain64:
|
||||
MOVOA r4r3<>+0(SB), X0
|
||||
|
|
|
@ -26,29 +26,30 @@ var CacheLineSize uintptr = CacheLinePadSize
|
|||
// in addition to the cpuid feature bit being set.
|
||||
// The struct is padded to avoid false sharing.
|
||||
var X86 struct {
|
||||
_ CacheLinePad
|
||||
HasAES bool
|
||||
HasADX bool
|
||||
HasAVX bool
|
||||
HasAVX2 bool
|
||||
HasAVX512F bool
|
||||
HasAVX512BW bool
|
||||
HasAVX512VL bool
|
||||
HasBMI1 bool
|
||||
HasBMI2 bool
|
||||
HasERMS bool
|
||||
HasFSRM bool
|
||||
HasFMA bool
|
||||
HasOSXSAVE bool
|
||||
HasPCLMULQDQ bool
|
||||
HasPOPCNT bool
|
||||
HasRDTSCP bool
|
||||
HasSHA bool
|
||||
HasSSE3 bool
|
||||
HasSSSE3 bool
|
||||
HasSSE41 bool
|
||||
HasSSE42 bool
|
||||
_ CacheLinePad
|
||||
_ CacheLinePad
|
||||
HasAES bool
|
||||
HasADX bool
|
||||
HasAVX bool
|
||||
HasAVX2 bool
|
||||
HasAVX512F bool
|
||||
HasAVX512BW bool
|
||||
HasAVX512VL bool
|
||||
HasBMI1 bool
|
||||
HasBMI2 bool
|
||||
HasERMS bool
|
||||
HasFSRM bool
|
||||
HasFMA bool
|
||||
HasOSXSAVE bool
|
||||
HasPCLMULQDQ bool
|
||||
HasPOPCNT bool
|
||||
HasRDTSCP bool
|
||||
HasSHA bool
|
||||
HasSSE3 bool
|
||||
HasSSSE3 bool
|
||||
HasSSE41 bool
|
||||
HasSSE42 bool
|
||||
HasAVX512VPCLMULQDQ bool
|
||||
_ CacheLinePad
|
||||
}
|
||||
|
||||
// The booleans in ARM contain the correspondingly named cpu feature bit.
|
||||
|
|
|
@ -40,6 +40,10 @@ const (
|
|||
cpuid_SHA = 1 << 29
|
||||
cpuid_AVX512BW = 1 << 30
|
||||
cpuid_AVX512VL = 1 << 31
|
||||
|
||||
// ecx bits
|
||||
cpuid_AVX512VPCLMULQDQ = 1 << 10
|
||||
|
||||
// edx bits
|
||||
cpuid_FSRM = 1 << 4
|
||||
// edx bits for CPUID 0x80000001
|
||||
|
@ -57,6 +61,7 @@ func doinit() {
|
|||
{Name: "pclmulqdq", Feature: &X86.HasPCLMULQDQ},
|
||||
{Name: "rdtscp", Feature: &X86.HasRDTSCP},
|
||||
{Name: "sha", Feature: &X86.HasSHA},
|
||||
{Name: "vpclmulqdq", Feature: &X86.HasAVX512VPCLMULQDQ},
|
||||
}
|
||||
level := getGOAMD64level()
|
||||
if level < 2 {
|
||||
|
@ -139,7 +144,7 @@ func doinit() {
|
|||
return
|
||||
}
|
||||
|
||||
_, ebx7, _, edx7 := cpuid(7, 0)
|
||||
_, ebx7, ecx7, edx7 := cpuid(7, 0)
|
||||
X86.HasBMI1 = isSet(ebx7, cpuid_BMI1)
|
||||
X86.HasAVX2 = isSet(ebx7, cpuid_AVX2) && osSupportsAVX
|
||||
X86.HasBMI2 = isSet(ebx7, cpuid_BMI2)
|
||||
|
@ -151,6 +156,7 @@ func doinit() {
|
|||
if X86.HasAVX512F {
|
||||
X86.HasAVX512BW = isSet(ebx7, cpuid_AVX512BW)
|
||||
X86.HasAVX512VL = isSet(ebx7, cpuid_AVX512VL)
|
||||
X86.HasAVX512VPCLMULQDQ = isSet(ecx7, cpuid_AVX512VPCLMULQDQ)
|
||||
}
|
||||
|
||||
X86.HasFSRM = isSet(edx7, cpuid_FSRM)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue