hash/crc32: add AVX512 IEEE CRC32 calculation

Benchmark:

goos: windows
goarch: amd64
pkg: hash/crc32
cpu: AMD Ryzen 9 9950X 16-Core Processor

benchmark                                               old MB/s     new MB/s     speedup
BenchmarkCRC32/poly=IEEE/size=15/align=0-32             1081.48      1089.42      1.01x
BenchmarkCRC32/poly=IEEE/size=15/align=1-32             1085.87      1082.61      1.00x
BenchmarkCRC32/poly=IEEE/size=40/align=0-32             2756.33      2752.37      1.00x
BenchmarkCRC32/poly=IEEE/size=40/align=1-32             2758.27      2756.99      1.00x
BenchmarkCRC32/poly=IEEE/size=512/align=0-32            18133.44     18076.52     1.00x
BenchmarkCRC32/poly=IEEE/size=512/align=1-32            18151.05     18055.41     0.99x
BenchmarkCRC32/poly=IEEE/size=1kB/align=0-32            19902.93     48581.07     2.44x
BenchmarkCRC32/poly=IEEE/size=1kB/align=1-32            19966.99     48393.25     2.42x
BenchmarkCRC32/poly=IEEE/size=4kB/align=0-32            21690.33     51679.25     2.38x
BenchmarkCRC32/poly=IEEE/size=4kB/align=1-32            21655.30     51731.22     2.39x
BenchmarkCRC32/poly=IEEE/size=32kB/align=0-32           22046.57     46406.90     2.10x
BenchmarkCRC32/poly=IEEE/size=32kB/align=1-32           21986.22     46250.66     2.10x

AVX512 is enabled for input sizes of 1KB and above.

This rather high limit is used because AVX512 may be slower to ramp up
than the regular SSE4 implementation for smaller inputs.

This is not reflected in the benchmarks,
since consecutive calls mean the CPU is already "hot".

The 'HasAVX512VPCLMULQDQ' name mirrors the one in golang.org/x/sys/cpu

Change-Id: Id23685d8e3cc412b6d397a7d70056844bdb79271

Change-Id: Id23685d8e3cc412b6d397a7d70056844bdb79271
GitHub-Last-Rev: 6639f07b9f
GitHub-Pull-Request: golang/go#74701
Reviewed-on: https://go-review.googlesource.com/c/go/+/689435
Reviewed-by: Keith Randall <khr@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Auto-Submit: Keith Randall <khr@golang.org>
Auto-Submit: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
This commit is contained in:
Klaus Post 2025-07-23 11:23:52 +00:00 committed by Gopher Robot
parent c641900f72
commit 18dbe5b941
4 changed files with 84 additions and 25 deletions

View file

@ -13,6 +13,11 @@ import (
"unsafe"
)
// Offsets into internal/cpu records for use in assembly.
// The assembly code refers to each constant below with a const_ prefix.
const (
// offsetX86HasAVX512VPCLMULQDQL is the byte offset of the
// HasAVX512VPCLMULQDQ field within cpu.X86. The assembly uses it
// (as const_offsetX86HasAVX512VPCLMULQDQL) to test the feature flag
// before taking the AVX512 path.
offsetX86HasAVX512VPCLMULQDQL = unsafe.Offsetof(cpu.X86.HasAVX512VPCLMULQDQ)
)
// This file contains the code to call the SSE 4.2 version of the Castagnoli
// and IEEE CRC.

View file

@ -3,6 +3,7 @@
// license that can be found in the LICENSE file.
#include "textflag.h"
#include "go_asm.h"
// castagnoliSSE42 updates the (non-inverted) crc with the given buffer.
//
@ -136,15 +137,23 @@ loop:
// Linux kernel, since they avoid the costly
// PSHUFB 16 byte reversal proposed in the
// original Intel paper.
// The r2/r1 pair is repeated four times (64 bytes in total) so the whole
// table can be loaded into a ZMM register with a single VMOVDQU64.
DATA r2r1<>+0(SB)/8, $0x154442bd4
DATA r2r1<>+8(SB)/8, $0x1c6e41596
DATA r2r1<>+16(SB)/8, $0x154442bd4
DATA r2r1<>+24(SB)/8, $0x1c6e41596
DATA r2r1<>+32(SB)/8, $0x154442bd4
DATA r2r1<>+40(SB)/8, $0x1c6e41596
DATA r2r1<>+48(SB)/8, $0x154442bd4
DATA r2r1<>+56(SB)/8, $0x1c6e41596
DATA r4r3<>+0(SB)/8, $0x1751997d0
DATA r4r3<>+8(SB)/8, $0x0ccaa009e
DATA rupoly<>+0(SB)/8, $0x1db710641
DATA rupoly<>+8(SB)/8, $0x1f7011641
DATA r5<>+0(SB)/8, $0x163cd6124
GLOBL r2r1<>(SB),RODATA,$16
GLOBL r2r1<>(SB), RODATA, $64
GLOBL r4r3<>(SB),RODATA,$16
GLOBL rupoly<>(SB),RODATA,$16
GLOBL r5<>(SB),RODATA,$8
@ -158,6 +167,43 @@ TEXT ·ieeeCLMUL(SB),NOSPLIT,$0
MOVQ p+8(FP), SI // data pointer
MOVQ p_len+16(FP), CX // len(p)
// Check feature support and length to be >= 1024 bytes.
CMPB internalcpu·X86+const_offsetX86HasAVX512VPCLMULQDQL(SB), $1
JNE useSSE42
CMPQ CX, $1024
JL useSSE42
// Use AVX512
VPXORQ Z0, Z0, Z0
VMOVQ AX, X0
VMOVDQU64 (SI), Z1
VPXORQ Z0, Z1, Z1 // Merge initial CRC value into Z1
ADDQ $64, SI // buf+=64
SUBQ $64, CX // len-=64
VMOVDQU64 r2r1<>+0(SB), Z0
loopback64Avx512:
VMOVDQU64 (SI), Z11 // Load next
VPCLMULQDQ $0x11, Z0, Z1, Z5
VPCLMULQDQ $0, Z0, Z1, Z1
VPTERNLOGD $0x96, Z11, Z5, Z1 // Combine results with xor into Z1
ADDQ $0x40, DI
ADDQ $64, SI // buf+=64
SUBQ $64, CX // len-=64
CMPQ CX, $64 // Less than 64 bytes left?
JGE loopback64Avx512
// Unfold result into XMM1-XMM4 to match SSE4 code.
VEXTRACTF32X4 $1, Z1, X2 // X2: Second 128-bit lane
VEXTRACTF32X4 $2, Z1, X3 // X3: Third 128-bit lane
VEXTRACTF32X4 $3, Z1, X4 // X4: Fourth 128-bit lane
VZEROUPPER
JMP remain64
PCALIGN $16
useSSE42:
MOVOU (SI), X1
MOVOU 16(SI), X2
MOVOU 32(SI), X3
@ -207,6 +253,7 @@ loopback64:
CMPQ CX, $64 // Less than 64 bytes left?
JGE loopback64
PCALIGN $16
/* Fold result into a single register (X1) */
remain64:
MOVOA r4r3<>+0(SB), X0

View file

@ -48,6 +48,7 @@ var X86 struct {
HasSSSE3 bool
HasSSE41 bool
HasSSE42 bool
HasAVX512VPCLMULQDQ bool
_ CacheLinePad
}

View file

@ -40,6 +40,10 @@ const (
cpuid_SHA = 1 << 29
cpuid_AVX512BW = 1 << 30
cpuid_AVX512VL = 1 << 31
// ecx bits
cpuid_AVX512VPCLMULQDQ = 1 << 10
// edx bits
cpuid_FSRM = 1 << 4
// edx bits for CPUID 0x80000001
@ -57,6 +61,7 @@ func doinit() {
{Name: "pclmulqdq", Feature: &X86.HasPCLMULQDQ},
{Name: "rdtscp", Feature: &X86.HasRDTSCP},
{Name: "sha", Feature: &X86.HasSHA},
{Name: "vpclmulqdq", Feature: &X86.HasAVX512VPCLMULQDQ},
}
level := getGOAMD64level()
if level < 2 {
@ -139,7 +144,7 @@ func doinit() {
return
}
_, ebx7, _, edx7 := cpuid(7, 0)
_, ebx7, ecx7, edx7 := cpuid(7, 0)
X86.HasBMI1 = isSet(ebx7, cpuid_BMI1)
X86.HasAVX2 = isSet(ebx7, cpuid_AVX2) && osSupportsAVX
X86.HasBMI2 = isSet(ebx7, cpuid_BMI2)
@ -151,6 +156,7 @@ func doinit() {
if X86.HasAVX512F {
X86.HasAVX512BW = isSet(ebx7, cpuid_AVX512BW)
X86.HasAVX512VL = isSet(ebx7, cpuid_AVX512VL)
X86.HasAVX512VPCLMULQDQ = isSet(ecx7, cpuid_AVX512VPCLMULQDQ)
}
X86.HasFSRM = isSet(edx7, cpuid_FSRM)