diff --git a/src/hash/crc32/crc32_amd64.go b/src/hash/crc32/crc32_amd64.go
index 6be129f5dd..105ce01a1e 100644
--- a/src/hash/crc32/crc32_amd64.go
+++ b/src/hash/crc32/crc32_amd64.go
@@ -13,6 +13,11 @@ import (
 	"unsafe"
 )

+// Offset into internal/cpu records for use in assembly; crc32_amd64.s reads it as const_offsetX86HasAVX512VPCLMULQDQL.
+const (
+	offsetX86HasAVX512VPCLMULQDQL = unsafe.Offsetof(cpu.X86.HasAVX512VPCLMULQDQ) // NOTE(review): trailing "L" in the name looks unintentional; the .s file uses the same spelling, so a rename must touch both.
+)
+
 // This file contains the code to call the SSE 4.2 version of the Castagnoli
 // and IEEE CRC.

diff --git a/src/hash/crc32/crc32_amd64.s b/src/hash/crc32/crc32_amd64.s
index 6af6c253a7..4c482dc4a7 100644
--- a/src/hash/crc32/crc32_amd64.s
+++ b/src/hash/crc32/crc32_amd64.s
@@ -3,6 +3,7 @@
 // license that can be found in the LICENSE file.

 #include "textflag.h"
+#include "go_asm.h"

 // castagnoliSSE42 updates the (non-inverted) crc with the given buffer.
 //
@@ -136,15 +137,23 @@ loop:
 // Linux kernel, since they avoid the costly
 // PSHUFB 16 byte reversal proposed in the
 // original Intel paper.
+// The r2r1 pair is splatted (repeated four times, 64 bytes) so it can be loaded into a ZMM register with a single VMOVDQU64.
 DATA r2r1<>+0(SB)/8, $0x154442bd4
 DATA r2r1<>+8(SB)/8, $0x1c6e41596
+DATA r2r1<>+16(SB)/8, $0x154442bd4
+DATA r2r1<>+24(SB)/8, $0x1c6e41596
+DATA r2r1<>+32(SB)/8, $0x154442bd4
+DATA r2r1<>+40(SB)/8, $0x1c6e41596
+DATA r2r1<>+48(SB)/8, $0x154442bd4
+DATA r2r1<>+56(SB)/8, $0x1c6e41596
+
 DATA r4r3<>+0(SB)/8, $0x1751997d0
 DATA r4r3<>+8(SB)/8, $0x0ccaa009e
 DATA rupoly<>+0(SB)/8, $0x1db710641
 DATA rupoly<>+8(SB)/8, $0x1f7011641
 DATA r5<>+0(SB)/8, $0x163cd6124

-GLOBL r2r1<>(SB),RODATA,$16
+GLOBL r2r1<>(SB), RODATA, $64
 GLOBL r4r3<>(SB),RODATA,$16
 GLOBL rupoly<>(SB),RODATA,$16
 GLOBL r5<>(SB),RODATA,$8
@@ -158,6 +167,43 @@ TEXT ·ieeeCLMUL(SB),NOSPLIT,$0
 	MOVQ p+8(FP), SI // data pointer
 	MOVQ p_len+16(FP), CX // len(p)

+	// Check feature support and length to be >= 1024 bytes; if either test fails, jump to the existing SSE code at useSSE42.
+	CMPB internal∕cpu·X86+const_offsetX86HasAVX512VPCLMULQDQL(SB), $1
+	JNE useSSE42
+	CMPQ CX, $1024
+	JL useSSE42
+
+	// Use AVX512: Z1 carries the running state and each loop iteration consumes 64 bytes of input.
+	VPXORQ Z0, Z0, Z0 // Z0 = 0
+	VMOVQ AX, X0 // NOTE(review): no visible line loads AX; confirm it holds crc at this point
+	VMOVDQU64 (SI), Z1 // first 64 bytes of the buffer
+	VPXORQ Z0, Z1, Z1 // Merge initial CRC value into Z1
+	ADDQ $64, SI // buf+=64
+	SUBQ $64, CX // len-=64
+
+	VMOVDQU64 r2r1<>+0(SB), Z0 // Z0 = r2r1 constants repeated in all four 128-bit lanes
+
+loopback64Avx512:
+	VMOVDQU64 (SI), Z11 // Load next
+	VPCLMULQDQ $0x11, Z0, Z1, Z5 // Z5 = per-lane carry-less product of the high qwords
+	VPCLMULQDQ $0, Z0, Z1, Z1 // Z1 = per-lane carry-less product of the low qwords
+	VPTERNLOGD $0x96, Z11, Z5, Z1 // Combine results with xor into Z1
+
+	ADDQ $0x40, DI // NOTE(review): DI is not set or read anywhere in the visible hunks; confirm this is needed
+	ADDQ $64, SI // buf+=64
+	SUBQ $64, CX // len-=64
+	CMPQ CX, $64 // Less than 64 bytes left?
+	JGE loopback64Avx512
+
+	// Unfold result into XMM1-XMM4 to match SSE4 code (the lowest lane of Z1 is already X1).
+	VEXTRACTF32X4 $1, Z1, X2 // X2: Second 128-bit lane
+	VEXTRACTF32X4 $2, Z1, X3 // X3: Third 128-bit lane
+	VEXTRACTF32X4 $3, Z1, X4 // X4: Fourth 128-bit lane
+	VZEROUPPER // clear the upper vector bits before continuing in the SSE code at remain64
+	JMP remain64
+
+	PCALIGN $16
+useSSE42:
 	MOVOU (SI), X1
 	MOVOU 16(SI), X2
 	MOVOU 32(SI), X3
@@ -207,6 +253,7 @@ loopback64:
 	CMPQ CX, $64 // Less than 64 bytes left?
 	JGE loopback64

+	PCALIGN $16
 	/* Fold result into a single register (X1) */
 remain64:
 	MOVOA r4r3<>+0(SB), X0
diff --git a/src/internal/cpu/cpu.go b/src/internal/cpu/cpu.go
index 760dc0b469..6017b1acc9 100644
--- a/src/internal/cpu/cpu.go
+++ b/src/internal/cpu/cpu.go
@@ -26,29 +26,30 @@ var CacheLineSize uintptr = CacheLinePadSize
 // in addition to the cpuid feature bit being set.
 // The struct is padded to avoid false sharing.
 var X86 struct {
-	_            CacheLinePad
-	HasAES       bool
-	HasADX       bool
-	HasAVX       bool
-	HasAVX2      bool
-	HasAVX512F   bool
-	HasAVX512BW  bool
-	HasAVX512VL  bool
-	HasBMI1      bool
-	HasBMI2      bool
-	HasERMS      bool
-	HasFSRM      bool
-	HasFMA       bool
-	HasOSXSAVE   bool
-	HasPCLMULQDQ bool
-	HasPOPCNT    bool
-	HasRDTSCP    bool
-	HasSHA       bool
-	HasSSE3      bool
-	HasSSSE3     bool
-	HasSSE41     bool
-	HasSSE42     bool
-	_            CacheLinePad
+	_                   CacheLinePad
+	HasAES              bool
+	HasADX              bool
+	HasAVX              bool
+	HasAVX2             bool
+	HasAVX512F          bool
+	HasAVX512BW         bool
+	HasAVX512VL         bool
+	HasBMI1             bool
+	HasBMI2             bool
+	HasERMS             bool
+	HasFSRM             bool
+	HasFMA              bool
+	HasOSXSAVE          bool
+	HasPCLMULQDQ        bool
+	HasPOPCNT           bool
+	HasRDTSCP           bool
+	HasSHA              bool
+	HasSSE3             bool
+	HasSSSE3            bool
+	HasSSE41            bool
+	HasSSE42            bool
+	HasAVX512VPCLMULQDQ bool // assigned in doinit only inside the HasAVX512F check
+	_                   CacheLinePad
 }

 // The booleans in ARM contain the correspondingly named cpu feature bit.
diff --git a/src/internal/cpu/cpu_x86.go b/src/internal/cpu/cpu_x86.go
index ee812076e9..69b9542ae2 100644
--- a/src/internal/cpu/cpu_x86.go
+++ b/src/internal/cpu/cpu_x86.go
@@ -40,6 +40,10 @@ const (
 	cpuid_SHA      = 1 << 29
 	cpuid_AVX512BW = 1 << 30
 	cpuid_AVX512VL = 1 << 31
+
+	// ecx bits (tested against the ecx result of cpuid(7, 0) in doinit)
+	cpuid_AVX512VPCLMULQDQ = 1 << 10
+
+	// edx bits
 	cpuid_FSRM = 1 << 4
 	// edx bits for CPUID 0x80000001
@@ -57,6 +61,7 @@ func doinit() {
 		{Name: "pclmulqdq", Feature: &X86.HasPCLMULQDQ},
 		{Name: "rdtscp", Feature: &X86.HasRDTSCP},
 		{Name: "sha", Feature: &X86.HasSHA},
+		{Name: "vpclmulqdq", Feature: &X86.HasAVX512VPCLMULQDQ},
 	}
 	level := getGOAMD64level()
 	if level < 2 {
@@ -139,7 +144,7 @@ func doinit() {
 		return
 	}

-	_, ebx7, _, edx7 := cpuid(7, 0)
+	_, ebx7, ecx7, edx7 := cpuid(7, 0) // ecx7 is now kept so the VPCLMULQDQ bit can be tested below
 	X86.HasBMI1 = isSet(ebx7, cpuid_BMI1)
 	X86.HasAVX2 = isSet(ebx7, cpuid_AVX2) && osSupportsAVX
 	X86.HasBMI2 = isSet(ebx7, cpuid_BMI2)
@@ -151,6 +156,7 @@ func doinit() {
 	if X86.HasAVX512F {
 		X86.HasAVX512BW = isSet(ebx7, cpuid_AVX512BW)
 		X86.HasAVX512VL = isSet(ebx7, cpuid_AVX512VL)
+		X86.HasAVX512VPCLMULQDQ = isSet(ecx7, cpuid_AVX512VPCLMULQDQ) // stays false unless HasAVX512F is set
 	}

 	X86.HasFSRM = isSet(edx7, cpuid_FSRM)