diff --git a/src/hash/crc32/crc32_amd64.go b/src/hash/crc32/crc32_amd64.go
index 6be129f5dd..105ce01a1e 100644
--- a/src/hash/crc32/crc32_amd64.go
+++ b/src/hash/crc32/crc32_amd64.go
@@ -13,6 +13,11 @@ import (
 	"unsafe"
 )
 
+// Offset into internal/cpu records for use in assembly: crc32_amd64.s reads it as const_offsetX86HasAVX512VPCLMULQDQL (via go_asm.h).
+const (
+	offsetX86HasAVX512VPCLMULQDQL = unsafe.Offsetof(cpu.X86.HasAVX512VPCLMULQDQ)
+)
+
 // This file contains the code to call the SSE 4.2 version of the Castagnoli
 // and IEEE CRC.
 
diff --git a/src/hash/crc32/crc32_amd64.s b/src/hash/crc32/crc32_amd64.s
index 6af6c253a7..4c482dc4a7 100644
--- a/src/hash/crc32/crc32_amd64.s
+++ b/src/hash/crc32/crc32_amd64.s
@@ -3,6 +3,7 @@
 // license that can be found in the LICENSE file.
 
 #include "textflag.h"
+#include "go_asm.h"
 
 // castagnoliSSE42 updates the (non-inverted) crc with the given buffer.
 //
@@ -136,15 +137,23 @@ loop:
 // Linux kernel, since they avoid the costly
 // PSHUFB 16 byte reversal proposed in the
 // original Intel paper.
+// The 16-byte r2r1 pair is repeated four times (64 bytes) so a single VMOVDQU64 loads it into every 128-bit lane of a ZMM register.
 DATA r2r1<>+0(SB)/8, $0x154442bd4
 DATA r2r1<>+8(SB)/8, $0x1c6e41596
+DATA r2r1<>+16(SB)/8, $0x154442bd4
+DATA r2r1<>+24(SB)/8, $0x1c6e41596
+DATA r2r1<>+32(SB)/8, $0x154442bd4
+DATA r2r1<>+40(SB)/8, $0x1c6e41596
+DATA r2r1<>+48(SB)/8, $0x154442bd4
+DATA r2r1<>+56(SB)/8, $0x1c6e41596
+
 DATA r4r3<>+0(SB)/8, $0x1751997d0
 DATA r4r3<>+8(SB)/8, $0x0ccaa009e
 DATA rupoly<>+0(SB)/8, $0x1db710641
 DATA rupoly<>+8(SB)/8, $0x1f7011641
 DATA r5<>+0(SB)/8, $0x163cd6124
 
-GLOBL r2r1<>(SB),RODATA,$16
+GLOBL r2r1<>(SB), RODATA, $64
 GLOBL r4r3<>(SB),RODATA,$16
 GLOBL rupoly<>(SB),RODATA,$16
 GLOBL r5<>(SB),RODATA,$8
@@ -158,6 +167,43 @@ TEXT ·ieeeCLMUL(SB),NOSPLIT,$0
 	MOVQ   p+8(FP), SI  	         // data pointer
 	MOVQ   p_len+16(FP), CX          // len(p)
 
+	// Take the AVX-512 path only when VPCLMULQDQ is available and len(p) >= 1024.
+	CMPB internal∕cpu·X86+const_offsetX86HasAVX512VPCLMULQDQL(SB), $1
+	JNE  useSSE42
+	CMPQ CX, $1024
+	JL   useSSE42
+
+	// Use AVX512: Z1 starts as the first 64 bytes of input xored with the initial CRC.
+	VPXORQ    Z0, Z0, Z0
+	VMOVQ     AX, X0
+	VMOVDQU64 (SI), Z1
+	VPXORQ    Z0, Z1, Z1 // Merge initial CRC value into Z1
+	ADDQ      $64, SI    // buf+=64
+	SUBQ      $64, CX    // len-=64
+
+	VMOVDQU64 r2r1<>+0(SB), Z0 // Folding constants, one copy per 128-bit lane
+
+	// Each iteration folds Z1 over the next 64 bytes of input.
+loopback64Avx512:
+	VMOVDQU64  (SI), Z11          // Load next
+	VPCLMULQDQ $0x11, Z0, Z1, Z5  // Multiply high qwords of each lane
+	VPCLMULQDQ $0, Z0, Z1, Z1     // Multiply low qwords of each lane
+	VPTERNLOGD $0x96, Z11, Z5, Z1 // Combine results with xor into Z1
+
+	ADDQ $64, SI    // buf+=64
+	SUBQ $64, CX    // len-=64
+	CMPQ CX, $64    // Less than 64 bytes left?
+	JGE  loopback64Avx512
+
+	// Unfold result into XMM1-XMM4 to match SSE4 code.
+	VEXTRACTF32X4 $1, Z1, X2 // X2: Second 128-bit lane
+	VEXTRACTF32X4 $2, Z1, X3 // X3: Third 128-bit lane
+	VEXTRACTF32X4 $3, Z1, X4 // X4: Fourth 128-bit lane
+	VZEROUPPER
+	JMP remain64
+
+	PCALIGN $16
+useSSE42:
 	MOVOU  (SI), X1
 	MOVOU  16(SI), X2
 	MOVOU  32(SI), X3
@@ -207,6 +253,7 @@ loopback64:
 	CMPQ    CX, $64      // Less than 64 bytes left?
 	JGE     loopback64
 
+	PCALIGN $16
 	/* Fold result into a single register (X1) */
 remain64:
 	MOVOA       r4r3<>+0(SB), X0
diff --git a/src/internal/cpu/cpu.go b/src/internal/cpu/cpu.go
index 760dc0b469..6017b1acc9 100644
--- a/src/internal/cpu/cpu.go
+++ b/src/internal/cpu/cpu.go
@@ -26,29 +26,30 @@ var CacheLineSize uintptr = CacheLinePadSize
 // in addition to the cpuid feature bit being set.
 // The struct is padded to avoid false sharing.
 var X86 struct {
-	_            CacheLinePad
-	HasAES       bool
-	HasADX       bool
-	HasAVX       bool
-	HasAVX2      bool
-	HasAVX512F   bool
-	HasAVX512BW  bool
-	HasAVX512VL  bool
-	HasBMI1      bool
-	HasBMI2      bool
-	HasERMS      bool
-	HasFSRM      bool
-	HasFMA       bool
-	HasOSXSAVE   bool
-	HasPCLMULQDQ bool
-	HasPOPCNT    bool
-	HasRDTSCP    bool
-	HasSHA       bool
-	HasSSE3      bool
-	HasSSSE3     bool
-	HasSSE41     bool
-	HasSSE42     bool
-	_            CacheLinePad
+	_                   CacheLinePad
+	HasAES              bool
+	HasADX              bool
+	HasAVX              bool
+	HasAVX2             bool
+	HasAVX512F          bool
+	HasAVX512BW         bool
+	HasAVX512VL         bool
+	HasBMI1             bool
+	HasBMI2             bool
+	HasERMS             bool
+	HasFSRM             bool
+	HasFMA              bool
+	HasOSXSAVE          bool
+	HasPCLMULQDQ        bool
+	HasPOPCNT           bool
+	HasRDTSCP           bool
+	HasSHA              bool
+	HasSSE3             bool
+	HasSSSE3            bool
+	HasSSE41            bool
+	HasSSE42            bool
+	HasAVX512VPCLMULQDQ bool // CPUID leaf 7 ECX bit 10; only set when HasAVX512F is set
+	_                   CacheLinePad
 }
 
 // The booleans in ARM contain the correspondingly named cpu feature bit.
diff --git a/src/internal/cpu/cpu_x86.go b/src/internal/cpu/cpu_x86.go
index ee812076e9..69b9542ae2 100644
--- a/src/internal/cpu/cpu_x86.go
+++ b/src/internal/cpu/cpu_x86.go
@@ -40,6 +40,10 @@ const (
 	cpuid_SHA      = 1 << 29
 	cpuid_AVX512BW = 1 << 30
 	cpuid_AVX512VL = 1 << 31
+
+	// ecx bits for CPUID leaf 7, subleaf 0
+	cpuid_AVX512VPCLMULQDQ = 1 << 10
+
 	// edx bits
 	cpuid_FSRM = 1 << 4
 	// edx bits for CPUID 0x80000001
@@ -57,6 +61,7 @@ func doinit() {
 		{Name: "pclmulqdq", Feature: &X86.HasPCLMULQDQ},
 		{Name: "rdtscp", Feature: &X86.HasRDTSCP},
 		{Name: "sha", Feature: &X86.HasSHA},
+		{Name: "vpclmulqdq", Feature: &X86.HasAVX512VPCLMULQDQ},
 	}
 	level := getGOAMD64level()
 	if level < 2 {
@@ -139,7 +144,7 @@ func doinit() {
 		return
 	}
 
-	_, ebx7, _, edx7 := cpuid(7, 0)
+	_, ebx7, ecx7, edx7 := cpuid(7, 0)
 	X86.HasBMI1 = isSet(ebx7, cpuid_BMI1)
 	X86.HasAVX2 = isSet(ebx7, cpuid_AVX2) && osSupportsAVX
 	X86.HasBMI2 = isSet(ebx7, cpuid_BMI2)
@@ -151,6 +156,7 @@ func doinit() {
 	if X86.HasAVX512F {
 		X86.HasAVX512BW = isSet(ebx7, cpuid_AVX512BW)
 		X86.HasAVX512VL = isSet(ebx7, cpuid_AVX512VL)
+		X86.HasAVX512VPCLMULQDQ = isSet(ecx7, cpuid_AVX512VPCLMULQDQ)
 	}
 
 	X86.HasFSRM = isSet(edx7, cpuid_FSRM)