archive/zip: fix writer-side Zip64 edge cases

I ran into this because the broken Writer caused mysterious and very
hard to debug failures uploading archive/zip-generated files to the
Internet Archive. (Only zip files bigger than 4GiB *and* smaller than
around 6.5GiB failed. I still don't have an explanation for the latter
part; maybe the parser has different logic for when the count of records
crosses 65535 and the Zip64 EOCD is used.)

Reproducing testdata/zip64/*.zsparse:

Inputs (sparse zero files via `truncate -s N NAME`, sizes in bytes):
  big5g.bin     5<<30                   big4g.bin     4<<30
  big4g-1.bin   (4<<30) - 1             big4g-2.bin   (4<<30) - 2
  under4g.bin   (4<<30) - 59            first         (4<<30) - 36
  small.bin     42  (use `dd` for the non-sparse 42-byte file)

Cases (case → entries, M=0 Store, M=9 Deflate):
  store-5g             big5g.bin/0
  deflate-zeros-5g     big5g.bin/9
  store-4g-minus-1     big4g-1.bin/0
  store-4g-minus-2     big4g-2.bin/0
  store-just-under-4g  under4g.bin/0
  store-exact-4g       big4g.bin/0
  offset-past-4g       big5g.bin/0, small.bin/0
  offset-eq-4g         first/0,     small.bin/0

Producers:
  infozip-*    Info-ZIP 3.0:
                 zip -q -X -M OUT.zip <entries>
  libarchive-* bsdtar (libarchive):
                 bsdtar -cf OUT.zip --format zip \
                   --options zip:compression={store|deflate} <entries>
  go126-*      archive/zip from Go 1.26. Build with GOTOOLCHAIN=go1.26.0
                 from a tempdir whose go.mod declares `go 1.26.0`.
                 For each entry:
                 zip.FileHeader{Name, Method: zip.Store|zip.Deflate},
                 CreateHeader, io.CopyN(fw, zeros, size), w.Close().

Convert each OUT.zip to ${producer}-${case}.zsparse using the format
defined in archive/zip/zip64_sparse_test.go (scanSparse / readSparse):
walk the zip in 4 KiB chunks, drop chunks that are entirely zero,
coalesce adjacent non-zero chunks into spans, and serialize the result
as gzip of:

  uint64 LE  totalSize
  uint32 LE  numSpans
  numSpans times:
    uint64 LE  offset
    uint32 LE  dataLen
    dataLen bytes

Updates #22520
Fixes #23572
Fixes #33116
Fixes #69415

Change-Id: I6e24e7170094346af494da153c63e6b56a6a6964
Reviewed-on: https://go-review.googlesource.com/c/go/+/725161
Auto-Submit: Filippo Valsorda <filippo@golang.org>
LUCI-TryBot-Result: golang-scoped@luci-project-accounts.iam.gserviceaccount.com <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
SLSA-Policy-Verified: SLSA Policy Verification Service <devtools-gerritcodereview-exitgate@google.com>
Reviewed-by: Russ Cox <rsc@golang.org>
This commit is contained in:
Filippo Valsorda 2025-11-30 23:23:38 +01:00 committed by Gopher Robot
parent a7ea4a7ecd
commit 3a9c8e1d90
28 changed files with 975 additions and 66 deletions

View file

@ -156,7 +156,11 @@ type FileHeader struct {
// UncompressedSize64 is the uncompressed size of the file in bytes.
UncompressedSize64 uint64
Extra []byte
// Extra are the extensible data fields. The writer automatically includes
// the appropriate Zip64 field if necessary, and [Writer.Close] appends the
// Central Directory version of the Zip64 field to Extra.
Extra []byte
ExternalAttrs uint32 // Meaning depends on CreatorVersion
}
@ -337,11 +341,6 @@ func (h *FileHeader) SetMode(mode fs.FileMode) {
}
}
// isZip64 reports whether the file size exceeds the 32 bit limit
func (h *FileHeader) isZip64() bool {
return h.CompressedSize64 >= uint32max || h.UncompressedSize64 >= uint32max
}
func (h *FileHeader) hasDataDescriptor() bool {
return h.Flags&0x8 != 0
}

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -93,48 +93,65 @@ func (w *Writer) Close() error {
// write central directory
start := w.cw.count
usedZip64 := false
for _, h := range w.dir {
// For the Central Directory, we always have the correct sizes.
//
// Implementations disagree on what triggers the inclusion of a Zip64
// extra field: Info-ZIP only writes it if any size or offset EXCEEDS
// 4GiB - 1, while libarchive writes it if any size REACHES OR EXCEEDS
// 4GiB - 1, or if the offset EXCEEDS 4GiB - 1. The spec is ambiguous.
//
// We conservatively write Zip64 extra fields if any size or offset
// REACHES OR EXCEEDS 4GiB - 1, to maximize compatibility with readers.
// There is no ambiguity in parsing, so there is no downside to it.
//
// The spec is clear though that all and only the fields that REACH OR
// EXCEED 4GiB - 1 are included in the Zip64 extra, once it's present.
readerVersion := h.ReaderVersion
if h.CompressedSize64 >= uint32max || h.UncompressedSize64 >= uint32max || h.offset >= uint32max {
usedZip64 = true
readerVersion = max(readerVersion, zipVersion45)
var size uint16
var buf [28]byte // 2x uint16 + up to 3x uint64
eb := writeBuf(buf[:])
eb.uint16(zip64ExtraID)
eb.uint16(0) // size to be filled out later
if h.UncompressedSize64 >= uint32max {
eb.uint64(h.UncompressedSize64)
size += 8
}
if h.CompressedSize64 >= uint32max {
eb.uint64(h.CompressedSize64)
size += 8
}
if h.offset >= uint32max {
eb.uint64(h.offset)
size += 8
}
sb := writeBuf(buf[2:])
sb.uint16(size)
h.Extra = append(h.Extra, buf[:4+size]...)
}
var buf [directoryHeaderLen]byte
b := writeBuf(buf[:])
b.uint32(uint32(directoryHeaderSignature))
b.uint16(h.CreatorVersion)
b.uint16(h.ReaderVersion)
b.uint16(readerVersion)
b.uint16(h.Flags)
b.uint16(h.Method)
b.uint16(h.ModifiedTime)
b.uint16(h.ModifiedDate)
b.uint32(h.CRC32)
if h.isZip64() || h.offset >= uint32max {
// the file needs a zip64 header. store maxint in both
// 32 bit size fields (and offset later) to signal that the
// zip64 extra header should be used.
b.uint32(uint32max) // compressed size
b.uint32(uint32max) // uncompressed size
// append a zip64 extra block to Extra
var buf [28]byte // 2x uint16 + 3x uint64
eb := writeBuf(buf[:])
eb.uint16(zip64ExtraID)
eb.uint16(24) // size = 3x uint64
eb.uint64(h.UncompressedSize64)
eb.uint64(h.CompressedSize64)
eb.uint64(h.offset)
h.Extra = append(h.Extra, buf[:]...)
} else {
b.uint32(h.CompressedSize)
b.uint32(h.UncompressedSize)
}
b.uint32(uint32(min(h.CompressedSize64, uint32max)))
b.uint32(uint32(min(h.UncompressedSize64, uint32max)))
b.uint16(uint16(len(h.Name)))
b.uint16(uint16(len(h.Extra)))
b.uint16(uint16(len(h.Comment)))
b = b[4:] // skip disk number start and internal file attr (2x uint16)
b.uint32(h.ExternalAttrs)
if h.offset > uint32max {
b.uint32(uint32max)
} else {
b.uint32(uint32(h.offset))
}
b.uint32(uint32(min(h.offset, uint32max)))
if _, err := w.cw.Write(buf[:]); err != nil {
return err
}
@ -158,7 +175,11 @@ func (w *Writer) Close() error {
f(size, offset)
}
if records >= uint16max || size >= uint32max || offset >= uint32max {
// Emit the Zip64 EOCD records whenever any individual entry needed a Zip64
// extra field, even if the EOCD's own fields fit in 32 bits, matching
// Info-ZIP (but not libarchive). See APPNOTE 4.3.9.2: "when Zip64
// extensions are in use, the EOCD64 record must be present."
if usedZip64 || records >= uint16max || size >= uint32max || offset >= uint32max {
var buf [directory64EndLen + directory64LocLen]byte
b := writeBuf(buf[:])
@ -183,24 +204,18 @@ func (w *Writer) Close() error {
if _, err := w.cw.Write(buf[:]); err != nil {
return err
}
// store max values in the regular end record to signal
// that the zip64 values should be used instead
records = uint16max
size = uint32max
offset = uint32max
}
// write end record
var buf [directoryEndLen]byte
b := writeBuf(buf[:])
b.uint32(uint32(directoryEndSignature))
b = b[4:] // skip over disk number and first disk number (2x uint16)
b.uint16(uint16(records)) // number of entries this disk
b.uint16(uint16(records)) // number of entries total
b.uint32(uint32(size)) // size of directory
b.uint32(uint32(offset)) // start of directory
b.uint16(uint16(len(w.comment))) // byte size of EOCD comment
b = b[4:] // skip over disk number and first disk number (2x uint16)
b.uint16(uint16(min(uint16max, records))) // number of entries this disk
b.uint16(uint16(min(uint16max, records))) // number of entries total
b.uint32(uint32(min(uint32max, size))) // size of directory
b.uint32(uint32(min(uint32max, offset))) // start of directory
b.uint16(uint16(len(w.comment))) // byte size of EOCD comment
if _, err := w.cw.Write(buf[:]); err != nil {
return err
}
@ -398,38 +413,79 @@ func writeHeader(w io.Writer, h *header) error {
return errLongExtra
}
// The correct behavior of a streaming writer, implemented by Info-ZIP 3.0,
// would be to write 0xFFFFFFFF in the size fields and then write a Zip64
// extra field with the sizes at zero (to signal they are stored in a ZIP64
// data descriptor, in case the file is > 4GiB).
//
// We don't do that, and instead write zeroes directly in the size fields,
// because that wastes 28 bytes for every file smaller than 4GiB, and
// because it would change the encoding of nearly every zip file created by
// archive/zip. (No one should rely on it being stable, but still.)
//
// Anyway, the Local File Header is not that important, as the Central
// Directory is authoritative, and there we always write the correct sizes.
//
// If we do know the sizes, because [Writer.CreateRaw] is used and the data
// descriptor flag is not set, then we write them to the header. If either
// size reaches 4GiB, we write 0xFFFFFFFF placeholders and a Zip64 extra
// field with BOTH sizes, per the spec and matching Info-ZIP. Note this is
// different from the Central Directory Zip64 extra field logic, somehow.
//
// (One final interesting case that doesn't apply to us: if the input is
// streaming but the output is seekable, Info-ZIP always writes Zip64 extra
// fields, and then goes back and patches in the sizes, even for files < 4GiB.)
var zip64ExtraInfo []byte
readerVersion := h.ReaderVersion
noDataDescriptor := h.raw && !h.hasDataDescriptor()
if noDataDescriptor && (h.CompressedSize64 > uint32max || h.UncompressedSize64 > uint32max) {
readerVersion = max(readerVersion, zipVersion45)
zip64ExtraInfo = make([]byte, 20) // 2x uint16 + 2x uint64
b := writeBuf(zip64ExtraInfo)
b.uint16(zip64ExtraID)
b.uint16(16) // size of Zip64 extra field data
b.uint64(h.UncompressedSize64)
b.uint64(h.CompressedSize64)
}
var buf [fileHeaderLen]byte
b := writeBuf(buf[:])
b.uint32(uint32(fileHeaderSignature))
b.uint16(h.ReaderVersion)
b.uint16(readerVersion)
b.uint16(h.Flags)
b.uint16(h.Method)
b.uint16(h.ModifiedTime)
b.uint16(h.ModifiedDate)
// In raw mode (caller does the compression), the values are either
// written here or in the trailing data descriptor based on the header
// flags.
if h.raw && !h.hasDataDescriptor() {
if noDataDescriptor {
b.uint32(h.CRC32)
b.uint32(uint32(min(h.CompressedSize64, uint32max)))
b.uint32(uint32(min(h.UncompressedSize64, uint32max)))
if zip64ExtraInfo != nil {
b.uint32(uint32max)
b.uint32(uint32max)
} else {
b.uint32(uint32(h.CompressedSize64))
b.uint32(uint32(h.UncompressedSize64))
}
} else {
// When this package handles the compression, these values are
// always written to the trailing data descriptor.
b.uint32(0) // crc32
b.uint32(0) // compressed size
b.uint32(0) // uncompressed size
}
b.uint16(uint16(len(h.Name)))
b.uint16(uint16(len(h.Extra)))
b.uint16(uint16(len(h.Extra) + len(zip64ExtraInfo)))
if _, err := w.Write(buf[:]); err != nil {
return err
}
if _, err := io.WriteString(w, h.Name); err != nil {
return err
}
_, err := w.Write(h.Extra)
return err
if _, err := w.Write(h.Extra); err != nil {
return err
}
if _, err := w.Write(zip64ExtraInfo); err != nil {
return err
}
return nil
}
// CreateRaw adds a file to the zip archive using the provided [FileHeader] and
@ -601,7 +657,7 @@ func (w *fileWriter) close() error {
fh.CompressedSize64 = uint64(w.compCount.count)
fh.UncompressedSize64 = uint64(w.rawCount.count)
if fh.isZip64() {
if w.CompressedSize64 > uint32max || w.UncompressedSize64 > uint32max {
fh.CompressedSize = uint32max
fh.UncompressedSize = uint32max
fh.ReaderVersion = zipVersion45 // requires 4.5 - File uses ZIP64 format extensions
@ -617,13 +673,13 @@ func (w *fileWriter) writeDataDescriptor() error {
if !w.hasDataDescriptor() {
return nil
}
// Write data descriptor. This is more complicated than one would
// think, see e.g. comments in zipfile.c:putextended() and
// https://bugs.openjdk.org/browse/JDK-7073588.
// The approach here is to write 8 byte sizes if needed without
// adding a zip64 extra in the local header (too late anyway).
// See the comment in [writeHeader] about how and why we don't signal ZIP64
// mode in the local file header. If one of the sizes turns out to exceed
// 4GiB, we use the 64-bit sizes anyway, for lack of alternatives.
//
// See also https://bugs.openjdk.org/browse/JDK-7073588.
var buf []byte
if w.isZip64() {
if w.CompressedSize64 > uint32max || w.UncompressedSize64 > uint32max {
buf = make([]byte, dataDescriptor64Len)
} else {
buf = make([]byte, dataDescriptorLen)
@ -631,7 +687,7 @@ func (w *fileWriter) writeDataDescriptor() error {
b := writeBuf(buf)
b.uint32(dataDescriptorSignature) // de-facto standard, required by OS X
b.uint32(w.CRC32)
if w.isZip64() {
if w.CompressedSize64 > uint32max || w.UncompressedSize64 > uint32max {
b.uint64(w.CompressedSize64)
b.uint64(w.UncompressedSize64)
} else {

View file

@ -0,0 +1,240 @@
// Copyright 2026 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package zip
import (
"cmp"
"compress/gzip"
"errors"
"fmt"
"io"
"os"
"slices"
)
// A sparseFile is an archive image described by its total length plus the
// byte spans that are not all zero (the LFH headers, the Central Directory,
// the EOCD records, and any non-zero compressed bodies). Every byte of
// [0, Size) that is not covered by a span reads as zero. Goldens under
// testdata/zip64/ (suffix .zsparse) are stored in this form, and the
// writer-reproduction harness produces the same shape in memory.
//
// Serialized layout (every integer little-endian):
//
//	uint64 size
//	uint32 numSpans
//	numSpans times:
//		uint64 offset
//		uint32 dataLen
//		dataLen bytes
//
// Spans are ordered by offset and do not overlap.
type sparseFile struct {
	Size  int64
	Spans []sparseSpan
}

// A sparseSpan is one run of stored bytes located at Offset in the file.
type sparseSpan struct {
	Offset int64
	Data   []byte
}

// ReadAt implements [io.ReaderAt]. Bytes covered by a span are copied from
// it; every other byte inside [0, Size) is returned as zero. A read that
// reaches Size returns the bytes before it together with io.EOF.
func (f *sparseFile) ReadAt(p []byte, off int64) (int, error) {
	switch {
	case off < 0:
		return 0, errors.New("sparseFile: negative offset")
	case off >= f.Size:
		return 0, io.EOF
	}

	// [off, limit) is the part of the request that lies inside the file.
	limit := off + int64(len(p))
	if limit > f.Size {
		limit = f.Size
	}
	window := p[:limit-off]

	// Start from all zeros, then overlay whatever the spans provide.
	clear(window)
	for i := range f.Spans {
		span := &f.Spans[i]
		lo, hi := span.Offset, span.Offset+int64(len(span.Data))
		if hi <= off || lo >= limit {
			continue // no intersection with the requested window
		}
		lo, hi = max(lo, off), min(hi, limit)
		copy(window[lo-off:hi-off], span.Data[lo-span.Offset:hi-span.Offset])
	}

	if len(window) < len(p) {
		return len(window), io.EOF
	}
	return len(window), nil
}

// materializeTail returns the final keep bytes of the conceptual file as an
// ordinary byte slice, suitable for [parseCD], along with the absolute
// offset of data[0]. If keep exceeds the file size, the whole file is
// returned.
func (f *sparseFile) materializeTail(keep int64) (data []byte, baseOff uint64) {
	keep = min(keep, f.Size)
	start := f.Size - keep
	data = make([]byte, keep)
	// The result of ReadAt is deliberately ignored: the request ends exactly
	// at Size, so the only possible error is io.EOF for an empty tail.
	f.ReadAt(data, start)
	return data, uint64(start)
}
// sparseChunk is the granularity, in bytes, at which zero runs are detected.
const sparseChunk = 4096

// scanSparse consumes r to EOF and returns its sparse representation. The
// stream is examined in sparseChunk-sized pieces: a piece that is entirely
// zero becomes (part of) a gap, and consecutive pieces that are not are
// merged into a single span. A read error other than end-of-stream aborts
// the scan.
func scanSparse(r io.Reader) (*sparseFile, error) {
	var (
		out     sparseFile
		pending sparseSpan // span being accumulated, valid only while open
		open    bool
	)
	// flush moves the in-progress span, if any, into the result.
	flush := func() {
		if !open {
			return
		}
		out.Spans = append(out.Spans, pending)
		pending, open = sparseSpan{}, false
	}

	chunk := make([]byte, sparseChunk)
	for {
		n, err := io.ReadFull(r, chunk)
		if n > 0 {
			piece := chunk[:n]
			if isAllZero(piece) {
				flush()
			} else {
				if !open {
					pending, open = sparseSpan{Offset: out.Size}, true
				}
				pending.Data = append(pending.Data, piece...)
			}
			out.Size += int64(n)
		}
		switch err {
		case nil:
			// Full chunk read; keep going.
		case io.EOF, io.ErrUnexpectedEOF:
			// End of input, possibly after a short final chunk.
			flush()
			return &out, nil
		default:
			return nil, err
		}
	}
}
// writeSparse serializes f to w in the on-disk format described on
// [sparseFile].
//
// The format stores the number of spans and the length of each span as a
// uint32. If the span count or any span length does not fit, writeSparse
// returns an error before writing anything, instead of truncating the value
// and emitting a stream that can no longer be parsed back correctly.
// Errors from w are returned as-is.
func writeSparse(w io.Writer, f *sparseFile) error {
	const maxUint32 = 1<<32 - 1
	if uint64(len(f.Spans)) > maxUint32 {
		return fmt.Errorf("sparseFile: span count %d does not fit in uint32", len(f.Spans))
	}
	for i := range f.Spans {
		if n := len(f.Spans[i].Data); uint64(n) > maxUint32 {
			return fmt.Errorf("sparseFile: span %d length %d does not fit in uint32", i, n)
		}
	}

	// File header: total size followed by the span count.
	var hdr [12]byte
	le.PutUint64(hdr[:8], uint64(f.Size))
	le.PutUint32(hdr[8:12], uint32(len(f.Spans)))
	if _, err := w.Write(hdr[:]); err != nil {
		return err
	}
	for _, s := range f.Spans {
		// Span header (offset, length), then the span bytes.
		var b [12]byte
		le.PutUint64(b[:8], uint64(s.Offset))
		le.PutUint32(b[8:12], uint32(len(s.Data)))
		if _, err := w.Write(b[:]); err != nil {
			return err
		}
		if _, err := w.Write(s.Data); err != nil {
			return err
		}
	}
	return nil
}
// readSparse parses the on-disk format described on [sparseFile] from r.
//
// Besides decoding, it enforces the invariants the rest of the code relies
// on, returning an error if:
//   - the size does not fit in an int64,
//   - the span count is implausibly large (more than 1<<20),
//   - a span does not lie entirely inside [0, size),
//   - the spans are not sorted by offset, or
//   - two spans overlap.
//
// Each span is bounds-checked before its data buffer is allocated, so a
// corrupt length cannot request more memory than the declared file size.
// Errors from r (including a truncated stream) are returned as-is.
func readSparse(r io.Reader) (*sparseFile, error) {
	var hdr [12]byte
	if _, err := io.ReadFull(r, hdr[:]); err != nil {
		return nil, err
	}
	f := &sparseFile{
		Size: int64(le.Uint64(hdr[:8])),
	}
	if f.Size < 0 {
		return nil, fmt.Errorf("sparseFile: size %d overflows int64", le.Uint64(hdr[:8]))
	}
	n := le.Uint32(hdr[8:12])
	if n > 1<<20 {
		return nil, fmt.Errorf("sparseFile: implausible span count %d", n)
	}
	f.Spans = make([]sparseSpan, n)
	for i := range f.Spans {
		var b [12]byte
		if _, err := io.ReadFull(r, b[:]); err != nil {
			return nil, err
		}
		off := int64(le.Uint64(b[:8]))
		sz := le.Uint32(b[8:12])
		// off <= f.Size is checked first so that f.Size-off cannot overflow.
		if off < 0 || off > f.Size || int64(sz) > f.Size-off {
			return nil, fmt.Errorf("sparseFile: span %d (offset %d, length %d) lies outside file of size %d",
				i, le.Uint64(b[:8]), sz, f.Size)
		}
		f.Spans[i].Offset = off
		f.Spans[i].Data = make([]byte, sz)
		if _, err := io.ReadFull(r, f.Spans[i].Data); err != nil {
			return nil, err
		}
	}
	if !slices.IsSortedFunc(f.Spans, func(a, b sparseSpan) int {
		return cmp.Compare(a.Offset, b.Offset)
	}) {
		return nil, errors.New("sparseFile: spans not sorted")
	}
	// Sorted spans overlap exactly when one ends after the next one starts.
	for i := 1; i < len(f.Spans); i++ {
		prev := &f.Spans[i-1]
		if prev.Offset+int64(len(prev.Data)) > f.Spans[i].Offset {
			return nil, fmt.Errorf("sparseFile: spans %d and %d overlap", i-1, i)
		}
	}
	return f, nil
}
// readSparseFile loads the sparse file stored at path. The file on disk is
// expected to be the gzip compression of the format parsed by readSparse;
// the gzip layer keeps goldens that carry non-zero compressed bodies (such
// as the deflate-zeros entries) small.
func readSparseFile(path string) (*sparseFile, error) {
	file, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	gz, err := gzip.NewReader(file)
	if err != nil {
		return nil, err
	}
	defer gz.Close()

	return readSparse(gz)
}
// isAllZero reports whether b contains no non-zero byte. An empty slice is
// considered all-zero.
func isAllZero(b []byte) bool {
	i := 0
	for i < len(b) && b[i] == 0 {
		i++
	}
	// The scan stops early at the first non-zero byte.
	return i == len(b)
}
// sparseBuffer is an [io.Writer] that records what is written to it as a
// [sparseFile]. Each Write is processed in pieces of at most sparseChunk
// bytes; a piece that is entirely zero only advances the file size, while
// any other piece is appended to the current span (or starts a new one
// after a zero piece). Pushing multi-GiB streams of zeros through the zip
// writer therefore retains little more than the LFHs, the Central
// Directory, the EOCD records, and any non-zero compressed body.
type sparseBuffer struct {
	f   sparseFile
	cur *sparseSpan // span currently being extended, or nil after a zero piece
}

// Write records p and always reports len(p), nil.
func (t *sparseBuffer) Write(p []byte) (int, error) {
	for rest := p; len(rest) > 0; {
		k := min(len(rest), sparseChunk)
		piece := rest[:k]
		rest = rest[k:]

		if isAllZero(piece) {
			// End the current span; the zeros themselves are not stored.
			t.cur = nil
		} else {
			if t.cur == nil {
				idx := len(t.f.Spans)
				t.f.Spans = append(t.f.Spans, sparseSpan{Offset: t.f.Size})
				// Spans is only appended to while cur is nil, so this
				// pointer stays valid for as long as it is used.
				t.cur = &t.f.Spans[idx]
			}
			t.cur.Data = append(t.cur.Data, piece...)
		}
		t.f.Size += int64(k)
	}
	return len(p), nil
}

View file

@ -0,0 +1,614 @@
// Copyright 2026 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package zip
import (
"bytes"
"encoding/binary"
"errors"
"fmt"
"io"
"path/filepath"
"slices"
"strings"
"testing"
)
// TestZip64WriterCDGoldens verifies that the Central Directory produced by
// the archive/zip Writer follows the same Zip64 conventions as Info-ZIP,
// libarchive, and the pre-CL archive/zip writer (go126-*) for archives at
// or above 4 GiB, apart from the places where we diverge on purpose.
//
// Every golden in testdata/zip64/*.zsparse (format: see [sparseFile]) goes
// through these steps:
//  1. The golden's CD is parsed into a producer-independent snapshot: which
//     fields carry 0xFFFFFFFF placeholders, which Zip64 extra sub-fields
//     exist and in which order, and the EOCD/EOCD64 values.
//  2. The production [NewReader] must parse the golden.
//  3. The same entries are replayed through a fresh [Writer] into a
//     [sparseBuffer], and our own CD is parsed.
//  4. The production [NewReader] must parse the reproduced archive as well.
//  5. The two snapshots are compared field by field, leaving out
//     producer-specific details (creator version, external attrs, non-Zip64
//     extras, absolute byte offsets that depend on LFH/data-descriptor
//     layout).
func TestZip64WriterCDGoldens(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping in short mode; each golden replays a multi-GiB write")
	}

	goldens, err := filepath.Glob("testdata/zip64/*.zsparse")
	switch {
	case err != nil:
		t.Fatal(err)
	case len(goldens) == 0:
		t.Fatal("missing Zip64 goldens in testdata/zip64")
	}

	// Size of the tail handed to parseCD. Goldens have at most 2 entries, so
	// their CD plus EOCD records fit in far less than 1 MiB.
	const tailKeep = 1 << 20

	// The archive/zip writer is deliberately defensive wherever the spec is
	// fuzzy: it emits the Zip64 extra already at the 0xFFFFFFFF boundary
	// (like libarchive, more conservative than Info-ZIP) AND emits EOCD64 as
	// soon as any entry has a Zip64 extra in its CD record (like Info-ZIP,
	// more conservative than libarchive). The go126-* goldens come from an
	// older archive/zip writer whose format we intentionally left behind;
	// they stay here so the reader-side check guards compatibility with our
	// own past output and so we only diverge where intended.
	//
	// A true value marks a golden whose snapshot is expected to differ.
	expectedDiff := map[string]bool{
		// Info-ZIP takes a CD size field of exactly 0xFFFFFFFF at face
		// value and writes no Zip64 extra; archive/zip defensively writes
		// the Zip64 extra with USize64+CSize64.
		"infozip-store-4g-minus-1": true,

		// Info-ZIP takes a CD offset field of exactly 0xFFFFFFFF at face
		// value and writes no Zip64 offset; archive/zip defensively writes
		// the Zip64 extra with the offset sub-field.
		"infozip-offset-eq-4g": true,

		// libarchive writes EOCD64 only when the EOCD itself overflows (CD
		// size/offset > 4GiB, records > 0xFFFF); archive/zip also writes
		// EOCD64 when any per-entry CD record uses a Zip64 extra, even if
		// the EOCD fields fit in 32 bits.
		"libarchive-deflate-zeros-5g": true,

		// libarchive's LFH always includes a UT timestamp extra (~9
		// bytes), which pushes its dirOffset for a 4GiB-59 body just past
		// 0xFFFFFFFF and triggers EOCD64. archive/zip's streaming LFH has
		// no such extras and stays under uint32max.
		"libarchive-store-just-under-4g": true,

		// The old archive/zip writer differs on every Zip64-using entry:
		// it always wrote a fixed 24-byte Zip64 extra with all three
		// sub-fields (usize, csize, offset) and set both 32-bit size
		// fields to 0xFFFFFFFF whenever the per-entry trigger fired; it
		// also put placeholder values in the EOCD records/size/offset
		// whenever EOCD64 was present.
		"go126-store-5g":            true,
		"go126-deflate-zeros-5g":    true,
		"go126-store-4g-minus-1":    true,
		"go126-store-4g-minus-2":    true,
		"go126-store-exact-4g":      true,
		"go126-offset-past-4g":      true,
		"go126-offset-eq-4g":        true,
		"go126-store-just-under-4g": false,
	}

	for _, goldenPath := range goldens {
		caseName := strings.TrimSuffix(filepath.Base(goldenPath), ".zsparse")
		wantDiff := expectedDiff[caseName]
		t.Run(caseName, func(t *testing.T) {
			t.Parallel()

			// Steps 1 and 2: snapshot the golden and run it through the
			// production Reader.
			goldenSF, err := readSparseFile(goldenPath)
			if err != nil {
				t.Fatalf("read golden: %v", err)
			}
			goldenTail, goldenBase := goldenSF.materializeTail(tailKeep)
			golden, err := parseCD(goldenTail, goldenBase)
			if err != nil {
				t.Fatalf("parse golden CD: %v", err)
			}
			checkReaderMatchesSnapshot(t, "golden", goldenSF, golden)

			// Steps 3 and 4: reproduce the archive with our Writer, then
			// check that the production Reader sees the same entries.
			oursSF := reproduceCD(t, golden)
			oursTail, oursBase := oursSF.materializeTail(tailKeep)
			got, err := parseCD(oursTail, oursBase)
			if err != nil {
				t.Fatalf("parse reproduced CD: %v\nbytes:\n%s", err, hexDump(oursTail))
			}
			checkReaderMatchesSnapshot(t, "reproduced", oursSF, got)

			// Step 5: compare, directly or through a capturing reporter
			// when a mismatch is the expected outcome.
			if !wantDiff {
				compareCDSnapshots(t, golden, got)
				return
			}
			var diff captureReporter
			compareCDSnapshots(&diff, golden, got)
			if diff.failed {
				t.Logf("expected mismatch:\n%s", indent(diff.msg.String(), " "))
				return
			}
			t.Errorf("expected this golden to fail equivalence, but it passed")
		})
	}
}
// errReporter lists the [testing.TB] methods that [compareCDSnapshots]
// calls. Accepting this interface rather than *testing.T lets a caller pass
// a captureReporter and collect mismatches instead of failing the test.
type errReporter interface {
	Helper()
	Errorf(format string, args ...any)
}

// captureReporter is an errReporter that stores reported errors rather than
// forwarding them to a test.
type captureReporter struct {
	failed bool            // set once Errorf has been called
	msg    strings.Builder // all reported messages, one per line
}

var _ errReporter = (*captureReporter)(nil)

// Helper does nothing; it exists only to satisfy errReporter.
func (c *captureReporter) Helper() {}

// Errorf marks the reporter as failed and appends the formatted message,
// followed by a newline, to msg.
func (c *captureReporter) Errorf(format string, args ...any) {
	fmt.Fprintf(&c.msg, format+"\n", args...)
	c.failed = true
}
// checkReaderMatchesSnapshot parses the archive held in f with the
// production [NewReader] and checks the resulting file list against snap:
// same number of entries and, entry by entry, the same name and the same
// resolved 64-bit sizes. label prefixes every failure message. A NewReader
// error is fatal; a count mismatch is reported and ends the check.
func checkReaderMatchesSnapshot(t *testing.T, label string, f *sparseFile, snap *cdSnapshot) {
	t.Helper()

	zr, err := NewReader(f, f.Size)
	if err != nil {
		t.Fatalf("%s: NewReader: %v", label, err)
	}

	files, entries := zr.File, snap.Entries
	if len(files) != len(entries) {
		t.Errorf("%s: NewReader returned %d files, parseCD found %d", label, len(files), len(entries))
		return
	}

	for i := range entries {
		file, want := files[i], &entries[i]
		if file.Name != want.Name {
			t.Errorf("%s entry %d: Name = %q, want %q", label, i, file.Name, want.Name)
		}
		if file.UncompressedSize64 != want.USize64 {
			t.Errorf("%s entry %d %q: UncompressedSize64 = %d, want %d", label, i, want.Name, file.UncompressedSize64, want.USize64)
		}
		if file.CompressedSize64 != want.CSize64 {
			t.Errorf("%s entry %d %q: CompressedSize64 = %d, want %d", label, i, want.Name, file.CompressedSize64, want.CSize64)
		}
	}
}
// indent returns s with prefix inserted at the start of every line. Trailing
// newlines of s are collapsed into exactly one. The empty string is returned
// unchanged.
func indent(s, prefix string) string {
	if s == "" {
		return s
	}
	var out strings.Builder
	for _, line := range strings.Split(strings.TrimRight(s, "\n"), "\n") {
		out.WriteString(prefix)
		out.WriteString(line)
		out.WriteByte('\n')
	}
	return out.String()
}
// reproduceCD builds a zip archive holding the same logical entries as
// golden and returns it as a [sparseFile]. Output goes to a [sparseBuffer],
// which discards all-zero chunks, so streaming multi-GiB runs of zeros
// through the writer costs next to nothing.
//
// Entries whose compressed and uncompressed sizes are equal (Store, or any
// other 1:1 case) are written through [Writer.CreateHeader], so the data
// descriptor, the offset accounting, and the CD emission in Close all run
// on the production streaming path. Their CRC32 hasher is swapped for
// [fakeHash32] so that many GiB of zeros are not actually hashed.
//
// Entries whose compressed size differs (Method=Deflate over zeros) would
// take far too long to really deflate at test time, so they go through
// [Writer.CreateRaw] with the sizes declared up front. The resulting
// Central Directory is the same either way.
func reproduceCD(t *testing.T, golden *cdSnapshot) *sparseFile {
	t.Helper()

	out := new(sparseBuffer)
	zw := NewWriter(out)
	for i := range golden.Entries {
		e := &golden.Entries[i]

		var (
			body io.Writer
			size uint64 // number of zero bytes to push into body
		)
		if e.CSize64 == e.USize64 {
			// Streaming path.
			fw, err := zw.CreateHeader(&FileHeader{Name: e.Name, Method: e.Method})
			if err != nil {
				t.Fatalf("CreateHeader[%d %q]: %v", i, e.Name, err)
			}
			fw.(*fileWriter).crc32 = fakeHash32{}
			body, size = fw, e.USize64
		} else {
			// Raw path: the caller-declared sizes end up in the CD.
			fw, err := zw.CreateRaw(&FileHeader{
				Name:               e.Name,
				Method:             e.Method,
				CompressedSize64:   e.CSize64,
				UncompressedSize64: e.USize64,
			})
			if err != nil {
				t.Fatalf("CreateRaw[%d %q]: %v", i, e.Name, err)
			}
			body, size = fw, e.CSize64
		}

		if _, err := io.CopyN(body, zeros{}, int64(size)); err != nil {
			t.Fatalf("CopyN[%d %q]: %v", i, e.Name, err)
		}
	}
	if err := zw.Close(); err != nil {
		t.Fatalf("Close: %v", err)
	}
	return &out.f
}
// compareCDSnapshots reports, through t, every Zip64-relevant difference
// between got and want.
//
// The per-entry size fields (RawCSize, RawUSize, CSize64, USize64) must be
// equal: the reproduction is fed the golden's sizes, so the writer has no
// reason to disagree. The per-entry RawOffset and the EOCD
// records/size/offset fields are only compared as placeholder-or-not, since
// their actual values depend on producer-specific LFH layout (Info-ZIP
// packs sizes into the LFH; archive/zip's streaming path uses a data
// descriptor; libarchive adds UT extras), which this test does not pin
// down. If the entry counts differ, only that is reported.
func compareCDSnapshots(t errReporter, want, got *cdSnapshot) {
	t.Helper()
	if len(got.Entries) != len(want.Entries) {
		t.Errorf("entry count = %d, want %d", len(got.Entries), len(want.Entries))
		return
	}

	for i := range want.Entries {
		w, g := &want.Entries[i], &got.Entries[i]

		// Raw 32-bit size fields: both the real-value-vs-placeholder choice
		// and, for real values, the value itself must agree.
		for _, c := range []struct {
			field     string
			got, want uint32
		}{
			{"RawCSize", g.RawCSize, w.RawCSize},
			{"RawUSize", g.RawUSize, w.RawUSize},
		} {
			if c.got != c.want {
				t.Errorf("entry %d %q: %s = %#08x, want %#08x", i, w.Name, c.field, c.got, c.want)
			}
		}

		// Resolved 64-bit sizes.
		for _, c := range []struct {
			field     string
			got, want uint64
		}{
			{"CSize64", g.CSize64, w.CSize64},
			{"USize64", g.USize64, w.USize64},
		} {
			if c.got != c.want {
				t.Errorf("entry %d %q: %s = %d, want %d", i, w.Name, c.field, c.got, c.want)
			}
		}

		// The offset value depends on layout; only placeholder use is
		// compared.
		if isPlaceholder32(g.RawOffset) != isPlaceholder32(w.RawOffset) {
			t.Errorf("entry %d %q: RawOffset placeholder = %#08x, want %#08x", i, w.Name, g.RawOffset, w.RawOffset)
		}

		// Presence and order of the Zip64 sub-fields.
		if !slices.Equal(w.Z64ExtraFields, g.Z64ExtraFields) {
			t.Errorf("entry %d %q: Zip64 sub-field order = %v, want %v", i, w.Name, g.Z64ExtraFields, w.Z64ExtraFields)
		}

		// A golden entry with a Zip64 extra requires ReaderVersion >= 45 in
		// the reproduced entry.
		if g.ReaderVersion < zipVersion45 && len(w.Z64ExtraFields) > 0 {
			t.Errorf("entry %d %q: ReaderVersion = %d, want ≥ %d (Zip64 extra present)", i, w.Name, g.ReaderVersion, zipVersion45)
		}
	}

	// EOCD: only placeholder-or-not is compared, field by field; the values
	// themselves depend on layout.
	for _, c := range []struct {
		field                    string
		gotIsPlace, wantIsPlace bool
		got, want                uint64
	}{
		{"records", isPlaceholder16(got.EOCD.Records), isPlaceholder16(want.EOCD.Records), uint64(got.EOCD.Records), uint64(want.EOCD.Records)},
		{"size", isPlaceholder32(got.EOCD.Size), isPlaceholder32(want.EOCD.Size), uint64(got.EOCD.Size), uint64(want.EOCD.Size)},
		{"offset", isPlaceholder32(got.EOCD.Offset), isPlaceholder32(want.EOCD.Offset), uint64(got.EOCD.Offset), uint64(want.EOCD.Offset)},
	} {
		if c.gotIsPlace != c.wantIsPlace {
			t.Errorf("EOCD %s placeholder = %#x, want %#x", c.field, c.got, c.want)
		}
	}

	if got.HasEOCD64 != want.HasEOCD64 {
		t.Errorf("EOCD64 present = %v, want %v", got.HasEOCD64, want.HasEOCD64)
	}
	// EOCD64.Size and EOCD64.Offset depend on layout; only the record count
	// is compared, and only when both sides have an EOCD64.
	if want.HasEOCD64 && got.HasEOCD64 && got.EOCD64.Records != want.EOCD64.Records {
		t.Errorf("EOCD64 records = %d, want %d", got.EOCD64.Records, want.EOCD64.Records)
	}
}
// isPlaceholder32 reports whether the 32-bit field v holds uint32max, the
// value that marks a field whose real contents are stored elsewhere.
func isPlaceholder32(v uint32) bool {
	return uint32max == v
}

// isPlaceholder16 reports whether the 16-bit field v holds uint16max, the
// value that marks a field whose real contents are stored elsewhere.
func isPlaceholder16(v uint16) bool {
	return uint16max == v
}
// CD snapshot types and parser
// A zip64SubID names one of the three sub-fields that a Zip64
// extended-information extra field may contain. The constants are declared
// in the order the spec defines for those sub-fields.
type zip64SubID int

const (
	z64USize zip64SubID = iota + 1
	z64CSize
	z64Offset
)

// String returns the short name of a known sub-field ("usize", "csize" or
// "offset"), or "zip64SubID(N)" for any other value.
func (s zip64SubID) String() string {
	names := [...]string{
		z64USize:  "usize",
		z64CSize:  "csize",
		z64Offset: "offset",
	}
	// Index 0 is unused, so the valid range starts at z64USize.
	if s >= z64USize && int(s) < len(names) {
		return names[s]
	}
	return fmt.Sprintf("zip64SubID(%d)", int(s))
}
// cdEntry is the per-record snapshot that parseCD builds for each Central
// Directory (CD) record: the raw header values needed by the comparison
// checks, plus the values resolved through the Zip64 extra field.
type cdEntry struct {
	// Name is the record's file name bytes converted to a string.
	Name string

	// Method and ReaderVersion are 16-bit header values copied unchanged
	// from the record.
	Method, ReaderVersion uint16

	// Raw 32-bit fields exactly as stored in the CD record. A value of
	// 0xFFFFFFFF means the real value lives in the Zip64
	// extended-information extra field.
	RawCSize, RawUSize, RawOffset uint32

	// Resolved 64-bit values: the widened 32-bit field when it is not a
	// placeholder, otherwise the value read from the Zip64 extra.
	CSize64, USize64, Offset64 uint64

	// Z64ExtraFields lists the sub-fields found in the Zip64 extra, in the
	// order they appear.
	Z64ExtraFields []zip64SubID
}
// eocdRec holds the three fields of the classic (non-Zip64) EOCD record that
// parseCD captures. Each field may hold an all-ones placeholder instead of a
// real value.
type eocdRec struct {
	// Records is 0xFFFF if placeholder.
	Records uint16

	// Size and Offset are each 0xFFFFFFFF if placeholder.
	Size, Offset uint32
}
// eocd64Rec holds the three 64-bit fields that parseCD reads from a Zip64
// EOCD record. They carry full-width values, so no placeholder convention
// applies to them here.
type eocd64Rec struct {
	Records, Size, Offset uint64
}
// cdSnapshot is the result of parseCD: everything the comparison checks need
// from the tail of a zip archive.
type cdSnapshot struct {
	// Entries holds one element per Central Directory record, in the order
	// the records were parsed.
	Entries []cdEntry

	// EOCD is the classic end-of-central-directory record.
	EOCD eocdRec

	// HasEOCD64 reports whether a Zip64 EOCD record was found. EOCD64 is
	// only filled in when it is true.
	HasEOCD64 bool
	EOCD64    eocd64Rec
}
// le is a short alias for little-endian decoding, used by the zip record
// parsers and tests in this file.
var le = binary.LittleEndian
// parseCD parses the Central Directory and EOCD records of a zip archive
// from its raw bytes, including the Zip64 EOCD record when a Zip64 locator
// immediately precedes the EOCD.
//
// data must be the tail of the archive, with baseOffset indicating where
// data[0] sits in the original archive (0 for whole-archive input). Offsets
// stored in the archive are absolute and are rebased by baseOffset before
// they are used to index data.
//
// It returns an error when the EOCD cannot be found, when a referenced
// structure lies outside data, or when a signature does not match. Range
// checks compare against the remaining length rather than computing
// offset+length, so a corrupt 64-bit offset cannot wrap around and turn
// into an out-of-range slice panic.
func parseCD(data []byte, baseOffset uint64) (*cdSnapshot, error) {
	sigOff, err := findEOCD(data)
	if err != nil {
		return nil, err
	}
	dataLen := uint64(len(data))

	snap := &cdSnapshot{}
	snap.EOCD.Records = le.Uint16(data[sigOff+10:])
	snap.EOCD.Size = le.Uint32(data[sigOff+12:])
	snap.EOCD.Offset = le.Uint32(data[sigOff+16:])

	// Start from the 32-bit EOCD values; a Zip64 EOCD record, if found
	// below, overrides both.
	dirOffset := uint64(snap.EOCD.Offset)
	nRecords := uint64(snap.EOCD.Records)

	// toData converts an absolute archive offset to a data slice offset,
	// returning false if it lies before our captured tail.
	toData := func(absOff uint64) (uint64, bool) {
		if absOff < baseOffset {
			return 0, false
		}
		return absOff - baseOffset, true
	}

	// Look for an EOCD64 locator immediately preceding the EOCD record.
	if sigOff >= directory64LocLen {
		locOff := sigOff - directory64LocLen
		if le.Uint32(data[locOff:]) == directory64LocSignature {
			eocd64Off := le.Uint64(data[locOff+8:])
			eocd64DataOff, ok := toData(eocd64Off)
			if !ok {
				return nil, fmt.Errorf("zip: EOCD64 at %#x before captured tail (base %#x)", eocd64Off, baseOffset)
			}
			// Overflow-safe form of eocd64DataOff+directory64EndLen > len(data).
			if eocd64DataOff > dataLen || dataLen-eocd64DataOff < directory64EndLen {
				return nil, errors.New("zip: EOCD64 offset out of range")
			}
			if le.Uint32(data[eocd64DataOff:]) != directory64EndSignature {
				return nil, errors.New("zip: EOCD64 signature mismatch")
			}
			snap.HasEOCD64 = true
			snap.EOCD64.Records = le.Uint64(data[eocd64DataOff+32:])
			snap.EOCD64.Size = le.Uint64(data[eocd64DataOff+40:])
			snap.EOCD64.Offset = le.Uint64(data[eocd64DataOff+48:])
			dirOffset = snap.EOCD64.Offset
			nRecords = snap.EOCD64.Records
		}
	}

	off, ok := toData(dirOffset)
	if !ok {
		return nil, fmt.Errorf("zip: CD at %#x before captured tail (base %#x)", dirOffset, baseOffset)
	}
	for i := uint64(0); i < nRecords; i++ {
		// Overflow-safe form of off+directoryHeaderLen > len(data).
		if off > dataLen || dataLen-off < directoryHeaderLen {
			return nil, fmt.Errorf("zip: CD entry %d out of range", i)
		}
		rec := data[off:]
		if le.Uint32(rec) != directoryHeaderSignature {
			return nil, fmt.Errorf("zip: bad CD signature at offset %d", off)
		}

		var e cdEntry
		e.ReaderVersion = le.Uint16(rec[6:])
		e.Method = le.Uint16(rec[10:])
		e.RawCSize = le.Uint32(rec[20:])
		e.RawUSize = le.Uint32(rec[24:])
		nameLen := uint64(le.Uint16(rec[28:]))
		extraLen := uint64(le.Uint16(rec[30:]))
		commLen := uint64(le.Uint16(rec[32:]))
		e.RawOffset = le.Uint32(rec[42:])

		// recLen is bounded by the fixed header plus three 16-bit lengths,
		// so it cannot overflow; off <= dataLen was established above.
		recLen := uint64(directoryHeaderLen) + nameLen + extraLen + commLen
		if recLen > dataLen-off {
			return nil, fmt.Errorf("zip: CD entry %d truncated", i)
		}
		nameOff := off + directoryHeaderLen
		extraOff := nameOff + nameLen
		e.Name = string(data[nameOff:extraOff])
		extra := data[extraOff : extraOff+extraLen]

		// Default the resolved values to the raw ones; the Zip64 extra
		// below replaces any that are placeholders.
		e.CSize64 = uint64(e.RawCSize)
		e.USize64 = uint64(e.RawUSize)
		e.Offset64 = uint64(e.RawOffset)

		// Walk extra fields; consume the Zip64 sub-field if present.
		// Per the spec and Info-ZIP convention, the Zip64 extra contains
		// 8-byte values for exactly the size/offset fields whose 32-bit
		// counterpart is 0xFFFFFFFF, in the order: USize, CSize, Offset.
		for len(extra) >= 4 {
			tag := le.Uint16(extra)
			size := uint64(le.Uint16(extra[2:]))
			if 4+size > uint64(len(extra)) {
				// Malformed trailing extra: stop walking, keep what we have.
				break
			}
			field := extra[4 : 4+size]
			extra = extra[4+size:]
			if tag != zip64ExtraID {
				continue
			}
			if e.RawUSize == uint32max && len(field) >= 8 {
				e.USize64 = le.Uint64(field)
				e.Z64ExtraFields = append(e.Z64ExtraFields, z64USize)
				field = field[8:]
			}
			if e.RawCSize == uint32max && len(field) >= 8 {
				e.CSize64 = le.Uint64(field)
				e.Z64ExtraFields = append(e.Z64ExtraFields, z64CSize)
				field = field[8:]
			}
			if e.RawOffset == uint32max && len(field) >= 8 {
				e.Offset64 = le.Uint64(field)
				e.Z64ExtraFields = append(e.Z64ExtraFields, z64Offset)
			}
		}
		snap.Entries = append(snap.Entries, e)
		off += recLen
	}
	return snap, nil
}
// findEOCD returns the index in data at which the EOCD record starts.
//
// It scans backwards from the last position that could hold an EOCD with an
// empty comment, going back at most uint16max further positions. A candidate
// is accepted only when it carries the EOCD signature and its trailing
// comment-length field makes the record end exactly at the end of data. An
// error is returned when data is shorter than an EOCD record or no candidate
// qualifies.
func findEOCD(data []byte) (uint64, error) {
	if len(data) < directoryEndLen {
		return 0, errors.New("zip: too short for EOCD")
	}
	last := len(data) - directoryEndLen
	first := max(0, last-uint16max)
	for pos := last; pos >= first; pos-- {
		if le.Uint32(data[pos:]) == directoryEndSignature {
			// The comment length is the final fixed field of the record.
			commentLen := int(le.Uint16(data[pos+20:]))
			if pos+directoryEndLen+commentLen == len(data) {
				return uint64(pos), nil
			}
		}
	}
	return 0, errors.New("zip: EOCD not found")
}
// hexDump formats data as hex for failure messages, one line per 16 bytes,
// each line prefixed with its 4-digit hex offset. Only the last 4096 bytes
// of data are dumped, and offsets are counted from the start of that tail.
func hexDump(data []byte) string {
	const (
		maxBytes = 4096
		perLine  = 16
	)
	if excess := len(data) - maxBytes; excess > 0 {
		data = data[excess:]
	}
	var sb strings.Builder
	for pos := 0; pos < len(data); pos += perLine {
		line := data[pos:]
		if len(line) > perLine {
			line = line[:perLine]
		}
		fmt.Fprintf(&sb, "%04x % x\n", pos, line)
	}
	return sb.String()
}
// TestZip64LFHBothPlaceholders covers the [Writer.CreateRaw] + no-data-
// descriptor path where the entry's uncompressed or compressed size exceeds
// 4 GiB. The Local File Header carries a Zip64 extra with both 8-byte
// USize64 and CSize64 sub-fields (matching Info-ZIP), so per APPNOTE 4.5.3
// both 32-bit size fields in the LFH must be the 0xFFFFFFFF placeholder —
// even if only one of the sizes actually overflows.
func TestZip64LFHBothPlaceholders(t *testing.T) {
var buf bytes.Buffer
w := NewWriter(&buf)
fh := &FileHeader{
Name: "x",
Method: Deflate,
CompressedSize64: 1024,
UncompressedSize64: 5 << 30, // > 4 GiB
}
fw, err := w.CreateRaw(fh)
if err != nil {
t.Fatal(err)
}
if _, err := io.CopyN(fw, zeros{}, int64(fh.CompressedSize64)); err != nil {
t.Fatal(err)
}
if err := w.Close(); err != nil {
t.Fatal(err)
}
b := buf.Bytes()
if got := le.Uint32(b[14:18]); got != fh.CRC32 {
t.Errorf("LFH CRC32 = %#x, want %#x", got, fh.CRC32)
}
if got := le.Uint32(b[18:22]); got != uint32max {
t.Errorf("LFH CompressedSize = %#x, want %#x (placeholder)", got, uint32(uint32max))
}
if got := le.Uint32(b[22:26]); got != uint32max {
t.Errorf("LFH UncompressedSize = %#x, want %#x (placeholder)", got, uint32(uint32max))
}
// The Zip64 LFH extra should carry both 64-bit sub-fields in
// USize64-then-CSize64 order.
nameLen := uint64(le.Uint16(b[26:28]))
extraLen := uint64(le.Uint16(b[28:30]))
if want := uint64(20); extraLen != want {
t.Fatalf("LFH extra length = %d, want %d", extraLen, want)
}
extra := b[30+nameLen : 30+nameLen+extraLen]
if tag := le.Uint16(extra[:2]); tag != zip64ExtraID {
t.Errorf("Zip64 extra tag = %#x, want %#x", tag, zip64ExtraID)
}
if dataLen := le.Uint16(extra[2:4]); dataLen != 16 {
t.Errorf("Zip64 extra data length = %d, want 16", dataLen)
}
if got := le.Uint64(extra[4:12]); got != fh.UncompressedSize64 {
t.Errorf("Zip64 USize64 = %d, want %d", got, fh.UncompressedSize64)
}
if got := le.Uint64(extra[12:20]); got != fh.CompressedSize64 {
t.Errorf("Zip64 CSize64 = %d, want %d", got, fh.CompressedSize64)
}
}