cpython/Lib/test/test_zipfile64.py
Danny Lin aec0aed197
gh-51067: Add remove() and repack() to ZipFile (GH-134627)
The docs included in the commit do the best job of describing this.

Much discussion on the PR and issue.

Thank you to the core team folks jaraco, emmatyping, gpshead, and all others who added their constructive comments along the way.

---------

Co-authored-by: blurb-it[bot] <43283697+blurb-it[bot]@users.noreply.github.com>
Co-authored-by: Gregory P. Smith <greg@krypto.org>
2026-06-20 12:45:53 -07:00

320 lines
13 KiB
Python

# Tests of the full ZIP64 functionality of zipfile
# The support.requires call is the only reason for keeping this separate
# from test_zipfile
from test import support
# XXX(nnorwitz): disable this test by looking for extralargefile resource,
# which doesn't exist. This test takes over 30 minutes to run in general
# and requires more disk space than most of the buildbots.
# NOTE(review): this call runs at import time, ahead of the remaining imports
# below; presumably support.requires() aborts loading of the module when the
# resource is not enabled -- confirm against the test.support documentation.
support.requires(
    'extralargefile',
    'test requires loads of disk-space bytes and a long time to run'
)
import zipfile, unittest
import time
import tracemalloc
import sys
import unittest.mock as mock
from tempfile import TemporaryFile
from test.support import os_helper
from test.support import requires_zlib
from test.test_zipfile.test_core import Unseekable
from test.test_zipfile.test_core import struct_pack_no_dd_sig
# Scratch paths for archives written to disk; the tearDown() methods of the
# test classes in this file unlink them again.
TESTFN = os_helper.TESTFN
TESTFN2 = TESTFN + "2"
# How much time in seconds can pass before we print a 'Still working' message.
_PRINT_WORKING_MSG_INTERVAL = 60
class TestsWithSourceFile(unittest.TestCase):
    """Write about 6 GiB of data into a ZIP archive and read it back."""

    def setUp(self):
        # Payload that is stored over and over again in the archive.
        lines = ["Test of zipfile line %d." % n for n in range(1000000)]
        self.data = '\n'.join(lines).encode('ascii')

    def tearDown(self):
        os_helper.unlink(TESTFN2)

    def _report_progress(self, deadline, template, done, total):
        """Print a progress line once *deadline* has passed.

        *template* is a %-format string taking (done, total).  Returns the
        deadline to use for the next call: unchanged when nothing was
        printed, otherwise pushed _PRINT_WORKING_MSG_INTERVAL seconds ahead.
        """
        if deadline > time.monotonic():
            return deadline
        deadline = time.monotonic() + _PRINT_WORKING_MSG_INTERVAL
        # Write to the real stdout so the message shows up even when the
        # test runner has replaced sys.stdout.
        print(template % (done, total), file=sys.__stdout__)
        sys.__stdout__.flush()
        return deadline

    def zipTest(self, f, compression):
        """Round-trip the payload through archive *f* using *compression*.

        *f* is handed straight to zipfile.ZipFile, first for writing and
        then for reading.
        """
        # Writing phase.
        with zipfile.ZipFile(f, "w", compression) as archive:
            # Enough copies of self.data to reach about 6 GiB of raw data.
            total = 6*1024**3 // len(self.data)
            deadline = time.monotonic() + _PRINT_WORKING_MSG_INTERVAL
            for index in range(total):
                archive.writestr("testfn%d" % index, self.data)
                # This test can be really slow, so show signs of life.
                deadline = self._report_progress(
                    deadline,
                    ' zipTest still writing %d of %d, be patient...',
                    index, total)

        # Reading phase.
        with zipfile.ZipFile(f, "r", compression) as archive:
            for index in range(total):
                stored = archive.read("testfn%d" % index)
                self.assertEqual(stored, self.data)
                deadline = self._report_progress(
                    deadline,
                    ' zipTest still reading %d of %d, be patient...',
                    index, total)

            # testzip() must consider the archive valid as well.
            self.assertIsNone(archive.testzip())

    def _round_trip(self, compression):
        """Run zipTest() on a temporary file, then on TESTFN2."""
        # The temp file goes first.  Doing TESTFN2 first would hog
        # gigabytes of disk space for the duration of the test.
        with TemporaryFile() as tmp:
            self.zipTest(tmp, compression)
            self.assertFalse(tmp.closed)
        self.zipTest(TESTFN2, compression)

    def testStored(self):
        self._round_trip(zipfile.ZIP_STORED)

    @requires_zlib()
    def testDeflated(self):
        self._round_trip(zipfile.ZIP_DEFLATED)
class TestRepack(unittest.TestCase):
def setUp(self):
# Create test data.
line_gen = ("Test of zipfile line %d." % i for i in range(1000000))
self.data = '\n'.join(line_gen).encode('ascii')
# It will contain enough copies of self.data to reach about 8 GiB.
self.datacount = 8*1024**3 // len(self.data)
# memory usage should not exceed 10 MiB
self.allowed_memory = 10*1024**2
def _write_large_file(self, fh):
next_time = time.monotonic() + _PRINT_WORKING_MSG_INTERVAL
for num in range(self.datacount):
fh.write(self.data)
# Print still working message since this test can be really slow
if next_time <= time.monotonic():
next_time = time.monotonic() + _PRINT_WORKING_MSG_INTERVAL
print((
' writing %d of %d, be patient...' %
(num, self.datacount)), file=sys.__stdout__)
sys.__stdout__.flush()
def test_strip_removed_large_file(self):
"""Should move the physical data of a file positioned after a large
removed file without causing a memory issue."""
# Try the temp file. If we do TESTFN2, then it hogs
# gigabytes of disk space for the duration of the test.
with TemporaryFile() as f:
tracemalloc.start()
self._test_strip_removed_large_file(f)
self.assertFalse(f.closed)
current, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()
self.assertLess(peak, self.allowed_memory)
def _test_strip_removed_large_file(self, f):
file = 'file.txt'
file1 = 'largefile.txt'
data = b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'
with zipfile.ZipFile(f, 'w') as zh:
with zh.open(file1, 'w', force_zip64=True) as fh:
self._write_large_file(fh)
zh.writestr(file, data)
with zipfile.ZipFile(f, 'a') as zh:
zh.remove(file1)
zh.repack()
self.assertIsNone(zh.testzip())
def test_strip_removed_file_before_large_file(self):
"""Should move the physical data of a large file positioned after a
removed file without causing a memory issue."""
# Try the temp file. If we do TESTFN2, then it hogs
# gigabytes of disk space for the duration of the test.
with TemporaryFile() as f:
tracemalloc.start()
self._test_strip_removed_file_before_large_file(f)
self.assertFalse(f.closed)
current, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()
self.assertLess(peak, self.allowed_memory)
def _test_strip_removed_file_before_large_file(self, f):
file = 'file.txt'
file1 = 'largefile.txt'
data = b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'
with zipfile.ZipFile(f, 'w') as zh:
zh.writestr(file, data)
with zh.open(file1, 'w', force_zip64=True) as fh:
self._write_large_file(fh)
with zipfile.ZipFile(f, 'a') as zh:
zh.remove(file)
zh.repack()
self.assertIsNone(zh.testzip())
def test_strip_removed_large_file_with_dd(self):
"""Should scan for the data descriptor of a removed large file without
causing a memory issue."""
# Try the temp file. If we do TESTFN2, then it hogs
# gigabytes of disk space for the duration of the test.
with TemporaryFile() as f:
tracemalloc.start()
self._test_strip_removed_large_file_with_dd(f)
self.assertFalse(f.closed)
current, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()
self.assertLess(peak, self.allowed_memory)
def _test_strip_removed_large_file_with_dd(self, f):
file = 'file.txt'
file1 = 'largefile.txt'
data = b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'
with zipfile.ZipFile(Unseekable(f), 'w') as zh:
with zh.open(file1, 'w', force_zip64=True) as fh:
self._write_large_file(fh)
zh.writestr(file, data)
with zipfile.ZipFile(f, 'a') as zh:
zh.remove(file1)
zh.repack()
self.assertIsNone(zh.testzip())
def test_strip_removed_large_file_with_dd_no_sig(self):
"""Should scan for the data descriptor (without signature) of a removed
large file without causing a memory issue."""
# Reduce data scale for this test, as it's especially slow...
self.datacount = 30*1024**2 // len(self.data)
self.allowed_memory = 200*1024
# Try the temp file. If we do TESTFN2, then it hogs
# gigabytes of disk space for the duration of the test.
with TemporaryFile() as f:
tracemalloc.start()
self._test_strip_removed_large_file_with_dd_no_sig(f)
self.assertFalse(f.closed)
current, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()
self.assertLess(peak, self.allowed_memory)
def _test_strip_removed_large_file_with_dd_no_sig(self, f):
file = 'file.txt'
file1 = 'largefile.txt'
data = b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'
with mock.patch('zipfile.struct.pack', side_effect=struct_pack_no_dd_sig):
with zipfile.ZipFile(Unseekable(f), 'w') as zh:
with zh.open(file1, 'w', force_zip64=True) as fh:
self._write_large_file(fh)
zh.writestr(file, data)
with zipfile.ZipFile(f, 'a') as zh:
zh.remove(file1)
# strict_descriptor=False to scan the unsigned data descriptor
# (scanning is disabled under the strict_descriptor=True default)
zh.repack(strict_descriptor=False)
self.assertIsNone(zh.testzip())
@requires_zlib()
def test_strip_removed_large_file_with_dd_no_sig_by_decompression(self):
"""Should scan for the data descriptor (without signature) of a removed
large file without causing a memory issue."""
# Try the temp file. If we do TESTFN2, then it hogs
# gigabytes of disk space for the duration of the test.
with TemporaryFile() as f:
tracemalloc.start()
self._test_strip_removed_large_file_with_dd_no_sig_by_decompression(
f, zipfile.ZIP_DEFLATED)
self.assertFalse(f.closed)
current, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()
self.assertLess(peak, self.allowed_memory)
def _test_strip_removed_large_file_with_dd_no_sig_by_decompression(self, f, method):
file = 'file.txt'
file1 = 'largefile.txt'
data = b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'
with mock.patch('zipfile.struct.pack', side_effect=struct_pack_no_dd_sig):
with zipfile.ZipFile(Unseekable(f), 'w', compression=method) as zh:
with zh.open(file1, 'w', force_zip64=True) as fh:
self._write_large_file(fh)
zh.writestr(file, data)
with zipfile.ZipFile(f, 'a') as zh:
zh.remove(file1)
# strict_descriptor=False to detect the unsigned data descriptor
# (scanning is disabled under the strict_descriptor=True default)
zh.repack(strict_descriptor=False)
self.assertIsNone(zh.testzip())
class OtherTests(unittest.TestCase):
    """Checks for archives holding more than 64k members."""

    def tearDown(self):
        os_helper.unlink(TESTFN)
        os_helper.unlink(TESTFN2)

    @staticmethod
    def _member_name(index):
        """Return the archive name used for member number *index*."""
        return "foo%08d" % index

    @staticmethod
    def _member_text(index):
        """Return the text stored in member number *index*."""
        return "%d" % (index**3 % 57)

    def _add_members(self, archive, indices):
        """Write one member per entry of *indices* into *archive*."""
        for index in indices:
            archive.writestr(self._member_name(index),
                             self._member_text(index))

    def _assert_member_count(self, archive, expected):
        self.assertEqual(len(archive.namelist()), expected)

    def _assert_rejects_one_more(self, archive, count):
        """Adding member number *count* must fail and leave *archive* as is."""
        with self.assertRaises(zipfile.LargeZipFile):
            archive.writestr(self._member_name(count), b'')
        self._assert_member_count(archive, count)

    def _verify_archive(self, expected):
        """Reopen TESTFN for reading and check all *expected* members."""
        with zipfile.ZipFile(TESTFN, mode="r") as reader:
            self._assert_member_count(reader, expected)
            for index in range(expected):
                text = reader.read(self._member_name(index)).decode('ascii')
                self.assertEqual(text, self._member_text(index))

    def testMoreThan64kFiles(self):
        # This test checks that more than 64k files can be added to an archive,
        # and that the resulting archive can be read properly by ZipFile
        with zipfile.ZipFile(TESTFN, mode="w", allowZip64=True) as writer:
            writer.debug = 100
            total = (1 << 16) * 3//2
            self._add_members(writer, range(total))
            self._assert_member_count(writer, total)

        self._verify_archive(total)

    def testMoreThan64kFilesAppend(self):
        # Fill the archive up to 65535 members with ZIP64 disabled; one
        # more member must be refused.
        with zipfile.ZipFile(TESTFN, mode="w", allowZip64=False) as writer:
            writer.debug = 100
            limit = (1 << 16) - 1
            self._add_members(writer, range(limit))
            self._assert_member_count(writer, limit)
            self._assert_rejects_one_more(writer, limit)

        # Appending with ZIP64 still disabled must be refused too.
        with zipfile.ZipFile(TESTFN, mode="a", allowZip64=False) as appender:
            appender.debug = 100
            self._assert_member_count(appender, limit)
            self._assert_rejects_one_more(appender, limit)

        # With ZIP64 enabled the archive can grow past the limit.
        with zipfile.ZipFile(TESTFN, mode="a", allowZip64=True) as appender:
            appender.debug = 100
            self._assert_member_count(appender, limit)
            total = (1 << 16) * 3//2
            self._add_members(appender, range(limit, total))
            self._assert_member_count(appender, total)

        self._verify_archive(total)
# Run the tests in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()