cpython/Lib/test/test_zipfile64.py
# Tests of the full ZIP64 functionality of zipfile
# The support.requires call is the only reason for keeping this separate
# from test_zipfile
from test import support

# XXX(nnorwitz): disable this test by looking for extralargefile resource,
# which doesn't exist.  This test takes over 30 minutes to run in general
# and requires more disk space than most of the buildbots.
support.requires(
        'extralargefile',
        'test requires loads of disk-space bytes and a long time to run'
    )

import zipfile, unittest
import time
import tracemalloc
import sys
import unittest.mock as mock

from tempfile import TemporaryFile

from test.support import os_helper
from test.support import requires_zlib
from test.test_zipfile.test_core import Unseekable
from test.test_zipfile.test_core import struct_pack_no_dd_sig

TESTFN = os_helper.TESTFN
TESTFN2 = TESTFN + "2"

# How much time in seconds can pass before we print a 'Still working' message.
_PRINT_WORKING_MSG_INTERVAL = 60


class TestsWithSourceFile(unittest.TestCase):
    def setUp(self):
        # Create test data.
        line_gen = ("Test of zipfile line %d." % i for i in range(1000000))
        self.data = '\n'.join(line_gen).encode('ascii')

    def zipTest(self, f, compression):
        # Create the ZIP archive.
        with zipfile.ZipFile(f, "w", compression) as zipfp:

            # It will contain enough copies of self.data to reach about 6 GiB of
            # raw data to store.
            filecount = 6*1024**3 // len(self.data)

            next_time = time.monotonic() + _PRINT_WORKING_MSG_INTERVAL
            for num in range(filecount):
                zipfp.writestr("testfn%d" % num, self.data)
                # Print still working message since this test can be really slow
                if next_time <= time.monotonic():
                    next_time = time.monotonic() + _PRINT_WORKING_MSG_INTERVAL
                    print((
                        '  zipTest still writing %d of %d, be patient...' %
                        (num, filecount)), file=sys.__stdout__)
                    sys.__stdout__.flush()

        # Read the ZIP archive
        with zipfile.ZipFile(f, "r", compression) as zipfp:
            for num in range(filecount):
                self.assertEqual(zipfp.read("testfn%d" % num), self.data)
                # Print still working message since this test can be really slow
                if next_time <= time.monotonic():
                    next_time = time.monotonic() + _PRINT_WORKING_MSG_INTERVAL
                    print((
                        '  zipTest still reading %d of %d, be patient...' %
                        (num, filecount)), file=sys.__stdout__)
                    sys.__stdout__.flush()

            # Check that testzip thinks the archive is valid
            self.assertIsNone(zipfp.testzip())

    def testStored(self):
        # Try the temp file first.  If we do TESTFN2 first, then it hogs
        # gigabytes of disk space for the duration of the test.
        with TemporaryFile() as f:
            self.zipTest(f, zipfile.ZIP_STORED)
            self.assertFalse(f.closed)
        self.zipTest(TESTFN2, zipfile.ZIP_STORED)

    @requires_zlib()
    def testDeflated(self):
        # Try the temp file first.  If we do TESTFN2 first, then it hogs
        # gigabytes of disk space for the duration of the test.
        with TemporaryFile() as f:
            self.zipTest(f, zipfile.ZIP_DEFLATED)
            self.assertFalse(f.closed)
        self.zipTest(TESTFN2, zipfile.ZIP_DEFLATED)

    def tearDown(self):
        os_helper.unlink(TESTFN2)


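# The TestRepack cases below exercise ZipFile.remove() and ZipFile.repack() on
# multi-gigabyte archives.  Each test writes a large entry, removes an entry,
# repacks the archive, and then asserts via tracemalloc that peak memory stayed
# under self.allowed_memory, the intent being that repacking moves data in
# bounded chunks rather than loading whole entries into memory.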
class TestRepack(unittest.TestCase):
    def setUp(self):
        # Create test data.
        line_gen = ("Test of zipfile line %d." % i for i in range(1000000))
        self.data = '\n'.join(line_gen).encode('ascii')

        # It will contain enough copies of self.data to reach about 8 GiB.
        self.datacount = 8*1024**3 // len(self.data)

        # memory usage should not exceed 10 MiB
        self.allowed_memory = 10*1024**2

    def _write_large_file(self, fh):
        next_time = time.monotonic() + _PRINT_WORKING_MSG_INTERVAL
        for num in range(self.datacount):
            fh.write(self.data)
            # Print still working message since this test can be really slow
            if next_time <= time.monotonic():
                next_time = time.monotonic() + _PRINT_WORKING_MSG_INTERVAL
                print((
                    '  writing %d of %d, be patient...' %
                    (num, self.datacount)), file=sys.__stdout__)
                sys.__stdout__.flush()

    def test_strip_removed_large_file(self):
        """Should move the physical data of a file positioned after a large
        removed file without causing a memory issue."""
        # Try the temp file.  If we do TESTFN2, then it hogs
        # gigabytes of disk space for the duration of the test.
        with TemporaryFile() as f:
            tracemalloc.start()
            self._test_strip_removed_large_file(f)
            self.assertFalse(f.closed)
            current, peak = tracemalloc.get_traced_memory()
            tracemalloc.stop()
            self.assertLess(peak, self.allowed_memory)

    def _test_strip_removed_large_file(self, f):
        file = 'file.txt'
        file1 = 'largefile.txt'
        data = b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'

        with zipfile.ZipFile(f, 'w') as zh:
            with zh.open(file1, 'w', force_zip64=True) as fh:
                self._write_large_file(fh)
            zh.writestr(file, data)

        with zipfile.ZipFile(f, 'a') as zh:
            zh.remove(file1)
            zh.repack()
            self.assertIsNone(zh.testzip())

    def test_strip_removed_file_before_large_file(self):
        """Should move the physical data of a large file positioned after a
        removed file without causing a memory issue."""
        # Try the temp file.  If we do TESTFN2, then it hogs
        # gigabytes of disk space for the duration of the test.
        with TemporaryFile() as f:
            tracemalloc.start()
            self._test_strip_removed_file_before_large_file(f)
            self.assertFalse(f.closed)
            current, peak = tracemalloc.get_traced_memory()
            tracemalloc.stop()
            self.assertLess(peak, self.allowed_memory)

    def _test_strip_removed_file_before_large_file(self, f):
        file = 'file.txt'
        file1 = 'largefile.txt'
        data = b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'

        with zipfile.ZipFile(f, 'w') as zh:
            zh.writestr(file, data)
            with zh.open(file1, 'w', force_zip64=True) as fh:
                self._write_large_file(fh)

        with zipfile.ZipFile(f, 'a') as zh:
            zh.remove(file)
            zh.repack()
            self.assertIsNone(zh.testzip())

    def test_strip_removed_large_file_with_dd(self):
        """Should scan for the data descriptor of a removed large file without
        causing a memory issue."""
        # Try the temp file.  If we do TESTFN2, then it hogs
        # gigabytes of disk space for the duration of the test.
        with TemporaryFile() as f:
            tracemalloc.start()
            self._test_strip_removed_large_file_with_dd(f)
            self.assertFalse(f.closed)
            current, peak = tracemalloc.get_traced_memory()
            tracemalloc.stop()
            self.assertLess(peak, self.allowed_memory)

    def _test_strip_removed_large_file_with_dd(self, f):
        file = 'file.txt'
        file1 = 'largefile.txt'
        data = b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'

        # Writing through Unseekable(f) keeps zipfile from seeking back to
        # patch the local header, so sizes are recorded in a data descriptor
        # after the entry data.
        with zipfile.ZipFile(Unseekable(f), 'w') as zh:
            with zh.open(file1, 'w', force_zip64=True) as fh:
                self._write_large_file(fh)
            zh.writestr(file, data)

        with zipfile.ZipFile(f, 'a') as zh:
            zh.remove(file1)
            zh.repack()
            self.assertIsNone(zh.testzip())

    def test_strip_removed_large_file_with_dd_no_sig(self):
        """Should scan for the data descriptor (without signature) of a removed
        large file without causing a memory issue."""
        # Reduce data scale for this test, as it's especially slow...
        self.datacount = 30*1024**2 // len(self.data)
        self.allowed_memory = 200*1024

        # Try the temp file.  If we do TESTFN2, then it hogs
        # gigabytes of disk space for the duration of the test.
        with TemporaryFile() as f:
            tracemalloc.start()
            self._test_strip_removed_large_file_with_dd_no_sig(f)
            self.assertFalse(f.closed)
            current, peak = tracemalloc.get_traced_memory()
            tracemalloc.stop()
            self.assertLess(peak, self.allowed_memory)

    def _test_strip_removed_large_file_with_dd_no_sig(self, f):
        file = 'file.txt'
        file1 = 'largefile.txt'
        data = b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'

        # struct_pack_no_dd_sig makes the data descriptor get written without
        # its optional leading signature, so it has to be located by scanning.
        with mock.patch('zipfile.struct.pack', side_effect=struct_pack_no_dd_sig):
            with zipfile.ZipFile(Unseekable(f), 'w') as zh:
                with zh.open(file1, 'w', force_zip64=True) as fh:
                    self._write_large_file(fh)
                zh.writestr(file, data)

        with zipfile.ZipFile(f, 'a') as zh:
            zh.remove(file1)
            zh.repack()
            self.assertIsNone(zh.testzip())

    @requires_zlib()
    def test_strip_removed_large_file_with_dd_no_sig_by_decompression(self):
        """Should scan for the data descriptor (without signature) of a removed
        large file without causing a memory issue."""
        # Try the temp file.  If we do TESTFN2, then it hogs
        # gigabytes of disk space for the duration of the test.
        with TemporaryFile() as f:
            tracemalloc.start()
            self._test_strip_removed_large_file_with_dd_no_sig_by_decompression(
                f, zipfile.ZIP_DEFLATED)
            self.assertFalse(f.closed)
            current, peak = tracemalloc.get_traced_memory()
            tracemalloc.stop()
            self.assertLess(peak, self.allowed_memory)

    def _test_strip_removed_large_file_with_dd_no_sig_by_decompression(self, f, method):
        file = 'file.txt'
        file1 = 'largefile.txt'
        data = b'Sed ut perspiciatis unde omnis iste natus error sit voluptatem'

        with mock.patch('zipfile.struct.pack', side_effect=struct_pack_no_dd_sig):
            with zipfile.ZipFile(Unseekable(f), 'w', compression=method) as zh:
                with zh.open(file1, 'w', force_zip64=True) as fh:
                    self._write_large_file(fh)
                zh.writestr(file, data)

        with zipfile.ZipFile(f, 'a') as zh:
            zh.remove(file1)
            zh.repack()
            self.assertIsNone(zh.testzip())


class OtherTests(unittest.TestCase):
    def testMoreThan64kFiles(self):
        # This test checks that more than 64k files can be added to an archive,
        # and that the resulting archive can be read properly by ZipFile
        with zipfile.ZipFile(TESTFN, mode="w", allowZip64=True) as zipf:
            zipf.debug = 100
            numfiles = (1 << 16) * 3//2
            for i in range(numfiles):
                zipf.writestr("foo%08d" % i, "%d" % (i**3 % 57))
            self.assertEqual(len(zipf.namelist()), numfiles)

        with zipfile.ZipFile(TESTFN, mode="r") as zipf2:
            self.assertEqual(len(zipf2.namelist()), numfiles)
            for i in range(numfiles):
                content = zipf2.read("foo%08d" % i).decode('ascii')
                self.assertEqual(content, "%d" % (i**3 % 57))

    def testMoreThan64kFilesAppend(self):
        with zipfile.ZipFile(TESTFN, mode="w", allowZip64=False) as zipf:
            zipf.debug = 100
            numfiles = (1 << 16) - 1
            for i in range(numfiles):
                zipf.writestr("foo%08d" % i, "%d" % (i**3 % 57))
            self.assertEqual(len(zipf.namelist()), numfiles)
            with self.assertRaises(zipfile.LargeZipFile):
                zipf.writestr("foo%08d" % numfiles, b'')
            self.assertEqual(len(zipf.namelist()), numfiles)

        with zipfile.ZipFile(TESTFN, mode="a", allowZip64=False) as zipf:
            zipf.debug = 100
            self.assertEqual(len(zipf.namelist()), numfiles)
            with self.assertRaises(zipfile.LargeZipFile):
                zipf.writestr("foo%08d" % numfiles, b'')
            self.assertEqual(len(zipf.namelist()), numfiles)

        with zipfile.ZipFile(TESTFN, mode="a", allowZip64=True) as zipf:
            zipf.debug = 100
            self.assertEqual(len(zipf.namelist()), numfiles)
            numfiles2 = (1 << 16) * 3//2
            for i in range(numfiles, numfiles2):
                zipf.writestr("foo%08d" % i, "%d" % (i**3 % 57))
            self.assertEqual(len(zipf.namelist()), numfiles2)

        with zipfile.ZipFile(TESTFN, mode="r") as zipf2:
            self.assertEqual(len(zipf2.namelist()), numfiles2)
            for i in range(numfiles2):
                content = zipf2.read("foo%08d" % i).decode('ascii')
                self.assertEqual(content, "%d" % (i**3 % 57))

    def tearDown(self):
        os_helper.unlink(TESTFN)
        os_helper.unlink(TESTFN2)


if __name__ == "__main__":
    unittest.main()