mirror of
				https://github.com/python/cpython.git
				synced 2025-10-30 21:21:22 +00:00 
			
		
		
		
	bpo-39595: Improve zipfile.Path performance (#18406)
* Improve zipfile.Path performance on zipfiles with a large number of entries. * 📜🤖 Added by blurb_it. * Add bpo to blurb * Sync with importlib_metadata 1.5 (6fe70ca) * Update blurb. * Remove compatibility code * Add stubs module, omitted from earlier commit Co-authored-by: blurb-it[bot] <43283697+blurb-it[bot]@users.noreply.github.com>
This commit is contained in:
		
							parent
							
								
									e6be9b59a9
								
							
						
					
					
						commit
						e5bd73632e
					
				
					 7 changed files with 254 additions and 68 deletions
				
			
		|  | @ -391,6 +391,7 @@ class FastPath: | |||
| 
 | ||||
|     def __init__(self, root): | ||||
|         self.root = root | ||||
|         self.base = os.path.basename(root).lower() | ||||
| 
 | ||||
|     def joinpath(self, child): | ||||
|         return pathlib.Path(self.root, child) | ||||
|  | @ -413,12 +414,11 @@ def zip_children(self): | |||
|             ) | ||||
| 
 | ||||
|     def is_egg(self, search): | ||||
|         root_n_low = os.path.split(self.root)[1].lower() | ||||
| 
 | ||||
|         base = self.base | ||||
|         return ( | ||||
|             root_n_low == search.normalized + '.egg' | ||||
|             or root_n_low.startswith(search.prefix) | ||||
|             and root_n_low.endswith('.egg')) | ||||
|             base == search.versionless_egg_name | ||||
|             or base.startswith(search.prefix) | ||||
|             and base.endswith('.egg')) | ||||
| 
 | ||||
|     def search(self, name): | ||||
|         for child in self.children(): | ||||
|  | @ -439,6 +439,7 @@ class Prepared: | |||
|     prefix = '' | ||||
|     suffixes = '.dist-info', '.egg-info' | ||||
|     exact_matches = [''][:0] | ||||
|     versionless_egg_name = '' | ||||
| 
 | ||||
|     def __init__(self, name): | ||||
|         self.name = name | ||||
|  | @ -448,6 +449,7 @@ def __init__(self, name): | |||
|         self.prefix = self.normalized + '-' | ||||
|         self.exact_matches = [ | ||||
|             self.normalized + suffix for suffix in self.suffixes] | ||||
|         self.versionless_egg_name = self.normalized + '.egg' | ||||
| 
 | ||||
| 
 | ||||
| class MetadataPathFinder(DistributionFinder): | ||||
|  |  | |||
|  | @ -47,14 +47,28 @@ def tempdir_as_cwd(): | |||
|             yield tmp | ||||
| 
 | ||||
| 
 | ||||
| class SiteDir: | ||||
| @contextlib.contextmanager | ||||
| def install_finder(finder): | ||||
|     sys.meta_path.append(finder) | ||||
|     try: | ||||
|         yield | ||||
|     finally: | ||||
|         sys.meta_path.remove(finder) | ||||
| 
 | ||||
| 
 | ||||
| class Fixtures: | ||||
|     def setUp(self): | ||||
|         self.fixtures = ExitStack() | ||||
|         self.addCleanup(self.fixtures.close) | ||||
| 
 | ||||
| 
 | ||||
| class SiteDir(Fixtures): | ||||
|     def setUp(self): | ||||
|         super(SiteDir, self).setUp() | ||||
|         self.site_dir = self.fixtures.enter_context(tempdir()) | ||||
| 
 | ||||
| 
 | ||||
| class OnSysPath: | ||||
| class OnSysPath(Fixtures): | ||||
|     @staticmethod | ||||
|     @contextlib.contextmanager | ||||
|     def add_sys_path(dir): | ||||
|  | @ -198,3 +212,8 @@ def build_files(file_defs, prefix=pathlib.Path()): | |||
| def DALS(str): | ||||
|     "Dedent and left-strip" | ||||
|     return textwrap.dedent(str).lstrip() | ||||
| 
 | ||||
| 
 | ||||
| class NullFinder: | ||||
|     def find_module(self, name): | ||||
|         pass | ||||
|  |  | |||
							
								
								
									
										10
									
								
								Lib/test/test_importlib/stubs.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										10
									
								
								Lib/test/test_importlib/stubs.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,10 @@ | |||
| import unittest | ||||
| 
 | ||||
| 
 | ||||
| class fake_filesystem_unittest: | ||||
|     """ | ||||
|     Stubbed version of the pyfakefs module | ||||
|     """ | ||||
|     class TestCase(unittest.TestCase): | ||||
|         def setUpPyfakefs(self): | ||||
|             self.skipTest("pyfakefs not available") | ||||
|  | @ -7,6 +7,11 @@ | |||
| import unittest | ||||
| import importlib.metadata | ||||
| 
 | ||||
| try: | ||||
|     import pyfakefs.fake_filesystem_unittest as ffs | ||||
| except ImportError: | ||||
|     from .stubs import fake_filesystem_unittest as ffs | ||||
| 
 | ||||
| from . import fixtures | ||||
| from importlib.metadata import ( | ||||
|     Distribution, EntryPoint, | ||||
|  | @ -185,6 +190,33 @@ def test_egg(self): | |||
|                 version('foo') | ||||
| 
 | ||||
| 
 | ||||
| class MissingSysPath(fixtures.OnSysPath, unittest.TestCase): | ||||
|     site_dir = '/does-not-exist' | ||||
| 
 | ||||
|     def test_discovery(self): | ||||
|         """ | ||||
|         Discovering distributions should succeed even if | ||||
|         there is an invalid path on sys.path. | ||||
|         """ | ||||
|         importlib.metadata.distributions() | ||||
| 
 | ||||
| 
 | ||||
| class InaccessibleSysPath(fixtures.OnSysPath, ffs.TestCase): | ||||
|     site_dir = '/access-denied' | ||||
| 
 | ||||
|     def setUp(self): | ||||
|         super(InaccessibleSysPath, self).setUp() | ||||
|         self.setUpPyfakefs() | ||||
|         self.fs.create_dir(self.site_dir, perm_bits=000) | ||||
| 
 | ||||
|     def test_discovery(self): | ||||
|         """ | ||||
|         Discovering distributions should succeed even if | ||||
|         there is an invalid path on sys.path. | ||||
|         """ | ||||
|         list(importlib.metadata.distributions()) | ||||
| 
 | ||||
| 
 | ||||
| class TestEntryPoints(unittest.TestCase): | ||||
|     def __init__(self, *args): | ||||
|         super(TestEntryPoints, self).__init__(*args) | ||||
|  |  | |||
|  | @ -2724,16 +2724,71 @@ def test_extract_command(self): | |||
|                                 self.assertEqual(f.read(), zf.read(zi)) | ||||
| 
 | ||||
| 
 | ||||
| class TestExecutablePrependedZip(unittest.TestCase): | ||||
|     """Test our ability to open zip files with an executable prepended.""" | ||||
| 
 | ||||
|     def setUp(self): | ||||
|         self.exe_zip = findfile('exe_with_zip', subdir='ziptestdata') | ||||
|         self.exe_zip64 = findfile('exe_with_z64', subdir='ziptestdata') | ||||
| 
 | ||||
|     def _test_zip_works(self, name): | ||||
|         # bpo28494 sanity check: ensure is_zipfile works on these. | ||||
|         self.assertTrue(zipfile.is_zipfile(name), | ||||
|                         f'is_zipfile failed on {name}') | ||||
|         # Ensure we can operate on these via ZipFile. | ||||
|         with zipfile.ZipFile(name) as zipfp: | ||||
|             for n in zipfp.namelist(): | ||||
|                 data = zipfp.read(n) | ||||
|                 self.assertIn(b'FAVORITE_NUMBER', data) | ||||
| 
 | ||||
|     def test_read_zip_with_exe_prepended(self): | ||||
|         self._test_zip_works(self.exe_zip) | ||||
| 
 | ||||
|     def test_read_zip64_with_exe_prepended(self): | ||||
|         self._test_zip_works(self.exe_zip64) | ||||
| 
 | ||||
|     @unittest.skipUnless(sys.executable, 'sys.executable required.') | ||||
|     @unittest.skipUnless(os.access('/bin/bash', os.X_OK), | ||||
|                          'Test relies on #!/bin/bash working.') | ||||
|     def test_execute_zip2(self): | ||||
|         output = subprocess.check_output([self.exe_zip, sys.executable]) | ||||
|         self.assertIn(b'number in executable: 5', output) | ||||
| 
 | ||||
|     @unittest.skipUnless(sys.executable, 'sys.executable required.') | ||||
|     @unittest.skipUnless(os.access('/bin/bash', os.X_OK), | ||||
|                          'Test relies on #!/bin/bash working.') | ||||
|     def test_execute_zip64(self): | ||||
|         output = subprocess.check_output([self.exe_zip64, sys.executable]) | ||||
|         self.assertIn(b'number in executable: 5', output) | ||||
| 
 | ||||
| 
 | ||||
| # Poor man's technique to consume a (smallish) iterable. | ||||
| consume = tuple | ||||
| 
 | ||||
| 
 | ||||
| # from jaraco.itertools 5.0 | ||||
| class jaraco: | ||||
|     class itertools: | ||||
|         class Counter: | ||||
|             def __init__(self, i): | ||||
|                 self.count = 0 | ||||
|                 self._orig_iter = iter(i) | ||||
| 
 | ||||
|             def __iter__(self): | ||||
|                 return self | ||||
| 
 | ||||
|             def __next__(self): | ||||
|                 result = next(self._orig_iter) | ||||
|                 self.count += 1 | ||||
|                 return result | ||||
| 
 | ||||
| 
 | ||||
| def add_dirs(zf): | ||||
|     """ | ||||
|     Given a writable zip file zf, inject directory entries for | ||||
|     any directories implied by the presence of children. | ||||
|     """ | ||||
|     for name in zipfile.Path._implied_dirs(zf.namelist()): | ||||
|     for name in zipfile.CompleteDirs._implied_dirs(zf.namelist()): | ||||
|         zf.writestr(name, b"") | ||||
|     return zf | ||||
| 
 | ||||
|  | @ -2774,44 +2829,6 @@ def build_alpharep_fixture(): | |||
|     return zf | ||||
| 
 | ||||
| 
 | ||||
| class TestExecutablePrependedZip(unittest.TestCase): | ||||
|     """Test our ability to open zip files with an executable prepended.""" | ||||
| 
 | ||||
|     def setUp(self): | ||||
|         self.exe_zip = findfile('exe_with_zip', subdir='ziptestdata') | ||||
|         self.exe_zip64 = findfile('exe_with_z64', subdir='ziptestdata') | ||||
| 
 | ||||
|     def _test_zip_works(self, name): | ||||
|         # bpo-28494 sanity check: ensure is_zipfile works on these. | ||||
|         self.assertTrue(zipfile.is_zipfile(name), | ||||
|                         f'is_zipfile failed on {name}') | ||||
|         # Ensure we can operate on these via ZipFile. | ||||
|         with zipfile.ZipFile(name) as zipfp: | ||||
|             for n in zipfp.namelist(): | ||||
|                 data = zipfp.read(n) | ||||
|                 self.assertIn(b'FAVORITE_NUMBER', data) | ||||
| 
 | ||||
|     def test_read_zip_with_exe_prepended(self): | ||||
|         self._test_zip_works(self.exe_zip) | ||||
| 
 | ||||
|     def test_read_zip64_with_exe_prepended(self): | ||||
|         self._test_zip_works(self.exe_zip64) | ||||
| 
 | ||||
|     @unittest.skipUnless(sys.executable, 'sys.executable required.') | ||||
|     @unittest.skipUnless(os.access('/bin/bash', os.X_OK), | ||||
|                          'Test relies on #!/bin/bash working.') | ||||
|     def test_execute_zip2(self): | ||||
|         output = subprocess.check_output([self.exe_zip, sys.executable]) | ||||
|         self.assertIn(b'number in executable: 5', output) | ||||
| 
 | ||||
|     @unittest.skipUnless(sys.executable, 'sys.executable required.') | ||||
|     @unittest.skipUnless(os.access('/bin/bash', os.X_OK), | ||||
|                          'Test relies on #!/bin/bash working.') | ||||
|     def test_execute_zip64(self): | ||||
|         output = subprocess.check_output([self.exe_zip64, sys.executable]) | ||||
|         self.assertIn(b'number in executable: 5', output) | ||||
| 
 | ||||
| 
 | ||||
| class TestPath(unittest.TestCase): | ||||
|     def setUp(self): | ||||
|         self.fixtures = contextlib.ExitStack() | ||||
|  | @ -2849,6 +2866,14 @@ def test_iterdir_and_types(self): | |||
|             i, = h.iterdir() | ||||
|             assert i.is_file() | ||||
| 
 | ||||
|     def test_subdir_is_dir(self): | ||||
|         for alpharep in self.zipfile_alpharep(): | ||||
|             root = zipfile.Path(alpharep) | ||||
|             assert (root / 'b').is_dir() | ||||
|             assert (root / 'b/').is_dir() | ||||
|             assert (root / 'g').is_dir() | ||||
|             assert (root / 'g/').is_dir() | ||||
| 
 | ||||
|     def test_open(self): | ||||
|         for alpharep in self.zipfile_alpharep(): | ||||
|             root = zipfile.Path(alpharep) | ||||
|  | @ -2910,6 +2935,45 @@ def test_missing_dir_parent(self): | |||
|             root = zipfile.Path(alpharep) | ||||
|             assert (root / 'missing dir/').parent.at == '' | ||||
| 
 | ||||
|     def test_mutability(self): | ||||
|         """ | ||||
|         If the underlying zipfile is changed, the Path object should | ||||
|         reflect that change. | ||||
|         """ | ||||
|         for alpharep in self.zipfile_alpharep(): | ||||
|             root = zipfile.Path(alpharep) | ||||
|             a, b, g = root.iterdir() | ||||
|             alpharep.writestr('foo.txt', 'foo') | ||||
|             alpharep.writestr('bar/baz.txt', 'baz') | ||||
|             assert any( | ||||
|                 child.name == 'foo.txt' | ||||
|                 for child in root.iterdir()) | ||||
|             assert (root / 'foo.txt').read_text() == 'foo' | ||||
|             baz, = (root / 'bar').iterdir() | ||||
|             assert baz.read_text() == 'baz' | ||||
| 
 | ||||
|     HUGE_ZIPFILE_NUM_ENTRIES = 2 ** 13 | ||||
| 
 | ||||
|     def huge_zipfile(self): | ||||
|         """Create a read-only zipfile with a huge number of entries entries.""" | ||||
|         strm = io.BytesIO() | ||||
|         zf = zipfile.ZipFile(strm, "w") | ||||
|         for entry in map(str, range(self.HUGE_ZIPFILE_NUM_ENTRIES)): | ||||
|             zf.writestr(entry, entry) | ||||
|         zf.mode = 'r' | ||||
|         return zf | ||||
| 
 | ||||
|     def test_joinpath_constant_time(self): | ||||
|         """ | ||||
|         Ensure joinpath on items in zipfile is linear time. | ||||
|         """ | ||||
|         root = zipfile.Path(self.huge_zipfile()) | ||||
|         entries = jaraco.itertools.Counter(root.iterdir()) | ||||
|         for entry in entries: | ||||
|             entry.joinpath('suffix') | ||||
|         # Check the file iterated all items | ||||
|         assert entries.count == self.HUGE_ZIPFILE_NUM_ENTRIES | ||||
| 
 | ||||
| 
 | ||||
| if __name__ == "__main__": | ||||
|     unittest.main() | ||||
|  |  | |||
							
								
								
									
										102
									
								
								Lib/zipfile.py
									
										
									
									
									
								
							
							
						
						
									
										102
									
								
								Lib/zipfile.py
									
										
									
									
									
								
							|  | @ -16,6 +16,8 @@ | |||
| import sys | ||||
| import threading | ||||
| import time | ||||
| import contextlib | ||||
| from collections import OrderedDict | ||||
| 
 | ||||
| try: | ||||
|     import zlib # We may need its compression method | ||||
|  | @ -2159,6 +2161,79 @@ def _ancestry(path): | |||
|         path, tail = posixpath.split(path) | ||||
| 
 | ||||
| 
 | ||||
| class CompleteDirs(ZipFile): | ||||
|     """ | ||||
|     A ZipFile subclass that ensures that implied directories | ||||
|     are always included in the namelist. | ||||
|     """ | ||||
| 
 | ||||
|     @staticmethod | ||||
|     def _implied_dirs(names): | ||||
|         parents = itertools.chain.from_iterable(map(_parents, names)) | ||||
|         # Deduplicate entries in original order | ||||
|         implied_dirs = OrderedDict.fromkeys( | ||||
|             p + posixpath.sep for p in parents | ||||
|             # Cast names to a set for O(1) lookups | ||||
|             if p + posixpath.sep not in set(names) | ||||
|         ) | ||||
|         return implied_dirs | ||||
| 
 | ||||
|     def namelist(self): | ||||
|         names = super(CompleteDirs, self).namelist() | ||||
|         return names + list(self._implied_dirs(names)) | ||||
| 
 | ||||
|     def _name_set(self): | ||||
|         return set(self.namelist()) | ||||
| 
 | ||||
|     def resolve_dir(self, name): | ||||
|         """ | ||||
|         If the name represents a directory, return that name | ||||
|         as a directory (with the trailing slash). | ||||
|         """ | ||||
|         names = self._name_set() | ||||
|         dirname = name + '/' | ||||
|         dir_match = name not in names and dirname in names | ||||
|         return dirname if dir_match else name | ||||
| 
 | ||||
|     @classmethod | ||||
|     def make(cls, source): | ||||
|         """ | ||||
|         Given a source (filename or zipfile), return an | ||||
|         appropriate CompleteDirs subclass. | ||||
|         """ | ||||
|         if isinstance(source, CompleteDirs): | ||||
|             return source | ||||
| 
 | ||||
|         if not isinstance(source, ZipFile): | ||||
|             return cls(source) | ||||
| 
 | ||||
|         # Only allow for FastPath when supplied zipfile is read-only | ||||
|         if 'r' not in source.mode: | ||||
|             cls = CompleteDirs | ||||
| 
 | ||||
|         res = cls.__new__(cls) | ||||
|         vars(res).update(vars(source)) | ||||
|         return res | ||||
| 
 | ||||
| 
 | ||||
| class FastLookup(CompleteDirs): | ||||
|     """ | ||||
|     ZipFile subclass to ensure implicit | ||||
|     dirs exist and are resolved rapidly. | ||||
|     """ | ||||
|     def namelist(self): | ||||
|         with contextlib.suppress(AttributeError): | ||||
|             return self.__names | ||||
|         self.__names = super(FastLookup, self).namelist() | ||||
|         return self.__names | ||||
| 
 | ||||
|     def _name_set(self): | ||||
|         with contextlib.suppress(AttributeError): | ||||
|             return self.__lookup | ||||
|         self.__lookup = super(FastLookup, self)._name_set() | ||||
|         return self.__lookup | ||||
| 
 | ||||
| 
 | ||||
| class Path: | ||||
|     """ | ||||
|     A pathlib-compatible interface for zip files. | ||||
|  | @ -2227,7 +2302,7 @@ class Path: | |||
|     __repr = "{self.__class__.__name__}({self.root.filename!r}, {self.at!r})" | ||||
| 
 | ||||
|     def __init__(self, root, at=""): | ||||
|         self.root = root if isinstance(root, ZipFile) else ZipFile(root) | ||||
|         self.root = FastLookup.make(root) | ||||
|         self.at = at | ||||
| 
 | ||||
|     @property | ||||
|  | @ -2259,12 +2334,12 @@ def is_file(self): | |||
|         return not self.is_dir() | ||||
| 
 | ||||
|     def exists(self): | ||||
|         return self.at in self._names() | ||||
|         return self.at in self.root._name_set() | ||||
| 
 | ||||
|     def iterdir(self): | ||||
|         if not self.is_dir(): | ||||
|             raise ValueError("Can't listdir a file") | ||||
|         subs = map(self._next, self._names()) | ||||
|         subs = map(self._next, self.root.namelist()) | ||||
|         return filter(self._is_child, subs) | ||||
| 
 | ||||
|     def __str__(self): | ||||
|  | @ -2275,25 +2350,10 @@ def __repr__(self): | |||
| 
 | ||||
|     def joinpath(self, add): | ||||
|         next = posixpath.join(self.at, add) | ||||
|         next_dir = posixpath.join(self.at, add, "") | ||||
|         names = self._names() | ||||
|         return self._next(next_dir if next not in names and next_dir in names else next) | ||||
|         return self._next(self.root.resolve_dir(next)) | ||||
| 
 | ||||
|     __truediv__ = joinpath | ||||
| 
 | ||||
|     @staticmethod | ||||
|     def _implied_dirs(names): | ||||
|         return _unique_everseen( | ||||
|             parent + "/" | ||||
|             for name in names | ||||
|             for parent in _parents(name) | ||||
|             if parent + "/" not in names | ||||
|         ) | ||||
| 
 | ||||
|     @classmethod | ||||
|     def _add_implied_dirs(cls, names): | ||||
|         return names + list(cls._implied_dirs(names)) | ||||
| 
 | ||||
|     @property | ||||
|     def parent(self): | ||||
|         parent_at = posixpath.dirname(self.at.rstrip('/')) | ||||
|  | @ -2301,9 +2361,6 @@ def parent(self): | |||
|             parent_at += '/' | ||||
|         return self._next(parent_at) | ||||
| 
 | ||||
|     def _names(self): | ||||
|         return self._add_implied_dirs(self.root.namelist()) | ||||
| 
 | ||||
| 
 | ||||
| def main(args=None): | ||||
|     import argparse | ||||
|  | @ -2365,5 +2422,6 @@ def addToZip(zf, path, zippath): | |||
|                     zippath = '' | ||||
|                 addToZip(zf, path, zippath) | ||||
| 
 | ||||
| 
 | ||||
| if __name__ == "__main__": | ||||
|     main() | ||||
|  |  | |||
|  | @ -0,0 +1 @@ | |||
| Improved performance of zipfile.Path for files with a large number of entries. Also improved performance and fixed minor issue as published with `importlib_metadata 1.5 <https://importlib-metadata.readthedocs.io/en/latest/changelog%20(links).html#v1-5-0>`_. | ||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Jason R. Coombs
						Jason R. Coombs