mirror of
				https://github.com/python/cpython.git
				synced 2025-10-31 13:41:24 +00:00 
			
		
		
		
	GH-73435: Implement recursive wildcards in pathlib.PurePath.match() (#101398)
				
					
				
			`PurePath.match()` now handles the `**` wildcard as in `Path.glob()`, i.e. it matches any number of path segments. We now compile a `re.Pattern` object for the entire pattern. This is made more difficult by `fnmatch` not treating directory separators as special when evaluating wildcards (`*`, `?`, etc), and so we arrange the path parts onto separate *lines* in a string, and ensure we don't set `re.DOTALL`. Co-authored-by: Hugo van Kemenade <hugovk@users.noreply.github.com> Co-authored-by: Alex Waygood <Alex.Waygood@Gmail.com>
This commit is contained in:
		
							parent
							
								
									4c770617c0
								
							
						
					
					
						commit
						49f90ba1ea
					
				
					 5 changed files with 123 additions and 15 deletions
				
			
		|  | @ -569,6 +569,13 @@ Pure paths provide the following methods and properties: | ||||||
|       >>> PurePath('a/b.py').match('/*.py') |       >>> PurePath('a/b.py').match('/*.py') | ||||||
|       False |       False | ||||||
| 
 | 
 | ||||||
|  |    The *pattern* may be another path object; this speeds up matching the same | ||||||
|  |    pattern against multiple files:: | ||||||
|  | 
 | ||||||
|  |       >>> pattern = PurePath('*.py') | ||||||
|  |       >>> PurePath('a/b.py').match(pattern) | ||||||
|  |       True | ||||||
|  | 
 | ||||||
|    As with other methods, case-sensitivity follows platform defaults:: |    As with other methods, case-sensitivity follows platform defaults:: | ||||||
| 
 | 
 | ||||||
|       >>> PurePosixPath('b.py').match('*.PY') |       >>> PurePosixPath('b.py').match('*.PY') | ||||||
|  | @ -581,6 +588,10 @@ Pure paths provide the following methods and properties: | ||||||
|    .. versionadded:: 3.12 |    .. versionadded:: 3.12 | ||||||
|       The *case_sensitive* argument. |       The *case_sensitive* argument. | ||||||
| 
 | 
 | ||||||
|  |    .. versionchanged:: 3.13 | ||||||
|  |       Support for the recursive wildcard "``**``" was added. In previous | ||||||
|  |       versions, it acted like the non-recursive wildcard "``*``". | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| .. method:: PurePath.relative_to(other, walk_up=False) | .. method:: PurePath.relative_to(other, walk_up=False) | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -90,6 +90,9 @@ Improved Modules | ||||||
| pathlib | pathlib | ||||||
| ------- | ------- | ||||||
| 
 | 
 | ||||||
|  | * Add support for recursive wildcards in :meth:`pathlib.PurePath.match`. | ||||||
|  |   (Contributed by Barney Gale in :gh:`73435`.) | ||||||
|  | 
 | ||||||
| * Add *follow_symlinks* keyword-only argument to :meth:`pathlib.Path.glob` and | * Add *follow_symlinks* keyword-only argument to :meth:`pathlib.Path.glob` and | ||||||
|   :meth:`~pathlib.Path.rglob`. |   :meth:`~pathlib.Path.rglob`. | ||||||
|   (Contributed by Barney Gale in :gh:`77609`.) |   (Contributed by Barney Gale in :gh:`77609`.) | ||||||
|  |  | ||||||
|  | @ -54,6 +54,7 @@ def _ignore_error(exception): | ||||||
|             getattr(exception, 'winerror', None) in _IGNORED_WINERRORS) |             getattr(exception, 'winerror', None) in _IGNORED_WINERRORS) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @functools.cache | ||||||
| def _is_case_sensitive(flavour): | def _is_case_sensitive(flavour): | ||||||
|     return flavour.normcase('Aa') == 'Aa' |     return flavour.normcase('Aa') == 'Aa' | ||||||
| 
 | 
 | ||||||
|  | @ -61,6 +62,22 @@ def _is_case_sensitive(flavour): | ||||||
| # Globbing helpers | # Globbing helpers | ||||||
| # | # | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
|  | # fnmatch.translate() returns a regular expression that includes a prefix and | ||||||
|  | # a suffix, which enable matching newlines and ensure the end of the string is | ||||||
|  | # matched, respectively. These features are undesirable for our implementation | ||||||
|  | # of PurePath.match(), which represents path separators as newlines and joins | ||||||
|  | # pattern segments together. As a workaround, we define a slice object that | ||||||
|  | # can remove the prefix and suffix from any translate() result. See the | ||||||
|  | # _compile_pattern_lines() function for more details. | ||||||
|  | _FNMATCH_PREFIX, _FNMATCH_SUFFIX = fnmatch.translate('_').split('_') | ||||||
|  | _FNMATCH_SLICE = slice(len(_FNMATCH_PREFIX), -len(_FNMATCH_SUFFIX)) | ||||||
|  | _SWAP_SEP_AND_NEWLINE = { | ||||||
|  |     '/': str.maketrans({'/': '\n', '\n': '/'}), | ||||||
|  |     '\\': str.maketrans({'\\': '\n', '\n': '\\'}), | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| @functools.lru_cache() | @functools.lru_cache() | ||||||
| def _make_selector(pattern_parts, flavour, case_sensitive): | def _make_selector(pattern_parts, flavour, case_sensitive): | ||||||
|     pat = pattern_parts[0] |     pat = pattern_parts[0] | ||||||
|  | @ -92,6 +109,51 @@ def _compile_pattern(pat, case_sensitive): | ||||||
|     return re.compile(fnmatch.translate(pat), flags).match |     return re.compile(fnmatch.translate(pat), flags).match | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @functools.lru_cache() | ||||||
|  | def _compile_pattern_lines(pattern_lines, case_sensitive): | ||||||
|  |     """Compile the given pattern lines to an `re.Pattern` object. | ||||||
|  | 
 | ||||||
|  |     The *pattern_lines* argument is a glob-style pattern (e.g. '**/*.py') with | ||||||
|  |     its path separators and newlines swapped (e.g. '**\n*.py'). By using | ||||||
|  |     newlines to separate path components, and not setting `re.DOTALL`, we | ||||||
|  |     ensure that the `*` wildcard cannot match path separators. | ||||||
|  | 
 | ||||||
|  |     The returned `re.Pattern` object may have its `match()` method called to | ||||||
|  |     match a complete pattern, or `search()` to match from the right. The | ||||||
|  |     argument supplied to these methods must also have its path separators and | ||||||
|  |     newlines swapped. | ||||||
|  |     """ | ||||||
|  | 
 | ||||||
|  |     # Match the start of the path, or just after a path separator | ||||||
|  |     parts = ['^'] | ||||||
|  |     for part in pattern_lines.splitlines(keepends=True): | ||||||
|  |         if part == '**\n': | ||||||
|  |             # '**/' component: we use '[\s\S]' rather than '.' so that path | ||||||
|  |             # separators (i.e. newlines) are matched. The trailing '^' ensures | ||||||
|  |             # we terminate after a path separator (i.e. on a new line). | ||||||
|  |             part = r'[\s\S]*^' | ||||||
|  |         elif part == '**': | ||||||
|  |             # '**' component. | ||||||
|  |             part = r'[\s\S]*' | ||||||
|  |         elif '**' in part: | ||||||
|  |             raise ValueError("Invalid pattern: '**' can only be an entire path component") | ||||||
|  |         else: | ||||||
|  |             # Any other component: pass to fnmatch.translate(). We slice off | ||||||
|  |             # the common prefix and suffix added by translate() to ensure that | ||||||
|  |             # re.DOTALL is not set, and the end of the string is not matched, | ||||||
|  |             # respectively. With DOTALL not set, '*' wildcards will not match | ||||||
|  |             # path separators, because the '.' characters in the pattern will | ||||||
|  |             # not match newlines. | ||||||
|  |             part = fnmatch.translate(part)[_FNMATCH_SLICE] | ||||||
|  |         parts.append(part) | ||||||
|  |     # Match the end of the path, always. | ||||||
|  |     parts.append(r'\Z') | ||||||
|  |     flags = re.MULTILINE | ||||||
|  |     if not case_sensitive: | ||||||
|  |         flags |= re.IGNORECASE | ||||||
|  |     return re.compile(''.join(parts), flags=flags) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| class _Selector: | class _Selector: | ||||||
|     """A selector matches a specific glob pattern part against the children |     """A selector matches a specific glob pattern part against the children | ||||||
|     of a given path.""" |     of a given path.""" | ||||||
|  | @ -276,6 +338,10 @@ class PurePath: | ||||||
|         # to implement comparison methods like `__lt__()`. |         # to implement comparison methods like `__lt__()`. | ||||||
|         '_parts_normcase_cached', |         '_parts_normcase_cached', | ||||||
| 
 | 
 | ||||||
|  |         # The `_lines_cached` slot stores the string path with path separators | ||||||
|  |         # and newlines swapped. This is used to implement `match()`. | ||||||
|  |         '_lines_cached', | ||||||
|  | 
 | ||||||
|         # The `_hash` slot stores the hash of the case-normalized string |         # The `_hash` slot stores the hash of the case-normalized string | ||||||
|         # path. It's set when `__hash__()` is called for the first time. |         # path. It's set when `__hash__()` is called for the first time. | ||||||
|         '_hash', |         '_hash', | ||||||
|  | @ -441,6 +507,16 @@ def _parts_normcase(self): | ||||||
|             self._parts_normcase_cached = self._str_normcase.split(self._flavour.sep) |             self._parts_normcase_cached = self._str_normcase.split(self._flavour.sep) | ||||||
|             return self._parts_normcase_cached |             return self._parts_normcase_cached | ||||||
| 
 | 
 | ||||||
|  |     @property | ||||||
|  |     def _lines(self): | ||||||
|  |         # Path with separators and newlines swapped, for pattern matching. | ||||||
|  |         try: | ||||||
|  |             return self._lines_cached | ||||||
|  |         except AttributeError: | ||||||
|  |             trans = _SWAP_SEP_AND_NEWLINE[self._flavour.sep] | ||||||
|  |             self._lines_cached = str(self).translate(trans) | ||||||
|  |             return self._lines_cached | ||||||
|  | 
 | ||||||
|     def __eq__(self, other): |     def __eq__(self, other): | ||||||
|         if not isinstance(other, PurePath): |         if not isinstance(other, PurePath): | ||||||
|             return NotImplemented |             return NotImplemented | ||||||
|  | @ -697,23 +773,18 @@ def match(self, path_pattern, *, case_sensitive=None): | ||||||
|         """ |         """ | ||||||
|         Return True if this path matches the given pattern. |         Return True if this path matches the given pattern. | ||||||
|         """ |         """ | ||||||
|  |         if not isinstance(path_pattern, PurePath): | ||||||
|  |             path_pattern = self.with_segments(path_pattern) | ||||||
|         if case_sensitive is None: |         if case_sensitive is None: | ||||||
|             case_sensitive = _is_case_sensitive(self._flavour) |             case_sensitive = _is_case_sensitive(self._flavour) | ||||||
|         pat = self.with_segments(path_pattern) |         pattern = _compile_pattern_lines(path_pattern._lines, case_sensitive) | ||||||
|         if not pat.parts: |         if path_pattern.drive or path_pattern.root: | ||||||
|  |             return pattern.match(self._lines) is not None | ||||||
|  |         elif path_pattern._tail: | ||||||
|  |             return pattern.search(self._lines) is not None | ||||||
|  |         else: | ||||||
|             raise ValueError("empty pattern") |             raise ValueError("empty pattern") | ||||||
|         pat_parts = pat.parts | 
 | ||||||
|         parts = self.parts |  | ||||||
|         if pat.drive or pat.root: |  | ||||||
|             if len(pat_parts) != len(parts): |  | ||||||
|                 return False |  | ||||||
|         elif len(pat_parts) > len(parts): |  | ||||||
|             return False |  | ||||||
|         for part, pat in zip(reversed(parts), reversed(pat_parts)): |  | ||||||
|             match = _compile_pattern(pat, case_sensitive) |  | ||||||
|             if not match(part): |  | ||||||
|                 return False |  | ||||||
|         return True |  | ||||||
| 
 | 
 | ||||||
| # Subclassing os.PathLike makes isinstance() checks slower, | # Subclassing os.PathLike makes isinstance() checks slower, | ||||||
| # which in turn makes Path construction slower. Register instead! | # which in turn makes Path construction slower. Register instead! | ||||||
|  |  | ||||||
|  | @ -310,8 +310,30 @@ def test_match_common(self): | ||||||
|         self.assertFalse(P('/ab.py').match('/a/*.py')) |         self.assertFalse(P('/ab.py').match('/a/*.py')) | ||||||
|         self.assertFalse(P('/a/b/c.py').match('/a/*.py')) |         self.assertFalse(P('/a/b/c.py').match('/a/*.py')) | ||||||
|         # Multi-part glob-style pattern. |         # Multi-part glob-style pattern. | ||||||
|         self.assertFalse(P('/a/b/c.py').match('/**/*.py')) |         self.assertTrue(P('a').match('**')) | ||||||
|  |         self.assertTrue(P('c.py').match('**')) | ||||||
|  |         self.assertTrue(P('a/b/c.py').match('**')) | ||||||
|  |         self.assertTrue(P('/a/b/c.py').match('**')) | ||||||
|  |         self.assertTrue(P('/a/b/c.py').match('/**')) | ||||||
|  |         self.assertTrue(P('/a/b/c.py').match('**/')) | ||||||
|  |         self.assertTrue(P('/a/b/c.py').match('/a/**')) | ||||||
|  |         self.assertTrue(P('/a/b/c.py').match('**/*.py')) | ||||||
|  |         self.assertTrue(P('/a/b/c.py').match('/**/*.py')) | ||||||
|         self.assertTrue(P('/a/b/c.py').match('/a/**/*.py')) |         self.assertTrue(P('/a/b/c.py').match('/a/**/*.py')) | ||||||
|  |         self.assertTrue(P('/a/b/c.py').match('/a/b/**/*.py')) | ||||||
|  |         self.assertTrue(P('/a/b/c.py').match('/**/**/**/**/*.py')) | ||||||
|  |         self.assertFalse(P('c.py').match('**/a.py')) | ||||||
|  |         self.assertFalse(P('c.py').match('c/**')) | ||||||
|  |         self.assertFalse(P('a/b/c.py').match('**/a')) | ||||||
|  |         self.assertFalse(P('a/b/c.py').match('**/a/b')) | ||||||
|  |         self.assertFalse(P('a/b/c.py').match('**/a/b/c')) | ||||||
|  |         self.assertFalse(P('a/b/c.py').match('**/a/b/c.')) | ||||||
|  |         self.assertFalse(P('a/b/c.py').match('**/a/b/c./**')) | ||||||
|  |         self.assertFalse(P('a/b/c.py').match('**/a/b/c./**')) | ||||||
|  |         self.assertFalse(P('a/b/c.py').match('/a/b/c.py/**')) | ||||||
|  |         self.assertFalse(P('a/b/c.py').match('/**/a/b/c.py')) | ||||||
|  |         self.assertRaises(ValueError, P('a').match, '**a/b/c') | ||||||
|  |         self.assertRaises(ValueError, P('a').match, 'a/b/c**') | ||||||
|         # Case-sensitive flag |         # Case-sensitive flag | ||||||
|         self.assertFalse(P('A.py').match('a.PY', case_sensitive=True)) |         self.assertFalse(P('A.py').match('a.PY', case_sensitive=True)) | ||||||
|         self.assertTrue(P('A.py').match('a.PY', case_sensitive=False)) |         self.assertTrue(P('A.py').match('a.PY', case_sensitive=False)) | ||||||
|  |  | ||||||
|  | @ -0,0 +1 @@ | ||||||
|  | Add support for recursive wildcards in :meth:`pathlib.PurePath.match`. | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Barney Gale
						Barney Gale