GH-73435: Implement recursive wildcards in pathlib.PurePath.match() (#101398)

`PurePath.match()` now handles the `**` wildcard as in `Path.glob()`, i.e. it matches any number of path segments.

We now compile a `re.Pattern` object for the entire pattern. This is made more difficult by `fnmatch` not treating directory separators as special when evaluating wildcards (`*`, `?`, etc.), and so we arrange the path parts onto separate *lines* in a string, and ensure we don't set `re.DOTALL`.

Co-authored-by: Hugo van Kemenade <hugovk@users.noreply.github.com>
Co-authored-by: Alex Waygood <Alex.Waygood@Gmail.com>
2025-10-31 13:41:24 +00:00 · 2023-05-30 21:18:09 +01:00 · 2023-05-30 21:18:09 +01:00 · 49f90ba1ea
commit 49f90ba1ea
parent 4c770617c0
5 changed files with 123 additions and 15 deletions
--- a/Doc/library/pathlib.rst
+++ b/Doc/library/pathlib.rst
@ -569,6 +569,13 @@ Pure paths provide the following methods and properties:
      >>> PurePath('a/b.py').match('/*.py')
      False
   The *pattern* may be another path object; this speeds up matching the same
   pattern against multiple files::
      >>> pattern = PurePath('*.py')
      >>> PurePath('a/b.py').match(pattern)
      True
   As with other methods, case-sensitivity follows platform defaults::
      >>> PurePosixPath('b.py').match('*.PY')
@ -581,6 +588,10 @@ Pure paths provide the following methods and properties:
   .. versionadded:: 3.12
      The *case_sensitive* argument.
   .. versionchanged:: 3.13
      Support for the recursive wildcard "``**``" was added. In previous
      versions, it acted like the non-recursive wildcard "``*``".
 .. method:: PurePath.relative_to(other, walk_up=False)
--- a/Doc/whatsnew/3.13.rst
+++ b/Doc/whatsnew/3.13.rst
@ -90,6 +90,9 @@ Improved Modules
 pathlib
 -------
 * Add support for recursive wildcards in :meth:`pathlib.PurePath.match`.
  (Contributed by Barney Gale in :gh:`73435`.)
 * Add *follow_symlinks* keyword-only argument to :meth:`pathlib.Path.glob` and
  :meth:`~pathlib.Path.rglob`.
  (Contributed by Barney Gale in :gh:`77609`.)
--- a/Lib/pathlib.py
+++ b/Lib/pathlib.py
@ -54,6 +54,7 @@ def _ignore_error(exception):
            getattr(exception, 'winerror', None) in _IGNORED_WINERRORS)
@functools.cache
def _is_case_sensitive(flavour):
    """Return True if *flavour*'s `normcase()` leaves letter case untouched.

    A mixed-case probe string is passed through `flavour.normcase()`; the
    flavour is considered case-sensitive when the probe comes back unchanged.
    Results are memoised per flavour by `functools.cache`.
    """
    probe = 'Aa'
    return probe == flavour.normcase(probe)
@ -61,6 +62,22 @@ def _is_case_sensitive(flavour):
 # Globbing helpers
 #
 # fnmatch.translate() returns a regular expression that includes a prefix and
 # a suffix, which enable matching newlines and ensure the end of the string is
 # matched, respectively. These features are undesirable for our implementation
 # of PurePath.match(), which represents path separators as newlines and joins
 # pattern segments together. As a workaround, we define a slice object that
 # can remove the prefix and suffix from any translate() result. See the
 # _compile_pattern_lines() function for more details.
 _FNMATCH_PREFIX, _FNMATCH_SUFFIX = fnmatch.translate('_').split('_')
 _FNMATCH_SLICE = slice(len(_FNMATCH_PREFIX), -len(_FNMATCH_SUFFIX))
 _SWAP_SEP_AND_NEWLINE = {
    '/': str.maketrans({'/': '\n', '\n': '/'}),
    '\\': str.maketrans({'\\': '\n', '\n': '\\'}),
 }
@functools.lru_cache()
 def _make_selector(pattern_parts, flavour, case_sensitive):
    pat = pattern_parts[0]
@ -92,6 +109,51 @@ def _compile_pattern(pat, case_sensitive):
    return re.compile(fnmatch.translate(pat), flags).match
@functools.lru_cache()
def _compile_pattern_lines(pattern_lines, case_sensitive):
    r"""Compile the given pattern lines to an `re.Pattern` object.

    The *pattern_lines* argument is a glob-style pattern (e.g. '**/*.py') with
    its path separators and newlines swapped (e.g. '**\n*.py'). By using
    newlines to separate path components, and not setting `re.DOTALL`, we
    ensure that the `*` wildcard cannot match path separators.

    If *case_sensitive* is false, the pattern is compiled with
    `re.IGNORECASE`.

    The returned `re.Pattern` object may have its `match()` method called to
    match a complete pattern, or `search()` to match from the right. The
    argument supplied to these methods must also have its path separators and
    newlines swapped.

    Raises ValueError if '**' appears as only a portion of a path component.
    """
    # Split on '\n' only, keeping the separator attached to every component
    # except a non-empty final one. str.splitlines() is deliberately avoided:
    # it also breaks on '\r', '\x0b', '\x0c', '\x1c'-'\x1e', '\x85', '\u2028'
    # and '\u2029', which are ordinary filename characters here. Splitting on
    # them would let e.g. 'a\r**' slip through as a recursive wildcard rather
    # than being rejected below.
    *leading, last = pattern_lines.split('\n')
    components = [segment + '\n' for segment in leading]
    if last:
        components.append(last)

    # Match the start of the path, or just after a path separator
    parts = ['^']
    for part in components:
        if part == '**\n':
            # '**/' component: we use '[\s\S]' rather than '.' so that path
            # separators (i.e. newlines) are matched. The trailing '^' ensures
            # we terminate after a path separator (i.e. on a new line).
            part = r'[\s\S]*^'
        elif part == '**':
            # '**' component.
            part = r'[\s\S]*'
        elif '**' in part:
            raise ValueError("Invalid pattern: '**' can only be an entire path component")
        else:
            # Any other component: pass to fnmatch.translate(). We slice off
            # the common prefix and suffix added by translate() to ensure that
            # re.DOTALL is not set, and the end of the string not matched,
            # respectively. With DOTALL not set, '*' wildcards will not match
            # path separators, because the '.' characters in the pattern will
            # not match newlines.
            part = fnmatch.translate(part)[_FNMATCH_SLICE]
        parts.append(part)
    # Match the end of the path, always.
    parts.append(r'\Z')
    # MULTILINE lets the '^' anchors match just after any path separator.
    flags = re.MULTILINE
    if not case_sensitive:
        flags |= re.IGNORECASE
    return re.compile(''.join(parts), flags=flags)
 class _Selector:
    """A selector matches a specific glob pattern part against the children
    of a given path."""
@ -276,6 +338,10 @@ class PurePath:
        # to implement comparison methods like `__lt__()`.
        '_parts_normcase_cached',
        # The `_lines_cached` slot stores the string path with path separators
        # and newlines swapped. This is used to implement `match()`.
        '_lines_cached',
        # The `_hash` slot stores the hash of the case-normalized string
        # path. It's set when `__hash__()` is called for the first time.
        '_hash',
@ -441,6 +507,16 @@ def _parts_normcase(self):
            self._parts_normcase_cached = self._str_normcase.split(self._flavour.sep)
            return self._parts_normcase_cached
    @property
    def _lines(self):
        """Return the string path with separators and newlines swapped.

        The swapped string is what pattern matching operates on. It is built
        on first access using the translation table for this flavour's
        separator, then stored in `_lines_cached` and reused afterwards.
        """
        try:
            swapped = self._lines_cached
        except AttributeError:
            # First access: translate once and remember the result.
            table = _SWAP_SEP_AND_NEWLINE[self._flavour.sep]
            swapped = self._lines_cached = str(self).translate(table)
        return swapped
    def __eq__(self, other):
        if not isinstance(other, PurePath):
            return NotImplemented
@ -697,23 +773,18 @@ def match(self, path_pattern, *, case_sensitive=None):
        """
        Return True if this path matches the given pattern.
        """
        if not isinstance(path_pattern, PurePath):
            path_pattern = self.with_segments(path_pattern)
        if case_sensitive is None:
            case_sensitive = _is_case_sensitive(self._flavour)
-        pat = self.with_segments(path_pattern)
+        pattern = _compile_pattern_lines(path_pattern._lines, case_sensitive)
-        if not pat.parts:
+        if path_pattern.drive or path_pattern.root:
            return pattern.match(self._lines) is not None
        elif path_pattern._tail:
            return pattern.search(self._lines) is not None
        else:
            raise ValueError("empty pattern")
-        pat_parts = pat.parts
+
        parts = self.parts
        if pat.drive or pat.root:
            if len(pat_parts) != len(parts):
                return False
        elif len(pat_parts) > len(parts):
            return False
        for part, pat in zip(reversed(parts), reversed(pat_parts)):
            match = _compile_pattern(pat, case_sensitive)
            if not match(part):
                return False
        return True
 # Subclassing os.PathLike makes isinstance() checks slower,
 # which in turn makes Path construction slower. Register instead!
--- a/Lib/test/test_pathlib.py
+++ b/Lib/test/test_pathlib.py
@ -310,8 +310,30 @@ def test_match_common(self):
        self.assertFalse(P('/ab.py').match('/a/*.py'))
        self.assertFalse(P('/a/b/c.py').match('/a/*.py'))
        # Multi-part glob-style pattern.
-        self.assertFalse(P('/a/b/c.py').match('/**/*.py'))
+        self.assertTrue(P('a').match('**'))
        self.assertTrue(P('c.py').match('**'))
        self.assertTrue(P('a/b/c.py').match('**'))
        self.assertTrue(P('/a/b/c.py').match('**'))
        self.assertTrue(P('/a/b/c.py').match('/**'))
        self.assertTrue(P('/a/b/c.py').match('**/'))
        self.assertTrue(P('/a/b/c.py').match('/a/**'))
        self.assertTrue(P('/a/b/c.py').match('**/*.py'))
        self.assertTrue(P('/a/b/c.py').match('/**/*.py'))
        self.assertTrue(P('/a/b/c.py').match('/a/**/*.py'))
        self.assertTrue(P('/a/b/c.py').match('/a/b/**/*.py'))
        self.assertTrue(P('/a/b/c.py').match('/**/**/**/**/*.py'))
        self.assertFalse(P('c.py').match('**/a.py'))
        self.assertFalse(P('c.py').match('c/**'))
        self.assertFalse(P('a/b/c.py').match('**/a'))
        self.assertFalse(P('a/b/c.py').match('**/a/b'))
        self.assertFalse(P('a/b/c.py').match('**/a/b/c'))
        self.assertFalse(P('a/b/c.py').match('**/a/b/c.'))
        self.assertFalse(P('a/b/c.py').match('**/a/b/c./**'))
        self.assertFalse(P('a/b/c.py').match('**/a/b/c./**'))
        self.assertFalse(P('a/b/c.py').match('/a/b/c.py/**'))
        self.assertFalse(P('a/b/c.py').match('/**/a/b/c.py'))
        self.assertRaises(ValueError, P('a').match, '**a/b/c')
        self.assertRaises(ValueError, P('a').match, 'a/b/c**')
        # Case-sensitive flag
        self.assertFalse(P('A.py').match('a.PY', case_sensitive=True))
        self.assertTrue(P('A.py').match('a.PY', case_sensitive=False))
--- a/Misc/NEWS.d/next/Library/2023-02-17-18-56-46.gh-issue-73435.7sTJHk.rst
+++ b/Misc/NEWS.d/next/Library/2023-02-17-18-56-46.gh-issue-73435.7sTJHk.rst
@ -0,0 +1 @@
 Add support for recursive wildcards in :meth:`pathlib.PurePath.match`.
		`@ -0,0 +1 @@`
							Add support for recursive wildcards in :meth:`pathlib.PurePath.match`.