mirror of
https://github.com/python/cpython.git
synced 2025-12-08 06:10:17 +00:00
[3.9] gh-102153: Start stripping C0 control and space chars in urlsplit (GH-102508) (GH-104575) (GH-104592) (#104593)
gh-102153: Start stripping C0 control and space chars in `urlsplit` (GH-102508) `urllib.parse.urlsplit` has already been respecting the WHATWG spec a bit (see GH-25595). This adds more sanitizing to respect the "Remove any leading C0 control or space from input" [rule](https://url.spec.whatwg.org/#url-parsing:~:text=Remove%20any%20leading%20and%20trailing%20C0%20control%20or%20space%20from%20input.) in response to [CVE-2023-24329](https://nvd.nist.gov/vuln/detail/CVE-2023-24329). I simplified the docs by eliding the state-of-the-world explanatory paragraph in this security-release-only backport (people will see that in the mainline /3/ docs). (cherry picked from commit 2f630e1ce1) (cherry picked from commit 610cc0ab1b) (cherry picked from commit f48a96a280) Co-authored-by: Illia Volochii <illia.volochii@gmail.com> Co-authored-by: Gregory P. Smith [Google] <greg@krypto.org>
This commit is contained in:
parent
3d5dd1eee2
commit
d7f8a5fe07
4 changed files with 111 additions and 3 deletions
|
|
@ -25,6 +25,10 @@
|
|||
scenarios for parsing, and for backward compatibility purposes, some
|
||||
parsing quirks from older RFCs are retained. The testcases in
|
||||
test_urlparse.py provides a good indicator of parsing behavior.
|
||||
|
||||
The WHATWG URL Parser spec should also be considered. We are not compliant with
|
||||
it either due to existing user code API behavior expectations (Hyrum's Law).
|
||||
It serves as a useful guide when making changes.
|
||||
"""
|
||||
|
||||
import re
|
||||
|
|
@ -78,6 +82,10 @@
|
|||
'0123456789'
|
||||
'+-.')
|
||||
|
||||
# Leading and trailing C0 control and space to be stripped per WHATWG spec.
|
||||
# == "".join([chr(i) for i in range(0, 0x20 + 1)])
|
||||
_WHATWG_C0_CONTROL_OR_SPACE = '\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f '
|
||||
|
||||
# Unsafe bytes to be removed per WHATWG spec
|
||||
_UNSAFE_URL_BYTES_TO_REMOVE = ['\t', '\r', '\n']
|
||||
|
||||
|
|
@ -456,6 +464,10 @@ def urlsplit(url, scheme='', allow_fragments=True):
|
|||
"""
|
||||
|
||||
url, scheme, _coerce_result = _coerce_args(url, scheme)
|
||||
# Only lstrip url as some applications rely on preserving trailing space.
|
||||
# (https://url.spec.whatwg.org/#concept-basic-url-parser would strip both)
|
||||
url = url.lstrip(_WHATWG_C0_CONTROL_OR_SPACE)
|
||||
scheme = scheme.strip(_WHATWG_C0_CONTROL_OR_SPACE)
|
||||
|
||||
for b in _UNSAFE_URL_BYTES_TO_REMOVE:
|
||||
url = url.replace(b, "")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue