bpo-29979: Rewrite cgi.parse_multipart to make it consistent with FieldStorage (#991)

2025-12-08 06:10:17 +00:00 · 2017-05-08 14:08:34 +02:00 · 2017-05-08 14:08:34 +02:00 · cc3fa204d3
commit cc3fa204d3
parent f34c685020
5 changed files with 38 additions and 102 deletions
--- a/Doc/library/cgi.rst
+++ b/Doc/library/cgi.rst
@ -294,19 +294,20 @@ algorithms implemented in this module in other circumstances.
   This function is deprecated in this module. Use :func:`urllib.parse.parse_qsl`
   instead. It is maintained here only for backward compatibility.
-.. function:: parse_multipart(fp, pdict)
+.. function:: parse_multipart(fp, pdict, encoding="utf-8")
   Parse input of type :mimetype:`multipart/form-data` (for file uploads).
-   Arguments are *fp* for the input file and *pdict* for a dictionary containing
+   Arguments are *fp* for the input file, *pdict* for a dictionary containing
-   other parameters in the :mailheader:`Content-Type` header.
+   other parameters in the :mailheader:`Content-Type` header, and *encoding*,
   the request encoding.
-   Returns a dictionary just like :func:`urllib.parse.parse_qs` keys are the field names, each
+   Returns a dictionary just like :func:`urllib.parse.parse_qs`: keys are the
-   value is a list of values for that field.  This is easy to use but not much good
+   field names, each value is a list of values for that field. For non-file
-   if you are expecting megabytes to be uploaded --- in that case, use the
+   fields, the value is a list of strings.
   :class:`FieldStorage` class instead which is much more flexible.
-   Note that this does not parse nested multipart parts --- use
+   This is easy to use but not much good if you are expecting megabytes to be
-   :class:`FieldStorage` for that.
+   uploaded --- in that case, use the :class:`FieldStorage` class instead
   which is much more flexible.
 .. function:: parse_header(string)
--- a/Doc/whatsnew/3.7.rst
+++ b/Doc/whatsnew/3.7.rst
@ -95,6 +95,14 @@ New Modules
 Improved Modules
 ================
 cgi
 ---
 :func:`~cgi.parse_multipart` returns the same results as
 :class:`~cgi.FieldStorage`: for non-file fields, the value associated with a
 key is a list of strings, not bytes.
 (Contributed by Pierre Quentel in :issue:`29979`.)
 binascii
 --------
--- a/Lib/cgi.py
+++ b/Lib/cgi.py
@ -198,105 +198,28 @@ def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
         DeprecationWarning, 2)
    return urllib.parse.parse_qsl(qs, keep_blank_values, strict_parsing)
-def parse_multipart(fp, pdict):
+def parse_multipart(fp, pdict, encoding="utf-8"):
    """Parse multipart input.
    Arguments:
    fp   : input file
    pdict: dictionary containing other parameters of content-type header
    encoding: request encoding
    Returns a dictionary just like parse_qs(): keys are the field names, each
-    value is a list of values for that field.  This is easy to use but not
+    value is a list of values for that field. For non-file fields, the value
-    much good if you are expecting megabytes to be uploaded -- in that case,
+    is a list of strings.
    use the FieldStorage class instead which is much more flexible.  Note
    that content-type is the raw, unparsed contents of the content-type
    header.
    XXX This does not parse nested multipart parts -- use FieldStorage for
    that.
    XXX This should really be subsumed by FieldStorage altogether -- no
    point in having two implementations of the same parsing algorithm.
    Also, FieldStorage protects itself better against certain DoS attacks
    by limiting the size of the data read in one chunk.  The API here
    does not support that kind of protection.  This also affects parse()
    since it can call parse_multipart().
    """
-    import http.client
+    # RFC 2046, Section 5.1 : The "multipart" boundary delimiters are always
-
+    # represented as 7bit US-ASCII.
-    boundary = b""
+    boundary = pdict['boundary'].decode('ascii')
-    if 'boundary' in pdict:
+    ctype = "multipart/form-data; boundary={}".format(boundary)
-        boundary = pdict['boundary']
+    headers = Message()
-    if not valid_boundary(boundary):
+    headers.set_type(ctype)
-        raise ValueError('Invalid boundary in multipart form: %r'
+    headers['Content-Length'] = pdict['CONTENT-LENGTH']
-                            % (boundary,))
+    fs = FieldStorage(fp, headers=headers, encoding=encoding,
-
+        environ={'REQUEST_METHOD': 'POST'})
-    nextpart = b"--" + boundary
+    return {k: fs.getlist(k) for k in fs}
    lastpart = b"--" + boundary + b"--"
    partdict = {}
    terminator = b""
    while terminator != lastpart:
        bytes = -1
        data = None
        if terminator:
            # At start of next part.  Read headers first.
            headers = http.client.parse_headers(fp)
            clength = headers.get('content-length')
            if clength:
                try:
                    bytes = int(clength)
                except ValueError:
                    pass
            if bytes > 0:
                if maxlen and bytes > maxlen:
                    raise ValueError('Maximum content length exceeded')
                data = fp.read(bytes)
            else:
                data = b""
        # Read lines until end of part.
        lines = []
        while 1:
            line = fp.readline()
            if not line:
                terminator = lastpart # End outer loop
                break
            if line.startswith(b"--"):
                terminator = line.rstrip()
                if terminator in (nextpart, lastpart):
                    break
            lines.append(line)
        # Done with part.
        if data is None:
            continue
        if bytes < 0:
            if lines:
                # Strip final line terminator
                line = lines[-1]
                if line[-2:] == b"\r\n":
                    line = line[:-2]
                elif line[-1:] == b"\n":
                    line = line[:-1]
                lines[-1] = line
                data = b"".join(lines)
        line = headers['content-disposition']
        if not line:
            continue
        key, params = parse_header(line)
        if key != 'form-data':
            continue
        if 'name' in params:
            name = params['name']
        else:
            continue
        if name in partdict:
            partdict[name].append(data)
        else:
            partdict[name] = [data]
    return partdict
 def _parseparam(s):
    while s[:1] == ';':
--- a/Lib/test/test_cgi.py
+++ b/Lib/test/test_cgi.py
@ -126,8 +126,8 @@ def test_parse_multipart(self):
        env = {'boundary': BOUNDARY.encode('latin1'),
               'CONTENT-LENGTH': '558'}
        result = cgi.parse_multipart(fp, env)
-        expected = {'submit': [b' Add '], 'id': [b'1234'],
+        expected = {'submit': [' Add '], 'id': ['1234'],
-                    'file': [b'Testing 123.\n'], 'title': [b'']}
+                    'file': [b'Testing 123.\n'], 'title': ['']}
        self.assertEqual(result, expected)
    def test_fieldstorage_properties(self):
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -317,6 +317,10 @@ Extension Modules
 Library
 -------
 - bpo-29979: rewrite cgi.parse_multipart, reusing the FieldStorage class and
  making its results consistent with those of FieldStorage for
  multipart/form-data requests. Patch by Pierre Quentel.
 - bpo-30243: Removed the __init__ methods of _json's scanner and encoder.
  Misusing them could cause memory leaks or crashes.  Now scanner and encoder
  objects are completely initialized in the __new__ methods.