Forward port some fixes that were in email 2.5 but for some reason didn't make it into email 4.0.

Specifically, in Message.get_content_charset(), handle RFC 2231 headers that contain an encoding not known to Python, or a character in the data that isn't in the charset encoding. Also forward port the appropriate unit tests.
2025-10-31 13:41:24 +00:00 · 2006-07-26 05:54:46 +00:00 · 2006-07-26 05:54:46 +00:00 · d92ae78bdb
commit d92ae78bdb
parent 9815f8b252
3 changed files with 100 additions and 1 deletions
--- a/Lib/email/message.py
+++ b/Lib/email/message.py
@@ -747,7 +747,18 @@ def get_content_charset(self, failobj=None):
        if isinstance(charset, tuple):
            # RFC 2231 encoded, so decode it, and it better end up as ascii.
            pcharset = charset[0] or 'us-ascii'
            try:
                # LookupError will be raised if the charset isn't known to
                # Python.  UnicodeError will be raised if the encoded text
                # contains a character not in the charset.
                charset = unicode(charset[2], pcharset).encode('us-ascii')
            except (LookupError, UnicodeError):
                charset = charset[2]
        # charset character must be in us-ascii range
        try:
            charset = unicode(charset, 'us-ascii').encode('us-ascii')
        except UnicodeError:
            return failobj
        # RFC 2046, $4.1.2 says charsets are not case sensitive
        return charset.lower()
--- a/Lib/email/test/test_email.py
+++ b/Lib/email/test/test_email.py
@@ -3086,6 +3086,50 @@ def test_rfc2231_no_language_or_charset_in_charset(self):
        self.assertEqual(msg.get_content_charset(),
                         'this is even more ***fun*** is it not.pdf')
    def test_rfc2231_bad_encoding_in_filename(self):
        # The first RFC 2231 filename segment declares the charset 'bogus'.
        # The test expects get_filename() to still return the joined,
        # percent-decoded text of all three segments as a plain string.
        m = '''\
 Content-Disposition: inline;
 \tfilename*0*="bogus'xx'This%20is%20even%20more%20";
 \tfilename*1*="%2A%2A%2Afun%2A%2A%2A%20";
 \tfilename*2="is it not.pdf"
 '''
        msg = email.message_from_string(m)
        # %20 -> space and %2A -> '*'; the last segment is taken literally.
        self.assertEqual(msg.get_filename(),
                         'This is even more ***fun*** is it not.pdf')
    def test_rfc2231_bad_encoding_in_charset(self):
        # The charset parameter is itself RFC 2231 encoded: it declares the
        # encoding 'bogus' and its value ends in the percent-escaped bytes
        # E2 80 9D, which are outside the us-ascii range.
        m = """\
 Content-Type: text/plain; charset*=bogus''utf-8%E2%80%9D
 """
        msg = email.message_from_string(m)
        # This should return None because non-ascii characters in the charset
        # are not allowed.
        self.assertEqual(msg.get_content_charset(), None)
    def test_rfc2231_bad_character_in_charset(self):
        # Here the declared encoding is 'ascii', but the value still ends in
        # the percent-escaped bytes E2 80 9D, which are not valid in that
        # encoding.
        m = """\
 Content-Type: text/plain; charset*=ascii''utf-8%E2%80%9D
 """
        msg = email.message_from_string(m)
        # This should return None because non-ascii characters in the charset
        # are not allowed.
        self.assertEqual(msg.get_content_charset(), None)
    def test_rfc2231_bad_character_in_filename(self):
        # The filename is declared as 'ascii', but its last segment ends with
        # the percent-escaped byte E2, which is not an ascii character.
        m = '''\
 Content-Disposition: inline;
 \tfilename*0*="ascii'xx'This%20is%20even%20more%20";
 \tfilename*1*="%2A%2A%2Afun%2A%2A%2A%20";
 \tfilename*2*="is it not.pdf%E2"
 '''
        msg = email.message_from_string(m)
        # The expected result is a unicode string in which the undecodable
        # byte has become U+FFFD (the replacement character).
        self.assertEqual(msg.get_filename(),
                         u'This is even more ***fun*** is it not.pdf\ufffd')
    def test_rfc2231_unknown_encoding(self):
        m = """\
 Content-Transfer-Encoding: 8bit
--- a/Lib/email/test/test_email_renamed.py
+++ b/Lib/email/test/test_email_renamed.py
@@ -3092,6 +3092,50 @@ def test_rfc2231_no_language_or_charset_in_charset(self):
        self.assertEqual(msg.get_content_charset(),
                         'this is even more ***fun*** is it not.pdf')
    def test_rfc2231_bad_encoding_in_filename(self):
        # The first RFC 2231 filename segment declares the charset 'bogus'.
        # The test expects get_filename() to still return the joined,
        # percent-decoded text of all three segments as a plain string.
        m = '''\
 Content-Disposition: inline;
 \tfilename*0*="bogus'xx'This%20is%20even%20more%20";
 \tfilename*1*="%2A%2A%2Afun%2A%2A%2A%20";
 \tfilename*2="is it not.pdf"
 '''
        msg = email.message_from_string(m)
        # %20 -> space and %2A -> '*'; the last segment is taken literally.
        self.assertEqual(msg.get_filename(),
                         'This is even more ***fun*** is it not.pdf')
    def test_rfc2231_bad_encoding_in_charset(self):
        # The charset parameter is itself RFC 2231 encoded: it declares the
        # encoding 'bogus' and its value ends in the percent-escaped bytes
        # E2 80 9D, which are outside the us-ascii range.
        m = """\
 Content-Type: text/plain; charset*=bogus''utf-8%E2%80%9D
 """
        msg = email.message_from_string(m)
        # This should return None because non-ascii characters in the charset
        # are not allowed.
        self.assertEqual(msg.get_content_charset(), None)
    def test_rfc2231_bad_character_in_charset(self):
        # Here the declared encoding is 'ascii', but the value still ends in
        # the percent-escaped bytes E2 80 9D, which are not valid in that
        # encoding.
        m = """\
 Content-Type: text/plain; charset*=ascii''utf-8%E2%80%9D
 """
        msg = email.message_from_string(m)
        # This should return None because non-ascii characters in the charset
        # are not allowed.
        self.assertEqual(msg.get_content_charset(), None)
    def test_rfc2231_bad_character_in_filename(self):
        # The filename is declared as 'ascii', but its last segment ends with
        # the percent-escaped byte E2, which is not an ascii character.
        m = '''\
 Content-Disposition: inline;
 \tfilename*0*="ascii'xx'This%20is%20even%20more%20";
 \tfilename*1*="%2A%2A%2Afun%2A%2A%2A%20";
 \tfilename*2*="is it not.pdf%E2"
 '''
        msg = email.message_from_string(m)
        # The expected result is a unicode string in which the undecodable
        # byte has become U+FFFD (the replacement character).
        self.assertEqual(msg.get_filename(),
                         u'This is even more ***fun*** is it not.pdf\ufffd')
    def test_rfc2231_unknown_encoding(self):
        m = """\
 Content-Transfer-Encoding: 8bit