mirror of
https://github.com/python/cpython.git
synced 2026-06-08 02:41:11 +00:00
gh-53144: Improve charset support in the email package (GH-149942)
Defer to the codecs module for all aliases. Use MIME/IANA names for all IANA-registered charsets. Fix email.contentmanager.set_text_content().
This commit is contained in:
parent
ac8f80ce7b
commit
c195a046f8
6 changed files with 223 additions and 35 deletions
|
|
@ -9,6 +9,7 @@
|
|||
'add_codec',
|
||||
]
|
||||
|
||||
import codecs
|
||||
from functools import partial
|
||||
|
||||
import email.base64mime
|
||||
|
|
@ -58,37 +59,71 @@
|
|||
'shift_jis': (BASE64, None, 'iso-2022-jp'),
|
||||
'iso-2022-jp': (BASE64, None, None),
|
||||
'koi8-r': (BASE64, BASE64, None),
|
||||
'utf-8': (SHORTEST, BASE64, 'utf-8'),
|
||||
}
|
||||
|
||||
# Aliases for other commonly-used names for character sets. Map
|
||||
# them to the real ones used in email.
|
||||
# Map Python codec names to their corresponding MIME/IANA names.
|
||||
ALIASES = {
|
||||
'latin_1': 'iso-8859-1',
|
||||
'latin-1': 'iso-8859-1',
|
||||
'latin_2': 'iso-8859-2',
|
||||
'latin-2': 'iso-8859-2',
|
||||
'latin_3': 'iso-8859-3',
|
||||
'latin-3': 'iso-8859-3',
|
||||
'latin_4': 'iso-8859-4',
|
||||
'latin-4': 'iso-8859-4',
|
||||
'latin_5': 'iso-8859-9',
|
||||
'latin-5': 'iso-8859-9',
|
||||
'latin_6': 'iso-8859-10',
|
||||
'latin-6': 'iso-8859-10',
|
||||
'latin_7': 'iso-8859-13',
|
||||
'latin-7': 'iso-8859-13',
|
||||
'latin_8': 'iso-8859-14',
|
||||
'latin-8': 'iso-8859-14',
|
||||
'latin_9': 'iso-8859-15',
|
||||
'latin-9': 'iso-8859-15',
|
||||
'latin_10':'iso-8859-16',
|
||||
'latin-10':'iso-8859-16',
|
||||
'cp949': 'ks_c_5601-1987',
|
||||
'euc_jp': 'euc-jp',
|
||||
'euc_kr': 'euc-kr',
|
||||
'ascii': 'us-ascii',
|
||||
}
|
||||
'ascii': 'us-ascii',
|
||||
'big5hkscs': 'big5-hkscs',
|
||||
'cp037': 'ibm037',
|
||||
'cp1026': 'ibm1026',
|
||||
'cp1140': 'ibm01140',
|
||||
'cp1250': 'windows-1250',
|
||||
'cp1251': 'windows-1251',
|
||||
'cp1252': 'windows-1252',
|
||||
'cp1253': 'windows-1253',
|
||||
'cp1254': 'windows-1254',
|
||||
'cp1255': 'windows-1255',
|
||||
'cp1256': 'windows-1256',
|
||||
'cp1257': 'windows-1257',
|
||||
'cp1258': 'windows-1258',
|
||||
'cp273': 'ibm273',
|
||||
'cp424': 'ibm424',
|
||||
'cp437': 'ibm437',
|
||||
'cp500': 'ibm500',
|
||||
'cp775': 'ibm775',
|
||||
'cp850': 'ibm850',
|
||||
'cp852': 'ibm852',
|
||||
'cp855': 'ibm855',
|
||||
'cp857': 'ibm857',
|
||||
'cp858': 'ibm00858',
|
||||
'cp860': 'ibm860',
|
||||
'cp861': 'ibm861',
|
||||
'cp862': 'ibm862',
|
||||
'cp863': 'ibm863',
|
||||
'cp864': 'ibm864',
|
||||
'cp865': 'ibm865',
|
||||
'cp866': 'ibm866',
|
||||
'cp869': 'ibm869',
|
||||
'cp874': 'windows-874',
|
||||
'euc_jp': 'euc-jp',
|
||||
'euc_kr': 'euc-kr',
|
||||
'hz': 'hz-gb-2312',
|
||||
'iso2022_jp': 'iso-2022-jp',
|
||||
'iso2022_jp_2': 'iso-2022-jp-2',
|
||||
'iso2022_kr': 'iso-2022-kr',
|
||||
'iso8859-1': 'iso-8859-1',
|
||||
'iso8859-10': 'iso-8859-10',
|
||||
'iso8859-11': 'iso-8859-11',
|
||||
'iso8859-13': 'iso-8859-13',
|
||||
'iso8859-14': 'iso-8859-14',
|
||||
'iso8859-15': 'iso-8859-15',
|
||||
'iso8859-16': 'iso-8859-16',
|
||||
'iso8859-2': 'iso-8859-2',
|
||||
'iso8859-3': 'iso-8859-3',
|
||||
'iso8859-4': 'iso-8859-4',
|
||||
'iso8859-5': 'iso-8859-5',
|
||||
'iso8859-6': 'iso-8859-6',
|
||||
'iso8859-7': 'iso-8859-7',
|
||||
'iso8859-8': 'iso-8859-8-i',
|
||||
'iso8859-9': 'iso-8859-9',
|
||||
'kz1048': 'kz-1048',
|
||||
'mac-roman': 'macintosh',
|
||||
|
||||
# CP949 is not registered with IANA. KS_C_5601-1987 is not identical,
|
||||
# but it is the closest registered charset.
|
||||
'cp949': 'ks_c_5601-1987',
|
||||
}
|
||||
|
||||
|
||||
# Map charsets to their Unicode codec strings.
|
||||
|
|
@ -215,7 +250,18 @@ def __init__(self, input_charset=DEFAULT_CHARSET):
|
|||
raise errors.CharsetError(input_charset)
|
||||
input_charset = input_charset.lower()
|
||||
# Set the input charset after filtering through the aliases
|
||||
self.input_charset = ALIASES.get(input_charset, input_charset)
|
||||
# For backward compatibility, consult ALIASES first so that an entry
|
||||
# there takes precedence over the codecs module lookup below.
|
||||
if input_charset in ALIASES:
|
||||
input_charset = ALIASES[input_charset]
|
||||
else:
|
||||
try:
|
||||
input_codec = codecs.lookup(input_charset).name
|
||||
except LookupError:
|
||||
pass
|
||||
else:
|
||||
input_charset = ALIASES.get(input_codec, input_codec)
|
||||
self.input_charset = input_charset
|
||||
# We can try to guess which encoding and conversion to use by the
|
||||
# charset_map dictionary. Try that first, but let the user override
|
||||
# it.
|
||||
|
|
|
|||
|
|
@ -173,11 +173,11 @@ def set_text_content(msg, string, subtype="plain", charset='utf-8', cte=None,
|
|||
disposition=None, filename=None, cid=None,
|
||||
params=None, headers=None):
|
||||
_prepare_set(msg, 'text', subtype, headers)
|
||||
|
||||
charset = email.charset.Charset(charset).input_charset
|
||||
cte, payload = _encode_text(string, charset, cte, msg.policy)
|
||||
msg.set_payload(payload)
|
||||
msg.set_param('charset',
|
||||
email.charset.ALIASES.get(charset, charset),
|
||||
replace=True)
|
||||
msg.set_param('charset', charset, replace=True)
|
||||
msg['Content-Transfer-Encoding'] = cte
|
||||
_finalize_set(msg, disposition, filename, cid, params)
|
||||
raw_data_manager.add_set_handler(str, set_text_content)
|
||||
|
|
|
|||
|
|
@ -83,15 +83,15 @@ def test_chinese_codecs(self):
|
|||
h.append(s, Charset('big5hkscs'))
|
||||
eq(h.encode(), """\
|
||||
Chinese =?gb2312?b?1tDOxA==?= =?gbk?b?1tDOxA==?= =?gb18030?b?1tDOxA==?=
|
||||
=?hz?b?fntWUE5Efn0=?= =?big5?b?pKSk5Q==?= =?big5hkscs?b?pKSk5Q==?=""")
|
||||
=?hz-gb-2312?b?fntWUE5Efn0=?= =?big5?b?pKSk5Q==?= =?big5-hkscs?b?pKSk5Q==?=""")
|
||||
eq(decode_header(h.encode()),
|
||||
[(b'Chinese ', None),
|
||||
(b'\xd6\xd0\xce\xc4', 'gb2312'),
|
||||
(b'\xd6\xd0\xce\xc4', 'gbk'),
|
||||
(b'\xd6\xd0\xce\xc4', 'gb18030'),
|
||||
(b'~{VPND~}', 'hz'),
|
||||
(b'~{VPND~}', 'hz-gb-2312'),
|
||||
(b'\xa4\xa4\xa4\xe5', 'big5'),
|
||||
(b'\xa4\xa4\xa4\xe5', 'big5hkscs'),
|
||||
(b'\xa4\xa4\xa4\xe5', 'big5-hkscs'),
|
||||
])
|
||||
|
||||
def test_korean_codecs(self):
|
||||
|
|
|
|||
|
|
@ -342,6 +342,19 @@ def test_set_text_charset_latin_1(self):
|
|||
self.assertEqual(m.get_payload(decode=True).decode('utf-8'), content)
|
||||
self.assertEqual(m.get_content(), content)
|
||||
|
||||
def test_set_text_charset_cp949(self):
|
||||
m = self._make_message()
|
||||
content = "\ud55c\uad6d\uc5b4\n\uac02\n"
|
||||
raw_data_manager.set_content(m, content, charset='cp949')
|
||||
self.assertEqual(str(m), textwrap.dedent("""\
|
||||
Content-Type: text/plain; charset="ks_c_5601-1987"
|
||||
Content-Transfer-Encoding: base64
|
||||
|
||||
x9Gxub7uCoFBCg==
|
||||
"""))
|
||||
self.assertEqual(m.get_payload(decode=True).decode('ks_c_5601-1987'), content)
|
||||
self.assertEqual(m.get_content(), content)
|
||||
|
||||
def test_set_text_plain_long_line_heuristics(self):
|
||||
m = self._make_message()
|
||||
content = ("Simple but long message that is over 78 characters"
|
||||
|
|
|
|||
|
|
@ -4970,6 +4970,128 @@ def tearDown(self):
|
|||
except KeyError:
|
||||
pass
|
||||
|
||||
def test_attributes(self):
|
||||
from email import charset
|
||||
c = Charset()
|
||||
self.assertEqual(c.input_charset, 'us-ascii')
|
||||
self.assertEqual(c.header_encoding, None)
|
||||
self.assertEqual(c.body_encoding, None)
|
||||
self.assertEqual(c.output_charset, 'us-ascii')
|
||||
self.assertEqual(c.input_codec, None)
|
||||
self.assertEqual(c.output_codec, None)
|
||||
|
||||
c = Charset('us-ascii')
|
||||
self.assertEqual(c.input_charset, 'us-ascii')
|
||||
self.assertEqual(c.header_encoding, None)
|
||||
self.assertEqual(c.body_encoding, None)
|
||||
self.assertEqual(c.output_charset, 'us-ascii')
|
||||
self.assertEqual(c.input_codec, None)
|
||||
self.assertEqual(c.output_codec, None)
|
||||
|
||||
c = Charset('utf8')
|
||||
self.assertEqual(c.input_charset, 'utf-8')
|
||||
self.assertEqual(c.header_encoding, charset.SHORTEST)
|
||||
self.assertEqual(c.body_encoding, charset.BASE64)
|
||||
self.assertEqual(c.output_charset, 'utf-8')
|
||||
self.assertEqual(c.input_codec, 'utf-8')
|
||||
self.assertEqual(c.output_codec, 'utf-8')
|
||||
|
||||
c = Charset('latin1')
|
||||
self.assertEqual(c.input_charset, 'iso-8859-1')
|
||||
self.assertEqual(c.header_encoding, charset.QP)
|
||||
self.assertEqual(c.body_encoding, charset.QP)
|
||||
self.assertEqual(c.output_charset, 'iso-8859-1')
|
||||
self.assertEqual(c.input_codec, 'iso-8859-1')
|
||||
self.assertEqual(c.output_codec, 'iso-8859-1')
|
||||
|
||||
c = Charset('latin9')
|
||||
self.assertEqual(c.input_charset, 'iso-8859-15')
|
||||
self.assertEqual(c.header_encoding, charset.QP)
|
||||
self.assertEqual(c.body_encoding, charset.QP)
|
||||
self.assertEqual(c.output_charset, 'iso-8859-15')
|
||||
self.assertEqual(c.input_codec, 'iso-8859-15')
|
||||
self.assertEqual(c.output_codec, 'iso-8859-15')
|
||||
|
||||
c = Charset('cyrillic')
|
||||
self.assertEqual(c.input_charset, 'iso-8859-5')
|
||||
self.assertEqual(c.header_encoding, charset.SHORTEST)
|
||||
self.assertEqual(c.body_encoding, charset.BASE64)
|
||||
self.assertEqual(c.output_charset, 'iso-8859-5')
|
||||
self.assertEqual(c.input_codec, 'iso-8859-5')
|
||||
self.assertEqual(c.output_codec, 'iso-8859-5')
|
||||
|
||||
c = Charset('cp1251')
|
||||
self.assertEqual(c.input_charset, 'windows-1251')
|
||||
self.assertEqual(c.header_encoding, charset.SHORTEST)
|
||||
self.assertEqual(c.body_encoding, charset.BASE64)
|
||||
self.assertEqual(c.output_charset, 'windows-1251')
|
||||
self.assertEqual(c.input_codec, 'windows-1251')
|
||||
self.assertEqual(c.output_codec, 'windows-1251')
|
||||
|
||||
c = Charset('cp1252')
|
||||
self.assertEqual(c.input_charset, 'windows-1252')
|
||||
self.assertEqual(c.header_encoding, charset.QP)
|
||||
self.assertEqual(c.body_encoding, charset.QP)
|
||||
self.assertEqual(c.output_charset, 'windows-1252')
|
||||
self.assertEqual(c.input_codec, 'windows-1252')
|
||||
self.assertEqual(c.output_codec, 'windows-1252')
|
||||
|
||||
c = Charset('eucjp')
|
||||
self.assertEqual(c.input_charset, 'euc-jp')
|
||||
self.assertEqual(c.header_encoding, charset.BASE64)
|
||||
self.assertEqual(c.body_encoding, None)
|
||||
self.assertEqual(c.output_charset, 'iso-2022-jp')
|
||||
self.assertEqual(c.input_codec, 'euc-jp')
|
||||
self.assertEqual(c.output_codec, 'iso-2022-jp')
|
||||
|
||||
c = Charset('cp949')
|
||||
self.assertEqual(c.input_charset, 'ks_c_5601-1987')
|
||||
self.assertEqual(c.header_encoding, charset.SHORTEST)
|
||||
self.assertEqual(c.body_encoding, charset.BASE64)
|
||||
self.assertEqual(c.output_charset, 'ks_c_5601-1987')
|
||||
self.assertEqual(c.input_codec, 'ks_c_5601-1987')
|
||||
self.assertEqual(c.output_codec, 'ks_c_5601-1987')
|
||||
|
||||
c = Charset('gb2312')
|
||||
self.assertEqual(c.input_charset, 'gb2312')
|
||||
self.assertEqual(c.header_encoding, charset.BASE64)
|
||||
self.assertEqual(c.body_encoding, charset.BASE64)
|
||||
self.assertEqual(c.output_charset, 'gb2312')
|
||||
self.assertEqual(c.input_codec, 'gb2312')
|
||||
self.assertEqual(c.output_codec, 'gb2312')
|
||||
|
||||
c = Charset('big5')
|
||||
self.assertEqual(c.input_charset, 'big5')
|
||||
self.assertEqual(c.header_encoding, charset.BASE64)
|
||||
self.assertEqual(c.body_encoding, charset.BASE64)
|
||||
self.assertEqual(c.output_charset, 'big5')
|
||||
self.assertEqual(c.input_codec, 'big5')
|
||||
self.assertEqual(c.output_codec, 'big5')
|
||||
|
||||
def test_user_charsets(self):
|
||||
from email import charset
|
||||
c = Charset('fake0')
|
||||
self.assertEqual(c.input_charset, 'fake0')
|
||||
self.assertEqual(c.header_encoding, charset.SHORTEST)
|
||||
self.assertEqual(c.body_encoding, charset.BASE64)
|
||||
self.assertEqual(c.output_charset, 'fake0')
|
||||
self.assertEqual(c.input_codec, 'fake0')
|
||||
self.assertEqual(c.output_codec, 'fake0')
|
||||
|
||||
charset.add_alias('fake1', 'mime-fake')
|
||||
charset.add_alias('output-mime-fake', 'output-mime-fake-alias')
|
||||
charset.add_codec('mime-fake', 'fakecodec')
|
||||
charset.add_codec('output-mime-fake-alias', 'outputfakecodec')
|
||||
charset.add_charset('mime-fake', charset.QP, None, 'output-mime-fake')
|
||||
|
||||
c = Charset('fake1')
|
||||
self.assertEqual(c.input_charset, 'mime-fake')
|
||||
self.assertEqual(c.header_encoding, charset.QP)
|
||||
self.assertEqual(c.body_encoding, None)
|
||||
self.assertEqual(c.output_charset, 'output-mime-fake-alias')
|
||||
self.assertEqual(c.input_codec, 'fakecodec')
|
||||
self.assertEqual(c.output_codec, 'outputfakecodec')
|
||||
|
||||
def test_codec_encodeable(self):
|
||||
eq = self.assertEqual
|
||||
# Make sure us-ascii = no Unicode conversion
|
||||
|
|
@ -5010,6 +5132,11 @@ def test_unicode_charset_name(self):
|
|||
self.assertEqual(str(charset), 'us-ascii')
|
||||
self.assertRaises(errors.CharsetError, Charset, 'asc\xffii')
|
||||
|
||||
def test_bytes_charset_name(self):
|
||||
charset = Charset(b'us-ascii')
|
||||
self.assertEqual(str(charset), 'us-ascii')
|
||||
self.assertRaises(errors.CharsetError, Charset, b'asc\xffii')
|
||||
|
||||
|
||||
|
||||
# Test multilingual MIME headers.
|
||||
|
|
|
|||
|
|
@ -0,0 +1,2 @@
|
|||
The :mod:`email` package now supports all aliases of Python codecs and uses
|
||||
MIME/IANA names for all IANA-registered charsets.
|
||||
Loading…
Add table
Add a link
Reference in a new issue