gh-79986: Add parsing for References/In-Reply-To email headers (#137201)

This is a followup to 46d88a1131 (#13397),
which added parsing for Message-ID. Similar handling is needed for the
other two identification headers.
This commit is contained in:
elenril 2025-12-11 15:20:53 +01:00 committed by GitHub
parent 4eab90f4f3
commit 79aa43a979
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 137 additions and 0 deletions

View file

@ -878,6 +878,12 @@ class MessageID(MsgID):
class InvalidMessageID(MessageID):
    # Same class body as MessageID except for the token type tag; nothing
    # else is overridden here.
    token_type = 'invalid-message-id'
class MessageIDList(TokenList):
    """Token list whose items of interest are 'msg-id' tokens."""

    token_type = 'message-id-list'

    @property
    def message_ids(self):
        """Return, in order, the direct children tagged as 'msg-id'.

        Children with any other token_type are left out of the result.
        """
        selected = []
        for child in self:
            if child.token_type == 'msg-id':
                selected.append(child)
        return selected
class Header(TokenList):
token_type = 'header'
@ -2175,6 +2181,32 @@ def parse_message_id(value):
return message_id
def parse_message_ids(value):
    """Parse *value* as a sequence of msg-id tokens.

        in-reply-to = "In-Reply-To:" 1*msg-id CRLF
        references  = "References:" 1*msg-id CRLF

    Returns a MessageIDList.  A comma found between ids is recorded as an
    InvalidHeaderDefect and represented by a single-space token.  When
    get_msg_id() raises HeaderParseError, the remaining text is parsed as
    unstructured, wrapped in InvalidMessageID, a defect is recorded, and
    parsing stops.
    """
    id_list = MessageIDList()
    while value:
        if value[0] != ',':
            try:
                msg_id, value = get_msg_id(value)
            except errors.HeaderParseError as exc:
                # *value* is unchanged here because the tuple assignment
                # above never completed; keep the leftover text as-is.
                leftover = get_unstructured(value)
                id_list.append(InvalidMessageID(leftover))
                id_list.defects.append(
                    errors.InvalidHeaderDefect(
                        "Invalid msg-id: {!r}".format(exc)))
                break
            id_list.append(msg_id)
        else:
            # message id list separated with commas - this is invalid,
            # but happens rather frequently in the wild
            id_list.defects.append(
                errors.InvalidHeaderDefect("comma in msg-id list"))
            id_list.append(
                WhiteSpaceTerminal(' ', 'invalid-comma-replacement'))
            value = value[1:]
    return id_list
#
# XXX: As I begin to add additional header parsers, I'm realizing we probably
# have two levels of parser routines: the get_XXX methods that get a token in

View file

@ -534,6 +534,18 @@ def parse(cls, value, kwds):
kwds['defects'].extend(parse_tree.all_defects)
class ReferencesHeader:
    # Header class whose value is parsed with parser.parse_message_ids.

    max_count = 1
    value_parser = staticmethod(parser.parse_message_ids)

    @classmethod
    def parse(cls, value, kwds):
        """Parse *value* and store the results in the *kwds* mapping.

        Sets kwds['parse_tree'] to the parse tree and kwds['decoded'] to its
        string form, then extends kwds['defects'] with the tree's defects.
        """
        tree = cls.value_parser(value)
        kwds['parse_tree'] = tree
        kwds['decoded'] = str(tree)
        kwds['defects'].extend(tree.all_defects)
# The header factory #
_default_header_map = {
@ -557,6 +569,8 @@ def parse(cls, value, kwds):
'content-disposition': ContentDispositionHeader,
'content-transfer-encoding': ContentTransferEncodingHeader,
'message-id': MessageIDHeader,
'in-reply-to': ReferencesHeader,
'references': ReferencesHeader,
}
class HeaderRegistry:

View file

@ -2867,6 +2867,81 @@ def test_get_msg_id_ws_only_local(self):
)
self.assertEqual(msg_id.token_type, 'msg-id')
def test_parse_message_ids_valid(self):
    # Two space-separated ids; the same text is expected back for both
    # expected-text arguments, and the expected defect list is empty.
    text = "<foo@bar> <bar@foo>"
    id_list = self._test_parse_x(
        parser.parse_message_ids, text, text, text, [])
    self.assertEqual(id_list.token_type, 'message-id-list')
def test_parse_message_ids_empty(self):
    # Whitespace-only input: one InvalidHeaderDefect is expected and the
    # text is expected back unchanged.
    blank = " "
    expected_defects = [errors.InvalidHeaderDefect]
    id_list = self._test_parse_x(
        parser.parse_message_ids, blank, blank, blank, expected_defects)
    self.assertEqual(id_list.token_type, 'message-id-list')
def test_parse_message_ids_comment(self):
    # An id followed by a parenthesised comment: the second expected text
    # is the id plus a single trailing space, and no defects are expected.
    with_comment = "<foo@bar> (foo's message from \"bar\")"
    id_list = self._test_parse_x(
        parser.parse_message_ids,
        with_comment,
        with_comment,
        "<foo@bar> ",
        [],
    )
    first_id = id_list.message_ids[0]
    self.assertEqual(first_id.value, '<foo@bar> ')
    self.assertEqual(id_list.token_type, 'message-id-list')
def test_parse_message_ids_no_sep(self):
    # Two ids with nothing between them: both must be found as separate
    # msg-id tokens, with an empty expected defect list.
    text = "<foo@bar><bar@foo>"
    id_list = self._test_parse_x(
        parser.parse_message_ids, text, text, text, [])
    ids = id_list.message_ids
    self.assertEqual(ids[0].value, '<foo@bar>')
    self.assertEqual(ids[1].value, '<bar@foo>')
    self.assertEqual(id_list.token_type, 'message-id-list')
def test_parse_message_ids_comma_sep(self):
    # A comma between ids: the expected texts have a space where the comma
    # was, and one InvalidHeaderDefect is expected.
    with_space = "<foo@bar> <bar@foo>"
    id_list = self._test_parse_x(
        parser.parse_message_ids,
        "<foo@bar>,<bar@foo>",
        with_space,
        with_space,
        [errors.InvalidHeaderDefect],
    )
    ids = id_list.message_ids
    self.assertEqual(ids[0].value, '<foo@bar>')
    self.assertEqual(ids[1].value, '<bar@foo>')
    self.assertEqual(id_list.token_type, 'message-id-list')
def test_parse_message_ids_invalid_id(self):
    # Angle brackets wrapped around text that is not an id: the text is
    # expected back unchanged along with two InvalidHeaderDefect entries.
    bogus = "<Date: Wed, 08 Jun 2002 09:78:58 +0600>"
    expected_defects = [
        errors.InvalidHeaderDefect,
        errors.InvalidHeaderDefect,
    ]
    id_list = self._test_parse_x(
        parser.parse_message_ids, bogus, bogus, bogus, expected_defects)
    self.assertEqual(id_list.token_type, 'message-id-list')
def test_parse_message_ids_broken_ang(self):
    # A good id followed by text starting with a stray '>': the text is
    # expected back unchanged with a single InvalidHeaderDefect.
    broken = "<foo@bar> >bar@foo"
    expected_defects = [errors.InvalidHeaderDefect]
    id_list = self._test_parse_x(
        parser.parse_message_ids, broken, broken, broken, expected_defects)
    self.assertEqual(id_list.token_type, 'message-id-list')
@parameterize

View file

@ -1821,5 +1821,18 @@ def test_message_id_header_is_not_folded(self):
h.fold(policy=policy.default.clone(max_line_length=20)),
'Message-ID:\n <ईमेलfromMessage@wők.com>\n')
def test_fold_references(self):
    # Both ids are longer than the 20-character max_line_length used
    # below; the expected output keeps each id whole and puts the second
    # one on its own continuation line.
    header_value = (
        '<referenceid1thatislongerthan@maxlinelength.com> '
        '<referenceid2thatislongerthan@maxlinelength.com>'
    )
    expected = (
        'References: '
        '<referenceid1thatislongerthan@maxlinelength.com>\n'
        ' <referenceid2thatislongerthan@maxlinelength.com>\n'
    )
    header = self.make_header('References', header_value)
    narrow_policy = policy.default.clone(max_line_length=20)
    self.assertEqual(header.fold(policy=narrow_policy), expected)
# Run the tests in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()

View file

@ -0,0 +1,3 @@
Add parsing for ``References`` and ``In-Reply-To`` headers to the :mod:`email`
library. Their content is now parsed as a list of message-ID tokens, which
prevents these headers from being folded incorrectly.