| 
									
										
										
										
											2002-01-27 06:48:02 +00:00
										 |  |  |  | # Copyright (C) 2001,2002 Python Software Foundation | 
					
						
							| 
									
										
										
										
											2001-09-23 03:17:28 +00:00
										 |  |  |  | # Author: barry@zope.com (Barry Warsaw) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | """A parser of RFC 2822 and MIME email messages.
 | 
					
						
							|  |  |  |  | """
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2002-05-19 23:51:50 +00:00
										 |  |  |  | import re | 
					
						
							| 
									
										
										
										
											2001-09-23 03:17:28 +00:00
										 |  |  |  | from cStringIO import StringIO | 
					
						
							| 
									
										
										
										
											2002-01-27 06:48:02 +00:00
										 |  |  |  | from types import ListType | 
					
						
							| 
									
										
										
										
											2001-09-23 03:17:28 +00:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2002-06-02 19:12:03 +00:00
										 |  |  |  | from email import Errors | 
					
						
							|  |  |  |  | from email import Message | 
					
						
							| 
									
										
										
										
											2001-09-23 03:17:28 +00:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  | EMPTYSTRING = '' | 
					
						
							|  |  |  |  | NL = '\n' | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-10-04 17:05:11 +00:00
										 |  |  |  |  | 
					
						
							| 
									
										
										
										
											2001-09-23 03:17:28 +00:00
										 |  |  |  | class Parser: | 
					
						
							| 
									
										
										
										
											2002-07-19 22:25:34 +00:00
										 |  |  |  |     def __init__(self, _class=Message.Message, strict=0): | 
					
						
							| 
									
										
										
										
											2001-09-23 03:17:28 +00:00
										 |  |  |  |         """Parser of RFC 2822 and MIME email messages.
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |         Creates an in-memory object tree representing the email message, which | 
					
						
							|  |  |  |  |         can then be manipulated and turned over to a Generator to return the | 
					
						
							|  |  |  |  |         textual representation of the message. | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |         The string must be formatted as a block of RFC 2822 headers and header | 
					
						
							|  |  |  |  |         continuation lines, optionally preceeded by a `Unix-from' header.  The | 
					
						
							|  |  |  |  |         header block is terminated either by the end of the string or by a | 
					
						
							|  |  |  |  |         blank line. | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |         _class is the class to instantiate for new message objects when they | 
					
						
							|  |  |  |  |         must be created.  This class must have a constructor that can take | 
					
						
							|  |  |  |  |         zero arguments.  Default is Message.Message. | 
					
						
							| 
									
										
										
										
											2002-07-09 02:50:02 +00:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  |         Optional strict tells the parser to be strictly RFC compliant or to be | 
					
						
							|  |  |  |  |         more forgiving in parsing of ill-formatted MIME documents.  When | 
					
						
							|  |  |  |  |         non-strict mode is used, the parser will try to make up for missing or | 
					
						
							|  |  |  |  |         erroneous boundaries and other peculiarities seen in the wild. | 
					
						
							| 
									
										
										
										
											2002-07-19 22:25:34 +00:00
										 |  |  |  |         Default is non-strict parsing. | 
					
						
							| 
									
										
										
										
											2001-09-23 03:17:28 +00:00
										 |  |  |  |         """
 | 
					
						
							|  |  |  |  |         self._class = _class | 
					
						
							| 
									
										
										
										
											2002-07-09 02:50:02 +00:00
										 |  |  |  |         self._strict = strict | 
					
						
							| 
									
										
										
										
											2001-09-23 03:17:28 +00:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2002-07-09 02:50:02 +00:00
										 |  |  |  |     def parse(self, fp, headersonly=0): | 
					
						
							| 
									
										
										
										
											2001-09-23 03:17:28 +00:00
										 |  |  |  |         root = self._class() | 
					
						
							|  |  |  |  |         self._parseheaders(root, fp) | 
					
						
							| 
									
										
										
										
											2002-07-09 02:50:02 +00:00
										 |  |  |  |         if not headersonly: | 
					
						
							|  |  |  |  |             self._parsebody(root, fp) | 
					
						
							| 
									
										
										
										
											2001-09-23 03:17:28 +00:00
										 |  |  |  |         return root | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2002-07-09 02:50:02 +00:00
										 |  |  |  |     def parsestr(self, text, headersonly=0): | 
					
						
							|  |  |  |  |         return self.parse(StringIO(text), headersonly=headersonly) | 
					
						
							| 
									
										
										
										
											2001-09-23 03:17:28 +00:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  |     def _parseheaders(self, container, fp): | 
					
						
							|  |  |  |  |         # Parse the headers, returning a list of header/value pairs.  None as | 
					
						
							|  |  |  |  |         # the header means the Unix-From header. | 
					
						
							|  |  |  |  |         lastheader = '' | 
					
						
							|  |  |  |  |         lastvalue = [] | 
					
						
							|  |  |  |  |         lineno = 0 | 
					
						
							|  |  |  |  |         while 1: | 
					
						
							| 
									
										
										
										
											2002-04-10 21:01:31 +00:00
										 |  |  |  |             # Don't strip the line before we test for the end condition, | 
					
						
							|  |  |  |  |             # because whitespace-only header lines are RFC compliant | 
					
						
							|  |  |  |  |             # continuation lines. | 
					
						
							|  |  |  |  |             line = fp.readline() | 
					
						
							|  |  |  |  |             if not line: | 
					
						
							| 
									
										
										
										
											2001-09-23 03:17:28 +00:00
										 |  |  |  |                 break | 
					
						
							| 
									
										
										
										
											2002-04-10 21:01:31 +00:00
										 |  |  |  |             line = line.splitlines()[0] | 
					
						
							|  |  |  |  |             if not line: | 
					
						
							|  |  |  |  |                 break | 
					
						
							|  |  |  |  |             # Ignore the trailing newline | 
					
						
							| 
									
										
										
										
											2001-09-23 03:17:28 +00:00
										 |  |  |  |             lineno += 1 | 
					
						
							|  |  |  |  |             # Check for initial Unix From_ line | 
					
						
							|  |  |  |  |             if line.startswith('From '): | 
					
						
							|  |  |  |  |                 if lineno == 1: | 
					
						
							|  |  |  |  |                     container.set_unixfrom(line) | 
					
						
							|  |  |  |  |                     continue | 
					
						
							| 
									
										
										
										
											2002-07-09 02:50:02 +00:00
										 |  |  |  |                 elif self._strict: | 
					
						
							| 
									
										
										
										
											2001-09-23 03:17:28 +00:00
										 |  |  |  |                     raise Errors.HeaderParseError( | 
					
						
							|  |  |  |  |                         'Unix-from in headers after first rfc822 header') | 
					
						
							| 
									
										
										
										
											2002-07-09 02:50:02 +00:00
										 |  |  |  |                 else: | 
					
						
							|  |  |  |  |                     # ignore the wierdly placed From_ line | 
					
						
							|  |  |  |  |                     # XXX: maybe set unixfrom anyway? or only if not already? | 
					
						
							|  |  |  |  |                     continue | 
					
						
							| 
									
										
										
										
											2001-09-23 03:17:28 +00:00
										 |  |  |  |             # Header continuation line | 
					
						
							|  |  |  |  |             if line[0] in ' \t': | 
					
						
							|  |  |  |  |                 if not lastheader: | 
					
						
							|  |  |  |  |                     raise Errors.HeaderParseError( | 
					
						
							|  |  |  |  |                         'Continuation line seen before first header') | 
					
						
							|  |  |  |  |                 lastvalue.append(line) | 
					
						
							|  |  |  |  |                 continue | 
					
						
							|  |  |  |  |             # Normal, non-continuation header.  BAW: this should check to make | 
					
						
							|  |  |  |  |             # sure it's a legal header, e.g. doesn't contain spaces.  Also, we | 
					
						
							|  |  |  |  |             # should expose the header matching algorithm in the API, and | 
					
						
							|  |  |  |  |             # allow for a non-strict parsing mode (that ignores the line | 
					
						
							|  |  |  |  |             # instead of raising the exception). | 
					
						
							|  |  |  |  |             i = line.find(':') | 
					
						
							|  |  |  |  |             if i < 0: | 
					
						
							| 
									
										
										
										
											2002-07-09 02:50:02 +00:00
										 |  |  |  |                 if self._strict: | 
					
						
							|  |  |  |  |                     raise Errors.HeaderParseError( | 
					
						
							|  |  |  |  |                         "Not a header, not a continuation: ``%s''"%line) | 
					
						
							|  |  |  |  |                 elif lineno == 1 and line.startswith('--'): | 
					
						
							|  |  |  |  |                     # allow through duplicate boundary tags. | 
					
						
							|  |  |  |  |                     continue | 
					
						
							|  |  |  |  |                 else: | 
					
						
							|  |  |  |  |                     raise Errors.HeaderParseError( | 
					
						
							|  |  |  |  |                         "Not a header, not a continuation: ``%s''"%line) | 
					
						
							| 
									
										
										
										
											2001-09-23 03:17:28 +00:00
										 |  |  |  |             if lastheader: | 
					
						
							|  |  |  |  |                 container[lastheader] = NL.join(lastvalue) | 
					
						
							|  |  |  |  |             lastheader = line[:i] | 
					
						
							|  |  |  |  |             lastvalue = [line[i+1:].lstrip()] | 
					
						
							|  |  |  |  |         # Make sure we retain the last header | 
					
						
							|  |  |  |  |         if lastheader: | 
					
						
							|  |  |  |  |             container[lastheader] = NL.join(lastvalue) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     def _parsebody(self, container, fp): | 
					
						
							|  |  |  |  |         # Parse the body, but first split the payload on the content-type | 
					
						
							|  |  |  |  |         # boundary if present. | 
					
						
							| 
									
										
										
										
											2001-09-26 05:44:09 +00:00
										 |  |  |  |         boundary = container.get_boundary() | 
					
						
							|  |  |  |  |         isdigest = (container.get_type() == 'multipart/digest') | 
					
						
							| 
									
										
										
										
											2001-09-23 03:17:28 +00:00
										 |  |  |  |         # If there's a boundary, split the payload text into its constituent | 
					
						
							|  |  |  |  |         # parts and parse each separately.  Otherwise, just parse the rest of | 
					
						
							|  |  |  |  |         # the body as a single message.  Note: any exceptions raised in the | 
					
						
							|  |  |  |  |         # recursive parse need to have their line numbers coerced. | 
					
						
							|  |  |  |  |         if boundary: | 
					
						
							|  |  |  |  |             preamble = epilogue = None | 
					
						
							|  |  |  |  |             # Split into subparts.  The first boundary we're looking for won't | 
					
						
							| 
									
										
										
										
											2002-07-18 23:09:09 +00:00
										 |  |  |  |             # always have a leading newline since we're at the start of the | 
					
						
							|  |  |  |  |             # body text, and there's not always a preamble before the first | 
					
						
							|  |  |  |  |             # boundary. | 
					
						
							| 
									
										
										
										
											2001-09-23 03:17:28 +00:00
										 |  |  |  |             separator = '--' + boundary | 
					
						
							|  |  |  |  |             payload = fp.read() | 
					
						
							| 
									
										
										
										
											2002-07-18 23:09:09 +00:00
										 |  |  |  |             # We use an RE here because boundaries can have trailing  | 
					
						
							|  |  |  |  |             # whitespace. | 
					
						
							|  |  |  |  |             mo = re.search( | 
					
						
							|  |  |  |  |                 r'(?P<sep>' + re.escape(separator) + r')(?P<ws>[ \t]*)', | 
					
						
							|  |  |  |  |                 payload) | 
					
						
							|  |  |  |  |             if not mo: | 
					
						
							| 
									
										
										
										
											2001-09-23 03:17:28 +00:00
										 |  |  |  |                 raise Errors.BoundaryError( | 
					
						
							|  |  |  |  |                     "Couldn't find starting boundary: %s" % boundary) | 
					
						
							| 
									
										
										
										
											2002-07-18 23:09:09 +00:00
										 |  |  |  |             start = mo.start() | 
					
						
							| 
									
										
										
										
											2001-09-23 03:17:28 +00:00
										 |  |  |  |             if start > 0: | 
					
						
							|  |  |  |  |                 # there's some pre-MIME boundary preamble | 
					
						
							|  |  |  |  |                 preamble = payload[0:start] | 
					
						
							| 
									
										
										
										
											2002-05-19 23:51:50 +00:00
										 |  |  |  |             # Find out what kind of line endings we're using | 
					
						
							| 
									
										
										
										
											2002-07-18 23:09:09 +00:00
										 |  |  |  |             start += len(mo.group('sep')) + len(mo.group('ws')) | 
					
						
							| 
									
										
										
										
											2002-05-19 23:51:50 +00:00
										 |  |  |  |             cre = re.compile('\r\n|\r|\n') | 
					
						
							|  |  |  |  |             mo = cre.search(payload, start) | 
					
						
							|  |  |  |  |             if mo: | 
					
						
							| 
									
										
										
										
											2002-07-09 02:50:02 +00:00
										 |  |  |  |                 start += len(mo.group(0)) | 
					
						
							| 
									
										
										
										
											2002-05-19 23:51:50 +00:00
										 |  |  |  |             # We create a compiled regexp first because we need to be able to | 
					
						
							|  |  |  |  |             # specify the start position, and the module function doesn't | 
					
						
							|  |  |  |  |             # support this signature. :( | 
					
						
							|  |  |  |  |             cre = re.compile('(?P<sep>\r\n|\r|\n)' + | 
					
						
							|  |  |  |  |                              re.escape(separator) + '--') | 
					
						
							|  |  |  |  |             mo = cre.search(payload, start) | 
					
						
							| 
									
										
										
										
											2002-07-09 02:50:02 +00:00
										 |  |  |  |             if mo: | 
					
						
							|  |  |  |  |                 terminator = mo.start() | 
					
						
							|  |  |  |  |                 linesep = mo.group('sep') | 
					
						
							|  |  |  |  |                 if mo.end() < len(payload): | 
					
						
							| 
									
										
										
										
											2002-07-18 23:09:09 +00:00
										 |  |  |  |                     # There's some post-MIME boundary epilogue | 
					
						
							| 
									
										
										
										
											2002-07-09 02:50:02 +00:00
										 |  |  |  |                     epilogue = payload[mo.end():] | 
					
						
							|  |  |  |  |             elif self._strict: | 
					
						
							| 
									
										
										
										
											2001-09-23 03:17:28 +00:00
										 |  |  |  |                 raise Errors.BoundaryError( | 
					
						
							| 
									
										
										
										
											2002-07-09 02:50:02 +00:00
										 |  |  |  |                         "Couldn't find terminating boundary: %s" % boundary) | 
					
						
							|  |  |  |  |             else: | 
					
						
							| 
									
										
										
										
											2002-07-18 23:09:09 +00:00
										 |  |  |  |                 # Handle the case of no trailing boundary.  Check that it ends | 
					
						
							|  |  |  |  |                 # in a blank line.  Some cases (spamspamspam) don't even have | 
					
						
							|  |  |  |  |                 # that! | 
					
						
							|  |  |  |  |                 mo = re.search('(?P<sep>\r\n|\r|\n){2}$', payload) | 
					
						
							| 
									
										
										
										
											2002-07-09 02:50:02 +00:00
										 |  |  |  |                 if not mo: | 
					
						
							| 
									
										
										
										
											2002-07-18 23:09:09 +00:00
										 |  |  |  |                     mo = re.search('(?P<sep>\r\n|\r|\n)$', payload) | 
					
						
							|  |  |  |  |                     if not mo: | 
					
						
							|  |  |  |  |                         raise Errors.BoundaryError( | 
					
						
							|  |  |  |  |                           'No terminating boundary and no trailing empty line') | 
					
						
							|  |  |  |  |                 linesep = mo.group('sep') | 
					
						
							|  |  |  |  |                 terminator = len(payload) | 
					
						
							| 
									
										
										
										
											2001-09-23 03:17:28 +00:00
										 |  |  |  |             # We split the textual payload on the boundary separator, which | 
					
						
							| 
									
										
										
										
											2002-07-09 02:50:02 +00:00
										 |  |  |  |             # includes the trailing newline. If the container is a | 
					
						
							|  |  |  |  |             # multipart/digest then the subparts are by default message/rfc822  | 
					
						
							|  |  |  |  |             # instead of text/plain.  In that case, they'll have a optional  | 
					
						
							|  |  |  |  |             # block of MIME headers, then an empty line followed by the  | 
					
						
							|  |  |  |  |             # message headers. | 
					
						
							| 
									
										
										
										
											2002-07-18 23:09:09 +00:00
										 |  |  |  |             parts = re.split( | 
					
						
							|  |  |  |  |                 linesep + re.escape(separator) + r'[ \t]*' + linesep, | 
					
						
							|  |  |  |  |                 payload[start:terminator]) | 
					
						
							| 
									
										
										
										
											2001-09-23 03:17:28 +00:00
										 |  |  |  |             for part in parts: | 
					
						
							| 
									
										
										
										
											2002-07-09 02:50:02 +00:00
										 |  |  |  |                 if isdigest:  | 
					
						
							|  |  |  |  |                     if part[0] == linesep: | 
					
						
							|  |  |  |  |                         # There's no header block so create an empty message | 
					
						
							|  |  |  |  |                         # object as the container, and lop off the newline so | 
					
						
							|  |  |  |  |                         # we can parse the sub-subobject | 
					
						
							|  |  |  |  |                         msgobj = self._class() | 
					
						
							|  |  |  |  |                         part = part[1:] | 
					
						
							|  |  |  |  |                     else: | 
					
						
							|  |  |  |  |                         parthdrs, part = part.split(linesep+linesep, 1) | 
					
						
							|  |  |  |  |                         # msgobj in this case is the "message/rfc822" container | 
					
						
							|  |  |  |  |                         msgobj = self.parsestr(parthdrs, headersonly=1) | 
					
						
							|  |  |  |  |                     # while submsgobj is the message itself | 
					
						
							|  |  |  |  |                     submsgobj = self.parsestr(part) | 
					
						
							|  |  |  |  |                     msgobj.attach(submsgobj) | 
					
						
							|  |  |  |  |                     msgobj.set_default_type('message/rfc822') | 
					
						
							|  |  |  |  |                 else: | 
					
						
							|  |  |  |  |                     msgobj = self.parsestr(part) | 
					
						
							| 
									
										
										
										
											2001-09-23 03:17:28 +00:00
										 |  |  |  |                 container.preamble = preamble | 
					
						
							|  |  |  |  |                 container.epilogue = epilogue | 
					
						
							| 
									
										
										
										
											2002-04-10 21:01:31 +00:00
										 |  |  |  |                 container.attach(msgobj) | 
					
						
							|  |  |  |  |         elif container.get_main_type() == 'multipart': | 
					
						
							|  |  |  |  |             # Very bad.  A message is a multipart with no boundary! | 
					
						
							|  |  |  |  |             raise Errors.BoundaryError( | 
					
						
							|  |  |  |  |                 'multipart message with no defined boundary') | 
					
						
							| 
									
										
										
										
											2001-09-26 05:44:09 +00:00
										 |  |  |  |         elif container.get_type() == 'message/delivery-status': | 
					
						
							|  |  |  |  |             # This special kind of type contains blocks of headers separated | 
					
						
							|  |  |  |  |             # by a blank line.  We'll represent each header block as a | 
					
						
							|  |  |  |  |             # separate Message object | 
					
						
							|  |  |  |  |             blocks = [] | 
					
						
							|  |  |  |  |             while 1: | 
					
						
							|  |  |  |  |                 blockmsg = self._class() | 
					
						
							|  |  |  |  |                 self._parseheaders(blockmsg, fp) | 
					
						
							|  |  |  |  |                 if not len(blockmsg): | 
					
						
							|  |  |  |  |                     # No more header blocks left | 
					
						
							|  |  |  |  |                     break | 
					
						
							|  |  |  |  |                 blocks.append(blockmsg) | 
					
						
							|  |  |  |  |             container.set_payload(blocks) | 
					
						
							|  |  |  |  |         elif container.get_main_type() == 'message': | 
					
						
							| 
									
										
										
										
											2001-09-23 03:17:28 +00:00
										 |  |  |  |             # Create a container for the payload, but watch out for there not | 
					
						
							|  |  |  |  |             # being any headers left | 
					
						
							|  |  |  |  |             try: | 
					
						
							|  |  |  |  |                 msg = self.parse(fp) | 
					
						
							|  |  |  |  |             except Errors.HeaderParseError: | 
					
						
							|  |  |  |  |                 msg = self._class() | 
					
						
							|  |  |  |  |                 self._parsebody(msg, fp) | 
					
						
							| 
									
										
										
										
											2002-06-02 19:12:03 +00:00
										 |  |  |  |             container.attach(msg) | 
					
						
							| 
									
										
										
										
											2001-09-23 03:17:28 +00:00
										 |  |  |  |         else: | 
					
						
							| 
									
										
										
										
											2002-04-10 21:01:31 +00:00
										 |  |  |  |             container.set_payload(fp.read()) | 
					
						
							| 
									
										
										
										
											2001-10-11 15:43:00 +00:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |  | 
					
						
							|  |  |  |  | class HeaderParser(Parser): | 
					
						
							|  |  |  |  |     """A subclass of Parser, this one only meaningfully parses message headers.
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     This class can be used if all you're interested in is the headers of a | 
					
						
							|  |  |  |  |     message.  While it consumes the message body, it does not parse it, but | 
					
						
							|  |  |  |  |     simply makes it available as a string payload. | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     Parsing with this subclass can be considerably faster if all you're | 
					
						
							|  |  |  |  |     interested in is the message headers. | 
					
						
							|  |  |  |  |     """
 | 
					
						
							|  |  |  |  |     def _parsebody(self, container, fp): | 
					
						
							|  |  |  |  |         # Consume but do not parse, the body | 
					
						
							|  |  |  |  |         container.set_payload(fp.read()) |