| 
									
										
										
										
											2002-01-27 06:48:02 +00:00
										 |  |  |  | # Copyright (C) 2001,2002 Python Software Foundation | 
					
						
							| 
									
										
										
										
											2001-09-23 03:17:28 +00:00
										 |  |  |  | # Author: barry@zope.com (Barry Warsaw) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | """A parser of RFC 2822 and MIME email messages.
 | 
					
						
							|  |  |  |  | """
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | from cStringIO import StringIO | 
					
						
							| 
									
										
										
										
											2002-01-27 06:48:02 +00:00
										 |  |  |  | from types import ListType | 
					
						
							| 
									
										
										
										
											2001-09-23 03:17:28 +00:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  | # Intrapackage imports | 
					
						
							|  |  |  |  | import Errors | 
					
						
							|  |  |  |  | import Message | 
					
						
							|  |  |  |  | 
 | 
					
						
# Empty-string constant; not referenced anywhere in the visible part of this
# module.
EMPTYSTRING = ''
# Joiner used by Parser._parseheaders to glue a header's first line and its
# continuation lines back together into a single value.
NL = '\n'
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2001-10-04 17:05:11 +00:00
										 |  |  |  |  | 
					
						
							| 
									
										
										
										
											2001-09-23 03:17:28 +00:00
										 |  |  |  | class Parser: | 
					
						
							|  |  |  |  |     def __init__(self, _class=Message.Message): | 
					
						
							|  |  |  |  |         """Parser of RFC 2822 and MIME email messages.
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |         Creates an in-memory object tree representing the email message, which | 
					
						
							|  |  |  |  |         can then be manipulated and turned over to a Generator to return the | 
					
						
							|  |  |  |  |         textual representation of the message. | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |         The string must be formatted as a block of RFC 2822 headers and header | 
					
						
							|  |  |  |  |         continuation lines, optionally preceeded by a `Unix-from' header.  The | 
					
						
							|  |  |  |  |         header block is terminated either by the end of the string or by a | 
					
						
							|  |  |  |  |         blank line. | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |         _class is the class to instantiate for new message objects when they | 
					
						
							|  |  |  |  |         must be created.  This class must have a constructor that can take | 
					
						
							|  |  |  |  |         zero arguments.  Default is Message.Message. | 
					
						
							|  |  |  |  |         """
 | 
					
						
							|  |  |  |  |         self._class = _class | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     def parse(self, fp): | 
					
						
							|  |  |  |  |         root = self._class() | 
					
						
							|  |  |  |  |         self._parseheaders(root, fp) | 
					
						
							|  |  |  |  |         self._parsebody(root, fp) | 
					
						
							|  |  |  |  |         return root | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     def parsestr(self, text): | 
					
						
							|  |  |  |  |         return self.parse(StringIO(text)) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     def _parseheaders(self, container, fp): | 
					
						
							|  |  |  |  |         # Parse the headers, returning a list of header/value pairs.  None as | 
					
						
							|  |  |  |  |         # the header means the Unix-From header. | 
					
						
							|  |  |  |  |         lastheader = '' | 
					
						
							|  |  |  |  |         lastvalue = [] | 
					
						
							|  |  |  |  |         lineno = 0 | 
					
						
							|  |  |  |  |         while 1: | 
					
						
							| 
									
										
										
										
											2002-04-10 21:01:31 +00:00
										 |  |  |  |             # Don't strip the line before we test for the end condition, | 
					
						
							|  |  |  |  |             # because whitespace-only header lines are RFC compliant | 
					
						
							|  |  |  |  |             # continuation lines. | 
					
						
							|  |  |  |  |             line = fp.readline() | 
					
						
							|  |  |  |  |             if not line: | 
					
						
							| 
									
										
										
										
											2001-09-23 03:17:28 +00:00
										 |  |  |  |                 break | 
					
						
							| 
									
										
										
										
											2002-04-10 21:01:31 +00:00
										 |  |  |  |             line = line.splitlines()[0] | 
					
						
							|  |  |  |  |             if not line: | 
					
						
							|  |  |  |  |                 break | 
					
						
							|  |  |  |  |             # Ignore the trailing newline | 
					
						
							| 
									
										
										
										
											2001-09-23 03:17:28 +00:00
										 |  |  |  |             lineno += 1 | 
					
						
							|  |  |  |  |             # Check for initial Unix From_ line | 
					
						
							|  |  |  |  |             if line.startswith('From '): | 
					
						
							|  |  |  |  |                 if lineno == 1: | 
					
						
							|  |  |  |  |                     container.set_unixfrom(line) | 
					
						
							|  |  |  |  |                     continue | 
					
						
							|  |  |  |  |                 else: | 
					
						
							|  |  |  |  |                     raise Errors.HeaderParseError( | 
					
						
							|  |  |  |  |                         'Unix-from in headers after first rfc822 header') | 
					
						
							|  |  |  |  |             # Header continuation line | 
					
						
							|  |  |  |  |             if line[0] in ' \t': | 
					
						
							|  |  |  |  |                 if not lastheader: | 
					
						
							|  |  |  |  |                     raise Errors.HeaderParseError( | 
					
						
							|  |  |  |  |                         'Continuation line seen before first header') | 
					
						
							|  |  |  |  |                 lastvalue.append(line) | 
					
						
							|  |  |  |  |                 continue | 
					
						
							|  |  |  |  |             # Normal, non-continuation header.  BAW: this should check to make | 
					
						
							|  |  |  |  |             # sure it's a legal header, e.g. doesn't contain spaces.  Also, we | 
					
						
							|  |  |  |  |             # should expose the header matching algorithm in the API, and | 
					
						
							|  |  |  |  |             # allow for a non-strict parsing mode (that ignores the line | 
					
						
							|  |  |  |  |             # instead of raising the exception). | 
					
						
							|  |  |  |  |             i = line.find(':') | 
					
						
							|  |  |  |  |             if i < 0: | 
					
						
							|  |  |  |  |                 raise Errors.HeaderParseError( | 
					
						
							|  |  |  |  |                     'Not a header, not a continuation') | 
					
						
							|  |  |  |  |             if lastheader: | 
					
						
							|  |  |  |  |                 container[lastheader] = NL.join(lastvalue) | 
					
						
							|  |  |  |  |             lastheader = line[:i] | 
					
						
							|  |  |  |  |             lastvalue = [line[i+1:].lstrip()] | 
					
						
							|  |  |  |  |         # Make sure we retain the last header | 
					
						
							|  |  |  |  |         if lastheader: | 
					
						
							|  |  |  |  |             container[lastheader] = NL.join(lastvalue) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     def _parsebody(self, container, fp): | 
					
						
							|  |  |  |  |         # Parse the body, but first split the payload on the content-type | 
					
						
							|  |  |  |  |         # boundary if present. | 
					
						
							| 
									
										
										
										
											2001-09-26 05:44:09 +00:00
										 |  |  |  |         boundary = container.get_boundary() | 
					
						
							|  |  |  |  |         isdigest = (container.get_type() == 'multipart/digest') | 
					
						
							| 
									
										
										
										
											2001-09-23 03:17:28 +00:00
										 |  |  |  |         # If there's a boundary, split the payload text into its constituent | 
					
						
							|  |  |  |  |         # parts and parse each separately.  Otherwise, just parse the rest of | 
					
						
							|  |  |  |  |         # the body as a single message.  Note: any exceptions raised in the | 
					
						
							|  |  |  |  |         # recursive parse need to have their line numbers coerced. | 
					
						
							|  |  |  |  |         if boundary: | 
					
						
							|  |  |  |  |             preamble = epilogue = None | 
					
						
							|  |  |  |  |             # Split into subparts.  The first boundary we're looking for won't | 
					
						
							|  |  |  |  |             # have the leading newline since we're at the start of the body | 
					
						
							|  |  |  |  |             # text. | 
					
						
							|  |  |  |  |             separator = '--' + boundary | 
					
						
							|  |  |  |  |             payload = fp.read() | 
					
						
							|  |  |  |  |             start = payload.find(separator) | 
					
						
							|  |  |  |  |             if start < 0: | 
					
						
							|  |  |  |  |                 raise Errors.BoundaryError( | 
					
						
							|  |  |  |  |                     "Couldn't find starting boundary: %s" % boundary) | 
					
						
							|  |  |  |  |             if start > 0: | 
					
						
							|  |  |  |  |                 # there's some pre-MIME boundary preamble | 
					
						
							|  |  |  |  |                 preamble = payload[0:start] | 
					
						
							|  |  |  |  |             start += len(separator) + 1 + isdigest | 
					
						
							|  |  |  |  |             terminator = payload.find('\n' + separator + '--', start) | 
					
						
							|  |  |  |  |             if terminator < 0: | 
					
						
							|  |  |  |  |                 raise Errors.BoundaryError( | 
					
						
							|  |  |  |  |                     "Couldn't find terminating boundary: %s" % boundary) | 
					
						
							|  |  |  |  |             if terminator+len(separator)+3 < len(payload): | 
					
						
							|  |  |  |  |                 # there's some post-MIME boundary epilogue | 
					
						
							|  |  |  |  |                 epilogue = payload[terminator+len(separator)+3:] | 
					
						
							|  |  |  |  |             # We split the textual payload on the boundary separator, which | 
					
						
							|  |  |  |  |             # includes the trailing newline.  If the container is a | 
					
						
							|  |  |  |  |             # multipart/digest then the subparts are by default message/rfc822 | 
					
						
							|  |  |  |  |             # instead of text/plain.  In that case, they'll have an extra | 
					
						
							|  |  |  |  |             # newline before the headers to distinguish the message's headers | 
					
						
							|  |  |  |  |             # from the subpart headers. | 
					
						
							|  |  |  |  |             if isdigest: | 
					
						
							|  |  |  |  |                 separator += '\n\n' | 
					
						
							|  |  |  |  |             else: | 
					
						
							|  |  |  |  |                 separator += '\n' | 
					
						
							|  |  |  |  |             parts = payload[start:terminator].split('\n' + separator) | 
					
						
							|  |  |  |  |             for part in parts: | 
					
						
							|  |  |  |  |                 msgobj = self.parsestr(part) | 
					
						
							|  |  |  |  |                 container.preamble = preamble | 
					
						
							|  |  |  |  |                 container.epilogue = epilogue | 
					
						
							| 
									
										
										
										
											2002-04-10 21:01:31 +00:00
										 |  |  |  |                 container.attach(msgobj) | 
					
						
							|  |  |  |  |         elif container.get_main_type() == 'multipart': | 
					
						
							|  |  |  |  |             # Very bad.  A message is a multipart with no boundary! | 
					
						
							|  |  |  |  |             raise Errors.BoundaryError( | 
					
						
							|  |  |  |  |                 'multipart message with no defined boundary') | 
					
						
							| 
									
										
										
										
											2001-09-26 05:44:09 +00:00
										 |  |  |  |         elif container.get_type() == 'message/delivery-status': | 
					
						
							|  |  |  |  |             # This special kind of type contains blocks of headers separated | 
					
						
							|  |  |  |  |             # by a blank line.  We'll represent each header block as a | 
					
						
							|  |  |  |  |             # separate Message object | 
					
						
							|  |  |  |  |             blocks = [] | 
					
						
							|  |  |  |  |             while 1: | 
					
						
							|  |  |  |  |                 blockmsg = self._class() | 
					
						
							|  |  |  |  |                 self._parseheaders(blockmsg, fp) | 
					
						
							|  |  |  |  |                 if not len(blockmsg): | 
					
						
							|  |  |  |  |                     # No more header blocks left | 
					
						
							|  |  |  |  |                     break | 
					
						
							|  |  |  |  |                 blocks.append(blockmsg) | 
					
						
							|  |  |  |  |             container.set_payload(blocks) | 
					
						
							|  |  |  |  |         elif container.get_main_type() == 'message': | 
					
						
							| 
									
										
										
										
											2001-09-23 03:17:28 +00:00
										 |  |  |  |             # Create a container for the payload, but watch out for there not | 
					
						
							|  |  |  |  |             # being any headers left | 
					
						
							|  |  |  |  |             try: | 
					
						
							|  |  |  |  |                 msg = self.parse(fp) | 
					
						
							|  |  |  |  |             except Errors.HeaderParseError: | 
					
						
							|  |  |  |  |                 msg = self._class() | 
					
						
							|  |  |  |  |                 self._parsebody(msg, fp) | 
					
						
							| 
									
										
										
										
											2002-04-10 21:01:31 +00:00
										 |  |  |  |             container.set_payload(msg) | 
					
						
							| 
									
										
										
										
											2001-09-23 03:17:28 +00:00
										 |  |  |  |         else: | 
					
						
							| 
									
										
										
										
											2002-04-10 21:01:31 +00:00
										 |  |  |  |             container.set_payload(fp.read()) | 
					
						
							| 
									
										
										
										
											2001-10-11 15:43:00 +00:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |  | 
					
						
							|  |  |  |  | class HeaderParser(Parser): | 
					
						
							|  |  |  |  |     """A subclass of Parser, this one only meaningfully parses message headers.
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     This class can be used if all you're interested in is the headers of a | 
					
						
							|  |  |  |  |     message.  While it consumes the message body, it does not parse it, but | 
					
						
							|  |  |  |  |     simply makes it available as a string payload. | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     Parsing with this subclass can be considerably faster if all you're | 
					
						
							|  |  |  |  |     interested in is the message headers. | 
					
						
							|  |  |  |  |     """
 | 
					
						
							|  |  |  |  |     def _parsebody(self, container, fp): | 
					
						
							|  |  |  |  |         # Consume but do not parse, the body | 
					
						
							|  |  |  |  |         container.set_payload(fp.read()) |