mirror of
				https://github.com/python/cpython.git
				synced 2025-10-25 18:54:53 +00:00 
			
		
		
		
	 da2525ed2a
			
		
	
	
		da2525ed2a
		
	
	
	
	
		
			
			where in lax parsing, the first non-header line after a header block (i.e. the first line not containing a colon, and not a continuation), can be treated as the first body line, even without the RFC-mandated blank line separator. rfc822 had this behavior, and I vaguely remember problems with this, but can't remember details. In any event, all the tests still pass, so I guess we'll find out. ;/ This patch works by returning the non-header, non-continuation line from _parseheaders() and using that as the first body line prepended to fp.read() if given. It's usually None. We use this approach instead of trying to seek/tell the file-like object.
		
			
				
	
	
		
			292 lines
		
	
	
	
		
			12 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			292 lines
		
	
	
	
		
			12 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| # Copyright (C) 2001,2002 Python Software Foundation
 | ||
| # Author: barry@zope.com (Barry Warsaw)
 | ||
| 
 | ||
| """A parser of RFC 2822 and MIME email messages.
 | ||
| """
 | ||
| 
 | ||
| import re
 | ||
| from cStringIO import StringIO
 | ||
| from types import ListType
 | ||
| 
 | ||
| from email import Errors
 | ||
| from email import Message
 | ||
| 
 | ||
# Shared string constants.
EMPTYSTRING = ''
NL = '\n'

# Provide True and False when the running interpreter does not define them
# as builtins (the bare name lookup raises NameError in that case).
try:
    True, False
except NameError:
    True = 1
    False = 0

# Matches one line ending: CRLF, a bare CR, or a bare LF.
nlcre = re.compile('\r\n|\r|\n')
 | ||
| 
 | ||
| 
 | ||
| 
 | ||
class Parser:
    def __init__(self, _class=Message.Message, strict=False):
        """Parser of RFC 2822 and MIME email messages.

        Creates an in-memory object tree representing the email message, which
        can then be manipulated and turned over to a Generator to return the
        textual representation of the message.

        The string must be formatted as a block of RFC 2822 headers and header
        continuation lines, optionally preceded by a `Unix-from' header.  The
        header block is terminated either by the end of the string or by a
        blank line.

        _class is the class to instantiate for new message objects when they
        must be created.  This class must have a constructor that can take
        zero arguments.  Default is Message.Message.

        Optional strict tells the parser to be strictly RFC compliant or to be
        more forgiving in parsing of ill-formatted MIME documents.  When
        non-strict mode is used, the parser will try to make up for missing or
        erroneous boundaries and other peculiarities seen in the wild.
        Default is non-strict parsing.
        """
        self._class = _class
        self._strict = strict

    def parse(self, fp, headersonly=False):
        """Create a message structure from the data in a file.

        Reads all the data from the file and returns the root of the message
        structure.  Optional headersonly is a flag specifying whether to stop
        parsing after reading the headers or not.  The default is False,
        meaning it parses the entire contents of the file.
        """
        root = self._class()
        # In non-strict mode the header parser may have consumed the first
        # line of the body; it hands that line back so the body parser can
        # restore it.
        firstbodyline = self._parseheaders(root, fp)
        if not headersonly:
            self._parsebody(root, fp, firstbodyline)
        return root

    def parsestr(self, text, headersonly=False):
        """Create a message structure from a string.

        Returns the root of the message structure.  Optional headersonly is a
        flag specifying whether to stop parsing after reading the headers or
        not.  The default is False, meaning it parses the entire contents of
        the file.
        """
        return self.parse(StringIO(text), headersonly=headersonly)

    def _readrest(self, fp, firstbodyline=None):
        # Return everything left in fp.  If the header parser consumed the
        # first body line (firstbodyline is not None), put it back in front,
        # terminated by a newline.
        text = fp.read()
        if firstbodyline is not None:
            text = firstbodyline + '\n' + text
        return text

    def _parseheaders(self, container, fp):
        # Read the header block from fp, storing each header on container and
        # recording an initial Unix-From line via container.set_unixfrom().
        #
        # Returns None when the block ended at a blank line or end of input.
        # In non-strict mode, a line that is neither a header nor a
        # continuation also ends the block; that line has already been
        # consumed from fp, so it is returned (without its line ending) for
        # the caller to treat as the first body line.
        #
        # Raises Errors.HeaderParseError for a continuation line before any
        # header and, in strict mode, for a misplaced Unix-From line or a
        # non-header line.
        lastheader = ''
        lastvalue = []
        lineno = 0
        firstbodyline = None
        while True:
            # Don't strip the line before we test for the end condition,
            # because whitespace-only header lines are RFC compliant
            # continuation lines.
            line = fp.readline()
            if not line:
                break
            # Ignore the trailing newline
            line = line.splitlines()[0]
            if not line:
                break
            lineno += 1
            # Check for initial Unix From_ line
            if line.startswith('From '):
                if lineno == 1:
                    container.set_unixfrom(line)
                    continue
                elif self._strict:
                    raise Errors.HeaderParseError(
                        'Unix-from in headers after first rfc822 header')
                else:
                    # ignore the weirdly placed From_ line
                    # XXX: maybe set unixfrom anyway? or only if not already?
                    continue
            # Header continuation line
            if line[0] in ' \t':
                if not lastheader:
                    raise Errors.HeaderParseError(
                        'Continuation line seen before first header')
                lastvalue.append(line)
                continue
            # Normal, non-continuation header.  BAW: this should check to make
            # sure it's a legal header, e.g. doesn't contain spaces.  Also, we
            # should expose the header matching algorithm in the API.
            i = line.find(':')
            if i < 0:
                if self._strict:
                    raise Errors.HeaderParseError(
                        "Not a header, not a continuation: ``%s''" % line)
                elif lineno == 1 and line.startswith('--'):
                    # allow through duplicate boundary tags.
                    continue
                else:
                    # There was no separating blank line as mandated by RFC
                    # 2822, but we're in non-strict mode.  So just offer up
                    # this current line as the first body line.
                    firstbodyline = line
                    break
            # A new header starts here, so flush the one collected so far.
            if lastheader:
                container[lastheader] = NL.join(lastvalue)
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Make sure we retain the last header
        if lastheader:
            container[lastheader] = NL.join(lastvalue)
        return firstbodyline

    def _parsebody(self, container, fp, firstbodyline=None):
        # Parse the rest of fp as the body of container.  firstbodyline, when
        # not None, is a body line that the header parser already consumed
        # from fp.
        #
        # Parse the body, but first split the payload on the content-type
        # boundary if present.
        boundary = container.get_boundary()
        isdigest = (container.get_content_type() == 'multipart/digest')
        # If there's a boundary, split the payload text into its constituent
        # parts and parse each separately.  Otherwise, just parse the rest of
        # the body as a single message.  Note: any exceptions raised in the
        # recursive parse need to have their line numbers coerced.
        if boundary:
            preamble = epilogue = None
            # Split into subparts.  The first boundary we're looking for won't
            # always have a leading newline since we're at the start of the
            # body text, and there's not always a preamble before the first
            # boundary.
            separator = '--' + boundary
            payload = self._readrest(fp, firstbodyline)
            # We use an RE here because boundaries can have trailing
            # whitespace.
            mo = re.search(
                r'(?P<sep>' + re.escape(separator) + r')(?P<ws>[ \t]*)',
                payload)
            if not mo:
                if self._strict:
                    raise Errors.BoundaryError(
                        "Couldn't find starting boundary: %s" % boundary)
                # Non-strict: keep the whole body as an unparsed string.
                container.set_payload(payload)
                return
            start = mo.start()
            if start > 0:
                # there's some pre-MIME boundary preamble
                preamble = payload[0:start]
            # Find out what kind of line endings we're using
            start += len(mo.group('sep')) + len(mo.group('ws'))
            mo = nlcre.search(payload, start)
            if mo:
                start += len(mo.group(0))
            # We create a compiled regexp first because we need to be able to
            # specify the start position, and the module function doesn't
            # support this signature. :(
            cre = re.compile('(?P<sep>\r\n|\r|\n)' +
                             re.escape(separator) + '--')
            mo = cre.search(payload, start)
            if mo:
                terminator = mo.start()
                linesep = mo.group('sep')
                if mo.end() < len(payload):
                    # There's some post-MIME boundary epilogue
                    epilogue = payload[mo.end():]
            elif self._strict:
                raise Errors.BoundaryError(
                        "Couldn't find terminating boundary: %s" % boundary)
            else:
                # Handle the case of no trailing boundary.  Check that it ends
                # in a blank line.  Some cases (spamspamspam) don't even have
                # that!
                mo = re.search('(?P<sep>\r\n|\r|\n){2}$', payload)
                if not mo:
                    mo = re.search('(?P<sep>\r\n|\r|\n)$', payload)
                    if not mo:
                        raise Errors.BoundaryError(
                          'No terminating boundary and no trailing empty line')
                linesep = mo.group('sep')
                terminator = len(payload)
            # We split the textual payload on the boundary separator, which
            # includes the trailing newline. If the container is a
            # multipart/digest then the subparts are by default message/rfc822
            # instead of text/plain.  In that case, they'll have a optional
            # block of MIME headers, then an empty line followed by the
            # message headers.
            parts = re.split(
                linesep + re.escape(separator) + r'[ \t]*' + linesep,
                payload[start:terminator])
            # The preamble and epilogue do not depend on the individual parts,
            # so record them once rather than on every loop iteration.
            container.preamble = preamble
            container.epilogue = epilogue
            for part in parts:
                if isdigest:
                    if part.startswith(linesep):
                        # There's no header block so create an empty message
                        # object as the container, and lop off the newline so
                        # we can parse the sub-subobject
                        msgobj = self._class()
                        part = part[len(linesep):]
                    else:
                        # NOTE: this unpacking raises ValueError when the part
                        # contains no blank line separating its MIME headers
                        # from the enclosed message.
                        parthdrs, part = part.split(linesep+linesep, 1)
                        # msgobj in this case is the "message/rfc822" container
                        msgobj = self.parsestr(parthdrs, headersonly=True)
                    # while submsgobj is the message itself
                    msgobj.set_default_type('message/rfc822')
                    maintype = msgobj.get_content_maintype()
                    if maintype in ('message', 'multipart'):
                        submsgobj = self.parsestr(part)
                        msgobj.attach(submsgobj)
                    else:
                        msgobj.set_payload(part)
                else:
                    msgobj = self.parsestr(part)
                container.attach(msgobj)
        elif container.get_content_maintype() == 'multipart':
            # Very bad.  A message is a multipart with no boundary!
            raise Errors.BoundaryError(
                'multipart message with no defined boundary')
        elif container.get_content_type() == 'message/delivery-status':
            # This special kind of type contains blocks of headers separated
            # by a blank line.  We'll represent each header block as a
            # separate Message object.  Non-header lines (including any
            # firstbodyline handed to us) are not part of any block and are
            # not stored.
            blocks = []
            while True:
                blockmsg = self._class()
                self._parseheaders(blockmsg, fp)
                if not len(blockmsg):
                    # No more header blocks left
                    break
                blocks.append(blockmsg)
            container.set_payload(blocks)
        elif container.get_content_maintype() == 'message':
            # The header parser may already have consumed the first line of
            # the enclosed message.  Put it back in front of the remaining
            # input so the nested parse sees it instead of silently losing it.
            if firstbodyline is not None:
                fp = StringIO(self._readrest(fp, firstbodyline))
            # Create a container for the payload, but watch out for there not
            # being any headers left
            try:
                msg = self.parse(fp)
            except Errors.HeaderParseError:
                msg = self._class()
                self._parsebody(msg, fp)
            container.attach(msg)
        else:
            container.set_payload(self._readrest(fp, firstbodyline))
 | ||
| 
 | ||
| 
 | ||
| 
 | ||
class HeaderParser(Parser):
    """Parser variant that parses only the headers of a message.

    Use this class when the headers are all you care about.  The body is
    still read from the input, but it is never parsed; it is stored on the
    message as a single string payload.

    Because no body parsing takes place, this can be considerably faster
    than Parser when only the headers are of interest.
    """
    def _parsebody(self, container, fp, firstbodyline=None):
        # Read the remaining input without parsing it.  When the header
        # parser handed over a first body line, put it back in front of the
        # text, followed by a newline.
        if firstbodyline is None:
            body = fp.read()
        else:
            body = firstbodyline + '\n' + fp.read()
        container.set_payload(body)
 |