mirror of
				https://github.com/python/cpython.git
				synced 2025-10-30 21:21:22 +00:00 
			
		
		
		
	New parser. Next up, making the current parser use this parser
This commit is contained in:
		
							parent
							
								
									e62c5c88f1
								
							
						
					
					
						commit
						39a0f04421
					
				
					 1 changed files with 362 additions and 0 deletions
				
			
		
							
								
								
									
										362
									
								
								Lib/email/FeedParser.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										362
									
								
								Lib/email/FeedParser.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,362 @@ | |||
| # A new Feed-style Parser | ||||
| 
 | ||||
| from email import Errors, Message | ||||
| import re | ||||
| 
 | ||||
# Matches a single line terminator in any of its three spellings.
NLCRE = re.compile('\r\n|\r|\n')

EMPTYSTRING = ''
NL = '\n'

# Sentinel handed out in place of a line when the buffer is currently empty
# but end() has not been called yet.
NeedMoreData = object()


class FeedableLumpOfText:
    """A file-like line buffer that can have new data loaded into it.

    Text arrives in arbitrary chunks through push().  Complete lines (with
    their terminators kept) are queued and handed out again by readline(),
    peekline(), readuntil() and by iteration.  While the queue is empty and
    end() has not been called, the readers return the NeedMoreData sentinel
    instead of a line; once end() has been called they return ''.
    """

    # Class-level helpers: a matcher for a terminator at the end of a string,
    # and the splitter used by push().  The group in NLCRE_crack makes
    # re.split() keep the terminators in its result.
    NLatend = re.compile('(\r\n|\r|\n)$').match
    NLCRE_crack = re.compile('(\r\n|\r|\n)')

    def __init__(self):
        # Text received after the last complete line.
        self._partial = ''
        # Set by end(); no more data will arrive.
        self._done = False
        # Queued lines in reverse order: the next line to read is the LAST
        # element, so reading and un-reading are pop()/append().
        self._pending = []

    def readline(self):
        """Return the next line.

        Lines pushed back with unreadline() are returned first, most recently
        pushed first.  Returns NeedMoreData when nothing is queued and end()
        has not been called, and '' when nothing is queued after end().
        """
        if not self._pending:
            if self._done:
                return ''
            return NeedMoreData
        return self._pending.pop()

    def unreadline(self, line):
        """Push a line back so that it is the next one read."""
        self._pending.append(line)

    def peekline(self):
        """Return what readline() would return, without consuming it."""
        if not self._pending:
            if self._done:
                return ''
            return NeedMoreData
        return self._pending[-1]

    def readuntil(self, matchre, afterblank=False, includematch=False):
        """Generator: consume lines until one matches the regexp matchre.

        Intended use:

            for r in self._input.readuntil(regexp):
                if r is NeedMoreData:
                    yield NeedMoreData
                else:
                    preamble, matchobj = r
                    break

        Yields NeedMoreData each time the queue runs dry before end() has
        been called.  The final item yielded is a tuple (text, match):

        * text is the concatenation of the consumed lines preceding the
          matching line, plus the matching line itself if includematch is
          true;
        * match is the match object for the matching line, which has been
          consumed, so the next read returns the line following it.

        If afterblank is true, a line only counts as a match when the line
        immediately before it consisted of nothing but a line terminator.
        If the end of the data is reached without a match, the final item is
        (all consumed text, None).  The generator stops after its final item.
        """
        prematch = []
        # True when the previously consumed line was a bare line terminator.
        blankseen = False
        while True:
            if not self._pending:
                if self._done:
                    # End of file: hand back what we have and stop, rather
                    # than yielding the same result for ever.
                    yield EMPTYSTRING.join(prematch), None
                    return
                yield NeedMoreData
                continue
            line = self._pending.pop()
            m = matchre.match(line)
            if m and (blankseen or not afterblank):
                if includematch:
                    prematch.append(line)
                yield EMPTYSTRING.join(prematch), m
                return
            prematch.append(line)
            # Remember blankness for the *next* line's afterblank test; the
            # blank line itself stays part of the returned text.
            blankseen = NLCRE.match(line) is not None

    def push(self, data):
        """Add a chunk of text, queueing every complete line it finishes."""
        # Prepend whatever was left over from the previous chunk.
        data, self._partial = self._partial + data, ''
        # Because the pattern has a group, split() returns
        # [text, terminator, text, terminator, ..., tail]; tail is the text
        # after the final terminator ('' if data ended with one).
        parts = self.NLCRE_crack.split(data)
        self._partial = parts.pop()
        if not self._partial and parts and parts[-1] == '\r':
            # The chunk ended in a lone CR.  It may be the first half of a
            # CRLF whose LF arrives with the next chunk, so hold that line
            # back instead of emitting it as terminated by CR alone.
            terminator = parts.pop()
            self._partial = parts.pop() + terminator
        # Re-attach each terminator to the text that precedes it.
        lines = [parts[i] + parts[i + 1] for i in range(0, len(parts), 2)]
        self.pushlines(lines)

    def pushlines(self, lines):
        """Queue a list of new lines behind the ones already pending."""
        # _pending is reversed, so "behind" means the front of the list.
        self._pending[:0] = lines[::-1]

    def end(self):
        """Signal that there is no more data."""
        if self._partial:
            # Do not lose a final line that has no terminator.
            self.pushlines([self._partial])
            self._partial = ''
        self._done = True

    def is_done(self):
        """Return true once end() has been called."""
        return self._done

    def __iter__(self):
        return self

    def next(self):
        """Iterator step: like readline(), but '' ends the iteration.

        NeedMoreData is passed through as an ordinary item.
        """
        line = self.readline()
        if line == '':
            raise StopIteration
        return line

    # Same method under the other spelling of the iterator protocol.
    __next__ = next
| 
 | ||||
class FeedParser:
    """A feed-style (incremental) parser of email messages.

    Text is supplied in arbitrary chunks through feed(); end() signals that
    the input is complete and returns the root message object.  Message
    objects are created by calling the _class factory given to the
    constructor; sub-objects are attached to their container with attach().
    """

    def __init__(self, _class=Message.Message):
        """Create a parser whose message objects are built by _class()."""
        self._class = _class
        self._input = FeedableLumpOfText()
        self._root = None
        # Stack of message objects currently being built, outermost first.
        self._objectstack = []
        # The parser is a generator that yields whenever it runs out of
        # input; _parse resumes it.  Accept either spelling of the iterator
        # protocol's "advance" method.
        gen = self._parsegen()
        self._parse = getattr(gen, '__next__', None) or gen.next

    def end(self):
        """Signal the end of the input and return the root message object."""
        self._input.end()
        self._call_parse()
        return self._root

    def feed(self, data):
        """Hand another chunk of text to the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        # Resume the parse generator until it next needs data.  The
        # generator being exhausted is not an error here.
        try:
            self._parse()
        except StopIteration:
            pass

    # A line belongs to the header block if it looks like a Unix-From line,
    # a "Name:" header, or a continuation (leading space or tab).
    headerRE = re.compile(r'^(From |[-\w]{2,}:|[\t ])')

    def _parse_headers(self,headerlist):
        """Store the header lines in headerlist on the current object.

        headerlist is a list of strings, one per line.  Continuation lines
        are joined onto the header they follow.  A 'From ' line in first
        position is recorded with set_unixfrom(); one in last position is
        pushed back onto the input as probable body text; one anywhere else
        is ignored.

        Raises Errors.HeaderParseError if the first line is a continuation,
        or if a line is neither a continuation, a 'From ' line, nor contains
        a colon.
        """
        lastheader = ''
        lastvalue = []

        for lineno, line in enumerate(headerlist):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    raise Errors.HeaderParseError('First line must not be a continuation')
                lastvalue.append(line)
                continue

            # A new header (or From line) starts here, so the previous
            # header is complete: store it.
            if lastheader:
                # XXX reconsider the joining of folded lines
                self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip()
                lastheader, lastvalue = '', []

            # Check for Unix-From
            if line.startswith('From '):
                if lineno == 0:
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(headerlist) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body
                    # NOTE(review): _parsegen may push its own look-ahead
                    # line back after this call; since the input is a stack,
                    # that line would then be read before this one.  Confirm
                    # the intended order.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line. Ignore it.
                    continue

            i = line.find(':')
            if i < 0:
                # The older parser had various special-cases here. We've
                # already handled them
                raise Errors.HeaderParseError(
                       "Not a header, not a continuation: ``%s''" % line)
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]

        # Store the final header, which no later line has flushed.
        if lastheader:
            # XXX reconsider the joining of folded lines
            self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip()

    def _parsegen(self):
        """Generator that parses whatever text is currently available.

        It yields each time the input runs dry and is resumed by
        _call_parse(); the yielded values are not used.  The root object is
        created immediately and stored in self._root.
        """
        self._new_sub_object()
        self._root = self._cur
        # True while we are past the end boundary of some container, i.e.
        # reading trailer text rather than a new part's headers.
        completing = False
        # The object most recently popped off the stack, if any.
        last = None

        for line in self._input:
            if line is NeedMoreData:
                yield None # Need More Data
                continue
            # That read was only a check that input is available.
            self._input.unreadline(line)
            if not completing:
                headers = []
                # Now collect all headers.
                for line in self._input:
                    if line is NeedMoreData:
                        yield None # Need More Data
                        continue
                    if not self.headerRE.match(line):
                        self._parse_headers(headers)
                        # A message/rfc822 has no body and no internal
                        # boundary.
                        if self._cur.get_content_maintype() == "message":
                            self._new_sub_object()
                            completing = False
                            headers = []
                            continue
                        if line.strip():
                            # No blank line between headers and body.
                            # Push this line back, it's the first line of
                            # the body.
                            self._input.unreadline(line)
                        break
                    else:
                        headers.append(line)
                else:
                    # We're done with the data and are still inside the headers
                    self._parse_headers(headers)

            # Now we're dealing with the body
            boundary = self._cur.get_boundary()
            isdigest = (self._cur.get_content_type() == 'multipart/digest')
            if boundary and not self._cur._finishing:
                separator = '--' + boundary
                self._cur._boundaryRE = re.compile(
                        r'(?P<sep>' + re.escape(separator) +
                        r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)$')
                # Read up to the next boundary line, pausing whenever the
                # input runs dry.
                for r in self._input.readuntil(self._cur._boundaryRE):
                    if r is NeedMoreData:
                        yield NeedMoreData
                    else:
                        preamble, matchobj = r
                        break
                if not matchobj:
                    # Broken - we hit the end of file. Just set the body
                    # to the text.
                    if completing:
                        self._attach_trailer(last, preamble)
                    else:
                        self._attach_preamble(self._cur, preamble)
                    # XXX move back to the parent container.
                    self._pop_container()
                    completing = True
                    continue
                if preamble:
                    if completing:
                        # Drop as many trailing characters as the boundary
                        # line's own line separator is long.
                        preamble = preamble[:-len(matchobj.group('linesep'))]
                        self._attach_trailer(last, preamble)
                    else:
                        self._attach_preamble(self._cur, preamble)
                elif not completing:
                    # The module docs specify an empty preamble is None, not ''
                    self._cur.preamble = None
                    # If we _are_ completing, the last object gets no payload

                if matchobj.group('end'):
                    # That was the end boundary tag. Bounce back to the
                    # parent container
                    last = self._pop_container()
                    self._input.unreadline(matchobj.group('linesep'))
                    completing = True
                    continue

                # A number of MTAs produced by a nameless large company
                # we shall call "SicroMoft" produce repeated boundary
                # lines.
                while True:
                    line = self._input.peekline()
                    if line is NeedMoreData:
                        yield None
                        continue
                    if self._cur._boundaryRE.match(line):
                        self._input.readline()
                    else:
                        break

                self._new_sub_object()

                completing = False
                if isdigest:
                    self._cur.set_default_type('message/rfc822')
                    continue
            else:
                # non-multipart or after end-boundary
                if last is not self._root:
                    last = self._pop_container()
                if self._cur.get_content_maintype() == "message":
                    # We double-pop to leave the RFC822 object
                    self._pop_container()
                    completing = True
                elif self._cur._boundaryRE and last != self._root:
                    completing = True
                else:
                    # Non-multipart top level, or in the trailer of the
                    # top level multipart
                    while not self._input.is_done():
                        yield None
                    data = list(self._input)
                    body = EMPTYSTRING.join(data)
                    self._attach_trailer(last, body)

    def _attach_trailer(self, obj, trailer):
        # Text following an object's content: the epilogue of a multipart
        # or message object, the payload of anything else.
        if obj.get_content_maintype() in ( "multipart", "message" ):
            obj.epilogue = trailer
        else:
            obj.set_payload(trailer)

    def _attach_preamble(self, obj, trailer):
        # Text preceding the first boundary (passed in as `trailer`): the
        # preamble of a multipart or message object, the payload of
        # anything else.
        if obj.get_content_maintype() in ( "multipart", "message" ):
            obj.preamble = trailer
        else:
            obj.set_payload(trailer)

    def _new_sub_object(self):
        # Create a new message object, attach it to the object on top of
        # the stack (if any), push it, and make it the current object.
        new = self._class()
        if self._objectstack:
            self._objectstack[-1].attach(new)
        self._objectstack.append(new)
        # Per-object parser state: the compiled boundary pattern (set once
        # the object's boundary is known) and the "already closed" flag.
        new._boundaryRE = None
        new._finishing = False
        self._cur = new

    def _pop_container(self):
        # Move the pointer to the container of the current object.
        # Returns the (old) current object
        last = self._objectstack.pop()
        if self._objectstack:
            self._cur = self._objectstack[-1]
        else:
            # The stack is empty: leave _cur where it is and mark it as
            # finishing.
            self._cur._finishing = True
        return last
| 
 | ||||
| 
 | ||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Anthony Baxter
						Anthony Baxter