mirror of
				https://github.com/python/cpython.git
				synced 2025-10-30 21:21:22 +00:00 
			
		
		
		
	-- fixed width calculations for alternations
-- fixed literal check in branch operator (this broke test_tokenize, as reported by Mark Favas) -- added REPEAT_ONE operator (still not enabled, though) -- added some debugging stuff (maxlevel)
This commit is contained in:
		
							parent
							
								
									329e29198d
								
							
						
					
					
						commit
						2f2c67d7e5
					
				
					 4 changed files with 199 additions and 35 deletions
				
			
		|  | @ -270,6 +270,7 @@ def _compile_info(code, pattern, flags): | |||
|                     table[i+1] = table[table[i+1]-1]+1 | ||||
|             code.extend(table[1:]) # don't store first entry | ||||
|     elif charset: | ||||
|         # FIXME: use charset optimizer! | ||||
|         for char in charset: | ||||
|             emit(OPCODES[LITERAL]) | ||||
|             emit(char) | ||||
|  | @ -283,7 +284,7 @@ def _compile_info(code, pattern, flags): | |||
| except NameError: | ||||
|     pass | ||||
| 
 | ||||
| def _compile1(p, flags): | ||||
| def _code(p, flags): | ||||
| 
 | ||||
|     flags = p.pattern.flags | flags | ||||
|     code = [] | ||||
|  | @ -308,7 +309,7 @@ def compile(p, flags=0): | |||
|     else: | ||||
|         pattern = None | ||||
| 
 | ||||
|     code = _compile1(p, flags) | ||||
|     code = _code(p, flags) | ||||
| 
 | ||||
|     # print code | ||||
| 
 | ||||
|  |  | |||
|  | @ -137,12 +137,12 @@ def getwidth(self): | |||
|         lo = hi = 0L | ||||
|         for op, av in self.data: | ||||
|             if op is BRANCH: | ||||
|                 l = sys.maxint | ||||
|                 h = 0 | ||||
|                 i = sys.maxint | ||||
|                 j = 0 | ||||
|                 for av in av[1]: | ||||
|                     i, j = av.getwidth() | ||||
|                     l = min(l, i) | ||||
|                     h = min(h, j) | ||||
|                     l, h = av.getwidth() | ||||
|                     i = min(i, l) | ||||
|                     j = min(j, h) | ||||
|                 lo = lo + i | ||||
|                 hi = hi + j | ||||
|             elif op is CALL: | ||||
|  |  | |||
							
								
								
									
										217
									
								
								Modules/_sre.c
									
										
									
									
									
								
							
							
						
						
									
										217
									
								
								Modules/_sre.c
									
										
									
									
									
								
							|  | @ -219,6 +219,14 @@ mark_init(SRE_STATE* state) | |||
| static void | ||||
| mark_fini(SRE_STATE* state) | ||||
| { | ||||
| #if 0 | ||||
|     /* FIXME: debugging */ | ||||
|     if (state->maxlevel > 0) | ||||
|         printf("max %d\n", state->maxlevel); | ||||
|     if (state->mark_stack_base > 0) | ||||
|         printf("mark stack %d\n", state->mark_stack_base); | ||||
| #endif | ||||
| 
 | ||||
|     if (state->mark_stack) | ||||
|         free(state->mark_stack); | ||||
|     mark_init(state); | ||||
|  | @ -430,7 +438,7 @@ SRE_MEMBER(SRE_CODE* set, SRE_CODE ch) | |||
| } | ||||
| 
 | ||||
| LOCAL(int) | ||||
| SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) | ||||
| SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level) | ||||
| { | ||||
|     /* check if string matches the given pattern.  returns -1 for
 | ||||
|        error, 0 for failure, and 1 for success */ | ||||
|  | @ -443,7 +451,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) | |||
| 
 | ||||
|     SRE_REPEAT rep; /* FIXME: <fl> allocate in STATE instead */ | ||||
| 
 | ||||
|     TRACE(("%8d: enter\n", PTR(ptr))); | ||||
|     TRACE(("%8d: enter %d\n", PTR(ptr), level)); | ||||
| 
 | ||||
|     if (pattern[0] == SRE_OP_INFO) { | ||||
|         /* optimization info block */ | ||||
|  | @ -456,6 +464,10 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) | |||
|         pattern += pattern[1] + 1; | ||||
|     } | ||||
| 
 | ||||
|     /* FIXME: debugging */ | ||||
|     if (level > state->maxlevel) | ||||
|         state->maxlevel = level; | ||||
| 
 | ||||
|     for (;;) { | ||||
| 
 | ||||
|         switch (*pattern++) { | ||||
|  | @ -623,7 +635,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) | |||
|             state->ptr = ptr - pattern[1]; | ||||
|             if (state->ptr < state->beginning) | ||||
|                 return 0; | ||||
|             i = SRE_MATCH(state, pattern + 2); | ||||
|             i = SRE_MATCH(state, pattern + 2, level + 1); | ||||
|             if (i <= 0) | ||||
|                 return i; | ||||
|             if (pattern[1] > 0 && state->ptr != ptr) | ||||
|  | @ -638,7 +650,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) | |||
|             state->ptr = ptr - pattern[1]; | ||||
|             if (state->ptr < state->beginning) | ||||
|                 return 0; | ||||
|             i = SRE_MATCH(state, pattern + 2); | ||||
|             i = SRE_MATCH(state, pattern + 2, level + 1); | ||||
|             if (i < 0) | ||||
|                 return i; | ||||
|             if (i) | ||||
|  | @ -656,10 +668,10 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) | |||
|                 lastmark = state->lastmark; | ||||
|                 while (pattern[0]) { | ||||
|                     TRACE(("%8d: try branch\n", PTR(ptr))); | ||||
|                     if (pattern[2] != SRE_OP_LITERAL || | ||||
|                         (ptr < end && (SRE_CODE) ptr[0] == pattern[3])) { | ||||
|                     if (pattern[1] != SRE_OP_LITERAL || | ||||
|                         (ptr < end && (SRE_CODE) ptr[0] == pattern[2])) { | ||||
|                         state->ptr = ptr; | ||||
|                         i = SRE_MATCH(state, pattern + 1); | ||||
|                         i = SRE_MATCH(state, pattern + 1, level + 1); | ||||
|                         if (i) | ||||
|                             return i; | ||||
|                     } | ||||
|  | @ -670,6 +682,155 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) | |||
|             } | ||||
|             return 0; | ||||
| 
 | ||||
|         case SRE_OP_REPEAT_ONE: | ||||
|             /* match repeated sequence (maximizing regexp) */ | ||||
| 
 | ||||
|             /* this operator only works if the repeated item is
 | ||||
|                exactly one character wide, and we're not already | ||||
|                collecting backtracking points.  for other cases, | ||||
|                use the MAX_REPEAT operator instead */ | ||||
| 
 | ||||
|             /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */ | ||||
| 
 | ||||
|             TRACE(("%8d: max repeat one {%d,%d}\n", PTR(ptr), | ||||
|                    pattern[1], pattern[2])); | ||||
| 
 | ||||
|             count = 0; | ||||
| 
 | ||||
|             if (pattern[3] == SRE_OP_ANY) { | ||||
|                 /* repeated wildcard.  skip to the end of the target
 | ||||
|                    string, and backtrack from there */ | ||||
|                 /* FIXME: must look for line endings */ | ||||
|                 if (ptr + pattern[1] > end) | ||||
|                     return 0; /* cannot match */ | ||||
|                 count = pattern[2]; | ||||
|                 if (count > end - ptr) | ||||
|                     count = end - ptr; | ||||
|                 ptr += count; | ||||
| 
 | ||||
|             } else if (pattern[3] == SRE_OP_LITERAL) { | ||||
|                 /* repeated literal */ | ||||
|                 SRE_CODE chr = pattern[4]; | ||||
|                 while (count < (int) pattern[2]) { | ||||
|                     if (ptr >= end || (SRE_CODE) ptr[0] != chr) | ||||
|                         break; | ||||
|                     ptr++; | ||||
|                     count++; | ||||
|                 } | ||||
| 
 | ||||
|             } else if (pattern[3] == SRE_OP_LITERAL_IGNORE) { | ||||
|                 /* repeated literal */ | ||||
|                 SRE_CODE chr = pattern[4]; | ||||
|                 while (count < (int) pattern[2]) { | ||||
|                     if (ptr >= end || (SRE_CODE) state->lower(*ptr) != chr) | ||||
|                         break; | ||||
|                     ptr++; | ||||
|                     count++; | ||||
|                 } | ||||
| 
 | ||||
|             } else if (pattern[3] == SRE_OP_NOT_LITERAL) { | ||||
|                 /* repeated non-literal */ | ||||
|                 SRE_CODE chr = pattern[4]; | ||||
|                 while (count < (int) pattern[2]) { | ||||
|                     if (ptr >= end || (SRE_CODE) ptr[0] == chr) | ||||
|                         break; | ||||
|                     ptr++; | ||||
|                     count++; | ||||
|                 } | ||||
| 
 | ||||
|             } else if (pattern[3] == SRE_OP_NOT_LITERAL_IGNORE) { | ||||
|                 /* repeated non-literal */ | ||||
|                 SRE_CODE chr = pattern[4]; | ||||
|                 while (count < (int) pattern[2]) { | ||||
|                     if (ptr >= end || (SRE_CODE) state->lower(ptr[0]) == chr) | ||||
|                         break; | ||||
|                     ptr++; | ||||
|                     count++; | ||||
|                 } | ||||
| 
 | ||||
|             } else if (pattern[3] == SRE_OP_IN) { | ||||
|                 /* repeated set */ | ||||
|                 while (count < (int) pattern[2]) { | ||||
|                     if (ptr >= end || !SRE_MEMBER(pattern + 5, *ptr)) | ||||
|                         break; | ||||
|                     ptr++; | ||||
|                     count++; | ||||
|                 } | ||||
| 
 | ||||
|             } else { | ||||
|                 /* repeated single character pattern */ | ||||
|                 state->ptr = ptr; | ||||
|                 while (count < (int) pattern[2]) { | ||||
|                     i = SRE_MATCH(state, pattern + 3, level + 1); | ||||
|                     if (i < 0) | ||||
|                         return i; | ||||
|                     if (!i) | ||||
|                         break; | ||||
|                     count++; | ||||
|                 } | ||||
|                 state->ptr = ptr; | ||||
|                 ptr += count; | ||||
|             } | ||||
| 
 | ||||
|             /* when we arrive here, count contains the number of
 | ||||
|                matches, and ptr points to the tail of the target | ||||
|                string.  check if the rest of the pattern matches, | ||||
|                and backtrack if not. */ | ||||
| 
 | ||||
|             TRACE(("%8d: repeat %d found\n", PTR(ptr), count)); | ||||
| 
 | ||||
|             if (count < (int) pattern[1]) | ||||
|                 return 0; | ||||
| 
 | ||||
|             if (pattern[pattern[0]] == SRE_OP_SUCCESS) { | ||||
|                 /* tail is empty.  we're finished */ | ||||
|                 TRACE(("%8d: tail is empty\n", PTR(ptr))); | ||||
|                 state->ptr = ptr; | ||||
|                 return 1; | ||||
| 
 | ||||
|             } else if (pattern[pattern[0]] == SRE_OP_LITERAL) { | ||||
|                 /* tail starts with a literal. skip positions where
 | ||||
|                    the rest of the pattern cannot possibly match */ | ||||
|                 SRE_CODE chr = pattern[pattern[0]+1]; | ||||
|                 TRACE(("%8d: tail is literal %d\n", PTR(ptr), chr)); | ||||
|                 for (;;) { | ||||
|                     TRACE(("%8d: scan for tail match\n", PTR(ptr))); | ||||
|                     while (count >= (int) pattern[1] && | ||||
|                            (ptr >= end || *ptr != chr)) { | ||||
|                         ptr--; | ||||
|                         count--; | ||||
|                     } | ||||
|                     TRACE(("%8d: check tail\n", PTR(ptr))); | ||||
|                     if (count < (int) pattern[1]) | ||||
|                         break; | ||||
|                     state->ptr = ptr; | ||||
|                     i = SRE_MATCH(state, pattern + pattern[0], level + 1); | ||||
|                     if (i > 0) { | ||||
|                         TRACE(("%8d: repeat %d picked\n", PTR(ptr), count)); | ||||
|                         return 1; | ||||
|                     } | ||||
|                     ptr--; | ||||
|                     count--; | ||||
|                 } | ||||
| 
 | ||||
|             } else { | ||||
|                 /* general case */ | ||||
|                 TRACE(("%8d: tail is pattern\n", PTR(ptr))); | ||||
|                 while (count >= (int) pattern[1]) { | ||||
|                     state->ptr = ptr; | ||||
|                     i = SRE_MATCH(state, pattern + pattern[0], level + 1); | ||||
|                     if (i < 0) | ||||
|                         return i; | ||||
|                     if (i) { | ||||
|                         TRACE(("%8d: repeat %d picked\n", PTR(ptr), count)); | ||||
|                         return 1; | ||||
|                     } | ||||
|                     ptr--; | ||||
|                     count--; | ||||
|                 } | ||||
|             } | ||||
|             return 0; | ||||
| 
 | ||||
|         case SRE_OP_REPEAT: | ||||
|             /* create repeat context.  all the hard work is done
 | ||||
|                by the UNTIL operator */ | ||||
|  | @ -677,8 +838,6 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) | |||
|             TRACE(("%8d: repeat {%d,%d}\n", PTR(ptr), | ||||
|                    pattern[1], pattern[2])); | ||||
| 
 | ||||
|             state->ptr = ptr; | ||||
| 
 | ||||
|             rep.count = -1; | ||||
|             rep.pattern = pattern; | ||||
| 
 | ||||
|  | @ -686,10 +845,10 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) | |||
|             rep.prev = state->repeat; | ||||
|             state->repeat = &rep; | ||||
| 
 | ||||
|             i = SRE_MATCH(state, pattern + pattern[0]); | ||||
|             state->ptr = ptr; | ||||
|             i = SRE_MATCH(state, pattern + pattern[0], level + 1); | ||||
| 
 | ||||
|             state->repeat = rep.prev; | ||||
|             /* free(rp); */ | ||||
| 
 | ||||
|             return i; | ||||
| 
 | ||||
|  | @ -714,7 +873,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) | |||
|                 /* not enough matches */ | ||||
|                 TRACE(("%8d: match item (required)\n", PTR(ptr))); | ||||
|                 rp->count = count; | ||||
|                 i = SRE_MATCH(state, rp->pattern + 3); | ||||
|                 i = SRE_MATCH(state, rp->pattern + 3, level + 1); | ||||
|                 if (i) | ||||
|                     return i; | ||||
|                 rp->count = count - 1; | ||||
|  | @ -729,7 +888,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) | |||
|                 rp->count = count; | ||||
|                 lastmark = state->lastmark; | ||||
|                 mark_save(state, 0, lastmark); | ||||
|                 i = SRE_MATCH(state, rp->pattern + 3); | ||||
|                 i = SRE_MATCH(state, rp->pattern + 3, level + 1); | ||||
|                 if (i) | ||||
|                     return i; | ||||
|                 mark_restore(state, 0, lastmark); | ||||
|  | @ -741,11 +900,9 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) | |||
|                tail matches */ | ||||
|             TRACE(("%8d: match tail\n", PTR(ptr))); | ||||
|             state->repeat = rp->prev; | ||||
|             i = SRE_MATCH(state, pattern); | ||||
|             if (i) { | ||||
|                 /* free(rp); */ | ||||
|             i = SRE_MATCH(state, pattern, level + 1); | ||||
|             if (i) | ||||
|                 return i; | ||||
|             } | ||||
|             state->repeat = rp; | ||||
|             return 0; | ||||
| 
 | ||||
|  | @ -767,7 +924,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) | |||
|                 /* not enough matches */ | ||||
|                 TRACE(("%8d: match item (required)\n", PTR(ptr))); | ||||
|                 rp->count = count; | ||||
|                 i = SRE_MATCH(state, rp->pattern + 3); | ||||
|                 i = SRE_MATCH(state, rp->pattern + 3, level + 1); | ||||
|                 if (i) | ||||
|                     return i; | ||||
|                 rp->count = count-1; | ||||
|  | @ -778,7 +935,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) | |||
|             /* see if the tail matches */ | ||||
|             TRACE(("%8d: match tail\n", PTR(ptr))); | ||||
|             state->repeat = rp->prev; | ||||
|             i = SRE_MATCH(state, pattern); | ||||
|             i = SRE_MATCH(state, pattern, level + 1); | ||||
|             if (i) { | ||||
|                 /* free(rp); */ | ||||
|                 return i; | ||||
|  | @ -790,7 +947,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern) | |||
| 
 | ||||
|             TRACE(("%8d: match item (optional)\n", PTR(ptr))); | ||||
|             rp->count = count; | ||||
|             i = SRE_MATCH(state, rp->pattern + 3); | ||||
|             i = SRE_MATCH(state, rp->pattern + 3, level + 1); | ||||
|             if (i) | ||||
|                 return i; | ||||
|             rp->count = count - 1; | ||||
|  | @ -865,7 +1022,7 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) | |||
|                         state->ptr = ptr + 1; | ||||
|                         if (flags & SRE_INFO_LITERAL) | ||||
|                             return 1; /* we got all of it */ | ||||
|                         status = SRE_MATCH(state, pattern + 2*prefix_len); | ||||
|                         status = SRE_MATCH(state, pattern + 2*prefix_len, 1); | ||||
|                         if (status != 0) | ||||
|                             return status; | ||||
|                         /* close but no cigar -- try again */ | ||||
|  | @ -893,7 +1050,7 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) | |||
|             TRACE(("%8d: === SEARCH === literal\n", PTR(ptr))); | ||||
|             state->start = ptr; | ||||
|             state->ptr = ++ptr; | ||||
|             status = SRE_MATCH(state, pattern + 2); | ||||
|             status = SRE_MATCH(state, pattern + 2, 1); | ||||
|             if (status != 0) | ||||
|                 break; | ||||
|         } | ||||
|  | @ -907,7 +1064,7 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) | |||
|             TRACE(("%8d: === SEARCH === charset\n", PTR(ptr))); | ||||
|             state->start = ptr; | ||||
|             state->ptr = ptr; | ||||
|             status = SRE_MATCH(state, pattern); | ||||
|             status = SRE_MATCH(state, pattern, 1); | ||||
|             if (status != 0) | ||||
|                 break; | ||||
|         } | ||||
|  | @ -916,7 +1073,7 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern) | |||
|         while (ptr <= end) { | ||||
|             TRACE(("%8d: === SEARCH ===\n", PTR(ptr)));  | ||||
|             state->start = state->ptr = ptr++; | ||||
|             status = SRE_MATCH(state, pattern); | ||||
|             status = SRE_MATCH(state, pattern, 1); | ||||
|             if (status != 0) | ||||
|                 break; | ||||
|         } | ||||
|  | @ -1032,6 +1189,9 @@ state_reset(SRE_STATE* state) | |||
| 
 | ||||
|     state->repeat = NULL; | ||||
| 
 | ||||
|     /* FIXME: debugging */ | ||||
|     state->maxlevel = 0; | ||||
| 
 | ||||
|     mark_fini(state); | ||||
| } | ||||
| 
 | ||||
|  | @ -1110,6 +1270,7 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string, | |||
|         state->lower = sre_lower; | ||||
| 
 | ||||
|     state->mark_stack = NULL; | ||||
|     state->mark_stack_base = 0; | ||||
| 
 | ||||
|     state_reset(state); | ||||
| 
 | ||||
|  | @ -1262,10 +1423,10 @@ pattern_match(PatternObject* self, PyObject* args) | |||
|     state.ptr = state.start; | ||||
| 
 | ||||
|     if (state.charsize == 1) { | ||||
|         status = sre_match(&state, PatternObject_GetCode(self)); | ||||
|         status = sre_match(&state, PatternObject_GetCode(self), 1); | ||||
|     } else { | ||||
| #if defined(HAVE_UNICODE) | ||||
|         status = sre_umatch(&state, PatternObject_GetCode(self)); | ||||
|         status = sre_umatch(&state, PatternObject_GetCode(self), 1); | ||||
| #endif | ||||
|     } | ||||
| 
 | ||||
|  | @ -1941,10 +2102,10 @@ scanner_match(ScannerObject* self, PyObject* args) | |||
|     state->ptr = state->start; | ||||
| 
 | ||||
|     if (state->charsize == 1) { | ||||
|         status = sre_match(state, PatternObject_GetCode(self->pattern)); | ||||
|         status = sre_match(state, PatternObject_GetCode(self->pattern), 1); | ||||
|     } else { | ||||
| #if defined(HAVE_UNICODE) | ||||
|         status = sre_umatch(state, PatternObject_GetCode(self->pattern)); | ||||
|         status = sre_umatch(state, PatternObject_GetCode(self->pattern), 1); | ||||
| #endif | ||||
|     } | ||||
| 
 | ||||
|  |  | |||
|  | @ -74,6 +74,8 @@ typedef struct { | |||
|     SRE_REPEAT *repeat; /* current repeat context */ | ||||
|     /* hooks */ | ||||
|     SRE_TOLOWER_HOOK lower; | ||||
|     /* debugging */ | ||||
|     int maxlevel; | ||||
| } SRE_STATE; | ||||
| 
 | ||||
| typedef struct { | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Fredrik Lundh
						Fredrik Lundh