mirror of
				https://github.com/python/cpython.git
				synced 2025-10-30 21:21:22 +00:00 
			
		
		
		
	gh-109747: Improve errors for unsupported look-behind patterns (GH-109859)
re.error is now raised instead of OverflowError or RuntimeError when the width of a look-behind pattern is too large. The maximum supported look-behind width is increased to 2**32-1 (was 2**31-1).
This commit is contained in:
		
							parent
							
								
									ca0f3d858d
								
							
						
					
					
						commit
						e2b3d831fd
					
				
					 6 changed files with 46 additions and 13 deletions
				
			
		|  | @ -147,6 +147,8 @@ def _compile(code, pattern, flags): | ||||||
|                 emit(0) # look ahead |                 emit(0) # look ahead | ||||||
|             else: |             else: | ||||||
|                 lo, hi = av[1].getwidth() |                 lo, hi = av[1].getwidth() | ||||||
|  |                 if lo > MAXCODE: | ||||||
|  |                     raise error("looks too much behind") | ||||||
|                 if lo != hi: |                 if lo != hi: | ||||||
|                     raise error("look-behind requires fixed-width pattern") |                     raise error("look-behind requires fixed-width pattern") | ||||||
|                 emit(lo) # look behind |                 emit(lo) # look behind | ||||||
|  | @ -547,7 +549,7 @@ def _compile_info(code, pattern, flags): | ||||||
|     else: |     else: | ||||||
|         emit(MAXCODE) |         emit(MAXCODE) | ||||||
|         prefix = prefix[:MAXCODE] |         prefix = prefix[:MAXCODE] | ||||||
|     emit(min(hi, MAXCODE)) |     emit(hi) | ||||||
|     # add literal prefix |     # add literal prefix | ||||||
|     if prefix: |     if prefix: | ||||||
|         emit(len(prefix)) # length |         emit(len(prefix)) # length | ||||||
|  |  | ||||||
|  | @ -67,6 +67,10 @@ | ||||||
| TYPE_FLAGS = SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE | TYPE_FLAGS = SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE | ||||||
| GLOBAL_FLAGS = SRE_FLAG_DEBUG | GLOBAL_FLAGS = SRE_FLAG_DEBUG | ||||||
| 
 | 
 | ||||||
|  | # Maximal value returned by SubPattern.getwidth(). | ||||||
|  | # Must be larger than MAXREPEAT, MAXCODE and sys.maxsize. | ||||||
|  | MAXWIDTH = 1 << 64 | ||||||
|  | 
 | ||||||
| class State: | class State: | ||||||
|     # keeps track of state for parsing |     # keeps track of state for parsing | ||||||
|     def __init__(self): |     def __init__(self): | ||||||
|  | @ -177,7 +181,7 @@ def getwidth(self): | ||||||
|         lo = hi = 0 |         lo = hi = 0 | ||||||
|         for op, av in self.data: |         for op, av in self.data: | ||||||
|             if op is BRANCH: |             if op is BRANCH: | ||||||
|                 i = MAXREPEAT - 1 |                 i = MAXWIDTH | ||||||
|                 j = 0 |                 j = 0 | ||||||
|                 for av in av[1]: |                 for av in av[1]: | ||||||
|                     l, h = av.getwidth() |                     l, h = av.getwidth() | ||||||
|  | @ -196,6 +200,9 @@ def getwidth(self): | ||||||
|             elif op in _REPEATCODES: |             elif op in _REPEATCODES: | ||||||
|                 i, j = av[2].getwidth() |                 i, j = av[2].getwidth() | ||||||
|                 lo = lo + i * av[0] |                 lo = lo + i * av[0] | ||||||
|  |                 if av[1] == MAXREPEAT and j: | ||||||
|  |                     hi = MAXWIDTH | ||||||
|  |                 else: | ||||||
|                     hi = hi + j * av[1] |                     hi = hi + j * av[1] | ||||||
|             elif op in _UNITCODES: |             elif op in _UNITCODES: | ||||||
|                 lo = lo + 1 |                 lo = lo + 1 | ||||||
|  | @ -216,7 +223,7 @@ def getwidth(self): | ||||||
|                 hi = hi + j |                 hi = hi + j | ||||||
|             elif op is SUCCESS: |             elif op is SUCCESS: | ||||||
|                 break |                 break | ||||||
|         self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT) |         self.width = min(lo, MAXWIDTH), min(hi, MAXWIDTH) | ||||||
|         return self.width |         return self.width | ||||||
| 
 | 
 | ||||||
| class Tokenizer: | class Tokenizer: | ||||||
|  |  | ||||||
|  | @ -1861,6 +1861,29 @@ def test_repeat_minmax_overflow(self): | ||||||
|         self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128) |         self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128) | ||||||
|         self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128)) |         self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128)) | ||||||
| 
 | 
 | ||||||
|  |     def test_look_behind_overflow(self): | ||||||
|  |         string = "x" * 2_500_000 | ||||||
|  |         p1 = r"(?<=((.{%d}){%d}){%d})" | ||||||
|  |         p2 = r"(?<!((.{%d}){%d}){%d})" | ||||||
|  |         # Test that the templates are valid and look-behind with width 2**21 | ||||||
|  |         # (larger than sys.maxunicode) are supported. | ||||||
|  |         self.assertEqual(re.search(p1 % (2**7, 2**7, 2**7), string).span(), | ||||||
|  |                          (2**21, 2**21)) | ||||||
|  |         self.assertEqual(re.search(p2 % (2**7, 2**7, 2**7), string).span(), | ||||||
|  |                          (0, 0)) | ||||||
|  |         # Test that 2**22 is accepted as a repetition number and look-behind | ||||||
|  |         # width. | ||||||
|  |         re.compile(p1 % (2**22, 1, 1)) | ||||||
|  |         re.compile(p1 % (1, 2**22, 1)) | ||||||
|  |         re.compile(p1 % (1, 1, 2**22)) | ||||||
|  |         re.compile(p2 % (2**22, 1, 1)) | ||||||
|  |         re.compile(p2 % (1, 2**22, 1)) | ||||||
|  |         re.compile(p2 % (1, 1, 2**22)) | ||||||
|  |         # But 2**66 is too large for look-behind width. | ||||||
|  |         errmsg = "looks too much behind" | ||||||
|  |         self.assertRaisesRegex(re.error, errmsg, re.compile, p1 % (2**22, 2**22, 2**22)) | ||||||
|  |         self.assertRaisesRegex(re.error, errmsg, re.compile, p2 % (2**22, 2**22, 2**22)) | ||||||
|  | 
 | ||||||
|     def test_backref_group_name_in_exception(self): |     def test_backref_group_name_in_exception(self): | ||||||
|         # Issue 17341: Poor error message when compiling invalid regex |         # Issue 17341: Poor error message when compiling invalid regex | ||||||
|         self.checkPatternError('(?P=<foo>)', |         self.checkPatternError('(?P=<foo>)', | ||||||
|  |  | ||||||
|  | @ -0,0 +1,3 @@ | ||||||
|  | Improve errors for unsupported look-behind patterns. Now re.error is raised | ||||||
|  | instead of OverflowError or RuntimeError for too large width of look-behind | ||||||
|  | pattern. | ||||||
|  | @ -2070,8 +2070,6 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) | ||||||
|             GET_SKIP; |             GET_SKIP; | ||||||
|             GET_ARG; /* 0 for lookahead, width for lookbehind */ |             GET_ARG; /* 0 for lookahead, width for lookbehind */ | ||||||
|             code--; /* Back up over arg to simplify math below */ |             code--; /* Back up over arg to simplify math below */ | ||||||
|             if (arg & 0x80000000) |  | ||||||
|                 FAIL; /* Width too large */ |  | ||||||
|             /* Stop 1 before the end; we check the SUCCESS below */ |             /* Stop 1 before the end; we check the SUCCESS below */ | ||||||
|             if (_validate_inner(code+1, code+skip-2, groups)) |             if (_validate_inner(code+1, code+skip-2, groups)) | ||||||
|                 FAIL; |                 FAIL; | ||||||
|  |  | ||||||
|  | @ -591,8 +591,8 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) | ||||||
|         /* optimization info block */ |         /* optimization info block */ | ||||||
|         /* <INFO> <1=skip> <2=flags> <3=min> ... */ |         /* <INFO> <1=skip> <2=flags> <3=min> ... */ | ||||||
|         if (pattern[3] && (uintptr_t)(end - ptr) < pattern[3]) { |         if (pattern[3] && (uintptr_t)(end - ptr) < pattern[3]) { | ||||||
|             TRACE(("reject (got %zd chars, need %zd)\n", |             TRACE(("reject (got %tu chars, need %zu)\n", | ||||||
|                    end - ptr, (Py_ssize_t) pattern[3])); |                    end - ptr, (size_t) pattern[3])); | ||||||
|             RETURN_FAILURE; |             RETURN_FAILURE; | ||||||
|         } |         } | ||||||
|         pattern += pattern[1] + 1; |         pattern += pattern[1] + 1; | ||||||
|  | @ -1509,7 +1509,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) | ||||||
|             /* <ASSERT> <skip> <back> <pattern> */ |             /* <ASSERT> <skip> <back> <pattern> */ | ||||||
|             TRACE(("|%p|%p|ASSERT %d\n", pattern, |             TRACE(("|%p|%p|ASSERT %d\n", pattern, | ||||||
|                    ptr, pattern[1])); |                    ptr, pattern[1])); | ||||||
|             if (ptr - (SRE_CHAR *)state->beginning < (Py_ssize_t)pattern[1]) |             if ((uintptr_t)(ptr - (SRE_CHAR *)state->beginning) < pattern[1]) | ||||||
|                 RETURN_FAILURE; |                 RETURN_FAILURE; | ||||||
|             state->ptr = ptr - pattern[1]; |             state->ptr = ptr - pattern[1]; | ||||||
|             DO_JUMP0(JUMP_ASSERT, jump_assert, pattern+2); |             DO_JUMP0(JUMP_ASSERT, jump_assert, pattern+2); | ||||||
|  | @ -1522,7 +1522,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) | ||||||
|             /* <ASSERT_NOT> <skip> <back> <pattern> */ |             /* <ASSERT_NOT> <skip> <back> <pattern> */ | ||||||
|             TRACE(("|%p|%p|ASSERT_NOT %d\n", pattern, |             TRACE(("|%p|%p|ASSERT_NOT %d\n", pattern, | ||||||
|                    ptr, pattern[1])); |                    ptr, pattern[1])); | ||||||
|             if (ptr - (SRE_CHAR *)state->beginning >= (Py_ssize_t)pattern[1]) { |             if ((uintptr_t)(ptr - (SRE_CHAR *)state->beginning) >= pattern[1]) { | ||||||
|                 state->ptr = ptr - pattern[1]; |                 state->ptr = ptr - pattern[1]; | ||||||
|                 LASTMARK_SAVE(); |                 LASTMARK_SAVE(); | ||||||
|                 if (state->repeat) |                 if (state->repeat) | ||||||
|  | @ -1658,9 +1658,9 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern) | ||||||
| 
 | 
 | ||||||
|         flags = pattern[2]; |         flags = pattern[2]; | ||||||
| 
 | 
 | ||||||
|         if (pattern[3] && end - ptr < (Py_ssize_t)pattern[3]) { |         if (pattern[3] && (uintptr_t)(end - ptr) < pattern[3]) { | ||||||
|             TRACE(("reject (got %u chars, need %u)\n", |             TRACE(("reject (got %tu chars, need %zu)\n", | ||||||
|                    (unsigned int)(end - ptr), pattern[3])); |                    end - ptr, (size_t) pattern[3])); | ||||||
|             return 0; |             return 0; | ||||||
|         } |         } | ||||||
|         if (pattern[3] > 1) { |         if (pattern[3] > 1) { | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Serhiy Storchaka
						Serhiy Storchaka