This commit is contained in:
L. Grobol 2025-12-08 06:10:43 +02:00 committed by GitHub
commit c4868f10c1
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 8 additions and 15 deletions

View file

@ -2479,12 +2479,6 @@ expression support in the :mod:`re` module).
+-----------------------+-----------------------------+ +-----------------------+-----------------------------+
| ``\f`` or ``\x0c`` | Form Feed | | ``\f`` or ``\x0c`` | Form Feed |
+-----------------------+-----------------------------+ +-----------------------+-----------------------------+
| ``\x1c`` | File Separator |
+-----------------------+-----------------------------+
| ``\x1d`` | Group Separator |
+-----------------------+-----------------------------+
| ``\x1e`` | Record Separator |
+-----------------------+-----------------------------+
| ``\x85`` | Next Line (C1 Control Code) | | ``\x85`` | Next Line (C1 Control Code) |
+-----------------------+-----------------------------+ +-----------------------+-----------------------------+
| ``\u2028`` | Line Separator | | ``\u2028`` | Line Separator |

View file

@ -0,0 +1,4 @@
Remove the Unicode characters U+001C, U+001D and U+001E, which have the
bidirectional type B but are not mandatory line breakers, from the list of
line-breaking characters. ``str.splitlines()`` no longer breaks on these
characters.

View file

@ -2367,7 +2367,7 @@ static const unsigned short index1[] = {
static const unsigned short index2[] = { static const unsigned short index2[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 2, 2, 2, 1, 3, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 5, 4, 0, 0, 0, 0, 1, 1, 1, 1, 3, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 5, 4,
6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 5, 4, 4, 4, 4, 4, 4, 16, 16, 16, 16, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 5, 4, 4, 4, 4, 4, 4, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 4, 4, 4, 5, 17, 5, 18, 18, 18, 18, 18, 18, 18, 18, 18, 16, 16, 16, 16, 4, 4, 4, 5, 17, 5, 18, 18, 18, 18, 18, 18, 18, 18, 18,
@ -6581,8 +6581,7 @@ int _PyUnicode_IsWhitespace(const Py_UCS4 ch)
} }
/* Returns 1 for Unicode characters having the line break /* Returns 1 for Unicode characters having the line break
* property 'BK', 'CR', 'LF' or 'NL' or having bidirectional * property 'BK', 'CR', 'LF' or 'NL', 0 otherwise.
* type 'B', 0 otherwise.
*/ */
int _PyUnicode_IsLinebreak(const Py_UCS4 ch) int _PyUnicode_IsLinebreak(const Py_UCS4 ch)
{ {
@ -6591,9 +6590,6 @@ int _PyUnicode_IsLinebreak(const Py_UCS4 ch)
case 0x000B: case 0x000B:
case 0x000C: case 0x000C:
case 0x000D: case 0x000D:
case 0x001C:
case 0x001D:
case 0x001E:
case 0x0085: case 0x0085:
case 0x2028: case 0x2028:
case 0x2029: case 0x2029:

View file

@ -438,7 +438,7 @@ def makeunicodetype(unicode, trace):
flags |= ALPHA_MASK flags |= ALPHA_MASK
if "Lowercase" in properties: if "Lowercase" in properties:
flags |= LOWER_MASK flags |= LOWER_MASK
if 'Line_Break' in properties or bidirectional == "B": if 'Line_Break' in properties:
flags |= LINEBREAK_MASK flags |= LINEBREAK_MASK
linebreaks.append(char) linebreaks.append(char)
if category == "Zs" or bidirectional in ("WS", "B", "S"): if category == "Zs" or bidirectional in ("WS", "B", "S"):
@ -604,8 +604,7 @@ def makeunicodetype(unicode, trace):
# Generate code for _PyUnicode_IsLinebreak() # Generate code for _PyUnicode_IsLinebreak()
fprint("/* Returns 1 for Unicode characters having the line break") fprint("/* Returns 1 for Unicode characters having the line break")
fprint(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional") fprint(" * property 'BK', 'CR', 'LF' or 'NL', 0 otherwise.")
fprint(" * type 'B', 0 otherwise.")
fprint(" */") fprint(" */")
fprint('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)') fprint('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)')
fprint('{') fprint('{')