From e535e89b901d71cb7ae591776507e9b35cb71f84 Mon Sep 17 00:00:00 2001 From: "L. Grobol" Date: Thu, 10 Apr 2025 15:17:17 +0200 Subject: [PATCH 1/4] Remove from linebreak/_PyUnicode_IsLinebreak characters that are bidirectional B but don't have a mandatory line break property These are exactly three characters: U+001C "FILE SEPARATOR", U+001D "GROUP SEPARATOR" and U+001E "RECORD SEPARATOR", all of which have the Line_Break class CM (Combining Mark) rather than BK, CR, LF or NL, meaning that they never force a line break and should *not* be treated as line boundaries --- Tools/unicode/makeunicodedata.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Tools/unicode/makeunicodedata.py b/Tools/unicode/makeunicodedata.py index 889ae8fc869..ba6c494c375 100644 --- a/Tools/unicode/makeunicodedata.py +++ b/Tools/unicode/makeunicodedata.py @@ -437,7 +437,7 @@ def makeunicodetype(unicode, trace): flags |= ALPHA_MASK if "Lowercase" in properties: flags |= LOWER_MASK - if 'Line_Break' in properties or bidirectional == "B": + if 'Line_Break' in properties: flags |= LINEBREAK_MASK linebreaks.append(char) if category == "Zs" or bidirectional in ("WS", "B", "S"): @@ -603,8 +603,7 @@ def makeunicodetype(unicode, trace): # Generate code for _PyUnicode_IsLinebreak() fprint("/* Returns 1 for Unicode characters having the line break") - fprint(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional") - fprint(" * type 'B', 0 otherwise.") + fprint(" * property 'BK', 'CR', 'LF' or 'NL', 0 otherwise.") fprint(" */") fprint('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)') fprint('{') From 22633de8b339d6672da8081ddbbf360d76ed72e0 Mon Sep 17 00:00:00 2001 From: "L. 
Grobol" Date: Thu, 10 Apr 2025 15:23:21 +0200 Subject: [PATCH 2/4] remove the offending characters from the splitlines table --- Doc/library/stdtypes.rst | 6 ------ 1 file changed, 6 deletions(-) diff --git a/Doc/library/stdtypes.rst b/Doc/library/stdtypes.rst index 48d179c2703..ee07debe1e0 100644 --- a/Doc/library/stdtypes.rst +++ b/Doc/library/stdtypes.rst @@ -2295,12 +2295,6 @@ expression support in the :mod:`re` module). +-----------------------+-----------------------------+ | ``\f`` or ``\x0c`` | Form Feed | +-----------------------+-----------------------------+ - | ``\x1c`` | File Separator | - +-----------------------+-----------------------------+ - | ``\x1d`` | Group Separator | - +-----------------------+-----------------------------+ - | ``\x1e`` | Record Separator | - +-----------------------+-----------------------------+ | ``\x85`` | Next Line (C1 Control Code) | +-----------------------+-----------------------------+ | ``\u2028`` | Line Separator | From dfe0b2e5319459a3c6f5291f740495c35c12b84d Mon Sep 17 00:00:00 2001 From: "L. 
Grobol" Date: Thu, 10 Apr 2025 15:47:56 +0200 Subject: [PATCH 3/4] update unicodetype_db.h --- Objects/unicodetype_db.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/Objects/unicodetype_db.h b/Objects/unicodetype_db.h index 5be810dd674..93de16bc42b 100644 --- a/Objects/unicodetype_db.h +++ b/Objects/unicodetype_db.h @@ -2971,7 +2971,7 @@ static const unsigned short index1[] = { static const unsigned short index2[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 2, 2, 2, 1, 3, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 5, 4, + 0, 0, 0, 0, 1, 1, 1, 1, 3, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 5, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 5, 4, 4, 4, 4, 4, 4, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 4, 4, 4, 5, 17, 5, 18, 18, 18, 18, 18, 18, 18, 18, 18, @@ -6711,8 +6711,7 @@ int _PyUnicode_IsWhitespace(const Py_UCS4 ch) } /* Returns 1 for Unicode characters having the line break - * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional - * type 'B', 0 otherwise. + * property 'BK', 'CR', 'LF' or 'NL', 0 otherwise. */ int _PyUnicode_IsLinebreak(const Py_UCS4 ch) { @@ -6721,9 +6720,6 @@ int _PyUnicode_IsLinebreak(const Py_UCS4 ch) case 0x000B: case 0x000C: case 0x000D: - case 0x001C: - case 0x001D: - case 0x001E: case 0x0085: case 0x2028: case 0x2029: From eadb0e21039dd0316188264649187f4ea9a2f751 Mon Sep 17 00:00:00 2001 From: "L. 
Grobol" Date: Thu, 10 Apr 2025 18:27:53 +0200 Subject: [PATCH 4/4] add tentative news blurb --- .../2025-04-10-18-26-49.gh-issue-66428.sJ9yJn.rst | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2025-04-10-18-26-49.gh-issue-66428.sJ9yJn.rst diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-04-10-18-26-49.gh-issue-66428.sJ9yJn.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-04-10-18-26-49.gh-issue-66428.sJ9yJn.rst new file mode 100644 index 00000000000..96a504e504c --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-04-10-18-26-49.gh-issue-66428.sJ9yJn.rst @@ -0,0 +1,4 @@ +Remove Unicode characters that have the bidirectional B property but are not +mandatory line breakers (U+001C, U+001D and U+001E) from the list of +line-breaking characters. ``str.splitlines()`` will not break on these +characters any more.