Merge 286ab7ba6e into 7099af8f5e

2025-12-08 06:10:17 +00:00 · 2025-12-08 06:10:43 +02:00 · 2025-12-08 06:10:43 +02:00 · c4868f10c1
commit c4868f10c1
parent 7099af8f5e 286ab7ba6e
4 changed files with 8 additions and 15 deletions
--- a/Doc/library/stdtypes.rst
+++ b/Doc/library/stdtypes.rst
@@ -2479,12 +2479,6 @@ expression support in the :mod:`re` module).
   +-----------------------+-----------------------------+
   | ``\f`` or ``\x0c``    | Form Feed                   |
   +-----------------------+-----------------------------+
-   | ``\x1c``              | File Separator              |
-   +-----------------------+-----------------------------+
-   | ``\x1d``              | Group Separator             |
-   +-----------------------+-----------------------------+
-   | ``\x1e``              | Record Separator            |
-   +-----------------------+-----------------------------+
   | ``\x85``              | Next Line (C1 Control Code) |
   +-----------------------+-----------------------------+
   | ``\u2028``            | Line Separator              |
--- a/Misc/NEWS.d/next/Core_and_Builtins/2025-04-10-18-26-49.gh-issue-66428.sJ9yJn.rst
+++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-04-10-18-26-49.gh-issue-66428.sJ9yJn.rst
@@ -0,0 +1,4 @@
+Remove Unicode characters that have the bidirectional B property but are not
+mandatory line breakers (U+001C, U+001D and U+001E) from the list of
+line-breaking characters. ``str.splitlines()`` no longer breaks on these
+characters.
--- a/Objects/unicodetype_db.h
+++ b/Objects/unicodetype_db.h
@@ -2367,7 +2367,7 @@ static const unsigned short index1[] = {

 static const unsigned short index2[] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 2, 2, 2, 1, 3, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 5, 4,
+    0, 0, 0, 0, 1, 1, 1, 1, 3, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 5, 4,
    6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 5, 4, 4, 4, 4, 4, 4, 16, 16, 16, 16,
    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
    16, 16, 16, 16, 4, 4, 4, 5, 17, 5, 18, 18, 18, 18, 18, 18, 18, 18, 18,
@@ -6581,8 +6581,7 @@ int _PyUnicode_IsWhitespace(const Py_UCS4 ch)
 }

 /* Returns 1 for Unicode characters having the line break
- * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional
- * type 'B', 0 otherwise.
+ * property 'BK', 'CR', 'LF' or 'NL', 0 otherwise.
 */
 int _PyUnicode_IsLinebreak(const Py_UCS4 ch)
 {
@@ -6591,9 +6590,6 @@ int _PyUnicode_IsLinebreak(const Py_UCS4 ch)
    case 0x000B:
    case 0x000C:
    case 0x000D:
-    case 0x001C:
-    case 0x001D:
-    case 0x001E:
    case 0x0085:
    case 0x2028:
    case 0x2029:
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@@ -438,7 +438,7 @@ def makeunicodetype(unicode, trace):
                flags |= ALPHA_MASK
            if "Lowercase" in properties:
                flags |= LOWER_MASK
-            if 'Line_Break' in properties or bidirectional == "B":
+            if 'Line_Break' in properties:
                flags |= LINEBREAK_MASK
                linebreaks.append(char)
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
@@ -604,8 +604,7 @@ def makeunicodetype(unicode, trace):

        # Generate code for _PyUnicode_IsLinebreak()
        fprint("/* Returns 1 for Unicode characters having the line break")
-        fprint(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional")
-        fprint(" * type 'B', 0 otherwise.")
+        fprint(" * property 'BK', 'CR', 'LF' or 'NL', 0 otherwise.")
        fprint(" */")
        fprint('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)')
        fprint('{')