closes gh-138706: update Unicode to 17.0.0 (#138719)

2025-12-08 06:10:17 +00:00 · 2025-09-11 09:58:39 -07:00 · 2025-09-11 09:58:39 -07:00 · 5bd4bf04c4
commit 5bd4bf04c4
parent e0f54a608e
11 changed files with 22094 additions and 22031 deletions
--- a/Doc/library/stdtypes.rst
+++ b/Doc/library/stdtypes.rst
@ -1843,9 +1843,9 @@ expression support in the :mod:`re` module).
   lowercase, :meth:`lower` would do nothing to ``'ß'``; :meth:`casefold`
   converts it to ``"ss"``.

-   The casefolding algorithm is
-   `described in section 3.13 'Default Case Folding' of the Unicode Standard
-   <https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G33992>`__.
+   The casefolding algorithm is `described in section 3.13.3 'Default Case
+   Folding' of the Unicode Standard
+   <https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G53253>`__.

   .. versionadded:: 3.3

@ -2056,7 +2056,7 @@ expression support in the :mod:`re` module).
   property being one of "Lm", "Lt", "Lu", "Ll", or "Lo".  Note that this is different
   from the `Alphabetic property defined in the section 4.10 'Letters, Alphabetic, and
   Ideographic' of the Unicode Standard
-   <https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-4/#G91002>`_.
+   <https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-4/#G91002>`__.


 .. method:: str.isascii()
@ -2196,9 +2196,9 @@ expression support in the :mod:`re` module).
   Return a copy of the string with all the cased characters [4]_ converted to
   lowercase.

-   The lowercasing algorithm used is
-   `described in section 3.13 'Default Case Folding' of the Unicode Standard
-   <https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G33992>`__.
+   The lowercasing algorithm used is `described in section 3.13.2 'Default Case
+   Conversion' of the Unicode Standard
+   <https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G34078>`__.


 .. method:: str.lstrip(chars=None, /)
@ -2561,9 +2561,9 @@ expression support in the :mod:`re` module).
   character(s) is not "Lu" (Letter, uppercase), but e.g. "Lt" (Letter,
   titlecase).

-   The uppercasing algorithm used is
-   `described in section 3.13 'Default Case Folding' of the Unicode Standard
-   <https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G33992>`__.
+   The uppercasing algorithm used is `described in section 3.13.2 'Default Case
+   Conversion' of the Unicode Standard
+   <https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-3/#G34078>`__.


 .. method:: str.zfill(width, /)
--- a/Doc/library/unicodedata.rst
+++ b/Doc/library/unicodedata.rst
@ -17,8 +17,8 @@

 This module provides access to the Unicode Character Database (UCD) which
 defines character properties for all Unicode characters. The data contained in
-this database is compiled from the `UCD version 16.0.0
-<https://www.unicode.org/Public/16.0.0/ucd>`_.
+this database is compiled from the `UCD version 17.0.0
+<https://www.unicode.org/Public/17.0.0/ucd>`_.

 The module uses the same names and symbols as defined by Unicode
 Standard Annex #44, `"Unicode Character Database"
@ -211,6 +211,6 @@ In addition, the module exposes the following constant:

 .. rubric:: Footnotes

-.. [#] https://www.unicode.org/Public/16.0.0/ucd/NameAliases.txt
+.. [#] https://www.unicode.org/Public/17.0.0/ucd/NameAliases.txt

-.. [#] https://www.unicode.org/Public/16.0.0/ucd/NamedSequences.txt
+.. [#] https://www.unicode.org/Public/17.0.0/ucd/NamedSequences.txt
--- a/Doc/reference/lexical_analysis.rst
+++ b/Doc/reference/lexical_analysis.rst
@ -384,8 +384,8 @@ Character Database.


 .. _UAX-31: https://www.unicode.org/reports/tr31/
-.. _PropList.txt: https://www.unicode.org/Public/16.0.0/ucd/PropList.txt
-.. _DerivedCoreProperties.txt: https://www.unicode.org/Public/16.0.0/ucd/DerivedCoreProperties.txt
+.. _PropList.txt: https://www.unicode.org/Public/17.0.0/ucd/PropList.txt
+.. _DerivedCoreProperties.txt: https://www.unicode.org/Public/17.0.0/ucd/DerivedCoreProperties.txt
 .. _normalization form: https://www.unicode.org/reports/tr15/#Norm_Forms


@ -793,7 +793,7 @@ with the given *name*::
 This sequence cannot appear in :ref:`bytes literals <bytes-literal>`.

 .. versionchanged:: 3.3
-   Support for `name aliases <https://www.unicode.org/Public/16.0.0/ucd/NameAliases.txt>`__
+   Support for `name aliases <https://www.unicode.org/Public/17.0.0/ucd/NameAliases.txt>`__
   has been added.

 .. _string-escape-long-hex:
--- a/Doc/whatsnew/3.15.rst
+++ b/Doc/whatsnew/3.15.rst
@ -648,6 +648,12 @@ typing
  (Contributed by Nikita Sobolev in :gh:`137191`.)


+unicodedata
+-----------
+
+* The Unicode database has been updated to Unicode 17.0.0.
+
+
 wave
 ----

--- a/Lib/test/test_unicodedata.py
+++ b/Lib/test/test_unicodedata.py
@ -24,7 +24,7 @@
 class UnicodeMethodsTest(unittest.TestCase):

    # update this, if the database changes
-    expectedchecksum = '9e43ee3929471739680c0e705482b4ae1c4122e4'
+    expectedchecksum = '8b2615a9fc627676cbc0b6fac0191177df97ef5f'

    @requires_resource('cpu')
    def test_method_checksum(self):
@ -77,7 +77,7 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):

    # Update this if the database changes. Make sure to do a full rebuild
    # (e.g. 'make distclean && make') to get the correct checksum.
-    expectedchecksum = '23ab09ed4abdf93db23b97359108ed630dd8311d'
+    expectedchecksum = '65670ae03a324c5f9e826a4de3e25bae4d73c9b7'

    @requires_resource('cpu')
    def test_function_checksum(self):
--- a/Misc/NEWS.d/next/Library/2025-09-09-10-48-26.gh-issue-138706.xB--LX.rst
+++ b/Misc/NEWS.d/next/Library/2025-09-09-10-48-26.gh-issue-138706.xB--LX.rst
@ -0,0 +1 @@
+Update :mod:`unicodedata` database to Unicode 17.0.0.
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@ -1020,13 +1020,14 @@ is_unified_ideograph(Py_UCS4 code)
        (0x3400 <= code && code <= 0x4DBF)   || /* CJK Ideograph Extension A */
        (0x4E00 <= code && code <= 0x9FFF)   || /* CJK Ideograph */
        (0x20000 <= code && code <= 0x2A6DF) || /* CJK Ideograph Extension B */
-        (0x2A700 <= code && code <= 0x2B739) || /* CJK Ideograph Extension C */
+        (0x2A700 <= code && code <= 0x2B73F) || /* CJK Ideograph Extension C */
        (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
-        (0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */
+        (0x2B820 <= code && code <= 0x2CEAD) || /* CJK Ideograph Extension E */
        (0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */
        (0x2EBF0 <= code && code <= 0x2EE5D) || /* CJK Ideograph Extension I */
        (0x30000 <= code && code <= 0x3134A) || /* CJK Ideograph Extension G */
-        (0x31350 <= code && code <= 0x323AF);   /* CJK Ideograph Extension H */
+        (0x31350 <= code && code <= 0x323AF) || /* CJK Ideograph Extension H */
+        (0x323B0 <= code && code <= 0x33479);   /* CJK Ideograph Extension J */
 }

 /* macros used to determine if the given code point is in the PUA range that
--- a/Modules/unicodedata_db.h
+++ b/Modules/unicodedata_db.h
--- a/Modules/unicodename_db.h
+++ b/Modules/unicodename_db.h
--- a/Objects/unicodetype_db.h
+++ b/Objects/unicodetype_db.h
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@ -44,7 +44,7 @@
 #   * Doc/library/stdtypes.rst,
 #   * Doc/library/unicodedata.rst, and
 #   * Doc/reference/lexical_analysis.rst (three occurrences)
-UNIDATA_VERSION = "16.0.0"
+UNIDATA_VERSION = "17.0.0"
 UNICODE_DATA = "UnicodeData%s.txt"
 COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
 EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
@ -104,13 +104,14 @@
    ('3400', '4DBF'),    # CJK Ideograph Extension A
    ('4E00', '9FFF'),    # CJK Ideograph
    ('20000', '2A6DF'),  # CJK Ideograph Extension B
-    ('2A700', '2B739'),  # CJK Ideograph Extension C
+    ('2A700', '2B73F'),  # CJK Ideograph Extension C
    ('2B740', '2B81D'),  # CJK Ideograph Extension D
-    ('2B820', '2CEA1'),  # CJK Ideograph Extension E
+    ('2B820', '2CEAD'),  # CJK Ideograph Extension E
    ('2CEB0', '2EBE0'),  # CJK Ideograph Extension F
    ('2EBF0', '2EE5D'),  # CJK Ideograph Extension I
    ('30000', '3134A'),  # CJK Ideograph Extension G
    ('31350', '323AF'),  # CJK Ideograph Extension H
+    ('323B0', '33479'),  # CJK Ideograph Extension J
 ]
				`@ -0,0 +1 @@`
				Update :mod:`unicodedata` database to Unicode 17.0.0.