# cpython/Lib/test/test_ucn.py

""" Test script for the Unicode implementation.

Written by Bill Tutt.
Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"

import unittest
import unicodedata

from test import support
from http.client import HTTPException
from test.test_normalization import check_version

# The C integer limits come from the _testcapi extension module, which may
# not be importable.
try:
    from _testcapi import INT_MAX, PY_SSIZE_T_MAX, UINT_MAX
except ImportError:
    # Fallback: bind all three names to the same value.  With equal values
    # the ``INT_MAX < PY_SSIZE_T_MAX`` guard on test_issue16335 is false,
    # so that test is skipped instead of failing with a NameError.
    INT_MAX = PY_SSIZE_T_MAX = UINT_MAX = 2**64 - 1

class UnicodeNamesTest(unittest.TestCase):
    """Tests for Unicode character names.

    Covers named-character escapes in string literals, the
    ``unicode-escape`` codec, and ``unicodedata.name()`` /
    ``unicodedata.lookup()``.
    """

    def checkletter(self, name, code):
        """Evaluate the named-character escape for *name* and check it.

        Asserts that the evaluated string equals *code* and returns the
        evaluated string.  Any exception raised while evaluating the
        literal (e.g. SyntaxError for an unknown name) propagates to the
        caller.
        """
        # The \N escape is built inside a raw string and only compiled by
        # eval(), so this script still runs even if the compiler chokes
        # on \N escapes.
        res = eval(r'"\N{%s}"' % name)
        self.assertEqual(res, code)
        return res

    def test_general(self):
        """Spell a sentence with named escapes, mixing the case of names."""
        # General and case insensitivity test:
        chars = [
            "LATIN CAPITAL LETTER T",
            "LATIN SMALL LETTER H",
            "LATIN SMALL LETTER E",
            "SPACE",
            "LATIN SMALL LETTER R",
            "LATIN CAPITAL LETTER E",
            "LATIN SMALL LETTER D",
            "SPACE",
            "LATIN SMALL LETTER f",
            "LATIN CAPITAL LeTtEr o",
            "LATIN SMaLl LETTER x",
            "SPACE",
            "LATIN SMALL LETTER A",
            "LATIN SMALL LETTER T",
            "LATIN SMALL LETTER E",
            "SPACE",
            "LATIN SMALL LETTER T",
            "LATIN SMALL LETTER H",
            "LATIN SMALL LETTER E",
            "SpAcE",
            "LATIN SMALL LETTER S",
            "LATIN SMALL LETTER H",
            "LATIN small LETTER e",
            "LATIN small LETTER e",
            "LATIN SMALL LETTER P",
            "FULL STOP"
        ]
        string = "The rEd fOx ate the sheep."

        # Each name is paired with the character it must produce; the
        # joined results must reproduce the whole sentence.
        self.assertEqual(
            "".join([self.checkletter(*args) for args in zip(chars, string)]),
            string
        )

    def test_ascii_letters(self):
        """Round-trip every lowercase ASCII letter via lookup() and name()."""
        # range() excludes its stop value, so add 1 to include "z" too.
        for code_point in range(ord("a"), ord("z") + 1):
            char = chr(code_point)
            name = "LATIN SMALL LETTER %s" % char.upper()
            code = unicodedata.lookup(name)
            self.assertEqual(code, char)
            self.assertEqual(unicodedata.name(code), name)

    def test_hangul_syllables(self):
        """Check a sample of Hangul syllable names."""
        self.checkletter("HANGUL SYLLABLE GA", "\uac00")
        self.checkletter("HANGUL SYLLABLE GGWEOSS", "\uafe8")
        self.checkletter("HANGUL SYLLABLE DOLS", "\ub3d0")
        self.checkletter("HANGUL SYLLABLE RYAN", "\ub7b8")
        self.checkletter("HANGUL SYLLABLE MWIK", "\ubba0")
        self.checkletter("HANGUL SYLLABLE BBWAEM", "\ubf88")
        self.checkletter("HANGUL SYLLABLE SSEOL", "\uc370")
        self.checkletter("HANGUL SYLLABLE YI", "\uc758")
        self.checkletter("HANGUL SYLLABLE JJYOSS", "\ucb40")
        self.checkletter("HANGUL SYLLABLE KYEOLS", "\ucf28")
        self.checkletter("HANGUL SYLLABLE PAN", "\ud310")
        self.checkletter("HANGUL SYLLABLE HWEOK", "\ud6f8")
        self.checkletter("HANGUL SYLLABLE HIH", "\ud7a3")

        # The code point right after the last syllable checked above
        # must not have a name.
        self.assertRaises(ValueError, unicodedata.name, "\ud7a4")

    def test_cjk_unified_ideographs(self):
        """Check a sample of CJK unified ideograph names."""
        self.checkletter("CJK UNIFIED IDEOGRAPH-3400", "\u3400")
        self.checkletter("CJK UNIFIED IDEOGRAPH-4DB5", "\u4db5")
        self.checkletter("CJK UNIFIED IDEOGRAPH-4E00", "\u4e00")
        self.checkletter("CJK UNIFIED IDEOGRAPH-9FCB", "\u9fCB")
        self.checkletter("CJK UNIFIED IDEOGRAPH-20000", "\U00020000")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2A6D6", "\U0002a6d6")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2A700", "\U0002A700")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2B734", "\U0002B734")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2B740", "\U0002B740")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2B81D", "\U0002B81D")

    def test_bmp_characters(self):
        """Every named code point below 0x10000 must round-trip by name."""
        for code in range(0x10000):
            char = chr(code)
            # A default of None makes name() return instead of raising
            # for code points that have no name.
            name = unicodedata.name(char, None)
            if name is not None:
                self.assertEqual(unicodedata.lookup(name), char)

    def test_misc_symbols(self):
        """Check a few assorted symbol names."""
        self.checkletter("PILCROW SIGN", "\u00b6")
        self.checkletter("REPLACEMENT CHARACTER", "\uFFFD")
        self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", "\uFF9F")
        self.checkletter("FULLWIDTH LATIN SMALL LETTER A", "\uFF41")

    def test_aliases(self):
        """Name aliases resolve to the same character as the real name."""
        # Check that the aliases defined in the NameAliases.txt file work.
        # This should be updated when new aliases are added or the file
        # should be downloaded and parsed instead.  See #12753.
        aliases = [
            ('LATIN CAPITAL LETTER GHA', 0x01A2),
            ('LATIN SMALL LETTER GHA', 0x01A3),
            ('KANNADA LETTER LLLA', 0x0CDE),
            ('LAO LETTER FO FON', 0x0E9D),
            ('LAO LETTER FO FAY', 0x0E9F),
            ('LAO LETTER RO', 0x0EA3),
            ('LAO LETTER LO', 0x0EA5),
            ('TIBETAN MARK BKA- SHOG GI MGO RGYAN', 0x0FD0),
            ('YI SYLLABLE ITERATION MARK', 0xA015),
            ('PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET', 0xFE18),
            ('BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS', 0x1D0C5)
        ]
        for alias, codepoint in aliases:
            self.checkletter(alias, chr(codepoint))
            # name() must report the primary name, never the alias.
            name = unicodedata.name(chr(codepoint))
            self.assertNotEqual(name, alias)
            self.assertEqual(unicodedata.lookup(alias),
                             unicodedata.lookup(name))
            # The 3.2.0 database object must not know the alias.
            with self.assertRaises(KeyError):
                unicodedata.ucd_3_2_0.lookup(alias)

    def test_aliases_names_in_pua_range(self):
        """Code points used internally to store aliases expose no name."""
        # We are storing aliases in the PUA 15, but their names shouldn't leak
        for cp in range(0xf0000, 0xf0100):
            with self.assertRaises(ValueError) as cm:
                unicodedata.name(chr(cp))
            self.assertEqual(str(cm.exception), 'no such name')

    def test_named_sequences_names_in_pua_range(self):
        """Code points used internally for named sequences expose no name."""
        # We are storing named seq in the PUA 15, but their names shouldn't leak
        # range() excludes its stop value; add 1 so U+F0FFF is checked too.
        for cp in range(0xf0100, 0xf0fff + 1):
            with self.assertRaises(ValueError) as cm:
                unicodedata.name(chr(cp))
            self.assertEqual(str(cm.exception), 'no such name')

    def _check_named_sequence(self, seqname, codepoints):
        # Shared assertions for one named sequence: lookup() returns the
        # expected string, the name is rejected inside a string-literal
        # escape, and the 3.2.0 database object does not know it.
        self.assertEqual(unicodedata.lookup(seqname), codepoints)
        with self.assertRaises(SyntaxError):
            self.checkletter(seqname, None)
        with self.assertRaises(KeyError):
            unicodedata.ucd_3_2_0.lookup(seqname)

    def test_named_sequences_sample(self):
        """Check a hard-coded sample of named sequences."""
        # Check a few named sequences.  See #12753.
        sequences = [
            ('LATIN SMALL LETTER R WITH TILDE', '\u0072\u0303'),
            ('TAMIL SYLLABLE SAI', '\u0BB8\u0BC8'),
            ('TAMIL SYLLABLE MOO', '\u0BAE\u0BCB'),
            ('TAMIL SYLLABLE NNOO', '\u0BA3\u0BCB'),
            ('TAMIL CONSONANT KSS', '\u0B95\u0BCD\u0BB7\u0BCD'),
        ]
        for seqname, codepoints in sequences:
            self._check_named_sequence(seqname, codepoints)

    def test_named_sequences_full(self):
        """Check every entry of the downloaded NamedSequences.txt file.

        The test is skipped when the data file cannot be retrieved.
        """
        # Check all the named sequences
        url = ("http://www.pythontest.net/unicode/%s/NamedSequences.txt" %
               unicodedata.unidata_version)
        try:
            testdata = support.open_urlresource(url, encoding="utf-8",
                                                check=check_version)
        except (OSError, HTTPException):
            self.skipTest("Could not retrieve " + url)
        self.addCleanup(testdata.close)
        for line in testdata:
            line = line.strip()
            # Skip blank lines and comment lines.
            if not line or line.startswith('#'):
                continue
            # Data lines have the form "NAME;XXXX XXXX ..." where the
            # right-hand side is a space-separated list of hex code points.
            seqname, codepoints = line.split(';')
            codepoints = ''.join(chr(int(cp, 16)) for cp in codepoints.split())
            self._check_named_sequence(seqname, codepoints)

    def test_errors(self):
        """Bad arguments to name() and lookup() raise the expected errors."""
        self.assertRaises(TypeError, unicodedata.name)
        # name() takes a single character, not a longer string.
        self.assertRaises(TypeError, unicodedata.name, 'xx')
        self.assertRaises(TypeError, unicodedata.lookup)
        self.assertRaises(KeyError, unicodedata.lookup, 'unknown')

    def test_strict_error_handling(self):
        """Malformed named escapes fail to decode with 'strict' errors."""
        # bogus character name
        self.assertRaises(
            UnicodeError,
            str, b"\\N{blah}", 'unicode-escape', 'strict'
        )
        # long bogus character name
        self.assertRaises(
            UnicodeError,
            str, bytes("\\N{%s}" % ("x" * 100000), "ascii"), 'unicode-escape', 'strict'
        )
        # missing closing brace
        self.assertRaises(
            UnicodeError,
            str, b"\\N{SPACE", 'unicode-escape', 'strict'
        )
        # missing opening brace
        self.assertRaises(
            UnicodeError,
            str, b"\\NSPACE", 'unicode-escape', 'strict'
        )

    # NOTE(review): the guard compares INT_MAX while the skip message talks
    # about UINT_MAX -- confirm which one is intended.
    @support.cpython_only
    @unittest.skipUnless(INT_MAX < PY_SSIZE_T_MAX, "needs UINT_MAX < SIZE_MAX")
    @support.bigmemtest(size=UINT_MAX + 1, memuse=2 + 1, dry_run=False)
    def test_issue16335(self, size):
        """A character name longer than UINT_MAX is reported as unknown.

        *size* is supplied by the bigmemtest decorator and is not used
        directly; the test always builds its input from UINT_MAX.
        """
        # very very long bogus character name
        x = b'\\N{SPACE' + b'x' * (UINT_MAX + 1) + b'}'
        self.assertEqual(len(x), len(b'\\N{SPACE}') + (UINT_MAX + 1))
        self.assertRaisesRegex(UnicodeError,
            'unknown Unicode character name',
            x.decode, 'unicode-escape'
        )


# Run this module's tests when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
New test for the ucnhash module. 2000-06-30 09:45:20 +00:00			`""" Test script for the Unicode implementation.`

			`Written by Bill Tutt.`
Move uchhash functionality into unicodedata (after the recent crop of changes, the files are small enough to do this). Also adds "name" and "lookup" functions to unicodedata. 2001-01-24 07:59:11 +00:00			`Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)`
New test for the ucnhash module. 2000-06-30 09:45:20 +00:00
			`(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.`

			`"""#"`
Port test_ucn and test_unicodedata to PyUnit. Add a few tests for error cases increasing coverage in unicodedata.c from 87% to 95% (when the normalization tests are run). From SF patch #662807. 2003-02-26 14:49:41 +00:00
			`import unittest`
#12753: Add support for Unicode name aliases and named sequences. 2011-10-21 21:57:36 +03:00			`import unicodedata`
Port test_ucn and test_unicodedata to PyUnit. Add a few tests for error cases increasing coverage in unicodedata.c from 87% to 95% (when the normalization tests are run). From SF patch #662807. 2003-02-26 14:49:41 +00:00
#2621 rename test.test_support to test.support 2008-05-20 21:35:26 +00:00			`from test import support`
#12753: Add support for Unicode name aliases and named sequences. 2011-10-21 21:57:36 +03:00			`from http.client import HTTPException`
			`from test.test_normalization import check_version`
Port test_ucn and test_unicodedata to PyUnit. Add a few tests for error cases increasing coverage in unicodedata.c from 87% to 95% (when the normalization tests are run). From SF patch #662807. 2003-02-26 14:49:41 +00:00
Issue #20532: Tests which use _testcapi now are marked as CPython only. 2014-02-07 10:06:39 +02:00			`try:`
			`from _testcapi import INT_MAX, PY_SSIZE_T_MAX, UINT_MAX`
			`except ImportError:`
			`INT_MAX = PY_SSIZE_T_MAX = UINT_MAX = 2**64 - 1`

Port test_ucn and test_unicodedata to PyUnit. Add a few tests for error cases increasing coverage in unicodedata.c from 87% to 95% (when the normalization tests are run). From SF patch #662807. 2003-02-26 14:49:41 +00:00			`class UnicodeNamesTest(unittest.TestCase):`

			`def checkletter(self, name, code):`
			`# Helper that put all \N escapes inside eval'd raw strings,`
Whitespace normalization. 2003-03-07 17:30:48 +00:00			`# to make sure this script runs even if the compiler`
Port test_ucn and test_unicodedata to PyUnit. Add a few tests for error cases increasing coverage in unicodedata.c from 87% to 95% (when the normalization tests are run). From SF patch #662807. 2003-02-26 14:49:41 +00:00			`# chokes on \N escapes`
SF patch# 1757758 by Alexandre Vassalotti, fixing test_ucn. 2007-07-21 00:15:34 +00:00			`res = eval(r'"\N{%s}"' % name)`
Port test_ucn and test_unicodedata to PyUnit. Add a few tests for error cases increasing coverage in unicodedata.c from 87% to 95% (when the normalization tests are run). From SF patch #662807. 2003-02-26 14:49:41 +00:00			`self.assertEqual(res, code)`
			`return res`

			`def test_general(self):`
			`# General and case insensitivity test:`
			`chars = [`
			`"LATIN CAPITAL LETTER T",`
			`"LATIN SMALL LETTER H",`
			`"LATIN SMALL LETTER E",`
			`"SPACE",`
			`"LATIN SMALL LETTER R",`
			`"LATIN CAPITAL LETTER E",`
			`"LATIN SMALL LETTER D",`
			`"SPACE",`
			`"LATIN SMALL LETTER f",`
			`"LATIN CAPITAL LeTtEr o",`
			`"LATIN SMaLl LETTER x",`
			`"SPACE",`
			`"LATIN SMALL LETTER A",`
			`"LATIN SMALL LETTER T",`
			`"LATIN SMALL LETTER E",`
			`"SPACE",`
			`"LATIN SMALL LETTER T",`
			`"LATIN SMALL LETTER H",`
			`"LATIN SMALL LETTER E",`
			`"SpAcE",`
			`"LATIN SMALL LETTER S",`
			`"LATIN SMALL LETTER H",`
			`"LATIN small LETTER e",`
			`"LATIN small LETTER e",`
			`"LATIN SMALL LETTER P",`
			`"FULL STOP"`
			`]`
Rip out all the u"..." literals and calls to unicode(). 2007-05-02 19:09:54 +00:00			`string = "The rEd fOx ate the sheep."`
Port test_ucn and test_unicodedata to PyUnit. Add a few tests for error cases increasing coverage in unicodedata.c from 87% to 95% (when the normalization tests are run). From SF patch #662807. 2003-02-26 14:49:41 +00:00
			`self.assertEqual(`
Rip out all the u"..." literals and calls to unicode(). 2007-05-02 19:09:54 +00:00			`"".join([self.checkletter(*args) for args in zip(chars, string)]),`
Port test_ucn and test_unicodedata to PyUnit. Add a few tests for error cases increasing coverage in unicodedata.c from 87% to 95% (when the normalization tests are run). From SF patch #662807. 2003-02-26 14:49:41 +00:00			`string`
			`)`

			`def test_ascii_letters(self):`
Merged revisions 55007-55179 via svnmerge from svn+ssh://pythondev@svn.python.org/python/branches/p3yk ........ r55077 \| guido.van.rossum \| 2007-05-02 11:54:37 -0700 (Wed, 02 May 2007) \| 2 lines Use the new print syntax, at least. ........ r55142 \| fred.drake \| 2007-05-04 21:27:30 -0700 (Fri, 04 May 2007) \| 1 line remove old cruftiness ........ r55143 \| fred.drake \| 2007-05-04 21:52:16 -0700 (Fri, 04 May 2007) \| 1 line make this work with the new Python ........ r55162 \| neal.norwitz \| 2007-05-06 22:29:18 -0700 (Sun, 06 May 2007) \| 1 line Get asdl code gen working with Python 2.3. Should continue to work with 3.0 ........ r55164 \| neal.norwitz \| 2007-05-07 00:00:38 -0700 (Mon, 07 May 2007) \| 1 line Verify checkins to p3yk (sic) branch go to 3000 list. ........ r55166 \| neal.norwitz \| 2007-05-07 00:12:35 -0700 (Mon, 07 May 2007) \| 1 line Fix this test so it runs again by importing warnings_test properly. ........ r55167 \| neal.norwitz \| 2007-05-07 01:03:22 -0700 (Mon, 07 May 2007) \| 8 lines So long xrange. range() now supports values that are outside -sys.maxint to sys.maxint. floats raise a TypeError. This has been sitting for a long time. It probably has some problems and needs cleanup. Objects/rangeobject.c now uses 4-space indents since it is almost completely new. ........ r55171 \| guido.van.rossum \| 2007-05-07 10:21:26 -0700 (Mon, 07 May 2007) \| 4 lines Fix two tests that were previously depending on significant spaces at the end of a line (and before that on Python 2.x print behavior that has no exact equivalent in 3.0). ........ 2007-05-07 22:24:25 +00:00			`for char in "".join(map(chr, range(ord("a"), ord("z")))):`
Port test_ucn and test_unicodedata to PyUnit. Add a few tests for error cases increasing coverage in unicodedata.c from 87% to 95% (when the normalization tests are run). From SF patch #662807. 2003-02-26 14:49:41 +00:00			`name = "LATIN SMALL LETTER %s" % char.upper()`
			`code = unicodedata.lookup(name)`
			`self.assertEqual(unicodedata.name(code), name)`

			`def test_hangul_syllables(self):`
Rip out all the u"..." literals and calls to unicode(). 2007-05-02 19:09:54 +00:00			`self.checkletter("HANGUL SYLLABLE GA", "\uac00")`
			`self.checkletter("HANGUL SYLLABLE GGWEOSS", "\uafe8")`
			`self.checkletter("HANGUL SYLLABLE DOLS", "\ub3d0")`
			`self.checkletter("HANGUL SYLLABLE RYAN", "\ub7b8")`
			`self.checkletter("HANGUL SYLLABLE MWIK", "\ubba0")`
			`self.checkletter("HANGUL SYLLABLE BBWAEM", "\ubf88")`
			`self.checkletter("HANGUL SYLLABLE SSEOL", "\uc370")`
			`self.checkletter("HANGUL SYLLABLE YI", "\uc758")`
			`self.checkletter("HANGUL SYLLABLE JJYOSS", "\ucb40")`
			`self.checkletter("HANGUL SYLLABLE KYEOLS", "\ucf28")`
			`self.checkletter("HANGUL SYLLABLE PAN", "\ud310")`
			`self.checkletter("HANGUL SYLLABLE HWEOK", "\ud6f8")`
			`self.checkletter("HANGUL SYLLABLE HIH", "\ud7a3")`
Port test_ucn and test_unicodedata to PyUnit. Add a few tests for error cases increasing coverage in unicodedata.c from 87% to 95% (when the normalization tests are run). From SF patch #662807. 2003-02-26 14:49:41 +00:00
Rip out all the u"..." literals and calls to unicode(). 2007-05-02 19:09:54 +00:00			`self.assertRaises(ValueError, unicodedata.name, "\ud7a4")`
Port test_ucn and test_unicodedata to PyUnit. Add a few tests for error cases increasing coverage in unicodedata.c from 87% to 95% (when the normalization tests are run). From SF patch #662807. 2003-02-26 14:49:41 +00:00
			`def test_cjk_unified_ideographs(self):`
Rip out all the u"..." literals and calls to unicode(). 2007-05-02 19:09:54 +00:00			`self.checkletter("CJK UNIFIED IDEOGRAPH-3400", "\u3400")`
			`self.checkletter("CJK UNIFIED IDEOGRAPH-4DB5", "\u4db5")`
			`self.checkletter("CJK UNIFIED IDEOGRAPH-4E00", "\u4e00")`
Issue #10459: Update CJK character names to Unicode 6.0. 2010-11-22 09:00:02 +00:00			`self.checkletter("CJK UNIFIED IDEOGRAPH-9FCB", "\u9fCB")`
Rip out all the u"..." literals and calls to unicode(). 2007-05-02 19:09:54 +00:00			`self.checkletter("CJK UNIFIED IDEOGRAPH-20000", "\U00020000")`
			`self.checkletter("CJK UNIFIED IDEOGRAPH-2A6D6", "\U0002a6d6")`
Issue #10459: Update CJK character names to Unicode 6.0. 2010-11-22 09:00:02 +00:00			`self.checkletter("CJK UNIFIED IDEOGRAPH-2A700", "\U0002A700")`
			`self.checkletter("CJK UNIFIED IDEOGRAPH-2B734", "\U0002B734")`
			`self.checkletter("CJK UNIFIED IDEOGRAPH-2B740", "\U0002B740")`
			`self.checkletter("CJK UNIFIED IDEOGRAPH-2B81D", "\U0002B81D")`
Port test_ucn and test_unicodedata to PyUnit. Add a few tests for error cases increasing coverage in unicodedata.c from 87% to 95% (when the normalization tests are run). From SF patch #662807. 2003-02-26 14:49:41 +00:00
			`def test_bmp_characters(self):`
Merged revisions 55007-55179 via svnmerge from svn+ssh://pythondev@svn.python.org/python/branches/p3yk ........ r55077 \| guido.van.rossum \| 2007-05-02 11:54:37 -0700 (Wed, 02 May 2007) \| 2 lines Use the new print syntax, at least. ........ r55142 \| fred.drake \| 2007-05-04 21:27:30 -0700 (Fri, 04 May 2007) \| 1 line remove old cruftiness ........ r55143 \| fred.drake \| 2007-05-04 21:52:16 -0700 (Fri, 04 May 2007) \| 1 line make this work with the new Python ........ r55162 \| neal.norwitz \| 2007-05-06 22:29:18 -0700 (Sun, 06 May 2007) \| 1 line Get asdl code gen working with Python 2.3. Should continue to work with 3.0 ........ r55164 \| neal.norwitz \| 2007-05-07 00:00:38 -0700 (Mon, 07 May 2007) \| 1 line Verify checkins to p3yk (sic) branch go to 3000 list. ........ r55166 \| neal.norwitz \| 2007-05-07 00:12:35 -0700 (Mon, 07 May 2007) \| 1 line Fix this test so it runs again by importing warnings_test properly. ........ r55167 \| neal.norwitz \| 2007-05-07 01:03:22 -0700 (Mon, 07 May 2007) \| 8 lines So long xrange. range() now supports values that are outside -sys.maxint to sys.maxint. floats raise a TypeError. This has been sitting for a long time. It probably has some problems and needs cleanup. Objects/rangeobject.c now uses 4-space indents since it is almost completely new. ........ r55171 \| guido.van.rossum \| 2007-05-07 10:21:26 -0700 (Mon, 07 May 2007) \| 4 lines Fix two tests that were previously depending on significant spaces at the end of a line (and before that on Python 2.x print behavior that has no exact equivalent in 3.0). ........ 2007-05-07 22:24:25 +00:00			`for code in range(0x10000):`
Rename 'unicode' to 'str' in its tp_name field. Rename 'str' to 'str8'. Change all occurrences of unichr to chr. 2007-05-03 17:18:26 +00:00			`char = chr(code)`
Port test_ucn and test_unicodedata to PyUnit. Add a few tests for error cases increasing coverage in unicodedata.c from 87% to 95% (when the normalization tests are run). From SF patch #662807. 2003-02-26 14:49:41 +00:00			`name = unicodedata.name(char, None)`
			`if name is not None:`
			`self.assertEqual(unicodedata.lookup(name), char)`

			`def test_misc_symbols(self):`
Rip out all the u"..." literals and calls to unicode(). 2007-05-02 19:09:54 +00:00			`self.checkletter("PILCROW SIGN", "\u00b6")`
			`self.checkletter("REPLACEMENT CHARACTER", "\uFFFD")`
			`self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", "\uFF9F")`
			`self.checkletter("FULLWIDTH LATIN SMALL LETTER A", "\uFF41")`
Port test_ucn and test_unicodedata to PyUnit. Add a few tests for error cases increasing coverage in unicodedata.c from 87% to 95% (when the normalization tests are run). From SF patch #662807. 2003-02-26 14:49:41 +00:00
#12753: Add support for Unicode name aliases and named sequences. 2011-10-21 21:57:36 +03:00			`def test_aliases(self):`
			`# Check that the aliases defined in the NameAliases.txt file work.`
			`# This should be updated when new aliases are added or the file`
			`# should be downloaded and parsed instead. See #12753.`
			`aliases = [`
			`('LATIN CAPITAL LETTER GHA', 0x01A2),`
			`('LATIN SMALL LETTER GHA', 0x01A3),`
			`('KANNADA LETTER LLLA', 0x0CDE),`
			`('LAO LETTER FO FON', 0x0E9D),`
			`('LAO LETTER FO FAY', 0x0E9F),`
			`('LAO LETTER RO', 0x0EA3),`
			`('LAO LETTER LO', 0x0EA5),`
			`('TIBETAN MARK BKA- SHOG GI MGO RGYAN', 0x0FD0),`
			`('YI SYLLABLE ITERATION MARK', 0xA015),`
			`('PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET', 0xFE18),`
			`('BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS', 0x1D0C5)`
			`]`
			`for alias, codepoint in aliases:`
			`self.checkletter(alias, chr(codepoint))`
			`name = unicodedata.name(chr(codepoint))`
			`self.assertNotEqual(name, alias)`
			`self.assertEqual(unicodedata.lookup(alias),`
			`unicodedata.lookup(name))`
			`with self.assertRaises(KeyError):`
			`unicodedata.ucd_3_2_0.lookup(alias)`

			`def test_aliases_names_in_pua_range(self):`
			`# We are storing aliases in the PUA 15, but their names shouldn't leak`
			`for cp in range(0xf0000, 0xf0100):`
			`with self.assertRaises(ValueError) as cm:`
			`unicodedata.name(chr(cp))`
			`self.assertEqual(str(cm.exception), 'no such name')`

			`def test_named_sequences_names_in_pua_range(self):`
			`# We are storing named seq in the PUA 15, but their names shouldn't leak`
			`for cp in range(0xf0100, 0xf0fff):`
			`with self.assertRaises(ValueError) as cm:`
			`unicodedata.name(chr(cp))`
			`self.assertEqual(str(cm.exception), 'no such name')`

			`def test_named_sequences_sample(self):`
			`# Check a few named sequences. See #12753.`
			`sequences = [`
			`('LATIN SMALL LETTER R WITH TILDE', '\u0072\u0303'),`
			`('TAMIL SYLLABLE SAI', '\u0BB8\u0BC8'),`
			`('TAMIL SYLLABLE MOO', '\u0BAE\u0BCB'),`
			`('TAMIL SYLLABLE NNOO', '\u0BA3\u0BCB'),`
			`('TAMIL CONSONANT KSS', '\u0B95\u0BCD\u0BB7\u0BCD'),`
			`]`
			`for seqname, codepoints in sequences:`
			`self.assertEqual(unicodedata.lookup(seqname), codepoints)`
			`with self.assertRaises(SyntaxError):`
			`self.checkletter(seqname, None)`
			`with self.assertRaises(KeyError):`
			`unicodedata.ucd_3_2_0.lookup(seqname)`

			`def test_named_sequences_full(self):`
			`# Check all the named sequences`
#22650: test suite: load Unicode test data files from www.pythontest.net 2014-11-06 14:37:49 +01:00			`url = ("http://www.pythontest.net/unicode/%s/NamedSequences.txt" %`
#12753: Add support for Unicode name aliases and named sequences. 2011-10-21 21:57:36 +03:00			`unicodedata.unidata_version)`
			`try:`
			`testdata = support.open_urlresource(url, encoding="utf-8",`
			`check=check_version)`
Replace IOError with OSError (#16715) 2012-12-25 16:47:37 +02:00			`except (OSError, HTTPException):`
#12753: Add support for Unicode name aliases and named sequences. 2011-10-21 21:57:36 +03:00			`self.skipTest("Could not retrieve " + url)`
			`self.addCleanup(testdata.close)`
			`for line in testdata:`
			`line = line.strip()`
			`if not line or line.startswith('#'):`
			`continue`
			`seqname, codepoints = line.split(';')`
			`codepoints = ''.join(chr(int(cp, 16)) for cp in codepoints.split())`
			`self.assertEqual(unicodedata.lookup(seqname), codepoints)`
			`with self.assertRaises(SyntaxError):`
			`self.checkletter(seqname, None)`
			`with self.assertRaises(KeyError):`
			`unicodedata.ucd_3_2_0.lookup(seqname)`

Port test_ucn and test_unicodedata to PyUnit. Add a few tests for error cases increasing coverage in unicodedata.c from 87% to 95% (when the normalization tests are run). From SF patch #662807. 2003-02-26 14:49:41 +00:00			`def test_errors(self):`
			`self.assertRaises(TypeError, unicodedata.name)`
Rip out all the u"..." literals and calls to unicode(). 2007-05-02 19:09:54 +00:00			`self.assertRaises(TypeError, unicodedata.name, 'xx')`
Port test_ucn and test_unicodedata to PyUnit. Add a few tests for error cases increasing coverage in unicodedata.c from 87% to 95% (when the normalization tests are run). From SF patch #662807. 2003-02-26 14:49:41 +00:00			`self.assertRaises(TypeError, unicodedata.lookup)`
Rip out all the u"..." literals and calls to unicode(). 2007-05-02 19:09:54 +00:00			`self.assertRaises(KeyError, unicodedata.lookup, 'unknown')`
Port test_ucn and test_unicodedata to PyUnit. Add a few tests for error cases increasing coverage in unicodedata.c from 87% to 95% (when the normalization tests are run). From SF patch #662807. 2003-02-26 14:49:41 +00:00
SF patch# 1757758 by Alexandre Vassalotti, fixing test_ucn. 2007-07-21 00:15:34 +00:00			`def test_strict_error_handling(self):`
Port test_ucn and test_unicodedata to PyUnit. Add a few tests for error cases increasing coverage in unicodedata.c from 87% to 95% (when the normalization tests are run). From SF patch #662807. 2003-02-26 14:49:41 +00:00			`# bogus character name`
			`self.assertRaises(`
			`UnicodeError,`
SF patch# 1757758 by Alexandre Vassalotti, fixing test_ucn. 2007-07-21 00:15:34 +00:00			`str, b"\\N{blah}", 'unicode-escape', 'strict'`
Port test_ucn and test_unicodedata to PyUnit. Add a few tests for error cases increasing coverage in unicodedata.c from 87% to 95% (when the normalization tests are run). From SF patch #662807. 2003-02-26 14:49:41 +00:00			`)`
			`# long bogus character name`
			`self.assertRaises(`
			`UnicodeError,`
Changes in anticipation of stricter str vs. bytes enforcement. 2007-08-27 18:31:48 +00:00			`str, bytes("\\N{%s}" % ("x" * 100000), "ascii"), 'unicode-escape', 'strict'`
Port test_ucn and test_unicodedata to PyUnit. Add a few tests for error cases increasing coverage in unicodedata.c from 87% to 95% (when the normalization tests are run). From SF patch #662807. 2003-02-26 14:49:41 +00:00			`)`
			`# missing closing brace`
			`self.assertRaises(`
			`UnicodeError,`
SF patch# 1757758 by Alexandre Vassalotti, fixing test_ucn. 2007-07-21 00:15:34 +00:00			`str, b"\\N{SPACE", 'unicode-escape', 'strict'`
Port test_ucn and test_unicodedata to PyUnit. Add a few tests for error cases increasing coverage in unicodedata.c from 87% to 95% (when the normalization tests are run). From SF patch #662807. 2003-02-26 14:49:41 +00:00			`)`
			`# missing opening brace`
			`self.assertRaises(`
			`UnicodeError,`
SF patch# 1757758 by Alexandre Vassalotti, fixing test_ucn. 2007-07-21 00:15:34 +00:00			`str, b"\\NSPACE", 'unicode-escape', 'strict'`
Port test_ucn and test_unicodedata to PyUnit. Add a few tests for error cases increasing coverage in unicodedata.c from 87% to 95% (when the normalization tests are run). From SF patch #662807. 2003-02-26 14:49:41 +00:00			`)`

Issue #20532: Tests which use _testcapi now are marked as CPython only. 2014-02-07 10:06:39 +02:00			`@support.cpython_only`
			`@unittest.skipUnless(INT_MAX < PY_SSIZE_T_MAX, "needs UINT_MAX < SIZE_MAX")`
			`@support.bigmemtest(size=UINT_MAX + 1, memuse=2 + 1, dry_run=False)`
Add bigmemtest decorator to test of issue #16335. 2013-01-21 20:23:58 +02:00			`def test_issue16335(self, size):`
Issue #16335: Fix integer overflow in unicode-escape decoder. 2013-01-21 11:38:00 +02:00			`# very very long bogus character name`
Issue #20532: Tests which use _testcapi now are marked as CPython only. 2014-02-07 10:06:39 +02:00			`x = b'\\N{SPACE' + b'x' * (UINT_MAX + 1) + b'}'`
			`self.assertEqual(len(x), len(b'\\N{SPACE}') + (UINT_MAX + 1))`
Add bigmemtest decorator to test of issue #16335. 2013-01-21 20:23:58 +02:00			`self.assertRaisesRegex(UnicodeError,`
			`'unknown Unicode character name',`
			`x.decode, 'unicode-escape'`
			`)`
Issue #16335: Fix integer overflow in unicode-escape decoder. 2013-01-21 11:38:00 +02:00

Port test_ucn and test_unicodedata to PyUnit. Add a few tests for error cases increasing coverage in unicodedata.c from 87% to 95% (when the normalization tests are run). From SF patch #662807. 2003-02-26 14:49:41 +00:00			`if __name__ == "__main__":`
Issue #21741: Update 147 test modules to use test discovery. I have compared output between pre- and post-patch runs of these tests to make sure there's nothing missing and nothing broken, on both Windows and Linux. The only differences I found were actually tests that were previously not run. 2015-04-13 15:00:43 -05:00			`unittest.main()`