Update Unicode database to Unicode 4.1.

2025-11-01 06:01:29 +00:00 · 2006-03-09 23:38:20 +00:00 · 2006-03-09 23:38:20 +00:00 · 480f1bb67b
commit 480f1bb67b
parent e2b4677253
12 changed files with 17302 additions and 13365 deletions
--- a/Doc/lib/libunicodedata.tex
+++ b/Doc/lib/libunicodedata.tex
@ -14,11 +14,11 @@
 This module provides access to the Unicode Character Database which
 defines character properties for all Unicode characters. The data in
 this database is based on the \file{UnicodeData.txt} file version
-3.2.0 which is publically available from \url{ftp://ftp.unicode.org/}.
+4.1.0 which is publicly available from \url{ftp://ftp.unicode.org/}.
 The module uses the same names and symbols as defined by the
-UnicodeData File Format 3.2.0 (see
+UnicodeData File Format 4.1.0 (see
-\url{http://www.unicode.org/Public/3.2-Update/UnicodeData-3.2.0.html}).  It
+\url{http://www.unicode.org/Public/4.1-Update/UnicodeData-4.1.0.html}).  It
 defines the following functions:
 \begin{funcdesc}{lookup}{name}
@ -130,3 +130,12 @@ The version of the Unicode database used in this module.
 \versionadded{2.3}
 \end{datadesc}
 \begin{datadesc}{db_3_2_0}
 This is an object that has the same methods as the entire
 module, but uses the Unicode database version 3.2 instead,
 for applications that require this specific version of
 the Unicode database (such as IDNA).
 \versionadded{2.5}
 \end{datadesc}
--- a/Include/ucnhash.h
+++ b/Include/ucnhash.h
@ -14,12 +14,14 @@ typedef struct {
    int size;
    /* Get name for a given character code.  Returns non-zero if
-       success, zero if not.  Does not set Python exceptions. */
+       success, zero if not.  Does not set Python exceptions. 
-    int (*getname)(Py_UCS4 code, char* buffer, int buflen);
+       If self is NULL, data come from the default version of the database.
       If it is not NULL, it should be a unicodedata.db_X_Y_Z object */
    int (*getname)(PyObject *self, Py_UCS4 code, char* buffer, int buflen);
    /* Get character code for a given name.  Same error handling
       as for getname. */
-    int (*getcode)(const char* name, int namelen, Py_UCS4* code);
+    int (*getcode)(PyObject *self, const char* name, int namelen, Py_UCS4* code);
 } _PyUnicode_Name_CAPI;
--- a/Lib/encodings/idna.py
+++ b/Lib/encodings/idna.py
@ -1,6 +1,7 @@
 # This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)
-import stringprep, unicodedata, re, codecs
+import stringprep, re, codecs
 from unicodedata import db_3_2_0 as unicodedata
 # IDNA section 3.1
 dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]")
--- a/Lib/stringprep.py
+++ b/Lib/stringprep.py
@ -5,7 +5,7 @@
 and mappings, for which a mapping function is provided.
 """
-import unicodedata
+from unicodedata import db_3_2_0 as unicodedata
 assert unicodedata.unidata_version == '3.2.0'
--- a/Lib/test/test_unicodedata.py
+++ b/Lib/test/test_unicodedata.py
@ -16,7 +16,7 @@
 class UnicodeMethodsTest(unittest.TestCase):
    # update this, if the database changes
-    expectedchecksum = 'a37276dc2c158bef6dfd908ad34525c97180fad9'
+    expectedchecksum = 'a6555cd209d960dcfa17bfdce0c96d91cfa9a9ba'
    def test_method_checksum(self):
        h = sha.sha()
@ -75,7 +75,7 @@ def tearDown(self):
 class UnicodeFunctionsTest(UnicodeDatabaseTest):
    # update this, if the database changes
-    expectedchecksum = 'cfe20a967a450ebc82ca68c3e4eed344164e11af'
+    expectedchecksum = 'b45b79f3203ee1a896d9b5655484adaff5d4964b'
    def test_function_checksum(self):
        data = []
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -279,6 +279,10 @@ Core and builtins
 Extension Modules
 -----------------
 - The unicodedata module was updated to the 4.1 version of the Unicode
  database. The 3.2 version is still available as unicodedata.db_3_2_0
  for applications that require this specific version (such as IDNA).
 - The timing module is no longer built by default.  It was deprecated
  in PEP 4 in Python 2.0 or earlier.
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@ -14,6 +14,7 @@
 #include "Python.h"
 #include "ucnhash.h"
 #include "structmember.h"
 /* character properties */
@ -28,6 +29,14 @@ typedef struct {
 						   _PyUnicode_EastAsianWidth */
 } _PyUnicode_DatabaseRecord;
 typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const int numeric_changed;
 } change_record;
 /* data file generated by Tools/unicode/makeunicodedata.py */
 #include "unicodedata_db.h"
@ -51,6 +60,85 @@ _getrecord(PyUnicodeObject* v)
    return _getrecord_ex(*PyUnicode_AS_UNICODE(v));
 }
 /* ------------- Previous-version API ------------------------------------- */
 /* Python object representing an older version of the Unicode database.
    It carries no data of its own; lookups go through the two function
    pointers, which refer to generated per-version helpers. */
 typedef struct previous_version {
    PyObject_HEAD
    /* Version string; exposed to Python as the "unidata_version" member. */
    const char *name;
    /* Returns the change record for a code point in this version. */
    const change_record* (*getrecord)(Py_UCS4);
    /* Returns the old 1:1 decomposition target for a code point, or 0 if
       normalization of that code point did not change. */
    Py_UCS4 (*normalization)(Py_UCS4);
 } PreviousDBVersion;
 /* Look up the change record of code point v in the old-version object self.
    self is cast unchecked, so callers must only use this when self is a
    non-NULL PreviousDBVersion. */
 #define get_old_record(self, v)    ((((PreviousDBVersion*)self)->getrecord)(v))
 /* Forward declaration */
 static PyMethodDef unicodedata_functions[];
 /* Data members of old-version database objects: a read-only
    "unidata_version" string backed by PreviousDBVersion.name. */
 static PyMemberDef DB_members[] = {
 	{"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
        {NULL}  /* sentinel */
 };
 /* Type object for old-version database objects ("unicodedata.DB").
    Instances are created by new_previous_version().  The type reuses the
    module's function table as its method table, so an instance offers the
    same functions as the module, and exposes unidata_version via DB_members.
    NOTE(review): the ob_type initialisation mentioned below is not visible
    in this view -- confirm that the module init function performs it. */
 static PyTypeObject Xxo_Type = {
 	/* The ob_type field must be initialized in the module init function
 	 * to be portable to Windows without using C++. */
 	PyObject_HEAD_INIT(NULL)
 	0,			/*ob_size*/
 	"unicodedata.DB",		/*tp_name*/
 	sizeof(PreviousDBVersion),	/*tp_basicsize*/
 	0,			/*tp_itemsize*/
 	/* methods */
 	(destructor)PyObject_Del, /*tp_dealloc*/
 	0,			/*tp_print*/
 	0,                      /*tp_getattr*/
 	0,			/*tp_setattr*/
 	0,			/*tp_compare*/
 	0,			/*tp_repr*/
 	0,			/*tp_as_number*/
 	0,			/*tp_as_sequence*/
 	0,			/*tp_as_mapping*/
 	0,			/*tp_hash*/
        0,                      /*tp_call*/
        0,                      /*tp_str*/
        PyObject_GenericGetAttr,/*tp_getattro*/
        0,                      /*tp_setattro*/
        0,                      /*tp_as_buffer*/
        Py_TPFLAGS_DEFAULT,     /*tp_flags*/
        0,                      /*tp_doc*/
        0,                      /*tp_traverse*/
        0,                      /*tp_clear*/
        0,                      /*tp_richcompare*/
        0,                      /*tp_weaklistoffset*/
        0,                      /*tp_iter*/
        0,                      /*tp_iternext*/
        unicodedata_functions,  /*tp_methods*/
        DB_members,             /*tp_members*/
        0,                      /*tp_getset*/
        0,                      /*tp_base*/
        0,                      /*tp_dict*/
        0,                      /*tp_descr_get*/
        0,                      /*tp_descr_set*/
        0,                      /*tp_dictoffset*/
        0,                      /*tp_init*/
        0,                      /*tp_alloc*/
        0,                      /*tp_new*/
        0,                      /*tp_free*/
        0,                      /*tp_is_gc*/
 };
 /* Create an object representing an older version of the Unicode database.

    name          - version string; the pointer is stored, not copied, so it
                    must stay valid for the lifetime of the object
    getrecord     - returns the change record for a code point
    normalization - returns the old normalization mapping for a code point

    Returns a new object, or NULL if PyObject_New fails. */
 static PyObject*
 new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
                     Py_UCS4 (*normalization)(Py_UCS4))
 {
 	PreviousDBVersion *self;
 	self = PyObject_New(PreviousDBVersion, &Xxo_Type);
 	if (self == NULL)
 		return NULL;
 	/* Plain pointer assignments: nothing is copied or reference-counted. */
 	self->name = name;
 	self->getrecord = getrecord;
        self->normalization = normalization;
 	return (PyObject*)self;
 }
 /* --- Module API --------------------------------------------------------- */
 PyDoc_STRVAR(unicodedata_decimal__doc__,
@ -65,6 +153,7 @@ unicodedata_decimal(PyObject *self, PyObject *args)
 {
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    int have_old = 0;
    long rc;
    if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
@ -74,7 +163,22 @@ unicodedata_decimal(PyObject *self, PyObject *args)
 			"need a single Unicode character as parameter");
        return NULL;
    }
-    rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v));
+
    if (self) {
        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
        if (old->category_changed == 0) {
            /* unassigned */
            have_old = 1;
            rc = -1;
        } 
        else if (old->decimal_changed != 0xFF) {
            have_old = 1;
            rc = old->decimal_changed;
        }
    }
    if (!have_old)
        rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v));
    if (rc < 0) {
 	if (defobj == NULL) {
 	    PyErr_SetString(PyExc_ValueError,
@ -136,6 +240,7 @@ unicodedata_numeric(PyObject *self, PyObject *args)
 {
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    int have_old = 0;
    double rc;
    if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
@ -145,7 +250,22 @@ unicodedata_numeric(PyObject *self, PyObject *args)
 			"need a single Unicode character as parameter");
 	return NULL;
    }
-    rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
+
    if (self) {
        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
        if (old->category_changed == 0) {
            /* unassigned */
            have_old = 1;
            rc = -1;
        } 
        else if (old->decimal_changed != 0xFF) {
            have_old = 1;
            rc = old->decimal_changed;
        }
    }
    if (!have_old)
        rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
    if (rc < 0) {
 	if (defobj == NULL) {
 	    PyErr_SetString(PyExc_ValueError, "not a numeric character");
@ -180,6 +300,11 @@ unicodedata_category(PyObject *self, PyObject *args)
 	return NULL;
    }
    index = (int) _getrecord(v)->category;
    if (self) {
        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
        if (old->category_changed != 0xFF)
            index = old->category_changed;
    }
    return PyString_FromString(_PyUnicode_CategoryNames[index]);
 }
@ -205,6 +330,13 @@ unicodedata_bidirectional(PyObject *self, PyObject *args)
 	return NULL;
    }
    index = (int) _getrecord(v)->bidirectional;
    if (self) {
        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
        if (old->category_changed == 0)
            index = 0; /* unassigned */
        else if (old->bidir_changed != 0xFF)
            index = old->bidir_changed;
    }
    return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
 }
@ -219,6 +351,7 @@ static PyObject *
 unicodedata_combining(PyObject *self, PyObject *args)
 {
    PyUnicodeObject *v;
    int index;
    if (!PyArg_ParseTuple(args, "O!:combining",
 			  &PyUnicode_Type, &v))
@ -228,7 +361,13 @@ unicodedata_combining(PyObject *self, PyObject *args)
 			"need a single Unicode character as parameter");
 	return NULL;
    }
-    return PyInt_FromLong((int) _getrecord(v)->combining);
+    index = (int) _getrecord(v)->combining;
    if (self) {
        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
        if (old->category_changed == 0)
            index = 0; /* unassigned */
    }
    return PyInt_FromLong(index);
 }
 PyDoc_STRVAR(unicodedata_mirrored__doc__,
@ -242,6 +381,7 @@ static PyObject *
 unicodedata_mirrored(PyObject *self, PyObject *args)
 {
    PyUnicodeObject *v;
    int index;
    if (!PyArg_ParseTuple(args, "O!:mirrored",
 			  &PyUnicode_Type, &v))
@ -251,7 +391,13 @@ unicodedata_mirrored(PyObject *self, PyObject *args)
 			"need a single Unicode character as parameter");
 	return NULL;
    }
-    return PyInt_FromLong((int) _getrecord(v)->mirrored);
+    index = (int) _getrecord(v)->mirrored;
    if (self) {
        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
        if (old->category_changed == 0)
            index = 0; /* unassigned */
    }
    return PyInt_FromLong(index);
 }
 PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
@ -275,6 +421,11 @@ unicodedata_east_asian_width(PyObject *self, PyObject *args)
 	return NULL;
    }
    index = (int) _getrecord(v)->east_asian_width;
    if (self) {
        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
        if (old->category_changed == 0)
            index = 0; /* unassigned */
    }
    return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
 }
@ -303,6 +454,12 @@ unicodedata_decomposition(PyObject *self, PyObject *args)
    code = (int) *PyUnicode_AS_UNICODE(v);
    if (self) {
        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
        if (old->category_changed == 0)
            return PyString_FromString(""); /* unassigned */
    }
    if (code < 0 || code >= 0x110000)
        index = 0;
    else {
@ -337,11 +494,14 @@ unicodedata_decomposition(PyObject *self, PyObject *args)
 }
 void
-get_decomp_record(Py_UCS4 code, int *index, int *prefix, int *count)
+get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
 {
    if (code >= 0x110000) {
        *index = 0;
-    } 
+    } else if (self && get_old_record(self, code)->category_changed==0) {
        /* unassigned in old version */
        *index = 0;
    }
    else {
        *index = decomp_index1[(code>>DECOMP_SHIFT)];
        *index = decomp_index2[(*index<<DECOMP_SHIFT)+
@ -367,7 +527,7 @@ get_decomp_record(Py_UCS4 code, int *index, int *prefix, int *count)
 #define SCount  (LCount*NCount)
 static PyObject*
-nfd_nfkd(PyObject *input, int k)
+nfd_nfkd(PyObject *self, PyObject *input, int k)
 {
    PyObject *result;
    Py_UNICODE *i, *end, *o;
@ -416,8 +576,17 @@ nfd_nfkd(PyObject *input, int k)
                }
                continue;
            }
-            /* Other decompoistions. */
+            /* normalization changes */
-            get_decomp_record(code, &index, &prefix, &count);
+            if (self) {
                Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
                if (value != 0) {
                    stack[stackptr++] = value;
                    continue;
                }
            }
            /* Other decompositions. */
            get_decomp_record(self, code, &index, &prefix, &count);
            /* Copy character if it is not decomposable, or has a
               compatibility decomposition, but we do NFD. */
@ -467,7 +636,7 @@ nfd_nfkd(PyObject *input, int k)
 }
 static int
-find_nfc_index(struct reindex* nfc, Py_UNICODE code)
+find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
 {
    int index;
    for (index = 0; nfc[index].start; index++) {
@ -483,7 +652,7 @@ find_nfc_index(struct reindex* nfc, Py_UNICODE code)
 }
 static PyObject*
-nfc_nfkc(PyObject *input, int k)
+nfc_nfkc(PyObject *self, PyObject *input, int k)
 {
    PyObject *result;
    Py_UNICODE *i, *i1, *o, *end;
@ -492,7 +661,7 @@ nfc_nfkc(PyObject *input, int k)
    Py_UNICODE *skipped[20];
    int cskipped = 0;
-    result = nfd_nfkd(input, k);
+    result = nfd_nfkd(self, input, k);
    if (!result)
        return NULL;
@ -536,7 +705,7 @@ nfc_nfkc(PyObject *input, int k)
          continue;
      }
-      f = find_nfc_index(nfc_first, *i);
+      f = find_nfc_index(self, nfc_first, *i);
      if (f == -1) {
          *o++ = *i++;
          continue;
@ -551,7 +720,7 @@ nfc_nfkc(PyObject *input, int k)
              i1++;
              continue;
          }
-          l = find_nfc_index(nfc_last, *i1);
+          l = find_nfc_index(self, nfc_last, *i1);
          /* *i1 cannot be combined with *i. If *i1
             is a starter, we don't need to look further.
             Otherwise, record the combining class. */
@ -575,7 +744,7 @@ nfc_nfkc(PyObject *input, int k)
          /* Mark the second character unused. */
          skipped[cskipped++] = i1;
          i1++;
-          f = find_nfc_index(nfc_first, *i);
+          f = find_nfc_index(self, nfc_first, *i);
          if (f == -1)
              break;
      }
@ -610,13 +779,13 @@ unicodedata_normalize(PyObject *self, PyObject *args)
    }
    if (strcmp(form, "NFC") == 0)
-        return nfc_nfkc(input, 0);
+        return nfc_nfkc(self, input, 0);
    if (strcmp(form, "NFKC") == 0)
-        return nfc_nfkc(input, 1);
+        return nfc_nfkc(self, input, 1);
    if (strcmp(form, "NFD") == 0)
-        return nfd_nfkd(input, 0);
+        return nfd_nfkd(self, input, 0);
    if (strcmp(form, "NFKD") == 0)
-        return nfd_nfkd(input, 1);
+        return nfd_nfkd(self, input, 1);
    PyErr_SetString(PyExc_ValueError, "invalid normalization form");
    return NULL;
 }
@ -686,7 +855,7 @@ is_unified_ideograph(Py_UCS4 code)
 }
 static int
-_getucname(Py_UCS4 code, char* buffer, int buflen)
+_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
 {
    int offset;
    int i;
@ -726,6 +895,15 @@ _getucname(Py_UCS4 code, char* buffer, int buflen)
    if (code >= 0x110000)
        return 0;
    if (self) {
        const change_record *old = get_old_record(self, code);
        if (old->category_changed == 0) {
            /* unassigned */
            return 0;
        } 
    }
    /* get offset into phrasebook */
    offset = phrasebook_offset1[(code>>phrasebook_shift)];
    offset = phrasebook_offset2[(offset<<phrasebook_shift) +
@ -768,12 +946,12 @@ _getucname(Py_UCS4 code, char* buffer, int buflen)
 }
 static int
-_cmpname(int code, const char* name, int namelen)
+_cmpname(PyObject *self, int code, const char* name, int namelen)
 {
    /* check if code corresponds to the given name */
    int i;
    char buffer[NAME_MAXLEN];
-    if (!_getucname(code, buffer, sizeof(buffer)))
+    if (!_getucname(self, code, buffer, sizeof(buffer)))
        return 0;
    for (i = 0; i < namelen; i++) {
        if (toupper(name[i]) != buffer[i])
@ -803,7 +981,7 @@ find_syllable(const char *str, int *len, int *pos, int count, int column)
 }
 static int
-_getcode(const char* name, int namelen, Py_UCS4* code)
+_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
 {
    unsigned int h, v;
    unsigned int mask = code_size-1;
@ -860,7 +1038,7 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
    v = code_hash[i];
    if (!v)
        return 0;
-    if (_cmpname(v, name, namelen)) {
+    if (_cmpname(self, v, name, namelen)) {
        *code = v;
        return 1;
    }
@ -872,7 +1050,7 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
        v = code_hash[i];
        if (!v)
            return 0;
-        if (_cmpname(v, name, namelen)) {
+        if (_cmpname(self, v, name, namelen)) {
            *code = v;
            return 1;
        }
@ -914,8 +1092,8 @@ unicodedata_name(PyObject* self, PyObject* args)
 	return NULL;
    }
-    if (!_getucname((Py_UCS4) *PyUnicode_AS_UNICODE(v),
+    if (!_getucname(self, (Py_UCS4) *PyUnicode_AS_UNICODE(v),
-                             name, sizeof(name))) {
+                    name, sizeof(name))) {
 	if (defobj == NULL) {
 	    PyErr_SetString(PyExc_ValueError, "no such name");
            return NULL;
@ -947,7 +1125,7 @@ unicodedata_lookup(PyObject* self, PyObject* args)
    if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
        return NULL;
-    if (!_getcode(name, namelen, &code)) {
+    if (!_getcode(self, name, namelen, &code)) {
        char fmt[] = "undefined character name '%s'";
        char *buf = PyMem_MALLOC(sizeof(fmt) + namelen);
        sprintf(buf, fmt, name);
@ -985,6 +1163,8 @@ static PyMethodDef unicodedata_functions[] = {
    {NULL, NULL}		/* sentinel */
 };
 PyDoc_STRVAR(unicodedata_docstring,
 "This module provides access to the Unicode Character Database which\n\
 defines character properties for all Unicode characters. The data in\n\
@ -1007,6 +1187,11 @@ initunicodedata(void)
    PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
    /* Previous versions */
    v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
    if (v != NULL)
        PyModule_AddObject(m, "db_3_2_0", v);
    /* Export C API */
    v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
    if (v != NULL)
--- a/Modules/unicodedata_db.h
+++ b/Modules/unicodedata_db.h
--- a/Modules/unicodename_db.h
+++ b/Modules/unicodename_db.h
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -1898,7 +1898,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
                    /* found a name.  look it up in the unicode database */
                    message = "unknown Unicode character name";
                    s++;
-                    if (ucnhash_CAPI->getcode(start, (int)(s-start-1), &chr))
+                    if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
                        goto store;
                }
            }
--- a/Objects/unicodetype_db.h
+++ b/Objects/unicodetype_db.h
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@ -26,13 +26,15 @@
 import sys
 SCRIPT = sys.argv[0]
-VERSION = "2.3"
+VERSION = "2.5"
 # The Unicode Database
-UNIDATA_VERSION = "3.2.0"
+UNIDATA_VERSION = "4.1.0"
-UNICODE_DATA = "UnicodeData.txt"
+UNICODE_DATA = "UnicodeData%s.txt"
-COMPOSITION_EXCLUSIONS = "CompositionExclusions.txt"
+COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
-EASTASIAN_WIDTH = "EastAsianWidth.txt"
+EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
 old_versions = ["3.2.0"]
 CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
@ -57,13 +59,23 @@
 def maketables(trace=0):
-    print "--- Reading", UNICODE_DATA, "..."
+    print "--- Reading", UNICODE_DATA % "", "..."
-    unicode = UnicodeData(UNICODE_DATA, COMPOSITION_EXCLUSIONS,
+    version = ""
-                          EASTASIAN_WIDTH)
+    unicode = UnicodeData(UNICODE_DATA % version,
                          COMPOSITION_EXCLUSIONS % version,
                          EASTASIAN_WIDTH % version)
    print len(filter(None, unicode.table)), "characters"
    for version in old_versions:
        print "--- Reading", UNICODE_DATA % ("-"+version), "..."
        old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
                                  COMPOSITION_EXCLUSIONS % ("-"+version),
                                  EASTASIAN_WIDTH % ("-"+version))
        print len(filter(None, old_unicode.table)), "characters"
        merge_old_version(version, unicode, old_unicode)
    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)
@ -119,6 +131,8 @@ def makeunicodedata(unicode, trace):
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
@ -278,6 +292,44 @@ def makeunicodedata(unicode, trace):
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)
    # Generate delta tables for old versions
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print >>fp, "static const change_record change_records_%s[] = {" % cversion
        for record in records:
            print >>fp, "\t{ %s }," % ", ".join(map(str,record))
        print >>fp, "};"
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tint index;"
        print >>fp, "\tif (n >= 0x110000) index = 0;"
        print >>fp, "\telse {"
        print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift)
        print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1))
        print >>fp, "\t}"
        print >>fp, "\treturn change_records_%s+index;" % cversion
        print >>fp, "}\n"
        print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tswitch(n) {"
        for k, v in normalization:
            print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v)
        print >>fp, "\tdefault: return 0;"
        print >>fp, "\t}\n}\n"
    fp.close()
 # --------------------------------------------------------------------
@ -540,6 +592,82 @@ def cmpwords((aword, alist),(bword, blist)):
    fp.close()
 def merge_old_version(version, new, old):
    """Record how the database `old` differs from the database `new`.

    `version` is the version string of the old database; `new` and `old`
    are UnicodeData instances.  For every code point the old values of the
    fields that differ are collected, and the result is appended to
    new.changed as a tuple
    (version, per-code-point change records, normalization changes),
    where each change record is (bidir, category, decimal, numeric).

    Raises NotImplementedError if the composition exclusions differ, and a
    locally defined Difference exception if a field differs for which no
    handling is implemented.  Several assumptions about the data are
    checked with assert statements.
    """
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError, "exclusions differ"

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            # k is the field position within the character's record
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    # the change tables always store the OLD value
                    value = old.table[i][k]
                    if k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        assert value != "0" and value != "-1"
                        if not value:
                            numeric_changes[i] = -1
                        else:
                            # NOTE(review): needs the re module at module
                            # level; the import is not visible here -- confirm
                            assert re.match("^[0-9]+$", value)
                            numeric_changes[i] = int(value)
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
                    elif k == 12:
                        # change to simple uppercase mapping; ignore
                        pass
                    elif k == 13:
                        # change to simple lowercase mapping; ignore
                        pass
                    elif k == 14:
                        # change to simple titlecase mapping; ignore
                        pass
                    else:
                        # any other differing field is unsupported: fail loudly
                        class Difference(Exception):pass
                        raise Difference, (hex(i), k, old.table[i], new.table[i])
    # the order of the zipped fields must match the C struct change_record
    new.changed.append((version, zip(bidir_changes, category_changes,
                                     decimal_changes, numeric_changes),
                        normalization_changes))
 # --------------------------------------------------------------------
 # the following support code is taken from the unidb utilities
 # Copyright (c) 1999-2000 by Secret Labs AB
@ -551,6 +679,7 @@ def cmpwords((aword, alist),(bword, blist)):
 class UnicodeData:
    def __init__(self, filename, exclusions, eastasianwidth, expand=1):
        self.changed = []
        file = open(filename)
        table = [None] * 0x110000
        while 1:
@ -569,13 +698,14 @@ def __init__(self, filename, exclusions, eastasianwidth, expand=1):
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
-                        field = s[:]
+                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
-                    field[0] = hex(i)
+                    f2 = field[:]
-                    table[i] = field
+                    f2[0] = "%X" % i
                    table[i] = f2
        # public attributes
        self.filename = filename