mirror of
				https://github.com/python/cpython.git
				synced 2025-11-01 06:01:29 +00:00 
			
		
		
		
	Update Unicode database to Unicode 4.1.
This commit is contained in:
		
							parent
							
								
									e2b4677253
								
							
						
					
					
						commit
						480f1bb67b
					
				
					 12 changed files with 17302 additions and 13365 deletions
				
			
		|  | @ -14,11 +14,11 @@ | ||||||
| This module provides access to the Unicode Character Database which | This module provides access to the Unicode Character Database which | ||||||
| defines character properties for all Unicode characters. The data in | defines character properties for all Unicode characters. The data in | ||||||
| this database is based on the \file{UnicodeData.txt} file version | this database is based on the \file{UnicodeData.txt} file version | ||||||
| 3.2.0 which is publicly available from \url{ftp://ftp.unicode.org/}. | 4.1.0 which is publicly available from \url{ftp://ftp.unicode.org/}. | ||||||
| 
 | 
 | ||||||
| The module uses the same names and symbols as defined by the | The module uses the same names and symbols as defined by the | ||||||
| UnicodeData File Format 3.2.0 (see | UnicodeData File Format 4.1.0 (see | ||||||
| \url{http://www.unicode.org/Public/3.2-Update/UnicodeData-3.2.0.html}).  It | \url{http://www.unicode.org/Public/4.1-Update/UnicodeData-4.1.0.html}).  It | ||||||
| defines the following functions: | defines the following functions: | ||||||
| 
 | 
 | ||||||
| \begin{funcdesc}{lookup}{name} | \begin{funcdesc}{lookup}{name} | ||||||
|  | @ -130,3 +130,12 @@ The version of the Unicode database used in this module. | ||||||
| 
 | 
 | ||||||
| \versionadded{2.3} | \versionadded{2.3} | ||||||
| \end{datadesc} | \end{datadesc} | ||||||
|  | 
 | ||||||
|  | \begin{datadesc}{db_3_2_0} | ||||||
|  | This is an object that has the same methods as the entire | ||||||
|  | module, but uses the Unicode database version 3.2 instead, | ||||||
|  | for applications that require this specific version of | ||||||
|  | the Unicode database (such as IDNA). | ||||||
|  | 
 | ||||||
|  | \versionadded{2.5} | ||||||
|  | \end{datadesc} | ||||||
|  |  | ||||||
|  | @ -14,12 +14,14 @@ typedef struct { | ||||||
|     int size; |     int size; | ||||||
| 
 | 
 | ||||||
|     /* Get name for a given character code.  Returns non-zero if
 |     /* Get name for a given character code.  Returns non-zero if
 | ||||||
|        success, zero if not.  Does not set Python exceptions. */ |        success, zero if not.  Does not set Python exceptions.  | ||||||
|     int (*getname)(Py_UCS4 code, char* buffer, int buflen); |        If self is NULL, data come from the default version of the database. | ||||||
|  |        If it is not NULL, it should be a unicodedata.db_X_Y_Z object */ | ||||||
|  |     int (*getname)(PyObject *self, Py_UCS4 code, char* buffer, int buflen); | ||||||
| 
 | 
 | ||||||
|     /* Get character code for a given name.  Same error handling
 |     /* Get character code for a given name.  Same error handling
 | ||||||
|        as for getname. */ |        as for getname. */ | ||||||
|     int (*getcode)(const char* name, int namelen, Py_UCS4* code); |     int (*getcode)(PyObject *self, const char* name, int namelen, Py_UCS4* code); | ||||||
| 
 | 
 | ||||||
| } _PyUnicode_Name_CAPI; | } _PyUnicode_Name_CAPI; | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -1,6 +1,7 @@ | ||||||
| # This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep) | # This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep) | ||||||
| 
 | 
 | ||||||
| import stringprep, unicodedata, re, codecs | import stringprep, re, codecs | ||||||
|  | from unicodedata import db_3_2_0 as unicodedata | ||||||
| 
 | 
 | ||||||
| # IDNA section 3.1 | # IDNA section 3.1 | ||||||
| dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]") | dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]") | ||||||
|  |  | ||||||
|  | @ -5,7 +5,7 @@ | ||||||
| and mappings, for which a mapping function is provided. | and mappings, for which a mapping function is provided. | ||||||
| """ | """ | ||||||
| 
 | 
 | ||||||
| import unicodedata | from unicodedata import db_3_2_0 as unicodedata | ||||||
| 
 | 
 | ||||||
| assert unicodedata.unidata_version == '3.2.0' | assert unicodedata.unidata_version == '3.2.0' | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -16,7 +16,7 @@ | ||||||
| class UnicodeMethodsTest(unittest.TestCase): | class UnicodeMethodsTest(unittest.TestCase): | ||||||
| 
 | 
 | ||||||
|     # update this, if the database changes |     # update this, if the database changes | ||||||
|     expectedchecksum = 'a37276dc2c158bef6dfd908ad34525c97180fad9' |     expectedchecksum = 'a6555cd209d960dcfa17bfdce0c96d91cfa9a9ba' | ||||||
| 
 | 
 | ||||||
|     def test_method_checksum(self): |     def test_method_checksum(self): | ||||||
|         h = sha.sha() |         h = sha.sha() | ||||||
|  | @ -75,7 +75,7 @@ def tearDown(self): | ||||||
| class UnicodeFunctionsTest(UnicodeDatabaseTest): | class UnicodeFunctionsTest(UnicodeDatabaseTest): | ||||||
| 
 | 
 | ||||||
|     # update this, if the database changes |     # update this, if the database changes | ||||||
|     expectedchecksum = 'cfe20a967a450ebc82ca68c3e4eed344164e11af' |     expectedchecksum = 'b45b79f3203ee1a896d9b5655484adaff5d4964b' | ||||||
| 
 | 
 | ||||||
|     def test_function_checksum(self): |     def test_function_checksum(self): | ||||||
|         data = [] |         data = [] | ||||||
|  |  | ||||||
|  | @ -279,6 +279,10 @@ Core and builtins | ||||||
| Extension Modules | Extension Modules | ||||||
| ----------------- | ----------------- | ||||||
| 
 | 
 | ||||||
|  | - The unicodedata module was updated to the 4.1 version of the Unicode | ||||||
|  |   database. The 3.2 version is still available as unicodedata.db_3_2_0 | ||||||
|  |   for applications that require this specific version (such as IDNA). | ||||||
|  | 
 | ||||||
| - The timing module is no longer built by default.  It was deprecated | - The timing module is no longer built by default.  It was deprecated | ||||||
|   in PEP 4 in Python 2.0 or earlier. |   in PEP 4 in Python 2.0 or earlier. | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -14,6 +14,7 @@ | ||||||
| 
 | 
 | ||||||
| #include "Python.h" | #include "Python.h" | ||||||
| #include "ucnhash.h" | #include "ucnhash.h" | ||||||
|  | #include "structmember.h" | ||||||
| 
 | 
 | ||||||
| /* character properties */ | /* character properties */ | ||||||
| 
 | 
 | ||||||
|  | @ -28,6 +29,14 @@ typedef struct { | ||||||
| 						   _PyUnicode_EastAsianWidth */ | 						   _PyUnicode_EastAsianWidth */ | ||||||
| } _PyUnicode_DatabaseRecord; | } _PyUnicode_DatabaseRecord; | ||||||
| 
 | 
 | ||||||
|  | typedef struct change_record { | ||||||
|  |     /* sequence of fields should be the same as in merge_old_version */ | ||||||
|  |     const unsigned char bidir_changed; | ||||||
|  |     const unsigned char category_changed; | ||||||
|  |     const unsigned char decimal_changed; | ||||||
|  |     const int numeric_changed; | ||||||
|  | } change_record; | ||||||
|  | 
 | ||||||
| /* data file generated by Tools/unicode/makeunicodedata.py */ | /* data file generated by Tools/unicode/makeunicodedata.py */ | ||||||
| #include "unicodedata_db.h" | #include "unicodedata_db.h" | ||||||
| 
 | 
 | ||||||
|  | @ -51,6 +60,85 @@ _getrecord(PyUnicodeObject* v) | ||||||
|     return _getrecord_ex(*PyUnicode_AS_UNICODE(v)); |     return _getrecord_ex(*PyUnicode_AS_UNICODE(v)); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | /* ------------- Previous-version API ------------------------------------- */ | ||||||
|  | typedef struct previous_version { | ||||||
|  |     PyObject_HEAD | ||||||
|  |     const char *name; | ||||||
|  |     const change_record* (*getrecord)(Py_UCS4); | ||||||
|  |     Py_UCS4 (*normalization)(Py_UCS4); | ||||||
|  | } PreviousDBVersion; | ||||||
|  | 
 | ||||||
|  | #define get_old_record(self, v)    ((((PreviousDBVersion*)self)->getrecord)(v)) | ||||||
|  | 
 | ||||||
|  | /* Forward declaration */ | ||||||
|  | static PyMethodDef unicodedata_functions[]; | ||||||
|  | 
 | ||||||
|  | static PyMemberDef DB_members[] = { | ||||||
|  | 	{"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY}, | ||||||
|  |         {NULL} | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | static PyTypeObject Xxo_Type = { | ||||||
|  | 	/* The ob_type field must be initialized in the module init function
 | ||||||
|  | 	 * to be portable to Windows without using C++. */ | ||||||
|  | 	PyObject_HEAD_INIT(NULL) | ||||||
|  | 	0,			/*ob_size*/ | ||||||
|  | 	"unicodedata.DB",		/*tp_name*/ | ||||||
|  | 	sizeof(PreviousDBVersion),	/*tp_basicsize*/ | ||||||
|  | 	0,			/*tp_itemsize*/ | ||||||
|  | 	/* methods */ | ||||||
|  | 	(destructor)PyObject_Del, /*tp_dealloc*/ | ||||||
|  | 	0,			/*tp_print*/ | ||||||
|  | 	0,                      /*tp_getattr*/ | ||||||
|  | 	0,			/*tp_setattr*/ | ||||||
|  | 	0,			/*tp_compare*/ | ||||||
|  | 	0,			/*tp_repr*/ | ||||||
|  | 	0,			/*tp_as_number*/ | ||||||
|  | 	0,			/*tp_as_sequence*/ | ||||||
|  | 	0,			/*tp_as_mapping*/ | ||||||
|  | 	0,			/*tp_hash*/ | ||||||
|  |         0,                      /*tp_call*/ | ||||||
|  |         0,                      /*tp_str*/ | ||||||
|  |         PyObject_GenericGetAttr,/*tp_getattro*/ | ||||||
|  |         0,                      /*tp_setattro*/ | ||||||
|  |         0,                      /*tp_as_buffer*/ | ||||||
|  |         Py_TPFLAGS_DEFAULT,     /*tp_flags*/ | ||||||
|  |         0,                      /*tp_doc*/ | ||||||
|  |         0,                      /*tp_traverse*/ | ||||||
|  |         0,                      /*tp_clear*/ | ||||||
|  |         0,                      /*tp_richcompare*/ | ||||||
|  |         0,                      /*tp_weaklistoffset*/ | ||||||
|  |         0,                      /*tp_iter*/ | ||||||
|  |         0,                      /*tp_iternext*/ | ||||||
|  |         unicodedata_functions,  /*tp_methods*/ | ||||||
|  |         DB_members,             /*tp_members*/ | ||||||
|  |         0,                      /*tp_getset*/ | ||||||
|  |         0,                      /*tp_base*/ | ||||||
|  |         0,                      /*tp_dict*/ | ||||||
|  |         0,                      /*tp_descr_get*/ | ||||||
|  |         0,                      /*tp_descr_set*/ | ||||||
|  |         0,                      /*tp_dictoffset*/ | ||||||
|  |         0,                      /*tp_init*/ | ||||||
|  |         0,                      /*tp_alloc*/ | ||||||
|  |         0,                      /*tp_new*/ | ||||||
|  |         0,                      /*tp_free*/ | ||||||
|  |         0,                      /*tp_is_gc*/ | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | static PyObject* | ||||||
|  | new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4), | ||||||
|  |                      Py_UCS4 (*normalization)(Py_UCS4)) | ||||||
|  | { | ||||||
|  | 	PreviousDBVersion *self; | ||||||
|  | 	self = PyObject_New(PreviousDBVersion, &Xxo_Type); | ||||||
|  | 	if (self == NULL) | ||||||
|  | 		return NULL; | ||||||
|  | 	self->name = name; | ||||||
|  | 	self->getrecord = getrecord; | ||||||
|  |         self->normalization = normalization; | ||||||
|  | 	return (PyObject*)self; | ||||||
|  | } | ||||||
|  | 
 | ||||||
| /* --- Module API --------------------------------------------------------- */ | /* --- Module API --------------------------------------------------------- */ | ||||||
| 
 | 
 | ||||||
| PyDoc_STRVAR(unicodedata_decimal__doc__, | PyDoc_STRVAR(unicodedata_decimal__doc__, | ||||||
|  | @ -65,6 +153,7 @@ unicodedata_decimal(PyObject *self, PyObject *args) | ||||||
| { | { | ||||||
|     PyUnicodeObject *v; |     PyUnicodeObject *v; | ||||||
|     PyObject *defobj = NULL; |     PyObject *defobj = NULL; | ||||||
|  |     int have_old = 0; | ||||||
|     long rc; |     long rc; | ||||||
| 
 | 
 | ||||||
|     if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj)) |     if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj)) | ||||||
|  | @ -74,7 +163,22 @@ unicodedata_decimal(PyObject *self, PyObject *args) | ||||||
| 			"need a single Unicode character as parameter"); | 			"need a single Unicode character as parameter"); | ||||||
|         return NULL; |         return NULL; | ||||||
|     } |     } | ||||||
|     rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v)); | 
 | ||||||
|  |     if (self) { | ||||||
|  |         const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v)); | ||||||
|  |         if (old->category_changed == 0) { | ||||||
|  |             /* unassigned */ | ||||||
|  |             have_old = 1; | ||||||
|  |             rc = -1; | ||||||
|  |         }  | ||||||
|  |         else if (old->decimal_changed != 0xFF) { | ||||||
|  |             have_old = 1; | ||||||
|  |             rc = old->decimal_changed; | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     if (!have_old) | ||||||
|  |         rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v)); | ||||||
|     if (rc < 0) { |     if (rc < 0) { | ||||||
| 	if (defobj == NULL) { | 	if (defobj == NULL) { | ||||||
| 	    PyErr_SetString(PyExc_ValueError, | 	    PyErr_SetString(PyExc_ValueError, | ||||||
|  | @ -136,6 +240,7 @@ unicodedata_numeric(PyObject *self, PyObject *args) | ||||||
| { | { | ||||||
|     PyUnicodeObject *v; |     PyUnicodeObject *v; | ||||||
|     PyObject *defobj = NULL; |     PyObject *defobj = NULL; | ||||||
|  |     int have_old = 0; | ||||||
|     double rc; |     double rc; | ||||||
| 
 | 
 | ||||||
|     if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj)) |     if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj)) | ||||||
|  | @ -145,7 +250,22 @@ unicodedata_numeric(PyObject *self, PyObject *args) | ||||||
| 			"need a single Unicode character as parameter"); | 			"need a single Unicode character as parameter"); | ||||||
| 	return NULL; | 	return NULL; | ||||||
|     } |     } | ||||||
|     rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v)); | 
 | ||||||
|  |     if (self) { | ||||||
|  |         const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v)); | ||||||
|  |         if (old->category_changed == 0) { | ||||||
|  |             /* unassigned */ | ||||||
|  |             have_old = 1; | ||||||
|  |             rc = -1; | ||||||
|  |         }  | ||||||
|  |         else if (old->decimal_changed != 0xFF) { | ||||||
|  |             have_old = 1; | ||||||
|  |             rc = old->decimal_changed; | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     if (!have_old) | ||||||
|  |         rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v)); | ||||||
|     if (rc < 0) { |     if (rc < 0) { | ||||||
| 	if (defobj == NULL) { | 	if (defobj == NULL) { | ||||||
| 	    PyErr_SetString(PyExc_ValueError, "not a numeric character"); | 	    PyErr_SetString(PyExc_ValueError, "not a numeric character"); | ||||||
|  | @ -180,6 +300,11 @@ unicodedata_category(PyObject *self, PyObject *args) | ||||||
| 	return NULL; | 	return NULL; | ||||||
|     } |     } | ||||||
|     index = (int) _getrecord(v)->category; |     index = (int) _getrecord(v)->category; | ||||||
|  |     if (self) { | ||||||
|  |         const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v)); | ||||||
|  |         if (old->category_changed != 0xFF) | ||||||
|  |             index = old->category_changed; | ||||||
|  |     } | ||||||
|     return PyString_FromString(_PyUnicode_CategoryNames[index]); |     return PyString_FromString(_PyUnicode_CategoryNames[index]); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | @ -205,6 +330,13 @@ unicodedata_bidirectional(PyObject *self, PyObject *args) | ||||||
| 	return NULL; | 	return NULL; | ||||||
|     } |     } | ||||||
|     index = (int) _getrecord(v)->bidirectional; |     index = (int) _getrecord(v)->bidirectional; | ||||||
|  |     if (self) { | ||||||
|  |         const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v)); | ||||||
|  |         if (old->category_changed == 0) | ||||||
|  |             index = 0; /* unassigned */ | ||||||
|  |         else if (old->bidir_changed != 0xFF) | ||||||
|  |             index = old->bidir_changed; | ||||||
|  |     } | ||||||
|     return PyString_FromString(_PyUnicode_BidirectionalNames[index]); |     return PyString_FromString(_PyUnicode_BidirectionalNames[index]); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | @ -219,6 +351,7 @@ static PyObject * | ||||||
| unicodedata_combining(PyObject *self, PyObject *args) | unicodedata_combining(PyObject *self, PyObject *args) | ||||||
| { | { | ||||||
|     PyUnicodeObject *v; |     PyUnicodeObject *v; | ||||||
|  |     int index; | ||||||
| 
 | 
 | ||||||
|     if (!PyArg_ParseTuple(args, "O!:combining", |     if (!PyArg_ParseTuple(args, "O!:combining", | ||||||
| 			  &PyUnicode_Type, &v)) | 			  &PyUnicode_Type, &v)) | ||||||
|  | @ -228,7 +361,13 @@ unicodedata_combining(PyObject *self, PyObject *args) | ||||||
| 			"need a single Unicode character as parameter"); | 			"need a single Unicode character as parameter"); | ||||||
| 	return NULL; | 	return NULL; | ||||||
|     } |     } | ||||||
|     return PyInt_FromLong((int) _getrecord(v)->combining); |     index = (int) _getrecord(v)->combining; | ||||||
|  |     if (self) { | ||||||
|  |         const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v)); | ||||||
|  |         if (old->category_changed == 0) | ||||||
|  |             index = 0; /* unassigned */ | ||||||
|  |     } | ||||||
|  |     return PyInt_FromLong(index); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| PyDoc_STRVAR(unicodedata_mirrored__doc__, | PyDoc_STRVAR(unicodedata_mirrored__doc__, | ||||||
|  | @ -242,6 +381,7 @@ static PyObject * | ||||||
| unicodedata_mirrored(PyObject *self, PyObject *args) | unicodedata_mirrored(PyObject *self, PyObject *args) | ||||||
| { | { | ||||||
|     PyUnicodeObject *v; |     PyUnicodeObject *v; | ||||||
|  |     int index; | ||||||
| 
 | 
 | ||||||
|     if (!PyArg_ParseTuple(args, "O!:mirrored", |     if (!PyArg_ParseTuple(args, "O!:mirrored", | ||||||
| 			  &PyUnicode_Type, &v)) | 			  &PyUnicode_Type, &v)) | ||||||
|  | @ -251,7 +391,13 @@ unicodedata_mirrored(PyObject *self, PyObject *args) | ||||||
| 			"need a single Unicode character as parameter"); | 			"need a single Unicode character as parameter"); | ||||||
| 	return NULL; | 	return NULL; | ||||||
|     } |     } | ||||||
|     return PyInt_FromLong((int) _getrecord(v)->mirrored); |     index = (int) _getrecord(v)->mirrored; | ||||||
|  |     if (self) { | ||||||
|  |         const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v)); | ||||||
|  |         if (old->category_changed == 0) | ||||||
|  |             index = 0; /* unassigned */ | ||||||
|  |     } | ||||||
|  |     return PyInt_FromLong(index); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| PyDoc_STRVAR(unicodedata_east_asian_width__doc__, | PyDoc_STRVAR(unicodedata_east_asian_width__doc__, | ||||||
|  | @ -275,6 +421,11 @@ unicodedata_east_asian_width(PyObject *self, PyObject *args) | ||||||
| 	return NULL; | 	return NULL; | ||||||
|     } |     } | ||||||
|     index = (int) _getrecord(v)->east_asian_width; |     index = (int) _getrecord(v)->east_asian_width; | ||||||
|  |     if (self) { | ||||||
|  |         const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v)); | ||||||
|  |         if (old->category_changed == 0) | ||||||
|  |             index = 0; /* unassigned */ | ||||||
|  |     } | ||||||
|     return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]); |     return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | @ -303,6 +454,12 @@ unicodedata_decomposition(PyObject *self, PyObject *args) | ||||||
| 
 | 
 | ||||||
|     code = (int) *PyUnicode_AS_UNICODE(v); |     code = (int) *PyUnicode_AS_UNICODE(v); | ||||||
| 
 | 
 | ||||||
|  |     if (self) { | ||||||
|  |         const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v)); | ||||||
|  |         if (old->category_changed == 0) | ||||||
|  |             return PyString_FromString(""); /* unassigned */ | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|     if (code < 0 || code >= 0x110000) |     if (code < 0 || code >= 0x110000) | ||||||
|         index = 0; |         index = 0; | ||||||
|     else { |     else { | ||||||
|  | @ -337,11 +494,14 @@ unicodedata_decomposition(PyObject *self, PyObject *args) | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void | void | ||||||
| get_decomp_record(Py_UCS4 code, int *index, int *prefix, int *count) | get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count) | ||||||
| { | { | ||||||
|     if (code >= 0x110000) { |     if (code >= 0x110000) { | ||||||
|         *index = 0; |         *index = 0; | ||||||
|     }  |     } else if (self && get_old_record(self, code)->category_changed==0) { | ||||||
|  |         /* unassigned in old version */ | ||||||
|  |         *index = 0; | ||||||
|  |     } | ||||||
|     else { |     else { | ||||||
|         *index = decomp_index1[(code>>DECOMP_SHIFT)]; |         *index = decomp_index1[(code>>DECOMP_SHIFT)]; | ||||||
|         *index = decomp_index2[(*index<<DECOMP_SHIFT)+ |         *index = decomp_index2[(*index<<DECOMP_SHIFT)+ | ||||||
|  | @ -367,7 +527,7 @@ get_decomp_record(Py_UCS4 code, int *index, int *prefix, int *count) | ||||||
| #define SCount  (LCount*NCount) | #define SCount  (LCount*NCount) | ||||||
| 
 | 
 | ||||||
| static PyObject* | static PyObject* | ||||||
| nfd_nfkd(PyObject *input, int k) | nfd_nfkd(PyObject *self, PyObject *input, int k) | ||||||
| { | { | ||||||
|     PyObject *result; |     PyObject *result; | ||||||
|     Py_UNICODE *i, *end, *o; |     Py_UNICODE *i, *end, *o; | ||||||
|  | @ -416,8 +576,17 @@ nfd_nfkd(PyObject *input, int k) | ||||||
|                 } |                 } | ||||||
|                 continue; |                 continue; | ||||||
|             } |             } | ||||||
|             /* Other decompoistions. */ |             /* normalization changes */ | ||||||
|             get_decomp_record(code, &index, &prefix, &count); |             if (self) { | ||||||
|  |                 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code); | ||||||
|  |                 if (value != 0) { | ||||||
|  |                     stack[stackptr++] = value; | ||||||
|  |                     continue; | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  | 
 | ||||||
|  |             /* Other decompositions. */ | ||||||
|  |             get_decomp_record(self, code, &index, &prefix, &count); | ||||||
| 
 | 
 | ||||||
|             /* Copy character if it is not decomposable, or has a
 |             /* Copy character if it is not decomposable, or has a
 | ||||||
|                compatibility decomposition, but we do NFD. */ |                compatibility decomposition, but we do NFD. */ | ||||||
|  | @ -467,7 +636,7 @@ nfd_nfkd(PyObject *input, int k) | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static int | static int | ||||||
| find_nfc_index(struct reindex* nfc, Py_UNICODE code) | find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code) | ||||||
| { | { | ||||||
|     int index; |     int index; | ||||||
|     for (index = 0; nfc[index].start; index++) { |     for (index = 0; nfc[index].start; index++) { | ||||||
|  | @ -483,7 +652,7 @@ find_nfc_index(struct reindex* nfc, Py_UNICODE code) | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static PyObject* | static PyObject* | ||||||
| nfc_nfkc(PyObject *input, int k) | nfc_nfkc(PyObject *self, PyObject *input, int k) | ||||||
| { | { | ||||||
|     PyObject *result; |     PyObject *result; | ||||||
|     Py_UNICODE *i, *i1, *o, *end; |     Py_UNICODE *i, *i1, *o, *end; | ||||||
|  | @ -492,7 +661,7 @@ nfc_nfkc(PyObject *input, int k) | ||||||
|     Py_UNICODE *skipped[20]; |     Py_UNICODE *skipped[20]; | ||||||
|     int cskipped = 0; |     int cskipped = 0; | ||||||
| 
 | 
 | ||||||
|     result = nfd_nfkd(input, k); |     result = nfd_nfkd(self, input, k); | ||||||
|     if (!result) |     if (!result) | ||||||
|         return NULL; |         return NULL; | ||||||
| 
 | 
 | ||||||
|  | @ -536,7 +705,7 @@ nfc_nfkc(PyObject *input, int k) | ||||||
|           continue; |           continue; | ||||||
|       } |       } | ||||||
| 
 | 
 | ||||||
|       f = find_nfc_index(nfc_first, *i); |       f = find_nfc_index(self, nfc_first, *i); | ||||||
|       if (f == -1) { |       if (f == -1) { | ||||||
|           *o++ = *i++; |           *o++ = *i++; | ||||||
|           continue; |           continue; | ||||||
|  | @ -551,7 +720,7 @@ nfc_nfkc(PyObject *input, int k) | ||||||
|               i1++; |               i1++; | ||||||
|               continue; |               continue; | ||||||
|           } |           } | ||||||
|           l = find_nfc_index(nfc_last, *i1); |           l = find_nfc_index(self, nfc_last, *i1); | ||||||
|           /* *i1 cannot be combined with *i. If *i1
 |           /* *i1 cannot be combined with *i. If *i1
 | ||||||
|              is a starter, we don't need to look further. |              is a starter, we don't need to look further. | ||||||
|              Otherwise, record the combining class. */ |              Otherwise, record the combining class. */ | ||||||
|  | @ -575,7 +744,7 @@ nfc_nfkc(PyObject *input, int k) | ||||||
|           /* Mark the second character unused. */ |           /* Mark the second character unused. */ | ||||||
|           skipped[cskipped++] = i1; |           skipped[cskipped++] = i1; | ||||||
|           i1++; |           i1++; | ||||||
|           f = find_nfc_index(nfc_first, *i); |           f = find_nfc_index(self, nfc_first, *i); | ||||||
|           if (f == -1) |           if (f == -1) | ||||||
|               break; |               break; | ||||||
|       } |       } | ||||||
|  | @ -610,13 +779,13 @@ unicodedata_normalize(PyObject *self, PyObject *args) | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     if (strcmp(form, "NFC") == 0) |     if (strcmp(form, "NFC") == 0) | ||||||
|         return nfc_nfkc(input, 0); |         return nfc_nfkc(self, input, 0); | ||||||
|     if (strcmp(form, "NFKC") == 0) |     if (strcmp(form, "NFKC") == 0) | ||||||
|         return nfc_nfkc(input, 1); |         return nfc_nfkc(self, input, 1); | ||||||
|     if (strcmp(form, "NFD") == 0) |     if (strcmp(form, "NFD") == 0) | ||||||
|         return nfd_nfkd(input, 0); |         return nfd_nfkd(self, input, 0); | ||||||
|     if (strcmp(form, "NFKD") == 0) |     if (strcmp(form, "NFKD") == 0) | ||||||
|         return nfd_nfkd(input, 1); |         return nfd_nfkd(self, input, 1); | ||||||
|     PyErr_SetString(PyExc_ValueError, "invalid normalization form"); |     PyErr_SetString(PyExc_ValueError, "invalid normalization form"); | ||||||
|     return NULL; |     return NULL; | ||||||
| } | } | ||||||
|  | @ -686,7 +855,7 @@ is_unified_ideograph(Py_UCS4 code) | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static int | static int | ||||||
| _getucname(Py_UCS4 code, char* buffer, int buflen) | _getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen) | ||||||
| { | { | ||||||
|     int offset; |     int offset; | ||||||
|     int i; |     int i; | ||||||
|  | @ -726,6 +895,15 @@ _getucname(Py_UCS4 code, char* buffer, int buflen) | ||||||
|     if (code >= 0x110000) |     if (code >= 0x110000) | ||||||
|         return 0; |         return 0; | ||||||
| 
 | 
 | ||||||
|  |     if (self) { | ||||||
|  |         const change_record *old = get_old_record(self, code); | ||||||
|  |         if (old->category_changed == 0) { | ||||||
|  |             /* unassigned */ | ||||||
|  |             return 0; | ||||||
|  |         }  | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|     /* get offset into phrasebook */ |     /* get offset into phrasebook */ | ||||||
|     offset = phrasebook_offset1[(code>>phrasebook_shift)]; |     offset = phrasebook_offset1[(code>>phrasebook_shift)]; | ||||||
|     offset = phrasebook_offset2[(offset<<phrasebook_shift) + |     offset = phrasebook_offset2[(offset<<phrasebook_shift) + | ||||||
|  | @ -768,12 +946,12 @@ _getucname(Py_UCS4 code, char* buffer, int buflen) | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static int | static int | ||||||
| _cmpname(int code, const char* name, int namelen) | _cmpname(PyObject *self, int code, const char* name, int namelen) | ||||||
| { | { | ||||||
|     /* check if code corresponds to the given name */ |     /* check if code corresponds to the given name */ | ||||||
|     int i; |     int i; | ||||||
|     char buffer[NAME_MAXLEN]; |     char buffer[NAME_MAXLEN]; | ||||||
|     if (!_getucname(code, buffer, sizeof(buffer))) |     if (!_getucname(self, code, buffer, sizeof(buffer))) | ||||||
|         return 0; |         return 0; | ||||||
|     for (i = 0; i < namelen; i++) { |     for (i = 0; i < namelen; i++) { | ||||||
|         if (toupper(name[i]) != buffer[i]) |         if (toupper(name[i]) != buffer[i]) | ||||||
|  | @ -803,7 +981,7 @@ find_syllable(const char *str, int *len, int *pos, int count, int column) | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static int | static int | ||||||
| _getcode(const char* name, int namelen, Py_UCS4* code) | _getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code) | ||||||
| { | { | ||||||
|     unsigned int h, v; |     unsigned int h, v; | ||||||
|     unsigned int mask = code_size-1; |     unsigned int mask = code_size-1; | ||||||
|  | @ -860,7 +1038,7 @@ _getcode(const char* name, int namelen, Py_UCS4* code) | ||||||
|     v = code_hash[i]; |     v = code_hash[i]; | ||||||
|     if (!v) |     if (!v) | ||||||
|         return 0; |         return 0; | ||||||
|     if (_cmpname(v, name, namelen)) { |     if (_cmpname(self, v, name, namelen)) { | ||||||
|         *code = v; |         *code = v; | ||||||
|         return 1; |         return 1; | ||||||
|     } |     } | ||||||
|  | @ -872,7 +1050,7 @@ _getcode(const char* name, int namelen, Py_UCS4* code) | ||||||
|         v = code_hash[i]; |         v = code_hash[i]; | ||||||
|         if (!v) |         if (!v) | ||||||
|             return 0; |             return 0; | ||||||
|         if (_cmpname(v, name, namelen)) { |         if (_cmpname(self, v, name, namelen)) { | ||||||
|             *code = v; |             *code = v; | ||||||
|             return 1; |             return 1; | ||||||
|         } |         } | ||||||
|  | @ -914,8 +1092,8 @@ unicodedata_name(PyObject* self, PyObject* args) | ||||||
| 	return NULL; | 	return NULL; | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     if (!_getucname((Py_UCS4) *PyUnicode_AS_UNICODE(v), |     if (!_getucname(self, (Py_UCS4) *PyUnicode_AS_UNICODE(v), | ||||||
|                              name, sizeof(name))) { |                     name, sizeof(name))) { | ||||||
| 	if (defobj == NULL) { | 	if (defobj == NULL) { | ||||||
| 	    PyErr_SetString(PyExc_ValueError, "no such name"); | 	    PyErr_SetString(PyExc_ValueError, "no such name"); | ||||||
|             return NULL; |             return NULL; | ||||||
|  | @ -947,7 +1125,7 @@ unicodedata_lookup(PyObject* self, PyObject* args) | ||||||
|     if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen)) |     if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen)) | ||||||
|         return NULL; |         return NULL; | ||||||
| 
 | 
 | ||||||
|     if (!_getcode(name, namelen, &code)) { |     if (!_getcode(self, name, namelen, &code)) { | ||||||
|         char fmt[] = "undefined character name '%s'"; |         char fmt[] = "undefined character name '%s'"; | ||||||
|         char *buf = PyMem_MALLOC(sizeof(fmt) + namelen); |         char *buf = PyMem_MALLOC(sizeof(fmt) + namelen); | ||||||
|         sprintf(buf, fmt, name); |         sprintf(buf, fmt, name); | ||||||
|  | @ -985,6 +1163,8 @@ static PyMethodDef unicodedata_functions[] = { | ||||||
|     {NULL, NULL}		/* sentinel */ |     {NULL, NULL}		/* sentinel */ | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| PyDoc_STRVAR(unicodedata_docstring, | PyDoc_STRVAR(unicodedata_docstring, | ||||||
| "This module provides access to the Unicode Character Database which\n\
 | "This module provides access to the Unicode Character Database which\n\
 | ||||||
| defines character properties for all Unicode characters. The data in\n\ | defines character properties for all Unicode characters. The data in\n\ | ||||||
|  | @ -1007,6 +1187,11 @@ initunicodedata(void) | ||||||
| 
 | 
 | ||||||
|     PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION); |     PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION); | ||||||
| 
 | 
 | ||||||
|  |     /* Previous versions */ | ||||||
|  |     v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0); | ||||||
|  |     if (v != NULL) | ||||||
|  |         PyModule_AddObject(m, "db_3_2_0", v); | ||||||
|  | 
 | ||||||
|     /* Export C API */ |     /* Export C API */ | ||||||
|     v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL); |     v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL); | ||||||
|     if (v != NULL) |     if (v != NULL) | ||||||
|  |  | ||||||
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							
							
								
								
									
										22596
									
								
								Modules/unicodename_db.h
									
										
									
									
									
								
							
							
						
						
									
										22596
									
								
								Modules/unicodename_db.h
									
										
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							|  | @ -1898,7 +1898,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, | ||||||
|                     /* found a name.  look it up in the unicode database */ |                     /* found a name.  look it up in the unicode database */ | ||||||
|                     message = "unknown Unicode character name"; |                     message = "unknown Unicode character name"; | ||||||
|                     s++; |                     s++; | ||||||
|                     if (ucnhash_CAPI->getcode(start, (int)(s-start-1), &chr)) |                     if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr)) | ||||||
|                         goto store; |                         goto store; | ||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
|  |  | ||||||
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							|  | @ -26,13 +26,15 @@ | ||||||
| import sys | import sys | ||||||
| 
 | 
 | ||||||
| SCRIPT = sys.argv[0] | SCRIPT = sys.argv[0] | ||||||
| VERSION = "2.3" | VERSION = "2.5" | ||||||
| 
 | 
 | ||||||
| # The Unicode Database | # The Unicode Database | ||||||
| UNIDATA_VERSION = "3.2.0" | UNIDATA_VERSION = "4.1.0" | ||||||
| UNICODE_DATA = "UnicodeData.txt" | UNICODE_DATA = "UnicodeData%s.txt" | ||||||
| COMPOSITION_EXCLUSIONS = "CompositionExclusions.txt" | COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt" | ||||||
| EASTASIAN_WIDTH = "EastAsianWidth.txt" | EASTASIAN_WIDTH = "EastAsianWidth%s.txt" | ||||||
|  | 
 | ||||||
|  | old_versions = ["3.2.0"] | ||||||
| 
 | 
 | ||||||
| CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd", | CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd", | ||||||
|     "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm", |     "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm", | ||||||
|  | @ -57,13 +59,23 @@ | ||||||
| 
 | 
 | ||||||
| def maketables(trace=0): | def maketables(trace=0): | ||||||
| 
 | 
 | ||||||
|     print "--- Reading", UNICODE_DATA, "..." |     print "--- Reading", UNICODE_DATA % "", "..." | ||||||
| 
 | 
 | ||||||
|     unicode = UnicodeData(UNICODE_DATA, COMPOSITION_EXCLUSIONS, |     version = "" | ||||||
|                           EASTASIAN_WIDTH) |     unicode = UnicodeData(UNICODE_DATA % version, | ||||||
|  |                           COMPOSITION_EXCLUSIONS % version, | ||||||
|  |                           EASTASIAN_WIDTH % version) | ||||||
| 
 | 
 | ||||||
|     print len(filter(None, unicode.table)), "characters" |     print len(filter(None, unicode.table)), "characters" | ||||||
| 
 | 
 | ||||||
|  |     for version in old_versions: | ||||||
|  |         print "--- Reading", UNICODE_DATA % ("-"+version), "..." | ||||||
|  |         old_unicode = UnicodeData(UNICODE_DATA % ("-"+version), | ||||||
|  |                                   COMPOSITION_EXCLUSIONS % ("-"+version), | ||||||
|  |                                   EASTASIAN_WIDTH % ("-"+version)) | ||||||
|  |         print len(filter(None, old_unicode.table)), "characters" | ||||||
|  |         merge_old_version(version, unicode, old_unicode) | ||||||
|  | 
 | ||||||
|     makeunicodename(unicode, trace) |     makeunicodename(unicode, trace) | ||||||
|     makeunicodedata(unicode, trace) |     makeunicodedata(unicode, trace) | ||||||
|     makeunicodetype(unicode, trace) |     makeunicodetype(unicode, trace) | ||||||
|  | @ -119,6 +131,8 @@ def makeunicodedata(unicode, trace): | ||||||
|         if record: |         if record: | ||||||
|             if record[5]: |             if record[5]: | ||||||
|                 decomp = record[5].split() |                 decomp = record[5].split() | ||||||
|  |                 if len(decomp) > 19: | ||||||
|  |                     raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char | ||||||
|                 # prefix |                 # prefix | ||||||
|                 if decomp[0][0] == "<": |                 if decomp[0][0] == "<": | ||||||
|                     prefix = decomp.pop(0) |                     prefix = decomp.pop(0) | ||||||
|  | @ -278,6 +292,44 @@ def makeunicodedata(unicode, trace): | ||||||
|     Array("comp_index", index).dump(fp, trace) |     Array("comp_index", index).dump(fp, trace) | ||||||
|     Array("comp_data", index2).dump(fp, trace) |     Array("comp_data", index2).dump(fp, trace) | ||||||
| 
 | 
 | ||||||
|  |     # Generate delta tables for old versions | ||||||
|  |     for version, table, normalization in unicode.changed: | ||||||
|  |         cversion = version.replace(".","_") | ||||||
|  |         records = [table[0]] | ||||||
|  |         cache = {table[0]:0} | ||||||
|  |         index = [0] * len(table) | ||||||
|  |         for i, record in enumerate(table): | ||||||
|  |             try: | ||||||
|  |                 index[i] = cache[record] | ||||||
|  |             except KeyError: | ||||||
|  |                 index[i] = cache[record] = len(records) | ||||||
|  |                 records.append(record) | ||||||
|  |         index1, index2, shift = splitbins(index, trace) | ||||||
|  |         print >>fp, "static const change_record change_records_%s[] = {" % cversion | ||||||
|  |         for record in records: | ||||||
|  |             print >>fp, "\t{ %s }," % ", ".join(map(str,record)) | ||||||
|  |         print >>fp, "};" | ||||||
|  |         Array("changes_%s_index" % cversion, index1).dump(fp, trace) | ||||||
|  |         Array("changes_%s_data" % cversion, index2).dump(fp, trace) | ||||||
|  |         print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion | ||||||
|  |         print >>fp, "{" | ||||||
|  |         print >>fp, "\tint index;" | ||||||
|  |         print >>fp, "\tif (n >= 0x110000) index = 0;" | ||||||
|  |         print >>fp, "\telse {" | ||||||
|  |         print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift) | ||||||
|  |         print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \ | ||||||
|  |               (cversion, shift, ((1<<shift)-1)) | ||||||
|  |         print >>fp, "\t}" | ||||||
|  |         print >>fp, "\treturn change_records_%s+index;" % cversion | ||||||
|  |         print >>fp, "}\n" | ||||||
|  |         print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion | ||||||
|  |         print >>fp, "{" | ||||||
|  |         print >>fp, "\tswitch(n) {" | ||||||
|  |         for k, v in normalization: | ||||||
|  |             print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v) | ||||||
|  |         print >>fp, "\tdefault: return 0;" | ||||||
|  |         print >>fp, "\t}\n}\n" | ||||||
|  | 
 | ||||||
|     fp.close() |     fp.close() | ||||||
| 
 | 
 | ||||||
| # -------------------------------------------------------------------- | # -------------------------------------------------------------------- | ||||||
|  | @ -540,6 +592,82 @@ def cmpwords((aword, alist),(bword, blist)): | ||||||
| 
 | 
 | ||||||
|     fp.close() |     fp.close() | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
|  | def merge_old_version(version, new, old): | ||||||
|  |     # Changes to exclusion file not implemented yet | ||||||
|  |     if old.exclusions != new.exclusions: | ||||||
|  |         raise NotImplementedError, "exclusions differ" | ||||||
|  | 
 | ||||||
|  |     # In these change records, 0xFF means "no change" | ||||||
|  |     bidir_changes = [0xFF]*0x110000 | ||||||
|  |     category_changes = [0xFF]*0x110000 | ||||||
|  |     decimal_changes = [0xFF]*0x110000 | ||||||
|  |     # In numeric data, 0 means "no change", | ||||||
|  |     # -1 means "did not have a numeric value" | ||||||
|  |     numeric_changes = [0] * 0x110000 | ||||||
|  |     # normalization_changes is a list of key-value pairs | ||||||
|  |     normalization_changes = [] | ||||||
|  |     for i in range(0x110000): | ||||||
|  |         if new.table[i] is None: | ||||||
|  |             # Characters unassigned in the new version ought to | ||||||
|  |             # be unassigned in the old one | ||||||
|  |             assert old.table[i] is None | ||||||
|  |             continue | ||||||
|  |         # check characters unassigned in the old version | ||||||
|  |         if old.table[i] is None: | ||||||
|  |             # category 0 is "unassigned" | ||||||
|  |             category_changes[i] = 0 | ||||||
|  |             continue | ||||||
|  |         # check characters that differ | ||||||
|  |         if old.table[i] != new.table[i]: | ||||||
|  |             for k in range(len(old.table[i])): | ||||||
|  |                 if old.table[i][k] != new.table[i][k]: | ||||||
|  |                     value = old.table[i][k] | ||||||
|  |                     if k == 2: | ||||||
|  |                         #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k] | ||||||
|  |                         category_changes[i] = CATEGORY_NAMES.index(value) | ||||||
|  |                     elif k == 4: | ||||||
|  |                         #print "BIDIR",hex(i), old.table[i][k], new.table[i][k] | ||||||
|  |                         bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value) | ||||||
|  |                     elif k == 5: | ||||||
|  |                         #print "DECOMP",hex(i), old.table[i][k], new.table[i][k] | ||||||
|  |                         # We assume that all normalization changes are in 1:1 mappings | ||||||
|  |                         assert " " not in value | ||||||
|  |                         normalization_changes.append((i, value)) | ||||||
|  |                     elif k == 6: | ||||||
|  |                         #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k] | ||||||
|  |                         # we only support changes where the old value is a single digit | ||||||
|  |                         assert value in "0123456789" | ||||||
|  |                         decimal_changes[i] = int(value) | ||||||
|  |                     elif k == 8: | ||||||
|  |                         # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k] | ||||||
|  |                         # Since 0 encodes "no change" and -1 encodes "no numeric value", the old value must be neither | ||||||
|  |                         assert value != "0" and value != "-1" | ||||||
|  |                         if not value: | ||||||
|  |                             numeric_changes[i] = -1 | ||||||
|  |                         else: | ||||||
|  |                             assert re.match("^[0-9]+$", value) | ||||||
|  |                             numeric_changes[i] = int(value) | ||||||
|  |                     elif k == 11: | ||||||
|  |                         # change to ISO comment, ignore | ||||||
|  |                         pass | ||||||
|  |                     elif k == 12: | ||||||
|  |                         # change to simple uppercase mapping; ignore | ||||||
|  |                         pass | ||||||
|  |                     elif k == 13: | ||||||
|  |                         # change to simple lowercase mapping; ignore | ||||||
|  |                         pass | ||||||
|  |                     elif k == 14: | ||||||
|  |                         # change to simple titlecase mapping; ignore | ||||||
|  |                         pass | ||||||
|  |                     else: | ||||||
|  |                         class Difference(Exception):pass | ||||||
|  |                         raise Difference, (hex(i), k, old.table[i], new.table[i]) | ||||||
|  |     new.changed.append((version, zip(bidir_changes, category_changes, | ||||||
|  |                                      decimal_changes, numeric_changes), | ||||||
|  |                         normalization_changes)) | ||||||
|  |      | ||||||
|  | 
 | ||||||
| # -------------------------------------------------------------------- | # -------------------------------------------------------------------- | ||||||
| # the following support code is taken from the unidb utilities | # the following support code is taken from the unidb utilities | ||||||
| # Copyright (c) 1999-2000 by Secret Labs AB | # Copyright (c) 1999-2000 by Secret Labs AB | ||||||
|  | @ -551,6 +679,7 @@ def cmpwords((aword, alist),(bword, blist)): | ||||||
| class UnicodeData: | class UnicodeData: | ||||||
| 
 | 
 | ||||||
|     def __init__(self, filename, exclusions, eastasianwidth, expand=1): |     def __init__(self, filename, exclusions, eastasianwidth, expand=1): | ||||||
|  |         self.changed = [] | ||||||
|         file = open(filename) |         file = open(filename) | ||||||
|         table = [None] * 0x110000 |         table = [None] * 0x110000 | ||||||
|         while 1: |         while 1: | ||||||
|  | @ -569,13 +698,14 @@ def __init__(self, filename, exclusions, eastasianwidth, expand=1): | ||||||
|                 if s: |                 if s: | ||||||
|                     if s[1][-6:] == "First>": |                     if s[1][-6:] == "First>": | ||||||
|                         s[1] = "" |                         s[1] = "" | ||||||
|                         field = s[:] |                         field = s | ||||||
|                     elif s[1][-5:] == "Last>": |                     elif s[1][-5:] == "Last>": | ||||||
|                         s[1] = "" |                         s[1] = "" | ||||||
|                         field = None |                         field = None | ||||||
|                 elif field: |                 elif field: | ||||||
|                     field[0] = hex(i) |                     f2 = field[:] | ||||||
|                     table[i] = field |                     f2[0] = "%X" % i | ||||||
|  |                     table[i] = f2 | ||||||
| 
 | 
 | ||||||
|         # public attributes |         # public attributes | ||||||
|         self.filename = filename |         self.filename = filename | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Martin v. Löwis
						Martin v. Löwis