| 
									
										
										
										
											2000-03-10 22:52:46 +00:00
										 |  |  | /*
 | 
					
						
							|  |  |  |    Unicode character type helpers. | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-09-25 21:48:13 +00:00
										 |  |  |    Written by Marc-Andre Lemburg (mal@lemburg.com). | 
					
						
							|  |  |  |    Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com) | 
					
						
							| 
									
										
										
										
											2000-03-10 22:52:46 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-09-25 21:48:13 +00:00
										 |  |  |    Copyright (c) Corporation for National Research Initiatives. | 
					
						
							| 
									
										
										
										
											2000-03-10 22:52:46 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | */ | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #include "Python.h"
 | 
					
						
							|  |  |  | #include "unicodeobject.h"
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-09-25 21:48:13 +00:00
										 |  |  | #define ALPHA_MASK 0x01
 | 
					
						
							|  |  |  | #define DECIMAL_MASK 0x02
 | 
					
						
							|  |  |  | #define DIGIT_MASK 0x04
 | 
					
						
							|  |  |  | #define LOWER_MASK 0x08
 | 
					
						
							|  |  |  | #define LINEBREAK_MASK 0x10
 | 
					
						
							|  |  |  | #define SPACE_MASK 0x20
 | 
					
						
							|  |  |  | #define TITLE_MASK 0x40
 | 
					
						
							|  |  |  | #define UPPER_MASK 0x80
 | 
					
						
							| 
									
										
										
										
											2007-08-14 22:37:03 +00:00
										 |  |  | #define XID_START_MASK 0x100
 | 
					
						
							|  |  |  | #define XID_CONTINUE_MASK 0x200
 | 
					
						
							| 
									
										
										
										
											2008-07-04 15:55:02 +00:00
										 |  |  | #define PRINTABLE_MASK 0x400
 | 
					
						
							| 
									
										
										
										
											2008-09-10 14:08:48 +00:00
										 |  |  | #define NODELTA_MASK 0x800
 | 
					
						
							| 
									
										
										
										
											2009-10-06 21:03:20 +00:00
										 |  |  | #define NUMERIC_MASK 0x1000
 | 
					
						
							| 
									
										
										
										
											2000-09-25 21:48:13 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | typedef struct { | 
					
						
							| 
									
										
										
										
											2010-08-18 20:44:58 +00:00
										 |  |  |     const Py_UCS4 upper; | 
					
						
							|  |  |  |     const Py_UCS4 lower; | 
					
						
							|  |  |  |     const Py_UCS4 title; | 
					
						
							| 
									
										
										
										
											2000-09-25 21:48:13 +00:00
										 |  |  |     const unsigned char decimal; | 
					
						
							|  |  |  |     const unsigned char digit; | 
					
						
							| 
									
										
										
										
											2004-06-02 16:49:17 +00:00
										 |  |  |     const unsigned short flags; | 
					
						
							| 
									
										
										
										
											2000-09-25 21:48:13 +00:00
										 |  |  | } _PyUnicode_TypeRecord; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #include "unicodetype_db.h"
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static const _PyUnicode_TypeRecord * | 
					
						
							| 
									
										
										
										
											2010-08-18 20:44:58 +00:00
										 |  |  | gettyperecord(Py_UCS4 code) | 
					
						
							| 
									
										
										
										
											2000-09-25 21:48:13 +00:00
										 |  |  | { | 
					
						
							|  |  |  |     int index; | 
					
						
							| 
									
										
										
										
											2000-07-06 13:57:38 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2002-10-18 16:11:54 +00:00
										 |  |  |     if (code >= 0x110000) | 
					
						
							| 
									
										
										
										
											2000-09-25 21:48:13 +00:00
										 |  |  |         index = 0; | 
					
						
							| 
									
										
										
										
											2003-12-29 01:36:01 +00:00
										 |  |  |     else | 
					
						
							|  |  |  |     { | 
					
						
							| 
									
										
										
										
											2000-09-25 21:48:13 +00:00
										 |  |  |         index = index1[(code>>SHIFT)]; | 
					
						
							|  |  |  |         index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))]; | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
										
										
											2001-06-26 20:36:12 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-09-25 21:48:13 +00:00
										 |  |  |     return &_PyUnicode_TypeRecords[index]; | 
					
						
							|  |  |  | } | 
					
						
							| 
									
										
										
										
											2000-07-06 13:57:38 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-03-10 22:52:46 +00:00
										 |  |  | /* Returns the titlecase Unicode characters corresponding to ch or just
 | 
					
						
							|  |  |  |    ch if no titlecase mapping is known. */ | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2010-08-18 20:44:58 +00:00
										 |  |  | Py_UCS4 _PyUnicode_ToTitlecase(register Py_UCS4 ch) | 
					
						
							| 
									
										
										
										
											2000-03-10 22:52:46 +00:00
										 |  |  | { | 
					
						
							| 
									
										
										
										
											2000-09-25 21:48:13 +00:00
										 |  |  |     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); | 
					
						
							| 
									
										
										
										
											2009-04-26 01:02:07 +00:00
										 |  |  |     int delta = ctype->title; | 
					
						
							| 
									
										
										
										
											2001-06-27 06:28:56 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-09-10 14:08:48 +00:00
										 |  |  |     if (ctype->flags & NODELTA_MASK) | 
					
						
							| 
									
										
										
										
											2010-08-11 17:31:17 +00:00
										 |  |  |         return delta; | 
					
						
							| 
									
										
										
										
											2008-09-10 14:08:48 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2002-10-18 16:40:36 +00:00
										 |  |  |     if (delta >= 32768) | 
					
						
							| 
									
										
										
										
											2010-08-11 17:31:17 +00:00
										 |  |  |             delta -= 65536; | 
					
						
							| 
									
										
										
										
											2002-10-18 16:40:36 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     return ch + delta; | 
					
						
							| 
									
										
										
										
											2000-03-10 22:52:46 +00:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /* Returns 1 for Unicode characters having the category 'Lt', 0
 | 
					
						
							|  |  |  |    otherwise. */ | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2010-08-18 20:44:58 +00:00
										 |  |  | int _PyUnicode_IsTitlecase(Py_UCS4 ch) | 
					
						
							| 
									
										
										
										
											2000-03-10 22:52:46 +00:00
										 |  |  | { | 
					
						
							| 
									
										
										
										
											2000-09-25 21:48:13 +00:00
										 |  |  |     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return (ctype->flags & TITLE_MASK) != 0; | 
					
						
							| 
									
										
										
										
											2000-03-10 22:52:46 +00:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2007-08-14 22:37:03 +00:00
										 |  |  | /* Returns 1 for Unicode characters having the XID_Start property, 0
 | 
					
						
							|  |  |  |    otherwise. */ | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2010-08-18 20:44:58 +00:00
										 |  |  | int _PyUnicode_IsXidStart(Py_UCS4 ch) | 
					
						
							| 
									
										
										
										
											2007-08-14 22:37:03 +00:00
										 |  |  | { | 
					
						
							|  |  |  |     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return (ctype->flags & XID_START_MASK) != 0; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /* Returns 1 for Unicode characters having the XID_Continue property,
 | 
					
						
							|  |  |  |    0 otherwise. */ | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2010-08-18 20:44:58 +00:00
										 |  |  | int _PyUnicode_IsXidContinue(Py_UCS4 ch) | 
					
						
							| 
									
										
										
										
											2007-08-14 22:37:03 +00:00
										 |  |  | { | 
					
						
							|  |  |  |     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return (ctype->flags & XID_CONTINUE_MASK) != 0; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-03-10 22:52:46 +00:00
										 |  |  | /* Returns the integer decimal (0-9) for Unicode characters having
 | 
					
						
							|  |  |  |    this property, -1 otherwise. */ | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2010-08-18 20:44:58 +00:00
										 |  |  | int _PyUnicode_ToDecimalDigit(Py_UCS4 ch) | 
					
						
							| 
									
										
										
										
											2000-03-10 22:52:46 +00:00
										 |  |  | { | 
					
						
							| 
									
										
										
										
											2000-09-25 21:48:13 +00:00
										 |  |  |     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return (ctype->flags & DECIMAL_MASK) ? ctype->decimal : -1; | 
					
						
							| 
									
										
										
										
											2000-03-10 22:52:46 +00:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2010-08-18 20:44:58 +00:00
										 |  |  | int _PyUnicode_IsDecimalDigit(Py_UCS4 ch) | 
					
						
							| 
									
										
										
										
											2000-03-10 22:52:46 +00:00
										 |  |  | { | 
					
						
							|  |  |  |     if (_PyUnicode_ToDecimalDigit(ch) < 0) | 
					
						
							| 
									
										
										
										
											2010-08-11 17:31:17 +00:00
										 |  |  |         return 0; | 
					
						
							| 
									
										
										
										
											2000-03-10 22:52:46 +00:00
										 |  |  |     return 1; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /* Returns the integer digit (0-9) for Unicode characters having
 | 
					
						
							|  |  |  |    this property, -1 otherwise. */ | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2010-08-18 20:44:58 +00:00
										 |  |  | int _PyUnicode_ToDigit(Py_UCS4 ch) | 
					
						
							| 
									
										
										
										
											2000-03-10 22:52:46 +00:00
										 |  |  | { | 
					
						
							| 
									
										
										
										
											2000-09-25 21:48:13 +00:00
										 |  |  |     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return (ctype->flags & DIGIT_MASK) ? ctype->digit : -1; | 
					
						
							| 
									
										
										
										
											2000-03-10 22:52:46 +00:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2010-08-18 20:44:58 +00:00
										 |  |  | int _PyUnicode_IsDigit(Py_UCS4 ch) | 
					
						
							| 
									
										
										
										
											2000-03-10 22:52:46 +00:00
										 |  |  | { | 
					
						
							|  |  |  |     if (_PyUnicode_ToDigit(ch) < 0) | 
					
						
							| 
									
										
										
										
											2010-08-11 17:31:17 +00:00
										 |  |  |         return 0; | 
					
						
							| 
									
										
										
										
											2000-03-10 22:52:46 +00:00
										 |  |  |     return 1; | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /* Returns the numeric value as double for Unicode characters having
 | 
					
						
							|  |  |  |    this property, -1.0 otherwise. */ | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2010-08-18 20:44:58 +00:00
										 |  |  | int _PyUnicode_IsNumeric(Py_UCS4 ch) | 
					
						
							| 
									
										
										
										
											2000-03-10 22:52:46 +00:00
										 |  |  | { | 
					
						
							| 
									
										
										
										
											2009-10-06 21:03:20 +00:00
										 |  |  |     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return (ctype->flags & NUMERIC_MASK) != 0; | 
					
						
							| 
									
										
										
										
											2000-03-10 22:52:46 +00:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-06-11 18:37:52 +00:00
										 |  |  | /* Returns 1 for Unicode characters to be hex-escaped when repr()ed,
 | 
					
						
							|  |  |  |    0 otherwise. | 
					
						
							|  |  |  |    All characters except those characters defined in the Unicode character | 
					
						
							|  |  |  |    database as following categories are considered printable. | 
					
						
							|  |  |  |       * Cc (Other, Control) | 
					
						
							|  |  |  |       * Cf (Other, Format) | 
					
						
							|  |  |  |       * Cs (Other, Surrogate) | 
					
						
							|  |  |  |       * Co (Other, Private Use) | 
					
						
							|  |  |  |       * Cn (Other, Not Assigned) | 
					
						
							|  |  |  |       * Zl Separator, Line ('\u2028', LINE SEPARATOR) | 
					
						
							|  |  |  |       * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR) | 
					
						
							|  |  |  |       * Zs (Separator, Space) other than ASCII space('\x20'). | 
					
						
							|  |  |  | */ | 
					
						
							| 
									
										
										
										
											2010-08-18 20:44:58 +00:00
										 |  |  | int _PyUnicode_IsPrintable(Py_UCS4 ch) | 
					
						
							| 
									
										
										
										
											2008-06-11 18:37:52 +00:00
										 |  |  | { | 
					
						
							|  |  |  |     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2008-07-04 15:55:02 +00:00
										 |  |  |     return (ctype->flags & PRINTABLE_MASK) != 0; | 
					
						
							| 
									
										
										
										
											2008-06-11 18:37:52 +00:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-03-10 22:52:46 +00:00
										 |  |  | /* Returns 1 for Unicode characters having the category 'Ll', 0
 | 
					
						
							|  |  |  |    otherwise. */ | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2010-08-18 20:44:58 +00:00
										 |  |  | int _PyUnicode_IsLowercase(Py_UCS4 ch) | 
					
						
							| 
									
										
										
										
											2000-03-10 22:52:46 +00:00
										 |  |  | { | 
					
						
							| 
									
										
										
										
											2000-09-25 21:48:13 +00:00
										 |  |  |     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return (ctype->flags & LOWER_MASK) != 0; | 
					
						
							| 
									
										
										
										
											2000-03-10 22:52:46 +00:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /* Returns 1 for Unicode characters having the category 'Lu', 0
 | 
					
						
							|  |  |  |    otherwise. */ | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2010-08-18 20:44:58 +00:00
										 |  |  | int _PyUnicode_IsUppercase(Py_UCS4 ch) | 
					
						
							| 
									
										
										
										
											2000-03-10 22:52:46 +00:00
										 |  |  | { | 
					
						
							| 
									
										
										
										
											2000-09-25 21:48:13 +00:00
										 |  |  |     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return (ctype->flags & UPPER_MASK) != 0; | 
					
						
							| 
									
										
										
										
											2000-03-10 22:52:46 +00:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /* Returns the uppercase Unicode characters corresponding to ch or just
 | 
					
						
							|  |  |  |    ch if no uppercase mapping is known. */ | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2010-08-18 20:44:58 +00:00
										 |  |  | Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch) | 
					
						
							| 
									
										
										
										
											2000-03-10 22:52:46 +00:00
										 |  |  | { | 
					
						
							| 
									
										
										
										
											2000-09-25 21:48:13 +00:00
										 |  |  |     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); | 
					
						
							| 
									
										
										
										
											2002-10-18 16:40:36 +00:00
										 |  |  |     int delta = ctype->upper; | 
					
						
							| 
									
										
										
										
											2008-09-10 14:08:48 +00:00
										 |  |  |     if (ctype->flags & NODELTA_MASK) | 
					
						
							| 
									
										
										
										
											2010-08-11 17:31:17 +00:00
										 |  |  |         return delta; | 
					
						
							| 
									
										
										
										
											2002-10-18 16:40:36 +00:00
										 |  |  |     if (delta >= 32768) | 
					
						
							| 
									
										
										
										
											2010-08-11 17:31:17 +00:00
										 |  |  |             delta -= 65536; | 
					
						
							| 
									
										
										
										
											2002-10-18 16:40:36 +00:00
										 |  |  |     return ch + delta; | 
					
						
							| 
									
										
										
										
											2000-03-10 22:52:46 +00:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /* Returns the lowercase Unicode characters corresponding to ch or just
 | 
					
						
							|  |  |  |    ch if no lowercase mapping is known. */ | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2010-08-18 20:44:58 +00:00
										 |  |  | Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch) | 
					
						
							| 
									
										
										
										
											2000-03-10 22:52:46 +00:00
										 |  |  | { | 
					
						
							| 
									
										
										
										
											2000-09-25 21:48:13 +00:00
										 |  |  |     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); | 
					
						
							| 
									
										
										
										
											2002-10-18 16:40:36 +00:00
										 |  |  |     int delta = ctype->lower; | 
					
						
							| 
									
										
										
										
											2008-09-10 14:08:48 +00:00
										 |  |  |     if (ctype->flags & NODELTA_MASK) | 
					
						
							| 
									
										
										
										
											2010-08-11 17:31:17 +00:00
										 |  |  |         return delta; | 
					
						
							| 
									
										
										
										
											2002-10-18 16:40:36 +00:00
										 |  |  |     if (delta >= 32768) | 
					
						
							| 
									
										
										
										
											2010-08-11 17:31:17 +00:00
										 |  |  |             delta -= 65536; | 
					
						
							| 
									
										
										
										
											2002-10-18 16:40:36 +00:00
										 |  |  |     return ch + delta; | 
					
						
							| 
									
										
										
										
											2000-03-10 22:52:46 +00:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-07-05 09:48:59 +00:00
										 |  |  | /* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
 | 
					
						
							|  |  |  |    'Lo' or 'Lm',  0 otherwise. */ | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2010-08-18 20:44:58 +00:00
										 |  |  | int _PyUnicode_IsAlpha(Py_UCS4 ch) | 
					
						
							| 
									
										
										
										
											2000-07-05 09:48:59 +00:00
										 |  |  | { | 
					
						
							| 
									
										
										
										
											2000-09-25 21:48:13 +00:00
										 |  |  |     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); | 
					
						
							| 
									
										
										
										
											2000-07-05 09:48:59 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2000-09-25 21:48:13 +00:00
										 |  |  |     return (ctype->flags & ALPHA_MASK) != 0; | 
					
						
							| 
									
										
										
										
											2000-07-05 09:48:59 +00:00
										 |  |  | } | 
					
						
							|  |  |  | 
 |