mirror of
				https://github.com/python/cpython.git
				synced 2025-11-03 23:21:29 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			126 lines
		
	
	
		
			No EOL
		
	
	
		
			4.8 KiB
		
	
	
	
		
			TeX
		
	
	
	
	
	
			
		
		
	
	
			126 lines
		
	
	
		
			No EOL
		
	
	
		
			4.8 KiB
		
	
	
	
		
			TeX
		
	
	
	
	
	
\section{\module{unicodedata} ---
 | 
						|
         Unicode Database}
 | 
						|
 | 
						|
\declaremodule{standard}{unicodedata}
 | 
						|
\modulesynopsis{Access the Unicode Database.}
 | 
						|
\moduleauthor{Marc-Andre Lemburg}{mal@lemburg.com}
 | 
						|
\sectionauthor{Marc-Andre Lemburg}{mal@lemburg.com}
 | 
						|
\sectionauthor{Martin v. L\"owis}{martin@v.loewis.de}
 | 
						|
 | 
						|
\index{Unicode}
 | 
						|
\index{character}
 | 
						|
\indexii{Unicode}{database}
 | 
						|
 | 
						|
This module provides access to the Unicode Character Database which
 | 
						|
defines character properties for all Unicode characters. The data in
 | 
						|
this database is based on the \file{UnicodeData.txt} file version
 | 
						|
3.2.0 which is publicly available from \url{ftp://ftp.unicode.org/}.
 | 
						|
 | 
						|
The module uses the same names and symbols as defined by the
 | 
						|
UnicodeData File Format 3.2.0 (see
 | 
						|
\url{http://www.unicode.org/Public/UNIDATA/UnicodeData.html}).  It
 | 
						|
defines the following functions:
 | 
						|
 | 
						|
\begin{funcdesc}{lookup}{name}
 | 
						|
  Look up character by name.  If a character with the
 | 
						|
  given name is found, return the corresponding Unicode
 | 
						|
  character.  If not found, \exception{KeyError} is raised.
 | 
						|
\end{funcdesc}
 | 
						|
 | 
						|
\begin{funcdesc}{name}{unichr\optional{, default}}
 | 
						|
  Returns the name assigned to the Unicode character
 | 
						|
  \var{unichr} as a string. If no name is defined,
 | 
						|
  \var{default} is returned, or, if not given,
 | 
						|
  \exception{ValueError} is raised.
 | 
						|
\end{funcdesc}
 | 
						|
 | 
						|
\begin{funcdesc}{decimal}{unichr\optional{, default}}
 | 
						|
  Returns the decimal value assigned to the Unicode character
 | 
						|
  \var{unichr} as integer. If no such value is defined,
 | 
						|
  \var{default} is returned, or, if not given,
 | 
						|
  \exception{ValueError} is raised.
 | 
						|
\end{funcdesc}
 | 
						|
 | 
						|
\begin{funcdesc}{digit}{unichr\optional{, default}}
 | 
						|
  Returns the digit value assigned to the Unicode character
 | 
						|
  \var{unichr} as integer. If no such value is defined,
 | 
						|
  \var{default} is returned, or, if not given,
 | 
						|
  \exception{ValueError} is raised.
 | 
						|
\end{funcdesc}
 | 
						|
 | 
						|
\begin{funcdesc}{numeric}{unichr\optional{, default}}
 | 
						|
  Returns the numeric value assigned to the Unicode character
 | 
						|
  \var{unichr} as float. If no such value is defined, \var{default} is
 | 
						|
  returned, or, if not given, \exception{ValueError} is raised.
 | 
						|
\end{funcdesc}
 | 
						|
 | 
						|
\begin{funcdesc}{category}{unichr}
 | 
						|
  Returns the general category assigned to the Unicode character
 | 
						|
  \var{unichr} as string.
 | 
						|
\end{funcdesc}
 | 
						|
 | 
						|
\begin{funcdesc}{bidirectional}{unichr}
 | 
						|
  Returns the bidirectional category assigned to the Unicode character
 | 
						|
  \var{unichr} as string. If no such value is defined, an empty string
 | 
						|
  is returned.
 | 
						|
\end{funcdesc}
 | 
						|
 | 
						|
\begin{funcdesc}{combining}{unichr}
 | 
						|
  Returns the canonical combining class assigned to the Unicode
 | 
						|
  character \var{unichr} as integer. Returns \code{0} if no combining
 | 
						|
  class is defined.
 | 
						|
\end{funcdesc}
 | 
						|
 | 
						|
\begin{funcdesc}{mirrored}{unichr}
 | 
						|
  Returns the mirrored property assigned to the Unicode character
 | 
						|
  \var{unichr} as integer. Returns \code{1} if the character has been
 | 
						|
  identified as a ``mirrored'' character in bidirectional text,
 | 
						|
  \code{0} otherwise.
 | 
						|
\end{funcdesc}
 | 
						|
 | 
						|
\begin{funcdesc}{decomposition}{unichr}
 | 
						|
  Returns the character decomposition mapping assigned to the Unicode
 | 
						|
  character \var{unichr} as string. An empty string is returned in case
 | 
						|
  no such mapping is defined.
 | 
						|
\end{funcdesc}
 | 
						|
 | 
						|
\begin{funcdesc}{normalize}{form, unistr}
 | 
						|
 | 
						|
Return the normal form \var{form} for the Unicode string \var{unistr}.
 | 
						|
Valid values for \var{form} are \code{'NFC'}, \code{'NFKC'}, \code{'NFD'}, and \code{'NFKD'}.
 | 
						|
 | 
						|
The Unicode standard defines various normalization forms of a Unicode
 | 
						|
string, based on the definition of canonical equivalence and
 | 
						|
compatibility equivalence. In Unicode, several characters can be
 | 
						|
expressed in various ways. For example, the character U+00C7 (LATIN
 | 
						|
CAPITAL LETTER C WITH CEDILLA) can also be expressed as the sequence
 | 
						|
U+0043 (LATIN CAPITAL LETTER C) U+0327 (COMBINING CEDILLA).
 | 
						|
 | 
						|
For each character, there are two normal forms: normal form C and
 | 
						|
normal form D. Normal form D (NFD) is also known as canonical
 | 
						|
decomposition, and translates each character into its decomposed form.
 | 
						|
Normal form C (NFC) first applies a canonical decomposition, then
 | 
						|
composes pre-combined characters again.
 | 
						|
 | 
						|
In addition to these two forms, there are two additional normal forms
 | 
						|
based on compatibility equivalence. In Unicode, certain characters are
 | 
						|
supported which normally would be unified with other characters. For
 | 
						|
example, U+2160 (ROMAN NUMERAL ONE) is really the same thing as U+0049
 | 
						|
(LATIN CAPITAL LETTER I). However, it is supported in Unicode for
 | 
						|
compatibility with existing character sets (e.g.\ gb2312).
 | 
						|
 | 
						|
The normal form KD (NFKD) will apply the compatibility decomposition,
 | 
						|
i.e.\ replace all compatibility characters with their equivalents. The
 | 
						|
normal form KC (NFKC) first applies the compatibility decomposition,
 | 
						|
followed by the canonical composition.
 | 
						|
 | 
						|
\versionadded{2.3}
 | 
						|
\end{funcdesc}
 | 
						|
 | 
						|
In addition, the module exposes the following constant:
 | 
						|
 | 
						|
\begin{datadesc}{unidata_version}
 | 
						|
The version of the Unicode database used in this module.
 | 
						|
 | 
						|
\versionadded{2.3}
 | 
						|
\end{datadesc} |