| 
									
										
										
										
											2003-04-18 10:39:54 +00:00
										 |  |  |  | # -*- coding: iso-8859-1 -*- | 
					
						
							|  |  |  |  | """ Codec for the Punycode encoding, as specified in RFC 3492
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | Written by Martin v. Löwis. | 
					
						
							|  |  |  |  | """
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | import codecs | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | ##################### Encoding ##################################### | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | def segregate(str): | 
					
						
							| 
									
										
										
										
											2003-04-24 16:02:54 +00:00
										 |  |  |  |     """3.1 Basic code point segregation""" | 
					
						
							| 
									
										
										
										
											2003-04-18 10:39:54 +00:00
										 |  |  |  |     base = [] | 
					
						
							|  |  |  |  |     extended = {} | 
					
						
							|  |  |  |  |     for c in str: | 
					
						
							|  |  |  |  |         if ord(c) < 128: | 
					
						
							|  |  |  |  |             base.append(c) | 
					
						
							|  |  |  |  |         else: | 
					
						
							|  |  |  |  |             extended[c] = 1 | 
					
						
							|  |  |  |  |     extended = extended.keys() | 
					
						
							|  |  |  |  |     extended.sort() | 
					
						
							|  |  |  |  |     return "".join(base).encode("ascii"),extended | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | def selective_len(str, max): | 
					
						
							|  |  |  |  |     """Return the length of str, considering only characters below max.""" | 
					
						
							|  |  |  |  |     res = 0 | 
					
						
							|  |  |  |  |     for c in str: | 
					
						
							|  |  |  |  |         if ord(c) < max: | 
					
						
							|  |  |  |  |             res += 1 | 
					
						
							|  |  |  |  |     return res | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | def selective_find(str, char, index, pos): | 
					
						
							|  |  |  |  |     """Return a pair (index, pos), indicating the next occurrence of
 | 
					
						
							|  |  |  |  |     char in str. index is the position of the character considering | 
					
						
							|  |  |  |  |     only ordinals up to and including char, and pos is the position in | 
					
						
							|  |  |  |  |     the full string. index/pos is the starting position in the full | 
					
						
							|  |  |  |  |     string."""
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     l = len(str) | 
					
						
							|  |  |  |  |     while 1: | 
					
						
							|  |  |  |  |         pos += 1 | 
					
						
							|  |  |  |  |         if pos == l: | 
					
						
							|  |  |  |  |             return (-1, -1) | 
					
						
							|  |  |  |  |         c = str[pos] | 
					
						
							|  |  |  |  |         if c == char: | 
					
						
							|  |  |  |  |             return index+1, pos | 
					
						
							|  |  |  |  |         elif c < char: | 
					
						
							|  |  |  |  |             index += 1 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | def insertion_unsort(str, extended): | 
					
						
							|  |  |  |  |     """3.2 Insertion unsort coding""" | 
					
						
							|  |  |  |  |     oldchar = 0x80 | 
					
						
							|  |  |  |  |     result = [] | 
					
						
							|  |  |  |  |     oldindex = -1 | 
					
						
							|  |  |  |  |     for c in extended: | 
					
						
							|  |  |  |  |         index = pos = -1 | 
					
						
							|  |  |  |  |         char = ord(c) | 
					
						
							|  |  |  |  |         curlen = selective_len(str, char) | 
					
						
							|  |  |  |  |         delta = (curlen+1) * (char - oldchar) | 
					
						
							|  |  |  |  |         while 1: | 
					
						
							|  |  |  |  |             index,pos = selective_find(str,c,index,pos) | 
					
						
							|  |  |  |  |             if index == -1: | 
					
						
							|  |  |  |  |                 break | 
					
						
							|  |  |  |  |             delta += index - oldindex | 
					
						
							|  |  |  |  |             result.append(delta-1) | 
					
						
							|  |  |  |  |             oldindex = index | 
					
						
							|  |  |  |  |             delta = 0 | 
					
						
							|  |  |  |  |         oldchar = char | 
					
						
							| 
									
										
										
										
											2003-04-24 16:02:54 +00:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2003-04-18 10:39:54 +00:00
										 |  |  |  |     return result | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | def T(j, bias): | 
					
						
							|  |  |  |  |     # Punycode parameters: tmin = 1, tmax = 26, base = 36 | 
					
						
							|  |  |  |  |     res = 36 * (j + 1) - bias | 
					
						
							|  |  |  |  |     if res < 1: return 1 | 
					
						
							|  |  |  |  |     if res > 26: return 26 | 
					
						
							|  |  |  |  |     return res | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | digits = "abcdefghijklmnopqrstuvwxyz0123456789" | 
					
						
							|  |  |  |  | def generate_generalized_integer(N, bias): | 
					
						
							|  |  |  |  |     """3.3 Generalized variable-length integers""" | 
					
						
							|  |  |  |  |     result = [] | 
					
						
							|  |  |  |  |     j = 0 | 
					
						
							|  |  |  |  |     while 1: | 
					
						
							|  |  |  |  |         t = T(j, bias) | 
					
						
							|  |  |  |  |         if N < t: | 
					
						
							|  |  |  |  |             result.append(digits[N]) | 
					
						
							|  |  |  |  |             return result | 
					
						
							|  |  |  |  |         result.append(digits[t + ((N - t) % (36 - t))]) | 
					
						
							|  |  |  |  |         N = (N - t) // (36 - t) | 
					
						
							|  |  |  |  |         j += 1 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | def adapt(delta, first, numchars): | 
					
						
							|  |  |  |  |     if first: | 
					
						
							|  |  |  |  |         delta //= 700 | 
					
						
							|  |  |  |  |     else: | 
					
						
							|  |  |  |  |         delta //= 2 | 
					
						
							|  |  |  |  |     delta += delta // numchars | 
					
						
							|  |  |  |  |     # ((base - tmin) * tmax) // 2 == 455 | 
					
						
							|  |  |  |  |     divisions = 0 | 
					
						
							|  |  |  |  |     while delta > 455: | 
					
						
							|  |  |  |  |         delta = delta // 35 # base - tmin | 
					
						
							|  |  |  |  |         divisions += 36 | 
					
						
							|  |  |  |  |     bias = divisions + (36 * delta // (delta + 38)) | 
					
						
							|  |  |  |  |     return bias | 
					
						
							| 
									
										
										
										
											2003-04-24 16:02:54 +00:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2003-04-18 10:39:54 +00:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  | def generate_integers(baselen, deltas): | 
					
						
							|  |  |  |  |     """3.4 Bias adaptation""" | 
					
						
							|  |  |  |  |     # Punycode parameters: initial bias = 72, damp = 700, skew = 38 | 
					
						
							|  |  |  |  |     result = [] | 
					
						
							|  |  |  |  |     bias = 72 | 
					
						
							|  |  |  |  |     for points, delta in enumerate(deltas): | 
					
						
							|  |  |  |  |         s = generate_generalized_integer(delta, bias) | 
					
						
							|  |  |  |  |         result.extend(s) | 
					
						
							|  |  |  |  |         bias = adapt(delta, points==0, baselen+points+1) | 
					
						
							|  |  |  |  |     return "".join(result) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | def punycode_encode(text): | 
					
						
							|  |  |  |  |     base, extended = segregate(text) | 
					
						
							|  |  |  |  |     base = base.encode("ascii") | 
					
						
							|  |  |  |  |     deltas = insertion_unsort(text, extended) | 
					
						
							|  |  |  |  |     extended = generate_integers(len(base), deltas) | 
					
						
							|  |  |  |  |     if base: | 
					
						
							|  |  |  |  |         return base + "-" + extended | 
					
						
							|  |  |  |  |     return extended | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | ##################### Decoding ##################################### | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | def decode_generalized_number(extended, extpos, bias, errors): | 
					
						
							|  |  |  |  |     """3.3 Generalized variable-length integers""" | 
					
						
							|  |  |  |  |     result = 0 | 
					
						
							|  |  |  |  |     w = 1 | 
					
						
							|  |  |  |  |     j = 0 | 
					
						
							|  |  |  |  |     while 1: | 
					
						
							|  |  |  |  |         try: | 
					
						
							|  |  |  |  |             char = ord(extended[extpos]) | 
					
						
							|  |  |  |  |         except IndexError: | 
					
						
							|  |  |  |  |             if errors == "strict": | 
					
						
							|  |  |  |  |                 raise UnicodeError, "incomplete punicode string" | 
					
						
							|  |  |  |  |             return extpos + 1, None | 
					
						
							|  |  |  |  |         extpos += 1 | 
					
						
							|  |  |  |  |         if 0x41 <= char <= 0x5A: # A-Z | 
					
						
							|  |  |  |  |             digit = char - 0x41 | 
					
						
							|  |  |  |  |         elif 0x30 <= char <= 0x39: | 
					
						
							|  |  |  |  |             digit = char - 22 # 0x30-26 | 
					
						
							|  |  |  |  |         elif errors == "strict": | 
					
						
							|  |  |  |  |             raise UnicodeError("Invalid extended code point '%s'" | 
					
						
							|  |  |  |  |                                % extended[extpos]) | 
					
						
							|  |  |  |  |         else: | 
					
						
							|  |  |  |  |             return extpos, None | 
					
						
							|  |  |  |  |         t = T(j, bias) | 
					
						
							|  |  |  |  |         result += digit * w | 
					
						
							|  |  |  |  |         if digit < t: | 
					
						
							|  |  |  |  |             return extpos, result | 
					
						
							|  |  |  |  |         w = w * (36 - t) | 
					
						
							|  |  |  |  |         j += 1 | 
					
						
							| 
									
										
										
										
											2003-04-24 16:02:54 +00:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2003-04-18 10:39:54 +00:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  | def insertion_sort(base, extended, errors): | 
					
						
							|  |  |  |  |     """3.2 Insertion unsort coding""" | 
					
						
							|  |  |  |  |     char = 0x80 | 
					
						
							|  |  |  |  |     pos = -1 | 
					
						
							|  |  |  |  |     bias = 72 | 
					
						
							|  |  |  |  |     extpos = 0 | 
					
						
							|  |  |  |  |     while extpos < len(extended): | 
					
						
							|  |  |  |  |         newpos, delta = decode_generalized_number(extended, extpos, | 
					
						
							|  |  |  |  |                                                   bias, errors) | 
					
						
							|  |  |  |  |         if delta is None: | 
					
						
							|  |  |  |  |             # There was an error in decoding. We can't continue because | 
					
						
							|  |  |  |  |             # synchronization is lost. | 
					
						
							|  |  |  |  |             return base | 
					
						
							|  |  |  |  |         pos += delta+1 | 
					
						
							|  |  |  |  |         char += pos // (len(base) + 1) | 
					
						
							|  |  |  |  |         if char > 0x10FFFF: | 
					
						
							|  |  |  |  |             if errors == "strict": | 
					
						
							|  |  |  |  |                 raise UnicodeError, ("Invalid character U+%x" % char) | 
					
						
							|  |  |  |  |             char = ord('?') | 
					
						
							|  |  |  |  |         pos = pos % (len(base) + 1) | 
					
						
							|  |  |  |  |         base = base[:pos] + unichr(char) + base[pos:] | 
					
						
							|  |  |  |  |         bias = adapt(delta, (extpos == 0), len(base)) | 
					
						
							|  |  |  |  |         extpos = newpos | 
					
						
							|  |  |  |  |     return base | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | def punycode_decode(text, errors): | 
					
						
							|  |  |  |  |     pos = text.rfind("-") | 
					
						
							|  |  |  |  |     if pos == -1: | 
					
						
							|  |  |  |  |         base = "" | 
					
						
							|  |  |  |  |         extended = text | 
					
						
							|  |  |  |  |     else: | 
					
						
							|  |  |  |  |         base = text[:pos] | 
					
						
							|  |  |  |  |         extended = text[pos+1:] | 
					
						
							|  |  |  |  |     base = unicode(base, "ascii", errors) | 
					
						
							|  |  |  |  |     extended = extended.upper() | 
					
						
							|  |  |  |  |     return insertion_sort(base, extended, errors) | 
					
						
							| 
									
										
										
										
											2003-04-24 16:02:54 +00:00
										 |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2003-04-18 10:39:54 +00:00
										 |  |  |  | ### Codec APIs | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | class Codec(codecs.Codec): | 
					
						
							|  |  |  |  |     def encode(self,input,errors='strict'): | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |         res = punycode_encode(input) | 
					
						
							|  |  |  |  |         return res, len(input) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     def decode(self,input,errors='strict'): | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |         if errors not in ('strict', 'replace', 'ignore'): | 
					
						
							|  |  |  |  |             raise UnicodeError, "Unsupported error handling "+errors | 
					
						
							|  |  |  |  |         res = punycode_decode(input, errors) | 
					
						
							|  |  |  |  |         return res, len(input) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | class StreamWriter(Codec,codecs.StreamWriter): | 
					
						
							|  |  |  |  |     pass | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | class StreamReader(Codec,codecs.StreamReader): | 
					
						
							|  |  |  |  |     pass | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | ### encodings module API | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | def getregentry(): | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     return (Codec().encode,Codec().decode,StreamReader,StreamWriter) |