gh-149891: Add more encoding aliases (GH-149892)

Support all aliases officially registered with IANA, except
Extended_UNIX_Code_Packed_Format_for_Japanese.

New names:
KSC_5601, KS_C_5601-1989, iso-ir-149, GB_2312-80, windows-936, mac,
CCSID00858, CCSID01140, and a number of "cs"-prefixed names.

Fix the csHPRoman8 alias: its key was not in normalized (lowercase) form, so it is now spelled 'cshproman8'.
This commit is contained in:
Serhiy Storchaka 2026-06-05 15:08:04 +03:00 committed by GitHub
parent e4db68b9c9
commit 49f4ecfb08
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 53 additions and 5 deletions

View file

@ -43,6 +43,7 @@
# big5hkscs codec
'big5_hkscs' : 'big5hkscs',
'csbig5hkscs' : 'big5hkscs',
'hkscs' : 'big5hkscs',
# bz2_codec codec
@ -71,6 +72,7 @@
# cp1140 codec
'1140' : 'cp1140',
'ccsid01140' : 'cp1140',
'cp01140' : 'cp1140',
'csibm01140' : 'cp1140',
'ebcdic_us_37_euro' : 'cp1140',
@ -79,38 +81,47 @@
# cp1250 codec
'1250' : 'cp1250',
'cswindows1250' : 'cp1250',
'windows_1250' : 'cp1250',
# cp1251 codec
'1251' : 'cp1251',
'cswindows1251' : 'cp1251',
'windows_1251' : 'cp1251',
# cp1252 codec
'1252' : 'cp1252',
'cswindows1252' : 'cp1252',
'windows_1252' : 'cp1252',
# cp1253 codec
'1253' : 'cp1253',
'cswindows1253' : 'cp1253',
'windows_1253' : 'cp1253',
# cp1254 codec
'1254' : 'cp1254',
'cswindows1254' : 'cp1254',
'windows_1254' : 'cp1254',
# cp1255 codec
'1255' : 'cp1255',
'cswindows1255' : 'cp1255',
'windows_1255' : 'cp1255',
# cp1256 codec
'1256' : 'cp1256',
'cswindows1256' : 'cp1256',
'windows_1256' : 'cp1256',
# cp1257 codec
'1257' : 'cp1257',
'cswindows1257' : 'cp1257',
'windows_1257' : 'cp1257',
# cp1258 codec
'1258' : 'cp1258',
'cswindows1258' : 'cp1258',
'windows_1258' : 'cp1258',
# cp273 codec
@ -163,6 +174,7 @@
# cp858 codec
'858' : 'cp858',
'ccsid00858' : 'cp858',
'cp00858' : 'cp858',
'csibm00858' : 'cp858',
'csibm858' : 'cp858',
@ -214,11 +226,13 @@
# cp874 codec
'874' : 'cp874',
'cswindows874' : 'cp874',
'ms874' : 'cp874',
'windows_874' : 'cp874',
# cp932 codec
'932' : 'cp932',
'cswindows31j' : 'cp932',
'ms932' : 'cp932',
'mskanji' : 'cp932',
'ms_kanji' : 'cp932',
@ -226,10 +240,14 @@
# cp949 codec
'949' : 'cp949',
'csksc56011987' : 'cp949',
'iso_ir_149' : 'cp949',
'korean' : 'cp949',
'ks_c_5601_1987' : 'cp949',
'ks_c_5601_1989' : 'cp949',
'ksc5601' : 'cp949',
'ks_c_5601' : 'cp949',
'ks_c_5601_1987' : 'cp949',
'ksc_5601' : 'cp949',
'ksx1001' : 'cp949',
'ks_x_1001' : 'cp949',
'ms949' : 'cp949',
@ -248,41 +266,47 @@
'eucjisx0213' : 'euc_jisx0213',
# euc_jp codec
'cseucpkdfmtjapanese' : 'euc_jp',
'eucjp' : 'euc_jp',
'ujis' : 'euc_jp',
'u_jis' : 'euc_jp',
# euc_kr codec
'euckr' : 'euc_kr',
'cseuckr' : 'euc_kr',
'euckr' : 'euc_kr',
# gb18030 codec
'csgb18030' : 'gb18030',
'gb18030_2000' : 'gb18030',
# gb2312 codec
'chinese' : 'gb2312',
'csgb2312' : 'gb2312',
'csiso58gb231280' : 'gb2312',
'euc_cn' : 'gb2312',
'euccn' : 'gb2312',
'eucgb2312_cn' : 'gb2312',
'gb2312_1980' : 'gb2312',
'gb2312_80' : 'gb2312',
'gb_2312_80' : 'gb2312',
'iso_ir_58' : 'gb2312',
# gbk codec
'936' : 'gbk',
'cp936' : 'gbk',
'csgbk' : 'gbk',
'ms936' : 'gbk',
'windows_936' : 'gbk',
# hex_codec codec
'hex' : 'hex_codec',
# hp_roman8 codec
'roman8' : 'hp_roman8',
'r8' : 'hp_roman8',
'csHPRoman8' : 'hp_roman8',
'cp1051' : 'hp_roman8',
'cshproman8' : 'hp_roman8',
'ibm1051' : 'hp_roman8',
'r8' : 'hp_roman8',
'roman8' : 'hp_roman8',
# hz codec
'hzgb' : 'hz',
@ -299,6 +323,7 @@
'iso_2022_jp_1' : 'iso2022_jp_1',
# iso2022_jp_2 codec
'csiso2022jp2' : 'iso2022_jp_2',
'iso2022jp_2' : 'iso2022_jp_2',
'iso_2022_jp_2' : 'iso2022_jp_2',
@ -334,12 +359,14 @@
'iso_8859_11_2001' : 'iso8859_11',
# iso8859_13 codec
'csiso885913' : 'iso8859_13',
'iso_8859_13' : 'iso8859_13',
'l7' : 'iso8859_13',
'latin7' : 'iso8859_13',
'latin_7' : 'iso8859_13',
# iso8859_14 codec
'csiso885914' : 'iso8859_14',
'iso_8859_14' : 'iso8859_14',
'iso_8859_14_1998' : 'iso8859_14',
'iso_celtic' : 'iso8859_14',
@ -349,12 +376,14 @@
'latin_8' : 'iso8859_14',
# iso8859_15 codec
'csiso885915' : 'iso8859_15',
'iso_8859_15' : 'iso8859_15',
'l9' : 'iso8859_15',
'latin9' : 'iso8859_15',
'latin_9' : 'iso8859_15',
# iso8859_16 codec
'csiso885916' : 'iso8859_16',
'iso_8859_16' : 'iso8859_16',
'iso_8859_16_2001' : 'iso8859_16',
'iso_ir_226' : 'iso8859_16',
@ -416,6 +445,8 @@
'iso_ir_126' : 'iso8859_7',
# iso8859_8 codec
'csiso88598e' : 'iso8859_8',
'csiso88598i' : 'iso8859_8',
'csisolatinhebrew' : 'iso8859_8',
'hebrew' : 'iso8859_8',
'iso_8859_8' : 'iso8859_8',
@ -440,7 +471,11 @@
# koi8_r codec
'cskoi8r' : 'koi8_r',
# koi8_u codec
'cskoi8u' : 'koi8_u',
# kz1048 codec
'cskz1048' : 'kz1048',
'kz_1048' : 'kz1048',
'rk1048' : 'kz1048',
'strk1048_2002' : 'kz1048',
@ -480,7 +515,9 @@
'maclatin2' : 'mac_latin2',
# mac_roman codec
'csmacintosh' : 'mac_roman',
'macintosh' : 'mac_roman',
'mac' : 'mac_roman',
'macroman' : 'mac_roman',
# mac_turkish codec
@ -521,6 +558,7 @@
's_jisx0213' : 'shift_jisx0213',
# tis_620 codec
'cstis620' : 'tis_620',
'tis620' : 'tis_620',
'tis_620_0' : 'tis_620',
'tis_620_2529_0' : 'tis_620',
@ -528,33 +566,42 @@
'iso_ir_166' : 'tis_620',
# utf_16 codec
'csutf16' : 'utf_16',
'u16' : 'utf_16',
'utf16' : 'utf_16',
# utf_16_be codec
'csutf16be' : 'utf_16_be',
'unicodebigunmarked' : 'utf_16_be',
'utf_16be' : 'utf_16_be',
# utf_16_le codec
'csutf16le' : 'utf_16_le',
'unicodelittleunmarked' : 'utf_16_le',
'utf_16le' : 'utf_16_le',
# utf_32 codec
'csutf32' : 'utf_32',
'u32' : 'utf_32',
'utf32' : 'utf_32',
# utf_32_be codec
'csutf32be' : 'utf_32_be',
'utf_32be' : 'utf_32_be',
# utf_32_le codec
'csutf32le' : 'utf_32_le',
'utf_32le' : 'utf_32_le',
# utf_7 codec
'csunicode11utf7' : 'utf_7',
'csutf7' : 'utf_7',
'u7' : 'utf_7',
'utf7' : 'utf_7',
'unicode_1_1_utf_7' : 'utf_7',
# utf_8 codec
'csutf8' : 'utf_8',
'u8' : 'utf_8',
'utf' : 'utf_8',
'utf8' : 'utf_8',

View file

@ -0,0 +1 @@
Add support for more encoding aliases `officially registered with IANA <https://www.iana.org/assignments/character-sets/character-sets.xhtml>`__.