mirror of
https://github.com/python/cpython.git
synced 2026-06-04 16:50:51 +00:00
gh-62259: Add support of multi-byte encodings in the XML parser (GH-149860)
Supported encodings: "cp932", "cp949", "cp950", "Big5", "EUC-JP", "GB2312", "GBK", "johab", and "Shift_JIS". Partially supported encodings (only BMP characters): "Big5-HKSCS", "EUC_JIS-2004", "EUC_JISX0213", "Shift_JIS-2004", "Shift_JISX0213", "utf-8-sig" and non-standard aliases like "UTF8" (without hyphen). The parser now raises ValueError for known unsupported multi-byte encodings such as "ISO-2022-JP" or "raw-unicode-escape" instead of failing later, when encountering non-ASCII data.
This commit is contained in:
parent
a34edf7446
commit
8ab7b43a14
47 changed files with 401 additions and 29 deletions
|
|
@ -63,12 +63,26 @@ The :mod:`!xml.parsers.expat` module contains two functions:
|
|||
|
||||
.. function:: ParserCreate(encoding=None, namespace_separator=None)
|
||||
|
||||
Creates and returns a new :class:`xmlparser` object. *encoding*, if specified,
|
||||
must be a string naming the encoding used by the XML data. Expat doesn't
|
||||
support as many encodings as Python does, and its repertoire of encodings can't
|
||||
be extended; it supports UTF-8, UTF-16, ISO-8859-1 (Latin1), and ASCII. If
|
||||
*encoding* [1]_ is given it will override the implicit or explicit encoding of the
|
||||
document.
|
||||
Creates and returns a new :class:`xmlparser` object.
|
||||
*encoding* [1]_, if specified, must be a string naming the encoding
|
||||
used by the XML data.
|
||||
If it is given it will override the implicit or explicit encoding
|
||||
of the document.
|
||||
|
||||
.. impl-detail::
|
||||
|
||||
Expat natively understands and processes UTF-8, UTF-16, UTF-16BE,
|
||||
UTF-16LE, ISO-8859-1, and US-ASCII.
|
||||
For other encodings (including aliases like Latin1 and ASCII) it
|
||||
falls back to Python.
|
||||
It supports most 8-bit encodings and many multi-byte encodings
|
||||
like Shift_JIS, although only BMP characters (``U+0000-U+FFFF``)
|
||||
are supported with non-native encodings (this restriction is also
|
||||
applied to aliases like UTF8).
|
||||
These restrictions only apply if *encoding* is not given.
|
||||
|
||||
.. versionchanged:: next
|
||||
Added support for multi-byte encodings.
|
||||
|
||||
.. _xmlparser-non-root:
|
||||
|
||||
|
|
@ -113,7 +127,6 @@ The :mod:`!xml.parsers.expat` module contains two functions:
|
|||
XML document. Call ``ParserCreate`` for each document to provide unique
|
||||
parser instances.
|
||||
|
||||
|
||||
.. seealso::
|
||||
|
||||
`The Expat XML Parser <http://www.libexpat.org/>`_
|
||||
|
|
@ -1083,9 +1096,11 @@ The ``errors`` module has the following attributes:
|
|||
|
||||
.. rubric:: Footnotes
|
||||
|
||||
.. [1] The encoding string included in XML output should conform to the
|
||||
appropriate standards. For example, "UTF-8" is valid, but "UTF8" is
|
||||
not. See https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EncodingDecl
|
||||
.. [1] The encoding string included in XML output should conform to
|
||||
the appropriate standards. For example, "UTF-8" is valid, but
|
||||
"UTF8" is not valid in an XML document's declaration, even though
|
||||
Python accepts it as an encoding name.
|
||||
See https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EncodingDecl
|
||||
and https://www.iana.org/assignments/character-sets/character-sets.xhtml.
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -86,7 +86,6 @@ New modules
|
|||
Improved modules
|
||||
================
|
||||
|
||||
|
||||
gzip
|
||||
----
|
||||
|
||||
|
|
@ -101,6 +100,21 @@ os
|
|||
process via a pidfd. Available on Linux 5.6+.
|
||||
(Contributed by Maurycy Pawłowski-Wieroński in :gh:`149464`.)
|
||||
|
||||
xml
|
||||
---
|
||||
|
||||
* Add support for multiple multi-byte encodings in the :mod:`XML parser
|
||||
<xml.parsers.expat>`: "cp932", "cp949", "cp950", "Big5", "EUC-JP",
|
||||
"GB2312", "GBK", "johab", and "Shift_JIS".
|
||||
Add partial support (only BMP characters) for multi-byte encodings
|
||||
"Big5-HKSCS", "EUC_JIS-2004", "EUC_JISX0213", "Shift_JIS-2004",
|
||||
"Shift_JISX0213", "utf-8-sig" and non-standard aliases like "UTF8"
|
||||
(without hyphen).
|
||||
The parser now raises :exc:`ValueError` for known unsupported
|
||||
multi-byte encodings such as "ISO-2022-JP" or "raw-unicode-escape"
|
||||
instead of failing later, when encountering non-ASCII data.
|
||||
(Contributed by Serhiy Storchaka in :gh:`62259`.)
|
||||
|
||||
.. Add improved modules above alphabetically, not here at the end.
|
||||
|
||||
Optimizations
|
||||
|
|
|
|||
|
|
@ -45,7 +45,7 @@ extern int _PyCodec_UnregisterError(const char *name);
|
|||
in Python 3.5+?
|
||||
|
||||
*/
|
||||
extern PyObject* _PyCodec_LookupTextEncoding(
|
||||
PyAPI_FUNC(PyObject*) _PyCodec_LookupTextEncoding(
|
||||
const char *encoding,
|
||||
const char *alternate_command);
|
||||
|
||||
|
|
|
|||
|
|
@ -93,7 +93,7 @@ class CodecInfo(tuple):
|
|||
|
||||
def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
|
||||
incrementalencoder=None, incrementaldecoder=None, name=None,
|
||||
*, _is_text_encoding=None):
|
||||
*, _is_text_encoding=None, _expat_decoding_table=None):
|
||||
self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
|
||||
self.name = name
|
||||
self.encode = encode
|
||||
|
|
@ -104,6 +104,8 @@ def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
|
|||
self.streamreader = streamreader
|
||||
if _is_text_encoding is not None:
|
||||
self._is_text_encoding = _is_text_encoding
|
||||
if _expat_decoding_table is not None:
|
||||
self._expat_decoding_table = _expat_decoding_table
|
||||
return self
|
||||
|
||||
def __repr__(self):
|
||||
|
|
|
|||
|
|
@ -36,4 +36,13 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamreader=StreamReader,
|
||||
streamwriter=StreamWriter,
|
||||
_expat_decoding_table=(*range(128),
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -1, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1),
|
||||
)
|
||||
|
|
|
|||
|
|
@ -36,4 +36,13 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamreader=StreamReader,
|
||||
streamwriter=StreamWriter,
|
||||
_expat_decoding_table=(*range(128),
|
||||
-1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1),
|
||||
)
|
||||
|
|
|
|||
|
|
@ -36,4 +36,18 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamreader=StreamReader,
|
||||
streamwriter=StreamWriter,
|
||||
_expat_decoding_table=(*range(128),
|
||||
0x80, -2, -2, -2, -2, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
0xf8f0, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65, 0xff66, 0xff67,
|
||||
0xff68, 0xff69, 0xff6a, 0xff6b, 0xff6c, 0xff6d, 0xff6e, 0xff6f,
|
||||
0xff70, 0xff71, 0xff72, 0xff73, 0xff74, 0xff75, 0xff76, 0xff77,
|
||||
0xff78, 0xff79, 0xff7a, 0xff7b, 0xff7c, 0xff7d, 0xff7e, 0xff7f,
|
||||
0xff80, 0xff81, 0xff82, 0xff83, 0xff84, 0xff85, 0xff86, 0xff87,
|
||||
0xff88, 0xff89, 0xff8a, 0xff8b, 0xff8c, 0xff8d, 0xff8e, 0xff8f,
|
||||
0xff90, 0xff91, 0xff92, 0xff93, 0xff94, 0xff95, 0xff96, 0xff97,
|
||||
0xff98, 0xff99, 0xff9a, 0xff9b, 0xff9c, 0xff9d, 0xff9e, 0xff9f,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -2, -2, -1,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -1, -1, -1, 0xf8f1, 0xf8f2, 0xf8f3),
|
||||
)
|
||||
|
|
|
|||
|
|
@ -36,4 +36,13 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamreader=StreamReader,
|
||||
streamwriter=StreamWriter,
|
||||
_expat_decoding_table=(*range(128),
|
||||
-1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1),
|
||||
)
|
||||
|
|
|
|||
|
|
@ -36,4 +36,13 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamreader=StreamReader,
|
||||
streamwriter=StreamWriter,
|
||||
_expat_decoding_table=(*range(128),
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -1, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1),
|
||||
)
|
||||
|
|
|
|||
|
|
@ -36,4 +36,13 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamreader=StreamReader,
|
||||
streamwriter=StreamWriter,
|
||||
_expat_decoding_table=(*range(128),
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -3,
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1),
|
||||
)
|
||||
|
|
|
|||
|
|
@ -36,4 +36,13 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamreader=StreamReader,
|
||||
streamwriter=StreamWriter,
|
||||
_expat_decoding_table=(*range(128),
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -3,
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1),
|
||||
)
|
||||
|
|
|
|||
|
|
@ -36,4 +36,13 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamreader=StreamReader,
|
||||
streamwriter=StreamWriter,
|
||||
_expat_decoding_table=(*range(128),
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -3,
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1),
|
||||
)
|
||||
|
|
|
|||
|
|
@ -36,4 +36,5 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamreader=StreamReader,
|
||||
streamwriter=StreamWriter,
|
||||
_expat_decoding_table=False,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -36,4 +36,5 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamreader=StreamReader,
|
||||
streamwriter=StreamWriter,
|
||||
_expat_decoding_table=False,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -36,4 +36,13 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamreader=StreamReader,
|
||||
streamwriter=StreamWriter,
|
||||
_expat_decoding_table=(*range(128),
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1),
|
||||
)
|
||||
|
|
|
|||
|
|
@ -36,4 +36,13 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamreader=StreamReader,
|
||||
streamwriter=StreamWriter,
|
||||
_expat_decoding_table=(*range(128),
|
||||
-1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1),
|
||||
)
|
||||
|
|
|
|||
|
|
@ -36,4 +36,5 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamreader=StreamReader,
|
||||
streamwriter=StreamWriter,
|
||||
_expat_decoding_table=False,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -385,4 +385,5 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamwriter=StreamWriter,
|
||||
streamreader=StreamReader,
|
||||
_expat_decoding_table=False,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -36,4 +36,5 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamreader=StreamReader,
|
||||
streamwriter=StreamWriter,
|
||||
_expat_decoding_table=False,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -36,4 +36,5 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamreader=StreamReader,
|
||||
streamwriter=StreamWriter,
|
||||
_expat_decoding_table=False,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -36,4 +36,5 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamreader=StreamReader,
|
||||
streamwriter=StreamWriter,
|
||||
_expat_decoding_table=False,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -36,4 +36,5 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamreader=StreamReader,
|
||||
streamwriter=StreamWriter,
|
||||
_expat_decoding_table=False,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -36,4 +36,5 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamreader=StreamReader,
|
||||
streamwriter=StreamWriter,
|
||||
_expat_decoding_table=False,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -36,4 +36,5 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamreader=StreamReader,
|
||||
streamwriter=StreamWriter,
|
||||
_expat_decoding_table=False,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -36,4 +36,5 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamreader=StreamReader,
|
||||
streamwriter=StreamWriter,
|
||||
_expat_decoding_table=False,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -36,4 +36,13 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamreader=StreamReader,
|
||||
streamwriter=StreamWriter,
|
||||
_expat_decoding_table=(*range(128),
|
||||
-1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -1,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1),
|
||||
)
|
||||
|
|
|
|||
|
|
@ -250,4 +250,5 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamwriter=StreamWriter,
|
||||
streamreader=StreamReader,
|
||||
_expat_decoding_table=False,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -43,4 +43,5 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamwriter=StreamWriter,
|
||||
streamreader=StreamReader,
|
||||
_expat_decoding_table=False,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -36,4 +36,17 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamreader=StreamReader,
|
||||
streamwriter=StreamWriter,
|
||||
_expat_decoding_table=(*range(128),
|
||||
-1, -2, -2, -2, -2, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-1, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65, 0xff66, 0xff67,
|
||||
0xff68, 0xff69, 0xff6a, 0xff6b, 0xff6c, 0xff6d, 0xff6e, 0xff6f,
|
||||
0xff70, 0xff71, 0xff72, 0xff73, 0xff74, 0xff75, 0xff76, 0xff77,
|
||||
0xff78, 0xff79, 0xff7a, 0xff7b, 0xff7c, 0xff7d, 0xff7e, 0xff7f,
|
||||
0xff80, 0xff81, 0xff82, 0xff83, 0xff84, 0xff85, 0xff86, 0xff87,
|
||||
0xff88, 0xff89, 0xff8a, 0xff8b, 0xff8c, 0xff8d, 0xff8e, 0xff8f,
|
||||
0xff90, 0xff91, 0xff92, 0xff93, 0xff94, 0xff95, 0xff96, 0xff97,
|
||||
0xff98, 0xff99, 0xff9a, 0xff9b, 0xff9c, 0xff9d, 0xff9e, 0xff9f,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1,
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1),
|
||||
)
|
||||
|
|
|
|||
|
|
@ -36,4 +36,18 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamreader=StreamReader,
|
||||
streamwriter=StreamWriter,
|
||||
_expat_decoding_table=(
|
||||
*range(0x5c), 0xa5, *range(0x5d, 0x7e), 0x203e, 0x7f,
|
||||
-1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-1, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65, 0xff66, 0xff67,
|
||||
0xff68, 0xff69, 0xff6a, 0xff6b, 0xff6c, 0xff6d, 0xff6e, 0xff6f,
|
||||
0xff70, 0xff71, 0xff72, 0xff73, 0xff74, 0xff75, 0xff76, 0xff77,
|
||||
0xff78, 0xff79, 0xff7a, 0xff7b, 0xff7c, 0xff7d, 0xff7e, 0xff7f,
|
||||
0xff80, 0xff81, 0xff82, 0xff83, 0xff84, 0xff85, 0xff86, 0xff87,
|
||||
0xff88, 0xff89, 0xff8a, 0xff8b, 0xff8c, 0xff8d, 0xff8e, 0xff8f,
|
||||
0xff90, 0xff91, 0xff92, 0xff93, 0xff94, 0xff95, 0xff96, 0xff97,
|
||||
0xff98, 0xff99, 0xff9a, 0xff9b, 0xff9c, 0xff9d, 0xff9e, 0xff9f,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1),
|
||||
)
|
||||
|
|
|
|||
|
|
@ -36,4 +36,18 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamreader=StreamReader,
|
||||
streamwriter=StreamWriter,
|
||||
_expat_decoding_table=(
|
||||
*range(0x5c), 0xa5, *range(0x5d, 0x7e), 0x203e, 0x7f,
|
||||
-1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-1, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65, 0xff66, 0xff67,
|
||||
0xff68, 0xff69, 0xff6a, 0xff6b, 0xff6c, 0xff6d, 0xff6e, 0xff6f,
|
||||
0xff70, 0xff71, 0xff72, 0xff73, 0xff74, 0xff75, 0xff76, 0xff77,
|
||||
0xff78, 0xff79, 0xff7a, 0xff7b, 0xff7c, 0xff7d, 0xff7e, 0xff7f,
|
||||
0xff80, 0xff81, 0xff82, 0xff83, 0xff84, 0xff85, 0xff86, 0xff87,
|
||||
0xff88, 0xff89, 0xff8a, 0xff8b, 0xff8c, 0xff8d, 0xff8e, 0xff8f,
|
||||
0xff90, 0xff91, 0xff92, 0xff93, 0xff94, 0xff95, 0xff96, 0xff97,
|
||||
0xff98, 0xff99, 0xff9a, 0xff9b, 0xff9c, 0xff9d, 0xff9e, 0xff9f,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1),
|
||||
)
|
||||
|
|
|
|||
|
|
@ -43,4 +43,5 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamwriter=StreamWriter,
|
||||
streamreader=StreamReader,
|
||||
_expat_decoding_table=False,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -152,4 +152,5 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamreader=StreamReader,
|
||||
streamwriter=StreamWriter,
|
||||
_expat_decoding_table=False,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -39,4 +39,5 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamreader=StreamReader,
|
||||
streamwriter=StreamWriter,
|
||||
_expat_decoding_table=False,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -39,4 +39,5 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamreader=StreamReader,
|
||||
streamwriter=StreamWriter,
|
||||
_expat_decoding_table=False,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -147,4 +147,5 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamreader=StreamReader,
|
||||
streamwriter=StreamWriter,
|
||||
_expat_decoding_table=False,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -34,4 +34,5 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamreader=StreamReader,
|
||||
streamwriter=StreamWriter,
|
||||
_expat_decoding_table=False,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -34,4 +34,5 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamreader=StreamReader,
|
||||
streamwriter=StreamWriter,
|
||||
_expat_decoding_table=False,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -35,4 +35,5 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamreader=StreamReader,
|
||||
streamwriter=StreamWriter,
|
||||
_expat_decoding_table=False,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -39,4 +39,13 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamreader=StreamReader,
|
||||
streamwriter=StreamWriter,
|
||||
_expat_decoding_table=(*range(128),
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,
|
||||
-4, -4, -4, -4, -4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1),
|
||||
)
|
||||
|
|
|
|||
|
|
@ -127,4 +127,14 @@ def getregentry():
|
|||
incrementaldecoder=IncrementalDecoder,
|
||||
streamreader=StreamReader,
|
||||
streamwriter=StreamWriter,
|
||||
# The same as for UTF-8.
|
||||
_expat_decoding_table=(*range(128),
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
|
||||
-3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3,
|
||||
-4, -4, -4, -4, -4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1),
|
||||
)
|
||||
|
|
|
|||
|
|
@ -1892,9 +1892,11 @@ def test_copy(self):
|
|||
self.assertIsNot(dup, orig)
|
||||
self.assertEqual(dup, orig)
|
||||
self.assertTrue(orig._is_text_encoding)
|
||||
self.assertIsInstance(orig._expat_decoding_table, tuple)
|
||||
self.assertEqual(dup.encode, orig.encode)
|
||||
self.assertEqual(dup.name, orig.name)
|
||||
self.assertEqual(dup.incrementalencoder, orig.incrementalencoder)
|
||||
self.assertIs(dup._expat_decoding_table, orig._expat_decoding_table)
|
||||
|
||||
# Test a CodecInfo with _is_text_encoding equal to false.
|
||||
orig = codecs.lookup("base64")
|
||||
|
|
@ -1902,9 +1904,11 @@ def test_copy(self):
|
|||
self.assertIsNot(dup, orig)
|
||||
self.assertEqual(dup, orig)
|
||||
self.assertFalse(orig._is_text_encoding)
|
||||
self.assertNotHasAttr(orig, '_expat_decoding_table')
|
||||
self.assertEqual(dup.encode, orig.encode)
|
||||
self.assertEqual(dup.name, orig.name)
|
||||
self.assertEqual(dup.incrementalencoder, orig.incrementalencoder)
|
||||
self.assertNotHasAttr(dup, '_expat_decoding_table')
|
||||
|
||||
def test_deepcopy(self):
|
||||
orig = codecs.lookup('utf-8')
|
||||
|
|
@ -1912,9 +1916,11 @@ def test_deepcopy(self):
|
|||
self.assertIsNot(dup, orig)
|
||||
self.assertEqual(dup, orig)
|
||||
self.assertTrue(orig._is_text_encoding)
|
||||
self.assertIsInstance(orig._expat_decoding_table, tuple)
|
||||
self.assertEqual(dup.encode, orig.encode)
|
||||
self.assertEqual(dup.name, orig.name)
|
||||
self.assertEqual(dup.incrementalencoder, orig.incrementalencoder)
|
||||
self.assertIs(dup._expat_decoding_table, orig._expat_decoding_table)
|
||||
|
||||
# Test a CodecInfo with _is_text_encoding equal to false.
|
||||
orig = codecs.lookup("base64")
|
||||
|
|
@ -1922,9 +1928,11 @@ def test_deepcopy(self):
|
|||
self.assertIsNot(dup, orig)
|
||||
self.assertEqual(dup, orig)
|
||||
self.assertFalse(orig._is_text_encoding)
|
||||
self.assertNotHasAttr(orig, '_expat_decoding_table')
|
||||
self.assertEqual(dup.encode, orig.encode)
|
||||
self.assertEqual(dup.name, orig.name)
|
||||
self.assertEqual(dup.incrementalencoder, orig.incrementalencoder)
|
||||
self.assertNotHasAttr(dup, '_expat_decoding_table')
|
||||
|
||||
def test_pickle(self):
|
||||
codec_info = codecs.lookup('utf-8')
|
||||
|
|
@ -1940,6 +1948,8 @@ def test_pickle(self):
|
|||
unpickled_codec_info.incrementalencoder
|
||||
)
|
||||
self.assertTrue(unpickled_codec_info._is_text_encoding)
|
||||
self.assertEqual(unpickled_codec_info._expat_decoding_table,
|
||||
codec_info._expat_decoding_table)
|
||||
|
||||
# Test a CodecInfo with _is_text_encoding equal to false.
|
||||
codec_info = codecs.lookup('base64')
|
||||
|
|
@ -1955,6 +1965,7 @@ def test_pickle(self):
|
|||
unpickled_codec_info.incrementalencoder
|
||||
)
|
||||
self.assertFalse(unpickled_codec_info._is_text_encoding)
|
||||
self.assertNotHasAttr(unpickled_codec_info, '_expat_decoding_table')
|
||||
|
||||
|
||||
class StreamReaderTest(unittest.TestCase):
|
||||
|
|
|
|||
|
|
@ -276,7 +276,9 @@ def test_parse_again(self):
|
|||
expat.errors.XML_ERROR_FINISHED)
|
||||
|
||||
@support.subTests('encoding', [
|
||||
'utf-8', 'utf-16', 'utf-16be', 'utf-16le',
|
||||
# built-in Expat encodings
|
||||
'iso-8859-1', 'utf-8', 'utf-16', 'utf-16be', 'utf-16le',
|
||||
# 8-bit Python encodings
|
||||
'iso8859-1', 'iso8859-2', 'iso8859-3', 'iso8859-4', 'iso8859-5',
|
||||
'iso8859-6', 'iso8859-7', 'iso8859-8', 'iso8859-9', 'iso8859-10',
|
||||
'iso8859-13', 'iso8859-14', 'iso8859-15', 'iso8859-16',
|
||||
|
|
@ -288,6 +290,12 @@ def test_parse_again(self):
|
|||
'mac-cyrillic', 'mac-greek', 'mac-iceland', 'mac-latin2',
|
||||
'mac-roman', 'mac-turkish',
|
||||
'koi8-r', 'koi8-t', 'koi8-u', 'kz1048', 'ptcp154',
|
||||
# multi-byte Python encodings
|
||||
"cp932", "cp949", "cp950",
|
||||
"Big5","EUC-JP", "GB2312", "GBK", "johab", "Shift_JIS",
|
||||
'UTF8', 'utf-8-sig',
|
||||
"Big5-HKSCS", "EUC_JIS-2004", "EUC_JISX0213",
|
||||
"Shift_JIS-2004", "Shift_JISX0213",
|
||||
])
|
||||
def test_supported_encodings(self, encoding):
|
||||
out = self.Outputter()
|
||||
|
|
@ -305,7 +313,7 @@ def test_supported_encodings(self, encoding):
|
|||
])
|
||||
|
||||
@support.subTests('encoding', [
|
||||
'UTF-8', 'utf-8', 'utf-16', 'utf-16le', 'utf-16be',
|
||||
'UTF-8', 'utf-8', 'utf8', 'utf-16', 'utf-16le', 'utf-16be',
|
||||
'koi8-u', 'cp1125', 'cp1251', 'iso8859-5', 'mac-cyrillic',
|
||||
])
|
||||
def test_supported_encodings2(self, encoding):
|
||||
|
|
@ -324,15 +332,46 @@ def test_supported_encodings2(self, encoding):
|
|||
"End element: 'корінь'",
|
||||
])
|
||||
|
||||
@support.subTests('encoding', [
|
||||
'utf-8', 'utf-16', 'utf-16be', 'utf-16le',
|
||||
])
|
||||
def test_supported_non_bmp(self, encoding):
|
||||
out = self.Outputter()
|
||||
parser = expat.ParserCreate()
|
||||
self._hookup_callbacks(parser, out)
|
||||
c = '\U00020e6d\U00028e36'
|
||||
data = (f'<?xml version="1.0" encoding="{encoding}"?>\n'
|
||||
f'<root>{c}</root>').encode(encoding)
|
||||
parser.Parse(data, True)
|
||||
self.assertEqual(out.out, [
|
||||
('XML declaration', ('1.0', encoding, -1)),
|
||||
"Start element: 'root' {}",
|
||||
f'Character data: {c!r}',
|
||||
"End element: 'root'",
|
||||
])
|
||||
|
||||
@support.subTests('encoding', [
|
||||
'UTF8', 'utf-8-sig',
|
||||
"Big5-HKSCS", "EUC_JIS-2004", "EUC_JISX0213",
|
||||
"Shift_JIS-2004", "Shift_JISX0213",
|
||||
])
|
||||
def test_unsupported_non_bmp(self, encoding):
|
||||
parser = expat.ParserCreate()
|
||||
c = '\U00020e6d\U00028e36'
|
||||
data = (f'<?xml version="1.0" encoding="{encoding}"?>\n'
|
||||
f'<root>{c}</root>').encode(encoding)
|
||||
with self.assertRaises(expat.ExpatError):
|
||||
parser.Parse(data, True)
|
||||
|
||||
@support.subTests('encoding', [
|
||||
'UTF-7',
|
||||
"Big5-HKSCS", "Big5",
|
||||
"cp932", "cp949", "cp950",
|
||||
"EUC_JIS-2004", "EUC_JISX0213", "EUC-JP", "EUC-KR",
|
||||
"GB18030", "GB2312", "GBK",
|
||||
"unicode-escape", "raw-unicode-escape",
|
||||
"EUC-KR",
|
||||
"GB18030",
|
||||
"HZ-GB-2312",
|
||||
"ISO-2022-JP", "ISO-2022-JP-1", "ISO-2022-JP-2004",
|
||||
"ISO-2022-JP-2", "ISO-2022-JP-3", "ISO-2022-JP-EXT",
|
||||
"ISO-2022-KR",
|
||||
"johab",
|
||||
"Shift_JIS", "Shift_JIS-2004", "Shift_JISX0213",
|
||||
])
|
||||
def test_unsupported_encodings(self, encoding):
|
||||
parser = expat.ParserCreate()
|
||||
|
|
|
|||
|
|
@ -1008,6 +1008,8 @@ def check(encoding, body=''):
|
|||
check("iso-8859-15", '\u20ac')
|
||||
check("cp437", '\u221a')
|
||||
check("mac-roman", '\u02da')
|
||||
check('shift-jis-2004', '\u203e\u3406\uff66')
|
||||
check('euc-jis-2004', '\u3406\uff66')
|
||||
|
||||
def xml(encoding, body=''):
|
||||
return "<?xml version='1.0' encoding='%s'?><xml>%s</xml>" % (encoding, body)
|
||||
|
|
@ -1026,6 +1028,12 @@ def bxml(encoding, body=''):
|
|||
'mac-cyrillic', 'mac-greek', 'mac-iceland', 'mac-latin2',
|
||||
'mac-roman', 'mac-turkish',
|
||||
'koi8-r', 'koi8-t', 'koi8-u', 'kz1048', 'ptcp154',
|
||||
'big5', 'big5hkscs',
|
||||
'cp932', 'cp949', 'cp950',
|
||||
'euc-jp', 'euc-jis-2004', 'euc-jisx0213',
|
||||
'gb2312', 'gbk', 'johab',
|
||||
'shift-jis', 'shift-jis-2004', 'shift-jisx0213',
|
||||
'utf-8-sig', 'utf8',
|
||||
]
|
||||
for encoding in supported_encodings:
|
||||
with self.subTest(encoding=encoding):
|
||||
|
|
@ -1035,12 +1043,10 @@ def bxml(encoding, body=''):
|
|||
('<xml>&#%d;</xml>' % ord(c)).encode())
|
||||
|
||||
unsupported_ascii_compatible_encodings = [
|
||||
'big5', 'big5hkscs',
|
||||
'cp932', 'cp949', 'cp950',
|
||||
'euc-jp', 'euc-jis-2004', 'euc-jisx0213', 'euc-kr',
|
||||
'gb2312', 'gbk', 'gb18030',
|
||||
'iso2022-kr', 'johab',
|
||||
'shift-jis', 'shift-jis-2004', 'shift-jisx0213',
|
||||
'euc-kr', 'gb18030',
|
||||
'iso2022-jp', 'iso2022-jp-1', 'iso2022-jp-2', 'iso2022-jp-2004',
|
||||
'iso2022-jp-3', 'iso2022-jp-ext',
|
||||
'iso2022-kr', 'hz',
|
||||
'utf-7',
|
||||
]
|
||||
for encoding in unsupported_ascii_compatible_encodings:
|
||||
|
|
|
|||
|
|
@ -0,0 +1,9 @@
|
|||
Add support for multiple multi-byte encodings in the :mod:`XML parser
|
||||
<xml.parsers.expat>`: "cp932", "cp949", "cp950", "Big5", "EUC-JP", "GB2312",
|
||||
"GBK", "johab", and "Shift_JIS". Add partial support (only BMP characters)
|
||||
for multi-byte encodings "Big5-HKSCS", "EUC_JIS-2004", "EUC_JISX0213",
|
||||
"Shift_JIS-2004", "Shift_JISX0213", "utf-8-sig" and non-standard aliases
|
||||
like "UTF8" (without hyphen). The parser now raises :exc:`ValueError` for
|
||||
known unsupported multi-byte encodings such as "ISO-2022-JP" or
|
||||
"raw-unicode-escape" instead of failing later, when encountering non-ASCII
|
||||
data.
|
||||
|
|
@ -4,6 +4,7 @@
|
|||
|
||||
#include "Python.h"
|
||||
#include "pycore_ceval.h" // _Py_EnterRecursiveCall()
|
||||
#include "pycore_codecs.h" // _PyCodec_LookupTextEncoding()
|
||||
#include "pycore_import.h" // _PyImport_SetModule()
|
||||
#include "pycore_pyhash.h" // _Py_HashSecret
|
||||
#include "pycore_traceback.h" // _PyTraceback_Add()
|
||||
|
|
@ -1438,6 +1439,57 @@ static struct PyMethodDef xmlparse_methods[] = {
|
|||
Make it as simple as possible.
|
||||
*/
|
||||
|
||||
/* State attached to an Expat encoding for multi-byte codecs.

   The codec name is stored inline, directly after the table, so that the
   whole object is a single allocation. */
typedef struct {
    /* One entry per possible first byte, copied from the codec's
       _expat_decoding_table. */
    int map[256];
    /* NUL-terminated codec name.  Declared as a C99 flexible array member
       rather than the non-standard zero-length array "name[0]". */
    char name[];
} pyexpat_encoding_info;
|
||||
|
||||
static pyexpat_encoding_info *
|
||||
pyexpat_encoding_create(const char *name, PyObject *mapping)
|
||||
{
|
||||
if (!PyTuple_Check(mapping) || PyTuple_GET_SIZE(mapping) != 256) {
|
||||
PyErr_SetString(PyExc_ValueError,
|
||||
"_expat_decoding_table must be a 256-tuple of integers");
|
||||
return NULL;
|
||||
}
|
||||
pyexpat_encoding_info *info = (pyexpat_encoding_info *)PyMem_Malloc(
|
||||
sizeof(pyexpat_encoding_info) + strlen(name) + 1);
|
||||
if (info == NULL) {
|
||||
PyErr_NoMemory();
|
||||
return NULL;
|
||||
}
|
||||
for (int i = 0; i < 256; i++) {
|
||||
int j = PyLong_AsInt(PyTuple_GET_ITEM(mapping, i));
|
||||
if (j == -1 && PyErr_Occurred()) {
|
||||
PyMem_Free(info);
|
||||
return NULL;
|
||||
}
|
||||
info->map[i] = j;
|
||||
}
|
||||
strcpy(info->name, name);
|
||||
return info;
|
||||
}
|
||||
|
||||
static int
|
||||
pyexpat_encoding_convert(void *data, const char *s)
|
||||
{
|
||||
pyexpat_encoding_info *info = (pyexpat_encoding_info *)data;
|
||||
int i = (unsigned char)s[0];
|
||||
assert(info->map[i] < -1);
|
||||
PyObject *u = PyUnicode_Decode(s, -info->map[i], info->name, NULL);
|
||||
if (u == NULL) {
|
||||
return -1;
|
||||
}
|
||||
if (PyUnicode_GET_LENGTH(u) != 1) {
|
||||
Py_DECREF(u);
|
||||
return -1;
|
||||
}
|
||||
Py_UCS4 ch = PyUnicode_ReadChar(u, 0);
|
||||
Py_DECREF(u);
|
||||
return (int)ch;
|
||||
}
|
||||
|
||||
|
||||
static const unsigned char template_buffer[256] =
|
||||
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
|
||||
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
|
||||
|
|
@ -1470,6 +1522,43 @@ PyUnknownEncodingHandler(void *encodingHandlerData,
|
|||
if (PyErr_Occurred())
|
||||
return XML_STATUS_ERROR;
|
||||
|
||||
PyObject *codec = _PyCodec_LookupTextEncoding(name, NULL);
|
||||
if (codec == NULL) {
|
||||
return XML_STATUS_ERROR;
|
||||
}
|
||||
if (!PyTuple_CheckExact(codec)) {
|
||||
PyObject *attr;
|
||||
if (PyObject_GetOptionalAttrString(codec, "_expat_decoding_table", &attr) < 0) {
|
||||
Py_DECREF(codec);
|
||||
return XML_STATUS_ERROR;
|
||||
}
|
||||
if (attr != NULL) {
|
||||
if (attr == Py_False) {
|
||||
Py_DECREF(attr);
|
||||
Py_DECREF(codec);
|
||||
PyErr_Format(PyExc_ValueError,
|
||||
"encoding '%s' is not supported",
|
||||
name);
|
||||
return XML_STATUS_ERROR;
|
||||
}
|
||||
pyexpat_encoding_info *data = pyexpat_encoding_create(name, attr);
|
||||
Py_DECREF(attr);
|
||||
if (data == NULL) {
|
||||
Py_DECREF(codec);
|
||||
return XML_STATUS_ERROR;
|
||||
}
|
||||
for (i = 0; i < 256; i++) {
|
||||
info->map[i] = data->map[i];
|
||||
}
|
||||
info->data = data;
|
||||
info->convert = pyexpat_encoding_convert;
|
||||
info->release = PyMem_Free;
|
||||
Py_DECREF(codec);
|
||||
return XML_STATUS_OK;
|
||||
}
|
||||
}
|
||||
Py_DECREF(codec);
|
||||
|
||||
u = PyUnicode_Decode((const char*) template_buffer, 256, name, "replace");
|
||||
if (u == NULL) {
|
||||
Py_XDECREF(u);
|
||||
|
|
@ -1478,8 +1567,9 @@ PyUnknownEncodingHandler(void *encodingHandlerData,
|
|||
|
||||
if (PyUnicode_GET_LENGTH(u) != 256) {
|
||||
Py_DECREF(u);
|
||||
PyErr_SetString(PyExc_ValueError,
|
||||
"multi-byte encodings are not supported");
|
||||
PyErr_Format(PyExc_ValueError,
|
||||
"multi-byte encoding '%s' is not supported",
|
||||
name);
|
||||
return XML_STATUS_ERROR;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@ Copyright (c) Corporation for National Research Initiatives.
|
|||
|
||||
#include "Python.h"
|
||||
#include "pycore_call.h" // _PyObject_CallNoArgs()
|
||||
#include "pycore_codecs.h" // export _PyCodec_LookupTextEncoding()
|
||||
#include "pycore_interp.h" // PyInterpreterState.codec_search_path
|
||||
#include "pycore_pyerrors.h" // _PyErr_FormatNote()
|
||||
#include "pycore_pystate.h" // _PyInterpreterState_GET()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue