mirror of
https://github.com/python/cpython.git
synced 2025-12-08 06:10:17 +00:00
parent
6826166280
commit
a3ce2f77f0
6 changed files with 123 additions and 22 deletions
|
|
@ -30,6 +30,7 @@
|
|||
|
||||
import codecs
|
||||
import sys
|
||||
from _codecs import _normalize_encoding
|
||||
from . import aliases
|
||||
|
||||
_cache = {}
|
||||
|
|
@ -55,18 +56,7 @@ def normalize_encoding(encoding):
|
|||
if isinstance(encoding, bytes):
|
||||
encoding = str(encoding, "ascii")
|
||||
|
||||
chars = []
|
||||
punct = False
|
||||
for c in encoding:
|
||||
if c.isalnum() or c == '.':
|
||||
if punct and chars:
|
||||
chars.append('_')
|
||||
if c.isascii():
|
||||
chars.append(c)
|
||||
punct = False
|
||||
else:
|
||||
punct = True
|
||||
return ''.join(chars)
|
||||
return _normalize_encoding(encoding)
|
||||
|
||||
def search_function(encoding):
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,4 @@
|
|||
:mod:`encodings`: Improve :func:`~encodings.normalize_encoding` performance
|
||||
by implementing the function in C using the private
|
||||
``_Py_normalize_encoding`` function, which has been modified to make lowercase
|
||||
conversion optional.
|
||||
|
|
@ -1018,6 +1018,47 @@ _codecs_lookup_error_impl(PyObject *module, const char *name)
|
|||
return PyCodec_LookupError(name);
|
||||
}
|
||||
|
||||
extern int _Py_normalize_encoding(const char *, char *, size_t, int);
|
||||
|
||||
/*[clinic input]
|
||||
_codecs._normalize_encoding
|
||||
encoding: unicode
|
||||
|
||||
Normalize an encoding name *encoding*.
|
||||
|
||||
Used for encodings.normalize_encoding. Does not convert to lower case.
|
||||
[clinic start generated code]*/
|
||||
|
||||
static PyObject *
_codecs__normalize_encoding_impl(PyObject *module, PyObject *encoding)
/*[clinic end generated code: output=d27465d81e361f8e input=3ff3f4d64995b988]*/
{
    /* Normalize the encoding name *encoding* (a str) and return the result
       as a new str.  The name is converted to UTF-8 and passed through
       _Py_normalize_encoding() with lowercase conversion disabled (last
       argument 0).  Returns NULL with an exception set on failure. */
    Py_ssize_t len;
    const char *cstr = PyUnicode_AsUTF8AndSize(encoding, &len);
    if (cstr == NULL) {
        return NULL;
    }

    /* The output buffer needs len + 1 bytes (room for the trailing NUL).
       A Py_ssize_t can never be greater than PY_SSIZE_T_MAX, so the check
       must be ">=" to actually protect the "len + 1" computation below. */
    if (len >= PY_SSIZE_T_MAX) {
        PyErr_SetString(PyExc_OverflowError, "encoding is too large");
        return NULL;
    }

    size_t size = (size_t)len + 1;
    char *normalized = PyMem_Malloc(size);
    if (normalized == NULL) {
        return PyErr_NoMemory();
    }

    if (!_Py_normalize_encoding(cstr, normalized, size, 0)) {
        PyMem_Free(normalized);
        /* _Py_normalize_encoding() signals failure through its return value
           (0 when the output buffer is too small); it is not documented to
           set an exception.  Returning NULL without one would be reported
           as an opaque SystemError, so raise an explicit one here unless
           something is already pending. */
        if (!PyErr_Occurred()) {
            PyErr_SetString(PyExc_SystemError,
                            "failed to normalize encoding name");
        }
        return NULL;
    }

    /* The scratch buffer is no longer needed once the str has been built
       (or has failed to build: result is then NULL with an exception set). */
    PyObject *result = PyUnicode_FromString(normalized);
    PyMem_Free(normalized);
    return result;
}
|
||||
|
||||
/* --- Module API --------------------------------------------------------- */
|
||||
|
||||
static PyMethodDef _codecs_functions[] = {
|
||||
|
|
@ -1067,6 +1108,7 @@ static PyMethodDef _codecs_functions[] = {
|
|||
_CODECS_REGISTER_ERROR_METHODDEF
|
||||
_CODECS__UNREGISTER_ERROR_METHODDEF
|
||||
_CODECS_LOOKUP_ERROR_METHODDEF
|
||||
_CODECS__NORMALIZE_ENCODING_METHODDEF
|
||||
{NULL, NULL} /* sentinel */
|
||||
};
|
||||
|
||||
|
|
|
|||
66
Modules/clinic/_codecsmodule.c.h
generated
66
Modules/clinic/_codecsmodule.c.h
generated
|
|
@ -2779,6 +2779,70 @@ exit:
|
|||
return return_value;
|
||||
}
|
||||
|
||||
PyDoc_STRVAR(_codecs__normalize_encoding__doc__,
|
||||
"_normalize_encoding($module, /, encoding)\n"
|
||||
"--\n"
|
||||
"\n"
|
||||
"Normalize an encoding name *encoding*.\n"
|
||||
"\n"
|
||||
"Used for encodings.normalize_encoding. Does not convert to lower case.");
|
||||
|
||||
#define _CODECS__NORMALIZE_ENCODING_METHODDEF \
|
||||
{"_normalize_encoding", _PyCFunction_CAST(_codecs__normalize_encoding), METH_FASTCALL|METH_KEYWORDS, _codecs__normalize_encoding__doc__},
|
||||
|
||||
static PyObject *
|
||||
_codecs__normalize_encoding_impl(PyObject *module, PyObject *encoding);
|
||||
|
||||
static PyObject *
|
||||
_codecs__normalize_encoding(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames)
|
||||
{
|
||||
PyObject *return_value = NULL;
|
||||
#if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)
|
||||
|
||||
#define NUM_KEYWORDS 1
|
||||
static struct {
|
||||
PyGC_Head _this_is_not_used;
|
||||
PyObject_VAR_HEAD
|
||||
Py_hash_t ob_hash;
|
||||
PyObject *ob_item[NUM_KEYWORDS];
|
||||
} _kwtuple = {
|
||||
.ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS)
|
||||
.ob_hash = -1,
|
||||
.ob_item = { &_Py_ID(encoding), },
|
||||
};
|
||||
#undef NUM_KEYWORDS
|
||||
#define KWTUPLE (&_kwtuple.ob_base.ob_base)
|
||||
|
||||
#else // !Py_BUILD_CORE
|
||||
# define KWTUPLE NULL
|
||||
#endif // !Py_BUILD_CORE
|
||||
|
||||
static const char * const _keywords[] = {"encoding", NULL};
|
||||
static _PyArg_Parser _parser = {
|
||||
.keywords = _keywords,
|
||||
.fname = "_normalize_encoding",
|
||||
.kwtuple = KWTUPLE,
|
||||
};
|
||||
#undef KWTUPLE
|
||||
PyObject *argsbuf[1];
|
||||
PyObject *encoding;
|
||||
|
||||
args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser,
|
||||
/*minpos*/ 1, /*maxpos*/ 1, /*minkw*/ 0, /*varpos*/ 0, argsbuf);
|
||||
if (!args) {
|
||||
goto exit;
|
||||
}
|
||||
if (!PyUnicode_Check(args[0])) {
|
||||
_PyArg_BadArgument("_normalize_encoding", "argument 'encoding'", "str", args[0]);
|
||||
goto exit;
|
||||
}
|
||||
encoding = args[0];
|
||||
return_value = _codecs__normalize_encoding_impl(module, encoding);
|
||||
|
||||
exit:
|
||||
return return_value;
|
||||
}
|
||||
|
||||
#ifndef _CODECS_MBCS_DECODE_METHODDEF
|
||||
#define _CODECS_MBCS_DECODE_METHODDEF
|
||||
#endif /* !defined(_CODECS_MBCS_DECODE_METHODDEF) */
|
||||
|
|
@ -2802,4 +2866,4 @@ exit:
|
|||
#ifndef _CODECS_CODE_PAGE_ENCODE_METHODDEF
|
||||
#define _CODECS_CODE_PAGE_ENCODE_METHODDEF
|
||||
#endif /* !defined(_CODECS_CODE_PAGE_ENCODE_METHODDEF) */
|
||||
/*[clinic end generated code: output=ed13f20dfb09e306 input=a9049054013a1b77]*/
|
||||
/*[clinic end generated code: output=a968c493bb28be3e input=a9049054013a1b77]*/
|
||||
|
|
|
|||
|
|
@ -3449,13 +3449,14 @@ PyUnicode_FromEncodedObject(PyObject *obj,
|
|||
return v;
|
||||
}
|
||||
|
||||
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
|
||||
also convert to lowercase. Return 1 on success, or 0 on error (encoding is
|
||||
longer than lower_len-1). */
|
||||
/* Normalize an encoding name like encodings.normalize_encoding()
|
||||
but also convert to lowercase if *to_lower* is true.
|
||||
Return 1 on success, or 0 on error (encoding is longer than lower_len-1). */
|
||||
int
|
||||
_Py_normalize_encoding(const char *encoding,
|
||||
char *lower,
|
||||
size_t lower_len)
|
||||
size_t lower_len,
|
||||
int to_lower)
|
||||
{
|
||||
const char *e;
|
||||
char *l;
|
||||
|
|
@ -3486,7 +3487,7 @@ _Py_normalize_encoding(const char *encoding,
|
|||
if (l == l_end) {
|
||||
return 0;
|
||||
}
|
||||
*l++ = Py_TOLOWER(c);
|
||||
*l++ = to_lower ? Py_TOLOWER(c) : c;
|
||||
}
|
||||
else {
|
||||
punct = 1;
|
||||
|
|
@ -3521,7 +3522,7 @@ PyUnicode_Decode(const char *s,
|
|||
}
|
||||
|
||||
/* Shortcuts for common default encodings */
|
||||
if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
|
||||
if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) {
|
||||
char *lower = buflower;
|
||||
|
||||
/* Fast paths */
|
||||
|
|
@ -3778,7 +3779,7 @@ PyUnicode_AsEncodedString(PyObject *unicode,
|
|||
}
|
||||
|
||||
/* Shortcuts for common default encodings */
|
||||
if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
|
||||
if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) {
|
||||
char *lower = buflower;
|
||||
|
||||
/* Fast paths */
|
||||
|
|
|
|||
|
|
@ -178,7 +178,7 @@ _Py_mbrtowc(wchar_t *pwc, const char *str, size_t len, mbstate_t *pmbs)
|
|||
|
||||
#define USE_FORCE_ASCII
|
||||
|
||||
extern int _Py_normalize_encoding(const char *, char *, size_t);
|
||||
extern int _Py_normalize_encoding(const char *, char *, size_t, int);
|
||||
|
||||
/* Workaround FreeBSD and OpenIndiana locale encoding issue with the C locale
|
||||
and POSIX locale. nl_langinfo(CODESET) announces an alias of the
|
||||
|
|
@ -229,7 +229,7 @@ check_force_ascii(void)
|
|||
}
|
||||
|
||||
char encoding[20]; /* longest name: "iso_646.irv_1991\0" */
|
||||
if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding))) {
|
||||
if (!_Py_normalize_encoding(codeset, encoding, sizeof(encoding), 1)) {
|
||||
goto error;
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue