gh-127787: refactor helpers for PyUnicodeErrorObject internal interface (GH-127789)

- Unify `get_unicode` and `get_bytes` into a single function.

- Allow retrieving the underlying `object` attribute, its
  size, and the adjusted 'start' and 'end', all at once.
  Add a new `_PyUnicodeError_GetParams` internal function for this.
  (In `exceptions.c`, it's fairly common not to need all the attributes,
  but the compiler has the opportunity to inline the function and optimize
  unneeded work away. Outside that file, we'll usually need all or
  most of them at once.)

- Use a common implementation for the following functions:

  - `PyUnicode{Decode,Encode}Error_GetEncoding`
  - `PyUnicode{Decode,Encode,Translate}Error_GetObject`
  - `PyUnicode{Decode,Encode,Translate}Error_{Get,Set}Reason`
  - `PyUnicode{Decode,Encode,Translate}Error_{Get,Set}{Start,End}`
This commit is contained in:
Bénédikt Tran 2025-01-03 13:37:02 +01:00 committed by GitHub
parent 8abd6cef68
commit fa985bee61
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 297 additions and 159 deletions

View file

@ -94,6 +94,12 @@ PyAPI_FUNC(void) _PyErr_ChainExceptions1(PyObject *);
/* In exceptions.c */
PyAPI_FUNC(int) _PyUnicodeError_GetParams(
PyObject *self,
PyObject **obj, Py_ssize_t *objlen,
Py_ssize_t *start, Py_ssize_t *end,
int as_bytes);
PyAPI_FUNC(PyObject*) PyUnstable_Exc_PrepReraiseStar(
PyObject *orig,
PyObject *excs);

View file

@ -2667,47 +2667,167 @@ SimpleExtendsException(PyExc_Exception, ValueError,
SimpleExtendsException(PyExc_ValueError, UnicodeError,
"Unicode related error.");
/*
* Check the validity of 'attr' as a unicode or bytes object depending
* on 'as_bytes' and return a new reference on it if it is the case.
*
* The 'name' is the attribute name and is only used for error reporting.
*
* On success, this returns a strong reference on 'attr'.
* On failure, this sets a TypeError and returns NULL.
*/
static PyObject *
get_bytes(PyObject *attr, const char *name)
as_unicode_error_attribute(PyObject *attr, const char *name, int as_bytes)
{
if (!attr) {
PyErr_Format(PyExc_TypeError, "%.200s attribute not set", name);
assert(as_bytes == 0 || as_bytes == 1);
if (attr == NULL) {
PyErr_Format(PyExc_TypeError, "%s attribute not set", name);
return NULL;
}
if (!PyBytes_Check(attr)) {
PyErr_Format(PyExc_TypeError, "%.200s attribute must be bytes", name);
return NULL;
}
return Py_NewRef(attr);
}
static PyObject *
get_unicode(PyObject *attr, const char *name)
{
if (!attr) {
PyErr_Format(PyExc_TypeError, "%.200s attribute not set", name);
return NULL;
}
if (!PyUnicode_Check(attr)) {
if (!(as_bytes ? PyBytes_Check(attr) : PyUnicode_Check(attr))) {
PyErr_Format(PyExc_TypeError,
"%.200s attribute must be unicode", name);
"%s attribute must be %s",
name,
as_bytes ? "bytes" : "unicode");
return NULL;
}
return Py_NewRef(attr);
}
static int
set_unicodefromstring(PyObject **attr, const char *value)
/* Type-check PTR against PyExc_UnicodeError via PyObject_TypeCheck(). */
#define PyUnicodeError_Check(PTR) \
PyObject_TypeCheck((PTR), (PyTypeObject *)PyExc_UnicodeError)
/*
 * Cast PTR to a 'PyUnicodeErrorObject *'.
 *
 * The PyUnicodeError_Check() test is wrapped in assert(), so it is only
 * performed when assertions are enabled. In that case PTR is expanded
 * twice, so avoid passing an expression with side effects.
 */
#define PyUnicodeError_CAST(PTR) \
(assert(PyUnicodeError_Check(PTR)), ((PyUnicodeErrorObject *)(PTR)))
/* class names to use when reporting errors */
#define Py_UNICODE_ENCODE_ERROR_NAME "UnicodeEncodeError"
#define Py_UNICODE_DECODE_ERROR_NAME "UnicodeDecodeError"
#define Py_UNICODE_TRANSLATE_ERROR_NAME "UnicodeTranslateError"
/*
* Check that 'self' is a UnicodeError object.
*
* On success, this returns 0.
* On failure, this sets a TypeError exception and returns -1.
*
* The 'expect_type' is the name of the expected type, which is
* only used for error reporting.
*
* As an implementation detail, the `PyUnicode*Error_*` functions
* currently allow *any* subclass of UnicodeError as 'self'.
*
* Use one of the `Py_UNICODE_*_ERROR_NAME` macros to avoid typos.
*/
static inline int
check_unicode_error_type(PyObject *self, const char *expect_type)
{
PyObject *obj = PyUnicode_FromString(value);
if (!obj)
assert(self != NULL);
if (!PyUnicodeError_Check(self)) {
PyErr_Format(PyExc_TypeError,
"expecting a %s object, got %T", expect_type, self);
return -1;
Py_XSETREF(*attr, obj);
}
return 0;
}
// --- PyUnicodeEncodeObject: internal helpers --------------------------------
//
// In the helpers below, the caller is responsible for ensuring that 'self'
// is a PyUnicodeErrorObject, although this is verified on DEBUG builds
// through PyUnicodeError_CAST().
/*
 * Fetch the 'encoding' attribute of a UnicodeError object.
 *
 * The attribute is validated as a string (as_bytes is false) by
 * as_unicode_error_attribute(), whose result is returned unchanged.
 */
static inline PyObject *
unicode_error_get_encoding_impl(PyObject *self)
{
    assert(self != NULL);
    PyObject *encoding = PyUnicodeError_CAST(self)->encoding;
    return as_unicode_error_attribute(encoding, "encoding", false);
}
/*
 * Fetch the 'object' attribute of a UnicodeError object.
 *
 * The 'as_bytes' flag is forwarded to as_unicode_error_attribute() and
 * selects whether the attribute is validated as bytes or as a string.
 * The result of that helper is returned unchanged.
 */
static inline PyObject *
unicode_error_get_object_impl(PyObject *self, int as_bytes)
{
    assert(self != NULL);
    PyObject *object = PyUnicodeError_CAST(self)->object;
    return as_unicode_error_attribute(object, "object", as_bytes);
}
/*
 * Fetch the 'reason' attribute of a UnicodeError object.
 *
 * The attribute is validated as a string (as_bytes is false) by
 * as_unicode_error_attribute(), whose result is returned unchanged.
 */
static inline PyObject *
unicode_error_get_reason_impl(PyObject *self)
{
    assert(self != NULL);
    PyObject *reason = PyUnicodeError_CAST(self)->reason;
    return as_unicode_error_attribute(reason, "reason", false);
}
/*
 * Replace the 'reason' attribute of a UnicodeError object with a new
 * string built from the C string 'reason' via PyUnicode_FromString().
 *
 * Return 0 on success. If the new string cannot be created, return -1
 * and leave the current 'reason' attribute untouched.
 */
static inline int
unicode_error_set_reason_impl(PyObject *self, const char *reason)
{
    assert(self != NULL);
    PyObject *new_reason = PyUnicode_FromString(reason);
    if (new_reason != NULL) {
        PyUnicodeErrorObject *err = PyUnicodeError_CAST(self);
        // The previous value (possibly NULL) is released by Py_XSETREF.
        Py_XSETREF(err->reason, new_reason);
        return 0;
    }
    return -1;
}
/*
 * Store 'start' as-is in the 'start' attribute of a UnicodeError object.
 *
 * No validation or clipping of the value happens here.
 * This currently cannot fail and always returns 0.
 */
static inline int
unicode_error_set_start_impl(PyObject *self, Py_ssize_t start)
{
    assert(self != NULL);
    PyUnicodeError_CAST(self)->start = start;
    return 0;
}
/*
 * Store 'end' as-is in the 'end' attribute of a UnicodeError object.
 *
 * No validation or clipping of the value happens here.
 * This currently cannot fail and always returns 0.
 */
static inline int
unicode_error_set_end_impl(PyObject *self, Py_ssize_t end)
{
    assert(self != NULL);
    PyUnicodeError_CAST(self)->end = end;
    return 0;
}
// --- PyUnicodeEncodeObject: internal getters --------------------------------
/*
* Adjust the (inclusive) 'start' value of a UnicodeError object.
*
@ -2728,6 +2848,7 @@ unicode_error_adjust_start(Py_ssize_t start, Py_ssize_t objlen)
return start;
}
/*
* Adjust the (exclusive) 'end' value of a UnicodeError object.
*
@ -2748,134 +2869,162 @@ unicode_error_adjust_end(Py_ssize_t end, Py_ssize_t objlen)
return end;
}
/* Unchecked cast of PTR to a 'PyUnicodeErrorObject *'. */
#define _PyUnicodeError_CAST(PTR) ((PyUnicodeErrorObject *)(PTR))
/* Type-check PTR against PyExc_UnicodeError via PyObject_TypeCheck(). */
#define PyUnicodeError_Check(PTR) \
PyObject_TypeCheck((PTR), (PyTypeObject *)PyExc_UnicodeError)
/*
 * Checked variant of _PyUnicodeError_CAST(): the type check is wrapped in
 * assert(), so it is only performed when assertions are enabled.
 */
#define PyUnicodeError_CAST(PTR) \
(assert(PyUnicodeError_Check(PTR)), _PyUnicodeError_CAST(PTR))
static inline int
check_unicode_error_type(PyObject *self, const char *expect_type)
/*
* Get various common parameters of a UnicodeError object.
*
* The caller is responsible to ensure that 'self' is a PyUnicodeErrorObject,
* although this condition is verified by this function on DEBUG builds.
*
* Return 0 on success and -1 on failure.
*
* Output parameters:
*
* obj A strong reference to the 'object' attribute.
* objlen The 'object' length.
* start The clipped 'start' attribute.
* end The clipped 'end' attribute.
*
* An output parameter can be NULL to indicate that
* the corresponding value does not need to be stored.
*
* Input parameter:
*
* as_bytes If 1, the error's 'object' attribute must be a bytes object,
* i.e. the call is for a `UnicodeDecodeError`. Otherwise, the
* 'object' attribute must be a string.
*
* A TypeError is raised if the 'object' type is incompatible.
*/
int
_PyUnicodeError_GetParams(PyObject *self,
PyObject **obj, Py_ssize_t *objlen,
Py_ssize_t *start, Py_ssize_t *end,
int as_bytes)
{
if (!PyUnicodeError_Check(self)) {
PyErr_Format(PyExc_TypeError,
"expecting a %s object, got %T", expect_type, self);
assert(self != NULL);
assert(as_bytes == 0 || as_bytes == 1);
PyUnicodeErrorObject *exc = PyUnicodeError_CAST(self);
PyObject *r = as_unicode_error_attribute(exc->object, "object", as_bytes);
if (r == NULL) {
return -1;
}
Py_ssize_t n = as_bytes ? PyBytes_GET_SIZE(r) : PyUnicode_GET_LENGTH(r);
if (objlen != NULL) {
*objlen = n;
}
if (start != NULL) {
*start = unicode_error_adjust_start(exc->start, n);
assert(*start >= 0);
assert(*start <= n);
}
if (end != NULL) {
*end = unicode_error_adjust_end(exc->end, n);
assert(*end >= 0);
assert(*end <= n);
}
if (obj != NULL) {
*obj = r;
}
else {
Py_DECREF(r);
}
return 0;
}
/*
 * Return 'self' as a 'PyUnicodeErrorObject *'.
 *
 * If check_unicode_error_type() reports a failure (negative result),
 * return NULL instead; 'expect_type' is forwarded to that check.
 */
static inline PyUnicodeErrorObject *
as_unicode_error(PyObject *self, const char *expect_type)
{
    if (check_unicode_error_type(self, expect_type) < 0) {
        return NULL;
    }
    return _PyUnicodeError_CAST(self);
}
// --- PyUnicodeEncodeObject: 'encoding' getters ------------------------------
// Note: PyUnicodeTranslateError does not have an 'encoding' attribute.
PyObject *
PyUnicodeEncodeError_GetEncoding(PyObject *self)
{
PyUnicodeErrorObject *exc = as_unicode_error(self, "UnicodeEncodeError");
return exc == NULL ? NULL : get_unicode(exc->encoding, "encoding");
int rc = check_unicode_error_type(self, Py_UNICODE_ENCODE_ERROR_NAME);
return rc < 0 ? NULL : unicode_error_get_encoding_impl(self);
}
PyObject *
PyUnicodeDecodeError_GetEncoding(PyObject *self)
{
PyUnicodeErrorObject *exc = as_unicode_error(self, "UnicodeDecodeError");
return exc == NULL ? NULL : get_unicode(exc->encoding, "encoding");
int rc = check_unicode_error_type(self, Py_UNICODE_DECODE_ERROR_NAME);
return rc < 0 ? NULL : unicode_error_get_encoding_impl(self);
}
// --- PyUnicodeEncodeObject: 'object' getters --------------------------------
PyObject *
PyUnicodeEncodeError_GetObject(PyObject *self)
{
PyUnicodeErrorObject *exc = as_unicode_error(self, "UnicodeEncodeError");
return exc == NULL ? NULL : get_unicode(exc->object, "object");
int rc = check_unicode_error_type(self, Py_UNICODE_ENCODE_ERROR_NAME);
return rc < 0 ? NULL : unicode_error_get_object_impl(self, false);
}
PyObject *
PyUnicodeDecodeError_GetObject(PyObject *self)
{
PyUnicodeErrorObject *exc = as_unicode_error(self, "UnicodeDecodeError");
return exc == NULL ? NULL : get_bytes(exc->object, "object");
int rc = check_unicode_error_type(self, Py_UNICODE_DECODE_ERROR_NAME);
return rc < 0 ? NULL : unicode_error_get_object_impl(self, true);
}
PyObject *
PyUnicodeTranslateError_GetObject(PyObject *self)
{
PyUnicodeErrorObject *exc = as_unicode_error(self, "UnicodeTranslateError");
return exc == NULL ? NULL : get_unicode(exc->object, "object");
int rc = check_unicode_error_type(self, Py_UNICODE_TRANSLATE_ERROR_NAME);
return rc < 0 ? NULL : unicode_error_get_object_impl(self, false);
}
// --- PyUnicodeEncodeObject: 'start' getters ---------------------------------
/*
 * Retrieve only the 'start' value through _PyUnicodeError_GetParams().
 *
 * All other output parameters are passed as NULL. The 'as_bytes' flag is
 * forwarded unchanged and the return value of _PyUnicodeError_GetParams()
 * is returned as-is.
 *
 * The caller must ensure that 'self' is not NULL (asserted here).
 */
static inline int
unicode_error_get_start_impl(PyObject *self, Py_ssize_t *start, int as_bytes)
{
    assert(self != NULL);
    int rc = _PyUnicodeError_GetParams(self,
                                       /* obj */ NULL, /* objlen */ NULL,
                                       start, /* end */ NULL,
                                       as_bytes);
    return rc;
}
int
PyUnicodeEncodeError_GetStart(PyObject *self, Py_ssize_t *start)
{
PyUnicodeErrorObject *exc = as_unicode_error(self, "UnicodeEncodeError");
if (exc == NULL) {
return -1;
}
PyObject *obj = get_unicode(exc->object, "object");
if (obj == NULL) {
return -1;
}
Py_ssize_t size = PyUnicode_GET_LENGTH(obj);
Py_DECREF(obj);
*start = unicode_error_adjust_start(exc->start, size);
return 0;
int rc = check_unicode_error_type(self, Py_UNICODE_ENCODE_ERROR_NAME);
return rc < 0 ? -1 : unicode_error_get_start_impl(self, start, false);
}
int
PyUnicodeDecodeError_GetStart(PyObject *self, Py_ssize_t *start)
{
PyUnicodeErrorObject *exc = as_unicode_error(self, "UnicodeDecodeError");
if (exc == NULL) {
return -1;
}
PyObject *obj = get_bytes(exc->object, "object");
if (obj == NULL) {
return -1;
}
Py_ssize_t size = PyBytes_GET_SIZE(obj);
Py_DECREF(obj);
*start = unicode_error_adjust_start(exc->start, size);
return 0;
int rc = check_unicode_error_type(self, Py_UNICODE_DECODE_ERROR_NAME);
return rc < 0 ? -1 : unicode_error_get_start_impl(self, start, true);
}
int
PyUnicodeTranslateError_GetStart(PyObject *self, Py_ssize_t *start)
{
PyUnicodeErrorObject *exc = as_unicode_error(self, "UnicodeTranslateError");
if (exc == NULL) {
return -1;
}
PyObject *obj = get_unicode(exc->object, "object");
if (obj == NULL) {
return -1;
}
Py_ssize_t size = PyUnicode_GET_LENGTH(obj);
Py_DECREF(obj);
*start = unicode_error_adjust_start(exc->start, size);
return 0;
int rc = check_unicode_error_type(self, Py_UNICODE_TRANSLATE_ERROR_NAME);
return rc < 0 ? -1 : unicode_error_get_start_impl(self, start, false);
}
/*
 * Store 'start' as-is in the 'start' attribute of the error object.
 *
 * The cast is unchecked, so the caller must have verified the type of
 * 'self' beforehand. Always returns 0.
 */
static inline int
unicode_error_set_start_impl(PyObject *self, Py_ssize_t start)
{
    _PyUnicodeError_CAST(self)->start = start;
    return 0;
}
// --- PyUnicodeEncodeObject: 'start' setters ---------------------------------
int
PyUnicodeEncodeError_SetStart(PyObject *self, Py_ssize_t start)
{
int rc = check_unicode_error_type(self, "UnicodeEncodeError");
int rc = check_unicode_error_type(self, Py_UNICODE_ENCODE_ERROR_NAME);
return rc < 0 ? -1 : unicode_error_set_start_impl(self, start);
}
@ -2883,7 +3032,7 @@ PyUnicodeEncodeError_SetStart(PyObject *self, Py_ssize_t start)
int
PyUnicodeDecodeError_SetStart(PyObject *self, Py_ssize_t start)
{
int rc = check_unicode_error_type(self, "UnicodeDecodeError");
int rc = check_unicode_error_type(self, Py_UNICODE_DECODE_ERROR_NAME);
return rc < 0 ? -1 : unicode_error_set_start_impl(self, start);
}
@ -2891,78 +3040,57 @@ PyUnicodeDecodeError_SetStart(PyObject *self, Py_ssize_t start)
int
PyUnicodeTranslateError_SetStart(PyObject *self, Py_ssize_t start)
{
int rc = check_unicode_error_type(self, "UnicodeTranslateError");
int rc = check_unicode_error_type(self, Py_UNICODE_TRANSLATE_ERROR_NAME);
return rc < 0 ? -1 : unicode_error_set_start_impl(self, start);
}
// --- PyUnicodeEncodeObject: 'end' getters -----------------------------------
/*
 * Retrieve only the 'end' value through _PyUnicodeError_GetParams().
 *
 * All other output parameters are passed as NULL. The 'as_bytes' flag is
 * forwarded unchanged and the return value of _PyUnicodeError_GetParams()
 * is returned as-is.
 *
 * The caller must ensure that 'self' is not NULL (asserted here).
 */
static inline int
unicode_error_get_end_impl(PyObject *self, Py_ssize_t *end, int as_bytes)
{
    assert(self != NULL);
    int rc = _PyUnicodeError_GetParams(self,
                                       /* obj */ NULL, /* objlen */ NULL,
                                       /* start */ NULL, end,
                                       as_bytes);
    return rc;
}
int
PyUnicodeEncodeError_GetEnd(PyObject *self, Py_ssize_t *end)
{
PyUnicodeErrorObject *exc = as_unicode_error(self, "UnicodeEncodeError");
if (exc == NULL) {
return -1;
}
PyObject *obj = get_unicode(exc->object, "object");
if (obj == NULL) {
return -1;
}
Py_ssize_t size = PyUnicode_GET_LENGTH(obj);
Py_DECREF(obj);
*end = unicode_error_adjust_end(exc->end, size);
return 0;
int rc = check_unicode_error_type(self, Py_UNICODE_ENCODE_ERROR_NAME);
return rc < 0 ? -1 : unicode_error_get_end_impl(self, end, false);
}
int
PyUnicodeDecodeError_GetEnd(PyObject *self, Py_ssize_t *end)
{
PyUnicodeErrorObject *exc = as_unicode_error(self, "UnicodeDecodeError");
if (exc == NULL) {
return -1;
}
PyObject *obj = get_bytes(exc->object, "object");
if (obj == NULL) {
return -1;
}
Py_ssize_t size = PyBytes_GET_SIZE(obj);
Py_DECREF(obj);
*end = unicode_error_adjust_end(exc->end, size);
return 0;
int rc = check_unicode_error_type(self, Py_UNICODE_DECODE_ERROR_NAME);
return rc < 0 ? -1 : unicode_error_get_end_impl(self, end, true);
}
int
PyUnicodeTranslateError_GetEnd(PyObject *self, Py_ssize_t *end)
{
PyUnicodeErrorObject *exc = as_unicode_error(self, "UnicodeTranslateError");
if (exc == NULL) {
return -1;
}
PyObject *obj = get_unicode(exc->object, "object");
if (obj == NULL) {
return -1;
}
Py_ssize_t size = PyUnicode_GET_LENGTH(obj);
Py_DECREF(obj);
*end = unicode_error_adjust_end(exc->end, size);
return 0;
int rc = check_unicode_error_type(self, Py_UNICODE_TRANSLATE_ERROR_NAME);
return rc < 0 ? -1 : unicode_error_get_end_impl(self, end, false);
}
/*
 * Store 'end' as-is in the 'end' attribute of the error object.
 *
 * The cast is unchecked, so the caller must have verified the type of
 * 'self' beforehand. Always returns 0.
 */
static inline int
unicode_error_set_end_impl(PyObject *self, Py_ssize_t end)
{
    _PyUnicodeError_CAST(self)->end = end;
    return 0;
}
// --- PyUnicodeEncodeObject: 'end' setters -----------------------------------
int
PyUnicodeEncodeError_SetEnd(PyObject *self, Py_ssize_t end)
{
int rc = check_unicode_error_type(self, "UnicodeEncodeError");
int rc = check_unicode_error_type(self, Py_UNICODE_ENCODE_ERROR_NAME);
return rc < 0 ? -1 : unicode_error_set_end_impl(self, end);
}
@ -2970,7 +3098,7 @@ PyUnicodeEncodeError_SetEnd(PyObject *self, Py_ssize_t end)
int
PyUnicodeDecodeError_SetEnd(PyObject *self, Py_ssize_t end)
{
int rc = check_unicode_error_type(self, "UnicodeDecodeError");
int rc = check_unicode_error_type(self, Py_UNICODE_DECODE_ERROR_NAME);
return rc < 0 ? -1 : unicode_error_set_end_impl(self, end);
}
@ -2978,56 +3106,60 @@ PyUnicodeDecodeError_SetEnd(PyObject *self, Py_ssize_t end)
int
PyUnicodeTranslateError_SetEnd(PyObject *self, Py_ssize_t end)
{
int rc = check_unicode_error_type(self, "UnicodeTranslateError");
int rc = check_unicode_error_type(self, Py_UNICODE_TRANSLATE_ERROR_NAME);
return rc < 0 ? -1 : unicode_error_set_end_impl(self, end);
}
// --- PyUnicodeEncodeObject: 'reason' getters --------------------------------
PyObject *
PyUnicodeEncodeError_GetReason(PyObject *self)
{
PyUnicodeErrorObject *exc = as_unicode_error(self, "UnicodeEncodeError");
return exc == NULL ? NULL : get_unicode(exc->reason, "reason");
int rc = check_unicode_error_type(self, Py_UNICODE_ENCODE_ERROR_NAME);
return rc < 0 ? NULL : unicode_error_get_reason_impl(self);
}
PyObject *
PyUnicodeDecodeError_GetReason(PyObject *self)
{
PyUnicodeErrorObject *exc = as_unicode_error(self, "UnicodeDecodeError");
return exc == NULL ? NULL : get_unicode(exc->reason, "reason");
int rc = check_unicode_error_type(self, Py_UNICODE_DECODE_ERROR_NAME);
return rc < 0 ? NULL : unicode_error_get_reason_impl(self);
}
PyObject *
PyUnicodeTranslateError_GetReason(PyObject *self)
{
PyUnicodeErrorObject *exc = as_unicode_error(self, "UnicodeTranslateError");
return exc == NULL ? NULL : get_unicode(exc->reason, "reason");
int rc = check_unicode_error_type(self, Py_UNICODE_TRANSLATE_ERROR_NAME);
return rc < 0 ? NULL : unicode_error_get_reason_impl(self);
}
// --- PyUnicodeEncodeObject: 'reason' setters --------------------------------
int
PyUnicodeEncodeError_SetReason(PyObject *self, const char *reason)
{
PyUnicodeErrorObject *exc = as_unicode_error(self, "UnicodeEncodeError");
return exc == NULL ? -1 : set_unicodefromstring(&exc->reason, reason);
int rc = check_unicode_error_type(self, Py_UNICODE_ENCODE_ERROR_NAME);
return rc < 0 ? -1 : unicode_error_set_reason_impl(self, reason);
}
int
PyUnicodeDecodeError_SetReason(PyObject *self, const char *reason)
{
PyUnicodeErrorObject *exc = as_unicode_error(self, "UnicodeDecodeError");
return exc == NULL ? -1 : set_unicodefromstring(&exc->reason, reason);
int rc = check_unicode_error_type(self, Py_UNICODE_DECODE_ERROR_NAME);
return rc < 0 ? -1 : unicode_error_set_reason_impl(self, reason);
}
int
PyUnicodeTranslateError_SetReason(PyObject *self, const char *reason)
{
PyUnicodeErrorObject *exc = as_unicode_error(self, "UnicodeTranslateError");
return exc == NULL ? -1 : set_unicodefromstring(&exc->reason, reason);
int rc = check_unicode_error_type(self, Py_UNICODE_TRANSLATE_ERROR_NAME);
return rc < 0 ? -1 : unicode_error_set_reason_impl(self, reason);
}