From a966d94e76d91ef60f9912a98a3869f38ecd438b Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith" <68491+gpshead@users.noreply.github.com> Date: Thu, 22 Jan 2026 09:21:07 -0800 Subject: [PATCH] gh-144157: Optimize bytes.translate() by deferring change detection (GH-144158) Optimize bytes.translate() by deferring change detection Move the equality check out of the hot loop to allow better compiler optimization. Instead of checking each byte during translation, perform a single memcmp at the end to determine if the input can be returned unchanged. This allows compilers to unroll and pipeline the loops, resulting in a ~2x throughput improvement for medium-to-large inputs (tested on an AMD Zen 2). No change was observed on small inputs. It will also be faster for bytes subclasses as those do not need change detection. --- .../2026-01-22-16-20-16.gh-issue-144157.dxyp7k.rst | 2 ++ Objects/bytesobject.c | 12 ++++++++---- 2 files changed, 10 insertions(+), 4 deletions(-) create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2026-01-22-16-20-16.gh-issue-144157.dxyp7k.rst diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2026-01-22-16-20-16.gh-issue-144157.dxyp7k.rst b/Misc/NEWS.d/next/Core_and_Builtins/2026-01-22-16-20-16.gh-issue-144157.dxyp7k.rst new file mode 100644 index 00000000000..ff62d739d78 --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2026-01-22-16-20-16.gh-issue-144157.dxyp7k.rst @@ -0,0 +1,2 @@ +:meth:`bytes.translate` now allows the compiler to unroll its loop more +effectively, giving a 2x speedup in the common case with no deletions. 
diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c index 2b0925017f2..56de99bde11 100644 --- a/Objects/bytesobject.c +++ b/Objects/bytesobject.c @@ -2237,11 +2237,15 @@ bytes_translate_impl(PyBytesObject *self, PyObject *table, /* If no deletions are required, use faster code */ for (i = inlen; --i >= 0; ) { c = Py_CHARMASK(*input++); - if (Py_CHARMASK((*output++ = table_chars[c])) != c) - changed = 1; + *output++ = table_chars[c]; } - if (!changed && PyBytes_CheckExact(input_obj)) { - Py_SETREF(result, Py_NewRef(input_obj)); + /* Check if anything changed (for returning original object) */ + /* We save this check until the end so that the compiler will */ + /* unroll the loop above leading to MUCH faster code. */ + if (PyBytes_CheckExact(input_obj)) { + if (memcmp(PyBytes_AS_STRING(input_obj), output_start, inlen) == 0) { + Py_SETREF(result, Py_NewRef(input_obj)); + } } PyBuffer_Release(&del_table_view); PyBuffer_Release(&table_view);