gh-144015: Add portable SIMD optimization for bytes.hex() et al. (GH-143991)

Add SIMD optimization for `bytes.hex()`, `bytearray.hex()`, and `binascii.hexlify()` as well as `hashlib` `.hexdigest()` methods using platform-agnostic GCC/Clang vector extensions that compile to native SIMD instructions on our [PEP-11 Tier 1 Linux and macOS](https://peps.python.org/pep-0011/#tier-1) platforms.

- 1.1-3x faster for common small data (16-64 bytes, covering md5 through sha512 digest sizes)
- Up to 11x faster for large data (1KB+)
- Retains the existing scalar code for short inputs (<16 bytes) or platforms lacking SIMD instructions, no observable performance regressions there.

## Supported platforms:

- x86-64: the compiler generates SSE2 - always available, no flags or CPU feature checks needed
- ARM64: NEON is always available, no flags or CPU feature checks needed
- ARM32: Requires NEON support and that appropriate compiler flags enable that (e.g., `-march=native` on a Raspberry Pi 3+) - while we _could_ use runtime detection to allow neon when compiled without a recent enough `-march=` flag (`cortex-a53` and later IIRC), there are diminishing returns in doing so. Anyone using 32-bit ARM in a situation where performance matters will already be compiling with such flags. (as opposed to 32-bit Raspbian compilation that defaults to aiming primarily for compatibility with rpi1&0 armv6 arch=armhf which lacks neon)
- Windows/MSVC: Not supported. MSVC lacks `__builtin_shufflevector`, so the existing scalar path is used. Leaving it as an opportunity for the future for someone to figure out how to express the intent to that compiler.

This is compile time detection of features that are always available on the target architectures. No need for runtime feature inspection.
This commit is contained in:
Gregory P. Smith 2026-02-22 19:19:03 -08:00 committed by GitHub
parent 79f6caf8f1
commit ad4ee7cb0f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 256 additions and 6 deletions

View file

@ -4,6 +4,113 @@
#include "pycore_strhex.h" // _Py_strhex_with_sep()
#include "pycore_unicodeobject.h" // _PyUnicode_CheckConsistency()
/* Scalar hexlify: convert len bytes to 2*len hex characters.
Uses table lookup via Py_hexdigits for the conversion. */
/* Scalar hexlify: emit two hex digits per input byte, writing 2*len
   characters to dst. Conversion uses the Py_hexdigits lookup table. */
static inline void
_Py_hexlify_scalar(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
{
    /* Alternatives (arithmetic conversion instead of a table, manual loop
       unrolling, caching the table pointer in a local, wider dst stores)
       were benchmarked and performed essentially the same on gcc 15; a
       256-entry uint16_t table was a bit slower. So the simple and obvious
       table-lookup loop is kept. */
    const unsigned char *end = src + len;
    while (src < end) {
        unsigned char byte = *src++;
        dst[0] = Py_hexdigits[byte >> 4];
        dst[1] = Py_hexdigits[byte & 0x0f];
        dst += 2;
    }
}
/* Portable SIMD optimization for hexlify using GCC/Clang vector extensions.
Uses __builtin_shufflevector for portable interleave that compiles to
native SIMD instructions (SSE2 punpcklbw/punpckhbw on x86-64 [always],
NEON zip1/zip2 on ARM64 [always], & vzip on ARM32 when compiler flags
for the target microarch allow it [try -march=native if running 32-bit
on an RPi3 or later]).
Performance:
- For more common small data it varies between 1.1-3x faster.
- Up to 11x faster on larger data than the scalar code.
While faster is possible for big data using AVX2 or AVX512, that
adds a ton of complication. Who ever really hexes huge data?
The 16-64 byte boosts align nicely with md5 - sha512 hexdigests.
*/
#ifdef HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR
/* 128-bit vector of 16 unsigned bytes */
/* 128-bit vector of 16 unsigned bytes */
typedef unsigned char v16u8 __attribute__((vector_size(16)));
/* 128-bit vector of 16 signed bytes - used where a signed comparison
   gives better codegen: pcmpgtb on x86-64 rather than the slower
   psubusb+pcmpeqb pair that an unsigned comparison produces.
   ARM NEON performs the same either way. */
typedef signed char v16s8 __attribute__((vector_size(16)));
/* Broadcast a single byte value into all 16 lanes of a vector. */
static inline v16u8
v16u8_splat(unsigned char x)
{
    v16u8 v;
    memset(&v, x, sizeof(v));
    return v;
}
static inline v16s8
v16s8_splat(signed char x)
{
    v16s8 v;
    /* memset stores (unsigned char)x in every byte, which preserves the
       two's-complement bit pattern of the signed value in each lane. */
    memset(&v, (unsigned char)x, sizeof(v));
    return v;
}
/* Portable SIMD hexlify: converts 16 input bytes into 32 hex characters
   per loop iteration, then hands any 0-15 byte tail to the scalar path.
   Compiles to native SSE2 on x86-64, NEON on ARM64 (and on ARM32 builds
   whose compiler flags enable NEON). */
static void
_Py_hexlify_simd(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
{
    const v16u8 nibble_mask = v16u8_splat(0x0f);
    const v16u8 digit_base = v16u8_splat('0');
    const v16u8 alpha_adjust = v16u8_splat('a' - '0' - 10); /* 0x27 */
    const v16s8 max_digit = v16s8_splat(9);
    Py_ssize_t pos = 0;

    /* Consume the input in 16-byte chunks. */
    while (pos + 16 <= len) {
        /* Load via memcpy: safe for unaligned source data. */
        v16u8 chunk;
        memcpy(&chunk, src + pos, 16);
        /* Split each byte into its high and low nibble. */
        v16u8 hi = (chunk >> 4) & nibble_mask;
        v16u8 lo = chunk & nibble_mask;
        /* Map nibbles 0-9 to '0'-'9' and 10-15 to 'a'-'f'. The ">9" test
           uses a signed compare (nibbles fit safely in a signed byte) so
           x86-64 gets a single pcmpgtb instead of psubusb+pcmpeqb; the
           resulting all-ones lanes select the 'a'-'f' adjustment. */
        hi += digit_base + ((v16u8)((v16s8)hi > max_digit) & alpha_adjust);
        lo += digit_base + ((v16u8)((v16s8)lo > max_digit) & alpha_adjust);
        /* Interleave high/low digit lanes with the portable shuffle, which
           lowers to punpcklbw/punpckhbw on x86-64, zip1/zip2 on ARM64, and
           vzip on ARM32. */
        v16u8 first_half = __builtin_shufflevector(hi, lo,
            0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
        v16u8 second_half = __builtin_shufflevector(hi, lo,
            8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
        /* Emit all 32 hex characters for this chunk. */
        memcpy(dst, &first_half, 16);
        memcpy(dst + 16, &second_half, 16);
        dst += 32;
        pos += 16;
    }
    /* Scalar fallback handles the remaining 0-15 bytes. */
    _Py_hexlify_scalar(src + pos, dst, len - pos);
}
#endif /* HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR */
static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen,
PyObject* sep, int bytes_per_sep_group,
const int return_bytes)
@ -82,13 +189,15 @@ static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen,
unsigned char c;
if (bytes_per_sep_group == 0) {
for (i = j = 0; i < arglen; ++i) {
assert((j + 1) < resultlen);
c = argbuf[i];
retbuf[j++] = Py_hexdigits[c >> 4];
retbuf[j++] = Py_hexdigits[c & 0x0f];
#ifdef HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR
if (arglen >= 16) {
_Py_hexlify_simd((const unsigned char *)argbuf, retbuf, arglen);
}
else
#endif
{
_Py_hexlify_scalar((const unsigned char *)argbuf, retbuf, arglen);
}
assert(j == resultlen);
}
else {
/* The number of complete chunk+sep periods */