mirror of
https://github.com/python/cpython.git
synced 2025-10-19 07:53:46 +00:00
Compare commits
2 commits
d86ad870cc
...
bedaea0598
Author | SHA1 | Date | |
---|---|---|---|
![]() |
bedaea0598 | ||
![]() |
920de7ccdc |
3 changed files with 36 additions and 22 deletions
|
@ -989,17 +989,22 @@ defined in Unicode. A simple and straightforward way that can store each Unicode
|
||||||
code point, is to store each code point as four consecutive bytes. There are two
|
code point, is to store each code point as four consecutive bytes. There are two
|
||||||
possibilities: store the bytes in big endian or in little endian order. These
|
possibilities: store the bytes in big endian or in little endian order. These
|
||||||
two encodings are called ``UTF-32-BE`` and ``UTF-32-LE`` respectively. Their
|
two encodings are called ``UTF-32-BE`` and ``UTF-32-LE`` respectively. Their
|
||||||
disadvantage is that if e.g. you use ``UTF-32-BE`` on a little endian machine you
|
disadvantage is that if, for example, you use ``UTF-32-BE`` on a little endian
|
||||||
will always have to swap bytes on encoding and decoding. ``UTF-32`` avoids this
|
machine you will always have to swap bytes on encoding and decoding.
|
||||||
problem: bytes will always be in natural endianness. When these bytes are read
|
Python's ``UTF-16`` and ``UTF-32`` codecs avoid this problem by using the
|
||||||
by a CPU with a different endianness, then bytes have to be swapped though. To
|
platform's native byte order when no BOM is present.
|
||||||
be able to detect the endianness of a ``UTF-16`` or ``UTF-32`` byte sequence,
|
Python follows prevailing platform
|
||||||
there's the so called BOM ("Byte Order Mark"). This is the Unicode character
|
practice, so native-endian data round-trips without redundant byte swapping,
|
||||||
``U+FEFF``. This character can be prepended to every ``UTF-16`` or ``UTF-32``
|
even though the Unicode Standard defaults to big-endian when the byte order is
|
||||||
byte sequence. The byte swapped version of this character (``0xFFFE``) is an
|
unspecified. When these bytes are read by a CPU with a different endianness,
|
||||||
illegal character that may not appear in a Unicode text. So when the
|
the bytes have to be swapped. To be able to detect the endianness of a
|
||||||
first character in a ``UTF-16`` or ``UTF-32`` byte sequence
|
``UTF-16`` or ``UTF-32`` byte sequence, a BOM ("Byte Order Mark") is used.
|
||||||
appears to be a ``U+FFFE`` the bytes have to be swapped on decoding.
|
This is the Unicode character ``U+FEFF``. This character can be prepended to every
|
||||||
|
``UTF-16`` or ``UTF-32`` byte sequence. The byte swapped version of this character
|
||||||
|
(``0xFFFE``) is an illegal character that may not appear in a Unicode text.
|
||||||
|
When the first character of a ``UTF-16`` or ``UTF-32`` byte sequence is
|
||||||
|
``U+FFFE``, the bytes have to be swapped on decoding.
|
||||||
|
|
||||||
Unfortunately the character ``U+FEFF`` had a second purpose as
|
Unfortunately the character ``U+FEFF`` had a second purpose as
|
||||||
a ``ZERO WIDTH NO-BREAK SPACE``: a character that has no width and doesn't allow
|
a ``ZERO WIDTH NO-BREAK SPACE``: a character that has no width and doesn't allow
|
||||||
a word to be split. It can e.g. be used to give hints to a ligature algorithm.
|
a word to be split. It can e.g. be used to give hints to a ligature algorithm.
|
||||||
|
|
|
@ -0,0 +1 @@
|
||||||
|
Fix undefined behavior when using unaligned store in JIT's ``patch_*`` functions.
|
30
Python/jit.c
30
Python/jit.c
|
@ -157,12 +157,18 @@ set_bits(uint32_t *loc, uint8_t loc_start, uint64_t value, uint8_t value_start,
|
||||||
uint8_t width)
|
uint8_t width)
|
||||||
{
|
{
|
||||||
assert(loc_start + width <= 32);
|
assert(loc_start + width <= 32);
|
||||||
|
uint32_t temp_val;
|
||||||
|
// Use memcpy to safely read the value, avoiding potential alignment
|
||||||
|
// issues and strict aliasing violations.
|
||||||
|
memcpy(&temp_val, loc, sizeof(temp_val));
|
||||||
// Clear the bits we're about to patch:
|
// Clear the bits we're about to patch:
|
||||||
*loc &= ~(((1ULL << width) - 1) << loc_start);
|
temp_val &= ~(((1ULL << width) - 1) << loc_start);
|
||||||
assert(get_bits(*loc, loc_start, width) == 0);
|
assert(get_bits(temp_val, loc_start, width) == 0);
|
||||||
// Patch the bits:
|
// Patch the bits:
|
||||||
*loc |= get_bits(value, value_start, width) << loc_start;
|
temp_val |= get_bits(value, value_start, width) << loc_start;
|
||||||
assert(get_bits(*loc, loc_start, width) == get_bits(value, value_start, width));
|
assert(get_bits(temp_val, loc_start, width) == get_bits(value, value_start, width));
|
||||||
|
// Safely write the modified value back to memory.
|
||||||
|
memcpy(loc, &temp_val, sizeof(temp_val));
|
||||||
}
|
}
|
||||||
|
|
||||||
// See https://developer.arm.com/documentation/ddi0602/2023-09/Base-Instructions
|
// See https://developer.arm.com/documentation/ddi0602/2023-09/Base-Instructions
|
||||||
|
@ -204,30 +210,29 @@ set_bits(uint32_t *loc, uint8_t loc_start, uint64_t value, uint8_t value_start,
|
||||||
void
|
void
|
||||||
patch_32(unsigned char *location, uint64_t value)
|
patch_32(unsigned char *location, uint64_t value)
|
||||||
{
|
{
|
||||||
uint32_t *loc32 = (uint32_t *)location;
|
|
||||||
// Check that we're not out of range of 32 unsigned bits:
|
// Check that we're not out of range of 32 unsigned bits:
|
||||||
assert(value < (1ULL << 32));
|
assert(value < (1ULL << 32));
|
||||||
*loc32 = (uint32_t)value;
|
uint32_t final_value = (uint32_t)value;
|
||||||
|
memcpy(location, &final_value, sizeof(final_value));
|
||||||
}
|
}
|
||||||
|
|
||||||
// 32-bit relative address.
|
// 32-bit relative address.
|
||||||
void
|
void
|
||||||
patch_32r(unsigned char *location, uint64_t value)
|
patch_32r(unsigned char *location, uint64_t value)
|
||||||
{
|
{
|
||||||
uint32_t *loc32 = (uint32_t *)location;
|
|
||||||
value -= (uintptr_t)location;
|
value -= (uintptr_t)location;
|
||||||
// Check that we're not out of range of 32 signed bits:
|
// Check that we're not out of range of 32 signed bits:
|
||||||
assert((int64_t)value >= -(1LL << 31));
|
assert((int64_t)value >= -(1LL << 31));
|
||||||
assert((int64_t)value < (1LL << 31));
|
assert((int64_t)value < (1LL << 31));
|
||||||
*loc32 = (uint32_t)value;
|
uint32_t final_value = (uint32_t)value;
|
||||||
|
memcpy(location, &final_value, sizeof(final_value));
|
||||||
}
|
}
|
||||||
|
|
||||||
// 64-bit absolute address.
|
// 64-bit absolute address.
|
||||||
void
|
void
|
||||||
patch_64(unsigned char *location, uint64_t value)
|
patch_64(unsigned char *location, uint64_t value)
|
||||||
{
|
{
|
||||||
uint64_t *loc64 = (uint64_t *)location;
|
memcpy(location, &value, sizeof(value));
|
||||||
*loc64 = value;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// 12-bit low part of an absolute address. Pairs nicely with patch_aarch64_21r
|
// 12-bit low part of an absolute address. Pairs nicely with patch_aarch64_21r
|
||||||
|
@ -410,7 +415,10 @@ patch_x86_64_32rx(unsigned char *location, uint64_t value)
|
||||||
{
|
{
|
||||||
uint8_t *loc8 = (uint8_t *)location;
|
uint8_t *loc8 = (uint8_t *)location;
|
||||||
// Try to relax the GOT load into an immediate value:
|
// Try to relax the GOT load into an immediate value:
|
||||||
uint64_t relaxed = *(uint64_t *)(value + 4) - 4;
|
uint64_t relaxed;
|
||||||
|
memcpy(&relaxed, (void *)(value + 4), sizeof(relaxed));
|
||||||
|
relaxed -= 4;
|
||||||
|
|
||||||
if ((int64_t)relaxed - (int64_t)location >= -(1LL << 31) &&
|
if ((int64_t)relaxed - (int64_t)location >= -(1LL << 31) &&
|
||||||
(int64_t)relaxed - (int64_t)location + 1 < (1LL << 31))
|
(int64_t)relaxed - (int64_t)location + 1 < (1LL << 31))
|
||||||
{
|
{
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue