closes bpo-31650: PEP 552 (Deterministic pycs) implementation (#4575)

Python can now check whether a bytecode cache file is up to date by comparing a
hash of the source contents rather than volatile source metadata. See the PEP for details.

While a fairly straightforward idea, quite a lot of code had to be modified due
to the pervasiveness of pyc implementation details in the codebase. Changes in
this commit include:

- The core changes to importlib to understand how to read, validate, and
  regenerate hash-based pycs.

- Support for generating hash-based pycs in py_compile and compileall.

- Modifications to our siphash implementation to support passing a custom
  key. We then expose it to importlib through _imp.

- Updates to all places in the interpreter, standard library, and tests that
  manually generate or parse pyc files to grok the new format.

- Support in the interpreter command line code for long options like
  --check-hash-based-pycs.

- Tests and documentation for all of the above.
This commit is contained in:
Benjamin Peterson 2017-12-09 10:26:52 -08:00 committed by GitHub
parent 28d8d14013
commit 42aa93b8ff
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
33 changed files with 3364 additions and 2505 deletions

View file

@ -5,6 +5,8 @@
#include "Python-ast.h"
#undef Yield /* undefine macro conflicting with winbase.h */
#include "internal/hash.h"
#include "internal/import.h"
#include "internal/pystate.h"
#include "errcode.h"
#include "marshal.h"
@ -2184,6 +2186,34 @@ _imp_exec_builtin_impl(PyObject *module, PyObject *mod)
return exec_builtin_or_dynamic(mod);
}
/*[clinic input]
_imp.source_hash
key: long
source: Py_buffer
[clinic start generated code]*/
static PyObject *
_imp_source_hash_impl(PyObject *module, long key, Py_buffer *source)
/*[clinic end generated code: output=edb292448cf399ea input=9aaad1e590089789]*/
{
    /* Hash the contents of `source` with _Py_KeyedHash, keyed by `key`, and
       return the 64-bit result as an 8-byte bytes object whose bytes are in
       little-endian order regardless of the host's endianness.  The return
       value is whatever PyBytes_FromStringAndSize produces for that buffer. */
    uint64_t hash = _Py_KeyedHash((uint64_t)key, source->buf, source->len);

    /* Serialize with shifts instead of reinterpreting the integer's memory:
       byte i of the output is bits [8*i, 8*i + 8) of the hash, which is the
       little-endian layout on every platform.  An in-place reversal that
       copies data[i] into data[7 - i] for all i without a temporary would
       clobber the upper half before reading it and yield a palindrome. */
    unsigned char digest[sizeof(hash)];
    for (size_t i = 0; i < sizeof(digest); i++) {
        digest[i] = (unsigned char)(hash >> (8 * i));
    }
    return PyBytes_FromStringAndSize((const char *)digest, sizeof(digest));
}
/* Docstring text bound to the name doc_imp.
   NOTE(review): presumably used as the module docstring in the impmodule
   PyModuleDef -- confirm against that definition. */
PyDoc_STRVAR(doc_imp,
"(Extremely) low-level import machinery bits as used by importlib and imp.");
@ -2203,6 +2233,7 @@ static PyMethodDef imp_methods[] = {
_IMP_EXEC_DYNAMIC_METHODDEF
_IMP_EXEC_BUILTIN_METHODDEF
_IMP__FIX_CO_FILENAME_METHODDEF
_IMP_SOURCE_HASH_METHODDEF
{NULL, NULL} /* sentinel */
};
@ -2219,6 +2250,8 @@ static struct PyModuleDef impmodule = {
NULL
};
/* Mode string controlling how hash-based pycs are checked.  PyInit_imp
   copies the current value into the module dict under the key
   "check_hash_based_pycs".  The initial value is "default".
   NOTE(review): presumably overwritten by the command-line handling of
   --check-hash-based-pycs before the module is initialized -- confirm. */
const char *_Py_CheckHashBasedPycsMode = "default";
PyMODINIT_FUNC
PyInit_imp(void)
{
@ -2230,6 +2263,15 @@ PyInit_imp(void)
d = PyModule_GetDict(m);
if (d == NULL)
goto failure;
PyObject *pyc_mode = PyUnicode_FromString(_Py_CheckHashBasedPycsMode);
if (pyc_mode == NULL) {
goto failure;
}
if (PyDict_SetItemString(d, "check_hash_based_pycs", pyc_mode) < 0) {
Py_DECREF(pyc_mode);
goto failure;
}
Py_DECREF(pyc_mode);
return m;
failure: