	gh-115103: Implement delayed free mechanism for free-threaded builds (#115367)
This adds `_PyMem_FreeDelayed()` and supporting functions. The `_PyMem_FreeDelayed()` function frees memory with the same allocator as `PyMem_Free()`, but after some delay to ensure that concurrent lock-free readers have finished.
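
To make the intended usage concrete, here is a brief illustrative sketch (not part of this commit): a writer replaces a buffer that lock-free readers in other threads may still be traversing, so it publishes the new buffer and hands the old one to _PyMem_FreeDelayed() instead of calling PyMem_Free() directly. The my_container type and resize_items() helper are hypothetical names invented for this example; PyMem_Calloc(), _Py_atomic_store_ptr_release(), and _PyMem_FreeDelayed() are the real CPython APIs being exercised.

/* Hypothetical example, assuming a CPython core source file compiled with
   Py_BUILD_CORE so that the internal header below is available. */
#include "Python.h"
#include "pycore_pymem.h"       // _PyMem_FreeDelayed()

#include <string.h>             // memcpy()

typedef struct {
    PyObject **items;           // read by other threads without holding a lock
    Py_ssize_t allocated;
} my_container;                 // hypothetical container type

static int
resize_items(my_container *c, Py_ssize_t new_allocated)
{
    // (assumes new_allocated >= c->allocated)
    PyObject **new_items = PyMem_Calloc(new_allocated, sizeof(PyObject *));
    if (new_items == NULL) {
        return -1;
    }
    memcpy(new_items, c->items, c->allocated * sizeof(PyObject *));

    PyObject **old_items = c->items;
    // Publish the new buffer; concurrent readers may still hold old_items.
    _Py_atomic_store_ptr_release(&c->items, new_items);
    c->allocated = new_allocated;

    // Don't call PyMem_Free(old_items) here: a lock-free reader may still be
    // using it.  _PyMem_FreeDelayed() frees it with the same allocator once
    // every attached thread has passed a QSBR quiescent state (in builds
    // without Py_GIL_DISABLED it simply frees immediately).
    _PyMem_FreeDelayed(old_items);
    return 0;
}
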
parent d207c7cd5a
commit e3ad6ca56f

8 changed files with 226 additions and 0 deletions

Include/internal/pycore_interp.h
@@ -231,6 +231,7 @@ struct _is {
 
     struct _Py_dict_state dict_state;
     struct _Py_exc_state exc_state;
+    struct _Py_mem_interp_free_queue mem_free_queue;
 
     struct ast_state ast;
     struct types_state types;

|  | @ -1,6 +1,7 @@ | ||||||
| #ifndef Py_INTERNAL_PYMEM_H | #ifndef Py_INTERNAL_PYMEM_H | ||||||
| #define Py_INTERNAL_PYMEM_H | #define Py_INTERNAL_PYMEM_H | ||||||
| 
 | 
 | ||||||
|  | #include "pycore_llist.h"           // struct llist_node | ||||||
| #include "pycore_lock.h"            // PyMutex | #include "pycore_lock.h"            // PyMutex | ||||||
| 
 | 
 | ||||||
| #ifdef __cplusplus | #ifdef __cplusplus | ||||||
|  | @ -48,6 +49,11 @@ struct _pymem_allocators { | ||||||
|     PyObjectArenaAllocator obj_arena; |     PyObjectArenaAllocator obj_arena; | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
|  | struct _Py_mem_interp_free_queue { | ||||||
|  |     int has_work;   // true if the queue is not empty
 | ||||||
|  |     PyMutex mutex;  // protects the queue
 | ||||||
|  |     struct llist_node head;  // queue of _mem_work_chunk items
 | ||||||
|  | }; | ||||||
| 
 | 
 | ||||||
| /* Set the memory allocator of the specified domain to the default.
 | /* Set the memory allocator of the specified domain to the default.
 | ||||||
|    Save the old allocator into *old_alloc if it's non-NULL. |    Save the old allocator into *old_alloc if it's non-NULL. | ||||||
|  | @ -110,6 +116,19 @@ extern int _PyMem_SetupAllocators(PyMemAllocatorName allocator); | ||||||
| /* Is the debug allocator enabled? */ | /* Is the debug allocator enabled? */ | ||||||
| extern int _PyMem_DebugEnabled(void); | extern int _PyMem_DebugEnabled(void); | ||||||
| 
 | 
 | ||||||
|  | // Enqueue a pointer to be freed possibly after some delay.
 | ||||||
|  | extern void _PyMem_FreeDelayed(void *ptr); | ||||||
|  | 
 | ||||||
|  | // Periodically process delayed free requests.
 | ||||||
|  | extern void _PyMem_ProcessDelayed(PyThreadState *tstate); | ||||||
|  | 
 | ||||||
|  | // Abandon all thread-local delayed free requests and push them to the
 | ||||||
|  | // interpreter's queue.
 | ||||||
|  | extern void _PyMem_AbandonDelayed(PyThreadState *tstate); | ||||||
|  | 
 | ||||||
|  | // On interpreter shutdown, frees all delayed free requests.
 | ||||||
|  | extern void _PyMem_FiniDelayed(PyInterpreterState *interp); | ||||||
|  | 
 | ||||||
| #ifdef __cplusplus | #ifdef __cplusplus | ||||||
| } | } | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
Include/internal/pycore_pymem_init.h
@@ -92,6 +92,11 @@ extern void _PyMem_ArenaFree(void *, void *, size_t);
     { NULL, _PyMem_ArenaAlloc, _PyMem_ArenaFree }
 
 
+#define _Py_mem_free_queue_INIT(queue) \
+    { \
+        .head = LLIST_INIT(queue.head), \
+    }
+
 #ifdef __cplusplus
 }
 #endif

Include/internal/pycore_runtime_init.h
@@ -176,6 +176,7 @@ extern PyTypeObject _PyExc_MemoryError;
         }, \
         .dtoa = _dtoa_state_INIT(&(INTERP)), \
         .dict_state = _dict_state_INIT, \
+        .mem_free_queue = _Py_mem_free_queue_INIT(INTERP.mem_free_queue), \
         .func_state = { \
             .next_version = 1, \
         }, \

Include/internal/pycore_tstate.h
@@ -29,6 +29,7 @@ typedef struct _PyThreadStateImpl {
     PyThreadState base;
 
     struct _qsbr_thread_state *qsbr;  // only used by free-threaded build
+    struct llist_node mem_free_queue; // delayed free queue
 
 #ifdef Py_GIL_DISABLED
     struct _gc_thread_state gc;

Objects/obmalloc.c
@@ -948,6 +948,196 @@ _PyMem_Strdup(const char *str)
     return copy;
 }
 
+/***********************************************/
+/* Delayed freeing support for Py_GIL_DISABLED */
+/***********************************************/
+
+// So that sizeof(struct _mem_work_chunk) is 4096 bytes on 64-bit platforms.
+#define WORK_ITEMS_PER_CHUNK 254
+
+// A pointer to be freed once the QSBR read sequence reaches qsbr_goal.
+struct _mem_work_item {
+    void *ptr;
+    uint64_t qsbr_goal;
+};
+
+// A fixed-size buffer of pointers to be freed
+struct _mem_work_chunk {
+    // Linked list node of chunks in queue
+    struct llist_node node;
+
+    Py_ssize_t rd_idx;  // index of next item to read
+    Py_ssize_t wr_idx;  // index of next item to write
+    struct _mem_work_item array[WORK_ITEMS_PER_CHUNK];
+};
+
+void
+_PyMem_FreeDelayed(void *ptr)
+{
+#ifndef Py_GIL_DISABLED
+    PyMem_Free(ptr);
+#else
+    if (_PyRuntime.stoptheworld.world_stopped) {
+        // Free immediately if the world is stopped, including during
+        // interpreter shutdown.
+        PyMem_Free(ptr);
+        return;
+    }
+
+    _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)_PyThreadState_GET();
+    struct llist_node *head = &tstate->mem_free_queue;
+
+    struct _mem_work_chunk *buf = NULL;
+    if (!llist_empty(head)) {
+        // Try to re-use the last buffer
+        buf = llist_data(head->prev, struct _mem_work_chunk, node);
+        if (buf->wr_idx == WORK_ITEMS_PER_CHUNK) {
+            // already full
+            buf = NULL;
+        }
+    }
+
+    if (buf == NULL) {
+        buf = PyMem_Calloc(1, sizeof(*buf));
+        if (buf != NULL) {
+            llist_insert_tail(head, &buf->node);
+        }
+    }
+
+    if (buf == NULL) {
+        // failed to allocate a buffer, free immediately
+        _PyEval_StopTheWorld(tstate->base.interp);
+        PyMem_Free(ptr);
+        _PyEval_StartTheWorld(tstate->base.interp);
+        return;
+    }
+
+    assert(buf != NULL && buf->wr_idx < WORK_ITEMS_PER_CHUNK);
+    uint64_t seq = _Py_qsbr_deferred_advance(tstate->qsbr);
+    buf->array[buf->wr_idx].ptr = ptr;
+    buf->array[buf->wr_idx].qsbr_goal = seq;
+    buf->wr_idx++;
+
+    if (buf->wr_idx == WORK_ITEMS_PER_CHUNK) {
+        _PyMem_ProcessDelayed((PyThreadState *)tstate);
+    }
+#endif
+}
+
+static struct _mem_work_chunk *
+work_queue_first(struct llist_node *head)
+{
+    return llist_data(head->next, struct _mem_work_chunk, node);
+}
+
+static void
+process_queue(struct llist_node *head, struct _qsbr_thread_state *qsbr,
+              bool keep_empty)
+{
+    while (!llist_empty(head)) {
+        struct _mem_work_chunk *buf = work_queue_first(head);
+
+        while (buf->rd_idx < buf->wr_idx) {
+            struct _mem_work_item *item = &buf->array[buf->rd_idx];
+            if (!_Py_qsbr_poll(qsbr, item->qsbr_goal)) {
+                return;
+            }
+
+            PyMem_Free(item->ptr);
+            buf->rd_idx++;
+        }
+
+        assert(buf->rd_idx == buf->wr_idx);
+        if (keep_empty && buf->node.next == head) {
+            // Keep the last buffer in the queue to reduce re-allocations
+            buf->rd_idx = buf->wr_idx = 0;
+            return;
+        }
+
+        llist_remove(&buf->node);
+        PyMem_Free(buf);
+    }
+}
+
+static void
+process_interp_queue(struct _Py_mem_interp_free_queue *queue,
+                     struct _qsbr_thread_state *qsbr)
+{
+    if (!_Py_atomic_load_int_relaxed(&queue->has_work)) {
+        return;
+    }
+
+    // Try to acquire the lock, but don't block if it's already held.
+    if (_PyMutex_LockTimed(&queue->mutex, 0, 0) == PY_LOCK_ACQUIRED) {
+        process_queue(&queue->head, qsbr, false);
+
+        int more_work = !llist_empty(&queue->head);
+        _Py_atomic_store_int_relaxed(&queue->has_work, more_work);
+
+        PyMutex_Unlock(&queue->mutex);
+    }
+}
+
+void
+_PyMem_ProcessDelayed(PyThreadState *tstate)
+{
+    PyInterpreterState *interp = tstate->interp;
+    _PyThreadStateImpl *tstate_impl = (_PyThreadStateImpl *)tstate;
+
+    // Process thread-local work
+    process_queue(&tstate_impl->mem_free_queue, tstate_impl->qsbr, true);
+
+    // Process shared interpreter work
+    process_interp_queue(&interp->mem_free_queue, tstate_impl->qsbr);
+}
+
+void
+_PyMem_AbandonDelayed(PyThreadState *tstate)
+{
+    PyInterpreterState *interp = tstate->interp;
+    struct llist_node *queue = &((_PyThreadStateImpl *)tstate)->mem_free_queue;
+
+    if (llist_empty(queue)) {
+        return;
+    }
+
+    // Check if the queue contains one empty buffer
+    struct _mem_work_chunk *buf = work_queue_first(queue);
+    if (buf->rd_idx == buf->wr_idx) {
+        llist_remove(&buf->node);
+        PyMem_Free(buf);
+        assert(llist_empty(queue));
+        return;
+    }
+
+    // Merge the thread's work queue into the interpreter's work queue.
+    PyMutex_Lock(&interp->mem_free_queue.mutex);
+    llist_concat(&interp->mem_free_queue.head, queue);
+    _Py_atomic_store_int_relaxed(&interp->mem_free_queue.has_work, 1);
+    PyMutex_Unlock(&interp->mem_free_queue.mutex);
+
+    assert(llist_empty(queue));  // the thread's queue is now empty
+}
+
+void
+_PyMem_FiniDelayed(PyInterpreterState *interp)
+{
+    struct llist_node *head = &interp->mem_free_queue.head;
+    while (!llist_empty(head)) {
+        struct _mem_work_chunk *buf = work_queue_first(head);
+
+        while (buf->rd_idx < buf->wr_idx) {
+            // Free the remaining items immediately. There should be no other
+            // threads accessing the memory at this point during shutdown.
+            struct _mem_work_item *item = &buf->array[buf->rd_idx];
+            PyMem_Free(item->ptr);
+            buf->rd_idx++;
+        }
+
+        llist_remove(&buf->node);
+        PyMem_Free(buf);
+    }
+}
 
 /**************************/
 /* the "object" allocator */

Python/pylifecycle.c
@@ -1837,6 +1837,9 @@ finalize_interp_clear(PyThreadState *tstate)
 
     finalize_interp_types(tstate->interp);
 
+    /* Free any delayed free requests immediately */
+    _PyMem_FiniDelayed(tstate->interp);
+
     /* finalize_interp_types may allocate Python objects so we may need to
        abandon mimalloc segments again */
     _PyThreadState_ClearMimallocHeaps(tstate);

Python/pystate.c
@@ -617,6 +617,7 @@ init_interpreter(PyInterpreterState *interp,
 #ifdef Py_GIL_DISABLED
     _Py_brc_init_state(interp);
 #endif
+    llist_init(&interp->mem_free_queue.head);
     for (int i = 0; i < _PY_MONITORING_UNGROUPED_EVENTS; i++) {
         interp->monitors.tools[i] = 0;
     }
@@ -1353,6 +1354,7 @@ init_threadstate(_PyThreadStateImpl *_tstate,
     // Initialize biased reference counting inter-thread queue
     _Py_brc_init_thread(tstate);
 #endif
+    llist_init(&_tstate->mem_free_queue);
 
     if (interp->stoptheworld.requested || _PyRuntime.stoptheworld.requested) {
         // Start in the suspended state if there is an ongoing stop-the-world.
@@ -1574,6 +1576,7 @@ PyThreadState_Clear(PyThreadState *tstate)
         // don't call _PyInterpreterState_SetNotRunningMain() yet.
         tstate->on_delete(tstate->on_delete_data);
     }
+
 #ifdef Py_GIL_DISABLED
     // Each thread should clear own freelists in free-threading builds.
     struct _Py_object_freelists *freelists = _Py_object_freelists_GET();
@@ -1583,6 +1586,9 @@ PyThreadState_Clear(PyThreadState *tstate)
     _Py_brc_remove_thread(tstate);
 #endif
 
+    // Merge our queue of pointers to be freed into the interpreter queue.
+    _PyMem_AbandonDelayed(tstate);
+
     _PyThreadState_ClearMimallocHeaps(tstate);
 
     tstate->_status.cleared = 1;

Sam Gross