mirror of
				https://github.com/python/cpython.git
				synced 2025-10-31 13:41:24 +00:00 
			
		
		
		
	Issue #12778: Reduce memory consumption when JSON-encoding a large container of many small objects.
This commit is contained in:
		
							parent
							
								
									9bb6dfe0a8
								
							
						
					
					
						commit
						df7fc9dda7
					
				
					 3 changed files with 175 additions and 50 deletions
				
			
		
							
								
								
									
										203
									
								
								Modules/_json.c
									
										
									
									
									
								
							
							
						
						
									
										203
									
								
								Modules/_json.c
									
										
									
									
									
								
							|  | @ -75,6 +75,121 @@ static PyMemberDef encoder_members[] = { | |||
|     {NULL} | ||||
| }; | ||||
| 
 | ||||
| /*
 | ||||
|  * A two-level accumulator of unicode objects that avoids both the overhead | ||||
|  * of keeping a huge number of small separate objects, and the quadratic | ||||
|  * behaviour of using a naive repeated concatenation scheme. | ||||
|  */ | ||||
| 
 | ||||
| typedef struct { | ||||
|     PyObject *large;  /* A list of previously accumulated large strings */ | ||||
|     PyObject *small;  /* Pending small strings */ | ||||
| } accumulator; | ||||
| 
 | ||||
| static PyObject * | ||||
| join_list_unicode(PyObject *lst) | ||||
| { | ||||
|     /* return u''.join(lst) */ | ||||
|     static PyObject *sep = NULL; | ||||
|     if (sep == NULL) { | ||||
|         sep = PyUnicode_FromStringAndSize("", 0); | ||||
|         if (sep == NULL) | ||||
|             return NULL; | ||||
|     } | ||||
|     return PyUnicode_Join(sep, lst); | ||||
| } | ||||
| 
 | ||||
| static int | ||||
| init_accumulator(accumulator *acc) | ||||
| { | ||||
|     acc->large = PyList_New(0); | ||||
|     if (acc->large == NULL) | ||||
|         return -1; | ||||
|     acc->small = PyList_New(0); | ||||
|     if (acc->small == NULL) { | ||||
|         Py_CLEAR(acc->large); | ||||
|         return -1; | ||||
|     } | ||||
|     return 0; | ||||
| } | ||||
| 
 | ||||
| static int | ||||
| flush_accumulator(accumulator *acc) | ||||
| { | ||||
|     Py_ssize_t nsmall = PyList_GET_SIZE(acc->small); | ||||
|     if (nsmall) { | ||||
|         int ret; | ||||
|         PyObject *joined = join_list_unicode(acc->small); | ||||
|         if (joined == NULL) | ||||
|             return -1; | ||||
|         if (PyList_SetSlice(acc->small, 0, nsmall, NULL)) { | ||||
|             Py_DECREF(joined); | ||||
|             return -1; | ||||
|         } | ||||
|         ret = PyList_Append(acc->large, joined); | ||||
|         Py_DECREF(joined); | ||||
|         return ret; | ||||
|     } | ||||
|     return 0; | ||||
| } | ||||
| 
 | ||||
| static int | ||||
| accumulate_unicode(accumulator *acc, PyObject *obj) | ||||
| { | ||||
|     int ret; | ||||
|     Py_ssize_t nsmall; | ||||
|     assert(PyUnicode_Check(obj)); | ||||
| 
 | ||||
|     if (PyList_Append(acc->small, obj)) | ||||
|         return -1; | ||||
|     nsmall = PyList_GET_SIZE(acc->small); | ||||
|     /* Each item in a list of unicode objects has an overhead (in 64-bit
 | ||||
|      * builds) of: | ||||
|      *   - 8 bytes for the list slot | ||||
|      *   - 56 bytes for the header of the unicode object | ||||
|      * that is, 64 bytes.  100000 such objects waste more than 6MB | ||||
|      * compared to a single concatenated string. | ||||
|      */ | ||||
|     if (nsmall < 100000) | ||||
|         return 0; | ||||
|     PyObject *joined = join_list_unicode(acc->small); | ||||
|     if (joined == NULL) | ||||
|         return -1; | ||||
|     if (PyList_SetSlice(acc->small, 0, nsmall, NULL)) { | ||||
|         Py_DECREF(joined); | ||||
|         return -1; | ||||
|     } | ||||
|     ret = PyList_Append(acc->large, joined); | ||||
|     Py_DECREF(joined); | ||||
|     return ret; | ||||
| } | ||||
| 
 | ||||
| static PyObject * | ||||
| finish_accumulator(accumulator *acc) | ||||
| { | ||||
|     int ret; | ||||
|     PyObject *res; | ||||
| 
 | ||||
|     ret = flush_accumulator(acc); | ||||
|     Py_CLEAR(acc->small); | ||||
|     if (ret) { | ||||
|         Py_CLEAR(acc->large); | ||||
|         return NULL; | ||||
|     } | ||||
|     res = acc->large; | ||||
|     acc->large = NULL; | ||||
|     return res; | ||||
| } | ||||
| 
 | ||||
| static void | ||||
| destroy_accumulator(accumulator *acc) | ||||
| { | ||||
|     Py_CLEAR(acc->small); | ||||
|     Py_CLEAR(acc->large); | ||||
| } | ||||
| 
 | ||||
| /* Forward decls */ | ||||
| 
 | ||||
| static PyObject * | ||||
| ascii_escape_unicode(PyObject *pystr); | ||||
| static PyObject * | ||||
|  | @ -101,11 +216,11 @@ encoder_dealloc(PyObject *self); | |||
| static int | ||||
| encoder_clear(PyObject *self); | ||||
| static int | ||||
| encoder_listencode_list(PyEncoderObject *s, PyObject *rval, PyObject *seq, Py_ssize_t indent_level); | ||||
| encoder_listencode_list(PyEncoderObject *s, accumulator *acc, PyObject *seq, Py_ssize_t indent_level); | ||||
| static int | ||||
| encoder_listencode_obj(PyEncoderObject *s, PyObject *rval, PyObject *obj, Py_ssize_t indent_level); | ||||
| encoder_listencode_obj(PyEncoderObject *s, accumulator *acc, PyObject *obj, Py_ssize_t indent_level); | ||||
| static int | ||||
| encoder_listencode_dict(PyEncoderObject *s, PyObject *rval, PyObject *dct, Py_ssize_t indent_level); | ||||
| encoder_listencode_dict(PyEncoderObject *s, accumulator *acc, PyObject *dct, Py_ssize_t indent_level); | ||||
| static PyObject * | ||||
| _encoded_const(PyObject *obj); | ||||
| static void | ||||
|  | @ -266,19 +381,6 @@ raise_errmsg(char *msg, PyObject *s, Py_ssize_t end) | |||
|     } | ||||
| } | ||||
| 
 | ||||
| static PyObject * | ||||
| join_list_unicode(PyObject *lst) | ||||
| { | ||||
|     /* return u''.join(lst) */ | ||||
|     static PyObject *sep = NULL; | ||||
|     if (sep == NULL) { | ||||
|         sep = PyUnicode_FromStringAndSize("", 0); | ||||
|         if (sep == NULL) | ||||
|             return NULL; | ||||
|     } | ||||
|     return PyUnicode_Join(sep, lst); | ||||
| } | ||||
| 
 | ||||
| static PyObject * | ||||
| _build_rval_index_tuple(PyObject *rval, Py_ssize_t idx) { | ||||
|     /* return (rval, idx) tuple, stealing reference to rval */ | ||||
|  | @ -1226,22 +1328,22 @@ encoder_call(PyObject *self, PyObject *args, PyObject *kwds) | |||
|     /* Python callable interface to encode_listencode_obj */ | ||||
|     static char *kwlist[] = {"obj", "_current_indent_level", NULL}; | ||||
|     PyObject *obj; | ||||
|     PyObject *rval; | ||||
|     Py_ssize_t indent_level; | ||||
|     PyEncoderObject *s; | ||||
|     accumulator acc; | ||||
| 
 | ||||
|     assert(PyEncoder_Check(self)); | ||||
|     s = (PyEncoderObject *)self; | ||||
|     if (!PyArg_ParseTupleAndKeywords(args, kwds, "OO&:_iterencode", kwlist, | ||||
|         &obj, _convertPyInt_AsSsize_t, &indent_level)) | ||||
|         return NULL; | ||||
|     rval = PyList_New(0); | ||||
|     if (rval == NULL) | ||||
|     if (init_accumulator(&acc)) | ||||
|         return NULL; | ||||
|     if (encoder_listencode_obj(s, rval, obj, indent_level)) { | ||||
|         Py_DECREF(rval); | ||||
|     if (encoder_listencode_obj(s, &acc, obj, indent_level)) { | ||||
|         destroy_accumulator(&acc); | ||||
|         return NULL; | ||||
|     } | ||||
|     return rval; | ||||
|     return finish_accumulator(&acc); | ||||
| } | ||||
| 
 | ||||
| static PyObject * | ||||
|  | @ -1313,18 +1415,19 @@ encoder_encode_string(PyEncoderObject *s, PyObject *obj) | |||
| } | ||||
| 
 | ||||
| static int | ||||
| _steal_list_append(PyObject *lst, PyObject *stolen) | ||||
| _steal_accumulate(accumulator *acc, PyObject *stolen) | ||||
| { | ||||
|     /* Append stolen and then decrement its reference count */ | ||||
|     int rval = PyList_Append(lst, stolen); | ||||
|     int rval = accumulate_unicode(acc, stolen); | ||||
|     Py_DECREF(stolen); | ||||
|     return rval; | ||||
| } | ||||
| 
 | ||||
| static int | ||||
| encoder_listencode_obj(PyEncoderObject *s, PyObject *rval, PyObject *obj, Py_ssize_t indent_level) | ||||
| encoder_listencode_obj(PyEncoderObject *s, accumulator *acc, | ||||
|                        PyObject *obj, Py_ssize_t indent_level) | ||||
| { | ||||
|     /* Encode Python object obj to a JSON term, rval is a PyList */ | ||||
|     /* Encode Python object obj to a JSON term */ | ||||
|     PyObject *newobj; | ||||
|     int rv; | ||||
| 
 | ||||
|  | @ -1332,38 +1435,38 @@ encoder_listencode_obj(PyEncoderObject *s, PyObject *rval, PyObject *obj, Py_ssi | |||
|         PyObject *cstr = _encoded_const(obj); | ||||
|         if (cstr == NULL) | ||||
|             return -1; | ||||
|         return _steal_list_append(rval, cstr); | ||||
|         return _steal_accumulate(acc, cstr); | ||||
|     } | ||||
|     else if (PyUnicode_Check(obj)) | ||||
|     { | ||||
|         PyObject *encoded = encoder_encode_string(s, obj); | ||||
|         if (encoded == NULL) | ||||
|             return -1; | ||||
|         return _steal_list_append(rval, encoded); | ||||
|         return _steal_accumulate(acc, encoded); | ||||
|     } | ||||
|     else if (PyLong_Check(obj)) { | ||||
|         PyObject *encoded = PyObject_Str(obj); | ||||
|         if (encoded == NULL) | ||||
|             return -1; | ||||
|         return _steal_list_append(rval, encoded); | ||||
|         return _steal_accumulate(acc, encoded); | ||||
|     } | ||||
|     else if (PyFloat_Check(obj)) { | ||||
|         PyObject *encoded = encoder_encode_float(s, obj); | ||||
|         if (encoded == NULL) | ||||
|             return -1; | ||||
|         return _steal_list_append(rval, encoded); | ||||
|         return _steal_accumulate(acc, encoded); | ||||
|     } | ||||
|     else if (PyList_Check(obj) || PyTuple_Check(obj)) { | ||||
|         if (Py_EnterRecursiveCall(" while encoding a JSON object")) | ||||
|             return -1; | ||||
|         rv = encoder_listencode_list(s, rval, obj, indent_level); | ||||
|         rv = encoder_listencode_list(s, acc, obj, indent_level); | ||||
|         Py_LeaveRecursiveCall(); | ||||
|         return rv; | ||||
|     } | ||||
|     else if (PyDict_Check(obj)) { | ||||
|         if (Py_EnterRecursiveCall(" while encoding a JSON object")) | ||||
|             return -1; | ||||
|         rv = encoder_listencode_dict(s, rval, obj, indent_level); | ||||
|         rv = encoder_listencode_dict(s, acc, obj, indent_level); | ||||
|         Py_LeaveRecursiveCall(); | ||||
|         return rv; | ||||
|     } | ||||
|  | @ -1394,7 +1497,7 @@ encoder_listencode_obj(PyEncoderObject *s, PyObject *rval, PyObject *obj, Py_ssi | |||
| 
 | ||||
|         if (Py_EnterRecursiveCall(" while encoding a JSON object")) | ||||
|             return -1; | ||||
|         rv = encoder_listencode_obj(s, rval, newobj, indent_level); | ||||
|         rv = encoder_listencode_obj(s, acc, newobj, indent_level); | ||||
|         Py_LeaveRecursiveCall(); | ||||
| 
 | ||||
|         Py_DECREF(newobj); | ||||
|  | @ -1414,9 +1517,10 @@ encoder_listencode_obj(PyEncoderObject *s, PyObject *rval, PyObject *obj, Py_ssi | |||
| } | ||||
| 
 | ||||
| static int | ||||
| encoder_listencode_dict(PyEncoderObject *s, PyObject *rval, PyObject *dct, Py_ssize_t indent_level) | ||||
| encoder_listencode_dict(PyEncoderObject *s, accumulator *acc, | ||||
|                         PyObject *dct, Py_ssize_t indent_level) | ||||
| { | ||||
|     /* Encode Python dict dct a JSON term, rval is a PyList */ | ||||
|     /* Encode Python dict dct a JSON term */ | ||||
|     static PyObject *open_dict = NULL; | ||||
|     static PyObject *close_dict = NULL; | ||||
|     static PyObject *empty_dict = NULL; | ||||
|  | @ -1436,7 +1540,7 @@ encoder_listencode_dict(PyEncoderObject *s, PyObject *rval, PyObject *dct, Py_ss | |||
|             return -1; | ||||
|     } | ||||
|     if (Py_SIZE(dct) == 0) | ||||
|         return PyList_Append(rval, empty_dict); | ||||
|         return accumulate_unicode(acc, empty_dict); | ||||
| 
 | ||||
|     if (s->markers != Py_None) { | ||||
|         int has_key; | ||||
|  | @ -1454,7 +1558,7 @@ encoder_listencode_dict(PyEncoderObject *s, PyObject *rval, PyObject *dct, Py_ss | |||
|         } | ||||
|     } | ||||
| 
 | ||||
|     if (PyList_Append(rval, open_dict)) | ||||
|     if (accumulate_unicode(acc, open_dict)) | ||||
|         goto bail; | ||||
| 
 | ||||
|     if (s->indent != Py_None) { | ||||
|  | @ -1541,7 +1645,7 @@ encoder_listencode_dict(PyEncoderObject *s, PyObject *rval, PyObject *dct, Py_ss | |||
|         } | ||||
| 
 | ||||
|         if (idx) { | ||||
|             if (PyList_Append(rval, s->item_separator)) | ||||
|             if (accumulate_unicode(acc, s->item_separator)) | ||||
|                 goto bail; | ||||
|         } | ||||
| 
 | ||||
|  | @ -1549,16 +1653,16 @@ encoder_listencode_dict(PyEncoderObject *s, PyObject *rval, PyObject *dct, Py_ss | |||
|         Py_CLEAR(kstr); | ||||
|         if (encoded == NULL) | ||||
|             goto bail; | ||||
|         if (PyList_Append(rval, encoded)) { | ||||
|         if (accumulate_unicode(acc, encoded)) { | ||||
|             Py_DECREF(encoded); | ||||
|             goto bail; | ||||
|         } | ||||
|         Py_DECREF(encoded); | ||||
|         if (PyList_Append(rval, s->key_separator)) | ||||
|         if (accumulate_unicode(acc, s->key_separator)) | ||||
|             goto bail; | ||||
| 
 | ||||
|         value = PyTuple_GET_ITEM(item, 1); | ||||
|         if (encoder_listencode_obj(s, rval, value, indent_level)) | ||||
|         if (encoder_listencode_obj(s, acc, value, indent_level)) | ||||
|             goto bail; | ||||
|         idx += 1; | ||||
|         Py_DECREF(item); | ||||
|  | @ -1578,7 +1682,7 @@ encoder_listencode_dict(PyEncoderObject *s, PyObject *rval, PyObject *dct, Py_ss | |||
| 
 | ||||
|         yield '\n' + (' ' * (_indent * _current_indent_level)) | ||||
|     }*/ | ||||
|     if (PyList_Append(rval, close_dict)) | ||||
|     if (accumulate_unicode(acc, close_dict)) | ||||
|         goto bail; | ||||
|     return 0; | ||||
| 
 | ||||
|  | @ -1592,9 +1696,10 @@ encoder_listencode_dict(PyEncoderObject *s, PyObject *rval, PyObject *dct, Py_ss | |||
| 
 | ||||
| 
 | ||||
| static int | ||||
| encoder_listencode_list(PyEncoderObject *s, PyObject *rval, PyObject *seq, Py_ssize_t indent_level) | ||||
| encoder_listencode_list(PyEncoderObject *s, accumulator *acc, | ||||
|                         PyObject *seq, Py_ssize_t indent_level) | ||||
| { | ||||
|     /* Encode Python list seq to a JSON term, rval is a PyList */ | ||||
|     /* Encode Python list seq to a JSON term */ | ||||
|     static PyObject *open_array = NULL; | ||||
|     static PyObject *close_array = NULL; | ||||
|     static PyObject *empty_array = NULL; | ||||
|  | @ -1618,7 +1723,7 @@ encoder_listencode_list(PyEncoderObject *s, PyObject *rval, PyObject *seq, Py_ss | |||
|     num_items = PySequence_Fast_GET_SIZE(s_fast); | ||||
|     if (num_items == 0) { | ||||
|         Py_DECREF(s_fast); | ||||
|         return PyList_Append(rval, empty_array); | ||||
|         return accumulate_unicode(acc, empty_array); | ||||
|     } | ||||
| 
 | ||||
|     if (s->markers != Py_None) { | ||||
|  | @ -1638,7 +1743,7 @@ encoder_listencode_list(PyEncoderObject *s, PyObject *rval, PyObject *seq, Py_ss | |||
|     } | ||||
| 
 | ||||
|     seq_items = PySequence_Fast_ITEMS(s_fast); | ||||
|     if (PyList_Append(rval, open_array)) | ||||
|     if (accumulate_unicode(acc, open_array)) | ||||
|         goto bail; | ||||
|     if (s->indent != Py_None) { | ||||
|         /* TODO: DOES NOT RUN */ | ||||
|  | @ -1652,10 +1757,10 @@ encoder_listencode_list(PyEncoderObject *s, PyObject *rval, PyObject *seq, Py_ss | |||
|     for (i = 0; i < num_items; i++) { | ||||
|         PyObject *obj = seq_items[i]; | ||||
|         if (i) { | ||||
|             if (PyList_Append(rval, s->item_separator)) | ||||
|             if (accumulate_unicode(acc, s->item_separator)) | ||||
|                 goto bail; | ||||
|         } | ||||
|         if (encoder_listencode_obj(s, rval, obj, indent_level)) | ||||
|         if (encoder_listencode_obj(s, acc, obj, indent_level)) | ||||
|             goto bail; | ||||
|     } | ||||
|     if (ident != NULL) { | ||||
|  | @ -1670,7 +1775,7 @@ encoder_listencode_list(PyEncoderObject *s, PyObject *rval, PyObject *seq, Py_ss | |||
| 
 | ||||
|         yield '\n' + (' ' * (_indent * _current_indent_level)) | ||||
|     }*/ | ||||
|     if (PyList_Append(rval, close_array)) | ||||
|     if (accumulate_unicode(acc, close_array)) | ||||
|         goto bail; | ||||
|     Py_DECREF(s_fast); | ||||
|     return 0; | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Antoine Pitrou
						Antoine Pitrou