mirror of
				https://github.com/python/cpython.git
				synced 2025-10-31 13:41:24 +00:00 
			
		
		
		
	bpo-46845: Reduce dict size when all keys are Unicode (GH-31564)
This commit is contained in:
		
							parent
							
								
									21099fc064
								
							
						
					
					
						commit
						9833bb91e4
					
				
					 9 changed files with 884 additions and 491 deletions
				
			
		|  | @ -404,6 +404,11 @@ Optimizations | |||
|   larger *k*). | ||||
|   (Contributed by Serhiy Storchaka in :issue:`37295`.) | ||||
| 
 | ||||
| * Dicts don't store hash values when all inserted keys are Unicode objects. | ||||
|   This reduces dict size. For example, ``sys.getsizeof(dict.fromkeys("abcdefg"))`` | ||||
|   is reduced from 352 bytes to 272 bytes on 64-bit platforms. | ||||
|   (Contributed by Inada Naoki in :issue:`46845`.) | ||||
| 
 | ||||
| 
 | ||||
| CPython bytecode changes | ||||
| ======================== | ||||
|  |  | |||
|  | @ -43,6 +43,11 @@ typedef struct { | |||
|     PyObject *me_value; /* This field is only meaningful for combined tables */ | ||||
| } PyDictKeyEntry; | ||||
| 
 | ||||
| typedef struct { | ||||
|     PyObject *me_key;   /* The key must be Unicode and have hash. */ | ||||
|     PyObject *me_value; /* This field is only meaningful for combined tables */ | ||||
| } PyDictUnicodeEntry; | ||||
| 
 | ||||
| extern PyDictKeysObject *_PyDict_NewKeysForClass(void); | ||||
| extern PyObject *_PyDict_FromKeys(PyObject *, PyObject *, PyObject *); | ||||
| 
 | ||||
|  | @ -70,6 +75,7 @@ extern PyObject *_PyDict_Pop_KnownHash(PyObject *, PyObject *, Py_hash_t, PyObje | |||
| #define DKIX_EMPTY (-1) | ||||
| #define DKIX_DUMMY (-2)  /* Used internally */ | ||||
| #define DKIX_ERROR (-3) | ||||
| #define DKIX_KEY_CHANGED (-4) /* Used internally */ | ||||
| 
 | ||||
| typedef enum { | ||||
|     DICT_KEYS_GENERAL = 0, | ||||
|  | @ -114,7 +120,7 @@ struct _dictkeysobject { | |||
|        Dynamically sized, SIZEOF_VOID_P is minimum. */ | ||||
|     char dk_indices[];  /* char is required to avoid strict aliasing. */ | ||||
| 
 | ||||
|     /* "PyDictKeyEntry dk_entries[dk_usable];" array follows:
 | ||||
|     /* "PyDictKeyEntry or PyDictUnicodeEntry dk_entries[USABLE_FRACTION(DK_SIZE(dk))];" array follows:
 | ||||
|        see the DK_ENTRIES() macro */ | ||||
| }; | ||||
| 
 | ||||
|  | @ -148,13 +154,20 @@ struct _dictvalues { | |||
|             2 : sizeof(int32_t)) | ||||
| #endif | ||||
| #define DK_ENTRIES(dk) \ | ||||
|     ((PyDictKeyEntry*)(&((int8_t*)((dk)->dk_indices))[(size_t)1 << (dk)->dk_log2_index_bytes])) | ||||
|     (assert(dk->dk_kind == DICT_KEYS_GENERAL), (PyDictKeyEntry*)(&((int8_t*)((dk)->dk_indices))[(size_t)1 << (dk)->dk_log2_index_bytes])) | ||||
| #define DK_UNICODE_ENTRIES(dk) \ | ||||
|     (assert(dk->dk_kind != DICT_KEYS_GENERAL), (PyDictUnicodeEntry*)(&((int8_t*)((dk)->dk_indices))[(size_t)1 << (dk)->dk_log2_index_bytes])) | ||||
| #define DK_IS_UNICODE(dk) ((dk)->dk_kind != DICT_KEYS_GENERAL) | ||||
| 
 | ||||
| extern uint64_t _pydict_global_version; | ||||
| 
 | ||||
| #define DICT_NEXT_VERSION() (++_pydict_global_version) | ||||
| 
 | ||||
| extern PyObject *_PyObject_MakeDictFromInstanceAttributes(PyObject *obj, PyDictValues *values); | ||||
| extern PyObject *_PyDict_FromItems( | ||||
|         PyObject *const *keys, Py_ssize_t keys_offset, | ||||
|         PyObject *const *values, Py_ssize_t values_offset, | ||||
|         Py_ssize_t length); | ||||
| 
 | ||||
| static inline void | ||||
| _PyDictValues_AddToInsertionOrder(PyDictValues *values, Py_ssize_t ix) | ||||
|  |  | |||
|  | @ -1346,8 +1346,12 @@ def inner(): | |||
|         check({}.__iter__, size('2P')) | ||||
|         # empty dict | ||||
|         check({}, size('nQ2P')) | ||||
|         # dict | ||||
|         check({"a": 1}, size('nQ2P') + calcsize(DICT_KEY_STRUCT_FORMAT) + 8 + (8*2//3)*calcsize('n2P')) | ||||
|         # dict (string key) | ||||
|         check({"a": 1}, size('nQ2P') + calcsize(DICT_KEY_STRUCT_FORMAT) + 8 + (8*2//3)*calcsize('2P')) | ||||
|         longdict = {str(i): i for i in range(8)} | ||||
|         check(longdict, size('nQ2P') + calcsize(DICT_KEY_STRUCT_FORMAT) + 16 + (16*2//3)*calcsize('2P')) | ||||
|         # dict (non-string key) | ||||
|         check({1: 1}, size('nQ2P') + calcsize(DICT_KEY_STRUCT_FORMAT) + 8 + (8*2//3)*calcsize('n2P')) | ||||
|         longdict = {1:1, 2:2, 3:3, 4:4, 5:5, 6:6, 7:7, 8:8} | ||||
|         check(longdict, size('nQ2P') + calcsize(DICT_KEY_STRUCT_FORMAT) + 16 + (16*2//3)*calcsize('n2P')) | ||||
|         # dictionary-keyview | ||||
|  | @ -1506,14 +1510,14 @@ def delx(self): del self.__x | |||
|                   ) | ||||
|         class newstyleclass(object): pass | ||||
|         # Separate block for PyDictKeysObject with 8 keys and 5 entries | ||||
|         check(newstyleclass, s + calcsize(DICT_KEY_STRUCT_FORMAT) + 64 + 42*calcsize("n2P")) | ||||
|         check(newstyleclass, s + calcsize(DICT_KEY_STRUCT_FORMAT) + 64 + 42*calcsize("2P")) | ||||
|         # dict with shared keys | ||||
|         [newstyleclass() for _ in range(100)] | ||||
|         check(newstyleclass().__dict__, size('nQ2P') + self.P) | ||||
|         o = newstyleclass() | ||||
|         o.a = o.b = o.c = o.d = o.e = o.f = o.g = o.h = 1 | ||||
|         # Separate block for PyDictKeysObject with 16 keys and 10 entries | ||||
|         check(newstyleclass, s + calcsize(DICT_KEY_STRUCT_FORMAT) + 64 + 42*calcsize("n2P")) | ||||
|         check(newstyleclass, s + calcsize(DICT_KEY_STRUCT_FORMAT) + 64 + 42*calcsize("2P")) | ||||
|         # dict with shared keys | ||||
|         check(newstyleclass().__dict__, size('nQ2P') + self.P) | ||||
|         # unicode | ||||
|  |  | |||
|  | @ -0,0 +1,3 @@ | |||
| Reduces dict size by removing hash values from the hash table when all inserted | ||||
| keys are Unicode. For example, ``sys.getsizeof(dict.fromkeys("abcdefg"))`` | ||||
| is reduced from 352 bytes to 272 bytes on 64-bit platforms. | ||||
|  | @ -934,26 +934,11 @@ PyObject * | |||
| _PyStack_AsDict(PyObject *const *values, PyObject *kwnames) | ||||
| { | ||||
|     Py_ssize_t nkwargs; | ||||
|     PyObject *kwdict; | ||||
|     Py_ssize_t i; | ||||
| 
 | ||||
|     assert(kwnames != NULL); | ||||
|     nkwargs = PyTuple_GET_SIZE(kwnames); | ||||
|     kwdict = _PyDict_NewPresized(nkwargs); | ||||
|     if (kwdict == NULL) { | ||||
|         return NULL; | ||||
|     } | ||||
| 
 | ||||
|     for (i = 0; i < nkwargs; i++) { | ||||
|         PyObject *key = PyTuple_GET_ITEM(kwnames, i); | ||||
|         PyObject *value = *values++; | ||||
|         /* If key already exists, replace it with the new value */ | ||||
|         if (PyDict_SetItem(kwdict, key, value)) { | ||||
|             Py_DECREF(kwdict); | ||||
|             return NULL; | ||||
|         } | ||||
|     } | ||||
|     return kwdict; | ||||
|     return _PyDict_FromItems(&PyTuple_GET_ITEM(kwnames, 0), 1, | ||||
|                              values, 1, nkwargs); | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -70,8 +70,8 @@ A values array | |||
| Tunable Dictionary Parameters | ||||
| ----------------------------- | ||||
| 
 | ||||
| See comments for PyDict_MINSIZE_SPLIT, PyDict_MINSIZE_COMBINED, | ||||
| USABLE_FRACTION and GROWTH_RATE in dictobject.c | ||||
| See comments for PyDict_MINSIZE, USABLE_FRACTION and GROWTH_RATE in | ||||
| dictobject.c | ||||
| 
 | ||||
| Tune-ups should be measured across a broad range of applications and | ||||
| use cases.  A change to any parameter will help in some situations and | ||||
|  |  | |||
							
								
								
									
										1211
									
								
								Objects/dictobject.c
									
										
									
									
									
								
							
							
						
						
									
										1211
									
								
								Objects/dictobject.c
									
										
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							|  | @ -1457,7 +1457,7 @@ eval_frame_handle_pending(PyThreadState *tstate) | |||
|         LOAD_##attr_or_method); \ | ||||
|     assert(dict->ma_keys->dk_kind == DICT_KEYS_UNICODE); \ | ||||
|     assert(cache0->index < dict->ma_keys->dk_nentries); \ | ||||
|     PyDictKeyEntry *ep = DK_ENTRIES(dict->ma_keys) + cache0->index; \ | ||||
|     PyDictUnicodeEntry *ep = DK_UNICODE_ENTRIES(dict->ma_keys) + cache0->index; \ | ||||
|     res = ep->me_value; \ | ||||
|     DEOPT_IF(res == NULL, LOAD_##attr_or_method); \ | ||||
|     STAT_INC(LOAD_##attr_or_method, hit); \ | ||||
|  | @ -1595,6 +1595,19 @@ is_method(PyObject **stack_pointer, int args) { | |||
|     return PEEK(args+2) != NULL; | ||||
| } | ||||
| 
 | ||||
| static PyObject* | ||||
| dictkeys_get_value_by_index(PyDictKeysObject *dk, int index) | ||||
| { | ||||
|     if (DK_IS_UNICODE(dk)) { | ||||
|         PyDictUnicodeEntry *ep = DK_UNICODE_ENTRIES(dk) + index; | ||||
|         return ep->me_value; | ||||
|     } | ||||
|     else { | ||||
|         PyDictKeyEntry *ep = DK_ENTRIES(dk) + index; | ||||
|         return ep->me_value; | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| #define KWNAMES_LEN() \ | ||||
|     (call_shape.kwnames == NULL ? 0 : ((int)PyTuple_GET_SIZE(call_shape.kwnames))) | ||||
| 
 | ||||
|  | @ -3030,8 +3043,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int | |||
|             _PyLoadGlobalCache *cache = (_PyLoadGlobalCache *)next_instr; | ||||
|             uint32_t version = read32(&cache->module_keys_version); | ||||
|             DEOPT_IF(dict->ma_keys->dk_version != version, LOAD_GLOBAL); | ||||
|             PyDictKeyEntry *ep = DK_ENTRIES(dict->ma_keys) + cache->index; | ||||
|             PyObject *res = ep->me_value; | ||||
|             PyObject *res = dictkeys_get_value_by_index(dict->ma_keys, cache->index); | ||||
|             DEOPT_IF(res == NULL, LOAD_GLOBAL); | ||||
|             JUMPBY(INLINE_CACHE_ENTRIES_LOAD_GLOBAL); | ||||
|             STAT_INC(LOAD_GLOBAL, hit); | ||||
|  | @ -3051,8 +3063,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int | |||
|             uint16_t bltn_version = cache->builtin_keys_version; | ||||
|             DEOPT_IF(mdict->ma_keys->dk_version != mod_version, LOAD_GLOBAL); | ||||
|             DEOPT_IF(bdict->ma_keys->dk_version != bltn_version, LOAD_GLOBAL); | ||||
|             PyDictKeyEntry *ep = DK_ENTRIES(bdict->ma_keys) + cache->index; | ||||
|             PyObject *res = ep->me_value; | ||||
|             PyObject *res = dictkeys_get_value_by_index(bdict->ma_keys, cache->index); | ||||
|             DEOPT_IF(res == NULL, LOAD_GLOBAL); | ||||
|             JUMPBY(INLINE_CACHE_ENTRIES_LOAD_GLOBAL); | ||||
|             STAT_INC(LOAD_GLOBAL, hit); | ||||
|  | @ -3272,20 +3283,12 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int | |||
|         } | ||||
| 
 | ||||
|         TARGET(BUILD_MAP) { | ||||
|             Py_ssize_t i; | ||||
|             PyObject *map = _PyDict_NewPresized((Py_ssize_t)oparg); | ||||
|             PyObject *map = _PyDict_FromItems( | ||||
|                     &PEEK(2*oparg), 2, | ||||
|                     &PEEK(2*oparg - 1), 2, | ||||
|                     oparg); | ||||
|             if (map == NULL) | ||||
|                 goto error; | ||||
|             for (i = oparg; i > 0; i--) { | ||||
|                 int err; | ||||
|                 PyObject *key = PEEK(2*i); | ||||
|                 PyObject *value = PEEK(2*i - 1); | ||||
|                 err = PyDict_SetItem(map, key, value); | ||||
|                 if (err != 0) { | ||||
|                     Py_DECREF(map); | ||||
|                     goto error; | ||||
|                 } | ||||
|             } | ||||
| 
 | ||||
|             while (oparg--) { | ||||
|                 Py_DECREF(POP()); | ||||
|  | @ -3351,7 +3354,6 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int | |||
|         } | ||||
| 
 | ||||
|         TARGET(BUILD_CONST_KEY_MAP) { | ||||
|             Py_ssize_t i; | ||||
|             PyObject *map; | ||||
|             PyObject *keys = TOP(); | ||||
|             if (!PyTuple_CheckExact(keys) || | ||||
|  | @ -3360,20 +3362,12 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int | |||
|                                  "bad BUILD_CONST_KEY_MAP keys argument"); | ||||
|                 goto error; | ||||
|             } | ||||
|             map = _PyDict_NewPresized((Py_ssize_t)oparg); | ||||
|             map = _PyDict_FromItems( | ||||
|                     &PyTuple_GET_ITEM(keys, 0), 1, | ||||
|                     &PEEK(oparg + 1), 1, oparg); | ||||
|             if (map == NULL) { | ||||
|                 goto error; | ||||
|             } | ||||
|             for (i = oparg; i > 0; i--) { | ||||
|                 int err; | ||||
|                 PyObject *key = PyTuple_GET_ITEM(keys, oparg - i); | ||||
|                 PyObject *value = PEEK(i + 1); | ||||
|                 err = PyDict_SetItem(map, key, value); | ||||
|                 if (err != 0) { | ||||
|                     Py_DECREF(map); | ||||
|                     goto error; | ||||
|                 } | ||||
|             } | ||||
| 
 | ||||
|             Py_DECREF(POP()); | ||||
|             while (oparg--) { | ||||
|  | @ -3538,9 +3532,16 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int | |||
|             PyObject *name = GETITEM(names, cache0->original_oparg); | ||||
|             uint16_t hint = cache0->index; | ||||
|             DEOPT_IF(hint >= (size_t)dict->ma_keys->dk_nentries, LOAD_ATTR); | ||||
|             PyDictKeyEntry *ep = DK_ENTRIES(dict->ma_keys) + hint; | ||||
|             DEOPT_IF(ep->me_key != name, LOAD_ATTR); | ||||
|             res = ep->me_value; | ||||
|             if (DK_IS_UNICODE(dict->ma_keys)) { | ||||
|                 PyDictUnicodeEntry *ep = DK_UNICODE_ENTRIES(dict->ma_keys) + hint; | ||||
|                 DEOPT_IF(ep->me_key != name, LOAD_ATTR); | ||||
|                 res = ep->me_value; | ||||
|             } | ||||
|             else { | ||||
|                 PyDictKeyEntry *ep = DK_ENTRIES(dict->ma_keys) + hint; | ||||
|                 DEOPT_IF(ep->me_key != name, LOAD_ATTR); | ||||
|                 res = ep->me_value; | ||||
|             } | ||||
|             DEOPT_IF(res == NULL, LOAD_ATTR); | ||||
|             STAT_INC(LOAD_ATTR, hit); | ||||
|             Py_INCREF(res); | ||||
|  | @ -3630,15 +3631,27 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int | |||
|             PyObject *name = GETITEM(names, cache0->original_oparg); | ||||
|             uint16_t hint = cache0->index; | ||||
|             DEOPT_IF(hint >= (size_t)dict->ma_keys->dk_nentries, STORE_ATTR); | ||||
|             PyDictKeyEntry *ep = DK_ENTRIES(dict->ma_keys) + hint; | ||||
|             DEOPT_IF(ep->me_key != name, STORE_ATTR); | ||||
|             PyObject *old_value = ep->me_value; | ||||
|             DEOPT_IF(old_value == NULL, STORE_ATTR); | ||||
|             STAT_INC(STORE_ATTR, hit); | ||||
|             STACK_SHRINK(1); | ||||
|             PyObject *value = POP(); | ||||
|             ep->me_value = value; | ||||
|             PyObject *value, *old_value; | ||||
|             if (DK_IS_UNICODE(dict->ma_keys)) { | ||||
|                 PyDictUnicodeEntry *ep = DK_UNICODE_ENTRIES(dict->ma_keys) + hint; | ||||
|                 DEOPT_IF(ep->me_key != name, STORE_ATTR); | ||||
|                 old_value = ep->me_value; | ||||
|                 DEOPT_IF(old_value == NULL, STORE_ATTR); | ||||
|                 STACK_SHRINK(1); | ||||
|                 value = POP(); | ||||
|                 ep->me_value = value; | ||||
|             } | ||||
|             else { | ||||
|                 PyDictKeyEntry *ep = DK_ENTRIES(dict->ma_keys) + hint; | ||||
|                 DEOPT_IF(ep->me_key != name, STORE_ATTR); | ||||
|                 old_value = ep->me_value; | ||||
|                 DEOPT_IF(old_value == NULL, STORE_ATTR); | ||||
|                 STACK_SHRINK(1); | ||||
|                 value = POP(); | ||||
|                 ep->me_value = value; | ||||
|             } | ||||
|             Py_DECREF(old_value); | ||||
|             STAT_INC(STORE_ATTR, hit); | ||||
|             /* Ensure dict is GC tracked if it needs to be */ | ||||
|             if (!_PyObject_GC_IS_TRACKED(dict) && _PyObject_GC_MAY_BE_TRACKED(value)) { | ||||
|                 _PyObject_GC_TRACK(dict); | ||||
|  |  | |||
|  | @ -787,12 +787,6 @@ def write_repr(self, out, visited): | |||
|     def _get_entries(keys): | ||||
|         dk_nentries = int(keys['dk_nentries']) | ||||
|         dk_size = 1<<int(keys['dk_log2_size']) | ||||
|         try: | ||||
|             # <= Python 3.5 | ||||
|             return keys['dk_entries'], dk_size | ||||
|         except RuntimeError: | ||||
|             # >= Python 3.6 | ||||
|             pass | ||||
| 
 | ||||
|         if dk_size <= 0xFF: | ||||
|             offset = dk_size | ||||
|  | @ -805,7 +799,10 @@ def _get_entries(keys): | |||
| 
 | ||||
|         ent_addr = keys['dk_indices'].address | ||||
|         ent_addr = ent_addr.cast(_type_unsigned_char_ptr()) + offset | ||||
|         ent_ptr_t = gdb.lookup_type('PyDictKeyEntry').pointer() | ||||
|         if int(keys['dk_kind']) == 0:  # DICT_KEYS_GENERAL | ||||
|             ent_ptr_t = gdb.lookup_type('PyDictKeyEntry').pointer() | ||||
|         else: | ||||
|             ent_ptr_t = gdb.lookup_type('PyDictUnicodeEntry').pointer() | ||||
|         ent_addr = ent_addr.cast(ent_ptr_t) | ||||
| 
 | ||||
|         return ent_addr, dk_nentries | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Inada Naoki
						Inada Naoki