mirror of
				https://github.com/python/cpython.git
				synced 2025-11-04 07:31:38 +00:00 
			
		
		
		
	Optimize unicode_subtype_new(): don't encode to wchar_t and decode from wchar_t
Rewrite unicode_subtype_new(): allocate the right type directly.
This commit is contained in:
		
							parent
							
								
									e90fe6a8f4
								
							
						
					
					
						commit
						07ac3ebd7b
					
				
					 2 changed files with 81 additions and 43 deletions
				
			
		| 
						 | 
					@ -1010,10 +1010,13 @@ def test_constructor(self):
 | 
				
			||||||
        class UnicodeSubclass(str):
 | 
					        class UnicodeSubclass(str):
 | 
				
			||||||
            pass
 | 
					            pass
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        self.assertEqual(
 | 
					        for text in ('ascii', '\xe9', '\u20ac', '\U0010FFFF'):
 | 
				
			||||||
            str(UnicodeSubclass('unicode subclass becomes unicode')),
 | 
					            subclass = UnicodeSubclass(text)
 | 
				
			||||||
            'unicode subclass becomes unicode'
 | 
					            self.assertEqual(str(subclass), text)
 | 
				
			||||||
        )
 | 
					            self.assertEqual(len(subclass), len(text))
 | 
				
			||||||
 | 
					            if text == 'ascii':
 | 
				
			||||||
 | 
					                self.assertEqual(subclass.encode('ascii'), b'ascii')
 | 
				
			||||||
 | 
					                self.assertEqual(subclass.encode('utf-8'), b'ascii')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        self.assertEqual(
 | 
					        self.assertEqual(
 | 
				
			||||||
            str('strings are converted to unicode'),
 | 
					            str('strings are converted to unicode'),
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -12410,56 +12410,91 @@ unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
 | 
				
			||||||
static PyObject *
 | 
					static PyObject *
 | 
				
			||||||
unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
 | 
					unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
    PyUnicodeObject *tmp, *pnew;
 | 
					    PyUnicodeObject *unicode, *self;
 | 
				
			||||||
    Py_ssize_t n;
 | 
					    Py_ssize_t length, char_size;
 | 
				
			||||||
    PyObject *err = NULL;
 | 
					    int share_wstr, share_utf8;
 | 
				
			||||||
 | 
					    unsigned int kind;
 | 
				
			||||||
 | 
					    void *data;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    assert(PyType_IsSubtype(type, &PyUnicode_Type));
 | 
					    assert(PyType_IsSubtype(type, &PyUnicode_Type));
 | 
				
			||||||
    tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
 | 
					
 | 
				
			||||||
    if (tmp == NULL)
 | 
					    unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
 | 
				
			||||||
 | 
					    if (unicode == NULL)
 | 
				
			||||||
        return NULL;
 | 
					        return NULL;
 | 
				
			||||||
    assert(PyUnicode_Check(tmp));
 | 
					    assert(PyUnicode_Check(unicode));
 | 
				
			||||||
    // TODO: Verify the PyUnicode_GET_SIZE does the right thing.
 | 
					    if (PyUnicode_READY(unicode))
 | 
				
			||||||
    //       it seems kind of strange that tp_alloc gets passed the size
 | 
					        return NULL;
 | 
				
			||||||
    //       of the unicode string because there will follow another
 | 
					
 | 
				
			||||||
    //       malloc.
 | 
					    self = (PyUnicodeObject *) type->tp_alloc(type, 0);
 | 
				
			||||||
    pnew = (PyUnicodeObject *) type->tp_alloc(type,
 | 
					    if (self == NULL) {
 | 
				
			||||||
                                              n = PyUnicode_GET_SIZE(tmp));
 | 
					        Py_DECREF(unicode);
 | 
				
			||||||
    if (pnew == NULL) {
 | 
					 | 
				
			||||||
        Py_DECREF(tmp);
 | 
					 | 
				
			||||||
        return NULL;
 | 
					        return NULL;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    _PyUnicode_WSTR(pnew) = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
 | 
					    kind = PyUnicode_KIND(unicode);
 | 
				
			||||||
    if (_PyUnicode_WSTR(pnew) == NULL) {
 | 
					    length = PyUnicode_GET_LENGTH(unicode);
 | 
				
			||||||
        err = PyErr_NoMemory();
 | 
					
 | 
				
			||||||
 | 
					    _PyUnicode_LENGTH(self) = length;
 | 
				
			||||||
 | 
					    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
 | 
				
			||||||
 | 
					    _PyUnicode_STATE(self).interned = 0;
 | 
				
			||||||
 | 
					    _PyUnicode_STATE(self).kind = kind;
 | 
				
			||||||
 | 
					    _PyUnicode_STATE(self).compact = 0;
 | 
				
			||||||
 | 
					    _PyUnicode_STATE(self).ascii = 0;
 | 
				
			||||||
 | 
					    _PyUnicode_STATE(self).ready = 1;
 | 
				
			||||||
 | 
					    _PyUnicode_WSTR(self) = NULL;
 | 
				
			||||||
 | 
					    _PyUnicode_UTF8_LENGTH(self) = 0;
 | 
				
			||||||
 | 
					    _PyUnicode_UTF8(self) = NULL;
 | 
				
			||||||
 | 
					    _PyUnicode_WSTR_LENGTH(self) = 0;
 | 
				
			||||||
 | 
					    self->data.any = NULL;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    share_utf8 = 0;
 | 
				
			||||||
 | 
					    share_wstr = 0;
 | 
				
			||||||
 | 
					    if (kind == PyUnicode_1BYTE_KIND) {
 | 
				
			||||||
 | 
					        char_size = 1;
 | 
				
			||||||
 | 
					        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
 | 
				
			||||||
 | 
					            share_utf8 = 1;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    else if (kind == PyUnicode_2BYTE_KIND) {
 | 
				
			||||||
 | 
					        char_size = 2;
 | 
				
			||||||
 | 
					        if (sizeof(wchar_t) == 2)
 | 
				
			||||||
 | 
					            share_wstr = 1;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    else {
 | 
				
			||||||
 | 
					        assert(kind == PyUnicode_4BYTE_KIND);
 | 
				
			||||||
 | 
					        char_size = 4;
 | 
				
			||||||
 | 
					        if (sizeof(wchar_t) == 4)
 | 
				
			||||||
 | 
					            share_wstr = 1;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    /* Ensure we won't overflow the length. */
 | 
				
			||||||
 | 
					    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
 | 
				
			||||||
 | 
					        PyErr_NoMemory();
 | 
				
			||||||
        goto onError;
 | 
					        goto onError;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
    Py_UNICODE_COPY(_PyUnicode_WSTR(pnew), PyUnicode_AS_UNICODE(tmp), n+1);
 | 
					    data = PyObject_MALLOC((length + 1) * char_size);
 | 
				
			||||||
    _PyUnicode_WSTR_LENGTH(pnew) = n;
 | 
					    if (data == NULL) {
 | 
				
			||||||
    _PyUnicode_HASH(pnew) = _PyUnicode_HASH(tmp);
 | 
					        PyErr_NoMemory();
 | 
				
			||||||
    _PyUnicode_STATE(pnew).interned = 0;
 | 
					 | 
				
			||||||
    _PyUnicode_STATE(pnew).kind = 0;
 | 
					 | 
				
			||||||
    _PyUnicode_STATE(pnew).compact = 0;
 | 
					 | 
				
			||||||
    _PyUnicode_STATE(pnew).ready = 0;
 | 
					 | 
				
			||||||
    _PyUnicode_STATE(pnew).ascii = 0;
 | 
					 | 
				
			||||||
    pnew->data.any = NULL;
 | 
					 | 
				
			||||||
    _PyUnicode_LENGTH(pnew) = 0;
 | 
					 | 
				
			||||||
    pnew->_base.utf8 = NULL;
 | 
					 | 
				
			||||||
    pnew->_base.utf8_length = 0;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    if (PyUnicode_READY(pnew) == -1) {
 | 
					 | 
				
			||||||
        PyObject_FREE(_PyUnicode_WSTR(pnew));
 | 
					 | 
				
			||||||
        goto onError;
 | 
					        goto onError;
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    Py_DECREF(tmp);
 | 
					    self->data.any = data;
 | 
				
			||||||
    return (PyObject *)pnew;
 | 
					    if (share_utf8) {
 | 
				
			||||||
 | 
					        _PyUnicode_UTF8_LENGTH(self) = length;
 | 
				
			||||||
 | 
					        _PyUnicode_UTF8(self) = data;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					    if (share_wstr) {
 | 
				
			||||||
 | 
					        _PyUnicode_WSTR_LENGTH(self) = length;
 | 
				
			||||||
 | 
					        _PyUnicode_WSTR(self) = (wchar_t *)data;
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Py_MEMCPY(data, PyUnicode_DATA(unicode),
 | 
				
			||||||
 | 
					              PyUnicode_KIND_SIZE(kind, length + 1));
 | 
				
			||||||
 | 
					    Py_DECREF(unicode);
 | 
				
			||||||
 | 
					    return (PyObject *)self;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
onError:
 | 
					onError:
 | 
				
			||||||
    _Py_ForgetReference((PyObject *)pnew);
 | 
					    Py_DECREF(unicode);
 | 
				
			||||||
    PyObject_Del(pnew);
 | 
					    Py_DECREF(self);
 | 
				
			||||||
    Py_DECREF(tmp);
 | 
					    return NULL;
 | 
				
			||||||
    return err;
 | 
					 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
PyDoc_STRVAR(unicode_doc,
 | 
					PyDoc_STRVAR(unicode_doc,
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue