mirror of
				https://github.com/python/cpython.git
				synced 2025-10-31 13:41:24 +00:00 
			
		
		
		
	needforspeed: partition implementation, part two.
feel free to improve the documentation and the docstrings.
This commit is contained in:
		
							parent
							
								
									19bebf2e2f
								
							
						
					
					
						commit
						06a69dd8ff
					
				
					 5 changed files with 143 additions and 78 deletions
				
			
		|  | @ -727,6 +727,14 @@ a prefix; rather, all combinations of its values are stripped: | |||
| \versionchanged[Support for the \var{chars} argument]{2.2.2} | ||||
| \end{methoddesc} | ||||
| 
 | ||||
| \begin{methoddesc}[string]{partition}{sep} | ||||
| Split the string at \var{sep}, and return a 3-tuple containing | ||||
| the part before the separator, the separator itself, and the part | ||||
| after the separator.  If the separator is not found, return a 3-tuple | ||||
| containing the string itself, followed by two empty strings. | ||||
| \versionadded{2.5} | ||||
| \end{methoddesc} | ||||
| 
 | ||||
| \begin{methoddesc}[string]{replace}{old, new\optional{, count}} | ||||
| Return a copy of the string with all occurrences of substring | ||||
| \var{old} replaced by \var{new}.  If the optional argument | ||||
|  |  | |||
|  | @ -184,6 +184,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE; | |||
| # define PyUnicode_GetMax PyUnicodeUCS2_GetMax | ||||
| # define PyUnicode_GetSize PyUnicodeUCS2_GetSize | ||||
| # define PyUnicode_Join PyUnicodeUCS2_Join | ||||
| # define PyUnicode_Partition PyUnicodeUCS2_Partition | ||||
| # define PyUnicode_Replace PyUnicodeUCS2_Replace | ||||
| # define PyUnicode_Resize PyUnicodeUCS2_Resize | ||||
| # define PyUnicode_SetDefaultEncoding PyUnicodeUCS2_SetDefaultEncoding | ||||
|  | @ -259,6 +260,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE; | |||
| # define PyUnicode_GetMax PyUnicodeUCS4_GetMax | ||||
| # define PyUnicode_GetSize PyUnicodeUCS4_GetSize | ||||
| # define PyUnicode_Join PyUnicodeUCS4_Join | ||||
| # define PyUnicode_Partition PyUnicodeUCS4_Partition | ||||
| # define PyUnicode_Replace PyUnicodeUCS4_Replace | ||||
| # define PyUnicode_Resize PyUnicodeUCS4_Resize | ||||
| # define PyUnicode_SetDefaultEncoding PyUnicodeUCS4_SetDefaultEncoding | ||||
|  | @ -1018,6 +1020,13 @@ PyAPI_FUNC(PyObject*) PyUnicode_Splitlines( | |||
|     int keepends		/* If true, line end markers are included */ | ||||
|     );		 | ||||
| 
 | ||||
| /* Partition a string using a given separator. */ | ||||
| 
 | ||||
| PyAPI_FUNC(PyObject*) PyUnicode_Partition( | ||||
|     PyObject *s,		/* String to partition */ | ||||
|     PyObject *sep		/* String separator */ | ||||
|     );		 | ||||
| 
 | ||||
| /* Split a string giving a list of Unicode strings.
 | ||||
| 
 | ||||
|    If sep is NULL, splitting will be done at all whitespace | ||||
|  |  | |||
|  | @ -900,6 +900,21 @@ def test_inplace_rewrites(self): | |||
|         self.checkequal('A', 'a', 'title') | ||||
|         self.checkequal(True, 'a', 'islower') | ||||
| 
 | ||||
|     def test_partition(self): | ||||
| 
 | ||||
|         self.checkequal(('this', ' is ', 'the partition method'), | ||||
|             'this is the partition method', 'partition', ' is ') | ||||
| 
 | ||||
|         # from raymond's original specification | ||||
|         S = 'http://www.python.org' | ||||
|         self.checkequal(('http', '://', 'www.python.org'), S, 'partition', '://') | ||||
|         self.checkequal(('http://www.python.org', '', ''), S, 'partition', '?') | ||||
|         self.checkequal(('', 'http://', 'www.python.org'), S, 'partition', 'http://') | ||||
|         self.checkequal(('http://www.python.', 'org', ''), S, 'partition', 'org') | ||||
| 
 | ||||
|         self.checkraises(ValueError, S, 'partition', '') | ||||
|         self.checkraises(TypeError, S, 'partition', None) | ||||
| 
 | ||||
| 
 | ||||
| class MixinStrStringUserStringTest: | ||||
|     # Additional tests for 8bit strings, i.e. str, UserString and | ||||
|  |  | |||
|  | @ -1610,20 +1610,20 @@ string_partition(PyStringObject *self, PyObject *args) | |||
| { | ||||
| 	Py_ssize_t len = PyString_GET_SIZE(self), sep_len, pos; | ||||
| 	const char *str = PyString_AS_STRING(self), *sep; | ||||
| 	PyObject *sepobj; | ||||
| 	PyObject *sep_obj; | ||||
| 	PyObject * out; | ||||
| 
 | ||||
| 	if (!PyArg_ParseTuple(args, "O:partition", &sepobj)) | ||||
| 	if (!PyArg_ParseTuple(args, "O:partition", &sep_obj)) | ||||
| 		return NULL; | ||||
| 	if (PyString_Check(sepobj)) { | ||||
| 		sep = PyString_AS_STRING(sepobj); | ||||
| 		sep_len = PyString_GET_SIZE(sepobj); | ||||
| 	if (PyString_Check(sep_obj)) { | ||||
| 		sep = PyString_AS_STRING(sep_obj); | ||||
| 		sep_len = PyString_GET_SIZE(sep_obj); | ||||
| 	} | ||||
| #ifdef Py_USING_UNICODE_NOTYET | ||||
| 	else if (PyUnicode_Check(sepobj)) | ||||
| 		return PyUnicode_Partition((PyObject *)self, sepobj); | ||||
| #ifdef Py_USING_UNICODE | ||||
| 	else if (PyUnicode_Check(sep_obj)) | ||||
| 		return PyUnicode_Partition((PyObject *)self, sep_obj); | ||||
| #endif | ||||
| 	else if (PyObject_AsCharBuffer(sepobj, &sep, &sep_len)) | ||||
| 	else if (PyObject_AsCharBuffer(sep_obj, &sep, &sep_len)) | ||||
| 		return NULL; | ||||
| 
 | ||||
| 	if (sep_len == 0) { | ||||
|  | @ -1644,13 +1644,13 @@ string_partition(PyStringObject *self, PyObject *args) | |||
| 		Py_INCREF(nullstring); | ||||
| 		PyTuple_SET_ITEM(out, 2, (PyObject*) nullstring); | ||||
| 	} else { | ||||
| 		Py_INCREF(sepobj); | ||||
| 		PyObject* obj; | ||||
| 		PyTuple_SET_ITEM(out, 0, PyString_FromStringAndSize(str, pos)); | ||||
| 		PyTuple_SET_ITEM(out, 1, sepobj); | ||||
| 		PyTuple_SET_ITEM(out, 2, | ||||
| 			PyString_FromStringAndSize(str + sep_len + pos, | ||||
| 						   len - sep_len - pos) | ||||
| 			); | ||||
| 		Py_INCREF(sep_obj); | ||||
| 		PyTuple_SET_ITEM(out, 1, sep_obj); | ||||
| 		pos += sep_len; | ||||
| 		obj = PyString_FromStringAndSize(str + pos, len - pos); | ||||
| 		PyTuple_SET_ITEM(out, 2, obj); | ||||
| 		if (PyErr_Occurred()) { | ||||
| 			Py_DECREF(out); | ||||
| 			return NULL; | ||||
|  |  | |||
|  | @ -4,6 +4,9 @@ Unicode implementation based on original code by Fredrik Lundh, | |||
| modified by Marc-Andre Lemburg <mal@lemburg.com> according to the | ||||
| Unicode Integration Proposal (see file Misc/unicode.txt). | ||||
| 
 | ||||
| Major speed upgrades to the method implementations at the Reykjavik | ||||
| NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. | ||||
| 
 | ||||
| Copyright (c) Corporation for National Research Initiatives. | ||||
| 
 | ||||
| -------------------------------------------------------------------- | ||||
|  | @ -193,6 +196,7 @@ int unicode_resize(register PyUnicodeObject *unicode, | |||
|     /* Resizing shared object (unicode_empty or single character
 | ||||
|        objects) in-place is not allowed. Use PyUnicode_Resize() | ||||
|        instead ! */ | ||||
| 
 | ||||
|     if (unicode == unicode_empty ||  | ||||
| 	(unicode->length == 1 &&  | ||||
| 	 unicode->str[0] < 256U && | ||||
|  | @ -202,8 +206,11 @@ int unicode_resize(register PyUnicodeObject *unicode, | |||
|         return -1; | ||||
|     } | ||||
| 
 | ||||
|     /* We allocate one more byte to make sure the string is
 | ||||
|        Ux0000 terminated -- XXX is this needed ? */ | ||||
|     /* We allocate one more byte to make sure the string is Ux0000 terminated.
 | ||||
|        The overallocation is also used by fastsearch, which assumes that it's | ||||
|        safe to look at str[length] (without making any assumptions about what | ||||
|        it contains). */ | ||||
| 
 | ||||
|     oldstr = unicode->str; | ||||
|     PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1); | ||||
|     if (!unicode->str) { | ||||
|  | @ -3859,8 +3866,6 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s, | |||
| 
 | ||||
| /* --- Helpers ------------------------------------------------------------ */ | ||||
| 
 | ||||
| #define USE_FAST /* experimental fast search implementation */ | ||||
| 
 | ||||
| /* fast search/count implementation, based on a mix between boyer-
 | ||||
|    moore and horspool, with a few more bells and whistles on the top. | ||||
|    for some more background, see: http://effbot.org/stringlib */
 | ||||
|  | @ -3936,10 +3941,8 @@ fastsearch(Py_UNICODE* s, Py_ssize_t n, Py_UNICODE* p, Py_ssize_t m, int mode) | |||
|             /* miss: check if next character is part of pattern */ | ||||
|             if (!(mask & (1 << (s[i+m] & 0x1F)))) | ||||
|                 i = i + m; | ||||
|             else { | ||||
|             else | ||||
|                 i = i + skip; | ||||
|                 continue; | ||||
|             } | ||||
|         } else { | ||||
|             /* skip: check if next character is part of pattern */ | ||||
|             if (!(mask & (1 << (s[i+m] & 0x1F)))) | ||||
|  | @ -3973,23 +3976,13 @@ LOCAL(Py_ssize_t) count(PyUnicodeObject *self, | |||
|     if (substring->length == 0) | ||||
| 	return (end - start + 1); | ||||
| 
 | ||||
| #ifdef USE_FAST | ||||
|     count = fastsearch( | ||||
|         PyUnicode_AS_UNICODE(self) + start, end - start, | ||||
|         substring->str, substring->length, FAST_COUNT | ||||
|         ); | ||||
| 
 | ||||
|     if (count < 0) | ||||
|         count = 0; /* no match */ | ||||
| #else     | ||||
|     end -= substring->length; | ||||
| 
 | ||||
|     while (start <= end) | ||||
|         if (Py_UNICODE_MATCH(self, start, substring)) { | ||||
|             count++; | ||||
|             start += substring->length; | ||||
|         } else | ||||
|             start++; | ||||
| #endif | ||||
| 
 | ||||
|     return count; | ||||
| } | ||||
|  | @ -4040,30 +4033,19 @@ static Py_ssize_t findstring(PyUnicodeObject *self, | |||
|     if (substring->length == 0) | ||||
| 	return (direction > 0) ? start : end; | ||||
| 
 | ||||
| #ifdef USE_FAST | ||||
|     if (direction > 0) { | ||||
|         Py_ssize_t pos = fastsearch( | ||||
|             PyUnicode_AS_UNICODE(self) + start, end - start, | ||||
|             substring->str, substring->length, FAST_SEARCH | ||||
|             ); | ||||
|         if (pos < 0) | ||||
|             return pos; | ||||
|         if (pos >= 0) | ||||
|             return pos + start; | ||||
|     } | ||||
| #endif | ||||
| 
 | ||||
|     } else { | ||||
|         end -= substring->length; | ||||
| 
 | ||||
|     if (direction < 0) { | ||||
|         for (; end >= start; end--) | ||||
|             if (Py_UNICODE_MATCH(self, end, substring)) | ||||
|                 return end; | ||||
|     } else { | ||||
|         for (; start <= end; start++) | ||||
|             if (Py_UNICODE_MATCH(self, start, substring)) | ||||
|                 return start; | ||||
|     } | ||||
| 
 | ||||
|     return -1; | ||||
| } | ||||
| 
 | ||||
|  | @ -5167,11 +5149,8 @@ int PyUnicode_Contains(PyObject *container, | |||
| 		       PyObject *element) | ||||
| { | ||||
|     PyUnicodeObject *u, *v; | ||||
|     int result; | ||||
|     Py_ssize_t size; | ||||
| #ifdef USE_FAST | ||||
|     Py_ssize_t pos; | ||||
| #endif | ||||
| 
 | ||||
|     /* Coerce the two arguments */ | ||||
|     v = (PyUnicodeObject *) PyUnicode_FromObject(element); | ||||
|  | @ -5189,44 +5168,19 @@ int PyUnicode_Contains(PyObject *container, | |||
| 
 | ||||
|     size = PyUnicode_GET_SIZE(v); | ||||
|     if (!size) { | ||||
|         result = 1; | ||||
|         pos = 0; | ||||
|         goto done; | ||||
|     } | ||||
| 
 | ||||
| #ifdef USE_FAST | ||||
|     pos = fastsearch( | ||||
|         PyUnicode_AS_UNICODE(u), PyUnicode_GET_SIZE(u), | ||||
|         PyUnicode_AS_UNICODE(v), size, FAST_SEARCH | ||||
|         ); | ||||
|     result = (pos != -1); | ||||
| #else     | ||||
|     result = 0; | ||||
| 
 | ||||
|     if (size == 1) { | ||||
|         Py_UNICODE chr = PyUnicode_AS_UNICODE(v)[0]; | ||||
|         Py_UNICODE* ptr = PyUnicode_AS_UNICODE(u); | ||||
| 	Py_UNICODE* end = ptr + PyUnicode_GET_SIZE(u); | ||||
| 	for (; ptr < end; ptr++) { | ||||
| 	    if (*ptr == chr) { | ||||
| 		result = 1; | ||||
| 		break; | ||||
| 	    } | ||||
| 	} | ||||
|     } else { | ||||
|         Py_ssize_t start = 0; | ||||
|         Py_ssize_t end = PyUnicode_GET_SIZE(u) - size; | ||||
|         for (; start <= end; start++) | ||||
|             if (Py_UNICODE_MATCH(u, start, v)) { | ||||
|                 result = 1; | ||||
|                 break; | ||||
|             } | ||||
|     } | ||||
| #endif | ||||
| 
 | ||||
| done: | ||||
|     Py_DECREF(u); | ||||
|     Py_DECREF(v); | ||||
|     return result; | ||||
|     return (pos != -1); | ||||
| } | ||||
| 
 | ||||
| /* Concat to string or Unicode object giving a new Unicode object. */ | ||||
|  | @ -6335,6 +6289,84 @@ unicode_split(PyUnicodeObject *self, PyObject *args) | |||
| 	return PyUnicode_Split((PyObject *)self, substring, maxcount); | ||||
| } | ||||
| 
 | ||||
| PyObject * | ||||
| PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) | ||||
| { | ||||
|     PyObject* str_obj; | ||||
|     PyObject* sep_obj; | ||||
|     Py_UNICODE *str, *sep; | ||||
|     Py_ssize_t len, sep_len, pos; | ||||
|     PyObject* out; | ||||
|      | ||||
|     str_obj = PyUnicode_FromObject(str_in); | ||||
|     if (!str_obj) | ||||
| 	return NULL; | ||||
|     sep_obj = PyUnicode_FromObject(sep_in); | ||||
|     if (!sep_obj) | ||||
|         goto error; | ||||
| 
 | ||||
|     str = PyUnicode_AS_UNICODE(str_obj); | ||||
|     len = PyUnicode_GET_SIZE(str_obj); | ||||
| 
 | ||||
|     sep = PyUnicode_AS_UNICODE(sep_obj); | ||||
|     sep_len = PyUnicode_GET_SIZE(sep_obj); | ||||
| 
 | ||||
|     if (sep_len == 0) { | ||||
|         PyErr_SetString(PyExc_ValueError, "empty separator"); | ||||
|         goto error; | ||||
|     } | ||||
| 
 | ||||
|     out = PyTuple_New(3); | ||||
|     if (!out) | ||||
|         goto error; | ||||
| 
 | ||||
|     pos = fastsearch(str, len, sep, sep_len, FAST_SEARCH); | ||||
|     if (pos < 0) { | ||||
|         Py_INCREF(str_obj); | ||||
|         PyTuple_SET_ITEM(out, 0, (PyObject*) str_obj); | ||||
|         Py_INCREF(unicode_empty); | ||||
|         PyTuple_SET_ITEM(out, 1, (PyObject*) unicode_empty); | ||||
|         Py_INCREF(unicode_empty); | ||||
|         PyTuple_SET_ITEM(out, 2, (PyObject*) unicode_empty); | ||||
|     } else { | ||||
|         PyObject* obj; | ||||
|         PyTuple_SET_ITEM(out, 0, PyUnicode_FromUnicode(str, pos)); | ||||
|         Py_INCREF(sep_obj); | ||||
|         PyTuple_SET_ITEM(out, 1, sep_obj); | ||||
|         obj = PyUnicode_FromUnicode(str + sep_len + pos, len - sep_len - pos); | ||||
|         PyTuple_SET_ITEM(out, 2, obj); | ||||
|         if (PyErr_Occurred()) { | ||||
|             Py_DECREF(out); | ||||
|             goto error; | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     return out; | ||||
| 
 | ||||
| error: | ||||
|     Py_XDECREF(sep_obj); | ||||
|     Py_DECREF(str_obj); | ||||
|     return NULL; | ||||
| } | ||||
| 
 | ||||
| PyDoc_STRVAR(partition__doc__, | ||||
| "S.partition(sep) -> (head, sep, tail)\n\
 | ||||
| \n\ | ||||
| Searches for the separator sep in S, and returns the part before it,\n\ | ||||
| the separator itself, and the part after it.  If the separator is not\n\ | ||||
| found, returns S and two empty strings."); | ||||
| 
 | ||||
| static PyObject* | ||||
| unicode_partition(PyUnicodeObject *self, PyObject *args) | ||||
| { | ||||
|     PyObject *separator; | ||||
| 
 | ||||
|     if (!PyArg_ParseTuple(args, "O:partition", &separator)) | ||||
|         return NULL; | ||||
| 
 | ||||
|     return PyUnicode_Partition((PyObject *)self, separator); | ||||
| } | ||||
| 
 | ||||
| PyObject *PyUnicode_RSplit(PyObject *s, | ||||
| 			   PyObject *sep, | ||||
| 			   Py_ssize_t maxsplit) | ||||
|  | @ -6588,6 +6620,7 @@ static PyMethodDef unicode_methods[] = { | |||
|     {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, | ||||
|     {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, | ||||
|     {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, | ||||
|     {"partition", (PyCFunction) unicode_partition, METH_VARARGS, partition__doc__}, | ||||
|     {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, | ||||
|     {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, | ||||
|     {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 Fredrik Lundh
						Fredrik Lundh