Backport of several functions from Python 3.0 to 2.6 including PyUnicode_FromString, PyUnicode_Format and PyLong_From/AsSsize_t. The functions are partly required for the backport of the bytearray type and _fileio module. They should also make it easier to port C to 3.0.

First chapter of the Python 3.0 io framework back port: _fileio The next step depends on a working bytearray type which itself depends on a backport of the nwe buffer API.
2025-10-31 05:31:20 +00:00 · 2008-01-25 12:18:43 +00:00 · 2008-01-25 12:18:43 +00:00 · 7f39c9fcbb
commit 7f39c9fcbb
parent 5f95a79b2b
8 changed files with 1599 additions and 173 deletions
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -397,6 +397,57 @@ PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
    return (PyObject *)unicode;
 }

+PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
+{
+    PyUnicodeObject *unicode;
+    /* If the Unicode data is known at construction time, we can apply
+       some optimizations which share commonly used objects.
+       Also, this means the input must be UTF-8, so fall back to the
+       UTF-8 decoder at the end. */
+    if (u != NULL) {
+
+	/* Optimization for empty strings */
+	if (size == 0 && unicode_empty != NULL) {
+	    Py_INCREF(unicode_empty);
+	    return (PyObject *)unicode_empty;
+	}
+
+	/* Single characters are shared when using this constructor.
+           Restrict to ASCII, since the input must be UTF-8. */
+	if (size == 1 && Py_CHARMASK(*u) < 128) {
+	    unicode = unicode_latin1[Py_CHARMASK(*u)];
+	    if (!unicode) {
+		unicode = _PyUnicode_New(1);
+		if (!unicode)
+		    return NULL;
+		unicode->str[0] = Py_CHARMASK(*u);
+		unicode_latin1[Py_CHARMASK(*u)] = unicode;
+	    }
+	    Py_INCREF(unicode);
+	    return (PyObject *)unicode;
+	}
+
+        return PyUnicode_DecodeUTF8(u, size, NULL);
+    }
+
+    unicode = _PyUnicode_New(size);
+    if (!unicode)
+        return NULL;
+
+    return (PyObject *)unicode;
+}
+
+PyObject *PyUnicode_FromString(const char *u)
+{
+    size_t size = strlen(u);
+    if (size > PY_SSIZE_T_MAX) {
+        PyErr_SetString(PyExc_OverflowError, "input too long");
+        return NULL;
+    }
+
+    return PyUnicode_FromStringAndSize(u, size);
+}
+
 #ifdef HAVE_WCHAR_H

 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
@ -429,6 +480,420 @@ PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
    return (PyObject *)unicode;
 }

+static void
+makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
+{
+	*fmt++ = '%';
+	if (width) {
+		if (zeropad)
+			*fmt++ = '0';
+		fmt += sprintf(fmt, "%d", width);
+	}
+	if (precision)
+		fmt += sprintf(fmt, ".%d", precision);
+	if (longflag)
+		*fmt++ = 'l';
+	else if (size_tflag) {
+		char *f = PY_FORMAT_SIZE_T;
+		while (*f)
+			*fmt++ = *f++;
+	}
+	*fmt++ = c;
+	*fmt = '\0';
+}
+
+#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
+
+PyObject *
+PyUnicode_FromFormatV(const char *format, va_list vargs)
+{
+	va_list count;
+	Py_ssize_t callcount = 0;
+	PyObject **callresults = NULL;
+	PyObject **callresult = NULL;
+	Py_ssize_t n = 0;
+	int width = 0;
+	int precision = 0;
+	int zeropad;
+	const char* f;
+	Py_UNICODE *s;
+	PyObject *string;
+	/* used by sprintf */
+	char buffer[21];
+	/* use abuffer instead of buffer, if we need more space
+	 * (which can happen if there's a format specifier with width). */
+	char *abuffer = NULL;
+	char *realbuffer;
+	Py_ssize_t abuffersize = 0;
+	char fmt[60]; /* should be enough for %0width.precisionld */
+	const char *copy;
+
+#ifdef VA_LIST_IS_ARRAY
+	Py_MEMCPY(count, vargs, sizeof(va_list));
+#else
+#ifdef  __va_copy
+	__va_copy(count, vargs);
+#else
+	count = vargs;
+#endif
+#endif
+	/* step 1: count the number of %S/%R format specifications
+	 * (we call PyObject_Str()/PyObject_Repr() for these objects
+	 * once during step 3 and put the result in an array) */
+	for (f = format; *f; f++) {
+		if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
+			++callcount;
+	}
+	/* step 2: allocate memory for the results of
+	 * PyObject_Str()/PyObject_Repr() calls */
+	if (callcount) {
+		callresults = PyMem_Malloc(sizeof(PyObject *)*callcount);
+		if (!callresults) {
+			PyErr_NoMemory();
+			return NULL;
+		}
+		callresult = callresults;
+	}
+	/* step 3: figure out how large a buffer we need */
+	for (f = format; *f; f++) {
+		if (*f == '%') {
+			const char* p = f;
+			width = 0;
+			while (isdigit(*f))
+				width = (width*10) + *f++ - '0';
+			while (*++f && *f != '%' && !isalpha(*f))
+				;
+
+			/* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
+			 * they don't affect the amount of space we reserve.
+			 */
+			if ((*f == 'l' || *f == 'z') &&
+					(f[1] == 'd' || f[1] == 'u'))
+                                ++f;
+
+			switch (*f) {
+			case 'c':
+				(void)va_arg(count, int);
+				/* fall through... */
+			case '%':
+				n++;
+				break;
+			case 'd': case 'u': case 'i': case 'x':
+				(void) va_arg(count, int);
+				/* 20 bytes is enough to hold a 64-bit
+				   integer.  Decimal takes the most space.
+				   This isn't enough for octal.
+				   If a width is specified we need more
+				   (which we allocate later). */
+				if (width < 20)
+					width = 20;
+				n += width;
+				if (abuffersize < width)
+					abuffersize = width;
+				break;
+			case 's':
+			{
+				/* UTF-8 */
+				unsigned char*s;
+				s = va_arg(count, unsigned char*);
+				while (*s) {
+					if (*s < 128) {
+						n++; s++;
+					} else if (*s < 0xc0) {
+						/* invalid UTF-8 */
+						n++; s++;
+					} else if (*s < 0xc0) {
+						n++;
+						s++; if(!*s)break;
+						s++;
+					} else if (*s < 0xe0) {
+						n++;
+						s++; if(!*s)break;
+						s++; if(!*s)break;
+						s++;
+					} else {
+						#ifdef Py_UNICODE_WIDE
+						n++;
+						#else
+						n+=2;
+						#endif
+						s++; if(!*s)break;
+						s++; if(!*s)break;
+						s++; if(!*s)break;
+						s++;
+					}
+				}
+				break;
+			}
+			case 'U':
+			{
+				PyObject *obj = va_arg(count, PyObject *);
+				assert(obj && PyUnicode_Check(obj));
+				n += PyUnicode_GET_SIZE(obj);
+				break;
+			}
+			case 'V':
+			{
+				PyObject *obj = va_arg(count, PyObject *);
+				const char *str = va_arg(count, const char *);
+				assert(obj || str);
+				assert(!obj || PyUnicode_Check(obj));
+				if (obj)
+					n += PyUnicode_GET_SIZE(obj);
+				else
+					n += strlen(str);
+				break;
+			}
+			case 'S':
+			{
+				PyObject *obj = va_arg(count, PyObject *);
+				PyObject *str;
+				assert(obj);
+				str = PyObject_Str(obj);
+				if (!str)
+					goto fail;
+				n += PyUnicode_GET_SIZE(str);
+				/* Remember the str and switch to the next slot */
+				*callresult++ = str;
+				break;
+			}
+			case 'R':
+			{
+				PyObject *obj = va_arg(count, PyObject *);
+				PyObject *repr;
+				assert(obj);
+				repr = PyObject_Repr(obj);
+				if (!repr)
+					goto fail;
+				n += PyUnicode_GET_SIZE(repr);
+				/* Remember the repr and switch to the next slot */
+				*callresult++ = repr;
+				break;
+			}
+			case 'p':
+				(void) va_arg(count, int);
+				/* maximum 64-bit pointer representation:
+				 * 0xffffffffffffffff
+				 * so 19 characters is enough.
+				 * XXX I count 18 -- what's the extra for?
+				 */
+				n += 19;
+				break;
+			default:
+				/* if we stumble upon an unknown
+				   formatting code, copy the rest of
+				   the format string to the output
+				   string. (we cannot just skip the
+				   code, since there's no way to know
+				   what's in the argument list) */
+				n += strlen(p);
+				goto expand;
+			}
+		} else
+			n++;
+	}
+ expand:
+	if (abuffersize > 20) {
+		abuffer = PyMem_Malloc(abuffersize);
+		if (!abuffer) {
+			PyErr_NoMemory();
+			goto fail;
+		}
+		realbuffer = abuffer;
+	}
+	else
+		realbuffer = buffer;
+	/* step 4: fill the buffer */
+	/* Since we've analyzed how much space we need for the worst case,
+	   we don't have to resize the string.
+	   There can be no errors beyond this point. */
+	string = PyUnicode_FromUnicode(NULL, n);
+	if (!string)
+		goto fail;
+
+	s = PyUnicode_AS_UNICODE(string);
+	callresult = callresults;
+
+	for (f = format; *f; f++) {
+		if (*f == '%') {
+			const char* p = f++;
+			int longflag = 0;
+			int size_tflag = 0;
+			zeropad = (*f == '0');
+			/* parse the width.precision part */
+			width = 0;
+			while (isdigit(*f))
+				width = (width*10) + *f++ - '0';
+			precision = 0;
+			if (*f == '.') {
+				f++;
+				while (isdigit(*f))
+					precision = (precision*10) + *f++ - '0';
+			}
+			/* handle the long flag, but only for %ld and %lu.
+			   others can be added when necessary. */
+			if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
+				longflag = 1;
+				++f;
+			}
+			/* handle the size_t flag. */
+			if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
+				size_tflag = 1;
+				++f;
+			}
+
+			switch (*f) {
+			case 'c':
+				*s++ = va_arg(vargs, int);
+				break;
+			case 'd':
+				makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
+				if (longflag)
+					sprintf(realbuffer, fmt, va_arg(vargs, long));
+				else if (size_tflag)
+					sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
+				else
+					sprintf(realbuffer, fmt, va_arg(vargs, int));
+				appendstring(realbuffer);
+				break;
+			case 'u':
+				makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
+				if (longflag)
+					sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
+				else if (size_tflag)
+					sprintf(realbuffer, fmt, va_arg(vargs, size_t));
+				else
+					sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
+				appendstring(realbuffer);
+				break;
+			case 'i':
+				makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
+				sprintf(realbuffer, fmt, va_arg(vargs, int));
+				appendstring(realbuffer);
+				break;
+			case 'x':
+				makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
+				sprintf(realbuffer, fmt, va_arg(vargs, int));
+				appendstring(realbuffer);
+				break;
+			case 's':
+			{
+				/* Parameter must be UTF-8 encoded.
+				   In case of encoding errors, use
+				   the replacement character. */
+				PyObject *u;
+				p = va_arg(vargs, char*);
+				u = PyUnicode_DecodeUTF8(p, strlen(p), 
+							 "replace");
+				if (!u)
+					goto fail;
+				Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(u),
+						PyUnicode_GET_SIZE(u));
+				s += PyUnicode_GET_SIZE(u);
+				Py_DECREF(u);
+				break;
+			}
+			case 'U':
+			{
+				PyObject *obj = va_arg(vargs, PyObject *);
+				Py_ssize_t size = PyUnicode_GET_SIZE(obj);
+				Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
+				s += size;
+				break;
+			}
+			case 'V':
+			{
+				PyObject *obj = va_arg(vargs, PyObject *);
+				const char *str = va_arg(vargs, const char *);
+				if (obj) {
+					Py_ssize_t size = PyUnicode_GET_SIZE(obj);
+					Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
+					s += size;
+				} else {
+					appendstring(str);
+				}
+				break;
+			}
+			case 'S':
+			case 'R':
+			{
+				Py_UNICODE *ucopy;
+				Py_ssize_t usize;
+				Py_ssize_t upos;
+				/* unused, since we already have the result */
+				(void) va_arg(vargs, PyObject *);
+				ucopy = PyUnicode_AS_UNICODE(*callresult);
+				usize = PyUnicode_GET_SIZE(*callresult);
+				for (upos = 0; upos<usize;)
+					*s++ = ucopy[upos++];
+				/* We're done with the unicode()/repr() => forget it */
+				Py_DECREF(*callresult);
+				/* switch to next unicode()/repr() result */
+				++callresult;
+				break;
+			}
+			case 'p':
+				sprintf(buffer, "%p", va_arg(vargs, void*));
+				/* %p is ill-defined:  ensure leading 0x. */
+				if (buffer[1] == 'X')
+					buffer[1] = 'x';
+				else if (buffer[1] != 'x') {
+					memmove(buffer+2, buffer, strlen(buffer)+1);
+					buffer[0] = '0';
+					buffer[1] = 'x';
+				}
+				appendstring(buffer);
+				break;
+			case '%':
+				*s++ = '%';
+				break;
+			default:
+				appendstring(p);
+				goto end;
+			}
+		} else
+			*s++ = *f;
+	}
+
+ end:
+	if (callresults)
+		PyMem_Free(callresults);
+	if (abuffer)
+		PyMem_Free(abuffer);
+	_PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
+	return string;
+ fail:
+	if (callresults) {
+		PyObject **callresult2 = callresults;
+		while (callresult2 < callresult) {
+			Py_DECREF(*callresult2);
+			++callresult2;
+		}
+		PyMem_Free(callresults);
+	}
+	if (abuffer)
+		PyMem_Free(abuffer);
+	return NULL;
+}
+
+#undef appendstring
+
+PyObject *
+PyUnicode_FromFormat(const char *format, ...)
+{
+	PyObject* ret;
+	va_list vargs;
+
+#ifdef HAVE_STDARG_PROTOTYPES
+	va_start(vargs, format);
+#else
+	va_start(vargs);
+#endif
+	ret = PyUnicode_FromFormatV(format, vargs);
+	va_end(vargs);
+	return ret;
+}
+
 Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
 				wchar_t *w,
 				Py_ssize_t size)