/*
 * _codecs_unicode.c: Codecs collection for Unicode encodings
 *
 * Written by Hye-Shik Chang
 * $CJKCodecs: _codecs_unicode.c,v 1.5 2004/06/27 21:41:15 perky Exp $
 */

#include "cjkcodecs.h"

/*
 * UTF-7 codec
 */

/* Character classes stored in utf7_sets[]; 0 means "must be base64 encoded". */
#define SET_DIRECT      1
#define SET_OPTIONAL    2
#define SET_WHITESPACE  3

/* Short aliases used only while spelling out the table below. */
#define _D SET_DIRECT
#define _O SET_OPTIONAL
#define _W SET_WHITESPACE
/* Class of every 7-bit character.  The table has exactly 128 entries, so
 * callers must make sure the index is below 128 before looking it up. */
static const char utf7_sets[128] = {
     0,  0,  0,  0,  0,  0,  0,  0,  0, _W, _W,  0,  0, _W,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    _W, _O, _O, _O, _O, _O, _O, _D, _D, _D, _O,  0, _D, _D, _D,  0,
    _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _O, _O, _O, _O, _D,
    _O, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D,
    _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _O,  0, _O, _O, _O,
    _O, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D,
    _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _D, _O, _O, _O,  0,  0,
};
#undef _W
#undef _O
#undef _D

/* B64: 6-bit value -> base64 character (only the low 6 bits of n are used). */
#define B64(n)  ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" \
                 "0123456789+/"[(n) & 0x3f])
/* B64CHAR: true when c is a character of the base64 alphabet. */
#define B64CHAR(c)  (((c) >= 'A' && (c) <= 'Z') || \
                     ((c) >= 'a' && (c) <= 'z') || \
                     ((c) >= '0' && (c) <= '9') || \
                     (c) == '+' || (c) == '/')
/* UB64: base64 character -> 6-bit value; c must satisfy B64CHAR(c). */
#define UB64(c)  ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                  (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* Both lookups index utf7_sets[] directly: c MUST be below 128. */
#define UTF7_DENCODABLE_COMPATIBLE(c)  (utf7_sets[c] != 0)
#define UTF7_DENCODABLE_STRICT(c)  (utf7_sets[c] == SET_DIRECT || \
                                    utf7_sets[c] == SET_WHITESPACE)

/*
 * Encoder state layout (bytes of state->c):
 *   c[0]  pending base64 bits, already shifted into sextet position
 *   c[2]  non-zero while inside a "+...-" shifted sequence
 *   c[3]  stage: 0 = no pending bits, 2 = 4 bits pending, 1 = 2 bits pending
 *
 * These macros expand to complete statements (trailing ';' included) and
 * are therefore used without a semicolon.
 */
#define ESTATE_INITIALIZE(state)        \
    ESTATE_SETSTAGE(state, 0)           \
    ESTATE_CLEARSHIFTED(state)

#define ESTATE_SETPENDING(state, v)     (state)->c[0] = (v);
#define ESTATE_GETPENDING(state)        (state)->c[0]

#define ESTATE_SETSHIFTED(state)        (state)->c[2] = 1;
#define ESTATE_ISSHIFTED(state)         ((state)->c[2])
#define ESTATE_CLEARSHIFTED(state)      (state)->c[2] = 0;

#define ESTATE_SETSTAGE(state, v)       (state)->c[3] = (v);
#define ESTATE_GETSTAGE(state)          ((state)->c[3])

/* Put the encoder into the unshifted state with no pending bits. */
ENCODER_INIT(utf_7)
{
    ESTATE_INITIALIZE(state)
    return 0;
}

/* Close an open shifted sequence: emit the pending partial sextet (if any)
 * followed by the terminating '-'. */
ENCODER_RESET(utf_7)
{
    if (ESTATE_ISSHIFTED(state)) {
        if (ESTATE_GETSTAGE(state) != 0) {
            unsigned char oc;

            oc = B64(ESTATE_GETPENDING(state));
            WRITE2(oc, '-')
            NEXT_OUT(2)
        }
        else {
            WRITE1('-')
            NEXT_OUT(1)
        }
        ESTATE_CLEARSHIFTED(state)
    }
    return 0;
}

/*
 * Encode Unicode input as UTF-7.
 *
 * Characters in the direct or whitespace sets are copied as-is (closing a
 * shifted sequence first when one is open); '+' outside a shifted sequence
 * becomes "+-"; everything else is emitted as base64 of its UTF-16 code
 * units.  Returns 0 when all input was consumed; non-zero results come from
 * the explicit returns below or from the cjkcodecs.h helper macros.
 */
ENCODER(utf_7)
{
    while (inleft > 0) {
        Py_UNICODE c1 = IN1, c2 = 0;
        size_t insize = 1;

#if Py_UNICODE_SIZE == 2
        if (c1 >> 10 == 0xd800 >> 10) { /* high surrogate */
            REQUIRE_INBUF(2)
            if (IN2 >> 10 != 0xdc00 >> 10) /* low surrogate */
                return 2; /* invalid surrogate pair */
            c2 = IN2;
            insize = 2;
        }
#else
        if (c1 > 0x10ffff) /* UTF-16 unencodable */
            return 1;
        else if (c1 > 0xffff) {
            /* split into a surrogate pair; c2 must be derived first
             * because it still needs the original value of c1 */
            c2 = 0xdc00 | ((c1 - 0x10000) & 0x3ff);
            c1 = 0xd800 | ((c1 - 0x10000) >> 10);
        }
#endif

        /* runs once for c1 and, for a surrogate pair, once more for c2 */
        for (;;) {
            unsigned char oc1, oc2, oc3;

            if (ESTATE_ISSHIFTED(state)) {
                if (c1 < 128 && UTF7_DENCODABLE_STRICT(c1)) {
                    /* leave the shifted sequence, then copy c1 */
                    if (ESTATE_GETSTAGE(state) != 0) {
                        oc1 = B64(ESTATE_GETPENDING(
                                state));
                        WRITE3(oc1, '-',
                               (unsigned char)c1)
                        NEXT_OUT(3)
                    }
                    else {
                        WRITE2('-',
                               (unsigned char)c1)
                        NEXT_OUT(2)
                    }
                    ESTATE_CLEARSHIFTED(state)
                }
                else {
                    switch (ESTATE_GETSTAGE(state)) {
                    case 0: /* 16 bits -> 2 sextets, 4 bits kept */
                        oc1 = c1 >> 10;
                        oc2 = (c1 >> 4) & 0x3f;
                        WRITE2(B64(oc1), B64(oc2))
                        ESTATE_SETPENDING(state,
                                          (c1 & 0x0f) << 2)
                        ESTATE_SETSTAGE(state, 2)
                        NEXT_OUT(2)
                        break;
                    case 1: /* 2 + 16 bits -> 3 sextets, none kept */
                        oc1 = ESTATE_GETPENDING(state)
                              | (c1 >> 12);
                        oc2 = (c1 >> 6) & 0x3f;
                        oc3 = c1 & 0x3f;
                        WRITE3(B64(oc1), B64(oc2),
                               B64(oc3))
                        ESTATE_SETSTAGE(state, 0)
                        NEXT_OUT(3)
                        break;
                    case 2: /* 4 + 16 bits -> 3 sextets, 2 bits kept */
                        oc1 = ESTATE_GETPENDING(state)
                              | (c1 >> 14);
                        oc2 = (c1 >> 8) & 0x3f;
                        oc3 = (c1 >> 2) & 0x3f;
                        WRITE3(B64(oc1), B64(oc2),
                               B64(oc3))
                        ESTATE_SETPENDING(state,
                                          (c1 & 0x03) << 4)
                        ESTATE_SETSTAGE(state, 1)
                        NEXT_OUT(3)
                        break;
                    default:
                        return MBERR_INTERNAL;
                    }
                }
            }
            else {
                if (c1 < 128 && UTF7_DENCODABLE_STRICT(c1)) {
                    WRITE1((unsigned char)c1)
                    NEXT_OUT(1)
                }
                else if (c1 == '+') {
                    WRITE2('+', '-')
                    NEXT_OUT(2)
                }
                else {
                    /* open a shifted sequence with this code unit */
                    oc1 = c1 >> 10;
                    oc2 = (c1 >> 4) & 0x3f;
                    WRITE3('+', B64(oc1), B64(oc2))
                    ESTATE_SETPENDING(state,
                                      (c1 & 0x0f) << 2)
                    ESTATE_SETSTAGE(state, 2)
                    ESTATE_SETSHIFTED(state)
                    NEXT_OUT(3)
                }
            }

            if (c2 != 0) {
                c1 = c2;
                c2 = 0;
            }
            else
                break;
        }

        NEXT_IN(insize)
    }

    return 0;
}

#define DSTATE_INITIALIZE(state)        \
    DSTATE_SETBSTAGE(state, 0)          \
    DSTATE_CLEARSHIFTED(state)          \
    DSTATE_SETULENGTH(state, 0)         \
    DSTATE_SETUPENDING1(state, 0)       \
    DSTATE_SETUPENDING2(state, 0)

/* XXX: Type-mixed usage of a state union may be not so portable.
 * If you see any problem with this on your platform, please let
 * me know. */

/*
 * Decoder state layout:
 *   c[0]   non-zero while inside a shifted sequence
 *   c[1]   base64 stage (0..3): position inside a group of four sextets
 *   c[2]   bits left over from the previous sextet
 *   c[3]   number of octets collected for the pending UTF-16 unit(s)
 *   u2[2]  first pending UTF-16 code unit
 *   u2[3]  second pending UTF-16 code unit
 */
#define DSTATE_SETSHIFTED(state)        (state)->c[0] = 1;
#define DSTATE_ISSHIFTED(state)         ((state)->c[0])
#define DSTATE_CLEARSHIFTED(state)      (state)->c[0] = 0;

#define DSTATE_SETBSTAGE(state, v)      (state)->c[1] = (v);
#define DSTATE_GETBSTAGE(state)         ((state)->c[1])

#define DSTATE_SETBPENDING(state, v)    (state)->c[2] = (v);
#define DSTATE_GETBPENDING(state)       ((state)->c[2])

#define DSTATE_SETULENGTH(state, v)     (state)->c[3] = (v);
#define DSTATE_GETULENGTH(state)        ((state)->c[3])

#define DSTATE_SETUPENDING1(state, v)   (state)->u2[2] = (v);
#define DSTATE_GETUPENDING1(state)      (state)->u2[2]

#define DSTATE_SETUPENDING2(state, v)   (state)->u2[3] = (v);
#define DSTATE_GETUPENDING2(state)      (state)->u2[3]

/* OR octet v into the pending code unit (even count: high byte, odd count:
 * low byte; counts 0-1 go to unit 1, 2-3 to unit 2) and bump the count. */
#define DSTATE_UAPPEND(state, v)                        \
    (state)->u2[(state)->c[3] > 1 ? 3 : 2] |=           \
        ((state)->c[3] & 1) ? (v) : ((ucs2_t)(v)) << 8; \
    (state)->c[3]++;

/* Put the decoder into the unshifted state with nothing pending. */
DECODER_INIT(utf_7)
{
    DSTATE_INITIALIZE(state)
    return 0;
}

/*
 * Emit the pending UTF-16 data when it forms a complete character.
 *
 * Writes one character through *outbuf and updates *outleft; it performs
 * no output-space check itself, so the caller must guarantee room for one
 * character.  Returns 0 when a character was written, MBERR_TOOFEW when
 * more octets are needed, 1 when the second unit of a pair is not a low
 * surrogate, and MBERR_INTERNAL on an impossible octet count.
 */
static int
utf_7_flush(MultibyteCodec_State *state,
            Py_UNICODE **outbuf, size_t *outleft)
{
    switch (DSTATE_GETULENGTH(state)) {
    case 2: {
        ucs2_t uc;

        uc = DSTATE_GETUPENDING1(state);
#if Py_UNICODE_SIZE == 4
        /* a high surrogate has to wait for its low surrogate */
        if (uc >> 10 == 0xd800 >> 10)
            return MBERR_TOOFEW;
#endif
        OUT1(uc)
        (*outbuf)++;
        (*outleft)--;
        DSTATE_SETULENGTH(state, 0)
        DSTATE_SETUPENDING1(state, 0)
        break;
    }
#if Py_UNICODE_SIZE == 4
    case 4:
        if (DSTATE_GETUPENDING2(state) >> 10 != 0xdc00 >> 10)
            return 1;
        OUT1(0x10000 + (((ucs4_t)DSTATE_GETUPENDING1(state)
                         - 0xd800) << 10)
             + (DSTATE_GETUPENDING2(state) - 0xdc00))
        (*outbuf)++;
        (*outleft)--;
        DSTATE_SETULENGTH(state, 0)
        DSTATE_SETUPENDING1(state, 0)
        DSTATE_SETUPENDING2(state, 0)
        break;
#endif
    case 0: /* FALLTHROUGH */
    case 1: /* FALLTHROUGH */
    case 3:
        return MBERR_TOOFEW;
    default:
        return MBERR_INTERNAL;
    }

    return 0;
}

/* Forget any shifted sequence in progress. */
DECODER_RESET(utf_7)
{
    DSTATE_INITIALIZE(state)
    return 0;
}

/*
 * Decode UTF-7 input.
 *
 * Outside a shifted sequence "+-" yields '+', a lone '+' opens a shifted
 * sequence and any other character with a non-zero utf7_sets[] class is
 * copied.  Inside a shifted sequence base64 characters are unpacked into
 * UTF-16 code units; '-' or a directly representable character ends the
 * sequence.  Any other byte makes the decoder return 1.
 */
DECODER(utf_7)
{
    while (inleft > 0) {
        unsigned char c = IN1;
        int r;

        if (!DSTATE_ISSHIFTED(state)) {
            if (c == '+') {
                REQUIRE_INBUF(2)
                if (inleft >= 2 && IN2 == '-') {
                    WRITE1('+')
                    NEXT(2, 1)
                }
                else {
                    DSTATE_SETSHIFTED(state)
                    NEXT_IN(1)
                }
            }
            else if (c < 128 && UTF7_DENCODABLE_COMPATIBLE(c)) {
                WRITE1(c)
                NEXT(1, 1)
            }
            else
                return 1;
        }
        else if (B64CHAR(c)) {
            unsigned char tb;

            /* utf_7_flush() below may write one character */
            REQUIRE_OUTBUF(1)

            c = UB64(c);
            assert(DSTATE_GETULENGTH(state) < 4);

            switch (DSTATE_GETBSTAGE(state)) {
            case 0:
                DSTATE_SETBPENDING(state, c << 2)
                DSTATE_SETBSTAGE(state, 1)
                break;
            case 1:
                tb = DSTATE_GETBPENDING(state) | (c >> 4);
                DSTATE_SETBPENDING(state, c << 4)
                DSTATE_SETBSTAGE(state, 2)
                DSTATE_UAPPEND(state, tb)
                break;
            case 2:
                tb = DSTATE_GETBPENDING(state) | (c >> 2);
                DSTATE_SETBPENDING(state, c << 6)
                DSTATE_SETBSTAGE(state, 3)
                DSTATE_UAPPEND(state, tb)
                break;
            case 3:
                tb = DSTATE_GETBPENDING(state) | c;
                DSTATE_SETBSTAGE(state, 0)
                DSTATE_UAPPEND(state, tb)
                break;
            }

            r = utf_7_flush(state, outbuf, &outleft);
            if (r != 0 && r != MBERR_TOOFEW)
                return r;
            NEXT_IN(1)
        }
        else if (c == '-' ||
                 (c < 128 && UTF7_DENCODABLE_COMPATIBLE(c))) {
            /* The c < 128 guard is required: utf7_sets[] has only 128
             * entries and c may be any byte value at this point. */

            /* Bits that do not complete a 16-bit unit are discarded when
             * the shifted sequence ends (RFC 2152).  Drop the leftover
             * sextet bits and any half-filled code unit so that nothing
             * stale leaks into the next shifted sequence. */
            if (DSTATE_GETBSTAGE(state) != 0) {
                DSTATE_SETBSTAGE(state, 0)
                DSTATE_SETBPENDING(state, 0)
            }
            switch (DSTATE_GETULENGTH(state)) {
            case 1:
                DSTATE_SETULENGTH(state, 0)
                DSTATE_SETUPENDING1(state, 0)
                break;
            case 3:
                DSTATE_SETULENGTH(state, 2)
                DSTATE_SETUPENDING2(state, 0)
                break;
            default:
                break;
            }

            /* NOTE(review): a pending high surrogate (wide builds) stays
             * in the state here because MBERR_TOOFEW is ignored. */
            r = utf_7_flush(state, outbuf, &outleft);
            if (r != 0 && r != MBERR_TOOFEW)
                return r;
            DSTATE_CLEARSHIFTED(state)

            if (c != '-') { /* '-' itself is absorbed */
                WRITE1(c)
                NEXT_OUT(1)
            }
            NEXT_IN(1)
        }
        else
            return 1;
    }

    return 0;
}

/*
 * UTF-8 codec
 */

/*
 * Encode Unicode input as UTF-8, using sequences of one to six bytes
 * depending on the magnitude of the code point.  On narrow builds a valid
 * surrogate pair is combined into one code point first; an unpaired
 * surrogate is encoded as it stands.
 */
ENCODER(utf_8)
{
    while (inleft > 0) {
        ucs4_t c = **inbuf;
        size_t outsize, insize = 1;

        if (c < 0x80)
            outsize = 1;
        else if (c < 0x800)
            outsize = 2;
        else {
#if Py_UNICODE_SIZE == 2
            if (c >> 10 == 0xd800 >> 10) { /* high surrogate */
                if (inleft < 2) {
                    if (!(flags & MBENC_FLUSH))
                        return MBERR_TOOFEW;
                }
                else if ((*inbuf)[1] >> 10 == 0xdc00 >> 10) {
                    /* low surrogate */
                    c = 0x10000 + ((c - 0xd800) << 10) +
                        ((ucs4_t)((*inbuf)[1]) - 0xdc00);
                    insize = 2;
                }
            }
#endif
            if (c < 0x10000)
                outsize = 3;
            else if (c < 0x200000)
                outsize = 4;
            else if (c < 0x4000000)
                outsize = 5;
            else
                outsize = 6;
        }

        REQUIRE_OUTBUF(outsize)

        /* Fill trailing bytes from last to first.  After each shift the
         * lead-byte marker for the next shorter form is OR-ed in, so the
         * value finally stored at index 0 already carries the right
         * prefix bits. */
        switch (outsize) {
        case 6:
            (*outbuf)[5] = 0x80 | (c & 0x3f);
            c = c >> 6;
            c |= 0x4000000;
            /* FALLTHROUGH */
        case 5:
            (*outbuf)[4] = 0x80 | (c & 0x3f);
            c = c >> 6;
            c |= 0x200000;
            /* FALLTHROUGH */
        case 4:
            (*outbuf)[3] = 0x80 | (c & 0x3f);
            c = c >> 6;
            c |= 0x10000;
            /* FALLTHROUGH */
        case 3:
            (*outbuf)[2] = 0x80 | (c & 0x3f);
            c = c >> 6;
            c |= 0x800;
            /* FALLTHROUGH */
        case 2:
            (*outbuf)[1] = 0x80 | (c & 0x3f);
            c = c >> 6;
            c |= 0xc0;
            /* FALLTHROUGH */
        case 1:
            (*outbuf)[0] = c;
        }

        NEXT(insize, outsize)
    }

    return 0;
}

/*
 * Decode UTF-8 input of one to six bytes per character.
 *
 * Each branch checks that every trailing byte is in 0x80..0xBF and that
 * the sequence is not an overlong form.  On a malformed sequence the
 * function returns the length of the form being decoded (1 for an invalid
 * lead byte).
 */
DECODER(utf_8)
{
    while (inleft > 0) {
        unsigned char c = **inbuf;

        REQUIRE_OUTBUF(1)

        if (c < 0x80) {
            (*outbuf)[0] = (unsigned char)c;
            NEXT(1, 1)
        }
        else if (c < 0xc2) {
            /* stray continuation byte or overlong 2-byte lead */
            return 1;
        }
        else if (c < 0xe0) {
            unsigned char c2;

            REQUIRE_INBUF(2)
            c2 = (*inbuf)[1];
            if (!((c2 ^ 0x80) < 0x40))
                return 2;
            **outbuf = ((Py_UNICODE)(c & 0x1f) << 6) |
                       (Py_UNICODE)(c2 ^ 0x80);
            NEXT(2, 1)
        }
        else if (c < 0xf0) {
            unsigned char c2, c3;

            REQUIRE_INBUF(3)
            c2 = (*inbuf)[1]; c3 = (*inbuf)[2];
            if (!((c2 ^ 0x80) < 0x40 && (c3 ^ 0x80) < 0x40
                  && (c >= 0xe1 || c2 >= 0xa0)))
                return 3;
            **outbuf = ((Py_UNICODE)(c & 0x0f) << 12)
                       | ((Py_UNICODE)(c2 ^ 0x80) << 6)
                       | (Py_UNICODE)(c3 ^ 0x80);
            NEXT(3, 1)
        }
        else if (c < 0xf8) {
            unsigned char c2, c3, c4;
            ucs4_t code;

            REQUIRE_INBUF(4)
            c2 = (*inbuf)[1]; c3 = (*inbuf)[2];
            c4 = (*inbuf)[3];
            if (!((c2 ^ 0x80) < 0x40 && (c3 ^ 0x80) < 0x40
                  && (c4 ^ 0x80) < 0x40
                  && (c >= 0xf1 || c2 >= 0x90)))
                return 4;
            code = ((ucs4_t)(c & 0x07) << 18)
                   | ((ucs4_t)(c2 ^ 0x80) << 12)
                   | ((ucs4_t)(c3 ^ 0x80) << 6)
                   | (ucs4_t)(c4 ^ 0x80);
            WRITEUCS4(code)
            NEXT_IN(4)
        }
        else if (c < 0xfc) {
            unsigned char c2, c3, c4, c5;
            ucs4_t code;

            REQUIRE_INBUF(5)
            c2 = (*inbuf)[1]; c3 = (*inbuf)[2];
            c4 = (*inbuf)[3]; c5 = (*inbuf)[4];
            if (!((c2 ^ 0x80) < 0x40 && (c3 ^ 0x80) < 0x40
                  && (c4 ^ 0x80) < 0x40 && (c5 ^ 0x80) < 0x40
                  && (c >= 0xf9 || c2 >= 0x88)))
                return 5;
            code = ((ucs4_t)(c & 0x03) << 24)
                   | ((ucs4_t)(c2 ^ 0x80) << 18)
                   | ((ucs4_t)(c3 ^ 0x80) << 12)
                   | ((ucs4_t)(c4 ^ 0x80) << 6)
                   | (ucs4_t)(c5 ^ 0x80);
            WRITEUCS4(code)
            NEXT_IN(5)
        }
        else if (c < 0xfe) {
            /* Only 0xFC and 0xFD introduce a six-byte sequence; 0xFE and
             * 0xFF are never valid lead bytes and fall through to the
             * error return below. */
            unsigned char c2, c3, c4, c5, c6;
            ucs4_t code;

            REQUIRE_INBUF(6)
            c2 = (*inbuf)[1]; c3 = (*inbuf)[2];
            c4 = (*inbuf)[3]; c5 = (*inbuf)[4];
            c6 = (*inbuf)[5];
            if (!((c2 ^ 0x80) < 0x40 && (c3 ^ 0x80) < 0x40
                  && (c4 ^ 0x80) < 0x40 && (c5 ^ 0x80) < 0x40
                  && (c6 ^ 0x80) < 0x40
                  && (c >= 0xfd || c2 >= 0x84)))
                return 6;
            code = ((ucs4_t)(c & 0x01) << 30)
                   | ((ucs4_t)(c2 ^ 0x80) << 24)
                   | ((ucs4_t)(c3 ^ 0x80) << 18)
                   | ((ucs4_t)(c4 ^ 0x80) << 12)
                   | ((ucs4_t)(c5 ^ 0x80) << 6)
                   | (ucs4_t)(c6 ^ 0x80);
            WRITEUCS4(code)
            NEXT_IN(6)
        }
        else
            return 1;
    }

    return 0;
}

BEGIN_MAPPINGS_LIST
END_MAPPINGS_LIST

BEGIN_CODECS_LIST
  CODEC_STATEFUL(utf_7)
  CODEC_STATELESS(utf_8)
END_CODECS_LIST

I_AM_A_MODULE_FOR(unicode)