Annotation of XML/encoding.c, revision 1.23
1.1 daniel 1: /*
2: * encoding.c : implements the encoding conversion functions needed for XML
3: *
4: * Related specs:
5: * rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies
6: * [ISO-10646] UTF-8 and UTF-16 in Annexes
7: * [ISO-8859-1] ISO Latin-1 characters codes.
8: * [UNICODE] The Unicode Consortium, "The Unicode Standard --
9: * Worldwide Character Encoding -- Version 1.0", Addison-
10: * Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is
11: * described in Unicode Technical Report #4.
12: * [US-ASCII] Coded Character Set--7-bit American Standard Code for
13: * Information Interchange, ANSI X3.4-1986.
14: *
1.9 daniel 15: * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
1.1 daniel 16: *
17: * See Copyright for the status of this software.
18: *
19: * Daniel.Veillard@w3.org
20: */
21:
1.21 daniel 22: #ifdef WIN32
23: #include "win32config.h"
24: #else
1.14 daniel 25: #include "config.h"
1.17 daniel 26: #endif
27:
28: #include <stdio.h>
29: #include <string.h>
30:
31: #ifdef HAVE_CTYPE_H
1.7 daniel 32: #include <ctype.h>
1.17 daniel 33: #endif
1.20 daniel 34: #ifdef HAVE_STDLIB_H
35: #include <stdlib.h>
36: #endif
1.1 daniel 37: #include "encoding.h"
1.12 daniel 38: #ifdef HAVE_UNICODE_H
39: #include <unicode.h>
40: #endif
1.16 daniel 41: #include "xmlmemory.h"
1.3 daniel 42:
43: /*
44: * From rfc2044: encoding of the Unicode values on UTF-8:
45: *
46: * UCS-4 range (hex.) UTF-8 octet sequence (binary)
47: * 0000 0000-0000 007F 0xxxxxxx
48: * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
49: * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
50: *
51: * I hope we won't use values > 0xFFFF anytime soon !
52: */
1.1 daniel 53:
/**
 * xmlCheckUTF8: Check utf-8 string for legality.
 * @utf: Pointer to putative utf-8 encoded string.
 *
 * Checks @utf for being structurally valid utf-8. @utf is assumed to be
 * null-terminated. Every character must start with a proper lead byte
 * (0xxxxxxx, 110xxxxx, 1110xxxx or 11110xxx) followed by the matching
 * number of 10xxxxxx continuation bytes; a continuation byte found in
 * lead position is rejected.
 *
 * This function is not super-strict: it accepts longer utf-8 sequences
 * than necessary (Java is capable of producing these if provoked), and
 * while it enforces the 4-byte maximum size it does not check for the
 * 0x10ffff maximum value.
 *
 * Return value: true (1) if @utf is valid, 0 otherwise.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    int ix;
    unsigned char c;

    for (ix = 0; (c = utf[ix]) != 0;) {
        if ((c & 0x80) == 0x00) {
            /* 1-byte code: 0xxxxxxx */
            ix++;
        } else if ((c & 0xe0) == 0xc0) {
            /* 2-byte code: 110xxxxx 10xxxxxx */
            if ((utf[ix + 1] & 0xc0) != 0x80)
                return(0);
            ix += 2;
        } else if ((c & 0xf0) == 0xe0) {
            /*
             * 3-byte code: 1110xxxx 10xxxxxx 10xxxxxx
             * The || short-circuit guarantees we never read past the
             * terminating 0 (a 0 byte is not a continuation byte).
             */
            if (((utf[ix + 1] & 0xc0) != 0x80) ||
                ((utf[ix + 2] & 0xc0) != 0x80))
                return(0);
            ix += 3;
        } else if ((c & 0xf8) == 0xf0) {
            /* 4-byte code: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
            if (((utf[ix + 1] & 0xc0) != 0x80) ||
                ((utf[ix + 2] & 0xc0) != 0x80) ||
                ((utf[ix + 3] & 0xc0) != 0x80))
                return(0);
            ix += 4;
        } else {
            /* continuation byte in lead position, or 5+ byte lead */
            return(0);
        }
    }
    return(1);
}
97:
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  a pointer to the length of @in (read, never modified here)
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out. Chars below 0x80 are copied as is, the others are
 * expanded to a two-byte UTF-8 sequence. A sequence is only written when
 * it fits completely, so on failure @out never ends with a dangling
 * lead byte.
 *
 * Returns the number of bytes written, or -1 by lack of space.
 */
int
isolat1ToUTF8(unsigned char* out, int outlen, unsigned char* in, int *inlen)
{
    unsigned char* outstart= out;
    unsigned char* outend= out+outlen;
    unsigned char* inend= in+*inlen;
    unsigned char c;

    while (in < inend) {
        c= *in++;
        if (c < 0x80) {
            if (out >= outend) return -1;
            *out++ = c;
        }
        else {
            /* both bytes of the sequence must fit before writing any */
            if (outend - out < 2) return -1;
            *out++ = 0xC0 | (c >> 6);
            *out++ = 0x80 | (0x3F & c);
        }
    }
    return out-outstart;
}
132:
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  a pointer to the length of @in; decremented by one when the
 *          input ends on the lead byte of an incomplete sequence
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out.
 * TODO: UTF8Toisolat1 need a fallback mechanism, chars which cannot be
 *       represented should probably be emitted as "&#x____;"
 *
 * Returns the number of bytes written, or -1 by lack of space, or -2
 *     if the transcoding failed (@in is not a valid UTF-8 string or
 *     the result of the transformation can't fit into ISO Latin 1)
 */
int
UTF8Toisolat1(unsigned char* out, int outlen, unsigned char* in, int *inlen)
{
    unsigned char* outstart= out;
    unsigned char* outend= out+outlen;
    unsigned char* inend= in+*inlen;
    unsigned char c;

    while (in < inend) {
        c= *in++;
        if (c < 0x80) {
            if (out >= outend) return -1;
            *out++= c;
        }
        else if (in == inend) {
            /* lead byte is the last input byte: leave it unconsumed */
            *inlen -= 1;
            break;
        }
        else if (((c & 0xFC) == 0xC0) && ((*in & 0xC0) == 0x80)) {
            /* a two byte UTF-8 sequence which can be encoded as ISO Latin 1 */
            if (out >= outend) return -1;
            *out++= ((c & 0x03) << 6) | (*in++ & 0x3F);
        }
        else return -2;
    }
    return out-outstart;
}
175:
/**
 * UTF16ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-16 chars (array of unsigned shorts)
 * @inlen:  a pointer to the length of @in, in shorts (read, never
 *          modified here)
 *
 * Take a block of UTF-16 ushorts in and try to convert it to an UTF-8
 * block of chars out. Surrogate pairs are combined into a single code
 * point before being encoded.
 *
 * Returns the number of bytes written, or -1 by lack of space or when a
 *     high surrogate is not followed by a low surrogate.
 */
int
UTF16ToUTF8(unsigned char* out, int outlen, unsigned short* in, int *inlen)
{
    unsigned char* outstart= out;
    unsigned char* outend= out+outlen;
    unsigned short* inend= in+*inlen;
    unsigned int c, d;
    int bits;

    while (in < inend) {
        c= *in++;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if ((in<inend) && (((d=*in++) & 0xFC00) == 0xDC00)) {
                c &= 0x03FF;
                c <<= 10;
                c |= d & 0x03FF;
                c += 0x10000;
            }
            else return -1;
        }

        /* assertion: c is a single UCS-4 value */

        /*
         * Emit the lead byte; bits is the shift of the first continuation
         * byte still to be written (-6 means there is none).
         */
        if (out >= outend) return -1;
        if      (c <    0x80) {  *out++=  c;                bits= -6; }
        else if (c <   0x800) {  *out++= (c >>  6) | 0xC0;  bits=  0; }
        else if (c < 0x10000) {  *out++= (c >> 12) | 0xE0;  bits=  6; }
        else                  {  *out++= (c >> 18) | 0xF0;  bits= 12; }

        /* continuation bytes: 10xxxxxx, down to and including shift 0 */
        for ( ; bits >= 0; bits-= 6) {
            if (out >= outend) return -1;
            *out++= ((c >> bits) & 0x3F) | 0x80;
        }
    }
    return out-outstart;
}
223:
/**
 * UTF8ToUTF16:
 * @out:  a pointer to an array of shorts to store the result
 * @outlen:  the length of @out (number of shorts)
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  a pointer to the length of @in (read, never modified here)
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16
 * block of chars out. Code points above 0xFFFF are written as a
 * surrogate pair.
 * TODO: UTF8ToUTF16 need a fallback mechanism ...
 *
 * Returns the number of shorts written, or -1 by lack of space (or when
 *     the input ends in the middle of a sequence), or -2 if the
 *     transcoding failed (invalid lead or continuation byte, or a code
 *     point beyond 0x10FFFF which UTF-16 cannot represent).
 */
int
UTF8ToUTF16(unsigned short* out, int outlen, unsigned char* in, int *inlen)
{
    unsigned short* outstart= out;
    unsigned short* outend= out+outlen;
    unsigned char* inend= in+*inlen;
    unsigned int c, d, trailing;

    while (in < inend) {
        d= *in++;
        if      (d < 0x80)  { c= d; trailing= 0; }
        else if (d < 0xC0)  return -2;    /* trailing byte in leading position */
        else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
        else return -2;    /* no chance for this in UTF-16 */

        for ( ; trailing; trailing--) {
            if (in >= inend) return -1;    /* sequence cut by end of input */
            d= *in++;
            if ((d & 0xC0) != 0x80) return -2;    /* not a continuation byte */
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (out >= outend) return -1;
            *out++ = c;
        }
        else if (c < 0x110000) {
            /* two shorts are needed for the surrogate pair */
            if (out+1 >= outend) return -1;
            c -= 0x10000;
            *out++ = 0xD800 | (c >> 10);
            *out++ = 0xDC00 | (c & 0x03FF);
        }
        else return -2;    /* beyond the range UTF-16 can encode */
    }
    return out-outstart;
}
276:
1.7 daniel 277: /**
278: * xmlDetectCharEncoding:
279: * @in: a pointer to the first bytes of the XML entity, must be at least
280: * 4 bytes long.
281: *
282: * Guess the encoding of the entity using the first bytes of the entity content
283: * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
284: *
285: * Returns one of the XML_CHAR_ENCODING_... values.
286: */
287: xmlCharEncoding
1.8 daniel 288: xmlDetectCharEncoding(const unsigned char* in)
1.7 daniel 289: {
290: if ((in[0] == 0x00) && (in[1] == 0x00) &&
291: (in[2] == 0x00) && (in[3] == 0x3C))
292: return(XML_CHAR_ENCODING_UCS4BE);
293: if ((in[0] == 0x3C) && (in[1] == 0x00) &&
294: (in[2] == 0x00) && (in[3] == 0x00))
295: return(XML_CHAR_ENCODING_UCS4LE);
296: if ((in[0] == 0x00) && (in[1] == 0x00) &&
297: (in[2] == 0x3C) && (in[3] == 0x00))
298: return(XML_CHAR_ENCODING_UCS4_2143);
299: if ((in[0] == 0x00) && (in[1] == 0x3C) &&
300: (in[2] == 0x00) && (in[3] == 0x00))
301: return(XML_CHAR_ENCODING_UCS4_3412);
302: if ((in[0] == 0xFE) && (in[1] == 0xFF))
303: return(XML_CHAR_ENCODING_UTF16BE);
304: if ((in[0] == 0xFF) && (in[1] == 0xFE))
305: return(XML_CHAR_ENCODING_UTF16LE);
306: if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
307: (in[2] == 0xA7) && (in[3] == 0x94))
308: return(XML_CHAR_ENCODING_EBCDIC);
309: if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
310: (in[2] == 0x78) && (in[3] == 0x6D))
311: return(XML_CHAR_ENCODING_UTF8);
312: return(XML_CHAR_ENCODING_NONE);
313: }
314:
315: /**
316: * xmlParseCharEncoding:
1.18 daniel 317: * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
1.7 daniel 318: *
319: * Conpare the string to the known encoding schemes already known. Note
320: * that the comparison is case insensitive accordingly to the section
321: * [XML] 4.3.3 Character Encoding in Entities.
322: *
323: * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
324: * if not recognized.
325: */
326: xmlCharEncoding
1.8 daniel 327: xmlParseCharEncoding(const char* name)
1.7 daniel 328: {
329: char upper[500];
330: int i;
331:
332: for (i = 0;i < 499;i++) {
333: upper[i] = toupper(name[i]);
334: if (upper[i] == 0) break;
335: }
336: upper[i] = 0;
337:
338: if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
339: if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
340: if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
341:
342: /*
343: * NOTE: if we were able to parse this, the endianness of UTF16 is
344: * already found and in use
345: */
346: if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
347: if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
348:
349: if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
350: if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
351: if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
352:
353: /*
354: * NOTE: if we were able to parse this, the endianness of UCS4 is
355: * already found and in use
356: */
357: if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
358: if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
359: if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
360:
361:
362: if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
363: if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
364: if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
365:
366: if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
367: if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
368: if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
369:
370: if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
371: if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
372: if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
373: if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
374: if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
375: if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
376: if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
377:
378: if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
379: if (!strcmp(upper, "Shift_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
380: if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
381: return(XML_CHAR_ENCODING_ERROR);
382: }
1.9 daniel 383:
384: /****************************************************************
385: * *
386: * Char encoding handlers *
387: * *
388: ****************************************************************/
389:
390: /* fixed capacity of the handler registry; it should be growable, but it's not a big deal ... */
391: #define MAX_ENCODING_HANDLERS 50
392: static xmlCharEncodingHandlerPtr *handlers = NULL; /* registry array, allocated by xmlInitCharEncodingHandlers() */
393: static int nbCharEncodingHandler = 0; /* number of slots of handlers[] currently in use */
394:
395: /*
396: * The default is UTF-8 for XML, that's also the default used for the
397: * parser internals, so the default encoding handler is NULL.
398: * It is what xmlFindCharEncodingHandler() returns for a NULL or empty name.
399: */
399:
400: static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
401:
402: /**
403: * xmlNewCharEncodingHandler:
1.18 daniel 404: * @name: the encoding name, in UTF-8 format (ASCII actually)
1.9 daniel 405: * @input: the xmlCharEncodingInputFunc to read that encoding
406: * @output: the xmlCharEncodingOutputFunc to write that encoding
407: *
408: * Create and registers an xmlCharEncodingHandler.
409: * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
410: */
411: xmlCharEncodingHandlerPtr
412: xmlNewCharEncodingHandler(const char *name, xmlCharEncodingInputFunc input,
413: xmlCharEncodingOutputFunc output) {
414: xmlCharEncodingHandlerPtr handler;
415: char upper[500];
416: int i;
417: char *up = 0;
418:
419: /*
420: * Keep only the uppercase version of the encoding.
421: */
422: if (name == NULL) {
423: fprintf(stderr, "xmlNewCharEncodingHandler : no name !\n");
424: return(NULL);
425: }
426: for (i = 0;i < 499;i++) {
427: upper[i] = toupper(name[i]);
428: if (upper[i] == 0) break;
429: }
430: upper[i] = 0;
1.16 daniel 431: up = xmlMemStrdup(upper);
1.9 daniel 432: if (up == NULL) {
433: fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
434: return(NULL);
435: }
436:
437: /*
438: * allocate and fill-up an handler block.
439: */
440: handler = (xmlCharEncodingHandlerPtr)
1.16 daniel 441: xmlMalloc(sizeof(xmlCharEncodingHandler));
1.9 daniel 442: if (handler == NULL) {
443: fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
444: return(NULL);
445: }
446: handler->input = input;
447: handler->output = output;
448: handler->name = up;
449:
450: /*
451: * registers and returns the handler.
452: */
453: xmlRegisterCharEncodingHandler(handler);
454: return(handler);
455: }
456:
457: /**
458: * xmlInitCharEncodingHandlers:
459: *
460: * Initialize the char encoding support, it registers the default
461: * encoding supported.
1.18 daniel 462: * NOTE: while public, this function usually doesn't need to be called
1.9 daniel 463: * in normal processing.
464: */
465: void
466: xmlInitCharEncodingHandlers(void) {
467: if (handlers != NULL) return;
468:
469: handlers = (xmlCharEncodingHandlerPtr *)
1.16 daniel 470: xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
1.9 daniel 471:
472: if (handlers == NULL) {
473: fprintf(stderr, "xmlInitCharEncodingHandlers : out of memory !\n");
474: return;
475: }
1.10 daniel 476: xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
1.12 daniel 477: #ifdef HAVE_UNICODE_H
478: #else
1.13 daniel 479: /* xmlNewCharEncodingHandler("UTF-16", UTF16ToUTF8, UTF8ToUTF16); */
1.10 daniel 480: xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
1.12 daniel 481: #endif
1.9 daniel 482: }
483:
484: /**
1.19 daniel 485: * xmlCleanupCharEncodingHandlers:
486: *
487: * Cleanup the memory allocated for the char encoding support, it
488: * unregisters all the encoding handlers.
489: */
490: void
491: xmlCleanupCharEncodingHandlers(void) {
492: if (handlers == NULL) return;
493:
494: for (;nbCharEncodingHandler > 0;) {
495: nbCharEncodingHandler--;
496: if (handlers[nbCharEncodingHandler] != NULL) {
497: xmlFree(handlers[nbCharEncodingHandler]->name);
498: xmlFree(handlers[nbCharEncodingHandler]);
499: }
500: }
501: xmlFree(handlers);
502: handlers = NULL;
503: nbCharEncodingHandler = 0;
504: xmlDefaultCharEncodingHandler = NULL;
505: }
506:
507: /**
1.9 daniel 508: * xmlRegisterCharEncodingHandler:
509: * @handler: the xmlCharEncodingHandlerPtr handler block
510: *
511: * Register the char encoding handler, surprizing, isn't it ?
512: */
513: void
514: xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
515: if (handlers == NULL) xmlInitCharEncodingHandlers();
516: if (handler == NULL) {
517: fprintf(stderr, "xmlRegisterCharEncodingHandler: NULL handler !\n");
518: return;
519: }
520:
521: if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
522: fprintf(stderr,
523: "xmlRegisterCharEncodingHandler: Too many handler registered\n");
524: fprintf(stderr, "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
525: return;
526: }
527: handlers[nbCharEncodingHandler++] = handler;
528: }
529:
530: /**
531: * xmlGetCharEncodingHandler:
532: * @enc: an xmlCharEncoding value.
533: *
534: * Search in the registrered set the handler able to read/write that encoding.
535: *
536: * Returns the handler or NULL if not found
537: */
538: xmlCharEncodingHandlerPtr
539: xmlGetCharEncodingHandler(xmlCharEncoding enc) {
540: if (handlers == NULL) xmlInitCharEncodingHandlers();
1.15 daniel 541: /* TODO xmlGetCharEncodingHandler !!!!!!! */
1.9 daniel 542: return(NULL);
543: }
544:
545: /**
546: * xmlGetCharEncodingHandler:
547: * @enc: a string describing the char encoding.
548: *
549: * Search in the registrered set the handler able to read/write that encoding.
550: *
551: * Returns the handler or NULL if not found
552: */
553: xmlCharEncodingHandlerPtr
554: xmlFindCharEncodingHandler(const char *name) {
555: char upper[500];
556: int i;
557:
558: if (handlers == NULL) xmlInitCharEncodingHandlers();
559: if (name == NULL) return(xmlDefaultCharEncodingHandler);
560: if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
561:
562: for (i = 0;i < 499;i++) {
563: upper[i] = toupper(name[i]);
564: if (upper[i] == 0) break;
565: }
566: upper[i] = 0;
567:
568: for (i = 0;i < nbCharEncodingHandler; i++)
569: if (!strcmp(name, handlers[i]->name))
570: return(handlers[i]);
571:
572: return(NULL);
573: }
574:
Webmaster