Annotation of XML/encoding.c, revision 1.10
1.1 daniel 1: /*
2: * encoding.c : implements the encoding conversion functions needed for XML
3: *
4: * Related specs:
5: * rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies
6: * [ISO-10646] UTF-8 and UTF-16 in Annexes
7: * [ISO-8859-1] ISO Latin-1 characters codes.
8: * [UNICODE] The Unicode Consortium, "The Unicode Standard --
9: * Worldwide Character Encoding -- Version 1.0", Addison-
10: * Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is
11: * described in Unicode Technical Report #4.
12: * [US-ASCII] Coded Character Set--7-bit American Standard Code for
13: * Information Interchange, ANSI X3.4-1986.
14: *
1.9 daniel 15: * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
1.1 daniel 16: *
17: * See Copyright for the status of this software.
18: *
19: * Daniel.Veillard@w3.org
20: */
21:
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "encoding.h"
1.3 daniel 26:
/*
 * From rfc2044: encoding of the Unicode values on UTF-8:
 *
 * UCS-4 range (hex.)    UTF-8 octet sequence (binary)
 * 0000 0000-0000 007F   0xxxxxxx
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
 * 0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * I hope we won't use values > 0xFFFF anytime soon ! They can nevertheless
 * show up as UTF-16 surrogate pairs, which map to the 4 octet form.
 */
1.1 daniel 37:
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out, in bytes
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  the length of @in, in bytes
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out. Bytes below 0x80 are copied unchanged, every other
 * byte is expanded to a two byte UTF-8 sequence.
 *
 * Returns the number of bytes written, or -1 by lack of space. When -1 is
 * returned only complete UTF-8 sequences have been stored in @out.
 */
int
isolat1ToUTF8(unsigned char* out, int outlen, unsigned char* in, int inlen)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + outlen;
    unsigned char* inend = in + inlen;

    while (in < inend) {
        unsigned char c = *in++;

        if (c < 0x80) {
            if (out >= outend) return -1;
            *out++ = c;
        }
        else {
            /*
             * Make sure both bytes fit before writing the first one, so
             * that a failure never leaves a truncated sequence behind.
             */
            if (outend - out < 2) return -1;
            *out++ = 0xC0 | (c >> 6);
            *out++ = 0x80 | (0x3F & c);
        }
    }
    return (int) (out - outstart);
}
72:
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out, in bytes
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in, in bytes
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out. Only ASCII bytes and the two byte sequences whose
 * lead byte is 0xC2 or 0xC3 (i.e. values 0x80 to 0xFF) can be converted.
 * TODO: need a fallback mechanism ...
 *
 * Returns the number of bytes written, or -1 by lack of space, or -2
 * if the transcoding failed (value outside of ISO Latin 1, truncated
 * sequence, or lead byte not followed by a 10xxxxxx continuation byte).
 */
int
UTF8Toisolat1(unsigned char* out, int outlen, unsigned char* in, int inlen)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + outlen;
    unsigned char* inend = in + inlen;

    while (in < inend) {
        unsigned char c = *in++;

        if (c < 0x80) {
            if (out >= outend) return -1;
            *out++ = c;
        }
        else if (((c & 0xFE) == 0xC2) && (in < inend) &&
                 ((*in & 0xC0) == 0x80)) {
            /* lead byte 0xC2/0xC3 followed by a real continuation byte */
            if (out >= outend) return -1;
            *out++ = ((c & 0x03) << 6) | (*in++ & 0x3F);
        }
        else return -2;
    }
    return (int) (out - outstart);
}
108:
/**
 * UTF16ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out, in bytes
 * @in:  a pointer to an array of UTF-16 chars (array of unsigned shorts)
 * @inlen:  the length of @in (number of shorts)
 *
 * Take a block of UTF-16 ushorts in and try to convert it to an UTF-8
 * block of chars out. Surrogate pairs are combined into a single value
 * and written as a four byte sequence.
 *
 * Returns the number of bytes written, or -1 by lack of space. -1 is also
 * returned when a high surrogate is not followed by a low surrogate.
 */
int
UTF16ToUTF8(unsigned char* out, int outlen, unsigned short* in, int inlen)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + outlen;
    unsigned short* inend = in + inlen;
    unsigned int c, d;
    int bits;

    while (in < inend) {
        c = *in++;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if ((in < inend) && (((d = *in++) & 0xFC00) == 0xDC00)) {
                c &= 0x03FF;
                c <<= 10;
                c |= d & 0x03FF;
                c += 0x10000;
            }
            else return -1;
        }

        /* assertion: c is a single UCS-4 value */

        /*
         * Write the lead byte; bits is the shift of the first continuation
         * byte, or negative when there is none (plain ASCII).
         */
        if (out >= outend) return -1;
        if      (c <    0x80) { *out++ =  c;                bits = -6; }
        else if (c <   0x800) { *out++ = (c >>  6) | 0xC0;  bits =  0; }
        else if (c < 0x10000) { *out++ = (c >> 12) | 0xE0;  bits =  6; }
        else                  { *out++ = (c >> 18) | 0xF0;  bits = 12; }

        /* then every continuation byte, 6 payload bits each, as 10xxxxxx */
        for ( ; bits >= 0; bits -= 6) {
            if (out >= outend) return -1;
            *out++ = ((c >> bits) & 0x3F) | 0x80;
        }
    }
    return (int) (out - outstart);
}
156:
/**
 * UTF8ToUTF16:
 * @out:  a pointer to an array of shorts to store the result
 * @outlen:  the length of @out (number of shorts)
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in, in bytes
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16
 * block of chars out. Values above 0xFFFF are written as a surrogate pair.
 * TODO: need a fallback mechanism ...
 *
 * Returns the number of shorts written, or -1 by lack of space, or -2
 * if the transcoding failed (invalid lead byte, missing or malformed
 * continuation byte, or value not representable in UTF-16).
 */
int
UTF8ToUTF16(unsigned short* out, int outlen, unsigned char* in, int inlen)
{
    unsigned short* outstart = out;
    unsigned short* outend = out + outlen;
    unsigned char* inend = in + inlen;
    unsigned int c, d, trailing;

    while (in < inend) {
        d = *in++;
        if      (d < 0x80)  { c = d;         trailing = 0; }
        else if (d < 0xC0)  return -2;    /* trailing byte in leading position */
        else if (d < 0xE0)  { c = d & 0x1F;  trailing = 1; }
        else if (d < 0xF0)  { c = d & 0x0F;  trailing = 2; }
        else if (d < 0xF8)  { c = d & 0x07;  trailing = 3; }
        else return -2;    /* no chance for this in UTF-16 */

        for ( ; trailing; trailing--) {
            /* bad input, not an output space problem: report -2 */
            if ((in >= inend) || (((d = *in++) & 0xC0) != 0x80)) return -2;
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (out >= outend) return -1;
            *out++ = c;
        }
        else if (c < 0x110000) {
            /* both halves of the surrogate pair must fit */
            if (out + 1 >= outend) return -1;
            c -= 0x10000;
            *out++ = 0xD800 | (c >> 10);
            *out++ = 0xDC00 | (c & 0x03FF);
        }
        else return -2;    /* beyond the range UTF-16 can encode */
    }
    return (int) (out - outstart);
}
208:
209:
1.7 daniel 210: /**
211: * xmlDetectCharEncoding:
212: * @in: a pointer to the first bytes of the XML entity, must be at least
213: * 4 bytes long.
214: *
215: * Guess the encoding of the entity using the first bytes of the entity content
216: * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
217: *
218: * Returns one of the XML_CHAR_ENCODING_... values.
219: */
220: xmlCharEncoding
1.8 daniel 221: xmlDetectCharEncoding(const unsigned char* in)
1.7 daniel 222: {
223: if ((in[0] == 0x00) && (in[1] == 0x00) &&
224: (in[2] == 0x00) && (in[3] == 0x3C))
225: return(XML_CHAR_ENCODING_UCS4BE);
226: if ((in[0] == 0x3C) && (in[1] == 0x00) &&
227: (in[2] == 0x00) && (in[3] == 0x00))
228: return(XML_CHAR_ENCODING_UCS4LE);
229: if ((in[0] == 0x00) && (in[1] == 0x00) &&
230: (in[2] == 0x3C) && (in[3] == 0x00))
231: return(XML_CHAR_ENCODING_UCS4_2143);
232: if ((in[0] == 0x00) && (in[1] == 0x3C) &&
233: (in[2] == 0x00) && (in[3] == 0x00))
234: return(XML_CHAR_ENCODING_UCS4_3412);
235: if ((in[0] == 0xFE) && (in[1] == 0xFF))
236: return(XML_CHAR_ENCODING_UTF16BE);
237: if ((in[0] == 0xFF) && (in[1] == 0xFE))
238: return(XML_CHAR_ENCODING_UTF16LE);
239: if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
240: (in[2] == 0xA7) && (in[3] == 0x94))
241: return(XML_CHAR_ENCODING_EBCDIC);
242: if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
243: (in[2] == 0x78) && (in[3] == 0x6D))
244: return(XML_CHAR_ENCODING_UTF8);
245: return(XML_CHAR_ENCODING_NONE);
246: }
247:
248: /**
249: * xmlParseCharEncoding:
250: * @name: the encoding name as parsed, in UTF-8 format (ASCCI actually)
251: *
252: * Conpare the string to the known encoding schemes already known. Note
253: * that the comparison is case insensitive accordingly to the section
254: * [XML] 4.3.3 Character Encoding in Entities.
255: *
256: * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
257: * if not recognized.
258: */
259: xmlCharEncoding
1.8 daniel 260: xmlParseCharEncoding(const char* name)
1.7 daniel 261: {
262: char upper[500];
263: int i;
264:
265: for (i = 0;i < 499;i++) {
266: upper[i] = toupper(name[i]);
267: if (upper[i] == 0) break;
268: }
269: upper[i] = 0;
270:
271: if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
272: if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
273: if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
274:
275: /*
276: * NOTE: if we were able to parse this, the endianness of UTF16 is
277: * already found and in use
278: */
279: if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
280: if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
281:
282: if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
283: if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
284: if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
285:
286: /*
287: * NOTE: if we were able to parse this, the endianness of UCS4 is
288: * already found and in use
289: */
290: if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
291: if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
292: if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
293:
294:
295: if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
296: if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
297: if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
298:
299: if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
300: if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
301: if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
302:
303: if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
304: if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
305: if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
306: if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
307: if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
308: if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
309: if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
310:
311: if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
312: if (!strcmp(upper, "Shift_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
313: if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
314: return(XML_CHAR_ENCODING_ERROR);
315: }
1.9 daniel 316:
317: /****************************************************************
318: * *
319: * Char encoding handlers *
320: * *
321: ****************************************************************/
322:
323: /* the size should be growable, but it's not a big deal ... */
324: #define MAX_ENCODING_HANDLERS 50
325: static xmlCharEncodingHandlerPtr *handlers = NULL;
326: static int nbCharEncodingHandler = 0;
327:
328: /*
329: * The default is UTF-8 for XML, that's also the default used for the
330: * parser internals, so the default encoding handler is NULL
331: */
332:
333: static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
334:
335: /**
336: * xmlNewCharEncodingHandler:
337: * @name: the encoding name, in UTF-8 format (ASCCI actually)
338: * @input: the xmlCharEncodingInputFunc to read that encoding
339: * @output: the xmlCharEncodingOutputFunc to write that encoding
340: *
341: * Create and registers an xmlCharEncodingHandler.
342: * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
343: */
344: xmlCharEncodingHandlerPtr
345: xmlNewCharEncodingHandler(const char *name, xmlCharEncodingInputFunc input,
346: xmlCharEncodingOutputFunc output) {
347: xmlCharEncodingHandlerPtr handler;
348: char upper[500];
349: int i;
350: char *up = 0;
351:
352: /*
353: * Keep only the uppercase version of the encoding.
354: */
355: if (name == NULL) {
356: fprintf(stderr, "xmlNewCharEncodingHandler : no name !\n");
357: return(NULL);
358: }
359: for (i = 0;i < 499;i++) {
360: upper[i] = toupper(name[i]);
361: if (upper[i] == 0) break;
362: }
363: upper[i] = 0;
364: up = strdup(upper);
365: if (up == NULL) {
366: fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
367: return(NULL);
368: }
369:
370: /*
371: * allocate and fill-up an handler block.
372: */
373: handler = (xmlCharEncodingHandlerPtr)
374: malloc(sizeof(xmlCharEncodingHandler));
375: if (handler == NULL) {
376: fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
377: return(NULL);
378: }
379: handler->input = input;
380: handler->output = output;
381: handler->name = up;
382:
383: /*
384: * registers and returns the handler.
385: */
386: xmlRegisterCharEncodingHandler(handler);
387: return(handler);
388: }
389:
390: /**
391: * xmlInitCharEncodingHandlers:
392: *
393: * Initialize the char encoding support, it registers the default
394: * encoding supported.
395: * NOTE: while public theis function usually don't need to be called
396: * in normal processing.
397: */
398: void
399: xmlInitCharEncodingHandlers(void) {
400: if (handlers != NULL) return;
401:
402: handlers = (xmlCharEncodingHandlerPtr *)
403: malloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
404:
405: if (handlers == NULL) {
406: fprintf(stderr, "xmlInitCharEncodingHandlers : out of memory !\n");
407: return;
408: }
1.10 ! daniel 409: xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
! 410: xmlNewCharEncodingHandler("UTF-16", UTF16ToUTF8, UTF8ToUTF16);
! 411: xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
1.9 daniel 412: }
413:
414: /**
415: * xmlRegisterCharEncodingHandler:
416: * @handler: the xmlCharEncodingHandlerPtr handler block
417: *
418: * Register the char encoding handler, surprizing, isn't it ?
419: */
420: void
421: xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
422: if (handlers == NULL) xmlInitCharEncodingHandlers();
423: if (handler == NULL) {
424: fprintf(stderr, "xmlRegisterCharEncodingHandler: NULL handler !\n");
425: return;
426: }
427:
428: if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
429: fprintf(stderr,
430: "xmlRegisterCharEncodingHandler: Too many handler registered\n");
431: fprintf(stderr, "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
432: return;
433: }
434: handlers[nbCharEncodingHandler++] = handler;
435: }
436:
437: /**
438: * xmlGetCharEncodingHandler:
439: * @enc: an xmlCharEncoding value.
440: *
441: * Search in the registrered set the handler able to read/write that encoding.
442: *
443: * Returns the handler or NULL if not found
444: */
445: xmlCharEncodingHandlerPtr
446: xmlGetCharEncodingHandler(xmlCharEncoding enc) {
447: if (handlers == NULL) xmlInitCharEncodingHandlers();
448: return(NULL);
449: }
450:
451: /**
452: * xmlGetCharEncodingHandler:
453: * @enc: a string describing the char encoding.
454: *
455: * Search in the registrered set the handler able to read/write that encoding.
456: *
457: * Returns the handler or NULL if not found
458: */
459: xmlCharEncodingHandlerPtr
460: xmlFindCharEncodingHandler(const char *name) {
461: char upper[500];
462: int i;
463:
464: if (handlers == NULL) xmlInitCharEncodingHandlers();
465: if (name == NULL) return(xmlDefaultCharEncodingHandler);
466: if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
467:
468: for (i = 0;i < 499;i++) {
469: upper[i] = toupper(name[i]);
470: if (upper[i] == 0) break;
471: }
472: upper[i] = 0;
473:
474: for (i = 0;i < nbCharEncodingHandler; i++)
475: if (!strcmp(name, handlers[i]->name))
476: return(handlers[i]);
477:
478: return(NULL);
479: }
480:
Webmaster