Annotation of XML/encoding.c, revision 1.54
1.1 daniel 1: /*
2: * encoding.c : implements the encoding conversion functions needed for XML
3: *
4: * Related specs:
5: * rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies
1.39 daniel 6: * rfc2781 UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau
1.1 daniel 7: * [ISO-10646] UTF-8 and UTF-16 in Annexes
8: * [ISO-8859-1] ISO Latin-1 characters codes.
9: * [UNICODE] The Unicode Consortium, "The Unicode Standard --
10: * Worldwide Character Encoding -- Version 1.0", Addison-
11: * Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is
12: * described in Unicode Technical Report #4.
13: * [US-ASCII] Coded Character Set--7-bit American Standard Code for
14: * Information Interchange, ANSI X3.4-1986.
15: *
1.9 daniel 16: * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
1.1 daniel 17: *
18: * See Copyright for the status of this software.
19: *
20: * Daniel.Veillard@w3.org
21: */
22:
1.21 daniel 23: #ifdef WIN32
24: #include "win32config.h"
25: #else
1.14 daniel 26: #include "config.h"
1.17 daniel 27: #endif
28:
29: #include <stdio.h>
30: #include <string.h>
31:
32: #ifdef HAVE_CTYPE_H
1.7 daniel 33: #include <ctype.h>
1.17 daniel 34: #endif
1.20 daniel 35: #ifdef HAVE_STDLIB_H
36: #include <stdlib.h>
37: #endif
1.30 daniel 38: #include <libxml/xmlversion.h>
39: #ifdef LIBXML_ICONV_ENABLED
40: #ifdef HAVE_ERRNO_H
41: #include <errno.h>
42: #endif
43: #endif
1.29 daniel 44: #include <libxml/encoding.h>
45: #include <libxml/xmlmemory.h>
1.48 veillard 46: #ifdef LIBXML_HTML_ENABLED
47: #include <libxml/HTMLparser.h>
48: #endif
1.52 veillard 49: #include <libxml/xmlerror.h>
1.3 daniel 50:
/*
 * Globals for the two UTF-16 byte orders. Both start out as NULL; nothing
 * in this part of the file assigns them.
 */
1.25 daniel 51: xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
52: xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
53:
/*
 * One entry of the encoding alias table.
 *   name:  the encoding name handed back by xmlGetEncodingAlias()
 *   alias: the alternate spelling, stored upper-cased by
 *          xmlAddEncodingAlias()
 * Both strings are released with xmlFree() when an entry is removed.
 */
1.51 veillard 54: typedef struct _xmlCharEncodingAlias xmlCharEncodingAlias;
55: typedef xmlCharEncodingAlias *xmlCharEncodingAliasPtr;
56: struct _xmlCharEncodingAlias {
57: const char *name;
58: const char *alias;
59: };
60:
/*
 * The alias table itself: a heap array (NULL until the first alias is
 * added), the number of entries in use and the allocated capacity.
 */
61: static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
62: static int xmlCharEncodingAliasesNb = 0;
63: static int xmlCharEncodingAliasesMax = 0;
64:
/*
 * DEBUG_ENCODING enables the xmlGenericError() traces guarded by
 * #ifdef DEBUG_ENCODING; the "#if 0" below keeps it switched off.
 */
1.30 daniel 65: #ifdef LIBXML_ICONV_ENABLED
1.46 veillard 66: #if 0
1.30 daniel 67: #define DEBUG_ENCODING /* Define this to get encoding traces */
68: #endif
1.33 daniel 69: #endif
1.30 daniel 70:
/*
 * Host byte-order flag, initialised to 1 (little-endian).
 * NOTE(review): presumably corrected by a run-time check elsewhere in the
 * file -- confirm.
 */
1.34 daniel 71: static int xmlLittleEndian = 1;
72:
1.3 daniel 73: /*
74: * From rfc2044: encoding of the Unicode values on UTF-8:
75: *
76: * UCS-4 range (hex.) UTF-8 octet sequence (binary)
77: * 0000 0000-0000 007F 0xxxxxxx
78: * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
79: * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
80: *
81: * I hope we won't use values > 0xFFFF anytime soon !
82: */
1.1 daniel 83:
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  in: number of bytes available at @utf,
 *        out: number of bytes used by the decoded character
 *
 * Decode the single character that starts at @utf. Sequences of one to
 * four bytes are understood. Every continuation byte must have the form
 * 10xxxxxx, and a byte of that form is rejected when found in lead
 * position. Over-long encodings are not rejected.
 *
 * Returns the character value, or -1 in case of error. On error *@len is
 * set to 0, unless @len itself is NULL.
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    /* Without @len there is nowhere to report a length: fail at once. */
    if (len == NULL)
        return(-1);
    if ((utf == NULL) || (*len < 1))
        goto error;

    c = utf[0];
    if ((c & 0x80) == 0) {
        /* 1-byte code */
        *len = 1;
        return((int) c);
    }

    /* A continuation byte (10xxxxxx) can never start a character. */
    if ((c & 0xc0) == 0x80)
        goto error;
    if ((*len < 2) || ((utf[1] & 0xc0) != 0x80))
        goto error;

    if ((c & 0xe0) != 0xe0) {
        /* 2-byte code */
        *len = 2;
        c = (utf[0] & 0x1fu) << 6;
        c |= utf[1] & 0x3fu;
        return((int) c);
    }

    if ((*len < 3) || ((utf[2] & 0xc0) != 0x80))
        goto error;

    if ((c & 0xf0) != 0xf0) {
        /* 3-byte code */
        *len = 3;
        c = (utf[0] & 0xfu) << 12;
        c |= (utf[1] & 0x3fu) << 6;
        c |= utf[2] & 0x3fu;
        return((int) c);
    }

    /* 4-byte code: the lead byte must be exactly 11110xxx */
    if (*len < 4)
        goto error;
    if (((c & 0xf8) != 0xf0) || ((utf[3] & 0xc0) != 0x80))
        goto error;
    *len = 4;
    c = (utf[0] & 0x7u) << 18;
    c |= (utf[1] & 0x3fu) << 12;
    c |= (utf[2] & 0x3fu) << 6;
    c |= utf[3] & 0x3fu;
    return((int) c);

error:
    *len = 0;
    return(-1);
}
150:
/**
 * xmlCheckUTF8: Check utf-8 string for legality.
 * @utf: Pointer to putative utf-8 encoded string.
 *
 * Checks @utf for being valid utf-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer utf-8 sequences than necessary. It checks for the
 * 4-byte maximum sequence size, but does not check for the 0x10ffff
 * maximum value. A continuation byte (10xxxxxx) found where a lead byte
 * is expected is rejected.
 *
 * Because the terminating 0 is not a continuation byte, every look-ahead
 * below stops at the end of the string.
 *
 * Return value: true if @utf is valid, false otherwise (including when
 * @utf is NULL).
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    int ix = 0;
    unsigned char c;

    if (utf == NULL)
        return(0);

    while ((c = utf[ix]) != 0) {
        if ((c & 0x80) == 0) {
            /* 1-byte code */
            ix++;
            continue;
        }
        /* a continuation byte cannot open a sequence */
        if ((c & 0xc0) == 0x80)
            return(0);
        if ((utf[ix + 1] & 0xc0) != 0x80)
            return(0);
        if ((c & 0xe0) != 0xe0) {
            /* 2-byte code */
            ix += 2;
            continue;
        }
        if ((utf[ix + 2] & 0xc0) != 0x80)
            return(0);
        if ((c & 0xf0) != 0xf0) {
            /* 3-byte code */
            ix += 3;
            continue;
        }
        /* 4-byte code: lead must be exactly 11110xxx */
        if (((c & 0xf8) != 0xf0) || ((utf[ix + 3] & 0xc0) != 0x80))
            return(0);
        ix += 4;
    }
    return(1);
}
194:
/**
 * asciiToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ASCII chars
 * @inlen:  the length of @in
 *
 * Take a block of ASCII chars in and try to convert it to an UTF-8
 * block of chars out. Every ASCII byte maps to exactly one output byte,
 * so conversion proceeds until either buffer is exhausted.
 *
 * Returns 0 if success (including when the output buffer filled up before
 * all the input was used), or -1 if a byte >= 0x80 is met or if @out,
 * @outlen or @inlen is NULL.
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 * A NULL @in is treated as "nothing to convert": both lengths are set to 0
 * and 0 is returned.
 */
int
asciiToUTF8(unsigned char* out, int *outlen,
            const unsigned char* in, int *inlen) {
    unsigned char *outstart = out;
    unsigned char *outend;
    const unsigned char *instart = in;
    const unsigned char *inend;

    if ((out == NULL) || (outlen == NULL) || (inlen == NULL))
        return(-1);
    if (in == NULL) {
        *outlen = 0;
        *inlen = 0;
        return(0);
    }

    outend = out + ((*outlen > 0) ? *outlen : 0);
    inend = in + ((*inlen > 0) ? *inlen : 0);

    while ((in < inend) && (out < outend)) {
        if (*in >= 0x80) {
            /* not ASCII: report what was converted so far */
            *outlen = (int) (out - outstart);
            *inlen = (int) (in - instart);
            return(-1);
        }
        *out++ = *in++;
    }
    *outlen = (int) (out - outstart);
    *inlen = (int) (in - instart);
    return(0);
}
245:
/**
 * UTF8Toascii:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
 * block of chars out.
 *
 * Returns 0 if success, or -2 if the transcoding fails: a malformed
 * sequence, or a character that does not fit in ASCII. Conversion also
 * stops, still returning 0, when @out is full or when the input ends in
 * the middle of a multi-byte sequence.
 * The value of @inlen after return is the number of octets consumed
 * (whole characters only).
 * The value of @outlen after return is the number of octets produced.
 * A NULL @in is an initialization call: both lengths are set to 0.
 */
int
UTF8Toascii(unsigned char* out, int *outlen,
            const unsigned char* in, int *inlen) {
    const unsigned char *instart = in;
    const unsigned char *processed = in;
    const unsigned char *inend;
    unsigned char *outstart = out;
    unsigned char *outend;
    unsigned int c, d;
    int trailing;
    int ret = 0;

    if (in == NULL) {
        /*
         * initialization nothing to do
         */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + (*inlen);
    outend = out + (*outlen);

    while (in < inend) {
        d = *in++;
        if (d < 0x80) { c = d; trailing = 0; }
        else if (d < 0xC0) {
            /* trailing byte in leading position */
            ret = -2;
            break;
        }
        else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else {
            /* no chance for this in Ascii */
            ret = -2;
            break;
        }

        /* sequence cut by the end of the buffer: wait for more input */
        if (inend - in < trailing)
            break;

        for ( ; trailing > 0; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80) {
                /* malformed: never emit a half-decoded value */
                ret = -2;
                goto done;
            }
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c >= 0x80) {
            /* no chance for this in Ascii */
            ret = -2;
            break;
        }
        if (out >= outend)
            break;
        *out++ = (unsigned char) c;
        processed = in;
    }

done:
    *outlen = (int) (out - outstart);
    *inlen = (int) (processed - instart);
    return(ret);
}
328:
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  the length of @in
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out. Bytes below 0x80 are copied; every other byte
 * becomes a two byte UTF-8 sequence. A character is only converted when
 * the whole sequence fits in @out, so the output never ends with a
 * truncated sequence.
 *
 * Returns 0 if success (including when @out filled up before all the
 * input was used), or -1 if @out, @outlen or @inlen is NULL.
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 * A NULL @in is treated as "nothing to convert": both lengths are set to 0.
 */
int
isolat1ToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char *outstart = out;
    unsigned char *outend;
    const unsigned char *instart = in;
    const unsigned char *inend;
    unsigned int c;

    if ((out == NULL) || (outlen == NULL) || (inlen == NULL))
        return(-1);
    if (in == NULL) {
        *outlen = 0;
        *inlen = 0;
        return(0);
    }

    outend = out + ((*outlen > 0) ? *outlen : 0);
    inend = in + ((*inlen > 0) ? *inlen : 0);

    while (in < inend) {
        c = *in;
        if (c < 0x80) {
            if (outend - out < 1)
                break;
            *out++ = (unsigned char) c;
        } else {
            if (outend - out < 2)
                break;
            *out++ = (unsigned char) (0xC0 | (c >> 6));
            *out++ = (unsigned char) (0x80 | (c & 0x3F));
        }
        in++;
    }
    *outlen = (int) (out - outstart);
    *inlen = (int) (in - instart);
    return(0);
}
375:
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out.
 *
 * Returns 0 if success, or -2 if the transcoding fails (malformed input,
 * or a character above 0xFF). Conversion also stops, still returning 0,
 * when @out is full or when the input ends inside a multi-byte sequence.
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 * With a NULL @in both lengths are set to 0 and 0 is returned.
 */
int
UTF8Toisolat1(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const unsigned char *src_begin = in;
    const unsigned char *src_end;
    const unsigned char *committed = in;
    const unsigned char *dst_begin = out;
    const unsigned char *dst_end;
    unsigned int value, byte;
    int follow;
    int status = 0;

    if (in == NULL) {
        /* initialization call: nothing to emit */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    src_end = in + (*inlen);
    dst_end = out + (*outlen);

    for (;;) {
        if (in >= src_end)
            break;

        byte = *in++;
        if (byte < 0x80) {
            value = byte;
            follow = 0;
        } else if ((byte >= 0xC0) && (byte < 0xE0)) {
            value = byte & 0x1F;
            follow = 1;
        } else if ((byte >= 0xE0) && (byte < 0xF0)) {
            value = byte & 0x0F;
            follow = 2;
        } else if ((byte >= 0xF0) && (byte < 0xF8)) {
            value = byte & 0x07;
            follow = 3;
        } else {
            /* continuation byte in lead position, or lead >= 0xF8 */
            status = -2;
            break;
        }

        /* the rest of the sequence has not arrived yet */
        if (src_end - in < follow)
            break;

        while (follow > 0) {
            byte = *in++;
            if ((byte & 0xC0) != 0x80) {
                status = -2;
                goto finish;
            }
            value = (value << 6) | (byte & 0x3F);
            follow--;
        }

        if (value > 0xFF) {
            /* no chance for this in IsoLat1 */
            status = -2;
            break;
        }
        if (out >= dst_end)
            break;
        *out++ = value;
        committed = in;
    }

finish:
    *outlen = out - dst_begin;
    *inlen = committed - src_begin;
    return(status);
}
463:
/**
 * UTF16LEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16LE passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16LE code units in and try to convert it to an UTF-8
 * block of chars out. The input is decoded byte by byte, so the result
 * does not depend on the byte order or alignment rules of the host.
 * A trailing odd byte in @inb is ignored.
 *
 * A character is only converted when its whole UTF-8 sequence fits in
 * @out. A high surrogate that is the last unit of the input is left
 * unconsumed so that the caller can supply its second half later.
 * An unpaired low surrogate is not rejected.
 *
 * Returns 0 if success, or -2 if the transcoding fails (a high surrogate
 * not followed by a low surrogate).
 * The value of *@inlenb after return is the number of octets consumed.
 * The value of *@outlen after return is the number of octets produced.
 * A NULL @inb is treated as "nothing to convert".
 */
int
UTF16LEToUTF8(unsigned char* out, int *outlen,
              const unsigned char* inb, int *inlenb)
{
    unsigned char *outstart = out;
    unsigned char *outend;
    const unsigned char *in = inb;
    const unsigned char *inend;
    const unsigned char *processed = inb;
    unsigned int c, d;
    int needed, bits;

    if (inb == NULL) {
        *outlen = 0;
        *inlenb = 0;
        return(0);
    }
    outend = out + ((*outlen > 0) ? *outlen : 0);
    /* only whole 16-bit units are looked at */
    inend = inb + ((*inlenb > 0) ? (*inlenb & ~1) : 0);

    while (inend - in >= 2) {
        c = (unsigned int) in[0] | ((unsigned int) in[1] << 8);
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (inend - in < 2)
                break;                   /* second half not available yet */
            d = (unsigned int) in[0] | ((unsigned int) in[1] << 8);
            in += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = (int) (out - outstart);
                *inlenb = (int) (processed - inb);
                return(-2);
            }
            c = (((c & 0x03FF) << 10) | (d & 0x03FF)) + 0x10000;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x80)
            needed = 1;
        else if (c < 0x800)
            needed = 2;
        else if (c < 0x10000)
            needed = 3;
        else
            needed = 4;
        if (outend - out < needed)
            break;

        if (needed == 1) {
            *out++ = (unsigned char) c;
            bits = -6;
        } else if (needed == 2) {
            *out++ = (unsigned char) (((c >> 6) & 0x1F) | 0xC0);
            bits = 0;
        } else if (needed == 3) {
            *out++ = (unsigned char) (((c >> 12) & 0x0F) | 0xE0);
            bits = 6;
        } else {
            *out++ = (unsigned char) (((c >> 18) & 0x07) | 0xF0);
            bits = 12;
        }
        for ( ; bits >= 0; bits -= 6)
            *out++ = (unsigned char) (((c >> bits) & 0x3F) | 0x80);

        processed = in;
    }
    *outlen = (int) (out - outstart);
    *inlenb = (int) (processed - inb);
    return(0);
}
551:
/**
 * UTF8ToUTF16LE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
 * block of chars out. The output is written byte by byte, so the result
 * does not depend on the byte order or alignment rules of the host.
 *
 * When @in is NULL the call is an initialization: the Byte Order Mark
 * FF FE is written if @outb has room for it, and 2 is returned (0 if
 * there was no room).
 *
 * Returns 0 if success, or -2 if the transcoding failed (malformed UTF-8
 * or a value above 0x10FFFF). Conversion also stops, still returning 0,
 * when @outb is full or when the input ends inside a multi-byte sequence.
 * The value of *@inlen after return is the number of octets consumed.
 * The value of *@outlen after return is the number of octets produced.
 */
int
UTF8ToUTF16LE(unsigned char* outb, int *outlen,
              const unsigned char* in, int *inlen)
{
    unsigned char *out = outb;
    unsigned char *outend;
    const unsigned char *instart = in;
    const unsigned char *processed = in;
    const unsigned char *inend;
    unsigned int c, d, hi, lo;
    int trailing;
    int ret = 0;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFF;
            outb[1] = 0xFE;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            xmlGenericError(xmlGenericErrorContext,
                    "Added FFFE Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + (*inlen);
    /* only whole 16-bit units can be written */
    outend = outb + ((*outlen > 0) ? (*outlen & ~1) : 0);

    while (in < inend) {
        d = *in++;
        if (d < 0x80) { c = d; trailing = 0; }
        else if (d < 0xC0) {
            /* trailing byte in leading position */
            ret = -2;
            break;
        }
        else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else {
            /* no chance for this in UTF-16 */
            ret = -2;
            break;
        }

        /* sequence cut by the end of the buffer: wait for more input */
        if (inend - in < trailing)
            break;

        for ( ; trailing > 0; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80) {
                /* malformed: never emit a half-decoded value */
                ret = -2;
                goto done;
            }
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            *out++ = (unsigned char) (c & 0xFF);
            *out++ = (unsigned char) (c >> 8);
        } else if (c < 0x110000) {
            if (outend - out < 4)
                break;
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            *out++ = (unsigned char) (hi & 0xFF);
            *out++ = (unsigned char) (hi >> 8);
            *out++ = (unsigned char) (lo & 0xFF);
            *out++ = (unsigned char) (lo >> 8);
        } else {
            /* beyond the range UTF-16 can express */
            ret = -2;
            break;
        }
        processed = in;
    }

done:
    *outlen = (int) (out - outb);
    *inlen = (int) (processed - instart);
    return(ret);
}
670:
/**
 * UTF16BEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16BE passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16BE code units in and try to convert it to an UTF-8
 * block of chars out. The input is decoded byte by byte, so the result
 * does not depend on the byte order or alignment rules of the host.
 * A trailing odd byte in @inb is ignored.
 *
 * A character is only converted when its whole UTF-8 sequence fits in
 * @out, so the output never ends with a truncated sequence. A high
 * surrogate that is the last unit of the input is left unconsumed so that
 * the caller can supply its second half later, as UTF16LEToUTF8 does.
 * An unpaired low surrogate is not rejected.
 *
 * Returns 0 if success, or -2 if the transcoding fails (a high surrogate
 * not followed by a low surrogate).
 * The value of *@inlenb after return is the number of octets consumed.
 * The value of *@outlen after return is the number of octets produced.
 * A NULL @inb is treated as "nothing to convert".
 */
int
UTF16BEToUTF8(unsigned char* out, int *outlen,
              const unsigned char* inb, int *inlenb)
{
    unsigned char *outstart = out;
    unsigned char *outend;
    const unsigned char *in = inb;
    const unsigned char *inend;
    const unsigned char *processed = inb;
    unsigned int c, d;
    int needed, bits;

    if (inb == NULL) {
        *outlen = 0;
        *inlenb = 0;
        return(0);
    }
    outend = out + ((*outlen > 0) ? *outlen : 0);
    /* only whole 16-bit units are looked at */
    inend = inb + ((*inlenb > 0) ? (*inlenb & ~1) : 0);

    while (inend - in >= 2) {
        c = ((unsigned int) in[0] << 8) | (unsigned int) in[1];
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (inend - in < 2)
                break;                   /* second half not available yet */
            d = ((unsigned int) in[0] << 8) | (unsigned int) in[1];
            in += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = (int) (out - outstart);
                *inlenb = (int) (processed - inb);
                return(-2);
            }
            c = (((c & 0x03FF) << 10) | (d & 0x03FF)) + 0x10000;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x80)
            needed = 1;
        else if (c < 0x800)
            needed = 2;
        else if (c < 0x10000)
            needed = 3;
        else
            needed = 4;
        if (outend - out < needed)
            break;

        if (needed == 1) {
            *out++ = (unsigned char) c;
            bits = -6;
        } else if (needed == 2) {
            *out++ = (unsigned char) (((c >> 6) & 0x1F) | 0xC0);
            bits = 0;
        } else if (needed == 3) {
            *out++ = (unsigned char) (((c >> 12) & 0x0F) | 0xE0);
            bits = 6;
        } else {
            *out++ = (unsigned char) (((c >> 18) & 0x07) | 0xF0);
            bits = 12;
        }
        for ( ; bits >= 0; bits -= 6)
            *out++ = (unsigned char) (((c >> bits) & 0x3F) | 0x80);

        processed = in;
    }
    *outlen = (int) (out - outstart);
    *inlenb = (int) (processed - inb);
    return(0);
}
762:
/**
 * UTF8ToUTF16BE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
 * block of chars out. The output is written byte by byte, so the result
 * does not depend on the byte order or alignment rules of the host.
 *
 * When @in is NULL the call is an initialization: the Byte Order Mark
 * FE FF is written if @outb has room for it, and 2 is returned (0 if
 * there was no room).
 *
 * Returns 0 if success, or -2 if the transcoding failed (malformed UTF-8
 * or a value above 0x10FFFF). Conversion also stops, still returning 0,
 * when @outb is full or when the input ends inside a multi-byte sequence.
 * The value of *@inlen after return is the number of octets consumed.
 * The value of *@outlen after return is the number of octets produced.
 */
int
UTF8ToUTF16BE(unsigned char* outb, int *outlen,
              const unsigned char* in, int *inlen)
{
    unsigned char *out = outb;
    unsigned char *outend;
    const unsigned char *instart = in;
    const unsigned char *processed = in;
    const unsigned char *inend;
    unsigned int c, d, hi, lo;
    int trailing;
    int ret = 0;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFE;
            outb[1] = 0xFF;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            xmlGenericError(xmlGenericErrorContext,
                    "Added FEFF Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + (*inlen);
    /* only whole 16-bit units can be written */
    outend = outb + ((*outlen > 0) ? (*outlen & ~1) : 0);

    while (in < inend) {
        d = *in++;
        if (d < 0x80) { c = d; trailing = 0; }
        else if (d < 0xC0) {
            /* trailing byte in leading position */
            ret = -2;
            break;
        }
        else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else {
            /* no chance for this in UTF-16 */
            ret = -2;
            break;
        }

        /* sequence cut by the end of the buffer: wait for more input */
        if (inend - in < trailing)
            break;

        for ( ; trailing > 0; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80) {
                /* malformed: never emit a half-decoded value */
                ret = -2;
                goto done;
            }
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            *out++ = (unsigned char) (c >> 8);
            *out++ = (unsigned char) (c & 0xFF);
        } else if (c < 0x110000) {
            if (outend - out < 4)
                break;
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            *out++ = (unsigned char) (hi >> 8);
            *out++ = (unsigned char) (hi & 0xFF);
            *out++ = (unsigned char) (lo >> 8);
            *out++ = (unsigned char) (lo & 0xFF);
        } else {
            /* beyond the range UTF-16 can express */
            ret = -2;
            break;
        }
        processed = in;
    }

done:
    *outlen = (int) (out - outb);
    *inlen = (int) (processed - instart);
    return(ret);
}
878:
1.7 daniel 879: /**
880: * xmlDetectCharEncoding:
881: * @in: a pointer to the first bytes of the XML entity, must be at least
882: * 4 bytes long.
1.25 daniel 883: * @len: pointer to the length of the buffer
1.7 daniel 884: *
885: * Guess the encoding of the entity using the first bytes of the entity content
886: * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
887: *
888: * Returns one of the XML_CHAR_ENCODING_... values.
889: */
890: xmlCharEncoding
1.25 daniel 891: xmlDetectCharEncoding(const unsigned char* in, int len)
1.7 daniel 892: {
1.25 daniel 893: if (len >= 4) {
894: if ((in[0] == 0x00) && (in[1] == 0x00) &&
895: (in[2] == 0x00) && (in[3] == 0x3C))
896: return(XML_CHAR_ENCODING_UCS4BE);
897: if ((in[0] == 0x3C) && (in[1] == 0x00) &&
898: (in[2] == 0x00) && (in[3] == 0x00))
899: return(XML_CHAR_ENCODING_UCS4LE);
900: if ((in[0] == 0x00) && (in[1] == 0x00) &&
901: (in[2] == 0x3C) && (in[3] == 0x00))
902: return(XML_CHAR_ENCODING_UCS4_2143);
903: if ((in[0] == 0x00) && (in[1] == 0x3C) &&
904: (in[2] == 0x00) && (in[3] == 0x00))
905: return(XML_CHAR_ENCODING_UCS4_3412);
906: if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
907: (in[2] == 0xA7) && (in[3] == 0x94))
908: return(XML_CHAR_ENCODING_EBCDIC);
909: if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
910: (in[2] == 0x78) && (in[3] == 0x6D))
911: return(XML_CHAR_ENCODING_UTF8);
912: }
913: if (len >= 2) {
914: if ((in[0] == 0xFE) && (in[1] == 0xFF))
915: return(XML_CHAR_ENCODING_UTF16BE);
916: if ((in[0] == 0xFF) && (in[1] == 0xFE))
917: return(XML_CHAR_ENCODING_UTF16LE);
918: }
1.7 daniel 919: return(XML_CHAR_ENCODING_NONE);
920: }
921:
922: /**
1.51 veillard 923: * xmlCleanupEncodingAliases:
924: *
925: * Unregisters all aliases
926: */
927: void
928: xmlCleanupEncodingAliases(void) {
929: int i;
930:
931: if (xmlCharEncodingAliases == NULL)
932: return;
933:
934: for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
935: if (xmlCharEncodingAliases[i].name != NULL)
936: xmlFree((char *) xmlCharEncodingAliases[i].name);
937: if (xmlCharEncodingAliases[i].alias != NULL)
938: xmlFree((char *) xmlCharEncodingAliases[i].alias);
939: }
940: xmlCharEncodingAliasesNb = 0;
941: xmlCharEncodingAliasesMax = 0;
942: xmlFree(xmlCharEncodingAliases);
943: }
944:
945: /**
946: * xmlGetEncodingAlias:
947: * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
948: *
949: * Lookup an encoding name for the given alias.
950: *
951: * Returns NULL if not found the original name otherwise
952: */
953: const char *
954: xmlGetEncodingAlias(const char *alias) {
955: int i;
956: char upper[100];
957:
958: if (alias == NULL)
959: return(NULL);
960:
961: if (xmlCharEncodingAliases == NULL)
962: return(NULL);
963:
964: for (i = 0;i < 99;i++) {
965: upper[i] = toupper(alias[i]);
966: if (upper[i] == 0) break;
967: }
968: upper[i] = 0;
969:
970: /*
971: * Walk down the list looking for a definition of the alias
972: */
973: for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
974: if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
975: return(xmlCharEncodingAliases[i].name);
976: }
977: }
978: return(NULL);
979: }
980:
981: /**
982: * xmlAddEncodingAlias:
983: * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
984: * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
985: *
986: * Registers and alias @alias for an encoding named @name. Existing alias
987: * will be overwritten.
988: *
989: * Returns 0 in case of success, -1 in case of error
990: */
991: int
992: xmlAddEncodingAlias(const char *name, const char *alias) {
993: int i;
994: char upper[100];
995:
996: if ((name == NULL) || (alias == NULL))
997: return(-1);
998:
999: for (i = 0;i < 99;i++) {
1000: upper[i] = toupper(alias[i]);
1001: if (upper[i] == 0) break;
1002: }
1003: upper[i] = 0;
1004:
1005: if (xmlCharEncodingAliases == NULL) {
1006: xmlCharEncodingAliasesNb = 0;
1007: xmlCharEncodingAliasesMax = 20;
1008: xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
1009: xmlMalloc(xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
1010: if (xmlCharEncodingAliases == NULL)
1011: return(-1);
1012: } else if (xmlCharEncodingAliasesNb >= xmlCharEncodingAliasesMax) {
1013: xmlCharEncodingAliasesMax *= 2;
1014: xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
1015: xmlRealloc(xmlCharEncodingAliases,
1016: xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
1017: }
1018: /*
1019: * Walk down the list looking for a definition of the alias
1020: */
1021: for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1022: if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
1023: /*
1024: * Replace the definition.
1025: */
1026: xmlFree((char *) xmlCharEncodingAliases[i].name);
1027: xmlCharEncodingAliases[i].name = xmlMemStrdup(name);
1028: return(0);
1029: }
1030: }
1031: /*
1032: * Add the definition
1033: */
1034: xmlCharEncodingAliases[xmlCharEncodingAliasesNb].name = xmlMemStrdup(name);
1035: xmlCharEncodingAliases[xmlCharEncodingAliasesNb].alias = xmlMemStrdup(upper);
1036: xmlCharEncodingAliasesNb++;
1037: return(0);
1038: }
1039:
1040: /**
1041: * xmlDelEncodingAlias:
1042: * @alias: the alias name as parsed, in UTF-8 format (ASCII actually)
1043: *
1044: * Unregisters an encoding alias @alias
1045: *
1046: * Returns 0 in case of success, -1 in case of error
1047: */
1048: int
1049: xmlDelEncodingAlias(const char *alias) {
1050: int i;
1051:
1052: if (alias == NULL)
1053: return(-1);
1054:
1055: if (xmlCharEncodingAliases == NULL)
1056: return(-1);
1057: /*
1058: * Walk down the list looking for a definition of the alias
1059: */
1060: for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1061: if (!strcmp(xmlCharEncodingAliases[i].alias, alias)) {
1062: xmlFree((char *) xmlCharEncodingAliases[i].name);
1063: xmlFree((char *) xmlCharEncodingAliases[i].alias);
1064: xmlCharEncodingAliasesNb--;
1065: memmove(&xmlCharEncodingAliases[i], &xmlCharEncodingAliases[i + 1],
1066: sizeof(xmlCharEncodingAlias) * (xmlCharEncodingAliasesNb - i));
1067: return(0);
1068: }
1069: }
1070: return(-1);
1071: }
1072:
1073: /**
1.7 daniel 1074: * xmlParseCharEncoding:
1.18 daniel 1075: * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
1.7 daniel 1076: *
1077: * Conpare the string to the known encoding schemes already known. Note
1078: * that the comparison is case insensitive accordingly to the section
1079: * [XML] 4.3.3 Character Encoding in Entities.
1080: *
1081: * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
1082: * if not recognized.
1083: */
1084: xmlCharEncoding
1.8 daniel 1085: xmlParseCharEncoding(const char* name)
1.7 daniel 1086: {
1.51 veillard 1087: const char *alias;
1.7 daniel 1088: char upper[500];
1089: int i;
1090:
1.51 veillard 1091: if (name == NULL)
1092: return(XML_CHAR_ENCODING_NONE);
1093:
1094: /*
1095: * Do the alias resolution
1096: */
1097: alias = xmlGetEncodingAlias(name);
1098: if (alias != NULL)
1099: name = alias;
1100:
1.7 daniel 1101: for (i = 0;i < 499;i++) {
1102: upper[i] = toupper(name[i]);
1103: if (upper[i] == 0) break;
1104: }
1105: upper[i] = 0;
1106:
1107: if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
1108: if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
1109: if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
1110:
1111: /*
1112: * NOTE: if we were able to parse this, the endianness of UTF16 is
1113: * already found and in use
1114: */
1115: if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
1116: if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
1117:
1118: if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
1119: if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
1120: if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
1121:
1122: /*
1123: * NOTE: if we were able to parse this, the endianness of UCS4 is
1124: * already found and in use
1125: */
1126: if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
1127: if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
1128: if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
1129:
1130:
1131: if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
1132: if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
1133: if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
1134:
1135: if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
1136: if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
1137: if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
1138:
1139: if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
1140: if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
1141: if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
1142: if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
1143: if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
1144: if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
1145: if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
1146:
1147: if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
1.30 daniel 1148: if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
1.7 daniel 1149: if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
1.30 daniel 1150:
1151: #ifdef DEBUG_ENCODING
1.52 veillard 1152: xmlGenericError(xmlGenericErrorContext, "Unknown encoding %s\n", name);
1.30 daniel 1153: #endif
1.7 daniel 1154: return(XML_CHAR_ENCODING_ERROR);
1155: }
1.9 daniel 1156:
1.38 daniel 1157: /**
1158: * xmlGetCharEncodingName:
1159: * @enc: the encoding
1160: *
1161: * The "canonical" name for XML encoding.
1162: * C.f. http://www.w3.org/TR/REC-xml#charencoding
1163: * Section 4.3.3 Character Encoding in Entities
1164: *
1165: * Returns the canonical name for the given encoding
1166: */
1167:
1168: const char*
1169: xmlGetCharEncodingName(xmlCharEncoding enc) {
1170: switch (enc) {
1171: case XML_CHAR_ENCODING_ERROR:
1172: return(NULL);
1173: case XML_CHAR_ENCODING_NONE:
1174: return(NULL);
1175: case XML_CHAR_ENCODING_UTF8:
1176: return("UTF-8");
1177: case XML_CHAR_ENCODING_UTF16LE:
1178: return("UTF-16");
1179: case XML_CHAR_ENCODING_UTF16BE:
1180: return("UTF-16");
1181: case XML_CHAR_ENCODING_EBCDIC:
1182: return("EBCDIC");
1183: case XML_CHAR_ENCODING_UCS4LE:
1184: return("ISO-10646-UCS-4");
1185: case XML_CHAR_ENCODING_UCS4BE:
1186: return("ISO-10646-UCS-4");
1187: case XML_CHAR_ENCODING_UCS4_2143:
1188: return("ISO-10646-UCS-4");
1189: case XML_CHAR_ENCODING_UCS4_3412:
1190: return("ISO-10646-UCS-4");
1191: case XML_CHAR_ENCODING_UCS2:
1192: return("ISO-10646-UCS-2");
1193: case XML_CHAR_ENCODING_8859_1:
1194: return("ISO-8859-1");
1195: case XML_CHAR_ENCODING_8859_2:
1196: return("ISO-8859-2");
1197: case XML_CHAR_ENCODING_8859_3:
1198: return("ISO-8859-3");
1199: case XML_CHAR_ENCODING_8859_4:
1200: return("ISO-8859-4");
1201: case XML_CHAR_ENCODING_8859_5:
1202: return("ISO-8859-5");
1203: case XML_CHAR_ENCODING_8859_6:
1204: return("ISO-8859-6");
1205: case XML_CHAR_ENCODING_8859_7:
1206: return("ISO-8859-7");
1207: case XML_CHAR_ENCODING_8859_8:
1208: return("ISO-8859-8");
1209: case XML_CHAR_ENCODING_8859_9:
1210: return("ISO-8859-9");
1211: case XML_CHAR_ENCODING_2022_JP:
1212: return("ISO-2022-JP");
1213: case XML_CHAR_ENCODING_SHIFT_JIS:
1214: return("Shift-JIS");
1215: case XML_CHAR_ENCODING_EUC_JP:
1216: return("EUC-JP");
1.50 veillard 1217: case XML_CHAR_ENCODING_ASCII:
1218: return(NULL);
1.38 daniel 1219: }
1220: return(NULL);
1221: }
1222:
1.9 daniel 1223: /****************************************************************
1224: * *
1225: * Char encoding handlers *
1226: * *
1227: ****************************************************************/
1228:
1229: /*
 * Fixed capacity of the handlers table: xmlRegisterCharEncodingHandler()
 * refuses further registrations once nbCharEncodingHandler reaches it.
 * The size should be growable, but it's not a big deal ...
 */
1230: #define MAX_ENCODING_HANDLERS 50
/* Table of registered handlers, allocated by xmlInitCharEncodingHandlers(). */
1231: static xmlCharEncodingHandlerPtr *handlers = NULL;
/* Number of slots of handlers[] currently in use. */
1232: static int nbCharEncodingHandler = 0;
1233:
1234: /*
1235: * The default is UTF-8 for XML, that's also the default used for the
1236: * parser internals, so the default encoding handler is NULL.
 * xmlFindCharEncodingHandler() returns this value when it is given a
 * NULL or empty encoding name.
1237: */
1238:
1239: static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
1240:
1241: /**
1242: * xmlNewCharEncodingHandler:
1.18 daniel 1243: * @name: the encoding name, in UTF-8 format (ASCII actually)
1.9 daniel 1244: * @input: the xmlCharEncodingInputFunc to read that encoding
1245: * @output: the xmlCharEncodingOutputFunc to write that encoding
1246: *
1247: * Create and registers an xmlCharEncodingHandler.
1248: * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
1249: */
1250: xmlCharEncodingHandlerPtr
1.25 daniel 1251: xmlNewCharEncodingHandler(const char *name,
1252: xmlCharEncodingInputFunc input,
1.9 daniel 1253: xmlCharEncodingOutputFunc output) {
1254: xmlCharEncodingHandlerPtr handler;
1.51 veillard 1255: const char *alias;
1.9 daniel 1256: char upper[500];
1257: int i;
1258: char *up = 0;
1259:
1260: /*
1.51 veillard 1261: * Do the alias resolution
1262: */
1263: alias = xmlGetEncodingAlias(name);
1264: if (alias != NULL)
1265: name = alias;
1266:
1267: /*
1.9 daniel 1268: * Keep only the uppercase version of the encoding.
1269: */
1270: if (name == NULL) {
1.52 veillard 1271: xmlGenericError(xmlGenericErrorContext,
1272: "xmlNewCharEncodingHandler : no name !\n");
1.9 daniel 1273: return(NULL);
1274: }
1275: for (i = 0;i < 499;i++) {
1276: upper[i] = toupper(name[i]);
1277: if (upper[i] == 0) break;
1278: }
1279: upper[i] = 0;
1.16 daniel 1280: up = xmlMemStrdup(upper);
1.9 daniel 1281: if (up == NULL) {
1.52 veillard 1282: xmlGenericError(xmlGenericErrorContext,
1283: "xmlNewCharEncodingHandler : out of memory !\n");
1.9 daniel 1284: return(NULL);
1285: }
1286:
1287: /*
1288: * allocate and fill-up an handler block.
1289: */
1290: handler = (xmlCharEncodingHandlerPtr)
1.16 daniel 1291: xmlMalloc(sizeof(xmlCharEncodingHandler));
1.9 daniel 1292: if (handler == NULL) {
1.52 veillard 1293: xmlGenericError(xmlGenericErrorContext,
1294: "xmlNewCharEncodingHandler : out of memory !\n");
1.9 daniel 1295: return(NULL);
1296: }
1297: handler->input = input;
1298: handler->output = output;
1299: handler->name = up;
1300:
1.50 veillard 1301: #ifdef LIBXML_ICONV_ENABLED
1.49 veillard 1302: handler->iconv_in = NULL;
1303: handler->iconv_out = NULL;
1.50 veillard 1304: #endif /* LIBXML_ICONV_ENABLED */
1.49 veillard 1305:
1.9 daniel 1306: /*
1307: * registers and returns the handler.
1308: */
1309: xmlRegisterCharEncodingHandler(handler);
1.30 daniel 1310: #ifdef DEBUG_ENCODING
1.52 veillard 1311: xmlGenericError(xmlGenericErrorContext,
1312: "Registered encoding handler for %s\n", name);
1.30 daniel 1313: #endif
1.9 daniel 1314: return(handler);
1315: }
1316:
1317: /**
1318: * xmlInitCharEncodingHandlers:
1319: *
1320: * Initialize the char encoding support, it registers the default
1321: * encoding supported.
1.18 daniel 1322: * NOTE: while public, this function usually doesn't need to be called
1.9 daniel 1323: * in normal processing.
1324: */
1325: void
1326: xmlInitCharEncodingHandlers(void) {
1.34 daniel 1327: unsigned short int tst = 0x1234;
1328: unsigned char *ptr = (unsigned char *) &tst;
1329:
1.9 daniel 1330: if (handlers != NULL) return;
1331:
1332: handlers = (xmlCharEncodingHandlerPtr *)
1.16 daniel 1333: xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
1.34 daniel 1334:
1335: if (*ptr == 0x12) xmlLittleEndian = 0;
1336: else if (*ptr == 0x34) xmlLittleEndian = 1;
1.52 veillard 1337: else xmlGenericError(xmlGenericErrorContext,
1338: "Odd problem at endianness detection\n");
1.9 daniel 1339:
1340: if (handlers == NULL) {
1.52 veillard 1341: xmlGenericError(xmlGenericErrorContext,
1342: "xmlInitCharEncodingHandlers : out of memory !\n");
1.9 daniel 1343: return;
1344: }
1.10 daniel 1345: xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
1.25 daniel 1346: xmlUTF16LEHandler =
1.28 daniel 1347: xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
1348: xmlUTF16BEHandler =
1349: xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
1.10 daniel 1350: xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
1.47 veillard 1351: xmlNewCharEncodingHandler("ASCII", asciiToUTF8, UTF8Toascii);
1.48 veillard 1352: #ifdef LIBXML_HTML_ENABLED
1353: xmlNewCharEncodingHandler("HTML", NULL, UTF8ToHtml);
1354: #endif
1.9 daniel 1355: }
1356:
1357: /**
1.19 daniel 1358: * xmlCleanupCharEncodingHandlers:
1359: *
1360: * Cleanup the memory allocated for the char encoding support, it
1.51 veillard 1361: * unregisters all the encoding handlers and the aliases.
1.19 daniel 1362: */
1363: void
1364: xmlCleanupCharEncodingHandlers(void) {
1.51 veillard 1365: xmlCleanupEncodingAliases();
1366:
1.19 daniel 1367: if (handlers == NULL) return;
1368:
1369: for (;nbCharEncodingHandler > 0;) {
1370: nbCharEncodingHandler--;
1371: if (handlers[nbCharEncodingHandler] != NULL) {
1.31 daniel 1372: if (handlers[nbCharEncodingHandler]->name != NULL)
1373: xmlFree(handlers[nbCharEncodingHandler]->name);
1.19 daniel 1374: xmlFree(handlers[nbCharEncodingHandler]);
1375: }
1376: }
1377: xmlFree(handlers);
1378: handlers = NULL;
1379: nbCharEncodingHandler = 0;
1380: xmlDefaultCharEncodingHandler = NULL;
1381: }
1382:
1383: /**
1.9 daniel 1384: * xmlRegisterCharEncodingHandler:
1385: * @handler: the xmlCharEncodingHandlerPtr handler block
1386: *
1387: * Register the char encoding handler, surprizing, isn't it ?
1388: */
1389: void
1390: xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
1391: if (handlers == NULL) xmlInitCharEncodingHandlers();
1392: if (handler == NULL) {
1.52 veillard 1393: xmlGenericError(xmlGenericErrorContext,
1394: "xmlRegisterCharEncodingHandler: NULL handler !\n");
1.9 daniel 1395: return;
1396: }
1397:
1398: if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
1.52 veillard 1399: xmlGenericError(xmlGenericErrorContext,
1.9 daniel 1400: "xmlRegisterCharEncodingHandler: Too many handler registered\n");
1.52 veillard 1401: xmlGenericError(xmlGenericErrorContext,
1402: "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
1.9 daniel 1403: return;
1404: }
1405: handlers[nbCharEncodingHandler++] = handler;
1406: }
1407:
1408: /**
1409: * xmlGetCharEncodingHandler:
1410: * @enc: an xmlCharEncoding value.
1411: *
1412: * Search in the registrered set the handler able to read/write that encoding.
1413: *
1414: * Returns the handler or NULL if not found
1415: */
1416: xmlCharEncodingHandlerPtr
1417: xmlGetCharEncodingHandler(xmlCharEncoding enc) {
1.30 daniel 1418: xmlCharEncodingHandlerPtr handler;
1419:
1.9 daniel 1420: if (handlers == NULL) xmlInitCharEncodingHandlers();
1.25 daniel 1421: switch (enc) {
1422: case XML_CHAR_ENCODING_ERROR:
1423: return(NULL);
1424: case XML_CHAR_ENCODING_NONE:
1425: return(NULL);
1426: case XML_CHAR_ENCODING_UTF8:
1427: return(NULL);
1428: case XML_CHAR_ENCODING_UTF16LE:
1429: return(xmlUTF16LEHandler);
1430: case XML_CHAR_ENCODING_UTF16BE:
1431: return(xmlUTF16BEHandler);
1432: case XML_CHAR_ENCODING_EBCDIC:
1.30 daniel 1433: handler = xmlFindCharEncodingHandler("EBCDIC");
1434: if (handler != NULL) return(handler);
1435: handler = xmlFindCharEncodingHandler("ebcdic");
1436: if (handler != NULL) return(handler);
1437: break;
1.38 daniel 1438: case XML_CHAR_ENCODING_UCS4BE:
1.30 daniel 1439: handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1440: if (handler != NULL) return(handler);
1441: handler = xmlFindCharEncodingHandler("UCS-4");
1442: if (handler != NULL) return(handler);
1443: handler = xmlFindCharEncodingHandler("UCS4");
1444: if (handler != NULL) return(handler);
1445: break;
1.38 daniel 1446: case XML_CHAR_ENCODING_UCS4LE:
1447: handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1448: if (handler != NULL) return(handler);
1449: handler = xmlFindCharEncodingHandler("UCS-4");
1450: if (handler != NULL) return(handler);
1451: handler = xmlFindCharEncodingHandler("UCS4");
1.30 daniel 1452: if (handler != NULL) return(handler);
1453: break;
1.25 daniel 1454: case XML_CHAR_ENCODING_UCS4_2143:
1.30 daniel 1455: break;
1.25 daniel 1456: case XML_CHAR_ENCODING_UCS4_3412:
1.30 daniel 1457: break;
1.25 daniel 1458: case XML_CHAR_ENCODING_UCS2:
1.30 daniel 1459: handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2");
1460: if (handler != NULL) return(handler);
1461: handler = xmlFindCharEncodingHandler("UCS-2");
1462: if (handler != NULL) return(handler);
1463: handler = xmlFindCharEncodingHandler("UCS2");
1464: if (handler != NULL) return(handler);
1465: break;
1.42 veillard 1466:
1467: /*
1468: * We used to keep ISO Latin encodings native in the
1469: * generated data. This led to so many problems that
1470: * this has been removed. One can still change this
1471: * back by registering no-ops encoders for those
1472: */
1.25 daniel 1473: case XML_CHAR_ENCODING_8859_1:
1.42 veillard 1474: handler = xmlFindCharEncodingHandler("ISO-8859-1");
1475: if (handler != NULL) return(handler);
1476: break;
1.25 daniel 1477: case XML_CHAR_ENCODING_8859_2:
1.42 veillard 1478: handler = xmlFindCharEncodingHandler("ISO-8859-2");
1479: if (handler != NULL) return(handler);
1480: break;
1.25 daniel 1481: case XML_CHAR_ENCODING_8859_3:
1.42 veillard 1482: handler = xmlFindCharEncodingHandler("ISO-8859-3");
1483: if (handler != NULL) return(handler);
1484: break;
1.25 daniel 1485: case XML_CHAR_ENCODING_8859_4:
1.42 veillard 1486: handler = xmlFindCharEncodingHandler("ISO-8859-4");
1487: if (handler != NULL) return(handler);
1488: break;
1.25 daniel 1489: case XML_CHAR_ENCODING_8859_5:
1.42 veillard 1490: handler = xmlFindCharEncodingHandler("ISO-8859-5");
1491: if (handler != NULL) return(handler);
1492: break;
1.25 daniel 1493: case XML_CHAR_ENCODING_8859_6:
1.42 veillard 1494: handler = xmlFindCharEncodingHandler("ISO-8859-6");
1495: if (handler != NULL) return(handler);
1496: break;
1.25 daniel 1497: case XML_CHAR_ENCODING_8859_7:
1.42 veillard 1498: handler = xmlFindCharEncodingHandler("ISO-8859-7");
1499: if (handler != NULL) return(handler);
1500: break;
1.25 daniel 1501: case XML_CHAR_ENCODING_8859_8:
1.42 veillard 1502: handler = xmlFindCharEncodingHandler("ISO-8859-8");
1503: if (handler != NULL) return(handler);
1504: break;
1.25 daniel 1505: case XML_CHAR_ENCODING_8859_9:
1.42 veillard 1506: handler = xmlFindCharEncodingHandler("ISO-8859-9");
1507: if (handler != NULL) return(handler);
1508: break;
1509:
1510:
1.25 daniel 1511: case XML_CHAR_ENCODING_2022_JP:
1.30 daniel 1512: handler = xmlFindCharEncodingHandler("ISO-2022-JP");
1513: if (handler != NULL) return(handler);
1514: break;
1.25 daniel 1515: case XML_CHAR_ENCODING_SHIFT_JIS:
1.30 daniel 1516: handler = xmlFindCharEncodingHandler("SHIFT-JIS");
1517: if (handler != NULL) return(handler);
1518: handler = xmlFindCharEncodingHandler("SHIFT_JIS");
1519: if (handler != NULL) return(handler);
1520: handler = xmlFindCharEncodingHandler("Shift_JIS");
1521: if (handler != NULL) return(handler);
1522: break;
1.25 daniel 1523: case XML_CHAR_ENCODING_EUC_JP:
1.30 daniel 1524: handler = xmlFindCharEncodingHandler("EUC-JP");
1525: if (handler != NULL) return(handler);
1526: break;
1527: default:
1528: break;
1.25 daniel 1529: }
1.30 daniel 1530:
1531: #ifdef DEBUG_ENCODING
1.52 veillard 1532: xmlGenericError(xmlGenericErrorContext,
1533: "No handler found for encoding %d\n", enc);
1.30 daniel 1534: #endif
1.9 daniel 1535: return(NULL);
1536: }
1537:
1538: /**
1539: * xmlGetCharEncodingHandler:
1540: * @enc: a string describing the char encoding.
1541: *
1542: * Search in the registrered set the handler able to read/write that encoding.
1543: *
1544: * Returns the handler or NULL if not found
1545: */
1546: xmlCharEncodingHandlerPtr
1547: xmlFindCharEncodingHandler(const char *name) {
1.51 veillard 1548: const char *nalias;
1549: const char *norig;
1.36 daniel 1550: xmlCharEncoding alias;
1.30 daniel 1551: #ifdef LIBXML_ICONV_ENABLED
1.40 daniel 1552: xmlCharEncodingHandlerPtr enc;
1.30 daniel 1553: iconv_t icv_in, icv_out;
1554: #endif /* LIBXML_ICONV_ENABLED */
1555: char upper[100];
1.9 daniel 1556: int i;
1557:
1558: if (handlers == NULL) xmlInitCharEncodingHandlers();
1559: if (name == NULL) return(xmlDefaultCharEncodingHandler);
1560: if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
1561:
1.36 daniel 1562: /*
1.51 veillard 1563: * Do the alias resolution
1564: */
1565: norig = name;
1566: nalias = xmlGetEncodingAlias(name);
1567: if (nalias != NULL)
1568: name = nalias;
1569:
1570: /*
1.36 daniel 1571: * Check first for directly registered encoding names
1572: */
1.30 daniel 1573: for (i = 0;i < 99;i++) {
1.9 daniel 1574: upper[i] = toupper(name[i]);
1575: if (upper[i] == 0) break;
1576: }
1577: upper[i] = 0;
1578:
1579: for (i = 0;i < nbCharEncodingHandler; i++)
1.30 daniel 1580: if (!strcmp(upper, handlers[i]->name)) {
1581: #ifdef DEBUG_ENCODING
1.52 veillard 1582: xmlGenericError(xmlGenericErrorContext,
1583: "Found registered handler for encoding %s\n", name);
1.30 daniel 1584: #endif
1.9 daniel 1585: return(handlers[i]);
1.30 daniel 1586: }
1.9 daniel 1587:
1.30 daniel 1588: #ifdef LIBXML_ICONV_ENABLED
1589: /* check whether iconv can handle this */
1.31 daniel 1590: icv_in = iconv_open("UTF-8", name);
1591: icv_out = iconv_open(name, "UTF-8");
1.30 daniel 1592: if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
1.43 veillard 1593: enc = (xmlCharEncodingHandlerPtr)
1594: xmlMalloc(sizeof(xmlCharEncodingHandler));
1.32 daniel 1595: if (enc == NULL) {
1596: iconv_close(icv_in);
1597: iconv_close(icv_out);
1598: return(NULL);
1599: }
1.41 daniel 1600: enc->name = xmlMemStrdup(name);
1.30 daniel 1601: enc->input = NULL;
1602: enc->output = NULL;
1603: enc->iconv_in = icv_in;
1604: enc->iconv_out = icv_out;
1605: #ifdef DEBUG_ENCODING
1.52 veillard 1606: xmlGenericError(xmlGenericErrorContext,
1607: "Found iconv handler for encoding %s\n", name);
1.30 daniel 1608: #endif
1609: return enc;
1610: } else if ((icv_in != (iconv_t) -1) || icv_out != (iconv_t) -1) {
1.52 veillard 1611: xmlGenericError(xmlGenericErrorContext,
1612: "iconv : problems with filters for '%s'\n", name);
1.30 daniel 1613: }
1614: #endif /* LIBXML_ICONV_ENABLED */
1.38 daniel 1615:
1.30 daniel 1616: #ifdef DEBUG_ENCODING
1.52 veillard 1617: xmlGenericError(xmlGenericErrorContext,
1618: "No handler found for encoding %s\n", name);
1.30 daniel 1619: #endif
1.38 daniel 1620:
1621: /*
1622: * Fallback using the canonical names
1623: */
1.51 veillard 1624: alias = xmlParseCharEncoding(norig);
1.38 daniel 1625: if (alias != XML_CHAR_ENCODING_ERROR) {
1626: const char* canon;
1627: canon = xmlGetCharEncodingName(alias);
1628: if ((canon != NULL) && (strcmp(name, canon))) {
1629: return(xmlFindCharEncodingHandler(canon));
1630: }
1631: }
1632:
1.9 daniel 1633: return(NULL);
1.30 daniel 1634: }
1635:
#ifdef LIBXML_ICONV_ENABLED
/**
 * xmlIconvWrapper:
 * @cd:  iconv converter data structure
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of input bytes (may be NULL)
 * @inlen:  the length of @in
 *
 * Runs one iconv() conversion of @in into @out.
 *
 * Returns 0 if success, or
 *     -1 by lack of space (E2BIG), or
 *     -2 if the transcoding fails because of an invalid input
 *        sequence (EILSEQ), or
 *     -3 if the input ends with an incomplete sequence (EINVAL) or for
 *        any other failure.
 *
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 * Both are set to 0 when @in is NULL.
 */
static int
xmlIconvWrapper(iconv_t cd,
                unsigned char *out, int *outlen,
                const unsigned char *in, int *inlen) {

    size_t icv_inlen = *inlen, icv_outlen = *outlen;
    const char *icv_in = (const char *) in;
    char *icv_out = (char *) out;
    size_t ret;    /* iconv() reports failure as (size_t) -1 */

    /*
     * The input pointer is declared "char **" by POSIX but
     * "const char **" by some implementations; going through void *
     * is accepted by both prototypes.
     */
    ret = iconv(cd,
                (void *) &icv_in, &icv_inlen,
                &icv_out, &icv_outlen);
    if (in != NULL) {
        /* iconv() left the remaining counts, turn them into used ones */
        *inlen -= icv_inlen;
        *outlen -= icv_outlen;
    } else {
        *inlen = 0;
        *outlen = 0;
    }
    if (icv_inlen != 0 || ret == (size_t) -1) {
#ifdef EILSEQ
        if (errno == EILSEQ) {
            return -2;
        } else
#endif
#ifdef E2BIG
        if (errno == E2BIG) {
            return -1;
        } else
#endif
#ifdef EINVAL
        if (errno == EINVAL) {
            return -3;
        } else
#endif
        {
            return -3;
        }
    }
    return 0;
}
#endif /* LIBXML_ICONV_ENABLED */
1.38 daniel 1698:
1699: /**
1700: * xmlCharEncFirstLine:
1701: * @handler: char enconding transformation data structure
1702: * @out: an xmlBuffer for the output.
1703: * @in: an xmlBuffer for the input
1704: *
1705: * Front-end for the encoding handler input function, but handle only
1706: * the very first line, i.e. limit itself to 45 chars.
1707: *
1708: * Returns the number of byte written if success, or
1709: * -1 general error
1710: * -2 if the transcoding fails (for *in is not valid utf8 string or
1711: * the result of transformation can't fit into the encoding we want), or
1712: */
1713: int
1714: xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1715: xmlBufferPtr in) {
1716: int ret = -2;
1717: int written;
1718: int toconv;
1719:
1720: if (handler == NULL) return(-1);
1721: if (out == NULL) return(-1);
1722: if (in == NULL) return(-1);
1723:
1724: written = out->size - out->use;
1725: toconv = in->use;
1726: if (toconv * 2 >= written) {
1.39 daniel 1727: xmlBufferGrow(out, toconv);
1.38 daniel 1728: written = out->size - out->use - 1;
1729: }
1.39 daniel 1730:
1.38 daniel 1731: /*
1732: * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38
1733: * 45 chars should be sufficient to reach the end of the encoding
1734: * decalration without going too far inside the document content.
1735: */
1736: written = 45;
1737:
1738: if (handler->input != NULL) {
1739: ret = handler->input(&out->content[out->use], &written,
1740: in->content, &toconv);
1741: xmlBufferShrink(in, toconv);
1742: out->use += written;
1743: out->content[out->use] = 0;
1744: }
1745: #ifdef LIBXML_ICONV_ENABLED
1746: else if (handler->iconv_in != NULL) {
1747: ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
1748: &written, in->content, &toconv);
1749: xmlBufferShrink(in, toconv);
1750: out->use += written;
1751: out->content[out->use] = 0;
1752: if (ret == -1) ret = -3;
1753: }
1754: #endif /* LIBXML_ICONV_ENABLED */
1755: #ifdef DEBUG_ENCODING
1756: switch (ret) {
1757: case 0:
1.52 veillard 1758: xmlGenericError(xmlGenericErrorContext,
1759: "converted %d bytes to %d bytes of input\n",
1.38 daniel 1760: toconv, written);
1761: break;
1762: case -1:
1.52 veillard 1763: xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
1.38 daniel 1764: toconv, written, in->use);
1765: break;
1766: case -2:
1.52 veillard 1767: xmlGenericError(xmlGenericErrorContext,
1768: "input conversion failed due to input error\n");
1.38 daniel 1769: break;
1770: case -3:
1.52 veillard 1771: xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
1.38 daniel 1772: toconv, written, in->use);
1773: break;
1774: default:
1.52 veillard 1775: xmlGenericError(xmlGenericErrorContext,"Unknown input conversion failed %d\n", ret);
1.38 daniel 1776: }
1777: #endif
1778: /*
1779: * Ignore when input buffer is not on a boundary
1780: */
1781: if (ret == -3) ret = 0;
1782: if (ret == -1) ret = 0;
1783: return(ret);
1784: }
1.30 daniel 1785:
1786: /**
1787: * xmlCharEncInFunc:
1788: * @handler: char enconding transformation data structure
1.31 daniel 1789: * @out: an xmlBuffer for the output.
1790: * @in: an xmlBuffer for the input
1.30 daniel 1791: *
1792: * Generic front-end for the encoding handler input function
1793: *
1.31 daniel 1794: * Returns the number of byte written if success, or
1795: * -1 general error
1.30 daniel 1796: * -2 if the transcoding fails (for *in is not valid utf8 string or
1797: * the result of transformation can't fit into the encoding we want), or
1798: */
1799: int
1.31 daniel 1800: xmlCharEncInFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1801: xmlBufferPtr in) {
1.30 daniel 1802: int ret = -2;
1.31 daniel 1803: int written;
1804: int toconv;
1.30 daniel 1805:
1.31 daniel 1806: if (handler == NULL) return(-1);
1807: if (out == NULL) return(-1);
1808: if (in == NULL) return(-1);
1809:
1.50 veillard 1810: toconv = in->use;
1811: if (toconv == 0)
1812: return(0);
1.31 daniel 1813: written = out->size - out->use;
1814: if (toconv * 2 >= written) {
1.54 ! veillard 1815: xmlBufferGrow(out, out->size + toconv * 2);
1.33 daniel 1816: written = out->size - out->use - 1;
1.31 daniel 1817: }
1.30 daniel 1818: if (handler->input != NULL) {
1.32 daniel 1819: ret = handler->input(&out->content[out->use], &written,
1.31 daniel 1820: in->content, &toconv);
1821: xmlBufferShrink(in, toconv);
1822: out->use += written;
1.33 daniel 1823: out->content[out->use] = 0;
1.30 daniel 1824: }
1825: #ifdef LIBXML_ICONV_ENABLED
1.31 daniel 1826: else if (handler->iconv_in != NULL) {
1827: ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
1828: &written, in->content, &toconv);
1829: xmlBufferShrink(in, toconv);
1830: out->use += written;
1.33 daniel 1831: out->content[out->use] = 0;
1832: if (ret == -1) ret = -3;
1.30 daniel 1833: }
1834: #endif /* LIBXML_ICONV_ENABLED */
1.39 daniel 1835: switch (ret) {
1.30 daniel 1836: #ifdef DEBUG_ENCODING
1837: case 0:
1.52 veillard 1838: xmlGenericError(xmlGenericErrorContext,
1839: "converted %d bytes to %d bytes of input\n",
1.31 daniel 1840: toconv, written);
1.30 daniel 1841: break;
1842: case -1:
1.52 veillard 1843: xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
1.31 daniel 1844: toconv, written, in->use);
1.30 daniel 1845: break;
1846: case -3:
1.52 veillard 1847: xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
1.31 daniel 1848: toconv, written, in->use);
1.30 daniel 1849: break;
1.39 daniel 1850: #endif
1851: case -2:
1.52 veillard 1852: xmlGenericError(xmlGenericErrorContext,
1853: "input conversion failed due to input error\n");
1854: xmlGenericError(xmlGenericErrorContext,
1855: "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
1.39 daniel 1856: in->content[0], in->content[1],
1857: in->content[2], in->content[3]);
1.30 daniel 1858: }
1.33 daniel 1859: /*
1860: * Ignore when input buffer is not on a boundary
1861: */
1862: if (ret == -3) ret = 0;
1.30 daniel 1863: return(ret);
1864: }
1865:
1866: /**
1867: * xmlCharEncOutFunc:
1868: * @handler: char encoding transformation data structure
1.31 daniel 1869: * @out: an xmlBuffer for the output.
1870: * @in: an xmlBuffer for the input
1871: *
1872: * Generic front-end for the encoding handler output function
1.35 daniel 1873: * A first call with @in == NULL has to be made first to initialize the
1874: * output, in case of non-stateless encodings needing to initialize their
1875: * state or the output (like the BOM in UTF16).
1.39 daniel 1876: * In case of UTF8 sequence conversion errors for the given encoder,
1877: * the content will be automatically remapped to a CharRef sequence.
1.30 daniel 1878: *
1.31 daniel 1879: * Returns the number of byte written if success, or
1880: * -1 general error
1.30 daniel 1881: * -2 if the transcoding fails (for *in is not valid utf8 string or
1882: * the result of transformation can't fit into the encoding we want), or
1883: */
1884: int
1.31 daniel 1885: xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1886: xmlBufferPtr in) {
1.30 daniel 1887: int ret = -2;
1.31 daniel 1888: int written;
1.54 ! veillard 1889: int writtentot = 0;
1.31 daniel 1890: int toconv;
1.39 daniel 1891: int output = 0;
1.31 daniel 1892:
1893: if (handler == NULL) return(-1);
1894: if (out == NULL) return(-1);
1.39 daniel 1895:
1896: retry:
1897:
1.35 daniel 1898: written = out->size - out->use;
1899:
1.39 daniel 1900: /*
1901: * First specific handling of in = NULL, i.e. the initialization call
1902: */
1.35 daniel 1903: if (in == NULL) {
1904: toconv = 0;
1905: if (handler->output != NULL) {
1906: ret = handler->output(&out->content[out->use], &written,
1907: NULL, &toconv);
1908: out->use += written;
1909: out->content[out->use] = 0;
1910: }
1911: #ifdef LIBXML_ICONV_ENABLED
1912: else if (handler->iconv_out != NULL) {
1913: ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
1914: &written, NULL, &toconv);
1915: out->use += written;
1916: out->content[out->use] = 0;
1917: }
1918: #endif /* LIBXML_ICONV_ENABLED */
1919: #ifdef DEBUG_ENCODING
1.52 veillard 1920: xmlGenericError(xmlGenericErrorContext,
1921: "initialized encoder\n");
1.35 daniel 1922: #endif
1923: return(0);
1924: }
1.30 daniel 1925:
1.39 daniel 1926: /*
1927: * Convertion itself.
1928: */
1.33 daniel 1929: toconv = in->use;
1.50 veillard 1930: if (toconv == 0)
1931: return(0);
1.33 daniel 1932: if (toconv * 2 >= written) {
1933: xmlBufferGrow(out, toconv * 2);
1934: written = out->size - out->use - 1;
1935: }
1.30 daniel 1936: if (handler->output != NULL) {
1.33 daniel 1937: ret = handler->output(&out->content[out->use], &written,
1.35 daniel 1938: in->content, &toconv);
1.31 daniel 1939: xmlBufferShrink(in, toconv);
1940: out->use += written;
1.54 ! veillard 1941: writtentot += written;
1.33 daniel 1942: out->content[out->use] = 0;
1.30 daniel 1943: }
1944: #ifdef LIBXML_ICONV_ENABLED
1945: else if (handler->iconv_out != NULL) {
1.31 daniel 1946: ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
1947: &written, in->content, &toconv);
1948: xmlBufferShrink(in, toconv);
1949: out->use += written;
1.54 ! veillard 1950: writtentot += written;
1.33 daniel 1951: out->content[out->use] = 0;
1.54 ! veillard 1952: if (ret == -1) {
! 1953: if (written > 0) {
! 1954: /*
! 1955: * Can be a limitation of iconv
! 1956: */
! 1957: goto retry;
! 1958: }
! 1959: ret = -3;
! 1960: }
1.30 daniel 1961: }
1962: #endif /* LIBXML_ICONV_ENABLED */
1.46 veillard 1963: else {
1.52 veillard 1964: xmlGenericError(xmlGenericErrorContext,
1965: "xmlCharEncOutFunc: no output function !\n");
1.46 veillard 1966: return(-1);
1967: }
1.39 daniel 1968:
1969: if (ret >= 0) output += ret;
1970:
1971: /*
1972: * Attempt to handle error cases
1973: */
1974: switch (ret) {
1.30 daniel 1975: #ifdef DEBUG_ENCODING
1976: case 0:
1.52 veillard 1977: xmlGenericError(xmlGenericErrorContext,
1978: "converted %d bytes to %d bytes of output\n",
1.31 daniel 1979: toconv, written);
1.30 daniel 1980: break;
1981: case -1:
1.52 veillard 1982: xmlGenericError(xmlGenericErrorContext,
1983: "output conversion failed by lack of space\n");
1.30 daniel 1984: break;
1.54 ! veillard 1985: #endif
1.30 daniel 1986: case -3:
1.52 veillard 1987: xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of output %d left\n",
1.31 daniel 1988: toconv, written, in->use);
1.30 daniel 1989: break;
1.39 daniel 1990: case -2: {
1991: int len = in->use;
1.43 veillard 1992: const xmlChar *utf = (const xmlChar *) in->content;
1.39 daniel 1993: int cur;
1994:
1995: cur = xmlGetUTF8Char(utf, &len);
1996: if (cur > 0) {
1997: xmlChar charref[20];
1998:
1999: #ifdef DEBUG_ENCODING
1.52 veillard 2000: xmlGenericError(xmlGenericErrorContext,
2001: "handling output conversion error\n");
2002: xmlGenericError(xmlGenericErrorContext,
2003: "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
1.39 daniel 2004: in->content[0], in->content[1],
2005: in->content[2], in->content[3]);
2006: #endif
2007: /*
2008: * Removes the UTF8 sequence, and replace it by a charref
2009: * and continue the transcoding phase, hoping the error
2010: * did not mangle the encoder state.
2011: */
1.43 veillard 2012: sprintf((char *) charref, "&#x%X;", cur);
1.39 daniel 2013: xmlBufferShrink(in, len);
2014: xmlBufferAddHead(in, charref, -1);
2015:
2016: goto retry;
2017: } else {
1.52 veillard 2018: xmlGenericError(xmlGenericErrorContext,
2019: "output conversion failed due to conv error\n");
2020: xmlGenericError(xmlGenericErrorContext,
2021: "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
1.39 daniel 2022: in->content[0], in->content[1],
2023: in->content[2], in->content[3]);
1.50 veillard 2024: in->content[0] = ' ';
1.39 daniel 2025: }
2026: break;
2027: }
1.30 daniel 2028: }
2029: return(ret);
2030: }
2031:
2032: /**
2033: * xmlCharEncCloseFunc:
2034: * @handler: char enconding transformation data structure
2035: *
2036: * Generic front-end for hencoding handler close function
2037: *
2038: * Returns 0 if success, or -1 in case of error
2039: */
2040: int
2041: xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
2042: int ret = 0;
1.31 daniel 2043: if (handler == NULL) return(-1);
2044: if (handler->name == NULL) return(-1);
1.30 daniel 2045: #ifdef LIBXML_ICONV_ENABLED
1.31 daniel 2046: /*
2047: * Iconv handlers can be oused only once, free the whole block.
2048: * and the associated icon resources.
2049: */
1.32 daniel 2050: if ((handler->iconv_out != NULL) || (handler->iconv_in != NULL)) {
2051: if (handler->name != NULL)
2052: xmlFree(handler->name);
2053: handler->name = NULL;
2054: if (handler->iconv_out != NULL) {
2055: if (iconv_close(handler->iconv_out))
2056: ret = -1;
2057: handler->iconv_out = NULL;
2058: }
2059: if (handler->iconv_in != NULL) {
2060: if (iconv_close(handler->iconv_in))
2061: ret = -1;
2062: handler->iconv_in = NULL;
2063: }
2064: xmlFree(handler);
1.30 daniel 2065: }
2066: #endif /* LIBXML_ICONV_ENABLED */
2067: #ifdef DEBUG_ENCODING
2068: if (ret)
1.52 veillard 2069: xmlGenericError(xmlGenericErrorContext,
2070: "failed to close the encoding handler\n");
1.30 daniel 2071: else
1.52 veillard 2072: xmlGenericError(xmlGenericErrorContext,
2073: "closed the encoding handler\n");
1.30 daniel 2074:
2075: #endif
2076: return(ret);
1.9 daniel 2077: }
2078:
Webmaster