Annotation of XML/encoding.c, revision 1.49
1.1 daniel 1: /*
2: * encoding.c : implements the encoding conversion functions needed for XML
3: *
4: * Related specs:
5: * rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies
1.39 daniel 6: * rfc2781 UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau
1.1 daniel 7: * [ISO-10646] UTF-8 and UTF-16 in Annexes
8: * [ISO-8859-1] ISO Latin-1 characters codes.
9: * [UNICODE] The Unicode Consortium, "The Unicode Standard --
10: * Worldwide Character Encoding -- Version 1.0", Addison-
11: * Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is
12: * described in Unicode Technical Report #4.
13: * [US-ASCII] Coded Character Set--7-bit American Standard Code for
14: * Information Interchange, ANSI X3.4-1986.
15: *
1.9 daniel 16: * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
1.1 daniel 17: *
18: * See Copyright for the status of this software.
19: *
20: * Daniel.Veillard@w3.org
21: */
22:
1.21 daniel 23: #ifdef WIN32
24: #include "win32config.h"
25: #else
1.14 daniel 26: #include "config.h"
1.17 daniel 27: #endif
28:
29: #include <stdio.h>
30: #include <string.h>
31:
32: #ifdef HAVE_CTYPE_H
1.7 daniel 33: #include <ctype.h>
1.17 daniel 34: #endif
1.20 daniel 35: #ifdef HAVE_STDLIB_H
36: #include <stdlib.h>
37: #endif
1.30 daniel 38: #include <libxml/xmlversion.h>
39: #ifdef LIBXML_ICONV_ENABLED
40: #ifdef HAVE_ERRNO_H
41: #include <errno.h>
42: #endif
43: #endif
1.29 daniel 44: #include <libxml/encoding.h>
45: #include <libxml/xmlmemory.h>
1.48 veillard 46: #ifdef LIBXML_HTML_ENABLED
47: #include <libxml/HTMLparser.h>
48: #endif
1.3 daniel 49:
/*
 * Globally visible handlers for the two UTF-16 byte orders. Both start out
 * NULL here; NOTE(review): presumably assigned by initialisation code that
 * is not visible in this part of the file -- confirm.
 */
1.25 daniel 50: xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
51: xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
52:
/*
 * DEBUG_ENCODING can only be defined when iconv support is compiled in, and
 * is currently switched off by the "#if 0" below.
 */
1.30 daniel 53: #ifdef LIBXML_ICONV_ENABLED
1.46 veillard 54: #if 0
1.30 daniel 55: #define DEBUG_ENCODING /* Define this to get encoding traces */
56: #endif
1.33 daniel 57: #endif
1.30 daniel 58:
/*
 * Host byte-order flag, defaulting to 1.
 * NOTE(review): judging by the name, non-zero means the host is
 * little-endian; where (or whether) this is probed at run time is not
 * visible here -- confirm.
 */
1.34 daniel 59: static int xmlLittleEndian = 1;
60:
1.3 daniel 61: /*
62: * From rfc2044: encoding of the Unicode values on UTF-8:
63: *
64: * UCS-4 range (hex.) UTF-8 octet sequence (binary)
65: * 0000 0000-0000 007F 0xxxxxxx
66: * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
67: * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
68: *
69: * I hope we won't use values > 0xFFFF anytime soon !
70: */
1.1 daniel 71:
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  in: the number of bytes available at @utf;
 *        out: the number of bytes used by the decoded character
 *
 * Read one UTF-8 encoded character (1 to 4 bytes) from @utf.
 *
 * The decoder checks that every trailing byte has the form 10xxxxxx and
 * that the lead byte is a legal lead byte; it does not reject over-long
 * encodings nor values above 0x10FFFF.
 *
 * Returns the character value, or -1 in case of error (NULL argument,
 *         not enough bytes, malformed sequence). On error *@len is set
 *         to 0, provided @len itself is not NULL.
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    if (utf == NULL)
        goto error;
    if (len == NULL)
        goto error;
    if (*len < 1)
        goto error;

    c = utf[0];
    if (c & 0x80) {
        /* a trailing byte (10xxxxxx) can never start a sequence */
        if ((c & 0xc0) == 0x80)
            goto error;
        if (*len < 2)
            goto error;
        if ((utf[1] & 0xc0) != 0x80)
            goto error;
        if ((c & 0xe0) == 0xe0) {
            if (*len < 3)
                goto error;
            if ((utf[2] & 0xc0) != 0x80)
                goto error;
            if ((c & 0xf0) == 0xf0) {
                if (*len < 4)
                    goto error;
                /* 11111xxx lead bytes would announce 5+ byte sequences */
                if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
                    goto error;
                /* 4-byte code */
                *len = 4;
                c = (utf[0] & 0x7) << 18;
                c |= (utf[1] & 0x3f) << 12;
                c |= (utf[2] & 0x3f) << 6;
                c |= utf[3] & 0x3f;
            } else {
                /* 3-byte code */
                *len = 3;
                c = (utf[0] & 0xf) << 12;
                c |= (utf[1] & 0x3f) << 6;
                c |= utf[2] & 0x3f;
            }
        } else {
            /* 2-byte code */
            *len = 2;
            c = (utf[0] & 0x1f) << 6;
            c |= utf[1] & 0x3f;
        }
    } else {
        /* 1-byte code */
        *len = 1;
    }
    return(c);

error:
    /* we may get here precisely because len is NULL: do not write to it */
    if (len != NULL)
        *len = 0;
    return(-1);
}
138:
/**
 * xmlCheckUTF8: Check utf-8 string for legality.
 * @utf: Pointer to putative utf-8 encoded string.
 *
 * Checks @utf for being valid utf-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer utf-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * Accepted byte patterns:
 *    0xxxxxxx
 *    110xxxxx 10xxxxxx
 *    1110xxxx 10xxxxxx 10xxxxxx
 *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * Return value: true (1) if @utf is valid, 0 otherwise or if @utf is NULL.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    int ix;
    unsigned char c;

    if (utf == NULL)
        return(0);

    /*
     * The trailing bytes are tested left to right and the tests stop at
     * the first failure, so the terminating 0 (which is not of the form
     * 10xxxxxx) is never read past.
     */
    for (ix = 0; (c = utf[ix]) != 0;) {
        if ((c & 0x80) == 0x00) {
            /* 1-byte code */
            ix++;
        } else if ((c & 0xe0) == 0xc0) {
            /* 2-byte code */
            if ((utf[ix + 1] & 0xc0) != 0x80)
                return(0);
            ix += 2;
        } else if ((c & 0xf0) == 0xe0) {
            /* 3-byte code */
            if (((utf[ix + 1] & 0xc0) != 0x80) ||
                ((utf[ix + 2] & 0xc0) != 0x80))
                return(0);
            ix += 3;
        } else if ((c & 0xf8) == 0xf0) {
            /* 4-byte code */
            if (((utf[ix + 1] & 0xc0) != 0x80) ||
                ((utf[ix + 2] & 0xc0) != 0x80) ||
                ((utf[ix + 3] & 0xc0) != 0x80))
                return(0);
            ix += 4;
        } else {
            /* stray trailing byte (10xxxxxx) or 5+ byte lead (11111xxx) */
            return(0);
        }
    }
    return(1);
}
182:
/**
 * asciiToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  in: the size of @out in bytes; out: the number of bytes written
 * @in:  a pointer to an array of ASCII chars
 * @inlen:  in: the length of @in in bytes; out: the number of bytes consumed
 *
 * Take a block of ASCII chars in and try to convert it to an UTF-8
 * block of chars out. As ASCII is a strict subset of UTF-8 every input
 * byte below 0x80 is copied unchanged and needs exactly one output byte.
 *
 * The conversion stops early, still returning 0, when @out is full; the
 * caller can tell from *@inlen how much input is left.
 *
 * Returns 0 if success, or -1 if a byte >= 0x80 is met. In both cases
 *         *@outlen and *@inlen describe what was converted before
 *         stopping.
 */
int
asciiToUTF8(unsigned char* out, int *outlen,
            const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
    unsigned char* outend = out + *outlen;
    const unsigned char* base = in;
    const unsigned char* inend = in + *inlen;
    int ret = 0;

    while ((in < inend) && (out < outend)) {
        if (*in >= 0x80) {
            /* not ASCII: leave the offending byte unconsumed */
            ret = -1;
            break;
        }
        *out++ = *in++;
    }
    *outlen = out - outstart;
    *inlen = in - base;
    return(ret);
}
233:
/**
 * UTF8Toascii:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  in: the size of @out in bytes; out: the number of bytes written
 * @in:  a pointer to an array of UTF-8 chars, or NULL
 * @inlen:  in: the length of @in in bytes; out: the number of bytes consumed
 *
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
 * block of chars out.
 *
 * Calling with @in == NULL is the "initialization" call: nothing is
 * produced and both lengths are set to 0.
 *
 * The conversion stops early, still returning 0, when @out is full or
 * when the input ends in the middle of a multi-byte sequence; only whole
 * characters are ever counted in *@inlen.
 *
 * Returns 0 if success, or -2 if the transcoding fails: malformed UTF-8
 *         (bad lead or trailing byte) or a character that does not exist
 *         in ASCII. *@outlen and *@inlen then describe what was
 *         converted before the failure.
 */
int
UTF8Toascii(unsigned char* out, int *outlen,
            const unsigned char* in, int *inlen) {
    const unsigned char* processed = in;
    const unsigned char* outend;
    const unsigned char* outstart = out;
    const unsigned char* instart = in;
    const unsigned char* inend;
    unsigned int c, d;
    int trailing;
    int ret = 0;

    if (in == NULL) {
        /*
         * initialization nothing to do
         */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + (*inlen);
    outend = out + (*outlen);
    while (in < inend) {
        d = *in++;
        if (d < 0x80) { c = d; trailing = 0; }
        else if (d < 0xC0) {
            /* trailing byte in leading position */
            ret = -2;
            break;
        }
        else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else {
            /* no chance for this in Ascii */
            ret = -2;
            break;
        }

        /* sequence cut by the end of the buffer: wait for more input */
        if (inend - in < trailing)
            break;

        for ( ; trailing > 0; trailing--) {
            d = *in;
            if ((d & 0xC0) != 0x80)
                break;
            in++;
            c = (c << 6) | (d & 0x3F);
        }
        if (trailing > 0) {
            /* a trailing byte is missing: malformed input, never emit
             * the partially assembled value */
            ret = -2;
            break;
        }

        /* assertion: c is a single UCS-4 value */
        if (c >= 0x80) {
            /* no chance for this in Ascii */
            ret = -2;
            break;
        }
        if (out >= outend)
            break;
        *out++ = c;
        processed = in;
    }
    *outlen = out - outstart;
    *inlen = processed - instart;
    return(ret);
}
316:
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  in: the size of @out in bytes; out: the number of bytes written
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  in: the length of @in in bytes; out: the number of bytes consumed
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out. Bytes below 0x80 are copied unchanged, the others
 * become a two byte sequence 110000xx 10xxxxxx.
 *
 * The conversion stops early when the next character does not fit
 * entirely in @out; a character is never written partially and only
 * fully converted characters are counted in *@inlen.
 *
 * Returns 0 (every byte value is a valid ISO Latin 1 character, so the
 *         conversion itself cannot fail).
 */
int
isolat1ToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
    unsigned char* outend = out + *outlen;
    const unsigned char* base = in;
    const unsigned char* inend = in + *inlen;
    unsigned int c;

    while (in < inend) {
        c = *in;
        if (c < 0x80) {
            if (out >= outend)
                break;
            *out++ = c;
        } else {
            /* both bytes or nothing */
            if (outend - out < 2)
                break;
            *out++ = ((c >> 6) & 0x1F) | 0xC0;
            *out++ = (c & 0x3F) | 0x80;
        }
        in++;
    }
    *outlen = out - outstart;
    *inlen = in - base;
    return(0);
}
363:
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  in: the size of @out in bytes; out: the number of bytes written
 * @in:  a pointer to an array of UTF-8 chars, or NULL
 * @inlen:  in: the length of @in in bytes; out: the number of bytes consumed
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out.
 *
 * Calling with @in == NULL is the "initialization" call: nothing is
 * produced and both lengths are set to 0.
 *
 * The conversion stops early, still returning 0, when @out is full or
 * when the input ends in the middle of a multi-byte sequence; only whole
 * characters are ever counted in *@inlen.
 *
 * Returns 0 if success, or -2 if the transcoding fails: malformed UTF-8
 *         (bad lead or trailing byte) or a character above 0xFF.
 *         *@outlen and *@inlen then describe what was converted before
 *         the failure.
 */
int
UTF8Toisolat1(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const unsigned char* processed = in;
    const unsigned char* outend;
    const unsigned char* outstart = out;
    const unsigned char* instart = in;
    const unsigned char* inend;
    unsigned int c, d;
    int trailing;
    int ret = 0;

    if (in == NULL) {
        /*
         * initialization nothing to do
         */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + (*inlen);
    outend = out + (*outlen);
    while (in < inend) {
        d = *in++;
        if (d < 0x80) { c = d; trailing = 0; }
        else if (d < 0xC0) {
            /* trailing byte in leading position */
            ret = -2;
            break;
        }
        else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else {
            /* no chance for this in IsoLat1 */
            ret = -2;
            break;
        }

        /* sequence cut by the end of the buffer: wait for more input */
        if (inend - in < trailing)
            break;

        for ( ; trailing > 0; trailing--) {
            d = *in;
            if ((d & 0xC0) != 0x80)
                break;
            in++;
            c = (c << 6) | (d & 0x3F);
        }
        if (trailing > 0) {
            /* a trailing byte is missing: malformed input, never emit
             * the partially assembled value */
            ret = -2;
            break;
        }

        /* assertion: c is a single UCS-4 value */
        if (c > 0xFF) {
            /* no chance for this in IsoLat1 */
            ret = -2;
            break;
        }
        if (out >= outend)
            break;
        *out++ = c;
        processed = in;
    }
    *outlen = out - outstart;
    *inlen = processed - instart;
    return(ret);
}
446:
/**
 * UTF16LEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  in: the size of @out in bytes; out: the number of bytes written
 * @inb:  a pointer to an array of UTF-16LE passed as a byte array
 * @inlenb:  in: the length of @inb in bytes; out: the number of bytes
 *           consumed
 *
 * Take a block of UTF-16LE chars in and try to convert it to an UTF-8
 * block of chars out. The input is read byte by byte (low byte first),
 * so the result depends neither on the byte order of the host nor on the
 * alignment of @inb. A trailing odd byte is ignored.
 *
 * The conversion stops early, still returning 0, when the next character
 * does not fit entirely in @out or when the input ends between the two
 * halves of a surrogate pair; only whole characters are counted in
 * *@inlenb and a character is never written partially.
 *
 * Returns 0 if success, or -2 if the transcoding fails (a high surrogate
 *         is not followed by a low surrogate). *@outlen and *@inlenb
 *         then describe what was converted before the failure.
 */
int
UTF16LEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + *outlen;
    const unsigned char* in = inb;
    const unsigned char* next;
    const unsigned char* inend;
    unsigned int c, d;
    int need, bits;
    int ret = 0;

    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + *inlenb;

    while (inend - in >= 2) {
        c = in[0] | ((unsigned int) in[1] << 8);
        next = in + 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            /* second half not received yet: wait for more input */
            if (inend - next < 2)
                break;
            d = next[0] | ((unsigned int) next[1] << 8);
            next += 2;
            if ((d & 0xFC00) != 0xDC00) {
                ret = -2;
                break;
            }
            c = (((c & 0x03FF) << 10) | (d & 0x03FF)) + 0x10000;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x80) need = 1;
        else if (c < 0x800) need = 2;
        else if (c < 0x10000) need = 3;
        else need = 4;
        if (outend - out < need)
            break;

        if (need == 1) { *out++ = c; bits = -6; }
        else if (need == 2) { *out++ = ((c >> 6) & 0x1F) | 0xC0; bits = 0; }
        else if (need == 3) { *out++ = ((c >> 12) & 0x0F) | 0xE0; bits = 6; }
        else { *out++ = ((c >> 18) & 0x07) | 0xF0; bits = 12; }
        for ( ; bits >= 0; bits -= 6)
            *out++ = ((c >> bits) & 0x3F) | 0x80;

        in = next;
    }
    *outlen = out - outstart;
    *inlenb = in - inb;
    return(ret);
}
534:
/**
 * UTF8ToUTF16LE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  in: the size of @outb in bytes; out: the number of bytes
 *           written
 * @in:  a pointer to an array of UTF-8 chars, or NULL
 * @inlen:  in: the length of @in in bytes; out: the number of bytes consumed
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
 * block of chars out. The output is written byte by byte (low byte
 * first), so the result depends neither on the byte order of the host
 * nor on the alignment of @outb.
 *
 * Calling with @in == NULL is the "initialization" call: the Byte Order
 * Mark FF FE is written if @outb has room for it.
 *
 * The conversion stops early, still returning 0, when the next character
 * does not fit entirely in @outb or when the input ends in the middle of
 * a multi-byte sequence; only whole characters are counted in *@inlen.
 *
 * Returns 0 if success, 2 when the Byte Order Mark was written by the
 *         initialization call, or -2 if the transcoding failed
 *         (malformed UTF-8, or a value above 0x10FFFF which UTF-16
 *         cannot represent). *@outlen and *@inlen then describe what
 *         was converted before the failure.
 */
int
UTF8ToUTF16LE(unsigned char* outb, int *outlen,
            const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* processed = in;
    const unsigned char* inend;
    unsigned int c, d, hi, lo;
    int trailing;
    int ret = 0;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFF;
            outb[1] = 0xFE;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            fprintf(stderr, "Added FFFE Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + (*inlen);
    /* only whole 16-bit units can be stored */
    outend = outb + (*outlen / 2) * 2;
    while (in < inend) {
        d = *in++;
        if (d < 0x80) { c = d; trailing = 0; }
        else if (d < 0xC0) {
            /* trailing byte in leading position */
            ret = -2;
            break;
        }
        else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else {
            /* no chance for this in UTF-16 */
            ret = -2;
            break;
        }

        /* sequence cut by the end of the buffer: wait for more input */
        if (inend - in < trailing)
            break;

        for ( ; trailing > 0; trailing--) {
            d = *in;
            if ((d & 0xC0) != 0x80)
                break;
            in++;
            c = (c << 6) | (d & 0x3F);
        }
        if (trailing > 0) {
            /* a trailing byte is missing: malformed input */
            ret = -2;
            break;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            *out++ = c & 0xFF;
            *out++ = (c >> 8) & 0xFF;
        }
        else if (c < 0x110000) {
            /* surrogate pair: both halves or nothing */
            if (outend - out < 4)
                break;
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            *out++ = hi & 0xFF;
            *out++ = (hi >> 8) & 0xFF;
            *out++ = lo & 0xFF;
            *out++ = (lo >> 8) & 0xFF;
        }
        else {
            /* beyond the range UTF-16 can encode */
            ret = -2;
            break;
        }
        processed = in;
    }
    *outlen = out - outb;
    *inlen = processed - instart;
    return(ret);
}
652:
/**
 * UTF16BEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  in: the size of @out in bytes; out: the number of bytes written
 * @inb:  a pointer to an array of UTF-16BE passed as a byte array
 * @inlenb:  in: the length of @inb in bytes; out: the number of bytes
 *           consumed
 *
 * Take a block of UTF-16BE chars in and try to convert it to an UTF-8
 * block of chars out. The input is read byte by byte (high byte first),
 * so the result depends neither on the byte order of the host nor on the
 * alignment of @inb. A trailing odd byte is ignored.
 *
 * The conversion stops early, still returning 0, when the next character
 * does not fit entirely in @out or when the input ends between the two
 * halves of a surrogate pair; only whole characters are counted in
 * *@inlenb and a character is never written partially.
 *
 * Returns 0 if success, or -2 if the transcoding fails (a high surrogate
 *         is not followed by a low surrogate). *@outlen and *@inlenb
 *         then describe what was converted before the failure.
 */
int
UTF16BEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + *outlen;
    const unsigned char* in = inb;
    const unsigned char* next;
    const unsigned char* inend;
    unsigned int c, d;
    int need, bits;
    int ret = 0;

    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + *inlenb;

    while (inend - in >= 2) {
        c = ((unsigned int) in[0] << 8) | in[1];
        next = in + 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            /* second half not received yet: wait for more input */
            if (inend - next < 2)
                break;
            d = ((unsigned int) next[0] << 8) | next[1];
            next += 2;
            if ((d & 0xFC00) != 0xDC00) {
                ret = -2;
                break;
            }
            c = (((c & 0x03FF) << 10) | (d & 0x03FF)) + 0x10000;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x80) need = 1;
        else if (c < 0x800) need = 2;
        else if (c < 0x10000) need = 3;
        else need = 4;
        if (outend - out < need)
            break;

        if (need == 1) { *out++ = c; bits = -6; }
        else if (need == 2) { *out++ = ((c >> 6) & 0x1F) | 0xC0; bits = 0; }
        else if (need == 3) { *out++ = ((c >> 12) & 0x0F) | 0xE0; bits = 6; }
        else { *out++ = ((c >> 18) & 0x07) | 0xF0; bits = 12; }
        for ( ; bits >= 0; bits -= 6)
            *out++ = ((c >> bits) & 0x3F) | 0x80;

        in = next;
    }
    *outlen = out - outstart;
    *inlenb = in - inb;
    return(ret);
}
744:
/**
 * UTF8ToUTF16BE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  in: the size of @outb in bytes; out: the number of bytes
 *           written
 * @in:  a pointer to an array of UTF-8 chars, or NULL
 * @inlen:  in: the length of @in in bytes; out: the number of bytes consumed
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
 * block of chars out. The output is written byte by byte (high byte
 * first), so the result depends neither on the byte order of the host
 * nor on the alignment of @outb.
 *
 * Calling with @in == NULL is the "initialization" call: the Byte Order
 * Mark FE FF is written if @outb has room for it.
 *
 * The conversion stops early, still returning 0, when the next character
 * does not fit entirely in @outb or when the input ends in the middle of
 * a multi-byte sequence; only whole characters are counted in *@inlen.
 *
 * Returns 0 if success, 2 when the Byte Order Mark was written by the
 *         initialization call, or -2 if the transcoding failed
 *         (malformed UTF-8, or a value above 0x10FFFF which UTF-16
 *         cannot represent). *@outlen and *@inlen then describe, in
 *         bytes, what was converted before the failure.
 */
int
UTF8ToUTF16BE(unsigned char* outb, int *outlen,
            const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* processed = in;
    const unsigned char* inend;
    unsigned int c, d, hi, lo;
    int trailing;
    int ret = 0;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFE;
            outb[1] = 0xFF;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            fprintf(stderr, "Added FEFF Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + (*inlen);
    /* only whole 16-bit units can be stored */
    outend = outb + (*outlen / 2) * 2;
    while (in < inend) {
        d = *in++;
        if (d < 0x80) { c = d; trailing = 0; }
        else if (d < 0xC0) {
            /* trailing byte in leading position */
            ret = -2;
            break;
        }
        else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else {
            /* no chance for this in UTF-16 */
            ret = -2;
            break;
        }

        /* sequence cut by the end of the buffer: wait for more input */
        if (inend - in < trailing)
            break;

        for ( ; trailing > 0; trailing--) {
            d = *in;
            if ((d & 0xC0) != 0x80)
                break;
            in++;
            c = (c << 6) | (d & 0x3F);
        }
        if (trailing > 0) {
            /* a trailing byte is missing: malformed input */
            ret = -2;
            break;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            *out++ = (c >> 8) & 0xFF;
            *out++ = c & 0xFF;
        }
        else if (c < 0x110000) {
            /* surrogate pair: both halves or nothing */
            if (outend - out < 4)
                break;
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            *out++ = (hi >> 8) & 0xFF;
            *out++ = hi & 0xFF;
            *out++ = (lo >> 8) & 0xFF;
            *out++ = lo & 0xFF;
        }
        else {
            /* beyond the range UTF-16 can encode */
            ret = -2;
            break;
        }
        processed = in;
    }
    *outlen = out - outb;
    *inlen = processed - instart;
    return(ret);
}
859:
1.7 daniel 860: /**
861: * xmlDetectCharEncoding:
862: * @in: a pointer to the first bytes of the XML entity, must be at least
863: * 4 bytes long.
1.25 daniel 864: * @len: pointer to the length of the buffer
1.7 daniel 865: *
866: * Guess the encoding of the entity using the first bytes of the entity content
867: * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
868: *
869: * Returns one of the XML_CHAR_ENCODING_... values.
870: */
871: xmlCharEncoding
1.25 daniel 872: xmlDetectCharEncoding(const unsigned char* in, int len)
1.7 daniel 873: {
1.25 daniel 874: if (len >= 4) {
875: if ((in[0] == 0x00) && (in[1] == 0x00) &&
876: (in[2] == 0x00) && (in[3] == 0x3C))
877: return(XML_CHAR_ENCODING_UCS4BE);
878: if ((in[0] == 0x3C) && (in[1] == 0x00) &&
879: (in[2] == 0x00) && (in[3] == 0x00))
880: return(XML_CHAR_ENCODING_UCS4LE);
881: if ((in[0] == 0x00) && (in[1] == 0x00) &&
882: (in[2] == 0x3C) && (in[3] == 0x00))
883: return(XML_CHAR_ENCODING_UCS4_2143);
884: if ((in[0] == 0x00) && (in[1] == 0x3C) &&
885: (in[2] == 0x00) && (in[3] == 0x00))
886: return(XML_CHAR_ENCODING_UCS4_3412);
887: if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
888: (in[2] == 0xA7) && (in[3] == 0x94))
889: return(XML_CHAR_ENCODING_EBCDIC);
890: if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
891: (in[2] == 0x78) && (in[3] == 0x6D))
892: return(XML_CHAR_ENCODING_UTF8);
893: }
894: if (len >= 2) {
895: if ((in[0] == 0xFE) && (in[1] == 0xFF))
896: return(XML_CHAR_ENCODING_UTF16BE);
897: if ((in[0] == 0xFF) && (in[1] == 0xFE))
898: return(XML_CHAR_ENCODING_UTF16LE);
899: }
1.7 daniel 900: return(XML_CHAR_ENCODING_NONE);
901: }
902:
903: /**
904: * xmlParseCharEncoding:
1.18 daniel 905: * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
1.7 daniel 906: *
907: * Conpare the string to the known encoding schemes already known. Note
908: * that the comparison is case insensitive accordingly to the section
909: * [XML] 4.3.3 Character Encoding in Entities.
910: *
911: * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
912: * if not recognized.
913: */
914: xmlCharEncoding
1.8 daniel 915: xmlParseCharEncoding(const char* name)
1.7 daniel 916: {
917: char upper[500];
918: int i;
919:
920: for (i = 0;i < 499;i++) {
921: upper[i] = toupper(name[i]);
922: if (upper[i] == 0) break;
923: }
924: upper[i] = 0;
925:
926: if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
927: if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
928: if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
929:
930: /*
931: * NOTE: if we were able to parse this, the endianness of UTF16 is
932: * already found and in use
933: */
934: if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
935: if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
936:
937: if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
938: if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
939: if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
940:
941: /*
942: * NOTE: if we were able to parse this, the endianness of UCS4 is
943: * already found and in use
944: */
945: if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
946: if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
947: if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
948:
949:
950: if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
951: if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
952: if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
953:
954: if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
955: if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
956: if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
957:
958: if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
959: if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
960: if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
961: if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
962: if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
963: if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
964: if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
965:
966: if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
1.30 daniel 967: if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
1.7 daniel 968: if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
1.30 daniel 969:
970: #ifdef DEBUG_ENCODING
971: fprintf(stderr, "Unknown encoding %s\n", name);
972: #endif
1.7 daniel 973: return(XML_CHAR_ENCODING_ERROR);
974: }
1.9 daniel 975:
1.38 daniel 976: /**
977: * xmlGetCharEncodingName:
978: * @enc: the encoding
979: *
980: * The "canonical" name for XML encoding.
981: * C.f. http://www.w3.org/TR/REC-xml#charencoding
982: * Section 4.3.3 Character Encoding in Entities
983: *
984: * Returns the canonical name for the given encoding
985: */
986:
987: const char*
988: xmlGetCharEncodingName(xmlCharEncoding enc) {
989: switch (enc) {
990: case XML_CHAR_ENCODING_ERROR:
991: return(NULL);
992: case XML_CHAR_ENCODING_NONE:
993: return(NULL);
994: case XML_CHAR_ENCODING_UTF8:
995: return("UTF-8");
996: case XML_CHAR_ENCODING_UTF16LE:
997: return("UTF-16");
998: case XML_CHAR_ENCODING_UTF16BE:
999: return("UTF-16");
1000: case XML_CHAR_ENCODING_EBCDIC:
1001: return("EBCDIC");
1002: case XML_CHAR_ENCODING_UCS4LE:
1003: return("ISO-10646-UCS-4");
1004: case XML_CHAR_ENCODING_UCS4BE:
1005: return("ISO-10646-UCS-4");
1006: case XML_CHAR_ENCODING_UCS4_2143:
1007: return("ISO-10646-UCS-4");
1008: case XML_CHAR_ENCODING_UCS4_3412:
1009: return("ISO-10646-UCS-4");
1010: case XML_CHAR_ENCODING_UCS2:
1011: return("ISO-10646-UCS-2");
1012: case XML_CHAR_ENCODING_8859_1:
1013: return("ISO-8859-1");
1014: case XML_CHAR_ENCODING_8859_2:
1015: return("ISO-8859-2");
1016: case XML_CHAR_ENCODING_8859_3:
1017: return("ISO-8859-3");
1018: case XML_CHAR_ENCODING_8859_4:
1019: return("ISO-8859-4");
1020: case XML_CHAR_ENCODING_8859_5:
1021: return("ISO-8859-5");
1022: case XML_CHAR_ENCODING_8859_6:
1023: return("ISO-8859-6");
1024: case XML_CHAR_ENCODING_8859_7:
1025: return("ISO-8859-7");
1026: case XML_CHAR_ENCODING_8859_8:
1027: return("ISO-8859-8");
1028: case XML_CHAR_ENCODING_8859_9:
1029: return("ISO-8859-9");
1030: case XML_CHAR_ENCODING_2022_JP:
1031: return("ISO-2022-JP");
1032: case XML_CHAR_ENCODING_SHIFT_JIS:
1033: return("Shift-JIS");
1034: case XML_CHAR_ENCODING_EUC_JP:
1035: return("EUC-JP");
1036: }
1037: return(NULL);
1038: }
1039:
1.9 daniel 1040: /****************************************************************
1041: * *
1042: * Char encoding handlers *
1043: * *
1044: ****************************************************************/
1045:
1046: /* the size should be growable, but it's not a big deal ... */
1047: #define MAX_ENCODING_HANDLERS 50
1048: static xmlCharEncodingHandlerPtr *handlers = NULL;
1049: static int nbCharEncodingHandler = 0;
1050:
1051: /*
1052: * The default is UTF-8 for XML, that's also the default used for the
1053: * parser internals, so the default encoding handler is NULL
1054: */
1055:
1056: static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
1057:
1058: /**
1059: * xmlNewCharEncodingHandler:
1.18 daniel 1060: * @name: the encoding name, in UTF-8 format (ASCII actually)
1.9 daniel 1061: * @input: the xmlCharEncodingInputFunc to read that encoding
1062: * @output: the xmlCharEncodingOutputFunc to write that encoding
1063: *
1064: * Create and registers an xmlCharEncodingHandler.
1065: * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
1066: */
1067: xmlCharEncodingHandlerPtr
1.25 daniel 1068: xmlNewCharEncodingHandler(const char *name,
1069: xmlCharEncodingInputFunc input,
1.9 daniel 1070: xmlCharEncodingOutputFunc output) {
1071: xmlCharEncodingHandlerPtr handler;
1072: char upper[500];
1073: int i;
1074: char *up = 0;
1075:
1076: /*
1077: * Keep only the uppercase version of the encoding.
1078: */
1079: if (name == NULL) {
1080: fprintf(stderr, "xmlNewCharEncodingHandler : no name !\n");
1081: return(NULL);
1082: }
1083: for (i = 0;i < 499;i++) {
1084: upper[i] = toupper(name[i]);
1085: if (upper[i] == 0) break;
1086: }
1087: upper[i] = 0;
1.16 daniel 1088: up = xmlMemStrdup(upper);
1.9 daniel 1089: if (up == NULL) {
1090: fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
1091: return(NULL);
1092: }
1093:
1094: /*
1095: * allocate and fill-up an handler block.
1096: */
1097: handler = (xmlCharEncodingHandlerPtr)
1.16 daniel 1098: xmlMalloc(sizeof(xmlCharEncodingHandler));
1.9 daniel 1099: if (handler == NULL) {
1100: fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
1101: return(NULL);
1102: }
1103: handler->input = input;
1104: handler->output = output;
1105: handler->name = up;
1106:
1.49 ! veillard 1107: handler->iconv_in = NULL;
! 1108: handler->iconv_out = NULL;
! 1109:
1.9 daniel 1110: /*
1111: * registers and returns the handler.
1112: */
1113: xmlRegisterCharEncodingHandler(handler);
1.30 daniel 1114: #ifdef DEBUG_ENCODING
1115: fprintf(stderr, "Registered encoding handler for %s\n", name);
1116: #endif
1.9 daniel 1117: return(handler);
1118: }
1119:
1120: /**
1121: * xmlInitCharEncodingHandlers:
1122: *
1123: * Initialize the char encoding support, it registers the default
1124: * encoding supported.
1.18 daniel 1125: * NOTE: while public, this function usually doesn't need to be called
1.9 daniel 1126: * in normal processing.
1127: */
1128: void
1129: xmlInitCharEncodingHandlers(void) {
1.34 daniel 1130: unsigned short int tst = 0x1234;
1131: unsigned char *ptr = (unsigned char *) &tst;
1132:
1.9 daniel 1133: if (handlers != NULL) return;
1134:
1135: handlers = (xmlCharEncodingHandlerPtr *)
1.16 daniel 1136: xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
1.34 daniel 1137:
1138: if (*ptr == 0x12) xmlLittleEndian = 0;
1139: else if (*ptr == 0x34) xmlLittleEndian = 1;
1140: else fprintf(stderr, "Odd problem at endianness detection\n");
1.9 daniel 1141:
1142: if (handlers == NULL) {
1143: fprintf(stderr, "xmlInitCharEncodingHandlers : out of memory !\n");
1144: return;
1145: }
1.10 daniel 1146: xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
1.25 daniel 1147: xmlUTF16LEHandler =
1.28 daniel 1148: xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
1149: xmlUTF16BEHandler =
1150: xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
1.10 daniel 1151: xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
1.47 veillard 1152: xmlNewCharEncodingHandler("ASCII", asciiToUTF8, UTF8Toascii);
1.48 veillard 1153: #ifdef LIBXML_HTML_ENABLED
1154: xmlNewCharEncodingHandler("HTML", NULL, UTF8ToHtml);
1155: #endif
1.9 daniel 1156: }
1157:
1158: /**
1.19 daniel 1159: * xmlCleanupCharEncodingHandlers:
1160: *
1161: * Cleanup the memory allocated for the char encoding support, it
1162: * unregisters all the encoding handlers.
1163: */
1164: void
1165: xmlCleanupCharEncodingHandlers(void) {
1166: if (handlers == NULL) return;
1167:
1168: for (;nbCharEncodingHandler > 0;) {
1169: nbCharEncodingHandler--;
1170: if (handlers[nbCharEncodingHandler] != NULL) {
1.31 daniel 1171: if (handlers[nbCharEncodingHandler]->name != NULL)
1172: xmlFree(handlers[nbCharEncodingHandler]->name);
1.19 daniel 1173: xmlFree(handlers[nbCharEncodingHandler]);
1174: }
1175: }
1176: xmlFree(handlers);
1177: handlers = NULL;
1178: nbCharEncodingHandler = 0;
1179: xmlDefaultCharEncodingHandler = NULL;
1180: }
1181:
1182: /**
1.9 daniel 1183: * xmlRegisterCharEncodingHandler:
1184: * @handler: the xmlCharEncodingHandlerPtr handler block
1185: *
1186: * Register the char encoding handler, surprizing, isn't it ?
1187: */
1188: void
1189: xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
1190: if (handlers == NULL) xmlInitCharEncodingHandlers();
1191: if (handler == NULL) {
1192: fprintf(stderr, "xmlRegisterCharEncodingHandler: NULL handler !\n");
1193: return;
1194: }
1195:
1196: if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
1197: fprintf(stderr,
1198: "xmlRegisterCharEncodingHandler: Too many handler registered\n");
1199: fprintf(stderr, "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
1200: return;
1201: }
1202: handlers[nbCharEncodingHandler++] = handler;
1203: }
1204:
1205: /**
1206: * xmlGetCharEncodingHandler:
1207: * @enc: an xmlCharEncoding value.
1208: *
1209: * Search in the registrered set the handler able to read/write that encoding.
1210: *
1211: * Returns the handler or NULL if not found
1212: */
1213: xmlCharEncodingHandlerPtr
1214: xmlGetCharEncodingHandler(xmlCharEncoding enc) {
1.30 daniel 1215: xmlCharEncodingHandlerPtr handler;
1216:
1.9 daniel 1217: if (handlers == NULL) xmlInitCharEncodingHandlers();
1.25 daniel 1218: switch (enc) {
1219: case XML_CHAR_ENCODING_ERROR:
1220: return(NULL);
1221: case XML_CHAR_ENCODING_NONE:
1222: return(NULL);
1223: case XML_CHAR_ENCODING_UTF8:
1224: return(NULL);
1225: case XML_CHAR_ENCODING_UTF16LE:
1226: return(xmlUTF16LEHandler);
1227: case XML_CHAR_ENCODING_UTF16BE:
1228: return(xmlUTF16BEHandler);
1229: case XML_CHAR_ENCODING_EBCDIC:
1.30 daniel 1230: handler = xmlFindCharEncodingHandler("EBCDIC");
1231: if (handler != NULL) return(handler);
1232: handler = xmlFindCharEncodingHandler("ebcdic");
1233: if (handler != NULL) return(handler);
1234: break;
1.38 daniel 1235: case XML_CHAR_ENCODING_UCS4BE:
1.30 daniel 1236: handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1237: if (handler != NULL) return(handler);
1238: handler = xmlFindCharEncodingHandler("UCS-4");
1239: if (handler != NULL) return(handler);
1240: handler = xmlFindCharEncodingHandler("UCS4");
1241: if (handler != NULL) return(handler);
1242: break;
1.38 daniel 1243: case XML_CHAR_ENCODING_UCS4LE:
1244: handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1245: if (handler != NULL) return(handler);
1246: handler = xmlFindCharEncodingHandler("UCS-4");
1247: if (handler != NULL) return(handler);
1248: handler = xmlFindCharEncodingHandler("UCS4");
1.30 daniel 1249: if (handler != NULL) return(handler);
1250: break;
1.25 daniel 1251: case XML_CHAR_ENCODING_UCS4_2143:
1.30 daniel 1252: break;
1.25 daniel 1253: case XML_CHAR_ENCODING_UCS4_3412:
1.30 daniel 1254: break;
1.25 daniel 1255: case XML_CHAR_ENCODING_UCS2:
1.30 daniel 1256: handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2");
1257: if (handler != NULL) return(handler);
1258: handler = xmlFindCharEncodingHandler("UCS-2");
1259: if (handler != NULL) return(handler);
1260: handler = xmlFindCharEncodingHandler("UCS2");
1261: if (handler != NULL) return(handler);
1262: break;
1.42 veillard 1263:
1264: /*
1265: * We used to keep ISO Latin encodings native in the
1266: * generated data. This led to so many problems that
1267: * this has been removed. One can still change this
1268: * back by registering no-ops encoders for those
1269: */
1.25 daniel 1270: case XML_CHAR_ENCODING_8859_1:
1.42 veillard 1271: handler = xmlFindCharEncodingHandler("ISO-8859-1");
1272: if (handler != NULL) return(handler);
1273: break;
1.25 daniel 1274: case XML_CHAR_ENCODING_8859_2:
1.42 veillard 1275: handler = xmlFindCharEncodingHandler("ISO-8859-2");
1276: if (handler != NULL) return(handler);
1277: break;
1.25 daniel 1278: case XML_CHAR_ENCODING_8859_3:
1.42 veillard 1279: handler = xmlFindCharEncodingHandler("ISO-8859-3");
1280: if (handler != NULL) return(handler);
1281: break;
1.25 daniel 1282: case XML_CHAR_ENCODING_8859_4:
1.42 veillard 1283: handler = xmlFindCharEncodingHandler("ISO-8859-4");
1284: if (handler != NULL) return(handler);
1285: break;
1.25 daniel 1286: case XML_CHAR_ENCODING_8859_5:
1.42 veillard 1287: handler = xmlFindCharEncodingHandler("ISO-8859-5");
1288: if (handler != NULL) return(handler);
1289: break;
1.25 daniel 1290: case XML_CHAR_ENCODING_8859_6:
1.42 veillard 1291: handler = xmlFindCharEncodingHandler("ISO-8859-6");
1292: if (handler != NULL) return(handler);
1293: break;
1.25 daniel 1294: case XML_CHAR_ENCODING_8859_7:
1.42 veillard 1295: handler = xmlFindCharEncodingHandler("ISO-8859-7");
1296: if (handler != NULL) return(handler);
1297: break;
1.25 daniel 1298: case XML_CHAR_ENCODING_8859_8:
1.42 veillard 1299: handler = xmlFindCharEncodingHandler("ISO-8859-8");
1300: if (handler != NULL) return(handler);
1301: break;
1.25 daniel 1302: case XML_CHAR_ENCODING_8859_9:
1.42 veillard 1303: handler = xmlFindCharEncodingHandler("ISO-8859-9");
1304: if (handler != NULL) return(handler);
1305: break;
1306:
1307:
1.25 daniel 1308: case XML_CHAR_ENCODING_2022_JP:
1.30 daniel 1309: handler = xmlFindCharEncodingHandler("ISO-2022-JP");
1310: if (handler != NULL) return(handler);
1311: break;
1.25 daniel 1312: case XML_CHAR_ENCODING_SHIFT_JIS:
1.30 daniel 1313: handler = xmlFindCharEncodingHandler("SHIFT-JIS");
1314: if (handler != NULL) return(handler);
1315: handler = xmlFindCharEncodingHandler("SHIFT_JIS");
1316: if (handler != NULL) return(handler);
1317: handler = xmlFindCharEncodingHandler("Shift_JIS");
1318: if (handler != NULL) return(handler);
1319: break;
1.25 daniel 1320: case XML_CHAR_ENCODING_EUC_JP:
1.30 daniel 1321: handler = xmlFindCharEncodingHandler("EUC-JP");
1322: if (handler != NULL) return(handler);
1323: break;
1324: default:
1325: break;
1.25 daniel 1326: }
1.30 daniel 1327:
1328: #ifdef DEBUG_ENCODING
1329: fprintf(stderr, "No handler found for encoding %d\n", enc);
1330: #endif
1.9 daniel 1331: return(NULL);
1332: }
1333:
1334: /**
1335: * xmlGetCharEncodingHandler:
1336: * @enc: a string describing the char encoding.
1337: *
1338: * Search in the registrered set the handler able to read/write that encoding.
1339: *
1340: * Returns the handler or NULL if not found
1341: */
1342: xmlCharEncodingHandlerPtr
1343: xmlFindCharEncodingHandler(const char *name) {
1.36 daniel 1344: xmlCharEncoding alias;
1.30 daniel 1345: #ifdef LIBXML_ICONV_ENABLED
1.40 daniel 1346: xmlCharEncodingHandlerPtr enc;
1.30 daniel 1347: iconv_t icv_in, icv_out;
1348: #endif /* LIBXML_ICONV_ENABLED */
1349: char upper[100];
1.9 daniel 1350: int i;
1351:
1352: if (handlers == NULL) xmlInitCharEncodingHandlers();
1353: if (name == NULL) return(xmlDefaultCharEncodingHandler);
1354: if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
1355:
1.36 daniel 1356: /*
1357: * Check first for directly registered encoding names
1358: */
1.30 daniel 1359: for (i = 0;i < 99;i++) {
1.9 daniel 1360: upper[i] = toupper(name[i]);
1361: if (upper[i] == 0) break;
1362: }
1363: upper[i] = 0;
1364:
1365: for (i = 0;i < nbCharEncodingHandler; i++)
1.30 daniel 1366: if (!strcmp(upper, handlers[i]->name)) {
1367: #ifdef DEBUG_ENCODING
1368: fprintf(stderr, "Found registered handler for encoding %s\n", name);
1369: #endif
1.9 daniel 1370: return(handlers[i]);
1.30 daniel 1371: }
1.9 daniel 1372:
1.30 daniel 1373: #ifdef LIBXML_ICONV_ENABLED
1374: /* check whether iconv can handle this */
1.31 daniel 1375: icv_in = iconv_open("UTF-8", name);
1376: icv_out = iconv_open(name, "UTF-8");
1.30 daniel 1377: if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
1.43 veillard 1378: enc = (xmlCharEncodingHandlerPtr)
1379: xmlMalloc(sizeof(xmlCharEncodingHandler));
1.32 daniel 1380: if (enc == NULL) {
1381: iconv_close(icv_in);
1382: iconv_close(icv_out);
1383: return(NULL);
1384: }
1.41 daniel 1385: enc->name = xmlMemStrdup(name);
1.30 daniel 1386: enc->input = NULL;
1387: enc->output = NULL;
1388: enc->iconv_in = icv_in;
1389: enc->iconv_out = icv_out;
1390: #ifdef DEBUG_ENCODING
1391: fprintf(stderr, "Found iconv handler for encoding %s\n", name);
1392: #endif
1393: return enc;
1394: } else if ((icv_in != (iconv_t) -1) || icv_out != (iconv_t) -1) {
1395: fprintf(stderr, "iconv : problems with filters for '%s'\n", name);
1396: }
1397: #endif /* LIBXML_ICONV_ENABLED */
1.38 daniel 1398:
1.30 daniel 1399: #ifdef DEBUG_ENCODING
1400: fprintf(stderr, "No handler found for encoding %s\n", name);
1401: #endif
1.38 daniel 1402:
1403: /*
1404: * Fallback using the canonical names
1405: */
1406: alias = xmlParseCharEncoding(name);
1407: if (alias != XML_CHAR_ENCODING_ERROR) {
1408: const char* canon;
1409: canon = xmlGetCharEncodingName(alias);
1410: if ((canon != NULL) && (strcmp(name, canon))) {
1411: return(xmlFindCharEncodingHandler(canon));
1412: }
1413: }
1414:
1.9 daniel 1415: return(NULL);
1.30 daniel 1416: }
1417:
#ifdef LIBXML_ICONV_ENABLED
/**
 * xmlIconvWrapper:
 * @cd:  iconv converter data structure
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of input bytes to convert
 * @inlen:  the length of @in
 *
 * Runs one iconv() conversion step and maps its outcome to the return
 * codes used by the encoding handlers.
 *
 * Returns 0 if success, or
 *     -1 by lack of space (E2BIG), or
 *     -2 if the transcoding fails, i.e. the input is not valid or can't
 *        be represented in the target encoding (EILSEQ), or
 *     -3 if the input ends with an incomplete sequence (EINVAL), or
 *        for any other failure.
 *
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 * Both are set to 0 when @in is NULL.
 */
static int
xmlIconvWrapper(iconv_t cd,
        unsigned char *out, int *outlen,
        const unsigned char *in, int *inlen) {

    size_t icv_inlen = *inlen, icv_outlen = *outlen;
    const char *icv_in = (const char *) in;
    char *icv_out = (char *) out;
    size_t ret;

    /*
     * The second argument of iconv() is declared char ** by POSIX and
     * const char ** by some implementations; going through void *
     * is accepted by both.
     */
    ret = iconv(cd,
                (void *) &icv_in, &icv_inlen,
                &icv_out, &icv_outlen);
    if (in != NULL) {
        *inlen -= icv_inlen;
        *outlen -= icv_outlen;
    } else {
        *inlen = 0;
        *outlen = 0;
    }
    if ((icv_inlen != 0) || (ret == (size_t) -1)) {
#ifdef EILSEQ
        if (errno == EILSEQ) {
            return -2;
        } else
#endif
#ifdef E2BIG
        if (errno == E2BIG) {
            return -1;
        } else
#endif
#ifdef EINVAL
        if (errno == EINVAL) {
            return -3;
        }
#endif
        else {
            return -3;
        }
    }
    return 0;
}
#endif /* LIBXML_ICONV_ENABLED */
1.38 daniel 1480:
1481: /**
1482: * xmlCharEncFirstLine:
1483: * @handler: char enconding transformation data structure
1484: * @out: an xmlBuffer for the output.
1485: * @in: an xmlBuffer for the input
1486: *
1487: * Front-end for the encoding handler input function, but handle only
1488: * the very first line, i.e. limit itself to 45 chars.
1489: *
1490: * Returns the number of byte written if success, or
1491: * -1 general error
1492: * -2 if the transcoding fails (for *in is not valid utf8 string or
1493: * the result of transformation can't fit into the encoding we want), or
1494: */
1495: int
1496: xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1497: xmlBufferPtr in) {
1498: int ret = -2;
1499: int written;
1500: int toconv;
1501:
1502: if (handler == NULL) return(-1);
1503: if (out == NULL) return(-1);
1504: if (in == NULL) return(-1);
1505:
1506: written = out->size - out->use;
1507: toconv = in->use;
1508: if (toconv * 2 >= written) {
1.39 daniel 1509: xmlBufferGrow(out, toconv);
1.38 daniel 1510: written = out->size - out->use - 1;
1511: }
1.39 daniel 1512:
1.38 daniel 1513: /*
1514: * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38
1515: * 45 chars should be sufficient to reach the end of the encoding
1516: * decalration without going too far inside the document content.
1517: */
1518: written = 45;
1519:
1520: if (handler->input != NULL) {
1521: ret = handler->input(&out->content[out->use], &written,
1522: in->content, &toconv);
1523: xmlBufferShrink(in, toconv);
1524: out->use += written;
1525: out->content[out->use] = 0;
1526: }
1527: #ifdef LIBXML_ICONV_ENABLED
1528: else if (handler->iconv_in != NULL) {
1529: ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
1530: &written, in->content, &toconv);
1531: xmlBufferShrink(in, toconv);
1532: out->use += written;
1533: out->content[out->use] = 0;
1534: if (ret == -1) ret = -3;
1535: }
1536: #endif /* LIBXML_ICONV_ENABLED */
1537: #ifdef DEBUG_ENCODING
1538: switch (ret) {
1539: case 0:
1540: fprintf(stderr, "converted %d bytes to %d bytes of input\n",
1541: toconv, written);
1542: break;
1543: case -1:
1544: fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
1545: toconv, written, in->use);
1546: break;
1547: case -2:
1548: fprintf(stderr, "input conversion failed due to input error\n");
1549: break;
1550: case -3:
1551: fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
1552: toconv, written, in->use);
1553: break;
1554: default:
1555: fprintf(stderr,"Unknown input conversion failed %d\n", ret);
1556: }
1557: #endif
1558: /*
1559: * Ignore when input buffer is not on a boundary
1560: */
1561: if (ret == -3) ret = 0;
1562: if (ret == -1) ret = 0;
1563: return(ret);
1564: }
1.30 daniel 1565:
1566: /**
1567: * xmlCharEncInFunc:
1568: * @handler: char enconding transformation data structure
1.31 daniel 1569: * @out: an xmlBuffer for the output.
1570: * @in: an xmlBuffer for the input
1.30 daniel 1571: *
1572: * Generic front-end for the encoding handler input function
1573: *
1.31 daniel 1574: * Returns the number of byte written if success, or
1575: * -1 general error
1.30 daniel 1576: * -2 if the transcoding fails (for *in is not valid utf8 string or
1577: * the result of transformation can't fit into the encoding we want), or
1578: */
1579: int
1.31 daniel 1580: xmlCharEncInFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1581: xmlBufferPtr in) {
1.30 daniel 1582: int ret = -2;
1.31 daniel 1583: int written;
1584: int toconv;
1.30 daniel 1585:
1.31 daniel 1586: if (handler == NULL) return(-1);
1587: if (out == NULL) return(-1);
1588: if (in == NULL) return(-1);
1589:
1590: written = out->size - out->use;
1591: toconv = in->use;
1592: if (toconv * 2 >= written) {
1593: xmlBufferGrow(out, toconv * 2);
1.33 daniel 1594: written = out->size - out->use - 1;
1.31 daniel 1595: }
1.30 daniel 1596: if (handler->input != NULL) {
1.32 daniel 1597: ret = handler->input(&out->content[out->use], &written,
1.31 daniel 1598: in->content, &toconv);
1599: xmlBufferShrink(in, toconv);
1600: out->use += written;
1.33 daniel 1601: out->content[out->use] = 0;
1.30 daniel 1602: }
1603: #ifdef LIBXML_ICONV_ENABLED
1.31 daniel 1604: else if (handler->iconv_in != NULL) {
1605: ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
1606: &written, in->content, &toconv);
1607: xmlBufferShrink(in, toconv);
1608: out->use += written;
1.33 daniel 1609: out->content[out->use] = 0;
1610: if (ret == -1) ret = -3;
1.30 daniel 1611: }
1612: #endif /* LIBXML_ICONV_ENABLED */
1.39 daniel 1613: switch (ret) {
1.30 daniel 1614: #ifdef DEBUG_ENCODING
1615: case 0:
1616: fprintf(stderr, "converted %d bytes to %d bytes of input\n",
1.31 daniel 1617: toconv, written);
1.30 daniel 1618: break;
1619: case -1:
1.31 daniel 1620: fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
1621: toconv, written, in->use);
1.30 daniel 1622: break;
1623: case -3:
1.31 daniel 1624: fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
1625: toconv, written, in->use);
1.30 daniel 1626: break;
1.39 daniel 1627: #endif
1628: case -2:
1629: fprintf(stderr, "input conversion failed due to input error\n");
1630: fprintf(stderr, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
1631: in->content[0], in->content[1],
1632: in->content[2], in->content[3]);
1.30 daniel 1633: }
1.33 daniel 1634: /*
1635: * Ignore when input buffer is not on a boundary
1636: */
1637: if (ret == -3) ret = 0;
1.30 daniel 1638: return(ret);
1639: }
1640:
1641: /**
1642: * xmlCharEncOutFunc:
1643: * @handler: char enconding transformation data structure
1.31 daniel 1644: * @out: an xmlBuffer for the output.
1645: * @in: an xmlBuffer for the input
1646: *
1647: * Generic front-end for the encoding handler output function
1.35 daniel 1648: * a first call with @in == NULL has to be made firs to initiate the
1649: * output in case of non-stateless encoding needing to initiate their
1650: * state or the output (like the BOM in UTF16).
1.39 daniel 1651: * In case of UTF8 sequence conversion errors for the given encoder,
1652: * the content will be automatically remapped to a CharRef sequence.
1.30 daniel 1653: *
1.31 daniel 1654: * Returns the number of byte written if success, or
1655: * -1 general error
1.30 daniel 1656: * -2 if the transcoding fails (for *in is not valid utf8 string or
1657: * the result of transformation can't fit into the encoding we want), or
1658: */
1659: int
1.31 daniel 1660: xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1661: xmlBufferPtr in) {
1.30 daniel 1662: int ret = -2;
1.31 daniel 1663: int written;
1664: int toconv;
1.39 daniel 1665: int output = 0;
1.31 daniel 1666:
1667: if (handler == NULL) return(-1);
1668: if (out == NULL) return(-1);
1.39 daniel 1669:
1670: retry:
1671:
1.35 daniel 1672: written = out->size - out->use;
1673:
1.39 daniel 1674: /*
1675: * First specific handling of in = NULL, i.e. the initialization call
1676: */
1.35 daniel 1677: if (in == NULL) {
1678: toconv = 0;
1679: if (handler->output != NULL) {
1680: ret = handler->output(&out->content[out->use], &written,
1681: NULL, &toconv);
1682: out->use += written;
1683: out->content[out->use] = 0;
1684: }
1685: #ifdef LIBXML_ICONV_ENABLED
1686: else if (handler->iconv_out != NULL) {
1687: ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
1688: &written, NULL, &toconv);
1689: out->use += written;
1690: out->content[out->use] = 0;
1691: }
1692: #endif /* LIBXML_ICONV_ENABLED */
1693: #ifdef DEBUG_ENCODING
1694: fprintf(stderr, "initialized encoder\n");
1695: #endif
1696: return(0);
1697: }
1.30 daniel 1698:
1.39 daniel 1699: /*
1700: * Convertion itself.
1701: */
1.33 daniel 1702: toconv = in->use;
1703: if (toconv * 2 >= written) {
1704: xmlBufferGrow(out, toconv * 2);
1705: written = out->size - out->use - 1;
1706: }
1.30 daniel 1707: if (handler->output != NULL) {
1.33 daniel 1708: ret = handler->output(&out->content[out->use], &written,
1.35 daniel 1709: in->content, &toconv);
1.31 daniel 1710: xmlBufferShrink(in, toconv);
1711: out->use += written;
1.33 daniel 1712: out->content[out->use] = 0;
1.30 daniel 1713: }
1714: #ifdef LIBXML_ICONV_ENABLED
1715: else if (handler->iconv_out != NULL) {
1.31 daniel 1716: ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
1717: &written, in->content, &toconv);
1718: xmlBufferShrink(in, toconv);
1719: out->use += written;
1.33 daniel 1720: out->content[out->use] = 0;
1721: if (ret == -1) ret = -3;
1.30 daniel 1722: }
1723: #endif /* LIBXML_ICONV_ENABLED */
1.46 veillard 1724: else {
1725: fprintf(stderr, "xmlCharEncOutFunc: no output function !\n");
1726: return(-1);
1727: }
1.39 daniel 1728:
1729: if (ret >= 0) output += ret;
1730:
1731: /*
1732: * Attempt to handle error cases
1733: */
1734: switch (ret) {
1.30 daniel 1735: #ifdef DEBUG_ENCODING
1736: case 0:
1737: fprintf(stderr, "converted %d bytes to %d bytes of output\n",
1.31 daniel 1738: toconv, written);
1.30 daniel 1739: break;
1740: case -1:
1741: fprintf(stderr, "output conversion failed by lack of space\n");
1742: break;
1743: case -3:
1.31 daniel 1744: fprintf(stderr,"converted %d bytes to %d bytes of output %d left\n",
1745: toconv, written, in->use);
1.30 daniel 1746: break;
1.39 daniel 1747: #endif
1748: case -2: {
1749: int len = in->use;
1.43 veillard 1750: const xmlChar *utf = (const xmlChar *) in->content;
1.39 daniel 1751: int cur;
1752:
1753: cur = xmlGetUTF8Char(utf, &len);
1754: if (cur > 0) {
1755: xmlChar charref[20];
1756:
1757: #ifdef DEBUG_ENCODING
1758: fprintf(stderr, "handling output conversion error\n");
1759: fprintf(stderr, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
1760: in->content[0], in->content[1],
1761: in->content[2], in->content[3]);
1762: #endif
1763: /*
1764: * Removes the UTF8 sequence, and replace it by a charref
1765: * and continue the transcoding phase, hoping the error
1766: * did not mangle the encoder state.
1767: */
1.43 veillard 1768: sprintf((char *) charref, "&#x%X;", cur);
1.39 daniel 1769: xmlBufferShrink(in, len);
1770: xmlBufferAddHead(in, charref, -1);
1771:
1772: goto retry;
1773: } else {
1774: fprintf(stderr, "output conversion failed due to conv error\n");
1775: fprintf(stderr, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
1776: in->content[0], in->content[1],
1777: in->content[2], in->content[3]);
1778: }
1779: break;
1780: }
1.30 daniel 1781: }
1782: return(ret);
1783: }
1784:
1785: /**
1786: * xmlCharEncCloseFunc:
1787: * @handler: char enconding transformation data structure
1788: *
1789: * Generic front-end for hencoding handler close function
1790: *
1791: * Returns 0 if success, or -1 in case of error
1792: */
1793: int
1794: xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
1795: int ret = 0;
1.31 daniel 1796: if (handler == NULL) return(-1);
1797: if (handler->name == NULL) return(-1);
1.30 daniel 1798: #ifdef LIBXML_ICONV_ENABLED
1.31 daniel 1799: /*
1800: * Iconv handlers can be oused only once, free the whole block.
1801: * and the associated icon resources.
1802: */
1.32 daniel 1803: if ((handler->iconv_out != NULL) || (handler->iconv_in != NULL)) {
1804: if (handler->name != NULL)
1805: xmlFree(handler->name);
1806: handler->name = NULL;
1807: if (handler->iconv_out != NULL) {
1808: if (iconv_close(handler->iconv_out))
1809: ret = -1;
1810: handler->iconv_out = NULL;
1811: }
1812: if (handler->iconv_in != NULL) {
1813: if (iconv_close(handler->iconv_in))
1814: ret = -1;
1815: handler->iconv_in = NULL;
1816: }
1817: xmlFree(handler);
1.30 daniel 1818: }
1819: #endif /* LIBXML_ICONV_ENABLED */
1820: #ifdef DEBUG_ENCODING
1821: if (ret)
1822: fprintf(stderr, "failed to close the encoding handler\n");
1823: else
1824: fprintf(stderr, "closed the encoding handler\n");
1825:
1826: #endif
1827: return(ret);
1.9 daniel 1828: }
1829:
Webmaster