Annotation of XML/encoding.c, revision 1.37
1.1 daniel 1: /*
2: * encoding.c : implements the encoding conversion functions needed for XML
3: *
4: * Related specs:
5: * rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies
6: * [ISO-10646] UTF-8 and UTF-16 in Annexes
7: * [ISO-8859-1] ISO Latin-1 characters codes.
8: * [UNICODE] The Unicode Consortium, "The Unicode Standard --
9: * Worldwide Character Encoding -- Version 1.0", Addison-
10: * Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is
11: * described in Unicode Technical Report #4.
12: * [US-ASCII] Coded Character Set--7-bit American Standard Code for
13: * Information Interchange, ANSI X3.4-1986.
14: *
1.9 daniel 15: * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
1.1 daniel 16: *
17: * See Copyright for the status of this software.
18: *
19: * Daniel.Veillard@w3.org
20: */
21:
1.21 daniel 22: #ifdef WIN32
23: #include "win32config.h"
24: #else
1.14 daniel 25: #include "config.h"
1.17 daniel 26: #endif
27:
28: #include <stdio.h>
29: #include <string.h>
30:
31: #ifdef HAVE_CTYPE_H
1.7 daniel 32: #include <ctype.h>
1.17 daniel 33: #endif
1.20 daniel 34: #ifdef HAVE_STDLIB_H
35: #include <stdlib.h>
36: #endif
1.30 daniel 37: #include <libxml/xmlversion.h>
38: #ifdef LIBXML_ICONV_ENABLED
39: #ifdef HAVE_ERRNO_H
40: #include <errno.h>
41: #endif
42: #endif
1.29 daniel 43: #include <libxml/encoding.h>
44: #include <libxml/xmlmemory.h>
1.3 daniel 45:
1.25 daniel 46: xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
47: xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
48:
1.30 daniel 49: #ifdef LIBXML_ICONV_ENABLED
1.37 ! daniel 50: #if 0
1.30 daniel 51: #define DEBUG_ENCODING /* Define this to get encoding traces */
52: #endif
1.33 daniel 53: #endif
1.30 daniel 54:
1.34 daniel 55: static int xmlLittleEndian = 1;
56:
1.3 daniel 57: /*
58: * From rfc2044: encoding of the Unicode values on UTF-8:
59: *
60: * UCS-4 range (hex.) UTF-8 octet sequence (binary)
61: * 0000 0000-0000 007F 0xxxxxxx
62: * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
63: * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
64: *
65: * I hope we won't use values > 0xFFFF anytime soon !
66: */
1.1 daniel 67:
/**
 * xmlCheckUTF8: Check utf-8 string for legality.
 * @utf: Pointer to putative utf-8 encoded string.
 *
 * Checks @utf for being valid utf-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer utf-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * The accepted byte patterns are:
 *    0xxxxxxx                                1-byte code
 *    110xxxxx 10xxxxxx                       2-byte code
 *    1110xxxx 10xxxxxx 10xxxxxx              3-byte code
 *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx     4-byte code
 * A continuation byte (10xxxxxx) in leading position is rejected.
 *
 * Return value: true (1) if @utf is valid, 0 otherwise (including when
 * @utf is NULL).
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    int ix = 0;
    unsigned char c;

    if (utf == NULL)
        return(0);

    while ((c = utf[ix]) != 0) {
        if ((c & 0x80) == 0x00) {
            /* 1-byte code */
            ix++;
        } else if ((c & 0xe0) == 0xc0) {
            /* 2-byte code */
            if ((utf[ix + 1] & 0xc0) != 0x80)
                return(0);
            ix += 2;
        } else if ((c & 0xf0) == 0xe0) {
            /*
             * 3-byte code; the tests short-circuit, so a byte is only
             * read once the previous one is known to be non-zero.
             */
            if (((utf[ix + 1] & 0xc0) != 0x80) ||
                ((utf[ix + 2] & 0xc0) != 0x80))
                return(0);
            ix += 3;
        } else if ((c & 0xf8) == 0xf0) {
            /* 4-byte code */
            if (((utf[ix + 1] & 0xc0) != 0x80) ||
                ((utf[ix + 2] & 0xc0) != 0x80) ||
                ((utf[ix + 3] & 0xc0) != 0x80))
                return(0);
            ix += 4;
        } else {
            /* stray continuation byte or lead byte of a 5+ byte form */
            return(0);
        }
    }
    return(1);
}
111:
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  the length of @in
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out. Conversion stops early, without error, when the
 * next character would not fit in @out.
 *
 * Returns 0 (this conversion cannot fail).
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
int
isolat1ToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
    const unsigned char* instart = in;
    unsigned char* outend = out + *outlen;
    const unsigned char* inend = in + *inlen;
    unsigned char c;

    while (in < inend) {
        c = *in;
        if (c < 0x80) {
            if (out >= outend)
                break;
            *out++ = c;
        } else {
            /* every Latin-1 char above 0x7F needs exactly two octets */
            if (outend - out < 2)
                break;
            *out++ = 0xC0 | (c >> 6);
            *out++ = 0x80 | (0x3F & c);
        }
        /* only advance once the character has been fully emitted */
        in++;
    }
    *outlen = (int) (out - outstart);
    /* consumed count is measured from the start of the input block */
    *inlen = (int) (in - instart);

    return(0);
}
154:
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out.
 * TODO: UTF8Toisolat1 need a fallback mechanism ...
 *
 * Returns 0 if success, -1 if @out is too small to hold the next
 * character, or -2 if the input holds a sequence that cannot be
 * expressed in ISO Latin 1.
 * In every case the value of @inlen after return is the number of octets
 * consumed (whole characters only) and the value of @outlen after return
 * is the number of octets produced. A lead byte whose trailing byte has
 * not arrived yet is left unconsumed and is not an error.
 */
int
UTF8Toisolat1(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
    const unsigned char* instart = in;
    const unsigned char* processed = in;
    unsigned char* outend = out + *outlen;
    const unsigned char* inend = in + *inlen;
    unsigned char c;
    int ret = 0;

    while (in < inend) {
        c = *in++;
        if (c < 0x80) {
            if (out >= outend) {
                ret = -1;
                break;
            }
            *out++ = c;
        } else if (in == inend) {
            /* multi-byte character split across blocks: wait for more */
            break;
        } else if (((c & 0xFC) == 0xC0) && ((*in & 0xC0) == 0x80)) {
            /* a two byte utf-8 that can be encoded as ISO Latin 1 */
            if (out >= outend) {
                ret = -1;
                break;
            }
            *out++ = ((c & 0x03) << 6) | (*in++ & 0x3F);
        } else {
            ret = -2;
            break;
        }
        processed = in;
    }
    *outlen = (int) (out - outstart);
    *inlen = (int) (processed - instart);
    return(ret);
}
204:
/**
 * UTF16LEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16LE passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16LE code units in and try to convert it to an
 * UTF-8 block of chars out. The input is decoded octet by octet, so the
 * result does not depend on the endianness of the host nor on the
 * alignment of @inb.
 *
 * Conversion stops early, without error, when the next character would
 * not fit entirely in @out or when a surrogate pair is split at the end
 * of the input; a partial UTF-8 sequence is never written.
 *
 * Returns 0 if success, or -2 if the transcoding fails (a high surrogate
 * is not followed by a low surrogate).
 * The value of *inlenb after return is the number of octets consumed
 * (whole characters only).
 * The value of *outlen after return is the number of octets produced.
 */
int
UTF16LEToUTF8(unsigned char* out, int *outlen,
              const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + *outlen;
    const unsigned char* in = inb;
    const unsigned char* processed = inb;
    const unsigned char* inend;
    unsigned int c, d;
    unsigned char lead;
    int bits;
    int ret = 0;

    /* ignore a dangling odd octet, it cannot form a code unit */
    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + *inlenb;

    while (in < inend) {
        c = (unsigned int) in[0] | ((unsigned int) in[1] << 8);
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (in >= inend)
                break;                   /* second half not received yet */
            d = (unsigned int) in[0] | ((unsigned int) in[1] << 8);
            in += 2;
            if ((d & 0xFC00) != 0xDC00) {
                ret = -2;
                break;
            }
            c = (((c & 0x03FF) << 10) | (d & 0x03FF)) + 0x10000;
        }

        /* assertion: c is a single UCS-4 value */
        if      (c <    0x80) { lead = c;                         bits = -6; }
        else if (c <   0x800) { lead = ((c >>  6) & 0x1F) | 0xC0; bits =  0; }
        else if (c < 0x10000) { lead = ((c >> 12) & 0x0F) | 0xE0; bits =  6; }
        else                  { lead = ((c >> 18) & 0x07) | 0xF0; bits = 12; }

        /* the sequence is 1 lead octet plus (bits / 6 + 1) trailing ones */
        if (outend - out < 2 + bits / 6)
            break;

        *out++ = lead;
        for ( ; bits >= 0; bits -= 6)
            *out++ = ((c >> bits) & 0x3F) | 0x80;

        processed = in;
    }
    *outlen = (int) (out - outstart);
    *inlenb = (int) (processed - inb);
    return(ret);
}
292:
/**
 * UTF8ToUTF16LE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb
 * @in:  a pointer to an array of UTF-8 chars, or NULL
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
 * block of chars out. The output is written octet by octet, so the result
 * does not depend on the endianness of the host nor on the alignment of
 * @outb.
 * When @in is NULL the call is an initialization request: the UTF-16LE
 * Byte Order Mark (FF FE) is written to @outb if it fits.
 * TODO: UTF8ToUTF16LE need a fallback mechanism ...
 *
 * Conversion stops early, without error, when the next character would
 * not fit in @outb or when a multi-byte character is split at the end of
 * the input.
 *
 * Returns 0 if success, 2 when the Byte Order Mark was written, or -2 if
 * the transcoding failed (malformed UTF-8 or a value that UTF-16 cannot
 * represent).
 * The value of *inlen after return is the number of octets consumed
 * (whole characters only).
 * The value of *outlen after return is the number of octets produced.
 */
int
UTF8ToUTF16LE(unsigned char* outb, int *outlen,
              const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* processed = in;
    const unsigned char* inend;
    unsigned int c, d, hi, lo;
    int trailing;
    int ret = 0;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFF;
            outb[1] = 0xFE;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            fprintf(stderr, "Added FFFE Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }

    inend = in + *inlen;
    /* only whole 16-bit units can be written */
    outend = outb + (*outlen / 2) * 2;

    while (in < inend) {
        d = *in++;
        if (d < 0x80) { c = d; trailing = 0; }
        else if (d < 0xC0) {
            /* trailing byte in leading position */
            ret = -2;
            break;
        }
        else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else {
            /* no chance for this in UTF-16 */
            ret = -2;
            break;
        }

        /* character split across blocks: wait for the rest */
        if (inend - in < trailing)
            break;

        for ( ; trailing > 0; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80) {
                ret = -2;
                break;
            }
            c = (c << 6) | (d & 0x3F);
        }
        if (ret != 0)
            break;

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            *out++ = c & 0xFF;
            *out++ = (c >> 8) & 0xFF;
        } else if (c < 0x110000) {
            if (outend - out < 4)
                break;
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            *out++ = hi & 0xFF;
            *out++ = (hi >> 8) & 0xFF;
            *out++ = lo & 0xFF;
            *out++ = (lo >> 8) & 0xFF;
        } else {
            /* beyond the range reachable with surrogate pairs */
            ret = -2;
            break;
        }
        processed = in;
    }
    *outlen = (int) (out - outb);
    *inlen = (int) (processed - instart);
    return(ret);
}
410:
/**
 * UTF16BEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16BE passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16BE code units in and try to convert it to an
 * UTF-8 block of chars out. The input is decoded octet by octet, so the
 * result does not depend on the endianness of the host nor on the
 * alignment of @inb.
 *
 * Conversion stops early, without error, when the next character would
 * not fit entirely in @out or when a surrogate pair is split at the end
 * of the input; a partial UTF-8 sequence is never written.
 *
 * Returns 0 if success, or -2 if the transcoding fails (a high surrogate
 * is not followed by a low surrogate).
 * The value of *inlenb after return is the number of octets consumed
 * (whole characters only).
 * The value of *outlen after return is the number of octets produced.
 */
int
UTF16BEToUTF8(unsigned char* out, int *outlen,
              const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    unsigned char* outend = out + *outlen;
    const unsigned char* in = inb;
    const unsigned char* processed = inb;
    const unsigned char* inend;
    unsigned int c, d;
    unsigned char lead;
    int bits;
    int ret = 0;

    /* ignore a dangling odd octet, it cannot form a code unit */
    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + *inlenb;

    while (in < inend) {
        c = ((unsigned int) in[0] << 8) | (unsigned int) in[1];
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (in >= inend)
                break;                   /* second half not received yet */
            d = ((unsigned int) in[0] << 8) | (unsigned int) in[1];
            in += 2;
            if ((d & 0xFC00) != 0xDC00) {
                ret = -2;
                break;
            }
            c = (((c & 0x03FF) << 10) | (d & 0x03FF)) + 0x10000;
        }

        /* assertion: c is a single UCS-4 value */
        if      (c <    0x80) { lead = c;                         bits = -6; }
        else if (c <   0x800) { lead = ((c >>  6) & 0x1F) | 0xC0; bits =  0; }
        else if (c < 0x10000) { lead = ((c >> 12) & 0x0F) | 0xE0; bits =  6; }
        else                  { lead = ((c >> 18) & 0x07) | 0xF0; bits = 12; }

        /* the sequence is 1 lead octet plus (bits / 6 + 1) trailing ones */
        if (outend - out < 2 + bits / 6)
            break;

        *out++ = lead;
        for ( ; bits >= 0; bits -= 6)
            *out++ = ((c >> bits) & 0x3F) | 0x80;

        processed = in;
    }
    *outlen = (int) (out - outstart);
    *inlenb = (int) (processed - inb);
    return(ret);
}
502:
/**
 * UTF8ToUTF16BE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb
 * @in:  a pointer to an array of UTF-8 chars, or NULL
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
 * block of chars out. The output is written octet by octet, so the result
 * does not depend on the endianness of the host nor on the alignment of
 * @outb.
 * When @in is NULL the call is an initialization request: the UTF-16BE
 * Byte Order Mark (FE FF) is written to @outb if it fits.
 * TODO: UTF8ToUTF16BE need a fallback mechanism ...
 *
 * Conversion stops early, without error, when the next character would
 * not fit in @outb or when a multi-byte character is split at the end of
 * the input.
 *
 * Returns 0 if success, 2 when the Byte Order Mark was written, or -2 if
 * the transcoding failed (malformed UTF-8 or a value that UTF-16 cannot
 * represent).
 * The value of *inlen after return is the number of octets consumed
 * (whole characters only).
 * The value of *outlen after return is the number of octets produced.
 */
int
UTF8ToUTF16BE(unsigned char* outb, int *outlen,
              const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* processed = in;
    const unsigned char* inend;
    unsigned int c, d, hi, lo;
    int trailing;
    int ret = 0;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFE;
            outb[1] = 0xFF;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            fprintf(stderr, "Added FEFF Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }

    inend = in + *inlen;
    /* only whole 16-bit units can be written */
    outend = outb + (*outlen / 2) * 2;

    while (in < inend) {
        d = *in++;
        if (d < 0x80) { c = d; trailing = 0; }
        else if (d < 0xC0) {
            /* trailing byte in leading position */
            ret = -2;
            break;
        }
        else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else {
            /* no chance for this in UTF-16 */
            ret = -2;
            break;
        }

        /* character split across blocks: wait for the rest */
        if (inend - in < trailing)
            break;

        for ( ; trailing > 0; trailing--) {
            d = *in++;
            if ((d & 0xC0) != 0x80) {
                ret = -2;
                break;
            }
            c = (c << 6) | (d & 0x3F);
        }
        if (ret != 0)
            break;

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            *out++ = (c >> 8) & 0xFF;
            *out++ = c & 0xFF;
        } else if (c < 0x110000) {
            if (outend - out < 4)
                break;
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            *out++ = (hi >> 8) & 0xFF;
            *out++ = hi & 0xFF;
            *out++ = (lo >> 8) & 0xFF;
            *out++ = lo & 0xFF;
        } else {
            /* beyond the range reachable with surrogate pairs */
            ret = -2;
            break;
        }
        processed = in;
    }
    *outlen = (int) (out - outb);
    *inlen = (int) (processed - instart);
    return(ret);
}
617:
1.7 daniel 618: /**
619: * xmlDetectCharEncoding:
620: * @in: a pointer to the first bytes of the XML entity, must be at least
621: * 4 bytes long.
1.25 daniel 622: * @len: pointer to the length of the buffer
1.7 daniel 623: *
624: * Guess the encoding of the entity using the first bytes of the entity content
625: * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
626: *
627: * Returns one of the XML_CHAR_ENCODING_... values.
628: */
629: xmlCharEncoding
1.25 daniel 630: xmlDetectCharEncoding(const unsigned char* in, int len)
1.7 daniel 631: {
1.25 daniel 632: if (len >= 4) {
633: if ((in[0] == 0x00) && (in[1] == 0x00) &&
634: (in[2] == 0x00) && (in[3] == 0x3C))
635: return(XML_CHAR_ENCODING_UCS4BE);
636: if ((in[0] == 0x3C) && (in[1] == 0x00) &&
637: (in[2] == 0x00) && (in[3] == 0x00))
638: return(XML_CHAR_ENCODING_UCS4LE);
639: if ((in[0] == 0x00) && (in[1] == 0x00) &&
640: (in[2] == 0x3C) && (in[3] == 0x00))
641: return(XML_CHAR_ENCODING_UCS4_2143);
642: if ((in[0] == 0x00) && (in[1] == 0x3C) &&
643: (in[2] == 0x00) && (in[3] == 0x00))
644: return(XML_CHAR_ENCODING_UCS4_3412);
645: if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
646: (in[2] == 0xA7) && (in[3] == 0x94))
647: return(XML_CHAR_ENCODING_EBCDIC);
648: if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
649: (in[2] == 0x78) && (in[3] == 0x6D))
650: return(XML_CHAR_ENCODING_UTF8);
651: }
652: if (len >= 2) {
653: if ((in[0] == 0xFE) && (in[1] == 0xFF))
654: return(XML_CHAR_ENCODING_UTF16BE);
655: if ((in[0] == 0xFF) && (in[1] == 0xFE))
656: return(XML_CHAR_ENCODING_UTF16LE);
657: }
1.7 daniel 658: return(XML_CHAR_ENCODING_NONE);
659: }
660:
661: /**
662: * xmlParseCharEncoding:
1.18 daniel 663: * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
1.7 daniel 664: *
665: * Conpare the string to the known encoding schemes already known. Note
666: * that the comparison is case insensitive accordingly to the section
667: * [XML] 4.3.3 Character Encoding in Entities.
668: *
669: * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
670: * if not recognized.
671: */
672: xmlCharEncoding
1.8 daniel 673: xmlParseCharEncoding(const char* name)
1.7 daniel 674: {
675: char upper[500];
676: int i;
677:
678: for (i = 0;i < 499;i++) {
679: upper[i] = toupper(name[i]);
680: if (upper[i] == 0) break;
681: }
682: upper[i] = 0;
683:
684: if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
685: if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
686: if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
687:
688: /*
689: * NOTE: if we were able to parse this, the endianness of UTF16 is
690: * already found and in use
691: */
692: if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
693: if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
694:
695: if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
696: if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
697: if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
698:
699: /*
700: * NOTE: if we were able to parse this, the endianness of UCS4 is
701: * already found and in use
702: */
703: if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
704: if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
705: if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
706:
707:
708: if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
709: if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
710: if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
711:
712: if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
713: if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
714: if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
715:
716: if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
717: if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
718: if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
719: if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
720: if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
721: if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
722: if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
723:
724: if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
1.30 daniel 725: if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
1.7 daniel 726: if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
1.30 daniel 727:
728: #ifdef DEBUG_ENCODING
729: fprintf(stderr, "Unknown encoding %s\n", name);
730: #endif
1.7 daniel 731: return(XML_CHAR_ENCODING_ERROR);
732: }
1.9 daniel 733:
734: /****************************************************************
735: * *
736: * Char encoding handlers *
737: * *
738: ****************************************************************/
739:
740: /* the size should be growable, but it's not a big deal ... */
741: #define MAX_ENCODING_HANDLERS 50
742: static xmlCharEncodingHandlerPtr *handlers = NULL;
743: static int nbCharEncodingHandler = 0;
744:
745: /*
746: * The default is UTF-8 for XML, that's also the default used for the
747: * parser internals, so the default encoding handler is NULL
748: */
749:
750: static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
751:
752: /**
753: * xmlNewCharEncodingHandler:
1.18 daniel 754: * @name: the encoding name, in UTF-8 format (ASCII actually)
1.9 daniel 755: * @input: the xmlCharEncodingInputFunc to read that encoding
756: * @output: the xmlCharEncodingOutputFunc to write that encoding
757: *
758: * Create and registers an xmlCharEncodingHandler.
759: * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
760: */
761: xmlCharEncodingHandlerPtr
1.25 daniel 762: xmlNewCharEncodingHandler(const char *name,
763: xmlCharEncodingInputFunc input,
1.9 daniel 764: xmlCharEncodingOutputFunc output) {
765: xmlCharEncodingHandlerPtr handler;
766: char upper[500];
767: int i;
768: char *up = 0;
769:
770: /*
771: * Keep only the uppercase version of the encoding.
772: */
773: if (name == NULL) {
774: fprintf(stderr, "xmlNewCharEncodingHandler : no name !\n");
775: return(NULL);
776: }
777: for (i = 0;i < 499;i++) {
778: upper[i] = toupper(name[i]);
779: if (upper[i] == 0) break;
780: }
781: upper[i] = 0;
1.16 daniel 782: up = xmlMemStrdup(upper);
1.9 daniel 783: if (up == NULL) {
784: fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
785: return(NULL);
786: }
787:
788: /*
789: * allocate and fill-up an handler block.
790: */
791: handler = (xmlCharEncodingHandlerPtr)
1.16 daniel 792: xmlMalloc(sizeof(xmlCharEncodingHandler));
1.9 daniel 793: if (handler == NULL) {
794: fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
795: return(NULL);
796: }
797: handler->input = input;
798: handler->output = output;
799: handler->name = up;
800:
801: /*
802: * registers and returns the handler.
803: */
804: xmlRegisterCharEncodingHandler(handler);
1.30 daniel 805: #ifdef DEBUG_ENCODING
806: fprintf(stderr, "Registered encoding handler for %s\n", name);
807: #endif
1.9 daniel 808: return(handler);
809: }
810:
811: /**
812: * xmlInitCharEncodingHandlers:
813: *
814: * Initialize the char encoding support, it registers the default
815: * encoding supported.
1.18 daniel 816: * NOTE: while public, this function usually doesn't need to be called
1.9 daniel 817: * in normal processing.
818: */
819: void
820: xmlInitCharEncodingHandlers(void) {
1.34 daniel 821: unsigned short int tst = 0x1234;
822: unsigned char *ptr = (unsigned char *) &tst;
823:
1.9 daniel 824: if (handlers != NULL) return;
825:
826: handlers = (xmlCharEncodingHandlerPtr *)
1.16 daniel 827: xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
1.34 daniel 828:
829: if (*ptr == 0x12) xmlLittleEndian = 0;
830: else if (*ptr == 0x34) xmlLittleEndian = 1;
831: else fprintf(stderr, "Odd problem at endianness detection\n");
1.9 daniel 832:
833: if (handlers == NULL) {
834: fprintf(stderr, "xmlInitCharEncodingHandlers : out of memory !\n");
835: return;
836: }
1.10 daniel 837: xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
1.25 daniel 838: xmlUTF16LEHandler =
1.28 daniel 839: xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
840: xmlUTF16BEHandler =
841: xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
1.10 daniel 842: xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
1.9 daniel 843: }
844:
845: /**
1.19 daniel 846: * xmlCleanupCharEncodingHandlers:
847: *
848: * Cleanup the memory allocated for the char encoding support, it
849: * unregisters all the encoding handlers.
850: */
851: void
852: xmlCleanupCharEncodingHandlers(void) {
853: if (handlers == NULL) return;
854:
855: for (;nbCharEncodingHandler > 0;) {
856: nbCharEncodingHandler--;
857: if (handlers[nbCharEncodingHandler] != NULL) {
1.31 daniel 858: if (handlers[nbCharEncodingHandler]->name != NULL)
859: xmlFree(handlers[nbCharEncodingHandler]->name);
1.19 daniel 860: xmlFree(handlers[nbCharEncodingHandler]);
861: }
862: }
863: xmlFree(handlers);
864: handlers = NULL;
865: nbCharEncodingHandler = 0;
866: xmlDefaultCharEncodingHandler = NULL;
867: }
868:
869: /**
1.9 daniel 870: * xmlRegisterCharEncodingHandler:
871: * @handler: the xmlCharEncodingHandlerPtr handler block
872: *
873: * Register the char encoding handler, surprizing, isn't it ?
874: */
875: void
876: xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
877: if (handlers == NULL) xmlInitCharEncodingHandlers();
878: if (handler == NULL) {
879: fprintf(stderr, "xmlRegisterCharEncodingHandler: NULL handler !\n");
880: return;
881: }
882:
883: if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
884: fprintf(stderr,
885: "xmlRegisterCharEncodingHandler: Too many handler registered\n");
886: fprintf(stderr, "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
887: return;
888: }
889: handlers[nbCharEncodingHandler++] = handler;
890: }
891:
892: /**
893: * xmlGetCharEncodingHandler:
894: * @enc: an xmlCharEncoding value.
895: *
896: * Search in the registrered set the handler able to read/write that encoding.
897: *
898: * Returns the handler or NULL if not found
899: */
900: xmlCharEncodingHandlerPtr
901: xmlGetCharEncodingHandler(xmlCharEncoding enc) {
1.30 daniel 902: xmlCharEncodingHandlerPtr handler;
903:
1.9 daniel 904: if (handlers == NULL) xmlInitCharEncodingHandlers();
1.25 daniel 905: switch (enc) {
906: case XML_CHAR_ENCODING_ERROR:
907: return(NULL);
908: case XML_CHAR_ENCODING_NONE:
909: return(NULL);
910: case XML_CHAR_ENCODING_UTF8:
911: return(NULL);
912: case XML_CHAR_ENCODING_UTF16LE:
913: return(xmlUTF16LEHandler);
914: case XML_CHAR_ENCODING_UTF16BE:
915: return(xmlUTF16BEHandler);
916: case XML_CHAR_ENCODING_EBCDIC:
1.30 daniel 917: handler = xmlFindCharEncodingHandler("EBCDIC");
918: if (handler != NULL) return(handler);
919: handler = xmlFindCharEncodingHandler("ebcdic");
920: if (handler != NULL) return(handler);
921: break;
1.25 daniel 922: case XML_CHAR_ENCODING_UCS4LE:
1.30 daniel 923: handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
924: if (handler != NULL) return(handler);
925: handler = xmlFindCharEncodingHandler("UCS-4");
926: if (handler != NULL) return(handler);
927: handler = xmlFindCharEncodingHandler("UCS4");
928: if (handler != NULL) return(handler);
929: break;
1.25 daniel 930: case XML_CHAR_ENCODING_UCS4BE:
1.30 daniel 931: handler = xmlFindCharEncodingHandler("UCS4BE");
932: if (handler != NULL) return(handler);
933: break;
1.25 daniel 934: case XML_CHAR_ENCODING_UCS4_2143:
1.30 daniel 935: break;
1.25 daniel 936: case XML_CHAR_ENCODING_UCS4_3412:
1.30 daniel 937: break;
1.25 daniel 938: case XML_CHAR_ENCODING_UCS2:
1.30 daniel 939: handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2");
940: if (handler != NULL) return(handler);
941: handler = xmlFindCharEncodingHandler("UCS-2");
942: if (handler != NULL) return(handler);
943: handler = xmlFindCharEncodingHandler("UCS2");
944: if (handler != NULL) return(handler);
945: break;
1.25 daniel 946: case XML_CHAR_ENCODING_8859_1:
947: case XML_CHAR_ENCODING_8859_2:
948: case XML_CHAR_ENCODING_8859_3:
949: case XML_CHAR_ENCODING_8859_4:
950: case XML_CHAR_ENCODING_8859_5:
951: case XML_CHAR_ENCODING_8859_6:
952: case XML_CHAR_ENCODING_8859_7:
953: case XML_CHAR_ENCODING_8859_8:
954: case XML_CHAR_ENCODING_8859_9:
955: return(NULL);
956: case XML_CHAR_ENCODING_2022_JP:
1.30 daniel 957: handler = xmlFindCharEncodingHandler("ISO-2022-JP");
958: if (handler != NULL) return(handler);
959: break;
1.25 daniel 960: case XML_CHAR_ENCODING_SHIFT_JIS:
1.30 daniel 961: handler = xmlFindCharEncodingHandler("SHIFT-JIS");
962: if (handler != NULL) return(handler);
963: handler = xmlFindCharEncodingHandler("SHIFT_JIS");
964: if (handler != NULL) return(handler);
965: handler = xmlFindCharEncodingHandler("Shift_JIS");
966: if (handler != NULL) return(handler);
967: break;
1.25 daniel 968: case XML_CHAR_ENCODING_EUC_JP:
1.30 daniel 969: handler = xmlFindCharEncodingHandler("EUC-JP");
970: if (handler != NULL) return(handler);
971: break;
972: default:
973: break;
1.25 daniel 974: }
1.30 daniel 975:
976: #ifdef DEBUG_ENCODING
977: fprintf(stderr, "No handler found for encoding %d\n", enc);
978: #endif
1.9 daniel 979: return(NULL);
980: }
981:
982: /**
983: * xmlGetCharEncodingHandler:
984: * @enc: a string describing the char encoding.
985: *
986: * Search in the registrered set the handler able to read/write that encoding.
987: *
988: * Returns the handler or NULL if not found
989: */
990: xmlCharEncodingHandlerPtr
991: xmlFindCharEncodingHandler(const char *name) {
1.36 daniel 992: xmlCharEncodingHandlerPtr enc;
993: xmlCharEncoding alias;
1.30 daniel 994: #ifdef LIBXML_ICONV_ENABLED
995: iconv_t icv_in, icv_out;
996: #endif /* LIBXML_ICONV_ENABLED */
997: char upper[100];
1.9 daniel 998: int i;
999:
1000: if (handlers == NULL) xmlInitCharEncodingHandlers();
1001: if (name == NULL) return(xmlDefaultCharEncodingHandler);
1002: if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
1003:
1.36 daniel 1004: /*
1005: * Check first for directly registered encoding names
1006: */
1.30 daniel 1007: for (i = 0;i < 99;i++) {
1.9 daniel 1008: upper[i] = toupper(name[i]);
1009: if (upper[i] == 0) break;
1010: }
1011: upper[i] = 0;
1012:
1013: for (i = 0;i < nbCharEncodingHandler; i++)
1.30 daniel 1014: if (!strcmp(upper, handlers[i]->name)) {
1015: #ifdef DEBUG_ENCODING
1016: fprintf(stderr, "Found registered handler for encoding %s\n", name);
1017: #endif
1.9 daniel 1018: return(handlers[i]);
1.30 daniel 1019: }
1.9 daniel 1020:
1.36 daniel 1021: /*
1022: * check using aliases names
1023: */
1024: alias = xmlParseCharEncoding(name);
1025: if (alias != XML_CHAR_ENCODING_ERROR) {
1026: enc = xmlGetCharEncodingHandler(alias);
1027: if (enc != NULL) {
1028: #ifdef DEBUG_ENCODING
1029: fprintf(stderr, "Found registered handler %s for encoding %s\n",
1030: enc->name, name);
1031: #endif
1032: return(enc);
1033: }
1034: }
1.30 daniel 1035: #ifdef LIBXML_ICONV_ENABLED
1036: /* check whether iconv can handle this */
1.31 daniel 1037: icv_in = iconv_open("UTF-8", name);
1038: icv_out = iconv_open(name, "UTF-8");
1.30 daniel 1039: if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
1.31 daniel 1040: enc = xmlMalloc(sizeof(xmlCharEncodingHandler));
1.32 daniel 1041: if (enc == NULL) {
1042: iconv_close(icv_in);
1043: iconv_close(icv_out);
1044: return(NULL);
1045: }
1046: enc->name = NULL;
1.30 daniel 1047: enc->input = NULL;
1048: enc->output = NULL;
1049: enc->iconv_in = icv_in;
1050: enc->iconv_out = icv_out;
1051: #ifdef DEBUG_ENCODING
1052: fprintf(stderr, "Found iconv handler for encoding %s\n", name);
1053: #endif
1054: return enc;
1055: } else if ((icv_in != (iconv_t) -1) || icv_out != (iconv_t) -1) {
1056: fprintf(stderr, "iconv : problems with filters for '%s'\n", name);
1057: }
1058: #endif /* LIBXML_ICONV_ENABLED */
1059: #ifdef DEBUG_ENCODING
1060: fprintf(stderr, "No handler found for encoding %s\n", name);
1061: #endif
1.9 daniel 1062: return(NULL);
1.30 daniel 1063: }
1064:
1065: #ifdef LIBXML_ICONV_ENABLED
/**
 * xmlIconvWrapper:
 * @cd:  iconv converter data structure
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of bytes in the source encoding of @cd,
 *       or NULL
 * @inlen:  the length of @in
 *
 * Run one iconv() conversion step and translate its outcome into the
 * return codes used by the encoding front-ends.
 *
 * Returns 0 if success, or
 *     -1 by lack of space (errno is E2BIG), or
 *     -2 if the transcoding fails (errno is EILSEQ: the input is not
 *        valid or can't be represented in the target encoding), or
 *     -3 for any other failure, including EINVAL, i.e. the input ends
 *        in the middle of a multibyte sequence.
 *
 * When @in is not NULL, the value of @inlen after return is the number
 * of octets consumed and the value of @outlen is the number of octets
 * produced. When @in is NULL both are set to 0.
 */
static int
xmlIconvWrapper(iconv_t cd,
        unsigned char *out, int *outlen,
        const unsigned char *in, int *inlen) {

    size_t icv_inlen = *inlen, icv_outlen = *outlen;
    const char *icv_in = (const char *) in;
    char *icv_out = (char *) out;
    size_t ret;   /* iconv() returns a size_t, (size_t) -1 on error */

    /*
     * Depending on the platform the second argument of iconv() is
     * declared as "char **" or "const char **"; going through void *
     * is accepted by both prototypes.
     */
    ret = iconv(cd, (void *) &icv_in, &icv_inlen, &icv_out, &icv_outlen);
    if (in != NULL) {
        /* iconv() left in icv_inlen/icv_outlen what it did not use */
        *inlen -= (int) icv_inlen;
        *outlen -= (int) icv_outlen;
    } else {
        *inlen = 0;
        *outlen = 0;
    }
    if ((icv_inlen != 0) || (ret == (size_t) -1)) {
        /*
         * Each errno test stands on its own so that the code still
         * compiles when some of the macros are not provided.
         */
#ifdef EILSEQ
        if (errno == EILSEQ)
            return -2;
#endif
#ifdef E2BIG
        if (errno == E2BIG)
            return -1;
#endif
        /* EINVAL (truncated input) and anything else */
        return -3;
    }
    return 0;
}
1126: #endif /* LIBXML_ICONV_ENABLED */
1127:
1128: /**
1129: * xmlCharEncInFunc:
1130: * @handler: char enconding transformation data structure
1.31 daniel 1131: * @out: an xmlBuffer for the output.
1132: * @in: an xmlBuffer for the input
1.30 daniel 1133: *
1134: * Generic front-end for the encoding handler input function
1135: *
1.31 daniel 1136: * Returns the number of byte written if success, or
1137: * -1 general error
1.30 daniel 1138: * -2 if the transcoding fails (for *in is not valid utf8 string or
1139: * the result of transformation can't fit into the encoding we want), or
1140: */
1141: int
1.31 daniel 1142: xmlCharEncInFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1143: xmlBufferPtr in) {
1.30 daniel 1144: int ret = -2;
1.31 daniel 1145: int written;
1146: int toconv;
1.30 daniel 1147:
1.31 daniel 1148: if (handler == NULL) return(-1);
1149: if (out == NULL) return(-1);
1150: if (in == NULL) return(-1);
1151:
1152: written = out->size - out->use;
1153: toconv = in->use;
1154: if (toconv * 2 >= written) {
1155: xmlBufferGrow(out, toconv * 2);
1.33 daniel 1156: written = out->size - out->use - 1;
1.31 daniel 1157: }
1.30 daniel 1158: if (handler->input != NULL) {
1.32 daniel 1159: ret = handler->input(&out->content[out->use], &written,
1.31 daniel 1160: in->content, &toconv);
1161: xmlBufferShrink(in, toconv);
1162: out->use += written;
1.33 daniel 1163: out->content[out->use] = 0;
1.30 daniel 1164: }
1165: #ifdef LIBXML_ICONV_ENABLED
1.31 daniel 1166: else if (handler->iconv_in != NULL) {
1167: ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
1168: &written, in->content, &toconv);
1169: xmlBufferShrink(in, toconv);
1170: out->use += written;
1.33 daniel 1171: out->content[out->use] = 0;
1172: if (ret == -1) ret = -3;
1.30 daniel 1173: }
1174: #endif /* LIBXML_ICONV_ENABLED */
1175: #ifdef DEBUG_ENCODING
1176: switch (ret) {
1177: case 0:
1178: fprintf(stderr, "converted %d bytes to %d bytes of input\n",
1.31 daniel 1179: toconv, written);
1.30 daniel 1180: break;
1181: case -1:
1.31 daniel 1182: fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
1183: toconv, written, in->use);
1.30 daniel 1184: break;
1185: case -2:
1186: fprintf(stderr, "input conversion failed due to input error\n");
1187: break;
1188: case -3:
1.31 daniel 1189: fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
1190: toconv, written, in->use);
1.30 daniel 1191: break;
1192: default:
1193: fprintf(stderr,"Unknown input conversion failed %d\n", ret);
1194: }
1195: #endif
1.33 daniel 1196: /*
1197: * Ignore when input buffer is not on a boundary
1198: */
1199: if (ret == -3) ret = 0;
1.30 daniel 1200: return(ret);
1201: }
1202:
1203: /**
1204: * xmlCharEncOutFunc:
1205: * @handler: char enconding transformation data structure
1.31 daniel 1206: * @out: an xmlBuffer for the output.
1207: * @in: an xmlBuffer for the input
1208: *
1209: * Generic front-end for the encoding handler output function
1.35 daniel 1210: * a first call with @in == NULL has to be made firs to initiate the
1211: * output in case of non-stateless encoding needing to initiate their
1212: * state or the output (like the BOM in UTF16).
1.30 daniel 1213: *
1.31 daniel 1214: * Returns the number of byte written if success, or
1215: * -1 general error
1.30 daniel 1216: * -2 if the transcoding fails (for *in is not valid utf8 string or
1217: * the result of transformation can't fit into the encoding we want), or
1218: */
1219: int
1.31 daniel 1220: xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1221: xmlBufferPtr in) {
1.30 daniel 1222: int ret = -2;
1.31 daniel 1223: int written;
1224: int toconv;
1225:
1226: if (handler == NULL) return(-1);
1227: if (out == NULL) return(-1);
1.35 daniel 1228: written = out->size - out->use;
1229:
1230: if (in == NULL) {
1231: toconv = 0;
1232: if (handler->output != NULL) {
1233: ret = handler->output(&out->content[out->use], &written,
1234: NULL, &toconv);
1235: out->use += written;
1236: out->content[out->use] = 0;
1237: }
1238: #ifdef LIBXML_ICONV_ENABLED
1239: else if (handler->iconv_out != NULL) {
1240: ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
1241: &written, NULL, &toconv);
1242: out->use += written;
1243: out->content[out->use] = 0;
1244: }
1245: #endif /* LIBXML_ICONV_ENABLED */
1246: #ifdef DEBUG_ENCODING
1247: fprintf(stderr, "initialized encoder\n");
1248: #endif
1249: return(0);
1250: }
1.30 daniel 1251:
1.33 daniel 1252: toconv = in->use;
1253: if (toconv * 2 >= written) {
1254: xmlBufferGrow(out, toconv * 2);
1255: written = out->size - out->use - 1;
1256: }
1.30 daniel 1257: if (handler->output != NULL) {
1.33 daniel 1258: ret = handler->output(&out->content[out->use], &written,
1.35 daniel 1259: in->content, &toconv);
1.31 daniel 1260: xmlBufferShrink(in, toconv);
1261: out->use += written;
1.33 daniel 1262: out->content[out->use] = 0;
1.30 daniel 1263: }
1264: #ifdef LIBXML_ICONV_ENABLED
1265: else if (handler->iconv_out != NULL) {
1.31 daniel 1266: ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
1267: &written, in->content, &toconv);
1268: xmlBufferShrink(in, toconv);
1269: out->use += written;
1.33 daniel 1270: out->content[out->use] = 0;
1271: if (ret == -1) ret = -3;
1.30 daniel 1272: }
1273: #endif /* LIBXML_ICONV_ENABLED */
1274: #ifdef DEBUG_ENCODING
1275: switch (ret) {
1276: case 0:
1277: fprintf(stderr, "converted %d bytes to %d bytes of output\n",
1.31 daniel 1278: toconv, written);
1.30 daniel 1279: break;
1280: case -1:
1281: fprintf(stderr, "output conversion failed by lack of space\n");
1282: break;
1283: case -2:
1284: fprintf(stderr, "output conversion failed due to output error\n");
1285: break;
1286: case -3:
1.31 daniel 1287: fprintf(stderr,"converted %d bytes to %d bytes of output %d left\n",
1288: toconv, written, in->use);
1.30 daniel 1289: break;
1290: default:
1291: fprintf(stderr,"Unknown output conversion failed %d\n", ret);
1292: }
1293: #endif
1294: return(ret);
1295: }
1296:
1297: /**
1298: * xmlCharEncCloseFunc:
1299: * @handler: char enconding transformation data structure
1300: *
1301: * Generic front-end for hencoding handler close function
1302: *
1303: * Returns 0 if success, or -1 in case of error
1304: */
1305: int
1306: xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
1307: int ret = 0;
1.31 daniel 1308: if (handler == NULL) return(-1);
1309: if (handler->name == NULL) return(-1);
1.30 daniel 1310: #ifdef LIBXML_ICONV_ENABLED
1.31 daniel 1311: /*
1312: * Iconv handlers can be oused only once, free the whole block.
1313: * and the associated icon resources.
1314: */
1.32 daniel 1315: if ((handler->iconv_out != NULL) || (handler->iconv_in != NULL)) {
1316: if (handler->name != NULL)
1317: xmlFree(handler->name);
1318: handler->name = NULL;
1319: if (handler->iconv_out != NULL) {
1320: if (iconv_close(handler->iconv_out))
1321: ret = -1;
1322: handler->iconv_out = NULL;
1323: }
1324: if (handler->iconv_in != NULL) {
1325: if (iconv_close(handler->iconv_in))
1326: ret = -1;
1327: handler->iconv_in = NULL;
1328: }
1329: xmlFree(handler);
1.30 daniel 1330: }
1331: #endif /* LIBXML_ICONV_ENABLED */
1332: #ifdef DEBUG_ENCODING
1333: if (ret)
1334: fprintf(stderr, "failed to close the encoding handler\n");
1335: else
1336: fprintf(stderr, "closed the encoding handler\n");
1337:
1338: #endif
1339: return(ret);
1.9 daniel 1340: }
1341:
Webmaster