Annotation of XML/encoding.c, revision 1.36
1.1 daniel 1: /*
2: * encoding.c : implements the encoding conversion functions needed for XML
3: *
4: * Related specs:
5: * rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies
6: * [ISO-10646] UTF-8 and UTF-16 in Annexes
7: * [ISO-8859-1] ISO Latin-1 characters codes.
8: * [UNICODE] The Unicode Consortium, "The Unicode Standard --
9: * Worldwide Character Encoding -- Version 1.0", Addison-
10: * Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is
11: * described in Unicode Technical Report #4.
12: * [US-ASCII] Coded Character Set--7-bit American Standard Code for
13: * Information Interchange, ANSI X3.4-1986.
14: *
1.9 daniel 15: * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
1.1 daniel 16: *
17: * See Copyright for the status of this software.
18: *
19: * Daniel.Veillard@w3.org
20: */
21:
1.21 daniel 22: #ifdef WIN32
23: #include "win32config.h"
24: #else
1.14 daniel 25: #include "config.h"
1.17 daniel 26: #endif
27:
28: #include <stdio.h>
29: #include <string.h>
30:
31: #ifdef HAVE_CTYPE_H
1.7 daniel 32: #include <ctype.h>
1.17 daniel 33: #endif
1.20 daniel 34: #ifdef HAVE_STDLIB_H
35: #include <stdlib.h>
36: #endif
1.30 daniel 37: #include <libxml/xmlversion.h>
38: #ifdef LIBXML_ICONV_ENABLED
39: #ifdef HAVE_ERRNO_H
40: #include <errno.h>
41: #endif
42: #endif
1.29 daniel 43: #include <libxml/encoding.h>
44: #include <libxml/xmlmemory.h>
1.3 daniel 45:
/*
 * Public handler pointers for the two UTF-16 byte orders. They start out
 * NULL and are assigned by xmlInitCharEncodingHandlers().
 */
1.25 daniel 46: xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
47: xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
48:
/*
 * NOTE(review): the "#if 1" below forces DEBUG_ENCODING on for every
 * iconv-enabled build, so the fprintf(stderr, ...) traces in this file
 * are compiled in -- confirm this is intended outside of debugging.
 */
1.30 daniel 49: #ifdef LIBXML_ICONV_ENABLED
1.36 ! daniel 50: #if 1
1.30 daniel 51: #define DEBUG_ENCODING /* Define this to get encoding traces */
52: #endif
1.33 daniel 53: #endif
1.30 daniel 54:
/*
 * Host byte order flag: non-zero means little endian. It defaults to 1 and
 * is only re-detected when xmlInitCharEncodingHandlers() runs.
 */
1.34 daniel 55: static int xmlLittleEndian = 1;
56:
1.3 daniel 57: /*
58: * From rfc2044: encoding of the Unicode values on UTF-8:
59: *
60: * UCS-4 range (hex.) UTF-8 octet sequence (binary)
61: * 0000 0000-0000 007F 0xxxxxxx
62: * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
63: * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
64: *
65: * I hope we won't use values > 0xFFFF anytime soon !
66: */
1.1 daniel 67:
/**
 * xmlCheckUTF8: Check utf-8 string for legality.
 * @utf: Pointer to putative utf-8 encoded string.
 *
 * Checks @utf for being valid utf-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer utf-8 sequences than necessary (Java is capable of
 * producing these). It enforces the 4-byte maximum sequence size but
 * does not check for the 0x10ffff maximum value.
 *
 * A NULL pointer, a continuation byte (10xxxxxx) found where a lead byte
 * is expected, and lead bytes of the form 11111xxx are all rejected.
 *
 * Return value: true (1) if @utf is valid, 0 otherwise.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    size_t ix;
    unsigned char c;

    if (utf == NULL)
        return(0);

    /*
     * The continuation tests short-circuit, so a NUL terminator found
     * inside a sequence fails the test before any byte beyond it is read.
     */
    for (ix = 0; (c = utf[ix]) != 0;) {
        if ((c & 0x80) == 0x00) {
            /* 1-byte code: 0xxxxxxx */
            ix++;
        } else if ((c & 0xe0) == 0xc0) {
            /* 2-byte code: 110xxxxx 10xxxxxx */
            if ((utf[ix + 1] & 0xc0) != 0x80)
                return(0);
            ix += 2;
        } else if ((c & 0xf0) == 0xe0) {
            /* 3-byte code: 1110xxxx 10xxxxxx 10xxxxxx */
            if (((utf[ix + 1] & 0xc0) != 0x80) ||
                ((utf[ix + 2] & 0xc0) != 0x80))
                return(0);
            ix += 3;
        } else if ((c & 0xf8) == 0xf0) {
            /* 4-byte code: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
            if (((utf[ix + 1] & 0xc0) != 0x80) ||
                ((utf[ix + 2] & 0xc0) != 0x80) ||
                ((utf[ix + 3] & 0xc0) != 0x80))
                return(0);
            ix += 4;
        } else {
            /* stray continuation byte or unsupported lead byte */
            return(0);
        }
    }
    return(1);
}
111:
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  in: the size of @out in bytes; out: the number of bytes written
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  in: the length of @in in bytes; out: the number of bytes consumed
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out.
 *
 * Conversion stops early, without error, when @out cannot hold the next
 * character in full; that character is then left unconsumed so the caller
 * can retry it with more room.
 *
 * Returns 0 (this function cannot fail: every Latin 1 byte maps to UTF-8).
 */
int
isolat1ToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
    const unsigned char* instart = in;
    unsigned char* outend = out + *outlen;
    const unsigned char* inend = in + *inlen;

    while (in < inend) {
        unsigned char c = *in;

        if (c < 0x80) {
            if (out >= outend)
                break;
            *out++ = c;
        } else {
            /* 0x80..0xFF always need exactly two UTF-8 bytes */
            if (outend - out < 2)
                break;
            *out++ = (unsigned char) (0xC0 | (c >> 6));
            *out++ = (unsigned char) (0x80 | (0x3F & c));
        }
        /* only advance once the character has been fully written */
        in++;
    }
    *outlen = (int) (out - outstart);
    *inlen = (int) (in - instart);

    return(0);
}
154:
/**
 * UTF8Toisolat1:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  in: the size of @out in bytes; out: the number of bytes written
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  in: the length of @in in bytes; out: the number of bytes consumed
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out.
 * TODO: UTF8Toisolat1 need a fallback mechanism ...
 *
 * A multi-byte lead byte sitting at the very end of @in is not an error:
 * conversion stops before it so the caller can supply the rest later.
 *
 * Returns 0 if success, -2 if the input cannot be represented in Latin 1
 * (or is not valid UTF-8), or -1 if @out is too small for the next char.
 * In every case @outlen and @inlen describe the part converted so far.
 */
int
UTF8Toisolat1(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
    const unsigned char* instart = in;
    unsigned char* outend = out + *outlen;
    const unsigned char* inend = in + *inlen;
    int ret = 0;

    while (in < inend) {
        unsigned char c = *in;

        if (c < 0x80) {
            if (out >= outend) {
                ret = -1;
                break;
            }
            *out++ = c;
            in++;
        } else if (inend - in < 2) {
            /* sequence split across buffers: wait for more input */
            break;
        } else if (((c & 0xFC) == 0xC0) && ((in[1] & 0xC0) == 0x80)) {
            /* a two byte utf-8 that can be encoded as Latin 1 */
            if (out >= outend) {
                ret = -1;
                break;
            }
            *out++ = (unsigned char) (((c & 0x03) << 6) | (in[1] & 0x3F));
            in += 2;
        } else {
            ret = -2;
            break;
        }
    }
    *outlen = (int) (out - outstart);
    *inlen = (int) (in - instart);
    return(ret);
}
204:
/**
 * UTF16LEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  in: the size of @out in bytes; out: the number of bytes written
 * @inb:  a pointer to an array of UTF-16LE passed as a byte array
 * @inlenb:  in: the length of @inb in bytes; out: the number of bytes
 *           consumed
 *
 * Take a block of UTF-16LE code units in and try to convert it to an UTF-8
 * block of chars out. The input is decoded byte by byte, so the result
 * does not depend on the byte order or alignment rules of the host.
 *
 * Conversion stops early, without error, when @out cannot hold the next
 * character in full or when @inb ends in the middle of a surrogate pair;
 * an odd trailing input byte is ignored. Only whole characters are ever
 * written or counted as consumed.
 *
 * Returns 0 on success, or -2 if the transcoding fails (a high surrogate
 * is not followed by a low surrogate). @outlen and @inlenb are updated in
 * both cases.
 */
int
UTF16LEToUTF8(unsigned char* out, int *outlen,
              const unsigned char* inb, int *inlenb)
{
    static const unsigned char lead[5] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0 };
    unsigned char* outstart = out;
    unsigned char* outend = out + *outlen;
    const unsigned char* in = inb;
    const unsigned char* inend;
    unsigned int c, d;
    int need, shift;

    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + *inlenb;

    while (inend - in >= 2) {
        const unsigned char* next = in + 2;

        c = (unsigned int) in[0] | ((unsigned int) in[1] << 8);
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (inend - next < 2)
                break;                   /* pair split across buffers */
            d = (unsigned int) next[0] | ((unsigned int) next[1] << 8);
            next += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = (int) (out - outstart);
                *inlenb = (int) (in - inb);
                return(-2);
            }
            c = (((c & 0x03FF) << 10) | (d & 0x03FF)) + 0x10000;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x80) need = 1;
        else if (c < 0x800) need = 2;
        else if (c < 0x10000) need = 3;
        else need = 4;

        /* never emit a truncated UTF-8 sequence */
        if (outend - out < need)
            break;

        if (need == 1) {
            *out++ = (unsigned char) c;
        } else {
            shift = 6 * (need - 1);
            *out++ = (unsigned char) (lead[need] | (c >> shift));
            while (shift > 0) {
                shift -= 6;
                *out++ = (unsigned char) (0x80 | ((c >> shift) & 0x3F));
            }
        }
        in = next;
    }
    *outlen = (int) (out - outstart);
    *inlenb = (int) (in - inb);
    return(0);
}
292:
/**
 * UTF8ToUTF16LE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  in: the size of @outb in bytes; out: the number of bytes
 *           written
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  in: the length of @in in bytes; out: the number of bytes consumed
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
 * block of chars out. The output is stored byte by byte, so the result
 * does not depend on the byte order or alignment rules of the host.
 * TODO: UTF8ToUTF16LE need a fallback mechanism ...
 *
 * Conversion stops early, without error, when @outb cannot hold the next
 * character in full or when @in ends in the middle of a multi-byte
 * sequence. Only whole characters are written or counted as consumed.
 *
 * Returns 0 on success, or -2 if the transcoding failed (invalid lead or
 * continuation byte, or a value above 0x10FFFF). @outlen and @inlen are
 * updated, in bytes, in both cases.
 */
int
UTF8ToUTF16LE(unsigned char* outb, int *outlen,
              const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    /* only whole 16-bit units may be written */
    unsigned char* outend = outb + (*outlen / 2) * 2;
    const unsigned char* instart = in;
    const unsigned char* inend = in + *inlen;
    int ret = 0;

    while (in < inend) {
        const unsigned char* cur = in;
        unsigned int c, d, hi, lo;
        int trailing;

        d = *cur++;
        if (d < 0x80) { c = d; trailing = 0; }
        else if (d < 0xC0) {
            /* trailing byte in leading position */
            ret = -2;
            break;
        }
        else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else {
            /* no chance for this in UTF-16 */
            ret = -2;
            break;
        }

        /* sequence split across buffers: wait for more input */
        if (inend - cur < trailing)
            break;

        for ( ; trailing > 0; trailing--) {
            d = *cur++;
            if ((d & 0xC0) != 0x80)
                break;
            c = (c << 6) | (d & 0x3F);
        }
        if (trailing > 0) {
            /* a continuation byte was expected */
            ret = -2;
            break;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            *out++ = (unsigned char) (c & 0xFF);
            *out++ = (unsigned char) (c >> 8);
        } else if (c < 0x110000) {
            if (outend - out < 4)
                break;
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            *out++ = (unsigned char) (hi & 0xFF);
            *out++ = (unsigned char) (hi >> 8);
            *out++ = (unsigned char) (lo & 0xFF);
            *out++ = (unsigned char) (lo >> 8);
        } else {
            /* beyond the range UTF-16 can express */
            ret = -2;
            break;
        }
        in = cur;
    }
    *outlen = (int) (out - outb);
    *inlen = (int) (in - instart);
    return(ret);
}
392:
/**
 * UTF16BEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  in: the size of @out in bytes; out: the number of bytes written
 * @inb:  a pointer to an array of UTF-16BE passed as a byte array
 * @inlenb:  in: the length of @inb in bytes; out: the number of bytes
 *           consumed
 *
 * Take a block of UTF-16BE code units in and try to convert it to an UTF-8
 * block of chars out. The input is decoded byte by byte, so the result
 * does not depend on the byte order or alignment rules of the host.
 *
 * Conversion stops early, without error, when @out cannot hold the next
 * character in full or when @inb ends in the middle of a surrogate pair
 * (the same way UTF16LEToUTF8 treats a split pair); an odd trailing input
 * byte is ignored. Only whole characters are written or counted as
 * consumed.
 *
 * Returns 0 on success, or -2 if the transcoding fails (a high surrogate
 * is not followed by a low surrogate). @outlen and @inlenb are updated in
 * both cases.
 */
int
UTF16BEToUTF8(unsigned char* out, int *outlen,
              const unsigned char* inb, int *inlenb)
{
    static const unsigned char lead[5] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0 };
    unsigned char* outstart = out;
    unsigned char* outend = out + *outlen;
    const unsigned char* in = inb;
    const unsigned char* inend;
    unsigned int c, d;
    int need, shift;

    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + *inlenb;

    while (inend - in >= 2) {
        const unsigned char* next = in + 2;

        c = ((unsigned int) in[0] << 8) | (unsigned int) in[1];
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (inend - next < 2)
                break;                   /* pair split across buffers */
            d = ((unsigned int) next[0] << 8) | (unsigned int) next[1];
            next += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = (int) (out - outstart);
                *inlenb = (int) (in - inb);
                return(-2);
            }
            c = (((c & 0x03FF) << 10) | (d & 0x03FF)) + 0x10000;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x80) need = 1;
        else if (c < 0x800) need = 2;
        else if (c < 0x10000) need = 3;
        else need = 4;

        /* never emit a truncated UTF-8 sequence */
        if (outend - out < need)
            break;

        if (need == 1) {
            *out++ = (unsigned char) c;
        } else {
            shift = 6 * (need - 1);
            *out++ = (unsigned char) (lead[need] | (c >> shift));
            while (shift > 0) {
                shift -= 6;
                *out++ = (unsigned char) (0x80 | ((c >> shift) & 0x3F));
            }
        }
        in = next;
    }
    *outlen = (int) (out - outstart);
    *inlenb = (int) (in - inb);
    return(0);
}
484:
/**
 * UTF8ToUTF16BE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  in: the size of @outb in bytes; out: the number of bytes
 *           written
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  in: the length of @in in bytes; out: the number of bytes consumed
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
 * block of chars out. The output is stored byte by byte, so the result
 * does not depend on the byte order or alignment rules of the host.
 * TODO: UTF8ToUTF16BE need a fallback mechanism ...
 *
 * Conversion stops early, without error, when @outb cannot hold the next
 * character in full or when @in ends in the middle of a multi-byte
 * sequence. Only whole characters are written or counted as consumed.
 *
 * Returns 0 on success, or -2 if the transcoding failed (invalid lead or
 * continuation byte, or a value above 0x10FFFF). @outlen and @inlen are
 * updated, in bytes, in both cases.
 */
int
UTF8ToUTF16BE(unsigned char* outb, int *outlen,
              const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    /* only whole 16-bit units may be written */
    unsigned char* outend = outb + (*outlen / 2) * 2;
    const unsigned char* instart = in;
    const unsigned char* inend = in + *inlen;
    int ret = 0;

    while (in < inend) {
        const unsigned char* cur = in;
        unsigned int c, d, hi, lo;
        int trailing;

        d = *cur++;
        if (d < 0x80) { c = d; trailing = 0; }
        else if (d < 0xC0) {
            /* trailing byte in leading position */
            ret = -2;
            break;
        }
        else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else {
            /* no chance for this in UTF-16 */
            ret = -2;
            break;
        }

        /* sequence split across buffers: wait for more input */
        if (inend - cur < trailing)
            break;

        for ( ; trailing > 0; trailing--) {
            d = *cur++;
            if ((d & 0xC0) != 0x80)
                break;
            c = (c << 6) | (d & 0x3F);
        }
        if (trailing > 0) {
            /* a continuation byte was expected */
            ret = -2;
            break;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            *out++ = (unsigned char) (c >> 8);
            *out++ = (unsigned char) (c & 0xFF);
        } else if (c < 0x110000) {
            if (outend - out < 4)
                break;
            c -= 0x10000;
            hi = 0xD800 | (c >> 10);
            lo = 0xDC00 | (c & 0x03FF);
            *out++ = (unsigned char) (hi >> 8);
            *out++ = (unsigned char) (hi & 0xFF);
            *out++ = (unsigned char) (lo >> 8);
            *out++ = (unsigned char) (lo & 0xFF);
        } else {
            /* beyond the range UTF-16 can express */
            ret = -2;
            break;
        }
        in = cur;
    }
    *outlen = (int) (out - outb);
    *inlen = (int) (in - instart);
    return(ret);
}
581:
1.7 daniel 582: /**
583: * xmlDetectCharEncoding:
584: * @in: a pointer to the first bytes of the XML entity, must be at least
585: * 4 bytes long.
1.25 daniel 586: * @len: pointer to the length of the buffer
1.7 daniel 587: *
588: * Guess the encoding of the entity using the first bytes of the entity content
589: * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
590: *
591: * Returns one of the XML_CHAR_ENCODING_... values.
592: */
593: xmlCharEncoding
1.25 daniel 594: xmlDetectCharEncoding(const unsigned char* in, int len)
1.7 daniel 595: {
1.25 daniel 596: if (len >= 4) {
597: if ((in[0] == 0x00) && (in[1] == 0x00) &&
598: (in[2] == 0x00) && (in[3] == 0x3C))
599: return(XML_CHAR_ENCODING_UCS4BE);
600: if ((in[0] == 0x3C) && (in[1] == 0x00) &&
601: (in[2] == 0x00) && (in[3] == 0x00))
602: return(XML_CHAR_ENCODING_UCS4LE);
603: if ((in[0] == 0x00) && (in[1] == 0x00) &&
604: (in[2] == 0x3C) && (in[3] == 0x00))
605: return(XML_CHAR_ENCODING_UCS4_2143);
606: if ((in[0] == 0x00) && (in[1] == 0x3C) &&
607: (in[2] == 0x00) && (in[3] == 0x00))
608: return(XML_CHAR_ENCODING_UCS4_3412);
609: if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
610: (in[2] == 0xA7) && (in[3] == 0x94))
611: return(XML_CHAR_ENCODING_EBCDIC);
612: if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
613: (in[2] == 0x78) && (in[3] == 0x6D))
614: return(XML_CHAR_ENCODING_UTF8);
615: }
616: if (len >= 2) {
617: if ((in[0] == 0xFE) && (in[1] == 0xFF))
618: return(XML_CHAR_ENCODING_UTF16BE);
619: if ((in[0] == 0xFF) && (in[1] == 0xFE))
620: return(XML_CHAR_ENCODING_UTF16LE);
621: }
1.7 daniel 622: return(XML_CHAR_ENCODING_NONE);
623: }
624:
625: /**
626: * xmlParseCharEncoding:
1.18 daniel 627: * @name: the encoding name as parsed, in UTF-8 format (ASCII actually)
1.7 daniel 628: *
629: * Conpare the string to the known encoding schemes already known. Note
630: * that the comparison is case insensitive accordingly to the section
631: * [XML] 4.3.3 Character Encoding in Entities.
632: *
633: * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
634: * if not recognized.
635: */
636: xmlCharEncoding
1.8 daniel 637: xmlParseCharEncoding(const char* name)
1.7 daniel 638: {
639: char upper[500];
640: int i;
641:
642: for (i = 0;i < 499;i++) {
643: upper[i] = toupper(name[i]);
644: if (upper[i] == 0) break;
645: }
646: upper[i] = 0;
647:
648: if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
649: if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
650: if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
651:
652: /*
653: * NOTE: if we were able to parse this, the endianness of UTF16 is
654: * already found and in use
655: */
656: if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
657: if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
658:
659: if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
660: if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
661: if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
662:
663: /*
664: * NOTE: if we were able to parse this, the endianness of UCS4 is
665: * already found and in use
666: */
667: if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
668: if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
669: if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
670:
671:
672: if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
673: if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
674: if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
675:
676: if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
677: if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
678: if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
679:
680: if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
681: if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
682: if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
683: if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
684: if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
685: if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
686: if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
687:
688: if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
1.30 daniel 689: if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
1.7 daniel 690: if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
1.30 daniel 691:
692: #ifdef DEBUG_ENCODING
693: fprintf(stderr, "Unknown encoding %s\n", name);
694: #endif
1.7 daniel 695: return(XML_CHAR_ENCODING_ERROR);
696: }
1.9 daniel 697:
698: /****************************************************************
699: * *
700: * Char encoding handlers *
701: * *
702: ****************************************************************/
703:
/*
 * Registry of encoding handlers: a table of MAX_ENCODING_HANDLERS slots
 * allocated by xmlInitCharEncodingHandlers(), with nbCharEncodingHandler
 * counting the slots filled by xmlRegisterCharEncodingHandler().
 */
704: /* the size should be growable, but it's not a big deal ... */
705: #define MAX_ENCODING_HANDLERS 50
706: static xmlCharEncodingHandlerPtr *handlers = NULL;
707: static int nbCharEncodingHandler = 0;
708:
709: /*
710: * The default is UTF-8 for XML, that's also the default used for the
711: * parser internals, so the default encoding handler is NULL
712: */
713:
/* Returned by xmlFindCharEncodingHandler() for a NULL or empty name. */
714: static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
715:
716: /**
717: * xmlNewCharEncodingHandler:
1.18 daniel 718: * @name: the encoding name, in UTF-8 format (ASCII actually)
1.9 daniel 719: * @input: the xmlCharEncodingInputFunc to read that encoding
720: * @output: the xmlCharEncodingOutputFunc to write that encoding
721: *
722: * Create and registers an xmlCharEncodingHandler.
723: * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
724: */
725: xmlCharEncodingHandlerPtr
1.25 daniel 726: xmlNewCharEncodingHandler(const char *name,
727: xmlCharEncodingInputFunc input,
1.9 daniel 728: xmlCharEncodingOutputFunc output) {
729: xmlCharEncodingHandlerPtr handler;
730: char upper[500];
731: int i;
732: char *up = 0;
733:
734: /*
735: * Keep only the uppercase version of the encoding.
736: */
737: if (name == NULL) {
738: fprintf(stderr, "xmlNewCharEncodingHandler : no name !\n");
739: return(NULL);
740: }
741: for (i = 0;i < 499;i++) {
742: upper[i] = toupper(name[i]);
743: if (upper[i] == 0) break;
744: }
745: upper[i] = 0;
1.16 daniel 746: up = xmlMemStrdup(upper);
1.9 daniel 747: if (up == NULL) {
748: fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
749: return(NULL);
750: }
751:
752: /*
753: * allocate and fill-up an handler block.
754: */
755: handler = (xmlCharEncodingHandlerPtr)
1.16 daniel 756: xmlMalloc(sizeof(xmlCharEncodingHandler));
1.9 daniel 757: if (handler == NULL) {
758: fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
759: return(NULL);
760: }
761: handler->input = input;
762: handler->output = output;
763: handler->name = up;
764:
765: /*
766: * registers and returns the handler.
767: */
768: xmlRegisterCharEncodingHandler(handler);
1.30 daniel 769: #ifdef DEBUG_ENCODING
770: fprintf(stderr, "Registered encoding handler for %s\n", name);
771: #endif
1.9 daniel 772: return(handler);
773: }
774:
775: /**
776: * xmlInitCharEncodingHandlers:
777: *
778: * Initialize the char encoding support, it registers the default
779: * encoding supported.
1.18 daniel 780: * NOTE: while public, this function usually doesn't need to be called
1.9 daniel 781: * in normal processing.
782: */
783: void
784: xmlInitCharEncodingHandlers(void) {
1.34 daniel 785: unsigned short int tst = 0x1234;
786: unsigned char *ptr = (unsigned char *) &tst;
787:
1.9 daniel 788: if (handlers != NULL) return;
789:
790: handlers = (xmlCharEncodingHandlerPtr *)
1.16 daniel 791: xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
1.34 daniel 792:
793: if (*ptr == 0x12) xmlLittleEndian = 0;
794: else if (*ptr == 0x34) xmlLittleEndian = 1;
795: else fprintf(stderr, "Odd problem at endianness detection\n");
1.9 daniel 796:
797: if (handlers == NULL) {
798: fprintf(stderr, "xmlInitCharEncodingHandlers : out of memory !\n");
799: return;
800: }
1.10 daniel 801: xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
1.25 daniel 802: xmlUTF16LEHandler =
1.28 daniel 803: xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
804: xmlUTF16BEHandler =
805: xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
1.10 daniel 806: xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
1.9 daniel 807: }
808:
809: /**
1.19 daniel 810: * xmlCleanupCharEncodingHandlers:
811: *
812: * Cleanup the memory allocated for the char encoding support, it
813: * unregisters all the encoding handlers.
814: */
815: void
816: xmlCleanupCharEncodingHandlers(void) {
817: if (handlers == NULL) return;
818:
819: for (;nbCharEncodingHandler > 0;) {
820: nbCharEncodingHandler--;
821: if (handlers[nbCharEncodingHandler] != NULL) {
1.31 daniel 822: if (handlers[nbCharEncodingHandler]->name != NULL)
823: xmlFree(handlers[nbCharEncodingHandler]->name);
1.19 daniel 824: xmlFree(handlers[nbCharEncodingHandler]);
825: }
826: }
827: xmlFree(handlers);
828: handlers = NULL;
829: nbCharEncodingHandler = 0;
830: xmlDefaultCharEncodingHandler = NULL;
831: }
832:
833: /**
1.9 daniel 834: * xmlRegisterCharEncodingHandler:
835: * @handler: the xmlCharEncodingHandlerPtr handler block
836: *
837: * Register the char encoding handler, surprizing, isn't it ?
838: */
839: void
840: xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
841: if (handlers == NULL) xmlInitCharEncodingHandlers();
842: if (handler == NULL) {
843: fprintf(stderr, "xmlRegisterCharEncodingHandler: NULL handler !\n");
844: return;
845: }
846:
847: if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
848: fprintf(stderr,
849: "xmlRegisterCharEncodingHandler: Too many handler registered\n");
850: fprintf(stderr, "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
851: return;
852: }
853: handlers[nbCharEncodingHandler++] = handler;
854: }
855:
856: /**
857: * xmlGetCharEncodingHandler:
858: * @enc: an xmlCharEncoding value.
859: *
860: * Search in the registrered set the handler able to read/write that encoding.
861: *
862: * Returns the handler or NULL if not found
863: */
864: xmlCharEncodingHandlerPtr
865: xmlGetCharEncodingHandler(xmlCharEncoding enc) {
1.30 daniel 866: xmlCharEncodingHandlerPtr handler;
867:
1.9 daniel 868: if (handlers == NULL) xmlInitCharEncodingHandlers();
1.25 daniel 869: switch (enc) {
870: case XML_CHAR_ENCODING_ERROR:
871: return(NULL);
872: case XML_CHAR_ENCODING_NONE:
873: return(NULL);
874: case XML_CHAR_ENCODING_UTF8:
875: return(NULL);
876: case XML_CHAR_ENCODING_UTF16LE:
877: return(xmlUTF16LEHandler);
878: case XML_CHAR_ENCODING_UTF16BE:
879: return(xmlUTF16BEHandler);
880: case XML_CHAR_ENCODING_EBCDIC:
1.30 daniel 881: handler = xmlFindCharEncodingHandler("EBCDIC");
882: if (handler != NULL) return(handler);
883: handler = xmlFindCharEncodingHandler("ebcdic");
884: if (handler != NULL) return(handler);
885: break;
1.25 daniel 886: case XML_CHAR_ENCODING_UCS4LE:
1.30 daniel 887: handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
888: if (handler != NULL) return(handler);
889: handler = xmlFindCharEncodingHandler("UCS-4");
890: if (handler != NULL) return(handler);
891: handler = xmlFindCharEncodingHandler("UCS4");
892: if (handler != NULL) return(handler);
893: break;
1.25 daniel 894: case XML_CHAR_ENCODING_UCS4BE:
1.30 daniel 895: handler = xmlFindCharEncodingHandler("UCS4BE");
896: if (handler != NULL) return(handler);
897: break;
1.25 daniel 898: case XML_CHAR_ENCODING_UCS4_2143:
1.30 daniel 899: break;
1.25 daniel 900: case XML_CHAR_ENCODING_UCS4_3412:
1.30 daniel 901: break;
1.25 daniel 902: case XML_CHAR_ENCODING_UCS2:
1.30 daniel 903: handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2");
904: if (handler != NULL) return(handler);
905: handler = xmlFindCharEncodingHandler("UCS-2");
906: if (handler != NULL) return(handler);
907: handler = xmlFindCharEncodingHandler("UCS2");
908: if (handler != NULL) return(handler);
909: break;
1.25 daniel 910: case XML_CHAR_ENCODING_8859_1:
911: case XML_CHAR_ENCODING_8859_2:
912: case XML_CHAR_ENCODING_8859_3:
913: case XML_CHAR_ENCODING_8859_4:
914: case XML_CHAR_ENCODING_8859_5:
915: case XML_CHAR_ENCODING_8859_6:
916: case XML_CHAR_ENCODING_8859_7:
917: case XML_CHAR_ENCODING_8859_8:
918: case XML_CHAR_ENCODING_8859_9:
919: return(NULL);
920: case XML_CHAR_ENCODING_2022_JP:
1.30 daniel 921: handler = xmlFindCharEncodingHandler("ISO-2022-JP");
922: if (handler != NULL) return(handler);
923: break;
1.25 daniel 924: case XML_CHAR_ENCODING_SHIFT_JIS:
1.30 daniel 925: handler = xmlFindCharEncodingHandler("SHIFT-JIS");
926: if (handler != NULL) return(handler);
927: handler = xmlFindCharEncodingHandler("SHIFT_JIS");
928: if (handler != NULL) return(handler);
929: handler = xmlFindCharEncodingHandler("Shift_JIS");
930: if (handler != NULL) return(handler);
931: break;
1.25 daniel 932: case XML_CHAR_ENCODING_EUC_JP:
1.30 daniel 933: handler = xmlFindCharEncodingHandler("EUC-JP");
934: if (handler != NULL) return(handler);
935: break;
936: default:
937: break;
1.25 daniel 938: }
1.30 daniel 939:
940: #ifdef DEBUG_ENCODING
941: fprintf(stderr, "No handler found for encoding %d\n", enc);
942: #endif
1.9 daniel 943: return(NULL);
944: }
945:
946: /**
947: * xmlGetCharEncodingHandler:
948: * @enc: a string describing the char encoding.
949: *
950: * Search in the registrered set the handler able to read/write that encoding.
951: *
952: * Returns the handler or NULL if not found
953: */
954: xmlCharEncodingHandlerPtr
955: xmlFindCharEncodingHandler(const char *name) {
1.36 ! daniel 956: xmlCharEncodingHandlerPtr enc;
! 957: xmlCharEncoding alias;
1.30 daniel 958: #ifdef LIBXML_ICONV_ENABLED
959: iconv_t icv_in, icv_out;
960: #endif /* LIBXML_ICONV_ENABLED */
961: char upper[100];
1.9 daniel 962: int i;
963:
964: if (handlers == NULL) xmlInitCharEncodingHandlers();
965: if (name == NULL) return(xmlDefaultCharEncodingHandler);
966: if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
967:
1.36 ! daniel 968: /*
! 969: * Check first for directly registered encoding names
! 970: */
1.30 daniel 971: for (i = 0;i < 99;i++) {
1.9 daniel 972: upper[i] = toupper(name[i]);
973: if (upper[i] == 0) break;
974: }
975: upper[i] = 0;
976:
977: for (i = 0;i < nbCharEncodingHandler; i++)
1.30 daniel 978: if (!strcmp(upper, handlers[i]->name)) {
979: #ifdef DEBUG_ENCODING
980: fprintf(stderr, "Found registered handler for encoding %s\n", name);
981: #endif
1.9 daniel 982: return(handlers[i]);
1.30 daniel 983: }
1.9 daniel 984:
1.36 ! daniel 985: /*
! 986: * check using aliases names
! 987: */
! 988: alias = xmlParseCharEncoding(name);
! 989: if (alias != XML_CHAR_ENCODING_ERROR) {
! 990: enc = xmlGetCharEncodingHandler(alias);
! 991: if (enc != NULL) {
! 992: #ifdef DEBUG_ENCODING
! 993: fprintf(stderr, "Found registered handler %s for encoding %s\n",
! 994: enc->name, name);
! 995: #endif
! 996: return(enc);
! 997: }
! 998: }
1.30 daniel 999: #ifdef LIBXML_ICONV_ENABLED
1000: /* check whether iconv can handle this */
1.31 daniel 1001: icv_in = iconv_open("UTF-8", name);
1002: icv_out = iconv_open(name, "UTF-8");
1.30 daniel 1003: if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
1.31 daniel 1004: enc = xmlMalloc(sizeof(xmlCharEncodingHandler));
1.32 daniel 1005: if (enc == NULL) {
1006: iconv_close(icv_in);
1007: iconv_close(icv_out);
1008: return(NULL);
1009: }
1010: enc->name = NULL;
1.30 daniel 1011: enc->input = NULL;
1012: enc->output = NULL;
1013: enc->iconv_in = icv_in;
1014: enc->iconv_out = icv_out;
1015: #ifdef DEBUG_ENCODING
1016: fprintf(stderr, "Found iconv handler for encoding %s\n", name);
1017: #endif
1018: return enc;
1019: } else if ((icv_in != (iconv_t) -1) || icv_out != (iconv_t) -1) {
1020: fprintf(stderr, "iconv : problems with filters for '%s'\n", name);
1021: }
1022: #endif /* LIBXML_ICONV_ENABLED */
1023: #ifdef DEBUG_ENCODING
1024: fprintf(stderr, "No handler found for encoding %s\n", name);
1025: #endif
1.9 daniel 1026: return(NULL);
1.30 daniel 1027: }
1028:
1029: #ifdef LIBXML_ICONV_ENABLED
/**
 * xmlIconvWrapper:
 * @cd:  iconv converter data structure
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of bytes in the source encoding of @cd,
 *       or NULL
 * @inlen:  the length of @in
 *
 * Run a single iconv() call on the given buffers and translate its
 * outcome into the return codes used by the encoding handlers.
 *
 * Returns 0 if success, or
 *     -1 by lack of space, or
 *     -2 if the transcoding fails (the input is not valid in the source
 *        encoding or the result can't fit into the encoding we want), or
 *     -3 if the last bytes can't form a single output char, or for any
 *        other failure.
 *
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 * When @in is NULL both are set to 0.
 */
static int
xmlIconvWrapper(iconv_t cd,
        unsigned char *out, int *outlen,
        const unsigned char *in, int *inlen) {

    size_t icv_inlen = *inlen, icv_outlen = *outlen;
    const char *icv_in = (const char *) in;
    char *icv_out = (char *) out;
    size_t ret;
    int err = 0;

    /*
     * The second parameter of iconv() is "char **" on POSIX systems and
     * "const char **" on some older implementations; going through
     * "void *" is accepted by both prototypes.
     */
    ret = iconv(cd,
                (void *) &icv_in, &icv_inlen,
                &icv_out, &icv_outlen);
    /* errno is only meaningful when iconv() reported a failure */
    if (ret == (size_t) -1)
        err = errno;

    if (in != NULL) {
        *inlen -= (int) icv_inlen;
        *outlen -= (int) icv_outlen;
    } else {
        *inlen = 0;
        *outlen = 0;
    }

    if ((icv_inlen == 0) && (ret != (size_t) -1))
        return 0;

    /*
     * Each test is self contained so the function still compiles when
     * one of the errno macros is not provided by the platform.
     */
#ifdef EILSEQ
    if (err == EILSEQ)
        return -2;
#endif
#ifdef E2BIG
    if (err == E2BIG)
        return -1;
#endif
    /* EINVAL (truncated input sequence) and everything else */
    return -3;
}
1090: #endif /* LIBXML_ICONV_ENABLED */
1091:
1092: /**
1093: * xmlCharEncInFunc:
1094: * @handler: char enconding transformation data structure
1.31 daniel 1095: * @out: an xmlBuffer for the output.
1096: * @in: an xmlBuffer for the input
1.30 daniel 1097: *
1098: * Generic front-end for the encoding handler input function
1099: *
1.31 daniel 1100: * Returns the number of byte written if success, or
1101: * -1 general error
1.30 daniel 1102: * -2 if the transcoding fails (for *in is not valid utf8 string or
1103: * the result of transformation can't fit into the encoding we want), or
1104: */
1105: int
1.31 daniel 1106: xmlCharEncInFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1107: xmlBufferPtr in) {
1.30 daniel 1108: int ret = -2;
1.31 daniel 1109: int written;
1110: int toconv;
1.30 daniel 1111:
1.31 daniel 1112: if (handler == NULL) return(-1);
1113: if (out == NULL) return(-1);
1114: if (in == NULL) return(-1);
1115:
1116: written = out->size - out->use;
1117: toconv = in->use;
1118: if (toconv * 2 >= written) {
1119: xmlBufferGrow(out, toconv * 2);
1.33 daniel 1120: written = out->size - out->use - 1;
1.31 daniel 1121: }
1.30 daniel 1122: if (handler->input != NULL) {
1.32 daniel 1123: ret = handler->input(&out->content[out->use], &written,
1.31 daniel 1124: in->content, &toconv);
1125: xmlBufferShrink(in, toconv);
1126: out->use += written;
1.33 daniel 1127: out->content[out->use] = 0;
1.30 daniel 1128: }
1129: #ifdef LIBXML_ICONV_ENABLED
1.31 daniel 1130: else if (handler->iconv_in != NULL) {
1131: ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
1132: &written, in->content, &toconv);
1133: xmlBufferShrink(in, toconv);
1134: out->use += written;
1.33 daniel 1135: out->content[out->use] = 0;
1136: if (ret == -1) ret = -3;
1.30 daniel 1137: }
1138: #endif /* LIBXML_ICONV_ENABLED */
1139: #ifdef DEBUG_ENCODING
1140: switch (ret) {
1141: case 0:
1142: fprintf(stderr, "converted %d bytes to %d bytes of input\n",
1.31 daniel 1143: toconv, written);
1.30 daniel 1144: break;
1145: case -1:
1.31 daniel 1146: fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
1147: toconv, written, in->use);
1.30 daniel 1148: break;
1149: case -2:
1150: fprintf(stderr, "input conversion failed due to input error\n");
1151: break;
1152: case -3:
1.31 daniel 1153: fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
1154: toconv, written, in->use);
1.30 daniel 1155: break;
1156: default:
1157: fprintf(stderr,"Unknown input conversion failed %d\n", ret);
1158: }
1159: #endif
1.33 daniel 1160: /*
1161: * Ignore when input buffer is not on a boundary
1162: */
1163: if (ret == -3) ret = 0;
1.30 daniel 1164: return(ret);
1165: }
1166:
1167: /**
1168: * xmlCharEncOutFunc:
1169: * @handler: char enconding transformation data structure
1.31 daniel 1170: * @out: an xmlBuffer for the output.
1171: * @in: an xmlBuffer for the input
1172: *
1173: * Generic front-end for the encoding handler output function
1.35 daniel 1174: * a first call with @in == NULL has to be made firs to initiate the
1175: * output in case of non-stateless encoding needing to initiate their
1176: * state or the output (like the BOM in UTF16).
1.30 daniel 1177: *
1.31 daniel 1178: * Returns the number of byte written if success, or
1179: * -1 general error
1.30 daniel 1180: * -2 if the transcoding fails (for *in is not valid utf8 string or
1181: * the result of transformation can't fit into the encoding we want), or
1182: */
1183: int
1.31 daniel 1184: xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1185: xmlBufferPtr in) {
1.30 daniel 1186: int ret = -2;
1.31 daniel 1187: int written;
1188: int toconv;
1189:
1190: if (handler == NULL) return(-1);
1191: if (out == NULL) return(-1);
1.35 daniel 1192: written = out->size - out->use;
1193:
1194: if (in == NULL) {
1195: toconv = 0;
1196: if (handler->output != NULL) {
1197: ret = handler->output(&out->content[out->use], &written,
1198: NULL, &toconv);
1199: out->use += written;
1200: out->content[out->use] = 0;
1201: }
1202: #ifdef LIBXML_ICONV_ENABLED
1203: else if (handler->iconv_out != NULL) {
1204: ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
1205: &written, NULL, &toconv);
1206: out->use += written;
1207: out->content[out->use] = 0;
1208: }
1209: #endif /* LIBXML_ICONV_ENABLED */
1210: #ifdef DEBUG_ENCODING
1211: fprintf(stderr, "initialized encoder\n");
1212: #endif
1213: return(0);
1214: }
1.30 daniel 1215:
1.33 daniel 1216: toconv = in->use;
1217: if (toconv * 2 >= written) {
1218: xmlBufferGrow(out, toconv * 2);
1219: written = out->size - out->use - 1;
1220: }
1.30 daniel 1221: if (handler->output != NULL) {
1.33 daniel 1222: ret = handler->output(&out->content[out->use], &written,
1.35 daniel 1223: in->content, &toconv);
1.31 daniel 1224: xmlBufferShrink(in, toconv);
1225: out->use += written;
1.33 daniel 1226: out->content[out->use] = 0;
1.30 daniel 1227: }
1228: #ifdef LIBXML_ICONV_ENABLED
1229: else if (handler->iconv_out != NULL) {
1.31 daniel 1230: ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
1231: &written, in->content, &toconv);
1232: xmlBufferShrink(in, toconv);
1233: out->use += written;
1.33 daniel 1234: out->content[out->use] = 0;
1235: if (ret == -1) ret = -3;
1.30 daniel 1236: }
1237: #endif /* LIBXML_ICONV_ENABLED */
1238: #ifdef DEBUG_ENCODING
1239: switch (ret) {
1240: case 0:
1241: fprintf(stderr, "converted %d bytes to %d bytes of output\n",
1.31 daniel 1242: toconv, written);
1.30 daniel 1243: break;
1244: case -1:
1245: fprintf(stderr, "output conversion failed by lack of space\n");
1246: break;
1247: case -2:
1248: fprintf(stderr, "output conversion failed due to output error\n");
1249: break;
1250: case -3:
1.31 daniel 1251: fprintf(stderr,"converted %d bytes to %d bytes of output %d left\n",
1252: toconv, written, in->use);
1.30 daniel 1253: break;
1254: default:
1255: fprintf(stderr,"Unknown output conversion failed %d\n", ret);
1256: }
1257: #endif
1258: return(ret);
1259: }
1260:
1261: /**
1262: * xmlCharEncCloseFunc:
1263: * @handler: char enconding transformation data structure
1264: *
1265: * Generic front-end for hencoding handler close function
1266: *
1267: * Returns 0 if success, or -1 in case of error
1268: */
1269: int
1270: xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
1271: int ret = 0;
1.31 daniel 1272: if (handler == NULL) return(-1);
1273: if (handler->name == NULL) return(-1);
1.30 daniel 1274: #ifdef LIBXML_ICONV_ENABLED
1.31 daniel 1275: /*
1276: * Iconv handlers can be oused only once, free the whole block.
1277: * and the associated icon resources.
1278: */
1.32 daniel 1279: if ((handler->iconv_out != NULL) || (handler->iconv_in != NULL)) {
1280: if (handler->name != NULL)
1281: xmlFree(handler->name);
1282: handler->name = NULL;
1283: if (handler->iconv_out != NULL) {
1284: if (iconv_close(handler->iconv_out))
1285: ret = -1;
1286: handler->iconv_out = NULL;
1287: }
1288: if (handler->iconv_in != NULL) {
1289: if (iconv_close(handler->iconv_in))
1290: ret = -1;
1291: handler->iconv_in = NULL;
1292: }
1293: xmlFree(handler);
1.30 daniel 1294: }
1295: #endif /* LIBXML_ICONV_ENABLED */
1296: #ifdef DEBUG_ENCODING
1297: if (ret)
1298: fprintf(stderr, "failed to close the encoding handler\n");
1299: else
1300: fprintf(stderr, "closed the encoding handler\n");
1301:
1302: #endif
1303: return(ret);
1.9 daniel 1304: }
1305:
Webmaster