Annotation of XML/encoding.c, revision 1.49

1.1       daniel      1: /*
                      2:  * encoding.c : implements the encoding conversion functions needed for XML
                      3:  *
                      4:  * Related specs: 
                      5:  * rfc2044        (UTF-8 and UTF-16) F. Yergeau Alis Technologies
1.39      daniel      6:  * rfc2781        UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau
1.1       daniel      7:  * [ISO-10646]    UTF-8 and UTF-16 in Annexes
                      8:  * [ISO-8859-1]   ISO Latin-1 characters codes.
                      9:  * [UNICODE]      The Unicode Consortium, "The Unicode Standard --
                     10:  *                Worldwide Character Encoding -- Version 1.0", Addison-
                     11:  *                Wesley, Volume 1, 1991, Volume 2, 1992.  UTF-8 is
                     12:  *                described in Unicode Technical Report #4.
                     13:  * [US-ASCII]     Coded Character Set--7-bit American Standard Code for
                     14:  *                Information Interchange, ANSI X3.4-1986.
                     15:  *
1.9       daniel     16:  * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
1.1       daniel     17:  *
                     18:  * See Copyright for the status of this software.
                     19:  *
                     20:  * Daniel.Veillard@w3.org
                     21:  */
                     22: 
1.21      daniel     23: #ifdef WIN32
                     24: #include "win32config.h"
                     25: #else
1.14      daniel     26: #include "config.h"
1.17      daniel     27: #endif
                     28: 
                     29: #include <stdio.h>
                     30: #include <string.h>
                     31: 
                     32: #ifdef HAVE_CTYPE_H
1.7       daniel     33: #include <ctype.h>
1.17      daniel     34: #endif
1.20      daniel     35: #ifdef HAVE_STDLIB_H
                     36: #include <stdlib.h>
                     37: #endif
1.30      daniel     38: #include <libxml/xmlversion.h>
                     39: #ifdef LIBXML_ICONV_ENABLED
                     40: #ifdef HAVE_ERRNO_H
                     41: #include <errno.h>
                     42: #endif
                     43: #endif
1.29      daniel     44: #include <libxml/encoding.h>
                     45: #include <libxml/xmlmemory.h>
1.48      veillard   46: #ifdef LIBXML_HTML_ENABLED
                     47: #include <libxml/HTMLparser.h>
                     48: #endif
1.3       daniel     49: 
/*
 * Handlers for the two UTF-16 byte orders. Both start out NULL; nothing in
 * this part of the file assigns them.
 * NOTE(review): presumably filled in by the encoding handler initialisation
 * code further down the file - confirm.
 */
1.25      daniel     50: xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
                     51: xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
                     52: 
/*
 * Debug switch: DEBUG_ENCODING is only reachable when iconv support is
 * enabled, and is currently compiled out by the "#if 0". When defined, the
 * conversion code prints traces to stderr.
 */
1.30      daniel     53: #ifdef LIBXML_ICONV_ENABLED
1.46      veillard   54: #if 0
1.30      daniel     55: #define DEBUG_ENCODING  /* Define this to get encoding traces */
                     56: #endif
1.33      daniel     57: #endif
1.30      daniel     58: 
/*
 * Non-zero when 16-bit units can be read/written natively as little-endian.
 * The UTF-16LE conversion routines test it and fall back to byte-by-byte
 * access when it is zero. Defaults to 1.
 * NOTE(review): presumably reset at initialisation time on big-endian
 * hosts - confirm against the init code.
 */
1.34      daniel     59: static int xmlLittleEndian = 1;
                     60: 
1.3       daniel     61: /*
                     62:  * From rfc2044: encoding of the Unicode values on UTF-8:
                     63:  *
                     64:  * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
                     65:  * 0000 0000-0000 007F   0xxxxxxx
                     66:  * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
                     67:  * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx 
                     68:  *
                     69:  * I hope we won't use values > 0xFFFF anytime soon !
                     70:  */
1.1       daniel     71: 
                     72: /**
1.39      daniel     73:  * xmlGetUTF8Char:
                     74:  * @utf:  a sequence of UTF-8 encoded bytes
                     75:  * @len:  a pointer to @bytes len
                     76:  *
                     77:  * Read one UTF8 Char from @utf
                     78:  *
                     79:  * Returns the char value or -1 in case of error and update @len with the
                     80:  *        number of bytes used
                     81:  */
                     82: int
                     83: xmlGetUTF8Char(const unsigned char *utf, int *len) {
                     84:     unsigned int c;
                     85: 
                     86:     if (utf == NULL)
                     87:        goto error;
                     88:     if (len == NULL)
                     89:        goto error;
                     90:     if (*len < 1)
                     91:        goto error;
                     92: 
                     93:     c = utf[0];
                     94:     if (c & 0x80) {
                     95:        if (*len < 2)
                     96:            goto error;
                     97:        if ((utf[1] & 0xc0) != 0x80)
                     98:            goto error;
                     99:        if ((c & 0xe0) == 0xe0) {
                    100:            if (*len < 3)
                    101:                goto error;
                    102:            if ((utf[2] & 0xc0) != 0x80)
                    103:                goto error;
                    104:            if ((c & 0xf0) == 0xf0) {
                    105:                if (*len < 4)
                    106:                    goto error;
                    107:                if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
                    108:                    goto error;
                    109:                *len = 4;
                    110:                /* 4-byte code */
                    111:                c = (utf[0] & 0x7) << 18;
                    112:                c |= (utf[1] & 0x3f) << 12;
                    113:                c |= (utf[2] & 0x3f) << 6;
                    114:                c |= utf[3] & 0x3f;
                    115:            } else {
                    116:              /* 3-byte code */
                    117:                *len = 3;
                    118:                c = (utf[0] & 0xf) << 12;
                    119:                c |= (utf[1] & 0x3f) << 6;
                    120:                c |= utf[2] & 0x3f;
                    121:            }
                    122:        } else {
                    123:          /* 2-byte code */
                    124:            *len = 2;
                    125:            c = (utf[0] & 0x1f) << 6;
                    126:            c |= utf[1] & 0x3f;
                    127:        }
                    128:     } else {
                    129:        /* 1-byte code */
                    130:        *len = 1;
                    131:     }
                    132:     return(c);
                    133: 
                    134: error:
                    135:     *len = 0;
                    136:     return(-1);
                    137: }
                    138: 
                    139: /**
1.22      daniel    140:  * xmlCheckUTF8: Check utf-8 string for legality.
                    141:  * @utf: Pointer to putative utf-8 encoded string.
                    142:  *
                    143:  * Checks @utf for being valid utf-8. @utf is assumed to be
                    144:  * null-terminated. This function is not super-strict, as it will
                    145:  * allow longer utf-8 sequences than necessary. Note that Java is
                    146:  * capable of producing these sequences if provoked. Also note, this
                    147:  * routine checks for the 4-byte maxiumum size, but does not check for
                    148:  * 0x10ffff maximum value.
                    149:  *
                    150:  * Return value: true if @utf is valid.
                    151:  **/
                    152: int
                    153: xmlCheckUTF8(const unsigned char *utf)
                    154: {
                    155:     int ix;
                    156:     unsigned char c;
                    157: 
                    158:     for (ix = 0; (c = utf[ix]);) {
                    159:         if (c & 0x80) {
                    160:            if ((utf[ix + 1] & 0xc0) != 0x80)
                    161:                return(0);
                    162:            if ((c & 0xe0) == 0xe0) {
                    163:                if ((utf[ix + 2] & 0xc0) != 0x80)
                    164:                    return(0);
                    165:                if ((c & 0xf0) == 0xf0) {
                    166:                    if ((c & 0xf8) != 0xf0 || (utf[ix + 3] & 0xc0) != 0x80)
                    167:                        return(0);
                    168:                    ix += 4;
                    169:                    /* 4-byte code */
                    170:                } else
                    171:                  /* 3-byte code */
                    172:                    ix += 3;
                    173:            } else
                    174:              /* 2-byte code */
                    175:                ix += 2;
                    176:        } else
                    177:            /* 1-byte code */
                    178:            ix++;
                    179:       }
                    180:       return(1);
                    181: }
                    182: 
                    183: /**
1.47      veillard  184:  * asciiToUTF8:
                    185:  * @out:  a pointer to an array of bytes to store the result
                    186:  * @outlen:  the length of @out
                    187:  * @in:  a pointer to an array of ASCII chars
                    188:  * @inlen:  the length of @in
                    189:  *
                    190:  * Take a block of ASCII chars in and try to convert it to an UTF-8
                    191:  * block of chars out.
                    192:  * Returns 0 if success, or -1 otherwise
                    193:  * The value of @inlen after return is the number of octets consumed
                    194:  *     as the return value is positive, else unpredictiable.
                    195:  * The value of @outlen after return is the number of ocetes consumed.
                    196:  */
                    197: int
                    198: asciiToUTF8(unsigned char* out, int *outlen,
                    199:               const unsigned char* in, int *inlen) {
                    200:     unsigned char* outstart = out;
                    201:     const unsigned char* base = in;
                    202:     const unsigned char* processed = in;
                    203:     unsigned char* outend = out + *outlen;
                    204:     const unsigned char* inend;
                    205:     unsigned int c;
                    206:     int bits;
                    207: 
                    208:     inend = in + (*inlen);
                    209:     while ((in < inend) && (out - outstart + 5 < *outlen)) {
                    210:        c= *in++;
                    211: 
                    212:        /* assertion: c is a single UTF-4 value */
                    213:         if (out >= outend)
                    214:            break;
                    215:         if      (c <    0x80) {  *out++=  c;                bits= -6; }
                    216:         else { 
                    217:            *outlen = out - outstart;
                    218:            *inlen = processed - base;
                    219:            return(-1);
                    220:        }
                    221:  
                    222:         for ( ; bits >= 0; bits-= 6) {
                    223:             if (out >= outend)
                    224:                break;
                    225:             *out++= ((c >> bits) & 0x3F) | 0x80;
                    226:         }
                    227:        processed = (const unsigned char*) in;
                    228:     }
                    229:     *outlen = out - outstart;
                    230:     *inlen = processed - base;
                    231:     return(0);
                    232: }
                    233: 
                    234: /**
                    235:  * UTF8Toascii:
                    236:  * @out:  a pointer to an array of bytes to store the result
                    237:  * @outlen:  the length of @out
                    238:  * @in:  a pointer to an array of UTF-8 chars
                    239:  * @inlen:  the length of @in
                    240:  *
                    241:  * Take a block of UTF-8 chars in and try to convert it to an ASCII
                    242:  * block of chars out.
                    243:  *
                    244:  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
                    245:  * The value of @inlen after return is the number of octets consumed
                    246:  *     as the return value is positive, else unpredictiable.
                    247:  * The value of @outlen after return is the number of ocetes consumed.
                    248:  */
                    249: int
                    250: UTF8Toascii(unsigned char* out, int *outlen,
                    251:               const unsigned char* in, int *inlen) {
                    252:     const unsigned char* processed = in;
                    253:     const unsigned char* outend;
                    254:     const unsigned char* outstart = out;
                    255:     const unsigned char* instart = in;
                    256:     const unsigned char* inend;
                    257:     unsigned int c, d;
                    258:     int trailing;
                    259: 
                    260:     if (in == NULL) {
                    261:         /*
                    262:         * initialization nothing to do
                    263:         */
                    264:        *outlen = 0;
                    265:        *inlen = 0;
                    266:        return(0);
                    267:     }
                    268:     inend = in + (*inlen);
                    269:     outend = out + (*outlen);
                    270:     while (in < inend) {
                    271:        d = *in++;
                    272:        if      (d < 0x80)  { c= d; trailing= 0; }
                    273:        else if (d < 0xC0) {
                    274:            /* trailing byte in leading position */
                    275:            *outlen = out - outstart;
                    276:            *inlen = processed - instart;
                    277:            return(-2);
                    278:         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
                    279:         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
                    280:         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
                    281:        else {
                    282:            /* no chance for this in Ascii */
                    283:            *outlen = out - outstart;
                    284:            *inlen = processed - instart;
                    285:            return(-2);
                    286:        }
                    287: 
                    288:        if (inend - in < trailing) {
                    289:            break;
                    290:        } 
                    291: 
                    292:        for ( ; trailing; trailing--) {
                    293:            if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
                    294:                break;
                    295:            c <<= 6;
                    296:            c |= d & 0x3F;
                    297:        }
                    298: 
                    299:        /* assertion: c is a single UTF-4 value */
                    300:        if (c < 0x80) {
                    301:            if (out >= outend)
                    302:                break;
                    303:            *out++ = c;
                    304:        } else {
                    305:            /* no chance for this in Ascii */
                    306:            *outlen = out - outstart;
                    307:            *inlen = processed - instart;
                    308:            return(-2);
                    309:        }
                    310:        processed = in;
                    311:     }
                    312:     *outlen = out - outstart;
                    313:     *inlen = processed - instart;
                    314:     return(0);
                    315: }
                    316: 
                    317: /**
1.1       daniel    318:  * isolat1ToUTF8:
1.18      daniel    319:  * @out:  a pointer to an array of bytes to store the result
                    320:  * @outlen:  the length of @out
                    321:  * @in:  a pointer to an array of ISO Latin 1 chars
                    322:  * @inlen:  the length of @in
1.1       daniel    323:  *
                    324:  * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
                    325:  * block of chars out.
1.33      daniel    326:  * Returns 0 if success, or -1 otherwise
                    327:  * The value of @inlen after return is the number of octets consumed
                    328:  *     as the return value is positive, else unpredictiable.
                    329:  * The value of @outlen after return is the number of ocetes consumed.
1.1       daniel    330:  */
                    331: int
1.33      daniel    332: isolat1ToUTF8(unsigned char* out, int *outlen,
1.25      daniel    333:               const unsigned char* in, int *inlen) {
1.33      daniel    334:     unsigned char* outstart = out;
1.45      veillard  335:     const unsigned char* base = in;
1.33      daniel    336:     const unsigned char* processed = in;
                    337:     unsigned char* outend = out + *outlen;
1.45      veillard  338:     const unsigned char* inend;
                    339:     unsigned int c;
                    340:     int bits;
                    341: 
                    342:     inend = in + (*inlen);
                    343:     while ((in < inend) && (out - outstart + 5 < *outlen)) {
                    344:        c= *in++;
1.1       daniel    345: 
1.45      veillard  346:        /* assertion: c is a single UTF-4 value */
                    347:         if (out >= outend)
                    348:            break;
                    349:         if      (c <    0x80) {  *out++=  c;                bits= -6; }
                    350:         else                  {  *out++= ((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
                    351:  
                    352:         for ( ; bits >= 0; bits-= 6) {
1.33      daniel    353:             if (out >= outend)
1.45      veillard  354:                break;
                    355:             *out++= ((c >> bits) & 0x3F) | 0x80;
1.1       daniel    356:         }
1.45      veillard  357:        processed = (const unsigned char*) in;
1.1       daniel    358:     }
1.33      daniel    359:     *outlen = out - outstart;
1.45      veillard  360:     *inlen = processed - base;
1.33      daniel    361:     return(0);
1.1       daniel    362: }
                    363: 
                    364: /**
                    365:  * UTF8Toisolat1:
1.18      daniel    366:  * @out:  a pointer to an array of bytes to store the result
                    367:  * @outlen:  the length of @out
                    368:  * @in:  a pointer to an array of UTF-8 chars
                    369:  * @inlen:  the length of @in
1.1       daniel    370:  *
                    371:  * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
                    372:  * block of chars out.
1.15      daniel    373:  *
1.33      daniel    374:  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1.28      daniel    375:  * The value of @inlen after return is the number of octets consumed
                    376:  *     as the return value is positive, else unpredictiable.
1.33      daniel    377:  * The value of @outlen after return is the number of ocetes consumed.
1.1       daniel    378:  */
                    379: int
1.33      daniel    380: UTF8Toisolat1(unsigned char* out, int *outlen,
1.25      daniel    381:               const unsigned char* in, int *inlen) {
1.33      daniel    382:     const unsigned char* processed = in;
1.45      veillard  383:     const unsigned char* outend;
                    384:     const unsigned char* outstart = out;
                    385:     const unsigned char* instart = in;
                    386:     const unsigned char* inend;
                    387:     unsigned int c, d;
                    388:     int trailing;
1.1       daniel    389: 
1.45      veillard  390:     if (in == NULL) {
                    391:         /*
                    392:         * initialization nothing to do
                    393:         */
                    394:        *outlen = 0;
                    395:        *inlen = 0;
                    396:        return(0);
                    397:     }
                    398:     inend = in + (*inlen);
                    399:     outend = out + (*outlen);
1.1       daniel    400:     while (in < inend) {
1.45      veillard  401:        d = *in++;
                    402:        if      (d < 0x80)  { c= d; trailing= 0; }
                    403:        else if (d < 0xC0) {
                    404:            /* trailing byte in leading position */
                    405:            *outlen = out - outstart;
                    406:            *inlen = processed - instart;
                    407:            return(-2);
                    408:         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
                    409:         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
                    410:         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
                    411:        else {
                    412:            /* no chance for this in IsoLat1 */
                    413:            *outlen = out - outstart;
                    414:            *inlen = processed - instart;
                    415:            return(-2);
1.23      daniel    416:        }
1.45      veillard  417: 
                    418:        if (inend - in < trailing) {
                    419:            break;
                    420:        } 
                    421: 
                    422:        for ( ; trailing; trailing--) {
                    423:            if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
                    424:                break;
                    425:            c <<= 6;
                    426:            c |= d & 0x3F;
1.23      daniel    427:        }
1.45      veillard  428: 
                    429:        /* assertion: c is a single UTF-4 value */
                    430:        if (c <= 0xFF) {
                    431:            if (out >= outend)
                    432:                break;
                    433:            *out++ = c;
                    434:        } else {
                    435:            /* no chance for this in IsoLat1 */
1.33      daniel    436:            *outlen = out - outstart;
1.45      veillard  437:            *inlen = processed - instart;
1.28      daniel    438:            return(-2);
1.33      daniel    439:        }
                    440:        processed = in;
1.1       daniel    441:     }
1.33      daniel    442:     *outlen = out - outstart;
1.45      veillard  443:     *inlen = processed - instart;
1.33      daniel    444:     return(0);
1.1       daniel    445: }
                    446: 
                    447: /**
1.28      daniel    448:  * UTF16LEToUTF8:
                    449:  * @out:  a pointer to an array of bytes to store the result
                    450:  * @outlen:  the length of @out
                    451:  * @inb:  a pointer to an array of UTF-16LE passwd as a byte array
                    452:  * @inlenb:  the length of @in in UTF-16LE chars
                    453:  *
                    454:  * Take a block of UTF-16LE ushorts in and try to convert it to an UTF-8
                    455:  * block of chars out. This function assume the endian properity
                    456:  * is the same between the native type of this machine and the
                    457:  * inputed one.
                    458:  *
                    459:  * Returns the number of byte written, or -1 by lack of space, or -2
                    460:  *     if the transcoding fails (for *in is not valid utf16 string)
                    461:  *     The value of *inlen after return is the number of octets consumed
                    462:  *     as the return value is positive, else unpredictiable.
                    463:  */
                    464: int
1.33      daniel    465: UTF16LEToUTF8(unsigned char* out, int *outlen,
1.28      daniel    466:             const unsigned char* inb, int *inlenb)
                    467: {
1.33      daniel    468:     unsigned char* outstart = out;
                    469:     const unsigned char* processed = inb;
                    470:     unsigned char* outend = out + *outlen;
1.28      daniel    471:     unsigned short* in = (unsigned short*) inb;
                    472:     unsigned short* inend;
                    473:     unsigned int c, d, inlen;
                    474:     unsigned char *tmp;
                    475:     int bits;
                    476: 
                    477:     if ((*inlenb % 2) == 1)
                    478:         (*inlenb)--;
                    479:     inlen = *inlenb / 2;
1.33      daniel    480:     inend = in + inlen;
1.39      daniel    481:     while ((in < inend) && (out - outstart + 5 < *outlen)) {
1.34      daniel    482:         if (xmlLittleEndian) {
                    483:            c= *in++;
                    484:        } else {
                    485:            tmp = (unsigned char *) in;
                    486:            c = *tmp++;
                    487:            c = c | (((unsigned int)*tmp) << 8);
                    488:            in++;
                    489:        }
1.28      daniel    490:         if ((c & 0xFC00) == 0xD800) {    /* surrogates */
1.39      daniel    491:            if (in >= inend) {           /* (in > inend) shouldn't happens */
                    492:                break;
                    493:            }
1.34      daniel    494:            if (xmlLittleEndian) {
                    495:                d = *in++;
                    496:            } else {
                    497:                tmp = (unsigned char *) in;
                    498:                d = *tmp++;
                    499:                d = d | (((unsigned int)*tmp) << 8);
                    500:                in++;
                    501:            }
1.28      daniel    502:             if ((d & 0xFC00) == 0xDC00) {
                    503:                 c &= 0x03FF;
                    504:                 c <<= 10;
                    505:                 c |= d & 0x03FF;
                    506:                 c += 0x10000;
                    507:             }
1.33      daniel    508:             else {
                    509:                *outlen = out - outstart;
                    510:                *inlenb = processed - inb;
1.28      daniel    511:                return(-2);
1.33      daniel    512:            }
1.28      daniel    513:         }
                    514: 
                    515:        /* assertion: c is a single UTF-4 value */
                    516:         if (out >= outend)
1.33      daniel    517:            break;
1.28      daniel    518:         if      (c <    0x80) {  *out++=  c;                bits= -6; }
                    519:         else if (c <   0x800) {  *out++= ((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
                    520:         else if (c < 0x10000) {  *out++= ((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
                    521:         else                  {  *out++= ((c >> 18) & 0x07) | 0xF0;  bits= 12; }
                    522:  
                    523:         for ( ; bits >= 0; bits-= 6) {
                    524:             if (out >= outend)
1.33      daniel    525:                break;
1.28      daniel    526:             *out++= ((c >> bits) & 0x3F) | 0x80;
                    527:         }
1.33      daniel    528:        processed = (const unsigned char*) in;
1.28      daniel    529:     }
1.33      daniel    530:     *outlen = out - outstart;
                    531:     *inlenb = processed - inb;
                    532:     return(0);
1.28      daniel    533: }
                    534: 
                    535: /**
                    536:  * UTF8ToUTF16LE:
                    537:  * @outb:  a pointer to an array of bytes to store the result
                    538:  * @outlen:  the length of @outb
                    539:  * @in:  a pointer to an array of UTF-8 chars
                    540:  * @inlen:  the length of @in
                    541:  *
                    542:  * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
                    543:  * block of chars out.
                    544:  *
                    545:  * Returns the number of byte written, or -1 by lack of space, or -2
                    546:  *     if the transcoding failed. 
                    547:  */
                    548: int
1.33      daniel    549: UTF8ToUTF16LE(unsigned char* outb, int *outlen,
1.28      daniel    550:             const unsigned char* in, int *inlen)
                    551: {
                    552:     unsigned short* out = (unsigned short*) outb;
1.33      daniel    553:     const unsigned char* processed = in;
1.28      daniel    554:     unsigned short* outstart= out;
                    555:     unsigned short* outend;
                    556:     const unsigned char* inend= in+*inlen;
1.40      daniel    557:     unsigned int c, d;
                    558:     int trailing;
1.28      daniel    559:     unsigned char *tmp;
                    560:     unsigned short tmp1, tmp2;
                    561: 
1.37      daniel    562:     if (in == NULL) {
                    563:         /*
                    564:         * initialization, add the Byte Order Mark
                    565:         */
                    566:         if (*outlen >= 2) {
                    567:            outb[0] = 0xFF;
                    568:            outb[1] = 0xFE;
                    569:            *outlen = 2;
                    570:            *inlen = 0;
                    571: #ifdef DEBUG_ENCODING
                    572:             fprintf(stderr, "Added FFFE Byte Order Mark\n");
                    573: #endif
                    574:            return(2);
                    575:        }
                    576:        *outlen = 0;
                    577:        *inlen = 0;
                    578:        return(0);
                    579:     }
1.33      daniel    580:     outend = out + (*outlen / 2);
1.28      daniel    581:     while (in < inend) {
                    582:       d= *in++;
                    583:       if      (d < 0x80)  { c= d; trailing= 0; }
1.33      daniel    584:       else if (d < 0xC0) {
                    585:           /* trailing byte in leading position */
1.45      veillard  586:          *outlen = (out - outstart) * 2;
1.33      daniel    587:          *inlen = processed - in;
                    588:          return(-2);
                    589:       } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
1.28      daniel    590:       else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
                    591:       else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
1.33      daniel    592:       else {
                    593:        /* no chance for this in UTF-16 */
1.45      veillard  594:        *outlen = (out - outstart) * 2;
1.33      daniel    595:        *inlen = processed - in;
                    596:        return(-2);
                    597:       }
1.28      daniel    598: 
                    599:       if (inend - in < trailing) {
                    600:           break;
                    601:       } 
                    602: 
                    603:       for ( ; trailing; trailing--) {
                    604:           if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1.33      daniel    605:              break;
1.28      daniel    606:           c <<= 6;
                    607:           c |= d & 0x3F;
                    608:       }
                    609: 
                    610:       /* assertion: c is a single UTF-4 value */
                    611:         if (c < 0x10000) {
                    612:             if (out >= outend)
1.33      daniel    613:                break;
1.34      daniel    614:            if (xmlLittleEndian) {
                    615:                *out++ = c;
                    616:            } else {
                    617:                tmp = (unsigned char *) out;
                    618:                *tmp = c ;
                    619:                *(tmp + 1) = c >> 8 ;
                    620:                out++;
                    621:            }
1.28      daniel    622:         }
                    623:         else if (c < 0x110000) {
                    624:             if (out+1 >= outend)
1.33      daniel    625:                break;
1.28      daniel    626:             c -= 0x10000;
1.34      daniel    627:            if (xmlLittleEndian) {
                    628:                *out++ = 0xD800 | (c >> 10);
                    629:                *out++ = 0xDC00 | (c & 0x03FF);
                    630:            } else {
                    631:                tmp1 = 0xD800 | (c >> 10);
                    632:                tmp = (unsigned char *) out;
1.40      daniel    633:                *tmp = (unsigned char) tmp1;
1.34      daniel    634:                *(tmp + 1) = tmp1 >> 8;
                    635:                out++;
                    636: 
                    637:                tmp2 = 0xDC00 | (c & 0x03FF);
                    638:                tmp = (unsigned char *) out;
1.40      daniel    639:                *tmp  = (unsigned char) tmp2;
1.34      daniel    640:                *(tmp + 1) = tmp2 >> 8;
                    641:                out++;
                    642:            }
1.28      daniel    643:         }
                    644:         else
1.33      daniel    645:            break;
                    646:        processed = in;
1.28      daniel    647:     }
1.36      daniel    648:     *outlen = (out - outstart) * 2;
1.33      daniel    649:     *inlen = processed - in;
                    650:     return(0);
1.28      daniel    651: }
                    652: 
                    653: /**
                    654:  * UTF16BEToUTF8:
1.18      daniel    655:  * @out:  a pointer to an array of bytes to store the result
                    656:  * @outlen:  the length of @out
1.25      daniel    657:  * @inb:  a pointer to an array of UTF-16 passwd as a byte array
                    658:  * @inlenb:  the length of @in in UTF-16 chars
1.1       daniel    659:  *
                    660:  * Take a block of UTF-16 ushorts in and try to convert it to an UTF-8
1.28      daniel    661:  * block of chars out. This function assume the endian properity
                    662:  * is the same between the native type of this machine and the
                    663:  * inputed one.
1.25      daniel    664:  *
1.28      daniel    665:  * Returns the number of byte written, or -1 by lack of space, or -2
                    666:  *     if the transcoding fails (for *in is not valid utf16 string)
                    667:  * The value of *inlen after return is the number of octets consumed
                    668:  *     as the return value is positive, else unpredictiable.
1.1       daniel    669:  */
                    670: int
1.33      daniel    671: UTF16BEToUTF8(unsigned char* out, int *outlen,
1.25      daniel    672:             const unsigned char* inb, int *inlenb)
1.1       daniel    673: {
1.33      daniel    674:     unsigned char* outstart = out;
                    675:     const unsigned char* processed = inb;
                    676:     unsigned char* outend = out + *outlen;
1.25      daniel    677:     unsigned short* in = (unsigned short*) inb;
                    678:     unsigned short* inend;
                    679:     unsigned int c, d, inlen;
1.28      daniel    680:     unsigned char *tmp;
1.1       daniel    681:     int bits;
                    682: 
1.28      daniel    683:     if ((*inlenb % 2) == 1)
                    684:         (*inlenb)--;
1.25      daniel    685:     inlen = *inlenb / 2;
                    686:     inend= in + inlen;
1.1       daniel    687:     while (in < inend) {
1.34      daniel    688:        if (xmlLittleEndian) {
                    689:            tmp = (unsigned char *) in;
                    690:            c = *tmp++;
                    691:            c = c << 8;
                    692:            c = c | (unsigned int) *tmp;
                    693:            in++;
                    694:        } else {
                    695:            c= *in++;
                    696:        } 
1.1       daniel    697:         if ((c & 0xFC00) == 0xD800) {    /* surrogates */
1.28      daniel    698:            if (in >= inend) {           /* (in > inend) shouldn't happens */
1.33      daniel    699:                *outlen = out - outstart;
                    700:                *inlenb = processed - inb;
                    701:                return(-2);
1.28      daniel    702:            }
1.34      daniel    703:            if (xmlLittleEndian) {
                    704:                tmp = (unsigned char *) in;
                    705:                d = *tmp++;
                    706:                d = d << 8;
                    707:                d = d | (unsigned int) *tmp;
                    708:                in++;
                    709:            } else {
                    710:                d= *in++;
                    711:            }
1.28      daniel    712:             if ((d & 0xFC00) == 0xDC00) {
1.1       daniel    713:                 c &= 0x03FF;
                    714:                 c <<= 10;
                    715:                 c |= d & 0x03FF;
                    716:                 c += 0x10000;
                    717:             }
1.33      daniel    718:             else {
                    719:                *outlen = out - outstart;
                    720:                *inlenb = processed - inb;
1.28      daniel    721:                return(-2);
1.33      daniel    722:            }
1.1       daniel    723:         }
                    724: 
1.25      daniel    725:        /* assertion: c is a single UTF-4 value */
1.27      daniel    726:         if (out >= outend) 
1.33      daniel    727:            break;
1.1       daniel    728:         if      (c <    0x80) {  *out++=  c;                bits= -6; }
1.26      daniel    729:         else if (c <   0x800) {  *out++= ((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
                    730:         else if (c < 0x10000) {  *out++= ((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
                    731:         else                  {  *out++= ((c >> 18) & 0x07) | 0xF0;  bits= 12; }
1.1       daniel    732:  
1.26      daniel    733:         for ( ; bits >= 0; bits-= 6) {
1.27      daniel    734:             if (out >= outend) 
1.33      daniel    735:                break;
1.26      daniel    736:             *out++= ((c >> bits) & 0x3F) | 0x80;
1.1       daniel    737:         }
1.33      daniel    738:        processed = (const unsigned char*) in;
1.1       daniel    739:     }
1.33      daniel    740:     *outlen = out - outstart;
                    741:     *inlenb = processed - inb;
                    742:     return(0);
1.1       daniel    743: }
                    744: 
                    745: /**
1.28      daniel    746:  * UTF8ToUTF16BE:
1.25      daniel    747:  * @outb:  a pointer to an array of bytes to store the result
                    748:  * @outlen:  the length of @outb
1.18      daniel    749:  * @in:  a pointer to an array of UTF-8 chars
                    750:  * @inlen:  the length of @in
1.1       daniel    751:  *
1.28      daniel    752:  * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
1.1       daniel    753:  * block of chars out.
1.15      daniel    754:  *
1.6       daniel    755:  * Returns the number of byte written, or -1 by lack of space, or -2
1.25      daniel    756:  *     if the transcoding failed. 
1.1       daniel    757:  */
                    758: int
1.33      daniel    759: UTF8ToUTF16BE(unsigned char* outb, int *outlen,
1.25      daniel    760:             const unsigned char* in, int *inlen)
1.1       daniel    761: {
1.25      daniel    762:     unsigned short* out = (unsigned short*) outb;
1.33      daniel    763:     const unsigned char* processed = in;
1.1       daniel    764:     unsigned short* outstart= out;
1.28      daniel    765:     unsigned short* outend;
1.25      daniel    766:     const unsigned char* inend= in+*inlen;
1.40      daniel    767:     unsigned int c, d;
                    768:     int trailing;
1.28      daniel    769:     unsigned char *tmp;
                    770:     unsigned short tmp1, tmp2;
1.1       daniel    771: 
1.37      daniel    772:     if (in == NULL) {
                    773:         /*
                    774:         * initialization, add the Byte Order Mark
                    775:         */
                    776:         if (*outlen >= 2) {
                    777:            outb[0] = 0xFE;
                    778:            outb[1] = 0xFF;
                    779:            *outlen = 2;
                    780:            *inlen = 0;
                    781: #ifdef DEBUG_ENCODING
                    782:             fprintf(stderr, "Added FEFF Byte Order Mark\n");
                    783: #endif
                    784:            return(2);
                    785:        }
                    786:        *outlen = 0;
                    787:        *inlen = 0;
                    788:        return(0);
                    789:     }
1.33      daniel    790:     outend = out + (*outlen / 2);
1.1       daniel    791:     while (in < inend) {
                    792:       d= *in++;
                    793:       if      (d < 0x80)  { c= d; trailing= 0; }
1.33      daniel    794:       else if (d < 0xC0)  {
                    795:           /* trailing byte in leading position */
                    796:          *outlen = out - outstart;
                    797:          *inlen = processed - in;
                    798:          return(-2);
                    799:       } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
1.1       daniel    800:       else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
                    801:       else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
1.33      daniel    802:       else {
                    803:           /* no chance for this in UTF-16 */
                    804:          *outlen = out - outstart;
                    805:          *inlen = processed - in;
                    806:          return(-2);
                    807:       }
1.28      daniel    808: 
                    809:       if (inend - in < trailing) {
                    810:           break;
                    811:       } 
1.1       daniel    812: 
                    813:       for ( ; trailing; trailing--) {
1.33      daniel    814:           if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))  break;
1.1       daniel    815:           c <<= 6;
                    816:           c |= d & 0x3F;
                    817:       }
                    818: 
                    819:       /* assertion: c is a single UTF-4 value */
                    820:         if (c < 0x10000) {
1.33      daniel    821:             if (out >= outend)  break;
1.34      daniel    822:            if (xmlLittleEndian) {
                    823:                tmp = (unsigned char *) out;
                    824:                *tmp = c >> 8;
                    825:                *(tmp + 1) = c;
                    826:                out++;
                    827:            } else {
                    828:                *out++ = c;
                    829:            }
1.1       daniel    830:         }
                    831:         else if (c < 0x110000) {
1.33      daniel    832:             if (out+1 >= outend)  break;
1.1       daniel    833:             c -= 0x10000;
1.34      daniel    834:            if (xmlLittleEndian) {
                    835:                tmp1 = 0xD800 | (c >> 10);
                    836:                tmp = (unsigned char *) out;
                    837:                *tmp = tmp1 >> 8;
1.40      daniel    838:                *(tmp + 1) = (unsigned char) tmp1;
1.34      daniel    839:                out++;
                    840: 
                    841:                tmp2 = 0xDC00 | (c & 0x03FF);
                    842:                tmp = (unsigned char *) out;
                    843:                *tmp = tmp2 >> 8;
1.40      daniel    844:                *(tmp + 1) = (unsigned char) tmp2;
1.34      daniel    845:                out++;
                    846:            } else {
                    847:                *out++ = 0xD800 | (c >> 10);
                    848:                *out++ = 0xDC00 | (c & 0x03FF);
                    849:            }
1.1       daniel    850:         }
1.33      daniel    851:         else
                    852:            break;
                    853:        processed = in;
1.1       daniel    854:     }
1.36      daniel    855:     *outlen = (out - outstart) * 2;
1.33      daniel    856:     *inlen = processed - in;
                    857:     return(0);
1.1       daniel    858: }
                    859: 
1.7       daniel    860: /**
                    861:  * xmlDetectCharEncoding:
                    862:  * @in:  a pointer to the first bytes of the XML entity, must be at least
                    863:  *       4 bytes long.
1.25      daniel    864:  * @len:  pointer to the length of the buffer
1.7       daniel    865:  *
                    866:  * Guess the encoding of the entity using the first bytes of the entity content
                    867:  * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
                    868:  * 
                    869:  * Returns one of the XML_CHAR_ENCODING_... values.
                    870:  */
                    871: xmlCharEncoding
1.25      daniel    872: xmlDetectCharEncoding(const unsigned char* in, int len)
1.7       daniel    873: {
1.25      daniel    874:     if (len >= 4) {
                    875:        if ((in[0] == 0x00) && (in[1] == 0x00) &&
                    876:            (in[2] == 0x00) && (in[3] == 0x3C))
                    877:            return(XML_CHAR_ENCODING_UCS4BE);
                    878:        if ((in[0] == 0x3C) && (in[1] == 0x00) &&
                    879:            (in[2] == 0x00) && (in[3] == 0x00))
                    880:            return(XML_CHAR_ENCODING_UCS4LE);
                    881:        if ((in[0] == 0x00) && (in[1] == 0x00) &&
                    882:            (in[2] == 0x3C) && (in[3] == 0x00))
                    883:            return(XML_CHAR_ENCODING_UCS4_2143);
                    884:        if ((in[0] == 0x00) && (in[1] == 0x3C) &&
                    885:            (in[2] == 0x00) && (in[3] == 0x00))
                    886:            return(XML_CHAR_ENCODING_UCS4_3412);
                    887:        if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
                    888:            (in[2] == 0xA7) && (in[3] == 0x94))
                    889:            return(XML_CHAR_ENCODING_EBCDIC);
                    890:        if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
                    891:            (in[2] == 0x78) && (in[3] == 0x6D))
                    892:            return(XML_CHAR_ENCODING_UTF8);
                    893:     }
                    894:     if (len >= 2) {
                    895:        if ((in[0] == 0xFE) && (in[1] == 0xFF))
                    896:            return(XML_CHAR_ENCODING_UTF16BE);
                    897:        if ((in[0] == 0xFF) && (in[1] == 0xFE))
                    898:            return(XML_CHAR_ENCODING_UTF16LE);
                    899:     }
1.7       daniel    900:     return(XML_CHAR_ENCODING_NONE);
                    901: }
                    902: 
                    903: /**
                    904:  * xmlParseCharEncoding:
1.18      daniel    905:  * @name:  the encoding name as parsed, in UTF-8 format (ASCII actually)
1.7       daniel    906:  *
                    907:  * Conpare the string to the known encoding schemes already known. Note
                    908:  * that the comparison is case insensitive accordingly to the section
                    909:  * [XML] 4.3.3 Character Encoding in Entities.
                    910:  * 
                    911:  * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
                    912:  * if not recognized.
                    913:  */
                    914: xmlCharEncoding
1.8       daniel    915: xmlParseCharEncoding(const char* name)
1.7       daniel    916: {
                    917:     char upper[500];
                    918:     int i;
                    919: 
                    920:     for (i = 0;i < 499;i++) {
                    921:         upper[i] = toupper(name[i]);
                    922:        if (upper[i] == 0) break;
                    923:     }
                    924:     upper[i] = 0;
                    925: 
                    926:     if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
                    927:     if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
                    928:     if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
                    929: 
                    930:     /*
                    931:      * NOTE: if we were able to parse this, the endianness of UTF16 is
                    932:      *       already found and in use
                    933:      */
                    934:     if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
                    935:     if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
                    936:     
                    937:     if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
                    938:     if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
                    939:     if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
                    940: 
                    941:     /*
                    942:      * NOTE: if we were able to parse this, the endianness of UCS4 is
                    943:      *       already found and in use
                    944:      */
                    945:     if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
                    946:     if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
                    947:     if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
                    948: 
                    949:     
                    950:     if (!strcmp(upper,  "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
                    951:     if (!strcmp(upper,  "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
                    952:     if (!strcmp(upper,  "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
                    953: 
                    954:     if (!strcmp(upper,  "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
                    955:     if (!strcmp(upper,  "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
                    956:     if (!strcmp(upper,  "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
                    957: 
                    958:     if (!strcmp(upper,  "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
                    959:     if (!strcmp(upper,  "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
                    960:     if (!strcmp(upper,  "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
                    961:     if (!strcmp(upper,  "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
                    962:     if (!strcmp(upper,  "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
                    963:     if (!strcmp(upper,  "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
                    964:     if (!strcmp(upper,  "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
                    965: 
                    966:     if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
1.30      daniel    967:     if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
1.7       daniel    968:     if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
1.30      daniel    969: 
                    970: #ifdef DEBUG_ENCODING
                    971:     fprintf(stderr, "Unknown encoding %s\n", name);
                    972: #endif
1.7       daniel    973:     return(XML_CHAR_ENCODING_ERROR);
                    974: }
1.9       daniel    975: 
1.38      daniel    976: /**
                    977:  * xmlGetCharEncodingName:
                    978:  * @enc:  the encoding
                    979:  *
                    980:  * The "canonical" name for XML encoding.
                    981:  * C.f. http://www.w3.org/TR/REC-xml#charencoding
                    982:  * Section 4.3.3  Character Encoding in Entities
                    983:  *
                    984:  * Returns the canonical name for the given encoding
                    985:  */
                    986: 
                    987: const char*
                    988: xmlGetCharEncodingName(xmlCharEncoding enc) {
                    989:     switch (enc) {
                    990:         case XML_CHAR_ENCODING_ERROR:
                    991:            return(NULL);
                    992:         case XML_CHAR_ENCODING_NONE:
                    993:            return(NULL);
                    994:         case XML_CHAR_ENCODING_UTF8:
                    995:            return("UTF-8");
                    996:         case XML_CHAR_ENCODING_UTF16LE:
                    997:            return("UTF-16");
                    998:         case XML_CHAR_ENCODING_UTF16BE:
                    999:            return("UTF-16");
                   1000:         case XML_CHAR_ENCODING_EBCDIC:
                   1001:             return("EBCDIC");
                   1002:         case XML_CHAR_ENCODING_UCS4LE:
                   1003:             return("ISO-10646-UCS-4");
                   1004:         case XML_CHAR_ENCODING_UCS4BE:
                   1005:             return("ISO-10646-UCS-4");
                   1006:         case XML_CHAR_ENCODING_UCS4_2143:
                   1007:             return("ISO-10646-UCS-4");
                   1008:         case XML_CHAR_ENCODING_UCS4_3412:
                   1009:             return("ISO-10646-UCS-4");
                   1010:         case XML_CHAR_ENCODING_UCS2:
                   1011:             return("ISO-10646-UCS-2");
                   1012:         case XML_CHAR_ENCODING_8859_1:
                   1013:            return("ISO-8859-1");
                   1014:         case XML_CHAR_ENCODING_8859_2:
                   1015:            return("ISO-8859-2");
                   1016:         case XML_CHAR_ENCODING_8859_3:
                   1017:            return("ISO-8859-3");
                   1018:         case XML_CHAR_ENCODING_8859_4:
                   1019:            return("ISO-8859-4");
                   1020:         case XML_CHAR_ENCODING_8859_5:
                   1021:            return("ISO-8859-5");
                   1022:         case XML_CHAR_ENCODING_8859_6:
                   1023:            return("ISO-8859-6");
                   1024:         case XML_CHAR_ENCODING_8859_7:
                   1025:            return("ISO-8859-7");
                   1026:         case XML_CHAR_ENCODING_8859_8:
                   1027:            return("ISO-8859-8");
                   1028:         case XML_CHAR_ENCODING_8859_9:
                   1029:            return("ISO-8859-9");
                   1030:         case XML_CHAR_ENCODING_2022_JP:
                   1031:             return("ISO-2022-JP");
                   1032:         case XML_CHAR_ENCODING_SHIFT_JIS:
                   1033:             return("Shift-JIS");
                   1034:         case XML_CHAR_ENCODING_EUC_JP:
                   1035:             return("EUC-JP");
                   1036:     }
                   1037:     return(NULL);
                   1038: }
                   1039: 
1.9       daniel   1040: /****************************************************************
                   1041:  *                                                             *
                   1042:  *             Char encoding handlers                          *
                   1043:  *                                                             *
                   1044:  ****************************************************************/
                   1045: 
/* the size should be growable, but it's not a big deal ... */
#define MAX_ENCODING_HANDLERS 50
/*
 * Table of the registered handlers; allocated with room for
 * MAX_ENCODING_HANDLERS entries by xmlInitCharEncodingHandlers() and
 * released by xmlCleanupCharEncodingHandlers().
 */
static xmlCharEncodingHandlerPtr *handlers = NULL;
/* number of entries of the table currently in use */
static int nbCharEncodingHandler = 0;

/*
 * The default is UTF-8 for XML, that's also the default used for the
 * parser internals, so the default encoding handler is NULL
 */

static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
                   1057: 
                   1058: /**
                   1059:  * xmlNewCharEncodingHandler:
1.18      daniel   1060:  * @name:  the encoding name, in UTF-8 format (ASCII actually)
1.9       daniel   1061:  * @input:  the xmlCharEncodingInputFunc to read that encoding
                   1062:  * @output:  the xmlCharEncodingOutputFunc to write that encoding
                   1063:  *
                   1064:  * Create and registers an xmlCharEncodingHandler.
                   1065:  * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
                   1066:  */
                   1067: xmlCharEncodingHandlerPtr
1.25      daniel   1068: xmlNewCharEncodingHandler(const char *name, 
                   1069:                           xmlCharEncodingInputFunc input,
1.9       daniel   1070:                           xmlCharEncodingOutputFunc output) {
                   1071:     xmlCharEncodingHandlerPtr handler;
                   1072:     char upper[500];
                   1073:     int i;
                   1074:     char *up = 0;
                   1075: 
                   1076:     /*
                   1077:      * Keep only the uppercase version of the encoding.
                   1078:      */
                   1079:     if (name == NULL) {
                   1080:         fprintf(stderr, "xmlNewCharEncodingHandler : no name !\n");
                   1081:        return(NULL);
                   1082:     }
                   1083:     for (i = 0;i < 499;i++) {
                   1084:         upper[i] = toupper(name[i]);
                   1085:        if (upper[i] == 0) break;
                   1086:     }
                   1087:     upper[i] = 0;
1.16      daniel   1088:     up = xmlMemStrdup(upper);
1.9       daniel   1089:     if (up == NULL) {
                   1090:         fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
                   1091:        return(NULL);
                   1092:     }
                   1093: 
                   1094:     /*
                   1095:      * allocate and fill-up an handler block.
                   1096:      */
                   1097:     handler = (xmlCharEncodingHandlerPtr)
1.16      daniel   1098:               xmlMalloc(sizeof(xmlCharEncodingHandler));
1.9       daniel   1099:     if (handler == NULL) {
                   1100:         fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
                   1101:        return(NULL);
                   1102:     }
                   1103:     handler->input = input;
                   1104:     handler->output = output;
                   1105:     handler->name = up;
                   1106: 
1.49    ! veillard 1107:     handler->iconv_in = NULL;
        !          1108:     handler->iconv_out = NULL;
        !          1109: 
1.9       daniel   1110:     /*
                   1111:      * registers and returns the handler.
                   1112:      */
                   1113:     xmlRegisterCharEncodingHandler(handler);
1.30      daniel   1114: #ifdef DEBUG_ENCODING
                   1115:     fprintf(stderr, "Registered encoding handler for %s\n", name);
                   1116: #endif
1.9       daniel   1117:     return(handler);
                   1118: }
                   1119: 
                   1120: /**
                   1121:  * xmlInitCharEncodingHandlers:
                   1122:  *
                   1123:  * Initialize the char encoding support, it registers the default
                   1124:  * encoding supported.
1.18      daniel   1125:  * NOTE: while public, this function usually doesn't need to be called
1.9       daniel   1126:  *       in normal processing.
                   1127:  */
                   1128: void
                   1129: xmlInitCharEncodingHandlers(void) {
1.34      daniel   1130:     unsigned short int tst = 0x1234;
                   1131:     unsigned char *ptr = (unsigned char *) &tst; 
                   1132: 
1.9       daniel   1133:     if (handlers != NULL) return;
                   1134: 
                   1135:     handlers = (xmlCharEncodingHandlerPtr *)
1.16      daniel   1136:         xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
1.34      daniel   1137: 
                   1138:     if (*ptr == 0x12) xmlLittleEndian = 0;
                   1139:     else if (*ptr == 0x34) xmlLittleEndian = 1;
                   1140:     else fprintf(stderr, "Odd problem at endianness detection\n");
1.9       daniel   1141: 
                   1142:     if (handlers == NULL) {
                   1143:         fprintf(stderr, "xmlInitCharEncodingHandlers : out of memory !\n");
                   1144:        return;
                   1145:     }
1.10      daniel   1146:     xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
1.25      daniel   1147:     xmlUTF16LEHandler = 
1.28      daniel   1148:           xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
                   1149:     xmlUTF16BEHandler = 
                   1150:           xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
1.10      daniel   1151:     xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
1.47      veillard 1152:     xmlNewCharEncodingHandler("ASCII", asciiToUTF8, UTF8Toascii);
1.48      veillard 1153: #ifdef LIBXML_HTML_ENABLED
                   1154:     xmlNewCharEncodingHandler("HTML", NULL, UTF8ToHtml);
                   1155: #endif
1.9       daniel   1156: }
                   1157: 
                   1158: /**
1.19      daniel   1159:  * xmlCleanupCharEncodingHandlers:
                   1160:  *
                   1161:  * Cleanup the memory allocated for the char encoding support, it
                   1162:  * unregisters all the encoding handlers.
                   1163:  */
                   1164: void
                   1165: xmlCleanupCharEncodingHandlers(void) {
                   1166:     if (handlers == NULL) return;
                   1167: 
                   1168:     for (;nbCharEncodingHandler > 0;) {
                   1169:         nbCharEncodingHandler--;
                   1170:        if (handlers[nbCharEncodingHandler] != NULL) {
1.31      daniel   1171:            if (handlers[nbCharEncodingHandler]->name != NULL)
                   1172:                xmlFree(handlers[nbCharEncodingHandler]->name);
1.19      daniel   1173:            xmlFree(handlers[nbCharEncodingHandler]);
                   1174:        }
                   1175:     }
                   1176:     xmlFree(handlers);
                   1177:     handlers = NULL;
                   1178:     nbCharEncodingHandler = 0;
                   1179:     xmlDefaultCharEncodingHandler = NULL;
                   1180: }
                   1181: 
                   1182: /**
1.9       daniel   1183:  * xmlRegisterCharEncodingHandler:
                   1184:  * @handler:  the xmlCharEncodingHandlerPtr handler block
                   1185:  *
                   1186:  * Register the char encoding handler, surprizing, isn't it ?
                   1187:  */
                   1188: void
                   1189: xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
                   1190:     if (handlers == NULL) xmlInitCharEncodingHandlers();
                   1191:     if (handler == NULL) {
                   1192:         fprintf(stderr, "xmlRegisterCharEncodingHandler: NULL handler !\n");
                   1193:        return;
                   1194:     }
                   1195: 
                   1196:     if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
                   1197:         fprintf(stderr, 
                   1198:        "xmlRegisterCharEncodingHandler: Too many handler registered\n");
                   1199:         fprintf(stderr, "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
                   1200:        return;
                   1201:     }
                   1202:     handlers[nbCharEncodingHandler++] = handler;
                   1203: }
                   1204: 
                   1205: /**
                   1206:  * xmlGetCharEncodingHandler:
                   1207:  * @enc:  an xmlCharEncoding value.
                   1208:  *
                   1209:  * Search in the registrered set the handler able to read/write that encoding.
                   1210:  *
                   1211:  * Returns the handler or NULL if not found
                   1212:  */
                   1213: xmlCharEncodingHandlerPtr
                   1214: xmlGetCharEncodingHandler(xmlCharEncoding enc) {
1.30      daniel   1215:     xmlCharEncodingHandlerPtr handler;
                   1216: 
1.9       daniel   1217:     if (handlers == NULL) xmlInitCharEncodingHandlers();
1.25      daniel   1218:     switch (enc) {
                   1219:         case XML_CHAR_ENCODING_ERROR:
                   1220:            return(NULL);
                   1221:         case XML_CHAR_ENCODING_NONE:
                   1222:            return(NULL);
                   1223:         case XML_CHAR_ENCODING_UTF8:
                   1224:            return(NULL);
                   1225:         case XML_CHAR_ENCODING_UTF16LE:
                   1226:            return(xmlUTF16LEHandler);
                   1227:         case XML_CHAR_ENCODING_UTF16BE:
                   1228:            return(xmlUTF16BEHandler);
                   1229:         case XML_CHAR_ENCODING_EBCDIC:
1.30      daniel   1230:             handler = xmlFindCharEncodingHandler("EBCDIC");
                   1231:             if (handler != NULL) return(handler);
                   1232:             handler = xmlFindCharEncodingHandler("ebcdic");
                   1233:             if (handler != NULL) return(handler);
                   1234:            break;
1.38      daniel   1235:         case XML_CHAR_ENCODING_UCS4BE:
1.30      daniel   1236:             handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
                   1237:             if (handler != NULL) return(handler);
                   1238:             handler = xmlFindCharEncodingHandler("UCS-4");
                   1239:             if (handler != NULL) return(handler);
                   1240:             handler = xmlFindCharEncodingHandler("UCS4");
                   1241:             if (handler != NULL) return(handler);
                   1242:            break;
1.38      daniel   1243:         case XML_CHAR_ENCODING_UCS4LE:
                   1244:             handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
                   1245:             if (handler != NULL) return(handler);
                   1246:             handler = xmlFindCharEncodingHandler("UCS-4");
                   1247:             if (handler != NULL) return(handler);
                   1248:             handler = xmlFindCharEncodingHandler("UCS4");
1.30      daniel   1249:             if (handler != NULL) return(handler);
                   1250:            break;
1.25      daniel   1251:         case XML_CHAR_ENCODING_UCS4_2143:
1.30      daniel   1252:            break;
1.25      daniel   1253:         case XML_CHAR_ENCODING_UCS4_3412:
1.30      daniel   1254:            break;
1.25      daniel   1255:         case XML_CHAR_ENCODING_UCS2:
1.30      daniel   1256:             handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2");
                   1257:             if (handler != NULL) return(handler);
                   1258:             handler = xmlFindCharEncodingHandler("UCS-2");
                   1259:             if (handler != NULL) return(handler);
                   1260:             handler = xmlFindCharEncodingHandler("UCS2");
                   1261:             if (handler != NULL) return(handler);
                   1262:            break;
1.42      veillard 1263: 
                   1264:            /*
                   1265:             * We used to keep ISO Latin encodings native in the
                   1266:             * generated data. This led to so many problems that
                   1267:             * this has been removed. One can still change this
                   1268:             * back by registering no-ops encoders for those
                   1269:             */
1.25      daniel   1270:         case XML_CHAR_ENCODING_8859_1:
1.42      veillard 1271:            handler = xmlFindCharEncodingHandler("ISO-8859-1");
                   1272:            if (handler != NULL) return(handler);
                   1273:            break;
1.25      daniel   1274:         case XML_CHAR_ENCODING_8859_2:
1.42      veillard 1275:            handler = xmlFindCharEncodingHandler("ISO-8859-2");
                   1276:            if (handler != NULL) return(handler);
                   1277:            break;
1.25      daniel   1278:         case XML_CHAR_ENCODING_8859_3:
1.42      veillard 1279:            handler = xmlFindCharEncodingHandler("ISO-8859-3");
                   1280:            if (handler != NULL) return(handler);
                   1281:            break;
1.25      daniel   1282:         case XML_CHAR_ENCODING_8859_4:
1.42      veillard 1283:            handler = xmlFindCharEncodingHandler("ISO-8859-4");
                   1284:            if (handler != NULL) return(handler);
                   1285:            break;
1.25      daniel   1286:         case XML_CHAR_ENCODING_8859_5:
1.42      veillard 1287:            handler = xmlFindCharEncodingHandler("ISO-8859-5");
                   1288:            if (handler != NULL) return(handler);
                   1289:            break;
1.25      daniel   1290:         case XML_CHAR_ENCODING_8859_6:
1.42      veillard 1291:            handler = xmlFindCharEncodingHandler("ISO-8859-6");
                   1292:            if (handler != NULL) return(handler);
                   1293:            break;
1.25      daniel   1294:         case XML_CHAR_ENCODING_8859_7:
1.42      veillard 1295:            handler = xmlFindCharEncodingHandler("ISO-8859-7");
                   1296:            if (handler != NULL) return(handler);
                   1297:            break;
1.25      daniel   1298:         case XML_CHAR_ENCODING_8859_8:
1.42      veillard 1299:            handler = xmlFindCharEncodingHandler("ISO-8859-8");
                   1300:            if (handler != NULL) return(handler);
                   1301:            break;
1.25      daniel   1302:         case XML_CHAR_ENCODING_8859_9:
1.42      veillard 1303:            handler = xmlFindCharEncodingHandler("ISO-8859-9");
                   1304:            if (handler != NULL) return(handler);
                   1305:            break;
                   1306: 
                   1307: 
1.25      daniel   1308:         case XML_CHAR_ENCODING_2022_JP:
1.30      daniel   1309:             handler = xmlFindCharEncodingHandler("ISO-2022-JP");
                   1310:             if (handler != NULL) return(handler);
                   1311:            break;
1.25      daniel   1312:         case XML_CHAR_ENCODING_SHIFT_JIS:
1.30      daniel   1313:             handler = xmlFindCharEncodingHandler("SHIFT-JIS");
                   1314:             if (handler != NULL) return(handler);
                   1315:             handler = xmlFindCharEncodingHandler("SHIFT_JIS");
                   1316:             if (handler != NULL) return(handler);
                   1317:             handler = xmlFindCharEncodingHandler("Shift_JIS");
                   1318:             if (handler != NULL) return(handler);
                   1319:            break;
1.25      daniel   1320:         case XML_CHAR_ENCODING_EUC_JP:
1.30      daniel   1321:             handler = xmlFindCharEncodingHandler("EUC-JP");
                   1322:             if (handler != NULL) return(handler);
                   1323:            break;
                   1324:        default: 
                   1325:            break;
1.25      daniel   1326:     }
1.30      daniel   1327:     
                   1328: #ifdef DEBUG_ENCODING
                   1329:     fprintf(stderr, "No handler found for encoding %d\n", enc);
                   1330: #endif
1.9       daniel   1331:     return(NULL);
                   1332: }
                   1333: 
                   1334: /**
                   1335:  * xmlGetCharEncodingHandler:
                   1336:  * @enc:  a string describing the char encoding.
                   1337:  *
                   1338:  * Search in the registrered set the handler able to read/write that encoding.
                   1339:  *
                   1340:  * Returns the handler or NULL if not found
                   1341:  */
                   1342: xmlCharEncodingHandlerPtr
                   1343: xmlFindCharEncodingHandler(const char *name) {
1.36      daniel   1344:     xmlCharEncoding alias;
1.30      daniel   1345: #ifdef LIBXML_ICONV_ENABLED
1.40      daniel   1346:     xmlCharEncodingHandlerPtr enc;
1.30      daniel   1347:     iconv_t icv_in, icv_out;
                   1348: #endif /* LIBXML_ICONV_ENABLED */
                   1349:     char upper[100];
1.9       daniel   1350:     int i;
                   1351: 
                   1352:     if (handlers == NULL) xmlInitCharEncodingHandlers();
                   1353:     if (name == NULL) return(xmlDefaultCharEncodingHandler);
                   1354:     if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
                   1355: 
1.36      daniel   1356:     /*
                   1357:      * Check first for directly registered encoding names
                   1358:      */
1.30      daniel   1359:     for (i = 0;i < 99;i++) {
1.9       daniel   1360:         upper[i] = toupper(name[i]);
                   1361:        if (upper[i] == 0) break;
                   1362:     }
                   1363:     upper[i] = 0;
                   1364: 
                   1365:     for (i = 0;i < nbCharEncodingHandler; i++)
1.30      daniel   1366:         if (!strcmp(upper, handlers[i]->name)) {
                   1367: #ifdef DEBUG_ENCODING
                   1368:             fprintf(stderr, "Found registered handler for encoding %s\n", name);
                   1369: #endif
1.9       daniel   1370:            return(handlers[i]);
1.30      daniel   1371:        }
1.9       daniel   1372: 
1.30      daniel   1373: #ifdef LIBXML_ICONV_ENABLED
                   1374:     /* check whether iconv can handle this */
1.31      daniel   1375:     icv_in = iconv_open("UTF-8", name);
                   1376:     icv_out = iconv_open(name, "UTF-8");
1.30      daniel   1377:     if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
1.43      veillard 1378:            enc = (xmlCharEncodingHandlerPtr)
                   1379:                  xmlMalloc(sizeof(xmlCharEncodingHandler));
1.32      daniel   1380:            if (enc == NULL) {
                   1381:                iconv_close(icv_in);
                   1382:                iconv_close(icv_out);
                   1383:                return(NULL);
                   1384:            }
1.41      daniel   1385:            enc->name = xmlMemStrdup(name);
1.30      daniel   1386:            enc->input = NULL;
                   1387:            enc->output = NULL;
                   1388:            enc->iconv_in = icv_in;
                   1389:            enc->iconv_out = icv_out;
                   1390: #ifdef DEBUG_ENCODING
                   1391:             fprintf(stderr, "Found iconv handler for encoding %s\n", name);
                   1392: #endif
                   1393:            return enc;
                   1394:     } else if ((icv_in != (iconv_t) -1) || icv_out != (iconv_t) -1) {
                   1395:            fprintf(stderr, "iconv : problems with filters for '%s'\n", name);
                   1396:     }
                   1397: #endif /* LIBXML_ICONV_ENABLED */
1.38      daniel   1398: 
1.30      daniel   1399: #ifdef DEBUG_ENCODING
                   1400:     fprintf(stderr, "No handler found for encoding %s\n", name);
                   1401: #endif
1.38      daniel   1402: 
                   1403:     /*
                   1404:      * Fallback using the canonical names
                   1405:      */
                   1406:     alias = xmlParseCharEncoding(name);
                   1407:     if (alias != XML_CHAR_ENCODING_ERROR) {
                   1408:         const char* canon;
                   1409:         canon = xmlGetCharEncodingName(alias);
                   1410:         if ((canon != NULL) && (strcmp(name, canon))) {
                   1411:            return(xmlFindCharEncodingHandler(canon));
                   1412:         }
                   1413:     }
                   1414: 
1.9       daniel   1415:     return(NULL);
1.30      daniel   1416: }
                   1417: 
                   1418: #ifdef LIBXML_ICONV_ENABLED
                   1419: /**
                   1420:  * xmlIconvWrapper:
                   1421:  * @cd:                iconv converter data structure
                   1422:  * @out:  a pointer to an array of bytes to store the result
                   1423:  * @outlen:  the length of @out
                   1424:  * @in:  a pointer to an array of ISO Latin 1 chars
                   1425:  * @inlen:  the length of @in
                   1426:  *
                   1427:  * Returns 0 if success, or 
                   1428:  *     -1 by lack of space, or
                   1429:  *     -2 if the transcoding fails (for *in is not valid utf8 string or
                   1430:  *        the result of transformation can't fit into the encoding we want), or
                   1431:  *     -3 if there the last byte can't form a single output char.
                   1432:  *     
                   1433:  * The value of @inlen after return is the number of octets consumed
                   1434:  *     as the return value is positive, else unpredictiable.
                   1435:  * The value of @outlen after return is the number of ocetes consumed.
                   1436:  */
                   1437: static int
                   1438: xmlIconvWrapper(iconv_t cd,
                   1439:        unsigned char *out, int *outlen,
                   1440:        const unsigned char *in, int *inlen) {
                   1441: 
                   1442:        size_t icv_inlen = *inlen, icv_outlen = *outlen;
                   1443:        const char *icv_in = (const char *) in;
                   1444:        char *icv_out = (char *) out;
                   1445:        int ret;
                   1446: 
                   1447:        ret = iconv(cd,
                   1448:                &icv_in, &icv_inlen,
                   1449:                &icv_out, &icv_outlen);
1.35      daniel   1450:        if (in != NULL) {
                   1451:            *inlen -= icv_inlen;
                   1452:            *outlen -= icv_outlen;
                   1453:        } else {
                   1454:            *inlen = 0;
                   1455:            *outlen = 0;
                   1456:        }
1.30      daniel   1457:        if (icv_inlen != 0 || ret == (size_t) -1) {
                   1458: #ifdef EILSEQ
                   1459:                if (errno == EILSEQ) {
1.31      daniel   1460:                        return -2;
1.30      daniel   1461:                } else
                   1462: #endif
                   1463: #ifdef E2BIG
                   1464:                if (errno == E2BIG) {
                   1465:                        return -1;
                   1466:                } else
                   1467: #endif
                   1468: #ifdef EINVAL
                   1469:                if (errno == EINVAL) {
1.31      daniel   1470:                        return -3;
1.30      daniel   1471:                }
                   1472: #endif
                   1473:                else {
                   1474:                        return -3;
                   1475:                }
                   1476:        }
                   1477:        return 0;
                   1478: }
                   1479: #endif /* LIBXML_ICONV_ENABLED */
1.38      daniel   1480: 
                   1481: /**
                   1482:  * xmlCharEncFirstLine:
                   1483:  * @handler:   char enconding transformation data structure
                   1484:  * @out:  an xmlBuffer for the output.
                   1485:  * @in:  an xmlBuffer for the input
                   1486:  *     
                   1487:  * Front-end for the encoding handler input function, but handle only
                   1488:  * the very first line, i.e. limit itself to 45 chars.
                   1489:  *     
                   1490:  * Returns the number of byte written if success, or 
                   1491:  *     -1 general error
                   1492:  *     -2 if the transcoding fails (for *in is not valid utf8 string or
                   1493:  *        the result of transformation can't fit into the encoding we want), or
                   1494:  */
                   1495: int
                   1496: xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
                   1497:                  xmlBufferPtr in) {
                   1498:     int ret = -2;
                   1499:     int written;
                   1500:     int toconv;
                   1501: 
                   1502:     if (handler == NULL) return(-1);
                   1503:     if (out == NULL) return(-1);
                   1504:     if (in == NULL) return(-1);
                   1505: 
                   1506:     written = out->size - out->use;
                   1507:     toconv = in->use;
                   1508:     if (toconv * 2 >= written) {
1.39      daniel   1509:         xmlBufferGrow(out, toconv);
1.38      daniel   1510:        written = out->size - out->use - 1;
                   1511:     }
1.39      daniel   1512: 
1.38      daniel   1513:     /*
                   1514:      * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38
                   1515:      * 45 chars should be sufficient to reach the end of the encoding
                   1516:      * decalration without going too far inside the document content.
                   1517:      */
                   1518:     written = 45;
                   1519: 
                   1520:     if (handler->input != NULL) {
                   1521:        ret = handler->input(&out->content[out->use], &written,
                   1522:                             in->content, &toconv);
                   1523:        xmlBufferShrink(in, toconv);
                   1524:        out->use += written;
                   1525:        out->content[out->use] = 0;
                   1526:     }
                   1527: #ifdef LIBXML_ICONV_ENABLED
                   1528:     else if (handler->iconv_in != NULL) {
                   1529:        ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
                   1530:                              &written, in->content, &toconv);
                   1531:        xmlBufferShrink(in, toconv);
                   1532:        out->use += written;
                   1533:        out->content[out->use] = 0;
                   1534:        if (ret == -1) ret = -3;
                   1535:     }
                   1536: #endif /* LIBXML_ICONV_ENABLED */
                   1537: #ifdef DEBUG_ENCODING
                   1538:     switch (ret) {
                   1539:         case 0:
                   1540:            fprintf(stderr, "converted %d bytes to %d bytes of input\n",
                   1541:                    toconv, written);
                   1542:            break;
                   1543:         case -1:
                   1544:            fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
                   1545:                    toconv, written, in->use);
                   1546:            break;
                   1547:         case -2:
                   1548:            fprintf(stderr, "input conversion failed due to input error\n");
                   1549:            break;
                   1550:         case -3:
                   1551:            fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
                   1552:                    toconv, written, in->use);
                   1553:            break;
                   1554:        default:
                   1555:            fprintf(stderr,"Unknown input conversion failed %d\n", ret);
                   1556:     }
                   1557: #endif
                   1558:     /*
                   1559:      * Ignore when input buffer is not on a boundary
                   1560:      */
                   1561:     if (ret == -3) ret = 0;
                   1562:     if (ret == -1) ret = 0;
                   1563:     return(ret);
                   1564: }
1.30      daniel   1565: 
                   1566: /**
                   1567:  * xmlCharEncInFunc:
                   1568:  * @handler:   char enconding transformation data structure
1.31      daniel   1569:  * @out:  an xmlBuffer for the output.
                   1570:  * @in:  an xmlBuffer for the input
1.30      daniel   1571:  *     
                   1572:  * Generic front-end for the encoding handler input function
                   1573:  *     
1.31      daniel   1574:  * Returns the number of byte written if success, or 
                   1575:  *     -1 general error
1.30      daniel   1576:  *     -2 if the transcoding fails (for *in is not valid utf8 string or
                   1577:  *        the result of transformation can't fit into the encoding we want), or
                   1578:  */
                   1579: int
1.31      daniel   1580: xmlCharEncInFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
                   1581:                  xmlBufferPtr in) {
1.30      daniel   1582:     int ret = -2;
1.31      daniel   1583:     int written;
                   1584:     int toconv;
1.30      daniel   1585: 
1.31      daniel   1586:     if (handler == NULL) return(-1);
                   1587:     if (out == NULL) return(-1);
                   1588:     if (in == NULL) return(-1);
                   1589: 
                   1590:     written = out->size - out->use;
                   1591:     toconv = in->use;
                   1592:     if (toconv * 2 >= written) {
                   1593:         xmlBufferGrow(out, toconv * 2);
1.33      daniel   1594:        written = out->size - out->use - 1;
1.31      daniel   1595:     }
1.30      daniel   1596:     if (handler->input != NULL) {
1.32      daniel   1597:        ret = handler->input(&out->content[out->use], &written,
1.31      daniel   1598:                             in->content, &toconv);
                   1599:        xmlBufferShrink(in, toconv);
                   1600:        out->use += written;
1.33      daniel   1601:        out->content[out->use] = 0;
1.30      daniel   1602:     }
                   1603: #ifdef LIBXML_ICONV_ENABLED
1.31      daniel   1604:     else if (handler->iconv_in != NULL) {
                   1605:        ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
                   1606:                              &written, in->content, &toconv);
                   1607:        xmlBufferShrink(in, toconv);
                   1608:        out->use += written;
1.33      daniel   1609:        out->content[out->use] = 0;
                   1610:        if (ret == -1) ret = -3;
1.30      daniel   1611:     }
                   1612: #endif /* LIBXML_ICONV_ENABLED */
1.39      daniel   1613:     switch (ret) {
1.30      daniel   1614: #ifdef DEBUG_ENCODING
                   1615:         case 0:
                   1616:            fprintf(stderr, "converted %d bytes to %d bytes of input\n",
1.31      daniel   1617:                    toconv, written);
1.30      daniel   1618:            break;
                   1619:         case -1:
1.31      daniel   1620:            fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
                   1621:                    toconv, written, in->use);
1.30      daniel   1622:            break;
                   1623:         case -3:
1.31      daniel   1624:            fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
                   1625:                    toconv, written, in->use);
1.30      daniel   1626:            break;
1.39      daniel   1627: #endif
                   1628:         case -2:
                   1629:            fprintf(stderr, "input conversion failed due to input error\n");
                   1630:            fprintf(stderr, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
                   1631:                    in->content[0], in->content[1],
                   1632:                    in->content[2], in->content[3]);
1.30      daniel   1633:     }
1.33      daniel   1634:     /*
                   1635:      * Ignore when input buffer is not on a boundary
                   1636:      */
                   1637:     if (ret == -3) ret = 0;
1.30      daniel   1638:     return(ret);
                   1639: }
                   1640: 
                   1641: /**
                   1642:  * xmlCharEncOutFunc:
                   1643:  * @handler:   char enconding transformation data structure
1.31      daniel   1644:  * @out:  an xmlBuffer for the output.
                   1645:  * @in:  an xmlBuffer for the input
                   1646:  *     
                   1647:  * Generic front-end for the encoding handler output function
1.35      daniel   1648:  * a first call with @in == NULL has to be made firs to initiate the 
                   1649:  * output in case of non-stateless encoding needing to initiate their
                   1650:  * state or the output (like the BOM in UTF16).
1.39      daniel   1651:  * In case of UTF8 sequence conversion errors for the given encoder,
                   1652:  * the content will be automatically remapped to a CharRef sequence.
1.30      daniel   1653:  *     
1.31      daniel   1654:  * Returns the number of byte written if success, or 
                   1655:  *     -1 general error
1.30      daniel   1656:  *     -2 if the transcoding fails (for *in is not valid utf8 string or
                   1657:  *        the result of transformation can't fit into the encoding we want), or
                   1658:  */
                   1659: int
1.31      daniel   1660: xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
                   1661:                   xmlBufferPtr in) {
1.30      daniel   1662:     int ret = -2;
1.31      daniel   1663:     int written;
                   1664:     int toconv;
1.39      daniel   1665:     int output = 0;
1.31      daniel   1666: 
                   1667:     if (handler == NULL) return(-1);
                   1668:     if (out == NULL) return(-1);
1.39      daniel   1669: 
                   1670: retry:
                   1671:     
1.35      daniel   1672:     written = out->size - out->use;
                   1673: 
1.39      daniel   1674:     /*
                   1675:      * First specific handling of in = NULL, i.e. the initialization call
                   1676:      */
1.35      daniel   1677:     if (in == NULL) {
                   1678:         toconv = 0;
                   1679:        if (handler->output != NULL) {
                   1680:            ret = handler->output(&out->content[out->use], &written,
                   1681:                                  NULL, &toconv);
                   1682:            out->use += written;
                   1683:            out->content[out->use] = 0;
                   1684:        }
                   1685: #ifdef LIBXML_ICONV_ENABLED
                   1686:        else if (handler->iconv_out != NULL) {
                   1687:            ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
                   1688:                                  &written, NULL, &toconv);
                   1689:            out->use += written;
                   1690:            out->content[out->use] = 0;
                   1691:        }
                   1692: #endif /* LIBXML_ICONV_ENABLED */
                   1693: #ifdef DEBUG_ENCODING
                   1694:        fprintf(stderr, "initialized encoder\n");
                   1695: #endif
                   1696:         return(0);
                   1697:     }
1.30      daniel   1698: 
1.39      daniel   1699:     /*
                   1700:      * Convertion itself.
                   1701:      */
1.33      daniel   1702:     toconv = in->use;
                   1703:     if (toconv * 2 >= written) {
                   1704:         xmlBufferGrow(out, toconv * 2);
                   1705:        written = out->size - out->use - 1;
                   1706:     }
1.30      daniel   1707:     if (handler->output != NULL) {
1.33      daniel   1708:        ret = handler->output(&out->content[out->use], &written,
1.35      daniel   1709:                              in->content, &toconv);
1.31      daniel   1710:        xmlBufferShrink(in, toconv);
                   1711:        out->use += written;
1.33      daniel   1712:        out->content[out->use] = 0;
1.30      daniel   1713:     }
                   1714: #ifdef LIBXML_ICONV_ENABLED
                   1715:     else if (handler->iconv_out != NULL) {
1.31      daniel   1716:        ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
                   1717:                              &written, in->content, &toconv);
                   1718:        xmlBufferShrink(in, toconv);
                   1719:        out->use += written;
1.33      daniel   1720:        out->content[out->use] = 0;
                   1721:        if (ret == -1) ret = -3;
1.30      daniel   1722:     }
                   1723: #endif /* LIBXML_ICONV_ENABLED */
1.46      veillard 1724:     else {
                   1725:        fprintf(stderr, "xmlCharEncOutFunc: no output function !\n");
                   1726:        return(-1);
                   1727:     }
1.39      daniel   1728: 
                   1729:     if (ret >= 0) output += ret;
                   1730: 
                   1731:     /*
                   1732:      * Attempt to handle error cases
                   1733:      */
                   1734:     switch (ret) {
1.30      daniel   1735: #ifdef DEBUG_ENCODING
                   1736:         case 0:
                   1737:            fprintf(stderr, "converted %d bytes to %d bytes of output\n",
1.31      daniel   1738:                    toconv, written);
1.30      daniel   1739:            break;
                   1740:         case -1:
                   1741:            fprintf(stderr, "output conversion failed by lack of space\n");
                   1742:            break;
                   1743:         case -3:
1.31      daniel   1744:            fprintf(stderr,"converted %d bytes to %d bytes of output %d left\n",
                   1745:                    toconv, written, in->use);
1.30      daniel   1746:            break;
1.39      daniel   1747: #endif
                   1748:         case -2: {
                   1749:            int len = in->use;
1.43      veillard 1750:            const xmlChar *utf = (const xmlChar *) in->content;
1.39      daniel   1751:            int cur;
                   1752: 
                   1753:            cur = xmlGetUTF8Char(utf, &len);
                   1754:            if (cur > 0) {
                   1755:                xmlChar charref[20];
                   1756: 
                   1757: #ifdef DEBUG_ENCODING
                   1758:                fprintf(stderr, "handling output conversion error\n");
                   1759:                fprintf(stderr, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
                   1760:                        in->content[0], in->content[1],
                   1761:                        in->content[2], in->content[3]);
                   1762: #endif
                   1763:                /*
                   1764:                 * Removes the UTF8 sequence, and replace it by a charref
                   1765:                 * and continue the transcoding phase, hoping the error
                   1766:                 * did not mangle the encoder state.
                   1767:                 */
1.43      veillard 1768:                sprintf((char *) charref, "&#x%X;", cur);
1.39      daniel   1769:                xmlBufferShrink(in, len);
                   1770:                xmlBufferAddHead(in, charref, -1);
                   1771: 
                   1772:                goto retry;
                   1773:            } else {
                   1774:                fprintf(stderr, "output conversion failed due to conv error\n");
                   1775:                fprintf(stderr, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
                   1776:                        in->content[0], in->content[1],
                   1777:                        in->content[2], in->content[3]);
                   1778:            }
                   1779:            break;
                   1780:        }
1.30      daniel   1781:     }
                   1782:     return(ret);
                   1783: }
                   1784: 
                   1785: /**
                   1786:  * xmlCharEncCloseFunc:
                   1787:  * @handler:   char encoding transformation data structure
                   1788:  *
                   1789:  * Generic front-end for the encoding handler close function.
                          *
                          * When compiled with LIBXML_ICONV_ENABLED and @handler carries at least
                          * one iconv descriptor, the descriptors are closed, the handler name is
                          * freed and the handler structure itself is freed, so @handler must not
                          * be used after the call. Otherwise @handler is left untouched.
                   1790:  *
                   1791:  * Returns 0 if success, or -1 in case of error (NULL handler, handler
                          * without a name, or a failing iconv_close())
                   1792:  */
                   1793: int
                   1794: xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
                   1795:     int ret = 0;
1.31      daniel   1796:     if (handler == NULL) return(-1); /* nothing to close */
                   1797:     if (handler->name == NULL) return(-1); /* unnamed handlers are rejected */
1.30      daniel   1798: #ifdef LIBXML_ICONV_ENABLED
1.31      daniel   1799:     /*
                   1800:      * Iconv handlers can be used only once: free the whole block
                   1801:      * and the associated iconv resources.
                   1802:      */
1.32      daniel   1803:     if ((handler->iconv_out != NULL) || (handler->iconv_in != NULL)) {
                   1804:        if (handler->name != NULL) /* always true here, NULL names returned above */
                   1805:            xmlFree(handler->name);
                   1806:        handler->name = NULL;
                   1807:        if (handler->iconv_out != NULL) {
                   1808:            if (iconv_close(handler->iconv_out)) /* non-zero is reported as failure */
                   1809:                ret = -1;
                   1810:            handler->iconv_out = NULL;
                   1811:        }
                   1812:        if (handler->iconv_in != NULL) {
                   1813:            if (iconv_close(handler->iconv_in)) /* non-zero is reported as failure */
                   1814:                ret = -1;
                   1815:            handler->iconv_in = NULL;
                   1816:        }
                   1817:        xmlFree(handler); /* the caller's pointer is dangling from here on */
1.30      daniel   1818:     }
                   1819: #endif /* LIBXML_ICONV_ENABLED */
                   1820: #ifdef DEBUG_ENCODING
                   1821:     if (ret)
                   1822:         fprintf(stderr, "failed to close the encoding handler\n");
                   1823:     else
                   1824:         fprintf(stderr, "closed the encoding handler\n");
                   1825: 
                   1826: #endif
                   1827:     return(ret); /* still 0 when no iconv descriptor was attached */
1.9       daniel   1828: }
                   1829: 

Webmaster