Annotation of XML/encoding.c, revision 1.37

1.1       daniel      1: /*
                      2:  * encoding.c : implements the encoding conversion functions needed for XML
                      3:  *
                      4:  * Related specs: 
                      5:  * rfc2044        (UTF-8 and UTF-16) F. Yergeau Alis Technologies
                      6:  * [ISO-10646]    UTF-8 and UTF-16 in Annexes
                      7:  * [ISO-8859-1]   ISO Latin-1 characters codes.
                      8:  * [UNICODE]      The Unicode Consortium, "The Unicode Standard --
                      9:  *                Worldwide Character Encoding -- Version 1.0", Addison-
                     10:  *                Wesley, Volume 1, 1991, Volume 2, 1992.  UTF-8 is
                     11:  *                described in Unicode Technical Report #4.
                     12:  * [US-ASCII]     Coded Character Set--7-bit American Standard Code for
                     13:  *                Information Interchange, ANSI X3.4-1986.
                     14:  *
1.9       daniel     15:  * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
1.1       daniel     16:  *
                     17:  * See Copyright for the status of this software.
                     18:  *
                     19:  * Daniel.Veillard@w3.org
                     20:  */
                     21: 
1.21      daniel     22: #ifdef WIN32
                     23: #include "win32config.h"
                     24: #else
1.14      daniel     25: #include "config.h"
1.17      daniel     26: #endif
                     27: 
                     28: #include <stdio.h>
                     29: #include <string.h>
                     30: 
                     31: #ifdef HAVE_CTYPE_H
1.7       daniel     32: #include <ctype.h>
1.17      daniel     33: #endif
1.20      daniel     34: #ifdef HAVE_STDLIB_H
                     35: #include <stdlib.h>
                     36: #endif
1.30      daniel     37: #include <libxml/xmlversion.h>
                     38: #ifdef LIBXML_ICONV_ENABLED
                     39: #ifdef HAVE_ERRNO_H
                     40: #include <errno.h>
                     41: #endif
                     42: #endif
1.29      daniel     43: #include <libxml/encoding.h>
                     44: #include <libxml/xmlmemory.h>
1.3       daniel     45: 
1.25      daniel     46: xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL; /* UTF-16LE handler pointer; starts out NULL, no assignment is visible in this part of the file */
                     47: xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL; /* UTF-16BE handler pointer; starts out NULL, no assignment is visible in this part of the file */
                     48: 
1.30      daniel     49: #ifdef LIBXML_ICONV_ENABLED
1.37    ! daniel     50: #if 0
1.30      daniel     51: #define DEBUG_ENCODING  /* Define this to get encoding traces on stderr; currently compiled out by the enclosing "#if 0" */
                     52: #endif
1.33      daniel     53: #endif
1.30      daniel     54: 
1.34      daniel     55: static int xmlLittleEndian = 1; /* host byte-order flag: non-zero means 16-bit words are assumed to be stored little-endian; defaults to 1 (NOTE(review): presumably corrected at init time elsewhere -- confirm) */
                     56: 
1.3       daniel     57: /*
                     58:  * From rfc2044: encoding of the Unicode values on UTF-8:
                     59:  *
                     60:  * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
                     61:  * 0000 0000-0000 007F   0xxxxxxx
                     62:  * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
                     63:  * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx 
                     64:  *
                     65:  * I hope we won't use values > 0xFFFF anytime soon !
                     66:  */
1.1       daniel     67: 
                     68: /**
1.22      daniel     69:  * xmlCheckUTF8: Check utf-8 string for legality.
                     70:  * @utf: Pointer to putative utf-8 encoded string.
                     71:  *
                     72:  * Checks @utf for being valid utf-8. @utf is assumed to be
                     73:  * null-terminated. This function is not super-strict, as it will
                     74:  * allow longer utf-8 sequences than necessary. Note that Java is
                     75:  * capable of producing these sequences if provoked. Also note, this
                     76:  * routine checks for the 4-byte maxiumum size, but does not check for
                     77:  * 0x10ffff maximum value.
                     78:  *
                     79:  * Return value: true if @utf is valid.
                     80:  **/
                     81: int
                     82: xmlCheckUTF8(const unsigned char *utf)
                     83: {
                     84:     int ix;
                     85:     unsigned char c;
                     86: 
                     87:     for (ix = 0; (c = utf[ix]);) {
                     88:         if (c & 0x80) {
                     89:            if ((utf[ix + 1] & 0xc0) != 0x80)
                     90:                return(0);
                     91:            if ((c & 0xe0) == 0xe0) {
                     92:                if ((utf[ix + 2] & 0xc0) != 0x80)
                     93:                    return(0);
                     94:                if ((c & 0xf0) == 0xf0) {
                     95:                    if ((c & 0xf8) != 0xf0 || (utf[ix + 3] & 0xc0) != 0x80)
                     96:                        return(0);
                     97:                    ix += 4;
                     98:                    /* 4-byte code */
                     99:                } else
                    100:                  /* 3-byte code */
                    101:                    ix += 3;
                    102:            } else
                    103:              /* 2-byte code */
                    104:                ix += 2;
                    105:        } else
                    106:            /* 1-byte code */
                    107:            ix++;
                    108:       }
                    109:       return(1);
                    110: }
                    111: 
                    112: /**
1.1       daniel    113:  * isolat1ToUTF8:
1.18      daniel    114:  * @out:  a pointer to an array of bytes to store the result
                    115:  * @outlen:  the length of @out
                    116:  * @in:  a pointer to an array of ISO Latin 1 chars
                    117:  * @inlen:  the length of @in
1.1       daniel    118:  *
                    119:  * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
                    120:  * block of chars out.
1.33      daniel    121:  * Returns 0 if success, or -1 otherwise
                    122:  * The value of @inlen after return is the number of octets consumed
                    123:  *     as the return value is positive, else unpredictiable.
                    124:  * The value of @outlen after return is the number of ocetes consumed.
1.1       daniel    125:  */
                    126: int
1.33      daniel    127: isolat1ToUTF8(unsigned char* out, int *outlen,
1.25      daniel    128:               const unsigned char* in, int *inlen) {
1.33      daniel    129:     unsigned char* outstart = out;
                    130:     const unsigned char* processed = in;
                    131:     unsigned char* outend = out + *outlen;
                    132:     const unsigned char* inend = in + *inlen;
1.1       daniel    133:     unsigned char c;
                    134: 
                    135:     while (in < inend) {
                    136:         c= *in++;
                    137:         if (c < 0x80) {
1.33      daniel    138:             if (out >= outend)
                    139:                break;
1.1       daniel    140:             *out++ = c;
                    141:         }
                    142:         else {
1.33      daniel    143:             if (out + 1 >= outend)  break;
1.1       daniel    144:             *out++ = 0xC0 | (c >> 6);
                    145:             *out++ = 0x80 | (0x3F & c);
                    146:         }
1.33      daniel    147:        processed = in;
1.1       daniel    148:     }
1.33      daniel    149:     *outlen = out - outstart;
                    150:     *inlen = processed - in;
                    151: 
                    152:     return(0);
1.1       daniel    153: }
                    154: 
                    155: /**
                    156:  * UTF8Toisolat1:
1.18      daniel    157:  * @out:  a pointer to an array of bytes to store the result
                    158:  * @outlen:  the length of @out
                    159:  * @in:  a pointer to an array of UTF-8 chars
                    160:  * @inlen:  the length of @in
1.1       daniel    161:  *
                    162:  * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
                    163:  * block of chars out.
1.15      daniel    164:  * TODO: UTF8Toisolat1 need a fallback mechanism ...
                    165:  *
1.33      daniel    166:  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1.28      daniel    167:  * The value of @inlen after return is the number of octets consumed
                    168:  *     as the return value is positive, else unpredictiable.
1.33      daniel    169:  * The value of @outlen after return is the number of ocetes consumed.
1.1       daniel    170:  */
                    171: int
1.33      daniel    172: UTF8Toisolat1(unsigned char* out, int *outlen,
1.25      daniel    173:               const unsigned char* in, int *inlen) {
1.33      daniel    174:     unsigned char* outstart = out;
                    175:     const unsigned char* processed = in;
                    176:     unsigned char* outend = out + *outlen;
                    177:     const unsigned char* inend = in + *inlen;
1.1       daniel    178:     unsigned char c;
                    179: 
                    180:     while (in < inend) {
                    181:         c= *in++;
                    182:         if (c < 0x80) {
1.28      daniel    183:             if (out >= outend)  return(-1);
1.1       daniel    184:             *out++= c;
                    185:         }
1.23      daniel    186:        else if (in == inend) {
                    187:             break;
                    188:        }
                    189:        else if (((c & 0xFC) == 0xC0) && ((*in & 0xC0) == 0x80)) {
                    190:            /* a two byte utf-8 and can be encoding as isolate1 */
1.1       daniel    191:             *out++= ((c & 0x03) << 6) | (*in++ & 0x3F);
1.23      daniel    192:        }
1.33      daniel    193:        else {
                    194:            *outlen = out - outstart;
                    195:            *inlen = processed - in;
1.28      daniel    196:            return(-2);
1.33      daniel    197:        }
                    198:        processed = in;
1.1       daniel    199:     }
1.33      daniel    200:     *outlen = out - outstart;
                    201:     *inlen = processed - in;
                    202:     return(0);
1.1       daniel    203: }
                    204: 
                    205: /**
1.28      daniel    206:  * UTF16LEToUTF8:
                    207:  * @out:  a pointer to an array of bytes to store the result
                    208:  * @outlen:  the length of @out
                    209:  * @inb:  a pointer to an array of UTF-16LE passwd as a byte array
                    210:  * @inlenb:  the length of @in in UTF-16LE chars
                    211:  *
                    212:  * Take a block of UTF-16LE ushorts in and try to convert it to an UTF-8
                    213:  * block of chars out. This function assume the endian properity
                    214:  * is the same between the native type of this machine and the
                    215:  * inputed one.
                    216:  *
                    217:  * Returns the number of byte written, or -1 by lack of space, or -2
                    218:  *     if the transcoding fails (for *in is not valid utf16 string)
                    219:  *     The value of *inlen after return is the number of octets consumed
                    220:  *     as the return value is positive, else unpredictiable.
                    221:  */
                    222: int
1.33      daniel    223: UTF16LEToUTF8(unsigned char* out, int *outlen,
1.28      daniel    224:             const unsigned char* inb, int *inlenb)
                    225: {
1.33      daniel    226:     unsigned char* outstart = out;
                    227:     const unsigned char* processed = inb;
                    228:     unsigned char* outend = out + *outlen;
1.28      daniel    229:     unsigned short* in = (unsigned short*) inb;
                    230:     unsigned short* inend;
                    231:     unsigned int c, d, inlen;
                    232:     unsigned char *tmp;
                    233:     int bits;
                    234: 
                    235:     if ((*inlenb % 2) == 1)
                    236:         (*inlenb)--;
                    237:     inlen = *inlenb / 2;
1.33      daniel    238:     inend = in + inlen;
1.28      daniel    239:     while (in < inend) {
1.34      daniel    240:         if (xmlLittleEndian) {
                    241:            c= *in++;
                    242:        } else {
                    243:            tmp = (unsigned char *) in;
                    244:            c = *tmp++;
                    245:            c = c | (((unsigned int)*tmp) << 8);
                    246:            in++;
                    247:        }
1.28      daniel    248:         if ((c & 0xFC00) == 0xD800) {    /* surrogates */
                    249:             if (in >= inend) {           /* (in > inend) shouldn't happens */
                    250:                 break;
                    251:             }
1.34      daniel    252:            if (xmlLittleEndian) {
                    253:                d = *in++;
                    254:            } else {
                    255:                tmp = (unsigned char *) in;
                    256:                d = *tmp++;
                    257:                d = d | (((unsigned int)*tmp) << 8);
                    258:                in++;
                    259:            }
1.28      daniel    260:             if ((d & 0xFC00) == 0xDC00) {
                    261:                 c &= 0x03FF;
                    262:                 c <<= 10;
                    263:                 c |= d & 0x03FF;
                    264:                 c += 0x10000;
                    265:             }
1.33      daniel    266:             else {
                    267:                *outlen = out - outstart;
                    268:                *inlenb = processed - inb;
1.28      daniel    269:                return(-2);
1.33      daniel    270:            }
1.28      daniel    271:         }
                    272: 
                    273:        /* assertion: c is a single UTF-4 value */
                    274:         if (out >= outend)
1.33      daniel    275:            break;
1.28      daniel    276:         if      (c <    0x80) {  *out++=  c;                bits= -6; }
                    277:         else if (c <   0x800) {  *out++= ((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
                    278:         else if (c < 0x10000) {  *out++= ((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
                    279:         else                  {  *out++= ((c >> 18) & 0x07) | 0xF0;  bits= 12; }
                    280:  
                    281:         for ( ; bits >= 0; bits-= 6) {
                    282:             if (out >= outend)
1.33      daniel    283:                break;
1.28      daniel    284:             *out++= ((c >> bits) & 0x3F) | 0x80;
                    285:         }
1.33      daniel    286:        processed = (const unsigned char*) in;
1.28      daniel    287:     }
1.33      daniel    288:     *outlen = out - outstart;
                    289:     *inlenb = processed - inb;
                    290:     return(0);
1.28      daniel    291: }
                    292: 
                    293: /**
                    294:  * UTF8ToUTF16LE:
                    295:  * @outb:  a pointer to an array of bytes to store the result
                    296:  * @outlen:  the length of @outb
                    297:  * @in:  a pointer to an array of UTF-8 chars
                    298:  * @inlen:  the length of @in
                    299:  *
                    300:  * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
                    301:  * block of chars out.
                    302:  * TODO: UTF8ToUTF16LE need a fallback mechanism ...
                    303:  *
                    304:  * Returns the number of byte written, or -1 by lack of space, or -2
                    305:  *     if the transcoding failed. 
                    306:  */
                    307: int
1.33      daniel    308: UTF8ToUTF16LE(unsigned char* outb, int *outlen,
1.28      daniel    309:             const unsigned char* in, int *inlen)
                    310: {
                    311:     unsigned short* out = (unsigned short*) outb;
1.33      daniel    312:     const unsigned char* processed = in;
1.28      daniel    313:     unsigned short* outstart= out;
                    314:     unsigned short* outend;
                    315:     const unsigned char* inend= in+*inlen;
                    316:     unsigned int c, d, trailing;
                    317:     unsigned char *tmp;
                    318:     unsigned short tmp1, tmp2;
                    319: 
1.37    ! daniel    320:     if (in == NULL) {
        !           321:         /*
        !           322:         * initialization, add the Byte Order Mark
        !           323:         */
        !           324:         if (*outlen >= 2) {
        !           325:            outb[0] = 0xFF;
        !           326:            outb[1] = 0xFE;
        !           327:            *outlen = 2;
        !           328:            *inlen = 0;
        !           329: #ifdef DEBUG_ENCODING
        !           330:             fprintf(stderr, "Added FFFE Byte Order Mark\n");
        !           331: #endif
        !           332:            return(2);
        !           333:        }
        !           334:        *outlen = 0;
        !           335:        *inlen = 0;
        !           336:        return(0);
        !           337:     }
1.33      daniel    338:     outend = out + (*outlen / 2);
1.28      daniel    339:     while (in < inend) {
                    340:       d= *in++;
                    341:       if      (d < 0x80)  { c= d; trailing= 0; }
1.33      daniel    342:       else if (d < 0xC0) {
                    343:           /* trailing byte in leading position */
                    344:          *outlen = out - outstart;
                    345:          *inlen = processed - in;
                    346:          return(-2);
                    347:       } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
1.28      daniel    348:       else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
                    349:       else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
1.33      daniel    350:       else {
                    351:        /* no chance for this in UTF-16 */
                    352:        *outlen = out - outstart;
                    353:        *inlen = processed - in;
                    354:        return(-2);
                    355:       }
1.28      daniel    356: 
                    357:       if (inend - in < trailing) {
                    358:           break;
                    359:       } 
                    360: 
                    361:       for ( ; trailing; trailing--) {
                    362:           if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1.33      daniel    363:              break;
1.28      daniel    364:           c <<= 6;
                    365:           c |= d & 0x3F;
                    366:       }
                    367: 
                    368:       /* assertion: c is a single UTF-4 value */
                    369:         if (c < 0x10000) {
                    370:             if (out >= outend)
1.33      daniel    371:                break;
1.34      daniel    372:            if (xmlLittleEndian) {
                    373:                *out++ = c;
                    374:            } else {
                    375:                tmp = (unsigned char *) out;
                    376:                *tmp = c ;
                    377:                *(tmp + 1) = c >> 8 ;
                    378:                out++;
                    379:            }
1.28      daniel    380:         }
                    381:         else if (c < 0x110000) {
                    382:             if (out+1 >= outend)
1.33      daniel    383:                break;
1.28      daniel    384:             c -= 0x10000;
1.34      daniel    385:            if (xmlLittleEndian) {
                    386:                *out++ = 0xD800 | (c >> 10);
                    387:                *out++ = 0xDC00 | (c & 0x03FF);
                    388:            } else {
                    389:                tmp1 = 0xD800 | (c >> 10);
                    390:                tmp = (unsigned char *) out;
                    391:                *tmp = tmp1;
                    392:                *(tmp + 1) = tmp1 >> 8;
                    393:                out++;
                    394: 
                    395:                tmp2 = 0xDC00 | (c & 0x03FF);
                    396:                tmp = (unsigned char *) out;
                    397:                *tmp  = tmp2;
                    398:                *(tmp + 1) = tmp2 >> 8;
                    399:                out++;
                    400:            }
1.28      daniel    401:         }
                    402:         else
1.33      daniel    403:            break;
                    404:        processed = in;
1.28      daniel    405:     }
1.36      daniel    406:     *outlen = (out - outstart) * 2;
1.33      daniel    407:     *inlen = processed - in;
                    408:     return(0);
1.28      daniel    409: }
                    410: 
                    411: /**
                    412:  * UTF16BEToUTF8:
1.18      daniel    413:  * @out:  a pointer to an array of bytes to store the result
                    414:  * @outlen:  the length of @out
1.25      daniel    415:  * @inb:  a pointer to an array of UTF-16 passwd as a byte array
                    416:  * @inlenb:  the length of @in in UTF-16 chars
1.1       daniel    417:  *
                    418:  * Take a block of UTF-16 ushorts in and try to convert it to an UTF-8
1.28      daniel    419:  * block of chars out. This function assume the endian properity
                    420:  * is the same between the native type of this machine and the
                    421:  * inputed one.
1.25      daniel    422:  *
1.28      daniel    423:  * Returns the number of byte written, or -1 by lack of space, or -2
                    424:  *     if the transcoding fails (for *in is not valid utf16 string)
                    425:  * The value of *inlen after return is the number of octets consumed
                    426:  *     as the return value is positive, else unpredictiable.
1.1       daniel    427:  */
                    428: int
1.33      daniel    429: UTF16BEToUTF8(unsigned char* out, int *outlen,
1.25      daniel    430:             const unsigned char* inb, int *inlenb)
1.1       daniel    431: {
1.33      daniel    432:     unsigned char* outstart = out;
                    433:     const unsigned char* processed = inb;
                    434:     unsigned char* outend = out + *outlen;
1.25      daniel    435:     unsigned short* in = (unsigned short*) inb;
                    436:     unsigned short* inend;
                    437:     unsigned int c, d, inlen;
1.28      daniel    438:     unsigned char *tmp;
1.1       daniel    439:     int bits;
                    440: 
1.28      daniel    441:     if ((*inlenb % 2) == 1)
                    442:         (*inlenb)--;
1.25      daniel    443:     inlen = *inlenb / 2;
                    444:     inend= in + inlen;
1.1       daniel    445:     while (in < inend) {
1.34      daniel    446:        if (xmlLittleEndian) {
                    447:            tmp = (unsigned char *) in;
                    448:            c = *tmp++;
                    449:            c = c << 8;
                    450:            c = c | (unsigned int) *tmp;
                    451:            in++;
                    452:        } else {
                    453:            c= *in++;
                    454:        } 
1.1       daniel    455:         if ((c & 0xFC00) == 0xD800) {    /* surrogates */
1.28      daniel    456:            if (in >= inend) {           /* (in > inend) shouldn't happens */
1.33      daniel    457:                *outlen = out - outstart;
                    458:                *inlenb = processed - inb;
                    459:                return(-2);
1.28      daniel    460:            }
1.34      daniel    461:            if (xmlLittleEndian) {
                    462:                tmp = (unsigned char *) in;
                    463:                d = *tmp++;
                    464:                d = d << 8;
                    465:                d = d | (unsigned int) *tmp;
                    466:                in++;
                    467:            } else {
                    468:                d= *in++;
                    469:            }
1.28      daniel    470:             if ((d & 0xFC00) == 0xDC00) {
1.1       daniel    471:                 c &= 0x03FF;
                    472:                 c <<= 10;
                    473:                 c |= d & 0x03FF;
                    474:                 c += 0x10000;
                    475:             }
1.33      daniel    476:             else {
                    477:                *outlen = out - outstart;
                    478:                *inlenb = processed - inb;
1.28      daniel    479:                return(-2);
1.33      daniel    480:            }
1.1       daniel    481:         }
                    482: 
1.25      daniel    483:        /* assertion: c is a single UTF-4 value */
1.27      daniel    484:         if (out >= outend) 
1.33      daniel    485:            break;
1.1       daniel    486:         if      (c <    0x80) {  *out++=  c;                bits= -6; }
1.26      daniel    487:         else if (c <   0x800) {  *out++= ((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
                    488:         else if (c < 0x10000) {  *out++= ((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
                    489:         else                  {  *out++= ((c >> 18) & 0x07) | 0xF0;  bits= 12; }
1.1       daniel    490:  
1.26      daniel    491:         for ( ; bits >= 0; bits-= 6) {
1.27      daniel    492:             if (out >= outend) 
1.33      daniel    493:                break;
1.26      daniel    494:             *out++= ((c >> bits) & 0x3F) | 0x80;
1.1       daniel    495:         }
1.33      daniel    496:        processed = (const unsigned char*) in;
1.1       daniel    497:     }
1.33      daniel    498:     *outlen = out - outstart;
                    499:     *inlenb = processed - inb;
                    500:     return(0);
1.1       daniel    501: }
                    502: 
                    503: /**
1.28      daniel    504:  * UTF8ToUTF16BE:
1.25      daniel    505:  * @outb:  a pointer to an array of bytes to store the result
                    506:  * @outlen:  the length of @outb
1.18      daniel    507:  * @in:  a pointer to an array of UTF-8 chars
                    508:  * @inlen:  the length of @in
1.1       daniel    509:  *
1.28      daniel    510:  * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
1.1       daniel    511:  * block of chars out.
1.28      daniel    512:  * TODO: UTF8ToUTF16BE need a fallback mechanism ...
1.15      daniel    513:  *
1.6       daniel    514:  * Returns the number of byte written, or -1 by lack of space, or -2
1.25      daniel    515:  *     if the transcoding failed. 
1.1       daniel    516:  */
                    517: int
1.33      daniel    518: UTF8ToUTF16BE(unsigned char* outb, int *outlen,
1.25      daniel    519:             const unsigned char* in, int *inlen)
1.1       daniel    520: {
1.25      daniel    521:     unsigned short* out = (unsigned short*) outb;
1.33      daniel    522:     const unsigned char* processed = in;
1.1       daniel    523:     unsigned short* outstart= out;
1.28      daniel    524:     unsigned short* outend;
1.25      daniel    525:     const unsigned char* inend= in+*inlen;
1.1       daniel    526:     unsigned int c, d, trailing;
1.28      daniel    527:     unsigned char *tmp;
                    528:     unsigned short tmp1, tmp2;
1.1       daniel    529: 
1.37    ! daniel    530:     if (in == NULL) {
        !           531:         /*
        !           532:         * initialization, add the Byte Order Mark
        !           533:         */
        !           534:         if (*outlen >= 2) {
        !           535:            outb[0] = 0xFE;
        !           536:            outb[1] = 0xFF;
        !           537:            *outlen = 2;
        !           538:            *inlen = 0;
        !           539: #ifdef DEBUG_ENCODING
        !           540:             fprintf(stderr, "Added FEFF Byte Order Mark\n");
        !           541: #endif
        !           542:            return(2);
        !           543:        }
        !           544:        *outlen = 0;
        !           545:        *inlen = 0;
        !           546:        return(0);
        !           547:     }
1.33      daniel    548:     outend = out + (*outlen / 2);
1.1       daniel    549:     while (in < inend) {
                    550:       d= *in++;
                    551:       if      (d < 0x80)  { c= d; trailing= 0; }
1.33      daniel    552:       else if (d < 0xC0)  {
                    553:           /* trailing byte in leading position */
                    554:          *outlen = out - outstart;
                    555:          *inlen = processed - in;
                    556:          return(-2);
                    557:       } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
1.1       daniel    558:       else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
                    559:       else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
1.33      daniel    560:       else {
                    561:           /* no chance for this in UTF-16 */
                    562:          *outlen = out - outstart;
                    563:          *inlen = processed - in;
                    564:          return(-2);
                    565:       }
1.28      daniel    566: 
                    567:       if (inend - in < trailing) {
                    568:           break;
                    569:       } 
1.1       daniel    570: 
                    571:       for ( ; trailing; trailing--) {
1.33      daniel    572:           if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))  break;
1.1       daniel    573:           c <<= 6;
                    574:           c |= d & 0x3F;
                    575:       }
                    576: 
                    577:       /* assertion: c is a single UTF-4 value */
                    578:         if (c < 0x10000) {
1.33      daniel    579:             if (out >= outend)  break;
1.34      daniel    580:            if (xmlLittleEndian) {
                    581:                tmp = (unsigned char *) out;
                    582:                *tmp = c >> 8;
                    583:                *(tmp + 1) = c;
                    584:                out++;
                    585:            } else {
                    586:                *out++ = c;
                    587:            }
1.1       daniel    588:         }
                    589:         else if (c < 0x110000) {
1.33      daniel    590:             if (out+1 >= outend)  break;
1.1       daniel    591:             c -= 0x10000;
1.34      daniel    592:            if (xmlLittleEndian) {
                    593:                tmp1 = 0xD800 | (c >> 10);
                    594:                tmp = (unsigned char *) out;
                    595:                *tmp = tmp1 >> 8;
                    596:                *(tmp + 1) = tmp1;
                    597:                out++;
                    598: 
                    599:                tmp2 = 0xDC00 | (c & 0x03FF);
                    600:                tmp = (unsigned char *) out;
                    601:                *tmp = tmp2 >> 8;
                    602:                *(tmp + 1) = tmp2;
                    603:                out++;
                    604:            } else {
                    605:                *out++ = 0xD800 | (c >> 10);
                    606:                *out++ = 0xDC00 | (c & 0x03FF);
                    607:            }
1.1       daniel    608:         }
1.33      daniel    609:         else
                    610:            break;
                    611:        processed = in;
1.1       daniel    612:     }
1.36      daniel    613:     *outlen = (out - outstart) * 2;
1.33      daniel    614:     *inlen = processed - in;
                    615:     return(0);
1.1       daniel    616: }
                    617: 
1.7       daniel    618: /**
                    619:  * xmlDetectCharEncoding:
                    620:  * @in:  a pointer to the first bytes of the XML entity, must be at least
                    621:  *       4 bytes long.
1.25      daniel    622:  * @len:  pointer to the length of the buffer
1.7       daniel    623:  *
                    624:  * Guess the encoding of the entity using the first bytes of the entity content
                    625:  * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
                    626:  * 
                    627:  * Returns one of the XML_CHAR_ENCODING_... values.
                    628:  */
                    629: xmlCharEncoding
1.25      daniel    630: xmlDetectCharEncoding(const unsigned char* in, int len)
1.7       daniel    631: {
1.25      daniel    632:     if (len >= 4) {
                    633:        if ((in[0] == 0x00) && (in[1] == 0x00) &&
                    634:            (in[2] == 0x00) && (in[3] == 0x3C))
                    635:            return(XML_CHAR_ENCODING_UCS4BE);
                    636:        if ((in[0] == 0x3C) && (in[1] == 0x00) &&
                    637:            (in[2] == 0x00) && (in[3] == 0x00))
                    638:            return(XML_CHAR_ENCODING_UCS4LE);
                    639:        if ((in[0] == 0x00) && (in[1] == 0x00) &&
                    640:            (in[2] == 0x3C) && (in[3] == 0x00))
                    641:            return(XML_CHAR_ENCODING_UCS4_2143);
                    642:        if ((in[0] == 0x00) && (in[1] == 0x3C) &&
                    643:            (in[2] == 0x00) && (in[3] == 0x00))
                    644:            return(XML_CHAR_ENCODING_UCS4_3412);
                    645:        if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
                    646:            (in[2] == 0xA7) && (in[3] == 0x94))
                    647:            return(XML_CHAR_ENCODING_EBCDIC);
                    648:        if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
                    649:            (in[2] == 0x78) && (in[3] == 0x6D))
                    650:            return(XML_CHAR_ENCODING_UTF8);
                    651:     }
                    652:     if (len >= 2) {
                    653:        if ((in[0] == 0xFE) && (in[1] == 0xFF))
                    654:            return(XML_CHAR_ENCODING_UTF16BE);
                    655:        if ((in[0] == 0xFF) && (in[1] == 0xFE))
                    656:            return(XML_CHAR_ENCODING_UTF16LE);
                    657:     }
1.7       daniel    658:     return(XML_CHAR_ENCODING_NONE);
                    659: }
                    660: 
                    661: /**
                    662:  * xmlParseCharEncoding:
1.18      daniel    663:  * @name:  the encoding name as parsed, in UTF-8 format (ASCII actually)
1.7       daniel    664:  *
                    665:  * Conpare the string to the known encoding schemes already known. Note
                    666:  * that the comparison is case insensitive accordingly to the section
                    667:  * [XML] 4.3.3 Character Encoding in Entities.
                    668:  * 
                    669:  * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
                    670:  * if not recognized.
                    671:  */
                    672: xmlCharEncoding
1.8       daniel    673: xmlParseCharEncoding(const char* name)
1.7       daniel    674: {
                    675:     char upper[500];
                    676:     int i;
                    677: 
                    678:     for (i = 0;i < 499;i++) {
                    679:         upper[i] = toupper(name[i]);
                    680:        if (upper[i] == 0) break;
                    681:     }
                    682:     upper[i] = 0;
                    683: 
                    684:     if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
                    685:     if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
                    686:     if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
                    687: 
                    688:     /*
                    689:      * NOTE: if we were able to parse this, the endianness of UTF16 is
                    690:      *       already found and in use
                    691:      */
                    692:     if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
                    693:     if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
                    694:     
                    695:     if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
                    696:     if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
                    697:     if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
                    698: 
                    699:     /*
                    700:      * NOTE: if we were able to parse this, the endianness of UCS4 is
                    701:      *       already found and in use
                    702:      */
                    703:     if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
                    704:     if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
                    705:     if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
                    706: 
                    707:     
                    708:     if (!strcmp(upper,  "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
                    709:     if (!strcmp(upper,  "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
                    710:     if (!strcmp(upper,  "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
                    711: 
                    712:     if (!strcmp(upper,  "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
                    713:     if (!strcmp(upper,  "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
                    714:     if (!strcmp(upper,  "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
                    715: 
                    716:     if (!strcmp(upper,  "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
                    717:     if (!strcmp(upper,  "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
                    718:     if (!strcmp(upper,  "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
                    719:     if (!strcmp(upper,  "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
                    720:     if (!strcmp(upper,  "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
                    721:     if (!strcmp(upper,  "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
                    722:     if (!strcmp(upper,  "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
                    723: 
                    724:     if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
1.30      daniel    725:     if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
1.7       daniel    726:     if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
1.30      daniel    727: 
                    728: #ifdef DEBUG_ENCODING
                    729:     fprintf(stderr, "Unknown encoding %s\n", name);
                    730: #endif
1.7       daniel    731:     return(XML_CHAR_ENCODING_ERROR);
                    732: }
1.9       daniel    733: 
                    734: /****************************************************************
                    735:  *                                                             *
                    736:  *             Char encoding handlers                          *
                    737:  *                                                             *
                    738:  ****************************************************************/
                    739: 
                    740: /* the size should be growable, but it's not a big deal ... */
                    741: #define MAX_ENCODING_HANDLERS 50
                    742: static xmlCharEncodingHandlerPtr *handlers = NULL;
                    743: static int nbCharEncodingHandler = 0;
                    744: 
                    745: /*
                    746:  * The default is UTF-8 for XML, that's also the default used for the
                    747:  * parser internals, so the default encoding handler is NULL
                    748:  */
                    749: 
                    750: static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
                    751: 
                    752: /**
                    753:  * xmlNewCharEncodingHandler:
1.18      daniel    754:  * @name:  the encoding name, in UTF-8 format (ASCII actually)
1.9       daniel    755:  * @input:  the xmlCharEncodingInputFunc to read that encoding
                    756:  * @output:  the xmlCharEncodingOutputFunc to write that encoding
                    757:  *
                    758:  * Create and registers an xmlCharEncodingHandler.
                    759:  * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
                    760:  */
                    761: xmlCharEncodingHandlerPtr
1.25      daniel    762: xmlNewCharEncodingHandler(const char *name, 
                    763:                           xmlCharEncodingInputFunc input,
1.9       daniel    764:                           xmlCharEncodingOutputFunc output) {
                    765:     xmlCharEncodingHandlerPtr handler;
                    766:     char upper[500];
                    767:     int i;
                    768:     char *up = 0;
                    769: 
                    770:     /*
                    771:      * Keep only the uppercase version of the encoding.
                    772:      */
                    773:     if (name == NULL) {
                    774:         fprintf(stderr, "xmlNewCharEncodingHandler : no name !\n");
                    775:        return(NULL);
                    776:     }
                    777:     for (i = 0;i < 499;i++) {
                    778:         upper[i] = toupper(name[i]);
                    779:        if (upper[i] == 0) break;
                    780:     }
                    781:     upper[i] = 0;
1.16      daniel    782:     up = xmlMemStrdup(upper);
1.9       daniel    783:     if (up == NULL) {
                    784:         fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
                    785:        return(NULL);
                    786:     }
                    787: 
                    788:     /*
                    789:      * allocate and fill-up an handler block.
                    790:      */
                    791:     handler = (xmlCharEncodingHandlerPtr)
1.16      daniel    792:               xmlMalloc(sizeof(xmlCharEncodingHandler));
1.9       daniel    793:     if (handler == NULL) {
                    794:         fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
                    795:        return(NULL);
                    796:     }
                    797:     handler->input = input;
                    798:     handler->output = output;
                    799:     handler->name = up;
                    800: 
                    801:     /*
                    802:      * registers and returns the handler.
                    803:      */
                    804:     xmlRegisterCharEncodingHandler(handler);
1.30      daniel    805: #ifdef DEBUG_ENCODING
                    806:     fprintf(stderr, "Registered encoding handler for %s\n", name);
                    807: #endif
1.9       daniel    808:     return(handler);
                    809: }
                    810: 
                    811: /**
                    812:  * xmlInitCharEncodingHandlers:
                    813:  *
                    814:  * Initialize the char encoding support, it registers the default
                    815:  * encoding supported.
1.18      daniel    816:  * NOTE: while public, this function usually doesn't need to be called
1.9       daniel    817:  *       in normal processing.
                    818:  */
                    819: void
                    820: xmlInitCharEncodingHandlers(void) {
1.34      daniel    821:     unsigned short int tst = 0x1234;
                    822:     unsigned char *ptr = (unsigned char *) &tst; 
                    823: 
1.9       daniel    824:     if (handlers != NULL) return;
                    825: 
                    826:     handlers = (xmlCharEncodingHandlerPtr *)
1.16      daniel    827:         xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
1.34      daniel    828: 
                    829:     if (*ptr == 0x12) xmlLittleEndian = 0;
                    830:     else if (*ptr == 0x34) xmlLittleEndian = 1;
                    831:     else fprintf(stderr, "Odd problem at endianness detection\n");
1.9       daniel    832: 
                    833:     if (handlers == NULL) {
                    834:         fprintf(stderr, "xmlInitCharEncodingHandlers : out of memory !\n");
                    835:        return;
                    836:     }
1.10      daniel    837:     xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
1.25      daniel    838:     xmlUTF16LEHandler = 
1.28      daniel    839:           xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
                    840:     xmlUTF16BEHandler = 
                    841:           xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
1.10      daniel    842:     xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
1.9       daniel    843: }
                    844: 
                    845: /**
1.19      daniel    846:  * xmlCleanupCharEncodingHandlers:
                    847:  *
                    848:  * Cleanup the memory allocated for the char encoding support, it
                    849:  * unregisters all the encoding handlers.
                    850:  */
                    851: void
                    852: xmlCleanupCharEncodingHandlers(void) {
                    853:     if (handlers == NULL) return;
                    854: 
                    855:     for (;nbCharEncodingHandler > 0;) {
                    856:         nbCharEncodingHandler--;
                    857:        if (handlers[nbCharEncodingHandler] != NULL) {
1.31      daniel    858:            if (handlers[nbCharEncodingHandler]->name != NULL)
                    859:                xmlFree(handlers[nbCharEncodingHandler]->name);
1.19      daniel    860:            xmlFree(handlers[nbCharEncodingHandler]);
                    861:        }
                    862:     }
                    863:     xmlFree(handlers);
                    864:     handlers = NULL;
                    865:     nbCharEncodingHandler = 0;
                    866:     xmlDefaultCharEncodingHandler = NULL;
                    867: }
                    868: 
                    869: /**
1.9       daniel    870:  * xmlRegisterCharEncodingHandler:
                    871:  * @handler:  the xmlCharEncodingHandlerPtr handler block
                    872:  *
                    873:  * Register the char encoding handler, surprizing, isn't it ?
                    874:  */
                    875: void
                    876: xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
                    877:     if (handlers == NULL) xmlInitCharEncodingHandlers();
                    878:     if (handler == NULL) {
                    879:         fprintf(stderr, "xmlRegisterCharEncodingHandler: NULL handler !\n");
                    880:        return;
                    881:     }
                    882: 
                    883:     if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
                    884:         fprintf(stderr, 
                    885:        "xmlRegisterCharEncodingHandler: Too many handler registered\n");
                    886:         fprintf(stderr, "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
                    887:        return;
                    888:     }
                    889:     handlers[nbCharEncodingHandler++] = handler;
                    890: }
                    891: 
                    892: /**
                    893:  * xmlGetCharEncodingHandler:
                    894:  * @enc:  an xmlCharEncoding value.
                    895:  *
                    896:  * Search in the registrered set the handler able to read/write that encoding.
                    897:  *
                    898:  * Returns the handler or NULL if not found
                    899:  */
                    900: xmlCharEncodingHandlerPtr
                    901: xmlGetCharEncodingHandler(xmlCharEncoding enc) {
1.30      daniel    902:     xmlCharEncodingHandlerPtr handler;
                    903: 
1.9       daniel    904:     if (handlers == NULL) xmlInitCharEncodingHandlers();
1.25      daniel    905:     switch (enc) {
                    906:         case XML_CHAR_ENCODING_ERROR:
                    907:            return(NULL);
                    908:         case XML_CHAR_ENCODING_NONE:
                    909:            return(NULL);
                    910:         case XML_CHAR_ENCODING_UTF8:
                    911:            return(NULL);
                    912:         case XML_CHAR_ENCODING_UTF16LE:
                    913:            return(xmlUTF16LEHandler);
                    914:         case XML_CHAR_ENCODING_UTF16BE:
                    915:            return(xmlUTF16BEHandler);
                    916:         case XML_CHAR_ENCODING_EBCDIC:
1.30      daniel    917:             handler = xmlFindCharEncodingHandler("EBCDIC");
                    918:             if (handler != NULL) return(handler);
                    919:             handler = xmlFindCharEncodingHandler("ebcdic");
                    920:             if (handler != NULL) return(handler);
                    921:            break;
1.25      daniel    922:         case XML_CHAR_ENCODING_UCS4LE:
1.30      daniel    923:             handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
                    924:             if (handler != NULL) return(handler);
                    925:             handler = xmlFindCharEncodingHandler("UCS-4");
                    926:             if (handler != NULL) return(handler);
                    927:             handler = xmlFindCharEncodingHandler("UCS4");
                    928:             if (handler != NULL) return(handler);
                    929:            break;
1.25      daniel    930:         case XML_CHAR_ENCODING_UCS4BE:
1.30      daniel    931:             handler = xmlFindCharEncodingHandler("UCS4BE");
                    932:             if (handler != NULL) return(handler);
                    933:            break;
1.25      daniel    934:         case XML_CHAR_ENCODING_UCS4_2143:
1.30      daniel    935:            break;
1.25      daniel    936:         case XML_CHAR_ENCODING_UCS4_3412:
1.30      daniel    937:            break;
1.25      daniel    938:         case XML_CHAR_ENCODING_UCS2:
1.30      daniel    939:             handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2");
                    940:             if (handler != NULL) return(handler);
                    941:             handler = xmlFindCharEncodingHandler("UCS-2");
                    942:             if (handler != NULL) return(handler);
                    943:             handler = xmlFindCharEncodingHandler("UCS2");
                    944:             if (handler != NULL) return(handler);
                    945:            break;
1.25      daniel    946:         case XML_CHAR_ENCODING_8859_1:
                    947:         case XML_CHAR_ENCODING_8859_2:
                    948:         case XML_CHAR_ENCODING_8859_3:
                    949:         case XML_CHAR_ENCODING_8859_4:
                    950:         case XML_CHAR_ENCODING_8859_5:
                    951:         case XML_CHAR_ENCODING_8859_6:
                    952:         case XML_CHAR_ENCODING_8859_7:
                    953:         case XML_CHAR_ENCODING_8859_8:
                    954:         case XML_CHAR_ENCODING_8859_9:
                    955:            return(NULL);
                    956:         case XML_CHAR_ENCODING_2022_JP:
1.30      daniel    957:             handler = xmlFindCharEncodingHandler("ISO-2022-JP");
                    958:             if (handler != NULL) return(handler);
                    959:            break;
1.25      daniel    960:         case XML_CHAR_ENCODING_SHIFT_JIS:
1.30      daniel    961:             handler = xmlFindCharEncodingHandler("SHIFT-JIS");
                    962:             if (handler != NULL) return(handler);
                    963:             handler = xmlFindCharEncodingHandler("SHIFT_JIS");
                    964:             if (handler != NULL) return(handler);
                    965:             handler = xmlFindCharEncodingHandler("Shift_JIS");
                    966:             if (handler != NULL) return(handler);
                    967:            break;
1.25      daniel    968:         case XML_CHAR_ENCODING_EUC_JP:
1.30      daniel    969:             handler = xmlFindCharEncodingHandler("EUC-JP");
                    970:             if (handler != NULL) return(handler);
                    971:            break;
                    972:        default: 
                    973:            break;
1.25      daniel    974:     }
1.30      daniel    975:     
                    976: #ifdef DEBUG_ENCODING
                    977:     fprintf(stderr, "No handler found for encoding %d\n", enc);
                    978: #endif
1.9       daniel    979:     return(NULL);
                    980: }
                    981: 
                    982: /**
                    983:  * xmlGetCharEncodingHandler:
                    984:  * @enc:  a string describing the char encoding.
                    985:  *
                    986:  * Search in the registrered set the handler able to read/write that encoding.
                    987:  *
                    988:  * Returns the handler or NULL if not found
                    989:  */
                    990: xmlCharEncodingHandlerPtr
                    991: xmlFindCharEncodingHandler(const char *name) {
1.36      daniel    992:     xmlCharEncodingHandlerPtr enc;
                    993:     xmlCharEncoding alias;
1.30      daniel    994: #ifdef LIBXML_ICONV_ENABLED
                    995:     iconv_t icv_in, icv_out;
                    996: #endif /* LIBXML_ICONV_ENABLED */
                    997:     char upper[100];
1.9       daniel    998:     int i;
                    999: 
                   1000:     if (handlers == NULL) xmlInitCharEncodingHandlers();
                   1001:     if (name == NULL) return(xmlDefaultCharEncodingHandler);
                   1002:     if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
                   1003: 
1.36      daniel   1004:     /*
                   1005:      * Check first for directly registered encoding names
                   1006:      */
1.30      daniel   1007:     for (i = 0;i < 99;i++) {
1.9       daniel   1008:         upper[i] = toupper(name[i]);
                   1009:        if (upper[i] == 0) break;
                   1010:     }
                   1011:     upper[i] = 0;
                   1012: 
                   1013:     for (i = 0;i < nbCharEncodingHandler; i++)
1.30      daniel   1014:         if (!strcmp(upper, handlers[i]->name)) {
                   1015: #ifdef DEBUG_ENCODING
                   1016:             fprintf(stderr, "Found registered handler for encoding %s\n", name);
                   1017: #endif
1.9       daniel   1018:            return(handlers[i]);
1.30      daniel   1019:        }
1.9       daniel   1020: 
1.36      daniel   1021:     /*
                   1022:      * check using aliases names
                   1023:      */
                   1024:    alias = xmlParseCharEncoding(name);
                   1025:    if (alias != XML_CHAR_ENCODING_ERROR) {
                   1026:        enc = xmlGetCharEncodingHandler(alias);
                   1027:        if (enc != NULL) {
                   1028: #ifdef DEBUG_ENCODING
                   1029:             fprintf(stderr, "Found registered handler %s for encoding %s\n",
                   1030:                    enc->name, name);
                   1031: #endif
                   1032:            return(enc);
                   1033:        }
                   1034:    }
1.30      daniel   1035: #ifdef LIBXML_ICONV_ENABLED
                   1036:     /* check whether iconv can handle this */
1.31      daniel   1037:     icv_in = iconv_open("UTF-8", name);
                   1038:     icv_out = iconv_open(name, "UTF-8");
1.30      daniel   1039:     if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
1.31      daniel   1040:            enc = xmlMalloc(sizeof(xmlCharEncodingHandler));
1.32      daniel   1041:            if (enc == NULL) {
                   1042:                iconv_close(icv_in);
                   1043:                iconv_close(icv_out);
                   1044:                return(NULL);
                   1045:            }
                   1046:            enc->name = NULL;
1.30      daniel   1047:            enc->input = NULL;
                   1048:            enc->output = NULL;
                   1049:            enc->iconv_in = icv_in;
                   1050:            enc->iconv_out = icv_out;
                   1051: #ifdef DEBUG_ENCODING
                   1052:             fprintf(stderr, "Found iconv handler for encoding %s\n", name);
                   1053: #endif
                   1054:            return enc;
                   1055:     } else if ((icv_in != (iconv_t) -1) || icv_out != (iconv_t) -1) {
                   1056:            fprintf(stderr, "iconv : problems with filters for '%s'\n", name);
                   1057:     }
                   1058: #endif /* LIBXML_ICONV_ENABLED */
                   1059: #ifdef DEBUG_ENCODING
                   1060:     fprintf(stderr, "No handler found for encoding %s\n", name);
                   1061: #endif
1.9       daniel   1062:     return(NULL);
1.30      daniel   1063: }
                   1064: 
                   1065: #ifdef LIBXML_ICONV_ENABLED
                   1066: /**
                   1067:  * xmlIconvWrapper:
                   1068:  * @cd:                iconv converter data structure
                   1069:  * @out:  a pointer to an array of bytes to store the result
                   1070:  * @outlen:  the length of @out
                   1071:  * @in:  a pointer to an array of ISO Latin 1 chars
                   1072:  * @inlen:  the length of @in
                   1073:  *
                   1074:  * Returns 0 if success, or 
                   1075:  *     -1 by lack of space, or
                   1076:  *     -2 if the transcoding fails (for *in is not valid utf8 string or
                   1077:  *        the result of transformation can't fit into the encoding we want), or
                   1078:  *     -3 if there the last byte can't form a single output char.
                   1079:  *     
                   1080:  * The value of @inlen after return is the number of octets consumed
                   1081:  *     as the return value is positive, else unpredictiable.
                   1082:  * The value of @outlen after return is the number of ocetes consumed.
                   1083:  */
                   1084: static int
                   1085: xmlIconvWrapper(iconv_t cd,
                   1086:        unsigned char *out, int *outlen,
                   1087:        const unsigned char *in, int *inlen) {
                   1088: 
                   1089:        size_t icv_inlen = *inlen, icv_outlen = *outlen;
                   1090:        const char *icv_in = (const char *) in;
                   1091:        char *icv_out = (char *) out;
                   1092:        int ret;
                   1093: 
                   1094:        ret = iconv(cd,
                   1095:                &icv_in, &icv_inlen,
                   1096:                &icv_out, &icv_outlen);
1.35      daniel   1097:        if (in != NULL) {
                   1098:            *inlen -= icv_inlen;
                   1099:            *outlen -= icv_outlen;
                   1100:        } else {
                   1101:            *inlen = 0;
                   1102:            *outlen = 0;
                   1103:        }
1.30      daniel   1104:        if (icv_inlen != 0 || ret == (size_t) -1) {
                   1105: #ifdef EILSEQ
                   1106:                if (errno == EILSEQ) {
1.31      daniel   1107:                        return -2;
1.30      daniel   1108:                } else
                   1109: #endif
                   1110: #ifdef E2BIG
                   1111:                if (errno == E2BIG) {
                   1112:                        return -1;
                   1113:                } else
                   1114: #endif
                   1115: #ifdef EINVAL
                   1116:                if (errno == EINVAL) {
1.31      daniel   1117:                        return -3;
1.30      daniel   1118:                }
                   1119: #endif
                   1120:                else {
                   1121:                        return -3;
                   1122:                }
                   1123:        }
                   1124:        return 0;
                   1125: }
                   1126: #endif /* LIBXML_ICONV_ENABLED */
                   1127: 
                   1128: /**
                   1129:  * xmlCharEncInFunc:
                   1130:  * @handler:   char enconding transformation data structure
1.31      daniel   1131:  * @out:  an xmlBuffer for the output.
                   1132:  * @in:  an xmlBuffer for the input
1.30      daniel   1133:  *     
                   1134:  * Generic front-end for the encoding handler input function
                   1135:  *     
1.31      daniel   1136:  * Returns the number of byte written if success, or 
                   1137:  *     -1 general error
1.30      daniel   1138:  *     -2 if the transcoding fails (for *in is not valid utf8 string or
                   1139:  *        the result of transformation can't fit into the encoding we want), or
                   1140:  */
                   1141: int
1.31      daniel   1142: xmlCharEncInFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
                   1143:                  xmlBufferPtr in) {
1.30      daniel   1144:     int ret = -2;
1.31      daniel   1145:     int written;
                   1146:     int toconv;
1.30      daniel   1147: 
1.31      daniel   1148:     if (handler == NULL) return(-1);
                   1149:     if (out == NULL) return(-1);
                   1150:     if (in == NULL) return(-1);
                   1151: 
                   1152:     written = out->size - out->use;
                   1153:     toconv = in->use;
                   1154:     if (toconv * 2 >= written) {
                   1155:         xmlBufferGrow(out, toconv * 2);
1.33      daniel   1156:        written = out->size - out->use - 1;
1.31      daniel   1157:     }
1.30      daniel   1158:     if (handler->input != NULL) {
1.32      daniel   1159:        ret = handler->input(&out->content[out->use], &written,
1.31      daniel   1160:                             in->content, &toconv);
                   1161:        xmlBufferShrink(in, toconv);
                   1162:        out->use += written;
1.33      daniel   1163:        out->content[out->use] = 0;
1.30      daniel   1164:     }
                   1165: #ifdef LIBXML_ICONV_ENABLED
1.31      daniel   1166:     else if (handler->iconv_in != NULL) {
                   1167:        ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
                   1168:                              &written, in->content, &toconv);
                   1169:        xmlBufferShrink(in, toconv);
                   1170:        out->use += written;
1.33      daniel   1171:        out->content[out->use] = 0;
                   1172:        if (ret == -1) ret = -3;
1.30      daniel   1173:     }
                   1174: #endif /* LIBXML_ICONV_ENABLED */
                   1175: #ifdef DEBUG_ENCODING
                   1176:     switch (ret) {
                   1177:         case 0:
                   1178:            fprintf(stderr, "converted %d bytes to %d bytes of input\n",
1.31      daniel   1179:                    toconv, written);
1.30      daniel   1180:            break;
                   1181:         case -1:
1.31      daniel   1182:            fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
                   1183:                    toconv, written, in->use);
1.30      daniel   1184:            break;
                   1185:         case -2:
                   1186:            fprintf(stderr, "input conversion failed due to input error\n");
                   1187:            break;
                   1188:         case -3:
1.31      daniel   1189:            fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
                   1190:                    toconv, written, in->use);
1.30      daniel   1191:            break;
                   1192:        default:
                   1193:            fprintf(stderr,"Unknown input conversion failed %d\n", ret);
                   1194:     }
                   1195: #endif
1.33      daniel   1196:     /*
                   1197:      * Ignore when input buffer is not on a boundary
                   1198:      */
                   1199:     if (ret == -3) ret = 0;
1.30      daniel   1200:     return(ret);
                   1201: }
                   1202: 
                   1203: /**
                   1204:  * xmlCharEncOutFunc:
                   1205:  * @handler:   char enconding transformation data structure
1.31      daniel   1206:  * @out:  an xmlBuffer for the output.
                   1207:  * @in:  an xmlBuffer for the input
                   1208:  *     
                   1209:  * Generic front-end for the encoding handler output function
1.35      daniel   1210:  * a first call with @in == NULL has to be made firs to initiate the 
                   1211:  * output in case of non-stateless encoding needing to initiate their
                   1212:  * state or the output (like the BOM in UTF16).
1.30      daniel   1213:  *     
1.31      daniel   1214:  * Returns the number of byte written if success, or 
                   1215:  *     -1 general error
1.30      daniel   1216:  *     -2 if the transcoding fails (for *in is not valid utf8 string or
                   1217:  *        the result of transformation can't fit into the encoding we want), or
                   1218:  */
                   1219: int
1.31      daniel   1220: xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
                   1221:                   xmlBufferPtr in) {
1.30      daniel   1222:     int ret = -2;
1.31      daniel   1223:     int written;
                   1224:     int toconv;
                   1225: 
                   1226:     if (handler == NULL) return(-1);
                   1227:     if (out == NULL) return(-1);
1.35      daniel   1228:     written = out->size - out->use;
                   1229: 
                   1230:     if (in == NULL) {
                   1231:         toconv = 0;
                   1232:        if (handler->output != NULL) {
                   1233:            ret = handler->output(&out->content[out->use], &written,
                   1234:                                  NULL, &toconv);
                   1235:            out->use += written;
                   1236:            out->content[out->use] = 0;
                   1237:        }
                   1238: #ifdef LIBXML_ICONV_ENABLED
                   1239:        else if (handler->iconv_out != NULL) {
                   1240:            ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
                   1241:                                  &written, NULL, &toconv);
                   1242:            out->use += written;
                   1243:            out->content[out->use] = 0;
                   1244:        }
                   1245: #endif /* LIBXML_ICONV_ENABLED */
                   1246: #ifdef DEBUG_ENCODING
                   1247:        fprintf(stderr, "initialized encoder\n");
                   1248: #endif
                   1249:         return(0);
                   1250:     }
1.30      daniel   1251: 
1.33      daniel   1252:     toconv = in->use;
                   1253:     if (toconv * 2 >= written) {
                   1254:         xmlBufferGrow(out, toconv * 2);
                   1255:        written = out->size - out->use - 1;
                   1256:     }
1.30      daniel   1257:     if (handler->output != NULL) {
1.33      daniel   1258:        ret = handler->output(&out->content[out->use], &written,
1.35      daniel   1259:                              in->content, &toconv);
1.31      daniel   1260:        xmlBufferShrink(in, toconv);
                   1261:        out->use += written;
1.33      daniel   1262:        out->content[out->use] = 0;
1.30      daniel   1263:     }
                   1264: #ifdef LIBXML_ICONV_ENABLED
                   1265:     else if (handler->iconv_out != NULL) {
1.31      daniel   1266:        ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
                   1267:                              &written, in->content, &toconv);
                   1268:        xmlBufferShrink(in, toconv);
                   1269:        out->use += written;
1.33      daniel   1270:        out->content[out->use] = 0;
                   1271:        if (ret == -1) ret = -3;
1.30      daniel   1272:     }
                   1273: #endif /* LIBXML_ICONV_ENABLED */
                   1274: #ifdef DEBUG_ENCODING
                   1275:     switch (ret) {
                   1276:         case 0:
                   1277:            fprintf(stderr, "converted %d bytes to %d bytes of output\n",
1.31      daniel   1278:                    toconv, written);
1.30      daniel   1279:            break;
                   1280:         case -1:
                   1281:            fprintf(stderr, "output conversion failed by lack of space\n");
                   1282:            break;
                   1283:         case -2:
                   1284:            fprintf(stderr, "output conversion failed due to output error\n");
                   1285:            break;
                   1286:         case -3:
1.31      daniel   1287:            fprintf(stderr,"converted %d bytes to %d bytes of output %d left\n",
                   1288:                    toconv, written, in->use);
1.30      daniel   1289:            break;
                   1290:        default:
                   1291:            fprintf(stderr,"Unknown output conversion failed %d\n", ret);
                   1292:     }
                   1293: #endif
                   1294:     return(ret);
                   1295: }
                   1296: 
                   1297: /**
                   1298:  * xmlCharEncCloseFunc:
                   1299:  * @handler:   char enconding transformation data structure
                   1300:  *     
                   1301:  * Generic front-end for hencoding handler close function
                   1302:  *
                   1303:  * Returns 0 if success, or -1 in case of error
                   1304:  */
                   1305: int
                   1306: xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
                   1307:     int ret = 0;
1.31      daniel   1308:     if (handler == NULL) return(-1);
                   1309:     if (handler->name == NULL) return(-1);
1.30      daniel   1310: #ifdef LIBXML_ICONV_ENABLED
1.31      daniel   1311:     /*
                   1312:      * Iconv handlers can be oused only once, free the whole block.
                   1313:      * and the associated icon resources.
                   1314:      */
1.32      daniel   1315:     if ((handler->iconv_out != NULL) || (handler->iconv_in != NULL)) {
                   1316:        if (handler->name != NULL)
                   1317:            xmlFree(handler->name);
                   1318:        handler->name = NULL;
                   1319:        if (handler->iconv_out != NULL) {
                   1320:            if (iconv_close(handler->iconv_out))
                   1321:                ret = -1;
                   1322:            handler->iconv_out = NULL;
                   1323:        }
                   1324:        if (handler->iconv_in != NULL) {
                   1325:            if (iconv_close(handler->iconv_in))
                   1326:                ret = -1;
                   1327:            handler->iconv_in = NULL;
                   1328:        }
                   1329:        xmlFree(handler);
1.30      daniel   1330:     }
                   1331: #endif /* LIBXML_ICONV_ENABLED */
                   1332: #ifdef DEBUG_ENCODING
                   1333:     if (ret)
                   1334:         fprintf(stderr, "failed to close the encoding handler\n");
                   1335:     else
                   1336:         fprintf(stderr, "closed the encoding handler\n");
                   1337: 
                   1338: #endif
                   1339:     return(ret);
1.9       daniel   1340: }
                   1341: 

Webmaster