Annotation of XML/encoding.c, revision 1.26

1.1       daniel      1: /*
                      2:  * encoding.c : implements the encoding conversion functions needed for XML
                      3:  *
                      4:  * Related specs: 
                      5:  * rfc2044        (UTF-8 and UTF-16) F. Yergeau Alis Technologies
                      6:  * [ISO-10646]    UTF-8 and UTF-16 in Annexes
                      7:  * [ISO-8859-1]   ISO Latin-1 characters codes.
                      8:  * [UNICODE]      The Unicode Consortium, "The Unicode Standard --
                      9:  *                Worldwide Character Encoding -- Version 1.0", Addison-
                     10:  *                Wesley, Volume 1, 1991, Volume 2, 1992.  UTF-8 is
                     11:  *                described in Unicode Technical Report #4.
                     12:  * [US-ASCII]     Coded Character Set--7-bit American Standard Code for
                     13:  *                Information Interchange, ANSI X3.4-1986.
                     14:  *
1.9       daniel     15:  * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
1.1       daniel     16:  *
                     17:  * See Copyright for the status of this software.
                     18:  *
                     19:  * Daniel.Veillard@w3.org
                     20:  */
                     21: 
1.21      daniel     22: #ifdef WIN32
                     23: #include "win32config.h"
                     24: #else
1.14      daniel     25: #include "config.h"
1.17      daniel     26: #endif
                     27: 
                     28: #include <stdio.h>
                     29: #include <string.h>
                     30: 
                     31: #ifdef HAVE_CTYPE_H
1.7       daniel     32: #include <ctype.h>
1.17      daniel     33: #endif
1.20      daniel     34: #ifdef HAVE_STDLIB_H
                     35: #include <stdlib.h>
                     36: #endif
1.1       daniel     37: #include "encoding.h"
1.16      daniel     38: #include "xmlmemory.h"
1.3       daniel     39: 
1.25      daniel     40: xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL; /* assigned the "UTF-16LE" handler by xmlInitCharEncodingHandlers(); returned by xmlGetCharEncodingHandler() for XML_CHAR_ENCODING_UTF16LE */
                     41: xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL; /* returned by xmlGetCharEncodingHandler() for XML_CHAR_ENCODING_UTF16BE; no handler is registered for it in the code visible here, so it remains NULL */
                     42: 
1.3       daniel     43: /*
                     44:  * From rfc2044: encoding of the Unicode values on UTF-8:
                     45:  *
                     46:  * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
                     47:  * 0000 0000-0000 007F   0xxxxxxx
                     48:  * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
                     49:  * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx 
                     50:  *
                     51:  * I hope we won't use values > 0xFFFF anytime soon !
                     52:  */
1.1       daniel     53: 
                     54: /**
1.22      daniel     55:  * xmlCheckUTF8: Check utf-8 string for legality.
                     56:  * @utf: Pointer to putative utf-8 encoded string.
                     57:  *
                     58:  * Checks @utf for being valid utf-8. @utf is assumed to be
                     59:  * null-terminated. This function is not super-strict, as it will
                     60:  * allow longer utf-8 sequences than necessary. Note that Java is
                     61:  * capable of producing these sequences if provoked. Also note, this
                     62:  * routine checks for the 4-byte maxiumum size, but does not check for
                     63:  * 0x10ffff maximum value.
                     64:  *
                     65:  * Return value: true if @utf is valid.
                     66:  **/
                     67: int
                     68: xmlCheckUTF8(const unsigned char *utf)
                     69: {
                     70:     int ix;
                     71:     unsigned char c;
                     72: 
                     73:     for (ix = 0; (c = utf[ix]);) {
                     74:         if (c & 0x80) {
                     75:            if ((utf[ix + 1] & 0xc0) != 0x80)
                     76:                return(0);
                     77:            if ((c & 0xe0) == 0xe0) {
                     78:                if ((utf[ix + 2] & 0xc0) != 0x80)
                     79:                    return(0);
                     80:                if ((c & 0xf0) == 0xf0) {
                     81:                    if ((c & 0xf8) != 0xf0 || (utf[ix + 3] & 0xc0) != 0x80)
                     82:                        return(0);
                     83:                    ix += 4;
                     84:                    /* 4-byte code */
                     85:                } else
                     86:                  /* 3-byte code */
                     87:                    ix += 3;
                     88:            } else
                     89:              /* 2-byte code */
                     90:                ix += 2;
                     91:        } else
                     92:            /* 1-byte code */
                     93:            ix++;
                     94:       }
                     95:       return(1);
                     96: }
                     97: 
                     98: /**
1.1       daniel     99:  * isolat1ToUTF8:
1.18      daniel    100:  * @out:  a pointer to an array of bytes to store the result
                    101:  * @outlen:  the length of @out
                    102:  * @in:  a pointer to an array of ISO Latin 1 chars
                    103:  * @inlen:  the length of @in
1.1       daniel    104:  *
                    105:  * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
                    106:  * block of chars out.
1.6       daniel    107:  * Returns the number of byte written, or -1 by lack of space.
1.1       daniel    108:  */
                    109: int
1.25      daniel    110: isolat1ToUTF8(unsigned char* out, int outlen,
                    111:               const unsigned char* in, int *inlen) {
1.1       daniel    112:     unsigned char* outstart= out;
                    113:     unsigned char* outend= out+outlen;
1.25      daniel    114:     const unsigned char* inend= in+*inlen;
1.1       daniel    115:     unsigned char c;
                    116: 
                    117:     while (in < inend) {
                    118:         c= *in++;
                    119:         if (c < 0x80) {
                    120:             if (out >= outend)  return -1;
                    121:             *out++ = c;
                    122:         }
                    123:         else {
                    124:             if (out >= outend)  return -1;
                    125:             *out++ = 0xC0 | (c >> 6);
                    126:             if (out >= outend)  return -1;
                    127:             *out++ = 0x80 | (0x3F & c);
                    128:         }
                    129:     }
                    130:     return out-outstart;
                    131: }
                    132: 
                    133: /**
                    134:  * UTF8Toisolat1:
1.18      daniel    135:  * @out:  a pointer to an array of bytes to store the result
                    136:  * @outlen:  the length of @out
                    137:  * @in:  a pointer to an array of UTF-8 chars
                    138:  * @inlen:  the length of @in
1.1       daniel    139:  *
                    140:  * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
                    141:  * block of chars out.
1.15      daniel    142:  * TODO: UTF8Toisolat1 need a fallback mechanism ...
                    143:  *
1.6       daniel    144:  * Returns the number of byte written, or -1 by lack of space, or -2
1.23      daniel    145:  *     if the transcoding faile (for *in is not valid utf8 string or
                    146:  *     the result of transformation can't fit into the encoding we want)
1.1       daniel    147:  */
                    148: int
1.25      daniel    149: UTF8Toisolat1(unsigned char* out, int outlen,
                    150:               const unsigned char* in, int *inlen) {
1.1       daniel    151:     unsigned char* outstart= out;
                    152:     unsigned char* outend= out+outlen;
1.25      daniel    153:     const unsigned char* inend= in+*inlen;
1.1       daniel    154:     unsigned char c;
                    155: 
                    156:     while (in < inend) {
                    157:         c= *in++;
                    158:         if (c < 0x80) {
                    159:             if (out >= outend)  return -1;
                    160:             *out++= c;
                    161:         }
1.23      daniel    162:        else if (in == inend) {
                    163:             *inlen -= 1;
                    164:             break;
                    165:        }
                    166:        else if (((c & 0xFC) == 0xC0) && ((*in & 0xC0) == 0x80)) {
                    167:            /* a two byte utf-8 and can be encoding as isolate1 */
1.1       daniel    168:             *out++= ((c & 0x03) << 6) | (*in++ & 0x3F);
1.23      daniel    169:        }
                    170:        else return -2;
                    171:        /* TODO : some should be represent as "&#x____;" */
1.1       daniel    172:     }
                    173:     return out-outstart;
                    174: }
                    175: 
                    176: /**
                    177:  * UTF16ToUTF8:
1.18      daniel    178:  * @out:  a pointer to an array of bytes to store the result
                    179:  * @outlen:  the length of @out
1.25      daniel    180:  * @inb:  a pointer to an array of UTF-16 passwd as a byte array
                    181:  * @inlenb:  the length of @in in UTF-16 chars
1.1       daniel    182:  *
                    183:  * Take a block of UTF-16 ushorts in and try to convert it to an UTF-8
                    184:  * block of chars out.
1.25      daniel    185:  *
1.6       daniel    186:  * Returns the number of byte written, or -1 by lack of space.
1.1       daniel    187:  */
                    188: int
1.25      daniel    189: UTF16ToUTF8(unsigned char* out, int outlen,
                    190:             const unsigned char* inb, int *inlenb)
1.1       daniel    191: {
                    192:     unsigned char* outstart= out;
                    193:     unsigned char* outend= out+outlen;
1.25      daniel    194:     unsigned short* in = (unsigned short*) inb;
                    195:     unsigned short* inend;
                    196:     unsigned int c, d, inlen;
1.1       daniel    197:     int bits;
                    198: 
1.25      daniel    199:     inlen = *inlenb / 2;
                    200:     inend= in + inlen;
1.1       daniel    201:     while (in < inend) {
                    202:         c= *in++;
                    203:         if ((c & 0xFC00) == 0xD800) {    /* surrogates */
                    204:             if ((in<inend) && (((d=*in++) & 0xFC00) == 0xDC00)) {
                    205:                 c &= 0x03FF;
                    206:                 c <<= 10;
                    207:                 c |= d & 0x03FF;
                    208:                 c += 0x10000;
                    209:             }
                    210:             else  return -1;
                    211:         }
                    212: 
1.25      daniel    213:        /* assertion: c is a single UTF-4 value */
1.1       daniel    214:         if (out >= outend)  return -1;
                    215:         if      (c <    0x80) {  *out++=  c;                bits= -6; }
1.26    ! daniel    216:         else if (c <   0x800) {  *out++= ((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
        !           217:         else if (c < 0x10000) {  *out++= ((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
        !           218:         else                  {  *out++= ((c >> 18) & 0x07) | 0xF0;  bits= 12; }
1.1       daniel    219:  
1.26    ! daniel    220:         for ( ; bits >= 0; bits-= 6) {
1.1       daniel    221:             if (out >= outend)  return -1;
1.26    ! daniel    222:             *out++= ((c >> bits) & 0x3F) | 0x80;
1.1       daniel    223:         }
                    224:     }
                    225:     return out-outstart;
                    226: }
                    227: 
                    228: /**
                    229:  * UTF8ToUTF16:
1.25      daniel    230:  * @outb:  a pointer to an array of bytes to store the result
                    231:  * @outlen:  the length of @outb
1.18      daniel    232:  * @in:  a pointer to an array of UTF-8 chars
                    233:  * @inlen:  the length of @in
1.1       daniel    234:  *
                    235:  * Take a block of UTF-8 chars in and try to convert it to an UTF-16
                    236:  * block of chars out.
1.15      daniel    237:  * TODO: UTF8ToUTF16 need a fallback mechanism ...
                    238:  *
1.6       daniel    239:  * Returns the number of byte written, or -1 by lack of space, or -2
1.25      daniel    240:  *     if the transcoding failed. 
1.1       daniel    241:  */
                    242: int
1.25      daniel    243: UTF8ToUTF16(unsigned char* outb, int outlen,
                    244:             const unsigned char* in, int *inlen)
1.1       daniel    245: {
1.25      daniel    246:     unsigned short* out = (unsigned short*) outb;
1.1       daniel    247:     unsigned short* outstart= out;
                    248:     unsigned short* outend= out+outlen;
1.25      daniel    249:     const unsigned char* inend= in+*inlen;
1.1       daniel    250:     unsigned int c, d, trailing;
                    251: 
1.25      daniel    252:     outlen /= 2; /* convert in short length */
1.1       daniel    253:     while (in < inend) {
                    254:       d= *in++;
                    255:       if      (d < 0x80)  { c= d; trailing= 0; }
                    256:       else if (d < 0xC0)  return -2;    /* trailing byte in leading position */
                    257:       else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
                    258:       else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
                    259:       else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
                    260:       else return -2;    /* no chance for this in UTF-16 */
                    261: 
                    262:       for ( ; trailing; trailing--) {
                    263:           if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))  return -1;
                    264:           c <<= 6;
                    265:           c |= d & 0x3F;
                    266:       }
                    267: 
                    268:       /* assertion: c is a single UTF-4 value */
                    269:         if (c < 0x10000) {
                    270:             if (out >= outend)  return -1;
                    271:             *out++ = c;
                    272:         }
                    273:         else if (c < 0x110000) {
                    274:             if (out+1 >= outend)  return -1;
                    275:             c -= 0x10000;
                    276:             *out++ = 0xD800 | (c >> 10);
                    277:             *out++ = 0xDC00 | (c & 0x03FF);
                    278:         }
                    279:         else  return -1;
                    280:     }
                    281:     return out-outstart;
                    282: }
                    283: 
1.7       daniel    284: /**
                    285:  * xmlDetectCharEncoding:
                    286:  * @in:  a pointer to the first bytes of the XML entity, must be at least
                    287:  *       4 bytes long.
1.25      daniel    288:  * @len:  pointer to the length of the buffer
1.7       daniel    289:  *
                    290:  * Guess the encoding of the entity using the first bytes of the entity content
                    291:  * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
                    292:  * 
                    293:  * Returns one of the XML_CHAR_ENCODING_... values.
                    294:  */
                    295: xmlCharEncoding
1.25      daniel    296: xmlDetectCharEncoding(const unsigned char* in, int len)
1.7       daniel    297: {
1.25      daniel    298:     if (len >= 4) {
                    299:        if ((in[0] == 0x00) && (in[1] == 0x00) &&
                    300:            (in[2] == 0x00) && (in[3] == 0x3C))
                    301:            return(XML_CHAR_ENCODING_UCS4BE);
                    302:        if ((in[0] == 0x3C) && (in[1] == 0x00) &&
                    303:            (in[2] == 0x00) && (in[3] == 0x00))
                    304:            return(XML_CHAR_ENCODING_UCS4LE);
                    305:        if ((in[0] == 0x00) && (in[1] == 0x00) &&
                    306:            (in[2] == 0x3C) && (in[3] == 0x00))
                    307:            return(XML_CHAR_ENCODING_UCS4_2143);
                    308:        if ((in[0] == 0x00) && (in[1] == 0x3C) &&
                    309:            (in[2] == 0x00) && (in[3] == 0x00))
                    310:            return(XML_CHAR_ENCODING_UCS4_3412);
                    311:        if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
                    312:            (in[2] == 0xA7) && (in[3] == 0x94))
                    313:            return(XML_CHAR_ENCODING_EBCDIC);
                    314:        if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
                    315:            (in[2] == 0x78) && (in[3] == 0x6D))
                    316:            return(XML_CHAR_ENCODING_UTF8);
                    317:     }
                    318:     if (len >= 2) {
                    319:        if ((in[0] == 0xFE) && (in[1] == 0xFF))
                    320:            return(XML_CHAR_ENCODING_UTF16BE);
                    321:        if ((in[0] == 0xFF) && (in[1] == 0xFE))
                    322:            return(XML_CHAR_ENCODING_UTF16LE);
                    323:     }
1.7       daniel    324:     return(XML_CHAR_ENCODING_NONE);
                    325: }
                    326: 
                    327: /**
                    328:  * xmlParseCharEncoding:
1.18      daniel    329:  * @name:  the encoding name as parsed, in UTF-8 format (ASCII actually)
1.7       daniel    330:  *
                    331:  * Conpare the string to the known encoding schemes already known. Note
                    332:  * that the comparison is case insensitive accordingly to the section
                    333:  * [XML] 4.3.3 Character Encoding in Entities.
                    334:  * 
                    335:  * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
                    336:  * if not recognized.
                    337:  */
                    338: xmlCharEncoding
1.8       daniel    339: xmlParseCharEncoding(const char* name)
1.7       daniel    340: {
                    341:     char upper[500];
                    342:     int i;
                    343: 
                    344:     for (i = 0;i < 499;i++) {
                    345:         upper[i] = toupper(name[i]);
                    346:        if (upper[i] == 0) break;
                    347:     }
                    348:     upper[i] = 0;
                    349: 
                    350:     if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
                    351:     if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
                    352:     if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
                    353: 
                    354:     /*
                    355:      * NOTE: if we were able to parse this, the endianness of UTF16 is
                    356:      *       already found and in use
                    357:      */
                    358:     if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
                    359:     if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
                    360:     
                    361:     if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
                    362:     if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
                    363:     if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
                    364: 
                    365:     /*
                    366:      * NOTE: if we were able to parse this, the endianness of UCS4 is
                    367:      *       already found and in use
                    368:      */
                    369:     if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
                    370:     if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
                    371:     if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
                    372: 
                    373:     
                    374:     if (!strcmp(upper,  "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
                    375:     if (!strcmp(upper,  "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
                    376:     if (!strcmp(upper,  "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
                    377: 
                    378:     if (!strcmp(upper,  "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
                    379:     if (!strcmp(upper,  "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
                    380:     if (!strcmp(upper,  "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
                    381: 
                    382:     if (!strcmp(upper,  "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
                    383:     if (!strcmp(upper,  "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
                    384:     if (!strcmp(upper,  "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
                    385:     if (!strcmp(upper,  "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
                    386:     if (!strcmp(upper,  "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
                    387:     if (!strcmp(upper,  "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
                    388:     if (!strcmp(upper,  "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
                    389: 
                    390:     if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
                    391:     if (!strcmp(upper, "Shift_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
                    392:     if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
                    393:     return(XML_CHAR_ENCODING_ERROR);
                    394: }
1.9       daniel    395: 
                    396: /****************************************************************
                    397:  *                                                             *
                    398:  *             Char encoding handlers                          *
                    399:  *                                                             *
                    400:  ****************************************************************/
                    401: 
                    402: /* the size should be growable, but it's not a big deal ... */
                    403: #define MAX_ENCODING_HANDLERS 50 /* number of entries allocated for handlers[]; registration is refused once it is reached */
                    404: static xmlCharEncodingHandlerPtr *handlers = NULL; /* table of registered handlers, allocated by xmlInitCharEncodingHandlers() and NULL until then */
                    405: static int nbCharEncodingHandler = 0; /* number of entries of handlers[] currently in use */
                    406: 
                    407: /*
                    408:  * The default is UTF-8 for XML, that's also the default used for the
                    409:  * parser internals, so the default encoding handler is NULL
                    410:  */
                    411: 
                    412: static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL; /* what xmlFindCharEncodingHandler() returns for a NULL or empty name */
                    413: 
                    414: /**
                    415:  * xmlNewCharEncodingHandler:
1.18      daniel    416:  * @name:  the encoding name, in UTF-8 format (ASCII actually)
1.9       daniel    417:  * @input:  the xmlCharEncodingInputFunc to read that encoding
                    418:  * @output:  the xmlCharEncodingOutputFunc to write that encoding
                    419:  *
                    420:  * Create and registers an xmlCharEncodingHandler.
                    421:  * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
                    422:  */
                    423: xmlCharEncodingHandlerPtr
1.25      daniel    424: xmlNewCharEncodingHandler(const char *name, 
                    425:                           xmlCharEncodingInputFunc input,
1.9       daniel    426:                           xmlCharEncodingOutputFunc output) {
                    427:     xmlCharEncodingHandlerPtr handler;
                    428:     char upper[500];
                    429:     int i;
                    430:     char *up = 0;
                    431: 
                    432:     /*
                    433:      * Keep only the uppercase version of the encoding.
                    434:      */
                    435:     if (name == NULL) {
                    436:         fprintf(stderr, "xmlNewCharEncodingHandler : no name !\n");
                    437:        return(NULL);
                    438:     }
                    439:     for (i = 0;i < 499;i++) {
                    440:         upper[i] = toupper(name[i]);
                    441:        if (upper[i] == 0) break;
                    442:     }
                    443:     upper[i] = 0;
1.16      daniel    444:     up = xmlMemStrdup(upper);
1.9       daniel    445:     if (up == NULL) {
                    446:         fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
                    447:        return(NULL);
                    448:     }
                    449: 
                    450:     /*
                    451:      * allocate and fill-up an handler block.
                    452:      */
                    453:     handler = (xmlCharEncodingHandlerPtr)
1.16      daniel    454:               xmlMalloc(sizeof(xmlCharEncodingHandler));
1.9       daniel    455:     if (handler == NULL) {
                    456:         fprintf(stderr, "xmlNewCharEncodingHandler : out of memory !\n");
                    457:        return(NULL);
                    458:     }
                    459:     handler->input = input;
                    460:     handler->output = output;
                    461:     handler->name = up;
                    462: 
                    463:     /*
                    464:      * registers and returns the handler.
                    465:      */
                    466:     xmlRegisterCharEncodingHandler(handler);
                    467:     return(handler);
                    468: }
                    469: 
                    470: /**
                    471:  * xmlInitCharEncodingHandlers:
                    472:  *
                    473:  * Initialize the char encoding support, it registers the default
                    474:  * encoding supported.
1.18      daniel    475:  * NOTE: while public, this function usually doesn't need to be called
1.9       daniel    476:  *       in normal processing.
                    477:  */
                    478: void
                    479: xmlInitCharEncodingHandlers(void) {
                    480:     if (handlers != NULL) return;
                    481: 
                    482:     handlers = (xmlCharEncodingHandlerPtr *)
1.16      daniel    483:         xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
1.9       daniel    484: 
                    485:     if (handlers == NULL) {
                    486:         fprintf(stderr, "xmlInitCharEncodingHandlers : out of memory !\n");
                    487:        return;
                    488:     }
1.10      daniel    489:     xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
1.25      daniel    490:     xmlUTF16LEHandler = 
                    491:           xmlNewCharEncodingHandler("UTF-16LE", UTF16ToUTF8, UTF8ToUTF16);
1.10      daniel    492:     xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
1.9       daniel    493: }
                    494: 
                    495: /**
1.19      daniel    496:  * xmlCleanupCharEncodingHandlers:
                    497:  *
                    498:  * Cleanup the memory allocated for the char encoding support, it
                    499:  * unregisters all the encoding handlers.
                    500:  */
                    501: void
                    502: xmlCleanupCharEncodingHandlers(void) {
                    503:     if (handlers == NULL) return;
                    504: 
                    505:     for (;nbCharEncodingHandler > 0;) {
                    506:         nbCharEncodingHandler--;
                    507:        if (handlers[nbCharEncodingHandler] != NULL) {
                    508:            xmlFree(handlers[nbCharEncodingHandler]->name);
                    509:            xmlFree(handlers[nbCharEncodingHandler]);
                    510:        }
                    511:     }
                    512:     xmlFree(handlers);
                    513:     handlers = NULL;
                    514:     nbCharEncodingHandler = 0;
                    515:     xmlDefaultCharEncodingHandler = NULL;
                    516: }
                    517: 
                    518: /**
1.9       daniel    519:  * xmlRegisterCharEncodingHandler:
                    520:  * @handler:  the xmlCharEncodingHandlerPtr handler block
                    521:  *
                    522:  * Register the char encoding handler, surprizing, isn't it ?
                    523:  */
                    524: void
                    525: xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
                    526:     if (handlers == NULL) xmlInitCharEncodingHandlers();
                    527:     if (handler == NULL) {
                    528:         fprintf(stderr, "xmlRegisterCharEncodingHandler: NULL handler !\n");
                    529:        return;
                    530:     }
                    531: 
                    532:     if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
                    533:         fprintf(stderr, 
                    534:        "xmlRegisterCharEncodingHandler: Too many handler registered\n");
                    535:         fprintf(stderr, "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
                    536:        return;
                    537:     }
                    538:     handlers[nbCharEncodingHandler++] = handler;
                    539: }
                    540: 
                    541: /**
                    542:  * xmlGetCharEncodingHandler:
                    543:  * @enc:  an xmlCharEncoding value.
                    544:  *
                    545:  * Search in the registrered set the handler able to read/write that encoding.
                    546:  *
                    547:  * Returns the handler or NULL if not found
                    548:  */
                    549: xmlCharEncodingHandlerPtr
                    550: xmlGetCharEncodingHandler(xmlCharEncoding enc) {
                    551:     if (handlers == NULL) xmlInitCharEncodingHandlers();
1.25      daniel    552:     switch (enc) {
                    553:         case XML_CHAR_ENCODING_ERROR:
                    554:            return(NULL);
                    555:         case XML_CHAR_ENCODING_NONE:
                    556:            return(NULL);
                    557:         case XML_CHAR_ENCODING_UTF8:
                    558:            return(NULL);
                    559:         case XML_CHAR_ENCODING_UTF16LE:
                    560:            return(xmlUTF16LEHandler);
                    561:         case XML_CHAR_ENCODING_UTF16BE:
                    562:            return(xmlUTF16BEHandler);
                    563:         case XML_CHAR_ENCODING_EBCDIC:
                    564:            return(NULL);
                    565:         case XML_CHAR_ENCODING_UCS4LE:
                    566:            return(NULL);
                    567:         case XML_CHAR_ENCODING_UCS4BE:
                    568:            return(NULL);
                    569:         case XML_CHAR_ENCODING_UCS4_2143:
                    570:            return(NULL);
                    571:         case XML_CHAR_ENCODING_UCS4_3412:
                    572:            return(NULL);
                    573:         case XML_CHAR_ENCODING_UCS2:
                    574:            return(NULL);
                    575:         case XML_CHAR_ENCODING_8859_1:
                    576:            return(NULL);
                    577:         case XML_CHAR_ENCODING_8859_2:
                    578:            return(NULL);
                    579:         case XML_CHAR_ENCODING_8859_3:
                    580:            return(NULL);
                    581:         case XML_CHAR_ENCODING_8859_4:
                    582:            return(NULL);
                    583:         case XML_CHAR_ENCODING_8859_5:
                    584:            return(NULL);
                    585:         case XML_CHAR_ENCODING_8859_6:
                    586:            return(NULL);
                    587:         case XML_CHAR_ENCODING_8859_7:
                    588:            return(NULL);
                    589:         case XML_CHAR_ENCODING_8859_8:
                    590:            return(NULL);
                    591:         case XML_CHAR_ENCODING_8859_9:
                    592:            return(NULL);
                    593:         case XML_CHAR_ENCODING_2022_JP:
                    594:         case XML_CHAR_ENCODING_SHIFT_JIS:
                    595:         case XML_CHAR_ENCODING_EUC_JP:
                    596:            return(NULL);
                    597:     }
1.9       daniel    598:     return(NULL);
                    599: }
                    600: 
                    601: /**
                    602:  * xmlGetCharEncodingHandler:
                    603:  * @enc:  a string describing the char encoding.
                    604:  *
                    605:  * Search in the registrered set the handler able to read/write that encoding.
                    606:  *
                    607:  * Returns the handler or NULL if not found
                    608:  */
                    609: xmlCharEncodingHandlerPtr
                    610: xmlFindCharEncodingHandler(const char *name) {
                    611:     char upper[500];
                    612:     int i;
                    613: 
                    614:     if (handlers == NULL) xmlInitCharEncodingHandlers();
                    615:     if (name == NULL) return(xmlDefaultCharEncodingHandler);
                    616:     if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
                    617: 
                    618:     for (i = 0;i < 499;i++) {
                    619:         upper[i] = toupper(name[i]);
                    620:        if (upper[i] == 0) break;
                    621:     }
                    622:     upper[i] = 0;
                    623: 
                    624:     for (i = 0;i < nbCharEncodingHandler; i++)
                    625:         if (!strcmp(name, handlers[i]->name))
                    626:            return(handlers[i]);
                    627: 
                    628:     return(NULL);
                    629: }
                    630: 

Webmaster