XML/parser.c - annotate

Return to parser.c CVS log
Up to [Public] / XML
Annotation of XML/parser.c, revision 1.1

1.1     ! veillard    1: /*
        !             2:  * parser.c : a non-validating XML 1.0 parser
        !             3:  */
        !             4: 
        !             5: #include <stdio.h>
        !             6: #include <ctype.h>
        !             7: #include <string.h>
        !             8: #include <malloc.h>
        !             9: 
        !            10: #include "parser.h"
        !            11: #include "tree.h"
        !            12: 
        !            13: /*
        !            14:  * A few macros needed to help building the parser.
        !            15:  */
        !            16: 
        !            17: #ifdef UNICODE
        !            18: /*
        !            19:  * UNICODE version of the macros. Incomplete now !
        !            20:  */
        !            21: #define IS_CHAR(c)                                                     \
        !            22:     (((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d) ||                        \
        !            23:      (((c) >= 0x20) && ((c) != 0xFFFE) && ((c) != 0xFFFF)))
        !            24: 
        !            25: #define SKIP_BLANKS(p)                                                         \
        !            26:     while ((*(p) == 0x20) || (*(p) == 0x09) || (*(p) == 0xa) ||                \
        !            27:            (*(p) == 0x3000)) (p)++;
        !            28: 
        !            29: /* I'm too lazy to complete this one ! */
        !            30: #define IS_BASECHAR(c)                                                 \
        !            31:     ((((c) >= 0x41) && ((c) <= 0x5a)) ||                               \               
        !            32:      (((c) >= 0x61) && ((c) <= 0x7a)) ||                               \
        !            33:      (((c) >= 0xaa) && ((c) <= 0x5b)) ||                               \
        !            34:      (((c) >= 0xc0) && ((c) <= 0xd6)) ||                               \
        !            35:      (((c) >= 0xd8) && ((c) <= 0xf6)) ||                               \
        !            36:      (((c) >= 0xf8) && ((c) <= 0xff)) ||                               \
        !            37:       ((c) == 0xba))
        !            38: 
        !            39: /* I'm too lazy to complete this one ! */
        !            40: #define IS_DIGIT(c) (((c) >= 0x30) && ((c) <= 0x39))
        !            41: 
        !            42: /* I'm too lazy to complete this one ! */
        !            43: #define IS_COMBINING(c) 0
        !            44: 
        !            45: #define IS_IDEOGRAPHIC(c)                                              \
        !            46:     ((((c) >= 0x4e00) && ((c) <= 0x9fa5)) ||                           \
        !            47:      (((c) >= 0xf900) && ((c) <= 0xfa2d)) ||                           \
        !            48:      (((c) >= 0x3021) && ((c) <= 0x3029)) ||                           \
        !            49:       ((c) == 0x3007))
        !            50: 
        !            51: #define IS_LETTER(c) (IS_BASECHAR(c) || IS_IDEOGRAPHIC(c))
        !            52: 
        !            53: /* I'm too lazy to complete this one ! */
        !            54: #define IS_BLANK(c) (((c) == 0x20) || ((c) == 0x09) || ((c) == 0xa))
        !            55: #else
        !            56: /*
        !            57:  * ASCII version of the macros.
        !            58:  */
        !            59: #define IS_CHAR(c)                                                     \
        !            60:     (((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d) || ((c) >= 0x20))
        !            61: 
        !            62: #define IS_BASECHAR(c)                                                 \
        !            63:     ((((c) >= 0x41) && ((c) <= 0x5a)) ||                               \
        !            64:      (((c) >= 0x61) && ((c) <= 0x7a)) ||                               \
        !            65:      (((c) >= 0xaa) && ((c) <= 0x5b)) ||                               \
        !            66:      (((c) >= 0xc0) && ((c) <= 0xd6)) ||                               \
        !            67:      (((c) >= 0xd8) && ((c) <= 0xf6)) ||                               \
        !            68:      (((c) >= 0xf8) && ((c) <= 0xff)) ||                               \
        !            69:       ((c) == 0xba))
        !            70: 
        !            71: #define IS_DIGIT(c) (((c) >= 0x30) && ((c) <= 0x39))
        !            72: 
        !            73: #define IS_LETTER(c) IS_BASECHAR(c)
        !            74: 
        !            75: #define IS_COMBINING(c) 0
        !            76: 
        !            77: #define IS_BLANK(c) (((c) == 0x20) || ((c) == 0x09) || ((c) == 0xa))
        !            78: #endif
        !            79: 
        !            80: 
        !            81: #define SKIP_EOL(p)                                                    \
        !            82:     if (*(p) == 0x13) { p++ ; if (*(p) == 0x10) p++; }                 \
        !            83:     if (*(p) == 0x10) { p++ ; if (*(p) == 0x13) p++; }
        !            84: 
        !            85: #define SKIP_BLANKS(p)                                                         \
        !            86:     while (IS_BLANK(*(p))) (p)++;
        !            87: 
        !            88: #define MOVETO_ENDTAG(p)                                               \
        !            89:     while (IS_CHAR(*p) && (*(p) != '>')) (p)++;
        !            90: 
        !            91: #define MOVETO_STARTTAG(p)                                             \
        !            92:     while (IS_CHAR(*p) && (*(p) != '<')) (p)++;
        !            93: 
        !            94: /*
        !            95:  * xmlStrndup : a strdup for array of CHAR's
        !            96:  */
        !            97: 
        !            98: CHAR *xmlStrndup(CHAR *cur, int len) {
        !            99:     CHAR *ret = malloc((len + 1) * sizeof(CHAR));
        !           100: 
        !           101:     if (ret == NULL) {
        !           102:         fprintf(stderr, "malloc of %d byte failed\n",
        !           103:                (len + 1) * sizeof(CHAR));
        !           104:         return(NULL);
        !           105:     }
        !           106:     memcpy(ret, cur, len * sizeof(CHAR));
        !           107:     ret[len] = 0;
        !           108:     return(ret);
        !           109: }
        !           110: 
        !           111: /*
        !           112:  * xmlStrdup : a strdup for CHAR's
        !           113:  */
        !           114: 
        !           115: CHAR *xmlStrdup(CHAR *cur) {
        !           116:     CHAR *p = cur;
        !           117: 
        !           118:     while (IS_CHAR(*p)) p++;
        !           119:     return(xmlStrndup(cur, p - cur));
        !           120: }
        !           121: 
        !           122: /*
        !           123:  * xmlParseName : parse an XML name.
        !           124:  */
        !           125: 
        !           126: CHAR *xmlParseName(CHAR *cur) {
        !           127:     if (!IS_LETTER(*cur) && (*cur != '_')) return(NULL);
        !           128: 
        !           129:     return(NULL); /* !!!! */
        !           130: }
        !           131: 
        !           132: /*
        !           133:  * Skip an XML (SGML) comment <!-- .... -->
        !           134:  */
        !           135: void xmlParserSkipComment(CHAR **p) {
        !           136:     CHAR *cur = *p, *q, *r, *start;
        !           137: 
        !           138:     /*
        !           139:      * An extra check may avoid errors and isn't that costly !
        !           140:      */
        !           141:     if ((cur[0] != '<') || (cur[1] != '!') ||
        !           142:         (cur[2] != '-') || (cur[3] != '-')) return;
        !           143: 
        !           144:     cur += 4;
        !           145:     start = q = cur;
        !           146:     cur++;
        !           147:     r = cur;
        !           148:     cur++;
        !           149:     while (IS_CHAR(*cur) &&
        !           150:            ((*cur != '>') || (*r != '-') || (*q != '-'))) {
        !           151:         cur++;r++;q++;
        !           152:     }
        !           153:     if (!IS_CHAR(*cur)) {
        !           154:         fprintf(stderr, "Comment not terminated <!--%s\n", start);
        !           155:        *p = start;
        !           156:     } else {
        !           157:         cur++;
        !           158:        *p = cur;
        !           159:     }
        !           160: }
        !           161: 
        !           162: /*
        !           163:  * Parse and return a string between quotes or doublequotes
        !           164:  */
        !           165: CHAR *xmlParseQuotedString(CHAR **p) {
        !           166:     CHAR *ret = NULL;
        !           167:     CHAR *cur = *p, *q;
        !           168: 
        !           169:     if (*cur == '"') {
        !           170:         cur++;
        !           171:        q = cur;
        !           172:        while (IS_CHAR(*cur) && (*cur != '"')) cur++;
        !           173:        if (*cur != '"')
        !           174:            fprintf(stderr, "String not closed \"%s\n", q);
        !           175:         else {
        !           176:             ret = xmlStrndup(q, cur - q);
        !           177:            cur++;
        !           178:        }
        !           179:     } else if (*cur == '\''){
        !           180:         cur++;
        !           181:        q = cur;
        !           182:        while (IS_CHAR(*cur) && (*cur != '\'')) cur++;
        !           183:        if (*cur != '\'')
        !           184:            fprintf(stderr, "String not closed '%s\n", q);
        !           185:         else {
        !           186:             ret = xmlStrndup(q, cur - q);
        !           187:            cur++;
        !           188:        }
        !           189:     }
        !           190:     *p = cur;
        !           191:     return(ret);
        !           192: }
        !           193: 
        !           194: /*
        !           195:  * xmlParseWebdavNamespace: parse Webdav specific '<?namespace ...' constructs.
        !           196:  */
        !           197: 
        !           198: void xmlParseWebdavNamespace(CHAR **p, xmlDocPtr doc) {
        !           199:     CHAR *cur = *p;
        !           200:     CHAR *href = NULL;
        !           201:     CHAR *AS = NULL;
        !           202: 
        !           203:     /*
        !           204:      * We know that 'namespace' is here.
        !           205:      */
        !           206:     cur += 9;
        !           207:     SKIP_BLANKS(cur);
        !           208: 
        !           209:     while (IS_CHAR(*cur) && (*cur != '>')) {
        !           210:        /*
        !           211:         * We can have 'href' or 'AS' attributes.
        !           212:         */
        !           213:        if ((cur[0] == 'h') && (cur[1] == 'r') && (cur[2] == 'e') && 
        !           214:            (cur[3] == 'f')) {
        !           215:            cur += 4;
        !           216:            SKIP_BLANKS(cur);
        !           217: 
        !           218:            if (*cur != '=') continue;
        !           219:            cur++;
        !           220:            SKIP_BLANKS(cur);
        !           221: 
        !           222:            href = xmlParseQuotedString(&cur);
        !           223:            SKIP_BLANKS(cur);
        !           224:        } else if ((cur[0] == 'A') && (cur[1] == 'S')) {
        !           225:            cur += 2;
        !           226:            SKIP_BLANKS(cur);
        !           227: 
        !           228:            if (*cur != '=') continue;
        !           229:            cur++;
        !           230:            SKIP_BLANKS(cur);
        !           231: 
        !           232:            AS = xmlParseQuotedString(&cur);
        !           233:            SKIP_BLANKS(cur);
        !           234:        } else if ((cur[0] == '?') && (cur[1] == '>')) {
        !           235:            cur ++;
        !           236:        } else {
        !           237:             /* Garbage ??? */
        !           238:             cur++;
        !           239:         }
        !           240:     }
        !           241: 
        !           242:     MOVETO_ENDTAG(cur);
        !           243:     cur++;
        !           244: 
        !           245:     /*
        !           246:      * Register the DTD.
        !           247:      */
        !           248:     if (href != NULL)
        !           249:         xmlNewDtd(doc, href, AS);
        !           250: 
        !           251:     *p = cur;
        !           252: }
        !           253: 
        !           254: /*
        !           255:  * xmlParseXMLDecl: parse an XML declaration header
        !           256:  */
        !           257: 
        !           258: xmlDocPtr xmlParseXMLDecl(CHAR **p) {
        !           259:     CHAR *cur = *p;
        !           260:     CHAR *version;
        !           261:     xmlDocPtr ret;
        !           262: 
        !           263:     /*
        !           264:      * We know that '<?XML' is here.
        !           265:      */
        !           266:     cur += 5;
        !           267: 
        !           268:     /*
        !           269:      * Parse the version info
        !           270:      */
        !           271:     SKIP_BLANKS(cur);
        !           272: 
        !           273:     /*
        !           274:      * We should have 'version=' here !
        !           275:      */
        !           276:     if ((cur[0] == 'v') && (cur[1] == 'e') && (cur[2] == 'r') && 
        !           277:         (cur[3] == 's') && (cur[4] == 'i') && (cur[5] == 'o') &&
        !           278:        (cur[6] == 'n') && (cur[7] == '=')) {
        !           279:        cur += 8;
        !           280:        version = xmlParseQuotedString(&cur);
        !           281:        if (version == NULL)
        !           282:            ret = xmlNewDoc(XML_DEFAULT_VERSION);
        !           283:        else {
        !           284:            ret = xmlNewDoc(version);
        !           285:        }
        !           286:     } else {
        !           287:         ret = xmlNewDoc(XML_DEFAULT_VERSION);
        !           288:     }
        !           289: 
        !           290:     /*
        !           291:      * We should check for encoding !!!!
        !           292:      */
        !           293: 
        !           294:     /*
        !           295:      * We should check for Required Markup Declaration !!!!
        !           296:      */
        !           297:     MOVETO_ENDTAG(cur);
        !           298:     cur++;
        !           299: 
        !           300:     *p = cur;
        !           301:     return(ret);
        !           302: }
        !           303: 
        !           304: /*
        !           305:  * xmlParseMisc: parse an XML Misc optionnal field.
        !           306:  * (Comment | PI | S)*
        !           307:  */
        !           308: 
        !           309: void xmlParseMisc(CHAR **p, xmlDocPtr ret) {
        !           310:     CHAR *cur = *p;
        !           311: 
        !           312:     while (((cur[0] == '<') && (cur[1] == '?')) ||
        !           313:            ((cur[0] == '<') && (cur[1] == '!') &&
        !           314:            (cur[2] == '-') && (cur[2] == '-')) ||
        !           315:            IS_BLANK(*cur)) {
        !           316:         if ((cur[0] == '<') && (cur[1] == '?')) {
        !           317:            /*
        !           318:             * this is a Processing Instruction.
        !           319:             */
        !           320:            cur += 2;
        !           321: 
        !           322:            /*
        !           323:             * Special for WebDav, support for the Processing Instruction
        !           324:             * '<?namespace ...' contruct in the header of the XML document.
        !           325:             */
        !           326:            if ((cur[0] == 'n') && (cur[1] == 'a') &&
        !           327:                (cur[2] == 'm') && (cur[3] == 'e') &&
        !           328:                (cur[4] == 's') && (cur[5] == 'p') &&
        !           329:                (cur[6] == 'a') && (cur[7] == 'c') &&
        !           330:                (cur[8] == 'e')) {
        !           331:                xmlParseWebdavNamespace(&cur, ret);
        !           332:            } else {
        !           333:                 /* Unknown PI, ignore it ! */
        !           334:                MOVETO_ENDTAG(cur);
        !           335:                cur++;
        !           336:            }
        !           337:        } else if (IS_BLANK(*cur)) {
        !           338:            cur++;
        !           339:        } else
        !           340:            xmlParserSkipComment(&cur);
        !           341:     }
        !           342: 
        !           343:     *p = cur;
        !           344: }
        !           345: 
        !           346: /*
        !           347:  * xmlParseDoc : parse an XML document and build a tree.
        !           348:  */
        !           349: 
        !           350: xmlDocPtr xmlParseDoc(CHAR *cur) {
        !           351:     xmlDocPtr ret;
        !           352: 
        !           353:     /*
        !           354:      * Wipe out everything which is before the first '<'
        !           355:      */
        !           356:     SKIP_BLANKS(cur);
        !           357: 
        !           358:     /*
        !           359:      * Check for the XMLDecl in the Prolog.
        !           360:      */
        !           361:     if ((cur[0] == '<') && (cur[1] == '?') &&
        !           362:         (cur[2] == 'X') && (cur[3] == 'M') &&
        !           363:        (cur[4] == 'L')) {
        !           364:        ret = xmlParseXMLDecl(&cur);
        !           365:        /* SKIP_EOL(cur); */
        !           366:        SKIP_BLANKS(cur);
        !           367:     } else {
        !           368:         ret = xmlNewDoc(XML_DEFAULT_VERSION);
        !           369:     }
        !           370: 
        !           371:     /*
        !           372:      * The Misc part of the Prolog
        !           373:      * (Comment | PI | S) *
        !           374:      */
        !           375:     xmlParseMisc(&cur, ret);
        !           376: 
        !           377:     /*
        !           378:      * Parse the Element
        !           379:      */
        !           380: 
        !           381:     return(ret);
        !           382: }
        !           383: 
        !           384: /************************************************************************
        !           385:  *                                                                     *
        !           386:  *                             Debug                                   *
        !           387:  *                                                                     *
        !           388:  ************************************************************************/
        !           389: 
        !           390: #ifdef DEBUG
        !           391: CHAR buffer[] = 
        !           392: "\n\
        !           393: <?XML version=\"1.0\">\n\
        !           394: <?namespace href = \"http://www.ietf.org/standards/dav/\" AS = \"D\"?>\n\
        !           395: <?namespace href = \"http://www.w3.com/standards/z39.50/\" AS = \"Z\"?>\n\
        !           396: <D:propertyupdate>\n\
        !           397: <D:set>\n\
        !           398:        <D:prop>\n\
        !           399:             <Z:authors>\n\
        !           400:                  <Z:Author>Jim Whitehead</Z:Author>\n\
        !           401:                  <Z:Author>Roy Fielding</Z:Author>\n\
        !           402:             </Z:authors>\n\
        !           403:        </D:prop>\n\
        !           404:   </D:set>\n\
        !           405:   <D:remove>\n\
        !           406:        <D:prop><Z:Copyright-Owner/></D:prop>\n\
        !           407:   </D:remove>\n\
        !           408: </D:propertyupdate>\n\
        !           409: \n\
        !           410: ";
        !           411: 
        !           412: int main(void) {
        !           413:     xmlDocPtr doc;
        !           414: 
        !           415:     /*
        !           416:      * build a fake XML document from a string;
        !           417:      */
        !           418:     doc = xmlParseDoc(buffer);
        !           419: 
        !           420:     /*
        !           421:      * print it.
        !           422:      */
        !           423:     xmlDocDump(stdout, doc);
        !           424: 
        !           425:     /*
        !           426:      * free it.
        !           427:      */
        !           428:     xmlFreeDoc(doc);
        !           429:     return(0);
        !           430: }
        !           431: #endif
Webmaster