Annotation of XML/parser.c, revision 1.1
1.1 ! veillard 1: /*
! 2: * parser.c : an XML 1.0 non-validating parser
! 3: */
! 4:
! 5: #include <stdio.h>
! 6: #include <ctype.h>
! 7: #include <string.h>
! 8: #include <malloc.h>
! 9:
! 10: #include "parser.h"
! 11: #include "tree.h"
! 12:
! 13: /*
! 14: * A few macros needed to help build the parser.
! 15: */
! 16:
#ifdef UNICODE
/*
 * UNICODE version of the character-class macros. Still incomplete:
 * only the Latin-1 part of the letter classes is covered.
 *
 * SKIP_BLANKS is deliberately not defined in this branch: it is defined
 * once, for both builds, after the #endif. Defining it here as well made
 * UNICODE builds redefine the macro with a different body.
 */

/* A legal document character: TAB, LF, CR, or 0x20 and up bar 0xFFFE/0xFFFF. */
#define IS_CHAR(c) \
    (((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d) || \
     (((c) >= 0x20) && ((c) != 0xFFFE) && ((c) != 0xFFFF)))

/*
 * Base (alphabetic) characters, Latin-1 subset only.
 * NOTE(review): the single code points 0xAA and 0xB5 replace a range test
 * (>= 0xaa && <= 0x5b) that could never match; they are presumed to be the
 * characters that test was meant to accept, like 0xBA -- confirm against
 * the revision of the XML specification being targeted.
 */
#define IS_BASECHAR(c) \
    ((((c) >= 0x41) && ((c) <= 0x5a)) || \
     (((c) >= 0x61) && ((c) <= 0x7a)) || \
     ((c) == 0xaa) || ((c) == 0xb5) || ((c) == 0xba) || \
     (((c) >= 0xc0) && ((c) <= 0xd6)) || \
     (((c) >= 0xd8) && ((c) <= 0xf6)) || \
     (((c) >= 0xf8) && ((c) <= 0xff)))

/* ASCII decimal digits only; incomplete for UNICODE. */
#define IS_DIGIT(c) (((c) >= 0x30) && ((c) <= 0x39))

/* Combining characters are not recognised yet. */
#define IS_COMBINING(c) 0

#define IS_IDEOGRAPHIC(c) \
    ((((c) >= 0x4e00) && ((c) <= 0x9fa5)) || \
     (((c) >= 0xf900) && ((c) <= 0xfa2d)) || \
     (((c) >= 0x3021) && ((c) <= 0x3029)) || \
     ((c) == 0x3007))

#define IS_LETTER(c) (IS_BASECHAR(c) || IS_IDEOGRAPHIC(c))

/* White space: space, TAB, LF and CR. */
#define IS_BLANK(c) \
    (((c) == 0x20) || ((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d))
#else
/*
 * ASCII (8 bit) version of the character-class macros.
 */

/* A legal document character: TAB, LF, CR, or anything from 0x20 up. */
#define IS_CHAR(c) \
    (((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d) || ((c) >= 0x20))

/*
 * Base (alphabetic) characters.
 * NOTE(review): the single code points 0xAA and 0xB5 replace a range test
 * (>= 0xaa && <= 0x5b) that could never match; they are presumed to be the
 * characters that test was meant to accept, like 0xBA -- confirm against
 * the revision of the XML specification being targeted.
 */
#define IS_BASECHAR(c) \
    ((((c) >= 0x41) && ((c) <= 0x5a)) || \
     (((c) >= 0x61) && ((c) <= 0x7a)) || \
     ((c) == 0xaa) || ((c) == 0xb5) || ((c) == 0xba) || \
     (((c) >= 0xc0) && ((c) <= 0xd6)) || \
     (((c) >= 0xd8) && ((c) <= 0xf6)) || \
     (((c) >= 0xf8) && ((c) <= 0xff)))

#define IS_DIGIT(c) (((c) >= 0x30) && ((c) <= 0x39))

#define IS_LETTER(c) IS_BASECHAR(c)

/* Combining characters are not recognised. */
#define IS_COMBINING(c) 0

/* White space: space, TAB, LF and CR. */
#define IS_BLANK(c) \
    (((c) == 0x20) || ((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d))
#endif
! 79:
! 80:
/*
 * SKIP_EOL: advance p over one end-of-line sequence, if p points at one.
 * Accepted sequences are CR, LF, CR LF and LF CR. p must be a modifiable
 * pointer lvalue; it is left untouched when it does not point at CR or LF.
 *
 * CR is 0x0d and LF is 0x0a (13 and 10 in decimal, not in hexadecimal).
 * Only a single line end is consumed per invocation.
 */
#define SKIP_EOL(p) \
    do { \
        if (*(p) == 0x0d) { \
            (p)++; \
            if (*(p) == 0x0a) (p)++; \
        } else if (*(p) == 0x0a) { \
            (p)++; \
            if (*(p) == 0x0d) (p)++; \
        } \
    } while (0)
! 84:
/*
 * SKIP_BLANKS: advance p while it points at a blank (as defined by
 * IS_BLANK). p must be a modifiable pointer lvalue.
 *
 * Wrapped in do { } while (0) so that "SKIP_BLANKS(p);" is exactly one
 * statement and can be used safely as the body of an if/else.
 */
#define SKIP_BLANKS(p) \
    do { while (IS_BLANK(*(p))) (p)++; } while (0)
! 87:
/*
 * MOVETO_ENDTAG: advance p until it points at '>' or at a character
 * rejected by IS_CHAR (such as the terminating 0). The '>' itself is not
 * consumed. p must be a modifiable pointer lvalue.
 *
 * Wrapped in do { } while (0) so that "MOVETO_ENDTAG(p);" is exactly one
 * statement and can be used safely as the body of an if/else.
 */
#define MOVETO_ENDTAG(p) \
    do { while (IS_CHAR(*(p)) && (*(p) != '>')) (p)++; } while (0)
! 90:
/*
 * MOVETO_STARTTAG: advance p until it points at '<' or at a character
 * rejected by IS_CHAR (such as the terminating 0). The '<' itself is not
 * consumed. p must be a modifiable pointer lvalue.
 *
 * Wrapped in do { } while (0) so that "MOVETO_STARTTAG(p);" is exactly one
 * statement and can be used safely as the body of an if/else.
 */
#define MOVETO_STARTTAG(p) \
    do { while (IS_CHAR(*(p)) && (*(p) != '<')) (p)++; } while (0)
! 93:
! 94: /*
! 95: * xmlStrndup : a strdup for array of CHAR's
! 96: */
! 97:
! 98: CHAR *xmlStrndup(CHAR *cur, int len) {
! 99: CHAR *ret = malloc((len + 1) * sizeof(CHAR));
! 100:
! 101: if (ret == NULL) {
! 102: fprintf(stderr, "malloc of %d byte failed\n",
! 103: (len + 1) * sizeof(CHAR));
! 104: return(NULL);
! 105: }
! 106: memcpy(ret, cur, len * sizeof(CHAR));
! 107: ret[len] = 0;
! 108: return(ret);
! 109: }
! 110:
! 111: /*
! 112: * xmlStrdup : a strdup for CHAR's
! 113: */
! 114:
! 115: CHAR *xmlStrdup(CHAR *cur) {
! 116: CHAR *p = cur;
! 117:
! 118: while (IS_CHAR(*p)) p++;
! 119: return(xmlStrndup(cur, p - cur));
! 120: }
! 121:
! 122: /*
! 123: * xmlParseName : parse an XML name.
! 124: */
! 125:
! 126: CHAR *xmlParseName(CHAR *cur) {
! 127: if (!IS_LETTER(*cur) && (*cur != '_')) return(NULL);
! 128:
! 129: return(NULL); /* !!!! */
! 130: }
! 131:
! 132: /*
! 133: * Skip an XML (SGML) comment <!-- .... -->
! 134: */
! 135: void xmlParserSkipComment(CHAR **p) {
! 136: CHAR *cur = *p, *q, *r, *start;
! 137:
! 138: /*
! 139: * An extra check may avoid errors and isn't that costly !
! 140: */
! 141: if ((cur[0] != '<') || (cur[1] != '!') ||
! 142: (cur[2] != '-') || (cur[3] != '-')) return;
! 143:
! 144: cur += 4;
! 145: start = q = cur;
! 146: cur++;
! 147: r = cur;
! 148: cur++;
! 149: while (IS_CHAR(*cur) &&
! 150: ((*cur != '>') || (*r != '-') || (*q != '-'))) {
! 151: cur++;r++;q++;
! 152: }
! 153: if (!IS_CHAR(*cur)) {
! 154: fprintf(stderr, "Comment not terminated <!--%s\n", start);
! 155: *p = start;
! 156: } else {
! 157: cur++;
! 158: *p = cur;
! 159: }
! 160: }
! 161:
! 162: /*
! 163: * Parse and return a string between quotes or doublequotes
! 164: */
! 165: CHAR *xmlParseQuotedString(CHAR **p) {
! 166: CHAR *ret = NULL;
! 167: CHAR *cur = *p, *q;
! 168:
! 169: if (*cur == '"') {
! 170: cur++;
! 171: q = cur;
! 172: while (IS_CHAR(*cur) && (*cur != '"')) cur++;
! 173: if (*cur != '"')
! 174: fprintf(stderr, "String not closed \"%s\n", q);
! 175: else {
! 176: ret = xmlStrndup(q, cur - q);
! 177: cur++;
! 178: }
! 179: } else if (*cur == '\''){
! 180: cur++;
! 181: q = cur;
! 182: while (IS_CHAR(*cur) && (*cur != '\'')) cur++;
! 183: if (*cur != '\'')
! 184: fprintf(stderr, "String not closed '%s\n", q);
! 185: else {
! 186: ret = xmlStrndup(q, cur - q);
! 187: cur++;
! 188: }
! 189: }
! 190: *p = cur;
! 191: return(ret);
! 192: }
! 193:
! 194: /*
! 195: * xmlParseWebdavNamespace: parse Webdav specific '<?namespace ...' constructs.
! 196: */
! 197:
! 198: void xmlParseWebdavNamespace(CHAR **p, xmlDocPtr doc) {
! 199: CHAR *cur = *p;
! 200: CHAR *href = NULL;
! 201: CHAR *AS = NULL;
! 202:
! 203: /*
! 204: * We know that 'namespace' is here.
! 205: */
! 206: cur += 9;
! 207: SKIP_BLANKS(cur);
! 208:
! 209: while (IS_CHAR(*cur) && (*cur != '>')) {
! 210: /*
! 211: * We can have 'href' or 'AS' attributes.
! 212: */
! 213: if ((cur[0] == 'h') && (cur[1] == 'r') && (cur[2] == 'e') &&
! 214: (cur[3] == 'f')) {
! 215: cur += 4;
! 216: SKIP_BLANKS(cur);
! 217:
! 218: if (*cur != '=') continue;
! 219: cur++;
! 220: SKIP_BLANKS(cur);
! 221:
! 222: href = xmlParseQuotedString(&cur);
! 223: SKIP_BLANKS(cur);
! 224: } else if ((cur[0] == 'A') && (cur[1] == 'S')) {
! 225: cur += 2;
! 226: SKIP_BLANKS(cur);
! 227:
! 228: if (*cur != '=') continue;
! 229: cur++;
! 230: SKIP_BLANKS(cur);
! 231:
! 232: AS = xmlParseQuotedString(&cur);
! 233: SKIP_BLANKS(cur);
! 234: } else if ((cur[0] == '?') && (cur[1] == '>')) {
! 235: cur ++;
! 236: } else {
! 237: /* Garbage ??? */
! 238: cur++;
! 239: }
! 240: }
! 241:
! 242: MOVETO_ENDTAG(cur);
! 243: cur++;
! 244:
! 245: /*
! 246: * Register the DTD.
! 247: */
! 248: if (href != NULL)
! 249: xmlNewDtd(doc, href, AS);
! 250:
! 251: *p = cur;
! 252: }
! 253:
! 254: /*
! 255: * xmlParseXMLDecl: parse an XML declaration header
! 256: */
! 257:
! 258: xmlDocPtr xmlParseXMLDecl(CHAR **p) {
! 259: CHAR *cur = *p;
! 260: CHAR *version;
! 261: xmlDocPtr ret;
! 262:
! 263: /*
! 264: * We know that '<?XML' is here.
! 265: */
! 266: cur += 5;
! 267:
! 268: /*
! 269: * Parse the version info
! 270: */
! 271: SKIP_BLANKS(cur);
! 272:
! 273: /*
! 274: * We should have 'version=' here !
! 275: */
! 276: if ((cur[0] == 'v') && (cur[1] == 'e') && (cur[2] == 'r') &&
! 277: (cur[3] == 's') && (cur[4] == 'i') && (cur[5] == 'o') &&
! 278: (cur[6] == 'n') && (cur[7] == '=')) {
! 279: cur += 8;
! 280: version = xmlParseQuotedString(&cur);
! 281: if (version == NULL)
! 282: ret = xmlNewDoc(XML_DEFAULT_VERSION);
! 283: else {
! 284: ret = xmlNewDoc(version);
! 285: }
! 286: } else {
! 287: ret = xmlNewDoc(XML_DEFAULT_VERSION);
! 288: }
! 289:
! 290: /*
! 291: * We should check for encoding !!!!
! 292: */
! 293:
! 294: /*
! 295: * We should check for Required Markup Declaration !!!!
! 296: */
! 297: MOVETO_ENDTAG(cur);
! 298: cur++;
! 299:
! 300: *p = cur;
! 301: return(ret);
! 302: }
! 303:
! 304: /*
! 305: * xmlParseMisc: parse an XML Misc optionnal field.
! 306: * (Comment | PI | S)*
! 307: */
! 308:
! 309: void xmlParseMisc(CHAR **p, xmlDocPtr ret) {
! 310: CHAR *cur = *p;
! 311:
! 312: while (((cur[0] == '<') && (cur[1] == '?')) ||
! 313: ((cur[0] == '<') && (cur[1] == '!') &&
! 314: (cur[2] == '-') && (cur[2] == '-')) ||
! 315: IS_BLANK(*cur)) {
! 316: if ((cur[0] == '<') && (cur[1] == '?')) {
! 317: /*
! 318: * this is a Processing Instruction.
! 319: */
! 320: cur += 2;
! 321:
! 322: /*
! 323: * Special for WebDav, support for the Processing Instruction
! 324: * '<?namespace ...' contruct in the header of the XML document.
! 325: */
! 326: if ((cur[0] == 'n') && (cur[1] == 'a') &&
! 327: (cur[2] == 'm') && (cur[3] == 'e') &&
! 328: (cur[4] == 's') && (cur[5] == 'p') &&
! 329: (cur[6] == 'a') && (cur[7] == 'c') &&
! 330: (cur[8] == 'e')) {
! 331: xmlParseWebdavNamespace(&cur, ret);
! 332: } else {
! 333: /* Unknown PI, ignore it ! */
! 334: MOVETO_ENDTAG(cur);
! 335: cur++;
! 336: }
! 337: } else if (IS_BLANK(*cur)) {
! 338: cur++;
! 339: } else
! 340: xmlParserSkipComment(&cur);
! 341: }
! 342:
! 343: *p = cur;
! 344: }
! 345:
! 346: /*
! 347: * xmlParseDoc : parse an XML document and build a tree.
! 348: */
! 349:
! 350: xmlDocPtr xmlParseDoc(CHAR *cur) {
! 351: xmlDocPtr ret;
! 352:
! 353: /*
! 354: * Wipe out everything which is before the first '<'
! 355: */
! 356: SKIP_BLANKS(cur);
! 357:
! 358: /*
! 359: * Check for the XMLDecl in the Prolog.
! 360: */
! 361: if ((cur[0] == '<') && (cur[1] == '?') &&
! 362: (cur[2] == 'X') && (cur[3] == 'M') &&
! 363: (cur[4] == 'L')) {
! 364: ret = xmlParseXMLDecl(&cur);
! 365: /* SKIP_EOL(cur); */
! 366: SKIP_BLANKS(cur);
! 367: } else {
! 368: ret = xmlNewDoc(XML_DEFAULT_VERSION);
! 369: }
! 370:
! 371: /*
! 372: * The Misc part of the Prolog
! 373: * (Comment | PI | S) *
! 374: */
! 375: xmlParseMisc(&cur, ret);
! 376:
! 377: /*
! 378: * Parse the Element
! 379: */
! 380:
! 381: return(ret);
! 382: }
! 383:
! 384: /************************************************************************
! 385: * *
! 386: * Debug *
! 387: * *
! 388: ************************************************************************/
! 389:
! 390: #ifdef DEBUG
! 391: CHAR buffer[] =
! 392: "\n\
! 393: <?XML version=\"1.0\">\n\
! 394: <?namespace href = \"http://www.ietf.org/standards/dav/\" AS = \"D\"?>\n\
! 395: <?namespace href = \"http://www.w3.com/standards/z39.50/\" AS = \"Z\"?>\n\
! 396: <D:propertyupdate>\n\
! 397: <D:set>\n\
! 398: <D:prop>\n\
! 399: <Z:authors>\n\
! 400: <Z:Author>Jim Whitehead</Z:Author>\n\
! 401: <Z:Author>Roy Fielding</Z:Author>\n\
! 402: </Z:authors>\n\
! 403: </D:prop>\n\
! 404: </D:set>\n\
! 405: <D:remove>\n\
! 406: <D:prop><Z:Copyright-Owner/></D:prop>\n\
! 407: </D:remove>\n\
! 408: </D:propertyupdate>\n\
! 409: \n\
! 410: ";
! 411:
! 412: int main(void) {
! 413: xmlDocPtr doc;
! 414:
! 415: /*
! 416: * build a fake XML document from a string;
! 417: */
! 418: doc = xmlParseDoc(buffer);
! 419:
! 420: /*
! 421: * print it.
! 422: */
! 423: xmlDocDump(stdout, doc);
! 424:
! 425: /*
! 426: * free it.
! 427: */
! 428: xmlFreeDoc(doc);
! 429: return(0);
! 430: }
! 431: #endif
Webmaster