Annotation of XML/parser.c, revision 1.16
1.1 veillard 1: /*
1.3 veillard 2: * parser.c : an XML 1.0 non-verifying parser
1.15 veillard 3: *
4: * See Copyright for the status of this software.
5: *
1.16 ! daniel 6: * $Id: parser.c,v 1.15 1998/05/25 23:58:37 veillard Exp $
1.1 veillard 7: */
8:
1.9 httpng 9: #include <config.h>
1.1 veillard 10: #include <stdio.h>
11: #include <ctype.h>
1.14 veillard 12: #include <string.h> /* for memset() only */
1.1 veillard 13: #include <malloc.h>
1.9 httpng 14: #include <sys/stat.h>
15: #ifdef HAVE_FCNTL_H
16: #include <fcntl.h>
17: #endif
1.10 httpng 18: #ifdef HAVE_UNISTD_H
19: #include <unistd.h>
20: #endif
1.1 veillard 21:
1.14 veillard 22: #include "tree.h"
1.1 veillard 23: #include "parser.h"
1.14 veillard 24: #include "entities.h"
1.1 veillard 25:
/*
 * A few macros needed to help building the parser.
 *
 * All of them may evaluate their argument more than once, so only pass
 * expressions without side effects.
 */

#ifdef UNICODE
/*
 * UNICODE version of the macros. Incomplete now TODO !!!!
 */
#define IS_CHAR(c) \
    (((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d) || \
     (((c) >= 0x20) && ((c) != 0xFFFE) && ((c) != 0xFFFF)))

/* The UNICODE flavour also steps over the ideographic space 0x3000. */
#define SKIP_BLANKS(p) \
    while ((*(p) == 0x20) || (*(p) == 0x09) || (*(p) == 0xa) || \
           (*(p) == 0x3000)) (p)++;

/*
 * I'm too lazy to complete this one TODO !!!!
 * NOTE(review): the 0xaa..0x5b clause is an empty range (0xaa > 0x5b) and
 * therefore never matches; it is kept as-is until the intended code
 * points are confirmed.
 */
#define IS_BASECHAR(c) \
    ((((c) >= 0x41) && ((c) <= 0x5a)) || \
     (((c) >= 0x61) && ((c) <= 0x7a)) || \
     (((c) >= 0xaa) && ((c) <= 0x5b)) || \
     (((c) >= 0xc0) && ((c) <= 0xd6)) || \
     (((c) >= 0xd8) && ((c) <= 0xf6)) || \
     (((c) >= 0xf8) && ((c) <= 0xff)) || \
     ((c) == 0xba))

/* I'm too lazy to complete this one TODO !!!! */
#define IS_DIGIT(c) (((c) >= 0x30) && ((c) <= 0x39))

/* I'm too lazy to complete this one TODO !!!! */
#define IS_COMBINING(c) 0

#define IS_IGNORABLE(c) \
    ((((c) >= 0x200c) && ((c) <= 0x200f)) || \
     (((c) >= 0x202a) && ((c) <= 0x202e)) || \
     (((c) >= 0x206a) && ((c) <= 0x206f)) || \
     ((c) == 0xfeff))

/* The "||" after the 0x3005 test was missing, which broke compilation. */
#define IS_EXTENDER(c) \
    (((c) == 0xb7) || ((c) == 0x2d0) || ((c) == 0x2d1) || \
     ((c) == 0x387) || ((c) == 0x640) || ((c) == 0xe46) || \
     ((c) == 0xec6) || ((c) == 0x3005) || \
     (((c) >= 0x3031) && ((c) <= 0x3035)) || \
     (((c) >= 0x309b) && ((c) <= 0x309e)) || \
     (((c) >= 0x30fc) && ((c) <= 0x30fe)) || \
     (((c) >= 0xff70) && ((c) <= 0xff9e)) || \
     ((c) == 0xff9f))

#define IS_IDEOGRAPHIC(c) \
    ((((c) >= 0x4e00) && ((c) <= 0x9fa5)) || \
     (((c) >= 0xf900) && ((c) <= 0xfa2d)) || \
     (((c) >= 0x3021) && ((c) <= 0x3029)) || \
     ((c) == 0x3007))

#define IS_LETTER(c) (IS_BASECHAR(c) || IS_IDEOGRAPHIC(c))

/* I'm too lazy to complete this one ! */
#define IS_BLANK(c) (((c) == 0x20) || ((c) == 0x09) || ((c) == 0xa))
#else
/*
 * 8bits / ASCII version of the macros.
 */
#define IS_CHAR(c) \
    (((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d) || ((c) >= 0x20))

/*
 * NOTE(review): the 0xaa..0x5b clause is an empty range (0xaa > 0x5b) and
 * therefore never matches; it is kept as-is until the intended code
 * points are confirmed.
 */
#define IS_BASECHAR(c) \
    ((((c) >= 0x41) && ((c) <= 0x5a)) || \
     (((c) >= 0x61) && ((c) <= 0x7a)) || \
     (((c) >= 0xaa) && ((c) <= 0x5b)) || \
     (((c) >= 0xc0) && ((c) <= 0xd6)) || \
     (((c) >= 0xd8) && ((c) <= 0xf6)) || \
     (((c) >= 0xf8) && ((c) <= 0xff)) || \
     ((c) == 0xba))

#define IS_DIGIT(c) (((c) >= 0x30) && ((c) <= 0x39))

#define IS_LETTER(c) IS_BASECHAR(c)

#define IS_COMBINING(c) 0

#define IS_IGNORABLE(c) 0

#define IS_EXTENDER(c) ((c) == 0xb7)

#define IS_BLANK(c) (((c) == 0x20) || ((c) == 0x09) || ((c) == 0xa))
#endif


/*
 * Skip one end-of-line sequence: CR, LF, CR LF or LF CR.
 * CR is 0x0d and LF is 0x0a (decimal 13 and 10).
 */
#define SKIP_EOL(p) \
    do { \
        if (*(p) == 0x0d) { (p)++; if (*(p) == 0x0a) (p)++; } \
        else if (*(p) == 0x0a) { (p)++; if (*(p) == 0x0d) (p)++; } \
    } while (0)

/* Generic blank skipping, unless the UNICODE branch already supplied one. */
#ifndef SKIP_BLANKS
#define SKIP_BLANKS(p) \
    while (IS_BLANK(*(p))) (p)++;
#endif

/* Advance p to the next '>' or to the first non-CHAR (end of input). */
#define MOVETO_ENDTAG(p) \
    while (IS_CHAR(*(p)) && (*(p) != '>')) (p)++;

/* Advance p to the next '<' or to the first non-CHAR (end of input). */
#define MOVETO_STARTTAG(p) \
    while (IS_CHAR(*(p)) && (*(p) != '<')) (p)++;
126:
127: /*
1.3 veillard 128: * Forward declaration for recursive behaviour.
129: */
1.16 ! daniel 130: xmlNodePtr xmlParseElement(xmlParserCtxtPtr ctxt);
1.3 veillard 131:
132: /*
133: * xmlHandleData : this routine represent's the specific application
134: * behaviour when reading a piece of text.
135: *
136: * For example in WebDav, any piece made only of blanks is eliminated
137: */
138:
139: CHAR *xmlHandleData(CHAR *in) {
140: CHAR *cur;
141:
142: if (in == NULL) return(NULL);
143: cur = in;
144: while (IS_CHAR(*cur)) {
145: if (!IS_BLANK(*cur)) goto not_blank;
146: cur++;
147: }
148: free(in);
149: return(NULL);
150:
151: not_blank:
152: return(in);
153: }
154:
155: /*
1.1 veillard 156: * xmlStrndup : a strdup for array of CHAR's
157: */
158:
1.6 httpng 159: CHAR *xmlStrndup(const CHAR *cur, int len) {
1.1 veillard 160: CHAR *ret = malloc((len + 1) * sizeof(CHAR));
161:
162: if (ret == NULL) {
163: fprintf(stderr, "malloc of %d byte failed\n",
164: (len + 1) * sizeof(CHAR));
165: return(NULL);
166: }
167: memcpy(ret, cur, len * sizeof(CHAR));
168: ret[len] = 0;
169: return(ret);
170: }
171:
172: /*
173: * xmlStrdup : a strdup for CHAR's
174: */
175:
1.6 httpng 176: CHAR *xmlStrdup(const CHAR *cur) {
177: const CHAR *p = cur;
1.1 veillard 178:
179: while (IS_CHAR(*p)) p++;
180: return(xmlStrndup(cur, p - cur));
181: }
182:
183: /*
1.14 veillard 184: * xmlStrcmp : a strcmp for CHAR's
185: */
186:
187: int xmlStrcmp(const CHAR *str1, const CHAR *str2) {
188: register int tmp;
189:
190: do {
191: tmp = *str1++ - *str2++;
192: if (tmp != 0) return(tmp);
193: } while ((*str1 != 0) && (*str2 != 0));
194: return (*str1 - *str2);
195: }
196:
197: /*
198: * xmlStrncmp : a strncmp for CHAR's
199: */
200:
201: int xmlStrncmp(const CHAR *str1, const CHAR *str2, int len) {
202: register int tmp;
203:
204: if (len <= 0) return(0);
205: do {
206: tmp = *str1++ - *str2++;
207: if (tmp != 0) return(tmp);
208: len--;
209: if (len <= 0) return(0);
210: } while ((*str1 != 0) && (*str2 != 0));
211: return (*str1 - *str2);
212: }
213:
214: /*
215: * xmlStrchr : a strchr for CHAR's
216: */
217:
218: CHAR *xmlStrchr(const CHAR *str, CHAR val) {
219: while (*str != 0) {
220: if (*str == val) return((CHAR *) str);
221: str++;
222: }
223: return(NULL);
224: }
225:
226: /*
1.1 veillard 227: * xmlParseName : parse an XML name.
228: */
229:
1.16 ! daniel 230: CHAR *xmlParseName(xmlParserCtxtPtr ctxt) {
! 231: CHAR *q, *ret = NULL;
1.1 veillard 232:
233: /*
1.3 veillard 234: * Name ::= (Letter | '_') (NameChar)*
1.1 veillard 235: */
1.16 ! daniel 236: if (!IS_LETTER(ctxt->cur[0]) && (ctxt->cur[0] != '_')) return(NULL);
! 237: q = ctxt->cur++;
! 238: while ((IS_LETTER(ctxt->cur[0])) || (IS_DIGIT(ctxt->cur[0])) ||
! 239: (ctxt->cur[0] == '.') || (ctxt->cur[0] == '-') || (ctxt->cur[0] == '_') ||
! 240: (ctxt->cur[0] == ':') ||
! 241: (IS_COMBINING(ctxt->cur[0])) || (IS_IGNORABLE(ctxt->cur[0])) ||
! 242: (IS_EXTENDER(ctxt->cur[0])))
! 243: ctxt->cur++;
1.3 veillard 244:
1.16 ! daniel 245: ret = xmlStrndup(q, ctxt->cur - q);
1.1 veillard 246:
1.3 veillard 247: return(ret);
1.1 veillard 248: }
249:
250: /*
251: * Parse and return a string between quotes or doublequotes
252: */
1.16 ! daniel 253: CHAR *xmlParseQuotedString(xmlParserCtxtPtr ctxt) {
1.1 veillard 254: CHAR *ret = NULL;
1.16 ! daniel 255: CHAR *q;
1.1 veillard 256:
1.16 ! daniel 257: if (ctxt->cur[0] == '"') {
! 258: ctxt->cur++;
! 259: q = ctxt->cur;
! 260: while (IS_CHAR(ctxt->cur[0]) && (ctxt->cur[0] != '"')) ctxt->cur++;
! 261: if (ctxt->cur[0] != '"')
1.7 veillard 262: fprintf(stderr, "String not closed \"%.50s\n", q);
1.1 veillard 263: else {
1.16 ! daniel 264: ret = xmlStrndup(q, ctxt->cur - q);
! 265: ctxt->cur++;
1.1 veillard 266: }
1.16 ! daniel 267: } else if (ctxt->cur[0] == '\''){
! 268: ctxt->cur++;
! 269: q = ctxt->cur;
! 270: while (IS_CHAR(ctxt->cur[0]) && (ctxt->cur[0] != '\'')) ctxt->cur++;
! 271: if (ctxt->cur[0] != '\'')
1.7 veillard 272: fprintf(stderr, "String not closed '%.50s\n", q);
1.1 veillard 273: else {
1.16 ! daniel 274: ret = xmlStrndup(q, ctxt->cur - q);
! 275: ctxt->cur++;
1.1 veillard 276: }
277: }
278: return(ret);
279: }
280:
281: /*
1.3 veillard 282: * Skip an XML (SGML) comment <!-- .... -->
1.16 ! daniel 283: *
! 284: * TODO !!!! Save the comment in the tree !!!
1.3 veillard 285: */
1.16 ! daniel 286: void xmlParserSkipComment(xmlParserCtxtPtr ctxt) {
! 287: CHAR *q, *r, *start;
1.3 veillard 288:
289: /*
290: * An extra check may avoid errors and isn't that costly !
291: */
1.16 ! daniel 292: if ((ctxt->cur[0] != '<') || (ctxt->cur[1] != '!') ||
! 293: (ctxt->cur[2] != '-') || (ctxt->cur[3] != '-')) return;
1.3 veillard 294:
1.16 ! daniel 295: ctxt->cur += 4;
! 296: start = q = ctxt->cur;
! 297: ctxt->cur++;
! 298: r = ctxt->cur;
! 299: ctxt->cur++;
! 300: while (IS_CHAR(ctxt->cur[0]) &&
! 301: ((ctxt->cur[0] == ':') || (ctxt->cur[0] != '>') ||
! 302: (*r != '-') || (*q != '-'))) {
! 303: ctxt->cur++;r++;q++;
1.3 veillard 304: }
1.16 ! daniel 305: if (!IS_CHAR(ctxt->cur[0])) {
1.7 veillard 306: fprintf(stderr, "Comment not terminated <!--%.50s\n", start);
1.16 ! daniel 307: ctxt->cur = start; /* !!! We shouldn't really try to recover !!! */
1.3 veillard 308: } else {
1.16 ! daniel 309: ctxt->cur++;
1.3 veillard 310: }
311: }
312:
313: /*
1.13 veillard 314: * xmlParseNamespace: parse specific '<?namespace ...' constructs.
1.1 veillard 315: */
316:
1.16 ! daniel 317: void xmlParseNamespace(xmlParserCtxtPtr ctxt) {
1.1 veillard 318: CHAR *href = NULL;
319: CHAR *AS = NULL;
1.3 veillard 320: int garbage = 0;
1.1 veillard 321:
322: /*
323: * We know that 'namespace' is here.
324: */
1.16 ! daniel 325: ctxt->cur += 9;
! 326: SKIP_BLANKS(ctxt->cur);
1.1 veillard 327:
1.16 ! daniel 328: while (IS_CHAR(ctxt->cur[0]) && (ctxt->cur[0] != '>')) {
1.1 veillard 329: /*
330: * We can have 'href' or 'AS' attributes.
331: */
1.16 ! daniel 332: if ((ctxt->cur[0] == 'h') && (ctxt->cur[1] == 'r') &&
! 333: (ctxt->cur[2] == 'e') && (ctxt->cur[3] == 'f')) {
1.3 veillard 334: garbage = 0;
1.16 ! daniel 335: ctxt->cur += 4;
! 336: SKIP_BLANKS(ctxt->cur);
1.1 veillard 337:
1.16 ! daniel 338: if (ctxt->cur[0] != '=') continue;
! 339: ctxt->cur++;
! 340: SKIP_BLANKS(ctxt->cur);
! 341:
! 342: href = xmlParseQuotedString(ctxt);
! 343: SKIP_BLANKS(ctxt->cur);
! 344: } else if ((ctxt->cur[0] == 'A') && (ctxt->cur[1] == 'S')) {
1.3 veillard 345: garbage = 0;
1.16 ! daniel 346: ctxt->cur += 2;
! 347: SKIP_BLANKS(ctxt->cur);
1.1 veillard 348:
1.16 ! daniel 349: if (ctxt->cur[0] != '=') continue;
! 350: ctxt->cur++;
! 351: SKIP_BLANKS(ctxt->cur);
! 352:
! 353: AS = xmlParseQuotedString(ctxt);
! 354: SKIP_BLANKS(ctxt->cur);
! 355: } else if ((ctxt->cur[0] == '?') && (ctxt->cur[1] == '>')) {
1.3 veillard 356: garbage = 0;
1.16 ! daniel 357: ctxt->cur ++;
1.1 veillard 358: } else {
1.3 veillard 359: /*
360: * Found garbage when parsing the namespace
361: */
362: if (!garbage) fprintf(stderr,
1.13 veillard 363: "\nxmlParseNamespace found garbage: ");
1.16 ! daniel 364: fprintf(stderr, "%c", ctxt->cur[0]);
! 365: ctxt->cur++;
1.1 veillard 366: }
367: }
368:
1.16 ! daniel 369: MOVETO_ENDTAG(ctxt->cur);
! 370: ctxt->cur++;
1.1 veillard 371:
372: /*
373: * Register the DTD.
374: */
375: if (href != NULL)
1.16 ! daniel 376: xmlNewDtd(ctxt->doc, href, AS);
1.1 veillard 377:
1.8 veillard 378: if (AS != NULL) free(AS);
379: if (href != NULL) free(href);
1.1 veillard 380: }
381:
382: /*
1.3 veillard 383: * xmlParsePI: parse an XML Processing Instruction.
384: */
385:
1.16 ! daniel 386: void xmlParsePI(xmlParserCtxtPtr ctxt) {
! 387: if ((ctxt->cur[0] == '<') && (ctxt->cur[1] == '?')) {
1.3 veillard 388: /*
389: * this is a Processing Instruction.
390: */
1.16 ! daniel 391: ctxt->cur += 2;
1.3 veillard 392:
393: /*
394: * Special for WebDav, support for the Processing Instruction
395: * '<?namespace ...' contruct in the header of the XML document.
396: */
1.16 ! daniel 397: if ((ctxt->cur[0] == 'n') && (ctxt->cur[1] == 'a') &&
! 398: (ctxt->cur[2] == 'm') && (ctxt->cur[3] == 'e') &&
! 399: (ctxt->cur[4] == 's') && (ctxt->cur[5] == 'p') &&
! 400: (ctxt->cur[6] == 'a') && (ctxt->cur[7] == 'c') &&
! 401: (ctxt->cur[8] == 'e')) {
! 402: xmlParseNamespace(ctxt);
1.3 veillard 403: } else {
404: /* Unknown PI, ignore it ! */
1.16 ! daniel 405: fprintf(stderr, "xmlParsePI : skipping unknown PI %30s\n",
! 406: ctxt->cur);
! 407: MOVETO_ENDTAG(ctxt->cur);
! 408: ctxt->cur++;
1.3 veillard 409: }
410: }
411: }
412:
413: /*
414: * xmlParseAttribute: parse a start of tag.
415: *
416: * Attribute ::= Name Eq AttValue
417: */
418:
1.16 ! daniel 419: void xmlParseAttribute(xmlParserCtxtPtr ctxt, xmlNodePtr node) {
! 420: CHAR *q, *name, *value = NULL;
1.3 veillard 421:
1.16 ! daniel 422: if (!IS_LETTER(ctxt->cur[0]) && (ctxt->cur[0] != '_')) {
1.3 veillard 423: return;
424: }
1.16 ! daniel 425: q = ctxt->cur++;
! 426: while ((IS_LETTER(ctxt->cur[0])) || (IS_DIGIT(ctxt->cur[0])) ||
! 427: (ctxt->cur[0] == '.') || (ctxt->cur[0] == '-') ||
! 428: (ctxt->cur[0] == '_') || (ctxt->cur[0] == ':') ||
! 429: (IS_COMBINING(ctxt->cur[0])) || (IS_IGNORABLE(ctxt->cur[0])) ||
! 430: (IS_EXTENDER(ctxt->cur[0])))
! 431: ctxt->cur++;
! 432: name = xmlStrndup(q, ctxt->cur - q);
1.3 veillard 433:
434: /*
435: * We should have the equal, we are laxist here and allow attributes
436: * without values and extra spaces.
437: */
1.16 ! daniel 438: SKIP_BLANKS(ctxt->cur);
! 439: if (ctxt->cur[0] == '=') {
! 440: ctxt->cur++;
! 441: SKIP_BLANKS(ctxt->cur);
! 442: if ((ctxt->cur[0] != '\'') && (ctxt->cur[0] != '"')) {
1.7 veillard 443: fprintf(stderr, "Quotes were expected for attribute value %.20s\n",
1.3 veillard 444: q);
445: } else
1.16 ! daniel 446: value = xmlParseQuotedString(ctxt);
1.3 veillard 447: }
448:
449: /*
450: * Add the attribute to the node.
451: */
452: if (name != NULL)
453: xmlNewProp(node, name, value);
454: }
455:
456: /*
1.2 veillard 457: * xmlParseStartTag: parse a start of tag.
458: */
459:
1.16 ! daniel 460: xmlNodePtr xmlParseStartTag(xmlParserCtxtPtr ctxt) {
! 461: CHAR *q, *ns, *name;
1.3 veillard 462: xmlDtdPtr dtd = NULL;
1.2 veillard 463: xmlNodePtr ret = NULL;
464:
465: /*
1.3 veillard 466: * Theorically one should just parse a Name, but with the addition
467: * of the namespace needed for WebDav, it's a bit more complicated
468: * since the element name may be prefixed by a namespace prefix.
469: *
470: * QName ::= (NSPart ':')? LocalPart
471: * NSPart ::= Name
472: * LocalPart ::= Name
473: * STag ::= '<' QName (S Attribute)* S? '>'
474: *
475: * instead of :
476: *
477: * STag ::= '<' QName (S Attribute)* S? '>'
1.2 veillard 478: */
1.16 ! daniel 479: if (ctxt->cur[0] != '<') return(NULL);
! 480: ctxt->cur++;
1.3 veillard 481:
1.16 ! daniel 482: if (!IS_LETTER(ctxt->cur[0]) && (ctxt->cur[0] != '_')) return(NULL);
! 483: q = ctxt->cur++;
! 484: while ((IS_LETTER(ctxt->cur[0])) || (IS_DIGIT(ctxt->cur[0])) ||
! 485: (ctxt->cur[0] == '.') || (ctxt->cur[0] == '-') ||
! 486: (ctxt->cur[0] == '_') ||
! 487: (IS_COMBINING(ctxt->cur[0])) || (IS_IGNORABLE(ctxt->cur[0])) ||
! 488: (IS_EXTENDER(ctxt->cur[0])))
! 489: ctxt->cur++;
1.3 veillard 490:
1.16 ! daniel 491: if (ctxt->cur[0] == ':') {
! 492: ns = xmlStrndup(q, ctxt->cur - q);
1.3 veillard 493:
1.16 ! daniel 494: ctxt->cur++; /* skip the column */
! 495: if (!IS_LETTER(ctxt->cur[0]) && (ctxt->cur[0] != '_')) {
1.7 veillard 496: fprintf(stderr,
497: "Start tag : no element name after namespace identifier %.20s\n",
1.3 veillard 498: q);
499: free(ns);
500: return(NULL);
501: }
1.16 ! daniel 502: q = ctxt->cur++;
! 503: while ((IS_LETTER(ctxt->cur[0])) || (IS_DIGIT(ctxt->cur[0])) ||
! 504: (ctxt->cur[0] == '.') || (ctxt->cur[0] == '-') ||
! 505: (ctxt->cur[0] == '_') || (ctxt->cur[0] == ':') ||
! 506: (IS_COMBINING(ctxt->cur[0])) || (IS_IGNORABLE(ctxt->cur[0])) ||
! 507: (IS_EXTENDER(ctxt->cur[0])))
! 508: ctxt->cur++;
! 509: name = xmlStrndup(q, ctxt->cur - q);
1.3 veillard 510:
511: /*
512: * Search the DTD associated to ns.
513: */
1.16 ! daniel 514: dtd = xmlSearchDtd(ctxt->doc, ns);
1.3 veillard 515: if (dtd == NULL)
1.7 veillard 516: fprintf(stderr, "Start tag : Couldn't find namespace %s\n", ns);
1.3 veillard 517: free(ns);
518: } else
1.16 ! daniel 519: name = xmlStrndup(q, ctxt->cur - q);
1.3 veillard 520:
521: ret = xmlNewNode(dtd, name, NULL);
1.2 veillard 522:
1.3 veillard 523: /*
524: * Now parse the attributes, it ends up with the ending
525: *
526: * (S Attribute)* S?
527: */
1.16 ! daniel 528: SKIP_BLANKS(ctxt->cur);
! 529: while ((IS_CHAR(ctxt->cur[0])) &&
! 530: (ctxt->cur[0] != '>') &&
! 531: ((ctxt->cur[0] != '/') || (ctxt->cur[1] != '>'))) {
! 532: if (IS_LETTER(ctxt->cur[0]) || (ctxt->cur[0] == '_'))
! 533: xmlParseAttribute(ctxt, ret);
1.3 veillard 534: else {
1.14 veillard 535: /* We should warn TODO !!! */
1.16 ! daniel 536: ctxt->cur++;
1.3 veillard 537: }
1.16 ! daniel 538: SKIP_BLANKS(ctxt->cur);
1.3 veillard 539: }
540:
541: return(ret);
542: }
543:
544: /*
1.7 veillard 545: * xmlParseEndTag: parse an end of tag, note that the '</' part has
546: * already been read.
547: */
548:
1.16 ! daniel 549: void xmlParseEndTag(xmlParserCtxtPtr ctxt, xmlDtdPtr *dtdPtr, CHAR **tagPtr) {
! 550: CHAR *q, *ns, *name;
1.7 veillard 551: xmlDtdPtr dtd = NULL;
552:
553: *dtdPtr = NULL;
554: *tagPtr = NULL;
555:
556: /*
557: * Theorically one should just parse a Name, but with the addition
558: * of the namespace needed for WebDav, it's a bit more complicated
559: * since the element name may be prefixed by a namespace prefix.
560: *
561: * QName ::= (NSPart ':')? LocalPart
562: * NSPart ::= Name
563: * LocalPart ::= Name
564: * ETag ::= '</' QName S? '>'
565: *
566: * instead of :
567: *
568: * ETag ::= '</' Name S? '>'
569: */
1.16 ! daniel 570: if (!IS_LETTER(ctxt->cur[0]) && (ctxt->cur[0] != '_')) return;
! 571: q = ctxt->cur++;
! 572: while ((IS_LETTER(ctxt->cur[0])) || (IS_DIGIT(ctxt->cur[0])) ||
! 573: (ctxt->cur[0] == '.') || (ctxt->cur[0] == '-') ||
! 574: (ctxt->cur[0] == '_') ||
! 575: (IS_COMBINING(ctxt->cur[0])) || (IS_IGNORABLE(ctxt->cur[0])) ||
! 576: (IS_EXTENDER(ctxt->cur[0])))
! 577: ctxt->cur++;
1.7 veillard 578:
1.16 ! daniel 579: if (ctxt->cur[0] == ':') {
! 580: ns = xmlStrndup(q, ctxt->cur - q);
1.7 veillard 581:
1.16 ! daniel 582: ctxt->cur++; /* skip the column */
! 583: if (!IS_LETTER(ctxt->cur[0]) && (ctxt->cur[0] != '_')) {
1.7 veillard 584: fprintf(stderr,
585: "End tag : no element name after namespace identifier %.20s\n",
586: q);
587: free(ns);
588: return;
589: }
1.16 ! daniel 590: q = ctxt->cur++;
! 591: while ((IS_LETTER(ctxt->cur[0])) || (IS_DIGIT(ctxt->cur[0])) ||
! 592: (ctxt->cur[0] == '.') || (ctxt->cur[0] == '-') ||
! 593: (ctxt->cur[0] == '_') || (ctxt->cur[0] == ':') ||
! 594: (IS_COMBINING(ctxt->cur[0])) || (IS_IGNORABLE(ctxt->cur[0])) ||
! 595: (IS_EXTENDER(ctxt->cur[0])))
! 596: ctxt->cur++;
! 597: name = xmlStrndup(q, ctxt->cur - q);
1.7 veillard 598:
599: /*
600: * Search the DTD associated to ns.
601: */
1.16 ! daniel 602: dtd = xmlSearchDtd(ctxt->doc, ns);
1.7 veillard 603: if (dtd == NULL)
604: fprintf(stderr, "End tag : Couldn't find namespace %s\n", ns);
605: free(ns);
606: } else
1.16 ! daniel 607: name = xmlStrndup(q, ctxt->cur - q);
1.7 veillard 608:
609: *dtdPtr = dtd;
610: *tagPtr = name;
611:
612: /*
613: * We should definitely be at the ending "S? '>'" part
614: */
1.16 ! daniel 615: SKIP_BLANKS(ctxt->cur);
! 616: if ((!IS_CHAR(ctxt->cur[0])) || (ctxt->cur[0] != '>')) {
! 617: fprintf(stderr, "End tag : expected '>', got %.20s\n", ctxt->cur);
1.7 veillard 618: /*
619: * Note : skipping to the next '>' is probably otherkill,
620: * especially in case the '>' is hust missing.
621: *
622: * Otherwise add:
1.16 ! daniel 623: * MOVETO_ENDTAG(ctxt->cur);
1.7 veillard 624: */
625: } else
1.16 ! daniel 626: ctxt->cur++;
1.7 veillard 627:
628: return;
629: }
630:
631: /*
1.3 veillard 632: * xmlParseCDSect: escaped pure raw content.
633: */
1.16 ! daniel 634: CHAR *xmlParseCDSect(xmlParserCtxtPtr ctxt) {
! 635: CHAR *r, *s, *base, *ret;
1.3 veillard 636:
1.16 ! daniel 637: base = ctxt->cur;
! 638: if (!IS_CHAR(ctxt->cur[0])) {
1.7 veillard 639: fprintf(stderr, "CData section not finished : %.20s\n", base);
1.3 veillard 640: return(NULL);
641: }
1.16 ! daniel 642: r = ctxt->cur++;
! 643: if (!IS_CHAR(ctxt->cur[0])) {
1.7 veillard 644: fprintf(stderr, "CData section not finished : %.20s\n", base);
1.3 veillard 645: return(NULL);
646: }
1.16 ! daniel 647: s = ctxt->cur++;
! 648: while (IS_CHAR(ctxt->cur[0]) &&
! 649: ((*r != ']') || (*s != ']') || (ctxt->cur[0] != '>'))) {
! 650: r++;s++;ctxt->cur++;
1.3 veillard 651: }
1.16 ! daniel 652: if (!IS_CHAR(ctxt->cur[0])) {
1.7 veillard 653: fprintf(stderr, "CData section not finished : %.20s\n", base);
1.3 veillard 654: return(NULL);
655: }
1.16 ! daniel 656: ret = xmlStrndup(base, ctxt->cur-base);
! 657:
1.2 veillard 658: return(ret);
659: }
660:
661: /*
662: * xmlParseContent: a content is
663: * (element | PCData | Reference | CDSect | PI | Comment)
664: *
665: * element : starts by '<'
666: * PCData : any CHAR but '&' or '<'
667: * Reference : starts by '&'
668: * CDSect : starts by '<![CDATA['
669: * PI : starts by '<?'
670: */
671:
1.16 ! daniel 672: xmlNodePtr xmlParseContent(xmlParserCtxtPtr ctxt, xmlNodePtr node) {
! 673: CHAR *q, *data = NULL;
1.2 veillard 674: xmlNodePtr ret = NULL;
675:
676: /*
1.3 veillard 677: * First case : a Processing Instruction.
678: */
1.16 ! daniel 679: if ((ctxt->cur[0] == '<') && (ctxt->cur[1] == '?')) {
! 680: xmlParsePI(ctxt);
1.3 veillard 681: }
682: /*
683: * Second case : a CDSection
1.2 veillard 684: */
1.16 ! daniel 685: if ((ctxt->cur[0] == '<') && (ctxt->cur[1] == '!') &&
! 686: (ctxt->cur[2] == '[') && (ctxt->cur[3] == 'C') &&
! 687: (ctxt->cur[4] == 'D') && (ctxt->cur[5] == 'A') &&
! 688: (ctxt->cur[6] == 'T') && (ctxt->cur[7] == 'A') &&
! 689: (ctxt->cur[8] == '[')) {
! 690: ctxt->cur += 9;
! 691: data = xmlParseCDSect(ctxt);
1.3 veillard 692: }
693: /*
694: * Third case : a sub-element.
695: */
1.16 ! daniel 696: else if (ctxt->cur[0] == '<') {
! 697: ret = xmlParseElement(ctxt);
1.3 veillard 698: }
699: /*
700: * Last case, text. Note that References are handled directly.
701: */
702: else {
1.16 ! daniel 703: q = ctxt->cur;
! 704: while (IS_CHAR(ctxt->cur[0]) && (ctxt->cur[0] != '<')) ctxt->cur++;
1.3 veillard 705:
1.16 ! daniel 706: if (!IS_CHAR(ctxt->cur[0])) {
1.7 veillard 707: fprintf(stderr, "Truncated content : %.50s\n", q);
1.3 veillard 708: return(NULL);
709: }
1.14 veillard 710:
711: /*
712: * Do the Entities decoding...
713: */
1.16 ! daniel 714: data = xmlStrdup(xmlDecodeEntities(ctxt->doc, q, ctxt->cur - q));
1.3 veillard 715: }
716:
717: /*
718: * Handle the data if any. If there is no child
719: * add it as content, otherwise create a new node of type text.
720: */
721: if (data != NULL)
722: data = xmlHandleData(data);
723: if (data != NULL) {
724: if (node->childs == NULL)
725: xmlNodeSetContent(node, data);
726: else {
727: ret = xmlNewText(data);
728: }
729: }
1.2 veillard 730:
731: return(ret);
732: }
733:
734: /*
735: * xmlParseElement: parse an XML element
736: */
737:
1.16 ! daniel 738: xmlNodePtr xmlParseElement(xmlParserCtxtPtr ctxt) {
1.2 veillard 739: xmlNodePtr ret, child;
1.16 ! daniel 740: CHAR *openTag = ctxt->cur;
! 741: CHAR *closeTag = ctxt->cur;
1.2 veillard 742:
1.16 ! daniel 743: ret = xmlParseStartTag(ctxt);
1.3 veillard 744: if (ret == NULL) {
745: return(NULL);
746: }
1.2 veillard 747:
748: /*
749: * Check for an Empty Element.
750: */
1.16 ! daniel 751: if ((ctxt->cur[0] == '/') && (ctxt->cur[1] == '>')) {
! 752: ctxt->cur += 2;
1.2 veillard 753: return(ret);
754: }
1.16 ! daniel 755: if (ctxt->cur[0] == '>') ctxt->cur++;
1.2 veillard 756: else {
1.16 ! daniel 757: fprintf(stderr, "Couldn't find end of Start Tag %.30s\n", openTag);
! 758: return(NULL);
1.2 veillard 759: }
760:
761: /*
762: * Parse the content of the element:
763: * (element | PCData | Reference | CDSect | PI | Comment) *
764: *
765: * element : starts by '<'
766: * PCData : any CHAR but '&' or '<'
767: * Reference : starts by '&'
768: * CDSect : starts by '<![CDATA['
769: * PI : starts by '<?'
770: *
771: * The loop stops upon detection of an end of tag '</'
772: */
1.16 ! daniel 773: while ((IS_CHAR(ctxt->cur[0])) &&
! 774: ((ctxt->cur[0] != '<') || (ctxt->cur[1] != '/'))) {
! 775: child = xmlParseContent(ctxt, ret);
1.2 veillard 776: if (child != NULL)
777: xmlAddChild(ret, child);
778: }
1.16 ! daniel 779: if (!IS_CHAR(ctxt->cur[0])) {
! 780: fprintf(stderr, "Premature end of data in tag %.30s\n", openTag);
! 781: return(NULL);
1.2 veillard 782: }
783:
784: /*
785: * parse the end of tag : '</' has been detected.
786: */
1.16 ! daniel 787: ctxt->cur += 2;
! 788: if (ctxt->cur[0] == '>') ctxt->cur++; /* simplified closing </> */
1.2 veillard 789: else {
1.7 veillard 790: CHAR *endTag;
791: xmlDtdPtr endDtd;
792:
1.16 ! daniel 793: xmlParseEndTag(ctxt, &endDtd, &endTag);
1.7 veillard 794:
1.2 veillard 795: /*
1.7 veillard 796: * Check that the Name in the ETag is the same as in the STag.
1.2 veillard 797: */
1.7 veillard 798: if (endDtd != ret->dtd) {
799: fprintf(stderr, "Start and End tags don't use the same DTD:\n");
800: fprintf(stderr, "\t%.30s\n\t%.30s\n", openTag, closeTag);
801: }
802: if (strcmp(ret->name, endTag)) {
803: fprintf(stderr, "Start and End tags don't use the same name:\n");
804: fprintf(stderr, "\t%.30s\n\t%.30s\n", openTag, closeTag);
805: }
1.2 veillard 806: }
807:
808: return(ret);
809: }
810:
811: /*
1.1 veillard 812: * xmlParseXMLDecl: parse an XML declaration header
813: */
814:
1.16 ! daniel 815: void xmlParseXMLDecl(xmlParserCtxtPtr ctxt) {
1.1 veillard 816: CHAR *version;
817:
818: /*
819: * We know that '<?XML' is here.
820: */
1.16 ! daniel 821: ctxt->cur += 5;
1.1 veillard 822:
823: /*
824: * Parse the version info
825: */
1.16 ! daniel 826: SKIP_BLANKS(ctxt->cur);
1.1 veillard 827:
828: /*
829: * We should have 'version=' here !
830: */
1.16 ! daniel 831: if ((ctxt->cur[0] == 'v') && (ctxt->cur[1] == 'e') &&
! 832: (ctxt->cur[2] == 'r') && (ctxt->cur[3] == 's') &&
! 833: (ctxt->cur[4] == 'i') && (ctxt->cur[5] == 'o') &&
! 834: (ctxt->cur[6] == 'n') && (ctxt->cur[7] == '=')) {
! 835: ctxt->cur += 8;
! 836: version = xmlParseQuotedString(ctxt);
1.1 veillard 837: if (version == NULL)
1.16 ! daniel 838: ctxt->doc = xmlNewDoc(XML_DEFAULT_VERSION);
1.1 veillard 839: else {
1.16 ! daniel 840: ctxt->doc = xmlNewDoc(version);
1.8 veillard 841: free(version);
1.1 veillard 842: }
843: } else {
1.16 ! daniel 844: ctxt->doc = xmlNewDoc(XML_DEFAULT_VERSION);
1.1 veillard 845: }
846:
847: /*
1.14 veillard 848: * We should check for Required Markup Declaration TODO !!!!
1.1 veillard 849: */
1.16 ! daniel 850: MOVETO_ENDTAG(ctxt->cur);
! 851: ctxt->cur++;
1.1 veillard 852:
853: }
854:
855: /*
856: * xmlParseMisc: parse an XML Misc optionnal field.
857: * (Comment | PI | S)*
858: */
859:
1.16 ! daniel 860: void xmlParseMisc(xmlParserCtxtPtr ctxt) {
! 861: while (((ctxt->cur[0] == '<') && (ctxt->cur[1] == '?')) ||
! 862: ((ctxt->cur[0] == '<') && (ctxt->cur[1] == '!') &&
! 863: (ctxt->cur[2] == '-') && (ctxt->cur[2] == '-')) ||
! 864: IS_BLANK(ctxt->cur[0])) {
! 865: if ((ctxt->cur[0] == '<') && (ctxt->cur[1] == '?')) {
! 866: xmlParsePI(ctxt);
! 867: } else if (IS_BLANK(ctxt->cur[0])) {
! 868: ctxt->cur++;
1.1 veillard 869: } else
1.16 ! daniel 870: xmlParserSkipComment(ctxt);
1.1 veillard 871: }
872: }
873:
874: /*
1.16 ! daniel 875: * xmlParseDocument : parse an XML document and build a tree.
1.1 veillard 876: */
877:
1.16 ! daniel 878: int xmlParseDocument(xmlParserCtxtPtr ctxt) {
1.14 veillard 879: /*
880: * We should check for encoding here and plug-in some
881: * conversion code TODO !!!!
882: */
1.1 veillard 883:
884: /*
885: * Wipe out everything which is before the first '<'
886: */
1.16 ! daniel 887: SKIP_BLANKS(ctxt->cur);
1.1 veillard 888:
889: /*
890: * Check for the XMLDecl in the Prolog.
891: */
1.16 ! daniel 892: if ((ctxt->cur[0] == '<') && (ctxt->cur[1] == '?') &&
! 893: (ctxt->cur[2] == 'X') && (ctxt->cur[3] == 'M') &&
! 894: (ctxt->cur[4] == 'L')) {
! 895: xmlParseXMLDecl(ctxt);
1.1 veillard 896: /* SKIP_EOL(cur); */
1.16 ! daniel 897: SKIP_BLANKS(ctxt->cur);
1.1 veillard 898: } else {
1.16 ! daniel 899: ctxt->doc = xmlNewDoc(XML_DEFAULT_VERSION);
1.1 veillard 900: }
901:
902: /*
903: * The Misc part of the Prolog
904: * (Comment | PI | S) *
905: */
1.16 ! daniel 906: xmlParseMisc(ctxt);
1.1 veillard 907:
908: /*
1.2 veillard 909: * Time to start parsing
1.1 veillard 910: */
1.16 ! daniel 911: ctxt->doc->root = xmlParseElement(ctxt);
! 912:
! 913: return(0);
! 914: }
! 915:
! 916: /*
! 917: * xmlParseDoc : parse an XML in-memory document and build a tree.
! 918: */
! 919:
! 920: xmlDocPtr xmlParseDoc(CHAR *cur) {
! 921: xmlDocPtr ret;
! 922: xmlParserCtxtPtr ctxt;
! 923:
! 924: if (cur == NULL) return(NULL);
1.1 veillard 925:
1.16 ! daniel 926: ctxt = (xmlParserCtxtPtr) malloc(sizeof(xmlParserCtxt));
! 927: if (ctxt == NULL) {
! 928: perror("malloc");
! 929: return(NULL);
! 930: }
! 931:
! 932: ctxt->filename = NULL;
! 933: ctxt->base = cur;
! 934: ctxt->cur = cur;
! 935: ctxt->line = 1;
! 936: ctxt->col = 1;
! 937: ctxt->doc = NULL;
! 938:
! 939: xmlParseDocument(ctxt);
! 940: ret = ctxt->doc;
! 941: if (ctxt->filename != NULL)
! 942: free(ctxt->filename);
! 943: free(ctxt);
! 944:
1.1 veillard 945: return(ret);
946: }
947:
1.9 httpng 948: /*
949: * xmlParseFile : parse an XML file and build a tree.
950: */
951:
952: xmlDocPtr xmlParseFile(const char *filename) {
953: xmlDocPtr ret;
954: int input;
955: int res;
956: struct stat buf;
957: char *buffer;
1.16 ! daniel 958: xmlParserCtxtPtr ctxt;
1.9 httpng 959:
1.11 veillard 960: res = stat(filename, &buf);
1.9 httpng 961: if (res < 0) return(NULL);
962:
963: buffer = malloc(buf.st_size + 100);
964: if (buffer == NULL) {
965: perror("malloc");
966: return(NULL);
967: }
968:
969: memset(buffer, 0, sizeof(buffer));
970: input = open (filename, O_RDONLY);
971: if (input < 0) {
972: fprintf (stderr, "Cannot read file %s :\n", filename);
973: perror ("open failed");
974: return(NULL);
975: }
976: res = read(input, buffer, buf.st_size);
977: if (res < 0) {
978: fprintf (stderr, "Cannot read file %s :\n", filename);
979: perror ("read failed");
980: return(NULL);
981: }
982: close(input);
983:
1.16 ! daniel 984: ctxt = (xmlParserCtxtPtr) malloc(sizeof(xmlParserCtxt));
! 985: if (ctxt == NULL) {
! 986: perror("malloc");
! 987: return(NULL);
! 988: }
1.9 httpng 989: buffer[buf.st_size] = '\0';
1.16 ! daniel 990:
! 991: ctxt->filename = strdup(filename);
! 992: ctxt->base = buffer;
! 993: ctxt->cur = buffer;
! 994: ctxt->line = 1;
! 995: ctxt->col = 1;
! 996: ctxt->doc = NULL;
! 997:
! 998: xmlParseDocument(ctxt);
! 999: ret = ctxt->doc;
1.9 httpng 1000: free(buffer);
1.16 ! daniel 1001: if (ctxt->filename != NULL)
! 1002: free(ctxt->filename);
! 1003: free(ctxt);
! 1004:
1.9 httpng 1005: return(ret);
1006: }
Webmaster