Annotation of XML/parser.c, revision 1.17
1.1 veillard 1: /*
1.3 veillard 2: * parser.c : an XML 1.0 non-verifying parser
1.15 veillard 3: *
4: * See Copyright for the status of this software.
5: *
1.17 ! daniel 6: * $Id: parser.c,v 1.16 1998/06/19 04:48:27 daniel Exp $
1.1 veillard 7: */
8:
1.9 httpng 9: #include <config.h>
1.1 veillard 10: #include <stdio.h>
11: #include <ctype.h>
1.14 veillard 12: #include <string.h> /* for memset() only */
1.1 veillard 13: #include <malloc.h>
1.9 httpng 14: #include <sys/stat.h>
15: #ifdef HAVE_FCNTL_H
16: #include <fcntl.h>
17: #endif
1.10 httpng 18: #ifdef HAVE_UNISTD_H
19: #include <unistd.h>
20: #endif
1.1 veillard 21:
1.14 veillard 22: #include "tree.h"
1.1 veillard 23: #include "parser.h"
1.14 veillard 24: #include "entities.h"
1.1 veillard 25:
26: /*
27: * A few macros needed to help building the parser.
28: */
29:
#ifdef UNICODE
/*
 * UNICODE version of the macros. Incomplete now TODO !!!!
 *
 * Every macro below evaluates its argument several times: never pass an
 * expression with side effects.
 *
 * SKIP_BLANKS is intentionally not defined in this branch: the file
 * defines it once, after the #endif, in terms of IS_BLANK.  Defining it
 * here as well produced a conflicting macro redefinition.
 */
#define IS_CHAR(c)                                                      \
    (((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d) ||                 \
     (((c) >= 0x20) && ((c) != 0xFFFE) && ((c) != 0xFFFF)))

/* I'm too lazy to complete this one TODO !!!! */
#define IS_BASECHAR(c)                                                  \
    ((((c) >= 0x41) && ((c) <= 0x5a)) ||                                \
     (((c) >= 0x61) && ((c) <= 0x7a)) ||                                \
     ((c) == 0xaa) ||                                                   \
     (((c) >= 0xc0) && ((c) <= 0xd6)) ||                                \
     (((c) >= 0xd8) && ((c) <= 0xf6)) ||                                \
     (((c) >= 0xf8) && ((c) <= 0xff)) ||                                \
     ((c) == 0xba))

/* I'm too lazy to complete this one TODO !!!! */
#define IS_DIGIT(c) (((c) >= 0x30) && ((c) <= 0x39))

/* I'm too lazy to complete this one TODO !!!! */
#define IS_COMBINING(c) 0

#define IS_IGNORABLE(c)                                                 \
    ((((c) >= 0x200c) && ((c) <= 0x200f)) ||                            \
     (((c) >= 0x202a) && ((c) <= 0x202e)) ||                            \
     (((c) >= 0x206a) && ((c) <= 0x206f)) ||                            \
     ((c) == 0xfeff))

#define IS_EXTENDER(c)                                                  \
    (((c) == 0xb7) || ((c) == 0x2d0) || ((c) == 0x2d1) ||               \
     ((c) == 0x387) || ((c) == 0x640) || ((c) == 0xe46) ||              \
     ((c) == 0xec6) || ((c) == 0x3005) ||                               \
     (((c) >= 0x3031) && ((c) <= 0x3035)) ||                            \
     (((c) >= 0x309b) && ((c) <= 0x309e)) ||                            \
     (((c) >= 0x30fc) && ((c) <= 0x30fe)) ||                            \
     (((c) >= 0xff70) && ((c) <= 0xff9e)) ||                            \
     ((c) == 0xff9f))

#define IS_IDEOGRAPHIC(c)                                               \
    ((((c) >= 0x4e00) && ((c) <= 0x9fa5)) ||                            \
     (((c) >= 0xf900) && ((c) <= 0xfa2d)) ||                            \
     (((c) >= 0x3021) && ((c) <= 0x3029)) ||                            \
     ((c) == 0x3007))

#define IS_LETTER(c) (IS_BASECHAR(c) || IS_IDEOGRAPHIC(c))

/* White space as in the XML "S" production: space, tab, LF and CR. */
#define IS_BLANK(c)                                                     \
    (((c) == 0x20) || ((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d))
#else
/*
 * 8bits / ASCII version of the macros.
 *
 * Every macro below evaluates its argument several times: never pass an
 * expression with side effects.
 */
#define IS_CHAR(c)                                                      \
    (((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d) || ((c) >= 0x20))

/*
 * The 0xaa entry used to be written as the range 0xaa..0x5b, which is
 * empty and therefore never matched; it is now a single-value test like
 * the 0xba entry.
 */
#define IS_BASECHAR(c)                                                  \
    ((((c) >= 0x41) && ((c) <= 0x5a)) ||                                \
     (((c) >= 0x61) && ((c) <= 0x7a)) ||                                \
     ((c) == 0xaa) ||                                                   \
     (((c) >= 0xc0) && ((c) <= 0xd6)) ||                                \
     (((c) >= 0xd8) && ((c) <= 0xf6)) ||                                \
     (((c) >= 0xf8) && ((c) <= 0xff)) ||                                \
     ((c) == 0xba))

#define IS_DIGIT(c) (((c) >= 0x30) && ((c) <= 0x39))

#define IS_LETTER(c) IS_BASECHAR(c)

#define IS_COMBINING(c) 0

#define IS_IGNORABLE(c) 0

#define IS_EXTENDER(c) ((c) == 0xb7)

/* White space as in the XML "S" production: space, tab, LF and CR. */
#define IS_BLANK(c)                                                     \
    (((c) == 0x20) || ((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d))
#endif
112:
113:
/*
 * Cursor helpers.  Each one takes a pointer lvalue and advances it in
 * place.  They are wrapped in do { } while (0) so that "MACRO(p);" is a
 * single statement, and the argument is parenthesized at every use.
 */

/*
 * SKIP_EOL: step over one end-of-line sequence: CR, LF, CR LF or LF CR.
 * CR is 0x0d and LF is 0x0a; the previous version compared against 0x13
 * and 0x10 (the decimal codes written as hexadecimal) and never matched.
 */
#define SKIP_EOL(p)                                                     \
    do {                                                                \
        if (*(p) == 0x0d) {                                             \
            (p)++;                                                      \
            if (*(p) == 0x0a) (p)++;                                    \
        } else if (*(p) == 0x0a) {                                      \
            (p)++;                                                      \
            if (*(p) == 0x0d) (p)++;                                    \
        }                                                               \
    } while (0)

/* SKIP_BLANKS: advance p while it points at a blank (see IS_BLANK). */
#define SKIP_BLANKS(p)                                                  \
    do { while (IS_BLANK(*(p))) (p)++; } while (0)

/* MOVETO_ENDTAG: advance p up to the next '>' or the end of the input. */
#define MOVETO_ENDTAG(p)                                                \
    do { while (IS_CHAR(*(p)) && (*(p) != '>')) (p)++; } while (0)

/* MOVETO_STARTTAG: advance p up to the next '<' or the end of the input. */
#define MOVETO_STARTTAG(p)                                              \
    do { while (IS_CHAR(*(p)) && (*(p) != '<')) (p)++; } while (0)
126:
127: /*
1.3 veillard 128: * Forward declaration for recursive behaviour.
129: */
1.16 daniel 130: xmlNodePtr xmlParseElement(xmlParserCtxtPtr ctxt);
1.3 veillard 131:
132: /*
133: * xmlHandleData : this routine represent's the specific application
134: * behaviour when reading a piece of text.
135: *
136: * For example in WebDav, any piece made only of blanks is eliminated
137: */
138:
139: CHAR *xmlHandleData(CHAR *in) {
140: CHAR *cur;
141:
142: if (in == NULL) return(NULL);
143: cur = in;
144: while (IS_CHAR(*cur)) {
145: if (!IS_BLANK(*cur)) goto not_blank;
146: cur++;
147: }
148: free(in);
149: return(NULL);
150:
151: not_blank:
152: return(in);
153: }
154:
155: /*
1.1 veillard 156: * xmlStrndup : a strdup for array of CHAR's
157: */
158:
1.6 httpng 159: CHAR *xmlStrndup(const CHAR *cur, int len) {
1.1 veillard 160: CHAR *ret = malloc((len + 1) * sizeof(CHAR));
161:
162: if (ret == NULL) {
163: fprintf(stderr, "malloc of %d byte failed\n",
164: (len + 1) * sizeof(CHAR));
165: return(NULL);
166: }
167: memcpy(ret, cur, len * sizeof(CHAR));
168: ret[len] = 0;
169: return(ret);
170: }
171:
172: /*
173: * xmlStrdup : a strdup for CHAR's
174: */
175:
1.6 httpng 176: CHAR *xmlStrdup(const CHAR *cur) {
177: const CHAR *p = cur;
1.1 veillard 178:
179: while (IS_CHAR(*p)) p++;
180: return(xmlStrndup(cur, p - cur));
181: }
182:
183: /*
1.14 veillard 184: * xmlStrcmp : a strcmp for CHAR's
185: */
186:
187: int xmlStrcmp(const CHAR *str1, const CHAR *str2) {
188: register int tmp;
189:
190: do {
191: tmp = *str1++ - *str2++;
192: if (tmp != 0) return(tmp);
193: } while ((*str1 != 0) && (*str2 != 0));
194: return (*str1 - *str2);
195: }
196:
197: /*
198: * xmlStrncmp : a strncmp for CHAR's
199: */
200:
201: int xmlStrncmp(const CHAR *str1, const CHAR *str2, int len) {
202: register int tmp;
203:
204: if (len <= 0) return(0);
205: do {
206: tmp = *str1++ - *str2++;
207: if (tmp != 0) return(tmp);
208: len--;
209: if (len <= 0) return(0);
210: } while ((*str1 != 0) && (*str2 != 0));
211: return (*str1 - *str2);
212: }
213:
214: /*
215: * xmlStrchr : a strchr for CHAR's
216: */
217:
218: CHAR *xmlStrchr(const CHAR *str, CHAR val) {
219: while (*str != 0) {
220: if (*str == val) return((CHAR *) str);
221: str++;
222: }
223: return(NULL);
224: }
225:
226: /*
1.1 veillard 227: * xmlParseName : parse an XML name.
228: */
229:
1.16 daniel 230: CHAR *xmlParseName(xmlParserCtxtPtr ctxt) {
1.17 ! daniel 231: const CHAR *q;
! 232: CHAR *ret = NULL;
1.1 veillard 233:
234: /*
1.3 veillard 235: * Name ::= (Letter | '_') (NameChar)*
1.1 veillard 236: */
1.16 daniel 237: if (!IS_LETTER(ctxt->cur[0]) && (ctxt->cur[0] != '_')) return(NULL);
238: q = ctxt->cur++;
239: while ((IS_LETTER(ctxt->cur[0])) || (IS_DIGIT(ctxt->cur[0])) ||
240: (ctxt->cur[0] == '.') || (ctxt->cur[0] == '-') || (ctxt->cur[0] == '_') ||
241: (ctxt->cur[0] == ':') ||
242: (IS_COMBINING(ctxt->cur[0])) || (IS_IGNORABLE(ctxt->cur[0])) ||
243: (IS_EXTENDER(ctxt->cur[0])))
244: ctxt->cur++;
1.3 veillard 245:
1.16 daniel 246: ret = xmlStrndup(q, ctxt->cur - q);
1.1 veillard 247:
1.3 veillard 248: return(ret);
1.1 veillard 249: }
250:
251: /*
252: * Parse and return a string between quotes or doublequotes
253: */
1.16 daniel 254: CHAR *xmlParseQuotedString(xmlParserCtxtPtr ctxt) {
1.1 veillard 255: CHAR *ret = NULL;
1.17 ! daniel 256: const CHAR *q;
1.1 veillard 257:
1.16 daniel 258: if (ctxt->cur[0] == '"') {
259: ctxt->cur++;
260: q = ctxt->cur;
261: while (IS_CHAR(ctxt->cur[0]) && (ctxt->cur[0] != '"')) ctxt->cur++;
262: if (ctxt->cur[0] != '"')
1.7 veillard 263: fprintf(stderr, "String not closed \"%.50s\n", q);
1.1 veillard 264: else {
1.16 daniel 265: ret = xmlStrndup(q, ctxt->cur - q);
266: ctxt->cur++;
1.1 veillard 267: }
1.16 daniel 268: } else if (ctxt->cur[0] == '\''){
269: ctxt->cur++;
270: q = ctxt->cur;
271: while (IS_CHAR(ctxt->cur[0]) && (ctxt->cur[0] != '\'')) ctxt->cur++;
272: if (ctxt->cur[0] != '\'')
1.7 veillard 273: fprintf(stderr, "String not closed '%.50s\n", q);
1.1 veillard 274: else {
1.16 daniel 275: ret = xmlStrndup(q, ctxt->cur - q);
276: ctxt->cur++;
1.1 veillard 277: }
278: }
279: return(ret);
280: }
281:
282: /*
1.3 veillard 283: * Skip an XML (SGML) comment <!-- .... -->
1.16 daniel 284: *
285: * TODO !!!! Save the comment in the tree !!!
1.3 veillard 286: */
1.16 daniel 287: void xmlParserSkipComment(xmlParserCtxtPtr ctxt) {
1.17 ! daniel 288: const CHAR *q, *start;
! 289: const CHAR *r;
1.3 veillard 290:
291: /*
292: * An extra check may avoid errors and isn't that costly !
293: */
1.16 daniel 294: if ((ctxt->cur[0] != '<') || (ctxt->cur[1] != '!') ||
295: (ctxt->cur[2] != '-') || (ctxt->cur[3] != '-')) return;
1.3 veillard 296:
1.16 daniel 297: ctxt->cur += 4;
298: start = q = ctxt->cur;
299: ctxt->cur++;
300: r = ctxt->cur;
301: ctxt->cur++;
302: while (IS_CHAR(ctxt->cur[0]) &&
303: ((ctxt->cur[0] == ':') || (ctxt->cur[0] != '>') ||
304: (*r != '-') || (*q != '-'))) {
305: ctxt->cur++;r++;q++;
1.3 veillard 306: }
1.16 daniel 307: if (!IS_CHAR(ctxt->cur[0])) {
1.7 veillard 308: fprintf(stderr, "Comment not terminated <!--%.50s\n", start);
1.16 daniel 309: ctxt->cur = start; /* !!! We shouldn't really try to recover !!! */
1.3 veillard 310: } else {
1.16 daniel 311: ctxt->cur++;
1.3 veillard 312: }
313: }
314:
315: /*
1.13 veillard 316: * xmlParseNamespace: parse specific '<?namespace ...' constructs.
1.1 veillard 317: */
318:
1.16 daniel 319: void xmlParseNamespace(xmlParserCtxtPtr ctxt) {
1.1 veillard 320: CHAR *href = NULL;
321: CHAR *AS = NULL;
1.3 veillard 322: int garbage = 0;
1.1 veillard 323:
324: /*
325: * We know that 'namespace' is here.
326: */
1.16 daniel 327: ctxt->cur += 9;
328: SKIP_BLANKS(ctxt->cur);
1.1 veillard 329:
1.16 daniel 330: while (IS_CHAR(ctxt->cur[0]) && (ctxt->cur[0] != '>')) {
1.1 veillard 331: /*
332: * We can have 'href' or 'AS' attributes.
333: */
1.16 daniel 334: if ((ctxt->cur[0] == 'h') && (ctxt->cur[1] == 'r') &&
335: (ctxt->cur[2] == 'e') && (ctxt->cur[3] == 'f')) {
1.3 veillard 336: garbage = 0;
1.16 daniel 337: ctxt->cur += 4;
338: SKIP_BLANKS(ctxt->cur);
1.1 veillard 339:
1.16 daniel 340: if (ctxt->cur[0] != '=') continue;
341: ctxt->cur++;
342: SKIP_BLANKS(ctxt->cur);
343:
344: href = xmlParseQuotedString(ctxt);
345: SKIP_BLANKS(ctxt->cur);
346: } else if ((ctxt->cur[0] == 'A') && (ctxt->cur[1] == 'S')) {
1.3 veillard 347: garbage = 0;
1.16 daniel 348: ctxt->cur += 2;
349: SKIP_BLANKS(ctxt->cur);
1.1 veillard 350:
1.16 daniel 351: if (ctxt->cur[0] != '=') continue;
352: ctxt->cur++;
353: SKIP_BLANKS(ctxt->cur);
354:
355: AS = xmlParseQuotedString(ctxt);
356: SKIP_BLANKS(ctxt->cur);
357: } else if ((ctxt->cur[0] == '?') && (ctxt->cur[1] == '>')) {
1.3 veillard 358: garbage = 0;
1.16 daniel 359: ctxt->cur ++;
1.1 veillard 360: } else {
1.3 veillard 361: /*
362: * Found garbage when parsing the namespace
363: */
364: if (!garbage) fprintf(stderr,
1.13 veillard 365: "\nxmlParseNamespace found garbage: ");
1.16 daniel 366: fprintf(stderr, "%c", ctxt->cur[0]);
367: ctxt->cur++;
1.1 veillard 368: }
369: }
370:
1.16 daniel 371: MOVETO_ENDTAG(ctxt->cur);
372: ctxt->cur++;
1.1 veillard 373:
374: /*
375: * Register the DTD.
376: */
377: if (href != NULL)
1.16 daniel 378: xmlNewDtd(ctxt->doc, href, AS);
1.1 veillard 379:
1.8 veillard 380: if (AS != NULL) free(AS);
381: if (href != NULL) free(href);
1.1 veillard 382: }
383:
384: /*
1.3 veillard 385: * xmlParsePI: parse an XML Processing Instruction.
386: */
387:
1.16 daniel 388: void xmlParsePI(xmlParserCtxtPtr ctxt) {
389: if ((ctxt->cur[0] == '<') && (ctxt->cur[1] == '?')) {
1.3 veillard 390: /*
391: * this is a Processing Instruction.
392: */
1.16 daniel 393: ctxt->cur += 2;
1.3 veillard 394:
395: /*
396: * Special for WebDav, support for the Processing Instruction
397: * '<?namespace ...' contruct in the header of the XML document.
398: */
1.16 daniel 399: if ((ctxt->cur[0] == 'n') && (ctxt->cur[1] == 'a') &&
400: (ctxt->cur[2] == 'm') && (ctxt->cur[3] == 'e') &&
401: (ctxt->cur[4] == 's') && (ctxt->cur[5] == 'p') &&
402: (ctxt->cur[6] == 'a') && (ctxt->cur[7] == 'c') &&
403: (ctxt->cur[8] == 'e')) {
404: xmlParseNamespace(ctxt);
1.3 veillard 405: } else {
406: /* Unknown PI, ignore it ! */
1.16 daniel 407: fprintf(stderr, "xmlParsePI : skipping unknown PI %30s\n",
408: ctxt->cur);
409: MOVETO_ENDTAG(ctxt->cur);
410: ctxt->cur++;
1.3 veillard 411: }
412: }
413: }
414:
415: /*
416: * xmlParseAttribute: parse a start of tag.
417: *
418: * Attribute ::= Name Eq AttValue
419: */
420:
1.16 daniel 421: void xmlParseAttribute(xmlParserCtxtPtr ctxt, xmlNodePtr node) {
1.17 ! daniel 422: const CHAR *q;
! 423: CHAR *name, *value = NULL;
1.3 veillard 424:
1.16 daniel 425: if (!IS_LETTER(ctxt->cur[0]) && (ctxt->cur[0] != '_')) {
1.3 veillard 426: return;
427: }
1.16 daniel 428: q = ctxt->cur++;
429: while ((IS_LETTER(ctxt->cur[0])) || (IS_DIGIT(ctxt->cur[0])) ||
430: (ctxt->cur[0] == '.') || (ctxt->cur[0] == '-') ||
431: (ctxt->cur[0] == '_') || (ctxt->cur[0] == ':') ||
432: (IS_COMBINING(ctxt->cur[0])) || (IS_IGNORABLE(ctxt->cur[0])) ||
433: (IS_EXTENDER(ctxt->cur[0])))
434: ctxt->cur++;
435: name = xmlStrndup(q, ctxt->cur - q);
1.3 veillard 436:
437: /*
438: * We should have the equal, we are laxist here and allow attributes
439: * without values and extra spaces.
440: */
1.16 daniel 441: SKIP_BLANKS(ctxt->cur);
442: if (ctxt->cur[0] == '=') {
443: ctxt->cur++;
444: SKIP_BLANKS(ctxt->cur);
445: if ((ctxt->cur[0] != '\'') && (ctxt->cur[0] != '"')) {
1.7 veillard 446: fprintf(stderr, "Quotes were expected for attribute value %.20s\n",
1.3 veillard 447: q);
448: } else
1.16 daniel 449: value = xmlParseQuotedString(ctxt);
1.3 veillard 450: }
451:
452: /*
453: * Add the attribute to the node.
454: */
1.17 ! daniel 455: if (name != NULL) {
1.3 veillard 456: xmlNewProp(node, name, value);
1.17 ! daniel 457: free(name);
! 458: }
! 459: if ( value != NULL )
! 460: free(value);
1.3 veillard 461: }
462:
463: /*
1.2 veillard 464: * xmlParseStartTag: parse a start of tag.
465: */
466:
1.16 daniel 467: xmlNodePtr xmlParseStartTag(xmlParserCtxtPtr ctxt) {
1.17 ! daniel 468: const CHAR *q;
! 469: CHAR *ns, *name;
1.3 veillard 470: xmlDtdPtr dtd = NULL;
1.2 veillard 471: xmlNodePtr ret = NULL;
472:
473: /*
1.3 veillard 474: * Theorically one should just parse a Name, but with the addition
475: * of the namespace needed for WebDav, it's a bit more complicated
476: * since the element name may be prefixed by a namespace prefix.
477: *
478: * QName ::= (NSPart ':')? LocalPart
479: * NSPart ::= Name
480: * LocalPart ::= Name
481: * STag ::= '<' QName (S Attribute)* S? '>'
482: *
483: * instead of :
484: *
485: * STag ::= '<' QName (S Attribute)* S? '>'
1.2 veillard 486: */
1.16 daniel 487: if (ctxt->cur[0] != '<') return(NULL);
488: ctxt->cur++;
1.3 veillard 489:
1.16 daniel 490: if (!IS_LETTER(ctxt->cur[0]) && (ctxt->cur[0] != '_')) return(NULL);
491: q = ctxt->cur++;
492: while ((IS_LETTER(ctxt->cur[0])) || (IS_DIGIT(ctxt->cur[0])) ||
493: (ctxt->cur[0] == '.') || (ctxt->cur[0] == '-') ||
494: (ctxt->cur[0] == '_') ||
495: (IS_COMBINING(ctxt->cur[0])) || (IS_IGNORABLE(ctxt->cur[0])) ||
496: (IS_EXTENDER(ctxt->cur[0])))
497: ctxt->cur++;
1.3 veillard 498:
1.16 daniel 499: if (ctxt->cur[0] == ':') {
500: ns = xmlStrndup(q, ctxt->cur - q);
1.3 veillard 501:
1.16 daniel 502: ctxt->cur++; /* skip the column */
503: if (!IS_LETTER(ctxt->cur[0]) && (ctxt->cur[0] != '_')) {
1.7 veillard 504: fprintf(stderr,
505: "Start tag : no element name after namespace identifier %.20s\n",
1.3 veillard 506: q);
507: free(ns);
508: return(NULL);
509: }
1.16 daniel 510: q = ctxt->cur++;
511: while ((IS_LETTER(ctxt->cur[0])) || (IS_DIGIT(ctxt->cur[0])) ||
512: (ctxt->cur[0] == '.') || (ctxt->cur[0] == '-') ||
513: (ctxt->cur[0] == '_') || (ctxt->cur[0] == ':') ||
514: (IS_COMBINING(ctxt->cur[0])) || (IS_IGNORABLE(ctxt->cur[0])) ||
515: (IS_EXTENDER(ctxt->cur[0])))
516: ctxt->cur++;
517: name = xmlStrndup(q, ctxt->cur - q);
1.3 veillard 518:
519: /*
520: * Search the DTD associated to ns.
521: */
1.16 daniel 522: dtd = xmlSearchDtd(ctxt->doc, ns);
1.3 veillard 523: if (dtd == NULL)
1.7 veillard 524: fprintf(stderr, "Start tag : Couldn't find namespace %s\n", ns);
1.3 veillard 525: free(ns);
526: } else
1.16 daniel 527: name = xmlStrndup(q, ctxt->cur - q);
1.3 veillard 528:
529: ret = xmlNewNode(dtd, name, NULL);
1.2 veillard 530:
1.3 veillard 531: /*
532: * Now parse the attributes, it ends up with the ending
533: *
534: * (S Attribute)* S?
535: */
1.16 daniel 536: SKIP_BLANKS(ctxt->cur);
537: while ((IS_CHAR(ctxt->cur[0])) &&
538: (ctxt->cur[0] != '>') &&
539: ((ctxt->cur[0] != '/') || (ctxt->cur[1] != '>'))) {
540: if (IS_LETTER(ctxt->cur[0]) || (ctxt->cur[0] == '_'))
541: xmlParseAttribute(ctxt, ret);
1.3 veillard 542: else {
1.14 veillard 543: /* We should warn TODO !!! */
1.16 daniel 544: ctxt->cur++;
1.3 veillard 545: }
1.16 daniel 546: SKIP_BLANKS(ctxt->cur);
1.3 veillard 547: }
548:
549: return(ret);
550: }
551:
552: /*
1.7 veillard 553: * xmlParseEndTag: parse an end of tag, note that the '</' part has
554: * already been read.
555: */
556:
1.16 daniel 557: void xmlParseEndTag(xmlParserCtxtPtr ctxt, xmlDtdPtr *dtdPtr, CHAR **tagPtr) {
1.17 ! daniel 558: const CHAR *q;
! 559: CHAR *ns, *name;
1.7 veillard 560: xmlDtdPtr dtd = NULL;
561:
562: *dtdPtr = NULL;
563: *tagPtr = NULL;
564:
565: /*
566: * Theorically one should just parse a Name, but with the addition
567: * of the namespace needed for WebDav, it's a bit more complicated
568: * since the element name may be prefixed by a namespace prefix.
569: *
570: * QName ::= (NSPart ':')? LocalPart
571: * NSPart ::= Name
572: * LocalPart ::= Name
573: * ETag ::= '</' QName S? '>'
574: *
575: * instead of :
576: *
577: * ETag ::= '</' Name S? '>'
578: */
1.16 daniel 579: if (!IS_LETTER(ctxt->cur[0]) && (ctxt->cur[0] != '_')) return;
580: q = ctxt->cur++;
581: while ((IS_LETTER(ctxt->cur[0])) || (IS_DIGIT(ctxt->cur[0])) ||
582: (ctxt->cur[0] == '.') || (ctxt->cur[0] == '-') ||
583: (ctxt->cur[0] == '_') ||
584: (IS_COMBINING(ctxt->cur[0])) || (IS_IGNORABLE(ctxt->cur[0])) ||
585: (IS_EXTENDER(ctxt->cur[0])))
586: ctxt->cur++;
1.7 veillard 587:
1.16 daniel 588: if (ctxt->cur[0] == ':') {
589: ns = xmlStrndup(q, ctxt->cur - q);
1.7 veillard 590:
1.16 daniel 591: ctxt->cur++; /* skip the column */
592: if (!IS_LETTER(ctxt->cur[0]) && (ctxt->cur[0] != '_')) {
1.7 veillard 593: fprintf(stderr,
594: "End tag : no element name after namespace identifier %.20s\n",
595: q);
596: free(ns);
597: return;
598: }
1.16 daniel 599: q = ctxt->cur++;
600: while ((IS_LETTER(ctxt->cur[0])) || (IS_DIGIT(ctxt->cur[0])) ||
601: (ctxt->cur[0] == '.') || (ctxt->cur[0] == '-') ||
602: (ctxt->cur[0] == '_') || (ctxt->cur[0] == ':') ||
603: (IS_COMBINING(ctxt->cur[0])) || (IS_IGNORABLE(ctxt->cur[0])) ||
604: (IS_EXTENDER(ctxt->cur[0])))
605: ctxt->cur++;
606: name = xmlStrndup(q, ctxt->cur - q);
1.7 veillard 607:
608: /*
609: * Search the DTD associated to ns.
610: */
1.16 daniel 611: dtd = xmlSearchDtd(ctxt->doc, ns);
1.7 veillard 612: if (dtd == NULL)
613: fprintf(stderr, "End tag : Couldn't find namespace %s\n", ns);
614: free(ns);
615: } else
1.16 daniel 616: name = xmlStrndup(q, ctxt->cur - q);
1.7 veillard 617:
618: *dtdPtr = dtd;
619: *tagPtr = name;
620:
621: /*
622: * We should definitely be at the ending "S? '>'" part
623: */
1.16 daniel 624: SKIP_BLANKS(ctxt->cur);
625: if ((!IS_CHAR(ctxt->cur[0])) || (ctxt->cur[0] != '>')) {
626: fprintf(stderr, "End tag : expected '>', got %.20s\n", ctxt->cur);
1.7 veillard 627: /*
628: * Note : skipping to the next '>' is probably otherkill,
629: * especially in case the '>' is hust missing.
630: *
631: * Otherwise add:
1.16 daniel 632: * MOVETO_ENDTAG(ctxt->cur);
1.7 veillard 633: */
634: } else
1.16 daniel 635: ctxt->cur++;
1.7 veillard 636:
637: return;
638: }
639:
640: /*
1.3 veillard 641: * xmlParseCDSect: escaped pure raw content.
642: */
1.16 daniel 643: CHAR *xmlParseCDSect(xmlParserCtxtPtr ctxt) {
1.17 ! daniel 644: const CHAR *r, *s, *base;
! 645: CHAR *ret;
1.3 veillard 646:
1.16 daniel 647: base = ctxt->cur;
648: if (!IS_CHAR(ctxt->cur[0])) {
1.7 veillard 649: fprintf(stderr, "CData section not finished : %.20s\n", base);
1.3 veillard 650: return(NULL);
651: }
1.16 daniel 652: r = ctxt->cur++;
653: if (!IS_CHAR(ctxt->cur[0])) {
1.7 veillard 654: fprintf(stderr, "CData section not finished : %.20s\n", base);
1.3 veillard 655: return(NULL);
656: }
1.16 daniel 657: s = ctxt->cur++;
658: while (IS_CHAR(ctxt->cur[0]) &&
659: ((*r != ']') || (*s != ']') || (ctxt->cur[0] != '>'))) {
660: r++;s++;ctxt->cur++;
1.3 veillard 661: }
1.16 daniel 662: if (!IS_CHAR(ctxt->cur[0])) {
1.7 veillard 663: fprintf(stderr, "CData section not finished : %.20s\n", base);
1.3 veillard 664: return(NULL);
665: }
1.16 daniel 666: ret = xmlStrndup(base, ctxt->cur-base);
667:
1.2 veillard 668: return(ret);
669: }
670:
671: /*
672: * xmlParseContent: a content is
673: * (element | PCData | Reference | CDSect | PI | Comment)
674: *
675: * element : starts by '<'
676: * PCData : any CHAR but '&' or '<'
677: * Reference : starts by '&'
678: * CDSect : starts by '<![CDATA['
679: * PI : starts by '<?'
680: */
681:
1.16 daniel 682: xmlNodePtr xmlParseContent(xmlParserCtxtPtr ctxt, xmlNodePtr node) {
1.17 ! daniel 683: const CHAR *q;
! 684: CHAR *data = NULL;
1.2 veillard 685: xmlNodePtr ret = NULL;
686:
687: /*
1.3 veillard 688: * First case : a Processing Instruction.
689: */
1.16 daniel 690: if ((ctxt->cur[0] == '<') && (ctxt->cur[1] == '?')) {
691: xmlParsePI(ctxt);
1.3 veillard 692: }
693: /*
694: * Second case : a CDSection
1.2 veillard 695: */
1.16 daniel 696: if ((ctxt->cur[0] == '<') && (ctxt->cur[1] == '!') &&
697: (ctxt->cur[2] == '[') && (ctxt->cur[3] == 'C') &&
698: (ctxt->cur[4] == 'D') && (ctxt->cur[5] == 'A') &&
699: (ctxt->cur[6] == 'T') && (ctxt->cur[7] == 'A') &&
700: (ctxt->cur[8] == '[')) {
701: ctxt->cur += 9;
702: data = xmlParseCDSect(ctxt);
1.3 veillard 703: }
704: /*
705: * Third case : a sub-element.
706: */
1.16 daniel 707: else if (ctxt->cur[0] == '<') {
708: ret = xmlParseElement(ctxt);
1.3 veillard 709: }
710: /*
711: * Last case, text. Note that References are handled directly.
712: */
713: else {
1.16 daniel 714: q = ctxt->cur;
715: while (IS_CHAR(ctxt->cur[0]) && (ctxt->cur[0] != '<')) ctxt->cur++;
1.3 veillard 716:
1.16 daniel 717: if (!IS_CHAR(ctxt->cur[0])) {
1.7 veillard 718: fprintf(stderr, "Truncated content : %.50s\n", q);
1.3 veillard 719: return(NULL);
720: }
1.14 veillard 721:
722: /*
723: * Do the Entities decoding...
724: */
1.16 daniel 725: data = xmlStrdup(xmlDecodeEntities(ctxt->doc, q, ctxt->cur - q));
1.3 veillard 726: }
727:
728: /*
729: * Handle the data if any. If there is no child
730: * add it as content, otherwise create a new node of type text.
731: */
732: if (data != NULL)
733: data = xmlHandleData(data);
734: if (data != NULL) {
735: if (node->childs == NULL)
736: xmlNodeSetContent(node, data);
1.17 ! daniel 737: else
1.3 veillard 738: ret = xmlNewText(data);
1.17 ! daniel 739: free(data);
1.3 veillard 740: }
1.2 veillard 741:
742: return(ret);
743: }
744:
745: /*
746: * xmlParseElement: parse an XML element
747: */
748:
1.16 daniel 749: xmlNodePtr xmlParseElement(xmlParserCtxtPtr ctxt) {
1.2 veillard 750: xmlNodePtr ret, child;
1.17 ! daniel 751: const CHAR *openTag = ctxt->cur;
! 752: const CHAR *closeTag = ctxt->cur;
1.2 veillard 753:
1.16 daniel 754: ret = xmlParseStartTag(ctxt);
1.3 veillard 755: if (ret == NULL) {
756: return(NULL);
757: }
1.2 veillard 758:
759: /*
760: * Check for an Empty Element.
761: */
1.16 daniel 762: if ((ctxt->cur[0] == '/') && (ctxt->cur[1] == '>')) {
763: ctxt->cur += 2;
1.2 veillard 764: return(ret);
765: }
1.16 daniel 766: if (ctxt->cur[0] == '>') ctxt->cur++;
1.2 veillard 767: else {
1.16 daniel 768: fprintf(stderr, "Couldn't find end of Start Tag %.30s\n", openTag);
769: return(NULL);
1.2 veillard 770: }
771:
772: /*
773: * Parse the content of the element:
774: * (element | PCData | Reference | CDSect | PI | Comment) *
775: *
776: * element : starts by '<'
777: * PCData : any CHAR but '&' or '<'
778: * Reference : starts by '&'
779: * CDSect : starts by '<![CDATA['
780: * PI : starts by '<?'
781: *
782: * The loop stops upon detection of an end of tag '</'
783: */
1.16 daniel 784: while ((IS_CHAR(ctxt->cur[0])) &&
785: ((ctxt->cur[0] != '<') || (ctxt->cur[1] != '/'))) {
786: child = xmlParseContent(ctxt, ret);
1.2 veillard 787: if (child != NULL)
788: xmlAddChild(ret, child);
789: }
1.16 daniel 790: if (!IS_CHAR(ctxt->cur[0])) {
791: fprintf(stderr, "Premature end of data in tag %.30s\n", openTag);
792: return(NULL);
1.2 veillard 793: }
794:
795: /*
796: * parse the end of tag : '</' has been detected.
797: */
1.16 daniel 798: ctxt->cur += 2;
799: if (ctxt->cur[0] == '>') ctxt->cur++; /* simplified closing </> */
1.2 veillard 800: else {
1.7 veillard 801: CHAR *endTag;
802: xmlDtdPtr endDtd;
803:
1.16 daniel 804: xmlParseEndTag(ctxt, &endDtd, &endTag);
1.7 veillard 805:
1.2 veillard 806: /*
1.7 veillard 807: * Check that the Name in the ETag is the same as in the STag.
1.2 veillard 808: */
1.7 veillard 809: if (endDtd != ret->dtd) {
810: fprintf(stderr, "Start and End tags don't use the same DTD:\n");
811: fprintf(stderr, "\t%.30s\n\t%.30s\n", openTag, closeTag);
812: }
813: if (strcmp(ret->name, endTag)) {
814: fprintf(stderr, "Start and End tags don't use the same name:\n");
815: fprintf(stderr, "\t%.30s\n\t%.30s\n", openTag, closeTag);
816: }
1.17 ! daniel 817:
! 818: if ( endTag != NULL )
! 819: free(endTag);
1.2 veillard 820: }
821:
822: return(ret);
823: }
824:
825: /*
1.1 veillard 826: * xmlParseXMLDecl: parse an XML declaration header
827: */
828:
1.16 daniel 829: void xmlParseXMLDecl(xmlParserCtxtPtr ctxt) {
1.1 veillard 830: CHAR *version;
831:
832: /*
833: * We know that '<?XML' is here.
834: */
1.16 daniel 835: ctxt->cur += 5;
1.1 veillard 836:
837: /*
838: * Parse the version info
839: */
1.16 daniel 840: SKIP_BLANKS(ctxt->cur);
1.1 veillard 841:
842: /*
843: * We should have 'version=' here !
844: */
1.16 daniel 845: if ((ctxt->cur[0] == 'v') && (ctxt->cur[1] == 'e') &&
846: (ctxt->cur[2] == 'r') && (ctxt->cur[3] == 's') &&
847: (ctxt->cur[4] == 'i') && (ctxt->cur[5] == 'o') &&
848: (ctxt->cur[6] == 'n') && (ctxt->cur[7] == '=')) {
849: ctxt->cur += 8;
850: version = xmlParseQuotedString(ctxt);
1.1 veillard 851: if (version == NULL)
1.16 daniel 852: ctxt->doc = xmlNewDoc(XML_DEFAULT_VERSION);
1.1 veillard 853: else {
1.16 daniel 854: ctxt->doc = xmlNewDoc(version);
1.8 veillard 855: free(version);
1.1 veillard 856: }
857: } else {
1.16 daniel 858: ctxt->doc = xmlNewDoc(XML_DEFAULT_VERSION);
1.1 veillard 859: }
860:
861: /*
1.14 veillard 862: * We should check for Required Markup Declaration TODO !!!!
1.1 veillard 863: */
1.16 daniel 864: MOVETO_ENDTAG(ctxt->cur);
865: ctxt->cur++;
1.1 veillard 866:
867: }
868:
869: /*
870: * xmlParseMisc: parse an XML Misc optionnal field.
871: * (Comment | PI | S)*
872: */
873:
1.16 daniel 874: void xmlParseMisc(xmlParserCtxtPtr ctxt) {
875: while (((ctxt->cur[0] == '<') && (ctxt->cur[1] == '?')) ||
876: ((ctxt->cur[0] == '<') && (ctxt->cur[1] == '!') &&
877: (ctxt->cur[2] == '-') && (ctxt->cur[2] == '-')) ||
878: IS_BLANK(ctxt->cur[0])) {
879: if ((ctxt->cur[0] == '<') && (ctxt->cur[1] == '?')) {
880: xmlParsePI(ctxt);
881: } else if (IS_BLANK(ctxt->cur[0])) {
882: ctxt->cur++;
1.1 veillard 883: } else
1.16 daniel 884: xmlParserSkipComment(ctxt);
1.1 veillard 885: }
886: }
887:
888: /*
1.16 daniel 889: * xmlParseDocument : parse an XML document and build a tree.
1.1 veillard 890: */
891:
1.16 daniel 892: int xmlParseDocument(xmlParserCtxtPtr ctxt) {
1.14 veillard 893: /*
894: * We should check for encoding here and plug-in some
895: * conversion code TODO !!!!
896: */
1.1 veillard 897:
898: /*
899: * Wipe out everything which is before the first '<'
900: */
1.16 daniel 901: SKIP_BLANKS(ctxt->cur);
1.1 veillard 902:
903: /*
904: * Check for the XMLDecl in the Prolog.
905: */
1.16 daniel 906: if ((ctxt->cur[0] == '<') && (ctxt->cur[1] == '?') &&
907: (ctxt->cur[2] == 'X') && (ctxt->cur[3] == 'M') &&
908: (ctxt->cur[4] == 'L')) {
909: xmlParseXMLDecl(ctxt);
1.1 veillard 910: /* SKIP_EOL(cur); */
1.16 daniel 911: SKIP_BLANKS(ctxt->cur);
1.1 veillard 912: } else {
1.16 daniel 913: ctxt->doc = xmlNewDoc(XML_DEFAULT_VERSION);
1.1 veillard 914: }
915:
916: /*
917: * The Misc part of the Prolog
918: * (Comment | PI | S) *
919: */
1.16 daniel 920: xmlParseMisc(ctxt);
1.1 veillard 921:
922: /*
1.2 veillard 923: * Time to start parsing
1.1 veillard 924: */
1.16 daniel 925: ctxt->doc->root = xmlParseElement(ctxt);
926:
927: return(0);
928: }
929:
930: /*
931: * xmlParseDoc : parse an XML in-memory document and build a tree.
932: */
933:
934: xmlDocPtr xmlParseDoc(CHAR *cur) {
935: xmlDocPtr ret;
936: xmlParserCtxtPtr ctxt;
937:
938: if (cur == NULL) return(NULL);
1.1 veillard 939:
1.16 daniel 940: ctxt = (xmlParserCtxtPtr) malloc(sizeof(xmlParserCtxt));
941: if (ctxt == NULL) {
942: perror("malloc");
943: return(NULL);
944: }
945:
946: ctxt->filename = NULL;
947: ctxt->base = cur;
948: ctxt->cur = cur;
949: ctxt->line = 1;
950: ctxt->col = 1;
951: ctxt->doc = NULL;
952:
953: xmlParseDocument(ctxt);
954: ret = ctxt->doc;
955: free(ctxt);
956:
1.1 veillard 957: return(ret);
958: }
959:
1.9 httpng 960: /*
961: * xmlParseFile : parse an XML file and build a tree.
962: */
963:
964: xmlDocPtr xmlParseFile(const char *filename) {
965: xmlDocPtr ret;
966: int input;
967: int res;
968: struct stat buf;
969: char *buffer;
1.16 daniel 970: xmlParserCtxtPtr ctxt;
1.9 httpng 971:
1.11 veillard 972: res = stat(filename, &buf);
1.9 httpng 973: if (res < 0) return(NULL);
974:
975: buffer = malloc(buf.st_size + 100);
976: if (buffer == NULL) {
977: perror("malloc");
978: return(NULL);
979: }
980:
981: memset(buffer, 0, sizeof(buffer));
982: input = open (filename, O_RDONLY);
983: if (input < 0) {
984: fprintf (stderr, "Cannot read file %s :\n", filename);
985: perror ("open failed");
986: return(NULL);
987: }
988: res = read(input, buffer, buf.st_size);
989: if (res < 0) {
990: fprintf (stderr, "Cannot read file %s :\n", filename);
991: perror ("read failed");
992: return(NULL);
993: }
994: close(input);
995:
1.16 daniel 996: ctxt = (xmlParserCtxtPtr) malloc(sizeof(xmlParserCtxt));
997: if (ctxt == NULL) {
998: perror("malloc");
999: return(NULL);
1000: }
1.9 httpng 1001: buffer[buf.st_size] = '\0';
1.16 daniel 1002:
1.17 ! daniel 1003: ctxt->filename = filename;
1.16 daniel 1004: ctxt->base = buffer;
1005: ctxt->cur = buffer;
1006: ctxt->line = 1;
1007: ctxt->col = 1;
1008: ctxt->doc = NULL;
1009:
1010: xmlParseDocument(ctxt);
1011: ret = ctxt->doc;
1.9 httpng 1012: free(buffer);
1.16 daniel 1013: free(ctxt);
1014:
1.9 httpng 1015: return(ret);
1.17 ! daniel 1016: }
! 1017:
! 1018:
! 1019:
! 1020:
! 1021: /* Initialize parser context */
! 1022: void xmlInitParserCtxt(xmlParserCtxtPtr ctxt)
! 1023: {
! 1024: ctxt->filename = NULL;
! 1025: ctxt->base = NULL;
! 1026: ctxt->cur = NULL;
! 1027: ctxt->line = 1;
! 1028: ctxt->col = 1;
! 1029: ctxt->doc = NULL;
! 1030: ctxt->node = NULL;
! 1031: }
! 1032:
! 1033:
! 1034: /* Clear (release owned resources) and reinitialize context */
! 1035: void xmlClearParserCtxt(xmlParserCtxtPtr ctx)
! 1036: {
! 1037: xmlInitParserCtxt(ctx);
! 1038: }
! 1039:
! 1040:
! 1041: /* Setup the parser context to parse a new buffer; Clears any prior
! 1042: contents from the parser context. The buffer parameter must not be
! 1043: NULL, but the filename parameter can be */
! 1044: void xmlSetupParserForBuffer(xmlParserCtxtPtr ctxt, const CHAR* buffer,
! 1045: const char* filename)
! 1046: {
! 1047: xmlClearParserCtxt(ctxt);
! 1048: ctxt->base = buffer;
! 1049: ctxt->cur = buffer;
! 1050: ctxt->filename = filename;
! 1051: }
! 1052:
! 1053:
! 1054:
! 1055: void xmlReportError(xmlParserCtxtPtr ctx, const CHAR* msg)
! 1056: {
! 1057: fputs(msg, stderr);
1.9 httpng 1058: }
Webmaster