Annotation of XML/parser.c, revision 1.14
1.1 veillard 1: /*
1.3 veillard 2: * parser.c : an XML 1.0 non-verifying parser
1.1 veillard 3: */
4:
1.9 httpng 5: #include <config.h>
1.1 veillard 6: #include <stdio.h>
7: #include <ctype.h>
1.14 ! veillard 8: #include <string.h> /* for memset() only */
1.1 veillard 9: #include <malloc.h>
1.9 httpng 10: #include <sys/stat.h>
11: #ifdef HAVE_FCNTL_H
12: #include <fcntl.h>
13: #endif
1.10 httpng 14: #ifdef HAVE_UNISTD_H
15: #include <unistd.h>
16: #endif
1.1 veillard 17:
1.14 ! veillard 18: #include "tree.h"
1.1 veillard 19: #include "parser.h"
1.14 ! veillard 20: #include "entities.h"
1.1 veillard 21:
22: /*
23: * A few macros needed to help building the parser.
24: */
25:
/*
 * Character classification macros.
 *
 * Every macro below evaluates its argument several times: never pass an
 * expression with side effects (such as *p++).
 */
#ifdef UNICODE
/*
 * UNICODE version of the macros. Incomplete now TODO !!!!
 */
#define IS_CHAR(c) \
    (((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d) || \
     (((c) >= 0x20) && ((c) != 0xFFFE) && ((c) != 0xFFFF)))

/*
 * No SKIP_BLANKS here: the file defines it once, after this conditional
 * section, in terms of IS_BLANK. A second definition with a different
 * body was an invalid macro redefinition. (The removed variant also
 * skipped U+3000, which is not part of the XML S production.)
 */

/*
 * I'm too lazy to complete this one TODO !!!!
 *
 * NOTE(review): the ">= 0xaa && <= 0x5b" clause can never be true. It
 * presumably was meant to accept the single characters 0xaa and 0xb5;
 * confirm against the targeted XML specification before changing it.
 */
#define IS_BASECHAR(c) \
    ((((c) >= 0x41) && ((c) <= 0x5a)) || \
     (((c) >= 0x61) && ((c) <= 0x7a)) || \
     (((c) >= 0xaa) && ((c) <= 0x5b)) || \
     (((c) >= 0xc0) && ((c) <= 0xd6)) || \
     (((c) >= 0xd8) && ((c) <= 0xf6)) || \
     (((c) >= 0xf8) && ((c) <= 0xff)) || \
     ((c) == 0xba))

/* I'm too lazy to complete this one TODO !!!! */
#define IS_DIGIT(c) (((c) >= 0x30) && ((c) <= 0x39))

/* I'm too lazy to complete this one TODO !!!! */
#define IS_COMBINING(c) 0

#define IS_IGNORABLE(c) \
    ((((c) >= 0x200c) && ((c) <= 0x200f)) || \
     (((c) >= 0x202a) && ((c) <= 0x202e)) || \
     (((c) >= 0x206a) && ((c) <= 0x206f)) || \
     ((c) == 0xfeff))

/* The '||' after the 0x3005 test was missing, which broke every expansion. */
#define IS_EXTENDER(c) \
    (((c) == 0xb7) || ((c) == 0x2d0) || ((c) == 0x2d1) || \
     ((c) == 0x387) || ((c) == 0x640) || ((c) == 0xe46) || \
     ((c) == 0xec6) || ((c) == 0x3005) || \
     (((c) >= 0x3031) && ((c) <= 0x3035)) || \
     (((c) >= 0x309b) && ((c) <= 0x309e)) || \
     (((c) >= 0x30fc) && ((c) <= 0x30fe)) || \
     (((c) >= 0xff70) && ((c) <= 0xff9e)) || \
     ((c) == 0xff9f))

#define IS_IDEOGRAPHIC(c) \
    ((((c) >= 0x4e00) && ((c) <= 0x9fa5)) || \
     (((c) >= 0xf900) && ((c) <= 0xfa2d)) || \
     (((c) >= 0x3021) && ((c) <= 0x3029)) || \
     ((c) == 0x3007))

#define IS_LETTER(c) (IS_BASECHAR(c) || IS_IDEOGRAPHIC(c))

/* Blank: space, tab, line feed and carriage return (XML production S). */
#define IS_BLANK(c) \
    (((c) == 0x20) || ((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d))
#else
/*
 * 8bits / ASCII version of the macros.
 */
#define IS_CHAR(c) \
    (((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d) || ((c) >= 0x20))

/*
 * NOTE(review): the ">= 0xaa && <= 0x5b" clause can never be true. It
 * presumably was meant to accept the single characters 0xaa and 0xb5;
 * confirm against the targeted XML specification before changing it.
 */
#define IS_BASECHAR(c) \
    ((((c) >= 0x41) && ((c) <= 0x5a)) || \
     (((c) >= 0x61) && ((c) <= 0x7a)) || \
     (((c) >= 0xaa) && ((c) <= 0x5b)) || \
     (((c) >= 0xc0) && ((c) <= 0xd6)) || \
     (((c) >= 0xd8) && ((c) <= 0xf6)) || \
     (((c) >= 0xf8) && ((c) <= 0xff)) || \
     ((c) == 0xba))

#define IS_DIGIT(c) (((c) >= 0x30) && ((c) <= 0x39))

#define IS_LETTER(c) IS_BASECHAR(c)

#define IS_COMBINING(c) 0

#define IS_IGNORABLE(c) 0

#define IS_EXTENDER(c) ((c) == 0xb7)

/* Blank: space, tab, line feed and carriage return (XML production S). */
#define IS_BLANK(c) \
    (((c) == 0x20) || ((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d))
#endif
108:
109:
/*
 * Cursor-moving helpers. Each one expands to a single statement (so it is
 * safe inside an unbraced if/else) and must be followed by a ';'.
 * The argument is evaluated several times: pass a plain pointer variable.
 */

/*
 * SKIP_EOL : step over one end-of-line sequence: CR, LF, CR LF or LF CR.
 * CR is 0x0d and LF is 0x0a (13 and 10 in decimal, not in hexadecimal).
 */
#define SKIP_EOL(p) \
    do { \
        if (*(p) == 0x0d) { (p)++; if (*(p) == 0x0a) (p)++; } \
        else if (*(p) == 0x0a) { (p)++; if (*(p) == 0x0d) (p)++; } \
    } while (0)

/* SKIP_BLANKS : advance p over any run of IS_BLANK characters. */
#define SKIP_BLANKS(p) \
    do { while (IS_BLANK(*(p))) (p)++; } while (0)

/* MOVETO_ENDTAG : advance p to the next '>' or to the first non-CHAR. */
#define MOVETO_ENDTAG(p) \
    do { while (IS_CHAR(*(p)) && (*(p) != '>')) (p)++; } while (0)

/* MOVETO_STARTTAG : advance p to the next '<' or to the first non-CHAR. */
#define MOVETO_STARTTAG(p) \
    do { while (IS_CHAR(*(p)) && (*(p) != '<')) (p)++; } while (0)
122:
123: /*
1.3 veillard 124: * Forward declaration for recursive behaviour.
125: */
126: xmlNodePtr xmlParseElement(CHAR **p, xmlDocPtr doc);
127:
128: /*
129: * xmlHandleData : this routine represent's the specific application
130: * behaviour when reading a piece of text.
131: *
132: * For example in WebDav, any piece made only of blanks is eliminated
133: */
134:
135: CHAR *xmlHandleData(CHAR *in) {
136: CHAR *cur;
137:
138: if (in == NULL) return(NULL);
139: cur = in;
140: while (IS_CHAR(*cur)) {
141: if (!IS_BLANK(*cur)) goto not_blank;
142: cur++;
143: }
144: free(in);
145: return(NULL);
146:
147: not_blank:
148: return(in);
149: }
150:
151: /*
1.1 veillard 152: * xmlStrndup : a strdup for array of CHAR's
153: */
154:
1.6 httpng 155: CHAR *xmlStrndup(const CHAR *cur, int len) {
1.1 veillard 156: CHAR *ret = malloc((len + 1) * sizeof(CHAR));
157:
158: if (ret == NULL) {
159: fprintf(stderr, "malloc of %d byte failed\n",
160: (len + 1) * sizeof(CHAR));
161: return(NULL);
162: }
163: memcpy(ret, cur, len * sizeof(CHAR));
164: ret[len] = 0;
165: return(ret);
166: }
167:
168: /*
169: * xmlStrdup : a strdup for CHAR's
170: */
171:
1.6 httpng 172: CHAR *xmlStrdup(const CHAR *cur) {
173: const CHAR *p = cur;
1.1 veillard 174:
175: while (IS_CHAR(*p)) p++;
176: return(xmlStrndup(cur, p - cur));
177: }
178:
179: /*
1.14 ! veillard 180: * xmlStrcmp : a strcmp for CHAR's
! 181: */
! 182:
! 183: int xmlStrcmp(const CHAR *str1, const CHAR *str2) {
! 184: register int tmp;
! 185:
! 186: do {
! 187: tmp = *str1++ - *str2++;
! 188: if (tmp != 0) return(tmp);
! 189: } while ((*str1 != 0) && (*str2 != 0));
! 190: return (*str1 - *str2);
! 191: }
! 192:
! 193: /*
! 194: * xmlStrncmp : a strncmp for CHAR's
! 195: */
! 196:
! 197: int xmlStrncmp(const CHAR *str1, const CHAR *str2, int len) {
! 198: register int tmp;
! 199:
! 200: if (len <= 0) return(0);
! 201: do {
! 202: tmp = *str1++ - *str2++;
! 203: if (tmp != 0) return(tmp);
! 204: len--;
! 205: if (len <= 0) return(0);
! 206: } while ((*str1 != 0) && (*str2 != 0));
! 207: return (*str1 - *str2);
! 208: }
! 209:
! 210: /*
! 211: * xmlStrchr : a strchr for CHAR's
! 212: */
! 213:
! 214: CHAR *xmlStrchr(const CHAR *str, CHAR val) {
! 215: while (*str != 0) {
! 216: if (*str == val) return((CHAR *) str);
! 217: str++;
! 218: }
! 219: return(NULL);
! 220: }
! 221:
! 222: /*
1.1 veillard 223: * xmlParseName : parse an XML name.
224: */
225:
1.3 veillard 226: CHAR *xmlParseName(CHAR **p) {
227: CHAR *cur = *p, *q, *ret = NULL;
1.1 veillard 228:
229: /*
1.3 veillard 230: * Name ::= (Letter | '_') (NameChar)*
1.1 veillard 231: */
1.3 veillard 232: if (!IS_LETTER(*cur) && (*cur != '_')) return(NULL);
233: q = cur++;
234: while ((IS_LETTER(*cur)) || (IS_DIGIT(*cur)) ||
235: (*cur == '.') || (*cur == '-') || (*cur == '_') ||
1.12 veillard 236: (*cur == ':') ||
1.3 veillard 237: (IS_COMBINING(*cur)) || (IS_IGNORABLE(*cur)) ||
238: (IS_EXTENDER(*cur)))
239: cur++;
240:
241: ret = xmlStrndup(q, cur - q);
1.1 veillard 242:
1.3 veillard 243: *p = cur;
244: return(ret);
1.1 veillard 245: }
246:
247: /*
248: * Parse and return a string between quotes or doublequotes
249: */
250: CHAR *xmlParseQuotedString(CHAR **p) {
251: CHAR *ret = NULL;
252: CHAR *cur = *p, *q;
253:
254: if (*cur == '"') {
255: cur++;
256: q = cur;
257: while (IS_CHAR(*cur) && (*cur != '"')) cur++;
258: if (*cur != '"')
1.7 veillard 259: fprintf(stderr, "String not closed \"%.50s\n", q);
1.1 veillard 260: else {
261: ret = xmlStrndup(q, cur - q);
262: cur++;
263: }
264: } else if (*cur == '\''){
265: cur++;
266: q = cur;
267: while (IS_CHAR(*cur) && (*cur != '\'')) cur++;
268: if (*cur != '\'')
1.7 veillard 269: fprintf(stderr, "String not closed '%.50s\n", q);
1.1 veillard 270: else {
271: ret = xmlStrndup(q, cur - q);
272: cur++;
273: }
274: }
275: *p = cur;
276: return(ret);
277: }
278:
279: /*
1.3 veillard 280: * Skip an XML (SGML) comment <!-- .... -->
281: */
282: void xmlParserSkipComment(CHAR **p) {
283: CHAR *cur = *p, *q, *r, *start;
284:
285: /*
286: * An extra check may avoid errors and isn't that costly !
287: */
288: if ((cur[0] != '<') || (cur[1] != '!') ||
289: (cur[2] != '-') || (cur[3] != '-')) return;
290:
291: cur += 4;
292: start = q = cur;
293: cur++;
294: r = cur;
295: cur++;
296: while (IS_CHAR(*cur) &&
1.12 veillard 297: ((*cur == ':') || (*cur != '>') || (*r != '-') || (*q != '-'))) {
1.3 veillard 298: cur++;r++;q++;
299: }
300: if (!IS_CHAR(*cur)) {
1.7 veillard 301: fprintf(stderr, "Comment not terminated <!--%.50s\n", start);
1.3 veillard 302: *p = start;
303: } else {
304: cur++;
305: *p = cur;
306: }
307: }
308:
309: /*
1.13 veillard 310: * xmlParseNamespace: parse specific '<?namespace ...' constructs.
1.1 veillard 311: */
312:
1.13 veillard 313: void xmlParseNamespace(CHAR **p, xmlDocPtr doc) {
1.1 veillard 314: CHAR *cur = *p;
315: CHAR *href = NULL;
316: CHAR *AS = NULL;
1.3 veillard 317: int garbage = 0;
1.1 veillard 318:
319: /*
320: * We know that 'namespace' is here.
321: */
322: cur += 9;
323: SKIP_BLANKS(cur);
324:
325: while (IS_CHAR(*cur) && (*cur != '>')) {
326: /*
327: * We can have 'href' or 'AS' attributes.
328: */
329: if ((cur[0] == 'h') && (cur[1] == 'r') && (cur[2] == 'e') &&
330: (cur[3] == 'f')) {
1.3 veillard 331: garbage = 0;
1.1 veillard 332: cur += 4;
333: SKIP_BLANKS(cur);
334:
335: if (*cur != '=') continue;
336: cur++;
337: SKIP_BLANKS(cur);
338:
339: href = xmlParseQuotedString(&cur);
340: SKIP_BLANKS(cur);
341: } else if ((cur[0] == 'A') && (cur[1] == 'S')) {
1.3 veillard 342: garbage = 0;
1.1 veillard 343: cur += 2;
344: SKIP_BLANKS(cur);
345:
346: if (*cur != '=') continue;
347: cur++;
348: SKIP_BLANKS(cur);
349:
350: AS = xmlParseQuotedString(&cur);
351: SKIP_BLANKS(cur);
352: } else if ((cur[0] == '?') && (cur[1] == '>')) {
1.3 veillard 353: garbage = 0;
1.1 veillard 354: cur ++;
355: } else {
1.3 veillard 356: /*
357: * Found garbage when parsing the namespace
358: */
359: if (!garbage) fprintf(stderr,
1.13 veillard 360: "\nxmlParseNamespace found garbage: ");
1.3 veillard 361: fprintf(stderr, "%c", *cur);
1.1 veillard 362: cur++;
363: }
364: }
365:
366: MOVETO_ENDTAG(cur);
367: cur++;
368:
369: /*
370: * Register the DTD.
371: */
372: if (href != NULL)
373: xmlNewDtd(doc, href, AS);
374:
1.8 veillard 375: if (AS != NULL) free(AS);
376: if (href != NULL) free(href);
377:
1.1 veillard 378: *p = cur;
379: }
380:
381: /*
1.3 veillard 382: * xmlParsePI: parse an XML Processing Instruction.
383: */
384:
385: void xmlParsePI(CHAR **p, xmlDocPtr doc) {
386: CHAR *cur = *p;
387:
388: if ((cur[0] == '<') && (cur[1] == '?')) {
389: /*
390: * this is a Processing Instruction.
391: */
392: cur += 2;
393:
394: /*
395: * Special for WebDav, support for the Processing Instruction
396: * '<?namespace ...' contruct in the header of the XML document.
397: */
398: if ((cur[0] == 'n') && (cur[1] == 'a') &&
399: (cur[2] == 'm') && (cur[3] == 'e') &&
400: (cur[4] == 's') && (cur[5] == 'p') &&
401: (cur[6] == 'a') && (cur[7] == 'c') &&
402: (cur[8] == 'e')) {
1.13 veillard 403: xmlParseNamespace(&cur, doc);
1.3 veillard 404: } else {
405: /* Unknown PI, ignore it ! */
1.13 veillard 406: fprintf(stderr, "xmlParsePI : skipping unknown PI %30s\n", cur);
1.3 veillard 407: MOVETO_ENDTAG(cur);
408: cur++;
409: }
410: }
411: *p = cur;
412: }
413:
414: /*
415: * xmlParseAttribute: parse a start of tag.
416: *
417: * Attribute ::= Name Eq AttValue
418: */
419:
420: void xmlParseAttribute(CHAR **p, xmlNodePtr node) {
421: CHAR *cur = *p, *q, *name, *value = NULL;
422:
423: if (!IS_LETTER(*cur) && (*cur != '_')) {
424: return;
425: }
426: q = cur++;
427: while ((IS_LETTER(*cur)) || (IS_DIGIT(*cur)) ||
428: (*cur == '.') || (*cur == '-') || (*cur == '_') ||
1.12 veillard 429: (*cur == ':') ||
1.3 veillard 430: (IS_COMBINING(*cur)) || (IS_IGNORABLE(*cur)) ||
431: (IS_EXTENDER(*cur)))
432: cur++;
433: name = xmlStrndup(q, cur - q);
434:
435: /*
436: * We should have the equal, we are laxist here and allow attributes
437: * without values and extra spaces.
438: */
439: SKIP_BLANKS(cur);
440: if (*cur == '=') {
441: cur++;
442: SKIP_BLANKS(cur);
443: if ((*cur != '\'') && (*cur != '"')) {
1.7 veillard 444: fprintf(stderr, "Quotes were expected for attribute value %.20s\n",
1.3 veillard 445: q);
446: } else
447: value = xmlParseQuotedString(&cur);
448: }
449:
450: /*
451: * Add the attribute to the node.
452: */
453: if (name != NULL)
454: xmlNewProp(node, name, value);
455:
456: *p = cur;
457: }
458:
459: /*
1.2 veillard 460: * xmlParseStartTag: parse a start of tag.
461: */
462:
1.3 veillard 463: xmlNodePtr xmlParseStartTag(CHAR **p, xmlDocPtr doc) {
464: CHAR *cur = *p, *q, *ns, *name;
465: xmlDtdPtr dtd = NULL;
1.2 veillard 466: xmlNodePtr ret = NULL;
467:
468: /*
1.3 veillard 469: * Theorically one should just parse a Name, but with the addition
470: * of the namespace needed for WebDav, it's a bit more complicated
471: * since the element name may be prefixed by a namespace prefix.
472: *
473: * QName ::= (NSPart ':')? LocalPart
474: * NSPart ::= Name
475: * LocalPart ::= Name
476: * STag ::= '<' QName (S Attribute)* S? '>'
477: *
478: * instead of :
479: *
480: * STag ::= '<' QName (S Attribute)* S? '>'
1.2 veillard 481: */
1.3 veillard 482: if (*cur != '<') return(NULL);
483: cur++;
484:
485: if (!IS_LETTER(*cur) && (*cur != '_')) return(NULL);
486: q = cur++;
487: while ((IS_LETTER(*cur)) || (IS_DIGIT(*cur)) ||
488: (*cur == '.') || (*cur == '-') || (*cur == '_') ||
489: (IS_COMBINING(*cur)) || (IS_IGNORABLE(*cur)) ||
490: (IS_EXTENDER(*cur)))
491: cur++;
492:
493: if (*cur == ':') {
494: ns = xmlStrndup(q, cur - q);
495:
496: cur++; /* skip the column */
497: if (!IS_LETTER(*cur) && (*cur != '_')) {
1.7 veillard 498: fprintf(stderr,
499: "Start tag : no element name after namespace identifier %.20s\n",
1.3 veillard 500: q);
501: free(ns);
502: *p = cur;
503: return(NULL);
504: }
505: q = cur++;
506: while ((IS_LETTER(*cur)) || (IS_DIGIT(*cur)) ||
507: (*cur == '.') || (*cur == '-') || (*cur == '_') ||
1.12 veillard 508: (*cur == ':') ||
1.3 veillard 509: (IS_COMBINING(*cur)) || (IS_IGNORABLE(*cur)) ||
510: (IS_EXTENDER(*cur)))
511: cur++;
512: name = xmlStrndup(q, cur - q);
513:
514: /*
515: * Search the DTD associated to ns.
516: */
517: dtd = xmlSearchDtd(doc, ns);
518: if (dtd == NULL)
1.7 veillard 519: fprintf(stderr, "Start tag : Couldn't find namespace %s\n", ns);
1.3 veillard 520: free(ns);
521: } else
522: name = xmlStrndup(q, cur - q);
523:
524: ret = xmlNewNode(dtd, name, NULL);
1.2 veillard 525:
1.3 veillard 526: /*
527: * Now parse the attributes, it ends up with the ending
528: *
529: * (S Attribute)* S?
530: */
531: SKIP_BLANKS(cur);
532: while ((IS_CHAR(*cur)) &&
533: (*cur != '>') &&
534: ((cur[0] != '/') || (cur[1] != '>'))) {
535: if (IS_LETTER(*cur) || (*cur == '_'))
536: xmlParseAttribute(&cur, ret);
537: else {
1.14 ! veillard 538: /* We should warn TODO !!! */
1.3 veillard 539: cur++;
540: }
541: SKIP_BLANKS(cur);
542: }
543:
544: *p = cur;
545: return(ret);
546: }
547:
548: /*
1.7 veillard 549: * xmlParseEndTag: parse an end of tag, note that the '</' part has
550: * already been read.
551: */
552:
553: void xmlParseEndTag(CHAR **p, xmlDocPtr doc, xmlDtdPtr *dtdPtr, CHAR **tagPtr) {
554: CHAR *cur = *p, *q, *ns, *name;
555: xmlDtdPtr dtd = NULL;
556:
557: *dtdPtr = NULL;
558: *tagPtr = NULL;
559:
560: /*
561: * Theorically one should just parse a Name, but with the addition
562: * of the namespace needed for WebDav, it's a bit more complicated
563: * since the element name may be prefixed by a namespace prefix.
564: *
565: * QName ::= (NSPart ':')? LocalPart
566: * NSPart ::= Name
567: * LocalPart ::= Name
568: * ETag ::= '</' QName S? '>'
569: *
570: * instead of :
571: *
572: * ETag ::= '</' Name S? '>'
573: */
574: if (!IS_LETTER(*cur) && (*cur != '_')) return;
575: q = cur++;
576: while ((IS_LETTER(*cur)) || (IS_DIGIT(*cur)) ||
577: (*cur == '.') || (*cur == '-') || (*cur == '_') ||
578: (IS_COMBINING(*cur)) || (IS_IGNORABLE(*cur)) ||
579: (IS_EXTENDER(*cur)))
580: cur++;
581:
582: if (*cur == ':') {
583: ns = xmlStrndup(q, cur - q);
584:
585: cur++; /* skip the column */
586: if (!IS_LETTER(*cur) && (*cur != '_')) {
587: fprintf(stderr,
588: "End tag : no element name after namespace identifier %.20s\n",
589: q);
590: free(ns);
591: *p = cur;
592: return;
593: }
594: q = cur++;
595: while ((IS_LETTER(*cur)) || (IS_DIGIT(*cur)) ||
596: (*cur == '.') || (*cur == '-') || (*cur == '_') ||
1.12 veillard 597: (*cur == ':') ||
1.7 veillard 598: (IS_COMBINING(*cur)) || (IS_IGNORABLE(*cur)) ||
599: (IS_EXTENDER(*cur)))
600: cur++;
601: name = xmlStrndup(q, cur - q);
602:
603: /*
604: * Search the DTD associated to ns.
605: */
606: dtd = xmlSearchDtd(doc, ns);
607: if (dtd == NULL)
608: fprintf(stderr, "End tag : Couldn't find namespace %s\n", ns);
609: free(ns);
610: } else
611: name = xmlStrndup(q, cur - q);
612:
613: *dtdPtr = dtd;
614: *tagPtr = name;
615:
616: /*
617: * We should definitely be at the ending "S? '>'" part
618: */
619: SKIP_BLANKS(cur);
620: if ((!IS_CHAR(*cur)) || (*cur != '>')) {
621: fprintf(stderr, "End tag : expected '>', got %.20s\n", cur);
622: /*
623: * Note : skipping to the next '>' is probably otherkill,
624: * especially in case the '>' is hust missing.
625: *
626: * Otherwise add:
627: * MOVETO_ENDTAG(cur);
628: */
629: } else
630: cur++;
631:
632: *p = cur;
633: return;
634: }
635:
636: /*
1.3 veillard 637: * xmlParseCDSect: escaped pure raw content.
638: */
639: CHAR *xmlParseCDSect(CHAR **p) {
640: CHAR *cur = *p, *r, *s, *base, *ret;
641:
642: base = cur;
643: if (!IS_CHAR(*cur)) {
1.7 veillard 644: fprintf(stderr, "CData section not finished : %.20s\n", base);
1.3 veillard 645: return(NULL);
646: }
647: r = cur++;
648: if (!IS_CHAR(*cur)) {
1.7 veillard 649: fprintf(stderr, "CData section not finished : %.20s\n", base);
1.3 veillard 650: return(NULL);
651: }
652: s = cur++;
653: while (IS_CHAR(*cur) &&
654: ((*r != ']') || (*s != ']') || (*cur != '>'))) {
655: r++;s++;cur++;
656: }
657: if (!IS_CHAR(*cur)) {
1.7 veillard 658: fprintf(stderr, "CData section not finished : %.20s\n", base);
1.3 veillard 659: return(NULL);
660: }
661: ret = xmlStrndup(base, cur-base);
1.2 veillard 662: *p = cur;
663: return(ret);
664: }
665:
666: /*
667: * xmlParseContent: a content is
668: * (element | PCData | Reference | CDSect | PI | Comment)
669: *
670: * element : starts by '<'
671: * PCData : any CHAR but '&' or '<'
672: * Reference : starts by '&'
673: * CDSect : starts by '<![CDATA['
674: * PI : starts by '<?'
675: */
676:
1.3 veillard 677: xmlNodePtr xmlParseContent(CHAR **p, xmlDocPtr doc, xmlNodePtr node) {
678: CHAR *cur = *p, *q, *data = NULL;
1.2 veillard 679: xmlNodePtr ret = NULL;
680:
681: /*
1.3 veillard 682: * First case : a Processing Instruction.
683: */
684: if ((cur[0] == '<') && (cur[1] == '?')) {
685: xmlParsePI(&cur, doc);
686: }
687: /*
688: * Second case : a CDSection
1.2 veillard 689: */
1.3 veillard 690: if ((cur[0] == '<') && (cur[1] == '!') && (cur[2] == '[') &&
691: (cur[3] == 'C') && (cur[4] == 'D') && (cur[5] == 'A') &&
692: (cur[6] == 'T') && (cur[7] == 'A') && (cur[8] == '[')) {
693: cur += 9;
694: data = xmlParseCDSect(&cur);
695: }
696: /*
697: * Third case : a sub-element.
698: */
699: else if (cur[0] == '<') {
700: ret = xmlParseElement(&cur, doc);
701: }
702: /*
703: * Last case, text. Note that References are handled directly.
704: */
705: else {
706: q = cur;
707: while (IS_CHAR(*cur) && (*cur != '<')) cur++;
708:
709: if (!IS_CHAR(*cur)) {
1.7 veillard 710: fprintf(stderr, "Truncated content : %.50s\n", q);
1.4 veillard 711: *p = cur;
1.3 veillard 712: return(NULL);
713: }
1.14 ! veillard 714:
! 715: /*
! 716: * Do the Entities decoding...
! 717: */
! 718: data = xmlStrdup(xmlDecodeEntities(doc, q, cur - q));
1.3 veillard 719: }
720:
721: /*
722: * Handle the data if any. If there is no child
723: * add it as content, otherwise create a new node of type text.
724: */
725: if (data != NULL)
726: data = xmlHandleData(data);
727: if (data != NULL) {
728: if (node->childs == NULL)
729: xmlNodeSetContent(node, data);
730: else {
731: ret = xmlNewText(data);
732: }
733: }
1.2 veillard 734:
735: *p = cur;
736: return(ret);
737: }
738:
739: /*
740: * xmlParseElement: parse an XML element
741: */
742:
1.3 veillard 743: xmlNodePtr xmlParseElement(CHAR **p, xmlDocPtr doc) {
1.2 veillard 744: CHAR *cur = *p;
745: xmlNodePtr ret, child;
1.7 veillard 746: CHAR *openTag = *p;
747: CHAR *closeTag = *p;
1.2 veillard 748:
1.3 veillard 749: ret = xmlParseStartTag(&cur, doc);
750: if (ret == NULL) {
751: *p = cur;
752: return(NULL);
753: }
1.2 veillard 754:
755: /*
756: * Check for an Empty Element.
757: */
758: if ((cur[0] == '/') && (cur[1] == '>')) {
759: cur += 2;
760: *p = cur;
761: return(ret);
762: }
763: if (cur[0] == '>') cur++;
764: else {
1.7 veillard 765: fprintf(stderr, "Couldn't find end of Start Tag %.30s\n", *p);
1.2 veillard 766: *p = cur;
767: return(ret);
768: }
769:
770: /*
771: * Parse the content of the element:
772: * (element | PCData | Reference | CDSect | PI | Comment) *
773: *
774: * element : starts by '<'
775: * PCData : any CHAR but '&' or '<'
776: * Reference : starts by '&'
777: * CDSect : starts by '<![CDATA['
778: * PI : starts by '<?'
779: *
780: * The loop stops upon detection of an end of tag '</'
781: */
782: while ((IS_CHAR(cur[0])) && ((cur[0] != '<') || (cur[1] != '/'))) {
1.3 veillard 783: child = xmlParseContent(&cur, doc, ret);
1.2 veillard 784: if (child != NULL)
785: xmlAddChild(ret, child);
786: }
787: if (!IS_CHAR(cur[0])) {
1.7 veillard 788: fprintf(stderr, "Premature end of data in tag %.30s\n", *p);
1.2 veillard 789: *p = cur;
790: return(ret);
791: }
792:
793: /*
794: * parse the end of tag : '</' has been detected.
795: */
796: cur += 2;
797: if (*cur == '>') cur++; /* simplified closing </> */
798: else {
1.7 veillard 799: CHAR *endTag;
800: xmlDtdPtr endDtd;
801:
802: xmlParseEndTag(&cur, doc, &endDtd, &endTag);
803:
1.2 veillard 804: /*
1.7 veillard 805: * Check that the Name in the ETag is the same as in the STag.
1.2 veillard 806: */
1.7 veillard 807: if (endDtd != ret->dtd) {
808: fprintf(stderr, "Start and End tags don't use the same DTD:\n");
809: fprintf(stderr, "\t%.30s\n\t%.30s\n", openTag, closeTag);
810: }
811: if (strcmp(ret->name, endTag)) {
812: fprintf(stderr, "Start and End tags don't use the same name:\n");
813: fprintf(stderr, "\t%.30s\n\t%.30s\n", openTag, closeTag);
814: }
1.2 veillard 815: }
816:
817: *p = cur;
818: return(ret);
819: }
820:
821: /*
1.1 veillard 822: * xmlParseXMLDecl: parse an XML declaration header
823: */
824:
825: xmlDocPtr xmlParseXMLDecl(CHAR **p) {
826: CHAR *cur = *p;
827: CHAR *version;
828: xmlDocPtr ret;
829:
830: /*
831: * We know that '<?XML' is here.
832: */
833: cur += 5;
834:
835: /*
836: * Parse the version info
837: */
838: SKIP_BLANKS(cur);
839:
840: /*
841: * We should have 'version=' here !
842: */
843: if ((cur[0] == 'v') && (cur[1] == 'e') && (cur[2] == 'r') &&
844: (cur[3] == 's') && (cur[4] == 'i') && (cur[5] == 'o') &&
845: (cur[6] == 'n') && (cur[7] == '=')) {
846: cur += 8;
847: version = xmlParseQuotedString(&cur);
848: if (version == NULL)
849: ret = xmlNewDoc(XML_DEFAULT_VERSION);
850: else {
851: ret = xmlNewDoc(version);
1.8 veillard 852: free(version);
1.1 veillard 853: }
854: } else {
855: ret = xmlNewDoc(XML_DEFAULT_VERSION);
856: }
857:
858: /*
1.14 ! veillard 859: * We should check for Required Markup Declaration TODO !!!!
1.1 veillard 860: */
861: MOVETO_ENDTAG(cur);
862: cur++;
863:
864: *p = cur;
865: return(ret);
866: }
867:
868: /*
869: * xmlParseMisc: parse an XML Misc optionnal field.
870: * (Comment | PI | S)*
871: */
872:
1.3 veillard 873: void xmlParseMisc(CHAR **p, xmlDocPtr doc) {
1.1 veillard 874: CHAR *cur = *p;
875:
876: while (((cur[0] == '<') && (cur[1] == '?')) ||
877: ((cur[0] == '<') && (cur[1] == '!') &&
878: (cur[2] == '-') && (cur[2] == '-')) ||
879: IS_BLANK(*cur)) {
880: if ((cur[0] == '<') && (cur[1] == '?')) {
1.3 veillard 881: xmlParsePI(&cur, doc);
1.1 veillard 882: } else if (IS_BLANK(*cur)) {
883: cur++;
884: } else
885: xmlParserSkipComment(&cur);
886: }
887:
888: *p = cur;
889: }
890:
891: /*
892: * xmlParseDoc : parse an XML document and build a tree.
893: */
894:
895: xmlDocPtr xmlParseDoc(CHAR *cur) {
896: xmlDocPtr ret;
1.14 ! veillard 897:
! 898: /*
! 899: * We should check for encoding here and plug-in some
! 900: * conversion code TODO !!!!
! 901: */
1.1 veillard 902:
903: /*
904: * Wipe out everything which is before the first '<'
905: */
906: SKIP_BLANKS(cur);
907:
908: /*
909: * Check for the XMLDecl in the Prolog.
910: */
911: if ((cur[0] == '<') && (cur[1] == '?') &&
912: (cur[2] == 'X') && (cur[3] == 'M') &&
913: (cur[4] == 'L')) {
914: ret = xmlParseXMLDecl(&cur);
915: /* SKIP_EOL(cur); */
916: SKIP_BLANKS(cur);
917: } else {
918: ret = xmlNewDoc(XML_DEFAULT_VERSION);
919: }
920:
921: /*
922: * The Misc part of the Prolog
923: * (Comment | PI | S) *
924: */
925: xmlParseMisc(&cur, ret);
926:
927: /*
1.2 veillard 928: * Time to start parsing
1.1 veillard 929: */
1.3 veillard 930: ret->root = xmlParseElement(&cur, ret);
1.1 veillard 931:
932: return(ret);
933: }
934:
1.9 httpng 935: /*
936: * xmlParseFile : parse an XML file and build a tree.
937: */
938:
939: xmlDocPtr xmlParseFile(const char *filename) {
940: xmlDocPtr ret;
941: int input;
942: int res;
943: struct stat buf;
944: char *buffer;
945:
1.11 veillard 946: res = stat(filename, &buf);
1.9 httpng 947: if (res < 0) return(NULL);
948:
949: buffer = malloc(buf.st_size + 100);
950: if (buffer == NULL) {
951: perror("malloc");
952: return(NULL);
953: }
954:
955: memset(buffer, 0, sizeof(buffer));
956: input = open (filename, O_RDONLY);
957: if (input < 0) {
958: fprintf (stderr, "Cannot read file %s :\n", filename);
959: perror ("open failed");
960: return(NULL);
961: }
962: res = read(input, buffer, buf.st_size);
963: if (res < 0) {
964: fprintf (stderr, "Cannot read file %s :\n", filename);
965: perror ("read failed");
966: return(NULL);
967: }
968: close(input);
969:
970: buffer[buf.st_size] = '\0';
971: ret = xmlParseDoc(buffer);
972: free(buffer);
973: return(ret);
974: }
Webmaster