Annotation of XML/parser.c, revision 1.15
1.1 veillard 1: /*
1.3 veillard 2: * parser.c : an XML 1.0 non-verifying parser
1.15 ! veillard 3: *
! 4: * See Copyright for the status of this software.
! 5: *
! 6: * $Id$
1.1 veillard 7: */
8:
1.9 httpng 9: #include <config.h>
1.1 veillard 10: #include <stdio.h>
11: #include <ctype.h>
1.14 veillard 12: #include <string.h> /* for memset() only */
1.1 veillard 13: #include <malloc.h>
1.9 httpng 14: #include <sys/stat.h>
15: #ifdef HAVE_FCNTL_H
16: #include <fcntl.h>
17: #endif
1.10 httpng 18: #ifdef HAVE_UNISTD_H
19: #include <unistd.h>
20: #endif
1.1 veillard 21:
1.14 veillard 22: #include "tree.h"
1.1 veillard 23: #include "parser.h"
1.14 veillard 24: #include "entities.h"
1.1 veillard 25:
26: /*
27: * A few macros needed to help building the parser.
28: */
29:
/*
 * Character classification macros.
 *
 * Every macro below evaluates its argument several times: never pass an
 * expression with side effects.
 */
#ifdef UNICODE
/*
 * UNICODE version of the macros.      Incomplete now TODO !!!!
 *
 * SKIP_BLANKS is not defined in this branch: it is defined once, for
 * both configurations, after the #endif.
 */
#define IS_CHAR(c) \
    (((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d) || \
     (((c) >= 0x20) && ((c) != 0xFFFE) && ((c) != 0xFFFF)))

/*
 * I'm too lazy to complete this one TODO !!!!
 *
 * NOTE(review): a former clause tested (c >= 0xaa && c <= 0x5b), which can
 * never be true, and has been dropped. Presumably the single value 0xaa
 * was meant - confirm against the BaseChar production before adding it.
 */
#define IS_BASECHAR(c) \
    ((((c) >= 0x41) && ((c) <= 0x5a)) || \
     (((c) >= 0x61) && ((c) <= 0x7a)) || \
     (((c) >= 0xc0) && ((c) <= 0xd6)) || \
     (((c) >= 0xd8) && ((c) <= 0xf6)) || \
     (((c) >= 0xf8) && ((c) <= 0xff)) || \
     ((c) == 0xba))

/* I'm too lazy to complete this one TODO !!!! */
#define IS_DIGIT(c) (((c) >= 0x30) && ((c) <= 0x39))

/* I'm too lazy to complete this one TODO !!!! */
#define IS_COMBINING(c) 0

#define IS_IGNORABLE(c) \
    ((((c) >= 0x200c) && ((c) <= 0x200f)) || \
     (((c) >= 0x202a) && ((c) <= 0x202e)) || \
     (((c) >= 0x206a) && ((c) <= 0x206f)) || \
     ((c) == 0xfeff))

#define IS_EXTENDER(c) \
    (((c) == 0xb7) || ((c) == 0x2d0) || ((c) == 0x2d1) || \
     ((c) == 0x387) || ((c) == 0x640) || ((c) == 0xe46) || \
     ((c) == 0xec6) || ((c) == 0x3005) || \
     (((c) >= 0x3031) && ((c) <= 0x3035)) || \
     (((c) >= 0x309b) && ((c) <= 0x309e)) || \
     (((c) >= 0x30fc) && ((c) <= 0x30fe)) || \
     (((c) >= 0xff70) && ((c) <= 0xff9e)) || \
     ((c) == 0xff9f))

#define IS_IDEOGRAPHIC(c) \
    ((((c) >= 0x4e00) && ((c) <= 0x9fa5)) || \
     (((c) >= 0xf900) && ((c) <= 0xfa2d)) || \
     (((c) >= 0x3021) && ((c) <= 0x3029)) || \
     ((c) == 0x3007))

#define IS_LETTER(c) (IS_BASECHAR(c) || IS_IDEOGRAPHIC(c))

/* Blank: space, tab, line feed or carriage return. */
#define IS_BLANK(c) \
    (((c) == 0x20) || ((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d))
#else
/*
 * 8bits / ASCII version of the macros.
 */
#define IS_CHAR(c) \
    (((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d) || ((c) >= 0x20))

/*
 * NOTE(review): a former clause tested (c >= 0xaa && c <= 0x5b), which can
 * never be true, and has been dropped. Presumably the single value 0xaa
 * was meant - confirm against the BaseChar production before adding it.
 */
#define IS_BASECHAR(c) \
    ((((c) >= 0x41) && ((c) <= 0x5a)) || \
     (((c) >= 0x61) && ((c) <= 0x7a)) || \
     (((c) >= 0xc0) && ((c) <= 0xd6)) || \
     (((c) >= 0xd8) && ((c) <= 0xf6)) || \
     (((c) >= 0xf8) && ((c) <= 0xff)) || \
     ((c) == 0xba))

#define IS_DIGIT(c) (((c) >= 0x30) && ((c) <= 0x39))

#define IS_LETTER(c) IS_BASECHAR(c)

#define IS_COMBINING(c) 0

#define IS_IGNORABLE(c) 0

#define IS_EXTENDER(c) ((c) == 0xb7)

/* Blank: space, tab, line feed or carriage return. */
#define IS_BLANK(c) \
    (((c) == 0x20) || ((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d))
#endif
112:
113:
/*
 * Pointer-advancing helpers. Each one expands to a single statement, so it
 * can be used safely as the body of an if/else; terminate the call with
 * a ';'. The argument is evaluated several times and must be a plain
 * pointer lvalue.
 */

/* Step over one end of line: CR, LF, CR LF or LF CR. */
#define SKIP_EOL(p) \
    do { \
        if (*(p) == 0x0d) { (p)++; if (*(p) == 0x0a) (p)++; } \
        else if (*(p) == 0x0a) { (p)++; if (*(p) == 0x0d) (p)++; } \
    } while (0)

/* Advance p to the first CHAR which is not a blank. */
#define SKIP_BLANKS(p) \
    do { while (IS_BLANK(*(p))) (p)++; } while (0)

/* Advance p to the next '>' or to the end of the data. */
#define MOVETO_ENDTAG(p) \
    do { while (IS_CHAR(*(p)) && (*(p) != '>')) (p)++; } while (0)

/* Advance p to the next '<' or to the end of the data. */
#define MOVETO_STARTTAG(p) \
    do { while (IS_CHAR(*(p)) && (*(p) != '<')) (p)++; } while (0)
126:
127: /*
1.3 veillard 128: * Forward declaration for recursive behaviour.
129: */
130: xmlNodePtr xmlParseElement(CHAR **p, xmlDocPtr doc);
131:
132: /*
133: * xmlHandleData : this routine represent's the specific application
134: * behaviour when reading a piece of text.
135: *
136: * For example in WebDav, any piece made only of blanks is eliminated
137: */
138:
139: CHAR *xmlHandleData(CHAR *in) {
140: CHAR *cur;
141:
142: if (in == NULL) return(NULL);
143: cur = in;
144: while (IS_CHAR(*cur)) {
145: if (!IS_BLANK(*cur)) goto not_blank;
146: cur++;
147: }
148: free(in);
149: return(NULL);
150:
151: not_blank:
152: return(in);
153: }
154:
155: /*
1.1 veillard 156: * xmlStrndup : a strdup for array of CHAR's
157: */
158:
1.6 httpng 159: CHAR *xmlStrndup(const CHAR *cur, int len) {
1.1 veillard 160: CHAR *ret = malloc((len + 1) * sizeof(CHAR));
161:
162: if (ret == NULL) {
163: fprintf(stderr, "malloc of %d byte failed\n",
164: (len + 1) * sizeof(CHAR));
165: return(NULL);
166: }
167: memcpy(ret, cur, len * sizeof(CHAR));
168: ret[len] = 0;
169: return(ret);
170: }
171:
172: /*
173: * xmlStrdup : a strdup for CHAR's
174: */
175:
1.6 httpng 176: CHAR *xmlStrdup(const CHAR *cur) {
177: const CHAR *p = cur;
1.1 veillard 178:
179: while (IS_CHAR(*p)) p++;
180: return(xmlStrndup(cur, p - cur));
181: }
182:
183: /*
1.14 veillard 184: * xmlStrcmp : a strcmp for CHAR's
185: */
186:
187: int xmlStrcmp(const CHAR *str1, const CHAR *str2) {
188: register int tmp;
189:
190: do {
191: tmp = *str1++ - *str2++;
192: if (tmp != 0) return(tmp);
193: } while ((*str1 != 0) && (*str2 != 0));
194: return (*str1 - *str2);
195: }
196:
197: /*
198: * xmlStrncmp : a strncmp for CHAR's
199: */
200:
201: int xmlStrncmp(const CHAR *str1, const CHAR *str2, int len) {
202: register int tmp;
203:
204: if (len <= 0) return(0);
205: do {
206: tmp = *str1++ - *str2++;
207: if (tmp != 0) return(tmp);
208: len--;
209: if (len <= 0) return(0);
210: } while ((*str1 != 0) && (*str2 != 0));
211: return (*str1 - *str2);
212: }
213:
214: /*
215: * xmlStrchr : a strchr for CHAR's
216: */
217:
218: CHAR *xmlStrchr(const CHAR *str, CHAR val) {
219: while (*str != 0) {
220: if (*str == val) return((CHAR *) str);
221: str++;
222: }
223: return(NULL);
224: }
225:
226: /*
1.1 veillard 227: * xmlParseName : parse an XML name.
228: */
229:
1.3 veillard 230: CHAR *xmlParseName(CHAR **p) {
231: CHAR *cur = *p, *q, *ret = NULL;
1.1 veillard 232:
233: /*
1.3 veillard 234: * Name ::= (Letter | '_') (NameChar)*
1.1 veillard 235: */
1.3 veillard 236: if (!IS_LETTER(*cur) && (*cur != '_')) return(NULL);
237: q = cur++;
238: while ((IS_LETTER(*cur)) || (IS_DIGIT(*cur)) ||
239: (*cur == '.') || (*cur == '-') || (*cur == '_') ||
1.12 veillard 240: (*cur == ':') ||
1.3 veillard 241: (IS_COMBINING(*cur)) || (IS_IGNORABLE(*cur)) ||
242: (IS_EXTENDER(*cur)))
243: cur++;
244:
245: ret = xmlStrndup(q, cur - q);
1.1 veillard 246:
1.3 veillard 247: *p = cur;
248: return(ret);
1.1 veillard 249: }
250:
251: /*
252: * Parse and return a string between quotes or doublequotes
253: */
254: CHAR *xmlParseQuotedString(CHAR **p) {
255: CHAR *ret = NULL;
256: CHAR *cur = *p, *q;
257:
258: if (*cur == '"') {
259: cur++;
260: q = cur;
261: while (IS_CHAR(*cur) && (*cur != '"')) cur++;
262: if (*cur != '"')
1.7 veillard 263: fprintf(stderr, "String not closed \"%.50s\n", q);
1.1 veillard 264: else {
265: ret = xmlStrndup(q, cur - q);
266: cur++;
267: }
268: } else if (*cur == '\''){
269: cur++;
270: q = cur;
271: while (IS_CHAR(*cur) && (*cur != '\'')) cur++;
272: if (*cur != '\'')
1.7 veillard 273: fprintf(stderr, "String not closed '%.50s\n", q);
1.1 veillard 274: else {
275: ret = xmlStrndup(q, cur - q);
276: cur++;
277: }
278: }
279: *p = cur;
280: return(ret);
281: }
282:
283: /*
1.3 veillard 284: * Skip an XML (SGML) comment <!-- .... -->
285: */
286: void xmlParserSkipComment(CHAR **p) {
287: CHAR *cur = *p, *q, *r, *start;
288:
289: /*
290: * An extra check may avoid errors and isn't that costly !
291: */
292: if ((cur[0] != '<') || (cur[1] != '!') ||
293: (cur[2] != '-') || (cur[3] != '-')) return;
294:
295: cur += 4;
296: start = q = cur;
297: cur++;
298: r = cur;
299: cur++;
300: while (IS_CHAR(*cur) &&
1.12 veillard 301: ((*cur == ':') || (*cur != '>') || (*r != '-') || (*q != '-'))) {
1.3 veillard 302: cur++;r++;q++;
303: }
304: if (!IS_CHAR(*cur)) {
1.7 veillard 305: fprintf(stderr, "Comment not terminated <!--%.50s\n", start);
1.3 veillard 306: *p = start;
307: } else {
308: cur++;
309: *p = cur;
310: }
311: }
312:
313: /*
1.13 veillard 314: * xmlParseNamespace: parse specific '<?namespace ...' constructs.
1.1 veillard 315: */
316:
1.13 veillard 317: void xmlParseNamespace(CHAR **p, xmlDocPtr doc) {
1.1 veillard 318: CHAR *cur = *p;
319: CHAR *href = NULL;
320: CHAR *AS = NULL;
1.3 veillard 321: int garbage = 0;
1.1 veillard 322:
323: /*
324: * We know that 'namespace' is here.
325: */
326: cur += 9;
327: SKIP_BLANKS(cur);
328:
329: while (IS_CHAR(*cur) && (*cur != '>')) {
330: /*
331: * We can have 'href' or 'AS' attributes.
332: */
333: if ((cur[0] == 'h') && (cur[1] == 'r') && (cur[2] == 'e') &&
334: (cur[3] == 'f')) {
1.3 veillard 335: garbage = 0;
1.1 veillard 336: cur += 4;
337: SKIP_BLANKS(cur);
338:
339: if (*cur != '=') continue;
340: cur++;
341: SKIP_BLANKS(cur);
342:
343: href = xmlParseQuotedString(&cur);
344: SKIP_BLANKS(cur);
345: } else if ((cur[0] == 'A') && (cur[1] == 'S')) {
1.3 veillard 346: garbage = 0;
1.1 veillard 347: cur += 2;
348: SKIP_BLANKS(cur);
349:
350: if (*cur != '=') continue;
351: cur++;
352: SKIP_BLANKS(cur);
353:
354: AS = xmlParseQuotedString(&cur);
355: SKIP_BLANKS(cur);
356: } else if ((cur[0] == '?') && (cur[1] == '>')) {
1.3 veillard 357: garbage = 0;
1.1 veillard 358: cur ++;
359: } else {
1.3 veillard 360: /*
361: * Found garbage when parsing the namespace
362: */
363: if (!garbage) fprintf(stderr,
1.13 veillard 364: "\nxmlParseNamespace found garbage: ");
1.3 veillard 365: fprintf(stderr, "%c", *cur);
1.1 veillard 366: cur++;
367: }
368: }
369:
370: MOVETO_ENDTAG(cur);
371: cur++;
372:
373: /*
374: * Register the DTD.
375: */
376: if (href != NULL)
377: xmlNewDtd(doc, href, AS);
378:
1.8 veillard 379: if (AS != NULL) free(AS);
380: if (href != NULL) free(href);
381:
1.1 veillard 382: *p = cur;
383: }
384:
385: /*
1.3 veillard 386: * xmlParsePI: parse an XML Processing Instruction.
387: */
388:
389: void xmlParsePI(CHAR **p, xmlDocPtr doc) {
390: CHAR *cur = *p;
391:
392: if ((cur[0] == '<') && (cur[1] == '?')) {
393: /*
394: * this is a Processing Instruction.
395: */
396: cur += 2;
397:
398: /*
399: * Special for WebDav, support for the Processing Instruction
400: * '<?namespace ...' contruct in the header of the XML document.
401: */
402: if ((cur[0] == 'n') && (cur[1] == 'a') &&
403: (cur[2] == 'm') && (cur[3] == 'e') &&
404: (cur[4] == 's') && (cur[5] == 'p') &&
405: (cur[6] == 'a') && (cur[7] == 'c') &&
406: (cur[8] == 'e')) {
1.13 veillard 407: xmlParseNamespace(&cur, doc);
1.3 veillard 408: } else {
409: /* Unknown PI, ignore it ! */
1.13 veillard 410: fprintf(stderr, "xmlParsePI : skipping unknown PI %30s\n", cur);
1.3 veillard 411: MOVETO_ENDTAG(cur);
412: cur++;
413: }
414: }
415: *p = cur;
416: }
417:
418: /*
419: * xmlParseAttribute: parse a start of tag.
420: *
421: * Attribute ::= Name Eq AttValue
422: */
423:
424: void xmlParseAttribute(CHAR **p, xmlNodePtr node) {
425: CHAR *cur = *p, *q, *name, *value = NULL;
426:
427: if (!IS_LETTER(*cur) && (*cur != '_')) {
428: return;
429: }
430: q = cur++;
431: while ((IS_LETTER(*cur)) || (IS_DIGIT(*cur)) ||
432: (*cur == '.') || (*cur == '-') || (*cur == '_') ||
1.12 veillard 433: (*cur == ':') ||
1.3 veillard 434: (IS_COMBINING(*cur)) || (IS_IGNORABLE(*cur)) ||
435: (IS_EXTENDER(*cur)))
436: cur++;
437: name = xmlStrndup(q, cur - q);
438:
439: /*
440: * We should have the equal, we are laxist here and allow attributes
441: * without values and extra spaces.
442: */
443: SKIP_BLANKS(cur);
444: if (*cur == '=') {
445: cur++;
446: SKIP_BLANKS(cur);
447: if ((*cur != '\'') && (*cur != '"')) {
1.7 veillard 448: fprintf(stderr, "Quotes were expected for attribute value %.20s\n",
1.3 veillard 449: q);
450: } else
451: value = xmlParseQuotedString(&cur);
452: }
453:
454: /*
455: * Add the attribute to the node.
456: */
457: if (name != NULL)
458: xmlNewProp(node, name, value);
459:
460: *p = cur;
461: }
462:
463: /*
1.2 veillard 464: * xmlParseStartTag: parse a start of tag.
465: */
466:
1.3 veillard 467: xmlNodePtr xmlParseStartTag(CHAR **p, xmlDocPtr doc) {
468: CHAR *cur = *p, *q, *ns, *name;
469: xmlDtdPtr dtd = NULL;
1.2 veillard 470: xmlNodePtr ret = NULL;
471:
472: /*
1.3 veillard 473: * Theorically one should just parse a Name, but with the addition
474: * of the namespace needed for WebDav, it's a bit more complicated
475: * since the element name may be prefixed by a namespace prefix.
476: *
477: * QName ::= (NSPart ':')? LocalPart
478: * NSPart ::= Name
479: * LocalPart ::= Name
480: * STag ::= '<' QName (S Attribute)* S? '>'
481: *
482: * instead of :
483: *
484: * STag ::= '<' QName (S Attribute)* S? '>'
1.2 veillard 485: */
1.3 veillard 486: if (*cur != '<') return(NULL);
487: cur++;
488:
489: if (!IS_LETTER(*cur) && (*cur != '_')) return(NULL);
490: q = cur++;
491: while ((IS_LETTER(*cur)) || (IS_DIGIT(*cur)) ||
492: (*cur == '.') || (*cur == '-') || (*cur == '_') ||
493: (IS_COMBINING(*cur)) || (IS_IGNORABLE(*cur)) ||
494: (IS_EXTENDER(*cur)))
495: cur++;
496:
497: if (*cur == ':') {
498: ns = xmlStrndup(q, cur - q);
499:
500: cur++; /* skip the column */
501: if (!IS_LETTER(*cur) && (*cur != '_')) {
1.7 veillard 502: fprintf(stderr,
503: "Start tag : no element name after namespace identifier %.20s\n",
1.3 veillard 504: q);
505: free(ns);
506: *p = cur;
507: return(NULL);
508: }
509: q = cur++;
510: while ((IS_LETTER(*cur)) || (IS_DIGIT(*cur)) ||
511: (*cur == '.') || (*cur == '-') || (*cur == '_') ||
1.12 veillard 512: (*cur == ':') ||
1.3 veillard 513: (IS_COMBINING(*cur)) || (IS_IGNORABLE(*cur)) ||
514: (IS_EXTENDER(*cur)))
515: cur++;
516: name = xmlStrndup(q, cur - q);
517:
518: /*
519: * Search the DTD associated to ns.
520: */
521: dtd = xmlSearchDtd(doc, ns);
522: if (dtd == NULL)
1.7 veillard 523: fprintf(stderr, "Start tag : Couldn't find namespace %s\n", ns);
1.3 veillard 524: free(ns);
525: } else
526: name = xmlStrndup(q, cur - q);
527:
528: ret = xmlNewNode(dtd, name, NULL);
1.2 veillard 529:
1.3 veillard 530: /*
531: * Now parse the attributes, it ends up with the ending
532: *
533: * (S Attribute)* S?
534: */
535: SKIP_BLANKS(cur);
536: while ((IS_CHAR(*cur)) &&
537: (*cur != '>') &&
538: ((cur[0] != '/') || (cur[1] != '>'))) {
539: if (IS_LETTER(*cur) || (*cur == '_'))
540: xmlParseAttribute(&cur, ret);
541: else {
1.14 veillard 542: /* We should warn TODO !!! */
1.3 veillard 543: cur++;
544: }
545: SKIP_BLANKS(cur);
546: }
547:
548: *p = cur;
549: return(ret);
550: }
551:
552: /*
1.7 veillard 553: * xmlParseEndTag: parse an end of tag, note that the '</' part has
554: * already been read.
555: */
556:
557: void xmlParseEndTag(CHAR **p, xmlDocPtr doc, xmlDtdPtr *dtdPtr, CHAR **tagPtr) {
558: CHAR *cur = *p, *q, *ns, *name;
559: xmlDtdPtr dtd = NULL;
560:
561: *dtdPtr = NULL;
562: *tagPtr = NULL;
563:
564: /*
565: * Theorically one should just parse a Name, but with the addition
566: * of the namespace needed for WebDav, it's a bit more complicated
567: * since the element name may be prefixed by a namespace prefix.
568: *
569: * QName ::= (NSPart ':')? LocalPart
570: * NSPart ::= Name
571: * LocalPart ::= Name
572: * ETag ::= '</' QName S? '>'
573: *
574: * instead of :
575: *
576: * ETag ::= '</' Name S? '>'
577: */
578: if (!IS_LETTER(*cur) && (*cur != '_')) return;
579: q = cur++;
580: while ((IS_LETTER(*cur)) || (IS_DIGIT(*cur)) ||
581: (*cur == '.') || (*cur == '-') || (*cur == '_') ||
582: (IS_COMBINING(*cur)) || (IS_IGNORABLE(*cur)) ||
583: (IS_EXTENDER(*cur)))
584: cur++;
585:
586: if (*cur == ':') {
587: ns = xmlStrndup(q, cur - q);
588:
589: cur++; /* skip the column */
590: if (!IS_LETTER(*cur) && (*cur != '_')) {
591: fprintf(stderr,
592: "End tag : no element name after namespace identifier %.20s\n",
593: q);
594: free(ns);
595: *p = cur;
596: return;
597: }
598: q = cur++;
599: while ((IS_LETTER(*cur)) || (IS_DIGIT(*cur)) ||
600: (*cur == '.') || (*cur == '-') || (*cur == '_') ||
1.12 veillard 601: (*cur == ':') ||
1.7 veillard 602: (IS_COMBINING(*cur)) || (IS_IGNORABLE(*cur)) ||
603: (IS_EXTENDER(*cur)))
604: cur++;
605: name = xmlStrndup(q, cur - q);
606:
607: /*
608: * Search the DTD associated to ns.
609: */
610: dtd = xmlSearchDtd(doc, ns);
611: if (dtd == NULL)
612: fprintf(stderr, "End tag : Couldn't find namespace %s\n", ns);
613: free(ns);
614: } else
615: name = xmlStrndup(q, cur - q);
616:
617: *dtdPtr = dtd;
618: *tagPtr = name;
619:
620: /*
621: * We should definitely be at the ending "S? '>'" part
622: */
623: SKIP_BLANKS(cur);
624: if ((!IS_CHAR(*cur)) || (*cur != '>')) {
625: fprintf(stderr, "End tag : expected '>', got %.20s\n", cur);
626: /*
627: * Note : skipping to the next '>' is probably otherkill,
628: * especially in case the '>' is hust missing.
629: *
630: * Otherwise add:
631: * MOVETO_ENDTAG(cur);
632: */
633: } else
634: cur++;
635:
636: *p = cur;
637: return;
638: }
639:
640: /*
1.3 veillard 641: * xmlParseCDSect: escaped pure raw content.
642: */
643: CHAR *xmlParseCDSect(CHAR **p) {
644: CHAR *cur = *p, *r, *s, *base, *ret;
645:
646: base = cur;
647: if (!IS_CHAR(*cur)) {
1.7 veillard 648: fprintf(stderr, "CData section not finished : %.20s\n", base);
1.3 veillard 649: return(NULL);
650: }
651: r = cur++;
652: if (!IS_CHAR(*cur)) {
1.7 veillard 653: fprintf(stderr, "CData section not finished : %.20s\n", base);
1.3 veillard 654: return(NULL);
655: }
656: s = cur++;
657: while (IS_CHAR(*cur) &&
658: ((*r != ']') || (*s != ']') || (*cur != '>'))) {
659: r++;s++;cur++;
660: }
661: if (!IS_CHAR(*cur)) {
1.7 veillard 662: fprintf(stderr, "CData section not finished : %.20s\n", base);
1.3 veillard 663: return(NULL);
664: }
665: ret = xmlStrndup(base, cur-base);
1.2 veillard 666: *p = cur;
667: return(ret);
668: }
669:
670: /*
671: * xmlParseContent: a content is
672: * (element | PCData | Reference | CDSect | PI | Comment)
673: *
674: * element : starts by '<'
675: * PCData : any CHAR but '&' or '<'
676: * Reference : starts by '&'
677: * CDSect : starts by '<![CDATA['
678: * PI : starts by '<?'
679: */
680:
1.3 veillard 681: xmlNodePtr xmlParseContent(CHAR **p, xmlDocPtr doc, xmlNodePtr node) {
682: CHAR *cur = *p, *q, *data = NULL;
1.2 veillard 683: xmlNodePtr ret = NULL;
684:
685: /*
1.3 veillard 686: * First case : a Processing Instruction.
687: */
688: if ((cur[0] == '<') && (cur[1] == '?')) {
689: xmlParsePI(&cur, doc);
690: }
691: /*
692: * Second case : a CDSection
1.2 veillard 693: */
1.3 veillard 694: if ((cur[0] == '<') && (cur[1] == '!') && (cur[2] == '[') &&
695: (cur[3] == 'C') && (cur[4] == 'D') && (cur[5] == 'A') &&
696: (cur[6] == 'T') && (cur[7] == 'A') && (cur[8] == '[')) {
697: cur += 9;
698: data = xmlParseCDSect(&cur);
699: }
700: /*
701: * Third case : a sub-element.
702: */
703: else if (cur[0] == '<') {
704: ret = xmlParseElement(&cur, doc);
705: }
706: /*
707: * Last case, text. Note that References are handled directly.
708: */
709: else {
710: q = cur;
711: while (IS_CHAR(*cur) && (*cur != '<')) cur++;
712:
713: if (!IS_CHAR(*cur)) {
1.7 veillard 714: fprintf(stderr, "Truncated content : %.50s\n", q);
1.4 veillard 715: *p = cur;
1.3 veillard 716: return(NULL);
717: }
1.14 veillard 718:
719: /*
720: * Do the Entities decoding...
721: */
722: data = xmlStrdup(xmlDecodeEntities(doc, q, cur - q));
1.3 veillard 723: }
724:
725: /*
726: * Handle the data if any. If there is no child
727: * add it as content, otherwise create a new node of type text.
728: */
729: if (data != NULL)
730: data = xmlHandleData(data);
731: if (data != NULL) {
732: if (node->childs == NULL)
733: xmlNodeSetContent(node, data);
734: else {
735: ret = xmlNewText(data);
736: }
737: }
1.2 veillard 738:
739: *p = cur;
740: return(ret);
741: }
742:
743: /*
744: * xmlParseElement: parse an XML element
745: */
746:
1.3 veillard 747: xmlNodePtr xmlParseElement(CHAR **p, xmlDocPtr doc) {
1.2 veillard 748: CHAR *cur = *p;
749: xmlNodePtr ret, child;
1.7 veillard 750: CHAR *openTag = *p;
751: CHAR *closeTag = *p;
1.2 veillard 752:
1.3 veillard 753: ret = xmlParseStartTag(&cur, doc);
754: if (ret == NULL) {
755: *p = cur;
756: return(NULL);
757: }
1.2 veillard 758:
759: /*
760: * Check for an Empty Element.
761: */
762: if ((cur[0] == '/') && (cur[1] == '>')) {
763: cur += 2;
764: *p = cur;
765: return(ret);
766: }
767: if (cur[0] == '>') cur++;
768: else {
1.7 veillard 769: fprintf(stderr, "Couldn't find end of Start Tag %.30s\n", *p);
1.2 veillard 770: *p = cur;
771: return(ret);
772: }
773:
774: /*
775: * Parse the content of the element:
776: * (element | PCData | Reference | CDSect | PI | Comment) *
777: *
778: * element : starts by '<'
779: * PCData : any CHAR but '&' or '<'
780: * Reference : starts by '&'
781: * CDSect : starts by '<![CDATA['
782: * PI : starts by '<?'
783: *
784: * The loop stops upon detection of an end of tag '</'
785: */
786: while ((IS_CHAR(cur[0])) && ((cur[0] != '<') || (cur[1] != '/'))) {
1.3 veillard 787: child = xmlParseContent(&cur, doc, ret);
1.2 veillard 788: if (child != NULL)
789: xmlAddChild(ret, child);
790: }
791: if (!IS_CHAR(cur[0])) {
1.7 veillard 792: fprintf(stderr, "Premature end of data in tag %.30s\n", *p);
1.2 veillard 793: *p = cur;
794: return(ret);
795: }
796:
797: /*
798: * parse the end of tag : '</' has been detected.
799: */
800: cur += 2;
801: if (*cur == '>') cur++; /* simplified closing </> */
802: else {
1.7 veillard 803: CHAR *endTag;
804: xmlDtdPtr endDtd;
805:
806: xmlParseEndTag(&cur, doc, &endDtd, &endTag);
807:
1.2 veillard 808: /*
1.7 veillard 809: * Check that the Name in the ETag is the same as in the STag.
1.2 veillard 810: */
1.7 veillard 811: if (endDtd != ret->dtd) {
812: fprintf(stderr, "Start and End tags don't use the same DTD:\n");
813: fprintf(stderr, "\t%.30s\n\t%.30s\n", openTag, closeTag);
814: }
815: if (strcmp(ret->name, endTag)) {
816: fprintf(stderr, "Start and End tags don't use the same name:\n");
817: fprintf(stderr, "\t%.30s\n\t%.30s\n", openTag, closeTag);
818: }
1.2 veillard 819: }
820:
821: *p = cur;
822: return(ret);
823: }
824:
825: /*
1.1 veillard 826: * xmlParseXMLDecl: parse an XML declaration header
827: */
828:
829: xmlDocPtr xmlParseXMLDecl(CHAR **p) {
830: CHAR *cur = *p;
831: CHAR *version;
832: xmlDocPtr ret;
833:
834: /*
835: * We know that '<?XML' is here.
836: */
837: cur += 5;
838:
839: /*
840: * Parse the version info
841: */
842: SKIP_BLANKS(cur);
843:
844: /*
845: * We should have 'version=' here !
846: */
847: if ((cur[0] == 'v') && (cur[1] == 'e') && (cur[2] == 'r') &&
848: (cur[3] == 's') && (cur[4] == 'i') && (cur[5] == 'o') &&
849: (cur[6] == 'n') && (cur[7] == '=')) {
850: cur += 8;
851: version = xmlParseQuotedString(&cur);
852: if (version == NULL)
853: ret = xmlNewDoc(XML_DEFAULT_VERSION);
854: else {
855: ret = xmlNewDoc(version);
1.8 veillard 856: free(version);
1.1 veillard 857: }
858: } else {
859: ret = xmlNewDoc(XML_DEFAULT_VERSION);
860: }
861:
862: /*
1.14 veillard 863: * We should check for Required Markup Declaration TODO !!!!
1.1 veillard 864: */
865: MOVETO_ENDTAG(cur);
866: cur++;
867:
868: *p = cur;
869: return(ret);
870: }
871:
872: /*
873: * xmlParseMisc: parse an XML Misc optionnal field.
874: * (Comment | PI | S)*
875: */
876:
1.3 veillard 877: void xmlParseMisc(CHAR **p, xmlDocPtr doc) {
1.1 veillard 878: CHAR *cur = *p;
879:
880: while (((cur[0] == '<') && (cur[1] == '?')) ||
881: ((cur[0] == '<') && (cur[1] == '!') &&
882: (cur[2] == '-') && (cur[2] == '-')) ||
883: IS_BLANK(*cur)) {
884: if ((cur[0] == '<') && (cur[1] == '?')) {
1.3 veillard 885: xmlParsePI(&cur, doc);
1.1 veillard 886: } else if (IS_BLANK(*cur)) {
887: cur++;
888: } else
889: xmlParserSkipComment(&cur);
890: }
891:
892: *p = cur;
893: }
894:
895: /*
896: * xmlParseDoc : parse an XML document and build a tree.
897: */
898:
899: xmlDocPtr xmlParseDoc(CHAR *cur) {
900: xmlDocPtr ret;
1.14 veillard 901:
902: /*
903: * We should check for encoding here and plug-in some
904: * conversion code TODO !!!!
905: */
1.1 veillard 906:
907: /*
908: * Wipe out everything which is before the first '<'
909: */
910: SKIP_BLANKS(cur);
911:
912: /*
913: * Check for the XMLDecl in the Prolog.
914: */
915: if ((cur[0] == '<') && (cur[1] == '?') &&
916: (cur[2] == 'X') && (cur[3] == 'M') &&
917: (cur[4] == 'L')) {
918: ret = xmlParseXMLDecl(&cur);
919: /* SKIP_EOL(cur); */
920: SKIP_BLANKS(cur);
921: } else {
922: ret = xmlNewDoc(XML_DEFAULT_VERSION);
923: }
924:
925: /*
926: * The Misc part of the Prolog
927: * (Comment | PI | S) *
928: */
929: xmlParseMisc(&cur, ret);
930:
931: /*
1.2 veillard 932: * Time to start parsing
1.1 veillard 933: */
1.3 veillard 934: ret->root = xmlParseElement(&cur, ret);
1.1 veillard 935:
936: return(ret);
937: }
938:
1.9 httpng 939: /*
940: * xmlParseFile : parse an XML file and build a tree.
941: */
942:
943: xmlDocPtr xmlParseFile(const char *filename) {
944: xmlDocPtr ret;
945: int input;
946: int res;
947: struct stat buf;
948: char *buffer;
949:
1.11 veillard 950: res = stat(filename, &buf);
1.9 httpng 951: if (res < 0) return(NULL);
952:
953: buffer = malloc(buf.st_size + 100);
954: if (buffer == NULL) {
955: perror("malloc");
956: return(NULL);
957: }
958:
959: memset(buffer, 0, sizeof(buffer));
960: input = open (filename, O_RDONLY);
961: if (input < 0) {
962: fprintf (stderr, "Cannot read file %s :\n", filename);
963: perror ("open failed");
964: return(NULL);
965: }
966: res = read(input, buffer, buf.st_size);
967: if (res < 0) {
968: fprintf (stderr, "Cannot read file %s :\n", filename);
969: perror ("read failed");
970: return(NULL);
971: }
972: close(input);
973:
974: buffer[buf.st_size] = '\0';
975: ret = xmlParseDoc(buffer);
976: free(buffer);
977: return(ret);
978: }
Webmaster