Annotation of XML/parser.c, revision 1.9
1.1 veillard 1: /*
1.3 veillard 2: * parser.c : an XML 1.0 non-verifying parser
1.1 veillard 3: */
4:
1.9 ! httpng 5: #include <config.h>
1.1 veillard 6: #include <stdio.h>
7: #include <ctype.h>
8: #include <string.h>
9: #include <malloc.h>
1.9 ! httpng 10: #include <sys/stat.h>
! 11: #ifdef HAVE_FCNTL_H
! 12: #include <fcntl.h>
! 13: #endif
1.1 veillard 14:
15: #include "parser.h"
16: #include "tree.h"
17:
18: /*
19: * A few macros needed to help building the parser.
20: */
21:
#ifdef UNICODE
/*
 * UNICODE version of the character-class macros. Still incomplete:
 * IS_BASECHAR, IS_DIGIT and IS_COMBINING only cover the Latin-1 range.
 *
 * SKIP_BLANKS is intentionally not defined here: a single definition,
 * written in terms of IS_BLANK, follows the #endif.
 */
#define IS_CHAR(c)							\
    (((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d) ||		\
     (((c) >= 0x20) && ((c) != 0xFFFE) && ((c) != 0xFFFF)))

/* Latin-1 subset only: A-Z, a-z, 0xAA, 0xB5, 0xBA, 0xC0-0xD6, ... */
#define IS_BASECHAR(c)							\
    ((((c) >= 0x41) && ((c) <= 0x5a)) ||				\
     (((c) >= 0x61) && ((c) <= 0x7a)) ||				\
     ((c) == 0xaa) || ((c) == 0xb5) || ((c) == 0xba) ||		\
     (((c) >= 0xc0) && ((c) <= 0xd6)) ||				\
     (((c) >= 0xd8) && ((c) <= 0xf6)) ||				\
     (((c) >= 0xf8) && ((c) <= 0xff)))

/* ASCII digits only. */
#define IS_DIGIT(c) (((c) >= 0x30) && ((c) <= 0x39))

/* Not implemented: no character is reported as combining. */
#define IS_COMBINING(c) 0

#define IS_IGNORABLE(c)							\
    ((((c) >= 0x200c) && ((c) <= 0x200f)) ||				\
     (((c) >= 0x202a) && ((c) <= 0x202e)) ||				\
     (((c) >= 0x206a) && ((c) <= 0x206f)) ||				\
     ((c) == 0xfeff))

#define IS_EXTENDER(c)							\
    (((c) == 0xb7) || ((c) == 0x2d0) || ((c) == 0x2d1) ||		\
     ((c) == 0x387) || ((c) == 0x640) || ((c) == 0xe46) ||		\
     ((c) == 0xec6) || ((c) == 0x3005) ||				\
     (((c) >= 0x3031) && ((c) <= 0x3035)) ||				\
     (((c) >= 0x309b) && ((c) <= 0x309e)) ||				\
     (((c) >= 0x30fc) && ((c) <= 0x30fe)) ||				\
     (((c) >= 0xff70) && ((c) <= 0xff9e)) ||				\
     ((c) == 0xff9f))

#define IS_IDEOGRAPHIC(c)						\
    ((((c) >= 0x4e00) && ((c) <= 0x9fa5)) ||				\
     (((c) >= 0xf900) && ((c) <= 0xfa2d)) ||				\
     (((c) >= 0x3021) && ((c) <= 0x3029)) ||				\
     ((c) == 0x3007))

#define IS_LETTER(c) (IS_BASECHAR(c) || IS_IDEOGRAPHIC(c))

/* White space: space, tab, line feed and carriage return. */
#define IS_BLANK(c)							\
    (((c) == 0x20) || ((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d))
#else
/*
 * 8bits / ASCII version of the character-class macros.
 */
#define IS_CHAR(c)							\
    (((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d) || ((c) >= 0x20))

/* A-Z, a-z, 0xAA, 0xB5, 0xBA and the Latin-1 accented letters. */
#define IS_BASECHAR(c)							\
    ((((c) >= 0x41) && ((c) <= 0x5a)) ||				\
     (((c) >= 0x61) && ((c) <= 0x7a)) ||				\
     ((c) == 0xaa) || ((c) == 0xb5) || ((c) == 0xba) ||		\
     (((c) >= 0xc0) && ((c) <= 0xd6)) ||				\
     (((c) >= 0xd8) && ((c) <= 0xf6)) ||				\
     (((c) >= 0xf8) && ((c) <= 0xff)))

#define IS_DIGIT(c) (((c) >= 0x30) && ((c) <= 0x39))

#define IS_LETTER(c) IS_BASECHAR(c)

#define IS_COMBINING(c) 0

#define IS_IGNORABLE(c) 0

#define IS_EXTENDER(c) ((c) == 0xb7)

/* White space: space, tab, line feed and carriage return. */
#define IS_BLANK(c)							\
    (((c) == 0x20) || ((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d))
#endif
104:
105:
/*
 * Pointer-advancing helpers. Each one expands to a single statement and
 * must be followed by a semicolon at the point of use.
 */

/* Skip one end of line: CR, LF, CR LF or LF CR. */
#define SKIP_EOL(p)							\
    do {								\
	if (*(p) == 0x0d) {						\
	    (p)++;							\
	    if (*(p) == 0x0a) (p)++;					\
	} else if (*(p) == 0x0a) {					\
	    (p)++;							\
	    if (*(p) == 0x0d) (p)++;					\
	}								\
    } while (0)

/* Drop any earlier (UNICODE-specific) definition before redefining. */
#undef SKIP_BLANKS
/* Advance p past every CHAR accepted by IS_BLANK. */
#define SKIP_BLANKS(p)							\
    do { while (IS_BLANK(*(p))) (p)++; } while (0)

/* Advance p to the next '>' or to the first CHAR rejected by IS_CHAR. */
#define MOVETO_ENDTAG(p)						\
    do { while (IS_CHAR(*(p)) && (*(p) != '>')) (p)++; } while (0)

/* Advance p to the next '<' or to the first CHAR rejected by IS_CHAR. */
#define MOVETO_STARTTAG(p)						\
    do { while (IS_CHAR(*(p)) && (*(p) != '<')) (p)++; } while (0)
118:
119: /*
1.3 veillard 120: * Forward declaration, needed for the recursive behaviour.
121: */
122: xmlNodePtr xmlParseElement(CHAR **p, xmlDocPtr doc);
123:
124: /*
125: * xmlHandleData : this routine represent's the specific application
126: * behaviour when reading a piece of text.
127: *
128: * For example in WebDav, any piece made only of blanks is eliminated
129: */
130:
131: CHAR *xmlHandleData(CHAR *in) {
132: CHAR *cur;
133:
134: if (in == NULL) return(NULL);
135: cur = in;
136: while (IS_CHAR(*cur)) {
137: if (!IS_BLANK(*cur)) goto not_blank;
138: cur++;
139: }
140: free(in);
141: return(NULL);
142:
143: not_blank:
144: return(in);
145: }
146:
147: /*
1.1 veillard 148: * xmlStrndup : a strdup for array of CHAR's
149: */
150:
1.6 httpng 151: CHAR *xmlStrndup(const CHAR *cur, int len) {
1.1 veillard 152: CHAR *ret = malloc((len + 1) * sizeof(CHAR));
153:
154: if (ret == NULL) {
155: fprintf(stderr, "malloc of %d byte failed\n",
156: (len + 1) * sizeof(CHAR));
157: return(NULL);
158: }
159: memcpy(ret, cur, len * sizeof(CHAR));
160: ret[len] = 0;
161: return(ret);
162: }
163:
164: /*
165: * xmlStrdup : a strdup for CHAR's
166: */
167:
1.6 httpng 168: CHAR *xmlStrdup(const CHAR *cur) {
169: const CHAR *p = cur;
1.1 veillard 170:
171: while (IS_CHAR(*p)) p++;
172: return(xmlStrndup(cur, p - cur));
173: }
174:
175: /*
176: * xmlParseName : parse an XML name.
177: */
178:
1.3 veillard 179: CHAR *xmlParseName(CHAR **p) {
180: CHAR *cur = *p, *q, *ret = NULL;
1.1 veillard 181:
182: /*
1.3 veillard 183: * Name ::= (Letter | '_') (NameChar)*
1.1 veillard 184: */
1.3 veillard 185: if (!IS_LETTER(*cur) && (*cur != '_')) return(NULL);
186: q = cur++;
187: while ((IS_LETTER(*cur)) || (IS_DIGIT(*cur)) ||
188: (*cur == '.') || (*cur == '-') || (*cur == '_') ||
189: (IS_COMBINING(*cur)) || (IS_IGNORABLE(*cur)) ||
190: (IS_EXTENDER(*cur)))
191: cur++;
192:
193: ret = xmlStrndup(q, cur - q);
1.1 veillard 194:
1.3 veillard 195: *p = cur;
196: return(ret);
1.1 veillard 197: }
198:
199: /*
200: * Parse and return a string between quotes or doublequotes
201: */
202: CHAR *xmlParseQuotedString(CHAR **p) {
203: CHAR *ret = NULL;
204: CHAR *cur = *p, *q;
205:
206: if (*cur == '"') {
207: cur++;
208: q = cur;
209: while (IS_CHAR(*cur) && (*cur != '"')) cur++;
210: if (*cur != '"')
1.7 veillard 211: fprintf(stderr, "String not closed \"%.50s\n", q);
1.1 veillard 212: else {
213: ret = xmlStrndup(q, cur - q);
214: cur++;
215: }
216: } else if (*cur == '\''){
217: cur++;
218: q = cur;
219: while (IS_CHAR(*cur) && (*cur != '\'')) cur++;
220: if (*cur != '\'')
1.7 veillard 221: fprintf(stderr, "String not closed '%.50s\n", q);
1.1 veillard 222: else {
223: ret = xmlStrndup(q, cur - q);
224: cur++;
225: }
226: }
227: *p = cur;
228: return(ret);
229: }
230:
231: /*
1.3 veillard 232: * Skip an XML (SGML) comment <!-- .... -->
233: */
234: void xmlParserSkipComment(CHAR **p) {
235: CHAR *cur = *p, *q, *r, *start;
236:
237: /*
238: * An extra check may avoid errors and isn't that costly !
239: */
240: if ((cur[0] != '<') || (cur[1] != '!') ||
241: (cur[2] != '-') || (cur[3] != '-')) return;
242:
243: cur += 4;
244: start = q = cur;
245: cur++;
246: r = cur;
247: cur++;
248: while (IS_CHAR(*cur) &&
249: ((*cur != '>') || (*r != '-') || (*q != '-'))) {
250: cur++;r++;q++;
251: }
252: if (!IS_CHAR(*cur)) {
1.7 veillard 253: fprintf(stderr, "Comment not terminated <!--%.50s\n", start);
1.3 veillard 254: *p = start;
255: } else {
256: cur++;
257: *p = cur;
258: }
259: }
260:
261: /*
1.1 veillard 262: * xmlParseWebdavNamespace: parse Webdav specific '<?namespace ...' constructs.
263: */
264:
265: void xmlParseWebdavNamespace(CHAR **p, xmlDocPtr doc) {
266: CHAR *cur = *p;
267: CHAR *href = NULL;
268: CHAR *AS = NULL;
1.3 veillard 269: int garbage = 0;
1.1 veillard 270:
271: /*
272: * We know that 'namespace' is here.
273: */
274: cur += 9;
275: SKIP_BLANKS(cur);
276:
277: while (IS_CHAR(*cur) && (*cur != '>')) {
278: /*
279: * We can have 'href' or 'AS' attributes.
280: */
281: if ((cur[0] == 'h') && (cur[1] == 'r') && (cur[2] == 'e') &&
282: (cur[3] == 'f')) {
1.3 veillard 283: garbage = 0;
1.1 veillard 284: cur += 4;
285: SKIP_BLANKS(cur);
286:
287: if (*cur != '=') continue;
288: cur++;
289: SKIP_BLANKS(cur);
290:
291: href = xmlParseQuotedString(&cur);
292: SKIP_BLANKS(cur);
293: } else if ((cur[0] == 'A') && (cur[1] == 'S')) {
1.3 veillard 294: garbage = 0;
1.1 veillard 295: cur += 2;
296: SKIP_BLANKS(cur);
297:
298: if (*cur != '=') continue;
299: cur++;
300: SKIP_BLANKS(cur);
301:
302: AS = xmlParseQuotedString(&cur);
303: SKIP_BLANKS(cur);
304: } else if ((cur[0] == '?') && (cur[1] == '>')) {
1.3 veillard 305: garbage = 0;
1.1 veillard 306: cur ++;
307: } else {
1.3 veillard 308: /*
309: * Found garbage when parsing the namespace
310: */
311: if (!garbage) fprintf(stderr,
1.4 veillard 312: "\nxmlParseWebdavNamespace found garbage: ");
1.3 veillard 313: fprintf(stderr, "%c", *cur);
1.1 veillard 314: cur++;
315: }
316: }
317:
318: MOVETO_ENDTAG(cur);
319: cur++;
320:
321: /*
322: * Register the DTD.
323: */
324: if (href != NULL)
325: xmlNewDtd(doc, href, AS);
326:
1.8 veillard 327: if (AS != NULL) free(AS);
328: if (href != NULL) free(href);
329:
1.1 veillard 330: *p = cur;
331: }
332:
333: /*
1.3 veillard 334: * xmlParsePI: parse an XML Processing Instruction.
335: */
336:
337: void xmlParsePI(CHAR **p, xmlDocPtr doc) {
338: CHAR *cur = *p;
339:
340: if ((cur[0] == '<') && (cur[1] == '?')) {
341: /*
342: * this is a Processing Instruction.
343: */
344: cur += 2;
345:
346: /*
347: * Special for WebDav, support for the Processing Instruction
348: * '<?namespace ...' contruct in the header of the XML document.
349: */
350: if ((cur[0] == 'n') && (cur[1] == 'a') &&
351: (cur[2] == 'm') && (cur[3] == 'e') &&
352: (cur[4] == 's') && (cur[5] == 'p') &&
353: (cur[6] == 'a') && (cur[7] == 'c') &&
354: (cur[8] == 'e')) {
355: xmlParseWebdavNamespace(&cur, doc);
356: } else {
357: /* Unknown PI, ignore it ! */
358: MOVETO_ENDTAG(cur);
359: cur++;
360: }
361: }
362: *p = cur;
363: }
364:
365: /*
366: * xmlParseAttribute: parse a start of tag.
367: *
368: * Attribute ::= Name Eq AttValue
369: */
370:
371: void xmlParseAttribute(CHAR **p, xmlNodePtr node) {
372: CHAR *cur = *p, *q, *name, *value = NULL;
373:
374: if (!IS_LETTER(*cur) && (*cur != '_')) {
375: return;
376: }
377: q = cur++;
378: while ((IS_LETTER(*cur)) || (IS_DIGIT(*cur)) ||
379: (*cur == '.') || (*cur == '-') || (*cur == '_') ||
380: (IS_COMBINING(*cur)) || (IS_IGNORABLE(*cur)) ||
381: (IS_EXTENDER(*cur)))
382: cur++;
383: name = xmlStrndup(q, cur - q);
384:
385: /*
386: * We should have the equal, we are laxist here and allow attributes
387: * without values and extra spaces.
388: */
389: SKIP_BLANKS(cur);
390: if (*cur == '=') {
391: cur++;
392: SKIP_BLANKS(cur);
393: if ((*cur != '\'') && (*cur != '"')) {
1.7 veillard 394: fprintf(stderr, "Quotes were expected for attribute value %.20s\n",
1.3 veillard 395: q);
396: } else
397: value = xmlParseQuotedString(&cur);
398: }
399:
400: /*
401: * Add the attribute to the node.
402: */
403: if (name != NULL)
404: xmlNewProp(node, name, value);
405:
406: *p = cur;
407: }
408:
409: /*
1.2 veillard 410: * xmlParseStartTag: parse a start of tag.
411: */
412:
1.3 veillard 413: xmlNodePtr xmlParseStartTag(CHAR **p, xmlDocPtr doc) {
414: CHAR *cur = *p, *q, *ns, *name;
415: xmlDtdPtr dtd = NULL;
1.2 veillard 416: xmlNodePtr ret = NULL;
417:
418: /*
1.3 veillard 419: * Theorically one should just parse a Name, but with the addition
420: * of the namespace needed for WebDav, it's a bit more complicated
421: * since the element name may be prefixed by a namespace prefix.
422: *
423: * QName ::= (NSPart ':')? LocalPart
424: * NSPart ::= Name
425: * LocalPart ::= Name
426: * STag ::= '<' QName (S Attribute)* S? '>'
427: *
428: * instead of :
429: *
430: * STag ::= '<' QName (S Attribute)* S? '>'
1.2 veillard 431: */
1.3 veillard 432: if (*cur != '<') return(NULL);
433: cur++;
434:
435: if (!IS_LETTER(*cur) && (*cur != '_')) return(NULL);
436: q = cur++;
437: while ((IS_LETTER(*cur)) || (IS_DIGIT(*cur)) ||
438: (*cur == '.') || (*cur == '-') || (*cur == '_') ||
439: (IS_COMBINING(*cur)) || (IS_IGNORABLE(*cur)) ||
440: (IS_EXTENDER(*cur)))
441: cur++;
442:
443: if (*cur == ':') {
444: ns = xmlStrndup(q, cur - q);
445:
446: cur++; /* skip the column */
447: if (!IS_LETTER(*cur) && (*cur != '_')) {
1.7 veillard 448: fprintf(stderr,
449: "Start tag : no element name after namespace identifier %.20s\n",
1.3 veillard 450: q);
451: free(ns);
452: *p = cur;
453: return(NULL);
454: }
455: q = cur++;
456: while ((IS_LETTER(*cur)) || (IS_DIGIT(*cur)) ||
457: (*cur == '.') || (*cur == '-') || (*cur == '_') ||
458: (IS_COMBINING(*cur)) || (IS_IGNORABLE(*cur)) ||
459: (IS_EXTENDER(*cur)))
460: cur++;
461: name = xmlStrndup(q, cur - q);
462:
463: /*
464: * Search the DTD associated to ns.
465: */
466: dtd = xmlSearchDtd(doc, ns);
467: if (dtd == NULL)
1.7 veillard 468: fprintf(stderr, "Start tag : Couldn't find namespace %s\n", ns);
1.3 veillard 469: free(ns);
470: } else
471: name = xmlStrndup(q, cur - q);
472:
473: ret = xmlNewNode(dtd, name, NULL);
1.2 veillard 474:
1.3 veillard 475: /*
476: * Now parse the attributes, it ends up with the ending
477: *
478: * (S Attribute)* S?
479: */
480: SKIP_BLANKS(cur);
481: while ((IS_CHAR(*cur)) &&
482: (*cur != '>') &&
483: ((cur[0] != '/') || (cur[1] != '>'))) {
484: if (IS_LETTER(*cur) || (*cur == '_'))
485: xmlParseAttribute(&cur, ret);
486: else {
487: /* We should warn !!! */
488: cur++;
489: }
490: SKIP_BLANKS(cur);
491: }
492:
493: *p = cur;
494: return(ret);
495: }
496:
497: /*
1.7 veillard 498: * xmlParseEndTag: parse an end of tag, note that the '</' part has
499: * already been read.
500: */
501:
502: void xmlParseEndTag(CHAR **p, xmlDocPtr doc, xmlDtdPtr *dtdPtr, CHAR **tagPtr) {
503: CHAR *cur = *p, *q, *ns, *name;
504: xmlDtdPtr dtd = NULL;
505:
506: *dtdPtr = NULL;
507: *tagPtr = NULL;
508:
509: /*
510: * Theorically one should just parse a Name, but with the addition
511: * of the namespace needed for WebDav, it's a bit more complicated
512: * since the element name may be prefixed by a namespace prefix.
513: *
514: * QName ::= (NSPart ':')? LocalPart
515: * NSPart ::= Name
516: * LocalPart ::= Name
517: * ETag ::= '</' QName S? '>'
518: *
519: * instead of :
520: *
521: * ETag ::= '</' Name S? '>'
522: */
523: if (!IS_LETTER(*cur) && (*cur != '_')) return;
524: q = cur++;
525: while ((IS_LETTER(*cur)) || (IS_DIGIT(*cur)) ||
526: (*cur == '.') || (*cur == '-') || (*cur == '_') ||
527: (IS_COMBINING(*cur)) || (IS_IGNORABLE(*cur)) ||
528: (IS_EXTENDER(*cur)))
529: cur++;
530:
531: if (*cur == ':') {
532: ns = xmlStrndup(q, cur - q);
533:
534: cur++; /* skip the column */
535: if (!IS_LETTER(*cur) && (*cur != '_')) {
536: fprintf(stderr,
537: "End tag : no element name after namespace identifier %.20s\n",
538: q);
539: free(ns);
540: *p = cur;
541: return;
542: }
543: q = cur++;
544: while ((IS_LETTER(*cur)) || (IS_DIGIT(*cur)) ||
545: (*cur == '.') || (*cur == '-') || (*cur == '_') ||
546: (IS_COMBINING(*cur)) || (IS_IGNORABLE(*cur)) ||
547: (IS_EXTENDER(*cur)))
548: cur++;
549: name = xmlStrndup(q, cur - q);
550:
551: /*
552: * Search the DTD associated to ns.
553: */
554: dtd = xmlSearchDtd(doc, ns);
555: if (dtd == NULL)
556: fprintf(stderr, "End tag : Couldn't find namespace %s\n", ns);
557: free(ns);
558: } else
559: name = xmlStrndup(q, cur - q);
560:
561: *dtdPtr = dtd;
562: *tagPtr = name;
563:
564: /*
565: * We should definitely be at the ending "S? '>'" part
566: */
567: SKIP_BLANKS(cur);
568: if ((!IS_CHAR(*cur)) || (*cur != '>')) {
569: fprintf(stderr, "End tag : expected '>', got %.20s\n", cur);
570: /*
571: * Note : skipping to the next '>' is probably otherkill,
572: * especially in case the '>' is hust missing.
573: *
574: * Otherwise add:
575: * MOVETO_ENDTAG(cur);
576: */
577: } else
578: cur++;
579:
580: *p = cur;
581: return;
582: }
583:
584: /*
1.3 veillard 585: * xmlParseCDSect: escaped pure raw content.
586: */
587: CHAR *xmlParseCDSect(CHAR **p) {
588: CHAR *cur = *p, *r, *s, *base, *ret;
589:
590: base = cur;
591: if (!IS_CHAR(*cur)) {
1.7 veillard 592: fprintf(stderr, "CData section not finished : %.20s\n", base);
1.3 veillard 593: return(NULL);
594: }
595: r = cur++;
596: if (!IS_CHAR(*cur)) {
1.7 veillard 597: fprintf(stderr, "CData section not finished : %.20s\n", base);
1.3 veillard 598: return(NULL);
599: }
600: s = cur++;
601: while (IS_CHAR(*cur) &&
602: ((*r != ']') || (*s != ']') || (*cur != '>'))) {
603: r++;s++;cur++;
604: }
605: if (!IS_CHAR(*cur)) {
1.7 veillard 606: fprintf(stderr, "CData section not finished : %.20s\n", base);
1.3 veillard 607: return(NULL);
608: }
609: ret = xmlStrndup(base, cur-base);
1.2 veillard 610: *p = cur;
611: return(ret);
612: }
613:
614: /*
615: * xmlParseContent: a content is
616: * (element | PCData | Reference | CDSect | PI | Comment)
617: *
618: * element : starts by '<'
619: * PCData : any CHAR but '&' or '<'
620: * Reference : starts by '&'
621: * CDSect : starts by '<![CDATA['
622: * PI : starts by '<?'
623: */
624:
1.3 veillard 625: xmlNodePtr xmlParseContent(CHAR **p, xmlDocPtr doc, xmlNodePtr node) {
626: CHAR *cur = *p, *q, *data = NULL;
1.2 veillard 627: xmlNodePtr ret = NULL;
628:
629: /*
1.3 veillard 630: * First case : a Processing Instruction.
631: */
632: if ((cur[0] == '<') && (cur[1] == '?')) {
633: xmlParsePI(&cur, doc);
634: }
635: /*
636: * Second case : a CDSection
1.2 veillard 637: */
1.3 veillard 638: if ((cur[0] == '<') && (cur[1] == '!') && (cur[2] == '[') &&
639: (cur[3] == 'C') && (cur[4] == 'D') && (cur[5] == 'A') &&
640: (cur[6] == 'T') && (cur[7] == 'A') && (cur[8] == '[')) {
641: cur += 9;
642: data = xmlParseCDSect(&cur);
643: }
644: /*
645: * Third case : a sub-element.
646: */
647: else if (cur[0] == '<') {
648: ret = xmlParseElement(&cur, doc);
649: }
650: /*
651: * Last case, text. Note that References are handled directly.
652: */
653: else {
654: q = cur;
655: while (IS_CHAR(*cur) && (*cur != '<')) cur++;
656:
657: if (!IS_CHAR(*cur)) {
1.7 veillard 658: fprintf(stderr, "Truncated content : %.50s\n", q);
1.4 veillard 659: *p = cur;
1.3 veillard 660: return(NULL);
661: }
662: data = xmlStrndup(q, cur - q);
663: /* Should apply the &...; reduction !!!! */
664: }
665:
666: /*
667: * Handle the data if any. If there is no child
668: * add it as content, otherwise create a new node of type text.
669: */
670: if (data != NULL)
671: data = xmlHandleData(data);
672: if (data != NULL) {
673: if (node->childs == NULL)
674: xmlNodeSetContent(node, data);
675: else {
676: ret = xmlNewText(data);
677: }
678: }
1.2 veillard 679:
680: *p = cur;
681: return(ret);
682: }
683:
684: /*
685: * xmlParseElement: parse an XML element
686: */
687:
1.3 veillard 688: xmlNodePtr xmlParseElement(CHAR **p, xmlDocPtr doc) {
1.2 veillard 689: CHAR *cur = *p;
690: xmlNodePtr ret, child;
1.7 veillard 691: CHAR *openTag = *p;
692: CHAR *closeTag = *p;
1.2 veillard 693:
1.3 veillard 694: ret = xmlParseStartTag(&cur, doc);
695: if (ret == NULL) {
696: *p = cur;
697: return(NULL);
698: }
1.2 veillard 699:
700: /*
701: * Check for an Empty Element.
702: */
703: if ((cur[0] == '/') && (cur[1] == '>')) {
704: cur += 2;
705: *p = cur;
706: return(ret);
707: }
708: if (cur[0] == '>') cur++;
709: else {
1.7 veillard 710: fprintf(stderr, "Couldn't find end of Start Tag %.30s\n", *p);
1.2 veillard 711: *p = cur;
712: return(ret);
713: }
714:
715: /*
716: * Parse the content of the element:
717: * (element | PCData | Reference | CDSect | PI | Comment) *
718: *
719: * element : starts by '<'
720: * PCData : any CHAR but '&' or '<'
721: * Reference : starts by '&'
722: * CDSect : starts by '<![CDATA['
723: * PI : starts by '<?'
724: *
725: * The loop stops upon detection of an end of tag '</'
726: */
727: while ((IS_CHAR(cur[0])) && ((cur[0] != '<') || (cur[1] != '/'))) {
1.3 veillard 728: child = xmlParseContent(&cur, doc, ret);
1.2 veillard 729: if (child != NULL)
730: xmlAddChild(ret, child);
731: }
732: if (!IS_CHAR(cur[0])) {
1.7 veillard 733: fprintf(stderr, "Premature end of data in tag %.30s\n", *p);
1.2 veillard 734: *p = cur;
735: return(ret);
736: }
737:
738: /*
739: * parse the end of tag : '</' has been detected.
740: */
741: cur += 2;
742: if (*cur == '>') cur++; /* simplified closing </> */
743: else {
1.7 veillard 744: CHAR *endTag;
745: xmlDtdPtr endDtd;
746:
747: xmlParseEndTag(&cur, doc, &endDtd, &endTag);
748:
1.2 veillard 749: /*
1.7 veillard 750: * Check that the Name in the ETag is the same as in the STag.
1.2 veillard 751: */
1.7 veillard 752: if (endDtd != ret->dtd) {
753: fprintf(stderr, "Start and End tags don't use the same DTD:\n");
754: fprintf(stderr, "\t%.30s\n\t%.30s\n", openTag, closeTag);
755: }
756: if (strcmp(ret->name, endTag)) {
757: fprintf(stderr, "Start and End tags don't use the same name:\n");
758: fprintf(stderr, "\t%.30s\n\t%.30s\n", openTag, closeTag);
759: }
1.2 veillard 760: }
761:
762: *p = cur;
763: return(ret);
764: }
765:
766: /*
1.1 veillard 767: * xmlParseXMLDecl: parse an XML declaration header
768: */
769:
770: xmlDocPtr xmlParseXMLDecl(CHAR **p) {
771: CHAR *cur = *p;
772: CHAR *version;
773: xmlDocPtr ret;
774:
775: /*
776: * We know that '<?XML' is here.
777: */
778: cur += 5;
779:
780: /*
781: * Parse the version info
782: */
783: SKIP_BLANKS(cur);
784:
785: /*
786: * We should have 'version=' here !
787: */
788: if ((cur[0] == 'v') && (cur[1] == 'e') && (cur[2] == 'r') &&
789: (cur[3] == 's') && (cur[4] == 'i') && (cur[5] == 'o') &&
790: (cur[6] == 'n') && (cur[7] == '=')) {
791: cur += 8;
792: version = xmlParseQuotedString(&cur);
793: if (version == NULL)
794: ret = xmlNewDoc(XML_DEFAULT_VERSION);
795: else {
796: ret = xmlNewDoc(version);
1.8 veillard 797: free(version);
1.1 veillard 798: }
799: } else {
800: ret = xmlNewDoc(XML_DEFAULT_VERSION);
801: }
802:
803: /*
804: * We should check for encoding !!!!
805: */
806:
807: /*
808: * We should check for Required Markup Declaration !!!!
809: */
810: MOVETO_ENDTAG(cur);
811: cur++;
812:
813: *p = cur;
814: return(ret);
815: }
816:
817: /*
818: * xmlParseMisc: parse an XML Misc optionnal field.
819: * (Comment | PI | S)*
820: */
821:
1.3 veillard 822: void xmlParseMisc(CHAR **p, xmlDocPtr doc) {
1.1 veillard 823: CHAR *cur = *p;
824:
825: while (((cur[0] == '<') && (cur[1] == '?')) ||
826: ((cur[0] == '<') && (cur[1] == '!') &&
827: (cur[2] == '-') && (cur[2] == '-')) ||
828: IS_BLANK(*cur)) {
829: if ((cur[0] == '<') && (cur[1] == '?')) {
1.3 veillard 830: xmlParsePI(&cur, doc);
1.1 veillard 831: } else if (IS_BLANK(*cur)) {
832: cur++;
833: } else
834: xmlParserSkipComment(&cur);
835: }
836:
837: *p = cur;
838: }
839:
840: /*
841: * xmlParseDoc : parse an XML document and build a tree.
842: */
843:
844: xmlDocPtr xmlParseDoc(CHAR *cur) {
845: xmlDocPtr ret;
846:
847: /*
848: * Wipe out everything which is before the first '<'
849: */
850: SKIP_BLANKS(cur);
851:
852: /*
853: * Check for the XMLDecl in the Prolog.
854: */
855: if ((cur[0] == '<') && (cur[1] == '?') &&
856: (cur[2] == 'X') && (cur[3] == 'M') &&
857: (cur[4] == 'L')) {
858: ret = xmlParseXMLDecl(&cur);
859: /* SKIP_EOL(cur); */
860: SKIP_BLANKS(cur);
861: } else {
862: ret = xmlNewDoc(XML_DEFAULT_VERSION);
863: }
864:
865: /*
866: * The Misc part of the Prolog
867: * (Comment | PI | S) *
868: */
869: xmlParseMisc(&cur, ret);
870:
871: /*
1.2 veillard 872: * Time to start parsing
1.1 veillard 873: */
1.3 veillard 874: ret->root = xmlParseElement(&cur, ret);
1.1 veillard 875:
876: return(ret);
877: }
878:
1.9 ! httpng 879: /*
! 880: * xmlParseFile : parse an XML file and build a tree.
! 881: */
! 882:
! 883: xmlDocPtr xmlParseFile(const char *filename) {
! 884: xmlDocPtr ret;
! 885: int input;
! 886: int res;
! 887: struct stat buf;
! 888: char *buffer;
! 889:
! 890: res = stat(buffer, &buf);
! 891: if (res < 0) return(NULL);
! 892:
! 893: buffer = malloc(buf.st_size + 100);
! 894: if (buffer == NULL) {
! 895: perror("malloc");
! 896: return(NULL);
! 897: }
! 898:
! 899: memset(buffer, 0, sizeof(buffer));
! 900: input = open (filename, O_RDONLY);
! 901: if (input < 0) {
! 902: fprintf (stderr, "Cannot read file %s :\n", filename);
! 903: perror ("open failed");
! 904: return(NULL);
! 905: }
! 906: res = read(input, buffer, buf.st_size);
! 907: if (res < 0) {
! 908: fprintf (stderr, "Cannot read file %s :\n", filename);
! 909: perror ("read failed");
! 910: return(NULL);
! 911: }
! 912: close(input);
! 913:
! 914: buffer[buf.st_size] = '\0';
! 915: ret = xmlParseDoc(buffer);
! 916: free(buffer);
! 917: return(ret);
! 918: }
Webmaster