Annotation of XML/parser.c, revision 1.10
1.1 veillard 1: /*
1.3 veillard 2: * parser.c : an XML 1.0 non-verifying parser
1.1 veillard 3: */
4:
1.9 httpng 5: #include <config.h>
1.1 veillard 6: #include <stdio.h>
7: #include <ctype.h>
8: #include <string.h>
9: #include <malloc.h>
1.9 httpng 10: #include <sys/stat.h>
11: #ifdef HAVE_FCNTL_H
12: #include <fcntl.h>
13: #endif
1.10 ! httpng 14: #ifdef HAVE_UNISTD_H
! 15: #include <unistd.h>
! 16: #endif
1.1 veillard 17:
18: #include "parser.h"
19: #include "tree.h"
20:
/*
 * A few macros needed to help building the parser.
 *
 * All of them evaluate their argument several times: never pass an
 * expression with side effects (e.g. *cur++).
 * The statement-like macros (SKIP_*, MOVETO_*) expand to a single
 * statement and must be followed by a ';'.
 */

#ifdef UNICODE
/*
 * UNICODE version of the macros. Incomplete now !!!!
 */
#define IS_CHAR(c)                                                      \
    (((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d) ||                 \
     (((c) >= 0x20) && ((c) != 0xFFFE) && ((c) != 0xFFFF)))

/* Only the Latin-1 part of the table is present. */
#define IS_BASECHAR(c)                                                  \
    ((((c) >= 0x41) && ((c) <= 0x5a)) ||                                \
     (((c) >= 0x61) && ((c) <= 0x7a)) ||                                \
     ((c) == 0xaa) ||                                                   \
     (((c) >= 0xc0) && ((c) <= 0xd6)) ||                                \
     (((c) >= 0xd8) && ((c) <= 0xf6)) ||                                \
     (((c) >= 0xf8) && ((c) <= 0xff)) ||                                \
     ((c) == 0xba))

/* Only the ASCII digits are recognized. */
#define IS_DIGIT(c) (((c) >= 0x30) && ((c) <= 0x39))

/* Combining characters are not supported yet. */
#define IS_COMBINING(c) 0

#define IS_IGNORABLE(c)                                                 \
    ((((c) >= 0x200c) && ((c) <= 0x200f)) ||                            \
     (((c) >= 0x202a) && ((c) <= 0x202e)) ||                            \
     (((c) >= 0x206a) && ((c) <= 0x206f)) ||                            \
     ((c) == 0xfeff))

#define IS_EXTENDER(c)                                                  \
    (((c) == 0xb7) || ((c) == 0x2d0) || ((c) == 0x2d1) ||               \
     ((c) == 0x387) || ((c) == 0x640) || ((c) == 0xe46) ||              \
     ((c) == 0xec6) || ((c) == 0x3005) ||                               \
     (((c) >= 0x3031) && ((c) <= 0x3035)) ||                            \
     (((c) >= 0x309b) && ((c) <= 0x309e)) ||                            \
     (((c) >= 0x30fc) && ((c) <= 0x30fe)) ||                            \
     (((c) >= 0xff70) && ((c) <= 0xff9e)) ||                            \
     ((c) == 0xff9f))

#define IS_IDEOGRAPHIC(c)                                               \
    ((((c) >= 0x4e00) && ((c) <= 0x9fa5)) ||                            \
     (((c) >= 0xf900) && ((c) <= 0xfa2d)) ||                            \
     (((c) >= 0x3021) && ((c) <= 0x3029)) ||                            \
     ((c) == 0x3007))

#define IS_LETTER(c) (IS_BASECHAR(c) || IS_IDEOGRAPHIC(c))

/*
 * Space, tab, line feed, carriage return, plus the ideographic space
 * (0x3000) which the former UNICODE-only SKIP_BLANKS used to accept.
 */
#define IS_BLANK(c)                                                     \
    (((c) == 0x20) || ((c) == 0x09) || ((c) == 0x0a) ||                 \
     ((c) == 0x0d) || ((c) == 0x3000))
#else
/*
 * 8bits / ASCII version of the macros.
 */
#define IS_CHAR(c)                                                      \
    (((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d) || ((c) >= 0x20))

#define IS_BASECHAR(c)                                                  \
    ((((c) >= 0x41) && ((c) <= 0x5a)) ||                                \
     (((c) >= 0x61) && ((c) <= 0x7a)) ||                                \
     ((c) == 0xaa) ||                                                   \
     (((c) >= 0xc0) && ((c) <= 0xd6)) ||                                \
     (((c) >= 0xd8) && ((c) <= 0xf6)) ||                                \
     (((c) >= 0xf8) && ((c) <= 0xff)) ||                                \
     ((c) == 0xba))

#define IS_DIGIT(c) (((c) >= 0x30) && ((c) <= 0x39))

#define IS_LETTER(c) IS_BASECHAR(c)

#define IS_COMBINING(c) 0

#define IS_IGNORABLE(c) 0

#define IS_EXTENDER(c) ((c) == 0xb7)

/* Space, tab, line feed and carriage return. */
#define IS_BLANK(c)                                                     \
    (((c) == 0x20) || ((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d))
#endif


/*
 * Skip one line terminator: CR, LF, CR LF or LF CR.
 */
#define SKIP_EOL(p)                                                     \
    do {                                                                \
        if (*(p) == 0x0d) {                                             \
            (p)++;                                                      \
            if (*(p) == 0x0a) (p)++;                                    \
        } else if (*(p) == 0x0a) {                                      \
            (p)++;                                                      \
            if (*(p) == 0x0d) (p)++;                                    \
        }                                                               \
    } while (0)

/*
 * Advance p over any run of blank characters.
 */
#define SKIP_BLANKS(p)                                                  \
    do { while (IS_BLANK(*(p))) (p)++; } while (0)

/*
 * Advance p to the next '>' or to the first non-CHAR value (which
 * includes the terminating 0), whichever comes first.
 */
#define MOVETO_ENDTAG(p)                                                \
    do { while (IS_CHAR(*(p)) && (*(p) != '>')) (p)++; } while (0)

/*
 * Advance p to the next '<' or to the first non-CHAR value (which
 * includes the terminating 0), whichever comes first.
 */
#define MOVETO_STARTTAG(p)                                              \
    do { while (IS_CHAR(*(p)) && (*(p) != '<')) (p)++; } while (0)
121:
122: /*
1.3 veillard 123: * Forward declaration for recursive behaviour.
124: */
125: xmlNodePtr xmlParseElement(CHAR **p, xmlDocPtr doc);
126:
127: /*
128: * xmlHandleData : this routine represent's the specific application
129: * behaviour when reading a piece of text.
130: *
131: * For example in WebDav, any piece made only of blanks is eliminated
132: */
133:
134: CHAR *xmlHandleData(CHAR *in) {
135: CHAR *cur;
136:
137: if (in == NULL) return(NULL);
138: cur = in;
139: while (IS_CHAR(*cur)) {
140: if (!IS_BLANK(*cur)) goto not_blank;
141: cur++;
142: }
143: free(in);
144: return(NULL);
145:
146: not_blank:
147: return(in);
148: }
149:
150: /*
1.1 veillard 151: * xmlStrndup : a strdup for array of CHAR's
152: */
153:
1.6 httpng 154: CHAR *xmlStrndup(const CHAR *cur, int len) {
1.1 veillard 155: CHAR *ret = malloc((len + 1) * sizeof(CHAR));
156:
157: if (ret == NULL) {
158: fprintf(stderr, "malloc of %d byte failed\n",
159: (len + 1) * sizeof(CHAR));
160: return(NULL);
161: }
162: memcpy(ret, cur, len * sizeof(CHAR));
163: ret[len] = 0;
164: return(ret);
165: }
166:
167: /*
168: * xmlStrdup : a strdup for CHAR's
169: */
170:
1.6 httpng 171: CHAR *xmlStrdup(const CHAR *cur) {
172: const CHAR *p = cur;
1.1 veillard 173:
174: while (IS_CHAR(*p)) p++;
175: return(xmlStrndup(cur, p - cur));
176: }
177:
178: /*
179: * xmlParseName : parse an XML name.
180: */
181:
1.3 veillard 182: CHAR *xmlParseName(CHAR **p) {
183: CHAR *cur = *p, *q, *ret = NULL;
1.1 veillard 184:
185: /*
1.3 veillard 186: * Name ::= (Letter | '_') (NameChar)*
1.1 veillard 187: */
1.3 veillard 188: if (!IS_LETTER(*cur) && (*cur != '_')) return(NULL);
189: q = cur++;
190: while ((IS_LETTER(*cur)) || (IS_DIGIT(*cur)) ||
191: (*cur == '.') || (*cur == '-') || (*cur == '_') ||
192: (IS_COMBINING(*cur)) || (IS_IGNORABLE(*cur)) ||
193: (IS_EXTENDER(*cur)))
194: cur++;
195:
196: ret = xmlStrndup(q, cur - q);
1.1 veillard 197:
1.3 veillard 198: *p = cur;
199: return(ret);
1.1 veillard 200: }
201:
202: /*
203: * Parse and return a string between quotes or doublequotes
204: */
205: CHAR *xmlParseQuotedString(CHAR **p) {
206: CHAR *ret = NULL;
207: CHAR *cur = *p, *q;
208:
209: if (*cur == '"') {
210: cur++;
211: q = cur;
212: while (IS_CHAR(*cur) && (*cur != '"')) cur++;
213: if (*cur != '"')
1.7 veillard 214: fprintf(stderr, "String not closed \"%.50s\n", q);
1.1 veillard 215: else {
216: ret = xmlStrndup(q, cur - q);
217: cur++;
218: }
219: } else if (*cur == '\''){
220: cur++;
221: q = cur;
222: while (IS_CHAR(*cur) && (*cur != '\'')) cur++;
223: if (*cur != '\'')
1.7 veillard 224: fprintf(stderr, "String not closed '%.50s\n", q);
1.1 veillard 225: else {
226: ret = xmlStrndup(q, cur - q);
227: cur++;
228: }
229: }
230: *p = cur;
231: return(ret);
232: }
233:
234: /*
1.3 veillard 235: * Skip an XML (SGML) comment <!-- .... -->
236: */
237: void xmlParserSkipComment(CHAR **p) {
238: CHAR *cur = *p, *q, *r, *start;
239:
240: /*
241: * An extra check may avoid errors and isn't that costly !
242: */
243: if ((cur[0] != '<') || (cur[1] != '!') ||
244: (cur[2] != '-') || (cur[3] != '-')) return;
245:
246: cur += 4;
247: start = q = cur;
248: cur++;
249: r = cur;
250: cur++;
251: while (IS_CHAR(*cur) &&
252: ((*cur != '>') || (*r != '-') || (*q != '-'))) {
253: cur++;r++;q++;
254: }
255: if (!IS_CHAR(*cur)) {
1.7 veillard 256: fprintf(stderr, "Comment not terminated <!--%.50s\n", start);
1.3 veillard 257: *p = start;
258: } else {
259: cur++;
260: *p = cur;
261: }
262: }
263:
264: /*
1.1 veillard 265: * xmlParseWebdavNamespace: parse Webdav specific '<?namespace ...' constructs.
266: */
267:
268: void xmlParseWebdavNamespace(CHAR **p, xmlDocPtr doc) {
269: CHAR *cur = *p;
270: CHAR *href = NULL;
271: CHAR *AS = NULL;
1.3 veillard 272: int garbage = 0;
1.1 veillard 273:
274: /*
275: * We know that 'namespace' is here.
276: */
277: cur += 9;
278: SKIP_BLANKS(cur);
279:
280: while (IS_CHAR(*cur) && (*cur != '>')) {
281: /*
282: * We can have 'href' or 'AS' attributes.
283: */
284: if ((cur[0] == 'h') && (cur[1] == 'r') && (cur[2] == 'e') &&
285: (cur[3] == 'f')) {
1.3 veillard 286: garbage = 0;
1.1 veillard 287: cur += 4;
288: SKIP_BLANKS(cur);
289:
290: if (*cur != '=') continue;
291: cur++;
292: SKIP_BLANKS(cur);
293:
294: href = xmlParseQuotedString(&cur);
295: SKIP_BLANKS(cur);
296: } else if ((cur[0] == 'A') && (cur[1] == 'S')) {
1.3 veillard 297: garbage = 0;
1.1 veillard 298: cur += 2;
299: SKIP_BLANKS(cur);
300:
301: if (*cur != '=') continue;
302: cur++;
303: SKIP_BLANKS(cur);
304:
305: AS = xmlParseQuotedString(&cur);
306: SKIP_BLANKS(cur);
307: } else if ((cur[0] == '?') && (cur[1] == '>')) {
1.3 veillard 308: garbage = 0;
1.1 veillard 309: cur ++;
310: } else {
1.3 veillard 311: /*
312: * Found garbage when parsing the namespace
313: */
314: if (!garbage) fprintf(stderr,
1.4 veillard 315: "\nxmlParseWebdavNamespace found garbage: ");
1.3 veillard 316: fprintf(stderr, "%c", *cur);
1.1 veillard 317: cur++;
318: }
319: }
320:
321: MOVETO_ENDTAG(cur);
322: cur++;
323:
324: /*
325: * Register the DTD.
326: */
327: if (href != NULL)
328: xmlNewDtd(doc, href, AS);
329:
1.8 veillard 330: if (AS != NULL) free(AS);
331: if (href != NULL) free(href);
332:
1.1 veillard 333: *p = cur;
334: }
335:
336: /*
1.3 veillard 337: * xmlParsePI: parse an XML Processing Instruction.
338: */
339:
340: void xmlParsePI(CHAR **p, xmlDocPtr doc) {
341: CHAR *cur = *p;
342:
343: if ((cur[0] == '<') && (cur[1] == '?')) {
344: /*
345: * this is a Processing Instruction.
346: */
347: cur += 2;
348:
349: /*
350: * Special for WebDav, support for the Processing Instruction
351: * '<?namespace ...' contruct in the header of the XML document.
352: */
353: if ((cur[0] == 'n') && (cur[1] == 'a') &&
354: (cur[2] == 'm') && (cur[3] == 'e') &&
355: (cur[4] == 's') && (cur[5] == 'p') &&
356: (cur[6] == 'a') && (cur[7] == 'c') &&
357: (cur[8] == 'e')) {
358: xmlParseWebdavNamespace(&cur, doc);
359: } else {
360: /* Unknown PI, ignore it ! */
361: MOVETO_ENDTAG(cur);
362: cur++;
363: }
364: }
365: *p = cur;
366: }
367:
368: /*
369: * xmlParseAttribute: parse a start of tag.
370: *
371: * Attribute ::= Name Eq AttValue
372: */
373:
374: void xmlParseAttribute(CHAR **p, xmlNodePtr node) {
375: CHAR *cur = *p, *q, *name, *value = NULL;
376:
377: if (!IS_LETTER(*cur) && (*cur != '_')) {
378: return;
379: }
380: q = cur++;
381: while ((IS_LETTER(*cur)) || (IS_DIGIT(*cur)) ||
382: (*cur == '.') || (*cur == '-') || (*cur == '_') ||
383: (IS_COMBINING(*cur)) || (IS_IGNORABLE(*cur)) ||
384: (IS_EXTENDER(*cur)))
385: cur++;
386: name = xmlStrndup(q, cur - q);
387:
388: /*
389: * We should have the equal, we are laxist here and allow attributes
390: * without values and extra spaces.
391: */
392: SKIP_BLANKS(cur);
393: if (*cur == '=') {
394: cur++;
395: SKIP_BLANKS(cur);
396: if ((*cur != '\'') && (*cur != '"')) {
1.7 veillard 397: fprintf(stderr, "Quotes were expected for attribute value %.20s\n",
1.3 veillard 398: q);
399: } else
400: value = xmlParseQuotedString(&cur);
401: }
402:
403: /*
404: * Add the attribute to the node.
405: */
406: if (name != NULL)
407: xmlNewProp(node, name, value);
408:
409: *p = cur;
410: }
411:
412: /*
1.2 veillard 413: * xmlParseStartTag: parse a start of tag.
414: */
415:
1.3 veillard 416: xmlNodePtr xmlParseStartTag(CHAR **p, xmlDocPtr doc) {
417: CHAR *cur = *p, *q, *ns, *name;
418: xmlDtdPtr dtd = NULL;
1.2 veillard 419: xmlNodePtr ret = NULL;
420:
421: /*
1.3 veillard 422: * Theorically one should just parse a Name, but with the addition
423: * of the namespace needed for WebDav, it's a bit more complicated
424: * since the element name may be prefixed by a namespace prefix.
425: *
426: * QName ::= (NSPart ':')? LocalPart
427: * NSPart ::= Name
428: * LocalPart ::= Name
429: * STag ::= '<' QName (S Attribute)* S? '>'
430: *
431: * instead of :
432: *
433: * STag ::= '<' QName (S Attribute)* S? '>'
1.2 veillard 434: */
1.3 veillard 435: if (*cur != '<') return(NULL);
436: cur++;
437:
438: if (!IS_LETTER(*cur) && (*cur != '_')) return(NULL);
439: q = cur++;
440: while ((IS_LETTER(*cur)) || (IS_DIGIT(*cur)) ||
441: (*cur == '.') || (*cur == '-') || (*cur == '_') ||
442: (IS_COMBINING(*cur)) || (IS_IGNORABLE(*cur)) ||
443: (IS_EXTENDER(*cur)))
444: cur++;
445:
446: if (*cur == ':') {
447: ns = xmlStrndup(q, cur - q);
448:
449: cur++; /* skip the column */
450: if (!IS_LETTER(*cur) && (*cur != '_')) {
1.7 veillard 451: fprintf(stderr,
452: "Start tag : no element name after namespace identifier %.20s\n",
1.3 veillard 453: q);
454: free(ns);
455: *p = cur;
456: return(NULL);
457: }
458: q = cur++;
459: while ((IS_LETTER(*cur)) || (IS_DIGIT(*cur)) ||
460: (*cur == '.') || (*cur == '-') || (*cur == '_') ||
461: (IS_COMBINING(*cur)) || (IS_IGNORABLE(*cur)) ||
462: (IS_EXTENDER(*cur)))
463: cur++;
464: name = xmlStrndup(q, cur - q);
465:
466: /*
467: * Search the DTD associated to ns.
468: */
469: dtd = xmlSearchDtd(doc, ns);
470: if (dtd == NULL)
1.7 veillard 471: fprintf(stderr, "Start tag : Couldn't find namespace %s\n", ns);
1.3 veillard 472: free(ns);
473: } else
474: name = xmlStrndup(q, cur - q);
475:
476: ret = xmlNewNode(dtd, name, NULL);
1.2 veillard 477:
1.3 veillard 478: /*
479: * Now parse the attributes, it ends up with the ending
480: *
481: * (S Attribute)* S?
482: */
483: SKIP_BLANKS(cur);
484: while ((IS_CHAR(*cur)) &&
485: (*cur != '>') &&
486: ((cur[0] != '/') || (cur[1] != '>'))) {
487: if (IS_LETTER(*cur) || (*cur == '_'))
488: xmlParseAttribute(&cur, ret);
489: else {
490: /* We should warn !!! */
491: cur++;
492: }
493: SKIP_BLANKS(cur);
494: }
495:
496: *p = cur;
497: return(ret);
498: }
499:
500: /*
1.7 veillard 501: * xmlParseEndTag: parse an end of tag, note that the '</' part has
502: * already been read.
503: */
504:
505: void xmlParseEndTag(CHAR **p, xmlDocPtr doc, xmlDtdPtr *dtdPtr, CHAR **tagPtr) {
506: CHAR *cur = *p, *q, *ns, *name;
507: xmlDtdPtr dtd = NULL;
508:
509: *dtdPtr = NULL;
510: *tagPtr = NULL;
511:
512: /*
513: * Theorically one should just parse a Name, but with the addition
514: * of the namespace needed for WebDav, it's a bit more complicated
515: * since the element name may be prefixed by a namespace prefix.
516: *
517: * QName ::= (NSPart ':')? LocalPart
518: * NSPart ::= Name
519: * LocalPart ::= Name
520: * ETag ::= '</' QName S? '>'
521: *
522: * instead of :
523: *
524: * ETag ::= '</' Name S? '>'
525: */
526: if (!IS_LETTER(*cur) && (*cur != '_')) return;
527: q = cur++;
528: while ((IS_LETTER(*cur)) || (IS_DIGIT(*cur)) ||
529: (*cur == '.') || (*cur == '-') || (*cur == '_') ||
530: (IS_COMBINING(*cur)) || (IS_IGNORABLE(*cur)) ||
531: (IS_EXTENDER(*cur)))
532: cur++;
533:
534: if (*cur == ':') {
535: ns = xmlStrndup(q, cur - q);
536:
537: cur++; /* skip the column */
538: if (!IS_LETTER(*cur) && (*cur != '_')) {
539: fprintf(stderr,
540: "End tag : no element name after namespace identifier %.20s\n",
541: q);
542: free(ns);
543: *p = cur;
544: return;
545: }
546: q = cur++;
547: while ((IS_LETTER(*cur)) || (IS_DIGIT(*cur)) ||
548: (*cur == '.') || (*cur == '-') || (*cur == '_') ||
549: (IS_COMBINING(*cur)) || (IS_IGNORABLE(*cur)) ||
550: (IS_EXTENDER(*cur)))
551: cur++;
552: name = xmlStrndup(q, cur - q);
553:
554: /*
555: * Search the DTD associated to ns.
556: */
557: dtd = xmlSearchDtd(doc, ns);
558: if (dtd == NULL)
559: fprintf(stderr, "End tag : Couldn't find namespace %s\n", ns);
560: free(ns);
561: } else
562: name = xmlStrndup(q, cur - q);
563:
564: *dtdPtr = dtd;
565: *tagPtr = name;
566:
567: /*
568: * We should definitely be at the ending "S? '>'" part
569: */
570: SKIP_BLANKS(cur);
571: if ((!IS_CHAR(*cur)) || (*cur != '>')) {
572: fprintf(stderr, "End tag : expected '>', got %.20s\n", cur);
573: /*
574: * Note : skipping to the next '>' is probably otherkill,
575: * especially in case the '>' is hust missing.
576: *
577: * Otherwise add:
578: * MOVETO_ENDTAG(cur);
579: */
580: } else
581: cur++;
582:
583: *p = cur;
584: return;
585: }
586:
587: /*
1.3 veillard 588: * xmlParseCDSect: escaped pure raw content.
589: */
590: CHAR *xmlParseCDSect(CHAR **p) {
591: CHAR *cur = *p, *r, *s, *base, *ret;
592:
593: base = cur;
594: if (!IS_CHAR(*cur)) {
1.7 veillard 595: fprintf(stderr, "CData section not finished : %.20s\n", base);
1.3 veillard 596: return(NULL);
597: }
598: r = cur++;
599: if (!IS_CHAR(*cur)) {
1.7 veillard 600: fprintf(stderr, "CData section not finished : %.20s\n", base);
1.3 veillard 601: return(NULL);
602: }
603: s = cur++;
604: while (IS_CHAR(*cur) &&
605: ((*r != ']') || (*s != ']') || (*cur != '>'))) {
606: r++;s++;cur++;
607: }
608: if (!IS_CHAR(*cur)) {
1.7 veillard 609: fprintf(stderr, "CData section not finished : %.20s\n", base);
1.3 veillard 610: return(NULL);
611: }
612: ret = xmlStrndup(base, cur-base);
1.2 veillard 613: *p = cur;
614: return(ret);
615: }
616:
617: /*
618: * xmlParseContent: a content is
619: * (element | PCData | Reference | CDSect | PI | Comment)
620: *
621: * element : starts by '<'
622: * PCData : any CHAR but '&' or '<'
623: * Reference : starts by '&'
624: * CDSect : starts by '<![CDATA['
625: * PI : starts by '<?'
626: */
627:
1.3 veillard 628: xmlNodePtr xmlParseContent(CHAR **p, xmlDocPtr doc, xmlNodePtr node) {
629: CHAR *cur = *p, *q, *data = NULL;
1.2 veillard 630: xmlNodePtr ret = NULL;
631:
632: /*
1.3 veillard 633: * First case : a Processing Instruction.
634: */
635: if ((cur[0] == '<') && (cur[1] == '?')) {
636: xmlParsePI(&cur, doc);
637: }
638: /*
639: * Second case : a CDSection
1.2 veillard 640: */
1.3 veillard 641: if ((cur[0] == '<') && (cur[1] == '!') && (cur[2] == '[') &&
642: (cur[3] == 'C') && (cur[4] == 'D') && (cur[5] == 'A') &&
643: (cur[6] == 'T') && (cur[7] == 'A') && (cur[8] == '[')) {
644: cur += 9;
645: data = xmlParseCDSect(&cur);
646: }
647: /*
648: * Third case : a sub-element.
649: */
650: else if (cur[0] == '<') {
651: ret = xmlParseElement(&cur, doc);
652: }
653: /*
654: * Last case, text. Note that References are handled directly.
655: */
656: else {
657: q = cur;
658: while (IS_CHAR(*cur) && (*cur != '<')) cur++;
659:
660: if (!IS_CHAR(*cur)) {
1.7 veillard 661: fprintf(stderr, "Truncated content : %.50s\n", q);
1.4 veillard 662: *p = cur;
1.3 veillard 663: return(NULL);
664: }
665: data = xmlStrndup(q, cur - q);
666: /* Should apply the &...; reduction !!!! */
667: }
668:
669: /*
670: * Handle the data if any. If there is no child
671: * add it as content, otherwise create a new node of type text.
672: */
673: if (data != NULL)
674: data = xmlHandleData(data);
675: if (data != NULL) {
676: if (node->childs == NULL)
677: xmlNodeSetContent(node, data);
678: else {
679: ret = xmlNewText(data);
680: }
681: }
1.2 veillard 682:
683: *p = cur;
684: return(ret);
685: }
686:
687: /*
688: * xmlParseElement: parse an XML element
689: */
690:
1.3 veillard 691: xmlNodePtr xmlParseElement(CHAR **p, xmlDocPtr doc) {
1.2 veillard 692: CHAR *cur = *p;
693: xmlNodePtr ret, child;
1.7 veillard 694: CHAR *openTag = *p;
695: CHAR *closeTag = *p;
1.2 veillard 696:
1.3 veillard 697: ret = xmlParseStartTag(&cur, doc);
698: if (ret == NULL) {
699: *p = cur;
700: return(NULL);
701: }
1.2 veillard 702:
703: /*
704: * Check for an Empty Element.
705: */
706: if ((cur[0] == '/') && (cur[1] == '>')) {
707: cur += 2;
708: *p = cur;
709: return(ret);
710: }
711: if (cur[0] == '>') cur++;
712: else {
1.7 veillard 713: fprintf(stderr, "Couldn't find end of Start Tag %.30s\n", *p);
1.2 veillard 714: *p = cur;
715: return(ret);
716: }
717:
718: /*
719: * Parse the content of the element:
720: * (element | PCData | Reference | CDSect | PI | Comment) *
721: *
722: * element : starts by '<'
723: * PCData : any CHAR but '&' or '<'
724: * Reference : starts by '&'
725: * CDSect : starts by '<![CDATA['
726: * PI : starts by '<?'
727: *
728: * The loop stops upon detection of an end of tag '</'
729: */
730: while ((IS_CHAR(cur[0])) && ((cur[0] != '<') || (cur[1] != '/'))) {
1.3 veillard 731: child = xmlParseContent(&cur, doc, ret);
1.2 veillard 732: if (child != NULL)
733: xmlAddChild(ret, child);
734: }
735: if (!IS_CHAR(cur[0])) {
1.7 veillard 736: fprintf(stderr, "Premature end of data in tag %.30s\n", *p);
1.2 veillard 737: *p = cur;
738: return(ret);
739: }
740:
741: /*
742: * parse the end of tag : '</' has been detected.
743: */
744: cur += 2;
745: if (*cur == '>') cur++; /* simplified closing </> */
746: else {
1.7 veillard 747: CHAR *endTag;
748: xmlDtdPtr endDtd;
749:
750: xmlParseEndTag(&cur, doc, &endDtd, &endTag);
751:
1.2 veillard 752: /*
1.7 veillard 753: * Check that the Name in the ETag is the same as in the STag.
1.2 veillard 754: */
1.7 veillard 755: if (endDtd != ret->dtd) {
756: fprintf(stderr, "Start and End tags don't use the same DTD:\n");
757: fprintf(stderr, "\t%.30s\n\t%.30s\n", openTag, closeTag);
758: }
759: if (strcmp(ret->name, endTag)) {
760: fprintf(stderr, "Start and End tags don't use the same name:\n");
761: fprintf(stderr, "\t%.30s\n\t%.30s\n", openTag, closeTag);
762: }
1.2 veillard 763: }
764:
765: *p = cur;
766: return(ret);
767: }
768:
769: /*
1.1 veillard 770: * xmlParseXMLDecl: parse an XML declaration header
771: */
772:
773: xmlDocPtr xmlParseXMLDecl(CHAR **p) {
774: CHAR *cur = *p;
775: CHAR *version;
776: xmlDocPtr ret;
777:
778: /*
779: * We know that '<?XML' is here.
780: */
781: cur += 5;
782:
783: /*
784: * Parse the version info
785: */
786: SKIP_BLANKS(cur);
787:
788: /*
789: * We should have 'version=' here !
790: */
791: if ((cur[0] == 'v') && (cur[1] == 'e') && (cur[2] == 'r') &&
792: (cur[3] == 's') && (cur[4] == 'i') && (cur[5] == 'o') &&
793: (cur[6] == 'n') && (cur[7] == '=')) {
794: cur += 8;
795: version = xmlParseQuotedString(&cur);
796: if (version == NULL)
797: ret = xmlNewDoc(XML_DEFAULT_VERSION);
798: else {
799: ret = xmlNewDoc(version);
1.8 veillard 800: free(version);
1.1 veillard 801: }
802: } else {
803: ret = xmlNewDoc(XML_DEFAULT_VERSION);
804: }
805:
806: /*
807: * We should check for encoding !!!!
808: */
809:
810: /*
811: * We should check for Required Markup Declaration !!!!
812: */
813: MOVETO_ENDTAG(cur);
814: cur++;
815:
816: *p = cur;
817: return(ret);
818: }
819:
820: /*
821: * xmlParseMisc: parse an XML Misc optionnal field.
822: * (Comment | PI | S)*
823: */
824:
1.3 veillard 825: void xmlParseMisc(CHAR **p, xmlDocPtr doc) {
1.1 veillard 826: CHAR *cur = *p;
827:
828: while (((cur[0] == '<') && (cur[1] == '?')) ||
829: ((cur[0] == '<') && (cur[1] == '!') &&
830: (cur[2] == '-') && (cur[2] == '-')) ||
831: IS_BLANK(*cur)) {
832: if ((cur[0] == '<') && (cur[1] == '?')) {
1.3 veillard 833: xmlParsePI(&cur, doc);
1.1 veillard 834: } else if (IS_BLANK(*cur)) {
835: cur++;
836: } else
837: xmlParserSkipComment(&cur);
838: }
839:
840: *p = cur;
841: }
842:
843: /*
844: * xmlParseDoc : parse an XML document and build a tree.
845: */
846:
847: xmlDocPtr xmlParseDoc(CHAR *cur) {
848: xmlDocPtr ret;
849:
850: /*
851: * Wipe out everything which is before the first '<'
852: */
853: SKIP_BLANKS(cur);
854:
855: /*
856: * Check for the XMLDecl in the Prolog.
857: */
858: if ((cur[0] == '<') && (cur[1] == '?') &&
859: (cur[2] == 'X') && (cur[3] == 'M') &&
860: (cur[4] == 'L')) {
861: ret = xmlParseXMLDecl(&cur);
862: /* SKIP_EOL(cur); */
863: SKIP_BLANKS(cur);
864: } else {
865: ret = xmlNewDoc(XML_DEFAULT_VERSION);
866: }
867:
868: /*
869: * The Misc part of the Prolog
870: * (Comment | PI | S) *
871: */
872: xmlParseMisc(&cur, ret);
873:
874: /*
1.2 veillard 875: * Time to start parsing
1.1 veillard 876: */
1.3 veillard 877: ret->root = xmlParseElement(&cur, ret);
1.1 veillard 878:
879: return(ret);
880: }
881:
1.9 httpng 882: /*
883: * xmlParseFile : parse an XML file and build a tree.
884: */
885:
886: xmlDocPtr xmlParseFile(const char *filename) {
887: xmlDocPtr ret;
888: int input;
889: int res;
890: struct stat buf;
891: char *buffer;
892:
893: res = stat(buffer, &buf);
894: if (res < 0) return(NULL);
895:
896: buffer = malloc(buf.st_size + 100);
897: if (buffer == NULL) {
898: perror("malloc");
899: return(NULL);
900: }
901:
902: memset(buffer, 0, sizeof(buffer));
903: input = open (filename, O_RDONLY);
904: if (input < 0) {
905: fprintf (stderr, "Cannot read file %s :\n", filename);
906: perror ("open failed");
907: return(NULL);
908: }
909: res = read(input, buffer, buf.st_size);
910: if (res < 0) {
911: fprintf (stderr, "Cannot read file %s :\n", filename);
912: perror ("read failed");
913: return(NULL);
914: }
915: close(input);
916:
917: buffer[buf.st_size] = '\0';
918: ret = xmlParseDoc(buffer);
919: free(buffer);
920: return(ret);
921: }
Webmaster