Annotation of XML/parser.c, revision 1.8
1.1 veillard 1: /*
1.3 veillard 2: * parser.c : an XML 1.0 non-verifying parser
1.1 veillard 3: */
4:
5: #include <stdio.h>
6: #include <ctype.h>
7: #include <string.h>
8: #include <malloc.h>
9:
10: #include "parser.h"
11: #include "tree.h"
12:
/*
 * Character-class macros used to build the parser.
 *
 * Two variants exist: an (incomplete) UNICODE one, selected by defining
 * UNICODE, and an 8-bit / ASCII one used otherwise. Every macro may
 * evaluate its argument several times, so never pass an expression with
 * side effects (e.g. IS_BLANK(*p++)).
 *
 * NOTE(review): the range tests assume the character value is not
 * negative; if CHAR is a signed 8-bit type, bytes >= 0x80 will fail
 * them - confirm the definition of CHAR.
 */

#ifdef UNICODE
/*
 * UNICODE version of the macros. Incomplete now !!!!
 */
#define IS_CHAR(c) \
    (((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d) || \
     (((c) >= 0x20) && ((c) != 0xFFFE) && ((c) != 0xFFFF)))

/*
 * Advance p over white space: space, tab, LF, CR and U+3000.
 * Expands to a single statement; terminate the call with ';'.
 */
#define SKIP_BLANKS(p) \
    do { \
        while ((*(p) == 0x20) || (*(p) == 0x09) || (*(p) == 0x0a) || \
               (*(p) == 0x0d) || (*(p) == 0x3000)) \
            (p)++; \
    } while (0)

/*
 * Incomplete: Latin-1 letters only.
 * The former clause "(c) >= 0xaa && (c) <= 0x5b" could never be true and
 * has been dropped; behaviour is unchanged.
 * NOTE(review): 0xba is kept as in the original - confirm it is wanted.
 */
#define IS_BASECHAR(c) \
    ((((c) >= 0x41) && ((c) <= 0x5a)) || \
     (((c) >= 0x61) && ((c) <= 0x7a)) || \
     (((c) >= 0xc0) && ((c) <= 0xd6)) || \
     (((c) >= 0xd8) && ((c) <= 0xf6)) || \
     (((c) >= 0xf8) && ((c) <= 0xff)) || \
     ((c) == 0xba))

/* Incomplete: ASCII digits only. */
#define IS_DIGIT(c) (((c) >= 0x30) && ((c) <= 0x39))

/* Incomplete: no combining character is recognised yet. */
#define IS_COMBINING(c) 0

#define IS_IGNORABLE(c) \
    ((((c) >= 0x200c) && ((c) <= 0x200f)) || \
     (((c) >= 0x202a) && ((c) <= 0x202e)) || \
     (((c) >= 0x206a) && ((c) <= 0x206f)) || \
     ((c) == 0xfeff))

/* The "||" after the 0x3005 test was missing, which broke every expansion. */
#define IS_EXTENDER(c) \
    (((c) == 0xb7) || ((c) == 0x2d0) || ((c) == 0x2d1) || \
     ((c) == 0x387) || ((c) == 0x640) || ((c) == 0xe46) || \
     ((c) == 0xec6) || ((c) == 0x3005) || \
     (((c) >= 0x3031) && ((c) <= 0x3035)) || \
     (((c) >= 0x309b) && ((c) <= 0x309e)) || \
     (((c) >= 0x30fc) && ((c) <= 0x30fe)) || \
     (((c) >= 0xff70) && ((c) <= 0xff9e)) || \
     ((c) == 0xff9f))

#define IS_IDEOGRAPHIC(c) \
    ((((c) >= 0x4e00) && ((c) <= 0x9fa5)) || \
     (((c) >= 0xf900) && ((c) <= 0xfa2d)) || \
     (((c) >= 0x3021) && ((c) <= 0x3029)) || \
     ((c) == 0x3007))

#define IS_LETTER(c) (IS_BASECHAR(c) || IS_IDEOGRAPHIC(c))

/* White space as defined by XML: space, tab, LF and CR. */
#define IS_BLANK(c) \
    (((c) == 0x20) || ((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d))
#else
/*
 * 8bits / ASCII version of the macros.
 */
#define IS_CHAR(c) \
    (((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d) || ((c) >= 0x20))

/*
 * Latin-1 letters.
 * The former clause "(c) >= 0xaa && (c) <= 0x5b" could never be true and
 * has been dropped; behaviour is unchanged.
 * NOTE(review): 0xba is kept as in the original - confirm it is wanted.
 */
#define IS_BASECHAR(c) \
    ((((c) >= 0x41) && ((c) <= 0x5a)) || \
     (((c) >= 0x61) && ((c) <= 0x7a)) || \
     (((c) >= 0xc0) && ((c) <= 0xd6)) || \
     (((c) >= 0xd8) && ((c) <= 0xf6)) || \
     (((c) >= 0xf8) && ((c) <= 0xff)) || \
     ((c) == 0xba))

#define IS_DIGIT(c) (((c) >= 0x30) && ((c) <= 0x39))

#define IS_LETTER(c) IS_BASECHAR(c)

#define IS_COMBINING(c) 0

#define IS_IGNORABLE(c) 0

#define IS_EXTENDER(c) ((c) == 0xb7)

/* White space as defined by XML: space, tab, LF and CR. */
#define IS_BLANK(c) \
    (((c) == 0x20) || ((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d))
#endif
99:
100:
/*
 * Cursor-advancing helpers. Each one expands to a single statement, so
 * it must be followed by ';' and is safe inside an unbraced if/else.
 * The argument is evaluated several times: pass a plain pointer lvalue.
 */

/*
 * Skip one end of line: CR, LF, CR LF or LF CR.
 * CR is 0x0d and LF is 0x0a; the previous 0x13 / 0x10 were the decimal
 * values written as hexadecimal. Only one line end is consumed.
 */
#define SKIP_EOL(p) \
    do { \
        if (*(p) == 0x0d) { \
            (p)++; \
            if (*(p) == 0x0a) (p)++; \
        } else if (*(p) == 0x0a) { \
            (p)++; \
            if (*(p) == 0x0d) (p)++; \
        } \
    } while (0)

/*
 * Skip every blank under p. Not redefined when a variant-specific
 * definition (the UNICODE one) is already present.
 */
#ifndef SKIP_BLANKS
#define SKIP_BLANKS(p) \
    do { \
        while (IS_BLANK(*(p))) (p)++; \
    } while (0)
#endif

/* Move p to the next '>' or to the first non-CHAR (end of data). */
#define MOVETO_ENDTAG(p) \
    do { \
        while (IS_CHAR(*(p)) && (*(p) != '>')) (p)++; \
    } while (0)

/* Move p to the next '<' or to the first non-CHAR (end of data). */
#define MOVETO_STARTTAG(p) \
    do { \
        while (IS_CHAR(*(p)) && (*(p) != '<')) (p)++; \
    } while (0)
113:
114: /*
1.3 veillard 115: * Forward declaration, needed because element parsing is recursive.
116: */
117: xmlNodePtr xmlParseElement(CHAR **p, xmlDocPtr doc);
118:
119: /*
120: * xmlHandleData : this routine represent's the specific application
121: * behaviour when reading a piece of text.
122: *
123: * For example in WebDav, any piece made only of blanks is eliminated
124: */
125:
126: CHAR *xmlHandleData(CHAR *in) {
127: CHAR *cur;
128:
129: if (in == NULL) return(NULL);
130: cur = in;
131: while (IS_CHAR(*cur)) {
132: if (!IS_BLANK(*cur)) goto not_blank;
133: cur++;
134: }
135: free(in);
136: return(NULL);
137:
138: not_blank:
139: return(in);
140: }
141:
142: /*
1.1 veillard 143: * xmlStrndup : a strdup for array of CHAR's
144: */
145:
1.6 httpng 146: CHAR *xmlStrndup(const CHAR *cur, int len) {
1.1 veillard 147: CHAR *ret = malloc((len + 1) * sizeof(CHAR));
148:
149: if (ret == NULL) {
150: fprintf(stderr, "malloc of %d byte failed\n",
151: (len + 1) * sizeof(CHAR));
152: return(NULL);
153: }
154: memcpy(ret, cur, len * sizeof(CHAR));
155: ret[len] = 0;
156: return(ret);
157: }
158:
159: /*
160: * xmlStrdup : a strdup for CHAR's
161: */
162:
1.6 httpng 163: CHAR *xmlStrdup(const CHAR *cur) {
164: const CHAR *p = cur;
1.1 veillard 165:
166: while (IS_CHAR(*p)) p++;
167: return(xmlStrndup(cur, p - cur));
168: }
169:
170: /*
171: * xmlParseName : parse an XML name.
172: */
173:
1.3 veillard 174: CHAR *xmlParseName(CHAR **p) {
175: CHAR *cur = *p, *q, *ret = NULL;
1.1 veillard 176:
177: /*
1.3 veillard 178: * Name ::= (Letter | '_') (NameChar)*
1.1 veillard 179: */
1.3 veillard 180: if (!IS_LETTER(*cur) && (*cur != '_')) return(NULL);
181: q = cur++;
182: while ((IS_LETTER(*cur)) || (IS_DIGIT(*cur)) ||
183: (*cur == '.') || (*cur == '-') || (*cur == '_') ||
184: (IS_COMBINING(*cur)) || (IS_IGNORABLE(*cur)) ||
185: (IS_EXTENDER(*cur)))
186: cur++;
187:
188: ret = xmlStrndup(q, cur - q);
1.1 veillard 189:
1.3 veillard 190: *p = cur;
191: return(ret);
1.1 veillard 192: }
193:
194: /*
195: * Parse and return a string between quotes or doublequotes
196: */
197: CHAR *xmlParseQuotedString(CHAR **p) {
198: CHAR *ret = NULL;
199: CHAR *cur = *p, *q;
200:
201: if (*cur == '"') {
202: cur++;
203: q = cur;
204: while (IS_CHAR(*cur) && (*cur != '"')) cur++;
205: if (*cur != '"')
1.7 veillard 206: fprintf(stderr, "String not closed \"%.50s\n", q);
1.1 veillard 207: else {
208: ret = xmlStrndup(q, cur - q);
209: cur++;
210: }
211: } else if (*cur == '\''){
212: cur++;
213: q = cur;
214: while (IS_CHAR(*cur) && (*cur != '\'')) cur++;
215: if (*cur != '\'')
1.7 veillard 216: fprintf(stderr, "String not closed '%.50s\n", q);
1.1 veillard 217: else {
218: ret = xmlStrndup(q, cur - q);
219: cur++;
220: }
221: }
222: *p = cur;
223: return(ret);
224: }
225:
226: /*
1.3 veillard 227: * Skip an XML (SGML) comment <!-- .... -->
228: */
229: void xmlParserSkipComment(CHAR **p) {
230: CHAR *cur = *p, *q, *r, *start;
231:
232: /*
233: * An extra check may avoid errors and isn't that costly !
234: */
235: if ((cur[0] != '<') || (cur[1] != '!') ||
236: (cur[2] != '-') || (cur[3] != '-')) return;
237:
238: cur += 4;
239: start = q = cur;
240: cur++;
241: r = cur;
242: cur++;
243: while (IS_CHAR(*cur) &&
244: ((*cur != '>') || (*r != '-') || (*q != '-'))) {
245: cur++;r++;q++;
246: }
247: if (!IS_CHAR(*cur)) {
1.7 veillard 248: fprintf(stderr, "Comment not terminated <!--%.50s\n", start);
1.3 veillard 249: *p = start;
250: } else {
251: cur++;
252: *p = cur;
253: }
254: }
255:
256: /*
1.1 veillard 257: * xmlParseWebdavNamespace: parse Webdav specific '<?namespace ...' constructs.
258: */
259:
260: void xmlParseWebdavNamespace(CHAR **p, xmlDocPtr doc) {
261: CHAR *cur = *p;
262: CHAR *href = NULL;
263: CHAR *AS = NULL;
1.3 veillard 264: int garbage = 0;
1.1 veillard 265:
266: /*
267: * We know that 'namespace' is here.
268: */
269: cur += 9;
270: SKIP_BLANKS(cur);
271:
272: while (IS_CHAR(*cur) && (*cur != '>')) {
273: /*
274: * We can have 'href' or 'AS' attributes.
275: */
276: if ((cur[0] == 'h') && (cur[1] == 'r') && (cur[2] == 'e') &&
277: (cur[3] == 'f')) {
1.3 veillard 278: garbage = 0;
1.1 veillard 279: cur += 4;
280: SKIP_BLANKS(cur);
281:
282: if (*cur != '=') continue;
283: cur++;
284: SKIP_BLANKS(cur);
285:
286: href = xmlParseQuotedString(&cur);
287: SKIP_BLANKS(cur);
288: } else if ((cur[0] == 'A') && (cur[1] == 'S')) {
1.3 veillard 289: garbage = 0;
1.1 veillard 290: cur += 2;
291: SKIP_BLANKS(cur);
292:
293: if (*cur != '=') continue;
294: cur++;
295: SKIP_BLANKS(cur);
296:
297: AS = xmlParseQuotedString(&cur);
298: SKIP_BLANKS(cur);
299: } else if ((cur[0] == '?') && (cur[1] == '>')) {
1.3 veillard 300: garbage = 0;
1.1 veillard 301: cur ++;
302: } else {
1.3 veillard 303: /*
304: * Found garbage when parsing the namespace
305: */
306: if (!garbage) fprintf(stderr,
1.4 veillard 307: "\nxmlParseWebdavNamespace found garbage: ");
1.3 veillard 308: fprintf(stderr, "%c", *cur);
1.1 veillard 309: cur++;
310: }
311: }
312:
313: MOVETO_ENDTAG(cur);
314: cur++;
315:
316: /*
317: * Register the DTD.
318: */
319: if (href != NULL)
320: xmlNewDtd(doc, href, AS);
321:
1.8 ! veillard 322: if (AS != NULL) free(AS);
! 323: if (href != NULL) free(href);
! 324:
1.1 veillard 325: *p = cur;
326: }
327:
328: /*
1.3 veillard 329: * xmlParsePI: parse an XML Processing Instruction.
330: */
331:
332: void xmlParsePI(CHAR **p, xmlDocPtr doc) {
333: CHAR *cur = *p;
334:
335: if ((cur[0] == '<') && (cur[1] == '?')) {
336: /*
337: * this is a Processing Instruction.
338: */
339: cur += 2;
340:
341: /*
342: * Special for WebDav, support for the Processing Instruction
343: * '<?namespace ...' contruct in the header of the XML document.
344: */
345: if ((cur[0] == 'n') && (cur[1] == 'a') &&
346: (cur[2] == 'm') && (cur[3] == 'e') &&
347: (cur[4] == 's') && (cur[5] == 'p') &&
348: (cur[6] == 'a') && (cur[7] == 'c') &&
349: (cur[8] == 'e')) {
350: xmlParseWebdavNamespace(&cur, doc);
351: } else {
352: /* Unknown PI, ignore it ! */
353: MOVETO_ENDTAG(cur);
354: cur++;
355: }
356: }
357: *p = cur;
358: }
359:
360: /*
361: * xmlParseAttribute: parse a start of tag.
362: *
363: * Attribute ::= Name Eq AttValue
364: */
365:
366: void xmlParseAttribute(CHAR **p, xmlNodePtr node) {
367: CHAR *cur = *p, *q, *name, *value = NULL;
368:
369: if (!IS_LETTER(*cur) && (*cur != '_')) {
370: return;
371: }
372: q = cur++;
373: while ((IS_LETTER(*cur)) || (IS_DIGIT(*cur)) ||
374: (*cur == '.') || (*cur == '-') || (*cur == '_') ||
375: (IS_COMBINING(*cur)) || (IS_IGNORABLE(*cur)) ||
376: (IS_EXTENDER(*cur)))
377: cur++;
378: name = xmlStrndup(q, cur - q);
379:
380: /*
381: * We should have the equal, we are laxist here and allow attributes
382: * without values and extra spaces.
383: */
384: SKIP_BLANKS(cur);
385: if (*cur == '=') {
386: cur++;
387: SKIP_BLANKS(cur);
388: if ((*cur != '\'') && (*cur != '"')) {
1.7 veillard 389: fprintf(stderr, "Quotes were expected for attribute value %.20s\n",
1.3 veillard 390: q);
391: } else
392: value = xmlParseQuotedString(&cur);
393: }
394:
395: /*
396: * Add the attribute to the node.
397: */
398: if (name != NULL)
399: xmlNewProp(node, name, value);
400:
401: *p = cur;
402: }
403:
404: /*
1.2 veillard 405: * xmlParseStartTag: parse a start of tag.
406: */
407:
1.3 veillard 408: xmlNodePtr xmlParseStartTag(CHAR **p, xmlDocPtr doc) {
409: CHAR *cur = *p, *q, *ns, *name;
410: xmlDtdPtr dtd = NULL;
1.2 veillard 411: xmlNodePtr ret = NULL;
412:
413: /*
1.3 veillard 414: * Theorically one should just parse a Name, but with the addition
415: * of the namespace needed for WebDav, it's a bit more complicated
416: * since the element name may be prefixed by a namespace prefix.
417: *
418: * QName ::= (NSPart ':')? LocalPart
419: * NSPart ::= Name
420: * LocalPart ::= Name
421: * STag ::= '<' QName (S Attribute)* S? '>'
422: *
423: * instead of :
424: *
425: * STag ::= '<' QName (S Attribute)* S? '>'
1.2 veillard 426: */
1.3 veillard 427: if (*cur != '<') return(NULL);
428: cur++;
429:
430: if (!IS_LETTER(*cur) && (*cur != '_')) return(NULL);
431: q = cur++;
432: while ((IS_LETTER(*cur)) || (IS_DIGIT(*cur)) ||
433: (*cur == '.') || (*cur == '-') || (*cur == '_') ||
434: (IS_COMBINING(*cur)) || (IS_IGNORABLE(*cur)) ||
435: (IS_EXTENDER(*cur)))
436: cur++;
437:
438: if (*cur == ':') {
439: ns = xmlStrndup(q, cur - q);
440:
441: cur++; /* skip the column */
442: if (!IS_LETTER(*cur) && (*cur != '_')) {
1.7 veillard 443: fprintf(stderr,
444: "Start tag : no element name after namespace identifier %.20s\n",
1.3 veillard 445: q);
446: free(ns);
447: *p = cur;
448: return(NULL);
449: }
450: q = cur++;
451: while ((IS_LETTER(*cur)) || (IS_DIGIT(*cur)) ||
452: (*cur == '.') || (*cur == '-') || (*cur == '_') ||
453: (IS_COMBINING(*cur)) || (IS_IGNORABLE(*cur)) ||
454: (IS_EXTENDER(*cur)))
455: cur++;
456: name = xmlStrndup(q, cur - q);
457:
458: /*
459: * Search the DTD associated to ns.
460: */
461: dtd = xmlSearchDtd(doc, ns);
462: if (dtd == NULL)
1.7 veillard 463: fprintf(stderr, "Start tag : Couldn't find namespace %s\n", ns);
1.3 veillard 464: free(ns);
465: } else
466: name = xmlStrndup(q, cur - q);
467:
468: ret = xmlNewNode(dtd, name, NULL);
1.2 veillard 469:
1.3 veillard 470: /*
471: * Now parse the attributes, it ends up with the ending
472: *
473: * (S Attribute)* S?
474: */
475: SKIP_BLANKS(cur);
476: while ((IS_CHAR(*cur)) &&
477: (*cur != '>') &&
478: ((cur[0] != '/') || (cur[1] != '>'))) {
479: if (IS_LETTER(*cur) || (*cur == '_'))
480: xmlParseAttribute(&cur, ret);
481: else {
482: /* We should warn !!! */
483: cur++;
484: }
485: SKIP_BLANKS(cur);
486: }
487:
488: *p = cur;
489: return(ret);
490: }
491:
492: /*
1.7 veillard 493: * xmlParseEndTag: parse an end of tag, note that the '</' part has
494: * already been read.
495: */
496:
497: void xmlParseEndTag(CHAR **p, xmlDocPtr doc, xmlDtdPtr *dtdPtr, CHAR **tagPtr) {
498: CHAR *cur = *p, *q, *ns, *name;
499: xmlDtdPtr dtd = NULL;
500:
501: *dtdPtr = NULL;
502: *tagPtr = NULL;
503:
504: /*
505: * Theorically one should just parse a Name, but with the addition
506: * of the namespace needed for WebDav, it's a bit more complicated
507: * since the element name may be prefixed by a namespace prefix.
508: *
509: * QName ::= (NSPart ':')? LocalPart
510: * NSPart ::= Name
511: * LocalPart ::= Name
512: * ETag ::= '</' QName S? '>'
513: *
514: * instead of :
515: *
516: * ETag ::= '</' Name S? '>'
517: */
518: if (!IS_LETTER(*cur) && (*cur != '_')) return;
519: q = cur++;
520: while ((IS_LETTER(*cur)) || (IS_DIGIT(*cur)) ||
521: (*cur == '.') || (*cur == '-') || (*cur == '_') ||
522: (IS_COMBINING(*cur)) || (IS_IGNORABLE(*cur)) ||
523: (IS_EXTENDER(*cur)))
524: cur++;
525:
526: if (*cur == ':') {
527: ns = xmlStrndup(q, cur - q);
528:
529: cur++; /* skip the column */
530: if (!IS_LETTER(*cur) && (*cur != '_')) {
531: fprintf(stderr,
532: "End tag : no element name after namespace identifier %.20s\n",
533: q);
534: free(ns);
535: *p = cur;
536: return;
537: }
538: q = cur++;
539: while ((IS_LETTER(*cur)) || (IS_DIGIT(*cur)) ||
540: (*cur == '.') || (*cur == '-') || (*cur == '_') ||
541: (IS_COMBINING(*cur)) || (IS_IGNORABLE(*cur)) ||
542: (IS_EXTENDER(*cur)))
543: cur++;
544: name = xmlStrndup(q, cur - q);
545:
546: /*
547: * Search the DTD associated to ns.
548: */
549: dtd = xmlSearchDtd(doc, ns);
550: if (dtd == NULL)
551: fprintf(stderr, "End tag : Couldn't find namespace %s\n", ns);
552: free(ns);
553: } else
554: name = xmlStrndup(q, cur - q);
555:
556: *dtdPtr = dtd;
557: *tagPtr = name;
558:
559: /*
560: * We should definitely be at the ending "S? '>'" part
561: */
562: SKIP_BLANKS(cur);
563: if ((!IS_CHAR(*cur)) || (*cur != '>')) {
564: fprintf(stderr, "End tag : expected '>', got %.20s\n", cur);
565: /*
566: * Note : skipping to the next '>' is probably otherkill,
567: * especially in case the '>' is hust missing.
568: *
569: * Otherwise add:
570: * MOVETO_ENDTAG(cur);
571: */
572: } else
573: cur++;
574:
575: *p = cur;
576: return;
577: }
578:
579: /*
1.3 veillard 580: * xmlParseCDSect: escaped pure raw content.
581: */
582: CHAR *xmlParseCDSect(CHAR **p) {
583: CHAR *cur = *p, *r, *s, *base, *ret;
584:
585: base = cur;
586: if (!IS_CHAR(*cur)) {
1.7 veillard 587: fprintf(stderr, "CData section not finished : %.20s\n", base);
1.3 veillard 588: return(NULL);
589: }
590: r = cur++;
591: if (!IS_CHAR(*cur)) {
1.7 veillard 592: fprintf(stderr, "CData section not finished : %.20s\n", base);
1.3 veillard 593: return(NULL);
594: }
595: s = cur++;
596: while (IS_CHAR(*cur) &&
597: ((*r != ']') || (*s != ']') || (*cur != '>'))) {
598: r++;s++;cur++;
599: }
600: if (!IS_CHAR(*cur)) {
1.7 veillard 601: fprintf(stderr, "CData section not finished : %.20s\n", base);
1.3 veillard 602: return(NULL);
603: }
604: ret = xmlStrndup(base, cur-base);
1.2 veillard 605: *p = cur;
606: return(ret);
607: }
608:
609: /*
610: * xmlParseContent: a content is
611: * (element | PCData | Reference | CDSect | PI | Comment)
612: *
613: * element : starts by '<'
614: * PCData : any CHAR but '&' or '<'
615: * Reference : starts by '&'
616: * CDSect : starts by '<![CDATA['
617: * PI : starts by '<?'
618: */
619:
1.3 veillard 620: xmlNodePtr xmlParseContent(CHAR **p, xmlDocPtr doc, xmlNodePtr node) {
621: CHAR *cur = *p, *q, *data = NULL;
1.2 veillard 622: xmlNodePtr ret = NULL;
623:
624: /*
1.3 veillard 625: * First case : a Processing Instruction.
626: */
627: if ((cur[0] == '<') && (cur[1] == '?')) {
628: xmlParsePI(&cur, doc);
629: }
630: /*
631: * Second case : a CDSection
1.2 veillard 632: */
1.3 veillard 633: if ((cur[0] == '<') && (cur[1] == '!') && (cur[2] == '[') &&
634: (cur[3] == 'C') && (cur[4] == 'D') && (cur[5] == 'A') &&
635: (cur[6] == 'T') && (cur[7] == 'A') && (cur[8] == '[')) {
636: cur += 9;
637: data = xmlParseCDSect(&cur);
638: }
639: /*
640: * Third case : a sub-element.
641: */
642: else if (cur[0] == '<') {
643: ret = xmlParseElement(&cur, doc);
644: }
645: /*
646: * Last case, text. Note that References are handled directly.
647: */
648: else {
649: q = cur;
650: while (IS_CHAR(*cur) && (*cur != '<')) cur++;
651:
652: if (!IS_CHAR(*cur)) {
1.7 veillard 653: fprintf(stderr, "Truncated content : %.50s\n", q);
1.4 veillard 654: *p = cur;
1.3 veillard 655: return(NULL);
656: }
657: data = xmlStrndup(q, cur - q);
658: /* Should apply the &...; reduction !!!! */
659: }
660:
661: /*
662: * Handle the data if any. If there is no child
663: * add it as content, otherwise create a new node of type text.
664: */
665: if (data != NULL)
666: data = xmlHandleData(data);
667: if (data != NULL) {
668: if (node->childs == NULL)
669: xmlNodeSetContent(node, data);
670: else {
671: ret = xmlNewText(data);
672: }
673: }
1.2 veillard 674:
675: *p = cur;
676: return(ret);
677: }
678:
679: /*
680: * xmlParseElement: parse an XML element
681: */
682:
1.3 veillard 683: xmlNodePtr xmlParseElement(CHAR **p, xmlDocPtr doc) {
1.2 veillard 684: CHAR *cur = *p;
685: xmlNodePtr ret, child;
1.7 veillard 686: CHAR *openTag = *p;
687: CHAR *closeTag = *p;
1.2 veillard 688:
1.3 veillard 689: ret = xmlParseStartTag(&cur, doc);
690: if (ret == NULL) {
691: *p = cur;
692: return(NULL);
693: }
1.2 veillard 694:
695: /*
696: * Check for an Empty Element.
697: */
698: if ((cur[0] == '/') && (cur[1] == '>')) {
699: cur += 2;
700: *p = cur;
701: return(ret);
702: }
703: if (cur[0] == '>') cur++;
704: else {
1.7 veillard 705: fprintf(stderr, "Couldn't find end of Start Tag %.30s\n", *p);
1.2 veillard 706: *p = cur;
707: return(ret);
708: }
709:
710: /*
711: * Parse the content of the element:
712: * (element | PCData | Reference | CDSect | PI | Comment) *
713: *
714: * element : starts by '<'
715: * PCData : any CHAR but '&' or '<'
716: * Reference : starts by '&'
717: * CDSect : starts by '<![CDATA['
718: * PI : starts by '<?'
719: *
720: * The loop stops upon detection of an end of tag '</'
721: */
722: while ((IS_CHAR(cur[0])) && ((cur[0] != '<') || (cur[1] != '/'))) {
1.3 veillard 723: child = xmlParseContent(&cur, doc, ret);
1.2 veillard 724: if (child != NULL)
725: xmlAddChild(ret, child);
726: }
727: if (!IS_CHAR(cur[0])) {
1.7 veillard 728: fprintf(stderr, "Premature end of data in tag %.30s\n", *p);
1.2 veillard 729: *p = cur;
730: return(ret);
731: }
732:
733: /*
734: * parse the end of tag : '</' has been detected.
735: */
736: cur += 2;
737: if (*cur == '>') cur++; /* simplified closing </> */
738: else {
1.7 veillard 739: CHAR *endTag;
740: xmlDtdPtr endDtd;
741:
742: xmlParseEndTag(&cur, doc, &endDtd, &endTag);
743:
1.2 veillard 744: /*
1.7 veillard 745: * Check that the Name in the ETag is the same as in the STag.
1.2 veillard 746: */
1.7 veillard 747: if (endDtd != ret->dtd) {
748: fprintf(stderr, "Start and End tags don't use the same DTD:\n");
749: fprintf(stderr, "\t%.30s\n\t%.30s\n", openTag, closeTag);
750: }
751: if (strcmp(ret->name, endTag)) {
752: fprintf(stderr, "Start and End tags don't use the same name:\n");
753: fprintf(stderr, "\t%.30s\n\t%.30s\n", openTag, closeTag);
754: }
1.2 veillard 755: }
756:
757: *p = cur;
758: return(ret);
759: }
760:
761: /*
1.1 veillard 762: * xmlParseXMLDecl: parse an XML declaration header
763: */
764:
765: xmlDocPtr xmlParseXMLDecl(CHAR **p) {
766: CHAR *cur = *p;
767: CHAR *version;
768: xmlDocPtr ret;
769:
770: /*
771: * We know that '<?XML' is here.
772: */
773: cur += 5;
774:
775: /*
776: * Parse the version info
777: */
778: SKIP_BLANKS(cur);
779:
780: /*
781: * We should have 'version=' here !
782: */
783: if ((cur[0] == 'v') && (cur[1] == 'e') && (cur[2] == 'r') &&
784: (cur[3] == 's') && (cur[4] == 'i') && (cur[5] == 'o') &&
785: (cur[6] == 'n') && (cur[7] == '=')) {
786: cur += 8;
787: version = xmlParseQuotedString(&cur);
788: if (version == NULL)
789: ret = xmlNewDoc(XML_DEFAULT_VERSION);
790: else {
791: ret = xmlNewDoc(version);
1.8 ! veillard 792: free(version);
1.1 veillard 793: }
794: } else {
795: ret = xmlNewDoc(XML_DEFAULT_VERSION);
796: }
797:
798: /*
799: * We should check for encoding !!!!
800: */
801:
802: /*
803: * We should check for Required Markup Declaration !!!!
804: */
805: MOVETO_ENDTAG(cur);
806: cur++;
807:
808: *p = cur;
809: return(ret);
810: }
811:
812: /*
813: * xmlParseMisc: parse an XML Misc optionnal field.
814: * (Comment | PI | S)*
815: */
816:
1.3 veillard 817: void xmlParseMisc(CHAR **p, xmlDocPtr doc) {
1.1 veillard 818: CHAR *cur = *p;
819:
820: while (((cur[0] == '<') && (cur[1] == '?')) ||
821: ((cur[0] == '<') && (cur[1] == '!') &&
822: (cur[2] == '-') && (cur[2] == '-')) ||
823: IS_BLANK(*cur)) {
824: if ((cur[0] == '<') && (cur[1] == '?')) {
1.3 veillard 825: xmlParsePI(&cur, doc);
1.1 veillard 826: } else if (IS_BLANK(*cur)) {
827: cur++;
828: } else
829: xmlParserSkipComment(&cur);
830: }
831:
832: *p = cur;
833: }
834:
835: /*
836: * xmlParseDoc : parse an XML document and build a tree.
837: */
838:
839: xmlDocPtr xmlParseDoc(CHAR *cur) {
840: xmlDocPtr ret;
841:
842: /*
843: * Wipe out everything which is before the first '<'
844: */
845: SKIP_BLANKS(cur);
846:
847: /*
848: * Check for the XMLDecl in the Prolog.
849: */
850: if ((cur[0] == '<') && (cur[1] == '?') &&
851: (cur[2] == 'X') && (cur[3] == 'M') &&
852: (cur[4] == 'L')) {
853: ret = xmlParseXMLDecl(&cur);
854: /* SKIP_EOL(cur); */
855: SKIP_BLANKS(cur);
856: } else {
857: ret = xmlNewDoc(XML_DEFAULT_VERSION);
858: }
859:
860: /*
861: * The Misc part of the Prolog
862: * (Comment | PI | S) *
863: */
864: xmlParseMisc(&cur, ret);
865:
866: /*
1.2 veillard 867: * Time to start parsing
1.1 veillard 868: */
1.3 veillard 869: ret->root = xmlParseElement(&cur, ret);
1.1 veillard 870:
871: return(ret);
872: }
873:
Webmaster