Annotation of XML/HTMLtree.c, revision 1.32
1.1 daniel 1: /*
2: * HTMLtree.c : implementation of access functions for an HTML tree.
3: *
4: * See Copyright for the status of this software.
5: *
6: * Daniel.Veillard@w3.org
7: */
8:
1.5 daniel 9:
1.13 daniel 10: #ifdef WIN32
11: #include "win32config.h"
12: #else
1.1 daniel 13: #include "config.h"
1.5 daniel 14: #endif
1.18 daniel 15:
1.30 veillard 16: #include <libxml/xmlversion.h>
1.18 daniel 17: #ifdef LIBXML_HTML_ENABLED
18:
1.1 daniel 19: #include <stdio.h>
1.5 daniel 20: #include <string.h> /* for memset() only ! */
21:
22: #ifdef HAVE_CTYPE_H
1.1 daniel 23: #include <ctype.h>
1.5 daniel 24: #endif
25: #ifdef HAVE_STDLIB_H
1.1 daniel 26: #include <stdlib.h>
1.5 daniel 27: #endif
1.1 daniel 28:
1.18 daniel 29: #include <libxml/xmlmemory.h>
30: #include <libxml/HTMLparser.h>
31: #include <libxml/HTMLtree.h>
32: #include <libxml/entities.h>
33: #include <libxml/valid.h>
1.1 daniel 34:
1.21 veillard 35: /************************************************************************
36: * *
1.23 veillard 37: * Getting/Setting encoding meta tags *
38: * *
39: ************************************************************************/
40:
41: /**
42: * htmlGetMetaEncoding:
43: * @doc: the document
44: *
45: * Encoding definition lookup in the Meta tags
46: *
47: * Returns the current encoding as flagged in the HTML source
48: */
49: const xmlChar *
50: htmlGetMetaEncoding(htmlDocPtr doc) {
1.24 veillard 51: htmlNodePtr cur;
52: const xmlChar *content;
53: const xmlChar *encoding;
54:
55: if (doc == NULL)
56: return(NULL);
57: cur = doc->children;
58:
59: /*
60: * Search the html
61: */
62: while (cur != NULL) {
63: if (cur->name != NULL) {
1.29 veillard 64: if (xmlStrEqual(cur->name, BAD_CAST"html"))
1.24 veillard 65: break;
1.29 veillard 66: if (xmlStrEqual(cur->name, BAD_CAST"head"))
1.24 veillard 67: goto found_head;
1.29 veillard 68: if (xmlStrEqual(cur->name, BAD_CAST"meta"))
1.24 veillard 69: goto found_meta;
70: }
71: cur = cur->next;
72: }
73: if (cur == NULL)
74: return(NULL);
75: cur = cur->children;
76:
77: /*
78: * Search the head
79: */
80: while (cur != NULL) {
81: if (cur->name != NULL) {
1.29 veillard 82: if (xmlStrEqual(cur->name, BAD_CAST"head"))
1.24 veillard 83: break;
1.29 veillard 84: if (xmlStrEqual(cur->name, BAD_CAST"meta"))
1.24 veillard 85: goto found_meta;
86: }
87: cur = cur->next;
88: }
89: if (cur == NULL)
90: return(NULL);
91: found_head:
92: cur = cur->children;
93:
94: /*
95: * Search the meta elements
96: */
97: found_meta:
98: while (cur != NULL) {
99: if (cur->name != NULL) {
1.29 veillard 100: if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
1.24 veillard 101: xmlAttrPtr attr = cur->properties;
102: int http;
103: const xmlChar *value;
104:
105: content = NULL;
106: http = 0;
107: while (attr != NULL) {
108: if ((attr->children != NULL) &&
109: (attr->children->type == XML_TEXT_NODE) &&
110: (attr->children->next == NULL)) {
111: #ifndef XML_USE_BUFFER_CONTENT
112: value = attr->children->content;
113: #else
114: value = xmlBufferContent(attr->children->content);
115: #endif
1.28 veillard 116: if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
117: && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
1.24 veillard 118: http = 1;
1.28 veillard 119: else if ((value != NULL)
120: && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
1.24 veillard 121: content = value;
122: if ((http != 0) && (content != NULL))
123: goto found_content;
124: }
125: attr = attr->next;
126: }
127: }
128: }
129: cur = cur->next;
130: }
131: return(NULL);
132:
133: found_content:
134: encoding = xmlStrstr(content, BAD_CAST"charset=");
135: if (encoding == NULL)
136: encoding = xmlStrstr(content, BAD_CAST"Charset=");
137: if (encoding == NULL)
138: encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
139: if (encoding != NULL) {
140: encoding += 8;
141: } else {
142: encoding = xmlStrstr(content, BAD_CAST"charset =");
143: if (encoding == NULL)
144: encoding = xmlStrstr(content, BAD_CAST"Charset =");
145: if (encoding == NULL)
146: encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
147: if (encoding != NULL)
148: encoding += 9;
149: }
150: if (encoding != NULL) {
151: while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
152: }
153: return(encoding);
1.23 veillard 154: }
155:
156: /**
157: * htmlSetMetaEncoding:
158: * @doc: the document
159: * @encoding: the encoding string
160: *
161: * Sets the current encoding in the Meta tags
162: * NOTE: this will not change the document content encoding, just
163: * the META flag associated.
164: *
165: * Returns 0 in case of success and -1 in case of error
166: */
167: int
168: htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
1.26 veillard 169: htmlNodePtr cur, meta;
170: const xmlChar *content;
171: char newcontent[100];
172:
173:
174: if (doc == NULL)
175: return(-1);
176:
177: if (encoding != NULL) {
1.27 veillard 178: #ifdef HAVE_SNPRINTF
179: snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
180: encoding);
181: #else
1.26 veillard 182: sprintf(newcontent, "text/html; charset=%s", encoding);
1.27 veillard 183: #endif
184: newcontent[sizeof(newcontent) - 1] = 0;
1.26 veillard 185: }
186:
187: cur = doc->children;
188:
189: /*
190: * Search the html
191: */
192: while (cur != NULL) {
193: if (cur->name != NULL) {
1.29 veillard 194: if (xmlStrEqual(cur->name, BAD_CAST"html"))
1.26 veillard 195: break;
1.29 veillard 196: if (xmlStrEqual(cur->name, BAD_CAST"body")) {
1.26 veillard 197: if (encoding == NULL)
198: return(0);
199: meta = xmlNewDocNode(doc, NULL, BAD_CAST"head", NULL);
200: xmlAddPrevSibling(cur, meta);
201: cur = meta;
202: meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
203: xmlAddChild(cur, meta);
204: xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
205: xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
206: return(0);
207: }
1.29 veillard 208: if (xmlStrEqual(cur->name, BAD_CAST"head"))
1.26 veillard 209: goto found_head;
1.29 veillard 210: if (xmlStrEqual(cur->name, BAD_CAST"meta"))
1.26 veillard 211: goto found_meta;
212: }
213: cur = cur->next;
214: }
215: if (cur == NULL)
216: return(-1);
217: cur = cur->children;
218:
219: /*
220: * Search the head
221: */
222: while (cur != NULL) {
223: if (cur->name != NULL) {
1.29 veillard 224: if (xmlStrEqual(cur->name, BAD_CAST"head"))
1.26 veillard 225: break;
1.29 veillard 226: if (xmlStrEqual(cur->name, BAD_CAST"body")) {
1.26 veillard 227: if (encoding == NULL)
228: return(0);
229: meta = xmlNewDocNode(doc, NULL, BAD_CAST"head", NULL);
230: xmlAddPrevSibling(cur, meta);
231: cur = meta;
232: meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
233: xmlAddChild(cur, meta);
234: xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
235: xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
236: return(0);
237: }
1.29 veillard 238: if (xmlStrEqual(cur->name, BAD_CAST"meta"))
1.26 veillard 239: goto found_meta;
240: }
241: cur = cur->next;
242: }
243: if (cur == NULL)
244: return(-1);
245: found_head:
246: if (cur->children == NULL) {
247: if (encoding == NULL)
248: return(0);
249: meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
250: xmlAddChild(cur, meta);
251: xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
252: xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
253: return(0);
254: }
255: cur = cur->children;
256:
257: found_meta:
258: if (encoding != NULL) {
259: /*
260: * Create a new Meta element with the right aatributes
261: */
262:
263: meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
264: xmlAddPrevSibling(cur, meta);
265: xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
266: xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
267: }
268:
269: /*
270: * Search and destroy all the remaining the meta elements carrying
271: * encoding informations
272: */
273: while (cur != NULL) {
274: if (cur->name != NULL) {
1.29 veillard 275: if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
1.26 veillard 276: xmlAttrPtr attr = cur->properties;
277: int http;
278: const xmlChar *value;
279:
280: content = NULL;
281: http = 0;
282: while (attr != NULL) {
283: if ((attr->children != NULL) &&
284: (attr->children->type == XML_TEXT_NODE) &&
285: (attr->children->next == NULL)) {
286: #ifndef XML_USE_BUFFER_CONTENT
287: value = attr->children->content;
288: #else
289: value = xmlBufferContent(attr->children->content);
290: #endif
1.28 veillard 291: if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
292: && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
1.26 veillard 293: http = 1;
1.28 veillard 294: else if ((value != NULL)
295: && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
1.26 veillard 296: content = value;
297: if ((http != 0) && (content != NULL))
298: break;
299: }
300: attr = attr->next;
301: }
302: if ((http != 0) && (content != NULL)) {
303: meta = cur;
304: cur = cur->next;
305: xmlUnlinkNode(meta);
306: xmlFreeNode(meta);
307: continue;
308: }
309:
310: }
311: }
312: cur = cur->next;
313: }
314: return(0);
1.23 veillard 315: }
316:
317: /************************************************************************
318: * *
1.21 veillard 319: * Dumping HTML tree content to a simple buffer *
320: * *
321: ************************************************************************/
322:
1.14 daniel 323: static void
324: htmlDocContentDump(xmlBufferPtr buf, xmlDocPtr cur);
325:
1.1 daniel 326: /**
327: * htmlDtdDump:
328: * @buf: the HTML buffer output
329: * @doc: the document
330: *
331: * Dump the HTML document DTD, if any.
332: */
333: static void
334: htmlDtdDump(xmlBufferPtr buf, xmlDocPtr doc) {
335: xmlDtdPtr cur = doc->intSubset;
336:
337: if (cur == NULL) {
338: fprintf(stderr, "htmlDtdDump : no internal subset\n");
339: return;
340: }
341: xmlBufferWriteChar(buf, "<!DOCTYPE ");
342: xmlBufferWriteCHAR(buf, cur->name);
343: if (cur->ExternalID != NULL) {
344: xmlBufferWriteChar(buf, " PUBLIC ");
345: xmlBufferWriteQuotedString(buf, cur->ExternalID);
1.2 daniel 346: if (cur->SystemID != NULL) {
347: xmlBufferWriteChar(buf, " ");
348: xmlBufferWriteQuotedString(buf, cur->SystemID);
349: }
1.1 daniel 350: } else if (cur->SystemID != NULL) {
351: xmlBufferWriteChar(buf, " SYSTEM ");
352: xmlBufferWriteQuotedString(buf, cur->SystemID);
353: }
354: xmlBufferWriteChar(buf, ">\n");
355: }
356:
357: /**
358: * htmlAttrDump:
359: * @buf: the HTML buffer output
360: * @doc: the document
361: * @cur: the attribute pointer
362: *
363: * Dump an HTML attribute
364: */
365: static void
366: htmlAttrDump(xmlBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
1.6 daniel 367: xmlChar *value;
1.1 daniel 368:
369: if (cur == NULL) {
370: fprintf(stderr, "htmlAttrDump : property == NULL\n");
371: return;
372: }
373: xmlBufferWriteChar(buf, " ");
374: xmlBufferWriteCHAR(buf, cur->name);
1.19 daniel 375: if (cur->children != NULL) {
376: value = xmlNodeListGetString(doc, cur->children, 0);
377: if (value) {
378: xmlBufferWriteChar(buf, "=");
379: xmlBufferWriteQuotedString(buf, value);
380: xmlFree(value);
381: } else {
382: xmlBufferWriteChar(buf, "=\"\"");
383: }
1.1 daniel 384: }
385: }
386:
387: /**
388: * htmlAttrListDump:
389: * @buf: the HTML buffer output
390: * @doc: the document
391: * @cur: the first attribute pointer
392: *
393: * Dump a list of HTML attributes
394: */
395: static void
396: htmlAttrListDump(xmlBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
397: if (cur == NULL) {
398: fprintf(stderr, "htmlAttrListDump : property == NULL\n");
399: return;
400: }
401: while (cur != NULL) {
402: htmlAttrDump(buf, doc, cur);
403: cur = cur->next;
404: }
405: }
406:
407:
1.14 daniel 408: void
1.1 daniel 409: htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur);
410: /**
411: * htmlNodeListDump:
412: * @buf: the HTML buffer output
413: * @doc: the document
414: * @cur: the first node
415: *
416: * Dump an HTML node list, recursive behaviour,children are printed too.
417: */
418: static void
419: htmlNodeListDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
420: if (cur == NULL) {
421: fprintf(stderr, "htmlNodeListDump : node == NULL\n");
422: return;
423: }
424: while (cur != NULL) {
425: htmlNodeDump(buf, doc, cur);
426: cur = cur->next;
427: }
428: }
429:
430: /**
431: * htmlNodeDump:
432: * @buf: the HTML buffer output
433: * @doc: the document
434: * @cur: the current node
435: *
436: * Dump an HTML node, recursive behaviour,children are printed too.
437: */
1.14 daniel 438: void
1.1 daniel 439: htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
440: htmlElemDescPtr info;
441:
442: if (cur == NULL) {
443: fprintf(stderr, "htmlNodeDump : node == NULL\n");
444: return;
445: }
446: /*
447: * Special cases.
448: */
1.20 daniel 449: if (cur->type == XML_DTD_NODE)
450: return;
1.14 daniel 451: if (cur->type == XML_HTML_DOCUMENT_NODE) {
452: htmlDocContentDump(buf, (xmlDocPtr) cur);
453: return;
454: }
1.1 daniel 455: if (cur->type == HTML_TEXT_NODE) {
456: if (cur->content != NULL) {
1.6 daniel 457: xmlChar *buffer;
1.1 daniel 458:
1.9 daniel 459: #ifndef XML_USE_BUFFER_CONTENT
1.1 daniel 460: buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
1.9 daniel 461: #else
462: buffer = xmlEncodeEntitiesReentrant(doc,
463: xmlBufferContent(cur->content));
464: #endif
1.1 daniel 465: if (buffer != NULL) {
466: xmlBufferWriteCHAR(buf, buffer);
1.4 daniel 467: xmlFree(buffer);
1.1 daniel 468: }
469: }
470: return;
471: }
472: if (cur->type == HTML_COMMENT_NODE) {
473: if (cur->content != NULL) {
474: xmlBufferWriteChar(buf, "<!--");
1.9 daniel 475: #ifndef XML_USE_BUFFER_CONTENT
1.1 daniel 476: xmlBufferWriteCHAR(buf, cur->content);
1.9 daniel 477: #else
478: xmlBufferWriteCHAR(buf, xmlBufferContent(cur->content));
479: #endif
1.1 daniel 480: xmlBufferWriteChar(buf, "-->");
481: }
482: return;
483: }
484: if (cur->type == HTML_ENTITY_REF_NODE) {
485: xmlBufferWriteChar(buf, "&");
486: xmlBufferWriteCHAR(buf, cur->name);
487: xmlBufferWriteChar(buf, ";");
488: return;
489: }
490:
491: /*
492: * Get specific HTmL info for taht node.
493: */
494: info = htmlTagLookup(cur->name);
495:
496: xmlBufferWriteChar(buf, "<");
497: xmlBufferWriteCHAR(buf, cur->name);
498: if (cur->properties != NULL)
499: htmlAttrListDump(buf, doc, cur->properties);
500:
1.7 daniel 501: if ((info != NULL) && (info->empty)) {
1.1 daniel 502: xmlBufferWriteChar(buf, ">");
503: if (cur->next != NULL) {
504: if ((cur->next->type != HTML_TEXT_NODE) &&
505: (cur->next->type != HTML_ENTITY_REF_NODE))
506: xmlBufferWriteChar(buf, "\n");
507: }
508: return;
509: }
1.17 daniel 510: if ((cur->content == NULL) && (cur->children == NULL)) {
1.7 daniel 511: if ((info != NULL) && (info->endTag != 0))
1.1 daniel 512: xmlBufferWriteChar(buf, ">");
513: else {
514: xmlBufferWriteChar(buf, "></");
515: xmlBufferWriteCHAR(buf, cur->name);
516: xmlBufferWriteChar(buf, ">");
517: }
518: if (cur->next != NULL) {
519: if ((cur->next->type != HTML_TEXT_NODE) &&
520: (cur->next->type != HTML_ENTITY_REF_NODE))
521: xmlBufferWriteChar(buf, "\n");
522: }
523: return;
524: }
525: xmlBufferWriteChar(buf, ">");
526: if (cur->content != NULL) {
1.6 daniel 527: xmlChar *buffer;
1.1 daniel 528:
1.9 daniel 529: #ifndef XML_USE_BUFFER_CONTENT
530: buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
531: #else
532: buffer = xmlEncodeEntitiesReentrant(doc,
533: xmlBufferContent(cur->content));
534: #endif
1.1 daniel 535: if (buffer != NULL) {
536: xmlBufferWriteCHAR(buf, buffer);
1.4 daniel 537: xmlFree(buffer);
1.1 daniel 538: }
539: }
1.17 daniel 540: if (cur->children != NULL) {
541: if ((cur->children->type != HTML_TEXT_NODE) &&
542: (cur->children->type != HTML_ENTITY_REF_NODE) &&
543: (cur->children != cur->last))
1.1 daniel 544: xmlBufferWriteChar(buf, "\n");
1.17 daniel 545: htmlNodeListDump(buf, doc, cur->children);
1.1 daniel 546: if ((cur->last->type != HTML_TEXT_NODE) &&
1.10 daniel 547: (cur->last->type != HTML_ENTITY_REF_NODE) &&
1.17 daniel 548: (cur->children != cur->last))
1.1 daniel 549: xmlBufferWriteChar(buf, "\n");
550: }
1.11 daniel 551: if (!htmlIsAutoClosed(doc, cur)) {
552: xmlBufferWriteChar(buf, "</");
553: xmlBufferWriteCHAR(buf, cur->name);
554: xmlBufferWriteChar(buf, ">");
555: }
1.1 daniel 556: if (cur->next != NULL) {
557: if ((cur->next->type != HTML_TEXT_NODE) &&
558: (cur->next->type != HTML_ENTITY_REF_NODE))
559: xmlBufferWriteChar(buf, "\n");
560: }
561: }
562:
563: /**
1.16 daniel 564: * htmlNodeDumpFile:
565: * @out: the FILE pointer
566: * @doc: the document
567: * @cur: the current node
568: *
569: * Dump an HTML node, recursive behaviour,children are printed too.
570: */
571: void
572: htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
573: xmlBufferPtr buf;
574:
575: buf = xmlBufferCreate();
576: if (buf == NULL) return;
577: htmlNodeDump(buf, doc, cur);
578: xmlBufferDump(out, buf);
579: xmlBufferFree(buf);
580: }
581:
582: /**
1.1 daniel 583: * htmlDocContentDump:
584: * @buf: the HTML buffer output
585: * @cur: the document
586: *
587: * Dump an HTML document.
588: */
589: static void
590: htmlDocContentDump(xmlBufferPtr buf, xmlDocPtr cur) {
1.12 daniel 591: int type;
592:
593: /*
594: * force to output the stuff as HTML, especially for entities
595: */
596: type = cur->type;
597: cur->type = XML_HTML_DOCUMENT_NODE;
1.1 daniel 598: if (cur->intSubset != NULL)
599: htmlDtdDump(buf, cur);
1.11 daniel 600: else {
601: /* Default to HTML-4.0 transitionnal @@@@ */
602: xmlBufferWriteChar(buf, "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\" \"http://www.w3.org/TR/REC-html40/loose.dtd\">");
603:
604: }
1.17 daniel 605: if (cur->children != NULL) {
606: htmlNodeListDump(buf, cur, cur->children);
1.1 daniel 607: }
608: xmlBufferWriteChar(buf, "\n");
1.22 veillard 609: cur->type = (xmlElementType) type;
1.1 daniel 610: }
611:
612: /**
613: * htmlDocDumpMemory:
614: * @cur: the document
615: * @mem: OUT: the memory pointer
616: * @size: OUT: the memory lenght
617: *
1.6 daniel 618: * Dump an HTML document in memory and return the xmlChar * and it's size.
1.1 daniel 619: * It's up to the caller to free the memory.
620: */
621: void
1.6 daniel 622: htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
1.1 daniel 623: xmlBufferPtr buf;
624:
625: if (cur == NULL) {
626: #ifdef DEBUG_TREE
1.15 daniel 627: fprintf(stderr, "htmlxmlDocDumpMemory : document == NULL\n");
1.1 daniel 628: #endif
629: *mem = NULL;
630: *size = 0;
631: return;
632: }
633: buf = xmlBufferCreate();
634: if (buf == NULL) {
635: *mem = NULL;
636: *size = 0;
637: return;
638: }
639: htmlDocContentDump(buf, cur);
640: *mem = buf->content;
641: *size = buf->use;
642: memset(buf, -1, sizeof(xmlBuffer));
1.4 daniel 643: xmlFree(buf);
1.1 daniel 644: }
645:
646:
1.21 veillard 647: /************************************************************************
648: * *
649: * Dumping HTML tree content to an I/O output buffer *
650: * *
651: ************************************************************************/
652:
653: static void
654: htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, const char *encoding);
655:
656: /**
657: * htmlDtdDump:
658: * @buf: the HTML buffer output
659: * @doc: the document
660: *
661: * Dump the HTML document DTD, if any.
662: */
663: static void
664: htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, const char *encoding) {
665: xmlDtdPtr cur = doc->intSubset;
666:
667: if (cur == NULL) {
668: fprintf(stderr, "htmlDtdDump : no internal subset\n");
669: return;
670: }
671: xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
672: xmlOutputBufferWriteString(buf, (const char *)cur->name);
673: if (cur->ExternalID != NULL) {
674: xmlOutputBufferWriteString(buf, " PUBLIC ");
675: xmlBufferWriteQuotedString(buf->buffer, cur->ExternalID);
676: if (cur->SystemID != NULL) {
677: xmlOutputBufferWriteString(buf, " ");
678: xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
679: }
680: } else if (cur->SystemID != NULL) {
681: xmlOutputBufferWriteString(buf, " SYSTEM ");
682: xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
683: }
684: xmlOutputBufferWriteString(buf, ">\n");
685: }
686:
687: /**
688: * htmlAttrDump:
689: * @buf: the HTML buffer output
690: * @doc: the document
691: * @cur: the attribute pointer
692: *
693: * Dump an HTML attribute
694: */
695: static void
696: htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) {
697: xmlChar *value;
698:
699: if (cur == NULL) {
700: fprintf(stderr, "htmlAttrDump : property == NULL\n");
701: return;
702: }
703: xmlOutputBufferWriteString(buf, " ");
704: xmlOutputBufferWriteString(buf, (const char *)cur->name);
705: if (cur->children != NULL) {
706: value = xmlNodeListGetString(doc, cur->children, 0);
707: if (value) {
708: xmlOutputBufferWriteString(buf, "=");
709: xmlBufferWriteQuotedString(buf->buffer, value);
710: xmlFree(value);
711: } else {
712: xmlOutputBufferWriteString(buf, "=\"\"");
713: }
714: }
715: }
716:
717: /**
718: * htmlAttrListDump:
719: * @buf: the HTML buffer output
720: * @doc: the document
721: * @cur: the first attribute pointer
722: *
723: * Dump a list of HTML attributes
724: */
725: static void
726: htmlAttrListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) {
727: if (cur == NULL) {
728: fprintf(stderr, "htmlAttrListDump : property == NULL\n");
729: return;
730: }
731: while (cur != NULL) {
732: htmlAttrDumpOutput(buf, doc, cur, encoding);
733: cur = cur->next;
734: }
735: }
736:
737:
738: void htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
739: xmlNodePtr cur, const char *encoding);
740:
741: /**
742: * htmlNodeListDump:
743: * @buf: the HTML buffer output
744: * @doc: the document
745: * @cur: the first node
746: *
747: * Dump an HTML node list, recursive behaviour,children are printed too.
748: */
749: static void
750: htmlNodeListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, const char *encoding) {
751: if (cur == NULL) {
752: fprintf(stderr, "htmlNodeListDump : node == NULL\n");
753: return;
754: }
755: while (cur != NULL) {
756: htmlNodeDumpOutput(buf, doc, cur, encoding);
757: cur = cur->next;
758: }
759: }
760:
761: /**
762: * htmlNodeDump:
763: * @buf: the HTML buffer output
764: * @doc: the document
765: * @cur: the current node
766: *
767: * Dump an HTML node, recursive behaviour,children are printed too.
768: */
769: void
770: htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, const char *encoding) {
771: htmlElemDescPtr info;
772:
773: if (cur == NULL) {
774: fprintf(stderr, "htmlNodeDump : node == NULL\n");
775: return;
776: }
777: /*
778: * Special cases.
779: */
780: if (cur->type == XML_DTD_NODE)
781: return;
782: if (cur->type == XML_HTML_DOCUMENT_NODE) {
783: htmlDocContentDumpOutput(buf, (xmlDocPtr) cur, encoding);
784: return;
785: }
786: if (cur->type == HTML_TEXT_NODE) {
787: if (cur->content != NULL) {
788: xmlChar *buffer;
789:
790: #ifndef XML_USE_BUFFER_CONTENT
791: buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
792: #else
793: buffer = xmlEncodeEntitiesReentrant(doc,
794: xmlBufferContent(cur->content));
795: #endif
796: if (buffer != NULL) {
1.25 veillard 797: xmlOutputBufferWriteString(buf, (const char *)buffer);
1.21 veillard 798: xmlFree(buffer);
799: }
800: }
801: return;
802: }
803: if (cur->type == HTML_COMMENT_NODE) {
804: if (cur->content != NULL) {
805: xmlOutputBufferWriteString(buf, "<!--");
806: #ifndef XML_USE_BUFFER_CONTENT
807: xmlOutputBufferWriteString(buf, (const char *)cur->content);
808: #else
809: xmlOutputBufferWriteString(buf, xmlBufferContent(cur->content));
810: #endif
811: xmlOutputBufferWriteString(buf, "-->");
812: }
813: return;
814: }
815: if (cur->type == HTML_ENTITY_REF_NODE) {
816: xmlOutputBufferWriteString(buf, "&");
817: xmlOutputBufferWriteString(buf, (const char *)cur->name);
818: xmlOutputBufferWriteString(buf, ";");
819: return;
820: }
1.31 veillard 821: if (cur->type == HTML_PRESERVE_NODE) {
822: if (cur->content != NULL) {
823: #ifndef XML_USE_BUFFER_CONTENT
824: xmlOutputBufferWriteString(buf, (const char *)cur->content);
825: #else
826: xmlOutputBufferWriteString(buf, xmlBufferContent(cur->content));
827: #endif
828: }
829: return;
830: }
1.21 veillard 831:
832: /*
833: * Get specific HTmL info for taht node.
834: */
835: info = htmlTagLookup(cur->name);
836:
837: xmlOutputBufferWriteString(buf, "<");
838: xmlOutputBufferWriteString(buf, (const char *)cur->name);
839: if (cur->properties != NULL)
840: htmlAttrListDumpOutput(buf, doc, cur->properties, encoding);
841:
842: if ((info != NULL) && (info->empty)) {
843: xmlOutputBufferWriteString(buf, ">");
844: if (cur->next != NULL) {
845: if ((cur->next->type != HTML_TEXT_NODE) &&
846: (cur->next->type != HTML_ENTITY_REF_NODE))
847: xmlOutputBufferWriteString(buf, "\n");
848: }
849: return;
850: }
851: if ((cur->content == NULL) && (cur->children == NULL)) {
1.32 ! veillard 852: if ((info != NULL) && (info->endTag != 0) &&
! 853: (strcmp(info->name, "html")) && (strcmp(info->name, "body"))) {
1.21 veillard 854: xmlOutputBufferWriteString(buf, ">");
1.32 ! veillard 855: } else {
1.21 veillard 856: xmlOutputBufferWriteString(buf, "></");
857: xmlOutputBufferWriteString(buf, (const char *)cur->name);
858: xmlOutputBufferWriteString(buf, ">");
859: }
860: if (cur->next != NULL) {
861: if ((cur->next->type != HTML_TEXT_NODE) &&
862: (cur->next->type != HTML_ENTITY_REF_NODE))
863: xmlOutputBufferWriteString(buf, "\n");
864: }
865: return;
866: }
867: xmlOutputBufferWriteString(buf, ">");
868: if (cur->content != NULL) {
869: #if 0
870: xmlChar *buffer;
871:
872: #ifndef XML_USE_BUFFER_CONTENT
873: buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
874: #else
875: buffer = xmlEncodeEntitiesReentrant(doc,
876: xmlBufferContent(cur->content));
877: #endif
878: if (buffer != NULL) {
879: xmlOutputBufferWriteString(buf, buffer);
880: xmlFree(buffer);
881: }
882: #else
883: /*
884: * Uses the OutputBuffer property to automatically convert
885: * invalids to charrefs
886: */
887:
888: #ifndef XML_USE_BUFFER_CONTENT
889: xmlOutputBufferWriteString(buf, (const char *) cur->content);
890: #else
891: xmlOutputBufferWriteString(buf,
892: (const char *) xmlBufferContent(cur->content));
893: #endif
894: #endif
895: }
896: if (cur->children != NULL) {
897: if ((cur->children->type != HTML_TEXT_NODE) &&
898: (cur->children->type != HTML_ENTITY_REF_NODE) &&
899: (cur->children != cur->last))
900: xmlOutputBufferWriteString(buf, "\n");
901: htmlNodeListDumpOutput(buf, doc, cur->children, encoding);
902: if ((cur->last->type != HTML_TEXT_NODE) &&
903: (cur->last->type != HTML_ENTITY_REF_NODE) &&
904: (cur->children != cur->last))
905: xmlOutputBufferWriteString(buf, "\n");
906: }
907: if (!htmlIsAutoClosed(doc, cur)) {
908: xmlOutputBufferWriteString(buf, "</");
909: xmlOutputBufferWriteString(buf, (const char *)cur->name);
910: xmlOutputBufferWriteString(buf, ">");
911: }
912: if (cur->next != NULL) {
913: if ((cur->next->type != HTML_TEXT_NODE) &&
914: (cur->next->type != HTML_ENTITY_REF_NODE))
915: xmlOutputBufferWriteString(buf, "\n");
916: }
917: }
918:
919: /**
920: * htmlDocContentDump:
921: * @buf: the HTML buffer output
922: * @cur: the document
923: *
924: * Dump an HTML document.
925: */
926: static void
927: htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, const char *encoding) {
928: int type;
929:
930: /*
931: * force to output the stuff as HTML, especially for entities
932: */
933: type = cur->type;
934: cur->type = XML_HTML_DOCUMENT_NODE;
935: if (cur->intSubset != NULL)
936: htmlDtdDumpOutput(buf, cur, NULL);
937: else {
938: /* Default to HTML-4.0 transitionnal @@@@ */
939: xmlOutputBufferWriteString(buf, "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\" \"http://www.w3.org/TR/REC-html40/loose.dtd\">");
940:
941: }
942: if (cur->children != NULL) {
943: htmlNodeListDumpOutput(buf, cur, cur->children, encoding);
944: }
945: xmlOutputBufferWriteString(buf, "\n");
1.22 veillard 946: cur->type = (xmlElementType) type;
1.21 veillard 947: }
948:
949:
950: /************************************************************************
951: * *
952: * Saving functions front-ends *
953: * *
954: ************************************************************************/
955:
1.1 daniel 956: /**
957: * htmlDocDump:
958: * @f: the FILE*
959: * @cur: the document
960: *
961: * Dump an HTML document to an open FILE.
1.21 veillard 962: *
963: * returns: the number of byte written or -1 in case of failure.
1.1 daniel 964: */
1.21 veillard 965: int
1.1 daniel 966: htmlDocDump(FILE *f, xmlDocPtr cur) {
1.21 veillard 967: xmlOutputBufferPtr buf;
1.24 veillard 968: xmlCharEncodingHandlerPtr handler = NULL;
969: const char *encoding;
1.21 veillard 970: int ret;
1.1 daniel 971:
972: if (cur == NULL) {
973: #ifdef DEBUG_TREE
1.15 daniel 974: fprintf(stderr, "htmlDocDump : document == NULL\n");
1.1 daniel 975: #endif
1.21 veillard 976: return(-1);
1.1 daniel 977: }
1.24 veillard 978:
979: encoding = (const char *) htmlGetMetaEncoding(cur);
980:
981: if (encoding != NULL) {
982: xmlCharEncoding enc;
983:
984: enc = xmlParseCharEncoding(encoding);
985: if (enc != cur->charset) {
986: if (cur->charset != XML_CHAR_ENCODING_UTF8) {
987: /*
988: * Not supported yet
989: */
990: return(-1);
991: }
992:
993: handler = xmlFindCharEncodingHandler(encoding);
994: if (handler == NULL)
995: return(-1);
996: }
997: }
998:
999: /*
1.25 veillard 1000: * Fallback to HTML or ASCII when the encoding is unspecified
1.24 veillard 1001: */
1002: if (handler == NULL)
1.25 veillard 1003: handler = xmlFindCharEncodingHandler("HTML");
1004: if (handler == NULL)
1.24 veillard 1005: handler = xmlFindCharEncodingHandler("ascii");
1006:
1007: buf = xmlOutputBufferCreateFile(f, handler);
1.21 veillard 1008: if (buf == NULL) return(-1);
1009: htmlDocContentDumpOutput(buf, cur, NULL);
1010:
1011: ret = xmlOutputBufferClose(buf);
1012: return(ret);
1013: }
1014:
1015: /**
1016: * htmlSaveFile:
1017: * @filename: the filename (or URL)
1018: * @cur: the document
1019: *
1020: * Dump an HTML document to a file. If @filename is "-" the stdout file is
1021: * used.
1022: * returns: the number of byte written or -1 in case of failure.
1023: */
1024: int
1025: htmlSaveFile(const char *filename, xmlDocPtr cur) {
1026: xmlOutputBufferPtr buf;
1.24 veillard 1027: xmlCharEncodingHandlerPtr handler = NULL;
1028: const char *encoding;
1.21 veillard 1029: int ret;
1030:
1.24 veillard 1031: encoding = (const char *) htmlGetMetaEncoding(cur);
1032:
1033: if (encoding != NULL) {
1034: xmlCharEncoding enc;
1035:
1036: enc = xmlParseCharEncoding(encoding);
1037: if (enc != cur->charset) {
1038: if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1039: /*
1040: * Not supported yet
1041: */
1042: return(-1);
1043: }
1044:
1045: handler = xmlFindCharEncodingHandler(encoding);
1046: if (handler == NULL)
1047: return(-1);
1048: }
1049: }
1050:
1051: /*
1.25 veillard 1052: * Fallback to HTML or ASCII when the encoding is unspecified
1.24 veillard 1053: */
1054: if (handler == NULL)
1.25 veillard 1055: handler = xmlFindCharEncodingHandler("HTML");
1056: if (handler == NULL)
1.24 veillard 1057: handler = xmlFindCharEncodingHandler("ascii");
1058:
1.21 veillard 1059: /*
1060: * save the content to a temp buffer.
1061: */
1.24 veillard 1062: buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1.21 veillard 1063: if (buf == NULL) return(0);
1064:
1065: htmlDocContentDumpOutput(buf, cur, NULL);
1066:
1067: ret = xmlOutputBufferClose(buf);
1068: return(ret);
1.1 daniel 1069: }
1070:
1071: /**
1.26 veillard 1072: * htmlSaveFileEnc:
1.1 daniel 1073: * @filename: the filename
1074: * @cur: the document
1075: *
1.26 veillard 1076: * Dump an HTML document to a file using a given encoding.
1.1 daniel 1077: *
1078: * returns: the number of byte written or -1 in case of failure.
1079: */
1080: int
1.21 veillard 1081: htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1082: xmlOutputBufferPtr buf;
1083: xmlCharEncodingHandlerPtr handler = NULL;
1.1 daniel 1084: int ret;
1085:
1.21 veillard 1086: if (encoding != NULL) {
1087: xmlCharEncoding enc;
1088:
1089: enc = xmlParseCharEncoding(encoding);
1090: if (enc != cur->charset) {
1091: if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1092: /*
1093: * Not supported yet
1094: */
1095: return(-1);
1096: }
1097:
1098: handler = xmlFindCharEncodingHandler(encoding);
1099: if (handler == NULL)
1100: return(-1);
1.26 veillard 1101: htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
1.21 veillard 1102: }
1103: }
1.24 veillard 1104:
1105: /*
1.25 veillard 1106: * Fallback to HTML or ASCII when the encoding is unspecified
1.24 veillard 1107: */
1.25 veillard 1108: if (handler == NULL)
1109: handler = xmlFindCharEncodingHandler("HTML");
1.24 veillard 1110: if (handler == NULL)
1111: handler = xmlFindCharEncodingHandler("ascii");
1.21 veillard 1112:
1.1 daniel 1113: /*
1114: * save the content to a temp buffer.
1115: */
1.21 veillard 1116: buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1.1 daniel 1117: if (buf == NULL) return(0);
1118:
1.21 veillard 1119: htmlDocContentDumpOutput(buf, cur, encoding);
1.1 daniel 1120:
1.21 veillard 1121: ret = xmlOutputBufferClose(buf);
1122: return(ret);
1.1 daniel 1123: }
1.18 daniel 1124: #endif /* LIBXML_HTML_ENABLED */
Webmaster