Annotation of XML/HTMLtree.c, revision 1.33
1.1 daniel 1: /*
2: * HTMLtree.c : implementation of access functions for an HTML tree.
3: *
4: * See Copyright for the status of this software.
5: *
6: * Daniel.Veillard@w3.org
7: */
8:
1.5 daniel 9:
1.13 daniel 10: #ifdef WIN32
11: #include "win32config.h"
12: #else
1.1 daniel 13: #include "config.h"
1.5 daniel 14: #endif
1.18 daniel 15:
1.30 veillard 16: #include <libxml/xmlversion.h>
1.18 daniel 17: #ifdef LIBXML_HTML_ENABLED
18:
1.1 daniel 19: #include <stdio.h>
1.5 daniel 20: #include <string.h> /* for memset() only ! */
21:
22: #ifdef HAVE_CTYPE_H
1.1 daniel 23: #include <ctype.h>
1.5 daniel 24: #endif
25: #ifdef HAVE_STDLIB_H
1.1 daniel 26: #include <stdlib.h>
1.5 daniel 27: #endif
1.1 daniel 28:
1.18 daniel 29: #include <libxml/xmlmemory.h>
30: #include <libxml/HTMLparser.h>
31: #include <libxml/HTMLtree.h>
32: #include <libxml/entities.h>
33: #include <libxml/valid.h>
1.33 ! veillard 34: #include <libxml/xmlerror.h>
1.1 daniel 35:
1.21 veillard 36: /************************************************************************
37: * *
1.23 veillard 38: * Getting/Setting encoding meta tags *
39: * *
40: ************************************************************************/
41:
42: /**
43: * htmlGetMetaEncoding:
44: * @doc: the document
45: *
46: * Encoding definition lookup in the Meta tags
47: *
48: * Returns the current encoding as flagged in the HTML source
49: */
50: const xmlChar *
51: htmlGetMetaEncoding(htmlDocPtr doc) {
1.24 veillard 52: htmlNodePtr cur;
53: const xmlChar *content;
54: const xmlChar *encoding;
55:
56: if (doc == NULL)
57: return(NULL);
58: cur = doc->children;
59:
60: /*
61: * Search the html
62: */
63: while (cur != NULL) {
64: if (cur->name != NULL) {
1.29 veillard 65: if (xmlStrEqual(cur->name, BAD_CAST"html"))
1.24 veillard 66: break;
1.29 veillard 67: if (xmlStrEqual(cur->name, BAD_CAST"head"))
1.24 veillard 68: goto found_head;
1.29 veillard 69: if (xmlStrEqual(cur->name, BAD_CAST"meta"))
1.24 veillard 70: goto found_meta;
71: }
72: cur = cur->next;
73: }
74: if (cur == NULL)
75: return(NULL);
76: cur = cur->children;
77:
78: /*
79: * Search the head
80: */
81: while (cur != NULL) {
82: if (cur->name != NULL) {
1.29 veillard 83: if (xmlStrEqual(cur->name, BAD_CAST"head"))
1.24 veillard 84: break;
1.29 veillard 85: if (xmlStrEqual(cur->name, BAD_CAST"meta"))
1.24 veillard 86: goto found_meta;
87: }
88: cur = cur->next;
89: }
90: if (cur == NULL)
91: return(NULL);
92: found_head:
93: cur = cur->children;
94:
95: /*
96: * Search the meta elements
97: */
98: found_meta:
99: while (cur != NULL) {
100: if (cur->name != NULL) {
1.29 veillard 101: if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
1.24 veillard 102: xmlAttrPtr attr = cur->properties;
103: int http;
104: const xmlChar *value;
105:
106: content = NULL;
107: http = 0;
108: while (attr != NULL) {
109: if ((attr->children != NULL) &&
110: (attr->children->type == XML_TEXT_NODE) &&
111: (attr->children->next == NULL)) {
112: #ifndef XML_USE_BUFFER_CONTENT
113: value = attr->children->content;
114: #else
115: value = xmlBufferContent(attr->children->content);
116: #endif
1.28 veillard 117: if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
118: && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
1.24 veillard 119: http = 1;
1.28 veillard 120: else if ((value != NULL)
121: && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
1.24 veillard 122: content = value;
123: if ((http != 0) && (content != NULL))
124: goto found_content;
125: }
126: attr = attr->next;
127: }
128: }
129: }
130: cur = cur->next;
131: }
132: return(NULL);
133:
134: found_content:
135: encoding = xmlStrstr(content, BAD_CAST"charset=");
136: if (encoding == NULL)
137: encoding = xmlStrstr(content, BAD_CAST"Charset=");
138: if (encoding == NULL)
139: encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
140: if (encoding != NULL) {
141: encoding += 8;
142: } else {
143: encoding = xmlStrstr(content, BAD_CAST"charset =");
144: if (encoding == NULL)
145: encoding = xmlStrstr(content, BAD_CAST"Charset =");
146: if (encoding == NULL)
147: encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
148: if (encoding != NULL)
149: encoding += 9;
150: }
151: if (encoding != NULL) {
152: while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
153: }
154: return(encoding);
1.23 veillard 155: }
156:
157: /**
158: * htmlSetMetaEncoding:
159: * @doc: the document
160: * @encoding: the encoding string
161: *
162: * Sets the current encoding in the Meta tags
163: * NOTE: this will not change the document content encoding, just
164: * the META flag associated.
165: *
166: * Returns 0 in case of success and -1 in case of error
167: */
168: int
169: htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
1.26 veillard 170: htmlNodePtr cur, meta;
171: const xmlChar *content;
172: char newcontent[100];
173:
174:
175: if (doc == NULL)
176: return(-1);
177:
178: if (encoding != NULL) {
1.27 veillard 179: #ifdef HAVE_SNPRINTF
180: snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
181: encoding);
182: #else
1.26 veillard 183: sprintf(newcontent, "text/html; charset=%s", encoding);
1.27 veillard 184: #endif
185: newcontent[sizeof(newcontent) - 1] = 0;
1.26 veillard 186: }
187:
188: cur = doc->children;
189:
190: /*
191: * Search the html
192: */
193: while (cur != NULL) {
194: if (cur->name != NULL) {
1.29 veillard 195: if (xmlStrEqual(cur->name, BAD_CAST"html"))
1.26 veillard 196: break;
1.29 veillard 197: if (xmlStrEqual(cur->name, BAD_CAST"body")) {
1.26 veillard 198: if (encoding == NULL)
199: return(0);
200: meta = xmlNewDocNode(doc, NULL, BAD_CAST"head", NULL);
201: xmlAddPrevSibling(cur, meta);
202: cur = meta;
203: meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
204: xmlAddChild(cur, meta);
205: xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
206: xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
207: return(0);
208: }
1.29 veillard 209: if (xmlStrEqual(cur->name, BAD_CAST"head"))
1.26 veillard 210: goto found_head;
1.29 veillard 211: if (xmlStrEqual(cur->name, BAD_CAST"meta"))
1.26 veillard 212: goto found_meta;
213: }
214: cur = cur->next;
215: }
216: if (cur == NULL)
217: return(-1);
218: cur = cur->children;
219:
220: /*
221: * Search the head
222: */
223: while (cur != NULL) {
224: if (cur->name != NULL) {
1.29 veillard 225: if (xmlStrEqual(cur->name, BAD_CAST"head"))
1.26 veillard 226: break;
1.29 veillard 227: if (xmlStrEqual(cur->name, BAD_CAST"body")) {
1.26 veillard 228: if (encoding == NULL)
229: return(0);
230: meta = xmlNewDocNode(doc, NULL, BAD_CAST"head", NULL);
231: xmlAddPrevSibling(cur, meta);
232: cur = meta;
233: meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
234: xmlAddChild(cur, meta);
235: xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
236: xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
237: return(0);
238: }
1.29 veillard 239: if (xmlStrEqual(cur->name, BAD_CAST"meta"))
1.26 veillard 240: goto found_meta;
241: }
242: cur = cur->next;
243: }
244: if (cur == NULL)
245: return(-1);
246: found_head:
247: if (cur->children == NULL) {
248: if (encoding == NULL)
249: return(0);
250: meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
251: xmlAddChild(cur, meta);
252: xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
253: xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
254: return(0);
255: }
256: cur = cur->children;
257:
258: found_meta:
259: if (encoding != NULL) {
260: /*
261: * Create a new Meta element with the right aatributes
262: */
263:
264: meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
265: xmlAddPrevSibling(cur, meta);
266: xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
267: xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
268: }
269:
270: /*
271: * Search and destroy all the remaining the meta elements carrying
272: * encoding informations
273: */
274: while (cur != NULL) {
275: if (cur->name != NULL) {
1.29 veillard 276: if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
1.26 veillard 277: xmlAttrPtr attr = cur->properties;
278: int http;
279: const xmlChar *value;
280:
281: content = NULL;
282: http = 0;
283: while (attr != NULL) {
284: if ((attr->children != NULL) &&
285: (attr->children->type == XML_TEXT_NODE) &&
286: (attr->children->next == NULL)) {
287: #ifndef XML_USE_BUFFER_CONTENT
288: value = attr->children->content;
289: #else
290: value = xmlBufferContent(attr->children->content);
291: #endif
1.28 veillard 292: if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
293: && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
1.26 veillard 294: http = 1;
1.28 veillard 295: else if ((value != NULL)
296: && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
1.26 veillard 297: content = value;
298: if ((http != 0) && (content != NULL))
299: break;
300: }
301: attr = attr->next;
302: }
303: if ((http != 0) && (content != NULL)) {
304: meta = cur;
305: cur = cur->next;
306: xmlUnlinkNode(meta);
307: xmlFreeNode(meta);
308: continue;
309: }
310:
311: }
312: }
313: cur = cur->next;
314: }
315: return(0);
1.23 veillard 316: }
317:
318: /************************************************************************
319: * *
1.21 veillard 320: * Dumping HTML tree content to a simple buffer *
321: * *
322: ************************************************************************/
323:
1.14 daniel 324: static void
325: htmlDocContentDump(xmlBufferPtr buf, xmlDocPtr cur);
326:
1.1 daniel 327: /**
328: * htmlDtdDump:
329: * @buf: the HTML buffer output
330: * @doc: the document
331: *
332: * Dump the HTML document DTD, if any.
333: */
334: static void
335: htmlDtdDump(xmlBufferPtr buf, xmlDocPtr doc) {
336: xmlDtdPtr cur = doc->intSubset;
337:
338: if (cur == NULL) {
1.33 ! veillard 339: xmlGenericError(xmlGenericErrorContext,
! 340: "htmlDtdDump : no internal subset\n");
1.1 daniel 341: return;
342: }
343: xmlBufferWriteChar(buf, "<!DOCTYPE ");
344: xmlBufferWriteCHAR(buf, cur->name);
345: if (cur->ExternalID != NULL) {
346: xmlBufferWriteChar(buf, " PUBLIC ");
347: xmlBufferWriteQuotedString(buf, cur->ExternalID);
1.2 daniel 348: if (cur->SystemID != NULL) {
349: xmlBufferWriteChar(buf, " ");
350: xmlBufferWriteQuotedString(buf, cur->SystemID);
351: }
1.1 daniel 352: } else if (cur->SystemID != NULL) {
353: xmlBufferWriteChar(buf, " SYSTEM ");
354: xmlBufferWriteQuotedString(buf, cur->SystemID);
355: }
356: xmlBufferWriteChar(buf, ">\n");
357: }
358:
359: /**
360: * htmlAttrDump:
361: * @buf: the HTML buffer output
362: * @doc: the document
363: * @cur: the attribute pointer
364: *
365: * Dump an HTML attribute
366: */
367: static void
368: htmlAttrDump(xmlBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
1.6 daniel 369: xmlChar *value;
1.1 daniel 370:
371: if (cur == NULL) {
1.33 ! veillard 372: xmlGenericError(xmlGenericErrorContext,
! 373: "htmlAttrDump : property == NULL\n");
1.1 daniel 374: return;
375: }
376: xmlBufferWriteChar(buf, " ");
377: xmlBufferWriteCHAR(buf, cur->name);
1.19 daniel 378: if (cur->children != NULL) {
379: value = xmlNodeListGetString(doc, cur->children, 0);
380: if (value) {
381: xmlBufferWriteChar(buf, "=");
382: xmlBufferWriteQuotedString(buf, value);
383: xmlFree(value);
384: } else {
385: xmlBufferWriteChar(buf, "=\"\"");
386: }
1.1 daniel 387: }
388: }
389:
390: /**
391: * htmlAttrListDump:
392: * @buf: the HTML buffer output
393: * @doc: the document
394: * @cur: the first attribute pointer
395: *
396: * Dump a list of HTML attributes
397: */
398: static void
399: htmlAttrListDump(xmlBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
400: if (cur == NULL) {
1.33 ! veillard 401: xmlGenericError(xmlGenericErrorContext,
! 402: "htmlAttrListDump : property == NULL\n");
1.1 daniel 403: return;
404: }
405: while (cur != NULL) {
406: htmlAttrDump(buf, doc, cur);
407: cur = cur->next;
408: }
409: }
410:
411:
1.14 daniel 412: void
1.1 daniel 413: htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur);
414: /**
415: * htmlNodeListDump:
416: * @buf: the HTML buffer output
417: * @doc: the document
418: * @cur: the first node
419: *
420: * Dump an HTML node list, recursive behaviour,children are printed too.
421: */
422: static void
423: htmlNodeListDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
424: if (cur == NULL) {
1.33 ! veillard 425: xmlGenericError(xmlGenericErrorContext,
! 426: "htmlNodeListDump : node == NULL\n");
1.1 daniel 427: return;
428: }
429: while (cur != NULL) {
430: htmlNodeDump(buf, doc, cur);
431: cur = cur->next;
432: }
433: }
434:
435: /**
436: * htmlNodeDump:
437: * @buf: the HTML buffer output
438: * @doc: the document
439: * @cur: the current node
440: *
441: * Dump an HTML node, recursive behaviour,children are printed too.
442: */
1.14 daniel 443: void
1.1 daniel 444: htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
445: htmlElemDescPtr info;
446:
447: if (cur == NULL) {
1.33 ! veillard 448: xmlGenericError(xmlGenericErrorContext,
! 449: "htmlNodeDump : node == NULL\n");
1.1 daniel 450: return;
451: }
452: /*
453: * Special cases.
454: */
1.20 daniel 455: if (cur->type == XML_DTD_NODE)
456: return;
1.14 daniel 457: if (cur->type == XML_HTML_DOCUMENT_NODE) {
458: htmlDocContentDump(buf, (xmlDocPtr) cur);
459: return;
460: }
1.1 daniel 461: if (cur->type == HTML_TEXT_NODE) {
462: if (cur->content != NULL) {
1.6 daniel 463: xmlChar *buffer;
1.1 daniel 464:
1.9 daniel 465: #ifndef XML_USE_BUFFER_CONTENT
1.1 daniel 466: buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
1.9 daniel 467: #else
468: buffer = xmlEncodeEntitiesReentrant(doc,
469: xmlBufferContent(cur->content));
470: #endif
1.1 daniel 471: if (buffer != NULL) {
472: xmlBufferWriteCHAR(buf, buffer);
1.4 daniel 473: xmlFree(buffer);
1.1 daniel 474: }
475: }
476: return;
477: }
478: if (cur->type == HTML_COMMENT_NODE) {
479: if (cur->content != NULL) {
480: xmlBufferWriteChar(buf, "<!--");
1.9 daniel 481: #ifndef XML_USE_BUFFER_CONTENT
1.1 daniel 482: xmlBufferWriteCHAR(buf, cur->content);
1.9 daniel 483: #else
484: xmlBufferWriteCHAR(buf, xmlBufferContent(cur->content));
485: #endif
1.1 daniel 486: xmlBufferWriteChar(buf, "-->");
487: }
488: return;
489: }
490: if (cur->type == HTML_ENTITY_REF_NODE) {
491: xmlBufferWriteChar(buf, "&");
492: xmlBufferWriteCHAR(buf, cur->name);
493: xmlBufferWriteChar(buf, ";");
494: return;
495: }
496:
497: /*
498: * Get specific HTmL info for taht node.
499: */
500: info = htmlTagLookup(cur->name);
501:
502: xmlBufferWriteChar(buf, "<");
503: xmlBufferWriteCHAR(buf, cur->name);
504: if (cur->properties != NULL)
505: htmlAttrListDump(buf, doc, cur->properties);
506:
1.7 daniel 507: if ((info != NULL) && (info->empty)) {
1.1 daniel 508: xmlBufferWriteChar(buf, ">");
509: if (cur->next != NULL) {
510: if ((cur->next->type != HTML_TEXT_NODE) &&
511: (cur->next->type != HTML_ENTITY_REF_NODE))
512: xmlBufferWriteChar(buf, "\n");
513: }
514: return;
515: }
1.17 daniel 516: if ((cur->content == NULL) && (cur->children == NULL)) {
1.7 daniel 517: if ((info != NULL) && (info->endTag != 0))
1.1 daniel 518: xmlBufferWriteChar(buf, ">");
519: else {
520: xmlBufferWriteChar(buf, "></");
521: xmlBufferWriteCHAR(buf, cur->name);
522: xmlBufferWriteChar(buf, ">");
523: }
524: if (cur->next != NULL) {
525: if ((cur->next->type != HTML_TEXT_NODE) &&
526: (cur->next->type != HTML_ENTITY_REF_NODE))
527: xmlBufferWriteChar(buf, "\n");
528: }
529: return;
530: }
531: xmlBufferWriteChar(buf, ">");
532: if (cur->content != NULL) {
1.6 daniel 533: xmlChar *buffer;
1.1 daniel 534:
1.9 daniel 535: #ifndef XML_USE_BUFFER_CONTENT
536: buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
537: #else
538: buffer = xmlEncodeEntitiesReentrant(doc,
539: xmlBufferContent(cur->content));
540: #endif
1.1 daniel 541: if (buffer != NULL) {
542: xmlBufferWriteCHAR(buf, buffer);
1.4 daniel 543: xmlFree(buffer);
1.1 daniel 544: }
545: }
1.17 daniel 546: if (cur->children != NULL) {
547: if ((cur->children->type != HTML_TEXT_NODE) &&
548: (cur->children->type != HTML_ENTITY_REF_NODE) &&
549: (cur->children != cur->last))
1.1 daniel 550: xmlBufferWriteChar(buf, "\n");
1.17 daniel 551: htmlNodeListDump(buf, doc, cur->children);
1.1 daniel 552: if ((cur->last->type != HTML_TEXT_NODE) &&
1.10 daniel 553: (cur->last->type != HTML_ENTITY_REF_NODE) &&
1.17 daniel 554: (cur->children != cur->last))
1.1 daniel 555: xmlBufferWriteChar(buf, "\n");
556: }
1.11 daniel 557: if (!htmlIsAutoClosed(doc, cur)) {
558: xmlBufferWriteChar(buf, "</");
559: xmlBufferWriteCHAR(buf, cur->name);
560: xmlBufferWriteChar(buf, ">");
561: }
1.1 daniel 562: if (cur->next != NULL) {
563: if ((cur->next->type != HTML_TEXT_NODE) &&
564: (cur->next->type != HTML_ENTITY_REF_NODE))
565: xmlBufferWriteChar(buf, "\n");
566: }
567: }
568:
569: /**
1.16 daniel 570: * htmlNodeDumpFile:
571: * @out: the FILE pointer
572: * @doc: the document
573: * @cur: the current node
574: *
575: * Dump an HTML node, recursive behaviour,children are printed too.
576: */
577: void
578: htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
579: xmlBufferPtr buf;
580:
581: buf = xmlBufferCreate();
582: if (buf == NULL) return;
583: htmlNodeDump(buf, doc, cur);
584: xmlBufferDump(out, buf);
585: xmlBufferFree(buf);
586: }
587:
588: /**
1.1 daniel 589: * htmlDocContentDump:
590: * @buf: the HTML buffer output
591: * @cur: the document
592: *
593: * Dump an HTML document.
594: */
595: static void
596: htmlDocContentDump(xmlBufferPtr buf, xmlDocPtr cur) {
1.12 daniel 597: int type;
598:
599: /*
600: * force to output the stuff as HTML, especially for entities
601: */
602: type = cur->type;
603: cur->type = XML_HTML_DOCUMENT_NODE;
1.1 daniel 604: if (cur->intSubset != NULL)
605: htmlDtdDump(buf, cur);
1.11 daniel 606: else {
607: /* Default to HTML-4.0 transitionnal @@@@ */
608: xmlBufferWriteChar(buf, "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\" \"http://www.w3.org/TR/REC-html40/loose.dtd\">");
609:
610: }
1.17 daniel 611: if (cur->children != NULL) {
612: htmlNodeListDump(buf, cur, cur->children);
1.1 daniel 613: }
614: xmlBufferWriteChar(buf, "\n");
1.22 veillard 615: cur->type = (xmlElementType) type;
1.1 daniel 616: }
617:
618: /**
619: * htmlDocDumpMemory:
620: * @cur: the document
621: * @mem: OUT: the memory pointer
622: * @size: OUT: the memory lenght
623: *
1.6 daniel 624: * Dump an HTML document in memory and return the xmlChar * and it's size.
1.1 daniel 625: * It's up to the caller to free the memory.
626: */
627: void
1.6 daniel 628: htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
1.1 daniel 629: xmlBufferPtr buf;
630:
631: if (cur == NULL) {
632: #ifdef DEBUG_TREE
1.33 ! veillard 633: xmlGenericError(xmlGenericErrorContext,
! 634: "htmlxmlDocDumpMemory : document == NULL\n");
1.1 daniel 635: #endif
636: *mem = NULL;
637: *size = 0;
638: return;
639: }
640: buf = xmlBufferCreate();
641: if (buf == NULL) {
642: *mem = NULL;
643: *size = 0;
644: return;
645: }
646: htmlDocContentDump(buf, cur);
647: *mem = buf->content;
648: *size = buf->use;
649: memset(buf, -1, sizeof(xmlBuffer));
1.4 daniel 650: xmlFree(buf);
1.1 daniel 651: }
652:
653:
1.21 veillard 654: /************************************************************************
655: * *
656: * Dumping HTML tree content to an I/O output buffer *
657: * *
658: ************************************************************************/
659:
660: static void
661: htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, const char *encoding);
662:
663: /**
664: * htmlDtdDump:
665: * @buf: the HTML buffer output
666: * @doc: the document
667: *
668: * Dump the HTML document DTD, if any.
669: */
670: static void
671: htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, const char *encoding) {
672: xmlDtdPtr cur = doc->intSubset;
673:
674: if (cur == NULL) {
1.33 ! veillard 675: xmlGenericError(xmlGenericErrorContext,
! 676: "htmlDtdDump : no internal subset\n");
1.21 veillard 677: return;
678: }
679: xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
680: xmlOutputBufferWriteString(buf, (const char *)cur->name);
681: if (cur->ExternalID != NULL) {
682: xmlOutputBufferWriteString(buf, " PUBLIC ");
683: xmlBufferWriteQuotedString(buf->buffer, cur->ExternalID);
684: if (cur->SystemID != NULL) {
685: xmlOutputBufferWriteString(buf, " ");
686: xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
687: }
688: } else if (cur->SystemID != NULL) {
689: xmlOutputBufferWriteString(buf, " SYSTEM ");
690: xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
691: }
692: xmlOutputBufferWriteString(buf, ">\n");
693: }
694:
695: /**
696: * htmlAttrDump:
697: * @buf: the HTML buffer output
698: * @doc: the document
699: * @cur: the attribute pointer
700: *
701: * Dump an HTML attribute
702: */
703: static void
704: htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) {
705: xmlChar *value;
706:
707: if (cur == NULL) {
1.33 ! veillard 708: xmlGenericError(xmlGenericErrorContext,
! 709: "htmlAttrDump : property == NULL\n");
1.21 veillard 710: return;
711: }
712: xmlOutputBufferWriteString(buf, " ");
713: xmlOutputBufferWriteString(buf, (const char *)cur->name);
714: if (cur->children != NULL) {
715: value = xmlNodeListGetString(doc, cur->children, 0);
716: if (value) {
717: xmlOutputBufferWriteString(buf, "=");
718: xmlBufferWriteQuotedString(buf->buffer, value);
719: xmlFree(value);
720: } else {
721: xmlOutputBufferWriteString(buf, "=\"\"");
722: }
723: }
724: }
725:
726: /**
727: * htmlAttrListDump:
728: * @buf: the HTML buffer output
729: * @doc: the document
730: * @cur: the first attribute pointer
731: *
732: * Dump a list of HTML attributes
733: */
734: static void
735: htmlAttrListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) {
736: if (cur == NULL) {
1.33 ! veillard 737: xmlGenericError(xmlGenericErrorContext,
! 738: "htmlAttrListDump : property == NULL\n");
1.21 veillard 739: return;
740: }
741: while (cur != NULL) {
742: htmlAttrDumpOutput(buf, doc, cur, encoding);
743: cur = cur->next;
744: }
745: }
746:
747:
748: void htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
749: xmlNodePtr cur, const char *encoding);
750:
751: /**
752: * htmlNodeListDump:
753: * @buf: the HTML buffer output
754: * @doc: the document
755: * @cur: the first node
756: *
757: * Dump an HTML node list, recursive behaviour,children are printed too.
758: */
759: static void
760: htmlNodeListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, const char *encoding) {
761: if (cur == NULL) {
1.33 ! veillard 762: xmlGenericError(xmlGenericErrorContext,
! 763: "htmlNodeListDump : node == NULL\n");
1.21 veillard 764: return;
765: }
766: while (cur != NULL) {
767: htmlNodeDumpOutput(buf, doc, cur, encoding);
768: cur = cur->next;
769: }
770: }
771:
772: /**
773: * htmlNodeDump:
774: * @buf: the HTML buffer output
775: * @doc: the document
776: * @cur: the current node
777: *
778: * Dump an HTML node, recursive behaviour,children are printed too.
779: */
780: void
781: htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, const char *encoding) {
782: htmlElemDescPtr info;
783:
784: if (cur == NULL) {
1.33 ! veillard 785: xmlGenericError(xmlGenericErrorContext,
! 786: "htmlNodeDump : node == NULL\n");
1.21 veillard 787: return;
788: }
789: /*
790: * Special cases.
791: */
792: if (cur->type == XML_DTD_NODE)
793: return;
794: if (cur->type == XML_HTML_DOCUMENT_NODE) {
795: htmlDocContentDumpOutput(buf, (xmlDocPtr) cur, encoding);
796: return;
797: }
798: if (cur->type == HTML_TEXT_NODE) {
799: if (cur->content != NULL) {
800: xmlChar *buffer;
801:
802: #ifndef XML_USE_BUFFER_CONTENT
803: buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
804: #else
805: buffer = xmlEncodeEntitiesReentrant(doc,
806: xmlBufferContent(cur->content));
807: #endif
808: if (buffer != NULL) {
1.25 veillard 809: xmlOutputBufferWriteString(buf, (const char *)buffer);
1.21 veillard 810: xmlFree(buffer);
811: }
812: }
813: return;
814: }
815: if (cur->type == HTML_COMMENT_NODE) {
816: if (cur->content != NULL) {
817: xmlOutputBufferWriteString(buf, "<!--");
818: #ifndef XML_USE_BUFFER_CONTENT
819: xmlOutputBufferWriteString(buf, (const char *)cur->content);
820: #else
821: xmlOutputBufferWriteString(buf, xmlBufferContent(cur->content));
822: #endif
823: xmlOutputBufferWriteString(buf, "-->");
824: }
825: return;
826: }
827: if (cur->type == HTML_ENTITY_REF_NODE) {
828: xmlOutputBufferWriteString(buf, "&");
829: xmlOutputBufferWriteString(buf, (const char *)cur->name);
830: xmlOutputBufferWriteString(buf, ";");
831: return;
832: }
1.31 veillard 833: if (cur->type == HTML_PRESERVE_NODE) {
834: if (cur->content != NULL) {
835: #ifndef XML_USE_BUFFER_CONTENT
836: xmlOutputBufferWriteString(buf, (const char *)cur->content);
837: #else
838: xmlOutputBufferWriteString(buf, xmlBufferContent(cur->content));
839: #endif
840: }
841: return;
842: }
1.21 veillard 843:
844: /*
845: * Get specific HTmL info for taht node.
846: */
847: info = htmlTagLookup(cur->name);
848:
849: xmlOutputBufferWriteString(buf, "<");
850: xmlOutputBufferWriteString(buf, (const char *)cur->name);
851: if (cur->properties != NULL)
852: htmlAttrListDumpOutput(buf, doc, cur->properties, encoding);
853:
854: if ((info != NULL) && (info->empty)) {
855: xmlOutputBufferWriteString(buf, ">");
856: if (cur->next != NULL) {
857: if ((cur->next->type != HTML_TEXT_NODE) &&
858: (cur->next->type != HTML_ENTITY_REF_NODE))
859: xmlOutputBufferWriteString(buf, "\n");
860: }
861: return;
862: }
863: if ((cur->content == NULL) && (cur->children == NULL)) {
1.32 veillard 864: if ((info != NULL) && (info->endTag != 0) &&
865: (strcmp(info->name, "html")) && (strcmp(info->name, "body"))) {
1.21 veillard 866: xmlOutputBufferWriteString(buf, ">");
1.32 veillard 867: } else {
1.21 veillard 868: xmlOutputBufferWriteString(buf, "></");
869: xmlOutputBufferWriteString(buf, (const char *)cur->name);
870: xmlOutputBufferWriteString(buf, ">");
871: }
872: if (cur->next != NULL) {
873: if ((cur->next->type != HTML_TEXT_NODE) &&
874: (cur->next->type != HTML_ENTITY_REF_NODE))
875: xmlOutputBufferWriteString(buf, "\n");
876: }
877: return;
878: }
879: xmlOutputBufferWriteString(buf, ">");
880: if (cur->content != NULL) {
881: #if 0
882: xmlChar *buffer;
883:
884: #ifndef XML_USE_BUFFER_CONTENT
885: buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
886: #else
887: buffer = xmlEncodeEntitiesReentrant(doc,
888: xmlBufferContent(cur->content));
889: #endif
890: if (buffer != NULL) {
891: xmlOutputBufferWriteString(buf, buffer);
892: xmlFree(buffer);
893: }
894: #else
895: /*
896: * Uses the OutputBuffer property to automatically convert
897: * invalids to charrefs
898: */
899:
900: #ifndef XML_USE_BUFFER_CONTENT
901: xmlOutputBufferWriteString(buf, (const char *) cur->content);
902: #else
903: xmlOutputBufferWriteString(buf,
904: (const char *) xmlBufferContent(cur->content));
905: #endif
906: #endif
907: }
908: if (cur->children != NULL) {
909: if ((cur->children->type != HTML_TEXT_NODE) &&
910: (cur->children->type != HTML_ENTITY_REF_NODE) &&
911: (cur->children != cur->last))
912: xmlOutputBufferWriteString(buf, "\n");
913: htmlNodeListDumpOutput(buf, doc, cur->children, encoding);
914: if ((cur->last->type != HTML_TEXT_NODE) &&
915: (cur->last->type != HTML_ENTITY_REF_NODE) &&
916: (cur->children != cur->last))
917: xmlOutputBufferWriteString(buf, "\n");
918: }
919: if (!htmlIsAutoClosed(doc, cur)) {
920: xmlOutputBufferWriteString(buf, "</");
921: xmlOutputBufferWriteString(buf, (const char *)cur->name);
922: xmlOutputBufferWriteString(buf, ">");
923: }
924: if (cur->next != NULL) {
925: if ((cur->next->type != HTML_TEXT_NODE) &&
926: (cur->next->type != HTML_ENTITY_REF_NODE))
927: xmlOutputBufferWriteString(buf, "\n");
928: }
929: }
930:
931: /**
932: * htmlDocContentDump:
933: * @buf: the HTML buffer output
934: * @cur: the document
935: *
936: * Dump an HTML document.
937: */
938: static void
939: htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, const char *encoding) {
940: int type;
941:
942: /*
943: * force to output the stuff as HTML, especially for entities
944: */
945: type = cur->type;
946: cur->type = XML_HTML_DOCUMENT_NODE;
947: if (cur->intSubset != NULL)
948: htmlDtdDumpOutput(buf, cur, NULL);
949: else {
950: /* Default to HTML-4.0 transitionnal @@@@ */
951: xmlOutputBufferWriteString(buf, "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\" \"http://www.w3.org/TR/REC-html40/loose.dtd\">");
952:
953: }
954: if (cur->children != NULL) {
955: htmlNodeListDumpOutput(buf, cur, cur->children, encoding);
956: }
957: xmlOutputBufferWriteString(buf, "\n");
1.22 veillard 958: cur->type = (xmlElementType) type;
1.21 veillard 959: }
960:
961:
962: /************************************************************************
963: * *
964: * Saving functions front-ends *
965: * *
966: ************************************************************************/
967:
1.1 daniel 968: /**
969: * htmlDocDump:
970: * @f: the FILE*
971: * @cur: the document
972: *
973: * Dump an HTML document to an open FILE.
1.21 veillard 974: *
975: * returns: the number of byte written or -1 in case of failure.
1.1 daniel 976: */
1.21 veillard 977: int
1.1 daniel 978: htmlDocDump(FILE *f, xmlDocPtr cur) {
1.21 veillard 979: xmlOutputBufferPtr buf;
1.24 veillard 980: xmlCharEncodingHandlerPtr handler = NULL;
981: const char *encoding;
1.21 veillard 982: int ret;
1.1 daniel 983:
984: if (cur == NULL) {
985: #ifdef DEBUG_TREE
1.33 ! veillard 986: xmlGenericError(xmlGenericErrorContext,
! 987: "htmlDocDump : document == NULL\n");
1.1 daniel 988: #endif
1.21 veillard 989: return(-1);
1.1 daniel 990: }
1.24 veillard 991:
992: encoding = (const char *) htmlGetMetaEncoding(cur);
993:
994: if (encoding != NULL) {
995: xmlCharEncoding enc;
996:
997: enc = xmlParseCharEncoding(encoding);
998: if (enc != cur->charset) {
999: if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1000: /*
1001: * Not supported yet
1002: */
1003: return(-1);
1004: }
1005:
1006: handler = xmlFindCharEncodingHandler(encoding);
1007: if (handler == NULL)
1008: return(-1);
1009: }
1010: }
1011:
1012: /*
1.25 veillard 1013: * Fallback to HTML or ASCII when the encoding is unspecified
1.24 veillard 1014: */
1015: if (handler == NULL)
1.25 veillard 1016: handler = xmlFindCharEncodingHandler("HTML");
1017: if (handler == NULL)
1.24 veillard 1018: handler = xmlFindCharEncodingHandler("ascii");
1019:
1020: buf = xmlOutputBufferCreateFile(f, handler);
1.21 veillard 1021: if (buf == NULL) return(-1);
1022: htmlDocContentDumpOutput(buf, cur, NULL);
1023:
1024: ret = xmlOutputBufferClose(buf);
1025: return(ret);
1026: }
1027:
1028: /**
1029: * htmlSaveFile:
1030: * @filename: the filename (or URL)
1031: * @cur: the document
1032: *
1033: * Dump an HTML document to a file. If @filename is "-" the stdout file is
1034: * used.
1035: * returns: the number of byte written or -1 in case of failure.
1036: */
1037: int
1038: htmlSaveFile(const char *filename, xmlDocPtr cur) {
1039: xmlOutputBufferPtr buf;
1.24 veillard 1040: xmlCharEncodingHandlerPtr handler = NULL;
1041: const char *encoding;
1.21 veillard 1042: int ret;
1043:
1.24 veillard 1044: encoding = (const char *) htmlGetMetaEncoding(cur);
1045:
1046: if (encoding != NULL) {
1047: xmlCharEncoding enc;
1048:
1049: enc = xmlParseCharEncoding(encoding);
1050: if (enc != cur->charset) {
1051: if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1052: /*
1053: * Not supported yet
1054: */
1055: return(-1);
1056: }
1057:
1058: handler = xmlFindCharEncodingHandler(encoding);
1059: if (handler == NULL)
1060: return(-1);
1061: }
1062: }
1063:
1064: /*
1.25 veillard 1065: * Fallback to HTML or ASCII when the encoding is unspecified
1.24 veillard 1066: */
1067: if (handler == NULL)
1.25 veillard 1068: handler = xmlFindCharEncodingHandler("HTML");
1069: if (handler == NULL)
1.24 veillard 1070: handler = xmlFindCharEncodingHandler("ascii");
1071:
1.21 veillard 1072: /*
1073: * save the content to a temp buffer.
1074: */
1.24 veillard 1075: buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1.21 veillard 1076: if (buf == NULL) return(0);
1077:
1078: htmlDocContentDumpOutput(buf, cur, NULL);
1079:
1080: ret = xmlOutputBufferClose(buf);
1081: return(ret);
1.1 daniel 1082: }
1083:
1084: /**
1.26 veillard 1085: * htmlSaveFileEnc:
1.1 daniel 1086: * @filename: the filename
1087: * @cur: the document
1088: *
1.26 veillard 1089: * Dump an HTML document to a file using a given encoding.
1.1 daniel 1090: *
1091: * returns: the number of byte written or -1 in case of failure.
1092: */
1093: int
1.21 veillard 1094: htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1095: xmlOutputBufferPtr buf;
1096: xmlCharEncodingHandlerPtr handler = NULL;
1.1 daniel 1097: int ret;
1098:
1.21 veillard 1099: if (encoding != NULL) {
1100: xmlCharEncoding enc;
1101:
1102: enc = xmlParseCharEncoding(encoding);
1103: if (enc != cur->charset) {
1104: if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1105: /*
1106: * Not supported yet
1107: */
1108: return(-1);
1109: }
1110:
1111: handler = xmlFindCharEncodingHandler(encoding);
1112: if (handler == NULL)
1113: return(-1);
1.26 veillard 1114: htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
1.21 veillard 1115: }
1116: }
1.24 veillard 1117:
1118: /*
1.25 veillard 1119: * Fallback to HTML or ASCII when the encoding is unspecified
1.24 veillard 1120: */
1.25 veillard 1121: if (handler == NULL)
1122: handler = xmlFindCharEncodingHandler("HTML");
1.24 veillard 1123: if (handler == NULL)
1124: handler = xmlFindCharEncodingHandler("ascii");
1.21 veillard 1125:
1.1 daniel 1126: /*
1127: * save the content to a temp buffer.
1128: */
1.21 veillard 1129: buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1.1 daniel 1130: if (buf == NULL) return(0);
1131:
1.21 veillard 1132: htmlDocContentDumpOutput(buf, cur, encoding);
1.1 daniel 1133:
1.21 veillard 1134: ret = xmlOutputBufferClose(buf);
1135: return(ret);
1.1 daniel 1136: }
1.18 daniel 1137: #endif /* LIBXML_HTML_ENABLED */
Webmaster