Annotation of XML/HTMLtree.c, revision 1.34
/*
 * HTMLtree.c : implementation of access functions for an HTML tree.
 *
 * See Copyright for the status of this software.
 *
 * Daniel.Veillard@w3.org
 */
8:
1.5 daniel 9:
1.13 daniel 10: #ifdef WIN32
11: #include "win32config.h"
12: #else
1.1 daniel 13: #include "config.h"
1.5 daniel 14: #endif
1.18 daniel 15:
1.30 veillard 16: #include <libxml/xmlversion.h>
1.18 daniel 17: #ifdef LIBXML_HTML_ENABLED
18:
1.1 daniel 19: #include <stdio.h>
1.5 daniel 20: #include <string.h> /* for memset() only ! */
21:
22: #ifdef HAVE_CTYPE_H
1.1 daniel 23: #include <ctype.h>
1.5 daniel 24: #endif
25: #ifdef HAVE_STDLIB_H
1.1 daniel 26: #include <stdlib.h>
1.5 daniel 27: #endif
1.1 daniel 28:
1.18 daniel 29: #include <libxml/xmlmemory.h>
30: #include <libxml/HTMLparser.h>
31: #include <libxml/HTMLtree.h>
32: #include <libxml/entities.h>
33: #include <libxml/valid.h>
1.33 veillard 34: #include <libxml/xmlerror.h>
1.1 daniel 35:
1.21 veillard 36: /************************************************************************
37: * *
1.23 veillard 38: * Getting/Setting encoding meta tags *
39: * *
40: ************************************************************************/
41:
42: /**
43: * htmlGetMetaEncoding:
44: * @doc: the document
45: *
46: * Encoding definition lookup in the Meta tags
47: *
48: * Returns the current encoding as flagged in the HTML source
49: */
50: const xmlChar *
51: htmlGetMetaEncoding(htmlDocPtr doc) {
1.24 veillard 52: htmlNodePtr cur;
53: const xmlChar *content;
54: const xmlChar *encoding;
55:
56: if (doc == NULL)
57: return(NULL);
58: cur = doc->children;
59:
60: /*
61: * Search the html
62: */
63: while (cur != NULL) {
64: if (cur->name != NULL) {
1.29 veillard 65: if (xmlStrEqual(cur->name, BAD_CAST"html"))
1.24 veillard 66: break;
1.29 veillard 67: if (xmlStrEqual(cur->name, BAD_CAST"head"))
1.24 veillard 68: goto found_head;
1.29 veillard 69: if (xmlStrEqual(cur->name, BAD_CAST"meta"))
1.24 veillard 70: goto found_meta;
71: }
72: cur = cur->next;
73: }
74: if (cur == NULL)
75: return(NULL);
76: cur = cur->children;
77:
78: /*
79: * Search the head
80: */
81: while (cur != NULL) {
82: if (cur->name != NULL) {
1.29 veillard 83: if (xmlStrEqual(cur->name, BAD_CAST"head"))
1.24 veillard 84: break;
1.29 veillard 85: if (xmlStrEqual(cur->name, BAD_CAST"meta"))
1.24 veillard 86: goto found_meta;
87: }
88: cur = cur->next;
89: }
90: if (cur == NULL)
91: return(NULL);
92: found_head:
93: cur = cur->children;
94:
95: /*
96: * Search the meta elements
97: */
98: found_meta:
99: while (cur != NULL) {
100: if (cur->name != NULL) {
1.29 veillard 101: if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
1.24 veillard 102: xmlAttrPtr attr = cur->properties;
103: int http;
104: const xmlChar *value;
105:
106: content = NULL;
107: http = 0;
108: while (attr != NULL) {
109: if ((attr->children != NULL) &&
110: (attr->children->type == XML_TEXT_NODE) &&
111: (attr->children->next == NULL)) {
112: #ifndef XML_USE_BUFFER_CONTENT
113: value = attr->children->content;
114: #else
115: value = xmlBufferContent(attr->children->content);
116: #endif
1.28 veillard 117: if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
118: && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
1.24 veillard 119: http = 1;
1.28 veillard 120: else if ((value != NULL)
121: && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
1.24 veillard 122: content = value;
123: if ((http != 0) && (content != NULL))
124: goto found_content;
125: }
126: attr = attr->next;
127: }
128: }
129: }
130: cur = cur->next;
131: }
132: return(NULL);
133:
134: found_content:
135: encoding = xmlStrstr(content, BAD_CAST"charset=");
136: if (encoding == NULL)
137: encoding = xmlStrstr(content, BAD_CAST"Charset=");
138: if (encoding == NULL)
139: encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
140: if (encoding != NULL) {
141: encoding += 8;
142: } else {
143: encoding = xmlStrstr(content, BAD_CAST"charset =");
144: if (encoding == NULL)
145: encoding = xmlStrstr(content, BAD_CAST"Charset =");
146: if (encoding == NULL)
147: encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
148: if (encoding != NULL)
149: encoding += 9;
150: }
151: if (encoding != NULL) {
152: while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
153: }
154: return(encoding);
1.23 veillard 155: }
156:
157: /**
158: * htmlSetMetaEncoding:
159: * @doc: the document
160: * @encoding: the encoding string
161: *
162: * Sets the current encoding in the Meta tags
163: * NOTE: this will not change the document content encoding, just
164: * the META flag associated.
165: *
166: * Returns 0 in case of success and -1 in case of error
167: */
168: int
169: htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
1.26 veillard 170: htmlNodePtr cur, meta;
171: const xmlChar *content;
172: char newcontent[100];
173:
174:
175: if (doc == NULL)
176: return(-1);
177:
178: if (encoding != NULL) {
1.27 veillard 179: #ifdef HAVE_SNPRINTF
180: snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
181: encoding);
182: #else
1.26 veillard 183: sprintf(newcontent, "text/html; charset=%s", encoding);
1.27 veillard 184: #endif
185: newcontent[sizeof(newcontent) - 1] = 0;
1.26 veillard 186: }
187:
188: cur = doc->children;
189:
190: /*
191: * Search the html
192: */
193: while (cur != NULL) {
194: if (cur->name != NULL) {
1.29 veillard 195: if (xmlStrEqual(cur->name, BAD_CAST"html"))
1.26 veillard 196: break;
1.29 veillard 197: if (xmlStrEqual(cur->name, BAD_CAST"body")) {
1.26 veillard 198: if (encoding == NULL)
199: return(0);
200: meta = xmlNewDocNode(doc, NULL, BAD_CAST"head", NULL);
201: xmlAddPrevSibling(cur, meta);
202: cur = meta;
203: meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
204: xmlAddChild(cur, meta);
205: xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
206: xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
207: return(0);
208: }
1.29 veillard 209: if (xmlStrEqual(cur->name, BAD_CAST"head"))
1.26 veillard 210: goto found_head;
1.29 veillard 211: if (xmlStrEqual(cur->name, BAD_CAST"meta"))
1.26 veillard 212: goto found_meta;
213: }
214: cur = cur->next;
215: }
216: if (cur == NULL)
217: return(-1);
218: cur = cur->children;
219:
220: /*
221: * Search the head
222: */
223: while (cur != NULL) {
224: if (cur->name != NULL) {
1.29 veillard 225: if (xmlStrEqual(cur->name, BAD_CAST"head"))
1.26 veillard 226: break;
1.29 veillard 227: if (xmlStrEqual(cur->name, BAD_CAST"body")) {
1.26 veillard 228: if (encoding == NULL)
229: return(0);
230: meta = xmlNewDocNode(doc, NULL, BAD_CAST"head", NULL);
231: xmlAddPrevSibling(cur, meta);
232: cur = meta;
233: meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
234: xmlAddChild(cur, meta);
235: xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
236: xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
237: return(0);
238: }
1.29 veillard 239: if (xmlStrEqual(cur->name, BAD_CAST"meta"))
1.26 veillard 240: goto found_meta;
241: }
242: cur = cur->next;
243: }
244: if (cur == NULL)
245: return(-1);
246: found_head:
247: if (cur->children == NULL) {
248: if (encoding == NULL)
249: return(0);
250: meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
251: xmlAddChild(cur, meta);
252: xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
253: xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
254: return(0);
255: }
256: cur = cur->children;
257:
258: found_meta:
259: if (encoding != NULL) {
260: /*
261: * Create a new Meta element with the right aatributes
262: */
263:
264: meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
265: xmlAddPrevSibling(cur, meta);
266: xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
267: xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
268: }
269:
270: /*
271: * Search and destroy all the remaining the meta elements carrying
272: * encoding informations
273: */
274: while (cur != NULL) {
275: if (cur->name != NULL) {
1.29 veillard 276: if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
1.26 veillard 277: xmlAttrPtr attr = cur->properties;
278: int http;
279: const xmlChar *value;
280:
281: content = NULL;
282: http = 0;
283: while (attr != NULL) {
284: if ((attr->children != NULL) &&
285: (attr->children->type == XML_TEXT_NODE) &&
286: (attr->children->next == NULL)) {
287: #ifndef XML_USE_BUFFER_CONTENT
288: value = attr->children->content;
289: #else
290: value = xmlBufferContent(attr->children->content);
291: #endif
1.28 veillard 292: if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
293: && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
1.26 veillard 294: http = 1;
1.28 veillard 295: else if ((value != NULL)
296: && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
1.26 veillard 297: content = value;
298: if ((http != 0) && (content != NULL))
299: break;
300: }
301: attr = attr->next;
302: }
303: if ((http != 0) && (content != NULL)) {
304: meta = cur;
305: cur = cur->next;
306: xmlUnlinkNode(meta);
307: xmlFreeNode(meta);
308: continue;
309: }
310:
311: }
312: }
313: cur = cur->next;
314: }
315: return(0);
1.23 veillard 316: }
317:
318: /************************************************************************
319: * *
1.21 veillard 320: * Dumping HTML tree content to a simple buffer *
321: * *
322: ************************************************************************/
323:
1.14 daniel 324: static void
325: htmlDocContentDump(xmlBufferPtr buf, xmlDocPtr cur);
326:
1.1 daniel 327: /**
328: * htmlDtdDump:
329: * @buf: the HTML buffer output
330: * @doc: the document
331: *
332: * Dump the HTML document DTD, if any.
333: */
334: static void
335: htmlDtdDump(xmlBufferPtr buf, xmlDocPtr doc) {
336: xmlDtdPtr cur = doc->intSubset;
337:
338: if (cur == NULL) {
1.33 veillard 339: xmlGenericError(xmlGenericErrorContext,
340: "htmlDtdDump : no internal subset\n");
1.1 daniel 341: return;
342: }
343: xmlBufferWriteChar(buf, "<!DOCTYPE ");
344: xmlBufferWriteCHAR(buf, cur->name);
345: if (cur->ExternalID != NULL) {
346: xmlBufferWriteChar(buf, " PUBLIC ");
347: xmlBufferWriteQuotedString(buf, cur->ExternalID);
1.2 daniel 348: if (cur->SystemID != NULL) {
349: xmlBufferWriteChar(buf, " ");
350: xmlBufferWriteQuotedString(buf, cur->SystemID);
351: }
1.1 daniel 352: } else if (cur->SystemID != NULL) {
353: xmlBufferWriteChar(buf, " SYSTEM ");
354: xmlBufferWriteQuotedString(buf, cur->SystemID);
355: }
356: xmlBufferWriteChar(buf, ">\n");
357: }
358:
359: /**
360: * htmlAttrDump:
361: * @buf: the HTML buffer output
362: * @doc: the document
363: * @cur: the attribute pointer
364: *
365: * Dump an HTML attribute
366: */
367: static void
368: htmlAttrDump(xmlBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
1.6 daniel 369: xmlChar *value;
1.1 daniel 370:
371: if (cur == NULL) {
1.33 veillard 372: xmlGenericError(xmlGenericErrorContext,
373: "htmlAttrDump : property == NULL\n");
1.1 daniel 374: return;
375: }
376: xmlBufferWriteChar(buf, " ");
377: xmlBufferWriteCHAR(buf, cur->name);
1.19 daniel 378: if (cur->children != NULL) {
379: value = xmlNodeListGetString(doc, cur->children, 0);
380: if (value) {
381: xmlBufferWriteChar(buf, "=");
382: xmlBufferWriteQuotedString(buf, value);
383: xmlFree(value);
384: } else {
385: xmlBufferWriteChar(buf, "=\"\"");
386: }
1.1 daniel 387: }
388: }
389:
390: /**
391: * htmlAttrListDump:
392: * @buf: the HTML buffer output
393: * @doc: the document
394: * @cur: the first attribute pointer
395: *
396: * Dump a list of HTML attributes
397: */
398: static void
399: htmlAttrListDump(xmlBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
400: if (cur == NULL) {
1.33 veillard 401: xmlGenericError(xmlGenericErrorContext,
402: "htmlAttrListDump : property == NULL\n");
1.1 daniel 403: return;
404: }
405: while (cur != NULL) {
406: htmlAttrDump(buf, doc, cur);
407: cur = cur->next;
408: }
409: }
410:
411:
1.14 daniel 412: void
1.1 daniel 413: htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur);
414: /**
415: * htmlNodeListDump:
416: * @buf: the HTML buffer output
417: * @doc: the document
418: * @cur: the first node
419: *
420: * Dump an HTML node list, recursive behaviour,children are printed too.
421: */
422: static void
423: htmlNodeListDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
424: if (cur == NULL) {
1.33 veillard 425: xmlGenericError(xmlGenericErrorContext,
426: "htmlNodeListDump : node == NULL\n");
1.1 daniel 427: return;
428: }
429: while (cur != NULL) {
430: htmlNodeDump(buf, doc, cur);
431: cur = cur->next;
432: }
433: }
434:
435: /**
436: * htmlNodeDump:
437: * @buf: the HTML buffer output
438: * @doc: the document
439: * @cur: the current node
440: *
441: * Dump an HTML node, recursive behaviour,children are printed too.
442: */
1.14 daniel 443: void
1.1 daniel 444: htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
445: htmlElemDescPtr info;
446:
447: if (cur == NULL) {
1.33 veillard 448: xmlGenericError(xmlGenericErrorContext,
449: "htmlNodeDump : node == NULL\n");
1.1 daniel 450: return;
451: }
452: /*
453: * Special cases.
454: */
1.20 daniel 455: if (cur->type == XML_DTD_NODE)
456: return;
1.14 daniel 457: if (cur->type == XML_HTML_DOCUMENT_NODE) {
458: htmlDocContentDump(buf, (xmlDocPtr) cur);
459: return;
460: }
1.1 daniel 461: if (cur->type == HTML_TEXT_NODE) {
462: if (cur->content != NULL) {
1.6 daniel 463: xmlChar *buffer;
1.1 daniel 464:
1.9 daniel 465: #ifndef XML_USE_BUFFER_CONTENT
1.1 daniel 466: buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
1.9 daniel 467: #else
468: buffer = xmlEncodeEntitiesReentrant(doc,
469: xmlBufferContent(cur->content));
470: #endif
1.1 daniel 471: if (buffer != NULL) {
472: xmlBufferWriteCHAR(buf, buffer);
1.4 daniel 473: xmlFree(buffer);
1.1 daniel 474: }
475: }
476: return;
477: }
478: if (cur->type == HTML_COMMENT_NODE) {
479: if (cur->content != NULL) {
480: xmlBufferWriteChar(buf, "<!--");
1.9 daniel 481: #ifndef XML_USE_BUFFER_CONTENT
1.1 daniel 482: xmlBufferWriteCHAR(buf, cur->content);
1.9 daniel 483: #else
484: xmlBufferWriteCHAR(buf, xmlBufferContent(cur->content));
485: #endif
1.1 daniel 486: xmlBufferWriteChar(buf, "-->");
487: }
488: return;
489: }
490: if (cur->type == HTML_ENTITY_REF_NODE) {
491: xmlBufferWriteChar(buf, "&");
492: xmlBufferWriteCHAR(buf, cur->name);
493: xmlBufferWriteChar(buf, ";");
494: return;
495: }
496:
497: /*
498: * Get specific HTmL info for taht node.
499: */
500: info = htmlTagLookup(cur->name);
501:
502: xmlBufferWriteChar(buf, "<");
503: xmlBufferWriteCHAR(buf, cur->name);
504: if (cur->properties != NULL)
505: htmlAttrListDump(buf, doc, cur->properties);
506:
1.7 daniel 507: if ((info != NULL) && (info->empty)) {
1.1 daniel 508: xmlBufferWriteChar(buf, ">");
509: if (cur->next != NULL) {
510: if ((cur->next->type != HTML_TEXT_NODE) &&
511: (cur->next->type != HTML_ENTITY_REF_NODE))
512: xmlBufferWriteChar(buf, "\n");
513: }
514: return;
515: }
1.17 daniel 516: if ((cur->content == NULL) && (cur->children == NULL)) {
1.7 daniel 517: if ((info != NULL) && (info->endTag != 0))
1.1 daniel 518: xmlBufferWriteChar(buf, ">");
519: else {
520: xmlBufferWriteChar(buf, "></");
521: xmlBufferWriteCHAR(buf, cur->name);
522: xmlBufferWriteChar(buf, ">");
523: }
524: if (cur->next != NULL) {
525: if ((cur->next->type != HTML_TEXT_NODE) &&
526: (cur->next->type != HTML_ENTITY_REF_NODE))
527: xmlBufferWriteChar(buf, "\n");
528: }
529: return;
530: }
531: xmlBufferWriteChar(buf, ">");
532: if (cur->content != NULL) {
1.6 daniel 533: xmlChar *buffer;
1.1 daniel 534:
1.9 daniel 535: #ifndef XML_USE_BUFFER_CONTENT
536: buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
537: #else
538: buffer = xmlEncodeEntitiesReentrant(doc,
539: xmlBufferContent(cur->content));
540: #endif
1.1 daniel 541: if (buffer != NULL) {
542: xmlBufferWriteCHAR(buf, buffer);
1.4 daniel 543: xmlFree(buffer);
1.1 daniel 544: }
545: }
1.17 daniel 546: if (cur->children != NULL) {
547: if ((cur->children->type != HTML_TEXT_NODE) &&
548: (cur->children->type != HTML_ENTITY_REF_NODE) &&
549: (cur->children != cur->last))
1.1 daniel 550: xmlBufferWriteChar(buf, "\n");
1.17 daniel 551: htmlNodeListDump(buf, doc, cur->children);
1.1 daniel 552: if ((cur->last->type != HTML_TEXT_NODE) &&
1.10 daniel 553: (cur->last->type != HTML_ENTITY_REF_NODE) &&
1.17 daniel 554: (cur->children != cur->last))
1.1 daniel 555: xmlBufferWriteChar(buf, "\n");
556: }
1.11 daniel 557: if (!htmlIsAutoClosed(doc, cur)) {
558: xmlBufferWriteChar(buf, "</");
559: xmlBufferWriteCHAR(buf, cur->name);
560: xmlBufferWriteChar(buf, ">");
561: }
1.1 daniel 562: if (cur->next != NULL) {
563: if ((cur->next->type != HTML_TEXT_NODE) &&
564: (cur->next->type != HTML_ENTITY_REF_NODE))
565: xmlBufferWriteChar(buf, "\n");
566: }
567: }
568:
569: /**
1.16 daniel 570: * htmlNodeDumpFile:
571: * @out: the FILE pointer
572: * @doc: the document
573: * @cur: the current node
574: *
575: * Dump an HTML node, recursive behaviour,children are printed too.
576: */
577: void
578: htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
579: xmlBufferPtr buf;
580:
581: buf = xmlBufferCreate();
582: if (buf == NULL) return;
583: htmlNodeDump(buf, doc, cur);
584: xmlBufferDump(out, buf);
585: xmlBufferFree(buf);
586: }
587:
588: /**
1.1 daniel 589: * htmlDocContentDump:
590: * @buf: the HTML buffer output
591: * @cur: the document
592: *
593: * Dump an HTML document.
594: */
595: static void
596: htmlDocContentDump(xmlBufferPtr buf, xmlDocPtr cur) {
1.12 daniel 597: int type;
598:
599: /*
600: * force to output the stuff as HTML, especially for entities
601: */
602: type = cur->type;
603: cur->type = XML_HTML_DOCUMENT_NODE;
1.1 daniel 604: if (cur->intSubset != NULL)
605: htmlDtdDump(buf, cur);
1.11 daniel 606: else {
607: /* Default to HTML-4.0 transitionnal @@@@ */
608: xmlBufferWriteChar(buf, "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\" \"http://www.w3.org/TR/REC-html40/loose.dtd\">");
609:
610: }
1.17 daniel 611: if (cur->children != NULL) {
612: htmlNodeListDump(buf, cur, cur->children);
1.1 daniel 613: }
614: xmlBufferWriteChar(buf, "\n");
1.22 veillard 615: cur->type = (xmlElementType) type;
1.1 daniel 616: }
617:
618: /**
619: * htmlDocDumpMemory:
620: * @cur: the document
621: * @mem: OUT: the memory pointer
622: * @size: OUT: the memory lenght
623: *
1.6 daniel 624: * Dump an HTML document in memory and return the xmlChar * and it's size.
1.1 daniel 625: * It's up to the caller to free the memory.
626: */
627: void
1.6 daniel 628: htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
1.1 daniel 629: xmlBufferPtr buf;
630:
631: if (cur == NULL) {
632: #ifdef DEBUG_TREE
1.33 veillard 633: xmlGenericError(xmlGenericErrorContext,
634: "htmlxmlDocDumpMemory : document == NULL\n");
1.1 daniel 635: #endif
636: *mem = NULL;
637: *size = 0;
638: return;
639: }
640: buf = xmlBufferCreate();
641: if (buf == NULL) {
642: *mem = NULL;
643: *size = 0;
644: return;
645: }
646: htmlDocContentDump(buf, cur);
647: *mem = buf->content;
648: *size = buf->use;
649: memset(buf, -1, sizeof(xmlBuffer));
1.4 daniel 650: xmlFree(buf);
1.1 daniel 651: }
652:
653:
1.21 veillard 654: /************************************************************************
655: * *
656: * Dumping HTML tree content to an I/O output buffer *
657: * *
658: ************************************************************************/
659:
660: static void
661: htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, const char *encoding);
662:
663: /**
664: * htmlDtdDump:
665: * @buf: the HTML buffer output
666: * @doc: the document
667: *
668: * Dump the HTML document DTD, if any.
669: */
670: static void
671: htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, const char *encoding) {
672: xmlDtdPtr cur = doc->intSubset;
673:
674: if (cur == NULL) {
1.33 veillard 675: xmlGenericError(xmlGenericErrorContext,
676: "htmlDtdDump : no internal subset\n");
1.21 veillard 677: return;
678: }
679: xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
680: xmlOutputBufferWriteString(buf, (const char *)cur->name);
681: if (cur->ExternalID != NULL) {
682: xmlOutputBufferWriteString(buf, " PUBLIC ");
683: xmlBufferWriteQuotedString(buf->buffer, cur->ExternalID);
684: if (cur->SystemID != NULL) {
685: xmlOutputBufferWriteString(buf, " ");
686: xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
687: }
688: } else if (cur->SystemID != NULL) {
689: xmlOutputBufferWriteString(buf, " SYSTEM ");
690: xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
691: }
692: xmlOutputBufferWriteString(buf, ">\n");
693: }
694:
695: /**
696: * htmlAttrDump:
697: * @buf: the HTML buffer output
698: * @doc: the document
699: * @cur: the attribute pointer
700: *
701: * Dump an HTML attribute
702: */
703: static void
704: htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) {
705: xmlChar *value;
706:
707: if (cur == NULL) {
1.33 veillard 708: xmlGenericError(xmlGenericErrorContext,
709: "htmlAttrDump : property == NULL\n");
1.21 veillard 710: return;
711: }
712: xmlOutputBufferWriteString(buf, " ");
713: xmlOutputBufferWriteString(buf, (const char *)cur->name);
714: if (cur->children != NULL) {
715: value = xmlNodeListGetString(doc, cur->children, 0);
716: if (value) {
717: xmlOutputBufferWriteString(buf, "=");
718: xmlBufferWriteQuotedString(buf->buffer, value);
719: xmlFree(value);
720: } else {
721: xmlOutputBufferWriteString(buf, "=\"\"");
722: }
723: }
724: }
725:
726: /**
727: * htmlAttrListDump:
728: * @buf: the HTML buffer output
729: * @doc: the document
730: * @cur: the first attribute pointer
731: *
732: * Dump a list of HTML attributes
733: */
734: static void
735: htmlAttrListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) {
736: if (cur == NULL) {
1.33 veillard 737: xmlGenericError(xmlGenericErrorContext,
738: "htmlAttrListDump : property == NULL\n");
1.21 veillard 739: return;
740: }
741: while (cur != NULL) {
742: htmlAttrDumpOutput(buf, doc, cur, encoding);
743: cur = cur->next;
744: }
745: }
746:
747:
748: void htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
749: xmlNodePtr cur, const char *encoding);
750:
751: /**
752: * htmlNodeListDump:
753: * @buf: the HTML buffer output
754: * @doc: the document
755: * @cur: the first node
756: *
757: * Dump an HTML node list, recursive behaviour,children are printed too.
758: */
759: static void
760: htmlNodeListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, const char *encoding) {
761: if (cur == NULL) {
1.33 veillard 762: xmlGenericError(xmlGenericErrorContext,
763: "htmlNodeListDump : node == NULL\n");
1.21 veillard 764: return;
765: }
766: while (cur != NULL) {
767: htmlNodeDumpOutput(buf, doc, cur, encoding);
768: cur = cur->next;
769: }
770: }
771:
772: /**
773: * htmlNodeDump:
774: * @buf: the HTML buffer output
775: * @doc: the document
776: * @cur: the current node
777: *
778: * Dump an HTML node, recursive behaviour,children are printed too.
779: */
780: void
781: htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, const char *encoding) {
782: htmlElemDescPtr info;
783:
784: if (cur == NULL) {
1.33 veillard 785: xmlGenericError(xmlGenericErrorContext,
786: "htmlNodeDump : node == NULL\n");
1.21 veillard 787: return;
788: }
789: /*
790: * Special cases.
791: */
792: if (cur->type == XML_DTD_NODE)
793: return;
794: if (cur->type == XML_HTML_DOCUMENT_NODE) {
795: htmlDocContentDumpOutput(buf, (xmlDocPtr) cur, encoding);
796: return;
797: }
798: if (cur->type == HTML_TEXT_NODE) {
799: if (cur->content != NULL) {
800: xmlChar *buffer;
801:
802: #ifndef XML_USE_BUFFER_CONTENT
803: buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
804: #else
805: buffer = xmlEncodeEntitiesReentrant(doc,
806: xmlBufferContent(cur->content));
807: #endif
808: if (buffer != NULL) {
1.25 veillard 809: xmlOutputBufferWriteString(buf, (const char *)buffer);
1.21 veillard 810: xmlFree(buffer);
811: }
812: }
813: return;
814: }
815: if (cur->type == HTML_COMMENT_NODE) {
816: if (cur->content != NULL) {
817: xmlOutputBufferWriteString(buf, "<!--");
818: #ifndef XML_USE_BUFFER_CONTENT
819: xmlOutputBufferWriteString(buf, (const char *)cur->content);
820: #else
1.34 ! veillard 821: xmlOutputBufferWriteString(buf, (const char *)
! 822: xmlBufferContent(cur->content));
1.21 veillard 823: #endif
824: xmlOutputBufferWriteString(buf, "-->");
825: }
826: return;
827: }
828: if (cur->type == HTML_ENTITY_REF_NODE) {
829: xmlOutputBufferWriteString(buf, "&");
830: xmlOutputBufferWriteString(buf, (const char *)cur->name);
831: xmlOutputBufferWriteString(buf, ";");
832: return;
833: }
1.31 veillard 834: if (cur->type == HTML_PRESERVE_NODE) {
835: if (cur->content != NULL) {
836: #ifndef XML_USE_BUFFER_CONTENT
837: xmlOutputBufferWriteString(buf, (const char *)cur->content);
838: #else
1.34 ! veillard 839: xmlOutputBufferWriteString(buf, (const char *)
! 840: xmlBufferContent(cur->content));
1.31 veillard 841: #endif
842: }
843: return;
844: }
1.21 veillard 845:
846: /*
847: * Get specific HTmL info for taht node.
848: */
849: info = htmlTagLookup(cur->name);
850:
851: xmlOutputBufferWriteString(buf, "<");
852: xmlOutputBufferWriteString(buf, (const char *)cur->name);
853: if (cur->properties != NULL)
854: htmlAttrListDumpOutput(buf, doc, cur->properties, encoding);
855:
856: if ((info != NULL) && (info->empty)) {
857: xmlOutputBufferWriteString(buf, ">");
858: if (cur->next != NULL) {
859: if ((cur->next->type != HTML_TEXT_NODE) &&
860: (cur->next->type != HTML_ENTITY_REF_NODE))
861: xmlOutputBufferWriteString(buf, "\n");
862: }
863: return;
864: }
865: if ((cur->content == NULL) && (cur->children == NULL)) {
1.32 veillard 866: if ((info != NULL) && (info->endTag != 0) &&
867: (strcmp(info->name, "html")) && (strcmp(info->name, "body"))) {
1.21 veillard 868: xmlOutputBufferWriteString(buf, ">");
1.32 veillard 869: } else {
1.21 veillard 870: xmlOutputBufferWriteString(buf, "></");
871: xmlOutputBufferWriteString(buf, (const char *)cur->name);
872: xmlOutputBufferWriteString(buf, ">");
873: }
874: if (cur->next != NULL) {
875: if ((cur->next->type != HTML_TEXT_NODE) &&
876: (cur->next->type != HTML_ENTITY_REF_NODE))
877: xmlOutputBufferWriteString(buf, "\n");
878: }
879: return;
880: }
881: xmlOutputBufferWriteString(buf, ">");
882: if (cur->content != NULL) {
883: #if 0
884: xmlChar *buffer;
885:
886: #ifndef XML_USE_BUFFER_CONTENT
887: buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
888: #else
889: buffer = xmlEncodeEntitiesReentrant(doc,
890: xmlBufferContent(cur->content));
891: #endif
892: if (buffer != NULL) {
893: xmlOutputBufferWriteString(buf, buffer);
894: xmlFree(buffer);
895: }
896: #else
897: /*
898: * Uses the OutputBuffer property to automatically convert
899: * invalids to charrefs
900: */
901:
902: #ifndef XML_USE_BUFFER_CONTENT
903: xmlOutputBufferWriteString(buf, (const char *) cur->content);
904: #else
905: xmlOutputBufferWriteString(buf,
906: (const char *) xmlBufferContent(cur->content));
907: #endif
908: #endif
909: }
910: if (cur->children != NULL) {
911: if ((cur->children->type != HTML_TEXT_NODE) &&
912: (cur->children->type != HTML_ENTITY_REF_NODE) &&
913: (cur->children != cur->last))
914: xmlOutputBufferWriteString(buf, "\n");
915: htmlNodeListDumpOutput(buf, doc, cur->children, encoding);
916: if ((cur->last->type != HTML_TEXT_NODE) &&
917: (cur->last->type != HTML_ENTITY_REF_NODE) &&
918: (cur->children != cur->last))
919: xmlOutputBufferWriteString(buf, "\n");
920: }
921: if (!htmlIsAutoClosed(doc, cur)) {
922: xmlOutputBufferWriteString(buf, "</");
923: xmlOutputBufferWriteString(buf, (const char *)cur->name);
924: xmlOutputBufferWriteString(buf, ">");
925: }
926: if (cur->next != NULL) {
927: if ((cur->next->type != HTML_TEXT_NODE) &&
928: (cur->next->type != HTML_ENTITY_REF_NODE))
929: xmlOutputBufferWriteString(buf, "\n");
930: }
931: }
932:
933: /**
934: * htmlDocContentDump:
935: * @buf: the HTML buffer output
936: * @cur: the document
937: *
938: * Dump an HTML document.
939: */
940: static void
941: htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, const char *encoding) {
942: int type;
943:
944: /*
945: * force to output the stuff as HTML, especially for entities
946: */
947: type = cur->type;
948: cur->type = XML_HTML_DOCUMENT_NODE;
949: if (cur->intSubset != NULL)
950: htmlDtdDumpOutput(buf, cur, NULL);
951: else {
952: /* Default to HTML-4.0 transitionnal @@@@ */
953: xmlOutputBufferWriteString(buf, "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\" \"http://www.w3.org/TR/REC-html40/loose.dtd\">");
954:
955: }
956: if (cur->children != NULL) {
957: htmlNodeListDumpOutput(buf, cur, cur->children, encoding);
958: }
959: xmlOutputBufferWriteString(buf, "\n");
1.22 veillard 960: cur->type = (xmlElementType) type;
1.21 veillard 961: }
962:
963:
964: /************************************************************************
965: * *
966: * Saving functions front-ends *
967: * *
968: ************************************************************************/
969:
1.1 daniel 970: /**
971: * htmlDocDump:
972: * @f: the FILE*
973: * @cur: the document
974: *
975: * Dump an HTML document to an open FILE.
1.21 veillard 976: *
977: * returns: the number of byte written or -1 in case of failure.
1.1 daniel 978: */
1.21 veillard 979: int
1.1 daniel 980: htmlDocDump(FILE *f, xmlDocPtr cur) {
1.21 veillard 981: xmlOutputBufferPtr buf;
1.24 veillard 982: xmlCharEncodingHandlerPtr handler = NULL;
983: const char *encoding;
1.21 veillard 984: int ret;
1.1 daniel 985:
986: if (cur == NULL) {
987: #ifdef DEBUG_TREE
1.33 veillard 988: xmlGenericError(xmlGenericErrorContext,
989: "htmlDocDump : document == NULL\n");
1.1 daniel 990: #endif
1.21 veillard 991: return(-1);
1.1 daniel 992: }
1.24 veillard 993:
994: encoding = (const char *) htmlGetMetaEncoding(cur);
995:
996: if (encoding != NULL) {
997: xmlCharEncoding enc;
998:
999: enc = xmlParseCharEncoding(encoding);
1000: if (enc != cur->charset) {
1001: if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1002: /*
1003: * Not supported yet
1004: */
1005: return(-1);
1006: }
1007:
1008: handler = xmlFindCharEncodingHandler(encoding);
1009: if (handler == NULL)
1010: return(-1);
1011: }
1012: }
1013:
1014: /*
1.25 veillard 1015: * Fallback to HTML or ASCII when the encoding is unspecified
1.24 veillard 1016: */
1017: if (handler == NULL)
1.25 veillard 1018: handler = xmlFindCharEncodingHandler("HTML");
1019: if (handler == NULL)
1.24 veillard 1020: handler = xmlFindCharEncodingHandler("ascii");
1021:
1022: buf = xmlOutputBufferCreateFile(f, handler);
1.21 veillard 1023: if (buf == NULL) return(-1);
1024: htmlDocContentDumpOutput(buf, cur, NULL);
1025:
1026: ret = xmlOutputBufferClose(buf);
1027: return(ret);
1028: }
1029:
1030: /**
1031: * htmlSaveFile:
1032: * @filename: the filename (or URL)
1033: * @cur: the document
1034: *
1035: * Dump an HTML document to a file. If @filename is "-" the stdout file is
1036: * used.
1037: * returns: the number of byte written or -1 in case of failure.
1038: */
1039: int
1040: htmlSaveFile(const char *filename, xmlDocPtr cur) {
1041: xmlOutputBufferPtr buf;
1.24 veillard 1042: xmlCharEncodingHandlerPtr handler = NULL;
1043: const char *encoding;
1.21 veillard 1044: int ret;
1045:
1.24 veillard 1046: encoding = (const char *) htmlGetMetaEncoding(cur);
1047:
1048: if (encoding != NULL) {
1049: xmlCharEncoding enc;
1050:
1051: enc = xmlParseCharEncoding(encoding);
1052: if (enc != cur->charset) {
1053: if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1054: /*
1055: * Not supported yet
1056: */
1057: return(-1);
1058: }
1059:
1060: handler = xmlFindCharEncodingHandler(encoding);
1061: if (handler == NULL)
1062: return(-1);
1063: }
1064: }
1065:
1066: /*
1.25 veillard 1067: * Fallback to HTML or ASCII when the encoding is unspecified
1.24 veillard 1068: */
1069: if (handler == NULL)
1.25 veillard 1070: handler = xmlFindCharEncodingHandler("HTML");
1071: if (handler == NULL)
1.24 veillard 1072: handler = xmlFindCharEncodingHandler("ascii");
1073:
1.21 veillard 1074: /*
1075: * save the content to a temp buffer.
1076: */
1.24 veillard 1077: buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1.21 veillard 1078: if (buf == NULL) return(0);
1079:
1080: htmlDocContentDumpOutput(buf, cur, NULL);
1081:
1082: ret = xmlOutputBufferClose(buf);
1083: return(ret);
1.1 daniel 1084: }
1085:
1086: /**
1.26 veillard 1087: * htmlSaveFileEnc:
1.1 daniel 1088: * @filename: the filename
1089: * @cur: the document
1090: *
1.26 veillard 1091: * Dump an HTML document to a file using a given encoding.
1.1 daniel 1092: *
1093: * returns: the number of byte written or -1 in case of failure.
1094: */
1095: int
1.21 veillard 1096: htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1097: xmlOutputBufferPtr buf;
1098: xmlCharEncodingHandlerPtr handler = NULL;
1.1 daniel 1099: int ret;
1100:
1.21 veillard 1101: if (encoding != NULL) {
1102: xmlCharEncoding enc;
1103:
1104: enc = xmlParseCharEncoding(encoding);
1105: if (enc != cur->charset) {
1106: if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1107: /*
1108: * Not supported yet
1109: */
1110: return(-1);
1111: }
1112:
1113: handler = xmlFindCharEncodingHandler(encoding);
1114: if (handler == NULL)
1115: return(-1);
1.26 veillard 1116: htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
1.21 veillard 1117: }
1118: }
1.24 veillard 1119:
1120: /*
1.25 veillard 1121: * Fallback to HTML or ASCII when the encoding is unspecified
1.24 veillard 1122: */
1.25 veillard 1123: if (handler == NULL)
1124: handler = xmlFindCharEncodingHandler("HTML");
1.24 veillard 1125: if (handler == NULL)
1126: handler = xmlFindCharEncodingHandler("ascii");
1.21 veillard 1127:
1.1 daniel 1128: /*
1129: * save the content to a temp buffer.
1130: */
1.21 veillard 1131: buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1.1 daniel 1132: if (buf == NULL) return(0);
1133:
1.21 veillard 1134: htmlDocContentDumpOutput(buf, cur, encoding);
1.1 daniel 1135:
1.21 veillard 1136: ret = xmlOutputBufferClose(buf);
1137: return(ret);
1.1 daniel 1138: }
1.18 daniel 1139: #endif /* LIBXML_HTML_ENABLED */
Webmaster