Annotation of XML/HTMLtree.c, revision 1.28
1.1 daniel 1: /*
2: * HTMLtree.c : implementation of access functions for an HTML tree.
3: *
4: * See Copyright for the status of this software.
5: *
6: * Daniel.Veillard@w3.org
7: */
8:
1.5 daniel 9:
1.13 daniel 10: #ifdef WIN32
11: #include "win32config.h"
12: #else
1.1 daniel 13: #include "config.h"
1.5 daniel 14: #endif
1.18 daniel 15:
16: #include "xmlversion.h"
17: #ifdef LIBXML_HTML_ENABLED
18:
1.1 daniel 19: #include <stdio.h>
1.5 daniel 20: #include <string.h> /* for memset() only ! */
21:
22: #ifdef HAVE_CTYPE_H
1.1 daniel 23: #include <ctype.h>
1.5 daniel 24: #endif
25: #ifdef HAVE_STDLIB_H
1.1 daniel 26: #include <stdlib.h>
1.5 daniel 27: #endif
1.1 daniel 28:
1.18 daniel 29: #include <libxml/xmlmemory.h>
30: #include <libxml/HTMLparser.h>
31: #include <libxml/HTMLtree.h>
32: #include <libxml/entities.h>
33: #include <libxml/valid.h>
1.1 daniel 34:
1.21 veillard 35: /************************************************************************
36: * *
1.23 veillard 37: * Getting/Setting encoding meta tags *
38: * *
39: ************************************************************************/
40:
41: /**
42: * htmlGetMetaEncoding:
43: * @doc: the document
44: *
45: * Encoding definition lookup in the Meta tags
46: *
47: * Returns the current encoding as flagged in the HTML source
48: */
49: const xmlChar *
50: htmlGetMetaEncoding(htmlDocPtr doc) {
1.24 veillard 51: htmlNodePtr cur;
52: const xmlChar *content;
53: const xmlChar *encoding;
54:
55: if (doc == NULL)
56: return(NULL);
57: cur = doc->children;
58:
59: /*
60: * Search the html
61: */
62: while (cur != NULL) {
63: if (cur->name != NULL) {
64: if (!xmlStrcmp(cur->name, BAD_CAST"html"))
65: break;
66: if (!xmlStrcmp(cur->name, BAD_CAST"head"))
67: goto found_head;
68: if (!xmlStrcmp(cur->name, BAD_CAST"meta"))
69: goto found_meta;
70: }
71: cur = cur->next;
72: }
73: if (cur == NULL)
74: return(NULL);
75: cur = cur->children;
76:
77: /*
78: * Search the head
79: */
80: while (cur != NULL) {
81: if (cur->name != NULL) {
82: if (!xmlStrcmp(cur->name, BAD_CAST"head"))
83: break;
84: if (!xmlStrcmp(cur->name, BAD_CAST"meta"))
85: goto found_meta;
86: }
87: cur = cur->next;
88: }
89: if (cur == NULL)
90: return(NULL);
91: found_head:
92: cur = cur->children;
93:
94: /*
95: * Search the meta elements
96: */
97: found_meta:
98: while (cur != NULL) {
99: if (cur->name != NULL) {
100: if (!xmlStrcmp(cur->name, BAD_CAST"meta")) {
101: xmlAttrPtr attr = cur->properties;
102: int http;
103: const xmlChar *value;
104:
105: content = NULL;
106: http = 0;
107: while (attr != NULL) {
108: if ((attr->children != NULL) &&
109: (attr->children->type == XML_TEXT_NODE) &&
110: (attr->children->next == NULL)) {
111: #ifndef XML_USE_BUFFER_CONTENT
112: value = attr->children->content;
113: #else
114: value = xmlBufferContent(attr->children->content);
115: #endif
1.28 ! veillard 116: if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
! 117: && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
1.24 veillard 118: http = 1;
1.28 ! veillard 119: else if ((value != NULL)
! 120: && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
1.24 veillard 121: content = value;
122: if ((http != 0) && (content != NULL))
123: goto found_content;
124: }
125: attr = attr->next;
126: }
127: }
128: }
129: cur = cur->next;
130: }
131: return(NULL);
132:
133: found_content:
134: encoding = xmlStrstr(content, BAD_CAST"charset=");
135: if (encoding == NULL)
136: encoding = xmlStrstr(content, BAD_CAST"Charset=");
137: if (encoding == NULL)
138: encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
139: if (encoding != NULL) {
140: encoding += 8;
141: } else {
142: encoding = xmlStrstr(content, BAD_CAST"charset =");
143: if (encoding == NULL)
144: encoding = xmlStrstr(content, BAD_CAST"Charset =");
145: if (encoding == NULL)
146: encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
147: if (encoding != NULL)
148: encoding += 9;
149: }
150: if (encoding != NULL) {
151: while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
152: }
153: return(encoding);
1.23 veillard 154: }
155:
156: /**
157: * htmlSetMetaEncoding:
158: * @doc: the document
159: * @encoding: the encoding string
160: *
161: * Sets the current encoding in the Meta tags
162: * NOTE: this will not change the document content encoding, just
163: * the META flag associated.
164: *
165: * Returns 0 in case of success and -1 in case of error
166: */
167: int
168: htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
1.26 veillard 169: htmlNodePtr cur, meta;
170: const xmlChar *content;
171: char newcontent[100];
172:
173:
174: if (doc == NULL)
175: return(-1);
176:
177: if (encoding != NULL) {
1.27 veillard 178: #ifdef HAVE_SNPRINTF
179: snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
180: encoding);
181: #else
1.26 veillard 182: sprintf(newcontent, "text/html; charset=%s", encoding);
1.27 veillard 183: #endif
184: newcontent[sizeof(newcontent) - 1] = 0;
1.26 veillard 185: }
186:
187: cur = doc->children;
188:
189: /*
190: * Search the html
191: */
192: while (cur != NULL) {
193: if (cur->name != NULL) {
194: if (!xmlStrcmp(cur->name, BAD_CAST"html"))
195: break;
196: if (!xmlStrcmp(cur->name, BAD_CAST"body")) {
197: if (encoding == NULL)
198: return(0);
199: meta = xmlNewDocNode(doc, NULL, BAD_CAST"head", NULL);
200: xmlAddPrevSibling(cur, meta);
201: cur = meta;
202: meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
203: xmlAddChild(cur, meta);
204: xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
205: xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
206: return(0);
207: }
208: if (!xmlStrcmp(cur->name, BAD_CAST"head"))
209: goto found_head;
210: if (!xmlStrcmp(cur->name, BAD_CAST"meta"))
211: goto found_meta;
212: }
213: cur = cur->next;
214: }
215: if (cur == NULL)
216: return(-1);
217: cur = cur->children;
218:
219: /*
220: * Search the head
221: */
222: while (cur != NULL) {
223: if (cur->name != NULL) {
224: if (!xmlStrcmp(cur->name, BAD_CAST"head"))
225: break;
226: if (!xmlStrcmp(cur->name, BAD_CAST"body")) {
227: if (encoding == NULL)
228: return(0);
229: meta = xmlNewDocNode(doc, NULL, BAD_CAST"head", NULL);
230: xmlAddPrevSibling(cur, meta);
231: cur = meta;
232: meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
233: xmlAddChild(cur, meta);
234: xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
235: xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
236: return(0);
237: }
238: if (!xmlStrcmp(cur->name, BAD_CAST"meta"))
239: goto found_meta;
240: }
241: cur = cur->next;
242: }
243: if (cur == NULL)
244: return(-1);
245: found_head:
246: if (cur->children == NULL) {
247: if (encoding == NULL)
248: return(0);
249: meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
250: xmlAddChild(cur, meta);
251: xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
252: xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
253: return(0);
254: }
255: cur = cur->children;
256:
257: found_meta:
258: if (encoding != NULL) {
259: /*
260: * Create a new Meta element with the right aatributes
261: */
262:
263: meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
264: xmlAddPrevSibling(cur, meta);
265: xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
266: xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
267: }
268:
269: /*
270: * Search and destroy all the remaining the meta elements carrying
271: * encoding informations
272: */
273: while (cur != NULL) {
274: if (cur->name != NULL) {
275: if (!xmlStrcmp(cur->name, BAD_CAST"meta")) {
276: xmlAttrPtr attr = cur->properties;
277: int http;
278: const xmlChar *value;
279:
280: content = NULL;
281: http = 0;
282: while (attr != NULL) {
283: if ((attr->children != NULL) &&
284: (attr->children->type == XML_TEXT_NODE) &&
285: (attr->children->next == NULL)) {
286: #ifndef XML_USE_BUFFER_CONTENT
287: value = attr->children->content;
288: #else
289: value = xmlBufferContent(attr->children->content);
290: #endif
1.28 ! veillard 291: if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
! 292: && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
1.26 veillard 293: http = 1;
1.28 ! veillard 294: else if ((value != NULL)
! 295: && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
1.26 veillard 296: content = value;
297: if ((http != 0) && (content != NULL))
298: break;
299: }
300: attr = attr->next;
301: }
302: if ((http != 0) && (content != NULL)) {
303: meta = cur;
304: cur = cur->next;
305: xmlUnlinkNode(meta);
306: xmlFreeNode(meta);
307: continue;
308: }
309:
310: }
311: }
312: cur = cur->next;
313: }
314: return(0);
1.23 veillard 315: }
316:
317: /************************************************************************
318: * *
1.21 veillard 319: * Dumping HTML tree content to a simple buffer *
320: * *
321: ************************************************************************/
322:
1.14 daniel 323: static void
324: htmlDocContentDump(xmlBufferPtr buf, xmlDocPtr cur);
325:
1.1 daniel 326: /**
327: * htmlDtdDump:
328: * @buf: the HTML buffer output
329: * @doc: the document
330: *
331: * Dump the HTML document DTD, if any.
332: */
333: static void
334: htmlDtdDump(xmlBufferPtr buf, xmlDocPtr doc) {
335: xmlDtdPtr cur = doc->intSubset;
336:
337: if (cur == NULL) {
338: fprintf(stderr, "htmlDtdDump : no internal subset\n");
339: return;
340: }
341: xmlBufferWriteChar(buf, "<!DOCTYPE ");
342: xmlBufferWriteCHAR(buf, cur->name);
343: if (cur->ExternalID != NULL) {
344: xmlBufferWriteChar(buf, " PUBLIC ");
345: xmlBufferWriteQuotedString(buf, cur->ExternalID);
1.2 daniel 346: if (cur->SystemID != NULL) {
347: xmlBufferWriteChar(buf, " ");
348: xmlBufferWriteQuotedString(buf, cur->SystemID);
349: }
1.1 daniel 350: } else if (cur->SystemID != NULL) {
351: xmlBufferWriteChar(buf, " SYSTEM ");
352: xmlBufferWriteQuotedString(buf, cur->SystemID);
353: }
354: xmlBufferWriteChar(buf, ">\n");
355: }
356:
357: /**
358: * htmlAttrDump:
359: * @buf: the HTML buffer output
360: * @doc: the document
361: * @cur: the attribute pointer
362: *
363: * Dump an HTML attribute
364: */
365: static void
366: htmlAttrDump(xmlBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
1.6 daniel 367: xmlChar *value;
1.1 daniel 368:
369: if (cur == NULL) {
370: fprintf(stderr, "htmlAttrDump : property == NULL\n");
371: return;
372: }
373: xmlBufferWriteChar(buf, " ");
374: xmlBufferWriteCHAR(buf, cur->name);
1.19 daniel 375: if (cur->children != NULL) {
376: value = xmlNodeListGetString(doc, cur->children, 0);
377: if (value) {
378: xmlBufferWriteChar(buf, "=");
379: xmlBufferWriteQuotedString(buf, value);
380: xmlFree(value);
381: } else {
382: xmlBufferWriteChar(buf, "=\"\"");
383: }
1.1 daniel 384: }
385: }
386:
387: /**
388: * htmlAttrListDump:
389: * @buf: the HTML buffer output
390: * @doc: the document
391: * @cur: the first attribute pointer
392: *
393: * Dump a list of HTML attributes
394: */
395: static void
396: htmlAttrListDump(xmlBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
397: if (cur == NULL) {
398: fprintf(stderr, "htmlAttrListDump : property == NULL\n");
399: return;
400: }
401: while (cur != NULL) {
402: htmlAttrDump(buf, doc, cur);
403: cur = cur->next;
404: }
405: }
406:
407:
1.14 daniel 408: void
1.1 daniel 409: htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur);
410: /**
411: * htmlNodeListDump:
412: * @buf: the HTML buffer output
413: * @doc: the document
414: * @cur: the first node
415: *
416: * Dump an HTML node list, recursive behaviour,children are printed too.
417: */
418: static void
419: htmlNodeListDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
420: if (cur == NULL) {
421: fprintf(stderr, "htmlNodeListDump : node == NULL\n");
422: return;
423: }
424: while (cur != NULL) {
425: htmlNodeDump(buf, doc, cur);
426: cur = cur->next;
427: }
428: }
429:
430: /**
431: * htmlNodeDump:
432: * @buf: the HTML buffer output
433: * @doc: the document
434: * @cur: the current node
435: *
436: * Dump an HTML node, recursive behaviour,children are printed too.
437: */
1.14 daniel 438: void
1.1 daniel 439: htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
440: htmlElemDescPtr info;
441:
442: if (cur == NULL) {
443: fprintf(stderr, "htmlNodeDump : node == NULL\n");
444: return;
445: }
446: /*
447: * Special cases.
448: */
1.20 daniel 449: if (cur->type == XML_DTD_NODE)
450: return;
1.14 daniel 451: if (cur->type == XML_HTML_DOCUMENT_NODE) {
452: htmlDocContentDump(buf, (xmlDocPtr) cur);
453: return;
454: }
1.1 daniel 455: if (cur->type == HTML_TEXT_NODE) {
456: if (cur->content != NULL) {
1.6 daniel 457: xmlChar *buffer;
1.1 daniel 458:
1.9 daniel 459: #ifndef XML_USE_BUFFER_CONTENT
1.1 daniel 460: buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
1.9 daniel 461: #else
462: buffer = xmlEncodeEntitiesReentrant(doc,
463: xmlBufferContent(cur->content));
464: #endif
1.1 daniel 465: if (buffer != NULL) {
466: xmlBufferWriteCHAR(buf, buffer);
1.4 daniel 467: xmlFree(buffer);
1.1 daniel 468: }
469: }
470: return;
471: }
472: if (cur->type == HTML_COMMENT_NODE) {
473: if (cur->content != NULL) {
474: xmlBufferWriteChar(buf, "<!--");
1.9 daniel 475: #ifndef XML_USE_BUFFER_CONTENT
1.1 daniel 476: xmlBufferWriteCHAR(buf, cur->content);
1.9 daniel 477: #else
478: xmlBufferWriteCHAR(buf, xmlBufferContent(cur->content));
479: #endif
1.1 daniel 480: xmlBufferWriteChar(buf, "-->");
481: }
482: return;
483: }
484: if (cur->type == HTML_ENTITY_REF_NODE) {
485: xmlBufferWriteChar(buf, "&");
486: xmlBufferWriteCHAR(buf, cur->name);
487: xmlBufferWriteChar(buf, ";");
488: return;
489: }
490:
491: /*
492: * Get specific HTmL info for taht node.
493: */
494: info = htmlTagLookup(cur->name);
495:
496: xmlBufferWriteChar(buf, "<");
497: xmlBufferWriteCHAR(buf, cur->name);
498: if (cur->properties != NULL)
499: htmlAttrListDump(buf, doc, cur->properties);
500:
1.7 daniel 501: if ((info != NULL) && (info->empty)) {
1.1 daniel 502: xmlBufferWriteChar(buf, ">");
503: if (cur->next != NULL) {
504: if ((cur->next->type != HTML_TEXT_NODE) &&
505: (cur->next->type != HTML_ENTITY_REF_NODE))
506: xmlBufferWriteChar(buf, "\n");
507: }
508: return;
509: }
1.17 daniel 510: if ((cur->content == NULL) && (cur->children == NULL)) {
1.7 daniel 511: if ((info != NULL) && (info->endTag != 0))
1.1 daniel 512: xmlBufferWriteChar(buf, ">");
513: else {
514: xmlBufferWriteChar(buf, "></");
515: xmlBufferWriteCHAR(buf, cur->name);
516: xmlBufferWriteChar(buf, ">");
517: }
518: if (cur->next != NULL) {
519: if ((cur->next->type != HTML_TEXT_NODE) &&
520: (cur->next->type != HTML_ENTITY_REF_NODE))
521: xmlBufferWriteChar(buf, "\n");
522: }
523: return;
524: }
525: xmlBufferWriteChar(buf, ">");
526: if (cur->content != NULL) {
1.6 daniel 527: xmlChar *buffer;
1.1 daniel 528:
1.9 daniel 529: #ifndef XML_USE_BUFFER_CONTENT
530: buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
531: #else
532: buffer = xmlEncodeEntitiesReentrant(doc,
533: xmlBufferContent(cur->content));
534: #endif
1.1 daniel 535: if (buffer != NULL) {
536: xmlBufferWriteCHAR(buf, buffer);
1.4 daniel 537: xmlFree(buffer);
1.1 daniel 538: }
539: }
1.17 daniel 540: if (cur->children != NULL) {
541: if ((cur->children->type != HTML_TEXT_NODE) &&
542: (cur->children->type != HTML_ENTITY_REF_NODE) &&
543: (cur->children != cur->last))
1.1 daniel 544: xmlBufferWriteChar(buf, "\n");
1.17 daniel 545: htmlNodeListDump(buf, doc, cur->children);
1.1 daniel 546: if ((cur->last->type != HTML_TEXT_NODE) &&
1.10 daniel 547: (cur->last->type != HTML_ENTITY_REF_NODE) &&
1.17 daniel 548: (cur->children != cur->last))
1.1 daniel 549: xmlBufferWriteChar(buf, "\n");
550: }
1.11 daniel 551: if (!htmlIsAutoClosed(doc, cur)) {
552: xmlBufferWriteChar(buf, "</");
553: xmlBufferWriteCHAR(buf, cur->name);
554: xmlBufferWriteChar(buf, ">");
555: }
1.1 daniel 556: if (cur->next != NULL) {
557: if ((cur->next->type != HTML_TEXT_NODE) &&
558: (cur->next->type != HTML_ENTITY_REF_NODE))
559: xmlBufferWriteChar(buf, "\n");
560: }
561: }
562:
563: /**
1.16 daniel 564: * htmlNodeDumpFile:
565: * @out: the FILE pointer
566: * @doc: the document
567: * @cur: the current node
568: *
569: * Dump an HTML node, recursive behaviour,children are printed too.
570: */
571: void
572: htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
573: xmlBufferPtr buf;
574:
575: buf = xmlBufferCreate();
576: if (buf == NULL) return;
577: htmlNodeDump(buf, doc, cur);
578: xmlBufferDump(out, buf);
579: xmlBufferFree(buf);
580: }
581:
582: /**
1.1 daniel 583: * htmlDocContentDump:
584: * @buf: the HTML buffer output
585: * @cur: the document
586: *
587: * Dump an HTML document.
588: */
589: static void
590: htmlDocContentDump(xmlBufferPtr buf, xmlDocPtr cur) {
1.12 daniel 591: int type;
592:
593: /*
594: * force to output the stuff as HTML, especially for entities
595: */
596: type = cur->type;
597: cur->type = XML_HTML_DOCUMENT_NODE;
1.1 daniel 598: if (cur->intSubset != NULL)
599: htmlDtdDump(buf, cur);
1.11 daniel 600: else {
601: /* Default to HTML-4.0 transitionnal @@@@ */
602: xmlBufferWriteChar(buf, "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\" \"http://www.w3.org/TR/REC-html40/loose.dtd\">");
603:
604: }
1.17 daniel 605: if (cur->children != NULL) {
606: htmlNodeListDump(buf, cur, cur->children);
1.1 daniel 607: }
608: xmlBufferWriteChar(buf, "\n");
1.22 veillard 609: cur->type = (xmlElementType) type;
1.1 daniel 610: }
611:
612: /**
613: * htmlDocDumpMemory:
614: * @cur: the document
615: * @mem: OUT: the memory pointer
616: * @size: OUT: the memory lenght
617: *
1.6 daniel 618: * Dump an HTML document in memory and return the xmlChar * and it's size.
1.1 daniel 619: * It's up to the caller to free the memory.
620: */
621: void
1.6 daniel 622: htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
1.1 daniel 623: xmlBufferPtr buf;
624:
625: if (cur == NULL) {
626: #ifdef DEBUG_TREE
1.15 daniel 627: fprintf(stderr, "htmlxmlDocDumpMemory : document == NULL\n");
1.1 daniel 628: #endif
629: *mem = NULL;
630: *size = 0;
631: return;
632: }
633: buf = xmlBufferCreate();
634: if (buf == NULL) {
635: *mem = NULL;
636: *size = 0;
637: return;
638: }
639: htmlDocContentDump(buf, cur);
640: *mem = buf->content;
641: *size = buf->use;
642: memset(buf, -1, sizeof(xmlBuffer));
1.4 daniel 643: xmlFree(buf);
1.1 daniel 644: }
645:
646:
1.21 veillard 647: /************************************************************************
648: * *
649: * Dumping HTML tree content to an I/O output buffer *
650: * *
651: ************************************************************************/
652:
653: static void
654: htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, const char *encoding);
655:
656: /**
657: * htmlDtdDump:
658: * @buf: the HTML buffer output
659: * @doc: the document
660: *
661: * Dump the HTML document DTD, if any.
662: */
663: static void
664: htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, const char *encoding) {
665: xmlDtdPtr cur = doc->intSubset;
666:
667: if (cur == NULL) {
668: fprintf(stderr, "htmlDtdDump : no internal subset\n");
669: return;
670: }
671: xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
672: xmlOutputBufferWriteString(buf, (const char *)cur->name);
673: if (cur->ExternalID != NULL) {
674: xmlOutputBufferWriteString(buf, " PUBLIC ");
675: xmlBufferWriteQuotedString(buf->buffer, cur->ExternalID);
676: if (cur->SystemID != NULL) {
677: xmlOutputBufferWriteString(buf, " ");
678: xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
679: }
680: } else if (cur->SystemID != NULL) {
681: xmlOutputBufferWriteString(buf, " SYSTEM ");
682: xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
683: }
684: xmlOutputBufferWriteString(buf, ">\n");
685: }
686:
687: /**
688: * htmlAttrDump:
689: * @buf: the HTML buffer output
690: * @doc: the document
691: * @cur: the attribute pointer
692: *
693: * Dump an HTML attribute
694: */
695: static void
696: htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) {
697: xmlChar *value;
698:
699: if (cur == NULL) {
700: fprintf(stderr, "htmlAttrDump : property == NULL\n");
701: return;
702: }
703: xmlOutputBufferWriteString(buf, " ");
704: xmlOutputBufferWriteString(buf, (const char *)cur->name);
705: if (cur->children != NULL) {
706: value = xmlNodeListGetString(doc, cur->children, 0);
707: if (value) {
708: xmlOutputBufferWriteString(buf, "=");
709: xmlBufferWriteQuotedString(buf->buffer, value);
710: xmlFree(value);
711: } else {
712: xmlOutputBufferWriteString(buf, "=\"\"");
713: }
714: }
715: }
716:
717: /**
718: * htmlAttrListDump:
719: * @buf: the HTML buffer output
720: * @doc: the document
721: * @cur: the first attribute pointer
722: *
723: * Dump a list of HTML attributes
724: */
725: static void
726: htmlAttrListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) {
727: if (cur == NULL) {
728: fprintf(stderr, "htmlAttrListDump : property == NULL\n");
729: return;
730: }
731: while (cur != NULL) {
732: htmlAttrDumpOutput(buf, doc, cur, encoding);
733: cur = cur->next;
734: }
735: }
736:
737:
738: void htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
739: xmlNodePtr cur, const char *encoding);
740:
741: /**
742: * htmlNodeListDump:
743: * @buf: the HTML buffer output
744: * @doc: the document
745: * @cur: the first node
746: *
747: * Dump an HTML node list, recursive behaviour,children are printed too.
748: */
749: static void
750: htmlNodeListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, const char *encoding) {
751: if (cur == NULL) {
752: fprintf(stderr, "htmlNodeListDump : node == NULL\n");
753: return;
754: }
755: while (cur != NULL) {
756: htmlNodeDumpOutput(buf, doc, cur, encoding);
757: cur = cur->next;
758: }
759: }
760:
761: /**
762: * htmlNodeDump:
763: * @buf: the HTML buffer output
764: * @doc: the document
765: * @cur: the current node
766: *
767: * Dump an HTML node, recursive behaviour,children are printed too.
768: */
769: void
770: htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, const char *encoding) {
771: htmlElemDescPtr info;
772:
773: if (cur == NULL) {
774: fprintf(stderr, "htmlNodeDump : node == NULL\n");
775: return;
776: }
777: /*
778: * Special cases.
779: */
780: if (cur->type == XML_DTD_NODE)
781: return;
782: if (cur->type == XML_HTML_DOCUMENT_NODE) {
783: htmlDocContentDumpOutput(buf, (xmlDocPtr) cur, encoding);
784: return;
785: }
786: if (cur->type == HTML_TEXT_NODE) {
787: if (cur->content != NULL) {
788: xmlChar *buffer;
789:
790: #ifndef XML_USE_BUFFER_CONTENT
791: buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
792: #else
793: buffer = xmlEncodeEntitiesReentrant(doc,
794: xmlBufferContent(cur->content));
795: #endif
796: if (buffer != NULL) {
1.25 veillard 797: xmlOutputBufferWriteString(buf, (const char *)buffer);
1.21 veillard 798: xmlFree(buffer);
799: }
800: }
801: return;
802: }
803: if (cur->type == HTML_COMMENT_NODE) {
804: if (cur->content != NULL) {
805: xmlOutputBufferWriteString(buf, "<!--");
806: #ifndef XML_USE_BUFFER_CONTENT
807: xmlOutputBufferWriteString(buf, (const char *)cur->content);
808: #else
809: xmlOutputBufferWriteString(buf, xmlBufferContent(cur->content));
810: #endif
811: xmlOutputBufferWriteString(buf, "-->");
812: }
813: return;
814: }
815: if (cur->type == HTML_ENTITY_REF_NODE) {
816: xmlOutputBufferWriteString(buf, "&");
817: xmlOutputBufferWriteString(buf, (const char *)cur->name);
818: xmlOutputBufferWriteString(buf, ";");
819: return;
820: }
821:
822: /*
823: * Get specific HTmL info for taht node.
824: */
825: info = htmlTagLookup(cur->name);
826:
827: xmlOutputBufferWriteString(buf, "<");
828: xmlOutputBufferWriteString(buf, (const char *)cur->name);
829: if (cur->properties != NULL)
830: htmlAttrListDumpOutput(buf, doc, cur->properties, encoding);
831:
832: if ((info != NULL) && (info->empty)) {
833: xmlOutputBufferWriteString(buf, ">");
834: if (cur->next != NULL) {
835: if ((cur->next->type != HTML_TEXT_NODE) &&
836: (cur->next->type != HTML_ENTITY_REF_NODE))
837: xmlOutputBufferWriteString(buf, "\n");
838: }
839: return;
840: }
841: if ((cur->content == NULL) && (cur->children == NULL)) {
842: if ((info != NULL) && (info->endTag != 0))
843: xmlOutputBufferWriteString(buf, ">");
844: else {
845: xmlOutputBufferWriteString(buf, "></");
846: xmlOutputBufferWriteString(buf, (const char *)cur->name);
847: xmlOutputBufferWriteString(buf, ">");
848: }
849: if (cur->next != NULL) {
850: if ((cur->next->type != HTML_TEXT_NODE) &&
851: (cur->next->type != HTML_ENTITY_REF_NODE))
852: xmlOutputBufferWriteString(buf, "\n");
853: }
854: return;
855: }
856: xmlOutputBufferWriteString(buf, ">");
857: if (cur->content != NULL) {
858: #if 0
859: xmlChar *buffer;
860:
861: #ifndef XML_USE_BUFFER_CONTENT
862: buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
863: #else
864: buffer = xmlEncodeEntitiesReentrant(doc,
865: xmlBufferContent(cur->content));
866: #endif
867: if (buffer != NULL) {
868: xmlOutputBufferWriteString(buf, buffer);
869: xmlFree(buffer);
870: }
871: #else
872: /*
873: * Uses the OutputBuffer property to automatically convert
874: * invalids to charrefs
875: */
876:
877: #ifndef XML_USE_BUFFER_CONTENT
878: xmlOutputBufferWriteString(buf, (const char *) cur->content);
879: #else
880: xmlOutputBufferWriteString(buf,
881: (const char *) xmlBufferContent(cur->content));
882: #endif
883: #endif
884: }
885: if (cur->children != NULL) {
886: if ((cur->children->type != HTML_TEXT_NODE) &&
887: (cur->children->type != HTML_ENTITY_REF_NODE) &&
888: (cur->children != cur->last))
889: xmlOutputBufferWriteString(buf, "\n");
890: htmlNodeListDumpOutput(buf, doc, cur->children, encoding);
891: if ((cur->last->type != HTML_TEXT_NODE) &&
892: (cur->last->type != HTML_ENTITY_REF_NODE) &&
893: (cur->children != cur->last))
894: xmlOutputBufferWriteString(buf, "\n");
895: }
896: if (!htmlIsAutoClosed(doc, cur)) {
897: xmlOutputBufferWriteString(buf, "</");
898: xmlOutputBufferWriteString(buf, (const char *)cur->name);
899: xmlOutputBufferWriteString(buf, ">");
900: }
901: if (cur->next != NULL) {
902: if ((cur->next->type != HTML_TEXT_NODE) &&
903: (cur->next->type != HTML_ENTITY_REF_NODE))
904: xmlOutputBufferWriteString(buf, "\n");
905: }
906: }
907:
908: /**
909: * htmlDocContentDump:
910: * @buf: the HTML buffer output
911: * @cur: the document
912: *
913: * Dump an HTML document.
914: */
915: static void
916: htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, const char *encoding) {
917: int type;
918:
919: /*
920: * force to output the stuff as HTML, especially for entities
921: */
922: type = cur->type;
923: cur->type = XML_HTML_DOCUMENT_NODE;
924: if (cur->intSubset != NULL)
925: htmlDtdDumpOutput(buf, cur, NULL);
926: else {
927: /* Default to HTML-4.0 transitionnal @@@@ */
928: xmlOutputBufferWriteString(buf, "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\" \"http://www.w3.org/TR/REC-html40/loose.dtd\">");
929:
930: }
931: if (cur->children != NULL) {
932: htmlNodeListDumpOutput(buf, cur, cur->children, encoding);
933: }
934: xmlOutputBufferWriteString(buf, "\n");
1.22 veillard 935: cur->type = (xmlElementType) type;
1.21 veillard 936: }
937:
938:
939: /************************************************************************
940: * *
941: * Saving functions front-ends *
942: * *
943: ************************************************************************/
944:
1.1 daniel 945: /**
946: * htmlDocDump:
947: * @f: the FILE*
948: * @cur: the document
949: *
950: * Dump an HTML document to an open FILE.
1.21 veillard 951: *
952: * returns: the number of byte written or -1 in case of failure.
1.1 daniel 953: */
1.21 veillard 954: int
1.1 daniel 955: htmlDocDump(FILE *f, xmlDocPtr cur) {
1.21 veillard 956: xmlOutputBufferPtr buf;
1.24 veillard 957: xmlCharEncodingHandlerPtr handler = NULL;
958: const char *encoding;
1.21 veillard 959: int ret;
1.1 daniel 960:
961: if (cur == NULL) {
962: #ifdef DEBUG_TREE
1.15 daniel 963: fprintf(stderr, "htmlDocDump : document == NULL\n");
1.1 daniel 964: #endif
1.21 veillard 965: return(-1);
1.1 daniel 966: }
1.24 veillard 967:
968: encoding = (const char *) htmlGetMetaEncoding(cur);
969:
970: if (encoding != NULL) {
971: xmlCharEncoding enc;
972:
973: enc = xmlParseCharEncoding(encoding);
974: if (enc != cur->charset) {
975: if (cur->charset != XML_CHAR_ENCODING_UTF8) {
976: /*
977: * Not supported yet
978: */
979: return(-1);
980: }
981:
982: handler = xmlFindCharEncodingHandler(encoding);
983: if (handler == NULL)
984: return(-1);
985: }
986: }
987:
988: /*
1.25 veillard 989: * Fallback to HTML or ASCII when the encoding is unspecified
1.24 veillard 990: */
991: if (handler == NULL)
1.25 veillard 992: handler = xmlFindCharEncodingHandler("HTML");
993: if (handler == NULL)
1.24 veillard 994: handler = xmlFindCharEncodingHandler("ascii");
995:
996: buf = xmlOutputBufferCreateFile(f, handler);
1.21 veillard 997: if (buf == NULL) return(-1);
998: htmlDocContentDumpOutput(buf, cur, NULL);
999:
1000: ret = xmlOutputBufferClose(buf);
1001: return(ret);
1002: }
1003:
1004: /**
1005: * htmlSaveFile:
1006: * @filename: the filename (or URL)
1007: * @cur: the document
1008: *
1009: * Dump an HTML document to a file. If @filename is "-" the stdout file is
1010: * used.
1011: * returns: the number of byte written or -1 in case of failure.
1012: */
1013: int
1014: htmlSaveFile(const char *filename, xmlDocPtr cur) {
1015: xmlOutputBufferPtr buf;
1.24 veillard 1016: xmlCharEncodingHandlerPtr handler = NULL;
1017: const char *encoding;
1.21 veillard 1018: int ret;
1019:
1.24 veillard 1020: encoding = (const char *) htmlGetMetaEncoding(cur);
1021:
1022: if (encoding != NULL) {
1023: xmlCharEncoding enc;
1024:
1025: enc = xmlParseCharEncoding(encoding);
1026: if (enc != cur->charset) {
1027: if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1028: /*
1029: * Not supported yet
1030: */
1031: return(-1);
1032: }
1033:
1034: handler = xmlFindCharEncodingHandler(encoding);
1035: if (handler == NULL)
1036: return(-1);
1037: }
1038: }
1039:
1040: /*
1.25 veillard 1041: * Fallback to HTML or ASCII when the encoding is unspecified
1.24 veillard 1042: */
1043: if (handler == NULL)
1.25 veillard 1044: handler = xmlFindCharEncodingHandler("HTML");
1045: if (handler == NULL)
1.24 veillard 1046: handler = xmlFindCharEncodingHandler("ascii");
1047:
1.21 veillard 1048: /*
1049: * save the content to a temp buffer.
1050: */
1.24 veillard 1051: buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1.21 veillard 1052: if (buf == NULL) return(0);
1053:
1054: htmlDocContentDumpOutput(buf, cur, NULL);
1055:
1056: ret = xmlOutputBufferClose(buf);
1057: return(ret);
1.1 daniel 1058: }
1059:
1060: /**
1.26 veillard 1061: * htmlSaveFileEnc:
1.1 daniel 1062: * @filename: the filename
1063: * @cur: the document
1064: *
1.26 veillard 1065: * Dump an HTML document to a file using a given encoding.
1.1 daniel 1066: *
1067: * returns: the number of byte written or -1 in case of failure.
1068: */
1069: int
1.21 veillard 1070: htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1071: xmlOutputBufferPtr buf;
1072: xmlCharEncodingHandlerPtr handler = NULL;
1.1 daniel 1073: int ret;
1074:
1.21 veillard 1075: if (encoding != NULL) {
1076: xmlCharEncoding enc;
1077:
1078: enc = xmlParseCharEncoding(encoding);
1079: if (enc != cur->charset) {
1080: if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1081: /*
1082: * Not supported yet
1083: */
1084: return(-1);
1085: }
1086:
1087: handler = xmlFindCharEncodingHandler(encoding);
1088: if (handler == NULL)
1089: return(-1);
1.26 veillard 1090: htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
1.21 veillard 1091: }
1092: }
1.24 veillard 1093:
1094: /*
1.25 veillard 1095: * Fallback to HTML or ASCII when the encoding is unspecified
1.24 veillard 1096: */
1.25 veillard 1097: if (handler == NULL)
1098: handler = xmlFindCharEncodingHandler("HTML");
1.24 veillard 1099: if (handler == NULL)
1100: handler = xmlFindCharEncodingHandler("ascii");
1.21 veillard 1101:
1.1 daniel 1102: /*
1103: * save the content to a temp buffer.
1104: */
1.21 veillard 1105: buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1.1 daniel 1106: if (buf == NULL) return(0);
1107:
1.21 veillard 1108: htmlDocContentDumpOutput(buf, cur, encoding);
1.1 daniel 1109:
1.21 veillard 1110: ret = xmlOutputBufferClose(buf);
1111: return(ret);
1.1 daniel 1112: }
1.18 daniel 1113: #endif /* LIBXML_HTML_ENABLED */
Webmaster