Annotation of XML/HTMLtree.c, revision 1.31
1.1 daniel 1: /*
2: * HTMLtree.c : implementation of access functions for an HTML tree.
3: *
4: * See Copyright for the status of this software.
5: *
6: * Daniel.Veillard@w3.org
7: */
8:
1.5 daniel 9:
1.13 daniel 10: #ifdef WIN32
11: #include "win32config.h"
12: #else
1.1 daniel 13: #include "config.h"
1.5 daniel 14: #endif
1.18 daniel 15:
1.30 veillard 16: #include <libxml/xmlversion.h>
1.18 daniel 17: #ifdef LIBXML_HTML_ENABLED
18:
1.1 daniel 19: #include <stdio.h>
1.5 daniel 20: #include <string.h> /* for memset() only ! */
21:
22: #ifdef HAVE_CTYPE_H
1.1 daniel 23: #include <ctype.h>
1.5 daniel 24: #endif
25: #ifdef HAVE_STDLIB_H
1.1 daniel 26: #include <stdlib.h>
1.5 daniel 27: #endif
1.1 daniel 28:
1.18 daniel 29: #include <libxml/xmlmemory.h>
30: #include <libxml/HTMLparser.h>
31: #include <libxml/HTMLtree.h>
32: #include <libxml/entities.h>
33: #include <libxml/valid.h>
1.1 daniel 34:
1.21 veillard 35: /************************************************************************
36: * *
1.23 veillard 37: * Getting/Setting encoding meta tags *
38: * *
39: ************************************************************************/
40:
41: /**
42: * htmlGetMetaEncoding:
43: * @doc: the document
44: *
45: * Encoding definition lookup in the Meta tags
46: *
47: * Returns the current encoding as flagged in the HTML source
48: */
49: const xmlChar *
50: htmlGetMetaEncoding(htmlDocPtr doc) {
1.24 veillard 51: htmlNodePtr cur;
52: const xmlChar *content;
53: const xmlChar *encoding;
54:
55: if (doc == NULL)
56: return(NULL);
57: cur = doc->children;
58:
59: /*
60: * Search the html
61: */
62: while (cur != NULL) {
63: if (cur->name != NULL) {
1.29 veillard 64: if (xmlStrEqual(cur->name, BAD_CAST"html"))
1.24 veillard 65: break;
1.29 veillard 66: if (xmlStrEqual(cur->name, BAD_CAST"head"))
1.24 veillard 67: goto found_head;
1.29 veillard 68: if (xmlStrEqual(cur->name, BAD_CAST"meta"))
1.24 veillard 69: goto found_meta;
70: }
71: cur = cur->next;
72: }
73: if (cur == NULL)
74: return(NULL);
75: cur = cur->children;
76:
77: /*
78: * Search the head
79: */
80: while (cur != NULL) {
81: if (cur->name != NULL) {
1.29 veillard 82: if (xmlStrEqual(cur->name, BAD_CAST"head"))
1.24 veillard 83: break;
1.29 veillard 84: if (xmlStrEqual(cur->name, BAD_CAST"meta"))
1.24 veillard 85: goto found_meta;
86: }
87: cur = cur->next;
88: }
89: if (cur == NULL)
90: return(NULL);
91: found_head:
92: cur = cur->children;
93:
94: /*
95: * Search the meta elements
96: */
97: found_meta:
98: while (cur != NULL) {
99: if (cur->name != NULL) {
1.29 veillard 100: if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
1.24 veillard 101: xmlAttrPtr attr = cur->properties;
102: int http;
103: const xmlChar *value;
104:
105: content = NULL;
106: http = 0;
107: while (attr != NULL) {
108: if ((attr->children != NULL) &&
109: (attr->children->type == XML_TEXT_NODE) &&
110: (attr->children->next == NULL)) {
111: #ifndef XML_USE_BUFFER_CONTENT
112: value = attr->children->content;
113: #else
114: value = xmlBufferContent(attr->children->content);
115: #endif
1.28 veillard 116: if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
117: && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
1.24 veillard 118: http = 1;
1.28 veillard 119: else if ((value != NULL)
120: && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
1.24 veillard 121: content = value;
122: if ((http != 0) && (content != NULL))
123: goto found_content;
124: }
125: attr = attr->next;
126: }
127: }
128: }
129: cur = cur->next;
130: }
131: return(NULL);
132:
133: found_content:
134: encoding = xmlStrstr(content, BAD_CAST"charset=");
135: if (encoding == NULL)
136: encoding = xmlStrstr(content, BAD_CAST"Charset=");
137: if (encoding == NULL)
138: encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
139: if (encoding != NULL) {
140: encoding += 8;
141: } else {
142: encoding = xmlStrstr(content, BAD_CAST"charset =");
143: if (encoding == NULL)
144: encoding = xmlStrstr(content, BAD_CAST"Charset =");
145: if (encoding == NULL)
146: encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
147: if (encoding != NULL)
148: encoding += 9;
149: }
150: if (encoding != NULL) {
151: while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
152: }
153: return(encoding);
1.23 veillard 154: }
155:
156: /**
157: * htmlSetMetaEncoding:
158: * @doc: the document
159: * @encoding: the encoding string
160: *
161: * Sets the current encoding in the Meta tags
162: * NOTE: this will not change the document content encoding, just
163: * the META flag associated.
164: *
165: * Returns 0 in case of success and -1 in case of error
166: */
167: int
168: htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
1.26 veillard 169: htmlNodePtr cur, meta;
170: const xmlChar *content;
171: char newcontent[100];
172:
173:
174: if (doc == NULL)
175: return(-1);
176:
177: if (encoding != NULL) {
1.27 veillard 178: #ifdef HAVE_SNPRINTF
179: snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
180: encoding);
181: #else
1.26 veillard 182: sprintf(newcontent, "text/html; charset=%s", encoding);
1.27 veillard 183: #endif
184: newcontent[sizeof(newcontent) - 1] = 0;
1.26 veillard 185: }
186:
187: cur = doc->children;
188:
189: /*
190: * Search the html
191: */
192: while (cur != NULL) {
193: if (cur->name != NULL) {
1.29 veillard 194: if (xmlStrEqual(cur->name, BAD_CAST"html"))
1.26 veillard 195: break;
1.29 veillard 196: if (xmlStrEqual(cur->name, BAD_CAST"body")) {
1.26 veillard 197: if (encoding == NULL)
198: return(0);
199: meta = xmlNewDocNode(doc, NULL, BAD_CAST"head", NULL);
200: xmlAddPrevSibling(cur, meta);
201: cur = meta;
202: meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
203: xmlAddChild(cur, meta);
204: xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
205: xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
206: return(0);
207: }
1.29 veillard 208: if (xmlStrEqual(cur->name, BAD_CAST"head"))
1.26 veillard 209: goto found_head;
1.29 veillard 210: if (xmlStrEqual(cur->name, BAD_CAST"meta"))
1.26 veillard 211: goto found_meta;
212: }
213: cur = cur->next;
214: }
215: if (cur == NULL)
216: return(-1);
217: cur = cur->children;
218:
219: /*
220: * Search the head
221: */
222: while (cur != NULL) {
223: if (cur->name != NULL) {
1.29 veillard 224: if (xmlStrEqual(cur->name, BAD_CAST"head"))
1.26 veillard 225: break;
1.29 veillard 226: if (xmlStrEqual(cur->name, BAD_CAST"body")) {
1.26 veillard 227: if (encoding == NULL)
228: return(0);
229: meta = xmlNewDocNode(doc, NULL, BAD_CAST"head", NULL);
230: xmlAddPrevSibling(cur, meta);
231: cur = meta;
232: meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
233: xmlAddChild(cur, meta);
234: xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
235: xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
236: return(0);
237: }
1.29 veillard 238: if (xmlStrEqual(cur->name, BAD_CAST"meta"))
1.26 veillard 239: goto found_meta;
240: }
241: cur = cur->next;
242: }
243: if (cur == NULL)
244: return(-1);
245: found_head:
246: if (cur->children == NULL) {
247: if (encoding == NULL)
248: return(0);
249: meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
250: xmlAddChild(cur, meta);
251: xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
252: xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
253: return(0);
254: }
255: cur = cur->children;
256:
257: found_meta:
258: if (encoding != NULL) {
259: /*
260: * Create a new Meta element with the right aatributes
261: */
262:
263: meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
264: xmlAddPrevSibling(cur, meta);
265: xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
266: xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
267: }
268:
269: /*
270: * Search and destroy all the remaining the meta elements carrying
271: * encoding informations
272: */
273: while (cur != NULL) {
274: if (cur->name != NULL) {
1.29 veillard 275: if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
1.26 veillard 276: xmlAttrPtr attr = cur->properties;
277: int http;
278: const xmlChar *value;
279:
280: content = NULL;
281: http = 0;
282: while (attr != NULL) {
283: if ((attr->children != NULL) &&
284: (attr->children->type == XML_TEXT_NODE) &&
285: (attr->children->next == NULL)) {
286: #ifndef XML_USE_BUFFER_CONTENT
287: value = attr->children->content;
288: #else
289: value = xmlBufferContent(attr->children->content);
290: #endif
1.28 veillard 291: if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
292: && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
1.26 veillard 293: http = 1;
1.28 veillard 294: else if ((value != NULL)
295: && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
1.26 veillard 296: content = value;
297: if ((http != 0) && (content != NULL))
298: break;
299: }
300: attr = attr->next;
301: }
302: if ((http != 0) && (content != NULL)) {
303: meta = cur;
304: cur = cur->next;
305: xmlUnlinkNode(meta);
306: xmlFreeNode(meta);
307: continue;
308: }
309:
310: }
311: }
312: cur = cur->next;
313: }
314: return(0);
1.23 veillard 315: }
316:
317: /************************************************************************
318: * *
1.21 veillard 319: * Dumping HTML tree content to a simple buffer *
320: * *
321: ************************************************************************/
322:
1.14 daniel 323: static void
324: htmlDocContentDump(xmlBufferPtr buf, xmlDocPtr cur);
325:
1.1 daniel 326: /**
327: * htmlDtdDump:
328: * @buf: the HTML buffer output
329: * @doc: the document
330: *
331: * Dump the HTML document DTD, if any.
332: */
333: static void
334: htmlDtdDump(xmlBufferPtr buf, xmlDocPtr doc) {
335: xmlDtdPtr cur = doc->intSubset;
336:
337: if (cur == NULL) {
338: fprintf(stderr, "htmlDtdDump : no internal subset\n");
339: return;
340: }
341: xmlBufferWriteChar(buf, "<!DOCTYPE ");
342: xmlBufferWriteCHAR(buf, cur->name);
343: if (cur->ExternalID != NULL) {
344: xmlBufferWriteChar(buf, " PUBLIC ");
345: xmlBufferWriteQuotedString(buf, cur->ExternalID);
1.2 daniel 346: if (cur->SystemID != NULL) {
347: xmlBufferWriteChar(buf, " ");
348: xmlBufferWriteQuotedString(buf, cur->SystemID);
349: }
1.1 daniel 350: } else if (cur->SystemID != NULL) {
351: xmlBufferWriteChar(buf, " SYSTEM ");
352: xmlBufferWriteQuotedString(buf, cur->SystemID);
353: }
354: xmlBufferWriteChar(buf, ">\n");
355: }
356:
357: /**
358: * htmlAttrDump:
359: * @buf: the HTML buffer output
360: * @doc: the document
361: * @cur: the attribute pointer
362: *
363: * Dump an HTML attribute
364: */
365: static void
366: htmlAttrDump(xmlBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
1.6 daniel 367: xmlChar *value;
1.1 daniel 368:
369: if (cur == NULL) {
370: fprintf(stderr, "htmlAttrDump : property == NULL\n");
371: return;
372: }
373: xmlBufferWriteChar(buf, " ");
374: xmlBufferWriteCHAR(buf, cur->name);
1.19 daniel 375: if (cur->children != NULL) {
376: value = xmlNodeListGetString(doc, cur->children, 0);
377: if (value) {
378: xmlBufferWriteChar(buf, "=");
379: xmlBufferWriteQuotedString(buf, value);
380: xmlFree(value);
381: } else {
382: xmlBufferWriteChar(buf, "=\"\"");
383: }
1.1 daniel 384: }
385: }
386:
387: /**
388: * htmlAttrListDump:
389: * @buf: the HTML buffer output
390: * @doc: the document
391: * @cur: the first attribute pointer
392: *
393: * Dump a list of HTML attributes
394: */
395: static void
396: htmlAttrListDump(xmlBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
397: if (cur == NULL) {
398: fprintf(stderr, "htmlAttrListDump : property == NULL\n");
399: return;
400: }
401: while (cur != NULL) {
402: htmlAttrDump(buf, doc, cur);
403: cur = cur->next;
404: }
405: }
406:
407:
1.14 daniel 408: void
1.1 daniel 409: htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur);
410: /**
411: * htmlNodeListDump:
412: * @buf: the HTML buffer output
413: * @doc: the document
414: * @cur: the first node
415: *
416: * Dump an HTML node list, recursive behaviour,children are printed too.
417: */
418: static void
419: htmlNodeListDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
420: if (cur == NULL) {
421: fprintf(stderr, "htmlNodeListDump : node == NULL\n");
422: return;
423: }
424: while (cur != NULL) {
425: htmlNodeDump(buf, doc, cur);
426: cur = cur->next;
427: }
428: }
429:
430: /**
431: * htmlNodeDump:
432: * @buf: the HTML buffer output
433: * @doc: the document
434: * @cur: the current node
435: *
436: * Dump an HTML node, recursive behaviour,children are printed too.
437: */
1.14 daniel 438: void
1.1 daniel 439: htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
440: htmlElemDescPtr info;
441:
442: if (cur == NULL) {
443: fprintf(stderr, "htmlNodeDump : node == NULL\n");
444: return;
445: }
446: /*
447: * Special cases.
448: */
1.20 daniel 449: if (cur->type == XML_DTD_NODE)
450: return;
1.14 daniel 451: if (cur->type == XML_HTML_DOCUMENT_NODE) {
452: htmlDocContentDump(buf, (xmlDocPtr) cur);
453: return;
454: }
1.1 daniel 455: if (cur->type == HTML_TEXT_NODE) {
456: if (cur->content != NULL) {
1.6 daniel 457: xmlChar *buffer;
1.1 daniel 458:
1.9 daniel 459: #ifndef XML_USE_BUFFER_CONTENT
1.1 daniel 460: buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
1.9 daniel 461: #else
462: buffer = xmlEncodeEntitiesReentrant(doc,
463: xmlBufferContent(cur->content));
464: #endif
1.1 daniel 465: if (buffer != NULL) {
466: xmlBufferWriteCHAR(buf, buffer);
1.4 daniel 467: xmlFree(buffer);
1.1 daniel 468: }
469: }
470: return;
471: }
472: if (cur->type == HTML_COMMENT_NODE) {
473: if (cur->content != NULL) {
474: xmlBufferWriteChar(buf, "<!--");
1.9 daniel 475: #ifndef XML_USE_BUFFER_CONTENT
1.1 daniel 476: xmlBufferWriteCHAR(buf, cur->content);
1.9 daniel 477: #else
478: xmlBufferWriteCHAR(buf, xmlBufferContent(cur->content));
479: #endif
1.1 daniel 480: xmlBufferWriteChar(buf, "-->");
481: }
482: return;
483: }
484: if (cur->type == HTML_ENTITY_REF_NODE) {
485: xmlBufferWriteChar(buf, "&");
486: xmlBufferWriteCHAR(buf, cur->name);
487: xmlBufferWriteChar(buf, ";");
488: return;
489: }
490:
491: /*
492: * Get specific HTmL info for taht node.
493: */
494: info = htmlTagLookup(cur->name);
495:
496: xmlBufferWriteChar(buf, "<");
497: xmlBufferWriteCHAR(buf, cur->name);
498: if (cur->properties != NULL)
499: htmlAttrListDump(buf, doc, cur->properties);
500:
1.7 daniel 501: if ((info != NULL) && (info->empty)) {
1.1 daniel 502: xmlBufferWriteChar(buf, ">");
503: if (cur->next != NULL) {
504: if ((cur->next->type != HTML_TEXT_NODE) &&
505: (cur->next->type != HTML_ENTITY_REF_NODE))
506: xmlBufferWriteChar(buf, "\n");
507: }
508: return;
509: }
1.17 daniel 510: if ((cur->content == NULL) && (cur->children == NULL)) {
1.7 daniel 511: if ((info != NULL) && (info->endTag != 0))
1.1 daniel 512: xmlBufferWriteChar(buf, ">");
513: else {
514: xmlBufferWriteChar(buf, "></");
515: xmlBufferWriteCHAR(buf, cur->name);
516: xmlBufferWriteChar(buf, ">");
517: }
518: if (cur->next != NULL) {
519: if ((cur->next->type != HTML_TEXT_NODE) &&
520: (cur->next->type != HTML_ENTITY_REF_NODE))
521: xmlBufferWriteChar(buf, "\n");
522: }
523: return;
524: }
525: xmlBufferWriteChar(buf, ">");
526: if (cur->content != NULL) {
1.6 daniel 527: xmlChar *buffer;
1.1 daniel 528:
1.9 daniel 529: #ifndef XML_USE_BUFFER_CONTENT
530: buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
531: #else
532: buffer = xmlEncodeEntitiesReentrant(doc,
533: xmlBufferContent(cur->content));
534: #endif
1.1 daniel 535: if (buffer != NULL) {
536: xmlBufferWriteCHAR(buf, buffer);
1.4 daniel 537: xmlFree(buffer);
1.1 daniel 538: }
539: }
1.17 daniel 540: if (cur->children != NULL) {
541: if ((cur->children->type != HTML_TEXT_NODE) &&
542: (cur->children->type != HTML_ENTITY_REF_NODE) &&
543: (cur->children != cur->last))
1.1 daniel 544: xmlBufferWriteChar(buf, "\n");
1.17 daniel 545: htmlNodeListDump(buf, doc, cur->children);
1.1 daniel 546: if ((cur->last->type != HTML_TEXT_NODE) &&
1.10 daniel 547: (cur->last->type != HTML_ENTITY_REF_NODE) &&
1.17 daniel 548: (cur->children != cur->last))
1.1 daniel 549: xmlBufferWriteChar(buf, "\n");
550: }
1.11 daniel 551: if (!htmlIsAutoClosed(doc, cur)) {
552: xmlBufferWriteChar(buf, "</");
553: xmlBufferWriteCHAR(buf, cur->name);
554: xmlBufferWriteChar(buf, ">");
555: }
1.1 daniel 556: if (cur->next != NULL) {
557: if ((cur->next->type != HTML_TEXT_NODE) &&
558: (cur->next->type != HTML_ENTITY_REF_NODE))
559: xmlBufferWriteChar(buf, "\n");
560: }
561: }
562:
563: /**
1.16 daniel 564: * htmlNodeDumpFile:
565: * @out: the FILE pointer
566: * @doc: the document
567: * @cur: the current node
568: *
569: * Dump an HTML node, recursive behaviour,children are printed too.
570: */
571: void
572: htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
573: xmlBufferPtr buf;
574:
575: buf = xmlBufferCreate();
576: if (buf == NULL) return;
577: htmlNodeDump(buf, doc, cur);
578: xmlBufferDump(out, buf);
579: xmlBufferFree(buf);
580: }
581:
582: /**
1.1 daniel 583: * htmlDocContentDump:
584: * @buf: the HTML buffer output
585: * @cur: the document
586: *
587: * Dump an HTML document.
588: */
589: static void
590: htmlDocContentDump(xmlBufferPtr buf, xmlDocPtr cur) {
1.12 daniel 591: int type;
592:
593: /*
594: * force to output the stuff as HTML, especially for entities
595: */
596: type = cur->type;
597: cur->type = XML_HTML_DOCUMENT_NODE;
1.1 daniel 598: if (cur->intSubset != NULL)
599: htmlDtdDump(buf, cur);
1.11 daniel 600: else {
601: /* Default to HTML-4.0 transitionnal @@@@ */
602: xmlBufferWriteChar(buf, "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\" \"http://www.w3.org/TR/REC-html40/loose.dtd\">");
603:
604: }
1.17 daniel 605: if (cur->children != NULL) {
606: htmlNodeListDump(buf, cur, cur->children);
1.1 daniel 607: }
608: xmlBufferWriteChar(buf, "\n");
1.22 veillard 609: cur->type = (xmlElementType) type;
1.1 daniel 610: }
611:
612: /**
613: * htmlDocDumpMemory:
614: * @cur: the document
615: * @mem: OUT: the memory pointer
616: * @size: OUT: the memory lenght
617: *
1.6 daniel 618: * Dump an HTML document in memory and return the xmlChar * and it's size.
1.1 daniel 619: * It's up to the caller to free the memory.
620: */
621: void
1.6 daniel 622: htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
1.1 daniel 623: xmlBufferPtr buf;
624:
625: if (cur == NULL) {
626: #ifdef DEBUG_TREE
1.15 daniel 627: fprintf(stderr, "htmlxmlDocDumpMemory : document == NULL\n");
1.1 daniel 628: #endif
629: *mem = NULL;
630: *size = 0;
631: return;
632: }
633: buf = xmlBufferCreate();
634: if (buf == NULL) {
635: *mem = NULL;
636: *size = 0;
637: return;
638: }
639: htmlDocContentDump(buf, cur);
640: *mem = buf->content;
641: *size = buf->use;
642: memset(buf, -1, sizeof(xmlBuffer));
1.4 daniel 643: xmlFree(buf);
1.1 daniel 644: }
645:
646:
1.21 veillard 647: /************************************************************************
648: * *
649: * Dumping HTML tree content to an I/O output buffer *
650: * *
651: ************************************************************************/
652:
653: static void
654: htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, const char *encoding);
655:
656: /**
657: * htmlDtdDump:
658: * @buf: the HTML buffer output
659: * @doc: the document
660: *
661: * Dump the HTML document DTD, if any.
662: */
663: static void
664: htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, const char *encoding) {
665: xmlDtdPtr cur = doc->intSubset;
666:
667: if (cur == NULL) {
668: fprintf(stderr, "htmlDtdDump : no internal subset\n");
669: return;
670: }
671: xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
672: xmlOutputBufferWriteString(buf, (const char *)cur->name);
673: if (cur->ExternalID != NULL) {
674: xmlOutputBufferWriteString(buf, " PUBLIC ");
675: xmlBufferWriteQuotedString(buf->buffer, cur->ExternalID);
676: if (cur->SystemID != NULL) {
677: xmlOutputBufferWriteString(buf, " ");
678: xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
679: }
680: } else if (cur->SystemID != NULL) {
681: xmlOutputBufferWriteString(buf, " SYSTEM ");
682: xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
683: }
684: xmlOutputBufferWriteString(buf, ">\n");
685: }
686:
687: /**
688: * htmlAttrDump:
689: * @buf: the HTML buffer output
690: * @doc: the document
691: * @cur: the attribute pointer
692: *
693: * Dump an HTML attribute
694: */
695: static void
696: htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) {
697: xmlChar *value;
698:
699: if (cur == NULL) {
700: fprintf(stderr, "htmlAttrDump : property == NULL\n");
701: return;
702: }
703: xmlOutputBufferWriteString(buf, " ");
704: xmlOutputBufferWriteString(buf, (const char *)cur->name);
705: if (cur->children != NULL) {
706: value = xmlNodeListGetString(doc, cur->children, 0);
707: if (value) {
708: xmlOutputBufferWriteString(buf, "=");
709: xmlBufferWriteQuotedString(buf->buffer, value);
710: xmlFree(value);
711: } else {
712: xmlOutputBufferWriteString(buf, "=\"\"");
713: }
714: }
715: }
716:
717: /**
718: * htmlAttrListDump:
719: * @buf: the HTML buffer output
720: * @doc: the document
721: * @cur: the first attribute pointer
722: *
723: * Dump a list of HTML attributes
724: */
725: static void
726: htmlAttrListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) {
727: if (cur == NULL) {
728: fprintf(stderr, "htmlAttrListDump : property == NULL\n");
729: return;
730: }
731: while (cur != NULL) {
732: htmlAttrDumpOutput(buf, doc, cur, encoding);
733: cur = cur->next;
734: }
735: }
736:
737:
738: void htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
739: xmlNodePtr cur, const char *encoding);
740:
741: /**
742: * htmlNodeListDump:
743: * @buf: the HTML buffer output
744: * @doc: the document
745: * @cur: the first node
746: *
747: * Dump an HTML node list, recursive behaviour,children are printed too.
748: */
749: static void
750: htmlNodeListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, const char *encoding) {
751: if (cur == NULL) {
752: fprintf(stderr, "htmlNodeListDump : node == NULL\n");
753: return;
754: }
755: while (cur != NULL) {
756: htmlNodeDumpOutput(buf, doc, cur, encoding);
757: cur = cur->next;
758: }
759: }
760:
761: /**
762: * htmlNodeDump:
763: * @buf: the HTML buffer output
764: * @doc: the document
765: * @cur: the current node
766: *
767: * Dump an HTML node, recursive behaviour,children are printed too.
768: */
769: void
770: htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, const char *encoding) {
771: htmlElemDescPtr info;
772:
773: if (cur == NULL) {
774: fprintf(stderr, "htmlNodeDump : node == NULL\n");
775: return;
776: }
777: /*
778: * Special cases.
779: */
780: if (cur->type == XML_DTD_NODE)
781: return;
782: if (cur->type == XML_HTML_DOCUMENT_NODE) {
783: htmlDocContentDumpOutput(buf, (xmlDocPtr) cur, encoding);
784: return;
785: }
786: if (cur->type == HTML_TEXT_NODE) {
787: if (cur->content != NULL) {
788: xmlChar *buffer;
789:
790: #ifndef XML_USE_BUFFER_CONTENT
791: buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
792: #else
793: buffer = xmlEncodeEntitiesReentrant(doc,
794: xmlBufferContent(cur->content));
795: #endif
796: if (buffer != NULL) {
1.25 veillard 797: xmlOutputBufferWriteString(buf, (const char *)buffer);
1.21 veillard 798: xmlFree(buffer);
799: }
800: }
801: return;
802: }
803: if (cur->type == HTML_COMMENT_NODE) {
804: if (cur->content != NULL) {
805: xmlOutputBufferWriteString(buf, "<!--");
806: #ifndef XML_USE_BUFFER_CONTENT
807: xmlOutputBufferWriteString(buf, (const char *)cur->content);
808: #else
809: xmlOutputBufferWriteString(buf, xmlBufferContent(cur->content));
810: #endif
811: xmlOutputBufferWriteString(buf, "-->");
812: }
813: return;
814: }
815: if (cur->type == HTML_ENTITY_REF_NODE) {
816: xmlOutputBufferWriteString(buf, "&");
817: xmlOutputBufferWriteString(buf, (const char *)cur->name);
818: xmlOutputBufferWriteString(buf, ";");
819: return;
820: }
1.31 ! veillard 821: if (cur->type == HTML_PRESERVE_NODE) {
! 822: if (cur->content != NULL) {
! 823: #ifndef XML_USE_BUFFER_CONTENT
! 824: xmlOutputBufferWriteString(buf, (const char *)cur->content);
! 825: #else
! 826: xmlOutputBufferWriteString(buf, xmlBufferContent(cur->content));
! 827: #endif
! 828: }
! 829: return;
! 830: }
1.21 veillard 831:
832: /*
833: * Get specific HTmL info for taht node.
834: */
835: info = htmlTagLookup(cur->name);
836:
837: xmlOutputBufferWriteString(buf, "<");
838: xmlOutputBufferWriteString(buf, (const char *)cur->name);
839: if (cur->properties != NULL)
840: htmlAttrListDumpOutput(buf, doc, cur->properties, encoding);
841:
842: if ((info != NULL) && (info->empty)) {
843: xmlOutputBufferWriteString(buf, ">");
844: if (cur->next != NULL) {
845: if ((cur->next->type != HTML_TEXT_NODE) &&
846: (cur->next->type != HTML_ENTITY_REF_NODE))
847: xmlOutputBufferWriteString(buf, "\n");
848: }
849: return;
850: }
851: if ((cur->content == NULL) && (cur->children == NULL)) {
852: if ((info != NULL) && (info->endTag != 0))
853: xmlOutputBufferWriteString(buf, ">");
854: else {
855: xmlOutputBufferWriteString(buf, "></");
856: xmlOutputBufferWriteString(buf, (const char *)cur->name);
857: xmlOutputBufferWriteString(buf, ">");
858: }
859: if (cur->next != NULL) {
860: if ((cur->next->type != HTML_TEXT_NODE) &&
861: (cur->next->type != HTML_ENTITY_REF_NODE))
862: xmlOutputBufferWriteString(buf, "\n");
863: }
864: return;
865: }
866: xmlOutputBufferWriteString(buf, ">");
867: if (cur->content != NULL) {
868: #if 0
869: xmlChar *buffer;
870:
871: #ifndef XML_USE_BUFFER_CONTENT
872: buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
873: #else
874: buffer = xmlEncodeEntitiesReentrant(doc,
875: xmlBufferContent(cur->content));
876: #endif
877: if (buffer != NULL) {
878: xmlOutputBufferWriteString(buf, buffer);
879: xmlFree(buffer);
880: }
881: #else
882: /*
883: * Uses the OutputBuffer property to automatically convert
884: * invalids to charrefs
885: */
886:
887: #ifndef XML_USE_BUFFER_CONTENT
888: xmlOutputBufferWriteString(buf, (const char *) cur->content);
889: #else
890: xmlOutputBufferWriteString(buf,
891: (const char *) xmlBufferContent(cur->content));
892: #endif
893: #endif
894: }
895: if (cur->children != NULL) {
896: if ((cur->children->type != HTML_TEXT_NODE) &&
897: (cur->children->type != HTML_ENTITY_REF_NODE) &&
898: (cur->children != cur->last))
899: xmlOutputBufferWriteString(buf, "\n");
900: htmlNodeListDumpOutput(buf, doc, cur->children, encoding);
901: if ((cur->last->type != HTML_TEXT_NODE) &&
902: (cur->last->type != HTML_ENTITY_REF_NODE) &&
903: (cur->children != cur->last))
904: xmlOutputBufferWriteString(buf, "\n");
905: }
906: if (!htmlIsAutoClosed(doc, cur)) {
907: xmlOutputBufferWriteString(buf, "</");
908: xmlOutputBufferWriteString(buf, (const char *)cur->name);
909: xmlOutputBufferWriteString(buf, ">");
910: }
911: if (cur->next != NULL) {
912: if ((cur->next->type != HTML_TEXT_NODE) &&
913: (cur->next->type != HTML_ENTITY_REF_NODE))
914: xmlOutputBufferWriteString(buf, "\n");
915: }
916: }
917:
918: /**
919: * htmlDocContentDump:
920: * @buf: the HTML buffer output
921: * @cur: the document
922: *
923: * Dump an HTML document.
924: */
925: static void
926: htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, const char *encoding) {
927: int type;
928:
929: /*
930: * force to output the stuff as HTML, especially for entities
931: */
932: type = cur->type;
933: cur->type = XML_HTML_DOCUMENT_NODE;
934: if (cur->intSubset != NULL)
935: htmlDtdDumpOutput(buf, cur, NULL);
936: else {
937: /* Default to HTML-4.0 transitionnal @@@@ */
938: xmlOutputBufferWriteString(buf, "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\" \"http://www.w3.org/TR/REC-html40/loose.dtd\">");
939:
940: }
941: if (cur->children != NULL) {
942: htmlNodeListDumpOutput(buf, cur, cur->children, encoding);
943: }
944: xmlOutputBufferWriteString(buf, "\n");
1.22 veillard 945: cur->type = (xmlElementType) type;
1.21 veillard 946: }
947:
948:
949: /************************************************************************
950: * *
951: * Saving functions front-ends *
952: * *
953: ************************************************************************/
954:
1.1 daniel 955: /**
956: * htmlDocDump:
957: * @f: the FILE*
958: * @cur: the document
959: *
960: * Dump an HTML document to an open FILE.
1.21 veillard 961: *
962: * returns: the number of byte written or -1 in case of failure.
1.1 daniel 963: */
1.21 veillard 964: int
1.1 daniel 965: htmlDocDump(FILE *f, xmlDocPtr cur) {
1.21 veillard 966: xmlOutputBufferPtr buf;
1.24 veillard 967: xmlCharEncodingHandlerPtr handler = NULL;
968: const char *encoding;
1.21 veillard 969: int ret;
1.1 daniel 970:
971: if (cur == NULL) {
972: #ifdef DEBUG_TREE
1.15 daniel 973: fprintf(stderr, "htmlDocDump : document == NULL\n");
1.1 daniel 974: #endif
1.21 veillard 975: return(-1);
1.1 daniel 976: }
1.24 veillard 977:
978: encoding = (const char *) htmlGetMetaEncoding(cur);
979:
980: if (encoding != NULL) {
981: xmlCharEncoding enc;
982:
983: enc = xmlParseCharEncoding(encoding);
984: if (enc != cur->charset) {
985: if (cur->charset != XML_CHAR_ENCODING_UTF8) {
986: /*
987: * Not supported yet
988: */
989: return(-1);
990: }
991:
992: handler = xmlFindCharEncodingHandler(encoding);
993: if (handler == NULL)
994: return(-1);
995: }
996: }
997:
998: /*
1.25 veillard 999: * Fallback to HTML or ASCII when the encoding is unspecified
1.24 veillard 1000: */
1001: if (handler == NULL)
1.25 veillard 1002: handler = xmlFindCharEncodingHandler("HTML");
1003: if (handler == NULL)
1.24 veillard 1004: handler = xmlFindCharEncodingHandler("ascii");
1005:
1006: buf = xmlOutputBufferCreateFile(f, handler);
1.21 veillard 1007: if (buf == NULL) return(-1);
1008: htmlDocContentDumpOutput(buf, cur, NULL);
1009:
1010: ret = xmlOutputBufferClose(buf);
1011: return(ret);
1012: }
1013:
1014: /**
1015: * htmlSaveFile:
1016: * @filename: the filename (or URL)
1017: * @cur: the document
1018: *
1019: * Dump an HTML document to a file. If @filename is "-" the stdout file is
1020: * used.
1021: * returns: the number of byte written or -1 in case of failure.
1022: */
1023: int
1024: htmlSaveFile(const char *filename, xmlDocPtr cur) {
1025: xmlOutputBufferPtr buf;
1.24 veillard 1026: xmlCharEncodingHandlerPtr handler = NULL;
1027: const char *encoding;
1.21 veillard 1028: int ret;
1029:
1.24 veillard 1030: encoding = (const char *) htmlGetMetaEncoding(cur);
1031:
1032: if (encoding != NULL) {
1033: xmlCharEncoding enc;
1034:
1035: enc = xmlParseCharEncoding(encoding);
1036: if (enc != cur->charset) {
1037: if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1038: /*
1039: * Not supported yet
1040: */
1041: return(-1);
1042: }
1043:
1044: handler = xmlFindCharEncodingHandler(encoding);
1045: if (handler == NULL)
1046: return(-1);
1047: }
1048: }
1049:
1050: /*
1.25 veillard 1051: * Fallback to HTML or ASCII when the encoding is unspecified
1.24 veillard 1052: */
1053: if (handler == NULL)
1.25 veillard 1054: handler = xmlFindCharEncodingHandler("HTML");
1055: if (handler == NULL)
1.24 veillard 1056: handler = xmlFindCharEncodingHandler("ascii");
1057:
1.21 veillard 1058: /*
1059: * save the content to a temp buffer.
1060: */
1.24 veillard 1061: buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1.21 veillard 1062: if (buf == NULL) return(0);
1063:
1064: htmlDocContentDumpOutput(buf, cur, NULL);
1065:
1066: ret = xmlOutputBufferClose(buf);
1067: return(ret);
1.1 daniel 1068: }
1069:
1070: /**
1.26 veillard 1071: * htmlSaveFileEnc:
1.1 daniel 1072: * @filename: the filename
1073: * @cur: the document
1074: *
1.26 veillard 1075: * Dump an HTML document to a file using a given encoding.
1.1 daniel 1076: *
1077: * returns: the number of byte written or -1 in case of failure.
1078: */
1079: int
1.21 veillard 1080: htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1081: xmlOutputBufferPtr buf;
1082: xmlCharEncodingHandlerPtr handler = NULL;
1.1 daniel 1083: int ret;
1084:
1.21 veillard 1085: if (encoding != NULL) {
1086: xmlCharEncoding enc;
1087:
1088: enc = xmlParseCharEncoding(encoding);
1089: if (enc != cur->charset) {
1090: if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1091: /*
1092: * Not supported yet
1093: */
1094: return(-1);
1095: }
1096:
1097: handler = xmlFindCharEncodingHandler(encoding);
1098: if (handler == NULL)
1099: return(-1);
1.26 veillard 1100: htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
1.21 veillard 1101: }
1102: }
1.24 veillard 1103:
1104: /*
1.25 veillard 1105: * Fallback to HTML or ASCII when the encoding is unspecified
1.24 veillard 1106: */
1.25 veillard 1107: if (handler == NULL)
1108: handler = xmlFindCharEncodingHandler("HTML");
1.24 veillard 1109: if (handler == NULL)
1110: handler = xmlFindCharEncodingHandler("ascii");
1.21 veillard 1111:
1.1 daniel 1112: /*
1113: * save the content to a temp buffer.
1114: */
1.21 veillard 1115: buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1.1 daniel 1116: if (buf == NULL) return(0);
1117:
1.21 veillard 1118: htmlDocContentDumpOutput(buf, cur, encoding);
1.1 daniel 1119:
1.21 veillard 1120: ret = xmlOutputBufferClose(buf);
1121: return(ret);
1.1 daniel 1122: }
1.18 daniel 1123: #endif /* LIBXML_HTML_ENABLED */
Webmaster