1 /*
2 * HTMLtree.c : implementation of access function for an HTML tree.
3 *
4 * See Copyright for the status of this software.
5 *
6 * [email protected]
7 */
8
9
10 #define IN_LIBXML
11 #include "libxml.h"
12 #ifdef LIBXML_HTML_ENABLED
13
14 #include <string.h> /* for memset() only ! */
15 #include <ctype.h>
16 #include <stdlib.h>
17
18 #include <libxml/xmlmemory.h>
19 #include <libxml/HTMLparser.h>
20 #include <libxml/HTMLtree.h>
21 #include <libxml/entities.h>
22 #include <libxml/xmlerror.h>
23 #include <libxml/parserInternals.h>
24 #include <libxml/uri.h>
25
26 #include "private/buf.h"
27 #include "private/error.h"
28 #include "private/io.h"
29 #include "private/save.h"
30
31 /************************************************************************
32 * *
33 * Getting/Setting encoding meta tags *
34 * *
35 ************************************************************************/
36
37 /**
38 * htmlGetMetaEncoding:
39 * @doc: the document
40 *
41 * Encoding definition lookup in the Meta tags
42 *
43 * Returns the current encoding as flagged in the HTML source
44 */
45 const xmlChar *
htmlGetMetaEncoding(htmlDocPtr doc)46 htmlGetMetaEncoding(htmlDocPtr doc) {
47 htmlNodePtr cur;
48 const xmlChar *content;
49 const xmlChar *encoding;
50
51 if (doc == NULL)
52 return(NULL);
53 cur = doc->children;
54
55 /*
56 * Search the html
57 */
58 while (cur != NULL) {
59 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
60 if (xmlStrEqual(cur->name, BAD_CAST"html"))
61 break;
62 if (xmlStrEqual(cur->name, BAD_CAST"head"))
63 goto found_head;
64 if (xmlStrEqual(cur->name, BAD_CAST"meta"))
65 goto found_meta;
66 }
67 cur = cur->next;
68 }
69 if (cur == NULL)
70 return(NULL);
71 cur = cur->children;
72
73 /*
74 * Search the head
75 */
76 while (cur != NULL) {
77 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
78 if (xmlStrEqual(cur->name, BAD_CAST"head"))
79 break;
80 if (xmlStrEqual(cur->name, BAD_CAST"meta"))
81 goto found_meta;
82 }
83 cur = cur->next;
84 }
85 if (cur == NULL)
86 return(NULL);
87 found_head:
88 cur = cur->children;
89
90 /*
91 * Search the meta elements
92 */
93 found_meta:
94 while (cur != NULL) {
95 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
96 if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
97 xmlAttrPtr attr = cur->properties;
98 int http;
99 const xmlChar *value;
100
101 content = NULL;
102 http = 0;
103 while (attr != NULL) {
104 if ((attr->children != NULL) &&
105 (attr->children->type == XML_TEXT_NODE) &&
106 (attr->children->next == NULL)) {
107 value = attr->children->content;
108 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
109 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
110 http = 1;
111 else if ((value != NULL)
112 && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
113 content = value;
114 if ((http != 0) && (content != NULL))
115 goto found_content;
116 }
117 attr = attr->next;
118 }
119 }
120 }
121 cur = cur->next;
122 }
123 return(NULL);
124
125 found_content:
126 encoding = xmlStrstr(content, BAD_CAST"charset=");
127 if (encoding == NULL)
128 encoding = xmlStrstr(content, BAD_CAST"Charset=");
129 if (encoding == NULL)
130 encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
131 if (encoding != NULL) {
132 encoding += 8;
133 } else {
134 encoding = xmlStrstr(content, BAD_CAST"charset =");
135 if (encoding == NULL)
136 encoding = xmlStrstr(content, BAD_CAST"Charset =");
137 if (encoding == NULL)
138 encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
139 if (encoding != NULL)
140 encoding += 9;
141 }
142 if (encoding != NULL) {
143 while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
144 }
145 return(encoding);
146 }
147
148 /**
149 * htmlSetMetaEncoding:
150 * @doc: the document
151 * @encoding: the encoding string
152 *
153 * Sets the current encoding in the Meta tags
154 * NOTE: this will not change the document content encoding, just
155 * the META flag associated.
156 *
157 * Returns 0 in case of success and -1 in case of error
158 */
159 int
htmlSetMetaEncoding(htmlDocPtr doc,const xmlChar * encoding)160 htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
161 htmlNodePtr cur, meta = NULL, head = NULL;
162 const xmlChar *content = NULL;
163 char newcontent[100];
164
165 newcontent[0] = 0;
166
167 if (doc == NULL)
168 return(-1);
169
170 /* html isn't a real encoding it's just libxml2 way to get entities */
171 if (!xmlStrcasecmp(encoding, BAD_CAST "html"))
172 return(-1);
173
174 if (encoding != NULL) {
175 snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
176 (char *)encoding);
177 newcontent[sizeof(newcontent) - 1] = 0;
178 }
179
180 cur = doc->children;
181
182 /*
183 * Search the html
184 */
185 while (cur != NULL) {
186 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
187 if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
188 break;
189 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
190 goto found_head;
191 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
192 goto found_meta;
193 }
194 cur = cur->next;
195 }
196 if (cur == NULL)
197 return(-1);
198 cur = cur->children;
199
200 /*
201 * Search the head
202 */
203 while (cur != NULL) {
204 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
205 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
206 break;
207 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
208 head = cur->parent;
209 goto found_meta;
210 }
211 }
212 cur = cur->next;
213 }
214 if (cur == NULL)
215 return(-1);
216 found_head:
217 head = cur;
218 if (cur->children == NULL)
219 goto create;
220 cur = cur->children;
221
222 found_meta:
223 /*
224 * Search and update all the remaining the meta elements carrying
225 * encoding information
226 */
227 while (cur != NULL) {
228 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
229 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
230 xmlAttrPtr attr = cur->properties;
231 int http;
232 const xmlChar *value;
233
234 content = NULL;
235 http = 0;
236 while (attr != NULL) {
237 if ((attr->children != NULL) &&
238 (attr->children->type == XML_TEXT_NODE) &&
239 (attr->children->next == NULL)) {
240 value = attr->children->content;
241 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
242 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
243 http = 1;
244 else
245 {
246 if ((value != NULL) &&
247 (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
248 content = value;
249 }
250 if ((http != 0) && (content != NULL))
251 break;
252 }
253 attr = attr->next;
254 }
255 if ((http != 0) && (content != NULL)) {
256 meta = cur;
257 break;
258 }
259
260 }
261 }
262 cur = cur->next;
263 }
264 create:
265 if (meta == NULL) {
266 if ((encoding != NULL) && (head != NULL)) {
267 /*
268 * Create a new Meta element with the right attributes
269 */
270
271 meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
272 if (head->children == NULL)
273 xmlAddChild(head, meta);
274 else
275 xmlAddPrevSibling(head->children, meta);
276 xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
277 xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
278 }
279 } else {
280 /* remove the meta tag if NULL is passed */
281 if (encoding == NULL) {
282 xmlUnlinkNode(meta);
283 xmlFreeNode(meta);
284 }
285 /* change the document only if there is a real encoding change */
286 else if (xmlStrcasestr(content, encoding) == NULL) {
287 xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent);
288 }
289 }
290
291
292 return(0);
293 }
294
/**
 * htmlBooleanAttrs:
 *
 * NULL-terminated list of the HTML attributes which are written in
 * minimized form: <option selected="selected"> is serialized as
 * <option selected>, following XSLT 1.0 section 16.2 "HTML Output
 * Method".
 */
static const char* const htmlBooleanAttrs[] = {
    "checked",
    "compact",
    "declare",
    "defer",
    "disabled",
    "ismap",
    "multiple",
    "nohref",
    "noresize",
    "noshade",
    "nowrap",
    "readonly",
    "selected",
    NULL
};
308
309
310 /**
311 * htmlIsBooleanAttr:
312 * @name: the name of the attribute to check
313 *
314 * Determine if a given attribute is a boolean attribute.
315 *
316 * returns: false if the attribute is not boolean, true otherwise.
317 */
318 int
htmlIsBooleanAttr(const xmlChar * name)319 htmlIsBooleanAttr(const xmlChar *name)
320 {
321 int i = 0;
322
323 while (htmlBooleanAttrs[i] != NULL) {
324 if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
325 return 1;
326 i++;
327 }
328 return 0;
329 }
330
331 #ifdef LIBXML_OUTPUT_ENABLED
332 /************************************************************************
333 * *
334 * Output error handlers *
335 * *
336 ************************************************************************/
337
338 /**
339 * htmlSaveErr:
340 * @code: the error number
341 * @node: the location of the error.
342 * @extra: extra information
343 *
344 * Handle an out of memory condition
345 */
346 static void
htmlSaveErr(int code,xmlNodePtr node,const char * extra)347 htmlSaveErr(int code, xmlNodePtr node, const char *extra)
348 {
349 const char *msg = NULL;
350 int res;
351
352 switch(code) {
353 case XML_SAVE_NOT_UTF8:
354 msg = "string is not in UTF-8\n";
355 break;
356 case XML_SAVE_CHAR_INVALID:
357 msg = "invalid character value\n";
358 break;
359 case XML_SAVE_UNKNOWN_ENCODING:
360 msg = "unknown encoding %s\n";
361 break;
362 case XML_SAVE_NO_DOCTYPE:
363 msg = "HTML has no DOCTYPE\n";
364 break;
365 default:
366 msg = "unexpected error number\n";
367 }
368
369 res = __xmlRaiseError(NULL, NULL, NULL, NULL, node,
370 XML_FROM_OUTPUT, code, XML_ERR_ERROR, NULL, 0,
371 extra, NULL, NULL, 0, 0,
372 msg, extra);
373 if (res < 0)
374 xmlRaiseMemoryError(NULL, NULL, NULL, XML_FROM_OUTPUT, NULL);
375 }
376
377 /************************************************************************
378 * *
379 * Dumping HTML tree content to a simple buffer *
380 * *
381 ************************************************************************/
382
383 static xmlCharEncodingHandler *
htmlFindOutputEncoder(const char * encoding)384 htmlFindOutputEncoder(const char *encoding) {
385 xmlCharEncodingHandler *handler = NULL;
386
387 if (encoding != NULL) {
388 xmlCharEncoding enc;
389
390 enc = xmlParseCharEncoding(encoding);
391 if (enc != XML_CHAR_ENCODING_UTF8) {
392 xmlOpenCharEncodingHandler(encoding, /* output */ 1, &handler);
393 if (handler == NULL)
394 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
395 }
396 } else {
397 /*
398 * Fallback to HTML or ASCII when the encoding is unspecified
399 */
400 if (handler == NULL)
401 xmlOpenCharEncodingHandler("HTML", /* output */ 1, &handler);
402 if (handler == NULL)
403 xmlOpenCharEncodingHandler("ascii", /* output */ 1, &handler);
404 }
405
406 return(handler);
407 }
408
409 /**
410 * htmlBufNodeDumpFormat:
411 * @buf: the xmlBufPtr output
412 * @doc: the document
413 * @cur: the current node
414 * @format: should formatting spaces been added
415 *
416 * Dump an HTML node, recursive behaviour,children are printed too.
417 *
418 * Returns the number of byte written or -1 in case of error
419 */
420 static size_t
htmlBufNodeDumpFormat(xmlBufPtr buf,xmlDocPtr doc,xmlNodePtr cur,int format)421 htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur,
422 int format) {
423 size_t use;
424 int ret;
425 xmlOutputBufferPtr outbuf;
426
427 if (cur == NULL) {
428 return (-1);
429 }
430 if (buf == NULL) {
431 return (-1);
432 }
433 outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
434 if (outbuf == NULL)
435 return (-1);
436 memset(outbuf, 0, sizeof(xmlOutputBuffer));
437 outbuf->buffer = buf;
438 outbuf->encoder = NULL;
439 outbuf->writecallback = NULL;
440 outbuf->closecallback = NULL;
441 outbuf->context = NULL;
442 outbuf->written = 0;
443
444 use = xmlBufUse(buf);
445 htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
446 xmlFree(outbuf);
447 ret = xmlBufUse(buf) - use;
448 return (ret);
449 }
450
451 /**
452 * htmlNodeDump:
453 * @buf: the HTML buffer output
454 * @doc: the document
455 * @cur: the current node
456 *
457 * Dump an HTML node, recursive behaviour,children are printed too,
458 * and formatting returns are added.
459 *
460 * Returns the number of byte written or -1 in case of error
461 */
462 int
htmlNodeDump(xmlBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur)463 htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
464 xmlBufPtr buffer;
465 size_t ret;
466
467 if ((buf == NULL) || (cur == NULL))
468 return(-1);
469
470 xmlInitParser();
471 buffer = xmlBufFromBuffer(buf);
472 if (buffer == NULL)
473 return(-1);
474
475 ret = htmlBufNodeDumpFormat(buffer, doc, cur, 1);
476
477 xmlBufBackToBuffer(buffer);
478
479 if (ret > INT_MAX)
480 return(-1);
481 return((int) ret);
482 }
483
484 /**
485 * htmlNodeDumpFileFormat:
486 * @out: the FILE pointer
487 * @doc: the document
488 * @cur: the current node
489 * @encoding: the document encoding
490 * @format: should formatting spaces been added
491 *
492 * Dump an HTML node, recursive behaviour,children are printed too.
493 *
494 * TODO: if encoding == NULL try to save in the doc encoding
495 *
496 * returns: the number of byte written or -1 in case of failure.
497 */
498 int
htmlNodeDumpFileFormat(FILE * out,xmlDocPtr doc,xmlNodePtr cur,const char * encoding,int format)499 htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
500 xmlNodePtr cur, const char *encoding, int format) {
501 xmlOutputBufferPtr buf;
502 xmlCharEncodingHandlerPtr handler;
503 int ret;
504
505 xmlInitParser();
506
507 /*
508 * save the content to a temp buffer.
509 */
510 handler = htmlFindOutputEncoder(encoding);
511 buf = xmlOutputBufferCreateFile(out, handler);
512 if (buf == NULL) return(0);
513
514 htmlNodeDumpFormatOutput(buf, doc, cur, NULL, format);
515
516 ret = xmlOutputBufferClose(buf);
517 return(ret);
518 }
519
520 /**
521 * htmlNodeDumpFile:
522 * @out: the FILE pointer
523 * @doc: the document
524 * @cur: the current node
525 *
526 * Dump an HTML node, recursive behaviour,children are printed too,
527 * and formatting returns are added.
528 */
529 void
htmlNodeDumpFile(FILE * out,xmlDocPtr doc,xmlNodePtr cur)530 htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
531 htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
532 }
533
534 /**
535 * htmlDocDumpMemoryFormat:
536 * @cur: the document
537 * @mem: OUT: the memory pointer
538 * @size: OUT: the memory length
539 * @format: should formatting spaces been added
540 *
541 * Dump an HTML document in memory and return the xmlChar * and it's size.
542 * It's up to the caller to free the memory.
543 */
544 void
htmlDocDumpMemoryFormat(xmlDocPtr cur,xmlChar ** mem,int * size,int format)545 htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
546 xmlOutputBufferPtr buf;
547 xmlCharEncodingHandlerPtr handler = NULL;
548 const char *encoding;
549
550 xmlInitParser();
551
552 if ((mem == NULL) || (size == NULL))
553 return;
554 if (cur == NULL) {
555 *mem = NULL;
556 *size = 0;
557 return;
558 }
559
560 encoding = (const char *) htmlGetMetaEncoding(cur);
561 handler = htmlFindOutputEncoder(encoding);
562 buf = xmlAllocOutputBufferInternal(handler);
563 if (buf == NULL) {
564 *mem = NULL;
565 *size = 0;
566 return;
567 }
568
569 htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
570
571 xmlOutputBufferFlush(buf);
572 if (buf->conv != NULL) {
573 *size = xmlBufUse(buf->conv);
574 *mem = xmlStrndup(xmlBufContent(buf->conv), *size);
575 } else {
576 *size = xmlBufUse(buf->buffer);
577 *mem = xmlStrndup(xmlBufContent(buf->buffer), *size);
578 }
579 (void)xmlOutputBufferClose(buf);
580 }
581
582 /**
583 * htmlDocDumpMemory:
584 * @cur: the document
585 * @mem: OUT: the memory pointer
586 * @size: OUT: the memory length
587 *
588 * Dump an HTML document in memory and return the xmlChar * and it's size.
589 * It's up to the caller to free the memory.
590 */
591 void
htmlDocDumpMemory(xmlDocPtr cur,xmlChar ** mem,int * size)592 htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
593 htmlDocDumpMemoryFormat(cur, mem, size, 1);
594 }
595
596
597 /************************************************************************
598 * *
599 * Dumping HTML tree content to an I/O output buffer *
600 * *
601 ************************************************************************/
602
603 /**
604 * htmlDtdDumpOutput:
605 * @buf: the HTML buffer output
606 * @doc: the document
607 * @encoding: the encoding string
608 *
609 * TODO: check whether encoding is needed
610 *
611 * Dump the HTML document DTD, if any.
612 */
613 static void
htmlDtdDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,const char * encoding ATTRIBUTE_UNUSED)614 htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
615 const char *encoding ATTRIBUTE_UNUSED) {
616 xmlDtdPtr cur = doc->intSubset;
617
618 if (cur == NULL) {
619 htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL);
620 return;
621 }
622 xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
623 xmlOutputBufferWriteString(buf, (const char *)cur->name);
624 if (cur->ExternalID != NULL) {
625 xmlOutputBufferWriteString(buf, " PUBLIC ");
626 xmlBufWriteQuotedString(buf->buffer, cur->ExternalID);
627 if (cur->SystemID != NULL) {
628 xmlOutputBufferWriteString(buf, " ");
629 xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
630 }
631 } else if (cur->SystemID != NULL &&
632 xmlStrcmp(cur->SystemID, BAD_CAST "about:legacy-compat")) {
633 xmlOutputBufferWriteString(buf, " SYSTEM ");
634 xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
635 }
636 xmlOutputBufferWriteString(buf, ">\n");
637 }
638
639 /**
640 * htmlAttrDumpOutput:
641 * @buf: the HTML buffer output
642 * @doc: the document
643 * @cur: the attribute pointer
644 *
645 * Dump an HTML attribute
646 */
647 static void
htmlAttrDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlAttrPtr cur)648 htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
649 xmlChar *value;
650
651 /*
652 * The html output method should not escape a & character
653 * occurring in an attribute value immediately followed by
654 * a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
655 * This is implemented in xmlEncodeEntitiesReentrant
656 */
657
658 if (cur == NULL) {
659 return;
660 }
661 xmlOutputBufferWriteString(buf, " ");
662 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
663 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
664 xmlOutputBufferWriteString(buf, ":");
665 }
666 xmlOutputBufferWriteString(buf, (const char *)cur->name);
667 if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
668 value = xmlNodeListGetString(doc, cur->children, 0);
669 if (value) {
670 xmlOutputBufferWriteString(buf, "=");
671 if ((cur->ns == NULL) && (cur->parent != NULL) &&
672 (cur->parent->ns == NULL) &&
673 ((!xmlStrcasecmp(cur->name, BAD_CAST "href")) ||
674 (!xmlStrcasecmp(cur->name, BAD_CAST "action")) ||
675 (!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
676 ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
677 (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
678 xmlChar *escaped;
679 xmlChar *tmp = value;
680
681 while (IS_BLANK_CH(*tmp)) tmp++;
682
683 /*
684 * Angle brackets are technically illegal in URIs, but they're
685 * used in server side includes, for example. Curly brackets
686 * are illegal as well and often used in templates.
687 * Don't escape non-whitespace, printable ASCII chars for
688 * improved interoperability. Only escape space, control
689 * and non-ASCII chars.
690 */
691 escaped = xmlURIEscapeStr(tmp,
692 BAD_CAST "\"#$%&+,/:;<=>?@[\\]^`{|}");
693 if (escaped != NULL) {
694 xmlBufWriteQuotedString(buf->buffer, escaped);
695 xmlFree(escaped);
696 } else {
697 buf->error = XML_ERR_NO_MEMORY;
698 }
699 } else {
700 xmlBufWriteQuotedString(buf->buffer, value);
701 }
702 xmlFree(value);
703 } else {
704 buf->error = XML_ERR_NO_MEMORY;
705 }
706 }
707 }
708
709 /**
710 * htmlNodeDumpFormatOutput:
711 * @buf: the HTML buffer output
712 * @doc: the document
713 * @cur: the current node
714 * @encoding: the encoding string (unused)
715 * @format: should formatting spaces been added
716 *
717 * Dump an HTML node, recursive behaviour,children are printed too.
718 */
719 void
htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,const char * encoding ATTRIBUTE_UNUSED,int format)720 htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
721 xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED,
722 int format) {
723 xmlNodePtr root, parent;
724 xmlAttrPtr attr;
725 const htmlElemDesc * info;
726
727 xmlInitParser();
728
729 if ((cur == NULL) || (buf == NULL)) {
730 return;
731 }
732
733 root = cur;
734 parent = cur->parent;
735 while (1) {
736 switch (cur->type) {
737 case XML_HTML_DOCUMENT_NODE:
738 case XML_DOCUMENT_NODE:
739 if (((xmlDocPtr) cur)->intSubset != NULL) {
740 htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL);
741 }
742 if (cur->children != NULL) {
743 /* Always validate cur->parent when descending. */
744 if (cur->parent == parent) {
745 parent = cur;
746 cur = cur->children;
747 continue;
748 }
749 } else {
750 xmlOutputBufferWriteString(buf, "\n");
751 }
752 break;
753
754 case XML_ELEMENT_NODE:
755 /*
756 * Some users like lxml are known to pass nodes with a corrupted
757 * tree structure. Fall back to a recursive call to handle this
758 * case.
759 */
760 if ((cur->parent != parent) && (cur->children != NULL)) {
761 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
762 break;
763 }
764
765 /*
766 * Get specific HTML info for that node.
767 */
768 if (cur->ns == NULL)
769 info = htmlTagLookup(cur->name);
770 else
771 info = NULL;
772
773 xmlOutputBufferWriteString(buf, "<");
774 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
775 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
776 xmlOutputBufferWriteString(buf, ":");
777 }
778 xmlOutputBufferWriteString(buf, (const char *)cur->name);
779 if (cur->nsDef)
780 xmlNsListDumpOutput(buf, cur->nsDef);
781 attr = cur->properties;
782 while (attr != NULL) {
783 htmlAttrDumpOutput(buf, doc, attr);
784 attr = attr->next;
785 }
786
787 if ((info != NULL) && (info->empty)) {
788 xmlOutputBufferWriteString(buf, ">");
789 } else if (cur->children == NULL) {
790 if ((info != NULL) && (info->saveEndTag != 0) &&
791 (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
792 (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
793 xmlOutputBufferWriteString(buf, ">");
794 } else {
795 xmlOutputBufferWriteString(buf, "></");
796 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
797 xmlOutputBufferWriteString(buf,
798 (const char *)cur->ns->prefix);
799 xmlOutputBufferWriteString(buf, ":");
800 }
801 xmlOutputBufferWriteString(buf, (const char *)cur->name);
802 xmlOutputBufferWriteString(buf, ">");
803 }
804 } else {
805 xmlOutputBufferWriteString(buf, ">");
806 if ((format) && (info != NULL) && (!info->isinline) &&
807 (cur->children->type != HTML_TEXT_NODE) &&
808 (cur->children->type != HTML_ENTITY_REF_NODE) &&
809 (cur->children != cur->last) &&
810 (cur->name != NULL) &&
811 (cur->name[0] != 'p')) /* p, pre, param */
812 xmlOutputBufferWriteString(buf, "\n");
813 parent = cur;
814 cur = cur->children;
815 continue;
816 }
817
818 if ((format) && (cur->next != NULL) &&
819 (info != NULL) && (!info->isinline)) {
820 if ((cur->next->type != HTML_TEXT_NODE) &&
821 (cur->next->type != HTML_ENTITY_REF_NODE) &&
822 (parent != NULL) &&
823 (parent->name != NULL) &&
824 (parent->name[0] != 'p')) /* p, pre, param */
825 xmlOutputBufferWriteString(buf, "\n");
826 }
827
828 break;
829
830 case XML_ATTRIBUTE_NODE:
831 htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur);
832 break;
833
834 case HTML_TEXT_NODE:
835 if (cur->content == NULL)
836 break;
837 if (((cur->name == (const xmlChar *)xmlStringText) ||
838 (cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
839 ((parent == NULL) ||
840 ((xmlStrcasecmp(parent->name, BAD_CAST "script")) &&
841 (xmlStrcasecmp(parent->name, BAD_CAST "style"))))) {
842 xmlChar *buffer;
843
844 buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
845 if (buffer == NULL) {
846 buf->error = XML_ERR_NO_MEMORY;
847 return;
848 }
849 xmlOutputBufferWriteString(buf, (const char *)buffer);
850 xmlFree(buffer);
851 } else {
852 xmlOutputBufferWriteString(buf, (const char *)cur->content);
853 }
854 break;
855
856 case HTML_COMMENT_NODE:
857 if (cur->content != NULL) {
858 xmlOutputBufferWriteString(buf, "<!--");
859 xmlOutputBufferWriteString(buf, (const char *)cur->content);
860 xmlOutputBufferWriteString(buf, "-->");
861 }
862 break;
863
864 case HTML_PI_NODE:
865 if (cur->name != NULL) {
866 xmlOutputBufferWriteString(buf, "<?");
867 xmlOutputBufferWriteString(buf, (const char *)cur->name);
868 if (cur->content != NULL) {
869 xmlOutputBufferWriteString(buf, " ");
870 xmlOutputBufferWriteString(buf,
871 (const char *)cur->content);
872 }
873 xmlOutputBufferWriteString(buf, ">");
874 }
875 break;
876
877 case HTML_ENTITY_REF_NODE:
878 xmlOutputBufferWriteString(buf, "&");
879 xmlOutputBufferWriteString(buf, (const char *)cur->name);
880 xmlOutputBufferWriteString(buf, ";");
881 break;
882
883 case HTML_PRESERVE_NODE:
884 if (cur->content != NULL) {
885 xmlOutputBufferWriteString(buf, (const char *)cur->content);
886 }
887 break;
888
889 default:
890 break;
891 }
892
893 while (1) {
894 if (cur == root)
895 return;
896 if (cur->next != NULL) {
897 cur = cur->next;
898 break;
899 }
900
901 cur = parent;
902 /* cur->parent was validated when descending. */
903 parent = cur->parent;
904
905 if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
906 (cur->type == XML_DOCUMENT_NODE)) {
907 xmlOutputBufferWriteString(buf, "\n");
908 } else {
909 if ((format) && (cur->ns == NULL))
910 info = htmlTagLookup(cur->name);
911 else
912 info = NULL;
913
914 if ((format) && (info != NULL) && (!info->isinline) &&
915 (cur->last->type != HTML_TEXT_NODE) &&
916 (cur->last->type != HTML_ENTITY_REF_NODE) &&
917 (cur->children != cur->last) &&
918 (cur->name != NULL) &&
919 (cur->name[0] != 'p')) /* p, pre, param */
920 xmlOutputBufferWriteString(buf, "\n");
921
922 xmlOutputBufferWriteString(buf, "</");
923 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
924 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
925 xmlOutputBufferWriteString(buf, ":");
926 }
927 xmlOutputBufferWriteString(buf, (const char *)cur->name);
928 xmlOutputBufferWriteString(buf, ">");
929
930 if ((format) && (info != NULL) && (!info->isinline) &&
931 (cur->next != NULL)) {
932 if ((cur->next->type != HTML_TEXT_NODE) &&
933 (cur->next->type != HTML_ENTITY_REF_NODE) &&
934 (parent != NULL) &&
935 (parent->name != NULL) &&
936 (parent->name[0] != 'p')) /* p, pre, param */
937 xmlOutputBufferWriteString(buf, "\n");
938 }
939 }
940 }
941 }
942 }
943
944 /**
945 * htmlNodeDumpOutput:
946 * @buf: the HTML buffer output
947 * @doc: the document
948 * @cur: the current node
949 * @encoding: the encoding string (unused)
950 *
951 * Dump an HTML node, recursive behaviour,children are printed too,
952 * and formatting returns/spaces are added.
953 */
954 void
htmlNodeDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,const char * encoding ATTRIBUTE_UNUSED)955 htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
956 xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED) {
957 htmlNodeDumpFormatOutput(buf, doc, cur, NULL, 1);
958 }
959
960 /**
961 * htmlDocContentDumpFormatOutput:
962 * @buf: the HTML buffer output
963 * @cur: the document
964 * @encoding: the encoding string (unused)
965 * @format: should formatting spaces been added
966 *
967 * Dump an HTML document.
968 */
969 void
htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf,xmlDocPtr cur,const char * encoding ATTRIBUTE_UNUSED,int format)970 htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
971 const char *encoding ATTRIBUTE_UNUSED,
972 int format) {
973 int type = 0;
974 if (cur) {
975 type = cur->type;
976 cur->type = XML_HTML_DOCUMENT_NODE;
977 }
978 htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, format);
979 if (cur)
980 cur->type = (xmlElementType) type;
981 }
982
983 /**
984 * htmlDocContentDumpOutput:
985 * @buf: the HTML buffer output
986 * @cur: the document
987 * @encoding: the encoding string (unused)
988 *
989 * Dump an HTML document. Formatting return/spaces are added.
990 */
991 void
htmlDocContentDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr cur,const char * encoding ATTRIBUTE_UNUSED)992 htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
993 const char *encoding ATTRIBUTE_UNUSED) {
994 htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, 1);
995 }
996
997 /************************************************************************
998 * *
999 * Saving functions front-ends *
1000 * *
1001 ************************************************************************/
1002
1003 /**
1004 * htmlDocDump:
1005 * @f: the FILE*
1006 * @cur: the document
1007 *
1008 * Dump an HTML document to an open FILE.
1009 *
1010 * returns: the number of byte written or -1 in case of failure.
1011 */
1012 int
htmlDocDump(FILE * f,xmlDocPtr cur)1013 htmlDocDump(FILE *f, xmlDocPtr cur) {
1014 xmlOutputBufferPtr buf;
1015 xmlCharEncodingHandlerPtr handler = NULL;
1016 const char *encoding;
1017 int ret;
1018
1019 xmlInitParser();
1020
1021 if ((cur == NULL) || (f == NULL)) {
1022 return(-1);
1023 }
1024
1025 encoding = (const char *) htmlGetMetaEncoding(cur);
1026 handler = htmlFindOutputEncoder(encoding);
1027 buf = xmlOutputBufferCreateFile(f, handler);
1028 if (buf == NULL) return(-1);
1029 htmlDocContentDumpOutput(buf, cur, NULL);
1030
1031 ret = xmlOutputBufferClose(buf);
1032 return(ret);
1033 }
1034
1035 /**
1036 * htmlSaveFile:
1037 * @filename: the filename (or URL)
1038 * @cur: the document
1039 *
1040 * Dump an HTML document to a file. If @filename is "-" the stdout file is
1041 * used.
1042 * returns: the number of byte written or -1 in case of failure.
1043 */
1044 int
htmlSaveFile(const char * filename,xmlDocPtr cur)1045 htmlSaveFile(const char *filename, xmlDocPtr cur) {
1046 xmlOutputBufferPtr buf;
1047 xmlCharEncodingHandlerPtr handler = NULL;
1048 const char *encoding;
1049 int ret;
1050
1051 if ((cur == NULL) || (filename == NULL))
1052 return(-1);
1053
1054 xmlInitParser();
1055
1056 encoding = (const char *) htmlGetMetaEncoding(cur);
1057 handler = htmlFindOutputEncoder(encoding);
1058 buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1059 if (buf == NULL) return(0);
1060
1061 htmlDocContentDumpOutput(buf, cur, NULL);
1062
1063 ret = xmlOutputBufferClose(buf);
1064 return(ret);
1065 }
1066
1067 /**
1068 * htmlSaveFileFormat:
1069 * @filename: the filename
1070 * @cur: the document
1071 * @format: should formatting spaces been added
1072 * @encoding: the document encoding
1073 *
1074 * Dump an HTML document to a file using a given encoding.
1075 *
1076 * returns: the number of byte written or -1 in case of failure.
1077 */
1078 int
htmlSaveFileFormat(const char * filename,xmlDocPtr cur,const char * encoding,int format)1079 htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
1080 const char *encoding, int format) {
1081 xmlOutputBufferPtr buf;
1082 xmlCharEncodingHandlerPtr handler = NULL;
1083 int ret;
1084
1085 if ((cur == NULL) || (filename == NULL))
1086 return(-1);
1087
1088 xmlInitParser();
1089
1090 handler = htmlFindOutputEncoder(encoding);
1091 if (handler != NULL)
1092 htmlSetMetaEncoding(cur, (const xmlChar *) handler->name);
1093 else
1094 htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
1095
1096 /*
1097 * save the content to a temp buffer.
1098 */
1099 buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1100 if (buf == NULL) return(0);
1101
1102 htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
1103
1104 ret = xmlOutputBufferClose(buf);
1105 return(ret);
1106 }
1107
1108 /**
1109 * htmlSaveFileEnc:
1110 * @filename: the filename
1111 * @cur: the document
1112 * @encoding: the document encoding
1113 *
1114 * Dump an HTML document to a file using a given encoding
1115 * and formatting returns/spaces are added.
1116 *
1117 * returns: the number of byte written or -1 in case of failure.
1118 */
1119 int
htmlSaveFileEnc(const char * filename,xmlDocPtr cur,const char * encoding)1120 htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1121 return(htmlSaveFileFormat(filename, cur, encoding, 1));
1122 }
1123
1124 #endif /* LIBXML_OUTPUT_ENABLED */
1125
1126 #endif /* LIBXML_HTML_ENABLED */
1127