xref: /aosp_15_r20/external/cronet/third_party/libxml/src/HTMLtree.c (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 /*
2  * HTMLtree.c : implementation of access function for an HTML tree.
3  *
4  * See Copyright for the status of this software.
5  *
6  * [email protected]
7  */
8 
9 
10 #define IN_LIBXML
11 #include "libxml.h"
12 #ifdef LIBXML_HTML_ENABLED
13 
14 #include <string.h> /* for memset() only ! */
15 #include <ctype.h>
16 #include <stdlib.h>
17 
18 #include <libxml/xmlmemory.h>
19 #include <libxml/HTMLparser.h>
20 #include <libxml/HTMLtree.h>
21 #include <libxml/entities.h>
22 #include <libxml/xmlerror.h>
23 #include <libxml/parserInternals.h>
24 #include <libxml/uri.h>
25 
26 #include "private/buf.h"
27 #include "private/error.h"
28 #include "private/io.h"
29 #include "private/save.h"
30 
31 /************************************************************************
32  *									*
33  *		Getting/Setting encoding meta tags			*
34  *									*
35  ************************************************************************/
36 
37 /**
38  * htmlGetMetaEncoding:
39  * @doc:  the document
40  *
41  * Encoding definition lookup in the Meta tags
42  *
43  * Returns the current encoding as flagged in the HTML source
44  */
45 const xmlChar *
htmlGetMetaEncoding(htmlDocPtr doc)46 htmlGetMetaEncoding(htmlDocPtr doc) {
47     htmlNodePtr cur;
48     const xmlChar *content;
49     const xmlChar *encoding;
50 
51     if (doc == NULL)
52 	return(NULL);
53     cur = doc->children;
54 
55     /*
56      * Search the html
57      */
58     while (cur != NULL) {
59 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
60 	    if (xmlStrEqual(cur->name, BAD_CAST"html"))
61 		break;
62 	    if (xmlStrEqual(cur->name, BAD_CAST"head"))
63 		goto found_head;
64 	    if (xmlStrEqual(cur->name, BAD_CAST"meta"))
65 		goto found_meta;
66 	}
67 	cur = cur->next;
68     }
69     if (cur == NULL)
70 	return(NULL);
71     cur = cur->children;
72 
73     /*
74      * Search the head
75      */
76     while (cur != NULL) {
77 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
78 	    if (xmlStrEqual(cur->name, BAD_CAST"head"))
79 		break;
80 	    if (xmlStrEqual(cur->name, BAD_CAST"meta"))
81 		goto found_meta;
82 	}
83 	cur = cur->next;
84     }
85     if (cur == NULL)
86 	return(NULL);
87 found_head:
88     cur = cur->children;
89 
90     /*
91      * Search the meta elements
92      */
93 found_meta:
94     while (cur != NULL) {
95 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
96 	    if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
97 		xmlAttrPtr attr = cur->properties;
98 		int http;
99 		const xmlChar *value;
100 
101 		content = NULL;
102 		http = 0;
103 		while (attr != NULL) {
104 		    if ((attr->children != NULL) &&
105 		        (attr->children->type == XML_TEXT_NODE) &&
106 		        (attr->children->next == NULL)) {
107 			value = attr->children->content;
108 			if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
109 			 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
110 			    http = 1;
111 			else if ((value != NULL)
112 			 && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
113 			    content = value;
114 			if ((http != 0) && (content != NULL))
115 			    goto found_content;
116 		    }
117 		    attr = attr->next;
118 		}
119 	    }
120 	}
121 	cur = cur->next;
122     }
123     return(NULL);
124 
125 found_content:
126     encoding = xmlStrstr(content, BAD_CAST"charset=");
127     if (encoding == NULL)
128 	encoding = xmlStrstr(content, BAD_CAST"Charset=");
129     if (encoding == NULL)
130 	encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
131     if (encoding != NULL) {
132 	encoding += 8;
133     } else {
134 	encoding = xmlStrstr(content, BAD_CAST"charset =");
135 	if (encoding == NULL)
136 	    encoding = xmlStrstr(content, BAD_CAST"Charset =");
137 	if (encoding == NULL)
138 	    encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
139 	if (encoding != NULL)
140 	    encoding += 9;
141     }
142     if (encoding != NULL) {
143 	while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
144     }
145     return(encoding);
146 }
147 
148 /**
149  * htmlSetMetaEncoding:
150  * @doc:  the document
151  * @encoding:  the encoding string
152  *
153  * Sets the current encoding in the Meta tags
154  * NOTE: this will not change the document content encoding, just
155  * the META flag associated.
156  *
157  * Returns 0 in case of success and -1 in case of error
158  */
159 int
htmlSetMetaEncoding(htmlDocPtr doc,const xmlChar * encoding)160 htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
161     htmlNodePtr cur, meta = NULL, head = NULL;
162     const xmlChar *content = NULL;
163     char newcontent[100];
164 
165     newcontent[0] = 0;
166 
167     if (doc == NULL)
168 	return(-1);
169 
170     /* html isn't a real encoding it's just libxml2 way to get entities */
171     if (!xmlStrcasecmp(encoding, BAD_CAST "html"))
172         return(-1);
173 
174     if (encoding != NULL) {
175 	snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
176                 (char *)encoding);
177 	newcontent[sizeof(newcontent) - 1] = 0;
178     }
179 
180     cur = doc->children;
181 
182     /*
183      * Search the html
184      */
185     while (cur != NULL) {
186 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
187 	    if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
188 		break;
189 	    if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
190 		goto found_head;
191 	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
192 		goto found_meta;
193 	}
194 	cur = cur->next;
195     }
196     if (cur == NULL)
197 	return(-1);
198     cur = cur->children;
199 
200     /*
201      * Search the head
202      */
203     while (cur != NULL) {
204 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
205 	    if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
206 		break;
207 	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
208                 head = cur->parent;
209 		goto found_meta;
210             }
211 	}
212 	cur = cur->next;
213     }
214     if (cur == NULL)
215 	return(-1);
216 found_head:
217     head = cur;
218     if (cur->children == NULL)
219         goto create;
220     cur = cur->children;
221 
222 found_meta:
223     /*
224      * Search and update all the remaining the meta elements carrying
225      * encoding information
226      */
227     while (cur != NULL) {
228 	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
229 	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
230 		xmlAttrPtr attr = cur->properties;
231 		int http;
232 		const xmlChar *value;
233 
234 		content = NULL;
235 		http = 0;
236 		while (attr != NULL) {
237 		    if ((attr->children != NULL) &&
238 		        (attr->children->type == XML_TEXT_NODE) &&
239 		        (attr->children->next == NULL)) {
240 			value = attr->children->content;
241 			if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
242 			 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
243 			    http = 1;
244 			else
245                         {
246                            if ((value != NULL) &&
247                                (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
248 			       content = value;
249                         }
250 		        if ((http != 0) && (content != NULL))
251 			    break;
252 		    }
253 		    attr = attr->next;
254 		}
255 		if ((http != 0) && (content != NULL)) {
256 		    meta = cur;
257 		    break;
258 		}
259 
260 	    }
261 	}
262 	cur = cur->next;
263     }
264 create:
265     if (meta == NULL) {
266         if ((encoding != NULL) && (head != NULL)) {
267             /*
268              * Create a new Meta element with the right attributes
269              */
270 
271             meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
272             if (head->children == NULL)
273                 xmlAddChild(head, meta);
274             else
275                 xmlAddPrevSibling(head->children, meta);
276             xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
277             xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
278         }
279     } else {
280         /* remove the meta tag if NULL is passed */
281         if (encoding == NULL) {
282             xmlUnlinkNode(meta);
283             xmlFreeNode(meta);
284         }
285         /* change the document only if there is a real encoding change */
286         else if (xmlStrcasestr(content, encoding) == NULL) {
287             xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent);
288         }
289     }
290 
291 
292     return(0);
293 }
294 
/**
 * htmlBooleanAttrs:
 *
 * NULL-terminated list of the HTML attributes which will be output
 * in minimized form, i.e. <option selected="selected"> will be
 * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method"
 */
static const char* const htmlBooleanAttrs[] = {
    "checked",
    "compact",
    "declare",
    "defer",
    "disabled",
    "ismap",
    "multiple",
    "nohref",
    "noresize",
    "noshade",
    "nowrap",
    "readonly",
    "selected",
    NULL
};
308 
309 
310 /**
311  * htmlIsBooleanAttr:
312  * @name:  the name of the attribute to check
313  *
314  * Determine if a given attribute is a boolean attribute.
315  *
316  * returns: false if the attribute is not boolean, true otherwise.
317  */
318 int
htmlIsBooleanAttr(const xmlChar * name)319 htmlIsBooleanAttr(const xmlChar *name)
320 {
321     int i = 0;
322 
323     while (htmlBooleanAttrs[i] != NULL) {
324         if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
325             return 1;
326         i++;
327     }
328     return 0;
329 }
330 
331 #ifdef LIBXML_OUTPUT_ENABLED
332 /************************************************************************
333  *									*
334  *			Output error handlers				*
335  *									*
336  ************************************************************************/
337 
338 /**
339  * htmlSaveErr:
340  * @code:  the error number
341  * @node:  the location of the error.
342  * @extra:  extra information
343  *
344  * Handle an out of memory condition
345  */
346 static void
htmlSaveErr(int code,xmlNodePtr node,const char * extra)347 htmlSaveErr(int code, xmlNodePtr node, const char *extra)
348 {
349     const char *msg = NULL;
350     int res;
351 
352     switch(code) {
353         case XML_SAVE_NOT_UTF8:
354 	    msg = "string is not in UTF-8\n";
355 	    break;
356 	case XML_SAVE_CHAR_INVALID:
357 	    msg = "invalid character value\n";
358 	    break;
359 	case XML_SAVE_UNKNOWN_ENCODING:
360 	    msg = "unknown encoding %s\n";
361 	    break;
362 	case XML_SAVE_NO_DOCTYPE:
363 	    msg = "HTML has no DOCTYPE\n";
364 	    break;
365 	default:
366 	    msg = "unexpected error number\n";
367     }
368 
369     res = __xmlRaiseError(NULL, NULL, NULL, NULL, node,
370                           XML_FROM_OUTPUT, code, XML_ERR_ERROR, NULL, 0,
371                           extra, NULL, NULL, 0, 0,
372                           msg, extra);
373     if (res < 0)
374         xmlRaiseMemoryError(NULL, NULL, NULL, XML_FROM_OUTPUT, NULL);
375 }
376 
377 /************************************************************************
378  *									*
379  *		Dumping HTML tree content to a simple buffer		*
380  *									*
381  ************************************************************************/
382 
383 static xmlCharEncodingHandler *
htmlFindOutputEncoder(const char * encoding)384 htmlFindOutputEncoder(const char *encoding) {
385     xmlCharEncodingHandler *handler = NULL;
386 
387     if (encoding != NULL) {
388 	xmlCharEncoding enc;
389 
390 	enc = xmlParseCharEncoding(encoding);
391 	if (enc != XML_CHAR_ENCODING_UTF8) {
392 	    xmlOpenCharEncodingHandler(encoding, /* output */ 1, &handler);
393 	    if (handler == NULL)
394 		htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
395 	}
396     } else {
397         /*
398          * Fallback to HTML or ASCII when the encoding is unspecified
399          */
400         if (handler == NULL)
401             xmlOpenCharEncodingHandler("HTML", /* output */ 1, &handler);
402         if (handler == NULL)
403             xmlOpenCharEncodingHandler("ascii", /* output */ 1, &handler);
404     }
405 
406     return(handler);
407 }
408 
409 /**
410  * htmlBufNodeDumpFormat:
411  * @buf:  the xmlBufPtr output
412  * @doc:  the document
413  * @cur:  the current node
414  * @format:  should formatting spaces been added
415  *
416  * Dump an HTML node, recursive behaviour,children are printed too.
417  *
418  * Returns the number of byte written or -1 in case of error
419  */
420 static size_t
htmlBufNodeDumpFormat(xmlBufPtr buf,xmlDocPtr doc,xmlNodePtr cur,int format)421 htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur,
422 	           int format) {
423     size_t use;
424     int ret;
425     xmlOutputBufferPtr outbuf;
426 
427     if (cur == NULL) {
428 	return (-1);
429     }
430     if (buf == NULL) {
431 	return (-1);
432     }
433     outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
434     if (outbuf == NULL)
435 	return (-1);
436     memset(outbuf, 0, sizeof(xmlOutputBuffer));
437     outbuf->buffer = buf;
438     outbuf->encoder = NULL;
439     outbuf->writecallback = NULL;
440     outbuf->closecallback = NULL;
441     outbuf->context = NULL;
442     outbuf->written = 0;
443 
444     use = xmlBufUse(buf);
445     htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
446     xmlFree(outbuf);
447     ret = xmlBufUse(buf) - use;
448     return (ret);
449 }
450 
451 /**
452  * htmlNodeDump:
453  * @buf:  the HTML buffer output
454  * @doc:  the document
455  * @cur:  the current node
456  *
457  * Dump an HTML node, recursive behaviour,children are printed too,
458  * and formatting returns are added.
459  *
460  * Returns the number of byte written or -1 in case of error
461  */
462 int
htmlNodeDump(xmlBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur)463 htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
464     xmlBufPtr buffer;
465     size_t ret;
466 
467     if ((buf == NULL) || (cur == NULL))
468         return(-1);
469 
470     xmlInitParser();
471     buffer = xmlBufFromBuffer(buf);
472     if (buffer == NULL)
473         return(-1);
474 
475     ret = htmlBufNodeDumpFormat(buffer, doc, cur, 1);
476 
477     xmlBufBackToBuffer(buffer);
478 
479     if (ret > INT_MAX)
480         return(-1);
481     return((int) ret);
482 }
483 
484 /**
485  * htmlNodeDumpFileFormat:
486  * @out:  the FILE pointer
487  * @doc:  the document
488  * @cur:  the current node
489  * @encoding: the document encoding
490  * @format:  should formatting spaces been added
491  *
492  * Dump an HTML node, recursive behaviour,children are printed too.
493  *
494  * TODO: if encoding == NULL try to save in the doc encoding
495  *
496  * returns: the number of byte written or -1 in case of failure.
497  */
498 int
htmlNodeDumpFileFormat(FILE * out,xmlDocPtr doc,xmlNodePtr cur,const char * encoding,int format)499 htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
500 	               xmlNodePtr cur, const char *encoding, int format) {
501     xmlOutputBufferPtr buf;
502     xmlCharEncodingHandlerPtr handler;
503     int ret;
504 
505     xmlInitParser();
506 
507     /*
508      * save the content to a temp buffer.
509      */
510     handler = htmlFindOutputEncoder(encoding);
511     buf = xmlOutputBufferCreateFile(out, handler);
512     if (buf == NULL) return(0);
513 
514     htmlNodeDumpFormatOutput(buf, doc, cur, NULL, format);
515 
516     ret = xmlOutputBufferClose(buf);
517     return(ret);
518 }
519 
520 /**
521  * htmlNodeDumpFile:
522  * @out:  the FILE pointer
523  * @doc:  the document
524  * @cur:  the current node
525  *
526  * Dump an HTML node, recursive behaviour,children are printed too,
527  * and formatting returns are added.
528  */
529 void
htmlNodeDumpFile(FILE * out,xmlDocPtr doc,xmlNodePtr cur)530 htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
531     htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
532 }
533 
534 /**
535  * htmlDocDumpMemoryFormat:
536  * @cur:  the document
537  * @mem:  OUT: the memory pointer
538  * @size:  OUT: the memory length
539  * @format:  should formatting spaces been added
540  *
541  * Dump an HTML document in memory and return the xmlChar * and it's size.
542  * It's up to the caller to free the memory.
543  */
544 void
htmlDocDumpMemoryFormat(xmlDocPtr cur,xmlChar ** mem,int * size,int format)545 htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
546     xmlOutputBufferPtr buf;
547     xmlCharEncodingHandlerPtr handler = NULL;
548     const char *encoding;
549 
550     xmlInitParser();
551 
552     if ((mem == NULL) || (size == NULL))
553         return;
554     if (cur == NULL) {
555 	*mem = NULL;
556 	*size = 0;
557 	return;
558     }
559 
560     encoding = (const char *) htmlGetMetaEncoding(cur);
561     handler = htmlFindOutputEncoder(encoding);
562     buf = xmlAllocOutputBufferInternal(handler);
563     if (buf == NULL) {
564 	*mem = NULL;
565 	*size = 0;
566 	return;
567     }
568 
569     htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
570 
571     xmlOutputBufferFlush(buf);
572     if (buf->conv != NULL) {
573 	*size = xmlBufUse(buf->conv);
574 	*mem = xmlStrndup(xmlBufContent(buf->conv), *size);
575     } else {
576 	*size = xmlBufUse(buf->buffer);
577 	*mem = xmlStrndup(xmlBufContent(buf->buffer), *size);
578     }
579     (void)xmlOutputBufferClose(buf);
580 }
581 
582 /**
583  * htmlDocDumpMemory:
584  * @cur:  the document
585  * @mem:  OUT: the memory pointer
586  * @size:  OUT: the memory length
587  *
588  * Dump an HTML document in memory and return the xmlChar * and it's size.
589  * It's up to the caller to free the memory.
590  */
591 void
htmlDocDumpMemory(xmlDocPtr cur,xmlChar ** mem,int * size)592 htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
593 	htmlDocDumpMemoryFormat(cur, mem, size, 1);
594 }
595 
596 
597 /************************************************************************
598  *									*
599  *		Dumping HTML tree content to an I/O output buffer	*
600  *									*
601  ************************************************************************/
602 
603 /**
604  * htmlDtdDumpOutput:
605  * @buf:  the HTML buffer output
606  * @doc:  the document
607  * @encoding:  the encoding string
608  *
609  * TODO: check whether encoding is needed
610  *
611  * Dump the HTML document DTD, if any.
612  */
613 static void
htmlDtdDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,const char * encoding ATTRIBUTE_UNUSED)614 htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
615 	          const char *encoding ATTRIBUTE_UNUSED) {
616     xmlDtdPtr cur = doc->intSubset;
617 
618     if (cur == NULL) {
619 	htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL);
620 	return;
621     }
622     xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
623     xmlOutputBufferWriteString(buf, (const char *)cur->name);
624     if (cur->ExternalID != NULL) {
625 	xmlOutputBufferWriteString(buf, " PUBLIC ");
626 	xmlBufWriteQuotedString(buf->buffer, cur->ExternalID);
627 	if (cur->SystemID != NULL) {
628 	    xmlOutputBufferWriteString(buf, " ");
629 	    xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
630 	}
631     } else if (cur->SystemID != NULL &&
632 	       xmlStrcmp(cur->SystemID, BAD_CAST "about:legacy-compat")) {
633 	xmlOutputBufferWriteString(buf, " SYSTEM ");
634 	xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
635     }
636     xmlOutputBufferWriteString(buf, ">\n");
637 }
638 
639 /**
640  * htmlAttrDumpOutput:
641  * @buf:  the HTML buffer output
642  * @doc:  the document
643  * @cur:  the attribute pointer
644  *
645  * Dump an HTML attribute
646  */
647 static void
htmlAttrDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlAttrPtr cur)648 htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
649     xmlChar *value;
650 
651     /*
652      * The html output method should not escape a & character
653      * occurring in an attribute value immediately followed by
654      * a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
655      * This is implemented in xmlEncodeEntitiesReentrant
656      */
657 
658     if (cur == NULL) {
659 	return;
660     }
661     xmlOutputBufferWriteString(buf, " ");
662     if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
663         xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
664 	xmlOutputBufferWriteString(buf, ":");
665     }
666     xmlOutputBufferWriteString(buf, (const char *)cur->name);
667     if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
668 	value = xmlNodeListGetString(doc, cur->children, 0);
669 	if (value) {
670 	    xmlOutputBufferWriteString(buf, "=");
671 	    if ((cur->ns == NULL) && (cur->parent != NULL) &&
672 		(cur->parent->ns == NULL) &&
673 		((!xmlStrcasecmp(cur->name, BAD_CAST "href")) ||
674 	         (!xmlStrcasecmp(cur->name, BAD_CAST "action")) ||
675 		 (!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
676 		 ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
677 		  (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
678 		xmlChar *escaped;
679 		xmlChar *tmp = value;
680 
681 		while (IS_BLANK_CH(*tmp)) tmp++;
682 
683 		/*
684                  * Angle brackets are technically illegal in URIs, but they're
685                  * used in server side includes, for example. Curly brackets
686                  * are illegal as well and often used in templates.
687                  * Don't escape non-whitespace, printable ASCII chars for
688                  * improved interoperability. Only escape space, control
689                  * and non-ASCII chars.
690 		 */
691 		escaped = xmlURIEscapeStr(tmp,
692                         BAD_CAST "\"#$%&+,/:;<=>?@[\\]^`{|}");
693 		if (escaped != NULL) {
694 		    xmlBufWriteQuotedString(buf->buffer, escaped);
695 		    xmlFree(escaped);
696 		} else {
697                     buf->error = XML_ERR_NO_MEMORY;
698 		}
699 	    } else {
700 		xmlBufWriteQuotedString(buf->buffer, value);
701 	    }
702 	    xmlFree(value);
703 	} else  {
704             buf->error = XML_ERR_NO_MEMORY;
705 	}
706     }
707 }
708 
709 /**
710  * htmlNodeDumpFormatOutput:
711  * @buf:  the HTML buffer output
712  * @doc:  the document
713  * @cur:  the current node
714  * @encoding:  the encoding string (unused)
715  * @format:  should formatting spaces been added
716  *
717  * Dump an HTML node, recursive behaviour,children are printed too.
718  */
719 void
htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,const char * encoding ATTRIBUTE_UNUSED,int format)720 htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
721 	                 xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED,
722                          int format) {
723     xmlNodePtr root, parent;
724     xmlAttrPtr attr;
725     const htmlElemDesc * info;
726 
727     xmlInitParser();
728 
729     if ((cur == NULL) || (buf == NULL)) {
730 	return;
731     }
732 
733     root = cur;
734     parent = cur->parent;
735     while (1) {
736         switch (cur->type) {
737         case XML_HTML_DOCUMENT_NODE:
738         case XML_DOCUMENT_NODE:
739             if (((xmlDocPtr) cur)->intSubset != NULL) {
740                 htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL);
741             }
742             if (cur->children != NULL) {
743                 /* Always validate cur->parent when descending. */
744                 if (cur->parent == parent) {
745                     parent = cur;
746                     cur = cur->children;
747                     continue;
748                 }
749             } else {
750                 xmlOutputBufferWriteString(buf, "\n");
751             }
752             break;
753 
754         case XML_ELEMENT_NODE:
755             /*
756              * Some users like lxml are known to pass nodes with a corrupted
757              * tree structure. Fall back to a recursive call to handle this
758              * case.
759              */
760             if ((cur->parent != parent) && (cur->children != NULL)) {
761                 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
762                 break;
763             }
764 
765             /*
766              * Get specific HTML info for that node.
767              */
768             if (cur->ns == NULL)
769                 info = htmlTagLookup(cur->name);
770             else
771                 info = NULL;
772 
773             xmlOutputBufferWriteString(buf, "<");
774             if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
775                 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
776                 xmlOutputBufferWriteString(buf, ":");
777             }
778             xmlOutputBufferWriteString(buf, (const char *)cur->name);
779             if (cur->nsDef)
780                 xmlNsListDumpOutput(buf, cur->nsDef);
781             attr = cur->properties;
782             while (attr != NULL) {
783                 htmlAttrDumpOutput(buf, doc, attr);
784                 attr = attr->next;
785             }
786 
787             if ((info != NULL) && (info->empty)) {
788                 xmlOutputBufferWriteString(buf, ">");
789             } else if (cur->children == NULL) {
790                 if ((info != NULL) && (info->saveEndTag != 0) &&
791                     (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
792                     (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
793                     xmlOutputBufferWriteString(buf, ">");
794                 } else {
795                     xmlOutputBufferWriteString(buf, "></");
796                     if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
797                         xmlOutputBufferWriteString(buf,
798                                 (const char *)cur->ns->prefix);
799                         xmlOutputBufferWriteString(buf, ":");
800                     }
801                     xmlOutputBufferWriteString(buf, (const char *)cur->name);
802                     xmlOutputBufferWriteString(buf, ">");
803                 }
804             } else {
805                 xmlOutputBufferWriteString(buf, ">");
806                 if ((format) && (info != NULL) && (!info->isinline) &&
807                     (cur->children->type != HTML_TEXT_NODE) &&
808                     (cur->children->type != HTML_ENTITY_REF_NODE) &&
809                     (cur->children != cur->last) &&
810                     (cur->name != NULL) &&
811                     (cur->name[0] != 'p')) /* p, pre, param */
812                     xmlOutputBufferWriteString(buf, "\n");
813                 parent = cur;
814                 cur = cur->children;
815                 continue;
816             }
817 
818             if ((format) && (cur->next != NULL) &&
819                 (info != NULL) && (!info->isinline)) {
820                 if ((cur->next->type != HTML_TEXT_NODE) &&
821                     (cur->next->type != HTML_ENTITY_REF_NODE) &&
822                     (parent != NULL) &&
823                     (parent->name != NULL) &&
824                     (parent->name[0] != 'p')) /* p, pre, param */
825                     xmlOutputBufferWriteString(buf, "\n");
826             }
827 
828             break;
829 
830         case XML_ATTRIBUTE_NODE:
831             htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur);
832             break;
833 
834         case HTML_TEXT_NODE:
835             if (cur->content == NULL)
836                 break;
837             if (((cur->name == (const xmlChar *)xmlStringText) ||
838                  (cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
839                 ((parent == NULL) ||
840                  ((xmlStrcasecmp(parent->name, BAD_CAST "script")) &&
841                   (xmlStrcasecmp(parent->name, BAD_CAST "style"))))) {
842                 xmlChar *buffer;
843 
844                 buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
845                 if (buffer == NULL) {
846                     buf->error = XML_ERR_NO_MEMORY;
847                     return;
848                 }
849                 xmlOutputBufferWriteString(buf, (const char *)buffer);
850                 xmlFree(buffer);
851             } else {
852                 xmlOutputBufferWriteString(buf, (const char *)cur->content);
853             }
854             break;
855 
856         case HTML_COMMENT_NODE:
857             if (cur->content != NULL) {
858                 xmlOutputBufferWriteString(buf, "<!--");
859                 xmlOutputBufferWriteString(buf, (const char *)cur->content);
860                 xmlOutputBufferWriteString(buf, "-->");
861             }
862             break;
863 
864         case HTML_PI_NODE:
865             if (cur->name != NULL) {
866                 xmlOutputBufferWriteString(buf, "<?");
867                 xmlOutputBufferWriteString(buf, (const char *)cur->name);
868                 if (cur->content != NULL) {
869                     xmlOutputBufferWriteString(buf, " ");
870                     xmlOutputBufferWriteString(buf,
871                             (const char *)cur->content);
872                 }
873                 xmlOutputBufferWriteString(buf, ">");
874             }
875             break;
876 
877         case HTML_ENTITY_REF_NODE:
878             xmlOutputBufferWriteString(buf, "&");
879             xmlOutputBufferWriteString(buf, (const char *)cur->name);
880             xmlOutputBufferWriteString(buf, ";");
881             break;
882 
883         case HTML_PRESERVE_NODE:
884             if (cur->content != NULL) {
885                 xmlOutputBufferWriteString(buf, (const char *)cur->content);
886             }
887             break;
888 
889         default:
890             break;
891         }
892 
893         while (1) {
894             if (cur == root)
895                 return;
896             if (cur->next != NULL) {
897                 cur = cur->next;
898                 break;
899             }
900 
901             cur = parent;
902             /* cur->parent was validated when descending. */
903             parent = cur->parent;
904 
905             if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
906                 (cur->type == XML_DOCUMENT_NODE)) {
907                 xmlOutputBufferWriteString(buf, "\n");
908             } else {
909                 if ((format) && (cur->ns == NULL))
910                     info = htmlTagLookup(cur->name);
911                 else
912                     info = NULL;
913 
914                 if ((format) && (info != NULL) && (!info->isinline) &&
915                     (cur->last->type != HTML_TEXT_NODE) &&
916                     (cur->last->type != HTML_ENTITY_REF_NODE) &&
917                     (cur->children != cur->last) &&
918                     (cur->name != NULL) &&
919                     (cur->name[0] != 'p')) /* p, pre, param */
920                     xmlOutputBufferWriteString(buf, "\n");
921 
922                 xmlOutputBufferWriteString(buf, "</");
923                 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
924                     xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
925                     xmlOutputBufferWriteString(buf, ":");
926                 }
927                 xmlOutputBufferWriteString(buf, (const char *)cur->name);
928                 xmlOutputBufferWriteString(buf, ">");
929 
930                 if ((format) && (info != NULL) && (!info->isinline) &&
931                     (cur->next != NULL)) {
932                     if ((cur->next->type != HTML_TEXT_NODE) &&
933                         (cur->next->type != HTML_ENTITY_REF_NODE) &&
934                         (parent != NULL) &&
935                         (parent->name != NULL) &&
936                         (parent->name[0] != 'p')) /* p, pre, param */
937                         xmlOutputBufferWriteString(buf, "\n");
938                 }
939             }
940         }
941     }
942 }
943 
944 /**
945  * htmlNodeDumpOutput:
946  * @buf:  the HTML buffer output
947  * @doc:  the document
948  * @cur:  the current node
949  * @encoding:  the encoding string (unused)
950  *
951  * Dump an HTML node, recursive behaviour,children are printed too,
952  * and formatting returns/spaces are added.
953  */
954 void
htmlNodeDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr doc,xmlNodePtr cur,const char * encoding ATTRIBUTE_UNUSED)955 htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
956 	           xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED) {
957     htmlNodeDumpFormatOutput(buf, doc, cur, NULL, 1);
958 }
959 
960 /**
961  * htmlDocContentDumpFormatOutput:
962  * @buf:  the HTML buffer output
963  * @cur:  the document
964  * @encoding:  the encoding string (unused)
965  * @format:  should formatting spaces been added
966  *
967  * Dump an HTML document.
968  */
969 void
htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf,xmlDocPtr cur,const char * encoding ATTRIBUTE_UNUSED,int format)970 htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
971 	                       const char *encoding ATTRIBUTE_UNUSED,
972                                int format) {
973     int type = 0;
974     if (cur) {
975         type = cur->type;
976         cur->type = XML_HTML_DOCUMENT_NODE;
977     }
978     htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, format);
979     if (cur)
980         cur->type = (xmlElementType) type;
981 }
982 
983 /**
984  * htmlDocContentDumpOutput:
985  * @buf:  the HTML buffer output
986  * @cur:  the document
987  * @encoding:  the encoding string (unused)
988  *
989  * Dump an HTML document. Formatting return/spaces are added.
990  */
991 void
htmlDocContentDumpOutput(xmlOutputBufferPtr buf,xmlDocPtr cur,const char * encoding ATTRIBUTE_UNUSED)992 htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
993 	                 const char *encoding ATTRIBUTE_UNUSED) {
994     htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, 1);
995 }
996 
997 /************************************************************************
998  *									*
999  *		Saving functions front-ends				*
1000  *									*
1001  ************************************************************************/
1002 
1003 /**
1004  * htmlDocDump:
1005  * @f:  the FILE*
1006  * @cur:  the document
1007  *
1008  * Dump an HTML document to an open FILE.
1009  *
1010  * returns: the number of byte written or -1 in case of failure.
1011  */
1012 int
htmlDocDump(FILE * f,xmlDocPtr cur)1013 htmlDocDump(FILE *f, xmlDocPtr cur) {
1014     xmlOutputBufferPtr buf;
1015     xmlCharEncodingHandlerPtr handler = NULL;
1016     const char *encoding;
1017     int ret;
1018 
1019     xmlInitParser();
1020 
1021     if ((cur == NULL) || (f == NULL)) {
1022 	return(-1);
1023     }
1024 
1025     encoding = (const char *) htmlGetMetaEncoding(cur);
1026     handler = htmlFindOutputEncoder(encoding);
1027     buf = xmlOutputBufferCreateFile(f, handler);
1028     if (buf == NULL) return(-1);
1029     htmlDocContentDumpOutput(buf, cur, NULL);
1030 
1031     ret = xmlOutputBufferClose(buf);
1032     return(ret);
1033 }
1034 
1035 /**
1036  * htmlSaveFile:
1037  * @filename:  the filename (or URL)
1038  * @cur:  the document
1039  *
1040  * Dump an HTML document to a file. If @filename is "-" the stdout file is
1041  * used.
1042  * returns: the number of byte written or -1 in case of failure.
1043  */
1044 int
htmlSaveFile(const char * filename,xmlDocPtr cur)1045 htmlSaveFile(const char *filename, xmlDocPtr cur) {
1046     xmlOutputBufferPtr buf;
1047     xmlCharEncodingHandlerPtr handler = NULL;
1048     const char *encoding;
1049     int ret;
1050 
1051     if ((cur == NULL) || (filename == NULL))
1052         return(-1);
1053 
1054     xmlInitParser();
1055 
1056     encoding = (const char *) htmlGetMetaEncoding(cur);
1057     handler = htmlFindOutputEncoder(encoding);
1058     buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1059     if (buf == NULL) return(0);
1060 
1061     htmlDocContentDumpOutput(buf, cur, NULL);
1062 
1063     ret = xmlOutputBufferClose(buf);
1064     return(ret);
1065 }
1066 
1067 /**
1068  * htmlSaveFileFormat:
1069  * @filename:  the filename
1070  * @cur:  the document
1071  * @format:  should formatting spaces been added
1072  * @encoding: the document encoding
1073  *
1074  * Dump an HTML document to a file using a given encoding.
1075  *
1076  * returns: the number of byte written or -1 in case of failure.
1077  */
1078 int
htmlSaveFileFormat(const char * filename,xmlDocPtr cur,const char * encoding,int format)1079 htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
1080 	           const char *encoding, int format) {
1081     xmlOutputBufferPtr buf;
1082     xmlCharEncodingHandlerPtr handler = NULL;
1083     int ret;
1084 
1085     if ((cur == NULL) || (filename == NULL))
1086         return(-1);
1087 
1088     xmlInitParser();
1089 
1090     handler = htmlFindOutputEncoder(encoding);
1091     if (handler != NULL)
1092         htmlSetMetaEncoding(cur, (const xmlChar *) handler->name);
1093     else
1094 	htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
1095 
1096     /*
1097      * save the content to a temp buffer.
1098      */
1099     buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1100     if (buf == NULL) return(0);
1101 
1102     htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
1103 
1104     ret = xmlOutputBufferClose(buf);
1105     return(ret);
1106 }
1107 
1108 /**
1109  * htmlSaveFileEnc:
1110  * @filename:  the filename
1111  * @cur:  the document
1112  * @encoding: the document encoding
1113  *
1114  * Dump an HTML document to a file using a given encoding
1115  * and formatting returns/spaces are added.
1116  *
1117  * returns: the number of byte written or -1 in case of failure.
1118  */
1119 int
htmlSaveFileEnc(const char * filename,xmlDocPtr cur,const char * encoding)1120 htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1121     return(htmlSaveFileFormat(filename, cur, encoding, 1));
1122 }
1123 
1124 #endif /* LIBXML_OUTPUT_ENABLED */
1125 
1126 #endif /* LIBXML_HTML_ENABLED */
1127