Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2026-07-04 08:30:28

0001 /**
0002  * @file
0003  * 
0004  * @brief HTML parser, doesn't support HTML5
0005  * 
0006  * This module originally implemented an HTML parser based on the
0007  * (underspecified) HTML 4.0 spec. As of 2.14, the tokenizer
0008  * conforms to HTML5. Tree construction still follows a custom,
0009  * unspecified algorithm with many differences to HTML5.
0010  *
0011  * The parser defaults to ISO-8859-1, the default encoding of
0012  * HTTP/1.0.
0013  *
0014  * @copyright See Copyright for the status of this software.
0015  *
0016  * @author Daniel Veillard
0017  */
0018 
0019 #ifndef __HTML_PARSER_H__
0020 #define __HTML_PARSER_H__
0021 #include <libxml/xmlversion.h>
0022 #include <libxml/parser.h>
0023 
0024 #ifdef LIBXML_HTML_ENABLED
0025 
0026 #ifdef __cplusplus
0027 extern "C" {
0028 #endif
0029 
0030 /* Backward compatibility: legacy names kept as macros.
0031  * Every macro argument is parenthesized in the expansion so that an
0032  * expression argument (for example `p + 1`) expands as intended. */
0033 #define UTF8ToHtml htmlUTF8ToHtml
0034 #define htmlDefaultSubelement(elt) (elt)->defaultsubelt
0035 #define htmlElementAllowedHereDesc(parent,elt) \
0036     htmlElementAllowedHere((parent), (elt)->name)
0037 #define htmlRequiredAttrs(elt) (elt)->attrs_req
0038 
0039 /* Most of the back-end structures from XML and HTML are shared:
0040  * each html-prefixed type below is a plain typedef alias of the
0041  * xml-prefixed type it is declared from. */
0042 /** Same as xmlParserCtxt */
0043 typedef xmlParserCtxt htmlParserCtxt;
0044 typedef xmlParserCtxtPtr htmlParserCtxtPtr; /**< Same as xmlParserCtxtPtr */
0045 typedef xmlParserNodeInfo htmlParserNodeInfo; /**< Same as xmlParserNodeInfo */
0046 /** Same as xmlSAXHandler */
0047 typedef xmlSAXHandler htmlSAXHandler;
0048 typedef xmlSAXHandlerPtr htmlSAXHandlerPtr; /**< Same as xmlSAXHandlerPtr */
0049 /** Same as xmlParserInput */
0050 typedef xmlParserInput htmlParserInput;
0051 typedef xmlParserInputPtr htmlParserInputPtr; /**< Same as xmlParserInputPtr */
0052 typedef xmlDocPtr htmlDocPtr; /**< Same as xmlDocPtr */
0053 typedef xmlNodePtr htmlNodePtr; /**< Same as xmlNodePtr */
0054 
0055 /** @cond ignore */
0056 
0057 /*
0058  * Internal description of an HTML element, representing HTML 4.01
0059  * and XHTML 1.0 (which share the same structure).
0060  */
0061 typedef struct _htmlElemDesc htmlElemDesc;
0062 typedef htmlElemDesc *htmlElemDescPtr;
0063 struct _htmlElemDesc {
0064     const char *name;   /* The tag name (read by the htmlElementAllowedHereDesc macro) */
0065     char startTag;      /* unused */
0066     char endTag;        /* Whether the end tag can be implied */
0067     char saveEndTag;    /* unused */
0068     char empty;         /* Whether this is an empty element */
0069     char depr;          /* unused */
0070     char dtd;           /* unused */
0071     char isinline;      /* 0 for a block element, 1 for an inline element */
0072     const char *desc;   /* The description */
0073     /* Deprecated members; htmlDefaultSubelement and htmlRequiredAttrs read two of them. */
0074     const char** subelts XML_DEPRECATED_MEMBER;
0075     const char* defaultsubelt XML_DEPRECATED_MEMBER; /* read by htmlDefaultSubelement() */
0076     const char** attrs_opt XML_DEPRECATED_MEMBER;
0077     const char** attrs_depr XML_DEPRECATED_MEMBER;
0078     const char** attrs_req XML_DEPRECATED_MEMBER; /* read by htmlRequiredAttrs() */
0079 
0080     int dataMode;       /* NOTE(review): undocumented in this header; confirm meaning in the parser source */
0081 };
0082 
0083 /*
0084  * Internal description of an HTML entity: its name, Unicode value and description.
0085  */
0086 typedef struct _htmlEntityDesc htmlEntityDesc;
0087 typedef htmlEntityDesc *htmlEntityDescPtr;
0088 struct _htmlEntityDesc {
0089     unsigned int value; /* The Unicode value for the character */
0090     const char *name;   /* The entity name */
0091     const char *desc;   /* The description */
0092 };
0093 
0094 #ifdef LIBXML_SAX1_ENABLED
0095 /**
0096  * @deprecated Use #xmlSAX2InitHtmlDefaultSAXHandler instead of this SAX1-only global.
0097  */
0098 XML_DEPRECATED
0099 XMLPUBVAR const xmlSAXHandlerV1 htmlDefaultSAXHandler;
0100 #endif /* LIBXML_SAX1_ENABLED */
0101 
0102 /** @endcond */
0103 
0104 /*
0105  * There are only a few public functions; several are marked XML_DEPRECATED.
0106  */
0107 XML_DEPRECATED
0108 XMLPUBFUN void
0109             htmlInitAutoClose   (void);
0110 XML_DEPRECATED
0111 XMLPUBFUN const htmlElemDesc *
0112             htmlTagLookup   (const xmlChar *tag);
0113 XML_DEPRECATED
0114 XMLPUBFUN const htmlEntityDesc *
0115             htmlEntityLookup(const xmlChar *name);
0116 XML_DEPRECATED
0117 XMLPUBFUN const htmlEntityDesc *
0118             htmlEntityValueLookup(unsigned int value);
0119 
0120 XML_DEPRECATED
0121 XMLPUBFUN int
0122             htmlIsAutoClosed(xmlDoc *doc,
0123                      xmlNode *elem);
0124 XML_DEPRECATED
0125 XMLPUBFUN int
0126             htmlAutoCloseTag(xmlDoc *doc,
0127                      const xmlChar *name,
0128                      xmlNode *elem);
0129 XML_DEPRECATED
0130 XMLPUBFUN const htmlEntityDesc *
0131             htmlParseEntityRef(htmlParserCtxt *ctxt,
0132                      const xmlChar **str);
0133 XML_DEPRECATED
0134 XMLPUBFUN int
0135             htmlParseCharRef(htmlParserCtxt *ctxt);
0136 XML_DEPRECATED
0137 XMLPUBFUN void
0138             htmlParseElement(htmlParserCtxt *ctxt);
0139 
0140 XMLPUBFUN htmlParserCtxt *
0141             htmlNewParserCtxt(void);
0142 XMLPUBFUN htmlParserCtxt *
0143             htmlNewSAXParserCtxt(const htmlSAXHandler *sax,
0144                          void *userData);
0145 
0146 XMLPUBFUN htmlParserCtxt *
0147             htmlCreateMemoryParserCtxt(const char *buffer,
0148                            int size);
0149 
0150 XMLPUBFUN int
0151             htmlParseDocument(htmlParserCtxt *ctxt);
0152 XML_DEPRECATED
0153 XMLPUBFUN xmlDoc *
0154             htmlSAXParseDoc (const xmlChar *cur,
0155                      const char *encoding,
0156                      htmlSAXHandler *sax,
0157                      void *userData);
0158 XMLPUBFUN xmlDoc *
0159             htmlParseDoc    (const xmlChar *cur,
0160                      const char *encoding);
0161 XMLPUBFUN htmlParserCtxt *
0162             htmlCreateFileParserCtxt(const char *filename,
0163                                              const char *encoding);
0164 XML_DEPRECATED
0165 XMLPUBFUN xmlDoc *
0166             htmlSAXParseFile(const char *filename,
0167                      const char *encoding,
0168                      htmlSAXHandler *sax,
0169                      void *userData);
0170 XMLPUBFUN xmlDoc *
0171             htmlParseFile   (const char *filename,
0172                      const char *encoding);
0173 XML_DEPRECATED
0174 XMLPUBFUN int
0175             htmlUTF8ToHtml  (unsigned char *out, /* the legacy UTF8ToHtml macro expands to this name */
0176                      int *outlen,
0177                      const unsigned char *in,
0178                      int *inlen);
0179 XML_DEPRECATED
0180 XMLPUBFUN int
0181             htmlEncodeEntities(unsigned char *out,
0182                      int *outlen,
0183                      const unsigned char *in,
0184                      int *inlen, int quoteChar);
0185 XML_DEPRECATED
0186 XMLPUBFUN int
0187             htmlIsScriptAttribute(const xmlChar *name);
0188 XML_DEPRECATED
0189 XMLPUBFUN int
0190             htmlHandleOmittedElem(int val);
0191 
0192 #ifdef LIBXML_PUSH_ENABLED
0193 /*
0194  * Interfaces for the push mode; only declared when LIBXML_PUSH_ENABLED is defined.
0195  */
0196 XMLPUBFUN htmlParserCtxt *
0197             htmlCreatePushParserCtxt(htmlSAXHandler *sax,
0198                          void *user_data,
0199                          const char *chunk,
0200                          int size,
0201                          const char *filename,
0202                          xmlCharEncoding enc);
0203 XMLPUBFUN int
0204             htmlParseChunk      (htmlParserCtxt *ctxt,
0205                          const char *chunk,
0206                          int size,
0207                          int terminate);
0208 #endif /* LIBXML_PUSH_ENABLED */
0209 
0210 XMLPUBFUN void
0211             htmlFreeParserCtxt  (htmlParserCtxt *ctxt); /* NOTE(review): presumably releases a context from the htmlNew... and htmlCreate... functions; confirm */
0212 
0213 /*
0214  * New set of simpler/more flexible APIs
0215  */
0216 
0217 /**
0218  * Set of HTML parser options (each a distinct single-bit value) that can be
0219  * passed to #htmlReadDoc, #htmlCtxtSetOptions and other functions.
0220  */
0221 typedef enum {
0222     /**
0223      * No effect as of 2.14.0.
0224      */
0225     HTML_PARSE_RECOVER = 1<<0,
0226     /**
0227      * Do not default to a doctype if none was found.
0228      */
0229     HTML_PARSE_NODEFDTD = 1<<2,
0230     /**
0231      * Disable error and warning reports to the error handlers.
0232      * Errors are still accessible with xmlCtxtGetLastError().
0233      */
0234     HTML_PARSE_NOERROR = 1<<5,
0235     /**
0236      * Disable warning reports.
0237      */
0238     HTML_PARSE_NOWARNING = 1<<6,
0239     /**
0240      * No effect.
0241      */
0242     HTML_PARSE_PEDANTIC = 1<<7,
0243     /**
0244      * Remove some text nodes containing only whitespace from the
0245      * result document. Which nodes are removed depends on a conservative
0246      * heuristic. The reindenting feature of the serialization code relies
0247      * on this option being set when parsing. Use of this option is
0248      * DISCOURAGED.
0249      */
0250     HTML_PARSE_NOBLANKS = 1<<8,
0251     /**
0252      * No effect.
0253      */
0254     HTML_PARSE_NONET = 1<<11,
0255     /**
0256      * Do not add implied html, head or body elements.
0257      */
0258     HTML_PARSE_NOIMPLIED = 1<<13,
0259     /**
0260      * Store small strings directly in the node struct to save
0261      * memory.
0262      */
0263     HTML_PARSE_COMPACT = 1<<16,
0264     /**
0265      * Relax some internal limits. See XML_PARSE_HUGE in xmlParserOption.
0266      *
0267      * @since 2.14.0
0268      *
0269      * Use XML_PARSE_HUGE with older versions.
0270      */
0271     HTML_PARSE_HUGE = 1<<19,
0272     /**
0273      * Ignore the encoding in the HTML declaration. This option is
0274      * mostly unneeded these days. The only effect is to enforce
0275      * ISO-8859-1 decoding of ASCII-like data.
0276      */
0277     HTML_PARSE_IGNORE_ENC =1<<21,
0278     /**
0279      * Enable reporting of line numbers larger than 65535.
0280      *
0281      * @since 2.14.0
0282      *
0283      * Use XML_PARSE_BIG_LINES with older versions.
0284      */
0285     HTML_PARSE_BIG_LINES = 1<<22,
0286     /**
0287      * Make the tokenizer emit a SAX callback for each token. This results
0288      * in unbalanced invocations of startElement and endElement.
0289      *
0290      * For now, this is only usable to tokenize HTML5 with custom SAX
0291      * callbacks. A tree builder isn't implemented yet.
0292      *
0293      * @since 2.14.0
0294      */
0295     HTML_PARSE_HTML5 = 1<<26
0296 } htmlParserOption;
0297 
0298 XMLPUBFUN void
0299         htmlCtxtReset       (htmlParserCtxt *ctxt);
0300 XMLPUBFUN int
0301         htmlCtxtSetOptions  (htmlParserCtxt *ctxt,
0302                      int options);
0303 XMLPUBFUN int
0304         htmlCtxtUseOptions  (htmlParserCtxt *ctxt,
0305                      int options);
0306 XMLPUBFUN xmlDoc *
0307         htmlReadDoc     (const xmlChar *cur,
0308                      const char *URL,
0309                      const char *encoding,
0310                      int options);
0311 XMLPUBFUN xmlDoc *
0312         htmlReadFile        (const char *URL,
0313                      const char *encoding,
0314                      int options);
0315 XMLPUBFUN xmlDoc *
0316         htmlReadMemory      (const char *buffer,
0317                      int size,
0318                      const char *URL,
0319                      const char *encoding,
0320                      int options);
0321 XMLPUBFUN xmlDoc *
0322         htmlReadFd      (int fd,
0323                      const char *URL,
0324                      const char *encoding,
0325                      int options);
0326 XMLPUBFUN xmlDoc *
0327         htmlReadIO      (xmlInputReadCallback ioread,
0328                      xmlInputCloseCallback ioclose,
0329                      void *ioctx,
0330                      const char *URL,
0331                      const char *encoding,
0332                      int options);
0333 XMLPUBFUN xmlDoc *
0334         htmlCtxtParseDocument   (htmlParserCtxt *ctxt,
0335                      xmlParserInput *input);
0336 XMLPUBFUN xmlDoc *
0337         htmlCtxtReadDoc     (xmlParserCtxt *ctxt, /* same type as htmlParserCtxt (typedef alias declared earlier in this header); likewise for the htmlCtxtRead functions below */
0338                      const xmlChar *cur,
0339                      const char *URL,
0340                      const char *encoding,
0341                      int options);
0342 XMLPUBFUN xmlDoc *
0343         htmlCtxtReadFile        (xmlParserCtxt *ctxt,
0344                      const char *filename,
0345                      const char *encoding,
0346                      int options);
0347 XMLPUBFUN xmlDoc *
0348         htmlCtxtReadMemory      (xmlParserCtxt *ctxt,
0349                      const char *buffer,
0350                      int size,
0351                      const char *URL,
0352                      const char *encoding,
0353                      int options);
0354 XMLPUBFUN xmlDoc *
0355         htmlCtxtReadFd      (xmlParserCtxt *ctxt,
0356                      int fd,
0357                      const char *URL,
0358                      const char *encoding,
0359                      int options);
0360 XMLPUBFUN xmlDoc *
0361         htmlCtxtReadIO      (xmlParserCtxt *ctxt,
0362                      xmlInputReadCallback ioread,
0363                      xmlInputCloseCallback ioclose,
0364                      void *ioctx,
0365                      const char *URL,
0366                      const char *encoding,
0367                      int options);
0368 
0369 /**
0370  * Status values of the deprecated content model (return type of the deprecated status functions).
0371  */
0372 typedef enum {
0373   HTML_NA = 0 ,     /* something we don't check at all */
0374   HTML_INVALID = 0x1 ,
0375   HTML_DEPRECATED = 0x2 ,
0376   HTML_VALID = 0x4 ,
0377   HTML_REQUIRED = 0xc /* 0x8 | HTML_VALID: VALID bit set so ( & HTML_VALID ) is TRUE */
0378 } htmlStatus ;
0379 
0380 /* These functions take htmlElemDesc pointers rather than element names,
0381    to emphasise the fact that otherwise there's a lookup overhead.
0382 */
0383 XML_DEPRECATED
0384 XMLPUBFUN htmlStatus htmlAttrAllowed(const htmlElemDesc*, const xmlChar*, int) ;
0385 XML_DEPRECATED
0386 XMLPUBFUN int htmlElementAllowedHere(const htmlElemDesc*, const xmlChar*) ; /* called as (parent, elt->name) by the htmlElementAllowedHereDesc macro */
0387 XML_DEPRECATED
0388 XMLPUBFUN htmlStatus htmlElementStatusHere(const htmlElemDesc*, const htmlElemDesc*) ;
0389 XML_DEPRECATED
0390 XMLPUBFUN htmlStatus htmlNodeStatus(xmlNode *, int) ;
0391 
0392 #ifdef __cplusplus
0393 }
0394 #endif
0395 
0396 #endif /* LIBXML_HTML_ENABLED */
0397 #endif /* __HTML_PARSER_H__ */