Back to home page

EIC code displayed by LXR

 
 

    


File indexing completed on 2025-02-22 10:41:49

0001 /*
0002  * Summary: interface for an HTML 4.0 non-verifying parser
0003  * Description: this module implements an HTML 4.0 non-verifying parser
0004  *              with API compatible with the XML parser ones. It should
0005  *              be able to parse "real world" HTML, even if severely
0006  *              broken from a specification point of view.
0007  *
0008  * Copy: See Copyright for the status of this software.
0009  *
0010  * Author: Daniel Veillard
0011  */
0012 
0013 #ifndef __HTML_PARSER_H__
0014 #define __HTML_PARSER_H__
0015 #include <libxml/xmlversion.h>
0016 #include <libxml/parser.h>
0017 
0018 #ifdef LIBXML_HTML_ENABLED
0019 
0020 #ifdef __cplusplus
0021 extern "C" {
0022 #endif
0023 
0024 /*
0025  * Most of the back-end structures from XML and HTML are shared.
0026  */
0027 typedef xmlParserCtxt htmlParserCtxt;
0028 typedef xmlParserCtxtPtr htmlParserCtxtPtr;
0029 typedef xmlParserNodeInfo htmlParserNodeInfo;
0030 typedef xmlSAXHandler htmlSAXHandler;
0031 typedef xmlSAXHandlerPtr htmlSAXHandlerPtr;
0032 typedef xmlParserInput htmlParserInput;
0033 typedef xmlParserInputPtr htmlParserInputPtr;
0034 typedef xmlDocPtr htmlDocPtr;
0035 typedef xmlNodePtr htmlNodePtr;
0036 
/*
 * Internal description of an HTML element.  The same layout is used for
 * HTML 4.01 and XHTML 1.0, which share the same structure.
 */
typedef struct _htmlElemDesc htmlElemDesc;
typedef htmlElemDesc *htmlElemDescPtr;

struct _htmlElemDesc {
    /* The tag name */
    const char *name;
    /* Whether the start tag can be implied */
    char startTag;
    /* Whether the end tag can be implied */
    char endTag;
    /* Whether the end tag should be saved */
    char saveEndTag;
    /* Is this an empty element ? */
    char empty;
    /* Is this a deprecated element ? */
    char depr;
    /* 1: only in Loose DTD, 2: only Frameset one */
    char dtd;
    /* is this a block 0 or inline 1 element */
    char isinline;
    /* the description */
    const char *desc;

    /*
     * NRK Jan.2003
     * New fields encapsulating HTML structure
     *
     * Bugs:
     *  This is a very limited representation.  It fails to tell us when
     *  an element *requires* subelements (we only have whether they're
     *  allowed or not), and it doesn't tell us where CDATA and PCDATA
     *  are allowed.  Some element relationships are not fully represented:
     *  these are flagged with the word MODIFIER
     */

    /* allowed sub-elements of this element */
    const char **subelts;
    /* subelement for suggested auto-repair if necessary or NULL */
    const char *defaultsubelt;
    /* Optional Attributes */
    const char **attrs_opt;
    /* Additional deprecated attributes */
    const char **attrs_depr;
    /* Required attributes */
    const char **attrs_req;
};
0071 
/*
 * Internal description of an HTML entity: its character value together
 * with its name and a description string.
 */
typedef struct _htmlEntityDesc htmlEntityDesc;
typedef htmlEntityDesc *htmlEntityDescPtr;

struct _htmlEntityDesc {
    /* the UNICODE value for the character */
    unsigned int value;
    /* The entity name */
    const char *name;
    /* the description */
    const char *desc;
};
0082 
#ifdef LIBXML_SAX1_ENABLED

/*
 * Default SAX v1 handler for HTML.  Only declared when SAX1 support is
 * enabled; both the variable and its accessor are marked XML_DEPRECATED.
 */
XML_DEPRECATED
XMLPUBVAR const xmlSAXHandlerV1 htmlDefaultSAXHandler;

#ifdef LIBXML_THREAD_ENABLED
/* Accessor function, declared only when thread support is enabled. */
XML_DEPRECATED
XMLPUBFUN const xmlSAXHandlerV1 *__htmlDefaultSAXHandler(void);
#endif /* LIBXML_THREAD_ENABLED */

#endif /* LIBXML_SAX1_ENABLED */
0094 
0095 /*
0096  * There is only few public functions.
0097  */
0098 XML_DEPRECATED
0099 XMLPUBFUN void
0100             htmlInitAutoClose   (void);
0101 XMLPUBFUN const htmlElemDesc *
0102             htmlTagLookup   (const xmlChar *tag);
0103 XMLPUBFUN const htmlEntityDesc *
0104             htmlEntityLookup(const xmlChar *name);
0105 XMLPUBFUN const htmlEntityDesc *
0106             htmlEntityValueLookup(unsigned int value);
0107 
0108 XMLPUBFUN int
0109             htmlIsAutoClosed(htmlDocPtr doc,
0110                      htmlNodePtr elem);
0111 XMLPUBFUN int
0112             htmlAutoCloseTag(htmlDocPtr doc,
0113                      const xmlChar *name,
0114                      htmlNodePtr elem);
0115 XML_DEPRECATED
0116 XMLPUBFUN const htmlEntityDesc *
0117             htmlParseEntityRef(htmlParserCtxtPtr ctxt,
0118                      const xmlChar **str);
0119 XML_DEPRECATED
0120 XMLPUBFUN int
0121             htmlParseCharRef(htmlParserCtxtPtr ctxt);
0122 XML_DEPRECATED
0123 XMLPUBFUN void
0124             htmlParseElement(htmlParserCtxtPtr ctxt);
0125 
0126 XMLPUBFUN htmlParserCtxtPtr
0127             htmlNewParserCtxt(void);
0128 XMLPUBFUN htmlParserCtxtPtr
0129             htmlNewSAXParserCtxt(const htmlSAXHandler *sax,
0130                          void *userData);
0131 
0132 XMLPUBFUN htmlParserCtxtPtr
0133             htmlCreateMemoryParserCtxt(const char *buffer,
0134                            int size);
0135 
0136 XMLPUBFUN int
0137             htmlParseDocument(htmlParserCtxtPtr ctxt);
0138 XML_DEPRECATED
0139 XMLPUBFUN htmlDocPtr
0140             htmlSAXParseDoc (const xmlChar *cur,
0141                      const char *encoding,
0142                      htmlSAXHandlerPtr sax,
0143                      void *userData);
0144 XMLPUBFUN htmlDocPtr
0145             htmlParseDoc    (const xmlChar *cur,
0146                      const char *encoding);
0147 XMLPUBFUN htmlParserCtxtPtr
0148             htmlCreateFileParserCtxt(const char *filename,
0149                                              const char *encoding);
0150 XML_DEPRECATED
0151 XMLPUBFUN htmlDocPtr
0152             htmlSAXParseFile(const char *filename,
0153                      const char *encoding,
0154                      htmlSAXHandlerPtr sax,
0155                      void *userData);
0156 XMLPUBFUN htmlDocPtr
0157             htmlParseFile   (const char *filename,
0158                      const char *encoding);
0159 XMLPUBFUN int
0160             UTF8ToHtml  (unsigned char *out,
0161                      int *outlen,
0162                      const unsigned char *in,
0163                      int *inlen);
0164 XMLPUBFUN int
0165             htmlEncodeEntities(unsigned char *out,
0166                      int *outlen,
0167                      const unsigned char *in,
0168                      int *inlen, int quoteChar);
0169 XMLPUBFUN int
0170             htmlIsScriptAttribute(const xmlChar *name);
0171 XML_DEPRECATED
0172 XMLPUBFUN int
0173             htmlHandleOmittedElem(int val);
0174 
#ifdef LIBXML_PUSH_ENABLED
/**
 * Interfaces for the Push mode.
 *
 * Only declared when LIBXML_PUSH_ENABLED is defined.
 */
XMLPUBFUN htmlParserCtxtPtr htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,
                                                     void *user_data,
                                                     const char *chunk,
                                                     int size,
                                                     const char *filename,
                                                     xmlCharEncoding enc);
XMLPUBFUN int htmlParseChunk(htmlParserCtxtPtr ctxt,
                             const char *chunk,
                             int size,
                             int terminate);
#endif /* LIBXML_PUSH_ENABLED */
0192 
0193 XMLPUBFUN void
0194             htmlFreeParserCtxt  (htmlParserCtxtPtr ctxt);
0195 
0196 /*
0197  * New set of simpler/more flexible APIs
0198  */
/**
 * htmlParserOption:
 *
 * This is the set of HTML parser options that can be passed down
 * to the htmlReadDoc() and similar calls.
 *
 * Each option is a distinct bit, so several can be combined in one
 * `options` value.  The bit positions are not contiguous.
 * NOTE(review): the gaps presumably keep these bits aligned with the
 * matching xmlParserOption values - confirm against parser.h.
 */
typedef enum {
    HTML_PARSE_RECOVER    = 1 << 0,  /* Relaxed parsing */
    HTML_PARSE_NODEFDTD   = 1 << 2,  /* do not default a doctype if not found */
    HTML_PARSE_NOERROR    = 1 << 5,  /* suppress error reports */
    HTML_PARSE_NOWARNING  = 1 << 6,  /* suppress warning reports */
    HTML_PARSE_PEDANTIC   = 1 << 7,  /* pedantic error reporting */
    HTML_PARSE_NOBLANKS   = 1 << 8,  /* remove blank nodes */
    HTML_PARSE_NONET      = 1 << 11, /* Forbid network access */
    HTML_PARSE_NOIMPLIED  = 1 << 13, /* Do not add implied html/body... elements */
    HTML_PARSE_COMPACT    = 1 << 16, /* compact small text nodes */
    HTML_PARSE_IGNORE_ENC = 1 << 21  /* ignore internal document encoding hint */
} htmlParserOption;
0217 
0218 XMLPUBFUN void
0219         htmlCtxtReset       (htmlParserCtxtPtr ctxt);
0220 XMLPUBFUN int
0221         htmlCtxtUseOptions  (htmlParserCtxtPtr ctxt,
0222                      int options);
0223 XMLPUBFUN htmlDocPtr
0224         htmlReadDoc     (const xmlChar *cur,
0225                      const char *URL,
0226                      const char *encoding,
0227                      int options);
0228 XMLPUBFUN htmlDocPtr
0229         htmlReadFile        (const char *URL,
0230                      const char *encoding,
0231                      int options);
0232 XMLPUBFUN htmlDocPtr
0233         htmlReadMemory      (const char *buffer,
0234                      int size,
0235                      const char *URL,
0236                      const char *encoding,
0237                      int options);
0238 XMLPUBFUN htmlDocPtr
0239         htmlReadFd      (int fd,
0240                      const char *URL,
0241                      const char *encoding,
0242                      int options);
0243 XMLPUBFUN htmlDocPtr
0244         htmlReadIO      (xmlInputReadCallback ioread,
0245                      xmlInputCloseCallback ioclose,
0246                      void *ioctx,
0247                      const char *URL,
0248                      const char *encoding,
0249                      int options);
0250 XMLPUBFUN htmlDocPtr
0251         htmlCtxtParseDocument   (htmlParserCtxtPtr ctxt,
0252                      xmlParserInputPtr input);
0253 XMLPUBFUN htmlDocPtr
0254         htmlCtxtReadDoc     (xmlParserCtxtPtr ctxt,
0255                      const xmlChar *cur,
0256                      const char *URL,
0257                      const char *encoding,
0258                      int options);
0259 XMLPUBFUN htmlDocPtr
0260         htmlCtxtReadFile        (xmlParserCtxtPtr ctxt,
0261                      const char *filename,
0262                      const char *encoding,
0263                      int options);
0264 XMLPUBFUN htmlDocPtr
0265         htmlCtxtReadMemory      (xmlParserCtxtPtr ctxt,
0266                      const char *buffer,
0267                      int size,
0268                      const char *URL,
0269                      const char *encoding,
0270                      int options);
0271 XMLPUBFUN htmlDocPtr
0272         htmlCtxtReadFd      (xmlParserCtxtPtr ctxt,
0273                      int fd,
0274                      const char *URL,
0275                      const char *encoding,
0276                      int options);
0277 XMLPUBFUN htmlDocPtr
0278         htmlCtxtReadIO      (xmlParserCtxtPtr ctxt,
0279                      xmlInputReadCallback ioread,
0280                      xmlInputCloseCallback ioclose,
0281                      void *ioctx,
0282                      const char *URL,
0283                      const char *encoding,
0284                      int options);
0285 
/*
 * NRK/Jan2003: further knowledge of HTML structure
 *
 * Status values are bit patterns.  HTML_REQUIRED (0xc) is 0x8 | 0x4, so
 * it includes the HTML_VALID bit and (status & HTML_VALID) is true for
 * required items as well.
 */
typedef enum {
    HTML_NA         = 0,    /* something we don't check at all */
    HTML_INVALID    = 0x1,
    HTML_DEPRECATED = 0x2,
    HTML_VALID      = 0x4,
    HTML_REQUIRED   = 0xc   /* VALID bit set so ( & HTML_VALID ) is TRUE */
} htmlStatus;
0295 
0296 /* Using htmlElemDesc rather than name here, to emphasise the fact
0297    that otherwise there's a lookup overhead
0298 */
0299 XMLPUBFUN htmlStatus htmlAttrAllowed(const htmlElemDesc*, const xmlChar*, int) ;
0300 XMLPUBFUN int htmlElementAllowedHere(const htmlElemDesc*, const xmlChar*) ;
0301 XMLPUBFUN htmlStatus htmlElementStatusHere(const htmlElemDesc*, const htmlElemDesc*) ;
0302 XMLPUBFUN htmlStatus htmlNodeStatus(htmlNodePtr, int) ;
/**
 * htmlDefaultSubelement:
 * @elt: HTML element
 *
 * Returns the default subelement for this element
 *
 * The argument and the whole expansion are parenthesized so that any
 * pointer expression (for example `&desc`, `p + 1` or `*pp`) can be
 * passed and the result can be used inside a larger expression.
 */
#define htmlDefaultSubelement(elt) ((elt)->defaultsubelt)
/**
 * htmlElementAllowedHereDesc:
 * @parent: HTML parent element
 * @elt: HTML element
 *
 * Checks whether an HTML element description may be a
 * direct child of the specified element.  Expands to a call to
 * htmlElementAllowedHere() with @parent and the name of @elt.
 *
 * Returns 1 if allowed; 0 otherwise.
 */
#define htmlElementAllowedHereDesc(parent, elt) \
    htmlElementAllowedHere((parent), (elt)->name)
/**
 * htmlRequiredAttrs:
 * @elt: HTML element
 *
 * Returns the attributes required for the specified element
 * (the attrs_req member of @elt).
 */
#define htmlRequiredAttrs(elt) ((elt)->attrs_req)
0329 
0330 
0331 #ifdef __cplusplus
0332 }
0333 #endif
0334 
0335 #endif /* LIBXML_HTML_ENABLED */
0336 #endif /* __HTML_PARSER_H__ */