1# Use of this source code is governed by the MIT license.
2__license__ = "MIT"
3
4from collections import defaultdict
5import itertools
6import re
7import warnings
8import sys
9from bs4.element import (
10    CharsetMetaAttributeValue,
11    ContentMetaAttributeValue,
12    RubyParenthesisString,
13    RubyTextString,
14    Stylesheet,
15    Script,
16    TemplateString,
17    nonwhitespace_re
18)
19
# Public names exported by this module. register_treebuilders_from()
# appends the name of every TreeBuilder subclass it copies into this
# module, so this list grows at import time.
__all__ = [
    'HTMLTreeBuilder',
    'SAXTreeBuilder',
    'TreeBuilder',
    'TreeBuilderRegistry',
    ]
26
# Feature names a TreeBuilder can advertise in its `features` list and
# that can be passed to TreeBuilderRegistry.lookup().

# Qualities of the underlying parser.
FAST = "fast"
PERMISSIVE = "permissive"
STRICT = "strict"

# Kinds of markup.
XML = "xml"
HTML = "html"
HTML_5 = "html5"
34
class XMLParsedAsHTMLWarning(UserWarning):
    """Warning category used when an HTML parser is handed XML that is
    not XHTML.
    """

    # The text shown to the user. It is assembled from adjacent string
    # literals purely to keep the source lines short.
    MESSAGE = (
        "It looks like you're parsing an XML document using an HTML parser. "
        "If this really is an HTML document (maybe it's XHTML?), you can "
        "ignore or filter this warning. If it's XML, you should know that "
        "using an XML parser will be more reliable. To parse this document "
        "as XML, make sure you have the lxml package installed, and pass the "
        'keyword argument `features="xml"` into the BeautifulSoup constructor.'
    )
40
41
class TreeBuilderRegistry(object):
    """Keeps track of TreeBuilder subclasses so they can be found again
    by the features they advertise.
    """

    def __init__(self):
        # feature name -> builders advertising it, newest first.
        self.builders_for_feature = defaultdict(list)
        # Every registered builder, newest first.
        self.builders = []

    def register(self, treebuilder_class):
        """Add a treebuilder to the registry under each of its features.

        The class is placed at the front of every list it joins, so
        later registrations take precedence over earlier ones.

        :param treebuilder_class: A subclass of TreeBuilder. Its
           .features attribute should list its features.
        """
        for feature_name in treebuilder_class.features:
            bucket = self.builders_for_feature[feature_name]
            bucket.insert(0, treebuilder_class)
        self.builders.insert(0, treebuilder_class)

    def lookup(self, *features):
        """Find the best registered TreeBuilder for the given features.

        Features that no registered builder advertises are skipped
        rather than treated as a mismatch.

        :param features: The features to look for, most important
            first. With no features, the most recently registered
            TreeBuilder subclass is returned.
        :return: A TreeBuilder subclass, or None if nothing is
            registered, none of the features is known, or no builder
            has all of the known features.
        """
        if not self.builders:
            # Nothing has been registered yet.
            return None

        if not features:
            # No preference stated: newest registration wins.
            return self.builders[0]

        # `ranked` is the builder list for the first recognized feature;
        # it fixes the priority order. `survivors` is narrowed by every
        # later recognized feature.
        ranked = None
        survivors = None
        for feature in features:
            supporting = self.builders_for_feature.get(feature, [])
            if not supporting:
                continue
            if ranked is None:
                ranked = supporting
                survivors = set(supporting)
            else:
                survivors = survivors.intersection(supporting)

        if survivors is None:
            # None of the requested features is known to the registry.
            return None

        # Highest-priority builder that made it through every filter.
        return next(
            (builder for builder in ranked if builder in survivors), None)
106
# The module-level registry of tree builders.
#
# The BeautifulSoup class will take feature lists from developers and use them
# to look up builders in this registry. register_treebuilders_from() adds
# builders to it.
builder_registry = TreeBuilderRegistry()
110
class TreeBuilder(object):
    """Turn a textual document into a Beautiful Soup object tree.

    This base class holds the configuration shared by all tree
    builders. The actual parsing happens in `feed`, which subclasses
    must implement.
    """

    NAME = "[Unknown tree builder]"
    ALTERNATE_NAMES = []
    features = []

    is_xml = False
    picklable = False
    empty_element_tags = None # A tag will be considered an empty-element
                              # tag when and only when it has no contents.

    # A value for these tag/attribute combinations is a space- or
    # comma-separated list of CDATA, rather than a single CDATA.
    DEFAULT_CDATA_LIST_ATTRIBUTES = defaultdict(list)

    # Whitespace should be preserved inside these tags.
    DEFAULT_PRESERVE_WHITESPACE_TAGS = set()

    # The textual contents of tags with these names should be
    # instantiated with some class other than NavigableString.
    DEFAULT_STRING_CONTAINERS = {}

    # Sentinel meaning "the caller did not pass this argument". It is
    # always compared by identity, because None is itself a meaningful
    # value for some constructor arguments.
    USE_DEFAULT = object()

    # Most parsers don't keep track of line numbers.
    TRACKS_LINE_NUMBERS = False

    def __init__(self, multi_valued_attributes=USE_DEFAULT,
                 preserve_whitespace_tags=USE_DEFAULT,
                 store_line_numbers=USE_DEFAULT,
                 string_containers=USE_DEFAULT,
    ):
        """Constructor.

        :param multi_valued_attributes: If this is set to None, the
         TreeBuilder will not turn any values for attributes like
         'class' into lists. Setting this to a dictionary will
         customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
         for an example.

         Internally, these are called "CDATA list attributes", but that
         probably doesn't make sense to an end-user, so the argument name
         is `multi_valued_attributes`.

        :param preserve_whitespace_tags: A list of tags to treat
         the way <pre> tags are treated in HTML. Tags in this list
         are immune from pretty-printing; their contents will always be
         output as-is.

        :param store_line_numbers: If the parser keeps track of the
         line numbers and positions of the original markup, that
         information will, by default, be stored in each corresponding
         `Tag` object. You can turn this off by passing
         store_line_numbers=False. If the parser you're using doesn't
         keep track of this information, then setting store_line_numbers=True
         will do nothing.

        :param string_containers: A dictionary mapping tag names to
         the classes that should be instantiated to contain the textual
         contents of those tags. The default is to use NavigableString
         for every tag, no matter what the name. You can override the
         default by changing DEFAULT_STRING_CONTAINERS.
        """
        self.soup = None

        # Every check below uses `is`, not `==`. An equality test would
        # call the __eq__ of whatever the caller passed in, and a value
        # with a permissive __eq__ would be silently replaced by the
        # default.
        if multi_valued_attributes is self.USE_DEFAULT:
            multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
        self.cdata_list_attributes = multi_valued_attributes

        if preserve_whitespace_tags is self.USE_DEFAULT:
            preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
        self.preserve_whitespace_tags = preserve_whitespace_tags

        if store_line_numbers is self.USE_DEFAULT:
            store_line_numbers = self.TRACKS_LINE_NUMBERS
        self.store_line_numbers = store_line_numbers

        if string_containers is self.USE_DEFAULT:
            string_containers = self.DEFAULT_STRING_CONTAINERS
        self.string_containers = string_containers

    def initialize_soup(self, soup):
        """The BeautifulSoup object has been initialized and is now
        being associated with the TreeBuilder.

        :param soup: A BeautifulSoup object.
        """
        self.soup = soup

    def reset(self):
        """Do any work necessary to reset the underlying parser
        for a new document.

        By default, this does nothing.
        """
        pass

    def can_be_empty_element(self, tag_name):
        """Might a tag with this name be an empty-element tag?

        The final markup may or may not actually present this tag as
        self-closing.

        For instance: an HTMLBuilder does not consider a <p> tag to be
        an empty-element tag (it's not in
        HTMLBuilder.empty_element_tags). This means an empty <p> tag
        will be presented as "<p></p>", not "<p/>" or "<p>".

        The default implementation has no opinion about which tags are
        empty-element tags, so a tag will be presented as an
        empty-element tag if and only if it has no children.
        "<foo></foo>" will become "<foo/>", and "<foo>bar</foo>" will
        be left alone.

        :param tag_name: The name of a markup tag.
        :return: True if `empty_element_tags` is None or contains
            `tag_name`; False otherwise.
        """
        if self.empty_element_tags is None:
            return True
        return tag_name in self.empty_element_tags

    def feed(self, markup):
        """Run some incoming markup through some parsing process,
        populating the `BeautifulSoup` object in self.soup.

        This method is not implemented in TreeBuilder; it must be
        implemented in subclasses.

        :return: None.
        :raise NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError()

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None, exclude_encodings=None):
        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        :param markup: Some markup -- probably a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding. NOTE: This argument is not used by the
            calling code and can probably be removed.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples:
         (markup, encoding, declared encoding,
          has undergone character replacement)

         Each 4-tuple represents a strategy for converting the
         document to Unicode and parsing it. Each strategy will be tried
         in turn.

         By default, the only strategy is to parse the markup
         as-is. See `LXMLTreeBuilderForXML` and
         `HTMLParserTreeBuilder` for implementations that take into
         account the quirks of particular parsers.
        """
        yield markup, None, None, False

    def test_fragment_to_document(self, fragment):
        """Wrap an HTML fragment to make it look like a document.

        Different parsers do this differently. For instance, lxml
        introduces an empty <head> tag, and html5lib
        doesn't. Abstracting this away lets us write simple tests
        which run HTML fragments through the parser and compare the
        results against other HTML fragments.

        This method should not be used outside of tests.

        :param fragment: A string -- fragment of HTML.
        :return: A string -- a full HTML document.
        """
        return fragment

    def set_up_substitutions(self, tag):
        """Set up any substitutions that will need to be performed on
        a `Tag` when it's output as a string.

        By default, this does nothing. See `HTMLTreeBuilder` for a
        case where this is used.

        :param tag: A `Tag`
        :return: Whether or not a substitution was performed.
        """
        return False

    def _replace_cdata_list_attribute_values(self, tag_name, attrs):
        """When an attribute value is associated with a tag that can
        have multiple values for that attribute, convert the string
        value to a list of strings.

        Basically, replaces class="foo bar" with class=["foo", "bar"]

        NOTE: This method modifies its input in place.

        :param tag_name: The name of a tag.
        :param attrs: A dictionary containing the tag's attributes.
           Any appropriate attribute values will be modified in place.
        :return: The same `attrs` object that was passed in.
        """
        if not attrs or not self.cdata_list_attributes:
            # Nothing to convert, or list conversion is switched off.
            return attrs

        # Attributes that are multi-valued on every tag, and those that
        # are multi-valued only on this particular tag.
        universal = self.cdata_list_attributes.get('*', [])
        tag_specific = self.cdata_list_attributes.get(
            tag_name.lower(), None)
        for attr in list(attrs.keys()):
            if attr in universal or (tag_specific and attr in tag_specific):
                # We have a "class"-type attribute whose string
                # value is a whitespace-separated list of
                # values. Split it into a list.
                value = attrs[attr]
                if isinstance(value, str):
                    values = nonwhitespace_re.findall(value)
                else:
                    # html5lib sometimes calls setAttributes twice
                    # for the same tag when rearranging the parse
                    # tree. On the second call the attribute value
                    # here is already a list.  If this happens,
                    # leave the value alone rather than trying to
                    # split it again.
                    values = value
                attrs[attr] = values
        return attrs
332
class SAXTreeBuilder(TreeBuilder):
    """A Beautiful Soup treebuilder driven by SAX-style callbacks.

    Nothing uses this at the moment; it exists to demonstrate how a
    simple TreeBuilder would work. Element and character events are
    forwarded to the BeautifulSoup object in `self.soup`; every other
    event is ignored.
    """

    def feed(self, markup):
        """Not implemented: this builder has no parser to hand markup to."""
        raise NotImplementedError()

    def close(self):
        """Do nothing."""
        pass

    def startElement(self, name, attrs):
        """Open a tag on the soup.

        Each key of `attrs` is indexed with [1] to obtain the attribute
        name -- presumably the keys are (namespace, name) pairs; verify
        against the caller.
        """
        plain_attrs = {key[1]: value for key, value in attrs.items()}
        self.soup.handle_starttag(name, plain_attrs)

    def endElement(self, name):
        """Close a tag on the soup."""
        self.soup.handle_endtag(name)

    def startElementNS(self, nsTuple, nodeName, attrs):
        """Namespace-aware start event; `nsTuple` is discarded for now."""
        self.startElement(nodeName, attrs)

    def endElementNS(self, nsTuple, nodeName):
        """Namespace-aware end event; `nsTuple` is discarded for now."""
        self.endElement(nodeName)

    def startPrefixMapping(self, prefix, nodeValue):
        """Ignore the prefix mapping for now."""
        pass

    def endPrefixMapping(self, prefix):
        """Ignore the prefix mapping for now."""
        pass

    def characters(self, content):
        """Pass character data on to the soup."""
        self.soup.handle_data(content)

    def startDocument(self):
        """Do nothing."""
        pass

    def endDocument(self):
        """Do nothing."""
        pass
381
382
class HTMLTreeBuilder(TreeBuilder):
    """This TreeBuilder knows facts about HTML.

    Such as which tags are empty-element tags.
    """

    empty_element_tags = {
        # These are from HTML5.
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track',
        'wbr',

        # These are from earlier versions of HTML and are removed in HTML5.
        'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex',
        'nextid', 'spacer',
    }

    # The HTML standard defines these as block-level elements. Beautiful
    # Soup does not treat these elements differently from other elements,
    # but it may do so eventually, and this information is available if
    # you need to use it.
    block_elements = {
        "address", "article", "aside", "blockquote", "canvas", "dd", "div",
        "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form",
        "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main",
        "nav", "noscript", "ol", "output", "p", "pre", "section", "table",
        "tfoot", "ul", "video",
    }

    # These HTML tags need special treatment so they can be
    # represented by a string class other than NavigableString.
    #
    # For some of these tags, it's because the HTML standard defines
    # an unusual content model for them. I made this list by going
    # through the HTML spec
    # (https://html.spec.whatwg.org/#metadata-content) and looking for
    # "metadata content" elements that can contain strings.
    #
    # The Ruby tags (<rt> and <rp>) are here despite being normal
    # "phrasing content" tags, because the content they contain is
    # qualitatively different from other text in the document, and it
    # can be useful to be able to distinguish it.
    #
    # TODO: Arguably <noscript> could go here but it seems
    # qualitatively different from the other tags.
    DEFAULT_STRING_CONTAINERS = {
        'rt' : RubyTextString,
        'rp' : RubyParenthesisString,
        'style': Stylesheet,
        'script': Script,
        'template': TemplateString,
    }

    # The HTML standard defines these attributes as containing a
    # space-separated list of values, not a single value. That is,
    # class="foo bar" means that the 'class' attribute has two values,
    # 'foo' and 'bar', not the single value 'foo bar'.  When we
    # encounter one of these attributes, we will parse its value into
    # a list of values if possible. Upon output, the list will be
    # converted back into a string.
    #
    # The "*" entry applies to every tag. Each tag name appears once;
    # a repeated key in a dict literal silently overwrites the earlier
    # one.
    DEFAULT_CDATA_LIST_ATTRIBUTES = {
        "*" : ['class', 'accesskey', 'dropzone'],
        "a" : ['rel', 'rev'],
        "link" :  ['rel', 'rev'],
        "td" : ["headers"],
        "th" : ["headers"],
        "form" : ["accept-charset"],
        "object" : ["archive"],

        # These are HTML5 specific, as are *.accesskey and *.dropzone above.
        "area" : ["rel"],
        "icon" : ["sizes"],
        "iframe" : ["sandbox"],
        "output" : ["for"],
        }

    DEFAULT_PRESERVE_WHITESPACE_TAGS = {'pre', 'textarea'}

    def set_up_substitutions(self, tag):
        """Replace the declared encoding in a <meta> tag with a placeholder,
        to be substituted when the tag is output to a string.

        An HTML document may come in to Beautiful Soup as one
        encoding, but exit in a different encoding, and the <meta> tag
        needs to be changed to reflect this.

        :param tag: A `Tag`
        :return: Whether or not a substitution was performed. This is
            True for both the HTML 5 style ("charset") and the HTML 4
            style ("http-equiv"/"content") substitutions.
        """
        # We are only interested in <meta> tags
        if tag.name != 'meta':
            return False

        http_equiv = tag.get('http-equiv')
        content = tag.get('content')
        charset = tag.get('charset')

        # We are interested in <meta> tags that say what encoding the
        # document was originally in. This means HTML 5-style <meta>
        # tags that provide the "charset" attribute. It also means
        # HTML 4-style <meta> tags that provide the "content"
        # attribute and have "http-equiv" set to "content-type".
        #
        # In both cases we will replace the value of the appropriate
        # attribute with a standin object that can take on any
        # encoding.
        if charset is not None:
            # HTML 5 style:
            # <meta charset="utf8">
            tag['charset'] = CharsetMetaAttributeValue(charset)
            return True

        if (content is not None and http_equiv is not None
                and http_equiv.lower() == 'content-type'):
            # HTML 4 style:
            # <meta http-equiv="content-type" content="text/html; charset=utf8">
            tag['content'] = ContentMetaAttributeValue(content)
            return True

        # A <meta> tag that says nothing about the encoding.
        return False
495
class DetectsXMLParsedAsHTML(object):
    """A mixin class for any class (a TreeBuilder, or some class used by a
    TreeBuilder) that's in a position to detect whether an XML
    document is being incorrectly parsed as HTML, and issue an
    appropriate warning.

    This requires being able to observe an incoming processing
    instruction that might be an XML declaration, and also able to
    observe tags as they're opened. If you can't do that for a given
    TreeBuilder, there's a less reliable implementation based on
    examining the raw markup.
    """

    # Regular expression for seeing if markup has an <html> tag.
    #
    # The character between "<" and "html" is optional. When it was
    # mandatory, a plain "<html" could never match, so XHTML documents
    # with an XML declaration were reported as non-XHTML XML. Anything
    # the mandatory form matched still matches.
    LOOKS_LIKE_HTML = re.compile(r"<[^ +]?html", re.I)
    LOOKS_LIKE_HTML_B = re.compile(br"<[^ +]?html", re.I)

    XML_PREFIX = '<?xml'
    XML_PREFIX_B = b'<?xml'

    @classmethod
    def warn_if_markup_looks_like_xml(cls, markup, stacklevel=3):
        """Perform a check on some markup to see if it looks like XML
        that's not XHTML. If so, issue a warning.

        This is much less reliable than doing the check while parsing,
        but some of the tree builders can't do that.

        :param markup: The raw markup, as a string or a bytestring.
         None is accepted and never triggers the warning.

        :param stacklevel: The stacklevel of the code calling this
         function.

        :return: True if the markup looks like non-XHTML XML, False
         otherwise.
        """
        if markup is None:
            return False

        # Choose the str or bytes flavor of the prefix and the pattern
        # so they can be applied to the markup as-is.
        if isinstance(markup, bytes):
            prefix = cls.XML_PREFIX_B
            looks_like_html = cls.LOOKS_LIKE_HTML_B
        else:
            prefix = cls.XML_PREFIX
            looks_like_html = cls.LOOKS_LIKE_HTML

        # Only the first 500 characters are searched for an <html> tag.
        if (markup.startswith(prefix)
                and not looks_like_html.search(markup[:500])):
            cls._warn(stacklevel=stacklevel+2)
            return True
        return False

    @classmethod
    def _warn(cls, stacklevel=5):
        """Issue a warning about XML being parsed as HTML."""
        warnings.warn(
            XMLParsedAsHTMLWarning.MESSAGE, XMLParsedAsHTMLWarning,
            stacklevel=stacklevel
        )

    def _initialize_xml_detector(self):
        """Call this method before parsing a document."""
        self._first_processing_instruction = None
        self._root_tag = None

    def _document_might_be_xml(self, processing_instruction):
        """Call this method when encountering an XML declaration, or a
        "processing instruction" that might be an XML declaration.

        Only the first processing instruction seen before the root tag
        is recorded; later calls do nothing.
        """
        if (self._first_processing_instruction is not None
            or self._root_tag is not None):
            # The document has already started. Don't bother checking
            # anymore.
            return

        self._first_processing_instruction = processing_instruction

        # We won't know until we encounter the first tag whether or
        # not this is actually a problem.

    def _root_tag_encountered(self, name):
        """Call this when you encounter the document's root tag.

        This is where we actually check whether an XML document is
        being incorrectly parsed as HTML, and issue the warning.

        :param name: The name of the root tag.
        """
        if self._root_tag is not None:
            # This method was incorrectly called multiple times. Do
            # nothing.
            return

        self._root_tag = name
        if (name != 'html' and self._first_processing_instruction is not None
            and self._first_processing_instruction.lower().startswith('xml ')):
            # We encountered an XML declaration and then a tag other
            # than 'html'. This is a reliable indicator that an XML
            # document which isn't XHTML is being parsed as HTML.
            self._warn()
592
593
def register_treebuilders_from(module):
    """Copy TreeBuilders from the given module into this module.

    Every name in the module's `__all__` that refers to a TreeBuilder
    subclass is set as an attribute of this module, appended to this
    module's `__all__`, and registered with `builder_registry`.
    Anything else the module exports -- including objects that are not
    classes at all -- is skipped.

    :param module: An object with an `__all__` list naming its
        public attributes.
    """
    this_module = sys.modules[__name__]
    for name in module.__all__:
        obj = getattr(module, name)

        # issubclass() raises TypeError when its first argument is not
        # a class, so rule out functions and constants first.
        if isinstance(obj, type) and issubclass(obj, TreeBuilder):
            setattr(this_module, name, obj)
            this_module.__all__.append(name)
            # Register the builder while we're at it.
            this_module.builder_registry.register(obj)
605
class ParserRejectedMarkup(Exception):
    """Raised when the underlying parser flatly refuses to parse the
    markup it was given.
    """

    def __init__(self, message_or_exception):
        """Record why the markup was rejected.

        :param message_or_exception: Either a textual explanation, or
            another exception. An exception is rendered as
            "<class name>: <message>".
        """
        reason = message_or_exception
        if isinstance(reason, Exception):
            reason = "{}: {}".format(reason.__class__.__name__, str(reason))
        super().__init__(reason)
618
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
# want to use HTMLParser as a last resort.
#
# This works because TreeBuilderRegistry.register() puts each new
# builder at the front of its lists, so whatever is registered last is
# what lookup() considers first.
from . import _htmlparser
register_treebuilders_from(_htmlparser)
try:
    from . import _html5lib
    register_treebuilders_from(_html5lib)
except ImportError:
    # They don't have html5lib installed.
    pass
try:
    from . import _lxml
    register_treebuilders_from(_lxml)
except ImportError:
    # They don't have lxml installed.
    pass
637