1# Use of this source code is governed by the MIT license.
2__license__ = "MIT"
3
4__all__ = [
5    'HTML5TreeBuilder',
6    ]
7
8import warnings
9import re
10from bs4.builder import (
11    DetectsXMLParsedAsHTML,
12    PERMISSIVE,
13    HTML,
14    HTML_5,
15    HTMLTreeBuilder,
16    )
17from bs4.element import (
18    NamespacedAttribute,
19    nonwhitespace_re,
20)
21import html5lib
22from html5lib.constants import (
23    namespaces,
24    prefixes,
25    )
26from bs4.element import (
27    Comment,
28    Doctype,
29    NavigableString,
30    Tag,
31    )
32
33try:
34    # Pre-0.99999999
35    from html5lib.treebuilders import _base as treebuilder_base
36    new_html5lib = False
37except ImportError as e:
38    # 0.99999999 and up
39    from html5lib.treebuilders import base as treebuilder_base
40    new_html5lib = True
41
class HTML5TreeBuilder(HTMLTreeBuilder):
    """Use html5lib to build a tree.

    Note that this TreeBuilder does not support some features common
    to HTML TreeBuilders. Some of these features could theoretically
    be implemented, but at the very least it's quite difficult,
    because html5lib moves the parse tree around as it's being built.

    * This TreeBuilder doesn't use different subclasses of NavigableString
      based on the name of the tag in which the string was found.

    * You can't use a SoupStrainer to parse only part of a document.
    """

    NAME = "html5lib"

    features = [NAME, PERMISSIVE, HTML_5, HTML]

    # html5lib can tell us which line number and position in the
    # original file is the source of an element.
    TRACKS_LINE_NUMBERS = True

    def prepare_markup(self, markup, user_specified_encoding,
                       document_declared_encoding=None, exclude_encodings=None):
        """Record the requested encoding and hand the markup back unchanged.

        This is a generator that yields exactly one tuple,
        ``(markup, None, None, False)``.

        :param markup: The document to parse, as text or bytes.
        :param user_specified_encoding: Stored on the builder so that
            `feed` can pass it to html5lib for non-str markup.
        :param document_declared_encoding: Accepted but not used here.
        :param exclude_encodings: Not supported by this builder; a
            warning is issued if a truthy value is given.
        """
        # Store the user-specified encoding for use later on.
        self.user_specified_encoding = user_specified_encoding

        # document_declared_encoding and exclude_encodings aren't used
        # ATM because the html5lib TreeBuilder doesn't use
        # UnicodeDammit.
        if exclude_encodings:
            warnings.warn(
                "You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.",
                stacklevel=3
            )

        # html5lib only parses HTML, so if it's given XML that's worth
        # noting.
        DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(
            markup, stacklevel=3
        )

        yield (markup, None, None, False)

    # These methods are defined by Beautiful Soup.
    def feed(self, markup):
        """Parse `markup` with html5lib, populating `self.soup`.

        After parsing, ``original_encoding`` is set on the parsed
        document: None for str input, otherwise the name of the
        encoding reported by html5lib's input stream.

        :param markup: The document to parse, as text or bytes.
        """
        if self.soup.parse_only is not None:
            warnings.warn(
                "You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.",
                stacklevel=4
            )
        # Constructing the parser calls create_treebuilder(), which is
        # what sets self.underlying_builder.
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
        self.underlying_builder.parser = parser
        try:
            extra_kwargs = {}
            if not isinstance(markup, str):
                # The keyword used to force an encoding was renamed
                # between html5lib versions.
                if new_html5lib:
                    extra_kwargs['override_encoding'] = self.user_specified_encoding
                else:
                    extra_kwargs['encoding'] = self.user_specified_encoding
            doc = parser.parse(markup, **extra_kwargs)

            # Set the character encoding detected by the tokenizer.
            if isinstance(markup, str):
                # We need to special-case this because html5lib sets
                # charEncoding to UTF-8 if it gets Unicode input.
                doc.original_encoding = None
            else:
                original_encoding = parser.tokenizer.stream.charEncoding[0]
                if not isinstance(original_encoding, str):
                    # In 0.99999999 and up, the encoding is an html5lib
                    # Encoding object. We want to use a string for
                    # compatibility with other tree builders.
                    original_encoding = original_encoding.name
                doc.original_encoding = original_encoding
        finally:
            # Always drop the reference to the parser, even if parsing
            # raised, so the underlying builder never keeps pointing at
            # a parser from an earlier (possibly failed) run.
            self.underlying_builder.parser = None

    def create_treebuilder(self, namespaceHTMLElements):
        """Build the html5lib-facing tree builder bound to `self.soup`.

        Passed to ``html5lib.HTMLParser`` as its ``tree`` argument by
        `feed`. The new builder is also kept as
        ``self.underlying_builder``.

        :param namespaceHTMLElements: Forwarded unchanged to
            `TreeBuilderForHtml5lib`.
        :return: The newly created `TreeBuilderForHtml5lib`.
        """
        self.underlying_builder = TreeBuilderForHtml5lib(
            namespaceHTMLElements, self.soup,
            store_line_numbers=self.store_line_numbers
        )
        return self.underlying_builder

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return '<html><head></head><body>%s</body></html>' % fragment
128
129
class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
    """html5lib tree builder whose nodes wrap Beautiful Soup objects."""

    def __init__(self, namespaceHTMLElements, soup=None,
                 store_line_numbers=True, **kwargs):
        """Set up the builder around a BeautifulSoup object.

        :param namespaceHTMLElements: Passed to the html5lib base class.
        :param soup: The BeautifulSoup object to populate. If none is
            given, an empty one is created with the 'html.parser'
            builder.
        :param store_line_numbers: Whether `elementClass` should record
            source positions on new tags.
        :param kwargs: Only used when a BeautifulSoup object has to be
            created here; passed to its constructor.
        """
        if not soup:
            from bs4 import BeautifulSoup
            # TODO: Why is the parser 'html.parser' here? To avoid an
            # infinite loop?
            soup = BeautifulSoup(
                "", "html.parser", store_line_numbers=store_line_numbers,
                **kwargs
            )
        self.soup = soup

        # TODO: What are **kwargs exactly? Should they be passed in
        # here in addition to/instead of being passed to the BeautifulSoup
        # constructor?
        super().__init__(namespaceHTMLElements)

        # This will be set later to an html5lib.html5parser.HTMLParser
        # object, which we can use to track the current line number.
        self.parser = None
        self.store_line_numbers = store_line_numbers

    def documentClass(self):
        """Reset the soup and return it wrapped as the document node."""
        soup = self.soup
        soup.reset()
        return Element(soup, soup, None)

    def insertDoctype(self, token):
        """Turn an html5lib doctype token into a Doctype in the soup."""
        doctype = Doctype.for_name_and_ids(
            token["name"], token["publicId"], token["systemId"]
        )
        self.soup.object_was_parsed(doctype)

    def elementClass(self, name, namespace):
        """Create a new tag and return it wrapped in an `Element`."""
        position = {}
        if self.parser and self.store_line_numbers:
            # This represents the point immediately after the end of the
            # tag. We don't know when the tag started, but we do know
            # where it ended -- the character just before this one.
            line, column = self.parser.tokenizer.stream.position()
            position = {'sourceline': line, 'sourcepos': column - 1}
        new_tag = self.soup.new_tag(name, namespace, **position)
        return Element(new_tag, self.soup, namespace)

    def commentClass(self, data):
        """Wrap `data` in a Comment and return it as a `TextNode`."""
        comment = Comment(data)
        return TextNode(comment, self.soup)

    def fragmentClass(self):
        """Replace the soup with a fresh, empty fragment root and wrap it."""
        from bs4 import BeautifulSoup
        # TODO: Why is the parser 'html.parser' here? To avoid an
        # infinite loop?
        fragment = BeautifulSoup("", "html.parser")
        fragment.name = "[document_fragment]"
        self.soup = fragment
        return Element(fragment, fragment, None)

    def appendChild(self, node):
        """Append the object wrapped by `node` to the soup itself."""
        # XXX This code is not covered by the BS4 tests.
        self.soup.append(node.element)

    def getDocument(self):
        """Return the BeautifulSoup object being built."""
        return self.soup

    def getFragment(self):
        """Return the Beautiful Soup object behind the base class's fragment."""
        wrapper = treebuilder_base.TreeBuilder.getFragment(self)
        return wrapper.element

    def testSerializer(self, element):
        """Render the tree under `element` as '|'-prefixed, indented lines.

        Each node produces one line; a tag's attributes follow it,
        sorted, one per line, and its children are indented by two
        more spaces.

        :return: All lines joined with newlines.
        """
        doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')
        lines = []

        def doctype_line(doctype, pad):
            # Build the single output line for a Doctype node.
            match = doctype_re.match(doctype)
            if not match:
                return "|%s<!DOCTYPE >" % (pad,)
            name = match.group(1)
            if match.lastindex > 1:
                public_id = match.group(2) or ""
                system_id = match.group(3) or match.group(4) or ""
                return ("""|%s<!DOCTYPE %s "%s" "%s">""" %
                        (pad, name, public_id, system_id))
            return "|%s<!DOCTYPE %s>" % (pad, name)

        def serializeElement(element, indent=0):
            pad = ' ' * indent

            # Doctype and Comment must be tested before the more
            # general NavigableString.
            if isinstance(element, Doctype):
                lines.append(doctype_line(element, pad))
                return
            if isinstance(element, Comment):
                lines.append("|%s<!-- %s -->" % (pad, element))
                return
            if isinstance(element, NavigableString):
                lines.append("|%s\"%s\"" % (pad, element))
                return

            # Anything else is treated as a tag.
            if element.namespace:
                label = "%s %s" % (prefixes[element.namespace],
                                   element.name)
            else:
                label = element.name
            lines.append("|%s<%s>" % (pad, label))

            child_indent = indent + 2
            if element.attrs:
                rendered = []
                for key, value in list(element.attrs.items()):
                    if isinstance(key, NamespacedAttribute):
                        key = "%s %s" % (prefixes[key.namespace], key.name)
                    if isinstance(value, list):
                        value = " ".join(value)
                    rendered.append((key, value))

                attr_pad = ' ' * child_indent
                for key, value in sorted(rendered):
                    lines.append('|%s%s="%s"' % (attr_pad, key, value))

            for child in element.children:
                serializeElement(child, child_indent)

        serializeElement(element, 0)

        return "\n".join(lines)
249
class AttrList(object):
    """Mapping-like view of an element's attributes, for use by html5lib.

    Reads are served from ``self.attrs``, a copy of the element's
    attributes taken when this object is created. Writes go straight
    to the element and are not reflected in ``self.attrs``.
    """

    def __init__(self, element):
        """Snapshot the attributes of `element`."""
        self.element = element
        self.attrs = dict(self.element.attrs)

    def __iter__(self):
        """Iterate over (name, value) pairs of the snapshot."""
        return iter(list(self.attrs.items()))

    def __setitem__(self, name, value):
        """Set attribute `name` on the underlying element."""
        # If this attribute is a multi-valued attribute for this element,
        # turn its value into a list.
        list_attr = self.element.cdata_list_attributes or {}
        if (name in list_attr.get('*', ())
                or name in list_attr.get(self.element.name, ())):
            # A node that is being cloned may have already undergone
            # this procedure.
            if not isinstance(value, list):
                value = nonwhitespace_re.findall(value)
        self.element[name] = value

    def items(self):
        """Return the snapshot's (name, value) pairs as a list."""
        return list(self.attrs.items())

    def keys(self):
        """Return the snapshot's attribute names as a list."""
        return list(self.attrs.keys())

    def __len__(self):
        """Return the number of attributes in the snapshot."""
        return len(self.attrs)

    def __getitem__(self, name):
        """Return the snapshot value for `name`; KeyError if absent."""
        return self.attrs[name]

    def __contains__(self, name):
        """Report whether `name` is an attribute in the snapshot."""
        # Test against the dict itself rather than building a list of
        # keys on every membership check.
        return name in self.attrs
278
279
class Element(treebuilder_base.Node):
    """html5lib tree node that wraps a Beautiful Soup object.

    ``self.element`` is the wrapped Beautiful Soup object, ``self.soup``
    is the BeautifulSoup object used to create new strings and tags,
    and ``self.namespace`` is the namespace given at construction
    (may be None).
    """

    def __init__(self, element, soup, namespace):
        """Wrap `element`, using its ``name`` as the node name."""
        treebuilder_base.Node.__init__(self, element.name)
        self.element = element
        self.soup = soup
        self.namespace = namespace

    def appendChild(self, node):
        """Append `node` as the last child of the wrapped element.

        :param node: Either a wrapper object exposing ``.element``, a
            bare `Tag`, or a plain string. A string appended right
            after an existing NavigableString is merged into it.
        """
        string_child = child = None
        if isinstance(node, str):
            # Some other piece of code decided to pass in a string
            # instead of creating a TextElement object to contain the
            # string.
            string_child = child = node
        elif isinstance(node, Tag):
            # Some other piece of code decided to pass in a Tag
            # instead of creating an Element object to contain the
            # Tag.
            child = node
        elif node.element.__class__ == NavigableString:
            string_child = child = node.element
            node.parent = self
        else:
            child = node.element
            node.parent = self

        if not isinstance(child, str) and child.parent is not None:
            # Detach the object being appended from its current parent.
            # `child` is `node.element` for wrapper nodes and `node`
            # itself for a bare Tag, which has no `.element` wrapper
            # attribute.
            child.extract()

        if (string_child is not None and self.element.contents
            and self.element.contents[-1].__class__ == NavigableString):
            # We are appending a string onto another string.
            # TODO This has O(n^2) performance, for input like
            # "a</a>a</a>a</a>..."
            old_element = self.element.contents[-1]
            new_element = self.soup.new_string(old_element + string_child)
            old_element.replace_with(new_element)
            self.soup._most_recent_element = new_element
        else:
            if isinstance(node, str):
                # Create a brand new NavigableString from this string.
                child = self.soup.new_string(node)

            # Tell Beautiful Soup to act as if it parsed this element
            # immediately after the parent's last descendant. (Or
            # immediately after the parent, if it has no children.)
            if self.element.contents:
                most_recent_element = self.element._last_descendant(False)
            elif self.element.next_element is not None:
                # Something from further ahead in the parse tree is
                # being inserted into this earlier element. This is
                # very annoying because it means an expensive search
                # for the last element in the tree.
                most_recent_element = self.soup._last_descendant()
            else:
                most_recent_element = self.element

            self.soup.object_was_parsed(
                child, parent=self.element,
                most_recent_element=most_recent_element)

    def getAttributes(self):
        """Return the element's attributes as an `AttrList`.

        A wrapped Comment yields an empty dict instead.
        """
        if isinstance(self.element, Comment):
            return {}
        return AttrList(self.element)

    def setAttributes(self, attributes):
        """Copy `attributes` onto the wrapped element.

        Tuple keys are replaced, in the passed-in mapping itself, by
        `NamespacedAttribute` objects built from the tuple. Does
        nothing when `attributes` is None or empty.
        """
        if attributes is not None and len(attributes) > 0:
            # Iterate over a copy, since keys are swapped out in place.
            for name, value in list(attributes.items()):
                if isinstance(name, tuple):
                    new_name = NamespacedAttribute(*name)
                    del attributes[name]
                    attributes[new_name] = value

            self.soup.builder._replace_cdata_list_attribute_values(
                self.name, attributes)
            for name, value in list(attributes.items()):
                self.element[name] = value

            # The attributes may contain variables that need substitution.
            # Call set_up_substitutions manually.
            #
            # The Tag constructor called this method when the Tag was created,
            # but we just set/changed the attributes, so call it again.
            self.soup.builder.set_up_substitutions(self.element)
    attributes = property(getAttributes, setAttributes)

    def insertText(self, data, insertBefore=None):
        """Insert `data` as text, before `insertBefore` or at the end."""
        text = TextNode(self.soup.new_string(data), self.soup)
        if insertBefore:
            self.insertBefore(text, insertBefore)
        else:
            self.appendChild(text)

    def insertBefore(self, node, refNode):
        """Insert `node` immediately before the existing child `refNode`.

        If `node` wraps a NavigableString and the child just before
        `refNode` is also a NavigableString, the two strings are merged
        instead of inserting a second string node.
        """
        index = self.element.index(refNode.element)
        # Only look at contents[index-1] when refNode really has a
        # previous sibling; with index == 0 the subscript -1 would wrap
        # around to the *last* child.
        if (node.element.__class__ == NavigableString and index > 0
            and self.element.contents[index-1].__class__ == NavigableString):
            # (See comments in appendChild)
            old_node = self.element.contents[index-1]
            new_str = self.soup.new_string(old_node + node.element)
            old_node.replace_with(new_str)
        else:
            self.element.insert(index, node.element)
            node.parent = self

    def removeChild(self, node):
        """Detach the object wrapped by `node` from the tree."""
        node.element.extract()

    def reparentChildren(self, new_parent):
        """Move all of this tag's children into another tag."""
        element = self.element
        new_parent_element = new_parent.element
        # Determine what this tag's next_element will be once all the children
        # are removed.
        final_next_element = element.next_sibling

        new_parents_last_descendant = new_parent_element._last_descendant(False, False)
        if len(new_parent_element.contents) > 0:
            # The new parent already contains children. We will be
            # appending this tag's children to the end.
            new_parents_last_child = new_parent_element.contents[-1]
            new_parents_last_descendant_next_element = new_parents_last_descendant.next_element
        else:
            # The new parent contains no children.
            new_parents_last_child = None
            new_parents_last_descendant_next_element = new_parent_element.next_element

        to_append = element.contents
        if len(to_append) > 0:
            # Set the first child's previous_element and previous_sibling
            # to elements within the new parent
            first_child = to_append[0]
            if new_parents_last_descendant is not None:
                first_child.previous_element = new_parents_last_descendant
            else:
                first_child.previous_element = new_parent_element
            first_child.previous_sibling = new_parents_last_child
            if new_parents_last_descendant is not None:
                new_parents_last_descendant.next_element = first_child
            else:
                new_parent_element.next_element = first_child
            if new_parents_last_child is not None:
                new_parents_last_child.next_sibling = first_child

            # Find the very last element being moved. It is now the
            # parent's last descendant. It has no .next_sibling and
            # its .next_element is whatever the previous last
            # descendant had.
            last_childs_last_descendant = to_append[-1]._last_descendant(False, True)

            last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
            if new_parents_last_descendant_next_element is not None:
                # TODO: This code has no test coverage and I'm not sure
                # how to get html5lib to go through this path, but it's
                # just the other side of the previous line.
                new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant
            last_childs_last_descendant.next_sibling = None

        for child in to_append:
            child.parent = new_parent_element
            new_parent_element.contents.append(child)

        # Now that this element has no children, change its .next_element.
        element.contents = []
        element.next_element = final_next_element

    def cloneNode(self):
        """Return a new `Element` with the same name, namespace and attributes.

        Children are not copied.
        """
        tag = self.soup.new_tag(self.element.name, self.namespace)
        node = Element(tag, self.soup, self.namespace)
        for key, value in self.attributes:
            node.attributes[key] = value
        return node

    def hasContent(self):
        """Return the wrapped element's contents list (truthy if non-empty)."""
        return self.element.contents

    def getNameTuple(self):
        """Return ``(namespace, name)``, using the HTML namespace if none is set."""
        if self.namespace is None:
            return namespaces["html"], self.name
        else:
            return self.namespace, self.name

    nameTuple = property(getNameTuple)
473
class TextNode(Element):
    """Node wrapping a string-like Beautiful Soup object instead of a tag."""

    def __init__(self, element, soup):
        """Wrap `element`, registering the node with a name of None."""
        # The base Node is initialized directly, with no name, rather
        # than through Element.__init__, which reads element.name.
        treebuilder_base.Node.__init__(self, None)
        self.element, self.soup = element, soup

    def cloneNode(self):
        """Cloning is not supported for text nodes."""
        raise NotImplementedError
482