xref: /openbmc/openbmc/poky/bitbake/lib/bs4/builder/_lxml.py (revision c124f4f2e04dca16a428a76c89677328bc7bf908)
# Use of this source code is governed by the MIT license.
__license__ = "MIT"

__all__ = [
    'LXMLTreeBuilderForXML',
    'LXMLTreeBuilder',
    ]

try:
    from collections.abc import Callable # Python 3.6
except ImportError as e:
    from collections import Callable

from io import BytesIO
from io import StringIO
from lxml import etree
from bs4.element import (
    Comment,
    Doctype,
    NamespacedAttribute,
    ProcessingInstruction,
    XMLProcessingInstruction,
)
from bs4.builder import (
    DetectsXMLParsedAsHTML,
    FAST,
    HTML,
    HTMLTreeBuilder,
    PERMISSIVE,
    ParserRejectedMarkup,
    TreeBuilder,
    XML)
from bs4.dammit import EncodingDetector

LXML = 'lxml'

def _invert(d):
    "Invert a dictionary."
    return dict((v,k) for k, v in list(d.items()))

class LXMLTreeBuilderForXML(TreeBuilder):
    DEFAULT_PARSER_CLASS = etree.XMLParser

    is_xml = True
    processing_instruction_class = XMLProcessingInstruction

    NAME = "lxml-xml"
    ALTERNATE_NAMES = ["xml"]

    # Well, it's permissive by XML parser standards.
    features = [NAME, LXML, XML, FAST, PERMISSIVE]

    CHUNK_SIZE = 512

    # This namespace mapping is specified in the XML Namespace
    # standard.
    DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace')

    DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS)

    # NOTE: If we parsed Element objects and looked at .sourceline,
    # we'd be able to see the line numbers from the original document.
    # But instead we build an XMLParser or HTMLParser object to serve
    # as the target of parse messages, and those messages don't include
    # line numbers.
    # See: https://bugs.launchpad.net/lxml/+bug/1846906

    def initialize_soup(self, soup):
        """Let the BeautifulSoup object know about the standard namespace
        mapping.

        :param soup: A `BeautifulSoup`.
        """
        super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
        self._register_namespaces(self.DEFAULT_NSMAPS)

    def _register_namespaces(self, mapping):
        """Let the BeautifulSoup object know about namespaces encountered
        while parsing the document.

        This might be useful later on when creating CSS selectors.

        This will track (almost) all namespaces, even ones that were
        only in scope for part of the document. If two namespaces have
        the same prefix, only the first one encountered will be
        tracked. Un-prefixed namespaces are not tracked.

        :param mapping: A dictionary mapping namespace prefixes to URIs.
        """
        for key, value in list(mapping.items()):
            # This is 'if key' and not 'if key is not None' because we
            # don't track un-prefixed namespaces. Soupselect will
            # treat an un-prefixed namespace as the default, which
            # causes confusion in some cases.
            if key and key not in self.soup._namespaces:
                # Let the BeautifulSoup object know about a new namespace.
                # If there are multiple namespaces defined with the same
                # prefix, the first one in the document takes precedence.
                self.soup._namespaces[key] = value

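    # A minimal usage sketch (not part of the original module) of what this
    # tracking enables: prefixes registered here can later resolve
    # namespace-qualified CSS selectors through a soupsieve-backed select().
    # Exact selector behavior depends on the installed bs4/soupsieve versions.
    #
    #   from bs4 import BeautifulSoup
    #   doc = b'<root xmlns:dc="http://purl.org/dc/elements/1.1/">' \
    #         b'<dc:title>Example</dc:title></root>'
    #   soup = BeautifulSoup(doc, "lxml-xml")
    #   soup.select("dc|title")  # 'dc' was registered during parsing
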
    def default_parser(self, encoding):
        """Find the default parser for the given encoding.

        :param encoding: A string.
        :return: Either a parser object or a class, which
          will be instantiated with default arguments.
        """
        if self._default_parser is not None:
            return self._default_parser
        return etree.XMLParser(
            target=self, strip_cdata=False, recover=True, encoding=encoding)

    def parser_for(self, encoding):
        """Instantiate an appropriate parser for the given encoding.

        :param encoding: A string.
        :return: A parser object such as an `etree.XMLParser`.
        """
        # Use the default parser.
        parser = self.default_parser(encoding)

        if isinstance(parser, Callable):
            # Instantiate the parser with default arguments
            parser = parser(
                target=self, strip_cdata=False, recover=True, encoding=encoding
            )
        return parser

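    # A hedged sketch of how the parser hook above is meant to be used:
    # passing a callable (typically a parser class) lets parser_for() build
    # a fresh parser per encoding attempt, while passing a ready-made parser
    # instance reuses it as-is. `my_parser` below is purely illustrative.
    #
    #   builder = LXMLTreeBuilderForXML(parser=etree.XMLParser)  # callable, re-instantiated per encoding
    #   builder = LXMLTreeBuilderForXML(parser=my_parser)        # instance, returned verbatim
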
    def __init__(self, parser=None, empty_element_tags=None, **kwargs):
        # TODO: Issue a warning if parser is present but not a
        # callable, since that means there's no way to create new
        # parsers for different encodings.
        self._default_parser = parser
        if empty_element_tags is not None:
            self.empty_element_tags = set(empty_element_tags)
        self.soup = None
        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
        self.active_namespace_prefixes = [dict(self.DEFAULT_NSMAPS)]
        super(LXMLTreeBuilderForXML, self).__init__(**kwargs)

    def _getNsTag(self, tag):
        # Split the namespace URL out of a fully-qualified lxml tag
        # name. Copied from lxml's src/lxml/sax.py.
        if tag[0] == '{':
            return tuple(tag[1:].split('}', 1))
        else:
            return (None, tag)

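    # Illustration of the Clark-notation split performed by _getNsTag():
    # lxml reports qualified names as "{uri}localname", which becomes a
    # (namespace, name) pair; unqualified names map to (None, name).
    #
    #   '{http://www.w3.org/XML/1998/namespace}lang'
    #       -> ('http://www.w3.org/XML/1998/namespace', 'lang')
    #   'lang' -> (None, 'lang')
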
    def prepare_markup(self, markup, user_specified_encoding=None,
                       exclude_encodings=None,
                       document_declared_encoding=None):
        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        lxml really wants to get a bytestring and convert it to
        Unicode itself. So instead of using UnicodeDammit to convert
        the bytestring to Unicode using different encodings, this
        implementation uses EncodingDetector to iterate over the
        encodings, and tell lxml to try to parse the document as each
        one in turn.

        :param markup: Some markup -- hopefully a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples:
         (markup, encoding, declared encoding,
          has undergone character replacement)

         Each 4-tuple represents a strategy for converting the
         document to Unicode and parsing it. Each strategy will be tried
         in turn.
        """
        is_html = not self.is_xml
        if is_html:
            self.processing_instruction_class = ProcessingInstruction
            # We're in HTML mode, so if we're given XML, that's worth
            # noting.
            DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(
                markup, stacklevel=3
            )
        else:
            self.processing_instruction_class = XMLProcessingInstruction

        if isinstance(markup, str):
            # We were given Unicode. Maybe lxml can parse Unicode on
            # this system?

            # TODO: This is a workaround for
            # https://bugs.launchpad.net/lxml/+bug/1948551.
            # We can remove it once the upstream issue is fixed.
            if len(markup) > 0 and markup[0] == u'\N{BYTE ORDER MARK}':
                markup = markup[1:]
            yield markup, None, document_declared_encoding, False

        if isinstance(markup, str):
            # No, apparently not. Convert the Unicode to UTF-8 and
            # tell lxml to parse it as UTF-8.
            yield (markup.encode("utf8"), "utf8",
                   document_declared_encoding, False)

        # This was provided by the end-user; treat it as a known
        # definite encoding per the algorithm laid out in the HTML5
        # spec.  (See the EncodingDetector class for details.)
        known_definite_encodings = [user_specified_encoding]

        # This was found in the document; treat it as a slightly lower-priority
        # user encoding.
        user_encodings = [document_declared_encoding]
        detector = EncodingDetector(
            markup, known_definite_encodings=known_definite_encodings,
            user_encodings=user_encodings, is_html=is_html,
            exclude_encodings=exclude_encodings
        )
        for encoding in detector.encodings:
            yield (detector.markup, encoding, document_declared_encoding, False)

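    # A rough sketch (mirroring what BeautifulSoup does internally, not an
    # additional public API) of how the yielded strategies are consumed:
    # each 4-tuple is one attempt, tried in turn until a parser accepts one.
    #
    #   builder = LXMLTreeBuilderForXML()
    #   for munged, enc, declared, _ in builder.prepare_markup(
    #           b"<a/>", user_specified_encoding="utf-8"):
    #       try:
    #           ...  # feed `munged` to a parser built for `enc`
    #           break
    #       except ParserRejectedMarkup:
    #           continue
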
    def feed(self, markup):
        if isinstance(markup, bytes):
            markup = BytesIO(markup)
        elif isinstance(markup, str):
            markup = StringIO(markup)

        # Call feed() at least once, even if the markup is empty,
        # or the parser won't be initialized.
        data = markup.read(self.CHUNK_SIZE)
        try:
            self.parser = self.parser_for(self.soup.original_encoding)
            self.parser.feed(data)
            while len(data) != 0:
                # Now call feed() on the rest of the data, chunk by chunk.
                data = markup.read(self.CHUNK_SIZE)
                if len(data) != 0:
                    self.parser.feed(data)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            raise ParserRejectedMarkup(e)

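    # A hedged sketch of the chunked feeding above: lxml's target-based
    # parser is driven incrementally, and close() flushes any buffered
    # events (start/end/data/comment) to this builder. `builder` below is
    # illustrative only.
    #
    #   parser = etree.XMLParser(target=builder, recover=True)
    #   parser.feed(b"<root><chi")   # chunks may split anywhere
    #   parser.feed(b"ld/></root>")
    #   parser.close()               # emits the remaining events
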
    def close(self):
        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]

    def start(self, name, attrs, nsmap={}):
        # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
        attrs = dict(attrs)
        nsprefix = None
        # Invert each namespace map as it comes in.
        if len(nsmap) == 0 and len(self.nsmaps) > 1:
            # There are no new namespaces for this tag, but
            # non-default namespaces are in play, so we need a
            # separate tag stack to know when they end.
            self.nsmaps.append(None)
        elif len(nsmap) > 0:
            # A new namespace mapping has come into play.

            # First, let the BeautifulSoup object know about it.
            self._register_namespaces(nsmap)

            # Then, add it to our running list of inverted namespace
            # mappings.
            self.nsmaps.append(_invert(nsmap))

            # The currently active namespace prefixes have
            # changed. Calculate the new mapping so it can be stored
            # with all Tag objects created while these prefixes are in
            # scope.
            current_mapping = dict(self.active_namespace_prefixes[-1])
            current_mapping.update(nsmap)

            # We should not track un-prefixed namespaces as we can only hold one
            # and it will be recognized as the default namespace by soupsieve,
            # which may be confusing in some situations.
            if '' in current_mapping:
                del current_mapping['']
            self.active_namespace_prefixes.append(current_mapping)

            # Also treat the namespace mapping as a set of attributes on the
            # tag, so we can recreate it later.
            attrs = attrs.copy()
            for prefix, namespace in list(nsmap.items()):
                attribute = NamespacedAttribute(
                    "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
                attrs[attribute] = namespace

        # Namespaces are in play. Find any attributes that came in
        # from lxml with namespaces attached to their names, and
        # turn them into NamespacedAttribute objects.
        new_attrs = {}
        for attr, value in list(attrs.items()):
            namespace, attr = self._getNsTag(attr)
            if namespace is None:
                new_attrs[attr] = value
            else:
                nsprefix = self._prefix_for_namespace(namespace)
                attr = NamespacedAttribute(nsprefix, attr, namespace)
                new_attrs[attr] = value
        attrs = new_attrs

        namespace, name = self._getNsTag(name)
        nsprefix = self._prefix_for_namespace(namespace)
        self.soup.handle_starttag(
            name, namespace, nsprefix, attrs,
            namespaces=self.active_namespace_prefixes[-1]
        )

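    # A hedged illustration of the attribute rewriting above: for an element
    # like <doc xmlns:dc="http://purl.org/dc/elements/1.1/" dc:creator="x"/>,
    # handle_starttag() receives the namespace declaration re-expressed as an
    # xmlns attribute and the prefixed attribute wrapped up, roughly:
    #
    #   NamespacedAttribute("xmlns", "dc", "http://www.w3.org/2000/xmlns/")      # renders as "xmlns:dc"
    #   NamespacedAttribute("dc", "creator", "http://purl.org/dc/elements/1.1/") # renders as "dc:creator"
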
    def _prefix_for_namespace(self, namespace):
        """Find the currently active prefix for the given namespace."""
        if namespace is None:
            return None
        for inverted_nsmap in reversed(self.nsmaps):
            if inverted_nsmap is not None and namespace in inverted_nsmap:
                return inverted_nsmap[namespace]
        return None

    def end(self, name):
        self.soup.endData()
        completed_tag = self.soup.tagStack[-1]
        namespace, name = self._getNsTag(name)
        nsprefix = None
        if namespace is not None:
            for inverted_nsmap in reversed(self.nsmaps):
                if inverted_nsmap is not None and namespace in inverted_nsmap:
                    nsprefix = inverted_nsmap[namespace]
                    break
        self.soup.handle_endtag(name, nsprefix)
        if len(self.nsmaps) > 1:
            # This tag, or one of its parents, introduced a namespace
            # mapping, so pop it off the stack.
            out_of_scope_nsmap = self.nsmaps.pop()

            if out_of_scope_nsmap is not None:
                # This tag introduced a namespace mapping which is no
                # longer in scope. Recalculate the currently active
                # namespace prefixes.
                self.active_namespace_prefixes.pop()

    def pi(self, target, data):
        self.soup.endData()
        data = target + ' ' + data
        self.soup.handle_data(data)
        self.soup.endData(self.processing_instruction_class)

    def data(self, content):
        self.soup.handle_data(content)

    def doctype(self, name, pubid, system):
        self.soup.endData()
        doctype = Doctype.for_name_and_ids(name, pubid, system)
        self.soup.object_was_parsed(doctype)

    def comment(self, content):
        "Handle comments as Comment objects."
        self.soup.endData()
        self.soup.handle_data(content)
        self.soup.endData(Comment)

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment


class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):

    NAME = LXML
    ALTERNATE_NAMES = ["lxml-html"]

    features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
    is_xml = False
    processing_instruction_class = ProcessingInstruction

    def default_parser(self, encoding):
        return etree.HTMLParser

    def feed(self, markup):
        encoding = self.soup.original_encoding
        try:
            self.parser = self.parser_for(encoding)
            self.parser.feed(markup)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            raise ParserRejectedMarkup(e)


    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return '<html><body>%s</body></html>' % fragment

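
# A hedged usage note (not part of the original module): applications
# normally select these builders through BeautifulSoup's feature strings
# registered above rather than instantiating them directly.
#
#   from bs4 import BeautifulSoup
#   BeautifulSoup("<p>hi</p>", "lxml")      # LXMLTreeBuilder (HTML)
#   BeautifulSoup(b"<root/>", "lxml-xml")   # LXMLTreeBuilderForXML
#   BeautifulSoup(b"<root/>", "xml")        # same XML builder, alternate name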