xref: /openbmc/openbmc/poky/bitbake/lib/bs4/builder/_htmlparser.py (revision c124f4f2e04dca16a428a76c89677328bc7bf908)
1 # encoding: utf-8
2 """Use the HTMLParser library to parse HTML files that aren't too bad."""
3 
4 # Use of this source code is governed by the MIT license.
# Licensing marker for this module.
__license__ = 'MIT'

# The tree builder is the only name exported by `from ... import *`.
__all__ = ['HTMLParserTreeBuilder']
10 
11 from html.parser import HTMLParser
12 
13 import sys
14 import warnings
15 
16 from bs4.element import (
17     CData,
18     Comment,
19     Declaration,
20     Doctype,
21     ProcessingInstruction,
22     )
23 from bs4.dammit import EntitySubstitution, UnicodeDammit
24 
25 from bs4.builder import (
26     DetectsXMLParsedAsHTML,
27     ParserRejectedMarkup,
28     HTML,
29     HTMLTreeBuilder,
30     STRICT,
31     )
32 
33 
# Feature name under which the html.parser-based tree builder is registered
# (used as HTMLParserTreeBuilder.NAME below).
HTMLPARSER = "html.parser"
35 
class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
    """A subclass of the Python standard library's HTMLParser class, which
    listens for HTMLParser events and translates them into calls
    to Beautiful Soup's tree construction API.

    The object that receives those calls is read from ``self.soup``;
    it must be assigned to the parser before any markup is fed in.
    """

    # Strategies for handling duplicate attributes
    IGNORE = 'ignore'
    REPLACE = 'replace'

    def __init__(self, *args, **kwargs):
        """Constructor.

        :param on_duplicate_attribute: A strategy for what to do if a
            tag includes the same attribute more than once. Accepted
            values are: REPLACE (replace earlier values with later
            ones, the default), IGNORE (keep the earliest value
            encountered), or a callable. A callable must take three
            arguments: the dictionary of attributes already processed,
            the name of the duplicate attribute, and the most recent value
            encountered.

        All other positional and keyword arguments are passed through
        to the HTMLParser constructor.
        """
        # Remove our own keyword before delegating, since HTMLParser
        # is only handed what is left in kwargs.
        self.on_duplicate_attribute = kwargs.pop(
            'on_duplicate_attribute', self.REPLACE
        )
        HTMLParser.__init__(self, *args, **kwargs)

        # Keep a list of empty-element tags that were encountered
        # without an explicit closing tag. If we encounter a closing tag
        # of this type, we'll associate it with one of those entries.
        #
        # This isn't a stack because we don't care about the
        # order. It's a list of closing tags we've already handled and
        # will ignore, assuming they ever show up.
        self.already_closed_empty_element = []

        self._initialize_xml_detector()

    def error(self, message):
        """Reject the markup outright when the parser reports an error.

        :param message: Description of the problem.
        :raise ParserRejectedMarkup: Always.
        """
        # NOTE: This method is required so long as Python 3.9 is
        # supported. The corresponding code is removed from HTMLParser
        # in 3.5, but not removed from ParserBase until 3.10.
        # https://github.com/python/cpython/issues/76025
        #
        # The original implementation turned the error into a warning,
        # but in every case I discovered, this made HTMLParser
        # immediately crash with an error message that was less
        # helpful than the warning. The new implementation makes it
        # more clear that html.parser just can't parse this
        # markup. The 3.10 implementation does the same, though it
        # raises AssertionError rather than calling a method. (We
        # catch this error and wrap it in a ParserRejectedMarkup.)
        raise ParserRejectedMarkup(message)

    def handle_startendtag(self, name, attrs):
        """Handle an incoming empty-element tag.

        This is only called when the markup looks like <tag/>.

        :param name: Name of the tag.
        :param attrs: The tag's attributes, as an iterable of
            (name, value) pairs.
        """
        # handle_empty_element=False tells handle_starttag not to close
        # the tag just because its name matches a known empty-element
        # tag. We know that this is an empty-element tag and we want to
        # call handle_endtag ourselves.
        self.handle_starttag(name, attrs, handle_empty_element=False)
        self.handle_endtag(name)

    def handle_starttag(self, name, attrs, handle_empty_element=True):
        """Handle an opening tag, e.g. '<tag>'

        :param name: Name of the tag.
        :param attrs: The tag's attributes, as an iterable of
            (name, value) pairs. A value of None is treated as the
            empty string.
        :param handle_empty_element: True if this method should itself
            issue the end-tag event when the new tag turns out to be an
            empty-element tag. handle_startendtag passes False because
            it calls handle_endtag on its own.
        """
        # XXX namespace
        attr_dict = {}
        on_dupe = self.on_duplicate_attribute
        for key, value in attrs:
            # Change None attribute values to the empty string
            # for consistency with the other tree builders.
            if value is None:
                value = ''
            if key not in attr_dict:
                attr_dict[key] = value
                continue

            # A single attribute shows up multiple times in this
            # tag. How to handle it depends on the
            # on_duplicate_attribute setting.
            if on_dupe == self.IGNORE:
                # Keep the value that was seen first.
                pass
            elif on_dupe in (None, self.REPLACE):
                attr_dict[key] = value
            else:
                # Anything else is treated as a callable that decides
                # what ends up in attr_dict.
                on_dupe(attr_dict, key, value)

        sourceline, sourcepos = self.getpos()
        tag = self.soup.handle_starttag(
            name, None, None, attr_dict, sourceline=sourceline,
            sourcepos=sourcepos
        )
        if tag and tag.is_empty_element and handle_empty_element:
            # Unlike other parsers, html.parser doesn't send separate end tag
            # events for empty-element tags. (It's handled in
            # handle_startendtag, but only if the original markup looked like
            # <tag/>.)
            #
            # So we need to call handle_endtag() ourselves. Since we
            # know the start event is identical to the end event, we
            # don't want handle_endtag() to cross off any previous end
            # events for tags of this name.
            self.handle_endtag(name, check_already_closed=False)

            # But we might encounter an explicit closing tag for this tag
            # later on. If so, we want to ignore it.
            self.already_closed_empty_element.append(name)

        if self._root_tag is None:
            self._root_tag_encountered(name)

    def handle_endtag(self, name, check_already_closed=True):
        """Handle a closing tag, e.g. '</tag>'

        :param name: A tag name.
        :param check_already_closed: True if the name should first be
            looked up in the list of empty-element tags that were
            already closed implicitly; if it is found there, the end
            tag is swallowed instead of being passed on.
        """
        if check_already_closed and name in self.already_closed_empty_element:
            # This is a redundant end tag for an empty-element tag.
            # We've already called handle_endtag() for it, so just
            # check it off the list.
            self.already_closed_empty_element.remove(name)
        else:
            self.soup.handle_endtag(name)

    def handle_data(self, data):
        """Handle some textual data that shows up between tags.

        :param data: The text to pass along to the soup object.
        """
        self.soup.handle_data(data)

    def handle_charref(self, name):
        """Handle a numeric character reference by converting it to the
        corresponding Unicode character and treating it as textual
        data.

        If no character can be produced for the number, the Unicode
        replacement character is used instead.

        :param name: Character number, possibly in hexadecimal.
        """
        # TODO: This was originally a workaround for a bug in
        # HTMLParser. (http://bugs.python.org/issue13633) The bug has
        # been fixed, but removing this code still makes some
        # Beautiful Soup tests fail. This needs investigation.
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
            real_name = int(name.lstrip('X'), 16)
        else:
            real_name = int(name)

        data = None
        if real_name < 256:
            # HTML numeric entities are supposed to reference Unicode
            # code points, but sometimes they reference code points in
            # some other encoding (ahem, Windows-1252). E.g. &#147;
            # instead of &#8220; for LEFT DOUBLE QUOTATION MARK. This
            # code tries to detect this situation and compensate.
            #
            # NOTE(review): the loop does not stop at the first
            # successful decode, so whenever the windows-1252 decode
            # succeeds its result replaces the one obtained from the
            # document's own encoding. Confirm this is intended.
            for encoding in (self.soup.original_encoding, 'windows-1252'):
                if not encoding:
                    continue
                try:
                    data = bytearray([real_name]).decode(encoding)
                except UnicodeDecodeError:
                    # This byte has no meaning in this encoding.
                    pass
        if not data:
            try:
                data = chr(real_name)
            except (ValueError, OverflowError):
                # The number is not a valid code point.
                pass
        data = data or "\N{REPLACEMENT CHARACTER}"
        self.handle_data(data)

    def handle_entityref(self, name):
        """Handle a named entity reference by converting it to the
        corresponding Unicode character(s) and treating it as textual
        data.

        :param name: Name of the entity reference.
        """
        character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
        if character is not None:
            data = character
        else:
            # If this were XML, it would be ambiguous whether "&foo"
            # was a character entity reference with a missing
            # semicolon or the literal string "&foo". Since this is
            # HTML, we have a complete list of all character entity references,
            # and this one wasn't found, so assume it's the literal string "&foo".
            data = "&%s" % name
        self.handle_data(data)

    def handle_comment(self, data):
        """Handle an HTML comment.

        :param data: The text of the comment.
        """
        # Flush any pending text first so that the comment ends up in
        # a string object of its own.
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(Comment)

    def handle_decl(self, data):
        """Handle a DOCTYPE declaration.

        :param data: The text of the declaration.
        """
        self.soup.endData()
        # The first len("DOCTYPE ") characters are dropped by position;
        # their content is not inspected.
        data = data[len("DOCTYPE "):]
        self.soup.handle_data(data)
        self.soup.endData(Doctype)

    def unknown_decl(self, data):
        """Handle a declaration of unknown type -- probably a CDATA block.

        :param data: The text of the declaration.
        """
        if data.upper().startswith('CDATA['):
            # Strip the marker so only the CDATA content is stored.
            cls = CData
            data = data[len('CDATA['):]
        else:
            cls = Declaration
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(cls)

    def handle_pi(self, data):
        """Handle a processing instruction.

        :param data: The text of the instruction.
        """
        self.soup.endData()
        self.soup.handle_data(data)
        # Hand the instruction to the XML detector as well.
        self._document_might_be_xml(data)
        self.soup.endData(ProcessingInstruction)
283 
284 
class HTMLParserTreeBuilder(HTMLTreeBuilder):
    """A Beautiful soup `TreeBuilder` that uses the `HTMLParser` parser,
    found in the Python standard library.
    """
    is_xml = False
    picklable = True
    NAME = HTMLPARSER
    features = [NAME, HTML, STRICT]

    # The html.parser knows which line number and position in the
    # original file is the source of an element.
    TRACKS_LINE_NUMBERS = True

    def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
        """Constructor.

        :param parser_args: Positional arguments to pass into
            the BeautifulSoupHTMLParser constructor, once it's
            invoked.
        :param parser_kwargs: Keyword arguments to pass into
            the BeautifulSoupHTMLParser constructor, once it's
            invoked. The mapping is copied, so the caller's object
            is left untouched.
        :param kwargs: Keyword arguments for the superclass constructor.
        """
        # Some keyword arguments will be pulled out of kwargs and placed
        # into parser_kwargs.
        extra_parser_kwargs = {}
        for arg in ('on_duplicate_attribute',):
            if arg in kwargs:
                extra_parser_kwargs[arg] = kwargs.pop(arg)
        super().__init__(**kwargs)

        # Work on private copies: the entries added below must not leak
        # back into objects owned by the caller.
        parser_args = list(parser_args) if parser_args else []
        parser_kwargs = dict(parser_kwargs) if parser_kwargs else {}
        parser_kwargs.update(extra_parser_kwargs)
        # Always forced to False, overriding any caller-supplied value;
        # BeautifulSoupHTMLParser defines its own handle_charref and
        # handle_entityref.
        parser_kwargs['convert_charrefs'] = False
        self.parser_args = (parser_args, parser_kwargs)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None, exclude_encodings=None):
        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        :param markup: Some markup -- probably a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples:
         (markup, encoding, declared encoding,
          has undergone character replacement)

         Each 4-tuple represents a strategy for converting the
         document to Unicode and parsing it. Each strategy will be tried
         in turn.
        """
        if isinstance(markup, str):
            # Parse Unicode as-is.
            yield (markup, None, None, False)
            return

        # Ask UnicodeDammit to sniff the most likely encoding.

        # This was provided by the end-user; treat it as a known
        # definite encoding per the algorithm laid out in the HTML5
        # spec.  (See the EncodingDetector class for details.)
        known_definite_encodings = [user_specified_encoding]

        # This was found in the document; treat it as a slightly lower-priority
        # user encoding.
        user_encodings = [document_declared_encoding]

        dammit = UnicodeDammit(
            markup,
            known_definite_encodings=known_definite_encodings,
            user_encodings=user_encodings,
            is_html=True,
            exclude_encodings=exclude_encodings
        )
        yield (dammit.markup, dammit.original_encoding,
               dammit.declared_html_encoding,
               dammit.contains_replacement_characters)

    def feed(self, markup):
        """Run some incoming markup through some parsing process,
        populating the `BeautifulSoup` object in self.soup.

        :param markup: The markup to parse.
        :raise ParserRejectedMarkup: If html.parser gives up on the
            markup with an AssertionError.
        """
        args, kwargs = self.parser_args
        # A new parser object is built on every call.
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
            parser.close()
        except AssertionError as e:
            # html.parser raises AssertionError in rare cases to
            # indicate a fatal problem with the markup, especially
            # when there's an error in the doctype declaration.
            raise ParserRejectedMarkup(e) from e
        parser.already_closed_empty_element = []
388