xref: /openbmc/openbmc/poky/bitbake/lib/bs4/__init__.py (revision 82c905dc)
1"""Beautiful Soup
2Elixir and Tonic
3"The Screen-Scraper's Friend"
4http://www.crummy.com/software/BeautifulSoup/
5
6Beautiful Soup uses a pluggable XML or HTML parser to parse a
7(possibly invalid) document into a tree representation. Beautiful Soup
provides methods and Pythonic idioms that make it easy to
9navigate, search, and modify the parse tree.
10
11Beautiful Soup works with Python 2.6 and up. It works better if lxml
12and/or html5lib is installed.
13
14For more than you ever wanted to know about Beautiful Soup, see the
15documentation:
16http://www.crummy.com/software/BeautifulSoup/bs4/doc/
17"""
18
# Package metadata: author contact, release version, copyright and license.
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.4.1"
__copyright__ = "Copyright (c) 2004-2015 Leonard Richardson"
__license__ = "MIT"

# Public API of the package: only this name is exported by
# `from bs4 import *`.
__all__ = ['BeautifulSoup']
25
26import os
27import re
28import warnings
29
30from .builder import builder_registry, ParserRejectedMarkup
31from .dammit import UnicodeDammit
32from .element import (
33    CData,
34    Comment,
35    DEFAULT_OUTPUT_ENCODING,
36    Declaration,
37    Doctype,
38    NavigableString,
39    PageElement,
40    ProcessingInstruction,
41    ResultSet,
42    SoupStrainer,
43    Tag,
44    )
45
# Leftover of the guard that was meant to give a useful error if someone
# ran the Python 2 version of this code under Python 3 without converting
# it. In this file the statement is just a comparison of two string
# literals whose result is discarded, so it has no effect at import time.
# NOTE(review): presumably the unconverted original used the Python 2 only
# `<>` operator (a syntax error under Python 3) and 2to3 rewrote it to
# `!=` -- confirm against upstream before removing the line.
'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
49
class BeautifulSoup(Tag):
    """Root of the parse tree and receiver of tree-builder events.

    A BeautifulSoup object is itself a Tag (named ROOT_TAG_NAME) that
    holds the whole document. It drives the tree builder through:
      builder.reset()
      builder.feed(markup)

    The tree builder may call these methods from its feed() implementation:
      handle_starttag(name, namespace, nsprefix, attrs) # See note about
                                                        # return value
      handle_endtag(name, nsprefix=None)
      handle_data(data) # Appends to the current data node
      endData(containerClass=NavigableString) # Ends the current data node
      object_was_parsed(o) # Adds an already-built object to the tree

    No matter how complicated the underlying parser is, you should be
    able to build a tree using 'start tag' events, 'end tag' events,
    'data' events, and "done with data" events.

    If you encounter an empty-element tag (aka a self-closing tag,
    like HTML's <br> tag), call handle_starttag and then
    handle_endtag.
    """
    ROOT_TAG_NAME = '[document]'

    # If the end-user gives no indication which tree builder they
    # want, look for one with these features.
    DEFAULT_BUILDER_FEATURES = ['html', 'fast']

    # Characters treated as collapsible whitespace by endData().
    ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"

    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, exclude_encodings=None,
                 **kwargs):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        :param markup: A string, a bytestring, or an object with a
            read() method whose result is used as the markup.
        :param features: A feature name, or a list of feature names,
            used to look up a tree builder in the builder registry.
            Ignored when `builder` is given.
        :param builder: A tree builder instance to use as-is.
        :param parse_only: An object consulted by endData() and
            handle_starttag() to decide which top-level strings and
            tags are kept (its `text`, `search` and `search_tag`
            members are used).
        :param from_encoding: Passed to the builder's prepare_markup().
        :param exclude_encodings: Passed to the builder's
            prepare_markup().
        :param kwargs: Only the legacy Beautiful Soup 3 argument names
            handled below are accepted; each one triggers a warning.

        :raises TypeError: If an unrecognized keyword argument is given.
        :raises FeatureNotFound: If no tree builder offers the
            requested features.
        """

        if 'convertEntities' in kwargs:
            # Drop the legacy argument like the others below; leaving
            # it in place would turn the warning into a TypeError.
            del kwargs['convertEntities']
            warnings.warn(
                "BS4 does not respect the convertEntities argument to the "
                "BeautifulSoup constructor. Entities are always converted "
                "to Unicode characters.")

        if 'markupMassage' in kwargs:
            del kwargs['markupMassage']
            warnings.warn(
                "BS4 does not respect the markupMassage argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for any necessary markup massage.")

        if 'smartQuotesTo' in kwargs:
            del kwargs['smartQuotesTo']
            warnings.warn(
                "BS4 does not respect the smartQuotesTo argument to the "
                "BeautifulSoup constructor. Smart quotes are always converted "
                "to Unicode characters.")

        if 'selfClosingTags' in kwargs:
            del kwargs['selfClosingTags']
            warnings.warn(
                "BS4 does not respect the selfClosingTags argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for understanding self-closing tags.")

        if 'isHTML' in kwargs:
            del kwargs['isHTML']
            warnings.warn(
                "BS4 does not respect the isHTML argument to the "
                "BeautifulSoup constructor. Suggest you use "
                "features='lxml' for HTML and features='lxml-xml' for "
                "XML.")

        def deprecated_argument(old_name, new_name):
            # Pop a renamed legacy keyword argument, warning about it.
            if old_name in kwargs:
                warnings.warn(
                    'The "%s" argument to the BeautifulSoup constructor '
                    'has been renamed to "%s."' % (old_name, new_name))
                return kwargs.pop(old_name)
            return None

        parse_only = parse_only or deprecated_argument(
            "parseOnlyThese", "parse_only")

        from_encoding = from_encoding or deprecated_argument(
            "fromEncoding", "from_encoding")

        if kwargs:
            arg = list(kwargs).pop()
            raise TypeError(
                "__init__() got an unexpected keyword argument '%s'" % arg)

        if builder is None:
            # Remember what the caller literally asked for, so we can
            # tell whether a parser was named explicitly.
            original_features = features
            if isinstance(features, str):
                features = [features]
            if features is None or len(features) == 0:
                features = self.DEFAULT_BUILDER_FEATURES
            builder_class = builder_registry.lookup(*features)
            if builder_class is None:
                raise FeatureNotFound(
                    "Couldn't find a tree builder with the features you "
                    "requested: %s. Do you need to install a parser library?"
                    % ",".join(features))
            builder = builder_class()
            if not (original_features == builder.NAME or
                    original_features in builder.ALTERNATE_NAMES):
                if builder.is_xml:
                    markup_type = "XML"
                else:
                    markup_type = "HTML"
                warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
                    parser=builder.NAME,
                    markup_type=markup_type))

        self.builder = builder
        self.is_xml = builder.is_xml
        self.builder.soup = self

        self.parse_only = parse_only

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        elif len(markup) <= 256:
            # Print out warnings for a couple beginner problems
            # involving passing non-markup to Beautiful Soup.
            # Beautiful Soup will still parse the input as markup,
            # just in case that's what the user really wants.
            self._warn_if_not_markup(markup)

        # The builder may offer several interpretations of the markup;
        # use the first one the parser does not reject.
        for (self.markup, self.original_encoding, self.declared_html_encoding,
         self.contains_replacement_characters) in (
             self.builder.prepare_markup(
                 markup, from_encoding, exclude_encodings=exclude_encodings)):
            self.reset()
            try:
                self._feed()
                break
            except ParserRejectedMarkup:
                pass

        # Clear out the markup and remove the builder's circular
        # reference to this object.
        self.markup = None
        self.builder.soup = None

    @staticmethod
    def _warn_if_not_markup(markup):
        """Warn if `markup` looks like a filename or a URL.

        This is purely diagnostic: `markup` is never modified, so the
        caller parses exactly what the user passed in.
        """
        if isinstance(markup, bytes):
            # Decode only for display, so the message does not contain
            # a b'...' repr.
            displayable = markup.decode("utf8", "replace")
            url_prefixes, space = (b"http:", b"https:"), b" "
        elif isinstance(markup, str):
            displayable = markup
            url_prefixes, space = ("http:", "https:"), " "
        else:
            displayable = markup
            url_prefixes = space = None

        is_file = False
        try:
            if (isinstance(markup, str)
                and not os.path.supports_unicode_filenames):
                possible_filename = markup.encode("utf8")
            else:
                possible_filename = markup
            is_file = os.path.exists(possible_filename)
        except Exception:
            # This is almost certainly a problem involving
            # characters not valid in filenames on this
            # system. Just let it go.
            pass
        if is_file:
            warnings.warn(
                '"%s" looks like a filename, not markup. You should probably '
                'open this file and pass the filehandle into Beautiful Soup.'
                % (displayable,))

        if (url_prefixes is not None
            and markup.startswith(url_prefixes)
            and space not in markup):
            warnings.warn(
                '"%s" looks like a URL. Beautiful Soup is not an HTTP client. '
                'You should probably use an HTTP client to get the document '
                'behind the URL, and feed that document to Beautiful Soup.'
                % (displayable,))

    def __copy__(self):
        """Copy the soup by re-parsing its encoded form with the same
        builder."""
        return type(self)(self.encode(), builder=self.builder)

    def __getstate__(self):
        """Return the instance state to pickle."""
        # Frequently a tree builder can't be pickled.
        d = dict(self.__dict__)
        if 'builder' in d and not self.builder.picklable:
            del d['builder']
        return d

    def _feed(self):
        """Run the tree builder over self.markup and finish the tree."""
        self.builder.reset()

        self.builder.feed(self.markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def reset(self):
        """Return the soup to an empty state, ready for a (new) parse."""
        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
        self.hidden = 1
        self.builder.reset()
        self.current_data = []
        self.currentTag = None
        self.tagStack = []
        self.preserve_whitespace_tag_stack = []
        # Start every parse attempt without a "previous" element, so an
        # element left over from a rejected attempt is never linked in.
        self._most_recent_element = None
        self.pushTag(self)

    def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
        """Create a new tag associated with this soup."""
        return Tag(None, self.builder, name, namespace, nsprefix, attrs)

    def new_string(self, s, subclass=NavigableString):
        """Create a new NavigableString associated with this soup."""
        return subclass(s)

    def insert_before(self, successor):
        """Not supported on the root object; always raises."""
        raise NotImplementedError("BeautifulSoup objects don't support insert_before().")

    def insert_after(self, successor):
        """Not supported on the root object; always raises."""
        raise NotImplementedError("BeautifulSoup objects don't support insert_after().")

    def popTag(self):
        """Pop the innermost open tag and return the new current tag."""
        tag = self.tagStack.pop()
        if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
            self.preserve_whitespace_tag_stack.pop()
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Make `tag` the current tag, adding it to the contents of the
        previous current tag (if any)."""
        if self.currentTag is not None:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]
        if tag.name in self.builder.preserve_whitespace_tags:
            self.preserve_whitespace_tag_stack.append(tag)

    def endData(self, containerClass=NavigableString):
        """Turn the data collected by handle_data() into one string
        object of type `containerClass` and add it to the tree."""
        if self.current_data:
            current_data = ''.join(self.current_data)
            # If whitespace is not preserved, and this string contains
            # nothing but ASCII spaces, replace it with a single space
            # or newline.
            if not self.preserve_whitespace_tag_stack:
                if all(char in self.ASCII_SPACES for char in current_data):
                    if '\n' in current_data:
                        current_data = '\n'
                    else:
                        current_data = ' '

            # Reset the data collector.
            self.current_data = []

            # Should we add this string to the tree at all?
            if self.parse_only and len(self.tagStack) <= 1 and \
                   (not self.parse_only.text or \
                    not self.parse_only.search(current_data)):
                return

            o = containerClass(current_data)
            self.object_was_parsed(o)

    def object_was_parsed(self, o, parent=None, most_recent_element=None):
        """Add an object to the parse tree.

        :param o: The element to add.
        :param parent: The tag that receives `o`; defaults to the
            current tag.
        :param most_recent_element: The element to treat as preceding
            `o`; defaults to the most recently added element.
        """
        # Use explicit None checks rather than truthiness: a tree
        # element can be falsy. NOTE(review): NavigableString is
        # presumably a str subclass, so an empty string node would be
        # falsy -- confirm in .element.
        if parent is None:
            parent = self.currentTag
        if most_recent_element is not None:
            previous_element = most_recent_element
        else:
            previous_element = self._most_recent_element

        next_element = previous_sibling = next_sibling = None
        if isinstance(o, Tag):
            next_element = o.next_element
            next_sibling = o.next_sibling
            previous_sibling = o.previous_sibling
            if previous_element is None:
                previous_element = o.previous_element

        o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)

        self._most_recent_element = o
        parent.contents.append(o)

        if parent.next_sibling is not None:
            # This node is being inserted into an element that has
            # already been parsed. Deal with any dangling references.

            # `o` was appended just above, so it is the last child.
            # Searching with contents.index(o) compares by equality
            # and could find an earlier, equal child instead.
            index = len(parent.contents) - 1
            if index == 0:
                previous_element = parent
                previous_sibling = None
            else:
                previous_element = previous_sibling = parent.contents[index-1]
            next_element = parent.next_sibling
            next_sibling = None

            o.previous_element = previous_element
            if previous_element is not None:
                previous_element.next_element = o
            o.next_element = next_element
            if next_element is not None:
                next_element.previous_element = o
            o.next_sibling = next_sibling
            if next_sibling is not None:
                next_sibling.previous_sibling = o
            o.previous_sibling = previous_sibling
            if previous_sibling is not None:
                previous_sibling.next_sibling = o

    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag.

        Returns the result of the last popTag() call made, or None if
        nothing was popped."""
        if name == self.ROOT_TAG_NAME:
            # The BeautifulSoup object itself can never be popped.
            return

        most_recently_popped = None

        # Walk from the top of the stack down to, but excluding, the
        # root at index 0.
        stack_size = len(self.tagStack)
        for i in range(stack_size - 1, 0, -1):
            t = self.tagStack[i]
            if (name == t.name and nsprefix == t.prefix):
                if inclusivePop:
                    most_recently_popped = self.popTag()
                break
            most_recently_popped = self.popTag()

        return most_recently_popped

    def handle_starttag(self, name, namespace, nsprefix, attrs):
        """Push a start tag on to the stack.

        If this method returns None, the tag was rejected by the
        SoupStrainer. You should proceed as if the tag had not occurred
        in the document. For instance, if this was a self-closing tag,
        don't call handle_endtag.
        """
        self.endData()

        if (self.parse_only and len(self.tagStack) <= 1
            and (self.parse_only.text
                 or not self.parse_only.search_tag(name, attrs))):
            return None

        tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
                  self.currentTag, self._most_recent_element)
        if self._most_recent_element is not None:
            self._most_recent_element.next_element = tag
        self._most_recent_element = tag
        self.pushTag(tag)
        return tag

    def handle_endtag(self, name, nsprefix=None):
        """Flush pending data, then close the most recent open tag with
        this name and prefix (and everything opened after it)."""
        self.endData()
        self._popToTag(name, nsprefix)

    def handle_data(self, data):
        """Collect a piece of character data for the current string."""
        self.current_data.append(data)

    def decode(self, pretty_print=False,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
        """Returns a string representation of this document.

        For XML documents the result starts with an XML declaration,
        which names `eventual_encoding` unless that is None.

        :param pretty_print: If true, output is indented starting at
            indent level 0; otherwise no indent level is used.
        :param eventual_encoding: Named in the XML declaration and
            passed on to Tag.decode().
        :param formatter: Passed on to Tag.decode().
        """

        if self.is_xml:
            # Print the XML declaration
            encoding_part = ''
            if eventual_encoding is not None:
                encoding_part = ' encoding="%s"' % eventual_encoding
            prefix = '<?xml version="1.0"%s?>\n' % encoding_part
        else:
            prefix = ''
        if not pretty_print:
            indent_level = None
        else:
            indent_level = 0
        return prefix + super(BeautifulSoup, self).decode(
            indent_level, eventual_encoding, formatter)
441
# Short aliases for the main class, to make imports quicker to type:
# 'from bs4 import _soup' (or '_s').
_s = _soup = BeautifulSoup
445
class BeautifulStoneSoup(BeautifulSoup):
    """Deprecated interface to an XML parser.

    Behaves like BeautifulSoup with features forced to 'xml', and
    issues a warning every time it is instantiated.
    """

    def __init__(self, *args, **kwargs):
        """Warn about the deprecation, then build an XML soup.

        Any `features` keyword supplied by the caller is overridden.
        """
        deprecation_notice = (
            'The BeautifulStoneSoup class is deprecated. Instead of using '
            'it, pass features="xml" into the BeautifulSoup constructor.')
        kwargs.update(features='xml')
        warnings.warn(deprecation_notice)
        super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
455
456
class StopParsing(Exception):
    """Exception type exported by this module for halting a parse.

    NOTE(review): nothing in this module raises or catches it;
    presumably tree builders use it -- confirm against the builders.
    """
459
class FeatureNotFound(ValueError):
    """Raised by the BeautifulSoup constructor when the builder
    registry has no tree builder with the requested features."""
462
463
# When run as a script, act as an HTML pretty-printer: parse the markup
# read from standard input and print the prettified document.
if __name__ == '__main__':
    import sys
    print(BeautifulSoup(sys.stdin).prettify())
469