xref: /openbmc/openbmc/poky/bitbake/lib/bs4/element.py (revision edff4923)
1# Use of this source code is governed by the MIT license.
2__license__ = "MIT"
3
4try:
5    from collections.abc import Callable # Python 3.6
6except ImportError as e:
7    from collections import Callable
8import re
9import sys
10import warnings
11
12from bs4.css import CSS
13from bs4.formatter import (
14    Formatter,
15    HTMLFormatter,
16    XMLFormatter,
17)
18
# Name of a character encoding. Presumably the encoding used when a
# document is serialized without an explicit one -- TODO confirm
# against the callers, which are not visible here.
DEFAULT_OUTPUT_ENCODING = "utf-8"

# Matches a run of one or more non-whitespace characters.
nonwhitespace_re = re.compile(r"\S+")

# NOTE: This isn't used as of 4.7.0. I'm leaving it for a little bit on
# the off chance someone imported it for their own use.
# Matches a run of one or more whitespace characters.
whitespace_re = re.compile(r"\s+")
26
27def _alias(attr):
28    """Alias one attribute name to another for backward compatibility"""
29    @property
30    def alias(self):
31        return getattr(self, attr)
32
33    @alias.setter
34    def alias(self):
35        return setattr(self, attr)
36    return alias
37
38
39# These encodings are recognized by Python (so PageElement.encode
40# could theoretically support them) but XML and HTML don't recognize
41# them (so they should not show up in an XML or HTML document as that
42# document's encoding).
43#
44# If an XML document is encoded in one of these encodings, no encoding
45# will be mentioned in the XML declaration. If an HTML document is
46# encoded in one of these encodings, and the HTML document has a
47# <meta> tag that mentions an encoding, the encoding will be given as
48# the empty string.
49#
50# Source:
51# https://docs.python.org/3/library/codecs.html#python-specific-encodings
PYTHON_SPECIFIC_ENCODINGS = {
    "idna",
    "mbcs",
    "oem",
    "palmos",
    "punycode",
    # Both spellings (underscore and hyphen) of the escape codecs are
    # listed, since membership is tested by exact name.
    "raw-unicode-escape",
    "raw_unicode_escape",
    "string-escape",
    "string_escape",
    "undefined",
    "unicode-escape",
    "unicode_escape",
}
66
67
class NamespacedAttribute(str):
    """A string such as 'xml:lang' that also records the pieces it was
    built from: the prefix ('xml'), the name ('lang') and the namespace.
    """

    def __new__(cls, prefix, name=None, namespace=None):
        # An empty name denotes the default namespace, whose name
        # "has no value" per https://www.w3.org/TR/xml-names/#defaulting
        name = name or None

        if name is None:
            text = prefix
        elif prefix:
            text = prefix + ":" + name
        else:
            # Not really namespaced.
            text = name

        instance = str.__new__(cls, text)
        instance.prefix = prefix
        instance.name = name
        instance.namespace = namespace
        return instance
90
class AttributeValueWithCharsetSubstitution(str):
    """A stand-in object for a character encoding specified in HTML.

    This base class adds nothing to `str` itself; the subclasses in
    this module override `encode` to produce the attribute value that
    matches the encoding a document is being converted to.
    """
93
class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a meta tag's 'charset' attribute.

    Parsing the markup '<meta charset="utf8">' yields one of these
    objects as the value of the 'charset' attribute.
    """

    def __new__(cls, original_value):
        instance = str.__new__(cls, original_value)
        # Remember what the markup originally said.
        instance.original_value = original_value
        return instance

    def encode(self, encoding):
        """Return the charset value to use when the document is encoded
        as `encoding`: the encoding's own name, or the empty string for
        an encoding that only Python recognizes.
        """
        return '' if encoding in PYTHON_SPECIFIC_ENCODINGS else encoding
113
114
class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a meta tag's 'content' attribute.

    Parsing the markup:
     <meta http-equiv="content-type" content="text/html; charset=utf8">

    yields one of these objects as the value of the 'content'
    attribute, provided the value actually mentions a charset.
    """

    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value):
        if cls.CHARSET_RE.search(original_value) is None:
            # No substitution necessary: hand back a plain string.
            return str.__new__(str, original_value)

        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def encode(self, encoding):
        """Return the original value with its charset replaced by
        `encoding`, or the empty string for an encoding that only
        Python recognizes.
        """
        if encoding in PYTHON_SPECIFIC_ENCODINGS:
            return ''
        # Keep the "charset=" prefix (group 1) and swap in the new name.
        return self.CHARSET_RE.sub(
            lambda found: found.group(1) + encoding, self.original_value)
142
143
144class PageElement(object):
145    """Contains the navigational information for some part of the page:
146    that is, its current location in the parse tree.
147
148    NavigableString, Tag, etc. are all subclasses of PageElement.
149    """
150
151    # In general, we can't tell just by looking at an element whether
152    # it's contained in an XML document or an HTML document. But for
153    # Tags (q.v.) we can store this information at parse time.
154    known_xml = None
155
156    def setup(self, parent=None, previous_element=None, next_element=None,
157              previous_sibling=None, next_sibling=None):
158        """Sets up the initial relations between this element and
159        other elements.
160
161        :param parent: The parent of this element.
162
163        :param previous_element: The element parsed immediately before
164            this one.
165
166        :param next_element: The element parsed immediately before
167            this one.
168
169        :param previous_sibling: The most recently encountered element
170            on the same level of the parse tree as this one.
171
172        :param previous_sibling: The next element to be encountered
173            on the same level of the parse tree as this one.
174        """
175        self.parent = parent
176
177        self.previous_element = previous_element
178        if previous_element is not None:
179            self.previous_element.next_element = self
180
181        self.next_element = next_element
182        if self.next_element is not None:
183            self.next_element.previous_element = self
184
185        self.next_sibling = next_sibling
186        if self.next_sibling is not None:
187            self.next_sibling.previous_sibling = self
188
189        if (previous_sibling is None
190            and self.parent is not None and self.parent.contents):
191            previous_sibling = self.parent.contents[-1]
192
193        self.previous_sibling = previous_sibling
194        if previous_sibling is not None:
195            self.previous_sibling.next_sibling = self
196
197    def format_string(self, s, formatter):
198        """Format the given string using the given formatter.
199
200        :param s: A string.
201        :param formatter: A Formatter object, or a string naming one of the standard formatters.
202        """
203        if formatter is None:
204            return s
205        if not isinstance(formatter, Formatter):
206            formatter = self.formatter_for_name(formatter)
207        output = formatter.substitute(s)
208        return output
209
210    def formatter_for_name(self, formatter):
211        """Look up or create a Formatter for the given identifier,
212        if necessary.
213
214        :param formatter: Can be a Formatter object (used as-is), a
215            function (used as the entity substitution hook for an
216            XMLFormatter or HTMLFormatter), or a string (used to look
217            up an XMLFormatter or HTMLFormatter in the appropriate
218            registry.
219        """
220        if isinstance(formatter, Formatter):
221            return formatter
222        if self._is_xml:
223            c = XMLFormatter
224        else:
225            c = HTMLFormatter
226        if isinstance(formatter, Callable):
227            return c(entity_substitution=formatter)
228        return c.REGISTRY[formatter]
229
230    @property
231    def _is_xml(self):
232        """Is this element part of an XML tree or an HTML tree?
233
234        This is used in formatter_for_name, when deciding whether an
235        XMLFormatter or HTMLFormatter is more appropriate. It can be
236        inefficient, but it should be called very rarely.
237        """
238        if self.known_xml is not None:
239            # Most of the time we will have determined this when the
240            # document is parsed.
241            return self.known_xml
242
243        # Otherwise, it's likely that this element was created by
244        # direct invocation of the constructor from within the user's
245        # Python code.
246        if self.parent is None:
247            # This is the top-level object. It should have .known_xml set
248            # from tree creation. If not, take a guess--BS is usually
249            # used on HTML markup.
250            return getattr(self, 'is_xml', False)
251        return self.parent._is_xml
252
253    nextSibling = _alias("next_sibling")  # BS3
254    previousSibling = _alias("previous_sibling")  # BS3
255
256    default = object()
257    def _all_strings(self, strip=False, types=default):
258        """Yield all strings of certain classes, possibly stripping them.
259
260        This is implemented differently in Tag and NavigableString.
261        """
262        raise NotImplementedError()
263
264    @property
265    def stripped_strings(self):
266        """Yield all strings in this PageElement, stripping them first.
267
268        :yield: A sequence of stripped strings.
269        """
270        for string in self._all_strings(True):
271            yield string
272
273    def get_text(self, separator="", strip=False,
274                 types=default):
275        """Get all child strings of this PageElement, concatenated using the
276        given separator.
277
278        :param separator: Strings will be concatenated using this separator.
279
280        :param strip: If True, strings will be stripped before being
281            concatenated.
282
283        :param types: A tuple of NavigableString subclasses. Any
284            strings of a subclass not found in this list will be
285            ignored. Although there are exceptions, the default
286            behavior in most cases is to consider only NavigableString
287            and CData objects. That means no comments, processing
288            instructions, etc.
289
290        :return: A string.
291        """
292        return separator.join([s for s in self._all_strings(
293                    strip, types=types)])
294    getText = get_text
295    text = property(get_text)
296
297    def replace_with(self, *args):
298        """Replace this PageElement with one or more PageElements, keeping the
299        rest of the tree the same.
300
301        :param args: One or more PageElements.
302        :return: `self`, no longer part of the tree.
303        """
304        if self.parent is None:
305            raise ValueError(
306                "Cannot replace one element with another when the "
307                "element to be replaced is not part of a tree.")
308        if len(args) == 1 and args[0] is self:
309            return
310        if any(x is self.parent for x in args):
311            raise ValueError("Cannot replace a Tag with its parent.")
312        old_parent = self.parent
313        my_index = self.parent.index(self)
314        self.extract(_self_index=my_index)
315        for idx, replace_with in enumerate(args, start=my_index):
316            old_parent.insert(idx, replace_with)
317        return self
318    replaceWith = replace_with  # BS3
319
320    def unwrap(self):
321        """Replace this PageElement with its contents.
322
323        :return: `self`, no longer part of the tree.
324        """
325        my_parent = self.parent
326        if self.parent is None:
327            raise ValueError(
328                "Cannot replace an element with its contents when that"
329                "element is not part of a tree.")
330        my_index = self.parent.index(self)
331        self.extract(_self_index=my_index)
332        for child in reversed(self.contents[:]):
333            my_parent.insert(my_index, child)
334        return self
335    replace_with_children = unwrap
336    replaceWithChildren = unwrap  # BS3
337
338    def wrap(self, wrap_inside):
339        """Wrap this PageElement inside another one.
340
341        :param wrap_inside: A PageElement.
342        :return: `wrap_inside`, occupying the position in the tree that used
343           to be occupied by `self`, and with `self` inside it.
344        """
345        me = self.replace_with(wrap_inside)
346        wrap_inside.append(me)
347        return wrap_inside
348
349    def extract(self, _self_index=None):
350        """Destructively rips this element out of the tree.
351
352        :param _self_index: The location of this element in its parent's
353           .contents, if known. Passing this in allows for a performance
354           optimization.
355
356        :return: `self`, no longer part of the tree.
357        """
358        if self.parent is not None:
359            if _self_index is None:
360                _self_index = self.parent.index(self)
361            del self.parent.contents[_self_index]
362
363        #Find the two elements that would be next to each other if
364        #this element (and any children) hadn't been parsed. Connect
365        #the two.
366        last_child = self._last_descendant()
367        next_element = last_child.next_element
368
369        if (self.previous_element is not None and
370            self.previous_element is not next_element):
371            self.previous_element.next_element = next_element
372        if next_element is not None and next_element is not self.previous_element:
373            next_element.previous_element = self.previous_element
374        self.previous_element = None
375        last_child.next_element = None
376
377        self.parent = None
378        if (self.previous_sibling is not None
379            and self.previous_sibling is not self.next_sibling):
380            self.previous_sibling.next_sibling = self.next_sibling
381        if (self.next_sibling is not None
382            and self.next_sibling is not self.previous_sibling):
383            self.next_sibling.previous_sibling = self.previous_sibling
384        self.previous_sibling = self.next_sibling = None
385        return self
386
387    def _last_descendant(self, is_initialized=True, accept_self=True):
388        """Finds the last element beneath this object to be parsed.
389
390        :param is_initialized: Has `setup` been called on this PageElement
391            yet?
392        :param accept_self: Is `self` an acceptable answer to the question?
393        """
394        if is_initialized and self.next_sibling is not None:
395            last_child = self.next_sibling.previous_element
396        else:
397            last_child = self
398            while isinstance(last_child, Tag) and last_child.contents:
399                last_child = last_child.contents[-1]
400        if not accept_self and last_child is self:
401            last_child = None
402        return last_child
403    # BS3: Not part of the API!
404    _lastRecursiveChild = _last_descendant
405
    def insert(self, position, new_child):
        """Insert a new PageElement in the list of this PageElement's children.

        Besides adding `new_child` to `self.contents`, this rewires the
        parent, sibling and next/previous-element links of the new
        child and of its new neighbors.

        :param position: The numeric position that should be occupied
           in `self.contents` by the new PageElement. A position past
           the end is clamped to the end.
        :param new_child: A PageElement. A plain string is wrapped in a
           NavigableString first. A BeautifulSoup object is not inserted
           itself; its children are inserted one at a time instead.
        :raises ValueError: If `new_child` is None or is `self`.
        """
        if new_child is None:
            raise ValueError("Cannot insert None into a tag.")
        if new_child is self:
            raise ValueError("Cannot insert a tag into itself.")
        if (isinstance(new_child, str)
            and not isinstance(new_child, NavigableString)):
            new_child = NavigableString(new_child)

        # Imported inside the function rather than at module level.
        from bs4 import BeautifulSoup
        if isinstance(new_child, BeautifulSoup):
            # We don't want to end up with a situation where one BeautifulSoup
            # object contains another. Insert the children one at a time.
            for subchild in list(new_child.contents):
                self.insert(position, subchild)
                position += 1
            return
        position = min(position, len(self.contents))
        if hasattr(new_child, 'parent') and new_child.parent is not None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if new_child.parent is self:
                current_index = self.index(new_child)
                if current_index < position:
                    # We're moving this element further down the list
                    # of this object's children. That means that when
                    # we extract this element, our target index will
                    # jump down one.
                    position -= 1
            # Detach the child from wherever it currently lives.
            new_child.extract()

        new_child.parent = self
        previous_child = None
        if position == 0:
            # First child: no previous sibling, and the element parsed
            # just before it is this object.
            new_child.previous_sibling = None
            new_child.previous_element = self
        else:
            previous_child = self.contents[position - 1]
            new_child.previous_sibling = previous_child
            new_child.previous_sibling.next_sibling = new_child
            # The element before the new child is the last descendant
            # of the sibling that precedes it.
            new_child.previous_element = previous_child._last_descendant(False)
        if new_child.previous_element is not None:
            new_child.previous_element.next_element = new_child

        # The new child may bring descendants with it; the forward
        # element link has to leave from the last of them.
        new_childs_last_element = new_child._last_descendant(False)

        if position >= len(self.contents):
            # Appending at the end: there is no next sibling.
            new_child.next_sibling = None

            # Walk up the ancestors until one of them has a next
            # sibling; that sibling is what comes next in the document.
            parent = self
            parents_next_sibling = None
            while parents_next_sibling is None and parent is not None:
                parents_next_sibling = parent.next_sibling
                parent = parent.parent
                if parents_next_sibling is not None:
                    # We found the element that comes next in the document.
                    break
            if parents_next_sibling is not None:
                new_childs_last_element.next_element = parents_next_sibling
            else:
                # The last element of this tag is the last element in
                # the document.
                new_childs_last_element.next_element = None
        else:
            # Inserting before an existing child.
            next_child = self.contents[position]
            new_child.next_sibling = next_child
            if new_child.next_sibling is not None:
                new_child.next_sibling.previous_sibling = new_child
            new_childs_last_element.next_element = next_child

        # Fix the back-link of whatever now follows the new subtree.
        if new_childs_last_element.next_element is not None:
            new_childs_last_element.next_element.previous_element = new_childs_last_element
        self.contents.insert(position, new_child)
487
488    def append(self, tag):
489        """Appends the given PageElement to the contents of this one.
490
491        :param tag: A PageElement.
492        """
493        self.insert(len(self.contents), tag)
494
495    def extend(self, tags):
496        """Appends the given PageElements to this one's contents.
497
498        :param tags: A list of PageElements. If a single Tag is
499            provided instead, this PageElement's contents will be extended
500            with that Tag's contents.
501        """
502        if isinstance(tags, Tag):
503            tags = tags.contents
504        if isinstance(tags, list):
505            # Moving items around the tree may change their position in
506            # the original list. Make a list that won't change.
507            tags = list(tags)
508        for tag in tags:
509            self.append(tag)
510
511    def insert_before(self, *args):
512        """Makes the given element(s) the immediate predecessor of this one.
513
514        All the elements will have the same parent, and the given elements
515        will be immediately before this one.
516
517        :param args: One or more PageElements.
518        """
519        parent = self.parent
520        if parent is None:
521            raise ValueError(
522                "Element has no parent, so 'before' has no meaning.")
523        if any(x is self for x in args):
524                raise ValueError("Can't insert an element before itself.")
525        for predecessor in args:
526            # Extract first so that the index won't be screwed up if they
527            # are siblings.
528            if isinstance(predecessor, PageElement):
529                predecessor.extract()
530            index = parent.index(self)
531            parent.insert(index, predecessor)
532
533    def insert_after(self, *args):
534        """Makes the given element(s) the immediate successor of this one.
535
536        The elements will have the same parent, and the given elements
537        will be immediately after this one.
538
539        :param args: One or more PageElements.
540        """
541        # Do all error checking before modifying the tree.
542        parent = self.parent
543        if parent is None:
544            raise ValueError(
545                "Element has no parent, so 'after' has no meaning.")
546        if any(x is self for x in args):
547            raise ValueError("Can't insert an element after itself.")
548
549        offset = 0
550        for successor in args:
551            # Extract first so that the index won't be screwed up if they
552            # are siblings.
553            if isinstance(successor, PageElement):
554                successor.extract()
555            index = parent.index(self)
556            parent.insert(index+1+offset, successor)
557            offset += 1
558
559    def find_next(self, name=None, attrs={}, string=None, **kwargs):
560        """Find the first PageElement that matches the given criteria and
561        appears later in the document than this PageElement.
562
563        All find_* methods take a common set of arguments. See the online
564        documentation for detailed explanations.
565
566        :param name: A filter on tag name.
567        :param attrs: A dictionary of filters on attribute values.
568        :param string: A filter for a NavigableString with specific text.
569        :kwargs: A dictionary of filters on attribute values.
570        :return: A PageElement.
571        :rtype: bs4.element.Tag | bs4.element.NavigableString
572        """
573        return self._find_one(self.find_all_next, name, attrs, string, **kwargs)
574    findNext = find_next  # BS3
575
576    def find_all_next(self, name=None, attrs={}, string=None, limit=None,
577                    **kwargs):
578        """Find all PageElements that match the given criteria and appear
579        later in the document than this PageElement.
580
581        All find_* methods take a common set of arguments. See the online
582        documentation for detailed explanations.
583
584        :param name: A filter on tag name.
585        :param attrs: A dictionary of filters on attribute values.
586        :param string: A filter for a NavigableString with specific text.
587        :param limit: Stop looking after finding this many results.
588        :kwargs: A dictionary of filters on attribute values.
589        :return: A ResultSet containing PageElements.
590        """
591        _stacklevel = kwargs.pop('_stacklevel', 2)
592        return self._find_all(name, attrs, string, limit, self.next_elements,
593                              _stacklevel=_stacklevel+1, **kwargs)
594    findAllNext = find_all_next  # BS3
595
596    def find_next_sibling(self, name=None, attrs={}, string=None, **kwargs):
597        """Find the closest sibling to this PageElement that matches the
598        given criteria and appears later in the document.
599
600        All find_* methods take a common set of arguments. See the
601        online documentation for detailed explanations.
602
603        :param name: A filter on tag name.
604        :param attrs: A dictionary of filters on attribute values.
605        :param string: A filter for a NavigableString with specific text.
606        :kwargs: A dictionary of filters on attribute values.
607        :return: A PageElement.
608        :rtype: bs4.element.Tag | bs4.element.NavigableString
609        """
610        return self._find_one(self.find_next_siblings, name, attrs, string,
611                             **kwargs)
612    findNextSibling = find_next_sibling  # BS3
613
614    def find_next_siblings(self, name=None, attrs={}, string=None, limit=None,
615                           **kwargs):
616        """Find all siblings of this PageElement that match the given criteria
617        and appear later in the document.
618
619        All find_* methods take a common set of arguments. See the online
620        documentation for detailed explanations.
621
622        :param name: A filter on tag name.
623        :param attrs: A dictionary of filters on attribute values.
624        :param string: A filter for a NavigableString with specific text.
625        :param limit: Stop looking after finding this many results.
626        :kwargs: A dictionary of filters on attribute values.
627        :return: A ResultSet of PageElements.
628        :rtype: bs4.element.ResultSet
629        """
630        _stacklevel = kwargs.pop('_stacklevel', 2)
631        return self._find_all(
632            name, attrs, string, limit,
633            self.next_siblings, _stacklevel=_stacklevel+1, **kwargs
634        )
635    findNextSiblings = find_next_siblings   # BS3
636    fetchNextSiblings = find_next_siblings  # BS2
637
638    def find_previous(self, name=None, attrs={}, string=None, **kwargs):
639        """Look backwards in the document from this PageElement and find the
640        first PageElement that matches the given criteria.
641
642        All find_* methods take a common set of arguments. See the online
643        documentation for detailed explanations.
644
645        :param name: A filter on tag name.
646        :param attrs: A dictionary of filters on attribute values.
647        :param string: A filter for a NavigableString with specific text.
648        :kwargs: A dictionary of filters on attribute values.
649        :return: A PageElement.
650        :rtype: bs4.element.Tag | bs4.element.NavigableString
651        """
652        return self._find_one(
653            self.find_all_previous, name, attrs, string, **kwargs)
654    findPrevious = find_previous  # BS3
655
656    def find_all_previous(self, name=None, attrs={}, string=None, limit=None,
657                        **kwargs):
658        """Look backwards in the document from this PageElement and find all
659        PageElements that match the given criteria.
660
661        All find_* methods take a common set of arguments. See the online
662        documentation for detailed explanations.
663
664        :param name: A filter on tag name.
665        :param attrs: A dictionary of filters on attribute values.
666        :param string: A filter for a NavigableString with specific text.
667        :param limit: Stop looking after finding this many results.
668        :kwargs: A dictionary of filters on attribute values.
669        :return: A ResultSet of PageElements.
670        :rtype: bs4.element.ResultSet
671        """
672        _stacklevel = kwargs.pop('_stacklevel', 2)
673        return self._find_all(
674            name, attrs, string, limit, self.previous_elements,
675            _stacklevel=_stacklevel+1, **kwargs
676        )
677    findAllPrevious = find_all_previous  # BS3
678    fetchPrevious = find_all_previous    # BS2
679
680    def find_previous_sibling(self, name=None, attrs={}, string=None, **kwargs):
681        """Returns the closest sibling to this PageElement that matches the
682        given criteria and appears earlier in the document.
683
684        All find_* methods take a common set of arguments. See the online
685        documentation for detailed explanations.
686
687        :param name: A filter on tag name.
688        :param attrs: A dictionary of filters on attribute values.
689        :param string: A filter for a NavigableString with specific text.
690        :kwargs: A dictionary of filters on attribute values.
691        :return: A PageElement.
692        :rtype: bs4.element.Tag | bs4.element.NavigableString
693        """
694        return self._find_one(self.find_previous_siblings, name, attrs, string,
695                             **kwargs)
696    findPreviousSibling = find_previous_sibling  # BS3
697
698    def find_previous_siblings(self, name=None, attrs={}, string=None,
699                               limit=None, **kwargs):
700        """Returns all siblings to this PageElement that match the
701        given criteria and appear earlier in the document.
702
703        All find_* methods take a common set of arguments. See the online
704        documentation for detailed explanations.
705
706        :param name: A filter on tag name.
707        :param attrs: A dictionary of filters on attribute values.
708        :param string: A filter for a NavigableString with specific text.
709        :param limit: Stop looking after finding this many results.
710        :kwargs: A dictionary of filters on attribute values.
711        :return: A ResultSet of PageElements.
712        :rtype: bs4.element.ResultSet
713        """
714        _stacklevel = kwargs.pop('_stacklevel', 2)
715        return self._find_all(
716            name, attrs, string, limit,
717            self.previous_siblings, _stacklevel=_stacklevel+1, **kwargs
718        )
719    findPreviousSiblings = find_previous_siblings   # BS3
720    fetchPreviousSiblings = find_previous_siblings  # BS2
721
722    def find_parent(self, name=None, attrs={}, **kwargs):
723        """Find the closest parent of this PageElement that matches the given
724        criteria.
725
726        All find_* methods take a common set of arguments. See the online
727        documentation for detailed explanations.
728
729        :param name: A filter on tag name.
730        :param attrs: A dictionary of filters on attribute values.
731        :kwargs: A dictionary of filters on attribute values.
732
733        :return: A PageElement.
734        :rtype: bs4.element.Tag | bs4.element.NavigableString
735        """
736        # NOTE: We can't use _find_one because findParents takes a different
737        # set of arguments.
738        r = None
739        l = self.find_parents(name, attrs, 1, _stacklevel=3, **kwargs)
740        if l:
741            r = l[0]
742        return r
743    findParent = find_parent  # BS3
744
745    def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
746        """Find all parents of this PageElement that match the given criteria.
747
748        All find_* methods take a common set of arguments. See the online
749        documentation for detailed explanations.
750
751        :param name: A filter on tag name.
752        :param attrs: A dictionary of filters on attribute values.
753        :param limit: Stop looking after finding this many results.
754        :kwargs: A dictionary of filters on attribute values.
755
756        :return: A PageElement.
757        :rtype: bs4.element.Tag | bs4.element.NavigableString
758        """
759        _stacklevel = kwargs.pop('_stacklevel', 2)
760        return self._find_all(name, attrs, None, limit, self.parents,
761                              _stacklevel=_stacklevel+1, **kwargs)
762    findParents = find_parents   # BS3
763    fetchParents = find_parents  # BS2
764
765    @property
766    def next(self):
767        """The PageElement, if any, that was parsed just after this one.
768
769        :return: A PageElement.
770        :rtype: bs4.element.Tag | bs4.element.NavigableString
771        """
772        return self.next_element
773
774    @property
775    def previous(self):
776        """The PageElement, if any, that was parsed just before this one.
777
778        :return: A PageElement.
779        :rtype: bs4.element.Tag | bs4.element.NavigableString
780        """
781        return self.previous_element
782
783    #These methods do the real heavy lifting.
784
785    def _find_one(self, method, name, attrs, string, **kwargs):
786        r = None
787        l = method(name, attrs, string, 1, _stacklevel=4, **kwargs)
788        if l:
789            r = l[0]
790        return r
791
792    def _find_all(self, name, attrs, string, limit, generator, **kwargs):
793        "Iterates over a generator looking for things that match."
794        _stacklevel = kwargs.pop('_stacklevel', 3)
795
796        if string is None and 'text' in kwargs:
797            string = kwargs.pop('text')
798            warnings.warn(
799                "The 'text' argument to find()-type methods is deprecated. Use 'string' instead.",
800                DeprecationWarning, stacklevel=_stacklevel
801            )
802
803        if isinstance(name, SoupStrainer):
804            strainer = name
805        else:
806            strainer = SoupStrainer(name, attrs, string, **kwargs)
807
808        if string is None and not limit and not attrs and not kwargs:
809            if name is True or name is None:
810                # Optimization to find all tags.
811                result = (element for element in generator
812                          if isinstance(element, Tag))
813                return ResultSet(strainer, result)
814            elif isinstance(name, str):
815                # Optimization to find all tags with a given name.
816                if name.count(':') == 1:
817                    # This is a name with a prefix. If this is a namespace-aware document,
818                    # we need to match the local name against tag.name. If not,
819                    # we need to match the fully-qualified name against tag.name.
820                    prefix, local_name = name.split(':', 1)
821                else:
822                    prefix = None
823                    local_name = name
824                result = (element for element in generator
825                          if isinstance(element, Tag)
826                          and (
827                              element.name == name
828                          ) or (
829                              element.name == local_name
830                              and (prefix is None or element.prefix == prefix)
831                          )
832                )
833                return ResultSet(strainer, result)
834        results = ResultSet(strainer)
835        while True:
836            try:
837                i = next(generator)
838            except StopIteration:
839                break
840            if i:
841                found = strainer.search(i)
842                if found:
843                    results.append(found)
844                    if limit and len(results) >= limit:
845                        break
846        return results
847
848    #These generators can be used to navigate starting from both
849    #NavigableStrings and Tags.
850    @property
851    def next_elements(self):
852        """All PageElements that were parsed after this one.
853
854        :yield: A sequence of PageElements.
855        """
856        i = self.next_element
857        while i is not None:
858            yield i
859            i = i.next_element
860
861    @property
862    def next_siblings(self):
863        """All PageElements that are siblings of this one but were parsed
864        later.
865
866        :yield: A sequence of PageElements.
867        """
868        i = self.next_sibling
869        while i is not None:
870            yield i
871            i = i.next_sibling
872
873    @property
874    def previous_elements(self):
875        """All PageElements that were parsed before this one.
876
877        :yield: A sequence of PageElements.
878        """
879        i = self.previous_element
880        while i is not None:
881            yield i
882            i = i.previous_element
883
884    @property
885    def previous_siblings(self):
886        """All PageElements that are siblings of this one but were parsed
887        earlier.
888
889        :yield: A sequence of PageElements.
890        """
891        i = self.previous_sibling
892        while i is not None:
893            yield i
894            i = i.previous_sibling
895
896    @property
897    def parents(self):
898        """All PageElements that are parents of this PageElement.
899
900        :yield: A sequence of PageElements.
901        """
902        i = self.parent
903        while i is not None:
904            yield i
905            i = i.parent
906
907    @property
908    def decomposed(self):
909        """Check whether a PageElement has been decomposed.
910
911        :rtype: bool
912        """
913        return getattr(self, '_decomposed', False) or False
914
915    # Old non-property versions of the generators, for backwards
916    # compatibility with BS3.
917    def nextGenerator(self):
918        return self.next_elements
919
920    def nextSiblingGenerator(self):
921        return self.next_siblings
922
923    def previousGenerator(self):
924        return self.previous_elements
925
926    def previousSiblingGenerator(self):
927        return self.previous_siblings
928
929    def parentGenerator(self):
930        return self.parents
931
932
class NavigableString(str, PageElement):
    """A Python Unicode string that lives inside a parse tree.

    For the markup <b>penguin</b>, Beautiful Soup creates one
    NavigableString holding the text "penguin".
    """

    PREFIX = ''
    SUFFIX = ''

    def __new__(cls, value):
        """Create a new NavigableString.

        A `value` that is not already a str (this is what happens when
        a NavigableString is unpickled) is handed to str.__new__
        together with DEFAULT_OUTPUT_ENCODING, so that non-ASCII
        characters are handled correctly.
        """
        if not isinstance(value, str):
            instance = str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
        else:
            instance = str.__new__(cls, value)
        instance.setup()
        return instance

    def __deepcopy__(self, memo, recursive=False):
        """Build a string of the same class and content that is not
        connected to the parse tree.

        :param memo: Unused.
        :param recursive: This parameter is ignored; it's only defined
           so that NavigableString.__deepcopy__ implements the same
           signature as Tag.__deepcopy__.
        """
        same_class = type(self)
        return same_class(self)

    def __copy__(self):
        """A shallow copy is carried out as a deep copy, because only
        one PageElement can occupy a given place in a parse tree.
        """
        memo = {}
        return self.__deepcopy__(memo)

    def __getnewargs__(self):
        """Supply the plain-str value as the only argument to __new__."""
        plain = str(self)
        return (plain,)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper.

        :raises AttributeError: For every attribute other than 'string'.
        """
        if attr != 'string':
            raise AttributeError(
                "'%s' object has no attribute '%s'" % (
                    self.__class__.__name__, attr))
        return self

    def output_ready(self, formatter="minimal"):
        """Run the string through the provided formatter and wrap the
        result in this class's PREFIX and SUFFIX.

        :param formatter: A Formatter object, or a string naming one of the standard formatters.
        """
        formatted = self.format_string(self, formatter)
        return self.PREFIX + formatted + self.SUFFIX

    @property
    def name(self):
        """A NavigableString is not a Tag, so its .name is always None.

        The property exists so that code like this doesn't crash when
        run on a mixture of Tag and NavigableString objects:
            [x.name for x in tag.children]
        """
        return None

    @name.setter
    def name(self, name):
        """Refuse every attempt to give a NavigableString a name."""
        raise AttributeError("A NavigableString cannot be given a name.")

    def _all_strings(self, strip=False, types=PageElement.default):
        """Yield this string, if it is of an acceptable class and
        non-empty, possibly stripping it first.

        This lets NavigableString offer the same text-extraction
        interface as Tag.

        :param strip: If True, the string is stripped before being
            yielded.

        :param types: A NavigableString subclass or a tuple of them.
            If the exact class of this string is not among them,
            nothing is yielded. None accepts every class. By default
            Tag.DEFAULT_INTERESTING_STRING_TYPES is used.

        :yield: A sequence that either contains this string, or is empty.

        """
        if types is self.default:
            # The default lives on Tag because it is built from
            # subclasses of this class, which aren't defined until
            # later in the file.
            types = Tag.DEFAULT_INTERESTING_STRING_TYPES

        # Exact class comparison rather than isinstance(): all of the
        # special string classes subclass NavigableString, and a caller
        # asking for NavigableString probably doesn't want those.
        own_type = type(self)
        if types is not None:
            single_type = isinstance(types, type)
            if single_type and own_type is not types:
                return
            if not single_type and own_type not in types:
                return

        text = self.strip() if strip else self
        if len(text) > 0:
            yield text
    strings = property(_all_strings)
1058
class PreformattedString(NavigableString):
    """A NavigableString that is exempt from the normal formatting rules.

    This is an abstract base for special kinds of strings such as
    comments (the Comment class) and CDATA blocks (the CData class).
    """

    PREFIX = ''
    SUFFIX = ''

    def output_ready(self, formatter=None):
        """Return this string wrapped in the subclass-specific prefix
        and suffix, without applying the formatter's output to it.

        :param formatter: A Formatter object, or a string naming one
            of the standard formatters. When given, the string is
            passed through it only to trigger any side effects: the
            return value is ignored.

        :return: The string, with any subclass-specific prefix and
           suffix added on.
        """
        if formatter is not None:
            # Called for side effects only; the result is discarded.
            self.format_string(self, formatter)
        return self.PREFIX + self + self.SUFFIX
1085
class CData(PreformattedString):
    """A CDATA block, output between '<![CDATA[' and ']]>'."""
    PREFIX = "<![CDATA["
    SUFFIX = "]]>"
1090
class ProcessingInstruction(PreformattedString):
    """An SGML processing instruction, output between '<?' and '>'."""

    PREFIX = "<?"
    SUFFIX = ">"
1096
class XMLProcessingInstruction(ProcessingInstruction):
    """An XML processing instruction, output between '<?' and '?>'."""
    PREFIX = "<?"
    SUFFIX = "?>"
1101
class Comment(PreformattedString):
    """An HTML or XML comment, output between '<!--' and '-->'."""
    PREFIX = "<!--"
    SUFFIX = "-->"
1106
1107
class Declaration(PreformattedString):
    """An XML declaration, output between '<?' and '?>'."""
    PREFIX = "<?"
    SUFFIX = "?>"
1112
1113
class Doctype(PreformattedString):
    """A document type declaration, output between '<!DOCTYPE ' and
    '>' followed by a newline.
    """

    PREFIX = "<!DOCTYPE "
    SUFFIX = ">\n"

    @classmethod
    def for_name_and_ids(cls, name, pub_id, system_id):
        """Build the document type declaration that corresponds to a
        root element name, a public ID and a system ID.

        The system ID is introduced by the SYSTEM keyword only when no
        public ID is given; after a public ID it is appended as a
        second quoted string.

        :param name: The name of the document's root element, e.g. 'html'.
        :param pub_id: The Formal Public Identifier for this document type,
            e.g. '-//W3C//DTD XHTML 1.1//EN'
        :param system_id: The system identifier for this document type,
            e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'

        :return: A Doctype.
        """
        root = name or ''
        if pub_id is not None:
            identifiers = ' PUBLIC "%s"' % pub_id
            if system_id is not None:
                identifiers = identifiers + ' "%s"' % system_id
        elif system_id is not None:
            identifiers = ' SYSTEM "%s"' % system_id
        else:
            # Neither identifier was supplied: the name stands alone.
            return Doctype(root)
        return Doctype(root + identifiers)
1141
1142
class Stylesheet(NavigableString):
    """A NavigableString representing a stylesheet (probably CSS).

    A marker class: it adds no behavior and exists so that embedded
    stylesheets can be told apart from textual content.
    """
1150
1151
class Script(NavigableString):
    """A NavigableString representing an executable script (probably
    Javascript).

    A marker class: it adds no behavior and exists so that executable
    code can be told apart from textual content.
    """
1159
1160
class TemplateString(NavigableString):
    """A NavigableString representing a string found inside an HTML
    template embedded in a larger document.

    A marker class: it adds no behavior and exists so that such
    strings can be told apart from the main body of the document.
    """
1168
1169
class RubyTextString(NavigableString):
    """A NavigableString representing the contents of the <rt> HTML
    element.

    https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rt-element

    A marker class: it adds no behavior and can be used to tell such
    strings apart from the strings they're annotating.
    """
1180
1181
class RubyParenthesisString(NavigableString):
    """A NavigableString representing the contents of the <rp> HTML
    element. A marker class that adds no behavior.

    https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rp-element
    """
1189
1190
1191class Tag(PageElement):
1192    """Represents an HTML or XML tag that is part of a parse tree, along
1193    with its attributes and contents.
1194
1195    When Beautiful Soup parses the markup <b>penguin</b>, it will
1196    create a Tag object representing the <b> tag.
1197    """
1198
    def __init__(self, parser=None, builder=None, name=None, namespace=None,
                 prefix=None, attrs=None, parent=None, previous=None,
                 is_xml=None, sourceline=None, sourcepos=None,
                 can_be_empty_element=None, cdata_list_attributes=None,
                 preserve_whitespace_tags=None,
                 interesting_string_types=None,
                 namespaces=None
    ):
        """Basic constructor.

        :param parser: A BeautifulSoup object. Only its class is kept.
        :param builder: A TreeBuilder. When given, it decides the
            values of `can_be_empty_element`, `cdata_list_attributes`,
            `preserve_whitespace_tags` and `interesting_string_types`,
            and the corresponding arguments are ignored.
        :param name: The name of the tag.
        :param namespace: The URI of this Tag's XML namespace, if any.
        :param prefix: The prefix for this Tag's XML namespace, if any.
        :param attrs: A dictionary of this Tag's attribute values.
        :param parent: The PageElement to use as this Tag's parent.
        :param previous: The PageElement that was parsed immediately before
            this tag.
        :param is_xml: If True, this is an XML tag. Otherwise, this is an
            HTML tag. Only used when no builder is given.
        :param sourceline: The line number where this tag was found in its
            source document.
        :param sourcepos: The character position within `sourceline` where this
            tag was found.
        :param can_be_empty_element: If True, this tag should be
            represented as <tag/>. If False, this tag should be represented
            as <tag></tag>.
        :param cdata_list_attributes: A list of attributes whose values should
            be treated as CDATA if they ever show up on this tag.
        :param preserve_whitespace_tags: A list of tag names whose contents
            should have their whitespace preserved.
        :param interesting_string_types: This is a NavigableString
            subclass or a tuple of them. When iterating over this
            Tag's strings in methods like Tag.strings or Tag.get_text,
            these are the types of strings that are interesting enough
            to be considered. The default is to consider
            NavigableString and CData the only interesting string
            subtypes.
        :param namespaces: A dictionary mapping currently active
            namespace prefixes to URIs. This can be used later to
            construct CSS selectors.

        :raises ValueError: If `name` is None.
        """
        if parser is None:
            self.parser_class = None
        else:
            # We don't actually store the parser object: that lets extracted
            # chunks be garbage-collected.
            self.parser_class = parser.__class__
        if name is None:
            raise ValueError("No value provided for new tag's name.")
        self.name = name
        self.namespace = namespace
        self._namespaces = namespaces or {}
        self.prefix = prefix
        # Source position is recorded only when at least one of the two
        # values was supplied and the builder (if any) stores line numbers.
        if ((not builder or builder.store_line_numbers)
            and (sourceline is not None or sourcepos is not None)):
            self.sourceline = sourceline
            self.sourcepos = sourcepos
        if attrs is None:
            attrs = {}
        elif attrs:
            if builder is not None and builder.cdata_list_attributes:
                # Let the builder produce the attribute dictionary for
                # this tag name (see _replace_cdata_list_attribute_values).
                attrs = builder._replace_cdata_list_attribute_values(
                    self.name, attrs)
            else:
                # Work on our own copy of the caller's mapping.
                attrs = dict(attrs)
        else:
            attrs = dict(attrs)

        # If possible, determine ahead of time whether this tag is an
        # XML tag.
        if builder:
            self.known_xml = builder.is_xml
        else:
            self.known_xml = is_xml
        self.attrs = attrs
        self.contents = []
        self.setup(parent, previous)
        self.hidden = False

        if builder is None:
            # In the absence of a TreeBuilder, use whatever values were
            # passed in here. They're probably None, unless this is a copy of some
            # other tag.
            self.can_be_empty_element = can_be_empty_element
            self.cdata_list_attributes = cdata_list_attributes
            self.preserve_whitespace_tags = preserve_whitespace_tags
            self.interesting_string_types = interesting_string_types
        else:
            # Set up any substitutions for this tag, such as the charset in a META tag.
            builder.set_up_substitutions(self)

            # Ask the TreeBuilder whether this tag might be an empty-element tag.
            self.can_be_empty_element = builder.can_be_empty_element(name)

            # Keep track of the list of attributes of this tag that
            # might need to be treated as a list.
            #
            # For performance reasons, we store the whole data structure
            # rather than asking the question of every tag. Asking would
            # require building a new data structure every time, and
            # (unlike can_be_empty_element), we almost never need
            # to check this.
            self.cdata_list_attributes = builder.cdata_list_attributes

            # Keep track of the names that might cause this tag to be treated as a
            # whitespace-preserved tag.
            self.preserve_whitespace_tags = builder.preserve_whitespace_tags

            if self.name in builder.string_containers:
                # This sort of tag uses a special string container
                # subclass for most of its strings, so that subclass
                # is what counts as an "interesting" string type for
                # this tag.
                self.interesting_string_types = builder.string_containers[self.name]
            else:
                self.interesting_string_types = self.DEFAULT_INTERESTING_STRING_TYPES
1315
1316    parserClass = _alias("parser_class")  # BS3
1317
1318    def __deepcopy__(self, memo, recursive=True):
1319        """A deepcopy of a Tag is a new Tag, unconnected to the parse tree.
1320        Its contents are a copy of the old Tag's contents.
1321        """
1322        clone = self._clone()
1323
1324        if recursive:
1325            # Clone this tag's descendants recursively, but without
1326            # making any recursive function calls.
1327            tag_stack = [clone]
1328            for event, element in self._event_stream(self.descendants):
1329                if event is Tag.END_ELEMENT_EVENT:
1330                    # Stop appending incoming Tags to the Tag that was
1331                    # just closed.
1332                    tag_stack.pop()
1333                else:
1334                    descendant_clone = element.__deepcopy__(
1335                        memo, recursive=False
1336                    )
1337                    # Add to its parent's .contents
1338                    tag_stack[-1].append(descendant_clone)
1339
1340                    if event is Tag.START_ELEMENT_EVENT:
1341                        # Add the Tag itself to the stack so that its
1342                        # children will be .appended to it.
1343                        tag_stack.append(descendant_clone)
1344        return clone
1345
1346    def __copy__(self):
1347        """A copy of a Tag must always be a deep copy, because a Tag's
1348        children can only have one parent at a time.
1349        """
1350        return self.__deepcopy__({})
1351
1352    def _clone(self):
1353        """Create a new Tag just like this one, but with no
1354        contents and unattached to any parse tree.
1355
1356        This is the first step in the deepcopy process.
1357        """
1358        clone = type(self)(
1359            None, None, self.name, self.namespace,
1360            self.prefix, self.attrs, is_xml=self._is_xml,
1361            sourceline=self.sourceline, sourcepos=self.sourcepos,
1362            can_be_empty_element=self.can_be_empty_element,
1363            cdata_list_attributes=self.cdata_list_attributes,
1364            preserve_whitespace_tags=self.preserve_whitespace_tags,
1365            interesting_string_types=self.interesting_string_types
1366        )
1367        for attr in ('can_be_empty_element', 'hidden'):
1368            setattr(clone, attr, getattr(self, attr))
1369        return clone
1370
1371    @property
1372    def is_empty_element(self):
1373        """Is this tag an empty-element tag? (aka a self-closing tag)
1374
1375        A tag that has contents is never an empty-element tag.
1376
1377        A tag that has no contents may or may not be an empty-element
1378        tag. It depends on the builder used to create the tag. If the
1379        builder has a designated list of empty-element tags, then only
1380        a tag whose name shows up in that list is considered an
1381        empty-element tag.
1382
1383        If the builder has no designated list of empty-element tags,
1384        then any tag with no contents is an empty-element tag.
1385        """
1386        return len(self.contents) == 0 and self.can_be_empty_element
1387    isSelfClosing = is_empty_element  # BS3
1388
1389    @property
1390    def string(self):
1391        """Convenience property to get the single string within this
1392        PageElement.
1393
1394        TODO It might make sense to have NavigableString.string return
1395        itself.
1396
1397        :return: If this element has a single string child, return
1398         value is that string. If this element has one child tag,
1399         return value is the 'string' attribute of the child tag,
1400         recursively. If this element is itself a string, has no
1401         children, or has more than one child, return value is None.
1402        """
1403        if len(self.contents) != 1:
1404            return None
1405        child = self.contents[0]
1406        if isinstance(child, NavigableString):
1407            return child
1408        return child.string
1409
1410    @string.setter
1411    def string(self, string):
1412        """Replace this PageElement's contents with `string`."""
1413        self.clear()
1414        self.append(string.__class__(string))
1415
1416    DEFAULT_INTERESTING_STRING_TYPES = (NavigableString, CData)
1417    def _all_strings(self, strip=False, types=PageElement.default):
1418        """Yield all strings of certain classes, possibly stripping them.
1419
1420        :param strip: If True, all strings will be stripped before being
1421            yielded.
1422
1423        :param types: A tuple of NavigableString subclasses. Any strings of
1424            a subclass not found in this list will be ignored. By
1425            default, the subclasses considered are the ones found in
1426            self.interesting_string_types. If that's not specified,
1427            only NavigableString and CData objects will be
1428            considered. That means no comments, processing
1429            instructions, etc.
1430
1431        :yield: A sequence of strings.
1432
1433        """
1434        if types is self.default:
1435            types = self.interesting_string_types
1436
1437        for descendant in self.descendants:
1438            if (types is None and not isinstance(descendant, NavigableString)):
1439                continue
1440            descendant_type = type(descendant)
1441            if isinstance(types, type):
1442                if descendant_type is not types:
1443                    # We're not interested in strings of this type.
1444                    continue
1445            elif types is not None and descendant_type not in types:
1446                # We're not interested in strings of this type.
1447                continue
1448            if strip:
1449                descendant = descendant.strip()
1450                if len(descendant) == 0:
1451                    continue
1452            yield descendant
1453    strings = property(_all_strings)
1454
1455    def decompose(self):
1456        """Recursively destroys this PageElement and its children.
1457
1458        This element will be removed from the tree and wiped out; so
1459        will everything beneath it.
1460
1461        The behavior of a decomposed PageElement is undefined and you
1462        should never use one for anything, but if you need to _check_
1463        whether an element has been decomposed, you can use the
1464        `decomposed` property.
1465        """
1466        self.extract()
1467        i = self
1468        while i is not None:
1469            n = i.next_element
1470            i.__dict__.clear()
1471            i.contents = []
1472            i._decomposed = True
1473            i = n
1474
1475    def clear(self, decompose=False):
1476        """Wipe out all children of this PageElement by calling extract()
1477           on them.
1478
1479        :param decompose: If this is True, decompose() (a more
1480            destructive method) will be called instead of extract().
1481        """
1482        if decompose:
1483            for element in self.contents[:]:
1484                if isinstance(element, Tag):
1485                    element.decompose()
1486                else:
1487                    element.extract()
1488        else:
1489            for element in self.contents[:]:
1490                element.extract()
1491
1492    def smooth(self):
1493        """Smooth out this element's children by consolidating consecutive
1494        strings.
1495
1496        This makes pretty-printed output look more natural following a
1497        lot of operations that modified the tree.
1498        """
1499        # Mark the first position of every pair of children that need
1500        # to be consolidated.  Do this rather than making a copy of
1501        # self.contents, since in most cases very few strings will be
1502        # affected.
1503        marked = []
1504        for i, a in enumerate(self.contents):
1505            if isinstance(a, Tag):
1506                # Recursively smooth children.
1507                a.smooth()
1508            if i == len(self.contents)-1:
1509                # This is the last item in .contents, and it's not a
1510                # tag. There's no chance it needs any work.
1511                continue
1512            b = self.contents[i+1]
1513            if (isinstance(a, NavigableString)
1514                and isinstance(b, NavigableString)
1515                and not isinstance(a, PreformattedString)
1516                and not isinstance(b, PreformattedString)
1517            ):
1518                marked.append(i)
1519
1520        # Go over the marked positions in reverse order, so that
1521        # removing items from .contents won't affect the remaining
1522        # positions.
1523        for i in reversed(marked):
1524            a = self.contents[i]
1525            b = self.contents[i+1]
1526            b.extract()
1527            n = NavigableString(a+b)
1528            a.replace_with(n)
1529
1530    def index(self, element):
1531        """Find the index of a child by identity, not value.
1532
1533        Avoids issues with tag.contents.index(element) getting the
1534        index of equal elements.
1535
1536        :param element: Look for this PageElement in `self.contents`.
1537        """
1538        for i, child in enumerate(self.contents):
1539            if child is element:
1540                return i
1541        raise ValueError("Tag.index: element not in tag")
1542
1543    def get(self, key, default=None):
1544        """Returns the value of the 'key' attribute for the tag, or
1545        the value given for 'default' if it doesn't have that
1546        attribute."""
1547        return self.attrs.get(key, default)
1548
1549    def get_attribute_list(self, key, default=None):
1550        """The same as get(), but always returns a list.
1551
1552        :param key: The attribute to look for.
1553        :param default: Use this value if the attribute is not present
1554            on this PageElement.
1555        :return: A list of values, probably containing only a single
1556            value.
1557        """
1558        value = self.get(key, default)
1559        if not isinstance(value, list):
1560            value = [value]
1561        return value
1562
1563    def has_attr(self, key):
1564        """Does this PageElement have an attribute with the given name?"""
1565        return key in self.attrs
1566
1567    def __hash__(self):
1568        return str(self).__hash__()
1569
1570    def __getitem__(self, key):
1571        """tag[key] returns the value of the 'key' attribute for the Tag,
1572        and throws an exception if it's not there."""
1573        return self.attrs[key]
1574
1575    def __iter__(self):
1576        "Iterating over a Tag iterates over its contents."
1577        return iter(self.contents)
1578
1579    def __len__(self):
1580        "The length of a Tag is the length of its list of contents."
1581        return len(self.contents)
1582
1583    def __contains__(self, x):
1584        return x in self.contents
1585
1586    def __bool__(self):
1587        "A tag is non-None even if it has no contents."
1588        return True
1589
1590    def __setitem__(self, key, value):
1591        """Setting tag[key] sets the value of the 'key' attribute for the
1592        tag."""
1593        self.attrs[key] = value
1594
1595    def __delitem__(self, key):
1596        "Deleting tag[key] deletes all 'key' attributes for the tag."
1597        self.attrs.pop(key, None)
1598
1599    def __call__(self, *args, **kwargs):
1600        """Calling a Tag like a function is the same as calling its
1601        find_all() method. Eg. tag('a') returns a list of all the A tags
1602        found within this tag."""
1603        return self.find_all(*args, **kwargs)
1604
1605    def __getattr__(self, tag):
1606        """Calling tag.subtag is the same as calling tag.find(name="subtag")"""
1607        #print("Getattr %s.%s" % (self.__class__, tag))
1608        if len(tag) > 3 and tag.endswith('Tag'):
1609            # BS3: soup.aTag -> "soup.find("a")
1610            tag_name = tag[:-3]
1611            warnings.warn(
1612                '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict(
1613                    name=tag_name
1614                ),
1615                DeprecationWarning, stacklevel=2
1616            )
1617            return self.find(tag_name)
1618        # We special case contents to avoid recursion.
1619        elif not tag.startswith("__") and not tag == "contents":
1620            return self.find(tag)
1621        raise AttributeError(
1622            "'%s' object has no attribute '%s'" % (self.__class__, tag))
1623
1624    def __eq__(self, other):
1625        """Returns true iff this Tag has the same name, the same attributes,
1626        and the same contents (recursively) as `other`."""
1627        if self is other:
1628            return True
1629        if (not hasattr(other, 'name') or
1630            not hasattr(other, 'attrs') or
1631            not hasattr(other, 'contents') or
1632            self.name != other.name or
1633            self.attrs != other.attrs or
1634            len(self) != len(other)):
1635            return False
1636        for i, my_child in enumerate(self.contents):
1637            if my_child != other.contents[i]:
1638                return False
1639        return True
1640
1641    def __ne__(self, other):
1642        """Returns true iff this Tag is not identical to `other`,
1643        as defined in __eq__."""
1644        return not self == other
1645
1646    def __repr__(self, encoding="unicode-escape"):
1647        """Renders this PageElement as a string.
1648
1649        :param encoding: The encoding to use (Python 2 only).
1650            TODO: This is now ignored and a warning should be issued
1651            if a value is provided.
1652        :return: A (Unicode) string.
1653        """
1654        # "The return value must be a string object", i.e. Unicode
1655        return self.decode()
1656
1657    def __unicode__(self):
1658        """Renders this PageElement as a Unicode string."""
1659        return self.decode()
1660
    # str() and repr() both use __unicode__. This rebinding replaces the
    # __repr__ function defined above, so its `encoding` parameter is
    # never reachable through the class attribute.
    __str__ = __repr__ = __unicode__
1662
1663    def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
1664               indent_level=None, formatter="minimal",
1665               errors="xmlcharrefreplace"):
1666        """Render a bytestring representation of this PageElement and its
1667        contents.
1668
1669        :param encoding: The destination encoding.
1670        :param indent_level: Each line of the rendering will be
1671           indented this many levels. (The formatter decides what a
1672           'level' means in terms of spaces or other characters
1673           output.) Used internally in recursive calls while
1674           pretty-printing.
1675        :param formatter: A Formatter object, or a string naming one of
1676            the standard formatters.
1677        :param errors: An error handling strategy such as
1678            'xmlcharrefreplace'. This value is passed along into
1679            encode() and its value should be one of the constants
1680            defined by Python.
1681        :return: A bytestring.
1682
1683        """
1684        # Turn the data structure into Unicode, then encode the
1685        # Unicode.
1686        u = self.decode(indent_level, encoding, formatter)
1687        return u.encode(encoding, errors)
1688
    def decode(self, indent_level=None,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal",
               iterator=None):
        """Render this element as a Unicode string.

        The rendering is assembled piece by piece from the events
        produced by _event_stream(), so no recursive method calls are
        made here.

        :param indent_level: If None, no indentation whitespace is
            added. Otherwise the starting indentation level for
            pretty-printing; True is treated as 0.
        :param eventual_encoding: Passed along to _format_tag() for
            every opening and closing tag.
        :param formatter: A Formatter object, or a value that
            formatter_for_name() can turn into one.
        :param iterator: Passed to _event_stream() as the alternate
            traversal iterator.
        :return: The concatenation of all rendered pieces.
        """
        pieces = []
        # First off, turn a non-Formatter `formatter` into a Formatter
        # object. This will stop the lookup from happening over and
        # over again.
        if not isinstance(formatter, Formatter):
            formatter = self.formatter_for_name(formatter)

        # prettify() passes True to mean "pretty-print from level 0".
        if indent_level is True:
            indent_level = 0

        # The currently active tag that put us into string literal
        # mode. Until this element is closed, children will be treated
        # as string literals and not pretty-printed. String literal
        # mode is turned on immediately after this tag begins, and
        # turned off immediately before it's closed. This means there
        # will be whitespace before and after the tag itself.
        string_literal_tag = None

        for event, element in self._event_stream(iterator):
            if event in (Tag.START_ELEMENT_EVENT, Tag.EMPTY_ELEMENT_EVENT):
                piece = element._format_tag(
                    eventual_encoding, formatter, opening=True
                )
            elif event is Tag.END_ELEMENT_EVENT:
                piece = element._format_tag(
                    eventual_encoding, formatter, opening=False
                )
                # Dedent before the closing tag is indented below, so
                # it lines up with its opening tag.
                if indent_level is not None:
                    indent_level -= 1
            else:
                # Anything that is not a tag event is a string event.
                piece = element.output_ready(formatter)

            # Now we need to apply the 'prettiness' -- extra
            # whitespace before and/or after this tag. This can get
            # complicated because certain tags, like <pre> and
            # <script>, can't be prettified, since adding whitespace would
            # change the meaning of the content.

            # The default behavior is to add whitespace before and
            # after an element when string literal mode is off, and to
            # leave things as they are when string literal mode is on.
            if string_literal_tag:
                indent_before = indent_after = False
            else:
                indent_before = indent_after = True

            # The only time the behavior is more complex than that is
            # when we encounter an opening or closing tag that might
            # put us into or out of string literal mode.
            if (event is Tag.START_ELEMENT_EVENT
                and not string_literal_tag
                and not element._should_pretty_print()):
                    # We are about to enter string literal mode. Add
                    # whitespace before this tag, but not after. We
                    # will stay in string literal mode until this tag
                    # is closed.
                    indent_before = True
                    indent_after = False
                    string_literal_tag = element
            elif (event is Tag.END_ELEMENT_EVENT
                  and element is string_literal_tag):
                # We are about to exit string literal mode by closing
                # the tag that sent us into that mode. Add whitespace
                # after this tag, but not before.
                indent_before = False
                indent_after = True
                string_literal_tag = None

            # Now we know whether to add whitespace before and/or
            # after this element.
            if indent_level is not None:
                if (indent_before or indent_after):
                    if isinstance(element, NavigableString):
                        piece = piece.strip()
                    # An empty piece (e.g. a whitespace-only string)
                    # gets no indentation or newline of its own.
                    if piece:
                        piece = self._indent_string(
                            piece, indent_level, formatter,
                            indent_before, indent_after
                        )
                # Children of a just-opened tag are one level deeper.
                if event == Tag.START_ELEMENT_EVENT:
                    indent_level += 1
            pieces.append(piece)
        return "".join(pieces)
1776
    # Names for the different events yielded by _event_stream.
    # Each one is a distinct sentinel object.
    START_ELEMENT_EVENT = object()   # a Tag that is not an empty element opens
    END_ELEMENT_EVENT = object()     # a previously opened Tag closes
    EMPTY_ELEMENT_EVENT = object()   # a Tag whose is_empty_element is true
    STRING_ELEMENT_EVENT = object()  # any element that is not a Tag
1782
1783    def _event_stream(self, iterator=None):
1784        """Yield a sequence of events that can be used to reconstruct the DOM
1785        for this element.
1786
1787        This lets us recreate the nested structure of this element
1788        (e.g. when formatting it as a string) without using recursive
1789        method calls.
1790
1791        This is similar in concept to the SAX API, but it's a simpler
1792        interface designed for internal use. The events are different
1793        from SAX and the arguments associated with the events are Tags
1794        and other Beautiful Soup objects.
1795
1796        :param iterator: An alternate iterator to use when traversing
1797         the tree.
1798        """
1799        tag_stack = []
1800
1801        iterator = iterator or self.self_and_descendants
1802
1803        for c in iterator:
1804            # If the parent of the element we're about to yield is not
1805            # the tag currently on the stack, it means that the tag on
1806            # the stack closed before this element appeared.
1807            while tag_stack and c.parent != tag_stack[-1]:
1808                now_closed_tag = tag_stack.pop()
1809                yield Tag.END_ELEMENT_EVENT, now_closed_tag
1810
1811            if isinstance(c, Tag):
1812                if c.is_empty_element:
1813                    yield Tag.EMPTY_ELEMENT_EVENT, c
1814                else:
1815                    yield Tag.START_ELEMENT_EVENT, c
1816                    tag_stack.append(c)
1817                    continue
1818            else:
1819                yield Tag.STRING_ELEMENT_EVENT, c
1820
1821        while tag_stack:
1822            now_closed_tag = tag_stack.pop()
1823            yield Tag.END_ELEMENT_EVENT, now_closed_tag
1824
1825    def _indent_string(self, s, indent_level, formatter,
1826                       indent_before, indent_after):
1827        """Add indentation whitespace before and/or after a string.
1828
1829        :param s: The string to amend with whitespace.
1830        :param indent_level: The indentation level; affects how much
1831           whitespace goes before the string.
1832        :param indent_before: Whether or not to add whitespace
1833           before the string.
1834        :param indent_after: Whether or not to add whitespace
1835           (a newline) after the string.
1836        """
1837        space_before = ''
1838        if indent_before and indent_level:
1839            space_before = (formatter.indent * indent_level)
1840
1841        space_after = ''
1842        if indent_after:
1843            space_after = "\n"
1844
1845        return space_before + s + space_after
1846
1847    def _format_tag(self, eventual_encoding, formatter, opening):
1848        if self.hidden:
1849            # A hidden tag is invisible, although its contents
1850            # are visible.
1851            return ''
1852
1853        # A tag starts with the < character (see below).
1854
1855        # Then the / character, if this is a closing tag.
1856        closing_slash = ''
1857        if not opening:
1858            closing_slash = '/'
1859
1860        # Then an optional namespace prefix.
1861        prefix = ''
1862        if self.prefix:
1863            prefix = self.prefix + ":"
1864
1865        # Then a list of attribute values, if this is an opening tag.
1866        attribute_string = ''
1867        if opening:
1868            attributes = formatter.attributes(self)
1869            attrs = []
1870            for key, val in attributes:
1871                if val is None:
1872                    decoded = key
1873                else:
1874                    if isinstance(val, list) or isinstance(val, tuple):
1875                        val = ' '.join(val)
1876                    elif not isinstance(val, str):
1877                        val = str(val)
1878                    elif (
1879                            isinstance(val, AttributeValueWithCharsetSubstitution)
1880                            and eventual_encoding is not None
1881                    ):
1882                        val = val.encode(eventual_encoding)
1883
1884                    text = formatter.attribute_value(val)
1885                    decoded = (
1886                        str(key) + '='
1887                        + formatter.quoted_attribute_value(text))
1888                attrs.append(decoded)
1889            if attrs:
1890                attribute_string = ' ' + ' '.join(attrs)
1891
1892        # Then an optional closing slash (for a void element in an
1893        # XML document).
1894        void_element_closing_slash = ''
1895        if self.is_empty_element:
1896            void_element_closing_slash = formatter.void_element_close_prefix or ''
1897
1898        # Put it all together.
1899        return '<' + closing_slash + prefix + self.name + attribute_string + void_element_closing_slash + '>'
1900
1901    def _should_pretty_print(self, indent_level=1):
1902        """Should this tag be pretty-printed?
1903
1904        Most of them should, but some (such as <pre> in HTML
1905        documents) should not.
1906        """
1907        return (
1908            indent_level is not None
1909            and (
1910                not self.preserve_whitespace_tags
1911                or self.name not in self.preserve_whitespace_tags
1912            )
1913        )
1914
1915    def prettify(self, encoding=None, formatter="minimal"):
1916        """Pretty-print this PageElement as a string.
1917
1918        :param encoding: The eventual encoding of the string. If this is None,
1919            a Unicode string will be returned.
1920        :param formatter: A Formatter object, or a string naming one of
1921            the standard formatters.
1922        :return: A Unicode string (if encoding==None) or a bytestring
1923            (otherwise).
1924        """
1925        if encoding is None:
1926            return self.decode(True, formatter=formatter)
1927        else:
1928            return self.encode(encoding, True, formatter=formatter)
1929
1930    def decode_contents(self, indent_level=None,
1931                       eventual_encoding=DEFAULT_OUTPUT_ENCODING,
1932                       formatter="minimal"):
1933        """Renders the contents of this tag as a Unicode string.
1934
1935        :param indent_level: Each line of the rendering will be
1936           indented this many levels. (The formatter decides what a
1937           'level' means in terms of spaces or other characters
1938           output.) Used internally in recursive calls while
1939           pretty-printing.
1940
1941        :param eventual_encoding: The tag is destined to be
1942           encoded into this encoding. decode_contents() is _not_
1943           responsible for performing that encoding. This information
1944           is passed in so that it can be substituted in if the
1945           document contains a <META> tag that mentions the document's
1946           encoding.
1947
1948        :param formatter: A Formatter object, or a string naming one of
1949            the standard Formatters.
1950
1951        """
1952        return self.decode(indent_level, eventual_encoding, formatter,
1953                           iterator=self.descendants)
1954
1955    def encode_contents(
1956        self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
1957        formatter="minimal"):
1958        """Renders the contents of this PageElement as a bytestring.
1959
1960        :param indent_level: Each line of the rendering will be
1961           indented this many levels. (The formatter decides what a
1962           'level' means in terms of spaces or other characters
1963           output.) Used internally in recursive calls while
1964           pretty-printing.
1965
1966        :param eventual_encoding: The bytestring will be in this encoding.
1967
1968        :param formatter: A Formatter object, or a string naming one of
1969            the standard Formatters.
1970
1971        :return: A bytestring.
1972        """
1973        contents = self.decode_contents(indent_level, encoding, formatter)
1974        return contents.encode(encoding)
1975
1976    # Old method for BS3 compatibility
1977    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
1978                       prettyPrint=False, indentLevel=0):
1979        """Deprecated method for BS3 compatibility."""
1980        if not prettyPrint:
1981            indentLevel = None
1982        return self.encode_contents(
1983            indent_level=indentLevel, encoding=encoding)
1984
1985    #Soup methods
1986
1987    def find(self, name=None, attrs={}, recursive=True, string=None,
1988             **kwargs):
1989        """Look in the children of this PageElement and find the first
1990        PageElement that matches the given criteria.
1991
1992        All find_* methods take a common set of arguments. See the online
1993        documentation for detailed explanations.
1994
1995        :param name: A filter on tag name.
1996        :param attrs: A dictionary of filters on attribute values.
1997        :param recursive: If this is True, find() will perform a
1998            recursive search of this PageElement's children. Otherwise,
1999            only the direct children will be considered.
2000        :param limit: Stop looking after finding this many results.
2001        :kwargs: A dictionary of filters on attribute values.
2002        :return: A PageElement.
2003        :rtype: bs4.element.Tag | bs4.element.NavigableString
2004        """
2005        r = None
2006        l = self.find_all(name, attrs, recursive, string, 1, _stacklevel=3,
2007                          **kwargs)
2008        if l:
2009            r = l[0]
2010        return r
    findChild = find  # BS2-era name, kept as an alias of find()
2012
2013    def find_all(self, name=None, attrs={}, recursive=True, string=None,
2014                 limit=None, **kwargs):
2015        """Look in the children of this PageElement and find all
2016        PageElements that match the given criteria.
2017
2018        All find_* methods take a common set of arguments. See the online
2019        documentation for detailed explanations.
2020
2021        :param name: A filter on tag name.
2022        :param attrs: A dictionary of filters on attribute values.
2023        :param recursive: If this is True, find_all() will perform a
2024            recursive search of this PageElement's children. Otherwise,
2025            only the direct children will be considered.
2026        :param limit: Stop looking after finding this many results.
2027        :kwargs: A dictionary of filters on attribute values.
2028        :return: A ResultSet of PageElements.
2029        :rtype: bs4.element.ResultSet
2030        """
2031        generator = self.descendants
2032        if not recursive:
2033            generator = self.children
2034        _stacklevel = kwargs.pop('_stacklevel', 2)
2035        return self._find_all(name, attrs, string, limit, generator,
2036                              _stacklevel=_stacklevel+1, **kwargs)
    # Older spellings, kept as aliases of find_all().
    findAll = find_all       # BS3
    findChildren = find_all  # BS2
2039
2040    #Generator methods
2041    @property
2042    def children(self):
2043        """Iterate over all direct children of this PageElement.
2044
2045        :yield: A sequence of PageElements.
2046        """
2047        # return iter() to make the purpose of the method clear
2048        return iter(self.contents)  # XXX This seems to be untested.
2049
2050    @property
2051    def self_and_descendants(self):
2052        """Iterate over this PageElement and its children in a
2053        breadth-first sequence.
2054
2055        :yield: A sequence of PageElements.
2056        """
2057        if not self.hidden:
2058            yield self
2059        for i in self.descendants:
2060            yield i
2061
2062    @property
2063    def descendants(self):
2064        """Iterate over all children of this PageElement in a
2065        breadth-first sequence.
2066
2067        :yield: A sequence of PageElements.
2068        """
2069        if not len(self.contents):
2070            return
2071        stopNode = self._last_descendant().next_element
2072        current = self.contents[0]
2073        while current is not stopNode:
2074            yield current
2075            current = current.next_element
2076
2077    # CSS selector code
2078    def select_one(self, selector, namespaces=None, **kwargs):
2079        """Perform a CSS selection operation on the current element.
2080
2081        :param selector: A CSS selector.
2082
2083        :param namespaces: A dictionary mapping namespace prefixes
2084           used in the CSS selector to namespace URIs. By default,
2085           Beautiful Soup will use the prefixes it encountered while
2086           parsing the document.
2087
2088        :param kwargs: Keyword arguments to be passed into Soup Sieve's
2089           soupsieve.select() method.
2090
2091        :return: A Tag.
2092        :rtype: bs4.element.Tag
2093        """
2094        return self.css.select_one(selector, namespaces, **kwargs)
2095
2096    def select(self, selector, namespaces=None, limit=None, **kwargs):
2097        """Perform a CSS selection operation on the current element.
2098
2099        This uses the SoupSieve library.
2100
2101        :param selector: A string containing a CSS selector.
2102
2103        :param namespaces: A dictionary mapping namespace prefixes
2104           used in the CSS selector to namespace URIs. By default,
2105           Beautiful Soup will use the prefixes it encountered while
2106           parsing the document.
2107
2108        :param limit: After finding this number of results, stop looking.
2109
2110        :param kwargs: Keyword arguments to be passed into SoupSieve's
2111           soupsieve.select() method.
2112
2113        :return: A ResultSet of Tags.
2114        :rtype: bs4.element.ResultSet
2115        """
2116        return self.css.select(selector, namespaces, limit, **kwargs)
2117
2118    @property
2119    def css(self):
2120        """Return an interface to the CSS selector API."""
2121        return CSS(self)
2122
2123    # Old names for backwards compatibility
2124    def childGenerator(self):
2125        """Deprecated generator."""
2126        return self.children
2127
2128    def recursiveChildGenerator(self):
2129        """Deprecated generator."""
2130        return self.descendants
2131
2132    def has_key(self, key):
2133        """Deprecated method. This was kind of misleading because has_key()
2134        (attributes) was different from __in__ (contents).
2135
2136        has_key() is gone in Python 3, anyway.
2137        """
2138        warnings.warn(
2139            'has_key is deprecated. Use has_attr(key) instead.',
2140            DeprecationWarning, stacklevel=2
2141        )
2142        return self.has_attr(key)
2143
2144# Next, a couple classes to represent queries and their results.
2145class SoupStrainer(object):
2146    """Encapsulates a number of ways of matching a markup element (tag or
2147    string).
2148
2149    This is primarily used to underpin the find_* methods, but you can
2150    create one yourself and pass it in as `parse_only` to the
2151    `BeautifulSoup` constructor, to parse a subset of a large
2152    document.
2153    """
2154
2155    def __init__(self, name=None, attrs={}, string=None, **kwargs):
2156        """Constructor.
2157
2158        The SoupStrainer constructor takes the same arguments passed
2159        into the find_* methods. See the online documentation for
2160        detailed explanations.
2161
2162        :param name: A filter on tag name.
2163        :param attrs: A dictionary of filters on attribute values.
2164        :param string: A filter for a NavigableString with specific text.
2165        :kwargs: A dictionary of filters on attribute values.
2166        """
2167        if string is None and 'text' in kwargs:
2168            string = kwargs.pop('text')
2169            warnings.warn(
2170                "The 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead.",
2171                DeprecationWarning, stacklevel=2
2172            )
2173
2174        self.name = self._normalize_search_value(name)
2175        if not isinstance(attrs, dict):
2176            # Treat a non-dict value for attrs as a search for the 'class'
2177            # attribute.
2178            kwargs['class'] = attrs
2179            attrs = None
2180
2181        if 'class_' in kwargs:
2182            # Treat class_="foo" as a search for the 'class'
2183            # attribute, overriding any non-dict value for attrs.
2184            kwargs['class'] = kwargs['class_']
2185            del kwargs['class_']
2186
2187        if kwargs:
2188            if attrs:
2189                attrs = attrs.copy()
2190                attrs.update(kwargs)
2191            else:
2192                attrs = kwargs
2193        normalized_attrs = {}
2194        for key, value in list(attrs.items()):
2195            normalized_attrs[key] = self._normalize_search_value(value)
2196
2197        self.attrs = normalized_attrs
2198        self.string = self._normalize_search_value(string)
2199
2200        # DEPRECATED but just in case someone is checking this.
2201        self.text = self.string
2202
2203    def _normalize_search_value(self, value):
2204        # Leave it alone if it's a Unicode string, a callable, a
2205        # regular expression, a boolean, or None.
2206        if (isinstance(value, str) or isinstance(value, Callable) or hasattr(value, 'match')
2207            or isinstance(value, bool) or value is None):
2208            return value
2209
2210        # If it's a bytestring, convert it to Unicode, treating it as UTF-8.
2211        if isinstance(value, bytes):
2212            return value.decode("utf8")
2213
2214        # If it's listlike, convert it into a list of strings.
2215        if hasattr(value, '__iter__'):
2216            new_value = []
2217            for v in value:
2218                if (hasattr(v, '__iter__') and not isinstance(v, bytes)
2219                    and not isinstance(v, str)):
2220                    # This is almost certainly the user's mistake. In the
2221                    # interests of avoiding infinite loops, we'll let
2222                    # it through as-is rather than doing a recursive call.
2223                    new_value.append(v)
2224                else:
2225                    new_value.append(self._normalize_search_value(v))
2226            return new_value
2227
2228        # Otherwise, convert it into a Unicode string.
2229        # The unicode(str()) thing is so this will do the same thing on Python 2
2230        # and Python 3.
2231        return str(str(value))
2232
2233    def __str__(self):
2234        """A human-readable representation of this SoupStrainer."""
2235        if self.string:
2236            return self.string
2237        else:
2238            return "%s|%s" % (self.name, self.attrs)
2239
    def search_tag(self, markup_name=None, markup_attrs={}):
        """Check whether a Tag with the given name and attributes would
        match this SoupStrainer.

        Used prospectively to decide whether to even bother creating a Tag
        object.

        :param markup_name: A tag name as found in some markup, or an
            existing Tag object. When a Tag is given, the Tag itself
            is also used as the source of attributes.
        :param markup_attrs: The attributes as found in some markup:
            either an object with a `get` method, or an iterable of
            (key, value) pairs.

        :return: On a match, the Tag (when one was passed in) or
            `markup_name`. None when there is no match. False when the
            name shortcut at the top rules the tag out.
        """
        found = None
        markup = None
        if isinstance(markup_name, Tag):
            markup = markup_name
            markup_attrs = markup

        if isinstance(self.name, str):
            # Optimization for a very common case where the user is
            # searching for a tag with one specific name, and we're
            # looking at a tag with a different name.
            if markup and not markup.prefix and self.name != markup.name:
                 return False

        # A callable name filter is called with the raw name and
        # attributes only when no Tag object is available; with a Tag
        # the callable goes through _matches() below instead.
        call_function_with_tag_data = (
            isinstance(self.name, Callable)
            and not isinstance(markup_name, Tag))

        # First the name has to be acceptable...
        if ((not self.name)
            or call_function_with_tag_data
            or (markup and self._matches(markup, self.name))
            or (not markup and self._matches(markup_name, self.name))):
            if call_function_with_tag_data:
                match = self.name(markup_name, markup_attrs)
            else:
                # ...then every attribute filter has to match.
                match = True
                markup_attr_map = None
                for attr, match_against in list(self.attrs.items()):
                    # Build the attribute lookup lazily, the first
                    # time an attribute filter needs it.
                    if not markup_attr_map:
                        if hasattr(markup_attrs, 'get'):
                            markup_attr_map = markup_attrs
                        else:
                            markup_attr_map = {}
                            for k, v in markup_attrs:
                                markup_attr_map[k] = v
                    attr_value = markup_attr_map.get(attr)
                    if not self._matches(attr_value, match_against):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markup_name
        # A string filter, if set, must also match the string of
        # whatever was found.
        # NOTE(review): when `found` is a plain name string this reads
        # `.string` on a str -- confirm callers never reach this with a
        # name string while a string filter is set.
        if found and self.string and not self._matches(found.string, self.string):
            found = None
        return found
2299
    # For BS3 compatibility: the old camelCase name is kept as an
    # alias of search_tag().
    searchTag = search_tag
2302
2303    def search(self, markup):
2304        """Find all items in `markup` that match this SoupStrainer.
2305
2306        Used by the core _find_all() method, which is ultimately
2307        called by all find_* methods.
2308
2309        :param markup: A PageElement or a list of them.
2310        """
2311        # print('looking for %s in %s' % (self, markup))
2312        found = None
2313        # If given a list of items, scan it for a text element that
2314        # matches.
2315        if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)):
2316            for element in markup:
2317                if isinstance(element, NavigableString) \
2318                       and self.search(element):
2319                    found = element
2320                    break
2321        # If it's a Tag, make sure its name or attributes match.
2322        # Don't bother with Tags if we're searching for text.
2323        elif isinstance(markup, Tag):
2324            if not self.string or self.name or self.attrs:
2325                found = self.search_tag(markup)
2326        # If it's text, make sure the text matches.
2327        elif isinstance(markup, NavigableString) or \
2328                 isinstance(markup, str):
2329            if not self.name and not self.attrs and self._matches(markup, self.string):
2330                found = markup
2331        else:
2332            raise Exception(
2333                "I don't know how to match against a %s" % markup.__class__)
2334        return found
2335
2336    def _matches(self, markup, match_against, already_tried=None):
2337        # print(u"Matching %s against %s" % (markup, match_against))
2338        result = False
2339        if isinstance(markup, list) or isinstance(markup, tuple):
2340            # This should only happen when searching a multi-valued attribute
2341            # like 'class'.
2342            for item in markup:
2343                if self._matches(item, match_against):
2344                    return True
2345            # We didn't match any particular value of the multivalue
2346            # attribute, but maybe we match the attribute value when
2347            # considered as a string.
2348            if self._matches(' '.join(markup), match_against):
2349                return True
2350            return False
2351
2352        if match_against is True:
2353            # True matches any non-None value.
2354            return markup is not None
2355
2356        if isinstance(match_against, Callable):
2357            return match_against(markup)
2358
2359        # Custom callables take the tag as an argument, but all
2360        # other ways of matching match the tag name as a string.
2361        original_markup = markup
2362        if isinstance(markup, Tag):
2363            markup = markup.name
2364
2365        # Ensure that `markup` is either a Unicode string, or None.
2366        markup = self._normalize_search_value(markup)
2367
2368        if markup is None:
2369            # None matches None, False, an empty string, an empty list, and so on.
2370            return not match_against
2371
2372        if (hasattr(match_against, '__iter__')
2373            and not isinstance(match_against, str)):
2374            # We're asked to match against an iterable of items.
2375            # The markup must be match at least one item in the
2376            # iterable. We'll try each one in turn.
2377            #
2378            # To avoid infinite recursion we need to keep track of
2379            # items we've already seen.
2380            if not already_tried:
2381                already_tried = set()
2382            for item in match_against:
2383                if item.__hash__:
2384                    key = item
2385                else:
2386                    key = id(item)
2387                if key in already_tried:
2388                    continue
2389                else:
2390                    already_tried.add(key)
2391                    if self._matches(original_markup, item, already_tried):
2392                        return True
2393            else:
2394                return False
2395
2396        # Beyond this point we might need to run the test twice: once against
2397        # the tag's name and once against its prefixed name.
2398        match = False
2399
2400        if not match and isinstance(match_against, str):
2401            # Exact string match
2402            match = markup == match_against
2403
2404        if not match and hasattr(match_against, 'search'):
2405            # Regexp match
2406            return match_against.search(markup)
2407
2408        if (not match
2409            and isinstance(original_markup, Tag)
2410            and original_markup.prefix):
2411            # Try the whole thing again with the prefixed tag name.
2412            return self._matches(
2413                original_markup.prefix + ':' + original_markup.name, match_against
2414            )
2415
2416        return match
2417
2418
class ResultSet(list):
    """A plain list of search results that also remembers the
    SoupStrainer that produced it."""

    def __init__(self, source, result=()):
        """Constructor.

        :param source: A SoupStrainer.
        :param result: A list of PageElements.
        """
        super().__init__(result)
        self.source = source

    def __getattr__(self, key):
        """Raise a helpful exception to explain a common code fix.

        Python only calls this for attributes that normal lookup failed
        to find, so `source` and the list methods are unaffected.

        :param key: Name of the missing attribute.
        :raise AttributeError: Always.
        """
        message = (
            "ResultSet object has no attribute '%s'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?" % key
        )
        raise AttributeError(message)
2436