xref: /openbmc/openbmc/poky/bitbake/lib/bs4/element.py (revision 82c905dc)
1__license__ = "MIT"
2
3import collections.abc
4import re
5import sys
6import warnings
7from bs4.dammit import EntitySubstitution
8
9DEFAULT_OUTPUT_ENCODING = "utf-8"
10PY3K = (sys.version_info[0] > 2)
11
12whitespace_re = re.compile(r"\s+")
13
def _alias(attr):
    """Build a property that forwards reads and writes to ``attr``.

    Used to keep old attribute names working for backward compatibility.

    :param attr: Name of the real attribute the alias points at.
    :return: A ``property`` whose getter returns ``getattr(self, attr)``
        and whose setter stores the assigned value under ``attr``.
    """
    @property
    def alias(self):
        return getattr(self, attr)

    @alias.setter
    def alias(self, value):
        # A property setter is invoked as fset(instance, value), so the
        # value must be accepted here and forwarded to setattr().
        setattr(self, attr, value)
    return alias
24
25
class NamespacedAttribute(str):
    """An attribute name that remembers its prefix, local name and namespace.

    The string value is ``prefix`` alone when ``name`` is None, ``name``
    alone when ``prefix`` is None, and ``"prefix:name"`` otherwise.
    """

    def __new__(cls, prefix, name, namespace=None):
        if name is None:
            text = prefix
        elif prefix is None:
            # Not really namespaced.
            text = name
        else:
            text = prefix + ":" + name
        instance = str.__new__(cls, text)
        # Keep the individual parts available next to the combined string.
        instance.prefix = prefix
        instance.name = name
        instance.namespace = namespace
        return instance
40
class AttributeValueWithCharsetSubstitution(str):
    """A stand-in object for a character encoding specified in HTML.

    Common ``str`` base class for attribute values that mention a
    character encoding; it adds no behavior of its own.
    """
43
class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Placeholder for the value of a meta tag's 'charset' attribute.

    Parsing the markup '<meta charset="utf8">' yields one of these
    objects as the value of the 'charset' attribute.
    """

    def __new__(cls, original_value):
        instance = str.__new__(cls, original_value)
        # Remember the value exactly as it appeared in the markup.
        instance.original_value = original_value
        return instance

    def encode(self, encoding):
        """Return ``encoding`` itself: the whole value is the charset name."""
        return encoding
58
59
class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Placeholder for the value of a meta tag's 'content' attribute.

    Parsing markup such as:
     <meta http-equiv="content-type" content="text/html; charset=utf8">

    yields one of these objects as the value of the 'content' attribute,
    provided the value actually contains a ``charset=`` part.
    """

    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value):
        if cls.CHARSET_RE.search(original_value) is None:
            # Nothing to substitute later, so a plain str is enough.
            return str.__new__(str, original_value)

        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def encode(self, encoding):
        """Return the original value with its charset replaced by ``encoding``."""
        # Group 1 is the "charset=" lead-in; only the name after it changes.
        return self.CHARSET_RE.sub(
            lambda match: match.group(1) + encoding, self.original_value)
85
class HTMLAwareEntitySubstitution(EntitySubstitution):

    """Entity substitution that respects a couple of HTML quirks.

    The text directly inside <script> and <style> tags must not undergo
    entity substitution, so incoming NavigableString objects are checked
    for being a direct child of one of those tags before substituting.
    """

    cdata_containing_tags = {"script", "style"}

    preformatted_tags = {"pre"}

    @classmethod
    def _substitute_if_appropriate(cls, ns, f):
        """Return ``f(ns)``, or ``ns`` untouched when it sits directly
        inside one of ``cdata_containing_tags``."""
        inside_cdata_tag = (
            isinstance(ns, NavigableString)
            and ns.parent is not None
            and ns.parent.name in cls.cdata_containing_tags)
        if inside_cdata_tag:
            # Leave script/style text alone.
            return ns
        return f(ns)

    @classmethod
    def substitute_html(cls, ns):
        """HTML entity substitution, skipped for script/style text."""
        substitute = EntitySubstitution.substitute_html
        return cls._substitute_if_appropriate(ns, substitute)

    @classmethod
    def substitute_xml(cls, ns):
        """XML entity substitution, skipped for script/style text."""
        substitute = EntitySubstitution.substitute_xml
        return cls._substitute_if_appropriate(ns, substitute)
120
121class PageElement(object):
122    """Contains the navigational information for some part of the page
123    (either a tag or a piece of text)"""
124
    # There are four possible values for the "formatter" argument passed in
    # to methods like encode() and prettify():
    #
    # "html" - All Unicode characters with corresponding HTML entities
    #   are converted to those entities on output.
    # "minimal" - Bare ampersands and angle brackets are converted to
    #   XML entities: &amp; &lt; &gt;
    # None - The null formatter. Unicode characters are never
    #   converted to entities.  This is not recommended, but it's
    #   faster than "minimal".
    # A function - This function will be called on every string that
    #  needs to undergo entity substitution.
    #

    # In an HTML document, the default "html" and "minimal" functions
    # will leave the contents of <script> and <style> tags alone. For
    # an XML document, all tags will be given the same treatment.

    # Formatter name -> function, used when the element is in an HTML tree.
    HTML_FORMATTERS = {
        "html" : HTMLAwareEntitySubstitution.substitute_html,
        "minimal" : HTMLAwareEntitySubstitution.substitute_xml,
        None : None
        }

    # Formatter name -> function, used when the element is in an XML tree.
    XML_FORMATTERS = {
        "html" : EntitySubstitution.substitute_html,
        "minimal" : EntitySubstitution.substitute_xml,
        None : None
        }
154
155    def format_string(self, s, formatter='minimal'):
156        """Format the given string using the given formatter."""
157        if not isinstance(formatter, collections.abc.Callable):
158            formatter = self._formatter_for_name(formatter)
159        if formatter is None:
160            output = s
161        else:
162            output = formatter(s)
163        return output
164
165    @property
166    def _is_xml(self):
167        """Is this element part of an XML tree or an HTML tree?
168
169        This is used when mapping a formatter name ("minimal") to an
170        appropriate function (one that performs entity-substitution on
171        the contents of <script> and <style> tags, or not). It's
172        inefficient, but it should be called very rarely.
173        """
174        if self.parent is None:
175            # This is the top-level object. It should have .is_xml set
176            # from tree creation. If not, take a guess--BS is usually
177            # used on HTML markup.
178            return getattr(self, 'is_xml', False)
179        return self.parent._is_xml
180
181    def _formatter_for_name(self, name):
182        "Look up a formatter function based on its name and the tree."
183        if self._is_xml:
184            return self.XML_FORMATTERS.get(
185                name, EntitySubstitution.substitute_xml)
186        else:
187            return self.HTML_FORMATTERS.get(
188                name, HTMLAwareEntitySubstitution.substitute_xml)
189
190    def setup(self, parent=None, previous_element=None, next_element=None,
191              previous_sibling=None, next_sibling=None):
192        """Sets up the initial relations between this element and
193        other elements."""
194        self.parent = parent
195
196        self.previous_element = previous_element
197        if previous_element is not None:
198            self.previous_element.next_element = self
199
200        self.next_element = next_element
201        if self.next_element:
202            self.next_element.previous_element = self
203
204        self.next_sibling = next_sibling
205        if self.next_sibling:
206            self.next_sibling.previous_sibling = self
207
208        if (not previous_sibling
209            and self.parent is not None and self.parent.contents):
210            previous_sibling = self.parent.contents[-1]
211
212        self.previous_sibling = previous_sibling
213        if previous_sibling:
214            self.previous_sibling.next_sibling = self
215
216    nextSibling = _alias("next_sibling")  # BS3
217    previousSibling = _alias("previous_sibling")  # BS3
218
219    def replace_with(self, replace_with):
220        if not self.parent:
221            raise ValueError(
222                "Cannot replace one element with another when the"
223                "element to be replaced is not part of a tree.")
224        if replace_with is self:
225            return
226        if replace_with is self.parent:
227            raise ValueError("Cannot replace a Tag with its parent.")
228        old_parent = self.parent
229        my_index = self.parent.index(self)
230        self.extract()
231        old_parent.insert(my_index, replace_with)
232        return self
233    replaceWith = replace_with  # BS3
234
235    def unwrap(self):
236        my_parent = self.parent
237        if not self.parent:
238            raise ValueError(
239                "Cannot replace an element with its contents when that"
240                "element is not part of a tree.")
241        my_index = self.parent.index(self)
242        self.extract()
243        for child in reversed(self.contents[:]):
244            my_parent.insert(my_index, child)
245        return self
246    replace_with_children = unwrap
247    replaceWithChildren = unwrap  # BS3
248
249    def wrap(self, wrap_inside):
250        me = self.replace_with(wrap_inside)
251        wrap_inside.append(me)
252        return wrap_inside
253
    def extract(self):
        """Destructively rips this element out of the tree.

        Removes this element from its parent's ``contents`` (when it has
        a parent), splices the element together with its descendants out
        of the ``next_element``/``previous_element`` chain, and unlinks it
        from its siblings.

        :return: This element, with ``parent``, ``previous_element``,
            ``previous_sibling`` and ``next_sibling`` reset to None.
        """
        if self.parent is not None:
            del self.parent.contents[self.parent.index(self)]

        #Find the two elements that would be next to each other if
        #this element (and any children) hadn't been parsed. Connect
        #the two.
        last_child = self._last_descendant()
        next_element = last_child.next_element

        if (self.previous_element is not None and
            self.previous_element is not next_element):
            self.previous_element.next_element = next_element
        if next_element is not None and next_element is not self.previous_element:
            next_element.previous_element = self.previous_element
        # Cut the outward links at both ends of the removed run; links
        # between this element and its own descendants are left intact.
        self.previous_element = None
        last_child.next_element = None

        self.parent = None
        # Join the former siblings to each other.
        if (self.previous_sibling is not None
            and self.previous_sibling is not self.next_sibling):
            self.previous_sibling.next_sibling = self.next_sibling
        if (self.next_sibling is not None
            and self.next_sibling is not self.previous_sibling):
            self.next_sibling.previous_sibling = self.previous_sibling
        self.previous_sibling = self.next_sibling = None
        return self
282
283    def _last_descendant(self, is_initialized=True, accept_self=True):
284        "Finds the last element beneath this object to be parsed."
285        if is_initialized and self.next_sibling:
286            last_child = self.next_sibling.previous_element
287        else:
288            last_child = self
289            while isinstance(last_child, Tag) and last_child.contents:
290                last_child = last_child.contents[-1]
291        if not accept_self and last_child is self:
292            last_child = None
293        return last_child
294    # BS3: Not part of the API!
295    _lastRecursiveChild = _last_descendant
296
    def insert(self, position, new_child):
        """Insert ``new_child`` into this element's contents at ``position``.

        A plain ``str`` is wrapped in a NavigableString first. A child that
        already has a parent is extracted from it before being inserted.
        Besides updating ``self.contents``, the sibling links and the
        ``next_element``/``previous_element`` chain are rewired around the
        new child and its descendants.

        :param position: Index in ``self.contents``; values past the end
            are clamped to the end.
        :param new_child: The element or string to insert.
        :raises ValueError: If ``new_child`` is None or is this element.
        """
        if new_child is None:
            raise ValueError("Cannot insert None into a tag.")
        if new_child is self:
            raise ValueError("Cannot insert a tag into itself.")
        if (isinstance(new_child, str)
            and not isinstance(new_child, NavigableString)):
            new_child = NavigableString(new_child)

        position = min(position, len(self.contents))
        if hasattr(new_child, 'parent') and new_child.parent is not None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if new_child.parent is self:
                current_index = self.index(new_child)
                if current_index < position:
                    # We're moving this element further down the list
                    # of this object's children. That means that when
                    # we extract this element, our target index will
                    # jump down one.
                    position -= 1
            new_child.extract()

        new_child.parent = self
        previous_child = None
        if position == 0:
            # First child: it directly follows this element in document
            # order and has no previous sibling.
            new_child.previous_sibling = None
            new_child.previous_element = self
        else:
            previous_child = self.contents[position - 1]
            new_child.previous_sibling = previous_child
            new_child.previous_sibling.next_sibling = new_child
            new_child.previous_element = previous_child._last_descendant(False)
        if new_child.previous_element is not None:
            new_child.previous_element.next_element = new_child

        new_childs_last_element = new_child._last_descendant(False)

        if position >= len(self.contents):
            # Appending at the end: there is no next sibling here.
            new_child.next_sibling = None

            # Walk up the ancestors until one of them has a next sibling;
            # that sibling is what follows the new child in the document.
            parent = self
            parents_next_sibling = None
            while parents_next_sibling is None and parent is not None:
                parents_next_sibling = parent.next_sibling
                parent = parent.parent
                if parents_next_sibling is not None:
                    # We found the element that comes next in the document.
                    break
            if parents_next_sibling is not None:
                new_childs_last_element.next_element = parents_next_sibling
            else:
                # The last element of this tag is the last element in
                # the document.
                new_childs_last_element.next_element = None
        else:
            next_child = self.contents[position]
            new_child.next_sibling = next_child
            if new_child.next_sibling is not None:
                new_child.next_sibling.previous_sibling = new_child
            new_childs_last_element.next_element = next_child

        # Close the chain from the other side.
        if new_childs_last_element.next_element is not None:
            new_childs_last_element.next_element.previous_element = new_childs_last_element
        self.contents.insert(position, new_child)
362
363    def append(self, tag):
364        """Appends the given tag to the contents of this tag."""
365        self.insert(len(self.contents), tag)
366
367    def insert_before(self, predecessor):
368        """Makes the given element the immediate predecessor of this one.
369
370        The two elements will have the same parent, and the given element
371        will be immediately before this one.
372        """
373        if self is predecessor:
374            raise ValueError("Can't insert an element before itself.")
375        parent = self.parent
376        if parent is None:
377            raise ValueError(
378                "Element has no parent, so 'before' has no meaning.")
379        # Extract first so that the index won't be screwed up if they
380        # are siblings.
381        if isinstance(predecessor, PageElement):
382            predecessor.extract()
383        index = parent.index(self)
384        parent.insert(index, predecessor)
385
386    def insert_after(self, successor):
387        """Makes the given element the immediate successor of this one.
388
389        The two elements will have the same parent, and the given element
390        will be immediately after this one.
391        """
392        if self is successor:
393            raise ValueError("Can't insert an element after itself.")
394        parent = self.parent
395        if parent is None:
396            raise ValueError(
397                "Element has no parent, so 'after' has no meaning.")
398        # Extract first so that the index won't be screwed up if they
399        # are siblings.
400        if isinstance(successor, PageElement):
401            successor.extract()
402        index = parent.index(self)
403        parent.insert(index+1, successor)
404
405    def find_next(self, name=None, attrs={}, text=None, **kwargs):
406        """Returns the first item that matches the given criteria and
407        appears after this Tag in the document."""
408        return self._find_one(self.find_all_next, name, attrs, text, **kwargs)
409    findNext = find_next  # BS3
410
411    def find_all_next(self, name=None, attrs={}, text=None, limit=None,
412                    **kwargs):
413        """Returns all items that match the given criteria and appear
414        after this Tag in the document."""
415        return self._find_all(name, attrs, text, limit, self.next_elements,
416                             **kwargs)
417    findAllNext = find_all_next  # BS3
418
419    def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs):
420        """Returns the closest sibling to this Tag that matches the
421        given criteria and appears after this Tag in the document."""
422        return self._find_one(self.find_next_siblings, name, attrs, text,
423                             **kwargs)
424    findNextSibling = find_next_sibling  # BS3
425
426    def find_next_siblings(self, name=None, attrs={}, text=None, limit=None,
427                           **kwargs):
428        """Returns the siblings of this Tag that match the given
429        criteria and appear after this Tag in the document."""
430        return self._find_all(name, attrs, text, limit,
431                              self.next_siblings, **kwargs)
432    findNextSiblings = find_next_siblings   # BS3
433    fetchNextSiblings = find_next_siblings  # BS2
434
435    def find_previous(self, name=None, attrs={}, text=None, **kwargs):
436        """Returns the first item that matches the given criteria and
437        appears before this Tag in the document."""
438        return self._find_one(
439            self.find_all_previous, name, attrs, text, **kwargs)
440    findPrevious = find_previous  # BS3
441
442    def find_all_previous(self, name=None, attrs={}, text=None, limit=None,
443                        **kwargs):
444        """Returns all items that match the given criteria and appear
445        before this Tag in the document."""
446        return self._find_all(name, attrs, text, limit, self.previous_elements,
447                           **kwargs)
448    findAllPrevious = find_all_previous  # BS3
449    fetchPrevious = find_all_previous    # BS2
450
451    def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs):
452        """Returns the closest sibling to this Tag that matches the
453        given criteria and appears before this Tag in the document."""
454        return self._find_one(self.find_previous_siblings, name, attrs, text,
455                             **kwargs)
456    findPreviousSibling = find_previous_sibling  # BS3
457
458    def find_previous_siblings(self, name=None, attrs={}, text=None,
459                               limit=None, **kwargs):
460        """Returns the siblings of this Tag that match the given
461        criteria and appear before this Tag in the document."""
462        return self._find_all(name, attrs, text, limit,
463                              self.previous_siblings, **kwargs)
464    findPreviousSiblings = find_previous_siblings   # BS3
465    fetchPreviousSiblings = find_previous_siblings  # BS2
466
467    def find_parent(self, name=None, attrs={}, **kwargs):
468        """Returns the closest parent of this Tag that matches the given
469        criteria."""
470        # NOTE: We can't use _find_one because findParents takes a different
471        # set of arguments.
472        r = None
473        l = self.find_parents(name, attrs, 1, **kwargs)
474        if l:
475            r = l[0]
476        return r
477    findParent = find_parent  # BS3
478
479    def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
480        """Returns the parents of this Tag that match the given
481        criteria."""
482
483        return self._find_all(name, attrs, None, limit, self.parents,
484                             **kwargs)
485    findParents = find_parents   # BS3
486    fetchParents = find_parents  # BS2
487
    @property
    def next(self):
        """Read-only shorthand for ``next_element``."""
        return self.next_element
491
    @property
    def previous(self):
        """Read-only shorthand for ``previous_element``."""
        return self.previous_element
495
496    #These methods do the real heavy lifting.
497
498    def _find_one(self, method, name, attrs, text, **kwargs):
499        r = None
500        l = method(name, attrs, text, 1, **kwargs)
501        if l:
502            r = l[0]
503        return r
504
505    def _find_all(self, name, attrs, text, limit, generator, **kwargs):
506        "Iterates over a generator looking for things that match."
507
508        if text is None and 'string' in kwargs:
509            text = kwargs['string']
510            del kwargs['string']
511
512        if isinstance(name, SoupStrainer):
513            strainer = name
514        else:
515            strainer = SoupStrainer(name, attrs, text, **kwargs)
516
517        if text is None and not limit and not attrs and not kwargs:
518            if name is True or name is None:
519                # Optimization to find all tags.
520                result = (element for element in generator
521                          if isinstance(element, Tag))
522                return ResultSet(strainer, result)
523            elif isinstance(name, str):
524                # Optimization to find all tags with a given name.
525                result = (element for element in generator
526                          if isinstance(element, Tag)
527                            and element.name == name)
528                return ResultSet(strainer, result)
529        results = ResultSet(strainer)
530        while True:
531            try:
532                i = next(generator)
533            except StopIteration:
534                break
535            if i:
536                found = strainer.search(i)
537                if found:
538                    results.append(found)
539                    if limit and len(results) >= limit:
540                        break
541        return results
542
543    #These generators can be used to navigate starting from both
544    #NavigableStrings and Tags.
545    @property
546    def next_elements(self):
547        i = self.next_element
548        while i is not None:
549            yield i
550            i = i.next_element
551
552    @property
553    def next_siblings(self):
554        i = self.next_sibling
555        while i is not None:
556            yield i
557            i = i.next_sibling
558
559    @property
560    def previous_elements(self):
561        i = self.previous_element
562        while i is not None:
563            yield i
564            i = i.previous_element
565
566    @property
567    def previous_siblings(self):
568        i = self.previous_sibling
569        while i is not None:
570            yield i
571            i = i.previous_sibling
572
573    @property
574    def parents(self):
575        i = self.parent
576        while i is not None:
577            yield i
578            i = i.parent
579
580    # Methods for supporting CSS selectors.
581
    # Matches a bare tag name.
    tag_name_re = re.compile(r'^[a-zA-Z0-9][-.a-zA-Z0-9:_]*$')

    # Matches an attribute selector with an optional leading tag name:
    #
    # /^([a-zA-Z0-9][-.a-zA-Z0-9:_]*)?\[([\w-]+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
    #   \---------------------------/   \-----/\-------------/    \-------/
    #     |                               |         |               |
    #     |                               |         |           The value
    #     |                               |    ~,|,^,$,* or =
    #     |                            Attribute
    #    Tag (optional)
    #
    # The groups are named tag, attribute, operator and value.
    attribselect_re = re.compile(
        r'^(?P<tag>[a-zA-Z0-9][-.a-zA-Z0-9:_]*)?\[(?P<attribute>[\w-]+)(?P<operator>[=~\|\^\$\*]?)' +
        r'=?"?(?P<value>[^\]"]*)"?\]$'
        )
595
596    def _attr_value_as_string(self, value, default=None):
597        """Force an attribute value into a string representation.
598
599        A multi-valued attribute will be converted into a
600        space-separated stirng.
601        """
602        value = self.get(value, default)
603        if isinstance(value, list) or isinstance(value, tuple):
604            value =" ".join(value)
605        return value
606
607    def _tag_name_matches_and(self, function, tag_name):
608        if not tag_name:
609            return function
610        else:
611            def _match(tag):
612                return tag.name == tag_name and function(tag)
613            return _match
614
615    def _attribute_checker(self, operator, attribute, value=''):
616        """Create a function that performs a CSS selector operation.
617
618        Takes an operator, attribute and optional value. Returns a
619        function that will return True for elements that match that
620        combination.
621        """
622        if operator == '=':
623            # string representation of `attribute` is equal to `value`
624            return lambda el: el._attr_value_as_string(attribute) == value
625        elif operator == '~':
626            # space-separated list representation of `attribute`
627            # contains `value`
628            def _includes_value(element):
629                attribute_value = element.get(attribute, [])
630                if not isinstance(attribute_value, list):
631                    attribute_value = attribute_value.split()
632                return value in attribute_value
633            return _includes_value
634        elif operator == '^':
635            # string representation of `attribute` starts with `value`
636            return lambda el: el._attr_value_as_string(
637                attribute, '').startswith(value)
638        elif operator == '$':
639            # string represenation of `attribute` ends with `value`
640            return lambda el: el._attr_value_as_string(
641                attribute, '').endswith(value)
642        elif operator == '*':
643            # string representation of `attribute` contains `value`
644            return lambda el: value in el._attr_value_as_string(attribute, '')
645        elif operator == '|':
646            # string representation of `attribute` is either exactly
647            # `value` or starts with `value` and then a dash.
648            def _is_or_starts_with_dash(element):
649                attribute_value = element._attr_value_as_string(attribute, '')
650                return (attribute_value == value or attribute_value.startswith(
651                        value + '-'))
652            return _is_or_starts_with_dash
653        else:
654            return lambda el: el.has_attr(attribute)
655
656    # Old non-property versions of the generators, for backwards
657    # compatibility with BS3.
    def nextGenerator(self):
        """Method form of the ``next_elements`` property."""
        return self.next_elements
660
    def nextSiblingGenerator(self):
        """Method form of the ``next_siblings`` property."""
        return self.next_siblings
663
    def previousGenerator(self):
        """Method form of the ``previous_elements`` property."""
        return self.previous_elements
666
    def previousSiblingGenerator(self):
        """Method form of the ``previous_siblings`` property."""
        return self.previous_siblings
669
    def parentGenerator(self):
        """Method form of the ``parents`` property."""
        return self.parents
672
673
class NavigableString(str, PageElement):
    """A string that is also a PageElement.

    ``PREFIX`` and ``SUFFIX`` are placed around the formatted text by
    ``output_ready``; both are empty here.
    """

    PREFIX = ''
    SUFFIX = ''

    def __new__(cls, value):
        """Create a new NavigableString.

        During unpickling this method receives the string encoded in
        DEFAULT_OUTPUT_ENCODING. That encoding has to be handed to the
        superclass's __new__, otherwise the superclass won't know how to
        handle non-ASCII characters.
        """
        if isinstance(value, str):
            instance = str.__new__(cls, value)
        else:
            instance = str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
        # Start with every tree link set to None.
        instance.setup()
        return instance

    def __copy__(self):
        """Return a string with the same contents and class as this one,
        but not connected to the parse tree.
        """
        return type(self)(self)

    def __getnewargs__(self):
        return (str(self),)

    def __getattr__(self, attr):
        """text.string gives you text. This exists for backwards
        compatibility for Navigable*String, and for CData* it lets you
        get the string without the CData wrapper."""
        if attr != 'string':
            raise AttributeError(
                "'%s' object has no attribute '%s'" % (
                    self.__class__.__name__, attr))
        return self

    def output_ready(self, formatter="minimal"):
        """Return the formatted string wrapped in PREFIX and SUFFIX."""
        formatted = self.format_string(self, formatter)
        return self.PREFIX + formatted + self.SUFFIX

    @property
    def name(self):
        """Always None."""
        return None

    @name.setter
    def name(self, name):
        raise AttributeError("A NavigableString cannot be given a name.")
725
class PreformattedString(NavigableString):
    """A NavigableString that bypasses the normal formatting rules.

    The formatter is still called with the string (so that any side
    effects happen), but whatever it returns is discarded.
    """

    def output_ready(self, formatter="minimal"):
        """Call the formatter for its side effects only, then return the
        unmodified string wrapped in PREFIX and SUFFIX."""
        self.format_string(self, formatter)
        opening = self.PREFIX
        closing = self.SUFFIX
        return opening + self + closing
738
class CData(PreformattedString):
    """A preformatted string output between ``<![CDATA[`` and ``]]>``."""

    PREFIX = '<![CDATA['
    SUFFIX = ']]>'
743
class ProcessingInstruction(PreformattedString):
    """A preformatted string output between ``<?`` and ``>``."""

    PREFIX = '<?'
    SUFFIX = '>'
748
class Comment(PreformattedString):
    """A preformatted string output between ``<!--`` and ``-->``."""

    PREFIX = '<!--'
    SUFFIX = '-->'
753
754
class Declaration(PreformattedString):
    """A preformatted string output between ``<?`` and ``?>``."""
    PREFIX = '<?'
    SUFFIX = '?>'
758
759
class Doctype(PreformattedString):
    """A preformatted string output between ``<!DOCTYPE `` and ``>``
    followed by a newline."""

    @classmethod
    def for_name_and_ids(cls, name, pub_id, system_id):
        """Build a doctype string from a name and optional identifiers.

        :param name: Document type name; None or '' contributes nothing.
        :param pub_id: Public identifier, or None.
        :param system_id: System identifier, or None.
        :return: An instance of ``cls`` (so subclasses get their own
            type back) holding the assembled declaration text.
        """
        value = name or ''
        if pub_id is not None:
            value += ' PUBLIC "%s"' % pub_id
            # After a public id, the system id is added as a bare
            # quoted string.
            if system_id is not None:
                value += ' "%s"' % system_id
        elif system_id is not None:
            value += ' SYSTEM "%s"' % system_id

        return cls(value)

    PREFIX = '<!DOCTYPE '
    SUFFIX = '>\n'
776
777
778class Tag(PageElement):
779
780    """Represents a found HTML tag with its attributes and contents."""
781
782    def __init__(self, parser=None, builder=None, name=None, namespace=None,
783                 prefix=None, attrs=None, parent=None, previous=None):
784        "Basic constructor."
785
786        if parser is None:
787            self.parser_class = None
788        else:
789            # We don't actually store the parser object: that lets extracted
790            # chunks be garbage-collected.
791            self.parser_class = parser.__class__
792        if name is None:
793            raise ValueError("No value provided for new tag's name.")
794        self.name = name
795        self.namespace = namespace
796        self.prefix = prefix
797        if attrs is None:
798            attrs = {}
799        elif attrs:
800            if builder is not None and builder.cdata_list_attributes:
801                attrs = builder._replace_cdata_list_attribute_values(
802                    self.name, attrs)
803            else:
804                attrs = dict(attrs)
805        else:
806            attrs = dict(attrs)
807        self.attrs = attrs
808        self.contents = []
809        self.setup(parent, previous)
810        self.hidden = False
811
812        # Set up any substitutions, such as the charset in a META tag.
813        if builder is not None:
814            builder.set_up_substitutions(self)
815            self.can_be_empty_element = builder.can_be_empty_element(name)
816        else:
817            self.can_be_empty_element = False
818
819    parserClass = _alias("parser_class")  # BS3
820
821    def __copy__(self):
822        """A copy of a Tag is a new Tag, unconnected to the parse tree.
823        Its contents are a copy of the old Tag's contents.
824        """
825        clone = type(self)(None, self.builder, self.name, self.namespace,
826                           self.nsprefix, self.attrs)
827        for attr in ('can_be_empty_element', 'hidden'):
828            setattr(clone, attr, getattr(self, attr))
829        for child in self.contents:
830            clone.append(child.__copy__())
831        return clone
832
833    @property
834    def is_empty_element(self):
835        """Is this tag an empty-element tag? (aka a self-closing tag)
836
837        A tag that has contents is never an empty-element tag.
838
839        A tag that has no contents may or may not be an empty-element
840        tag. It depends on the builder used to create the tag. If the
841        builder has a designated list of empty-element tags, then only
842        a tag whose name shows up in that list is considered an
843        empty-element tag.
844
845        If the builder has no designated list of empty-element tags,
846        then any tag with no contents is an empty-element tag.
847        """
848        return len(self.contents) == 0 and self.can_be_empty_element
849    isSelfClosing = is_empty_element  # BS3
850
851    @property
852    def string(self):
853        """Convenience property to get the single string within this tag.
854
855        :Return: If this tag has a single string child, return value
856         is that string. If this tag has no children, or more than one
857         child, return value is None. If this tag has one child tag,
858         return value is the 'string' attribute of the child tag,
859         recursively.
860        """
861        if len(self.contents) != 1:
862            return None
863        child = self.contents[0]
864        if isinstance(child, NavigableString):
865            return child
866        return child.string
867
868    @string.setter
869    def string(self, string):
870        self.clear()
871        self.append(string.__class__(string))
872
873    def _all_strings(self, strip=False, types=(NavigableString, CData)):
874        """Yield all strings of certain classes, possibly stripping them.
875
876        By default, yields only NavigableString and CData objects. So
877        no comments, processing instructions, etc.
878        """
879        for descendant in self.descendants:
880            if (
881                (types is None and not isinstance(descendant, NavigableString))
882                or
883                (types is not None and type(descendant) not in types)):
884                continue
885            if strip:
886                descendant = descendant.strip()
887                if len(descendant) == 0:
888                    continue
889            yield descendant
890
891    strings = property(_all_strings)
892
893    @property
894    def stripped_strings(self):
895        for string in self._all_strings(True):
896            yield string
897
898    def get_text(self, separator="", strip=False,
899                 types=(NavigableString, CData)):
900        """
901        Get all child strings, concatenated using the given separator.
902        """
903        return separator.join([s for s in self._all_strings(
904                    strip, types=types)])
905    getText = get_text
906    text = property(get_text)
907
908    def decompose(self):
909        """Recursively destroys the contents of this tree."""
910        self.extract()
911        i = self
912        while i is not None:
913            next = i.next_element
914            i.__dict__.clear()
915            i.contents = []
916            i = next
917
918    def clear(self, decompose=False):
919        """
920        Extract all children. If decompose is True, decompose instead.
921        """
922        if decompose:
923            for element in self.contents[:]:
924                if isinstance(element, Tag):
925                    element.decompose()
926                else:
927                    element.extract()
928        else:
929            for element in self.contents[:]:
930                element.extract()
931
932    def index(self, element):
933        """
934        Find the index of a child by identity, not value. Avoids issues with
935        tag.contents.index(element) getting the index of equal elements.
936        """
937        for i, child in enumerate(self.contents):
938            if child is element:
939                return i
940        raise ValueError("Tag.index: element not in tag")
941
942    def get(self, key, default=None):
943        """Returns the value of the 'key' attribute for the tag, or
944        the value given for 'default' if it doesn't have that
945        attribute."""
946        return self.attrs.get(key, default)
947
948    def has_attr(self, key):
949        return key in self.attrs
950
951    def __hash__(self):
952        return str(self).__hash__()
953
954    def __getitem__(self, key):
955        """tag[key] returns the value of the 'key' attribute for the tag,
956        and throws an exception if it's not there."""
957        return self.attrs[key]
958
959    def __iter__(self):
960        "Iterating over a tag iterates over its contents."
961        return iter(self.contents)
962
963    def __len__(self):
964        "The length of a tag is the length of its list of contents."
965        return len(self.contents)
966
967    def __contains__(self, x):
968        return x in self.contents
969
970    def __bool__(self):
971        "A tag is non-None even if it has no contents."
972        return True
973
974    def __setitem__(self, key, value):
975        """Setting tag[key] sets the value of the 'key' attribute for the
976        tag."""
977        self.attrs[key] = value
978
979    def __delitem__(self, key):
980        "Deleting tag[key] deletes all 'key' attributes for the tag."
981        self.attrs.pop(key, None)
982
983    def __call__(self, *args, **kwargs):
984        """Calling a tag like a function is the same as calling its
985        find_all() method. Eg. tag('a') returns a list of all the A tags
986        found within this tag."""
987        return self.find_all(*args, **kwargs)
988
989    def __getattr__(self, tag):
990        #print "Getattr %s.%s" % (self.__class__, tag)
991        if len(tag) > 3 and tag.endswith('Tag'):
992            # BS3: soup.aTag -> "soup.find("a")
993            tag_name = tag[:-3]
994            warnings.warn(
995                '.%sTag is deprecated, use .find("%s") instead.' % (
996                    tag_name, tag_name))
997            return self.find(tag_name)
998        # We special case contents to avoid recursion.
999        elif not tag.startswith("__") and not tag=="contents":
1000            return self.find(tag)
1001        raise AttributeError(
1002            "'%s' object has no attribute '%s'" % (self.__class__, tag))
1003
1004    def __eq__(self, other):
1005        """Returns true iff this tag has the same name, the same attributes,
1006        and the same contents (recursively) as the given tag."""
1007        if self is other:
1008            return True
1009        if (not hasattr(other, 'name') or
1010            not hasattr(other, 'attrs') or
1011            not hasattr(other, 'contents') or
1012            self.name != other.name or
1013            self.attrs != other.attrs or
1014            len(self) != len(other)):
1015            return False
1016        for i, my_child in enumerate(self.contents):
1017            if my_child != other.contents[i]:
1018                return False
1019        return True
1020
1021    def __ne__(self, other):
1022        """Returns true iff this tag is not identical to the other tag,
1023        as defined in __eq__."""
1024        return not self == other
1025
1026    def __repr__(self, encoding="unicode-escape"):
1027        """Renders this tag as a string."""
1028        if PY3K:
1029            # "The return value must be a string object", i.e. Unicode
1030            return self.decode()
1031        else:
1032            # "The return value must be a string object", i.e. a bytestring.
1033            # By convention, the return value of __repr__ should also be
1034            # an ASCII string.
1035            return self.encode(encoding)
1036
1037    def __unicode__(self):
1038        return self.decode()
1039
1040    def __str__(self):
1041        if PY3K:
1042            return self.decode()
1043        else:
1044            return self.encode()
1045
1046    if PY3K:
1047        __str__ = __repr__ = __unicode__
1048
1049    def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
1050               indent_level=None, formatter="minimal",
1051               errors="xmlcharrefreplace"):
1052        # Turn the data structure into Unicode, then encode the
1053        # Unicode.
1054        u = self.decode(indent_level, encoding, formatter)
1055        return u.encode(encoding, errors)
1056
1057    def _should_pretty_print(self, indent_level):
1058        """Should this tag be pretty-printed?"""
1059        return (
1060            indent_level is not None and
1061            (self.name not in HTMLAwareEntitySubstitution.preformatted_tags
1062             or self._is_xml))
1063
1064    def decode(self, indent_level=None,
1065               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
1066               formatter="minimal"):
1067        """Returns a Unicode representation of this tag and its contents.
1068
1069        :param eventual_encoding: The tag is destined to be
1070           encoded into this encoding. This method is _not_
1071           responsible for performing that encoding. This information
1072           is passed in so that it can be substituted in if the
1073           document contains a <META> tag that mentions the document's
1074           encoding.
1075        """
1076
1077        # First off, turn a string formatter into a function. This
1078        # will stop the lookup from happening over and over again.
1079        if not isinstance(formatter, collections.abc.Callable):
1080            formatter = self._formatter_for_name(formatter)
1081
1082        attrs = []
1083        if self.attrs:
1084            for key, val in sorted(self.attrs.items()):
1085                if val is None:
1086                    decoded = key
1087                else:
1088                    if isinstance(val, list) or isinstance(val, tuple):
1089                        val = ' '.join(val)
1090                    elif not isinstance(val, str):
1091                        val = str(val)
1092                    elif (
1093                        isinstance(val, AttributeValueWithCharsetSubstitution)
1094                        and eventual_encoding is not None):
1095                        val = val.encode(eventual_encoding)
1096
1097                    text = self.format_string(val, formatter)
1098                    decoded = (
1099                        str(key) + '='
1100                        + EntitySubstitution.quoted_attribute_value(text))
1101                attrs.append(decoded)
1102        close = ''
1103        closeTag = ''
1104
1105        prefix = ''
1106        if self.prefix:
1107            prefix = self.prefix + ":"
1108
1109        if self.is_empty_element:
1110            close = '/'
1111        else:
1112            closeTag = '</%s%s>' % (prefix, self.name)
1113
1114        pretty_print = self._should_pretty_print(indent_level)
1115        space = ''
1116        indent_space = ''
1117        if indent_level is not None:
1118            indent_space = (' ' * (indent_level - 1))
1119        if pretty_print:
1120            space = indent_space
1121            indent_contents = indent_level + 1
1122        else:
1123            indent_contents = None
1124        contents = self.decode_contents(
1125            indent_contents, eventual_encoding, formatter)
1126
1127        if self.hidden:
1128            # This is the 'document root' object.
1129            s = contents
1130        else:
1131            s = []
1132            attribute_string = ''
1133            if attrs:
1134                attribute_string = ' ' + ' '.join(attrs)
1135            if indent_level is not None:
1136                # Even if this particular tag is not pretty-printed,
1137                # we should indent up to the start of the tag.
1138                s.append(indent_space)
1139            s.append('<%s%s%s%s>' % (
1140                    prefix, self.name, attribute_string, close))
1141            if pretty_print:
1142                s.append("\n")
1143            s.append(contents)
1144            if pretty_print and contents and contents[-1] != "\n":
1145                s.append("\n")
1146            if pretty_print and closeTag:
1147                s.append(space)
1148            s.append(closeTag)
1149            if indent_level is not None and closeTag and self.next_sibling:
1150                # Even if this particular tag is not pretty-printed,
1151                # we're now done with the tag, and we should add a
1152                # newline if appropriate.
1153                s.append("\n")
1154            s = ''.join(s)
1155        return s
1156
1157    def prettify(self, encoding=None, formatter="minimal"):
1158        if encoding is None:
1159            return self.decode(True, formatter=formatter)
1160        else:
1161            return self.encode(encoding, True, formatter=formatter)
1162
1163    def decode_contents(self, indent_level=None,
1164                       eventual_encoding=DEFAULT_OUTPUT_ENCODING,
1165                       formatter="minimal"):
1166        """Renders the contents of this tag as a Unicode string.
1167
1168        :param indent_level: Each line of the rendering will be
1169           indented this many spaces.
1170
1171        :param eventual_encoding: The tag is destined to be
1172           encoded into this encoding. This method is _not_
1173           responsible for performing that encoding. This information
1174           is passed in so that it can be substituted in if the
1175           document contains a <META> tag that mentions the document's
1176           encoding.
1177
1178        :param formatter: The output formatter responsible for converting
1179           entities to Unicode characters.
1180        """
1181        # First off, turn a string formatter into a function. This
1182        # will stop the lookup from happening over and over again.
1183        if not isinstance(formatter, collections.abc.Callable):
1184            formatter = self._formatter_for_name(formatter)
1185
1186        pretty_print = (indent_level is not None)
1187        s = []
1188        for c in self:
1189            text = None
1190            if isinstance(c, NavigableString):
1191                text = c.output_ready(formatter)
1192            elif isinstance(c, Tag):
1193                s.append(c.decode(indent_level, eventual_encoding,
1194                                  formatter))
1195            if text and indent_level and not self.name == 'pre':
1196                text = text.strip()
1197            if text:
1198                if pretty_print and not self.name == 'pre':
1199                    s.append(" " * (indent_level - 1))
1200                s.append(text)
1201                if pretty_print and not self.name == 'pre':
1202                    s.append("\n")
1203        return ''.join(s)
1204
1205    def encode_contents(
1206        self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
1207        formatter="minimal"):
1208        """Renders the contents of this tag as a bytestring.
1209
1210        :param indent_level: Each line of the rendering will be
1211           indented this many spaces.
1212
1213        :param eventual_encoding: The bytestring will be in this encoding.
1214
1215        :param formatter: The output formatter responsible for converting
1216           entities to Unicode characters.
1217        """
1218
1219        contents = self.decode_contents(indent_level, encoding, formatter)
1220        return contents.encode(encoding)
1221
1222    # Old method for BS3 compatibility
1223    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
1224                       prettyPrint=False, indentLevel=0):
1225        if not prettyPrint:
1226            indentLevel = None
1227        return self.encode_contents(
1228            indent_level=indentLevel, encoding=encoding)
1229
1230    #Soup methods
1231
1232    def find(self, name=None, attrs={}, recursive=True, text=None,
1233             **kwargs):
1234        """Return only the first child of this Tag matching the given
1235        criteria."""
1236        r = None
1237        l = self.find_all(name, attrs, recursive, text, 1, **kwargs)
1238        if l:
1239            r = l[0]
1240        return r
1241    findChild = find
1242
1243    def find_all(self, name=None, attrs={}, recursive=True, text=None,
1244                 limit=None, **kwargs):
1245        """Extracts a list of Tag objects that match the given
1246        criteria.  You can specify the name of the Tag and any
1247        attributes you want the Tag to have.
1248
1249        The value of a key-value pair in the 'attrs' map can be a
1250        string, a list of strings, a regular expression object, or a
1251        callable that takes a string and returns whether or not the
1252        string matches for some custom definition of 'matches'. The
1253        same is true of the tag name."""
1254
1255        generator = self.descendants
1256        if not recursive:
1257            generator = self.children
1258        return self._find_all(name, attrs, text, limit, generator, **kwargs)
1259    findAll = find_all       # BS3
1260    findChildren = find_all  # BS2
1261
1262    #Generator methods
1263    @property
1264    def children(self):
1265        # return iter() to make the purpose of the method clear
1266        return iter(self.contents)  # XXX This seems to be untested.
1267
1268    @property
1269    def descendants(self):
1270        if not len(self.contents):
1271            return
1272        stopNode = self._last_descendant().next_element
1273        current = self.contents[0]
1274        while current is not stopNode:
1275            yield current
1276            current = current.next_element
1277
1278    # CSS selector code
1279
1280    _selector_combinators = ['>', '+', '~']
1281    _select_debug = False
1282    def select_one(self, selector):
1283        """Perform a CSS selection operation on the current element."""
1284        value = self.select(selector, limit=1)
1285        if value:
1286            return value[0]
1287        return None
1288
1289    def select(self, selector, _candidate_generator=None, limit=None):
1290        """Perform a CSS selection operation on the current element."""
1291
1292        # Handle grouping selectors if ',' exists, ie: p,a
1293        if ',' in selector:
1294            context = []
1295            for partial_selector in selector.split(','):
1296                partial_selector = partial_selector.strip()
1297                if partial_selector == '':
1298                    raise ValueError('Invalid group selection syntax: %s' % selector)
1299                candidates = self.select(partial_selector, limit=limit)
1300                for candidate in candidates:
1301                    if candidate not in context:
1302                        context.append(candidate)
1303
1304                if limit and len(context) >= limit:
1305                    break
1306            return context
1307
1308        tokens = selector.split()
1309        current_context = [self]
1310
1311        if tokens[-1] in self._selector_combinators:
1312            raise ValueError(
1313                'Final combinator "%s" is missing an argument.' % tokens[-1])
1314
1315        if self._select_debug:
1316            print('Running CSS selector "%s"' % selector)
1317
1318        for index, token in enumerate(tokens):
1319            new_context = []
1320            new_context_ids = set([])
1321
1322            if tokens[index-1] in self._selector_combinators:
1323                # This token was consumed by the previous combinator. Skip it.
1324                if self._select_debug:
1325                    print('  Token was consumed by the previous combinator.')
1326                continue
1327
1328            if self._select_debug:
1329                print(' Considering token "%s"' % token)
1330            recursive_candidate_generator = None
1331            tag_name = None
1332
1333            # Each operation corresponds to a checker function, a rule
1334            # for determining whether a candidate matches the
1335            # selector. Candidates are generated by the active
1336            # iterator.
1337            checker = None
1338
1339            m = self.attribselect_re.match(token)
1340            if m is not None:
1341                # Attribute selector
1342                tag_name, attribute, operator, value = m.groups()
1343                checker = self._attribute_checker(operator, attribute, value)
1344
1345            elif '#' in token:
1346                # ID selector
1347                tag_name, tag_id = token.split('#', 1)
1348                def id_matches(tag):
1349                    return tag.get('id', None) == tag_id
1350                checker = id_matches
1351
1352            elif '.' in token:
1353                # Class selector
1354                tag_name, klass = token.split('.', 1)
1355                classes = set(klass.split('.'))
1356                def classes_match(candidate):
1357                    return classes.issubset(candidate.get('class', []))
1358                checker = classes_match
1359
1360            elif ':' in token:
1361                # Pseudo-class
1362                tag_name, pseudo = token.split(':', 1)
1363                if tag_name == '':
1364                    raise ValueError(
1365                        "A pseudo-class must be prefixed with a tag name.")
1366                pseudo_attributes = re.match(r'([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
1367                found = []
1368                if pseudo_attributes is None:
1369                    pseudo_type = pseudo
1370                    pseudo_value = None
1371                else:
1372                    pseudo_type, pseudo_value = pseudo_attributes.groups()
1373                if pseudo_type == 'nth-of-type':
1374                    try:
1375                        pseudo_value = int(pseudo_value)
1376                    except:
1377                        raise NotImplementedError(
1378                            'Only numeric values are currently supported for the nth-of-type pseudo-class.')
1379                    if pseudo_value < 1:
1380                        raise ValueError(
1381                            'nth-of-type pseudo-class value must be at least 1.')
1382                    class Counter(object):
1383                        def __init__(self, destination):
1384                            self.count = 0
1385                            self.destination = destination
1386
1387                        def nth_child_of_type(self, tag):
1388                            self.count += 1
1389                            if self.count == self.destination:
1390                                return True
1391                            if self.count > self.destination:
1392                                # Stop the generator that's sending us
1393                                # these things.
1394                                raise StopIteration()
1395                            return False
1396                    checker = Counter(pseudo_value).nth_child_of_type
1397                else:
1398                    raise NotImplementedError(
1399                        'Only the following pseudo-classes are implemented: nth-of-type.')
1400
1401            elif token == '*':
1402                # Star selector -- matches everything
1403                pass
1404            elif token == '>':
1405                # Run the next token as a CSS selector against the
1406                # direct children of each tag in the current context.
1407                recursive_candidate_generator = lambda tag: tag.children
1408            elif token == '~':
1409                # Run the next token as a CSS selector against the
1410                # siblings of each tag in the current context.
1411                recursive_candidate_generator = lambda tag: tag.next_siblings
1412            elif token == '+':
1413                # For each tag in the current context, run the next
1414                # token as a CSS selector against the tag's next
1415                # sibling that's a tag.
1416                def next_tag_sibling(tag):
1417                    yield tag.find_next_sibling(True)
1418                recursive_candidate_generator = next_tag_sibling
1419
1420            elif self.tag_name_re.match(token):
1421                # Just a tag name.
1422                tag_name = token
1423            else:
1424                raise ValueError(
1425                    'Unsupported or invalid CSS selector: "%s"' % token)
1426            if recursive_candidate_generator:
1427                # This happens when the selector looks like  "> foo".
1428                #
1429                # The generator calls select() recursively on every
1430                # member of the current context, passing in a different
1431                # candidate generator and a different selector.
1432                #
1433                # In the case of "> foo", the candidate generator is
1434                # one that yields a tag's direct children (">"), and
1435                # the selector is "foo".
1436                next_token = tokens[index+1]
1437                def recursive_select(tag):
1438                    if self._select_debug:
1439                        print('    Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs))
1440                        print('-' * 40)
1441                    for i in tag.select(next_token, recursive_candidate_generator):
1442                        if self._select_debug:
1443                            print('(Recursive select picked up candidate %s %s)' % (i.name, i.attrs))
1444                        yield i
1445                    if self._select_debug:
1446                        print('-' * 40)
1447                _use_candidate_generator = recursive_select
1448            elif _candidate_generator is None:
1449                # By default, a tag's candidates are all of its
1450                # children. If tag_name is defined, only yield tags
1451                # with that name.
1452                if self._select_debug:
1453                    if tag_name:
1454                        check = "[any]"
1455                    else:
1456                        check = tag_name
1457                    print('   Default candidate generator, tag name="%s"' % check)
1458                if self._select_debug:
1459                    # This is redundant with later code, but it stops
1460                    # a bunch of bogus tags from cluttering up the
1461                    # debug log.
1462                    def default_candidate_generator(tag):
1463                        for child in tag.descendants:
1464                            if not isinstance(child, Tag):
1465                                continue
1466                            if tag_name and not child.name == tag_name:
1467                                continue
1468                            yield child
1469                    _use_candidate_generator = default_candidate_generator
1470                else:
1471                    _use_candidate_generator = lambda tag: tag.descendants
1472            else:
1473                _use_candidate_generator = _candidate_generator
1474
1475            count = 0
1476            for tag in current_context:
1477                if self._select_debug:
1478                    print("    Running candidate generator on %s %s" % (
1479                        tag.name, repr(tag.attrs)))
1480                for candidate in _use_candidate_generator(tag):
1481                    if not isinstance(candidate, Tag):
1482                        continue
1483                    if tag_name and candidate.name != tag_name:
1484                        continue
1485                    if checker is not None:
1486                        try:
1487                            result = checker(candidate)
1488                        except StopIteration:
1489                            # The checker has decided we should no longer
1490                            # run the generator.
1491                            break
1492                    if checker is None or result:
1493                        if self._select_debug:
1494                            print("     SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)))
1495                        if id(candidate) not in new_context_ids:
1496                            # If a tag matches a selector more than once,
1497                            # don't include it in the context more than once.
1498                            new_context.append(candidate)
1499                            new_context_ids.add(id(candidate))
1500                            if limit and len(new_context) >= limit:
1501                                break
1502                    elif self._select_debug:
1503                        print("     FAILURE %s %s" % (candidate.name, repr(candidate.attrs)))
1504
1505
1506            current_context = new_context
1507
1508        if self._select_debug:
1509            print("Final verdict:")
1510            for i in current_context:
1511                print(" %s %s" % (i.name, i.attrs))
1512        return current_context
1513
1514    # Old names for backwards compatibility
1515    def childGenerator(self):
1516        return self.children
1517
1518    def recursiveChildGenerator(self):
1519        return self.descendants
1520
1521    def has_key(self, key):
1522        """This was kind of misleading because has_key() (attributes)
1523        was different from __in__ (contents). has_key() is gone in
1524        Python 3, anyway."""
1525        warnings.warn('has_key is deprecated. Use has_attr("%s") instead.' % (
1526                key))
1527        return self.has_attr(key)
1528
1529# Next, a couple classes to represent queries and their results.
class SoupStrainer(object):
    """Encapsulates a number of ways of matching a markup element (tag or
    text).

    A strainer holds three normalized criteria: ``name`` (tag name),
    ``attrs`` (attribute name -> criterion) and ``text``. Each criterion
    may be a string, a regular expression, a callable, a boolean, None,
    or a list of those.
    """

    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        """Build a strainer from the given criteria.

        :param name: Criterion for the tag name.
        :param attrs: A dict of attribute criteria. Any non-dict value is
            treated as a criterion for the 'class' attribute. The default
            dict is never mutated (it is either copied or only read).
        :param text: Criterion for the text of an element.
        :param kwargs: Extra attribute criteria. ``class_`` is accepted as
            a spelling of 'class' and overrides a non-dict ``attrs``.
        """
        self.name = self._normalize_search_value(name)
        if not isinstance(attrs, dict):
            # Treat a non-dict value for attrs as a search for the 'class'
            # attribute.
            kwargs['class'] = attrs
            attrs = None

        if 'class_' in kwargs:
            # Treat class_="foo" as a search for the 'class'
            # attribute, overriding any non-dict value for attrs.
            kwargs['class'] = kwargs.pop('class_')

        if kwargs:
            if attrs:
                # Copy so the caller's dict is left untouched.
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs

        self.attrs = {
            key: self._normalize_search_value(value)
            for key, value in attrs.items()
        }
        self.text = self._normalize_search_value(text)

    def _normalize_search_value(self, value):
        """Coerce a search criterion into a canonical form.

        :param value: The raw criterion.
        :return: ``value`` unchanged if it is a string, a callable, an
            object with a ``match`` attribute (e.g. a compiled regular
            expression), a boolean or None; a decoded string for bytes; a
            list for any other iterable; otherwise ``str(value)``.
        """
        # Leave it alone if it's a Unicode string, a callable, a
        # regular expression, a boolean, or None.
        if (isinstance(value, (str, bool, collections.abc.Callable))
                or hasattr(value, 'match')
                or value is None):
            return value

        # If it's a bytestring, convert it to Unicode, treating it as UTF-8.
        if isinstance(value, bytes):
            return value.decode("utf8")

        # If it's listlike, convert it into a list of strings.
        if hasattr(value, '__iter__'):
            normalized = []
            for item in value:
                if (hasattr(item, '__iter__')
                        and not isinstance(item, (bytes, str))):
                    # This is almost certainly the user's mistake. In the
                    # interests of avoiding infinite loops, we'll let
                    # it through as-is rather than doing a recursive call.
                    normalized.append(item)
                else:
                    normalized.append(self._normalize_search_value(item))
            return normalized

        # Otherwise, convert it into a Unicode string.
        return str(value)

    def __str__(self):
        """Describe the strainer: the text criterion when it is truthy,
        otherwise ``name|attrs``.

        The text criterion may be a regular expression, a list, a callable
        or ``True``; it is passed through ``str()`` because ``__str__``
        must return a real string or Python raises TypeError.
        """
        if self.text:
            return str(self.text)
        return "%s|%s" % (self.name, self.attrs)

    def _attrs_match(self, markup_attrs):
        """Check every attribute criterion against ``markup_attrs``.

        :param markup_attrs: Either an object with a ``get`` method (a
            mapping or a Tag) or an iterable of ``(name, value)`` pairs.
            It is not touched at all when there are no attribute criteria.
        :return: True if all criteria in ``self.attrs`` are satisfied.
        """
        if not self.attrs:
            return True
        if hasattr(markup_attrs, 'get'):
            attr_map = markup_attrs
        else:
            # Build the lookup table once rather than once per criterion.
            attr_map = {key: value for key, value in markup_attrs}
        return all(
            self._matches(attr_map.get(attr), match_against)
            for attr, match_against in self.attrs.items())

    def search_tag(self, markup_name=None, markup_attrs={}):
        """Check a tag (or a bare tag name plus attributes) against this
        strainer.

        :param markup_name: A Tag, or the name of a tag as a string.
        :param markup_attrs: The tag's attributes (mapping or iterable of
            pairs). Ignored when ``markup_name`` is a Tag, whose own
            attributes are used instead. Never mutated.
        :return: ``markup_name`` if it matches, otherwise None.
        """
        found = None
        markup = None
        if isinstance(markup_name, Tag):
            markup = markup_name
            markup_attrs = markup

        # A callable name criterion is handed (name, attrs) directly when
        # we only have raw tag data; with a real Tag it goes through
        # _matches() and receives the Tag itself.
        call_function_with_tag_data = (
            isinstance(self.name, collections.abc.Callable)
            and markup is None)

        # `markup` is either None or the very same object as
        # `markup_name`, so a single name check covers both cases.
        if (not self.name
                or call_function_with_tag_data
                or self._matches(markup_name, self.name)):
            if call_function_with_tag_data:
                match = self.name(markup_name, markup_attrs)
            else:
                match = self._attrs_match(markup_attrs)
            if match:
                found = markup if markup else markup_name

        # A bare tag name has no `.string`; treat that as "no text"
        # instead of failing with AttributeError.
        if (found and self.text
                and not self._matches(getattr(found, 'string', None),
                                      self.text)):
            found = None
        return found
    searchTag = search_tag

    def search(self, markup):
        """Find the part of ``markup`` that satisfies this strainer.

        :param markup: A Tag, a string / NavigableString, or an iterable
            of elements.
        :return: The matching element, or None.
        :raises Exception: if ``markup`` is none of the supported kinds.
        """
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)):
            for element in markup:
                if (isinstance(element, NavigableString)
                        and self.search(element)):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text or self.name or self.attrs:
                found = self.search_tag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, (NavigableString, str)):
            if (not self.name and not self.attrs
                    and self._matches(markup, self.text)):
                found = markup
        else:
            raise Exception(
                "I don't know how to match against a %s" % markup.__class__)
        return found

    def _matches(self, markup, match_against):
        """Decide whether ``markup`` satisfies the criterion
        ``match_against``.

        :param markup: A Tag, a string, None, or a list/tuple of values
            (a multi-valued attribute such as 'class').
        :param match_against: A normalized criterion.
        :return: A truthy value on match, a falsy value otherwise.
        """
        if isinstance(markup, (list, tuple)):
            # This should only happen when searching a multi-valued attribute
            # like 'class'.
            if isinstance(match_against, str) and ' ' in match_against:
                # A bit of a special case. If they try to match "foo
                # bar" on a multivalue attribute's value, only accept
                # the literal value "foo bar"
                #
                # list() so that a tuple-valued attribute can compare
                # equal to the list produced by split().
                return whitespace_re.split(match_against) == list(markup)
            return any(self._matches(item, match_against) for item in markup)

        if match_against is True:
            # True matches any non-None value.
            return markup is not None

        if isinstance(match_against, collections.abc.Callable):
            return match_against(markup)

        # Custom callables take the tag as an argument, but all
        # other ways of matching match the tag name as a string.
        if isinstance(markup, Tag):
            markup = markup.name

        # Ensure that `markup` is either a Unicode string, or None.
        markup = self._normalize_search_value(markup)

        if markup is None:
            # None matches None, False, an empty string, an empty list, and so on.
            return not match_against

        if isinstance(match_against, str):
            # Exact string match
            return markup == match_against

        if hasattr(match_against, 'match'):
            # Regexp match
            return match_against.search(markup)

        if hasattr(match_against, '__iter__'):
            # The markup must be an exact match against something
            # in the iterable.
            return markup in match_against

        # No rule applies to this kind of criterion: not a match.
        return False
1717
1718
class ResultSet(list):
    """A plain list of results that also remembers, in ``source``, the
    SoupStrainer that produced it."""

    def __init__(self, source, result=()):
        """Populate the list from ``result`` and record ``source``.

        :param source: The object that generated these results.
        :param result: An iterable of initial items (empty by default).
        """
        super().__init__(result)
        self.source = source
1725