1# Use of this source code is governed by the MIT license. 2__license__ = "MIT" 3 4try: 5 from collections.abc import Callable # Python 3.6 6except ImportError as e: 7 from collections import Callable 8import re 9import sys 10import warnings 11 12from bs4.css import CSS 13from bs4.formatter import ( 14 Formatter, 15 HTMLFormatter, 16 XMLFormatter, 17) 18 19DEFAULT_OUTPUT_ENCODING = "utf-8" 20 21nonwhitespace_re = re.compile(r"\S+") 22 23# NOTE: This isn't used as of 4.7.0. I'm leaving it for a little bit on 24# the off chance someone imported it for their own use. 25whitespace_re = re.compile(r"\s+") 26 27def _alias(attr): 28 """Alias one attribute name to another for backward compatibility""" 29 @property 30 def alias(self): 31 return getattr(self, attr) 32 33 @alias.setter 34 def alias(self): 35 return setattr(self, attr) 36 return alias 37 38 39# These encodings are recognized by Python (so PageElement.encode 40# could theoretically support them) but XML and HTML don't recognize 41# them (so they should not show up in an XML or HTML document as that 42# document's encoding). 43# 44# If an XML document is encoded in one of these encodings, no encoding 45# will be mentioned in the XML declaration. If an HTML document is 46# encoded in one of these encodings, and the HTML document has a 47# <meta> tag that mentions an encoding, the encoding will be given as 48# the empty string. 49# 50# Source: 51# https://docs.python.org/3/library/codecs.html#python-specific-encodings 52PYTHON_SPECIFIC_ENCODINGS = set([ 53 "idna", 54 "mbcs", 55 "oem", 56 "palmos", 57 "punycode", 58 "raw_unicode_escape", 59 "undefined", 60 "unicode_escape", 61 "raw-unicode-escape", 62 "unicode-escape", 63 "string-escape", 64 "string_escape", 65]) 66 67 68class NamespacedAttribute(str): 69 """A namespaced string (e.g. 'xml:lang') that remembers the namespace 70 ('xml') and the name ('lang') that were used to create it. 71 """ 72 73 def __new__(cls, prefix, name=None, namespace=None): 74 if not name: 75 # This is the default namespace. Its name "has no value" 76 # per https://www.w3.org/TR/xml-names/#defaulting 77 name = None 78 79 if not name: 80 obj = str.__new__(cls, prefix) 81 elif not prefix: 82 # Not really namespaced. 83 obj = str.__new__(cls, name) 84 else: 85 obj = str.__new__(cls, prefix + ":" + name) 86 obj.prefix = prefix 87 obj.name = name 88 obj.namespace = namespace 89 return obj 90 91class AttributeValueWithCharsetSubstitution(str): 92 """A stand-in object for a character encoding specified in HTML.""" 93 94class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): 95 """A generic stand-in for the value of a meta tag's 'charset' attribute. 96 97 When Beautiful Soup parses the markup '<meta charset="utf8">', the 98 value of the 'charset' attribute will be one of these objects. 99 """ 100 101 def __new__(cls, original_value): 102 obj = str.__new__(cls, original_value) 103 obj.original_value = original_value 104 return obj 105 106 def encode(self, encoding): 107 """When an HTML document is being encoded to a given encoding, the 108 value of a meta tag's 'charset' is the name of the encoding. 109 """ 110 if encoding in PYTHON_SPECIFIC_ENCODINGS: 111 return '' 112 return encoding 113 114 115class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): 116 """A generic stand-in for the value of a meta tag's 'content' attribute. 117 118 When Beautiful Soup parses the markup: 119 <meta http-equiv="content-type" content="text/html; charset=utf8"> 120 121 The value of the 'content' attribute will be one of these objects. 122 """ 123 124 CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M) 125 126 def __new__(cls, original_value): 127 match = cls.CHARSET_RE.search(original_value) 128 if match is None: 129 # No substitution necessary. 130 return str.__new__(str, original_value) 131 132 obj = str.__new__(cls, original_value) 133 obj.original_value = original_value 134 return obj 135 136 def encode(self, encoding): 137 if encoding in PYTHON_SPECIFIC_ENCODINGS: 138 return '' 139 def rewrite(match): 140 return match.group(1) + encoding 141 return self.CHARSET_RE.sub(rewrite, self.original_value) 142 143 144class PageElement(object): 145 """Contains the navigational information for some part of the page: 146 that is, its current location in the parse tree. 147 148 NavigableString, Tag, etc. are all subclasses of PageElement. 149 """ 150 151 # In general, we can't tell just by looking at an element whether 152 # it's contained in an XML document or an HTML document. But for 153 # Tags (q.v.) we can store this information at parse time. 154 known_xml = None 155 156 def setup(self, parent=None, previous_element=None, next_element=None, 157 previous_sibling=None, next_sibling=None): 158 """Sets up the initial relations between this element and 159 other elements. 160 161 :param parent: The parent of this element. 162 163 :param previous_element: The element parsed immediately before 164 this one. 165 166 :param next_element: The element parsed immediately before 167 this one. 168 169 :param previous_sibling: The most recently encountered element 170 on the same level of the parse tree as this one. 171 172 :param previous_sibling: The next element to be encountered 173 on the same level of the parse tree as this one. 174 """ 175 self.parent = parent 176 177 self.previous_element = previous_element 178 if previous_element is not None: 179 self.previous_element.next_element = self 180 181 self.next_element = next_element 182 if self.next_element is not None: 183 self.next_element.previous_element = self 184 185 self.next_sibling = next_sibling 186 if self.next_sibling is not None: 187 self.next_sibling.previous_sibling = self 188 189 if (previous_sibling is None 190 and self.parent is not None and self.parent.contents): 191 previous_sibling = self.parent.contents[-1] 192 193 self.previous_sibling = previous_sibling 194 if previous_sibling is not None: 195 self.previous_sibling.next_sibling = self 196 197 def format_string(self, s, formatter): 198 """Format the given string using the given formatter. 199 200 :param s: A string. 201 :param formatter: A Formatter object, or a string naming one of the standard formatters. 202 """ 203 if formatter is None: 204 return s 205 if not isinstance(formatter, Formatter): 206 formatter = self.formatter_for_name(formatter) 207 output = formatter.substitute(s) 208 return output 209 210 def formatter_for_name(self, formatter): 211 """Look up or create a Formatter for the given identifier, 212 if necessary. 213 214 :param formatter: Can be a Formatter object (used as-is), a 215 function (used as the entity substitution hook for an 216 XMLFormatter or HTMLFormatter), or a string (used to look 217 up an XMLFormatter or HTMLFormatter in the appropriate 218 registry. 219 """ 220 if isinstance(formatter, Formatter): 221 return formatter 222 if self._is_xml: 223 c = XMLFormatter 224 else: 225 c = HTMLFormatter 226 if isinstance(formatter, Callable): 227 return c(entity_substitution=formatter) 228 return c.REGISTRY[formatter] 229 230 @property 231 def _is_xml(self): 232 """Is this element part of an XML tree or an HTML tree? 233 234 This is used in formatter_for_name, when deciding whether an 235 XMLFormatter or HTMLFormatter is more appropriate. It can be 236 inefficient, but it should be called very rarely. 237 """ 238 if self.known_xml is not None: 239 # Most of the time we will have determined this when the 240 # document is parsed. 241 return self.known_xml 242 243 # Otherwise, it's likely that this element was created by 244 # direct invocation of the constructor from within the user's 245 # Python code. 246 if self.parent is None: 247 # This is the top-level object. It should have .known_xml set 248 # from tree creation. If not, take a guess--BS is usually 249 # used on HTML markup. 250 return getattr(self, 'is_xml', False) 251 return self.parent._is_xml 252 253 nextSibling = _alias("next_sibling") # BS3 254 previousSibling = _alias("previous_sibling") # BS3 255 256 default = object() 257 def _all_strings(self, strip=False, types=default): 258 """Yield all strings of certain classes, possibly stripping them. 259 260 This is implemented differently in Tag and NavigableString. 261 """ 262 raise NotImplementedError() 263 264 @property 265 def stripped_strings(self): 266 """Yield all strings in this PageElement, stripping them first. 267 268 :yield: A sequence of stripped strings. 269 """ 270 for string in self._all_strings(True): 271 yield string 272 273 def get_text(self, separator="", strip=False, 274 types=default): 275 """Get all child strings of this PageElement, concatenated using the 276 given separator. 277 278 :param separator: Strings will be concatenated using this separator. 279 280 :param strip: If True, strings will be stripped before being 281 concatenated. 282 283 :param types: A tuple of NavigableString subclasses. Any 284 strings of a subclass not found in this list will be 285 ignored. Although there are exceptions, the default 286 behavior in most cases is to consider only NavigableString 287 and CData objects. That means no comments, processing 288 instructions, etc. 289 290 :return: A string. 291 """ 292 return separator.join([s for s in self._all_strings( 293 strip, types=types)]) 294 getText = get_text 295 text = property(get_text) 296 297 def replace_with(self, *args): 298 """Replace this PageElement with one or more PageElements, keeping the 299 rest of the tree the same. 300 301 :param args: One or more PageElements. 302 :return: `self`, no longer part of the tree. 303 """ 304 if self.parent is None: 305 raise ValueError( 306 "Cannot replace one element with another when the " 307 "element to be replaced is not part of a tree.") 308 if len(args) == 1 and args[0] is self: 309 return 310 if any(x is self.parent for x in args): 311 raise ValueError("Cannot replace a Tag with its parent.") 312 old_parent = self.parent 313 my_index = self.parent.index(self) 314 self.extract(_self_index=my_index) 315 for idx, replace_with in enumerate(args, start=my_index): 316 old_parent.insert(idx, replace_with) 317 return self 318 replaceWith = replace_with # BS3 319 320 def unwrap(self): 321 """Replace this PageElement with its contents. 322 323 :return: `self`, no longer part of the tree. 324 """ 325 my_parent = self.parent 326 if self.parent is None: 327 raise ValueError( 328 "Cannot replace an element with its contents when that" 329 "element is not part of a tree.") 330 my_index = self.parent.index(self) 331 self.extract(_self_index=my_index) 332 for child in reversed(self.contents[:]): 333 my_parent.insert(my_index, child) 334 return self 335 replace_with_children = unwrap 336 replaceWithChildren = unwrap # BS3 337 338 def wrap(self, wrap_inside): 339 """Wrap this PageElement inside another one. 340 341 :param wrap_inside: A PageElement. 342 :return: `wrap_inside`, occupying the position in the tree that used 343 to be occupied by `self`, and with `self` inside it. 344 """ 345 me = self.replace_with(wrap_inside) 346 wrap_inside.append(me) 347 return wrap_inside 348 349 def extract(self, _self_index=None): 350 """Destructively rips this element out of the tree. 351 352 :param _self_index: The location of this element in its parent's 353 .contents, if known. Passing this in allows for a performance 354 optimization. 355 356 :return: `self`, no longer part of the tree. 357 """ 358 if self.parent is not None: 359 if _self_index is None: 360 _self_index = self.parent.index(self) 361 del self.parent.contents[_self_index] 362 363 #Find the two elements that would be next to each other if 364 #this element (and any children) hadn't been parsed. Connect 365 #the two. 366 last_child = self._last_descendant() 367 next_element = last_child.next_element 368 369 if (self.previous_element is not None and 370 self.previous_element is not next_element): 371 self.previous_element.next_element = next_element 372 if next_element is not None and next_element is not self.previous_element: 373 next_element.previous_element = self.previous_element 374 self.previous_element = None 375 last_child.next_element = None 376 377 self.parent = None 378 if (self.previous_sibling is not None 379 and self.previous_sibling is not self.next_sibling): 380 self.previous_sibling.next_sibling = self.next_sibling 381 if (self.next_sibling is not None 382 and self.next_sibling is not self.previous_sibling): 383 self.next_sibling.previous_sibling = self.previous_sibling 384 self.previous_sibling = self.next_sibling = None 385 return self 386 387 def _last_descendant(self, is_initialized=True, accept_self=True): 388 """Finds the last element beneath this object to be parsed. 389 390 :param is_initialized: Has `setup` been called on this PageElement 391 yet? 392 :param accept_self: Is `self` an acceptable answer to the question? 393 """ 394 if is_initialized and self.next_sibling is not None: 395 last_child = self.next_sibling.previous_element 396 else: 397 last_child = self 398 while isinstance(last_child, Tag) and last_child.contents: 399 last_child = last_child.contents[-1] 400 if not accept_self and last_child is self: 401 last_child = None 402 return last_child 403 # BS3: Not part of the API! 404 _lastRecursiveChild = _last_descendant 405 406 def insert(self, position, new_child): 407 """Insert a new PageElement in the list of this PageElement's children. 408 409 This works the same way as `list.insert`. 410 411 :param position: The numeric position that should be occupied 412 in `self.children` by the new PageElement. 413 :param new_child: A PageElement. 414 """ 415 if new_child is None: 416 raise ValueError("Cannot insert None into a tag.") 417 if new_child is self: 418 raise ValueError("Cannot insert a tag into itself.") 419 if (isinstance(new_child, str) 420 and not isinstance(new_child, NavigableString)): 421 new_child = NavigableString(new_child) 422 423 from bs4 import BeautifulSoup 424 if isinstance(new_child, BeautifulSoup): 425 # We don't want to end up with a situation where one BeautifulSoup 426 # object contains another. Insert the children one at a time. 427 for subchild in list(new_child.contents): 428 self.insert(position, subchild) 429 position += 1 430 return 431 position = min(position, len(self.contents)) 432 if hasattr(new_child, 'parent') and new_child.parent is not None: 433 # We're 'inserting' an element that's already one 434 # of this object's children. 435 if new_child.parent is self: 436 current_index = self.index(new_child) 437 if current_index < position: 438 # We're moving this element further down the list 439 # of this object's children. That means that when 440 # we extract this element, our target index will 441 # jump down one. 442 position -= 1 443 new_child.extract() 444 445 new_child.parent = self 446 previous_child = None 447 if position == 0: 448 new_child.previous_sibling = None 449 new_child.previous_element = self 450 else: 451 previous_child = self.contents[position - 1] 452 new_child.previous_sibling = previous_child 453 new_child.previous_sibling.next_sibling = new_child 454 new_child.previous_element = previous_child._last_descendant(False) 455 if new_child.previous_element is not None: 456 new_child.previous_element.next_element = new_child 457 458 new_childs_last_element = new_child._last_descendant(False) 459 460 if position >= len(self.contents): 461 new_child.next_sibling = None 462 463 parent = self 464 parents_next_sibling = None 465 while parents_next_sibling is None and parent is not None: 466 parents_next_sibling = parent.next_sibling 467 parent = parent.parent 468 if parents_next_sibling is not None: 469 # We found the element that comes next in the document. 470 break 471 if parents_next_sibling is not None: 472 new_childs_last_element.next_element = parents_next_sibling 473 else: 474 # The last element of this tag is the last element in 475 # the document. 476 new_childs_last_element.next_element = None 477 else: 478 next_child = self.contents[position] 479 new_child.next_sibling = next_child 480 if new_child.next_sibling is not None: 481 new_child.next_sibling.previous_sibling = new_child 482 new_childs_last_element.next_element = next_child 483 484 if new_childs_last_element.next_element is not None: 485 new_childs_last_element.next_element.previous_element = new_childs_last_element 486 self.contents.insert(position, new_child) 487 488 def append(self, tag): 489 """Appends the given PageElement to the contents of this one. 490 491 :param tag: A PageElement. 492 """ 493 self.insert(len(self.contents), tag) 494 495 def extend(self, tags): 496 """Appends the given PageElements to this one's contents. 497 498 :param tags: A list of PageElements. If a single Tag is 499 provided instead, this PageElement's contents will be extended 500 with that Tag's contents. 501 """ 502 if isinstance(tags, Tag): 503 tags = tags.contents 504 if isinstance(tags, list): 505 # Moving items around the tree may change their position in 506 # the original list. Make a list that won't change. 507 tags = list(tags) 508 for tag in tags: 509 self.append(tag) 510 511 def insert_before(self, *args): 512 """Makes the given element(s) the immediate predecessor of this one. 513 514 All the elements will have the same parent, and the given elements 515 will be immediately before this one. 516 517 :param args: One or more PageElements. 518 """ 519 parent = self.parent 520 if parent is None: 521 raise ValueError( 522 "Element has no parent, so 'before' has no meaning.") 523 if any(x is self for x in args): 524 raise ValueError("Can't insert an element before itself.") 525 for predecessor in args: 526 # Extract first so that the index won't be screwed up if they 527 # are siblings. 528 if isinstance(predecessor, PageElement): 529 predecessor.extract() 530 index = parent.index(self) 531 parent.insert(index, predecessor) 532 533 def insert_after(self, *args): 534 """Makes the given element(s) the immediate successor of this one. 535 536 The elements will have the same parent, and the given elements 537 will be immediately after this one. 538 539 :param args: One or more PageElements. 540 """ 541 # Do all error checking before modifying the tree. 542 parent = self.parent 543 if parent is None: 544 raise ValueError( 545 "Element has no parent, so 'after' has no meaning.") 546 if any(x is self for x in args): 547 raise ValueError("Can't insert an element after itself.") 548 549 offset = 0 550 for successor in args: 551 # Extract first so that the index won't be screwed up if they 552 # are siblings. 553 if isinstance(successor, PageElement): 554 successor.extract() 555 index = parent.index(self) 556 parent.insert(index+1+offset, successor) 557 offset += 1 558 559 def find_next(self, name=None, attrs={}, string=None, **kwargs): 560 """Find the first PageElement that matches the given criteria and 561 appears later in the document than this PageElement. 562 563 All find_* methods take a common set of arguments. See the online 564 documentation for detailed explanations. 565 566 :param name: A filter on tag name. 567 :param attrs: A dictionary of filters on attribute values. 568 :param string: A filter for a NavigableString with specific text. 569 :kwargs: A dictionary of filters on attribute values. 570 :return: A PageElement. 571 :rtype: bs4.element.Tag | bs4.element.NavigableString 572 """ 573 return self._find_one(self.find_all_next, name, attrs, string, **kwargs) 574 findNext = find_next # BS3 575 576 def find_all_next(self, name=None, attrs={}, string=None, limit=None, 577 **kwargs): 578 """Find all PageElements that match the given criteria and appear 579 later in the document than this PageElement. 580 581 All find_* methods take a common set of arguments. See the online 582 documentation for detailed explanations. 583 584 :param name: A filter on tag name. 585 :param attrs: A dictionary of filters on attribute values. 586 :param string: A filter for a NavigableString with specific text. 587 :param limit: Stop looking after finding this many results. 588 :kwargs: A dictionary of filters on attribute values. 589 :return: A ResultSet containing PageElements. 590 """ 591 _stacklevel = kwargs.pop('_stacklevel', 2) 592 return self._find_all(name, attrs, string, limit, self.next_elements, 593 _stacklevel=_stacklevel+1, **kwargs) 594 findAllNext = find_all_next # BS3 595 596 def find_next_sibling(self, name=None, attrs={}, string=None, **kwargs): 597 """Find the closest sibling to this PageElement that matches the 598 given criteria and appears later in the document. 599 600 All find_* methods take a common set of arguments. See the 601 online documentation for detailed explanations. 602 603 :param name: A filter on tag name. 604 :param attrs: A dictionary of filters on attribute values. 605 :param string: A filter for a NavigableString with specific text. 606 :kwargs: A dictionary of filters on attribute values. 607 :return: A PageElement. 608 :rtype: bs4.element.Tag | bs4.element.NavigableString 609 """ 610 return self._find_one(self.find_next_siblings, name, attrs, string, 611 **kwargs) 612 findNextSibling = find_next_sibling # BS3 613 614 def find_next_siblings(self, name=None, attrs={}, string=None, limit=None, 615 **kwargs): 616 """Find all siblings of this PageElement that match the given criteria 617 and appear later in the document. 618 619 All find_* methods take a common set of arguments. See the online 620 documentation for detailed explanations. 621 622 :param name: A filter on tag name. 623 :param attrs: A dictionary of filters on attribute values. 624 :param string: A filter for a NavigableString with specific text. 625 :param limit: Stop looking after finding this many results. 626 :kwargs: A dictionary of filters on attribute values. 627 :return: A ResultSet of PageElements. 628 :rtype: bs4.element.ResultSet 629 """ 630 _stacklevel = kwargs.pop('_stacklevel', 2) 631 return self._find_all( 632 name, attrs, string, limit, 633 self.next_siblings, _stacklevel=_stacklevel+1, **kwargs 634 ) 635 findNextSiblings = find_next_siblings # BS3 636 fetchNextSiblings = find_next_siblings # BS2 637 638 def find_previous(self, name=None, attrs={}, string=None, **kwargs): 639 """Look backwards in the document from this PageElement and find the 640 first PageElement that matches the given criteria. 641 642 All find_* methods take a common set of arguments. See the online 643 documentation for detailed explanations. 644 645 :param name: A filter on tag name. 646 :param attrs: A dictionary of filters on attribute values. 647 :param string: A filter for a NavigableString with specific text. 648 :kwargs: A dictionary of filters on attribute values. 649 :return: A PageElement. 650 :rtype: bs4.element.Tag | bs4.element.NavigableString 651 """ 652 return self._find_one( 653 self.find_all_previous, name, attrs, string, **kwargs) 654 findPrevious = find_previous # BS3 655 656 def find_all_previous(self, name=None, attrs={}, string=None, limit=None, 657 **kwargs): 658 """Look backwards in the document from this PageElement and find all 659 PageElements that match the given criteria. 660 661 All find_* methods take a common set of arguments. See the online 662 documentation for detailed explanations. 663 664 :param name: A filter on tag name. 665 :param attrs: A dictionary of filters on attribute values. 666 :param string: A filter for a NavigableString with specific text. 667 :param limit: Stop looking after finding this many results. 668 :kwargs: A dictionary of filters on attribute values. 669 :return: A ResultSet of PageElements. 670 :rtype: bs4.element.ResultSet 671 """ 672 _stacklevel = kwargs.pop('_stacklevel', 2) 673 return self._find_all( 674 name, attrs, string, limit, self.previous_elements, 675 _stacklevel=_stacklevel+1, **kwargs 676 ) 677 findAllPrevious = find_all_previous # BS3 678 fetchPrevious = find_all_previous # BS2 679 680 def find_previous_sibling(self, name=None, attrs={}, string=None, **kwargs): 681 """Returns the closest sibling to this PageElement that matches the 682 given criteria and appears earlier in the document. 683 684 All find_* methods take a common set of arguments. See the online 685 documentation for detailed explanations. 686 687 :param name: A filter on tag name. 688 :param attrs: A dictionary of filters on attribute values. 689 :param string: A filter for a NavigableString with specific text. 690 :kwargs: A dictionary of filters on attribute values. 691 :return: A PageElement. 692 :rtype: bs4.element.Tag | bs4.element.NavigableString 693 """ 694 return self._find_one(self.find_previous_siblings, name, attrs, string, 695 **kwargs) 696 findPreviousSibling = find_previous_sibling # BS3 697 698 def find_previous_siblings(self, name=None, attrs={}, string=None, 699 limit=None, **kwargs): 700 """Returns all siblings to this PageElement that match the 701 given criteria and appear earlier in the document. 702 703 All find_* methods take a common set of arguments. See the online 704 documentation for detailed explanations. 705 706 :param name: A filter on tag name. 707 :param attrs: A dictionary of filters on attribute values. 708 :param string: A filter for a NavigableString with specific text. 709 :param limit: Stop looking after finding this many results. 710 :kwargs: A dictionary of filters on attribute values. 711 :return: A ResultSet of PageElements. 712 :rtype: bs4.element.ResultSet 713 """ 714 _stacklevel = kwargs.pop('_stacklevel', 2) 715 return self._find_all( 716 name, attrs, string, limit, 717 self.previous_siblings, _stacklevel=_stacklevel+1, **kwargs 718 ) 719 findPreviousSiblings = find_previous_siblings # BS3 720 fetchPreviousSiblings = find_previous_siblings # BS2 721 722 def find_parent(self, name=None, attrs={}, **kwargs): 723 """Find the closest parent of this PageElement that matches the given 724 criteria. 725 726 All find_* methods take a common set of arguments. See the online 727 documentation for detailed explanations. 728 729 :param name: A filter on tag name. 730 :param attrs: A dictionary of filters on attribute values. 731 :kwargs: A dictionary of filters on attribute values. 732 733 :return: A PageElement. 734 :rtype: bs4.element.Tag | bs4.element.NavigableString 735 """ 736 # NOTE: We can't use _find_one because findParents takes a different 737 # set of arguments. 738 r = None 739 l = self.find_parents(name, attrs, 1, _stacklevel=3, **kwargs) 740 if l: 741 r = l[0] 742 return r 743 findParent = find_parent # BS3 744 745 def find_parents(self, name=None, attrs={}, limit=None, **kwargs): 746 """Find all parents of this PageElement that match the given criteria. 747 748 All find_* methods take a common set of arguments. See the online 749 documentation for detailed explanations. 750 751 :param name: A filter on tag name. 752 :param attrs: A dictionary of filters on attribute values. 753 :param limit: Stop looking after finding this many results. 754 :kwargs: A dictionary of filters on attribute values. 755 756 :return: A PageElement. 757 :rtype: bs4.element.Tag | bs4.element.NavigableString 758 """ 759 _stacklevel = kwargs.pop('_stacklevel', 2) 760 return self._find_all(name, attrs, None, limit, self.parents, 761 _stacklevel=_stacklevel+1, **kwargs) 762 findParents = find_parents # BS3 763 fetchParents = find_parents # BS2 764 765 @property 766 def next(self): 767 """The PageElement, if any, that was parsed just after this one. 768 769 :return: A PageElement. 770 :rtype: bs4.element.Tag | bs4.element.NavigableString 771 """ 772 return self.next_element 773 774 @property 775 def previous(self): 776 """The PageElement, if any, that was parsed just before this one. 777 778 :return: A PageElement. 779 :rtype: bs4.element.Tag | bs4.element.NavigableString 780 """ 781 return self.previous_element 782 783 #These methods do the real heavy lifting. 784 785 def _find_one(self, method, name, attrs, string, **kwargs): 786 r = None 787 l = method(name, attrs, string, 1, _stacklevel=4, **kwargs) 788 if l: 789 r = l[0] 790 return r 791 792 def _find_all(self, name, attrs, string, limit, generator, **kwargs): 793 "Iterates over a generator looking for things that match." 794 _stacklevel = kwargs.pop('_stacklevel', 3) 795 796 if string is None and 'text' in kwargs: 797 string = kwargs.pop('text') 798 warnings.warn( 799 "The 'text' argument to find()-type methods is deprecated. Use 'string' instead.", 800 DeprecationWarning, stacklevel=_stacklevel 801 ) 802 803 if isinstance(name, SoupStrainer): 804 strainer = name 805 else: 806 strainer = SoupStrainer(name, attrs, string, **kwargs) 807 808 if string is None and not limit and not attrs and not kwargs: 809 if name is True or name is None: 810 # Optimization to find all tags. 811 result = (element for element in generator 812 if isinstance(element, Tag)) 813 return ResultSet(strainer, result) 814 elif isinstance(name, str): 815 # Optimization to find all tags with a given name. 816 if name.count(':') == 1: 817 # This is a name with a prefix. If this is a namespace-aware document, 818 # we need to match the local name against tag.name. If not, 819 # we need to match the fully-qualified name against tag.name. 820 prefix, local_name = name.split(':', 1) 821 else: 822 prefix = None 823 local_name = name 824 result = (element for element in generator 825 if isinstance(element, Tag) 826 and ( 827 element.name == name 828 ) or ( 829 element.name == local_name 830 and (prefix is None or element.prefix == prefix) 831 ) 832 ) 833 return ResultSet(strainer, result) 834 results = ResultSet(strainer) 835 while True: 836 try: 837 i = next(generator) 838 except StopIteration: 839 break 840 if i: 841 found = strainer.search(i) 842 if found: 843 results.append(found) 844 if limit and len(results) >= limit: 845 break 846 return results 847 848 #These generators can be used to navigate starting from both 849 #NavigableStrings and Tags. 850 @property 851 def next_elements(self): 852 """All PageElements that were parsed after this one. 853 854 :yield: A sequence of PageElements. 855 """ 856 i = self.next_element 857 while i is not None: 858 yield i 859 i = i.next_element 860 861 @property 862 def next_siblings(self): 863 """All PageElements that are siblings of this one but were parsed 864 later. 865 866 :yield: A sequence of PageElements. 867 """ 868 i = self.next_sibling 869 while i is not None: 870 yield i 871 i = i.next_sibling 872 873 @property 874 def previous_elements(self): 875 """All PageElements that were parsed before this one. 876 877 :yield: A sequence of PageElements. 878 """ 879 i = self.previous_element 880 while i is not None: 881 yield i 882 i = i.previous_element 883 884 @property 885 def previous_siblings(self): 886 """All PageElements that are siblings of this one but were parsed 887 earlier. 888 889 :yield: A sequence of PageElements. 890 """ 891 i = self.previous_sibling 892 while i is not None: 893 yield i 894 i = i.previous_sibling 895 896 @property 897 def parents(self): 898 """All PageElements that are parents of this PageElement. 899 900 :yield: A sequence of PageElements. 901 """ 902 i = self.parent 903 while i is not None: 904 yield i 905 i = i.parent 906 907 @property 908 def decomposed(self): 909 """Check whether a PageElement has been decomposed. 910 911 :rtype: bool 912 """ 913 return getattr(self, '_decomposed', False) or False 914 915 # Old non-property versions of the generators, for backwards 916 # compatibility with BS3. 917 def nextGenerator(self): 918 return self.next_elements 919 920 def nextSiblingGenerator(self): 921 return self.next_siblings 922 923 def previousGenerator(self): 924 return self.previous_elements 925 926 def previousSiblingGenerator(self): 927 return self.previous_siblings 928 929 def parentGenerator(self): 930 return self.parents 931 932 933class NavigableString(str, PageElement): 934 """A Python Unicode string that is part of a parse tree. 935 936 When Beautiful Soup parses the markup <b>penguin</b>, it will 937 create a NavigableString for the string "penguin". 938 """ 939 940 PREFIX = '' 941 SUFFIX = '' 942 943 def __new__(cls, value): 944 """Create a new NavigableString. 945 946 When unpickling a NavigableString, this method is called with 947 the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be 948 passed in to the superclass's __new__ or the superclass won't know 949 how to handle non-ASCII characters. 950 """ 951 if isinstance(value, str): 952 u = str.__new__(cls, value) 953 else: 954 u = str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) 955 u.setup() 956 return u 957 958 def __deepcopy__(self, memo, recursive=False): 959 """A copy of a NavigableString has the same contents and class 960 as the original, but it is not connected to the parse tree. 961 962 :param recursive: This parameter is ignored; it's only defined 963 so that NavigableString.__deepcopy__ implements the same 964 signature as Tag.__deepcopy__. 965 """ 966 return type(self)(self) 967 968 def __copy__(self): 969 """A copy of a NavigableString can only be a deep copy, because 970 only one PageElement can occupy a given place in a parse tree. 971 """ 972 return self.__deepcopy__({}) 973 974 def __getnewargs__(self): 975 return (str(self),) 976 977 def __getattr__(self, attr): 978 """text.string gives you text. This is for backwards 979 compatibility for Navigable*String, but for CData* it lets you 980 get the string without the CData wrapper.""" 981 if attr == 'string': 982 return self 983 else: 984 raise AttributeError( 985 "'%s' object has no attribute '%s'" % ( 986 self.__class__.__name__, attr)) 987 988 def output_ready(self, formatter="minimal"): 989 """Run the string through the provided formatter. 990 991 :param formatter: A Formatter object, or a string naming one of the standard formatters. 992 """ 993 output = self.format_string(self, formatter) 994 return self.PREFIX + output + self.SUFFIX 995 996 @property 997 def name(self): 998 """Since a NavigableString is not a Tag, it has no .name. 999 1000 This property is implemented so that code like this doesn't crash 1001 when run on a mixture of Tag and NavigableString objects: 1002 [x.name for x in tag.children] 1003 """ 1004 return None 1005 1006 @name.setter 1007 def name(self, name): 1008 """Prevent NavigableString.name from ever being set.""" 1009 raise AttributeError("A NavigableString cannot be given a name.") 1010 1011 def _all_strings(self, strip=False, types=PageElement.default): 1012 """Yield all strings of certain classes, possibly stripping them. 1013 1014 This makes it easy for NavigableString to implement methods 1015 like get_text() as conveniences, creating a consistent 1016 text-extraction API across all PageElements. 1017 1018 :param strip: If True, all strings will be stripped before being 1019 yielded. 1020 1021 :param types: A tuple of NavigableString subclasses. If this 1022 NavigableString isn't one of those subclasses, the 1023 sequence will be empty. By default, the subclasses 1024 considered are NavigableString and CData objects. That 1025 means no comments, processing instructions, etc. 1026 1027 :yield: A sequence that either contains this string, or is empty. 1028 1029 """ 1030 if types is self.default: 1031 # This is kept in Tag because it's full of subclasses of 1032 # this class, which aren't defined until later in the file. 1033 types = Tag.DEFAULT_INTERESTING_STRING_TYPES 1034 1035 # Do nothing if the caller is looking for specific types of 1036 # string, and we're of a different type. 1037 # 1038 # We check specific types instead of using isinstance(self, 1039 # types) because all of these classes subclass 1040 # NavigableString. Anyone who's using this feature probably 1041 # wants generic NavigableStrings but not other stuff. 1042 my_type = type(self) 1043 if types is not None: 1044 if isinstance(types, type): 1045 # Looking for a single type. 1046 if my_type is not types: 1047 return 1048 elif my_type not in types: 1049 # Looking for one of a list of types. 1050 return 1051 1052 value = self 1053 if strip: 1054 value = value.strip() 1055 if len(value) > 0: 1056 yield value 1057 strings = property(_all_strings) 1058 1059class PreformattedString(NavigableString): 1060 """A NavigableString not subject to the normal formatting rules. 1061 1062 This is an abstract class used for special kinds of strings such 1063 as comments (the Comment class) and CDATA blocks (the CData 1064 class). 1065 """ 1066 1067 PREFIX = '' 1068 SUFFIX = '' 1069 1070 def output_ready(self, formatter=None): 1071 """Make this string ready for output by adding any subclass-specific 1072 prefix or suffix. 1073 1074 :param formatter: A Formatter object, or a string naming one 1075 of the standard formatters. The string will be passed into the 1076 Formatter, but only to trigger any side effects: the return 1077 value is ignored. 1078 1079 :return: The string, with any subclass-specific prefix and 1080 suffix added on. 1081 """ 1082 if formatter is not None: 1083 ignore = self.format_string(self, formatter) 1084 return self.PREFIX + self + self.SUFFIX 1085 1086class CData(PreformattedString): 1087 """A CDATA block.""" 1088 PREFIX = '<![CDATA[' 1089 SUFFIX = ']]>' 1090 1091class ProcessingInstruction(PreformattedString): 1092 """A SGML processing instruction.""" 1093 1094 PREFIX = '<?' 1095 SUFFIX = '>' 1096 1097class XMLProcessingInstruction(ProcessingInstruction): 1098 """An XML processing instruction.""" 1099 PREFIX = '<?' 1100 SUFFIX = '?>' 1101 1102class Comment(PreformattedString): 1103 """An HTML or XML comment.""" 1104 PREFIX = '<!--' 1105 SUFFIX = '-->' 1106 1107 1108class Declaration(PreformattedString): 1109 """An XML declaration.""" 1110 PREFIX = '<?' 1111 SUFFIX = '?>' 1112 1113 1114class Doctype(PreformattedString): 1115 """A document type declaration.""" 1116 @classmethod 1117 def for_name_and_ids(cls, name, pub_id, system_id): 1118 """Generate an appropriate document type declaration for a given 1119 public ID and system ID. 1120 1121 :param name: The name of the document's root element, e.g. 'html'. 1122 :param pub_id: The Formal Public Identifier for this document type, 1123 e.g. '-//W3C//DTD XHTML 1.1//EN' 1124 :param system_id: The system identifier for this document type, 1125 e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd' 1126 1127 :return: A Doctype. 1128 """ 1129 value = name or '' 1130 if pub_id is not None: 1131 value += ' PUBLIC "%s"' % pub_id 1132 if system_id is not None: 1133 value += ' "%s"' % system_id 1134 elif system_id is not None: 1135 value += ' SYSTEM "%s"' % system_id 1136 1137 return Doctype(value) 1138 1139 PREFIX = '<!DOCTYPE ' 1140 SUFFIX = '>\n' 1141 1142 1143class Stylesheet(NavigableString): 1144 """A NavigableString representing an stylesheet (probably 1145 CSS). 1146 1147 Used to distinguish embedded stylesheets from textual content. 1148 """ 1149 pass 1150 1151 1152class Script(NavigableString): 1153 """A NavigableString representing an executable script (probably 1154 Javascript). 1155 1156 Used to distinguish executable code from textual content. 1157 """ 1158 pass 1159 1160 1161class TemplateString(NavigableString): 1162 """A NavigableString representing a string found inside an HTML 1163 template embedded in a larger document. 1164 1165 Used to distinguish such strings from the main body of the document. 1166 """ 1167 pass 1168 1169 1170class RubyTextString(NavigableString): 1171 """A NavigableString representing the contents of the <rt> HTML 1172 element. 1173 1174 https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rt-element 1175 1176 Can be used to distinguish such strings from the strings they're 1177 annotating. 1178 """ 1179 pass 1180 1181 1182class RubyParenthesisString(NavigableString): 1183 """A NavigableString representing the contents of the <rp> HTML 1184 element. 1185 1186 https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rp-element 1187 """ 1188 pass 1189 1190 1191class Tag(PageElement): 1192 """Represents an HTML or XML tag that is part of a parse tree, along 1193 with its attributes and contents. 1194 1195 When Beautiful Soup parses the markup <b>penguin</b>, it will 1196 create a Tag object representing the <b> tag. 1197 """ 1198 1199 def __init__(self, parser=None, builder=None, name=None, namespace=None, 1200 prefix=None, attrs=None, parent=None, previous=None, 1201 is_xml=None, sourceline=None, sourcepos=None, 1202 can_be_empty_element=None, cdata_list_attributes=None, 1203 preserve_whitespace_tags=None, 1204 interesting_string_types=None, 1205 namespaces=None 1206 ): 1207 """Basic constructor. 1208 1209 :param parser: A BeautifulSoup object. 1210 :param builder: A TreeBuilder. 1211 :param name: The name of the tag. 1212 :param namespace: The URI of this Tag's XML namespace, if any. 1213 :param prefix: The prefix for this Tag's XML namespace, if any. 1214 :param attrs: A dictionary of this Tag's attribute values. 1215 :param parent: The PageElement to use as this Tag's parent. 1216 :param previous: The PageElement that was parsed immediately before 1217 this tag. 1218 :param is_xml: If True, this is an XML tag. Otherwise, this is an 1219 HTML tag. 1220 :param sourceline: The line number where this tag was found in its 1221 source document. 1222 :param sourcepos: The character position within `sourceline` where this 1223 tag was found. 1224 :param can_be_empty_element: If True, this tag should be 1225 represented as <tag/>. If False, this tag should be represented 1226 as <tag></tag>. 1227 :param cdata_list_attributes: A list of attributes whose values should 1228 be treated as CDATA if they ever show up on this tag. 1229 :param preserve_whitespace_tags: A list of tag names whose contents 1230 should have their whitespace preserved. 1231 :param interesting_string_types: This is a NavigableString 1232 subclass or a tuple of them. When iterating over this 1233 Tag's strings in methods like Tag.strings or Tag.get_text, 1234 these are the types of strings that are interesting enough 1235 to be considered. The default is to consider 1236 NavigableString and CData the only interesting string 1237 subtypes. 1238 :param namespaces: A dictionary mapping currently active 1239 namespace prefixes to URIs. This can be used later to 1240 construct CSS selectors. 1241 """ 1242 if parser is None: 1243 self.parser_class = None 1244 else: 1245 # We don't actually store the parser object: that lets extracted 1246 # chunks be garbage-collected. 1247 self.parser_class = parser.__class__ 1248 if name is None: 1249 raise ValueError("No value provided for new tag's name.") 1250 self.name = name 1251 self.namespace = namespace 1252 self._namespaces = namespaces or {} 1253 self.prefix = prefix 1254 if ((not builder or builder.store_line_numbers) 1255 and (sourceline is not None or sourcepos is not None)): 1256 self.sourceline = sourceline 1257 self.sourcepos = sourcepos 1258 if attrs is None: 1259 attrs = {} 1260 elif attrs: 1261 if builder is not None and builder.cdata_list_attributes: 1262 attrs = builder._replace_cdata_list_attribute_values( 1263 self.name, attrs) 1264 else: 1265 attrs = dict(attrs) 1266 else: 1267 attrs = dict(attrs) 1268 1269 # If possible, determine ahead of time whether this tag is an 1270 # XML tag. 1271 if builder: 1272 self.known_xml = builder.is_xml 1273 else: 1274 self.known_xml = is_xml 1275 self.attrs = attrs 1276 self.contents = [] 1277 self.setup(parent, previous) 1278 self.hidden = False 1279 1280 if builder is None: 1281 # In the absence of a TreeBuilder, use whatever values were 1282 # passed in here. They're probably None, unless this is a copy of some 1283 # other tag. 1284 self.can_be_empty_element = can_be_empty_element 1285 self.cdata_list_attributes = cdata_list_attributes 1286 self.preserve_whitespace_tags = preserve_whitespace_tags 1287 self.interesting_string_types = interesting_string_types 1288 else: 1289 # Set up any substitutions for this tag, such as the charset in a META tag. 1290 builder.set_up_substitutions(self) 1291 1292 # Ask the TreeBuilder whether this tag might be an empty-element tag. 1293 self.can_be_empty_element = builder.can_be_empty_element(name) 1294 1295 # Keep track of the list of attributes of this tag that 1296 # might need to be treated as a list. 1297 # 1298 # For performance reasons, we store the whole data structure 1299 # rather than asking the question of every tag. Asking would 1300 # require building a new data structure every time, and 1301 # (unlike can_be_empty_element), we almost never need 1302 # to check this. 1303 self.cdata_list_attributes = builder.cdata_list_attributes 1304 1305 # Keep track of the names that might cause this tag to be treated as a 1306 # whitespace-preserved tag. 1307 self.preserve_whitespace_tags = builder.preserve_whitespace_tags 1308 1309 if self.name in builder.string_containers: 1310 # This sort of tag uses a special string container 1311 # subclass for most of its strings. When we ask the 1312 self.interesting_string_types = builder.string_containers[self.name] 1313 else: 1314 self.interesting_string_types = self.DEFAULT_INTERESTING_STRING_TYPES 1315 1316 parserClass = _alias("parser_class") # BS3 1317 1318 def __deepcopy__(self, memo, recursive=True): 1319 """A deepcopy of a Tag is a new Tag, unconnected to the parse tree. 1320 Its contents are a copy of the old Tag's contents. 1321 """ 1322 clone = self._clone() 1323 1324 if recursive: 1325 # Clone this tag's descendants recursively, but without 1326 # making any recursive function calls. 1327 tag_stack = [clone] 1328 for event, element in self._event_stream(self.descendants): 1329 if event is Tag.END_ELEMENT_EVENT: 1330 # Stop appending incoming Tags to the Tag that was 1331 # just closed. 1332 tag_stack.pop() 1333 else: 1334 descendant_clone = element.__deepcopy__( 1335 memo, recursive=False 1336 ) 1337 # Add to its parent's .contents 1338 tag_stack[-1].append(descendant_clone) 1339 1340 if event is Tag.START_ELEMENT_EVENT: 1341 # Add the Tag itself to the stack so that its 1342 # children will be .appended to it. 1343 tag_stack.append(descendant_clone) 1344 return clone 1345 1346 def __copy__(self): 1347 """A copy of a Tag must always be a deep copy, because a Tag's 1348 children can only have one parent at a time. 1349 """ 1350 return self.__deepcopy__({}) 1351 1352 def _clone(self): 1353 """Create a new Tag just like this one, but with no 1354 contents and unattached to any parse tree. 1355 1356 This is the first step in the deepcopy process. 1357 """ 1358 clone = type(self)( 1359 None, None, self.name, self.namespace, 1360 self.prefix, self.attrs, is_xml=self._is_xml, 1361 sourceline=self.sourceline, sourcepos=self.sourcepos, 1362 can_be_empty_element=self.can_be_empty_element, 1363 cdata_list_attributes=self.cdata_list_attributes, 1364 preserve_whitespace_tags=self.preserve_whitespace_tags, 1365 interesting_string_types=self.interesting_string_types 1366 ) 1367 for attr in ('can_be_empty_element', 'hidden'): 1368 setattr(clone, attr, getattr(self, attr)) 1369 return clone 1370 1371 @property 1372 def is_empty_element(self): 1373 """Is this tag an empty-element tag? (aka a self-closing tag) 1374 1375 A tag that has contents is never an empty-element tag. 1376 1377 A tag that has no contents may or may not be an empty-element 1378 tag. It depends on the builder used to create the tag. If the 1379 builder has a designated list of empty-element tags, then only 1380 a tag whose name shows up in that list is considered an 1381 empty-element tag. 1382 1383 If the builder has no designated list of empty-element tags, 1384 then any tag with no contents is an empty-element tag. 1385 """ 1386 return len(self.contents) == 0 and self.can_be_empty_element 1387 isSelfClosing = is_empty_element # BS3 1388 1389 @property 1390 def string(self): 1391 """Convenience property to get the single string within this 1392 PageElement. 1393 1394 TODO It might make sense to have NavigableString.string return 1395 itself. 1396 1397 :return: If this element has a single string child, return 1398 value is that string. If this element has one child tag, 1399 return value is the 'string' attribute of the child tag, 1400 recursively. If this element is itself a string, has no 1401 children, or has more than one child, return value is None. 1402 """ 1403 if len(self.contents) != 1: 1404 return None 1405 child = self.contents[0] 1406 if isinstance(child, NavigableString): 1407 return child 1408 return child.string 1409 1410 @string.setter 1411 def string(self, string): 1412 """Replace this PageElement's contents with `string`.""" 1413 self.clear() 1414 self.append(string.__class__(string)) 1415 1416 DEFAULT_INTERESTING_STRING_TYPES = (NavigableString, CData) 1417 def _all_strings(self, strip=False, types=PageElement.default): 1418 """Yield all strings of certain classes, possibly stripping them. 1419 1420 :param strip: If True, all strings will be stripped before being 1421 yielded. 1422 1423 :param types: A tuple of NavigableString subclasses. Any strings of 1424 a subclass not found in this list will be ignored. By 1425 default, the subclasses considered are the ones found in 1426 self.interesting_string_types. If that's not specified, 1427 only NavigableString and CData objects will be 1428 considered. That means no comments, processing 1429 instructions, etc. 1430 1431 :yield: A sequence of strings. 1432 1433 """ 1434 if types is self.default: 1435 types = self.interesting_string_types 1436 1437 for descendant in self.descendants: 1438 if (types is None and not isinstance(descendant, NavigableString)): 1439 continue 1440 descendant_type = type(descendant) 1441 if isinstance(types, type): 1442 if descendant_type is not types: 1443 # We're not interested in strings of this type. 1444 continue 1445 elif types is not None and descendant_type not in types: 1446 # We're not interested in strings of this type. 1447 continue 1448 if strip: 1449 descendant = descendant.strip() 1450 if len(descendant) == 0: 1451 continue 1452 yield descendant 1453 strings = property(_all_strings) 1454 1455 def decompose(self): 1456 """Recursively destroys this PageElement and its children. 1457 1458 This element will be removed from the tree and wiped out; so 1459 will everything beneath it. 1460 1461 The behavior of a decomposed PageElement is undefined and you 1462 should never use one for anything, but if you need to _check_ 1463 whether an element has been decomposed, you can use the 1464 `decomposed` property. 1465 """ 1466 self.extract() 1467 i = self 1468 while i is not None: 1469 n = i.next_element 1470 i.__dict__.clear() 1471 i.contents = [] 1472 i._decomposed = True 1473 i = n 1474 1475 def clear(self, decompose=False): 1476 """Wipe out all children of this PageElement by calling extract() 1477 on them. 1478 1479 :param decompose: If this is True, decompose() (a more 1480 destructive method) will be called instead of extract(). 1481 """ 1482 if decompose: 1483 for element in self.contents[:]: 1484 if isinstance(element, Tag): 1485 element.decompose() 1486 else: 1487 element.extract() 1488 else: 1489 for element in self.contents[:]: 1490 element.extract() 1491 1492 def smooth(self): 1493 """Smooth out this element's children by consolidating consecutive 1494 strings. 1495 1496 This makes pretty-printed output look more natural following a 1497 lot of operations that modified the tree. 1498 """ 1499 # Mark the first position of every pair of children that need 1500 # to be consolidated. Do this rather than making a copy of 1501 # self.contents, since in most cases very few strings will be 1502 # affected. 1503 marked = [] 1504 for i, a in enumerate(self.contents): 1505 if isinstance(a, Tag): 1506 # Recursively smooth children. 1507 a.smooth() 1508 if i == len(self.contents)-1: 1509 # This is the last item in .contents, and it's not a 1510 # tag. There's no chance it needs any work. 1511 continue 1512 b = self.contents[i+1] 1513 if (isinstance(a, NavigableString) 1514 and isinstance(b, NavigableString) 1515 and not isinstance(a, PreformattedString) 1516 and not isinstance(b, PreformattedString) 1517 ): 1518 marked.append(i) 1519 1520 # Go over the marked positions in reverse order, so that 1521 # removing items from .contents won't affect the remaining 1522 # positions. 1523 for i in reversed(marked): 1524 a = self.contents[i] 1525 b = self.contents[i+1] 1526 b.extract() 1527 n = NavigableString(a+b) 1528 a.replace_with(n) 1529 1530 def index(self, element): 1531 """Find the index of a child by identity, not value. 1532 1533 Avoids issues with tag.contents.index(element) getting the 1534 index of equal elements. 1535 1536 :param element: Look for this PageElement in `self.contents`. 1537 """ 1538 for i, child in enumerate(self.contents): 1539 if child is element: 1540 return i 1541 raise ValueError("Tag.index: element not in tag") 1542 1543 def get(self, key, default=None): 1544 """Returns the value of the 'key' attribute for the tag, or 1545 the value given for 'default' if it doesn't have that 1546 attribute.""" 1547 return self.attrs.get(key, default) 1548 1549 def get_attribute_list(self, key, default=None): 1550 """The same as get(), but always returns a list. 1551 1552 :param key: The attribute to look for. 1553 :param default: Use this value if the attribute is not present 1554 on this PageElement. 1555 :return: A list of values, probably containing only a single 1556 value. 1557 """ 1558 value = self.get(key, default) 1559 if not isinstance(value, list): 1560 value = [value] 1561 return value 1562 1563 def has_attr(self, key): 1564 """Does this PageElement have an attribute with the given name?""" 1565 return key in self.attrs 1566 1567 def __hash__(self): 1568 return str(self).__hash__() 1569 1570 def __getitem__(self, key): 1571 """tag[key] returns the value of the 'key' attribute for the Tag, 1572 and throws an exception if it's not there.""" 1573 return self.attrs[key] 1574 1575 def __iter__(self): 1576 "Iterating over a Tag iterates over its contents." 1577 return iter(self.contents) 1578 1579 def __len__(self): 1580 "The length of a Tag is the length of its list of contents." 1581 return len(self.contents) 1582 1583 def __contains__(self, x): 1584 return x in self.contents 1585 1586 def __bool__(self): 1587 "A tag is non-None even if it has no contents." 1588 return True 1589 1590 def __setitem__(self, key, value): 1591 """Setting tag[key] sets the value of the 'key' attribute for the 1592 tag.""" 1593 self.attrs[key] = value 1594 1595 def __delitem__(self, key): 1596 "Deleting tag[key] deletes all 'key' attributes for the tag." 1597 self.attrs.pop(key, None) 1598 1599 def __call__(self, *args, **kwargs): 1600 """Calling a Tag like a function is the same as calling its 1601 find_all() method. Eg. tag('a') returns a list of all the A tags 1602 found within this tag.""" 1603 return self.find_all(*args, **kwargs) 1604 1605 def __getattr__(self, tag): 1606 """Calling tag.subtag is the same as calling tag.find(name="subtag")""" 1607 #print("Getattr %s.%s" % (self.__class__, tag)) 1608 if len(tag) > 3 and tag.endswith('Tag'): 1609 # BS3: soup.aTag -> "soup.find("a") 1610 tag_name = tag[:-3] 1611 warnings.warn( 1612 '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict( 1613 name=tag_name 1614 ), 1615 DeprecationWarning, stacklevel=2 1616 ) 1617 return self.find(tag_name) 1618 # We special case contents to avoid recursion. 1619 elif not tag.startswith("__") and not tag == "contents": 1620 return self.find(tag) 1621 raise AttributeError( 1622 "'%s' object has no attribute '%s'" % (self.__class__, tag)) 1623 1624 def __eq__(self, other): 1625 """Returns true iff this Tag has the same name, the same attributes, 1626 and the same contents (recursively) as `other`.""" 1627 if self is other: 1628 return True 1629 if (not hasattr(other, 'name') or 1630 not hasattr(other, 'attrs') or 1631 not hasattr(other, 'contents') or 1632 self.name != other.name or 1633 self.attrs != other.attrs or 1634 len(self) != len(other)): 1635 return False 1636 for i, my_child in enumerate(self.contents): 1637 if my_child != other.contents[i]: 1638 return False 1639 return True 1640 1641 def __ne__(self, other): 1642 """Returns true iff this Tag is not identical to `other`, 1643 as defined in __eq__.""" 1644 return not self == other 1645 1646 def __repr__(self, encoding="unicode-escape"): 1647 """Renders this PageElement as a string. 1648 1649 :param encoding: The encoding to use (Python 2 only). 1650 TODO: This is now ignored and a warning should be issued 1651 if a value is provided. 1652 :return: A (Unicode) string. 1653 """ 1654 # "The return value must be a string object", i.e. Unicode 1655 return self.decode() 1656 1657 def __unicode__(self): 1658 """Renders this PageElement as a Unicode string.""" 1659 return self.decode() 1660 1661 __str__ = __repr__ = __unicode__ 1662 1663 def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, 1664 indent_level=None, formatter="minimal", 1665 errors="xmlcharrefreplace"): 1666 """Render a bytestring representation of this PageElement and its 1667 contents. 1668 1669 :param encoding: The destination encoding. 1670 :param indent_level: Each line of the rendering will be 1671 indented this many levels. (The formatter decides what a 1672 'level' means in terms of spaces or other characters 1673 output.) Used internally in recursive calls while 1674 pretty-printing. 1675 :param formatter: A Formatter object, or a string naming one of 1676 the standard formatters. 1677 :param errors: An error handling strategy such as 1678 'xmlcharrefreplace'. This value is passed along into 1679 encode() and its value should be one of the constants 1680 defined by Python. 1681 :return: A bytestring. 1682 1683 """ 1684 # Turn the data structure into Unicode, then encode the 1685 # Unicode. 1686 u = self.decode(indent_level, encoding, formatter) 1687 return u.encode(encoding, errors) 1688 1689 def decode(self, indent_level=None, 1690 eventual_encoding=DEFAULT_OUTPUT_ENCODING, 1691 formatter="minimal", 1692 iterator=None): 1693 pieces = [] 1694 # First off, turn a non-Formatter `formatter` into a Formatter 1695 # object. This will stop the lookup from happening over and 1696 # over again. 1697 if not isinstance(formatter, Formatter): 1698 formatter = self.formatter_for_name(formatter) 1699 1700 if indent_level is True: 1701 indent_level = 0 1702 1703 # The currently active tag that put us into string literal 1704 # mode. Until this element is closed, children will be treated 1705 # as string literals and not pretty-printed. String literal 1706 # mode is turned on immediately after this tag begins, and 1707 # turned off immediately before it's closed. This means there 1708 # will be whitespace before and after the tag itself. 1709 string_literal_tag = None 1710 1711 for event, element in self._event_stream(iterator): 1712 if event in (Tag.START_ELEMENT_EVENT, Tag.EMPTY_ELEMENT_EVENT): 1713 piece = element._format_tag( 1714 eventual_encoding, formatter, opening=True 1715 ) 1716 elif event is Tag.END_ELEMENT_EVENT: 1717 piece = element._format_tag( 1718 eventual_encoding, formatter, opening=False 1719 ) 1720 if indent_level is not None: 1721 indent_level -= 1 1722 else: 1723 piece = element.output_ready(formatter) 1724 1725 # Now we need to apply the 'prettiness' -- extra 1726 # whitespace before and/or after this tag. This can get 1727 # complicated because certain tags, like <pre> and 1728 # <script>, can't be prettified, since adding whitespace would 1729 # change the meaning of the content. 1730 1731 # The default behavior is to add whitespace before and 1732 # after an element when string literal mode is off, and to 1733 # leave things as they are when string literal mode is on. 1734 if string_literal_tag: 1735 indent_before = indent_after = False 1736 else: 1737 indent_before = indent_after = True 1738 1739 # The only time the behavior is more complex than that is 1740 # when we encounter an opening or closing tag that might 1741 # put us into or out of string literal mode. 1742 if (event is Tag.START_ELEMENT_EVENT 1743 and not string_literal_tag 1744 and not element._should_pretty_print()): 1745 # We are about to enter string literal mode. Add 1746 # whitespace before this tag, but not after. We 1747 # will stay in string literal mode until this tag 1748 # is closed. 1749 indent_before = True 1750 indent_after = False 1751 string_literal_tag = element 1752 elif (event is Tag.END_ELEMENT_EVENT 1753 and element is string_literal_tag): 1754 # We are about to exit string literal mode by closing 1755 # the tag that sent us into that mode. Add whitespace 1756 # after this tag, but not before. 1757 indent_before = False 1758 indent_after = True 1759 string_literal_tag = None 1760 1761 # Now we know whether to add whitespace before and/or 1762 # after this element. 1763 if indent_level is not None: 1764 if (indent_before or indent_after): 1765 if isinstance(element, NavigableString): 1766 piece = piece.strip() 1767 if piece: 1768 piece = self._indent_string( 1769 piece, indent_level, formatter, 1770 indent_before, indent_after 1771 ) 1772 if event == Tag.START_ELEMENT_EVENT: 1773 indent_level += 1 1774 pieces.append(piece) 1775 return "".join(pieces) 1776 1777 # Names for the different events yielded by _event_stream 1778 START_ELEMENT_EVENT = object() 1779 END_ELEMENT_EVENT = object() 1780 EMPTY_ELEMENT_EVENT = object() 1781 STRING_ELEMENT_EVENT = object() 1782 1783 def _event_stream(self, iterator=None): 1784 """Yield a sequence of events that can be used to reconstruct the DOM 1785 for this element. 1786 1787 This lets us recreate the nested structure of this element 1788 (e.g. when formatting it as a string) without using recursive 1789 method calls. 1790 1791 This is similar in concept to the SAX API, but it's a simpler 1792 interface designed for internal use. The events are different 1793 from SAX and the arguments associated with the events are Tags 1794 and other Beautiful Soup objects. 1795 1796 :param iterator: An alternate iterator to use when traversing 1797 the tree. 1798 """ 1799 tag_stack = [] 1800 1801 iterator = iterator or self.self_and_descendants 1802 1803 for c in iterator: 1804 # If the parent of the element we're about to yield is not 1805 # the tag currently on the stack, it means that the tag on 1806 # the stack closed before this element appeared. 1807 while tag_stack and c.parent != tag_stack[-1]: 1808 now_closed_tag = tag_stack.pop() 1809 yield Tag.END_ELEMENT_EVENT, now_closed_tag 1810 1811 if isinstance(c, Tag): 1812 if c.is_empty_element: 1813 yield Tag.EMPTY_ELEMENT_EVENT, c 1814 else: 1815 yield Tag.START_ELEMENT_EVENT, c 1816 tag_stack.append(c) 1817 continue 1818 else: 1819 yield Tag.STRING_ELEMENT_EVENT, c 1820 1821 while tag_stack: 1822 now_closed_tag = tag_stack.pop() 1823 yield Tag.END_ELEMENT_EVENT, now_closed_tag 1824 1825 def _indent_string(self, s, indent_level, formatter, 1826 indent_before, indent_after): 1827 """Add indentation whitespace before and/or after a string. 1828 1829 :param s: The string to amend with whitespace. 1830 :param indent_level: The indentation level; affects how much 1831 whitespace goes before the string. 1832 :param indent_before: Whether or not to add whitespace 1833 before the string. 1834 :param indent_after: Whether or not to add whitespace 1835 (a newline) after the string. 1836 """ 1837 space_before = '' 1838 if indent_before and indent_level: 1839 space_before = (formatter.indent * indent_level) 1840 1841 space_after = '' 1842 if indent_after: 1843 space_after = "\n" 1844 1845 return space_before + s + space_after 1846 1847 def _format_tag(self, eventual_encoding, formatter, opening): 1848 if self.hidden: 1849 # A hidden tag is invisible, although its contents 1850 # are visible. 1851 return '' 1852 1853 # A tag starts with the < character (see below). 1854 1855 # Then the / character, if this is a closing tag. 1856 closing_slash = '' 1857 if not opening: 1858 closing_slash = '/' 1859 1860 # Then an optional namespace prefix. 1861 prefix = '' 1862 if self.prefix: 1863 prefix = self.prefix + ":" 1864 1865 # Then a list of attribute values, if this is an opening tag. 1866 attribute_string = '' 1867 if opening: 1868 attributes = formatter.attributes(self) 1869 attrs = [] 1870 for key, val in attributes: 1871 if val is None: 1872 decoded = key 1873 else: 1874 if isinstance(val, list) or isinstance(val, tuple): 1875 val = ' '.join(val) 1876 elif not isinstance(val, str): 1877 val = str(val) 1878 elif ( 1879 isinstance(val, AttributeValueWithCharsetSubstitution) 1880 and eventual_encoding is not None 1881 ): 1882 val = val.encode(eventual_encoding) 1883 1884 text = formatter.attribute_value(val) 1885 decoded = ( 1886 str(key) + '=' 1887 + formatter.quoted_attribute_value(text)) 1888 attrs.append(decoded) 1889 if attrs: 1890 attribute_string = ' ' + ' '.join(attrs) 1891 1892 # Then an optional closing slash (for a void element in an 1893 # XML document). 1894 void_element_closing_slash = '' 1895 if self.is_empty_element: 1896 void_element_closing_slash = formatter.void_element_close_prefix or '' 1897 1898 # Put it all together. 1899 return '<' + closing_slash + prefix + self.name + attribute_string + void_element_closing_slash + '>' 1900 1901 def _should_pretty_print(self, indent_level=1): 1902 """Should this tag be pretty-printed? 1903 1904 Most of them should, but some (such as <pre> in HTML 1905 documents) should not. 1906 """ 1907 return ( 1908 indent_level is not None 1909 and ( 1910 not self.preserve_whitespace_tags 1911 or self.name not in self.preserve_whitespace_tags 1912 ) 1913 ) 1914 1915 def prettify(self, encoding=None, formatter="minimal"): 1916 """Pretty-print this PageElement as a string. 1917 1918 :param encoding: The eventual encoding of the string. If this is None, 1919 a Unicode string will be returned. 1920 :param formatter: A Formatter object, or a string naming one of 1921 the standard formatters. 1922 :return: A Unicode string (if encoding==None) or a bytestring 1923 (otherwise). 1924 """ 1925 if encoding is None: 1926 return self.decode(True, formatter=formatter) 1927 else: 1928 return self.encode(encoding, True, formatter=formatter) 1929 1930 def decode_contents(self, indent_level=None, 1931 eventual_encoding=DEFAULT_OUTPUT_ENCODING, 1932 formatter="minimal"): 1933 """Renders the contents of this tag as a Unicode string. 1934 1935 :param indent_level: Each line of the rendering will be 1936 indented this many levels. (The formatter decides what a 1937 'level' means in terms of spaces or other characters 1938 output.) Used internally in recursive calls while 1939 pretty-printing. 1940 1941 :param eventual_encoding: The tag is destined to be 1942 encoded into this encoding. decode_contents() is _not_ 1943 responsible for performing that encoding. This information 1944 is passed in so that it can be substituted in if the 1945 document contains a <META> tag that mentions the document's 1946 encoding. 1947 1948 :param formatter: A Formatter object, or a string naming one of 1949 the standard Formatters. 1950 1951 """ 1952 return self.decode(indent_level, eventual_encoding, formatter, 1953 iterator=self.descendants) 1954 1955 def encode_contents( 1956 self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING, 1957 formatter="minimal"): 1958 """Renders the contents of this PageElement as a bytestring. 1959 1960 :param indent_level: Each line of the rendering will be 1961 indented this many levels. (The formatter decides what a 1962 'level' means in terms of spaces or other characters 1963 output.) Used internally in recursive calls while 1964 pretty-printing. 1965 1966 :param eventual_encoding: The bytestring will be in this encoding. 1967 1968 :param formatter: A Formatter object, or a string naming one of 1969 the standard Formatters. 1970 1971 :return: A bytestring. 1972 """ 1973 contents = self.decode_contents(indent_level, encoding, formatter) 1974 return contents.encode(encoding) 1975 1976 # Old method for BS3 compatibility 1977 def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, 1978 prettyPrint=False, indentLevel=0): 1979 """Deprecated method for BS3 compatibility.""" 1980 if not prettyPrint: 1981 indentLevel = None 1982 return self.encode_contents( 1983 indent_level=indentLevel, encoding=encoding) 1984 1985 #Soup methods 1986 1987 def find(self, name=None, attrs={}, recursive=True, string=None, 1988 **kwargs): 1989 """Look in the children of this PageElement and find the first 1990 PageElement that matches the given criteria. 1991 1992 All find_* methods take a common set of arguments. See the online 1993 documentation for detailed explanations. 1994 1995 :param name: A filter on tag name. 1996 :param attrs: A dictionary of filters on attribute values. 1997 :param recursive: If this is True, find() will perform a 1998 recursive search of this PageElement's children. Otherwise, 1999 only the direct children will be considered. 2000 :param limit: Stop looking after finding this many results. 2001 :kwargs: A dictionary of filters on attribute values. 2002 :return: A PageElement. 2003 :rtype: bs4.element.Tag | bs4.element.NavigableString 2004 """ 2005 r = None 2006 l = self.find_all(name, attrs, recursive, string, 1, _stacklevel=3, 2007 **kwargs) 2008 if l: 2009 r = l[0] 2010 return r 2011 findChild = find #BS2 2012 2013 def find_all(self, name=None, attrs={}, recursive=True, string=None, 2014 limit=None, **kwargs): 2015 """Look in the children of this PageElement and find all 2016 PageElements that match the given criteria. 2017 2018 All find_* methods take a common set of arguments. See the online 2019 documentation for detailed explanations. 2020 2021 :param name: A filter on tag name. 2022 :param attrs: A dictionary of filters on attribute values. 2023 :param recursive: If this is True, find_all() will perform a 2024 recursive search of this PageElement's children. Otherwise, 2025 only the direct children will be considered. 2026 :param limit: Stop looking after finding this many results. 2027 :kwargs: A dictionary of filters on attribute values. 2028 :return: A ResultSet of PageElements. 2029 :rtype: bs4.element.ResultSet 2030 """ 2031 generator = self.descendants 2032 if not recursive: 2033 generator = self.children 2034 _stacklevel = kwargs.pop('_stacklevel', 2) 2035 return self._find_all(name, attrs, string, limit, generator, 2036 _stacklevel=_stacklevel+1, **kwargs) 2037 findAll = find_all # BS3 2038 findChildren = find_all # BS2 2039 2040 #Generator methods 2041 @property 2042 def children(self): 2043 """Iterate over all direct children of this PageElement. 2044 2045 :yield: A sequence of PageElements. 2046 """ 2047 # return iter() to make the purpose of the method clear 2048 return iter(self.contents) # XXX This seems to be untested. 2049 2050 @property 2051 def self_and_descendants(self): 2052 """Iterate over this PageElement and its children in a 2053 breadth-first sequence. 2054 2055 :yield: A sequence of PageElements. 2056 """ 2057 if not self.hidden: 2058 yield self 2059 for i in self.descendants: 2060 yield i 2061 2062 @property 2063 def descendants(self): 2064 """Iterate over all children of this PageElement in a 2065 breadth-first sequence. 2066 2067 :yield: A sequence of PageElements. 2068 """ 2069 if not len(self.contents): 2070 return 2071 stopNode = self._last_descendant().next_element 2072 current = self.contents[0] 2073 while current is not stopNode: 2074 yield current 2075 current = current.next_element 2076 2077 # CSS selector code 2078 def select_one(self, selector, namespaces=None, **kwargs): 2079 """Perform a CSS selection operation on the current element. 2080 2081 :param selector: A CSS selector. 2082 2083 :param namespaces: A dictionary mapping namespace prefixes 2084 used in the CSS selector to namespace URIs. By default, 2085 Beautiful Soup will use the prefixes it encountered while 2086 parsing the document. 2087 2088 :param kwargs: Keyword arguments to be passed into Soup Sieve's 2089 soupsieve.select() method. 2090 2091 :return: A Tag. 2092 :rtype: bs4.element.Tag 2093 """ 2094 return self.css.select_one(selector, namespaces, **kwargs) 2095 2096 def select(self, selector, namespaces=None, limit=None, **kwargs): 2097 """Perform a CSS selection operation on the current element. 2098 2099 This uses the SoupSieve library. 2100 2101 :param selector: A string containing a CSS selector. 2102 2103 :param namespaces: A dictionary mapping namespace prefixes 2104 used in the CSS selector to namespace URIs. By default, 2105 Beautiful Soup will use the prefixes it encountered while 2106 parsing the document. 2107 2108 :param limit: After finding this number of results, stop looking. 2109 2110 :param kwargs: Keyword arguments to be passed into SoupSieve's 2111 soupsieve.select() method. 2112 2113 :return: A ResultSet of Tags. 2114 :rtype: bs4.element.ResultSet 2115 """ 2116 return self.css.select(selector, namespaces, limit, **kwargs) 2117 2118 @property 2119 def css(self): 2120 """Return an interface to the CSS selector API.""" 2121 return CSS(self) 2122 2123 # Old names for backwards compatibility 2124 def childGenerator(self): 2125 """Deprecated generator.""" 2126 return self.children 2127 2128 def recursiveChildGenerator(self): 2129 """Deprecated generator.""" 2130 return self.descendants 2131 2132 def has_key(self, key): 2133 """Deprecated method. This was kind of misleading because has_key() 2134 (attributes) was different from __in__ (contents). 2135 2136 has_key() is gone in Python 3, anyway. 2137 """ 2138 warnings.warn( 2139 'has_key is deprecated. Use has_attr(key) instead.', 2140 DeprecationWarning, stacklevel=2 2141 ) 2142 return self.has_attr(key) 2143 2144# Next, a couple classes to represent queries and their results. 2145class SoupStrainer(object): 2146 """Encapsulates a number of ways of matching a markup element (tag or 2147 string). 2148 2149 This is primarily used to underpin the find_* methods, but you can 2150 create one yourself and pass it in as `parse_only` to the 2151 `BeautifulSoup` constructor, to parse a subset of a large 2152 document. 2153 """ 2154 2155 def __init__(self, name=None, attrs={}, string=None, **kwargs): 2156 """Constructor. 2157 2158 The SoupStrainer constructor takes the same arguments passed 2159 into the find_* methods. See the online documentation for 2160 detailed explanations. 2161 2162 :param name: A filter on tag name. 2163 :param attrs: A dictionary of filters on attribute values. 2164 :param string: A filter for a NavigableString with specific text. 2165 :kwargs: A dictionary of filters on attribute values. 2166 """ 2167 if string is None and 'text' in kwargs: 2168 string = kwargs.pop('text') 2169 warnings.warn( 2170 "The 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead.", 2171 DeprecationWarning, stacklevel=2 2172 ) 2173 2174 self.name = self._normalize_search_value(name) 2175 if not isinstance(attrs, dict): 2176 # Treat a non-dict value for attrs as a search for the 'class' 2177 # attribute. 2178 kwargs['class'] = attrs 2179 attrs = None 2180 2181 if 'class_' in kwargs: 2182 # Treat class_="foo" as a search for the 'class' 2183 # attribute, overriding any non-dict value for attrs. 2184 kwargs['class'] = kwargs['class_'] 2185 del kwargs['class_'] 2186 2187 if kwargs: 2188 if attrs: 2189 attrs = attrs.copy() 2190 attrs.update(kwargs) 2191 else: 2192 attrs = kwargs 2193 normalized_attrs = {} 2194 for key, value in list(attrs.items()): 2195 normalized_attrs[key] = self._normalize_search_value(value) 2196 2197 self.attrs = normalized_attrs 2198 self.string = self._normalize_search_value(string) 2199 2200 # DEPRECATED but just in case someone is checking this. 2201 self.text = self.string 2202 2203 def _normalize_search_value(self, value): 2204 # Leave it alone if it's a Unicode string, a callable, a 2205 # regular expression, a boolean, or None. 2206 if (isinstance(value, str) or isinstance(value, Callable) or hasattr(value, 'match') 2207 or isinstance(value, bool) or value is None): 2208 return value 2209 2210 # If it's a bytestring, convert it to Unicode, treating it as UTF-8. 2211 if isinstance(value, bytes): 2212 return value.decode("utf8") 2213 2214 # If it's listlike, convert it into a list of strings. 2215 if hasattr(value, '__iter__'): 2216 new_value = [] 2217 for v in value: 2218 if (hasattr(v, '__iter__') and not isinstance(v, bytes) 2219 and not isinstance(v, str)): 2220 # This is almost certainly the user's mistake. In the 2221 # interests of avoiding infinite loops, we'll let 2222 # it through as-is rather than doing a recursive call. 2223 new_value.append(v) 2224 else: 2225 new_value.append(self._normalize_search_value(v)) 2226 return new_value 2227 2228 # Otherwise, convert it into a Unicode string. 2229 # The unicode(str()) thing is so this will do the same thing on Python 2 2230 # and Python 3. 2231 return str(str(value)) 2232 2233 def __str__(self): 2234 """A human-readable representation of this SoupStrainer.""" 2235 if self.string: 2236 return self.string 2237 else: 2238 return "%s|%s" % (self.name, self.attrs) 2239 2240 def search_tag(self, markup_name=None, markup_attrs={}): 2241 """Check whether a Tag with the given name and attributes would 2242 match this SoupStrainer. 2243 2244 Used prospectively to decide whether to even bother creating a Tag 2245 object. 2246 2247 :param markup_name: A tag name as found in some markup. 2248 :param markup_attrs: A dictionary of attributes as found in some markup. 2249 2250 :return: True if the prospective tag would match this SoupStrainer; 2251 False otherwise. 2252 """ 2253 found = None 2254 markup = None 2255 if isinstance(markup_name, Tag): 2256 markup = markup_name 2257 markup_attrs = markup 2258 2259 if isinstance(self.name, str): 2260 # Optimization for a very common case where the user is 2261 # searching for a tag with one specific name, and we're 2262 # looking at a tag with a different name. 2263 if markup and not markup.prefix and self.name != markup.name: 2264 return False 2265 2266 call_function_with_tag_data = ( 2267 isinstance(self.name, Callable) 2268 and not isinstance(markup_name, Tag)) 2269 2270 if ((not self.name) 2271 or call_function_with_tag_data 2272 or (markup and self._matches(markup, self.name)) 2273 or (not markup and self._matches(markup_name, self.name))): 2274 if call_function_with_tag_data: 2275 match = self.name(markup_name, markup_attrs) 2276 else: 2277 match = True 2278 markup_attr_map = None 2279 for attr, match_against in list(self.attrs.items()): 2280 if not markup_attr_map: 2281 if hasattr(markup_attrs, 'get'): 2282 markup_attr_map = markup_attrs 2283 else: 2284 markup_attr_map = {} 2285 for k, v in markup_attrs: 2286 markup_attr_map[k] = v 2287 attr_value = markup_attr_map.get(attr) 2288 if not self._matches(attr_value, match_against): 2289 match = False 2290 break 2291 if match: 2292 if markup: 2293 found = markup 2294 else: 2295 found = markup_name 2296 if found and self.string and not self._matches(found.string, self.string): 2297 found = None 2298 return found 2299 2300 # For BS3 compatibility. 2301 searchTag = search_tag 2302 2303 def search(self, markup): 2304 """Find all items in `markup` that match this SoupStrainer. 2305 2306 Used by the core _find_all() method, which is ultimately 2307 called by all find_* methods. 2308 2309 :param markup: A PageElement or a list of them. 2310 """ 2311 # print('looking for %s in %s' % (self, markup)) 2312 found = None 2313 # If given a list of items, scan it for a text element that 2314 # matches. 2315 if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)): 2316 for element in markup: 2317 if isinstance(element, NavigableString) \ 2318 and self.search(element): 2319 found = element 2320 break 2321 # If it's a Tag, make sure its name or attributes match. 2322 # Don't bother with Tags if we're searching for text. 2323 elif isinstance(markup, Tag): 2324 if not self.string or self.name or self.attrs: 2325 found = self.search_tag(markup) 2326 # If it's text, make sure the text matches. 2327 elif isinstance(markup, NavigableString) or \ 2328 isinstance(markup, str): 2329 if not self.name and not self.attrs and self._matches(markup, self.string): 2330 found = markup 2331 else: 2332 raise Exception( 2333 "I don't know how to match against a %s" % markup.__class__) 2334 return found 2335 2336 def _matches(self, markup, match_against, already_tried=None): 2337 # print(u"Matching %s against %s" % (markup, match_against)) 2338 result = False 2339 if isinstance(markup, list) or isinstance(markup, tuple): 2340 # This should only happen when searching a multi-valued attribute 2341 # like 'class'. 2342 for item in markup: 2343 if self._matches(item, match_against): 2344 return True 2345 # We didn't match any particular value of the multivalue 2346 # attribute, but maybe we match the attribute value when 2347 # considered as a string. 2348 if self._matches(' '.join(markup), match_against): 2349 return True 2350 return False 2351 2352 if match_against is True: 2353 # True matches any non-None value. 2354 return markup is not None 2355 2356 if isinstance(match_against, Callable): 2357 return match_against(markup) 2358 2359 # Custom callables take the tag as an argument, but all 2360 # other ways of matching match the tag name as a string. 2361 original_markup = markup 2362 if isinstance(markup, Tag): 2363 markup = markup.name 2364 2365 # Ensure that `markup` is either a Unicode string, or None. 2366 markup = self._normalize_search_value(markup) 2367 2368 if markup is None: 2369 # None matches None, False, an empty string, an empty list, and so on. 2370 return not match_against 2371 2372 if (hasattr(match_against, '__iter__') 2373 and not isinstance(match_against, str)): 2374 # We're asked to match against an iterable of items. 2375 # The markup must be match at least one item in the 2376 # iterable. We'll try each one in turn. 2377 # 2378 # To avoid infinite recursion we need to keep track of 2379 # items we've already seen. 2380 if not already_tried: 2381 already_tried = set() 2382 for item in match_against: 2383 if item.__hash__: 2384 key = item 2385 else: 2386 key = id(item) 2387 if key in already_tried: 2388 continue 2389 else: 2390 already_tried.add(key) 2391 if self._matches(original_markup, item, already_tried): 2392 return True 2393 else: 2394 return False 2395 2396 # Beyond this point we might need to run the test twice: once against 2397 # the tag's name and once against its prefixed name. 2398 match = False 2399 2400 if not match and isinstance(match_against, str): 2401 # Exact string match 2402 match = markup == match_against 2403 2404 if not match and hasattr(match_against, 'search'): 2405 # Regexp match 2406 return match_against.search(markup) 2407 2408 if (not match 2409 and isinstance(original_markup, Tag) 2410 and original_markup.prefix): 2411 # Try the whole thing again with the prefixed tag name. 2412 return self._matches( 2413 original_markup.prefix + ':' + original_markup.name, match_against 2414 ) 2415 2416 return match 2417 2418 2419class ResultSet(list): 2420 """A ResultSet is just a list that keeps track of the SoupStrainer 2421 that created it.""" 2422 def __init__(self, source, result=()): 2423 """Constructor. 2424 2425 :param source: A SoupStrainer. 2426 :param result: A list of PageElements. 2427 """ 2428 super(ResultSet, self).__init__(result) 2429 self.source = source 2430 2431 def __getattr__(self, key): 2432 """Raise a helpful exception to explain a common code fix.""" 2433 raise AttributeError( 2434 "ResultSet object has no attribute '%s'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?" % key 2435 ) 2436