1__license__ = "MIT" 2 3import collections.abc 4import re 5import sys 6import warnings 7from bs4.dammit import EntitySubstitution 8 9DEFAULT_OUTPUT_ENCODING = "utf-8" 10PY3K = (sys.version_info[0] > 2) 11 12whitespace_re = re.compile(r"\s+") 13 14def _alias(attr): 15 """Alias one attribute name to another for backward compatibility""" 16 @property 17 def alias(self): 18 return getattr(self, attr) 19 20 @alias.setter 21 def alias(self): 22 return setattr(self, attr) 23 return alias 24 25 26class NamespacedAttribute(str): 27 28 def __new__(cls, prefix, name, namespace=None): 29 if name is None: 30 obj = str.__new__(cls, prefix) 31 elif prefix is None: 32 # Not really namespaced. 33 obj = str.__new__(cls, name) 34 else: 35 obj = str.__new__(cls, prefix + ":" + name) 36 obj.prefix = prefix 37 obj.name = name 38 obj.namespace = namespace 39 return obj 40 41class AttributeValueWithCharsetSubstitution(str): 42 """A stand-in object for a character encoding specified in HTML.""" 43 44class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): 45 """A generic stand-in for the value of a meta tag's 'charset' attribute. 46 47 When Beautiful Soup parses the markup '<meta charset="utf8">', the 48 value of the 'charset' attribute will be one of these objects. 49 """ 50 51 def __new__(cls, original_value): 52 obj = str.__new__(cls, original_value) 53 obj.original_value = original_value 54 return obj 55 56 def encode(self, encoding): 57 return encoding 58 59 60class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): 61 """A generic stand-in for the value of a meta tag's 'content' attribute. 62 63 When Beautiful Soup parses the markup: 64 <meta http-equiv="content-type" content="text/html; charset=utf8"> 65 66 The value of the 'content' attribute will be one of these objects. 67 """ 68 69 CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M) 70 71 def __new__(cls, original_value): 72 match = cls.CHARSET_RE.search(original_value) 73 if match is None: 74 # No substitution necessary. 75 return str.__new__(str, original_value) 76 77 obj = str.__new__(cls, original_value) 78 obj.original_value = original_value 79 return obj 80 81 def encode(self, encoding): 82 def rewrite(match): 83 return match.group(1) + encoding 84 return self.CHARSET_RE.sub(rewrite, self.original_value) 85 86class HTMLAwareEntitySubstitution(EntitySubstitution): 87 88 """Entity substitution rules that are aware of some HTML quirks. 89 90 Specifically, the contents of <script> and <style> tags should not 91 undergo entity substitution. 92 93 Incoming NavigableString objects are checked to see if they're the 94 direct children of a <script> or <style> tag. 95 """ 96 97 cdata_containing_tags = set(["script", "style"]) 98 99 preformatted_tags = set(["pre"]) 100 101 @classmethod 102 def _substitute_if_appropriate(cls, ns, f): 103 if (isinstance(ns, NavigableString) 104 and ns.parent is not None 105 and ns.parent.name in cls.cdata_containing_tags): 106 # Do nothing. 107 return ns 108 # Substitute. 109 return f(ns) 110 111 @classmethod 112 def substitute_html(cls, ns): 113 return cls._substitute_if_appropriate( 114 ns, EntitySubstitution.substitute_html) 115 116 @classmethod 117 def substitute_xml(cls, ns): 118 return cls._substitute_if_appropriate( 119 ns, EntitySubstitution.substitute_xml) 120 121class PageElement(object): 122 """Contains the navigational information for some part of the page 123 (either a tag or a piece of text)""" 124 125 # There are five possible values for the "formatter" argument passed in 126 # to methods like encode() and prettify(): 127 # 128 # "html" - All Unicode characters with corresponding HTML entities 129 # are converted to those entities on output. 130 # "minimal" - Bare ampersands and angle brackets are converted to 131 # XML entities: & < > 132 # None - The null formatter. Unicode characters are never 133 # converted to entities. This is not recommended, but it's 134 # faster than "minimal". 135 # A function - This function will be called on every string that 136 # needs to undergo entity substitution. 137 # 138 139 # In an HTML document, the default "html" and "minimal" functions 140 # will leave the contents of <script> and <style> tags alone. For 141 # an XML document, all tags will be given the same treatment. 142 143 HTML_FORMATTERS = { 144 "html" : HTMLAwareEntitySubstitution.substitute_html, 145 "minimal" : HTMLAwareEntitySubstitution.substitute_xml, 146 None : None 147 } 148 149 XML_FORMATTERS = { 150 "html" : EntitySubstitution.substitute_html, 151 "minimal" : EntitySubstitution.substitute_xml, 152 None : None 153 } 154 155 def format_string(self, s, formatter='minimal'): 156 """Format the given string using the given formatter.""" 157 if not isinstance(formatter, collections.abc.Callable): 158 formatter = self._formatter_for_name(formatter) 159 if formatter is None: 160 output = s 161 else: 162 output = formatter(s) 163 return output 164 165 @property 166 def _is_xml(self): 167 """Is this element part of an XML tree or an HTML tree? 168 169 This is used when mapping a formatter name ("minimal") to an 170 appropriate function (one that performs entity-substitution on 171 the contents of <script> and <style> tags, or not). It's 172 inefficient, but it should be called very rarely. 173 """ 174 if self.parent is None: 175 # This is the top-level object. It should have .is_xml set 176 # from tree creation. If not, take a guess--BS is usually 177 # used on HTML markup. 178 return getattr(self, 'is_xml', False) 179 return self.parent._is_xml 180 181 def _formatter_for_name(self, name): 182 "Look up a formatter function based on its name and the tree." 183 if self._is_xml: 184 return self.XML_FORMATTERS.get( 185 name, EntitySubstitution.substitute_xml) 186 else: 187 return self.HTML_FORMATTERS.get( 188 name, HTMLAwareEntitySubstitution.substitute_xml) 189 190 def setup(self, parent=None, previous_element=None, next_element=None, 191 previous_sibling=None, next_sibling=None): 192 """Sets up the initial relations between this element and 193 other elements.""" 194 self.parent = parent 195 196 self.previous_element = previous_element 197 if previous_element is not None: 198 self.previous_element.next_element = self 199 200 self.next_element = next_element 201 if self.next_element: 202 self.next_element.previous_element = self 203 204 self.next_sibling = next_sibling 205 if self.next_sibling: 206 self.next_sibling.previous_sibling = self 207 208 if (not previous_sibling 209 and self.parent is not None and self.parent.contents): 210 previous_sibling = self.parent.contents[-1] 211 212 self.previous_sibling = previous_sibling 213 if previous_sibling: 214 self.previous_sibling.next_sibling = self 215 216 nextSibling = _alias("next_sibling") # BS3 217 previousSibling = _alias("previous_sibling") # BS3 218 219 def replace_with(self, replace_with): 220 if not self.parent: 221 raise ValueError( 222 "Cannot replace one element with another when the" 223 "element to be replaced is not part of a tree.") 224 if replace_with is self: 225 return 226 if replace_with is self.parent: 227 raise ValueError("Cannot replace a Tag with its parent.") 228 old_parent = self.parent 229 my_index = self.parent.index(self) 230 self.extract() 231 old_parent.insert(my_index, replace_with) 232 return self 233 replaceWith = replace_with # BS3 234 235 def unwrap(self): 236 my_parent = self.parent 237 if not self.parent: 238 raise ValueError( 239 "Cannot replace an element with its contents when that" 240 "element is not part of a tree.") 241 my_index = self.parent.index(self) 242 self.extract() 243 for child in reversed(self.contents[:]): 244 my_parent.insert(my_index, child) 245 return self 246 replace_with_children = unwrap 247 replaceWithChildren = unwrap # BS3 248 249 def wrap(self, wrap_inside): 250 me = self.replace_with(wrap_inside) 251 wrap_inside.append(me) 252 return wrap_inside 253 254 def extract(self): 255 """Destructively rips this element out of the tree.""" 256 if self.parent is not None: 257 del self.parent.contents[self.parent.index(self)] 258 259 #Find the two elements that would be next to each other if 260 #this element (and any children) hadn't been parsed. Connect 261 #the two. 262 last_child = self._last_descendant() 263 next_element = last_child.next_element 264 265 if (self.previous_element is not None and 266 self.previous_element is not next_element): 267 self.previous_element.next_element = next_element 268 if next_element is not None and next_element is not self.previous_element: 269 next_element.previous_element = self.previous_element 270 self.previous_element = None 271 last_child.next_element = None 272 273 self.parent = None 274 if (self.previous_sibling is not None 275 and self.previous_sibling is not self.next_sibling): 276 self.previous_sibling.next_sibling = self.next_sibling 277 if (self.next_sibling is not None 278 and self.next_sibling is not self.previous_sibling): 279 self.next_sibling.previous_sibling = self.previous_sibling 280 self.previous_sibling = self.next_sibling = None 281 return self 282 283 def _last_descendant(self, is_initialized=True, accept_self=True): 284 "Finds the last element beneath this object to be parsed." 285 if is_initialized and self.next_sibling: 286 last_child = self.next_sibling.previous_element 287 else: 288 last_child = self 289 while isinstance(last_child, Tag) and last_child.contents: 290 last_child = last_child.contents[-1] 291 if not accept_self and last_child is self: 292 last_child = None 293 return last_child 294 # BS3: Not part of the API! 295 _lastRecursiveChild = _last_descendant 296 297 def insert(self, position, new_child): 298 if new_child is None: 299 raise ValueError("Cannot insert None into a tag.") 300 if new_child is self: 301 raise ValueError("Cannot insert a tag into itself.") 302 if (isinstance(new_child, str) 303 and not isinstance(new_child, NavigableString)): 304 new_child = NavigableString(new_child) 305 306 position = min(position, len(self.contents)) 307 if hasattr(new_child, 'parent') and new_child.parent is not None: 308 # We're 'inserting' an element that's already one 309 # of this object's children. 310 if new_child.parent is self: 311 current_index = self.index(new_child) 312 if current_index < position: 313 # We're moving this element further down the list 314 # of this object's children. That means that when 315 # we extract this element, our target index will 316 # jump down one. 317 position -= 1 318 new_child.extract() 319 320 new_child.parent = self 321 previous_child = None 322 if position == 0: 323 new_child.previous_sibling = None 324 new_child.previous_element = self 325 else: 326 previous_child = self.contents[position - 1] 327 new_child.previous_sibling = previous_child 328 new_child.previous_sibling.next_sibling = new_child 329 new_child.previous_element = previous_child._last_descendant(False) 330 if new_child.previous_element is not None: 331 new_child.previous_element.next_element = new_child 332 333 new_childs_last_element = new_child._last_descendant(False) 334 335 if position >= len(self.contents): 336 new_child.next_sibling = None 337 338 parent = self 339 parents_next_sibling = None 340 while parents_next_sibling is None and parent is not None: 341 parents_next_sibling = parent.next_sibling 342 parent = parent.parent 343 if parents_next_sibling is not None: 344 # We found the element that comes next in the document. 345 break 346 if parents_next_sibling is not None: 347 new_childs_last_element.next_element = parents_next_sibling 348 else: 349 # The last element of this tag is the last element in 350 # the document. 351 new_childs_last_element.next_element = None 352 else: 353 next_child = self.contents[position] 354 new_child.next_sibling = next_child 355 if new_child.next_sibling is not None: 356 new_child.next_sibling.previous_sibling = new_child 357 new_childs_last_element.next_element = next_child 358 359 if new_childs_last_element.next_element is not None: 360 new_childs_last_element.next_element.previous_element = new_childs_last_element 361 self.contents.insert(position, new_child) 362 363 def append(self, tag): 364 """Appends the given tag to the contents of this tag.""" 365 self.insert(len(self.contents), tag) 366 367 def insert_before(self, predecessor): 368 """Makes the given element the immediate predecessor of this one. 369 370 The two elements will have the same parent, and the given element 371 will be immediately before this one. 372 """ 373 if self is predecessor: 374 raise ValueError("Can't insert an element before itself.") 375 parent = self.parent 376 if parent is None: 377 raise ValueError( 378 "Element has no parent, so 'before' has no meaning.") 379 # Extract first so that the index won't be screwed up if they 380 # are siblings. 381 if isinstance(predecessor, PageElement): 382 predecessor.extract() 383 index = parent.index(self) 384 parent.insert(index, predecessor) 385 386 def insert_after(self, successor): 387 """Makes the given element the immediate successor of this one. 388 389 The two elements will have the same parent, and the given element 390 will be immediately after this one. 391 """ 392 if self is successor: 393 raise ValueError("Can't insert an element after itself.") 394 parent = self.parent 395 if parent is None: 396 raise ValueError( 397 "Element has no parent, so 'after' has no meaning.") 398 # Extract first so that the index won't be screwed up if they 399 # are siblings. 400 if isinstance(successor, PageElement): 401 successor.extract() 402 index = parent.index(self) 403 parent.insert(index+1, successor) 404 405 def find_next(self, name=None, attrs={}, text=None, **kwargs): 406 """Returns the first item that matches the given criteria and 407 appears after this Tag in the document.""" 408 return self._find_one(self.find_all_next, name, attrs, text, **kwargs) 409 findNext = find_next # BS3 410 411 def find_all_next(self, name=None, attrs={}, text=None, limit=None, 412 **kwargs): 413 """Returns all items that match the given criteria and appear 414 after this Tag in the document.""" 415 return self._find_all(name, attrs, text, limit, self.next_elements, 416 **kwargs) 417 findAllNext = find_all_next # BS3 418 419 def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs): 420 """Returns the closest sibling to this Tag that matches the 421 given criteria and appears after this Tag in the document.""" 422 return self._find_one(self.find_next_siblings, name, attrs, text, 423 **kwargs) 424 findNextSibling = find_next_sibling # BS3 425 426 def find_next_siblings(self, name=None, attrs={}, text=None, limit=None, 427 **kwargs): 428 """Returns the siblings of this Tag that match the given 429 criteria and appear after this Tag in the document.""" 430 return self._find_all(name, attrs, text, limit, 431 self.next_siblings, **kwargs) 432 findNextSiblings = find_next_siblings # BS3 433 fetchNextSiblings = find_next_siblings # BS2 434 435 def find_previous(self, name=None, attrs={}, text=None, **kwargs): 436 """Returns the first item that matches the given criteria and 437 appears before this Tag in the document.""" 438 return self._find_one( 439 self.find_all_previous, name, attrs, text, **kwargs) 440 findPrevious = find_previous # BS3 441 442 def find_all_previous(self, name=None, attrs={}, text=None, limit=None, 443 **kwargs): 444 """Returns all items that match the given criteria and appear 445 before this Tag in the document.""" 446 return self._find_all(name, attrs, text, limit, self.previous_elements, 447 **kwargs) 448 findAllPrevious = find_all_previous # BS3 449 fetchPrevious = find_all_previous # BS2 450 451 def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs): 452 """Returns the closest sibling to this Tag that matches the 453 given criteria and appears before this Tag in the document.""" 454 return self._find_one(self.find_previous_siblings, name, attrs, text, 455 **kwargs) 456 findPreviousSibling = find_previous_sibling # BS3 457 458 def find_previous_siblings(self, name=None, attrs={}, text=None, 459 limit=None, **kwargs): 460 """Returns the siblings of this Tag that match the given 461 criteria and appear before this Tag in the document.""" 462 return self._find_all(name, attrs, text, limit, 463 self.previous_siblings, **kwargs) 464 findPreviousSiblings = find_previous_siblings # BS3 465 fetchPreviousSiblings = find_previous_siblings # BS2 466 467 def find_parent(self, name=None, attrs={}, **kwargs): 468 """Returns the closest parent of this Tag that matches the given 469 criteria.""" 470 # NOTE: We can't use _find_one because findParents takes a different 471 # set of arguments. 472 r = None 473 l = self.find_parents(name, attrs, 1, **kwargs) 474 if l: 475 r = l[0] 476 return r 477 findParent = find_parent # BS3 478 479 def find_parents(self, name=None, attrs={}, limit=None, **kwargs): 480 """Returns the parents of this Tag that match the given 481 criteria.""" 482 483 return self._find_all(name, attrs, None, limit, self.parents, 484 **kwargs) 485 findParents = find_parents # BS3 486 fetchParents = find_parents # BS2 487 488 @property 489 def next(self): 490 return self.next_element 491 492 @property 493 def previous(self): 494 return self.previous_element 495 496 #These methods do the real heavy lifting. 497 498 def _find_one(self, method, name, attrs, text, **kwargs): 499 r = None 500 l = method(name, attrs, text, 1, **kwargs) 501 if l: 502 r = l[0] 503 return r 504 505 def _find_all(self, name, attrs, text, limit, generator, **kwargs): 506 "Iterates over a generator looking for things that match." 507 508 if text is None and 'string' in kwargs: 509 text = kwargs['string'] 510 del kwargs['string'] 511 512 if isinstance(name, SoupStrainer): 513 strainer = name 514 else: 515 strainer = SoupStrainer(name, attrs, text, **kwargs) 516 517 if text is None and not limit and not attrs and not kwargs: 518 if name is True or name is None: 519 # Optimization to find all tags. 520 result = (element for element in generator 521 if isinstance(element, Tag)) 522 return ResultSet(strainer, result) 523 elif isinstance(name, str): 524 # Optimization to find all tags with a given name. 525 result = (element for element in generator 526 if isinstance(element, Tag) 527 and element.name == name) 528 return ResultSet(strainer, result) 529 results = ResultSet(strainer) 530 while True: 531 try: 532 i = next(generator) 533 except StopIteration: 534 break 535 if i: 536 found = strainer.search(i) 537 if found: 538 results.append(found) 539 if limit and len(results) >= limit: 540 break 541 return results 542 543 #These generators can be used to navigate starting from both 544 #NavigableStrings and Tags. 545 @property 546 def next_elements(self): 547 i = self.next_element 548 while i is not None: 549 yield i 550 i = i.next_element 551 552 @property 553 def next_siblings(self): 554 i = self.next_sibling 555 while i is not None: 556 yield i 557 i = i.next_sibling 558 559 @property 560 def previous_elements(self): 561 i = self.previous_element 562 while i is not None: 563 yield i 564 i = i.previous_element 565 566 @property 567 def previous_siblings(self): 568 i = self.previous_sibling 569 while i is not None: 570 yield i 571 i = i.previous_sibling 572 573 @property 574 def parents(self): 575 i = self.parent 576 while i is not None: 577 yield i 578 i = i.parent 579 580 # Methods for supporting CSS selectors. 581 582 tag_name_re = re.compile(r'^[a-zA-Z0-9][-.a-zA-Z0-9:_]*$') 583 584 # /^([a-zA-Z0-9][-.a-zA-Z0-9:_]*)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/ 585 # \---------------------------/ \---/\-------------/ \-------/ 586 # | | | | 587 # | | | The value 588 # | | ~,|,^,$,* or = 589 # | Attribute 590 # Tag 591 attribselect_re = re.compile( 592 r'^(?P<tag>[a-zA-Z0-9][-.a-zA-Z0-9:_]*)?\[(?P<attribute>[\w-]+)(?P<operator>[=~\|\^\$\*]?)' + 593 r'=?"?(?P<value>[^\]"]*)"?\]$' 594 ) 595 596 def _attr_value_as_string(self, value, default=None): 597 """Force an attribute value into a string representation. 598 599 A multi-valued attribute will be converted into a 600 space-separated stirng. 601 """ 602 value = self.get(value, default) 603 if isinstance(value, list) or isinstance(value, tuple): 604 value =" ".join(value) 605 return value 606 607 def _tag_name_matches_and(self, function, tag_name): 608 if not tag_name: 609 return function 610 else: 611 def _match(tag): 612 return tag.name == tag_name and function(tag) 613 return _match 614 615 def _attribute_checker(self, operator, attribute, value=''): 616 """Create a function that performs a CSS selector operation. 617 618 Takes an operator, attribute and optional value. Returns a 619 function that will return True for elements that match that 620 combination. 621 """ 622 if operator == '=': 623 # string representation of `attribute` is equal to `value` 624 return lambda el: el._attr_value_as_string(attribute) == value 625 elif operator == '~': 626 # space-separated list representation of `attribute` 627 # contains `value` 628 def _includes_value(element): 629 attribute_value = element.get(attribute, []) 630 if not isinstance(attribute_value, list): 631 attribute_value = attribute_value.split() 632 return value in attribute_value 633 return _includes_value 634 elif operator == '^': 635 # string representation of `attribute` starts with `value` 636 return lambda el: el._attr_value_as_string( 637 attribute, '').startswith(value) 638 elif operator == '$': 639 # string represenation of `attribute` ends with `value` 640 return lambda el: el._attr_value_as_string( 641 attribute, '').endswith(value) 642 elif operator == '*': 643 # string representation of `attribute` contains `value` 644 return lambda el: value in el._attr_value_as_string(attribute, '') 645 elif operator == '|': 646 # string representation of `attribute` is either exactly 647 # `value` or starts with `value` and then a dash. 648 def _is_or_starts_with_dash(element): 649 attribute_value = element._attr_value_as_string(attribute, '') 650 return (attribute_value == value or attribute_value.startswith( 651 value + '-')) 652 return _is_or_starts_with_dash 653 else: 654 return lambda el: el.has_attr(attribute) 655 656 # Old non-property versions of the generators, for backwards 657 # compatibility with BS3. 658 def nextGenerator(self): 659 return self.next_elements 660 661 def nextSiblingGenerator(self): 662 return self.next_siblings 663 664 def previousGenerator(self): 665 return self.previous_elements 666 667 def previousSiblingGenerator(self): 668 return self.previous_siblings 669 670 def parentGenerator(self): 671 return self.parents 672 673 674class NavigableString(str, PageElement): 675 676 PREFIX = '' 677 SUFFIX = '' 678 679 def __new__(cls, value): 680 """Create a new NavigableString. 681 682 When unpickling a NavigableString, this method is called with 683 the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be 684 passed in to the superclass's __new__ or the superclass won't know 685 how to handle non-ASCII characters. 686 """ 687 if isinstance(value, str): 688 u = str.__new__(cls, value) 689 else: 690 u = str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) 691 u.setup() 692 return u 693 694 def __copy__(self): 695 """A copy of a NavigableString has the same contents and class 696 as the original, but it is not connected to the parse tree. 697 """ 698 return type(self)(self) 699 700 def __getnewargs__(self): 701 return (str(self),) 702 703 def __getattr__(self, attr): 704 """text.string gives you text. This is for backwards 705 compatibility for Navigable*String, but for CData* it lets you 706 get the string without the CData wrapper.""" 707 if attr == 'string': 708 return self 709 else: 710 raise AttributeError( 711 "'%s' object has no attribute '%s'" % ( 712 self.__class__.__name__, attr)) 713 714 def output_ready(self, formatter="minimal"): 715 output = self.format_string(self, formatter) 716 return self.PREFIX + output + self.SUFFIX 717 718 @property 719 def name(self): 720 return None 721 722 @name.setter 723 def name(self, name): 724 raise AttributeError("A NavigableString cannot be given a name.") 725 726class PreformattedString(NavigableString): 727 """A NavigableString not subject to the normal formatting rules. 728 729 The string will be passed into the formatter (to trigger side effects), 730 but the return value will be ignored. 731 """ 732 733 def output_ready(self, formatter="minimal"): 734 """CData strings are passed into the formatter. 735 But the return value is ignored.""" 736 self.format_string(self, formatter) 737 return self.PREFIX + self + self.SUFFIX 738 739class CData(PreformattedString): 740 741 PREFIX = '<![CDATA[' 742 SUFFIX = ']]>' 743 744class ProcessingInstruction(PreformattedString): 745 746 PREFIX = '<?' 747 SUFFIX = '>' 748 749class Comment(PreformattedString): 750 751 PREFIX = '<!--' 752 SUFFIX = '-->' 753 754 755class Declaration(PreformattedString): 756 PREFIX = '<?' 757 SUFFIX = '?>' 758 759 760class Doctype(PreformattedString): 761 762 @classmethod 763 def for_name_and_ids(cls, name, pub_id, system_id): 764 value = name or '' 765 if pub_id is not None: 766 value += ' PUBLIC "%s"' % pub_id 767 if system_id is not None: 768 value += ' "%s"' % system_id 769 elif system_id is not None: 770 value += ' SYSTEM "%s"' % system_id 771 772 return Doctype(value) 773 774 PREFIX = '<!DOCTYPE ' 775 SUFFIX = '>\n' 776 777 778class Tag(PageElement): 779 780 """Represents a found HTML tag with its attributes and contents.""" 781 782 def __init__(self, parser=None, builder=None, name=None, namespace=None, 783 prefix=None, attrs=None, parent=None, previous=None): 784 "Basic constructor." 785 786 if parser is None: 787 self.parser_class = None 788 else: 789 # We don't actually store the parser object: that lets extracted 790 # chunks be garbage-collected. 791 self.parser_class = parser.__class__ 792 if name is None: 793 raise ValueError("No value provided for new tag's name.") 794 self.name = name 795 self.namespace = namespace 796 self.prefix = prefix 797 if attrs is None: 798 attrs = {} 799 elif attrs: 800 if builder is not None and builder.cdata_list_attributes: 801 attrs = builder._replace_cdata_list_attribute_values( 802 self.name, attrs) 803 else: 804 attrs = dict(attrs) 805 else: 806 attrs = dict(attrs) 807 self.attrs = attrs 808 self.contents = [] 809 self.setup(parent, previous) 810 self.hidden = False 811 812 # Set up any substitutions, such as the charset in a META tag. 813 if builder is not None: 814 builder.set_up_substitutions(self) 815 self.can_be_empty_element = builder.can_be_empty_element(name) 816 else: 817 self.can_be_empty_element = False 818 819 parserClass = _alias("parser_class") # BS3 820 821 def __copy__(self): 822 """A copy of a Tag is a new Tag, unconnected to the parse tree. 823 Its contents are a copy of the old Tag's contents. 824 """ 825 clone = type(self)(None, self.builder, self.name, self.namespace, 826 self.nsprefix, self.attrs) 827 for attr in ('can_be_empty_element', 'hidden'): 828 setattr(clone, attr, getattr(self, attr)) 829 for child in self.contents: 830 clone.append(child.__copy__()) 831 return clone 832 833 @property 834 def is_empty_element(self): 835 """Is this tag an empty-element tag? (aka a self-closing tag) 836 837 A tag that has contents is never an empty-element tag. 838 839 A tag that has no contents may or may not be an empty-element 840 tag. It depends on the builder used to create the tag. If the 841 builder has a designated list of empty-element tags, then only 842 a tag whose name shows up in that list is considered an 843 empty-element tag. 844 845 If the builder has no designated list of empty-element tags, 846 then any tag with no contents is an empty-element tag. 847 """ 848 return len(self.contents) == 0 and self.can_be_empty_element 849 isSelfClosing = is_empty_element # BS3 850 851 @property 852 def string(self): 853 """Convenience property to get the single string within this tag. 854 855 :Return: If this tag has a single string child, return value 856 is that string. If this tag has no children, or more than one 857 child, return value is None. If this tag has one child tag, 858 return value is the 'string' attribute of the child tag, 859 recursively. 860 """ 861 if len(self.contents) != 1: 862 return None 863 child = self.contents[0] 864 if isinstance(child, NavigableString): 865 return child 866 return child.string 867 868 @string.setter 869 def string(self, string): 870 self.clear() 871 self.append(string.__class__(string)) 872 873 def _all_strings(self, strip=False, types=(NavigableString, CData)): 874 """Yield all strings of certain classes, possibly stripping them. 875 876 By default, yields only NavigableString and CData objects. So 877 no comments, processing instructions, etc. 878 """ 879 for descendant in self.descendants: 880 if ( 881 (types is None and not isinstance(descendant, NavigableString)) 882 or 883 (types is not None and type(descendant) not in types)): 884 continue 885 if strip: 886 descendant = descendant.strip() 887 if len(descendant) == 0: 888 continue 889 yield descendant 890 891 strings = property(_all_strings) 892 893 @property 894 def stripped_strings(self): 895 for string in self._all_strings(True): 896 yield string 897 898 def get_text(self, separator="", strip=False, 899 types=(NavigableString, CData)): 900 """ 901 Get all child strings, concatenated using the given separator. 902 """ 903 return separator.join([s for s in self._all_strings( 904 strip, types=types)]) 905 getText = get_text 906 text = property(get_text) 907 908 def decompose(self): 909 """Recursively destroys the contents of this tree.""" 910 self.extract() 911 i = self 912 while i is not None: 913 next = i.next_element 914 i.__dict__.clear() 915 i.contents = [] 916 i = next 917 918 def clear(self, decompose=False): 919 """ 920 Extract all children. If decompose is True, decompose instead. 921 """ 922 if decompose: 923 for element in self.contents[:]: 924 if isinstance(element, Tag): 925 element.decompose() 926 else: 927 element.extract() 928 else: 929 for element in self.contents[:]: 930 element.extract() 931 932 def index(self, element): 933 """ 934 Find the index of a child by identity, not value. Avoids issues with 935 tag.contents.index(element) getting the index of equal elements. 936 """ 937 for i, child in enumerate(self.contents): 938 if child is element: 939 return i 940 raise ValueError("Tag.index: element not in tag") 941 942 def get(self, key, default=None): 943 """Returns the value of the 'key' attribute for the tag, or 944 the value given for 'default' if it doesn't have that 945 attribute.""" 946 return self.attrs.get(key, default) 947 948 def has_attr(self, key): 949 return key in self.attrs 950 951 def __hash__(self): 952 return str(self).__hash__() 953 954 def __getitem__(self, key): 955 """tag[key] returns the value of the 'key' attribute for the tag, 956 and throws an exception if it's not there.""" 957 return self.attrs[key] 958 959 def __iter__(self): 960 "Iterating over a tag iterates over its contents." 961 return iter(self.contents) 962 963 def __len__(self): 964 "The length of a tag is the length of its list of contents." 965 return len(self.contents) 966 967 def __contains__(self, x): 968 return x in self.contents 969 970 def __bool__(self): 971 "A tag is non-None even if it has no contents." 972 return True 973 974 def __setitem__(self, key, value): 975 """Setting tag[key] sets the value of the 'key' attribute for the 976 tag.""" 977 self.attrs[key] = value 978 979 def __delitem__(self, key): 980 "Deleting tag[key] deletes all 'key' attributes for the tag." 981 self.attrs.pop(key, None) 982 983 def __call__(self, *args, **kwargs): 984 """Calling a tag like a function is the same as calling its 985 find_all() method. Eg. tag('a') returns a list of all the A tags 986 found within this tag.""" 987 return self.find_all(*args, **kwargs) 988 989 def __getattr__(self, tag): 990 #print "Getattr %s.%s" % (self.__class__, tag) 991 if len(tag) > 3 and tag.endswith('Tag'): 992 # BS3: soup.aTag -> "soup.find("a") 993 tag_name = tag[:-3] 994 warnings.warn( 995 '.%sTag is deprecated, use .find("%s") instead.' % ( 996 tag_name, tag_name)) 997 return self.find(tag_name) 998 # We special case contents to avoid recursion. 999 elif not tag.startswith("__") and not tag=="contents": 1000 return self.find(tag) 1001 raise AttributeError( 1002 "'%s' object has no attribute '%s'" % (self.__class__, tag)) 1003 1004 def __eq__(self, other): 1005 """Returns true iff this tag has the same name, the same attributes, 1006 and the same contents (recursively) as the given tag.""" 1007 if self is other: 1008 return True 1009 if (not hasattr(other, 'name') or 1010 not hasattr(other, 'attrs') or 1011 not hasattr(other, 'contents') or 1012 self.name != other.name or 1013 self.attrs != other.attrs or 1014 len(self) != len(other)): 1015 return False 1016 for i, my_child in enumerate(self.contents): 1017 if my_child != other.contents[i]: 1018 return False 1019 return True 1020 1021 def __ne__(self, other): 1022 """Returns true iff this tag is not identical to the other tag, 1023 as defined in __eq__.""" 1024 return not self == other 1025 1026 def __repr__(self, encoding="unicode-escape"): 1027 """Renders this tag as a string.""" 1028 if PY3K: 1029 # "The return value must be a string object", i.e. Unicode 1030 return self.decode() 1031 else: 1032 # "The return value must be a string object", i.e. a bytestring. 1033 # By convention, the return value of __repr__ should also be 1034 # an ASCII string. 1035 return self.encode(encoding) 1036 1037 def __unicode__(self): 1038 return self.decode() 1039 1040 def __str__(self): 1041 if PY3K: 1042 return self.decode() 1043 else: 1044 return self.encode() 1045 1046 if PY3K: 1047 __str__ = __repr__ = __unicode__ 1048 1049 def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, 1050 indent_level=None, formatter="minimal", 1051 errors="xmlcharrefreplace"): 1052 # Turn the data structure into Unicode, then encode the 1053 # Unicode. 1054 u = self.decode(indent_level, encoding, formatter) 1055 return u.encode(encoding, errors) 1056 1057 def _should_pretty_print(self, indent_level): 1058 """Should this tag be pretty-printed?""" 1059 return ( 1060 indent_level is not None and 1061 (self.name not in HTMLAwareEntitySubstitution.preformatted_tags 1062 or self._is_xml)) 1063 1064 def decode(self, indent_level=None, 1065 eventual_encoding=DEFAULT_OUTPUT_ENCODING, 1066 formatter="minimal"): 1067 """Returns a Unicode representation of this tag and its contents. 1068 1069 :param eventual_encoding: The tag is destined to be 1070 encoded into this encoding. This method is _not_ 1071 responsible for performing that encoding. This information 1072 is passed in so that it can be substituted in if the 1073 document contains a <META> tag that mentions the document's 1074 encoding. 1075 """ 1076 1077 # First off, turn a string formatter into a function. This 1078 # will stop the lookup from happening over and over again. 1079 if not isinstance(formatter, collections.abc.Callable): 1080 formatter = self._formatter_for_name(formatter) 1081 1082 attrs = [] 1083 if self.attrs: 1084 for key, val in sorted(self.attrs.items()): 1085 if val is None: 1086 decoded = key 1087 else: 1088 if isinstance(val, list) or isinstance(val, tuple): 1089 val = ' '.join(val) 1090 elif not isinstance(val, str): 1091 val = str(val) 1092 elif ( 1093 isinstance(val, AttributeValueWithCharsetSubstitution) 1094 and eventual_encoding is not None): 1095 val = val.encode(eventual_encoding) 1096 1097 text = self.format_string(val, formatter) 1098 decoded = ( 1099 str(key) + '=' 1100 + EntitySubstitution.quoted_attribute_value(text)) 1101 attrs.append(decoded) 1102 close = '' 1103 closeTag = '' 1104 1105 prefix = '' 1106 if self.prefix: 1107 prefix = self.prefix + ":" 1108 1109 if self.is_empty_element: 1110 close = '/' 1111 else: 1112 closeTag = '</%s%s>' % (prefix, self.name) 1113 1114 pretty_print = self._should_pretty_print(indent_level) 1115 space = '' 1116 indent_space = '' 1117 if indent_level is not None: 1118 indent_space = (' ' * (indent_level - 1)) 1119 if pretty_print: 1120 space = indent_space 1121 indent_contents = indent_level + 1 1122 else: 1123 indent_contents = None 1124 contents = self.decode_contents( 1125 indent_contents, eventual_encoding, formatter) 1126 1127 if self.hidden: 1128 # This is the 'document root' object. 1129 s = contents 1130 else: 1131 s = [] 1132 attribute_string = '' 1133 if attrs: 1134 attribute_string = ' ' + ' '.join(attrs) 1135 if indent_level is not None: 1136 # Even if this particular tag is not pretty-printed, 1137 # we should indent up to the start of the tag. 1138 s.append(indent_space) 1139 s.append('<%s%s%s%s>' % ( 1140 prefix, self.name, attribute_string, close)) 1141 if pretty_print: 1142 s.append("\n") 1143 s.append(contents) 1144 if pretty_print and contents and contents[-1] != "\n": 1145 s.append("\n") 1146 if pretty_print and closeTag: 1147 s.append(space) 1148 s.append(closeTag) 1149 if indent_level is not None and closeTag and self.next_sibling: 1150 # Even if this particular tag is not pretty-printed, 1151 # we're now done with the tag, and we should add a 1152 # newline if appropriate. 1153 s.append("\n") 1154 s = ''.join(s) 1155 return s 1156 1157 def prettify(self, encoding=None, formatter="minimal"): 1158 if encoding is None: 1159 return self.decode(True, formatter=formatter) 1160 else: 1161 return self.encode(encoding, True, formatter=formatter) 1162 1163 def decode_contents(self, indent_level=None, 1164 eventual_encoding=DEFAULT_OUTPUT_ENCODING, 1165 formatter="minimal"): 1166 """Renders the contents of this tag as a Unicode string. 1167 1168 :param indent_level: Each line of the rendering will be 1169 indented this many spaces. 1170 1171 :param eventual_encoding: The tag is destined to be 1172 encoded into this encoding. This method is _not_ 1173 responsible for performing that encoding. This information 1174 is passed in so that it can be substituted in if the 1175 document contains a <META> tag that mentions the document's 1176 encoding. 1177 1178 :param formatter: The output formatter responsible for converting 1179 entities to Unicode characters. 1180 """ 1181 # First off, turn a string formatter into a function. This 1182 # will stop the lookup from happening over and over again. 1183 if not isinstance(formatter, collections.abc.Callable): 1184 formatter = self._formatter_for_name(formatter) 1185 1186 pretty_print = (indent_level is not None) 1187 s = [] 1188 for c in self: 1189 text = None 1190 if isinstance(c, NavigableString): 1191 text = c.output_ready(formatter) 1192 elif isinstance(c, Tag): 1193 s.append(c.decode(indent_level, eventual_encoding, 1194 formatter)) 1195 if text and indent_level and not self.name == 'pre': 1196 text = text.strip() 1197 if text: 1198 if pretty_print and not self.name == 'pre': 1199 s.append(" " * (indent_level - 1)) 1200 s.append(text) 1201 if pretty_print and not self.name == 'pre': 1202 s.append("\n") 1203 return ''.join(s) 1204 1205 def encode_contents( 1206 self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING, 1207 formatter="minimal"): 1208 """Renders the contents of this tag as a bytestring. 1209 1210 :param indent_level: Each line of the rendering will be 1211 indented this many spaces. 1212 1213 :param eventual_encoding: The bytestring will be in this encoding. 1214 1215 :param formatter: The output formatter responsible for converting 1216 entities to Unicode characters. 1217 """ 1218 1219 contents = self.decode_contents(indent_level, encoding, formatter) 1220 return contents.encode(encoding) 1221 1222 # Old method for BS3 compatibility 1223 def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING, 1224 prettyPrint=False, indentLevel=0): 1225 if not prettyPrint: 1226 indentLevel = None 1227 return self.encode_contents( 1228 indent_level=indentLevel, encoding=encoding) 1229 1230 #Soup methods 1231 1232 def find(self, name=None, attrs={}, recursive=True, text=None, 1233 **kwargs): 1234 """Return only the first child of this Tag matching the given 1235 criteria.""" 1236 r = None 1237 l = self.find_all(name, attrs, recursive, text, 1, **kwargs) 1238 if l: 1239 r = l[0] 1240 return r 1241 findChild = find 1242 1243 def find_all(self, name=None, attrs={}, recursive=True, text=None, 1244 limit=None, **kwargs): 1245 """Extracts a list of Tag objects that match the given 1246 criteria. You can specify the name of the Tag and any 1247 attributes you want the Tag to have. 1248 1249 The value of a key-value pair in the 'attrs' map can be a 1250 string, a list of strings, a regular expression object, or a 1251 callable that takes a string and returns whether or not the 1252 string matches for some custom definition of 'matches'. The 1253 same is true of the tag name.""" 1254 1255 generator = self.descendants 1256 if not recursive: 1257 generator = self.children 1258 return self._find_all(name, attrs, text, limit, generator, **kwargs) 1259 findAll = find_all # BS3 1260 findChildren = find_all # BS2 1261 1262 #Generator methods 1263 @property 1264 def children(self): 1265 # return iter() to make the purpose of the method clear 1266 return iter(self.contents) # XXX This seems to be untested. 1267 1268 @property 1269 def descendants(self): 1270 if not len(self.contents): 1271 return 1272 stopNode = self._last_descendant().next_element 1273 current = self.contents[0] 1274 while current is not stopNode: 1275 yield current 1276 current = current.next_element 1277 1278 # CSS selector code 1279 1280 _selector_combinators = ['>', '+', '~'] 1281 _select_debug = False 1282 def select_one(self, selector): 1283 """Perform a CSS selection operation on the current element.""" 1284 value = self.select(selector, limit=1) 1285 if value: 1286 return value[0] 1287 return None 1288 1289 def select(self, selector, _candidate_generator=None, limit=None): 1290 """Perform a CSS selection operation on the current element.""" 1291 1292 # Handle grouping selectors if ',' exists, ie: p,a 1293 if ',' in selector: 1294 context = [] 1295 for partial_selector in selector.split(','): 1296 partial_selector = partial_selector.strip() 1297 if partial_selector == '': 1298 raise ValueError('Invalid group selection syntax: %s' % selector) 1299 candidates = self.select(partial_selector, limit=limit) 1300 for candidate in candidates: 1301 if candidate not in context: 1302 context.append(candidate) 1303 1304 if limit and len(context) >= limit: 1305 break 1306 return context 1307 1308 tokens = selector.split() 1309 current_context = [self] 1310 1311 if tokens[-1] in self._selector_combinators: 1312 raise ValueError( 1313 'Final combinator "%s" is missing an argument.' % tokens[-1]) 1314 1315 if self._select_debug: 1316 print('Running CSS selector "%s"' % selector) 1317 1318 for index, token in enumerate(tokens): 1319 new_context = [] 1320 new_context_ids = set([]) 1321 1322 if tokens[index-1] in self._selector_combinators: 1323 # This token was consumed by the previous combinator. Skip it. 1324 if self._select_debug: 1325 print(' Token was consumed by the previous combinator.') 1326 continue 1327 1328 if self._select_debug: 1329 print(' Considering token "%s"' % token) 1330 recursive_candidate_generator = None 1331 tag_name = None 1332 1333 # Each operation corresponds to a checker function, a rule 1334 # for determining whether a candidate matches the 1335 # selector. Candidates are generated by the active 1336 # iterator. 1337 checker = None 1338 1339 m = self.attribselect_re.match(token) 1340 if m is not None: 1341 # Attribute selector 1342 tag_name, attribute, operator, value = m.groups() 1343 checker = self._attribute_checker(operator, attribute, value) 1344 1345 elif '#' in token: 1346 # ID selector 1347 tag_name, tag_id = token.split('#', 1) 1348 def id_matches(tag): 1349 return tag.get('id', None) == tag_id 1350 checker = id_matches 1351 1352 elif '.' in token: 1353 # Class selector 1354 tag_name, klass = token.split('.', 1) 1355 classes = set(klass.split('.')) 1356 def classes_match(candidate): 1357 return classes.issubset(candidate.get('class', [])) 1358 checker = classes_match 1359 1360 elif ':' in token: 1361 # Pseudo-class 1362 tag_name, pseudo = token.split(':', 1) 1363 if tag_name == '': 1364 raise ValueError( 1365 "A pseudo-class must be prefixed with a tag name.") 1366 pseudo_attributes = re.match(r'([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo) 1367 found = [] 1368 if pseudo_attributes is None: 1369 pseudo_type = pseudo 1370 pseudo_value = None 1371 else: 1372 pseudo_type, pseudo_value = pseudo_attributes.groups() 1373 if pseudo_type == 'nth-of-type': 1374 try: 1375 pseudo_value = int(pseudo_value) 1376 except: 1377 raise NotImplementedError( 1378 'Only numeric values are currently supported for the nth-of-type pseudo-class.') 1379 if pseudo_value < 1: 1380 raise ValueError( 1381 'nth-of-type pseudo-class value must be at least 1.') 1382 class Counter(object): 1383 def __init__(self, destination): 1384 self.count = 0 1385 self.destination = destination 1386 1387 def nth_child_of_type(self, tag): 1388 self.count += 1 1389 if self.count == self.destination: 1390 return True 1391 if self.count > self.destination: 1392 # Stop the generator that's sending us 1393 # these things. 1394 raise StopIteration() 1395 return False 1396 checker = Counter(pseudo_value).nth_child_of_type 1397 else: 1398 raise NotImplementedError( 1399 'Only the following pseudo-classes are implemented: nth-of-type.') 1400 1401 elif token == '*': 1402 # Star selector -- matches everything 1403 pass 1404 elif token == '>': 1405 # Run the next token as a CSS selector against the 1406 # direct children of each tag in the current context. 1407 recursive_candidate_generator = lambda tag: tag.children 1408 elif token == '~': 1409 # Run the next token as a CSS selector against the 1410 # siblings of each tag in the current context. 1411 recursive_candidate_generator = lambda tag: tag.next_siblings 1412 elif token == '+': 1413 # For each tag in the current context, run the next 1414 # token as a CSS selector against the tag's next 1415 # sibling that's a tag. 1416 def next_tag_sibling(tag): 1417 yield tag.find_next_sibling(True) 1418 recursive_candidate_generator = next_tag_sibling 1419 1420 elif self.tag_name_re.match(token): 1421 # Just a tag name. 1422 tag_name = token 1423 else: 1424 raise ValueError( 1425 'Unsupported or invalid CSS selector: "%s"' % token) 1426 if recursive_candidate_generator: 1427 # This happens when the selector looks like "> foo". 1428 # 1429 # The generator calls select() recursively on every 1430 # member of the current context, passing in a different 1431 # candidate generator and a different selector. 1432 # 1433 # In the case of "> foo", the candidate generator is 1434 # one that yields a tag's direct children (">"), and 1435 # the selector is "foo". 1436 next_token = tokens[index+1] 1437 def recursive_select(tag): 1438 if self._select_debug: 1439 print(' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)) 1440 print('-' * 40) 1441 for i in tag.select(next_token, recursive_candidate_generator): 1442 if self._select_debug: 1443 print('(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)) 1444 yield i 1445 if self._select_debug: 1446 print('-' * 40) 1447 _use_candidate_generator = recursive_select 1448 elif _candidate_generator is None: 1449 # By default, a tag's candidates are all of its 1450 # children. If tag_name is defined, only yield tags 1451 # with that name. 1452 if self._select_debug: 1453 if tag_name: 1454 check = "[any]" 1455 else: 1456 check = tag_name 1457 print(' Default candidate generator, tag name="%s"' % check) 1458 if self._select_debug: 1459 # This is redundant with later code, but it stops 1460 # a bunch of bogus tags from cluttering up the 1461 # debug log. 1462 def default_candidate_generator(tag): 1463 for child in tag.descendants: 1464 if not isinstance(child, Tag): 1465 continue 1466 if tag_name and not child.name == tag_name: 1467 continue 1468 yield child 1469 _use_candidate_generator = default_candidate_generator 1470 else: 1471 _use_candidate_generator = lambda tag: tag.descendants 1472 else: 1473 _use_candidate_generator = _candidate_generator 1474 1475 count = 0 1476 for tag in current_context: 1477 if self._select_debug: 1478 print(" Running candidate generator on %s %s" % ( 1479 tag.name, repr(tag.attrs))) 1480 for candidate in _use_candidate_generator(tag): 1481 if not isinstance(candidate, Tag): 1482 continue 1483 if tag_name and candidate.name != tag_name: 1484 continue 1485 if checker is not None: 1486 try: 1487 result = checker(candidate) 1488 except StopIteration: 1489 # The checker has decided we should no longer 1490 # run the generator. 1491 break 1492 if checker is None or result: 1493 if self._select_debug: 1494 print(" SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))) 1495 if id(candidate) not in new_context_ids: 1496 # If a tag matches a selector more than once, 1497 # don't include it in the context more than once. 1498 new_context.append(candidate) 1499 new_context_ids.add(id(candidate)) 1500 if limit and len(new_context) >= limit: 1501 break 1502 elif self._select_debug: 1503 print(" FAILURE %s %s" % (candidate.name, repr(candidate.attrs))) 1504 1505 1506 current_context = new_context 1507 1508 if self._select_debug: 1509 print("Final verdict:") 1510 for i in current_context: 1511 print(" %s %s" % (i.name, i.attrs)) 1512 return current_context 1513 1514 # Old names for backwards compatibility 1515 def childGenerator(self): 1516 return self.children 1517 1518 def recursiveChildGenerator(self): 1519 return self.descendants 1520 1521 def has_key(self, key): 1522 """This was kind of misleading because has_key() (attributes) 1523 was different from __in__ (contents). has_key() is gone in 1524 Python 3, anyway.""" 1525 warnings.warn('has_key is deprecated. Use has_attr("%s") instead.' % ( 1526 key)) 1527 return self.has_attr(key) 1528 1529# Next, a couple classes to represent queries and their results. 1530class SoupStrainer(object): 1531 """Encapsulates a number of ways of matching a markup element (tag or 1532 text).""" 1533 1534 def __init__(self, name=None, attrs={}, text=None, **kwargs): 1535 self.name = self._normalize_search_value(name) 1536 if not isinstance(attrs, dict): 1537 # Treat a non-dict value for attrs as a search for the 'class' 1538 # attribute. 1539 kwargs['class'] = attrs 1540 attrs = None 1541 1542 if 'class_' in kwargs: 1543 # Treat class_="foo" as a search for the 'class' 1544 # attribute, overriding any non-dict value for attrs. 1545 kwargs['class'] = kwargs['class_'] 1546 del kwargs['class_'] 1547 1548 if kwargs: 1549 if attrs: 1550 attrs = attrs.copy() 1551 attrs.update(kwargs) 1552 else: 1553 attrs = kwargs 1554 normalized_attrs = {} 1555 for key, value in list(attrs.items()): 1556 normalized_attrs[key] = self._normalize_search_value(value) 1557 1558 self.attrs = normalized_attrs 1559 self.text = self._normalize_search_value(text) 1560 1561 def _normalize_search_value(self, value): 1562 # Leave it alone if it's a Unicode string, a callable, a 1563 # regular expression, a boolean, or None. 1564 if (isinstance(value, str) or isinstance(value, collections.abc.Callable) or hasattr(value, 'match') 1565 or isinstance(value, bool) or value is None): 1566 return value 1567 1568 # If it's a bytestring, convert it to Unicode, treating it as UTF-8. 1569 if isinstance(value, bytes): 1570 return value.decode("utf8") 1571 1572 # If it's listlike, convert it into a list of strings. 1573 if hasattr(value, '__iter__'): 1574 new_value = [] 1575 for v in value: 1576 if (hasattr(v, '__iter__') and not isinstance(v, bytes) 1577 and not isinstance(v, str)): 1578 # This is almost certainly the user's mistake. In the 1579 # interests of avoiding infinite loops, we'll let 1580 # it through as-is rather than doing a recursive call. 1581 new_value.append(v) 1582 else: 1583 new_value.append(self._normalize_search_value(v)) 1584 return new_value 1585 1586 # Otherwise, convert it into a Unicode string. 1587 # The unicode(str()) thing is so this will do the same thing on Python 2 1588 # and Python 3. 1589 return str(str(value)) 1590 1591 def __str__(self): 1592 if self.text: 1593 return self.text 1594 else: 1595 return "%s|%s" % (self.name, self.attrs) 1596 1597 def search_tag(self, markup_name=None, markup_attrs={}): 1598 found = None 1599 markup = None 1600 if isinstance(markup_name, Tag): 1601 markup = markup_name 1602 markup_attrs = markup 1603 call_function_with_tag_data = ( 1604 isinstance(self.name, collections.abc.Callable) 1605 and not isinstance(markup_name, Tag)) 1606 1607 if ((not self.name) 1608 or call_function_with_tag_data 1609 or (markup and self._matches(markup, self.name)) 1610 or (not markup and self._matches(markup_name, self.name))): 1611 if call_function_with_tag_data: 1612 match = self.name(markup_name, markup_attrs) 1613 else: 1614 match = True 1615 markup_attr_map = None 1616 for attr, match_against in list(self.attrs.items()): 1617 if not markup_attr_map: 1618 if hasattr(markup_attrs, 'get'): 1619 markup_attr_map = markup_attrs 1620 else: 1621 markup_attr_map = {} 1622 for k, v in markup_attrs: 1623 markup_attr_map[k] = v 1624 attr_value = markup_attr_map.get(attr) 1625 if not self._matches(attr_value, match_against): 1626 match = False 1627 break 1628 if match: 1629 if markup: 1630 found = markup 1631 else: 1632 found = markup_name 1633 if found and self.text and not self._matches(found.string, self.text): 1634 found = None 1635 return found 1636 searchTag = search_tag 1637 1638 def search(self, markup): 1639 # print 'looking for %s in %s' % (self, markup) 1640 found = None 1641 # If given a list of items, scan it for a text element that 1642 # matches. 1643 if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)): 1644 for element in markup: 1645 if isinstance(element, NavigableString) \ 1646 and self.search(element): 1647 found = element 1648 break 1649 # If it's a Tag, make sure its name or attributes match. 1650 # Don't bother with Tags if we're searching for text. 1651 elif isinstance(markup, Tag): 1652 if not self.text or self.name or self.attrs: 1653 found = self.search_tag(markup) 1654 # If it's text, make sure the text matches. 1655 elif isinstance(markup, NavigableString) or \ 1656 isinstance(markup, str): 1657 if not self.name and not self.attrs and self._matches(markup, self.text): 1658 found = markup 1659 else: 1660 raise Exception( 1661 "I don't know how to match against a %s" % markup.__class__) 1662 return found 1663 1664 def _matches(self, markup, match_against): 1665 # print u"Matching %s against %s" % (markup, match_against) 1666 result = False 1667 if isinstance(markup, list) or isinstance(markup, tuple): 1668 # This should only happen when searching a multi-valued attribute 1669 # like 'class'. 1670 if (isinstance(match_against, str) 1671 and ' ' in match_against): 1672 # A bit of a special case. If they try to match "foo 1673 # bar" on a multivalue attribute's value, only accept 1674 # the literal value "foo bar" 1675 # 1676 # XXX This is going to be pretty slow because we keep 1677 # splitting match_against. But it shouldn't come up 1678 # too often. 1679 return (whitespace_re.split(match_against) == markup) 1680 else: 1681 for item in markup: 1682 if self._matches(item, match_against): 1683 return True 1684 return False 1685 1686 if match_against is True: 1687 # True matches any non-None value. 1688 return markup is not None 1689 1690 if isinstance(match_against, collections.abc.Callable): 1691 return match_against(markup) 1692 1693 # Custom callables take the tag as an argument, but all 1694 # other ways of matching match the tag name as a string. 1695 if isinstance(markup, Tag): 1696 markup = markup.name 1697 1698 # Ensure that `markup` is either a Unicode string, or None. 1699 markup = self._normalize_search_value(markup) 1700 1701 if markup is None: 1702 # None matches None, False, an empty string, an empty list, and so on. 1703 return not match_against 1704 1705 if isinstance(match_against, str): 1706 # Exact string match 1707 return markup == match_against 1708 1709 if hasattr(match_against, 'match'): 1710 # Regexp match 1711 return match_against.search(markup) 1712 1713 if hasattr(match_against, '__iter__'): 1714 # The markup must be an exact match against something 1715 # in the iterable. 1716 return markup in match_against 1717 1718 1719class ResultSet(list): 1720 """A ResultSet is just a list that keeps track of the SoupStrainer 1721 that created it.""" 1722 def __init__(self, source, result=()): 1723 super(ResultSet, self).__init__(result) 1724 self.source = source 1725