1"""Beautiful Soup Elixir and Tonic - "The Screen-Scraper's Friend". 2 3http://www.crummy.com/software/BeautifulSoup/ 4 5Beautiful Soup uses a pluggable XML or HTML parser to parse a 6(possibly invalid) document into a tree representation. Beautiful Soup 7provides methods and Pythonic idioms that make it easy to navigate, 8search, and modify the parse tree. 9 10Beautiful Soup works with Python 3.6 and up. It works better if lxml 11and/or html5lib is installed. 12 13For more than you ever wanted to know about Beautiful Soup, see the 14documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/ 15""" 16 17__author__ = "Leonard Richardson (leonardr@segfault.org)" 18__version__ = "4.12.3" 19__copyright__ = "Copyright (c) 2004-2024 Leonard Richardson" 20# Use of this source code is governed by the MIT license. 21__license__ = "MIT" 22 23__all__ = ['BeautifulSoup'] 24 25from collections import Counter 26import os 27import re 28import sys 29import traceback 30import warnings 31 32# The very first thing we do is give a useful error if someone is 33# running this code under Python 2. 34if sys.version_info.major < 3: 35 raise ImportError('You are trying to use a Python 3-specific version of Beautiful Soup under Python 2. This will not work. The final version of Beautiful Soup to support Python 2 was 4.9.3.') 36 37from .builder import ( 38 builder_registry, 39 ParserRejectedMarkup, 40 XMLParsedAsHTMLWarning, 41 HTMLParserTreeBuilder 42) 43from .dammit import UnicodeDammit 44from .element import ( 45 CData, 46 Comment, 47 CSS, 48 DEFAULT_OUTPUT_ENCODING, 49 Declaration, 50 Doctype, 51 NavigableString, 52 PageElement, 53 ProcessingInstruction, 54 PYTHON_SPECIFIC_ENCODINGS, 55 ResultSet, 56 Script, 57 Stylesheet, 58 SoupStrainer, 59 Tag, 60 TemplateString, 61 ) 62 63# Define some custom warnings. 
class GuessedAtParserWarning(UserWarning):
    """The warning issued when BeautifulSoup has to guess what parser to
    use -- probably because no parser was specified in the constructor.
    """


class MarkupResemblesLocatorWarning(UserWarning):
    """The warning issued when BeautifulSoup is given 'markup' that
    actually looks like a resource locator -- a URL or a path to a file
    on disk.
    """


class BeautifulSoup(Tag):
    """A data structure representing a parsed HTML or XML document.

    Most of the methods you'll call on a BeautifulSoup object are inherited from
    PageElement or Tag.

    Internally, this class defines the basic interface called by the
    tree builders when converting an HTML/XML document into a data
    structure. The interface abstracts away the differences between
    parsers. To write a new tree builder, you'll need to understand
    these methods as a whole.

    These methods will be called by the BeautifulSoup constructor:
      * reset()
      * feed(markup)

    The tree builder may call these methods from its feed() implementation:
      * handle_starttag(name, attrs) # See note about return value
      * handle_endtag(name)
      * handle_data(data) # Appends to the current data node
      * endData(containerClass) # Ends the current data node

    No matter how complicated the underlying parser is, you should be
    able to build a tree using 'start tag' events, 'end tag' events,
    'data' events, and "done with data" events.

    If you encounter an empty-element tag (aka a self-closing tag,
    like HTML's <br> tag), call handle_starttag and then
    handle_endtag.
    """

    # Since BeautifulSoup subclasses Tag, it's possible to treat it as
    # a Tag with a .name. This name makes it clear the BeautifulSoup
    # object isn't a real markup tag.
    ROOT_TAG_NAME = '[document]'

    # If the end-user gives no indication which tree builder they
    # want, look for one with these features.
    DEFAULT_BUILDER_FEATURES = ['html', 'fast']

    # A string containing all ASCII whitespace characters, used in
    # endData() to detect data chunks that seem 'empty'.
    ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"

    # Beautiful Soup 3 constructor arguments that are still accepted
    # for backwards compatibility. Each one is removed from **kwargs
    # and the paired warning is issued; the argument has no other
    # effect. The constructor checks them in this order.
    _IGNORED_BS3_ARGUMENTS = (
        ('convertEntities',
         "BS4 does not respect the convertEntities argument to the "
         "BeautifulSoup constructor. Entities are always converted "
         "to Unicode characters."),
        ('markupMassage',
         "BS4 does not respect the markupMassage argument to the "
         "BeautifulSoup constructor. The tree builder is responsible "
         "for any necessary markup massage."),
        ('smartQuotesTo',
         "BS4 does not respect the smartQuotesTo argument to the "
         "BeautifulSoup constructor. Smart quotes are always converted "
         "to Unicode characters."),
        ('selfClosingTags',
         "BS4 does not respect the selfClosingTags argument to the "
         "BeautifulSoup constructor. The tree builder is responsible "
         "for understanding self-closing tags."),
        ('isHTML',
         "BS4 does not respect the isHTML argument to the "
         "BeautifulSoup constructor. Suggest you use "
         "features='lxml' for HTML and features='lxml-xml' for "
         "XML."),
    )

    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, exclude_encodings=None,
                 element_classes=None, **kwargs):
        """Constructor.

        :param markup: A string or a file-like object representing
         markup to be parsed.

        :param features: Desirable features of the parser to be
         used. This may be the name of a specific parser ("lxml",
         "lxml-xml", "html.parser", or "html5lib") or it may be the
         type of markup to be used ("html", "html5", "xml"). It's
         recommended that you name a specific parser, so that
         Beautiful Soup gives you the same results across platforms
         and virtual environments.

        :param builder: A TreeBuilder subclass to instantiate (or
         instance to use) instead of looking one up based on
         `features`. You only need to use this if you've implemented a
         custom TreeBuilder.

        :param parse_only: A SoupStrainer. Only parts of the document
         matching the SoupStrainer will be considered. This is useful
         when parsing part of a document that would otherwise be too
         large to fit into memory.

        :param from_encoding: A string indicating the encoding of the
         document to be parsed. Pass this in if Beautiful Soup is
         guessing wrongly about the document's encoding.

        :param exclude_encodings: A list of strings indicating
         encodings known to be wrong. Pass this in if you don't know
         the document's encoding but you know Beautiful Soup's guess is
         wrong.

        :param element_classes: A dictionary mapping BeautifulSoup
         classes like Tag and NavigableString, to other classes you'd
         like to be instantiated instead as the parse tree is
         built. This is useful for subclassing Tag or NavigableString
         to modify default behavior.

        :param kwargs: For backwards compatibility purposes, the
         constructor accepts certain keyword arguments used in
         Beautiful Soup 3. None of these arguments do anything in
         Beautiful Soup 4; they will result in a warning and then be
         ignored.

         Apart from this, any keyword arguments passed into the
         BeautifulSoup constructor are propagated to the TreeBuilder
         constructor. This makes it possible to configure a
         TreeBuilder by passing in arguments, not just by saying which
         one to use.

        :raise FeatureNotFound: If no tree builder with the requested
         features is registered.
        :raise ParserRejectedMarkup: If the builder rejects every
         candidate it produced from prepare_markup().
        """
        # Drop (with a warning) the Beautiful Soup 3 arguments that no
        # longer do anything, so they are not forwarded to the
        # TreeBuilder constructor.
        for old_name, message in self._IGNORED_BS3_ARGUMENTS:
            if old_name in kwargs:
                del kwargs[old_name]
                warnings.warn(message)

        def deprecated_argument(old_name, new_name):
            """Pop a renamed BS3 argument out of kwargs, warning about it.

            :return: The value passed under `old_name`, or None if the
             old name was not used.
            """
            if old_name in kwargs:
                # stacklevel=3 skips this helper and __init__, so the
                # warning points at the code calling the constructor.
                warnings.warn(
                    'The "%s" argument to the BeautifulSoup constructor '
                    'has been renamed to "%s."' % (old_name, new_name),
                    DeprecationWarning, stacklevel=3
                )
                return kwargs.pop(old_name)
            return None

        parse_only = parse_only or deprecated_argument(
            "parseOnlyThese", "parse_only")

        from_encoding = from_encoding or deprecated_argument(
            "fromEncoding", "from_encoding")

        if from_encoding and isinstance(markup, str):
            warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
            from_encoding = None

        self.element_classes = element_classes or dict()

        # We need this information to track whether or not the builder
        # was specified well enough that we can omit the 'you need to
        # specify a parser' warning.
        original_builder = builder
        original_features = features

        if isinstance(builder, type):
            # A builder class was passed in; it needs to be instantiated.
            builder_class = builder
            builder = None
        elif builder is None:
            if isinstance(features, str):
                features = [features]
            if features is None or len(features) == 0:
                features = self.DEFAULT_BUILDER_FEATURES
            builder_class = builder_registry.lookup(*features)
            if builder_class is None:
                raise FeatureNotFound(
                    "Couldn't find a tree builder with the features you "
                    "requested: %s. Do you need to install a parser library?"
                    % ",".join(features))

        # At this point either we have a TreeBuilder instance in
        # builder, or we have a builder_class that we can instantiate
        # with the remaining **kwargs.
        if builder is None:
            builder = builder_class(**kwargs)
            if not original_builder and not (
                    original_features == builder.NAME or
                    original_features in builder.ALTERNATE_NAMES
            ) and markup:
                # The user did not tell us which TreeBuilder to use,
                # and we had to guess. Issue a warning.
                if builder.is_xml:
                    markup_type = "XML"
                else:
                    markup_type = "HTML"

                # This code adapted from warnings.py so that we get the same line
                # of code as our warnings.warn() call gets, even if the answer is wrong
                # (as it may be in a multithreading situation).
                caller = None
                try:
                    caller = sys._getframe(1)
                except ValueError:
                    pass
                if caller:
                    caller_globals = caller.f_globals
                    line_number = caller.f_lineno
                else:
                    caller_globals = sys.__dict__
                    line_number = 1
                filename = caller_globals.get('__file__')
                if filename:
                    fnl = filename.lower()
                    if fnl.endswith((".pyc", ".pyo")):
                        # Report the source file rather than the
                        # compiled one by dropping the final character.
                        filename = filename[:-1]
                if filename:
                    # If there is no filename at all, the user is most likely in a REPL,
                    # and the warning is not necessary.
                    values = dict(
                        filename=filename,
                        line_number=line_number,
                        parser=builder.NAME,
                        markup_type=markup_type
                    )
                    warnings.warn(
                        self.NO_PARSER_SPECIFIED_WARNING % values,
                        GuessedAtParserWarning, stacklevel=2
                    )
        else:
            if kwargs:
                warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.")

        self.builder = builder
        self.is_xml = builder.is_xml
        self.known_xml = self.is_xml
        self._namespaces = dict()
        self.parse_only = parse_only

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        elif len(markup) <= 256 and (
                (isinstance(markup, bytes) and b'<' not in markup)
                or (isinstance(markup, str) and '<' not in markup)
        ):
            # Issue warnings for a couple beginner problems
            # involving passing non-markup to Beautiful Soup.
            # Beautiful Soup will still parse the input as markup,
            # since that is sometimes the intended behavior.
            if not self._markup_is_url(markup):
                self._markup_resembles_filename(markup)

        # Try each candidate the builder offers until one of them
        # parses without the parser rejecting it.
        rejections = []
        success = False
        for (self.markup, self.original_encoding, self.declared_html_encoding,
             self.contains_replacement_characters) in (
                 self.builder.prepare_markup(
                     markup, from_encoding, exclude_encodings=exclude_encodings)):
            self.reset()
            self.builder.initialize_soup(self)
            try:
                self._feed()
                success = True
                break
            except ParserRejectedMarkup as e:
                rejections.append(e)

        if not success:
            other_exceptions = [str(e) for e in rejections]
            raise ParserRejectedMarkup(
                "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions)
            )

        # Clear out the markup and remove the builder's circular
        # reference to this object.
        self.markup = None
        self.builder.soup = None

    def _clone(self):
        """Create a new BeautifulSoup object with the same TreeBuilder,
        but not associated with any markup.

        This is the first step of the deepcopy process.
        """
        clone = type(self)("", None, self.builder)

        # Keep track of the encoding of the original document,
        # since we won't be parsing it again.
        clone.original_encoding = self.original_encoding
        return clone

    def __getstate__(self):
        """Return the state to pickle.

        The tree itself is not stored: 'contents' is emptied and the
        document is saved as a Unicode string under 'markup', to be
        re-parsed by __setstate__.
        """
        # Frequently a tree builder can't be pickled.
        d = dict(self.__dict__)
        if 'builder' in d and d['builder'] is not None and not self.builder.picklable:
            d['builder'] = type(self.builder)
        # Store the contents as a Unicode string.
        d['contents'] = []
        d['markup'] = self.decode()

        # If _most_recent_element is present, it's a Tag object left
        # over from initial parse. It might not be picklable and we
        # don't need it.
        if '_most_recent_element' in d:
            del d['_most_recent_element']
        return d

    def __setstate__(self, state):
        """Restore pickled state, then rebuild the tree by re-parsing
        the stored markup.
        """
        # If necessary, restore the TreeBuilder by looking it up.
        self.__dict__ = state
        if isinstance(self.builder, type):
            self.builder = self.builder()
        elif not self.builder:
            # We don't know which builder was used to build this
            # parse tree, so use a default we know is always available.
            self.builder = HTMLParserTreeBuilder()
        self.builder.soup = self
        self.reset()
        self._feed()
        return state

    @classmethod
    def _decode_markup(cls, markup):
        """Ensure `markup` is a Unicode string so it's safe to send into
        warnings.warn.

        Bytestrings are decoded as UTF-8, with undecodable bytes
        replaced; any other value is returned unchanged.

        TODO: warnings.warn had this problem back in 2010 but it might not
        anymore.
        """
        if isinstance(markup, bytes):
            decoded = markup.decode('utf-8', 'replace')
        else:
            decoded = markup
        return decoded

    @classmethod
    def _markup_is_url(cls, markup):
        """Error-handling method to raise a warning if incoming markup looks
        like a URL.

        :param markup: A bytestring or string.
        :return: Whether or not the markup resembles a URL
            closely enough to justify a warning.
        """
        if isinstance(markup, bytes):
            space = b' '
            cant_start_with = (b"http:", b"https:")
        elif isinstance(markup, str):
            space = ' '
            cant_start_with = ("http:", "https:")
        else:
            return False

        # startswith() accepts a tuple of candidate prefixes.
        if markup.startswith(cant_start_with) and space not in markup:
            warnings.warn(
                'The input looks more like a URL than markup. You may want to use'
                ' an HTTP client like requests to get the document behind'
                ' the URL, and feed that document to Beautiful Soup.',
                MarkupResemblesLocatorWarning,
                stacklevel=3
            )
            return True
        return False

    @classmethod
    def _markup_resembles_filename(cls, markup):
        """Error-handling method to raise a warning if incoming markup
        resembles a filename.

        :param markup: A bytestring or string.
        :return: Whether or not the markup resembles a filename
            closely enough to justify a warning.
        """
        path_characters = '/\\'
        extensions = ['.html', '.htm', '.xml', '.xhtml', '.txt']
        if isinstance(markup, bytes):
            # Compare bytes against bytes.
            path_characters = path_characters.encode("utf8")
            extensions = [x.encode('utf8') for x in extensions]
        filelike = False
        if any(x in markup for x in path_characters):
            filelike = True
        else:
            lower = markup.lower()
            if any(lower.endswith(ext) for ext in extensions):
                filelike = True
        if filelike:
            warnings.warn(
                'The input looks more like a filename than markup. You may'
                ' want to open this file and pass the filehandle into'
                ' Beautiful Soup.',
                MarkupResemblesLocatorWarning, stacklevel=3
            )
            return True
        return False

    def _feed(self):
        """Internal method that parses previously set markup, creating a large
        number of Tag and NavigableString objects.
        """
        # Start the builder from a clean state before feeding it.
        self.builder.reset()

        self.builder.feed(self.markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def reset(self):
        """Reset this object to a state as though it had never parsed any
        markup.
        """
        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
        self.hidden = 1
        self.builder.reset()
        self.current_data = []
        self.currentTag = None
        self.tagStack = []
        self.open_tag_counter = Counter()
        self.preserve_whitespace_tag_stack = []
        self.string_container_stack = []
        self._most_recent_element = None
        # The BeautifulSoup object itself is the bottom of the tag stack.
        self.pushTag(self)

    def new_tag(self, name, namespace=None, nsprefix=None, attrs=None,
                sourceline=None, sourcepos=None, **kwattrs):
        """Create a new Tag associated with this BeautifulSoup object.

        :param name: The name of the new Tag.
        :param namespace: The URI of the new Tag's XML namespace, if any.
        :param nsprefix: The prefix for the new Tag's XML namespace, if any.
        :param attrs: A dictionary of this Tag's attribute values; can
            be used instead of `kwattrs` for attributes like 'class'
            that are reserved words in Python. Values given here take
            precedence over same-named entries in `kwattrs`.
        :param sourceline: The line number where this tag was
            (purportedly) found in its source document.
        :param sourcepos: The character position within `sourceline` where this
            tag was (purportedly) found.
        :param kwattrs: Keyword arguments for the new Tag's attribute values.

        """
        # `attrs` defaults to None rather than a shared mutable dict.
        if attrs is not None:
            kwattrs.update(attrs)
        return self.element_classes.get(Tag, Tag)(
            None, self.builder, name, namespace, nsprefix, kwattrs,
            sourceline=sourceline, sourcepos=sourcepos
        )

    def string_container(self, base_class=None):
        """Choose the class to instantiate for a new string.

        :param base_class: The class the caller would like to use;
            NavigableString if omitted.
        :return: `base_class` after applying any override from
            `element_classes` and, for plain NavigableString, any
            container registered by the builder for the innermost
            open tag on the string container stack.
        """
        container = base_class or NavigableString

        # There may be a general override of NavigableString.
        container = self.element_classes.get(
            container, container
        )

        # On top of that, we may be inside a tag that needs a special
        # container class.
        if self.string_container_stack and container is NavigableString:
            container = self.builder.string_containers.get(
                self.string_container_stack[-1].name, container
            )
        return container

    def new_string(self, s, subclass=None):
        """Create a new NavigableString associated with this BeautifulSoup
        object.

        :param s: The value of the new string.
        :param subclass: The preferred string class, resolved through
            string_container().
        """
        container = self.string_container(subclass)
        return container(s)

    def insert_before(self, *args):
        """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
        it because there is nothing before or after it in the parse tree.
        """
        raise NotImplementedError("BeautifulSoup objects don't support insert_before().")

    def insert_after(self, *args):
        """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement
        it because there is nothing before or after it in the parse tree.
        """
        raise NotImplementedError("BeautifulSoup objects don't support insert_after().")

    def popTag(self):
        """Internal method called by _popToTag when a tag is closed.

        :return: The tag that is current after the pop.
        """
        tag = self.tagStack.pop()
        if tag.name in self.open_tag_counter:
            self.open_tag_counter[tag.name] -= 1
        if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
            self.preserve_whitespace_tag_stack.pop()
        if self.string_container_stack and tag == self.string_container_stack[-1]:
            self.string_container_stack.pop()
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Internal method called by handle_starttag when a tag is opened."""
        if self.currentTag is not None:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]
        # The root pseudo-tag is never counted as an open tag.
        if tag.name != self.ROOT_TAG_NAME:
            self.open_tag_counter[tag.name] += 1
        if tag.name in self.builder.preserve_whitespace_tags:
            self.preserve_whitespace_tag_stack.append(tag)
        if tag.name in self.builder.string_containers:
            self.string_container_stack.append(tag)

    def endData(self, containerClass=None):
        """Method called by the TreeBuilder when the end of a data segment
        occurs.

        :param containerClass: The preferred class for the resulting
            string object, resolved through string_container().
        """
        if self.current_data:
            current_data = ''.join(self.current_data)
            # If whitespace is not preserved, and this string contains
            # nothing but ASCII spaces, replace it with a single space
            # or newline.
            if not self.preserve_whitespace_tag_stack:
                if all(ch in self.ASCII_SPACES for ch in current_data):
                    if '\n' in current_data:
                        current_data = '\n'
                    else:
                        current_data = ' '

            # Reset the data collector.
            self.current_data = []

            # Should we add this string to the tree at all?
            if self.parse_only and len(self.tagStack) <= 1 and \
                   (not self.parse_only.text or \
                    not self.parse_only.search(current_data)):
                return

            containerClass = self.string_container(containerClass)
            o = containerClass(current_data)
            self.object_was_parsed(o)

    def object_was_parsed(self, o, parent=None, most_recent_element=None):
        """Method called by the TreeBuilder to integrate an object into the parse tree.

        :param o: The object to add.
        :param parent: The element to append `o` to; the current tag
            if omitted.
        :param most_recent_element: The element to treat as coming
            immediately before `o`; if omitted, the most recently
            parsed element is used.
        """
        if parent is None:
            parent = self.currentTag
        if most_recent_element is not None:
            previous_element = most_recent_element
        else:
            previous_element = self._most_recent_element

        next_element = previous_sibling = next_sibling = None
        if isinstance(o, Tag):
            # A Tag may already carry linkage; preserve it.
            next_element = o.next_element
            next_sibling = o.next_sibling
            previous_sibling = o.previous_sibling
            if previous_element is None:
                previous_element = o.previous_element

        # Must be computed before setup()/append, which modify the tree.
        fix = parent.next_element is not None

        o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)

        self._most_recent_element = o
        parent.contents.append(o)

        # Check if we are inserting into an already parsed node.
        if fix:
            self._linkage_fixer(parent)

    def _linkage_fixer(self, el):
        """Make sure linkage of this fragment is sound."""

        first = el.contents[0]
        child = el.contents[-1]
        descendant = child

        if child is first and el.parent is not None:
            # Parent should be linked to first child
            el.next_element = child
            # We are no longer linked to whatever this element is
            prev_el = child.previous_element
            if prev_el is not None and prev_el is not el:
                prev_el.next_element = None
            # First child should be linked to the parent, and no previous siblings.
            child.previous_element = el
            child.previous_sibling = None

        # We have no sibling as we've been appended as the last.
        child.next_sibling = None

        # This index is a tag, dig deeper for a "last descendant"
        if isinstance(child, Tag) and child.contents:
            descendant = child._last_descendant(False)

        # As the final step, link last descendant. It should be linked
        # to the parent's next sibling (if found), else walk up the chain
        # and find a parent with a sibling. It should have no next sibling.
        descendant.next_element = None
        descendant.next_sibling = None
        target = el
        while True:
            if target is None:
                break
            elif target.next_sibling is not None:
                descendant.next_element = target.next_sibling
                # NOTE(review): this back-link is set to `child`, not
                # `descendant`. The two differ when `child` is a Tag
                # with contents -- confirm which one is intended.
                target.next_sibling.previous_element = child
                break
            target = target.parent

    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag.

        If there are no open tags with the given name, nothing will be
        popped.

        :param name: Pop up to the most recent tag with this name.
        :param nsprefix: The namespace prefix that goes with `name`.
        :param inclusivePop: If this is false, pops the tag stack up
          to but *not* including the most recent instance of the
          given tag.
        :return: The value returned by the last popTag() call made, or
          None if nothing was popped.
        """
        if name == self.ROOT_TAG_NAME:
            # The BeautifulSoup object itself can never be popped.
            return

        most_recently_popped = None

        stack_size = len(self.tagStack)
        # Index 0 is the BeautifulSoup object itself, so stop before it.
        for i in range(stack_size - 1, 0, -1):
            if not self.open_tag_counter.get(name):
                break
            t = self.tagStack[i]
            if (name == t.name and nsprefix == t.prefix):
                if inclusivePop:
                    most_recently_popped = self.popTag()
                break
            most_recently_popped = self.popTag()

        return most_recently_popped

    def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None,
                        sourcepos=None, namespaces=None):
        """Called by the tree builder when a new tag is encountered.

        :param name: Name of the tag.
        :param namespace: Namespace of the tag; passed through to the
            Tag constructor.
        :param nsprefix: Namespace prefix for the tag.
        :param attrs: A dictionary of attribute values.
        :param sourceline: The line number where this tag was found in its
            source document.
        :param sourcepos: The character position within `sourceline` where this
            tag was found.
        :param namespaces: A dictionary of all namespace prefix mappings
            currently in scope in the document.

        If this method returns None, the tag was rejected by an active
        SoupStrainer. You should proceed as if the tag had not occurred
        in the document. For instance, if this was a self-closing tag,
        don't call handle_endtag.
        """
        self.endData()

        if (self.parse_only and len(self.tagStack) <= 1
            and (self.parse_only.text
                 or not self.parse_only.search_tag(name, attrs))):
            return None

        tag = self.element_classes.get(Tag, Tag)(
            self, self.builder, name, namespace, nsprefix, attrs,
            self.currentTag, self._most_recent_element,
            sourceline=sourceline, sourcepos=sourcepos,
            namespaces=namespaces
        )
        if tag is None:
            return tag
        if self._most_recent_element is not None:
            self._most_recent_element.next_element = tag
        self._most_recent_element = tag
        self.pushTag(tag)
        return tag

    def handle_endtag(self, name, nsprefix=None):
        """Called by the tree builder when an ending tag is encountered.

        :param name: Name of the tag.
        :param nsprefix: Namespace prefix for the tag.
        """
        self.endData()
        self._popToTag(name, nsprefix)

    def handle_data(self, data):
        """Called by the tree builder when a chunk of textual data is encountered."""
        self.current_data.append(data)

    def decode(self, pretty_print=False,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal", iterator=None):
        """Returns a string or Unicode representation of the parse tree
            as an HTML or XML document.

        :param pretty_print: If this is True, indentation will be used to
            make the document more readable.
        :param eventual_encoding: The encoding of the final document.
            If this is None, the document will be a Unicode string.
        :param formatter: Passed through unchanged to Tag.decode().
        :param iterator: Passed through unchanged to Tag.decode().
        """
        if self.is_xml:
            # Print the XML declaration
            encoding_part = ''
            if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:
                # This is a special Python encoding; it can't actually
                # go into an XML document because it means nothing
                # outside of Python.
                eventual_encoding = None
            if eventual_encoding is not None:
                encoding_part = ' encoding="%s"' % eventual_encoding
            prefix = '<?xml version="1.0"%s?>\n' % encoding_part
        else:
            prefix = ''
        if not pretty_print:
            indent_level = None
        else:
            indent_level = 0
        return prefix + super().decode(
            indent_level, eventual_encoding, formatter, iterator)


# Aliases to make it easier to get started quickly, e.g. 'from bs4 import _soup'
_s = BeautifulSoup
_soup = BeautifulSoup


class BeautifulStoneSoup(BeautifulSoup):
    """Deprecated interface to an XML parser."""

    def __init__(self, *args, **kwargs):
        # Always force the XML feature, overriding any caller-supplied value.
        kwargs['features'] = 'xml'
        warnings.warn(
            'The BeautifulStoneSoup class is deprecated. Instead of using '
            'it, pass features="xml" into the BeautifulSoup constructor.',
            DeprecationWarning, stacklevel=2
        )
        super().__init__(*args, **kwargs)


class StopParsing(Exception):
    """Exception raised by a TreeBuilder if it's unable to continue parsing."""
    pass


class FeatureNotFound(ValueError):
    """Exception raised by the BeautifulSoup constructor if no parser with the
    requested features is found.
    """
    pass


# If this file is run as a script, act as an HTML pretty-printer.
if __name__ == '__main__':
    soup = BeautifulSoup(sys.stdin)
    print(soup.prettify())