"""Beautiful Soup
Elixir and Tonic
"The Screen-Scraper's Friend"
http://www.crummy.com/software/BeautifulSoup/

Beautiful Soup uses a pluggable XML or HTML parser to parse a
(possibly invalid) document into a tree representation. Beautiful Soup
provides methods and Pythonic idioms that make it easy to
navigate, search, and modify the parse tree.

Beautiful Soup works with Python 2.6 and up. It works better if lxml
and/or html5lib is installed.

For more than you ever wanted to know about Beautiful Soup, see the
documentation:
http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""

__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.4.1"
__copyright__ = "Copyright (c) 2004-2015 Leonard Richardson"
__license__ = "MIT"

__all__ = ['BeautifulSoup']

import os
import re
import warnings

from .builder import builder_registry, ParserRejectedMarkup
from .dammit import UnicodeDammit
from .element import (
    CData,
    Comment,
    DEFAULT_OUTPUT_ENCODING,
    Declaration,
    Doctype,
    NavigableString,
    PageElement,
    ProcessingInstruction,
    ResultSet,
    SoupStrainer,
    Tag,
    )

# The very first thing we do is give a useful error if someone is
# running this code under Python 3 without converting it.
# In this form the statement is just a discarded string comparison and
# has no effect at runtime.
'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'


class BeautifulSoup(Tag):
    """
    This class defines the basic interface called by the tree builders.

    These methods will be called by the parser:
      reset()
      feed(markup)

    The tree builder may call these methods from its feed() implementation:
      handle_starttag(name, attrs) # See note about return value
      handle_endtag(name)
      handle_data(data) # Appends to the current data node
      endData(containerClass=NavigableString) # Ends the current data node

    No matter how complicated the underlying parser is, you should be
    able to build a tree using 'start tag' events, 'end tag' events,
    'data' events, and "done with data" events.

    If you encounter an empty-element tag (aka a self-closing tag,
    like HTML's <br> tag), call handle_starttag and then
    handle_endtag.
    """
    ROOT_TAG_NAME = '[document]'

    # If the end-user gives no indication which tree builder they
    # want, look for one with these features.
    DEFAULT_BUILDER_FEATURES = ['html', 'fast']

    # Characters that endData() treats as collapsible whitespace.
    ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"

    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, exclude_encodings=None,
                 **kwargs):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        :param markup: The document to parse: a string, or any object
            with a ``read()`` method, whose result is parsed instead.
        :param features: A feature name, or a list of feature names,
            used to look up a tree builder in ``builder_registry``.
            When empty or None, DEFAULT_BUILDER_FEATURES is used.
        :param builder: A tree builder instance to use instead of
            looking one up by ``features``.
        :param parse_only: Stored as ``self.parse_only`` and consulted
            by endData() and handle_starttag() to decide which
            top-level strings and tags are kept (presumably a
            SoupStrainer -- confirm against callers).
        :param from_encoding: Passed through to the builder's
            ``prepare_markup()``.
        :param exclude_encodings: Passed through to the builder's
            ``prepare_markup()``.
        :param kwargs: Only obsolete arguments are accepted here; each
            one triggers a warning and is otherwise ignored (or mapped
            on to its renamed equivalent).

        :raises TypeError: If an unknown keyword argument is passed.
        :raises FeatureNotFound: If no tree builder offers the
            requested features.
        """

        if 'convertEntities' in kwargs:
            # Like the other obsolete arguments below, this one must be
            # removed from kwargs; otherwise the "unexpected keyword
            # argument" check further down turns the warning into a
            # TypeError.
            del kwargs['convertEntities']
            warnings.warn(
                "BS4 does not respect the convertEntities argument to the "
                "BeautifulSoup constructor. Entities are always converted "
                "to Unicode characters.")

        if 'markupMassage' in kwargs:
            del kwargs['markupMassage']
            warnings.warn(
                "BS4 does not respect the markupMassage argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for any necessary markup massage.")

        if 'smartQuotesTo' in kwargs:
            del kwargs['smartQuotesTo']
            warnings.warn(
                "BS4 does not respect the smartQuotesTo argument to the "
                "BeautifulSoup constructor. Smart quotes are always converted "
                "to Unicode characters.")

        if 'selfClosingTags' in kwargs:
            del kwargs['selfClosingTags']
            warnings.warn(
                "BS4 does not respect the selfClosingTags argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for understanding self-closing tags.")

        if 'isHTML' in kwargs:
            del kwargs['isHTML']
            warnings.warn(
                "BS4 does not respect the isHTML argument to the "
                "BeautifulSoup constructor. Suggest you use "
                "features='lxml' for HTML and features='lxml-xml' for "
                "XML.")

        def deprecated_argument(old_name, new_name):
            # Pop a renamed legacy argument out of kwargs, warning the
            # caller about the new name. Returns None when absent.
            if old_name in kwargs:
                warnings.warn(
                    'The "%s" argument to the BeautifulSoup constructor '
                    'has been renamed to "%s."' % (old_name, new_name))
                value = kwargs[old_name]
                del kwargs[old_name]
                return value
            return None

        parse_only = parse_only or deprecated_argument(
            "parseOnlyThese", "parse_only")

        from_encoding = from_encoding or deprecated_argument(
            "fromEncoding", "from_encoding")

        # Anything still left in kwargs is an argument we don't know.
        if len(kwargs) > 0:
            arg = list(kwargs.keys()).pop()
            raise TypeError(
                "__init__() got an unexpected keyword argument '%s'" % arg)

        if builder is None:
            original_features = features
            if isinstance(features, str):
                features = [features]
            if features is None or len(features) == 0:
                features = self.DEFAULT_BUILDER_FEATURES
            builder_class = builder_registry.lookup(*features)
            if builder_class is None:
                raise FeatureNotFound(
                    "Couldn't find a tree builder with the features you "
                    "requested: %s. Do you need to install a parser library?"
                    % ",".join(features))
            builder = builder_class()
            # Warn unless the caller named this exact parser.
            if not (original_features == builder.NAME or
                    original_features in builder.ALTERNATE_NAMES):
                if builder.is_xml:
                    markup_type = "XML"
                else:
                    markup_type = "HTML"
                warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
                    parser=builder.NAME,
                    markup_type=markup_type))

        self.builder = builder
        self.is_xml = builder.is_xml
        self.builder.soup = self

        self.parse_only = parse_only

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        elif len(markup) <= 256:
            # Print out warnings for a couple beginner problems
            # involving passing non-markup to Beautiful Soup.
            # Beautiful Soup will still parse the input as markup,
            # just in case that's what the user really wants. The
            # checks never modify the markup itself.
            self._warn_if_markup_is_filename(markup)
            self._warn_if_markup_is_url(markup)

        # The builder may propose several interpretations of the
        # markup; use the first one the parser does not reject.
        for (self.markup, self.original_encoding, self.declared_html_encoding,
         self.contains_replacement_characters) in (
            self.builder.prepare_markup(
                markup, from_encoding, exclude_encodings=exclude_encodings)):
            self.reset()
            try:
                self._feed()
                break
            except ParserRejectedMarkup:
                pass

        # Clear out the markup and remove the builder's circular
        # reference to this object.
        self.markup = None
        self.builder.soup = None

    @staticmethod
    def _displayable_markup(markup):
        """Return `markup` as text suitable for use in a warning message."""
        if isinstance(markup, bytes):
            return markup.decode("utf8", "replace")
        return markup

    @staticmethod
    def _warn_if_markup_is_filename(markup):
        """Warn if `markup` is the path of an existing file."""
        if (isinstance(markup, str)
                and not os.path.supports_unicode_filenames):
            possible_filename = markup.encode("utf8")
        else:
            possible_filename = markup
        try:
            is_file = os.path.exists(possible_filename)
        except Exception:
            # This is almost certainly a problem involving
            # characters not valid in filenames on this
            # system. Just let it go.
            is_file = False
        if is_file:
            warnings.warn(
                '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.'
                % BeautifulSoup._displayable_markup(markup))

    @staticmethod
    def _warn_if_markup_is_url(markup):
        """Warn if `markup` (str or bytes) looks like an HTTP(S) URL."""
        # str and bytes cannot be compared with each other, so each
        # type needs its own prefixes and its own space character.
        if isinstance(markup, bytes):
            space = b' '
            url_prefixes = (b"http:", b"https:")
        elif isinstance(markup, str):
            space = ' '
            url_prefixes = ("http:", "https:")
        else:
            return
        if markup.startswith(url_prefixes) and space not in markup:
            warnings.warn(
                '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.'
                % BeautifulSoup._displayable_markup(markup))

    def __copy__(self):
        """Copy this soup by re-parsing its encoded form with the
        same builder."""
        return type(self)(self.encode(), builder=self.builder)

    def __getstate__(self):
        """Return the instance state, minus any unpicklable builder."""
        # Frequently a tree builder can't be pickled.
        d = dict(self.__dict__)
        if 'builder' in d and not self.builder.picklable:
            del d['builder']
        return d

    def _feed(self):
        """Run the builder over self.markup and close any open tags."""
        # Start the builder from a clean state.
        self.builder.reset()

        self.builder.feed(self.markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def reset(self):
        """Re-initialize this object as an empty, hidden root tag and
        clear all parsing state."""
        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
        self.hidden = 1
        self.builder.reset()
        self.current_data = []
        self.currentTag = None
        self.tagStack = []
        self.preserve_whitespace_tag_stack = []
        # The soup itself sits at the bottom of the tag stack.
        self.pushTag(self)

    def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
        """Create a new tag associated with this soup."""
        return Tag(None, self.builder, name, namespace, nsprefix, attrs)

    def new_string(self, s, subclass=NavigableString):
        """Create a new NavigableString associated with this soup."""
        return subclass(s)

    def insert_before(self, successor):
        """Unsupported on the root object; always raises
        NotImplementedError."""
        raise NotImplementedError("BeautifulSoup objects don't support insert_before().")

    def insert_after(self, successor):
        """Unsupported on the root object; always raises
        NotImplementedError."""
        raise NotImplementedError("BeautifulSoup objects don't support insert_after().")

    def popTag(self):
        """Pop the top of the tag stack and return the new current tag."""
        tag = self.tagStack.pop()
        if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
            self.preserve_whitespace_tag_stack.pop()
        #print "Pop", tag.name
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Append `tag` to the current tag's contents and make it the
        current tag."""
        #print "Push", tag.name
        if self.currentTag:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]
        if tag.name in self.builder.preserve_whitespace_tags:
            self.preserve_whitespace_tag_stack.append(tag)

    def endData(self, containerClass=NavigableString):
        """Turn the text collected by handle_data() into a single
        `containerClass` object and add it to the tree.

        Does nothing if no data has been collected.
        """
        if self.current_data:
            current_data = ''.join(self.current_data)
            # If whitespace is not preserved, and this string contains
            # nothing but ASCII spaces, replace it with a single space
            # or newline.
            if not self.preserve_whitespace_tag_stack:
                strippable = all(
                    char in self.ASCII_SPACES for char in current_data)
                if strippable:
                    if '\n' in current_data:
                        current_data = '\n'
                    else:
                        current_data = ' '

            # Reset the data collector.
            self.current_data = []

            # Should we add this string to the tree at all?
            if self.parse_only and len(self.tagStack) <= 1 and \
                   (not self.parse_only.text or \
                    not self.parse_only.search(current_data)):
                return

            o = containerClass(current_data)
            self.object_was_parsed(o)

    def object_was_parsed(self, o, parent=None, most_recent_element=None):
        """Add an object to the parse tree.

        :param o: The object to add.
        :param parent: The tag to append `o` to; defaults to the
            current tag.
        :param most_recent_element: The element to link as `o`'s
            previous element; defaults to self._most_recent_element.
        """
        parent = parent or self.currentTag
        previous_element = most_recent_element or self._most_recent_element

        next_element = previous_sibling = next_sibling = None
        if isinstance(o, Tag):
            # A Tag may already carry links; keep them.
            next_element = o.next_element
            next_sibling = o.next_sibling
            previous_sibling = o.previous_sibling
            if not previous_element:
                previous_element = o.previous_element

        o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)

        self._most_recent_element = o
        parent.contents.append(o)

        if parent.next_sibling:
            # This node is being inserted into an element that has
            # already been parsed. Deal with any dangling references.
            index = parent.contents.index(o)
            if index == 0:
                previous_element = parent
                previous_sibling = None
            else:
                previous_element = previous_sibling = parent.contents[index-1]
            if index == len(parent.contents)-1:
                next_element = parent.next_sibling
                next_sibling = None
            else:
                next_element = next_sibling = parent.contents[index+1]

            # Re-link `o` and its neighbours in both directions.
            o.previous_element = previous_element
            if previous_element:
                previous_element.next_element = o
            o.next_element = next_element
            if next_element:
                next_element.previous_element = o
            o.next_sibling = next_sibling
            if next_sibling:
                next_sibling.previous_sibling = o
            o.previous_sibling = previous_sibling
            if previous_sibling:
                previous_sibling.next_sibling = o

    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag.

        Returns the value of the last popTag() call, or None if
        nothing was popped.
        """
        #print "Popping to %s" % name
        if name == self.ROOT_TAG_NAME:
            # The BeautifulSoup object itself can never be popped.
            return

        most_recently_popped = None

        stack_size = len(self.tagStack)
        # Walk from the top of the stack down; index 0 (the soup
        # itself) is never visited.
        for i in range(stack_size - 1, 0, -1):
            t = self.tagStack[i]
            if (name == t.name and nsprefix == t.prefix):
                if inclusivePop:
                    most_recently_popped = self.popTag()
                break
            most_recently_popped = self.popTag()

        return most_recently_popped

    def handle_starttag(self, name, namespace, nsprefix, attrs):
        """Push a start tag on to the stack.

        If this method returns None, the tag was rejected by the
        SoupStrainer. You should proceed as if the tag had not occurred
        in the document. For instance, if this was a self-closing tag,
        don't call handle_endtag.
        """

        # print "Start tag %s: %s" % (name, attrs)
        self.endData()

        if (self.parse_only and len(self.tagStack) <= 1
            and (self.parse_only.text
                 or not self.parse_only.search_tag(name, attrs))):
            return None

        tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
                  self.currentTag, self._most_recent_element)
        # NOTE(review): this guard looks purely defensive; a normal
        # constructor call would not produce None -- confirm against
        # Tag before removing.
        if tag is None:
            return tag
        if self._most_recent_element:
            self._most_recent_element.next_element = tag
        self._most_recent_element = tag
        self.pushTag(tag)
        return tag

    def handle_endtag(self, name, nsprefix=None):
        """Flush pending text, then pop the stack through the named tag."""
        #print "End tag: " + name
        self.endData()
        self._popToTag(name, nsprefix)

    def handle_data(self, data):
        """Collect a chunk of text for the current data node."""
        self.current_data.append(data)

    def decode(self, pretty_print=False,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
        """Returns a string or Unicode representation of this document.
        To get Unicode, pass None for encoding."""

        if self.is_xml:
            # Print the XML declaration
            encoding_part = ''
            if eventual_encoding is not None:
                encoding_part = ' encoding="%s"' % eventual_encoding
            prefix = '<?xml version="1.0"%s?>\n' % encoding_part
        else:
            prefix = ''
        # Pass an indent level of 0 when pretty-printing, None otherwise.
        if not pretty_print:
            indent_level = None
        else:
            indent_level = 0
        return prefix + super(BeautifulSoup, self).decode(
            indent_level, eventual_encoding, formatter)

# Alias to make it easier to type import: 'from bs4 import _soup'
_s = BeautifulSoup
_soup = BeautifulSoup


class BeautifulStoneSoup(BeautifulSoup):
    """Deprecated interface to an XML parser."""

    def __init__(self, *args, **kwargs):
        """Warn about the deprecation, then build a BeautifulSoup with
        features forced to 'xml'."""
        kwargs['features'] = 'xml'
        warnings.warn(
            'The BeautifulStoneSoup class is deprecated. Instead of using '
            'it, pass features="xml" into the BeautifulSoup constructor.')
        super(BeautifulStoneSoup, self).__init__(*args, **kwargs)


class StopParsing(Exception):
    pass


class FeatureNotFound(ValueError):
    """Raised by the BeautifulSoup constructor when no tree builder
    offers the requested features."""
    pass


#By default, act as an HTML pretty-printer.
if __name__ == '__main__':
    import sys
    soup = BeautifulSoup(sys.stdin)
    print(soup.prettify())