"""Diagnostic functions, mainly for use when doing tech support."""

# Use of this source code is governed by the MIT license.
__license__ = "MIT"

import cProfile
from io import BytesIO
from html.parser import HTMLParser
import bs4
from bs4 import BeautifulSoup, __version__
from bs4.builder import builder_registry

import os
import pstats
import random
import tempfile
import time
import traceback
import sys

def diagnose(data):
    """Diagnostic suite for isolating common problems.

    :param data: A string containing markup that needs to be explained,
        or an open file whose contents are that markup.
    :return: None; diagnostics are printed to standard output.
    """
    print("Diagnostic running on Beautiful Soup %s" % __version__)
    print("Python version %s" % sys.version)

    # Figure out which of the basic tree builders are installed.  We
    # build a fresh list rather than calling .remove() on the list
    # we're iterating over: mutating a list mid-iteration skips the
    # element after each removal, so two missing parsers in a row
    # would leave the second one silently unreported.
    basic_parsers = []
    for name in ["html.parser", "html5lib", "lxml"]:
        if any(name in builder.features
               for builder in builder_registry.builders):
            basic_parsers.append(name)
        else:
            print(
                "I noticed that %s is not installed. Installing it may help." %
                name)

    if 'lxml' in basic_parsers:
        # The lxml tree builder also provides an XML mode; exercise it too.
        basic_parsers.append("lxml-xml")
        try:
            from lxml import etree
            print("Found lxml version %s" % ".".join(map(str, etree.LXML_VERSION)))
        except ImportError:
            print(
                "lxml is not installed or couldn't be imported.")

    if 'html5lib' in basic_parsers:
        try:
            import html5lib
            print("Found html5lib version %s" % html5lib.__version__)
        except ImportError:
            print(
                "html5lib is not installed or couldn't be imported.")

    # Accept an open file as well as a string.
    if hasattr(data, 'read'):
        data = data.read()

    # Try the markup against every available parser and show the result.
    for parser in basic_parsers:
        print("Trying to parse your markup with %s" % parser)
        try:
            soup = BeautifulSoup(data, features=parser)
        except Exception:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        else:
            print("Here's what %s did with the markup:" % parser)
            print(soup.prettify())

        print("-" * 80)

def lxml_trace(data, html=True, **kwargs):
    """Print out the lxml events that occur during parsing.

    This lets you see how lxml parses a document when no Beautiful
    Soup code is running. You can use this to determine whether
    an lxml-specific problem is in Beautiful Soup's lxml tree builders
    or in lxml itself.

    :param data: Some markup, as a string or bytestring.
    :param html: If True, markup will be parsed with lxml's HTML parser.
        If False, lxml's XML parser will be used.
    :param kwargs: Passed through to ``etree.iterparse``; ``recover``
        defaults to True so that invalid markup is still traced.
    """
    from lxml import etree
    recover = kwargs.pop('recover', True)
    if isinstance(data, str):
        # iterparse reads from a binary stream, so encode text first.
        data = data.encode("utf8")
    reader = BytesIO(data)
    for event, element in etree.iterparse(
        reader, html=html, recover=recover, **kwargs
    ):
        print("%s, %4s, %s" % (event, element.tag, element.text))

class AnnouncingParser(HTMLParser):
    """Subclass of HTMLParser that announces parse events, without doing
    anything else.

    You can use this to get a picture of how html.parser sees a given
    document. The easiest way to do this is to call `htmlparser_trace`.
    """

    def _p(self, s):
        # Single choke point for output, so subclasses can redirect it.
        print(s)

    def handle_starttag(self, name, attrs):
        self._p("%s START" % name)

    def handle_endtag(self, name):
        self._p("%s END" % name)

    def handle_data(self, data):
        self._p("%s DATA" % data)

    def handle_charref(self, name):
        self._p("%s CHARREF" % name)

    def handle_entityref(self, name):
        self._p("%s ENTITYREF" % name)

    def handle_comment(self, data):
        self._p("%s COMMENT" % data)

    def handle_decl(self, data):
        self._p("%s DECL" % data)

    def unknown_decl(self, data):
        self._p("%s UNKNOWN-DECL" % data)

    def handle_pi(self, data):
        self._p("%s PI" % data)

def htmlparser_trace(data):
    """Print out the HTMLParser events that occur during parsing.

    This lets you see how HTMLParser parses a document when no
    Beautiful Soup code is running.

    :param data: Some markup, as a string.
    :return: None; events are printed to standard output.
    """
    parser = AnnouncingParser()
    parser.feed(data)

_vowels = "aeiou"
_consonants = "bcdfghjklmnpqrstvwxyz"

def rword(length=5):
    """Generate a random word-like string of the given length.

    Consonants and vowels alternate, starting with a consonant, so the
    result is pronounceable-ish.
    """
    s = ''
    for i in range(length):
        if i % 2 == 0:
            t = _consonants
        else:
            t = _vowels
        s += random.choice(t)
    return s

def rsentence(length=4):
    """Generate a random sentence-like string of `length` words."""
    return " ".join(rword(random.randint(4, 9)) for i in range(length))

def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    :param num_elements: Approximate number of tags/text runs to emit.
    :return: The document, as a string.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    elements = []
    for i in range(num_elements):
        choice = random.randint(0, 3)
        if choice == 0:
            # New tag.
            tag_name = random.choice(tag_names)
            elements.append("<%s>" % tag_name)
        elif choice == 1:
            # A run of text.
            elements.append(rsentence(random.randint(1, 4)))
        elif choice == 2:
            # Close a tag (not necessarily one that's open -- the
            # point is to produce invalid markup).
            tag_name = random.choice(tag_names)
            elements.append("</%s>" % tag_name)
        # choice == 3 deliberately emits nothing, varying the density.
    return "<html>" + "\n".join(elements) + "</html>"

def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark.

    Parses a large randomly generated invalid document with each
    available parser, through Beautiful Soup and (for lxml and
    html5lib) directly, and prints the timings.
    """
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))

    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        try:
            a = time.time()
            soup = BeautifulSoup(data, parser)
            b = time.time()
        except Exception:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        else:
            print("BS4+%s parsed the markup in %.2fs." % (parser, b - a))

    # Now time the raw parsers, without Beautiful Soup's tree-building
    # overhead, for comparison.
    from lxml import etree
    a = time.time()
    etree.HTML(data)
    b = time.time()
    print("Raw lxml parsed the markup in %.2fs." % (b - a))

    import html5lib
    parser = html5lib.HTMLParser()
    a = time.time()
    parser.parse(data)
    b = time.time()
    print("Raw html5lib parsed the markup in %.2fs." % (b - a))

def profile(num_elements=100000, parser="lxml"):
    """Use Python's profiler on a randomly generated document.

    :param num_elements: Approximate size of the generated document.
    :param parser: The parser Beautiful Soup should use.
    :return: None; the top profiling stats are printed to standard output.
    """
    data = rdoc(num_elements)
    # The context manager guarantees the stats file is cleaned up even
    # if profiling or stats-reading raises.
    with tempfile.NamedTemporaryFile() as filehandle:
        filename = filehandle.name
        namespace = dict(bs4=bs4, data=data, parser=parser)
        cProfile.runctx('bs4.BeautifulSoup(data, parser)', namespace, namespace, filename)

        stats = pstats.Stats(filename)
        # stats.strip_dirs()
        stats.sort_stats("cumulative")
        stats.print_stats('_html5lib|bs4', 50)

# If this file is run as a script, standard input is diagnosed.
if __name__ == '__main__':
    diagnose(sys.stdin.read())