xref: /openbmc/openbmc/poky/bitbake/lib/bs4/diagnose.py (revision 82c905dc)
1"""Diagnostic functions, mainly for use when doing tech support."""
2
3__license__ = "MIT"
4
5import cProfile
6from io import StringIO
7from html.parser import HTMLParser
8import bs4
9from bs4 import BeautifulSoup, __version__
10from bs4.builder import builder_registry
11
12import os
13import pstats
14import random
15import tempfile
16import time
17import traceback
18import sys
19import cProfile
20
def diagnose(data):
    """Diagnostic suite for isolating common problems.

    Prints the Beautiful Soup and Python versions, reports which of the
    basic parsers are available, then tries to parse the markup with
    each available parser and prints either the prettified result or
    the traceback of the failure.

    :param data: The markup to diagnose. This may be a string of
        markup, an object with a ``read()`` method, or the name of an
        existing file. A string that looks like a URL is rejected with
        an explanatory message and nothing is parsed.
    """
    print("Diagnostic running on Beautiful Soup %s" % __version__)
    print("Python version %s" % sys.version)

    # Build a fresh list instead of removing entries from the list being
    # iterated over: removing in place makes the loop skip the entry
    # that follows each removed one, so a missing parser could be kept.
    basic_parsers = []
    for name in ["html.parser", "html5lib", "lxml"]:
        if any(name in builder.features
               for builder in builder_registry.builders):
            basic_parsers.append(name)
        else:
            print((
                "I noticed that %s is not installed. Installing it may help." %
                name))

    if 'lxml' in basic_parsers:
        # Also exercise lxml's XML parser, selected by a feature list.
        basic_parsers.append(["lxml", "xml"])
        try:
            from lxml import etree
            print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))
        except ImportError:
            print (
                "lxml is not installed or couldn't be imported.")

    if 'html5lib' in basic_parsers:
        try:
            import html5lib
            print("Found html5lib version %s" % html5lib.__version__)
        except ImportError:
            print (
                "html5lib is not installed or couldn't be imported.")

    if hasattr(data, 'read'):
        data = data.read()
    else:
        try:
            is_filename = os.path.exists(data)
        except ValueError:
            # Some Python versions raise ValueError for a "path" the
            # platform cannot represent (e.g. an embedded NUL).
            # Treat that as markup rather than a filename.
            is_filename = False

        # bytes.startswith() rejects str prefixes, so match the type.
        if isinstance(data, (bytes, bytearray)):
            url_prefixes = (b"http:", b"https:")
        else:
            url_prefixes = ("http:", "https:")

        if is_filename:
            print('"%s" looks like a filename. Reading data from the file.' % data)
            # Use a context manager so the file handle is closed promptly.
            with open(data) as markup_file:
                data = markup_file.read()
        elif data.startswith(url_prefixes):
            print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
            print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
            return
    print()

    for parser in basic_parsers:
        print("Trying to parse your markup with %s" % parser)
        try:
            soup = BeautifulSoup(data, parser)
        except Exception:
            # A diagnostic tool should report the failure and move on to
            # the next parser rather than abort.
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        else:
            print("Here's what %s did with the markup:" % parser)
            print(soup.prettify())

        print("-" * 80)
80
def lxml_trace(data, html=True, **kwargs):
    """Print out the lxml events that occur during parsing.

    This lets you see how lxml parses a document when no Beautiful
    Soup code is running.

    :param data: The markup to parse, as a ``str`` or ``bytes`` object.
    :param html: Passed through to ``etree.iterparse`` as its ``html``
        argument.
    :param kwargs: Additional keyword arguments passed through to
        ``etree.iterparse``.
    """
    from io import BytesIO
    from lxml import etree

    # StringIO only accepts str, so bytes markup needs a BytesIO wrapper.
    if isinstance(data, (bytes, bytearray)):
        source = BytesIO(data)
    else:
        source = StringIO(data)

    for event, element in etree.iterparse(source, html=html, **kwargs):
        print(("%s, %4s, %s" % (event, element.tag, element.text)))
90
class AnnouncingParser(HTMLParser):
    """An HTMLParser that only reports the parse events it receives.

    Every handler prints one line made of the event's payload followed
    by an upper-case label naming the event; nothing else is done with
    the markup.
    """

    def _p(self, s):
        """Emit a single announcement line."""
        print(s)

    def handle_starttag(self, name, attrs):
        """Announce an opening tag; its attributes are not reported."""
        announcement = "%s START" % name
        self._p(announcement)

    def handle_endtag(self, name):
        """Announce a closing tag."""
        announcement = "%s END" % name
        self._p(announcement)

    def handle_data(self, data):
        """Announce a run of text."""
        announcement = "%s DATA" % data
        self._p(announcement)

    def handle_charref(self, name):
        """Announce a numeric character reference."""
        announcement = "%s CHARREF" % name
        self._p(announcement)

    def handle_entityref(self, name):
        """Announce a named entity reference."""
        announcement = "%s ENTITYREF" % name
        self._p(announcement)

    def handle_comment(self, data):
        """Announce a comment."""
        announcement = "%s COMMENT" % data
        self._p(announcement)

    def handle_decl(self, data):
        """Announce a declaration."""
        announcement = "%s DECL" % data
        self._p(announcement)

    def unknown_decl(self, data):
        """Announce a declaration the parser did not recognize."""
        announcement = "%s UNKNOWN-DECL" % data
        self._p(announcement)

    def handle_pi(self, data):
        """Announce a processing instruction."""
        announcement = "%s PI" % data
        self._p(announcement)
123
def htmlparser_trace(data):
    """Show the events HTMLParser generates while parsing ``data``.

    No Beautiful Soup code is involved: the markup is fed straight to
    an AnnouncingParser, which prints one line per parse event.
    """
    AnnouncingParser().feed(data)
131    parser.feed(data)
132
# Letter pools used by rword() to build pronounceable-looking words.
_vowels = "aeiou"
_consonants = "bcdfghjklmnpqrstvwxyz"

def rword(length=5):
    """Generate a random word-like string.

    Letters alternate between the two pools: even positions (starting
    with the first letter) come from ``_consonants`` and odd positions
    come from ``_vowels``.

    :param length: Number of letters in the generated word.
    :return: A string of ``length`` letters.
    """
    return "".join(
        random.choice(_vowels if position % 2 else _consonants)
        for position in range(length)
    )
146
def rsentence(length=4):
    """Generate a random sentence-like string.

    :param length: Number of words in the sentence.
    :return: ``length`` random words, each 4 to 9 letters long, joined
        by single spaces.
    """
    words = []
    for _ in range(length):
        word_length = random.randint(4, 9)
        words.append(rword(word_length))
    return " ".join(words)
150
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    Each of the ``num_elements`` rounds picks one of four equally
    likely outcomes: add an opening tag, add a random sentence, add a
    closing tag, or add nothing. Tags are never matched up, which is
    what makes the document invalid.

    :param num_elements: Number of rounds to run; the document holds at
        most this many pieces.
    :return: The pieces joined by newlines and wrapped in
        ``<html>``...``</html>``.
    """
    tag_names = ('p', 'div', 'span', 'i', 'b', 'script', 'table')
    pieces = []
    for _ in range(num_elements):
        roll = random.randint(0, 3)
        if roll == 1:
            # A run of text, one to four words long.
            pieces.append(rsentence(random.randint(1, 4)))
        elif roll in (0, 2):
            # 0 opens a randomly chosen tag, 2 closes one.
            template = "<%s>" if roll == 0 else "</%s>"
            pieces.append(template % random.choice(tag_names))
        # A roll of 3 deliberately adds nothing.
    body = "\n".join(pieces)
    return "<html>" + body + "</html>"
167    return "<html>" + "\n".join(elements) + "</html>"
168
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark.

    Generates one random invalid HTML document with ``rdoc``, then
    prints how long Beautiful Soup takes to parse it with each parser,
    followed by the time taken by lxml and html5lib on their own.

    :param num_elements: Passed to ``rdoc`` to size the generated
        document.
    """
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))

    # time.perf_counter() is the clock intended for measuring short
    # durations; time.time() can jump if the system clock is adjusted.
    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        try:
            start = time.perf_counter()
            BeautifulSoup(data, parser)
            elapsed = time.perf_counter() - start
        except Exception:
            # Report the failure and carry on with the other parsers.
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        else:
            print("BS4+%s parsed the markup in %.2fs." % (parser, elapsed))

    # The raw-library timings are optional: a missing library is
    # reported, as diagnose() does, instead of aborting the benchmark.
    try:
        from lxml import etree
    except ImportError:
        print("lxml is not installed or couldn't be imported.")
    else:
        start = time.perf_counter()
        etree.HTML(data)
        elapsed = time.perf_counter() - start
        print("Raw lxml parsed the markup in %.2fs." % elapsed)

    try:
        import html5lib
    except ImportError:
        print("html5lib is not installed or couldn't be imported.")
    else:
        parser = html5lib.HTMLParser()
        start = time.perf_counter()
        parser.parse(data)
        elapsed = time.perf_counter() - start
        print("Raw html5lib parsed the markup in %.2fs." % elapsed)
200
def profile(num_elements=100000, parser="lxml"):
    """Profile Beautiful Soup parsing a randomly generated document.

    Generates a document with ``rdoc``, runs
    ``bs4.BeautifulSoup(data, parser)`` under cProfile, and prints up
    to 50 entries, sorted by cumulative time, whose names match
    ``_html5lib|bs4``.

    :param num_elements: Passed to ``rdoc`` to size the generated
        document.
    :param parser: The parser argument handed to ``BeautifulSoup``.
    """
    data = rdoc(num_elements)
    namespace = dict(bs4=bs4, data=data, parser=parser)

    # Keep the profile in memory and hand the Profile object to pstats.
    # This avoids a temporary file that was never closed and that some
    # platforms cannot reopen by name while it is still held open.
    profiler = cProfile.Profile()
    profiler.runctx('bs4.BeautifulSoup(data, parser)', namespace, namespace)

    stats = pstats.Stats(profiler)
    stats.sort_stats("cumulative")
    stats.print_stats('_html5lib|bs4', 50)
213    stats.print_stats('_html5lib|bs4', 50)
214
if __name__ == "__main__":
    # Run as a script: diagnose whatever markup arrives on standard input.
    diagnose(data=sys.stdin.read())
217