xref: /openbmc/openbmc/poky/bitbake/lib/bb/codeparser.py (revision c9537f57ab488bf5d90132917b0184e2527970a5)
1#
2# Copyright BitBake Contributors
3#
4# SPDX-License-Identifier: GPL-2.0-only
5#
6
7"""
8BitBake code parser
9
10Parses actual code (i.e. python and shell) for functions and in-line
11expressions. Used mainly to determine dependencies on other functions
12and variables within the BitBake metadata. Also provides a cache for
13this information in order to speed up processing.
14
15(Not to be confused with the code that parses the metadata itself,
16see lib/bb/parse/ for that).
17
18NOTE: if you change how the parsers gather information you will almost
19certainly need to increment CodeParserCache.CACHE_VERSION below so that
20any existing codeparser cache gets invalidated. Additionally you'll need
21to increment __cache_version__ in cache.py in order to ensure that old
22recipe caches don't trigger "Taskhash mismatch" errors.
23
24"""
25
26import ast
27import sys
28import codegen
29import logging
30import inspect
31import bb.pysh as pysh
32import bb.utils, bb.data
33import hashlib
34from itertools import chain
35from bb.pysh import pyshyacc, pyshlex
36from bb.cache import MultiProcessCache
37
# Module-level logger; handed to PythonParser by add_module_functions() as
# the target that buffered parser messages are eventually flushed to.
logger = logging.getLogger('BitBake.CodeParser')
39
def bbhash(s):
    """Return the hexadecimal SHA-256 digest of the string s, encoded as UTF-8."""
    digest = hashlib.sha256()
    digest.update(s.encode("utf-8"))
    return digest.hexdigest()
42
def check_indent(codestr):
    """If the code is indented, add a top level piece of code to 'remove' the indentation

    The first statement counts as indented when the whitespace run in front of
    it ends in a space or a tab. In that case the code is wrapped in an
    "if 1:" block so it can be compiled as-is. If the code started with a
    newline, that newline is dropped to make up for the added line, so line
    numbers stay correct.

    Empty or whitespace-only input holds no statement to wrap and is returned
    unchanged.
    """

    # Length of the leading run of newlines, tabs and spaces
    i = len(codestr) - len(codestr.lstrip("\n\t "))

    # No leading whitespace at all, or nothing but whitespace
    if i == 0 or i == len(codestr):
        return codestr

    if codestr[i-1] == "\t" or codestr[i-1] == " ":
        if codestr[0] == "\n":
            # Since we're adding a line, we need to remove one line of any empty padding
            # to ensure line numbers are correct
            codestr = codestr[1:]
        return "if 1:\n" + codestr

    # Only blank lines in front of an unindented statement
    return codestr
61
# Dependency information for python module functions, filled in by
# add_module_functions(). Keys are "<namespace>.<function name>", values are
# six element lists:
#   [references, execs, var_execs, contains, extra, visitorcode]
modulecode_deps = {}
63
def add_module_functions(fn, functions, namespace):
    """Record dependency information for the functions of a python module.

    fn is the path of the module's source file, functions maps function
    names to the function objects and namespace is the prefix under which
    the results are stored in modulecode_deps ("<namespace>.<name>").

    Functions which have no python source (builtins) or whose source lives in
    a different file than fn are skipped.
    """
    import os

    st = os.stat(fn)
    # The cache key is derived from the file (path, size, mtime) plus the
    # function name, so the function source only needs fetching on a miss.
    fixedhash = ":".join((fn, str(st.st_size), str(st.st_mtime)))

    for f in functions:
        func = functions[f]
        name = "%s.%s" % (namespace, f)
        parser = PythonParser(name, logger)
        try:
            # No source supplied: parse_python() raises KeyError on a cache miss
            parser.parse_python(None, filename=fn, lineno=1, fixedhash=fixedhash+f, func=func)
        except KeyError:
            try:
                targetfn = inspect.getsourcefile(func)
                if fn != targetfn:
                    # Skip references to other modules outside this file
                    continue
                lines, lineno = inspect.getsourcelines(func)
            except TypeError:
                # Builtin
                continue
            parser.parse_python("".join(lines), filename=fn, lineno=lineno, fixedhash=fixedhash+f, func=func)

        # Expand internal module exec references
        execs = {namespace + "." + e if e in functions else e for e in parser.execs}

        modulecode_deps[name] = [
            parser.references.copy(),
            execs,
            parser.var_execs.copy(),
            parser.contains.copy(),
            parser.extra,
            getattr(func, "visitorcode", None),
        ]
103
def update_module_dependencies(d):
    """Apply each module function's "vardepsexclude" flag to modulecode_deps.

    For every entry whose flag (read from d) lists any names, those names are
    removed from the references, execs and var_execs sets. The remaining three
    slots of the entry are carried over untouched.
    """
    for mod, deps in modulecode_deps.items():
        excludes = set((d.getVarFlag(mod, "vardepsexclude") or "").split())
        if not excludes:
            continue
        refs, execs, var_execs = deps[0], deps[1], deps[2]
        # Replacing the value of an existing key is safe while iterating
        modulecode_deps[mod] = [refs - excludes, execs - excludes, var_execs - excludes,
                                deps[3], deps[4], deps[5]]
109
110# A custom getstate/setstate using tuples is actually worth 15% cachesize by
111# avoiding duplication of the attribute names!
class SetCache(object):
    """Intern sets of strings so equal sets share a single frozenset object."""

    def __init__(self):
        # Maps every interned frozenset to itself. Keying on the set rather
        # than on its hash value means two different sets whose hashes
        # collide can never be mistaken for one another.
        self.setcache = {}

    def internSet(self, items):
        """Return the shared frozenset holding the strings in items.

        Each string is passed through sys.intern(). If an equal set was
        interned before, that earlier object is returned, otherwise the new
        frozenset is remembered and returned.
        """
        s = frozenset(sys.intern(i) for i in items)
        return self.setcache.setdefault(s, s)
127
# Shared SetCache through which the cache line classes below intern their sets
codecache = SetCache()
129
class pythonCacheLine(object):
    """Cached result of parsing one piece of python code.

    Holds the references and execs as interned frozensets, the contains
    mapping (variable name to interned frozenset) and the extra value. The
    pickled state is a plain tuple of those four items.
    """

    def __init__(self, refs, execs, contains, extra):
        self.refs = codecache.internSet(refs)
        self.execs = codecache.internSet(execs)
        self.contains = {var: codecache.internSet(contains[var]) for var in contains}
        self.extra = extra

    def __getstate__(self):
        return (self.refs, self.execs, self.contains, self.extra)

    def __setstate__(self, state):
        # Going back through __init__ re-interns the sets after unpickling
        (refs, execs, contains, extra) = state
        self.__init__(refs, execs, contains, extra)

    def __hash__(self):
        parts = [hash(self.refs), hash(self.execs), hash(self.extra)]
        # Sorted so the hash does not depend on dict insertion order
        for var in sorted(self.contains.keys()):
            parts.append(var)
            parts.append(hash(self.contains[var]))
        return hash(tuple(parts))

    def __repr__(self):
        return "%s %s %s" % (str(self.refs), str(self.execs), str(self.contains))
152
153
class shellCacheLine(object):
    """Cached result of parsing one piece of shell code: the set of execs.

    The pickled state is the interned frozenset itself (not wrapped in a
    tuple).
    """

    def __init__(self, execs):
        self.execs = codecache.internSet(execs)

    def __getstate__(self):
        return self.execs

    def __setstate__(self, state):
        # Going back through __init__ re-interns the set after unpickling
        self.__init__(state)

    def __hash__(self):
        return hash(self.execs)

    def __repr__(self):
        return "%s" % str(self.execs)
168
class CodeParserCache(MultiProcessCache):
    """Cache of python and shell code parsing results.

    Slot 0 of the cache data holds the python results and slot 1 the shell
    results. The same layout is used for the "extras" data, which is where
    the parsers store results they had to compute themselves.
    """
    cache_file_name = "bb_codeparser.dat"
    # NOTE: you must increment this if you change how the parsers gather information,
    # so that an existing cache gets invalidated. Additionally you'll need
    # to increment __cache_version__ in cache.py in order to ensure that old
    # recipe caches don't trigger "Taskhash mismatch" errors.
    CACHE_VERSION = 14

    def __init__(self):
        MultiProcessCache.__init__(self)
        self.pythoncache = self.cachedata[0]
        self.shellcache = self.cachedata[1]
        self.pythoncacheextras = self.cachedata_extras[0]
        self.shellcacheextras = self.cachedata_extras[1]

        # To avoid duplication in the codeparser cache, keep
        # a lookup of hashes of objects we already have
        self.pythoncachelines = {}
        self.shellcachelines = {}

    @staticmethod
    def _dedupe(known, cacheline):
        """Return the already known cache line equal to cacheline, if any.

        known maps hash values to cache lines. A stored line is only reused
        when its state really matches, so that two different lines whose
        hashes collide are never swapped for one another. Otherwise
        cacheline is remembered under its hash and returned.
        """
        h = hash(cacheline)
        existing = known.get(h)
        if existing is not None and existing.__getstate__() == cacheline.__getstate__():
            return existing
        known[h] = cacheline
        return cacheline

    def newPythonCacheLine(self, refs, execs, contains, extra):
        """Build a pythonCacheLine, sharing an identical earlier one if there is one."""
        return self._dedupe(self.pythoncachelines, pythonCacheLine(refs, execs, contains, extra))

    def newShellCacheLine(self, execs):
        """Build a shellCacheLine, sharing an identical earlier one if there is one."""
        return self._dedupe(self.shellcachelines, shellCacheLine(execs))

    def init_cache(self, cachedir):
        """Initialise the cache from cachedir unless the python cache already has entries."""
        # Check if we already have the caches
        if self.pythoncache:
            return

        MultiProcessCache.init_cache(self, cachedir)

        # cachedata gets re-assigned in the parent
        self.pythoncache = self.cachedata[0]
        self.shellcache = self.cachedata[1]

    def create_cachedata(self):
        """Return fresh, empty cache data: [python results, shell results]."""
        data = [{}, {}]
        return data
219
# The single CodeParserCache instance used by PythonParser and ShellParser
codeparsercache = CodeParserCache()
221
def parser_cache_init(cachedir):
    """Initialise the module's code parser cache from the directory cachedir."""
    codeparsercache.init_cache(cachedir)
224
def parser_cache_save():
    """Call save_extras() on the module's code parser cache.

    NOTE(review): save_extras() lives in MultiProcessCache; presumably it
    writes out the entries added since the cache was loaded - confirm there.
    """
    codeparsercache.save_extras()
227
def parser_cache_savemerge():
    """Call save_merge() on the module's code parser cache.

    NOTE(review): save_merge() lives in MultiProcessCache; presumably it
    merges the saved extras back into the main cache file - confirm there.
    """
    codeparsercache.save_merge()
230
Logger = logging.getLoggerClass()
class BufferedLogger(Logger):
    """Logger which holds on to its records until flush() is called.

    Records are appended to self.buffer instead of being emitted. flush()
    passes them to the target logger, skipping any record whose level the
    target is not enabled for.
    """
    def __init__(self, name, level=0, target=None):
        Logger.__init__(self, name)
        self.setLevel(level)
        self.buffer = []
        self.target = target

    def handle(self, record):
        # Keep the record for later rather than dispatching it to handlers
        self.buffer.append(record)

    def flush(self):
        """Forward the buffered records to the target and empty the buffer.

        Without a target (the constructor default) there is nowhere to send
        the records, so they are simply discarded.
        """
        target = self.target
        if target is not None:
            for record in self.buffer:
                if target.isEnabledFor(record.levelno):
                    target.handle(record)
        self.buffer = []
247
class DummyLogger():
    """Placeholder used by the parsers until a real BufferedLogger is needed."""

    def flush(self):
        """Do nothing; there is never anything buffered."""
        return None
251
class PythonParser():
    """Gather the dependencies of a piece of python code.

    After parse_python() the results are available as:
      references - variable names (and "VAR[flag]" entries) the code reads
      execs      - names of the functions the code calls
      var_execs  - function names passed to bb.build.exec_func/exec_task
      contains   - variable name -> set of values it is tested against
      extra      - hash of the parsed text when a fixedhash was used, else None
    """
    getvars = (".getVar", ".appendVar", ".prependVar", "oe.utils.conditional")
    getvarflags = (".getVarFlag", ".appendVarFlag", ".prependVarFlag")
    containsfuncs = ("bb.utils.contains", "base_contains")
    containsanyfuncs = ("bb.utils.contains_any",  "bb.utils.filter")
    execfuncs = ("bb.build.exec_func", "bb.build.exec_task")

    def warn(self, func, arg):
        """Warn about calls of bitbake APIs which pass a non-literal
        argument for the variable name, as we're not able to track such
        a reference.
        """

        try:
            funcstr = codegen.to_source(func)
            argstr = codegen.to_source(arg)
        except TypeError:
            self.log.debug2('Failed to convert function and argument to source form')
        else:
            self.log.debug(self.unhandled_message % (funcstr, argstr))

    def visit_Call(self, node):
        """Record what a single ast.Call node tells us about dependencies.

        Calls which lack the positional argument a branch needs are ignored
        rather than being allowed to raise IndexError.
        """
        name = self.called_node_name(node.func)
        # First and second positional argument, None where missing
        args = node.args
        first = args[0] if args else None
        second = args[1] if len(args) > 1 else None

        if name and name in modulecode_deps and modulecode_deps[name][5]:
            # The called module function supplies its own visitor code
            visitorcode = modulecode_deps[name][5]
            contains, execs, warn = visitorcode(name, node.args)
            for i in contains:
                self.contains[i] = contains[i]
            self.execs |= execs
            if warn:
                self.warn(node.func, warn)
        elif name and (name.endswith(self.getvars) or name.endswith(self.getvarflags) or name in self.containsfuncs or name in self.containsanyfuncs):
            if first is None:
                # No positional variable name, nothing to track or warn about
                pass
            elif isinstance(first, ast.Constant) and isinstance(first.value, str):
                varname = first.value
                if name in self.containsfuncs and isinstance(second, ast.Constant):
                    self.contains.setdefault(varname, set()).add(second.value)
                elif name in self.containsanyfuncs and isinstance(second, ast.Constant) and isinstance(second.value, str):
                    self.contains.setdefault(varname, set()).update(second.value.split())
                elif name.endswith(self.getvarflags):
                    if isinstance(second, ast.Constant):
                        self.references.add('%s[%s]' % (varname, second.value))
                    elif second is not None:
                        self.warn(node.func, second)
                else:
                    self.references.add(varname)
            else:
                self.warn(node.func, first)
        elif name and name.endswith(".expand"):
            if isinstance(first, ast.Constant):
                value = first.value
                d = bb.data.init()
                parser = d.expandWithRefs(value, self.name)
                self.references |= parser.references
                self.execs |= parser.execs
                for varname in parser.contains:
                    self.contains.setdefault(varname, set())
                    self.contains[varname] |= parser.contains[varname]
        elif name in self.execfuncs:
            if isinstance(first, ast.Constant):
                self.var_execs.add(first.value)
            elif first is not None:
                self.warn(node.func, first)
        elif name and isinstance(node.func, (ast.Name, ast.Attribute)):
            self.execs.add(name)

    def called_node_name(self, node):
        """Given a called node, return its original string form

        Returns the dotted name for chains of attribute accesses ending in a
        plain name (e.g. "bb.utils.contains") and None for anything else.
        """
        components = []
        while node:
            if isinstance(node, ast.Attribute):
                components.append(node.attr)
                node = node.value
            elif isinstance(node, ast.Name):
                components.append(node.id)
                return '.'.join(reversed(components))
            else:
                break
        return None

    def __init__(self, name, log):
        self.name = name
        self.var_execs = set()
        self.contains = {}
        self.execs = set()
        self.references = set()
        # Defined up front so it can be read even if parse_python() returns
        # without having anything to parse
        self.extra = None
        self._log = log
        # Defer init as expensive
        self.log = DummyLogger()

        self.unhandled_message = "in call of %s, argument '%s' is not a string literal"
        self.unhandled_message = "while parsing %s, %s" % (name, self.unhandled_message)

    def _load_cacheline(self, cacheline):
        """Fill in the results from a cache line, copying its shared frozen sets."""
        self.references = set(cacheline.refs)
        self.execs = set(cacheline.execs)
        self.contains = {}
        for i in cacheline.contains:
            self.contains[i] = set(cacheline.contains[i])
        self.extra = cacheline.extra

    # For the python module code it is expensive to have the function text so it is
    # uses a different fixedhash to cache against. We can take the hit on obtaining the
    # text if it isn't in the cache.
    def parse_python(self, node, lineno=0, filename="<string>", fixedhash=None, func=None):
        """Parse the python code in the string node and record its dependencies.

        lineno and filename position the code for compile(). fixedhash, when
        given, is used as the cache key instead of a hash of the text, and
        node may then be None to perform a cache lookup only. func is an
        optional function object whose bb_vardeps / bb_vardepsexclude
        attributes are applied to the references.

        Raises KeyError if fixedhash is given without any text and the key is
        not in the cache.
        """
        if not fixedhash and (not node or not node.strip()):
            return

        if fixedhash:
            h = fixedhash
        else:
            h = bbhash(str(node))

        for cache in (codeparsercache.pythoncache, codeparsercache.pythoncacheextras):
            if h in cache:
                self._load_cacheline(cache[h])
                return

        if fixedhash and not node:
            raise KeyError

        # Need to parse so take the hit on the real log buffer
        self.log = BufferedLogger('BitBake.Data.PythonParser', logging.DEBUG, self._log)

        # We can't add to the linenumbers for compile, we can pad to the correct number of blank lines though
        node = "\n" * int(lineno) + node
        code = compile(check_indent(str(node)), filename, "exec",
                       ast.PyCF_ONLY_AST)

        for n in ast.walk(code):
            if isinstance(n, ast.Call):
                self.visit_Call(n)

        if func is not None:
            self.references |= getattr(func, "bb_vardeps", set())
            self.references -= getattr(func, "bb_vardepsexclude", set())

        self.execs.update(self.var_execs)
        self.extra = None
        if fixedhash:
            # Hash of the (line padded) text which was actually parsed
            self.extra = bbhash(str(node))

        codeparsercache.pythoncacheextras[h] = codeparsercache.newPythonCacheLine(self.references, self.execs, self.contains, self.extra)
403
class ShellParser():
    """Gather the commands executed by a piece of shell code.

    After parse_shell(), execs holds the names of the commands the code runs,
    leaving out the shell functions it defines itself.
    """
    def __init__(self, name, log):
        self.funcdefs = set()
        self.allexecs = set()
        self.execs = set()
        self._name = name
        self._log = log
        # Defer init as expensive
        self.log = DummyLogger()

        self.unhandled_template = "unable to handle non-literal command '%s'"
        self.unhandled_template = "while parsing %s, %s" % (name, self.unhandled_template)

    def parse_shell(self, value):
        """Parse the supplied shell code in a string, returning the external
        commands it executes.
        """

        h = bbhash(str(value))

        for cache in (codeparsercache.shellcache, codeparsercache.shellcacheextras):
            if h in cache:
                self.execs = set(cache[h].execs)
                return self.execs

        # Need to parse so take the hit on the real log buffer
        self.log = BufferedLogger('BitBake.Data.%s' % self._name, logging.DEBUG, self._log)

        self._parse_shell(value)
        self.execs = set(cmd for cmd in self.allexecs if cmd not in self.funcdefs)

        codeparsercache.shellcacheextras[h] = codeparsercache.newShellCacheLine(self.execs)

        return self.execs

    def _parse_shell(self, value):
        """Parse value with pyshyacc and process the resulting tokens.

        A parse failure is reported together with the last five lines of the
        code and then re-raised.
        """
        try:
            tokens, _ = pyshyacc.parse(value, eof=True, debug=False)
        except Exception:
            bb.error('Error during parse shell code, the last 5 lines are:\n%s' % '\n'.join(value.split('\n')[-5:]))
            raise

        self.process_tokens(tokens)

    def process_tokens(self, tokens):
        """Process a supplied portion of the syntax tree as returned by
        pyshyacc.parse.
        """

        # Every handler returns a (more_tokens, words) pair: nested tokens to
        # descend into and words to hand to process_words(). Either may be
        # None.

        def function_definition(value):
            self.funcdefs.add(value.name)
            return [value.body], None

        def case_clause(value):
            # Element 0 of each item in the case is the list of patterns, and
            # Element 1 of each item in the case is the list of commands to be
            # executed when that pattern matches.
            words = chain(*[item[0] for item in value.items])
            cmds  = chain(*[item[1] for item in value.items])
            return cmds, words

        def if_clause(value):
            # Returns only the chained tokens; the lambda in token_handlers
            # turns that into the (more_tokens, words) pair
            main = chain(value.cond, value.if_cmds)
            rest = value.else_cmds
            if isinstance(rest, tuple) and rest[0] == "elif":
                return chain(main, if_clause(rest[1]))
            else:
                return chain(main, rest)

        def simple_command(value):
            return None, chain(value.words, (assign[1] for assign in value.assigns))

        token_handlers = {
            "and_or": lambda x: ((x.left, x.right), None),
            "async": lambda x: ([x], None),
            "brace_group": lambda x: (x.cmds, None),
            "for_clause": lambda x: (x.cmds, x.items),
            "function_definition": function_definition,
            "if_clause": lambda x: (if_clause(x), None),
            "pipeline": lambda x: (x.commands, None),
            "redirect_list": lambda x: ([x.cmd], None),
            "subshell": lambda x: (x.cmds, None),
            "while_clause": lambda x: (chain(x.condition, x.cmds), None),
            "until_clause": lambda x: (chain(x.condition, x.cmds), None),
            "simple_command": simple_command,
            "case_clause": case_clause,
        }

        def process_token_list(tokens):
            for token in tokens:
                if isinstance(token, list):
                    process_token_list(token)
                    continue
                name, value = token
                try:
                    more_tokens, words = token_handlers[name](value)
                except KeyError:
                    raise NotImplementedError("Unsupported token type " + name)

                if more_tokens:
                    self.process_tokens(more_tokens)

                if words:
                    self.process_words(words)

        process_token_list(tokens)

    def process_words(self, words):
        """Process a set of 'words' in pyshyacc parlance, which includes
        extraction of executed commands from $() blocks, as well as grabbing
        the command name argument.
        """

        words = list(words)
        # Iterate over a snapshot: command words holding a substitution are
        # removed from 'words' below, and removing from the list being
        # iterated would skip the word which follows (so a $() in it would
        # never get parsed).
        # NOTE(review): this lets such input yield additional execs, so
        # CodeParserCache.CACHE_VERSION wants bumping when this lands.
        for word in list(words):
            wtree = pyshlex.make_wordtree(word[1])
            for part in wtree:
                if not isinstance(part, list):
                    continue

                candidates = [part]

                # If command is of type:
                #
                #   var="... $(cmd [...]) ..."
                #
                # Then iterate on what's between the quotes and if we find a
                # list, make that what we check for below.
                if len(part) >= 3 and part[0] == '"':
                    for p in part[1:-1]:
                        if isinstance(p, list):
                            candidates.append(p)

                for candidate in candidates:
                    if len(candidate) >= 2:
                        if candidate[0] in ('`', '$('):
                            command = pyshlex.wordtree_as_string(candidate[1:-1])
                            self._parse_shell(command)

                            if word[0] in ("cmd_name", "cmd_word"):
                                # The word may already have been removed for
                                # an earlier substitution
                                if word in words:
                                    words.remove(word)

        # Find the command name: the first command word which is not an
        # assignment, or the first TOKEN once an assignment has been seen
        usetoken = False
        for word in words:
            if word[0] in ("cmd_name", "cmd_word") or \
               (usetoken and word[0] == "TOKEN"):
                if "=" in word[1]:
                    usetoken = True
                    continue

                cmd = word[1]
                if cmd.startswith("$"):
                    self.log.debug(self.unhandled_template % cmd)
                elif cmd == "eval":
                    # The arguments of eval are shell code themselves
                    command = " ".join(word for _, word in words[1:])
                    self._parse_shell(command)
                else:
                    self.allexecs.add(cmd)
                break
567