xref: /openbmc/openbmc/poky/bitbake/lib/bb/codeparser.py (revision c124f4f2e04dca16a428a76c89677328bc7bf908)
1#
2# Copyright BitBake Contributors
3#
4# SPDX-License-Identifier: GPL-2.0-only
5#
6
7"""
8BitBake code parser
9
10Parses actual code (i.e. python and shell) for functions and in-line
11expressions. Used mainly to determine dependencies on other functions
12and variables within the BitBake metadata. Also provides a cache for
13this information in order to speed up processing.
14
15(Not to be confused with the code that parses the metadata itself,
16see lib/bb/parse/ for that).
17
18NOTE: if you change how the parsers gather information you will almost
19certainly need to increment CodeParserCache.CACHE_VERSION below so that
20any existing codeparser cache gets invalidated. Additionally you'll need
21to increment __cache_version__ in cache.py in order to ensure that old
22recipe caches don't trigger "Taskhash mismatch" errors.
23
24"""
25
26import ast
27import sys
28import codegen
29import logging
30import inspect
31import bb.pysh as pysh
32import bb.utils, bb.data
33import hashlib
34from itertools import chain
35from bb.pysh import pyshyacc, pyshlex
36from bb.cache import MultiProcessCache
37
38logger = logging.getLogger('BitBake.CodeParser')
39
def bbhash(s):
    """Return the hexadecimal SHA-256 digest of the string s, encoded as UTF-8."""
    digest = hashlib.sha256()
    digest.update(s.encode("utf-8"))
    return digest.hexdigest()
42
def check_indent(codestr):
    """If the code is indented, add a top level piece of code to 'remove' the indentation

    The leading run of newlines, tabs and spaces is inspected. When the
    character directly before the first real character is a tab or a space
    (i.e. the first code line is indented) the text is wrapped in an
    "if 1:" block so that it compiles. Otherwise the string is returned
    unchanged.

    Empty or whitespace-only input is returned unchanged as well: there is
    no code to re-indent, and wrapping it would produce an "if 1:" with no
    body.
    """

    body = codestr.lstrip("\n\t ")
    if not body:
        # Nothing but whitespace (or nothing at all); scanning for the first
        # code character would run off the end of the string.
        return codestr

    # Index of the first character that is not a newline, tab or space
    i = len(codestr) - len(body)
    if i == 0:
        return codestr

    if codestr[i - 1] in ("\t", " "):
        if codestr[0] == "\n":
            # Since we're adding a line, we need to remove one line of any empty padding
            # to ensure line numbers are correct
            codestr = codestr[1:]
        return "if 1:\n" + codestr

    return codestr
61
# Dependency data for python module functions, filled in by
# add_module_functions(). Keys are "<namespace>.<function name>"; each value
# is a six element list:
#   [references, execs, var_execs, contains, extra, visitorcode]
# where visitorcode is the function's 'visitorcode' attribute or None.
modulecode_deps = {}
63
def add_module_functions(fn, functions, namespace):
    """Record dependency data for the python functions of module file fn.

    fn is the path of the module's source file, functions maps function
    names to function objects and namespace is the prefix used to build the
    "<namespace>.<name>" keys stored in modulecode_deps.

    Each function is looked up in the codeparser cache under a key built
    from the file's path, size and mtime plus the function name, so the
    source text only has to be fetched on a cache miss. Functions whose
    source lives in a different file are skipped.
    """
    import os
    st = os.stat(fn)
    # Changes whenever the file's size or modification time changes
    fixedhash = ":".join((fn, str(st.st_size), str(st.st_mtime)))

    for f in functions:
        func = functions[f]
        name = "%s.%s" % (namespace, f)
        cachekey = fixedhash + f
        parser = PythonParser(name, logger)
        try:
            # No source passed: succeeds only if the key is already cached
            parser.parse_python(None, filename=fn, lineno=1, fixedhash=cachekey)
        except KeyError:
            # Not cached, so the real source text is needed
            if inspect.getsourcefile(func) != fn:
                # Skip references to other modules outside this file
                continue
            srclines, lineno = inspect.getsourcelines(func)
            parser.parse_python("".join(srclines), filename=fn, lineno=lineno, fixedhash=cachekey)

        # Expand internal module exec references
        execs = parser.execs.copy()
        for called in parser.execs:
            if called in functions:
                execs.remove(called)
                execs.add(namespace + "." + called)

        visitorcode = getattr(func, "visitorcode", None)
        modulecode_deps[name] = [
            parser.references.copy(),
            execs,
            parser.var_execs.copy(),
            parser.contains.copy(),
            parser.extra,
            visitorcode,
        ]
95
def update_module_dependencies(d):
    """Apply each module function's 'vardepsexclude' flag to modulecode_deps.

    For every entry, the whitespace separated names in the flag (read from
    datastore d) are removed from the entry's references, execs and
    var_execs sets. The remaining three fields are carried over untouched.
    Entries without any excludes are left as they are.
    """
    for mod, deps in modulecode_deps.items():
        flagvalue = d.getVarFlag(mod, "vardepsexclude") or ""
        excludes = set(flagvalue.split())
        if not excludes:
            continue
        references = deps[0] - excludes
        execs = deps[1] - excludes
        var_execs = deps[2] - excludes
        # Replacing the value of an existing key is safe while iterating
        modulecode_deps[mod] = [references, execs, var_execs, deps[3], deps[4], deps[5]]
101
102# A custom getstate/setstate using tuples is actually worth 15% cachesize by
103# avoiding duplication of the attribute names!
class SetCache(object):
    """Intern table for frozensets of strings.

    Equal sets handed to internSet() all come back as one shared frozenset
    object, so identical dependency sets are stored only once.
    """
    def __init__(self):
        # Maps each interned frozenset to itself. Keying on the set (rather
        # than on its bare hash value) lets the dict fall back to an
        # equality check, so two different sets which happen to share a
        # hash can never be mistaken for one another.
        self.setcache = {}

    def internSet(self, items):
        """Return the canonical frozenset holding the strings in items.

        Every string is passed through sys.intern() first (so items must
        contain only str objects). If an equal frozenset has been seen
        before, that earlier object is returned; otherwise the new one is
        remembered and returned.
        """
        s = frozenset(sys.intern(i) for i in items)
        return self.setcache.setdefault(s, s)
119
# Module-wide SetCache instance; pythonCacheLine and shellCacheLine pass their
# sets through codecache.internSet().
codecache = SetCache()
121
class pythonCacheLine(object):
    """Cached result of parsing one piece of python code.

    Holds the referenced names (refs), the executed names (execs), the
    per-variable 'contains' values and an opaque 'extra' value. All sets
    are interned through codecache so equal sets are shared.
    """
    def __init__(self, refs, execs, contains, extra):
        intern_set = codecache.internSet
        self.refs = intern_set(refs)
        self.execs = intern_set(execs)
        self.contains = {var: intern_set(values) for var, values in contains.items()}
        self.extra = extra

    def __getstate__(self):
        # Pickle as a plain tuple rather than an attribute dict
        return (self.refs, self.execs, self.contains, self.extra)

    def __setstate__(self, state):
        # Rebuild through __init__ so the restored sets get interned again
        refs, execs, contains, extra = state
        self.__init__(refs, execs, contains, extra)

    def __hash__(self):
        # Fold in the contains entries in sorted key order so the result
        # does not depend on dict ordering
        parts = [hash(self.refs), hash(self.execs), hash(self.extra)]
        for var in sorted(self.contains):
            parts.extend((var, hash(self.contains[var])))
        return hash(tuple(parts))

    def __repr__(self):
        return " ".join(str(field) for field in (self.refs, self.execs, self.contains))
144
145
class shellCacheLine(object):
    """Cached result of parsing one piece of shell code: the set of
    commands it executes, interned through codecache."""
    def __init__(self, execs):
        self.execs = codecache.internSet(execs)

    def __getstate__(self):
        # The pickled state is the execs set itself (not wrapped in a tuple)
        return self.execs

    def __setstate__(self, state):
        # Go through __init__ so the restored set is interned again
        self.__init__(state)

    def __hash__(self):
        return hash(self.execs)

    def __repr__(self):
        return str(self.execs)
160
class CodeParserCache(MultiProcessCache):
    """Cache of python and shell parse results, built on MultiProcessCache.

    Slot 0 of the cache data holds python cache lines and slot 1 holds
    shell cache lines, each keyed by the hash of the parsed code. The
    matching '*extras' dicts are where the parsers store results that
    were not found in the main dicts.
    """
    cache_file_name = "bb_codeparser.dat"
    # NOTE: you must increment this if you change how the parsers gather information,
    # so that an existing cache gets invalidated. Additionally you'll need
    # to increment __cache_version__ in cache.py in order to ensure that old
    # recipe caches don't trigger "Taskhash mismatch" errors.
    CACHE_VERSION = 14

    def __init__(self):
        super().__init__()
        main, extras = self.cachedata, self.cachedata_extras
        self.pythoncache = main[0]
        self.shellcache = main[1]
        self.pythoncacheextras = extras[0]
        self.shellcacheextras = extras[1]

        # To avoid duplication in the codeparser cache, keep
        # a lookup of hashes of objects we already have
        self.pythoncachelines = {}
        self.shellcachelines = {}

    def newPythonCacheLine(self, refs, execs, contains, extra):
        """Return a pythonCacheLine for the given data, reusing an earlier
        instance with the same hash when there is one."""
        candidate = pythonCacheLine(refs, execs, contains, extra)
        return self.pythoncachelines.setdefault(hash(candidate), candidate)

    def newShellCacheLine(self, execs):
        """Return a shellCacheLine for execs, reusing an earlier instance
        with the same hash when there is one."""
        candidate = shellCacheLine(execs)
        return self.shellcachelines.setdefault(hash(candidate), candidate)

    def init_cache(self, cachedir):
        """Initialise the cache from cachedir unless the python cache
        already has content."""
        if not self.pythoncache:
            super().init_cache(cachedir)

            # cachedata gets re-assigned in the parent
            self.pythoncache = self.cachedata[0]
            self.shellcache = self.cachedata[1]

    def create_cachedata(self):
        """Return empty cache data: one dict for python, one for shell."""
        return [{}, {}]
211
# Module-wide CodeParserCache instance; PythonParser and ShellParser look up
# and store their parse results here.
codeparsercache = CodeParserCache()
213
def parser_cache_init(cachedir):
    """Initialise the module-wide codeparsercache from cachedir.

    Delegates to CodeParserCache.init_cache(), which does nothing when the
    python cache already has content.
    """
    codeparsercache.init_cache(cachedir)
216
def parser_cache_save():
    """Call save_extras() on the module-wide codeparsercache.

    save_extras() is inherited from MultiProcessCache and is not shown in
    this file; presumably it persists the entries held in the '*extras'
    dicts -- confirm against bb.cache.
    """
    codeparsercache.save_extras()
219
def parser_cache_savemerge():
    """Call save_merge() on the module-wide codeparsercache.

    save_merge() is inherited from MultiProcessCache and is not shown in
    this file; presumably it merges previously saved extras into the main
    cache file -- confirm against bb.cache.
    """
    codeparsercache.save_merge()
222
Logger = logging.getLoggerClass()
class BufferedLogger(Logger):
    """Logger which stores records instead of emitting them.

    Records are collected in self.buffer by handle() and only passed on to
    the 'target' logger when flush() is called.
    """
    def __init__(self, name, level=0, target=None):
        """Create the logger.

        name and level are as for a normal logger; target is the logger
        which flush() forwards buffered records to (may be None, and may
        be assigned later through the 'target' attribute).
        """
        Logger.__init__(self, name)
        self.setLevel(level)
        self.buffer = []
        self.target = target

    def handle(self, record):
        """Store the record for later instead of dispatching it to handlers."""
        self.buffer.append(record)

    def flush(self):
        """Forward buffered records to the target and empty the buffer.

        Only records whose level is enabled on the target are forwarded.
        When no target is set the records are kept buffered (the same
        approach logging.handlers.MemoryHandler takes) instead of failing
        with an AttributeError on None.
        """
        target = self.target
        if target is None:
            return
        for record in self.buffer:
            if target.isEnabledFor(record.levelno):
                target.handle(record)
        self.buffer = []
239
class DummyLogger():
    """Stand-in used as a parser's 'log' attribute until a real parse is
    needed; it only provides a flush() which does nothing."""
    def flush(self):
        # Nothing is ever buffered here, so there is nothing to forward
        return
243
class PythonParser():
    """Find the variables and functions a piece of python code depends on.

    After parse_python() the results are available as:
      references - variable names (and "var[flag]" names) accessed through
                   the datastore style calls listed in getvars/getvarflags
      execs      - names of the functions called
      var_execs  - literal names passed to the calls listed in execfuncs
      contains   - dict of variable name -> set of values tested through
                   the calls listed in containsfuncs/containsanyfuncs
      extra      - hash of the parsed source when a fixedhash was used,
                   otherwise None
    """
    getvars = (".getVar", ".appendVar", ".prependVar", "oe.utils.conditional")
    getvarflags = (".getVarFlag", ".appendVarFlag", ".prependVarFlag")
    containsfuncs = ("bb.utils.contains", "base_contains")
    containsanyfuncs = ("bb.utils.contains_any",  "bb.utils.filter")
    execfuncs = ("bb.build.exec_func", "bb.build.exec_task")

    def warn(self, func, arg):
        """Warn about calls of bitbake APIs which pass a non-literal
        argument for the variable name, as we're not able to track such
        a reference.
        """

        try:
            funcstr = codegen.to_source(func)
            argstr = codegen.to_source(arg)
        except TypeError:
            self.log.debug2('Failed to convert function and argument to source form')
        else:
            self.log.debug(self.unhandled_message % (funcstr, argstr))

    def _warn_missing_arg(self, name):
        """Log that the call of 'name' has no positional argument to inspect."""
        self.log.debug("while parsing %s, in call of %s, no positional argument to inspect" % (self.name, name))

    def visit_Call(self, node):
        """Record the dependencies implied by a single ast.Call node.

        Calls which lack the positional argument(s) this method needs to
        look at (for example keyword-only calls) are logged at debug level
        and otherwise ignored, since nothing can be tracked for them.
        """
        name = self.called_node_name(node.func)
        args = node.args
        if name and name in modulecode_deps and modulecode_deps[name][5]:
            # The called module function supplies its own visitor code
            visitorcode = modulecode_deps[name][5]
            contains, execs, warn = visitorcode(name, node.args)
            for i in contains:
                self.contains[i] = contains[i]
            self.execs |= execs
            if warn:
                self.warn(node.func, warn)
        elif name and (name.endswith(self.getvars) or name.endswith(self.getvarflags) or name in self.containsfuncs or name in self.containsanyfuncs):
            if not args:
                self._warn_missing_arg(name)
            elif isinstance(args[0], ast.Constant) and isinstance(args[0].value, str):
                varname = args[0].value
                # The second positional argument is optional from the
                # parser's point of view; None means it was not given
                second = args[1] if len(args) > 1 else None
                if name in self.containsfuncs and isinstance(second, ast.Constant):
                    self.contains.setdefault(varname, set()).add(second.value)
                elif name in self.containsanyfuncs and isinstance(second, ast.Constant):
                    self.contains.setdefault(varname, set()).update(second.value.split())
                elif name.endswith(self.getvarflags):
                    if isinstance(second, ast.Constant):
                        self.references.add('%s[%s]' % (varname, second.value))
                    elif second is None:
                        self._warn_missing_arg(name)
                    else:
                        self.warn(node.func, second)
                else:
                    self.references.add(varname)
            else:
                self.warn(node.func, args[0])
        elif name and name.endswith(".expand"):
            if args and isinstance(args[0], ast.Constant):
                # Expand the literal in an empty datastore purely to collect
                # what the expression refers to
                value = args[0].value
                d = bb.data.init()
                parser = d.expandWithRefs(value, self.name)
                self.references |= parser.references
                self.execs |= parser.execs
                for varname in parser.contains:
                    self.contains.setdefault(varname, set())
                    self.contains[varname] |= parser.contains[varname]
        elif name in self.execfuncs:
            if not args:
                self._warn_missing_arg(name)
            elif isinstance(args[0], ast.Constant):
                self.var_execs.add(args[0].value)
            else:
                self.warn(node.func, args[0])
        elif name and isinstance(node.func, (ast.Name, ast.Attribute)):
            self.execs.add(name)

    def called_node_name(self, node):
        """Given a called node, return its original string form

        Returns the dotted name for a chain of attribute accesses ending
        in a plain name (e.g. "a.b.c"), or None for anything else.
        """
        components = []
        while node:
            if isinstance(node, ast.Attribute):
                components.append(node.attr)
                node = node.value
            elif isinstance(node, ast.Name):
                components.append(node.id)
                return '.'.join(reversed(components))
            else:
                break

    def __init__(self, name, log):
        """name identifies the code being parsed (used in messages); log is
        the logger which buffered messages are eventually forwarded to."""
        self.name = name
        self.var_execs = set()
        self.contains = {}
        self.execs = set()
        self.references = set()
        # Defined up front so the attribute exists even when parse_python()
        # returns early without parsing anything
        self.extra = None
        self._log = log
        # Defer init as expensive
        self.log = DummyLogger()

        self.unhandled_message = "in call of %s, argument '%s' is not a string literal"
        self.unhandled_message = "while parsing %s, %s" % (name, self.unhandled_message)

    def _load_cacheline(self, cacheline):
        """Copy the results stored in a cache line into this parser."""
        self.references = set(cacheline.refs)
        self.execs = set(cacheline.execs)
        self.contains = {var: set(values) for var, values in cacheline.contains.items()}
        self.extra = cacheline.extra

    # For the python module code it is expensive to obtain the function text, so
    # a different fixedhash is used to cache against. We can take the hit on
    # obtaining the text if it isn't in the cache.
    def parse_python(self, node, lineno=0, filename="<string>", fixedhash=None):
        """Parse the python source text in node and collect its dependencies.

        node      - the source text (may be None when fixedhash is given, to
                    ask for a cache-only lookup)
        lineno    - number of blank lines to pad the source with so that
                    reported line numbers match the original file
        filename  - file name handed to compile()
        fixedhash - cache key to use instead of the hash of the source

        Raises KeyError when fixedhash is given without source text and the
        key is not in the cache.
        """
        if not fixedhash and (not node or not node.strip()):
            return

        h = fixedhash if fixedhash else bbhash(str(node))

        for cache in (codeparsercache.pythoncache, codeparsercache.pythoncacheextras):
            if h in cache:
                self._load_cacheline(cache[h])
                return

        if fixedhash and not node:
            raise KeyError

        # Need to parse so take the hit on the real log buffer
        self.log = BufferedLogger('BitBake.Data.PythonParser', logging.DEBUG, self._log)

        # We can't add to the linenumbers for compile, we can pad to the correct number of blank lines though
        node = "\n" * int(lineno) + node
        code = compile(check_indent(str(node)), filename, "exec",
                       ast.PyCF_ONLY_AST)

        for n in ast.walk(code):
            if isinstance(n, ast.Call):
                self.visit_Call(n)

        self.execs.update(self.var_execs)
        self.extra = None
        if fixedhash:
            # Hash of the (padded) source actually parsed for this key
            self.extra = bbhash(str(node))

        codeparsercache.pythoncacheextras[h] = codeparsercache.newPythonCacheLine(self.references, self.execs, self.contains, self.extra)
391
class ShellParser():
    """Find the commands executed by a piece of shell code.

    parse_shell() is the entry point; it returns (and stores in self.execs)
    the commands called by the code, leaving out functions the code defines
    itself. Results are cached in codeparsercache keyed by the hash of the
    code.
    """
    def __init__(self, name, log):
        """name is used in log messages and in the buffered logger's name;
        log is the logger which buffered messages are forwarded to."""
        # Names of shell functions defined by the parsed code
        self.funcdefs = set()
        # Every command name seen, including calls to locally defined functions
        self.allexecs = set()
        # allexecs minus funcdefs, computed by parse_shell()
        self.execs = set()
        self._name = name
        self._log = log
        # Defer init as expensive
        self.log = DummyLogger()

        self.unhandled_template = "unable to handle non-literal command '%s'"
        self.unhandled_template = "while parsing %s, %s" % (name, self.unhandled_template)

    def parse_shell(self, value):
        """Parse the supplied shell code in a string, returning the external
        commands it executes.
        """

        h = bbhash(str(value))

        # Cache hits return a fresh set copied from the cache line
        if h in codeparsercache.shellcache:
            self.execs = set(codeparsercache.shellcache[h].execs)
            return self.execs

        if h in codeparsercache.shellcacheextras:
            self.execs = set(codeparsercache.shellcacheextras[h].execs)
            return self.execs

        # Need to parse so take the hit on the real log buffer
        self.log = BufferedLogger('BitBake.Data.%s' % self._name, logging.DEBUG, self._log)

        self._parse_shell(value)
        # Calls to functions defined within the code are not external commands
        self.execs = set(cmd for cmd in self.allexecs if cmd not in self.funcdefs)

        codeparsercache.shellcacheextras[h] = codeparsercache.newShellCacheLine(self.execs)

        return self.execs

    def _parse_shell(self, value):
        """Parse value with pyshyacc and process the resulting tokens.

        Does not consult or update the cache. On a parse failure the last
        five lines of the code are reported through bb.error and the
        exception is re-raised.
        """
        try:
            tokens, _ = pyshyacc.parse(value, eof=True, debug=False)
        except Exception:
            bb.error('Error during parse shell code, the last 5 lines are:\n%s' % '\n'.join(value.split('\n')[-5:]))
            raise

        self.process_tokens(tokens)

    def process_tokens(self, tokens):
        """Process a supplied portion of the syntax tree as returned by
        pyshyacc.parse.
        """

        # Each handler below takes a token's value and returns a pair
        # (more_tokens, words): nested tokens to recurse into and words to
        # hand to process_words(); either may be None.

        def function_definition(value):
            # Remember the function name so calls to it can be filtered out later
            self.funcdefs.add(value.name)
            return [value.body], None

        def case_clause(value):
            # Element 0 of each item in the case is the list of patterns, and
            # Element 1 of each item in the case is the list of commands to be
            # executed when that pattern matches.
            words = chain(*[item[0] for item in value.items])
            cmds  = chain(*[item[1] for item in value.items])
            return cmds, words

        def if_clause(value):
            # Returns a single iterable of tokens (the handler table entry
            # pairs it with None); an "elif" is followed recursively
            main = chain(value.cond, value.if_cmds)
            rest = value.else_cmds
            if isinstance(rest, tuple) and rest[0] == "elif":
                return chain(main, if_clause(rest[1]))
            else:
                return chain(main, rest)

        def simple_command(value):
            # The command's words plus the value side of each assignment
            return None, chain(value.words, (assign[1] for assign in value.assigns))

        # Token type name -> handler
        token_handlers = {
            "and_or": lambda x: ((x.left, x.right), None),
            "async": lambda x: ([x], None),
            "brace_group": lambda x: (x.cmds, None),
            "for_clause": lambda x: (x.cmds, x.items),
            "function_definition": function_definition,
            "if_clause": lambda x: (if_clause(x), None),
            "pipeline": lambda x: (x.commands, None),
            "redirect_list": lambda x: ([x.cmd], None),
            "subshell": lambda x: (x.cmds, None),
            "while_clause": lambda x: (chain(x.condition, x.cmds), None),
            "until_clause": lambda x: (chain(x.condition, x.cmds), None),
            "simple_command": simple_command,
            "case_clause": case_clause,
        }

        def process_token_list(tokens):
            for token in tokens:
                # Nested lists are walked in place
                if isinstance(token, list):
                    process_token_list(token)
                    continue
                name, value = token
                try:
                    more_tokens, words = token_handlers[name](value)
                except KeyError:
                    raise NotImplementedError("Unsupported token type " + name)

                if more_tokens:
                    self.process_tokens(more_tokens)

                if words:
                    self.process_words(words)

        process_token_list(tokens)

    def process_words(self, words):
        """Process a set of 'words' in pyshyacc parlance, which includes
        extraction of executed commands from $() blocks, as well as grabbing
        the command name argument.
        """

        # First pass: parse the contents of any `...` or $(...) substitution
        # found in the words.
        # NOTE(review): 'words' is modified (words.remove) while this loop
        # iterates over it, so the word following a removed one is skipped
        # by this pass. Confirm whether that is intended; changing it would
        # alter the parse results and so need a CACHE_VERSION bump.
        words = list(words)
        for word in words:
            wtree = pyshlex.make_wordtree(word[1])
            for part in wtree:
                if not isinstance(part, list):
                    continue

                candidates = [part]

                # If command is of type:
                #
                #   var="... $(cmd [...]) ..."
                #
                # Then iterate on what's between the quotes and if we find a
                # list, make that what we check for below.
                if len(part) >= 3 and part[0] == '"':
                    for p in part[1:-1]:
                        if isinstance(p, list):
                            candidates.append(p)

                for candidate in candidates:
                    if len(candidate) >= 2:
                        if candidate[0] in ('`', '$('):
                            command = pyshlex.wordtree_as_string(candidate[1:-1])
                            self._parse_shell(command)

                            # A command word which is itself a substitution
                            # is dropped so the second pass does not treat
                            # it as a command name
                            if word[0] in ("cmd_name", "cmd_word"):
                                if word in words:
                                    words.remove(word)

        # Second pass: pick out the command being run. Only the first
        # matching word that does not contain "=" is considered (note the
        # break at the end of the loop body).
        usetoken = False
        for word in words:
            if word[0] in ("cmd_name", "cmd_word") or \
               (usetoken and word[0] == "TOKEN"):
                if "=" in word[1]:
                    # Skip this word and let a following TOKEN word be
                    # considered as the command instead
                    usetoken = True
                    continue

                cmd = word[1]
                if cmd.startswith("$"):
                    # Command name comes from a variable; cannot be tracked
                    self.log.debug(self.unhandled_template % cmd)
                elif cmd == "eval":
                    # Re-parse everything after the first word as shell code
                    command = " ".join(word for _, word in words[1:])
                    self._parse_shell(command)
                else:
                    self.allexecs.add(cmd)
                break
555