#
# Copyright BitBake Contributors
#
# SPDX-License-Identifier: GPL-2.0-only
#

"""
BitBake code parser

Parses actual code (i.e. python and shell) for functions and in-line
expressions. Used mainly to determine dependencies on other functions
and variables within the BitBake metadata. Also provides a cache for
this information in order to speed up processing.

(Not to be confused with the code that parses the metadata itself,
see lib/bb/parse/ for that).

NOTE: if you change how the parsers gather information you will almost
certainly need to increment CodeParserCache.CACHE_VERSION below so that
any existing codeparser cache gets invalidated. Additionally you'll need
to increment __cache_version__ in cache.py in order to ensure that old
recipe caches don't trigger "Taskhash mismatch" errors.

"""
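
# Illustrative usage sketch (an assumption for documentation purposes, not code
# that runs on import): callers typically feed task/function bodies to the
# parsers and read the resulting dependency sets, roughly like this:
#
#   pyparser = PythonParser("do_foo", logger)
#   pyparser.parse_python('d.getVar("BAR")')
#   # pyparser.references now contains "BAR"
#
#   shparser = ShellParser("do_compile", logger)
#   execs = shparser.parse_shell("oe_runmake install")
#   # execs now contains "oe_runmake"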

import ast
import sys
import codegen
import logging
import inspect
import bb.pysh as pysh
import bb.utils, bb.data
import hashlib
from itertools import chain
from bb.pysh import pyshyacc, pyshlex
from bb.cache import MultiProcessCache

logger = logging.getLogger('BitBake.CodeParser')

def bbhash(s):
    return hashlib.sha256(s.encode("utf-8")).hexdigest()

def check_indent(codestr):
    """If the code is indented, add a top level piece of code to 'remove' the indentation"""

    i = 0
    while codestr[i] in ["\n", "\t", " "]:
        i = i + 1

    if i == 0:
        return codestr

    if codestr[i-1] == "\t" or codestr[i-1] == " ":
        if codestr[0] == "\n":
            # Since we're adding a line, we need to remove one line of any empty padding
            # to ensure line numbers are correct
            codestr = codestr[1:]
        return "if 1:\n" + codestr

    return codestr
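
# For example (illustrative): check_indent("    x = 1\n") returns
# 'if 1:\n    x = 1\n', so the indented fragment compiles as-is.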

modulecode_deps = {}

def add_module_functions(fn, functions, namespace):
    import os
    fstat = os.stat(fn)
    fixedhash = fn + ":" + str(fstat.st_size) +  ":" + str(fstat.st_mtime)
    for f in functions:
        name = "%s.%s" % (namespace, f)
        parser = PythonParser(name, logger)
        try:
            parser.parse_python(None, filename=fn, lineno=1, fixedhash=fixedhash+f)
            #bb.warn("Cached %s" % f)
        except KeyError:
            lines, lineno = inspect.getsourcelines(functions[f])
            src = "".join(lines)
            parser.parse_python(src, filename=fn, lineno=lineno, fixedhash=fixedhash+f)
            #bb.warn("Not cached %s" % f)
        execs = parser.execs.copy()
        # Expand internal module exec references
        for e in parser.execs:
            if e in functions:
                execs.remove(e)
                execs.add(namespace + "." + e)
        modulecode_deps[name] = [parser.references.copy(), execs, parser.var_execs.copy(), parser.contains.copy()]
        #bb.warn("%s: %s\nRefs:%s Execs: %s %s %s" % (name, fn, parser.references, parser.execs, parser.var_execs, parser.contains))

def update_module_dependencies(d):
    for mod in modulecode_deps:
        excludes = set((d.getVarFlag(mod, "vardepsexclude") or "").split())
        if excludes:
            modulecode_deps[mod] = [modulecode_deps[mod][0] - excludes, modulecode_deps[mod][1] - excludes, modulecode_deps[mod][2] - excludes, modulecode_deps[mod][3]]
# A custom getstate/setstate using tuples saves around 15% of the cache size
# by avoiding duplication of the attribute names!
class SetCache(object):
    def __init__(self):
        self.setcache = {}

    def internSet(self, items):

        new = []
        for i in items:
            new.append(sys.intern(i))
        s = frozenset(new)
        h = hash(s)
        if h in self.setcache:
            return self.setcache[h]
        self.setcache[h] = s
        return s
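
    # Illustrative note: internSet(["A", "B"]) and internSet(["B", "A"]) return
    # the very same frozenset object, so equal sets are only stored once.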

codecache = SetCache()

class pythonCacheLine(object):
    def __init__(self, refs, execs, contains):
        self.refs = codecache.internSet(refs)
        self.execs = codecache.internSet(execs)
        self.contains = {}
        for c in contains:
            self.contains[c] = codecache.internSet(contains[c])

    def __getstate__(self):
        return (self.refs, self.execs, self.contains)

    def __setstate__(self, state):
        (refs, execs, contains) = state
        self.__init__(refs, execs, contains)
    def __hash__(self):
        l = (hash(self.refs), hash(self.execs))
        for c in sorted(self.contains.keys()):
            l = l + (c, hash(self.contains[c]))
        return hash(l)
    def __repr__(self):
        return " ".join([str(self.refs), str(self.execs), str(self.contains)])


class shellCacheLine(object):
    def __init__(self, execs):
        self.execs = codecache.internSet(execs)

    def __getstate__(self):
        return (self.execs)

    def __setstate__(self, state):
        (execs) = state
        self.__init__(execs)
    def __hash__(self):
        return hash(self.execs)
    def __repr__(self):
        return str(self.execs)

class CodeParserCache(MultiProcessCache):
    cache_file_name = "bb_codeparser.dat"
    # NOTE: you must increment this if you change how the parsers gather information,
    # so that an existing cache gets invalidated. Additionally you'll need
    # to increment __cache_version__ in cache.py in order to ensure that old
    # recipe caches don't trigger "Taskhash mismatch" errors.
    CACHE_VERSION = 11

    def __init__(self):
        MultiProcessCache.__init__(self)
        self.pythoncache = self.cachedata[0]
        self.shellcache = self.cachedata[1]
        self.pythoncacheextras = self.cachedata_extras[0]
        self.shellcacheextras = self.cachedata_extras[1]

        # To avoid duplication in the codeparser cache, keep
        # a lookup of hashes of objects we already have
        self.pythoncachelines = {}
        self.shellcachelines = {}

    def newPythonCacheLine(self, refs, execs, contains):
        cacheline = pythonCacheLine(refs, execs, contains)
        h = hash(cacheline)
        if h in self.pythoncachelines:
            return self.pythoncachelines[h]
        self.pythoncachelines[h] = cacheline
        return cacheline

    def newShellCacheLine(self, execs):
        cacheline = shellCacheLine(execs)
        h = hash(cacheline)
        if h in self.shellcachelines:
            return self.shellcachelines[h]
        self.shellcachelines[h] = cacheline
        return cacheline

    def init_cache(self, cachedir):
        # Check if we already have the caches
        if self.pythoncache:
            return

        MultiProcessCache.init_cache(self, cachedir)

        # cachedata gets re-assigned in the parent
        self.pythoncache = self.cachedata[0]
        self.shellcache = self.cachedata[1]

    def create_cachedata(self):
        data = [{}, {}]
        return data

codeparsercache = CodeParserCache()

def parser_cache_init(cachedir):
    codeparsercache.init_cache(cachedir)

def parser_cache_save():
    codeparsercache.save_extras()

def parser_cache_savemerge():
    codeparsercache.save_merge()

Logger = logging.getLoggerClass()
class BufferedLogger(Logger):
    def __init__(self, name, level=0, target=None):
        Logger.__init__(self, name)
        self.setLevel(level)
        self.buffer = []
        self.target = target

    def handle(self, record):
        self.buffer.append(record)

    def flush(self):
        for record in self.buffer:
            if self.target.isEnabledFor(record.levelno):
                self.target.handle(record)
        self.buffer = []

class DummyLogger():
    def flush(self):
        return
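
# Note (illustrative): the parsers below start out with a DummyLogger and only
# switch to a BufferedLogger when code actually has to be parsed, as creating
# the real logger is comparatively expensive. Buffered records only reach the
# target logger when the caller flushes, e.g.:
#
#   parser = PythonParser("do_foo", logger)
#   parser.parse_python('d.getVar(somevar)')  # non-literal arg -> buffered warning
#   parser.log.flush()                        # emit buffered records to 'logger'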

class PythonParser():
    getvars = (".getVar", ".appendVar", ".prependVar", "oe.utils.conditional")
    getvarflags = (".getVarFlag", ".appendVarFlag", ".prependVarFlag")
    containsfuncs = ("bb.utils.contains", "base_contains")
    containsanyfuncs = ("bb.utils.contains_any",  "bb.utils.filter")
    execfuncs = ("bb.build.exec_func", "bb.build.exec_task")

    def warn(self, func, arg):
        """Warn about calls of bitbake APIs which pass a non-literal
        argument for the variable name, as we're not able to track such
        a reference.
        """

        try:
            funcstr = codegen.to_source(func)
            argstr = codegen.to_source(arg)
        except TypeError:
            self.log.debug2('Failed to convert function and argument to source form')
        else:
            self.log.debug(self.unhandled_message % (funcstr, argstr))

    def visit_Call(self, node):
        name = self.called_node_name(node.func)
        if name and (name.endswith(self.getvars) or name.endswith(self.getvarflags) or name in self.containsfuncs or name in self.containsanyfuncs):
            if isinstance(node.args[0], ast.Constant) and isinstance(node.args[0].value, str):
                varname = node.args[0].value
                if name in self.containsfuncs and isinstance(node.args[1], ast.Constant):
                    if varname not in self.contains:
                        self.contains[varname] = set()
                    self.contains[varname].add(node.args[1].value)
                elif name in self.containsanyfuncs and isinstance(node.args[1], ast.Constant):
                    if varname not in self.contains:
                        self.contains[varname] = set()
                    self.contains[varname].update(node.args[1].value.split())
                elif name.endswith(self.getvarflags):
                    if isinstance(node.args[1], ast.Constant):
                        self.references.add('%s[%s]' % (varname, node.args[1].value))
                    else:
                        self.warn(node.func, node.args[1])
                else:
                    self.references.add(varname)
            else:
                self.warn(node.func, node.args[0])
        elif name and name.endswith(".expand"):
            if isinstance(node.args[0], ast.Constant):
                value = node.args[0].value
                d = bb.data.init()
                parser = d.expandWithRefs(value, self.name)
                self.references |= parser.references
                self.execs |= parser.execs
                for varname in parser.contains:
                    if varname not in self.contains:
                        self.contains[varname] = set()
                    self.contains[varname] |= parser.contains[varname]
        elif name in self.execfuncs:
            if isinstance(node.args[0], ast.Constant):
                self.var_execs.add(node.args[0].value)
            else:
                self.warn(node.func, node.args[0])
        elif name and isinstance(node.func, (ast.Name, ast.Attribute)):
            self.execs.add(name)

    def called_node_name(self, node):
        """Given a called node, return its original string form"""
        components = []
        while node:
            if isinstance(node, ast.Attribute):
                components.append(node.attr)
                node = node.value
            elif isinstance(node, ast.Name):
                components.append(node.id)
                return '.'.join(reversed(components))
            else:
                break

    def __init__(self, name, log):
        self.name = name
        self.var_execs = set()
        self.contains = {}
        self.execs = set()
        self.references = set()
        self._log = log
        # Defer init as expensive
        self.log = DummyLogger()

        self.unhandled_message = "in call of %s, argument '%s' is not a string literal"
        self.unhandled_message = "while parsing %s, %s" % (name, self.unhandled_message)

    # For python module code it is expensive to obtain the function text, so a
    # fixedhash is used as the cache key instead. We only take the hit of fetching
    # the text if the entry isn't already in the cache.
    def parse_python(self, node, lineno=0, filename="<string>", fixedhash=None):
        if not fixedhash and (not node or not node.strip()):
            return

        if fixedhash:
            h = fixedhash
        else:
            h = bbhash(str(node))

        if h in codeparsercache.pythoncache:
            self.references = set(codeparsercache.pythoncache[h].refs)
            self.execs = set(codeparsercache.pythoncache[h].execs)
            self.contains = {}
            for i in codeparsercache.pythoncache[h].contains:
                self.contains[i] = set(codeparsercache.pythoncache[h].contains[i])
            return

        if h in codeparsercache.pythoncacheextras:
            self.references = set(codeparsercache.pythoncacheextras[h].refs)
            self.execs = set(codeparsercache.pythoncacheextras[h].execs)
            self.contains = {}
            for i in codeparsercache.pythoncacheextras[h].contains:
                self.contains[i] = set(codeparsercache.pythoncacheextras[h].contains[i])
            return

        if fixedhash and not node:
            raise KeyError

        # Need to parse so take the hit on the real log buffer
        self.log = BufferedLogger('BitBake.Data.PythonParser', logging.DEBUG, self._log)

        # compile() can't be given a line number offset, but we can pad with blank
        # lines so the reported line numbers match the original source
        node = "\n" * int(lineno) + node
        code = compile(check_indent(str(node)), filename, "exec",
                       ast.PyCF_ONLY_AST)

        for n in ast.walk(code):
            if n.__class__.__name__ == "Call":
                self.visit_Call(n)

        self.execs.update(self.var_execs)

        codeparsercache.pythoncacheextras[h] = codeparsercache.newPythonCacheLine(self.references, self.execs, self.contains)

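# Illustrative example (not executed here) of what PythonParser extracts:
#
#   parser = PythonParser("do_patch", logger)
#   parser.parse_python('d.setVar("A", d.getVar("B") + oe.utils.conditional("C", "1", "x", "y", d))')
#   # parser.references -> {"B", "C"}    (variable names passed to getVar/conditional)
#   # parser.execs      -> {"d.setVar"}  (calls that are not recognised getters)
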
class ShellParser():
    def __init__(self, name, log):
        self.funcdefs = set()
        self.allexecs = set()
        self.execs = set()
        self._name = name
        self._log = log
        # Defer init as expensive
        self.log = DummyLogger()

        self.unhandled_template = "unable to handle non-literal command '%s'"
        self.unhandled_template = "while parsing %s, %s" % (name, self.unhandled_template)

    def parse_shell(self, value):
        """Parse the supplied shell code in a string, returning the external
        commands it executes.
        """

        h = bbhash(str(value))

        if h in codeparsercache.shellcache:
            self.execs = set(codeparsercache.shellcache[h].execs)
            return self.execs

        if h in codeparsercache.shellcacheextras:
            self.execs = set(codeparsercache.shellcacheextras[h].execs)
            return self.execs

        # Need to parse so take the hit on the real log buffer
        self.log = BufferedLogger('BitBake.Data.%s' % self._name, logging.DEBUG, self._log)

        self._parse_shell(value)
        self.execs = set(cmd for cmd in self.allexecs if cmd not in self.funcdefs)

        codeparsercache.shellcacheextras[h] = codeparsercache.newShellCacheLine(self.execs)

        return self.execs

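    # Illustrative example (not executed here) of what parse_shell returns:
    #
    #   sparser = ShellParser("do_install", logger)
    #   sparser.parse_shell('do_clean() { rm -rf ${D}; }\ndo_clean\nbbnote "done"')
    #   # -> {"rm", "bbnote"}; do_clean is defined in the snippet itself, so it is excluded
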
    def _parse_shell(self, value):
        try:
            tokens, _ = pyshyacc.parse(value, eof=True, debug=False)
        except Exception:
            bb.error('Error parsing shell code, the last 5 lines are:\n%s' % '\n'.join(value.split('\n')[-5:]))
            raise

        self.process_tokens(tokens)

    def process_tokens(self, tokens):
        """Process a supplied portion of the syntax tree as returned by
        pyshyacc.parse.
        """

        def function_definition(value):
            self.funcdefs.add(value.name)
            return [value.body], None

        def case_clause(value):
            # Element 0 of each item in the case is the list of patterns, and
            # Element 1 of each item in the case is the list of commands to be
            # executed when that pattern matches.
            words = chain(*[item[0] for item in value.items])
            cmds  = chain(*[item[1] for item in value.items])
            return cmds, words

        def if_clause(value):
            main = chain(value.cond, value.if_cmds)
            rest = value.else_cmds
            if isinstance(rest, tuple) and rest[0] == "elif":
                return chain(main, if_clause(rest[1]))
            else:
                return chain(main, rest)

        def simple_command(value):
            return None, chain(value.words, (assign[1] for assign in value.assigns))

        token_handlers = {
            "and_or": lambda x: ((x.left, x.right), None),
            "async": lambda x: ([x], None),
            "brace_group": lambda x: (x.cmds, None),
            "for_clause": lambda x: (x.cmds, x.items),
            "function_definition": function_definition,
            "if_clause": lambda x: (if_clause(x), None),
            "pipeline": lambda x: (x.commands, None),
            "redirect_list": lambda x: ([x.cmd], None),
            "subshell": lambda x: (x.cmds, None),
            "while_clause": lambda x: (chain(x.condition, x.cmds), None),
            "until_clause": lambda x: (chain(x.condition, x.cmds), None),
            "simple_command": simple_command,
            "case_clause": case_clause,
        }

        def process_token_list(tokens):
            for token in tokens:
                if isinstance(token, list):
                    process_token_list(token)
                    continue
                name, value = token
                try:
                    more_tokens, words = token_handlers[name](value)
                except KeyError:
                    raise NotImplementedError("Unsupported token type " + name)

                if more_tokens:
                    self.process_tokens(more_tokens)

                if words:
                    self.process_words(words)

        process_token_list(tokens)

    def process_words(self, words):
        """Process a set of 'words' in pyshyacc parlance, which includes
        extraction of executed commands from $() blocks, as well as grabbing
        the command name argument.
        """
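        # e.g. (illustrative): for the words of 'bbnote $(uname -m)', the $() block
        # is re-parsed so 'uname' lands in allexecs, while 'bbnote' is picked up as
        # the command name below.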

        words = list(words)
        for word in list(words):
            wtree = pyshlex.make_wordtree(word[1])
            for part in wtree:
                if not isinstance(part, list):
                    continue

                if part[0] in ('`', '$('):
                    command = pyshlex.wordtree_as_string(part[1:-1])
                    self._parse_shell(command)

                    if word[0] in ("cmd_name", "cmd_word"):
                        if word in words:
                            words.remove(word)

        usetoken = False
        for word in words:
            if word[0] in ("cmd_name", "cmd_word") or \
               (usetoken and word[0] == "TOKEN"):
                if "=" in word[1]:
                    usetoken = True
                    continue

                cmd = word[1]
                if cmd.startswith("$"):
                    self.log.debug(self.unhandled_template % cmd)
                elif cmd == "eval":
                    command = " ".join(word for _, word in words[1:])
                    self._parse_shell(command)
                else:
                    self.allexecs.add(cmd)
                break