1"""
2BitBake code parser
3
4Parses actual code (i.e. python and shell) for functions and in-line
5expressions. Used mainly to determine dependencies on other functions
6and variables within the BitBake metadata. Also provides a cache for
7this information in order to speed up processing.
8
9(Not to be confused with the code that parses the metadata itself,
10see lib/bb/parse/ for that).
11
12NOTE: if you change how the parsers gather information you will almost
13certainly need to increment CodeParserCache.CACHE_VERSION below so that
14any existing codeparser cache gets invalidated. Additionally you'll need
15to increment __cache_version__ in cache.py in order to ensure that old
16recipe caches don't trigger "Taskhash mismatch" errors.
17
18"""
19
20import ast
21import sys
22import codegen
23import logging
24import pickle
25import bb.pysh as pysh
26import os.path
27import bb.utils, bb.data
28import hashlib
29from itertools import chain
30from bb.pysh import pyshyacc, pyshlex, sherrors
31from bb.cache import MultiProcessCache
32
# Module-level logger, registered under the 'BitBake.CodeParser' name.
logger = logging.getLogger('BitBake.CodeParser')
34
def bbhash(s):
    """Return the hexadecimal MD5 digest of the string *s* (encoded as UTF-8)."""
    digest = hashlib.md5()
    digest.update(s.encode("utf-8"))
    return digest.hexdigest()
37
def check_indent(codestr):
    """If the code is indented, add a top level piece of code to 'remove' the indentation.

    The leading run of newlines, tabs and spaces is inspected. When the
    character directly before the first real character is a tab or a space
    (i.e. the first code line is indented), the code is wrapped by prefixing
    an "if 1:" line so that it compiles as a nested block.

    Returns the (possibly prefixed) code string. Strings that are empty, or
    consist only of whitespace, are returned unchanged instead of raising
    IndexError.
    """

    # Index of the first character that is not a newline, tab or space
    i = len(codestr) - len(codestr.lstrip("\n\t "))

    # No leading whitespace at all, or nothing but whitespace: nothing to wrap
    if i == 0 or i == len(codestr):
        return codestr

    if codestr[i-1] in ("\t", " "):
        if codestr[0] == "\n":
            # Since we're adding a line, we need to remove one line of any empty padding
            # to ensure line numbers are correct
            codestr = codestr[1:]
        return "if 1:\n" + codestr

    # Only blank lines precede the code; it already starts at column 0
    return codestr
56
57
58# Basically pickle, in python 2.7.3 at least, does badly with data duplication
59# upon pickling and unpickling. Combine this with duplicate objects and things
60# are a mess.
61#
62# When the sets are originally created, python calls intern() on the set keys
63# which significantly improves memory usage. Sadly the pickle/unpickle process
64# doesn't call intern() on the keys and results in the same strings being duplicated
65# in memory. This also means pickle will save the same string multiple times in
66# the cache file.
67#
68# By having shell and python cacheline objects with setstate/getstate, we force
69# the object creation through our own routine where we can call intern (via internSet).
70#
71# We also use hashable frozensets and ensure we use references to these so that
72# duplicates can be removed, both in memory and in the resulting pickled data.
73#
# By playing these games, the size of the cache file shrinks dramatically,
# meaning faster load times, and the reloaded cache files also consume much less
# memory. Smaller cache files, faster load times and lower memory usage are all good.
77#
78# A custom getstate/setstate using tuples is actually worth 15% cachesize by
79# avoiding duplication of the attribute names!
80
class SetCache(object):
    """Pool of frozensets of interned strings.

    Equal sets requested through internSet() are represented by one shared
    frozenset object, so duplicates can be dropped both in memory and in
    pickled data.
    """

    def __init__(self):
        # Maps each pooled frozenset to itself. Keying on the set (rather than
        # on its hash value) means two different sets whose hashes happen to
        # collide can never be mistaken for one another.
        self.setcache = {}

    def internSet(self, items):
        """Return the pooled frozenset holding the strings in *items*.

        Every string is passed through sys.intern() first. If an equal set
        has been requested before, that earlier object is returned; otherwise
        the new set is stored in the pool and returned.
        """
        s = frozenset(sys.intern(i) for i in items)
        return self.setcache.setdefault(s, s)
96
# Shared SetCache instance; the cache line classes below call its internSet().
codecache = SetCache()
98
class pythonCacheLine(object):
    """Cached result of parsing one piece of python code.

    Holds the referenced variables (refs), the executed functions (execs) and
    a mapping of variable name to checked values (contains). All sets are
    routed through codecache.internSet().
    """

    def __init__(self, refs, execs, contains):
        intern_set = codecache.internSet
        self.refs = intern_set(refs)
        self.execs = intern_set(execs)
        self.contains = {key: intern_set(contains[key]) for key in contains}

    def __getstate__(self):
        # A plain tuple is pickled instead of the instance dict
        return (self.refs, self.execs, self.contains)

    def __setstate__(self, state):
        # Rebuild through __init__ so the unpickled sets get interned again
        refs, execs, contains = state
        self.__init__(refs, execs, contains)

    def __hash__(self):
        parts = [hash(self.refs), hash(self.execs)]
        # Sorted keys keep the hash independent of dict insertion order
        for key in sorted(self.contains):
            parts.extend((key, hash(self.contains[key])))
        return hash(tuple(parts))

    def __repr__(self):
        fields = (self.refs, self.execs, self.contains)
        return " ".join(str(field) for field in fields)
120
121
class shellCacheLine(object):
    """Cached result of parsing one piece of shell code: the set of executed commands."""

    def __init__(self, execs):
        self.execs = codecache.internSet(execs)

    def __getstate__(self):
        # Only the set itself is pickled, not an instance dict
        return self.execs

    def __setstate__(self, state):
        # Rebuild through __init__ so the unpickled set gets interned again
        self.__init__(state)

    def __hash__(self):
        return hash(self.execs)

    def __repr__(self):
        return str(self.execs)
136
class CodeParserCache(MultiProcessCache):
    """MultiProcessCache holding the parse results of python and shell code.

    Slot 0 of the cache data holds the python cache lines and slot 1 the
    shell cache lines, each keyed by the hash of the parsed code.
    """

    cache_file_name = "bb_codeparser.dat"
    # NOTE: you must increment this if you change how the parsers gather information,
    # so that an existing cache gets invalidated. Additionally you'll need
    # to increment __cache_version__ in cache.py in order to ensure that old
    # recipe caches don't trigger "Taskhash mismatch" errors.
    CACHE_VERSION = 9

    def __init__(self):
        MultiProcessCache.__init__(self)
        data, extras = self.cachedata, self.cachedata_extras
        self.pythoncache = data[0]
        self.shellcache = data[1]
        self.pythoncacheextras = extras[0]
        self.shellcacheextras = extras[1]

        # To avoid duplication in the codeparser cache, keep
        # a lookup of hashes of objects we already have
        self.pythoncachelines = {}
        self.shellcachelines = {}

    def newPythonCacheLine(self, refs, execs, contains):
        """Return a pythonCacheLine for the given data, reusing an earlier
        one stored under the same hash if there is one."""
        candidate = pythonCacheLine(refs, execs, contains)
        return self.pythoncachelines.setdefault(hash(candidate), candidate)

    def newShellCacheLine(self, execs):
        """Return a shellCacheLine for the given data, reusing an earlier
        one stored under the same hash if there is one."""
        candidate = shellCacheLine(execs)
        return self.shellcachelines.setdefault(hash(candidate), candidate)

    def init_cache(self, d):
        """Initialise the cache through MultiProcessCache.init_cache() unless
        the python cache already holds entries."""
        # Check if we already have the caches
        if self.pythoncache:
            return

        MultiProcessCache.init_cache(self, d)

        # cachedata gets re-assigned in the parent
        data = self.cachedata
        self.pythoncache = data[0]
        self.shellcache = data[1]

    def create_cachedata(self):
        """Return fresh, empty cache data: one dict for python, one for shell."""
        return [{}, {}]
187
# Shared CodeParserCache instance used by the parsers and the helpers below.
codeparsercache = CodeParserCache()
189
def parser_cache_init(d):
    """Initialise the shared code parser cache by calling its init_cache(d)."""
    codeparsercache.init_cache(d)
192
def parser_cache_save():
    """Call save_extras() on the shared code parser cache."""
    codeparsercache.save_extras()
195
def parser_cache_savemerge():
    """Call save_merge() on the shared code parser cache."""
    codeparsercache.save_merge()
198
Logger = logging.getLoggerClass()
class BufferedLogger(Logger):
    """Logger which stores its records until flush() passes them on to *target*."""

    def __init__(self, name, level=0, target=None):
        super().__init__(name)
        self.setLevel(level)
        # Records collected since the last flush()
        self.buffer = []
        # Logger that receives the records on flush()
        self.target = target

    def handle(self, record):
        """Store *record* instead of dispatching it to any handler."""
        self.buffer.append(record)

    def flush(self):
        """Hand every stored record the target is enabled for to the target,
        then empty the buffer."""
        target = self.target
        for pending in self.buffer:
            if target.isEnabledFor(pending.levelno):
                target.handle(pending)
        self.buffer = []
215
class PythonParser():
    """Gather dependency information from a piece of python code.

    After parse_python() the results are available as attributes:

    references -- variable names passed as string literals to the getVar
                  family of calls; flag accesses appear as 'VAR[flag]'
    execs      -- dotted names of the functions called, plus the function
                  names passed to bb.build.exec_func/exec_task
    contains   -- dict mapping a variable name to the set of values it is
                  checked against via the "contains" style functions
    """
    getvars = (".getVar", ".appendVar", ".prependVar")
    getvarflags = (".getVarFlag", ".appendVarFlag", ".prependVarFlag")
    containsfuncs = ("bb.utils.contains", "base_contains")
    containsanyfuncs = ("bb.utils.contains_any",  "bb.utils.filter")
    execfuncs = ("bb.build.exec_func", "bb.build.exec_task")

    def __init__(self, name, log):
        self.name = name
        self.var_execs = set()
        self.contains = {}
        self.execs = set()
        self.references = set()
        self.log = BufferedLogger('BitBake.Data.PythonParser', logging.DEBUG, log)

        self.unhandled_message = "in call of %s, argument '%s' is not a string literal"
        self.unhandled_message = "while parsing %s, %s" % (name, self.unhandled_message)

    def warn(self, func, arg):
        """Warn about calls of bitbake APIs which pass a non-literal
        argument for the variable name, as we're not able to track such
        a reference.
        """

        try:
            funcstr = codegen.to_source(func)
            argstr = codegen.to_source(arg)
        except TypeError:
            self.log.debug(2, 'Failed to convert function and argument to source form')
        else:
            self.log.debug(1, self.unhandled_message % (funcstr, argstr))

    @staticmethod
    def _positional(node, index):
        """Return positional argument *index* of the call *node*, or None if
        the call has fewer positional arguments."""
        args = node.args
        return args[index] if index < len(args) else None

    @staticmethod
    def _literal_str(node):
        """Return the text of a string literal AST node, or None for any
        other node (including None).

        The node class is matched by name so that no reference to ast.Str is
        needed: that class is deprecated since Python 3.8 (string literals
        are ast.Constant nodes there) and removed in Python 3.14.
        """
        if node is None:
            return None
        kind = node.__class__.__name__
        if kind == "Constant":
            return node.value if isinstance(node.value, str) else None
        if kind == "Str":
            # Node type produced by interpreters older than Python 3.8
            return node.s
        return None

    def visit_Call(self, node):
        """Record what the call *node* tells us about references, contains
        checks and executed functions.

        Calls with too few positional arguments are skipped for the missing
        argument rather than aborting the whole parse with IndexError.
        """
        name = self.called_node_name(node.func)
        if not name:
            return

        first = self._positional(node, 0)
        varname = self._literal_str(first)

        if name.endswith(self.getvars) or name.endswith(self.getvarflags) or name in self.containsfuncs or name in self.containsanyfuncs:
            if varname is None:
                # Non-literal variable name: can't be tracked, only reported
                if first is not None:
                    self.warn(node.func, first)
                return

            second = self._positional(node, 1)
            value = self._literal_str(second)
            if name in self.containsfuncs and value is not None:
                self.contains.setdefault(varname, set()).add(value)
            elif name in self.containsanyfuncs and value is not None:
                self.contains.setdefault(varname, set()).update(value.split())
            elif name.endswith(self.getvarflags):
                if value is not None:
                    self.references.add('%s[%s]' % (varname, value))
                elif second is not None:
                    self.warn(node.func, second)
            else:
                # Plain getVar style access, or a contains check against a
                # non-literal value: record a reference to the whole variable
                self.references.add(varname)
        elif name.endswith(".expand"):
            if varname is not None:
                d = bb.data.init()
                parser = d.expandWithRefs(varname, self.name)
                self.references |= parser.references
                self.execs |= parser.execs
                for key in parser.contains:
                    self.contains.setdefault(key, set())
                    self.contains[key] |= parser.contains[key]
        elif name in self.execfuncs:
            if varname is not None:
                self.var_execs.add(varname)
            elif first is not None:
                self.warn(node.func, first)
        elif isinstance(node.func, (ast.Name, ast.Attribute)):
            self.execs.add(name)

    def called_node_name(self, node):
        """Given a called node, return its original string form

        Returns None unless the node is a plain name or a chain of attribute
        accesses ending in a plain name.
        """
        components = []
        while isinstance(node, ast.Attribute):
            components.append(node.attr)
            node = node.value
        if isinstance(node, ast.Name):
            components.append(node.id)
            return '.'.join(reversed(components))
        return None

    def parse_python(self, node, lineno=0, filename="<string>"):
        """Parse the python code string *node* and fill in references, execs
        and contains.

        Results are looked up in, and stored to, the shared codeparsercache
        keyed by the hash of the code. *lineno* and *filename* are only used
        to compile the code. Empty or whitespace-only code is ignored.
        """
        if not node or not node.strip():
            return

        h = bbhash(str(node))

        # Main cache first, then the extras
        for cache in (codeparsercache.pythoncache, codeparsercache.pythoncacheextras):
            if h in cache:
                cached = cache[h]
                self.references = set(cached.refs)
                self.execs = set(cached.execs)
                self.contains = {}
                for i in cached.contains:
                    self.contains[i] = set(cached.contains[i])
                return

        # We can't add to the linenumbers for compile, we can pad to the correct number of blank lines though
        node = "\n" * int(lineno) + node
        code = compile(check_indent(str(node)), filename, "exec",
                       ast.PyCF_ONLY_AST)

        for n in ast.walk(code):
            if n.__class__.__name__ == "Call":
                self.visit_Call(n)

        self.execs.update(self.var_execs)

        codeparsercache.pythoncacheextras[h] = codeparsercache.newPythonCacheLine(self.references, self.execs, self.contains)
336
class ShellParser():
    """Extract the commands executed by a piece of shell code.

    funcdefs collects the names of shell functions defined in the code,
    allexecs every command name seen, and execs the commands which are not
    functions defined in the code itself.
    """

    def __init__(self, name, log):
        self.funcdefs = set()
        self.allexecs = set()
        self.execs = set()
        self.log = BufferedLogger('BitBake.Data.%s' % name, logging.DEBUG, log)
        self.unhandled_template = "while parsing %s, %s" % (
            name, "unable to handle non-literal command '%s'")

    def parse_shell(self, value):
        """Parse the supplied shell code in a string, returning the external
        commands it executes.
        """

        h = bbhash(str(value))

        # Main cache first, then the extras
        for cache in (codeparsercache.shellcache, codeparsercache.shellcacheextras):
            if h in cache:
                self.execs = set(cache[h].execs)
                return self.execs

        self._parse_shell(value)
        # Commands which are functions defined in this code aren't external
        self.execs = {cmd for cmd in self.allexecs if cmd not in self.funcdefs}

        codeparsercache.shellcacheextras[h] = codeparsercache.newShellCacheLine(self.execs)

        return self.execs

    def _parse_shell(self, value):
        """Run pyshyacc over *value* and process the resulting tokens.

        Raises sherrors.ShellSyntaxError if the lexer reports that more
        input is needed.
        """
        try:
            tokens, _ = pyshyacc.parse(value, eof=True, debug=False)
        except pyshlex.NeedMore:
            raise sherrors.ShellSyntaxError("Unexpected EOF")

        self.process_tokens(tokens)

    def process_tokens(self, tokens):
        """Process a supplied portion of the syntax tree as returned by
        pyshyacc.parse.
        """

        # Each handler takes a token value and returns a pair
        # (nested tokens to process, words to process); either may be None.

        def function_definition(value):
            self.funcdefs.add(value.name)
            return [value.body], None

        def case_clause(value):
            # Element 0 of each item in the case is the list of patterns, and
            # Element 1 of each item in the case is the list of commands to be
            # executed when that pattern matches.
            patterns = chain.from_iterable([item[0] for item in value.items])
            commands = chain.from_iterable([item[1] for item in value.items])
            return commands, patterns

        def if_clause(value):
            tail = value.else_cmds
            if isinstance(tail, tuple) and tail[0] == "elif":
                # An elif is a nested if clause; flatten it recursively
                tail = if_clause(tail[1])
            return chain(value.cond, value.if_cmds, tail)

        def simple_command(value):
            assigned = (assign[1] for assign in value.assigns)
            return None, chain(value.words, assigned)

        def loop_clause(value):
            return chain(value.condition, value.cmds), None

        token_handlers = {
            "simple_command": simple_command,
            "function_definition": function_definition,
            "case_clause": case_clause,
            "if_clause": lambda x: (if_clause(x), None),
            "while_clause": loop_clause,
            "until_clause": loop_clause,
            "for_clause": lambda x: (x.cmds, x.items),
            "and_or": lambda x: ((x.left, x.right), None),
            "async": lambda x: ([x], None),
            "brace_group": lambda x: (x.cmds, None),
            "subshell": lambda x: (x.cmds, None),
            "pipeline": lambda x: (x.commands, None),
            "redirect_list": lambda x: ([x.cmd], None),
        }

        def process_token_list(entries):
            for entry in entries:
                if isinstance(entry, list):
                    process_token_list(entry)
                else:
                    name, value = entry
                    try:
                        more_tokens, words = token_handlers[name](value)
                    except KeyError:
                        raise NotImplementedError("Unsupported token type " + name)

                    if more_tokens:
                        self.process_tokens(more_tokens)

                    if words:
                        self.process_words(words)

        process_token_list(tokens)

    def process_words(self, words):
        """Process a set of 'words' in pyshyacc parlance, which includes
        extraction of executed commands from $() blocks, as well as grabbing
        the command name argument.
        """

        command_kinds = ("cmd_name", "cmd_word")

        words = list(words)
        # Iterate over a copy, as words may be removed while looping
        for word in list(words):
            for part in pyshlex.make_wordtree(word[1]):
                if isinstance(part, list) and part[0] in ('`', '$('):
                    # Command substitution: parse the embedded command
                    self._parse_shell(pyshlex.wordtree_as_string(part[1:-1]))

                    if word[0] in command_kinds and word in words:
                        words.remove(word)

        usetoken = False
        for word in words:
            kind = word[0]
            if not (kind in command_kinds or (usetoken and kind == "TOKEN")):
                continue

            cmd = word[1]
            if "=" in cmd:
                # An assignment; the command may follow as a plain TOKEN
                usetoken = True
                continue

            if cmd.startswith("$"):
                self.log.debug(1, self.unhandled_template % cmd)
            elif cmd == "eval":
                self._parse_shell(" ".join(text for _, text in words[1:]))
            else:
                self.allexecs.add(cmd)
            # Only the first command word is of interest
            break
477