#
# Copyright BitBake Contributors
#
# SPDX-License-Identifier: GPL-2.0-only
#

"""
BitBake code parser

Parses actual code (i.e. python and shell) for functions and in-line
expressions.  Used mainly to determine dependencies on other functions
and variables within the BitBake metadata.  Also provides a cache for
this information in order to speed up processing.

(Not to be confused with the code that parses the metadata itself,
see lib/bb/parse/ for that).

NOTE: if you change how the parsers gather information you will almost
certainly need to increment CodeParserCache.CACHE_VERSION below so that
any existing codeparser cache gets invalidated. Additionally you'll need
to increment __cache_version__ in cache.py in order to ensure that old
recipe caches don't trigger "Taskhash mismatch" errors.

"""

import ast
import sys
import codegen
import logging
import inspect
import bb.pysh as pysh
import bb.utils, bb.data
import hashlib
from itertools import chain
from bb.pysh import pyshyacc, pyshlex
from bb.cache import MultiProcessCache

logger = logging.getLogger('BitBake.CodeParser')

def bbhash(s):
    """Return the hex SHA-256 digest of the string s (encoded as UTF-8)."""
    return hashlib.sha256(s.encode("utf-8")).hexdigest()

def check_indent(codestr):
    """If the code is indented, add a top level piece of code to 'remove' the indentation

    The leading run of newlines, tabs and spaces is inspected. When that run
    ends in a tab or a space (i.e. the first real line is indented) the code
    is wrapped in an "if 1:" block so that it compiles. Otherwise, including
    for empty or whitespace-only input, codestr is returned unchanged.
    """

    length = len(codestr)
    i = 0
    # Bounds check so empty or all-whitespace input cannot index past the end
    while i < length and codestr[i] in ("\n", "\t", " "):
        i = i + 1

    if i == 0 or i == length:
        # No leading whitespace at all, or no code to re-indent
        return codestr

    if codestr[i-1] == "\t" or codestr[i-1] == " ":
        if codestr[0] == "\n":
            # Since we're adding a line, we need to remove one line of any empty padding
            # to ensure line numbers are correct
            codestr = codestr[1:]
        return "if 1:\n" + codestr

    return codestr

# Maps "namespace.function" to a list of
# [references, execs, var_execs, contains] as gathered by PythonParser.
modulecode_deps = {}

def add_module_functions(fn, functions, namespace):
    """Record the dependencies of python module functions in modulecode_deps.

    fn is the path of the file defining the functions, functions maps each
    function name to the function object and namespace is the prefix used to
    build the "namespace.name" keys.

    The cache key is derived from the file's path, size and mtime plus the
    function name, so the function source is only fetched (via inspect) when
    the parser reports a cache miss by raising KeyError.
    """
    import os
    fstat = os.stat(fn)
    fixedhash = fn + ":" + str(fstat.st_size) + ":" + str(fstat.st_mtime)
    for f in functions:
        name = "%s.%s" % (namespace, f)
        funchash = fixedhash + f
        parser = PythonParser(name, logger)
        try:
            # Passing no source asks for a cache-only lookup
            parser.parse_python(None, filename=fn, lineno=1, fixedhash=funchash)
            #bb.warn("Cached %s" % f)
        except KeyError:
            lines, lineno = inspect.getsourcelines(functions[f])
            src = "".join(lines)
            parser.parse_python(src, filename=fn, lineno=lineno, fixedhash=funchash)
            #bb.warn("Not cached %s" % f)
        execs = parser.execs.copy()
        # Expand internal module exec references
        for e in parser.execs:
            if e in functions:
                execs.remove(e)
                execs.add(namespace + "." + e)
        # NOTE(review): on a cache hit parse_python restores references, execs
        # and contains only, so var_execs is empty here for cached functions.
        modulecode_deps[name] = [parser.references.copy(), execs, parser.var_execs.copy(), parser.contains.copy()]
        #bb.warn("%s: %s\nRefs:%s Execs: %s %s %s" % (name, fn, parser.references, parser.execs, parser.var_execs, parser.contains))

def update_module_dependencies(d):
    """Remove "vardepsexclude" entries from the recorded module dependencies.

    For every entry in modulecode_deps the whitespace separated value of its
    "vardepsexclude" flag (read with d.getVarFlag) is subtracted from the
    references, execs and var_execs sets. The contains mapping is kept as is.
    """
    for mod in modulecode_deps:
        excludes = set((d.getVarFlag(mod, "vardepsexclude") or "").split())
        if excludes:
            references, execs, var_execs, contains = modulecode_deps[mod]
            modulecode_deps[mod] = [references - excludes, execs - excludes, var_execs - excludes, contains]

# A custom getstate/setstate using tuples is actually worth 15% cachesize by
# avoiding duplication of the attribute names!
class SetCache(object):
    """Interning pool for sets of strings.

    Equal sets are collapsed to a single shared frozenset whose members are
    interned strings.
    """
    def __init__(self):
        # Maps each canonical frozenset to itself. Keying by the set (rather
        # than by its hash value) means two different sets whose hashes
        # collide can never be mistaken for each other.
        self.setcache = {}

    def internSet(self, items):
        """Return the canonical frozenset holding the strings in items."""
        s = frozenset(sys.intern(i) for i in items)
        return self.setcache.setdefault(s, s)

codecache = SetCache()

class pythonCacheLine(object):
    """Cached result of parsing one piece of python code.

    refs and execs are interned frozensets of names; contains maps a variable
    name to an interned frozenset of values.
    """
    def __init__(self, refs, execs, contains):
        self.refs = codecache.internSet(refs)
        self.execs = codecache.internSet(execs)
        self.contains = {}
        for c in contains:
            self.contains[c] = codecache.internSet(contains[c])

    def __getstate__(self):
        # Pickle as a plain tuple so attribute names aren't stored per object
        return (self.refs, self.execs, self.contains)

    def __setstate__(self, state):
        (refs, execs, contains) = state
        # Go through __init__ so the unpickled sets get interned again
        self.__init__(refs, execs, contains)

    def __eq__(self, other):
        # Value equality, consistent with __hash__, so that lines can be
        # de-duplicated safely even when two hashes collide
        if not isinstance(other, pythonCacheLine):
            return NotImplemented
        return (self.refs == other.refs and
                self.execs == other.execs and
                self.contains == other.contains)

    def __hash__(self):
        l = (hash(self.refs), hash(self.execs))
        for c in sorted(self.contains.keys()):
            l = l + (c, hash(self.contains[c]))
        return hash(l)

    def __repr__(self):
        return " ".join([str(self.refs), str(self.execs), str(self.contains)])


class shellCacheLine(object):
    """Cached result of parsing one piece of shell code (the executed commands)."""
    def __init__(self, execs):
        self.execs = codecache.internSet(execs)

    def __getstate__(self):
        # The state is the bare frozenset (the original parentheses did not
        # make a tuple); changing this would alter the on-disk cache format
        return (self.execs)

    def __setstate__(self, state):
        (execs) = state
        self.__init__(execs)

    def __eq__(self, other):
        # Value equality, consistent with __hash__
        if not isinstance(other, shellCacheLine):
            return NotImplemented
        return self.execs == other.execs

    def __hash__(self):
        return hash(self.execs)

    def __repr__(self):
        return str(self.execs)

class CodeParserCache(MultiProcessCache):
    """Cache of python and shell parse results, keyed by a hash of the code.

    Index 0 of the cache data holds the python results and index 1 the shell
    results; the "extras" dictionaries receive entries added by this process.
    """
    cache_file_name = "bb_codeparser.dat"
    # NOTE: you must increment this if you change how the parsers gather information,
    # so that an existing cache gets invalidated. Additionally you'll need
    # to increment __cache_version__ in cache.py in order to ensure that old
    # recipe caches don't trigger "Taskhash mismatch" errors.
    CACHE_VERSION = 11

    def __init__(self):
        MultiProcessCache.__init__(self)
        self.pythoncache = self.cachedata[0]
        self.shellcache = self.cachedata[1]
        self.pythoncacheextras = self.cachedata_extras[0]
        self.shellcacheextras = self.cachedata_extras[1]

        # To avoid duplication in the codeparser cache, keep
        # a lookup of the cache line objects we already have
        self.pythoncachelines = {}
        self.shellcachelines = {}

    def newPythonCacheLine(self, refs, execs, contains):
        """Return a pythonCacheLine for the data, reusing an equal existing one."""
        cacheline = pythonCacheLine(refs, execs, contains)
        # Keyed by the line itself (hash + equality), not by the bare hash
        return self.pythoncachelines.setdefault(cacheline, cacheline)

    def newShellCacheLine(self, execs):
        """Return a shellCacheLine for execs, reusing an equal existing one."""
        cacheline = shellCacheLine(execs)
        return self.shellcachelines.setdefault(cacheline, cacheline)

    def init_cache(self, cachedir):
        """Initialise the cache from cachedir unless python data is already present."""
        # Check if we already have the caches
        if self.pythoncache:
            return

        MultiProcessCache.init_cache(self, cachedir)

        # cachedata gets re-assigned in the parent
        self.pythoncache = self.cachedata[0]
        self.shellcache = self.cachedata[1]

    def create_cachedata(self):
        """Return empty cache data: [python results, shell results]."""
        data = [{}, {}]
        return data

codeparsercache = CodeParserCache()

def parser_cache_init(cachedir):
    """Initialise the module level codeparser cache from cachedir."""
    codeparsercache.init_cache(cachedir)

def parser_cache_save():
    """Save the entries added to the module level codeparser cache (save_extras)."""
    codeparsercache.save_extras()

def parser_cache_savemerge():
    """Call save_merge() on the module level codeparser cache."""
    codeparsercache.save_merge()

Logger = logging.getLoggerClass()
class BufferedLogger(Logger):
    """Logger which stores records until flush() hands them to a target logger."""
    def __init__(self, name, level=0, target=None):
        Logger.__init__(self, name)
        self.setLevel(level)
        self.buffer = []
        self.target = target

    def handle(self, record):
        # Hold the record back instead of emitting it
        self.buffer.append(record)

    def flush(self):
        """Pass buffered records the target is enabled for to it, then empty the buffer."""
        for record in self.buffer:
            if self.target.isEnabledFor(record.levelno):
                self.target.handle(record)
        self.buffer = []

class DummyLogger():
    """Cheap placeholder used until a parser needs a real BufferedLogger."""
    def flush(self):
        return

class PythonParser():
    """Collects variable references and executed functions from python code.

    After parse_python() the results are available as:
      references - variable names (or "var[flag]") read through the datastore
      execs      - names of called functions
      var_execs  - literal names passed to bb.build.exec_func/exec_task
      contains   - variable name -> set of values tested by the contains helpers
    """
    getvars = (".getVar", ".appendVar", ".prependVar", "oe.utils.conditional")
    getvarflags = (".getVarFlag", ".appendVarFlag", ".prependVarFlag")
    containsfuncs = ("bb.utils.contains", "base_contains")
    containsanyfuncs = ("bb.utils.contains_any", "bb.utils.filter")
    execfuncs = ("bb.build.exec_func", "bb.build.exec_task")

    def warn(self, func, arg):
        """Warn about calls of bitbake APIs which pass a non-literal
           argument for the variable name, as we're not able to track such
           a reference.
        """

        try:
            funcstr = codegen.to_source(func)
            argstr = codegen.to_source(arg)
        except TypeError:
            self.log.debug2('Failed to convert function and argument to source form')
        else:
            self.log.debug(self.unhandled_message % (funcstr, argstr))

    def _note_missing_argument(self, name):
        """Log that a tracked call has no positional argument we could inspect."""
        self.log.debug("while parsing %s, call of %s lacks a positional argument to inspect" % (self.name, name))

    def visit_Call(self, node):
        """Record what a single ast.Call node tells us about dependencies.

        Datastore accessors and the contains helpers add to references or
        contains, ".expand" calls on a literal are expanded and merged in,
        exec_func/exec_task calls add to var_execs and any other named call
        is added to execs.
        """
        name = self.called_node_name(node.func)
        # Positional arguments may be absent (no arguments, or keywords only);
        # use None for those rather than failing with IndexError
        args = node.args
        first = args[0] if args else None
        second = args[1] if len(args) > 1 else None

        if name and (name.endswith(self.getvars) or name.endswith(self.getvarflags) or name in self.containsfuncs or name in self.containsanyfuncs):
            if first is None:
                self._note_missing_argument(name)
            elif isinstance(first, ast.Constant) and isinstance(first.value, str):
                varname = first.value
                if name in self.containsfuncs and isinstance(second, ast.Constant):
                    self.contains.setdefault(varname, set()).add(second.value)
                elif name in self.containsanyfuncs and isinstance(second, ast.Constant):
                    self.contains.setdefault(varname, set()).update(second.value.split())
                elif name.endswith(self.getvarflags):
                    if isinstance(second, ast.Constant):
                        self.references.add('%s[%s]' % (varname, second.value))
                    elif second is None:
                        self._note_missing_argument(name)
                    else:
                        self.warn(node.func, second)
                else:
                    # Plain variable access, or a contains helper whose value
                    # isn't a literal: depend on the whole variable
                    self.references.add(varname)
            else:
                self.warn(node.func, first)
        elif name and name.endswith(".expand"):
            if isinstance(first, ast.Constant):
                value = first.value
                d = bb.data.init()
                parser = d.expandWithRefs(value, self.name)
                self.references |= parser.references
                self.execs |= parser.execs
                for varname in parser.contains:
                    if varname not in self.contains:
                        self.contains[varname] = set()
                    self.contains[varname] |= parser.contains[varname]
        elif name in self.execfuncs:
            if isinstance(first, ast.Constant):
                self.var_execs.add(first.value)
            elif first is None:
                self._note_missing_argument(name)
            else:
                self.warn(node.func, first)
        elif name and isinstance(node.func, (ast.Name, ast.Attribute)):
            self.execs.add(name)

    def called_node_name(self, node):
        """Given a called node, return its original string form

        Returns None when the callee is not a plain dotted name.
        """
        components = []
        while node:
            if isinstance(node, ast.Attribute):
                components.append(node.attr)
                node = node.value
            elif isinstance(node, ast.Name):
                components.append(node.id)
                return '.'.join(reversed(components))
            else:
                break

    def __init__(self, name, log):
        self.name = name
        self.var_execs = set()
        self.contains = {}
        self.execs = set()
        self.references = set()
        self._log = log
        # Defer init as expensive
        self.log = DummyLogger()

        self.unhandled_message = "in call of %s, argument '%s' is not a string literal"
        self.unhandled_message = "while parsing %s, %s" % (name, self.unhandled_message)

    # For the python module code it is expensive to have the function text so it is
    # uses a different fixedhash to cache against. We can take the hit on obtaining the
    # text if it isn't in the cache.
    def parse_python(self, node, lineno=0, filename="<string>", fixedhash=None):
        """Parse python source and fill in references, execs and contains.

        node is the source text, lineno the number of blank lines to pad with
        so reported line numbers match the original file, and filename the
        name handed to compile(). When fixedhash is given it is used as the
        cache key instead of a hash of the text; in that case node may be
        None to request a cache-only lookup, which raises KeyError on a miss.
        """
        if not fixedhash and (not node or not node.strip()):
            return

        if fixedhash:
            h = fixedhash
        else:
            h = bbhash(str(node))

        # Saved cache first, then the entries added by this process
        for cache in (codeparsercache.pythoncache, codeparsercache.pythoncacheextras):
            if h in cache:
                cacheline = cache[h]
                self.references = set(cacheline.refs)
                self.execs = set(cacheline.execs)
                self.contains = {}
                for i in cacheline.contains:
                    self.contains[i] = set(cacheline.contains[i])
                return

        if fixedhash and not node:
            raise KeyError

        # Need to parse so take the hit on the real log buffer
        self.log = BufferedLogger('BitBake.Data.PythonParser', logging.DEBUG, self._log)

        # We can't add to the linenumbers for compile, we can pad to the correct number of blank lines though
        node = "\n" * int(lineno) + node
        code = compile(check_indent(str(node)), filename, "exec",
                       ast.PyCF_ONLY_AST)

        for n in ast.walk(code):
            if isinstance(n, ast.Call):
                self.visit_Call(n)

        self.execs.update(self.var_execs)

        codeparsercache.pythoncacheextras[h] = codeparsercache.newPythonCacheLine(self.references, self.execs, self.contains)

class ShellParser():
    """Collects the external commands executed by a piece of shell code."""
    def __init__(self, name, log):
        self.funcdefs = set()
        self.allexecs = set()
        self.execs = set()
        self._name = name
        self._log = log
        # Defer init as expensive
        self.log = DummyLogger()

        self.unhandled_template = "unable to handle non-literal command '%s'"
        self.unhandled_template = "while parsing %s, %s" % (name, self.unhandled_template)

    def parse_shell(self, value):
        """Parse the supplied shell code in a string, returning the external
        commands it executes.
        """

        h = bbhash(str(value))

        # Saved cache first, then the entries added by this process
        for cache in (codeparsercache.shellcache, codeparsercache.shellcacheextras):
            if h in cache:
                self.execs = set(cache[h].execs)
                return self.execs

        # Need to parse so take the hit on the real log buffer
        self.log = BufferedLogger('BitBake.Data.%s' % self._name, logging.DEBUG, self._log)

        self._parse_shell(value)
        # Functions defined in the code itself are not external commands
        self.execs = set(cmd for cmd in self.allexecs if cmd not in self.funcdefs)

        codeparsercache.shellcacheextras[h] = codeparsercache.newShellCacheLine(self.execs)

        return self.execs

    def _parse_shell(self, value):
        """Parse value with pyshyacc and process the resulting tokens.

        On a parse failure the last five lines of the code are reported via
        bb.error and the exception is re-raised.
        """
        try:
            tokens, _ = pyshyacc.parse(value, eof=True, debug=False)
        except Exception:
            bb.error('Error during parse shell code, the last 5 lines are:\n%s' % '\n'.join(value.split('\n')[-5:]))
            raise

        self.process_tokens(tokens)

    def process_tokens(self, tokens):
        """Process a supplied portion of the syntax tree as returned by
        pyshyacc.parse.
        """

        # Each handler returns (nested tokens to process, words to process)

        def function_definition(value):
            self.funcdefs.add(value.name)
            return [value.body], None

        def case_clause(value):
            # Element 0 of each item in the case is the list of patterns, and
            # Element 1 of each item in the case is the list of commands to be
            # executed when that pattern matches.
            words = chain(*[item[0] for item in value.items])
            cmds  = chain(*[item[1] for item in value.items])
            return cmds, words

        def if_clause(value):
            main = chain(value.cond, value.if_cmds)
            rest = value.else_cmds
            if isinstance(rest, tuple) and rest[0] == "elif":
                return chain(main, if_clause(rest[1]))
            else:
                return chain(main, rest)

        def simple_command(value):
            return None, chain(value.words, (assign[1] for assign in value.assigns))

        token_handlers = {
            "and_or": lambda x: ((x.left, x.right), None),
            "async": lambda x: ([x], None),
            "brace_group": lambda x: (x.cmds, None),
            "for_clause": lambda x: (x.cmds, x.items),
            "function_definition": function_definition,
            "if_clause": lambda x: (if_clause(x), None),
            "pipeline": lambda x: (x.commands, None),
            "redirect_list": lambda x: ([x.cmd], None),
            "subshell": lambda x: (x.cmds, None),
            "while_clause": lambda x: (chain(x.condition, x.cmds), None),
            "until_clause": lambda x: (chain(x.condition, x.cmds), None),
            "simple_command": simple_command,
            "case_clause": case_clause,
        }

        def process_token_list(tokens):
            for token in tokens:
                if isinstance(token, list):
                    process_token_list(token)
                    continue
                name, value = token
                # Only the lookup is guarded, so a KeyError raised inside a
                # handler is not misreported as an unsupported token type
                try:
                    handler = token_handlers[name]
                except KeyError:
                    raise NotImplementedError("Unsupported token type " + name)
                more_tokens, words = handler(value)

                if more_tokens:
                    self.process_tokens(more_tokens)

                if words:
                    self.process_words(words)

        process_token_list(tokens)

    def process_words(self, words):
        """Process a set of 'words' in pyshyacc parlance, which includes
        extraction of executed commands from $() blocks, as well as grabbing
        the command name argument.
        """

        words = list(words)
        # Iterate over a copy as command words may be removed from words below
        for word in list(words):
            wtree = pyshlex.make_wordtree(word[1])
            for part in wtree:
                if not isinstance(part, list):
                    continue

                if part[0] in ('`', '$('):
                    # Command substitution: parse the embedded command
                    command = pyshlex.wordtree_as_string(part[1:-1])
                    self._parse_shell(command)

                    if word[0] in ("cmd_name", "cmd_word"):
                        if word in words:
                            words.remove(word)

        usetoken = False
        for word in words:
            if word[0] in ("cmd_name", "cmd_word") or \
               (usetoken and word[0] == "TOKEN"):
                if "=" in word[1]:
                    # Looks like an assignment; the command may be a later TOKEN
                    usetoken = True
                    continue

                cmd = word[1]
                if cmd.startswith("$"):
                    self.log.debug(self.unhandled_template % cmd)
                elif cmd == "eval":
                    command = " ".join(text for _, text in words[1:])
                    self._parse_shell(command)
                else:
                    self.allexecs.add(cmd)
                break