#
# Copyright BitBake Contributors
#
# SPDX-License-Identifier: GPL-2.0-only
#

"""
BitBake code parser

Parses actual code (i.e. python and shell) for functions and in-line
expressions. Used mainly to determine dependencies on other functions
and variables within the BitBake metadata. Also provides a cache for
this information in order to speed up processing.

(Not to be confused with the code that parses the metadata itself,
see lib/bb/parse/ for that).

NOTE: if you change how the parsers gather information you will almost
certainly need to increment CodeParserCache.CACHE_VERSION below so that
any existing codeparser cache gets invalidated. Additionally you'll need
to increment __cache_version__ in cache.py in order to ensure that old
recipe caches don't trigger "Taskhash mismatch" errors.

"""

import ast
import sys
import codegen
import logging
import inspect
import bb.pysh as pysh
import bb.utils, bb.data
import hashlib
from itertools import chain
from bb.pysh import pyshyacc, pyshlex
from bb.cache import MultiProcessCache

logger = logging.getLogger('BitBake.CodeParser')

def bbhash(s):
    """Return the hexadecimal SHA-256 digest of the string s, encoded as UTF-8."""
    return hashlib.sha256(s.encode("utf-8")).hexdigest()

def check_indent(codestr):
    """If the code is indented, add a top level piece of code to 'remove' the indentation

    Returns codestr unchanged when it has no leading whitespace, when the
    leading whitespace ends with a newline (i.e. the first real line starts
    at column 0), or when it contains nothing but whitespace. Otherwise the
    code is wrapped in an "if 1:" block so that it compiles.
    """

    # Bound the scan by the string length so that an empty or
    # whitespace-only string does not raise IndexError.
    length = len(codestr)
    i = 0
    while i < length and codestr[i] in ("\n", "\t", " "):
        i = i + 1

    # No leading whitespace at all, or nothing except whitespace: there is
    # no indented statement to wrap.
    if i == 0 or i == length:
        return codestr

    if codestr[i-1] == "\t" or codestr[i-1] == " ":
        if codestr[0] == "\n":
            # Since we're adding a line, we need to remove one line of any empty padding
            # to ensure line numbers are correct
            codestr = codestr[1:]
        return "if 1:\n" + codestr

    return codestr

# Maps "namespace.function" to a list of
# [references, execs, var_execs, contains, extra, visitorcode]
modulecode_deps = {}

def add_module_functions(fn, functions, namespace):
    """Record dependency information for the functions of a python module.

    fn is the path of the module source file; its size and mtime are part
    of the key used to cache the parse results. functions maps function
    names to function objects and namespace is the prefix used to build
    the "namespace.function" keys stored in modulecode_deps.

    Functions whose source file is not fn are skipped.
    """
    import os
    fstat = os.stat(fn)
    fixedhash = fn + ":" + str(fstat.st_size) + ":" + str(fstat.st_mtime)
    for f in functions:
        name = "%s.%s" % (namespace, f)
        parser = PythonParser(name, logger)
        try:
            # Try the cache first without fetching the source text; a
            # KeyError means there was no cached entry for this key.
            parser.parse_python(None, filename=fn, lineno=1, fixedhash=fixedhash+f)
            #bb.warn("Cached %s" % f)
        except KeyError:
            targetfn = inspect.getsourcefile(functions[f])
            if fn != targetfn:
                # Skip references to other modules outside this file
                #bb.warn("Skipping %s" % name)
                continue
            lines, lineno = inspect.getsourcelines(functions[f])
            src = "".join(lines)
            parser.parse_python(src, filename=fn, lineno=lineno, fixedhash=fixedhash+f)
            #bb.warn("Not cached %s" % f)
        execs = parser.execs.copy()
        # Expand internal module exec references
        for e in parser.execs:
            if e in functions:
                execs.remove(e)
                execs.add(namespace + "." + e)
        visitorcode = None
        if hasattr(functions[f], 'visitorcode'):
            visitorcode = getattr(functions[f], "visitorcode")
        modulecode_deps[name] = [parser.references.copy(), execs, parser.var_execs.copy(), parser.contains.copy(), parser.extra, visitorcode]
        #bb.warn("%s: %s\nRefs:%s Execs: %s %s %s" % (name, fn, parser.references, parser.execs, parser.var_execs, parser.contains))

def update_module_dependencies(d):
    """Apply each module function's "vardepsexclude" flag to modulecode_deps.

    For every entry in modulecode_deps, the names listed in the
    "vardepsexclude" flag of the same-named variable in d are removed from
    the recorded references, execs and var_execs. The remaining fields are
    kept as they are.
    """
    for mod in modulecode_deps:
        excludes = set((d.getVarFlag(mod, "vardepsexclude") or "").split())
        if excludes:
            deps = modulecode_deps[mod]
            modulecode_deps[mod] = [deps[0] - excludes, deps[1] - excludes, deps[2] - excludes, deps[3], deps[4], deps[5]]

# A custom getstate/setstate using tuples is actually worth 15% cachesize by
# avoiding duplication of the attribute names!
class SetCache(object):
    """Intern frozensets of strings so that equal sets share one object."""

    def __init__(self):
        # Maps each canonical frozenset to itself
        self.setcache = {}

    def internSet(self, items):
        """Return the canonical frozenset holding the interned strings in items.

        The lookup is keyed on the set itself rather than on its hash value,
        so two different sets whose hashes happen to collide are never
        confused with each other.
        """
        s = frozenset(sys.intern(i) for i in items)
        return self.setcache.setdefault(s, s)

codecache = SetCache()

class pythonCacheLine(object):
    """Cached result of parsing one piece of python code."""

    def __init__(self, refs, execs, contains, extra):
        self.refs = codecache.internSet(refs)
        self.execs = codecache.internSet(execs)
        self.contains = {}
        for c in contains:
            self.contains[c] = codecache.internSet(contains[c])
        self.extra = extra

    def __getstate__(self):
        # State is a plain tuple so attribute names are not stored per object
        return (self.refs, self.execs, self.contains, self.extra)

    def __setstate__(self, state):
        (refs, execs, contains, extra) = state
        # Go through __init__ so the restored sets are interned again
        self.__init__(refs, execs, contains, extra)
    def __hash__(self):
        l = (hash(self.refs), hash(self.execs), hash(self.extra))
        # Sort the keys so the hash does not depend on dict ordering
        for c in sorted(self.contains.keys()):
            l = l + (c, hash(self.contains[c]))
        return hash(l)
    def __repr__(self):
        return " ".join([str(self.refs), str(self.execs), str(self.contains)])


class shellCacheLine(object):
    """Cached result of parsing one piece of shell code."""

    def __init__(self, execs):
        self.execs = codecache.internSet(execs)

    def __getstate__(self):
        # NOTE: the parentheses do not create a tuple here; the state is the
        # execs frozenset itself and __setstate__ expects exactly that.
        return (self.execs)

    def __setstate__(self, state):
        (execs) = state
        self.__init__(execs)
    def __hash__(self):
        return hash(self.execs)
    def __repr__(self):
        return str(self.execs)

class CodeParserCache(MultiProcessCache):
    """Cache of python and shell parse results, keyed by hash of the code."""

    cache_file_name = "bb_codeparser.dat"
    # NOTE: you must increment this if you change how the parsers gather information,
    # so that an existing cache gets invalidated. Additionally you'll need
    # to increment __cache_version__ in cache.py in order to ensure that old
    # recipe caches don't trigger "Taskhash mismatch" errors.
    CACHE_VERSION = 14

    def __init__(self):
        MultiProcessCache.__init__(self)
        # Index 0 holds the python data, index 1 the shell data
        self.pythoncache = self.cachedata[0]
        self.shellcache = self.cachedata[1]
        self.pythoncacheextras = self.cachedata_extras[0]
        self.shellcacheextras = self.cachedata_extras[1]

        # To avoid duplication in the codeparser cache, keep
        # a lookup of the cache line objects we already have,
        # keyed by their content
        self.pythoncachelines = {}
        self.shellcachelines = {}

    def newPythonCacheLine(self, refs, execs, contains, extra):
        """Return a pythonCacheLine for the given data, reusing an equal existing one.

        Lines are matched on their actual content rather than on their hash
        value alone, so a hash collision cannot cause an unrelated cache
        line to be returned.
        """
        cacheline = pythonCacheLine(refs, execs, contains, extra)
        key = (cacheline.refs, cacheline.execs, cacheline.extra,
               tuple((c, cacheline.contains[c]) for c in sorted(cacheline.contains)))
        return self.pythoncachelines.setdefault(key, cacheline)

    def newShellCacheLine(self, execs):
        """Return a shellCacheLine for execs, reusing an equal existing one.

        Lines are matched on their execs set rather than on its hash value
        alone, so a hash collision cannot cause an unrelated cache line to
        be returned.
        """
        cacheline = shellCacheLine(execs)
        return self.shellcachelines.setdefault(cacheline.execs, cacheline)

    def init_cache(self, cachedir):
        """Load the cache from cachedir unless the python cache already has data."""
        # Check if we already have the caches
        if self.pythoncache:
            return

        MultiProcessCache.init_cache(self, cachedir)

        # cachedata gets re-assigned in the parent
        self.pythoncache = self.cachedata[0]
        self.shellcache = self.cachedata[1]

    def create_cachedata(self):
        """Return empty cache data: one dict for python, one for shell."""
        data = [{}, {}]
        return data

codeparsercache = CodeParserCache()

def parser_cache_init(cachedir):
    """Initialise the module level codeparser cache from cachedir."""
    codeparsercache.init_cache(cachedir)

def parser_cache_save():
    """Call save_extras() on the module level codeparser cache."""
    codeparsercache.save_extras()

def parser_cache_savemerge():
    """Call save_merge() on the module level codeparser cache."""
    codeparsercache.save_merge()

Logger = logging.getLoggerClass()
class BufferedLogger(Logger):
    """Logger which stores records until flush() passes them on to target."""

    def __init__(self, name, level=0, target=None):
        Logger.__init__(self, name)
        self.setLevel(level)
        self.buffer = []
        self.target = target

    def handle(self, record):
        # Hold on to the record instead of emitting it
        self.buffer.append(record)

    def flush(self):
        """Hand the buffered records to target if it is enabled for their level."""
        for record in self.buffer:
            if self.target.isEnabledFor(record.levelno):
                self.target.handle(record)
        self.buffer = []

class DummyLogger():
    """Stand-in log object used until a BufferedLogger is actually needed."""

    def flush(self):
        return

class PythonParser():
    """Collect the variable references and executed functions of python code.

    After parse_python(), the results are available as the references,
    execs, var_execs, contains and extra attributes.
    """

    # Calls ending in (or, for the contains functions, equal to) these names
    # take a variable name as their first argument
    getvars = (".getVar", ".appendVar", ".prependVar", "oe.utils.conditional")
    getvarflags = (".getVarFlag", ".appendVarFlag", ".prependVarFlag")
    containsfuncs = ("bb.utils.contains", "base_contains")
    containsanyfuncs = ("bb.utils.contains_any", "bb.utils.filter")
    execfuncs = ("bb.build.exec_func", "bb.build.exec_task")

    def warn(self, func, arg):
        """Warn about calls of bitbake APIs which pass a non-literal
        argument for the variable name, as we're not able to track such
        a reference.
        """

        try:
            funcstr = codegen.to_source(func)
            argstr = codegen.to_source(arg)
        except TypeError:
            self.log.debug2('Failed to convert function and argument to source form')
        else:
            self.log.debug(self.unhandled_message % (funcstr, argstr))

    def visit_Call(self, node):
        """Record the dependency information implied by one ast.Call node."""
        name = self.called_node_name(node.func)
        if name and name in modulecode_deps and modulecode_deps[name][5]:
            # The called module function supplies its own visitor code
            visitorcode = modulecode_deps[name][5]
            contains, execs, warn = visitorcode(name, node.args)
            for i in contains:
                self.contains[i] = contains[i]
            self.execs |= execs
            if warn:
                self.warn(node.func, warn)
        elif name and (name.endswith(self.getvars) or name.endswith(self.getvarflags) or name in self.containsfuncs or name in self.containsanyfuncs):
            if isinstance(node.args[0], ast.Constant) and isinstance(node.args[0].value, str):
                varname = node.args[0].value
                if name in self.containsfuncs and isinstance(node.args[1], ast.Constant):
                    if varname not in self.contains:
                        self.contains[varname] = set()
                    self.contains[varname].add(node.args[1].value)
                elif name in self.containsanyfuncs and isinstance(node.args[1], ast.Constant):
                    if varname not in self.contains:
                        self.contains[varname] = set()
                    # Each whitespace separated item is recorded separately
                    self.contains[varname].update(node.args[1].value.split())
                elif name.endswith(self.getvarflags):
                    if isinstance(node.args[1], ast.Constant):
                        self.references.add('%s[%s]' % (varname, node.args[1].value))
                    else:
                        self.warn(node.func, node.args[1])
                else:
                    self.references.add(varname)
            else:
                self.warn(node.func, node.args[0])
        elif name and name.endswith(".expand"):
            if isinstance(node.args[0], ast.Constant):
                # Expand the literal in a fresh datastore and merge in
                # whatever the expansion referenced
                value = node.args[0].value
                d = bb.data.init()
                parser = d.expandWithRefs(value, self.name)
                self.references |= parser.references
                self.execs |= parser.execs
                for varname in parser.contains:
                    if varname not in self.contains:
                        self.contains[varname] = set()
                    self.contains[varname] |= parser.contains[varname]
        elif name in self.execfuncs:
            if isinstance(node.args[0], ast.Constant):
                self.var_execs.add(node.args[0].value)
            else:
                self.warn(node.func, node.args[0])
        elif name and isinstance(node.func, (ast.Name, ast.Attribute)):
            self.execs.add(name)

    def called_node_name(self, node):
        """Given a called node, return its original string form

        Returns the dotted name for a chain of attribute accesses ending in
        a plain name, and None for anything else.
        """
        components = []
        while node:
            if isinstance(node, ast.Attribute):
                components.append(node.attr)
                node = node.value
            elif isinstance(node, ast.Name):
                components.append(node.id)
                return '.'.join(reversed(components))
            else:
                break

    def __init__(self, name, log):
        self.name = name
        self.var_execs = set()
        self.contains = {}
        self.execs = set()
        self.references = set()
        # Defined up front so the attribute exists even when parse_python()
        # returns early without parsing anything
        self.extra = None
        self._log = log
        # Defer init as expensive
        self.log = DummyLogger()

        self.unhandled_message = "in call of %s, argument '%s' is not a string literal"
        self.unhandled_message = "while parsing %s, %s" % (name, self.unhandled_message)

    # For the python module code it is expensive to have the function text so it
    # uses a different fixedhash to cache against. We can take the hit on obtaining the
    # text if it isn't in the cache.
    def parse_python(self, node, lineno=0, filename="<string>", fixedhash=None):
        """Parse the python code in the string node and record its dependencies.

        The results are looked up in, and stored to, the codeparser cache
        under fixedhash if given, otherwise under the hash of node.

        Raises KeyError when fixedhash is given, nothing is cached for it
        and node is empty, i.e. the caller needs to supply the code.
        """
        if not fixedhash and (not node or not node.strip()):
            return

        if fixedhash:
            h = fixedhash
        else:
            h = bbhash(str(node))

        if h in codeparsercache.pythoncache:
            self.references = set(codeparsercache.pythoncache[h].refs)
            self.execs = set(codeparsercache.pythoncache[h].execs)
            self.contains = {}
            for i in codeparsercache.pythoncache[h].contains:
                self.contains[i] = set(codeparsercache.pythoncache[h].contains[i])
            self.extra = codeparsercache.pythoncache[h].extra
            return

        if h in codeparsercache.pythoncacheextras:
            self.references = set(codeparsercache.pythoncacheextras[h].refs)
            self.execs = set(codeparsercache.pythoncacheextras[h].execs)
            self.contains = {}
            for i in codeparsercache.pythoncacheextras[h].contains:
                self.contains[i] = set(codeparsercache.pythoncacheextras[h].contains[i])
            self.extra = codeparsercache.pythoncacheextras[h].extra
            return

        if fixedhash and not node:
            raise KeyError

        # Need to parse so take the hit on the real log buffer
        self.log = BufferedLogger('BitBake.Data.PythonParser', logging.DEBUG, self._log)

        # We can't add to the linenumbers for compile, we can pad to the correct number of blank lines though
        node = "\n" * int(lineno) + node
        code = compile(check_indent(str(node)), filename, "exec",
                       ast.PyCF_ONLY_AST)

        for n in ast.walk(code):
            if n.__class__.__name__ == "Call":
                self.visit_Call(n)

        self.execs.update(self.var_execs)
        self.extra = None
        if fixedhash:
            # The cache key does not depend on the code text in this case, so
            # keep a hash of the (line padded) text alongside the results
            self.extra = bbhash(str(node))

        codeparsercache.pythoncacheextras[h] = codeparsercache.newPythonCacheLine(self.references, self.execs, self.contains, self.extra)

class ShellParser():
    """Collect the commands executed by a piece of shell code."""

    def __init__(self, name, log):
        self.funcdefs = set()
        self.allexecs = set()
        self.execs = set()
        self._name = name
        self._log = log
        # Defer init as expensive
        self.log = DummyLogger()

        self.unhandled_template = "unable to handle non-literal command '%s'"
        self.unhandled_template = "while parsing %s, %s" % (name, self.unhandled_template)

    def parse_shell(self, value):
        """Parse the supplied shell code in a string, returning the external
        commands it executes.
        """

        h = bbhash(str(value))

        if h in codeparsercache.shellcache:
            self.execs = set(codeparsercache.shellcache[h].execs)
            return self.execs

        if h in codeparsercache.shellcacheextras:
            self.execs = set(codeparsercache.shellcacheextras[h].execs)
            return self.execs

        # Need to parse so take the hit on the real log buffer
        self.log = BufferedLogger('BitBake.Data.%s' % self._name, logging.DEBUG, self._log)

        self._parse_shell(value)
        # Functions defined within the code itself are not external commands
        self.execs = set(cmd for cmd in self.allexecs if cmd not in self.funcdefs)

        codeparsercache.shellcacheextras[h] = codeparsercache.newShellCacheLine(self.execs)

        return self.execs

    def _parse_shell(self, value):
        """Parse the shell code in value and process the resulting tokens.

        Any exception from the parser is re-raised after logging the last
        five lines of the code.
        """
        try:
            tokens, _ = pyshyacc.parse(value, eof=True, debug=False)
        except Exception:
            bb.error('Error during parse shell code, the last 5 lines are:\n%s' % '\n'.join(value.split('\n')[-5:]))
            raise

        self.process_tokens(tokens)

    def process_tokens(self, tokens):
        """Process a supplied portion of the syntax tree as returned by
        pyshyacc.parse.
        """

        # Each handler returns a (more_tokens, words) pair: tokens to be
        # processed recursively and words to be passed to process_words().

        def function_definition(value):
            self.funcdefs.add(value.name)
            return [value.body], None

        def case_clause(value):
            # Element 0 of each item in the case is the list of patterns, and
            # Element 1 of each item in the case is the list of commands to be
            # executed when that pattern matches.
            words = chain(*[item[0] for item in value.items])
            cmds = chain(*[item[1] for item in value.items])
            return cmds, words

        def if_clause(value):
            main = chain(value.cond, value.if_cmds)
            rest = value.else_cmds
            if isinstance(rest, tuple) and rest[0] == "elif":
                return chain(main, if_clause(rest[1]))
            else:
                return chain(main, rest)

        def simple_command(value):
            return None, chain(value.words, (assign[1] for assign in value.assigns))

        token_handlers = {
            "and_or": lambda x: ((x.left, x.right), None),
            "async": lambda x: ([x], None),
            "brace_group": lambda x: (x.cmds, None),
            "for_clause": lambda x: (x.cmds, x.items),
            "function_definition": function_definition,
            "if_clause": lambda x: (if_clause(x), None),
            "pipeline": lambda x: (x.commands, None),
            "redirect_list": lambda x: ([x.cmd], None),
            "subshell": lambda x: (x.cmds, None),
            "while_clause": lambda x: (chain(x.condition, x.cmds), None),
            "until_clause": lambda x: (chain(x.condition, x.cmds), None),
            "simple_command": simple_command,
            "case_clause": case_clause,
        }

        def process_token_list(tokens):
            for token in tokens:
                if isinstance(token, list):
                    process_token_list(token)
                    continue
                name, value = token
                try:
                    more_tokens, words = token_handlers[name](value)
                except KeyError:
                    raise NotImplementedError("Unsupported token type " + name)

                if more_tokens:
                    self.process_tokens(more_tokens)

                if words:
                    self.process_words(words)

        process_token_list(tokens)

    def process_words(self, words):
        """Process a set of 'words' in pyshyacc parlance, which includes
        extraction of executed commands from $() blocks, as well as grabbing
        the command name argument.
        """

        words = list(words)
        # NOTE(review): entries may be removed from words below while this
        # loop is iterating over it, which makes the loop skip the entry
        # following a removed one. Left as is since changing it would change
        # the parse results - confirm whether this is intended.
        for word in words:
            wtree = pyshlex.make_wordtree(word[1])
            for part in wtree:
                if not isinstance(part, list):
                    continue

                candidates = [part]

                # If command is of type:
                #
                #   var="... $(cmd [...]) ..."
                #
                # Then iterate on what's between the quotes and if we find a
                # list, make that what we check for below.
                if len(part) >= 3 and part[0] == '"':
                    for p in part[1:-1]:
                        if isinstance(p, list):
                            candidates.append(p)

                for candidate in candidates:
                    if len(candidate) >= 2:
                        if candidate[0] in ('`', '$('):
                            command = pyshlex.wordtree_as_string(candidate[1:-1])
                            self._parse_shell(command)

                            if word[0] in ("cmd_name", "cmd_word"):
                                if word in words:
                                    words.remove(word)

        usetoken = False
        for word in words:
            if word[0] in ("cmd_name", "cmd_word") or \
               (usetoken and word[0] == "TOKEN"):
                if "=" in word[1]:
                    # Word contains an "=", so allow a following TOKEN word
                    # to be taken as the command instead
                    usetoken = True
                    continue

                cmd = word[1]
                if cmd.startswith("$"):
                    self.log.debug(self.unhandled_template % cmd)
                elif cmd == "eval":
                    # Parse everything after the first word as shell code
                    command = " ".join(word for _, word in words[1:])
                    self._parse_shell(command)
                else:
                    self.allexecs.add(cmd)
                break