#
# Copyright BitBake Contributors
#
# SPDX-License-Identifier: GPL-2.0-only
#

"""
BitBake code parser

Parses actual code (i.e. python and shell) for functions and in-line
expressions. Used mainly to determine dependencies on other functions
and variables within the BitBake metadata. Also provides a cache for
this information in order to speed up processing.

(Not to be confused with the code that parses the metadata itself,
see lib/bb/parse/ for that).

NOTE: if you change how the parsers gather information you will almost
certainly need to increment CodeParserCache.CACHE_VERSION below so that
any existing codeparser cache gets invalidated. Additionally you'll need
to increment __cache_version__ in cache.py in order to ensure that old
recipe caches don't trigger "Taskhash mismatch" errors.

"""

import ast
import sys
import codegen
import logging
import inspect
import bb.pysh as pysh
import bb.utils, bb.data
import hashlib
from itertools import chain
from bb.pysh import pyshyacc, pyshlex
from bb.cache import MultiProcessCache

logger = logging.getLogger('BitBake.CodeParser')

def bbhash(s):
    """Return the hexadecimal SHA-256 digest of the string s (encoded as UTF-8)."""
    return hashlib.sha256(s.encode("utf-8")).hexdigest()

def check_indent(codestr):
    """If the code is indented, add a top level piece of code to 'remove' the indentation

    The leading run of newlines, tabs and spaces is examined. If it ends in
    a tab or a space (i.e. the first real line is indented) the code is
    wrapped in an "if 1:" block so that it compiles. Otherwise, and also for
    empty or whitespace-only input, codestr is returned unchanged.
    """

    stripped = codestr.lstrip("\n\t ")
    if not stripped:
        # Empty or whitespace-only input: there is no statement to indent.
        # (Scanning character by character would run off the end of the string.)
        return codestr

    # Index of the first character which is not a newline, tab or space
    i = len(codestr) - len(stripped)
    if i == 0:
        return codestr

    if codestr[i-1] == "\t" or codestr[i-1] == " ":
        if codestr[0] == "\n":
            # Since we're adding a line, we need to remove one line of any empty padding
            # to ensure line numbers are correct
            codestr = codestr[1:]
        return "if 1:\n" + codestr

    return codestr

# Maps "<namespace>.<function name>" to a list of
# [references, execs, var_execs, contains, extra, visitorcode]
# as filled in by add_module_functions() below.
modulecode_deps = {}

def add_module_functions(fn, functions, namespace):
    """Record the dependency information of python module functions in modulecode_deps

    fn is the path of the file the functions are expected to come from,
    functions maps function names to the function objects and namespace is
    the prefix used to build the "<namespace>.<name>" keys.

    The cache key of each function is built from the file name, size and
    mtime plus the function name, so the source text is only fetched (through
    inspect) when there is no cache entry for that key. Functions whose source
    file cannot be determined, or is a different file from fn, are skipped.
    """
    import os
    fstat = os.stat(fn)
    fixedhash = fn + ":" + str(fstat.st_size) + ":" + str(fstat.st_mtime)
    for f in functions:
        name = "%s.%s" % (namespace, f)
        parser = PythonParser(name, logger)
        try:
            # With no source text supplied, parse_python() raises KeyError when
            # there is no cached entry for the fixed hash
            parser.parse_python(None, filename=fn, lineno=1, fixedhash=fixedhash+f, func=functions[f])
            #bb.warn("Cached %s" % f)
        except KeyError:
            try:
                targetfn = inspect.getsourcefile(functions[f])
            except TypeError:
                # Builtin
                continue
            if fn != targetfn:
                # Skip references to other modules outside this file
                #bb.warn("Skipping %s" % name)
                continue
            try:
                lines, lineno = inspect.getsourcelines(functions[f])
            except TypeError:
                # Builtin
                continue
            src = "".join(lines)
            parser.parse_python(src, filename=fn, lineno=lineno, fixedhash=fixedhash+f, func=functions[f])
            #bb.warn("Not cached %s" % f)
        execs = parser.execs.copy()
        # Expand internal module exec references
        for e in parser.execs:
            if e in functions:
                execs.remove(e)
                execs.add(namespace + "." + e)
        visitorcode = None
        if hasattr(functions[f], 'visitorcode'):
            visitorcode = getattr(functions[f], "visitorcode")
        modulecode_deps[name] = [parser.references.copy(), execs, parser.var_execs.copy(), parser.contains.copy(), parser.extra, visitorcode]
        #bb.warn("%s: %s\nRefs:%s Execs: %s %s %s" % (name, fn, parser.references, parser.execs, parser.var_execs, parser.contains))

def update_module_dependencies(d):
    """Remove the names listed in each module function's "vardepsexclude" flag

    For every entry in modulecode_deps the flag is read from d and the
    whitespace separated names it lists are removed from the first three
    items of the entry (references, execs and var_execs). The remaining
    items are kept as they are.
    """
    for mod in modulecode_deps:
        excludes = set((d.getVarFlag(mod, "vardepsexclude") or "").split())
        if excludes:
            modulecode_deps[mod] = [modulecode_deps[mod][0] - excludes, modulecode_deps[mod][1] - excludes, modulecode_deps[mod][2] - excludes, modulecode_deps[mod][3], modulecode_deps[mod][4], modulecode_deps[mod][5]]

# A custom getstate/setstate using tuples is actually worth 15% cachesize by
# avoiding duplication of the attribute names!
class SetCache(object):
    """Intern sets of strings so that equal sets share a single frozenset object"""
    def __init__(self):
        self.setcache = {}

    def internSet(self, items):
        """Return the shared frozenset holding the interned strings in items"""
        s = frozenset(sys.intern(i) for i in items)
        # Key on the frozenset itself rather than on its hash() so that two
        # different sets whose hashes collide are never mistaken for each other
        return self.setcache.setdefault(s, s)

codecache = SetCache()

class pythonCacheLine(object):
    """Cached result of parsing a piece of python code

    refs and execs are stored as interned frozensets, contains as a dict of
    interned frozensets and extra is stored as given.
    """
    def __init__(self, refs, execs, contains, extra):
        self.refs = codecache.internSet(refs)
        self.execs = codecache.internSet(execs)
        self.contains = {}
        for c in contains:
            self.contains[c] = codecache.internSet(contains[c])
        self.extra = extra

    def __getstate__(self):
        # State is a plain tuple, this avoids storing the attribute names
        return (self.refs, self.execs, self.contains, self.extra)

    def __setstate__(self, state):
        (refs, execs, contains, extra) = state
        # Going through __init__ re-interns the sets
        self.__init__(refs, execs, contains, extra)
    def __hash__(self):
        l = (hash(self.refs), hash(self.execs), hash(self.extra))
        # Sorted so the result doesn't depend on the dict ordering
        for c in sorted(self.contains.keys()):
            l = l + (c, hash(self.contains[c]))
        return hash(l)
    def __repr__(self):
        return " ".join([str(self.refs), str(self.execs), str(self.contains)])


class shellCacheLine(object):
    """Cached result of parsing a piece of shell code (the set of commands it executes)"""
    def __init__(self, execs):
        self.execs = codecache.internSet(execs)

    def __getstate__(self):
        # Note the parentheses don't make this a tuple, the state is the
        # frozenset itself. Left alone as it defines the stored cache format.
        return (self.execs)

    def __setstate__(self, state):
        (execs) = state
        self.__init__(execs)
    def __hash__(self):
        return hash(self.execs)
    def __repr__(self):
        return str(self.execs)

class CodeParserCache(MultiProcessCache):
    """Cache of python and shell parsing results, keyed by a hash of the code

    Index 0 of the cache data holds the python entries, index 1 the shell
    entries. The "extras" dicts are where the parsers store new entries.
    """
    cache_file_name = "bb_codeparser.dat"
    # NOTE: you must increment this if you change how the parsers gather information,
    # so that an existing cache gets invalidated. Additionally you'll need
    # to increment __cache_version__ in cache.py in order to ensure that old
    # recipe caches don't trigger "Taskhash mismatch" errors.
    CACHE_VERSION = 14

    def __init__(self):
        MultiProcessCache.__init__(self)
        self.pythoncache = self.cachedata[0]
        self.shellcache = self.cachedata[1]
        self.pythoncacheextras = self.cachedata_extras[0]
        self.shellcacheextras = self.cachedata_extras[1]

        # To avoid duplication in the codeparser cache, keep
        # a lookup of hashes of objects we already have
        self.pythoncachelines = {}
        self.shellcachelines = {}

    def newPythonCacheLine(self, refs, execs, contains, extra):
        """Return a pythonCacheLine for the data, reusing an identical existing one if possible"""
        cacheline = pythonCacheLine(refs, execs, contains, extra)
        h = hash(cacheline)
        existing = self.pythoncachelines.get(h)
        # Compare the contents as well as the hash, a hash collision must not
        # hand back the dependency data of some unrelated piece of code
        if existing is not None and existing.__getstate__() == cacheline.__getstate__():
            return existing
        self.pythoncachelines[h] = cacheline
        return cacheline

    def newShellCacheLine(self, execs):
        """Return a shellCacheLine for execs, reusing an identical existing one if possible"""
        cacheline = shellCacheLine(execs)
        h = hash(cacheline)
        existing = self.shellcachelines.get(h)
        # As above, don't trust the hash on its own
        if existing is not None and existing.execs == cacheline.execs:
            return existing
        self.shellcachelines[h] = cacheline
        return cacheline

    def init_cache(self, cachedir):
        """Initialise the cache from cachedir unless python cache data is already present"""
        # Check if we already have the caches
        if self.pythoncache:
            return

        MultiProcessCache.init_cache(self, cachedir)

        # cachedata gets re-assigned in the parent
        self.pythoncache = self.cachedata[0]
        self.shellcache = self.cachedata[1]

    def create_cachedata(self):
        """Return the empty cache structure: [python entries, shell entries]"""
        data = [{}, {}]
        return data

codeparsercache = CodeParserCache()

def parser_cache_init(cachedir):
    """Initialise the module level codeparser cache from cachedir"""
    codeparsercache.init_cache(cachedir)

def parser_cache_save():
    """Call save_extras() on the module level codeparser cache"""
    codeparsercache.save_extras()

def parser_cache_savemerge():
    """Call save_merge() on the module level codeparser cache"""
    codeparsercache.save_merge()

Logger = logging.getLoggerClass()
class BufferedLogger(Logger):
    """Logger which holds on to its records until flush() passes them to target"""
    def __init__(self, name, level=0, target=None):
        Logger.__init__(self, name)
        self.setLevel(level)
        self.buffer = []
        self.target = target

    def handle(self, record):
        # Store the record rather than emitting it
        self.buffer.append(record)

    def flush(self):
        """Hand the buffered records to target (if it is enabled for them) and empty the buffer"""
        # target defaults to None, in which case there is nowhere to send the records
        if self.target is not None:
            for record in self.buffer:
                if self.target.isEnabledFor(record.levelno):
                    self.target.handle(record)
        self.buffer = []

class DummyLogger():
    """Stand-in used until a parser really needs to log, only provides a no-op flush()"""
    def flush(self):
        return

class PythonParser():
    """Collects the variables referenced and the functions executed by a piece of python code

    After parse_python() the results are available as the references, execs,
    var_execs and contains attributes.
    """
    getvars = (".getVar", ".appendVar", ".prependVar", "oe.utils.conditional")
    getvarflags = (".getVarFlag", ".appendVarFlag", ".prependVarFlag")
    containsfuncs = ("bb.utils.contains", "base_contains")
    containsanyfuncs = ("bb.utils.contains_any", "bb.utils.filter")
    execfuncs = ("bb.build.exec_func", "bb.build.exec_task")

    def warn(self, func, arg):
        """Warn about calls of bitbake APIs which pass a non-literal
        argument for the variable name, as we're not able to track such
        a reference.
        """

        try:
            funcstr = codegen.to_source(func)
            argstr = codegen.to_source(arg)
        except TypeError:
            self.log.debug2('Failed to convert function and argument to source form')
        else:
            self.log.debug(self.unhandled_message % (funcstr, argstr))

    def visit_Call(self, node):
        """Record the dependency information implied by a single ast.Call node

        Calls which are missing the positional argument(s) this method needs
        to look at are skipped, there is nothing in them which can be tracked.
        """
        name = self.called_node_name(node.func)
        args = node.args
        # The arguments may be absent (e.g. passed by keyword), never index blindly
        first = args[0] if args else None
        second = args[1] if len(args) > 1 else None

        if name and name in modulecode_deps and modulecode_deps[name][5]:
            # The module function supplied its own visitor for calls to it
            visitorcode = modulecode_deps[name][5]
            contains, execs, warn = visitorcode(name, node.args)
            for i in contains:
                self.contains[i] = contains[i]
            self.execs |= execs
            if warn:
                self.warn(node.func, warn)
        elif name and (name.endswith(self.getvars) or name.endswith(self.getvarflags) or name in self.containsfuncs or name in self.containsanyfuncs):
            if first is None:
                # No positional variable name to look at
                pass
            elif isinstance(first, ast.Constant) and isinstance(first.value, str):
                varname = first.value
                # isinstance() is False for a missing (None) second argument so
                # such calls end up in the final branches below
                if name in self.containsfuncs and isinstance(second, ast.Constant):
                    if varname not in self.contains:
                        self.contains[varname] = set()
                    self.contains[varname].add(second.value)
                elif name in self.containsanyfuncs and isinstance(second, ast.Constant):
                    if varname not in self.contains:
                        self.contains[varname] = set()
                    self.contains[varname].update(second.value.split())
                elif name.endswith(self.getvarflags):
                    if isinstance(second, ast.Constant):
                        self.references.add('%s[%s]' % (varname, second.value))
                    elif second is not None:
                        self.warn(node.func, second)
                else:
                    self.references.add(varname)
            else:
                self.warn(node.func, first)
        elif name and name.endswith(".expand"):
            if isinstance(first, ast.Constant):
                value = first.value
                d = bb.data.init()
                parser = d.expandWithRefs(value, self.name)
                self.references |= parser.references
                self.execs |= parser.execs
                for varname in parser.contains:
                    if varname not in self.contains:
                        self.contains[varname] = set()
                    self.contains[varname] |= parser.contains[varname]
        elif name in self.execfuncs:
            if isinstance(first, ast.Constant):
                self.var_execs.add(first.value)
            elif first is not None:
                self.warn(node.func, first)
        elif name and isinstance(node.func, (ast.Name, ast.Attribute)):
            self.execs.add(name)

    def called_node_name(self, node):
        """Given a called node, return its original string form

        Only chains of attribute accesses ending in a plain name are handled,
        None is returned for anything else.
        """
        components = []
        while node:
            if isinstance(node, ast.Attribute):
                components.append(node.attr)
                node = node.value
            elif isinstance(node, ast.Name):
                components.append(node.id)
                return '.'.join(reversed(components))
            else:
                break

    def __init__(self, name, log):
        """name is used in log messages and when expanding, log is the logger records are flushed to"""
        self.name = name
        self.var_execs = set()
        self.contains = {}
        self.execs = set()
        self.references = set()
        # Set by parse_python(), defined here so the attribute always exists
        self.extra = None
        self._log = log
        # Defer init as expensive
        self.log = DummyLogger()

        self.unhandled_message = "in call of %s, argument '%s' is not a string literal"
        self.unhandled_message = "while parsing %s, %s" % (name, self.unhandled_message)

    # For the python module code it is expensive to have the function text so it
    # uses a different fixedhash to cache against. We can take the hit on obtaining the
    # text if it isn't in the cache.
    def parse_python(self, node, lineno=0, filename="<string>", fixedhash=None, func=None):
        """Parse the python code in the string node, using the cache where possible

        The cache key is fixedhash if given, else a hash of the code. lineno
        and filename are used when compiling so positions match the original
        file. If func is given, its bb_vardeps/bb_vardepsexclude attributes
        (where present) are added to/removed from the references.

        Raises KeyError if fixedhash is given without any code and there is
        no cached entry for it.
        """
        if not fixedhash and (not node or not node.strip()):
            return

        if fixedhash:
            h = fixedhash
        else:
            h = bbhash(str(node))

        if h in codeparsercache.pythoncache:
            # Copy the data so the cached frozensets are never modified
            self.references = set(codeparsercache.pythoncache[h].refs)
            self.execs = set(codeparsercache.pythoncache[h].execs)
            self.contains = {}
            for i in codeparsercache.pythoncache[h].contains:
                self.contains[i] = set(codeparsercache.pythoncache[h].contains[i])
            self.extra = codeparsercache.pythoncache[h].extra
            return

        if h in codeparsercache.pythoncacheextras:
            self.references = set(codeparsercache.pythoncacheextras[h].refs)
            self.execs = set(codeparsercache.pythoncacheextras[h].execs)
            self.contains = {}
            for i in codeparsercache.pythoncacheextras[h].contains:
                self.contains[i] = set(codeparsercache.pythoncacheextras[h].contains[i])
            self.extra = codeparsercache.pythoncacheextras[h].extra
            return

        if fixedhash and not node:
            # Not cached and no code was supplied, the caller needs to fetch it
            raise KeyError

        # Need to parse so take the hit on the real log buffer
        self.log = BufferedLogger('BitBake.Data.PythonParser', logging.DEBUG, self._log)

        # We can't add to the linenumbers for compile, we can pad to the correct number of blank lines though
        node = "\n" * int(lineno) + node
        code = compile(check_indent(str(node)), filename, "exec",
                       ast.PyCF_ONLY_AST)

        for n in ast.walk(code):
            if n.__class__.__name__ == "Call":
                self.visit_Call(n)

        if func is not None:
            self.references |= getattr(func, "bb_vardeps", set())
            self.references -= getattr(func, "bb_vardepsexclude", set())

        self.execs.update(self.var_execs)
        self.extra = None
        if fixedhash:
            # Hash of the (padded) code which was actually parsed
            self.extra = bbhash(str(node))

        codeparsercache.pythoncacheextras[h] = codeparsercache.newPythonCacheLine(self.references, self.execs, self.contains, self.extra)

class ShellParser():
    """Collects the commands executed by a piece of shell code"""
    def __init__(self, name, log):
        """name is used in log messages, log is the logger records are flushed to"""
        self.funcdefs = set()
        self.allexecs = set()
        self.execs = set()
        self._name = name
        self._log = log
        # Defer init as expensive
        self.log = DummyLogger()

        self.unhandled_template = "unable to handle non-literal command '%s'"
        self.unhandled_template = "while parsing %s, %s" % (name, self.unhandled_template)

    def parse_shell(self, value):
        """Parse the supplied shell code in a string, returning the external
        commands it executes.
        """

        h = bbhash(str(value))

        if h in codeparsercache.shellcache:
            self.execs = set(codeparsercache.shellcache[h].execs)
            return self.execs

        if h in codeparsercache.shellcacheextras:
            self.execs = set(codeparsercache.shellcacheextras[h].execs)
            return self.execs

        # Need to parse so take the hit on the real log buffer
        self.log = BufferedLogger('BitBake.Data.%s' % self._name, logging.DEBUG, self._log)

        self._parse_shell(value)
        # Functions defined within the code itself aren't external commands
        self.execs = set(cmd for cmd in self.allexecs if cmd not in self.funcdefs)

        codeparsercache.shellcacheextras[h] = codeparsercache.newShellCacheLine(self.execs)

        return self.execs

    def _parse_shell(self, value):
        """Parse value with pyshyacc and process the resulting tokens

        If parsing fails, the last five lines of value are reported through
        bb.error() and the exception is re-raised.
        """
        try:
            tokens, _ = pyshyacc.parse(value, eof=True, debug=False)
        except Exception:
            bb.error('Error during parse shell code, the last 5 lines are:\n%s' % '\n'.join(value.split('\n')[-5:]))
            raise

        self.process_tokens(tokens)

    def process_tokens(self, tokens):
        """Process a supplied portion of the syntax tree as returned by
        pyshyacc.parse.
        """

        # Each handler returns a (more tokens to process, words to process)
        # pair for its token type, either of which may be None

        def function_definition(value):
            self.funcdefs.add(value.name)
            return [value.body], None

        def case_clause(value):
            # Element 0 of each item in the case is the list of patterns, and
            # Element 1 of each item in the case is the list of commands to be
            # executed when that pattern matches.
            words = chain(*[item[0] for item in value.items])
            cmds = chain(*[item[1] for item in value.items])
            return cmds, words

        def if_clause(value):
            main = chain(value.cond, value.if_cmds)
            rest = value.else_cmds
            if isinstance(rest, tuple) and rest[0] == "elif":
                return chain(main, if_clause(rest[1]))
            else:
                return chain(main, rest)

        def simple_command(value):
            return None, chain(value.words, (assign[1] for assign in value.assigns))

        token_handlers = {
            "and_or": lambda x: ((x.left, x.right), None),
            "async": lambda x: ([x], None),
            "brace_group": lambda x: (x.cmds, None),
            "for_clause": lambda x: (x.cmds, x.items),
            "function_definition": function_definition,
            "if_clause": lambda x: (if_clause(x), None),
            "pipeline": lambda x: (x.commands, None),
            "redirect_list": lambda x: ([x.cmd], None),
            "subshell": lambda x: (x.cmds, None),
            "while_clause": lambda x: (chain(x.condition, x.cmds), None),
            "until_clause": lambda x: (chain(x.condition, x.cmds), None),
            "simple_command": simple_command,
            "case_clause": case_clause,
        }

        def process_token_list(tokens):
            for token in tokens:
                if isinstance(token, list):
                    process_token_list(token)
                    continue
                name, value = token
                try:
                    more_tokens, words = token_handlers[name](value)
                except KeyError:
                    raise NotImplementedError("Unsupported token type " + name)

                if more_tokens:
                    self.process_tokens(more_tokens)

                if words:
                    self.process_words(words)

        process_token_list(tokens)

    def process_words(self, words):
        """Process a set of 'words' in pyshyacc parlance, which includes
        extraction of executed commands from $() blocks, as well as grabbing
        the command name argument.
        """

        words = list(words)
        # NOTE(review): words is modified (see words.remove() below) while this
        # loop iterates over it, so the entry following a removed word is not
        # visited by this loop. Left as is since changing which commands get
        # found would need the cache versions bumping - confirm if intended.
        for word in words:
            wtree = pyshlex.make_wordtree(word[1])
            for part in wtree:
                if not isinstance(part, list):
                    continue

                candidates = [part]

                # If command is of type:
                #
                #   var="... $(cmd [...]) ..."
                #
                # Then iterate on what's between the quotes and if we find a
                # list, make that what we check for below.
                if len(part) >= 3 and part[0] == '"':
                    for p in part[1:-1]:
                        if isinstance(p, list):
                            candidates.append(p)

                for candidate in candidates:
                    if len(candidate) >= 2:
                        if candidate[0] in ('`', '$('):
                            command = pyshlex.wordtree_as_string(candidate[1:-1])
                            self._parse_shell(command)

                            if word[0] in ("cmd_name", "cmd_word"):
                                if word in words:
                                    words.remove(word)

        usetoken = False
        for word in words:
            if word[0] in ("cmd_name", "cmd_word") or \
               (usetoken and word[0] == "TOKEN"):
                if "=" in word[1]:
                    # Looks like an assignment, check the following token instead
                    usetoken = True
                    continue

                cmd = word[1]
                if cmd.startswith("$"):
                    self.log.debug(self.unhandled_template % cmd)
                elif cmd == "eval":
                    command = " ".join(word for _, word in words[1:])
                    self._parse_shell(command)
                else:
                    self.allexecs.add(cmd)
                break