"""
BitBake code parser

Parses actual code (i.e. python and shell) for functions and in-line
expressions. Used mainly to determine dependencies on other functions
and variables within the BitBake metadata. Also provides a cache for
this information in order to speed up processing.

(Not to be confused with the code that parses the metadata itself,
see lib/bb/parse/ for that).

NOTE: if you change how the parsers gather information you will almost
certainly need to increment CodeParserCache.CACHE_VERSION below so that
any existing codeparser cache gets invalidated. Additionally you'll need
to increment __cache_version__ in cache.py in order to ensure that old
recipe caches don't trigger "Taskhash mismatch" errors.

"""

import ast
import sys
import codegen
import logging
import pickle
import bb.pysh as pysh
import os.path
import bb.utils, bb.data
import hashlib
from itertools import chain
from bb.pysh import pyshyacc, pyshlex, sherrors
from bb.cache import MultiProcessCache

logger = logging.getLogger('BitBake.CodeParser')

def bbhash(s):
    """Return the hex digest used as a codeparser cache key for string s."""
    return hashlib.md5(s.encode("utf-8")).hexdigest()

def check_indent(codestr):
    """If the code is indented, add a top level piece of code to 'remove' the indentation"""

    i = 0
    # Bounds check so an empty or all-whitespace string doesn't raise IndexError
    while i < len(codestr) and codestr[i] in ["\n", "\t", " "]:
        i = i + 1

    if i == 0:
        return codestr

    if codestr[i-1] == "\t" or codestr[i-1] == " ":
        if codestr[0] == "\n":
            # Since we're adding a line, we need to remove one line of any empty padding
            # to ensure line numbers are correct
            codestr = codestr[1:]
        return "if 1:\n" + codestr

    return codestr


# Basically pickle, in python 2.7.3 at least, does badly with data duplication
# upon pickling and unpickling. Combine this with duplicate objects and things
# are a mess.
#
# When the sets are originally created, python calls intern() on the set keys
# which significantly improves memory usage.
# Sadly the pickle/unpickle process doesn't call intern() on the keys and
# results in the same strings being duplicated in memory. This also means
# pickle will save the same string multiple times in the cache file.
#
# By having shell and python cacheline objects with setstate/getstate, we force
# the object creation through our own routine where we can call intern (via internSet).
#
# We also use hashable frozensets and ensure we use references to these so that
# duplicates can be removed, both in memory and in the resulting pickled data.
#
# By playing these games, the size of the cache file shrinks dramatically
# meaning faster load times and the reloaded cache files also consume much less
# memory. Smaller cache files, faster load times and lower memory usage is good.
#
# A custom getstate/setstate using tuples is actually worth 15% cachesize by
# avoiding duplication of the attribute names!

class SetCache(object):
    """Canonicalising store for frozensets of interned strings.

    Ensures equal sets are represented by a single shared object so that
    both in-memory data and the pickled cache deduplicate well.
    """
    def __init__(self):
        self.setcache = {}

    def internSet(self, items):
        """Return the canonical frozenset of interned strings for items.

        NOTE: this used to key the lookup on hash(frozenset), which could
        silently return the wrong set if two different sets happened to
        hash equally. Keying on the frozenset itself lets the dict resolve
        collisions correctly while still deduplicating equal sets.
        """
        s = frozenset(sys.intern(i) for i in items)
        return self.setcache.setdefault(s, s)

codecache = SetCache()

class pythonCacheLine(object):
    """Cached result of parsing a python code fragment: the variable
    references, executed functions and 'contains' mappings gathered by
    PythonParser, stored as interned frozensets."""
    def __init__(self, refs, execs, contains):
        self.refs = codecache.internSet(refs)
        self.execs = codecache.internSet(execs)
        self.contains = {}
        for c in contains:
            self.contains[c] = codecache.internSet(contains[c])

    def __getstate__(self):
        # Tuple state (rather than the instance dict) keeps the attribute
        # names out of the pickle — see the comment block above.
        return (self.refs, self.execs, self.contains)

    def __setstate__(self, state):
        (refs, execs, contains) = state
        # Route through __init__ so the sets get re-interned on unpickle
        self.__init__(refs, execs, contains)

    def __hash__(self):
        l = (hash(self.refs), hash(self.execs))
        # Sort the keys so the hash doesn't depend on dict insertion order
        for c in sorted(self.contains.keys()):
            l = l + (c, hash(self.contains[c]))
        return hash(l)

    def __repr__(self):
        return " ".join([str(self.refs), str(self.execs), str(self.contains)])


class shellCacheLine(object):
    """Cached result of parsing a shell code fragment (executed commands)."""
    def __init__(self, execs):
        self.execs = codecache.internSet(execs)

    def __getstate__(self):
        return (self.execs)

    def __setstate__(self, state):
        (execs) = state
        self.__init__(execs)

    def __hash__(self):
        return hash(self.execs)

    def __repr__(self):
        return str(self.execs)

class CodeParserCache(MultiProcessCache):
    """Multi-process persistent cache of python/shell codeparser results."""
    cache_file_name = "bb_codeparser.dat"
    # NOTE: you must increment this if you change how the parsers gather information,
    # so that an existing cache gets invalidated. Additionally you'll need
    # to increment __cache_version__ in cache.py in order to ensure that old
    # recipe caches don't trigger "Taskhash mismatch" errors.
    CACHE_VERSION = 9

    def __init__(self):
        MultiProcessCache.__init__(self)
        self.pythoncache = self.cachedata[0]
        self.shellcache = self.cachedata[1]
        self.pythoncacheextras = self.cachedata_extras[0]
        self.shellcacheextras = self.cachedata_extras[1]

        # To avoid duplication in the codeparser cache, keep
        # a lookup of hashes of objects we already have
        self.pythoncachelines = {}
        self.shellcachelines = {}

    def newPythonCacheLine(self, refs, execs, contains):
        """Create a pythonCacheLine, reusing an equivalent existing one."""
        cacheline = pythonCacheLine(refs, execs, contains)
        h = hash(cacheline)
        if h in self.pythoncachelines:
            return self.pythoncachelines[h]
        self.pythoncachelines[h] = cacheline
        return cacheline

    def newShellCacheLine(self, execs):
        """Create a shellCacheLine, reusing an equivalent existing one."""
        cacheline = shellCacheLine(execs)
        h = hash(cacheline)
        if h in self.shellcachelines:
            return self.shellcachelines[h]
        self.shellcachelines[h] = cacheline
        return cacheline

    def init_cache(self, d):
        # Check if we already have the caches
        if self.pythoncache:
            return

        MultiProcessCache.init_cache(self, d)

        # cachedata gets re-assigned in the parent
        self.pythoncache = self.cachedata[0]
        self.shellcache = self.cachedata[1]

    def create_cachedata(self):
        # One dict for python results, one for shell results
        data = [{}, {}]
        return data

codeparsercache = CodeParserCache()

def parser_cache_init(d):
    """Initialise the shared codeparser cache from the datastore d."""
    codeparsercache.init_cache(d)

def parser_cache_save():
    """Save this process's extra cache entries."""
    codeparsercache.save_extras()

def parser_cache_savemerge():
    """Merge per-process extras into the main cache file."""
    codeparsercache.save_merge()

Logger = logging.getLoggerClass()
class BufferedLogger(Logger):
    """Logger that buffers records and replays them to a target on flush()."""
    def __init__(self, name, level=0, target=None):
        Logger.__init__(self, name)
        self.setLevel(level)
        self.buffer = []
        self.target = target

    def handle(self, record):
        # Buffer instead of emitting; records are delivered by flush()
        self.buffer.append(record)

    def flush(self):
        for record in self.buffer:
            if self.target.isEnabledFor(record.levelno):
                self.target.handle(record)
        self.buffer = []

class PythonParser():
    """Extract variable references and executed functions from python code."""
    getvars = (".getVar", ".appendVar", ".prependVar")
    getvarflags = (".getVarFlag", ".appendVarFlag", ".prependVarFlag")
    containsfuncs = ("bb.utils.contains", "base_contains")
    containsanyfuncs = ("bb.utils.contains_any", "bb.utils.filter")
    execfuncs = ("bb.build.exec_func", "bb.build.exec_task")

    def warn(self, func, arg):
        """Warn about calls of bitbake APIs which pass a non-literal
        argument for the variable name, as we're not able to track such
        a reference.
        """

        try:
            funcstr = codegen.to_source(func)
            argstr = codegen.to_source(arg)
        except TypeError:
            # NOTE(review): the integer first argument is a bitbake debug
            # level (custom logger class), not the stdlib Logger.debug msg
            self.log.debug(2, 'Failed to convert function and argument to source form')
        else:
            self.log.debug(1, self.unhandled_message % (funcstr, argstr))

    def visit_Call(self, node):
        """Inspect a Call node for getVar/contains/exec-style API usage."""
        name = self.called_node_name(node.func)
        if name and (name.endswith(self.getvars) or name.endswith(self.getvarflags) or name in self.containsfuncs or name in self.containsanyfuncs):
            if isinstance(node.args[0], ast.Str):
                varname = node.args[0].s
                if name in self.containsfuncs and isinstance(node.args[1], ast.Str):
                    if varname not in self.contains:
                        self.contains[varname] = set()
                    self.contains[varname].add(node.args[1].s)
                elif name in self.containsanyfuncs and isinstance(node.args[1], ast.Str):
                    if varname not in self.contains:
                        self.contains[varname] = set()
                    self.contains[varname].update(node.args[1].s.split())
                elif name.endswith(self.getvarflags):
                    if isinstance(node.args[1], ast.Str):
                        # Flag accesses are tracked as VAR[flag]
                        self.references.add('%s[%s]' % (varname, node.args[1].s))
                    else:
                        self.warn(node.func, node.args[1])
                else:
                    self.references.add(varname)
            else:
                self.warn(node.func, node.args[0])
        elif name and name.endswith(".expand"):
            if isinstance(node.args[0], ast.Str):
                # Expand the literal argument and absorb its dependencies
                value = node.args[0].s
                d = bb.data.init()
                parser = d.expandWithRefs(value, self.name)
                self.references |= parser.references
                self.execs |= parser.execs
                for varname in parser.contains:
                    if varname not in self.contains:
                        self.contains[varname] = set()
                    self.contains[varname] |= parser.contains[varname]
        elif name in self.execfuncs:
            if isinstance(node.args[0], ast.Str):
                self.var_execs.add(node.args[0].s)
            else:
                self.warn(node.func, node.args[0])
        elif name and isinstance(node.func, (ast.Name, ast.Attribute)):
            self.execs.add(name)

    def called_node_name(self, node):
        """Given a called node, return its original string form"""
        components = []
        while node:
            if isinstance(node, ast.Attribute):
                components.append(node.attr)
                node = node.value
            elif isinstance(node, ast.Name):
                components.append(node.id)
                return '.'.join(reversed(components))
            else:
                break

    def __init__(self, name, log):
        self.name = name
        self.var_execs = set()
        self.contains = {}
        self.execs = set()
        self.references = set()
        self.log = BufferedLogger('BitBake.Data.PythonParser', logging.DEBUG, log)

        self.unhandled_message = "in call of %s, argument '%s' is not a string literal"
        self.unhandled_message = "while parsing %s, %s" % (name, self.unhandled_message)

    def parse_python(self, node, lineno=0, filename="<string>"):
        """Parse the python code in node, populating references, execs and
        contains. Results are memoised in the codeparser cache, keyed on a
        hash of the code text.
        """
        if not node or not node.strip():
            return

        h = bbhash(str(node))

        if h in codeparsercache.pythoncache:
            self.references = set(codeparsercache.pythoncache[h].refs)
            self.execs = set(codeparsercache.pythoncache[h].execs)
            self.contains = {}
            for i in codeparsercache.pythoncache[h].contains:
                self.contains[i] = set(codeparsercache.pythoncache[h].contains[i])
            return

        if h in codeparsercache.pythoncacheextras:
            self.references = set(codeparsercache.pythoncacheextras[h].refs)
            self.execs = set(codeparsercache.pythoncacheextras[h].execs)
            self.contains = {}
            for i in codeparsercache.pythoncacheextras[h].contains:
                self.contains[i] = set(codeparsercache.pythoncacheextras[h].contains[i])
            return

        # We can't add to the linenumbers for compile, we can pad to the correct number of blank lines though
        node = "\n" * int(lineno) + node
        code = compile(check_indent(str(node)), filename, "exec",
                       ast.PyCF_ONLY_AST)

        for n in ast.walk(code):
            if n.__class__.__name__ == "Call":
                self.visit_Call(n)

        self.execs.update(self.var_execs)

        codeparsercache.pythoncacheextras[h] = codeparsercache.newPythonCacheLine(self.references, self.execs, self.contains)

class ShellParser():
    """Extract the external commands executed by a piece of shell code."""
    def __init__(self, name, log):
        self.funcdefs = set()
        self.allexecs = set()
        self.execs = set()
        self.log = BufferedLogger('BitBake.Data.%s' % name, logging.DEBUG, log)
        self.unhandled_template = "unable to handle non-literal command '%s'"
        self.unhandled_template = "while parsing %s, %s" % (name, self.unhandled_template)

    def parse_shell(self, value):
        """Parse the supplied shell code in a string, returning the external
        commands it executes.
        """

        h = bbhash(str(value))

        if h in codeparsercache.shellcache:
            self.execs = set(codeparsercache.shellcache[h].execs)
            return self.execs

        if h in codeparsercache.shellcacheextras:
            self.execs = set(codeparsercache.shellcacheextras[h].execs)
            return self.execs

        self._parse_shell(value)
        # Functions defined within the code itself aren't external commands
        self.execs = set(cmd for cmd in self.allexecs if cmd not in self.funcdefs)

        codeparsercache.shellcacheextras[h] = codeparsercache.newShellCacheLine(self.execs)

        return self.execs

    def _parse_shell(self, value):
        """Run the pysh parser over value and process the resulting tokens."""
        try:
            tokens, _ = pyshyacc.parse(value, eof=True, debug=False)
        except pyshlex.NeedMore:
            raise sherrors.ShellSyntaxError("Unexpected EOF")

        self.process_tokens(tokens)

    def process_tokens(self, tokens):
        """Process a supplied portion of the syntax tree as returned by
        pyshyacc.parse.
        """

        def function_definition(value):
            self.funcdefs.add(value.name)
            return [value.body], None

        def case_clause(value):
            # Element 0 of each item in the case is the list of patterns, and
            # Element 1 of each item in the case is the list of commands to be
            # executed when that pattern matches.
            words = chain(*[item[0] for item in value.items])
            cmds = chain(*[item[1] for item in value.items])
            return cmds, words

        def if_clause(value):
            main = chain(value.cond, value.if_cmds)
            rest = value.else_cmds
            # An elif is represented as a nested ("elif", if_clause) tuple
            if isinstance(rest, tuple) and rest[0] == "elif":
                return chain(main, if_clause(rest[1]))
            else:
                return chain(main, rest)

        def simple_command(value):
            return None, chain(value.words, (assign[1] for assign in value.assigns))

        # Each handler maps a token to (more tokens to recurse into, words to scan)
        token_handlers = {
            "and_or": lambda x: ((x.left, x.right), None),
            "async": lambda x: ([x], None),
            "brace_group": lambda x: (x.cmds, None),
            "for_clause": lambda x: (x.cmds, x.items),
            "function_definition": function_definition,
            "if_clause": lambda x: (if_clause(x), None),
            "pipeline": lambda x: (x.commands, None),
            "redirect_list": lambda x: ([x.cmd], None),
            "subshell": lambda x: (x.cmds, None),
            "while_clause": lambda x: (chain(x.condition, x.cmds), None),
            "until_clause": lambda x: (chain(x.condition, x.cmds), None),
            "simple_command": simple_command,
            "case_clause": case_clause,
        }

        def process_token_list(tokens):
            for token in tokens:
                if isinstance(token, list):
                    process_token_list(token)
                    continue
                name, value = token
                try:
                    more_tokens, words = token_handlers[name](value)
                except KeyError:
                    raise NotImplementedError("Unsupported token type " + name)

                if more_tokens:
                    self.process_tokens(more_tokens)

                if words:
                    self.process_words(words)

        process_token_list(tokens)

    def process_words(self, words):
        """Process a set of 'words' in pyshyacc parlance, which includes
        extraction of executed commands from $() blocks, as well as grabbing
        the command name argument.
        """

        words = list(words)
        for word in list(words):
            wtree = pyshlex.make_wordtree(word[1])
            for part in wtree:
                if not isinstance(part, list):
                    continue

                if part[0] in ('`', '$('):
                    # Recursively parse command substitutions
                    command = pyshlex.wordtree_as_string(part[1:-1])
                    self._parse_shell(command)

                    if word[0] in ("cmd_name", "cmd_word"):
                        if word in words:
                            words.remove(word)

        usetoken = False
        for word in words:
            if word[0] in ("cmd_name", "cmd_word") or \
               (usetoken and word[0] == "TOKEN"):
                if "=" in word[1]:
                    # Leading VAR=value assignment; the command name follows
                    usetoken = True
                    continue

                cmd = word[1]
                if cmd.startswith("$"):
                    self.log.debug(1, self.unhandled_template % cmd)
                elif cmd == "eval":
                    command = " ".join(word for _, word in words[1:])
                    self._parse_shell(command)
                else:
                    self.allexecs.add(cmd)
                break