#!/usr/bin/env python3
#
# Copyright OpenEmbedded Contributors
#
# SPDX-License-Identifier: MIT
#

import argparse
import os
import re
import sys

from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path

if sys.version_info < (3, 8, 0):
    raise RuntimeError("Sorry, python 3.8.0 or later is required for this script.")

SSTATE_PREFIX = "sstate:"
SSTATE_EXTENSION = ".tar.zst"
# SSTATE_EXTENSION = ".tgz"
# .siginfo.done files are mentioned in the original script?
SSTATE_SUFFIXES = (
    SSTATE_EXTENSION,
    f"{SSTATE_EXTENSION}.siginfo",
    f"{SSTATE_EXTENSION}.done",
)

RE_SSTATE_PKGSPEC = re.compile(
    rf"""sstate:(?P<pn>[^:]*):
         (?P<package_target>[^:]*):
         (?P<pv>[^:]*):
         (?P<pr>[^:]*):
         (?P<sstate_pkgarch>[^:]*):
         (?P<sstate_version>[^_]*):
         (?P<bb_unihash>[^_]*)_
         (?P<bb_task>[^:]*)
         (?P<ext>({"|".join([re.escape(s) for s in SSTATE_SUFFIXES])}))$""",
    re.X,
)


# Really we'd like something like a Path subclass which implements a stat
# cache here; unfortunately there's no good way to do that transparently
# (yet); see:
#
# https://github.com/python/cpython/issues/70219
# https://discuss.python.org/t/make-pathlib-extensible/3428/77
@dataclass
class SstateEntry:
    """Class for keeping track of an entry in sstate-cache."""

    path: Path
    match: re.Match
    stat_result: os.stat_result = None

    def __hash__(self):
        return self.path.__hash__()

    def __getattr__(self, name):
        return self.match.group(name)
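
# A minimal sketch of how RE_SSTATE_PKGSPEC and SstateEntry fit together; the
# filename below is made up for illustration (not taken from a real cache),
# and _pkgspec_example is a hypothetical helper which nothing in this script
# calls. The regex's named groups become attributes via __getattr__:
def _pkgspec_example():
    name = (
        "sstate:zlib:core2-64-poky-linux:1.3.1:r0:core2-64:10:"
        "0123456789abcdef_populate_sysroot" + SSTATE_EXTENSION
    )
    m = RE_SSTATE_PKGSPEC.match(name)
    entry = SstateEntry(Path(name), m)
    assert entry.pn == "zlib"
    assert entry.sstate_pkgarch == "core2-64"
    assert entry.bb_task == "populate_sysroot"
    assert entry.ext == SSTATE_EXTENSION
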
# this is what's in the original script; as far as I can tell, it's an
# implementation artefact which we don't need?
def find_archs():
    # all_archs
    builder_arch = os.uname().machine

    # FIXME
    layer_paths = [Path("../..")]

    tune_archs = set()
    re_tune = re.compile(r'AVAILTUNES .*=.*"(.*)"')
    for path in layer_paths:
        for tunefile in [
            p for p in path.glob("meta*/conf/machine/include/**/*") if p.is_file()
        ]:
            with open(tunefile) as f:
                for line in f:
                    m = re_tune.match(line)
                    if m:
                        tune_archs.update(m.group(1).split())

    # all_machines
    machine_archs = set()
    for path in layer_paths:
        for machine_file in path.glob("meta*/conf/machine/*.conf"):
            machine_archs.add(machine_file.parts[-1][:-5])

    extra_archs = set()
    all_archs = (
        set(
            arch.replace("-", "_")
            for arch in machine_archs | tune_archs | set(["allarch", builder_arch])
        )
        | extra_archs
    )

    print(all_archs)


# again, not needed? (now takes the set of SstateEntry objects explicitly
# rather than relying on an undefined global)
def find_tasks(paths):
    print(set([p.bb_task for p in paths]))


def collect_sstate_paths(args):
    def scandir(path, paths):
        # Assume everything is a directory; by not checking we avoid needing
        # an additional stat, which is potentially a synchronous roundtrip
        # over NFS.
        try:
            for p in path.iterdir():
                filename = p.parts[-1]
                if filename.startswith(SSTATE_PREFIX):
                    if filename.endswith(SSTATE_SUFFIXES):
                        m = RE_SSTATE_PKGSPEC.match(filename)
                        assert m
                        paths.add(SstateEntry(p, m))
                    # ignore other things (includes things like lockfiles)
                else:
                    scandir(p, paths)

        except NotADirectoryError:
            pass

    paths = set()
    # TODO: parallelise scandir
    scandir(Path(args.cache_dir), paths)

    def path_stat(p):
        p.stat_result = p.path.lstat()

    if args.remove_duplicated:
        # This is probably slightly performance negative on a local filesystem
        # when we interact with the GIL; over NFS it's a massive win.
        with ThreadPoolExecutor(max_workers=args.jobs) as executor:
            executor.map(path_stat, paths)

    return paths


def remove_by_stamps(args, paths):
    all_sums = set()
    re_sigdata = re.compile(r"do_.*\.sigdata\.([^.]*)")
    re_setscene = re.compile(r"do_.*_setscene\.([^.]*)")
    for stamps_dir in args.stamps_dir:
        stamps_path = Path(stamps_dir)
        assert stamps_path.is_dir()
        all_sums |= set(
            [
                re_sigdata.search(x.parts[-1]).group(1)
                for x in stamps_path.glob("*/*/*.do_*.sigdata.*")
            ]
        )
        all_sums |= set(
            [
                re_setscene.search(x.parts[-1]).group(1)
                for x in stamps_path.glob("*/*/*.do_*_setscene.*")
            ]
        )
    return [p for p in paths if p.bb_unihash not in all_sums]
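
# A hedged sketch of the extraction remove_by_stamps() performs; the stamp
# names below are made up but follow the shape the globs above look for, and
# _stamps_regex_example is a hypothetical helper which nothing in this script
# calls:
def _stamps_regex_example():
    re_sigdata = re.compile(r"do_.*\.sigdata\.([^.]*)")
    re_setscene = re.compile(r"do_.*_setscene\.([^.]*)")
    sigdata_stamp = "1.3.1-r0.do_compile.sigdata.0123456789abcdef"
    setscene_stamp = "1.3.1-r0.do_populate_sysroot_setscene.fedcba9876543210"
    # Both regexes pull out the trailing hash; cache entries whose bb_unihash
    # never appears in the collected set are the ones returned for removal.
    assert re_sigdata.search(sigdata_stamp).group(1) == "0123456789abcdef"
    assert re_setscene.search(setscene_stamp).group(1) == "fedcba9876543210"
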
def remove_duplicated(args, paths):
    # Skip populate_lic as it produces duplicates in a normal build
    #
    # 9ae16469e707 sstate-cache-management: skip populate_lic archives when removing duplicates
    valid_paths = [p for p in paths if p.bb_task != "populate_lic"]

    keep = dict()
    remove = list()
    for p in valid_paths:
        sstate_sig = ":".join([p.pn, p.sstate_pkgarch, p.bb_task, p.ext])
        if sstate_sig not in keep:
            keep[sstate_sig] = p
        elif p.stat_result.st_mtime > keep[sstate_sig].stat_result.st_mtime:
            remove.append(keep[sstate_sig])
            keep[sstate_sig] = p
        else:
            remove.append(p)

    return remove


def remove_orphans(args, paths):
    remove = list()
    pathsigs = defaultdict(list)
    for p in paths:
        sstate_sig = ":".join([p.pn, p.sstate_pkgarch, p.bb_task])
        pathsigs[sstate_sig].append(p)
    for k, v in pathsigs.items():
        if len([p for p in v if p.ext == SSTATE_EXTENSION]) == 0:
            remove.extend(v)
    return remove


def parse_arguments():
    parser = argparse.ArgumentParser(description="sstate cache management utility.")

    parser.add_argument(
        "--cache-dir",
        default=os.environ.get("SSTATE_CACHE_DIR"),
        help="""Specify the sstate cache directory; the environment variable
                SSTATE_CACHE_DIR is used if it is not specified.""",
    )

    # parser.add_argument(
    #     "--extra-archs",
    #     help="""Specify the list of architectures which should be tested;
    #             this list will be extended with the native arch, allarch and
    #             the empty arch. The script won't try to generate the list of
    #             available archs from AVAILTUNES in tune files.""",
    # )

    # parser.add_argument(
    #     "--extra-layer",
    #     help="""Specify the layer which will be used for searching the archs;
    #             it will search the meta and meta-* layers in the top dir by
    #             default, and will search meta, meta-*, <layer1>, <layer2>,
    #             ...<layern> when specified. Use "," as the separator.
    #
    #             This is useless for --stamps-dir or when --extra-archs is used.""",
    # )

    parser.add_argument(
        "-d",
        "--remove-duplicated",
        action="store_true",
        help="""Remove duplicated sstate cache files of one package; only the
                newest one will be kept. The duplicated sstate cache files of
                one package must have the same arch, which means sstate cache
                files with different archs are not considered duplicates.

                Conflicts with --stamps-dir.""",
    )

    parser.add_argument(
        "--remove-orphans",
        action="store_true",
        help=f"""Remove orphan siginfo files from the sstate cache, i.e. those
                 where there is no {SSTATE_EXTENSION} file but there are
                 associated tracking files.""",
    )

    parser.add_argument(
        "--stamps-dir",
        action="append",
        help="""Specify the build directory's stamps directories; the sstate
                cache files which ARE USED by these build directories will be
                KEPT, and other sstate cache files in cache-dir will be
                removed. Can be specified multiple times for several
                directories.

                Conflicts with --remove-duplicated.""",
    )

    parser.add_argument(
        "-j", "--jobs", default=8, type=int, help="Run JOBS jobs in parallel."
    )

    # parser.add_argument(
    #     "-L",
    #     "--follow-symlink",
    #     action="store_true",
    #     help="Remove both the symbolic link and the destination file, default: no.",
    # )

    parser.add_argument(
        "-n",
        "--dry-run",
        action="store_true",
        help="Don't execute, just go through the motions.",
    )

    parser.add_argument(
        "-y",
        "--yes",
        action="store_true",
        help="""Automatic yes to prompts; assume "yes" as the answer to all
                prompts and run non-interactively.""",
    )

    parser.add_argument(
        "-v", "--verbose", action="store_true", help="Explain what is being done."
    )

    parser.add_argument(
        "-D",
        "--debug",
        action="count",
        default=0,
        help="Show debug info, repeat for more debug info.",
    )

    args = parser.parse_args()
    if args.cache_dir is None or (
        not args.remove_duplicated and not args.stamps_dir and not args.remove_orphans
    ):
        parser.print_usage()
        sys.exit(1)

    return args


def main():
    args = parse_arguments()

    paths = collect_sstate_paths(args)
    if args.remove_duplicated:
        remove = remove_duplicated(args, paths)
    elif args.stamps_dir:
        remove = remove_by_stamps(args, paths)
    else:
        remove = list()

    if args.remove_orphans:
        remove = set(remove) | set(remove_orphans(args, paths))

    if args.debug >= 1:
        print("\n".join([str(p.path) for p in remove]))
    print(f"{len(remove)} out of {len(paths)} files will be removed!")
    if args.dry_run:
        return 0

    if not args.yes:
        print("Do you want to continue (y/n)?")
        confirm = input() in ("y", "Y")
    else:
        confirm = True
    if confirm:
        # TODO: parallelise remove
        for p in remove:
            p.path.unlink()


if __name__ == "__main__":
    main()
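
# Example invocations (illustrative; assumes this file is saved as
# sstate-cache-management.py and that the paths shown exist):
#
#   Preview which duplicated archives would be removed, without deleting:
#     ./sstate-cache-management.py --cache-dir=sstate-cache --remove-duplicated -n -D
#
#   Keep only the cache entries referenced by a build's stamps, no prompt:
#     ./sstate-cache-management.py --cache-dir=sstate-cache \
#         --stamps-dir=tmp/stamps --yes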