#!/usr/bin/env python3
#
# Copyright OpenEmbedded Contributors
#
# SPDX-License-Identifier: MIT
#

import argparse
import os
import re
import sys

from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path

if sys.version_info < (3, 8, 0):
    raise RuntimeError("Sorry, python 3.8.0 or later is required for this script.")

SSTATE_PREFIX = "sstate:"
SSTATE_EXTENSION = ".tar.zst"
# SSTATE_EXTENSION = ".tgz"
# .siginfo.done files are mentioned in the original script?
SSTATE_SUFFIXES = (
    SSTATE_EXTENSION,
    f"{SSTATE_EXTENSION}.siginfo",
    f"{SSTATE_EXTENSION}.done",
)

RE_SSTATE_PKGSPEC = re.compile(
    rf"""sstate:(?P<pn>[^:]*):
         (?P<package_target>[^:]*):
         (?P<pv>[^:]*):
         (?P<pr>[^:]*):
         (?P<sstate_pkgarch>[^:]*):
         (?P<sstate_version>[^_]*):
         (?P<bb_unihash>[^_]*)_
         (?P<bb_task>[^:]*)
         (?P<ext>({"|".join([re.escape(s) for s in SSTATE_SUFFIXES])}))$""",
    re.X,
)


# Really we'd like something like a Path subclass which implements a stat
# cache here, unfortunately there's no good way to do that transparently
# (yet); see:
#
# https://github.com/python/cpython/issues/70219
# https://discuss.python.org/t/make-pathlib-extensible/3428/77
@dataclass
class SstateEntry:
    """Class for keeping track of an entry in sstate-cache."""

    path: Path
    match: re.Match
    stat_result: os.stat_result = None

    def __hash__(self):
        return self.path.__hash__()

    def __getattr__(self, name):
        # Expose the regex's named groups (pn, pv, bb_task, ext, ...) as
        # attributes.
        return self.match.group(name)


# this is what's in the original script; as far as I can tell, it's an
# implementation artefact which we don't need?
def find_archs():
    # all_archs
    builder_arch = os.uname().machine

    # FIXME
    layer_paths = [Path("../..")]

    tune_archs = set()
    re_tune = re.compile(r'AVAILTUNES .*=.*"(.*)"')
    for path in layer_paths:
        for tunefile in [
            p for p in path.glob("meta*/conf/machine/include/**/*") if p.is_file()
        ]:
            with open(tunefile) as f:
                for line in f:
                    m = re_tune.match(line)
                    if m:
                        tune_archs.update(m.group(1).split())

    # all_machines
    machine_archs = set()
    for path in layer_paths:
        for machine_file in path.glob("meta*/conf/machine/*.conf"):
            machine_archs.add(machine_file.parts[-1][:-5])

    extra_archs = set()
    all_archs = (
        set(
            arch.replace("-", "_")
            for arch in machine_archs | tune_archs | set(["allarch", builder_arch])
        )
        | extra_archs
    )

    print(all_archs)
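

# Not part of the original script: a minimal sanity check for
# RE_SSTATE_PKGSPEC. The filename below is an invented example following the
# sstate:<pn>:<package_target>:<pv>:<pr>:<sstate_pkgarch>:<sstate_version>:
# <bb_unihash>_<bb_task><ext> layout parsed above; the recipe, versions and
# hash are all fake.
def _selftest_pkgspec():
    example = (
        "sstate:zlib:core2-64-poky-linux:1.3:r0:core2-64:11:"
        "0123456789abcdef_populate_sysroot.tar.zst"
    )
    m = RE_SSTATE_PKGSPEC.match(example)
    assert m is not None
    assert m.group("pn") == "zlib"
    assert m.group("bb_task") == "populate_sysroot"
    assert m.group("ext") == SSTATE_EXTENSION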


# again, not needed?
def find_tasks(paths):
    print(set([p.bb_task for p in paths]))


def collect_sstate_paths(args):
    def scandir(path, paths):
        # Assume everything is a directory; by not checking we avoid needing an
        # additional stat which is potentially a synchronous roundtrip over NFS
        try:
            for p in path.iterdir():
                filename = p.parts[-1]
                if filename.startswith(SSTATE_PREFIX):
                    if filename.endswith(SSTATE_SUFFIXES):
                        m = RE_SSTATE_PKGSPEC.match(p.parts[-1])
                        assert m
                        paths.add(SstateEntry(p, m))
                    # ignore other things (includes things like lockfiles)
                else:
                    scandir(p, paths)

        except NotADirectoryError:
            pass

    paths = set()
    # TODO: parallelise scandir
    scandir(Path(args.cache_dir), paths)

    def path_stat(p):
        p.stat_result = p.path.lstat()

    if args.remove_duplicated:
        # This is probably slightly performance negative on a local filesystem
        # when we interact with the GIL; over NFS it's a massive win.
        with ThreadPoolExecutor(max_workers=args.jobs) as executor:
            executor.map(path_stat, paths)

    return paths


def remove_by_stamps(args, paths):
    all_sums = set()
    for stamps_dir in args.stamps_dir:
        stamps_path = Path(stamps_dir)
        assert stamps_path.is_dir()
        re_sigdata = re.compile(r"do_.*\.sigdata\.([^.]*)")
        all_sums |= set(
            [
                re_sigdata.search(x.parts[-1]).group(1)
                for x in stamps_path.glob("*/*/*.do_*.sigdata.*")
            ]
        )
        re_setscene = re.compile(r"do_.*_setscene\.([^.]*)")
        all_sums |= set(
            [
                re_setscene.search(x.parts[-1]).group(1)
                for x in stamps_path.glob("*/*/*.do_*_setscene.*")
            ]
        )
    return [p for p in paths if p.bb_unihash not in all_sums]


def remove_duplicated(args, paths):
    # Skip populate_lic as it produces duplicates in a normal build
    #
    # 9ae16469e707 sstate-cache-management: skip populate_lic archives when removing duplicates
    valid_paths = [p for p in paths if p.bb_task != "populate_lic"]

    keep = dict()
    remove = list()
    for p in valid_paths:
        sstate_sig = ":".join([p.pn, p.sstate_pkgarch, p.bb_task, p.ext])
        if sstate_sig not in keep:
            keep[sstate_sig] = p
        elif p.stat_result.st_mtime > keep[sstate_sig].stat_result.st_mtime:
            remove.append(keep[sstate_sig])
            keep[sstate_sig] = p
        else:
            remove.append(p)

    return remove


def remove_orphans(args, paths):
    remove = list()
    pathsigs = defaultdict(list)
    for p in paths:
        sstate_sig = ":".join([p.pn, p.sstate_pkgarch, p.bb_task])
        pathsigs[sstate_sig].append(p)
    for k, v in pathsigs.items():
        if len([p for p in v if p.ext == SSTATE_EXTENSION]) == 0:
            remove.extend(v)
    return remove
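

# Example invocations, matching the options defined below (the cache and
# stamps paths here are made up):
#
#   sstate-cache-management.py --cache-dir=/srv/sstate --remove-duplicated --yes
#   sstate-cache-management.py --cache-dir=/srv/sstate --stamps-dir=tmp/stamps
#   sstate-cache-management.py --cache-dir=/srv/sstate --remove-orphans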


def parse_arguments():
    parser = argparse.ArgumentParser(description="sstate cache management utility.")

    parser.add_argument(
        "--cache-dir",
        default=os.environ.get("SSTATE_CACHE_DIR"),
        help="""Specify the sstate cache directory; the environment variable
                SSTATE_CACHE_DIR is used if this is not specified.""",
    )

    # parser.add_argument(
    #     "--extra-archs",
    #     help="""Specify a list of architectures which should be tested; this
    #             list will be extended with the native arch, allarch and the
    #             empty arch. The script won't try to generate the list of
    #             available archs from AVAILTUNES in tune files.""",
    # )

    # parser.add_argument(
    #     "--extra-layer",
    #     help="""Specify the layer which will be used for searching the archs;
    #             it will search the meta and meta-* layers in the top dir by
    #             default, and will search meta, meta-*, <layer1>, <layer2>,
    #             ...<layern> when specified. Use "," as the separator.
    #
    #             This is useless for --stamps-dir or when --extra-archs is used.""",
    # )

    parser.add_argument(
        "-d",
        "--remove-duplicated",
        action="store_true",
        help="""Remove duplicated sstate cache files for a package; only the
                newest one will be kept. Duplicated sstate cache files of one
                package must have the same arch, which means sstate cache
                files with different archs are not considered duplicates.

                Conflicts with --stamps-dir.""",
    )

    parser.add_argument(
        "--remove-orphans",
        action="store_true",
        help=f"""Remove orphan siginfo files from the sstate cache, i.e. those
                 where there is no {SSTATE_EXTENSION} file but there are
                 associated tracking files.""",
    )

    parser.add_argument(
        "--stamps-dir",
        action="append",
        help="""Specify the build directory's stamps directories; the sstate
                cache files which ARE USED by these build directories will be
                KEPT, and all other sstate cache files in cache-dir will be
                removed. Can be specified multiple times for several
                directories.

                Conflicts with --remove-duplicated.""",
    )

    parser.add_argument(
        "-j", "--jobs", default=8, type=int, help="Run JOBS jobs in parallel."
    )

    # parser.add_argument(
    #     "-L",
    #     "--follow-symlink",
    #     action="store_true",
    #     help="Remove both the symbolic link and the destination file, default: no.",
    # )

    parser.add_argument(
        "-y",
        "--yes",
        action="store_true",
        help="""Automatic yes to prompts; assume "yes" as answer to all prompts
                and run non-interactively.""",
    )

    parser.add_argument(
        "-v", "--verbose", action="store_true", help="Explain what is being done."
    )

    parser.add_argument(
        "-D",
        "--debug",
        action="count",
        default=0,
        help="Show debug info, repeat for more debug info.",
    )

    args = parser.parse_args()
    if args.cache_dir is None or (
        not args.remove_duplicated and not args.stamps_dir and not args.remove_orphans
    ):
        parser.print_usage()
        sys.exit(1)

    return args


def main():
    args = parse_arguments()

    paths = collect_sstate_paths(args)
    if args.remove_duplicated:
        remove = remove_duplicated(args, paths)
    elif args.stamps_dir:
        remove = remove_by_stamps(args, paths)
    else:
        remove = list()

    if args.remove_orphans:
        remove = set(remove) | set(remove_orphans(args, paths))

    if args.debug >= 1:
        print("\n".join([str(p.path) for p in remove]))
    print(f"{len(remove)} out of {len(paths)} files will be removed!")
    if not args.yes:
        print("Do you want to continue (y/n)?")
        confirm = input() in ("y", "Y")
    else:
        confirm = True
    if confirm:
        # TODO: parallelise remove
        for p in remove:
            p.path.unlink()


if __name__ == "__main__":
    main()