#!/usr/bin/env python3
#
# Copyright OpenEmbedded Contributors
#
# SPDX-License-Identifier: MIT
#

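# Example invocations (illustrative; the paths are hypothetical and the name
# assumes the script is saved as sstate-cache-management.py):
#
#   sstate-cache-management.py --cache-dir=/srv/sstate-cache --remove-duplicated -v -y
#   sstate-cache-management.py --cache-dir=/srv/sstate-cache --stamps-dir=build/tmp/stamps
#   sstate-cache-management.py --cache-dir=/srv/sstate-cache --remove-orphans
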
import argparse
import os
import re
import sys

from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import Optional

if sys.version_info < (3, 8, 0):
    raise RuntimeError("Sorry, python 3.8.0 or later is required for this script.")

SSTATE_PREFIX = "sstate:"
SSTATE_EXTENSION = ".tar.zst"
# SSTATE_EXTENSION = ".tgz"  # older caches used gzip-compressed archives
# Note: the original script also mentions ".siginfo.done" files; it is
# unclear whether those need handling here.
SSTATE_SUFFIXES = (
    SSTATE_EXTENSION,
    f"{SSTATE_EXTENSION}.siginfo",
    f"{SSTATE_EXTENSION}.done",
)

RE_SSTATE_PKGSPEC = re.compile(
    rf"""sstate:(?P<pn>[^:]*):
         (?P<package_target>[^:]*):
         (?P<pv>[^:]*):
         (?P<pr>[^:]*):
         (?P<sstate_pkgarch>[^:]*):
         (?P<sstate_version>[^_]*):
         (?P<bb_unihash>[^_]*)_
         (?P<bb_task>[^:]*)
         (?P<ext>({"|".join([re.escape(s) for s in SSTATE_SUFFIXES])}))$""",
    re.X,
)


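# A hypothetical cache filename the pattern above would match, and the fields
# it extracts (illustrative only, not a real cache entry):
#
#   sstate:zlib:core2-64-poky-linux:1.3.1:r0:core2-64:11:0123abcd_populate_sysroot.tar.zst
#
#   pn="zlib", package_target="core2-64-poky-linux", pv="1.3.1", pr="r0",
#   sstate_pkgarch="core2-64", sstate_version="11", bb_unihash="0123abcd",
#   bb_task="populate_sysroot", ext=".tar.zst"
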
# Really we'd like something like a Path subclass which implements a stat
# cache here; unfortunately there's no good way to do that transparently
# (yet); see:
#
# https://github.com/python/cpython/issues/70219
# https://discuss.python.org/t/make-pathlib-extensible/3428/77
@dataclass
class SstateEntry:
    """Class for keeping track of an entry in sstate-cache."""

    path: Path
    match: re.Match
    stat_result: Optional[os.stat_result] = None

    def __hash__(self):
        return self.path.__hash__()

    def __getattr__(self, name):
        return self.match.group(name)


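# Illustrative (hypothetical) usage: attribute lookups which aren't fields of
# the dataclass are delegated to the named groups of the regex match, e.g.
#
#   m = RE_SSTATE_PKGSPEC.match(path.parts[-1])
#   entry = SstateEntry(path, m)
#   entry.pn, entry.bb_task  # -> m.group("pn"), m.group("bb_task")
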
# this is what's in the original script; as far as I can tell, it's an
# implementation artefact which we don't need?
def find_archs():
    # all_archs
    builder_arch = os.uname().machine

    # FIXME
    layer_paths = [Path("../..")]

    tune_archs = set()
    re_tune = re.compile(r'AVAILTUNES .*=.*"(.*)"')
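    # Matches lines such as (illustrative): AVAILTUNES += "mips mips64"
    # and captures the space-separated tune list.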
    for path in layer_paths:
        for tunefile in [
            p for p in path.glob("meta*/conf/machine/include/**/*") if p.is_file()
        ]:
            with open(tunefile) as f:
                for line in f:
                    m = re_tune.match(line)
                    if m:
                        tune_archs.update(m.group(1).split())

    # all_machines
    machine_archs = set()
    for path in layer_paths:
        for machine_file in path.glob("meta*/conf/machine/*.conf"):
            machine_archs.add(machine_file.parts[-1][:-5])

    extra_archs = set()
    all_archs = (
        set(
            arch.replace("-", "_")
            for arch in machine_archs | tune_archs | set(["allarch", builder_arch])
        )
        | extra_archs
    )

    print(all_archs)


# again, not needed?
def find_tasks(paths):
    print(set([p.bb_task for p in paths]))


def collect_sstate_paths(args):
    def scandir(path, paths):
        # Assume everything is a directory; by not checking we avoid needing an
        # additional stat which is potentially a synchronous roundtrip over NFS
        try:
            for p in path.iterdir():
                filename = p.parts[-1]
                if filename.startswith(SSTATE_PREFIX):
                    if filename.endswith(SSTATE_SUFFIXES):
                        m = RE_SSTATE_PKGSPEC.match(filename)
                        assert m
                        paths.add(SstateEntry(p, m))
                    # ignore other things (includes things like lockfiles)
                else:
                    scandir(p, paths)

        except NotADirectoryError:
            pass

    paths = set()
    # TODO: parallelise scandir
    scandir(Path(args.cache_dir), paths)

    def path_stat(p):
        p.stat_result = p.path.lstat()

    if args.remove_duplicated:
        # This is probably slightly performance negative on a local filesystem
        # when we interact with the GIL; over NFS it's a massive win.
        with ThreadPoolExecutor(max_workers=args.jobs) as executor:
            executor.map(path_stat, paths)

    return paths


def remove_by_stamps(args, paths):
    all_sums = set()
    for stamps_dir in args.stamps_dir:
        stamps_path = Path(stamps_dir)
        assert stamps_path.is_dir()
        re_sigdata = re.compile(r"do_.*\.sigdata\.([^.]*)")
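        # Captures the hash from stamp names such as (illustrative):
        #   1.3.1-r0.do_compile.sigdata.0123abcd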
        all_sums |= set(
            [
                re_sigdata.search(x.parts[-1]).group(1)
                for x in stamps_path.glob("*/*/*.do_*.sigdata.*")
            ]
        )
        re_setscene = re.compile(r"do_.*_setscene\.([^.]*)")
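        # Captures the hash from setscene stamps such as (illustrative):
        #   1.3.1-r0.do_populate_sysroot_setscene.0123abcd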
        all_sums |= set(
            [
                re_setscene.search(x.parts[-1]).group(1)
                for x in stamps_path.glob("*/*/*.do_*_setscene.*")
            ]
        )
    return [p for p in paths if p.bb_unihash not in all_sums]


def remove_duplicated(args, paths):
    # Skip populate_lic as it produces duplicates in a normal build
    #
    # 9ae16469e707 sstate-cache-management: skip populate_lic archives when removing duplicates
    valid_paths = [p for p in paths if p.bb_task != "populate_lic"]

    keep = dict()
    remove = list()
    for p in valid_paths:
        sstate_sig = ":".join([p.pn, p.sstate_pkgarch, p.bb_task, p.ext])
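        # e.g. "zlib:core2-64:populate_sysroot:.tar.zst" (illustrative);
        # entries sharing this key are duplicates of one another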
        if sstate_sig not in keep:
            keep[sstate_sig] = p
        elif p.stat_result.st_mtime > keep[sstate_sig].stat_result.st_mtime:
            remove.append(keep[sstate_sig])
            keep[sstate_sig] = p
        else:
            remove.append(p)

    return remove


def remove_orphans(args, paths):
    remove = list()
    pathsigs = defaultdict(list)
    for p in paths:
        sstate_sig = ":".join([p.pn, p.sstate_pkgarch, p.bb_task])
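        # same key but without the extension, e.g. "zlib:core2-64:populate_sysroot"
        # (illustrative), so siginfo/done files group with their archive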
        pathsigs[sstate_sig].append(p)
    for k, v in pathsigs.items():
        if len([p for p in v if p.ext == SSTATE_EXTENSION]) == 0:
            remove.extend(v)
    return remove


def parse_arguments():
    parser = argparse.ArgumentParser(description="sstate cache management utility.")

    parser.add_argument(
        "--cache-dir",
        default=os.environ.get("SSTATE_CACHE_DIR"),
        help="""Specify the sstate cache directory; defaults to the
            SSTATE_CACHE_DIR environment variable if not specified.""",
    )

    # parser.add_argument(
    #     "--extra-archs",
    #     help="""Specify a list of architectures which should be tested; this
    #         list will be extended with the native arch, allarch and the
    #         empty arch. The script will not try to generate the list of
    #         available archs from AVAILTUNES in tune files.""",
    # )

    # parser.add_argument(
    #     "--extra-layer",
    #     help="""Specify the layer which will be used for searching the archs.
    #         By default the meta and meta-* layers in the top dir are
    #         searched; when specified, meta, meta-*, <layer1>, <layer2>,
    #         ...<layern> are searched. Use "," as the separator.
    #
    #         This has no effect with --stamps-dir or when --extra-archs is used.""",
    # )

    parser.add_argument(
        "-d",
        "--remove-duplicated",
        action="store_true",
        help="""Remove duplicated sstate cache files for a package, keeping
            only the newest one. Duplicated sstate cache files for a package
            must share the same arch; sstate cache files with different archs
            are not considered duplicates.

            Conflicts with --stamps-dir.""",
    )

    parser.add_argument(
        "--remove-orphans",
        action="store_true",
        help=f"""Remove orphan siginfo files from the sstate cache, i.e. those
            where there is no {SSTATE_EXTENSION} file but there are associated
            tracking files.""",
    )

    parser.add_argument(
        "--stamps-dir",
        action="append",
        help="""Specify the build directory's stamps directories; sstate
            cache files which ARE USED by these build directories will be
            KEPT, and other sstate cache files in cache-dir will be removed.
            Can be specified multiple times for several directories.

            Conflicts with --remove-duplicated.""",
    )

    parser.add_argument(
        "-j", "--jobs", default=8, type=int, help="Run JOBS jobs in parallel."
    )

    # parser.add_argument(
    #     "-L",
    #     "--follow-symlink",
    #     action="store_true",
    #     help="Remove both the symlink and the destination file, default: no.",
    # )

    parser.add_argument(
        "-y",
        "--yes",
        action="store_true",
        help="""Automatic yes to prompts; assume "yes" as answer to all prompts
            and run non-interactively.""",
    )

    parser.add_argument(
        "-v", "--verbose", action="store_true", help="Explain what is being done."
    )

    parser.add_argument(
        "-D",
        "--debug",
        action="count",
        default=0,
        help="Show debug info, repeat for more debug info.",
    )

    args = parser.parse_args()
    if args.cache_dir is None or (
        not args.remove_duplicated and not args.stamps_dir and not args.remove_orphans
    ):
        parser.print_usage()
        sys.exit(1)

    return args


def main():
    args = parse_arguments()

    paths = collect_sstate_paths(args)
    if args.remove_duplicated:
        remove = remove_duplicated(args, paths)
    elif args.stamps_dir:
        remove = remove_by_stamps(args, paths)
    else:
        remove = list()

    if args.remove_orphans:
        remove = set(remove) | set(remove_orphans(args, paths))

    if args.debug >= 1:
        print("\n".join([str(p.path) for p in remove]))
    print(f"{len(remove)} out of {len(paths)} files will be removed!")
    if not args.yes:
        print("Do you want to continue (y/n)?")
        confirm = input() in ("y", "Y")
    else:
        confirm = True
    if confirm:
        # TODO: parallelise remove
        for p in remove:
            p.path.unlink()


if __name__ == "__main__":
    main()