# resulttool - regression analysis
#
# Copyright (c) 2019, Intel Corporation.
# Copyright (c) 2019, Linux Foundation
#
# SPDX-License-Identifier: GPL-2.0-only
#

import resulttool.resultutils as resultutils

from oeqa.utils.git import GitRepo
import oeqa.utils.gitarchive as gitarchive

METADATA_MATCH_TABLE = {
    "oeselftest": "OESELFTEST_METADATA"
}

OESELFTEST_METADATA_GUESS_TABLE = {
    "trigger-build-posttrigger": {
        "run_all_tests": False,
        "run_tests": ["buildoptions.SourceMirroring.test_yocto_source_mirror"],
        "skips": None,
        "machine": None,
        "select_tags": None,
        "exclude_tags": None
    },
    "reproducible": {
        "run_all_tests": False,
        "run_tests": ["reproducible"],
        "skips": None,
        "machine": None,
        "select_tags": None,
        "exclude_tags": None
    },
    "arch-qemu-quick": {
        "run_all_tests": True,
        "run_tests": None,
        "skips": None,
        "machine": None,
        "select_tags": ["machine"],
        "exclude_tags": None
    },
    "arch-qemu-full-x86-or-x86_64": {
        "run_all_tests": True,
        "run_tests": None,
        "skips": None,
        "machine": None,
        "select_tags": ["machine", "toolchain-system"],
        "exclude_tags": None
    },
    "arch-qemu-full-others": {
        "run_all_tests": True,
        "run_tests": None,
        "skips": None,
        "machine": None,
        "select_tags": ["machine", "toolchain-user"],
        "exclude_tags": None
    },
    "selftest": {
        "run_all_tests": True,
        "run_tests": None,
        "skips": ["distrodata.Distrodata.test_checkpkg", "buildoptions.SourceMirroring.test_yocto_source_mirror", "reproducible"],
        "machine": None,
        "select_tags": None,
        "exclude_tags": ["machine", "toolchain-system", "toolchain-user"]
    },
    "bringup": {
        "run_all_tests": True,
        "run_tests": None,
        "skips": ["distrodata.Distrodata.test_checkpkg", "buildoptions.SourceMirroring.test_yocto_source_mirror"],
        "machine": None,
        "select_tags": None,
        "exclude_tags": ["machine", "toolchain-system", "toolchain-user"]
    }
}

STATUS_STRINGS = {
    "None": "No matching test result"
}

REGRESSIONS_DISPLAY_LIMIT = 50

MISSING_TESTS_BANNER =   "-------------------------- Missing tests --------------------------"
ADDITIONAL_DATA_BANNER = "--------------------- Matches and improvements --------------------"

def test_has_at_least_one_matching_tag(test, tag_list):
    return "oetags" in test and any(oetag in tag_list for oetag in test["oetags"])

def all_tests_have_at_least_one_matching_tag(results, tag_list):
    return all(test_has_at_least_one_matching_tag(test_result, tag_list) or test_name.startswith("ptestresult") for (test_name, test_result) in results.items())

def any_test_have_any_matching_tag(results, tag_list):
    return any(test_has_at_least_one_matching_tag(test, tag_list) for test in results.values())
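
# Illustrative example (hypothetical test names): given
#   results = {"meta.SomeTest.test_a": {"status": "PASSED", "oetags": ["machine"]},
#              "ptestresult.glibc.tst-foo": {"status": "PASSED"}}
# all_tests_have_at_least_one_matching_tag(results, ["machine"]) is True because
# ptest results are exempt from the tag check, while
# any_test_have_any_matching_tag(results, ["toolchain-user"]) is False.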

def have_skipped_test(result, test_prefix):
    return all(result[test]['status'] == "SKIPPED" for test in result if test.startswith(test_prefix))

def have_all_tests_skipped(result, test_prefixes_list):
    return all(have_skipped_test(result, test_prefix) for test_prefix in test_prefixes_list)

def guess_oeselftest_metadata(results):
    """
    When an oeselftest result lacks OESELFTEST_METADATA, try to guess it from the results content.
    Check the results for specific traits (absence/presence of oetags, number and names of executed tests...)
    and, if they match one of the known autobuilder configurations, apply the guessed OESELFTEST_METADATA
    to allow proper test filtering.
    This guessing process is tightly coupled to config.json in the autobuilder. It should trigger less and less
    as new test results have OESELFTEST_METADATA properly appended at reporting time.
    """

    if len(results) == 1 and "buildoptions.SourceMirroring.test_yocto_source_mirror" in results:
        return OESELFTEST_METADATA_GUESS_TABLE['trigger-build-posttrigger']
    elif all(result.startswith("reproducible") for result in results):
        return OESELFTEST_METADATA_GUESS_TABLE['reproducible']
    elif all_tests_have_at_least_one_matching_tag(results, ["machine"]):
        return OESELFTEST_METADATA_GUESS_TABLE['arch-qemu-quick']
    elif all_tests_have_at_least_one_matching_tag(results, ["machine", "toolchain-system"]):
        return OESELFTEST_METADATA_GUESS_TABLE['arch-qemu-full-x86-or-x86_64']
    elif all_tests_have_at_least_one_matching_tag(results, ["machine", "toolchain-user"]):
        return OESELFTEST_METADATA_GUESS_TABLE['arch-qemu-full-others']
    elif not any_test_have_any_matching_tag(results, ["machine", "toolchain-user", "toolchain-system"]):
        if have_all_tests_skipped(results, ["distrodata.Distrodata.test_checkpkg", "buildoptions.SourceMirroring.test_yocto_source_mirror", "reproducible"]):
            return OESELFTEST_METADATA_GUESS_TABLE['selftest']
        elif have_all_tests_skipped(results, ["distrodata.Distrodata.test_checkpkg", "buildoptions.SourceMirroring.test_yocto_source_mirror"]):
            return OESELFTEST_METADATA_GUESS_TABLE['bringup']

    return None
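
# Illustrative example: a results dict whose only entry is
# "buildoptions.SourceMirroring.test_yocto_source_mirror" is assumed to come
# from the "trigger-build-posttrigger" autobuilder target defined above.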


def metadata_matches(base_configuration, target_configuration):
    """
    For the passed base and target, check the test type. If the test type has
    an entry in METADATA_MATCH_TABLE, compare the corresponding metadata
    between the two configurations. Return True if the metadata matches, or if
    the test type does not require a metadata comparison.
    """
    test_type = base_configuration.get('TEST_TYPE')
    if test_type not in METADATA_MATCH_TABLE:
        return True

    metadata_key = METADATA_MATCH_TABLE.get(test_type)
    if target_configuration.get(metadata_key) != base_configuration.get(metadata_key):
        return False

    return True
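
# For example, two "oeselftest" configurations only match if their
# OESELFTEST_METADATA entries are equal, while configurations whose TEST_TYPE
# is not listed in METADATA_MATCH_TABLE (e.g. "runtime") always match here.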


def machine_matches(base_configuration, target_configuration):
    return base_configuration.get('MACHINE') == target_configuration.get('MACHINE')


def can_be_compared(logger, base, target):
    """
    Some test runs are not relevant to compare, for example oeselftest runs
    executed with different test sets or parameters. Return True if the two
    runs can be compared.
    """
    ret = True
    base_configuration = base['configuration']
    target_configuration = target['configuration']

    # Older test results lack proper OESELFTEST_METADATA: if it is not present, try to guess it from the test results.
    if base_configuration.get('TEST_TYPE') == 'oeselftest' and 'OESELFTEST_METADATA' not in base_configuration:
        guess = guess_oeselftest_metadata(base['result'])
        if guess is None:
            logger.error(f"Unable to guess oeselftest metadata for {base_configuration['STARTTIME']}")
        else:
            logger.debug(f"Enriching {base_configuration['STARTTIME']} with {guess}")
            base_configuration['OESELFTEST_METADATA'] = guess
    if target_configuration.get('TEST_TYPE') == 'oeselftest' and 'OESELFTEST_METADATA' not in target_configuration:
        guess = guess_oeselftest_metadata(target['result'])
        if guess is None:
            logger.error(f"Unable to guess oeselftest metadata for {target_configuration['STARTTIME']}")
        else:
            logger.debug(f"Enriching {target_configuration['STARTTIME']} with {guess}")
            target_configuration['OESELFTEST_METADATA'] = guess

    # Test runs with LTP results in them should only be compared with other runs that also contain LTP tests
    if base_configuration.get('TEST_TYPE') == 'runtime' and any(result.startswith("ltpresult") for result in base['result']):
        ret = target_configuration.get('TEST_TYPE') == 'runtime' and any(result.startswith("ltpresult") for result in target['result'])

    return ret and metadata_matches(base_configuration, target_configuration) \
        and machine_matches(base_configuration, target_configuration)

def get_status_str(raw_status):
    raw_status_lower = raw_status.lower() if raw_status else "None"
    return STATUS_STRINGS.get(raw_status_lower, raw_status)
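
# e.g. get_status_str("PASSED") returns "PASSED" unchanged, while
# get_status_str(None) maps to "No matching test result" via STATUS_STRINGS.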

def get_additional_info_line(new_pass_count, new_tests):
    result = []
    if new_tests:
        result.append(f'+{new_tests} test(s) present')
    if new_pass_count:
        result.append(f'+{new_pass_count} test(s) now passing')

    if not result:
        return ""

    return '    -> ' + ', '.join(result) + '\n'
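
# e.g. get_additional_info_line(2, 3) returns
# "    -> +3 test(s) present, +2 test(s) now passing\n"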

def compare_result(logger, base_name, target_name, base_result, target_result, display_limit=None):
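    """
    Compare the 'result' sections of two test results and return a tuple
    (result, resultstring): 'result' maps each test whose status differs
    between base and target to its base/target statuses (falsy when no
    regression was found), while 'resultstring' is a human-readable report.
    """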
    base_result = base_result.get('result')
    target_result = target_result.get('result')
    result = {}
    regressions = {}
    resultstring = ""
    new_tests = 0
    new_pass_count = 0

    display_limit = int(display_limit) if display_limit else REGRESSIONS_DISPLAY_LIMIT

    if base_result and target_result:
        for k in base_result:
            if k in ['ptestresult.rawlogs', 'ptestresult.sections']:
                continue
            base_testcase = base_result[k]
            base_status = base_testcase.get('status')
            if base_status:
                target_testcase = target_result.get(k, {})
                target_status = target_testcase.get('status')
                if base_status != target_status:
                    result[k] = {'base': base_status, 'target': target_status}
            else:
                logger.error('Failed to retrieve base test case status: %s' % k)

        # Also count new tests that were not present in the base results: these
        # could be newly added tests, but they could also highlight test
        # renames or fixed faulty ptests
        for k in target_result:
            if k not in base_result:
                new_tests += 1
    if result:
        new_pass_count = sum(test['target'] is not None and test['target'].startswith("PASS") for test in result.values())
        # Print a regression report only if at least one test has a regression status (FAIL, SKIPPED, absent...)
        if new_pass_count < len(result):
            resultstring = "Regression:  %s\n             %s\n" % (base_name, target_name)
            for k in sorted(result):
                if not result[k]['target'] or not result[k]['target'].startswith("PASS"):
                    # Differentiate each ptest kind when listing regressions
                    key_parts = k.split('.')
                    key = '.'.join(key_parts[:2]) if k.startswith('ptest') else key_parts[0]
                    # Append new regression to corresponding test family
                    regressions[key] = regressions.setdefault(key, []) + ['        %s: %s -> %s\n' % (k, get_status_str(result[k]['base']), get_status_str(result[k]['target']))]
            resultstring += f"    Total: {sum(len(regressions[r]) for r in regressions)} new regression(s):\n"
            for k in regressions:
                resultstring += f"    {len(regressions[k])} regression(s) for {k}\n"
                count_to_print = min(display_limit, len(regressions[k])) if display_limit > 0 else len(regressions[k])
                resultstring += ''.join(regressions[k][:count_to_print])
                if count_to_print < len(regressions[k]):
                    resultstring += '        [...]\n'
            if new_pass_count > 0:
                resultstring += f'    Additionally, {new_pass_count} previously failing test(s) is/are now passing\n'
            if new_tests > 0:
                resultstring += f'    Additionally, {new_tests} new test(s) is/are present\n'
        else:
            resultstring = "%s\n%s\n" % (base_name, target_name)
            result = None
    else:
        resultstring = "%s\n%s\n" % (base_name, target_name)

    if not result:
        additional_info = get_additional_info_line(new_pass_count, new_tests)
        if additional_info:
            resultstring += additional_info

    return result, resultstring

def get_results(logger, source):
    return resultutils.load_resultsdata(source, configmap=resultutils.regression_map)

def regression(args, logger):
    base_results = get_results(logger, args.base_result)
    target_results = get_results(logger, args.target_result)

    regression_common(args, logger, base_results, target_results)

# Some test case naming is poor and contains random strings, particularly lttng/babeltrace.
# Truncating the test names works since they contain file and line number identifiers
# which allow us to match them without the random components.
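# For example, a name like "ptestresult.lttng-tools.foo_-_<random-suffix>"
# (illustrative) would be truncated to "ptestresult.lttng-tools.foo".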
def fixup_ptest_names(results, logger):
    for r in results:
        for i in results[r]:
            tests = list(results[r][i]['result'].keys())
            for test in tests:
                new = None
                if test.startswith(("ptestresult.lttng-tools.", "ptestresult.babeltrace.", "ptestresult.babeltrace2")) and "_-_" in test:
                    new = test.split("_-_")[0]
                elif test.startswith("ptestresult.curl.") and "__" in test:
                    new = test.split("__")[0]
                elif test.startswith("ptestresult.dbus.") and "__" in test:
                    new = test.split("__")[0]
                elif test.startswith("ptestresult.binutils") and "build-st-" in test:
                    new = test.split(" ")[0]
                elif test.startswith("ptestresult.gcc") and "/tmp/runtest." in test:
                    new = ".".join(test.split(".")[:2])
                if new:
                    results[r][i]['result'][new] = results[r][i]['result'][test]
                    del results[r][i]['result'][test]

def regression_common(args, logger, base_results, target_results):
    if args.base_result_id:
        base_results = resultutils.filter_resultsdata(base_results, args.base_result_id)
    if args.target_result_id:
        target_results = resultutils.filter_resultsdata(target_results, args.target_result_id)

    fixup_ptest_names(base_results, logger)
    fixup_ptest_names(target_results, logger)

    matches = []
    regressions = []
    notfound = []

    for a in base_results:
        if a in target_results:
            base = list(base_results[a].keys())
            target = list(target_results[a].keys())
            # We may have multiple base/targets which are for different configurations. Start by
            # removing any pairs which match
            for c in base.copy():
                for b in target.copy():
                    if not can_be_compared(logger, base_results[a][c], target_results[a][b]):
                        continue
                    res, resstr = compare_result(logger, c, b, base_results[a][c], target_results[a][b], args.limit)
                    if not res:
                        matches.append(resstr)
                        base.remove(c)
                        target.remove(b)
                        break
            # Only regressions should remain now; we may not be able to match multiple pairs directly
            for c in base:
                for b in target:
                    if not can_be_compared(logger, base_results[a][c], target_results[a][b]):
                        continue
                    res, resstr = compare_result(logger, c, b, base_results[a][c], target_results[a][b], args.limit)
                    if res:
                        regressions.append(resstr)
        else:
            notfound.append("%s not found in target" % a)
    print("\n".join(sorted(regressions)))
    print("\n" + MISSING_TESTS_BANNER + "\n")
    print("\n".join(sorted(notfound)))
    print("\n" + ADDITIONAL_DATA_BANNER + "\n")
    print("\n".join(sorted(matches)))
    return 0

def regression_git(args, logger):
    base_results = {}
    target_results = {}

    tag_name = "{branch}/{commit_number}-g{commit}/{tag_number}"
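    # e.g. "master/123-gdeadbeef/0" (illustrative branch/commit values)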
    repo = GitRepo(args.repo)

    revs = gitarchive.get_test_revs(logger, repo, tag_name, branch=args.branch)

    if args.branch2:
        revs2 = gitarchive.get_test_revs(logger, repo, tag_name, branch=args.branch2)
        if not len(revs2):
            logger.error("No revisions found to compare against")
            return 1
        if not len(revs):
            logger.error("No revision to report on found")
            return 1
    else:
        if len(revs) < 2:
            logger.error("Only %d tester revisions found, unable to generate report" % len(revs))
            return 1

    # Pick revisions
    if args.commit:
        if args.commit_number:
            logger.warning("Ignoring --commit-number as --commit was specified")
        index1 = gitarchive.rev_find(revs, 'commit', args.commit)
    elif args.commit_number:
        index1 = gitarchive.rev_find(revs, 'commit_number', args.commit_number)
    else:
        index1 = len(revs) - 1

    if args.branch2:
        revs2.append(revs[index1])
        index1 = len(revs2) - 1
        revs = revs2

    if args.commit2:
        if args.commit_number2:
            logger.warning("Ignoring --commit-number2 as --commit2 was specified")
        index2 = gitarchive.rev_find(revs, 'commit', args.commit2)
    elif args.commit_number2:
        index2 = gitarchive.rev_find(revs, 'commit_number', args.commit_number2)
    else:
        if index1 > 0:
            index2 = index1 - 1
            # Find the closest matching commit number for comparison.
            # In future we could check that the commit is a common ancestor and
            # continue back if not, but this is good enough for now
            while index2 > 0 and revs[index2].commit_number > revs[index1].commit_number:
                index2 = index2 - 1
        else:
            logger.error("Unable to determine the other commit, use "
                      "--commit2 or --commit-number2 to specify it")
            return 1

    logger.info("Comparing:\n%s\nto\n%s\n" % (revs[index1], revs[index2]))

    base_results = resultutils.git_get_result(repo, revs[index1][2])
    target_results = resultutils.git_get_result(repo, revs[index2][2])

    regression_common(args, logger, base_results, target_results)

    return 0

def register_commands(subparsers):
    """Register subcommands from this plugin"""

    parser_build = subparsers.add_parser('regression', help='regression file/directory analysis',
                                         description='regression analysis comparing the base set of results to the target results',
                                         group='analysis')
    parser_build.set_defaults(func=regression)
    parser_build.add_argument('base_result',
                              help='base result file/directory/URL for the comparison')
    parser_build.add_argument('target_result',
                              help='target result file/directory/URL to compare with')
    parser_build.add_argument('-b', '--base-result-id', default='',
                              help='(optional) filter the base results to this result ID')
    parser_build.add_argument('-t', '--target-result-id', default='',
                              help='(optional) filter the target results to this result ID')
    parser_build.add_argument('-l', '--limit', default=REGRESSIONS_DISPLAY_LIMIT, help="Maximum number of changes to display per test. Can be set to 0 to print all changes")

    parser_build = subparsers.add_parser('regression-git', help='regression git analysis',
                                         description='regression analysis comparing base result set to target '
                                                     'result set',
                                         group='analysis')
    parser_build.set_defaults(func=regression_git)
    parser_build.add_argument('repo',
                              help='the git repository containing the data')
    parser_build.add_argument('-b', '--base-result-id', default='',
                              help='(optional) filter the base results to this result ID; by default '
                                   'results are matched based on their configuration')
    parser_build.add_argument('-t', '--target-result-id', default='',
                              help='(optional) filter the target results to this result ID; by default '
                                   'results are matched based on their configuration')

    parser_build.add_argument('--branch', '-B', default='master', help="Branch to find commit in")
    parser_build.add_argument('--branch2', help="Branch to find comparison revisions in")
    parser_build.add_argument('--commit', help="Revision to search for")
    parser_build.add_argument('--commit-number', help="Revision number to search for, redundant if --commit is specified")
    parser_build.add_argument('--commit2', help="Revision to compare with")
    parser_build.add_argument('--commit-number2', help="Revision number to compare with, redundant if --commit2 is specified")
    parser_build.add_argument('-l', '--limit', default=REGRESSIONS_DISPLAY_LIMIT, help="Maximum number of changes to display per test. Can be set to 0 to print all changes")
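
# Example invocations (illustrative paths and values):
#   resulttool regression base-results/ target-results/ --limit 0
#   resulttool regression-git /path/to/testresults-repo --branch master --branch2 master-next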