1#!/bin/sh
2#
3# Copyright 2015, Daniel Axtens, IBM Corporation
4#
5# This program is free software; you can redistribute it and/or modify
6# it under the terms of the GNU General Public License as published by
7#  the Free Software Foundation; version 2 of the License.
8#
9# This program is distributed in the hope that it will be useful,
10# but WITHOUT ANY WARRANTY; without even the implied warranty of
11# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12# GNU General Public License for more details.
13
14
# Locate the getscom and putscom helpers.
# Executables in the current directory win; otherwise both are looked up on
# $PATH. Both tools are needed later, so both are probed here rather than
# assuming that finding getscom implies putscom is present as well.
# On success GETSCOM and PUTSCOM hold the commands to run; otherwise a hint is
# printed and the script exits with status 1.
if [ -x ./getscom ] && [ -x ./putscom ]; then
	GETSCOM=./getscom
	PUTSCOM=./putscom
elif command -v getscom > /dev/null && command -v putscom > /dev/null; then
	# command -v is the POSIX way to resolve a command name
	GETSCOM=$(command -v getscom)
	PUTSCOM=$(command -v putscom)
else
	cat <<EOF
Can't find getscom/putscom in . or \$PATH.
See https://github.com/open-power/skiboot.
The tool is in external/xscom-utils
EOF
	exit 1
fi
30
# Each injection is expected to produce 8 HMI events.
# todo: deal with things being offline
expected_hmis=8

# COUNT_HMIS
# Print how many lines of the current dmesg output mention a harmless HMI.
COUNT_HMIS()
{
	dmesg |
		grep -c -e 'Harmless Hypervisor Maintenance interrupt'
}
37
# Massively expand the SMT snooze delay (allowing injection on all cores) for
# the duration of the script, and put it back however the script ends.

# RESTORE_SNOOZE_DELAY
# Set the snooze delay back to 100.
# NOTE(review): the value in effect before this script ran is not recorded;
# 100 is presumably the usual default - confirm.
RESTORE_SNOOZE_DELAY() {
	ppc64_cpu --smt-snooze-delay=100
}

# Install the handlers before changing the setting so that no early exit can
# leave the huge delay behind.
trap RESTORE_SNOOZE_DELAY 0
# Not every /bin/sh runs the EXIT trap when it is killed by a signal, so turn
# HUP, INT and TERM into a normal exit (status 128 + signal number); that in
# turn fires the EXIT trap above.
trap 'exit 129' 1
trap 'exit 130' 2
trap 'exit 143' 15

if ! ppc64_cpu --smt-snooze-delay=1000000000; then
	echo "Warning: failed to raise the SMT snooze delay; continuing anyway." >&2
fi
43
# FIR_IS_ZERO <text>
# Succeed when <text> is a non-empty string consisting only of '0' characters.
# The exact format getscom prints is not visible from this script, so any
# all-zero rendering ("0", "0000000000000000", ...) is accepted as zero; empty
# output (e.g. getscom failed) is treated as non-zero.
FIR_IS_ZERO() {
	case "$1" in
		'' | *[!0]*) return 1 ;;
	esac
	return 0
}

# WAIT_FOR_HMIS <old_count>
# Poll COUNT_HMIS until it reports at least <old_count> + expected_hmis
# events, sleeping 5 seconds between polls and giving up after 12 polls
# (about one minute). Progress is printed before each sleep.
# Returns 0 if the expected number of events was seen, non-zero otherwise.
# Uses plain (global) variables because POSIX sh has no 'local'.
WAIT_FOR_HMIS() {
	hmi_base=${1:-0}
	hmi_target=$((hmi_base + expected_hmis))
	hmi_polls=0
	new_hmis=$(COUNT_HMIS)
	new_hmis=${new_hmis:-0}
	while [ "$new_hmis" -lt "$hmi_target" ] && [ "$hmi_polls" -lt 12 ]; do
		echo "Seen $((new_hmis - hmi_base)) HMI(s) out of $expected_hmis expected, sleeping"
		sleep 5
		hmi_polls=$((hmi_polls + 1))
		new_hmis=$(COUNT_HMIS)
		new_hmis=${new_hmis:-0}
	done
	# Judge by the event count, not by the poll counter: the events may
	# arrive exactly on the final poll, which is still a success.
	[ "$new_hmis" -ge "$hmi_target" ]
}

# for each chip+core combination
# todo - less fragile parsing
# Each matched line looks like "OCC: Chip <hex> Core <hex digit>"; read splits
# it into words and only the chip and core fields are kept.
# The loop is the tail of a pipeline, so 'exit 1' below may only leave the
# subshell running the loop; the pipeline then ends with that status.
grep -E -o 'OCC: Chip [0-9a-f]+ Core [0-9a-f]' < /sys/firmware/opal/msglog |
while read -r _occ _chip_label chip _core_label core; do
	fir="0x1${core}013100"

	# verify that Core FIR is zero as expected
	if ! FIR_IS_ZERO "$("$GETSCOM" -c "0x${chip}" "$fir")"; then
		echo "FIR was not zero before injection for chip $chip, core $core. Aborting!"
		echo "Result of $GETSCOM -c 0x${chip} $fir:"
		# run it again so the raw result (or error) is shown to the user
		"$GETSCOM" -c "0x${chip}" "$fir"
		echo "If you get a -5 error, the core may be in idle state. Try stress-ng."
		echo "Otherwise, try $PUTSCOM -c 0x${chip} $fir 0"
		exit 1
	fi

	# keep track of the number of HMIs handled
	old_hmis=$(COUNT_HMIS)

	# do injection, adding a marker to dmesg for clarity
	echo "Injecting HMI on core $core, chip $chip" | tee /dev/kmsg
	# inject a RegFile recoverable error
	if ! "$PUTSCOM" -c "0x${chip}" "$fir" 2000000000000000 > /dev/null; then
		echo "Error injecting. Aborting!"
		exit 1
	fi

	# now we want to wait for all the HMIs to be processed
	# we expect one per thread on the core
	if ! WAIT_FOR_HMIS "$old_hmis"; then
		echo "Haven't seen expected $expected_hmis recoveries after 1 min. Aborting."
		exit 1
	fi
	echo "Processed $expected_hmis events; presumed success. Check dmesg."
	echo ""
done
90