#!/bin/sh
# SPDX-License-Identifier: GPL-2.0-only
#
# Basic EEH recovery test.
#
# Collects every function-0 PCI device that is neither a bridge nor a VF
# and whose PE is currently healthy (per pe_ok), then hands each one to
# eeh_one_dev.  Both helpers, along with eeh_supported, come from
# eeh-functions.sh, which must be in the current directory.
#
# Exit status:
#   0  EEH is not supported (test skipped), or no device failed
#   1  debugfs test files are missing, the temp file could not be
#      created, or at least one device failed to recover

. ./eeh-functions.sh

if ! eeh_supported ; then
	echo "EEH not supported on this system, skipping"
	exit 0
fi

# Bail out unless both debugfs test files are present; a single missing
# file is enough to stop here.
if [ ! -e "/sys/kernel/debug/powerpc/eeh_dev_check" ] || \
   [ ! -e "/sys/kernel/debug/powerpc/eeh_dev_break" ] ; then
	echo "debugfs EEH testing files are missing. Is debugfs mounted?"
	exit 1
fi

# Snapshot the PCI device list so it can be diffed against the list
# after the test.  The trap removes the snapshot on every exit path.
pre_lspci=$(mktemp) || {
	echo "Failed to create a temporary file" >&2
	exit 1
}
trap 'rm -f "$pre_lspci"' EXIT
trap 'exit 1' HUP INT TERM
lspci > "$pre_lspci"

# Bump the max freeze count to something absurd so we don't
# trip over it while breaking things.
echo 5000 > /sys/kernel/debug/powerpc/eeh_max_freezes

# Record the devices that we break in here. Assuming everything
# goes to plan we should get them back once the recovery process
# is finished.
devices=""
dev_count=0

# Build up a list of candidate devices (function 0 of each device only).
for dev_path in /sys/bus/pci/devices/*.0 ; do
	# An unmatched glob is left as the literal pattern; skip it.
	[ -e "$dev_path" ] || continue
	dev=${dev_path##*/}

	# Skip bridges since we can't recover them (yet...)
	if [ -e "$dev_path/pci_bus" ] ; then
		echo "$dev, Skipped: bridge"
		continue
	fi

	# Skip VFs for now since we don't have a reliable way
	# to break them.
	if [ -e "$dev_path/physfn" ] ; then
		echo "$dev, Skipped: virtfn"
		continue
	fi

	# Don't inject errors into an already-frozen PE. This happens with
	# PEs that contain multiple PCI devices (e.g. multi-function cards)
	# and injecting new errors during the recovery process will probably
	# result in the recovery failing and the device being marked as
	# failed.
	if ! pe_ok "$dev" ; then
		echo "$dev, Skipped: Bad initial PE state"
		continue
	fi

	echo "$dev, Added"

	# Add to the list of devices to check
	devices="$devices $dev"
	dev_count=$((dev_count + 1))
done

echo "Found ${dev_count} breakable devices..."

failed=0
# $devices is deliberately unquoted: it is a space-separated list of
# sysfs directory names built above and must be split into words.
for dev in $devices ; do
	echo "Breaking $dev..."

	# Re-check the PE state right before handing the device over; a
	# device whose PE is no longer ok is counted as a failure.
	if ! pe_ok "$dev" ; then
		echo "Skipping $dev, Initial PE state is not ok"
		failed=$((failed + 1))
		continue
	fi

	if ! eeh_one_dev "$dev" ; then
		failed=$((failed + 1))
	fi
done

echo "$failed devices failed to recover ($dev_count tested)"

# Show any difference between the device list before and after the test.
lspci | diff -u "$pre_lspci" -

# Do not use the raw failure count as the exit status: it is truncated
# to 8 bits (256 failures would look like success) and a count of 4
# would be read as "skipped" by the kselftest harness.
if [ "$failed" -ne 0 ] ; then
	exit 1
fi
exit 0