#!/bin/sh
# SPDX-License-Identifier: GPL-2.0-only
#
# Basic EEH selftest.
#
# Scans the function-0 PCI devices, filters out the ones that cannot be
# broken safely, then breaks each remaining device and reports how many
# of them failed to recover. The helpers eeh_supported, pe_ok and
# eeh_one_dev come from eeh-functions.sh.

. ./eeh-functions.sh

if ! eeh_supported ; then
	echo "EEH not supported on this system, skipping"
	exit 0
fi

# Bail out when either debugfs test file is absent. The message below
# talks about missing "files", and a test that only fires when both are
# gone would let a half-populated debugfs through.
# NOTE(review): confirm that eeh-functions.sh really needs both files.
if [ ! -e "/sys/kernel/debug/powerpc/eeh_dev_check" ] || \
   [ ! -e "/sys/kernel/debug/powerpc/eeh_dev_break" ] ; then
	echo "debugfs EEH testing files are missing. Is debugfs mounted?"
	exit 1
fi

# Snapshot the PCI device list so it can be diffed against the state
# at the end of the run.
pre_lspci=$(mktemp) || {
	echo "Unable to create a temporary file for the lspci snapshot"
	exit 1
}

# Remove the snapshot on every exit path. Signals are turned into a
# normal exit so that the EXIT trap also runs for them.
trap 'rm -f -- "$pre_lspci"' EXIT
trap 'exit 1' HUP INT TERM

lspci > "$pre_lspci"

# Bump the max freeze count to something absurd so we don't
# trip over it while breaking things.
echo 5000 > /sys/kernel/debug/powerpc/eeh_max_freezes

# Record the devices that we break in here. Assuming everything
# goes to plan we should get them back once the recovery process
# is finished.
devices=""
dev_count=0

# Build up a list of candidate devices (function 0 only).
for dev_path in /sys/bus/pci/devices/*.0 ; do
	# An unmatched glob is left as the literal pattern; skip it.
	[ -e "$dev_path" ] || continue
	dev=${dev_path##*/}

	# Skip bridges since we can't recover them (yet...)
	if [ -e "$dev_path/pci_bus" ] ; then
		echo "$dev, Skipped: bridge"
		continue
	fi

	# Skip VFs for now since we don't have a reliable way
	# to break them.
	if [ -e "$dev_path/physfn" ] ; then
		echo "$dev, Skipped: virtfn"
		continue
	fi

	# Only resolve the driver link when it exists; otherwise realpath
	# fails and basename is run without an operand.
	if [ -e "$dev_path/driver" ] && \
	   [ "ahci" = "$(basename "$(realpath "$dev_path/driver")")" ] ; then
		echo "$dev, Skipped: ahci doesn't support recovery"
		continue
	fi

	# Don't inject errors into an already-frozen PE. This happens with
	# PEs that contain multiple PCI devices (e.g. multi-function cards)
	# and injecting new errors during the recovery process will probably
	# result in the recovery failing and the device being marked as
	# failed.
	if ! pe_ok "$dev" ; then
		echo "$dev, Skipped: Bad initial PE state"
		continue
	fi

	echo "$dev, Added"

	# Add to the list of devices to check.
	devices="$devices $dev"
	dev_count=$((dev_count + 1))
done

echo "Found ${dev_count} breakable devices..."

# Break every candidate in turn and count the ones that do not make it.
failed=0

# $devices is a space-separated list, so it is left unquoted on purpose
# to get one loop iteration per device.
for dev in $devices ; do
	echo "Breaking $dev..."

	# Re-check the PE state right before breaking the device. A device
	# whose PE is not ok at this point is counted as a failure and is
	# not broken.
	if ! pe_ok "$dev" ; then
		echo "Skipping $dev, Initial PE state is not ok"
		failed="$((failed + 1))"
		continue
	fi

	if ! eeh_one_dev "$dev" ; then
		failed="$((failed + 1))"
	fi
done

echo "$failed devices failed to recover ($dev_count tested)"

# Show any difference between the PCI device list taken before the
# test and the current one, then drop the snapshot.
lspci | diff -u "$pre_lspci" -
rm -f -- "$pre_lspci"

# The exit status is the failure count. Only the low 8 bits of an exit
# status reach the parent, so clamp it: otherwise 256 failures (or any
# multiple of 256) would be reported as success.
if [ "$failed" -gt 255 ] ; then
	exit 255
fi
exit "$failed"