1 #include <assert.h> 2 #include <libpdbg.h> 3 #include <unistd.h> 4 5 #include <hei_main.hpp> 6 #include <phosphor-logging/log.hpp> 7 #include <util/pdbg.hpp> 8 #include <util/trace.hpp> 9 10 #include <algorithm> 11 #include <fstream> 12 #include <iostream> 13 #include <map> 14 #include <string> 15 16 namespace analyzer 17 { 18 19 //------------------------------------------------------------------------------ 20 21 // Forward references for externally defined functions. 22 23 /** 24 * @brief Will get the list of active chip and initialize the isolator. 25 * @param o_chips The returned list of active chips. 26 */ 27 void initializeIsolator(std::vector<libhei::Chip>& o_chips); 28 29 /** 30 * @brief Will create and submit a PEL using the given data. 31 * @param i_rootCause A signature defining the attention root cause. 32 * @param i_isoData The data gathered during isolation (for FFDC). 33 */ 34 void createPel(const libhei::Signature& i_rootCause, 35 const libhei::IsolationData& i_isoData); 36 37 //------------------------------------------------------------------------------ 38 39 const char* __attn(libhei::AttentionType_t i_attnType) 40 { 41 const char* str = ""; 42 switch (i_attnType) 43 { 44 case libhei::ATTN_TYPE_CHECKSTOP: 45 str = "CHECKSTOP"; 46 break; 47 case libhei::ATTN_TYPE_UNIT_CS: 48 str = "UNIT_CS"; 49 break; 50 case libhei::ATTN_TYPE_RECOVERABLE: 51 str = "RECOVERABLE"; 52 break; 53 case libhei::ATTN_TYPE_SP_ATTN: 54 str = "SP_ATTN"; 55 break; 56 case libhei::ATTN_TYPE_HOST_ATTN: 57 str = "HOST_ATTN"; 58 break; 59 default: 60 trace::err("Unsupported attention type: %u", i_attnType); 61 assert(0); 62 } 63 return str; 64 } 65 66 //------------------------------------------------------------------------------ 67 68 bool __filterRootCause(const libhei::IsolationData& i_isoData, 69 libhei::Signature& o_signature) 70 { 71 // We'll need to make a copy of the list so that the original list is 72 // maintained for the log. 73 std::vector<libhei::Signature> sigList{i_isoData.getSignatureList()}; 74 75 // For debug, trace out the original list of signatures before filtering. 76 for (const auto& sig : sigList) 77 { 78 trace::inf("Signature: %s 0x%0" PRIx32 " %s", 79 util::pdbg::getPath(sig.getChip()), sig.toUint32(), 80 __attn(sig.getAttnType())); 81 } 82 83 // Special and host attentions are not supported by this user application. 84 auto newEndItr = 85 std::remove_if(sigList.begin(), sigList.end(), [&](const auto& t) { 86 return (libhei::ATTN_TYPE_SP_ATTN == t.getAttnType() || 87 libhei::ATTN_TYPE_HOST_ATTN == t.getAttnType()); 88 }); 89 90 // Shrink the vector, if needed. 91 sigList.resize(std::distance(sigList.begin(), newEndItr)); 92 93 // START WORKAROUND 94 // TODO: Filtering should be determined by the RAS Data Files provided by 95 // the host firmware via the PNOR (similar to the Chip Data Files). 96 // Until that support is available, use a rudimentary filter that 97 // first looks for any recoverable attention, then any unit checkstop, 98 // and then any system checkstop. This is built on the premise that 99 // recoverable errors could be the root cause of an system checkstop 100 // attentions. Fortunately, we just need to sort the list by the 101 // greater attention type value. 102 std::sort(sigList.begin(), sigList.end(), 103 [&](const auto& a, const auto& b) { 104 return a.getAttnType() > b.getAttnType(); 105 }); 106 // END WORKAROUND 107 108 // Check if a root cause attention was found. 109 if (!sigList.empty()) 110 { 111 // The entry at the front of the list will be the root cause. 112 o_signature = sigList.front(); 113 return true; 114 } 115 116 return false; // default, no active attentions found. 117 } 118 119 //------------------------------------------------------------------------------ 120 121 bool __analyze(const libhei::IsolationData& i_isoData) 122 { 123 bool attnFound = false; 124 125 libhei::Signature rootCause{}; 126 attnFound = __filterRootCause(i_isoData, rootCause); 127 128 if (!attnFound) 129 { 130 // NOTE: It is possible for TI handling that there will not be an active 131 // attention. In which case, we will not do anything and let the 132 // caller of this function determine if this is the expected 133 // behavior. 134 trace::inf("No active attentions found"); 135 } 136 else 137 { 138 trace::inf("Root cause attention: %s 0x%0" PRIx32 " %s", 139 util::pdbg::getPath(rootCause.getChip()), 140 rootCause.toUint32(), __attn(rootCause.getAttnType())); 141 142 // TODO: Perform service actions based on the root cause. 143 144 // Create and commit a PEL. 145 createPel(rootCause, i_isoData); 146 } 147 148 return attnFound; 149 } 150 151 //------------------------------------------------------------------------------ 152 153 bool analyzeHardware() 154 { 155 bool attnFound = false; 156 157 trace::inf(">>> enter analyzeHardware()"); 158 159 if (util::pdbg::queryHardwareAnalysisSupported()) 160 { 161 // Initialize the isolator and get all of the chips to be analyzed. 162 trace::inf("Initializing the isolator..."); 163 std::vector<libhei::Chip> chips; 164 initializeIsolator(chips); 165 166 // Isolate attentions. 167 trace::inf("Isolating errors: # of chips=%u", chips.size()); 168 libhei::IsolationData isoData{}; 169 libhei::isolate(chips, isoData); 170 171 // Analyze the isolation data and perform service actions if needed. 172 attnFound = __analyze(isoData); 173 174 // All done, clean up the isolator. 175 trace::inf("Uninitializing isolator..."); 176 libhei::uninitialize(); 177 } 178 else 179 { 180 trace::err("Hardware error analysis is not supported on this system"); 181 } 182 183 trace::inf("<<< exit analyzeHardware()"); 184 185 return attnFound; 186 } 187 188 } // namespace analyzer 189