1 #include <assert.h> 2 #include <libpdbg.h> 3 #include <unistd.h> 4 5 #include <analyzer/service_data.hpp> 6 #include <hei_main.hpp> 7 #include <phosphor-logging/log.hpp> 8 #include <util/pdbg.hpp> 9 #include <util/trace.hpp> 10 11 #include <algorithm> 12 #include <fstream> 13 #include <iostream> 14 #include <map> 15 #include <string> 16 17 namespace analyzer 18 { 19 20 //------------------------------------------------------------------------------ 21 22 // Forward references for externally defined functions. 23 24 /** 25 * @brief Will get the list of active chip and initialize the isolator. 26 * @param o_chips The returned list of active chips. 27 */ 28 void initializeIsolator(std::vector<libhei::Chip>& o_chips); 29 30 /** 31 * @brief Will create and submit a PEL using the given data. 32 * @param i_rootCause A signature defining the attention root cause. 33 * @param i_isoData The data gathered during isolation (for FFDC). 34 * @param i_servData Data regarding service actions gathered during analysis. 35 */ 36 void createPel(const libhei::Signature& i_rootCause, 37 const libhei::IsolationData& i_isoData, 38 const ServiceData& i_servData); 39 40 //------------------------------------------------------------------------------ 41 42 const char* __attn(libhei::AttentionType_t i_attnType) 43 { 44 const char* str = ""; 45 switch (i_attnType) 46 { 47 case libhei::ATTN_TYPE_CHECKSTOP: 48 str = "CHECKSTOP"; 49 break; 50 case libhei::ATTN_TYPE_UNIT_CS: 51 str = "UNIT_CS"; 52 break; 53 case libhei::ATTN_TYPE_RECOVERABLE: 54 str = "RECOVERABLE"; 55 break; 56 case libhei::ATTN_TYPE_SP_ATTN: 57 str = "SP_ATTN"; 58 break; 59 case libhei::ATTN_TYPE_HOST_ATTN: 60 str = "HOST_ATTN"; 61 break; 62 default: 63 trace::err("Unsupported attention type: %u", i_attnType); 64 assert(0); 65 } 66 return str; 67 } 68 69 //------------------------------------------------------------------------------ 70 71 bool __filterRootCause(const libhei::IsolationData& i_isoData, 72 libhei::Signature& o_signature) 73 { 74 // We'll need to make a copy of the list so that the original list is 75 // maintained for the log. 76 std::vector<libhei::Signature> sigList{i_isoData.getSignatureList()}; 77 78 // For debug, trace out the original list of signatures before filtering. 79 for (const auto& sig : sigList) 80 { 81 trace::inf("Signature: %s 0x%0" PRIx32 " %s", 82 util::pdbg::getPath(sig.getChip()), sig.toUint32(), 83 __attn(sig.getAttnType())); 84 } 85 86 // Special and host attentions are not supported by this user application. 87 auto newEndItr = 88 std::remove_if(sigList.begin(), sigList.end(), [&](const auto& t) { 89 return (libhei::ATTN_TYPE_SP_ATTN == t.getAttnType() || 90 libhei::ATTN_TYPE_HOST_ATTN == t.getAttnType()); 91 }); 92 93 // Shrink the vector, if needed. 94 sigList.resize(std::distance(sigList.begin(), newEndItr)); 95 96 // START WORKAROUND 97 // TODO: Filtering should be determined by the RAS Data Files provided by 98 // the host firmware via the PNOR (similar to the Chip Data Files). 99 // Until that support is available, use a rudimentary filter that 100 // first looks for any recoverable attention, then any unit checkstop, 101 // and then any system checkstop. This is built on the premise that 102 // recoverable errors could be the root cause of an system checkstop 103 // attentions. Fortunately, we just need to sort the list by the 104 // greater attention type value. 105 std::sort(sigList.begin(), sigList.end(), 106 [&](const auto& a, const auto& b) { 107 return a.getAttnType() > b.getAttnType(); 108 }); 109 // END WORKAROUND 110 111 // Check if a root cause attention was found. 112 if (!sigList.empty()) 113 { 114 // The entry at the front of the list will be the root cause. 115 o_signature = sigList.front(); 116 return true; 117 } 118 119 return false; // default, no active attentions found. 120 } 121 122 //------------------------------------------------------------------------------ 123 124 bool __analyze(const libhei::IsolationData& i_isoData) 125 { 126 bool attnFound = false; 127 128 libhei::Signature rootCause{}; 129 attnFound = __filterRootCause(i_isoData, rootCause); 130 131 if (!attnFound) 132 { 133 // NOTE: It is possible for TI handling that there will not be an active 134 // attention. In which case, we will not do anything and let the 135 // caller of this function determine if this is the expected 136 // behavior. 137 trace::inf("No active attentions found"); 138 } 139 else 140 { 141 trace::inf("Root cause attention: %s 0x%0" PRIx32 " %s", 142 util::pdbg::getPath(rootCause.getChip()), 143 rootCause.toUint32(), __attn(rootCause.getAttnType())); 144 145 // TODO: Perform service actions based on the root cause. The default 146 // callout if none other exist is level 2 support. 147 ServiceData servData{}; 148 servData.addCallout(std::make_shared<ProcedureCallout>( 149 ProcedureCallout::NEXTLVL, Callout::Priority::HIGH)); 150 151 // Create and commit a PEL. 152 createPel(rootCause, i_isoData, servData); 153 } 154 155 return attnFound; 156 } 157 158 //------------------------------------------------------------------------------ 159 160 bool analyzeHardware() 161 { 162 bool attnFound = false; 163 164 trace::inf(">>> enter analyzeHardware()"); 165 166 if (util::pdbg::queryHardwareAnalysisSupported()) 167 { 168 // Initialize the isolator and get all of the chips to be analyzed. 169 trace::inf("Initializing the isolator..."); 170 std::vector<libhei::Chip> chips; 171 initializeIsolator(chips); 172 173 // Isolate attentions. 174 trace::inf("Isolating errors: # of chips=%u", chips.size()); 175 libhei::IsolationData isoData{}; 176 libhei::isolate(chips, isoData); 177 178 // Analyze the isolation data and perform service actions if needed. 179 attnFound = __analyze(isoData); 180 181 // All done, clean up the isolator. 182 trace::inf("Uninitializing isolator..."); 183 libhei::uninitialize(); 184 } 185 else 186 { 187 trace::err("Hardware error analysis is not supported on this system"); 188 } 189 190 trace::inf("<<< exit analyzeHardware()"); 191 192 return attnFound; 193 } 194 195 } // namespace analyzer 196