1 #include <assert.h> 2 #include <unistd.h> 3 4 #include <analyzer/analyzer_main.hpp> 5 #include <analyzer/ras-data/ras-data-parser.hpp> 6 #include <analyzer/service_data.hpp> 7 #include <attn/attn_dump.hpp> 8 #include <hei_main.hpp> 9 #include <util/pdbg.hpp> 10 #include <util/trace.hpp> 11 12 namespace analyzer 13 { 14 //------------------------------------------------------------------------------ 15 16 // Forward references for externally defined functions. 17 18 /** 19 * @brief Will get the list of active chip and initialize the isolator. 20 * @param o_chips The returned list of active chips. 21 */ 22 void initializeIsolator(std::vector<libhei::Chip>& o_chips); 23 24 /** 25 * @brief Will get the list of active chip and initialize the isolator. 26 * @param i_type The type of analysis to perform. See enum for details. 27 * @param i_isoData The data gathered during isolation (for FFDC). 28 * @param o_rootCause The returned root cause signature. 29 * @param i_rasData The RAS data parser. 30 * @return True, if root cause has been found. False, otherwise. 31 */ 32 bool filterRootCause(AnalysisType i_type, 33 const libhei::IsolationData& i_isoData, 34 libhei::Signature& o_rootCause, 35 const RasDataParser& i_rasData); 36 37 /** 38 * @brief Will create and submit a PEL using the given data. 39 * @param i_servData Data regarding service actions gathered during analysis. 40 * @return The platform log ID. Will return zero if no PEL is generated. 41 */ 42 uint32_t commitPel(const ServiceData& i_servData); 43 44 //------------------------------------------------------------------------------ 45 46 const char* __attn(libhei::AttentionType_t i_type) 47 { 48 const char* str = ""; 49 switch (i_type) 50 { 51 case libhei::ATTN_TYPE_CHECKSTOP: 52 str = "CHECKSTOP"; 53 break; 54 case libhei::ATTN_TYPE_UNIT_CS: 55 str = "UNIT_CS"; 56 break; 57 case libhei::ATTN_TYPE_RECOVERABLE: 58 str = "RECOVERABLE"; 59 break; 60 case libhei::ATTN_TYPE_SP_ATTN: 61 str = "SP_ATTN"; 62 break; 63 case libhei::ATTN_TYPE_HOST_ATTN: 64 str = "HOST_ATTN"; 65 break; 66 default: 67 trace::err("Unsupported attention type: %u", i_type); 68 assert(0); 69 } 70 return str; 71 } 72 73 //------------------------------------------------------------------------------ 74 75 const char* __analysisType(AnalysisType i_type) 76 { 77 const char* str = ""; 78 switch (i_type) 79 { 80 case AnalysisType::SYSTEM_CHECKSTOP: 81 str = "SYSTEM_CHECKSTOP"; 82 break; 83 case AnalysisType::TERMINATE_IMMEDIATE: 84 str = "TERMINATE_IMMEDIATE"; 85 break; 86 case AnalysisType::MANUAL: 87 str = "MANUAL"; 88 break; 89 default: 90 trace::err("Unsupported analysis type: %u", i_type); 91 assert(0); 92 } 93 return str; 94 } 95 96 //------------------------------------------------------------------------------ 97 98 uint32_t analyzeHardware(AnalysisType i_type, attn::DumpParameters& o_dump) 99 { 100 uint32_t o_plid = 0; // default, zero indicates PEL was not created 101 102 if (!util::pdbg::queryHardwareAnalysisSupported()) 103 { 104 trace::err("Hardware error analysis is not supported on this system"); 105 return o_plid; 106 } 107 108 trace::inf(">>> enter analyzeHardware(%s)", __analysisType(i_type)); 109 110 // Initialize the isolator and get all of the chips to be analyzed. 111 trace::inf("Initializing the isolator..."); 112 std::vector<libhei::Chip> chips; 113 initializeIsolator(chips); 114 115 // Isolate attentions. 116 trace::inf("Isolating errors: # of chips=%u", chips.size()); 117 libhei::IsolationData isoData{}; 118 libhei::isolate(chips, isoData); 119 120 // For debug, trace out the original list of signatures before filtering. 121 for (const auto& sig : isoData.getSignatureList()) 122 { 123 trace::inf("Signature: %s 0x%0" PRIx32 " %s", 124 util::pdbg::getPath(sig.getChip()), sig.toUint32(), 125 __attn(sig.getAttnType())); 126 } 127 128 // Filter for root cause attention. 129 libhei::Signature rootCause{}; 130 RasDataParser rasData{}; 131 bool attnFound = false; 132 try 133 { 134 attnFound = filterRootCause(i_type, isoData, rootCause, rasData); 135 } 136 catch (const std::exception& e) 137 { 138 trace::err("Exception caught during root cause filtering"); 139 trace::err(e.what()); 140 attnFound = false; // just in case 141 } 142 143 // If a root cause attention was found, or if this was a system checkstop, 144 // generate a PEL. 145 if (attnFound || AnalysisType::SYSTEM_CHECKSTOP == i_type) 146 { 147 if (attnFound) 148 { 149 trace::inf("Root cause attention: %s 0x%0" PRIx32 " %s", 150 util::pdbg::getPath(rootCause.getChip()), 151 rootCause.toUint32(), __attn(rootCause.getAttnType())); 152 } 153 else 154 { 155 // This is bad. Analysis should have found a root cause attention 156 // for a system checkstop. Issues could range from code bugs to SCOM 157 // errors. Regardless, generate a PEL with FFDC to assist with 158 // debug. 159 trace::err("System checkstop with no root cause attention"); 160 rootCause = libhei::Signature{}; // just in case 161 } 162 163 // Start building the service data. 164 ServiceData servData{rootCause, i_type, isoData}; 165 166 // Apply any service actions, if needed. Note that there are no 167 // resolutions for manual analysis. 168 if (AnalysisType::MANUAL != i_type) 169 { 170 if (attnFound) 171 { 172 try 173 { 174 // Resolve the root cause attention. 175 rasData.getResolution(rootCause)->resolve(servData); 176 } 177 catch (const std::exception& e) 178 { 179 trace::err("Exception caught during root cause analysis"); 180 trace::err(e.what()); 181 182 // We'll still want to create a PEL for the FFDC, but 183 // since the analysis failed, we need to callout Level 2 184 // Support. 185 servData.calloutProcedure(callout::Procedure::NEXTLVL, 186 callout::Priority::HIGH); 187 } 188 } 189 else 190 { 191 // Analysis failed so callout the Level 2 Support. 192 servData.calloutProcedure(callout::Procedure::NEXTLVL, 193 callout::Priority::HIGH); 194 } 195 } 196 197 // Create and commit a PEL. 198 o_plid = commitPel(servData); 199 200 if (0 == o_plid) 201 { 202 trace::err("Failed to create PEL"); 203 } 204 else 205 { 206 trace::inf("PEL created: PLID=0x%0" PRIx32, o_plid); 207 208 // Gather/return information needed for dump. A hardware dump will 209 // always be used for system checkstop attenions. Software dumps 210 // will be reserved for MP-IPLs during TI analysis. 211 // TODO: Need ID from root cause. At the moment, HUID does not exist 212 // in devtree. Will need a better ID definition. 213 o_dump.unitId = 0; 214 o_dump.dumpType = attn::DumpType::Hardware; 215 } 216 } 217 else 218 { 219 // It is possible for TI handling, or manually initiated analysis via 220 // the command line, that there will not be an active attention. In 221 // which case, we will do nothing and let the caller of this function 222 // determine if this is the expected behavior. 223 trace::inf("No active attentions found"); 224 } 225 226 // All done, clean up the isolator. 227 trace::inf("Uninitializing isolator..."); 228 libhei::uninitialize(); 229 230 trace::inf("<<< exit analyzeHardware()"); 231 232 return o_plid; 233 } 234 235 //------------------------------------------------------------------------------ 236 237 } // namespace analyzer 238