1 #include <assert.h> 2 #include <unistd.h> 3 4 #include <analyzer/analyzer_main.hpp> 5 #include <analyzer/ras-data/ras-data-parser.hpp> 6 #include <analyzer/service_data.hpp> 7 #include <attn/attn_dump.hpp> 8 #include <hei_main.hpp> 9 #include <util/pdbg.hpp> 10 #include <util/trace.hpp> 11 12 namespace analyzer 13 { 14 15 //------------------------------------------------------------------------------ 16 17 // Forward references for externally defined functions. 18 19 /** 20 * @brief Will get the list of active chip and initialize the isolator. 21 * @param o_chips The returned list of active chips. 22 */ 23 void initializeIsolator(std::vector<libhei::Chip>& o_chips); 24 25 /** 26 * @brief Will get the list of active chip and initialize the isolator. 27 * @param i_type The type of analysis to perform. See enum for details. 28 * @param i_isoData The data gathered during isolation (for FFDC). 29 * @param o_rootCause The returned root cause signature. 30 * @return True, if root cause has been found. False, otherwise. 31 */ 32 bool filterRootCause(AnalysisType i_type, 33 const libhei::IsolationData& i_isoData, 34 libhei::Signature& o_rootCause); 35 36 /** 37 * @brief Will create and submit a PEL using the given data. 38 * @param i_servData Data regarding service actions gathered during analysis. 39 * @return The platform log ID. Will return zero if no PEL is generated. 40 */ 41 uint32_t commitPel(const ServiceData& i_servData); 42 43 //------------------------------------------------------------------------------ 44 45 const char* __attn(libhei::AttentionType_t i_type) 46 { 47 const char* str = ""; 48 switch (i_type) 49 { 50 case libhei::ATTN_TYPE_CHECKSTOP: 51 str = "CHECKSTOP"; 52 break; 53 case libhei::ATTN_TYPE_UNIT_CS: 54 str = "UNIT_CS"; 55 break; 56 case libhei::ATTN_TYPE_RECOVERABLE: 57 str = "RECOVERABLE"; 58 break; 59 case libhei::ATTN_TYPE_SP_ATTN: 60 str = "SP_ATTN"; 61 break; 62 case libhei::ATTN_TYPE_HOST_ATTN: 63 str = "HOST_ATTN"; 64 break; 65 default: 66 trace::err("Unsupported attention type: %u", i_type); 67 assert(0); 68 } 69 return str; 70 } 71 72 //------------------------------------------------------------------------------ 73 74 const char* __analysisType(AnalysisType i_type) 75 { 76 const char* str = ""; 77 switch (i_type) 78 { 79 case AnalysisType::SYSTEM_CHECKSTOP: 80 str = "SYSTEM_CHECKSTOP"; 81 break; 82 case AnalysisType::TERMINATE_IMMEDIATE: 83 str = "TERMINATE_IMMEDIATE"; 84 break; 85 case AnalysisType::MANUAL: 86 str = "MANUAL"; 87 break; 88 default: 89 trace::err("Unsupported analysis type: %u", i_type); 90 assert(0); 91 } 92 return str; 93 } 94 95 //------------------------------------------------------------------------------ 96 97 uint32_t analyzeHardware(AnalysisType i_type, attn::DumpParameters& o_dump) 98 { 99 uint32_t o_plid = 0; // default, zero indicates PEL was not created 100 101 if (!util::pdbg::queryHardwareAnalysisSupported()) 102 { 103 trace::err("Hardware error analysis is not supported on this system"); 104 return o_plid; 105 } 106 107 trace::inf(">>> enter analyzeHardware(%s)", __analysisType(i_type)); 108 109 // Initialize the isolator and get all of the chips to be analyzed. 110 trace::inf("Initializing the isolator..."); 111 std::vector<libhei::Chip> chips; 112 initializeIsolator(chips); 113 114 // Isolate attentions. 115 trace::inf("Isolating errors: # of chips=%u", chips.size()); 116 libhei::IsolationData isoData{}; 117 libhei::isolate(chips, isoData); 118 119 // For debug, trace out the original list of signatures before filtering. 120 for (const auto& sig : isoData.getSignatureList()) 121 { 122 trace::inf("Signature: %s 0x%0" PRIx32 " %s", 123 util::pdbg::getPath(sig.getChip()), sig.toUint32(), 124 __attn(sig.getAttnType())); 125 } 126 127 // Filter for root cause attention. 128 libhei::Signature rootCause{}; 129 bool attnFound = filterRootCause(i_type, isoData, rootCause); 130 131 // If a root cause attention was found, or if this was a system checkstop, 132 // generate a PEL. 133 if (attnFound || AnalysisType::SYSTEM_CHECKSTOP == i_type) 134 { 135 if (attnFound) 136 { 137 trace::inf("Root cause attention: %s 0x%0" PRIx32 " %s", 138 util::pdbg::getPath(rootCause.getChip()), 139 rootCause.toUint32(), __attn(rootCause.getAttnType())); 140 } 141 else 142 { 143 // This is bad. Analysis should have found a root cause attention 144 // for a system checkstop. Issues could range from code bugs to SCOM 145 // errors. Regardless, generate a PEL with FFDC to assist with 146 // debug. 147 trace::err("System checkstop with no root cause attention"); 148 rootCause = libhei::Signature{}; // just in case 149 } 150 151 // Start building the service data. 152 ServiceData servData{rootCause, i_type, isoData}; 153 154 // Apply any service actions, if needed. Note that there are no 155 // resolutions for manual analysis. 156 if (AnalysisType::MANUAL != i_type) 157 { 158 if (attnFound) 159 { 160 try 161 { 162 // Resolve the root cause attention. 163 RasDataParser rasData{}; 164 rasData.getResolution(rootCause)->resolve(servData); 165 } 166 catch (const std::exception& e) 167 { 168 trace::err("Exception caught during root cause analysis"); 169 trace::err(e.what()); 170 171 // We'll still want to create a PEL for the FFDC, but 172 // since the analysis failed, we need to callout Level 2 173 // Support. 174 servData.calloutProcedure(callout::Procedure::NEXTLVL, 175 callout::Priority::HIGH); 176 } 177 } 178 else 179 { 180 // Analysis failed so callout the Level 2 Support. 181 servData.calloutProcedure(callout::Procedure::NEXTLVL, 182 callout::Priority::HIGH); 183 } 184 } 185 186 // Create and commit a PEL. 187 o_plid = commitPel(servData); 188 189 if (0 == o_plid) 190 { 191 trace::err("Failed to create PEL"); 192 } 193 else 194 { 195 trace::inf("PEL created: PLID=0x%0" PRIx32, o_plid); 196 197 // Gather/return information needed for dump. A hardware dump will 198 // always be used for system checkstop attenions. Software dumps 199 // will be reserved for MP-IPLs during TI analysis. 200 // TODO: Need ID from root cause. At the moment, HUID does not exist 201 // in devtree. Will need a better ID definition. 202 o_dump.unitId = 0; 203 o_dump.dumpType = attn::DumpType::Hardware; 204 } 205 } 206 else 207 { 208 // It is possible for TI handling, or manually initiated analysis via 209 // the command line, that there will not be an active attention. In 210 // which case, we will do nothing and let the caller of this function 211 // determine if this is the expected behavior. 212 trace::inf("No active attentions found"); 213 } 214 215 // All done, clean up the isolator. 216 trace::inf("Uninitializing isolator..."); 217 libhei::uninitialize(); 218 219 trace::inf("<<< exit analyzeHardware()"); 220 221 return o_plid; 222 } 223 224 //------------------------------------------------------------------------------ 225 226 /** 227 * @brief Get error isolator build information 228 * 229 * @return Pointer to build information 230 */ 231 const char* getBuildInfo() 232 { 233 return libhei::getBuildInfo(); 234 } 235 236 } // namespace analyzer 237