1 #include <assert.h> 2 #include <unistd.h> 3 4 #include <analyzer/analyzer_main.hpp> 5 #include <analyzer/ras-data/ras-data-parser.hpp> 6 #include <analyzer/service_data.hpp> 7 #include <attn/attn_dump.hpp> 8 #include <hei_main.hpp> 9 #include <util/pdbg.hpp> 10 #include <util/trace.hpp> 11 12 namespace analyzer 13 { 14 15 //------------------------------------------------------------------------------ 16 17 // Forward references for externally defined functions. 18 19 /** 20 * @brief Will get the list of active chip and initialize the isolator. 21 * @param o_chips The returned list of active chips. 22 */ 23 void initializeIsolator(std::vector<libhei::Chip>& o_chips); 24 25 /** 26 * @brief Will get the list of active chip and initialize the isolator. 27 * @param i_type The type of analysis to perform. See enum for details. 28 * @param i_isoData The data gathered during isolation (for FFDC). 29 * @param o_rootCause The returned root cause signature. 30 * @return True, if root cause has been found. False, otherwise. 31 */ 32 bool filterRootCause(AnalysisType i_type, 33 const libhei::IsolationData& i_isoData, 34 libhei::Signature& o_rootCause); 35 36 /** 37 * @brief Will create and submit a PEL using the given data. 38 * @param i_isoData The data gathered during isolation (for FFDC). 39 * @param i_servData Data regarding service actions gathered during analysis. 40 * @return The platform log ID. Will return zero if no PEL is generated. 41 */ 42 uint32_t createPel(const libhei::IsolationData& i_isoData, 43 const ServiceData& i_servData); 44 45 //------------------------------------------------------------------------------ 46 47 const char* __attn(libhei::AttentionType_t i_type) 48 { 49 const char* str = ""; 50 switch (i_type) 51 { 52 case libhei::ATTN_TYPE_CHECKSTOP: 53 str = "CHECKSTOP"; 54 break; 55 case libhei::ATTN_TYPE_UNIT_CS: 56 str = "UNIT_CS"; 57 break; 58 case libhei::ATTN_TYPE_RECOVERABLE: 59 str = "RECOVERABLE"; 60 break; 61 case libhei::ATTN_TYPE_SP_ATTN: 62 str = "SP_ATTN"; 63 break; 64 case libhei::ATTN_TYPE_HOST_ATTN: 65 str = "HOST_ATTN"; 66 break; 67 default: 68 trace::err("Unsupported attention type: %u", i_type); 69 assert(0); 70 } 71 return str; 72 } 73 74 //------------------------------------------------------------------------------ 75 76 const char* __analysisType(AnalysisType i_type) 77 { 78 const char* str = ""; 79 switch (i_type) 80 { 81 case AnalysisType::SYSTEM_CHECKSTOP: 82 str = "SYSTEM_CHECKSTOP"; 83 break; 84 case AnalysisType::TERMINATE_IMMEDIATE: 85 str = "TERMINATE_IMMEDIATE"; 86 break; 87 case AnalysisType::MANUAL: 88 str = "MANUAL"; 89 break; 90 default: 91 trace::err("Unsupported analysis type: %u", i_type); 92 assert(0); 93 } 94 return str; 95 } 96 97 //------------------------------------------------------------------------------ 98 99 uint32_t analyzeHardware(AnalysisType i_type, attn::DumpParameters& o_dump) 100 { 101 uint32_t o_plid = 0; // default, zero indicates PEL was not created 102 103 if (!util::pdbg::queryHardwareAnalysisSupported()) 104 { 105 trace::err("Hardware error analysis is not supported on this system"); 106 return o_plid; 107 } 108 109 trace::inf(">>> enter analyzeHardware(%s)", __analysisType(i_type)); 110 111 // Initialize the isolator and get all of the chips to be analyzed. 112 trace::inf("Initializing the isolator..."); 113 std::vector<libhei::Chip> chips; 114 initializeIsolator(chips); 115 116 // Isolate attentions. 117 trace::inf("Isolating errors: # of chips=%u", chips.size()); 118 libhei::IsolationData isoData{}; 119 libhei::isolate(chips, isoData); 120 121 // For debug, trace out the original list of signatures before filtering. 122 for (const auto& sig : isoData.getSignatureList()) 123 { 124 trace::inf("Signature: %s 0x%0" PRIx32 " %s", 125 util::pdbg::getPath(sig.getChip()), sig.toUint32(), 126 __attn(sig.getAttnType())); 127 } 128 129 // Filter for root cause attention. 130 libhei::Signature rootCause{}; 131 bool attnFound = filterRootCause(i_type, isoData, rootCause); 132 133 // If a root cause attention was found, or if this was a system checkstop, 134 // generate a PEL. 135 if (attnFound || AnalysisType::SYSTEM_CHECKSTOP == i_type) 136 { 137 if (attnFound) 138 { 139 trace::inf("Root cause attention: %s 0x%0" PRIx32 " %s", 140 util::pdbg::getPath(rootCause.getChip()), 141 rootCause.toUint32(), __attn(rootCause.getAttnType())); 142 } 143 else 144 { 145 // This is bad. Analysis should have found a root cause attention 146 // for a system checkstop. Issues could range from code bugs to SCOM 147 // errors. Regardless, generate a PEL with FFDC to assist with 148 // debug. 149 trace::err("System checkstop with no root cause attention"); 150 rootCause = libhei::Signature{}; // just in case 151 } 152 153 // Start building the service data. 154 ServiceData servData{rootCause, i_type}; 155 156 // Apply any service actions, if needed. Note that there are no 157 // resolutions for manual analysis. 158 if (AnalysisType::MANUAL != i_type) 159 { 160 if (attnFound) 161 { 162 // Resolve the root cause attention. 163 RasDataParser rasData{}; 164 rasData.getResolution(rootCause)->resolve(servData); 165 } 166 else 167 { 168 // Analysis failed so apply the Level 2 Support resolution. 169 ProcedureCalloutResolution res{callout::Procedure::NEXTLVL, 170 callout::Priority::HIGH}; 171 res.resolve(servData); 172 } 173 } 174 175 // Create and commit a PEL. 176 o_plid = createPel(isoData, servData); 177 178 if (0 == o_plid) 179 { 180 trace::err("Failed to create PEL"); 181 } 182 else 183 { 184 trace::inf("PEL created: PLID=0x%0" PRIx32, o_plid); 185 186 // Gather/return information needed for dump. A hardware dump will 187 // always be used for system checkstop attenions. Software dumps 188 // will be reserved for MP-IPLs during TI analysis. 189 // TODO: Need ID from root cause. At the moment, HUID does not exist 190 // in devtree. Will need a better ID definition. 191 o_dump.unitId = 0; 192 o_dump.dumpType = attn::DumpType::Hardware; 193 } 194 } 195 else 196 { 197 // It is possible for TI handling, or manually initiated analysis via 198 // the command line, that there will not be an active attention. In 199 // which case, we will do nothing and let the caller of this function 200 // determine if this is the expected behavior. 201 trace::inf("No active attentions found"); 202 } 203 204 // All done, clean up the isolator. 205 trace::inf("Uninitializing isolator..."); 206 libhei::uninitialize(); 207 208 trace::inf("<<< exit analyzeHardware()"); 209 210 return o_plid; 211 } 212 213 //------------------------------------------------------------------------------ 214 215 /** 216 * @brief Get error isolator build information 217 * 218 * @return Pointer to build information 219 */ 220 const char* getBuildInfo() 221 { 222 return libhei::getBuildInfo(); 223 } 224 225 } // namespace analyzer 226