1 #include <assert.h> 2 #include <unistd.h> 3 4 #include <analyzer/analyzer_main.hpp> 5 #include <analyzer/ras-data/ras-data-parser.hpp> 6 #include <analyzer/service_data.hpp> 7 #include <attn/attn_dump.hpp> 8 #include <hei_main.hpp> 9 #include <util/pdbg.hpp> 10 #include <util/trace.hpp> 11 12 namespace analyzer 13 { 14 15 //------------------------------------------------------------------------------ 16 17 // Forward references for externally defined functions. 18 19 /** 20 * @brief Will get the list of active chip and initialize the isolator. 21 * @param o_chips The returned list of active chips. 22 */ 23 void initializeIsolator(std::vector<libhei::Chip>& o_chips); 24 25 /** 26 * @brief Will get the list of active chip and initialize the isolator. 27 * @param i_isoData The data gathered during isolation (for FFDC). 28 * @param o_rootCause The returned root cause signature. 29 * @return True, if root cause has been found. False, otherwise. 30 */ 31 bool filterRootCause(const libhei::IsolationData& i_isoData, 32 libhei::Signature& o_rootCause); 33 34 /** 35 * @brief Will create and submit a PEL using the given data. 36 * @param i_isoData The data gathered during isolation (for FFDC). 37 * @param i_servData Data regarding service actions gathered during analysis. 38 * @return The platform log ID. Will return zero if no PEL is generated. 39 */ 40 uint32_t createPel(const libhei::IsolationData& i_isoData, 41 const ServiceData& i_servData); 42 43 //------------------------------------------------------------------------------ 44 45 const char* __attn(libhei::AttentionType_t i_type) 46 { 47 const char* str = ""; 48 switch (i_type) 49 { 50 case libhei::ATTN_TYPE_CHECKSTOP: 51 str = "CHECKSTOP"; 52 break; 53 case libhei::ATTN_TYPE_UNIT_CS: 54 str = "UNIT_CS"; 55 break; 56 case libhei::ATTN_TYPE_RECOVERABLE: 57 str = "RECOVERABLE"; 58 break; 59 case libhei::ATTN_TYPE_SP_ATTN: 60 str = "SP_ATTN"; 61 break; 62 case libhei::ATTN_TYPE_HOST_ATTN: 63 str = "HOST_ATTN"; 64 break; 65 default: 66 trace::err("Unsupported attention type: %u", i_type); 67 assert(0); 68 } 69 return str; 70 } 71 72 //------------------------------------------------------------------------------ 73 74 const char* __analysisType(AnalysisType i_type) 75 { 76 const char* str = ""; 77 switch (i_type) 78 { 79 case AnalysisType::SYSTEM_CHECKSTOP: 80 str = "SYSTEM_CHECKSTOP"; 81 break; 82 case AnalysisType::TERMINATE_IMMEDIATE: 83 str = "TERMINATE_IMMEDIATE"; 84 break; 85 case AnalysisType::MANUAL: 86 str = "MANUAL"; 87 break; 88 default: 89 trace::err("Unsupported analysis type: %u", i_type); 90 assert(0); 91 } 92 return str; 93 } 94 95 //------------------------------------------------------------------------------ 96 97 uint32_t analyzeHardware(AnalysisType i_type, attn::DumpParameters& o_dump) 98 { 99 uint32_t o_plid = 0; // default, zero indicates PEL was not created 100 101 if (!util::pdbg::queryHardwareAnalysisSupported()) 102 { 103 trace::err("Hardware error analysis is not supported on this system"); 104 return o_plid; 105 } 106 107 trace::inf(">>> enter analyzeHardware(%s)", __analysisType(i_type)); 108 109 // Initialize the isolator and get all of the chips to be analyzed. 110 trace::inf("Initializing the isolator..."); 111 std::vector<libhei::Chip> chips; 112 initializeIsolator(chips); 113 114 // Isolate attentions. 115 trace::inf("Isolating errors: # of chips=%u", chips.size()); 116 libhei::IsolationData isoData{}; 117 libhei::isolate(chips, isoData); 118 119 // For debug, trace out the original list of signatures before filtering. 120 for (const auto& sig : isoData.getSignatureList()) 121 { 122 trace::inf("Signature: %s 0x%0" PRIx32 " %s", 123 util::pdbg::getPath(sig.getChip()), sig.toUint32(), 124 __attn(sig.getAttnType())); 125 } 126 127 // Filter for root cause attention. 128 libhei::Signature rootCause{}; 129 bool attnFound = filterRootCause(isoData, rootCause); 130 131 // If a root cause attention was found, or if this was a system checkstop, 132 // generate a PEL. 133 if (attnFound || AnalysisType::SYSTEM_CHECKSTOP == i_type) 134 { 135 if (attnFound) 136 { 137 trace::inf("Root cause attention: %s 0x%0" PRIx32 " %s", 138 util::pdbg::getPath(rootCause.getChip()), 139 rootCause.toUint32(), __attn(rootCause.getAttnType())); 140 } 141 else 142 { 143 // This is bad. Analysis should have found a root cause attention 144 // for a system checkstop. Issues could range from code bugs to SCOM 145 // errors. Regardless, generate a PEL with FFDC to assist with 146 // debug. 147 trace::err("System checkstop with no root cause attention"); 148 rootCause = libhei::Signature{}; // just in case 149 } 150 151 // Start building the service data. 152 ServiceData servData{rootCause, i_type}; 153 154 // Apply any service actions, if needed. Note that there are no 155 // resolutions for manual analysis. 156 if (AnalysisType::MANUAL != i_type) 157 { 158 if (attnFound) 159 { 160 // Resolve the root cause attention. 161 RasDataParser rasData{}; 162 rasData.getResolution(rootCause)->resolve(servData); 163 } 164 else 165 { 166 // Analysis failed so apply the Level 2 Support resolution. 167 ProcedureCalloutResolution res{callout::Procedure::NEXTLVL, 168 callout::Priority::HIGH}; 169 res.resolve(servData); 170 } 171 } 172 173 // Create and commit a PEL. 174 o_plid = createPel(isoData, servData); 175 176 if (0 == o_plid) 177 { 178 trace::err("Failed to create PEL"); 179 } 180 else 181 { 182 trace::inf("PEL created: PLID=0x%0" PRIx32, o_plid); 183 184 // Gather/return information needed for dump. A hardware dump will 185 // always be used for system checkstop attenions. Software dumps 186 // will be reserved for MP-IPLs during TI analysis. 187 // TODO: Need ID from root cause. At the moment, HUID does not exist 188 // in devtree. Will need a better ID definition. 189 o_dump.unitId = 0; 190 o_dump.dumpType = attn::DumpType::Hardware; 191 } 192 } 193 else 194 { 195 // It is possible for TI handling, or manually initiated analysis via 196 // the command line, that there will not be an active attention. In 197 // which case, we will do nothing and let the caller of this function 198 // determine if this is the expected behavior. 199 trace::inf("No active attentions found"); 200 } 201 202 // All done, clean up the isolator. 203 trace::inf("Uninitializing isolator..."); 204 libhei::uninitialize(); 205 206 trace::inf("<<< exit analyzeHardware()"); 207 208 return o_plid; 209 } 210 211 //------------------------------------------------------------------------------ 212 213 /** 214 * @brief Get error isolator build information 215 * 216 * @return Pointer to build information 217 */ 218 const char* getBuildInfo() 219 { 220 return libhei::getBuildInfo(); 221 } 222 223 } // namespace analyzer 224