1 #include <assert.h> 2 #include <unistd.h> 3 4 #include <analyzer/analyzer_main.hpp> 5 #include <analyzer/ras-data/ras-data-parser.hpp> 6 #include <analyzer/service_data.hpp> 7 #include <attn/attn_dump.hpp> 8 #include <hei_main.hpp> 9 #include <util/pdbg.hpp> 10 #include <util/trace.hpp> 11 12 namespace analyzer 13 { 14 //------------------------------------------------------------------------------ 15 16 // Forward references for externally defined functions. 17 18 /** 19 * @brief Will get the list of active chip and initialize the isolator. 20 * @param o_chips The returned list of active chips. 21 */ 22 void initializeIsolator(std::vector<libhei::Chip>& o_chips); 23 24 /** 25 * @brief Will get the list of active chip and initialize the isolator. 26 * @param i_type The type of analysis to perform. See enum for details. 27 * @param i_isoData The data gathered during isolation (for FFDC). 28 * @param o_rootCause The returned root cause signature. 29 * @param i_rasData The RAS data parser. 30 * @return True, if root cause has been found. False, otherwise. 31 */ 32 bool filterRootCause(AnalysisType i_type, 33 const libhei::IsolationData& i_isoData, 34 libhei::Signature& o_rootCause, 35 const RasDataParser& i_rasData); 36 37 /** 38 * @brief Will create and submit a PEL using the given data. 39 * @param i_servData Data regarding service actions gathered during analysis. 40 * @return The platform log ID. Will return zero if no PEL is generated. 41 */ 42 uint32_t commitPel(const ServiceData& i_servData); 43 44 //------------------------------------------------------------------------------ 45 46 const char* __attn(libhei::AttentionType_t i_type) 47 { 48 const char* str = ""; 49 switch (i_type) 50 { 51 case libhei::ATTN_TYPE_CHECKSTOP: 52 str = "CHECKSTOP"; 53 break; 54 case libhei::ATTN_TYPE_UNIT_CS: 55 str = "UNIT_CS"; 56 break; 57 case libhei::ATTN_TYPE_RECOVERABLE: 58 str = "RECOVERABLE"; 59 break; 60 case libhei::ATTN_TYPE_SP_ATTN: 61 str = "SP_ATTN"; 62 break; 63 case libhei::ATTN_TYPE_HOST_ATTN: 64 str = "HOST_ATTN"; 65 break; 66 default: 67 trace::err("Unsupported attention type: %u", i_type); 68 assert(0); 69 } 70 return str; 71 } 72 73 //------------------------------------------------------------------------------ 74 75 const char* __analysisType(AnalysisType i_type) 76 { 77 const char* str = ""; 78 switch (i_type) 79 { 80 case AnalysisType::SYSTEM_CHECKSTOP: 81 str = "SYSTEM_CHECKSTOP"; 82 break; 83 case AnalysisType::TERMINATE_IMMEDIATE: 84 str = "TERMINATE_IMMEDIATE"; 85 break; 86 case AnalysisType::MANUAL: 87 str = "MANUAL"; 88 break; 89 default: 90 trace::err("Unsupported analysis type: %u", i_type); 91 assert(0); 92 } 93 return str; 94 } 95 96 //------------------------------------------------------------------------------ 97 98 uint32_t analyzeHardware(AnalysisType i_type, attn::DumpParameters& o_dump) 99 { 100 uint32_t o_plid = 0; // default, zero indicates PEL was not created 101 102 if (!util::pdbg::queryHardwareAnalysisSupported()) 103 { 104 trace::err("Hardware error analysis is not supported on this system"); 105 return o_plid; 106 } 107 108 trace::inf(">>> enter analyzeHardware(%s)", __analysisType(i_type)); 109 110 // Initialize the isolator and get all of the chips to be analyzed. 111 trace::inf("Initializing the isolator..."); 112 std::vector<libhei::Chip> chips; 113 initializeIsolator(chips); 114 115 // Isolate attentions. 116 trace::inf("Isolating errors: # of chips=%u", chips.size()); 117 libhei::IsolationData isoData{}; 118 libhei::isolate(chips, isoData); 119 120 // For debug, trace out the original list of signatures before filtering. 121 for (const auto& sig : isoData.getSignatureList()) 122 { 123 trace::inf("Signature: %s 0x%0" PRIx32 " %s", 124 util::pdbg::getPath(sig.getChip()), sig.toUint32(), 125 __attn(sig.getAttnType())); 126 } 127 128 // Filter for root cause attention. 129 libhei::Signature rootCause{}; 130 RasDataParser rasData{}; 131 bool attnFound = filterRootCause(i_type, isoData, rootCause, rasData); 132 133 // If a root cause attention was found, or if this was a system checkstop, 134 // generate a PEL. 135 if (attnFound || AnalysisType::SYSTEM_CHECKSTOP == i_type) 136 { 137 if (attnFound) 138 { 139 trace::inf("Root cause attention: %s 0x%0" PRIx32 " %s", 140 util::pdbg::getPath(rootCause.getChip()), 141 rootCause.toUint32(), __attn(rootCause.getAttnType())); 142 } 143 else 144 { 145 // This is bad. Analysis should have found a root cause attention 146 // for a system checkstop. Issues could range from code bugs to SCOM 147 // errors. Regardless, generate a PEL with FFDC to assist with 148 // debug. 149 trace::err("System checkstop with no root cause attention"); 150 rootCause = libhei::Signature{}; // just in case 151 } 152 153 // Start building the service data. 154 ServiceData servData{rootCause, i_type, isoData}; 155 156 // Apply any service actions, if needed. Note that there are no 157 // resolutions for manual analysis. 158 if (AnalysisType::MANUAL != i_type) 159 { 160 if (attnFound) 161 { 162 try 163 { 164 // Resolve the root cause attention. 165 rasData.getResolution(rootCause)->resolve(servData); 166 } 167 catch (const std::exception& e) 168 { 169 trace::err("Exception caught during root cause analysis"); 170 trace::err(e.what()); 171 172 // We'll still want to create a PEL for the FFDC, but 173 // since the analysis failed, we need to callout Level 2 174 // Support. 175 servData.calloutProcedure(callout::Procedure::NEXTLVL, 176 callout::Priority::HIGH); 177 } 178 } 179 else 180 { 181 // Analysis failed so callout the Level 2 Support. 182 servData.calloutProcedure(callout::Procedure::NEXTLVL, 183 callout::Priority::HIGH); 184 } 185 } 186 187 // Create and commit a PEL. 188 o_plid = commitPel(servData); 189 190 if (0 == o_plid) 191 { 192 trace::err("Failed to create PEL"); 193 } 194 else 195 { 196 trace::inf("PEL created: PLID=0x%0" PRIx32, o_plid); 197 198 // Gather/return information needed for dump. A hardware dump will 199 // always be used for system checkstop attenions. Software dumps 200 // will be reserved for MP-IPLs during TI analysis. 201 // TODO: Need ID from root cause. At the moment, HUID does not exist 202 // in devtree. Will need a better ID definition. 203 o_dump.unitId = 0; 204 o_dump.dumpType = attn::DumpType::Hardware; 205 } 206 } 207 else 208 { 209 // It is possible for TI handling, or manually initiated analysis via 210 // the command line, that there will not be an active attention. In 211 // which case, we will do nothing and let the caller of this function 212 // determine if this is the expected behavior. 213 trace::inf("No active attentions found"); 214 } 215 216 // All done, clean up the isolator. 217 trace::inf("Uninitializing isolator..."); 218 libhei::uninitialize(); 219 220 trace::inf("<<< exit analyzeHardware()"); 221 222 return o_plid; 223 } 224 225 //------------------------------------------------------------------------------ 226 227 } // namespace analyzer 228