1 #include <assert.h> 2 3 #include <analyzer/analyzer_main.hpp> 4 #include <analyzer/ras-data/ras-data-parser.hpp> 5 #include <hei_main.hpp> 6 #include <hei_util.hpp> 7 #include <util/pdbg.hpp> 8 9 #include <algorithm> 10 #include <limits> 11 #include <string> 12 13 namespace analyzer 14 { 15 //------------------------------------------------------------------------------ 16 __findRcsOscError(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause)17 bool __findRcsOscError(const std::vector<libhei::Signature>& i_list, 18 libhei::Signature& o_rootCause) 19 { 20 // TODO: Consider returning all of them instead of one as root cause. 21 auto itr = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) { 22 return (libhei::hash<libhei::NodeId_t>("TP_LOCAL_FIR") == t.getId() && 23 (42 == t.getBit() || 43 == t.getBit())); 24 }); 25 26 if (i_list.end() != itr) 27 { 28 o_rootCause = *itr; 29 return true; 30 } 31 32 return false; 33 } 34 35 //------------------------------------------------------------------------------ 36 __findPllUnlock(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause)37 bool __findPllUnlock(const std::vector<libhei::Signature>& i_list, 38 libhei::Signature& o_rootCause) 39 { 40 using namespace util::pdbg; 41 42 // TODO: Consider returning all of them instead of one as root cause. 43 44 auto nodeId = libhei::hash<libhei::NodeId_t>("PLL_UNLOCK"); 45 46 // First, look for any PLL unlock attentions reported by a processsor chip. 47 auto itr1 = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) { 48 return (nodeId == t.getId() && 49 TYPE_PROC == getTrgtType(getTrgt(t.getChip()))); 50 }); 51 52 if (i_list.end() != itr1) 53 { 54 o_rootCause = *itr1; 55 return true; 56 } 57 58 // Then, look for any PLL unlock attentions reported by an OCMB chip. This 59 // is specifically for Odyssey, which are the only OCMBs that would report 60 // PLL unlock attentions. 61 auto itr2 = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) { 62 return (nodeId == t.getId() && 63 TYPE_OCMB == getTrgtType(getTrgt(t.getChip()))); 64 }); 65 66 if (i_list.end() != itr2) 67 { 68 o_rootCause = *itr2; 69 return true; 70 } 71 72 return false; 73 } 74 75 //------------------------------------------------------------------------------ 76 __findMemoryChannelFailure(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)77 bool __findMemoryChannelFailure(const std::vector<libhei::Signature>& i_list, 78 libhei::Signature& o_rootCause, 79 const RasDataParser& i_rasData) 80 { 81 using namespace util::pdbg; 82 83 using func = libhei::NodeId_t (*)(const std::string& i_str); 84 func __hash = libhei::hash<libhei::NodeId_t>; 85 86 static const auto mc_dstl_fir = __hash("MC_DSTL_FIR"); 87 static const auto mc_ustl_fir = __hash("MC_USTL_FIR"); 88 static const auto mc_omi_dl_err_rpt = __hash("MC_OMI_DL_ERR_RPT"); 89 90 // First, look for any chip checkstops from the connected OCMBs. 91 for (const auto& s : i_list) 92 { 93 if (TYPE_OCMB != getTrgtType(getTrgt(s.getChip()))) 94 { 95 continue; // OCMBs only 96 } 97 98 // TODO: The chip data for Explorer chips currently report chip 99 // checkstops as unit checkstops. Once the chip data has been 100 // updated, the check for unit checkstops here will need to be 101 // removed. 102 if (libhei::ATTN_TYPE_CHIP_CS == s.getAttnType() || 103 libhei::ATTN_TYPE_UNIT_CS == s.getAttnType()) 104 { 105 o_rootCause = s; 106 return true; 107 } 108 } 109 110 // Now, look for any channel failure attentions on the processor side of the 111 // memory bus. 112 for (const auto& s : i_list) 113 { 114 if (TYPE_PROC != getTrgtType(getTrgt(s.getChip()))) 115 { 116 continue; // processors only 117 } 118 119 // Any unit checkstop attentions that originated from the MC_DSTL_FIR or 120 // MC_USTLFIR are considered a channel failure attention. 121 // TODO: The "channel failure" designation is actually configurable via 122 // other registers. We just happen to expect anything that is 123 // configured to channel failure to also be configured to unit 124 // checkstop. Eventually, we will need some mechanism to check the 125 // configuration registers for a more accurate analysis. 126 if (libhei::ATTN_TYPE_UNIT_CS == s.getAttnType() && 127 (mc_dstl_fir == s.getId() || mc_ustl_fir == s.getId()) && 128 !i_rasData.isFlagSet(s, 129 RasDataParser::RasDataFlags::ATTN_FROM_OCMB)) 130 { 131 o_rootCause = s; 132 return true; 133 } 134 // Any signatures from MC_OMI_DL_ERR_RPT feed into the only bits in 135 // MC_OMI_DL_FIR that are hardwired to channel failure. 136 else if (mc_omi_dl_err_rpt == s.getId()) 137 { 138 o_rootCause = s; 139 return true; 140 } 141 } 142 143 return false; // default, nothing found 144 } 145 146 //------------------------------------------------------------------------------ 147 148 // Will query if a signature is a potential system checkstop root cause. 149 // attention. Note that this function excludes memory channel failure attentions 150 // which are checked in __findMemoryChannelFailure(). __findCsRootCause(const libhei::Signature & i_signature,const RasDataParser & i_rasData)151 bool __findCsRootCause(const libhei::Signature& i_signature, 152 const RasDataParser& i_rasData) 153 { 154 // Check if the input signature has the CS_POSSIBLE or SUE_SOURCE flag set. 155 if (i_rasData.isFlagSet(i_signature, 156 RasDataParser::RasDataFlags::CS_POSSIBLE) || 157 i_rasData.isFlagSet(i_signature, 158 RasDataParser::RasDataFlags::SUE_SOURCE)) 159 { 160 return true; 161 } 162 163 return false; // default, nothing found 164 } 165 166 //------------------------------------------------------------------------------ 167 __findCsRootCause_RE(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)168 bool __findCsRootCause_RE(const std::vector<libhei::Signature>& i_list, 169 libhei::Signature& o_rootCause, 170 const RasDataParser& i_rasData) 171 { 172 for (const auto& s : i_list) 173 { 174 // Only looking for recoverable attentions. 175 if (libhei::ATTN_TYPE_RECOVERABLE != s.getAttnType()) 176 { 177 continue; 178 } 179 180 if (__findCsRootCause(s, i_rasData)) 181 { 182 o_rootCause = s; 183 return true; 184 } 185 } 186 187 return false; // default, nothing found 188 } 189 190 //------------------------------------------------------------------------------ 191 __findCsRootCause_UCS(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)192 bool __findCsRootCause_UCS(const std::vector<libhei::Signature>& i_list, 193 libhei::Signature& o_rootCause, 194 const RasDataParser& i_rasData) 195 { 196 for (const auto& s : i_list) 197 { 198 // Only looking for unit checkstop attentions. 199 if (libhei::ATTN_TYPE_UNIT_CS != s.getAttnType()) 200 { 201 continue; 202 } 203 204 if (__findCsRootCause(s, i_rasData)) 205 { 206 o_rootCause = s; 207 return true; 208 } 209 } 210 211 return false; // default, nothing found 212 } 213 214 //------------------------------------------------------------------------------ 215 __findOcmbAttnBits(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)216 bool __findOcmbAttnBits(const std::vector<libhei::Signature>& i_list, 217 libhei::Signature& o_rootCause, 218 const RasDataParser& i_rasData) 219 { 220 using namespace util::pdbg; 221 222 // If we have any attentions from an OCMB, assume isolation to the OCMBs 223 // was successful and the ATTN_FROM_OCMB flag does not need to be checked. 224 for (const auto& s : i_list) 225 { 226 if (TYPE_OCMB == getTrgtType(getTrgt(s.getChip()))) 227 { 228 return false; 229 } 230 } 231 232 for (const auto& s : i_list) 233 { 234 if (i_rasData.isFlagSet(s, RasDataParser::RasDataFlags::ATTN_FROM_OCMB)) 235 { 236 o_rootCause = s; 237 return true; 238 } 239 } 240 241 return false; // default, nothing found 242 } 243 244 //------------------------------------------------------------------------------ 245 __findNonExternalCs(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause)246 bool __findNonExternalCs(const std::vector<libhei::Signature>& i_list, 247 libhei::Signature& o_rootCause) 248 { 249 using namespace util::pdbg; 250 251 static const auto pb_ext_fir = libhei::hash<libhei::NodeId_t>("PB_EXT_FIR"); 252 253 for (const auto& s : i_list) 254 { 255 const auto targetType = getTrgtType(getTrgt(s.getChip())); 256 const auto id = s.getId(); 257 const auto attnType = s.getAttnType(); 258 259 // Find any processor with chip checkstop attention that did not 260 // originate from the PB_EXT_FIR. 261 if ((TYPE_PROC == targetType) && 262 (libhei::ATTN_TYPE_CHIP_CS == attnType) && (pb_ext_fir != id)) 263 { 264 o_rootCause = s; 265 return true; 266 } 267 } 268 269 return false; // default, nothing found 270 } 271 272 //------------------------------------------------------------------------------ 273 __findTiRootCause(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)274 bool __findTiRootCause(const std::vector<libhei::Signature>& i_list, 275 libhei::Signature& o_rootCause, 276 const RasDataParser& i_rasData) 277 { 278 using namespace util::pdbg; 279 using rdf = RasDataParser::RasDataFlags; 280 281 for (const auto& signature : i_list) 282 { 283 const auto attnType = signature.getAttnType(); 284 285 // Only looking for recoverable or unit checkstop attentions. 286 if (libhei::ATTN_TYPE_RECOVERABLE != attnType && 287 libhei::ATTN_TYPE_UNIT_CS != attnType) 288 { 289 continue; 290 } 291 292 // Skip any signature with the 'recovered_error' or 'informational_only' 293 // flags. 294 if (i_rasData.isFlagSet(signature, rdf::RECOVERED_ERROR) || 295 i_rasData.isFlagSet(signature, rdf::INFORMATIONAL_ONLY) || 296 i_rasData.isFlagSet(signature, rdf::MNFG_INFORMATIONAL_ONLY)) 297 { 298 continue; 299 } 300 301 // At this point, the attention has not been explicitly ignored. So 302 // return this signature and exit. 303 o_rootCause = signature; 304 return true; 305 } 306 307 return false; // default, nothing found 308 } 309 310 //------------------------------------------------------------------------------ 311 findRootCause(AnalysisType i_type,const libhei::IsolationData & i_isoData,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)312 bool findRootCause(AnalysisType i_type, const libhei::IsolationData& i_isoData, 313 libhei::Signature& o_rootCause, 314 const RasDataParser& i_rasData) 315 { 316 // We'll need to make a copy of the list so that the original list is 317 // maintained for the PEL. 318 std::vector<libhei::Signature> list{i_isoData.getSignatureList()}; 319 320 // START WORKAROUND 321 // TODO: Filtering should be data driven. Until that support is available, 322 // use the following isolation rules. 323 324 // Ensure the list is not empty before continuing. 325 if (list.empty()) 326 { 327 return false; // nothing more to do 328 } 329 330 // First, look for any RCS OSC errors. This must always be first because 331 // they can cause downstream PLL unlock attentions. 332 if (__findRcsOscError(list, o_rootCause)) 333 { 334 return true; 335 } 336 337 // Second, look for any PLL unlock attentions. This must always be second 338 // because PLL unlock attentions can cause any number of downstream 339 // attentions, including a system checkstop. 340 if (__findPllUnlock(list, o_rootCause)) 341 { 342 return true; 343 } 344 345 // Regardless of the analysis type, always look for anything that could be 346 // blamed as the root cause of a system checkstop. 347 348 // Memory channel failure attentions will produce SUEs and likely cause 349 // downstream attentions, including a system checkstop. 350 if (__findMemoryChannelFailure(list, o_rootCause, i_rasData)) 351 { 352 return true; 353 } 354 355 // Look for any recoverable attentions that have been identified as a 356 // potential root cause of a system checkstop attention. These would include 357 // any attention that would generate an SUE. Note that is it possible for 358 // recoverables to generate unit checkstop attentions so we must check them 359 // first. 360 if (__findCsRootCause_RE(list, o_rootCause, i_rasData)) 361 { 362 return true; 363 } 364 365 // Look for any unit checkstop attentions (other than memory channel 366 // failures) that have been identified as a potential root cause of a 367 // system checkstop attention. These would include any attention that would 368 // generate an SUE. 369 if (__findCsRootCause_UCS(list, o_rootCause, i_rasData)) 370 { 371 return true; 372 } 373 374 // If no other viable root cause has been found, check for any signatures 375 // with the ATTN_FROM_OCMB flag in case there was an attention from an 376 // inaccessible OCMB. 377 if (__findOcmbAttnBits(list, o_rootCause, i_rasData)) 378 { 379 return true; 380 } 381 382 // Look for any system checkstop attentions that originated from within the 383 // chip that reported the attention. In other words, no external checkstop 384 // attentions. 385 if (__findNonExternalCs(list, o_rootCause)) 386 { 387 return true; 388 } 389 390 if (AnalysisType::SYSTEM_CHECKSTOP != i_type) 391 { 392 // No system checkstop root cause attentions were found. Next, look for 393 // any recoverable or unit checkstop attentions that could be associated 394 // with a TI. 395 if (__findTiRootCause(list, o_rootCause, i_rasData)) 396 { 397 return true; 398 } 399 400 if (AnalysisType::TERMINATE_IMMEDIATE != i_type) 401 { 402 // No attentions associated with a system checkstop or TI were 403 // found. Simply, return the first entry in the list. 404 o_rootCause = list.front(); 405 return true; 406 } 407 } 408 409 // END WORKAROUND 410 411 return false; // default, no active attentions found. 412 } 413 414 //------------------------------------------------------------------------------ 415 __findIueTh(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause)416 bool __findIueTh(const std::vector<libhei::Signature>& i_list, 417 libhei::Signature& o_rootCause) 418 { 419 auto itr = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) { 420 return (libhei::hash<libhei::NodeId_t>("RDFFIR") == t.getId() && 421 (17 == t.getBit() || 37 == t.getBit())) || 422 (libhei::hash<libhei::NodeId_t>("RDF_FIR") == t.getId() && 423 (18 == t.getBit() || 38 == t.getBit())); 424 }); 425 426 if (i_list.end() != itr) 427 { 428 o_rootCause = *itr; 429 return true; 430 } 431 432 return false; 433 } 434 435 //------------------------------------------------------------------------------ 436 rootCauseSpecialCases(const libhei::IsolationData & i_isoData,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)437 void rootCauseSpecialCases(const libhei::IsolationData& i_isoData, 438 libhei::Signature& o_rootCause, 439 const RasDataParser& i_rasData) 440 { 441 using func = libhei::NodeId_t (*)(const std::string& i_str); 442 func __hash = libhei::hash<libhei::NodeId_t>; 443 444 // Check for any special cases that exist for specific FIR bits. 445 446 // If the channel fail was specifically a firmware initiated channel fail 447 // (SRQFIR[25] for Explorer OCMBs, SRQ_FIR[46] for Odyssey OCMBs) check for 448 // any IUE bits that are on that would have caused the channel fail 449 // (RDFFIR[17,37] for Explorer OCMBs, RDF_FIR_0[18,38] or RDF_FIR_1[18,38] 450 // for Odyssey OCMBs). 451 452 // Explorer SRQFIR 453 static const auto srqfir = __hash("SRQFIR"); 454 // Odyssey SRQ_FIR 455 static const auto srq_fir = __hash("SRQ_FIR"); 456 457 std::vector<libhei::Signature> list{i_isoData.getSignatureList()}; 458 459 if (((srqfir == o_rootCause.getId() && 25 == o_rootCause.getBit()) || 460 (srq_fir == o_rootCause.getId() && 46 == o_rootCause.getBit())) && 461 __findIueTh(list, o_rootCause)) 462 { 463 // If __findIueTh returned true, o_rootCause was updated, return. 464 return; 465 } 466 467 // Check if the root cause found was a potential side effect of an 468 // ODP data corruption error. If it was, check if any other signature 469 // in the signature list was a potential root cause. 470 auto OdpSide = RasDataParser::RasDataFlags::ODP_DATA_CORRUPT_SIDE_EFFECT; 471 auto OdpRoot = RasDataParser::RasDataFlags::ODP_DATA_CORRUPT_ROOT_CAUSE; 472 if (i_rasData.isFlagSet(o_rootCause, OdpSide)) 473 { 474 for (const auto& s : list) 475 { 476 if (i_rasData.isFlagSet(s, OdpRoot)) 477 { 478 // ODP data corruption root cause found, return. 479 o_rootCause = s; 480 return; 481 } 482 } 483 } 484 485 // Odyssey RDF_FIR 486 static const auto rdf_fir = __hash("RDF_FIR"); 487 488 // RDF_FIR[41] can be the root cause of RDF_FIR[16], so if bit 16 is on, 489 // check if bit 41 is also on. 490 if (rdf_fir == o_rootCause.getId() && 16 == o_rootCause.getBit()) 491 { 492 // Look for RDF_FIR[41] 493 auto itr = std::find_if(list.begin(), list.end(), [&](const auto& t) { 494 return (rdf_fir == t.getId() && 41 == t.getBit()); 495 }); 496 if (list.end() != itr) 497 { 498 o_rootCause = *itr; 499 } 500 } 501 } 502 503 //------------------------------------------------------------------------------ 504 filterRootCause(AnalysisType i_type,const libhei::IsolationData & i_isoData,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)505 bool filterRootCause(AnalysisType i_type, 506 const libhei::IsolationData& i_isoData, 507 libhei::Signature& o_rootCause, 508 const RasDataParser& i_rasData) 509 { 510 // Find the initial root cause attention based on common rules for FIR 511 // isolation. 512 bool rc = findRootCause(i_type, i_isoData, o_rootCause, i_rasData); 513 514 // If some root cause was found, handle any special cases for specific FIR 515 // bits that require additional logic to determine the root cause. 516 if (true == rc) 517 { 518 rootCauseSpecialCases(i_isoData, o_rootCause, i_rasData); 519 } 520 521 return rc; 522 } 523 524 //------------------------------------------------------------------------------ 525 526 } // namespace analyzer 527