1 #include <assert.h> 2 3 #include <analyzer/analyzer_main.hpp> 4 #include <analyzer/ras-data/ras-data-parser.hpp> 5 #include <hei_main.hpp> 6 #include <hei_util.hpp> 7 #include <util/pdbg.hpp> 8 9 #include <algorithm> 10 #include <limits> 11 #include <string> 12 13 namespace analyzer 14 { 15 //------------------------------------------------------------------------------ 16 17 bool __findRcsOscError(const std::vector<libhei::Signature>& i_list, 18 libhei::Signature& o_rootCause) 19 { 20 // TODO: Consider returning all of them instead of one as root cause. 21 auto itr = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) { 22 return (libhei::hash<libhei::NodeId_t>("TP_LOCAL_FIR") == t.getId() && 23 (42 == t.getBit() || 43 == t.getBit())); 24 }); 25 26 if (i_list.end() != itr) 27 { 28 o_rootCause = *itr; 29 return true; 30 } 31 32 return false; 33 } 34 35 //------------------------------------------------------------------------------ 36 37 bool __findPllUnlock(const std::vector<libhei::Signature>& i_list, 38 libhei::Signature& o_rootCause) 39 { 40 using namespace util::pdbg; 41 42 // TODO: Consider returning all of them instead of one as root cause. 43 44 auto nodeId = libhei::hash<libhei::NodeId_t>("PLL_UNLOCK"); 45 46 // First, look for any PLL unlock attentions reported by a processsor chip. 47 auto itr1 = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) { 48 return (nodeId == t.getId() && 49 TYPE_PROC == getTrgtType(getTrgt(t.getChip()))); 50 }); 51 52 if (i_list.end() != itr1) 53 { 54 o_rootCause = *itr1; 55 return true; 56 } 57 58 // Then, look for any PLL unlock attentions reported by an OCMB chip. This 59 // is specifically for Odyssey, which are the only OCMBs that would report 60 // PLL unlock attentions. 61 auto itr2 = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) { 62 return (nodeId == t.getId() && 63 TYPE_OCMB == getTrgtType(getTrgt(t.getChip()))); 64 }); 65 66 if (i_list.end() != itr2) 67 { 68 o_rootCause = *itr2; 69 return true; 70 } 71 72 return false; 73 } 74 75 //------------------------------------------------------------------------------ 76 77 bool __findIueTh(const std::vector<libhei::Signature>& i_list, 78 libhei::Signature& o_rootCause) 79 { 80 // TODO: These bit values propbably changed in Odyssey. Will need to 81 // consider flags instead of arbitrary values. 82 auto itr = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) { 83 return (libhei::hash<libhei::NodeId_t>("RDFFIR") == t.getId() && 84 (17 == t.getBit() || 37 == t.getBit())); 85 }); 86 87 if (i_list.end() != itr) 88 { 89 o_rootCause = *itr; 90 return true; 91 } 92 93 return false; 94 } 95 96 //------------------------------------------------------------------------------ 97 98 bool __findMemoryChannelFailure(const std::vector<libhei::Signature>& i_list, 99 libhei::Signature& o_rootCause, 100 const RasDataParser& i_rasData) 101 { 102 using namespace util::pdbg; 103 104 using func = libhei::NodeId_t (*)(const std::string& i_str); 105 func __hash = libhei::hash<libhei::NodeId_t>; 106 107 static const auto mc_dstl_fir = __hash("MC_DSTL_FIR"); 108 static const auto mc_ustl_fir = __hash("MC_USTL_FIR"); 109 static const auto mc_omi_dl_err_rpt = __hash("MC_OMI_DL_ERR_RPT"); 110 static const auto srqfir = __hash("SRQFIR"); 111 112 // First, look for any chip checkstops from the connected OCMBs. 113 for (const auto& s : i_list) 114 { 115 if (TYPE_OCMB != getTrgtType(getTrgt(s.getChip()))) 116 { 117 continue; // OCMBs only 118 } 119 120 // TODO: The chip data for Explorer chips currently report chip 121 // checkstops as unit checkstops. Once the chip data has been 122 // updated, the check for unit checkstops here will need to be 123 // removed. 124 if (libhei::ATTN_TYPE_CHIP_CS == s.getAttnType() || 125 libhei::ATTN_TYPE_UNIT_CS == s.getAttnType()) 126 { 127 // Special Case: 128 // If the channel fail was specifically a firmware initiated 129 // channel fail (SRQFIR[25]) check for any IUE bits that are on 130 // that would have caused that (RDFFIR[17,37]). 131 // TODO: These bit values probably changed in Odyssey. Will need to 132 // consider flags instead of arbitrary values. 133 if ((srqfir == s.getId() && 25 == s.getBit()) && 134 __findIueTh(i_list, o_rootCause)) 135 { 136 return true; 137 } 138 139 o_rootCause = s; 140 return true; 141 } 142 } 143 144 // Now, look for any channel failure attentions on the processor side of the 145 // memory bus. 146 for (const auto& s : i_list) 147 { 148 if (TYPE_PROC != getTrgtType(getTrgt(s.getChip()))) 149 { 150 continue; // processors only 151 } 152 153 // Any unit checkstop attentions that originated from the MC_DSTL_FIR or 154 // MC_USTLFIR are considered a channel failure attention. 155 // TODO: The "channel failure" designation is actually configurable via 156 // other registers. We just happen to expect anything that is 157 // configured to channel failure to also be configured to unit 158 // checkstop. Eventually, we will need some mechanism to check the 159 // configuration registers for a more accurate analysis. 160 if (libhei::ATTN_TYPE_UNIT_CS == s.getAttnType() && 161 (mc_dstl_fir == s.getId() || mc_ustl_fir == s.getId()) && 162 !i_rasData.isFlagSet(s, 163 RasDataParser::RasDataFlags::ATTN_FROM_OCMB)) 164 { 165 o_rootCause = s; 166 return true; 167 } 168 // Any signatures from MC_OMI_DL_ERR_RPT feed into the only bits in 169 // MC_OMI_DL_FIR that are hardwired to channel failure. 170 else if (mc_omi_dl_err_rpt == s.getId()) 171 { 172 o_rootCause = s; 173 return true; 174 } 175 } 176 177 return false; // default, nothing found 178 } 179 180 //------------------------------------------------------------------------------ 181 182 // Will query if a signature is a potential system checkstop root cause. 183 // attention. Note that this function excludes memory channel failure attentions 184 // which are checked in __findMemoryChannelFailure(). 185 bool __findCsRootCause(const libhei::Signature& i_signature, 186 const RasDataParser& i_rasData) 187 { 188 // Check if the input signature has the CS_POSSIBLE or SUE_SOURCE flag set. 189 if (i_rasData.isFlagSet(i_signature, 190 RasDataParser::RasDataFlags::CS_POSSIBLE) || 191 i_rasData.isFlagSet(i_signature, 192 RasDataParser::RasDataFlags::SUE_SOURCE)) 193 { 194 return true; 195 } 196 197 return false; // default, nothing found 198 } 199 200 //------------------------------------------------------------------------------ 201 202 bool __findCsRootCause_RE(const std::vector<libhei::Signature>& i_list, 203 libhei::Signature& o_rootCause, 204 const RasDataParser& i_rasData) 205 { 206 for (const auto& s : i_list) 207 { 208 // Only looking for recoverable attentions. 209 if (libhei::ATTN_TYPE_RECOVERABLE != s.getAttnType()) 210 { 211 continue; 212 } 213 214 if (__findCsRootCause(s, i_rasData)) 215 { 216 o_rootCause = s; 217 return true; 218 } 219 } 220 221 return false; // default, nothing found 222 } 223 224 //------------------------------------------------------------------------------ 225 226 bool __findCsRootCause_UCS(const std::vector<libhei::Signature>& i_list, 227 libhei::Signature& o_rootCause, 228 const RasDataParser& i_rasData) 229 { 230 for (const auto& s : i_list) 231 { 232 // Only looking for unit checkstop attentions. 233 if (libhei::ATTN_TYPE_UNIT_CS != s.getAttnType()) 234 { 235 continue; 236 } 237 238 if (__findCsRootCause(s, i_rasData)) 239 { 240 o_rootCause = s; 241 return true; 242 } 243 } 244 245 return false; // default, nothing found 246 } 247 248 //------------------------------------------------------------------------------ 249 250 bool __findOcmbAttnBits(const std::vector<libhei::Signature>& i_list, 251 libhei::Signature& o_rootCause, 252 const RasDataParser& i_rasData) 253 { 254 using namespace util::pdbg; 255 256 // If we have any attentions from an OCMB, assume isolation to the OCMBs 257 // was successful and the ATTN_FROM_OCMB flag does not need to be checked. 258 for (const auto& s : i_list) 259 { 260 if (TYPE_OCMB == getTrgtType(getTrgt(s.getChip()))) 261 { 262 return false; 263 } 264 } 265 266 for (const auto& s : i_list) 267 { 268 if (i_rasData.isFlagSet(s, RasDataParser::RasDataFlags::ATTN_FROM_OCMB)) 269 { 270 o_rootCause = s; 271 return true; 272 } 273 } 274 275 return false; // default, nothing found 276 } 277 278 //------------------------------------------------------------------------------ 279 280 bool __findNonExternalCs(const std::vector<libhei::Signature>& i_list, 281 libhei::Signature& o_rootCause) 282 { 283 using namespace util::pdbg; 284 285 static const auto pb_ext_fir = libhei::hash<libhei::NodeId_t>("PB_EXT_FIR"); 286 287 for (const auto& s : i_list) 288 { 289 const auto targetType = getTrgtType(getTrgt(s.getChip())); 290 const auto id = s.getId(); 291 const auto attnType = s.getAttnType(); 292 293 // Find any processor with chip checkstop attention that did not 294 // originate from the PB_EXT_FIR. 295 if ((TYPE_PROC == targetType) && 296 (libhei::ATTN_TYPE_CHIP_CS == attnType) && (pb_ext_fir != id)) 297 { 298 o_rootCause = s; 299 return true; 300 } 301 } 302 303 return false; // default, nothing found 304 } 305 306 //------------------------------------------------------------------------------ 307 308 bool __findTiRootCause(const std::vector<libhei::Signature>& i_list, 309 libhei::Signature& o_rootCause) 310 { 311 using namespace util::pdbg; 312 313 using func = libhei::NodeId_t (*)(const std::string& i_str); 314 func __hash = libhei::hash<libhei::NodeId_t>; 315 316 // PROC registers 317 static const auto tp_local_fir = __hash("TP_LOCAL_FIR"); 318 static const auto occ_fir = __hash("OCC_FIR"); 319 static const auto pbao_fir = __hash("PBAO_FIR"); 320 static const auto n0_local_fir = __hash("N0_LOCAL_FIR"); 321 static const auto int_cq_fir = __hash("INT_CQ_FIR"); 322 static const auto nx_cq_fir = __hash("NX_CQ_FIR"); 323 static const auto nx_dma_eng_fir = __hash("NX_DMA_ENG_FIR"); 324 static const auto vas_fir = __hash("VAS_FIR"); 325 static const auto n1_local_fir = __hash("N1_LOCAL_FIR"); 326 static const auto mcd_fir = __hash("MCD_FIR"); 327 static const auto pb_station_fir_en_1 = __hash("PB_STATION_FIR_EN_1"); 328 static const auto pb_station_fir_en_2 = __hash("PB_STATION_FIR_EN_2"); 329 static const auto pb_station_fir_en_3 = __hash("PB_STATION_FIR_EN_3"); 330 static const auto pb_station_fir_en_4 = __hash("PB_STATION_FIR_EN_4"); 331 static const auto pb_station_fir_es_1 = __hash("PB_STATION_FIR_ES_1"); 332 static const auto pb_station_fir_es_2 = __hash("PB_STATION_FIR_ES_2"); 333 static const auto pb_station_fir_es_3 = __hash("PB_STATION_FIR_ES_3"); 334 static const auto pb_station_fir_es_4 = __hash("PB_STATION_FIR_ES_4"); 335 static const auto pb_station_fir_eq = __hash("PB_STATION_FIR_EQ"); 336 static const auto psihb_fir = __hash("PSIHB_FIR"); 337 static const auto pbaf_fir = __hash("PBAF_FIR"); 338 static const auto lpc_fir = __hash("LPC_FIR"); 339 static const auto eq_core_fir = __hash("EQ_CORE_FIR"); 340 static const auto eq_l2_fir = __hash("EQ_L2_FIR"); 341 static const auto eq_l3_fir = __hash("EQ_L3_FIR"); 342 static const auto eq_ncu_fir = __hash("EQ_NCU_FIR"); 343 static const auto eq_local_fir = __hash("EQ_LOCAL_FIR"); 344 static const auto eq_qme_fir = __hash("EQ_QME_FIR"); 345 static const auto iohs_local_fir = __hash("IOHS_LOCAL_FIR"); 346 static const auto iohs_dlp_fir_oc = __hash("IOHS_DLP_FIR_OC"); 347 static const auto iohs_dlp_fir_smp = __hash("IOHS_DLP_FIR_SMP"); 348 static const auto mc_local_fir = __hash("MC_LOCAL_FIR"); 349 static const auto mc_fir = __hash("MC_FIR"); 350 static const auto mc_dstl_fir = __hash("MC_DSTL_FIR"); 351 static const auto mc_ustl_fir = __hash("MC_USTL_FIR"); 352 static const auto nmmu_cq_fir = __hash("NMMU_CQ_FIR"); 353 static const auto nmmu_fir = __hash("NMMU_FIR"); 354 static const auto mc_omi_dl = __hash("MC_OMI_DL"); 355 static const auto pau_local_fir = __hash("PAU_LOCAL_FIR"); 356 static const auto pau_ptl_fir = __hash("PAU_PTL_FIR"); 357 static const auto pau_phy_fir = __hash("PAU_PHY_FIR"); 358 static const auto pau_fir_0 = __hash("PAU_FIR_0"); 359 static const auto pau_fir_2 = __hash("PAU_FIR_2"); 360 static const auto pci_local_fir = __hash("PCI_LOCAL_FIR"); 361 static const auto pci_iop_fir = __hash("PCI_IOP_FIR"); 362 static const auto pci_nest_fir = __hash("PCI_NEST_FIR"); 363 364 // OCMB registers 365 static const auto ocmb_lfir = __hash("OCMB_LFIR"); 366 static const auto mmiofir = __hash("MMIOFIR"); 367 static const auto srqfir = __hash("SRQFIR"); 368 static const auto rdffir = __hash("RDFFIR"); 369 static const auto tlxfir = __hash("TLXFIR"); 370 static const auto omi_dl = __hash("OMI_DL"); 371 372 for (const auto& signature : i_list) 373 { 374 const auto targetType = getTrgtType(getTrgt(signature.getChip())); 375 const auto attnType = signature.getAttnType(); 376 const auto id = signature.getId(); 377 const auto bit = signature.getBit(); 378 379 // Only looking for recoverable or unit checkstop attentions. 380 if (libhei::ATTN_TYPE_RECOVERABLE != attnType && 381 libhei::ATTN_TYPE_UNIT_CS != attnType) 382 { 383 continue; 384 } 385 386 // Ignore attentions that should not be blamed as root cause of a TI. 387 // This would include informational only FIRs or correctable errors. 388 if (TYPE_PROC == targetType) 389 { 390 if (tp_local_fir == id && 391 (0 == bit || 1 == bit || 2 == bit || 3 == bit || 4 == bit || 392 5 == bit || 7 == bit || 8 == bit || 9 == bit || 10 == bit || 393 11 == bit || 20 == bit || 22 == bit || 23 == bit || 394 24 == bit || 38 == bit || 40 == bit || 41 == bit || 395 46 == bit || 47 == bit || 48 == bit || 55 == bit || 396 56 == bit || 57 == bit || 58 == bit || 59 == bit)) 397 { 398 continue; 399 } 400 401 if (occ_fir == id && 402 (9 == bit || 10 == bit || 15 == bit || 20 == bit || 21 == bit || 403 22 == bit || 23 == bit || 32 == bit || 33 == bit || 404 34 == bit || 36 == bit || 42 == bit || 43 == bit || 405 46 == bit || 47 == bit || 48 == bit || 51 == bit || 406 52 == bit || 53 == bit || 54 == bit || 57 == bit)) 407 { 408 continue; 409 } 410 411 if (pbao_fir == id && 412 (0 == bit || 1 == bit || 2 == bit || 8 == bit || 11 == bit || 413 13 == bit || 15 == bit || 16 == bit || 17 == bit)) 414 { 415 continue; 416 } 417 418 if ((n0_local_fir == id || n1_local_fir == id || 419 iohs_local_fir == id || mc_local_fir == id || 420 pau_local_fir == id || pci_local_fir == id) && 421 (0 == bit || 1 == bit || 2 == bit || 3 == bit || 4 == bit || 422 5 == bit || 6 == bit || 7 == bit || 8 == bit || 9 == bit || 423 10 == bit || 11 == bit || 20 == bit || 21 == bit)) 424 { 425 continue; 426 } 427 428 if (int_cq_fir == id && 429 (0 == bit || 3 == bit || 5 == bit || 7 == bit || 36 == bit || 430 47 == bit || 48 == bit || 49 == bit || 50 == bit || 431 58 == bit || 59 == bit || 60 == bit)) 432 { 433 continue; 434 } 435 436 if (nx_cq_fir == id && 437 (1 == bit || 4 == bit || 18 == bit || 32 == bit || 33 == bit)) 438 { 439 continue; 440 } 441 442 if (nx_dma_eng_fir == id && 443 (4 == bit || 6 == bit || 9 == bit || 10 == bit || 11 == bit || 444 34 == bit || 35 == bit || 36 == bit || 37 == bit || 39 == bit)) 445 { 446 continue; 447 } 448 449 if (vas_fir == id && 450 (8 == bit || 9 == bit || 11 == bit || 12 == bit || 13 == bit)) 451 { 452 continue; 453 } 454 455 if (mcd_fir == id && (0 == bit)) 456 { 457 continue; 458 } 459 460 if ((pb_station_fir_en_1 == id || pb_station_fir_en_2 == id || 461 pb_station_fir_en_3 == id || pb_station_fir_en_4 == id || 462 pb_station_fir_es_1 == id || pb_station_fir_es_2 == id || 463 pb_station_fir_es_3 == id || pb_station_fir_es_4 == id || 464 pb_station_fir_eq == id) && 465 (9 == bit)) 466 { 467 continue; 468 } 469 470 if (psihb_fir == id && (0 == bit || 23 == bit)) 471 { 472 continue; 473 } 474 475 if (pbaf_fir == id && 476 (0 == bit || 1 == bit || 3 == bit || 4 == bit || 5 == bit || 477 6 == bit || 7 == bit || 8 == bit || 9 == bit || 10 == bit || 478 11 == bit || 19 == bit || 20 == bit || 21 == bit || 479 28 == bit || 29 == bit || 30 == bit || 31 == bit || 480 32 == bit || 33 == bit || 34 == bit || 35 == bit || 36 == bit)) 481 { 482 continue; 483 } 484 485 if (lpc_fir == id && (5 == bit)) 486 { 487 continue; 488 } 489 490 if (eq_core_fir == id && 491 (0 == bit || 2 == bit || 4 == bit || 7 == bit || 9 == bit || 492 11 == bit || 13 == bit || 18 == bit || 21 == bit || 493 24 == bit || 29 == bit || 31 == bit || 37 == bit || 494 43 == bit || 56 == bit || 57 == bit)) 495 { 496 continue; 497 } 498 499 if (eq_l2_fir == id && 500 (0 == bit || 6 == bit || 11 == bit || 19 == bit || 36 == bit)) 501 { 502 continue; 503 } 504 505 if (eq_l3_fir == id && 506 (3 == bit || 4 == bit || 7 == bit || 10 == bit || 13 == bit)) 507 { 508 continue; 509 } 510 511 if (eq_ncu_fir == id && (9 == bit)) 512 { 513 continue; 514 } 515 516 if (eq_local_fir == id && 517 (0 == bit || 1 == bit || 2 == bit || 3 == bit || 5 == bit || 518 6 == bit || 7 == bit || 8 == bit || 9 == bit || 10 == bit || 519 11 == bit || 12 == bit || 13 == bit || 14 == bit || 520 15 == bit || 16 == bit || 20 == bit || 21 == bit || 521 22 == bit || 23 == bit || 24 == bit || 25 == bit || 522 26 == bit || 27 == bit || 28 == bit || 29 == bit || 523 30 == bit || 31 == bit || 32 == bit || 33 == bit || 524 34 == bit || 35 == bit || 36 == bit || 37 == bit || 525 38 == bit || 39 == bit)) 526 { 527 continue; 528 } 529 530 if (eq_qme_fir == id && (7 == bit || 25 == bit)) 531 { 532 continue; 533 } 534 535 if (iohs_dlp_fir_oc == id && 536 (6 == bit || 7 == bit || 8 == bit || 9 == bit || 10 == bit || 537 48 == bit || 49 == bit || 52 == bit || 53 == bit)) 538 { 539 continue; 540 } 541 542 if (iohs_dlp_fir_smp == id && 543 (6 == bit || 7 == bit || 14 == bit || 15 == bit || 16 == bit || 544 17 == bit || 38 == bit || 39 == bit || 44 == bit || 545 45 == bit || 50 == bit || 51 == bit)) 546 { 547 continue; 548 } 549 550 if (mc_fir == id && 551 (5 == bit || 8 == bit || 15 == bit || 16 == bit)) 552 { 553 continue; 554 } 555 556 if (mc_dstl_fir == id && 557 (0 == bit || 1 == bit || 2 == bit || 3 == bit || 4 == bit || 558 5 == bit || 6 == bit || 7 == bit || 14 == bit || 15 == bit)) 559 { 560 continue; 561 } 562 563 if (mc_ustl_fir == id && 564 (6 == bit || 20 == bit || 33 == bit || 34 == bit)) 565 { 566 continue; 567 } 568 569 if (nmmu_cq_fir == id && (8 == bit || 11 == bit || 14 == bit)) 570 { 571 continue; 572 } 573 574 if (nmmu_fir == id && 575 (0 == bit || 3 == bit || 8 == bit || 9 == bit || 10 == bit || 576 11 == bit || 12 == bit || 13 == bit || 14 == bit || 577 15 == bit || 30 == bit || 31 == bit || 41 == bit)) 578 { 579 continue; 580 } 581 582 if (mc_omi_dl == id && (2 == bit || 3 == bit || 6 == bit || 583 7 == bit || 9 == bit || 10 == bit)) 584 { 585 continue; 586 } 587 588 if (pau_ptl_fir == id && (5 == bit || 9 == bit)) 589 { 590 continue; 591 } 592 593 if (pau_phy_fir == id && 594 (2 == bit || 3 == bit || 6 == bit || 7 == bit || 15 == bit)) 595 { 596 continue; 597 } 598 599 if (pau_fir_0 == id && (13 == bit || 30 == bit || 41 == bit)) 600 { 601 continue; 602 } 603 604 if (pau_fir_2 == id && (19 == bit || 46 == bit || 49 == bit)) 605 { 606 continue; 607 } 608 609 if (pci_iop_fir == id && 610 (0 == bit || 2 == bit || 4 == bit || 6 == bit || 7 == bit || 611 8 == bit || 10 == bit)) 612 { 613 continue; 614 } 615 616 if (pci_nest_fir == id && (2 == bit || 5 == bit)) 617 { 618 continue; 619 } 620 } 621 else if (TYPE_OCMB == targetType) 622 { 623 if (ocmb_lfir == id && 624 (0 == bit || 1 == bit || 2 == bit || 8 == bit || 23 == bit || 625 37 == bit || 63 == bit)) 626 { 627 continue; 628 } 629 630 if (mmiofir == id && (2 == bit)) 631 { 632 continue; 633 } 634 635 if (srqfir == id && 636 (2 == bit || 4 == bit || 14 == bit || 15 == bit || 23 == bit || 637 25 == bit || 28 == bit)) 638 { 639 continue; 640 } 641 642 if (rdffir == id && 643 (0 == bit || 1 == bit || 2 == bit || 3 == bit || 4 == bit || 644 5 == bit || 6 == bit || 7 == bit || 8 == bit || 9 == bit || 645 18 == bit || 38 == bit || 40 == bit || 41 == bit || 646 45 == bit || 46 == bit)) 647 { 648 continue; 649 } 650 651 if (tlxfir == id && (0 == bit || 9 == bit || 26 == bit)) 652 { 653 continue; 654 } 655 656 if (omi_dl == id && (2 == bit || 3 == bit || 6 == bit || 7 == bit || 657 9 == bit || 10 == bit)) 658 { 659 continue; 660 } 661 } 662 663 // At this point, the attention has not been explicitly ignored. So 664 // return this signature and exit. 665 o_rootCause = signature; 666 return true; 667 } 668 669 return false; // default, nothing found 670 } 671 672 //------------------------------------------------------------------------------ 673 674 bool filterRootCause(AnalysisType i_type, 675 const libhei::IsolationData& i_isoData, 676 libhei::Signature& o_rootCause, 677 const RasDataParser& i_rasData) 678 { 679 // We'll need to make a copy of the list so that the original list is 680 // maintained for the PEL. 681 std::vector<libhei::Signature> list{i_isoData.getSignatureList()}; 682 683 // START WORKAROUND 684 // TODO: Filtering should be data driven. Until that support is available, 685 // use the following isolation rules. 686 687 // Ensure the list is not empty before continuing. 688 if (list.empty()) 689 { 690 return false; // nothing more to do 691 } 692 693 // First, look for any RCS OSC errors. This must always be first because 694 // they can cause downstream PLL unlock attentions. 695 if (__findRcsOscError(list, o_rootCause)) 696 { 697 return true; 698 } 699 700 // Second, look for any PLL unlock attentions. This must always be second 701 // because PLL unlock attentions can cause any number of downstream 702 // attentions, including a system checkstop. 703 if (__findPllUnlock(list, o_rootCause)) 704 { 705 return true; 706 } 707 708 // Regardless of the analysis type, always look for anything that could be 709 // blamed as the root cause of a system checkstop. 710 711 // Memory channel failure attentions will produce SUEs and likely cause 712 // downstream attentions, including a system checkstop. 713 if (__findMemoryChannelFailure(list, o_rootCause, i_rasData)) 714 { 715 return true; 716 } 717 718 // Look for any recoverable attentions that have been identified as a 719 // potential root cause of a system checkstop attention. These would include 720 // any attention that would generate an SUE. Note that is it possible for 721 // recoverables to generate unit checkstop attentions so we must check them 722 // first. 723 if (__findCsRootCause_RE(list, o_rootCause, i_rasData)) 724 { 725 return true; 726 } 727 728 // Look for any unit checkstop attentions (other than memory channel 729 // failures) that have been identified as a potential root cause of a 730 // system checkstop attention. These would include any attention that would 731 // generate an SUE. 732 if (__findCsRootCause_UCS(list, o_rootCause, i_rasData)) 733 { 734 return true; 735 } 736 737 // If no other viable root cause has been found, check for any signatures 738 // with the ATTN_FROM_OCMB flag in case there was an attention from an 739 // inaccessible OCMB. 740 if (__findOcmbAttnBits(list, o_rootCause, i_rasData)) 741 { 742 return true; 743 } 744 745 // Look for any system checkstop attentions that originated from within the 746 // chip that reported the attention. In other words, no external checkstop 747 // attentions. 748 if (__findNonExternalCs(list, o_rootCause)) 749 { 750 return true; 751 } 752 753 if (AnalysisType::SYSTEM_CHECKSTOP != i_type) 754 { 755 // No system checkstop root cause attentions were found. Next, look for 756 // any recoverable or unit checkstop attentions that could be associated 757 // with a TI. 758 if (__findTiRootCause(list, o_rootCause)) 759 { 760 return true; 761 } 762 763 if (AnalysisType::TERMINATE_IMMEDIATE != i_type) 764 { 765 // No attentions associated with a system checkstop or TI were 766 // found. Simply, return the first entry in the list. 767 o_rootCause = list.front(); 768 return true; 769 } 770 } 771 772 // END WORKAROUND 773 774 return false; // default, no active attentions found. 775 } 776 777 //------------------------------------------------------------------------------ 778 779 } // namespace analyzer 780