1 #include <assert.h> 2 3 #include <analyzer/analyzer_main.hpp> 4 #include <analyzer/ras-data/ras-data-parser.hpp> 5 #include <hei_main.hpp> 6 #include <hei_util.hpp> 7 #include <util/pdbg.hpp> 8 9 #include <algorithm> 10 #include <limits> 11 #include <string> 12 13 namespace analyzer 14 { 15 //------------------------------------------------------------------------------ 16 17 bool __findRcsOscError(const std::vector<libhei::Signature>& i_list, 18 libhei::Signature& o_rootCause) 19 { 20 // TODO: Consider returning all of them instead of one as root cause. 21 auto itr = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) { 22 return (libhei::hash<libhei::NodeId_t>("TP_LOCAL_FIR") == t.getId() && 23 (42 == t.getBit() || 43 == t.getBit())); 24 }); 25 26 if (i_list.end() != itr) 27 { 28 o_rootCause = *itr; 29 return true; 30 } 31 32 return false; 33 } 34 35 //------------------------------------------------------------------------------ 36 37 bool __findPllUnlock(const std::vector<libhei::Signature>& i_list, 38 libhei::Signature& o_rootCause) 39 { 40 using namespace util::pdbg; 41 42 // TODO: Consider returning all of them instead of one as root cause. 43 44 auto nodeId = libhei::hash<libhei::NodeId_t>("PLL_UNLOCK"); 45 46 // First, look for any PLL unlock attentions reported by a processsor chip. 47 auto itr1 = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) { 48 return (nodeId == t.getId() && 49 TYPE_PROC == getTrgtType(getTrgt(t.getChip()))); 50 }); 51 52 if (i_list.end() != itr1) 53 { 54 o_rootCause = *itr1; 55 return true; 56 } 57 58 // Then, look for any PLL unlock attentions reported by an OCMB chip. This 59 // is specifically for Odyssey, which are the only OCMBs that would report 60 // PLL unlock attentions. 61 auto itr2 = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) { 62 return (nodeId == t.getId() && 63 TYPE_OCMB == getTrgtType(getTrgt(t.getChip()))); 64 }); 65 66 if (i_list.end() != itr2) 67 { 68 o_rootCause = *itr2; 69 return true; 70 } 71 72 return false; 73 } 74 75 //------------------------------------------------------------------------------ 76 77 bool __findMemoryChannelFailure(const std::vector<libhei::Signature>& i_list, 78 libhei::Signature& o_rootCause, 79 const RasDataParser& i_rasData) 80 { 81 using namespace util::pdbg; 82 83 using func = libhei::NodeId_t (*)(const std::string& i_str); 84 func __hash = libhei::hash<libhei::NodeId_t>; 85 86 static const auto mc_dstl_fir = __hash("MC_DSTL_FIR"); 87 static const auto mc_ustl_fir = __hash("MC_USTL_FIR"); 88 static const auto mc_omi_dl_err_rpt = __hash("MC_OMI_DL_ERR_RPT"); 89 90 // First, look for any chip checkstops from the connected OCMBs. 91 for (const auto& s : i_list) 92 { 93 if (TYPE_OCMB != getTrgtType(getTrgt(s.getChip()))) 94 { 95 continue; // OCMBs only 96 } 97 98 // TODO: The chip data for Explorer chips currently report chip 99 // checkstops as unit checkstops. Once the chip data has been 100 // updated, the check for unit checkstops here will need to be 101 // removed. 102 if (libhei::ATTN_TYPE_CHIP_CS == s.getAttnType() || 103 libhei::ATTN_TYPE_UNIT_CS == s.getAttnType()) 104 { 105 o_rootCause = s; 106 return true; 107 } 108 } 109 110 // Now, look for any channel failure attentions on the processor side of the 111 // memory bus. 112 for (const auto& s : i_list) 113 { 114 if (TYPE_PROC != getTrgtType(getTrgt(s.getChip()))) 115 { 116 continue; // processors only 117 } 118 119 // Any unit checkstop attentions that originated from the MC_DSTL_FIR or 120 // MC_USTLFIR are considered a channel failure attention. 121 // TODO: The "channel failure" designation is actually configurable via 122 // other registers. We just happen to expect anything that is 123 // configured to channel failure to also be configured to unit 124 // checkstop. Eventually, we will need some mechanism to check the 125 // configuration registers for a more accurate analysis. 126 if (libhei::ATTN_TYPE_UNIT_CS == s.getAttnType() && 127 (mc_dstl_fir == s.getId() || mc_ustl_fir == s.getId()) && 128 !i_rasData.isFlagSet(s, 129 RasDataParser::RasDataFlags::ATTN_FROM_OCMB)) 130 { 131 o_rootCause = s; 132 return true; 133 } 134 // Any signatures from MC_OMI_DL_ERR_RPT feed into the only bits in 135 // MC_OMI_DL_FIR that are hardwired to channel failure. 136 else if (mc_omi_dl_err_rpt == s.getId()) 137 { 138 o_rootCause = s; 139 return true; 140 } 141 } 142 143 return false; // default, nothing found 144 } 145 146 //------------------------------------------------------------------------------ 147 148 // Will query if a signature is a potential system checkstop root cause. 149 // attention. Note that this function excludes memory channel failure attentions 150 // which are checked in __findMemoryChannelFailure(). 151 bool __findCsRootCause(const libhei::Signature& i_signature, 152 const RasDataParser& i_rasData) 153 { 154 // Check if the input signature has the CS_POSSIBLE or SUE_SOURCE flag set. 155 if (i_rasData.isFlagSet(i_signature, 156 RasDataParser::RasDataFlags::CS_POSSIBLE) || 157 i_rasData.isFlagSet(i_signature, 158 RasDataParser::RasDataFlags::SUE_SOURCE)) 159 { 160 return true; 161 } 162 163 return false; // default, nothing found 164 } 165 166 //------------------------------------------------------------------------------ 167 168 bool __findCsRootCause_RE(const std::vector<libhei::Signature>& i_list, 169 libhei::Signature& o_rootCause, 170 const RasDataParser& i_rasData) 171 { 172 for (const auto& s : i_list) 173 { 174 // Only looking for recoverable attentions. 175 if (libhei::ATTN_TYPE_RECOVERABLE != s.getAttnType()) 176 { 177 continue; 178 } 179 180 if (__findCsRootCause(s, i_rasData)) 181 { 182 o_rootCause = s; 183 return true; 184 } 185 } 186 187 return false; // default, nothing found 188 } 189 190 //------------------------------------------------------------------------------ 191 192 bool __findCsRootCause_UCS(const std::vector<libhei::Signature>& i_list, 193 libhei::Signature& o_rootCause, 194 const RasDataParser& i_rasData) 195 { 196 for (const auto& s : i_list) 197 { 198 // Only looking for unit checkstop attentions. 199 if (libhei::ATTN_TYPE_UNIT_CS != s.getAttnType()) 200 { 201 continue; 202 } 203 204 if (__findCsRootCause(s, i_rasData)) 205 { 206 o_rootCause = s; 207 return true; 208 } 209 } 210 211 return false; // default, nothing found 212 } 213 214 //------------------------------------------------------------------------------ 215 216 bool __findOcmbAttnBits(const std::vector<libhei::Signature>& i_list, 217 libhei::Signature& o_rootCause, 218 const RasDataParser& i_rasData) 219 { 220 using namespace util::pdbg; 221 222 // If we have any attentions from an OCMB, assume isolation to the OCMBs 223 // was successful and the ATTN_FROM_OCMB flag does not need to be checked. 224 for (const auto& s : i_list) 225 { 226 if (TYPE_OCMB == getTrgtType(getTrgt(s.getChip()))) 227 { 228 return false; 229 } 230 } 231 232 for (const auto& s : i_list) 233 { 234 if (i_rasData.isFlagSet(s, RasDataParser::RasDataFlags::ATTN_FROM_OCMB)) 235 { 236 o_rootCause = s; 237 return true; 238 } 239 } 240 241 return false; // default, nothing found 242 } 243 244 //------------------------------------------------------------------------------ 245 246 bool __findNonExternalCs(const std::vector<libhei::Signature>& i_list, 247 libhei::Signature& o_rootCause) 248 { 249 using namespace util::pdbg; 250 251 static const auto pb_ext_fir = libhei::hash<libhei::NodeId_t>("PB_EXT_FIR"); 252 253 for (const auto& s : i_list) 254 { 255 const auto targetType = getTrgtType(getTrgt(s.getChip())); 256 const auto id = s.getId(); 257 const auto attnType = s.getAttnType(); 258 259 // Find any processor with chip checkstop attention that did not 260 // originate from the PB_EXT_FIR. 261 if ((TYPE_PROC == targetType) && 262 (libhei::ATTN_TYPE_CHIP_CS == attnType) && (pb_ext_fir != id)) 263 { 264 o_rootCause = s; 265 return true; 266 } 267 } 268 269 return false; // default, nothing found 270 } 271 272 //------------------------------------------------------------------------------ 273 274 bool __findTiRootCause(const std::vector<libhei::Signature>& i_list, 275 libhei::Signature& o_rootCause) 276 { 277 using namespace util::pdbg; 278 279 using func = libhei::NodeId_t (*)(const std::string& i_str); 280 func __hash = libhei::hash<libhei::NodeId_t>; 281 282 // PROC registers 283 static const auto tp_local_fir = __hash("TP_LOCAL_FIR"); 284 static const auto occ_fir = __hash("OCC_FIR"); 285 static const auto pbao_fir = __hash("PBAO_FIR"); 286 static const auto n0_local_fir = __hash("N0_LOCAL_FIR"); 287 static const auto int_cq_fir = __hash("INT_CQ_FIR"); 288 static const auto nx_cq_fir = __hash("NX_CQ_FIR"); 289 static const auto nx_dma_eng_fir = __hash("NX_DMA_ENG_FIR"); 290 static const auto vas_fir = __hash("VAS_FIR"); 291 static const auto n1_local_fir = __hash("N1_LOCAL_FIR"); 292 static const auto mcd_fir = __hash("MCD_FIR"); 293 static const auto pb_station_fir_en_1 = __hash("PB_STATION_FIR_EN_1"); 294 static const auto pb_station_fir_en_2 = __hash("PB_STATION_FIR_EN_2"); 295 static const auto pb_station_fir_en_3 = __hash("PB_STATION_FIR_EN_3"); 296 static const auto pb_station_fir_en_4 = __hash("PB_STATION_FIR_EN_4"); 297 static const auto pb_station_fir_es_1 = __hash("PB_STATION_FIR_ES_1"); 298 static const auto pb_station_fir_es_2 = __hash("PB_STATION_FIR_ES_2"); 299 static const auto pb_station_fir_es_3 = __hash("PB_STATION_FIR_ES_3"); 300 static const auto pb_station_fir_es_4 = __hash("PB_STATION_FIR_ES_4"); 301 static const auto pb_station_fir_eq = __hash("PB_STATION_FIR_EQ"); 302 static const auto psihb_fir = __hash("PSIHB_FIR"); 303 static const auto pbaf_fir = __hash("PBAF_FIR"); 304 static const auto lpc_fir = __hash("LPC_FIR"); 305 static const auto eq_core_fir = __hash("EQ_CORE_FIR"); 306 static const auto eq_l2_fir = __hash("EQ_L2_FIR"); 307 static const auto eq_l3_fir = __hash("EQ_L3_FIR"); 308 static const auto eq_ncu_fir = __hash("EQ_NCU_FIR"); 309 static const auto eq_local_fir = __hash("EQ_LOCAL_FIR"); 310 static const auto eq_qme_fir = __hash("EQ_QME_FIR"); 311 static const auto iohs_local_fir = __hash("IOHS_LOCAL_FIR"); 312 static const auto iohs_dlp_fir_oc = __hash("IOHS_DLP_FIR_OC"); 313 static const auto iohs_dlp_fir_smp = __hash("IOHS_DLP_FIR_SMP"); 314 static const auto mc_local_fir = __hash("MC_LOCAL_FIR"); 315 static const auto mc_fir = __hash("MC_FIR"); 316 static const auto mc_dstl_fir = __hash("MC_DSTL_FIR"); 317 static const auto mc_ustl_fir = __hash("MC_USTL_FIR"); 318 static const auto nmmu_cq_fir = __hash("NMMU_CQ_FIR"); 319 static const auto nmmu_fir = __hash("NMMU_FIR"); 320 static const auto mc_omi_dl = __hash("MC_OMI_DL"); 321 static const auto pau_local_fir = __hash("PAU_LOCAL_FIR"); 322 static const auto pau_ptl_fir = __hash("PAU_PTL_FIR"); 323 static const auto pau_phy_fir = __hash("PAU_PHY_FIR"); 324 static const auto pau_fir_0 = __hash("PAU_FIR_0"); 325 static const auto pau_fir_2 = __hash("PAU_FIR_2"); 326 static const auto pci_local_fir = __hash("PCI_LOCAL_FIR"); 327 static const auto pci_iop_fir = __hash("PCI_IOP_FIR"); 328 static const auto pci_nest_fir = __hash("PCI_NEST_FIR"); 329 330 // OCMB registers 331 static const auto ocmb_lfir = __hash("OCMB_LFIR"); 332 static const auto mmiofir = __hash("MMIOFIR"); 333 static const auto srqfir = __hash("SRQFIR"); 334 static const auto rdffir = __hash("RDFFIR"); 335 static const auto tlxfir = __hash("TLXFIR"); 336 static const auto omi_dl = __hash("OMI_DL"); 337 338 for (const auto& signature : i_list) 339 { 340 const auto targetType = getTrgtType(getTrgt(signature.getChip())); 341 const auto attnType = signature.getAttnType(); 342 const auto id = signature.getId(); 343 const auto bit = signature.getBit(); 344 345 // Only looking for recoverable or unit checkstop attentions. 346 if (libhei::ATTN_TYPE_RECOVERABLE != attnType && 347 libhei::ATTN_TYPE_UNIT_CS != attnType) 348 { 349 continue; 350 } 351 352 // Ignore attentions that should not be blamed as root cause of a TI. 353 // This would include informational only FIRs or correctable errors. 354 if (TYPE_PROC == targetType) 355 { 356 if (tp_local_fir == id && 357 (0 == bit || 1 == bit || 2 == bit || 3 == bit || 4 == bit || 358 5 == bit || 7 == bit || 8 == bit || 9 == bit || 10 == bit || 359 11 == bit || 20 == bit || 22 == bit || 23 == bit || 360 24 == bit || 38 == bit || 40 == bit || 41 == bit || 361 46 == bit || 47 == bit || 48 == bit || 55 == bit || 362 56 == bit || 57 == bit || 58 == bit || 59 == bit)) 363 { 364 continue; 365 } 366 367 if (occ_fir == id && 368 (9 == bit || 10 == bit || 15 == bit || 20 == bit || 21 == bit || 369 22 == bit || 23 == bit || 32 == bit || 33 == bit || 370 34 == bit || 36 == bit || 42 == bit || 43 == bit || 371 46 == bit || 47 == bit || 48 == bit || 51 == bit || 372 52 == bit || 53 == bit || 54 == bit || 57 == bit)) 373 { 374 continue; 375 } 376 377 if (pbao_fir == id && 378 (0 == bit || 1 == bit || 2 == bit || 8 == bit || 11 == bit || 379 13 == bit || 15 == bit || 16 == bit || 17 == bit)) 380 { 381 continue; 382 } 383 384 if ((n0_local_fir == id || n1_local_fir == id || 385 iohs_local_fir == id || mc_local_fir == id || 386 pau_local_fir == id || pci_local_fir == id) && 387 (0 == bit || 1 == bit || 2 == bit || 3 == bit || 4 == bit || 388 5 == bit || 6 == bit || 7 == bit || 8 == bit || 9 == bit || 389 10 == bit || 11 == bit || 20 == bit || 21 == bit)) 390 { 391 continue; 392 } 393 394 if (int_cq_fir == id && 395 (0 == bit || 3 == bit || 5 == bit || 7 == bit || 36 == bit || 396 47 == bit || 48 == bit || 49 == bit || 50 == bit || 397 58 == bit || 59 == bit || 60 == bit)) 398 { 399 continue; 400 } 401 402 if (nx_cq_fir == id && 403 (1 == bit || 4 == bit || 18 == bit || 32 == bit || 33 == bit)) 404 { 405 continue; 406 } 407 408 if (nx_dma_eng_fir == id && 409 (4 == bit || 6 == bit || 9 == bit || 10 == bit || 11 == bit || 410 34 == bit || 35 == bit || 36 == bit || 37 == bit || 39 == bit)) 411 { 412 continue; 413 } 414 415 if (vas_fir == id && 416 (8 == bit || 9 == bit || 11 == bit || 12 == bit || 13 == bit)) 417 { 418 continue; 419 } 420 421 if (mcd_fir == id && (0 == bit)) 422 { 423 continue; 424 } 425 426 if ((pb_station_fir_en_1 == id || pb_station_fir_en_2 == id || 427 pb_station_fir_en_3 == id || pb_station_fir_en_4 == id || 428 pb_station_fir_es_1 == id || pb_station_fir_es_2 == id || 429 pb_station_fir_es_3 == id || pb_station_fir_es_4 == id || 430 pb_station_fir_eq == id) && 431 (9 == bit)) 432 { 433 continue; 434 } 435 436 if (psihb_fir == id && (0 == bit || 23 == bit)) 437 { 438 continue; 439 } 440 441 if (pbaf_fir == id && 442 (0 == bit || 1 == bit || 3 == bit || 4 == bit || 5 == bit || 443 6 == bit || 7 == bit || 8 == bit || 9 == bit || 10 == bit || 444 11 == bit || 19 == bit || 20 == bit || 21 == bit || 445 28 == bit || 29 == bit || 30 == bit || 31 == bit || 446 32 == bit || 33 == bit || 34 == bit || 35 == bit || 36 == bit)) 447 { 448 continue; 449 } 450 451 if (lpc_fir == id && (5 == bit)) 452 { 453 continue; 454 } 455 456 if (eq_core_fir == id && 457 (0 == bit || 2 == bit || 4 == bit || 7 == bit || 9 == bit || 458 11 == bit || 13 == bit || 18 == bit || 21 == bit || 459 24 == bit || 29 == bit || 31 == bit || 37 == bit || 460 43 == bit || 56 == bit || 57 == bit)) 461 { 462 continue; 463 } 464 465 if (eq_l2_fir == id && 466 (0 == bit || 6 == bit || 11 == bit || 19 == bit || 36 == bit)) 467 { 468 continue; 469 } 470 471 if (eq_l3_fir == id && 472 (3 == bit || 4 == bit || 7 == bit || 10 == bit || 13 == bit)) 473 { 474 continue; 475 } 476 477 if (eq_ncu_fir == id && (9 == bit)) 478 { 479 continue; 480 } 481 482 if (eq_local_fir == id && 483 (0 == bit || 1 == bit || 2 == bit || 3 == bit || 5 == bit || 484 6 == bit || 7 == bit || 8 == bit || 9 == bit || 10 == bit || 485 11 == bit || 12 == bit || 13 == bit || 14 == bit || 486 15 == bit || 16 == bit || 20 == bit || 21 == bit || 487 22 == bit || 23 == bit || 24 == bit || 25 == bit || 488 26 == bit || 27 == bit || 28 == bit || 29 == bit || 489 30 == bit || 31 == bit || 32 == bit || 33 == bit || 490 34 == bit || 35 == bit || 36 == bit || 37 == bit || 491 38 == bit || 39 == bit)) 492 { 493 continue; 494 } 495 496 if (eq_qme_fir == id && (7 == bit || 25 == bit)) 497 { 498 continue; 499 } 500 501 if (iohs_dlp_fir_oc == id && 502 (6 == bit || 7 == bit || 8 == bit || 9 == bit || 10 == bit || 503 48 == bit || 49 == bit || 52 == bit || 53 == bit)) 504 { 505 continue; 506 } 507 508 if (iohs_dlp_fir_smp == id && 509 (6 == bit || 7 == bit || 14 == bit || 15 == bit || 16 == bit || 510 17 == bit || 38 == bit || 39 == bit || 44 == bit || 511 45 == bit || 50 == bit || 51 == bit)) 512 { 513 continue; 514 } 515 516 if (mc_fir == id && 517 (5 == bit || 8 == bit || 15 == bit || 16 == bit)) 518 { 519 continue; 520 } 521 522 if (mc_dstl_fir == id && 523 (0 == bit || 1 == bit || 2 == bit || 3 == bit || 4 == bit || 524 5 == bit || 6 == bit || 7 == bit || 14 == bit || 15 == bit)) 525 { 526 continue; 527 } 528 529 if (mc_ustl_fir == id && 530 (6 == bit || 20 == bit || 33 == bit || 34 == bit)) 531 { 532 continue; 533 } 534 535 if (nmmu_cq_fir == id && (8 == bit || 11 == bit || 14 == bit)) 536 { 537 continue; 538 } 539 540 if (nmmu_fir == id && 541 (0 == bit || 3 == bit || 8 == bit || 9 == bit || 10 == bit || 542 11 == bit || 12 == bit || 13 == bit || 14 == bit || 543 15 == bit || 30 == bit || 31 == bit || 41 == bit)) 544 { 545 continue; 546 } 547 548 if (mc_omi_dl == id && (2 == bit || 3 == bit || 6 == bit || 549 7 == bit || 9 == bit || 10 == bit)) 550 { 551 continue; 552 } 553 554 if (pau_ptl_fir == id && (5 == bit || 9 == bit)) 555 { 556 continue; 557 } 558 559 if (pau_phy_fir == id && 560 (2 == bit || 3 == bit || 6 == bit || 7 == bit || 15 == bit)) 561 { 562 continue; 563 } 564 565 if (pau_fir_0 == id && (13 == bit || 30 == bit || 41 == bit)) 566 { 567 continue; 568 } 569 570 if (pau_fir_2 == id && (19 == bit || 46 == bit || 49 == bit)) 571 { 572 continue; 573 } 574 575 if (pci_iop_fir == id && 576 (0 == bit || 2 == bit || 4 == bit || 6 == bit || 7 == bit || 577 8 == bit || 10 == bit)) 578 { 579 continue; 580 } 581 582 if (pci_nest_fir == id && (2 == bit || 5 == bit)) 583 { 584 continue; 585 } 586 } 587 else if (TYPE_OCMB == targetType) 588 { 589 if (ocmb_lfir == id && 590 (0 == bit || 1 == bit || 2 == bit || 8 == bit || 23 == bit || 591 37 == bit || 63 == bit)) 592 { 593 continue; 594 } 595 596 if (mmiofir == id && (2 == bit)) 597 { 598 continue; 599 } 600 601 if (srqfir == id && 602 (2 == bit || 4 == bit || 14 == bit || 15 == bit || 23 == bit || 603 25 == bit || 28 == bit)) 604 { 605 continue; 606 } 607 608 if (rdffir == id && 609 (0 == bit || 1 == bit || 2 == bit || 3 == bit || 4 == bit || 610 5 == bit || 6 == bit || 7 == bit || 8 == bit || 9 == bit || 611 18 == bit || 38 == bit || 40 == bit || 41 == bit || 612 45 == bit || 46 == bit)) 613 { 614 continue; 615 } 616 617 if (tlxfir == id && (0 == bit || 9 == bit || 26 == bit)) 618 { 619 continue; 620 } 621 622 if (omi_dl == id && (2 == bit || 3 == bit || 6 == bit || 7 == bit || 623 9 == bit || 10 == bit)) 624 { 625 continue; 626 } 627 } 628 629 // At this point, the attention has not been explicitly ignored. So 630 // return this signature and exit. 631 o_rootCause = signature; 632 return true; 633 } 634 635 return false; // default, nothing found 636 } 637 638 //------------------------------------------------------------------------------ 639 640 bool findRootCause(AnalysisType i_type, const libhei::IsolationData& i_isoData, 641 libhei::Signature& o_rootCause, 642 const RasDataParser& i_rasData) 643 { 644 // We'll need to make a copy of the list so that the original list is 645 // maintained for the PEL. 646 std::vector<libhei::Signature> list{i_isoData.getSignatureList()}; 647 648 // START WORKAROUND 649 // TODO: Filtering should be data driven. Until that support is available, 650 // use the following isolation rules. 651 652 // Ensure the list is not empty before continuing. 653 if (list.empty()) 654 { 655 return false; // nothing more to do 656 } 657 658 // First, look for any RCS OSC errors. This must always be first because 659 // they can cause downstream PLL unlock attentions. 660 if (__findRcsOscError(list, o_rootCause)) 661 { 662 return true; 663 } 664 665 // Second, look for any PLL unlock attentions. This must always be second 666 // because PLL unlock attentions can cause any number of downstream 667 // attentions, including a system checkstop. 668 if (__findPllUnlock(list, o_rootCause)) 669 { 670 return true; 671 } 672 673 // Regardless of the analysis type, always look for anything that could be 674 // blamed as the root cause of a system checkstop. 675 676 // Memory channel failure attentions will produce SUEs and likely cause 677 // downstream attentions, including a system checkstop. 678 if (__findMemoryChannelFailure(list, o_rootCause, i_rasData)) 679 { 680 return true; 681 } 682 683 // Look for any recoverable attentions that have been identified as a 684 // potential root cause of a system checkstop attention. These would include 685 // any attention that would generate an SUE. Note that is it possible for 686 // recoverables to generate unit checkstop attentions so we must check them 687 // first. 688 if (__findCsRootCause_RE(list, o_rootCause, i_rasData)) 689 { 690 return true; 691 } 692 693 // Look for any unit checkstop attentions (other than memory channel 694 // failures) that have been identified as a potential root cause of a 695 // system checkstop attention. These would include any attention that would 696 // generate an SUE. 697 if (__findCsRootCause_UCS(list, o_rootCause, i_rasData)) 698 { 699 return true; 700 } 701 702 // If no other viable root cause has been found, check for any signatures 703 // with the ATTN_FROM_OCMB flag in case there was an attention from an 704 // inaccessible OCMB. 705 if (__findOcmbAttnBits(list, o_rootCause, i_rasData)) 706 { 707 return true; 708 } 709 710 // Look for any system checkstop attentions that originated from within the 711 // chip that reported the attention. In other words, no external checkstop 712 // attentions. 713 if (__findNonExternalCs(list, o_rootCause)) 714 { 715 return true; 716 } 717 718 if (AnalysisType::SYSTEM_CHECKSTOP != i_type) 719 { 720 // No system checkstop root cause attentions were found. Next, look for 721 // any recoverable or unit checkstop attentions that could be associated 722 // with a TI. 723 if (__findTiRootCause(list, o_rootCause)) 724 { 725 return true; 726 } 727 728 if (AnalysisType::TERMINATE_IMMEDIATE != i_type) 729 { 730 // No attentions associated with a system checkstop or TI were 731 // found. Simply, return the first entry in the list. 732 o_rootCause = list.front(); 733 return true; 734 } 735 } 736 737 // END WORKAROUND 738 739 return false; // default, no active attentions found. 740 } 741 742 //------------------------------------------------------------------------------ 743 744 bool __findIueTh(const std::vector<libhei::Signature>& i_list, 745 libhei::Signature& o_rootCause) 746 { 747 auto itr = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) { 748 return (libhei::hash<libhei::NodeId_t>("RDFFIR") == t.getId() && 749 (17 == t.getBit() || 37 == t.getBit())) || 750 (libhei::hash<libhei::NodeId_t>("RDF_FIR") == t.getId() && 751 (18 == t.getBit() || 38 == t.getBit())); 752 }); 753 754 if (i_list.end() != itr) 755 { 756 o_rootCause = *itr; 757 return true; 758 } 759 760 return false; 761 } 762 763 //------------------------------------------------------------------------------ 764 765 void rootCauseSpecialCases(const libhei::IsolationData& i_isoData, 766 libhei::Signature& o_rootCause, 767 const RasDataParser& i_rasData) 768 { 769 using func = libhei::NodeId_t (*)(const std::string& i_str); 770 func __hash = libhei::hash<libhei::NodeId_t>; 771 772 // Check for any special cases that exist for specific FIR bits. 773 774 // If the channel fail was specifically a firmware initiated channel fail 775 // (SRQFIR[25] for Explorer OCMBs, SRQ_FIR[46] for Odyssey OCMBs) check for 776 // any IUE bits that are on that would have caused the channel fail 777 // (RDFFIR[17,37] for Explorer OCMBs, RDF_FIR_0[18,38] or RDF_FIR_1[18,38] 778 // for Odyssey OCMBs). 779 780 // Explorer SRQFIR 781 static const auto srqfir = __hash("SRQFIR"); 782 // Odyssey SRQ_FIR 783 static const auto srq_fir = __hash("SRQ_FIR"); 784 785 std::vector<libhei::Signature> list{i_isoData.getSignatureList()}; 786 787 if (((srqfir == o_rootCause.getId() && 25 == o_rootCause.getBit()) || 788 (srq_fir == o_rootCause.getId() && 46 == o_rootCause.getBit())) && 789 __findIueTh(list, o_rootCause)) 790 { 791 // If __findIueTh returned true, o_rootCause was updated, return. 792 return; 793 } 794 795 // Check if the root cause found was a potential side effect of an 796 // ODP data corruption error. If it was, check if any other signature 797 // in the signature list was a potential root cause. 798 auto OdpSide = RasDataParser::RasDataFlags::ODP_DATA_CORRUPT_SIDE_EFFECT; 799 auto OdpRoot = RasDataParser::RasDataFlags::ODP_DATA_CORRUPT_ROOT_CAUSE; 800 if (i_rasData.isFlagSet(o_rootCause, OdpSide)) 801 { 802 for (const auto& s : list) 803 { 804 if (i_rasData.isFlagSet(s, OdpRoot)) 805 { 806 // ODP data corruption root cause found, return. 807 o_rootCause = s; 808 return; 809 } 810 } 811 } 812 813 // Odyssey RDF_FIR 814 static const auto rdf_fir = __hash("RDF_FIR"); 815 816 // RDF_FIR[41] can be the root cause of RDF_FIR[16], so if bit 16 is on, 817 // check if bit 41 is also on. 818 if (rdf_fir == o_rootCause.getId() && 16 == o_rootCause.getBit()) 819 { 820 // Look for RDF_FIR[41] 821 auto itr = std::find_if(list.begin(), list.end(), [&](const auto& t) { 822 return (rdf_fir == t.getId() && 41 == t.getBit()); 823 }); 824 if (list.end() != itr) 825 { 826 o_rootCause = *itr; 827 } 828 } 829 } 830 831 //------------------------------------------------------------------------------ 832 833 bool filterRootCause(AnalysisType i_type, 834 const libhei::IsolationData& i_isoData, 835 libhei::Signature& o_rootCause, 836 const RasDataParser& i_rasData) 837 { 838 // Find the initial root cause attention based on common rules for FIR 839 // isolation. 840 bool rc = findRootCause(i_type, i_isoData, o_rootCause, i_rasData); 841 842 // If some root cause was found, handle any special cases for specific FIR 843 // bits that require additional logic to determine the root cause. 844 if (true == rc) 845 { 846 rootCauseSpecialCases(i_isoData, o_rootCause, i_rasData); 847 } 848 849 return rc; 850 } 851 852 //------------------------------------------------------------------------------ 853 854 } // namespace analyzer 855