1 #include <assert.h>
2 
3 #include <analyzer/analyzer_main.hpp>
4 #include <analyzer/ras-data/ras-data-parser.hpp>
5 #include <hei_main.hpp>
6 #include <hei_util.hpp>
7 #include <util/pdbg.hpp>
8 
9 #include <algorithm>
10 #include <limits>
11 #include <string>
12 
13 namespace analyzer
14 {
15 //------------------------------------------------------------------------------
16 
__findRcsOscError(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause)17 bool __findRcsOscError(const std::vector<libhei::Signature>& i_list,
18                        libhei::Signature& o_rootCause)
19 {
20     // TODO: Consider returning all of them instead of one as root cause.
21     auto itr = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) {
22         return (libhei::hash<libhei::NodeId_t>("TP_LOCAL_FIR") == t.getId() &&
23                 (42 == t.getBit() || 43 == t.getBit()));
24     });
25 
26     if (i_list.end() != itr)
27     {
28         o_rootCause = *itr;
29         return true;
30     }
31 
32     return false;
33 }
34 
35 //------------------------------------------------------------------------------
36 
__findPllUnlock(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause)37 bool __findPllUnlock(const std::vector<libhei::Signature>& i_list,
38                      libhei::Signature& o_rootCause)
39 {
40     using namespace util::pdbg;
41 
42     // TODO: Consider returning all of them instead of one as root cause.
43 
44     auto nodeId = libhei::hash<libhei::NodeId_t>("PLL_UNLOCK");
45 
46     // First, look for any PLL unlock attentions reported by a processsor chip.
47     auto itr1 = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) {
48         return (nodeId == t.getId() &&
49                 TYPE_PROC == getTrgtType(getTrgt(t.getChip())));
50     });
51 
52     if (i_list.end() != itr1)
53     {
54         o_rootCause = *itr1;
55         return true;
56     }
57 
58     // Then, look for any PLL unlock attentions reported by an OCMB chip. This
59     // is specifically for Odyssey, which are the only OCMBs that would report
60     // PLL unlock attentions.
61     auto itr2 = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) {
62         return (nodeId == t.getId() &&
63                 TYPE_OCMB == getTrgtType(getTrgt(t.getChip())));
64     });
65 
66     if (i_list.end() != itr2)
67     {
68         o_rootCause = *itr2;
69         return true;
70     }
71 
72     return false;
73 }
74 
75 //------------------------------------------------------------------------------
76 
__findMemoryChannelFailure(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)77 bool __findMemoryChannelFailure(const std::vector<libhei::Signature>& i_list,
78                                 libhei::Signature& o_rootCause,
79                                 const RasDataParser& i_rasData)
80 {
81     using namespace util::pdbg;
82 
83     using func = libhei::NodeId_t (*)(const std::string& i_str);
84     func __hash = libhei::hash<libhei::NodeId_t>;
85 
86     static const auto mc_dstl_fir = __hash("MC_DSTL_FIR");
87     static const auto mc_ustl_fir = __hash("MC_USTL_FIR");
88     static const auto mc_omi_dl_err_rpt = __hash("MC_OMI_DL_ERR_RPT");
89 
90     // First, look for any chip checkstops from the connected OCMBs.
91     for (const auto& s : i_list)
92     {
93         if (TYPE_OCMB != getTrgtType(getTrgt(s.getChip())))
94         {
95             continue; // OCMBs only
96         }
97 
98         // TODO: The chip data for Explorer chips currently report chip
99         //       checkstops as unit checkstops. Once the chip data has been
100         //       updated, the check for unit checkstops here will need to be
101         //       removed.
102         if (libhei::ATTN_TYPE_CHIP_CS == s.getAttnType() ||
103             libhei::ATTN_TYPE_UNIT_CS == s.getAttnType())
104         {
105             o_rootCause = s;
106             return true;
107         }
108     }
109 
110     // Now, look for any channel failure attentions on the processor side of the
111     // memory bus.
112     for (const auto& s : i_list)
113     {
114         if (TYPE_PROC != getTrgtType(getTrgt(s.getChip())))
115         {
116             continue; // processors only
117         }
118 
119         // Any unit checkstop attentions that originated from the MC_DSTL_FIR or
120         // MC_USTLFIR are considered a channel failure attention.
121         // TODO: The "channel failure" designation is actually configurable via
122         //       other registers. We just happen to expect anything that is
123         //       configured to channel failure to also be configured to unit
124         //       checkstop. Eventually, we will need some mechanism to check the
125         //       configuration registers for a more accurate analysis.
126         if (libhei::ATTN_TYPE_UNIT_CS == s.getAttnType() &&
127             (mc_dstl_fir == s.getId() || mc_ustl_fir == s.getId()) &&
128             !i_rasData.isFlagSet(s,
129                                  RasDataParser::RasDataFlags::ATTN_FROM_OCMB))
130         {
131             o_rootCause = s;
132             return true;
133         }
134         // Any signatures from MC_OMI_DL_ERR_RPT feed into the only bits in
135         // MC_OMI_DL_FIR that are hardwired to channel failure.
136         else if (mc_omi_dl_err_rpt == s.getId())
137         {
138             o_rootCause = s;
139             return true;
140         }
141     }
142 
143     return false; // default, nothing found
144 }
145 
146 //------------------------------------------------------------------------------
147 
148 // Will query if a signature is a potential system checkstop root cause.
149 // attention. Note that this function excludes memory channel failure attentions
150 // which are checked in __findMemoryChannelFailure().
__findCsRootCause(const libhei::Signature & i_signature,const RasDataParser & i_rasData)151 bool __findCsRootCause(const libhei::Signature& i_signature,
152                        const RasDataParser& i_rasData)
153 {
154     // Check if the input signature has the CS_POSSIBLE or SUE_SOURCE flag set.
155     if (i_rasData.isFlagSet(i_signature,
156                             RasDataParser::RasDataFlags::CS_POSSIBLE) ||
157         i_rasData.isFlagSet(i_signature,
158                             RasDataParser::RasDataFlags::SUE_SOURCE))
159     {
160         return true;
161     }
162 
163     return false; // default, nothing found
164 }
165 
166 //------------------------------------------------------------------------------
167 
__findCsRootCause_RE(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)168 bool __findCsRootCause_RE(const std::vector<libhei::Signature>& i_list,
169                           libhei::Signature& o_rootCause,
170                           const RasDataParser& i_rasData)
171 {
172     for (const auto& s : i_list)
173     {
174         // Only looking for recoverable attentions.
175         if (libhei::ATTN_TYPE_RECOVERABLE != s.getAttnType())
176         {
177             continue;
178         }
179 
180         if (__findCsRootCause(s, i_rasData))
181         {
182             o_rootCause = s;
183             return true;
184         }
185     }
186 
187     return false; // default, nothing found
188 }
189 
190 //------------------------------------------------------------------------------
191 
__findCsRootCause_UCS(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)192 bool __findCsRootCause_UCS(const std::vector<libhei::Signature>& i_list,
193                            libhei::Signature& o_rootCause,
194                            const RasDataParser& i_rasData)
195 {
196     for (const auto& s : i_list)
197     {
198         // Only looking for unit checkstop attentions.
199         if (libhei::ATTN_TYPE_UNIT_CS != s.getAttnType())
200         {
201             continue;
202         }
203 
204         if (__findCsRootCause(s, i_rasData))
205         {
206             o_rootCause = s;
207             return true;
208         }
209     }
210 
211     return false; // default, nothing found
212 }
213 
214 //------------------------------------------------------------------------------
215 
__findOcmbAttnBits(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)216 bool __findOcmbAttnBits(const std::vector<libhei::Signature>& i_list,
217                         libhei::Signature& o_rootCause,
218                         const RasDataParser& i_rasData)
219 {
220     using namespace util::pdbg;
221 
222     // If we have any attentions from an OCMB, assume isolation to the OCMBs
223     // was successful and the ATTN_FROM_OCMB flag does not need to be checked.
224     for (const auto& s : i_list)
225     {
226         if (TYPE_OCMB == getTrgtType(getTrgt(s.getChip())))
227         {
228             return false;
229         }
230     }
231 
232     for (const auto& s : i_list)
233     {
234         if (i_rasData.isFlagSet(s, RasDataParser::RasDataFlags::ATTN_FROM_OCMB))
235         {
236             o_rootCause = s;
237             return true;
238         }
239     }
240 
241     return false; // default, nothing found
242 }
243 
244 //------------------------------------------------------------------------------
245 
__findNonExternalCs(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause)246 bool __findNonExternalCs(const std::vector<libhei::Signature>& i_list,
247                          libhei::Signature& o_rootCause)
248 {
249     using namespace util::pdbg;
250 
251     static const auto pb_ext_fir = libhei::hash<libhei::NodeId_t>("PB_EXT_FIR");
252 
253     for (const auto& s : i_list)
254     {
255         const auto targetType = getTrgtType(getTrgt(s.getChip()));
256         const auto id = s.getId();
257         const auto attnType = s.getAttnType();
258 
259         // Find any processor with chip checkstop attention that did not
260         // originate from the PB_EXT_FIR.
261         if ((TYPE_PROC == targetType) &&
262             (libhei::ATTN_TYPE_CHIP_CS == attnType) && (pb_ext_fir != id))
263         {
264             o_rootCause = s;
265             return true;
266         }
267     }
268 
269     return false; // default, nothing found
270 }
271 
272 //------------------------------------------------------------------------------
273 
__findTiRootCause(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause)274 bool __findTiRootCause(const std::vector<libhei::Signature>& i_list,
275                        libhei::Signature& o_rootCause)
276 {
277     using namespace util::pdbg;
278 
279     using func = libhei::NodeId_t (*)(const std::string& i_str);
280     func __hash = libhei::hash<libhei::NodeId_t>;
281 
282     // PROC registers
283     static const auto tp_local_fir = __hash("TP_LOCAL_FIR");
284     static const auto occ_fir = __hash("OCC_FIR");
285     static const auto pbao_fir = __hash("PBAO_FIR");
286     static const auto n0_local_fir = __hash("N0_LOCAL_FIR");
287     static const auto int_cq_fir = __hash("INT_CQ_FIR");
288     static const auto nx_cq_fir = __hash("NX_CQ_FIR");
289     static const auto nx_dma_eng_fir = __hash("NX_DMA_ENG_FIR");
290     static const auto vas_fir = __hash("VAS_FIR");
291     static const auto n1_local_fir = __hash("N1_LOCAL_FIR");
292     static const auto mcd_fir = __hash("MCD_FIR");
293     static const auto pb_station_fir_en_1 = __hash("PB_STATION_FIR_EN_1");
294     static const auto pb_station_fir_en_2 = __hash("PB_STATION_FIR_EN_2");
295     static const auto pb_station_fir_en_3 = __hash("PB_STATION_FIR_EN_3");
296     static const auto pb_station_fir_en_4 = __hash("PB_STATION_FIR_EN_4");
297     static const auto pb_station_fir_es_1 = __hash("PB_STATION_FIR_ES_1");
298     static const auto pb_station_fir_es_2 = __hash("PB_STATION_FIR_ES_2");
299     static const auto pb_station_fir_es_3 = __hash("PB_STATION_FIR_ES_3");
300     static const auto pb_station_fir_es_4 = __hash("PB_STATION_FIR_ES_4");
301     static const auto pb_station_fir_eq = __hash("PB_STATION_FIR_EQ");
302     static const auto psihb_fir = __hash("PSIHB_FIR");
303     static const auto pbaf_fir = __hash("PBAF_FIR");
304     static const auto lpc_fir = __hash("LPC_FIR");
305     static const auto eq_core_fir = __hash("EQ_CORE_FIR");
306     static const auto eq_l2_fir = __hash("EQ_L2_FIR");
307     static const auto eq_l3_fir = __hash("EQ_L3_FIR");
308     static const auto eq_ncu_fir = __hash("EQ_NCU_FIR");
309     static const auto eq_local_fir = __hash("EQ_LOCAL_FIR");
310     static const auto eq_qme_fir = __hash("EQ_QME_FIR");
311     static const auto iohs_local_fir = __hash("IOHS_LOCAL_FIR");
312     static const auto iohs_dlp_fir_oc = __hash("IOHS_DLP_FIR_OC");
313     static const auto iohs_dlp_fir_smp = __hash("IOHS_DLP_FIR_SMP");
314     static const auto mc_local_fir = __hash("MC_LOCAL_FIR");
315     static const auto mc_fir = __hash("MC_FIR");
316     static const auto mc_dstl_fir = __hash("MC_DSTL_FIR");
317     static const auto mc_ustl_fir = __hash("MC_USTL_FIR");
318     static const auto nmmu_cq_fir = __hash("NMMU_CQ_FIR");
319     static const auto nmmu_fir = __hash("NMMU_FIR");
320     static const auto mc_omi_dl = __hash("MC_OMI_DL");
321     static const auto pau_local_fir = __hash("PAU_LOCAL_FIR");
322     static const auto pau_ptl_fir = __hash("PAU_PTL_FIR");
323     static const auto pau_phy_fir = __hash("PAU_PHY_FIR");
324     static const auto pau_fir_0 = __hash("PAU_FIR_0");
325     static const auto pau_fir_2 = __hash("PAU_FIR_2");
326     static const auto pci_local_fir = __hash("PCI_LOCAL_FIR");
327     static const auto pci_iop_fir = __hash("PCI_IOP_FIR");
328     static const auto pci_nest_fir = __hash("PCI_NEST_FIR");
329 
330     // OCMB registers
331     static const auto ocmb_lfir = __hash("OCMB_LFIR");
332     static const auto mmiofir = __hash("MMIOFIR");
333     static const auto srqfir = __hash("SRQFIR");
334     static const auto rdffir = __hash("RDFFIR");
335     static const auto tlxfir = __hash("TLXFIR");
336     static const auto omi_dl = __hash("OMI_DL");
337 
338     for (const auto& signature : i_list)
339     {
340         const auto targetType = getTrgtType(getTrgt(signature.getChip()));
341         const auto attnType = signature.getAttnType();
342         const auto id = signature.getId();
343         const auto bit = signature.getBit();
344 
345         // Only looking for recoverable or unit checkstop attentions.
346         if (libhei::ATTN_TYPE_RECOVERABLE != attnType &&
347             libhei::ATTN_TYPE_UNIT_CS != attnType)
348         {
349             continue;
350         }
351 
352         // Ignore attentions that should not be blamed as root cause of a TI.
353         // This would include informational only FIRs or correctable errors.
354         if (TYPE_PROC == targetType)
355         {
356             if (tp_local_fir == id &&
357                 (0 == bit || 1 == bit || 2 == bit || 3 == bit || 4 == bit ||
358                  5 == bit || 7 == bit || 8 == bit || 9 == bit || 10 == bit ||
359                  11 == bit || 20 == bit || 22 == bit || 23 == bit ||
360                  24 == bit || 38 == bit || 40 == bit || 41 == bit ||
361                  46 == bit || 47 == bit || 48 == bit || 55 == bit ||
362                  56 == bit || 57 == bit || 58 == bit || 59 == bit))
363             {
364                 continue;
365             }
366 
367             if (occ_fir == id &&
368                 (9 == bit || 10 == bit || 15 == bit || 20 == bit || 21 == bit ||
369                  22 == bit || 23 == bit || 32 == bit || 33 == bit ||
370                  34 == bit || 36 == bit || 42 == bit || 43 == bit ||
371                  46 == bit || 47 == bit || 48 == bit || 51 == bit ||
372                  52 == bit || 53 == bit || 54 == bit || 57 == bit))
373             {
374                 continue;
375             }
376 
377             if (pbao_fir == id &&
378                 (0 == bit || 1 == bit || 2 == bit || 8 == bit || 11 == bit ||
379                  13 == bit || 15 == bit || 16 == bit || 17 == bit))
380             {
381                 continue;
382             }
383 
384             if ((n0_local_fir == id || n1_local_fir == id ||
385                  iohs_local_fir == id || mc_local_fir == id ||
386                  pau_local_fir == id || pci_local_fir == id) &&
387                 (0 == bit || 1 == bit || 2 == bit || 3 == bit || 4 == bit ||
388                  5 == bit || 6 == bit || 7 == bit || 8 == bit || 9 == bit ||
389                  10 == bit || 11 == bit || 20 == bit || 21 == bit))
390             {
391                 continue;
392             }
393 
394             if (int_cq_fir == id &&
395                 (0 == bit || 3 == bit || 5 == bit || 7 == bit || 36 == bit ||
396                  47 == bit || 48 == bit || 49 == bit || 50 == bit ||
397                  58 == bit || 59 == bit || 60 == bit))
398             {
399                 continue;
400             }
401 
402             if (nx_cq_fir == id &&
403                 (1 == bit || 4 == bit || 18 == bit || 32 == bit || 33 == bit))
404             {
405                 continue;
406             }
407 
408             if (nx_dma_eng_fir == id &&
409                 (4 == bit || 6 == bit || 9 == bit || 10 == bit || 11 == bit ||
410                  34 == bit || 35 == bit || 36 == bit || 37 == bit || 39 == bit))
411             {
412                 continue;
413             }
414 
415             if (vas_fir == id &&
416                 (8 == bit || 9 == bit || 11 == bit || 12 == bit || 13 == bit))
417             {
418                 continue;
419             }
420 
421             if (mcd_fir == id && (0 == bit))
422             {
423                 continue;
424             }
425 
426             if ((pb_station_fir_en_1 == id || pb_station_fir_en_2 == id ||
427                  pb_station_fir_en_3 == id || pb_station_fir_en_4 == id ||
428                  pb_station_fir_es_1 == id || pb_station_fir_es_2 == id ||
429                  pb_station_fir_es_3 == id || pb_station_fir_es_4 == id ||
430                  pb_station_fir_eq == id) &&
431                 (9 == bit))
432             {
433                 continue;
434             }
435 
436             if (psihb_fir == id && (0 == bit || 23 == bit))
437             {
438                 continue;
439             }
440 
441             if (pbaf_fir == id &&
442                 (0 == bit || 1 == bit || 3 == bit || 4 == bit || 5 == bit ||
443                  6 == bit || 7 == bit || 8 == bit || 9 == bit || 10 == bit ||
444                  11 == bit || 19 == bit || 20 == bit || 21 == bit ||
445                  28 == bit || 29 == bit || 30 == bit || 31 == bit ||
446                  32 == bit || 33 == bit || 34 == bit || 35 == bit || 36 == bit))
447             {
448                 continue;
449             }
450 
451             if (lpc_fir == id && (5 == bit))
452             {
453                 continue;
454             }
455 
456             if (eq_core_fir == id &&
457                 (0 == bit || 2 == bit || 4 == bit || 7 == bit || 9 == bit ||
458                  11 == bit || 13 == bit || 18 == bit || 21 == bit ||
459                  24 == bit || 29 == bit || 31 == bit || 37 == bit ||
460                  43 == bit || 56 == bit || 57 == bit))
461             {
462                 continue;
463             }
464 
465             if (eq_l2_fir == id &&
466                 (0 == bit || 6 == bit || 11 == bit || 19 == bit || 36 == bit))
467             {
468                 continue;
469             }
470 
471             if (eq_l3_fir == id &&
472                 (3 == bit || 4 == bit || 7 == bit || 10 == bit || 13 == bit))
473             {
474                 continue;
475             }
476 
477             if (eq_ncu_fir == id && (9 == bit))
478             {
479                 continue;
480             }
481 
482             if (eq_local_fir == id &&
483                 (0 == bit || 1 == bit || 2 == bit || 3 == bit || 5 == bit ||
484                  6 == bit || 7 == bit || 8 == bit || 9 == bit || 10 == bit ||
485                  11 == bit || 12 == bit || 13 == bit || 14 == bit ||
486                  15 == bit || 16 == bit || 20 == bit || 21 == bit ||
487                  22 == bit || 23 == bit || 24 == bit || 25 == bit ||
488                  26 == bit || 27 == bit || 28 == bit || 29 == bit ||
489                  30 == bit || 31 == bit || 32 == bit || 33 == bit ||
490                  34 == bit || 35 == bit || 36 == bit || 37 == bit ||
491                  38 == bit || 39 == bit))
492             {
493                 continue;
494             }
495 
496             if (eq_qme_fir == id && (7 == bit || 25 == bit))
497             {
498                 continue;
499             }
500 
501             if (iohs_dlp_fir_oc == id &&
502                 (6 == bit || 7 == bit || 8 == bit || 9 == bit || 10 == bit ||
503                  48 == bit || 49 == bit || 52 == bit || 53 == bit))
504             {
505                 continue;
506             }
507 
508             if (iohs_dlp_fir_smp == id &&
509                 (6 == bit || 7 == bit || 14 == bit || 15 == bit || 16 == bit ||
510                  17 == bit || 38 == bit || 39 == bit || 44 == bit ||
511                  45 == bit || 50 == bit || 51 == bit))
512             {
513                 continue;
514             }
515 
516             if (mc_fir == id &&
517                 (5 == bit || 8 == bit || 15 == bit || 16 == bit))
518             {
519                 continue;
520             }
521 
522             if (mc_dstl_fir == id &&
523                 (0 == bit || 1 == bit || 2 == bit || 3 == bit || 4 == bit ||
524                  5 == bit || 6 == bit || 7 == bit || 14 == bit || 15 == bit))
525             {
526                 continue;
527             }
528 
529             if (mc_ustl_fir == id &&
530                 (6 == bit || 20 == bit || 33 == bit || 34 == bit))
531             {
532                 continue;
533             }
534 
535             if (nmmu_cq_fir == id && (8 == bit || 11 == bit || 14 == bit))
536             {
537                 continue;
538             }
539 
540             if (nmmu_fir == id &&
541                 (0 == bit || 3 == bit || 8 == bit || 9 == bit || 10 == bit ||
542                  11 == bit || 12 == bit || 13 == bit || 14 == bit ||
543                  15 == bit || 30 == bit || 31 == bit || 41 == bit))
544             {
545                 continue;
546             }
547 
548             if (mc_omi_dl == id && (2 == bit || 3 == bit || 6 == bit ||
549                                     7 == bit || 9 == bit || 10 == bit))
550             {
551                 continue;
552             }
553 
554             if (pau_ptl_fir == id && (5 == bit || 9 == bit))
555             {
556                 continue;
557             }
558 
559             if (pau_phy_fir == id &&
560                 (2 == bit || 3 == bit || 6 == bit || 7 == bit || 15 == bit))
561             {
562                 continue;
563             }
564 
565             if (pau_fir_0 == id && (13 == bit || 30 == bit || 41 == bit))
566             {
567                 continue;
568             }
569 
570             if (pau_fir_2 == id && (19 == bit || 46 == bit || 49 == bit))
571             {
572                 continue;
573             }
574 
575             if (pci_iop_fir == id &&
576                 (0 == bit || 2 == bit || 4 == bit || 6 == bit || 7 == bit ||
577                  8 == bit || 10 == bit))
578             {
579                 continue;
580             }
581 
582             if (pci_nest_fir == id && (2 == bit || 5 == bit))
583             {
584                 continue;
585             }
586         }
587         else if (TYPE_OCMB == targetType)
588         {
589             if (ocmb_lfir == id &&
590                 (0 == bit || 1 == bit || 2 == bit || 8 == bit || 23 == bit ||
591                  37 == bit || 63 == bit))
592             {
593                 continue;
594             }
595 
596             if (mmiofir == id && (2 == bit))
597             {
598                 continue;
599             }
600 
601             if (srqfir == id &&
602                 (2 == bit || 4 == bit || 14 == bit || 15 == bit || 23 == bit ||
603                  25 == bit || 28 == bit))
604             {
605                 continue;
606             }
607 
608             if (rdffir == id &&
609                 (0 == bit || 1 == bit || 2 == bit || 3 == bit || 4 == bit ||
610                  5 == bit || 6 == bit || 7 == bit || 8 == bit || 9 == bit ||
611                  18 == bit || 38 == bit || 40 == bit || 41 == bit ||
612                  45 == bit || 46 == bit))
613             {
614                 continue;
615             }
616 
617             if (tlxfir == id && (0 == bit || 9 == bit || 26 == bit))
618             {
619                 continue;
620             }
621 
622             if (omi_dl == id && (2 == bit || 3 == bit || 6 == bit || 7 == bit ||
623                                  9 == bit || 10 == bit))
624             {
625                 continue;
626             }
627         }
628 
629         // At this point, the attention has not been explicitly ignored. So
630         // return this signature and exit.
631         o_rootCause = signature;
632         return true;
633     }
634 
635     return false; // default, nothing found
636 }
637 
638 //------------------------------------------------------------------------------
639 
findRootCause(AnalysisType i_type,const libhei::IsolationData & i_isoData,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)640 bool findRootCause(AnalysisType i_type, const libhei::IsolationData& i_isoData,
641                    libhei::Signature& o_rootCause,
642                    const RasDataParser& i_rasData)
643 {
644     // We'll need to make a copy of the list so that the original list is
645     // maintained for the PEL.
646     std::vector<libhei::Signature> list{i_isoData.getSignatureList()};
647 
648     // START WORKAROUND
649     // TODO: Filtering should be data driven. Until that support is available,
650     //       use the following isolation rules.
651 
652     // Ensure the list is not empty before continuing.
653     if (list.empty())
654     {
655         return false; // nothing more to do
656     }
657 
658     // First, look for any RCS OSC errors. This must always be first because
659     // they can cause downstream PLL unlock attentions.
660     if (__findRcsOscError(list, o_rootCause))
661     {
662         return true;
663     }
664 
665     // Second, look for any PLL unlock attentions. This must always be second
666     // because PLL unlock attentions can cause any number of downstream
667     // attentions, including a system checkstop.
668     if (__findPllUnlock(list, o_rootCause))
669     {
670         return true;
671     }
672 
673     // Regardless of the analysis type, always look for anything that could be
674     // blamed as the root cause of a system checkstop.
675 
676     // Memory channel failure attentions will produce SUEs and likely cause
677     // downstream attentions, including a system checkstop.
678     if (__findMemoryChannelFailure(list, o_rootCause, i_rasData))
679     {
680         return true;
681     }
682 
683     // Look for any recoverable attentions that have been identified as a
684     // potential root cause of a system checkstop attention. These would include
685     // any attention that would generate an SUE. Note that is it possible for
686     // recoverables to generate unit checkstop attentions so we must check them
687     // first.
688     if (__findCsRootCause_RE(list, o_rootCause, i_rasData))
689     {
690         return true;
691     }
692 
693     // Look for any unit checkstop attentions (other than memory channel
694     // failures) that have been identified as a potential root cause of a
695     // system checkstop attention. These would include any attention that would
696     // generate an SUE.
697     if (__findCsRootCause_UCS(list, o_rootCause, i_rasData))
698     {
699         return true;
700     }
701 
702     // If no other viable root cause has been found, check for any signatures
703     // with the ATTN_FROM_OCMB flag in case there was an attention from an
704     // inaccessible OCMB.
705     if (__findOcmbAttnBits(list, o_rootCause, i_rasData))
706     {
707         return true;
708     }
709 
710     // Look for any system checkstop attentions that originated from within the
711     // chip that reported the attention. In other words, no external checkstop
712     // attentions.
713     if (__findNonExternalCs(list, o_rootCause))
714     {
715         return true;
716     }
717 
718     if (AnalysisType::SYSTEM_CHECKSTOP != i_type)
719     {
720         // No system checkstop root cause attentions were found. Next, look for
721         // any recoverable or unit checkstop attentions that could be associated
722         // with a TI.
723         if (__findTiRootCause(list, o_rootCause))
724         {
725             return true;
726         }
727 
728         if (AnalysisType::TERMINATE_IMMEDIATE != i_type)
729         {
730             // No attentions associated with a system checkstop or TI were
731             // found. Simply, return the first entry in the list.
732             o_rootCause = list.front();
733             return true;
734         }
735     }
736 
737     // END WORKAROUND
738 
739     return false; // default, no active attentions found.
740 }
741 
742 //------------------------------------------------------------------------------
743 
__findIueTh(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause)744 bool __findIueTh(const std::vector<libhei::Signature>& i_list,
745                  libhei::Signature& o_rootCause)
746 {
747     auto itr = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) {
748         return (libhei::hash<libhei::NodeId_t>("RDFFIR") == t.getId() &&
749                 (17 == t.getBit() || 37 == t.getBit())) ||
750                (libhei::hash<libhei::NodeId_t>("RDF_FIR") == t.getId() &&
751                 (18 == t.getBit() || 38 == t.getBit()));
752     });
753 
754     if (i_list.end() != itr)
755     {
756         o_rootCause = *itr;
757         return true;
758     }
759 
760     return false;
761 }
762 
763 //------------------------------------------------------------------------------
764 
rootCauseSpecialCases(const libhei::IsolationData & i_isoData,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)765 void rootCauseSpecialCases(const libhei::IsolationData& i_isoData,
766                            libhei::Signature& o_rootCause,
767                            const RasDataParser& i_rasData)
768 {
769     using func = libhei::NodeId_t (*)(const std::string& i_str);
770     func __hash = libhei::hash<libhei::NodeId_t>;
771 
772     // Check for any special cases that exist for specific FIR bits.
773 
774     // If the channel fail was specifically a firmware initiated channel fail
775     // (SRQFIR[25] for Explorer OCMBs, SRQ_FIR[46] for Odyssey OCMBs) check for
776     // any IUE bits that are on that would have caused the channel fail
777     // (RDFFIR[17,37] for Explorer OCMBs, RDF_FIR_0[18,38] or RDF_FIR_1[18,38]
778     // for Odyssey OCMBs).
779 
780     // Explorer SRQFIR
781     static const auto srqfir = __hash("SRQFIR");
782     // Odyssey SRQ_FIR
783     static const auto srq_fir = __hash("SRQ_FIR");
784 
785     std::vector<libhei::Signature> list{i_isoData.getSignatureList()};
786 
787     if (((srqfir == o_rootCause.getId() && 25 == o_rootCause.getBit()) ||
788          (srq_fir == o_rootCause.getId() && 46 == o_rootCause.getBit())) &&
789         __findIueTh(list, o_rootCause))
790     {
791         // If __findIueTh returned true, o_rootCause was updated, return.
792         return;
793     }
794 
795     // Check if the root cause found was a potential side effect of an
796     // ODP data corruption error. If it was, check if any other signature
797     // in the signature list was a potential root cause.
798     auto OdpSide = RasDataParser::RasDataFlags::ODP_DATA_CORRUPT_SIDE_EFFECT;
799     auto OdpRoot = RasDataParser::RasDataFlags::ODP_DATA_CORRUPT_ROOT_CAUSE;
800     if (i_rasData.isFlagSet(o_rootCause, OdpSide))
801     {
802         for (const auto& s : list)
803         {
804             if (i_rasData.isFlagSet(s, OdpRoot))
805             {
806                 // ODP data corruption root cause found, return.
807                 o_rootCause = s;
808                 return;
809             }
810         }
811     }
812 
813     // Odyssey RDF_FIR
814     static const auto rdf_fir = __hash("RDF_FIR");
815 
816     // RDF_FIR[41] can be the root cause of RDF_FIR[16], so if bit 16 is on,
817     // check if bit 41 is also on.
818     if (rdf_fir == o_rootCause.getId() && 16 == o_rootCause.getBit())
819     {
820         // Look for RDF_FIR[41]
821         auto itr = std::find_if(list.begin(), list.end(), [&](const auto& t) {
822             return (rdf_fir == t.getId() && 41 == t.getBit());
823         });
824         if (list.end() != itr)
825         {
826             o_rootCause = *itr;
827         }
828     }
829 }
830 
831 //------------------------------------------------------------------------------
832 
filterRootCause(AnalysisType i_type,const libhei::IsolationData & i_isoData,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)833 bool filterRootCause(AnalysisType i_type,
834                      const libhei::IsolationData& i_isoData,
835                      libhei::Signature& o_rootCause,
836                      const RasDataParser& i_rasData)
837 {
838     // Find the initial root cause attention based on common rules for FIR
839     // isolation.
840     bool rc = findRootCause(i_type, i_isoData, o_rootCause, i_rasData);
841 
842     // If some root cause was found, handle any special cases for specific FIR
843     // bits that require additional logic to determine the root cause.
844     if (true == rc)
845     {
846         rootCauseSpecialCases(i_isoData, o_rootCause, i_rasData);
847     }
848 
849     return rc;
850 }
851 
852 //------------------------------------------------------------------------------
853 
854 } // namespace analyzer
855