xref: /openbmc/openpower-hw-diags/analyzer/filter-root-cause.cpp (revision 0cab8ada29d783a1008285ca2166b5af632eaa26)
1 #include <assert.h>
2 
3 #include <analyzer/analyzer_main.hpp>
4 #include <analyzer/plugins/plugin.hpp>
5 #include <analyzer/ras-data/ras-data-parser.hpp>
6 #include <hei_main.hpp>
7 #include <hei_util.hpp>
8 #include <util/pdbg.hpp>
9 
10 #include <algorithm>
11 #include <limits>
12 #include <string>
13 
14 namespace analyzer
15 {
16 //------------------------------------------------------------------------------
17 
__lookForBits(const std::vector<libhei::Signature> & i_sigList,libhei::Signature & o_rootCause,std::vector<libhei::ChipType_t> i_chipTypes,const char * i_fir,std::vector<uint8_t> i_bitList)18 bool __lookForBits(const std::vector<libhei::Signature>& i_sigList,
19                    libhei::Signature& o_rootCause,
20                    std::vector<libhei::ChipType_t> i_chipTypes,
21                    const char* i_fir, std::vector<uint8_t> i_bitList)
22 {
23     using func = libhei::NodeId_t (*)(const std::string& i_str);
24     func __hash = libhei::hash<libhei::NodeId_t>;
25 
26     auto itr =
27         std::find_if(i_sigList.begin(), i_sigList.end(), [&](const auto& sig) {
28             for (const auto& type : i_chipTypes)
29             {
30                 if (type != sig.getChip().getType())
31                 {
32                     continue;
33                 }
34                 for (const auto& bit : i_bitList)
35                 {
36                     if (__hash(i_fir) == sig.getId() && bit == sig.getBit())
37                     {
38                         return true;
39                     }
40                     else
41                     {
42                         continue;
43                     }
44                 }
45             }
46             return false;
47         });
48 
49     if (i_sigList.end() != itr)
50     {
51         o_rootCause = *itr;
52         return true;
53     }
54 
55     return false;
56 }
57 
58 //------------------------------------------------------------------------------
59 
__findPllUnlock(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause)60 bool __findPllUnlock(const std::vector<libhei::Signature>& i_list,
61                      libhei::Signature& o_rootCause)
62 {
63     using namespace util::pdbg;
64 
65     // TODO: Consider returning all of them instead of one as root cause.
66 
67     auto nodeId = libhei::hash<libhei::NodeId_t>("PLL_UNLOCK");
68 
69     // First, look for any PLL unlock attentions reported by a processsor chip.
70     auto itr1 = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) {
71         return (nodeId == t.getId() &&
72                 TYPE_PROC == getTrgtType(getTrgt(t.getChip())));
73     });
74 
75     if (i_list.end() != itr1)
76     {
77         o_rootCause = *itr1;
78         return true;
79     }
80 
81     // Then, look for any PLL unlock attentions reported by an OCMB chip. This
82     // is specifically for Odyssey, which are the only OCMBs that would report
83     // PLL unlock attentions.
84     auto itr2 = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) {
85         return (nodeId == t.getId() &&
86                 TYPE_OCMB == getTrgtType(getTrgt(t.getChip())));
87     });
88 
89     if (i_list.end() != itr2)
90     {
91         o_rootCause = *itr2;
92         return true;
93     }
94 
95     return false;
96 }
97 
98 //------------------------------------------------------------------------------
99 
__findMemoryChannelFailure(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)100 bool __findMemoryChannelFailure(const std::vector<libhei::Signature>& i_list,
101                                 libhei::Signature& o_rootCause,
102                                 const RasDataParser& i_rasData)
103 {
104     using namespace util::pdbg;
105 
106     using func = libhei::NodeId_t (*)(const std::string& i_str);
107     func __hash = libhei::hash<libhei::NodeId_t>;
108 
109     static const auto mc_dstl_fir = __hash("MC_DSTL_FIR");
110     static const auto mc_ustl_fir = __hash("MC_USTL_FIR");
111     static const auto mc_omi_dl_err_rpt = __hash("MC_OMI_DL_ERR_RPT");
112 
113     // First, look for any chip checkstops from the connected OCMBs.
114     for (const auto& s : i_list)
115     {
116         if (TYPE_OCMB != getTrgtType(getTrgt(s.getChip())))
117         {
118             continue; // OCMBs only
119         }
120 
121         // TODO: The chip data for Explorer chips currently report chip
122         //       checkstops as unit checkstops. Once the chip data has been
123         //       updated, the check for unit checkstops here will need to be
124         //       removed.
125         if (libhei::ATTN_TYPE_CHIP_CS == s.getAttnType() ||
126             libhei::ATTN_TYPE_UNIT_CS == s.getAttnType())
127         {
128             o_rootCause = s;
129             return true;
130         }
131     }
132 
133     // Now, look for any channel failure attentions on the processor side of the
134     // memory bus.
135     for (const auto& s : i_list)
136     {
137         if (TYPE_PROC != getTrgtType(getTrgt(s.getChip())))
138         {
139             continue; // processors only
140         }
141 
142         // Any unit checkstop attentions that originated from the MC_DSTL_FIR or
143         // MC_USTLFIR are considered a channel failure attention.
144         // TODO: The "channel failure" designation is actually configurable via
145         //       other registers. We just happen to expect anything that is
146         //       configured to channel failure to also be configured to unit
147         //       checkstop. Eventually, we will need some mechanism to check the
148         //       configuration registers for a more accurate analysis.
149         if (libhei::ATTN_TYPE_UNIT_CS == s.getAttnType() &&
150             (mc_dstl_fir == s.getId() || mc_ustl_fir == s.getId()) &&
151             (s.getChip().getType() == analyzer::P10_10 ||
152              s.getChip().getType() == analyzer::P10_20) &&
153             !i_rasData.isFlagSet(s,
154                                  RasDataParser::RasDataFlags::ATTN_FROM_OCMB))
155         {
156             o_rootCause = s;
157             return true;
158         }
159         // Any signatures from MC_OMI_DL_ERR_RPT feed into the only bits in
160         // MC_OMI_DL_FIR that are hardwired to channel failure.
161         else if (mc_omi_dl_err_rpt == s.getId())
162         {
163             o_rootCause = s;
164             return true;
165         }
166     }
167 
168     return false; // default, nothing found
169 }
170 
171 //------------------------------------------------------------------------------
172 
173 // Will query if a signature is a potential system checkstop root cause.
174 // attention. Note that this function excludes memory channel failure attentions
175 // which are checked in __findMemoryChannelFailure().
__findCsRootCause(const libhei::Signature & i_signature,const RasDataParser & i_rasData)176 bool __findCsRootCause(const libhei::Signature& i_signature,
177                        const RasDataParser& i_rasData)
178 {
179     // Check if the input signature has the CS_POSSIBLE or SUE_SOURCE flag set.
180     if (i_rasData.isFlagSet(i_signature,
181                             RasDataParser::RasDataFlags::CS_POSSIBLE) ||
182         i_rasData.isFlagSet(i_signature,
183                             RasDataParser::RasDataFlags::SUE_SOURCE))
184     {
185         return true;
186     }
187 
188     return false; // default, nothing found
189 }
190 
191 //------------------------------------------------------------------------------
192 
__findCsRootCause_RE(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)193 bool __findCsRootCause_RE(const std::vector<libhei::Signature>& i_list,
194                           libhei::Signature& o_rootCause,
195                           const RasDataParser& i_rasData)
196 {
197     for (const auto& s : i_list)
198     {
199         // Only looking for recoverable attentions.
200         if (libhei::ATTN_TYPE_RECOVERABLE != s.getAttnType())
201         {
202             continue;
203         }
204 
205         if (__findCsRootCause(s, i_rasData))
206         {
207             o_rootCause = s;
208             return true;
209         }
210     }
211 
212     return false; // default, nothing found
213 }
214 
215 //------------------------------------------------------------------------------
216 
__findCsRootCause_UCS(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)217 bool __findCsRootCause_UCS(const std::vector<libhei::Signature>& i_list,
218                            libhei::Signature& o_rootCause,
219                            const RasDataParser& i_rasData)
220 {
221     for (const auto& s : i_list)
222     {
223         // Only looking for unit checkstop attentions.
224         if (libhei::ATTN_TYPE_UNIT_CS != s.getAttnType())
225         {
226             continue;
227         }
228 
229         if (__findCsRootCause(s, i_rasData))
230         {
231             o_rootCause = s;
232             return true;
233         }
234     }
235 
236     return false; // default, nothing found
237 }
238 
239 //------------------------------------------------------------------------------
240 
__findOcmbAttnBits(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)241 bool __findOcmbAttnBits(const std::vector<libhei::Signature>& i_list,
242                         libhei::Signature& o_rootCause,
243                         const RasDataParser& i_rasData)
244 {
245     using namespace util::pdbg;
246 
247     // If we have any attentions from an OCMB, assume isolation to the OCMBs
248     // was successful and the ATTN_FROM_OCMB flag does not need to be checked.
249     for (const auto& s : i_list)
250     {
251         if (TYPE_OCMB == getTrgtType(getTrgt(s.getChip())))
252         {
253             return false;
254         }
255     }
256 
257     for (const auto& s : i_list)
258     {
259         if (i_rasData.isFlagSet(s, RasDataParser::RasDataFlags::ATTN_FROM_OCMB))
260         {
261             o_rootCause = s;
262             return true;
263         }
264     }
265 
266     return false; // default, nothing found
267 }
268 
269 //------------------------------------------------------------------------------
270 
__findNonExternalCs(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause)271 bool __findNonExternalCs(const std::vector<libhei::Signature>& i_list,
272                          libhei::Signature& o_rootCause)
273 {
274     using namespace util::pdbg;
275 
276     static const auto pb_ext_fir = libhei::hash<libhei::NodeId_t>("PB_EXT_FIR");
277 
278     for (const auto& s : i_list)
279     {
280         const auto targetType = getTrgtType(getTrgt(s.getChip()));
281         const auto id = s.getId();
282         const auto attnType = s.getAttnType();
283 
284         // Find any processor with chip checkstop attention that did not
285         // originate from the PB_EXT_FIR.
286         if ((TYPE_PROC == targetType) &&
287             (libhei::ATTN_TYPE_CHIP_CS == attnType) && (pb_ext_fir != id))
288         {
289             o_rootCause = s;
290             return true;
291         }
292     }
293 
294     return false; // default, nothing found
295 }
296 
297 //------------------------------------------------------------------------------
298 
__findTiRootCause(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)299 bool __findTiRootCause(const std::vector<libhei::Signature>& i_list,
300                        libhei::Signature& o_rootCause,
301                        const RasDataParser& i_rasData)
302 {
303     using namespace util::pdbg;
304     using rdf = RasDataParser::RasDataFlags;
305 
306     for (const auto& signature : i_list)
307     {
308         const auto attnType = signature.getAttnType();
309 
310         // Only looking for recoverable or unit checkstop attentions.
311         if (libhei::ATTN_TYPE_RECOVERABLE != attnType &&
312             libhei::ATTN_TYPE_UNIT_CS != attnType)
313         {
314             continue;
315         }
316 
317         // Skip any signature with the 'recovered_error', 'informational_only',
318         // or 'attn_from_ocmb' flags.
319         if (i_rasData.isFlagSet(signature, rdf::RECOVERED_ERROR) ||
320             i_rasData.isFlagSet(signature, rdf::INFORMATIONAL_ONLY) ||
321             i_rasData.isFlagSet(signature, rdf::MNFG_INFORMATIONAL_ONLY) ||
322             i_rasData.isFlagSet(signature, rdf::ATTN_FROM_OCMB))
323         {
324             continue;
325         }
326 
327         // At this point, the attention has not been explicitly ignored. So
328         // return this signature and exit.
329         o_rootCause = signature;
330         return true;
331     }
332 
333     return false; // default, nothing found
334 }
335 
336 //------------------------------------------------------------------------------
337 
findRootCause(AnalysisType i_type,const libhei::IsolationData & i_isoData,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)338 bool findRootCause(AnalysisType i_type, const libhei::IsolationData& i_isoData,
339                    libhei::Signature& o_rootCause,
340                    const RasDataParser& i_rasData)
341 {
342     // We'll need to make a copy of the list so that the original list is
343     // maintained for the PEL.
344     std::vector<libhei::Signature> list{i_isoData.getSignatureList()};
345 
346     // START WORKAROUND
347     // TODO: Filtering should be data driven. Until that support is available,
348     //       use the following isolation rules.
349 
350     // Ensure the list is not empty before continuing.
351     if (list.empty())
352     {
353         return false; // nothing more to do
354     }
355 
356     // First, look for any RCS OSC errors. This must always be first because
357     // they can cause downstream PLL unlock attentions.
358     if (__lookForBits(list, o_rootCause, {analyzer::P10_10, analyzer::P10_20},
359                       "TP_LOCAL_FIR", {42, 43}))
360     {
361         return true;
362     }
363 
364     // Second, look for any PLL unlock attentions. This must always be second
365     // because PLL unlock attentions can cause any number of downstream
366     // attentions, including a system checkstop.
367     if (__findPllUnlock(list, o_rootCause))
368     {
369         return true;
370     }
371 
372     // Regardless of the analysis type, always look for anything that could be
373     // blamed as the root cause of a system checkstop.
374 
375     // Memory channel failure attentions will produce SUEs and likely cause
376     // downstream attentions, including a system checkstop.
377     if (__findMemoryChannelFailure(list, o_rootCause, i_rasData))
378     {
379         return true;
380     }
381 
382     // Look for any recoverable attentions that have been identified as a
383     // potential root cause of a system checkstop attention. These would include
384     // any attention that would generate an SUE. Note that is it possible for
385     // recoverables to generate unit checkstop attentions so we must check them
386     // first.
387     if (__findCsRootCause_RE(list, o_rootCause, i_rasData))
388     {
389         return true;
390     }
391 
392     // Look for any unit checkstop attentions (other than memory channel
393     // failures) that have been identified as a potential root cause of a
394     // system checkstop attention. These would include any attention that would
395     // generate an SUE.
396     if (__findCsRootCause_UCS(list, o_rootCause, i_rasData))
397     {
398         return true;
399     }
400 
401     // If no other viable root cause has been found, check for any signatures
402     // with the ATTN_FROM_OCMB flag in case there was an attention from an
403     // inaccessible OCMB.
404     if (__findOcmbAttnBits(list, o_rootCause, i_rasData))
405     {
406         return true;
407     }
408 
409     // Look for any system checkstop attentions that originated from within the
410     // chip that reported the attention. In other words, no external checkstop
411     // attentions.
412     if (__findNonExternalCs(list, o_rootCause))
413     {
414         return true;
415     }
416 
417     if (AnalysisType::SYSTEM_CHECKSTOP != i_type)
418     {
419         // No system checkstop root cause attentions were found. Next, look for
420         // any recoverable or unit checkstop attentions that could be associated
421         // with a TI.
422         if (__findTiRootCause(list, o_rootCause, i_rasData))
423         {
424             return true;
425         }
426 
427         if (AnalysisType::TERMINATE_IMMEDIATE != i_type)
428         {
429             // No attentions associated with a system checkstop or TI were
430             // found. Simply, return the first entry in the list.
431             o_rootCause = list.front();
432             return true;
433         }
434     }
435 
436     // END WORKAROUND
437 
438     return false; // default, no active attentions found.
439 }
440 
441 //------------------------------------------------------------------------------
442 
__findIueTh(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause)443 bool __findIueTh(const std::vector<libhei::Signature>& i_list,
444                  libhei::Signature& o_rootCause)
445 {
446     auto itr = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) {
447         return (libhei::hash<libhei::NodeId_t>("RDFFIR") == t.getId() &&
448                 (17 == t.getBit() || 37 == t.getBit())) ||
449                (libhei::hash<libhei::NodeId_t>("RDF_FIR") == t.getId() &&
450                 (18 == t.getBit() || 38 == t.getBit()));
451     });
452 
453     if (i_list.end() != itr)
454     {
455         o_rootCause = *itr;
456         return true;
457     }
458 
459     return false;
460 }
461 
462 //------------------------------------------------------------------------------
463 
rootCauseSpecialCases(const libhei::IsolationData & i_isoData,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)464 void rootCauseSpecialCases(const libhei::IsolationData& i_isoData,
465                            libhei::Signature& o_rootCause,
466                            const RasDataParser& i_rasData)
467 {
468     using func = libhei::NodeId_t (*)(const std::string& i_str);
469     func __hash = libhei::hash<libhei::NodeId_t>;
470 
471     // Check for any special cases that exist for specific FIR bits.
472 
473     // If the channel fail was specifically a firmware initiated channel fail
474     // (SRQFIR[25] for Explorer OCMBs, SRQ_FIR[46] for Odyssey OCMBs) check for
475     // any IUE bits that are on that would have caused the channel fail
476     // (RDFFIR[17,37] for Explorer OCMBs, RDF_FIR_0[18,38] or RDF_FIR_1[18,38]
477     // for Odyssey OCMBs).
478 
479     // Explorer SRQFIR
480     static const auto srqfir = __hash("SRQFIR");
481     // Odyssey SRQ_FIR
482     static const auto srq_fir = __hash("SRQ_FIR");
483 
484     std::vector<libhei::Signature> list{i_isoData.getSignatureList()};
485 
486     if (((srqfir == o_rootCause.getId() && 25 == o_rootCause.getBit()) ||
487          (srq_fir == o_rootCause.getId() && 46 == o_rootCause.getBit())) &&
488         __findIueTh(list, o_rootCause))
489     {
490         // If __findIueTh returned true, o_rootCause was updated, return.
491         return;
492     }
493 
494     // Check if the root cause found was a potential side effect of an
495     // ODP data corruption error. If it was, check if any other signature
496     // in the signature list was a potential root cause.
497     auto OdpSide = RasDataParser::RasDataFlags::ODP_DATA_CORRUPT_SIDE_EFFECT;
498     auto OdpRoot = RasDataParser::RasDataFlags::ODP_DATA_CORRUPT_ROOT_CAUSE;
499     if (i_rasData.isFlagSet(o_rootCause, OdpSide))
500     {
501         for (const auto& s : list)
502         {
503             if (i_rasData.isFlagSet(s, OdpRoot))
504             {
505                 // ODP data corruption root cause found, return.
506                 o_rootCause = s;
507                 return;
508             }
509         }
510     }
511 
512     // Odyssey RDF_FIR
513     static const auto rdf_fir = __hash("RDF_FIR");
514 
515     // RDF_FIR[41] can be the root cause of RDF_FIR[16], so if bit 16 is on,
516     // check if bit 41 is also on.
517     if (rdf_fir == o_rootCause.getId() && 16 == o_rootCause.getBit())
518     {
519         // Look for RDF_FIR[41]
520         auto itr = std::find_if(list.begin(), list.end(), [&](const auto& t) {
521             return (rdf_fir == t.getId() && 41 == t.getBit());
522         });
523         if (list.end() != itr)
524         {
525             o_rootCause = *itr;
526         }
527     }
528 }
529 
530 //------------------------------------------------------------------------------
531 
filterRootCause(AnalysisType i_type,const libhei::IsolationData & i_isoData,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)532 bool filterRootCause(AnalysisType i_type,
533                      const libhei::IsolationData& i_isoData,
534                      libhei::Signature& o_rootCause,
535                      const RasDataParser& i_rasData)
536 {
537     // Find the initial root cause attention based on common rules for FIR
538     // isolation.
539     bool rc = findRootCause(i_type, i_isoData, o_rootCause, i_rasData);
540 
541     // If some root cause was found, handle any special cases for specific FIR
542     // bits that require additional logic to determine the root cause.
543     if (true == rc)
544     {
545         rootCauseSpecialCases(i_isoData, o_rootCause, i_rasData);
546     }
547 
548     return rc;
549 }
550 
551 //------------------------------------------------------------------------------
552 
553 } // namespace analyzer
554