xref: /openbmc/openpower-hw-diags/analyzer/filter-root-cause.cpp (revision 622cd4bef036f34a400dca8f7d870ddf42b1e257)
1  #include <assert.h>
2  
3  #include <analyzer/analyzer_main.hpp>
4  #include <analyzer/ras-data/ras-data-parser.hpp>
5  #include <hei_main.hpp>
6  #include <hei_util.hpp>
7  #include <util/pdbg.hpp>
8  
9  #include <algorithm>
10  #include <limits>
11  #include <string>
12  
13  namespace analyzer
14  {
15  //------------------------------------------------------------------------------
16  
__findRcsOscError(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause)17  bool __findRcsOscError(const std::vector<libhei::Signature>& i_list,
18                         libhei::Signature& o_rootCause)
19  {
20      // TODO: Consider returning all of them instead of one as root cause.
21      auto itr = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) {
22          return (libhei::hash<libhei::NodeId_t>("TP_LOCAL_FIR") == t.getId() &&
23                  (42 == t.getBit() || 43 == t.getBit()));
24      });
25  
26      if (i_list.end() != itr)
27      {
28          o_rootCause = *itr;
29          return true;
30      }
31  
32      return false;
33  }
34  
35  //------------------------------------------------------------------------------
36  
__findPllUnlock(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause)37  bool __findPllUnlock(const std::vector<libhei::Signature>& i_list,
38                       libhei::Signature& o_rootCause)
39  {
40      using namespace util::pdbg;
41  
42      // TODO: Consider returning all of them instead of one as root cause.
43  
44      auto nodeId = libhei::hash<libhei::NodeId_t>("PLL_UNLOCK");
45  
46      // First, look for any PLL unlock attentions reported by a processsor chip.
47      auto itr1 = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) {
48          return (nodeId == t.getId() &&
49                  TYPE_PROC == getTrgtType(getTrgt(t.getChip())));
50      });
51  
52      if (i_list.end() != itr1)
53      {
54          o_rootCause = *itr1;
55          return true;
56      }
57  
58      // Then, look for any PLL unlock attentions reported by an OCMB chip. This
59      // is specifically for Odyssey, which are the only OCMBs that would report
60      // PLL unlock attentions.
61      auto itr2 = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) {
62          return (nodeId == t.getId() &&
63                  TYPE_OCMB == getTrgtType(getTrgt(t.getChip())));
64      });
65  
66      if (i_list.end() != itr2)
67      {
68          o_rootCause = *itr2;
69          return true;
70      }
71  
72      return false;
73  }
74  
75  //------------------------------------------------------------------------------
76  
__findMemoryChannelFailure(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)77  bool __findMemoryChannelFailure(const std::vector<libhei::Signature>& i_list,
78                                  libhei::Signature& o_rootCause,
79                                  const RasDataParser& i_rasData)
80  {
81      using namespace util::pdbg;
82  
83      using func = libhei::NodeId_t (*)(const std::string& i_str);
84      func __hash = libhei::hash<libhei::NodeId_t>;
85  
86      static const auto mc_dstl_fir = __hash("MC_DSTL_FIR");
87      static const auto mc_ustl_fir = __hash("MC_USTL_FIR");
88      static const auto mc_omi_dl_err_rpt = __hash("MC_OMI_DL_ERR_RPT");
89  
90      // First, look for any chip checkstops from the connected OCMBs.
91      for (const auto& s : i_list)
92      {
93          if (TYPE_OCMB != getTrgtType(getTrgt(s.getChip())))
94          {
95              continue; // OCMBs only
96          }
97  
98          // TODO: The chip data for Explorer chips currently report chip
99          //       checkstops as unit checkstops. Once the chip data has been
100          //       updated, the check for unit checkstops here will need to be
101          //       removed.
102          if (libhei::ATTN_TYPE_CHIP_CS == s.getAttnType() ||
103              libhei::ATTN_TYPE_UNIT_CS == s.getAttnType())
104          {
105              o_rootCause = s;
106              return true;
107          }
108      }
109  
110      // Now, look for any channel failure attentions on the processor side of the
111      // memory bus.
112      for (const auto& s : i_list)
113      {
114          if (TYPE_PROC != getTrgtType(getTrgt(s.getChip())))
115          {
116              continue; // processors only
117          }
118  
119          // Any unit checkstop attentions that originated from the MC_DSTL_FIR or
120          // MC_USTLFIR are considered a channel failure attention.
121          // TODO: The "channel failure" designation is actually configurable via
122          //       other registers. We just happen to expect anything that is
123          //       configured to channel failure to also be configured to unit
124          //       checkstop. Eventually, we will need some mechanism to check the
125          //       configuration registers for a more accurate analysis.
126          if (libhei::ATTN_TYPE_UNIT_CS == s.getAttnType() &&
127              (mc_dstl_fir == s.getId() || mc_ustl_fir == s.getId()) &&
128              !i_rasData.isFlagSet(s,
129                                   RasDataParser::RasDataFlags::ATTN_FROM_OCMB))
130          {
131              o_rootCause = s;
132              return true;
133          }
134          // Any signatures from MC_OMI_DL_ERR_RPT feed into the only bits in
135          // MC_OMI_DL_FIR that are hardwired to channel failure.
136          else if (mc_omi_dl_err_rpt == s.getId())
137          {
138              o_rootCause = s;
139              return true;
140          }
141      }
142  
143      return false; // default, nothing found
144  }
145  
146  //------------------------------------------------------------------------------
147  
148  // Will query if a signature is a potential system checkstop root cause.
149  // attention. Note that this function excludes memory channel failure attentions
150  // which are checked in __findMemoryChannelFailure().
__findCsRootCause(const libhei::Signature & i_signature,const RasDataParser & i_rasData)151  bool __findCsRootCause(const libhei::Signature& i_signature,
152                         const RasDataParser& i_rasData)
153  {
154      // Check if the input signature has the CS_POSSIBLE or SUE_SOURCE flag set.
155      if (i_rasData.isFlagSet(i_signature,
156                              RasDataParser::RasDataFlags::CS_POSSIBLE) ||
157          i_rasData.isFlagSet(i_signature,
158                              RasDataParser::RasDataFlags::SUE_SOURCE))
159      {
160          return true;
161      }
162  
163      return false; // default, nothing found
164  }
165  
166  //------------------------------------------------------------------------------
167  
__findCsRootCause_RE(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)168  bool __findCsRootCause_RE(const std::vector<libhei::Signature>& i_list,
169                            libhei::Signature& o_rootCause,
170                            const RasDataParser& i_rasData)
171  {
172      for (const auto& s : i_list)
173      {
174          // Only looking for recoverable attentions.
175          if (libhei::ATTN_TYPE_RECOVERABLE != s.getAttnType())
176          {
177              continue;
178          }
179  
180          if (__findCsRootCause(s, i_rasData))
181          {
182              o_rootCause = s;
183              return true;
184          }
185      }
186  
187      return false; // default, nothing found
188  }
189  
190  //------------------------------------------------------------------------------
191  
__findCsRootCause_UCS(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)192  bool __findCsRootCause_UCS(const std::vector<libhei::Signature>& i_list,
193                             libhei::Signature& o_rootCause,
194                             const RasDataParser& i_rasData)
195  {
196      for (const auto& s : i_list)
197      {
198          // Only looking for unit checkstop attentions.
199          if (libhei::ATTN_TYPE_UNIT_CS != s.getAttnType())
200          {
201              continue;
202          }
203  
204          if (__findCsRootCause(s, i_rasData))
205          {
206              o_rootCause = s;
207              return true;
208          }
209      }
210  
211      return false; // default, nothing found
212  }
213  
214  //------------------------------------------------------------------------------
215  
__findOcmbAttnBits(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)216  bool __findOcmbAttnBits(const std::vector<libhei::Signature>& i_list,
217                          libhei::Signature& o_rootCause,
218                          const RasDataParser& i_rasData)
219  {
220      using namespace util::pdbg;
221  
222      // If we have any attentions from an OCMB, assume isolation to the OCMBs
223      // was successful and the ATTN_FROM_OCMB flag does not need to be checked.
224      for (const auto& s : i_list)
225      {
226          if (TYPE_OCMB == getTrgtType(getTrgt(s.getChip())))
227          {
228              return false;
229          }
230      }
231  
232      for (const auto& s : i_list)
233      {
234          if (i_rasData.isFlagSet(s, RasDataParser::RasDataFlags::ATTN_FROM_OCMB))
235          {
236              o_rootCause = s;
237              return true;
238          }
239      }
240  
241      return false; // default, nothing found
242  }
243  
244  //------------------------------------------------------------------------------
245  
__findNonExternalCs(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause)246  bool __findNonExternalCs(const std::vector<libhei::Signature>& i_list,
247                           libhei::Signature& o_rootCause)
248  {
249      using namespace util::pdbg;
250  
251      static const auto pb_ext_fir = libhei::hash<libhei::NodeId_t>("PB_EXT_FIR");
252  
253      for (const auto& s : i_list)
254      {
255          const auto targetType = getTrgtType(getTrgt(s.getChip()));
256          const auto id = s.getId();
257          const auto attnType = s.getAttnType();
258  
259          // Find any processor with chip checkstop attention that did not
260          // originate from the PB_EXT_FIR.
261          if ((TYPE_PROC == targetType) &&
262              (libhei::ATTN_TYPE_CHIP_CS == attnType) && (pb_ext_fir != id))
263          {
264              o_rootCause = s;
265              return true;
266          }
267      }
268  
269      return false; // default, nothing found
270  }
271  
272  //------------------------------------------------------------------------------
273  
__findTiRootCause(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)274  bool __findTiRootCause(const std::vector<libhei::Signature>& i_list,
275                         libhei::Signature& o_rootCause,
276                         const RasDataParser& i_rasData)
277  {
278      using namespace util::pdbg;
279      using rdf = RasDataParser::RasDataFlags;
280  
281      for (const auto& signature : i_list)
282      {
283          const auto attnType = signature.getAttnType();
284  
285          // Only looking for recoverable or unit checkstop attentions.
286          if (libhei::ATTN_TYPE_RECOVERABLE != attnType &&
287              libhei::ATTN_TYPE_UNIT_CS != attnType)
288          {
289              continue;
290          }
291  
292          // Skip any signature with the 'recovered_error' or 'informational_only'
293          // flags.
294          if (i_rasData.isFlagSet(signature, rdf::RECOVERED_ERROR) ||
295              i_rasData.isFlagSet(signature, rdf::INFORMATIONAL_ONLY) ||
296              i_rasData.isFlagSet(signature, rdf::MNFG_INFORMATIONAL_ONLY))
297          {
298              continue;
299          }
300  
301          // At this point, the attention has not been explicitly ignored. So
302          // return this signature and exit.
303          o_rootCause = signature;
304          return true;
305      }
306  
307      return false; // default, nothing found
308  }
309  
310  //------------------------------------------------------------------------------
311  
findRootCause(AnalysisType i_type,const libhei::IsolationData & i_isoData,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)312  bool findRootCause(AnalysisType i_type, const libhei::IsolationData& i_isoData,
313                     libhei::Signature& o_rootCause,
314                     const RasDataParser& i_rasData)
315  {
316      // We'll need to make a copy of the list so that the original list is
317      // maintained for the PEL.
318      std::vector<libhei::Signature> list{i_isoData.getSignatureList()};
319  
320      // START WORKAROUND
321      // TODO: Filtering should be data driven. Until that support is available,
322      //       use the following isolation rules.
323  
324      // Ensure the list is not empty before continuing.
325      if (list.empty())
326      {
327          return false; // nothing more to do
328      }
329  
330      // First, look for any RCS OSC errors. This must always be first because
331      // they can cause downstream PLL unlock attentions.
332      if (__findRcsOscError(list, o_rootCause))
333      {
334          return true;
335      }
336  
337      // Second, look for any PLL unlock attentions. This must always be second
338      // because PLL unlock attentions can cause any number of downstream
339      // attentions, including a system checkstop.
340      if (__findPllUnlock(list, o_rootCause))
341      {
342          return true;
343      }
344  
345      // Regardless of the analysis type, always look for anything that could be
346      // blamed as the root cause of a system checkstop.
347  
348      // Memory channel failure attentions will produce SUEs and likely cause
349      // downstream attentions, including a system checkstop.
350      if (__findMemoryChannelFailure(list, o_rootCause, i_rasData))
351      {
352          return true;
353      }
354  
355      // Look for any recoverable attentions that have been identified as a
356      // potential root cause of a system checkstop attention. These would include
357      // any attention that would generate an SUE. Note that is it possible for
358      // recoverables to generate unit checkstop attentions so we must check them
359      // first.
360      if (__findCsRootCause_RE(list, o_rootCause, i_rasData))
361      {
362          return true;
363      }
364  
365      // Look for any unit checkstop attentions (other than memory channel
366      // failures) that have been identified as a potential root cause of a
367      // system checkstop attention. These would include any attention that would
368      // generate an SUE.
369      if (__findCsRootCause_UCS(list, o_rootCause, i_rasData))
370      {
371          return true;
372      }
373  
374      // If no other viable root cause has been found, check for any signatures
375      // with the ATTN_FROM_OCMB flag in case there was an attention from an
376      // inaccessible OCMB.
377      if (__findOcmbAttnBits(list, o_rootCause, i_rasData))
378      {
379          return true;
380      }
381  
382      // Look for any system checkstop attentions that originated from within the
383      // chip that reported the attention. In other words, no external checkstop
384      // attentions.
385      if (__findNonExternalCs(list, o_rootCause))
386      {
387          return true;
388      }
389  
390      if (AnalysisType::SYSTEM_CHECKSTOP != i_type)
391      {
392          // No system checkstop root cause attentions were found. Next, look for
393          // any recoverable or unit checkstop attentions that could be associated
394          // with a TI.
395          if (__findTiRootCause(list, o_rootCause, i_rasData))
396          {
397              return true;
398          }
399  
400          if (AnalysisType::TERMINATE_IMMEDIATE != i_type)
401          {
402              // No attentions associated with a system checkstop or TI were
403              // found. Simply, return the first entry in the list.
404              o_rootCause = list.front();
405              return true;
406          }
407      }
408  
409      // END WORKAROUND
410  
411      return false; // default, no active attentions found.
412  }
413  
414  //------------------------------------------------------------------------------
415  
__findIueTh(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause)416  bool __findIueTh(const std::vector<libhei::Signature>& i_list,
417                   libhei::Signature& o_rootCause)
418  {
419      auto itr = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) {
420          return (libhei::hash<libhei::NodeId_t>("RDFFIR") == t.getId() &&
421                  (17 == t.getBit() || 37 == t.getBit())) ||
422                 (libhei::hash<libhei::NodeId_t>("RDF_FIR") == t.getId() &&
423                  (18 == t.getBit() || 38 == t.getBit()));
424      });
425  
426      if (i_list.end() != itr)
427      {
428          o_rootCause = *itr;
429          return true;
430      }
431  
432      return false;
433  }
434  
435  //------------------------------------------------------------------------------
436  
rootCauseSpecialCases(const libhei::IsolationData & i_isoData,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)437  void rootCauseSpecialCases(const libhei::IsolationData& i_isoData,
438                             libhei::Signature& o_rootCause,
439                             const RasDataParser& i_rasData)
440  {
441      using func = libhei::NodeId_t (*)(const std::string& i_str);
442      func __hash = libhei::hash<libhei::NodeId_t>;
443  
444      // Check for any special cases that exist for specific FIR bits.
445  
446      // If the channel fail was specifically a firmware initiated channel fail
447      // (SRQFIR[25] for Explorer OCMBs, SRQ_FIR[46] for Odyssey OCMBs) check for
448      // any IUE bits that are on that would have caused the channel fail
449      // (RDFFIR[17,37] for Explorer OCMBs, RDF_FIR_0[18,38] or RDF_FIR_1[18,38]
450      // for Odyssey OCMBs).
451  
452      // Explorer SRQFIR
453      static const auto srqfir = __hash("SRQFIR");
454      // Odyssey SRQ_FIR
455      static const auto srq_fir = __hash("SRQ_FIR");
456  
457      std::vector<libhei::Signature> list{i_isoData.getSignatureList()};
458  
459      if (((srqfir == o_rootCause.getId() && 25 == o_rootCause.getBit()) ||
460           (srq_fir == o_rootCause.getId() && 46 == o_rootCause.getBit())) &&
461          __findIueTh(list, o_rootCause))
462      {
463          // If __findIueTh returned true, o_rootCause was updated, return.
464          return;
465      }
466  
467      // Check if the root cause found was a potential side effect of an
468      // ODP data corruption error. If it was, check if any other signature
469      // in the signature list was a potential root cause.
470      auto OdpSide = RasDataParser::RasDataFlags::ODP_DATA_CORRUPT_SIDE_EFFECT;
471      auto OdpRoot = RasDataParser::RasDataFlags::ODP_DATA_CORRUPT_ROOT_CAUSE;
472      if (i_rasData.isFlagSet(o_rootCause, OdpSide))
473      {
474          for (const auto& s : list)
475          {
476              if (i_rasData.isFlagSet(s, OdpRoot))
477              {
478                  // ODP data corruption root cause found, return.
479                  o_rootCause = s;
480                  return;
481              }
482          }
483      }
484  
485      // Odyssey RDF_FIR
486      static const auto rdf_fir = __hash("RDF_FIR");
487  
488      // RDF_FIR[41] can be the root cause of RDF_FIR[16], so if bit 16 is on,
489      // check if bit 41 is also on.
490      if (rdf_fir == o_rootCause.getId() && 16 == o_rootCause.getBit())
491      {
492          // Look for RDF_FIR[41]
493          auto itr = std::find_if(list.begin(), list.end(), [&](const auto& t) {
494              return (rdf_fir == t.getId() && 41 == t.getBit());
495          });
496          if (list.end() != itr)
497          {
498              o_rootCause = *itr;
499          }
500      }
501  }
502  
503  //------------------------------------------------------------------------------
504  
filterRootCause(AnalysisType i_type,const libhei::IsolationData & i_isoData,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)505  bool filterRootCause(AnalysisType i_type,
506                       const libhei::IsolationData& i_isoData,
507                       libhei::Signature& o_rootCause,
508                       const RasDataParser& i_rasData)
509  {
510      // Find the initial root cause attention based on common rules for FIR
511      // isolation.
512      bool rc = findRootCause(i_type, i_isoData, o_rootCause, i_rasData);
513  
514      // If some root cause was found, handle any special cases for specific FIR
515      // bits that require additional logic to determine the root cause.
516      if (true == rc)
517      {
518          rootCauseSpecialCases(i_isoData, o_rootCause, i_rasData);
519      }
520  
521      return rc;
522  }
523  
524  //------------------------------------------------------------------------------
525  
526  } // namespace analyzer
527