xref: /openbmc/openpower-hw-diags/analyzer/filter-root-cause.cpp (revision 34b0ce19b894e4e1b56d33134898a5919d8e2e48)
1 #include <assert.h>
2 
3 #include <analyzer/analyzer_main.hpp>
4 #include <analyzer/ras-data/ras-data-parser.hpp>
5 #include <hei_main.hpp>
6 #include <hei_util.hpp>
7 #include <util/pdbg.hpp>
8 
9 #include <algorithm>
10 #include <limits>
11 #include <string>
12 
13 namespace analyzer
14 {
15 //------------------------------------------------------------------------------
16 
__findRcsOscError(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause)17 bool __findRcsOscError(const std::vector<libhei::Signature>& i_list,
18                        libhei::Signature& o_rootCause)
19 {
20     // TODO: Consider returning all of them instead of one as root cause.
21     auto itr = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) {
22         return (libhei::hash<libhei::NodeId_t>("TP_LOCAL_FIR") == t.getId() &&
23                 (42 == t.getBit() || 43 == t.getBit()));
24     });
25 
26     if (i_list.end() != itr)
27     {
28         o_rootCause = *itr;
29         return true;
30     }
31 
32     return false;
33 }
34 
35 //------------------------------------------------------------------------------
36 
__findPllUnlock(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause)37 bool __findPllUnlock(const std::vector<libhei::Signature>& i_list,
38                      libhei::Signature& o_rootCause)
39 {
40     using namespace util::pdbg;
41 
42     // TODO: Consider returning all of them instead of one as root cause.
43 
44     auto nodeId = libhei::hash<libhei::NodeId_t>("PLL_UNLOCK");
45 
46     // First, look for any PLL unlock attentions reported by a processsor chip.
47     auto itr1 = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) {
48         return (nodeId == t.getId() &&
49                 TYPE_PROC == getTrgtType(getTrgt(t.getChip())));
50     });
51 
52     if (i_list.end() != itr1)
53     {
54         o_rootCause = *itr1;
55         return true;
56     }
57 
58     // Then, look for any PLL unlock attentions reported by an OCMB chip. This
59     // is specifically for Odyssey, which are the only OCMBs that would report
60     // PLL unlock attentions.
61     auto itr2 = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) {
62         return (nodeId == t.getId() &&
63                 TYPE_OCMB == getTrgtType(getTrgt(t.getChip())));
64     });
65 
66     if (i_list.end() != itr2)
67     {
68         o_rootCause = *itr2;
69         return true;
70     }
71 
72     return false;
73 }
74 
75 //------------------------------------------------------------------------------
76 
__findMemoryChannelFailure(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)77 bool __findMemoryChannelFailure(const std::vector<libhei::Signature>& i_list,
78                                 libhei::Signature& o_rootCause,
79                                 const RasDataParser& i_rasData)
80 {
81     using namespace util::pdbg;
82 
83     using func = libhei::NodeId_t (*)(const std::string& i_str);
84     func __hash = libhei::hash<libhei::NodeId_t>;
85 
86     static const auto mc_dstl_fir = __hash("MC_DSTL_FIR");
87     static const auto mc_ustl_fir = __hash("MC_USTL_FIR");
88     static const auto mc_omi_dl_err_rpt = __hash("MC_OMI_DL_ERR_RPT");
89 
90     // First, look for any chip checkstops from the connected OCMBs.
91     for (const auto& s : i_list)
92     {
93         if (TYPE_OCMB != getTrgtType(getTrgt(s.getChip())))
94         {
95             continue; // OCMBs only
96         }
97 
98         // TODO: The chip data for Explorer chips currently report chip
99         //       checkstops as unit checkstops. Once the chip data has been
100         //       updated, the check for unit checkstops here will need to be
101         //       removed.
102         if (libhei::ATTN_TYPE_CHIP_CS == s.getAttnType() ||
103             libhei::ATTN_TYPE_UNIT_CS == s.getAttnType())
104         {
105             o_rootCause = s;
106             return true;
107         }
108     }
109 
110     // Now, look for any channel failure attentions on the processor side of the
111     // memory bus.
112     for (const auto& s : i_list)
113     {
114         if (TYPE_PROC != getTrgtType(getTrgt(s.getChip())))
115         {
116             continue; // processors only
117         }
118 
119         // Any unit checkstop attentions that originated from the MC_DSTL_FIR or
120         // MC_USTLFIR are considered a channel failure attention.
121         // TODO: The "channel failure" designation is actually configurable via
122         //       other registers. We just happen to expect anything that is
123         //       configured to channel failure to also be configured to unit
124         //       checkstop. Eventually, we will need some mechanism to check the
125         //       configuration registers for a more accurate analysis.
126         if (libhei::ATTN_TYPE_UNIT_CS == s.getAttnType() &&
127             (mc_dstl_fir == s.getId() || mc_ustl_fir == s.getId()) &&
128             !i_rasData.isFlagSet(s,
129                                  RasDataParser::RasDataFlags::ATTN_FROM_OCMB))
130         {
131             o_rootCause = s;
132             return true;
133         }
134         // Any signatures from MC_OMI_DL_ERR_RPT feed into the only bits in
135         // MC_OMI_DL_FIR that are hardwired to channel failure.
136         else if (mc_omi_dl_err_rpt == s.getId())
137         {
138             o_rootCause = s;
139             return true;
140         }
141     }
142 
143     return false; // default, nothing found
144 }
145 
146 //------------------------------------------------------------------------------
147 
148 // Will query if a signature is a potential system checkstop root cause.
149 // attention. Note that this function excludes memory channel failure attentions
150 // which are checked in __findMemoryChannelFailure().
__findCsRootCause(const libhei::Signature & i_signature,const RasDataParser & i_rasData)151 bool __findCsRootCause(const libhei::Signature& i_signature,
152                        const RasDataParser& i_rasData)
153 {
154     // Check if the input signature has the CS_POSSIBLE or SUE_SOURCE flag set.
155     if (i_rasData.isFlagSet(i_signature,
156                             RasDataParser::RasDataFlags::CS_POSSIBLE) ||
157         i_rasData.isFlagSet(i_signature,
158                             RasDataParser::RasDataFlags::SUE_SOURCE))
159     {
160         return true;
161     }
162 
163     return false; // default, nothing found
164 }
165 
166 //------------------------------------------------------------------------------
167 
__findCsRootCause_RE(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)168 bool __findCsRootCause_RE(const std::vector<libhei::Signature>& i_list,
169                           libhei::Signature& o_rootCause,
170                           const RasDataParser& i_rasData)
171 {
172     for (const auto& s : i_list)
173     {
174         // Only looking for recoverable attentions.
175         if (libhei::ATTN_TYPE_RECOVERABLE != s.getAttnType())
176         {
177             continue;
178         }
179 
180         if (__findCsRootCause(s, i_rasData))
181         {
182             o_rootCause = s;
183             return true;
184         }
185     }
186 
187     return false; // default, nothing found
188 }
189 
190 //------------------------------------------------------------------------------
191 
__findCsRootCause_UCS(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)192 bool __findCsRootCause_UCS(const std::vector<libhei::Signature>& i_list,
193                            libhei::Signature& o_rootCause,
194                            const RasDataParser& i_rasData)
195 {
196     for (const auto& s : i_list)
197     {
198         // Only looking for unit checkstop attentions.
199         if (libhei::ATTN_TYPE_UNIT_CS != s.getAttnType())
200         {
201             continue;
202         }
203 
204         if (__findCsRootCause(s, i_rasData))
205         {
206             o_rootCause = s;
207             return true;
208         }
209     }
210 
211     return false; // default, nothing found
212 }
213 
214 //------------------------------------------------------------------------------
215 
__findOcmbAttnBits(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)216 bool __findOcmbAttnBits(const std::vector<libhei::Signature>& i_list,
217                         libhei::Signature& o_rootCause,
218                         const RasDataParser& i_rasData)
219 {
220     using namespace util::pdbg;
221 
222     // If we have any attentions from an OCMB, assume isolation to the OCMBs
223     // was successful and the ATTN_FROM_OCMB flag does not need to be checked.
224     for (const auto& s : i_list)
225     {
226         if (TYPE_OCMB == getTrgtType(getTrgt(s.getChip())))
227         {
228             return false;
229         }
230     }
231 
232     for (const auto& s : i_list)
233     {
234         if (i_rasData.isFlagSet(s, RasDataParser::RasDataFlags::ATTN_FROM_OCMB))
235         {
236             o_rootCause = s;
237             return true;
238         }
239     }
240 
241     return false; // default, nothing found
242 }
243 
244 //------------------------------------------------------------------------------
245 
__findNonExternalCs(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause)246 bool __findNonExternalCs(const std::vector<libhei::Signature>& i_list,
247                          libhei::Signature& o_rootCause)
248 {
249     using namespace util::pdbg;
250 
251     static const auto pb_ext_fir = libhei::hash<libhei::NodeId_t>("PB_EXT_FIR");
252 
253     for (const auto& s : i_list)
254     {
255         const auto targetType = getTrgtType(getTrgt(s.getChip()));
256         const auto id = s.getId();
257         const auto attnType = s.getAttnType();
258 
259         // Find any processor with chip checkstop attention that did not
260         // originate from the PB_EXT_FIR.
261         if ((TYPE_PROC == targetType) &&
262             (libhei::ATTN_TYPE_CHIP_CS == attnType) && (pb_ext_fir != id))
263         {
264             o_rootCause = s;
265             return true;
266         }
267     }
268 
269     return false; // default, nothing found
270 }
271 
272 //------------------------------------------------------------------------------
273 
__findTiRootCause(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)274 bool __findTiRootCause(const std::vector<libhei::Signature>& i_list,
275                        libhei::Signature& o_rootCause,
276                        const RasDataParser& i_rasData)
277 {
278     using namespace util::pdbg;
279     using rdf = RasDataParser::RasDataFlags;
280 
281     for (const auto& signature : i_list)
282     {
283         const auto attnType = signature.getAttnType();
284 
285         // Only looking for recoverable or unit checkstop attentions.
286         if (libhei::ATTN_TYPE_RECOVERABLE != attnType &&
287             libhei::ATTN_TYPE_UNIT_CS != attnType)
288         {
289             continue;
290         }
291 
292         // Skip any signature with the 'recovered_error', 'informational_only',
293         // or 'attn_from_ocmb' flags.
294         if (i_rasData.isFlagSet(signature, rdf::RECOVERED_ERROR) ||
295             i_rasData.isFlagSet(signature, rdf::INFORMATIONAL_ONLY) ||
296             i_rasData.isFlagSet(signature, rdf::MNFG_INFORMATIONAL_ONLY) ||
297             i_rasData.isFlagSet(signature, rdf::ATTN_FROM_OCMB))
298         {
299             continue;
300         }
301 
302         // At this point, the attention has not been explicitly ignored. So
303         // return this signature and exit.
304         o_rootCause = signature;
305         return true;
306     }
307 
308     return false; // default, nothing found
309 }
310 
311 //------------------------------------------------------------------------------
312 
findRootCause(AnalysisType i_type,const libhei::IsolationData & i_isoData,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)313 bool findRootCause(AnalysisType i_type, const libhei::IsolationData& i_isoData,
314                    libhei::Signature& o_rootCause,
315                    const RasDataParser& i_rasData)
316 {
317     // We'll need to make a copy of the list so that the original list is
318     // maintained for the PEL.
319     std::vector<libhei::Signature> list{i_isoData.getSignatureList()};
320 
321     // START WORKAROUND
322     // TODO: Filtering should be data driven. Until that support is available,
323     //       use the following isolation rules.
324 
325     // Ensure the list is not empty before continuing.
326     if (list.empty())
327     {
328         return false; // nothing more to do
329     }
330 
331     // First, look for any RCS OSC errors. This must always be first because
332     // they can cause downstream PLL unlock attentions.
333     if (__findRcsOscError(list, o_rootCause))
334     {
335         return true;
336     }
337 
338     // Second, look for any PLL unlock attentions. This must always be second
339     // because PLL unlock attentions can cause any number of downstream
340     // attentions, including a system checkstop.
341     if (__findPllUnlock(list, o_rootCause))
342     {
343         return true;
344     }
345 
346     // Regardless of the analysis type, always look for anything that could be
347     // blamed as the root cause of a system checkstop.
348 
349     // Memory channel failure attentions will produce SUEs and likely cause
350     // downstream attentions, including a system checkstop.
351     if (__findMemoryChannelFailure(list, o_rootCause, i_rasData))
352     {
353         return true;
354     }
355 
356     // Look for any recoverable attentions that have been identified as a
357     // potential root cause of a system checkstop attention. These would include
358     // any attention that would generate an SUE. Note that is it possible for
359     // recoverables to generate unit checkstop attentions so we must check them
360     // first.
361     if (__findCsRootCause_RE(list, o_rootCause, i_rasData))
362     {
363         return true;
364     }
365 
366     // Look for any unit checkstop attentions (other than memory channel
367     // failures) that have been identified as a potential root cause of a
368     // system checkstop attention. These would include any attention that would
369     // generate an SUE.
370     if (__findCsRootCause_UCS(list, o_rootCause, i_rasData))
371     {
372         return true;
373     }
374 
375     // If no other viable root cause has been found, check for any signatures
376     // with the ATTN_FROM_OCMB flag in case there was an attention from an
377     // inaccessible OCMB.
378     if (__findOcmbAttnBits(list, o_rootCause, i_rasData))
379     {
380         return true;
381     }
382 
383     // Look for any system checkstop attentions that originated from within the
384     // chip that reported the attention. In other words, no external checkstop
385     // attentions.
386     if (__findNonExternalCs(list, o_rootCause))
387     {
388         return true;
389     }
390 
391     if (AnalysisType::SYSTEM_CHECKSTOP != i_type)
392     {
393         // No system checkstop root cause attentions were found. Next, look for
394         // any recoverable or unit checkstop attentions that could be associated
395         // with a TI.
396         if (__findTiRootCause(list, o_rootCause, i_rasData))
397         {
398             return true;
399         }
400 
401         if (AnalysisType::TERMINATE_IMMEDIATE != i_type)
402         {
403             // No attentions associated with a system checkstop or TI were
404             // found. Simply, return the first entry in the list.
405             o_rootCause = list.front();
406             return true;
407         }
408     }
409 
410     // END WORKAROUND
411 
412     return false; // default, no active attentions found.
413 }
414 
415 //------------------------------------------------------------------------------
416 
__findIueTh(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause)417 bool __findIueTh(const std::vector<libhei::Signature>& i_list,
418                  libhei::Signature& o_rootCause)
419 {
420     auto itr = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) {
421         return (libhei::hash<libhei::NodeId_t>("RDFFIR") == t.getId() &&
422                 (17 == t.getBit() || 37 == t.getBit())) ||
423                (libhei::hash<libhei::NodeId_t>("RDF_FIR") == t.getId() &&
424                 (18 == t.getBit() || 38 == t.getBit()));
425     });
426 
427     if (i_list.end() != itr)
428     {
429         o_rootCause = *itr;
430         return true;
431     }
432 
433     return false;
434 }
435 
436 //------------------------------------------------------------------------------
437 
rootCauseSpecialCases(const libhei::IsolationData & i_isoData,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)438 void rootCauseSpecialCases(const libhei::IsolationData& i_isoData,
439                            libhei::Signature& o_rootCause,
440                            const RasDataParser& i_rasData)
441 {
442     using func = libhei::NodeId_t (*)(const std::string& i_str);
443     func __hash = libhei::hash<libhei::NodeId_t>;
444 
445     // Check for any special cases that exist for specific FIR bits.
446 
447     // If the channel fail was specifically a firmware initiated channel fail
448     // (SRQFIR[25] for Explorer OCMBs, SRQ_FIR[46] for Odyssey OCMBs) check for
449     // any IUE bits that are on that would have caused the channel fail
450     // (RDFFIR[17,37] for Explorer OCMBs, RDF_FIR_0[18,38] or RDF_FIR_1[18,38]
451     // for Odyssey OCMBs).
452 
453     // Explorer SRQFIR
454     static const auto srqfir = __hash("SRQFIR");
455     // Odyssey SRQ_FIR
456     static const auto srq_fir = __hash("SRQ_FIR");
457 
458     std::vector<libhei::Signature> list{i_isoData.getSignatureList()};
459 
460     if (((srqfir == o_rootCause.getId() && 25 == o_rootCause.getBit()) ||
461          (srq_fir == o_rootCause.getId() && 46 == o_rootCause.getBit())) &&
462         __findIueTh(list, o_rootCause))
463     {
464         // If __findIueTh returned true, o_rootCause was updated, return.
465         return;
466     }
467 
468     // Check if the root cause found was a potential side effect of an
469     // ODP data corruption error. If it was, check if any other signature
470     // in the signature list was a potential root cause.
471     auto OdpSide = RasDataParser::RasDataFlags::ODP_DATA_CORRUPT_SIDE_EFFECT;
472     auto OdpRoot = RasDataParser::RasDataFlags::ODP_DATA_CORRUPT_ROOT_CAUSE;
473     if (i_rasData.isFlagSet(o_rootCause, OdpSide))
474     {
475         for (const auto& s : list)
476         {
477             if (i_rasData.isFlagSet(s, OdpRoot))
478             {
479                 // ODP data corruption root cause found, return.
480                 o_rootCause = s;
481                 return;
482             }
483         }
484     }
485 
486     // Odyssey RDF_FIR
487     static const auto rdf_fir = __hash("RDF_FIR");
488 
489     // RDF_FIR[41] can be the root cause of RDF_FIR[16], so if bit 16 is on,
490     // check if bit 41 is also on.
491     if (rdf_fir == o_rootCause.getId() && 16 == o_rootCause.getBit())
492     {
493         // Look for RDF_FIR[41]
494         auto itr = std::find_if(list.begin(), list.end(), [&](const auto& t) {
495             return (rdf_fir == t.getId() && 41 == t.getBit());
496         });
497         if (list.end() != itr)
498         {
499             o_rootCause = *itr;
500         }
501     }
502 }
503 
504 //------------------------------------------------------------------------------
505 
filterRootCause(AnalysisType i_type,const libhei::IsolationData & i_isoData,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)506 bool filterRootCause(AnalysisType i_type,
507                      const libhei::IsolationData& i_isoData,
508                      libhei::Signature& o_rootCause,
509                      const RasDataParser& i_rasData)
510 {
511     // Find the initial root cause attention based on common rules for FIR
512     // isolation.
513     bool rc = findRootCause(i_type, i_isoData, o_rootCause, i_rasData);
514 
515     // If some root cause was found, handle any special cases for specific FIR
516     // bits that require additional logic to determine the root cause.
517     if (true == rc)
518     {
519         rootCauseSpecialCases(i_isoData, o_rootCause, i_rasData);
520     }
521 
522     return rc;
523 }
524 
525 //------------------------------------------------------------------------------
526 
527 } // namespace analyzer
528