1 #include <assert.h>
2 
3 #include <analyzer/analyzer_main.hpp>
4 #include <analyzer/ras-data/ras-data-parser.hpp>
5 #include <hei_main.hpp>
6 #include <hei_util.hpp>
7 #include <util/pdbg.hpp>
8 
9 #include <algorithm>
10 #include <limits>
11 #include <string>
12 
13 namespace analyzer
14 {
15 //------------------------------------------------------------------------------
16 
__findRcsOscError(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause)17 bool __findRcsOscError(const std::vector<libhei::Signature>& i_list,
18                        libhei::Signature& o_rootCause)
19 {
20     // TODO: Consider returning all of them instead of one as root cause.
21     auto itr = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) {
22         return (libhei::hash<libhei::NodeId_t>("TP_LOCAL_FIR") == t.getId() &&
23                 (42 == t.getBit() || 43 == t.getBit()));
24     });
25 
26     if (i_list.end() != itr)
27     {
28         o_rootCause = *itr;
29         return true;
30     }
31 
32     return false;
33 }
34 
35 //------------------------------------------------------------------------------
36 
__findPllUnlock(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause)37 bool __findPllUnlock(const std::vector<libhei::Signature>& i_list,
38                      libhei::Signature& o_rootCause)
39 {
40     using namespace util::pdbg;
41 
42     // TODO: Consider returning all of them instead of one as root cause.
43 
44     auto nodeId = libhei::hash<libhei::NodeId_t>("PLL_UNLOCK");
45 
46     // First, look for any PLL unlock attentions reported by a processsor chip.
47     auto itr1 = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) {
48         return (nodeId == t.getId() &&
49                 TYPE_PROC == getTrgtType(getTrgt(t.getChip())));
50     });
51 
52     if (i_list.end() != itr1)
53     {
54         o_rootCause = *itr1;
55         return true;
56     }
57 
58     // Then, look for any PLL unlock attentions reported by an OCMB chip. This
59     // is specifically for Odyssey, which are the only OCMBs that would report
60     // PLL unlock attentions.
61     auto itr2 = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) {
62         return (nodeId == t.getId() &&
63                 TYPE_OCMB == getTrgtType(getTrgt(t.getChip())));
64     });
65 
66     if (i_list.end() != itr2)
67     {
68         o_rootCause = *itr2;
69         return true;
70     }
71 
72     return false;
73 }
74 
75 //------------------------------------------------------------------------------
76 
__findMemoryChannelFailure(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)77 bool __findMemoryChannelFailure(const std::vector<libhei::Signature>& i_list,
78                                 libhei::Signature& o_rootCause,
79                                 const RasDataParser& i_rasData)
80 {
81     using namespace util::pdbg;
82 
83     using func = libhei::NodeId_t (*)(const std::string& i_str);
84     func __hash = libhei::hash<libhei::NodeId_t>;
85 
86     static const auto mc_dstl_fir = __hash("MC_DSTL_FIR");
87     static const auto mc_ustl_fir = __hash("MC_USTL_FIR");
88     static const auto mc_omi_dl_err_rpt = __hash("MC_OMI_DL_ERR_RPT");
89 
90     // First, look for any chip checkstops from the connected OCMBs.
91     for (const auto& s : i_list)
92     {
93         if (TYPE_OCMB != getTrgtType(getTrgt(s.getChip())))
94         {
95             continue; // OCMBs only
96         }
97 
98         // TODO: The chip data for Explorer chips currently report chip
99         //       checkstops as unit checkstops. Once the chip data has been
100         //       updated, the check for unit checkstops here will need to be
101         //       removed.
102         if (libhei::ATTN_TYPE_CHIP_CS == s.getAttnType() ||
103             libhei::ATTN_TYPE_UNIT_CS == s.getAttnType())
104         {
105             o_rootCause = s;
106             return true;
107         }
108     }
109 
110     // Now, look for any channel failure attentions on the processor side of the
111     // memory bus.
112     for (const auto& s : i_list)
113     {
114         if (TYPE_PROC != getTrgtType(getTrgt(s.getChip())))
115         {
116             continue; // processors only
117         }
118 
119         // Any unit checkstop attentions that originated from the MC_DSTL_FIR or
120         // MC_USTLFIR are considered a channel failure attention.
121         // TODO: The "channel failure" designation is actually configurable via
122         //       other registers. We just happen to expect anything that is
123         //       configured to channel failure to also be configured to unit
124         //       checkstop. Eventually, we will need some mechanism to check the
125         //       configuration registers for a more accurate analysis.
126         if (libhei::ATTN_TYPE_UNIT_CS == s.getAttnType() &&
127             (mc_dstl_fir == s.getId() || mc_ustl_fir == s.getId()) &&
128             !i_rasData.isFlagSet(s,
129                                  RasDataParser::RasDataFlags::ATTN_FROM_OCMB))
130         {
131             o_rootCause = s;
132             return true;
133         }
134         // Any signatures from MC_OMI_DL_ERR_RPT feed into the only bits in
135         // MC_OMI_DL_FIR that are hardwired to channel failure.
136         else if (mc_omi_dl_err_rpt == s.getId())
137         {
138             o_rootCause = s;
139             return true;
140         }
141     }
142 
143     return false; // default, nothing found
144 }
145 
146 //------------------------------------------------------------------------------
147 
148 // Will query if a signature is a potential system checkstop root cause.
149 // attention. Note that this function excludes memory channel failure attentions
150 // which are checked in __findMemoryChannelFailure().
__findCsRootCause(const libhei::Signature & i_signature,const RasDataParser & i_rasData)151 bool __findCsRootCause(const libhei::Signature& i_signature,
152                        const RasDataParser& i_rasData)
153 {
154     // Check if the input signature has the CS_POSSIBLE or SUE_SOURCE flag set.
155     if (i_rasData.isFlagSet(i_signature,
156                             RasDataParser::RasDataFlags::CS_POSSIBLE) ||
157         i_rasData.isFlagSet(i_signature,
158                             RasDataParser::RasDataFlags::SUE_SOURCE))
159     {
160         return true;
161     }
162 
163     return false; // default, nothing found
164 }
165 
166 //------------------------------------------------------------------------------
167 
__findCsRootCause_RE(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)168 bool __findCsRootCause_RE(const std::vector<libhei::Signature>& i_list,
169                           libhei::Signature& o_rootCause,
170                           const RasDataParser& i_rasData)
171 {
172     for (const auto& s : i_list)
173     {
174         // Only looking for recoverable attentions.
175         if (libhei::ATTN_TYPE_RECOVERABLE != s.getAttnType())
176         {
177             continue;
178         }
179 
180         if (__findCsRootCause(s, i_rasData))
181         {
182             o_rootCause = s;
183             return true;
184         }
185     }
186 
187     return false; // default, nothing found
188 }
189 
190 //------------------------------------------------------------------------------
191 
__findCsRootCause_UCS(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)192 bool __findCsRootCause_UCS(const std::vector<libhei::Signature>& i_list,
193                            libhei::Signature& o_rootCause,
194                            const RasDataParser& i_rasData)
195 {
196     for (const auto& s : i_list)
197     {
198         // Only looking for unit checkstop attentions.
199         if (libhei::ATTN_TYPE_UNIT_CS != s.getAttnType())
200         {
201             continue;
202         }
203 
204         if (__findCsRootCause(s, i_rasData))
205         {
206             o_rootCause = s;
207             return true;
208         }
209     }
210 
211     return false; // default, nothing found
212 }
213 
214 //------------------------------------------------------------------------------
215 
__findOcmbAttnBits(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)216 bool __findOcmbAttnBits(const std::vector<libhei::Signature>& i_list,
217                         libhei::Signature& o_rootCause,
218                         const RasDataParser& i_rasData)
219 {
220     using namespace util::pdbg;
221 
222     // If we have any attentions from an OCMB, assume isolation to the OCMBs
223     // was successful and the ATTN_FROM_OCMB flag does not need to be checked.
224     for (const auto& s : i_list)
225     {
226         if (TYPE_OCMB == getTrgtType(getTrgt(s.getChip())))
227         {
228             return false;
229         }
230     }
231 
232     for (const auto& s : i_list)
233     {
234         if (i_rasData.isFlagSet(s, RasDataParser::RasDataFlags::ATTN_FROM_OCMB))
235         {
236             o_rootCause = s;
237             return true;
238         }
239     }
240 
241     return false; // default, nothing found
242 }
243 
244 //------------------------------------------------------------------------------
245 
__findNonExternalCs(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause)246 bool __findNonExternalCs(const std::vector<libhei::Signature>& i_list,
247                          libhei::Signature& o_rootCause)
248 {
249     using namespace util::pdbg;
250 
251     static const auto pb_ext_fir = libhei::hash<libhei::NodeId_t>("PB_EXT_FIR");
252 
253     for (const auto& s : i_list)
254     {
255         const auto targetType = getTrgtType(getTrgt(s.getChip()));
256         const auto id = s.getId();
257         const auto attnType = s.getAttnType();
258 
259         // Find any processor with chip checkstop attention that did not
260         // originate from the PB_EXT_FIR.
261         if ((TYPE_PROC == targetType) &&
262             (libhei::ATTN_TYPE_CHIP_CS == attnType) && (pb_ext_fir != id))
263         {
264             o_rootCause = s;
265             return true;
266         }
267     }
268 
269     return false; // default, nothing found
270 }
271 
272 //------------------------------------------------------------------------------
273 
__findTiRootCause(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)274 bool __findTiRootCause(const std::vector<libhei::Signature>& i_list,
275                        libhei::Signature& o_rootCause,
276                        const RasDataParser& i_rasData)
277 {
278     using namespace util::pdbg;
279     using rdf = RasDataParser::RasDataFlags;
280 
281     for (const auto& signature : i_list)
282     {
283         const auto attnType = signature.getAttnType();
284 
285         // Only looking for recoverable or unit checkstop attentions.
286         if (libhei::ATTN_TYPE_RECOVERABLE != attnType &&
287             libhei::ATTN_TYPE_UNIT_CS != attnType)
288         {
289             continue;
290         }
291 
292         // Skip any signature with the 'recovered_error' or 'informational_only'
293         // flags.
294         if (i_rasData.isFlagSet(signature, rdf::RECOVERED_ERROR) ||
295             i_rasData.isFlagSet(signature, rdf::INFORMATIONAL_ONLY) ||
296             i_rasData.isFlagSet(signature, rdf::MNFG_INFORMATIONAL_ONLY))
297         {
298             continue;
299         }
300 
301         // At this point, the attention has not been explicitly ignored. So
302         // return this signature and exit.
303         o_rootCause = signature;
304         return true;
305     }
306 
307     return false; // default, nothing found
308 }
309 
310 //------------------------------------------------------------------------------
311 
findRootCause(AnalysisType i_type,const libhei::IsolationData & i_isoData,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)312 bool findRootCause(AnalysisType i_type, const libhei::IsolationData& i_isoData,
313                    libhei::Signature& o_rootCause,
314                    const RasDataParser& i_rasData)
315 {
316     // We'll need to make a copy of the list so that the original list is
317     // maintained for the PEL.
318     std::vector<libhei::Signature> list{i_isoData.getSignatureList()};
319 
320     // START WORKAROUND
321     // TODO: Filtering should be data driven. Until that support is available,
322     //       use the following isolation rules.
323 
324     // Ensure the list is not empty before continuing.
325     if (list.empty())
326     {
327         return false; // nothing more to do
328     }
329 
330     // First, look for any RCS OSC errors. This must always be first because
331     // they can cause downstream PLL unlock attentions.
332     if (__findRcsOscError(list, o_rootCause))
333     {
334         return true;
335     }
336 
337     // Second, look for any PLL unlock attentions. This must always be second
338     // because PLL unlock attentions can cause any number of downstream
339     // attentions, including a system checkstop.
340     if (__findPllUnlock(list, o_rootCause))
341     {
342         return true;
343     }
344 
345     // Regardless of the analysis type, always look for anything that could be
346     // blamed as the root cause of a system checkstop.
347 
348     // Memory channel failure attentions will produce SUEs and likely cause
349     // downstream attentions, including a system checkstop.
350     if (__findMemoryChannelFailure(list, o_rootCause, i_rasData))
351     {
352         return true;
353     }
354 
355     // Look for any recoverable attentions that have been identified as a
356     // potential root cause of a system checkstop attention. These would include
357     // any attention that would generate an SUE. Note that is it possible for
358     // recoverables to generate unit checkstop attentions so we must check them
359     // first.
360     if (__findCsRootCause_RE(list, o_rootCause, i_rasData))
361     {
362         return true;
363     }
364 
365     // Look for any unit checkstop attentions (other than memory channel
366     // failures) that have been identified as a potential root cause of a
367     // system checkstop attention. These would include any attention that would
368     // generate an SUE.
369     if (__findCsRootCause_UCS(list, o_rootCause, i_rasData))
370     {
371         return true;
372     }
373 
374     // If no other viable root cause has been found, check for any signatures
375     // with the ATTN_FROM_OCMB flag in case there was an attention from an
376     // inaccessible OCMB.
377     if (__findOcmbAttnBits(list, o_rootCause, i_rasData))
378     {
379         return true;
380     }
381 
382     // Look for any system checkstop attentions that originated from within the
383     // chip that reported the attention. In other words, no external checkstop
384     // attentions.
385     if (__findNonExternalCs(list, o_rootCause))
386     {
387         return true;
388     }
389 
390     if (AnalysisType::SYSTEM_CHECKSTOP != i_type)
391     {
392         // No system checkstop root cause attentions were found. Next, look for
393         // any recoverable or unit checkstop attentions that could be associated
394         // with a TI.
395         if (__findTiRootCause(list, o_rootCause, i_rasData))
396         {
397             return true;
398         }
399 
400         if (AnalysisType::TERMINATE_IMMEDIATE != i_type)
401         {
402             // No attentions associated with a system checkstop or TI were
403             // found. Simply, return the first entry in the list.
404             o_rootCause = list.front();
405             return true;
406         }
407     }
408 
409     // END WORKAROUND
410 
411     return false; // default, no active attentions found.
412 }
413 
414 //------------------------------------------------------------------------------
415 
__findIueTh(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause)416 bool __findIueTh(const std::vector<libhei::Signature>& i_list,
417                  libhei::Signature& o_rootCause)
418 {
419     auto itr = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) {
420         return (libhei::hash<libhei::NodeId_t>("RDFFIR") == t.getId() &&
421                 (17 == t.getBit() || 37 == t.getBit())) ||
422                (libhei::hash<libhei::NodeId_t>("RDF_FIR") == t.getId() &&
423                 (18 == t.getBit() || 38 == t.getBit()));
424     });
425 
426     if (i_list.end() != itr)
427     {
428         o_rootCause = *itr;
429         return true;
430     }
431 
432     return false;
433 }
434 
435 //------------------------------------------------------------------------------
436 
rootCauseSpecialCases(const libhei::IsolationData & i_isoData,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)437 void rootCauseSpecialCases(const libhei::IsolationData& i_isoData,
438                            libhei::Signature& o_rootCause,
439                            const RasDataParser& i_rasData)
440 {
441     using func = libhei::NodeId_t (*)(const std::string& i_str);
442     func __hash = libhei::hash<libhei::NodeId_t>;
443 
444     // Check for any special cases that exist for specific FIR bits.
445 
446     // If the channel fail was specifically a firmware initiated channel fail
447     // (SRQFIR[25] for Explorer OCMBs, SRQ_FIR[46] for Odyssey OCMBs) check for
448     // any IUE bits that are on that would have caused the channel fail
449     // (RDFFIR[17,37] for Explorer OCMBs, RDF_FIR_0[18,38] or RDF_FIR_1[18,38]
450     // for Odyssey OCMBs).
451 
452     // Explorer SRQFIR
453     static const auto srqfir = __hash("SRQFIR");
454     // Odyssey SRQ_FIR
455     static const auto srq_fir = __hash("SRQ_FIR");
456 
457     std::vector<libhei::Signature> list{i_isoData.getSignatureList()};
458 
459     if (((srqfir == o_rootCause.getId() && 25 == o_rootCause.getBit()) ||
460          (srq_fir == o_rootCause.getId() && 46 == o_rootCause.getBit())) &&
461         __findIueTh(list, o_rootCause))
462     {
463         // If __findIueTh returned true, o_rootCause was updated, return.
464         return;
465     }
466 
467     // Check if the root cause found was a potential side effect of an
468     // ODP data corruption error. If it was, check if any other signature
469     // in the signature list was a potential root cause.
470     auto OdpSide = RasDataParser::RasDataFlags::ODP_DATA_CORRUPT_SIDE_EFFECT;
471     auto OdpRoot = RasDataParser::RasDataFlags::ODP_DATA_CORRUPT_ROOT_CAUSE;
472     if (i_rasData.isFlagSet(o_rootCause, OdpSide))
473     {
474         for (const auto& s : list)
475         {
476             if (i_rasData.isFlagSet(s, OdpRoot))
477             {
478                 // ODP data corruption root cause found, return.
479                 o_rootCause = s;
480                 return;
481             }
482         }
483     }
484 
485     // Odyssey RDF_FIR
486     static const auto rdf_fir = __hash("RDF_FIR");
487 
488     // RDF_FIR[41] can be the root cause of RDF_FIR[16], so if bit 16 is on,
489     // check if bit 41 is also on.
490     if (rdf_fir == o_rootCause.getId() && 16 == o_rootCause.getBit())
491     {
492         // Look for RDF_FIR[41]
493         auto itr = std::find_if(list.begin(), list.end(), [&](const auto& t) {
494             return (rdf_fir == t.getId() && 41 == t.getBit());
495         });
496         if (list.end() != itr)
497         {
498             o_rootCause = *itr;
499         }
500     }
501 }
502 
503 //------------------------------------------------------------------------------
504 
filterRootCause(AnalysisType i_type,const libhei::IsolationData & i_isoData,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)505 bool filterRootCause(AnalysisType i_type,
506                      const libhei::IsolationData& i_isoData,
507                      libhei::Signature& o_rootCause,
508                      const RasDataParser& i_rasData)
509 {
510     // Find the initial root cause attention based on common rules for FIR
511     // isolation.
512     bool rc = findRootCause(i_type, i_isoData, o_rootCause, i_rasData);
513 
514     // If some root cause was found, handle any special cases for specific FIR
515     // bits that require additional logic to determine the root cause.
516     if (true == rc)
517     {
518         rootCauseSpecialCases(i_isoData, o_rootCause, i_rasData);
519     }
520 
521     return rc;
522 }
523 
524 //------------------------------------------------------------------------------
525 
526 } // namespace analyzer
527