1 #include <assert.h>
2
3 #include <analyzer/analyzer_main.hpp>
4 #include <analyzer/plugins/plugin.hpp>
5 #include <analyzer/ras-data/ras-data-parser.hpp>
6 #include <hei_main.hpp>
7 #include <hei_util.hpp>
8 #include <util/pdbg.hpp>
9
10 #include <algorithm>
11 #include <limits>
12 #include <string>
13
14 namespace analyzer
15 {
16 //------------------------------------------------------------------------------
17
__lookForBits(const std::vector<libhei::Signature> & i_sigList,libhei::Signature & o_rootCause,std::vector<libhei::ChipType_t> i_chipTypes,const char * i_fir,std::vector<uint8_t> i_bitList)18 bool __lookForBits(const std::vector<libhei::Signature>& i_sigList,
19 libhei::Signature& o_rootCause,
20 std::vector<libhei::ChipType_t> i_chipTypes,
21 const char* i_fir, std::vector<uint8_t> i_bitList)
22 {
23 using func = libhei::NodeId_t (*)(const std::string& i_str);
24 func __hash = libhei::hash<libhei::NodeId_t>;
25
26 auto itr =
27 std::find_if(i_sigList.begin(), i_sigList.end(), [&](const auto& sig) {
28 for (const auto& type : i_chipTypes)
29 {
30 if (type != sig.getChip().getType())
31 {
32 continue;
33 }
34 for (const auto& bit : i_bitList)
35 {
36 if (__hash(i_fir) == sig.getId() && bit == sig.getBit())
37 {
38 return true;
39 }
40 else
41 {
42 continue;
43 }
44 }
45 }
46 return false;
47 });
48
49 if (i_sigList.end() != itr)
50 {
51 o_rootCause = *itr;
52 return true;
53 }
54
55 return false;
56 }
57
58 //------------------------------------------------------------------------------
59
__findPllUnlock(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause)60 bool __findPllUnlock(const std::vector<libhei::Signature>& i_list,
61 libhei::Signature& o_rootCause)
62 {
63 using namespace util::pdbg;
64
65 // TODO: Consider returning all of them instead of one as root cause.
66
67 auto nodeId = libhei::hash<libhei::NodeId_t>("PLL_UNLOCK");
68
69 // First, look for any PLL unlock attentions reported by a processsor chip.
70 auto itr1 = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) {
71 return (nodeId == t.getId() &&
72 TYPE_PROC == getTrgtType(getTrgt(t.getChip())));
73 });
74
75 if (i_list.end() != itr1)
76 {
77 o_rootCause = *itr1;
78 return true;
79 }
80
81 // Then, look for any PLL unlock attentions reported by an OCMB chip. This
82 // is specifically for Odyssey, which are the only OCMBs that would report
83 // PLL unlock attentions.
84 auto itr2 = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) {
85 return (nodeId == t.getId() &&
86 TYPE_OCMB == getTrgtType(getTrgt(t.getChip())));
87 });
88
89 if (i_list.end() != itr2)
90 {
91 o_rootCause = *itr2;
92 return true;
93 }
94
95 return false;
96 }
97
98 //------------------------------------------------------------------------------
99
__findMemoryChannelFailure(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)100 bool __findMemoryChannelFailure(const std::vector<libhei::Signature>& i_list,
101 libhei::Signature& o_rootCause,
102 const RasDataParser& i_rasData)
103 {
104 using namespace util::pdbg;
105
106 using func = libhei::NodeId_t (*)(const std::string& i_str);
107 func __hash = libhei::hash<libhei::NodeId_t>;
108
109 static const auto mc_dstl_fir = __hash("MC_DSTL_FIR");
110 static const auto mc_ustl_fir = __hash("MC_USTL_FIR");
111 static const auto mc_omi_dl_err_rpt = __hash("MC_OMI_DL_ERR_RPT");
112
113 // First, look for any chip checkstops from the connected OCMBs.
114 for (const auto& s : i_list)
115 {
116 if (TYPE_OCMB != getTrgtType(getTrgt(s.getChip())))
117 {
118 continue; // OCMBs only
119 }
120
121 // TODO: The chip data for Explorer chips currently report chip
122 // checkstops as unit checkstops. Once the chip data has been
123 // updated, the check for unit checkstops here will need to be
124 // removed.
125 if (libhei::ATTN_TYPE_CHIP_CS == s.getAttnType() ||
126 libhei::ATTN_TYPE_UNIT_CS == s.getAttnType())
127 {
128 o_rootCause = s;
129 return true;
130 }
131 }
132
133 // Now, look for any channel failure attentions on the processor side of the
134 // memory bus.
135 for (const auto& s : i_list)
136 {
137 if (TYPE_PROC != getTrgtType(getTrgt(s.getChip())))
138 {
139 continue; // processors only
140 }
141
142 // Any unit checkstop attentions that originated from the MC_DSTL_FIR or
143 // MC_USTLFIR are considered a channel failure attention.
144 // TODO: The "channel failure" designation is actually configurable via
145 // other registers. We just happen to expect anything that is
146 // configured to channel failure to also be configured to unit
147 // checkstop. Eventually, we will need some mechanism to check the
148 // configuration registers for a more accurate analysis.
149 if (libhei::ATTN_TYPE_UNIT_CS == s.getAttnType() &&
150 (mc_dstl_fir == s.getId() || mc_ustl_fir == s.getId()) &&
151 (s.getChip().getType() == analyzer::P10_10 ||
152 s.getChip().getType() == analyzer::P10_20) &&
153 !i_rasData.isFlagSet(s,
154 RasDataParser::RasDataFlags::ATTN_FROM_OCMB))
155 {
156 o_rootCause = s;
157 return true;
158 }
159 // Any signatures from MC_OMI_DL_ERR_RPT feed into the only bits in
160 // MC_OMI_DL_FIR that are hardwired to channel failure.
161 else if (mc_omi_dl_err_rpt == s.getId())
162 {
163 o_rootCause = s;
164 return true;
165 }
166 }
167
168 return false; // default, nothing found
169 }
170
171 //------------------------------------------------------------------------------
172
173 // Will query if a signature is a potential system checkstop root cause.
174 // attention. Note that this function excludes memory channel failure attentions
175 // which are checked in __findMemoryChannelFailure().
__findCsRootCause(const libhei::Signature & i_signature,const RasDataParser & i_rasData)176 bool __findCsRootCause(const libhei::Signature& i_signature,
177 const RasDataParser& i_rasData)
178 {
179 // Check if the input signature has the CS_POSSIBLE or SUE_SOURCE flag set.
180 if (i_rasData.isFlagSet(i_signature,
181 RasDataParser::RasDataFlags::CS_POSSIBLE) ||
182 i_rasData.isFlagSet(i_signature,
183 RasDataParser::RasDataFlags::SUE_SOURCE))
184 {
185 return true;
186 }
187
188 return false; // default, nothing found
189 }
190
191 //------------------------------------------------------------------------------
192
__findCsRootCause_RE(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)193 bool __findCsRootCause_RE(const std::vector<libhei::Signature>& i_list,
194 libhei::Signature& o_rootCause,
195 const RasDataParser& i_rasData)
196 {
197 for (const auto& s : i_list)
198 {
199 // Only looking for recoverable attentions.
200 if (libhei::ATTN_TYPE_RECOVERABLE != s.getAttnType())
201 {
202 continue;
203 }
204
205 if (__findCsRootCause(s, i_rasData))
206 {
207 o_rootCause = s;
208 return true;
209 }
210 }
211
212 return false; // default, nothing found
213 }
214
215 //------------------------------------------------------------------------------
216
__findCsRootCause_UCS(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)217 bool __findCsRootCause_UCS(const std::vector<libhei::Signature>& i_list,
218 libhei::Signature& o_rootCause,
219 const RasDataParser& i_rasData)
220 {
221 for (const auto& s : i_list)
222 {
223 // Only looking for unit checkstop attentions.
224 if (libhei::ATTN_TYPE_UNIT_CS != s.getAttnType())
225 {
226 continue;
227 }
228
229 if (__findCsRootCause(s, i_rasData))
230 {
231 o_rootCause = s;
232 return true;
233 }
234 }
235
236 return false; // default, nothing found
237 }
238
239 //------------------------------------------------------------------------------
240
__findOcmbAttnBits(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)241 bool __findOcmbAttnBits(const std::vector<libhei::Signature>& i_list,
242 libhei::Signature& o_rootCause,
243 const RasDataParser& i_rasData)
244 {
245 using namespace util::pdbg;
246
247 // If we have any attentions from an OCMB, assume isolation to the OCMBs
248 // was successful and the ATTN_FROM_OCMB flag does not need to be checked.
249 for (const auto& s : i_list)
250 {
251 if (TYPE_OCMB == getTrgtType(getTrgt(s.getChip())))
252 {
253 return false;
254 }
255 }
256
257 for (const auto& s : i_list)
258 {
259 if (i_rasData.isFlagSet(s, RasDataParser::RasDataFlags::ATTN_FROM_OCMB))
260 {
261 o_rootCause = s;
262 return true;
263 }
264 }
265
266 return false; // default, nothing found
267 }
268
269 //------------------------------------------------------------------------------
270
__findNonExternalCs(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause)271 bool __findNonExternalCs(const std::vector<libhei::Signature>& i_list,
272 libhei::Signature& o_rootCause)
273 {
274 using namespace util::pdbg;
275
276 static const auto pb_ext_fir = libhei::hash<libhei::NodeId_t>("PB_EXT_FIR");
277
278 for (const auto& s : i_list)
279 {
280 const auto targetType = getTrgtType(getTrgt(s.getChip()));
281 const auto id = s.getId();
282 const auto attnType = s.getAttnType();
283
284 // Find any processor with chip checkstop attention that did not
285 // originate from the PB_EXT_FIR.
286 if ((TYPE_PROC == targetType) &&
287 (libhei::ATTN_TYPE_CHIP_CS == attnType) && (pb_ext_fir != id))
288 {
289 o_rootCause = s;
290 return true;
291 }
292 }
293
294 return false; // default, nothing found
295 }
296
297 //------------------------------------------------------------------------------
298
__findTiRootCause(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)299 bool __findTiRootCause(const std::vector<libhei::Signature>& i_list,
300 libhei::Signature& o_rootCause,
301 const RasDataParser& i_rasData)
302 {
303 using namespace util::pdbg;
304 using rdf = RasDataParser::RasDataFlags;
305
306 for (const auto& signature : i_list)
307 {
308 const auto attnType = signature.getAttnType();
309
310 // Only looking for recoverable or unit checkstop attentions.
311 if (libhei::ATTN_TYPE_RECOVERABLE != attnType &&
312 libhei::ATTN_TYPE_UNIT_CS != attnType)
313 {
314 continue;
315 }
316
317 // Skip any signature with the 'recovered_error', 'informational_only',
318 // or 'attn_from_ocmb' flags.
319 if (i_rasData.isFlagSet(signature, rdf::RECOVERED_ERROR) ||
320 i_rasData.isFlagSet(signature, rdf::INFORMATIONAL_ONLY) ||
321 i_rasData.isFlagSet(signature, rdf::MNFG_INFORMATIONAL_ONLY) ||
322 i_rasData.isFlagSet(signature, rdf::ATTN_FROM_OCMB))
323 {
324 continue;
325 }
326
327 // At this point, the attention has not been explicitly ignored. So
328 // return this signature and exit.
329 o_rootCause = signature;
330 return true;
331 }
332
333 return false; // default, nothing found
334 }
335
336 //------------------------------------------------------------------------------
337
findRootCause(AnalysisType i_type,const libhei::IsolationData & i_isoData,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)338 bool findRootCause(AnalysisType i_type, const libhei::IsolationData& i_isoData,
339 libhei::Signature& o_rootCause,
340 const RasDataParser& i_rasData)
341 {
342 // We'll need to make a copy of the list so that the original list is
343 // maintained for the PEL.
344 std::vector<libhei::Signature> list{i_isoData.getSignatureList()};
345
346 // START WORKAROUND
347 // TODO: Filtering should be data driven. Until that support is available,
348 // use the following isolation rules.
349
350 // Ensure the list is not empty before continuing.
351 if (list.empty())
352 {
353 return false; // nothing more to do
354 }
355
356 // First, look for any RCS OSC errors. This must always be first because
357 // they can cause downstream PLL unlock attentions.
358 if (__lookForBits(list, o_rootCause, {analyzer::P10_10, analyzer::P10_20},
359 "TP_LOCAL_FIR", {42, 43}))
360 {
361 return true;
362 }
363
364 // Second, look for any PLL unlock attentions. This must always be second
365 // because PLL unlock attentions can cause any number of downstream
366 // attentions, including a system checkstop.
367 if (__findPllUnlock(list, o_rootCause))
368 {
369 return true;
370 }
371
372 // Regardless of the analysis type, always look for anything that could be
373 // blamed as the root cause of a system checkstop.
374
375 // Memory channel failure attentions will produce SUEs and likely cause
376 // downstream attentions, including a system checkstop.
377 if (__findMemoryChannelFailure(list, o_rootCause, i_rasData))
378 {
379 return true;
380 }
381
382 // Look for any recoverable attentions that have been identified as a
383 // potential root cause of a system checkstop attention. These would include
384 // any attention that would generate an SUE. Note that is it possible for
385 // recoverables to generate unit checkstop attentions so we must check them
386 // first.
387 if (__findCsRootCause_RE(list, o_rootCause, i_rasData))
388 {
389 return true;
390 }
391
392 // Look for any unit checkstop attentions (other than memory channel
393 // failures) that have been identified as a potential root cause of a
394 // system checkstop attention. These would include any attention that would
395 // generate an SUE.
396 if (__findCsRootCause_UCS(list, o_rootCause, i_rasData))
397 {
398 return true;
399 }
400
401 // If no other viable root cause has been found, check for any signatures
402 // with the ATTN_FROM_OCMB flag in case there was an attention from an
403 // inaccessible OCMB.
404 if (__findOcmbAttnBits(list, o_rootCause, i_rasData))
405 {
406 return true;
407 }
408
409 // Look for any system checkstop attentions that originated from within the
410 // chip that reported the attention. In other words, no external checkstop
411 // attentions.
412 if (__findNonExternalCs(list, o_rootCause))
413 {
414 return true;
415 }
416
417 if (AnalysisType::SYSTEM_CHECKSTOP != i_type)
418 {
419 // No system checkstop root cause attentions were found. Next, look for
420 // any recoverable or unit checkstop attentions that could be associated
421 // with a TI.
422 if (__findTiRootCause(list, o_rootCause, i_rasData))
423 {
424 return true;
425 }
426
427 if (AnalysisType::TERMINATE_IMMEDIATE != i_type)
428 {
429 // No attentions associated with a system checkstop or TI were
430 // found. Simply, return the first entry in the list.
431 o_rootCause = list.front();
432 return true;
433 }
434 }
435
436 // END WORKAROUND
437
438 return false; // default, no active attentions found.
439 }
440
441 //------------------------------------------------------------------------------
442
__findIueTh(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause)443 bool __findIueTh(const std::vector<libhei::Signature>& i_list,
444 libhei::Signature& o_rootCause)
445 {
446 auto itr = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) {
447 return (libhei::hash<libhei::NodeId_t>("RDFFIR") == t.getId() &&
448 (17 == t.getBit() || 37 == t.getBit())) ||
449 (libhei::hash<libhei::NodeId_t>("RDF_FIR") == t.getId() &&
450 (18 == t.getBit() || 38 == t.getBit()));
451 });
452
453 if (i_list.end() != itr)
454 {
455 o_rootCause = *itr;
456 return true;
457 }
458
459 return false;
460 }
461
462 //------------------------------------------------------------------------------
463
rootCauseSpecialCases(const libhei::IsolationData & i_isoData,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)464 void rootCauseSpecialCases(const libhei::IsolationData& i_isoData,
465 libhei::Signature& o_rootCause,
466 const RasDataParser& i_rasData)
467 {
468 using func = libhei::NodeId_t (*)(const std::string& i_str);
469 func __hash = libhei::hash<libhei::NodeId_t>;
470
471 // Check for any special cases that exist for specific FIR bits.
472
473 // If the channel fail was specifically a firmware initiated channel fail
474 // (SRQFIR[25] for Explorer OCMBs, SRQ_FIR[46] for Odyssey OCMBs) check for
475 // any IUE bits that are on that would have caused the channel fail
476 // (RDFFIR[17,37] for Explorer OCMBs, RDF_FIR_0[18,38] or RDF_FIR_1[18,38]
477 // for Odyssey OCMBs).
478
479 // Explorer SRQFIR
480 static const auto srqfir = __hash("SRQFIR");
481 // Odyssey SRQ_FIR
482 static const auto srq_fir = __hash("SRQ_FIR");
483
484 std::vector<libhei::Signature> list{i_isoData.getSignatureList()};
485
486 if (((srqfir == o_rootCause.getId() && 25 == o_rootCause.getBit()) ||
487 (srq_fir == o_rootCause.getId() && 46 == o_rootCause.getBit())) &&
488 __findIueTh(list, o_rootCause))
489 {
490 // If __findIueTh returned true, o_rootCause was updated, return.
491 return;
492 }
493
494 // Check if the root cause found was a potential side effect of an
495 // ODP data corruption error. If it was, check if any other signature
496 // in the signature list was a potential root cause.
497 auto OdpSide = RasDataParser::RasDataFlags::ODP_DATA_CORRUPT_SIDE_EFFECT;
498 auto OdpRoot = RasDataParser::RasDataFlags::ODP_DATA_CORRUPT_ROOT_CAUSE;
499 if (i_rasData.isFlagSet(o_rootCause, OdpSide))
500 {
501 for (const auto& s : list)
502 {
503 if (i_rasData.isFlagSet(s, OdpRoot))
504 {
505 // ODP data corruption root cause found, return.
506 o_rootCause = s;
507 return;
508 }
509 }
510 }
511
512 // Odyssey RDF_FIR
513 static const auto rdf_fir = __hash("RDF_FIR");
514
515 // RDF_FIR[41] can be the root cause of RDF_FIR[16], so if bit 16 is on,
516 // check if bit 41 is also on.
517 if (rdf_fir == o_rootCause.getId() && 16 == o_rootCause.getBit())
518 {
519 // Look for RDF_FIR[41]
520 auto itr = std::find_if(list.begin(), list.end(), [&](const auto& t) {
521 return (rdf_fir == t.getId() && 41 == t.getBit());
522 });
523 if (list.end() != itr)
524 {
525 o_rootCause = *itr;
526 }
527 }
528 }
529
530 //------------------------------------------------------------------------------
531
filterRootCause(AnalysisType i_type,const libhei::IsolationData & i_isoData,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)532 bool filterRootCause(AnalysisType i_type,
533 const libhei::IsolationData& i_isoData,
534 libhei::Signature& o_rootCause,
535 const RasDataParser& i_rasData)
536 {
537 // Find the initial root cause attention based on common rules for FIR
538 // isolation.
539 bool rc = findRootCause(i_type, i_isoData, o_rootCause, i_rasData);
540
541 // If some root cause was found, handle any special cases for specific FIR
542 // bits that require additional logic to determine the root cause.
543 if (true == rc)
544 {
545 rootCauseSpecialCases(i_isoData, o_rootCause, i_rasData);
546 }
547
548 return rc;
549 }
550
551 //------------------------------------------------------------------------------
552
553 } // namespace analyzer
554