1 #include <assert.h>
2
3 #include <analyzer/analyzer_main.hpp>
4 #include <analyzer/ras-data/ras-data-parser.hpp>
5 #include <hei_main.hpp>
6 #include <hei_util.hpp>
7 #include <util/pdbg.hpp>
8
9 #include <algorithm>
10 #include <limits>
11 #include <string>
12
13 namespace analyzer
14 {
15 //------------------------------------------------------------------------------
16
__findRcsOscError(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause)17 bool __findRcsOscError(const std::vector<libhei::Signature>& i_list,
18 libhei::Signature& o_rootCause)
19 {
20 // TODO: Consider returning all of them instead of one as root cause.
21 auto itr = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) {
22 return (libhei::hash<libhei::NodeId_t>("TP_LOCAL_FIR") == t.getId() &&
23 (42 == t.getBit() || 43 == t.getBit()));
24 });
25
26 if (i_list.end() != itr)
27 {
28 o_rootCause = *itr;
29 return true;
30 }
31
32 return false;
33 }
34
35 //------------------------------------------------------------------------------
36
__findPllUnlock(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause)37 bool __findPllUnlock(const std::vector<libhei::Signature>& i_list,
38 libhei::Signature& o_rootCause)
39 {
40 using namespace util::pdbg;
41
42 // TODO: Consider returning all of them instead of one as root cause.
43
44 auto nodeId = libhei::hash<libhei::NodeId_t>("PLL_UNLOCK");
45
46 // First, look for any PLL unlock attentions reported by a processsor chip.
47 auto itr1 = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) {
48 return (nodeId == t.getId() &&
49 TYPE_PROC == getTrgtType(getTrgt(t.getChip())));
50 });
51
52 if (i_list.end() != itr1)
53 {
54 o_rootCause = *itr1;
55 return true;
56 }
57
58 // Then, look for any PLL unlock attentions reported by an OCMB chip. This
59 // is specifically for Odyssey, which are the only OCMBs that would report
60 // PLL unlock attentions.
61 auto itr2 = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) {
62 return (nodeId == t.getId() &&
63 TYPE_OCMB == getTrgtType(getTrgt(t.getChip())));
64 });
65
66 if (i_list.end() != itr2)
67 {
68 o_rootCause = *itr2;
69 return true;
70 }
71
72 return false;
73 }
74
75 //------------------------------------------------------------------------------
76
__findMemoryChannelFailure(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)77 bool __findMemoryChannelFailure(const std::vector<libhei::Signature>& i_list,
78 libhei::Signature& o_rootCause,
79 const RasDataParser& i_rasData)
80 {
81 using namespace util::pdbg;
82
83 using func = libhei::NodeId_t (*)(const std::string& i_str);
84 func __hash = libhei::hash<libhei::NodeId_t>;
85
86 static const auto mc_dstl_fir = __hash("MC_DSTL_FIR");
87 static const auto mc_ustl_fir = __hash("MC_USTL_FIR");
88 static const auto mc_omi_dl_err_rpt = __hash("MC_OMI_DL_ERR_RPT");
89
90 // First, look for any chip checkstops from the connected OCMBs.
91 for (const auto& s : i_list)
92 {
93 if (TYPE_OCMB != getTrgtType(getTrgt(s.getChip())))
94 {
95 continue; // OCMBs only
96 }
97
98 // TODO: The chip data for Explorer chips currently report chip
99 // checkstops as unit checkstops. Once the chip data has been
100 // updated, the check for unit checkstops here will need to be
101 // removed.
102 if (libhei::ATTN_TYPE_CHIP_CS == s.getAttnType() ||
103 libhei::ATTN_TYPE_UNIT_CS == s.getAttnType())
104 {
105 o_rootCause = s;
106 return true;
107 }
108 }
109
110 // Now, look for any channel failure attentions on the processor side of the
111 // memory bus.
112 for (const auto& s : i_list)
113 {
114 if (TYPE_PROC != getTrgtType(getTrgt(s.getChip())))
115 {
116 continue; // processors only
117 }
118
119 // Any unit checkstop attentions that originated from the MC_DSTL_FIR or
120 // MC_USTLFIR are considered a channel failure attention.
121 // TODO: The "channel failure" designation is actually configurable via
122 // other registers. We just happen to expect anything that is
123 // configured to channel failure to also be configured to unit
124 // checkstop. Eventually, we will need some mechanism to check the
125 // configuration registers for a more accurate analysis.
126 if (libhei::ATTN_TYPE_UNIT_CS == s.getAttnType() &&
127 (mc_dstl_fir == s.getId() || mc_ustl_fir == s.getId()) &&
128 !i_rasData.isFlagSet(s,
129 RasDataParser::RasDataFlags::ATTN_FROM_OCMB))
130 {
131 o_rootCause = s;
132 return true;
133 }
134 // Any signatures from MC_OMI_DL_ERR_RPT feed into the only bits in
135 // MC_OMI_DL_FIR that are hardwired to channel failure.
136 else if (mc_omi_dl_err_rpt == s.getId())
137 {
138 o_rootCause = s;
139 return true;
140 }
141 }
142
143 return false; // default, nothing found
144 }
145
146 //------------------------------------------------------------------------------
147
148 // Will query if a signature is a potential system checkstop root cause.
149 // attention. Note that this function excludes memory channel failure attentions
150 // which are checked in __findMemoryChannelFailure().
__findCsRootCause(const libhei::Signature & i_signature,const RasDataParser & i_rasData)151 bool __findCsRootCause(const libhei::Signature& i_signature,
152 const RasDataParser& i_rasData)
153 {
154 // Check if the input signature has the CS_POSSIBLE or SUE_SOURCE flag set.
155 if (i_rasData.isFlagSet(i_signature,
156 RasDataParser::RasDataFlags::CS_POSSIBLE) ||
157 i_rasData.isFlagSet(i_signature,
158 RasDataParser::RasDataFlags::SUE_SOURCE))
159 {
160 return true;
161 }
162
163 return false; // default, nothing found
164 }
165
166 //------------------------------------------------------------------------------
167
__findCsRootCause_RE(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)168 bool __findCsRootCause_RE(const std::vector<libhei::Signature>& i_list,
169 libhei::Signature& o_rootCause,
170 const RasDataParser& i_rasData)
171 {
172 for (const auto& s : i_list)
173 {
174 // Only looking for recoverable attentions.
175 if (libhei::ATTN_TYPE_RECOVERABLE != s.getAttnType())
176 {
177 continue;
178 }
179
180 if (__findCsRootCause(s, i_rasData))
181 {
182 o_rootCause = s;
183 return true;
184 }
185 }
186
187 return false; // default, nothing found
188 }
189
190 //------------------------------------------------------------------------------
191
__findCsRootCause_UCS(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)192 bool __findCsRootCause_UCS(const std::vector<libhei::Signature>& i_list,
193 libhei::Signature& o_rootCause,
194 const RasDataParser& i_rasData)
195 {
196 for (const auto& s : i_list)
197 {
198 // Only looking for unit checkstop attentions.
199 if (libhei::ATTN_TYPE_UNIT_CS != s.getAttnType())
200 {
201 continue;
202 }
203
204 if (__findCsRootCause(s, i_rasData))
205 {
206 o_rootCause = s;
207 return true;
208 }
209 }
210
211 return false; // default, nothing found
212 }
213
214 //------------------------------------------------------------------------------
215
__findOcmbAttnBits(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)216 bool __findOcmbAttnBits(const std::vector<libhei::Signature>& i_list,
217 libhei::Signature& o_rootCause,
218 const RasDataParser& i_rasData)
219 {
220 using namespace util::pdbg;
221
222 // If we have any attentions from an OCMB, assume isolation to the OCMBs
223 // was successful and the ATTN_FROM_OCMB flag does not need to be checked.
224 for (const auto& s : i_list)
225 {
226 if (TYPE_OCMB == getTrgtType(getTrgt(s.getChip())))
227 {
228 return false;
229 }
230 }
231
232 for (const auto& s : i_list)
233 {
234 if (i_rasData.isFlagSet(s, RasDataParser::RasDataFlags::ATTN_FROM_OCMB))
235 {
236 o_rootCause = s;
237 return true;
238 }
239 }
240
241 return false; // default, nothing found
242 }
243
244 //------------------------------------------------------------------------------
245
__findNonExternalCs(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause)246 bool __findNonExternalCs(const std::vector<libhei::Signature>& i_list,
247 libhei::Signature& o_rootCause)
248 {
249 using namespace util::pdbg;
250
251 static const auto pb_ext_fir = libhei::hash<libhei::NodeId_t>("PB_EXT_FIR");
252
253 for (const auto& s : i_list)
254 {
255 const auto targetType = getTrgtType(getTrgt(s.getChip()));
256 const auto id = s.getId();
257 const auto attnType = s.getAttnType();
258
259 // Find any processor with chip checkstop attention that did not
260 // originate from the PB_EXT_FIR.
261 if ((TYPE_PROC == targetType) &&
262 (libhei::ATTN_TYPE_CHIP_CS == attnType) && (pb_ext_fir != id))
263 {
264 o_rootCause = s;
265 return true;
266 }
267 }
268
269 return false; // default, nothing found
270 }
271
272 //------------------------------------------------------------------------------
273
__findTiRootCause(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)274 bool __findTiRootCause(const std::vector<libhei::Signature>& i_list,
275 libhei::Signature& o_rootCause,
276 const RasDataParser& i_rasData)
277 {
278 using namespace util::pdbg;
279 using rdf = RasDataParser::RasDataFlags;
280
281 for (const auto& signature : i_list)
282 {
283 const auto attnType = signature.getAttnType();
284
285 // Only looking for recoverable or unit checkstop attentions.
286 if (libhei::ATTN_TYPE_RECOVERABLE != attnType &&
287 libhei::ATTN_TYPE_UNIT_CS != attnType)
288 {
289 continue;
290 }
291
292 // Skip any signature with the 'recovered_error', 'informational_only',
293 // or 'attn_from_ocmb' flags.
294 if (i_rasData.isFlagSet(signature, rdf::RECOVERED_ERROR) ||
295 i_rasData.isFlagSet(signature, rdf::INFORMATIONAL_ONLY) ||
296 i_rasData.isFlagSet(signature, rdf::MNFG_INFORMATIONAL_ONLY) ||
297 i_rasData.isFlagSet(signature, rdf::ATTN_FROM_OCMB))
298 {
299 continue;
300 }
301
302 // At this point, the attention has not been explicitly ignored. So
303 // return this signature and exit.
304 o_rootCause = signature;
305 return true;
306 }
307
308 return false; // default, nothing found
309 }
310
311 //------------------------------------------------------------------------------
312
findRootCause(AnalysisType i_type,const libhei::IsolationData & i_isoData,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)313 bool findRootCause(AnalysisType i_type, const libhei::IsolationData& i_isoData,
314 libhei::Signature& o_rootCause,
315 const RasDataParser& i_rasData)
316 {
317 // We'll need to make a copy of the list so that the original list is
318 // maintained for the PEL.
319 std::vector<libhei::Signature> list{i_isoData.getSignatureList()};
320
321 // START WORKAROUND
322 // TODO: Filtering should be data driven. Until that support is available,
323 // use the following isolation rules.
324
325 // Ensure the list is not empty before continuing.
326 if (list.empty())
327 {
328 return false; // nothing more to do
329 }
330
331 // First, look for any RCS OSC errors. This must always be first because
332 // they can cause downstream PLL unlock attentions.
333 if (__findRcsOscError(list, o_rootCause))
334 {
335 return true;
336 }
337
338 // Second, look for any PLL unlock attentions. This must always be second
339 // because PLL unlock attentions can cause any number of downstream
340 // attentions, including a system checkstop.
341 if (__findPllUnlock(list, o_rootCause))
342 {
343 return true;
344 }
345
346 // Regardless of the analysis type, always look for anything that could be
347 // blamed as the root cause of a system checkstop.
348
349 // Memory channel failure attentions will produce SUEs and likely cause
350 // downstream attentions, including a system checkstop.
351 if (__findMemoryChannelFailure(list, o_rootCause, i_rasData))
352 {
353 return true;
354 }
355
356 // Look for any recoverable attentions that have been identified as a
357 // potential root cause of a system checkstop attention. These would include
358 // any attention that would generate an SUE. Note that is it possible for
359 // recoverables to generate unit checkstop attentions so we must check them
360 // first.
361 if (__findCsRootCause_RE(list, o_rootCause, i_rasData))
362 {
363 return true;
364 }
365
366 // Look for any unit checkstop attentions (other than memory channel
367 // failures) that have been identified as a potential root cause of a
368 // system checkstop attention. These would include any attention that would
369 // generate an SUE.
370 if (__findCsRootCause_UCS(list, o_rootCause, i_rasData))
371 {
372 return true;
373 }
374
375 // If no other viable root cause has been found, check for any signatures
376 // with the ATTN_FROM_OCMB flag in case there was an attention from an
377 // inaccessible OCMB.
378 if (__findOcmbAttnBits(list, o_rootCause, i_rasData))
379 {
380 return true;
381 }
382
383 // Look for any system checkstop attentions that originated from within the
384 // chip that reported the attention. In other words, no external checkstop
385 // attentions.
386 if (__findNonExternalCs(list, o_rootCause))
387 {
388 return true;
389 }
390
391 if (AnalysisType::SYSTEM_CHECKSTOP != i_type)
392 {
393 // No system checkstop root cause attentions were found. Next, look for
394 // any recoverable or unit checkstop attentions that could be associated
395 // with a TI.
396 if (__findTiRootCause(list, o_rootCause, i_rasData))
397 {
398 return true;
399 }
400
401 if (AnalysisType::TERMINATE_IMMEDIATE != i_type)
402 {
403 // No attentions associated with a system checkstop or TI were
404 // found. Simply, return the first entry in the list.
405 o_rootCause = list.front();
406 return true;
407 }
408 }
409
410 // END WORKAROUND
411
412 return false; // default, no active attentions found.
413 }
414
415 //------------------------------------------------------------------------------
416
__findIueTh(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause)417 bool __findIueTh(const std::vector<libhei::Signature>& i_list,
418 libhei::Signature& o_rootCause)
419 {
420 auto itr = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) {
421 return (libhei::hash<libhei::NodeId_t>("RDFFIR") == t.getId() &&
422 (17 == t.getBit() || 37 == t.getBit())) ||
423 (libhei::hash<libhei::NodeId_t>("RDF_FIR") == t.getId() &&
424 (18 == t.getBit() || 38 == t.getBit()));
425 });
426
427 if (i_list.end() != itr)
428 {
429 o_rootCause = *itr;
430 return true;
431 }
432
433 return false;
434 }
435
436 //------------------------------------------------------------------------------
437
rootCauseSpecialCases(const libhei::IsolationData & i_isoData,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)438 void rootCauseSpecialCases(const libhei::IsolationData& i_isoData,
439 libhei::Signature& o_rootCause,
440 const RasDataParser& i_rasData)
441 {
442 using func = libhei::NodeId_t (*)(const std::string& i_str);
443 func __hash = libhei::hash<libhei::NodeId_t>;
444
445 // Check for any special cases that exist for specific FIR bits.
446
447 // If the channel fail was specifically a firmware initiated channel fail
448 // (SRQFIR[25] for Explorer OCMBs, SRQ_FIR[46] for Odyssey OCMBs) check for
449 // any IUE bits that are on that would have caused the channel fail
450 // (RDFFIR[17,37] for Explorer OCMBs, RDF_FIR_0[18,38] or RDF_FIR_1[18,38]
451 // for Odyssey OCMBs).
452
453 // Explorer SRQFIR
454 static const auto srqfir = __hash("SRQFIR");
455 // Odyssey SRQ_FIR
456 static const auto srq_fir = __hash("SRQ_FIR");
457
458 std::vector<libhei::Signature> list{i_isoData.getSignatureList()};
459
460 if (((srqfir == o_rootCause.getId() && 25 == o_rootCause.getBit()) ||
461 (srq_fir == o_rootCause.getId() && 46 == o_rootCause.getBit())) &&
462 __findIueTh(list, o_rootCause))
463 {
464 // If __findIueTh returned true, o_rootCause was updated, return.
465 return;
466 }
467
468 // Check if the root cause found was a potential side effect of an
469 // ODP data corruption error. If it was, check if any other signature
470 // in the signature list was a potential root cause.
471 auto OdpSide = RasDataParser::RasDataFlags::ODP_DATA_CORRUPT_SIDE_EFFECT;
472 auto OdpRoot = RasDataParser::RasDataFlags::ODP_DATA_CORRUPT_ROOT_CAUSE;
473 if (i_rasData.isFlagSet(o_rootCause, OdpSide))
474 {
475 for (const auto& s : list)
476 {
477 if (i_rasData.isFlagSet(s, OdpRoot))
478 {
479 // ODP data corruption root cause found, return.
480 o_rootCause = s;
481 return;
482 }
483 }
484 }
485
486 // Odyssey RDF_FIR
487 static const auto rdf_fir = __hash("RDF_FIR");
488
489 // RDF_FIR[41] can be the root cause of RDF_FIR[16], so if bit 16 is on,
490 // check if bit 41 is also on.
491 if (rdf_fir == o_rootCause.getId() && 16 == o_rootCause.getBit())
492 {
493 // Look for RDF_FIR[41]
494 auto itr = std::find_if(list.begin(), list.end(), [&](const auto& t) {
495 return (rdf_fir == t.getId() && 41 == t.getBit());
496 });
497 if (list.end() != itr)
498 {
499 o_rootCause = *itr;
500 }
501 }
502 }
503
504 //------------------------------------------------------------------------------
505
filterRootCause(AnalysisType i_type,const libhei::IsolationData & i_isoData,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)506 bool filterRootCause(AnalysisType i_type,
507 const libhei::IsolationData& i_isoData,
508 libhei::Signature& o_rootCause,
509 const RasDataParser& i_rasData)
510 {
511 // Find the initial root cause attention based on common rules for FIR
512 // isolation.
513 bool rc = findRootCause(i_type, i_isoData, o_rootCause, i_rasData);
514
515 // If some root cause was found, handle any special cases for specific FIR
516 // bits that require additional logic to determine the root cause.
517 if (true == rc)
518 {
519 rootCauseSpecialCases(i_isoData, o_rootCause, i_rasData);
520 }
521
522 return rc;
523 }
524
525 //------------------------------------------------------------------------------
526
527 } // namespace analyzer
528