1 #include <assert.h>
2
3 #include <analyzer/analyzer_main.hpp>
4 #include <analyzer/ras-data/ras-data-parser.hpp>
5 #include <hei_main.hpp>
6 #include <hei_util.hpp>
7 #include <util/pdbg.hpp>
8
9 #include <algorithm>
10 #include <limits>
11 #include <string>
12
13 namespace analyzer
14 {
15 //------------------------------------------------------------------------------
16
__findRcsOscError(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause)17 bool __findRcsOscError(const std::vector<libhei::Signature>& i_list,
18 libhei::Signature& o_rootCause)
19 {
20 // TODO: Consider returning all of them instead of one as root cause.
21 auto itr = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) {
22 return (libhei::hash<libhei::NodeId_t>("TP_LOCAL_FIR") == t.getId() &&
23 (42 == t.getBit() || 43 == t.getBit()));
24 });
25
26 if (i_list.end() != itr)
27 {
28 o_rootCause = *itr;
29 return true;
30 }
31
32 return false;
33 }
34
35 //------------------------------------------------------------------------------
36
__findPllUnlock(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause)37 bool __findPllUnlock(const std::vector<libhei::Signature>& i_list,
38 libhei::Signature& o_rootCause)
39 {
40 using namespace util::pdbg;
41
42 // TODO: Consider returning all of them instead of one as root cause.
43
44 auto nodeId = libhei::hash<libhei::NodeId_t>("PLL_UNLOCK");
45
46 // First, look for any PLL unlock attentions reported by a processsor chip.
47 auto itr1 = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) {
48 return (nodeId == t.getId() &&
49 TYPE_PROC == getTrgtType(getTrgt(t.getChip())));
50 });
51
52 if (i_list.end() != itr1)
53 {
54 o_rootCause = *itr1;
55 return true;
56 }
57
58 // Then, look for any PLL unlock attentions reported by an OCMB chip. This
59 // is specifically for Odyssey, which are the only OCMBs that would report
60 // PLL unlock attentions.
61 auto itr2 = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) {
62 return (nodeId == t.getId() &&
63 TYPE_OCMB == getTrgtType(getTrgt(t.getChip())));
64 });
65
66 if (i_list.end() != itr2)
67 {
68 o_rootCause = *itr2;
69 return true;
70 }
71
72 return false;
73 }
74
75 //------------------------------------------------------------------------------
76
__findMemoryChannelFailure(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)77 bool __findMemoryChannelFailure(const std::vector<libhei::Signature>& i_list,
78 libhei::Signature& o_rootCause,
79 const RasDataParser& i_rasData)
80 {
81 using namespace util::pdbg;
82
83 using func = libhei::NodeId_t (*)(const std::string& i_str);
84 func __hash = libhei::hash<libhei::NodeId_t>;
85
86 static const auto mc_dstl_fir = __hash("MC_DSTL_FIR");
87 static const auto mc_ustl_fir = __hash("MC_USTL_FIR");
88 static const auto mc_omi_dl_err_rpt = __hash("MC_OMI_DL_ERR_RPT");
89
90 // First, look for any chip checkstops from the connected OCMBs.
91 for (const auto& s : i_list)
92 {
93 if (TYPE_OCMB != getTrgtType(getTrgt(s.getChip())))
94 {
95 continue; // OCMBs only
96 }
97
98 // TODO: The chip data for Explorer chips currently report chip
99 // checkstops as unit checkstops. Once the chip data has been
100 // updated, the check for unit checkstops here will need to be
101 // removed.
102 if (libhei::ATTN_TYPE_CHIP_CS == s.getAttnType() ||
103 libhei::ATTN_TYPE_UNIT_CS == s.getAttnType())
104 {
105 o_rootCause = s;
106 return true;
107 }
108 }
109
110 // Now, look for any channel failure attentions on the processor side of the
111 // memory bus.
112 for (const auto& s : i_list)
113 {
114 if (TYPE_PROC != getTrgtType(getTrgt(s.getChip())))
115 {
116 continue; // processors only
117 }
118
119 // Any unit checkstop attentions that originated from the MC_DSTL_FIR or
120 // MC_USTLFIR are considered a channel failure attention.
121 // TODO: The "channel failure" designation is actually configurable via
122 // other registers. We just happen to expect anything that is
123 // configured to channel failure to also be configured to unit
124 // checkstop. Eventually, we will need some mechanism to check the
125 // configuration registers for a more accurate analysis.
126 if (libhei::ATTN_TYPE_UNIT_CS == s.getAttnType() &&
127 (mc_dstl_fir == s.getId() || mc_ustl_fir == s.getId()) &&
128 !i_rasData.isFlagSet(s,
129 RasDataParser::RasDataFlags::ATTN_FROM_OCMB))
130 {
131 o_rootCause = s;
132 return true;
133 }
134 // Any signatures from MC_OMI_DL_ERR_RPT feed into the only bits in
135 // MC_OMI_DL_FIR that are hardwired to channel failure.
136 else if (mc_omi_dl_err_rpt == s.getId())
137 {
138 o_rootCause = s;
139 return true;
140 }
141 }
142
143 return false; // default, nothing found
144 }
145
146 //------------------------------------------------------------------------------
147
148 // Will query if a signature is a potential system checkstop root cause.
149 // attention. Note that this function excludes memory channel failure attentions
150 // which are checked in __findMemoryChannelFailure().
__findCsRootCause(const libhei::Signature & i_signature,const RasDataParser & i_rasData)151 bool __findCsRootCause(const libhei::Signature& i_signature,
152 const RasDataParser& i_rasData)
153 {
154 // Check if the input signature has the CS_POSSIBLE or SUE_SOURCE flag set.
155 if (i_rasData.isFlagSet(i_signature,
156 RasDataParser::RasDataFlags::CS_POSSIBLE) ||
157 i_rasData.isFlagSet(i_signature,
158 RasDataParser::RasDataFlags::SUE_SOURCE))
159 {
160 return true;
161 }
162
163 return false; // default, nothing found
164 }
165
166 //------------------------------------------------------------------------------
167
__findCsRootCause_RE(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)168 bool __findCsRootCause_RE(const std::vector<libhei::Signature>& i_list,
169 libhei::Signature& o_rootCause,
170 const RasDataParser& i_rasData)
171 {
172 for (const auto& s : i_list)
173 {
174 // Only looking for recoverable attentions.
175 if (libhei::ATTN_TYPE_RECOVERABLE != s.getAttnType())
176 {
177 continue;
178 }
179
180 if (__findCsRootCause(s, i_rasData))
181 {
182 o_rootCause = s;
183 return true;
184 }
185 }
186
187 return false; // default, nothing found
188 }
189
190 //------------------------------------------------------------------------------
191
__findCsRootCause_UCS(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)192 bool __findCsRootCause_UCS(const std::vector<libhei::Signature>& i_list,
193 libhei::Signature& o_rootCause,
194 const RasDataParser& i_rasData)
195 {
196 for (const auto& s : i_list)
197 {
198 // Only looking for unit checkstop attentions.
199 if (libhei::ATTN_TYPE_UNIT_CS != s.getAttnType())
200 {
201 continue;
202 }
203
204 if (__findCsRootCause(s, i_rasData))
205 {
206 o_rootCause = s;
207 return true;
208 }
209 }
210
211 return false; // default, nothing found
212 }
213
214 //------------------------------------------------------------------------------
215
__findOcmbAttnBits(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)216 bool __findOcmbAttnBits(const std::vector<libhei::Signature>& i_list,
217 libhei::Signature& o_rootCause,
218 const RasDataParser& i_rasData)
219 {
220 using namespace util::pdbg;
221
222 // If we have any attentions from an OCMB, assume isolation to the OCMBs
223 // was successful and the ATTN_FROM_OCMB flag does not need to be checked.
224 for (const auto& s : i_list)
225 {
226 if (TYPE_OCMB == getTrgtType(getTrgt(s.getChip())))
227 {
228 return false;
229 }
230 }
231
232 for (const auto& s : i_list)
233 {
234 if (i_rasData.isFlagSet(s, RasDataParser::RasDataFlags::ATTN_FROM_OCMB))
235 {
236 o_rootCause = s;
237 return true;
238 }
239 }
240
241 return false; // default, nothing found
242 }
243
244 //------------------------------------------------------------------------------
245
__findNonExternalCs(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause)246 bool __findNonExternalCs(const std::vector<libhei::Signature>& i_list,
247 libhei::Signature& o_rootCause)
248 {
249 using namespace util::pdbg;
250
251 static const auto pb_ext_fir = libhei::hash<libhei::NodeId_t>("PB_EXT_FIR");
252
253 for (const auto& s : i_list)
254 {
255 const auto targetType = getTrgtType(getTrgt(s.getChip()));
256 const auto id = s.getId();
257 const auto attnType = s.getAttnType();
258
259 // Find any processor with chip checkstop attention that did not
260 // originate from the PB_EXT_FIR.
261 if ((TYPE_PROC == targetType) &&
262 (libhei::ATTN_TYPE_CHIP_CS == attnType) && (pb_ext_fir != id))
263 {
264 o_rootCause = s;
265 return true;
266 }
267 }
268
269 return false; // default, nothing found
270 }
271
272 //------------------------------------------------------------------------------
273
__findTiRootCause(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)274 bool __findTiRootCause(const std::vector<libhei::Signature>& i_list,
275 libhei::Signature& o_rootCause,
276 const RasDataParser& i_rasData)
277 {
278 using namespace util::pdbg;
279 using rdf = RasDataParser::RasDataFlags;
280
281 for (const auto& signature : i_list)
282 {
283 const auto attnType = signature.getAttnType();
284
285 // Only looking for recoverable or unit checkstop attentions.
286 if (libhei::ATTN_TYPE_RECOVERABLE != attnType &&
287 libhei::ATTN_TYPE_UNIT_CS != attnType)
288 {
289 continue;
290 }
291
292 // Skip any signature with the 'recovered_error' or 'informational_only'
293 // flags.
294 if (i_rasData.isFlagSet(signature, rdf::RECOVERED_ERROR) ||
295 i_rasData.isFlagSet(signature, rdf::INFORMATIONAL_ONLY) ||
296 i_rasData.isFlagSet(signature, rdf::MNFG_INFORMATIONAL_ONLY))
297 {
298 continue;
299 }
300
301 // At this point, the attention has not been explicitly ignored. So
302 // return this signature and exit.
303 o_rootCause = signature;
304 return true;
305 }
306
307 return false; // default, nothing found
308 }
309
310 //------------------------------------------------------------------------------
311
findRootCause(AnalysisType i_type,const libhei::IsolationData & i_isoData,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)312 bool findRootCause(AnalysisType i_type, const libhei::IsolationData& i_isoData,
313 libhei::Signature& o_rootCause,
314 const RasDataParser& i_rasData)
315 {
316 // We'll need to make a copy of the list so that the original list is
317 // maintained for the PEL.
318 std::vector<libhei::Signature> list{i_isoData.getSignatureList()};
319
320 // START WORKAROUND
321 // TODO: Filtering should be data driven. Until that support is available,
322 // use the following isolation rules.
323
324 // Ensure the list is not empty before continuing.
325 if (list.empty())
326 {
327 return false; // nothing more to do
328 }
329
330 // First, look for any RCS OSC errors. This must always be first because
331 // they can cause downstream PLL unlock attentions.
332 if (__findRcsOscError(list, o_rootCause))
333 {
334 return true;
335 }
336
337 // Second, look for any PLL unlock attentions. This must always be second
338 // because PLL unlock attentions can cause any number of downstream
339 // attentions, including a system checkstop.
340 if (__findPllUnlock(list, o_rootCause))
341 {
342 return true;
343 }
344
345 // Regardless of the analysis type, always look for anything that could be
346 // blamed as the root cause of a system checkstop.
347
348 // Memory channel failure attentions will produce SUEs and likely cause
349 // downstream attentions, including a system checkstop.
350 if (__findMemoryChannelFailure(list, o_rootCause, i_rasData))
351 {
352 return true;
353 }
354
355 // Look for any recoverable attentions that have been identified as a
356 // potential root cause of a system checkstop attention. These would include
357 // any attention that would generate an SUE. Note that is it possible for
358 // recoverables to generate unit checkstop attentions so we must check them
359 // first.
360 if (__findCsRootCause_RE(list, o_rootCause, i_rasData))
361 {
362 return true;
363 }
364
365 // Look for any unit checkstop attentions (other than memory channel
366 // failures) that have been identified as a potential root cause of a
367 // system checkstop attention. These would include any attention that would
368 // generate an SUE.
369 if (__findCsRootCause_UCS(list, o_rootCause, i_rasData))
370 {
371 return true;
372 }
373
374 // If no other viable root cause has been found, check for any signatures
375 // with the ATTN_FROM_OCMB flag in case there was an attention from an
376 // inaccessible OCMB.
377 if (__findOcmbAttnBits(list, o_rootCause, i_rasData))
378 {
379 return true;
380 }
381
382 // Look for any system checkstop attentions that originated from within the
383 // chip that reported the attention. In other words, no external checkstop
384 // attentions.
385 if (__findNonExternalCs(list, o_rootCause))
386 {
387 return true;
388 }
389
390 if (AnalysisType::SYSTEM_CHECKSTOP != i_type)
391 {
392 // No system checkstop root cause attentions were found. Next, look for
393 // any recoverable or unit checkstop attentions that could be associated
394 // with a TI.
395 if (__findTiRootCause(list, o_rootCause, i_rasData))
396 {
397 return true;
398 }
399
400 if (AnalysisType::TERMINATE_IMMEDIATE != i_type)
401 {
402 // No attentions associated with a system checkstop or TI were
403 // found. Simply, return the first entry in the list.
404 o_rootCause = list.front();
405 return true;
406 }
407 }
408
409 // END WORKAROUND
410
411 return false; // default, no active attentions found.
412 }
413
414 //------------------------------------------------------------------------------
415
__findIueTh(const std::vector<libhei::Signature> & i_list,libhei::Signature & o_rootCause)416 bool __findIueTh(const std::vector<libhei::Signature>& i_list,
417 libhei::Signature& o_rootCause)
418 {
419 auto itr = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) {
420 return (libhei::hash<libhei::NodeId_t>("RDFFIR") == t.getId() &&
421 (17 == t.getBit() || 37 == t.getBit())) ||
422 (libhei::hash<libhei::NodeId_t>("RDF_FIR") == t.getId() &&
423 (18 == t.getBit() || 38 == t.getBit()));
424 });
425
426 if (i_list.end() != itr)
427 {
428 o_rootCause = *itr;
429 return true;
430 }
431
432 return false;
433 }
434
435 //------------------------------------------------------------------------------
436
rootCauseSpecialCases(const libhei::IsolationData & i_isoData,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)437 void rootCauseSpecialCases(const libhei::IsolationData& i_isoData,
438 libhei::Signature& o_rootCause,
439 const RasDataParser& i_rasData)
440 {
441 using func = libhei::NodeId_t (*)(const std::string& i_str);
442 func __hash = libhei::hash<libhei::NodeId_t>;
443
444 // Check for any special cases that exist for specific FIR bits.
445
446 // If the channel fail was specifically a firmware initiated channel fail
447 // (SRQFIR[25] for Explorer OCMBs, SRQ_FIR[46] for Odyssey OCMBs) check for
448 // any IUE bits that are on that would have caused the channel fail
449 // (RDFFIR[17,37] for Explorer OCMBs, RDF_FIR_0[18,38] or RDF_FIR_1[18,38]
450 // for Odyssey OCMBs).
451
452 // Explorer SRQFIR
453 static const auto srqfir = __hash("SRQFIR");
454 // Odyssey SRQ_FIR
455 static const auto srq_fir = __hash("SRQ_FIR");
456
457 std::vector<libhei::Signature> list{i_isoData.getSignatureList()};
458
459 if (((srqfir == o_rootCause.getId() && 25 == o_rootCause.getBit()) ||
460 (srq_fir == o_rootCause.getId() && 46 == o_rootCause.getBit())) &&
461 __findIueTh(list, o_rootCause))
462 {
463 // If __findIueTh returned true, o_rootCause was updated, return.
464 return;
465 }
466
467 // Check if the root cause found was a potential side effect of an
468 // ODP data corruption error. If it was, check if any other signature
469 // in the signature list was a potential root cause.
470 auto OdpSide = RasDataParser::RasDataFlags::ODP_DATA_CORRUPT_SIDE_EFFECT;
471 auto OdpRoot = RasDataParser::RasDataFlags::ODP_DATA_CORRUPT_ROOT_CAUSE;
472 if (i_rasData.isFlagSet(o_rootCause, OdpSide))
473 {
474 for (const auto& s : list)
475 {
476 if (i_rasData.isFlagSet(s, OdpRoot))
477 {
478 // ODP data corruption root cause found, return.
479 o_rootCause = s;
480 return;
481 }
482 }
483 }
484
485 // Odyssey RDF_FIR
486 static const auto rdf_fir = __hash("RDF_FIR");
487
488 // RDF_FIR[41] can be the root cause of RDF_FIR[16], so if bit 16 is on,
489 // check if bit 41 is also on.
490 if (rdf_fir == o_rootCause.getId() && 16 == o_rootCause.getBit())
491 {
492 // Look for RDF_FIR[41]
493 auto itr = std::find_if(list.begin(), list.end(), [&](const auto& t) {
494 return (rdf_fir == t.getId() && 41 == t.getBit());
495 });
496 if (list.end() != itr)
497 {
498 o_rootCause = *itr;
499 }
500 }
501 }
502
503 //------------------------------------------------------------------------------
504
filterRootCause(AnalysisType i_type,const libhei::IsolationData & i_isoData,libhei::Signature & o_rootCause,const RasDataParser & i_rasData)505 bool filterRootCause(AnalysisType i_type,
506 const libhei::IsolationData& i_isoData,
507 libhei::Signature& o_rootCause,
508 const RasDataParser& i_rasData)
509 {
510 // Find the initial root cause attention based on common rules for FIR
511 // isolation.
512 bool rc = findRootCause(i_type, i_isoData, o_rootCause, i_rasData);
513
514 // If some root cause was found, handle any special cases for specific FIR
515 // bits that require additional logic to determine the root cause.
516 if (true == rc)
517 {
518 rootCauseSpecialCases(i_isoData, o_rootCause, i_rasData);
519 }
520
521 return rc;
522 }
523
524 //------------------------------------------------------------------------------
525
526 } // namespace analyzer
527