1 #include <assert.h>
2 
3 #include <analyzer_main.hpp>
4 #include <hei_main.hpp>
5 #include <hei_util.hpp>
6 #include <util/pdbg.hpp>
7 
8 #include <algorithm>
9 #include <limits>
10 #include <string>
11 
12 namespace analyzer
13 {
14 
15 //------------------------------------------------------------------------------
16 
17 bool __findRcsOscError(const std::vector<libhei::Signature>& i_list,
18                        libhei::Signature& o_rootCause)
19 {
20     // TODO: Consider returning all of them instead of one as root cause.
21     auto itr = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) {
22         return (libhei::hash<libhei::NodeId_t>("TP_LOCAL_FIR") == t.getId() &&
23                 (42 == t.getBit() || 43 == t.getBit()));
24     });
25 
26     if (i_list.end() != itr)
27     {
28         o_rootCause = *itr;
29         return true;
30     }
31 
32     return false;
33 }
34 
35 //------------------------------------------------------------------------------
36 
37 bool __findPllUnlock(const std::vector<libhei::Signature>& i_list,
38                      libhei::Signature& o_rootCause)
39 {
40     // TODO: Consider returning all of them instead of one as root cause.
41     auto itr = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) {
42         return (libhei::hash<libhei::NodeId_t>("PLL_UNLOCK") == t.getId() &&
43                 (0 == t.getBit() || 1 == t.getBit()));
44     });
45 
46     if (i_list.end() != itr)
47     {
48         o_rootCause = *itr;
49         return true;
50     }
51 
52     return false;
53 }
54 
55 //------------------------------------------------------------------------------
56 
57 bool __findMemoryChannelFailure(const std::vector<libhei::Signature>& i_list,
58                                 libhei::Signature& o_rootCause)
59 {
60     using namespace util::pdbg;
61 
62     using func  = libhei::NodeId_t (*)(const std::string& i_str);
63     func __hash = libhei::hash<libhei::NodeId_t>;
64 
65     static const auto mc_dstl_fir       = __hash("MC_DSTL_FIR");
66     static const auto mc_ustl_fir       = __hash("MC_USTL_FIR");
67     static const auto mc_omi_dl_err_rpt = __hash("MC_OMI_DL_ERR_RPT");
68 
69     for (const auto s : i_list)
70     {
71         const auto targetType = getTrgtType(getTrgt(s.getChip()));
72         const auto id         = s.getId();
73         const auto bit        = s.getBit();
74         const auto attnType   = s.getAttnType();
75 
76         // Look for any unit checkstop attentions from OCMBs.
77         if (TYPE_OCMB == targetType)
78         {
79             // Any unit checkstop attentions will trigger a channel failure.
80             if (libhei::ATTN_TYPE_UNIT_CS == attnType)
81             {
82                 o_rootCause = s;
83                 return true;
84             }
85         }
86         // Look for channel failure attentions on processors.
87         else if (TYPE_PROC == targetType)
88         {
89             // TODO: All of these channel failure bits are configurable.
90             //       Eventually, we will need some mechanism to check that
91             //       config registers for a more accurate analysis. For now,
92             //       simply check for all bits that could potentially be
93             //       configured to channel failure.
94 
95             // Any unit checkstop bit in the MC_DSTL_FIR or MC_USTL_FIR could
96             // be a channel failure.
97             if (libhei::ATTN_TYPE_UNIT_CS == attnType)
98             {
99                 // Ignore bits MC_DSTL_FIR[0:7] because they simply indicate
100                 // attentions occurred on the attached OCMBs.
101                 if ((mc_dstl_fir == id && 8 <= bit) || (mc_ustl_fir == id))
102                 {
103                     o_rootCause = s;
104                     return true;
105                 }
106             }
107 
108             // All bits in MC_OMI_DL_ERR_RPT eventually feed into
109             // MC_OMI_DL_FIR[0,20] which are configurable to channel failure.
110             if (mc_omi_dl_err_rpt == id)
111             {
112                 o_rootCause = s;
113                 return true;
114             }
115         }
116     }
117 
118     return false; // default, nothing found
119 }
120 
121 //------------------------------------------------------------------------------
122 
123 // Will query if a signature is a potential system checkstop root cause.
124 // attention. Note that this function excludes memory channel failure attentions
125 // and core unit checkstop attentions.
126 bool __findCsRootCause(const libhei::Signature& i_signature)
127 {
128     using namespace util::pdbg;
129 
130     using func  = libhei::NodeId_t (*)(const std::string& i_str);
131     func __hash = libhei::hash<libhei::NodeId_t>;
132 
133     // PROC registers
134     static const auto eq_core_fir      = __hash("EQ_CORE_FIR");
135     static const auto eq_l2_fir        = __hash("EQ_L2_FIR");
136     static const auto eq_l3_fir        = __hash("EQ_L3_FIR");
137     static const auto eq_ncu_fir       = __hash("EQ_NCU_FIR");
138     static const auto iohs_dlp_fir_oc  = __hash("IOHS_DLP_FIR_OC");
139     static const auto iohs_dlp_fir_smp = __hash("IOHS_DLP_FIR_SMP");
140     static const auto nx_cq_fir        = __hash("NX_CQ_FIR");
141     static const auto nx_dma_eng_fir   = __hash("NX_DMA_ENG_FIR");
142     static const auto pau_fir_0        = __hash("PAU_FIR_0");
143     static const auto pau_fir_1        = __hash("PAU_FIR_1");
144     static const auto pau_fir_2        = __hash("PAU_FIR_2");
145     static const auto pau_ptl_fir      = __hash("PAU_PTL_FIR");
146 
147     // OCMB registers
148     static const auto rdffir = __hash("RDFFIR");
149 
150     const auto targetType = getTrgtType(getTrgt(i_signature.getChip()));
151     const auto id         = i_signature.getId();
152     const auto bit        = i_signature.getBit();
153 
154     if (TYPE_PROC == targetType)
155     {
156         if (eq_core_fir == id &&
157             (3 == bit || 5 == bit || 8 == bit || 12 == bit || 22 == bit ||
158              25 == bit || 32 == bit || 36 == bit || 38 == bit || 46 == bit ||
159              47 == bit || 57 == bit))
160         {
161             return true;
162         }
163 
164         if (eq_l2_fir == id &&
165             (1 == bit || 12 == bit || 13 == bit || 17 == bit || 18 == bit ||
166              20 == bit || 27 == bit))
167         {
168             return true;
169         }
170 
171         if (eq_l3_fir == id &&
172             (2 == bit || 5 == bit || 8 == bit || 11 == bit || 17 == bit))
173         {
174             return true;
175         }
176 
177         if (eq_ncu_fir == id && (3 == bit || 4 == bit || 5 == bit || 7 == bit ||
178                                  8 == bit || 10 == bit || 17 == bit))
179         {
180             return true;
181         }
182 
183         if (iohs_dlp_fir_oc == id && (54 <= bit && bit <= 61))
184         {
185             return true;
186         }
187 
188         if (iohs_dlp_fir_smp == id && (54 <= bit && bit <= 61))
189         {
190             return true;
191         }
192 
193         if (nx_cq_fir == id && (7 == bit || 16 == bit || 21 == bit))
194         {
195             return true;
196         }
197 
198         if (nx_dma_eng_fir == id && (0 == bit))
199         {
200             return true;
201         }
202 
203         if (pau_fir_0 == id &&
204             (15 == bit || 18 == bit || 19 == bit || 25 == bit || 26 == bit ||
205              29 == bit || 33 == bit || 34 == bit || 35 == bit || 40 == bit ||
206              42 == bit || 44 == bit || 45 == bit))
207         {
208             return true;
209         }
210 
211         if (pau_fir_1 == id &&
212             (13 == bit || 14 == bit || 15 == bit || 37 == bit || 39 == bit ||
213              40 == bit || 41 == bit || 42 == bit))
214         {
215             return true;
216         }
217 
218         if (pau_fir_2 == id &&
219             ((4 <= bit && bit <= 18) || (20 <= bit && bit <= 31) ||
220              (36 <= bit && bit <= 41) || 45 == bit || 47 == bit || 48 == bit ||
221              50 == bit || 51 == bit || 52 == bit))
222         {
223             return true;
224         }
225 
226         if (pau_ptl_fir == id && (4 == bit || 8 == bit))
227         {
228             return true;
229         }
230     }
231     else if (TYPE_OCMB == targetType)
232     {
233         if (rdffir == id && (14 == bit || 15 == bit || 17 == bit || 37 == bit))
234         {
235             return true;
236         }
237     }
238 
239     return false; // default, nothing found
240 }
241 
242 //------------------------------------------------------------------------------
243 
244 bool __findCsRootCause_RE(const std::vector<libhei::Signature>& i_list,
245                           libhei::Signature& o_rootCause)
246 {
247     for (const auto s : i_list)
248     {
249         // Only looking for recoverable attentions.
250         if (libhei::ATTN_TYPE_RECOVERABLE != s.getAttnType())
251         {
252             continue;
253         }
254 
255         if (__findCsRootCause(s))
256         {
257             o_rootCause = s;
258             return true;
259         }
260     }
261 
262     return false; // default, nothing found
263 }
264 
265 //------------------------------------------------------------------------------
266 
267 bool __findCsRootCause_UCS(const std::vector<libhei::Signature>& i_list,
268                            libhei::Signature& o_rootCause)
269 {
270     for (const auto s : i_list)
271     {
272         // Only looking for unit checkstop attentions.
273         if (libhei::ATTN_TYPE_UNIT_CS != s.getAttnType())
274         {
275             continue;
276         }
277 
278         if (__findCsRootCause(s))
279         {
280             o_rootCause = s;
281             return true;
282         }
283     }
284 
285     return false; // default, nothing found
286 }
287 
288 //------------------------------------------------------------------------------
289 
290 bool __findNonExternalCs(const std::vector<libhei::Signature>& i_list,
291                          libhei::Signature& o_rootCause)
292 {
293     using namespace util::pdbg;
294 
295     static const auto pb_ext_fir = libhei::hash<libhei::NodeId_t>("PB_EXT_FIR");
296 
297     for (const auto s : i_list)
298     {
299         const auto targetType = getTrgtType(getTrgt(s.getChip()));
300         const auto id         = s.getId();
301         const auto attnType   = s.getAttnType();
302 
303         // Find any processor with system checkstop attention that did not
304         // originate from the PB_EXT_FIR.
305         if ((TYPE_PROC == targetType) &&
306             (libhei::ATTN_TYPE_CHECKSTOP == attnType) && (pb_ext_fir != id))
307         {
308             o_rootCause = s;
309             return true;
310         }
311     }
312 
313     return false; // default, nothing found
314 }
315 
316 //------------------------------------------------------------------------------
317 
318 bool filterRootCause(AnalysisType i_type,
319                      const libhei::IsolationData& i_isoData,
320                      libhei::Signature& o_rootCause)
321 {
322     // We'll need to make a copy of the list so that the original list is
323     // maintained for the PEL.
324     std::vector<libhei::Signature> list{i_isoData.getSignatureList()};
325 
326     // START WORKAROUND
327     // TODO: Filtering should be data driven. Until that support is available,
328     //       use the following isolation rules.
329 
330     // Ensure the list is not empty before continuing.
331     if (list.empty())
332     {
333         return false; // nothing more to do
334     }
335 
336     // First, look for any RCS OSC errors. This must always be first because
337     // they can cause downstream PLL unlock attentions.
338     if (__findRcsOscError(list, o_rootCause))
339     {
340         return true;
341     }
342 
343     // Second, look for any PLL unlock attentions. This must always be second
344     // because PLL unlock attentions can cause any number of downstream
345     // attentions, including a system checkstop.
346     if (__findPllUnlock(list, o_rootCause))
347     {
348         return true;
349     }
350 
351     // Regardless of the analysis type, always look for anything that could be
352     // blamed as the root cause of a system checkstop.
353 
354     // Memory channel failure attentions will produce SUEs and likely cause
355     // downstream attentions, including a system checkstop.
356     if (__findMemoryChannelFailure(list, o_rootCause))
357     {
358         return true;
359     }
360 
361     // Look for any recoverable attentions that have been identified as a
362     // potential root cause of a system checkstop attention. These would include
363     // any attention that would generate an SUE. Note that is it possible for
364     // recoverables to generate unit checkstop attentions so we must check them
365     // first.
366     if (__findCsRootCause_RE(list, o_rootCause))
367     {
368         return true;
369     }
370 
371     // Look for any unit checkstop attentions (other than memory channel
372     // failures) that have been identified as a potential root cause of a
373     // system checkstop attention. These would include any attention that would
374     // generate an SUE.
375     if (__findCsRootCause_UCS(list, o_rootCause))
376     {
377         return true;
378     }
379 
380     // Look for any system checkstop attentions that originated from within the
381     // chip that reported the attention. In other words, no external checkstop
382     // attentions.
383     if (__findNonExternalCs(list, o_rootCause))
384     {
385         return true;
386     }
387 
388     if (AnalysisType::SYSTEM_CHECKSTOP != i_type)
389     {
390         // No system checkstop root cause attentions were found. Next, look for
391         // any recoverable or unit checkstop attentions that could be associated
392         // with a TI.
393 
394         auto itr = std::find_if(list.begin(), list.end(), [&](const auto& t) {
395             return (libhei::ATTN_TYPE_RECOVERABLE == t.getAttnType() ||
396                     libhei::ATTN_TYPE_UNIT_CS == t.getAttnType());
397         });
398 
399         if (list.end() != itr)
400         {
401             o_rootCause = *itr;
402             return true;
403         }
404 
405         if (AnalysisType::TERMINATE_IMMEDIATE != i_type)
406         {
407             // No attentions associated with a system checkstop or TI were
408             // found. Simply, return the first entry in the list.
409             o_rootCause = list.front();
410             return true;
411         }
412     }
413 
414     // END WORKAROUND
415 
416     return false; // default, no active attentions found.
417 }
418 
419 //------------------------------------------------------------------------------
420 
421 } // namespace analyzer
422