1 #include <assert.h>
2 
3 #include <analyzer_main.hpp>
4 #include <hei_main.hpp>
5 #include <util/pdbg.hpp>
6 
7 #include <algorithm>
8 #include <limits>
9 #include <string>
10 
11 namespace analyzer
12 {
13 
14 //------------------------------------------------------------------------------
15 
// Computes a simple positional hash of a string.
//
// The string is split into chunks of i_bytes characters. Each chunk is packed
// big-endian into an integer (the final chunk is padded with null bytes), and
// the result is "n*s[0] + (n-1)*s[1] + ... + s[n-1]", where s[i] is the i-th
// chunk and n is the number of chunks. The sum is truncated to the low
// i_bytes bytes.
//
// i_bytes  Width of the hash in bytes. Only 1-8 is supported (asserted).
// i_str    The string to hash. An empty string hashes to 0.
// return   The hash value, which always fits in i_bytes bytes.
uint64_t __hash(unsigned int i_bytes, const std::string& i_str)
{
    // Currently only supporting 1-8 byte hashes.
    assert(1 <= i_bytes && i_bytes <= sizeof(uint64_t));

    const auto length = i_str.size();

    // sumA is the running sum of the chunks and sumB is the running sum of
    // sumA, which yields the weighted sum described above.
    uint64_t sumA = 0;
    uint64_t sumB = 0;

    // Iterate one chunk at a time.
    for (std::string::size_type i = 0; i < length; i += i_bytes)
    {
        // Combine each chunk into a single integer value. If we reach the end
        // of the string, pad with null characters.
        uint64_t chunk = 0;
        for (unsigned int j = 0; j < i_bytes; j++)
        {
            unsigned char byte = 0;
            if (i + j < length)
            {
                // Go through unsigned char so that characters >= 0x80 are not
                // sign-extended on platforms where plain char is signed. That
                // would otherwise set every higher bit of the chunk and make
                // the hash depend on the platform.
                byte = static_cast<unsigned char>(i_str[i + j]);
            }

            chunk = (chunk << 8) | byte;
        }

        // Apply the simple hash.
        sumA += chunk;
        sumB += sumA;
    }

    // Mask off everything except the target number of bytes.
    const auto mask = std::numeric_limits<uint64_t>::max();
    return sumB & (mask >> ((sizeof(uint64_t) - i_bytes) * 8));
}
51 
52 //------------------------------------------------------------------------------
53 
54 bool __findRcsOscError(const std::vector<libhei::Signature>& i_list,
55                        libhei::Signature& o_rootCause)
56 {
57     // TODO: Consider returning all of them instead of one as root cause.
58     auto itr = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) {
59         return (__hash(2, "TP_LOCAL_FIR") == t.getId() &&
60                 (42 == t.getBit() || 43 == t.getBit()));
61     });
62 
63     if (i_list.end() != itr)
64     {
65         o_rootCause = *itr;
66         return true;
67     }
68 
69     return false;
70 }
71 
72 //------------------------------------------------------------------------------
73 
74 bool __findPllUnlock(const std::vector<libhei::Signature>& i_list,
75                      libhei::Signature& o_rootCause)
76 {
77     // TODO: Consider returning all of them instead of one as root cause.
78     auto itr = std::find_if(i_list.begin(), i_list.end(), [&](const auto& t) {
79         return (__hash(2, "PLL_UNLOCK") == t.getId() &&
80                 (0 == t.getBit() || 1 == t.getBit()));
81     });
82 
83     if (i_list.end() != itr)
84     {
85         o_rootCause = *itr;
86         return true;
87     }
88 
89     return false;
90 }
91 
92 //------------------------------------------------------------------------------
93 
94 bool __findMemoryChannelFailure(const std::vector<libhei::Signature>& i_list,
95                                 libhei::Signature& o_rootCause)
96 {
97     using namespace util::pdbg;
98 
99     static const auto mc_dstl_fir       = __hash(2, "MC_DSTL_FIR");
100     static const auto mc_ustl_fir       = __hash(2, "MC_USTL_FIR");
101     static const auto mc_omi_dl_err_rpt = __hash(2, "MC_OMI_DL_ERR_RPT");
102 
103     for (const auto s : i_list)
104     {
105         const auto targetType = getTrgtType(getTrgt(s.getChip()));
106         const auto id         = s.getId();
107         const auto bit        = s.getBit();
108         const auto attnType   = s.getAttnType();
109 
110         // Look for any unit checkstop attentions from OCMBs.
111         if (TYPE_OCMB == targetType)
112         {
113             // Any unit checkstop attentions will trigger a channel failure.
114             if (libhei::ATTN_TYPE_UNIT_CS == attnType)
115             {
116                 o_rootCause = s;
117                 return true;
118             }
119         }
120         // Look for channel failure attentions on processors.
121         else if (TYPE_PROC == targetType)
122         {
123             // TODO: All of these channel failure bits are configurable.
124             //       Eventually, we will need some mechanism to check that
125             //       config registers for a more accurate analysis. For now,
126             //       simply check for all bits that could potentially be
127             //       configured to channel failure.
128 
129             // Any unit checkstop bit in the MC_DSTL_FIR or MC_USTL_FIR could
130             // be a channel failure.
131             if (libhei::ATTN_TYPE_UNIT_CS == attnType)
132             {
133                 // Ignore bits MC_DSTL_FIR[0:7] because they simply indicate
134                 // attentions occurred on the attached OCMBs.
135                 if ((mc_dstl_fir == id && 8 <= bit) || (mc_ustl_fir == id))
136                 {
137                     o_rootCause = s;
138                     return true;
139                 }
140             }
141 
142             // All bits in MC_OMI_DL_ERR_RPT eventually feed into
143             // MC_OMI_DL_FIR[0,20] which are configurable to channel failure.
144             if (mc_omi_dl_err_rpt == id)
145             {
146                 o_rootCause = s;
147                 return true;
148             }
149         }
150     }
151 
152     return false; // default, nothing found
153 }
154 
155 //------------------------------------------------------------------------------
156 
157 // Will query if a signature is a potential system checkstop root cause.
158 // attention. Note that this function excludes memory channel failure attentions
159 // and core unit checkstop attentions.
160 bool __findCsRootCause(const libhei::Signature& i_signature)
161 {
162     using namespace util::pdbg;
163 
164     // PROC registers
165     static const auto eq_core_fir      = __hash(2, "EQ_CORE_FIR");
166     static const auto eq_l2_fir        = __hash(2, "EQ_L2_FIR");
167     static const auto eq_l3_fir        = __hash(2, "EQ_L3_FIR");
168     static const auto eq_ncu_fir       = __hash(2, "EQ_NCU_FIR");
169     static const auto iohs_dlp_fir_oc  = __hash(2, "IOHS_DLP_FIR_OC");
170     static const auto iohs_dlp_fir_smp = __hash(2, "IOHS_DLP_FIR_SMP");
171     static const auto nx_cq_fir        = __hash(2, "NX_CQ_FIR");
172     static const auto nx_dma_eng_fir   = __hash(2, "NX_DMA_ENG_FIR");
173     static const auto pau_fir_0        = __hash(2, "PAU_FIR_0");
174     static const auto pau_fir_1        = __hash(2, "PAU_FIR_1");
175     static const auto pau_fir_2        = __hash(2, "PAU_FIR_2");
176     static const auto pau_ptl_fir      = __hash(2, "PAU_PTL_FIR");
177 
178     // OCMB registers
179     static const auto rdffir = __hash(2, "RDFFIR");
180 
181     const auto targetType = getTrgtType(getTrgt(i_signature.getChip()));
182     const auto id         = i_signature.getId();
183     const auto bit        = i_signature.getBit();
184 
185     if (TYPE_PROC == targetType)
186     {
187         if (eq_core_fir == id &&
188             (3 == bit || 5 == bit || 8 == bit || 12 == bit || 22 == bit ||
189              25 == bit || 32 == bit || 36 == bit || 38 == bit || 46 == bit ||
190              47 == bit || 57 == bit))
191         {
192             return true;
193         }
194 
195         if (eq_l2_fir == id &&
196             (1 == bit || 12 == bit || 13 == bit || 17 == bit || 18 == bit ||
197              20 == bit || 27 == bit))
198         {
199             return true;
200         }
201 
202         if (eq_l3_fir == id &&
203             (2 == bit || 5 == bit || 8 == bit || 11 == bit || 17 == bit))
204         {
205             return true;
206         }
207 
208         if (eq_ncu_fir == id && (3 == bit || 4 == bit || 5 == bit || 7 == bit ||
209                                  8 == bit || 10 == bit || 17 == bit))
210         {
211             return true;
212         }
213 
214         if (iohs_dlp_fir_oc == id && (54 <= bit && bit <= 61))
215         {
216             return true;
217         }
218 
219         if (iohs_dlp_fir_smp == id && (54 <= bit && bit <= 61))
220         {
221             return true;
222         }
223 
224         if (nx_cq_fir == id && (7 == bit || 16 == bit || 21 == bit))
225         {
226             return true;
227         }
228 
229         if (nx_dma_eng_fir == id && (0 == bit))
230         {
231             return true;
232         }
233 
234         if (pau_fir_0 == id &&
235             (15 == bit || 18 == bit || 19 == bit || 25 == bit || 26 == bit ||
236              29 == bit || 33 == bit || 34 == bit || 35 == bit || 40 == bit ||
237              42 == bit || 44 == bit || 45 == bit))
238         {
239             return true;
240         }
241 
242         if (pau_fir_1 == id &&
243             (13 == bit || 14 == bit || 15 == bit || 37 == bit || 39 == bit ||
244              40 == bit || 41 == bit || 42 == bit))
245         {
246             return true;
247         }
248 
249         if (pau_fir_2 == id &&
250             ((4 <= bit && bit <= 18) || (20 <= bit && bit <= 31) ||
251              (36 <= bit && bit <= 41) || 45 == bit || 47 == bit || 48 == bit ||
252              50 == bit || 51 == bit || 52 == bit))
253         {
254             return true;
255         }
256 
257         if (pau_ptl_fir == id && (4 == bit || 8 == bit))
258         {
259             return true;
260         }
261     }
262     else if (TYPE_OCMB == targetType)
263     {
264         if (rdffir == id && (14 == bit || 15 == bit || 17 == bit || 37 == bit))
265         {
266             return true;
267         }
268     }
269 
270     return false; // default, nothing found
271 }
272 
273 //------------------------------------------------------------------------------
274 
275 bool __findCsRootCause_RE(const std::vector<libhei::Signature>& i_list,
276                           libhei::Signature& o_rootCause)
277 {
278     for (const auto s : i_list)
279     {
280         // Only looking for recoverable attentions.
281         if (libhei::ATTN_TYPE_RECOVERABLE != s.getAttnType())
282         {
283             continue;
284         }
285 
286         if (__findCsRootCause(s))
287         {
288             o_rootCause = s;
289             return true;
290         }
291     }
292 
293     return false; // default, nothing found
294 }
295 
296 //------------------------------------------------------------------------------
297 
298 bool __findCsRootCause_UCS(const std::vector<libhei::Signature>& i_list,
299                            libhei::Signature& o_rootCause)
300 {
301     for (const auto s : i_list)
302     {
303         // Only looking for unit checkstop attentions.
304         if (libhei::ATTN_TYPE_UNIT_CS != s.getAttnType())
305         {
306             continue;
307         }
308 
309         if (__findCsRootCause(s))
310         {
311             o_rootCause = s;
312             return true;
313         }
314     }
315 
316     return false; // default, nothing found
317 }
318 
319 //------------------------------------------------------------------------------
320 
321 bool __findNonExternalCs(const std::vector<libhei::Signature>& i_list,
322                          libhei::Signature& o_rootCause)
323 {
324     using namespace util::pdbg;
325 
326     static const auto pb_ext_fir = __hash(2, "PB_EXT_FIR");
327 
328     for (const auto s : i_list)
329     {
330         const auto targetType = getTrgtType(getTrgt(s.getChip()));
331         const auto id         = s.getId();
332         const auto attnType   = s.getAttnType();
333 
334         // Find any processor with system checkstop attention that did not
335         // originate from the PB_EXT_FIR.
336         if ((TYPE_PROC == targetType) &&
337             (libhei::ATTN_TYPE_CHECKSTOP == attnType) && (pb_ext_fir != id))
338         {
339             o_rootCause = s;
340             return true;
341         }
342     }
343 
344     return false; // default, nothing found
345 }
346 
347 //------------------------------------------------------------------------------
348 
349 bool filterRootCause(AnalysisType i_type,
350                      const libhei::IsolationData& i_isoData,
351                      libhei::Signature& o_rootCause)
352 {
353     // We'll need to make a copy of the list so that the original list is
354     // maintained for the PEL.
355     std::vector<libhei::Signature> list{i_isoData.getSignatureList()};
356 
357     // START WORKAROUND
358     // TODO: Filtering should be data driven. Until that support is available,
359     //       use the following isolation rules.
360 
361     // Ensure the list is not empty before continuing.
362     if (list.empty())
363     {
364         return false; // nothing more to do
365     }
366 
367     // First, look for any RCS OSC errors. This must always be first because
368     // they can cause downstream PLL unlock attentions.
369     if (__findRcsOscError(list, o_rootCause))
370     {
371         return true;
372     }
373 
374     // Second, look for any PLL unlock attentions. This must always be second
375     // because PLL unlock attentions can cause any number of downstream
376     // attentions, including a system checkstop.
377     if (__findPllUnlock(list, o_rootCause))
378     {
379         return true;
380     }
381 
382     // Regardless of the analysis type, always look for anything that could be
383     // blamed as the root cause of a system checkstop.
384 
385     // Memory channel failure attentions will produce SUEs and likely cause
386     // downstream attentions, including a system checkstop.
387     if (__findMemoryChannelFailure(list, o_rootCause))
388     {
389         return true;
390     }
391 
392     // Look for any recoverable attentions that have been identified as a
393     // potential root cause of a system checkstop attention. These would include
394     // any attention that would generate an SUE. Note that is it possible for
395     // recoverables to generate unit checkstop attentions so we must check them
396     // first.
397     if (__findCsRootCause_RE(list, o_rootCause))
398     {
399         return true;
400     }
401 
402     // Look for any unit checkstop attentions (other than memory channel
403     // failures) that have been identified as a potential root cause of a
404     // system checkstop attention. These would include any attention that would
405     // generate an SUE.
406     if (__findCsRootCause_UCS(list, o_rootCause))
407     {
408         return true;
409     }
410 
411     // Look for any system checkstop attentions that originated from within the
412     // chip that reported the attention. In other words, no external checkstop
413     // attentions.
414     if (__findNonExternalCs(list, o_rootCause))
415     {
416         return true;
417     }
418 
419     if (AnalysisType::SYSTEM_CHECKSTOP != i_type)
420     {
421         // No system checkstop root cause attentions were found. Next, look for
422         // any recoverable or unit checkstop attentions that could be associated
423         // with a TI.
424 
425         auto itr = std::find_if(list.begin(), list.end(), [&](const auto& t) {
426             return (libhei::ATTN_TYPE_RECOVERABLE == t.getAttnType() ||
427                     libhei::ATTN_TYPE_UNIT_CS == t.getAttnType());
428         });
429 
430         if (list.end() != itr)
431         {
432             o_rootCause = *itr;
433             return true;
434         }
435 
436         if (AnalysisType::TERMINATE_IMMEDIATE != i_type)
437         {
438             // No attentions associated with a system checkstop or TI were
439             // found. Simply, return the first entry in the list.
440             o_rootCause = list.front();
441             return true;
442         }
443     }
444 
445     // END WORKAROUND
446 
447     return false; // default, no active attentions found.
448 }
449 
450 //------------------------------------------------------------------------------
451 
452 } // namespace analyzer
453