1 #include <assert.h>
2 #include <unistd.h>
3 
4 #include <analyzer/analyzer_main.hpp>
5 #include <analyzer/ras-data/ras-data-parser.hpp>
6 #include <analyzer/service_data.hpp>
7 #include <attn/attn_dump.hpp>
8 #include <hei_main.hpp>
9 #include <util/pdbg.hpp>
10 #include <util/trace.hpp>
11 
12 namespace analyzer
13 {
14 //------------------------------------------------------------------------------
15 
16 // Forward references for externally defined functions.
17 
/**
 * @brief Will get the list of active chips and initialize the isolator.
 * @param o_chips The returned list of active chips.
 * @note  This is only a forward declaration; the function is defined
 *        externally to this file.
 */
void initializeIsolator(std::vector<libhei::Chip>& o_chips);
23 
/**
 * @brief  Will filter the data gathered during isolation in order to find the
 *         root cause attention.
 * @param  i_type      The type of analysis to perform. See enum for details.
 * @param  i_isoData   The data gathered during isolation (for FFDC).
 * @param  o_rootCause The returned root cause signature.
 * @param  i_rasData   The RAS data parser.
 * @return True, if root cause has been found. False, otherwise.
 * @note   This is only a forward declaration; the function is defined
 *         externally to this file. The caller in this file guards the call
 *         with a catch of std::exception and treats an exception as "no root
 *         cause found".
 */
bool filterRootCause(AnalysisType i_type,
                     const libhei::IsolationData& i_isoData,
                     libhei::Signature& o_rootCause,
                     const RasDataParser& i_rasData);
36 
/**
 * @brief Will create and submit a PEL using the given data.
 * @param i_servData  Data regarding service actions gathered during analysis.
 * @return The platform log ID (PLID). Will return zero if no PEL is generated,
 *         which the caller in this file traces as a failure.
 * @note  This is only a forward declaration; the function is defined
 *        externally to this file.
 */
uint32_t commitPel(const ServiceData& i_servData);
43 
44 //------------------------------------------------------------------------------
45 
__attn(libhei::AttentionType_t i_type)46 const char* __attn(libhei::AttentionType_t i_type)
47 {
48     const char* str = "";
49     switch (i_type)
50     {
51         case libhei::ATTN_TYPE_CHIP_CS:
52             str = "CHIP_CS";
53             break;
54         case libhei::ATTN_TYPE_UNIT_CS:
55             str = "UNIT_CS";
56             break;
57         case libhei::ATTN_TYPE_RECOVERABLE:
58             str = "RECOVERABLE";
59             break;
60         case libhei::ATTN_TYPE_SP_ATTN:
61             str = "SP_ATTN";
62             break;
63         case libhei::ATTN_TYPE_HOST_ATTN:
64             str = "HOST_ATTN";
65             break;
66         default:
67             trace::err("Unsupported attention type: %u", i_type);
68             assert(0);
69     }
70     return str;
71 }
72 
73 //------------------------------------------------------------------------------
74 
__analysisType(AnalysisType i_type)75 const char* __analysisType(AnalysisType i_type)
76 {
77     const char* str = "";
78     switch (i_type)
79     {
80         case AnalysisType::SYSTEM_CHECKSTOP:
81             str = "SYSTEM_CHECKSTOP";
82             break;
83         case AnalysisType::TERMINATE_IMMEDIATE:
84             str = "TERMINATE_IMMEDIATE";
85             break;
86         case AnalysisType::MANUAL:
87             str = "MANUAL";
88             break;
89         default:
90             trace::err("Unsupported analysis type: %u", i_type);
91             assert(0);
92     }
93     return str;
94 }
95 
96 //------------------------------------------------------------------------------
97 
analyzeHardware(AnalysisType i_type,attn::DumpParameters & o_dump)98 uint32_t analyzeHardware(AnalysisType i_type, attn::DumpParameters& o_dump)
99 {
100     uint32_t o_plid = 0; // default, zero indicates PEL was not created
101 
102     if (!util::pdbg::queryHardwareAnalysisSupported())
103     {
104         trace::err("Hardware error analysis is not supported on this system");
105         return o_plid;
106     }
107 
108     trace::inf(">>> enter analyzeHardware(%s)", __analysisType(i_type));
109 
110     // Initialize the isolator and get all of the chips to be analyzed.
111     trace::inf("Initializing the isolator...");
112     std::vector<libhei::Chip> chips;
113     initializeIsolator(chips);
114 
115     // Isolate attentions.
116     trace::inf("Isolating errors: # of chips=%u", chips.size());
117     libhei::IsolationData isoData{};
118     libhei::isolate(chips, isoData);
119 
120     // For debug, trace out the original list of signatures before filtering.
121     for (const auto& sig : isoData.getSignatureList())
122     {
123         trace::inf("Signature: %s 0x%0" PRIx32 " %s",
124                    util::pdbg::getPath(sig.getChip()), sig.toUint32(),
125                    __attn(sig.getAttnType()));
126     }
127 
128     // Filter for root cause attention.
129     libhei::Signature rootCause{};
130     RasDataParser rasData{};
131     bool attnFound = false;
132     try
133     {
134         attnFound = filterRootCause(i_type, isoData, rootCause, rasData);
135     }
136     catch (const std::exception& e)
137     {
138         trace::err("Exception caught during root cause filtering");
139         trace::err(e.what());
140         attnFound = false; // just in case
141     }
142 
143     // If a root cause attention was found, or if this was a system checkstop,
144     // generate a PEL.
145     if (attnFound || AnalysisType::SYSTEM_CHECKSTOP == i_type)
146     {
147         if (attnFound)
148         {
149             trace::inf("Root cause attention: %s 0x%0" PRIx32 " %s",
150                        util::pdbg::getPath(rootCause.getChip()),
151                        rootCause.toUint32(), __attn(rootCause.getAttnType()));
152         }
153         else
154         {
155             // This is bad. Analysis should have found a root cause attention
156             // for a system checkstop. Issues could range from code bugs to SCOM
157             // errors. Regardless, generate a PEL with FFDC to assist with
158             // debug.
159             trace::err("System checkstop with no root cause attention");
160             rootCause = libhei::Signature{}; // just in case
161         }
162 
163         // Start building the service data.
164         ServiceData servData{rootCause, i_type, isoData};
165 
166         // Apply any service actions, if needed. Note that there are no
167         // resolutions for manual analysis.
168         if (AnalysisType::MANUAL != i_type)
169         {
170             if (attnFound)
171             {
172                 try
173                 {
174                     // Resolve the root cause attention.
175                     rasData.getResolution(rootCause)->resolve(servData);
176                 }
177                 catch (const std::exception& e)
178                 {
179                     trace::err("Exception caught during root cause analysis");
180                     trace::err(e.what());
181 
182                     // We'll still want to create a PEL for the FFDC, but
183                     // since the analysis failed, we need to callout Level 2
184                     // Support.
185                     servData.calloutProcedure(callout::Procedure::NEXTLVL,
186                                               callout::Priority::HIGH);
187                 }
188             }
189             else
190             {
191                 // Analysis failed so callout the Level 2 Support.
192                 servData.calloutProcedure(callout::Procedure::NEXTLVL,
193                                           callout::Priority::HIGH);
194             }
195         }
196 
197         // Create and commit a PEL.
198         o_plid = commitPel(servData);
199 
200         if (0 == o_plid)
201         {
202             trace::err("Failed to create PEL");
203         }
204         else
205         {
206             trace::inf("PEL created: PLID=0x%0" PRIx32, o_plid);
207 
208             // Gather/return information needed for dump. A hardware dump will
209             // always be used for system checkstop attenions. Software dumps
210             // will be reserved for MP-IPLs during TI analysis.
211             // TODO: Need ID from root cause. At the moment, HUID does not exist
212             //       in devtree. Will need a better ID definition.
213             o_dump.unitId = 0;
214             o_dump.dumpType = attn::DumpType::Hardware;
215         }
216     }
217     else
218     {
219         // It is possible for TI handling, or manually initiated analysis via
220         // the command line, that there will not be an active attention. In
221         // which case, we will do nothing and let the caller of this function
222         // determine if this is the expected behavior.
223         trace::inf("No active attentions found");
224     }
225 
226     // All done, clean up the isolator.
227     trace::inf("Uninitializing isolator...");
228     libhei::uninitialize();
229 
230     trace::inf("<<< exit analyzeHardware()");
231 
232     return o_plid;
233 }
234 
235 //------------------------------------------------------------------------------
236 
237 } // namespace analyzer
238