1 #include <assert.h>
2 #include <unistd.h>
3 
4 #include <analyzer/analyzer_main.hpp>
5 #include <analyzer/ras-data/ras-data-parser.hpp>
6 #include <analyzer/service_data.hpp>
7 #include <attn/attn_dump.hpp>
8 #include <hei_main.hpp>
9 #include <util/pdbg.hpp>
10 #include <util/trace.hpp>
11 
12 namespace analyzer
13 {
14 
15 //------------------------------------------------------------------------------
16 
17 // Forward references for externally defined functions.
18 
/**
 * @brief Will get the list of active chips and initialize the isolator.
 * @param o_chips The returned list of active chips.
 */
void initializeIsolator(std::vector<libhei::Chip>& o_chips);
24 
/**
 * @brief  Filters the signatures gathered during isolation for the root cause
 *         attention.
 * @param  i_type      The type of analysis to perform. See enum for details.
 * @param  i_isoData   The data gathered during isolation (for FFDC).
 * @param  o_rootCause The returned root cause signature.
 * @return True, if root cause has been found. False, otherwise.
 */
bool filterRootCause(AnalysisType i_type,
                     const libhei::IsolationData& i_isoData,
                     libhei::Signature& o_rootCause);
35 
/**
 * @brief Will create and submit a PEL using the given data.
 * @param i_isoData   The data gathered during isolation (for FFDC).
 * @param i_servData  Data regarding service actions gathered during analysis.
 * @return The platform log ID (PLID) of the new PEL. Will return zero if no
 *         PEL is generated.
 */
uint32_t createPel(const libhei::IsolationData& i_isoData,
                   const ServiceData& i_servData);
44 
45 //------------------------------------------------------------------------------
46 
47 const char* __attn(libhei::AttentionType_t i_type)
48 {
49     const char* str = "";
50     switch (i_type)
51     {
52         case libhei::ATTN_TYPE_CHECKSTOP:
53             str = "CHECKSTOP";
54             break;
55         case libhei::ATTN_TYPE_UNIT_CS:
56             str = "UNIT_CS";
57             break;
58         case libhei::ATTN_TYPE_RECOVERABLE:
59             str = "RECOVERABLE";
60             break;
61         case libhei::ATTN_TYPE_SP_ATTN:
62             str = "SP_ATTN";
63             break;
64         case libhei::ATTN_TYPE_HOST_ATTN:
65             str = "HOST_ATTN";
66             break;
67         default:
68             trace::err("Unsupported attention type: %u", i_type);
69             assert(0);
70     }
71     return str;
72 }
73 
74 //------------------------------------------------------------------------------
75 
76 const char* __analysisType(AnalysisType i_type)
77 {
78     const char* str = "";
79     switch (i_type)
80     {
81         case AnalysisType::SYSTEM_CHECKSTOP:
82             str = "SYSTEM_CHECKSTOP";
83             break;
84         case AnalysisType::TERMINATE_IMMEDIATE:
85             str = "TERMINATE_IMMEDIATE";
86             break;
87         case AnalysisType::MANUAL:
88             str = "MANUAL";
89             break;
90         default:
91             trace::err("Unsupported analysis type: %u", i_type);
92             assert(0);
93     }
94     return str;
95 }
96 
97 //------------------------------------------------------------------------------
98 
99 uint32_t analyzeHardware(AnalysisType i_type, attn::DumpParameters& o_dump)
100 {
101     uint32_t o_plid = 0; // default, zero indicates PEL was not created
102 
103     if (!util::pdbg::queryHardwareAnalysisSupported())
104     {
105         trace::err("Hardware error analysis is not supported on this system");
106         return o_plid;
107     }
108 
109     trace::inf(">>> enter analyzeHardware(%s)", __analysisType(i_type));
110 
111     // Initialize the isolator and get all of the chips to be analyzed.
112     trace::inf("Initializing the isolator...");
113     std::vector<libhei::Chip> chips;
114     initializeIsolator(chips);
115 
116     // Isolate attentions.
117     trace::inf("Isolating errors: # of chips=%u", chips.size());
118     libhei::IsolationData isoData{};
119     libhei::isolate(chips, isoData);
120 
121     // For debug, trace out the original list of signatures before filtering.
122     for (const auto& sig : isoData.getSignatureList())
123     {
124         trace::inf("Signature: %s 0x%0" PRIx32 " %s",
125                    util::pdbg::getPath(sig.getChip()), sig.toUint32(),
126                    __attn(sig.getAttnType()));
127     }
128 
129     // Filter for root cause attention.
130     libhei::Signature rootCause{};
131     bool attnFound = filterRootCause(i_type, isoData, rootCause);
132 
133     // If a root cause attention was found, or if this was a system checkstop,
134     // generate a PEL.
135     if (attnFound || AnalysisType::SYSTEM_CHECKSTOP == i_type)
136     {
137         if (attnFound)
138         {
139             trace::inf("Root cause attention: %s 0x%0" PRIx32 " %s",
140                        util::pdbg::getPath(rootCause.getChip()),
141                        rootCause.toUint32(), __attn(rootCause.getAttnType()));
142         }
143         else
144         {
145             // This is bad. Analysis should have found a root cause attention
146             // for a system checkstop. Issues could range from code bugs to SCOM
147             // errors. Regardless, generate a PEL with FFDC to assist with
148             // debug.
149             trace::err("System checkstop with no root cause attention");
150             rootCause = libhei::Signature{}; // just in case
151         }
152 
153         // Start building the service data.
154         ServiceData servData{rootCause, i_type};
155 
156         // Apply any service actions, if needed. Note that there are no
157         // resolutions for manual analysis.
158         if (AnalysisType::MANUAL != i_type)
159         {
160             if (attnFound)
161             {
162                 // Resolve the root cause attention.
163                 RasDataParser rasData{};
164                 rasData.getResolution(rootCause)->resolve(servData);
165             }
166             else
167             {
168                 // Analysis failed so apply the Level 2 Support resolution.
169                 ProcedureCalloutResolution res{callout::Procedure::NEXTLVL,
170                                                callout::Priority::HIGH};
171                 res.resolve(servData);
172             }
173         }
174 
175         // Create and commit a PEL.
176         o_plid = createPel(isoData, servData);
177 
178         if (0 == o_plid)
179         {
180             trace::err("Failed to create PEL");
181         }
182         else
183         {
184             trace::inf("PEL created: PLID=0x%0" PRIx32, o_plid);
185 
186             // Gather/return information needed for dump. A hardware dump will
187             // always be used for system checkstop attenions. Software dumps
188             // will be reserved for MP-IPLs during TI analysis.
189             // TODO: Need ID from root cause. At the moment, HUID does not exist
190             //       in devtree. Will need a better ID definition.
191             o_dump.unitId   = 0;
192             o_dump.dumpType = attn::DumpType::Hardware;
193         }
194     }
195     else
196     {
197         // It is possible for TI handling, or manually initiated analysis via
198         // the command line, that there will not be an active attention. In
199         // which case, we will do nothing and let the caller of this function
200         // determine if this is the expected behavior.
201         trace::inf("No active attentions found");
202     }
203 
204     // All done, clean up the isolator.
205     trace::inf("Uninitializing isolator...");
206     libhei::uninitialize();
207 
208     trace::inf("<<< exit analyzeHardware()");
209 
210     return o_plid;
211 }
212 
213 //------------------------------------------------------------------------------
214 
215 /**
216  * @brief Get error isolator build information
217  *
218  * @return Pointer to build information
219  */
220 const char* getBuildInfo()
221 {
222     return libhei::getBuildInfo();
223 }
224 
225 } // namespace analyzer
226